From: Andi Kleen Fix SMP race in NMI watchdog on i386/x86-64 Fix a long-standing SMP setup race in the NMI watchdog. The watchdog would tick from very early on and check that all CPUs keep increasing their timer interrupt counts. To pick the CPUs to check, it would look at cpu_online_map. Now, if a CPU took too long to boot, the watchdog would trigger prematurely because the CPU had not increased its timer count yet. The fix is to check cpu_callin_map instead of cpu_online_map, because the former is only set once a CPU has started its timer interrupt. I fixed it on i386 and x86-64. Description of the problem from Manpreet Singh. Thanks. Cc: Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton --- 25-akpm/arch/i386/kernel/nmi.c | 2 +- 25-akpm/arch/i386/kernel/smpboot.c | 2 +- 25-akpm/arch/x86_64/kernel/nmi.c | 4 +++- 25-akpm/arch/x86_64/kernel/smpboot.c | 2 +- 25-akpm/include/asm-i386/smp.h | 1 + 25-akpm/include/asm-x86_64/smp.h | 1 + 6 files changed, 8 insertions(+), 4 deletions(-) diff -puN arch/i386/kernel/nmi.c~i386-x86-64-fix-smp-nmi-watchdog-race arch/i386/kernel/nmi.c --- 25/arch/i386/kernel/nmi.c~i386-x86-64-fix-smp-nmi-watchdog-race 2005-01-16 00:38:11.646059216 -0800 +++ 25-akpm/arch/i386/kernel/nmi.c 2005-01-16 00:38:11.658057392 -0800 @@ -117,7 +117,7 @@ int __init check_nmi_watchdog (void) /* FIXME: Only boot CPU is online at this stage. Check CPUs as they come up. 
*/ for (cpu = 0; cpu < NR_CPUS; cpu++) { - if (!cpu_online(cpu)) + if (!cpu_isset(cpu, cpu_callin_map)) continue; if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { printk("CPU#%d: NMI appears to be stuck!\n", cpu); diff -puN arch/i386/kernel/smpboot.c~i386-x86-64-fix-smp-nmi-watchdog-race arch/i386/kernel/smpboot.c --- 25/arch/i386/kernel/smpboot.c~i386-x86-64-fix-smp-nmi-watchdog-race 2005-01-16 00:38:11.648058912 -0800 +++ 25-akpm/arch/i386/kernel/smpboot.c 2005-01-16 00:38:11.659057240 -0800 @@ -67,7 +67,7 @@ EXPORT_SYMBOL(phys_proc_id); /* bitmap of online cpus */ cpumask_t cpu_online_map; -static cpumask_t cpu_callin_map; +cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; static cpumask_t smp_commenced_mask; diff -puN arch/x86_64/kernel/nmi.c~i386-x86-64-fix-smp-nmi-watchdog-race arch/x86_64/kernel/nmi.c --- 25/arch/x86_64/kernel/nmi.c~i386-x86-64-fix-smp-nmi-watchdog-race 2005-01-16 00:38:11.650058608 -0800 +++ 25-akpm/arch/x86_64/kernel/nmi.c 2005-01-16 00:38:11.660057088 -0800 @@ -130,7 +130,9 @@ int __init check_nmi_watchdog (void) mdelay((10*1000)/nmi_hz); // wait 10 ticks for (cpu = 0; cpu < NR_CPUS; cpu++) { - if (!cpu_online(cpu)) + /* Check cpu_callin_map here because that is set + after the timer is started. 
*/ + if (!cpu_isset(cpu, cpu_callin_map)) continue; if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { printk("CPU#%d: NMI appears to be stuck (%d)!\n", diff -puN arch/x86_64/kernel/smpboot.c~i386-x86-64-fix-smp-nmi-watchdog-race arch/x86_64/kernel/smpboot.c --- 25/arch/x86_64/kernel/smpboot.c~i386-x86-64-fix-smp-nmi-watchdog-race 2005-01-16 00:38:11.651058456 -0800 +++ 25-akpm/arch/x86_64/kernel/smpboot.c 2005-01-16 00:38:11.661056936 -0800 @@ -64,7 +64,7 @@ EXPORT_SYMBOL(phys_proc_id); /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map; -static cpumask_t cpu_callin_map; +cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; static cpumask_t smp_commenced_mask; diff -puN include/asm-i386/smp.h~i386-x86-64-fix-smp-nmi-watchdog-race include/asm-i386/smp.h --- 25/include/asm-i386/smp.h~i386-x86-64-fix-smp-nmi-watchdog-race 2005-01-16 00:38:11.652058304 -0800 +++ 25-akpm/include/asm-i386/smp.h 2005-01-16 00:38:11.660057088 -0800 @@ -53,6 +53,7 @@ extern u8 x86_cpu_to_apicid[]; #define __smp_processor_id() (current_thread_info()->cpu) extern cpumask_t cpu_callout_map; +extern cpumask_t cpu_callin_map; #define cpu_possible_map cpu_callout_map /* We don't mark CPUs online until __cpu_up(), so we need another measure */ diff -puN include/asm-x86_64/smp.h~i386-x86-64-fix-smp-nmi-watchdog-race include/asm-x86_64/smp.h --- 25/include/asm-x86_64/smp.h~i386-x86-64-fix-smp-nmi-watchdog-race 2005-01-16 00:38:11.654058000 -0800 +++ 25-akpm/include/asm-x86_64/smp.h 2005-01-16 00:38:11.657057544 -0800 @@ -59,6 +59,7 @@ extern u8 phys_proc_id[NR_CPUS]; */ extern cpumask_t cpu_callout_map; +extern cpumask_t cpu_callin_map; #define cpu_possible_map cpu_callout_map static inline int num_booting_cpus(void) _