From: Rusty Russell

By Matt Fleming, Zwane Mwaikambo, Dipankar Sarma, Vatsa Vaddagiri

This is the i386 portion, as modified by Dipankar and Vatsa.  We've mainly
been using it to stress the core code, unlike the PPC64 port, which actually
has hotplug CPUs (this is better for that purpose, though, because we can
plug and unplug REALLY fast to find races).

As such, it's a curiosity, and a little rough (it doesn't allow the boot CPU
to go down, for example), but it's great for playing with the code.

The main change is that the IPI-sending code now uses a cpumask_t rather
than a count, since CPUs may appear and vanish and we don't want to hold a
lock.

---

 25-akpm/arch/i386/Kconfig          |    9 +++
 25-akpm/arch/i386/kernel/irq.c     |   69 +++++++++++++++++++++++
 25-akpm/arch/i386/kernel/process.c |   59 +++++++++++++++++++
 25-akpm/arch/i386/kernel/smp.c     |   76 +++++++++++++++++--------
 25-akpm/arch/i386/kernel/smpboot.c |  111 ++++++++++++++++++++++++++++++++++++-
 25-akpm/include/asm-i386/cpu.h     |    2 
 25-akpm/include/asm-i386/smp.h     |    6 ++
 7 files changed, 306 insertions(+), 26 deletions(-)

diff -puN arch/i386/Kconfig~cpuhotplug-04-x86-support arch/i386/Kconfig
--- 25/arch/i386/Kconfig~cpuhotplug-04-x86-support	Wed Feb 4 12:49:55 2004
+++ 25-akpm/arch/i386/Kconfig	Wed Feb 4 12:49:55 2004
@@ -1171,6 +1171,15 @@ config HOTPLUG
 	  agent" (/sbin/hotplug) to load modules and set up software needed
 	  to use devices as you hotplug them.
 
+config HOTPLUG_CPU
+	bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
+	depends on SMP && HOTPLUG && EXPERIMENTAL
+	---help---
+	  Say Y here to experiment with turning CPUs off and on.  CPUs
+	  can be controlled through /sys/devices/system/cpu.
+
+	  Say N.
+
 source "drivers/pcmcia/Kconfig"
 
 source "drivers/pci/hotplug/Kconfig"

diff -puN arch/i386/kernel/irq.c~cpuhotplug-04-x86-support arch/i386/kernel/irq.c
--- 25/arch/i386/kernel/irq.c~cpuhotplug-04-x86-support	Wed Feb 4 12:49:55 2004
+++ 25-akpm/arch/i386/kernel/irq.c	Wed Feb 4 12:49:55 2004
@@ -34,6 +34,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
@@ -45,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Linux has a controller-independent x86 interrupt architecture.
@@ -964,7 +967,69 @@ static int irq_affinity_write_proc(struc
 
 	return full_count;
 }
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void migrate_irqs_from(int cpu)
+{
+	cpumask_t mask;
+	unsigned int irq;
+
+	mask = cpumask_of_cpu(cpu);
+	cpus_complement(mask);
+	cpus_and(mask, mask, cpu_online_map);
+	for (irq = 0; irq < NR_IRQS; irq++) {
+		cpus_and(irq_affinity[irq], irq_affinity[irq], mask);
+		if (cpus_empty(irq_affinity[irq]))
+			irq_affinity[irq] = cpumask_of_cpu(0);
+
+		if (irq_desc[irq].handler->set_affinity)
+			irq_desc[irq].handler->set_affinity(irq, mask);
+	}
+}
+
+void enable_all_irqs(int cpu)
+{
+	cpumask_t mask;
+	unsigned int irq;
+
+	mask = cpumask_of_cpu(cpu);
+	cpus_or(mask, mask, cpu_online_map);
+	for (irq = 0; irq < NR_IRQS; irq++) {
+		cpus_or(irq_affinity[irq], irq_affinity[irq], mask);
+		if (cpus_empty(irq_affinity[irq])) {
+			irq_affinity[irq] = cpumask_of_cpu(0);
+		}
+
+		if (irq_desc[irq].handler->set_affinity)
+			irq_desc[irq].handler->set_affinity(irq, mask);
+	}
+}
+static int irqs_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	int cpu = (int)hcpu;
+	switch(action) {
+	case CPU_ONLINE:
+		/*
+		 * We could go through all the irqs and add
+		 * this processor to the cpu set - zwane
+		 */
+		enable_all_irqs(cpu);
+		break;
+	case CPU_OFFLINE:
+		migrate_irqs_from(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata irqs_cpu_nb = {
+	.notifier_call = irqs_cpu_notify,
+};
 #endif
 
 static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
@@ -1053,5 +1118,9 @@ void init_irq_proc (void)
 	 */
 	for (i = 0; i < NR_IRQS; i++)
 		register_irq_proc(i);
+#ifdef CONFIG_HOTPLUG_CPU
+	register_cpu_notifier(&irqs_cpu_nb);
+#endif
+
 }

diff -puN arch/i386/kernel/process.c~cpuhotplug-04-x86-support arch/i386/kernel/process.c
--- 25/arch/i386/kernel/process.c~cpuhotplug-04-x86-support	Wed Feb 4 12:49:55 2004
+++ 25-akpm/arch/i386/kernel/process.c	Wed Feb 4 12:49:55 2004
@@ -14,6 +14,7 @@
 
 #define __KERNEL_SYSCALLS__
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -54,6 +55,9 @@
 #include 
 #include 
 
+#include 
+#include 
+
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
 int hlt_counter;
@@ -132,6 +136,60 @@ static void poll_idle (void)
 	}
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* We don't actually take the CPU down, just spin without interrupts. */
+static inline void check_cpu_quiescent(void)
+{
+	if (unlikely(__get_cpu_var(cpu_state) == CPU_OFFLINE)) {
+		int cpu = smp_processor_id();
+
+		spin_lock(&call_lock);
+		local_irq_disable();
+		preempt_disable();
+
+		/* Ack it */
+		__get_cpu_var(cpu_state) = CPU_DEAD;
+
+		BUG_ON(cpu_isset(cpu, cpu_online_map));
+		BUG_ON(!cpu_isset(cpu, cpu_active_map));
+		cpu_clear(cpu, cpu_active_map);
+		spin_unlock(&call_lock);
+
+		/* Death loop */
+		while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
+			cpu_relax();
+
+		/* Even with irqs disabled, this is safe, since no
+		 * smp_call_function can be headed for us now
+		 * (!cpu_active). */
+		spin_lock(&call_lock);
+
+		/* from here on we're ready to do work */
+		__get_cpu_var(cpu_state) = CPU_ONLINE;
+		wmb();
+
+		//printk("Cpu %u arisen\n", smp_processor_id());
+
+		/* Put ourselves online before doing __flush_tlb_all,
+		 * so we avoid losing one to a race.
+		 */
+		cpu_set(smp_processor_id(), cpu_active_map);
+		cpu_set(smp_processor_id(), cpu_online_map);
+		wmb();
+		spin_unlock(&call_lock);
+
+		__flush_tlb_all();
+		local_irq_enable();
+
+		preempt_enable();
+	}
+}
+#else
+static inline void check_cpu_quiescent(void)
+{
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 /*
  * The idle thread. There's no useful work to be
  * done, so just try to conserve power and have a
@@ -148,6 +206,7 @@ void cpu_idle (void)
 		if (!idle)
 			idle = default_idle;
 
+		check_cpu_quiescent();
 		irq_stat[smp_processor_id()].idle_timestamp = jiffies;
 		idle();
 	}

diff -puN arch/i386/kernel/smpboot.c~cpuhotplug-04-x86-support arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~cpuhotplug-04-x86-support	Wed Feb 4 12:49:55 2004
+++ 25-akpm/arch/i386/kernel/smpboot.c	Wed Feb 4 12:49:55 2004
@@ -44,6 +44,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include 
 #include 
@@ -66,7 +69,12 @@ int phys_proc_id[NR_CPUS]; /* Package ID
 
 /* bitmap of online cpus */
 cpumask_t cpu_online_map;
-static cpumask_t cpu_callin_map;
+#ifdef CONFIG_HOTPLUG_CPU
+cpumask_t cpu_active_map;
+#endif
+
+/* Initialize, although the master cpu never calls in */
+static volatile cpumask_t cpu_callin_map;
 
 cpumask_t cpu_callout_map;
 static cpumask_t smp_commenced_mask;
@@ -84,6 +92,9 @@ extern unsigned char trampoline_data [];
 extern unsigned char trampoline_end  [];
 static unsigned char *trampoline_base;
 
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
 /*
  * Currently trivial. Write the real->protected mode
  * bootstrap into the page concerned. The caller
@@ -458,7 +469,9 @@ int __init start_secondary(void *unused)
 	 * the local TLBs too.
 	 */
 	local_flush_tlb();
+	/* cpu_set should suffice for this stage of bootup -zwane */
 	cpu_set(smp_processor_id(), cpu_online_map);
+	cpu_set(smp_processor_id(), cpu_active_map);
 	wmb();
 	return cpu_idle();
 }
@@ -1326,29 +1339,123 @@ __init void arch_init_sched_domains(void
    who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
+	smp_commenced_mask = cpumask_of_cpu(0);
+	cpu_callin_map = cpumask_of_cpu(0);
+	mb();
 	smp_boot_cpus(max_cpus);
 }
 
 void __devinit smp_prepare_boot_cpu(void)
 {
 	cpu_set(smp_processor_id(), cpu_online_map);
+	cpu_set(smp_processor_id(), cpu_active_map);
 	cpu_set(smp_processor_id(), cpu_callout_map);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+/* must be called with the cpucontrol mutex held */
+static int __devinit cpu_enable(unsigned int cpu)
+{
+	/* get the target out of its holding state */
+	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+	wmb();
+
+	/* wait for the processor to ack it. timeout? */
+	while (!cpu_online(cpu))
+		cpu_relax();
+	return 0;
+}
+
+int __cpu_disable(void)
+{
+	int cpu = smp_processor_id();
+	/*
+	 * Nothing for now, perhaps use cpufreq to drop frequency,
+	 * but that could go into generic code.
+	 *
+	 * We won't take down the boot processor on i386 due to some
+	 * interrupts only being able to be serviced by the BSP.
+	 * Especially so if we're not using an IOAPIC -zwane
+	 */
+	if (cpu == 0)
+		return -EBUSY;
+
+	BUG_ON(!cpu_isset(cpu, cpu_active_map));
+	BUG_ON(!cpu_isset(cpu, cpu_online_map));
+	wmb();
+	cpu_clear(cpu, cpu_online_map);
+	wmb();
+
+	return 0;
+}
+
+static void do_nothing(void *unused)
+{
+	return;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+	unsigned int i;
+
+	/* Final threads can take some time to actually clean up */
+	while (!idle_cpu(cpu))
+		yield();
+
+	per_cpu(cpu_state, cpu) = CPU_OFFLINE;
+	wmb();
+	for (i = 0; i < 10; i++) {
+		wake_idle_cpu(cpu);
+		/* They ack this in check_cpu_quiescent by setting CPU_DEAD */
+		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+			/*
+			 * This prevents a race in smp_call_function due
+			 * to the rapid online of the same CPU which just died.
+			 */
+			smp_call_function(do_nothing, NULL, 1, 1);
+			return;
+		}
+		current->state = TASK_UNINTERRUPTIBLE;
+		schedule_timeout(HZ/10);
+	}
+	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+#else /* ... !CONFIG_HOTPLUG_CPU */
+int __cpu_disable(void)
+{
+	return -ENOSYS;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+	/* We said "no" in __cpu_disable */
+	BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 int __devinit __cpu_up(unsigned int cpu)
 {
 	/* This only works at boot for x86. See "rewrite" above. */
-	if (cpu_isset(cpu, smp_commenced_mask)) {
+	if (cpu_isset(cpu, smp_commenced_mask) && cpu_online(cpu)) {
 		local_irq_enable();
 		return -ENOSYS;
 	}
 
 	/* In case one didn't come up */
 	if (!cpu_isset(cpu, cpu_callin_map)) {
+		printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
 		local_irq_enable();
 		return -EIO;
 	}
 
+#ifdef CONFIG_HOTPLUG_CPU
+	/* Already up, and in cpu_quiescent now? */
+	if (cpu_isset(cpu, smp_commenced_mask)) {
+		cpu_enable(cpu);
+		/* we can simply fall through */
+	}
+#endif
+
 	local_irq_enable();
 	/* Unleash the CPU! */
 	cpu_set(cpu, smp_commenced_mask);

diff -puN arch/i386/kernel/smp.c~cpuhotplug-04-x86-support arch/i386/kernel/smp.c
--- 25/arch/i386/kernel/smp.c~cpuhotplug-04-x86-support	Wed Feb 4 12:49:55 2004
+++ 25-akpm/arch/i386/kernel/smp.c	Wed Feb 4 12:49:55 2004
@@ -355,11 +355,15 @@ static void flush_tlb_others(cpumask_t c
 	 */
 	BUG_ON(cpus_empty(cpumask));
 
-	cpus_and(tmp, cpumask, cpu_online_map);
+	cpus_and(tmp, cpumask, cpu_callout_map);
 	BUG_ON(!cpus_equal(cpumask, tmp));
 	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
 	BUG_ON(!mm);
 
+	cpus_and(cpumask, cpumask, cpu_active_map);
+	if (cpus_empty(cpumask))
+		return;
+
 	/*
 	 * i'm not happy about this global shared spinlock in the
 	 * MM hot path, but we'll see how contended it is.
@@ -387,9 +391,11 @@ static void flush_tlb_others(cpumask_t c
 	 */
 	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
 
-	while (!cpus_empty(flush_cpumask))
-		/* nothing. lockup detection does not belong here */
+	do {
 		mb();
+		tmp = flush_cpumask;
+		cpus_and(tmp, tmp, cpu_active_map);
+	} while (!cpus_empty(tmp));
 
 	flush_mm = NULL;
 	flush_va = 0;
@@ -491,13 +497,13 @@ void smp_send_reschedule(int cpu)
 * Structure and data for smp_call_function(). This is designed to minimise
 * static memory requirements. It also looks cleaner.
 */
-static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
 
 struct call_data_struct {
 	void (*func) (void *info);
 	void *info;
-	atomic_t started;
-	atomic_t finished;
+	cpumask_t not_started;
+	cpumask_t not_finished;
 	int wait;
 };
 
@@ -524,32 +530,44 @@ int smp_call_function (void (*func) (voi
 */
 {
 	struct call_data_struct data;
-	int cpus = num_online_cpus()-1;
+	cpumask_t mask;
+	int cpu;
 
-	if (!cpus)
-		return 0;
+	spin_lock(&call_lock);
+	cpu = smp_processor_id();
 
 	data.func = func;
 	data.info = info;
-	atomic_set(&data.started, 0);
+	data.not_started = cpu_active_map;
+	cpu_clear(cpu, data.not_started);
+	if (cpus_empty(data.not_started))
+		goto out_unlock;
+
 	data.wait = wait;
 	if (wait)
-		atomic_set(&data.finished, 0);
+		data.not_finished = data.not_started;
 
-	spin_lock(&call_lock);
 	call_data = &data;
 	mb();
 
 	/* Send a message to all other CPUs and wait for them to respond */
-	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+	send_IPI_mask(data.not_started, CALL_FUNCTION_VECTOR);
 
 	/* Wait for response */
-	while (atomic_read(&data.started) != cpus)
-		barrier();
+	do {
+		mb();
+		mask = data.not_started;
+		cpus_and(mask, mask, cpu_active_map);
+	} while(!cpus_empty(mask));
 
 	if (wait)
-		while (atomic_read(&data.finished) != cpus)
-			barrier();
+		do {
+			mb();
+			mask = data.not_finished;
+			cpus_and(mask, mask, cpu_active_map);
+		} while(!cpus_empty(mask));
+
+out_unlock:
 	spin_unlock(&call_lock);
 
 	return 0;
@@ -561,6 +579,7 @@ static void stop_this_cpu (void * dummy)
	 * Remove this CPU:
	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
+	cpu_clear(smp_processor_id(), cpu_active_map);
 	local_irq_disable();
 	disable_local_APIC();
 	if (cpu_data[smp_processor_id()].hlt_works_ok)
@@ -593,17 +612,25 @@ asmlinkage void smp_reschedule_interrupt
 
 asmlinkage void smp_call_function_interrupt(void)
 {
-	void (*func) (void *info) = call_data->func;
-	void *info = call_data->info;
-	int wait = call_data->wait;
+	void (*func) (void *info);
+	void *info;
+	int wait;
+	int cpu = smp_processor_id();
 
 	ack_APIC_irq();
+
+	func = call_data->func;
+	info = call_data->info;
+	wait = call_data->wait;
+
 	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
-	mb();
-	atomic_inc(&call_data->started);
+	smp_mb__before_clear_bit();
+	cpu_clear(cpu, call_data->not_started);
+	smp_mb__after_clear_bit();
+
 	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
@@ -612,8 +639,9 @@ asmlinkage void smp_call_function_interr
 	irq_exit();
 
 	if (wait) {
-		mb();
-		atomic_inc(&call_data->finished);
+		smp_mb__before_clear_bit();
+		cpu_clear(cpu, call_data->not_finished);
+		smp_mb__after_clear_bit();
 	}
 }

diff -puN include/asm-i386/cpu.h~cpuhotplug-04-x86-support include/asm-i386/cpu.h
--- 25/include/asm-i386/cpu.h~cpuhotplug-04-x86-support	Wed Feb 4 12:49:55 2004
+++ 25-akpm/include/asm-i386/cpu.h	Wed Feb 4 12:49:55 2004
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -23,4 +24,5 @@ static inline int arch_register_cpu(int
 	return register_cpu(&cpu_devices[num].cpu, num, parent);
 }
 
+DECLARE_PER_CPU(int, cpu_state);
 #endif /* _ASM_I386_CPU_H_ */

diff -puN include/asm-i386/smp.h~cpuhotplug-04-x86-support include/asm-i386/smp.h
--- 25/include/asm-i386/smp.h~cpuhotplug-04-x86-support	Wed Feb 4 12:49:55 2004
+++ 25-akpm/include/asm-i386/smp.h	Wed Feb 4 12:49:55 2004
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -52,6 +53,7 @@ extern void zap_low_mappings (void);
 */
 #define smp_processor_id() (current_thread_info()->cpu)
+extern spinlock_t call_lock;
 extern cpumask_t cpu_callout_map;
 #define cpu_possible_map cpu_callout_map
 
@@ -84,6 +86,10 @@ static __inline int logical_smp_processo
 }
 #endif
 
+
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
+
 #endif /* !__ASSEMBLY__ */
 
 #define NO_PROC_ID	0xFF	/* No processor magic marker */
_
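
For readers skimming the diff, here is a standalone sketch (not part of the patch) of
the idea behind the smp_call_function() change: track IPI completion with a per-CPU
bitmask ANDed against the set of active CPUs, rather than counting acks.  It is plain
user-space C with hypothetical names (cpu_active, not_started, CPU_BIT); it only
illustrates why a vanished CPU cannot wedge the initiator's wait loop.

	/* Sketch: cpumask-style completion tracking vs. a plain counter. */
	#include <stdio.h>

	typedef unsigned long cpumask;		/* one bit per CPU */
	#define CPU_BIT(c)	(1UL << (c))

	static cpumask cpu_active;		/* CPUs able to take IPIs */
	static cpumask not_started;		/* CPUs that haven't run func() yet */

	/* Initiator: mark every other active CPU as "not started". */
	static void prepare_call(int self)
	{
		not_started = cpu_active & ~CPU_BIT(self);
	}

	/* Target CPU acks by clearing its own bit (an atomic op in the kernel). */
	static void ack_call(int cpu)
	{
		not_started &= ~CPU_BIT(cpu);
	}

	/* A CPU going offline simply drops out of the active mask. */
	static void cpu_offline(int cpu)
	{
		cpu_active &= ~CPU_BIT(cpu);
	}

	/*
	 * The wait loop is done when no *active* CPU still has its bit set.
	 * With a counter, a CPU that died before acking would leave the count
	 * short and the initiator spinning forever; masking with cpu_active
	 * lets a vanished CPU fall out of the wait automatically.
	 */
	static int call_complete(void)
	{
		return (not_started & cpu_active) == 0;
	}

	int main(void)
	{
		cpu_active = CPU_BIT(0) | CPU_BIT(1) | CPU_BIT(2);

		prepare_call(0);	/* CPU 0 sends the "IPI" */
		ack_call(1);		/* CPU 1 responds */
		cpu_offline(2);		/* CPU 2 dies without responding */

		printf("call complete: %s\n", call_complete() ? "yes" : "no");
		return 0;
	}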