From: Ingo Molnar This is the current remove-BKL patch. I test-booted it on x86 and x64, trying every conceivable combination of SMP, PREEMPT and PREEMPT_BKL. All other architectures should compile as well. (Most of the testing was done with the zaphod patch undone, but it applies cleanly on vanilla -mm3 as well and should work fine.) This is the debugging-enabled variant of the patch, which has two main debugging features: - debug potentially illegal smp_processor_id() use. This has caught a number of real bugs - e.g. look at the printk.c fix in the patch. - make it possible to enable/disable the preemptible BKL via a .config option (PREEMPT_BKL). If this goes upstream we don't want this of course, but for now it gives people a chance to find out whether any particular problem was caused by this patch. This patch has one important fix over the previous BKL patch: on PREEMPT kernels, if we preempted BKL-using code then the code still auto-dropped the BKL by mistake. This caused a number of breakages for testers, and those breakages went away once this bug was fixed. Also, the debugging mechanism has been improved a lot relative to the previous BKL patch. It would be nice to test-drive this in -mm. There will likely be some more smp_processor_id() false positives, but they are 1) harmless and 2) easy to fix up. We may also find more real smp_processor_id()-related breakages. The most noteworthy fact is that no BKL-using code has been found yet that relies on smp_processor_id(), which is promising from a compatibility POV. 
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton --- 25-akpm/arch/i386/Kconfig | 11 ++ 25-akpm/arch/i386/kernel/traps.c | 2 25-akpm/arch/i386/lib/delay.c | 2 25-akpm/arch/sh/lib/delay.c | 2 25-akpm/arch/sparc64/lib/delay.c | 2 25-akpm/arch/x86_64/Kconfig | 11 ++ 25-akpm/arch/x86_64/lib/delay.c | 2 25-akpm/include/asm-i386/smp.h | 2 25-akpm/include/asm-x86_64/smp.h | 2 25-akpm/include/linux/hardirq.h | 14 ++- 25-akpm/include/linux/interrupt.h | 4 - 25-akpm/include/linux/preempt.h | 19 ++--- 25-akpm/include/linux/smp.h | 21 +++++ 25-akpm/include/linux/smp_lock.h | 14 +-- 25-akpm/include/net/route.h | 2 25-akpm/include/net/snmp.h | 14 +-- 25-akpm/init/main.c | 4 + 25-akpm/kernel/module.c | 2 25-akpm/kernel/printk.c | 3 25-akpm/kernel/sched.c | 63 ++++++++++++++-- 25-akpm/kernel/softirq.c | 4 - 25-akpm/kernel/stop_machine.c | 4 - 25-akpm/kernel/timer.c | 9 ++ 25-akpm/lib/Kconfig.debug | 10 ++ 25-akpm/lib/kernel_lock.c | 142 +++++++++++++++++++++++++++++++++++++- kernel/irq/handle.c | 0 26 files changed, 308 insertions(+), 57 deletions(-) diff -puN arch/i386/Kconfig~remove-the-bkl-by-turning-it-into-a-semaphore arch/i386/Kconfig --- 25/arch/i386/Kconfig~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.905845600 -0800 +++ 25-akpm/arch/i386/Kconfig 2004-12-03 20:56:38.944839672 -0800 @@ -522,6 +522,17 @@ config PREEMPT Say Y here if you are building a kernel for a desktop, embedded or real-time system. Say N if you are unsure. +config PREEMPT_BKL + bool "Preempt The Big Kernel Lock" + depends on PREEMPT || SMP + default y + help + This option reduces the latency of the kernel by making the + big kernel lock preemptible. + + Say Y here if you are building a kernel for a desktop system. + Say N if you are unsure. 
+ config X86_UP_APIC bool "Local APIC support on uniprocessors" if !SMP depends on !(X86_VISWS || X86_VOYAGER) diff -puN arch/i386/kernel/traps.c~remove-the-bkl-by-turning-it-into-a-semaphore arch/i386/kernel/traps.c --- 25/arch/i386/kernel/traps.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.906845448 -0800 +++ 25-akpm/arch/i386/kernel/traps.c 2004-12-03 20:56:38.945839520 -0800 @@ -339,7 +339,7 @@ void die(const char * str, struct pt_reg }; static int die_counter; - if (die.lock_owner != smp_processor_id()) { + if (die.lock_owner != _smp_processor_id()) { console_verbose(); spin_lock_irq(&die.lock); die.lock_owner = smp_processor_id(); diff -puN arch/i386/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore arch/i386/lib/delay.c --- 25/arch/i386/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.908845144 -0800 +++ 25-akpm/arch/i386/lib/delay.c 2004-12-03 20:56:38.945839520 -0800 @@ -34,7 +34,7 @@ inline void __const_udelay(unsigned long xloops *= 4; __asm__("mull %0" :"=d" (xloops), "=&a" (d0) - :"1" (xloops),"0" (current_cpu_data.loops_per_jiffy * (HZ/4))); + :"1" (xloops),"0" (cpu_data[_smp_processor_id()].loops_per_jiffy * (HZ/4))); __delay(++xloops); } diff -puN arch/sh/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore arch/sh/lib/delay.c --- 25/arch/sh/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.909844992 -0800 +++ 25-akpm/arch/sh/lib/delay.c 2004-12-03 20:56:38.945839520 -0800 @@ -24,7 +24,7 @@ inline void __const_udelay(unsigned long __asm__("dmulu.l %0, %2\n\t" "sts mach, %0" : "=r" (xloops) - : "0" (xloops), "r" (current_cpu_data.loops_per_jiffy) + : "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy) : "macl", "mach"); __delay(xloops * HZ); } diff -puN arch/sparc64/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore arch/sparc64/lib/delay.c --- 25/arch/sparc64/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 
20:56:38.911844688 -0800 +++ 25-akpm/arch/sparc64/lib/delay.c 2004-12-03 20:56:38.946839368 -0800 @@ -31,7 +31,7 @@ void __const_udelay(unsigned long n) { n *= 4; - n *= (cpu_data(smp_processor_id()).udelay_val * (HZ/4)); + n *= (cpu_data(_smp_processor_id()).udelay_val * (HZ/4)); n >>= 32; __delay(n + 1); diff -puN arch/x86_64/Kconfig~remove-the-bkl-by-turning-it-into-a-semaphore arch/x86_64/Kconfig --- 25/arch/x86_64/Kconfig~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.912844536 -0800 +++ 25-akpm/arch/x86_64/Kconfig 2004-12-03 20:56:38.946839368 -0800 @@ -249,6 +249,17 @@ config PREEMPT Say Y here if you are feeling brave and building a kernel for a desktop, embedded or real-time system. Say N if you are unsure. +config PREEMPT_BKL + bool "Preempt The Big Kernel Lock" + depends on PREEMPT || SMP + default y + help + This option reduces the latency of the kernel by making the + big kernel lock preemptible. + + Say Y here if you are building a kernel for a desktop system. + Say N if you are unsure. 
+ config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" depends on SMP diff -puN arch/x86_64/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore arch/x86_64/lib/delay.c --- 25/arch/x86_64/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.914844232 -0800 +++ 25-akpm/arch/x86_64/lib/delay.c 2004-12-03 20:56:38.947839216 -0800 @@ -34,7 +34,7 @@ void __delay(unsigned long loops) inline void __const_udelay(unsigned long xloops) { - __delay(((xloops * current_cpu_data.loops_per_jiffy) >> 32) * HZ); + __delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ); } void __udelay(unsigned long usecs) diff -puN include/asm-i386/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore include/asm-i386/smp.h --- 25/include/asm-i386/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.915844080 -0800 +++ 25-akpm/include/asm-i386/smp.h 2004-12-03 20:56:38.947839216 -0800 @@ -50,7 +50,7 @@ extern u8 x86_cpu_to_apicid[]; * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. 
*/ -#define smp_processor_id() (current_thread_info()->cpu) +#define __smp_processor_id() (current_thread_info()->cpu) extern cpumask_t cpu_callout_map; #define cpu_possible_map cpu_callout_map diff -puN include/asm-x86_64/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore include/asm-x86_64/smp.h --- 25/include/asm-x86_64/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.916843928 -0800 +++ 25-akpm/include/asm-x86_64/smp.h 2004-12-03 20:56:38.947839216 -0800 @@ -66,7 +66,7 @@ static inline int num_booting_cpus(void) return cpus_weight(cpu_callout_map); } -#define smp_processor_id() read_pda(cpunumber) +#define __smp_processor_id() read_pda(cpunumber) extern __inline int hard_smp_processor_id(void) { diff -puN include/linux/hardirq.h~remove-the-bkl-by-turning-it-into-a-semaphore include/linux/hardirq.h --- 25/include/linux/hardirq.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.918843624 -0800 +++ 25-akpm/include/linux/hardirq.h 2004-12-03 20:56:38.948839064 -0800 @@ -61,12 +61,16 @@ #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked()) +#else +# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) +#endif + +#ifdef CONFIG_PREEMPT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) #else -# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) # define preemptible() 0 # define IRQ_EXIT_OFFSET HARDIRQ_OFFSET #endif @@ -77,10 +81,10 @@ extern void synchronize_irq(unsigned int # define synchronize_irq(irq) barrier() #endif -#define nmi_enter() (preempt_count() += HARDIRQ_OFFSET) -#define nmi_exit() (preempt_count() -= HARDIRQ_OFFSET) +#define nmi_enter() irq_enter() +#define nmi_exit() sub_preempt_count(HARDIRQ_OFFSET) -#define irq_enter() (preempt_count() += 
HARDIRQ_OFFSET) +#define irq_enter() add_preempt_count(HARDIRQ_OFFSET) extern void irq_exit(void); #endif /* LINUX_HARDIRQ_H */ diff -puN include/linux/interrupt.h~remove-the-bkl-by-turning-it-into-a-semaphore include/linux/interrupt.h --- 25/include/linux/interrupt.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.919843472 -0800 +++ 25-akpm/include/linux/interrupt.h 2004-12-03 20:56:38.948839064 -0800 @@ -70,9 +70,9 @@ extern void enable_irq(unsigned int irq) /* SoftIRQ primitives. */ #define local_bh_disable() \ - do { preempt_count() += SOFTIRQ_OFFSET; barrier(); } while (0) + do { add_preempt_count(SOFTIRQ_OFFSET); barrier(); } while (0) #define __local_bh_enable() \ - do { barrier(); preempt_count() -= SOFTIRQ_OFFSET; } while (0) + do { barrier(); sub_preempt_count(SOFTIRQ_OFFSET); } while (0) extern void local_bh_enable(void); diff -puN include/linux/preempt.h~remove-the-bkl-by-turning-it-into-a-semaphore include/linux/preempt.h --- 25/include/linux/preempt.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.920843320 -0800 +++ 25-akpm/include/linux/preempt.h 2004-12-03 20:56:38.949838912 -0800 @@ -9,17 +9,18 @@ #include #include -#define preempt_count() (current_thread_info()->preempt_count) +#ifdef CONFIG_DEBUG_PREEMPT + extern void fastcall add_preempt_count(int val); + extern void fastcall sub_preempt_count(int val); +#else +# define add_preempt_count(val) do { preempt_count() += (val); } while (0) +# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) +#endif -#define inc_preempt_count() \ -do { \ - preempt_count()++; \ -} while (0) +#define inc_preempt_count() add_preempt_count(1) +#define dec_preempt_count() sub_preempt_count(1) -#define dec_preempt_count() \ -do { \ - preempt_count()--; \ -} while (0) +#define preempt_count() (current_thread_info()->preempt_count) #ifdef CONFIG_PREEMPT diff -puN include/linux/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore include/linux/smp.h --- 
25/include/linux/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.921843168 -0800 +++ 25-akpm/include/linux/smp.h 2004-12-03 20:56:38.949838912 -0800 @@ -95,8 +95,10 @@ void smp_prepare_boot_cpu(void); /* * These macros fold the SMP functionality into a single CPU system */ - -#define smp_processor_id() 0 + +#if !defined(__smp_processor_id) || !defined(CONFIG_PREEMPT) +# define smp_processor_id() 0 +#endif #define hard_smp_processor_id() 0 #define smp_threads_ready 1 #define smp_call_function(func,info,retry,wait) ({ 0; }) @@ -107,6 +109,21 @@ static inline void smp_send_reschedule(i #endif /* !SMP */ +#ifdef __smp_processor_id +# if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) + /* + * temporary debugging check detecting places that use + * smp_processor_id() in a potentially unsafe way: + */ + extern unsigned int smp_processor_id(void); +# else +# define smp_processor_id() __smp_processor_id() +# endif +# define _smp_processor_id() __smp_processor_id() +#else +# define _smp_processor_id() smp_processor_id() +#endif + #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() #define put_cpu_no_resched() preempt_enable_no_resched() diff -puN include/linux/smp_lock.h~remove-the-bkl-by-turning-it-into-a-semaphore include/linux/smp_lock.h --- 25/include/linux/smp_lock.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.923842864 -0800 +++ 25-akpm/include/linux/smp_lock.h 2004-12-03 20:56:38.950838760 -0800 @@ -9,15 +9,15 @@ #define kernel_locked() (current->lock_depth >= 0) -extern int __lockfunc get_kernel_lock(void); -extern void __lockfunc put_kernel_lock(void); +extern int __lockfunc __reacquire_kernel_lock(void); +extern void __lockfunc __release_kernel_lock(void); /* * Release/re-acquire global kernel lock for the scheduler */ #define release_kernel_lock(tsk) do { \ if (unlikely((tsk)->lock_depth >= 0)) \ - put_kernel_lock(); \ + __release_kernel_lock(); \ } while (0) /* 
@@ -26,16 +26,16 @@ extern void __lockfunc put_kernel_lock(v * reacquire_kernel_lock() so that the compiler can see * it at compile-time. */ -#ifdef CONFIG_SMP -#define return_value_on_smp return +#if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_BKL) +# define return_value_on_smp return #else -#define return_value_on_smp +# define return_value_on_smp #endif static inline int reacquire_kernel_lock(struct task_struct *task) { if (unlikely(task->lock_depth >= 0)) - return_value_on_smp get_kernel_lock(); + return_value_on_smp __reacquire_kernel_lock(); return 0; } diff -puN include/net/route.h~remove-the-bkl-by-turning-it-into-a-semaphore include/net/route.h --- 25/include/net/route.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.924842712 -0800 +++ 25-akpm/include/net/route.h 2004-12-03 20:56:38.950838760 -0800 @@ -105,7 +105,7 @@ struct rt_cache_stat extern struct rt_cache_stat *rt_cache_stat; #define RT_CACHE_STAT_INC(field) \ - (per_cpu_ptr(rt_cache_stat, smp_processor_id())->field++) + (per_cpu_ptr(rt_cache_stat, _smp_processor_id())->field++) extern struct ip_rt_acct *ip_rt_acct; diff -puN include/net/snmp.h~remove-the-bkl-by-turning-it-into-a-semaphore include/net/snmp.h --- 25/include/net/snmp.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.925842560 -0800 +++ 25-akpm/include/net/snmp.h 2004-12-03 20:56:38.951838608 -0800 @@ -128,18 +128,18 @@ struct linux_mib { #define SNMP_STAT_USRPTR(name) (name[1]) #define SNMP_INC_STATS_BH(mib, field) \ - (per_cpu_ptr(mib[0], smp_processor_id())->mibs[field]++) + (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field]++) #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset) \ - (per_cpu_ptr(mib[0], smp_processor_id())->mibs[field + (offset)]++) + (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field + (offset)]++) #define SNMP_INC_STATS_USER(mib, field) \ - (per_cpu_ptr(mib[1], smp_processor_id())->mibs[field]++) + (per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field]++) #define 
SNMP_INC_STATS(mib, field) \ - (per_cpu_ptr(mib[!in_softirq()], smp_processor_id())->mibs[field]++) + (per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]++) #define SNMP_DEC_STATS(mib, field) \ - (per_cpu_ptr(mib[!in_softirq()], smp_processor_id())->mibs[field]--) + (per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]--) #define SNMP_ADD_STATS_BH(mib, field, addend) \ - (per_cpu_ptr(mib[0], smp_processor_id())->mibs[field] += addend) + (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field] += addend) #define SNMP_ADD_STATS_USER(mib, field, addend) \ - (per_cpu_ptr(mib[1], smp_processor_id())->mibs[field] += addend) + (per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field] += addend) #endif diff -puN init/main.c~remove-the-bkl-by-turning-it-into-a-semaphore init/main.c --- 25/init/main.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.927842256 -0800 +++ 25-akpm/init/main.c 2004-12-03 20:56:38.951838608 -0800 @@ -445,6 +445,10 @@ asmlinkage void __init start_kernel(void * time - but meanwhile we still have a functioning scheduler. */ sched_init(); + /* + * Disable preemption - early bootup scheduling is extremely + * fragile until we cpu_idle() for the first time. + */ preempt_disable(); build_all_zonelists(); page_alloc_init(); diff -puN kernel/irq/handle.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/irq/handle.c diff -puN kernel/module.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/module.c --- 25/kernel/module.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.930841800 -0800 +++ 25-akpm/kernel/module.c 2004-12-03 20:56:38.953838304 -0800 @@ -379,7 +379,7 @@ static void module_unload_init(struct mo for (i = 0; i < NR_CPUS; i++) local_set(&mod->ref[i].count, 0); /* Hold reference count during initialization. */ - local_set(&mod->ref[smp_processor_id()].count, 1); + local_set(&mod->ref[_smp_processor_id()].count, 1); /* Backwards compatibility macros put refcount during init. 
*/ mod->waiter = current; } diff -puN kernel/printk.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/printk.c --- 25/kernel/printk.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.931841648 -0800 +++ 25-akpm/kernel/printk.c 2004-12-03 20:56:38.954838152 -0800 @@ -645,8 +645,9 @@ void release_console_sem(void) _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock_irqrestore(&logbuf_lock, flags); + spin_unlock(&logbuf_lock); call_console_drivers(_con_start, _log_end); + local_irq_restore(flags); } console_locked = 0; console_may_schedule = 0; diff -puN kernel/sched.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/sched.c --- 25/kernel/sched.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.933841344 -0800 +++ 25-akpm/kernel/sched.c 2004-12-03 20:56:38.958837544 -0800 @@ -2513,6 +2513,38 @@ static inline int dependent_sleeper(int } #endif +#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) + +void fastcall add_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(((int)preempt_count() < 0)); + preempt_count() += val; + /* + * Spinlock count overflowing soon? + */ + BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); +} +EXPORT_SYMBOL(add_preempt_count); + +void fastcall sub_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(val > preempt_count()); + /* + * Is the spinlock portion underflowing? + */ + BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + /* * schedule() is the main scheduler function. */ @@ -2698,7 +2730,10 @@ EXPORT_SYMBOL(schedule); asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); - +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. 
Just return.. @@ -2707,9 +2742,21 @@ asmlinkage void __sched preempt_schedule return; need_resched: - ti->preempt_count = PREEMPT_ACTIVE; + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif schedule(); - ti->preempt_count = 0; +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); /* we could miss a preemption opportunity between schedule and now */ barrier(); @@ -3454,9 +3501,9 @@ asmlinkage long sys_sched_yield(void) static inline void __cond_resched(void) { do { - preempt_count() += PREEMPT_ACTIVE; + add_preempt_count(PREEMPT_ACTIVE); schedule(); - preempt_count() -= PREEMPT_ACTIVE; + sub_preempt_count(PREEMPT_ACTIVE); } while (need_resched()); } @@ -3540,7 +3587,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = this_rq(); + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); atomic_inc(&rq->nr_iowait); schedule(); @@ -3551,7 +3598,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = this_rq(); + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); long ret; atomic_inc(&rq->nr_iowait); @@ -3759,7 +3806,7 @@ void __devinit init_idle(task_t *idle, i spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! 
*/ -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) idle->thread_info->preempt_count = (idle->lock_depth >= 0); #else idle->thread_info->preempt_count = 0; diff -puN kernel/softirq.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/softirq.c --- 25/kernel/softirq.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.934841192 -0800 +++ 25-akpm/kernel/softirq.c 2004-12-03 20:56:38.959837392 -0800 @@ -142,7 +142,7 @@ void local_bh_enable(void) * Keep preemption disabled until we are done with * softirq processing: */ - preempt_count() -= SOFTIRQ_OFFSET - 1; + sub_preempt_count(SOFTIRQ_OFFSET - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); @@ -163,7 +163,7 @@ EXPORT_SYMBOL(local_bh_enable); */ void irq_exit(void) { - preempt_count() -= IRQ_EXIT_OFFSET; + sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); preempt_enable_no_resched(); diff -puN kernel/stop_machine.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/stop_machine.c --- 25/kernel/stop_machine.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.935841040 -0800 +++ 25-akpm/kernel/stop_machine.c 2004-12-03 20:56:38.959837392 -0800 @@ -95,7 +95,7 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { - if (i == smp_processor_id()) + if (i == _smp_processor_id()) continue; ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); if (ret < 0) @@ -177,7 +177,7 @@ struct task_struct *__stop_machine_run(i /* If they don't care which CPU fn runs on, bind to any online one. 
*/ if (cpu == NR_CPUS) - cpu = smp_processor_id(); + cpu = _smp_processor_id(); p = kthread_create(do_stop, &smdata, "kstopmachine"); if (!IS_ERR(p)) { diff -puN kernel/timer.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/timer.c --- 25/kernel/timer.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.937840736 -0800 +++ 25-akpm/kernel/timer.c 2004-12-03 20:56:38.960837240 -0800 @@ -466,7 +466,14 @@ repeat: smp_wmb(); timer->base = NULL; spin_unlock_irq(&base->lock); - fn(data); + { + u32 preempt_count = preempt_count(); + fn(data); + if (preempt_count != preempt_count()) { + printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); + BUG(); + } + } spin_lock_irq(&base->lock); goto repeat; } diff -puN lib/Kconfig.debug~remove-the-bkl-by-turning-it-into-a-semaphore lib/Kconfig.debug --- 25/lib/Kconfig.debug~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.938840584 -0800 +++ 25-akpm/lib/Kconfig.debug 2004-12-03 20:56:38.961837088 -0800 @@ -48,6 +48,16 @@ config DEBUG_SLAB allocation as well as poisoning memory on free to catch use of freed memory. This can make kmalloc/kfree-intensive workloads much slower. +config DEBUG_PREEMPT + bool "Debug preemptible kernel" + depends on PREEMPT && X86 + default y + help + If you say Y here then the kernel will use a debug variant of the + commonly used smp_processor_id() function and will print warnings + if kernel code uses it in a preemption-unsafe way. Also, the kernel + will detect preemption count underflows. 
+ config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL && (ALPHA || ARM || X86 || IA64 || M32R || MIPS || PARISC || PPC32 || (SUPERH && !SUPERH64) || SPARC32 || SPARC64 || USERMODE || X86_64) diff -puN lib/kernel_lock.c~remove-the-bkl-by-turning-it-into-a-semaphore lib/kernel_lock.c --- 25/lib/kernel_lock.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-12-03 20:56:38.939840432 -0800 +++ 25-akpm/lib/kernel_lock.c 2004-12-03 20:56:38.962836936 -0800 @@ -7,6 +7,141 @@ */ #include #include +#include + +#if defined(CONFIG_PREEMPT) && defined(__smp_processor_id) && \ + defined(CONFIG_DEBUG_PREEMPT) + +/* + * Debugging check. + */ +unsigned int smp_processor_id(void) +{ + unsigned long preempt_count = preempt_count(); + int this_cpu = __smp_processor_id(); + cpumask_t this_mask; + + if (likely(preempt_count)) + goto out; + + if (irqs_disabled()) + goto out; + + /* + * Kernel threads bound to a single CPU can safely use + * smp_processor_id(): + */ + this_mask = cpumask_of_cpu(this_cpu); + + if (cpus_equal(current->cpus_allowed, this_mask)) + goto out; + + /* + * It is valid to assume CPU-locality during early bootup: + */ + if (system_state != SYSTEM_RUNNING) + goto out; + + /* + * Avoid recursion: + */ + preempt_disable(); + + if (!printk_ratelimit()) + goto out_enable; + + printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid); + print_symbol("caller is %s\n", (long)__builtin_return_address(0)); + dump_stack(); + +out_enable: + preempt_enable_no_resched(); +out: + return this_cpu; +} + +EXPORT_SYMBOL(smp_processor_id); + +#endif /* PREEMPT && __smp_processor_id && DEBUG_PREEMPT */ + +#ifdef CONFIG_PREEMPT_BKL +/* + * The 'big kernel semaphore' + * + * This mutex is taken and released recursively by lock_kernel() + * and unlock_kernel(). It is transparently dropped and reaquired + * over schedule(). 
It is used to protect legacy code that hasn't + * been migrated to a proper locking design yet. + * + * Note: code locked by this semaphore will only be serialized against + * other code using the same locking facility. The code guarantees that + * the task remains on the same CPU. + * + * Don't use in new code. + */ +DECLARE_MUTEX(kernel_sem); + +/* + * Re-acquire the kernel semaphore. + * + * This function is called with preemption off. + * + * We are executing in schedule() so the code must be extremely careful + * about recursion, both due to the down() and due to the enabling of + * preemption. schedule() will re-check the preemption flag after + * reacquiring the semaphore. + */ +int __lockfunc __reacquire_kernel_lock(void) +{ + struct task_struct *task = current; + int saved_lock_depth = task->lock_depth; + + BUG_ON(saved_lock_depth < 0); + + task->lock_depth = -1; + preempt_enable_no_resched(); + + down(&kernel_sem); + + preempt_disable(); + task->lock_depth = saved_lock_depth; + + return 0; +} + +void __lockfunc __release_kernel_lock(void) +{ + up(&kernel_sem); +} + +/* + * Getting the big kernel semaphore. 
+ */ +void __lockfunc lock_kernel(void) +{ + struct task_struct *task = current; + int depth = task->lock_depth + 1; + + if (likely(!depth)) + /* + * No recursion worries - we set up lock_depth _after_ + */ + down(&kernel_sem); + + task->lock_depth = depth; +} + +void __lockfunc unlock_kernel(void) +{ + struct task_struct *task = current; + + BUG_ON(task->lock_depth < 0); + + if (likely(--task->lock_depth < 0)) + up(&kernel_sem); +} + +#else /* * The 'big kernel lock' @@ -34,7 +169,7 @@ static spinlock_t kernel_flag __cachelin * (This works on UP too - _raw_spin_trylock will never * return false in that case) */ -int __lockfunc get_kernel_lock(void) +int __lockfunc __reacquire_kernel_lock(void) { while (!_raw_spin_trylock(&kernel_flag)) { if (test_thread_flag(TIF_NEED_RESCHED)) @@ -45,7 +180,7 @@ int __lockfunc get_kernel_lock(void) return 0; } -void __lockfunc put_kernel_lock(void) +void __lockfunc __release_kernel_lock(void) { _raw_spin_unlock(&kernel_flag); preempt_enable_no_resched(); @@ -122,5 +257,8 @@ void __lockfunc unlock_kernel(void) __unlock_kernel(); } +#endif + EXPORT_SYMBOL(lock_kernel); EXPORT_SYMBOL(unlock_kernel); + _