From: Ingo Molnar

SMP locking latencies are one of the last architectural problems that cause
millisecond-category scheduling delays.  CONFIG_PREEMPT tries to solve some of
the SMP issues but there are still lots of problems remaining: spinlocks
nested at multiple levels, spinning with irqs turned off, and non-nested
spinning with preemption turned off permanently.

The nesting problem goes like this: if a piece of kernel code (e.g. the MM or
ext3's journalling code) does the following:

        spin_lock(&spinlock_1);
        ...
        spin_lock(&spinlock_2);
        ...

then even with CONFIG_PREEMPT enabled, current kernels may spin on spinlock_2
indefinitely.  A number of critical sections break their long paths by using
cond_resched_lock(), but this does not break the path on SMP, because
need_resched() *of the other CPU* is not set, so cond_resched_lock() doesn't
notice that a reschedule is due.

To solve this problem I've introduced a new spinlock field, lock->break_lock,
which signals towards the holding CPU that a spinlock-break is requested by
another CPU.  This field is only set if a CPU is spinning in a spinlock
function [at any locking depth], so the default overhead is zero.  I've
extended cond_resched_lock() to check for this flag - in this case we can
also save a reschedule.  I've added the lock_need_resched(lock) and
need_lockbreak(lock) methods to check for the need to break out of a
critical section.

Another latency problem was that the stock kernel, even with CONFIG_PREEMPT
enabled, didn't have any spin-nicely preemption logic for the following,
commonly used SMP locking primitives: read_lock(), spin_lock_irqsave(),
spin_lock_irq(), spin_lock_bh(), read_lock_irqsave(), read_lock_irq(),
read_lock_bh(), write_lock_irqsave(), write_lock_irq(), write_lock_bh().
Only spin_lock() and write_lock() [the two simplest cases] were covered.

In addition to the preemption latency problems, the _irq() variants in the
above list didn't do any IRQ-enabling while spinning - possibly resulting in
excessive irqs-off sections of code!

preempt-smp.patch fixes all these latency problems by spinning irq-nicely
(if possible) and by requesting lock-breaks if needed.  Two
architecture-level changes were necessary for this: the addition of the
break_lock field to spinlock_t and rwlock_t, and the addition of the
_raw_read_trylock() function.

Testing done by Mark H Johnson and myself indicates SMP latencies comparable
to the UP kernel - while they were basically indefinitely high without this
patch.

I successfully test-compiled and test-booted this patch on top of BK-curr
using the following .config combinations: SMP && PREEMPT, !SMP && PREEMPT,
SMP && !PREEMPT and !SMP && !PREEMPT on x86, and !SMP && !PREEMPT and
SMP && PREEMPT on x64.  I also test-booted x86 with the
generic_raw_read_trylock() function to check that it works fine.
Essentially the same patch has been in testing as part of the
voluntary-preempt patches for some time already.

NOTE to architecture maintainers: generic_raw_read_trylock() is a crude
version that should be replaced with the proper arch-optimized version ASAP.
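For illustration only - a minimal sketch (not part of this patch) of how a
long-running critical section could use these primitives to break the lock
voluntarily.  The my_table_lock, MY_TABLE_SIZE and examine_slot() names below
are made up, and the scan is assumed to tolerate the lock being dropped and
reacquired between slots:

        #include <linux/spinlock.h>
        #include <linux/sched.h>

        static spinlock_t my_table_lock = SPIN_LOCK_UNLOCKED;

        static void scan_table(void)
        {
                int i;

                spin_lock(&my_table_lock);
                for (i = 0; i < MY_TABLE_SIZE; i++) {
                        examine_slot(i);
                        /*
                         * Break the critical section if another CPU is
                         * spinning on the lock (lock->break_lock) or a
                         * reschedule is due on this CPU:
                         */
                        if (lock_need_resched(&my_table_lock))
                                cond_resched_lock(&my_table_lock);
                }
                spin_unlock(&my_table_lock);
        }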
Signed-off-by: Ingo Molnar
Signed-off-by: Andrew Morton
---

 25-akpm/include/asm-alpha/spinlock.h   |    6 
 25-akpm/include/asm-arm/spinlock.h     |    8 +
 25-akpm/include/asm-i386/spinlock.h    |   16 ++
 25-akpm/include/asm-ia64/spinlock.h    |    8 +
 25-akpm/include/asm-mips/spinlock.h    |    8 +
 25-akpm/include/asm-parisc/spinlock.h  |    5 
 25-akpm/include/asm-parisc/system.h    |    3 
 25-akpm/include/asm-ppc/spinlock.h     |    8 +
 25-akpm/include/asm-ppc64/spinlock.h   |    8 +
 25-akpm/include/asm-s390/spinlock.h    |    8 +
 25-akpm/include/asm-sh/spinlock.h      |    8 +
 25-akpm/include/asm-sparc/spinlock.h   |   23 ++-
 25-akpm/include/asm-sparc64/spinlock.h |    2 
 25-akpm/include/asm-x86_64/spinlock.h  |   16 ++
 25-akpm/include/linux/sched.h          |   18 --
 25-akpm/include/linux/spinlock.h       |   23 ++-
 25-akpm/kernel/sched.c                 |   31 ++++
 25-akpm/kernel/spinlock.c              |  235 ++++++++++++++++++++-------------
 18 files changed, 320 insertions(+), 114 deletions(-)

diff -puN include/asm-alpha/spinlock.h~preempt-smp include/asm-alpha/spinlock.h
--- 25/include/asm-alpha/spinlock.h~preempt-smp 2004-11-30 01:23:40.882552016 -0800
+++ 25-akpm/include/asm-alpha/spinlock.h 2004-11-30 01:23:40.909547912 -0800
@@ -23,6 +23,9 @@ typedef struct {
         struct task_struct * task;
         const char *base_file;
 #endif
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } spinlock_t;
 
 #ifdef CONFIG_DEBUG_SPINLOCK
@@ -96,6 +99,9 @@ static inline int _raw_spin_trylock(spin
 typedef struct {
         volatile unsigned int write_lock:1, read_counter:31;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } /*__attribute__((aligned(32)))*/ rwlock_t;
 
 #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 }
diff -puN include/asm-arm/spinlock.h~preempt-smp include/asm-arm/spinlock.h
--- 25/include/asm-arm/spinlock.h~preempt-smp 2004-11-30 01:23:40.883551864 -0800
+++ 25-akpm/include/asm-arm/spinlock.h 2004-11-30 01:23:40.910547760 -0800
@@ -17,6 +17,9 @@
  */
 typedef struct {
         volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } spinlock_t;
 
 #define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
@@ -70,6 +73,9 @@ static inline void _raw_spin_unlock(spin
  */
 typedef struct {
         volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } rwlock_t;
 
 #define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
@@ -143,6 +149,8 @@ static inline void _raw_read_unlock(rwlo
         : "cc", "memory");
 }
 
+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
 static inline int _raw_write_trylock(rwlock_t *rw)
 {
         unsigned long tmp;
diff -puN include/asm-i386/spinlock.h~preempt-smp include/asm-i386/spinlock.h
--- 25/include/asm-i386/spinlock.h~preempt-smp 2004-11-30 01:23:40.884551712 -0800
+++ 25-akpm/include/asm-i386/spinlock.h 2004-11-30 01:23:40.910547760 -0800
@@ -19,6 +19,9 @@ typedef struct {
 #ifdef CONFIG_DEBUG_SPINLOCK
         unsigned magic;
 #endif
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } spinlock_t;
 
 #define SPINLOCK_MAGIC 0xdead4ead
@@ -166,6 +169,9 @@ typedef struct {
 #ifdef CONFIG_DEBUG_SPINLOCK
         unsigned magic;
 #endif
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } rwlock_t;
 
 #define RWLOCK_MAGIC 0xdeaf1eed
@@ -212,6 +218,16 @@ static inline void _raw_write_lock(rwloc
 #define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
 #define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
 
+static inline int _raw_read_trylock(rwlock_t *lock)
+{
+        atomic_t *count = (atomic_t *)lock;
+        atomic_dec(count);
+        if (atomic_read(count) < RW_LOCK_BIAS)
+                return 1;
+        atomic_inc(count);
+        return 0;
+}
+
 static inline int _raw_write_trylock(rwlock_t *lock)
 {
         atomic_t *count = (atomic_t *)lock;
diff -puN include/asm-ia64/spinlock.h~preempt-smp include/asm-ia64/spinlock.h
--- 25/include/asm-ia64/spinlock.h~preempt-smp 2004-11-30 01:23:40.886551408 -0800
+++ 25-akpm/include/asm-ia64/spinlock.h 2004-11-30 01:23:40.911547608 -0800
@@ -19,6 +19,9 @@
 
 typedef struct {
         volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } spinlock_t;
 
 #define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
@@ -116,6 +119,9 @@ do { \
 typedef struct {
         volatile unsigned int read_counter : 31;
         volatile unsigned int write_lock : 1;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } rwlock_t;
 
 #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 }
@@ -190,6 +196,8 @@ do { \
 
 #endif /* !ASM_SUPPORTED */
 
+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
 #define _raw_write_unlock(x) \
 ({ \
         smp_mb__before_clear_bit(); /* need barrier before releasing lock... */ \
diff -puN include/asm-mips/spinlock.h~preempt-smp include/asm-mips/spinlock.h
--- 25/include/asm-mips/spinlock.h~preempt-smp 2004-11-30 01:23:40.887551256 -0800
+++ 25-akpm/include/asm-mips/spinlock.h 2004-11-30 01:23:40.911547608 -0800
@@ -17,6 +17,9 @@
 
 typedef struct {
         volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } spinlock_t;
 
 #define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
@@ -127,6 +130,9 @@ static inline unsigned int _raw_spin_try
 
 typedef struct {
         volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } rwlock_t;
 
 #define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
@@ -246,6 +252,8 @@ static inline void _raw_write_unlock(rwl
         : "memory");
 }
 
+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
 static inline int _raw_write_trylock(rwlock_t *rw)
 {
         unsigned int tmp;
diff -puN include/asm-parisc/spinlock.h~preempt-smp include/asm-parisc/spinlock.h
--- 25/include/asm-parisc/spinlock.h~preempt-smp 2004-11-30 01:23:40.889550952 -0800
+++ 25-akpm/include/asm-parisc/spinlock.h 2004-11-30 01:23:40.912547456 -0800
@@ -142,6 +142,9 @@ do { \
 typedef struct {
         spinlock_t lock;
         volatile int counter;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } rwlock_t;
 
 #define RW_LOCK_UNLOCKED (rwlock_t) { __SPIN_LOCK_UNLOCKED, 0 }
@@ -150,6 +153,8 @@ typedef struct {
 
 #define rwlock_is_locked(lp) ((lp)->counter != 0)
 
+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
 /* read_lock, read_unlock are pretty straightforward.  Of course it somehow
  * sucks we end up saving/restoring flags twice for read_lock_irqsave aso.
  */
diff -puN include/asm-parisc/system.h~preempt-smp include/asm-parisc/system.h
--- 25/include/asm-parisc/system.h~preempt-smp 2004-11-30 01:23:40.890550800 -0800
+++ 25-akpm/include/asm-parisc/system.h 2004-11-30 01:23:40.912547456 -0800
@@ -176,6 +176,9 @@ typedef struct {
         void *previous;
         struct task_struct * task;
 #endif
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } spinlock_t;
 
 #define __lock_aligned __attribute__((__section__(".data.lock_aligned")))
diff -puN include/asm-ppc64/spinlock.h~preempt-smp include/asm-ppc64/spinlock.h
--- 25/include/asm-ppc64/spinlock.h~preempt-smp 2004-11-30 01:23:40.892550496 -0800
+++ 25-akpm/include/asm-ppc64/spinlock.h 2004-11-30 01:23:40.913547304 -0800
@@ -23,10 +23,16 @@
 
 typedef struct {
         volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } spinlock_t;
 
 typedef struct {
         volatile signed int lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } rwlock_t;
 
 #ifdef __KERNEL__
@@ -216,6 +222,8 @@ static void __inline__ _raw_read_unlock(
         : "cr0", "memory");
 }
 
+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
 /*
  * This returns the old value in the lock,
  * so we got the write lock if the return value is 0.
diff -puN include/asm-ppc/spinlock.h~preempt-smp include/asm-ppc/spinlock.h
--- 25/include/asm-ppc/spinlock.h~preempt-smp 2004-11-30 01:23:40.893550344 -0800
+++ 25-akpm/include/asm-ppc/spinlock.h 2004-11-30 01:23:40.913547304 -0800
@@ -13,6 +13,9 @@ typedef struct {
         volatile unsigned long owner_pc;
         volatile unsigned long owner_cpu;
 #endif
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } spinlock_t;
 
 #ifdef __KERNEL__
@@ -83,6 +86,9 @@ typedef struct {
 #ifdef CONFIG_DEBUG_SPINLOCK
         volatile unsigned long owner_pc;
 #endif
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } rwlock_t;
 
 #ifdef CONFIG_DEBUG_SPINLOCK
@@ -192,5 +198,7 @@ extern int _raw_write_trylock(rwlock_t *
 
 #endif
 
+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
 #endif /* __ASM_SPINLOCK_H */
 #endif /* __KERNEL__ */
diff -puN include/asm-s390/spinlock.h~preempt-smp include/asm-s390/spinlock.h
--- 25/include/asm-s390/spinlock.h~preempt-smp 2004-11-30 01:23:40.894550192 -0800
+++ 25-akpm/include/asm-s390/spinlock.h 2004-11-30 01:23:40.914547152 -0800
@@ -36,6 +36,9 @@
 
 typedef struct {
         volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } __attribute__ ((aligned (4))) spinlock_t;
 
 #define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
@@ -105,6 +108,9 @@ extern inline void _raw_spin_unlock(spin
 typedef struct {
         volatile unsigned long lock;
         volatile unsigned long owner_pc;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } rwlock_t;
 
 #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 }
@@ -211,6 +217,8 @@ typedef struct {
           "m" ((rw)->lock) : "2", "3", "cc", "memory" )
 #endif /* __s390x__ */
 
+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
 extern inline int _raw_write_trylock(rwlock_t *rw)
 {
         unsigned long result, reg;
diff -puN include/asm-sh/spinlock.h~preempt-smp include/asm-sh/spinlock.h
--- 25/include/asm-sh/spinlock.h~preempt-smp 2004-11-30 01:23:40.895550040 -0800
+++ 25-akpm/include/asm-sh/spinlock.h 2004-11-30 01:23:40.914547152 -0800
@@ -17,6 +17,9 @@
  */
 typedef struct {
         volatile unsigned long lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } spinlock_t;
 
 #define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
@@ -68,6 +71,9 @@ static inline void _raw_spin_unlock(spin
 typedef struct {
         spinlock_t lock;
         atomic_t counter;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } rwlock_t;
 
 #define RW_LOCK_BIAS 0x01000000
@@ -105,6 +111,8 @@ static inline void _raw_write_unlock(rwl
         _raw_spin_unlock(&rw->lock);
 }
 
+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
 static inline int _raw_write_trylock(rwlock_t *rw)
 {
         if (atomic_sub_and_test(RW_LOCK_BIAS, &rw->counter))
diff -puN include/asm-sparc64/spinlock.h~preempt-smp include/asm-sparc64/spinlock.h
--- 25/include/asm-sparc64/spinlock.h~preempt-smp 2004-11-30 01:23:40.897549736 -0800
+++ 25-akpm/include/asm-sparc64/spinlock.h 2004-11-30 01:23:40.915547000 -0800
@@ -304,6 +304,8 @@ do { unsigned long flags; \
 
 #endif /* CONFIG_DEBUG_SPINLOCK */
 
+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
 #endif /* !(__ASSEMBLY__) */
 
 #endif /* !(__SPARC64_SPINLOCK_H) */
diff -puN include/asm-sparc/spinlock.h~preempt-smp include/asm-sparc/spinlock.h
--- 25/include/asm-sparc/spinlock.h~preempt-smp 2004-11-30 01:23:40.898549584 -0800
+++ 25-akpm/include/asm-sparc/spinlock.h 2004-11-30 01:23:40.915547000 -0800
@@ -16,6 +16,9 @@
 struct _spinlock_debug {
         unsigned char lock;
         unsigned long owner_pc;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 };
 typedef struct _spinlock_debug spinlock_t;
 
@@ -36,6 +39,9 @@
 struct _rwlock_debug {
         volatile unsigned int lock;
         unsigned long owner_pc;
         unsigned long reader_pc[NR_CPUS];
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 };
 typedef struct _rwlock_debug rwlock_t;
 
@@ -79,8 +85,14 @@ do { unsigned long flags; \
 
 #else /* !CONFIG_DEBUG_SPINLOCK */
 
-typedef unsigned char spinlock_t;
-#define SPIN_LOCK_UNLOCKED 0
+typedef struct {
+        unsigned char lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
+} spinlock_t;
+
+#define SPIN_LOCK_UNLOCKED { 0, }
 
 #define spin_lock_init(lock)   (*((unsigned char *)(lock)) = 0)
 #define spin_is_locked(lock)    (*((volatile unsigned char *)(lock)) != 0)
@@ -137,7 +149,12 @@ extern __inline__ void _raw_spin_unlock(
  * XXX This might create some problems with my dual spinlock
  * XXX scheme, deadlocks etc. -DaveM
  */
-typedef struct { volatile unsigned int lock; } rwlock_t;
+typedef struct {
+        volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
+} rwlock_t;
 
 #define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
diff -puN include/asm-x86_64/spinlock.h~preempt-smp include/asm-x86_64/spinlock.h
--- 25/include/asm-x86_64/spinlock.h~preempt-smp 2004-11-30 01:23:40.900549280 -0800
+++ 25-akpm/include/asm-x86_64/spinlock.h 2004-11-30 01:23:40.916546848 -0800
@@ -18,6 +18,9 @@ typedef struct {
 #ifdef CONFIG_DEBUG_SPINLOCK
         unsigned magic;
 #endif
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } spinlock_t;
 
 #define SPINLOCK_MAGIC 0xdead4ead
@@ -139,6 +142,9 @@ typedef struct {
 #ifdef CONFIG_DEBUG_SPINLOCK
         unsigned magic;
 #endif
+#ifdef CONFIG_PREEMPT
+        unsigned int break_lock;
+#endif
 } rwlock_t;
 
 #define RWLOCK_MAGIC 0xdeaf1eed
@@ -185,6 +191,16 @@ static inline void _raw_write_lock(rwloc
 #define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
 #define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
 
+static inline int _raw_read_trylock(rwlock_t *lock)
+{
+        atomic_t *count = (atomic_t *)lock;
+        atomic_dec(count);
+        if (atomic_read(count) < RW_LOCK_BIAS)
+                return 1;
+        atomic_inc(count);
+        return 0;
+}
+
 static inline int _raw_write_trylock(rwlock_t *lock)
 {
         atomic_t *count = (atomic_t *)lock;
diff -puN include/linux/sched.h~preempt-smp include/linux/sched.h
--- 25/include/linux/sched.h~preempt-smp 2004-11-30 01:23:40.901549128 -0800
+++ 25-akpm/include/linux/sched.h 2004-11-30 01:23:40.917546696 -0800
@@ -1053,23 +1053,7 @@ static inline void cond_resched(void)
                 __cond_resched();
 }
 
-/*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
- * call schedule, and on return reacquire the lock.
- *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
- * operations here to prevent schedule() from being called twice (once via
- * spin_unlock(), once by hand).
- */
-static inline void cond_resched_lock(spinlock_t * lock)
-{
-        if (need_resched()) {
-                _raw_spin_unlock(lock);
-                preempt_enable_no_resched();
-                __cond_resched();
-                spin_lock(lock);
-        }
-}
+extern int cond_resched_lock(spinlock_t * lock);
 
 /* Reevaluate whether the task has signals pending delivery.
    This is required every time the blocked sigset_t changes.
diff -puN include/linux/spinlock.h~preempt-smp include/linux/spinlock.h
--- 25/include/linux/spinlock.h~preempt-smp 2004-11-30 01:23:40.902548976 -0800
+++ 25-akpm/include/linux/spinlock.h 2004-11-30 01:23:40.918546544 -0800
@@ -47,6 +47,7 @@
 #include
 
 int __lockfunc _spin_trylock(spinlock_t *lock);
+int __lockfunc _read_trylock(rwlock_t *lock);
 int __lockfunc _write_trylock(rwlock_t *lock);
 
 void __lockfunc _spin_lock(spinlock_t *lock) __acquires(spinlock_t);
@@ -79,6 +80,7 @@ void __lockfunc _write_unlock_irq(rwlock
 void __lockfunc _write_unlock_bh(rwlock_t *lock) __releases(rwlock_t);
 
 int __lockfunc _spin_trylock_bh(spinlock_t *lock);
+int __lockfunc generic_raw_read_trylock(rwlock_t *lock);
 int in_lock_functions(unsigned long addr);
 
 #else
@@ -231,11 +233,15 @@ typedef struct {
 #define _raw_read_unlock(lock) do { (void)(lock); } while(0)
 #define _raw_write_lock(lock) do { (void)(lock); } while(0)
 #define _raw_write_unlock(lock) do { (void)(lock); } while(0)
+#define _raw_read_trylock(lock) ({ (void)(lock); (1); })
 #define _raw_write_trylock(lock) ({ (void)(lock); (1); })
 
 #define _spin_trylock(lock) ({preempt_disable(); _raw_spin_trylock(lock) ? \
                                 1 : ({preempt_enable(); 0;});})
+#define _read_trylock(lock) ({preempt_disable();_raw_read_trylock(lock) ? \
+                                1 : ({preempt_enable(); 0;});})
+
 #define _write_trylock(lock) ({preempt_disable(); _raw_write_trylock(lock) ? \
                                 1 : ({preempt_enable(); 0;});})
@@ -437,16 +443,12 @@ do { \
  * methods are defined as nops in the case they are not required.
  */
 #define spin_trylock(lock) __cond_lock(_spin_trylock(lock))
+#define read_trylock(lock) __cond_lock(_read_trylock(lock))
 #define write_trylock(lock) __cond_lock(_write_trylock(lock))
 
-/* Where's read_trylock? */
-
 #define spin_lock(lock) _spin_lock(lock)
 #define write_lock(lock) _write_lock(lock)
 #define read_lock(lock) _read_lock(lock)
-#define spin_unlock(lock) _spin_unlock(lock)
-#define write_unlock(lock) _write_unlock(lock)
-#define read_unlock(lock) _read_unlock(lock)
 
 #ifdef CONFIG_SMP
 #define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock)
@@ -466,6 +468,11 @@ do { \
 #define write_lock_irq(lock) _write_lock_irq(lock)
 #define write_lock_bh(lock) _write_lock_bh(lock)
+
+#define spin_unlock(lock) _spin_unlock(lock)
+#define write_unlock(lock) _write_unlock(lock)
+#define read_unlock(lock) _read_unlock(lock)
+
 #define spin_unlock_irqrestore(lock, flags) _spin_unlock_irqrestore(lock, flags)
 #define spin_unlock_irq(lock) _spin_unlock_irq(lock)
 #define spin_unlock_bh(lock) _spin_unlock_bh(lock)
@@ -502,6 +509,7 @@ extern void _metered_read_lock    (rwloc
 extern void _metered_read_unlock  (rwlock_t *lock);
 extern void _metered_write_lock   (rwlock_t *lock);
 extern void _metered_write_unlock (rwlock_t *lock);
+extern int  _metered_read_trylock (rwlock_t *lock);
 extern int  _metered_write_trylock(rwlock_t *lock);
 #endif
 
@@ -531,8 +539,11 @@ static inline void bit_spin_lock(int bit
         preempt_disable();
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
         while (test_and_set_bit(bitnum, addr)) {
-                while (test_bit(bitnum, addr))
+                while (test_bit(bitnum, addr)) {
+                        preempt_enable();
                         cpu_relax();
+                        preempt_disable();
+                }
         }
 #endif
         __acquire(bitlock);
diff -puN kernel/sched.c~preempt-smp kernel/sched.c
--- 25/kernel/sched.c~preempt-smp 2004-11-30 01:23:40.904548672 -0800
+++ 25-akpm/kernel/sched.c 2004-11-30 01:23:40.922545936 -0800
@@ -3458,6 +3458,37 @@ void __sched __cond_resched(void)
 
 EXPORT_SYMBOL(__cond_resched);
 
+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int cond_resched_lock(spinlock_t * lock)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+        if (lock->break_lock) {
+                lock->break_lock = 0;
+                spin_unlock(lock);
+                cpu_relax();
+                spin_lock(lock);
+        }
+#endif
+        if (need_resched()) {
+                _raw_spin_unlock(lock);
+                preempt_enable_no_resched();
+                set_current_state(TASK_RUNNING);
+                schedule();
+                spin_lock(lock);
+                return 1;
+        }
+        return 0;
+}
+
+EXPORT_SYMBOL(cond_resched_lock);
+
 /**
  * yield - yield the current processor to other threads.
  *
diff -puN kernel/spinlock.c~preempt-smp kernel/spinlock.c
--- 25/kernel/spinlock.c~preempt-smp 2004-11-30 01:23:40.906548368 -0800
+++ 25-akpm/kernel/spinlock.c 2004-11-30 01:23:40.923545784 -0800
@@ -2,6 +2,8 @@
  * Copyright (2004) Linus Torvalds
  *
  * Author: Zwane Mwaikambo
+ *
+ * Copyright (2004) Ingo Molnar
  */
 
 #include
@@ -11,6 +13,17 @@
 #include
 #include
 
+/*
+ * Generic declaration of the raw read_trylock() function,
+ * architectures are supposed to optimize this:
+ */
+int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+{
+        _raw_read_lock(lock);
+        return 1;
+}
+EXPORT_SYMBOL(generic_raw_read_trylock);
+
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
         preempt_disable();
@@ -22,86 +35,29 @@ int __lockfunc _spin_trylock(spinlock_t
 }
 EXPORT_SYMBOL(_spin_trylock);
 
-int __lockfunc _write_trylock(rwlock_t *lock)
+int __lockfunc _read_trylock(rwlock_t *lock)
 {
         preempt_disable();
-        if (_raw_write_trylock(lock))
+        if (_raw_read_trylock(lock))
                 return 1;
 
         preempt_enable();
         return 0;
 }
-EXPORT_SYMBOL(_write_trylock);
-
-#ifdef CONFIG_PREEMPT
-/*
- * This could be a long-held lock. If another CPU holds it for a long time,
- * and that CPU is not asked to reschedule then *this* CPU will spin on the
- * lock for a long time, even if *this* CPU is asked to reschedule.
- *
- * So what we do here, in the slow (contended) path is to spin on the lock by
- * hand while permitting preemption.
- *
- * Called inside preempt_disable().
- */
-static inline void __preempt_spin_lock(spinlock_t *lock)
-{
-        if (preempt_count() > 1) {
-                _raw_spin_lock(lock);
-                return;
-        }
-
-        do {
-                preempt_enable();
-                while (spin_is_locked(lock))
-                        cpu_relax();
-                preempt_disable();
-        } while (!_raw_spin_trylock(lock));
-}
+EXPORT_SYMBOL(_read_trylock);
 
-void __lockfunc _spin_lock(spinlock_t *lock)
+int __lockfunc _write_trylock(rwlock_t *lock)
 {
         preempt_disable();
-        if (unlikely(!_raw_spin_trylock(lock)))
-                __preempt_spin_lock(lock);
-}
-
-static inline void __preempt_write_lock(rwlock_t *lock)
-{
-        if (preempt_count() > 1) {
-                _raw_write_lock(lock);
-                return;
-        }
-
-        do {
-                preempt_enable();
-                while (rwlock_is_locked(lock))
-                        cpu_relax();
-                preempt_disable();
-        } while (!_raw_write_trylock(lock));
-}
+        if (_raw_write_trylock(lock))
+                return 1;
 
-void __lockfunc _write_lock(rwlock_t *lock)
-{
-        preempt_disable();
-        if (unlikely(!_raw_write_trylock(lock)))
-                __preempt_write_lock(lock);
-}
-#else
-void __lockfunc _spin_lock(spinlock_t *lock)
-{
-        preempt_disable();
-        _raw_spin_lock(lock);
+        preempt_enable();
+        return 0;
 }
+EXPORT_SYMBOL(_write_trylock);
 
-void __lockfunc _write_lock(rwlock_t *lock)
-{
-        preempt_disable();
-        _raw_write_lock(lock);
-}
-#endif
-EXPORT_SYMBOL(_spin_lock);
-EXPORT_SYMBOL(_write_lock);
+#ifndef CONFIG_PREEMPT
 
 void __lockfunc _read_lock(rwlock_t *lock)
 {
@@ -110,27 +66,6 @@ void __lockfunc _read_lock(rwlock_t *loc
 }
 EXPORT_SYMBOL(_read_lock);
 
-void __lockfunc _spin_unlock(spinlock_t *lock)
-{
-        _raw_spin_unlock(lock);
-        preempt_enable();
-}
-EXPORT_SYMBOL(_spin_unlock);
-
-void __lockfunc _write_unlock(rwlock_t *lock)
-{
-        _raw_write_unlock(lock);
-        preempt_enable();
-}
-EXPORT_SYMBOL(_write_unlock);
-
-void __lockfunc _read_unlock(rwlock_t *lock)
-{
-        _raw_read_unlock(lock);
-        preempt_enable();
-}
-EXPORT_SYMBOL(_read_unlock);
-
 unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 {
         unsigned long flags;
@@ -212,6 +147,130 @@ void __lockfunc _write_lock_bh(rwlock_t
 }
 EXPORT_SYMBOL(_write_lock_bh);
 
+void __lockfunc _spin_lock(spinlock_t *lock)
+{
+        preempt_disable();
+        _raw_spin_lock(lock);
+}
+
+EXPORT_SYMBOL(_spin_lock);
+
+void __lockfunc _write_lock(rwlock_t *lock)
+{
+        preempt_disable();
+        _raw_write_lock(lock);
+}
+
+EXPORT_SYMBOL(_write_lock);
+
+#else /* CONFIG_PREEMPT: */
+
+/*
+ * This could be a long-held lock. We both prepare to spin for a long
+ * time (making _this_ CPU preemptable if possible), and we also signal
+ * towards that other CPU that it should break the lock ASAP.
+ *
+ * (We do this in a function because inlining it would be excessive.)
+ */
+
+#define BUILD_LOCK_OPS(op, locktype) \
+void __lockfunc _##op##_lock(locktype *lock) \
+{ \
+        preempt_disable(); \
+        for (;;) { \
+                if (likely(_raw_##op##_trylock(lock))) \
+                        break; \
+                preempt_enable(); \
+                if (!(lock)->break_lock) \
+                        (lock)->break_lock = 1; \
+                cpu_relax(); \
+                preempt_disable(); \
+        } \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock); \
+ \
+unsigned long __lockfunc _##op##_lock_irqsave(locktype *lock) \
+{ \
+        unsigned long flags; \
+ \
+        preempt_disable(); \
+        for (;;) { \
+                local_irq_save(flags); \
+                if (likely(_raw_##op##_trylock(lock))) \
+                        break; \
+                local_irq_restore(flags); \
+ \
+                preempt_enable(); \
+                if (!(lock)->break_lock) \
+                        (lock)->break_lock = 1; \
+                cpu_relax(); \
+                preempt_disable(); \
+        } \
+        return flags; \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_irqsave); \
+ \
+void __lockfunc _##op##_lock_irq(locktype *lock) \
+{ \
+        _##op##_lock_irqsave(lock); \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_irq); \
+ \
+void __lockfunc _##op##_lock_bh(locktype *lock) \
+{ \
+        unsigned long flags; \
+ \
+        /*							*/ \
+        /* Careful: we must exclude softirqs too, hence the	*/ \
+        /* irq-disabling. We use the generic preemption-aware	*/ \
+        /* function:						*/ \
+        /**/ \
+        flags = _##op##_lock_irqsave(lock); \
+        local_bh_disable(); \
+        local_irq_restore(flags); \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_bh)
+
+/*
+ * Build preemption-friendly versions of the following
+ * lock-spinning functions:
+ *
+ *         _[spin|read|write]_lock()
+ *         _[spin|read|write]_lock_irq()
+ *         _[spin|read|write]_lock_irqsave()
+ *         _[spin|read|write]_lock_bh()
+ */
+BUILD_LOCK_OPS(spin, spinlock_t);
+BUILD_LOCK_OPS(read, rwlock_t);
+BUILD_LOCK_OPS(write, rwlock_t);
+
+#endif /* CONFIG_PREEMPT */
+
+void __lockfunc _spin_unlock(spinlock_t *lock)
+{
+        _raw_spin_unlock(lock);
+        preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock);
+
+void __lockfunc _write_unlock(rwlock_t *lock)
+{
+        _raw_write_unlock(lock);
+        preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock);
+
+void __lockfunc _read_unlock(rwlock_t *lock)
+{
+        _raw_read_unlock(lock);
+        preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock);
+
 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
         _raw_spin_unlock(lock);
         _