| author | Ingo Molnar <mingo@elte.hu> | 2005-01-07 21:49:02 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@evo.osdl.org> | 2005-01-07 21:49:02 -0800 |
| commit | 38e387ee01e5a57cd3ed84062930997b87fa3896 (patch) | |
| tree | c3cbc19de0beeceb82408b03a27784e9e44ee701 /kernel | |
| parent | 18f27594d0c5cd2da683252afc8d0933bd64a365 (diff) | |
| download | history-38e387ee01e5a57cd3ed84062930997b87fa3896.tar.gz | |
[PATCH] improve preemption on SMP
SMP locking latencies are one of the last architectural problems that cause
millisecond-range scheduling delays. CONFIG_PREEMPT tries to solve some of
the SMP issues, but there are still lots of problems remaining: spinlocks
nested at multiple levels, spinning with irqs turned off, and non-nested
spinning with preemption turned off permanently.
The nesting problem goes like this: if a piece of kernel code (e.g. the MM
or ext3's journalling code) does the following:
spin_lock(&spinlock_1);
...
spin_lock(&spinlock_2);
...
then even with CONFIG_PREEMPT enabled, current kernels may spin on
spinlock_2 indefinitely. A number of critical sections break their long
paths by using cond_resched_lock(), but this does not break the path on
SMP, because need_resched() *of the other CPU* is not set, so
cond_resched_lock() doesn't notice that a reschedule is due.
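As an illustration, a long critical section using the lock-break pattern
looks like this (the lock and the work functions here are made up for the
example, not part of this patch):

	spin_lock(&big_lock);
	while (more_work(data)) {
		do_one_step(data);
		/*
		 * Drop and reacquire big_lock if a reschedule is
		 * due - and, with this patch, also if another CPU
		 * is spinning on big_lock:
		 */
		cond_resched_lock(&big_lock);
	}
	spin_unlock(&big_lock);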
To solve this problem I've introduced a new spinlock field,
lock->break_lock, which signals towards the holding CPU that a
spinlock-break is requested by another CPU. This field is only set if a
CPU is spinning in a spinlock function [at any locking depth], so the
default overhead is zero. I've extended cond_resched_lock() to check this
flag: if it is set, the lock is dropped and reacquired without calling
schedule(), which saves a full reschedule. I've added the
lock_need_resched(lock) and need_lockbreak(lock) methods to check for the
need to break out of a critical section.
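The two helpers are thin wrappers around the new field and need_resched().
Roughly - this is a sketch, the real definitions live in the sched.h part
of the patch, outside the kernel/-limited diff shown below:

	/* Is another CPU spinning on one of our locks? */
	#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
	# define need_lockbreak(lock)	((lock)->break_lock)
	#else
	# define need_lockbreak(lock)	0
	#endif

	/* Critical section needs breaking: lock-break or reschedule due? */
	static inline int lock_need_resched(spinlock_t *lock)
	{
		if (need_lockbreak(lock) || need_resched())
			return 1;
		return 0;
	}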
Another latency problem was that the stock kernel, even with CONFIG_PREEMPT
enabled, didn't have any spin-nicely preemption logic for the following,
commonly used SMP locking primitives: read_lock(), spin_lock_irqsave(),
spin_lock_irq(), spin_lock_bh(), read_lock_irqsave(), read_lock_irq(),
read_lock_bh(), write_lock_irqsave(), write_lock_irq(), write_lock_bh().
Only spin_lock() and write_lock() [the two simplest cases] were covered.
In addition to the preemption latency problems, the _irq() variants in the
above list didn't do any IRQ-enabling while spinning - possibly resulting in
excessive irqs-off sections of code!
preempt-smp.patch fixes all these latency problems by spinning irq-nicely
(if possible) and by requesting lock-breaks if needed. Two
architecture-level changes were necessary for this: the addition of the
break_lock field to spinlock_t and rwlock_t, and the addition of the
_raw_read_trylock() function.
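Expanding the BUILD_LOCK_OPS() macro from the diff below by hand, the
spin-nicely slowpath for e.g. _spin_lock_irqsave() is equivalent to:

	unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
	{
		unsigned long flags;

		preempt_disable();
		for (;;) {
			local_irq_save(flags);
			if (likely(_raw_spin_trylock(lock)))
				break;
			/*
			 * Spin with irqs enabled and preemption possible,
			 * and ask the lock holder to break out:
			 */
			local_irq_restore(flags);
			preempt_enable();
			if (!lock->break_lock)
				lock->break_lock = 1;
			cpu_relax();
			preempt_disable();
		}
		return flags;
	}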
Testing done by Mark H Johnson and myself indicates SMP latencies comparable
to the UP kernel - while they were essentially unbounded without this
patch.
I successfully test-compiled and test-booted this patch on top of BK-curr
using the following .config combinations: SMP && PREEMPT, !SMP && PREEMPT,
SMP && !PREEMPT and !SMP && !PREEMPT on x86, and !SMP && !PREEMPT and SMP &&
PREEMPT on x64. I also test-booted x86 with the generic_raw_read_trylock
function to check that it works fine. Essentially the same patch has been
in testing as part of the voluntary-preempt patches for some time already.
NOTE to architecture maintainers: generic_raw_read_trylock() is a crude
version that should be replaced with the proper arch-optimized version
ASAP.
From: Hugh Dickins <hugh@veritas.com>
The i386 and x86_64 _raw_read_trylocks in preempt-smp.patch are too
successful: atomic_read() returns a signed integer.
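For illustration, an arch-optimized _raw_read_trylock() in the i386 style
works on the RW_LOCK_BIAS-based reader count - a sketch only, showing the
signed comparison that Hugh's fix is about:

	static inline int _raw_read_trylock(rwlock_t *lock)
	{
		atomic_t *count = (atomic_t *)lock;

		atomic_dec(count);
		/* signed test: a writer drives the count negative */
		if (atomic_read(count) >= 0)
			return 1;
		atomic_inc(count);	/* undo our reader decrement */
		return 0;
	}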
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/sched.c | 31 |
| -rw-r--r-- | kernel/spinlock.c | 235 |
2 files changed, 178 insertions, 88 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 25393c06c0639b..1a05ab700f2330 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3441,6 +3441,37 @@ void __sched __cond_resched(void)
 
 EXPORT_SYMBOL(__cond_resched);
 
+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int cond_resched_lock(spinlock_t * lock)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+	if (lock->break_lock) {
+		lock->break_lock = 0;
+		spin_unlock(lock);
+		cpu_relax();
+		spin_lock(lock);
+	}
+#endif
+	if (need_resched()) {
+		_raw_spin_unlock(lock);
+		preempt_enable_no_resched();
+		set_current_state(TASK_RUNNING);
+		schedule();
+		spin_lock(lock);
+		return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(cond_resched_lock);
+
 /**
  * yield - yield the current processor to other threads.
  *
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 476da1fd86f4cb..b485593430ec1b 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -2,6 +2,8 @@
  * Copyright (2004) Linus Torvalds
  *
  * Author:  Zwane Mwaikambo <zwane@fsmlabs.com>
+ *
+ * Copyright (2004) Ingo Molnar
  */
 
 #include <linux/config.h>
@@ -11,6 +13,17 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 
+/*
+ * Generic declaration of the raw read_trylock() function,
+ * architectures are supposed to optimize this:
+ */
+int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+{
+	_raw_read_lock(lock);
+	return 1;
+}
+EXPORT_SYMBOL(generic_raw_read_trylock);
+
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
 	preempt_disable();
@@ -22,86 +35,29 @@ int __lockfunc _spin_trylock(spinlock_t *lock)
 }
 EXPORT_SYMBOL(_spin_trylock);
 
-int __lockfunc _write_trylock(rwlock_t *lock)
+int __lockfunc _read_trylock(rwlock_t *lock)
 {
 	preempt_disable();
-	if (_raw_write_trylock(lock))
+	if (_raw_read_trylock(lock))
 		return 1;
 
 	preempt_enable();
 	return 0;
 }
-EXPORT_SYMBOL(_write_trylock);
-
-#ifdef CONFIG_PREEMPT
-/*
- * This could be a long-held lock. If another CPU holds it for a long time,
- * and that CPU is not asked to reschedule then *this* CPU will spin on the
- * lock for a long time, even if *this* CPU is asked to reschedule.
- *
- * So what we do here, in the slow (contended) path is to spin on the lock by
- * hand while permitting preemption.
- *
- * Called inside preempt_disable().
- */
-static inline void __preempt_spin_lock(spinlock_t *lock)
-{
-	if (preempt_count() > 1) {
-		_raw_spin_lock(lock);
-		return;
-	}
-
-	do {
-		preempt_enable();
-		while (spin_is_locked(lock))
-			cpu_relax();
-		preempt_disable();
-	} while (!_raw_spin_trylock(lock));
-}
+EXPORT_SYMBOL(_read_trylock);
 
-void __lockfunc _spin_lock(spinlock_t *lock)
+int __lockfunc _write_trylock(rwlock_t *lock)
 {
 	preempt_disable();
-	if (unlikely(!_raw_spin_trylock(lock)))
-		__preempt_spin_lock(lock);
-}
-
-static inline void __preempt_write_lock(rwlock_t *lock)
-{
-	if (preempt_count() > 1) {
-		_raw_write_lock(lock);
-		return;
-	}
-
-	do {
-		preempt_enable();
-		while (rwlock_is_locked(lock))
-			cpu_relax();
-		preempt_disable();
-	} while (!_raw_write_trylock(lock));
-}
+	if (_raw_write_trylock(lock))
+		return 1;
 
-void __lockfunc _write_lock(rwlock_t *lock)
-{
-	preempt_disable();
-	if (unlikely(!_raw_write_trylock(lock)))
-		__preempt_write_lock(lock);
-}
-#else
-void __lockfunc _spin_lock(spinlock_t *lock)
-{
-	preempt_disable();
-	_raw_spin_lock(lock);
+	preempt_enable();
+	return 0;
 }
+EXPORT_SYMBOL(_write_trylock);
 
-void __lockfunc _write_lock(rwlock_t *lock)
-{
-	preempt_disable();
-	_raw_write_lock(lock);
-}
-#endif
-EXPORT_SYMBOL(_spin_lock);
-EXPORT_SYMBOL(_write_lock);
+#ifndef CONFIG_PREEMPT
 
 void __lockfunc _read_lock(rwlock_t *lock)
 {
@@ -110,27 +66,6 @@ void __lockfunc _read_lock(rwlock_t *lock)
 }
 EXPORT_SYMBOL(_read_lock);
 
-void __lockfunc _spin_unlock(spinlock_t *lock)
-{
-	_raw_spin_unlock(lock);
-	preempt_enable();
-}
-EXPORT_SYMBOL(_spin_unlock);
-
-void __lockfunc _write_unlock(rwlock_t *lock)
-{
-	_raw_write_unlock(lock);
-	preempt_enable();
-}
-EXPORT_SYMBOL(_write_unlock);
-
-void __lockfunc _read_unlock(rwlock_t *lock)
-{
-	_raw_read_unlock(lock);
-	preempt_enable();
-}
-EXPORT_SYMBOL(_read_unlock);
-
 unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 {
 	unsigned long flags;
@@ -212,6 +147,130 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
 }
 EXPORT_SYMBOL(_write_lock_bh);
 
+void __lockfunc _spin_lock(spinlock_t *lock)
+{
+	preempt_disable();
+	_raw_spin_lock(lock);
+}
+
+EXPORT_SYMBOL(_spin_lock);
+
+void __lockfunc _write_lock(rwlock_t *lock)
+{
+	preempt_disable();
+	_raw_write_lock(lock);
+}
+
+EXPORT_SYMBOL(_write_lock);
+
+#else /* CONFIG_PREEMPT: */
+
+/*
+ * This could be a long-held lock. We both prepare to spin for a long
+ * time (making _this_ CPU preemptable if possible), and we also signal
+ * towards that other CPU that it should break the lock ASAP.
+ *
+ * (We do this in a function because inlining it would be excessive.)
+ */
+
+#define BUILD_LOCK_OPS(op, locktype) \
+void __lockfunc _##op##_lock(locktype *lock) \
+{ \
+	preempt_disable(); \
+	for (;;) { \
+		if (likely(_raw_##op##_trylock(lock))) \
+			break; \
+		preempt_enable(); \
+		if (!(lock)->break_lock) \
+			(lock)->break_lock = 1; \
+		cpu_relax(); \
+		preempt_disable(); \
+	} \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock); \
+ \
+unsigned long __lockfunc _##op##_lock_irqsave(locktype *lock) \
+{ \
+	unsigned long flags; \
+ \
+	preempt_disable(); \
+	for (;;) { \
+		local_irq_save(flags); \
+		if (likely(_raw_##op##_trylock(lock))) \
+			break; \
+		local_irq_restore(flags); \
+ \
+		preempt_enable(); \
+		if (!(lock)->break_lock) \
+			(lock)->break_lock = 1; \
+		cpu_relax(); \
+		preempt_disable(); \
+	} \
+	return flags; \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_irqsave); \
+ \
+void __lockfunc _##op##_lock_irq(locktype *lock) \
+{ \
+	_##op##_lock_irqsave(lock); \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_irq); \
+ \
+void __lockfunc _##op##_lock_bh(locktype *lock) \
+{ \
+	unsigned long flags; \
+ \
+	/* */ \
+	/* Careful: we must exclude softirqs too, hence the */ \
+	/* irq-disabling. We use the generic preemption-aware */ \
+	/* function: */ \
+	/**/ \
+	flags = _##op##_lock_irqsave(lock); \
+	local_bh_disable(); \
+	local_irq_restore(flags); \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_bh)
+
+/*
+ * Build preemption-friendly versions of the following
+ * lock-spinning functions:
+ *
+ *         _[spin|read|write]_lock()
+ *         _[spin|read|write]_lock_irq()
+ *         _[spin|read|write]_lock_irqsave()
+ *         _[spin|read|write]_lock_bh()
+ */
+BUILD_LOCK_OPS(spin, spinlock_t);
+BUILD_LOCK_OPS(read, rwlock_t);
+BUILD_LOCK_OPS(write, rwlock_t);
+
+#endif /* CONFIG_PREEMPT */
+
+void __lockfunc _spin_unlock(spinlock_t *lock)
+{
+	_raw_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock);
+
+void __lockfunc _write_unlock(rwlock_t *lock)
+{
+	_raw_write_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock);
+
+void __lockfunc _read_unlock(rwlock_t *lock)
+{
+	_raw_read_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock);
+
 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	_raw_spin_unlock(lock);