author		Ingo Molnar <mingo@elte.hu>		2005-01-07 21:49:02 -0800
committer	Linus Torvalds <torvalds@evo.osdl.org>	2005-01-07 21:49:02 -0800
commit		38e387ee01e5a57cd3ed84062930997b87fa3896 (patch)
tree		c3cbc19de0beeceb82408b03a27784e9e44ee701 /kernel
parent		18f27594d0c5cd2da683252afc8d0933bd64a365 (diff)
download	history-38e387ee01e5a57cd3ed84062930997b87fa3896.tar.gz
[PATCH] improve preemption on SMP
SMP locking latencies are one of the last architectural problems that cause
millisecond-category scheduling delays.  CONFIG_PREEMPT tries to solve some
of the SMP issues, but there are still lots of problems remaining: spinlocks
nested at multiple levels, spinning with irqs turned off, and non-nested
spinning with preemption turned off permanently.

The nesting problem goes like this: if a piece of kernel code (e.g. the MM
or ext3's journalling code) does the following:

	spin_lock(&spinlock_1);
	...
	spin_lock(&spinlock_2);
	...

then even with CONFIG_PREEMPT enabled, current kernels may spin on
spinlock_2 indefinitely.  A number of critical sections break their long
paths by using cond_resched_lock(), but this does not break the path on
SMP, because need_resched() *of the other CPU* is not set, so
cond_resched_lock() doesn't notice that a reschedule is due.

To solve this problem I've introduced a new spinlock field,
lock->break_lock, which signals towards the holding CPU that a
spinlock-break is requested by another CPU.  This field is only set if a
CPU is spinning in a spinlock function [at any locking depth], so the
default overhead is zero.  I've extended cond_resched_lock() to check for
this flag - in this case we can also save a reschedule.  I've added the
lock_need_resched(lock) and need_lockbreak(lock) methods to check for the
need to break out of a critical section.

Another latency problem was that the stock kernel, even with CONFIG_PREEMPT
enabled, didn't have any spin-nicely preemption logic for the following,
commonly used SMP locking primitives: read_lock(), spin_lock_irqsave(),
spin_lock_irq(), spin_lock_bh(), read_lock_irqsave(), read_lock_irq(),
read_lock_bh(), write_lock_irqsave(), write_lock_irq(), write_lock_bh().
Only spin_lock() and write_lock() [the two simplest cases] were covered.
In addition to the preemption latency problems, the _irq() variants in the
above list didn't do any IRQ-enabling while spinning - possibly resulting
in excessive irqs-off sections of code!

preempt-smp.patch fixes all these latency problems by spinning irq-nicely
(if possible) and by requesting lock-breaks if needed.  Two
architecture-level changes were necessary for this: the addition of the
break_lock field to spinlock_t and rwlock_t, and the addition of the
_raw_read_trylock() function.

Testing done by Mark H Johnson and myself indicates SMP latencies
comparable to the UP kernel - while they were basically indefinitely high
without this patch.

I successfully test-compiled and test-booted this patch on top of BK-curr
using the following .config combinations: SMP && PREEMPT, !SMP && PREEMPT,
SMP && !PREEMPT and !SMP && !PREEMPT on x86, and !SMP && !PREEMPT and
SMP && PREEMPT on x64.  I also test-booted x86 with the
generic_raw_read_trylock() function to check that it works fine.
Essentially the same patch has been in testing as part of the
voluntary-preempt patches for some time already.

NOTE to architecture maintainers: generic_raw_read_trylock() is a crude
version that should be replaced with the proper arch-optimized version
ASAP.

From: Hugh Dickins <hugh@veritas.com>

The i386 and x86_64 _raw_read_trylocks in preempt-smp.patch are too
successful: atomic_read() returns a signed integer.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
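[Editorial aside: the architecture-level half of the patch is outside this
'kernel'-limited diff.  As a rough sketch only - the real definition lives
in the arch headers, not shown here - the i386-style _raw_read_trylock()
that Hugh's signedness note refers to looks approximately like this:]

	/*
	 * Approximate i386-style _raw_read_trylock().  An unlocked rwlock
	 * holds RW_LOCK_BIAS; each reader decrements the count, a writer
	 * subtracts the whole bias, driving it to zero or below.
	 */
	static inline int _raw_read_trylock(rwlock_t *lock)
	{
		atomic_t *count = (atomic_t *)lock;

		atomic_dec(count);
		if (atomic_read(count) >= 0)	/* signed comparison matters here */
			return 1;
		atomic_inc(count);		/* undo the speculative reader decrement */
		return 0;
	}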
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c		 31
-rw-r--r--	kernel/spinlock.c	235
2 files changed, 178 insertions, 88 deletions
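
[Editorial aside: the break_lock field and the helpers the commit message
names are added in headers outside this diffstat.  A sketch of their
approximate forms, hedged because those headers are not shown here:]

	/* Approximate header-side helpers; exact definitions may differ. */
	#define need_lockbreak(lock) \
		unlikely((lock)->break_lock)

	/* break out if another CPU wants the lock or a reschedule is due */
	#define lock_need_resched(lock) \
		unlikely(need_lockbreak(lock) || need_resched())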
diff --git a/kernel/sched.c b/kernel/sched.c
index 25393c06c0639b..1a05ab700f2330 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3441,6 +3441,37 @@ void __sched __cond_resched(void)
EXPORT_SYMBOL(__cond_resched);
+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int cond_resched_lock(spinlock_t * lock)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+ if (lock->break_lock) {
+ lock->break_lock = 0;
+ spin_unlock(lock);
+ cpu_relax();
+ spin_lock(lock);
+ }
+#endif
+ if (need_resched()) {
+ _raw_spin_unlock(lock);
+ preempt_enable_no_resched();
+ set_current_state(TASK_RUNNING);
+ schedule();
+ spin_lock(lock);
+ return 1;
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL(cond_resched_lock);
+
/**
* yield - yield the current processor to other threads.
*
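
[Editorial aside: to illustrate how cond_resched_lock() above is meant to
be used.  The struct item type and process_one() helper are hypothetical,
not from the patch:]

	/* Hypothetical long critical section broken up by cond_resched_lock(). */
	static void process_table(spinlock_t *lock, struct item *table, int nr)
	{
		int i;

		spin_lock(lock);
		for (i = 0; i < nr; i++) {
			process_one(&table[i]);	/* work done under the lock */

			/*
			 * Drop the lock if this CPU must reschedule, or if
			 * another CPU has set lock->break_lock while spinning
			 * on it.  The lock may be released and reacquired
			 * here, so any state cached from under the lock must
			 * be revalidated afterwards.
			 */
			cond_resched_lock(lock);
		}
		spin_unlock(lock);
	}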
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 476da1fd86f4cb..b485593430ec1b 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -2,6 +2,8 @@
* Copyright (2004) Linus Torvalds
*
* Author: Zwane Mwaikambo <zwane@fsmlabs.com>
+ *
+ * Copyright (2004) Ingo Molnar
*/
#include <linux/config.h>
@@ -11,6 +13,17 @@
#include <linux/interrupt.h>
#include <linux/module.h>
+/*
+ * Generic declaration of the raw read_trylock() function,
+ * architectures are supposed to optimize this:
+ */
+int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+{
+ _raw_read_lock(lock);
+ return 1;
+}
+EXPORT_SYMBOL(generic_raw_read_trylock);
+
int __lockfunc _spin_trylock(spinlock_t *lock)
{
preempt_disable();
@@ -22,86 +35,29 @@ int __lockfunc _spin_trylock(spinlock_t *lock)
}
EXPORT_SYMBOL(_spin_trylock);
-int __lockfunc _write_trylock(rwlock_t *lock)
+int __lockfunc _read_trylock(rwlock_t *lock)
{
preempt_disable();
- if (_raw_write_trylock(lock))
+ if (_raw_read_trylock(lock))
return 1;
preempt_enable();
return 0;
}
-EXPORT_SYMBOL(_write_trylock);
-
-#ifdef CONFIG_PREEMPT
-/*
- * This could be a long-held lock. If another CPU holds it for a long time,
- * and that CPU is not asked to reschedule then *this* CPU will spin on the
- * lock for a long time, even if *this* CPU is asked to reschedule.
- *
- * So what we do here, in the slow (contended) path is to spin on the lock by
- * hand while permitting preemption.
- *
- * Called inside preempt_disable().
- */
-static inline void __preempt_spin_lock(spinlock_t *lock)
-{
- if (preempt_count() > 1) {
- _raw_spin_lock(lock);
- return;
- }
-
- do {
- preempt_enable();
- while (spin_is_locked(lock))
- cpu_relax();
- preempt_disable();
- } while (!_raw_spin_trylock(lock));
-}
+EXPORT_SYMBOL(_read_trylock);
-void __lockfunc _spin_lock(spinlock_t *lock)
+int __lockfunc _write_trylock(rwlock_t *lock)
{
preempt_disable();
- if (unlikely(!_raw_spin_trylock(lock)))
- __preempt_spin_lock(lock);
-}
-
-static inline void __preempt_write_lock(rwlock_t *lock)
-{
- if (preempt_count() > 1) {
- _raw_write_lock(lock);
- return;
- }
-
- do {
- preempt_enable();
- while (rwlock_is_locked(lock))
- cpu_relax();
- preempt_disable();
- } while (!_raw_write_trylock(lock));
-}
+ if (_raw_write_trylock(lock))
+ return 1;
-void __lockfunc _write_lock(rwlock_t *lock)
-{
- preempt_disable();
- if (unlikely(!_raw_write_trylock(lock)))
- __preempt_write_lock(lock);
-}
-#else
-void __lockfunc _spin_lock(spinlock_t *lock)
-{
- preempt_disable();
- _raw_spin_lock(lock);
+ preempt_enable();
+ return 0;
}
+EXPORT_SYMBOL(_write_trylock);
-void __lockfunc _write_lock(rwlock_t *lock)
-{
- preempt_disable();
- _raw_write_lock(lock);
-}
-#endif
-EXPORT_SYMBOL(_spin_lock);
-EXPORT_SYMBOL(_write_lock);
+#ifndef CONFIG_PREEMPT
void __lockfunc _read_lock(rwlock_t *lock)
{
@@ -110,27 +66,6 @@ void __lockfunc _read_lock(rwlock_t *lock)
}
EXPORT_SYMBOL(_read_lock);
-void __lockfunc _spin_unlock(spinlock_t *lock)
-{
- _raw_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(_spin_unlock);
-
-void __lockfunc _write_unlock(rwlock_t *lock)
-{
- _raw_write_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(_write_unlock);
-
-void __lockfunc _read_unlock(rwlock_t *lock)
-{
- _raw_read_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(_read_unlock);
-
unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
{
unsigned long flags;
@@ -212,6 +147,130 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
}
EXPORT_SYMBOL(_write_lock_bh);
+void __lockfunc _spin_lock(spinlock_t *lock)
+{
+ preempt_disable();
+ _raw_spin_lock(lock);
+}
+
+EXPORT_SYMBOL(_spin_lock);
+
+void __lockfunc _write_lock(rwlock_t *lock)
+{
+ preempt_disable();
+ _raw_write_lock(lock);
+}
+
+EXPORT_SYMBOL(_write_lock);
+
+#else /* CONFIG_PREEMPT: */
+
+/*
+ * This could be a long-held lock. We both prepare to spin for a long
+ * time (making _this_ CPU preemptable if possible), and we also signal
+ * towards that other CPU that it should break the lock ASAP.
+ *
+ * (We do this in a function because inlining it would be excessive.)
+ */
+
+#define BUILD_LOCK_OPS(op, locktype) \
+void __lockfunc _##op##_lock(locktype *lock) \
+{ \
+ preempt_disable(); \
+ for (;;) { \
+ if (likely(_raw_##op##_trylock(lock))) \
+ break; \
+ preempt_enable(); \
+ if (!(lock)->break_lock) \
+ (lock)->break_lock = 1; \
+ cpu_relax(); \
+ preempt_disable(); \
+ } \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock); \
+ \
+unsigned long __lockfunc _##op##_lock_irqsave(locktype *lock) \
+{ \
+ unsigned long flags; \
+ \
+ preempt_disable(); \
+ for (;;) { \
+ local_irq_save(flags); \
+ if (likely(_raw_##op##_trylock(lock))) \
+ break; \
+ local_irq_restore(flags); \
+ \
+ preempt_enable(); \
+ if (!(lock)->break_lock) \
+ (lock)->break_lock = 1; \
+ cpu_relax(); \
+ preempt_disable(); \
+ } \
+ return flags; \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_irqsave); \
+ \
+void __lockfunc _##op##_lock_irq(locktype *lock) \
+{ \
+ _##op##_lock_irqsave(lock); \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_irq); \
+ \
+void __lockfunc _##op##_lock_bh(locktype *lock) \
+{ \
+ unsigned long flags; \
+ \
+ /* */ \
+ /* Careful: we must exclude softirqs too, hence the */ \
+ /* irq-disabling. We use the generic preemption-aware */ \
+ /* function: */ \
+ /**/ \
+ flags = _##op##_lock_irqsave(lock); \
+ local_bh_disable(); \
+ local_irq_restore(flags); \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_bh)
+
+/*
+ * Build preemption-friendly versions of the following
+ * lock-spinning functions:
+ *
+ * _[spin|read|write]_lock()
+ * _[spin|read|write]_lock_irq()
+ * _[spin|read|write]_lock_irqsave()
+ * _[spin|read|write]_lock_bh()
+ */
+BUILD_LOCK_OPS(spin, spinlock_t);
+BUILD_LOCK_OPS(read, rwlock_t);
+BUILD_LOCK_OPS(write, rwlock_t);
+
+#endif /* CONFIG_PREEMPT */
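
[Editorial aside: for readability, this is what BUILD_LOCK_OPS(read,
rwlock_t) expands to for the plain lock case - a hand expansion of the
macro above, nothing beyond it:]

	void __lockfunc _read_lock(rwlock_t *lock)
	{
		preempt_disable();
		for (;;) {
			if (likely(_raw_read_trylock(lock)))
				break;
			/* contended: spin preemptibly, ask the holder to break */
			preempt_enable();
			if (!lock->break_lock)
				lock->break_lock = 1;
			cpu_relax();
			preempt_disable();
		}
	}
	EXPORT_SYMBOL(_read_lock);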
+
+void __lockfunc _spin_unlock(spinlock_t *lock)
+{
+ _raw_spin_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock);
+
+void __lockfunc _write_unlock(rwlock_t *lock)
+{
+ _raw_write_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock);
+
+void __lockfunc _read_unlock(rwlock_t *lock)
+{
+ _raw_read_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock);
+
void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
_raw_spin_unlock(lock);