From: Anton Blanchard

> Anton, the ia32 semaphores will, when down() fails, leave extra refs
> against the semaphore's count when extra tasks are waiting. I think. So
> if ls is waiting for the sem, and tiobench releases it and then quickly
> tries to reacquire it, tiobench will see the negative count in the
> semaphore due to ls's presence and will enter the slow path. What we want
> to do there is to make sure that tiobench will go to sleep rather than
> zooming in and stealing the semaphore again.

Definitely sounds reasonable. I wonder why we don't do that.

Quick test... replace our semaphores with x86 versions. I'm trying to
replicate here.


 arch/ppc64/kernel/semaphore.c |  138 ++++++++++++++++++++++++++++++++++++++++++
 include/asm-ppc64/semaphore.h |    3 
 2 files changed, 141 insertions(+)

diff -puN arch/ppc64/kernel/semaphore.c~ppc64-semaphore-reimplementation arch/ppc64/kernel/semaphore.c
--- 25/arch/ppc64/kernel/semaphore.c~ppc64-semaphore-reimplementation	2003-06-26 17:33:31.000000000 -0700
+++ 25-akpm/arch/ppc64/kernel/semaphore.c	2003-06-26 17:33:31.000000000 -0700
@@ -21,6 +21,7 @@
 #include
 #include
 
+#if 0
 /*
  * Atomically update sem->count.
  * This does the equivalent of the following:
@@ -129,3 +130,140 @@ int __down_interruptible(struct semaphor
 	wake_up(&sem->wait);
 	return retval;
 }
+#else
+
+static __inline__ int atomic_add_negative(int i, atomic_t *v)
+{
+	if (atomic_add_return(i, v) < 0)
+		return 1;
+	else
+		return 0;
+}
+
+void __up(struct semaphore *sem)
+{
+	wake_up(&sem->wait);
+}
+
+void __down(struct semaphore * sem)
+{
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	unsigned long flags;
+
+	tsk->state = TASK_UNINTERRUPTIBLE;
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+	sem->sleepers++;
+	for (;;) {
+		int sleepers = sem->sleepers;
+
+		/*
+		 * Add "everybody else" into it. They aren't
+		 * playing, because we own the spinlock in
+		 * the wait_queue_head.
+		 */
+		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
+			sem->sleepers = 0;
+			break;
+		}
+		sem->sleepers = 1;	/* us - see -1 above */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		schedule();
+
+		spin_lock_irqsave(&sem->wait.lock, flags);
+		tsk->state = TASK_UNINTERRUPTIBLE;
+	}
+	remove_wait_queue_locked(&sem->wait, &wait);
+	wake_up_locked(&sem->wait);
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+	tsk->state = TASK_RUNNING;
+}
+
+int __down_interruptible(struct semaphore * sem)
+{
+	int retval = 0;
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	unsigned long flags;
+
+	tsk->state = TASK_INTERRUPTIBLE;
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+	sem->sleepers++;
+	for (;;) {
+		int sleepers = sem->sleepers;
+
+		/*
+		 * With signals pending, this turns into
+		 * the trylock failure case - we won't be
+		 * sleeping, and we* can't get the lock as
+		 * it has contention. Just correct the count
+		 * and exit.
+		 */
+		if (signal_pending(current)) {
+			retval = -EINTR;
+			sem->sleepers = 0;
+			atomic_add(sleepers, &sem->count);
+			break;
+		}
+
+		/*
+		 * Add "everybody else" into it. They aren't
+		 * playing, because we own the spinlock in
+		 * wait_queue_head. The "-1" is because we're
+		 * still hoping to get the semaphore.
+		 */
+		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
+			sem->sleepers = 0;
+			break;
+		}
+		sem->sleepers = 1;	/* us - see -1 above */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		schedule();
+
+		spin_lock_irqsave(&sem->wait.lock, flags);
+		tsk->state = TASK_INTERRUPTIBLE;
+	}
+	remove_wait_queue_locked(&sem->wait, &wait);
+	wake_up_locked(&sem->wait);
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+	tsk->state = TASK_RUNNING;
+	return retval;
+}
+
+/*
+ * Trylock failed - make sure we correct for
+ * having decremented the count.
+ *
+ * We could have done the trylock with a
+ * single "cmpxchg" without failure cases,
+ * but then it wouldn't work on a 386.
+ */
+int __down_trylock(struct semaphore * sem)
+{
+	int sleepers;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	sleepers = sem->sleepers + 1;
+	sem->sleepers = 0;
+
+	/*
+	 * Add "everybody else" and us into it. They aren't
+	 * playing, because we own the spinlock in the
+	 * wait_queue_head.
+	 */
+	if (!atomic_add_negative(sleepers, &sem->count)) {
+		wake_up_locked(&sem->wait);
+	}
+
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+	return 1;
+}
+#endif
diff -puN include/asm-ppc64/semaphore.h~ppc64-semaphore-reimplementation include/asm-ppc64/semaphore.h
--- 25/include/asm-ppc64/semaphore.h~ppc64-semaphore-reimplementation	2003-06-26 17:33:31.000000000 -0700
+++ 25-akpm/include/asm-ppc64/semaphore.h	2003-06-26 17:33:31.000000000 -0700
@@ -22,6 +22,7 @@ struct semaphore {
 	 * sleeping on `wait'.
 	 */
 	atomic_t count;
+	int sleepers;
 	wait_queue_head_t wait;
 #ifdef WAITQUEUE_DEBUG
 	long __magic;
@@ -37,6 +38,7 @@ struct semaphore {
 
 #define __SEMAPHORE_INITIALIZER(name, count) \
 { ATOMIC_INIT(count), \
+	0, \
 	__WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \
 	__SEM_DEBUG_INIT(name) }
 
@@ -52,6 +54,7 @@ struct semaphore {
 static inline void sema_init (struct semaphore *sem, int val)
 {
 	atomic_set(&sem->count, val);
+	sem->sleepers = 0;
 	init_waitqueue_head(&sem->wait);
 #ifdef WAITQUEUE_DEBUG
 	sem->__magic = (long)&sem->__magic;
_
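
For readers following the count/sleepers arithmetic in the slow path above, here
is a hedged, single-threaded sketch; it is not kernel code and not part of the
patch. Plain ints stand in for atomic_t, there is no wait queue, locking or real
sleeping, and the names sem_model, fastpath_down(), slowpath_pass() and
model_up() are invented for illustration only. It walks the ls/tiobench scenario
from the quoted mail just far enough to show that a sleeping waiter keeps the
count depressed, so a task that releases the semaphore and immediately retries
fails the fast path and is diverted into __down().

#include <stdio.h>

struct sem_model {
	int count;		/* stands in for atomic_t sem->count */
	int sleepers;		/* stands in for sem->sleepers */
};

/* Mirrors atomic_add_negative(): add i, report whether the result is < 0. */
static int add_negative(int i, int *v)
{
	*v += i;
	return *v < 0;
}

/* Models the down() fast path: decrement, succeed if count stays >= 0. */
static int fastpath_down(struct sem_model *s, const char *who)
{
	if (--s->count >= 0) {
		printf("%-9s fast path: got it      (count=%d)\n", who, s->count);
		return 1;
	}
	printf("%-9s fast path: failed      (count=%d) -> slow path\n", who, s->count);
	return 0;
}

/* Models sem->sleepers++ plus one pass around the __down() loop body. */
static void slowpath_pass(struct sem_model *s, const char *who)
{
	int sleepers = ++s->sleepers;

	/* Fold "everybody else" back into count, keeping our own -1. */
	if (!add_negative(sleepers - 1, &s->count)) {
		s->sleepers = 0;
		printf("%-9s slow path: acquired    (count=%d)\n", who, s->count);
	} else {
		s->sleepers = 1;	/* us - see the -1 above */
		printf("%-9s slow path: must sleep  (count=%d, sleepers=%d)\n",
		       who, s->count, s->sleepers);
	}
}

/* Models up(): give the unit back (the real up() also wakes one waiter). */
static void model_up(struct sem_model *s, const char *who)
{
	s->count++;
	printf("%-9s up()                   (count=%d)\n", who, s->count);
}

int main(void)
{
	struct sem_model s = { .count = 1, .sleepers = 0 };

	fastpath_down(&s, "tiobench");		/* 1 -> 0: acquired */

	if (!fastpath_down(&s, "ls"))		/* 0 -> -1: contended */
		slowpath_pass(&s, "ls");	/* adds 0, count stays -1: sleeps */

	model_up(&s, "tiobench");		/* -1 -> 0 */

	/*
	 * The point of the mail: ls's pending decrement is still folded
	 * into count, so tiobench's immediate retry goes 0 -> -1 and is
	 * pushed into the slow path instead of re-taking the semaphore
	 * on the fast path.
	 */
	fastpath_down(&s, "tiobench");
	return 0;
}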