author     Andrew Morton <akpm@osdl.org>              2004-05-31 18:48:47 -0700
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2004-05-31 18:48:47 -0700
commit     9b91d73bde9d68800f9e5c338c0cf9d0fe3bc862 (patch)
tree       d17e9b7c1681f00096d96ea5495f5df5822b49f7 /kernel
parent     673a68e627312d056cadd1e7053a9fc113149333 (diff)
download   history-9b91d73bde9d68800f9e5c338c0cf9d0fe3bc862.tar.gz
[PATCH] Add FUTEX_CMP_REQUEUE futex op
From: Jakub Jelinek <jakub@redhat.com>

The FUTEX_REQUEUE operation was added to the kernel mainly to improve pthread_cond_broadcast, which previously used a FUTEX_WAKE INT_MAX op. pthread_cond_broadcast releases the internal condvar mutex before the FUTEX_REQUEUE operation, because otherwise the woken thread would most likely go straight back to sleep on the internal condvar mutex until the broadcasting thread releases it.

Unfortunately this is racy and causes e.g. http://sources.redhat.com/cgi-bin/cvsweb.cgi/libc/nptl/tst-cond16.c?rev=1.1&content-type=text/x-cvsweb-markup&cvsroot=glibc to hang on SMP. http://listman.redhat.com/archives/phil-list/2004-May/msg00023.html contains an analysis of how the hang happens: if any thread calls pthread_cond_*wait between the release of the internal condvar mutex and the FUTEX_REQUEUE operation, the wrong thread might be woken up (and immediately go back to sleep, because it does not satisfy the conditions for returning from pthread_cond_*wait), while the right thread is requeued onto the associated mutex with nobody left to wake it.

The patch below extends FUTEX_REQUEUE with something FUTEX_WAIT already uses: FUTEX_CMP_REQUEUE is passed an additional argument, the expected value of *futex. While holding the futex locks, the kernel checks whether *futex != expected and returns -EAGAIN in that case; if the values are equal, it continues with a normal FUTEX_REQUEUE operation. If the syscall returns -EAGAIN, NPTL can fall back to the FUTEX_WAKE INT_MAX operation, which does not have this problem but is less efficient, while in the likely case that nobody hit the (small) window the efficient requeue is used.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
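For illustration only, a rough sketch of how a userspace broadcast path could use the new op with the FUTEX_WAKE fallback described above. This is not the actual NPTL code: the futex() wrapper, the variable names and the condvar layout are hypothetical, and it assumes <linux/futex.h> defines FUTEX_CMP_REQUEUE (that header change is outside this kernel/-limited diff).

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <limits.h>
#include <errno.h>

/* Hypothetical 6-argument wrapper around the futex syscall. */
static long futex(int *uaddr, int op, int val, void *timeout_or_val2,
		  int *uaddr2, int val3)
{
	return syscall(SYS_futex, uaddr, op, val, timeout_or_val2, uaddr2, val3);
}

/* Broadcast sketch: wake one waiter and requeue the rest onto the mutex
 * futex, but only if the condvar futex still holds the value observed
 * before the internal lock was dropped.  On -EAGAIN someone raced with
 * us, so fall back to the correct but less efficient wake-all path. */
static void cond_broadcast_sketch(int *cond_futex, int *mutex_futex, int expected)
{
	if (futex(cond_futex, FUTEX_CMP_REQUEUE, 1,
		  (void *)(long)INT_MAX, mutex_futex, expected) == -1
	    && errno == EAGAIN)
		futex(cond_futex, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
}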
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/compat.c    7
-rw-r--r--   kernel/futex.c    49
2 files changed, 47 insertions, 9 deletions
diff --git a/kernel/compat.c b/kernel/compat.c
index 9dccddd331a39f..2406b62b1400b1 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -210,7 +210,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
#ifdef CONFIG_FUTEX
asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
- struct compat_timespec __user *utime, u32 __user *uaddr2)
+ struct compat_timespec __user *utime, u32 __user *uaddr2,
+ int val3)
{
struct timespec t;
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -221,11 +222,11 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
return -EFAULT;
timeout = timespec_to_jiffies(&t) + 1;
}
- if (op == FUTEX_REQUEUE)
+ if (op >= FUTEX_REQUEUE)
val2 = (int) (long) utime;
return do_futex((unsigned long)uaddr, op, val, timeout,
- (unsigned long)uaddr2, val2);
+ (unsigned long)uaddr2, val2, val3);
}
#endif
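The compat wrapper's op == FUTEX_REQUEUE test becomes op >= FUTEX_REQUEUE because both requeue variants pass their nr_requeue count through the utime slot. This relies on the op numbering in include/linux/futex.h (that hunk falls outside this kernel/-limited diffstat); for reference, the values at the time were:

#define FUTEX_WAIT		0
#define FUTEX_WAKE		1
#define FUTEX_FD		2
#define FUTEX_REQUEUE		3
#define FUTEX_CMP_REQUEUE	4	/* added alongside this patch; the only op above FUTEX_REQUEUE */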
diff --git a/kernel/futex.c b/kernel/futex.c
index 18d5119d27ba7f..f7afaf5d38def8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -96,6 +96,7 @@ struct futex_q {
*/
struct futex_hash_bucket {
spinlock_t lock;
+ unsigned int nqueued;
struct list_head chain;
};
@@ -318,13 +319,14 @@ out:
* physical page.
*/
static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
- int nr_wake, int nr_requeue)
+ int nr_wake, int nr_requeue, int *valp)
{
union futex_key key1, key2;
struct futex_hash_bucket *bh1, *bh2;
struct list_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
+ unsigned int nqueued;
down_read(&current->mm->mmap_sem);
@@ -338,12 +340,41 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
bh1 = hash_futex(&key1);
bh2 = hash_futex(&key2);
+ nqueued = bh1->nqueued;
+ if (likely(valp != NULL)) {
+ int curval;
+
+ /* In order to avoid doing get_user while
+ holding bh1->lock and bh2->lock, nqueued
+ (monotonically increasing field) must be first
+ read, then *uaddr1 fetched from userland and
+ after acquiring lock nqueued field compared with
+ the stored value. The smp_mb () below
+ makes sure that bh1->nqueued is read from memory
+ before *uaddr1. */
+ smp_mb();
+
+ if (get_user(curval, (int *)uaddr1) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (curval != *valp) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
if (bh1 < bh2)
spin_lock(&bh1->lock);
spin_lock(&bh2->lock);
if (bh1 > bh2)
spin_lock(&bh1->lock);
+ if (unlikely(nqueued != bh1->nqueued && valp != NULL)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
head1 = &bh1->chain;
list_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
@@ -365,6 +396,7 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
}
}
+out_unlock:
spin_unlock(&bh1->lock);
if (bh1 != bh2)
spin_unlock(&bh2->lock);
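Note that the two hash buckets are locked in address order here, and a single bucket is locked only once when bh1 == bh2; this is the usual way to avoid an ABBA deadlock when two requeue calls run in opposite directions. A generic userspace sketch of that pattern, not tied to the futex code:

#include <pthread.h>

/* Lock two locks that may be the same object, always taking the
 * lower-addressed one first so concurrent callers cannot deadlock. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {			/* same bucket: lock it once */
		pthread_mutex_lock(a);
		return;
	}
	if (a > b) {			/* order by address */
		pthread_mutex_t *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}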
@@ -398,6 +430,7 @@ static void queue_me(struct futex_q *q, int fd, struct file *filp)
q->lock_ptr = &bh->lock;
spin_lock(&bh->lock);
+ bh->nqueued++;
list_add_tail(&q->list, &bh->chain);
spin_unlock(&bh->lock);
}
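Together, the two hunks above implement the validation protocol described in the comment: futex_requeue() snapshots bh1->nqueued, issues smp_mb(), reads *uaddr1 from userspace, and only recompares the counter after the bucket locks are held, while queue_me() bumps nqueued under bh1->lock whenever a waiter is added. An unchanged counter therefore proves that no waiter was queued anywhere in the window between the snapshot and the lock acquisition, so the unlocked read of *uaddr1 is still trustworthy. A user-space analogue of this check-then-lock pattern, using a hypothetical bucket type and C11 atomics rather than kernel primitives:

#include <pthread.h>
#include <stdatomic.h>

struct bucket {
	pthread_mutex_t lock;
	atomic_uint nqueued;	/* incremented under lock by the queueing side */
};

/* Returns 0 when *uval still matched the expected value and no waiter was
 * queued before the lock was taken; -1 otherwise, playing the role of the
 * kernel's -EAGAIN (the caller retries or falls back). */
static int validated_requeue(struct bucket *b, const int *uval, int expected)
{
	unsigned int n = atomic_load(&b->nqueued);

	atomic_thread_fence(memory_order_seq_cst);	/* mirrors smp_mb() */
	if (*uval != expected)
		return -1;
	pthread_mutex_lock(&b->lock);
	if (n != atomic_load(&b->nqueued)) {		/* a waiter slipped in */
		pthread_mutex_unlock(&b->lock);
		return -1;
	}
	/* ... perform the requeue while still holding the lock ... */
	pthread_mutex_unlock(&b->lock);
	return 0;
}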
@@ -625,7 +658,7 @@ out:
}
long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
- unsigned long uaddr2, int val2)
+ unsigned long uaddr2, int val2, int val3)
{
int ret;
@@ -641,7 +674,10 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
ret = futex_fd(uaddr, val);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2);
+ ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+ break;
+ case FUTEX_CMP_REQUEUE:
+ ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
break;
default:
ret = -ENOSYS;
@@ -651,7 +687,8 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
- struct timespec __user *utime, u32 __user *uaddr2)
+ struct timespec __user *utime, u32 __user *uaddr2,
+ int val3)
{
struct timespec t;
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -665,11 +702,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
/*
* requeue parameter in 'utime' if op == FUTEX_REQUEUE.
*/
- if (op == FUTEX_REQUEUE)
+ if (op >= FUTEX_REQUEUE)
val2 = (int) (long) utime;
return do_futex((unsigned long)uaddr, op, val, timeout,
- (unsigned long)uaddr2, val2);
+ (unsigned long)uaddr2, val2, val3);
}
static struct super_block *