diff options
author | Greg Kroah-Hartman <gregkh@suse.de> | 2011-10-03 16:46:08 -0700 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@suse.de> | 2011-10-03 16:46:08 -0700 |
commit | ba97c2b32efc8a298d9aea096946dd156b2c7c79 (patch) | |
tree | dd5de5a9b4b0ca637f654601effa11722fc7e6fa | |
parent | 410449452dd3868bff8971f247c7f42652d7bab3 (diff) | |
download | stable-queue-ba97c2b32efc8a298d9aea096946dd156b2c7c79.tar.gz |
3.0 patches
-rw-r--r-- | queue-3.0/posix-cpu-timers-cure-smp-wobbles.patch | 224 | ||||
-rw-r--r-- | queue-3.0/series | 1 |
2 files changed, 225 insertions, 0 deletions
diff --git a/queue-3.0/posix-cpu-timers-cure-smp-wobbles.patch b/queue-3.0/posix-cpu-timers-cure-smp-wobbles.patch new file mode 100644 index 0000000000..2a80294e5b --- /dev/null +++ b/queue-3.0/posix-cpu-timers-cure-smp-wobbles.patch @@ -0,0 +1,224 @@ +From d670ec13178d0fd8680e6742a2bc6e04f28f87d8 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <a.p.zijlstra@chello.nl> +Date: Thu, 1 Sep 2011 12:42:04 +0200 +Subject: posix-cpu-timers: Cure SMP wobbles + +From: Peter Zijlstra <a.p.zijlstra@chello.nl> + +commit d670ec13178d0fd8680e6742a2bc6e04f28f87d8 upstream. + +David reported: + + Attached below is a watered-down version of rt/tst-cpuclock2.c from + GLIBC. Just build it with "gcc -o test test.c -lpthread -lrt" or + similar. + + Run it several times, and you will see cases where the main thread + will measure a process clock difference before and after the nanosleep + which is smaller than the cpu-burner thread's individual thread clock + difference. This doesn't make any sense since the cpu-burner thread + is part of the top-level process's thread group. + + I've reproduced this on both x86-64 and sparc64 (using both 32-bit and + 64-bit binaries). + + For example: + + [davem@boricha build-x86_64-linux]$ ./test + process: before(0.001221967) after(0.498624371) diff(497402404) + thread: before(0.000081692) after(0.498316431) diff(498234739) + self: before(0.001223521) after(0.001240219) diff(16698) + [davem@boricha build-x86_64-linux]$ + + The diff of 'process' should always be >= the diff of 'thread'. + + I make sure to wrap the 'thread' clock measurements the most tightly + around the nanosleep() call, and that the 'process' clock measurements + are the outer-most ones. + + --- + #include <unistd.h> + #include <stdio.h> + #include <stdlib.h> + #include <time.h> + #include <fcntl.h> + #include <string.h> + #include <errno.h> + #include <pthread.h> + + static pthread_barrier_t barrier; + + static void *chew_cpu(void *arg) + { + pthread_barrier_wait(&barrier); + while (1) + __asm__ __volatile__("" : : : "memory"); + return NULL; + } + + int main(void) + { + clockid_t process_clock, my_thread_clock, th_clock; + struct timespec process_before, process_after; + struct timespec me_before, me_after; + struct timespec th_before, th_after; + struct timespec sleeptime; + unsigned long diff; + pthread_t th; + int err; + + err = clock_getcpuclockid(0, &process_clock); + if (err) + return 1; + + err = pthread_getcpuclockid(pthread_self(), &my_thread_clock); + if (err) + return 1; + + pthread_barrier_init(&barrier, NULL, 2); + err = pthread_create(&th, NULL, chew_cpu, NULL); + if (err) + return 1; + + err = pthread_getcpuclockid(th, &th_clock); + if (err) + return 1; + + pthread_barrier_wait(&barrier); + + err = clock_gettime(process_clock, &process_before); + if (err) + return 1; + + err = clock_gettime(my_thread_clock, &me_before); + if (err) + return 1; + + err = clock_gettime(th_clock, &th_before); + if (err) + return 1; + + sleeptime.tv_sec = 0; + sleeptime.tv_nsec = 500000000; + nanosleep(&sleeptime, NULL); + + err = clock_gettime(th_clock, &th_after); + if (err) + return 1; + + err = clock_gettime(my_thread_clock, &me_after); + if (err) + return 1; + + err = clock_gettime(process_clock, &process_after); + if (err) + return 1; + + diff = process_after.tv_nsec - process_before.tv_nsec; + printf("process: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", + process_before.tv_sec, process_before.tv_nsec, + process_after.tv_sec, process_after.tv_nsec, diff); + diff = th_after.tv_nsec - th_before.tv_nsec; + printf("thread: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", + th_before.tv_sec, th_before.tv_nsec, + th_after.tv_sec, th_after.tv_nsec, diff); + diff = me_after.tv_nsec - me_before.tv_nsec; + printf("self: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", + me_before.tv_sec, me_before.tv_nsec, + me_after.tv_sec, me_after.tv_nsec, diff); + + return 0; + } + +This is due to us using p->se.sum_exec_runtime in +thread_group_cputime() where we iterate the thread group and sum all +data. This does not take time since the last schedule operation (tick +or otherwise) into account. We can cure this by using +task_sched_runtime() at the cost of having to take locks. + +This also means we can (and must) do away with +thread_group_sched_runtime() since the modified thread_group_cputime() +is now more accurate and would deadlock when called from +thread_group_sched_runtime(). + +Aside of that it makes the function safe on 32 bit systems. The old +code added t->se.sum_exec_runtime unprotected. sum_exec_runtime is a +64bit value and could be changed on another cpu at the same time. + +Reported-by: David Miller <davem@davemloft.net> +Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> +Link: http://lkml.kernel.org/r/1314874459.7945.22.camel@twins +Tested-by: David Miller <davem@davemloft.net> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> + +--- + include/linux/sched.h | 1 - + kernel/posix-cpu-timers.c | 5 +++-- + kernel/sched.c | 24 ------------------------ + 3 files changed, 3 insertions(+), 27 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1937,7 +1937,6 @@ static inline void disable_sched_clock_i + + extern unsigned long long + task_sched_runtime(struct task_struct *task); +-extern unsigned long long thread_group_sched_runtime(struct task_struct *task); + + /* sched_exec is called by processes performing an exec */ + #ifdef CONFIG_SMP +--- a/kernel/posix-cpu-timers.c ++++ b/kernel/posix-cpu-timers.c +@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_st + do { + times->utime = cputime_add(times->utime, t->utime); + times->stime = cputime_add(times->stime, t->stime); +- times->sum_exec_runtime += t->se.sum_exec_runtime; ++ times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); + out: + rcu_read_unlock(); +@@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const + cpu->cpu = cputime.utime; + break; + case CPUCLOCK_SCHED: +- cpu->sched = thread_group_sched_runtime(p); ++ thread_group_cputime(p, &cputime); ++ cpu->sched = cputime.sum_exec_runtime; + break; + } + return 0; +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -3713,30 +3713,6 @@ unsigned long long task_sched_runtime(st + } + + /* +- * Return sum_exec_runtime for the thread group. +- * In case the task is currently running, return the sum plus current's +- * pending runtime that have not been accounted yet. +- * +- * Note that the thread group might have other running tasks as well, +- * so the return value not includes other pending runtime that other +- * running tasks might have. +- */ +-unsigned long long thread_group_sched_runtime(struct task_struct *p) +-{ +- struct task_cputime totals; +- unsigned long flags; +- struct rq *rq; +- u64 ns; +- +- rq = task_rq_lock(p, &flags); +- thread_group_cputime(p, &totals); +- ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); +- task_rq_unlock(rq, p, &flags); +- +- return ns; +-} +- +-/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update diff --git a/queue-3.0/series b/queue-3.0/series index bd2b856cef..8a36a9e470 100644 --- a/queue-3.0/series +++ b/queue-3.0/series @@ -7,3 +7,4 @@ sparc64-force-the-execute-bit-in-openfirmware-s-translation-entries.patch sched-rt-migrate-equal-priority-tasks-to-available-cpus.patch sched-fix-up-wchan-borkage.patch ide-disk-fix-request-requeuing.patch +posix-cpu-timers-cure-smp-wobbles.patch |