From: Nick Piggin Fine-tune the unsynched sched_clock handling. Basically, you need to be careful about ensuring timestamps get correctly adjusted when moving CPUs, and you *can't* look at your unadjusted sched_clock() and a remote task's ->timestamp and try to come up with anything meaningful. I think this second problem will really hit hard in the activate_task path on systems with unsynched sched_clock when you're waking up a remote task, which happens very often. Andi, I thought some Opterons have unsynched tscs? Maybe this is causing your unexplained bad interactivity? Another problem is a fixup in pull_task. When adjusting ->timestamp from one processor to another, you must use timestamp_last_tick for the local processor too. Using sched_clock() will cause ->timestamp to creep forward. A final small fix is for sync wakeups. They were using __activate_task for some reason, thus they don't get credited for sleeping at all AFAIKS. And another thing, do we want to #ifdef timestamp_last_tick so it doesn't show on UP? --- 25-akpm/kernel/sched.c | 38 ++++++++++++++++++++++++++++---------- 1 files changed, 28 insertions(+), 10 deletions(-) diff -puN kernel/sched.c~sched-activate-tslt kernel/sched.c --- 25/kernel/sched.c~sched-activate-tslt 2004-05-12 21:01:45.731715016 -0700 +++ 25-akpm/kernel/sched.c 2004-05-12 21:01:45.737714104 -0700 @@ -457,9 +457,19 @@ static void recalc_task_prio(task_t *p, * Update all the scheduling statistics stuff. (sleep average * calculation, priority modifiers, etc.) */ -static void activate_task(task_t *p, runqueue_t *rq) +static void activate_task(task_t *p, runqueue_t *rq, int local) { - unsigned long long now = sched_clock(); + unsigned long long now; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + } +#endif recalc_task_prio(p, now); @@ -819,10 +829,8 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) */ - if (sync && cpu == this_cpu) { - __activate_task(p, rq); - } else { - activate_task(p, rq); + activate_task(p, rq, cpu == this_cpu); + if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } @@ -1264,6 +1272,9 @@ lock_again: rq->nr_running++; } } else { + /* Not the local CPU - must adjust timestamp */ + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1366,8 +1377,8 @@ void pull_task(runqueue_t *src_rq, prio_ set_task_cpu(p, this_cpu); this_rq->nr_running++; enqueue_task(p, this_array); - p->timestamp = sched_clock() - - (src_rq->timestamp_last_tick - p->timestamp); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. @@ -3340,12 +3351,19 @@ static void __migrate_task(struct task_s set_task_cpu(p, dest_cpu); if (p->array) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. + */ + p->timestamp = p->timestamp - rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); - activate_task(p, rq_dest); + activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } - p->timestamp = rq_dest->timestamp_last_tick; out: double_rq_unlock(rq_src, rq_dest); _