From: Ingo Molnar

The attached scheduler patch (against test2-mm2) adds the scheduling
infrastructure items discussed on lkml. I got good feedback - and while I
don't expect it to solve all problems, it does solve a number of bad ones:

 - test_starve.c code from David Mosberger

 - thud.c making the system unusable due to unfairness

 - fair/accurate sleep average based on a fine-grained clock

 - audio skipping way too easily

Other changes in sched-test2-mm2-A3:

 - ia64 sched_clock() code, from David Mosberger.

 - migration thread startup without relying on implicit scheduling
   behavior. While the current 2.6 code is correct (due to the cpu-up code
   adding CPUs one by one), it's also fragile - and this code cannot be
   carried over into the 2.4 backports. So adding this method cleans up
   the startup and makes 2.4 backports easier.

And here's the original changelog for the scheduler changes:

 - cycle-accurate (nanosec resolution) timekeeping within the scheduler.
   This fixes a number of audio artifacts (skipping) I've reproduced. I
   don't think we can get away without going cycle accurate - reading the
   cycle counter adds some overhead, but it's acceptable. The first
   nanosec-accuracy patch was done by Mike Galbraith - this patch is
   different but similar in nature. I went further and also changed the
   sleep_avg to be of nanosec resolution. (A small user-space sketch of
   the resulting bonus arithmetic follows this changelog.)

 - more finegrained timeslices: there's now a timeslice 'sub unit' of 50
   usecs (TIMESLICE_GRANULARITY) - CPU hogs on the same priority level
   will round-robin with this unit. This change is intended to make gaming
   latencies shorter.

 - include scheduling latency in the sleep bonus calculation. This change
   extends the sleep-average calculation to the period of time a task
   spends on the runqueue but doesn't get scheduled yet, right after
   wakeup. Note that tasks that were preempted (i.e. not woken up) and are
   still on the runqueue do not get this benefit. This change closes one
   of the last holes in the dynamic priority estimation; it should result
   in interactive tasks getting more priority under heavy load. This
   change also fixes the test-starve.c testcase from David Mosberger.
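To make the new sleep_avg scale concrete, here is a minimal user-space
sketch - not part of the patch - that mirrors the bonus arithmetic of the
patched effective_prio() below. The constants are assumed to match the
2.6.0-test2 tree (MAX_USER_PRIO = 40, PRIO_BONUS_RATIO = 25); the file and
helper names are made up for illustration. A task with a full second of
credited sleep (sleep_avg == MAX_SLEEP_AVG) still maps to the maximum +5
priority bonus and a pure CPU hog to -5, exactly as with the old
jiffies-based scale - only the resolution changes. The /1024 shifts
presumably exist to keep the intermediate product within 32 bits despite
the nanosecond scale.

/* sleep_bonus.c - illustrative sketch only, not part of the patch */
#include <stdio.h>

#define MAX_USER_PRIO		40
#define PRIO_BONUS_RATIO	25
#define MAX_SLEEP_AVG		(1*1000000000UL)	/* one second, in nanosecs */

static int sleep_avg_bonus(unsigned long sleep_avg)
{
	int bonus;

	/* same expression as the patched effective_prio() */
	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*(sleep_avg/1024)/(MAX_SLEEP_AVG/1024)/100;
	bonus -= MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;

	/*
	 * effective_prio() does prio = static_prio - bonus, so +5 means a
	 * five-level priority boost, -5 a five-level penalty.
	 */
	return bonus;
}

int main(void)
{
	unsigned long avg;

	for (avg = 0; avg <= MAX_SLEEP_AVG; avg += MAX_SLEEP_AVG/10)
		printf("sleep_avg = %10lu nsecs -> bonus %+d\n",
			avg, sleep_avg_bonus(avg));

	return 0;
}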
 arch/i386/kernel/smpboot.c          |    4 
 arch/i386/kernel/timers/timer_tsc.c |   18 ++
 arch/ia64/kernel/time.c             |    8 +
 fs/proc/array.c                     |    5 
 include/linux/sched.h               |    5 
 kernel/fork.c                       |    2 
 kernel/sched.c                      |  222 ++++++++++++++++++++++++++----------
 7 files changed, 203 insertions(+), 61 deletions(-)

diff -puN arch/i386/kernel/smpboot.c~sched-2.6.0-test2-mm2-A3 arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~sched-2.6.0-test2-mm2-A3	2003-07-31 10:43:58.000000000 -0700
+++ 25-akpm/arch/i386/kernel/smpboot.c	2003-07-31 10:43:58.000000000 -0700
@@ -915,13 +915,13 @@ static void smp_tune_scheduling (void)
 		cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
 	}
 
-	cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
+	cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1;
 
 	printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
 		(long)cacheflush_time/(cpu_khz/1000),
 		((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
 	printk("task migration cache decay timeout: %ld msecs.\n",
-		(cache_decay_ticks + 1) * 1000 / HZ);
+		cache_decay_ticks);
 }
 
 /*
diff -puN arch/i386/kernel/timers/timer_tsc.c~sched-2.6.0-test2-mm2-A3 arch/i386/kernel/timers/timer_tsc.c
--- 25/arch/i386/kernel/timers/timer_tsc.c~sched-2.6.0-test2-mm2-A3	2003-07-31 10:43:58.000000000 -0700
+++ 25-akpm/arch/i386/kernel/timers/timer_tsc.c	2003-07-31 10:43:58.000000000 -0700
@@ -116,6 +116,24 @@ static unsigned long long monotonic_cloc
 	return base + cycles_2_ns(this_offset - last_offset);
 }
 
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+unsigned long long sched_clock(void)
+{
+	unsigned long long this_offset;
+
+	if (unlikely(!cpu_has_tsc))
+		return (unsigned long long)jiffies * (1000000000 / HZ);
+
+	/* Read the Time Stamp Counter */
+	rdtscll(this_offset);
+
+	/* return the value in ns */
+	return cycles_2_ns(this_offset);
+}
+
+
 static void mark_offset_tsc(void)
 {
 	unsigned long lost,delay;
diff -puN arch/ia64/kernel/time.c~sched-2.6.0-test2-mm2-A3 arch/ia64/kernel/time.c
--- 25/arch/ia64/kernel/time.c~sched-2.6.0-test2-mm2-A3	2003-07-31 10:43:58.000000000 -0700
+++ 25-akpm/arch/ia64/kernel/time.c	2003-07-31 10:43:58.000000000 -0700
@@ -61,6 +61,14 @@ do_profile (unsigned long ip)
 		atomic_inc((atomic_t *) &prof_buffer[ip]);
 }
 
+unsigned long long
+sched_clock (void)
+{
+	unsigned long offset = ia64_get_itc();
+
+	return (offset * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT;
+}
+
 static void
 itc_reset (void)
 {
diff -puN fs/proc/array.c~sched-2.6.0-test2-mm2-A3 fs/proc/array.c
--- 25/fs/proc/array.c~sched-2.6.0-test2-mm2-A3	2003-07-31 10:43:58.000000000 -0700
+++ 25-akpm/fs/proc/array.c	2003-07-31 10:43:58.000000000 -0700
@@ -154,13 +154,16 @@ static inline char * task_state(struct t
 	read_lock(&tasklist_lock);
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
+		"SleepAVG:\t%d%%\n"
 		"Tgid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
 		"TracerPid:\t%d\n"
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
-		get_task_state(p), p->tgid,
+		get_task_state(p),
+		(p->sleep_avg/1024)*100/(1000000000/1024),
+		p->tgid,
 		p->pid, p->pid ? p->real_parent->pid : 0,
 		p->pid && p->ptrace ? p->parent->pid : 0,
 		p->uid, p->euid, p->suid, p->fsuid,
diff -puN include/linux/sched.h~sched-2.6.0-test2-mm2-A3 include/linux/sched.h
--- 25/include/linux/sched.h~sched-2.6.0-test2-mm2-A3	2003-07-31 10:43:58.000000000 -0700
+++ 25-akpm/include/linux/sched.h	2003-07-31 10:43:58.000000000 -0700
@@ -341,7 +341,8 @@ struct task_struct {
 	prio_array_t *array;
 
 	unsigned long sleep_avg;
-	unsigned long last_run;
+	unsigned long long timestamp;
+	int activated;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
@@ -505,6 +506,8 @@ static inline int set_cpus_allowed(task_
 }
 #endif
 
+extern unsigned long long sched_clock(void);
+
 #ifdef CONFIG_NUMA
 extern void sched_balance_exec(void);
 extern void node_nr_running_init(void);
diff -puN kernel/fork.c~sched-2.6.0-test2-mm2-A3 kernel/fork.c
--- 25/kernel/fork.c~sched-2.6.0-test2-mm2-A3	2003-07-31 10:43:58.000000000 -0700
+++ 25-akpm/kernel/fork.c	2003-07-31 10:43:58.000000000 -0700
@@ -896,7 +896,7 @@ struct task_struct *copy_process(unsigne
 	 */
 	p->first_time_slice = 1;
 	current->time_slice >>= 1;
-	p->last_run = jiffies;
+	p->timestamp = sched_clock();
 	if (!current->time_slice) {
 		/*
 		 * This case is rare, it happens when the parent has only
diff -puN kernel/sched.c~sched-2.6.0-test2-mm2-A3 kernel/sched.c
--- 25/kernel/sched.c~sched-2.6.0-test2-mm2-A3	2003-07-31 10:43:58.000000000 -0700
+++ 25-akpm/kernel/sched.c	2003-07-31 10:43:58.000000000 -0700
@@ -68,13 +68,15 @@
  */
 #define MIN_TIMESLICE		( 10 * HZ / 1000)
 #define MAX_TIMESLICE		(200 * HZ / 1000)
-#define CHILD_PENALTY		50
+#define TIMESLICE_GRANULARITY	(HZ/40 ?: 1)
+#define ON_RUNQUEUE_WEIGHT	30
+#define CHILD_PENALTY		95
 #define PARENT_PENALTY		100
 #define EXIT_WEIGHT		3
 #define PRIO_BONUS_RATIO	25
 #define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(10*HZ)
-#define STARVATION_LIMIT	(10*HZ)
+#define MAX_SLEEP_AVG		(1*1000000000)
+#define STARVATION_LIMIT	HZ
 #define NODE_THRESHOLD		125
 
 /*
@@ -115,6 +117,11 @@
 #define TASK_INTERACTIVE(p) \
 	((p)->prio <= (p)->static_prio - DELTA(p))
 
+#define TASK_PREEMPTS_CURR(p, rq) \
+	((p)->prio < (rq)->curr->prio || \
+		((p)->prio == (rq)->curr->prio && \
+			(p)->time_slice > (rq)->curr->time_slice * 2))
+
 /*
  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
  * to time slice values.
@@ -319,8 +326,8 @@ static int effective_prio(task_t *p)
 	if (rt_task(p))
 		return p->prio;
 
-	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
-		MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*(p->sleep_avg/1024)/(MAX_SLEEP_AVG/1024)/100;
+	bonus -= MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
 
 	prio = p->static_prio - bonus;
 	if (prio < MAX_RT_PRIO)
@@ -339,24 +346,24 @@ static inline void __activate_task(task_
 	nr_running_inc(rq);
 }
 
-/*
- * activate_task - move a task to the runqueue and do priority recalculation
- *
- * Update all the scheduling statistics stuff. (sleep average
- * calculation, priority modifiers, etc.)
- */
-static inline void activate_task(task_t *p, runqueue_t *rq)
+static void recalc_task_prio(task_t *p, unsigned long long now)
 {
-	long sleep_time = jiffies - p->last_run - 1;
+	unsigned long long __sleep_time = now - p->timestamp;
+	unsigned long sleep_time;
+
+	if (__sleep_time > MAX_SLEEP_AVG)
+		sleep_time = MAX_SLEEP_AVG;
+	else
+		sleep_time = (unsigned long)__sleep_time;
 
 	if (sleep_time > 0) {
-		int sleep_avg;
+		unsigned long long sleep_avg;
 
 		/*
 		 * This code gives a bonus to interactive tasks.
 		 *
 		 * The boost works by updating the 'average sleep time'
-		 * value here, based on ->last_run. The more time a task
+		 * value here, based on ->timestamp. The more time a task
 		 * spends sleeping, the higher the average gets - and the
 		 * higher the priority boost gets as well.
 		 */
@@ -375,6 +382,37 @@ static inline void activate_task(task_t
 			p->prio = effective_prio(p);
 		}
 	}
+}
+
+/*
+ * activate_task - move a task to the runqueue and do priority recalculation
+ *
+ * Update all the scheduling statistics stuff. (sleep average
+ * calculation, priority modifiers, etc.)
+ */
+static inline void activate_task(task_t *p, runqueue_t *rq)
+{
+	unsigned long long now = sched_clock();
+
+	recalc_task_prio(p, now);
+
+	/*
+	 * Tasks which were woken up by interrupts (ie. hw events)
+	 * are most likely of interactive nature. So we give them
+	 * the credit of extending their sleep time to the period
+	 * of time they spend on the runqueue, waiting for execution
+	 * on a CPU, first time around:
+	 */
+	if (in_interrupt())
+		p->activated = 2;
+	else
+	/*
+	 * Normal first-time wakeups get a credit too for on-runqueue time,
+	 * but it will be weighted down:
+	 */
+		p->activated = 1;
+	p->timestamp = now;
+
 	__activate_task(p, rq);
 }
 
@@ -501,7 +539,7 @@ repeat_lock_task:
 				__activate_task(p, rq);
 			else {
 				activate_task(p, rq);
-				if (p->prio < rq->curr->prio)
+				if (TASK_PREEMPTS_CURR(p, rq))
 					resched_task(rq->curr);
 			}
 			success = 1;
@@ -550,8 +588,8 @@ void wake_up_forked_process(task_t * p)
 	 * and children as well, to keep max-interactive tasks
 	 * from forking tasks that are max-interactive.
 	 */
-	current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
-	p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+	current->sleep_avg = current->sleep_avg / 100 * PARENT_PENALTY;
+	p->sleep_avg = p->sleep_avg / 100 * CHILD_PENALTY;
 	p->prio = effective_prio(p);
 
 	set_task_cpu(p, smp_processor_id());
@@ -592,8 +630,7 @@ void sched_exit(task_t * p)
 	 * the sleep_avg of the parent as well.
 	 */
 	if (p->sleep_avg < p->parent->sleep_avg)
-		p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
-			p->sleep_avg) / (EXIT_WEIGHT + 1);
+		p->parent->sleep_avg = p->parent->sleep_avg / (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / (EXIT_WEIGHT + 1);
 }
 
 /**
@@ -978,13 +1015,8 @@ static inline void pull_task(runqueue_t
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	if (p->prio < this_rq->curr->prio)
+	if (TASK_PREEMPTS_CURR(p, this_rq))
 		set_need_resched();
-	else {
-		if (p->prio == this_rq->curr->prio &&
-				p->time_slice > this_rq->curr->time_slice)
-			set_need_resched();
-	}
 }
 
 /*
@@ -1001,12 +1033,14 @@ static void load_balance(runqueue_t *thi
 	runqueue_t *busiest;
 	prio_array_t *array;
 	struct list_head *head, *curr;
+	unsigned long long now;
 	task_t *tmp;
 
 	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
 	if (!busiest)
 		goto out;
 
+	now = sched_clock();
 	/*
 	 * We first consider expired tasks. Those will likely not be
 	 * executed in the near future, and they are most likely to
@@ -1047,7 +1081,7 @@ skip_queue:
 	 */
 
 #define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
-	((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) &&	\
+	((!idle || (((now - (p)->timestamp)>>10) > cache_decay_ticks)) &&\
 		!task_running(rq, p) &&					\
 			cpu_isset(this_cpu, (p)->cpus_allowed))
 
@@ -1164,8 +1198,7 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  */
 #define EXPIRED_STARVING(rq) \
 		(STARVATION_LIMIT && ((rq)->expired_timestamp && \
-		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
+		(jiffies - (rq)->expired_timestamp >= STARVATION_LIMIT)))
 
 /*
  * This function gets called by the timer code, with HZ frequency.
@@ -1209,14 +1242,11 @@ void scheduler_tick(int user_ticks, int
 	spin_lock(&rq->lock);
 	/*
 	 * The task was running during this tick - update the
-	 * time slice counter and the sleep average. Note: we
-	 * do not update a thread's priority until it either
-	 * goes to sleep or uses up its timeslice. This makes
-	 * it possible for interactive tasks to use up their
-	 * timeslices at their highest priority levels.
+	 * time slice counter. Note: we do not update a thread's
+	 * priority until it either goes to sleep or uses up its
+	 * timeslice. This makes it possible for interactive tasks
+	 * to use up their timeslices at their highest priority levels.
 	 */
-	if (p->sleep_avg)
-		p->sleep_avg--;
 	if (unlikely(rt_task(p))) {
 		/*
 		 * RR tasks need a special form of timeslice management.
@@ -1240,12 +1270,33 @@ void scheduler_tick(int user_ticks, int
 		p->time_slice = task_timeslice(p);
 		p->first_time_slice = 0;
 
+		if (!rq->expired_timestamp)
+			rq->expired_timestamp = jiffies;
 		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
-			if (!rq->expired_timestamp)
-				rq->expired_timestamp = jiffies;
 			enqueue_task(p, rq->expired);
 		} else
 			enqueue_task(p, rq->active);
+	} else {
+		/*
+		 * Prevent a too long timeslice allowing a task to monopolize
+		 * the CPU. We do this by splitting up the timeslice into
+		 * smaller pieces.
+		 *
+		 * Note: this does not mean the task's timeslices expire or
+		 * get lost in any way, they just might be preempted by
+		 * another task of equal priority. (one with higher
+		 * priority would have preempted this task already.) We
+		 * requeue this task to the end of the list on this priority
+		 * level, which is in essence a round-robin of tasks with
+		 * equal priority.
+		 */
+		if (!((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY) &&
+					(p->array == rq->active)) {
+			dequeue_task(p, rq->active);
+			set_tsk_need_resched(p);
+			p->prio = effective_prio(p);
+			enqueue_task(p, rq->active);
+		}
 	}
 out_unlock:
 	spin_unlock(&rq->lock);
@@ -1264,6 +1315,8 @@ asmlinkage void schedule(void)
 	runqueue_t *rq;
 	prio_array_t *array;
 	struct list_head *queue;
+	unsigned long long now;
+	unsigned long run_time;
 	int idx;
 
 	/*
@@ -1284,7 +1337,11 @@ need_resched:
 	rq = this_rq();
 
 	release_kernel_lock(prev);
-	prev->last_run = jiffies;
+	now = sched_clock();
+	if (likely(now - prev->timestamp < MAX_SLEEP_AVG))
+		run_time = now - prev->timestamp;
+	else
+		run_time = MAX_SLEEP_AVG;
 	spin_lock_irq(&rq->lock);
 
 	/*
@@ -1332,12 +1389,30 @@ pick_next_task:
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
+	if (next->activated) {
+		unsigned long long delta = now - next->timestamp;
+
+		if (next->activated == 1)
+			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
+
+		next->activated = 0;
+		array = next->array;
+		dequeue_task(next, array);
+		recalc_task_prio(next, next->timestamp + delta);
+		enqueue_task(next, array);
+	}
 switch_tasks:
 	prefetch(next);
 	clear_tsk_need_resched(prev);
 	RCU_qsctr(task_cpu(prev))++;
 
+	prev->sleep_avg -= run_time;
+	if ((long)prev->sleep_avg < 0)
+		prev->sleep_avg = 0;
+	prev->timestamp = now;
+
 	if (likely(prev != next)) {
+		next->timestamp = now;
 		rq->nr_switches++;
 		rq->curr = next;
 
@@ -1577,6 +1652,7 @@ void set_user_nice(task_t *p, long nice)
 	unsigned long flags;
 	prio_array_t *array;
 	runqueue_t *rq;
+	int old_prio, new_prio, delta;
 
 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
@@ -1585,6 +1661,12 @@ void set_user_nice(task_t *p, long nice)
 	 * the task might be in the middle of scheduling on another CPU.
 	 */
 	rq = task_rq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it wont have any effect on scheduling until the task is
+	 * not SCHED_NORMAL:
+	 */
 	if (rt_task(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
@@ -1592,16 +1674,20 @@ void set_user_nice(task_t *p, long nice)
 	array = p->array;
 	if (array)
 		dequeue_task(p, array);
+
+	old_prio = p->prio;
+	new_prio = NICE_TO_PRIO(nice);
+	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
-	p->prio = NICE_TO_PRIO(nice);
+	p->prio += delta;
+
 	if (array) {
 		enqueue_task(p, array);
 		/*
-		 * If the task is running and lowered its priority,
-		 * or increased its priority then reschedule its CPU:
+		 * If the task increased its priority or is running and
+		 * lowered its priority, then reschedule its CPU:
 		 */
-		if ((NICE_TO_PRIO(nice) < p->static_prio) ||
-							task_running(rq, p))
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
 			resched_task(rq->curr);
 	}
 out_unlock:
@@ -2358,6 +2444,12 @@ static void move_task_away(struct task_s
 	local_irq_restore(flags);
 }
 
+typedef struct {
+	int cpu;
+	struct completion startup_done;
+	task_t *task;
+} migration_startup_t;
+
 /*
  * migration_thread - this is a highprio system thread that performs
 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -2367,20 +2459,21 @@ static int migration_thread(void * data)
 {
 	/* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */
 	struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 };
-	int cpu = (long) data;
+	migration_startup_t *startup = data;
+	int cpu = startup->cpu;
 	runqueue_t *rq;
 	int ret;
 
+	startup->task = current;
+	complete(&startup->startup_done);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+
+	BUG_ON(smp_processor_id() != cpu);
+
 	daemonize("migration/%d", cpu);
 	set_fs(KERNEL_DS);
 
-	/*
-	 * Either we are running on the right CPU, or there's a a
-	 * migration thread on this CPU, guaranteed (we're started
-	 * serially).
-	 */
-	set_cpus_allowed(current, cpumask_of_cpu(cpu));
-
 	ret = setscheduler(0, SCHED_FIFO, &param);
 
 	rq = this_rq();
@@ -2416,13 +2509,30 @@ static int migration_call(struct notifie
 					  unsigned long action,
 					  void *hcpu)
 {
+	long cpu = (long) hcpu;
+	migration_startup_t startup;
+
 	switch (action) {
 	case CPU_ONLINE:
-		printk("Starting migration thread for cpu %li\n",
-			(long)hcpu);
-		kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
-		while (!cpu_rq((long)hcpu)->migration_thread)
+
+		printk("Starting migration thread for cpu %li\n", cpu);
+
+		startup.cpu = cpu;
+		startup.task = NULL;
+		init_completion(&startup.startup_done);
+
+		kernel_thread(migration_thread, &startup, CLONE_KERNEL);
+		wait_for_completion(&startup.startup_done);
+		wait_task_inactive(startup.task);
+
+		startup.task->thread_info->cpu = cpu;
+		startup.task->cpus_allowed = cpumask_of_cpu(cpu);
+
+		wake_up_process(startup.task);
+
+		while (!cpu_rq(cpu)->migration_thread)
 			yield();
+
 		break;
 	}
 	return NOTIFY_OK;
 }
_