diff options
author | Andrew Morton <akpm@osdl.org> | 2004-05-09 23:25:34 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-05-09 23:25:34 -0700 |
commit | 47ad0fced4c434ac4f37bc9f40e840b32693e55d (patch) | |
tree | 687c6116315c2189eb70c6877e2ecf75c13f0807 /kernel | |
parent | a5f39fd8b3fc065350d9bd51a44c19a7b930c047 (diff) | |
download | history-47ad0fced4c434ac4f37bc9f40e840b32693e55d.tar.gz |
[PATCH] sched: SMT niceness handling
From: Con Kolivas <kernel@kolivas.org>
This patch provides full per-package priority support for SMT processors
(aka pentium4 hyperthreading) when combined with CONFIG_SCHED_SMT.
It maintains cpu percentage distribution within each physical cpu package
by limiting the time a lower priority task can run on a sibling cpu
concurrently with a higher priority task.
It introduces a new flag into the scheduler domain
unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */
This is empirically set to 15% for pentium4 at the moment and can be
modified to support different values dynamically as newer processors come
out with improved SMT performance. It should not matter how many siblings
there are.
It works by comparing tasks running on sibling cpus; when a lower
static priority task is running, it is delayed until
high_priority_timeslice * (100 - per_cpu_gain) / 100 <= low_prio_timeslice
e.g. a nice 19 task timeslice is 10ms and a nice 0 timeslice is 102ms.  On
vanilla the nice 0 task runs on one logical cpu while the nice 19 task runs
unabated on the other logical cpu. With smtnice the nice 0 runs on one
logical cpu for 102ms and the nice 19 sleeps till the nice 0 task has 12ms
remaining and then will schedule.
Real time tasks and kernel threads are not altered by this code, and kernel
threads do not delay lower priority user tasks.
With lots of thanks to Zwane Mwaikambo and Nick Piggin for help with the
coding of this version.
If this is merged, it is probably best to delay pushing this upstream in
mainline till sched_domains gets tested for at least one major release.
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched.c | 117 |
1 files changed, 115 insertions, 2 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 09979ac4b676b..eb0eb124b4c24 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1772,6 +1772,25 @@ static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_t } #endif +#ifdef CONFIG_SCHED_SMT +static inline int wake_priority_sleeper(runqueue_t *rq) +{ /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + return 1; + } + return 0; +} +#else +static inline int wake_priority_sleeper(runqueue_t *rq) +{ + return 0; +} +#endif + DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); @@ -1825,6 +1844,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; + if (wake_priority_sleeper(rq)) + goto out; rebalance_tick(cpu, rq, IDLE); return; } @@ -1912,6 +1933,91 @@ out: rebalance_tick(cpu, rq, NOT_IDLE); } +#ifdef CONFIG_SCHED_SMT +static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) +{ + int i; + struct sched_domain *sd = cpu_sched_domain(cpu); + cpumask_t sibling_map; + + if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { + /* Not SMT */ + return; + } + + cpus_and(sibling_map, sd->span, cpu_online_map); + cpu_clear(cpu, sibling_map); + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq; + + smt_rq = cpu_rq(i); + + /* + * If an SMT sibling task is sleeping due to priority + * reasons wake it up now. 
+ */ + if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) + resched_task(smt_rq->idle); + } +} + +static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) +{ + int ret = 0, i; + struct sched_domain *sd = cpu_sched_domain(cpu); + cpumask_t sibling_map; + + if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { + /* Not SMT */ + return 0; + } + + cpus_and(sibling_map, sd->span, cpu_online_map); + cpu_clear(cpu, sibling_map); + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq; + task_t *smt_curr; + + smt_rq = cpu_rq(i); + smt_curr = smt_rq->curr; + + /* + * If a user task with lower static priority than the + * running task on the SMT sibling is trying to schedule, + * delay it till there is proportionately less timeslice + * left of the sibling task to prevent a lower priority + * task from using an unfair proportion of the + * physical cpu's resources. -ck + */ + if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(p) || rt_task(smt_curr)) && + p->mm && smt_curr->mm && !rt_task(p)) + ret |= 1; + + /* + * Reschedule a lower priority task on the SMT sibling, + * or wake it up if it has been put to sleep for priority + * reasons. + */ + if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(smt_curr) || rt_task(p)) && + smt_curr->mm && p->mm && !rt_task(smt_curr)) || + (smt_curr == smt_rq->idle && smt_rq->nr_running)) + resched_task(smt_curr); + } + return ret; +} +#else +static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) +{ +} + +static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) +{ + return 0; +} +#endif + /* * schedule() is the main scheduler function. */ @@ -1924,7 +2030,7 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int idx; + int cpu, idx; /* * Test if we are atomic. 
Since do_exit() needs to call into @@ -1974,13 +2080,15 @@ need_resched: deactivate_task(prev, rq); } + cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { #ifdef CONFIG_SMP - idle_balance(smp_processor_id(), rq); + idle_balance(cpu, rq); #endif if (!rq->nr_running) { next = rq->idle; rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); goto switch_tasks; } } @@ -2001,6 +2109,11 @@ need_resched: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + if (dependent_sleeper(cpu, rq, next)) { + next = rq->idle; + goto switch_tasks; + } + if (!rt_task(next) && next->activated > 0) { unsigned long long delta = now - next->timestamp; |