author     Andrew Morton <akpm@osdl.org>  2004-05-09 23:25:34 -0700
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2004-05-09 23:25:34 -0700
commit     47ad0fced4c434ac4f37bc9f40e840b32693e55d (patch)
tree       687c6116315c2189eb70c6877e2ecf75c13f0807 /kernel
parent     a5f39fd8b3fc065350d9bd51a44c19a7b930c047 (diff)
download   history-47ad0fced4c434ac4f37bc9f40e840b32693e55d.tar.gz
[PATCH] sched: SMT niceness handling
From: Con Kolivas <kernel@kolivas.org>

This patch provides full per-package priority support for SMT processors (aka pentium4 hyperthreading) when combined with CONFIG_SCHED_SMT.

It maintains cpu percentage distribution within each physical cpu package by limiting the time a lower priority task can run on a sibling cpu concurrently with a higher priority task.

It introduces a new flag into the scheduler domain:

	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */

This is empirically set to 15% for pentium4 at the moment and can be modified to support different values dynamically as newer processors come out with improved SMT performance. It should not matter how many siblings there are.

How it works: it compares tasks running on sibling cpus, and when a lower static priority task is running, it is delayed until

	high_priority_timeslice * (100 - per_cpu_gain) / 100 <= low_prio_timeslice

e.g. a nice 19 task's timeslice is 10ms and a nice 0 task's timeslice is 102ms. On vanilla, the nice 0 task runs on one logical cpu while the nice 19 task runs unabated on the other logical cpu. With smtnice, the nice 0 task runs on one logical cpu for 102ms, and the nice 19 task sleeps until the nice 0 task has 12ms remaining, at which point it schedules.

Real time tasks and kernel threads are not altered by this code, and kernel threads do not delay lower priority user tasks.

With lots of thanks to Zwane Mwaikambo and Nick Piggin for help with the coding of this version.

If this is merged, it is probably best to delay pushing this upstream in mainline till sched_domains gets tested for at least one major release.
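As a rough illustration (not part of the patch), here is a minimal standalone sketch of the delay rule described above. should_delay(), its parameter names, and the driver loop are hypothetical; only the 15% per_cpu_gain and the 102ms/10ms timeslices are taken from the description:

	#include <stdio.h>

	/*
	 * Sketch of the condition dependent_sleeper() applies below:
	 * keep the lower priority task off the sibling cpu while the
	 * higher priority task's remaining timeslice, scaled down by
	 * (100 - per_cpu_gain)%, still exceeds the lower priority
	 * task's full timeslice.
	 */
	static int should_delay(unsigned int high_remaining_ms,
				unsigned int low_timeslice_ms,
				unsigned int per_cpu_gain)
	{
		return high_remaining_ms * (100 - per_cpu_gain) / 100
			> low_timeslice_ms;
	}

	int main(void)
	{
		unsigned int remaining;

		/* nice 0 timeslice 102ms, nice 19 timeslice 10ms, 15% gain */
		for (remaining = 102; remaining > 0; remaining--) {
			if (!should_delay(remaining, 10, 15)) {
				/* prints 12, matching the "12ms remaining"
				 * figure in the description above */
				printf("nice 19 task runs with %ums left\n",
				       remaining);
				break;
			}
		}
		return 0;
	}

With the kernel's integer arithmetic the crossover lands exactly at 12ms (12 * 85 / 100 == 10, which is not greater than 10), slightly above the real-valued crossover of about 11.8ms.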
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c	117
1 file changed, 115 insertions(+), 2 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 09979ac4b676b..eb0eb124b4c24 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1772,6 +1772,25 @@ static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_t
}
#endif
+#ifdef CONFIG_SCHED_SMT
+static inline int wake_priority_sleeper(runqueue_t *rq)
+{
+ /*
+ * If an SMT sibling task has been put to sleep for priority
+ * reasons reschedule the idle task to see if it can now run.
+ */
+ if (rq->nr_running) {
+ resched_task(rq->idle);
+ return 1;
+ }
+ return 0;
+}
+#else
+static inline int wake_priority_sleeper(runqueue_t *rq)
+{
+ return 0;
+}
+#endif
+
DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);
@@ -1825,6 +1844,8 @@ void scheduler_tick(int user_ticks, int sys_ticks)
cpustat->iowait += sys_ticks;
else
cpustat->idle += sys_ticks;
+ if (wake_priority_sleeper(rq))
+ goto out;
rebalance_tick(cpu, rq, IDLE);
return;
}
@@ -1912,6 +1933,91 @@ out:
rebalance_tick(cpu, rq, NOT_IDLE);
}
+#ifdef CONFIG_SCHED_SMT
+static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
+{
+ int i;
+ struct sched_domain *sd = cpu_sched_domain(cpu);
+ cpumask_t sibling_map;
+
+ if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) {
+ /* Not SMT */
+ return;
+ }
+
+ cpus_and(sibling_map, sd->span, cpu_online_map);
+ cpu_clear(cpu, sibling_map);
+ for_each_cpu_mask(i, sibling_map) {
+ runqueue_t *smt_rq;
+
+ smt_rq = cpu_rq(i);
+
+ /*
+ * If an SMT sibling task is sleeping due to priority
+ * reasons wake it up now.
+ */
+ if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
+ resched_task(smt_rq->idle);
+ }
+}
+
+static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
+{
+ int ret = 0, i;
+ struct sched_domain *sd = cpu_sched_domain(cpu);
+ cpumask_t sibling_map;
+
+ if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) {
+ /* Not SMT */
+ return 0;
+ }
+
+ cpus_and(sibling_map, sd->span, cpu_online_map);
+ cpu_clear(cpu, sibling_map);
+ for_each_cpu_mask(i, sibling_map) {
+ runqueue_t *smt_rq;
+ task_t *smt_curr;
+
+ smt_rq = cpu_rq(i);
+ smt_curr = smt_rq->curr;
+
+ /*
+ * If a user task with lower static priority than the
+ * running task on the SMT sibling is trying to schedule,
+ * delay it till there is proportionately less timeslice
+ * left of the sibling task to prevent a lower priority
+ * task from using an unfair proportion of the
+ * physical cpu's resources. -ck
+ */
+ if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
+ task_timeslice(p) || rt_task(smt_curr)) &&
+ p->mm && smt_curr->mm && !rt_task(p))
+ ret |= 1;
+
+ /*
+ * Reschedule a lower priority task on the SMT sibling,
+ * or wake it up if it has been put to sleep for priority
+ * reasons.
+ */
+ if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
+ task_timeslice(smt_curr) || rt_task(p)) &&
+ smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
+ (smt_curr == smt_rq->idle && smt_rq->nr_running))
+ resched_task(smt_curr);
+ }
+ return ret;
+}
+#else
+static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
+{
+}
+
+static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
+{
+ return 0;
+}
+#endif
+
/*
* schedule() is the main scheduler function.
*/
@@ -1924,7 +2030,7 @@ asmlinkage void __sched schedule(void)
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int idx;
+ int cpu, idx;
/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -1974,13 +2080,15 @@ need_resched:
deactivate_task(prev, rq);
}
+ cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) {
#ifdef CONFIG_SMP
- idle_balance(smp_processor_id(), rq);
+ idle_balance(cpu, rq);
#endif
if (!rq->nr_running) {
next = rq->idle;
rq->expired_timestamp = 0;
+ wake_sleeping_dependent(cpu, rq);
goto switch_tasks;
}
}
@@ -2001,6 +2109,11 @@ need_resched:
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
+ if (dependent_sleeper(cpu, rq, next)) {
+ next = rq->idle;
+ goto switch_tasks;
+ }
+
if (!rt_task(next) && next->activated > 0) {
unsigned long long delta = now - next->timestamp;