From: Con Kolivas

This patch provides full per-package priority support for SMT processors (aka pentium4 hyperthreading) when combined with CONFIG_SCHED_SMT. It maintains the cpu percentage distribution within each physical cpu package by limiting the time a lower priority task can run on a sibling cpu concurrently with a higher priority task.

It introduces a new tunable into the scheduler domain:

	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */

This is empirically set to 15% for pentium4 at the moment and can be adjusted as newer processors come out with improved SMT performance. It should not matter how many siblings there are.

It works by comparing the tasks running on sibling cpus: when a task of lower static priority wants to run alongside a higher priority task, it is delayed until the higher priority task's remaining timeslice satisfies

	high_priority_timeslice * (100 - per_cpu_gain) / 100 <= low_prio_timeslice

e.g. a nice 19 task's timeslice is 10ms and a nice 0 task's timeslice is 102ms.

On vanilla, the nice 0 task runs on one logical cpu while the nice 19 task runs unabated on the other logical cpu. With smt nice handling, the nice 0 task runs on one logical cpu for 102ms while the nice 19 task sleeps until the nice 0 task has 12ms remaining, and only then is scheduled.

Real time tasks and kernel threads are never delayed by this code, and kernel threads do not delay lower priority user tasks.

With lots of thanks to Zwane Mwaikambo and Nick Piggin for help with the coding of this version.

If this is merged, it is probably best to delay pushing it upstream into mainline until sched_domains has been tested for at least one major release.
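For anyone who wants to sanity check that arithmetic before reading the diff, here is a small standalone userspace sketch (plain C, not kernel code) of the delay test that dependent_sleeper() applies below. The 102ms/10ms timeslices and the 15% per_cpu_gain are the example values quoted above; the helper and constant names are made up purely for illustration.

#include <stdio.h>

/* Example values from the description above, not read from the kernel. */
#define PER_CPU_GAIN		15	/* % gained by the extra sibling */
#define NICE0_TIMESLICE_MS	102	/* nice 0 task's timeslice */
#define NICE19_TIMESLICE_MS	10	/* nice 19 task's timeslice */

/*
 * Mirrors the core test in dependent_sleeper(): keep the lower priority
 * task off the sibling cpu while the higher priority task's remaining
 * timeslice, scaled by (100 - per_cpu_gain)%, still exceeds the lower
 * priority task's full timeslice.
 */
static int delay_low_prio(int high_remaining_ms, int low_timeslice_ms)
{
	return (high_remaining_ms * (100 - PER_CPU_GAIN) / 100) > low_timeslice_ms;
}

int main(void)
{
	int remaining;

	/* Walk the nice 0 task's slice down and report the wakeup point. */
	for (remaining = NICE0_TIMESLICE_MS; remaining >= 0; remaining--) {
		if (!delay_low_prio(remaining, NICE19_TIMESLICE_MS)) {
			printf("nice 19 task may run once the nice 0 task has "
			       "%dms of its %dms slice left\n",
			       remaining, NICE0_TIMESLICE_MS);
			break;
		}
	}
	return 0;
}

Running it prints 12ms, which is where the "12ms remaining" figure above comes from.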
---

 25-akpm/arch/i386/kernel/smpboot.c |   12 +++
 25-akpm/include/linux/sched.h      |    5 +
 25-akpm/kernel/sched.c             |  118 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 131 insertions(+), 4 deletions(-)

diff -puN arch/i386/kernel/smpboot.c~sched-smt-nice-handling arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~sched-smt-nice-handling	Mon Mar 1 16:00:18 2004
+++ 25-akpm/arch/i386/kernel/smpboot.c	Mon Mar 1 16:00:18 2004
@@ -1159,8 +1159,12 @@ __init void arch_init_sched_domains(void
 		int j;
 		first_cpu = last_cpu = NULL;
 
-		if (i != first_cpu(cpu_domain->span))
+		if (i != first_cpu(cpu_domain->span)) {
+			cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER;
+			cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
+						SD_FLAG_SHARE_CPUPOWER;
 			continue;
+		}
 
 		for_each_cpu_mask(j, cpu_domain->span) {
 			struct sched_group *cpu = &sched_group_cpus[j];
@@ -1279,8 +1283,12 @@ __init void arch_init_sched_domains(void
 		int j;
 		first_cpu = last_cpu = NULL;
 
-		if (i != first_cpu(cpu_domain->span))
+		if (i != first_cpu(cpu_domain->span)) {
+			cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER;
+			cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
+						SD_FLAG_SHARE_CPUPOWER;
 			continue;
+		}
 
 		for_each_cpu_mask(j, cpu_domain->span) {
 			struct sched_group *cpu = &sched_group_cpus[j];
diff -puN include/linux/sched.h~sched-smt-nice-handling include/linux/sched.h
--- 25/include/linux/sched.h~sched-smt-nice-handling	Mon Mar 1 16:00:18 2004
+++ 25-akpm/include/linux/sched.h	Mon Mar 1 16:00:18 2004
@@ -537,6 +537,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define SD_FLAG_EXEC		2	/* Balance on exec */
 #define SD_FLAG_WAKE		4	/* Balance on task wakeup */
 #define SD_FLAG_FASTMIGRATE	8	/* Sync wakes put task on waking CPU */
+#define SD_FLAG_SHARE_CPUPOWER	16	/* Domain members share cpu power */
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
@@ -562,6 +563,7 @@ struct sched_domain {
 	unsigned int imbalance_pct;	/* No balance until over watermark */
 	unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
 	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
+	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */
 	int flags;			/* See SD_FLAG_* */
 
 	/* Runtime fields. */
@@ -581,6 +583,7 @@ struct sched_domain {
 	.imbalance_pct		= 110,			\
 	.cache_hot_time		= 0,			\
 	.cache_nice_tries	= 0,			\
+	.per_cpu_gain		= 15,			\
 	.flags			= SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE | SD_FLAG_WAKE,\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
@@ -598,6 +601,7 @@ struct sched_domain {
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (5*1000000/2),	\
 	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
 	.flags			= SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE,\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
@@ -616,6 +620,7 @@ struct sched_domain {
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
 	.flags			= SD_FLAG_EXEC,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN kernel/sched.c~sched-smt-nice-handling kernel/sched.c
--- 25/kernel/sched.c~sched-smt-nice-handling	Mon Mar 1 16:00:18 2004
+++ 25-akpm/kernel/sched.c	Mon Mar 1 16:00:18 2004
@@ -207,9 +207,8 @@ struct runqueue {
 	struct mm_struct *prev_mm;
 	prio_array_t *active, *expired, arrays[2];
 	int best_expired_prio;
-
+	int cpu;
 	atomic_t nr_iowait;
-
 #ifdef CONFIG_SMP
 	unsigned long cpu_load[NR_CPUS];
 #endif
@@ -1765,6 +1764,25 @@ static inline void rebalance_tick(int th
 }
 #endif
 
+#ifdef CONFIG_SCHED_SMT
+static inline int wake_priority_sleeper(runqueue_t *rq)
+{
+	/*
+	 * If an SMT sibling task has been put to sleep for priority
+	 * reasons reschedule the idle task to see if it can now run.
+	 */
+	if (rq->nr_running) {
+		resched_task(rq->idle);
+		return 1;
+	}
+	return 0;
+}
+#else
+static inline int wake_priority_sleeper(runqueue_t *rq)
+{
+	return 0;
+}
+#endif
+
 DEFINE_PER_CPU(struct kernel_stat, kstat);
 
 EXPORT_PER_CPU_SYMBOL(kstat);
@@ -1818,6 +1836,8 @@ void scheduler_tick(int user_ticks, int
 			cpustat->iowait += sys_ticks;
 		else
 			cpustat->idle += sys_ticks;
+		if (wake_priority_sleeper(rq))
+			goto out;
 		rebalance_tick(cpu, rq, IDLE);
 		return;
 	}
@@ -1905,6 +1925,93 @@ out:
 	rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
+#ifdef CONFIG_SCHED_SMT
+static inline void wake_sleeping_dependent(runqueue_t *rq)
+{
+	int i, this_cpu = rq->cpu;
+	struct sched_domain *sd = cpu_sched_domain(this_cpu);
+	cpumask_t sibling_map;
+
+	if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) {
+		/* Not SMT */
+		return;
+	}
+
+	cpus_and(sibling_map, sd->span, cpu_online_map);
+	cpu_clear(this_cpu, sibling_map);
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq;
+
+		smt_rq = cpu_rq(i);
+
+		/*
+		 * If an SMT sibling task is sleeping due to priority
+		 * reasons wake it up now.
+		 */
+		if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
+			resched_task(smt_rq->idle);
+	}
+}
+
+static inline int dependent_sleeper(runqueue_t *rq, task_t *p)
+{
+	int ret = 0, i, this_cpu = rq->cpu;
+	struct sched_domain *sd = cpu_sched_domain(this_cpu);
+	cpumask_t sibling_map;
+
+	if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) {
+		/* Not SMT */
+		return 0;
+	}
+
+	cpus_and(sibling_map, sd->span, cpu_online_map);
+	cpu_clear(this_cpu, sibling_map);
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq;
+		task_t *smt_curr;
+
+		smt_rq = cpu_rq(i);
+		smt_curr = smt_rq->curr;
+
+		/*
+		 * If a user task with lower static priority than the
+		 * running task on the SMT sibling is trying to schedule,
+		 * delay it till there is proportionately less timeslice
+		 * left of the sibling task to prevent a lower priority
+		 * task from using an unfair proportion of the
+		 * physical cpu's resources. -ck
+		 */
+		if (p->mm && smt_curr->mm && !rt_task(p) &&
+			((p->static_prio > smt_curr->static_prio &&
+			(smt_curr->time_slice * (100 - sd->per_cpu_gain) /
+			100) > task_timeslice(p)) ||
+			rt_task(smt_curr)))
+				ret |= 1;
+
+		/*
+		 * Reschedule a lower priority task on the SMT sibling,
+		 * or wake it up if it has been put to sleep for priority
+		 * reasons.
+		 */
+		if ((smt_curr != smt_rq->idle &&
+			smt_curr->static_prio > p->static_prio) ||
+			(rt_task(p) && !rt_task(smt_curr)) ||
+			(smt_curr == smt_rq->idle && smt_rq->nr_running))
+				resched_task(smt_curr);
+	}
+	return ret;
+}
+#else
+static inline void wake_sleeping_dependent(runqueue_t *rq)
+{
+}
+
+static inline int dependent_sleeper(runqueue_t *rq, task_t *p)
+{
+	return 0;
+}
+#endif
+
 void scheduling_functions_start_here(void) { }
 
 /*
@@ -1976,6 +2083,7 @@ need_resched:
 		if (!rq->nr_running) {
 			next = rq->idle;
 			rq->expired_timestamp = 0;
+			wake_sleeping_dependent(rq);
 			goto switch_tasks;
 		}
 	}
@@ -1996,6 +2104,11 @@ need_resched:
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
+	if (dependent_sleeper(rq, next)) {
+		next = rq->idle;
+		goto switch_tasks;
+	}
+
 	if (next->activated > 0) {
 		unsigned long long delta = now - next->timestamp;
 
@@ -3424,6 +3537,7 @@ void __init sched_init(void)
 #endif
 
 		rq = cpu_rq(i);
+		rq->cpu = i;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		rq->best_expired_prio = MAX_PRIO;
_