From: Andrew Theurer

This patch addresses some problems with wake_idle().  Currently wake_idle()
will wake a task on an alternate cpu if:

1) task->cpu is not idle
2) an idle cpu can be found

However, the span of cpus searched is very limited (only task->cpu's
siblings).  The scheduler should instead find the closest idle cpu, starting
with the lowest level domain, then moving to higher level domains if allowed
(the domain has the SD_WAKE_IDLE flag).  This patch does this.
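For illustration only, below is a minimal userspace model of the new search
order.  The struct domain, the span masks, and the idle[] table are
inventions for this sketch, not the kernel's sched_domain API:

/* Standalone model of the new wake_idle() search order.  Everything
 * here is made up for illustration; it is not kernel code. */
#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 8

struct domain {
	unsigned long span;	/* bitmask of cpus covered by this level */
	bool wake_idle;		/* models the SD_WAKE_IDLE flag */
};

/* Nested levels around cpu 0: SMT siblings, package, whole machine.
 * SD_WAKE_IDLE is clear at the top level, so the walk stops there. */
static const struct domain domains[] = {
	{ 0x03, true },		/* cpus 0-1 */
	{ 0x0f, true },		/* cpus 0-3 */
	{ 0xff, false },	/* cpus 0-7 */
};

static const bool idle[NR_CPUS] = {
	false, false, true, true, true, true, true, true
};

/* Walk from the closest level outward, so the first idle cpu found is
 * also the nearest one.  Fall back to the original cpu. */
static int wake_idle_model(int cpu, unsigned long cpus_allowed)
{
	size_t d;
	int i;

	if (idle[cpu])
		return cpu;

	for (d = 0; d < sizeof(domains) / sizeof(domains[0]); d++) {
		unsigned long tmp;

		if (!domains[d].wake_idle)
			break;
		tmp = domains[d].span & cpus_allowed;
		for (i = 0; i < NR_CPUS; i++)
			if ((tmp & (1UL << i)) && idle[i])
				return i;
	}
	return cpu;
}

int main(void)
{
	/* cpu 0 and its sibling 1 are busy, so the search widens to the
	 * package level and picks cpu 2. */
	printf("wake on cpu %d\n", wake_idle_model(0, 0xffUL));
	return 0;
}

The for_each_domain() walk in the patch below implements the same ordering,
with cpumasks in place of these toy bitmasks.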
This and the other two patches (also to be submitted) combined have provided
as much as 5% improvement on that "online transaction DB workload" and 2% on
the industry standard J2EE workload.  The cpus_allowed check in
try_to_wake_up() is dropped as redundant, since wake_idle() now searches only
cpus in p->cpus_allowed (and cpu_online_map).

I asked Martin Bligh to test these for regression, and he did not find any.
I would like to submit this for inclusion in -mm and, barring any problems,
eventually in mainline.

Signed-off-by: Andrew Theurer
Signed-off-by: Andrew Morton
---

 25-akpm/include/asm-i386/topology.h   |    1 +
 25-akpm/include/asm-ia64/topology.h   |    1 +
 25-akpm/include/asm-ppc64/topology.h  |    1 +
 25-akpm/include/asm-x86_64/topology.h |    1 +
 25-akpm/include/linux/topology.h      |    1 +
 25-akpm/kernel/sched.c                |   30 +++++++++++++++---------------
 6 files changed, 20 insertions(+), 15 deletions(-)

diff -puN include/asm-i386/topology.h~sched-more-agressive-wake_idle include/asm-i386/topology.h
--- 25/include/asm-i386/topology.h~sched-more-agressive-wake_idle	2004-12-03 20:56:29.576263912 -0800
+++ 25-akpm/include/asm-i386/topology.h	2004-12-03 20:56:29.587262240 -0800
@@ -80,6 +80,7 @@ static inline cpumask_t pcibus_to_cpumas
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
+				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN include/asm-ia64/topology.h~sched-more-agressive-wake_idle include/asm-ia64/topology.h
--- 25/include/asm-ia64/topology.h~sched-more-agressive-wake_idle	2004-12-03 20:56:29.578263608 -0800
+++ 25-akpm/include/asm-ia64/topology.h	2004-12-03 20:56:29.587262240 -0800
@@ -56,6 +56,7 @@ void build_cpu_to_node_map(void);
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
+				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN include/asm-ppc64/topology.h~sched-more-agressive-wake_idle include/asm-ppc64/topology.h
--- 25/include/asm-ppc64/topology.h~sched-more-agressive-wake_idle	2004-12-03 20:56:29.579263456 -0800
+++ 25-akpm/include/asm-ppc64/topology.h	2004-12-03 20:56:29.588262088 -0800
@@ -51,6 +51,7 @@ static inline int node_to_first_cpu(int
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
+				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN include/asm-x86_64/topology.h~sched-more-agressive-wake_idle include/asm-x86_64/topology.h
--- 25/include/asm-x86_64/topology.h~sched-more-agressive-wake_idle	2004-12-03 20:56:29.580263304 -0800
+++ 25-akpm/include/asm-x86_64/topology.h	2004-12-03 20:56:29.588262088 -0800
@@ -53,6 +53,7 @@ static inline cpumask_t __pcibus_to_cpum
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
+				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN include/linux/topology.h~sched-more-agressive-wake_idle include/linux/topology.h
--- 25/include/linux/topology.h~sched-more-agressive-wake_idle	2004-12-03 20:56:29.582263000 -0800
+++ 25-akpm/include/linux/topology.h	2004-12-03 20:56:29.588262088 -0800
@@ -123,6 +123,7 @@ static inline int __next_node_with_cpus(
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
+				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN kernel/sched.c~sched-more-agressive-wake_idle kernel/sched.c
--- 25/kernel/sched.c~sched-more-agressive-wake_idle	2004-12-03 20:56:29.584262696 -0800
+++ 25-akpm/kernel/sched.c	2004-12-03 20:56:29.592261480 -0800
@@ -936,9 +936,10 @@ static inline unsigned long target_load(
 #endif
 
 /*
- * wake_idle() is useful especially on SMT architectures to wake a
- * task onto an idle sibling if we would otherwise wake it onto a
- * busy sibling.
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available. The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
  *
  * Returns the CPU we should wake onto.
  */
@@ -946,24 +947,23 @@ static inline unsigned long target_load(
 static int wake_idle(int cpu, task_t *p)
 {
 	cpumask_t tmp;
-	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *sd;
 	int i;
 
 	if (idle_cpu(cpu))
 		return cpu;
 
-	sd = rq->sd;
-	if (!(sd->flags & SD_WAKE_IDLE))
-		return cpu;
-
-	cpus_and(tmp, sd->span, p->cpus_allowed);
-
-	for_each_cpu_mask(i, tmp) {
-		if (idle_cpu(i))
-			return i;
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_IDLE) {
+			cpus_and(tmp, sd->span, cpu_online_map);
+			cpus_and(tmp, tmp, p->cpus_allowed);
+			for_each_cpu_mask(i, tmp) {
+				if (idle_cpu(i))
+					return i;
+			}
+		}
+		else break;
 	}
-
 	return cpu;
 }
 #else
@@ -1075,7 +1075,7 @@ static int try_to_wake_up(task_t * p, un
 out_set_cpu:
 	schedstat_inc(rq, ttwu_attempts);
 	new_cpu = wake_idle(new_cpu, p);
-	if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
+	if (new_cpu != cpu) {
 		schedstat_inc(rq, ttwu_moved);
 		set_task_cpu(p, new_cpu);
 		task_rq_unlock(rq, &flags);
_