diff -urpN -X /home/fletch/.diff.exclude 431-acenic_fix/kernel/sched.c 440-local_balance_exec/kernel/sched.c
--- 431-acenic_fix/kernel/sched.c	Sat Jun 14 20:31:03 2003
+++ 440-local_balance_exec/kernel/sched.c	Sat Jun 14 20:35:13 2003
@@ -892,32 +892,67 @@ static void sched_migrate_task(task_t *p
  * Find the least loaded CPU. Slightly favor the current CPU by
  * setting its runqueue length as the minimum to start.
  */
+
 static int sched_best_cpu(struct task_struct *p)
 {
-	int i, minload, load, best_cpu, node = 0;
+	int cpu, node, minload, load, best_cpu, best_node;
+	int this_cpu, this_node, this_node_load;
 	unsigned long cpumask;
 
-	best_cpu = task_cpu(p);
-	if (cpu_rq(best_cpu)->nr_running <= 2)
-		return best_cpu;
+	this_cpu = best_cpu = task_cpu(p);
+	if (cpu_rq(this_cpu)->nr_running <= 2)
+		return this_cpu;
 
+	this_node = best_node = cpu_to_node(this_cpu);
+
+	/*
+	 * First look for any node-local idle queue and use that.
+	 * This improves performance under light loads (mbligh).
+	 * In case this node turns out to be the lightest node, store the best
+	 * cpu that we find, so we don't go sniffing the same runqueues again.
+	 */
 	minload = 10000000;
-	for_each_node_with_cpus(i) {
-		load = atomic_read(&node_nr_running[i]);
+	cpumask = node_to_cpumask(this_node);
+	for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+		if (!(cpumask & (1UL << cpu)))
+			continue;
+		load = cpu_rq(cpu)->nr_running;
+		if (load == 0)
+			return cpu;
 		if (load < minload) {
 			minload = load;
-			node = i;
+			best_cpu = cpu;
 		}
 	}
 
+	/* Now find the lightest loaded node, and put it in best_node */
 	minload = 10000000;
-	cpumask = node_to_cpumask(node);
-	for (i = 0; i < NR_CPUS; ++i) {
-		if (!(cpumask & (1UL << i)))
+	this_node_load = atomic_read(&node_nr_running[this_node]);
+	for_each_node_with_cpus(node) {
+		if (node == this_node)
+			load = this_node_load;
+		else
+			load = atomic_read(&node_nr_running[node]);
+		if (load < minload) {
+			minload = load;
+			best_node = node;
+		}
+	}
+
+	/* If we chose this node, we already did the legwork earlier */
+	if (best_node == this_node)
+		return best_cpu;
+
+	/* Now find the lightest loaded cpu on best_node, and use that */
+	minload = 10000000;
+	best_cpu = this_cpu;
+	cpumask = node_to_cpumask(best_node);
+	for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+		if (!(cpumask & (1UL << cpu)))
 			continue;
-		if (cpu_rq(i)->nr_running < minload) {
-			best_cpu = i;
-			minload = cpu_rq(i)->nr_running;
+		load = cpu_rq(cpu)->nr_running;
+		if (load < minload) {
+			minload = load;
+			best_cpu = cpu;
 		}
 	}
 	return best_cpu;
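
The patched sched_best_cpu() proceeds in three phases: if the task's current runqueue is short (nr_running <= 2) the task stays put; otherwise any idle CPU on the task's home node is taken immediately; failing that, the lightest loaded node is located via node_nr_running[] and the lightest loaded CPU on that node is returned. As a rough illustration of that policy (not kernel code), here is a standalone user-space sketch: rq_len[], cpu_node[], node_load() and pick_cpu() are made-up stand-ins for cpu_rq(cpu)->nr_running, cpu_to_node(), node_nr_running[] and sched_best_cpu(), and the patch's shortcut of remembering the best home-node CPU to avoid re-scanning it is replaced here by a plain rescan of whichever node wins.

/*
 * Standalone sketch of the placement policy in the patch above.
 * All data is fabricated; none of these names are kernel interfaces.
 */
#include <stdio.h>

#define NCPUS  8
#define NNODES 2

/* Made-up stand-ins for cpu_rq(cpu)->nr_running and cpu_to_node(cpu). */
static int rq_len[NCPUS]   = { 3, 2, 4, 3, 1, 2, 1, 2 };
static int cpu_node[NCPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };

/* Sum of runqueue lengths on a node (what node_nr_running[] tracks). */
static int node_load(int node)
{
	int cpu, load = 0;

	for (cpu = 0; cpu < NCPUS; cpu++)
		if (cpu_node[cpu] == node)
			load += rq_len[cpu];
	return load;
}

/* Lightest loaded CPU on a node; 'fallback' if the node has no CPUs. */
static int best_cpu_on_node(int node, int fallback)
{
	int cpu, best = fallback, minload = 10000000;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		if (cpu_node[cpu] != node)
			continue;
		if (rq_len[cpu] < minload) {
			minload = rq_len[cpu];
			best = cpu;
		}
	}
	return best;
}

/* Mirror the phases of the patched sched_best_cpu(). */
static int pick_cpu(int this_cpu)
{
	int this_node = cpu_node[this_cpu];
	int node, best_node = this_node;
	int cpu, minload = 10000000;

	/* A lightly loaded current CPU is good enough. */
	if (rq_len[this_cpu] <= 2)
		return this_cpu;

	/* Any idle CPU on the home node wins immediately. */
	for (cpu = 0; cpu < NCPUS; cpu++)
		if (cpu_node[cpu] == this_node && rq_len[cpu] == 0)
			return cpu;

	/* Otherwise find the lightest loaded node... */
	for (node = 0; node < NNODES; node++) {
		if (node_load(node) < minload) {
			minload = node_load(node);
			best_node = node;
		}
	}

	/* ...and the lightest loaded CPU on that node. */
	return best_cpu_on_node(best_node, this_cpu);
}

int main(void)
{
	/*
	 * Task on CPU 0: no idle CPU on node 0, node 1 is lighter (6 vs 12),
	 * ties broken by lowest CPU number, so this prints CPU 4.
	 */
	printf("task on cpu 0 -> cpu %d\n", pick_cpu(0));
	return 0;
}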