diff -urpN -X /home/fletch/.diff.exclude 430-kgdb_cleanup/kernel/sched.c 440-local_balance_exec/kernel/sched.c --- 430-kgdb_cleanup/kernel/sched.c Wed Aug 13 20:29:41 2003 +++ 440-local_balance_exec/kernel/sched.c Wed Aug 13 20:46:12 2003 @@ -787,38 +787,75 @@ static void sched_migrate_task(task_t *p * Find the least loaded CPU. Slightly favor the current CPU by * setting its runqueue length as the minimum to start. */ + static int sched_best_cpu(struct task_struct *p) { - int i, minload, load, best_cpu, node = 0; + int cpu, node, minload, load, best_cpu, best_node; + int this_cpu, this_node, this_node_load; unsigned long cpumask; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + this_cpu = best_cpu = task_cpu(p); + if (cpu_rq(this_cpu)->nr_running <= 2) + return this_cpu; + this_node = best_node = cpu_to_node(this_cpu); + /* + * First look for any node-local idle queue and use that. + * This improves performance under light loads (mbligh). + * In case this node turns out to be the lightest node, store the best + * cpu that we find, so we don't go sniffing the same runqueues again. + */ minload = 10000000; - for_each_node_with_cpus(i) { - /* - * Node load is always divided by nr_cpus_node to normalise - * load values in case cpu count differs from node to node. - * We first multiply node_nr_running by 10 to get a little - * better resolution. - */ - load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i); + cpumask = node_to_cpumask(this_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!(cpumask & (1UL << cpu))) + continue; + load = cpu_rq(cpu)->nr_running; + if (load == 0) + return cpu; if (load < minload) { minload = load; - node = i; + best_cpu = cpu; } } + /* + * Now find the lightest loaded node, and put it in best_node + * + * Node load is always divided by nr_cpus_node to normalise load + * values in case cpu count differs from node to node. 
We first + * multiply node_nr_running by 16 to get a little better resolution. */ minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!(cpumask & (1UL << i))) + this_node_load = 16 * atomic_read(&node_nr_running[this_node]) + / nr_cpus_node(this_node); + for_each_node_with_cpus(node) { + if (node == this_node) + load = this_node_load; + else + load = 16 * atomic_read(&node_nr_running[node]) + / nr_cpus_node(node); + if (load < minload) { + minload = load; + best_node = node; + } + } + + /* If we chose this node, we already did the legwork earlier */ + if (best_node == this_node) + return best_cpu; + + /* Now find the lightest loaded cpu on best_node, and use that */ + minload = 10000000; + best_cpu = this_cpu; + cpumask = node_to_cpumask(best_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!(cpumask & (1UL << cpu))) continue; - if (cpu_rq(i)->nr_running < minload) { - best_cpu = i; - minload = cpu_rq(i)->nr_running; + load = cpu_rq(cpu)->nr_running; + if (load < minload) { + minload = load; + best_cpu = cpu; } } return best_cpu;