diff -urpN -X /home/fletch/.diff.exclude 430-kgdb_cleanup/kernel/sched.c 440-local_balance_exec/kernel/sched.c --- 430-kgdb_cleanup/kernel/sched.c Wed Aug 13 20:29:41 2003 +++ 440-local_balance_exec/kernel/sched.c Wed Aug 13 20:46:12 2003 @@ -787,38 +787,75 @@ static void sched_migrate_task(task_t *p * Find the least loaded CPU. Slightly favor the current CPU by * setting its runqueue length as the minimum to start. */ + static int sched_best_cpu(struct task_struct *p) { - int i, minload, load, best_cpu, node = 0; + int cpu, node, minload, load, best_cpu, best_node; + int this_cpu, this_node, this_node_load; unsigned long cpumask; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + this_cpu = best_cpu = task_cpu(p); + if (cpu_rq(this_cpu)->nr_running <= 2) + return this_cpu; + this_node = best_node = cpu_to_node(this_cpu); + /* + * First look for any node-local idle queue and use that. + * This improves performance under light loads (mbligh). + * In case this node turns out to be the lightest node, store the best + * cpu that we find, so we don't go sniffing the same runqueues again. + */ minload = 10000000; - for_each_node_with_cpus(i) { - /* - * Node load is always divided by nr_cpus_node to normalise - * load values in case cpu count differs from node to node. - * We first multiply node_nr_running by 10 to get a little - * better resolution. - */ - load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i); + cpumask = node_to_cpumask(this_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!(cpumask & (1UL << cpu))) + continue; + load = cpu_rq(cpu)->nr_running; + if (load == 0) + return cpu; if (load < minload) { minload = load; - node = i; + best_cpu = cpu; } } + /* + * Now find the lightest loaded node, and put it in best_node + * + * Node load is always divided by nr_cpus_node to normalise load + * values in case cpu count differs from node to node. 
We first + * multiply node_nr_running by 16 to get a little better resolution. */ minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!(cpumask & (1UL << i))) + this_node_load = 16 * atomic_read(&node_nr_running[this_node]) + / nr_cpus_node(this_node); + for_each_node_with_cpus(node) { + if (node == this_node) + load = this_node_load; + else + load = 16 * atomic_read(&node_nr_running[node]) + / nr_cpus_node(node); + if (load < minload) { + minload = load; + best_node = node; + } + } + + /* If we chose this node, we already did the legwork earlier */ + if (best_node == this_node) + return best_cpu; + + /* Now find the lightest loaded cpu on best_node, and use that */ + minload = 10000000; + best_cpu = this_cpu; + cpumask = node_to_cpumask(best_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!(cpumask & (1UL << cpu))) continue; - if (cpu_rq(i)->nr_running < minload) { - best_cpu = i; - minload = cpu_rq(i)->nr_running; + load = cpu_rq(cpu)->nr_running; + if (load < minload) { + minload = load; + best_cpu = cpu; } } return best_cpu;