From: Nick Piggin

The following patch adds a cpu_power member to struct sched_group.  This
allows the special casing for SMT groups to be removed from the balancing
code.  It does not take CPU hotplug into account yet, but that shouldn't
be too hard.

I have tested it on the NUMAQ by pretending it has SMT.  Works as
expected; it active balances across nodes.

DESC
sched-group-power warning fixes
EDESC

kernel/sched.c: In function `find_busiest_group':
kernel/sched.c:1466: warning: comparison of distinct pointer types lacks a cast
kernel/sched.c:1467: warning: comparison of distinct pointer types lacks a cast
kernel/sched.c:1474: warning: comparison of distinct pointer types lacks a cast
kernel/sched.c:1457: warning: unused variable `load'

---

 25-akpm/arch/i386/kernel/smpboot.c |   15 +++-
 25-akpm/include/linux/sched.h      |   12 +++
 25-akpm/kernel/sched.c             |  130 +++++++++++++++++++------------------
 3 files changed, 90 insertions(+), 67 deletions(-)

diff -puN arch/i386/kernel/smpboot.c~sched-group-power arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~sched-group-power	Thu Mar 11 14:33:30 2004
+++ 25-akpm/arch/i386/kernel/smpboot.c	Thu Mar 11 14:33:30 2004
@@ -1148,7 +1148,6 @@ __init void arch_init_sched_domains(void
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = nodemask;
-		phys_domain->flags |= SD_FLAG_IDLE;
 
 		*node_domain = SD_NODE_INIT;
 		node_domain->span = cpu_online_map;
@@ -1168,6 +1167,7 @@ __init void arch_init_sched_domains(void
 
 			cpu->cpumask = CPU_MASK_NONE;
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1181,6 +1181,7 @@ __init void arch_init_sched_domains(void
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		int j;
 		cpumask_t nodemask;
+		struct sched_group *node = &sched_group_nodes[i];
 
 		cpus_and(nodemask, node_to_cpumask(i), cpu_online_map);
 		if (cpus_empty(nodemask))
@@ -1196,6 +1197,12 @@ __init void arch_init_sched_domains(void
 				continue;
 
 			cpu->cpumask = cpu_domain->span;
+			/*
+			 * Make each extra sibling increase power by 10% of
+			 * the basic CPU. This is very arbitrary.
+			 */
+			cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
+			node->cpu_power += cpu->cpu_power;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1217,6 +1224,7 @@ __init void arch_init_sched_domains(void
 			continue;
 
 		cpu->cpumask = nodemask;
+		/* ->cpu_power already setup */
 
 		if (!first_cpu)
 			first_cpu = cpu;
@@ -1226,7 +1234,6 @@
 	}
 	last_cpu->next = first_cpu;
-	mb();
 
 	for_each_cpu_mask(i, cpu_online_map) {
 		int node = cpu_to_node(i);
@@ -1264,7 +1271,6 @@ __init void arch_init_sched_domains(void
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = cpu_online_map;
-		phys_domain->flags |= SD_FLAG_IDLE;
 	}
 
 	/* Set up CPU (sibling) groups */
@@ -1281,6 +1287,7 @@ __init void arch_init_sched_domains(void
 
 			cpus_clear(cpu->cpumask);
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1301,6 +1308,8 @@ __init void arch_init_sched_domains(void
 			continue;
 
 		cpu->cpumask = cpu_domain->span;
+		/* See SMT+NUMA setup for comment */
+		cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
 
 		if (!first_cpu)
 			first_cpu = cpu;
diff -puN include/linux/sched.h~sched-group-power include/linux/sched.h
--- 25/include/linux/sched.h~sched-group-power	Thu Mar 11 14:33:30 2004
+++ 25-akpm/include/linux/sched.h	Thu Mar 11 14:33:30 2004
@@ -530,15 +530,25 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 
 #ifdef CONFIG_SMP
+#define SCHED_LOAD_SHIFT 7	/* increase resolution of load calculations */
+#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)
+
 #define SD_FLAG_NEWIDLE		1	/* Balance when about to become idle */
 #define SD_FLAG_EXEC		2	/* Balance on exec */
 #define SD_FLAG_WAKE		4	/* Balance on task wakeup */
 #define SD_FLAG_FASTMIGRATE	8	/* Sync wakes put task on waking CPU */
-#define SD_FLAG_IDLE		16	/* Should not have all CPUs idle */
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
 	cpumask_t cpumask;
+
+	/*
+	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
+	 * single CPU. This should be read only (except for setup). Although
+	 * it will need to be written to at cpu hot(un)plug time, perhaps the
+	 * cpucontrol semaphore will provide enough exclusion?
+	 */
+	unsigned long cpu_power;
 };
 
 struct sched_domain {
diff -puN kernel/sched.c~sched-group-power kernel/sched.c
--- 25/kernel/sched.c~sched-group-power	Thu Mar 11 14:33:30 2004
+++ 25-akpm/kernel/sched.c	Thu Mar 11 14:33:30 2004
@@ -191,9 +191,6 @@ struct prio_array {
 	struct list_head queue[MAX_PRIO];
 };
 
-#define SCHED_LOAD_SHIFT 7	/* increase resolution of load calculations */
-#define SCHED_LOAD_SCALE (1 << SCHED_LOAD_SHIFT)
-
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -1350,16 +1347,14 @@ find_busiest_group(struct sched_domain *
 		   unsigned long *imbalance, enum idle_type idle)
 {
 	unsigned long max_load, avg_load, total_load, this_load;
-	int modify, total_nr_cpus, busiest_nr_cpus, this_nr_cpus;
-	enum idle_type package_idle = IDLE;
-	struct sched_group *busiest = NULL, *group = domain->groups;
+	unsigned int total_pwr;
+	int modify;
+	struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups;
 
 	max_load = 0;
 	this_load = 0;
 	total_load = 0;
-	total_nr_cpus = 0;
-	busiest_nr_cpus = 0;
-	this_nr_cpus = 0;
+	total_pwr = 0;
 
 	if (group == NULL)
 		goto out_balanced;
@@ -1390,8 +1385,6 @@
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
 			load = get_high_cpu_load(i, modify);
-			if (!idle_cpu(i))
-				package_idle = NOT_IDLE;
 		} else
 			load = get_low_cpu_load(i, modify);
@@ -1403,48 +1396,34 @@
 			goto nextgroup;
 
 		total_load += avg_load;
+		total_pwr += group->cpu_power;
 
-		/*
-		 * Load is cumulative over SD_FLAG_IDLE domains, but
-		 * spread over !SD_FLAG_IDLE domains. For example, 2
-		 * processes running on an SMT CPU puts a load of 2 on
-		 * that CPU, however 2 processes running on 2 CPUs puts
-		 * a load of 1 on that domain.
-		 *
-		 * This should be configurable so as SMT siblings become
-		 * more powerful, they can "spread" more load - for example,
-		 * the above case might only count as a load of 1.7.
-		 */
-		if (!(domain->flags & SD_FLAG_IDLE)) {
-			avg_load /= nr_cpus;
-			total_nr_cpus += nr_cpus;
-		} else
-			total_nr_cpus++;
-
-		if (avg_load > max_load)
-			max_load = avg_load;
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power;
 
 		if (local_group) {
 			this_load = avg_load;
-			this_nr_cpus = nr_cpus;
-		} else if (avg_load >= max_load) {
+			this = group;
+			goto nextgroup;
+		}
+		if (avg_load > max_load) {
+			max_load = avg_load;
 			busiest = group;
-			busiest_nr_cpus = nr_cpus;
 		}
 nextgroup:
 		group = group->next;
 	} while (group != domain->groups);
 
-	if (!busiest)
+	if (!busiest || this_load >= max_load)
 		goto out_balanced;
 
-	avg_load = total_load / total_nr_cpus;
-
-	if (this_load >= avg_load)
-		goto out_balanced;
+	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 
-	if (idle == NOT_IDLE && 100*max_load <= domain->imbalance_pct*this_load)
+	if (idle == NOT_IDLE) {
+		if (this_load >= avg_load ||
+			100*max_load <= domain->imbalance_pct*this_load)
 		goto out_balanced;
+	}
 
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
@@ -1458,15 +1437,44 @@ nextgroup:
 	 * appear as very large values with unsigned longs.
 	 */
 	*imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2;
-	/* Get rid of the scaling factor, rounding *up* as we divide */
-	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2 + 1)
-					>> SCHED_LOAD_SHIFT;
-	if (*imbalance == 0)
-		goto out_balanced;
+	if (*imbalance <= SCHED_LOAD_SCALE/2) {
+		unsigned long pwr_now = 0, pwr_move = 0;
+		unsigned long tmp;
+
+		/*
+		 * OK, we don't have enough imbalance to justify moving tasks,
+		 * however we may be able to increase total CPU power used by
+		 * moving them.
+		 */
+
+		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
+		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now >>= SCHED_LOAD_SHIFT;
+
+		/* Amount of load we'd subtract */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		if (max_load > tmp)
+			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+							max_load - tmp);
+
+		/* Amount of load we'd add */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp);
+		pwr_move >>= SCHED_LOAD_SHIFT;
+
+		/* Move if we gain another 8th of a CPU worth of throughput */
+		if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+			goto out_balanced;
+		*imbalance = 1;
+		return busiest;
+	}
 
 	/* How many tasks to actually move to equalise the imbalance */
-	*imbalance *= min(busiest_nr_cpus, this_nr_cpus);
+	*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
+				>> SCHED_LOAD_SHIFT;
+	/* Get rid of the scaling factor, rounding *up* as we divide */
+	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT;
 
 	return busiest;
@@ -1547,26 +1555,19 @@ out:
 	if (!balanced && nr_moved == 0)
 		failed = 1;
 
-	if (domain->flags & SD_FLAG_IDLE && failed && busiest &&
+	if (failed && busiest &&
 	    domain->nr_balance_failed > domain->cache_nice_tries) {
-		int i;
-		for_each_cpu_mask(i, group->cpumask) {
-			int wake = 0;
+		int wake = 0;
 
-			if (!cpu_online(i))
-				continue;
-
-			busiest = cpu_rq(i);
-			spin_lock(&busiest->lock);
-			if (!busiest->active_balance) {
-				busiest->active_balance = 1;
-				busiest->push_cpu = this_cpu;
-				wake = 1;
-			}
-			spin_unlock(&busiest->lock);
-			if (wake)
-				wake_up_process(busiest->migration_thread);
-		}
+		spin_lock(&busiest->lock);
+		if (!busiest->active_balance) {
+			busiest->active_balance = 1;
+			busiest->push_cpu = this_cpu;
+			wake = 1;
+		}
+		spin_unlock(&busiest->lock);
+		if (wake)
+			wake_up_process(busiest->migration_thread);
 	}
 
 	if (failed)
@@ -3250,12 +3251,14 @@ static void __init arch_init_sched_domai
 			continue;
 
 		node->cpumask = nodemask;
+		node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
 
 		for_each_cpu_mask(j, node->cpumask) {
 			struct sched_group *cpu = &sched_group_cpus[j];
 
 			cpus_clear(cpu->cpumask);
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -3302,6 +3305,7 @@ static void __init arch_init_sched_domai
 
 		cpus_clear(cpu->cpumask);
 		cpu_set(i, cpu->cpumask);
+		cpu->cpu_power = SCHED_LOAD_SCALE;
 
 		if (!first_cpu)
 			first_cpu = cpu;
_
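
For reference, here is a small standalone userspace sketch (not kernel code, and not part of the patch) of the fixed-point arithmetic used above: how a sibling group's cpu_power is derived from its sibling count, and how find_busiest_group() normalises a group's raw load by that power before comparing groups.  The helper names and sample numbers below are made up for illustration.

/*
 * Standalone illustration of the cpu_power arithmetic in this patch.
 * Helper names and sample values are hypothetical.
 */
#include <stdio.h>

#define SCHED_LOAD_SHIFT 7			/* same resolution as the patch */
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/* cpu_power of an SMT group: each extra sibling adds 10% of a full CPU */
static unsigned long smt_group_power(unsigned int nr_siblings)
{
	return SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (nr_siblings - 1) / 10;
}

/* Normalise a group's raw load by its cpu_power, as find_busiest_group() does */
static unsigned long scaled_load(unsigned long raw_load, unsigned long cpu_power)
{
	return (raw_load << SCHED_LOAD_SHIFT) / cpu_power;
}

int main(void)
{
	/* A two-sibling package gets power 1.1 * SCHED_LOAD_SCALE... */
	unsigned long smt_pwr = smt_group_power(2);	/* 140 with SHIFT == 7 */
	unsigned long cpu_pwr = SCHED_LOAD_SCALE;	/* plain CPU: 128 */

	/*
	 * ...so two runnable tasks on the siblings look much busier than
	 * one task on a full CPU, and the balancer will pull one away.
	 */
	unsigned long smt_load = scaled_load(2 * SCHED_LOAD_SCALE, smt_pwr);
	unsigned long cpu_load = scaled_load(1 * SCHED_LOAD_SCALE, cpu_pwr);

	printf("SMT group load %lu vs single CPU load %lu (scale %lu)\n",
	       smt_load, cpu_load, SCHED_LOAD_SCALE);
	return 0;
}

With SCHED_LOAD_SHIFT == 7 this prints a scaled load of 234 for the two-task sibling group against 128 for the single CPU carrying one task, which is why the balancer prefers to move a task off the siblings instead of treating the pair as two full CPUs.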