From: Nick Piggin

The following patch adds a cpu_power member to struct sched_group.  This
allows the special casing for SMT groups to be removed from the balancing
code.  It does not take CPU hotplug into account yet, but that shouldn't
be too hard.

I have tested it on the NUMAQ by pretending it has SMT.  Works as
expected; it active balances across nodes.

DESC
sched-group-power warning fixes
EDESC

kernel/sched.c: In function `find_busiest_group':
kernel/sched.c:1466: warning: comparison of distinct pointer types lacks a cast
kernel/sched.c:1467: warning: comparison of distinct pointer types lacks a cast
kernel/sched.c:1474: warning: comparison of distinct pointer types lacks a cast
kernel/sched.c:1457: warning: unused variable `load'

---

 25-akpm/arch/i386/kernel/smpboot.c |   15 +++-
 25-akpm/include/linux/sched.h      |   12 +++
 25-akpm/kernel/sched.c             |  130 +++++++++++++++++++------------------
 3 files changed, 90 insertions(+), 67 deletions(-)

diff -puN arch/i386/kernel/smpboot.c~sched-group-power arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~sched-group-power	Thu Mar 11 14:33:30 2004
+++ 25-akpm/arch/i386/kernel/smpboot.c	Thu Mar 11 14:33:30 2004
@@ -1148,7 +1148,6 @@ __init void arch_init_sched_domains(void
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = nodemask;
-		phys_domain->flags |= SD_FLAG_IDLE;
 
 		*node_domain = SD_NODE_INIT;
 		node_domain->span = cpu_online_map;
@@ -1168,6 +1167,7 @@ __init void arch_init_sched_domains(void
 
 			cpu->cpumask = CPU_MASK_NONE;
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1181,6 +1181,7 @@ __init void arch_init_sched_domains(void
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		int j;
 		cpumask_t nodemask;
+		struct sched_group *node = &sched_group_nodes[i];
 
 		cpus_and(nodemask, node_to_cpumask(i), cpu_online_map);
 		if (cpus_empty(nodemask))
@@ -1196,6 +1197,12 @@ __init void arch_init_sched_domains(void
 				continue;
 
 			cpu->cpumask = cpu_domain->span;
+			/*
+			 * Make each extra sibling increase power by 10% of
+			 * the basic CPU. This is very arbitrary.
+			 */
+			cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
+			node->cpu_power += cpu->cpu_power;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1217,6 +1224,7 @@ __init void arch_init_sched_domains(void
 			continue;
 
 		cpu->cpumask = nodemask;
+		/* ->cpu_power already setup */
 
 		if (!first_cpu)
 			first_cpu = cpu;
@@ -1226,7 +1234,6 @@
 	}
 	last_cpu->next = first_cpu;
-	mb();
 
 	for_each_cpu_mask(i, cpu_online_map) {
 		int node = cpu_to_node(i);
@@ -1264,7 +1271,6 @@ __init void arch_init_sched_domains(void
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = cpu_online_map;
-		phys_domain->flags |= SD_FLAG_IDLE;
 	}
 
 	/* Set up CPU (sibling) groups */
@@ -1281,6 +1287,7 @@ __init void arch_init_sched_domains(void
 
 			cpus_clear(cpu->cpumask);
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1301,6 +1308,8 @@ __init void arch_init_sched_domains(void
 			continue;
 
 		cpu->cpumask = cpu_domain->span;
+		/* See SMT+NUMA setup for comment */
+		cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
 
 		if (!first_cpu)
 			first_cpu = cpu;
diff -puN include/linux/sched.h~sched-group-power include/linux/sched.h
--- 25/include/linux/sched.h~sched-group-power	Thu Mar 11 14:33:30 2004
+++ 25-akpm/include/linux/sched.h	Thu Mar 11 14:33:30 2004
@@ -530,15 +530,25 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 
 #ifdef CONFIG_SMP
+#define SCHED_LOAD_SHIFT 7	/* increase resolution of load calculations */
+#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)
+
 #define SD_FLAG_NEWIDLE		1	/* Balance when about to become idle */
 #define SD_FLAG_EXEC		2	/* Balance on exec */
 #define SD_FLAG_WAKE		4	/* Balance on task wakeup */
 #define SD_FLAG_FASTMIGRATE	8	/* Sync wakes put task on waking CPU */
-#define SD_FLAG_IDLE		16	/* Should not have all CPUs idle */
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
 	cpumask_t cpumask;
+
+	/*
+	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
+	 * single CPU. This should be read only (except for setup). Although
+	 * it will need to be written to at cpu hot(un)plug time, perhaps the
+	 * cpucontrol semaphore will provide enough exclusion?
+	 */
+	unsigned long cpu_power;
 };
 
 struct sched_domain {
diff -puN kernel/sched.c~sched-group-power kernel/sched.c
--- 25/kernel/sched.c~sched-group-power	Thu Mar 11 14:33:30 2004
+++ 25-akpm/kernel/sched.c	Thu Mar 11 14:33:30 2004
@@ -191,9 +191,6 @@ struct prio_array {
 	struct list_head queue[MAX_PRIO];
 };
 
-#define SCHED_LOAD_SHIFT 7	/* increase resolution of load calculations */
-#define SCHED_LOAD_SCALE (1 << SCHED_LOAD_SHIFT)
-
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -1350,16 +1347,14 @@ find_busiest_group(struct sched_domain *
 		   unsigned long *imbalance, enum idle_type idle)
 {
 	unsigned long max_load, avg_load, total_load, this_load;
-	int modify, total_nr_cpus, busiest_nr_cpus, this_nr_cpus;
-	enum idle_type package_idle = IDLE;
-	struct sched_group *busiest = NULL, *group = domain->groups;
+	unsigned int total_pwr;
+	int modify;
+	struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups;
 
 	max_load = 0;
 	this_load = 0;
 	total_load = 0;
-	total_nr_cpus = 0;
-	busiest_nr_cpus = 0;
-	this_nr_cpus = 0;
+	total_pwr = 0;
 
 	if (group == NULL)
 		goto out_balanced;
@@ -1390,8 +1385,6 @@
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
 			load = get_high_cpu_load(i, modify);
-			if (!idle_cpu(i))
-				package_idle = NOT_IDLE;
 		} else
 			load = get_low_cpu_load(i, modify);
@@ -1403,48 +1396,34 @@
 			goto nextgroup;
 
 		total_load += avg_load;
+		total_pwr += group->cpu_power;
 
-		/*
-		 * Load is cumulative over SD_FLAG_IDLE domains, but
-		 * spread over !SD_FLAG_IDLE domains. For example, 2
-		 * processes running on an SMT CPU puts a load of 2 on
-		 * that CPU, however 2 processes running on 2 CPUs puts
-		 * a load of 1 on that domain.
-		 *
-		 * This should be configurable so as SMT siblings become
-		 * more powerful, they can "spread" more load - for example,
-		 * the above case might only count as a load of 1.7.
-		 */
-		if (!(domain->flags & SD_FLAG_IDLE)) {
-			avg_load /= nr_cpus;
-			total_nr_cpus += nr_cpus;
-		} else
-			total_nr_cpus++;
-
-		if (avg_load > max_load)
-			max_load = avg_load;
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power;
 
 		if (local_group) {
 			this_load = avg_load;
-			this_nr_cpus = nr_cpus;
-		} else if (avg_load >= max_load) {
+			this = group;
+			goto nextgroup;
+		}
+		if (avg_load > max_load) {
+			max_load = avg_load;
 			busiest = group;
-			busiest_nr_cpus = nr_cpus;
 		}
 nextgroup:
 		group = group->next;
 	} while (group != domain->groups);
 
-	if (!busiest)
+	if (!busiest || this_load >= max_load)
 		goto out_balanced;
 
-	avg_load = total_load / total_nr_cpus;
-
-	if (this_load >= avg_load)
-		goto out_balanced;
+	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 
-	if (idle == NOT_IDLE && 100*max_load <= domain->imbalance_pct*this_load)
+	if (idle == NOT_IDLE) {
+		if (this_load >= avg_load ||
+			100*max_load <= domain->imbalance_pct*this_load)
 		goto out_balanced;
+	}
 
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
@@ -1458,15 +1437,44 @@ nextgroup:
 	 * appear as very large values with unsigned longs.
 	 */
 	*imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2;
-	/* Get rid of the scaling factor, rounding *up* as we divide */
-	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2 + 1)
-					>> SCHED_LOAD_SHIFT;
-	if (*imbalance == 0)
-		goto out_balanced;
+	if (*imbalance <= SCHED_LOAD_SCALE/2) {
+		unsigned long pwr_now = 0, pwr_move = 0;
+		unsigned long tmp;
+
+		/*
+		 * OK, we don't have enough imbalance to justify moving tasks,
+		 * however we may be able to increase total CPU power used by
+		 * moving them.
+		 */
+
+		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
+		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now >>= SCHED_LOAD_SHIFT;
+
+		/* Amount of load we'd subtract */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		if (max_load > tmp)
+			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+							max_load - tmp);
+
+		/* Amount of load we'd add */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp);
+		pwr_move >>= SCHED_LOAD_SHIFT;
+
+		/* Move if we gain another 8th of a CPU worth of throughput */
+		if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+			goto out_balanced;
+		*imbalance = 1;
+		return busiest;
+	}
 
 	/* How many tasks to actually move to equalise the imbalance */
-	*imbalance *= min(busiest_nr_cpus, this_nr_cpus);
+	*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
+				>> SCHED_LOAD_SHIFT;
+	/* Get rid of the scaling factor, rounding *up* as we divide */
+	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT;
 
 	return busiest;
@@ -1547,26 +1555,19 @@ out:
 	if (!balanced && nr_moved == 0)
 		failed = 1;
 
-	if (domain->flags & SD_FLAG_IDLE && failed && busiest &&
+	if (failed && busiest &&
 	    domain->nr_balance_failed > domain->cache_nice_tries) {
-		int i;
-		for_each_cpu_mask(i, group->cpumask) {
-			int wake = 0;
+		int wake = 0;
 
-			if (!cpu_online(i))
-				continue;
-
-			busiest = cpu_rq(i);
-			spin_lock(&busiest->lock);
-			if (!busiest->active_balance) {
-				busiest->active_balance = 1;
-				busiest->push_cpu = this_cpu;
-				wake = 1;
-			}
-			spin_unlock(&busiest->lock);
-			if (wake)
-				wake_up_process(busiest->migration_thread);
-		}
+		spin_lock(&busiest->lock);
+		if (!busiest->active_balance) {
+			busiest->active_balance = 1;
+			busiest->push_cpu = this_cpu;
+			wake = 1;
+		}
+		spin_unlock(&busiest->lock);
+		if (wake)
+			wake_up_process(busiest->migration_thread);
 	}
 
 	if (failed)
@@ -3250,12 +3251,14 @@ static void __init arch_init_sched_domai
 			continue;
 
 		node->cpumask = nodemask;
+		node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
 
 		for_each_cpu_mask(j, node->cpumask) {
 			struct sched_group *cpu = &sched_group_cpus[j];
 
 			cpus_clear(cpu->cpumask);
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -3302,6 +3305,7 @@ static void __init arch_init_sched_domai
 
 		cpus_clear(cpu->cpumask);
 		cpu_set(i, cpu->cpumask);
+		cpu->cpu_power = SCHED_LOAD_SCALE;
 
 		if (!first_cpu)
 			first_cpu = cpu;
_
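
For reference, here is a small standalone userspace sketch (not kernel code, and not part of the patch) of the fixed-point arithmetic used above: how a sibling group's cpu_power is derived from its sibling count, and how find_busiest_group() normalises a group's raw load by that power before comparing groups.  The helper names and sample numbers below are made up for illustration.

/*
 * Standalone illustration of the cpu_power arithmetic in this patch.
 * Helper names and sample values are hypothetical.
 */
#include <stdio.h>

#define SCHED_LOAD_SHIFT 7			/* same resolution as the patch */
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/* cpu_power of an SMT group: each extra sibling adds 10% of a full CPU */
static unsigned long smt_group_power(unsigned int nr_siblings)
{
	return SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (nr_siblings - 1) / 10;
}

/* Normalise a group's raw load by its cpu_power, as find_busiest_group() does */
static unsigned long scaled_load(unsigned long raw_load, unsigned long cpu_power)
{
	return (raw_load << SCHED_LOAD_SHIFT) / cpu_power;
}

int main(void)
{
	/* A two-sibling package gets power 1.1 * SCHED_LOAD_SCALE... */
	unsigned long smt_pwr = smt_group_power(2);	/* 140 with SHIFT == 7 */
	unsigned long cpu_pwr = SCHED_LOAD_SCALE;	/* plain CPU: 128 */

	/*
	 * ...so two runnable tasks on the siblings look much busier than
	 * one task on a full CPU, and the balancer will pull one away.
	 */
	unsigned long smt_load = scaled_load(2 * SCHED_LOAD_SCALE, smt_pwr);
	unsigned long cpu_load = scaled_load(1 * SCHED_LOAD_SCALE, cpu_pwr);

	printf("SMT group load %lu vs single CPU load %lu (scale %lu)\n",
	       smt_load, cpu_load, SCHED_LOAD_SCALE);
	return 0;
}

With SCHED_LOAD_SHIFT == 7 this prints a scaled load of 234 for the two-task sibling group against 128 for the single CPU carrying one task, which is why the balancer prefers to move a task off the siblings instead of treating the pair as two full CPUs.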