From: Jesse Barnes This patch limits the cpu span of each node's scheduler domain to prevent balancing across too many cpus. The cpus included in a node's domain are determined by the SD_NODES_PER_DOMAIN define and the arch specific sched_domain_node_span routine if ARCH_HAS_SCHED_DOMAIN is defined. If ARCH_HAS_SCHED_DOMAIN is not defined, behavior is unchanged--all possible cpus will be included in each node's scheduling domain. Currently, only ia64 provides an arch specific sched_domain_node_span routine. Signed-off-by: Jesse Barnes Signed-off-by: Andrew Morton --- 25-akpm/arch/ia64/kernel/smpboot.c | 67 +++++++++++++++++++++++++++++++++++ 25-akpm/include/asm-ia64/processor.h | 5 ++ 25-akpm/kernel/sched.c | 17 +++++++- 3 files changed, 86 insertions(+), 3 deletions(-) diff -puN arch/ia64/kernel/smpboot.c~sched-domain-node-span-4 arch/ia64/kernel/smpboot.c --- 25/arch/ia64/kernel/smpboot.c~sched-domain-node-span-4 Tue Jul 27 14:21:37 2004 +++ 25-akpm/arch/ia64/kernel/smpboot.c Tue Jul 27 14:21:37 2004 @@ -716,3 +716,70 @@ init_smp_config(void) printk(KERN_ERR "SMP: Can't set SAL AP Boot Rendezvous: %s\n", ia64_sal_strerror(sal_ret)); } + +#ifdef CONFIG_NUMA + +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int __init find_next_best_node(int node, unsigned long *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < numnodes; i++) { + /* Start at @node */ + n = (node + i) % numnodes; + + /* Skip already used nodes */ + if (test_bit(n, used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, i); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + set_bit(best_node, used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @size: number of nodes to include in this span + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +cpumask_t __init sched_domain_node_span(int node, int size) +{ + int i; + cpumask_t span; + DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + + cpus_clear(span); + bitmap_zero(used_nodes, MAX_NUMNODES); + + for (i = 0; i < size; i++) { + int next_node = find_next_best_node(node, used_nodes); + cpus_or(span, span, node_to_cpumask(next_node)); + } + + return span; +} +#endif /* CONFIG_NUMA */ + diff -puN include/asm-ia64/processor.h~sched-domain-node-span-4 include/asm-ia64/processor.h --- 25/include/asm-ia64/processor.h~sched-domain-node-span-4 Tue Jul 27 14:21:37 2004 +++ 25-akpm/include/asm-ia64/processor.h Tue Jul 27 14:21:37 2004 @@ -334,6 +334,11 @@ struct task_struct; /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) +#ifdef CONFIG_NUMA +/* smpboot.c defines a numa specific scheduler domain routine */ +#define ARCH_HAS_SCHED_DOMAIN +#endif + /* * This is the mechanism for creating a new kernel thread. * diff -puN kernel/sched.c~sched-domain-node-span-4 kernel/sched.c --- 25/kernel/sched.c~sched-domain-node-span-4 Tue Jul 27 14:21:37 2004 +++ 25-akpm/kernel/sched.c Tue Jul 27 14:34:44 2004 @@ -3667,8 +3667,13 @@ void cpu_attach_domain(struct sched_doma } #ifdef ARCH_HAS_SCHED_DOMAIN -extern void __init arch_init_sched_domains(void); +extern cpumask_t __init sched_domain_node_span(int node, int size); #else +static cpumask_t __init sched_domain_node_span(int node, int size) +{ + return cpu_possible_map; +} +#endif /* ARCH_HAS_SCHED_DOMAIN */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); @@ -3691,6 +3696,10 @@ __init static int cpu_to_phys_group(int } #ifdef CONFIG_NUMA + +/* Number of nearby nodes in a node's scheduling domain */ +#define SD_NODES_PER_DOMAIN 4 + static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group sched_group_nodes[MAX_NUMNODES]; __init static int cpu_to_node_group(int cpu) @@ -3755,10 +3764,13 @@ __init static void arch_init_sched_domai cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); #ifdef CONFIG_NUMA + if (i != first_cpu(sd->groups->cpumask)) + continue; sd = &per_cpu(node_domains, i); group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = cpu_possible_map; + /* FIXME: should be multilevel, in arch code */ + sd->span = sched_domain_node_span(i, SD_NODES_PER_DOMAIN); sd->groups = &sched_group_nodes[group]; #endif @@ -3845,7 +3857,6 @@ __init static void arch_init_sched_domai cpu_attach_domain(sd, i); } } -#endif /* ARCH_HAS_SCHED_DOMAIN */ #define SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG _