From: Jesse Barnes

This patch adds some more NUMA-specific logic to the creation of
scheduler domains.  Domains spanning all CPUs in a large system are too
large to schedule across efficiently, leading to livelocks and inordinate
amounts of time being spent in scheduler routines.  With this patch
applied, the node scheduling domains for NUMA platforms will only contain
a specified number of nearby CPUs, based on the value of
SD_NODES_PER_DOMAIN.  It also allows arches to override SD_NODE_INIT,
which sets the domain scheduling parameters for each node's domain.  This
is especially necessary for large systems.
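For illustration, here is a minimal, self-contained userspace sketch of
the greedy nearest-node selection this patch moves into kernel/sched.c
(see the hunks below).  The 4-node distance table and the
SD_NODES_PER_DOMAIN value of 2 are invented for the demo, and a plain
node bitmask stands in for cpumask_t and the node_to_cpumask() step:

/*
 * Userspace model of find_next_best_node()/sched_domain_node_span().
 * The distance table is made up; in the kernel these values come from
 * node_distance().  10 means "local", larger means farther away.
 */
#include <limits.h>
#include <stdio.h>

#define NUMNODES		4
#define SD_NODES_PER_DOMAIN	2	/* assumed span size for the demo */

static const int dist[NUMNODES][NUMNODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* Pick the closest node to @node that is not yet in @used. */
static int find_next_best_node(int node, unsigned int *used)
{
	int i, n, val, min_val = INT_MAX, best_node = 0;

	for (i = 0; i < NUMNODES; i++) {
		n = (node + i) % NUMNODES;	/* start the search at @node */
		if (*used & (1u << n))		/* skip already used nodes */
			continue;
		val = dist[node][n];
		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}
	*used |= 1u << best_node;
	return best_node;
}

/* Union the @size nearest nodes into a span; the node bitmask stands in
 * for the cpumask the kernel builds via node_to_cpumask(). */
static unsigned int sched_domain_node_span(int node, int size)
{
	unsigned int span = 0, used = 0;
	int i;

	for (i = 0; i < size; i++)
		span |= 1u << find_next_best_node(node, &used);
	return span;
}

int main(void)
{
	int node;

	for (node = 0; node < NUMNODES; node++)
		printf("node %d -> span 0x%x\n", node,
		       sched_domain_node_span(node, SD_NODES_PER_DOMAIN));
	return 0;
}

With this table, nodes 0 and 1 end up sharing one span (0x3) and nodes 2
and 3 another (0xc), rather than every node-level domain covering all
four nodes.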
Possible future directions:

o multilevel node hierarchy (e.g. node domains could contain 4 nodes
  worth of CPUs, supernode domains could contain 32 nodes worth, etc.,
  each with their own SD_NODE_INIT values)
o more tweaking of SD_NODE_INIT values for good load balancing vs.
  overhead tradeoffs

Signed-off-by: Jesse Barnes
Signed-off-by: Andrew Morton
---

 25-akpm/arch/ia64/kernel/smpboot.c   |   66 -------------------------------
 25-akpm/include/asm-ia64/processor.h |   19 ++++++++-
 25-akpm/kernel/sched.c               |   73 +++++++++++++++++++++++++++----
 3 files changed, 82 insertions(+), 76 deletions(-)

diff -puN arch/ia64/kernel/smpboot.c~sched-domain-node-span-4-update arch/ia64/kernel/smpboot.c
--- 25/arch/ia64/kernel/smpboot.c~sched-domain-node-span-4-update	2004-08-19 23:23:50.169662968 -0700
+++ 25-akpm/arch/ia64/kernel/smpboot.c	2004-08-19 23:23:50.177661752 -0700
@@ -717,69 +717,3 @@ init_smp_config(void)
 			ia64_sal_strerror(sal_ret));
 }
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain.  Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int __init find_next_best_node(int node, unsigned long *used_nodes)
-{
-	int i, n, val, min_val, best_node = 0;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < numnodes; i++) {
-		/* Start at @node */
-		n = (node + i) % numnodes;
-
-		/* Skip already used nodes */
-		if (test_bit(n, used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, i);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	set_bit(best_node, used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span.  It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-cpumask_t __init sched_domain_node_span(int node, int size)
-{
-	int i;
-	cpumask_t span;
-	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-
-	cpus_clear(span);
-	bitmap_zero(used_nodes, MAX_NUMNODES);
-
-	for (i = 0; i < size; i++) {
-		int next_node = find_next_best_node(node, used_nodes);
-		cpus_or(span, span, node_to_cpumask(next_node));
-	}
-
-	return span;
-}
-#endif /* CONFIG_NUMA */
-
diff -puN include/asm-ia64/processor.h~sched-domain-node-span-4-update include/asm-ia64/processor.h
--- 25/include/asm-ia64/processor.h~sched-domain-node-span-4-update	2004-08-19 23:23:50.170662816 -0700
+++ 25-akpm/include/asm-ia64/processor.h	2004-08-19 23:23:50.178661600 -0700
@@ -335,8 +335,23 @@ struct task_struct;
 #define prepare_to_copy(tsk)	do { } while (0)
 
 #ifdef CONFIG_NUMA
-/* smpboot.c defines a numa specific scheduler domain routine */
-#define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) {		\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 80,			\
+	.max_interval		= 320,			\
+	.busy_factor		= 320,			\
+	.imbalance_pct		= 125,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_BALANCE_EXEC	\
+				| SD_WAKE_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 10,			\
+	.nr_balance_failed	= 0,			\
+}
 #endif
 
 /*
diff -puN kernel/sched.c~sched-domain-node-span-4-update kernel/sched.c
--- 25/kernel/sched.c~sched-domain-node-span-4-update	2004-08-19 23:23:50.172662512 -0700
+++ 25-akpm/kernel/sched.c	2004-08-19 23:26:25.356071048 -0700
@@ -1784,10 +1784,8 @@ static void active_load_balance(runqueue
 	for_each_domain(busiest_cpu, sd)
 		if (cpu_isset(busiest->push_cpu, sd->span))
 			break;
-	if (!sd) {
-		WARN_ON(1);
+	if (!sd)
 		return;
-	}
 
 	group = sd->groups;
 	while (!cpu_isset(busiest_cpu, group->cpumask))
@@ -3667,15 +3665,74 @@ void cpu_attach_domain(struct sched_doma
 }
 
 #ifdef CONFIG_NUMA
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern cpumask_t __init sched_domain_node_span(int node, int size);
-#else
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int __init find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < numnodes; i++) {
+		/* Start at @node */
+		n = (node + i) % numnodes;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+cpumask_t __init sched_domain_node_span(int node, int size)
+{
+	int i;
+	cpumask_t span;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	for (i = 0; i < size; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		cpus_or(span, span, node_to_cpumask(next_node));
+	}
+
+	return span;
+}
+#else /* !CONFIG_NUMA */
 static cpumask_t __init sched_domain_node_span(int node, int size)
 {
 	return cpu_possible_map;
 }
-#endif /* ARCH_HAS_SCHED_DOMAIN */
-#endif
+#endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
_
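For context: the call site that consumes the new helper is not in the
hunks above.  Per the description, the node-level domain setup in
kernel/sched.c is expected to wire it up roughly as follows; this is a
hypothetical sketch, and setup_node_domain() and the exact wiring are
invented for illustration:

/* Hypothetical sketch, not part of this patch: a CPU's node-level domain
 * takes its parameters from SD_NODE_INIT (which ia64 now overrides above)
 * and a span capped at SD_NODES_PER_DOMAIN nearby nodes. */
static void __init setup_node_domain(int cpu, struct sched_domain *node_sd)
{
	*node_sd = SD_NODE_INIT;
	node_sd->span = sched_domain_node_span(cpu_to_node(cpu),
					       SD_NODES_PER_DOMAIN);
}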