From: Matthew Dobson Here's yet another version of a patch to implement per-arch SD_*_INITs. This follows the same basic idea of my last patch, but 1) defines an arch-specific SD_NODE_INIT for the 4 NUMA arches (i386, x86_64, IA64 & PPC64), 2) defines *default* SD_CPU_INIT & SD_SIBLING_INIT for *all* arches, with the possibility of them being overridden by simply defining an arch-specific version in include/asm/topology.h. The motivation behind the third version of this patch is that Martin feels that there should be no "default" NUMA initializer because NUMA characteristics are *very* arch/platform specific, and hence a "default" NUMA initializer can only lead to confusion. I agree with most of that, but don't quite see as much harm in having a default as he does. Nevertheless, to keep him quiet, I've run up this version of the patch. Martin, please run this through your magic test suite and make sure I didn't break anything trivial. Signed-off-by: Andrew Morton --- 25-akpm/arch/ia64/kernel/domain.c | 1 25-akpm/include/asm-i386/topology.h | 20 +++++++++ 25-akpm/include/asm-ia64/processor.h | 21 --------- 25-akpm/include/asm-ia64/topology.h | 20 +++++++++ 25-akpm/include/asm-ppc64/topology.h | 20 +++++++++ 25-akpm/include/asm-x86_64/topology.h | 22 ++++++++++ 25-akpm/include/linux/sched.h | 74 ---------------------------------- 25-akpm/include/linux/topology.h | 72 +++++++++++++++++++++++++++++++++ 8 files changed, 156 insertions(+), 94 deletions(-) diff -puN arch/ia64/kernel/domain.c~sched_domains-make-sd_node_init-per-arch-2 arch/ia64/kernel/domain.c --- 25/arch/ia64/kernel/domain.c~sched_domains-make-sd_node_init-per-arch-2 2004-10-02 18:33:15.005494352 -0700 +++ 25-akpm/arch/ia64/kernel/domain.c 2004-10-02 18:33:15.020492072 -0700 @@ -11,7 +11,6 @@ #include #include #include -#include #define SD_NODES_PER_DOMAIN 6 diff -puN include/asm-i386/topology.h~sched_domains-make-sd_node_init-per-arch-2 include/asm-i386/topology.h --- 
25/include/asm-i386/topology.h~sched_domains-make-sd_node_init-per-arch-2 2004-10-02 18:33:15.006494200 -0700 +++ 25-akpm/include/asm-i386/topology.h 2004-10-02 18:33:15.020492072 -0700 @@ -72,6 +72,26 @@ static inline cpumask_t pcibus_to_cpumas /* Cross-node load balancing interval. */ #define NODE_BALANCE_RATE 100 +/* sched_domains SD_NODE_INIT for NUMAQ machines */ +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ + .max_interval = 32, \ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + #else /* !CONFIG_NUMA */ /* * Other i386 platforms should define their own version of the diff -puN include/asm-ia64/processor.h~sched_domains-make-sd_node_init-per-arch-2 include/asm-ia64/processor.h --- 25/include/asm-ia64/processor.h~sched_domains-make-sd_node_init-per-arch-2 2004-10-02 18:33:15.008493896 -0700 +++ 25-akpm/include/asm-ia64/processor.h 2004-10-02 18:33:15.021491920 -0700 @@ -337,27 +337,6 @@ struct task_struct; /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) -#ifdef CONFIG_NUMA -#define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 80, \ - .max_interval = 320, \ - .busy_factor = 320, \ - .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ - .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ - .balance_interval = 10, \ - .nr_balance_failed = 0, \ -} -#endif - /* * This is the mechanism for creating a new kernel thread. 
* diff -puN include/asm-ia64/topology.h~sched_domains-make-sd_node_init-per-arch-2 include/asm-ia64/topology.h --- 25/include/asm-ia64/topology.h~sched_domains-make-sd_node_init-per-arch-2 2004-10-02 18:33:15.009493744 -0700 +++ 25-akpm/include/asm-ia64/topology.h 2004-10-02 18:33:15.021491920 -0700 @@ -45,6 +45,26 @@ void build_cpu_to_node_map(void); +/* sched_domains SD_NODE_INIT for IA64 NUMA machines */ +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 80, \ + .max_interval = 320, \ + .busy_factor = 320, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 10, \ + .nr_balance_failed = 0, \ +} + #endif /* CONFIG_NUMA */ #include diff -puN include/asm-ppc64/topology.h~sched_domains-make-sd_node_init-per-arch-2 include/asm-ppc64/topology.h --- 25/include/asm-ppc64/topology.h~sched_domains-make-sd_node_init-per-arch-2 2004-10-02 18:33:15.011493440 -0700 +++ 25-akpm/include/asm-ppc64/topology.h 2004-10-02 18:33:15.022491768 -0700 @@ -40,6 +40,26 @@ static inline int node_to_first_cpu(int /* Cross-node load balancing interval. 
*/ #define NODE_BALANCE_RATE 10 +/* sched_domains SD_NODE_INIT for PPC64 machines */ +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ + .max_interval = 32, \ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + #else /* !CONFIG_NUMA */ #include diff -puN include/asm-x86_64/topology.h~sched_domains-make-sd_node_init-per-arch-2 include/asm-x86_64/topology.h --- 25/include/asm-x86_64/topology.h~sched_domains-make-sd_node_init-per-arch-2 2004-10-02 18:33:15.012493288 -0700 +++ 25-akpm/include/asm-x86_64/topology.h 2004-10-02 18:33:15.022491768 -0700 @@ -34,6 +34,28 @@ static inline cpumask_t __pcibus_to_cpum #define NODE_BALANCE_RATE 30 /* CHECKME */ +#ifdef CONFIG_NUMA +/* sched_domains SD_NODE_INIT for x86_64 machines */ +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ + .max_interval = 32, \ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif + #endif #include diff -puN include/linux/sched.h~sched_domains-make-sd_node_init-per-arch-2 include/linux/sched.h --- 25/include/linux/sched.h~sched_domains-make-sd_node_init-per-arch-2 2004-10-02 18:33:15.014492984 -0700 +++ 25-akpm/include/linux/sched.h 2004-10-02 18:33:15.024491464 -0700 @@ -29,6 +29,7 @@ #include #include #include +#include struct exec_domain; @@ -482,78 +483,7 @@ extern cpumask_t cpu_isolated_map; extern void init_sched_build_groups(struct 
sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)); extern void cpu_attach_domain(struct sched_domain *sd, int cpu); -#endif - -#ifndef ARCH_HAS_SCHED_TUNE -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_WAKE_IDLE -/* Common values for SMT siblings */ -#define SD_SIBLING_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 8, \ - .imbalance_pct = 110, \ - .cache_hot_time = 0, \ - .cache_nice_tries = 0, \ - .per_cpu_gain = 25, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ - | SD_SHARE_CPUPOWER, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} -#endif - -/* Common values for CPUs */ -#define SD_CPU_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_hot_time = (5*1000/2), \ - .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - -#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT) -#define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_hot_time = (10*1000), \ - .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} -#endif -#endif /* ARCH_HAS_SCHED_TUNE */ +#endif /* ARCH_HAS_SCHED_DOMAIN */ #endif /* CONFIG_SMP */ diff -puN 
include/linux/topology.h~sched_domains-make-sd_node_init-per-arch-2 include/linux/topology.h --- 25/include/linux/topology.h~sched_domains-make-sd_node_init-per-arch-2 2004-10-02 18:33:15.015492832 -0700 +++ 25-akpm/include/linux/topology.h 2004-10-02 18:33:15.025491312 -0700 @@ -61,4 +61,76 @@ static inline int __next_node_with_cpus( #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif +/* + * Below are the 3 major initializers used in building sched_domains: + * SD_SIBLING_INIT, for SMT domains + * SD_CPU_INIT, for SMP domains + * SD_NODE_INIT, for NUMA domains (no default; NUMA arches must define it) + * + * Any architecture that cares to do any tuning to these values should do so + * by defining their own arch-specific initializer in include/asm/topology.h. + * A definition there will automagically override these default initializers + * and allow arch-specific performance tuning of sched_domains. + */ +#ifdef CONFIG_SCHED_SMT +/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, + * so can't we drop this in favor of CONFIG_SCHED_SMT? 
+ */ +#define ARCH_HAS_SCHED_WAKE_IDLE +/* Common values for SMT siblings */ +#ifndef SD_SIBLING_INIT +#define SD_SIBLING_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 8, \ + .imbalance_pct = 110, \ + .cache_hot_time = 0, \ + .cache_nice_tries = 0, \ + .per_cpu_gain = 25, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ + | SD_SHARE_CPUPOWER, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif +#endif /* CONFIG_SCHED_SMT */ + +/* Common values for CPUs */ +#ifndef SD_CPU_INIT +#define SD_CPU_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_hot_time = (5*1000/2), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif + +#ifdef CONFIG_NUMA +#ifndef SD_NODE_INIT +#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! +#endif +#endif /* CONFIG_NUMA */ + #endif /* _LINUX_TOPOLOGY_H */ _