From: Nick Piggin

The following patch builds a scheduling description for the i386
architecture, using cpu_sibling_map to set up SMT if CONFIG_SCHED_SMT is
set.

It could be made fancier and collapse degenerate domains at runtime
(i.e. 1 sibling per CPU, or 1 NUMA node in the machine); an illustrative
sketch of such a check appears after the patch.

DESC
sched: Fix CONFIG_SCHED_SMT oops on UP
EDESC
From: Zwane Mwaikambo

This fixes an oops due to cpu_sibling_map being uninitialised when a
system with no MP table (most UP boxen) boots a CONFIG_SCHED_SMT kernel.
What also happens is that the cpu_group lists end up not being terminated
properly, but this oops kills it first.  Patch tested on UP without an MP
table, 2x P2, and a UP Xeon with no siblings.

DESC
sched: fix SMT + NUMA bug
EDESC
From: Nick Piggin

Here is a small fix that prevents i386 SMT + NUMA from crashing.

DESC
Change arch_init_sched_domains to use cpu_online_map
EDESC
From: "Martin J. Bligh", Nick Piggin

arch_init_sched_domains is using cpu_callout_map (via cpu_possible)
instead of cpu_online_map.  That map is really only intended for CPU
bootstrap, and won't work properly if we called out to a CPU but it
failed to respond.  The normal way is to use cpu_online_map for this,
and doing so even cleans up the existing code a bit (it's just a case of
s/all_cpus/cpu_online_map/ and removing the loop that builds all_cpus).
I tested this on a NUMA-Q, and it works fine.

DESC
Fix build with NR_CPUS > BITS_PER_LONG
EDESC
From: Anton Blanchard

---

 25-akpm/arch/i386/Kconfig            |   10 +
 25-akpm/arch/i386/kernel/smpboot.c   |  220 ++++++++++++++++++++++++++++++++++-
 25-akpm/include/asm-i386/processor.h |    5 
 25-akpm/kernel/sched.c               |   35 +----
 4 files changed, 244 insertions(+), 26 deletions(-)

diff -puN arch/i386/Kconfig~sched-domains-i386-ht arch/i386/Kconfig
--- 25/arch/i386/Kconfig~sched-domains-i386-ht	Fri Mar 12 11:23:10 2004
+++ 25-akpm/arch/i386/Kconfig	Fri Mar 12 11:23:10 2004
@@ -475,6 +475,16 @@ config NR_CPUS
 	  This is purely to save memory - each supported CPU adds
 	  approximately eight kilobytes to the kernel image.
 
+config SCHED_SMT
+	bool "SMT (Hyperthreading) scheduler support"
+	depends on SMP
+	default n
+	help
+	  SMT scheduler support improves the CPU scheduler's decision making
+	  when dealing with Intel Pentium 4 chips with HyperThreading at a
+	  cost of slightly increased overhead in some places. If unsure say
+	  N here.
+
 config PREEMPT
 	bool "Preemptible Kernel"
 	help
diff -puN arch/i386/kernel/smpboot.c~sched-domains-i386-ht arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~sched-domains-i386-ht	Fri Mar 12 11:23:10 2004
+++ 25-akpm/arch/i386/kernel/smpboot.c	Fri Mar 12 11:23:10 2004
@@ -39,6 +39,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -953,6 +954,8 @@ static void __init smp_boot_cpus(unsigne
 
 	current_thread_info()->cpu = 0;
 	smp_tune_scheduling();
+	cpus_clear(cpu_sibling_map[0]);
+	cpu_set(0, cpu_sibling_map[0]);
 
 	/*
 	 * If we couldn't find an SMP configuration at boot time,
@@ -1083,7 +1086,7 @@ static void __init smp_boot_cpus(unsigne
 	 * efficiently.
 	 */
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		cpu_sibling_map[cpu] = CPU_MASK_NONE;
+		cpus_clear(cpu_sibling_map[cpu]);
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
 		int siblings = 0;
@@ -1120,6 +1123,221 @@ static void __init smp_boot_cpus(unsigne
 		synchronize_tsc_bp();
 }
+#ifdef CONFIG_SCHED_SMT
+#ifdef CONFIG_NUMA
+static struct sched_group sched_group_cpus[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+__init void arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+	/* Set up domains */
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+		struct sched_domain *node_domain = &per_cpu(node_domains, i);
+		int node = cpu_to_node(i);
+		cpumask_t nodemask = node_to_cpumask(node);
+
+		/* TODO: change me to SD_SIBLING_INIT */
+		*cpu_domain = SD_CPU_INIT;
+		cpu_domain->span = cpu_sibling_map[i];
+		cpu_domain->flags |= SD_FLAG_WAKE;
+		cpu_domain->cache_hot_time = 0;
+		cpu_domain->cache_nice_tries = 0;
+		cpu_domain->max_interval = 2;
+		cpu_domain->busy_factor = 8;
+		cpu_domain->imbalance_pct = 110;
+
+		*phys_domain = SD_CPU_INIT;
+		phys_domain->span = nodemask;
+		phys_domain->flags |= SD_FLAG_IDLE;
+
+		*node_domain = SD_NODE_INIT;
+		node_domain->span = cpu_online_map;
+	}
+
+	/* Set up CPU (sibling) groups */
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		int j;
+		first_cpu = last_cpu = NULL;
+
+		if (i != first_cpu(cpu_domain->span))
+			continue;
+
+		for_each_cpu_mask(j, cpu_domain->span) {
+			struct sched_group *cpu = &sched_group_cpus[j];
+
+			cpu->cpumask = CPU_MASK_NONE;
+			cpu_set(j, cpu->cpumask);
+
+			if (!first_cpu)
+				first_cpu = cpu;
+			if (last_cpu)
+				last_cpu->next = cpu;
+			last_cpu = cpu;
+		}
+		last_cpu->next = first_cpu;
+	}
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		int j;
+		cpumask_t nodemask;
+		cpus_and(nodemask, node_to_cpumask(i), cpu_online_map);
+
+		if (cpus_empty(nodemask))
+			continue;
+
+		first_cpu = last_cpu = NULL;
+		/* Set up physical groups */
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *cpu_domain = cpu_sched_domain(j);
+			struct sched_group *cpu = &sched_group_phys[j];
+
+			if (j != first_cpu(cpu_domain->span))
+				continue;
+
+			cpu->cpumask = cpu_domain->span;
+
+			if (!first_cpu)
+				first_cpu = cpu;
+			if (last_cpu)
+				last_cpu->next = cpu;
+			last_cpu = cpu;
+		}
+		last_cpu->next = first_cpu;
+	}
+
+	/* Set up nodes */
+	first_cpu = last_cpu = NULL;
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *cpu = &sched_group_nodes[i];
+		cpumask_t nodemask;
+		cpus_and(nodemask, node_to_cpumask(i), cpu_online_map);
+
+		if (cpus_empty(nodemask))
+			continue;
+
+		cpu->cpumask = nodemask;
+
+		if (!first_cpu)
+			first_cpu = cpu;
+		if (last_cpu)
+			last_cpu->next = cpu;
+		last_cpu = cpu;
+	}
+	last_cpu->next = first_cpu;
+
+
+	mb();
+	for_each_cpu_mask(i, cpu_online_map) {
+		int node = cpu_to_node(i);
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+		struct sched_domain *node_domain = &per_cpu(node_domains, i);
+		struct sched_group *cpu_group = &sched_group_cpus[i];
+		struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
+		struct sched_group *node_group = &sched_group_nodes[node];
+
+		cpu_domain->parent = phys_domain;
+		phys_domain->parent = node_domain;
+
+		node_domain->groups = node_group;
+		phys_domain->groups = phys_group;
+		cpu_domain->groups = cpu_group;
+	}
+}
+#else /* CONFIG_NUMA */
+static struct sched_group sched_group_cpus[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+__init void arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+	/* Set up domains */
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+
+		/* TODO: change me to SD_SIBLING_INIT */
+		*cpu_domain = SD_CPU_INIT;
+		cpu_domain->span = cpu_sibling_map[i];
+		cpu_domain->flags |= SD_FLAG_WAKE;
+		cpu_domain->cache_hot_time = 100000;
+		cpu_domain->cache_nice_tries = 0;
+		cpu_domain->max_interval = (HZ/500 ?: 1);
+		cpu_domain->busy_factor = 8;
+		cpu_domain->imbalance_pct = 110;
+
+		*phys_domain = SD_CPU_INIT;
+		phys_domain->span = cpu_online_map;
+		phys_domain->flags |= SD_FLAG_IDLE;
+	}
+
+	/* Set up CPU (sibling) groups */
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		int j;
+		first_cpu = last_cpu = NULL;
+
+		if (i != first_cpu(cpu_domain->span))
+			continue;
+
+		for_each_cpu_mask(j, cpu_domain->span) {
+			struct sched_group *cpu = &sched_group_cpus[j];
+
+			cpus_clear(cpu->cpumask);
+			cpu_set(j, cpu->cpumask);
+
+			if (!first_cpu)
+				first_cpu = cpu;
+			if (last_cpu)
+				last_cpu->next = cpu;
+			last_cpu = cpu;
+		}
+		last_cpu->next = first_cpu;
+	}
+
+	first_cpu = last_cpu = NULL;
+	/* Set up physical groups */
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_group *cpu = &sched_group_phys[i];
+
+		if (i != first_cpu(cpu_domain->span))
+			continue;
+
+		cpu->cpumask = cpu_domain->span;
+
+		if (!first_cpu)
+			first_cpu = cpu;
+		if (last_cpu)
+			last_cpu->next = cpu;
+		last_cpu = cpu;
+	}
+	last_cpu->next = first_cpu;
+
+	mb();
+	for_each_cpu_mask(i, cpu_online_map) {
+		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+		struct sched_group *cpu_group = &sched_group_cpus[i];
+		struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
+		cpu_domain->parent = phys_domain;
+		phys_domain->groups = phys_group;
+		cpu_domain->groups = cpu_group;
+	}
+}
+#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SCHED_SMT */
+
 /* These are wrappers to interface to the new boot process. Someone
    who understands all this stuff should rewrite it properly.
    --RR 15/Jul/02 */
 void __init smp_prepare_cpus(unsigned int max_cpus)
diff -puN include/asm-i386/processor.h~sched-domains-i386-ht include/asm-i386/processor.h
--- 25/include/asm-i386/processor.h~sched-domains-i386-ht	Fri Mar 12 11:23:10 2004
+++ 25-akpm/include/asm-i386/processor.h	Fri Mar 12 11:23:10 2004
@@ -646,4 +646,9 @@ extern inline void prefetchw(const void
 
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 
+#ifdef CONFIG_SCHED_SMT
+#define ARCH_HAS_SCHED_DOMAIN
+#define ARCH_HAS_SCHED_WAKE_BALANCE
+#endif
+
 #endif /* __ASM_I386_PROCESSOR_H */
diff -puN kernel/sched.c~sched-domains-i386-ht kernel/sched.c
--- 25/kernel/sched.c~sched-domains-i386-ht	Fri Mar 12 11:23:10 2004
+++ 25-akpm/kernel/sched.c	Fri Mar 12 11:23:10 2004
@@ -3203,28 +3203,20 @@ DEFINE_PER_CPU(struct sched_domain, node
 static void __init arch_init_sched_domains(void)
 {
 	int i;
-	cpumask_t all_cpus = CPU_MASK_NONE;
 	struct sched_group *first_node = NULL, *last_node = NULL;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-
-		cpu_set(i, all_cpus);
-	}
-
 	/* Set up domains */
-	for_each_cpu_mask(i, all_cpus) {
+	for_each_cpu_mask(i, cpu_online_map) {
 		int node = cpu_to_node(i);
 		cpumask_t nodemask = node_to_cpumask(node);
 		struct sched_domain *node_domain = &per_cpu(node_domains, i);
 		struct sched_domain *cpu_domain = cpu_sched_domain(i);
 
 		*node_domain = SD_NODE_INIT;
-		node_domain->span = all_cpus;
+		node_domain->span = cpu_online_map;
 
 		*cpu_domain = SD_CPU_INIT;
-		cpus_and(cpu_domain->span, nodemask, all_cpus);
+		cpus_and(cpu_domain->span, nodemask, cpu_online_map);
 		cpu_domain->parent = node_domain;
 	}
 
@@ -3234,8 +3226,9 @@ static void __init arch_init_sched_domai
 		int j;
 		cpumask_t nodemask;
 		struct sched_group *node = &sched_group_nodes[i];
+		cpumask_t tmp = node_to_cpumask(i);
 
-		cpus_and(nodemask, node_to_cpumask(i), all_cpus);
+		cpus_and(nodemask, tmp, cpu_online_map);
 
 		if (cpus_empty(nodemask))
 			continue;
@@ -3265,7 +3258,7 @@ static void __init arch_init_sched_domai
 	last_node->next = first_node;
 
 	mb();
-	for_each_cpu_mask(i, all_cpus) {
+	for_each_cpu_mask(i, cpu_online_map) {
 		struct sched_domain *node_domain = &per_cpu(node_domains, i);
 		struct sched_domain *cpu_domain = cpu_sched_domain(i);
 		node_domain->groups = &sched_group_nodes[cpu_to_node(i)];
@@ -3277,26 +3270,18 @@ static void __init arch_init_sched_domai
 static void __init arch_init_sched_domains(void)
 {
 	int i;
-	cpumask_t all_cpus = CPU_MASK_NONE;
 	struct sched_group *first_cpu = NULL, *last_cpu = NULL;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-
-		cpu_set(i, all_cpus);
-	}
-
 	/* Set up domains */
-	for_each_cpu_mask(i, all_cpus) {
+	for_each_cpu_mask(i, cpu_online_map) {
 		struct sched_domain *cpu_domain = cpu_sched_domain(i);
 
 		*cpu_domain = SD_CPU_INIT;
-		cpu_domain->span = all_cpus;
+		cpu_domain->span = cpu_online_map;
 	}
 
 	/* Set up CPU groups */
-	for_each_cpu_mask(i, all_cpus) {
+	for_each_cpu_mask(i, cpu_online_map) {
 		struct sched_group *cpu = &sched_group_cpus[i];
 
 		cpus_clear(cpu->cpumask);
@@ -3311,7 +3296,7 @@ static void __init arch_init_sched_domai
 	last_cpu->next = first_cpu;
 
 	mb();
-	for_each_cpu_mask(i, all_cpus) {
+	for_each_cpu_mask(i, cpu_online_map) {
 		struct sched_domain *cpu_domain = cpu_sched_domain(i);
 		cpu_domain->groups = &sched_group_cpus[i];
 	}
_
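
For reference, the runtime collapse of degenerate domains mentioned at the
top could look roughly like the sketch below.  A level adds nothing when its
span contains a single CPU (e.g. cpu_sibling_map[i] holding only CPU i), or
when it has a single group and therefore covers exactly the same CPUs as the
level beneath it.  The helper names sd_degenerate() and
sd_collapse_degenerate() are assumptions made for this sketch, not code from
the patch; the sched_domain/sched_group fields and cpumask calls are the
2.6-era ones used above.

/*
 * Illustrative sketch only: skip scheduling-domain levels that add no
 * balancing information.  Assumes the sched_domain/sched_group layout
 * used in this patch (span, parent, groups, circular ->next list).
 */
static int sd_degenerate(struct sched_domain *sd)
{
	/* A domain spanning a single CPU has nothing to balance. */
	if (cpus_weight(sd->span) == 1)
		return 1;

	/* A single (circular) group mirrors the child domain exactly. */
	if (sd->groups && sd->groups->next == sd->groups)
		return 1;

	return 0;
}

/* Return the first non-degenerate level at or above sd. */
static struct sched_domain *sd_collapse_degenerate(struct sched_domain *sd)
{
	while (sd && sd_degenerate(sd))
		sd = sd->parent;
	return sd;
}

On a UP box, or a machine with one sibling per CPU or a single NUMA node,
this would simply drop the redundant level, which is the case the changelog
calls out.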