From: Nick Piggin

Teach the generic domains builder about SMT, and consolidate all
architecture-specific domain code into it.  Also, the SD_*_INIT macros can now
be redefined by arch code without duplicating the entire setup code; this can
be done by defining ARCH_HAS_SCHED_TUNE.

The generic builder has been simplified with the addition of a helper
function, init_sched_build_groups(), which will probably prove to be useful
to arch-specific code as well and should be exported if that is the case.

Signed-off-by: Nick Piggin
Signed-off-by: Andrew Morton
---

 /dev/null                                |   93 ------------
 25-akpm/Documentation/sched-domains.txt  |   27 ++-
 25-akpm/arch/i386/kernel/smpboot.c       |  207 ----------------------------
 25-akpm/arch/ppc64/kernel/smp.c          |  227 +------------------------------
 25-akpm/arch/x86_64/kernel/Makefile      |    1 
 25-akpm/arch/x86_64/kernel/Makefile-HEAD |    1 
 25-akpm/include/asm-i386/processor.h     |    5 
 25-akpm/include/asm-ppc64/processor.h    |    5 
 25-akpm/include/asm-ppc64/smp.h          |    3 
 25-akpm/include/asm-x86_64/processor.h   |    5 
 25-akpm/include/linux/sched.h            |    5 
 25-akpm/kernel/sched.c                   |  219 ++++++++++++++++++-----------
 12 files changed, 179 insertions(+), 619 deletions(-)

diff -puN arch/i386/kernel/smpboot.c~sched-consolidate-sched-domains arch/i386/kernel/smpboot.c --- 25/arch/i386/kernel/smpboot.c~sched-consolidate-sched-domains 2004-08-01 23:00:32.999516248 -0700 +++ 25-akpm/arch/i386/kernel/smpboot.c 2004-08-01 23:00:33.024512448 -0700 @@ -1135,213 +1135,6 @@ static void __init smp_boot_cpus(unsigne synchronize_tsc_bp(); } -#ifdef CONFIG_SCHED_SMT -#ifdef CONFIG_NUMA -static struct sched_group sched_group_cpus[NR_CPUS]; -static struct sched_group sched_group_phys[NR_CPUS]; -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_domain, node_domains); -__init void arch_init_sched_domains(void) -{ - int i; - struct sched_group *first = NULL, *last = NULL; - - /* Set up domains */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - struct sched_domain *node_domain = &per_cpu(node_domains, i); - int node = cpu_to_node(i); - cpumask_t nodemask = node_to_cpumask(node); - - *cpu_domain = SD_SIBLING_INIT; - cpu_domain->span = cpu_sibling_map[i]; - cpu_domain->parent = phys_domain; - cpu_domain->groups = &sched_group_cpus[i]; - - *phys_domain = SD_CPU_INIT; - phys_domain->span = nodemask; - phys_domain->parent = node_domain; - phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; - - *node_domain = SD_NODE_INIT; - node_domain->span = cpu_possible_map; - node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; - } - - /* Set up CPU (sibling) groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - int j; - first = last = NULL; - - if (i != first_cpu(cpu_domain->span)) - continue; - - for_each_cpu_mask(j, cpu_domain->span) { - struct sched_group *cpu = &sched_group_cpus[j]; - - cpu->cpumask = CPU_MASK_NONE; - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - for (i = 0; i < MAX_NUMNODES; i++) { - int j; - cpumask_t nodemask; - struct sched_group *node = &sched_group_nodes[i]; - cpumask_t node_cpumask = node_to_cpumask(i); - - cpus_and(nodemask, node_cpumask, cpu_possible_map); - - if (cpus_empty(nodemask)) - continue; -
- first = last = NULL; - /* Set up physical groups */ - for_each_cpu_mask(j, nodemask) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j); - struct sched_group *cpu = &sched_group_phys[j]; - - if (j != first_cpu(cpu_domain->span)) - continue; - - cpu->cpumask = cpu_domain->span; - /* - * Make each extra sibling increase power by 10% of - * the basic CPU. This is very arbitrary. - */ - cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - node->cpu_power += cpu->cpu_power; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - /* Set up nodes */ - first = last = NULL; - for (i = 0; i < MAX_NUMNODES; i++) { - struct sched_group *cpu = &sched_group_nodes[i]; - cpumask_t nodemask; - cpumask_t node_cpumask = node_to_cpumask(i); - - cpus_and(nodemask, node_cpumask, cpu_possible_map); - - if (cpus_empty(nodemask)) - continue; - - cpu->cpumask = nodemask; - /* ->cpu_power already setup */ - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_domain, i); - } -} -#else /* !CONFIG_NUMA */ -static struct sched_group sched_group_cpus[NR_CPUS]; -static struct sched_group sched_group_phys[NR_CPUS]; -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -__init void arch_init_sched_domains(void) -{ - int i; - struct sched_group *first = NULL, *last = NULL; - - /* Set up domains */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - - *cpu_domain = SD_SIBLING_INIT; - cpu_domain->span = cpu_sibling_map[i]; - cpu_domain->parent = phys_domain; - cpu_domain->groups = &sched_group_cpus[i]; - - *phys_domain = SD_CPU_INIT; - phys_domain->span = cpu_possible_map; - phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; - } - - /* Set up CPU (sibling) groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - int j; - first = last = NULL; - - if (i != first_cpu(cpu_domain->span)) - continue; - - for_each_cpu_mask(j, cpu_domain->span) { - struct sched_group *cpu = &sched_group_cpus[j]; - - cpus_clear(cpu->cpumask); - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - first = last = NULL; - /* Set up physical groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_group *cpu = &sched_group_phys[i]; - - if (i != first_cpu(cpu_domain->span)) - continue; - - cpu->cpumask = cpu_domain->span; - /* See SMT+NUMA setup for comment */ - cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_domain, i); - } -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_SCHED_SMT */ - /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. 
--RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) diff -puN arch/ppc64/kernel/smp.c~sched-consolidate-sched-domains arch/ppc64/kernel/smp.c --- 25/arch/ppc64/kernel/smp.c~sched-consolidate-sched-domains 2004-08-01 23:00:33.002515792 -0700 +++ 25-akpm/arch/ppc64/kernel/smp.c 2004-08-01 23:00:33.031511384 -0700 @@ -55,6 +55,9 @@ #include int smp_threads_ready; +#ifdef CONFIG_SCHED_SMT +cpumask_t cpu_sibling_map[NR_CPUS]; +#endif unsigned long cache_decay_ticks; cpumask_t cpu_possible_map = CPU_MASK_NONE; @@ -442,6 +445,15 @@ static inline void look_for_more_cpus(vo /* Make those cpus (which might appear later) possible too. */ for (i = 0; i < maxcpus; i++) cpu_set(i, cpu_possible_map); + +#ifdef CONFIG_SCHED_SMT + memset(cpu_sibling_map, 0, sizeof(cpu_sibling_map)); + for_each_cpu(i) { + cpu_set(i, cpu_sibling_map[i]); + if (cur_cpu_spec->cpu_features & CPU_FTR_SMT) + cpu_set(i^1, cpu_sibling_map[i]); + } +#endif } #else /* ... CONFIG_HOTPLUG_CPU */ static inline int __devinit smp_startup_cpu(unsigned int lcpu) @@ -1004,218 +1016,3 @@ void __init smp_cpus_done(unsigned int m set_cpus_allowed(current, old_mask); } - -#ifdef CONFIG_SCHED_SMT -#ifdef CONFIG_NUMA -static struct sched_group sched_group_cpus[NR_CPUS]; -static struct sched_group sched_group_phys[NR_CPUS]; -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_domain, node_domains); -__init void arch_init_sched_domains(void) -{ - int i; - struct sched_group *first = NULL, *last = NULL; - - /* Set up domains */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - struct sched_domain *node_domain = &per_cpu(node_domains, i); - int node = cpu_to_node(i); - cpumask_t nodemask = node_to_cpumask(node); - cpumask_t my_cpumask = cpumask_of_cpu(i); - cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1); - - *cpu_domain = SD_SIBLING_INIT; - if (cur_cpu_spec->cpu_features & CPU_FTR_SMT) - cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); - else - cpu_domain->span = my_cpumask; - cpu_domain->parent = phys_domain; - cpu_domain->groups = &sched_group_cpus[i]; - - *phys_domain = SD_CPU_INIT; - phys_domain->span = nodemask; - phys_domain->parent = node_domain; - phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; - - *node_domain = SD_NODE_INIT; - node_domain->span = cpu_possible_map; - node_domain->groups = &sched_group_nodes[node]; - } - - /* Set up CPU (sibling) groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - int j; - first = last = NULL; - - if (i != first_cpu(cpu_domain->span)) - continue; - - for_each_cpu_mask(j, cpu_domain->span) { - struct sched_group *cpu = &sched_group_cpus[j]; - - cpus_clear(cpu->cpumask); - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - for (i = 0; i < MAX_NUMNODES; i++) { - int j; - cpumask_t nodemask; - struct sched_group *node = &sched_group_nodes[i]; - cpumask_t node_cpumask = node_to_cpumask(i); - cpus_and(nodemask, node_cpumask, cpu_possible_map); - - if (cpus_empty(nodemask)) - continue; - - first = last = NULL; - /* Set up physical groups */ - for_each_cpu_mask(j, nodemask) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j); - struct 
sched_group *cpu = &sched_group_phys[j]; - - if (j != first_cpu(cpu_domain->span)) - continue; - - cpu->cpumask = cpu_domain->span; - /* - * Make each extra sibling increase power by 10% of - * the basic CPU. This is very arbitrary. - */ - cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - node->cpu_power += cpu->cpu_power; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - /* Set up nodes */ - first = last = NULL; - for (i = 0; i < MAX_NUMNODES; i++) { - struct sched_group *cpu = &sched_group_nodes[i]; - cpumask_t nodemask; - cpumask_t node_cpumask = node_to_cpumask(i); - cpus_and(nodemask, node_cpumask, cpu_possible_map); - - if (cpus_empty(nodemask)) - continue; - - cpu->cpumask = nodemask; - /* ->cpu_power already setup */ - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_domain, i); - } -} -#else /* !CONFIG_NUMA */ -static struct sched_group sched_group_cpus[NR_CPUS]; -static struct sched_group sched_group_phys[NR_CPUS]; -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -__init void arch_init_sched_domains(void) -{ - int i; - struct sched_group *first = NULL, *last = NULL; - - /* Set up domains */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - cpumask_t my_cpumask = cpumask_of_cpu(i); - cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1); - - *cpu_domain = SD_SIBLING_INIT; - if (cur_cpu_spec->cpu_features & CPU_FTR_SMT) - cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); - else - cpu_domain->span = my_cpumask; - cpu_domain->parent = phys_domain; - cpu_domain->groups = &sched_group_cpus[i]; - - *phys_domain = SD_CPU_INIT; - phys_domain->span = cpu_possible_map; - phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; - } - - /* Set up CPU (sibling) groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - int j; - first = last = NULL; - - if (i != first_cpu(cpu_domain->span)) - continue; - - for_each_cpu_mask(j, cpu_domain->span) { - struct sched_group *cpu = &sched_group_cpus[j]; - - cpus_clear(cpu->cpumask); - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - first = last = NULL; - /* Set up physical groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_group *cpu = &sched_group_phys[i]; - - if (i != first_cpu(cpu_domain->span)) - continue; - - cpu->cpumask = cpu_domain->span; - /* See SMT+NUMA setup for comment */ - cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_domain, i); - } -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_SCHED_SMT */ diff -L arch/x86_64/kernel/domain.c -puN arch/x86_64/kernel/domain.c~sched-consolidate-sched-domains /dev/null --- 25/arch/x86_64/kernel/domain.c +++ /dev/null 2003-09-15 06:40:47.000000000 -0700 @@ -1,93 +0,0 @@ -#include -#include - 
-/* Don't do any NUMA setup on Opteron right now. They seem to be - better off with flat scheduling. This is just for SMT. */ - -#ifdef CONFIG_SCHED_SMT - -static struct sched_group sched_group_cpus[NR_CPUS]; -static struct sched_group sched_group_phys[NR_CPUS]; -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -__init void arch_init_sched_domains(void) -{ - int i; - struct sched_group *first = NULL, *last = NULL; - - /* Set up domains */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - - *cpu_domain = SD_SIBLING_INIT; - /* Disable SMT NICE for CMP */ - /* RED-PEN use a generic flag */ - if (cpu_data[i].x86_vendor == X86_VENDOR_AMD) - cpu_domain->flags &= ~SD_SHARE_CPUPOWER; - cpu_domain->span = cpu_sibling_map[i]; - cpu_domain->parent = phys_domain; - cpu_domain->groups = &sched_group_cpus[i]; - - *phys_domain = SD_CPU_INIT; - phys_domain->span = cpu_possible_map; - phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; - } - - /* Set up CPU (sibling) groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - int j; - first = last = NULL; - - if (i != first_cpu(cpu_domain->span)) - continue; - - for_each_cpu_mask(j, cpu_domain->span) { - struct sched_group *cpu = &sched_group_cpus[j]; - - cpus_clear(cpu->cpumask); - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - first = last = NULL; - /* Set up physical groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_group *cpu = &sched_group_phys[i]; - - if (i != first_cpu(cpu_domain->span)) - continue; - - cpu->cpumask = cpu_domain->span; - /* - * Make each extra sibling increase power by 10% of - * the basic CPU. This is very arbitrary. 
- */ - cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_domain, i); - } -} - -#endif diff -puN arch/x86_64/kernel/Makefile~sched-consolidate-sched-domains arch/x86_64/kernel/Makefile --- 25/arch/x86_64/kernel/Makefile~sched-consolidate-sched-domains 2004-08-01 23:00:33.005515336 -0700 +++ 25-akpm/arch/x86_64/kernel/Makefile 2004-08-01 23:00:33.033511080 -0700 @@ -25,7 +25,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o obj-$(CONFIG_SWIOTLB) += swiotlb.o -obj-$(CONFIG_SCHED_SMT) += domain.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KGDB) += kgdb_stub.o diff -puN arch/x86_64/kernel/Makefile-HEAD~sched-consolidate-sched-domains arch/x86_64/kernel/Makefile-HEAD --- 25/arch/x86_64/kernel/Makefile-HEAD~sched-consolidate-sched-domains 2004-08-01 23:00:33.007515032 -0700 +++ 25-akpm/arch/x86_64/kernel/Makefile-HEAD 2004-08-01 23:00:33.033511080 -0700 @@ -25,7 +25,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o obj-$(CONFIG_SWIOTLB) += swiotlb.o -obj-$(CONFIG_SCHED_SMT) += domain.o obj-$(CONFIG_MODULES) += module.o diff -puN Documentation/sched-domains.txt~sched-consolidate-sched-domains Documentation/sched-domains.txt --- 25/Documentation/sched-domains.txt~sched-consolidate-sched-domains 2004-08-01 23:00:33.008514880 -0700 +++ 25-akpm/Documentation/sched-domains.txt 2004-08-01 23:00:33.034510928 -0700 @@ -5,12 +5,13 @@ MUST be NULL terminated, and domain stru are locklessly updated. Each scheduling domain spans a number of CPUs (stored in the ->span field). -A domain's span MUST be a superset of it child's span, and a base domain -for CPU i MUST span at least i. The top domain for each CPU will generally -span all CPUs in the system although strictly it doesn't have to, but this -could lead to a case where some CPUs will never be given tasks to run unless -the CPUs allowed mask is explicitly set. A sched domain's span means "balance -process load among these CPUs". +A domain's span MUST be a superset of its child's span (this restriction could +be relaxed if the need arises), and a base domain for CPU i MUST span at least +i. The top domain for each CPU will generally span all CPUs in the system +although strictly it doesn't have to, but this could lead to a case where some +CPUs will never be given tasks to run unless the CPUs allowed mask is +explicitly set. A sched domain's span means "balance process load among these +CPUs". Each scheduling domain must have one or more CPU groups (struct sched_group) which are organised as a circular one way linked list from the ->groups @@ -46,6 +47,20 @@ The implementor should read comments in struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of the specifics and what to tune. +For SMT, the architecture must define CONFIG_SCHED_SMT and provide a +cpumask_t cpu_sibling_map[NR_CPUS], where cpu_sibling_map[i] is the mask of +all "i"'s siblings as well as "i" itself. + +Architectures may override the default SD_*_INIT flags while using the +generic domain builder in kernel/sched.c if they wish to retain the +traditional SMT->SMP->NUMA topology (or some subset of that).
This +can be done by #define'ing ARCH_HAS_SCHED_TUNE. + +Alternatively, the architecture may completely override the generic domain +builder by #define'ing ARCH_HAS_SCHED_DOMAIN, and providing its own +arch_init_sched_domains function. This function must attach domains to all +CPUs using cpu_attach_domain. + Implementors should change the line #undef SCHED_DOMAIN_DEBUG to diff -puN include/asm-i386/processor.h~sched-consolidate-sched-domains include/asm-i386/processor.h --- 25/include/asm-i386/processor.h~sched-consolidate-sched-domains 2004-08-01 23:00:33.010514576 -0700 +++ 25-akpm/include/asm-i386/processor.h 2004-08-01 23:00:33.035510776 -0700 @@ -647,9 +647,4 @@ extern void select_idle_routine(const st #define cache_line_size() (boot_cpu_data.x86_cache_alignment) -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_DOMAIN -#define ARCH_HAS_SCHED_WAKE_IDLE -#endif - #endif /* __ASM_I386_PROCESSOR_H */ diff -puN include/asm-ppc64/processor.h~sched-consolidate-sched-domains include/asm-ppc64/processor.h --- 25/include/asm-ppc64/processor.h~sched-consolidate-sched-domains 2004-08-01 23:00:33.011514424 -0700 +++ 25-akpm/include/asm-ppc64/processor.h 2004-08-01 23:00:33.035510776 -0700 @@ -626,11 +626,6 @@ static inline void prefetchw(const void #define spin_lock_prefetch(x) prefetchw(x) -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_DOMAIN -#define ARCH_HAS_SCHED_WAKE_IDLE -#endif - #endif /* ASSEMBLY */ /* diff -puN include/asm-ppc64/smp.h~sched-consolidate-sched-domains include/asm-ppc64/smp.h --- 25/include/asm-ppc64/smp.h~sched-consolidate-sched-domains 2004-08-01 23:00:33.013514120 -0700 +++ 25-akpm/include/asm-ppc64/smp.h 2004-08-01 23:00:33.036510624 -0700 @@ -73,6 +73,9 @@ void smp_init_pSeries(void); extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); extern void cpu_die(void) __attribute__((noreturn)); +#ifdef CONFIG_SCHED_SMT +extern cpumask_t cpu_sibling_map[NR_CPUS]; +#endif #endif /* !(CONFIG_SMP) */ #define get_hard_smp_processor_id(CPU) (paca[(CPU)].hw_cpu_id) diff -puN include/asm-x86_64/processor.h~sched-consolidate-sched-domains include/asm-x86_64/processor.h --- 25/include/asm-x86_64/processor.h~sched-consolidate-sched-domains 2004-08-01 23:00:33.014513968 -0700 +++ 25-akpm/include/asm-x86_64/processor.h 2004-08-01 23:00:33.036510624 -0700 @@ -458,9 +458,4 @@ static inline void __mwait(unsigned long #define cache_line_size() (boot_cpu_data.x86_cache_alignment) -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_DOMAIN -#define ARCH_HAS_SCHED_WAKE_IDLE -#endif - #endif /* __ASM_X86_64_PROCESSOR_H */ diff -puN include/linux/sched.h~sched-consolidate-sched-domains include/linux/sched.h --- 25/include/linux/sched.h~sched-consolidate-sched-domains 2004-08-01 23:00:33.016513664 -0700 +++ 25-akpm/include/linux/sched.h 2004-08-01 23:00:33.037510472 -0700 @@ -607,6 +607,9 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ }; +#ifndef ARCH_HAS_SCHED_TUNE +#ifdef CONFIG_SCHED_SMT +#define ARCH_HAS_SCHED_WAKE_IDLE /* Common values for SMT siblings */ #define SD_SIBLING_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ @@ -628,6 +631,7 @@ struct sched_domain { .balance_interval = 1, \ .nr_balance_failed = 0, \ } +#endif /* Common values for CPUs */ #define SD_CPU_INIT (struct sched_domain) { \ @@ -670,6 +674,7 @@ struct sched_domain { .nr_balance_failed = 0, \ } #endif +#endif /* ARCH_HAS_SCHED_TUNE */ extern void cpu_attach_domain(struct sched_domain *sd, int cpu); diff -puN kernel/sched.c~sched-consolidate-sched-domains
kernel/sched.c --- 25/kernel/sched.c~sched-consolidate-sched-domains 2004-08-01 23:00:33.018513360 -0700 +++ 25-akpm/kernel/sched.c 2004-08-01 23:00:33.041509864 -0700 @@ -3669,118 +3669,175 @@ void cpu_attach_domain(struct sched_doma #ifdef ARCH_HAS_SCHED_DOMAIN extern void __init arch_init_sched_domains(void); #else -static struct sched_group sched_group_cpus[NR_CPUS]; + +#ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +__init static int cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +__init static int cpu_to_phys_group(int cpu) +{ + return first_cpu(cpu_sibling_map[cpu]); +} + #ifdef CONFIG_NUMA -static struct sched_group sched_group_nodes[MAX_NUMNODES]; static DEFINE_PER_CPU(struct sched_domain, node_domains); -static void __init arch_init_sched_domains(void) +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +__init static int cpu_to_node_group(int cpu) { - int i; - struct sched_group *first_node = NULL, *last_node = NULL; + return cpu_to_node(cpu); +} +#endif - /* Set up domains */ - for_each_cpu(i) { - int node = cpu_to_node(i); - cpumask_t nodemask = node_to_cpumask(node); - struct sched_domain *node_sd = &per_cpu(node_domains, i); - struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); - - *node_sd = SD_NODE_INIT; - node_sd->span = cpu_possible_map; - node_sd->groups = &sched_group_nodes[cpu_to_node(i)]; - - *cpu_sd = SD_CPU_INIT; - cpus_and(cpu_sd->span, nodemask, cpu_possible_map); - cpu_sd->groups = &sched_group_cpus[i]; - cpu_sd->parent = node_sd; - } +/* + * init_sched_build_groups takes an array of groups, the cpumask we wish + * to span, and a pointer to a function which identifies what group a CPU + * belongs to. The return value of group_fn must be a valid index into the + * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we + * keep track of groups covered with a cpumask_t). + * + * init_sched_build_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. 
+ */ +__init static void init_sched_build_groups(struct sched_group groups[], + cpumask_t span, int (*group_fn)(int cpu)) +{ + struct sched_group *first = NULL, *last = NULL; + cpumask_t covered = CPU_MASK_NONE; + int i; - /* Set up groups */ - for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t tmp = node_to_cpumask(i); - cpumask_t nodemask; - struct sched_group *first_cpu = NULL, *last_cpu = NULL; - struct sched_group *node = &sched_group_nodes[i]; + for_each_cpu_mask(i, span) { + int group = group_fn(i); + struct sched_group *sg = &groups[group]; int j; - cpus_and(nodemask, tmp, cpu_possible_map); - - if (cpus_empty(nodemask)) + if (cpu_isset(i, covered)) continue; - node->cpumask = nodemask; - node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask); + sg->cpumask = CPU_MASK_NONE; + sg->cpu_power = 0; - for_each_cpu_mask(j, node->cpumask) { - struct sched_group *cpu = &sched_group_cpus[j]; + for_each_cpu_mask(j, span) { + if (group_fn(j) != group) + continue; - cpus_clear(cpu->cpumask); - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first_cpu) - first_cpu = cpu; - if (last_cpu) - last_cpu->next = cpu; - last_cpu = cpu; + cpu_set(j, covered); + cpu_set(j, sg->cpumask); } - last_cpu->next = first_cpu; - - if (!first_node) - first_node = node; - if (last_node) - last_node->next = node; - last_node = node; - } - last_node->next = first_node; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_sd, i); + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; } + last->next = first; } -#else /* !CONFIG_NUMA */ -static void __init arch_init_sched_domains(void) +__init static void arch_init_sched_domains(void) { int i; - struct sched_group *first_cpu = NULL, *last_cpu = NULL; /* Set up domains */ for_each_cpu(i) { - struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + int group; + struct sched_domain *sd = NULL, *p; + cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - *cpu_sd = SD_CPU_INIT; - cpu_sd->span = cpu_possible_map; - cpu_sd->groups = &sched_group_cpus[i]; +#ifdef CONFIG_NUMA + sd = &per_cpu(node_domains, i); + group = cpu_to_node_group(i); + *sd = SD_NODE_INIT; + sd->span = cpu_possible_map; + sd->groups = &sched_group_nodes[group]; +#endif + + p = sd; + sd = &per_cpu(phys_domains, i); + group = cpu_to_phys_group(i); + *sd = SD_CPU_INIT; + sd->span = nodemask; + sd->parent = p; + sd->groups = &sched_group_phys[group]; + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i); + group = cpu_to_cpu_group(i); + *sd = SD_SIBLING_INIT; + sd->span = cpu_sibling_map[i]; + sd->parent = p; + sd->groups = &sched_group_cpus[group]; +#endif } - /* Set up CPU groups */ - for_each_cpu_mask(i, cpu_possible_map) { - struct sched_group *cpu = &sched_group_cpus[i]; +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + if (i != first_cpu(cpu_sibling_map[i])) + continue; - cpus_clear(cpu->cpumask); - cpu_set(i, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; + init_sched_build_groups(sched_group_cpus, cpu_sibling_map[i], + &cpu_to_cpu_group); + } +#endif - if (!first_cpu) - first_cpu = cpu; - if (last_cpu) - last_cpu->next = cpu; - last_cpu = cpu; + /* Set up physical groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + + cpus_and(nodemask, nodemask, cpu_possible_map); + if (cpus_empty(nodemask)) + continue; + + init_sched_build_groups(sched_group_phys, nodemask, + &cpu_to_phys_group); } - last_cpu->next = 
first_cpu; - mb(); /* domains were modified outside the lock */ +#ifdef CONFIG_NUMA + /* Set up node groups */ + init_sched_build_groups(sched_group_nodes, cpu_possible_map, + &cpu_to_node_group); +#endif + + /* Calculate CPU power for physical packages and nodes */ + for_each_cpu(i) { + int power; + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); + power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = power; +#endif + + sd = &per_cpu(phys_domains, i); + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; + +#ifdef CONFIG_NUMA + sd = &per_cpu(node_domains, i); + sd->groups->cpu_power += power; +#endif + } + + /* Attach the domains */ for_each_cpu(i) { - struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_sd, i); + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); +#else + sd = &per_cpu(phys_domains, i); +#endif + cpu_attach_domain(sd, i); } } - -#endif /* CONFIG_NUMA */ #endif /* ARCH_HAS_SCHED_DOMAIN */ #define SCHED_DOMAIN_DEBUG _
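
For reference, a complete ARCH_HAS_SCHED_DOMAIN override (the "build it all
yourself" path described in Documentation/sched-domains.txt above) looks
roughly like the sketch below.  It is modelled on the flat, non-NUMA builders
this patch removes and is illustrative only: the flat_domains/flat_groups
names are made up here, and a real architecture would place this in its own
arch code next to a #define ARCH_HAS_SCHED_DOMAIN in its processor.h (as
i386, ppc64 and x86_64 did before this patch).

/*
 * Sketch only: one flat domain per CPU spanning all possible CPUs, with a
 * single-CPU group for each CPU linked into one circular list.
 */
static struct sched_group flat_groups[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, flat_domains);

__init void arch_init_sched_domains(void)
{
	struct sched_group *first = NULL, *last = NULL;
	int i;

	/* Set up one domain per CPU, all spanning cpu_possible_map */
	for_each_cpu(i) {
		struct sched_domain *sd = &per_cpu(flat_domains, i);

		*sd = SD_CPU_INIT;
		sd->span = cpu_possible_map;
		sd->groups = &flat_groups[i];
	}

	/* One single-CPU group per CPU, linked into a circular list */
	for_each_cpu(i) {
		struct sched_group *sg = &flat_groups[i];

		cpus_clear(sg->cpumask);
		cpu_set(i, sg->cpumask);
		sg->cpu_power = SCHED_LOAD_SCALE;

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;

	mb();	/* domains/groups were set up without locks */
	for_each_cpu(i)
		cpu_attach_domain(&per_cpu(flat_domains, i), i);
}

The same shape with more levels (siblings, physical packages, nodes) is
essentially what the new generic builder in kernel/sched.c produces, so an
architecture only needs this hook if its topology does not fit the
SMT->SMP->NUMA pattern at all.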