diff -upN /dev/null current/Documentation/sched-domains.txt
--- /dev/null	2004-02-24 15:23:11.000000000 -0800
+++ current/Documentation/sched-domains.txt	2004-04-29 10:39:06.000000000 -0700
@@ -0,0 +1,55 @@
+Each CPU has a "base" scheduling domain (struct sched_domain). These are
+accessed via the cpu_sched_domain(i) and this_sched_domain() macros. The domain
+hierarchy is built from these base domains via the ->parent pointer. ->parent
+MUST be NULL terminated, and domain structures should be per-CPU as they
+are locklessly updated.
+
+Each scheduling domain spans a number of CPUs (stored in the ->span field).
+A domain's span MUST be a superset of its child's span, and a base domain
+for CPU i MUST span at least CPU i. The top domain for each CPU will generally
+span all CPUs in the system, although strictly it doesn't have to, but this
+could lead to a case where some CPUs will never be given tasks to run unless
+the CPUs allowed mask is explicitly set. A sched domain's span means "balance
+process load among these CPUs".
+
+Each scheduling domain must have one or more CPU groups (struct sched_group)
+which are organised as a circular one-way linked list from the ->groups
+pointer. The union of cpumasks of these groups MUST be the same as the
+domain's span. The intersection of cpumasks from any two of these groups
+MUST be the empty set. The group pointed to by the ->groups pointer MUST
+contain the CPU to which the domain belongs. Groups may be shared among
+CPUs as they contain read only data after they have been set up.
+
+Balancing within a sched domain occurs between groups. That is, each group
+is treated as one entity. The load of a group is defined as the sum of the
+load of each of its member CPUs, and only when the load of a group becomes
+out of balance are tasks moved between groups.
+
+In kernel/sched.c, rebalance_tick is run periodically on each CPU. This
+function takes its CPU's base sched domain and checks to see if it has reached
+its rebalance interval. If so, then it will run load_balance on that domain.
+rebalance_tick then checks the parent sched_domain (if it exists), and the
+parent of the parent, and so forth.
+
+*** Implementing sched domains ***
+The "base" domain will "span" the first level of the hierarchy. In the case
+of SMT, you'll span all siblings of the physical CPU, with each group being
+a single virtual CPU.
+
+In SMP, the parent of the base domain will span all physical CPUs in the
+node, with each group being a single physical CPU. Then with NUMA, the parent
+of the SMP domain will span the entire machine, with each group having the
+cpumask of a node. Or you could do multi-level NUMA; Opteron, for example,
+might have just one domain covering its one NUMA level.
+
+The implementor should read comments in include/linux/sched.h:
+the struct sched_domain fields, the SD_* flags and the SD_*_INIT initialisers,
+to get an idea of the specifics and what to tune.
+
+Implementors should change the line
+#undef SCHED_DOMAIN_DEBUG
+to
+#define SCHED_DOMAIN_DEBUG
+in kernel/sched.c as this enables an error-checking parse of the sched domains
+which should catch most possible errors (described above). It also prints out
+the domain structure in a visual format.
diff -upN reference/arch/i386/Kconfig current/arch/i386/Kconfig
--- reference/arch/i386/Kconfig	2004-04-29 10:39:05.000000000 -0700
+++ current/arch/i386/Kconfig	2004-04-29 10:39:06.000000000 -0700
@@ -475,6 +475,16 @@ config NR_CPUS
 	  This is purely to save memory - each supported CPU adds
 	  approximately eight kilobytes to the kernel image.
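
(Illustration only -- not part of the patch. A minimal sketch of how the
invariants described in the sched-domains documentation above could be
walked for one CPU. It assumes it lives in kernel/sched.c, where cpu_rq()
and the per-runqueue ->sd pointer added by this patch are visible; the
SCHED_DOMAIN_DEBUG code mentioned at the end of that document performs a
more complete version of the same checking.)

	static void sched_domain_sketch(int cpu)
	{
		struct sched_domain *sd;

		/* Walk the NULL-terminated ->parent chain, starting at the
		 * CPU's base domain. */
		for (sd = cpu_rq(cpu)->sd; sd; sd = sd->parent) {
			struct sched_group *group = sd->groups;
			cpumask_t covered = CPU_MASK_NONE;

			/* Every domain in this CPU's hierarchy must span the CPU. */
			WARN_ON(!cpu_isset(cpu, sd->span));

			/* The group pointed to by ->groups must contain the CPU
			 * the domain belongs to. */
			WARN_ON(!cpu_isset(cpu, group->cpumask));

			/* ->groups is a circular list; the union of the group
			 * cpumasks must equal the domain's span. */
			do {
				cpus_or(covered, covered, group->cpumask);
				group = group->next;
			} while (group != sd->groups);
			WARN_ON(!cpus_equal(covered, sd->span));
		}
	}
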
+config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default off + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say + N here. + config PREEMPT bool "Preemptible Kernel" help @@ -705,7 +715,7 @@ config X86_PAE # Common NUMA Features config NUMA - bool "Numa Memory Allocation Support" + bool "Numa Memory Allocation and Scheduler Support" depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) diff -upN reference/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c current/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c --- reference/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2004-02-18 14:56:46.000000000 -0800 +++ current/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2004-04-29 10:39:06.000000000 -0700 @@ -57,8 +57,7 @@ static int cpufreq_p4_setdc(unsigned int u32 l, h; cpumask_t cpus_allowed, affected_cpu_map; struct cpufreq_freqs freqs; - int hyperthreading = 0; - int sibling = 0; + int j; if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV)) @@ -68,13 +67,10 @@ static int cpufreq_p4_setdc(unsigned int cpus_allowed = current->cpus_allowed; /* only run on CPU to be set, or on its sibling */ - affected_cpu_map = cpumask_of_cpu(cpu); -#ifdef CONFIG_X86_HT - hyperthreading = ((cpu_has_ht) && (smp_num_siblings == 2)); - if (hyperthreading) { - sibling = cpu_sibling_map[cpu]; - cpu_set(sibling, affected_cpu_map); - } +#ifdef CONFIG_SMP + affected_cpu_map = cpu_sibling_map[cpu]; +#else + affected_cpu_map = cpumask_of_cpu(cpu); #endif set_cpus_allowed(current, affected_cpu_map); BUG_ON(!cpu_isset(smp_processor_id(), affected_cpu_map)); @@ -97,11 +93,11 @@ static int cpufreq_p4_setdc(unsigned int /* notifiers */ freqs.old = stock_freq * l / 8; freqs.new = stock_freq * newstate / 8; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - if (hyperthreading) { - freqs.cpu = sibling; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + for_each_cpu(j) { + if (cpu_isset(j, affected_cpu_map)) { + freqs.cpu = j; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } } rdmsr(MSR_IA32_THERM_STATUS, l, h); @@ -132,10 +128,11 @@ static int cpufreq_p4_setdc(unsigned int set_cpus_allowed(current, cpus_allowed); /* notifiers */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - if (hyperthreading) { - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + for_each_cpu(j) { + if (cpu_isset(j, affected_cpu_map)) { + freqs.cpu = j; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } } return 0; diff -upN reference/arch/i386/kernel/io_apic.c current/arch/i386/kernel/io_apic.c --- reference/arch/i386/kernel/io_apic.c 2004-04-07 14:53:56.000000000 -0700 +++ current/arch/i386/kernel/io_apic.c 2004-04-29 10:39:06.000000000 -0700 @@ -317,8 +317,7 @@ struct irq_cpu_info { #define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) -#define CPU_TO_PACKAGEINDEX(i) \ - ((physical_balance && i > cpu_sibling_map[i]) ? 
cpu_sibling_map[i] : i) +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) @@ -401,6 +400,7 @@ static void do_irq_balance(void) unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); unsigned long move_this_load = 0; int max_loaded = 0, min_loaded = 0; + int load; unsigned long useful_load_threshold = balanced_irq_interval + 10; int selected_irq; int tmp_loaded, first_attempt = 1; @@ -452,7 +452,7 @@ static void do_irq_balance(void) for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - if (physical_balance && i > cpu_sibling_map[i]) + if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (min_cpu_irq > CPU_IRQ(i)) { min_cpu_irq = CPU_IRQ(i); @@ -471,7 +471,7 @@ tryanothercpu: for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - if (physical_balance && i > cpu_sibling_map[i]) + if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (max_cpu_irq <= CPU_IRQ(i)) continue; @@ -551,9 +551,14 @@ tryanotherirq: * We seek the least loaded sibling by making the comparison * (A+B)/2 vs B */ - if (physical_balance && (CPU_IRQ(min_loaded) >> 1) > - CPU_IRQ(cpu_sibling_map[min_loaded])) - min_loaded = cpu_sibling_map[min_loaded]; + load = CPU_IRQ(min_loaded) >> 1; + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + if (load > CPU_IRQ(j)) { + /* This won't change cpu_sibling_map[min_loaded] */ + load = CPU_IRQ(j); + min_loaded = j; + } + } cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); target_cpu_mask = cpumask_of_cpu(min_loaded); diff -upN reference/arch/i386/kernel/smpboot.c current/arch/i386/kernel/smpboot.c --- reference/arch/i386/kernel/smpboot.c 2004-03-11 14:33:36.000000000 -0800 +++ current/arch/i386/kernel/smpboot.c 2004-04-29 10:39:06.000000000 -0700 @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -934,7 +935,7 @@ static int boot_cpu_logical_apicid; /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -int cpu_sibling_map[NR_CPUS] __cacheline_aligned; +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; static void __init smp_boot_cpus(unsigned int max_cpus) { @@ -953,6 +954,8 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); + cpus_clear(cpu_sibling_map[0]); + cpu_set(0, cpu_sibling_map[0]); /* * If we couldn't find an SMP configuration at boot time, @@ -1079,32 +1082,34 @@ static void __init smp_boot_cpus(unsigne Dprintk("Boot done.\n"); /* - * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so - * that we can tell the sibling CPU efficiently. + * construct cpu_sibling_map[], so that we can tell sibling CPUs + * efficiently. 
*/ - if (cpu_has_ht && smp_num_siblings > 1) { - for (cpu = 0; cpu < NR_CPUS; cpu++) - cpu_sibling_map[cpu] = NO_PROC_ID; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - int i; - if (!cpu_isset(cpu, cpu_callout_map)) - continue; + for (cpu = 0; cpu < NR_CPUS; cpu++) + cpus_clear(cpu_sibling_map[cpu]); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + int siblings = 0; + int i; + if (!cpu_isset(cpu, cpu_callout_map)) + continue; + if (smp_num_siblings > 1) { for (i = 0; i < NR_CPUS; i++) { - if (i == cpu || !cpu_isset(i, cpu_callout_map)) + if (!cpu_isset(i, cpu_callout_map)) continue; if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_sibling_map[cpu] = i; - printk("cpu_sibling_map[%d] = %d\n", cpu, cpu_sibling_map[cpu]); - break; + siblings++; + cpu_set(i, cpu_sibling_map[cpu]); } } - if (cpu_sibling_map[cpu] == NO_PROC_ID) { - smp_num_siblings = 1; - printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu); - } + } else { + siblings++; + cpu_set(cpu, cpu_sibling_map[cpu]); } + + if (siblings != smp_num_siblings) + printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings); } smpboot_setup_io_apic(); @@ -1118,6 +1123,256 @@ static void __init smp_boot_cpus(unsigne synchronize_tsc_bp(); } +#ifdef CONFIG_SCHED_SMT +#ifdef CONFIG_NUMA +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static DEFINE_PER_CPU(struct sched_domain, node_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first = NULL, *last = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + cpu_domain->cache_hot_time = cacheflush_time / 2; + cpu_domain->parent = phys_domain; + cpu_domain->groups = &sched_group_cpus[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = nodemask; + phys_domain->cache_hot_time = cacheflush_time / 2; + phys_domain->parent = node_domain; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + node_domain->cache_hot_time = cacheflush_time; + node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + int j; + first = last = NULL; + + if (i != first_cpu(cpu_domain->span)) + continue; + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpu->cpumask = CPU_MASK_NONE; + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + } + + for (i = 0; i < MAX_NUMNODES; i++) { + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + first = last = NULL; + /* Set up physical groups */ + for_each_cpu_mask(j, nodemask) { + struct sched_domain *cpu_domain = 
&per_cpu(cpu_domains, j); + struct sched_group *cpu = &sched_group_phys[j]; + + if (j != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* + * Make each extra sibling increase power by 10% of + * the basic CPU. This is very arbitrary. + */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + node->cpu_power += cpu->cpu_power; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + } + + /* Set up nodes */ + first = last = NULL; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *cpu = &sched_group_nodes[i]; + cpumask_t nodemask; + cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + cpu->cpumask = nodemask; + /* ->cpu_power already setup */ + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); + } +} +#else /* !CONFIG_NUMA */ +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first = NULL, *last = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + cpu_domain->cache_hot_time = cacheflush_time / 2; + cpu_domain->parent = phys_domain; + cpu_domain->groups = &sched_group_cpus[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = cpu_possible_map; + phys_domain->cache_hot_time = cacheflush_time / 2; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + int j; + first = last = NULL; + + if (i != first_cpu(cpu_domain->span)) + continue; + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + } + + first = last = NULL; + /* Set up physical groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_group *cpu = &sched_group_phys[i]; + + if (i != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* See SMT+NUMA setup for comment */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); + } +} +#endif /* CONFIG_NUMA */ +#else /* !CONFIG_SCHED_SMT */ + +static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); + +void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + + 
*cpu_sd = SD_CPU_INIT; + cpu_sd->span = cpu_possible_map; + cpu_sd->cache_hot_time = cacheflush_time / 2; + cpu_sd->groups = &sched_group_cpus[i]; + } + + /* Set up CPU groups */ + for_each_cpu_mask(i, cpu_possible_map) { + struct sched_group *cpu = &sched_group_cpus[i]; + + cpus_clear(cpu->cpumask); + cpu_set(i, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); /* domains were modified outside the lock */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); + } +} +#endif + /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) diff -upN reference/arch/i386/oprofile/op_model_p4.c current/arch/i386/oprofile/op_model_p4.c --- reference/arch/i386/oprofile/op_model_p4.c 2003-10-01 11:40:41.000000000 -0700 +++ current/arch/i386/oprofile/op_model_p4.c 2004-04-29 10:39:06.000000000 -0700 @@ -382,11 +382,8 @@ static struct p4_event_binding p4_events static unsigned int get_stagger(void) { #ifdef CONFIG_SMP - int cpu; - if (smp_num_siblings > 1) { - cpu = smp_processor_id(); - return (cpu_sibling_map[cpu] > cpu) ? 0 : 1; - } + int cpu = smp_processor_id(); + return (cpu != first_cpu(cpu_sibling_map[cpu])); #endif return 0; } diff -upN reference/arch/ppc64/Kconfig current/arch/ppc64/Kconfig --- reference/arch/ppc64/Kconfig 2004-04-07 14:54:00.000000000 -0700 +++ current/arch/ppc64/Kconfig 2004-04-29 10:39:06.000000000 -0700 @@ -173,6 +173,15 @@ config NUMA bool "NUMA support" depends on DISCONTIGMEM +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default off + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with POWER5 cpus at a cost of slightly increased + overhead in some places. If unsure say N here. + config PREEMPT bool "Preemptible Kernel" depends on BROKEN diff -upN reference/arch/ppc64/kernel/smp.c current/arch/ppc64/kernel/smp.c --- reference/arch/ppc64/kernel/smp.c 2004-04-07 14:54:00.000000000 -0700 +++ current/arch/ppc64/kernel/smp.c 2004-04-29 10:39:06.000000000 -0700 @@ -579,11 +579,6 @@ void __init smp_prepare_cpus(unsigned in paca[boot_cpuid].prof_counter = 1; paca[boot_cpuid].prof_multiplier = 1; - /* - * XXX very rough. 
- */
-	cache_decay_ticks = HZ/100;
-
 #ifndef CONFIG_PPC_ISERIES
 	paca[boot_cpuid].next_jiffy_update_tb = tb_last_stamp = get_tb();
@@ -796,3 +791,278 @@ static int __init topology_init(void)
 	return 0;
 }
 __initcall(topology_init);
+
+#ifdef CONFIG_SCHED_SMT
+#ifdef CONFIG_NUMA
+static struct sched_group sched_group_cpus[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+__init void arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+	/* Set up domains */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+		struct sched_domain *node_domain = &per_cpu(node_domains, i);
+		int node = cpu_to_node(i);
+		cpumask_t nodemask = node_to_cpumask(node);
+		cpumask_t my_cpumask = cpumask_of_cpu(i);
+		cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1);
+
+		*cpu_domain = SD_SIBLING_INIT;
+		if (__is_processor(PV_POWER5))
+			cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask);
+		else
+			cpu_domain->span = my_cpumask;
+		cpu_domain->groups = &sched_group_cpus[i];
+		cpu_domain->parent = phys_domain;
+
+		*phys_domain = SD_CPU_INIT;
+		phys_domain->span = nodemask;
+		// phys_domain->cache_hot_time = XXX;
+		phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
+		phys_domain->parent = node_domain;
+
+		*node_domain = SD_NODE_INIT;
+		node_domain->span = cpu_possible_map;
+		// node_domain->cache_hot_time = XXX;
+		node_domain->groups = &sched_group_nodes[node];
+	}
+
+	/* Set up CPU (sibling) groups */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
+		int j;
+		first_cpu = last_cpu = NULL;
+
+		if (i != first_cpu(cpu_domain->span)) {
+			per_cpu(cpu_domains, i).flags |= SD_SHARE_CPUPOWER;
+			per_cpu(cpu_domains, first_cpu(cpu_domain->span)).flags |=
+							SD_SHARE_CPUPOWER;
+			continue;
+		}
+
+		for_each_cpu_mask(j, cpu_domain->span) {
+			struct sched_group *cpu = &sched_group_cpus[j];
+
+			cpus_clear(cpu->cpumask);
+			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
+
+			if (!first_cpu)
+				first_cpu = cpu;
+			if (last_cpu)
+				last_cpu->next = cpu;
+			last_cpu = cpu;
+		}
+		last_cpu->next = first_cpu;
+	}
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		int j;
+		cpumask_t nodemask;
+		struct sched_group *node = &sched_group_nodes[i];
+		cpumask_t node_cpumask = node_to_cpumask(i);
+		cpus_and(nodemask, node_cpumask, cpu_online_map);
+
+		if (cpus_empty(nodemask))
+			continue;
+
+		first_cpu = last_cpu = NULL;
+		/* Set up physical groups */
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j);
+			struct sched_group *cpu = &sched_group_phys[j];
+
+			if (j != first_cpu(cpu_domain->span))
+				continue;
+
+			cpu->cpumask = cpu_domain->span;
+			/*
+			 * Make each extra sibling increase power by 10% of
+			 * the basic CPU. This is very arbitrary.
+ */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + node->cpu_power += cpu->cpu_power; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + /* Set up nodes */ + first_cpu = last_cpu = NULL; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *cpu = &sched_group_nodes[i]; + cpumask_t nodemask; + cpumask_t node_cpumask = node_to_cpumask(i); + cpus_and(nodemask, node_cpumask, cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + cpu->cpumask = nodemask; + /* ->cpu_power already setup */ + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + int node = cpu_to_node(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); + } +} +#else /* !CONFIG_NUMA */ +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + cpumask_t my_cpumask = cpumask_of_cpu(i); + cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1); + + *cpu_domain = SD_SIBLING_INIT; + if (__is_processor(PV_POWER5)) + cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); + else + cpu_domain->span = my_cpumask; + cpu_domain->groups = &sched_group_cpus[i]; + cpu_domain->parent = phys_domain; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = cpu_possible_map; + // phys_domain->cache_hot_time = XXX; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) { + per_cpu(cpu_domains, i).flags |= SD_SHARE_CPUPOWER; + per_cpu(cpu_domains, first_cpu(cpu_domain->span)).flags |= + SD_SHARE_CPUPOWER; + continue; + } + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_group *cpu = &sched_group_phys[i]; + + if (i != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* See SMT+NUMA setup for comment */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); + } +} +#endif /* CONFIG_NUMA */ +#else /* !CONFIG_SCHED_SMT */ + +#ifdef CONFIG_NUMA +#error ppc64 has no NUMA scheduler defined without CONFIG_SCHED_SMT. 
\ + Please enable CONFIG_SCHED_SMT or bug Anton. +#endif + +static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); + +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + + *cpu_sd = SD_CPU_INIT; + cpu_sd->span = cpu_possible_map; + // cpu_sd->cache_hot_time = XXX; + cpu_sd->groups = &sched_group_cpus[i]; + } + + /* Set up CPU groups */ + for_each_cpu_mask(i, cpu_possible_map) { + struct sched_group *cpu = &sched_group_cpus[i]; + + cpus_clear(cpu->cpumask); + cpu_set(i, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); + } +} + +#endif diff -upN reference/include/asm-generic/tlb.h current/include/asm-generic/tlb.h --- reference/include/asm-generic/tlb.h 2004-03-11 14:35:14.000000000 -0800 +++ current/include/asm-generic/tlb.h 2004-04-29 10:39:06.000000000 -0700 @@ -146,4 +146,6 @@ static inline void tlb_remove_page(struc __pmd_free_tlb(tlb, pmdp); \ } while (0) +#define tlb_migrate_prepare(mm) do { } while(0) + #endif /* _ASM_GENERIC__TLB_H */ diff -upN reference/include/asm-i386/param.h current/include/asm-i386/param.h --- reference/include/asm-i386/param.h 2004-03-11 14:35:15.000000000 -0800 +++ current/include/asm-i386/param.h 2004-04-29 10:39:06.000000000 -0700 @@ -4,7 +4,9 @@ #ifdef __KERNEL__ # define HZ 1000 /* Internal kernel timer frequency */ # define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +# define JIFFIES_TO_MSEC(x) (x) +# define MSEC_TO_JIFFIES(x) (x) #endif #ifndef HZ diff -upN reference/include/asm-i386/processor.h current/include/asm-i386/processor.h --- reference/include/asm-i386/processor.h 2004-03-11 14:35:15.000000000 -0800 +++ current/include/asm-i386/processor.h 2004-04-29 10:39:06.000000000 -0700 @@ -646,4 +646,11 @@ extern inline void prefetchw(const void extern void select_idle_routine(const struct cpuinfo_x86 *c); +#ifdef CONFIG_SMP +# define ARCH_HAS_SCHED_DOMAIN +# ifdef CONFIG_SCHED_SMT +# define ARCH_HAS_SCHED_WAKE_IDLE +# endif +#endif + #endif /* __ASM_I386_PROCESSOR_H */ diff -upN reference/include/asm-i386/smp.h current/include/asm-i386/smp.h --- reference/include/asm-i386/smp.h 2004-03-11 14:35:15.000000000 -0800 +++ current/include/asm-i386/smp.h 2004-04-29 10:39:06.000000000 -0700 @@ -34,7 +34,7 @@ extern void smp_alloc_memory(void); extern int pic_mode; extern int smp_num_siblings; -extern int cpu_sibling_map[]; +extern cpumask_t cpu_sibling_map[]; extern void smp_flush_tlb(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); diff -upN reference/include/asm-ia64/tlb.h current/include/asm-ia64/tlb.h --- reference/include/asm-ia64/tlb.h 2004-03-11 14:35:16.000000000 -0800 +++ current/include/asm-ia64/tlb.h 2004-04-29 10:39:06.000000000 -0700 @@ -211,6 +211,8 @@ __tlb_remove_tlb_entry (struct mmu_gathe tlb->end_addr = address + PAGE_SIZE; } +#define tlb_migrate_prepare(mm) flush_tlb_mm(mm) + #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) diff -upN reference/include/asm-ppc64/processor.h 
current/include/asm-ppc64/processor.h --- reference/include/asm-ppc64/processor.h 2004-03-11 14:35:23.000000000 -0800 +++ current/include/asm-ppc64/processor.h 2004-04-29 10:39:06.000000000 -0700 @@ -618,6 +618,11 @@ static inline void prefetchw(const void #define spin_lock_prefetch(x) prefetchw(x) +#ifdef CONFIG_SCHED_SMT +#define ARCH_HAS_SCHED_DOMAIN +#define ARCH_HAS_SCHED_WAKE_BALANCE +#endif + #endif /* ASSEMBLY */ #endif /* __ASM_PPC64_PROCESSOR_H */ diff -upN reference/include/linux/sched.h current/include/linux/sched.h --- reference/include/linux/sched.h 2004-04-07 14:54:36.000000000 -0700 +++ current/include/linux/sched.h 2004-04-29 10:39:06.000000000 -0700 @@ -147,6 +147,7 @@ extern spinlock_t mmlist_lock; typedef struct task_struct task_t; extern void sched_init(void); +extern void sched_init_smp(void); extern void init_idle(task_t *idle, int cpu); extern void show_state(void); @@ -531,6 +532,109 @@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #ifdef CONFIG_SMP +#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ + +#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 2 /* Balance on exec */ +#define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */ +#define SD_SHARE_CPUPOWER 16 /* Domain members share cpu power */ + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + cpumask_t cpumask; + + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. This should be read only (except for setup). Although + * it will need to be written to at cpu hot(un)plug time, perhaps the + * cpucontrol semaphore will provide enough exclusion? + */ + unsigned long cpu_power; +}; + +struct sched_domain { + /* These fields must be setup */ + struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + cpumask_t span; /* span of all CPUs in this domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ + int flags; /* See SD_* */ + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. 
*/ + unsigned int nr_balance_failed; /* initialise to 0 */ +}; + +/* Common values for SMT siblings */ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 8, \ + .imbalance_pct = 110, \ + .cache_hot_time = 0, \ + .cache_nice_tries = 0, \ + .per_cpu_gain = 15, \ + .flags = SD_BALANCE_NEWIDLE \ + | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ + | SD_SHARE_CPUPOWER, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +/* Common values for CPUs */ +#define SD_CPU_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_hot_time = (5*1000000/2), \ + .cache_nice_tries = 2, \ + .per_cpu_gain = 100, \ + .flags = SD_BALANCE_NEWIDLE \ + | SD_WAKE_AFFINE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +#ifdef CONFIG_NUMA +/* Common values for NUMA nodes */ +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ + .max_interval = 256*fls(num_online_cpus()),\ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_BALANCE_EXEC, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif + +extern void cpu_attach_domain(struct sched_domain *sd, int cpu); + extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); #else static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) @@ -541,12 +645,10 @@ static inline int set_cpus_allowed(task_ extern unsigned long long sched_clock(void); -#ifdef CONFIG_NUMA +#ifdef CONFIG_SMP extern void sched_balance_exec(void); -extern void node_nr_running_init(void); #else #define sched_balance_exec() {} -#define node_nr_running_init() {} #endif /* Move tasks off this (offline) CPU onto another. */ diff -upN reference/init/main.c current/init/main.c --- reference/init/main.c 2004-04-07 14:54:37.000000000 -0700 +++ current/init/main.c 2004-04-29 10:39:06.000000000 -0700 @@ -420,6 +420,13 @@ asmlinkage void __init start_kernel(void */ smp_prepare_boot_cpu(); + /* + * Set up the scheduler prior starting any interrupts (such as the + * timer interrupt). Full topology setup happens at smp_init() + * time - but meanwhile we still have a functioning scheduler. 
+ */ + sched_init(); + build_all_zonelists(); page_alloc_init(); printk("Kernel command line: %s\n", saved_command_line); @@ -431,7 +438,7 @@ asmlinkage void __init start_kernel(void rcu_init(); init_IRQ(); pidhash_init(); - sched_init(); + init_timers(); softirq_init(); time_init(); @@ -571,7 +578,6 @@ static void do_pre_smp_initcalls(void) migration_init(); #endif - node_nr_running_init(); spawn_ksoftirqd(); } @@ -602,6 +608,7 @@ static int init(void * unused) do_pre_smp_initcalls(); smp_init(); + sched_init_smp(); do_basic_setup(); prepare_namespace(); diff -upN reference/kernel/sched.c current/kernel/sched.c --- reference/kernel/sched.c 2004-04-29 10:39:03.000000000 -0700 +++ current/kernel/sched.c 2004-04-29 10:39:06.000000000 -0700 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,13 @@ #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#ifndef JIFFIES_TO_MSEC +# define JIFFIES_TO_MSEC(x) ((x) * 1000 / HZ) +#endif +#ifndef MSEC_TO_JIFFIES +# define MSEC_TO_JIFFIES(x) ((x) * HZ / 1000) +#endif + /* * These are the 'tuning knobs' of the scheduler: * @@ -91,7 +99,6 @@ #define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) #define STARVATION_LIMIT (MAX_SLEEP_AVG) #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define NODE_THRESHOLD 125 #define CREDIT_LIMIT 100 /* @@ -173,11 +180,14 @@ ((MAX_TIMESLICE - MIN_TIMESLICE) * \ (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) -static inline unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } +#define task_hot(p, now, sd) \ + (!TASK_INTERACTIVE(p) && ((now)-(p)->timestamp < (sd)->cache_hot_time)) + /* * These are the runqueue data structures: */ @@ -187,7 +197,7 @@ static inline unsigned int task_timeslic typedef struct runqueue runqueue_t; struct prio_array { - int nr_active; + unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; }; @@ -201,25 +211,34 @@ struct prio_array { */ struct runqueue { spinlock_t lock; + + unsigned long nr_running; unsigned long long nr_switches; - unsigned long nr_running, expired_timestamp, nr_uninterruptible, - timestamp_last_tick; + unsigned long expired_timestamp, nr_uninterruptible; + unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; - int best_expired_prio, prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA - atomic_t *node_nr_running; - int prev_node_load[MAX_NUMNODES]; -#endif + int best_expired_prio; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + task_t *migration_thread; struct list_head migration_queue; - - atomic_t nr_iowait; +#endif }; static DEFINE_PER_CPU(struct runqueue, runqueues); +#define for_each_domain(cpu, domain) \ + for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) + #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) @@ -234,57 +253,12 @@ static DEFINE_PER_CPU(struct runqueue, r # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_NUMA - -/* - * Keep track of running tasks. 
- */ - -static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = - {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)}; - -static inline void nr_running_init(struct runqueue *rq) -{ - rq->node_nr_running = &node_nr_running[0]; -} - -static inline void nr_running_inc(runqueue_t *rq) -{ - atomic_inc(rq->node_nr_running); - rq->nr_running++; -} - -static inline void nr_running_dec(runqueue_t *rq) -{ - atomic_dec(rq->node_nr_running); - rq->nr_running--; -} - -__init void node_nr_running_init(void) -{ - int i; - - for (i = 0; i < NR_CPUS; i++) { - if (cpu_possible(i)) - cpu_rq(i)->node_nr_running = - &node_nr_running[cpu_to_node(i)]; - } -} - -#else /* !CONFIG_NUMA */ - -# define nr_running_init(rq) do { } while (0) -# define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) -# define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) - -#endif /* CONFIG_NUMA */ - /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) { struct runqueue *rq; @@ -307,7 +281,7 @@ static inline void task_rq_unlock(runque /* * rq_lock - lock a given runqueue and disable interrupts. */ -static inline runqueue_t *this_rq_lock(void) +static runqueue_t *this_rq_lock(void) { runqueue_t *rq; @@ -326,7 +300,7 @@ static inline void rq_unlock(runqueue_t /* * Adding/removing a task to/from a priority array: */ -static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +static void dequeue_task(struct task_struct *p, prio_array_t *array) { array->nr_active--; list_del(&p->run_list); @@ -334,7 +308,7 @@ static inline void dequeue_task(struct t __clear_bit(p->prio, array->bitmap); } -static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +static void enqueue_task(struct task_struct *p, prio_array_t *array) { list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); @@ -342,6 +316,21 @@ static inline void enqueue_task(struct t p->array = array; } +#ifdef CONFIG_SMP +/* + * Used by the migration code - we pull tasks from the head of the + * remote queue so we want these tasks to show up at the head of the + * local queue: + */ +static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} +#endif + /* * effective_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. @@ -379,7 +368,7 @@ static int effective_prio(task_t *p) static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq->active); - nr_running_inc(rq); + rq->nr_running++; } static void recalc_task_prio(task_t *p, unsigned long long now) @@ -462,7 +451,7 @@ static void recalc_task_prio(task_t *p, * Update all the scheduling statistics stuff. (sleep average * calculation, priority modifiers, etc.) */ -static inline void activate_task(task_t *p, runqueue_t *rq) +static void activate_task(task_t *p, runqueue_t *rq) { unsigned long long now = sched_clock(); @@ -498,9 +487,9 @@ static inline void activate_task(task_t /* * deactivate_task - remove a task from the runqueue. 
*/ -static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - nr_running_dec(rq); + rq->nr_running--; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; dequeue_task(p, p->array); @@ -514,9 +503,9 @@ static inline void deactivate_task(struc * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -static inline void resched_task(task_t *p) -{ #ifdef CONFIG_SMP +static void resched_task(task_t *p) +{ int need_resched, nrpolling; preempt_disable(); @@ -528,10 +517,13 @@ static inline void resched_task(task_t * if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) smp_send_reschedule(task_cpu(p)); preempt_enable(); +} #else +static inline void resched_task(task_t *p) +{ set_tsk_need_resched(p); -#endif } +#endif /** * task_curr - is this task currently executing on a CPU? @@ -543,40 +535,46 @@ inline int task_curr(task_t *p) } #ifdef CONFIG_SMP +enum request_type { + REQ_MOVE_TASK, + REQ_SET_DOMAIN, +}; + typedef struct { struct list_head list; + enum request_type type; + + /* For REQ_MOVE_TASK */ task_t *task; + int dest_cpu; + + /* For REQ_SET_DOMAIN */ + struct sched_domain *sd; + struct completion done; } migration_req_t; /* - * The task's runqueue lock must be held, and the new mask must be valid. + * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int __set_cpus_allowed(task_t *p, cpumask_t new_mask, - migration_req_t *req) +static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) { runqueue_t *rq = task_rq(p); - p->cpus_allowed = new_mask; - /* - * Can the task run on the task's current CPU? If not then - * migrate the thread off to a proper CPU. - */ - if (cpu_isset(task_cpu(p), new_mask)) - return 0; - /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ if (!p->array && !task_running(rq, p)) { - set_task_cpu(p, any_online_cpu(p->cpus_allowed)); + set_task_cpu(p, dest_cpu); return 0; } init_completion(&req->done); + req->type = REQ_MOVE_TASK; req->task = p; + req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); return 1; } @@ -631,6 +629,54 @@ void kick_process(task_t *p) EXPORT_SYMBOL_GPL(kick_process); +/* + * Return a low guess at the load of cpu. + */ +static inline unsigned long cpu_load(int cpu) +{ + return cpu_rq(cpu)->nr_running * SCHED_LOAD_SCALE; +} + +#endif + +/* + * wake_idle() is useful especially on SMT architectures to wake a + * task onto an idle sibling if we would otherwise wake it onto a + * busy sibling. + * + * Returns the CPU we should wake onto. 
+ */ +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static int wake_idle(int cpu, task_t *p) +{ + cpumask_t tmp; + runqueue_t *rq = cpu_rq(cpu); + struct sched_domain *sd; + int i; + + if (idle_cpu(cpu)) + return cpu; + + sd = rq->sd; + if (!(sd->flags & SD_WAKE_IDLE)) + return cpu; + + cpus_and(tmp, sd->span, cpu_online_map); + for_each_cpu_mask(i, tmp) { + if (!cpu_isset(i, p->cpus_allowed)) + continue; + + if (idle_cpu(i)) + return i; + } + + return cpu; +} +#else +static inline int wake_idle(int cpu, task_t *p) +{ + return cpu; +} #endif /*** @@ -649,52 +695,138 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int try_to_wake_up(task_t * p, unsigned int state, int sync) { + int cpu, this_cpu, success = 0; unsigned long flags; - int success = 0; long old_state; runqueue_t *rq; +#ifdef CONFIG_SMP + unsigned long load, this_load; + struct sched_domain *sd; + unsigned long long now; + int new_cpu; +#endif -repeat_lock_task: rq = task_rq_lock(p, &flags); old_state = p->state; - if (old_state & state) { - if (!p->array) { - /* - * Fast-migrate the task if it's not running or runnable - * currently. Do not violate hard affinity. - */ - if (unlikely(sync && !task_running(rq, p) && - (task_cpu(p) != smp_processor_id()) && - cpu_isset(smp_processor_id(), - p->cpus_allowed) && - !cpu_is_offline(smp_processor_id()))) { - set_task_cpu(p, smp_processor_id()); - task_rq_unlock(rq, &flags); - goto repeat_lock_task; - } - if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - if (sync && (task_cpu(p) == smp_processor_id())) - __activate_task(p, rq); - else { - activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } - success = 1; - } - p->state = TASK_RUNNING; + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + +#ifdef CONFIG_SMP + if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu))) + goto out_activate; + + new_cpu = this_cpu; + + if (cpu == this_cpu) + goto out_set_cpu; + + /* + * Passive balance, if the load on the remote CPU is over + * the limit: + */ + load = cpu_load(cpu) * 100; + /* + * add the new task's effect to its new CPU. If sync wakeup then + * subtract current's load effect: this means that they cancel out + * each other in the sync case, the we have +1 load in the !sync case: + */ + this_load = cpu_load(this_cpu); + if (!sync) + this_load += SCHED_LOAD_SCALE; + this_load *= rq->sd->imbalance_pct; + + if (load > this_load) + goto out_set_cpu; + + /* + * Migrate if the source CPU is not idle or the target + * CPU is idle; if the two CPUs share a domain; and if the task + * is not cache-hot. + * + * (Note that these kinds of migrations violate the equilibrium, + * and might trigger follow-on load-balancing - hence we pick + * cache-cold tasks only.) 
+ */ + if (!cpu_load(cpu) && cpu_load(this_cpu)) + goto out_activate; + + now = sched_clock(); + for_each_domain(this_cpu, sd) { + if (!(sd->flags & SD_WAKE_AFFINE)) + break; + if (task_hot(p, now, sd)) + break; + /* + * The two CPUs share a span of a domain that has affine + * wakeups enabled - the task can be migrated: + */ + if (cpu_isset(cpu, sd->span)) + goto out_set_cpu; + } + /* No luck - fall back to the original CPU: */ + new_cpu = cpu; + +out_set_cpu: + new_cpu = wake_idle(new_cpu, p); + if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) { + set_task_cpu(p, new_cpu); + task_rq_unlock(rq, &flags); + + /* might preempt at this point */ + + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + } + +out_activate: +#endif /* CONFIG_SMP */ + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->activated = -1; + } + + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + if (sync && cpu == this_cpu) { + __activate_task(p, rq); + } else { + activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); } + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: task_rq_unlock(rq, &flags); return success; } + int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, TASK_STOPPED | @@ -749,8 +881,8 @@ void fastcall sched_fork(task_t *p) p->timestamp = sched_clock(); if (!current->time_slice) { /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the * runqueue lock is not a problem. */ current->time_slice = 1; @@ -798,7 +930,7 @@ void fastcall wake_up_forked_process(tas list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; - nr_running_inc(rq); + rq->nr_running++; } task_rq_unlock(rq, &flags); } @@ -849,7 +981,7 @@ void fastcall sched_exit(task_t * p) * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(task_t *prev) +static void finish_task_switch(task_t *prev) { runqueue_t *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; @@ -866,7 +998,7 @@ static inline void finish_task_switch(ta * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. - * Manfred Spraul + * Manfred Spraul */ prev_task_flags = prev->flags; finish_arch_switch(rq, prev); @@ -928,7 +1060,7 @@ unsigned long nr_running(void) { unsigned long i, sum = 0; - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) sum += cpu_rq(i)->nr_running; return sum; @@ -970,7 +1102,7 @@ unsigned long nr_iowait(void) * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. 
*/ -static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) { if (rq1 == rq2) spin_lock(&rq1->lock); @@ -991,14 +1123,21 @@ static inline void double_rq_lock(runque * Note this does not restore interrupts like task_rq_unlock, * you need to do so manually after calling. */ -static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) { spin_unlock(&rq1->lock); if (rq1 != rq2) spin_unlock(&rq2->lock); } -#ifdef CONFIG_NUMA +enum idle_type +{ + IDLE, + NOT_IDLE, + NEWLY_IDLE, +}; + +#ifdef CONFIG_SMP /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only @@ -1007,32 +1146,31 @@ static inline void double_rq_unlock(runq */ static void sched_migrate_task(task_t *p, int dest_cpu) { - runqueue_t *rq; migration_req_t req; + runqueue_t *rq; unsigned long flags; - cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu); lock_cpu_hotplug(); rq = task_rq_lock(p, &flags); - old_mask = p->cpus_allowed; - if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu)) + if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; /* force the process onto the specified CPU */ - if (__set_cpus_allowed(p, new_mask, &req)) { + if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); wait_for_completion(&req.done); - /* If we raced with sys_sched_setaffinity, don't - * restore mask. */ - rq = task_rq_lock(p, &flags); - if (likely(cpus_equal(p->cpus_allowed, new_mask))) { - /* Restore old mask: won't need migration - * thread, since current cpu is allowed. */ - BUG_ON(__set_cpus_allowed(p, old_mask, NULL)); - } + /* + * we want a new context here. This eliminates TLB + * flushes on the cpus where the process executed prior to + * the migration. + */ + tlb_migrate_prepare(current->mm); + unlock_cpu_hotplug(); + + return; } out: task_rq_unlock(rq, &flags); @@ -1041,412 +1179,582 @@ out: /* * Find the least loaded CPU. Slightly favor the current CPU by - * setting its runqueue length as the minimum to start. + * setting its load as the minimum to start. */ -static int sched_best_cpu(struct task_struct *p) +static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd) { - int i, minload, load, best_cpu, node = 0; - cpumask_t cpumask; + int i = 0, min_load, this_cpu, best_cpu; + cpumask_t tmp; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + best_cpu = this_cpu = task_cpu(p); - minload = 10000000; - for_each_node_with_cpus(i) { - /* - * Node load is always divided by nr_cpus_node to normalise - * load values in case cpu count differs from node to node. - * We first multiply node_nr_running by 10 to get a little - * better resolution. 
- */ - load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i); - if (load < minload) { - minload = load; - node = i; - } - } + /* subtract the currently running task's load effect: */ + min_load = cpu_load(this_cpu) - SCHED_LOAD_SCALE; - minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!cpu_isset(i, cpumask)) - continue; - if (cpu_rq(i)->nr_running < minload) { + cpus_and(tmp, sd->span, cpu_online_map); + cpu_clear(this_cpu, tmp); + + for_each_cpu_mask(i, tmp) { + unsigned long load = cpu_load(i); + + if (min_load > load) { best_cpu = i; - minload = cpu_rq(i)->nr_running; + min_load = load; } } return best_cpu; } +/* + * sched_balance_exec(): find the highest-level, exec-balance-capable + * domain and try to migrate the current task to the least loaded CPU. + * + * execve() is a valuable balancing opportunity, because at this point + * the task has the smallest effective cache footprint - a completely new + * process image is being created, so almost all of the currently existing + * cache footprint is irrelevant. So we attempt to balance this task as + * broadly as possible, without considering migration costs, which costs + * otherwise affect all other types of task migrations. + */ void sched_balance_exec(void) { - int new_cpu; + struct sched_domain *sd, *best_sd = NULL; + int new_cpu, this_cpu = get_cpu(); - if (numnodes > 1) { - new_cpu = sched_best_cpu(current); - if (new_cpu != smp_processor_id()) - sched_migrate_task(current, new_cpu); - } -} + /* Prefer the current CPU if there's only this task running: */ + if (this_rq()->nr_running <= 1) + goto out; -/* - * Find the busiest node. All previous node loads contribute with a - * geometrically deccaying weight to the load measure: - * load_{t} = load_{t-1}/2 + nr_node_running_{t} - * This way sudden load peaks are flattened out a bit. - * Node load is divided by nr_cpus_node() in order to compare nodes - * of different cpu count but also [first] multiplied by 10 to - * provide better resolution. - */ -static int find_busiest_node(int this_node) -{ - int i, node = -1, load, this_load, maxload; - - if (!nr_cpus_node(this_node)) - return node; - this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1) - + (10 * atomic_read(&node_nr_running[this_node]) - / nr_cpus_node(this_node)); - this_rq()->prev_node_load[this_node] = this_load; - for_each_node_with_cpus(i) { - if (i == this_node) - continue; - load = (this_rq()->prev_node_load[i] >> 1) - + (10 * atomic_read(&node_nr_running[i]) - / nr_cpus_node(i)); - this_rq()->prev_node_load[i] = load; - if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) { - maxload = load; - node = i; + for_each_domain(this_cpu, sd) + if (sd->flags & SD_BALANCE_EXEC) + best_sd = sd; + + if (best_sd) { + new_cpu = sched_best_cpu(current, best_sd); + if (new_cpu != this_cpu) { + put_cpu(); + sched_migrate_task(current, new_cpu); + return; } } - return node; +out: + put_cpu(); } -#endif /* CONFIG_NUMA */ - -#ifdef CONFIG_SMP - /* - * double_lock_balance - lock the busiest runqueue - * - * this_rq is locked already. Recalculate nr_running if we have to - * drop the runqueue lock. + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
*/ -static inline -unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest, - int this_cpu, int idle, - unsigned int nr_running) +static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) { if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); - /* Need to recalculate nr_running */ - if (idle || (this_rq->nr_running > - this_rq->prev_cpu_load[this_cpu])) - nr_running = this_rq->nr_running; - else - nr_running = this_rq->prev_cpu_load[this_cpu]; } else spin_lock(&busiest->lock); } - return nr_running; } /* - * find_busiest_queue - find the busiest runqueue among the cpus in cpumask. + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. */ static inline -runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, - int *imbalance, cpumask_t cpumask) +void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { - int nr_running, load, max_load, i; - runqueue_t *busiest, *rq_src; + dequeue_task(p, src_array); + src_rq->nr_running--; + set_task_cpu(p, this_cpu); + this_rq->nr_running++; + enqueue_task_head(p, this_array); + p->timestamp = sched_clock() - + (src_rq->timestamp_last_tick - p->timestamp); + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. + */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); +} +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static inline +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *sd, enum idle_type idle) +{ /* - * We search all runqueues to find the most busy one. - * We do this lockless to reduce cache-bouncing overhead, - * we re-check the 'best' source CPU later on again, with - * the lock held. - * - * We fend off statistical fluctuations in runqueue lengths by - * saving the runqueue length (as seen by the balancing CPU) during - * the previous load-balancing operation and using the smaller one - * of the current and saved lengths. If a runqueue is long enough - * for a longer amount of time then we recognize it and pull tasks - * from it. - * - * The 'current runqueue length' is a statistical maximum variable, - * for that one we take the longer one - to avoid fluctuations in - * the other direction. So for a load-balance to happen it needs - * stable long runqueue on the target CPU and stable short runqueue - * on the local runqueue. - * - * We make an exception if this CPU is about to become idle - in - * that case we are less picky about moving a task across CPUs and - * take what can be taken. + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. 
*/ - if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) - nr_running = this_rq->nr_running; - else - nr_running = this_rq->prev_cpu_load[this_cpu]; + if (task_running(rq, p)) + return 0; + if (!cpu_isset(this_cpu, p->cpus_allowed)) + return 0; - busiest = NULL; - max_load = 1; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpumask)) - continue; + /* Aggressive migration if we've failed balancing */ + if (idle == NEWLY_IDLE || + sd->nr_balance_failed < sd->cache_nice_tries) { + if (task_hot(p, rq->timestamp_last_tick, sd)) + return 0; + } - rq_src = cpu_rq(i); - if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i])) - load = rq_src->nr_running; - else - load = this_rq->prev_cpu_load[i]; - this_rq->prev_cpu_load[i] = rq_src->nr_running; + return 1; +} - if ((load > max_load) && (rq_src != this_rq)) { - busiest = rq_src; - max_load = load; - } - } +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + int ret, idx, pulled = 0; + task_t *tmp; - if (likely(!busiest)) + if (max_nr_move <= 0 || busiest->nr_running <= 1) goto out; - *imbalance = max_load - nr_running; + /* We first consider active tasks. */ + if (busiest->active->nr_active) { + array = busiest->active; + dst_array = this_rq->active; + } else { + array = busiest->expired; + dst_array = this_rq->expired; + } - /* It needs an at least ~25% imbalance to trigger balancing. */ - if (!idle && ((*imbalance)*4 < max_load)) { - busiest = NULL; +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == busiest->active && busiest->expired->nr_active) { + array = busiest->expired; + dst_array = this_rq->expired; + goto new_array; + } goto out; } - nr_running = double_lock_balance(this_rq, busiest, this_cpu, - idle, nr_running); - /* - * Make sure nothing changed since we checked the - * runqueue length. - */ - if (busiest->nr_running <= nr_running) { - spin_unlock(&busiest->lock); - busiest = NULL; + head = array->queue + idx; + curr = head->next; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->next; + + ret = can_migrate_task(tmp, busiest, this_cpu, sd, idle); + if (ret == -1) { + idx++; + goto skip_bitmap; + } + if (!ret) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } out: - return busiest; + return pulled; } /* - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. 
*/ -static inline -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, int this_cpu) +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *nr_move, enum idle_type idle) { - dequeue_task(p, src_array); - nr_running_dec(src_rq); - set_task_cpu(p, this_cpu); - nr_running_inc(this_rq); - enqueue_task(p, this_rq->active); - p->timestamp = sched_clock() - - (src_rq->timestamp_last_tick - p->timestamp); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, + this_load, total_pwr, delta; + + max_load = this_load = total_load = total_pwr = 0; + + do { + cpumask_t tmp; + int i; + + /* Tally up the load of all CPUs in the group */ + cpus_and(tmp, group->cpumask, cpu_online_map); + WARN_ON(cpus_empty(tmp)); + + avg_load = 0; + for_each_cpu_mask(i, tmp) + avg_load += cpu_load(i); + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (cpu_isset(this_cpu, group->cpumask)) { + this_load = avg_load; + this = group; + goto nextgroup; + } + if (avg_load > max_load) { + max_load = avg_load; + busiest = group; + } +nextgroup: + group = group->next; + } while (group != sd->groups); + + if (!busiest || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if ((this_load >= avg_load) + || (100*max_load <= sd->imbalance_pct*this_load)) + goto out_balanced; + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. */ - if (TASK_PREEMPTS_CURR(p, this_rq)) - set_need_resched(); + delta = max_load - this_load; + + if (delta > SCHED_LOAD_SCALE) { + delta = min(max_load - avg_load, avg_load - this_load); + /* + * How many tasks to actually move to equalise the + * imbalance: first round up (which will move us across + * the average unless we can precisely balance to the + * average) and get rid of the scaling factor: + */ + delta += SCHED_LOAD_SCALE-1; + *nr_move = delta / SCHED_LOAD_SCALE; + + if (*nr_move) + return busiest; + } + +out_balanced: + if (busiest && idle == NEWLY_IDLE && max_load > SCHED_LOAD_SCALE) { + *nr_move = 1; + return busiest; + } + + *nr_move = 0; + return NULL; } +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +static runqueue_t *find_busiest_queue(struct sched_group *group) +{ + cpumask_t tmp; + unsigned long load, max_load = 0; + runqueue_t *busiest = NULL; + int i; + + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + load = cpu_load(i); + + if (load > max_load) { + max_load = load; + busiest = cpu_rq(i); + } + } + + return busiest; +} + +/* + * Check this_cpu to ensure it is balanced within domain. 
Attempt to move + * tasks if there is an imbalance. + * + * Called with this_rq unlocked. + */ +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + struct sched_group *group; + unsigned long imbalance; + runqueue_t *busiest; + int nr_moved; + + spin_lock(&this_rq->lock); + + group = find_busiest_group(sd, this_cpu, &imbalance, idle); + if (!group) + goto out_balanced; + + busiest = find_busiest_queue(group); + if (!busiest) + goto out_balanced; + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle); + spin_unlock(&this_rq->lock); + spin_unlock(&busiest->lock); + + if (!nr_moved) { + sd->nr_balance_failed++; + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { + int wake = 0; + + spin_lock(&busiest->lock); + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + wake = 1; + } + spin_unlock(&busiest->lock); + if (wake) + wake_up_process(busiest->migration_thread); + /* + * We've kicked active balancing, reset the + * failure counter: + */ + sd->nr_balance_failed = 0; + } + } else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + spin_unlock(&this_rq->lock); -/* - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? - */ -static inline -int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle) -{ - unsigned long delta = rq->timestamp_last_tick - tsk->timestamp; + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; - /* - * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. - */ - if (task_running(rq, tsk)) - return 0; - if (!cpu_isset(this_cpu, tsk->cpus_allowed)) - return 0; - if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks))) - return 0; - return 1; + return 0; } /* - * Current runqueue is empty, or rebalance tick: if there is an - * inbalance (current runqueue is too short) then pull from - * busiest runqueue(s). + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. * - * We call this with the current runqueue locked, - * irqs disabled. + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. 
*/ -static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask) +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) { - int imbalance, idx, this_cpu = smp_processor_id(); - runqueue_t *busiest; - prio_array_t *array; - struct list_head *head, *curr; - task_t *tmp; + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; - if (cpu_is_offline(this_cpu)) + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + if (!group) goto out; - busiest = find_busiest_queue(this_rq, this_cpu, idle, - &imbalance, cpumask); - if (!busiest) + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) goto out; - /* - * We only want to steal a number of tasks equal to 1/2 the imbalance, - * otherwise we'll just shift the imbalance to the new queue: - */ - imbalance /= 2; + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) - array = busiest->expired; - else - array = busiest->active; + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, NEWLY_IDLE); -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired) { - array = busiest->active; - goto new_array; + spin_unlock(&busiest->lock); + +out: + return nr_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd; + + if (unlikely(cpu_is_offline(this_cpu))) + return; + + for_each_domain(this_cpu, sd) { + if (sd->flags & SD_BALANCE_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, sd)) { + /* We've pulled tasks over so stop searching */ + break; + } } - goto out_unlock; } +} - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, task_t, run_list); +/* + * active_load_balance is run by migration threads. It pushes a running + * task off the cpu. It can be required to correctly have at least 1 task + * running on each physical CPU where possible, and not have a physical / + * logical imbalance. + * + * Called with busiest locked. 
+ */ +static void active_load_balance(runqueue_t *busiest, int busiest_cpu) +{ + struct sched_group *group, *busy_group; + struct sched_domain *sd; + int i; - curr = curr->prev; + if (busiest->nr_running <= 1) + return; - if (!can_migrate_task(tmp, busiest, this_cpu, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - pull_task(busiest, array, tmp, this_rq, this_cpu); + for_each_domain(busiest_cpu, sd) + if (cpu_isset(busiest->push_cpu, sd->span)) + break; - /* Only migrate one task if we are idle */ - if (!idle && --imbalance) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; + if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) { + WARN_ON(1); + return; } -out_unlock: - spin_unlock(&busiest->lock); -out: - ; + + group = sd->groups; + while (!cpu_isset(busiest_cpu, group->cpumask)) + group = group->next; + busy_group = group; + + group = sd->groups; + do { + cpumask_t tmp; + runqueue_t *rq; + int push_cpu = 0; + + if (group == busy_group) + goto next_group; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (!cpus_weight(tmp)) + goto next_group; + + for_each_cpu_mask(i, tmp) { + if (!idle_cpu(i)) + goto next_group; + push_cpu = i; + } + + rq = cpu_rq(push_cpu); + double_lock_balance(busiest, rq); + move_tasks(rq, push_cpu, busiest, 1, sd, IDLE); + spin_unlock(&rq->lock); +next_group: + group = group->next; + } while (group != sd->groups); } /* - * One of the idle_cpu_tick() and busy_cpu_tick() functions will - * get called every timer tick, on every CPU. Our balancing action - * frequency and balancing agressivity depends on whether the CPU is - * idle or not. + * rebalance_tick will get called every timer tick, on every CPU. * - * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on - * systems with HZ=100, every 10 msecs.) + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. * - * On NUMA, do a node-rebalance every 400 msecs. + * Balancing parameters are set up in arch_init_sched_domains. */ -#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) -#define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) -#ifdef CONFIG_NUMA -static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) +/* Don't have all balancing operations going off at once */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, + enum idle_type idle) { - int node = find_busiest_node(cpu_to_node(this_cpu)); + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *sd; - if (node >= 0) { - cpumask_t cpumask = node_to_cpumask(node); - cpu_set(this_cpu, cpumask); - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpumask); - spin_unlock(&this_rq->lock); - } -} -#endif + if (unlikely(cpu_is_offline(this_cpu))) + return; -static void rebalance_tick(runqueue_t *this_rq, int idle) -{ -#ifdef CONFIG_NUMA - int this_cpu = smp_processor_id(); -#endif - unsigned long j = jiffies; + for_each_domain(this_cpu, sd) { + unsigned long interval = sd->balance_interval; - /* - * First do inter-node rebalancing, then intra-node rebalancing, - * if both events happen in the same tick. The inter-node - * rebalancing does not necessarily have to create a perfect - * balance within the node, since we load-balance the most loaded - * node with the current CPU. (ie. other CPUs in the local node - * are not balanced.) 
- */ - if (idle) { -#ifdef CONFIG_NUMA - if (!(j % IDLE_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); -#endif - if (!(j % IDLE_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); + if (idle != IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = MSEC_TO_JIFFIES(interval); + if (unlikely(!interval)) + interval = 1; + + if (j - sd->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, sd, idle)) { + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + sd->last_balance += interval; } - return; - } -#ifdef CONFIG_NUMA - if (!(j % BUSY_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); -#endif - if (!(j % BUSY_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); } } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(runqueue_t *this_rq, int idle) +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +{ +} +static inline void idle_balance(int cpu, runqueue_t *rq) { } #endif +static inline int wake_priority_sleeper(runqueue_t *rq) +{ +#ifdef CONFIG_SCHED_SMT + /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + return 1; + } +#endif + return 0; +} + DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); @@ -1500,7 +1808,9 @@ void scheduler_tick(int user_ticks, int cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; - rebalance_tick(rq, 1); + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, IDLE); return; } if (TASK_NICE(p) > 0) @@ -1584,8 +1894,93 @@ void scheduler_tick(int user_ticks, int out_unlock: spin_unlock(&rq->lock); out: - rebalance_tick(rq, 0); + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) +{ + int i; + struct sched_domain *sd = rq->sd; + cpumask_t sibling_map; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return; + + cpus_and(sibling_map, sd->span, cpu_online_map); + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq; + + if (i == cpu) + continue; + + smt_rq = cpu_rq(i); + + /* + * If an SMT sibling task is sleeping due to priority + * reasons wake it up now. + */ + if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) + resched_task(smt_rq->idle); + } +} + +static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) +{ + struct sched_domain *sd = rq->sd; + cpumask_t sibling_map; + int ret = 0, i; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return 0; + + cpus_and(sibling_map, sd->span, cpu_online_map); + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq; + task_t *smt_curr; + + if (i == cpu) + continue; + + smt_rq = cpu_rq(i); + smt_curr = smt_rq->curr; + + /* + * If a user task with lower static priority than the + * running task on the SMT sibling is trying to schedule, + * delay it till there is proportionately less timeslice + * left of the sibling task to prevent a lower priority + * task from using an unfair proportion of the + * physical cpu's resources. 
-ck + */ + if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(p) || rt_task(smt_curr)) && + p->mm && smt_curr->mm && !rt_task(p)) + ret = 1; + + /* + * Reschedule a lower priority task on the SMT sibling, + * or wake it up if it has been put to sleep for priority + * reasons. + */ + if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(smt_curr) || rt_task(p)) && + smt_curr->mm && p->mm && !rt_task(smt_curr)) || + (smt_curr == smt_rq->idle && smt_rq->nr_running)) + resched_task(smt_curr); + } + return ret; +} +#else +static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) +{ +} + +static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) +{ + return 0; } +#endif void scheduling_functions_start_here(void) { } @@ -1601,7 +1996,7 @@ asmlinkage void schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int idx; + int cpu, idx; /* * Test if we are atomic. Since do_exit() needs to call into @@ -1651,13 +2046,13 @@ need_resched: deactivate_task(prev, rq); } + cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { -#ifdef CONFIG_SMP - load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); -#endif + idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); goto switch_tasks; } } @@ -1678,6 +2073,11 @@ need_resched: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + if (dependent_sleeper(cpu, rq, next)) { + next = rq->idle; + goto switch_tasks; + } + if (!rt_task(next) && next->activated > 0) { unsigned long long delta = now - next->timestamp; @@ -2205,7 +2605,7 @@ static int setscheduler(pid_t pid, int p if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (p->prio < rq->curr->prio) + } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } @@ -2404,6 +2804,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + prio_array_t *target = rq->expired; /* * We implement yielding by moving the task into the expired @@ -2412,13 +2813,12 @@ asmlinkage long sys_sched_yield(void) * (special rule: RT tasks will just roundrobin in the active * array.) */ - if (likely(!rt_task(current))) { - dequeue_task(current, array); - enqueue_task(current, rq->expired); - } else { - list_del(¤t->run_list); - list_add_tail(¤t->run_list, array->queue + current->prio); - } + if (unlikely(rt_task(current))) + target = rq->active; + + dequeue_task(current, array); + enqueue_task(current, target); + /* * Since we are going to call schedule() anyway, there's * no need to preempt: @@ -2729,7 +3129,12 @@ int set_cpus_allowed(task_t *p, cpumask_ goto out; } - if (__set_cpus_allowed(p, new_mask, &req)) { + p->cpus_allowed = new_mask; + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpu_isset(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, any_online_cpu(new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -2743,22 +3148,34 @@ out: EXPORT_SYMBOL_GPL(set_cpus_allowed); -/* Move (not current) task off this cpu, onto dest cpu. */ -static void move_task_away(struct task_struct *p, int dest_cpu) +/* + * Move (not current) task off this cpu, onto dest cpu. 
We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_balance_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static void __migrate_task(struct task_struct *p, int dest_cpu) { runqueue_t *rq_dest; rq_dest = cpu_rq(dest_cpu); double_rq_lock(this_rq(), rq_dest); + /* Already moved. */ if (task_cpu(p) != smp_processor_id()) - goto out; /* Already moved */ + goto out; + /* Affinity changed (again). */ + if (!cpu_isset(dest_cpu, p->cpus_allowed)) + goto out; set_task_cpu(p, dest_cpu); if (p->array) { deactivate_task(p, this_rq()); activate_task(p, rq_dest); - if (p->prio < rq_dest->curr->prio) + if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } p->timestamp = rq_dest->timestamp_last_tick; @@ -2788,7 +3205,13 @@ static int migration_thread(void * data) refrigerator(PF_IOTHREAD); spin_lock_irq(&rq->lock); + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + head = &rq->migration_queue; + current->state = TASK_INTERRUPTIBLE; if (list_empty(head)) { spin_unlock_irq(&rq->lock); @@ -2797,11 +3220,19 @@ static int migration_thread(void * data) } req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); + spin_unlock(&rq->lock); - move_task_away(req->task, - any_online_cpu(req->task->cpus_allowed)); + if (req->type == REQ_MOVE_TASK) { + __migrate_task(req->task, req->dest_cpu); + } else if (req->type == REQ_SET_DOMAIN) { + rq->sd = req->sd; + } else { + WARN_ON(1); + } + local_irq_enable(); + complete(&req->done); } return 0; @@ -2857,7 +3288,7 @@ void migrate_all_tasks(void) tsk->pid, tsk->comm, src_cpu); } - move_task_away(tsk, dest_cpu); + __migrate_task(tsk, dest_cpu); } while_each_thread(t, tsk); write_unlock(&tasklist_lock); @@ -2936,23 +3367,285 @@ int __init migration_init(void) spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(kernel_flag); +#ifdef CONFIG_SMP +/* Attach the domain 'sd' to 'cpu' as its base domain */ +void cpu_attach_domain(struct sched_domain *sd, int cpu) +{ + migration_req_t req; + unsigned long flags; + runqueue_t *rq = cpu_rq(cpu); + int local = 1; + + spin_lock_irqsave(&rq->lock, flags); + + if (cpu == smp_processor_id() || cpu_is_offline(cpu)) { + rq->sd = sd; + } else { + init_completion(&req.done); + req.type = REQ_SET_DOMAIN; + req.sd = sd; + list_add(&req.list, &rq->migration_queue); + local = 0; + } + + spin_unlock_irqrestore(&rq->lock, flags); + + if (!local) { + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + } +} + +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __init arch_init_sched_domains(void); +#else +static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +#ifdef CONFIG_NUMA +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_node = NULL, *last_node = NULL; + + /* Set up domains */ + for_each_cpu(i) { + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + struct sched_domain *node_sd = &per_cpu(node_domains, i); + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + + *node_sd = SD_NODE_INIT; + node_sd->span = cpu_possible_map; + node_sd->groups = 
&sched_group_nodes[cpu_to_node(i)]; + + *cpu_sd = SD_CPU_INIT; + cpus_and(cpu_sd->span, nodemask, cpu_possible_map); + cpu_sd->groups = &sched_group_cpus[i]; + cpu_sd->parent = node_sd; + } + + /* Set up groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + struct sched_group *node = &sched_group_nodes[i]; + cpumask_t tmp = node_to_cpumask(i); + cpumask_t nodemask; + int j; + + cpus_and(nodemask, tmp, cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + node->cpumask = nodemask; + node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask); + + for_each_cpu_mask(j, node->cpumask) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + if (!first_node) + first_node = node; + if (last_node) + last_node->next = node; + last_node = node; + } + last_node->next = first_node; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); + } +} + +#else /* !CONFIG_NUMA */ +static void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + + *cpu_sd = SD_CPU_INIT; + cpu_sd->span = cpu_possible_map; + cpu_sd->groups = &sched_group_cpus[i]; + } + + /* Set up CPU groups */ + for_each_cpu_mask(i, cpu_possible_map) { + struct sched_group *cpu = &sched_group_cpus[i]; + + cpus_clear(cpu->cpumask); + cpu_set(i, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); /* domains were modified outside the lock */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); + } +} + +#endif /* CONFIG_NUMA */ +#endif /* ARCH_HAS_SCHED_DOMAIN */ + +#define SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +void sched_domain_debug(void) +{ + int i; + + for_each_cpu(i) { + runqueue_t *rq = cpu_rq(i); + struct sched_domain *sd; + int level = 0; + + sd = rq->sd; + + printk(KERN_DEBUG "CPU%d: %s\n", + i, (cpu_online(i) ? 
" online" : "offline")); + + do { + int j; + char str[NR_CPUS]; + struct sched_group *group = sd->groups; + cpumask_t groupmask, tmp; + + cpumask_scnprintf(str, NR_CPUS, sd->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (j = 0; j < level + 1; j++) + printk(" "); + printk("domain %d: span %s\n", level, str); + + if (!cpu_isset(i, sd->span)) + printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); + if (!cpu_isset(i, group->cpumask)) + printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); + if (!group->cpu_power) + printk(KERN_DEBUG "ERROR domain->cpu_power not set\n"); + + printk(KERN_DEBUG); + for (j = 0; j < level + 2; j++) + printk(" "); + printk("groups:"); + do { + if (!group) { + printk(" ERROR: NULL"); + break; + } + + if (!cpus_weight(group->cpumask)) + printk(" ERROR empty group:"); + + cpus_and(tmp, groupmask, group->cpumask); + if (cpus_weight(tmp) > 0) + printk(" ERROR repeated CPUs:"); + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_scnprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != sd->groups); + printk("\n"); + + if (!cpus_equal(sd->span, groupmask)) + printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); + + level++; + sd = sd->parent; + + if (sd) { + cpus_and(tmp, groupmask, sd->span); + if (!cpus_equal(tmp, groupmask)) + printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); + } + + } while (sd); + } +} +#else +#define sched_domain_debug() {} +#endif + +void __init sched_init_smp(void) +{ + arch_init_sched_domains(); + sched_domain_debug(); +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + void __init sched_init(void) { runqueue_t *rq; int i, j, k; +#ifdef CONFIG_SMP + /* Set up an initial dummy domain for early boot */ + static struct sched_domain sched_domain_init; + static struct sched_group sched_group_init; + cpumask_t cpu_mask_all = CPU_MASK_ALL; + + memset(&sched_domain_init, 0, sizeof(struct sched_domain)); + sched_domain_init.span = cpu_mask_all; + sched_domain_init.groups = &sched_group_init; + sched_domain_init.last_balance = jiffies; + sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ + + memset(&sched_group_init, 0, sizeof(struct sched_group)); + sched_group_init.cpumask = cpu_mask_all; + sched_group_init.next = &sched_group_init; + sched_group_init.cpu_power = SCHED_LOAD_SCALE; +#endif + for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; rq = cpu_rq(i); + spin_lock_init(&rq->lock); rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; - spin_lock_init(&rq->lock); +#ifdef CONFIG_SMP + rq->sd = &sched_domain_init; + rq->active_balance = 0; + rq->push_cpu = 0; + rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); +#endif atomic_set(&rq->nr_iowait, 0); - nr_running_init(rq); for (j = 0; j < 2; j++) { array = rq->arrays + j; @@ -2974,8 +3667,6 @@ void __init sched_init(void) set_task_cpu(current, smp_processor_id()); wake_up_forked_process(current); - init_timers(); - /* * The boot idle thread does lazy MMU switching as well: */