From: Nick Piggin

This patch removes the per-runqueue cpu_load[NR_CPUS] array.  Each time we
want to check a remote CPU's load we check nr_running as well anyway, so
introduce a cpu_load which is the load of the local runqueue and is kept
updated in the timer tick.  Put them in the same cacheline.

This has the additional benefit that cpu_load is consistent across all CPUs
and more up to date.  It is sampled better too, being updated once per timer
tick.

This shouldn't make much difference in scheduling behaviour, but all
benchmarks are as good as or better on the 16-way NUMAQ: hackbench, reaim
and volanomark are about the same, tbench and dbench are maybe a bit better,
and kernbench is about one percent better.

John reckons it isn't a big deal, but it does save 4K per CPU or 2MB total
on his big systems, so I figure it must be a bit kinder on the caches.  I
think it is just nicer in general anyway.

---

 25-akpm/kernel/sched.c |   73 ++++++++++++++++++++-----------------------
 1 files changed, 30 insertions(+), 43 deletions(-)

diff -puN kernel/sched.c~sched-local-load kernel/sched.c
--- 25/kernel/sched.c~sched-local-load	2004-04-02 12:16:05.543694408 -0800
+++ 25-akpm/kernel/sched.c	2004-04-02 12:16:05.551693192 -0800
@@ -201,19 +201,24 @@ struct prio_array {
  */
 struct runqueue {
 	spinlock_t lock;
+
+	/*
+	 * nr_running and cpu_load should be in the same cacheline because
+	 * remote CPUs use both these fields when doing load calculation.
+	 */
+	unsigned long nr_running;
+#ifdef CONFIG_SMP
+	unsigned long cpu_load;
+#endif
 	unsigned long long nr_switches;
-	unsigned long nr_running, expired_timestamp, nr_uninterruptible;
+	unsigned long expired_timestamp, nr_uninterruptible;
 	unsigned long long timestamp_last_tick;
 	task_t *curr, *idle;
 	struct mm_struct *prev_mm;
 	prio_array_t *active, *expired, arrays[2];
 	int best_expired_prio;
-	atomic_t nr_iowait;
 
-#ifdef CONFIG_SMP
-	unsigned long cpu_load[NR_CPUS];
-#endif
 
 	/* For active balancing */
 	int active_balance;
 	int push_cpu;
@@ -605,35 +610,22 @@ void kick_process(task_t *p)
 EXPORT_SYMBOL_GPL(kick_process);
 
 /*
- * Return a low guess at the load of cpu. Update previous history if update
- * is true
+ * Return a low guess at the load of cpu.
  */
-static inline unsigned long get_low_cpu_load(int cpu, int update)
+static inline unsigned long get_low_cpu_load(int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	runqueue_t *this_rq = this_rq();
-	unsigned long nr = rq->nr_running << SCHED_LOAD_SHIFT;
-	unsigned long load = this_rq->cpu_load[cpu];
-	unsigned long ret = min(nr, load);
-
-	if (update)
-		this_rq->cpu_load[cpu] = (nr + load) / 2;
+	unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT;
 
-	return ret;
+	return min(rq->cpu_load, load_now);
 }
 
-static inline unsigned long get_high_cpu_load(int cpu, int update)
+static inline unsigned long get_high_cpu_load(int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	runqueue_t *this_rq = this_rq();
-	unsigned long nr = rq->nr_running << SCHED_LOAD_SHIFT;
-	unsigned long load = this_rq->cpu_load[cpu];
-	unsigned long ret = max(nr, load);
-
-	if (update)
-		this_rq->cpu_load[cpu] = (nr + load) / 2;
+	unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT;
 
-	return ret;
+	return max(rq->cpu_load, load_now);
 }
 
 #endif
@@ -724,8 +716,8 @@ static int try_to_wake_up(task_t * p, un
 		goto out_activate;
 
 	/* Passive load balancing */
-	load = get_low_cpu_load(cpu, 1);
-	this_load = get_high_cpu_load(this_cpu, 1) + SCHED_LOAD_SCALE;
+	load = get_low_cpu_load(cpu);
+	this_load = get_high_cpu_load(this_cpu) + SCHED_LOAD_SCALE;
 	if (load > this_load) {
 		new_cpu = sched_balance_wake(this_cpu, p);
 		set_task_cpu(p, new_cpu);
@@ -1158,9 +1150,9 @@ static int sched_best_cpu(struct task_st
 	for_each_cpu_mask(i, tmp) {
 		unsigned long load;
 		if (i == this_cpu)
-			load = get_low_cpu_load(i, 0);
+			load = get_low_cpu_load(i);
 		else
-			load = get_high_cpu_load(i, 0) + SCHED_LOAD_SCALE;
+			load = get_high_cpu_load(i) + SCHED_LOAD_SCALE;
 
 		if (min_load > load) {
 			best_cpu = i;
@@ -1349,7 +1341,6 @@ find_busiest_group(struct sched_domain *
 {
 	unsigned long max_load, avg_load, total_load, this_load;
 	unsigned int total_pwr;
-	int modify;
 	struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups;
 
 	max_load = 0;
@@ -1360,16 +1351,6 @@ find_busiest_group(struct sched_domain *
 	if (group == NULL)
 		goto out_balanced;
 
-	/*
-	 * Don't modify when we newly become idle because that ruins our
-	 * statistics: its triggered by some value of nr_running (ie. 0).
-	 * Timer based balancing is a good statistic though.
-	 */
-	if (idle == NEWLY_IDLE)
-		modify = 0;
-	else
-		modify = 1;
-
 	do {
 		cpumask_t tmp;
 		unsigned long load;
@@ -1384,9 +1365,9 @@ find_busiest_group(struct sched_domain *
 		for_each_cpu_mask(i, tmp) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group) {
-				load = get_high_cpu_load(i, modify);
+				load = get_high_cpu_load(i);
 			} else
-				load = get_low_cpu_load(i, modify);
+				load = get_low_cpu_load(i);
 
 			nr_cpus++;
 			avg_load += load;
@@ -1502,7 +1483,7 @@ static runqueue_t *find_busiest_queue(st
 	for_each_cpu_mask(i, tmp) {
 		unsigned long load;
 
-		load = get_low_cpu_load(i, 0);
+		load = get_low_cpu_load(i);
 
 		if (load > max_load) {
 			max_load = load;
@@ -1730,12 +1711,18 @@ next_group:
 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 				enum idle_type idle)
 {
+	unsigned long old_load, this_load;
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *domain = this_sched_domain();
 
 	if (unlikely(cpu_is_offline(this_cpu)))
 		return;
 
+	/* Update our load */
+	old_load = this_rq->cpu_load;
+	this_load = this_rq->nr_running << SCHED_LOAD_SHIFT;
+	this_rq->cpu_load = (old_load + this_load) / 2;
+
 	/* Run through all this CPU's domains */
 	do {
 		unsigned long interval;
_
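
For anyone who wants to play with the numbers outside the kernel, here is a
minimal userspace sketch of the scheme.  It is not kernel code: the
runqueue_model struct, the SCHED_LOAD_SHIFT value of 7 and the main() tick
driver are stand-ins of mine; only the min/max guesses and the
(old + now) / 2 tick update mirror what the patch does.

/* Illustrative model only, not the kernel's definitions. */
#include <stdio.h>

#define SCHED_LOAD_SHIFT	7	/* assumed: 1 << 7 == 128 */

struct runqueue_model {
	unsigned long nr_running;	/* instantaneous runnable task count */
	unsigned long cpu_load;		/* smoothed load, updated once per tick */
};

/* Low guess at a runqueue's load: never above the instantaneous load. */
static unsigned long get_low_cpu_load(const struct runqueue_model *rq)
{
	unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT;

	return rq->cpu_load < load_now ? rq->cpu_load : load_now;
}

/* High guess at a runqueue's load: never below the instantaneous load. */
static unsigned long get_high_cpu_load(const struct runqueue_model *rq)
{
	unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT;

	return rq->cpu_load > load_now ? rq->cpu_load : load_now;
}

/*
 * The tick update: move cpu_load halfway toward the instantaneous load,
 * as rebalance_tick() does for the local runqueue in the patch.
 */
static void tick_update_load(struct runqueue_model *rq)
{
	unsigned long this_load = rq->nr_running << SCHED_LOAD_SHIFT;

	rq->cpu_load = (rq->cpu_load + this_load) / 2;
}

int main(void)
{
	struct runqueue_model rq = { 0, 0 };
	unsigned long runnable[] = { 4, 4, 1, 0, 0, 3 };
	unsigned int i;

	for (i = 0; i < sizeof(runnable) / sizeof(runnable[0]); i++) {
		rq.nr_running = runnable[i];
		tick_update_load(&rq);
		printf("tick %u: nr_running=%lu cpu_load=%lu low=%lu high=%lu\n",
		       i, rq.nr_running, rq.cpu_load,
		       get_low_cpu_load(&rq), get_high_cpu_load(&rq));
	}
	return 0;
}

Running it shows why the low guess is used for remote (pull) candidates and
the high guess for the local CPU: the smoothed cpu_load lags spikes in
nr_running in both directions, so taking min/max against the instantaneous
value keeps the balancer from overreacting to a load that has already gone
away.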