From: Ingo Molnar

Implement balancing during clone().  It does the following things:

 - introduces SD_BALANCE_CLONE that can serve as a tool for an
   architecture to limit the search-idlest-CPU scope on clone().
   E.g. 512-CPU systems should rather not enable this.

 - uses the highest sd for the imbalance_pct, not this_rq (which didn't
   make sense).

 - unifies balance-on-exec and balance-on-clone via the find_idlest_cpu()
   function.  Gets rid of sched_best_cpu(), which was still a bit
   inconsistent IMO: it used 'min_load < load' as the condition for
   balancing, while a more correct approach is to use half of the
   imbalance_pct, like passive balancing does (a stand-alone sketch of
   this threshold follows the patch).

 - the patch also reintroduces the possibility of doing SD_BALANCE_EXEC
   on SMP systems, and activates it - to get testing.

 - NOTE: there's one thing in this patch that is slightly unclean: I
   introduced wake_up_forked_thread.  I did this to make it easier to
   get rid of this patch later (wake_up_forked_process() has lots of
   dependencies in various architectures).  If this capability remains
   in the kernel then I'll clean it up and introduce a single function
   for wake_up_forked_process/thread.

 - NOTE2: I added the SD_BALANCE_CLONE flag to the NUMA CPU template
   too.  Some NUMA architectures probably want to disable this.

---

 25-akpm/include/linux/sched.h |   23 ++++-
 25-akpm/kernel/fork.c         |   20 ++++
 25-akpm/kernel/sched.c        |  169 +++++++++++++++++++++++++++++++++---------
 3 files changed, 167 insertions(+), 45 deletions(-)

diff -puN include/linux/sched.h~sched-balance-context include/linux/sched.h
--- 25/include/linux/sched.h~sched-balance-context	2004-04-12 22:53:15.499176072 -0700
+++ 25-akpm/include/linux/sched.h	2004-04-12 22:53:15.508174704 -0700
@@ -546,10 +546,11 @@ do { if (atomic_dec_and_test(&(tsk)->usa

 #define SD_BALANCE_NEWIDLE	1	/* Balance when about to become idle */
 #define SD_BALANCE_EXEC		2	/* Balance on exec */
-#define SD_WAKE_IDLE		4	/* Wake to idle CPU on task wakeup */
-#define SD_WAKE_AFFINE		8	/* Wake task to waking CPU */
-#define SD_WAKE_BALANCE		16	/* Perform balancing at task wakeup */
-#define SD_SHARE_CPUPOWER	32	/* Domain members share cpu power */
+#define SD_BALANCE_CLONE	4	/* Balance on clone */
+#define SD_WAKE_IDLE		8	/* Wake to idle CPU on task wakeup */
+#define SD_WAKE_AFFINE		16	/* Wake task to waking CPU */
+#define SD_WAKE_BALANCE		32	/* Perform balancing at task wakeup */
+#define SD_SHARE_CPUPOWER	64	/* Domain members share cpu power */

 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
@@ -597,6 +598,8 @@ struct sched_domain {
 	.cache_nice_tries	= 0,			\
 	.per_cpu_gain		= 15,			\
 	.flags			= SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_EXEC	\
+				| SD_BALANCE_CLONE	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_IDLE		\
 				| SD_SHARE_CPUPOWER,	\
@@ -618,6 +621,8 @@ struct sched_domain {
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_EXEC	\
+				| SD_BALANCE_CLONE	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
@@ -639,6 +644,7 @@ struct sched_domain {
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_BALANCE_EXEC	\
+				| SD_BALANCE_CLONE	\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
@@ -658,7 +664,7 @@ static inline int set_cpus_allowed(task_

 extern unsigned long long sched_clock(void);

-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SMP
 extern void sched_balance_exec(void);
 #else
 #define sched_balance_exec() {}
@@ -716,12 +722,17 @@ extern void do_timer(struct pt_regs *);

 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
 #ifdef CONFIG_SMP
 extern void kick_process(struct task_struct *tsk);
+extern void FASTCALL(wake_up_forked_thread(struct task_struct * tsk));
 #else
 static inline void kick_process(struct task_struct *tsk) { }
+static inline void wake_up_forked_thread(struct task_struct * tsk)
+{
+	wake_up_forked_process(tsk);
+}
 #endif
-extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));

 extern void FASTCALL(sched_fork(task_t * p));
 extern void FASTCALL(sched_exit(task_t * p));
diff -puN kernel/fork.c~sched-balance-context kernel/fork.c
--- 25/kernel/fork.c~sched-balance-context	2004-04-12 22:53:15.502175616 -0700
+++ 25-akpm/kernel/fork.c	2004-04-12 22:53:15.509174552 -0700
@@ -1177,9 +1177,23 @@ long do_fork(unsigned long clone_flags,
 			set_tsk_thread_flag(p, TIF_SIGPENDING);
 		}

-		if (!(clone_flags & CLONE_STOPPED))
-			wake_up_forked_process(p);	/* do this last */
-		else
+		if (!(clone_flags & CLONE_STOPPED)) {
+			/*
+			 * Do the wakeup last.  On SMP we treat fork() and
+			 * CLONE_VM separately, because fork() has already
+			 * created cache footprint on this CPU (due to
+			 * copying the pagetables), hence migration would
+			 * probably be costly.  Threads, on the other hand,
+			 * have less traction on the current CPU, and if
+			 * there's an imbalance then the scheduler can
+			 * migrate this fresh thread now, before it
+			 * accumulates a larger cache footprint:
+			 */
+			if (clone_flags & CLONE_VM)
+				wake_up_forked_thread(p);
+			else
+				wake_up_forked_process(p);
+		} else
 			p->state = TASK_STOPPED;

 		++total_forks;
diff -puN kernel/sched.c~sched-balance-context kernel/sched.c
--- 25/kernel/sched.c~sched-balance-context	2004-04-12 22:53:15.503175464 -0700
+++ 25-akpm/kernel/sched.c	2004-04-12 22:53:15.512174096 -0700
@@ -1156,7 +1156,133 @@ enum idle_type
 };

 #ifdef CONFIG_SMP
-#ifdef CONFIG_NUMA
+
+/*
+ * find_idlest_cpu - find the least busy runqueue.
+ */
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
+			   struct sched_domain *sd)
+{
+	unsigned long load, min_load, this_load;
+	int i, min_cpu;
+	cpumask_t mask;
+
+	min_cpu = UINT_MAX;
+	min_load = ULONG_MAX;
+
+	cpus_and(mask, sd->span, cpu_online_map);
+	cpus_and(mask, mask, p->cpus_allowed);
+
+	for_each_cpu_mask(i, mask) {
+		load = target_load(i);
+
+		if (load < min_load) {
+			min_cpu = i;
+			min_load = load;
+
+			/* break out early on an idle CPU: */
+			if (!min_load)
+				break;
+		}
+	}
+
+	/* add +1 to account for the new task */
+	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+
+	/*
+	 * With the new task added to the current CPU, would
+	 * there be an imbalance between this CPU and the
+	 * idlest CPU?
+	 *
+	 * Use half of the balancing threshold - new-context is
+	 * a good opportunity to balance.
+	 */
+	if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
+		return min_cpu;
+
+	return this_cpu;
+}
+
+/*
+ * wake_up_forked_thread - wake up a freshly forked thread.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, and it also does
+ * runqueue balancing.
+ */
+void fastcall wake_up_forked_thread(task_t * p)
+{
+	unsigned long flags;
+	int this_cpu = get_cpu(), cpu;
+	struct sched_domain *tmp, *sd = NULL;
+	runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
+
+	/*
+	 * Find the largest domain that this CPU is part of that
+	 * is willing to balance on clone:
+	 */
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_BALANCE_CLONE)
+			sd = tmp;
+	if (sd)
+		cpu = find_idlest_cpu(p, this_cpu, sd);
+	else
+		cpu = this_cpu;
+
+	local_irq_save(flags);
+lock_again:
+	rq = cpu_rq(cpu);
+	double_rq_lock(this_rq, rq);
+
+	BUG_ON(p->state != TASK_RUNNING);
+
+	/*
+	 * We did find_idlest_cpu() unlocked, so in theory
+	 * the mask could have changed - just don't migrate
+	 * in this case:
+	 */
+	if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
+		cpu = this_cpu;
+		double_rq_unlock(this_rq, rq);
+		goto lock_again;
+	}
+	/*
+	 * We decrease the sleep average of forking parents
+	 * and children as well, to keep max-interactive tasks
+	 * from forking tasks that are max-interactive.
+	 */
+	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
+		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+	p->interactive_credit = 0;
+
+	p->prio = effective_prio(p);
+	set_task_cpu(p, cpu);
+
+	if (cpu == this_cpu) {
+		if (unlikely(!current->array))
+			__activate_task(p, rq);
+		else {
+			p->prio = current->prio;
+			list_add_tail(&p->run_list, &current->run_list);
+			p->array = current->array;
+			p->array->nr_active++;
+			rq->nr_running++;
+		}
+	} else {
+		__activate_task(p, rq);
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+
+	double_rq_unlock(this_rq, rq);
+	local_irq_restore(flags);
+	put_cpu();
+}
+
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -1197,34 +1323,6 @@ out:
 }

 /*
- * Find the least loaded CPU.  Slightly favor the current CPU by
- * setting its load as the minimum to start.
- */
-static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
-{
-	cpumask_t tmp;
-	int i, min_load, this_cpu, best_cpu;
-
-	best_cpu = this_cpu = task_cpu(p);
-	min_load = INT_MAX;
-
-	cpus_and(tmp, sd->span, cpu_online_map);
-	for_each_cpu_mask(i, tmp) {
-		unsigned long load;
-		if (i == this_cpu)
-			load = source_load(i);
-		else
-			load = target_load(i) + SCHED_LOAD_SCALE;
-
-		if (min_load > load) {
-			best_cpu = i;
-			min_load = load;
-		}
-	}
-	return best_cpu;
-}
-
-/*
  * sched_balance_exec(): find the highest-level, exec-balance-capable
  * domain and try to migrate the task to the least loaded CPU.
  *
@@ -1233,19 +1331,19 @@ static int sched_best_cpu(struct task_st
  */
 void sched_balance_exec(void)
 {
-	struct sched_domain *sd, *best_sd = NULL;
+	struct sched_domain *tmp, *sd = NULL;
 	int new_cpu, this_cpu = get_cpu();

 	/* Prefer the current CPU if there's only this task running */
 	if (this_rq()->nr_running <= 1)
 		goto out;

-	for_each_domain(this_cpu, sd)
-		if (sd->flags & SD_BALANCE_EXEC)
-			best_sd = sd;
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_BALANCE_EXEC)
+			sd = tmp;

-	if (best_sd) {
-		new_cpu = sched_best_cpu(current, best_sd);
+	if (sd) {
+		new_cpu = find_idlest_cpu(current, this_cpu, sd);
 		if (new_cpu != this_cpu) {
 			put_cpu();
 			sched_migrate_task(current, new_cpu);
@@ -1255,7 +1353,6 @@ void sched_balance_exec(void)
 out:
 	put_cpu();
 }
-#endif /* CONFIG_NUMA */

 /*
  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
_
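
For illustration, here is a minimal stand-alone sketch of the half-imbalance_pct
threshold that find_idlest_cpu() applies above.  This is plain user-space C,
not the kernel code: the per-CPU load numbers, the CPU count, the LOAD_SCALE
value of 128 and the imbalance_pct of 125 are made-up example inputs, not
values taken from any particular sched domain.

/*
 * Illustrative analogue of the balance-on-clone decision: pick the
 * least loaded CPU, then migrate the new context only if the gap
 * exceeds half of the domain's imbalance threshold.
 */
#include <stdio.h>
#include <limits.h>

#define LOAD_SCALE	128UL	/* one task's worth of load (example value) */

static int pick_cpu(const unsigned long *load, int nr_cpus,
		    int this_cpu, unsigned int imbalance_pct)
{
	unsigned long min_load = ULONG_MAX, this_load;
	int i, min_cpu = this_cpu;

	for (i = 0; i < nr_cpus; i++) {
		if (load[i] < min_load) {
			min_cpu = i;
			min_load = load[i];
			if (!min_load)
				break;	/* an idle CPU is good enough */
		}
	}

	/* account for the task we are about to add locally */
	this_load = load[this_cpu] + LOAD_SCALE;

	/* half of the usual threshold - new contexts are cheap to move */
	if (min_load * (100 + (imbalance_pct - 100) / 2) < this_load * 100)
		return min_cpu;

	return this_cpu;
}

int main(void)
{
	/* example loads: CPU0 runs 3 tasks, CPU2 is idle */
	unsigned long load[4] = { 3 * LOAD_SCALE, 2 * LOAD_SCALE,
				  0, 1 * LOAD_SCALE };

	printf("clone on CPU0 goes to CPU%d\n", pick_cpu(load, 4, 0, 125));
	return 0;
}

With an imbalance_pct of 125 (the example value above), passive balancing wants
roughly a 25% gap before it moves tasks, while the clone/exec path only needs
about 12.5% - a context that has not run yet is about as cheap to migrate as it
will ever be.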