From: Nick Piggin

Move balancing and child-runs-first logic from fork.c into sched.c, where
it belongs.

* Consolidate wake_up_forked_process and wake_up_forked_thread into
  wake_up_new_process, and pass in clone_flags as suggested by Linus.
  This removes a lot of code duplication and allows all the logic to be
  handled in that one function.

* Don't do balance-on-clone balancing for vfork'ed threads.

* Don't do set_task_cpu or balance-on-clone in wake_up_new_process.
  Instead do it in sched_fork, to fix set_cpus_allowed races.

* Don't do child-runs-first for CLONE_VM processes, as there is obviously
  no COW benefit to be had.  This is a big one: it enables Andi's workload
  to run well without clone balancing, because the OpenMP child threads
  can get balanced off to other nodes *before* they start running and
  allocating memory.

* Rename sched_balance_exec to sched_exec: hide the policy from the API.

Signed-off-by: Nick Piggin
Signed-off-by: Andrew Morton
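For illustration only (not part of the patch): the test that decides whether
a freshly cloned task is a balance-on-clone candidate is
(clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM.  A plain fork() has no
CLONE_VM, so it keeps child-runs-first instead, and a vfork() is excluded
because the parent sleeps until the child execs anyway.  The small
user-space sketch below (hypothetical file name, illustrative flag sets)
just evaluates that predicate for the three common cases:

/*
 * balance_on_clone_demo.c (hypothetical) - not part of the patch.
 * Mirrors the clone_flags check that sched_fork() now uses to decide
 * whether a new task may be balanced onto another CPU at clone time.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/* Same predicate as the one added to sched_fork() by this patch. */
static int balances_on_clone(unsigned long clone_flags)
{
	return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM;
}

int main(void)
{
	/* fork(): no CLONE_VM, keeps child-runs-first, not balanced here */
	printf("fork-style  : %d\n", balances_on_clone(0));
	/* vfork(): CLONE_VM|CLONE_VFORK, parent blocks anyway, not balanced */
	printf("vfork-style : %d\n", balances_on_clone(CLONE_VM | CLONE_VFORK));
	/* pthread_create()-style thread: CLONE_VM without CLONE_VFORK */
	printf("thread-style: %d\n",
	       balances_on_clone(CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND));
	return 0;
}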
---

 25-akpm/fs/compat.c           |    4 
 25-akpm/fs/exec.c             |    2 
 25-akpm/include/linux/sched.h |   15 +--
 25-akpm/kernel/fork.c         |   47 +++------
 25-akpm/kernel/sched.c        |  199 ++++++++++++++++++++----------------------
 5 files changed, 121 insertions(+), 146 deletions(-)

diff -puN fs/compat.c~sched-clean-fork fs/compat.c
--- 25/fs/compat.c~sched-clean-fork	2004-07-13 13:17:25.698527760 -0700
+++ 25-akpm/fs/compat.c	2004-07-13 13:17:25.709526088 -0700
@@ -1373,14 +1373,14 @@ int compat_do_execve(char * filename,
 	int retval;
 	int i;
 
-	sched_balance_exec();
-
 	file = open_exec(filename);
 
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
 		return retval;
 
+	sched_exec();
+
 	bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
 	memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0]));
 
diff -puN fs/exec.c~sched-clean-fork fs/exec.c
--- 25/fs/exec.c~sched-clean-fork	2004-07-13 13:17:25.699527608 -0700
+++ 25-akpm/fs/exec.c	2004-07-13 13:17:25.711525784 -0700
@@ -1081,7 +1081,7 @@ int do_execve(char * filename,
 	if (IS_ERR(file))
 		return retval;
 
-	sched_balance_exec();
+	sched_exec();
 
 	bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
 	memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0]));
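A note on where sched_exec() sits (illustrative, not part of the patch):
execve() is the cheapest migration point because the task has its smallest
effective memory and cache footprint there, which is why the balance call
lives inside the exec path, and why in the compat path above it is now made
only after open_exec() has succeeded.  A minimal user-space sketch of the
fork()+execve() pattern this balancing is aimed at (hypothetical file,
/bin/true used only as a trivial target):

/* spawn_demo.c (hypothetical) - not part of the patch. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid < 0) {
		perror("fork");
		return EXIT_FAILURE;
	}
	if (pid == 0) {
		/* Child: execve() is where the kernel calls sched_exec(). */
		char *argv[] = { "/bin/true", NULL };
		char *envp[] = { NULL };
		execve("/bin/true", argv, envp);
		perror("execve");	/* only reached on failure */
		_exit(127);
	}
	waitpid(pid, NULL, 0);
	return 0;
}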
diff -puN include/linux/sched.h~sched-clean-fork include/linux/sched.h
--- 25/include/linux/sched.h~sched-clean-fork	2004-07-13 13:17:25.701527304 -0700
+++ 25-akpm/include/linux/sched.h	2004-07-13 13:17:25.712525632 -0700
@@ -679,10 +679,11 @@ static inline int set_cpus_allowed(task_
 
 extern unsigned long long sched_clock(void);
 
+/* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
-extern void sched_balance_exec(void);
+extern void sched_exec(void);
 #else
-#define sched_balance_exec() {}
+#define sched_exec() {}
 #endif
 
 extern void sched_idle_next(void);
@@ -741,18 +742,14 @@ extern void do_timer(struct pt_regs *);
 
 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
-extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(wake_up_new_process(struct task_struct * tsk,
+					unsigned long clone_flags));
 #ifdef CONFIG_SMP
  extern void kick_process(struct task_struct *tsk);
- extern void FASTCALL(wake_up_forked_thread(struct task_struct * tsk));
 #else
  static inline void kick_process(struct task_struct *tsk) { }
- static inline void wake_up_forked_thread(struct task_struct * tsk)
- {
-	wake_up_forked_process(tsk);
- }
 #endif
-extern void FASTCALL(sched_fork(task_t * p));
+extern void FASTCALL(sched_fork(task_t * p, unsigned long clone_flags));
 extern void FASTCALL(sched_exit(task_t * p));
 
 extern int in_group_p(gid_t);
diff -puN kernel/fork.c~sched-clean-fork kernel/fork.c
--- 25/kernel/fork.c~sched-clean-fork	2004-07-13 13:17:25.703527000 -0700
+++ 25-akpm/kernel/fork.c	2004-07-13 13:17:25.713525480 -0700
@@ -892,6 +892,16 @@ struct task_struct *copy_process(unsigne
 	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
 		return ERR_PTR(-EINVAL);
 
+	/*
+	 * The newly dup'ed task shares the same cpus_allowed mask as its
+	 * parent (ie. current), and it is not attached to the tasklist.
+	 * The end result is that this CPU might go down and the parent
+	 * be migrated away, leaving the task on a dead CPU. So take the
+	 * hotplug lock here and release it after the child has been attached
+	 * to the tasklist.
+	 */
+	lock_cpu_hotplug();
+
 	retval = security_task_create(clone_flags);
 	if (retval)
 		goto fork_out;
@@ -1020,7 +1030,7 @@ struct task_struct *copy_process(unsigne
 	p->pdeath_signal = 0;
 
 	/* Perform scheduler related setup */
-	sched_fork(p);
+	sched_fork(p, clone_flags);
 
 	/*
 	 * Ok, make it visible to the rest of the system.
@@ -1098,6 +1108,7 @@ struct task_struct *copy_process(unsigne
 	retval = 0;
 
 fork_out:
+	unlock_cpu_hotplug();
 	if (retval)
 		return ERR_PTR(retval);
 	return p;
@@ -1204,31 +1215,10 @@ long do_fork(unsigned long clone_flags,
 			set_tsk_thread_flag(p, TIF_SIGPENDING);
 		}
 
-		if (!(clone_flags & CLONE_STOPPED)) {
-			/*
-			 * Do the wakeup last. On SMP we treat fork() and
-			 * CLONE_VM separately, because fork() has already
-			 * created cache footprint on this CPU (due to
-			 * copying the pagetables), hence migration would
-			 * probably be costy. Threads on the other hand
-			 * have less traction to the current CPU, and if
-			 * there's an imbalance then the scheduler can
-			 * migrate this fresh thread now, before it
-			 * accumulates a larger cache footprint:
-			 */
-			if (clone_flags & CLONE_VM)
-				wake_up_forked_thread(p);
-			else
-				wake_up_forked_process(p);
-		} else {
-			int cpu = get_cpu();
-
+		if (!(clone_flags & CLONE_STOPPED))
+			wake_up_new_process(p, clone_flags);
+		else
 			p->state = TASK_STOPPED;
-			if (cpu_is_offline(task_cpu(p)))
-				set_task_cpu(p, cpu);
-
-			put_cpu();
-		}
 
 		++total_forks;
 		if (unlikely (trace)) {
@@ -1240,12 +1230,7 @@ long do_fork(unsigned long clone_flags,
 			wait_for_completion(&vfork);
 			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
 				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
-		} else
-			/*
-			 * Let the child process run first, to avoid most of the
-			 * COW overhead when the child exec()s afterwards.
-			 */
-			set_need_resched();
+		}
 	}
 	return pid;
 }
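Why child-runs-first is now tied to !CLONE_VM (illustrative, not part of the
patch): after a plain fork() every writable page is shared copy-on-write, so
if the parent keeps running and writing before the child's exec, it copies
pages the child is about to discard anyway.  When the VM is shared
(CLONE_VM) there is no COW at all, so the preference buys nothing.  A
user-space sketch of the fork()+exec() case the heuristic targets
(hypothetical file, buffer size arbitrary):

/* cow_demo.c (hypothetical) - not part of the patch. */
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

#define BUF_SIZE (64 * 1024 * 1024)

int main(void)
{
	/* A large dirty buffer: these pages become COW-shared at fork(). */
	char *buf = malloc(BUF_SIZE);

	if (!buf)
		return 1;
	memset(buf, 1, BUF_SIZE);

	if (fork() == 0) {
		/* Child: exec immediately, ideally before the parent runs. */
		execl("/bin/true", "true", (char *)NULL);
		_exit(127);
	}
	/*
	 * Parent: each write below COW-copies a page *if* the parent runs
	 * before the child has exec'ed.  Child-runs-first avoids most of it.
	 */
	memset(buf, 2, BUF_SIZE);
	wait(NULL);
	free(buf);
	return 0;
}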
diff -puN kernel/sched.c~sched-clean-fork kernel/sched.c
--- 25/kernel/sched.c~sched-clean-fork	2004-07-13 13:17:25.705526696 -0700
+++ 25-akpm/kernel/sched.c	2004-07-13 13:17:25.717524872 -0700
@@ -866,12 +866,50 @@ int fastcall wake_up_state(task_t *p, un
 	return try_to_wake_up(p, state, 0);
 }
 
+#ifdef CONFIG_SMP
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
+			   struct sched_domain *sd);
+#endif
+
 /*
  * Perform scheduler related setup for a newly forked process p.
- * p is forked by current.
+ * p is forked by current. It also does runqueue balancing.
+ * The cpu hotplug lock is held.
  */
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, unsigned long clone_flags)
 {
+	int cpu;
+#ifdef CONFIG_SMP
+	struct sched_domain *tmp, *sd = NULL;
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) {
+		/*
+		 * New thread that is not a vfork.
+		 * Find the largest domain that this CPU is part of that
+		 * is willing to balance on clone:
+		 */
+		for_each_domain(cpu, tmp)
+			if (tmp->flags & SD_BALANCE_CLONE)
+				sd = tmp;
+		if (sd)
+			cpu = find_idlest_cpu(p, cpu, sd);
+	}
+	preempt_enable();
+#else
+	cpu = smp_processor_id();
+#endif
+	/*
+	 * The task hasn't been attached yet, so cpus_allowed mask cannot
+	 * change. The cpus_allowed mask of the parent may have changed
+	 * after it is copied, and it may then move to a CPU that is not
+	 * allowed for the child.
+	 */
+	if (unlikely(!cpu_isset(cpu, p->cpus_allowed)))
+		cpu = any_online_cpu(p->cpus_allowed);
+	set_task_cpu(p, cpu);
+
 	/*
 	 * We mark the process as running here, but have not actually
 	 * inserted it onto the runqueue yet. This guarantees that
@@ -921,43 +959,81 @@ void fastcall sched_fork(task_t *p)
 }
 
 /*
- * wake_up_forked_process - wake up a freshly forked process.
+ * wake_up_new_process - wake up a newly created process for the first time.
  *
  * This function will do some initial scheduler statistics housekeeping
- * that must be done for every newly created process.
+ * that must be done for every newly created context, then puts the process
+ * on the runqueue and wakes it.
  */
-void fastcall wake_up_forked_process(task_t * p)
+void fastcall wake_up_new_process(task_t * p, unsigned long clone_flags)
 {
 	unsigned long flags;
-	runqueue_t *rq = task_rq_lock(current, &flags);
+	int this_cpu, cpu;
+	runqueue_t *rq;
+
+	rq = task_rq_lock(p, &flags);
+	cpu = task_cpu(p);
+	this_cpu = smp_processor_id();
 
 	BUG_ON(p->state != TASK_RUNNING);
 
 	/*
 	 * We decrease the sleep average of forking parents
 	 * and children as well, to keep max-interactive tasks
-	 * from forking tasks that are max-interactive.
+	 * from forking tasks that are max-interactive. The parent
+	 * (current) is done further down, under its lock.
 	 */
-	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
-		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
 	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
 		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 
 	p->interactive_credit = 0;
 
 	p->prio = effective_prio(p);
-	set_task_cpu(p, smp_processor_id());
 
-	if (unlikely(!current->array))
+	if (likely(cpu == this_cpu)) {
+		if (!(clone_flags & CLONE_VM)) {
+			/*
+			 * The VM isn't cloned, so we're in a good position to
+			 * do child-runs-first in anticipation of an exec. This
+			 * usually avoids a lot of COW overhead.
+			 */
+			if (unlikely(!current->array))
+				__activate_task(p, rq);
+			else {
+				p->prio = current->prio;
+				list_add_tail(&p->run_list, &current->run_list);
+				p->array = current->array;
+				p->array->nr_active++;
+				rq->nr_running++;
+			}
+			set_need_resched();
+		} else {
+			/* Run child last */
+			__activate_task(p, rq);
+		}
+	} else {
+		runqueue_t *this_rq = cpu_rq(this_cpu);
+
+		/*
+		 * Not the local CPU - must adjust timestamp. This should
+		 * get optimised away in the !CONFIG_SMP case.
+		 */
+		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
+					+ rq->timestamp_last_tick;
 		__activate_task(p, rq);
-	else {
-		p->prio = current->prio;
-		list_add_tail(&p->run_list, &current->run_list);
-		p->array = current->array;
-		p->array->nr_active++;
-		rq->nr_running++;
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+
+		current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
			PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+	}
+
+	if (unlikely(cpu != this_cpu)) {
+		task_rq_unlock(rq, &flags);
+		rq = task_rq_lock(current, &flags);
 	}
+	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 	task_rq_unlock(rq, &flags);
 }
 
@@ -1227,89 +1303,6 @@ static int find_idlest_cpu(struct task_s
 }
 
 /*
- * wake_up_forked_thread - wake up a freshly forked thread.
- *
- * This function will do some initial scheduler statistics housekeeping
- * that must be done for every newly created context, and it also does
- * runqueue balancing.
- */
-void fastcall wake_up_forked_thread(task_t * p)
-{
-	unsigned long flags;
-	int this_cpu = get_cpu(), cpu;
-	struct sched_domain *tmp, *sd = NULL;
-	runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
-
-	/*
-	 * Find the largest domain that this CPU is part of that
-	 * is willing to balance on clone:
-	 */
-	for_each_domain(this_cpu, tmp)
-		if (tmp->flags & SD_BALANCE_CLONE)
-			sd = tmp;
-	if (sd)
-		cpu = find_idlest_cpu(p, this_cpu, sd);
-	else
-		cpu = this_cpu;
-
-	local_irq_save(flags);
-lock_again:
-	rq = cpu_rq(cpu);
-	double_rq_lock(this_rq, rq);
-
-	BUG_ON(p->state != TASK_RUNNING);
-
-	/*
-	 * We did find_idlest_cpu() unlocked, so in theory
-	 * the mask could have changed - just dont migrate
-	 * in this case:
-	 */
-	if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
-		cpu = this_cpu;
-		double_rq_unlock(this_rq, rq);
-		goto lock_again;
-	}
-	/*
-	 * We decrease the sleep average of forking parents
-	 * and children as well, to keep max-interactive tasks
-	 * from forking tasks that are max-interactive.
-	 */
-	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
-		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
-	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
-		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
-	p->interactive_credit = 0;
-
-	p->prio = effective_prio(p);
-	set_task_cpu(p, cpu);
-
-	if (cpu == this_cpu) {
-		if (unlikely(!current->array))
-			__activate_task(p, rq);
-		else {
-			p->prio = current->prio;
-			list_add_tail(&p->run_list, &current->run_list);
-			p->array = current->array;
-			p->array->nr_active++;
-			rq->nr_running++;
-		}
-	} else {
-		/* Not the local CPU - must adjust timestamp */
-		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
-					+ rq->timestamp_last_tick;
-		__activate_task(p, rq);
-		if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
-	}
-
-	double_rq_unlock(this_rq, rq);
-	local_irq_restore(flags);
-	put_cpu();
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -1342,13 +1335,13 @@ out:
 }
 
 /*
- * sched_balance_exec(): find the highest-level, exec-balance-capable
+ * sched_exec(): find the highest-level, exec-balance-capable
  * domain and try to migrate the task to the least loaded CPU.
  *
  * execve() is a valuable balancing opportunity, because at this point
 * the task has the smallest effective memory and cache footprint.
 */
-void sched_balance_exec(void)
+void sched_exec(void)
 {
	struct sched_domain *tmp, *sd = NULL;
	int new_cpu, this_cpu = get_cpu();
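On the domain walk that sched_fork() now does (illustrative, not part of the
patch): for_each_domain() starts at the CPU's base domain and moves outward
to ever larger spans, so remembering the last domain that carries
SD_BALANCE_CLONE selects the widest span willing to balance on clone.  A
stand-alone sketch with an invented struct and flag name:

/* domain_walk_demo.c (hypothetical) - not part of the patch. */
#include <stdio.h>

#define FLAG_BALANCE_CLONE 0x1	/* stand-in for SD_BALANCE_CLONE */

struct domain {
	const char *name;
	int flags;
	struct domain *parent;	/* next larger span, NULL at the top */
};

int main(void)
{
	struct domain node = { "node", 0, NULL };
	struct domain phys = { "phys", FLAG_BALANCE_CLONE, &node };
	struct domain smt  = { "smt",  FLAG_BALANCE_CLONE, &phys };

	struct domain *tmp, *sd = NULL;

	/* Equivalent of: for_each_domain(cpu, tmp) if (flag set) sd = tmp; */
	for (tmp = &smt; tmp; tmp = tmp->parent)
		if (tmp->flags & FLAG_BALANCE_CLONE)
			sd = tmp;

	printf("widest balance-on-clone domain: %s\n", sd ? sd->name : "none");
	return 0;
}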
_
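On the timestamp adjustment in wake_up_new_process() when the child lands on
a remote runqueue (illustrative, not part of the patch): each runqueue keeps
its own timestamp_last_tick, so a timestamp recorded against the local
runqueue's clock is rebased by subtracting the local tick and adding the
destination runqueue's tick.  A stand-alone sketch with assumed values:

/* timestamp_rebase_demo.c (hypothetical) - not part of the patch. */
#include <stdio.h>

typedef unsigned long long u64;

/* Rebase 'stamp' from the clock of src_last_tick to dst_last_tick. */
static u64 rebase_timestamp(u64 stamp, u64 src_last_tick, u64 dst_last_tick)
{
	return (stamp - src_last_tick) + dst_last_tick;
}

int main(void)
{
	u64 this_rq_last_tick = 5000000ULL;	/* local CPU, assumed value */
	u64 rq_last_tick      = 4998000ULL;	/* remote CPU, assumed value */
	u64 p_timestamp       = 5000750ULL;	/* shortly after the local tick */

	/* Prints 4998750: the same offset, expressed in the remote timebase. */
	printf("rebased timestamp: %llu\n",
	       rebase_timestamp(p_timestamp, this_rq_last_tick, rq_last_tick));
	return 0;
}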