From e2e7c2098d3618e9952a38282d8115dc59db08c4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 3 Jul 2009 08:30:05 -0500
Subject: [PATCH] sched: mmdrop needs to be delayed on -rt

commit 5b6e135f5e1e9e5586ad69e35c96494a4b413a00 in tip.

[PG: upstream per_cpu__ prefix removal (dd17c8f729) caused an implicit
(and hard to spot) shadowing of the percpu desched_task with a local
var of the same name in __mmdrop_delayed, so add cpu_ prefix]

Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
Signed-off-by: Paul Gortmaker
---
 include/linux/mm_types.h |    3 +
 include/linux/sched.h    |    8 +++
 kernel/fork.c            |  154 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched.c           |   10 +++-
 4 files changed, 174 insertions(+), 1 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b8bb9a6..a977b30 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -270,6 +270,9 @@ struct mm_struct {
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
+	/* realtime bits */
+	struct list_head delayed_drop;
+
 	/* Swap token stuff */
 	/*
 	 * Last value of global fault stamp as seen by this process.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c73160..5dfd465 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2176,12 +2176,20 @@ extern struct mm_struct * mm_alloc(void);
 
 /* mmdrop drops the mm and the page tables */
 extern void __mmdrop(struct mm_struct *);
+extern void __mmdrop_delayed(struct mm_struct *);
+
 static inline void mmdrop(struct mm_struct * mm)
 {
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
 		__mmdrop(mm);
 }
 
+static inline void mmdrop_delayed(struct mm_struct * mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))
+		__mmdrop_delayed(mm);
+}
+
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
 /* Grab a reference to a task's mm, if it is not already going away */
diff --git a/kernel/fork.c b/kernel/fork.c
index f2fac69..c4a9e7b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -38,6 +38,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -65,6 +66,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #include
 #include
@@ -99,6 +102,14 @@ int lockdep_tasklist_lock_is_held(void)
 EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
 #endif /* #ifdef CONFIG_PROVE_RCU */
 
+/*
+ * Delayed mmdrop. In the PREEMPT_RT case we
+ * dont want to do this from the scheduling
+ * context.
+ */
+static DEFINE_PER_CPU(struct task_struct *, desched_task);
+static DEFINE_PER_CPU(struct list_head, delayed_drop_list);
+
 int nr_processes(void)
 {
 	int cpu;
@@ -192,6 +203,8 @@ void __put_task_struct(struct task_struct *tsk)
 
 void __init fork_init(unsigned long mempages)
 {
+	int i;
+
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
@@ -222,6 +235,9 @@ void __init fork_init(unsigned long mempages)
 	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 	init_task.signal->rlim[RLIMIT_SIGPENDING] =
 		init_task.signal->rlim[RLIMIT_NPROC];
+
+	for (i = 0; i < NR_CPUS; i++)
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i));
 }
 
 int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
@@ -307,6 +323,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
+	INIT_LIST_HEAD(&mm->delayed_drop);
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
@@ -1279,7 +1296,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			attach_pid(p, PIDTYPE_SID, task_session(current));
 			list_add_tail(&p->sibling, &p->real_parent->children);
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
+			preempt_disable();
 			__get_cpu_var(process_counts)++;
+			preempt_enable();
 		}
 		attach_pid(p, PIDTYPE_PID, pid);
 		nr_threads++;
@@ -1753,3 +1772,138 @@ int unshare_files(struct files_struct **displaced)
 	task_unlock(task);
 	return 0;
 }
+
+static int mmdrop_complete(void)
+{
+	struct list_head *head;
+	int ret = 0;
+
+	head = &get_cpu_var(delayed_drop_list);
+	while (!list_empty(head)) {
+		struct mm_struct *mm = list_entry(head->next,
+					struct mm_struct, delayed_drop);
+		list_del(&mm->delayed_drop);
+		put_cpu_var(delayed_drop_list);
+
+		__mmdrop(mm);
+		ret = 1;
+
+		head = &get_cpu_var(delayed_drop_list);
+	}
+	put_cpu_var(delayed_drop_list);
+
+	return ret;
+}
+
+/*
+ * We dont want to do complex work from the scheduler, thus
+ * we delay the work to a per-CPU worker thread:
+ */
+void __mmdrop_delayed(struct mm_struct *mm)
+{
+	struct task_struct *cpu_desched_task;
+	struct list_head *head;
+
+	head = &get_cpu_var(delayed_drop_list);
+	list_add_tail(&mm->delayed_drop, head);
+	cpu_desched_task = __get_cpu_var(desched_task);
+	if (cpu_desched_task)
+		wake_up_process(cpu_desched_task);
+	put_cpu_var(delayed_drop_list);
+}
+
+static void takeover_delayed_drop(int hotcpu)
+{
+	struct list_head *head = &per_cpu(delayed_drop_list, hotcpu);
+
+	while (!list_empty(head)) {
+		struct mm_struct *mm = list_entry(head->next,
+					struct mm_struct, delayed_drop);
+
+		list_del(&mm->delayed_drop);
+		__mmdrop_delayed(mm);
+	}
+}
+
+static int desched_thread(void * __bind_cpu)
+{
+	set_user_nice(current, -10);
+	current->flags |= PF_NOFREEZE;
+	current->extra_flags |= PFE_SOFTIRQ;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+
+		if (mmdrop_complete())
+			continue;
+		schedule();
+
+		/*
+		 * This must be called from time to time on ia64, and is a
+		 * no-op on other archs. Used to be in cpu_idle(), but with
+		 * the new -rt semantics it can't stay there.
+		 */
+		check_pgt_cache();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+
+		BUG_ON(per_cpu(desched_task, hotcpu));
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu));
+		p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk("desched_thread for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+		per_cpu(desched_task, hotcpu) = p;
+		kthread_bind(p, hotcpu);
+		break;
+	case CPU_ONLINE:
+
+		wake_up_process(per_cpu(desched_task, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+
+		/* Unbind so it can run. Fall thru. */
+		kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+
+		p = per_cpu(desched_task, hotcpu);
+		per_cpu(desched_task, hotcpu) = NULL;
+		kthread_stop(p);
+		takeover_delayed_drop(hotcpu);
+		takeover_tasklets(hotcpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init int spawn_desched_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+	return 0;
+}
diff --git a/kernel/sched.c b/kernel/sched.c
index c41e84c..d56e54d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2915,8 +2915,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_lock_switch(rq, prev);
 
 	fire_sched_in_preempt_notifiers(current);
+	/*
+	 * Delay the final freeing of the mm or task, so that we dont have
+	 * to do complex work from within the scheduler:
+	 */
 	if (mm)
-		mmdrop(mm);
+		mmdrop_delayed(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
 		/*
 		 * Remove function-return probe instances associated with this
@@ -5916,7 +5920,11 @@ void idle_task_exit(void)
 
 	if (mm != &init_mm)
 		switch_mm(mm, &init_mm, current);
+#ifdef CONFIG_PREEMPT_RT
+	mmdrop_delayed(mm);
+#else
 	mmdrop(mm);
+#endif
 }
 
 /* called under rq->lock with disabled interrupts */
-- 
1.7.0.4
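
For readers who want the deferral pattern in isolation: the hot path added by
this patch (mmdrop_delayed()) only queues the mm on a per-CPU list and wakes a
helper thread; the helper (desched_thread()) then performs the real __mmdrop()
from schedulable context. Below is a minimal userspace analogue of that
pattern, assuming only POSIX threads; names such as drop_delayed, drop_thread
and deferred_obj are illustrative and do not appear in the patch.

/* Deferral sketch: the "hot path" only queues and wakes; the helper frees. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct deferred_obj {
	struct deferred_obj *next;
	char payload[64];
};

static struct deferred_obj *drop_list;	/* objects waiting to be freed */
static pthread_mutex_t drop_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drop_wake = PTHREAD_COND_INITIALIZER;
static int done;

/* hot path: cheap - just queue the object and wake the helper */
static void drop_delayed(struct deferred_obj *obj)
{
	pthread_mutex_lock(&drop_lock);
	obj->next = drop_list;
	drop_list = obj;
	pthread_cond_signal(&drop_wake);
	pthread_mutex_unlock(&drop_lock);
}

/* helper thread: does the actual (potentially expensive) free */
static void *drop_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&drop_lock);
	while (!done || drop_list) {
		while (drop_list) {
			struct deferred_obj *obj = drop_list;

			drop_list = obj->next;
			pthread_mutex_unlock(&drop_lock);
			free(obj);	/* expensive work, off the hot path */
			pthread_mutex_lock(&drop_lock);
		}
		if (!done)
			pthread_cond_wait(&drop_wake, &drop_lock);
	}
	pthread_mutex_unlock(&drop_lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;
	int i;

	pthread_create(&tid, NULL, drop_thread, NULL);
	for (i = 0; i < 8; i++)
		drop_delayed(calloc(1, sizeof(struct deferred_obj)));

	pthread_mutex_lock(&drop_lock);
	done = 1;
	pthread_cond_signal(&drop_wake);
	pthread_mutex_unlock(&drop_lock);
	pthread_join(tid, NULL);
	printf("all deferred objects freed\n");
	return 0;
}

As in the kernel version, the lock is dropped around the expensive free so
that producers can keep queueing while the helper works through the backlog.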