From: Srivatsa Vaddagiri

migrate_all_tasks is currently run with the rest of the machine stopped.  It
iterates through the complete task table, turning off cpu affinity of any
task that it finds affine to the dying cpu.  Depending on the task table
size this can take considerable time, and all that time the machine is
stopped, doing nothing.

Stopping the machine for such extended periods can be avoided if we do the
task migration from the CPU_DEAD notification instead, and that's precisely
what this patch does.

The patch puts the idle task at the _front_ of the dying CPU's runqueue at
the highest possible priority.  This causes the idle thread to run
_immediately_ after the kstopmachine thread yields.  The idle thread notices
that its cpu is offline and dies quickly.  Task migration can then be done
at leisure in the CPU_DEAD notification, while the rest of the CPUs are
running.

Some advantages of this approach are:

- More scalable.  The machine is stopped for a predictable amount of time.

- No changes to hot path/core code.  We are just exploiting the scheduler
  rule that the highest-priority runnable task on the runqueue is run next.
  Also, since the idle task is put at the _front_ of the runqueue, there are
  no races when an equally high-priority task is woken up and added to the
  runqueue.  It gets in at the back of the runqueue, _after_ the idle task!
  (A small standalone sketch of this ordering follows the patch.)

- The cpu_is_offline check that is presently required in try_to_wake_up,
  idle_balance and rebalance_tick can be removed, thus speeding them up a
  bit.

---

 25-akpm/include/linux/sched.h |    3 -
 25-akpm/kernel/cpu.c          |   29 +++++++----
 25-akpm/kernel/fork.c         |   10 +++
 25-akpm/kernel/sched.c        |  108 +++++++++++++++++++++++++++++++-----------
 4 files changed, 111 insertions(+), 39 deletions(-)

diff -puN include/linux/sched.h~sched-move-migrate_all_tasks-to-cpu_dead-handling include/linux/sched.h
--- 25/include/linux/sched.h~sched-move-migrate_all_tasks-to-cpu_dead-handling	2004-05-04 20:54:58.815801768 -0700
+++ 25-akpm/include/linux/sched.h	2004-05-04 20:54:58.823800552 -0700
@@ -678,8 +678,7 @@ extern void sched_balance_exec(void);
 #define sched_balance_exec() {}
 #endif
 
-/* Move tasks off this (offline) CPU onto another. */
-extern void migrate_all_tasks(void);
+extern void sched_idle_next(void);
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
diff -puN kernel/cpu.c~sched-move-migrate_all_tasks-to-cpu_dead-handling kernel/cpu.c
--- 25/kernel/cpu.c~sched-move-migrate_all_tasks-to-cpu_dead-handling	2004-05-04 20:54:58.816801616 -0700
+++ 25-akpm/kernel/cpu.c	2004-05-04 20:54:58.824800400 -0700
@@ -43,15 +43,16 @@ void unregister_cpu_notifier(struct noti
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static inline void check_for_tasks(int cpu, struct task_struct *k)
+static inline void check_for_tasks(int cpu)
 {
 	struct task_struct *p;
 
 	write_lock_irq(&tasklist_lock);
 	for_each_process(p) {
-		if (task_cpu(p) == cpu && p != k)
-			printk(KERN_WARNING "Task %s is on cpu %d\n",
-				p->comm, cpu);
+		if (task_cpu(p) == cpu && (p->utime != 0 || p->stime != 0))
+			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
+				(state = %ld, flags = %lx) \n",
+				 p->comm, p->pid, cpu, p->state, p->flags);
 	}
 	write_unlock_irq(&tasklist_lock);
 }
@@ -96,8 +97,9 @@ static int take_cpu_down(void *unused)
 	if (err < 0)
 		cpu_set(smp_processor_id(), cpu_online_map);
 	else
-		/* Everyone else gets kicked off. */
-		migrate_all_tasks();
+		/* Force idle task to run as soon as we yield: it should
+		   immediately notice cpu is offline and die quickly. */
+		sched_idle_next();
 
 	return err;
 }
@@ -106,6 +108,7 @@ int cpu_down(unsigned int cpu)
 {
 	int err;
 	struct task_struct *p;
+	cpumask_t old_allowed, tmp;
 
 	if ((err = lock_cpu_hotplug_interruptible()) != 0)
 		return err;
@@ -120,17 +123,21 @@ int cpu_down(unsigned int cpu)
 		goto out;
 	}
 
+	/* Ensure that we are not runnable on dying cpu */
+	old_allowed = current->cpus_allowed;
+	tmp = CPU_MASK_ALL;
+	cpu_clear(cpu, tmp);
+	set_cpus_allowed(current, tmp);
+
 	p = __stop_machine_run(take_cpu_down, NULL, cpu);
 	if (IS_ERR(p)) {
 		err = PTR_ERR(p);
-		goto out;
+		goto out_allowed;
 	}
 
 	if (cpu_online(cpu))
 		goto out_thread;
 
-	check_for_tasks(cpu, p);
-
 	/* Wait for it to sleep (leaving idle task). */
 	while (!idle_cpu(cpu))
 		yield();
@@ -146,10 +153,14 @@ int cpu_down(unsigned int cpu)
 	    == NOTIFY_BAD)
 		BUG();
 
+	check_for_tasks(cpu);
+
 	cpu_run_sbin_hotplug(cpu, "offline");
 
 out_thread:
 	err = kthread_stop(p);
+out_allowed:
+	set_cpus_allowed(current, old_allowed);
 out:
 	unlock_cpu_hotplug();
 	return err;
diff -puN kernel/fork.c~sched-move-migrate_all_tasks-to-cpu_dead-handling kernel/fork.c
--- 25/kernel/fork.c~sched-move-migrate_all_tasks-to-cpu_dead-handling	2004-05-04 20:54:58.818801312 -0700
+++ 25-akpm/kernel/fork.c	2004-05-04 20:54:58.825800248 -0700
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -1229,8 +1230,15 @@ long do_fork(unsigned long clone_flags,
 				wake_up_forked_thread(p);
 			else
 				wake_up_forked_process(p);
-		} else
+		} else {
+			int cpu = get_cpu();
+
 			p->state = TASK_STOPPED;
+			if (unlikely(cpu_is_offline(task_cpu(p))))
+				set_task_cpu(p, cpu);
+
+			put_cpu();
+		}
 		++total_forks;
 		if (unlikely (trace)) {
diff -puN kernel/sched.c~sched-move-migrate_all_tasks-to-cpu_dead-handling kernel/sched.c
--- 25/kernel/sched.c~sched-move-migrate_all_tasks-to-cpu_dead-handling	2004-05-04 20:54:58.819801160 -0700
+++ 25-akpm/kernel/sched.c	2004-05-04 20:54:58.828799792 -0700
@@ -386,6 +386,15 @@ static inline void __activate_task(task_
 	rq->nr_running++;
 }
 
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task_head(p, rq->active);
+	rq->nr_running++;
+}
+
 static void recalc_task_prio(task_t *p, unsigned long long now)
 {
 	unsigned long long __sleep_time = now - p->timestamp;
@@ -749,7 +758,7 @@ static int try_to_wake_up(task_t * p, un
 	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
-	if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
+	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
 	new_cpu = cpu;
@@ -1781,9 +1790,6 @@ static inline void idle_balance(int this
 {
 	struct sched_domain *sd;
 
-	if (unlikely(cpu_is_offline(this_cpu)))
-		return;
-
 	for_each_domain(this_cpu, sd) {
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			if (load_balance_newidle(this_cpu, this_rq, sd)) {
@@ -1871,9 +1877,6 @@ static void rebalance_tick(int this_cpu,
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *sd;
 
-	if (unlikely(cpu_is_offline(this_cpu)))
-		return;
-
 	/* Update our load */
 	old_load = this_rq->cpu_load;
 	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
@@ -3332,18 +3335,19 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
  */
-static void __migrate_task(struct task_struct *p, int dest_cpu)
+static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
-	runqueue_t *rq_dest;
+	runqueue_t *rq_dest, *rq_src;
 
 	if (unlikely(cpu_is_offline(dest_cpu)))
 		return;
 
+	rq_src = cpu_rq(src_cpu);
 	rq_dest = cpu_rq(dest_cpu);
 
-	double_rq_lock(this_rq(), rq_dest);
+	double_rq_lock(rq_src, rq_dest);
 	/* Already moved. */
-	if (task_cpu(p) != smp_processor_id())
+	if (task_cpu(p) != src_cpu)
 		goto out;
 	/* Affinity changed (again). */
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
@@ -3351,7 +3355,7 @@ static void __migrate_task(struct task_s
 
 	set_task_cpu(p, dest_cpu);
 	if (p->array) {
-		deactivate_task(p, this_rq());
+		deactivate_task(p, rq_src);
 		activate_task(p, rq_dest);
 		if (TASK_PREEMPTS_CURR(p, rq_dest))
 			resched_task(rq_dest->curr);
@@ -3359,7 +3363,7 @@ static void __migrate_task(struct task_s
 		p->timestamp = rq_dest->timestamp_last_tick;
 
 out:
-	double_rq_unlock(this_rq(), rq_dest);
+	double_rq_unlock(rq_src, rq_dest);
 }
 
 /*
@@ -3383,6 +3387,12 @@ static int migration_thread(void * data)
 			refrigerator(PF_FREEZE);
 
 		spin_lock_irq(&rq->lock);
+
+		if (unlikely(cpu_is_offline(cpu))) {
+			spin_unlock_irq(&rq->lock);
+			goto wait_to_die;
+		}
+
 		if (rq->active_balance) {
 			active_load_balance(rq, cpu);
 			rq->active_balance = 0;
@@ -3401,7 +3411,8 @@ static int migration_thread(void * data)
 
 		if (req->type == REQ_MOVE_TASK) {
 			spin_unlock(&rq->lock);
-			__migrate_task(req->task, req->dest_cpu);
+			__migrate_task(req->task, smp_processor_id(),
+					req->dest_cpu);
 			local_irq_enable();
 		} else if (req->type == REQ_SET_DOMAIN) {
 			rq->sd = req->sd;
@@ -3414,23 +3425,27 @@ static int migration_thread(void * data)
 		complete(&req->done);
 	}
 	return 0;
+
+wait_to_die:
+	/* Wait for kthread_stop */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-/* migrate_all_tasks - function to migrate all the tasks from the
- * current cpu caller must have already scheduled this to the target
- * cpu via set_cpus_allowed. Machine is stopped. */
-void migrate_all_tasks(void)
+/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */
+static void migrate_all_tasks(int src_cpu)
 {
 	struct task_struct *tsk, *t;
-	int dest_cpu, src_cpu;
+	int dest_cpu;
 	unsigned int node;
 
-	/* We're nailed to this CPU. */
-	src_cpu = smp_processor_id();
-
-	/* Not required, but here for neatness. */
-	write_lock(&tasklist_lock);
+	write_lock_irq(&tasklist_lock);
 
 	/* watch out for per node tasks, let's stay on this node */
 	node = cpu_to_node(src_cpu);
@@ -3466,10 +3481,36 @@ void migrate_all_tasks(void)
 				tsk->pid, tsk->comm, src_cpu);
 		}
 
-		__migrate_task(tsk, dest_cpu);
+		__migrate_task(tsk, src_cpu, dest_cpu);
 	} while_each_thread(t, tsk);
 
-	write_unlock(&tasklist_lock);
+	write_unlock_irq(&tasklist_lock);
+}
+
+/* Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible and adding it to
+ * the _front_ of runqueue. Used by CPU offline code.
+ */
+void sched_idle_next(void)
+{
+	int cpu = smp_processor_id();
+	runqueue_t *rq = this_rq();
+	struct task_struct *p = rq->idle;
+	unsigned long flags;
+
+	/* cpu has to be offline */
+	BUG_ON(cpu_online(cpu));
+
+	/* Strictly not necessary since rest of the CPUs are stopped by now
+	 * and interrupts disabled on current cpu.
+	 */
+	spin_lock_irqsave(&rq->lock, flags);
+
+	__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+	/* Add idle task to _front_ of it's priority queue */
+	__activate_idle_task(p, rq);
+
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 #endif /* CONFIG_HOTPLUG_CPU */
@@ -3505,11 +3546,20 @@ static int migration_call(struct notifie
 	case CPU_UP_CANCELED:
 		/* Unbind it from offline cpu so it can run. Fall thru. */
 		kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
+		kthread_stop(cpu_rq(cpu)->migration_thread);
+		cpu_rq(cpu)->migration_thread = NULL;
+		break;
 	case CPU_DEAD:
+		migrate_all_tasks(cpu);
 		rq = cpu_rq(cpu);
 		kthread_stop(rq->migration_thread);
 		rq->migration_thread = NULL;
-		BUG_ON(rq->nr_running != 0);
+		/* Idle task back to normal (off runqueue, low prio) */
+		rq = task_rq_lock(rq->idle, &flags);
+		deactivate_task(rq->idle, rq);
+		__setscheduler(rq->idle, SCHED_NORMAL, MAX_PRIO);
+		task_rq_unlock(rq, &flags);
+		BUG_ON(rq->nr_running != 0);
 
 		/* No need to migrate the tasks: it was best-effort if
 		 * they didn't do lock_cpu_hotplug(). Just wake up
@@ -3530,8 +3580,12 @@ static int migration_call(struct notifie
 	return NOTIFY_OK;
 }
 
+/* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before everything else.
+ */
 static struct notifier_block __devinitdata migration_notifier = {
 	.notifier_call = migration_call,
+	.priority = 10
 };
 
 int __init migration_init(void)
_
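For illustration only, not part of the patch: a minimal, self-contained
user-space sketch of the ordering argument in the changelog.  The names here
(struct fake_task, enqueue_head, enqueue_tail, pick_next) are made up for
the example and are not the scheduler's real data structures; they only
mimic how a single per-priority run list behaves when wakeups are enqueued
at the tail and the idle task is forced in at the head.

	/* Standalone demo: head insertion vs. tail insertion in one
	 * priority list.  Build with: gcc -Wall demo.c && ./a.out
	 */
	#include <stdio.h>

	struct fake_task {
		const char *name;
		struct fake_task *next;
	};

	static struct fake_task *queue_head;	/* one priority level's run list */
	static struct fake_task *queue_tail;

	/* Normal wakeup path: a newly runnable task goes to the back. */
	static void enqueue_tail(struct fake_task *t)
	{
		t->next = NULL;
		if (queue_tail)
			queue_tail->next = t;
		else
			queue_head = t;
		queue_tail = t;
	}

	/* What sched_idle_next relies on: the idle task goes to the front. */
	static void enqueue_head(struct fake_task *t)
	{
		t->next = queue_head;
		queue_head = t;
		if (!queue_tail)
			queue_tail = t;
	}

	/* The scheduler always picks the task at the head of the list. */
	static struct fake_task *pick_next(void)
	{
		struct fake_task *t = queue_head;

		if (t) {
			queue_head = t->next;
			if (!queue_head)
				queue_tail = NULL;
			t->next = NULL;
		}
		return t;
	}

	int main(void)
	{
		struct fake_task idle   = { "idle",              NULL };
		struct fake_task wakeup = { "equal-prio wakeup", NULL };

		enqueue_head(&idle);	/* idle forced to the front of its queue */
		enqueue_tail(&wakeup);	/* racing wakeup lands behind it */

		printf("runs first:  %s\n", pick_next()->name);	/* "idle" */
		printf("runs second: %s\n", pick_next()->name);	/* "equal-prio wakeup" */
		return 0;
	}

The real patch gets the same ordering from enqueue_task_head() on the active
array, combined with the SCHED_FIFO/MAX_RT_PRIO-1 boost done under the
runqueue lock in sched_idle_next().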