diff -urN linux-2.4.19-pre7-ac2-rml/arch/i386/kernel/i8259.c linux/arch/i386/kernel/i8259.c
--- linux-2.4.19-pre7-ac2-rml/arch/i386/kernel/i8259.c	Sat Apr 20 17:55:10 2002
+++ linux/arch/i386/kernel/i8259.c	Sat Apr 20 18:29:10 2002
@@ -79,7 +79,6 @@
  * through the ICC by us (IPIs)
  */
 #ifdef CONFIG_SMP
-BUILD_SMP_INTERRUPT(task_migration_interrupt,TASK_MIGRATION_VECTOR)
 BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
 BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
 BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
@@ -474,9 +473,6 @@
 	 */
 	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
-	/* IPI for task migration */
-	set_intr_gate(TASK_MIGRATION_VECTOR, task_migration_interrupt);
-
 	/* IPI for invalidation */
 	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
 
diff -urN linux-2.4.19-pre7-ac2-rml/arch/i386/kernel/smp.c linux/arch/i386/kernel/smp.c
--- linux-2.4.19-pre7-ac2-rml/arch/i386/kernel/smp.c	Sat Apr 20 17:55:10 2002
+++ linux/arch/i386/kernel/smp.c	Sat Apr 20 18:29:10 2002
@@ -484,35 +484,6 @@
 	do_flush_tlb_all_local();
 }
 
-static spinlock_t migration_lock = SPIN_LOCK_UNLOCKED;
-static task_t *new_task;
-
-/*
- * This function sends a 'task migration' IPI to another CPU.
- * Must be called from syscall contexts, with interrupts *enabled*.
- */
-void smp_migrate_task(int cpu, task_t *p)
-{
-	/*
-	 * The target CPU will unlock the migration spinlock:
-	 */
-	spin_lock(&migration_lock);
-	new_task = p;
-	send_IPI_mask(1 << cpu, TASK_MIGRATION_VECTOR);
-}
-
-/*
- * Task migration callback.
- */
-asmlinkage void smp_task_migration_interrupt(void)
-{
-	task_t *p;
-
-	ack_APIC_irq();
-	p = new_task;
-	spin_unlock(&migration_lock);
-	sched_task_migrated(p);
-}
 /*
  * this function sends a 'reschedule' IPI to another CPU.
  * it goes straight through and wastes no time serializing
diff -urN linux-2.4.19-pre7-ac2-rml/include/asm-i386/hw_irq.h linux/include/asm-i386/hw_irq.h
--- linux-2.4.19-pre7-ac2-rml/include/asm-i386/hw_irq.h	Sat Apr 20 17:54:55 2002
+++ linux/include/asm-i386/hw_irq.h	Sat Apr 20 18:29:10 2002
@@ -41,8 +41,7 @@
 #define ERROR_APIC_VECTOR	0xfe
 #define INVALIDATE_TLB_VECTOR	0xfd
 #define RESCHEDULE_VECTOR	0xfc
-#define TASK_MIGRATION_VECTOR	0xfb
-#define CALL_FUNCTION_VECTOR	0xfa
+#define CALL_FUNCTION_VECTOR	0xfb
 
 /*
  * Local APIC timer IRQ vector is on a different priority level,
diff -urN linux-2.4.19-pre7-ac2-rml/include/linux/sched.h linux/include/linux/sched.h
--- linux-2.4.19-pre7-ac2-rml/include/linux/sched.h	Sat Apr 20 18:24:11 2002
+++ linux/include/linux/sched.h	Sat Apr 20 18:29:10 2002
@@ -149,8 +149,7 @@
 extern void update_one_process(task_t *p, unsigned long user,
 			unsigned long system, int cpu);
 extern void scheduler_tick(int user_tick, int system);
-extern void sched_task_migrated(task_t *p);
-extern void smp_migrate_task(int cpu, task_t *task);
+extern void migration_init(void);
 extern unsigned long cache_decay_ticks;
 extern int set_user(uid_t new_ruid, int dumpclear);
 
@@ -450,7 +449,12 @@
  */
 #define _STK_LIM	(8*1024*1024)
 
+#if CONFIG_SMP
 extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
+#else
+#define set_cpus_allowed(p, new_mask)	do { } while (0)
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
diff -urN linux-2.4.19-pre7-ac2-rml/init/main.c linux/init/main.c
--- linux-2.4.19-pre7-ac2-rml/init/main.c	Sat Apr 20 17:54:55 2002
+++ linux/init/main.c	Sat Apr 20 18:29:10 2002
@@ -458,6 +458,10 @@
  */
 static void __init do_basic_setup(void)
 {
+	/* Start the per-CPU migration threads */
+#if CONFIG_SMP
+	migration_init();
+#endif
 
 	/*
 	 * Tell the world that we're going to be the grim
diff -urN linux-2.4.19-pre7-ac2-rml/kernel/ksyms.c linux/kernel/ksyms.c
--- linux-2.4.19-pre7-ac2-rml/kernel/ksyms.c	Sat Apr 20 17:54:55 2002
+++ linux/kernel/ksyms.c	Sat Apr 20 18:29:10 2002
@@ -443,7 +443,9 @@
 EXPORT_SYMBOL(schedule_timeout);
 EXPORT_SYMBOL(sys_sched_yield);
 EXPORT_SYMBOL(set_user_nice);
-EXPORT_SYMBOL(set_cpus_allowed);
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL_GPL(set_cpus_allowed);
+#endif
 EXPORT_SYMBOL(jiffies);
 EXPORT_SYMBOL(xtime);
 EXPORT_SYMBOL(do_gettimeofday);
diff -urN linux-2.4.19-pre7-ac2-rml/kernel/sched.c linux/kernel/sched.c
--- linux-2.4.19-pre7-ac2-rml/kernel/sched.c	Sat Apr 20 18:28:18 2002
+++ linux/kernel/sched.c	Sat Apr 20 18:29:10 2002
@@ -144,6 +144,8 @@
 	task_t *curr, *idle;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_nr_running[NR_CPUS];
+	task_t *migration_thread;
+	list_t migration_queue;
 } ____cacheline_aligned;
 
 static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
@@ -284,20 +286,6 @@
 }
 
 /*
- * The SMP message passing code calls this function whenever
- * the new task has arrived at the target CPU. We move the
- * new task into the local runqueue.
- *
- * This function must be called with interrupts disabled.
- */
-void sched_task_migrated(task_t *new_task)
-{
-	wait_task_inactive(new_task);
-	new_task->cpu = smp_processor_id();
-	wake_up_process(new_task);
-}
-
-/*
  * Kick the remote CPU if the task is running currently,
  * this code is used by the signal code to signal tasks
  * which are in user-mode as quickly as possible.
@@ -962,34 +950,6 @@
 	return timeout;
 }
 
-/*
- * Change the current task's CPU affinity. Migrate the process to a
- * proper CPU and schedule away if the current CPU is removed from
- * the allowed bitmask.
- */
-void set_cpus_allowed(task_t *p, unsigned long new_mask)
-{
-	new_mask &= cpu_online_map;
-	if (!new_mask)
-		BUG();
-	if (p != current)
-		BUG();
-
-	p->cpus_allowed = new_mask;
-	/*
-	 * Can the task run on the current CPU? If not then
-	 * migrate the process off to a proper CPU.
-	 */
-	if (new_mask & (1UL << smp_processor_id()))
-		return;
-#if CONFIG_SMP
-	current->state = TASK_UNINTERRUPTIBLE;
-	smp_migrate_task(__ffs(new_mask), current);
-
-	schedule();
-#endif
-}
-
 void scheduling_functions_end_here(void) { }
 
 void set_user_nice(task_t *p, long nice)
@@ -1475,6 +1435,7 @@
 	rq->expired = rq->arrays + 1;
 	spin_lock_init(&rq->lock);
 	spin_lock_init(&rq->frozen);
+	INIT_LIST_HEAD(&rq->migration_queue);
 
 	for (j = 0; j < 2; j++) {
 		array = rq->arrays + j;
@@ -1506,3 +1467,216 @@
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current, smp_processor_id());
 }
+
+#if CONFIG_SMP
+
+/*
+ * This is how migration works:
+ *
+ * 1) we queue a migration_req_t structure in the source CPU's
+ *    runqueue and wake up that CPU's migration thread.
+ * 2) we down() the locked semaphore => thread blocks.
+ * 3) migration thread wakes up (implicitly it forces the migrated
+ *    thread off the CPU)
+ * 4) it gets the migration request and checks whether the migrated
+ *    task is still in the wrong runqueue.
+ * 5) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 6) migration thread up()s the semaphore.
+ * 7) we wake up and the migration is done.
+ */
+
+typedef struct {
+	list_t list;
+	task_t *task;
+	struct semaphore sem;
+} migration_req_t;
+
+/*
+ * Change a given task's CPU affinity. Migrate the process to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. No
+ * spinlocks can be held.
+ */
+void set_cpus_allowed(task_t *p, unsigned long new_mask)
+{
+	unsigned long flags;
+	migration_req_t req;
+	runqueue_t *rq;
+
+	new_mask &= cpu_online_map;
+	if (!new_mask)
+		BUG();
+
+	rq = task_rq_lock(p, &flags);
+	p->cpus_allowed = new_mask;
+	/*
+	 * Can the task run on the task's current CPU? If not then
+	 * migrate the process off to a proper CPU.
+	 */
+	if (new_mask & (1UL << p->cpu)) {
+		task_rq_unlock(rq, &flags);
+		return;
+	}
+
+	init_MUTEX_LOCKED(&req.sem);
+	req.task = p;
+	list_add(&req.list, &rq->migration_queue);
+	task_rq_unlock(rq, &flags);
+	wake_up_process(rq->migration_thread);
+
+	down(&req.sem);
+}
+
+/*
+ * Treat the bits of migration_mask as lock bits.
+ * If the bit corresponding to the cpu a migration_thread is
+ * running on is already set, then we have failed to claim our
+ * cpu and must yield in order to find another.
+ */
+static volatile unsigned long migration_mask;
+static atomic_t migration_threads_seeking_cpu;
+static struct completion migration_complete
+	= COMPLETION_INITIALIZER(migration_complete);
+
+static int migration_thread(void * unused)
+{
+	struct sched_param param = { sched_priority: MAX_RT_PRIO - 1 };
+	runqueue_t *rq;
+	int ret;
+
+	daemonize();
+	sigfillset(&current->blocked);
+	set_fs(KERNEL_DS);
+	ret = setscheduler(0, SCHED_FIFO, &param);
+
+	/*
+	 * We have to migrate manually - there is no migration thread
+	 * to do this for us yet :-)
+	 *
+	 * We use the following property of the Linux scheduler. At
+	 * this point no other task is running, so by keeping all
+	 * migration threads running, the load-balancer will distribute
+	 * them between all CPUs equally. At that point every migration
+	 * task binds itself to the current CPU.
+	 */
+
+	/*
+	 * Enter the loop with preemption disabled so that
+	 * smp_processor_id() remains valid through the check. The
+	 * interior of the wait loop re-enables preemption in an
+	 * attempt to get scheduled off the current cpu. When the
+	 * loop is exited the lock bit in migration_mask is acquired
+	 * and preemption is disabled on the way out. This way the
+	 * cpu acquired remains valid when ->cpus_allowed is set.
+	 */
+	while (test_and_set_bit(smp_processor_id(), &migration_mask))
+		yield();
+
+	current->cpus_allowed = 1 << smp_processor_id();
+	rq = this_rq();
+	rq->migration_thread = current;
+
+	/*
+	 * Now that we've bound ourselves to a cpu, post to
+	 * migration_threads_seeking_cpu and wait for everyone else.
+	 * Preemption should remain disabled and the cpu should remain
+	 * in busywait. Yielding the cpu would allow the livelock
+	 * where a timing pattern causes an idle task seeking a
+	 * migration_thread to always find the unbound migration_thread
+	 * running on the cpus it tries to steal tasks from.
+	 */
+	atomic_dec(&migration_threads_seeking_cpu);
+	while (atomic_read(&migration_threads_seeking_cpu))
+		cpu_relax();
+
+	sprintf(current->comm, "migration_CPU%d", smp_processor_id());
+
+	/*
+	 * Everyone's found their cpu, so now wake migration_init().
+	 * Multiple wakeups are harmless; removal from the waitqueue
+	 * has locking built-in, and waking an empty queue is valid.
+	 */
+	complete(&migration_complete);
+
+	/*
+	 * Initiate the event loop.
+	 */
+	for (;;) {
+		runqueue_t *rq_src, *rq_dest;
+		struct list_head *head;
+		int cpu_src, cpu_dest;
+		migration_req_t *req;
+		unsigned long flags;
+		task_t *p;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		head = &rq->migration_queue;
+		current->state = TASK_INTERRUPTIBLE;
+		if (list_empty(head)) {
+			spin_unlock_irqrestore(&rq->lock, flags);
+			schedule();
+			continue;
+		}
+		req = list_entry(head->next, migration_req_t, list);
+		list_del_init(head->next);
+		spin_unlock_irqrestore(&rq->lock, flags);
+
+		p = req->task;
+		cpu_dest = __ffs(p->cpus_allowed);
+		rq_dest = cpu_rq(cpu_dest);
+repeat:
+		cpu_src = p->cpu;
+		rq_src = cpu_rq(cpu_src);
+
+		local_irq_save(flags);
+		double_rq_lock(rq_src, rq_dest);
+		if (p->cpu != cpu_src) {
+			double_rq_unlock(rq_src, rq_dest);
+			local_irq_restore(flags);
+			goto repeat;
+		}
+		if (rq_src == rq) {
+			p->cpu = cpu_dest;
+			if (p->array) {
+				deactivate_task(p, rq_src);
+				activate_task(p, rq_dest);
+			}
+		}
+		double_rq_unlock(rq_src, rq_dest);
+		local_irq_restore(flags);
+
+		up(&req->sem);
+	}
+}
+
+void __init migration_init(void)
+{
+	unsigned long orig_cache_decay_ticks;
+	int cpu;
+
+	atomic_set(&migration_threads_seeking_cpu, smp_num_cpus);
+
+	orig_cache_decay_ticks = cache_decay_ticks;
+	cache_decay_ticks = 0;
+
+	for (cpu = 0; cpu < smp_num_cpus; cpu++)
+		if (kernel_thread(migration_thread, NULL,
+				CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
+			BUG();
+
+	/*
+	 * We cannot have missed the wakeup, for the migration_thread
+	 * bound for the cpu migration_init() is running on cannot
+	 * acquire this cpu until migration_init() has yielded it by
+	 * means of wait_for_completion().
+	 */
+	wait_for_completion(&migration_complete);
+
+	cache_decay_ticks = orig_cache_decay_ticks;
+}
+
+#endif /* CONFIG_SMP */
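
A note on the new calling convention (not part of the patch itself): unlike the old IPI path, where set_cpus_allowed() BUG()ed unless p == current, the reworked set_cpus_allowed() may be called for any task as long as the caller holds a valid reference to it, holds no spinlocks, and can sleep. It queues a migration_req_t on the task's current runqueue and down()s the request semaphore until that CPU's migration thread has moved the task. A minimal caller sketch under those assumptions follows; the helper name pin_current_to_cpu() is made up purely for illustration and does not appear in the patch.

/*
 * Illustrative sketch only -- assumes the patch above is applied to a
 * 2.4 SMP tree.  Shows the calling convention of the reworked
 * set_cpus_allowed(): process context, no locks held, may sleep until
 * the migration thread has completed the move.
 */
static void pin_current_to_cpu(int cpu)
{
	unsigned long mask = 1UL << cpu;

	/* set_cpus_allowed() BUG()s on a mask with no online CPUs */
	if (!(mask & cpu_online_map))
		return;

	/* Queues a migration_req_t and blocks on req.sem until done */
	set_cpus_allowed(current, mask);
}

On UP builds the sched.h stub turns the call into a no-op, so such a caller needs no #ifdef of its own.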