Patch from Robert Love This is a rediff of Ingo's latest posted scheduler tree, F3-updated, to 2.5.60-mm1. It is primarily improved HT support plus some load balancer changes and misc. cleanups. i386/Kconfig | 18 + i386/kernel/cpu/proc.c | 8 i386/kernel/smpboot.c | 73 ++++++- asm-i386/apic.h | 2 linux/sched.h | 29 +- main.c | 14 - fork.c | 2 sched.c | 480 ++++++++++++++++++++++++++++++++++++++----------- 8 files changed, 491 insertions(+), 135 deletions(-) diff -puN arch/i386/Kconfig~sched-f3 arch/i386/Kconfig --- 25/arch/i386/Kconfig~sched-f3 2003-02-13 18:39:00.000000000 -0800 +++ 25-akpm/arch/i386/Kconfig 2003-02-13 18:39:00.000000000 -0800 @@ -419,6 +419,24 @@ config SMP If you don't know what to do here, say N. +choice + + prompt "Hyperthreading Support" + depends on SMP + default NR_SIBLINGS_0 + +config NR_SIBLINGS_0 + bool "off" + +config NR_SIBLINGS_2 + bool "2 siblings" + +config NR_SIBLINGS_4 + bool "4 siblings" + +endchoice + + config PREEMPT bool "Preemptible Kernel" help diff -puN arch/i386/kernel/cpu/proc.c~sched-f3 arch/i386/kernel/cpu/proc.c --- 25/arch/i386/kernel/cpu/proc.c~sched-f3 2003-02-13 18:39:00.000000000 -0800 +++ 25-akpm/arch/i386/kernel/cpu/proc.c 2003-02-13 18:39:00.000000000 -0800 @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -101,6 +102,13 @@ static int show_cpuinfo(struct seq_file fpu_exception ? "yes" : "no", c->cpuid_level, c->wp_works_ok ? "yes" : "no"); +#if CONFIG_SHARE_RUNQUEUE +{ + extern long __rq_idx[NR_CPUS]; + + seq_printf(m, "\nrunqueue\t: %ld\n", __rq_idx[n]); +} +#endif for ( i = 0 ; i < 32*NCAPINTS ; i++ ) if ( test_bit(i, c->x86_capability) && diff -puN arch/i386/kernel/smpboot.c~sched-f3 arch/i386/kernel/smpboot.c --- 25/arch/i386/kernel/smpboot.c~sched-f3 2003-02-13 18:39:00.000000000 -0800 +++ 25-akpm/arch/i386/kernel/smpboot.c 2003-02-13 18:39:00.000000000 -0800 @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -939,6 +940,16 @@ void *xquad_portio; int cpu_sibling_map[NR_CPUS] __cacheline_aligned; +static int test_ht; + +static int __init ht_setup(char *str) +{ + test_ht = 1; + return 1; +} + +__setup("test_ht", ht_setup); + static void __init smp_boot_cpus(unsigned int max_cpus) { int apicid, cpu, bit; @@ -1079,7 +1090,20 @@ static void __init smp_boot_cpus(unsigne Dprintk("Boot done.\n"); /* - * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so + * Here we can be sure that there is an IO-APIC in the system. Let's + * go and set it up: + */ + smpboot_setup_io_apic(); + + setup_boot_APIC_clock(); + + /* + * Synchronize the TSC with the AP + */ + if (cpu_has_tsc && cpucount) + synchronize_tsc_bp(); + /* + * If Hyper-Threading is available, construct cpu_sibling_map[], so * that we can tell the sibling CPU efficiently. */ if (cpu_has_ht && smp_num_siblings > 1) { @@ -1088,7 +1112,8 @@ static void __init smp_boot_cpus(unsigne for (cpu = 0; cpu < NR_CPUS; cpu++) { int i; - if (!test_bit(cpu, &cpu_callout_map)) continue; + if (!test_bit(cpu, &cpu_callout_map)) + continue; for (i = 0; i < NR_CPUS; i++) { if (i == cpu || !test_bit(i, &cpu_callout_map)) @@ -1104,17 +1129,41 @@ static void __init smp_boot_cpus(unsigne printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu); } } - } - - smpboot_setup_io_apic(); - - setup_boot_APIC_clock(); +#if CONFIG_SHARE_RUNQUEUE + /* + * At this point APs would have synchronised TSC and + * waiting for smp_commenced, with their APIC timer + * disabled. So BP can go ahead do some initialization + * for Hyper-Threading (if needed). + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + int i; + if (!test_bit(cpu, &cpu_callout_map)) + continue; + for (i = 0; i < NR_CPUS; i++) { + if (i <= cpu) + continue; + if (!test_bit(i, &cpu_callout_map)) + continue; - /* - * Synchronize the TSC with the AP - */ - if (cpu_has_tsc && cpucount) - synchronize_tsc_bp(); + if (phys_proc_id[cpu] != phys_proc_id[i]) + continue; + /* + * merge runqueues, resulting in one + * runqueue per package: + */ + sched_map_runqueue(cpu, i); + break; + } + } +#endif + } +#if CONFIG_SHARE_RUNQUEUE + if (smp_num_siblings == 1 && test_ht) { + printk("Simulating shared runqueues - mapping CPU#1's runqueue to CPU#0's runqueue.\n"); + sched_map_runqueue(0, 1); + } +#endif } /* These are wrappers to interface to the new boot process. Someone diff -puN include/asm-i386/apic.h~sched-f3 include/asm-i386/apic.h --- 25/include/asm-i386/apic.h~sched-f3 2003-02-13 18:39:00.000000000 -0800 +++ 25-akpm/include/asm-i386/apic.h 2003-02-13 18:39:00.000000000 -0800 @@ -98,4 +98,6 @@ extern unsigned int nmi_watchdog; #endif /* CONFIG_X86_LOCAL_APIC */ +extern int phys_proc_id[NR_CPUS]; + #endif /* __ASM_APIC_H */ diff -puN include/linux/sched.h~sched-f3 include/linux/sched.h --- 25/include/linux/sched.h~sched-f3 2003-02-13 18:39:00.000000000 -0800 +++ 25-akpm/include/linux/sched.h 2003-02-13 18:39:00.000000000 -0800 @@ -147,6 +147,24 @@ typedef struct task_struct task_t; extern void sched_init(void); extern void init_idle(task_t *idle, int cpu); +/* + * Is there a way to do this via Kconfig? + */ +#if CONFIG_NR_SIBLINGS_2 +# define CONFIG_NR_SIBLINGS 2 +#elif CONFIG_NR_SIBLINGS_4 +# define CONFIG_NR_SIBLINGS 4 +#else +# define CONFIG_NR_SIBLINGS 0 +#endif + +#if CONFIG_NR_SIBLINGS +# define CONFIG_SHARE_RUNQUEUE 1 +#else +# define CONFIG_SHARE_RUNQUEUE 0 +#endif +extern void sched_map_runqueue(int cpu1, int cpu2); + extern void show_state(void); extern void show_trace(unsigned long *stack); extern void show_stack(unsigned long *stack); @@ -311,7 +329,7 @@ struct task_struct { prio_array_t *array; unsigned long sleep_avg; - unsigned long sleep_timestamp; + unsigned long last_run; unsigned long policy; unsigned long cpus_allowed; @@ -808,11 +826,6 @@ static inline unsigned int task_cpu(stru return p->thread_info->cpu; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - p->thread_info->cpu = cpu; -} - #else static inline unsigned int task_cpu(struct task_struct *p) @@ -820,10 +833,6 @@ static inline unsigned int task_cpu(stru return 0; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -} - #endif /* CONFIG_SMP */ #endif /* __KERNEL__ */ diff -puN init/main.c~sched-f3 init/main.c --- 25/init/main.c~sched-f3 2003-02-13 18:39:00.000000000 -0800 +++ 25-akpm/init/main.c 2003-02-13 18:39:00.000000000 -0800 @@ -358,7 +358,14 @@ static void __init smp_init(void) static void rest_init(void) { + /* + * We count on the initial thread going ok + * Like idlers init is an unlocked kernel thread, which will + * make syscalls (and thus be locked). + */ + init_idle(current, smp_processor_id()); kernel_thread(init, NULL, CLONE_KERNEL); + unlock_kernel(); cpu_idle(); } @@ -442,13 +449,6 @@ asmlinkage void __init start_kernel(void check_bugs(); printk("POSIX conformance testing by UNIFIX\n"); - /* - * We count on the initial thread going ok - * Like idlers init is an unlocked kernel thread, which will - * make syscalls (and thus be locked). - */ - init_idle(current, smp_processor_id()); - #ifdef CONFIG_X86_REMOTE_DEBUG if (gdb_enter) { gdb_hook(); /* right at boot time */ diff -puN kernel/fork.c~sched-f3 kernel/fork.c --- 25/kernel/fork.c~sched-f3 2003-02-13 18:39:00.000000000 -0800 +++ 25-akpm/kernel/fork.c 2003-02-13 18:39:00.000000000 -0800 @@ -896,7 +896,7 @@ static struct task_struct *copy_process( */ p->first_time_slice = 1; current->time_slice >>= 1; - p->sleep_timestamp = jiffies; + p->last_run = jiffies; if (!current->time_slice) { /* * This case is rare, it happens when the parent has only diff -puN kernel/sched.c~sched-f3 kernel/sched.c --- 25/kernel/sched.c~sched-f3 2003-02-13 18:39:00.000000000 -0800 +++ 25-akpm/kernel/sched.c 2003-02-13 18:39:02.000000000 -0800 @@ -80,6 +80,7 @@ int starvation_limit = 2 * HZ; #define INTERACTIVE_DELTA (interactive_delta) #define MAX_SLEEP_AVG (max_sleep_avg) #define STARVATION_LIMIT (starvation_limit) +#define AGRESSIVE_IDLE_STEAL 1 #define NODE_THRESHOLD 125 /* @@ -154,6 +155,48 @@ struct prio_array { }; /* + * It's possible for two CPUs to share the same runqueue. + * This makes sense if they eg. share caches. + * + * We take the common 1:1 (SMP, UP) case and optimize it, + * the rest goes via remapping: rq_idx(cpu) gives the + * runqueue on which a particular cpu is on, cpu_idx(cpu) + * gives the rq-specific index of the cpu. + * + * (Note that the generic scheduler code does not impose any + * restrictions on the mappings - there can be 4 CPUs per + * runqueue or even assymetric mappings.) + */ +#if CONFIG_SHARE_RUNQUEUE +# define MAX_NR_SIBLINGS CONFIG_NR_SIBLINGS + long __rq_idx[NR_CPUS] __cacheline_aligned; + static long __cpu_idx[NR_CPUS] __cacheline_aligned; +# define rq_idx(cpu) (__rq_idx[(cpu)]) +# define cpu_idx(cpu) (__cpu_idx[(cpu)]) +# define for_each_sibling(idx, rq) \ + for ((idx) = 0; (idx) < (rq)->nr_cpus; (idx)++) +# define rq_nr_cpus(rq) ((rq)->nr_cpus) +# define cpu_active_balance(c) (cpu_rq(c)->cpu[0].active_balance) +#else +# define MAX_NR_SIBLINGS 1 +# define rq_idx(cpu) (cpu) +# define cpu_idx(cpu) 0 +# define for_each_sibling(idx, rq) while (0) +# define cpu_active_balance(c) 0 +# define do_active_balance(rq, cpu) do { } while (0) +# define rq_nr_cpus(rq) 1 + static inline void active_load_balance(runqueue_t *rq, int this_cpu) { } +#endif + +typedef struct cpu_s { + task_t *curr, *idle; + task_t *migration_thread; + struct list_head migration_queue; + int active_balance; + int cpu; +} cpu_t; + +/* * This is the main, per-CPU runqueue data structure. * * Locking rule: those places that want to lock multiple runqueues @@ -164,7 +207,7 @@ struct runqueue { spinlock_t lock; unsigned long nr_running, nr_switches, expired_timestamp, nr_uninterruptible; - task_t *curr, *idle; + struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; int prev_nr_running[NR_CPUS]; #ifdef CONFIG_NUMA @@ -172,27 +215,39 @@ struct runqueue { unsigned int nr_balanced; int prev_node_load[MAX_NUMNODES]; #endif - task_t *migration_thread; - struct list_head migration_queue; + int nr_cpus; + cpu_t cpu[MAX_NR_SIBLINGS]; atomic_t nr_iowait; } ____cacheline_aligned; static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; -#define cpu_rq(cpu) (runqueues + (cpu)) +#define cpu_rq(cpu) (runqueues + (rq_idx(cpu))) +#define cpu_int(c) ((cpu_rq(c))->cpu + cpu_idx(c)) +#define cpu_curr_ptr(cpu) (cpu_int(cpu)->curr) +#define cpu_idle_ptr(cpu) (cpu_int(cpu)->idle) + #define this_rq() cpu_rq(smp_processor_id()) #define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define rt_task(p) ((p)->prio < MAX_RT_PRIO) +#define migration_thread(cpu) (cpu_int(cpu)->migration_thread) +#define migration_queue(cpu) (&cpu_int(cpu)->migration_queue) + +#if NR_CPUS > 1 +# define task_allowed(p, cpu) ((p)->cpus_allowed & (1UL << (cpu))) +#else +# define task_allowed(p, cpu) 1 +#endif + /* * Default context-switch locking: */ #ifndef prepare_arch_switch # define prepare_arch_switch(rq, next) do { } while(0) # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define task_running(p) (cpu_curr_ptr(task_cpu(p)) == (p)) #endif #ifdef CONFIG_NUMA @@ -335,16 +390,21 @@ static inline int effective_prio(task_t * Also update all the scheduling statistics stuff. (sleep average * calculation, priority modifiers, etc.) */ +static inline void __activate_task(task_t *p, runqueue_t *rq) +{ + enqueue_task(p, rq->active); + nr_running_inc(rq); +} + static inline void activate_task(task_t *p, runqueue_t *rq) { - unsigned long sleep_time = jiffies - p->sleep_timestamp; - prio_array_t *array = rq->active; + unsigned long sleep_time = jiffies - p->last_run; if (!rt_task(p) && sleep_time) { /* * This code gives a bonus to interactive tasks. We update * an 'average sleep time' value here, based on - * sleep_timestamp. The more time a task spends sleeping, + * ->last_run. The more time a task spends sleeping, * the higher the average gets - and the higher the priority * boost gets as well. */ @@ -353,8 +413,7 @@ static inline void activate_task(task_t p->sleep_avg = MAX_SLEEP_AVG; p->prio = effective_prio(p); } - enqueue_task(p, array); - nr_running_inc(rq); + __activate_task(p, rq); } /* @@ -395,8 +454,18 @@ static inline void resched_task(task_t * #endif } +static inline void resched_cpu(int cpu) +{ + resched_task(cpu_curr_ptr(cpu)); +} + #ifdef CONFIG_SMP +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + p->thread_info->cpu = cpu; +} + /* * wait_task_inactive - wait for a thread to unschedule. * @@ -411,7 +480,7 @@ void wait_task_inactive(task_t * p) repeat: preempt_disable(); rq = task_rq(p); - if (unlikely(task_running(rq, p))) { + if (unlikely(task_running(p))) { cpu_relax(); /* * enable/disable preemption just to make this @@ -422,7 +491,7 @@ repeat: goto repeat; } rq = task_rq_lock(p, &flags); - if (unlikely(task_running(rq, p))) { + if (unlikely(task_running(p))) { task_rq_unlock(rq, &flags); preempt_enable(); goto repeat; @@ -430,6 +499,13 @@ repeat: task_rq_unlock(rq, &flags); preempt_enable(); } + +#else + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} + #endif /* @@ -444,10 +520,39 @@ repeat: */ void kick_if_running(task_t * p) { - if ((task_running(task_rq(p), p)) && (task_cpu(p) != smp_processor_id())) + if ((task_running(p)) && (task_cpu(p) != smp_processor_id())) resched_task(p); } +static void wake_up_cpu(runqueue_t *rq, int cpu, task_t *p) +{ + cpu_t *curr_cpu; + task_t *curr; + int idx; + + if (idle_cpu(cpu)) + return resched_cpu(cpu); + + for_each_sibling(idx, rq) { + curr_cpu = rq->cpu + idx; + if (!task_allowed(p, curr_cpu->cpu)) + continue; + if (curr_cpu->idle == curr_cpu->curr) + return resched_cpu(curr_cpu->cpu); + } + + if (p->prio < cpu_curr_ptr(cpu)->prio) + return resched_task(cpu_curr_ptr(cpu)); + + for_each_sibling(idx, rq) { + curr_cpu = rq->cpu + idx; + if (!task_allowed(p, curr_cpu->cpu)) + continue; + curr = curr_cpu->curr; + if (p->prio < curr->prio) + return resched_task(curr); + } +} /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -478,7 +583,7 @@ repeat_lock_task: * Fast-migrate the task if it's not running or runnable * currently. Do not violate hard affinity. */ - if (unlikely(sync && !task_running(rq, p) && + if (unlikely(sync && !task_running(p) && (task_cpu(p) != smp_processor_id()) && (p->cpus_allowed & (1UL << smp_processor_id())))) { @@ -488,10 +593,12 @@ repeat_lock_task: } if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - activate_task(p, rq); - - if (p->prio < rq->curr->prio) - resched_task(rq->curr); + if (sync) + __activate_task(p, rq); + else { + activate_task(p, rq); + wake_up_cpu(rq, task_cpu(p), p); + } success = 1; } p->state = TASK_RUNNING; @@ -583,7 +690,7 @@ asmlinkage void schedule_tail(task_t *pr * context_switch - switch to the new MM and the new * thread's register state. */ -static inline task_t * context_switch(task_t *prev, task_t *next) +static inline task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) { struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; @@ -597,7 +704,7 @@ static inline task_t * context_switch(ta if (unlikely(!prev->mm)) { prev->active_mm = NULL; - mmdrop(oldmm); + rq->prev_mm = oldmm; } /* Here we just switch the register state and the stack. */ @@ -618,8 +725,9 @@ unsigned long nr_running(void) unsigned long i, sum = 0; for (i = 0; i < NR_CPUS; i++) - sum += cpu_rq(i)->nr_running; - + /* Shared runqueues are counted only once. */ + if (!cpu_idx(i)) + sum += cpu_rq(i)->nr_running; return sum; } @@ -630,7 +738,9 @@ unsigned long nr_uninterruptible(void) for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - sum += cpu_rq(i)->nr_uninterruptible; + /* Shared runqueues are counted only once. */ + if (!cpu_idx(i)) + sum += cpu_rq(i)->nr_uninterruptible; } return sum; } @@ -812,7 +922,23 @@ static inline unsigned long cpus_to_bala #endif /* CONFIG_NUMA */ -#if CONFIG_SMP +/* + * One of the idle_cpu_tick() and busy_cpu_tick() functions will + * get called every timer tick, on every CPU. Our balancing action + * frequency and balancing agressivity depends on whether the CPU is + * idle or not. + * + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on + * systems with HZ=100, every 10 msecs.) + */ +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) + +#if !CONFIG_SMP + +static inline void load_balance(runqueue_t *rq, int this_cpu, int idle) { } + +#else /* * double_lock_balance - lock the busiest runqueue @@ -928,12 +1054,7 @@ static inline void pull_task(runqueue_t set_task_cpu(p, this_cpu); nr_running_inc(this_rq); enqueue_task(p, this_rq->active); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ - if (p->prio < this_rq->curr->prio) - set_need_resched(); + wake_up_cpu(this_rq, this_cpu, p); } /* @@ -944,9 +1065,9 @@ static inline void pull_task(runqueue_t * We call this with the current runqueue locked, * irqs disabled. */ -static void load_balance(runqueue_t *this_rq, int idle) +static void load_balance(runqueue_t *this_rq, int this_cpu, int idle) { - int imbalance, idx, this_cpu = smp_processor_id(); + int imbalance, idx; runqueue_t *busiest; prio_array_t *array; struct list_head *head, *curr; @@ -994,12 +1115,15 @@ skip_queue: * 1) running (obviously), or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. + * + * (except if we are in idle mode which is a more agressive + * form of rebalancing.) */ -#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ - ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \ - !task_running(rq, p) && \ - ((p)->cpus_allowed & (1UL << (this_cpu)))) +#define CAN_MIGRATE_TASK(p,rq,cpu) \ + (((idle && AGRESSIVE_IDLE_STEAL) || \ + (jiffies - (p)->last_run > cache_decay_ticks)) && \ + !task_running(p) && task_allowed(p, cpu)) curr = curr->prev; @@ -1022,31 +1146,129 @@ out: ; } +#if CONFIG_SHARE_RUNQUEUE +static void active_load_balance(runqueue_t *this_rq, int this_cpu) +{ + runqueue_t *rq; + int i, idx; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + rq = cpu_rq(i); + if (rq == this_rq) + continue; + /* + * Any SMT-specific imbalance? + */ + for_each_sibling(idx, rq) + if (rq->cpu[idx].idle == rq->cpu[idx].curr) + continue; + + /* + * At this point it's sure that we have a SMT + * imbalance: this (physical) CPU is idle but + * another CPU has two (or more) tasks running. + * + * We wake up one of the migration threads (it + * doesnt matter which one) and let it fix things up: + */ + if (!cpu_active_balance(i)) { + cpu_active_balance(i) = 1; + spin_unlock(&this_rq->lock); + wake_up_process(rq->cpu[0].migration_thread); + spin_lock(&this_rq->lock); + } + } +} + +static void do_active_balance(runqueue_t *this_rq, int this_cpu) +{ + runqueue_t *rq; + int i, idx; + + spin_unlock(&this_rq->lock); + + cpu_active_balance(this_cpu) = 0; + + /* + * Is the imbalance still present? + */ + for_each_sibling(idx, this_rq) + if (this_rq->cpu[idx].idle == this_rq->cpu[idx].curr) + goto out; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + rq = cpu_rq(i); + if (rq == this_rq) + continue; + + /* completely idle CPU? */ + if (rq->nr_running) + continue; + + /* + * At this point it's reasonably sure that we have an + * imbalance. Since we are the migration thread, try to + * balance a thread over to the target queue. + */ + spin_lock(&rq->lock); + load_balance(rq, i, 1); + spin_unlock(&rq->lock); + goto out; + } +out: + spin_lock(&this_rq->lock); +} + /* - * One of the idle_cpu_tick() and busy_cpu_tick() functions will - * get called every timer tick, on every CPU. Our balancing action - * frequency and balancing agressivity depends on whether the CPU is - * idle or not. + * This routine is called to map a CPU into another CPU's runqueue. * - * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on - * systems with HZ=100, every 10 msecs.) + * This must be called during bootup with the merged runqueue having + * no tasks. */ -#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) -#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) +void sched_map_runqueue(int cpu1, int cpu2) +{ + runqueue_t *rq1 = cpu_rq(cpu1); + runqueue_t *rq2 = cpu_rq(cpu2); + int cpu2_idx_orig = cpu_idx(cpu2), cpu2_idx; + + BUG_ON(rq1 == rq2 || rq2->nr_running || rq_idx(cpu1) != cpu1); + /* + * At this point, we dont have anything in the runqueue yet. So, + * there is no need to move processes between the runqueues. + * Only, the idle processes should be combined and accessed + * properly. + */ + cpu2_idx = rq1->nr_cpus++; + + rq_idx(cpu2) = cpu1; + cpu_idx(cpu2) = cpu2_idx; + rq1->cpu[cpu2_idx].cpu = cpu2; + rq1->cpu[cpu2_idx].idle = rq2->cpu[cpu2_idx_orig].idle; + rq1->cpu[cpu2_idx].curr = rq2->cpu[cpu2_idx_orig].curr; + INIT_LIST_HEAD(&rq1->cpu[cpu2_idx].migration_queue); + + /* just to be safe: */ + rq2->cpu[cpu2_idx_orig].idle = NULL; + rq2->cpu[cpu2_idx_orig].curr = NULL; +} +#endif +#endif -static inline void idle_tick(runqueue_t *rq) +DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; + +static inline void idle_tick(runqueue_t *rq, unsigned long j) { - if (jiffies % IDLE_REBALANCE_TICK) + if (j % IDLE_REBALANCE_TICK) return; spin_lock(&rq->lock); - load_balance(rq, 1); + load_balance(rq, smp_processor_id(), 1); spin_unlock(&rq->lock); } -#endif - -DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; - /* * We place interactive tasks back into the active array, if possible. * @@ -1057,9 +1279,9 @@ DEFINE_PER_CPU(struct kernel_stat, kstat * increasing number of running tasks: */ #define EXPIRED_STARVING(rq) \ - ((rq)->expired_timestamp && \ + (STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1)) + STARVATION_LIMIT * ((rq)->nr_running) + 1))) /* * This function gets called by the timer code, with HZ frequency. @@ -1072,12 +1294,13 @@ void scheduler_tick(int user_ticks, int { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); + unsigned long j = jiffies; task_t *p = current; if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); - if (p == rq->idle) { + if (p == cpu_idle_ptr(cpu)) { /* note: this timer irq context must be accounted for as well */ if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET) kstat_cpu(cpu).cpustat.system += sys_ticks; @@ -1085,9 +1308,7 @@ void scheduler_tick(int user_ticks, int kstat_cpu(cpu).cpustat.iowait += sys_ticks; else kstat_cpu(cpu).cpustat.idle += sys_ticks; -#if CONFIG_SMP - idle_tick(rq); -#endif + idle_tick(rq, j); return; } if (TASK_NICE(p) > 0) @@ -1096,12 +1317,13 @@ void scheduler_tick(int user_ticks, int kstat_cpu(cpu).cpustat.user += user_ticks; kstat_cpu(cpu).cpustat.system += sys_ticks; + spin_lock(&rq->lock); /* Task might have expired already, but not scheduled off yet */ if (p->array != rq->active) { set_tsk_need_resched(p); + spin_unlock(&rq->lock); return; } - spin_lock(&rq->lock); if (unlikely(rt_task(p))) { /* * RR tasks need a special form of timeslice management. @@ -1137,16 +1359,14 @@ void scheduler_tick(int user_ticks, int if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; + rq->expired_timestamp = j; enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); } out: -#if CONFIG_SMP - if (!(jiffies % BUSY_REBALANCE_TICK)) - load_balance(rq, 0); -#endif + if (!(j % BUSY_REBALANCE_TICK)) + load_balance(rq, smp_processor_id(), 0); spin_unlock(&rq->lock); } @@ -1157,11 +1377,11 @@ void scheduling_functions_start_here(voi */ asmlinkage void do_schedule(void) { + int idx, this_cpu, retry = 0; + struct list_head *queue; task_t *prev, *next; - runqueue_t *rq; prio_array_t *array; - struct list_head *queue; - int idx; + runqueue_t *rq; /* * Test if we are atomic. Since do_exit() needs to call into @@ -1174,15 +1394,15 @@ asmlinkage void do_schedule(void) dump_stack(); } } - - check_highmem_ptes(); need_resched: + check_highmem_ptes(); + this_cpu = smp_processor_id(); preempt_disable(); prev = current; rq = this_rq(); release_kernel_lock(prev); - prev->sleep_timestamp = jiffies; + prev->last_run = jiffies; spin_lock_irq(&rq->lock); /* @@ -1205,12 +1425,14 @@ need_resched: } pick_next_task: if (unlikely(!rq->nr_running)) { -#if CONFIG_SMP - load_balance(rq, 1); + load_balance(rq, this_cpu, 1); if (rq->nr_running) goto pick_next_task; -#endif - next = rq->idle; + active_load_balance(rq, this_cpu); + if (rq->nr_running) + goto pick_next_task; +pick_idle: + next = cpu_idle_ptr(this_cpu); rq->expired_timestamp = 0; goto switch_tasks; } @@ -1226,24 +1448,60 @@ pick_next_task: rq->expired_timestamp = 0; } +new_array: idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + if ((next != prev) && (rq_nr_cpus(rq) > 1)) { + struct list_head *tmp = queue->next; + + while ((task_running(next) && (next != prev)) || !task_allowed(next, this_cpu)) { + tmp = tmp->next; + if (tmp != queue) { + next = list_entry(tmp, task_t, run_list); + continue; + } + idx = find_next_bit(array->bitmap, MAX_PRIO, ++idx); + if (idx == MAX_PRIO) { + if (retry || !rq->expired->nr_active) { + goto pick_idle; + } + /* + * To avoid infinite changing of arrays, + * when we have only tasks runnable by + * sibling. + */ + retry = 1; + + array = rq->expired; + goto new_array; + } + queue = array->queue + idx; + tmp = queue->next; + next = list_entry(tmp, task_t, run_list); + } + } switch_tasks: prefetch(next); clear_tsk_need_resched(prev); - RCU_qsctr(prev->thread_info->cpu)++; + RCU_qsctr(task_cpu(prev))++; if (likely(prev != next)) { + struct mm_struct *prev_mm; rq->nr_switches++; - rq->curr = next; + cpu_curr_ptr(this_cpu) = next; + set_task_cpu(next, this_cpu); prepare_arch_switch(rq, next); - prev = context_switch(prev, next); + prev = context_switch(rq, prev, next); barrier(); rq = this_rq(); + prev_mm = rq->prev_mm; + rq->prev_mm = NULL; finish_arch_switch(rq, prev); + if (prev_mm) + mmdrop(prev_mm); } else spin_unlock_irq(&rq->lock); @@ -1527,9 +1785,8 @@ void set_user_nice(task_t *p, long nice) * If the task is running and lowered its priority, * or increased its priority then reschedule its CPU: */ - if ((NICE_TO_PRIO(nice) < p->static_prio) || - task_running(rq, p)) - resched_task(rq->curr); + if ((NICE_TO_PRIO(nice) < p->static_prio) || task_running(p)) + resched_task(cpu_curr_ptr(task_cpu(p))); } out_unlock: task_rq_unlock(rq, &flags); @@ -1607,7 +1864,7 @@ int task_nice(task_t *p) */ int task_curr(task_t *p) { - return cpu_curr(task_cpu(p)) == p; + return cpu_curr_ptr(task_cpu(p)) == p; } /** @@ -1616,7 +1873,7 @@ int task_curr(task_t *p) */ int idle_cpu(int cpu) { - return cpu_curr(cpu) == cpu_rq(cpu)->idle; + return cpu_curr_ptr(cpu) == cpu_idle_ptr(cpu); } /** @@ -1706,7 +1963,7 @@ static int setscheduler(pid_t pid, int p else p->prio = p->static_prio; if (array) - activate_task(p, task_rq(p)); + __activate_task(p, task_rq(p)); out_unlock: task_rq_unlock(rq, &flags); @@ -2181,7 +2438,7 @@ void __init init_idle(task_t *idle, int local_irq_save(flags); double_rq_lock(idle_rq, rq); - idle_rq->curr = idle_rq->idle = idle; + cpu_curr_ptr(cpu) = cpu_idle_ptr(cpu) = idle; deactivate_task(idle, rq); idle->array = NULL; idle->prio = MAX_PRIO; @@ -2236,6 +2493,7 @@ void set_cpus_allowed(task_t *p, unsigne unsigned long flags; migration_req_t req; runqueue_t *rq; + int cpu; #if 0 /* FIXME: Grab cpu_lock, return error on this case. --RR */ new_mask &= cpu_online_map; @@ -2257,31 +2515,31 @@ void set_cpus_allowed(task_t *p, unsigne * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && !task_running(rq, p)) { + if (!p->array && !task_running(p)) { set_task_cpu(p, __ffs(p->cpus_allowed)); task_rq_unlock(rq, &flags); return; } init_completion(&req.done); req.task = p; - list_add(&req.list, &rq->migration_queue); + cpu = task_cpu(p); + list_add(&req.list, migration_queue(cpu)); task_rq_unlock(rq, &flags); - - wake_up_process(rq->migration_thread); + wake_up_process(migration_thread(cpu)); wait_for_completion(&req.done); } /* - * migration_thread - this is a highprio system thread that performs + * migration_task - this is a highprio system thread that performs * thread migration by 'pulling' threads into the target runqueue. */ -static int migration_thread(void * data) +static int migration_task(void * data) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; int cpu = (long) data; runqueue_t *rq; - int ret; + int ret, idx; daemonize("migration/%d", cpu); set_fs(KERNEL_DS); @@ -2295,7 +2553,8 @@ static int migration_thread(void * data) ret = setscheduler(0, SCHED_FIFO, ¶m); rq = this_rq(); - rq->migration_thread = current; + migration_thread(cpu) = current; + idx = cpu_idx(cpu); for (;;) { runqueue_t *rq_src, *rq_dest; @@ -2306,8 +2565,10 @@ static int migration_thread(void * data) task_t *p; spin_lock_irqsave(&rq->lock, flags); - head = &rq->migration_queue; - current->state = TASK_INTERRUPTIBLE; + if (cpu_active_balance(cpu)) + do_active_balance(rq, cpu); + head = migration_queue(cpu); + current->state = TASK_UNINTERRUPTIBLE; if (list_empty(head)) { spin_unlock_irqrestore(&rq->lock, flags); schedule(); @@ -2335,9 +2596,8 @@ repeat: set_task_cpu(p, cpu_dest); if (p->array) { deactivate_task(p, rq_src); - activate_task(p, rq_dest); - if (p->prio < rq_dest->curr->prio) - resched_task(rq_dest->curr); + __activate_task(p, rq_dest); + wake_up_cpu(rq_dest, cpu_dest, p); } } double_rq_unlock(rq_src, rq_dest); @@ -2355,12 +2615,13 @@ static int migration_call(struct notifie unsigned long action, void *hcpu) { + long cpu = (long) hcpu; + switch (action) { case CPU_ONLINE: - printk("Starting migration thread for cpu %li\n", - (long)hcpu); - kernel_thread(migration_thread, hcpu, CLONE_KERNEL); - while (!cpu_rq((long)hcpu)->migration_thread) + printk("Starting migration thread for cpu %li\n", cpu); + kernel_thread(migration_task, hcpu, CLONE_KERNEL); + while (!migration_thread(cpu)) yield(); break; } @@ -2435,11 +2696,20 @@ void __init sched_init(void) for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; + /* + * Start with a 1:1 mapping between CPUs and runqueues: + */ +#if CONFIG_SHARE_RUNQUEUE + rq_idx(i) = i; + cpu_idx(i) = 0; +#endif rq = cpu_rq(i); rq->active = rq->arrays; rq->expired = rq->arrays + 1; spin_lock_init(&rq->lock); - INIT_LIST_HEAD(&rq->migration_queue); + INIT_LIST_HEAD(migration_queue(i)); + rq->nr_cpus = 1; + rq->cpu[cpu_idx(i)].cpu = i; atomic_set(&rq->nr_iowait, 0); nr_running_init(rq); @@ -2457,9 +2727,9 @@ void __init sched_init(void) * We have to do a little magic to get the first * thread right in SMP mode. */ - rq = this_rq(); - rq->curr = current; - rq->idle = current; + cpu_curr_ptr(smp_processor_id()) = current; + cpu_idle_ptr(smp_processor_id()) = current; + set_task_cpu(current, smp_processor_id()); wake_up_forked_process(current); _