diff -urNp --exclude CVS --exclude BitKeeper x-ref/arch/i386/kernel/process.c x/arch/i386/kernel/process.c
--- x-ref/arch/i386/kernel/process.c	2003-07-17 03:49:37.000000000 +0200
+++ x/arch/i386/kernel/process.c	2003-07-17 03:49:40.000000000 +0200
@@ -129,7 +129,7 @@ void cpu_idle (void)
 		void (*idle)(void) = pm_idle;
 		if (!idle)
 			idle = default_idle;
-		if (!current->need_resched)
+		while (!current->need_resched)
 			idle();
 		schedule();
 		check_pgt_cache();
diff -urNp --exclude CVS --exclude BitKeeper x-ref/fs/pipe.c x/fs/pipe.c
--- x-ref/fs/pipe.c	2003-07-17 03:49:37.000000000 +0200
+++ x/fs/pipe.c	2003-07-17 03:49:40.000000000 +0200
@@ -115,7 +115,7 @@ do_more_read:
 			 * writers synchronously that there is more
 			 * room.
			 */
-			wake_up_interruptible(PIPE_WAIT(*inode));
+			wake_up_interruptible_sync(PIPE_WAIT(*inode));
 			if (!PIPE_EMPTY(*inode))
 				BUG();
 			goto do_more_read;
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/asm-i386/smp_balance.h x/include/asm-i386/smp_balance.h
--- x-ref/include/asm-i386/smp_balance.h	2003-07-17 03:49:39.000000000 +0200
+++ x/include/asm-i386/smp_balance.h	2003-07-17 03:49:40.000000000 +0200
@@ -29,10 +29,28 @@ static inline int find_idle_package(int
 	return -1;		/* not found */
 }
 
+static inline int arch_reschedule_idle_override(task_t * p, int idle)
+{
+	if (unlikely(smp_num_siblings > 1) && !idle_cpu(cpu_sibling_map[idle])) {
+		int true_idle = find_idle_package(idle);
+		if (true_idle >= 0) {
+			if (likely(p->cpus_allowed & (1UL << true_idle)))
+				idle = true_idle;
+			else {
+				true_idle = cpu_sibling_map[true_idle];
+				if (p->cpus_allowed & (1UL << true_idle))
+					idle = true_idle;
+			}
+		}
+	}
+
+	return idle;
+}
+
 static inline int arch_load_balance(int this_cpu, int idle)
 {
 	/* Special hack for hyperthreading */
-	if (unlikely(smp_num_siblings > 1 && idle && !idle_cpu(cpu_sibling_map[this_cpu]))) {
+	if (unlikely(smp_num_siblings > 1 && idle == 2 && !idle_cpu(cpu_sibling_map[this_cpu]))) {
 		int found;
 		struct runqueue *rq_target;
 
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/sched.h x/include/linux/sched.h
--- x-ref/include/linux/sched.h	2003-07-17 03:49:39.000000000 +0200
+++ x/include/linux/sched.h	2003-07-17 03:49:40.000000000 +0200
@@ -135,7 +135,6 @@ typedef struct task_struct task_t;
 
 extern void sched_init(void);
 extern void init_idle(task_t *idle, int cpu);
-extern int idle_cpu(int cpu);
 extern void show_state(void);
 extern void cpu_init (void);
 extern void trap_init(void);
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/sched_runqueue.h x/include/linux/sched_runqueue.h
--- x-ref/include/linux/sched_runqueue.h	2003-07-17 03:49:39.000000000 +0200
+++ x/include/linux/sched_runqueue.h	2003-07-17 03:49:40.000000000 +0200
@@ -62,9 +62,12 @@ struct runqueue {
 	task_t *curr, *idle;
 	prio_array_t *active, *expired, arrays[2];
 	long nr_uninterruptible;
+#ifdef CONFIG_SMP
+	long last_jiffy;
 	int prev_nr_running[NR_CPUS];
 	task_t *migration_thread;
 	list_t migration_queue;
+#endif
 } ____cacheline_aligned;
 
 typedef struct runqueue runqueue_t;
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/smp_balance.h x/include/linux/smp_balance.h
--- x-ref/include/linux/smp_balance.h	2003-07-17 03:49:39.000000000 +0200
+++ x/include/linux/smp_balance.h	2003-07-17 03:49:40.000000000 +0200
@@ -9,6 +9,7 @@
 #include <asm/smp_balance.h>
 #else
 #define arch_load_balance(x, y)	(0)
+#define arch_reschedule_idle_override(x, idle) (idle)
 #endif
 
 #endif /* _LINUX_SMP_BALANCE_H */
diff -urNp --exclude CVS --exclude BitKeeper x-ref/kernel/fork.c x/kernel/fork.c
--- x-ref/kernel/fork.c	2003-07-17 03:49:39.000000000 +0200
+++ x/kernel/fork.c	2003-07-17 03:49:40.000000000 +0200
@@ -752,8 +752,6 @@ int do_fork(unsigned long clone_flags, u
 	if (p->pid < 0)		/* valid pids are >= 0 */
 		goto bad_fork_cleanup;
 
-	INIT_LIST_HEAD(&p->run_list);
-
 	p->p_cptr = NULL;
 	init_waitqueue_head(&p->wait_chldexit);
 	p->vfork_done = NULL;
@@ -787,7 +785,6 @@ int do_fork(unsigned long clone_flags, u
 		spin_lock_init(&p->sigmask_lock);
 	}
 #endif
-	p->array = NULL;
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = jiffies;
 
diff -urNp --exclude CVS --exclude BitKeeper x-ref/kernel/sched.c x/kernel/sched.c
--- x-ref/kernel/sched.c	2003-07-17 03:49:39.000000000 +0200
+++ x/kernel/sched.c	2003-07-17 03:50:45.000000000 +0200
@@ -54,9 +54,8 @@
  */
 #define MIN_TIMESLICE		( 10 * HZ / 1000)
 #define MAX_TIMESLICE		(300 * HZ / 1000)
-#define CHILD_PENALTY		95
+#define CHILD_PENALTY		50
 #define PARENT_PENALTY		100
-#define EXIT_WEIGHT		3
 #define PRIO_BONUS_RATIO	25
 #define INTERACTIVE_DELTA	2
 #define MAX_SLEEP_AVG		(2*HZ)
@@ -119,7 +118,7 @@ runqueue_t runqueues[NR_CPUS] __cachelin
  */
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(rq, next)	do { } while(0)
-# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
+# define finish_arch_switch(rq, prev)	spin_unlock_irq(&(rq)->lock)
 #endif
 
 /*
@@ -157,12 +156,18 @@ static inline void dequeue_task(struct t
 	__clear_bit(p->prio, array->bitmap);
 }
 
-static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
+#define enqueue_task(p, array) __enqueue_task(p, array, NULL)
+static inline void __enqueue_task(struct task_struct *p, prio_array_t *array, task_t * parent)
 {
-	list_add_tail(&p->run_list, array->queue + p->prio);
-	__set_bit(p->prio, array->bitmap);
+	if (!parent) {
+		list_add_tail(&p->run_list, array->queue + p->prio);
+		__set_bit(p->prio, array->bitmap);
+		p->array = array;
+	} else {
+		list_add_tail(&p->run_list, &parent->run_list);
+		array = p->array = parent->array;
+	}
 	array->nr_active++;
-	p->array = array;
 }
 
 static inline int effective_prio(task_t *p)
@@ -191,12 +196,13 @@ static inline int effective_prio(task_t
 	return prio;
 }
 
-static inline void activate_task(task_t *p, runqueue_t *rq)
+#define activate_task(p, rq) __activate_task(p, rq, NULL)
+static inline void __activate_task(task_t *p, runqueue_t *rq, task_t * parent)
 {
 	unsigned long sleep_time = jiffies - p->sleep_timestamp;
 	prio_array_t *array = rq->active;
 
-	if (!rt_task(p) && sleep_time) {
+	if (!parent && !rt_task(p) && sleep_time) {
 		/*
 		 * This code gives a bonus to interactive tasks. We update
 		 * an 'average sleep time' value here, based on
@@ -204,12 +210,13 @@ static inline void activate_task(task_t
 		 * the higher the average gets - and the higher the priority
 		 * boost gets as well.
 		 */
+		p->sleep_timestamp = jiffies;
 		p->sleep_avg += sleep_time;
 		if (p->sleep_avg > MAX_SLEEP_AVG)
 			p->sleep_avg = MAX_SLEEP_AVG;
 		p->prio = effective_prio(p);
 	}
-	enqueue_task(p, array);
+	__enqueue_task(p, array, parent);
 	rq->nr_running++;
 }
 
@@ -263,6 +270,12 @@ repeat:
 }
 #endif
 
+#ifdef CONFIG_SMP
+static int FASTCALL(reschedule_idle(task_t * p));
+static void FASTCALL(load_balance(runqueue_t *this_rq, int idle));
+#endif
+
+
 /*
 * Wake up a process. Put it on the run-queue if it's not
 * already there.  The "current" process is always on the
@@ -280,19 +293,31 @@ static int try_to_wake_up(task_t * p, in
 	int success = 0;
 	long old_state;
 	runqueue_t *rq;
+#ifdef CONFIG_SMP
+	int migrated_to_idle = 0;
+#endif
 
+#ifdef CONFIG_SMP
 repeat_lock_task:
+#endif
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
 	if (!p->array) {
-		if (unlikely(sync) &&
-		    rq->curr != p &&
-		    p->cpu != smp_processor_id() &&
-		    p->cpus_allowed & (1UL << smp_processor_id())) {
-			p->cpu = smp_processor_id();
-			task_rq_unlock(rq, &flags);
-			goto repeat_lock_task;
+#ifdef CONFIG_SMP
+		if (likely(rq->curr != p)) {
+			/* can migrate */
+			if (unlikely(sync)) {
+				if (p->cpu != smp_processor_id() &&
+				    p->cpus_allowed & (1UL << smp_processor_id())) {
+					p->cpu = smp_processor_id();
+					goto migrated_task;
+				}
+			} else {
+				if (reschedule_idle(p))
+					goto migrated_task;
+			}
 		}
+#endif
 		if (old_state == TASK_UNINTERRUPTIBLE)
 			rq->nr_uninterruptible--;
 		activate_task(p, rq);
@@ -303,9 +328,29 @@ repeat_lock_task:
 		resched_task(rq->curr);
 
 	p->state = TASK_RUNNING;
+
+#ifdef CONFIG_SMP
+	/*
+	 * Subtle: we can load_balance only here (before unlock)
+	 * because it can internally drop the lock. Claim
+	 * that the cpu is running so it will be a light rebalance,
+	 * if this cpu will go idle soon schedule() will trigger the
+	 * idle rescheduling balancing by itself.
+	 */
+	if (success && migrated_to_idle)
+		load_balance(rq, 0);
+#endif
+
 	task_rq_unlock(rq, &flags);
 
 	return success;
+
+#ifdef CONFIG_SMP
+ migrated_task:
+	task_rq_unlock(rq, &flags);
+	migrated_to_idle = 1;
+	goto repeat_lock_task;
+#endif
 }
 
 int wake_up_process(task_t * p)
@@ -321,23 +366,47 @@ int wake_up_process_kick(task_t * p)
 void wake_up_forked_process(task_t * p)
 {
 	runqueue_t *rq;
+	task_t * parent = current;
 
 	rq = this_rq();
 	spin_lock_irq(&rq->lock);
 
 	p->state = TASK_RUNNING;
-	if (!rt_task(p)) {
+	if (likely(!rt_task(p) && parent->array)) {
 		/*
-		 * We decrease the sleep average of forking parents
-		 * and children as well, to keep max-interactive tasks
+		 * We decrease the sleep average of forked
+		 * children, to keep max-interactive tasks
 		 * from forking tasks that are max-interactive.
+		 * CHILD_PENALTY is set to 50% since we have
+		 * no clue if this is still an interactive
+		 * task like the parent or if this will be a
+		 * cpu bound task. The parent isn't touched
+		 * as we don't make assumption about the parent
+		 * changing behaviour after the child is forked.
		 */
-		current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
+		parent->sleep_avg = parent->sleep_avg * PARENT_PENALTY / 100;
 		p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+
+		/*
+		 * For its first schedule keep the child at the same
+		 * priority (i.e. in the same list) of the parent,
+		 * activate_forked_task() will take care to put the
+		 * child in front of the parent (lifo) to guarantee a
+		 * schedule-child-first behaviour after fork.
+		 */
+		p->prio = parent->prio;
+	} else {
+		/*
+		 * Take the usual wakeup path if it's RT or if
+		 * it's a child of the first idle task (during boot
+		 * only).
+		 */
 		p->prio = effective_prio(p);
+		parent = NULL;
 	}
+
 	p->cpu = smp_processor_id();
-	activate_task(p, rq);
+	__activate_task(p, rq, parent);
 	spin_unlock_irq(&rq->lock);
 }
 
@@ -359,13 +428,6 @@ void sched_exit(task_t * p)
 			current->time_slice = MAX_TIMESLICE;
 	}
 	__sti();
-	/*
-	 * If the child was a (relative-) CPU hog then decrease
-	 * the sleep_avg of the parent as well.
-	 */
-	if (p->sleep_avg < current->sleep_avg)
-		current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT +
-			p->sleep_avg) / (EXIT_WEIGHT + 1);
 }
 
 #if CONFIG_SMP
@@ -462,8 +524,10 @@ static inline unsigned int double_lock_b
  * Move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
-static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
+static inline int pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
 {
+	int resched = 0;
+
 	dequeue_task(p, src_array);
 	src_rq->nr_running--;
 	p->cpu = this_cpu;
@@ -474,11 +538,44 @@ static inline void pull_task(runqueue_t
 	 * to be always true for them.
	 */
 	if (p->prio < this_rq->curr->prio)
-		set_need_resched();
+		resched = 1;
+
+	return resched;
+}
+
+static inline int idle_cpu_reschedule(task_t * p, int cpu)
+{
+	if (unlikely(!(p->cpus_allowed & (1UL << cpu))))
+		return 0;
+	return idle_cpu(cpu);
 }
 
 #include <linux/smp_balance.h>
 
+static int reschedule_idle(task_t * p)
+{
+	int p_cpu = p->cpu, i;
+
+	if (idle_cpu(p_cpu))
+		return 0;
+
+	p_cpu = cpu_number_map(p_cpu);
+
+	for (i = (p_cpu + 1) % smp_num_cpus;
+	     i != p_cpu;
+	     i = (i + 1) % smp_num_cpus) {
+		int physical = cpu_logical_map(i);
+
+		if (idle_cpu_reschedule(p, physical)) {
+			physical = arch_reschedule_idle_override(p, physical);
+			p->cpu = physical;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Current runqueue is empty, or rebalance tick: if there is an
  * inbalance (current runqueue is too short) then pull from
@@ -490,11 +587,12 @@ static inline void pull_task(runqueue_t
 static void load_balance(runqueue_t *this_rq, int idle)
 {
 	int imbalance, nr_running, load, max_load,
-		idx, i, this_cpu = smp_processor_id();
+		idx, i, this_cpu = this_rq - runqueues;
 	task_t *tmp;
 	runqueue_t *busiest, *rq_src;
 	prio_array_t *array;
 	list_t *head, *curr;
+	int resched;
 
 	/*
	 * Handle architecture-specific balancing, such as hyperthreading.
@@ -502,6 +600,7 @@ static void load_balance(runqueue_t *thi
 	if (arch_load_balance(this_cpu, idle))
 		return;
 
+ retry:
 	/*
	 * We search all runqueues to find the most busy one.
	 * We do this lockless to reduce cache-bouncing overhead,
@@ -556,13 +655,13 @@ static void load_balance(runqueue_t *thi
 	if (!idle && (imbalance < (max_load + 3)/4))
 		return;
 
-	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
	/*
-	 * Make sure nothing changed since we checked the
+	 * Make sure nothing significant changed since we checked the
	 * runqueue length.
	 */
-	if (busiest->nr_running <= nr_running + 1)
-		goto out_unlock;
+	if (double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running) > nr_running ||
+	    busiest->nr_running < max_load)
+		goto out_unlock_retry;
 
	/*
	 * We first consider expired tasks.  Those will likely not be
Those will likely not be @@ -575,6 +674,7 @@ static void load_balance(runqueue_t *thi else array = busiest->active; + resched = 0; new_array: /* Start searching at priority 0: */ idx = 0; @@ -616,8 +716,8 @@ skip_queue: idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, this_cpu); - if (!idle && --imbalance) { + resched |= pull_task(busiest, array, tmp, this_rq, this_cpu); + if (--imbalance > 0) { if (curr != head) goto skip_queue; idx++; @@ -625,6 +725,12 @@ skip_queue: } out_unlock: spin_unlock(&busiest->lock); + if (resched) + resched_task(this_rq->curr); + return; +out_unlock_retry: + spin_unlock(&busiest->lock); + goto retry; } /* @@ -633,19 +739,19 @@ out_unlock: * frequency and balancing agressivity depends on whether the CPU is * idle or not. * - * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on - * systems with HZ=100, every 10 msecs.) + * busy-rebalance every 250 msecs. idle-rebalance every 100 msec. */ #define BUSY_REBALANCE_TICK (HZ/4 ?: 1) -#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) +#define IDLE_REBALANCE_TICK (HZ/10 ?: 1) static inline void idle_tick(void) { - if (jiffies % IDLE_REBALANCE_TICK) - return; - spin_lock(&this_rq()->lock); - load_balance(this_rq(), 1); - spin_unlock(&this_rq()->lock); + if (unlikely(time_before_eq(this_rq()->last_jiffy + IDLE_REBALANCE_TICK, jiffies))) { + spin_lock(&this_rq()->lock); + load_balance(this_rq(), 1); + spin_unlock(&this_rq()->lock); + this_rq()->last_jiffy = jiffies; + } } #endif @@ -736,8 +842,10 @@ void scheduler_tick(int user_tick, int s } out: #if CONFIG_SMP - if (!(jiffies % BUSY_REBALANCE_TICK)) + if (unlikely(time_before_eq(this_rq()->last_jiffy + BUSY_REBALANCE_TICK, jiffies))) { load_balance(rq, 0); + rq->last_jiffy = jiffies; + } #endif spin_unlock(&rq->lock); } @@ -782,7 +890,8 @@ pick_next_task: #endif if (unlikely(!rq->nr_running)) { #if CONFIG_SMP - load_balance(rq, 1); + load_balance(rq, 2); + rq->last_jiffy = jiffies; if (rq->nr_running) goto pick_next_task; #endif @@ -1020,7 +1129,7 @@ void set_user_nice(task_t *p, long nice) * If the task is running and lowered its priority, * or increased its priority then reschedule its CPU: */ - if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr)) + if (p == rq->curr) resched_task(rq->curr); } out_unlock: @@ -1324,31 +1433,55 @@ out_unlock: asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq(); - prio_array_t *array = current->array; + prio_array_t *array; int i; spin_lock_irq(&rq->lock); + if (unlikely(rq->nr_running == 1)) { + spin_unlock_irq(&rq->lock); + return 0; + } + + array = current->array; if (unlikely(rt_task(current))) { list_del(¤t->run_list); list_add_tail(¤t->run_list, array->queue + current->prio); goto out_unlock; } + if (unlikely(array == rq->expired) && rq->active->nr_active) + goto out_unlock; + list_del(¤t->run_list); if (!list_empty(array->queue + current->prio)) { list_add(¤t->run_list, array->queue[current->prio].next); goto out_unlock; } + __clear_bit(current->prio, array->bitmap); + if (likely(array == rq->active) && array->nr_active == 1) { + /* + * We're the last task in the active queue so + * we must move ourself to the expired array + * to avoid running again immediatly. 
+		 */
+		array->nr_active--;
+		array = rq->expired;
+		array->nr_active++;
+	}
 	i = sched_find_first_bit(array->bitmap);
-	if (i == MAX_PRIO || i <= current->prio)
+	BUG_ON(i == MAX_PRIO);
+	BUG_ON(i == current->prio && array == current->array);
+
+	if (array == current->array && i < current->prio)
 		i = current->prio;
-	else
+	else {
+		current->array = array;
 		current->prio = i;
-
+	}
 	list_add(&current->run_list, array->queue[i].next);
 	__set_bit(i, array->bitmap);
 
@@ -1594,7 +1727,9 @@ void __init sched_init(void)
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		spin_lock_init(&rq->lock);
+#ifdef CONFIG_SMP
 		INIT_LIST_HEAD(&rq->migration_queue);
+#endif
 
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;
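
A note on the sys_sched_yield() change above, for readers not familiar with the two prio-arrays: when the yielding task is the last one left in the active array, requeueing it there would just make schedule() pick it again, so it moves itself into the expired array instead. The stand-alone sketch below only models that one decision; it is not code from the patch, and all names (toy_rq, toy_array, toy_yield) are invented for the illustration. There is no locking and only a single priority level.

/* toy_yield.c - simplified model of the sched_yield() expired-array move.
 * Build with: gcc -Wall -o toy_yield toy_yield.c
 */
#include <stdio.h>

struct toy_array {
	int nr_active;			/* tasks queued in this array */
};

struct toy_rq {
	struct toy_array arrays[2];
	struct toy_array *active, *expired;
	int nr_running;
};

/* Decide which array the yielding task should be requeued into. */
static struct toy_array *toy_yield(struct toy_rq *rq)
{
	struct toy_array *array = rq->active;

	if (rq->nr_running == 1)
		return array;		/* nobody else to yield to */

	if (array->nr_active == 1) {
		/*
		 * The yielder is the last task in the active array:
		 * requeueing it there would run it again immediately,
		 * so move it over to the expired array instead.
		 */
		array->nr_active--;
		array = rq->expired;
		array->nr_active++;
	}
	return array;
}

int main(void)
{
	struct toy_rq rq = { .nr_running = 2 };

	rq.active = &rq.arrays[0];
	rq.expired = &rq.arrays[1];
	rq.active->nr_active = 1;	/* only the yielding task is active */
	rq.expired->nr_active = 1;	/* one expired task is waiting */

	printf("yielder requeued into the %s array\n",
	       toy_yield(&rq) == rq.expired ? "expired" : "active");
	return 0;
}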