diff -urNp sched-ref/fs/pipe.c sched/fs/pipe.c
--- sched-ref/fs/pipe.c	Sat Jul 6 20:17:35 2002
+++ sched/fs/pipe.c	Sat Jul 6 20:18:11 2002
@@ -215,7 +215,7 @@ pipe_write(struct file *filp, const char
 			 * is going to give up this CPU, so it doesnt have
 			 * to do idle reschedules.
 			 */
-			wake_up_interruptible(PIPE_WAIT(*inode));
+			wake_up_interruptible_sync(PIPE_WAIT(*inode));
 			PIPE_WAITING_WRITERS(*inode)++;
 			pipe_wait(inode);
 			PIPE_WAITING_WRITERS(*inode)--;
diff -urNp sched-ref/include/asm-alpha/system.h sched/include/asm-alpha/system.h
--- sched-ref/include/asm-alpha/system.h	Tue Jan 22 18:54:25 2002
+++ sched/include/asm-alpha/system.h	Sat Jul 6 20:18:11 2002
@@ -130,7 +130,6 @@ struct el_common_EV6_mcheck {
 extern void halt(void) __attribute__((noreturn));
 #define __halt() __asm__ __volatile__ ("call_pal %0 #halt" : : "i" (PAL_halt))
 
-#define prepare_to_switch()	do { } while(0)
 #define switch_to(prev,next,last) \
 do { \
 	unsigned long pcbb; \
diff -urNp sched-ref/include/asm-i386/system.h sched/include/asm-i386/system.h
--- sched-ref/include/asm-i386/system.h	Sun Jun 16 02:04:37 2002
+++ sched/include/asm-i386/system.h	Sat Jul 6 20:18:11 2002
@@ -12,25 +12,22 @@ struct task_struct;	/* one of the stranger aspects of C forward declarations.. */
 extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
 
-#define prepare_to_switch()	do { } while(0)
 #define switch_to(prev,next,last) do {					\
 	asm volatile("pushl %%esi\n\t"					\
 		     "pushl %%edi\n\t"					\
 		     "pushl %%ebp\n\t"					\
 		     "movl %%esp,%0\n\t"	/* save ESP */		\
-		     "movl %3,%%esp\n\t"	/* restore ESP */	\
+		     "movl %2,%%esp\n\t"	/* restore ESP */	\
 		     "movl $1f,%1\n\t"		/* save EIP */		\
-		     "pushl %4\n\t"		/* restore EIP */	\
+		     "pushl %3\n\t"		/* restore EIP */	\
 		     "jmp __switch_to\n"				\
 		     "1:\t"						\
 		     "popl %%ebp\n\t"					\
 		     "popl %%edi\n\t"					\
 		     "popl %%esi\n\t"					\
-		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip),	\
-		      "=b" (last)					\
+		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip)	\
 		     :"m" (next->thread.esp),"m" (next->thread.eip),	\
-		      "a" (prev), "d" (next),				\
-		      "b" (prev));					\
+		      "a" (prev), "d" (next));				\
 } while (0)
 
 #define _set_base(addr,base) do { unsigned long __pr; \
diff -urNp sched-ref/include/asm-sparc64/system.h sched/include/asm-sparc64/system.h
--- sched-ref/include/asm-sparc64/system.h	Fri Jul 5 12:20:55 2002
+++ sched/include/asm-sparc64/system.h	Sat Jul 6 20:18:11 2002
@@ -143,7 +143,11 @@ extern void __flushw_user(void);
 
 #define flush_user_windows flushw_user
 #define flush_register_windows flushw_all
-#define prepare_to_switch flushw_all
+
+#define prepare_arch_schedule(prev)	task_lock(prev)
+#define finish_arch_schedule(prev)	task_unlock(prev)
+#define prepare_arch_switch(rq)	do { spin_unlock(&(rq)->lock); flushw_all(); } while (0)
+#define finish_arch_switch(rq)	__sti()
 
 #ifndef CONFIG_DEBUG_SPINLOCK
 #define CHECK_LOCKS(PREV)	do { } while(0)
diff -urNp sched-ref/include/linux/sched.h sched/include/linux/sched.h
--- sched-ref/include/linux/sched.h	Sat Jul 6 20:17:36 2002
+++ sched/include/linux/sched.h	Sat Jul 6 20:18:11 2002
@@ -78,6 +78,7 @@ extern unsigned long avenrun[];	/* Load
 extern int nr_threads;
 extern int last_pid;
 extern unsigned long nr_running(void);
+extern unsigned long nr_uninterruptible(void);
 
 //#include
 #include
@@ -634,6 +635,7 @@ extern unsigned long prof_shift;
 #define CURRENT_TIME (xtime.tv_sec)
 
 extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr));
+extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
 extern void FASTCALL(sleep_on(wait_queue_head_t *q));
 extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q,
				      signed long timeout));
@@ -649,6 +651,12 @@ extern void FASTCALL(wake_up_forked_proc
 #define wake_up_interruptible(x)	__wake_up((x),TASK_INTERRUPTIBLE, 1)
 #define wake_up_interruptible_nr(x, nr)	__wake_up((x),TASK_INTERRUPTIBLE, nr)
 #define wake_up_interruptible_all(x)	__wake_up((x),TASK_INTERRUPTIBLE, 0)
+#ifdef CONFIG_SMP
+#define wake_up_interruptible_sync(x)	__wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
+#else
+#define wake_up_interruptible_sync(x)	__wake_up((x),TASK_INTERRUPTIBLE, 1)
+#endif
+
 asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
 
 extern int in_group_p(gid_t);
diff -urNp sched-ref/kernel/sched.c sched/kernel/sched.c
--- sched-ref/kernel/sched.c	Sat Jul 6 20:17:53 2002
+++ sched/kernel/sched.c	Sat Jul 6 20:19:52 2002
@@ -13,7 +13,7 @@
  *		hybrid priority-list and round-robin design with
  *		an array-switch method of distributing timeslices
  *		and per-CPU runqueues. Additional code by Davide
- *		Libenzi, Robert Love, and Rusty Russel.
+ *		Libenzi, Robert Love, and Rusty Russell.
  */
 
 #include
@@ -24,6 +24,7 @@
 #include
 #include
 #include
 #include
+#include
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -133,10 +134,10 @@ struct prio_array {
  */
 struct runqueue {
 	spinlock_t lock;
-	spinlock_t frozen;
 	unsigned long nr_running, nr_switches, expired_timestamp;
 	task_t *curr, *idle;
 	prio_array_t *active, *expired, arrays[2];
+	long nr_uninterruptible;
 	int prev_nr_running[NR_CPUS];
 	task_t *migration_thread;
 	list_t migration_queue;
@@ -150,6 +151,21 @@ static struct runqueue runqueues[NR_CPUS
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define rt_task(p)		((p)->prio < MAX_RT_PRIO)
 
+/*
+ * Default context-switch locking:
+ */
+#ifndef prepare_arch_schedule
+# define prepare_arch_schedule(prev)	do { } while(0)
+# define finish_arch_schedule(prev)	do { } while(0)
+# define prepare_arch_switch(rq)	do { } while(0)
+# define finish_arch_switch(rq)		spin_unlock_irq(&(rq)->lock)
+#endif
+
+/*
+ * task_rq_lock - lock the runqueue a given task resides on and disable
+ * interrupts. Note the ordering: we can safely lookup the task_rq without
+ * explicitly disabling preemption.
+ */
 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 {
 	struct runqueue *rq;
@@ -239,18 +255,24 @@ static inline void activate_task(task_t
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
 	rq->nr_running--;
+	if (p->state == TASK_UNINTERRUPTIBLE)
+		rq->nr_uninterruptible++;
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
 
 static inline void resched_task(task_t *p)
 {
+#ifdef CONFIG_SMP
 	int need_resched;
 
 	need_resched = p->need_resched;
 	set_tsk_need_resched(p);
 	if (!need_resched && (p->cpu != smp_processor_id()))
 		smp_send_reschedule(p->cpu);
+#else
+	set_tsk_need_resched(p);
+#endif
 }
 
 #ifdef CONFIG_SMP
@@ -266,9 +288,10 @@ void wait_task_inactive(task_t * p)
 
 repeat:
 	rq = task_rq(p);
-	while (unlikely(rq->curr == p)) {
+	if (unlikely(rq->curr == p)) {
 		cpu_relax();
 		barrier();
+		goto repeat;
 	}
 	rq = task_rq_lock(p, &flags);
 	if (unlikely(rq->curr == p)) {
@@ -302,27 +325,40 @@ void kick_if_running(task_t * p)
  * "current->state = TASK_RUNNING" to mark yourself runnable
  * without the overhead of this.
  */
-static int try_to_wake_up(task_t * p)
+static int try_to_wake_up(task_t * p, int sync)
 {
 	unsigned long flags;
 	int success = 0;
+	long old_state;
 	runqueue_t *rq;
 
+repeat_lock_task:
 	rq = task_rq_lock(p, &flags);
-	p->state = TASK_RUNNING;
+	old_state = p->state;
 	if (!p->array) {
+		if (unlikely(sync && (rq->curr != p))) {
+			if (p->cpu != smp_processor_id()) {
+				p->cpu = smp_processor_id();
+				task_rq_unlock(rq, &flags);
+				goto repeat_lock_task;
+			}
+		}
+		if (old_state == TASK_UNINTERRUPTIBLE)
+			rq->nr_uninterruptible--;
 		activate_task(p, rq);
 		if (p->prio < rq->curr->prio)
 			resched_task(rq->curr);
 		success = 1;
 	}
+	p->state = TASK_RUNNING;
 	task_rq_unlock(rq, &flags);
+
 	return success;
 }
 
 int wake_up_process(task_t * p)
 {
-	return try_to_wake_up(p);
+	return try_to_wake_up(p, 0);
 }
 
 void wake_up_forked_process(task_t * p)
@@ -376,17 +412,16 @@ void sched_exit(task_t * p)
 #if CONFIG_SMP
 asmlinkage void schedule_tail(task_t *prev)
 {
-	spin_unlock_irq(&this_rq()->frozen);
+	finish_arch_switch(this_rq());
+	finish_arch_schedule(prev);
 }
 #endif
 
-static inline void context_switch(task_t *prev, task_t *next)
+static inline task_t * context_switch(task_t *prev, task_t *next)
 {
 	struct mm_struct *mm = next->mm;
 	struct mm_struct *oldmm = prev->active_mm;
 
-	prepare_to_switch();
-
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
@@ -401,6 +436,8 @@ static inline void context_switch(task_t
 
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
+
+	return prev;
 }
 
 unsigned long nr_running(void)
@@ -413,6 +450,17 @@ unsigned long nr_running(void)
 	return sum;
 }
 
+/* Note: the per-cpu information is useful only to get the cumulative result */
+unsigned long nr_uninterruptible(void)
+{
+	unsigned long i, sum = 0;
+
+	for (i = 0; i < smp_num_cpus; i++)
+		sum += cpu_rq(cpu_logical_map(i))->nr_uninterruptible;
+
+	return sum;
+}
+
 unsigned long nr_context_switches(void)
 {
 	unsigned long i, sum = 0;
@@ -733,6 +781,7 @@ need_resched:
 	rq = this_rq();
 
 	release_kernel_lock(prev, smp_processor_id());
+	prepare_arch_schedule(prev);
 	prev->sleep_timestamp = jiffies;
 	spin_lock_irq(&rq->lock);
 
@@ -783,27 +832,19 @@ switch_tasks:
 	if (likely(prev != next)) {
 		rq->nr_switches++;
 		rq->curr = next;
-		spin_lock(&rq->frozen);
-		spin_unlock(&rq->lock);
-
-		context_switch(prev, next);
-
-		/*
-		 * The runqueue pointer might be from another CPU
-		 * if the new task was last running on a different
-		 * CPU - thus re-load it.
-		 */
-		smp_mb();
+
+		prepare_arch_switch(rq);
+		prev = context_switch(prev, next);
+		barrier();
 		rq = this_rq();
-		spin_unlock_irq(&rq->frozen);
-	} else {
+		finish_arch_switch(rq);
+	} else
 		spin_unlock_irq(&rq->lock);
-	}
+	finish_arch_schedule(prev);
 
 	reacquire_kernel_lock(current);
 	if (need_resched())
 		goto need_resched;
-	return;
 }
 
 /*
@@ -815,8 +856,7 @@ switch_tasks:
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-				     int nr_exclusive)
+static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync)
 {
 	struct list_head *tmp;
 	unsigned int state;
@@ -827,7 +867,7 @@ static inline void __wake_up_common(wait
 		curr = list_entry(tmp, wait_queue_t, task_list);
 		p = curr->task;
 		state = p->state;
-		if ((state & mode) && try_to_wake_up(p) &&
+		if ((state & mode) && try_to_wake_up(p, sync) &&
 		    ((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive))
 			break;
 	}
@@ -841,17 +881,36 @@ void __wake_up(wait_queue_head_t *q, uns
 		return;
 
 	wq_read_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive);
+	__wake_up_common(q, mode, nr_exclusive, 0);
 	wq_read_unlock_irqrestore(&q->lock, flags);
 }
 
+#if CONFIG_SMP
+
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+	unsigned long flags;
+
+	if (unlikely(!q))
+		return;
+
+	wq_read_lock_irqsave(&q->lock, flags);
+	if (likely(nr_exclusive))
+		__wake_up_common(q, mode, nr_exclusive, 1);
+	else
+		__wake_up_common(q, mode, nr_exclusive, 0);
+	wq_read_unlock_irqrestore(&q->lock, flags);
+}
+
+#endif
+
 void complete(struct completion *x)
 {
 	unsigned long flags;
 
 	wq_write_lock_irqsave(&x->wait.lock, flags);
 	x->done++;
-	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1);
+	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0);
 	wq_write_unlock_irqrestore(&x->wait.lock, flags);
 }
 
@@ -1028,6 +1087,11 @@ int task_nice(task_t *p)
 	return TASK_NICE(p);
 }
 
+int idle_cpu(int cpu)
+{
+	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
 static inline task_t *find_process_by_pid(pid_t pid)
 {
 	return pid ? find_task_by_pid(pid) : current;
@@ -1078,7 +1142,7 @@ static int setscheduler(pid_t pid, int p
 
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
-	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0.
+	 * 1..MAX_USER_RT_PRIO, valid priority for SCHED_OTHER is 0.
 	 */
 	retval = -EINVAL;
 	if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
@@ -1489,7 +1553,7 @@ static inline void double_rq_unlock(runq
 	spin_unlock(&rq2->lock);
 }
 
-void init_idle(task_t *idle, int cpu)
+void __init init_idle(task_t *idle, int cpu)
 {
 	runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(idle->cpu);
 	unsigned long flags;
@@ -1520,13 +1584,12 @@ void __init sched_init(void)
 	int i, j, k;
 
 	for (i = 0; i < NR_CPUS; i++) {
-		runqueue_t *rq = cpu_rq(i);
 		prio_array_t *array;
 
+		rq = cpu_rq(i);
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		spin_lock_init(&rq->lock);
-		spin_lock_init(&rq->frozen);
 		INIT_LIST_HEAD(&rq->migration_queue);
 
 		for (j = 0; j < 2; j++) {
@@ -1618,7 +1681,7 @@ void set_cpus_allowed(task_t *p, unsigne
 	 * If the task is not on a runqueue, then it is safe to
 	 * simply update the task's cpu field.
 	 */
-	if (!p->array) {
+	if (!p->array && (p != rq->curr)) {
 		p->cpu = __ffs(p->cpus_allowed);
 		task_rq_unlock(rq, &flags);
 		return;
diff -urNp sched-ref/kernel/timer.c sched/kernel/timer.c
--- sched-ref/kernel/timer.c	Sat Jul 6 20:17:35 2002
+++ sched/kernel/timer.c	Sat Jul 6 20:18:14 2002
@@ -608,17 +608,7 @@ void update_process_times(int user_tick)
  */
 static unsigned long count_active_tasks(void)
 {
-	struct task_struct *p;
-	unsigned long nr = 0;
-
-	read_lock(&tasklist_lock);
-	for_each_task(p) {
-		if ((p->state == TASK_RUNNING ||
-		     (p->state & TASK_UNINTERRUPTIBLE)))
-			nr += FIXED_1;
-	}
-	read_unlock(&tasklist_lock);
-	return nr;
+	return (nr_running() + nr_uninterruptible()) * FIXED_1;
 }
 
 /*