diff -urN numa_sched-ref/arch/alpha/config.in numa_sched/arch/alpha/config.in
--- numa_sched-ref/arch/alpha/config.in	Sun Oct 7 00:58:42 2001
+++ numa_sched/arch/alpha/config.in	Sun Oct 7 00:58:53 2001
@@ -219,6 +219,9 @@
 	bool 'Discontiguous Memory Support' CONFIG_DISCONTIGMEM
 	if [ "$CONFIG_DISCONTIGMEM" = "y" ]; then
 		bool ' NUMA Support' CONFIG_NUMA
+		if [ "$CONFIG_NUMA" = "y" ]; then
+			bool ' NUMA Scheduler Support' CONFIG_NUMA_SCHED
+		fi
 	fi
 fi
 
diff -urN numa_sched-ref/arch/alpha/kernel/entry.S numa_sched/arch/alpha/kernel/entry.S
--- numa_sched-ref/arch/alpha/kernel/entry.S	Sun Oct 7 00:58:41 2001
+++ numa_sched/arch/alpha/kernel/entry.S	Sun Oct 7 00:58:53 2001
@@ -35,7 +35,7 @@
 #define TASK_EXEC_DOMAIN	32
 #define TASK_NEED_RESCHED	40
 #define TASK_PTRACE		48
-#define TASK_PROCESSOR		100
+#define TASK_PROCESSOR		84
 
 /*
  * task flags (must match include/linux/sched.h):
diff -urN numa_sched-ref/include/asm-alpha/mmzone.h numa_sched/include/asm-alpha/mmzone.h
--- numa_sched-ref/include/asm-alpha/mmzone.h	Sat May 26 04:03:47 2001
+++ numa_sched/include/asm-alpha/mmzone.h	Sun Oct 7 00:58:53 2001
@@ -21,7 +21,7 @@
 #ifdef NOTYET
 	kern_vars_t	kern_vars;
 #endif
-#if defined(CONFIG_NUMA) && defined(CONFIG_NUMA_SCHED)
+#ifdef CONFIG_NUMA_SCHED
 	struct numa_schedule_data schedule_data;
 #endif
 } plat_pg_data_t;
diff -urN numa_sched-ref/include/asm-alpha/timex.h numa_sched/include/asm-alpha/timex.h
--- numa_sched-ref/include/asm-alpha/timex.h	Tue Dec 29 22:56:15 1998
+++ numa_sched/include/asm-alpha/timex.h	Sun Oct 7 00:58:53 2001
@@ -27,4 +27,8 @@
 	return ret;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa_sched-ref/include/asm-arm/timex.h numa_sched/include/asm-arm/timex.h
--- numa_sched-ref/include/asm-arm/timex.h	Thu Nov 16 15:37:33 2000
+++ numa_sched/include/asm-arm/timex.h	Sun Oct 7 00:58:53 2001
@@ -23,4 +23,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa_sched-ref/include/asm-cris/timex.h numa_sched/include/asm-cris/timex.h
--- numa_sched-ref/include/asm-cris/timex.h	Sat May 26 04:03:47 2001
+++ numa_sched/include/asm-cris/timex.h	Sun Oct 7 00:58:53 2001
@@ -20,4 +20,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa_sched-ref/include/asm-i386/timex.h numa_sched/include/asm-i386/timex.h
--- numa_sched-ref/include/asm-i386/timex.h	Sun Sep 23 21:11:40 2001
+++ numa_sched/include/asm-i386/timex.h	Sun Oct 7 00:58:53 2001
@@ -47,4 +47,8 @@
 
 extern unsigned long cpu_khz;
 
+typedef cycles_t last_schedule_t;
+#define get_last_schedule() ({ get_cycles(); })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa_sched-ref/include/asm-ia64/timex.h numa_sched/include/asm-ia64/timex.h
--- numa_sched-ref/include/asm-ia64/timex.h	Tue May 1 19:35:31 2001
+++ numa_sched/include/asm-ia64/timex.h	Sun Oct 7 00:58:53 2001
@@ -21,4 +21,8 @@
 	return ret;
 }
 
+typedef cycles_t last_schedule_t;
+#define get_last_schedule() ({ get_cycles(); })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif /* _ASM_IA64_TIMEX_H */
diff -urN numa_sched-ref/include/asm-m68k/timex.h numa_sched/include/asm-m68k/timex.h
--- numa_sched-ref/include/asm-m68k/timex.h	Tue Jan 5 20:20:43 1999
+++ numa_sched/include/asm-m68k/timex.h	Sun Oct 7 00:58:53 2001
@@ -19,4 +19,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa_sched-ref/include/asm-mips/timex.h numa_sched/include/asm-mips/timex.h
--- numa_sched-ref/include/asm-mips/timex.h	Sat May 13 17:31:25 2000
+++ numa_sched/include/asm-mips/timex.h	Sun Oct 7 00:58:53 2001
@@ -36,6 +36,11 @@
 {
 	return read_32bit_cp0_register(CP0_COUNT);
 }
+
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASM_MIPS_TIMEX_H */
diff -urN numa_sched-ref/include/asm-mips64/timex.h numa_sched/include/asm-mips64/timex.h
--- numa_sched-ref/include/asm-mips64/timex.h	Sun Sep 23 21:11:41 2001
+++ numa_sched/include/asm-mips64/timex.h	Sun Oct 7 00:58:53 2001
@@ -43,4 +43,8 @@
 	return val;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif /* _ASM_TIMEX_H */
diff -urN numa_sched-ref/include/asm-parisc/timex.h numa_sched/include/asm-parisc/timex.h
--- numa_sched-ref/include/asm-parisc/timex.h	Thu Dec 14 22:34:13 2000
+++ numa_sched/include/asm-parisc/timex.h	Sun Oct 7 00:58:53 2001
@@ -18,4 +18,8 @@
 	return mfctl(16);
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa_sched-ref/include/asm-ppc/timex.h numa_sched/include/asm-ppc/timex.h
--- numa_sched-ref/include/asm-ppc/timex.h	Sun Sep 23 21:11:41 2001
+++ numa_sched/include/asm-ppc/timex.h	Sun Oct 7 00:58:53 2001
@@ -45,5 +45,9 @@
 	return ret;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
 #endif /* __KERNEL__ */
diff -urN numa_sched-ref/include/asm-s390/timex.h numa_sched/include/asm-s390/timex.h
--- numa_sched-ref/include/asm-s390/timex.h	Fri May 12 20:41:44 2000
+++ numa_sched/include/asm-s390/timex.h	Sun Oct 7 00:58:53 2001
@@ -26,4 +26,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa_sched-ref/include/asm-s390x/timex.h numa_sched/include/asm-s390x/timex.h
--- numa_sched-ref/include/asm-s390x/timex.h	Thu Feb 22 03:45:11 2001
+++ numa_sched/include/asm-s390x/timex.h	Sun Oct 7 00:58:53 2001
@@ -26,4 +26,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa_sched-ref/include/asm-sh/timex.h numa_sched/include/asm-sh/timex.h
--- numa_sched-ref/include/asm-sh/timex.h	Fri Jan 5 02:19:29 2001
+++ numa_sched/include/asm-sh/timex.h	Sun Oct 7 00:58:53 2001
@@ -21,4 +21,8 @@
 	return 0;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif /* __ASM_SH_TIMEX_H */
diff -urN numa_sched-ref/include/asm-sparc/timex.h numa_sched/include/asm-sparc/timex.h
--- numa_sched-ref/include/asm-sparc/timex.h	Thu Mar 11 01:53:37 1999
+++ numa_sched/include/asm-sparc/timex.h	Sun Oct 7 00:58:53 2001
@@ -17,4 +17,8 @@
 extern cycles_t cacheflush_time;
 #define get_cycles() (0)
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa_sched-ref/include/asm-sparc64/timex.h numa_sched/include/asm-sparc64/timex.h
--- numa_sched-ref/include/asm-sparc64/timex.h	Sun Sep 23 21:11:42 2001
+++ numa_sched/include/asm-sparc64/timex.h	Sun Oct 7 00:58:53 2001
@@ -20,4 +20,8 @@
 	ret; \
 })
 
+typedef cycles_t last_schedule_t;
+#define get_last_schedule() ({ get_cycles(); })
+#define last_schedule_before(a, b) ({ a < b; })
+
 #endif
diff -urN numa_sched-ref/include/linux/numa_sched.h numa_sched/include/linux/numa_sched.h
--- numa_sched-ref/include/linux/numa_sched.h	Thu Jan 1 01:00:00 1970
+++ numa_sched/include/linux/numa_sched.h	Sun Oct 7 00:58:53 2001
@@ -0,0 +1,53 @@
+/*
+ * linux/include/linux/numa_sched.h
+ *
+ * NUMA based scheduler
+ */
+
+#ifndef _LINUX_NUMA_SCHED_H
+#define _LINUX_NUMA_SCHED_H
+
+#ifdef CONFIG_NUMA_SCHED
+#include
+#include
+#include
+#include
+
+struct numa_per_cpu_schedule_data {
+	struct task_struct * curr;
+	last_schedule_t last_schedule;
+};
+
+struct numa_schedule_data {
+	struct numa_per_cpu_schedule_data per_cpu[NR_CPUS] ____cacheline_aligned;
+	struct list_head runqueue_head;
+	int nr_running, nr_threads;
+};
+
+#define numa_nr_running_inc() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_running++; } while(0)
+#define numa_nr_running_dec() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_running--; } while(0)
+#define numa_nr_running(nid) (NODE_SCHEDULE_DATA(nid)->nr_running)
+
+#define numa_nr_threads_inc() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_threads++; } while(0)
+#define numa_nr_threads_dec() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_threads--; } while(0)
+#define numa_nr_threads(nid) (NODE_SCHEDULE_DATA(nid)->nr_threads)
+
+#define cpu_curr(cpu) (NODE_SCHEDULE_DATA(cputonode(cpu))->per_cpu[(cpu)].curr)
+#define last_schedule(cpu) (NODE_SCHEDULE_DATA(cputonode(cpu))->per_cpu[(cpu)].last_schedule)
+
+#define numa_runqueue_head(x) (&NODE_SCHEDULE_DATA(x)->runqueue_head)
+
+#else /* CONFIG_NUMA_SCHED */
+
+#define numa_nr_running_inc() do { } while(0)
+#define numa_nr_running_dec() do { } while(0)
+#define numa_nr_threads_inc() do { } while(0)
+#define numa_nr_threads_dec() do { } while(0)
+
+#define cpu_curr(cpu) (aligned_data[(cpu)].schedule_data.curr)
+#define last_schedule(cpu) (aligned_data[(cpu)].schedule_data.last_schedule)
+
+#define numa_runqueue_head(x) (&runqueue_head)
+#endif /* CONFIG_NUMA_SCHED */
+
+#endif /* _LINUX_NUMA_SCHED_H */
diff -urN numa_sched-ref/include/linux/sched.h numa_sched/include/linux/sched.h
--- numa_sched-ref/include/linux/sched.h	Sun Oct 7 00:58:42 2001
+++ numa_sched/include/linux/sched.h	Sun Oct 7 00:58:53 2001
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 
 struct exec_domain;
 
@@ -300,9 +301,9 @@
 	 * all fields in a single cacheline that are needed for
	 * the goodness() loop in schedule().
 	 */
-	long counter;
-	long nice;
-	unsigned long policy;
+	int counter;
+	int nice;
+	unsigned int policy;
 	struct mm_struct *mm;
 	int has_cpu, processor;
 	unsigned long cpus_allowed;
@@ -311,8 +312,9 @@
 	 * that's just fine.)
 	 */
 	struct list_head run_list;
-	unsigned long sleep_time;
-
+#ifdef CONFIG_NUMA_SCHED
+	int nid;
+#endif
 	struct task_struct *next_task, *prev_task;
 	struct mm_struct *active_mm;
 	struct rw_sem_recursor mm_recursor;
@@ -462,7 +464,7 @@
     mm:			NULL,						\
     active_mm:		&init_mm,					\
     mm_recursor:	RWSEM_RECURSOR_INITIALIZER,			\
-    cpus_allowed:	-1,						\
+    cpus_allowed:	-1UL,						\
     run_list:		LIST_HEAD_INIT(tsk.run_list),			\
    next_task:		&tsk,						\
     prev_task:		&tsk,						\
@@ -767,6 +769,30 @@
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
+#define nr_running_inc()		\
+do {					\
+	numa_nr_running_inc();		\
+	nr_running++;			\
+} while (0)
+
+#define nr_running_dec()		\
+do {					\
+	numa_nr_running_dec();		\
+	nr_running--;			\
+} while (0)
+
+#define nr_threads_inc()		\
+do {					\
+	numa_nr_threads_inc();		\
+	nr_threads++;			\
+} while (0)
+
+#define nr_threads_dec()		\
+do {					\
+	numa_nr_threads_dec();		\
+	nr_threads--;			\
+} while (0)
+
 #define __wait_event(wq, condition)					\
 do {									\
 	wait_queue_t __wait;						\
@@ -847,29 +873,28 @@
 #define next_thread(p) \
 	list_entry((p)->thread_group.next, struct task_struct, thread_group)
 
-static inline void del_from_runqueue(struct task_struct * p)
-{
-	nr_running--;
-	p->sleep_time = jiffies;
-	list_del(&p->run_list);
-	p->run_list.next = NULL;
-}
+#define del_from_runqueue(p)		\
+do {					\
+	nr_running_dec();		\
+	list_del(&(p)->run_list);	\
+	(p)->run_list.next = NULL;	\
+} while(0)
 
 static inline int task_on_runqueue(struct task_struct *p)
 {
 	return (p->run_list.next != NULL);
 }
 
-static inline void unhash_process(struct task_struct *p)
-{
-	if (task_on_runqueue(p)) BUG();
-	write_lock_irq(&tasklist_lock);
-	nr_threads--;
-	unhash_pid(p);
-	REMOVE_LINKS(p);
-	list_del(&p->thread_group);
-	write_unlock_irq(&tasklist_lock);
-}
+#define unhash_process(p)			\
+do {						\
+	if (task_on_runqueue(p)) BUG();		\
+	write_lock_irq(&tasklist_lock);		\
+	nr_threads_dec();			\
+	unhash_pid(p);				\
+	REMOVE_LINKS(p);			\
+	list_del(&(p)->thread_group);		\
+	write_unlock_irq(&tasklist_lock);	\
+} while(0)
 
 /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */
 static inline void task_lock(struct task_struct *p)
diff -urN numa_sched-ref/kernel/fork.c numa_sched/kernel/fork.c
--- numa_sched-ref/kernel/fork.c	Sun Sep 23 21:11:43 2001
+++ numa_sched/kernel/fork.c	Sun Oct 7 00:58:53 2001
@@ -639,7 +639,6 @@
 {
 	int i;
 	p->has_cpu = 0;
-	p->processor = current->processor;
 	/* ?? should we just memset this ?? */
 	for(i = 0; i < smp_num_cpus; i++)
 		p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
@@ -716,7 +715,7 @@
 
 	SET_LINKS(p);
 	hash_pid(p);
-	nr_threads++;
+	nr_threads_inc();
 	write_unlock_irq(&tasklist_lock);
 
 	if (p->ptrace & PT_PTRACED)
diff -urN numa_sched-ref/kernel/sched.c numa_sched/kernel/sched.c
--- numa_sched-ref/kernel/sched.c	Fri Oct 5 06:04:05 2001
+++ numa_sched/kernel/sched.c	Sun Oct 7 00:59:38 2001
@@ -10,6 +10,7 @@
  *  1998-11-19	Implemented schedule_timeout() and related stuff
  *		by Andrea Arcangeli
  *  1998-12-28	Implemented better SMP scheduling by Ingo Molnar
+ *  2001-01-29	first NUMA scheduler attempt by Andrea Arcangeli, SuSE
  */
 
 /*
@@ -91,6 +92,8 @@
 spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
 
+#ifndef CONFIG_NUMA_SCHED
+
 static LIST_HEAD(runqueue_head);
 
 /*
@@ -100,13 +103,26 @@
 static union {
 	struct schedule_data {
 		struct task_struct * curr;
-		cycles_t last_schedule;
+		last_schedule_t last_schedule;
 	} schedule_data;
 	char __pad [SMP_CACHE_BYTES];
 } aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
 
-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+#define init_numa_schedule_data() do { } while(0)
+
+#else /* CONFIG_NUMA_SCHED */
+
+static void __init init_numa_schedule_data(void)
+{
+	int i;
+
+	for (i = 0; i < numnodes; i++) {
+		INIT_LIST_HEAD(&NODE_SCHEDULE_DATA(i)->runqueue_head);
+		NODE_SCHEDULE_DATA(i)->nr_running = 0;
+		NODE_SCHEDULE_DATA(i)->nr_threads = 0;
+	}
+}
+#endif /* CONFIG_NUMA_SCHED */
 
 struct kernel_stat kstat;
 extern struct task_struct *child_reaper;
@@ -114,8 +130,9 @@
 
 #ifdef CONFIG_SMP
 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
+#define logical_idle_task(cpu) (init_tasks[cpu])
 #define can_schedule(p,cpu) ((!(p)->has_cpu) && \
-				((p)->cpus_allowed & (1 << cpu)))
+				((p)->cpus_allowed & (1UL << cpu)))
 
 #else
 
@@ -213,8 +230,8 @@
 #ifdef CONFIG_SMP
 	int this_cpu = smp_processor_id();
 	struct task_struct *tsk, *target_tsk;
-	int cpu, best_cpu, i, max_prio;
-	cycles_t oldest_idle;
+	int cpu, best_cpu, i, max_prio, found_idle;
+	last_schedule_t oldest_idle;
 
 	/*
 	 * shortcut if the woken up task's last CPU is
@@ -222,17 +239,17 @@
 	 */
 	best_cpu = p->processor;
 	if (can_schedule(p, best_cpu)) {
-		tsk = idle_task(best_cpu);
-		if (cpu_curr(best_cpu) == tsk) {
-			int need_resched;
+		target_tsk = idle_task(best_cpu);
+		if (cpu_curr(best_cpu) == target_tsk) {
+			long need_resched;
 send_now_idle:
 			/*
 			 * If need_resched == -1 then we can skip sending
 			 * the IPI altogether, tsk->need_resched is
 			 * actively watched by the idle thread.
 			 */
-			need_resched = tsk->need_resched;
-			tsk->need_resched = 1;
+			need_resched = target_tsk->need_resched;
+			target_tsk->need_resched = 1;
 			if ((best_cpu != this_cpu) && !need_resched)
 				smp_send_reschedule(best_cpu);
 			return;
@@ -246,13 +263,17 @@
 	 * one will have the least active cache context.) Also find
 	 * the executing process which has the least priority.
 	 */
-	oldest_idle = (cycles_t) -1;
 	target_tsk = NULL;
 	max_prio = 0;
+	found_idle = 0;
 
 	for (i = 0; i < smp_num_cpus; i++) {
 		cpu = cpu_logical_map(i);
-		if (!can_schedule(p, cpu))
+		if (
+#ifdef CONFIG_NUMA_SCHED
+		    cputonode(cpu) != p->nid ||
+#endif
+		    !can_schedule(p, cpu))
 			continue;
 		tsk = cpu_curr(cpu);
 		/*
@@ -260,13 +281,14 @@
 		 * a priority list between idle CPUs, but this is not
 		 * a problem.
 		 */
-		if (tsk == idle_task(cpu)) {
-			if (last_schedule(cpu) < oldest_idle) {
+		if (tsk == logical_idle_task(i)) {
+			if (!found_idle || last_schedule_before(last_schedule(cpu), oldest_idle)) {
 				oldest_idle = last_schedule(cpu);
 				target_tsk = tsk;
+				found_idle = 1;
 			}
 		} else {
-			if (oldest_idle == -1ULL) {
+			if (!found_idle) {
 				int prio = preemption_goodness(tsk, p, cpu);
 
 				if (prio > max_prio) {
@@ -276,15 +298,33 @@
 			}
 		}
 	}
-	tsk = target_tsk;
-	if (tsk) {
-		if (oldest_idle != -1ULL) {
-			best_cpu = tsk->processor;
-			goto send_now_idle;
+
+#ifdef CONFIG_NUMA_SCHED
+	if (!target_tsk)
+		/* Make sure to use the idle cpus in the other nodes */
+		for (i = 0; i < smp_num_cpus; i++) {
+			cpu = cpu_logical_map(i);
+			if (cputonode(cpu) == p->nid || !can_schedule(p, cpu))
+				continue;
+			tsk = cpu_curr(cpu);
+			if (tsk == logical_idle_task(i)) {
+				if (!found_idle || last_schedule_before(last_schedule(cpu), oldest_idle)) {
+					oldest_idle = last_schedule(cpu);
+					target_tsk = tsk;
+					found_idle = 1;
+					target_tsk->nid = cputonode(cpu);
+				}
+			}
 		}
-		tsk->need_resched = 1;
-		if (tsk->processor != this_cpu)
-			smp_send_reschedule(tsk->processor);
+#endif
+
+	if (target_tsk) {
+		best_cpu = target_tsk->processor;
+		if (found_idle)
+			goto send_now_idle;
+		target_tsk->need_resched = 1;
+		if (best_cpu != this_cpu)
+			smp_send_reschedule(best_cpu);
 	}
 	return;
 
@@ -308,20 +348,20 @@
  */
 static inline void add_to_runqueue(struct task_struct * p)
 {
-	list_add(&p->run_list, &runqueue_head);
-	nr_running++;
+	list_add(&p->run_list, numa_runqueue_head(p->nid));
+	nr_running_inc();
 }
 
 static inline void move_last_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add_tail(&p->run_list, &runqueue_head);
+	list_add_tail(&p->run_list, numa_runqueue_head(p->nid));
 }
 
 static inline void move_first_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add(&p->run_list, &runqueue_head);
+	list_add(&p->run_list, numa_runqueue_head(p->nid));
 }
 
 /*
@@ -344,9 +384,9 @@
 	p->state = TASK_RUNNING;
 	if (task_on_runqueue(p))
 		goto out;
-	add_to_runqueue(p);
 	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
 		reschedule_idle(p);
+	add_to_runqueue(p);
 	success = 1;
 out:
 	spin_unlock_irqrestore(&runqueue_lock, flags);
@@ -532,10 +572,12 @@
  */
 asmlinkage void schedule(void)
 {
-	struct schedule_data * sched_data;
 	struct task_struct *prev, *next, *p;
 	struct list_head *tmp;
 	int this_cpu, c;
+#ifdef CONFIG_NUMA_SCHED
+	int recalculate_all;
+#endif
 
 	spin_lock_prefetch(&runqueue_lock);
 
@@ -550,12 +592,6 @@
 
 	release_kernel_lock(prev, this_cpu);
 
-	/*
-	 * 'sched_data' is protected by the fact that we can run
-	 * only one process per CPU.
-	 */
-	sched_data = & aligned_data[this_cpu].schedule_data;
-
 	spin_lock_irq(&runqueue_lock);
 
 	/* move an exhausted RR process to be last.. */
@@ -589,7 +625,7 @@
 		goto still_running;
 
 still_running_back:
-	list_for_each(tmp, &runqueue_head) {
+	list_for_each(tmp, numa_runqueue_head(numa_node_id())) {
 		p = list_entry(tmp, struct task_struct, run_list);
 		if (can_schedule(p, this_cpu)) {
 			int weight = goodness(p, this_cpu, prev->active_mm);
@@ -598,6 +634,27 @@
 		}
 	}
 
+#ifdef CONFIG_NUMA_SCHED
+	recalculate_all = 0;
+	if (c < 0) {
+		int nid;
+
+		recalculate_all = 1;
+		for (nid = 0; nid < numnodes; nid++) {
+			if (nid == numa_node_id())
+				continue;
+			list_for_each(tmp, numa_runqueue_head(nid)) {
+				p = list_entry(tmp, struct task_struct, run_list);
+				if (can_schedule(p, this_cpu)) {
+					int weight = goodness(p, this_cpu, prev->active_mm);
+					if (weight > c)
+						c = weight, next = p;
+				}
+			}
+		}
+	}
+#endif
+
 	/* Do we need to re-calculate counters? */
 	if (!c)
 		goto recalculate;
@@ -606,10 +663,16 @@
 	 * switching to the next task, save this fact in
 	 * sched_data.
 	 */
-	sched_data->curr = next;
+	cpu_curr(this_cpu) = next;
 #ifdef CONFIG_SMP
 	next->has_cpu = 1;
 	next->processor = this_cpu;
+#ifdef CONFIG_NUMA_SCHED
+	if (next != idle_task(this_cpu) && next->nid != numa_node_id()) {
+		next->nid = numa_node_id();
+		move_last_runqueue(next);
+	}
+#endif
 #endif
 	spin_unlock_irq(&runqueue_lock);
 
@@ -627,7 +690,7 @@
 	 * and it's approximate, so we do not have to maintain
 	 * it while holding the runqueue spinlock.
 	 */
-	sched_data->last_schedule = get_cycles();
+	last_schedule(this_cpu) = get_last_schedule();
 
 	/*
 	 * We drop the scheduler lock early (it's a global spinlock),
@@ -686,8 +749,13 @@
 		struct task_struct *p;
 		spin_unlock_irq(&runqueue_lock);
 		read_lock(&tasklist_lock);
-		for_each_task(p)
+		for_each_task(p) {
+#ifdef CONFIG_NUMA_SCHED
+			if (!recalculate_all && p->nid != numa_node_id())
+				continue;
+#endif
 			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
+		}
 		read_unlock(&tasklist_lock);
 		spin_lock_irq(&runqueue_lock);
 	}
@@ -1056,8 +1124,7 @@
 
 	// Subtract non-idle processes running on other CPUs.
 	for (i = 0; i < smp_num_cpus; i++) {
-		int cpu = cpu_logical_map(i);
-		if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
+		if (cpu_curr(i) != logical_idle_task(i))
 			nr_pending--;
 	}
 #else
@@ -1313,16 +1380,15 @@
 
 void __init init_idle(void)
 {
-	struct schedule_data * sched_data;
-	sched_data = &aligned_data[smp_processor_id()].schedule_data;
+	int cpu = smp_processor_id();
 
 	if (current != &init_task && task_on_runqueue(current)) {
 		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
 			smp_processor_id(), current->pid);
 		del_from_runqueue(current);
 	}
-	sched_data->curr = current;
-	sched_data->last_schedule = get_cycles();
+	cpu_curr(cpu) = current;
+	last_schedule(cpu) = get_last_schedule();
 	clear_bit(current->processor, &wait_init_idle);
 }
 
@@ -1353,4 +1419,6 @@
 	 */
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current, cpu);
+
+	init_numa_schedule_data();
 }
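
A note on the found_idle flag (this is not part of the patch, only an illustration): the old reschedule_idle() used oldest_idle = (cycles_t) -1 as a "no idle CPU seen yet" sentinel. With the per-arch last_schedule_t abstraction introduced above, the timestamp may be a plain signed long built on jiffies (see the asm-*/timex.h hunks), where an all-ones sentinel is presumably no longer a safe marker, so the scan carries an explicit flag instead. Below is a minimal userspace sketch of that selection loop; the CPU count and the sample timestamps are invented for the example.

#include <stdio.h>

typedef long last_schedule_t;			/* jiffies-based variant */
#define last_schedule_before(a, b)	((a) < (b))

int main(void)
{
	/* hypothetical per-CPU state: idle flag and last-schedule timestamp */
	int cpu_is_idle[4] = { 0, 1, 0, 1 };
	last_schedule_t last[4] = { 900, 250, 700, 400 };

	last_schedule_t oldest_idle = 0;
	int found_idle = 0, target_cpu = -1;
	int cpu;

	for (cpu = 0; cpu < 4; cpu++) {
		if (!cpu_is_idle[cpu])
			continue;
		/* first idle CPU always wins; later ones only if idle longer */
		if (!found_idle || last_schedule_before(last[cpu], oldest_idle)) {
			oldest_idle = last[cpu];
			target_cpu = cpu;
			found_idle = 1;
		}
	}

	if (found_idle)
		printf("wake target: cpu %d (last scheduled at tick %ld)\n",
		       target_cpu, oldest_idle);
	else
		printf("no idle cpu found\n");
	return 0;
}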