From: Nick Piggin

The patch removes the interactivity estimator. It introduces a priority
calculator which adapts quickly to changes in running patterns. It completely
changes timeslice allocation.

Previously a timeslice would be allocated based solely on a process' nice
level - ~200ms for -20, 10ms for 19. Timeslices are now based only on
priority (however, nice level directly affects priority). You'll have to read
task_timeslice to get a proper picture of how it works, but here is an
(inaccurate) example:

  Two high priority processes are running: they'll each get a 25ms timeslice.
  Two low priority processes become runnable: they'll each get a 5ms timeslice.
  High priority processes sleep: the low prio processes now get 100ms timeslices.

/proc/sys/kernel/base_timeslice - a scaling factor for the timeslice
calculation. While testing, try lowering this value if interactivity is bad,
or raising it if efficiency drops.

For good interactivity in X, the X server should be reniced to about -10.
The patch contains a hack to do this for you because you will forget.

Signed-off-by: Andrew Morton

---

 25-akpm/fs/proc/array.c           |    5 
 25-akpm/include/linux/init_task.h |    5 
 25-akpm/include/linux/sched.h     |   11 
 25-akpm/include/linux/sysctl.h    |    1 
 25-akpm/kernel/sched.c            |  861 ++++++++++++++------------------------
 25-akpm/kernel/sysctl.c           |   16 
 25-akpm/mm/oom_kill.c             |    7 
 7 files changed, 367 insertions(+), 539 deletions(-)

diff -puN fs/proc/array.c~nicksched fs/proc/array.c --- 25/fs/proc/array.c~nicksched 2004-08-21 23:49:55.321130800 -0700 +++ 25-akpm/fs/proc/array.c 2004-08-21 23:49:55.334128824 -0700 @@ -159,7 +159,8 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" + "sleep_time:\t%lu\n" + "total_time:\t%lu\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -167,7 +168,7 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), + p->sleep_time, p->total_time, p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0,
diff -puN include/linux/init_task.h~nicksched include/linux/init_task.h --- 25/include/linux/init_task.h~nicksched 2004-08-21 23:49:55.322130648 -0700 +++ 25-akpm/include/linux/init_task.h 2004-08-21 23:49:55.334128824 -0700 @@ -71,14 +71,13 @@ extern struct group_info init_groups; .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ + .prio = MAX_PRIO-29, \ + .static_prio = MAX_PRIO-29, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \
diff -puN include/linux/sched.h~nicksched include/linux/sched.h --- 25/include/linux/sched.h~nicksched 2004-08-21 23:49:55.324130344 -0700 +++ 25-akpm/include/linux/sched.h 2004-08-21 23:49:55.335128672 -0700 @@ -298,7 +298,7 @@ struct signal_struct { #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) +#define MAX_PRIO (MAX_RT_PRIO + 59) #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) @@ -414,14 +414,15 @@ struct task_struct { struct list_head run_list; prio_array_t *array; - unsigned long sleep_avg; - long interactive_credit; + /* Scheduler variables follow.
kernel/sched.c */ + unsigned long array_sequence; unsigned long long timestamp; - int activated; + int used_slice; + + unsigned long total_time, sleep_time; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; diff -puN include/linux/sysctl.h~nicksched include/linux/sysctl.h --- 25/include/linux/sysctl.h~nicksched 2004-08-21 23:49:55.325130192 -0700 +++ 25-akpm/include/linux/sysctl.h 2004-08-21 23:49:55.336128520 -0700 @@ -134,6 +134,7 @@ enum KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */ KERN_HZ_TIMER=65, /* int: hz timer on or off */ KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ + KERN_SCHED_TIMESLICE=67, /* int: base timeslice for scheduler */ }; diff -puN kernel/sched.c~nicksched kernel/sched.c --- 25/kernel/sched.c~nicksched 2004-08-21 23:49:55.327129888 -0700 +++ 25-akpm/kernel/sched.c 2004-08-21 23:49:55.345127152 -0700 @@ -47,139 +47,74 @@ #include -#ifdef CONFIG_NUMA -#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) -#else -#define cpu_to_node_mask(cpu) (cpu_online_map) -#endif - /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 30) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 30) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. + * it's a [ 0 ... 58 ] range. */ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#define US_TO_JIFFIES(x) ((x) * HZ / 1000000) +#define JIFFIES_TO_US(x) ((x) * 1000000 / HZ) /* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. - * Timeslices get refilled after they expire. - */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) -#define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define CREDIT_LIMIT 100 - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 
- * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) + * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there + * is a maximum priority process runnable. MAX_TIMESLICE is derived from the + * formula in task_timeslice. It cannot be changed here. It is the timesilce + * that the maximum priority process will get. Larger timeslices are attainable + * by low priority processes however. + */ +int sched_base_timeslice = 64; +int sched_min_base = 1; +int sched_max_base = 10000; -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) +#define RT_TIMESLICE (50 * 1000 / HZ) /* 50ms */ +#define BASE_TIMESLICE (sched_base_timeslice) +#define MIN_TIMESLICE 1 -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) +/* Maximum amount of history that will be used to calculate priority */ +#define MAX_SLEEP_SHIFT 19 +#define MAX_SLEEP (1UL << MAX_SLEEP_SHIFT) /* roughly 0.52s */ -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +/* + * Maximum effect that 1 block of activity (run/sleep/etc) can have. This is + * will moderate dicard freak events (eg. SIGSTOP) + */ +#define MAX_SLEEP_AFFECT (MAX_SLEEP/4) -#define HIGH_CREDIT(p) \ - ((p)->interactive_credit > CREDIT_LIMIT) +/* + * The amount of history can be decreased (on fork for example). This puts a + * lower bound on it. + */ +#define MIN_HISTORY (MAX_SLEEP/8) -#define LOW_CREDIT(p) \ - ((p)->interactive_credit < -CREDIT_LIMIT) +#define FORKED_TS_MAX (US_TO_JIFFIES(MIN_HISTORY) ?: 1) -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) +/* + * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. + * In particular: total_time, sleep_time, sleep_avg. + */ +#define SLEEP_FACTOR 1024 /* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. 
+ * The scheduler classifies a process as performing one of the following + * activities */ +#define STIME_SLEEP 1 /* Sleeping */ +#define STIME_RUN 2 /* Using CPU */ -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) +#define TASK_PREEMPTS_CURR(p, rq) ( (p)->prio < (rq)->curr->prio ) -static unsigned int task_timeslice(task_t *p) -{ - if (p->static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); -} #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) enum idle_type @@ -201,6 +136,7 @@ struct sched_domain; typedef struct runqueue runqueue_t; struct prio_array { + int min_prio; unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; @@ -224,16 +160,17 @@ struct runqueue { #ifdef CONFIG_SMP unsigned long cpu_load; #endif + unsigned long array_sequence; + unsigned long nr_uninterruptible; unsigned long long nr_switches; - unsigned long expired_timestamp, nr_uninterruptible; - unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio; atomic_t nr_iowait; + prio_array_t *active, *expired, arrays[2]; #ifdef CONFIG_SMP + unsigned long long timestamp_last_tick; + struct sched_domain *sd; /* For active balancing */ @@ -387,7 +324,7 @@ struct sched_domain { .max_interval = 4, \ .busy_factor = 64, \ .imbalance_pct = 125, \ - .cache_hot_time = (5*1000000/2), \ + .cache_hot_time = (5*1000/2), \ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ .flags = SD_BALANCE_NEWIDLE \ @@ -409,7 +346,7 @@ struct sched_domain { .max_interval = 32, \ .busy_factor = 32, \ .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ + .cache_hot_time = (10*1000), \ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ .flags = SD_BALANCE_EXEC \ @@ -563,20 +500,6 @@ struct file_operations proc_schedstat_op # define schedstat_add(rq, field, amt) do { } while (0); #endif -/* - * rq_lock - lock a given runqueue and disable interrupts. - */ -static runqueue_t *this_rq_lock(void) -{ - runqueue_t *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - static inline void rq_unlock(runqueue_t *rq) { spin_unlock_irq(&rq->lock); @@ -701,8 +624,18 @@ static void dequeue_task(struct task_str static void enqueue_task(struct task_struct *p, prio_array_t *array) { + struct list_head *entry = array->queue + p->prio; sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); + + if (!rt_task(p)) { + /* + * Cycle tasks on the same priority level. This reduces their + * timeslice fluctuations due to higher priority tasks expiring. + */ + if (!list_empty(entry)) + entry = entry->next; + } + list_add_tail(&p->run_list, entry); __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; @@ -721,44 +654,122 @@ static inline void enqueue_task_head(str p->array = array; } +static inline unsigned long long clock_us(void) +{ + return sched_clock() >> 10; +} + /* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 
+ * add_task_time updates a task @p after @time of doing the specified @type + * of activity. See STIME_*. This is used for priority calculation. + */ +static inline void add_task_time(task_t *p, unsigned long long time, unsigned long type) +{ + unsigned long ratio; + unsigned long long tmp; + unsigned long t; + + if (type == STIME_SLEEP) { + if (time > MAX_SLEEP_AFFECT*4) + time = MAX_SLEEP_AFFECT*4; + t = ((unsigned long)time + 3) / 4; + } else { + unsigned long div = 60 - USER_PRIO(p->static_prio); + t = (unsigned long)time * 30; + t = t / div; + t = t * 30; + t = t / div; + } + + ratio = MAX_SLEEP - t; + tmp = (unsigned long long)ratio*p->total_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->total_time = (unsigned long)tmp; + + tmp = (unsigned long long)ratio*p->sleep_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sleep_time = (unsigned long)tmp; + + p->total_time += t; + if (type == STIME_SLEEP) + p->sleep_time += t; +} + +static unsigned long task_sleep_avg(task_t *p) +{ + return (SLEEP_FACTOR * p->sleep_time) / (p->total_time + 1); +} + +/* + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. * - * Both properties are important to certain workloads. + * Timeslices are scaled, so if only low priority processes are running, + * they will all get long timeslices. */ -static int effective_prio(task_t *p) +static int task_timeslice(task_t *p, runqueue_t *rq) { + int idx, base, delta; + int timeslice; + + if (rt_task(p)) + return RT_TIMESLICE; + + idx = min(p->prio, rq->expired->min_prio); + delta = p->prio - idx; + base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2); + + base = base * 40 / (70 - USER_PRIO(idx)); + base = base * 40 / (70 - USER_PRIO(idx)); + + timeslice = base * 1000 / HZ; + timeslice >>= 5; + if (timeslice < MIN_TIMESLICE) + timeslice = MIN_TIMESLICE; + + return timeslice; +} + +/* + * task_priority: calculates a task's priority based on previous running + * history (see add_task_time). The priority is just a simple linear function + * based on sleep_avg and static_prio. + */ +static int task_priority(task_t *p) +{ + unsigned long sleep_avg; int bonus, prio; if (rt_task(p)) return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + sleep_avg = task_sleep_avg(p); + + prio = USER_PRIO(p->static_prio) + 10; + bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2)) + / SLEEP_FACTOR; + prio = MAX_RT_PRIO + prio - bonus; - prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; + return MAX_RT_PRIO; if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; + return MAX_PRIO-1; + return prio; } /* * __activate_task - move a task to the runqueue. 
*/ -static inline void __activate_task(task_t *p, runqueue_t *rq) +static inline void __activate_task(task_t *p, runqueue_t *rq, prio_array_t *array) { - enqueue_task(p, rq->active); + enqueue_task(p, array); rq->nr_running++; + if (!rt_task(p)) { + if (p->prio < array->min_prio) + array->min_prio = p->prio; + } } /* @@ -770,80 +781,6 @@ static inline void __activate_idle_task( rq->nr_running++; } -static void recalc_task_prio(task_t *p, unsigned long long now) -{ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; - - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* - * Tasks with low interactive_credit are limited to - * one timeslice worth of sleep avg bonus. - */ - if (LOW_CREDIT(p) && - sleep_time > JIFFIES_TO_NS(task_timeslice(p))) - sleep_time = JIFFIES_TO_NS(task_timeslice(p)); - - /* - * Non high_credit tasks waking from uninterruptible - * sleep are limited in their sleep_avg rise as they - * are likely to be cpu hogs waiting on I/O - */ - if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG) { - p->sleep_avg = NS_MAX_SLEEP_AVG; - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } - } - } - - p->prio = effective_prio(p); -} - /* * activate_task - move a task to the runqueue and do priority recalculation * @@ -852,9 +789,10 @@ static void recalc_task_prio(task_t *p, */ static void activate_task(task_t *p, runqueue_t *rq, int local) { - unsigned long long now; + unsigned long long now, sleep; + prio_array_t *array; - now = sched_clock(); + now = clock_us(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ @@ -863,44 +801,34 @@ static void activate_task(task_t *p, run + rq->timestamp_last_tick; } #endif - - recalc_task_prio(p, now); - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. + * If we have slept through an active/expired array switch, restart + * our timeslice too. */ - if (!p->activated) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. 
So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->activated = 2; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->activated = 1; - } - } + + sleep = now - p->timestamp; p->timestamp = now; + add_task_time(p, sleep, STIME_SLEEP); + p->prio = task_priority(p); - __activate_task(p, rq); + array = rq->active; + if (unlikely(p->used_slice == -1)) { + /* This only applys to newly woken children */ + array = rq->expired; + p->used_slice = 0; + } else if (rq->array_sequence != p->array_sequence) + p->used_slice = 0; + + __activate_task(p, rq, array); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, runqueue_t *rq) +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { + p->array_sequence = rq->array_sequence; rq->nr_running--; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; dequeue_task(p, p->array); p->array = NULL; } @@ -1224,28 +1152,14 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } + success = 1; out_running: @@ -1259,7 +1173,7 @@ out: int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, TASK_STOPPED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); } EXPORT_SYMBOL(wake_up_process); @@ -1280,6 +1194,9 @@ static int find_idlest_cpu(struct task_s */ void fastcall sched_fork(task_t *p) { + unsigned long sleep_avg; + runqueue_t *rq; + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1302,33 +1219,42 @@ void fastcall sched_fork(task_t *p) */ p->thread_info->preempt_count = 1; #endif - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ + + preempt_disable(); + rq = this_rq(); + + /* XXX */ + if (unlikely(p->comm[0] == 'X' && p->comm[1] == 'F')) { + static int warned = 0; + if (!warned) { + printk(KERN_INFO "Renicing %s for you\n", p->comm); + warned = 1; + } + p->static_prio = NICE_TO_PRIO(-10); + } + + /* Get MIN_HISTORY of history with the same sleep_avg as parent. 
*/ + sleep_avg = task_sleep_avg(current); + p->total_time = MIN_HISTORY; + p->sleep_time = p->total_time * sleep_avg / SLEEP_FACTOR; + + /* Parent loses 1/4 of sleep time for forking */ + current->sleep_time = 3*current->sleep_time/4; + + p->used_slice = 0; local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->time_slice = 1; - preempt_disable(); - scheduler_tick(0, 0); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + if (unlikely(current->used_slice == -1 || current == rq->idle)) + p->used_slice = -1; + else { + int ts = task_timeslice(current, rq); + current->used_slice += (ts + 3) / 4; + if (current->used_slice >= ts) { + current->used_slice = -1; + set_need_resched(); + } + } + local_irq_enable(); + preempt_enable(); } /* @@ -1342,57 +1268,55 @@ void fastcall wake_up_new_task(task_t * { unsigned long flags; int this_cpu, cpu; - runqueue_t *rq, *this_rq; + runqueue_t *rq; + prio_array_t *array; + + BUG_ON(p->state != TASK_RUNNING); + + p->prio = task_priority(p); + p->timestamp = clock_us(); rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); this_cpu = smp_processor_id(); - - BUG_ON(p->state != TASK_RUNNING); + cpu = task_cpu(p); schedstat_inc(rq, wunt_cnt); - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. - */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->interactive_credit = 0; - p->prio = effective_prio(p); + array = rq->active; + if (unlikely(p->used_slice == -1)) { + p->used_slice = 0; + array = rq->expired; + } else { + int total = task_timeslice(p, rq); + int ts = max((total + 3) / 4, MIN_TIMESLICE); + ts = min(ts, (int)FORKED_TS_MAX); + p->used_slice = total - ts; + } if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM)) { + if (!(clone_flags & CLONE_VM) && likely(array == rq->active)) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { + if (p->prio >= current->prio) { p->prio = current->prio; list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; rq->nr_running++; - } + } else + __activate_task(p, rq, array); + set_need_resched(); - } else + } else { /* Run child last */ - __activate_task(p, rq); - /* - * We skip the following code due to cpu == this_cpu - * - * task_rq_unlock(rq, &flags); - * this_rq = task_rq_lock(current, &flags); - */ - this_rq = rq; + __activate_task(p, rq, array); + } +#ifdef CONFIG_SMP } else { - this_rq = cpu_rq(this_cpu); + runqueue_t *this_rq = this_rq(); /* * Not the local CPU - must adjust timestamp. 
This should @@ -1400,52 +1324,18 @@ void fastcall wake_up_new_task(task_t * */ p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; - __activate_task(p, rq); + __activate_task(p, rq, array); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); schedstat_inc(rq, wunt_moved); - /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: - */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); +#endif } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); + task_rq_unlock(rq, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) - */ void fastcall sched_exit(task_t * p) { - unsigned long flags; - runqueue_t *rq; - - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); } /** @@ -1754,6 +1644,10 @@ void pull_task(runqueue_t *src_rq, prio_ set_task_cpu(p, this_cpu); this_rq->nr_running++; enqueue_task(p, this_array); + if (!rt_task(p)) { + if (p->prio < this_array->min_prio) + this_array->min_prio = p->prio; + } p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* @@ -2057,7 +1951,6 @@ static int load_balance(int this_cpu, ru unsigned long imbalance; int nr_moved; - spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); group = find_busiest_group(sd, this_cpu, &imbalance, idle); @@ -2092,12 +1985,11 @@ static int load_balance(int this_cpu, ru * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ - double_lock_balance(this_rq, busiest); + double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle); - spin_unlock(&busiest->lock); + double_rq_unlock(this_rq, busiest); } - spin_unlock(&this_rq->lock); if (!nr_moved) { schedstat_inc(sd, lb_failed[idle]); @@ -2131,8 +2023,6 @@ static int load_balance(int this_cpu, ru return nr_moved; out_balanced: - spin_unlock(&this_rq->lock); - /* tune up the balancing interval */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -2358,22 +2248,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. 
We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) - -/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * @@ -2382,12 +2256,16 @@ EXPORT_PER_CPU_SYMBOL(kstat); */ void scheduler_tick(int user_ticks, int sys_ticks) { + enum idle_type cpu_status; int cpu = smp_processor_id(); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; runqueue_t *rq = this_rq(); task_t *p = current; + int ts; - rq->timestamp_last_tick = sched_clock(); +#ifdef CONFIG_SMP + rq->timestamp_last_tick = clock_us(); +#endif if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); @@ -2401,6 +2279,7 @@ void scheduler_tick(int user_ticks, int sys_ticks = 0; } + cpu_status = NOT_IDLE; if (p == rq->idle) { if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; @@ -2408,8 +2287,8 @@ void scheduler_tick(int user_ticks, int cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - rebalance_tick(cpu, rq, IDLE); - return; + cpu_status = IDLE; + goto out; } if (TASK_NICE(p) > 0) cpustat->nice += user_ticks; @@ -2418,81 +2297,22 @@ void scheduler_tick(int user_ticks, int cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { - set_tsk_need_resched(p); + if (unlikely(p->used_slice == -1)) goto out; - } - spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); + + if (unlikely(p->policy == SCHED_FIFO)) + goto out; + + /* p was running during this tick. Update its time slice counter. */ + p->used_slice++; + ts = task_timeslice(p, rq); + if (unlikely(p->used_slice >= ts)) { + p->used_slice = -1; set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) 
We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - enqueue_task(p, rq->active); - } } -out_unlock: - spin_unlock(&rq->lock); + out: - rebalance_tick(cpu, rq, NOT_IDLE); + rebalance_tick(cpu, rq, cpu_status); } #ifdef CONFIG_SCHED_SMT @@ -2588,8 +2408,9 @@ static inline int dependent_sleeper(int * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && + if (((task_timeslice(smt_curr, smt_rq) + * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(p, this_rq) || rt_task(smt_curr)) && p->mm && smt_curr->mm && !rt_task(p)) ret = 1; @@ -2598,8 +2419,8 @@ static inline int dependent_sleeper(int * or wake it up if it has been put to sleep for priority * reasons. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && + if ((((task_timeslice(p, this_rq) * (100-sd->per_cpu_gain)/100) + > task_timeslice(smt_curr, smt_rq) || rt_task(p)) && smt_curr->mm && p->mm && !rt_task(smt_curr)) || (smt_curr == smt_rq->idle && smt_rq->nr_running)) resched_task(smt_curr); @@ -2639,11 +2460,10 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "bad: scheduling while atomic!\n"); - dump_stack(); - } + if (unlikely(in_atomic()) && + likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { + printk(KERN_ERR "bad: scheduling while atomic!\n"); + dump_stack(); } need_resched: @@ -2662,19 +2482,10 @@ need_resched: release_kernel_lock(prev); schedstat_inc(rq, sched_cnt); - now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; - else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks with interactive credits get charged less run_time - * at high sleep_avg to delay them losing their interactive - * status - */ - if (HIGH_CREDIT(prev)) - run_time /= (CURRENT_BONUS(prev) ? 
: 1); + now = clock_us(); + run_time = now - prev->timestamp; + prev->timestamp = now; + add_task_time(prev, run_time, STIME_RUN); spin_lock_irq(&rq->lock); @@ -2688,17 +2499,41 @@ need_resched: if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; - else + else { deactivate_task(prev, rq); + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + goto no_check_expired; + } } + if (unlikely(prev->used_slice == -1)) { + if (rt_task(prev)) { + if (prev->policy == SCHED_RR) { + dequeue_task(prev, prev->array); + enqueue_task(prev, rq->active); + } + } else { + dequeue_task(prev, prev->array); + prev->prio = task_priority(prev); + enqueue_task(prev, rq->expired); + if (prev->prio < rq->expired->min_prio) + rq->expired->min_prio = prev->prio; + } + prev->used_slice = 0; + } +no_check_expired: + cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { go_idle: + rq->array_sequence++; idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; - rq->expired_timestamp = 0; + rq->arrays[0].min_prio = MAX_PRIO; + rq->arrays[1].min_prio = MAX_PRIO; + wake_sleeping_dependent(cpu, rq); /* * wake_sleeping_dependent() might have released @@ -2729,11 +2564,11 @@ go_idle: * Switch the active and expired arrays. */ schedstat_inc(rq, sched_switch); + rq->array_sequence++; rq->active = rq->expired; rq->expired = array; + rq->expired->min_prio = MAX_PRIO; array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; } else schedstat_inc(rq, sched_noswitch); @@ -2741,31 +2576,11 @@ go_idle: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); - if (!rt_task(next) && next->activated > 0) { - unsigned long long delta = now - next->timestamp; - - if (next->activated == 1) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); - } - next->activated = 0; switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) { - prev->sleep_avg = 0; - if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) - prev->interactive_credit--; - } - prev->timestamp = now; - sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = now; @@ -3253,12 +3068,12 @@ static int setscheduler(pid_t pid, int p array = p->array; if (array) - deactivate_task(p, task_rq(p)); + deactivate_task(p, rq); retval = 0; oldprio = p->prio; __setscheduler(p, policy, lp.sched_priority); if (array) { - __activate_task(p, task_rq(p)); + __activate_task(p, rq, array); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3481,37 +3296,31 @@ out_unlock: */ asmlinkage long sys_sched_yield(void) { - runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - prio_array_t *target = rq->expired; +#ifdef CONFIG_SCHEDSTATS + runqueue_t *rq; +#endif - schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) 
- */ - if (rt_task(current)) - target = rq->active; + local_irq_disable(); +#ifdef CONFIG_SCHEDSTATS + rq = this_rq(); + schedstat_inc(rq, yld_cnt); + spin_lock(&rq->lock); if (current->array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); } else if (!rq->expired->nr_active) schedstat_inc(rq, yld_exp_empty); - - dequeue_task(current, array); - enqueue_task(current, target); - /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); +#endif + current->used_slice = -1; + local_irq_enable(); schedule(); @@ -3628,6 +3437,8 @@ long sys_sched_rr_get_interval(pid_t pid int retval = -EINVAL; struct timespec t; task_t *p; + unsigned long flags; + runqueue_t *rq; if (pid < 0) goto out_nounlock; @@ -3642,8 +3453,9 @@ long sys_sched_rr_get_interval(pid_t pid if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : task_timeslice(p), &t); + rq = task_rq_lock(p, &flags); + jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : task_timeslice(p, rq), &t); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -3756,11 +3568,10 @@ void __devinit init_idle(task_t *idle, i runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->sleep_avg = 0; - idle->interactive_credit = 0; idle->array = NULL; idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; + idle->used_slice = 0; set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); @@ -4627,7 +4438,6 @@ void __init sched_init(void) spin_lock_init(&rq->lock); rq->active = rq->arrays; rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; @@ -4641,11 +4451,12 @@ void __init sched_init(void) for (j = 0; j < 2; j++) { array = rq->arrays + j; + array->min_prio = MAX_PRIO; for (k = 0; k < MAX_PRIO; k++) { INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); } - // delimiter for bitsearch + /* delimiter for bitsearch */ __set_bit(MAX_PRIO, array->bitmap); } } diff -puN kernel/sysctl.c~nicksched kernel/sysctl.c --- 25/kernel/sysctl.c~nicksched 2004-08-21 23:49:55.329129584 -0700 +++ 25-akpm/kernel/sysctl.c 2004-08-21 23:49:55.347126848 -0700 @@ -64,6 +64,9 @@ extern int sysctl_lower_zone_protection; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; +extern int sched_base_timeslice; +extern int sched_min_base; +extern int sched_max_base; #if defined(CONFIG_X86_LOCAL_APIC) && defined(__i386__) int unknown_nmi_panic; @@ -636,6 +639,18 @@ static ctl_table kern_table[] = { .proc_handler = &proc_unknown_nmi_panic, }, #endif + { + .ctl_name = KERN_SCHED_TIMESLICE, + .procname = "base_timeslice", + .data = &sched_base_timeslice, + .maxlen = sizeof (sched_base_timeslice), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &sched_min_base, + .extra2 = &sched_max_base, + }, + { .ctl_name = 0 } }; @@ -915,6 +930,7 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { .ctl_name = 0 } }; diff -puN mm/oom_kill.c~nicksched mm/oom_kill.c --- 25/mm/oom_kill.c~nicksched 2004-08-21 23:49:55.330129432 -0700 +++ 25-akpm/mm/oom_kill.c 2004-08-21 23:49:55.347126848 -0700 @@ -144,11 +144,10 @@ static void __oom_kill_task(task_t *p) printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); /* - * We 
give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... + * We give our sacrificial lamb access to all the memory it needs. + * That way it should be able to exit() and clear out its resources + * quickly... */ - p->time_slice = HZ; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */ _
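
To get a feel for how the timeslice scaling described at the top behaves, here
is a minimal user-space sketch of the arithmetic in task_timeslice() from this
patch. It assumes HZ=1000 and the default sched_base_timeslice of 64; the
expired_min_prio parameter stands in for rq->expired->min_prio, and the
priority values fed to it in main() are only illustrative, not taken from a
real workload.

#include <stdio.h>

#define HZ            1000
#define MAX_RT_PRIO   100
#define MAX_PRIO      (MAX_RT_PRIO + 59)
#define USER_PRIO(p)  ((p) - MAX_RT_PRIO)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
#define MIN_TIMESLICE 1

static int base_timeslice = 64;	/* default /proc/sys/kernel/base_timeslice */

/* prio and expired_min_prio are dynamic priorities in [100, 158] */
static int timeslice(int prio, int expired_min_prio)
{
	int idx, delta, base, ts;

	idx = prio < expired_min_prio ? prio : expired_min_prio;
	delta = prio - idx;
	base = base_timeslice * (MAX_USER_PRIO + 1) / (delta + 2);
	base = base * 40 / (70 - USER_PRIO(idx));
	base = base * 40 / (70 - USER_PRIO(idx));

	ts = (base * 1000 / HZ) >> 5;	/* ticks; roughly milliseconds at HZ=1000 */
	return ts < MIN_TIMESLICE ? MIN_TIMESLICE : ts;
}

int main(void)
{
	/* only boosted nice-0 tasks (prio ~120) are runnable */
	printf("high vs high: %d ticks\n", timeslice(120, 120));
	/* a low priority task (prio ~150) while a prio 120 task sits expired */
	printf("low vs high:  %d ticks\n", timeslice(150, 120));
	/* only low priority tasks left: their slices stretch out again */
	printf("low vs low:   %d ticks\n", timeslice(150, 150));
	return 0;
}

Raising sched_base_timeslice through the new /proc/sys/kernel/base_timeslice
sysctl scales all of these values up proportionally, and lowering it shrinks
them, which is the tuning knob suggested in the changelog above.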
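The "priority calculator" side of the changelog boils down to task_sleep_avg()
and task_priority() in the patch. The sketch below replays that arithmetic in
user space so the separate contributions of nice level and sleep ratio are
visible; the sample sleep fractions in main() are invented for illustration.

#include <stdio.h>

#define MAX_RT_PRIO   100
#define MAX_PRIO      (MAX_RT_PRIO + 59)
#define USER_PRIO(p)  ((p) - MAX_RT_PRIO)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
#define SLEEP_FACTOR  1024
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 30)

/*
 * sleep_avg is the fixed-point fraction of recent time spent sleeping,
 * 0..SLEEP_FACTOR, as task_sleep_avg() would report it.
 */
static int priority(int nice, unsigned long sleep_avg)
{
	int static_prio = NICE_TO_PRIO(nice);
	int bonus, prio;

	prio = USER_PRIO(static_prio) + 10;
	bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2))
		/ SLEEP_FACTOR;
	prio = MAX_RT_PRIO + prio - bonus;

	if (prio < MAX_RT_PRIO)
		return MAX_RT_PRIO;
	if (prio > MAX_PRIO - 1)
		return MAX_PRIO - 1;
	return prio;
}

int main(void)
{
	/* a nice 0 CPU hog vs. a nice 0 task that sleeps ~80% of the time */
	printf("nice 0,   hog:     prio %d\n", priority(0, 0));
	printf("nice 0,   sleeper: prio %d\n", priority(0, 8 * SLEEP_FACTOR / 10));
	/* the renice-X-to--10 hack gives the server extra headroom on top */
	printf("nice -10, sleeper: prio %d\n", priority(-10, 8 * SLEEP_FACTOR / 10));
	return 0;
}

With these constants the sleep bonus can move a task by at most 20 priority
levels, so nice levels still dominate: a nice 19 CPU hog can never climb above
the range a nice -20 task occupies.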