From: Nick Piggin

The patch removes the interactivity estimator. It introduces a priority
calculator which adapts quickly to changes in running patterns. It completely
changes timeslice allocation.

Previously a timeslice would be allocated based solely on a process' nice
level - ~200ms for -20, 10ms for 19. Timeslices are now based only on
priority (however, nice level directly affects priority). You'll have to read
task_timeslice to get a proper picture of how it works, but here is an
(inaccurate) example:

  Two high priority processes are running: they'll each get a 25ms timeslice.
  Two low priority processes become runnable: they'll each get a 5ms timeslice.
  High priority processes sleep: the low prio processes now get 100ms timeslices.

/proc/sys/kernel/base_timeslice - a scaling factor for the timeslice
calculation. While testing, try lowering this value if interactivity is bad,
or raising it if efficiency drops.

For good interactivity in X, the X server should be reniced to about -10.
The patch contains a hack to do this for you because you will forget.

Signed-off-by: Andrew Morton

---

 25-akpm/fs/proc/array.c           |    5 
 25-akpm/include/linux/init_task.h |    5 
 25-akpm/include/linux/sched.h     |   11 
 25-akpm/include/linux/sysctl.h    |    1 
 25-akpm/kernel/sched.c            |  861 ++++++++++++++------------------------
 25-akpm/kernel/sysctl.c           |   16 
 25-akpm/mm/oom_kill.c             |    7 
 7 files changed, 367 insertions(+), 539 deletions(-)

diff -puN fs/proc/array.c~nicksched fs/proc/array.c --- 25/fs/proc/array.c~nicksched 2004-08-21 23:49:55.321130800 -0700 +++ 25-akpm/fs/proc/array.c 2004-08-21 23:49:55.334128824 -0700 @@ -159,7 +159,8 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" + "sleep_time:\t%lu\n" + "total_time:\t%lu\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -167,7 +168,7 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), + p->sleep_time, p->total_time, p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0,
diff -puN include/linux/init_task.h~nicksched include/linux/init_task.h --- 25/include/linux/init_task.h~nicksched 2004-08-21 23:49:55.322130648 -0700 +++ 25-akpm/include/linux/init_task.h 2004-08-21 23:49:55.334128824 -0700 @@ -71,14 +71,13 @@ extern struct group_info init_groups; .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ + .prio = MAX_PRIO-29, \ + .static_prio = MAX_PRIO-29, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \
diff -puN include/linux/sched.h~nicksched include/linux/sched.h --- 25/include/linux/sched.h~nicksched 2004-08-21 23:49:55.324130344 -0700 +++ 25-akpm/include/linux/sched.h 2004-08-21 23:49:55.335128672 -0700 @@ -298,7 +298,7 @@ struct signal_struct { #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) +#define MAX_PRIO (MAX_RT_PRIO + 59) #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) @@ -414,14 +414,15 @@ struct task_struct { struct list_head run_list; prio_array_t *array; - unsigned long sleep_avg; - long interactive_credit; + /* Scheduler variables follow.
kernel/sched.c */ + unsigned long array_sequence; unsigned long long timestamp; - int activated; + int used_slice; + + unsigned long total_time, sleep_time; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; diff -puN include/linux/sysctl.h~nicksched include/linux/sysctl.h --- 25/include/linux/sysctl.h~nicksched 2004-08-21 23:49:55.325130192 -0700 +++ 25-akpm/include/linux/sysctl.h 2004-08-21 23:49:55.336128520 -0700 @@ -134,6 +134,7 @@ enum KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */ KERN_HZ_TIMER=65, /* int: hz timer on or off */ KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ + KERN_SCHED_TIMESLICE=67, /* int: base timeslice for scheduler */ }; diff -puN kernel/sched.c~nicksched kernel/sched.c --- 25/kernel/sched.c~nicksched 2004-08-21 23:49:55.327129888 -0700 +++ 25-akpm/kernel/sched.c 2004-08-21 23:49:55.345127152 -0700 @@ -47,139 +47,74 @@ #include -#ifdef CONFIG_NUMA -#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) -#else -#define cpu_to_node_mask(cpu) (cpu_online_map) -#endif - /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 30) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 30) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. + * it's a [ 0 ... 58 ] range. */ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#define US_TO_JIFFIES(x) ((x) * HZ / 1000000) +#define JIFFIES_TO_US(x) ((x) * 1000000 / HZ) /* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. - * Timeslices get refilled after they expire. - */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) -#define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define CREDIT_LIMIT 100 - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 
- * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) + * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there + * is a maximum priority process runnable. MAX_TIMESLICE is derived from the + * formula in task_timeslice. It cannot be changed here. It is the timesilce + * that the maximum priority process will get. Larger timeslices are attainable + * by low priority processes however. + */ +int sched_base_timeslice = 64; +int sched_min_base = 1; +int sched_max_base = 10000; -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) +#define RT_TIMESLICE (50 * 1000 / HZ) /* 50ms */ +#define BASE_TIMESLICE (sched_base_timeslice) +#define MIN_TIMESLICE 1 -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) +/* Maximum amount of history that will be used to calculate priority */ +#define MAX_SLEEP_SHIFT 19 +#define MAX_SLEEP (1UL << MAX_SLEEP_SHIFT) /* roughly 0.52s */ -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +/* + * Maximum effect that 1 block of activity (run/sleep/etc) can have. This is + * will moderate dicard freak events (eg. SIGSTOP) + */ +#define MAX_SLEEP_AFFECT (MAX_SLEEP/4) -#define HIGH_CREDIT(p) \ - ((p)->interactive_credit > CREDIT_LIMIT) +/* + * The amount of history can be decreased (on fork for example). This puts a + * lower bound on it. + */ +#define MIN_HISTORY (MAX_SLEEP/8) -#define LOW_CREDIT(p) \ - ((p)->interactive_credit < -CREDIT_LIMIT) +#define FORKED_TS_MAX (US_TO_JIFFIES(MIN_HISTORY) ?: 1) -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) +/* + * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. + * In particular: total_time, sleep_time, sleep_avg. + */ +#define SLEEP_FACTOR 1024 /* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. 
+ * The scheduler classifies a process as performing one of the following + * activities */ +#define STIME_SLEEP 1 /* Sleeping */ +#define STIME_RUN 2 /* Using CPU */ -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) +#define TASK_PREEMPTS_CURR(p, rq) ( (p)->prio < (rq)->curr->prio ) -static unsigned int task_timeslice(task_t *p) -{ - if (p->static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); -} #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) enum idle_type @@ -201,6 +136,7 @@ struct sched_domain; typedef struct runqueue runqueue_t; struct prio_array { + int min_prio; unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; @@ -224,16 +160,17 @@ struct runqueue { #ifdef CONFIG_SMP unsigned long cpu_load; #endif + unsigned long array_sequence; + unsigned long nr_uninterruptible; unsigned long long nr_switches; - unsigned long expired_timestamp, nr_uninterruptible; - unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio; atomic_t nr_iowait; + prio_array_t *active, *expired, arrays[2]; #ifdef CONFIG_SMP + unsigned long long timestamp_last_tick; + struct sched_domain *sd; /* For active balancing */ @@ -387,7 +324,7 @@ struct sched_domain { .max_interval = 4, \ .busy_factor = 64, \ .imbalance_pct = 125, \ - .cache_hot_time = (5*1000000/2), \ + .cache_hot_time = (5*1000/2), \ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ .flags = SD_BALANCE_NEWIDLE \ @@ -409,7 +346,7 @@ struct sched_domain { .max_interval = 32, \ .busy_factor = 32, \ .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ + .cache_hot_time = (10*1000), \ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ .flags = SD_BALANCE_EXEC \ @@ -563,20 +500,6 @@ struct file_operations proc_schedstat_op # define schedstat_add(rq, field, amt) do { } while (0); #endif -/* - * rq_lock - lock a given runqueue and disable interrupts. - */ -static runqueue_t *this_rq_lock(void) -{ - runqueue_t *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - static inline void rq_unlock(runqueue_t *rq) { spin_unlock_irq(&rq->lock); @@ -701,8 +624,18 @@ static void dequeue_task(struct task_str static void enqueue_task(struct task_struct *p, prio_array_t *array) { + struct list_head *entry = array->queue + p->prio; sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); + + if (!rt_task(p)) { + /* + * Cycle tasks on the same priority level. This reduces their + * timeslice fluctuations due to higher priority tasks expiring. + */ + if (!list_empty(entry)) + entry = entry->next; + } + list_add_tail(&p->run_list, entry); __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; @@ -721,44 +654,122 @@ static inline void enqueue_task_head(str p->array = array; } +static inline unsigned long long clock_us(void) +{ + return sched_clock() >> 10; +} + /* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 
+ * add_task_time updates a task @p after @time of doing the specified @type + * of activity. See STIME_*. This is used for priority calculation. + */ +static inline void add_task_time(task_t *p, unsigned long long time, unsigned long type) +{ + unsigned long ratio; + unsigned long long tmp; + unsigned long t; + + if (type == STIME_SLEEP) { + if (time > MAX_SLEEP_AFFECT*4) + time = MAX_SLEEP_AFFECT*4; + t = ((unsigned long)time + 3) / 4; + } else { + unsigned long div = 60 - USER_PRIO(p->static_prio); + t = (unsigned long)time * 30; + t = t / div; + t = t * 30; + t = t / div; + } + + ratio = MAX_SLEEP - t; + tmp = (unsigned long long)ratio*p->total_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->total_time = (unsigned long)tmp; + + tmp = (unsigned long long)ratio*p->sleep_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sleep_time = (unsigned long)tmp; + + p->total_time += t; + if (type == STIME_SLEEP) + p->sleep_time += t; +} + +static unsigned long task_sleep_avg(task_t *p) +{ + return (SLEEP_FACTOR * p->sleep_time) / (p->total_time + 1); +} + +/* + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. * - * Both properties are important to certain workloads. + * Timeslices are scaled, so if only low priority processes are running, + * they will all get long timeslices. */ -static int effective_prio(task_t *p) +static int task_timeslice(task_t *p, runqueue_t *rq) { + int idx, base, delta; + int timeslice; + + if (rt_task(p)) + return RT_TIMESLICE; + + idx = min(p->prio, rq->expired->min_prio); + delta = p->prio - idx; + base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2); + + base = base * 40 / (70 - USER_PRIO(idx)); + base = base * 40 / (70 - USER_PRIO(idx)); + + timeslice = base * 1000 / HZ; + timeslice >>= 5; + if (timeslice < MIN_TIMESLICE) + timeslice = MIN_TIMESLICE; + + return timeslice; +} + +/* + * task_priority: calculates a task's priority based on previous running + * history (see add_task_time). The priority is just a simple linear function + * based on sleep_avg and static_prio. + */ +static int task_priority(task_t *p) +{ + unsigned long sleep_avg; int bonus, prio; if (rt_task(p)) return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + sleep_avg = task_sleep_avg(p); + + prio = USER_PRIO(p->static_prio) + 10; + bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2)) + / SLEEP_FACTOR; + prio = MAX_RT_PRIO + prio - bonus; - prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; + return MAX_RT_PRIO; if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; + return MAX_PRIO-1; + return prio; } /* * __activate_task - move a task to the runqueue. 
*/ -static inline void __activate_task(task_t *p, runqueue_t *rq) +static inline void __activate_task(task_t *p, runqueue_t *rq, prio_array_t *array) { - enqueue_task(p, rq->active); + enqueue_task(p, array); rq->nr_running++; + if (!rt_task(p)) { + if (p->prio < array->min_prio) + array->min_prio = p->prio; + } } /* @@ -770,80 +781,6 @@ static inline void __activate_idle_task( rq->nr_running++; } -static void recalc_task_prio(task_t *p, unsigned long long now) -{ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; - - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* - * Tasks with low interactive_credit are limited to - * one timeslice worth of sleep avg bonus. - */ - if (LOW_CREDIT(p) && - sleep_time > JIFFIES_TO_NS(task_timeslice(p))) - sleep_time = JIFFIES_TO_NS(task_timeslice(p)); - - /* - * Non high_credit tasks waking from uninterruptible - * sleep are limited in their sleep_avg rise as they - * are likely to be cpu hogs waiting on I/O - */ - if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG) { - p->sleep_avg = NS_MAX_SLEEP_AVG; - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } - } - } - - p->prio = effective_prio(p); -} - /* * activate_task - move a task to the runqueue and do priority recalculation * @@ -852,9 +789,10 @@ static void recalc_task_prio(task_t *p, */ static void activate_task(task_t *p, runqueue_t *rq, int local) { - unsigned long long now; + unsigned long long now, sleep; + prio_array_t *array; - now = sched_clock(); + now = clock_us(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ @@ -863,44 +801,34 @@ static void activate_task(task_t *p, run + rq->timestamp_last_tick; } #endif - - recalc_task_prio(p, now); - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. + * If we have slept through an active/expired array switch, restart + * our timeslice too. */ - if (!p->activated) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. 
So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->activated = 2; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->activated = 1; - } - } + + sleep = now - p->timestamp; p->timestamp = now; + add_task_time(p, sleep, STIME_SLEEP); + p->prio = task_priority(p); - __activate_task(p, rq); + array = rq->active; + if (unlikely(p->used_slice == -1)) { + /* This only applys to newly woken children */ + array = rq->expired; + p->used_slice = 0; + } else if (rq->array_sequence != p->array_sequence) + p->used_slice = 0; + + __activate_task(p, rq, array); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, runqueue_t *rq) +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { + p->array_sequence = rq->array_sequence; rq->nr_running--; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; dequeue_task(p, p->array); p->array = NULL; } @@ -1224,28 +1152,14 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } + success = 1; out_running: @@ -1259,7 +1173,7 @@ out: int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, TASK_STOPPED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); } EXPORT_SYMBOL(wake_up_process); @@ -1280,6 +1194,9 @@ static int find_idlest_cpu(struct task_s */ void fastcall sched_fork(task_t *p) { + unsigned long sleep_avg; + runqueue_t *rq; + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1302,33 +1219,42 @@ void fastcall sched_fork(task_t *p) */ p->thread_info->preempt_count = 1; #endif - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ + + preempt_disable(); + rq = this_rq(); + + /* XXX */ + if (unlikely(p->comm[0] == 'X' && p->comm[1] == 'F')) { + static int warned = 0; + if (!warned) { + printk(KERN_INFO "Renicing %s for you\n", p->comm); + warned = 1; + } + p->static_prio = NICE_TO_PRIO(-10); + } + + /* Get MIN_HISTORY of history with the same sleep_avg as parent. 
*/ + sleep_avg = task_sleep_avg(current); + p->total_time = MIN_HISTORY; + p->sleep_time = p->total_time * sleep_avg / SLEEP_FACTOR; + + /* Parent loses 1/4 of sleep time for forking */ + current->sleep_time = 3*current->sleep_time/4; + + p->used_slice = 0; local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->time_slice = 1; - preempt_disable(); - scheduler_tick(0, 0); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + if (unlikely(current->used_slice == -1 || current == rq->idle)) + p->used_slice = -1; + else { + int ts = task_timeslice(current, rq); + current->used_slice += (ts + 3) / 4; + if (current->used_slice >= ts) { + current->used_slice = -1; + set_need_resched(); + } + } + local_irq_enable(); + preempt_enable(); } /* @@ -1342,57 +1268,55 @@ void fastcall wake_up_new_task(task_t * { unsigned long flags; int this_cpu, cpu; - runqueue_t *rq, *this_rq; + runqueue_t *rq; + prio_array_t *array; + + BUG_ON(p->state != TASK_RUNNING); + + p->prio = task_priority(p); + p->timestamp = clock_us(); rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); this_cpu = smp_processor_id(); - - BUG_ON(p->state != TASK_RUNNING); + cpu = task_cpu(p); schedstat_inc(rq, wunt_cnt); - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. - */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->interactive_credit = 0; - p->prio = effective_prio(p); + array = rq->active; + if (unlikely(p->used_slice == -1)) { + p->used_slice = 0; + array = rq->expired; + } else { + int total = task_timeslice(p, rq); + int ts = max((total + 3) / 4, MIN_TIMESLICE); + ts = min(ts, (int)FORKED_TS_MAX); + p->used_slice = total - ts; + } if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM)) { + if (!(clone_flags & CLONE_VM) && likely(array == rq->active)) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { + if (p->prio >= current->prio) { p->prio = current->prio; list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; rq->nr_running++; - } + } else + __activate_task(p, rq, array); + set_need_resched(); - } else + } else { /* Run child last */ - __activate_task(p, rq); - /* - * We skip the following code due to cpu == this_cpu - * - * task_rq_unlock(rq, &flags); - * this_rq = task_rq_lock(current, &flags); - */ - this_rq = rq; + __activate_task(p, rq, array); + } +#ifdef CONFIG_SMP } else { - this_rq = cpu_rq(this_cpu); + runqueue_t *this_rq = this_rq(); /* * Not the local CPU - must adjust timestamp. 
This should @@ -1400,52 +1324,18 @@ void fastcall wake_up_new_task(task_t * */ p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; - __activate_task(p, rq); + __activate_task(p, rq, array); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); schedstat_inc(rq, wunt_moved); - /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: - */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); +#endif } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); + task_rq_unlock(rq, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) - */ void fastcall sched_exit(task_t * p) { - unsigned long flags; - runqueue_t *rq; - - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); } /** @@ -1754,6 +1644,10 @@ void pull_task(runqueue_t *src_rq, prio_ set_task_cpu(p, this_cpu); this_rq->nr_running++; enqueue_task(p, this_array); + if (!rt_task(p)) { + if (p->prio < this_array->min_prio) + this_array->min_prio = p->prio; + } p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* @@ -2057,7 +1951,6 @@ static int load_balance(int this_cpu, ru unsigned long imbalance; int nr_moved; - spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); group = find_busiest_group(sd, this_cpu, &imbalance, idle); @@ -2092,12 +1985,11 @@ static int load_balance(int this_cpu, ru * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ - double_lock_balance(this_rq, busiest); + double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle); - spin_unlock(&busiest->lock); + double_rq_unlock(this_rq, busiest); } - spin_unlock(&this_rq->lock); if (!nr_moved) { schedstat_inc(sd, lb_failed[idle]); @@ -2131,8 +2023,6 @@ static int load_balance(int this_cpu, ru return nr_moved; out_balanced: - spin_unlock(&this_rq->lock); - /* tune up the balancing interval */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -2358,22 +2248,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. 
We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) - -/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * @@ -2382,12 +2256,16 @@ EXPORT_PER_CPU_SYMBOL(kstat); */ void scheduler_tick(int user_ticks, int sys_ticks) { + enum idle_type cpu_status; int cpu = smp_processor_id(); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; runqueue_t *rq = this_rq(); task_t *p = current; + int ts; - rq->timestamp_last_tick = sched_clock(); +#ifdef CONFIG_SMP + rq->timestamp_last_tick = clock_us(); +#endif if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); @@ -2401,6 +2279,7 @@ void scheduler_tick(int user_ticks, int sys_ticks = 0; } + cpu_status = NOT_IDLE; if (p == rq->idle) { if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; @@ -2408,8 +2287,8 @@ void scheduler_tick(int user_ticks, int cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - rebalance_tick(cpu, rq, IDLE); - return; + cpu_status = IDLE; + goto out; } if (TASK_NICE(p) > 0) cpustat->nice += user_ticks; @@ -2418,81 +2297,22 @@ void scheduler_tick(int user_ticks, int cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { - set_tsk_need_resched(p); + if (unlikely(p->used_slice == -1)) goto out; - } - spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); + + if (unlikely(p->policy == SCHED_FIFO)) + goto out; + + /* p was running during this tick. Update its time slice counter. */ + p->used_slice++; + ts = task_timeslice(p, rq); + if (unlikely(p->used_slice >= ts)) { + p->used_slice = -1; set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) 
We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - enqueue_task(p, rq->active); - } } -out_unlock: - spin_unlock(&rq->lock); + out: - rebalance_tick(cpu, rq, NOT_IDLE); + rebalance_tick(cpu, rq, cpu_status); } #ifdef CONFIG_SCHED_SMT @@ -2588,8 +2408,9 @@ static inline int dependent_sleeper(int * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && + if (((task_timeslice(smt_curr, smt_rq) + * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(p, this_rq) || rt_task(smt_curr)) && p->mm && smt_curr->mm && !rt_task(p)) ret = 1; @@ -2598,8 +2419,8 @@ static inline int dependent_sleeper(int * or wake it up if it has been put to sleep for priority * reasons. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && + if ((((task_timeslice(p, this_rq) * (100-sd->per_cpu_gain)/100) + > task_timeslice(smt_curr, smt_rq) || rt_task(p)) && smt_curr->mm && p->mm && !rt_task(smt_curr)) || (smt_curr == smt_rq->idle && smt_rq->nr_running)) resched_task(smt_curr); @@ -2639,11 +2460,10 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "bad: scheduling while atomic!\n"); - dump_stack(); - } + if (unlikely(in_atomic()) && + likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { + printk(KERN_ERR "bad: scheduling while atomic!\n"); + dump_stack(); } need_resched: @@ -2662,19 +2482,10 @@ need_resched: release_kernel_lock(prev); schedstat_inc(rq, sched_cnt); - now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; - else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks with interactive credits get charged less run_time - * at high sleep_avg to delay them losing their interactive - * status - */ - if (HIGH_CREDIT(prev)) - run_time /= (CURRENT_BONUS(prev) ? 
: 1); + now = clock_us(); + run_time = now - prev->timestamp; + prev->timestamp = now; + add_task_time(prev, run_time, STIME_RUN); spin_lock_irq(&rq->lock); @@ -2688,17 +2499,41 @@ need_resched: if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; - else + else { deactivate_task(prev, rq); + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + goto no_check_expired; + } } + if (unlikely(prev->used_slice == -1)) { + if (rt_task(prev)) { + if (prev->policy == SCHED_RR) { + dequeue_task(prev, prev->array); + enqueue_task(prev, rq->active); + } + } else { + dequeue_task(prev, prev->array); + prev->prio = task_priority(prev); + enqueue_task(prev, rq->expired); + if (prev->prio < rq->expired->min_prio) + rq->expired->min_prio = prev->prio; + } + prev->used_slice = 0; + } +no_check_expired: + cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { go_idle: + rq->array_sequence++; idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; - rq->expired_timestamp = 0; + rq->arrays[0].min_prio = MAX_PRIO; + rq->arrays[1].min_prio = MAX_PRIO; + wake_sleeping_dependent(cpu, rq); /* * wake_sleeping_dependent() might have released @@ -2729,11 +2564,11 @@ go_idle: * Switch the active and expired arrays. */ schedstat_inc(rq, sched_switch); + rq->array_sequence++; rq->active = rq->expired; rq->expired = array; + rq->expired->min_prio = MAX_PRIO; array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; } else schedstat_inc(rq, sched_noswitch); @@ -2741,31 +2576,11 @@ go_idle: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); - if (!rt_task(next) && next->activated > 0) { - unsigned long long delta = now - next->timestamp; - - if (next->activated == 1) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); - } - next->activated = 0; switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) { - prev->sleep_avg = 0; - if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) - prev->interactive_credit--; - } - prev->timestamp = now; - sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = now; @@ -3253,12 +3068,12 @@ static int setscheduler(pid_t pid, int p array = p->array; if (array) - deactivate_task(p, task_rq(p)); + deactivate_task(p, rq); retval = 0; oldprio = p->prio; __setscheduler(p, policy, lp.sched_priority); if (array) { - __activate_task(p, task_rq(p)); + __activate_task(p, rq, array); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3481,37 +3296,31 @@ out_unlock: */ asmlinkage long sys_sched_yield(void) { - runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - prio_array_t *target = rq->expired; +#ifdef CONFIG_SCHEDSTATS + runqueue_t *rq; +#endif - schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) 
- */ - if (rt_task(current)) - target = rq->active; + local_irq_disable(); +#ifdef CONFIG_SCHEDSTATS + rq = this_rq(); + schedstat_inc(rq, yld_cnt); + spin_lock(&rq->lock); if (current->array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); } else if (!rq->expired->nr_active) schedstat_inc(rq, yld_exp_empty); - - dequeue_task(current, array); - enqueue_task(current, target); - /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); +#endif + current->used_slice = -1; + local_irq_enable(); schedule(); @@ -3628,6 +3437,8 @@ long sys_sched_rr_get_interval(pid_t pid int retval = -EINVAL; struct timespec t; task_t *p; + unsigned long flags; + runqueue_t *rq; if (pid < 0) goto out_nounlock; @@ -3642,8 +3453,9 @@ long sys_sched_rr_get_interval(pid_t pid if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : task_timeslice(p), &t); + rq = task_rq_lock(p, &flags); + jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : task_timeslice(p, rq), &t); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -3756,11 +3568,10 @@ void __devinit init_idle(task_t *idle, i runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->sleep_avg = 0; - idle->interactive_credit = 0; idle->array = NULL; idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; + idle->used_slice = 0; set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); @@ -4627,7 +4438,6 @@ void __init sched_init(void) spin_lock_init(&rq->lock); rq->active = rq->arrays; rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; @@ -4641,11 +4451,12 @@ void __init sched_init(void) for (j = 0; j < 2; j++) { array = rq->arrays + j; + array->min_prio = MAX_PRIO; for (k = 0; k < MAX_PRIO; k++) { INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); } - // delimiter for bitsearch + /* delimiter for bitsearch */ __set_bit(MAX_PRIO, array->bitmap); } } diff -puN kernel/sysctl.c~nicksched kernel/sysctl.c --- 25/kernel/sysctl.c~nicksched 2004-08-21 23:49:55.329129584 -0700 +++ 25-akpm/kernel/sysctl.c 2004-08-21 23:49:55.347126848 -0700 @@ -64,6 +64,9 @@ extern int sysctl_lower_zone_protection; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; +extern int sched_base_timeslice; +extern int sched_min_base; +extern int sched_max_base; #if defined(CONFIG_X86_LOCAL_APIC) && defined(__i386__) int unknown_nmi_panic; @@ -636,6 +639,18 @@ static ctl_table kern_table[] = { .proc_handler = &proc_unknown_nmi_panic, }, #endif + { + .ctl_name = KERN_SCHED_TIMESLICE, + .procname = "base_timeslice", + .data = &sched_base_timeslice, + .maxlen = sizeof (sched_base_timeslice), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &sched_min_base, + .extra2 = &sched_max_base, + }, + { .ctl_name = 0 } }; @@ -915,6 +930,7 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { .ctl_name = 0 } }; diff -puN mm/oom_kill.c~nicksched mm/oom_kill.c --- 25/mm/oom_kill.c~nicksched 2004-08-21 23:49:55.330129432 -0700 +++ 25-akpm/mm/oom_kill.c 2004-08-21 23:49:55.347126848 -0700 @@ -144,11 +144,10 @@ static void __oom_kill_task(task_t *p) printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); /* - * We 
give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... + * We give our sacrificial lamb access to all the memory it needs. + * That way it should be able to exit() and clear out its resources + * quickly... */ - p->time_slice = HZ; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */ _
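
To get a feel for how the timeslice scaling described at the top behaves, here
is a minimal user-space sketch of the arithmetic in task_timeslice() from this
patch. It assumes HZ=1000 and the default sched_base_timeslice of 64; the
expired_min_prio parameter stands in for rq->expired->min_prio, and the
priority values fed to it in main() are only illustrative, not taken from a
real workload.

#include <stdio.h>

#define HZ            1000
#define MAX_RT_PRIO   100
#define MAX_PRIO      (MAX_RT_PRIO + 59)
#define USER_PRIO(p)  ((p) - MAX_RT_PRIO)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
#define MIN_TIMESLICE 1

static int base_timeslice = 64;	/* default /proc/sys/kernel/base_timeslice */

/* prio and expired_min_prio are dynamic priorities in [100, 158] */
static int timeslice(int prio, int expired_min_prio)
{
	int idx, delta, base, ts;

	idx = prio < expired_min_prio ? prio : expired_min_prio;
	delta = prio - idx;
	base = base_timeslice * (MAX_USER_PRIO + 1) / (delta + 2);
	base = base * 40 / (70 - USER_PRIO(idx));
	base = base * 40 / (70 - USER_PRIO(idx));

	ts = (base * 1000 / HZ) >> 5;	/* ticks; roughly milliseconds at HZ=1000 */
	return ts < MIN_TIMESLICE ? MIN_TIMESLICE : ts;
}

int main(void)
{
	/* only boosted nice-0 tasks (prio ~120) are runnable */
	printf("high vs high: %d ticks\n", timeslice(120, 120));
	/* a low priority task (prio ~150) while a prio 120 task sits expired */
	printf("low vs high:  %d ticks\n", timeslice(150, 120));
	/* only low priority tasks left: their slices stretch out again */
	printf("low vs low:   %d ticks\n", timeslice(150, 150));
	return 0;
}

Raising sched_base_timeslice through the new /proc/sys/kernel/base_timeslice
sysctl scales all of these values up proportionally, and lowering it shrinks
them, which is the tuning knob suggested in the changelog above.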
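The "priority calculator" side of the changelog boils down to task_sleep_avg()
and task_priority() in the patch. The sketch below replays that arithmetic in
user space so the separate contributions of nice level and sleep ratio are
visible; the sample sleep fractions in main() are invented for illustration.

#include <stdio.h>

#define MAX_RT_PRIO   100
#define MAX_PRIO      (MAX_RT_PRIO + 59)
#define USER_PRIO(p)  ((p) - MAX_RT_PRIO)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
#define SLEEP_FACTOR  1024
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 30)

/*
 * sleep_avg is the fixed-point fraction of recent time spent sleeping,
 * 0..SLEEP_FACTOR, as task_sleep_avg() would report it.
 */
static int priority(int nice, unsigned long sleep_avg)
{
	int static_prio = NICE_TO_PRIO(nice);
	int bonus, prio;

	prio = USER_PRIO(static_prio) + 10;
	bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2))
		/ SLEEP_FACTOR;
	prio = MAX_RT_PRIO + prio - bonus;

	if (prio < MAX_RT_PRIO)
		return MAX_RT_PRIO;
	if (prio > MAX_PRIO - 1)
		return MAX_PRIO - 1;
	return prio;
}

int main(void)
{
	/* a nice 0 CPU hog vs. a nice 0 task that sleeps ~80% of the time */
	printf("nice 0,   hog:     prio %d\n", priority(0, 0));
	printf("nice 0,   sleeper: prio %d\n", priority(0, 8 * SLEEP_FACTOR / 10));
	/* the renice-X-to--10 hack gives the server extra headroom on top */
	printf("nice -10, sleeper: prio %d\n", priority(-10, 8 * SLEEP_FACTOR / 10));
	return 0;
}

With these constants the sleep bonus can move a task by at most 20 priority
levels, so nice levels still dominate: a nice 19 CPU hog can never climb above
the range a nice -20 task occupies.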