diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/arch/i386/Kconfig 730-schedstats/arch/i386/Kconfig
--- 720-vma_statistics/arch/i386/Kconfig	2004-02-20 15:54:05.000000000 -0800
+++ 730-schedstats/arch/i386/Kconfig	2004-02-20 15:57:18.000000000 -0800
@@ -1657,6 +1657,19 @@ config MAGIC_SYSRQ
 	depends on KGDB_SYSRQ
 	default y
 
+config SCHEDSTATS
+	bool "Collect scheduler statistics"
+	depends on PROC_FS
+	default y
+	help
+	  If you say Y here, additional code will be inserted into the
+	  scheduler and related routines to collect statistics about
+	  scheduler behavior and provide them in /proc/schedstat. These
+	  stats may be useful for both tuning and debugging the scheduler.
+	  If you aren't debugging the scheduler or trying to tune a specific
+	  application, you can say N to avoid the very slight overhead
+	  this adds.
+
 config X86_FIND_SMP_CONFIG
 	bool
 	depends on X86_LOCAL_APIC || X86_VOYAGER
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/arch/ppc/Kconfig 730-schedstats/arch/ppc/Kconfig
--- 720-vma_statistics/arch/ppc/Kconfig	2004-02-18 16:23:02.000000000 -0800
+++ 730-schedstats/arch/ppc/Kconfig	2004-02-20 15:57:18.000000000 -0800
@@ -1261,6 +1261,19 @@ config DEBUG_INFO
 	  debug the kernel. If you don't debug the kernel, you can say N.
 
+config SCHEDSTATS
+	bool "Collect scheduler statistics"
+	depends on PROC_FS
+	default y
+	help
+	  If you say Y here, additional code will be inserted into the
+	  scheduler and related routines to collect statistics about
+	  scheduler behavior and provide them in /proc/schedstat. These
+	  stats may be useful for both tuning and debugging the scheduler.
+	  If you aren't debugging the scheduler or trying to tune a specific
+	  application, you can say N to avoid the very slight overhead
+	  this adds.
+
 config BOOTX_TEXT
 	bool "Support for early boot text console (BootX or OpenFirmware only)"
 	depends PPC_OF
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/arch/ppc64/Kconfig 730-schedstats/arch/ppc64/Kconfig
--- 720-vma_statistics/arch/ppc64/Kconfig	2004-02-18 16:23:23.000000000 -0800
+++ 730-schedstats/arch/ppc64/Kconfig	2004-02-20 15:57:18.000000000 -0800
@@ -417,7 +417,20 @@ config DEBUG_INFO
 	  debugging info resulting in a larger kernel image. Say Y here
 	  only if you plan to use gdb to debug the kernel. If you don't
 	  debug the kernel, you can say N.
-
+
+config SCHEDSTATS
+	bool "Collect scheduler statistics"
+	depends on PROC_FS
+	default y
+	help
+	  If you say Y here, additional code will be inserted into the
+	  scheduler and related routines to collect statistics about
+	  scheduler behavior and provide them in /proc/schedstat. These
+	  stats may be useful for both tuning and debugging the scheduler.
+	  If you aren't debugging the scheduler or trying to tune a specific
+	  application, you can say N to avoid the very slight overhead
+	  this adds.
+
 config MCOUNT
 	bool "Generate function call graph"
 	depends on DEBUG_KERNEL
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/arch/x86_64/Kconfig 730-schedstats/arch/x86_64/Kconfig
--- 720-vma_statistics/arch/x86_64/Kconfig	2004-02-18 16:23:03.000000000 -0800
+++ 730-schedstats/arch/x86_64/Kconfig	2004-02-20 15:57:18.000000000 -0800
@@ -470,6 +470,19 @@ config DEBUG_INFO
 	  Say Y here only if you plan to use gdb to debug the kernel.
 	  Please note that this option requires new binutils.
 	  If you don't debug the kernel, you can say N.
+
+config SCHEDSTATS
+	bool "Collect scheduler statistics"
+	depends on PROC_FS
+	default y
+	help
+	  If you say Y here, additional code will be inserted into the
+	  scheduler and related routines to collect statistics about
+	  scheduler behavior and provide them in /proc/schedstat. These
+	  stats may be useful for both tuning and debugging the scheduler.
+	  If you aren't debugging the scheduler or trying to tune a specific
+	  application, you can say N to avoid the very slight overhead
+	  this adds.
 
 config FRAME_POINTER
 	bool "Compile the kernel with frame pointers"
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/fs/proc/array.c 730-schedstats/fs/proc/array.c
--- 720-vma_statistics/fs/proc/array.c	2003-10-21 11:16:10.000000000 -0700
+++ 730-schedstats/fs/proc/array.c	2004-02-20 15:57:18.000000000 -0800
@@ -343,9 +343,15 @@ int proc_pid_stat(struct task_struct *ta
 	read_lock(&tasklist_lock);
 	ppid = task->pid ? task->real_parent->pid : 0;
 	read_unlock(&tasklist_lock);
+#ifdef CONFIG_SCHEDSTATS
+	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
+%lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %lu %lu %lu\n",
+#else
 	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \
 %lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n",
+#endif /* CONFIG_SCHEDSTATS */
 		task->pid,
 		task->comm,
 		state,
@@ -391,7 +397,14 @@ int proc_pid_stat(struct task_struct *ta
 		task->exit_signal,
 		task_cpu(task),
 		task->rt_priority,
+#ifdef CONFIG_SCHEDSTATS
+		task->policy,
+		task->sched_info.cpu_time,
+		task->sched_info.run_delay,
+		task->sched_info.pcnt);
+#else
 		task->policy);
+#endif /* CONFIG_SCHEDSTATS */
 	if(mm)
 		mmput(mm);
 	return res;
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/fs/proc/proc_misc.c 730-schedstats/fs/proc/proc_misc.c
--- 720-vma_statistics/fs/proc/proc_misc.c	2004-02-18 16:23:23.000000000 -0800
+++ 730-schedstats/fs/proc/proc_misc.c	2004-02-20 15:57:18.000000000 -0800
@@ -325,6 +325,10 @@ static struct file_operations proc_vmsta
 	.release	= seq_release,
 };
 
+#ifdef CONFIG_SCHEDSTATS
+extern struct file_operations proc_schedstat_operations;
+#endif
+
 #ifdef CONFIG_PROC_HARDWARE
 static int hardware_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
@@ -826,6 +830,9 @@ void __init proc_misc_init(void)
 #ifdef CONFIG_NUMA
 	create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations);
 #endif
+#ifdef CONFIG_SCHEDSTATS
+	create_seq_entry("schedstat", 0, &proc_schedstat_operations);
+#endif
 #ifdef CONFIG_PROC_KCORE
 	proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
 	if (proc_root_kcore) {
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/include/linux/sched.h 730-schedstats/include/linux/sched.h
--- 720-vma_statistics/include/linux/sched.h	2004-02-20 15:40:36.000000000 -0800
+++ 730-schedstats/include/linux/sched.h	2004-02-20 15:57:18.000000000 -0800
@@ -101,6 +101,16 @@ extern unsigned long nr_running_cpu(int
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_iowait(void);
 
+#ifdef CONFIG_SCHEDSTATS
+struct sched_info;
+extern void cpu_sched_info(struct sched_info *, int);
+#define SCHEDSTAT_INC(cpu, field)	schedstats[cpu].field++;
+#define SCHEDSTAT_ADD(cpu, field, amt)	schedstats[cpu].field += amt;
+#else
+#define SCHEDSTAT_INC(cpu, field)	{;}
+#define SCHEDSTAT_ADD(cpu, field, amt)	{;}
+#endif
+
 #include
 #include
 #include
@@ -336,6 +346,18 @@ struct k_itimer {
 	struct sigqueue *sigq;		/* signal queue entry. */
 };
 
+#ifdef CONFIG_SCHEDSTATS
+struct sched_info {
+	/* cumulative counters */
+	unsigned long	cpu_time,	/* time spent on the cpu */
+			run_delay,	/* time spent waiting on a runqueue */
+			pcnt;		/* # of timeslices run on this cpu */
+
+	/* timestamps */
+	unsigned long	last_arrival,	/* when we last ran on a cpu */
+			last_queued;	/* when we were last queued to run */
+};
+#endif /* CONFIG_SCHEDSTATS */
 
 struct io_context;			/* See blkdev.h */
 void exit_io_context(void);
@@ -362,6 +384,10 @@ struct task_struct {
 	cpumask_t cpus_allowed;
 	unsigned int time_slice, first_time_slice;
 
+#ifdef CONFIG_SCHEDSTATS
+	struct sched_info sched_info;
+#endif /* CONFIG_SCHEDSTATS */
+
 	struct list_head tasks;
 	struct list_head ptrace_children;
 	struct list_head ptrace_list;
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/kernel/fork.c 730-schedstats/kernel/fork.c
--- 720-vma_statistics/kernel/fork.c	2004-02-20 15:40:36.000000000 -0800
+++ 730-schedstats/kernel/fork.c	2004-02-20 15:58:14.000000000 -0800
@@ -944,6 +944,9 @@ struct task_struct *copy_process(unsigne
 	p->security = NULL;
 	p->io_context = NULL;
 	p->io_wait = NULL;
+#ifdef CONFIG_SCHEDSTATS
+	memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif /* CONFIG_SCHEDSTATS */
 
 	retval = -ENOMEM;
 	if ((retval = security_task_alloc(p)))
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/kernel/sched.c 730-schedstats/kernel/sched.c
--- 720-vma_statistics/kernel/sched.c	2004-02-18 16:20:09.000000000 -0800
+++ 730-schedstats/kernel/sched.c	2004-02-20 15:57:18.000000000 -0800
@@ -37,6 +37,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
@@ -205,6 +207,9 @@ struct runqueue {
 			nr_uninterruptible;
 	unsigned long long timestamp_last_tick;
 	task_t *curr, *idle;
+#ifdef CONFIG_SCHEDSTATS
+	int cpu;	/* to make easy reverse-lookups with per-cpu runqueues */
+#endif
 	struct mm_struct *prev_mm;
 	prio_array_t *active, *expired, arrays[2];
 	int best_expired_prio;
@@ -220,6 +225,10 @@ struct runqueue {
 	task_t *migration_thread;
 	struct list_head migration_queue;
+
+#ifdef CONFIG_SCHEDSTATS
+	struct sched_info info;
+#endif
 };
 
 static DEFINE_PER_CPU(struct runqueue, runqueues);
@@ -278,6 +287,146 @@ static inline void task_rq_unlock(runque
 	spin_unlock_irqrestore(&rq->lock, *flags);
 }
+
+#ifdef CONFIG_SCHEDSTATS
+struct schedstat {
+	/* sys_sched_yield stats */
+	unsigned long yld_exp_empty;
+	unsigned long yld_act_empty;
+	unsigned long yld_both_empty;
+	unsigned long yld_cnt;
+
+	/* schedule stats */
+	unsigned long sched_noswitch;
+	unsigned long sched_switch;
+	unsigned long sched_cnt;
+
+	/* load_balance stats */
+	unsigned long lb_imbalance;
+	unsigned long lb_idle;
+	unsigned long lb_busy;
+	unsigned long lb_resched;
+	unsigned long lb_cnt;
+	unsigned long lb_nobusyg;
+	unsigned long lb_nobusyq;
+
+	/* pull_task stats */
+	unsigned long pt_gained;
+	unsigned long pt_lost;
+
+	/* active_load_balance stats */
+	unsigned long alb_cnt;
+	unsigned long alb_gained;
+	unsigned long alb_lost;
+
+	/* migrate_to_cpu stats */
+	unsigned long mtc_cnt;
+
+	/* sched_balance_exec stats */
+	unsigned long sbe_cnt;
+} ____cacheline_aligned;
+
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 5
+
+struct schedstat schedstats[NR_CPUS];
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+	struct schedstat sums;
+	int i;
+
+	memset(&sums, 0, sizeof(sums));
+	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+	seq_printf(seq, "timestamp %lu\n", jiffies);
+	for (i = 0; i < NR_CPUS; i++) {
+
+		struct sched_info info;
+
+		if (!cpu_online(i)) continue;
+
+		cpu_sched_info(&info, i);
+
+		sums.yld_exp_empty += schedstats[i].yld_exp_empty;
+		sums.yld_act_empty += schedstats[i].yld_act_empty;
+		sums.yld_both_empty += schedstats[i].yld_both_empty;
+		sums.yld_cnt += schedstats[i].yld_cnt;
+		sums.sched_noswitch += schedstats[i].sched_noswitch;
+		sums.sched_switch += schedstats[i].sched_switch;
+		sums.sched_cnt += schedstats[i].sched_cnt;
+		sums.lb_idle += schedstats[i].lb_idle;
+		sums.lb_busy += schedstats[i].lb_busy;
+		sums.lb_resched += schedstats[i].lb_resched;
+		sums.lb_cnt += schedstats[i].lb_cnt;
+		sums.lb_imbalance += schedstats[i].lb_imbalance;
+		sums.lb_nobusyg += schedstats[i].lb_nobusyg;
+		sums.lb_nobusyq += schedstats[i].lb_nobusyq;
+		sums.pt_gained += schedstats[i].pt_gained;
+		sums.pt_lost += schedstats[i].pt_lost;
+		sums.alb_cnt += schedstats[i].alb_cnt;
+		sums.alb_gained += schedstats[i].alb_gained;
+		sums.alb_lost += schedstats[i].alb_lost;
+		seq_printf(seq,
+		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+		    "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+		    i, schedstats[i].yld_both_empty,
+		    schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty,
+		    schedstats[i].yld_cnt, schedstats[i].sched_noswitch,
+		    schedstats[i].sched_switch, schedstats[i].sched_cnt,
+		    schedstats[i].lb_idle, schedstats[i].lb_busy,
+		    schedstats[i].lb_resched,
+		    schedstats[i].lb_cnt, schedstats[i].lb_imbalance,
+		    schedstats[i].lb_nobusyg, schedstats[i].lb_nobusyq,
+		    schedstats[i].pt_gained, schedstats[i].pt_lost,
+		    schedstats[i].alb_cnt,
+		    schedstats[i].alb_gained, schedstats[i].alb_lost,
+		    schedstats[i].sbe_cnt,
+		    schedstats[i].mtc_cnt,
+		    info.cpu_time, info.run_delay, info.pcnt);
+	}
+	seq_printf(seq,
+	    "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+	    "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+	    sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty,
+	    sums.yld_cnt, sums.sched_noswitch, sums.sched_switch,
+	    sums.sched_cnt, sums.lb_idle, sums.lb_busy, sums.lb_resched,
+	    sums.lb_cnt, sums.lb_imbalance, sums.lb_nobusyg, sums.lb_nobusyq,
+	    sums.pt_gained, sums.pt_lost, sums.alb_cnt, sums.alb_gained,
+	    sums.alb_lost, sums.sbe_cnt, sums.mtc_cnt);
+
+	return 0;
+}
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+	unsigned size = 4096 * (1 + num_online_cpus() / 32);
+	char *buf = kmalloc(size, GFP_KERNEL);
+	struct seq_file *m;
+	int res;
+
+	if (!buf)
+		return -ENOMEM;
+	res = single_open(file, show_schedstat, NULL);
+	if (!res) {
+		m = file->private_data;
+		m->buf = buf;
+		m->size = size;
+	} else
+		kfree(buf);
+	return res;
+}
+
+struct file_operations proc_schedstat_operations = {
+	.open    = schedstat_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+#endif
+
 /*
  * rq_lock - lock a given runqueue and disable interrupts.
  */
@@ -297,6 +446,113 @@ static inline void rq_unlock(runqueue_t
 	spin_unlock_irq(&rq->lock);
 }
 
+#ifdef CONFIG_SCHEDSTATS
+/*
+ * Called when a process is dequeued from the active array and given
+ * the cpu. We should note that with the exception of interactive
+ * tasks, the expired queue will become the active queue after the active
+ * queue is empty, without explicitly dequeuing and requeuing tasks in the
+ * expired queue. (Interactive tasks may be requeued directly to the
+ * active queue, thus delaying tasks in the expired queue from running;
+ * see scheduler_tick()).
+ *
+ * This function is only called from sched_info_arrive(), rather than
+ * dequeue_task(). Even though a task may be queued and dequeued multiple
+ * times as it is shuffled about, we're really interested in knowing how
+ * long it was from the *first* time it was queued to the time that it
+ * finally hit a cpu.
+ */
+static inline void sched_info_dequeued(task_t *t)
+{
+	t->sched_info.last_queued = 0;
+}
+
+/*
+ * Called when a task finally hits the cpu. We can now calculate how
+ * long it was waiting to run. We also note when it began so that we
+ * can keep stats on how long its timeslice is.
+ */
+static inline void sched_info_arrive(task_t *t)
+{
+	unsigned long now = jiffies;
+	unsigned long diff = 0;
+	struct runqueue *rq = task_rq(t);
+
+	if (t->sched_info.last_queued)
+		diff = now - t->sched_info.last_queued;
+	sched_info_dequeued(t);
+	t->sched_info.run_delay += diff;
+	t->sched_info.last_arrival = now;
+	t->sched_info.pcnt++;
+
+	if (!rq)
+		return;
+
+	rq->info.run_delay += diff;
+	rq->info.pcnt++;
+}
+
+/*
+ * Called when a process is queued into either the active or expired
+ * array. The time is noted and later used to determine how long we
+ * had to wait for us to reach the cpu. Since the expired queue will
+ * become the active queue after active queue is empty, without dequeuing
+ * and requeuing any tasks, we are interested in queuing to either. It
+ * is unusual but not impossible for tasks to be dequeued and immediately
+ * requeued in the same or another array: this can happen in sched_yield(),
+ * set_user_nice(), and even load_balance() as it moves tasks from runqueue
+ * to runqueue.
+ *
+ * This function is only called from enqueue_task(), but also only updates
+ * the timestamp if it is not already set. It's assumed that
+ * sched_info_dequeued() will clear that stamp when appropriate.
+ */
+static inline void sched_info_queued(task_t *t)
+{
+	if (!t->sched_info.last_queued)
+		t->sched_info.last_queued = jiffies;
+}
+
+/*
+ * Called when a process ceases being the active-running process, either
+ * voluntarily or involuntarily. Now we can calculate how long we ran.
+ */
+static inline void sched_info_depart(task_t *t)
+{
+	struct runqueue *rq = task_rq(t);
+	unsigned long diff = jiffies - t->sched_info.last_arrival;
+
+	t->sched_info.cpu_time += diff;
+
+	if (rq)
+		rq->info.cpu_time += diff;
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice. (This may also be called when switching to or from
+ * the idle task.) We are only called when prev != next.
+ */
+static inline void sched_info_switch(task_t *prev, task_t *next)
+{
+	struct runqueue *rq = task_rq(prev);
+
+	/*
+	 * prev now departs the cpu. It's not interesting to record
+	 * stats about how efficient we were at scheduling the idle
	 * process, however.
+	 */
+	if (prev != rq->idle)
+		sched_info_depart(prev);
+
+	if (next != rq->idle)
+		sched_info_arrive(next);
+}
+#else
+#define sched_info_queued(t)		{}
+#define sched_info_switch(t, next)	{}
+#endif /* CONFIG_SCHEDSTATS */
+
 /*
  * Adding/removing a task to/from a priority array:
  */
@@ -310,6 +566,7 @@ static inline void dequeue_task(struct t
 
 static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
 {
+	sched_info_queued(p);
 	list_add_tail(&p->run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
@@ -1077,6 +1334,13 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
+#ifdef CONFIG_SCHEDSTATS
+void cpu_sched_info(struct sched_info *info, int cpu)
+{
+	memcpy(info, &cpu_rq(cpu)->info, sizeof(struct sched_info));
+}
+#endif /* CONFIG_SCHEDSTATS */
+
 /*
  * double_rq_lock - safely lock two runqueues
  *
@@ -1136,6 +1400,7 @@ static void sched_migrate_task(task_t *p
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
 		goto out;
 
+	SCHEDSTAT_INC(smp_processor_id(), mtc_cnt);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread. */
@@ -1186,6 +1451,7 @@ void sched_balance_exec(void)
 	if (numnodes == 1)
 		return;
 
+	SCHEDSTAT_INC(this_cpu, sbe_cnt);
 	while (domain->parent && !(domain->flags & SD_FLAG_EXEC))
 		domain = domain->parent;
 
@@ -1220,6 +1486,8 @@ static inline void pull_task(runqueue_t
 	task_t *p, runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
+	SCHEDSTAT_INC(this_cpu, pt_gained);
+	SCHEDSTAT_INC(src_rq->cpu, pt_lost);
 	dequeue_task(p, src_array);
 	nr_running_dec(src_rq);
 	set_task_cpu(p, this_cpu);
@@ -1351,7 +1619,7 @@ static struct sched_group *
 find_busiest_group(struct sched_domain *domain, int this_cpu,
 				unsigned long *imbalance, enum idle_type idle)
 {
-	unsigned long max_load, avg_load, total_load, this_load;
+	unsigned long max_load, avg_load, total_load, this_load, load_diff;
 	int modify, total_nr_cpus, busiest_nr_cpus = 0;
 	enum idle_type package_idle = IDLE;
 	struct sched_group *busiest = NULL, *group = domain->groups;
@@ -1504,19 +1772,23 @@ static int load_balance(int this_cpu, ru
 	int balanced = 0, failed = 0;
 	int nr_moved = 0;
 
+	SCHEDSTAT_INC(this_cpu, lb_cnt);
 	spin_lock(&this_rq->lock);
 	group = find_busiest_group(domain, this_cpu, &imbalance, idle);
 	if (!group) {
 		balanced = 1;
+		SCHEDSTAT_INC(this_cpu, lb_nobusyg);
 		goto out;
 	}
 
 	busiest = find_busiest_queue(group);
 	if (!busiest || busiest == this_rq) {
 		balanced = 1;
+		SCHEDSTAT_INC(this_cpu, lb_nobusyq);
 		goto out;
 	}
 
+	SCHEDSTAT_ADD(this_cpu, lb_imbalance, imbalance);
 	/* Attempt to move tasks */
 	double_lock_balance(this_rq, busiest);
 
@@ -1634,20 +1906,21 @@ static inline void idle_balance(int this
 *
 * Called with busiest locked.
 */
-static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
+static void active_load_balance(runqueue_t *this_rq, int this_cpu)
 {
 	int i;
-	struct sched_domain *sd = cpu_sched_domain(busiest_cpu);
+	struct sched_domain *sd = cpu_sched_domain(this_cpu);
 	struct sched_group *group, *busy_group;
 
-	if (busiest->nr_running <= 1)
+	SCHEDSTAT_INC(this_cpu, alb_cnt);
+	if (this_rq->nr_running <= 1)
 		return;
 
 	/* sd->parent should never cause a NULL dereference, if it did so,
 	 * then push_cpu was set to a buggy value */
-	while (!cpu_isset(busiest->push_cpu, sd->span)) {
+	while (!cpu_isset(this_rq->push_cpu, sd->span)) {
 		sd = sd->parent;
-		if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
+		if (!sd->parent && !cpu_isset(this_rq->push_cpu, sd->span)) {
 			WARN_ON(1);
 			return;
 		}
@@ -1659,7 +1932,7 @@ static void active_load_balance(runqueue
 	}
 
 	group = sd->groups;
-	while (!cpu_isset(busiest_cpu, group->cpumask)) {
+	while (!cpu_isset(this_cpu, group->cpumask)) {
 		group = group->next;
 		if (group == sd->groups) {
 			WARN_ON(1);
@@ -1689,8 +1962,10 @@ static void active_load_balance(runqueue
 			goto next_group;
 
 		rq = cpu_rq(push_cpu);
-		double_lock_balance(busiest, rq);
-		move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
+		double_lock_balance(this_rq, rq);
+		move_tasks(rq, push_cpu, this_rq, 1, sd, IDLE);
+		SCHEDSTAT_INC(this_cpu, alb_lost);
+		SCHEDSTAT_INC(push_cpu, alb_gained);
 		spin_unlock(&rq->lock);
 next_group:
 		group = group->next;
@@ -1731,6 +2006,11 @@ static void rebalance_tick(int this_cpu,
 			interval = 1;
 
 		if (j - domain->last_balance >= interval) {
+			if (idle == IDLE) {
+				SCHEDSTAT_INC(this_cpu, lb_idle);
+			} else {
+				SCHEDSTAT_INC(this_cpu, lb_busy);
+			}
 			if (load_balance(this_cpu, this_rq, domain, idle)) {
 				/* We've pulled tasks over so no longer idle */
 				idle = NOT_IDLE;
@@ -1904,13 +2184,14 @@ asmlinkage void schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int idx;
+	int idx, this_cpu = smp_processor_id();
 
 	/*
 	 * Test if we are atomic. Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
+	SCHEDSTAT_INC(this_cpu, sched_cnt);
 	if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
 		if (unlikely(in_atomic())) {
 			printk(KERN_ERR "bad: scheduling while atomic!\n");
@@ -1956,6 +2237,7 @@ need_resched:
 
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
+		SCHEDSTAT_INC(this_cpu, lb_resched);
 		idle_balance(smp_processor_id(), rq);
 #endif
 		if (!rq->nr_running) {
@@ -1970,12 +2252,14 @@ need_resched:
 		/*
 		 * Switch the active and expired arrays.
 		 */
+		SCHEDSTAT_INC(this_cpu, sched_switch);
 		rq->active = rq->expired;
 		rq->expired = array;
 		array = rq->active;
 		rq->expired_timestamp = 0;
 		rq->best_expired_prio = MAX_PRIO;
 	}
+	SCHEDSTAT_INC(this_cpu, sched_noswitch);
 
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
@@ -2006,6 +2290,7 @@ switch_tasks:
 	}
 
 	prev->timestamp = now;
+	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
 		next->timestamp = now;
 		rq->nr_switches++;
@@ -2697,6 +2982,9 @@ asmlinkage long sys_sched_yield(void)
 {
 	runqueue_t *rq = this_rq_lock();
 	prio_array_t *array = current->array;
+#ifdef CONFIG_SCHEDSTATS
+	int this_cpu = smp_processor_id();
+#endif /* CONFIG_SCHEDSTATS */
 
 	/*
 	 * We implement yielding by moving the task into the expired
@@ -2705,7 +2993,16 @@ asmlinkage long sys_sched_yield(void)
 	 * (special rule: RT tasks will just roundrobin in the active
 	 * array.)
 	 */
+	SCHEDSTAT_INC(this_cpu, yld_cnt);
 	if (likely(!rt_task(current))) {
+		if (current->array->nr_active == 1) {
+			SCHEDSTAT_INC(this_cpu, yld_act_empty);
+			if (!rq->expired->nr_active) {
+				SCHEDSTAT_INC(this_cpu, yld_both_empty);
+			}
+		} else if (!rq->expired->nr_active) {
+			SCHEDSTAT_INC(this_cpu, yld_exp_empty);
+		}
 		dequeue_task(current, array);
 		enqueue_task(current, rq->expired);
 	} else {
@@ -3458,6 +3755,9 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
+#ifdef CONFIG_SCHEDSTATS
+		rq->cpu = i;
+#endif /* CONFIG_SCHEDSTATS */
 		rq->best_expired_prio = MAX_PRIO;
 
 		spin_lock_init(&rq->lock);
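
For reference, below is a minimal userspace sketch (not part of the patch) that parses the /proc/schedstat output produced by show_schedstat() above. It assumes format version 5 exactly as emitted here: one "cpu%d" line per online cpu with 24 counters in the order of the seq_printf() call, the last three being the runqueue sched_info counters cpu_time, run_delay and pcnt in jiffies/timeslices. The file and program names are arbitrary.

/* schedstat-avg.c: print average run delay per timeslice from /proc/schedstat (version 5) */
#include <stdio.h>

int main(void)
{
	FILE *fp = fopen("/proc/schedstat", "r");
	char line[512];
	int version;

	if (!fp) {
		perror("/proc/schedstat");
		return 1;
	}
	/* first line is "version %d"; bail out if it is not the format defined above */
	if (!fgets(line, sizeof(line), fp) ||
	    sscanf(line, "version %d", &version) != 1 || version != 5) {
		fprintf(stderr, "unexpected /proc/schedstat format\n");
		fclose(fp);
		return 1;
	}
	while (fgets(line, sizeof(line), fp)) {
		unsigned long f[24];
		int cpu;

		/* 24 per-cpu counters, in the order printed by show_schedstat() */
		if (sscanf(line, "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
		    "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
		    &cpu, &f[0], &f[1], &f[2], &f[3], &f[4], &f[5], &f[6],
		    &f[7], &f[8], &f[9], &f[10], &f[11], &f[12], &f[13],
		    &f[14], &f[15], &f[16], &f[17], &f[18], &f[19], &f[20],
		    &f[21], &f[22], &f[23]) != 25)
			continue;	/* skips the "timestamp" and "totals" lines */
		/* f[21] = cpu_time, f[22] = run_delay, f[23] = pcnt */
		printf("cpu%d: avg run delay per timeslice = %.2f jiffies\n",
		       cpu, f[23] ? (double)f[22] / f[23] : 0.0);
	}
	fclose(fp);
	return 0;
}

The same three counters are appended per task to /proc/<pid>/stat by the fs/proc/array.c hunk above, so a per-process variant of this sketch only needs to read the last three fields of that line.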