diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/arch/i386/Kconfig 730-schedstats/arch/i386/Kconfig
--- 720-vma_statistics/arch/i386/Kconfig	2004-02-20 15:54:05.000000000 -0800
+++ 730-schedstats/arch/i386/Kconfig	2004-02-20 15:57:18.000000000 -0800
@@ -1657,6 +1657,19 @@ config MAGIC_SYSRQ
 	depends on KGDB_SYSRQ
 	default y
 
+config SCHEDSTATS
+	bool "Collect scheduler statistics"
+	depends on PROC_FS
+	default y
+	help
+	  If you say Y here, additional code will be inserted into the
+	  scheduler and related routines to collect statistics about
+	  scheduler behavior and provide them in /proc/schedstat. These
+	  stats may be useful for both tuning and debugging the scheduler.
+	  If you aren't debugging the scheduler or trying to tune a specific
+	  application, you can say N to avoid the very slight overhead
+	  this adds.
+
 config X86_FIND_SMP_CONFIG
 	bool
 	depends on X86_LOCAL_APIC || X86_VOYAGER
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/arch/ppc/Kconfig 730-schedstats/arch/ppc/Kconfig
--- 720-vma_statistics/arch/ppc/Kconfig	2004-02-18 16:23:02.000000000 -0800
+++ 730-schedstats/arch/ppc/Kconfig	2004-02-20 15:57:18.000000000 -0800
@@ -1261,6 +1261,19 @@ config DEBUG_INFO
 	  debug the kernel. If you don't debug the kernel, you can say N.
 
+config SCHEDSTATS
+	bool "Collect scheduler statistics"
+	depends on PROC_FS
+	default y
+	help
+	  If you say Y here, additional code will be inserted into the
+	  scheduler and related routines to collect statistics about
+	  scheduler behavior and provide them in /proc/schedstat. These
+	  stats may be useful for both tuning and debugging the scheduler.
+	  If you aren't debugging the scheduler or trying to tune a specific
+	  application, you can say N to avoid the very slight overhead
+	  this adds.
+
 config BOOTX_TEXT
 	bool "Support for early boot text console (BootX or OpenFirmware only)"
 	depends PPC_OF
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/arch/ppc64/Kconfig 730-schedstats/arch/ppc64/Kconfig
--- 720-vma_statistics/arch/ppc64/Kconfig	2004-02-18 16:23:23.000000000 -0800
+++ 730-schedstats/arch/ppc64/Kconfig	2004-02-20 15:57:18.000000000 -0800
@@ -417,7 +417,20 @@ config DEBUG_INFO
 	  debugging info resulting in a larger kernel image. Say Y here
 	  only if you plan to use gdb to debug the kernel. If you don't
 	  debug the kernel, you can say N.
-
+
+config SCHEDSTATS
+	bool "Collect scheduler statistics"
+	depends on PROC_FS
+	default y
+	help
+	  If you say Y here, additional code will be inserted into the
+	  scheduler and related routines to collect statistics about
+	  scheduler behavior and provide them in /proc/schedstat. These
+	  stats may be useful for both tuning and debugging the scheduler.
+	  If you aren't debugging the scheduler or trying to tune a specific
+	  application, you can say N to avoid the very slight overhead
+	  this adds.
+
 config MCOUNT
 	bool "Generate function call graph"
 	depends on DEBUG_KERNEL
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/arch/x86_64/Kconfig 730-schedstats/arch/x86_64/Kconfig
--- 720-vma_statistics/arch/x86_64/Kconfig	2004-02-18 16:23:03.000000000 -0800
+++ 730-schedstats/arch/x86_64/Kconfig	2004-02-20 15:57:18.000000000 -0800
@@ -470,6 +470,19 @@ config DEBUG_INFO
 	  Say Y here only if you plan to use gdb to debug the kernel.
 	  Please note that this option requires new binutils.
 	  If you don't debug the kernel, you can say N.
+
+config SCHEDSTATS
+	bool "Collect scheduler statistics"
+	depends on PROC_FS
+	default y
+	help
+	  If you say Y here, additional code will be inserted into the
+	  scheduler and related routines to collect statistics about
+	  scheduler behavior and provide them in /proc/schedstat. These
+	  stats may be useful for both tuning and debugging the scheduler.
+	  If you aren't debugging the scheduler or trying to tune a specific
+	  application, you can say N to avoid the very slight overhead
+	  this adds.
 
 config FRAME_POINTER
 	bool "Compile the kernel with frame pointers"
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/fs/proc/array.c 730-schedstats/fs/proc/array.c
--- 720-vma_statistics/fs/proc/array.c	2003-10-21 11:16:10.000000000 -0700
+++ 730-schedstats/fs/proc/array.c	2004-02-20 15:57:18.000000000 -0800
@@ -343,9 +343,15 @@ int proc_pid_stat(struct task_struct *ta
 	read_lock(&tasklist_lock);
 	ppid = task->pid ? task->real_parent->pid : 0;
 	read_unlock(&tasklist_lock);
+#ifdef CONFIG_SCHEDSTATS
+	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
+%lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %lu %lu %lu\n",
+#else
 	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \
 %lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n",
+#endif /* CONFIG_SCHEDSTATS */
 		task->pid,
 		task->comm,
 		state,
@@ -391,7 +397,14 @@ int proc_pid_stat(struct task_struct *ta
 		task->exit_signal,
 		task_cpu(task),
 		task->rt_priority,
+#ifdef CONFIG_SCHEDSTATS
+		task->policy,
+		task->sched_info.cpu_time,
+		task->sched_info.run_delay,
+		task->sched_info.pcnt);
+#else
 		task->policy);
+#endif /* CONFIG_SCHEDSTATS */
 	if(mm)
 		mmput(mm);
 	return res;
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/fs/proc/proc_misc.c 730-schedstats/fs/proc/proc_misc.c
--- 720-vma_statistics/fs/proc/proc_misc.c	2004-02-18 16:23:23.000000000 -0800
+++ 730-schedstats/fs/proc/proc_misc.c	2004-02-20 15:57:18.000000000 -0800
@@ -325,6 +325,10 @@ static struct file_operations proc_vmsta
 	.release	= seq_release,
 };
 
+#ifdef CONFIG_SCHEDSTATS
+extern struct file_operations proc_schedstat_operations;
+#endif
+
 #ifdef CONFIG_PROC_HARDWARE
 static int hardware_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
@@ -826,6 +830,9 @@ void __init proc_misc_init(void)
 #ifdef CONFIG_NUMA
 	create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations);
 #endif
+#ifdef CONFIG_SCHEDSTATS
+	create_seq_entry("schedstat", 0, &proc_schedstat_operations);
+#endif
 #ifdef CONFIG_PROC_KCORE
 	proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
 	if (proc_root_kcore) {
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/include/linux/sched.h 730-schedstats/include/linux/sched.h
--- 720-vma_statistics/include/linux/sched.h	2004-02-20 15:40:36.000000000 -0800
+++ 730-schedstats/include/linux/sched.h	2004-02-20 15:57:18.000000000 -0800
@@ -101,6 +101,16 @@ extern unsigned long nr_running_cpu(int
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_iowait(void);
 
+#ifdef CONFIG_SCHEDSTATS
+struct sched_info;
+extern void cpu_sched_info(struct sched_info *, int);
+#define SCHEDSTAT_INC(cpu, field)	schedstats[cpu].field++;
+#define SCHEDSTAT_ADD(cpu, field, amt)	schedstats[cpu].field += amt;
+#else
+#define SCHEDSTAT_INC(cpu, field)	{;}
+#define SCHEDSTAT_ADD(cpu, field, amt)	{;}
+#endif
+
 #include
 #include
 #include
@@ -336,6 +346,18 @@ struct k_itimer {
 	struct sigqueue *sigq;		/* signal queue entry. */
 };
 
+#ifdef CONFIG_SCHEDSTATS
+struct sched_info {
+	/* cumulative counters */
+	unsigned long	cpu_time,	/* time spent on the cpu */
+			run_delay,	/* time spent waiting on a runqueue */
+			pcnt;		/* # of timeslices run on this cpu */
+
+	/* timestamps */
+	unsigned long	last_arrival,	/* when we last ran on a cpu */
+			last_queued;	/* when we were last queued to run */
+};
+#endif /* CONFIG_SCHEDSTATS */
 
 struct io_context;			/* See blkdev.h */
 void exit_io_context(void);
@@ -362,6 +384,10 @@ struct task_struct {
 	cpumask_t cpus_allowed;
 	unsigned int time_slice, first_time_slice;
 
+#ifdef CONFIG_SCHEDSTATS
+	struct sched_info sched_info;
+#endif /* CONFIG_SCHEDSTATS */
+
 	struct list_head tasks;
 	struct list_head ptrace_children;
 	struct list_head ptrace_list;
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/kernel/fork.c 730-schedstats/kernel/fork.c
--- 720-vma_statistics/kernel/fork.c	2004-02-20 15:40:36.000000000 -0800
+++ 730-schedstats/kernel/fork.c	2004-02-20 15:58:14.000000000 -0800
@@ -944,6 +944,9 @@ struct task_struct *copy_process(unsigne
 	p->security = NULL;
 	p->io_context = NULL;
 	p->io_wait = NULL;
+#ifdef CONFIG_SCHEDSTATS
+	memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif /* CONFIG_SCHEDSTATS */
 
 	retval = -ENOMEM;
 	if ((retval = security_task_alloc(p)))
diff -purN -X /home/mbligh/.diff.exclude 720-vma_statistics/kernel/sched.c 730-schedstats/kernel/sched.c
--- 720-vma_statistics/kernel/sched.c	2004-02-18 16:20:09.000000000 -0800
+++ 730-schedstats/kernel/sched.c	2004-02-20 15:57:18.000000000 -0800
@@ -37,6 +37,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
@@ -205,6 +207,9 @@ struct runqueue {
 			nr_uninterruptible;
 	unsigned long long timestamp_last_tick;
 	task_t *curr, *idle;
+#ifdef CONFIG_SCHEDSTATS
+	int cpu;	/* to make easy reverse-lookups with per-cpu runqueues */
+#endif
 	struct mm_struct *prev_mm;
 	prio_array_t *active, *expired, arrays[2];
 	int best_expired_prio;
@@ -220,6 +225,10 @@ struct runqueue {
 	task_t *migration_thread;
 	struct list_head migration_queue;
+
+#ifdef CONFIG_SCHEDSTATS
+	struct sched_info info;
+#endif
 };
 
 static DEFINE_PER_CPU(struct runqueue, runqueues);
@@ -278,6 +287,146 @@ static inline void task_rq_unlock(runque
 	spin_unlock_irqrestore(&rq->lock, *flags);
 }
+
+#ifdef CONFIG_SCHEDSTATS
+struct schedstat {
+	/* sys_sched_yield stats */
+	unsigned long yld_exp_empty;
+	unsigned long yld_act_empty;
+	unsigned long yld_both_empty;
+	unsigned long yld_cnt;
+
+	/* schedule stats */
+	unsigned long sched_noswitch;
+	unsigned long sched_switch;
+	unsigned long sched_cnt;
+
+	/* load_balance stats */
+	unsigned long lb_imbalance;
+	unsigned long lb_idle;
+	unsigned long lb_busy;
+	unsigned long lb_resched;
+	unsigned long lb_cnt;
+	unsigned long lb_nobusyg;
+	unsigned long lb_nobusyq;
+
+	/* pull_task stats */
+	unsigned long pt_gained;
+	unsigned long pt_lost;
+
+	/* active_load_balance stats */
+	unsigned long alb_cnt;
+	unsigned long alb_gained;
+	unsigned long alb_lost;
+
+	/* migrate_to_cpu stats */
+	unsigned long mtc_cnt;
+
+	/* sched_balance_exec stats */
+	unsigned long sbe_cnt;
+} ____cacheline_aligned;
+
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 5
+
+struct schedstat schedstats[NR_CPUS];
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+	struct schedstat sums;
+	int i;
+
+	memset(&sums, 0, sizeof(sums));
+	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+	seq_printf(seq, "timestamp %lu\n", jiffies);
+	for (i = 0; i < NR_CPUS; i++) {
+
+		struct sched_info info;
+
+		if (!cpu_online(i)) continue;
+
+		cpu_sched_info(&info, i);
+
+		sums.yld_exp_empty += schedstats[i].yld_exp_empty;
+		sums.yld_act_empty += schedstats[i].yld_act_empty;
+		sums.yld_both_empty += schedstats[i].yld_both_empty;
+		sums.yld_cnt += schedstats[i].yld_cnt;
+		sums.sched_noswitch += schedstats[i].sched_noswitch;
+		sums.sched_switch += schedstats[i].sched_switch;
+		sums.sched_cnt += schedstats[i].sched_cnt;
+		sums.lb_idle += schedstats[i].lb_idle;
+		sums.lb_busy += schedstats[i].lb_busy;
+		sums.lb_resched += schedstats[i].lb_resched;
+		sums.lb_cnt += schedstats[i].lb_cnt;
+		sums.lb_imbalance += schedstats[i].lb_imbalance;
+		sums.lb_nobusyg += schedstats[i].lb_nobusyg;
+		sums.lb_nobusyq += schedstats[i].lb_nobusyq;
+		sums.pt_gained += schedstats[i].pt_gained;
+		sums.pt_lost += schedstats[i].pt_lost;
+		sums.alb_cnt += schedstats[i].alb_cnt;
+		sums.alb_gained += schedstats[i].alb_gained;
+		sums.alb_lost += schedstats[i].alb_lost;
+		seq_printf(seq,
+		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+		    "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+		    i, schedstats[i].yld_both_empty,
+		    schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty,
+		    schedstats[i].yld_cnt, schedstats[i].sched_noswitch,
+		    schedstats[i].sched_switch, schedstats[i].sched_cnt,
+		    schedstats[i].lb_idle, schedstats[i].lb_busy,
+		    schedstats[i].lb_resched,
+		    schedstats[i].lb_cnt, schedstats[i].lb_imbalance,
+		    schedstats[i].lb_nobusyg, schedstats[i].lb_nobusyq,
+		    schedstats[i].pt_gained, schedstats[i].pt_lost,
+		    schedstats[i].alb_cnt,
+		    schedstats[i].alb_gained, schedstats[i].alb_lost,
+		    schedstats[i].sbe_cnt,
+		    schedstats[i].mtc_cnt,
+		    info.cpu_time, info.run_delay, info.pcnt);
+	}
+	seq_printf(seq,
+	    "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+	    "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+	    sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty,
+	    sums.yld_cnt, sums.sched_noswitch, sums.sched_switch,
+	    sums.sched_cnt, sums.lb_idle, sums.lb_busy, sums.lb_resched,
+	    sums.lb_cnt, sums.lb_imbalance, sums.lb_nobusyg, sums.lb_nobusyq,
+	    sums.pt_gained, sums.pt_lost, sums.alb_cnt, sums.alb_gained,
+	    sums.alb_lost, sums.sbe_cnt, sums.mtc_cnt);
+
+	return 0;
+}
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+	unsigned size = 4096 * (1 + num_online_cpus() / 32);
+	char *buf = kmalloc(size, GFP_KERNEL);
+	struct seq_file *m;
+	int res;
+
+	if (!buf)
+		return -ENOMEM;
+	res = single_open(file, show_schedstat, NULL);
+	if (!res) {
+		m = file->private_data;
+		m->buf = buf;
+		m->size = size;
+	} else
+		kfree(buf);
+	return res;
+}
+
+struct file_operations proc_schedstat_operations = {
+	.open    = schedstat_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+#endif
+
 /*
  * rq_lock - lock a given runqueue and disable interrupts.
  */
@@ -297,6 +446,113 @@ static inline void rq_unlock(runqueue_t
 	spin_unlock_irq(&rq->lock);
 }
 
+#ifdef CONFIG_SCHEDSTATS
+/*
+ * Called when a process is dequeued from the active array and given
+ * the cpu. We should note that with the exception of interactive
+ * tasks, the expired queue will become the active queue after the active
+ * queue is empty, without explicitly dequeuing and requeuing tasks in the
+ * expired queue. (Interactive tasks may be requeued directly to the
+ * active queue, thus delaying tasks in the expired queue from running;
+ * see scheduler_tick()).
+ *
+ * This function is only called from sched_info_arrive(), rather than
+ * dequeue_task(). Even though a task may be queued and dequeued multiple
+ * times as it is shuffled about, we're really interested in knowing how
+ * long it was from the *first* time it was queued to the time that it
+ * finally hit a cpu.
+ */
+static inline void sched_info_dequeued(task_t *t)
+{
+	t->sched_info.last_queued = 0;
+}
+
+/*
+ * Called when a task finally hits the cpu. We can now calculate how
+ * long it was waiting to run. We also note when it began so that we
+ * can keep stats on how long its timeslice is.
+ */
+static inline void sched_info_arrive(task_t *t)
+{
+	unsigned long now = jiffies;
+	unsigned long diff = 0;
+	struct runqueue *rq = task_rq(t);
+
+	if (t->sched_info.last_queued)
+		diff = now - t->sched_info.last_queued;
+	sched_info_dequeued(t);
+	t->sched_info.run_delay += diff;
+	t->sched_info.last_arrival = now;
+	t->sched_info.pcnt++;
+
+	if (!rq)
+		return;
+
+	rq->info.run_delay += diff;
+	rq->info.pcnt++;
+}
+
+/*
+ * Called when a process is queued into either the active or expired
+ * array. The time is noted and later used to determine how long we
+ * had to wait for us to reach the cpu. Since the expired queue will
+ * become the active queue after active queue is empty, without dequeuing
+ * and requeuing any tasks, we are interested in queuing to either. It
+ * is unusual but not impossible for tasks to be dequeued and immediately
+ * requeued in the same or another array: this can happen in sched_yield(),
+ * set_user_nice(), and even load_balance() as it moves tasks from runqueue
+ * to runqueue.
+ *
+ * This function is only called from enqueue_task(), but also only updates
+ * the timestamp if it is not already set. It's assumed that
+ * sched_info_dequeued() will clear that stamp when appropriate.
+ */
+static inline void sched_info_queued(task_t *t)
+{
+	if (!t->sched_info.last_queued)
+		t->sched_info.last_queued = jiffies;
+}
+
+/*
+ * Called when a process ceases being the active-running process, either
+ * voluntarily or involuntarily. Now we can calculate how long we ran.
+ */
+static inline void sched_info_depart(task_t *t)
+{
+	struct runqueue *rq = task_rq(t);
+	unsigned long diff = jiffies - t->sched_info.last_arrival;
+
+	t->sched_info.cpu_time += diff;
+
+	if (rq)
+		rq->info.cpu_time += diff;
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice. (This may also be called when switching to or from
+ * the idle task.) We are only called when prev != next.
+ */
+static inline void sched_info_switch(task_t *prev, task_t *next)
+{
+	struct runqueue *rq = task_rq(prev);
+
+	/*
+	 * prev now departs the cpu. It's not interesting to record
+	 * stats about how efficient we were at scheduling the idle
	 * process, however.
+	 */
+	if (prev != rq->idle)
+		sched_info_depart(prev);
+
+	if (next != rq->idle)
+		sched_info_arrive(next);
+}
+#else
+#define sched_info_queued(t)		{}
+#define sched_info_switch(t, next)	{}
+#endif /* CONFIG_SCHEDSTATS */
+
 /*
  * Adding/removing a task to/from a priority array:
  */
@@ -310,6 +566,7 @@ static inline void dequeue_task(struct t
 
 static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
 {
+	sched_info_queued(p);
 	list_add_tail(&p->run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
@@ -1077,6 +1334,13 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
+#ifdef CONFIG_SCHEDSTATS
+void cpu_sched_info(struct sched_info *info, int cpu)
+{
+	memcpy(info, &cpu_rq(cpu)->info, sizeof(struct sched_info));
+}
+#endif /* CONFIG_SCHEDSTATS */
+
 /*
  * double_rq_lock - safely lock two runqueues
  *
@@ -1136,6 +1400,7 @@ static void sched_migrate_task(task_t *p
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
 		goto out;
 
+	SCHEDSTAT_INC(smp_processor_id(), mtc_cnt);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread. */
@@ -1186,6 +1451,7 @@ void sched_balance_exec(void)
 	if (numnodes == 1)
 		return;
 
+	SCHEDSTAT_INC(this_cpu, sbe_cnt);
 	while (domain->parent && !(domain->flags & SD_FLAG_EXEC))
 		domain = domain->parent;
 
@@ -1220,6 +1486,8 @@ static inline void pull_task(runqueue_t
 	task_t *p, runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
+	SCHEDSTAT_INC(this_cpu, pt_gained);
+	SCHEDSTAT_INC(src_rq->cpu, pt_lost);
 	dequeue_task(p, src_array);
 	nr_running_dec(src_rq);
 	set_task_cpu(p, this_cpu);
@@ -1351,7 +1619,7 @@ static struct sched_group *
 find_busiest_group(struct sched_domain *domain, int this_cpu,
 				unsigned long *imbalance, enum idle_type idle)
 {
-	unsigned long max_load, avg_load, total_load, this_load;
+	unsigned long max_load, avg_load, total_load, this_load, load_diff;
 	int modify, total_nr_cpus, busiest_nr_cpus = 0;
 	enum idle_type package_idle = IDLE;
 	struct sched_group *busiest = NULL, *group = domain->groups;
@@ -1504,19 +1772,23 @@ static int load_balance(int this_cpu, ru
 	int balanced = 0, failed = 0;
 	int nr_moved = 0;
 
+	SCHEDSTAT_INC(this_cpu, lb_cnt);
 	spin_lock(&this_rq->lock);
 	group = find_busiest_group(domain, this_cpu, &imbalance, idle);
 	if (!group) {
 		balanced = 1;
+		SCHEDSTAT_INC(this_cpu, lb_nobusyg);
 		goto out;
 	}
 
 	busiest = find_busiest_queue(group);
 	if (!busiest || busiest == this_rq) {
 		balanced = 1;
+		SCHEDSTAT_INC(this_cpu, lb_nobusyq);
 		goto out;
 	}
 
+	SCHEDSTAT_ADD(this_cpu, lb_imbalance, imbalance);
 	/* Attempt to move tasks */
 	double_lock_balance(this_rq, busiest);
 
@@ -1634,20 +1906,21 @@ static inline void idle_balance(int this
 *
 * Called with busiest locked.
 */
-static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
+static void active_load_balance(runqueue_t *this_rq, int this_cpu)
 {
 	int i;
-	struct sched_domain *sd = cpu_sched_domain(busiest_cpu);
+	struct sched_domain *sd = cpu_sched_domain(this_cpu);
 	struct sched_group *group, *busy_group;
 
-	if (busiest->nr_running <= 1)
+	SCHEDSTAT_INC(this_cpu, alb_cnt);
+	if (this_rq->nr_running <= 1)
 		return;
 
 	/* sd->parent should never cause a NULL dereference, if it did so,
 	 * then push_cpu was set to a buggy value */
-	while (!cpu_isset(busiest->push_cpu, sd->span)) {
+	while (!cpu_isset(this_rq->push_cpu, sd->span)) {
 		sd = sd->parent;
-		if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
+		if (!sd->parent && !cpu_isset(this_rq->push_cpu, sd->span)) {
 			WARN_ON(1);
 			return;
 		}
@@ -1659,7 +1932,7 @@ static void active_load_balance(runqueue
 	}
 
 	group = sd->groups;
-	while (!cpu_isset(busiest_cpu, group->cpumask)) {
+	while (!cpu_isset(this_cpu, group->cpumask)) {
 		group = group->next;
 		if (group == sd->groups) {
 			WARN_ON(1);
@@ -1689,8 +1962,10 @@ static void active_load_balance(runqueue
 			goto next_group;
 
 		rq = cpu_rq(push_cpu);
-		double_lock_balance(busiest, rq);
-		move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
+		double_lock_balance(this_rq, rq);
+		move_tasks(rq, push_cpu, this_rq, 1, sd, IDLE);
+		SCHEDSTAT_INC(this_cpu, alb_lost);
+		SCHEDSTAT_INC(push_cpu, alb_gained);
 		spin_unlock(&rq->lock);
 next_group:
 		group = group->next;
@@ -1731,6 +2006,11 @@ static void rebalance_tick(int this_cpu,
 			interval = 1;
 
 		if (j - domain->last_balance >= interval) {
+			if (idle == IDLE) {
+				SCHEDSTAT_INC(this_cpu, lb_idle);
+			} else {
+				SCHEDSTAT_INC(this_cpu, lb_busy);
+			}
 			if (load_balance(this_cpu, this_rq, domain, idle)) {
 				/* We've pulled tasks over so no longer idle */
 				idle = NOT_IDLE;
@@ -1904,13 +2184,14 @@ asmlinkage void schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int idx;
+	int idx, this_cpu = smp_processor_id();
 
 	/*
 	 * Test if we are atomic. Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
+	SCHEDSTAT_INC(this_cpu, sched_cnt);
 	if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
 		if (unlikely(in_atomic())) {
 			printk(KERN_ERR "bad: scheduling while atomic!\n");
@@ -1956,6 +2237,7 @@ need_resched:
 
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
+		SCHEDSTAT_INC(this_cpu, lb_resched);
 		idle_balance(smp_processor_id(), rq);
 #endif
 		if (!rq->nr_running) {
@@ -1970,12 +2252,14 @@ need_resched:
 		/*
 		 * Switch the active and expired arrays.
 		 */
+		SCHEDSTAT_INC(this_cpu, sched_switch);
 		rq->active = rq->expired;
 		rq->expired = array;
 		array = rq->active;
 		rq->expired_timestamp = 0;
 		rq->best_expired_prio = MAX_PRIO;
 	}
+	SCHEDSTAT_INC(this_cpu, sched_noswitch);
 
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
@@ -2006,6 +2290,7 @@ switch_tasks:
 	}
 
 	prev->timestamp = now;
+	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
 		next->timestamp = now;
 		rq->nr_switches++;
@@ -2697,6 +2982,9 @@ asmlinkage long sys_sched_yield(void)
 {
 	runqueue_t *rq = this_rq_lock();
 	prio_array_t *array = current->array;
+#ifdef CONFIG_SCHEDSTATS
+	int this_cpu = smp_processor_id();
+#endif /* CONFIG_SCHEDSTATS */
 
 	/*
 	 * We implement yielding by moving the task into the expired
@@ -2705,7 +2993,16 @@ asmlinkage long sys_sched_yield(void)
 	 * (special rule: RT tasks will just roundrobin in the active
 	 * array.)
 	 */
+	SCHEDSTAT_INC(this_cpu, yld_cnt);
 	if (likely(!rt_task(current))) {
+		if (current->array->nr_active == 1) {
+			SCHEDSTAT_INC(this_cpu, yld_act_empty);
+			if (!rq->expired->nr_active) {
+				SCHEDSTAT_INC(this_cpu, yld_both_empty);
+			}
+		} else if (!rq->expired->nr_active) {
+			SCHEDSTAT_INC(this_cpu, yld_exp_empty);
+		}
 		dequeue_task(current, array);
 		enqueue_task(current, rq->expired);
 	} else {
@@ -3458,6 +3755,9 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
+#ifdef CONFIG_SCHEDSTATS
+		rq->cpu = i;
+#endif /* CONFIG_SCHEDSTATS */
 		rq->best_expired_prio = MAX_PRIO;
 
 		spin_lock_init(&rq->lock);
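
For reference, below is a minimal userspace sketch (not part of the patch) that parses the /proc/schedstat output produced by show_schedstat() above. It assumes format version 5 exactly as emitted here: one "cpu%d" line per online cpu with 24 counters in the order of the seq_printf() call, the last three being the runqueue sched_info counters cpu_time, run_delay and pcnt in jiffies/timeslices. The file and program names are arbitrary.

/* schedstat-avg.c: print average run delay per timeslice from /proc/schedstat (version 5) */
#include <stdio.h>

int main(void)
{
	FILE *fp = fopen("/proc/schedstat", "r");
	char line[512];
	int version;

	if (!fp) {
		perror("/proc/schedstat");
		return 1;
	}
	/* first line is "version %d"; bail out if it is not the format defined above */
	if (!fgets(line, sizeof(line), fp) ||
	    sscanf(line, "version %d", &version) != 1 || version != 5) {
		fprintf(stderr, "unexpected /proc/schedstat format\n");
		fclose(fp);
		return 1;
	}
	while (fgets(line, sizeof(line), fp)) {
		unsigned long f[24];
		int cpu;

		/* 24 per-cpu counters, in the order printed by show_schedstat() */
		if (sscanf(line, "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
		    "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
		    &cpu, &f[0], &f[1], &f[2], &f[3], &f[4], &f[5], &f[6],
		    &f[7], &f[8], &f[9], &f[10], &f[11], &f[12], &f[13],
		    &f[14], &f[15], &f[16], &f[17], &f[18], &f[19], &f[20],
		    &f[21], &f[22], &f[23]) != 25)
			continue;	/* skips the "timestamp" and "totals" lines */
		/* f[21] = cpu_time, f[22] = run_delay, f[23] = pcnt */
		printf("cpu%d: avg run delay per timeslice = %.2f jiffies\n",
		       cpu, f[23] ? (double)f[22] / f[23] : 0.0);
	}
	fclose(fp);
	return 0;
}

The same three counters are appended per task to /proc/<pid>/stat by the fs/proc/array.c hunk above, so a per-process variant of this sketch only needs to read the last three fields of that line.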