An instrumentation patch which attempts to work out where all that idle time is coming from. cat /proc/sleepo fs/select.c:255(schedule_timeout) 119 204415008 1717773 37026565 kernel/exit.c:1056(schedule) 10 47496358 4749635 23686014 fs/namei.c:1247(down) 1 53708 53708 53708 fs/namei.c:344(down) 4 67425 16856 33478 fs/jbd/journal.c:197(schedule) 5 96894718 19378943 44777091 Tells us that four down() attempts at fs/namei.c:344 blocked. The total amout of time spent blocked was 67425 microseconds. The average was 16845 us and the maximum was 33478 us. Bear in mind that if 100 processes block at a certain point for one second, this tool will claim that 100 seconds were spent blocked at that point. Which is quite reasonable, but some care is needed when interpreting the results. The following operations are instrumented: down() down_read() down_write() schedule() schedule_timeout() io_schedule() io_schedule_timeout() CONFIG_SLEEPOMETER is currently broken with CONFIG_PREEMPT due to #include hell. Reading /proc/sleepo clears all the current stats. arch/i386/Kconfig | 7 +++++++ arch/i386/kernel/entry.S | 8 ++++++++ fs/proc/proc_misc.c | 17 +++++++++++++++++ include/asm-i386/rwsem.h | 0 include/asm-i386/semaphore.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 11 ++--------- include/linux/preempt.h | 2 ++ include/linux/rwsem.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/sched.h | 14 ++++++++++++-- kernel/Makefile | 1 + kernel/ksyms.c | 8 +++++--- kernel/module.c | 4 ++++ kernel/sched.c | 16 ++++++++++++++++ kernel/timer.c | 4 ++++ 14 files changed, 148 insertions(+), 14 deletions(-) diff -puN arch/i386/Kconfig~sleepometer arch/i386/Kconfig --- 25/arch/i386/Kconfig~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/arch/i386/Kconfig 2003-06-15 00:24:15.000000000 -0700 @@ -1752,6 +1752,13 @@ config X86_MPPARSE depends on X86_LOCAL_APIC && !X86_VISWS default y +config SLEEPOMETER + bool "sleep instrumentation" if !PREEMPT + default n + help + Creates /proc/sleepo, and puts things in there which identify + where the kernel is blocking and for how long. + endmenu source "security/Kconfig" diff -puN arch/i386/kernel/entry.S~sleepometer arch/i386/kernel/entry.S --- 25/arch/i386/kernel/entry.S~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/arch/i386/kernel/entry.S 2003-06-15 01:17:43.000000000 -0700 @@ -237,7 +237,11 @@ need_resched: jz restore_all movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) sti +#ifdef CONFIG_SLEEPOMETER + call sleepo_schedule +#else call schedule +#endif movl $0,TI_PRE_COUNT(%ebp) cli jmp need_resched @@ -332,7 +336,11 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: +#ifdef CONFIG_SLEEPOMETER + call sleepo_schedule +#else call schedule +#endif cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret diff -puN fs/proc/proc_misc.c~sleepometer fs/proc/proc_misc.c --- 25/fs/proc/proc_misc.c~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/fs/proc/proc_misc.c 2003-06-14 23:05:10.000000000 -0700 @@ -298,6 +298,20 @@ static struct file_operations proc_vmsta .release = seq_release, }; +#ifdef CONFIG_SLEEPOMETER +extern struct seq_operations sleepo_op; +static int sleepo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &sleepo_op); +} +static struct file_operations proc_sleepo_file_operations = { + .open = sleepo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -695,6 +709,9 @@ void __init proc_misc_init(void) create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations); create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); +#ifdef CONFIG_SLEEPOMETER + create_seq_entry("sleepo",S_IRUGO, &proc_sleepo_file_operations); +#endif create_seq_entry("diskstats", 0, &proc_diskstats_operations); #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); diff -puN include/asm-i386/semaphore.h~sleepometer include/asm-i386/semaphore.h --- 25/include/asm-i386/semaphore.h~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/include/asm-i386/semaphore.h 2003-06-14 23:05:10.000000000 -0700 @@ -41,6 +41,10 @@ #include #include +#ifdef CONFIG_SLEEPOMETER +#include +#endif + struct semaphore { atomic_t count; int sleepers; @@ -111,7 +115,11 @@ asmlinkage void __up(struct semaphore * * "__down_failed" is a special asm handler that calls the C * routine that actually waits. See arch/i386/kernel/semaphore.c */ +#ifdef CONFIG_SLEEPOMETER +static inline void sleepo_down(struct semaphore * sem) +#else static inline void down(struct semaphore * sem) +#endif { #ifdef WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); @@ -131,11 +139,26 @@ static inline void down(struct semaphore :"memory"); } +#ifdef CONFIG_SLEEPOMETER +#define down(sem) \ + do { \ + static struct sleepo_data sd; \ + \ + sleepo_begin(__FILE__, __LINE__, "down", &sd); \ + sleepo_down(sem); \ + sleepo_end(&sd); \ + } while (0) +#endif + /* * Interruptible try to acquire a semaphore. If we obtained * it, return zero. If we were interrupted, returns -EINTR */ +#ifdef CONFIG_SLEEPOMETER +static inline int sleepo_down_interruptible(struct semaphore * sem) +#else static inline int down_interruptible(struct semaphore * sem) +#endif { int result; @@ -159,6 +182,19 @@ static inline int down_interruptible(str return result; } +#ifdef CONFIG_SLEEPOMETER +#define down_interruptible(sem) \ + ({ \ + static struct sleepo_data sd; \ + int ret; \ + \ + sleepo_begin(__FILE__, __LINE__, "down_interruptible", &sd);\ + ret = sleepo_down_interruptible(sem); \ + sleepo_end(&sd); \ + ret; \ + }) +#endif + /* * Non-blockingly attempt to down() a semaphore. * Returns zero if we acquired it diff -puN include/linux/fs.h~sleepometer include/linux/fs.h --- 25/include/linux/fs.h~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/include/linux/fs.h 2003-06-14 23:05:10.000000000 -0700 @@ -627,15 +627,8 @@ struct super_block { /* * Superblock locking. */ -static inline void lock_super(struct super_block * sb) -{ - down(&sb->s_lock); -} - -static inline void unlock_super(struct super_block * sb) -{ - up(&sb->s_lock); -} +#define lock_super(sb) down(&(sb)->s_lock) +#define unlock_super(sb) up(&(sb)->s_lock) /* * VFS helper functions.. diff -puN include/linux/preempt.h~sleepometer include/linux/preempt.h --- 25/include/linux/preempt.h~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/include/linux/preempt.h 2003-06-15 01:12:25.000000000 -0700 @@ -22,7 +22,9 @@ do { \ #ifdef CONFIG_PREEMPT +#ifndef CONFIG_SLEEPOMETER extern void preempt_schedule(void); +#endif #define preempt_disable() \ do { \ diff -puN include/linux/sched.h~sleepometer include/linux/sched.h --- 25/include/linux/sched.h~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/include/linux/sched.h 2003-06-14 23:05:10.000000000 -0700 @@ -158,8 +158,14 @@ extern void show_trace_task(task_t *tsk) */ extern void show_stack(struct task_struct *task, unsigned long *sp); +#ifdef CONFIG_SLEEPOMETER +#include +#else void io_schedule(void); long io_schedule_timeout(long timeout); +asmlinkage void schedule(void); +extern signed long FASTCALL(schedule_timeout(signed long timeout)); +#endif extern void cpu_init (void); extern void trap_init(void); @@ -171,8 +177,6 @@ extern unsigned long cache_decay_ticks; #define MAX_SCHEDULE_TIMEOUT LONG_MAX -extern signed long FASTCALL(schedule_timeout(signed long timeout)); -asmlinkage void schedule(void); struct namespace; @@ -341,6 +345,12 @@ struct task_struct { unsigned long sleep_avg; unsigned long last_run; +#ifdef CONFIG_SLEEPOMETER + unsigned long long sleepo_start; /* usecs */ + struct sleepo_data *active_sleepo; + int sleepo_depth; +#endif + unsigned long policy; unsigned long cpus_allowed; unsigned int time_slice, first_time_slice; diff -puN kernel/ksyms.c~sleepometer kernel/ksyms.c --- 25/kernel/ksyms.c~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/kernel/ksyms.c 2003-06-15 00:57:37.000000000 -0700 @@ -459,13 +459,15 @@ EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); +EXPORT_SYMBOL(yield); +EXPORT_SYMBOL(__cond_resched); +#ifndef CONFIG_SLEEPOMETER EXPORT_SYMBOL(schedule); +EXPORT_SYMBOL(schedule_timeout); #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(preempt_schedule); #endif -EXPORT_SYMBOL(schedule_timeout); -EXPORT_SYMBOL(yield); -EXPORT_SYMBOL(__cond_resched); +#endif EXPORT_SYMBOL(set_user_nice); EXPORT_SYMBOL(task_nice); EXPORT_SYMBOL_GPL(idle_cpu); diff -puN kernel/Makefile~sleepometer kernel/Makefile --- 25/kernel/Makefile~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/kernel/Makefile 2003-06-14 23:05:10.000000000 -0700 @@ -20,6 +20,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_SLEEPOMETER) += sleepo.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -puN kernel/sched.c~sleepometer kernel/sched.c --- 25/kernel/sched.c~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/kernel/sched.c 2003-06-15 00:59:06.000000000 -0700 @@ -1241,7 +1241,11 @@ void scheduling_functions_start_here(voi /* * schedule() is the main scheduler function. */ +#ifdef CONFIG_SLEEPOMETER +asmlinkage void sleepo_schedule(void) +#else asmlinkage void schedule(void) +#endif { task_t *prev, *next; runqueue_t *rq; @@ -1344,7 +1348,11 @@ switch_tasks: * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ +#ifdef CONFIG_SLEEPOMETER +asmlinkage void sleepo_preempt_schedule(void) +#else asmlinkage void preempt_schedule(void) +#endif { struct thread_info *ti = current_thread_info(); @@ -2020,7 +2028,11 @@ void yield(void) * But don't do that if it is a deliberate, throttling IO wait (this task * has set its backing_dev_info: the queue against which it should throttle) */ +#ifdef CONFIG_SLEEPOMETER +void sleepo_io_schedule(void) +#else void io_schedule(void) +#endif { struct runqueue *rq = this_rq(); @@ -2029,7 +2041,11 @@ void io_schedule(void) atomic_dec(&rq->nr_iowait); } +#ifdef CONFIG_SLEEPOMETER +long sleepo_io_schedule_timeout(long timeout) +#else long io_schedule_timeout(long timeout) +#endif { struct runqueue *rq = this_rq(); long ret; diff -puN kernel/timer.c~sleepometer kernel/timer.c --- 25/kernel/timer.c~sleepometer 2003-06-14 23:05:10.000000000 -0700 +++ 25-akpm/kernel/timer.c 2003-06-15 00:59:24.000000000 -0700 @@ -985,7 +985,11 @@ static void process_timeout(unsigned lon * * In all cases the return value is guaranteed to be non-negative. */ +#ifdef CONFIG_SLEEPOMETER +signed long sleepo_schedule_timeout(signed long timeout) +#else signed long schedule_timeout(signed long timeout) +#endif { struct timer_list timer; unsigned long expire; diff -puN kernel/module.c~sleepometer kernel/module.c --- 25/kernel/module.c~sleepometer 2003-06-15 00:33:12.000000000 -0700 +++ 25-akpm/kernel/module.c 2003-06-15 00:33:55.000000000 -0700 @@ -647,6 +647,10 @@ sys_delete_module(const char __user *nam char name[MODULE_NAME_LEN]; int ret, forced = 0; +#ifdef CONFIG_SLEEPOMETER + return -EINVAL; /* sleepo doesn't clean stuff up */ +#endif + if (!capable(CAP_SYS_MODULE)) return -EPERM; diff -puN include/asm-i386/rwsem.h~sleepometer include/asm-i386/rwsem.h diff -puN include/linux/rwsem.h~sleepometer include/linux/rwsem.h --- 25/include/linux/rwsem.h~sleepometer 2003-06-15 00:38:50.000000000 -0700 +++ 25-akpm/include/linux/rwsem.h 2003-06-15 00:40:52.000000000 -0700 @@ -19,6 +19,10 @@ #include #include +#ifdef CONFIG_SLEEPOMETER +#include +#endif + struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -38,7 +42,11 @@ extern void FASTCALL(rwsemtrace(struct r /* * lock for reading */ +#ifdef CONFIG_SLEEPOMETER +static inline void sleepo_down_read(struct rw_semaphore *sem) +#else static inline void down_read(struct rw_semaphore *sem) +#endif { might_sleep(); rwsemtrace(sem,"Entering down_read"); @@ -46,6 +54,17 @@ static inline void down_read(struct rw_s rwsemtrace(sem,"Leaving down_read"); } +#ifdef CONFIG_SLEEPOMETER +#define down_read(sem) \ + do { \ + static struct sleepo_data sd; \ + \ + sleepo_begin(__FILE__, __LINE__, "down_read", &sd); \ + sleepo_down_read(sem); \ + sleepo_end(&sd); \ + } while (0) +#endif + /* * trylock for reading -- returns 1 if successful, 0 if contention */ @@ -61,7 +80,11 @@ static inline int down_read_trylock(stru /* * lock for writing */ +#ifdef CONFIG_SLEEPOMETER +static inline void sleepo_down_write(struct rw_semaphore *sem) +#else static inline void down_write(struct rw_semaphore *sem) +#endif { might_sleep(); rwsemtrace(sem,"Entering down_write"); @@ -69,6 +92,17 @@ static inline void down_write(struct rw_ rwsemtrace(sem,"Leaving down_write"); } +#ifdef CONFIG_SLEEPOMETER +#define down_write(sem) \ + do { \ + static struct sleepo_data sd; \ + \ + sleepo_begin(__FILE__, __LINE__, "down_write", &sd); \ + sleepo_down_write(sem); \ + sleepo_end(&sd); \ + } while (0) +#endif + /* * trylock for writing -- returns 1 if successful, 0 if contention */ _