From: Gerrit Huizenga

CKRM processor scheduling delay accounting - provides a mechanism to record
how often, and for how long, each task is delayed waiting for the CPU and
waiting for I/O.

In addition to counting frequency, the total delay in ns is also recorded.
CPU delays are specified as cpu-wait and cpu-run.  I/O delays are recorded
for memory and regular I/O.  Information is accessible through
/proc/<pid>/delay.

Signed-Off-By: Chandra Seetharaman
Signed-Off-By: Hubertus Franke
Signed-Off-By: Shailabh Nagar
Signed-Off-By: Gerrit Huizenga
Signed-off-by: Andrew Morton
---

 fs/proc/array.c            |   18 +++++++++
 fs/proc/base.c             |   17 ++++++++
 include/linux/sched.h      |   88 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/taskdelays.h |   35 +++++++++++++++++
 fs/proc/internal.h         |    1 
 init/Kconfig               |    8 ++++
 kernel/fork.c              |    1 
 kernel/sched.c             |   13 ++++++
 mm/memory.c                |   27 +++++++++----
 9 files changed, 198 insertions(+), 10 deletions(-)

diff -puN fs/proc/array.c~ckrm-processor-delay-accounting fs/proc/array.c
--- devel/fs/proc/array.c~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/fs/proc/array.c	2005-07-27 15:59:18.000000000 -0700
@@ -482,3 +482,21 @@ int proc_pid_statm(struct task_struct *t
 	return sprintf(buffer,"%d %d %d %d %d %d %d\n",
 		       size, resident, shared, text, lib, data, 0);
 }
+
+
+int proc_pid_delay(struct task_struct *task, char * buffer)
+{
+	int res;
+
+	res = sprintf(buffer,"%u %llu %llu %u %llu %u %llu\n",
+		      (unsigned int) get_delay(task,runs),
+		      (uint64_t) get_delay(task,runcpu_total),
+		      (uint64_t) get_delay(task,waitcpu_total),
+		      (unsigned int) get_delay(task,num_iowaits),
+		      (uint64_t) get_delay(task,iowait_total),
+		      (unsigned int) get_delay(task,num_memwaits),
+		      (uint64_t) get_delay(task,mem_iowait_total)
+		);
+	return res;
+}
+
diff -puN fs/proc/base.c~ckrm-processor-delay-accounting fs/proc/base.c
--- devel/fs/proc/base.c~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/fs/proc/base.c	2005-07-27 15:59:18.000000000 -0700
@@ -158,6 +158,10 @@ enum pid_directory_inos {
 #ifdef CONFIG_AUDITSYSCALL
 	PROC_TID_LOGINUID,
 #endif
+#ifdef CONFIG_DELAY_ACCT
+	PROC_TID_DELAY_ACCT,
+	PROC_TGID_DELAY_ACCT,
+#endif
 	PROC_TID_FD_DIR = 0x8000,	/* 0x8000-0xffff */
 	PROC_TID_OOM_SCORE,
 	PROC_TID_OOM_ADJUST,
@@ -197,6 +201,9 @@ static struct pid_entry tgid_base_stuff[
 #ifdef CONFIG_SECURITY
 	E(PROC_TGID_ATTR,       "attr",   S_IFDIR|S_IRUGO|S_IXUGO),
 #endif
+#ifdef CONFIG_DELAY_ACCT
+	E(PROC_TGID_DELAY_ACCT, "delay",  S_IFREG|S_IRUGO),
+#endif
 #ifdef CONFIG_KALLSYMS
 	E(PROC_TGID_WCHAN,      "wchan",  S_IFREG|S_IRUGO),
 #endif
@@ -237,6 +244,9 @@ static struct pid_entry tid_base_stuff[]
 #ifdef CONFIG_SECURITY
 	E(PROC_TID_ATTR,        "attr",   S_IFDIR|S_IRUGO|S_IXUGO),
 #endif
+#ifdef CONFIG_DELAY_ACCT
+	E(PROC_TID_DELAY_ACCT,  "delay",  S_IFREG|S_IRUGO),
+#endif
#ifdef CONFIG_KALLSYMS
 	E(PROC_TID_WCHAN,       "wchan",  S_IFREG|S_IRUGO),
 #endif
@@ -1664,6 +1674,13 @@ static struct dentry *proc_pident_lookup
 			ei->op.proc_read = proc_pid_wchan;
 			break;
 #endif
+#ifdef CONFIG_DELAY_ACCT
+		case PROC_TID_DELAY_ACCT:
+		case PROC_TGID_DELAY_ACCT:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = proc_pid_delay;
+			break;
+#endif
 #ifdef CONFIG_SCHEDSTATS
 		case PROC_TID_SCHEDSTAT:
 		case PROC_TGID_SCHEDSTAT:
diff -puN fs/proc/internal.h~ckrm-processor-delay-accounting fs/proc/internal.h
--- devel/fs/proc/internal.h~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/fs/proc/internal.h	2005-07-27 15:59:18.000000000 -0700
@@ -36,6 +36,7 @@ extern int proc_tid_stat(struct task_str
 extern int proc_tgid_stat(struct task_struct *, char *);
 extern int proc_pid_status(struct task_struct *, char *);
 extern int proc_pid_statm(struct task_struct *, char *);
+extern int proc_pid_delay(struct task_struct *, char*);
 
 static inline struct task_struct *proc_task(struct inode *inode)
 {
diff -puN include/linux/sched.h~ckrm-processor-delay-accounting include/linux/sched.h
--- devel/include/linux/sched.h~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/include/linux/sched.h	2005-07-27 15:59:18.000000000 -0700
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include <linux/taskdelays.h>
 
 #include	/* For AT_VECTOR_SIZE */
@@ -820,6 +821,9 @@ struct task_struct {
 	int cpuset_mems_generation;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
+#ifdef CONFIG_DELAY_ACCT
+	struct task_delay_info delays;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
@@ -870,6 +874,8 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
+#define PF_MEMIO	0x01000000	/* I am potentially doing I/O for mem */
+#define PF_IOWAIT	0x02000000	/* I am waiting on disk I/O */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
@@ -1395,6 +1401,88 @@ static inline void thaw_processes(void)
 static inline int try_to_freeze(void) { return 0; }
 #endif /* CONFIG_PM */
 
+
+/* API for registering delay info */
+#ifdef CONFIG_DELAY_ACCT
+
+#define test_delay_flag(tsk,flg)	((tsk)->flags & (flg))
+#define set_delay_flag(tsk,flg)		((tsk)->flags |= (flg))
+#define clear_delay_flag(tsk,flg)	((tsk)->flags &= ~(flg))
+
+#define def_delay_var(var)		unsigned long long var
+#define get_delay(tsk,field)		((tsk)->delays.field)
+
+#define start_delay(var)		((var) = sched_clock())
+#define start_delay_set(var,flg)	(set_delay_flag(current,flg),(var) = \
+					 sched_clock())
+
+#define inc_delay(tsk,field)		(((tsk)->delays.field)++)
+
+/* Because of hardware timer drift on SMP systems, and because a task may
+ * continue on a different CPU from the one where start_ts was taken, it is
+ * possible that end_ts < start_ts by some usecs.  In that case we ignore
+ * the difference and add nothing to the total.
+ */
+#ifdef CONFIG_SMP
+#define test_ts_integrity(start_ts,end_ts)	(likely((end_ts) > (start_ts)))
+#else
+#define test_ts_integrity(start_ts,end_ts)	(1)
+#endif
+
+#define add_delay_ts(tsk,field,start_ts,end_ts) \
+	do { if (test_ts_integrity(start_ts,end_ts)) (tsk)->delays.field += ((end_ts)-(start_ts)); } while (0)
+
+#define add_delay_clear(tsk,field,start_ts,flg) \
+	do { \
+		unsigned long long now = sched_clock(); \
+		add_delay_ts(tsk,field,start_ts,now); \
+		clear_delay_flag(tsk,flg); \
+	} while (0)
+
+static inline void add_io_delay(unsigned long long dstart)
+{
+	struct task_struct *tsk = current;
+	unsigned long long now = sched_clock();
+	unsigned long long val;
+
+	if (test_ts_integrity(dstart,now))
+		val = now - dstart;
+	else
+		val = 0;
+	if (test_delay_flag(tsk,PF_MEMIO)) {
+		tsk->delays.mem_iowait_total += val;
+		tsk->delays.num_memwaits++;
+	} else {
+		tsk->delays.iowait_total += val;
+		tsk->delays.num_iowaits++;
+	}
+	clear_delay_flag(tsk,PF_IOWAIT);
+}
+
+static inline void init_delays(struct task_struct *tsk)
+{
+	memset((void *)&tsk->delays, 0, sizeof(tsk->delays));
+}
+
+#else
+
+#define test_delay_flag(tsk,flg)			(0)
+#define set_delay_flag(tsk,flg)				do { } while (0)
+#define clear_delay_flag(tsk,flg)			do { } while (0)
+
+#define def_delay_var(var)
+#define get_delay(tsk,field)				(0)
+
+#define start_delay(var)				do { } while (0)
+#define start_delay_set(var,flg)			do { } while (0)
+
+#define inc_delay(tsk,field)				do { } while (0)
+#define add_delay_ts(tsk,field,start_ts,now)		do { } while (0)
+#define add_delay_clear(tsk,field,start_ts,flg)		do { } while (0)
+#define add_io_delay(dstart)				do { } while (0)
+#define init_delays(tsk)				do { } while (0)
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif
diff -puN /dev/null include/linux/taskdelays.h
--- /dev/null	2003-09-15 06:40:47.000000000 -0700
+++ devel-akpm/include/linux/taskdelays.h	2005-07-27 15:59:18.000000000 -0700
@@ -0,0 +1,35 @@
+/* taskdelays.h - for delay accounting
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004
+ *
+ * Has the data structure for delay counting.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef _LINUX_TASKDELAYS_H
+#define _LINUX_TASKDELAYS_H
+
+#include <linux/config.h>
+#include <linux/types.h>
+
+struct task_delay_info {
+	/* delay statistics in nsecs */
+	uint64_t waitcpu_total;
+	uint64_t runcpu_total;
+	uint64_t iowait_total;
+	uint64_t mem_iowait_total;
+	uint32_t runs;
+	uint32_t num_iowaits;
+	uint32_t num_memwaits;
+};
+
+#endif /* _LINUX_TASKDELAYS_H */
diff -puN init/Kconfig~ckrm-processor-delay-accounting init/Kconfig
--- devel/init/Kconfig~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/init/Kconfig	2005-07-27 15:59:18.000000000 -0700
@@ -262,6 +262,14 @@ menuconfig EMBEDDED
 	  environments which can tolerate a "non-standard" kernel.
 	  Only use this if you really know what you are doing.
 
+config DELAY_ACCT
+	bool "Enable delay accounting (EXPERIMENTAL)"
+	help
+	  In addition to counting frequency, the total delay in ns is also
+	  recorded.  CPU delays are specified as cpu-wait and cpu-run.
+	  I/O delays are recorded for memory and regular I/O.
+	  Information is accessible through /proc/<pid>/delay.
+
 config KALLSYMS
 	bool "Load all symbols for debugging/kksymoops" if EMBEDDED
 	default y
diff -puN kernel/fork.c~ckrm-processor-delay-accounting kernel/fork.c
--- devel/kernel/fork.c~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/kernel/fork.c	2005-07-27 15:59:18.000000000 -0700
@@ -906,6 +906,7 @@ static task_t *copy_process(unsigned lon
 	if (p->binfmt && !try_module_get(p->binfmt->module))
 		goto bad_fork_cleanup_put_domain;
 
+	init_delays(p);
 	p->did_exec = 0;
 	copy_flags(clone_flags, p);
 	p->pid = pid;
diff -puN kernel/sched.c~ckrm-processor-delay-accounting kernel/sched.c
--- devel/kernel/sched.c~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/kernel/sched.c	2005-07-27 15:59:18.000000000 -0700
@@ -827,11 +827,13 @@ static inline void resched_task(task_t *
  * task_curr - is this task currently executing on a CPU?
  * @p: the task in question.
  */
-inline int task_curr(const task_t *p)
+int task_curr(const task_t *p)
 {
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+EXPORT_SYMBOL_GPL(task_curr);
+
 #ifdef CONFIG_SMP
 typedef struct {
 	struct list_head list;
@@ -2892,6 +2894,7 @@ switch_tasks:
 
 	update_cpu_clock(prev, rq, now);
 
+	add_delay_ts(prev, runcpu_total, prev->timestamp, now);
 	prev->sleep_avg -= run_time;
 	if ((long)prev->sleep_avg <= 0)
 		prev->sleep_avg = 0;
@@ -2899,6 +2902,8 @@ switch_tasks:
 
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
+		add_delay_ts(next, waitcpu_total, next->timestamp, now);
+		inc_delay(next, runs);
 		next->timestamp = now;
 		rq->nr_switches++;
 		rq->curr = next;
@@ -3975,9 +3980,12 @@ void __sched io_schedule(void)
 {
 	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
+	def_delay_var(dstart);
 
+	start_delay_set(dstart, PF_IOWAIT);
 	atomic_inc(&rq->nr_iowait);
 	schedule();
 	atomic_dec(&rq->nr_iowait);
+	add_io_delay(dstart);
 }
 
 EXPORT_SYMBOL(io_schedule);
@@ -3986,10 +3994,13 @@ long __sched io_schedule_timeout(long ti
 {
 	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
 	long ret;
+	def_delay_var(dstart);
 
+	start_delay_set(dstart,PF_IOWAIT);
 	atomic_inc(&rq->nr_iowait);
 	ret = schedule_timeout(timeout);
 	atomic_dec(&rq->nr_iowait);
+	add_io_delay(dstart);
 	return ret;
 }
 
diff -puN mm/memory.c~ckrm-processor-delay-accounting mm/memory.c
--- devel/mm/memory.c~ckrm-processor-delay-accounting	2005-07-27 15:59:18.000000000 -0700
+++ devel-akpm/mm/memory.c	2005-07-27 16:02:17.000000000 -0700
@@ -2065,6 +2065,7 @@ int handle_mm_fault(struct mm_struct *mm
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	int rc;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -2084,6 +2085,7 @@ int handle_mm_fault(struct mm_struct *mm
 	 * pte_alloc_map here.
 	 */
 	page_table_atomic_start(mm);
+	set_delay_flag(current, PF_MEMIO);
 	pgd = pgd_offset(mm, address);
 	if (unlikely(pgd_none(*pgd))) {
 		pud_t *new;
@@ -2091,8 +2093,10 @@ int handle_mm_fault(struct mm_struct *mm
 		page_table_atomic_stop(mm);
 		new = pud_alloc_one(mm, address);
 
-		if (!new)
-			goto oom;
+		if (!new) {
+			rc = VM_FAULT_OOM;
+			goto out;
+		}
 
 		page_table_atomic_start(mm);
 		if (!pgd_test_and_populate(mm, pgd, new))
@@ -2106,8 +2110,10 @@ int handle_mm_fault(struct mm_struct *mm
 		page_table_atomic_stop(mm);
 		new = pmd_alloc_one(mm, address);
 
-		if (!new)
-			goto oom;
+		if (!new) {
+			rc = VM_FAULT_OOM;
+			goto out;
+		}
 
 		page_table_atomic_start(mm);
 
@@ -2122,8 +2128,10 @@ int handle_mm_fault(struct mm_struct *mm
 		page_table_atomic_stop(mm);
 		new = pte_alloc_one(mm, address);
 
-		if (!new)
-			goto oom;
+		if (!new) {
+			rc = VM_FAULT_OOM;
+			goto out;
+		}
 
 		page_table_atomic_start(mm);
 
@@ -2136,9 +2144,10 @@ int handle_mm_fault(struct mm_struct *mm
 	}
 
 	pte = pte_offset_map(pmd, address);
-	return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
-oom:
-	return VM_FAULT_OOM;
+	rc = handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+out:
+	clear_delay_flag(current, PF_MEMIO);
+	return rc;
 
 sigbus:
 	return VM_FAULT_SIGBUS;
_
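
For reference, here is a minimal userspace sketch (not part of the patch) that
reads the per-task delay file this patch exports.  The field order follows the
sprintf() in proc_pid_delay() above; the file name readdelay.c, the output
formatting and the error handling are illustrative assumptions only.

/* readdelay.c - illustrative reader for /proc/<pid>/delay (assumed usage) */
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

int main(int argc, char **argv)
{
	char path[64];
	FILE *f;
	unsigned int runs, num_iowaits, num_memwaits;
	uint64_t runcpu_total, waitcpu_total, iowait_total, mem_iowait_total;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/delay", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	/* "%u %llu %llu %u %llu %u %llu" as written by proc_pid_delay() */
	if (fscanf(f, "%u %" SCNu64 " %" SCNu64 " %u %" SCNu64 " %u %" SCNu64,
		   &runs, &runcpu_total, &waitcpu_total,
		   &num_iowaits, &iowait_total,
		   &num_memwaits, &mem_iowait_total) != 7) {
		fprintf(stderr, "unexpected format in %s\n", path);
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("runs          %u\n",          runs);
	printf("cpu-run  (ns) %" PRIu64 "\n", runcpu_total);
	printf("cpu-wait (ns) %" PRIu64 "\n", waitcpu_total);
	printf("iowaits       %u\n",          num_iowaits);
	printf("iowait   (ns) %" PRIu64 "\n", iowait_total);
	printf("memwaits      %u\n",          num_memwaits);
	printf("memwait  (ns) %" PRIu64 "\n", mem_iowait_total);
	return 0;
}

Built with e.g. "gcc -o readdelay readdelay.c", it can be pointed at any pid
on a kernel configured with CONFIG_DELAY_ACCT, e.g. "./readdelay 1".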