diff -urN linux-2.4.12-preempt/Documentation/Configure.help linux/Documentation/Configure.help
--- linux-2.4.12-preempt/Documentation/Configure.help	Mon Oct 8 19:10:08 2001
+++ linux/Documentation/Configure.help	Mon Oct 8 19:36:32 2001
@@ -166,6 +166,16 @@
   system where throughput is more important than interactive response,
   such as a server system.  Say N if you are unsure.
 
+Collect preemption latency times
+CONFIG_PREEMPT_TIMES
+  This option will instruct the preemption code to measure in-kernel
+  latencies due to various locking primitives.  The results for the 20
+  worst latency times are accessible via /proc/latencytimes.  They are
+  reset on read.
+
+  Say Y here if you are testing kernel preemption and trying to determine
+  where high-latency problems lie.  Say N if you are unsure.
+
 Kernel math emulation
 CONFIG_MATH_EMULATION
   Linux can emulate a math coprocessor (used for floating point
diff -urN linux-2.4.12-preempt/arch/i386/config.in linux/arch/i386/config.in
--- linux-2.4.12-preempt/arch/i386/config.in	Mon Oct 8 19:09:41 2001
+++ linux/arch/i386/config.in	Mon Oct 8 19:36:32 2001
@@ -170,6 +170,9 @@
 bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
 bool 'Symmetric multi-processing support' CONFIG_SMP
 bool 'Preemptible Kernel' CONFIG_PREEMPT
+if [ "$CONFIG_PREEMPT" = "y" ]; then
+   bool 'Collect preemption latency times' CONFIG_PREEMPT_TIMES
+fi
 if [ "$CONFIG_SMP" != "y" ]; then
    bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC
    dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC
diff -urN linux-2.4.12-preempt/arch/i386/kernel/irq.c linux/arch/i386/kernel/irq.c
--- linux-2.4.12-preempt/arch/i386/kernel/irq.c	Mon Oct 8 19:09:42 2001
+++ linux/arch/i386/kernel/irq.c	Mon Oct 8 19:36:32 2001
@@ -559,6 +559,11 @@
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
+#ifdef CONFIG_PREEMPT_TIMES
+extern void latency_cause(int,int);
+#else
+#define latency_cause(a, b)
+#endif
 asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 {
 	/*
@@ -577,9 +582,12 @@
 	struct irqaction * action;
 	unsigned int status;
 
+	preempt_lock_start(-99);
+
 	kstat.irqs[cpu][irq]++;
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
+	latency_cause(-99,~irq);
 	/*
 	   REPLAY is when Linux resends an IRQ that was dropped earlier
 	   WAITING is used by probe to mark irqs that are being tested
@@ -636,6 +644,8 @@
 	desc->handler->end(irq);
 	spin_unlock(&desc->lock);
 
+	preempt_lock_stop();
+
 	if (softirq_pending(cpu))
 		do_softirq();
 	return 1;
diff -urN linux-2.4.12-preempt/include/asm-i386/smplock.h linux/include/asm-i386/smplock.h
--- linux-2.4.12-preempt/include/asm-i386/smplock.h	Mon Oct 8 19:08:47 2001
+++ linux/include/asm-i386/smplock.h	Mon Oct 8 19:36:33 2001
@@ -9,6 +9,16 @@
 #include
 
 extern spinlock_t kernel_flag;
+#ifdef CONFIG_PREEMPT_TIMES
+#define lock_bkl() do { \
+	atomic_inc(atomic_ptr_preempt_count()); \
+	preempt_lock_start(3); \
+	_raw_spin_lock(&kernel_flag); \
+} while (0)
+#else
+#define lock_bkl() spin_lock(&kernel_flag)
+#endif
+#define unlock_bkl() spin_unlock(&kernel_flag)
 
 #ifdef CONFIG_SMP
 #define kernel_locked() spin_is_locked(&kernel_flag)
@@ -40,7 +50,18 @@
 		spin_lock(&kernel_flag); \
 } while (0)
 
-
+#ifdef CONFIG_PREEMPT_TIMES
+#define lock_kernel() do { \
+	if (current->lock_depth == -1) \
+		lock_bkl(); \
+	++current->lock_depth; \
+} while (0)
+
+#define unlock_kernel() do { \
+	if (--current->lock_depth < 0) \
+		unlock_bkl(); \
+} while (0)
+#else
 /*
  * Getting the big kernel lock.
 *
@@ -87,3 +108,4 @@
 		"=m" (current->lock_depth));
 #endif
 }
+#endif /* CONFIG_PREEMPT_TIMES */
diff -urN linux-2.4.12-preempt/include/linux/spinlock.h linux/include/linux/spinlock.h
--- linux-2.4.12-preempt/include/linux/spinlock.h	Mon Oct 8 19:08:47 2001
+++ linux/include/linux/spinlock.h	Mon Oct 8 19:36:33 2001
@@ -132,6 +132,25 @@
 
 #ifdef CONFIG_PREEMPT
 
+#ifdef CONFIG_PREEMPT_TIMES
+extern void latency_start(const char *fname, unsigned lineno, int cause);
+extern void latency_check(const char *fname, unsigned lineno, int cause);
+extern void latency_end(const char *fname, unsigned lineno);
+#define preempt_lock_start(c) \
+do { \
+	if (preempt_is_disable() == 1) \
+		latency_start(__BASE_FILE__, __LINE__, c); \
+} while (0)
+#define preempt_lock_stop() \
+do { \
+	if (preempt_is_disable() == 1) \
+		latency_end(__BASE_FILE__, __LINE__); \
+} while (0)
+#else
+#define preempt_lock_start(c) do {} while (0)
+#define preempt_lock_stop() do {} while (0)
+#endif /* CONFIG_PREEMPT_TIMES */
+
 #define switch_lock_count() current->preempt_count
 #define preempt_is_disable() (switch_lock_count().counter)
 #define atomic_ptr_preempt_count() (&switch_lock_count())
@@ -139,13 +158,16 @@
 
 #define preempt_disable() do { \
 	atomic_inc(atomic_ptr_preempt_count()); \
+	preempt_lock_start(1); \
 } while (0)
 
 #define preempt_enable_no_resched() do { \
+	preempt_lock_stop(); \
 	atomic_dec(atomic_ptr_preempt_count()); \
 } while (0)
 
 #define preempt_enable() do { \
+	preempt_lock_stop(); \
 	if (atomic_dec_and_test(atomic_ptr_preempt_count()) && \
 		current->need_resched) \
 		preempt_schedule(); \
@@ -188,6 +210,9 @@
 #define write_lock(lock)	_raw_write_lock(lock)
 #define write_unlock(lock)	_raw_write_unlock(lock)
 #define write_trylock(lock)	_raw_write_trylock(lock)
+
+#define preempt_lock_start(c)	do {} while (0)
+#define preempt_lock_stop()	do {} while (0)
 #endif /* !CONFIG_PREEMPT */
 
 /* "lock on reference count zero" */
diff -urN linux-2.4.12-preempt/kernel/Makefile linux/kernel/Makefile
--- linux-2.4.12-preempt/kernel/Makefile	Mon Oct 8 19:08:45 2001
+++ linux/kernel/Makefile	Mon Oct 8 19:36:33 2001
@@ -19,6 +19,7 @@
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += ksyms.o
 obj-$(CONFIG_PM) += pm.o
+obj-$(CONFIG_PREEMPT_TIMES) += preem_latency.o
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra , the -fno-omit-frame-pointer is
diff -urN linux-2.4.12-preempt/kernel/ksyms.c linux/kernel/ksyms.c
--- linux-2.4.12-preempt/kernel/ksyms.c	Mon Oct 8 19:08:45 2001
+++ linux/kernel/ksyms.c	Mon Oct 8 19:36:33 2001
@@ -434,6 +434,11 @@
 #ifdef CONFIG_PREEMPT
 EXPORT_SYMBOL(preempt_schedule);
 #endif
+#ifdef CONFIG_PREEMPT_TIMES
+EXPORT_SYMBOL(latency_start);
+EXPORT_SYMBOL(latency_end);
+EXPORT_SYMBOL(latency_check);
+#endif
 EXPORT_SYMBOL(schedule_timeout);
 EXPORT_SYMBOL(jiffies);
 EXPORT_SYMBOL(xtime);
diff -urN linux-2.4.12-preempt/kernel/preem_latency.c linux/kernel/preem_latency.c
--- linux-2.4.12-preempt/kernel/preem_latency.c	Wed Dec 31 19:00:00 1969
+++ linux/kernel/preem_latency.c	Mon Oct 8 19:36:33 2001
@@ -0,0 +1,386 @@
+/*
+ * Preemption Latency Measurement Tool
+ * Copyright (C) 2001 MontaVista Software Inc.
+ *
+ * Measures the duration of locks held in the kernel.
+ * Useful for pinpointing long-held locks, which we
+ * cannot preempt during.
+ *
+ * 20010923 Robert M. Love
+ *	- remove causes "lowlat" and "check"
+ *	- update for revised preemption patch
+ *	- remove defines for preempt_lock_check and window
+ *
+ * 20010919 Robert M. Love
+ *	whitespace cleanup, remove unneeded code, remove
+ *	ifdefs around latency_cause, etc.
+ *
+ * 20010918 Robert M. Love
+ *	update for 2.4.10-pre11 and 2.4.9-ac13
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/irq.h>
+#include <asm/msr.h>
+
+/*
+ * defined in asm/msr.h as
+ * __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
+ */
+#define readclock(low) rdtscl(low)
+
+#define NUM_LOG_ENTRY 20
+#define NUM_COUNT 2
+#define COMMAND_LENGTH 16
+#define NUM_INTERRUPT 16
+
+const char * syscallStartFileName = "system_call";
+const char * entryCheckFileName = "ret_with_reschedule";
+const char * errorCodeStart = "error_code start";
+const char * deviceNotAvailableStart = "device_not_availablestart";
+
+extern unsigned long cpu_khz;	/* TSC ticks per millisecond (CPU clock, in kHz) */
+
+extern void latency_check(const char *, unsigned, int);
+extern void latency_end(const char *, unsigned);
+
+int theCause, mask;
+unsigned theSyscallNumber;
+
+#define SfS 0	/* waiting for a preempt off start1 call */
+#define SfC 1	/* waiting for a preempt off end call */
+#define SfM 2	/* waiting for either of the above */
+
+struct pointId {
+	const char * FileName;
+	unsigned Address;
+	unsigned FileLine;
+	unsigned ProcId;
+	char ProcName[COMMAND_LENGTH];
+};
+
+/* worst recorded latencies */
+struct log_entry {
+	unsigned latency;
+	int cause;
+	int mask;
+	unsigned intrCount[NUM_INTERRUPT+1];
+	struct pointId start;
+	struct pointId end;
+};
+
+struct LatencyData {
+	int breakCount;		/* count interrupt and iret */
+	const char * testName;	/* the test name */
+	unsigned logFlag;	/* 0 - no logging; 1 - logging */
+	unsigned panicFlag;	/* set to 1 if something is really wrong */
+	unsigned syncFlag;	/* for synchronization between start and end */
+	unsigned numIntrs;	/* total number of interrupts & intrs in range */
+	unsigned numCount;	/* layout */
+	unsigned numEntry;
+	int least;		/* lowest latency we care about */
+	unsigned count[NUM_COUNT];
+	struct log_entry entry[NUM_LOG_ENTRY];	/* worst recorded latencies */
+};
+
+struct LatencyData latencyData = {
+	breakCount: 0,
+	testName: "preemption latency (start/check/break points)",
+	logFlag: 0,
+	panicFlag: 0,
+	syncFlag: 0,
+	numIntrs: 0,
+	numCount: NUM_COUNT,
+	numEntry: NUM_LOG_ENTRY,
+};
+
+static char *cause_name[] = {
+	"Unknown",
+	"spin_lock",
+	"reacqBKL",
+	"BKL"
+};
+
+#define CAUSE_MAX (sizeof(cause_name)/sizeof(cause_name[0]) - 1)
+
+static struct pointId latencyStart;
+static unsigned latencyStartCount;
+unsigned numIntr = NUM_INTERRUPT;
+unsigned intrCount[NUM_INTERRUPT+1];
+
+#define CLEAR(x) do { \
+		int zz; \
+		for (zz = 0; zz <= NUM_INTERRUPT; zz++) \
+			x[zz] = 0; \
+	} while (0)
+
+#define COPY(x, y) do { \
+		int zz; \
+		for (zz = 0; zz <= NUM_INTERRUPT; zz++) \
+			x[zz] = y[zz]; \
+	} while (0)
+
+/* strategy:
+ *
+ * Four points are identified
+ *   S : the starting point right after a system call, user -> kernel
+ *   S1: alternative starting points
+ *	 . faults (including device_not_available, which does go through
+ *	   the error_code: label)
+ *	 . interrupts (all go through do_IRQ)
+ *   C : need_resched is checked and schedule() is invoked if the flag is set
+ *   B : before RESTORE_ALL and if control is returning to user land
+ *
+ * For S points:
+ *   it MUST follow a break point
+ *   it initializes timing values for the starting point
+ *
+ * For S1 points:
+ *   if it follows a break point, treat it as an S point
+ *   otherwise, ignore it.
+ *
+ * For C points:
+ *   it must follow a C point or an S point
+ *   calculate the interval and reset the starting point
+ *
+ * For B points:
+ *   it must follow a C point or an S point
+ *   calculate the interval and invalidate the starting point
+ *
+ */
+
+/* used to debug the execution between start() and start1() */
+static unsigned startFlag = 0;
+
+asmlinkage void latency_start(const char *fname, unsigned lineno, int cause)
+{
+	/* if we are not logging or we have an error, do nothing */
+	if ((latencyData.logFlag == 0) || (latencyData.panicFlag != 0))
+		return;
+
+	if (latencyData.syncFlag != SfC) {
+		readclock(latencyStartCount);
+		startFlag = 1;
+		mask = 0;
+		theCause = cause;
+		latencyData.syncFlag = SfC;
+		latencyStart.Address = (int)__builtin_return_address(0);
+		latencyStart.FileName = fname;
+		latencyStart.FileLine = lineno;
+		latencyStart.ProcId = current->pid;
+		memcpy(latencyStart.ProcName, current->comm, COMMAND_LENGTH);
+		CLEAR(intrCount);
+	}
+}
+
+void latency_cause(int was, int tobe)
+{
+	if (was == -100) {
+		mask |= tobe;
+		return;
+	}
+
+	if (theCause == was)
+		theCause = tobe;
+}
+
+void latency_logentry(unsigned diff, const char *fname,
+		unsigned lineno, unsigned Address, int mask)
+{
+	unsigned lowest = 0xffffffff;
+	unsigned lowestIndex = 0;
+	unsigned sameIndex = 0xffffffff;
+	int i;
+
+	if (diff < latencyData.least)
+		return;
+
+	/* check if we need to log this event */
+	for (i = 0; i < NUM_LOG_ENTRY; i++) {
+		if (lowest > latencyData.entry[i].latency) {
+			lowest = latencyData.entry[i].latency;
+			lowestIndex = i;
+		}
+		/* If the call addresses match, it is the same path */
+		if ((Address == latencyData.entry[i].end.Address) &&
+		    (latencyStart.Address == latencyData.entry[i].start.Address)) {
+			sameIndex = i;
+			break;
+		}
+	}
+
+	if (sameIndex == 0xffffffff) {
+		i = lowestIndex;
+		/*
+		 * We would like "least" to be the smallest latency in the
+		 * table, but we are pressed for time, so we settle for it
+		 * being what may have been the lowest; this new entry may
+		 * replace the found lowest with a higher value, in which
+		 * case we don't have the true lowest.  Oh well, we get
+		 * closer as time passes.  The real reason for this is to
+		 * speed things up, so we don't worry too much if this
+		 * isn't exact.
+		 */
+		latencyData.least = latencyData.entry[i].latency;
+	} else {
+		i = sameIndex;
+	}
+
+	if (diff > latencyData.entry[i].latency) {
+		latencyData.entry[i].latency = diff;
+		latencyData.entry[i].cause = theCause;
+		latencyData.entry[i].mask = mask;
+
+		latencyData.entry[i].end.FileName = fname;
+		latencyData.entry[i].end.FileLine = lineno;
+		latencyData.entry[i].end.Address = Address;
+		latencyData.entry[i].end.ProcId = current->pid;
+		memcpy(latencyData.entry[i].end.ProcName,
+			current->comm, COMMAND_LENGTH);
+
+		latencyData.entry[i].start.FileName = latencyStart.FileName;
+		latencyData.entry[i].start.FileLine = latencyStart.FileLine;
+		latencyData.entry[i].start.Address = latencyStart.Address;
+		latencyData.entry[i].start.ProcId = latencyStart.ProcId;
+		memcpy(latencyData.entry[i].start.ProcName,
+			latencyStart.ProcName, COMMAND_LENGTH);
+	}
+
+	latencyData.numIntrs++;
+}
+
+/* Called at the end of a preemption-off period */
+asmlinkage void latency_end(const char *fname, unsigned lineno)
+{
+	unsigned endCount;
+
+	/* if we are not logging or we have an error, do nothing */
+	if ((latencyData.logFlag == 0) || (latencyData.panicFlag != 0))
+		return;
+
+	/* read the clock again */
+	readclock(endCount);
+
+	if (latencyData.syncFlag == SfS) {
+		latencyData.count[SfS]++;
+		return;
+	}
+
+	latency_logentry(endCount - latencyStartCount, fname, lineno,
+		(int)__builtin_return_address(0), mask);
+
+	latencyData.syncFlag = SfS;
+	return;
+}
+
+/* latency_check is for the end of one period and the start of another */
+asmlinkage void latency_check(const char *fname, unsigned lineno, int cause)
+{
+	unsigned endCount;
+
+	/* if we are not logging or we have an error, do nothing */
+	if ((latencyData.logFlag == 0) || (latencyData.panicFlag != 0))
+		return;
+
+	/* read the clock again */
+	readclock(endCount);
+
+	if (latencyData.syncFlag != SfS) {
+		latency_logentry(endCount - latencyStartCount, fname, lineno,
+			(int)__builtin_return_address(0), mask);
+	}
+
+	/* reset the starting point */
+	latencyData.syncFlag = SfM;
+	mask = 0;
+	theCause = cause;
+	latencyStart.Address = (int)__builtin_return_address(0);
+	latencyStart.FileName = fname;
+	latencyStart.FileLine = lineno;
+	latencyStart.ProcId = current->pid;
+	memcpy(latencyStart.ProcName, current->comm, COMMAND_LENGTH);
+	CLEAR(intrCount);
+	readclock(latencyStartCount);
+}
+
+static int g_read_completed = 0;
+
+static int latencytimes_read_proc(char *page_buffer, char **my_first_byte,
+	off_t virtual_start, int length, int *eof, void *data)
+{
+	int my_buffer_offset = 0;
+	char * const my_base = page_buffer;
+	int i, j, max;
+
+	if (virtual_start == 0) {
+		/* Just been opened */
+		latencyData.logFlag = 0;	/* stop logging */
+		g_read_completed = 0;
+		my_buffer_offset += sprintf(my_base + my_buffer_offset,
+			"Worst %d latency times of %d measured in this period.\n"
+			"  usec      cause     mask   start line/file      address   end line/file\n",
+			NUM_LOG_ENTRY, latencyData.numIntrs);
+	} else if (g_read_completed == NUM_LOG_ENTRY) {
+		*eof = 1;
+		latencyData.breakCount = 0;
+		latencyData.syncFlag = SfS;
+		latencyData.numIntrs = 0;
+		latencyData.least = 0;
+		for (i = 0; i < latencyData.numCount;
+				latencyData.count[i++] = 0);
+		latencyData.logFlag = 1;	/* start logging */
+		return 0;
+	}
+	g_read_completed++;
+
+	/* find the largest remaining entry and print it next */
+	for (j = max = i = 0; j < NUM_LOG_ENTRY; j++) {
+		if (latencyData.entry[j].latency > max) {
+			max = latencyData.entry[j].latency;
+			i = j;
+		}
+	}
+
+	my_buffer_offset += sprintf(my_base + my_buffer_offset,
+		"%6d %10s %8x %5d/%-15s %8x %5d/%s\n",
+		(int)((latencyData.entry[i].latency * 10) /
+			(cpu_khz / 100)),
+		(latencyData.entry[i].cause == -99) ? "unknown" :
+		(latencyData.entry[i].cause < 0) ?
+			irq_desc[~latencyData.entry[i].cause].action->name :
+		(latencyData.entry[i].cause > CAUSE_MAX) ? "Unknown" :
+			cause_name[latencyData.entry[i].cause],
+		latencyData.entry[i].mask,
+		latencyData.entry[i].start.FileLine,
+		latencyData.entry[i].start.FileName == (char *)0 ?
+			"entry.S" : latencyData.entry[i].start.FileName,
+		latencyData.entry[i].start.Address,
+		/* latencyData.entry[i].start.ProcId,
+		   latencyData.entry[i].start.ProcName, */
+		latencyData.entry[i].end.FileLine,
+		latencyData.entry[i].end.FileName == (char *)0 ?
+			"entry.S" : latencyData.entry[i].end.FileName /* ,
+		latencyData.entry[i].end.ProcId,
+		latencyData.entry[i].end.ProcName */);
+
+	/* Clearing these two effectively clears the entry */
+	latencyData.entry[i].latency = 0;
+	latencyData.entry[i].end.Address = 0;
+
+	*my_first_byte = page_buffer;
+	return my_buffer_offset;
+}
+
+int __init latencytimes_init(void)
+{
+#ifdef CONFIG_PROC_FS
+	create_proc_read_entry("latencytimes", 0, 0, latencytimes_read_proc, 0);
+#endif
+	return 0;
+}
+
+__initcall(latencytimes_init);
diff -urN linux-2.4.12-preempt/kernel/sched.c linux/kernel/sched.c
--- linux-2.4.12-preempt/kernel/sched.c	Mon Oct 8 19:08:45 2001
+++ linux/kernel/sched.c	Mon Oct 8 19:36:33 2001
@@ -1110,10 +1110,12 @@
 	/* it would be ideal not to lock tasks to their cpu here,
 	 * but only around the data that needs such locking */
 	lock_to_this_cpu();
+	preempt_lock_start(2);
 	atomic_add(PREEMPT_ACTIVE + 1, &switch_lock_count());
 	schedule();
 	restore_cpus_allowed();
 	atomic_sub(PREEMPT_ACTIVE + 1, &switch_lock_count());
+	preempt_lock_stop();
 }
 }
 #endif /* CONFIG_PREEMPT */
diff -urN linux-2.4.12-preempt/kernel/softirq.c linux/kernel/softirq.c
--- linux-2.4.12-preempt/kernel/softirq.c	Mon Oct 8 19:08:45 2001
+++ linux/kernel/softirq.c	Mon Oct 8 19:36:33 2001
@@ -44,6 +44,12 @@
 
 static struct softirq_action softirq_vec[32] __cacheline_aligned;
 
+#ifdef CONFIG_PREEMPT_TIMES
+extern void latency_cause(int,int);
+#else
+#define latency_cause(a, b)
+#endif
+
 /*
  * we cannot loop indefinitely here to avoid userspace starvation,
  * but we also don't want to introduce a worst case 1/HZ latency
@@ -67,6 +73,7 @@
 	if (in_interrupt())
 		return;
 
+	preempt_lock_start(-99);
 
 	local_irq_save(flags);
 
@@ -84,6 +91,9 @@
 
 	local_irq_enable();
 
 	h = softirq_vec;
+
+	latency_cause(-100, pending);
+
 	do {
 		if (pending & 1)
@@ -106,6 +116,7 @@
 	}
 
 	local_irq_restore(flags);
+	preempt_lock_stop();
 }
 
 /*
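
Usage illustration (not part of the patch): a minimal user-space sketch that
polls /proc/latencytimes.  It assumes only what the patch itself provides,
namely that reading the file returns the worst NUM_LOG_ENTRY (20) latencies
seen since the previous read and then resets the table, as described in the
Configure.help entry above.  The file name latencytimes-poll.c and the
default 10-second sampling interval are made up for the example.

/* latencytimes-poll.c: periodically dump /proc/latencytimes.
 *
 * Build: gcc -O2 -o latencytimes-poll latencytimes-poll.c
 * Run:   ./latencytimes-poll [seconds-between-samples]
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int interval = (argc > 1) ? atoi(argv[1]) : 10;
	char buf[4096];

	for (;;) {
		FILE *f = fopen("/proc/latencytimes", "r");
		if (!f) {
			perror("/proc/latencytimes");
			return 1;
		}
		/* each read returns (and clears) the 20 worst preemption-off
		 * latencies recorded since the previous read */
		while (fgets(buf, sizeof(buf), f))
			fputs(buf, stdout);
		fclose(f);
		fflush(stdout);
		sleep(interval);
	}
	return 0;
}

Each reported line gives the latency in microseconds (the TSC delta scaled by
cpu_khz), the cause (spin_lock, reacqBKL, BKL, or the name of the interrupt
handler when the period started in do_IRQ), the softirq mask, and the
file/line of the start and end points of the non-preemptible region.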