From: Ingo Molnar

Implements the "non-preemptible section trace" feature, which prints
out a "critical section nesting" trace after stackdumps:

 Call Trace:
  [] show_stack+0x7a/0x90
  [] show_registers+0x156/0x1ce
  [] die+0xe8/0x172
  [] do_trap+0x76/0xa3
  [] do_invalid_op+0xa3/0xad
  [] error_code+0x4f/0x54
  [] test+0x8/0xa
  [] sys_gettimeofday+0x56/0x74
  [] sysenter_past_esp+0x54/0x75
 ---------------------------
 | preempt count: 00000004 ]
 | 4 levels deep critical section nesting:
 -----------------------------------------
 .. [] .... test3+0xd/0xf
 .....[] ..   ( <= test2+0x8/0x21)
 .. [] .... test3+0xd/0xf
 .....[] ..   ( <= test2+0xd/0x21)
 .. [] .... test2+0x17/0x21
 .....[] ..   ( <= test+0x8/0xa)
 .. [] .... die+0x39/0x172
 .....[] ..   ( <= do_trap+0x76/0xa3)

The feature is implemented via a low-overhead mechanism: the caller and
caller-parent addresses are recorded for each preempt_disable() call
site, and are printed upon crashes.  Note that every other API that
disables preemption is thus traced too: spinlocks, rwlocks, per-CPU
variable accesses, etc.

The feature is especially useful for identifying leaked preemption
counts, as the missing count shows up as an extra entry in the trace.
It is active when CONFIG_DEBUG_PREEMPT is enabled.

I have also cleaned up preemption-count debugging by moving the debug
functions out of sched.c into lib/preempt.c, and I have added
preemption-counter-imbalance checks to the hardirq and softirq
processing codepaths.  The behavior of these checks is now uniform: a
warning is printed with all the info we have at that point, and the
preemption counter is then restored to its old value.

On x86 I have changed the 4KSTACKS feature to inherit the low bits of
the preemption count across hardirq/softirq context switches, so that
the preemption-trace entries of interrupts do not overwrite
process-level preemption-trace entries.

Boot-tested on x86.  Should work on all architectures.
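For reviewers unfamiliar with the trick: the tracing boils down to
__builtin_return_address() plus a small per-task array.  Below is a
minimal standalone userspace sketch of the recording idea -- it is not
part of the patch, and every name in it (trace_preempt_disable() etc.)
is invented for illustration.  The real implementation is
add_preempt_count()/print_preempt_trace() in lib/preempt.c further
down, which additionally records the caller's parent via
__builtin_return_address(1..3) (skipping over spinlock wrappers) when
CONFIG_FRAME_POINTER is enabled:

	#include <stdio.h>

	#define MAX_TRACE 25

	static void *off_caller[MAX_TRACE];
	static unsigned int depth;

	/* record the disabler's return address at this nesting level */
	__attribute__((noinline)) static void trace_preempt_disable(void)
	{
		if (depth < MAX_TRACE)
			off_caller[depth] = __builtin_return_address(0);
		depth++;
	}

	static void trace_preempt_enable(void)
	{
		depth--;
	}

	/* print one entry per still-open nesting level */
	static void dump_trace(void)
	{
		unsigned int i;

		printf("%u levels deep critical section nesting:\n", depth);
		for (i = 0; i < depth && i < MAX_TRACE; i++)
			printf(".. [<%p>]\n", off_caller[i]);
	}

	/* leaks a disable: no matching enable, so it stays in the dump */
	__attribute__((noinline)) static void leaky(void)
	{
		trace_preempt_disable();
	}

	int main(void)
	{
		trace_preempt_disable();
		leaky();
		dump_trace();	/* two entries; the leaky() one is the bug */
		trace_preempt_enable();
		return 0;
	}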
Signed-off-by: Ingo Molnar
Signed-off-by: Andrew Morton
---

 arch/i386/kernel/irq.c       |   28 +++++++++++
 arch/i386/kernel/traps.c     |    1 
 arch/x86_64/kernel/process.c |    2 
 arch/x86_64/kernel/traps.c   |    8 ++-
 include/asm-x86_64/proto.h   |    2 
 include/linux/sched.h        |   13 +++++
 kernel/exit.c                |    9 ++-
 kernel/irq/handle.c          |   17 +++++++
 kernel/sched.c               |   33 --------------
 kernel/softirq.c             |   16 ++++++
 kernel/timer.c               |   35 ++++++++++----
 lib/Kconfig.debug            |    3 +
 lib/Makefile                 |    2 
 lib/preempt.c                |  101 +++++++++++++++++++++++++++++++++++++++++++
 14 files changed, 217 insertions(+), 53 deletions(-)

diff -puN arch/i386/kernel/irq.c~debug-preempt-tracing arch/i386/kernel/irq.c
--- devel/arch/i386/kernel/irq.c~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/arch/i386/kernel/irq.c	2005-08-06 14:50:05.000000000 -0700
@@ -55,6 +55,9 @@ fastcall unsigned int do_IRQ(struct pt_r
 {
 	/* high bits used in ret_from_ code */
 	int irq = regs->orig_eax & 0xff;
+#ifdef CONFIG_DEBUG_PREEMPT
+	u32 count = preempt_count() & PREEMPT_MASK;
+#endif
 #ifdef CONFIG_4KSTACKS
 	union irq_ctx *curctx, *irqctx;
 	u32 *isp;
@@ -95,6 +98,14 @@ fastcall unsigned int do_IRQ(struct pt_r
 		irqctx->tinfo.task = curctx->tinfo.task;
 		irqctx->tinfo.previous_esp = current_stack_pointer;

+		/*
+		 * Keep the preemption-count offset, so that the
+		 * process-level preemption-trace entries do not
+		 * get overwritten by the hardirq context:
+		 */
+#ifdef CONFIG_DEBUG_PREEMPT
+		irqctx->tinfo.preempt_count += count;
+#endif
 		asm volatile(
 			" xchgl %%ebx,%%esp \n"
 			" call __do_IRQ \n"
@@ -103,6 +114,9 @@ fastcall unsigned int do_IRQ(struct pt_r
 			:  "0" (irq),   "1" (regs),  "2" (isp)
 			: "memory", "cc", "ecx"
 		);
+#ifdef CONFIG_DEBUG_PREEMPT
+		irqctx->tinfo.preempt_count -= count;
+#endif
 	} else
 #endif
 		__do_IRQ(irq, regs);
@@ -165,6 +179,9 @@ extern asmlinkage void __do_softirq(void

 asmlinkage void do_softirq(void)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
+	u32 count = preempt_count() & PREEMPT_MASK;
+#endif
 	unsigned long flags;
 	struct thread_info *curctx;
 	union irq_ctx *irqctx;
@@ -181,6 +198,14 @@ asmlinkage void do_softirq(void)
 		irqctx->tinfo.task = curctx->task;
 		irqctx->tinfo.previous_esp = current_stack_pointer;

+		/*
+		 * Keep the preemption-count offset, so that the
+		 * process-level preemption-trace entries do not
+		 * get overwritten by the softirq context:
+		 */
+#ifdef CONFIG_DEBUG_PREEMPT
+		irqctx->tinfo.preempt_count += count;
+#endif
 		/* build the stack frame on the softirq stack */
 		isp = (u32*) ((char*)irqctx + sizeof(*irqctx));

@@ -192,6 +217,9 @@ asmlinkage void do_softirq(void)
 			: "0"(isp)
 			: "memory", "cc", "edx", "ecx", "eax"
 		);
+#ifdef CONFIG_DEBUG_PREEMPT
+		irqctx->tinfo.preempt_count -= count;
+#endif
 	}

 	local_irq_restore(flags);
diff -puN arch/i386/kernel/traps.c~debug-preempt-tracing arch/i386/kernel/traps.c
--- devel/arch/i386/kernel/traps.c~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/arch/i386/kernel/traps.c	2005-08-06 14:50:05.000000000 -0700
@@ -164,6 +164,7 @@ void show_trace(struct task_struct *task
 			break;
 		printk(" =======================\n");
 	}
+	print_preempt_trace(task, task->thread_info->preempt_count);
 }

 void show_stack(struct task_struct *task, unsigned long *esp)
diff -puN arch/x86_64/kernel/process.c~debug-preempt-tracing arch/x86_64/kernel/process.c
--- devel/arch/x86_64/kernel/process.c~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/arch/x86_64/kernel/process.c	2005-08-06 14:50:05.000000000 -0700
@@ -312,7 +312,7 @@ void show_regs(struct pt_regs *regs)
 {
 	printk("CPU %d:", smp_processor_id());
 	__show_regs(regs);
-	show_trace(&regs->rsp);
+	show_trace(current, &regs->rsp);
 }

 /*
diff -puN arch/x86_64/kernel/traps.c~debug-preempt-tracing arch/x86_64/kernel/traps.c
--- devel/arch/x86_64/kernel/traps.c~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/arch/x86_64/kernel/traps.c	2005-08-06 14:50:19.000000000 -0700
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -157,7 +158,7 @@ static unsigned long *in_exception_stack
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */

-void show_trace(unsigned long *stack)
+void show_trace(struct task_struct *task, unsigned long *stack)
 {
 	unsigned long addr;
 	const unsigned cpu = safe_smp_processor_id();
@@ -222,6 +223,7 @@ void show_trace(unsigned long *stack)
 	HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
 #undef HANDLE_STACK
 	printk("\n");
+	print_preempt_trace(task, task->thread_info->preempt_count);
 }

 void show_stack(struct task_struct *tsk, unsigned long * rsp)
@@ -258,7 +260,7 @@ void show_stack(struct task_struct *tsk,
 		printk("%016lx ", *stack++);
 		touch_nmi_watchdog();
 	}
-	show_trace((unsigned long *)rsp);
+	show_trace(tsk, (unsigned long *)rsp);
 }

 /*
@@ -267,7 +269,7 @@ void show_stack(struct task_struct *tsk,
 void dump_stack(void)
 {
 	unsigned long dummy;
-	show_trace(&dummy);
+	show_trace(current, &dummy);
 }

 EXPORT_SYMBOL(dump_stack);
diff -puN include/asm-x86_64/proto.h~debug-preempt-tracing include/asm-x86_64/proto.h
--- devel/include/asm-x86_64/proto.h~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/include/asm-x86_64/proto.h	2005-08-06 14:50:05.000000000 -0700
@@ -66,7 +66,7 @@ extern unsigned long end_pfn_map;

 extern cpumask_t cpu_initialized;

-extern void show_trace(unsigned long * rsp);
+extern void show_trace(struct task_struct *task, unsigned long *rsp);
 extern void show_registers(struct pt_regs *regs);

 extern void exception_table_check(void);
diff -puN include/linux/sched.h~debug-preempt-tracing include/linux/sched.h
--- devel/include/linux/sched.h~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/include/linux/sched.h	2005-08-06 14:50:05.000000000 -0700
@@ -644,6 +644,14 @@ extern int groups_search(struct group_in
 #define GROUP_AT(gi, i) \
     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])

+#ifdef CONFIG_DEBUG_PREEMPT
+# define MAX_PREEMPT_TRACE 25
+extern void print_preempt_trace(struct task_struct *task, u32 count);
+#else
+static inline void print_preempt_trace(struct task_struct *task, u32 count)
+{
+}
+#endif

 struct audit_context;	/* See audit.c */
 struct mempolicy;
@@ -822,6 +830,11 @@ struct task_struct {
 	int cpuset_mems_generation;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
+
+#ifdef CONFIG_DEBUG_PREEMPT
+	void *preempt_off_caller[MAX_PREEMPT_TRACE];
+	void *preempt_off_parent[MAX_PREEMPT_TRACE];
+#endif
 };

 static inline pid_t process_group(struct task_struct *tsk)
diff -puN kernel/exit.c~debug-preempt-tracing kernel/exit.c
--- devel/kernel/exit.c~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/kernel/exit.c	2005-08-06 14:50:05.000000000 -0700
@@ -821,10 +821,11 @@ fastcall NORET_TYPE void do_exit(long co
 		tsk->it_prof_expires = cputime_zero;
 		tsk->it_sched_expires = 0;

-	if (unlikely(in_atomic()))
-		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
-				current->comm, current->pid,
-				preempt_count());
+	if (unlikely(in_atomic())) {
+		printk(KERN_ERR "BUG: %s[%d] exited with nonzero preempt_count %d!\n",
+				tsk->comm, tsk->pid, preempt_count());
+		print_preempt_trace(tsk, preempt_count());
+	}

 	acct_update_integrals(tsk);
 	update_mem_hiwater(tsk);
diff -puN kernel/irq/handle.c~debug-preempt-tracing kernel/irq/handle.c
--- devel/kernel/irq/handle.c~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/kernel/irq/handle.c	2005-08-06 14:50:05.000000000 -0700
@@ -85,7 +85,24 @@ fastcall int handle_IRQ_event(unsigned i
 		local_irq_enable();

 	do {
+#ifdef CONFIG_DEBUG_PREEMPT
+		u32 in_count = preempt_count(), out_count;
+#endif
 		ret = action->handler(irq, action->dev_id, regs);
+#ifdef CONFIG_DEBUG_PREEMPT
+		out_count = preempt_count();
+		if (in_count != out_count) {
+			printk(KERN_ERR "BUG: irq %d [%s] preempt-count "
+				"imbalance: in=%08x, out=%08x!\n",
+					irq, action->name, in_count, out_count);
+			print_preempt_trace(current, out_count);
+			/*
+			 * We already printed all the useful info,
+			 * fix up the preemption count now:
+			 */
+			preempt_count() = in_count;
+		}
+#endif
 		if (ret == IRQ_HANDLED)
 			status |= action->flags;
 		retval |= ret;
diff -puN kernel/sched.c~debug-preempt-tracing kernel/sched.c
--- devel/kernel/sched.c~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/kernel/sched.c	2005-08-06 14:50:05.000000000 -0700
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -2707,38 +2708,6 @@ static inline int dependent_sleeper(int
 }
 #endif

-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
-
-void fastcall add_preempt_count(int val)
-{
-	/*
-	 * Underflow?
-	 */
-	BUG_ON((preempt_count() < 0));
-	preempt_count() += val;
-	/*
-	 * Spinlock count overflowing soon?
-	 */
-	BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
-}
-EXPORT_SYMBOL(add_preempt_count);
-
-void fastcall sub_preempt_count(int val)
-{
-	/*
-	 * Underflow?
-	 */
-	BUG_ON(val > preempt_count());
-	/*
-	 * Is the spinlock portion underflowing?
-	 */
-	BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
-	preempt_count() -= val;
-}
-EXPORT_SYMBOL(sub_preempt_count);
-
-#endif
-
 /*
  * schedule() is the main scheduler function.
  */
diff -puN kernel/softirq.c~debug-preempt-tracing kernel/softirq.c
--- devel/kernel/softirq.c~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/kernel/softirq.c	2005-08-06 14:50:05.000000000 -0700
@@ -92,7 +92,23 @@ restart:

 	do {
 		if (pending & 1) {
+#ifdef CONFIG_DEBUG_PREEMPT
+			u32 in_count = preempt_count(), out_count;
+#endif
 			h->action(h);
+#ifdef CONFIG_DEBUG_PREEMPT
+			out_count = preempt_count();
+			if (in_count != out_count) {
+				printk(KERN_ERR "BUG: softirq %ld preempt-count "
+					"imbalance: in=%08x, out=%08x!\n",
+						h - softirq_vec, in_count, out_count);
+				print_preempt_trace(current, out_count);
+				/*
+				 * Fix up the bad preemption count:
+				 */
+				preempt_count() = in_count;
+			}
+#endif
 			rcu_bh_qsctr_inc(cpu);
 		}
 		h++;
diff -puN kernel/timer.c~debug-preempt-tracing kernel/timer.c
--- devel/kernel/timer.c~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/kernel/timer.c	2005-08-06 14:50:05.000000000 -0700
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -480,6 +481,7 @@ static inline void __run_timers(tvec_bas
 		while (!list_empty(head)) {
 			void (*fn)(unsigned long);
 			unsigned long data;
+			int in_count, out_count;

 			timer = list_entry(head->next,struct timer_list,entry);
 			fn = timer->function;
@@ -488,17 +490,20 @@ static inline void __run_timers(tvec_bas
 			set_running_timer(base, timer);
 			detach_timer(timer, 1);
 			spin_unlock_irq(&base->t_base.lock);
-			{
-				int preempt_count = preempt_count();
-				fn(data);
-				if (preempt_count != preempt_count()) {
-					printk(KERN_WARNING "huh, entered %p "
-					       "with preempt_count %08x, exited"
-					       " with %08x?\n",
-					       fn, preempt_count,
-					       preempt_count());
-					BUG();
-				}
+
+			in_count = preempt_count();
+			fn(data);
+			out_count = preempt_count();
+			if (in_count != out_count) {
+				print_symbol(KERN_ERR "BUG: %s", (long)fn);
+				printk(KERN_ERR "(%p) preempt-count imbalance: "
+					"in=%08x, out=%08x!",
+						fn, in_count, out_count);
+				print_preempt_trace(current, out_count);
+				/*
+				 * Fix up the bad preemption count:
+				 */
+				preempt_count() = in_count;
 			}
 			spin_lock_irq(&base->t_base.lock);
 		}
@@ -914,6 +919,10 @@ static void run_timer_softirq(struct sof

 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
+	if (panic_timeout == 2) {
+		panic_timeout = 0;
+		preempt_disable();
+	}
 }

 /*
@@ -922,6 +931,10 @@ static void run_timer_softirq(struct sof
 void run_local_timers(void)
 {
 	raise_softirq(TIMER_SOFTIRQ);
+	if (panic_timeout == 1) {
+		panic_timeout = 0;
+		preempt_disable();
+	}
 }

 /*
diff -puN lib/Kconfig.debug~debug-preempt-tracing lib/Kconfig.debug
--- devel/lib/Kconfig.debug~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/lib/Kconfig.debug	2005-08-06 14:50:05.000000000 -0700
@@ -89,6 +89,9 @@ config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPT
 	default y
+	select FRAME_POINTER
+	select KALLSYMS
+	select KALLSYMS_ALL
 	help
 	  If you say Y here then the kernel will use a debug variant of the
 	  commonly used smp_processor_id() function and will print warnings
diff -puN lib/Makefile~debug-preempt-tracing lib/Makefile
--- devel/lib/Makefile~debug-preempt-tracing	2005-08-06 14:50:05.000000000 -0700
+++ devel-akpm/lib/Makefile	2005-08-06 14:50:05.000000000 -0700
@@ -21,7 +21,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) +=
 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
-obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
+obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o preempt.o

 ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
   lib-y += dec_and_lock.o
diff -puN /dev/null lib/preempt.c
--- /dev/null	2003-09-15 06:40:47.000000000 -0700
+++ devel-akpm/lib/preempt.c	2005-08-06 14:50:05.000000000 -0700
@@ -0,0 +1,101 @@
+/*
+ * lib/preempt.c
+ *
+ * DEBUG_PREEMPT variant of add_preempt_count() and sub_preempt_count().
+ * Preemption tracing.
+ *
+ * (C) 2005 Ingo Molnar, Red Hat
+ */
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/kallsyms.h>
+
+/*
+ * Add a value to the preemption count, check for overflows and
+ * underflows, and maintain a small stack of callers that gets
+ * printed upon crashes.
+ */
+void fastcall add_preempt_count(int val)
+{
+	unsigned int count = preempt_count(), idx = count & PREEMPT_MASK;
+
+	/*
+	 * Underflow?
+	 */
+	BUG_ON((int)count < 0);
+
+	preempt_count() += val;
+
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	BUG_ON(idx >= PREEMPT_MASK-10);
+
+	/*
+	 * Maintain the per-task preemption-nesting stack (which
+	 * will be printed upon crashes). It's a low-overhead thing,
+	 * constant overhead per preempt-disable.
+	 */
+	if (idx < MAX_PREEMPT_TRACE) {
+		void *caller = __builtin_return_address(0), *parent = NULL;
+
+#ifdef CONFIG_FRAME_POINTER
+		parent = __builtin_return_address(1);
+		if (in_lock_functions(parent)) {
+			parent = __builtin_return_address(2);
+			if (in_lock_functions(parent))
+				parent = __builtin_return_address(3);
+		}
+#endif
+		current->preempt_off_caller[idx] = caller;
+		current->preempt_off_parent[idx] = parent;
+	}
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void fastcall sub_preempt_count(int val)
+{
+	unsigned int count = preempt_count();
+
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(val > count);
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	BUG_ON((val < PREEMPT_MASK) && !(count & PREEMPT_MASK));
+
+	preempt_count() -= val;
+}
+EXPORT_SYMBOL(sub_preempt_count);
+
+void print_preempt_trace(struct task_struct *task, u32 count)
+{
+	unsigned int i, idx = count & PREEMPT_MASK;
+
+	preempt_disable();
+
+	printk("---------------------------\n");
+	printk("| preempt count: %08x ]\n", count);
+	if (count) {
+		printk("| %d levels deep critical section nesting:\n", idx);
+		printk("----------------------------------------\n");
+	} else
+		printk("---------------------------\n");
+	for (i = 0; i < idx; i++) {
+		printk(".. [<%p>] .... ", task->preempt_off_caller[i]);
+		print_symbol("%s\n", (long)task->preempt_off_caller[i]);
+		printk(".....[<%p>] ..   ( <= ",
+				task->preempt_off_parent[i]);
+		print_symbol("%s)\n", (long)task->preempt_off_parent[i]);
+		if (i == MAX_PREEMPT_TRACE-1) {
+			printk("[rest truncated, reached MAX_PREEMPT_TRACE]\n");
+			break;
+		}
+	}
+	printk("\n");
+
+	preempt_enable();
+}
_
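
P.S.: the imbalance checks added to handle_IRQ_event(), __do_softirq()
and __run_timers() above all follow the same sample/compare/restore
pattern around the callback.  A standalone userspace analogue, purely
for illustration (the names and the plain 'count' variable standing in
for preempt_count() are invented):

	#include <stdio.h>

	static unsigned int count;	/* stands in for preempt_count() */

	/* a callback that leaks one disable level, like a buggy handler */
	static void bad_handler(void)
	{
		count++;
	}

	/* sample the count before the callback, compare afterwards,
	   warn and restore -- the same pattern the patch uses: */
	static void run_checked(void (*handler)(void), const char *name)
	{
		unsigned int in_count = count, out_count;

		handler();
		out_count = count;
		if (in_count != out_count) {
			printf("BUG: %s preempt-count imbalance: "
				"in=%08x, out=%08x!\n",
				name, in_count, out_count);
			count = in_count;
		}
	}

	int main(void)
	{
		run_checked(bad_handler, "bad_handler");
		return 0;
	}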