Attempt to do something intelligent with IRQ handlers which don't return IRQ_HANDLED. - If they return neither IRQ_HANDLED nor IRQ_NONE, complain. - If they return IRQ_NONE more than 99900 times in 100000 interrupts, complain and disable the IRQ. I did have it at 750-in-1000, but someone had an otherwise-functioning system which triggered it. The 99.9% ratio is designed to address the problem wherein the babbling device shares an IRQ with a good device. We don't want the good device's trickle of IRQ_HANDLED callouts to defeat the lockup detector. (fat chance os this working right). - Add a kernel boot parameter `noirqdebug' to turn the whole thing off. 25-akpm/Documentation/kernel-parameters.txt | 3 25-akpm/arch/i386/kernel/irq.c | 118 +++++++++++++++++++++------- 25-akpm/include/linux/irq.h | 2 3 files changed, 96 insertions(+), 27 deletions(-) diff -puN arch/i386/kernel/irq.c~irq-check-rate-limit arch/i386/kernel/irq.c --- 25/arch/i386/kernel/irq.c~irq-check-rate-limit Tue May 27 14:42:35 2003 +++ 25-akpm/arch/i386/kernel/irq.c Tue May 27 14:55:57 2003 @@ -66,8 +66,12 @@ /* * Controller mappings for all interrupt sources: */ -irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = - { [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}}; +irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { + [0 ... NR_IRQS-1] = { + .handler = &no_irq_type, + .lock = SPIN_LOCK_UNLOCKED + } +}; static void register_irq_proc (unsigned int irq); @@ -209,7 +213,6 @@ int handle_IRQ_event(unsigned int irq, { int status = 1; /* Force the "do bottom halves" bit */ int retval = 0; - struct irqaction *first_action = action; if (!(action->flags & SA_INTERRUPT)) local_irq_enable(); @@ -222,30 +225,88 @@ int handle_IRQ_event(unsigned int irq, if (status & SA_SAMPLE_RANDOM) add_interrupt_randomness(irq); local_irq_disable(); - if (retval != 1) { - static int count = 100; - if (count) { - count--; - if (retval) { - printk("irq event %d: bogus retval mask %x\n", - irq, retval); - } else { - printk("irq %d: nobody cared!\n", irq); - } - dump_stack(); - printk("handlers:\n"); - action = first_action; - do { - printk("[<%p>]", action->handler); - print_symbol(" (%s)", - (unsigned long)action->handler); - printk("\n"); - action = action->next; - } while (action); - } + return retval; +} + +static void __report_bad_irq(int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + struct irqaction *action; + + if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { + printk(KERN_ERR "irq event %d: bogus return value %x\n", + irq, action_ret); + } else { + printk(KERN_ERR "irq %d: nobody cared!\n", irq); } + dump_stack(); + printk(KERN_ERR "handlers:\n"); + action = desc->action; + do { + printk(KERN_ERR "[<%p>]", action->handler); + print_symbol(" (%s)", + (unsigned long)action->handler); + printk("\n"); + action = action->next; + } while (action); +} + +static void report_bad_irq(int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + static int count = 100; - return status; + if (count) { + count--; + __report_bad_irq(irq, desc, action_ret); + } +} + +static int noirqdebug; + +static int __init noirqdebug_setup(char *str) +{ + noirqdebug = 1; + printk("IRQ lockup detection disabled\n"); + return 1; +} + +__setup("noirqdebug", noirqdebug_setup); + +/* + * If 99,900 of the previous 100,000 interrupts have not been handled then + * assume that the IRQ is stuck in some manner. Drop a diagnostic and try to + * turn the IRQ off. + * + * (The other 100-of-100,000 interrupts may have been a correctly-functioning + * device sharing an IRQ with the failing one) + * + * Called under desc->lock + */ +static void note_interrupt(int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + if (action_ret != IRQ_HANDLED) { + desc->irqs_unhandled++; + if (action_ret != IRQ_NONE) + report_bad_irq(irq, desc, action_ret); + } + + desc->irq_count++; + if (desc->irq_count < 100000) + return; + + desc->irq_count = 0; + if (desc->irqs_unhandled > 99900) { + /* + * The interrupt is stuck + */ + __report_bad_irq(irq, desc, action_ret); + /* + * Now kill the IRQ + */ + printk(KERN_EMERG "Disabling IRQ #%d\n", irq); + desc->status |= IRQ_DISABLED; + desc->handler->disable(irq); + } + desc->irqs_unhandled = 0; } /* @@ -418,10 +479,13 @@ asmlinkage unsigned int do_IRQ(struct pt * SMP environment. */ for (;;) { + irqreturn_t action_ret; + spin_unlock(&desc->lock); - handle_IRQ_event(irq, ®s, action); + action_ret = handle_IRQ_event(irq, ®s, action); spin_lock(&desc->lock); - + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); if (likely(!(desc->status & IRQ_PENDING))) break; desc->status &= ~IRQ_PENDING; diff -puN include/linux/irq.h~irq-check-rate-limit include/linux/irq.h --- 25/include/linux/irq.h~irq-check-rate-limit Tue May 27 14:42:35 2003 +++ 25-akpm/include/linux/irq.h Tue May 27 14:42:35 2003 @@ -61,6 +61,8 @@ typedef struct { hw_irq_controller *handler; struct irqaction *action; /* IRQ action list */ unsigned int depth; /* nested irq disables */ + unsigned int irq_count; /* For detecting broken interrupts */ + unsigned int irqs_unhandled; spinlock_t lock; } ____cacheline_aligned irq_desc_t; diff -puN Documentation/kernel-parameters.txt~irq-check-rate-limit Documentation/kernel-parameters.txt --- 25/Documentation/kernel-parameters.txt~irq-check-rate-limit Tue May 27 14:53:34 2003 +++ 25-akpm/Documentation/kernel-parameters.txt Tue May 27 14:54:51 2003 @@ -617,6 +617,9 @@ running once the system is up. noht [SMP,IA-32] Disables P4 Xeon(tm) HyperThreading. + noirqdebug [IA-32] Disables the code which attempts to detect and + disable unhandled interrupt sources. + noisapnp [ISAPNP] Disables ISA PnP code. noinitrd [RAM] Tells the kernel not to load any configured _