From: Oleg Nesterov smp_local_timer_interrupt() calls run_local_timers() which triggers TIMER_SOFTIRQ unconditionally. The handler, run_timer_softirq(), does: if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); This time_after_eq() is useless, base->timer_jiffies == jiffies almost always. So every local timer interrupt implies do_softirq()->__run_timers(). This patch adds tvec_base_t->timer_expires, which is used as an estimate of the "nearest" pending timer. It is calculated in __run_timers(), by scanning the next 64 entries in tvec_base_t->tv1. So, if we have a single pending timer which constantly re-adds itself after invocation with expires = jiffies + 1, we lose up to 64 (average 56) loop iterations (scanning tvec_base_t->tv1) per jiffie. But if expires = jiffies + 2, we have 28 iterations per jiffie, and only 50% of interrupts trigger TIMER_SOFTIRQ. I have collected some statistics during kernel compilation: 0.0282 TIMER_SOFTIRQ's per jiffie 1.0880 loop iterations (timer_expires calculation) per jiffie 0.0570 do_softirq() calls from smp_apic_timer_interrupt() but there is no noticeable time difference. Simple benchmark, counts gettimeofday() per second, 3 runs. Clean kernel: 2810548.9333 // gettimeofday()s per second 1m0.002s 0m8.761s 0m51.243s // real, user, sys 2811569.0833 1m0.003s 0m8.875s 0m51.130s 2810842.0333 1m0.002s 0m9.073s 0m50.931s Patched: 2812897.6667 // gettimeofday()s per second 1m0.002s 0m10.920s 0m49.090s // real, user, sys 2812929.4667 1m0.002s 0m11.074s 0m48.938s 2812905.9000 1m0.002s 0m11.092s 0m48.919s Again, slightly faster, but far below 1%. Code bloat: 68 bytes (gcc 2.95.3). 
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton --- 25-akpm/include/linux/timer.h | 1 25-akpm/kernel/timer.c | 65 ++++++++++++++++++++++++++---------------- 2 files changed, 41 insertions(+), 25 deletions(-) diff -puN include/linux/timer.h~reduce-false-timer_softirq-calls include/linux/timer.h --- 25/include/linux/timer.h~reduce-false-timer_softirq-calls 2004-11-30 01:23:08.341499008 -0800 +++ 25-akpm/include/linux/timer.h 2004-11-30 01:23:08.346498248 -0800 @@ -96,7 +96,6 @@ static inline void add_timer(struct time #endif extern void init_timers(void); -extern void run_local_timers(void); extern void it_real_fn(unsigned long); #endif diff -puN kernel/timer.c~reduce-false-timer_softirq-calls kernel/timer.c --- 25/kernel/timer.c~reduce-false-timer_softirq-calls 2004-11-30 01:23:08.343498704 -0800 +++ 25-akpm/kernel/timer.c 2004-11-30 01:23:08.348497944 -0800 @@ -65,7 +65,7 @@ typedef struct tvec_root_s { struct tvec_t_base_s { spinlock_t lock; - unsigned long timer_jiffies; + unsigned long timer_jiffies, timer_expires; struct timer_list *running_timer; tvec_root_t tv1; tvec_t tv2; @@ -136,6 +136,7 @@ static void internal_add_timer(tvec_base * or you set a timer to go off in the past */ vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + expires = base->timer_jiffies; } else { int i; /* If the timeout is larger than 0xffffffff on 64-bit @@ -152,6 +153,9 @@ static void internal_add_timer(tvec_base * Timers are FIFO: */ list_add_tail(&timer->entry, vec); + + if (time_after(base->timer_expires, expires)) + base->timer_expires = expires; } int __mod_timer(struct timer_list *timer, unsigned long expires) @@ -433,14 +437,17 @@ static int cascade(tvec_base_t *base, tv static inline void __run_timers(tvec_base_t *base) { - struct timer_list *timer; + unsigned long expires; + int probes; spin_lock_irq(&base->lock); + + base->timer_jiffies = base->timer_expires; while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list = 
LIST_HEAD_INIT(work_list); struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; - + /* * Cascade timers: */ @@ -449,10 +456,11 @@ static inline void __run_timers(tvec_bas (!cascade(base, &base->tv3, INDEX(1))) && !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; + ++base->timer_jiffies; list_splice_init(base->tv1.vec + index, &work_list); repeat: if (!list_empty(head)) { + struct timer_list *timer; void (*fn)(unsigned long); unsigned long data; @@ -471,9 +479,36 @@ repeat: } } set_running_timer(base, NULL); + + expires = base->timer_jiffies; + for (probes = 65; --probes && + (expires & TVR_MASK) && + list_empty(base->tv1.vec + (expires & TVR_MASK)); + ++expires); + base->timer_expires = expires; + spin_unlock_irq(&base->lock); } +/* + * This function runs timers and the timer-tq in bottom half context. + */ +static void run_timer_softirq(struct softirq_action *h) +{ + __run_timers(&__get_cpu_var(tvec_bases)); +} + +/* + * Called by the local, per-CPU timer interrupt on SMP. + */ +static inline void run_local_timers(int cpu) +{ + tvec_base_t *base = &per_cpu(tvec_bases, cpu); + + if (time_after_eq(jiffies, base->timer_expires)) + raise_softirq(TIMER_SOFTIRQ); +} + #ifdef CONFIG_NO_IDLE_HZ /* * Find out when the next timer event is due to happen. This @@ -862,7 +897,7 @@ void update_process_times(int user_tick) int cpu = smp_processor_id(), system = user_tick ^ 1; update_one_process(p, user_tick, system, cpu); - run_local_timers(); + run_local_timers(cpu); scheduler_tick(user_tick, system); } @@ -917,25 +952,6 @@ EXPORT_SYMBOL(xtime_lock); #endif /* - * This function runs timers and the timer-tq in bottom half context. - */ -static void run_timer_softirq(struct softirq_action *h) -{ - tvec_base_t *base = &__get_cpu_var(tvec_bases); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); -} - -/* - * Called by the local, per-CPU timer interrupt on SMP. 
- */ -void run_local_timers(void) -{ - raise_softirq(TIMER_SOFTIRQ); -} - -/* * Called by the timer interrupt. xtime_lock must already be taken * by the timer IRQ! */ @@ -1334,6 +1350,7 @@ static void __devinit init_timers_cpu(in INIT_LIST_HEAD(base->tv1.vec + j); base->timer_jiffies = jiffies; + base->timer_expires = base->timer_jiffies; } #ifdef CONFIG_HOTPLUG_CPU _