From: Nick Piggin John Hawkes described this problem to me: There *is* a small problem in this area, though, that SuSE avoids. "jiffies" gets updated by cpu0. The other CPUs may, over time, get out of sync (and they're initialized on ia64 to start out being out of sync), so it's no guarantee that every CPU will wake up from its timer interrupt and see a "jiffies" value that is guaranteed to be last_jiffies+1. Sometimes the jiffies value may be unchanged since the last wakeup. Sometimes the jiffies value may have incremented by 2 (or more, especially if cpu0's interrupts are disabled for long stretches of time). So an algorithm that says, "I'll call load_balance() only when jiffies is *exactly* N" is going to fail on occasion, either by calling load_balance() too often or not often enough. *** I fixed this by adding a last_balance field to struct sched_domain, and working off that. --- 25-akpm/include/linux/sched.h | 4 ++++ 25-akpm/kernel/sched.c | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff -puN include/linux/sched.h~sched-no-drop-balance include/linux/sched.h --- 25/include/linux/sched.h~sched-no-drop-balance Mon May 3 15:03:54 2004 +++ 25-akpm/include/linux/sched.h Mon May 3 15:03:54 2004 @@ -575,6 +575,7 @@ struct sched_domain { int flags; /* See SD_FLAG_* */ /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ unsigned int balance_interval; /* initialise to 1. units in ms. 
*/ unsigned int nr_balance_failed; /* initialise to 0 */ }; @@ -591,6 +592,7 @@ struct sched_domain { .cache_hot_time = 0, \ .cache_nice_tries = 0, \ .flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE | SD_FLAG_WAKE,\ + .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -607,6 +609,7 @@ struct sched_domain { .cache_hot_time = (5*1000000/2), \ .cache_nice_tries = 1, \ .flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE,\ + .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -624,6 +627,7 @@ struct sched_domain { .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ .flags = SD_FLAG_EXEC, \ + .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } diff -puN kernel/sched.c~sched-no-drop-balance kernel/sched.c --- 25/kernel/sched.c~sched-no-drop-balance Mon May 3 15:03:54 2004 +++ 25-akpm/kernel/sched.c Mon May 3 15:03:54 2004 @@ -1759,26 +1759,26 @@ static void rebalance_tick(int this_cpu, /* Run through all this CPU's domains */ do { - int modulo; + unsigned long interval; if (unlikely(!domain->groups)) break; - modulo = domain->balance_interval; - + interval = domain->balance_interval; if (idle != IDLE) - modulo *= domain->busy_factor; + interval *= domain->busy_factor; /* scale ms to jiffies */ - modulo = modulo * HZ / 1000; - if (modulo == 0) - modulo = 1; + interval = interval * HZ / 1000; + if (unlikely(interval == 0)) + interval = 1; - if (!(j % modulo)) { + if (j - domain->last_balance >= interval) { if (load_balance(this_cpu, this_rq, domain, idle)) { /* We've pulled tasks over so no longer idle */ idle = NOT_IDLE; } + domain->last_balance += interval; } domain = domain->parent; _