author     Andrew Morton <akpm@osdl.org>                2004-05-09 23:26:09 -0700
committer  Linus Torvalds <torvalds@ppc970.osdl.org>   2004-05-09 23:26:09 -0700
commit     850f7d78d3912cbba31e4f6c266f1182a7ae8a20 (patch)
tree       a89c1370d2af7f0b7658b3215cdc59842a315fa3 /kernel
parent     fa8f2c5000cafcaa87182c5f2c021ae1ae52299c (diff)
download   history-850f7d78d3912cbba31e4f6c266f1182a7ae8a20.tar.gz
[PATCH] sched: trivial fixes, cleanups
From: Ingo Molnar <mingo@elte.hu>
The trivial fixes:
- added recent trivial bits from Nick's and my patches
- CPU hotplug fix (see the sketch after this list)
- early init cleanup
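The hotplug item is a lock-balance fix: in the diff below, sched_migrate_task() gains an unlock_cpu_hotplug() call on the early-return path that previously left the hotplug lock held. A minimal user-space sketch of the pattern, with a pthreads mutex standing in for the kernel's hotplug lock (migrate_task_sketch and its argument are illustrative names, not kernel API):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Stand-ins for the kernel's lock_cpu_hotplug()/unlock_cpu_hotplug(). */
	static void lock_cpu_hotplug(void)   { pthread_mutex_lock(&hotplug_lock); }
	static void unlock_cpu_hotplug(void) { pthread_mutex_unlock(&hotplug_lock); }

	/* Shape of the fixed sched_migrate_task(): every exit path, including
	 * the early return, must drop the lock acquired on entry. */
	static void migrate_task_sketch(bool direct_migration)
	{
		lock_cpu_hotplug();

		if (direct_migration) {
			/* ... prepare and perform the migration ... */
			unlock_cpu_hotplug();   /* the call the patch adds */
			return;
		}

		/* ... otherwise queue a request for the migration thread ... */
		unlock_cpu_hotplug();
	}

	int main(void)
	{
		migrate_task_sketch(true);
		migrate_task_sketch(false);
		return 0;
	}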
Diffstat (limited to 'kernel')
-rw-r--r--    kernel/sched.c    485
1 file changed, 242 insertions(+), 243 deletions(-)
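One of the cleanups below replaces the open-coded `interval * HZ / 1000` arithmetic in rebalance_tick() with a MSEC_TO_JIFFIES() macro, added under #ifndef so an architecture-provided definition takes precedence. A standalone sketch of the conversion, with the macro bodies copied from the hunk that introduces them and HZ fixed at 1000 purely for the sample run:

	#include <stdio.h>

	#define HZ 1000	/* assumed tick rate for this sketch only */

	/* Fallback conversion macros as the patch adds them to kernel/sched.c. */
	#ifndef JIFFIES_TO_MSEC
	# define JIFFIES_TO_MSEC(x) ((x) * 1000 / HZ)
	#endif
	#ifndef MSEC_TO_JIFFIES
	# define MSEC_TO_JIFFIES(x) ((x) * HZ / 1000)
	#endif

	int main(void)
	{
		unsigned long interval_ms = 250;
		unsigned long interval = MSEC_TO_JIFFIES(interval_ms);

		/* Integer truncation is why rebalance_tick() clamps the result:
		 * with HZ = 100, any interval under 10 ms converts to 0 jiffies. */
		if (interval == 0)
			interval = 1;

		printf("%lu ms -> %lu jiffies -> %lu ms\n",
		       interval_ms, interval, JIFFIES_TO_MSEC(interval));
		return 0;
	}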
diff --git a/kernel/sched.c b/kernel/sched.c
index 94e19f1a558010..6a1c9a4df37e60 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,13 @@
 #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
 
+#ifndef JIFFIES_TO_MSEC
+# define JIFFIES_TO_MSEC(x) ((x) * 1000 / HZ)
+#endif
+#ifndef MSEC_TO_JIFFIES
+# define MSEC_TO_JIFFIES(x) ((x) * HZ / 1000)
+#endif
+
 /*
  * These are the 'tuning knobs' of the scheduler:
  *
@@ -257,16 +264,6 @@ const unsigned long scheduling_functions_end_here =
 # define task_running(rq, p)		((rq)->curr == (p))
 #endif
 
-static inline void nr_running_inc(runqueue_t *rq)
-{
-	rq->nr_running++;
-}
-
-static inline void nr_running_dec(runqueue_t *rq)
-{
-	rq->nr_running--;
-}
-
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -367,7 +364,7 @@ static int effective_prio(task_t *p)
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task(p, rq->active);
-	nr_running_inc(rq);
+	rq->nr_running++;
 }
 
 static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -488,7 +485,7 @@ static inline void activate_task(task_t *p, runqueue_t *rq)
  */
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	nr_running_dec(rq);
+	rq->nr_running--;
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
 	dequeue_task(p, p->array);
@@ -502,9 +499,9 @@ static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  */
+#ifdef CONFIG_SMP
 static inline void resched_task(task_t *p)
 {
-#ifdef CONFIG_SMP
 	int need_resched, nrpolling;
 
 	preempt_disable();
@@ -516,10 +513,13 @@ static inline void resched_task(task_t *p)
 	if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
 		smp_send_reschedule(task_cpu(p));
 	preempt_enable();
+}
 #else
+static inline void resched_task(task_t *p)
+{
 	set_tsk_need_resched(p);
-#endif
 }
+#endif
 
 /**
  * task_curr - is this task currently executing on a CPU?
@@ -611,13 +611,14 @@ void kick_process(task_t *p)
 }
 
 EXPORT_SYMBOL_GPL(kick_process);
+
 /*
  * Return a low guess at the load of cpu.
  */
 static inline unsigned long get_low_cpu_load(int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT;
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
 	return min(rq->cpu_load, load_now);
 }
@@ -625,7 +626,7 @@ static inline unsigned long get_low_cpu_load(int cpu)
 static inline unsigned long get_high_cpu_load(int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT;
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
 	return max(rq->cpu_load, load_now);
 }
@@ -633,26 +634,27 @@ static inline unsigned long get_high_cpu_load(int cpu)
 #endif
 
 /*
- * sched_balance_wake can be used with SMT architectures to wake a
- * task onto an idle sibling if cpu is not idle. Returns cpu if
- * cpu is idle or no siblings are idle, otherwise returns an idle
- * sibling.
+ * wake_idle() is useful especially on SMT architectures to wake a
+ * task onto an idle sibling if we would otherwise wake it onto a
+ * busy sibling.
+ *
+ * Returns the CPU we should wake onto.
  */
-#if defined(CONFIG_SMP) && defined(ARCH_HAS_SCHED_WAKE_BALANCE)
-static int sched_balance_wake(int cpu, task_t *p)
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, task_t *p)
 {
 	cpumask_t tmp;
-	struct sched_domain *domain;
+	struct sched_domain *sd;
 	int i;
 
 	if (idle_cpu(cpu))
 		return cpu;
 
-	domain = cpu_sched_domain(cpu);
-	if (!(domain->flags & SD_FLAG_WAKE))
+	sd = cpu_sched_domain(cpu);
+	if (!(sd->flags & SD_WAKE_IDLE))
 		return cpu;
 
-	cpus_and(tmp, domain->span, cpu_online_map);
+	cpus_and(tmp, sd->span, cpu_online_map);
 	for_each_cpu_mask(i, tmp) {
 		if (!cpu_isset(i, p->cpus_allowed))
 			continue;
@@ -664,7 +666,7 @@ static int sched_balance_wake(int cpu, task_t *p)
 	return cpu;
 }
 #else
-static inline int sched_balance_wake(int cpu, task_t *p)
+static inline int wake_idle(int cpu, task_t *p)
 {
 	return cpu;
 }
@@ -694,8 +696,8 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 #ifdef CONFIG_SMP
 	unsigned long long now;
 	unsigned long load, this_load;
-	int new_cpu;
 	struct sched_domain *sd;
+	int new_cpu;
 #endif
 
 	rq = task_rq_lock(p, &flags);
@@ -706,49 +708,44 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 	if (p->array)
 		goto out_running;
 
-	this_cpu = smp_processor_id();
 	cpu = task_cpu(p);
+	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
-	if (cpu == this_cpu || unlikely(cpu_is_offline(this_cpu)))
+	if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
 		goto out_activate;
 
-	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)
-		     || task_running(rq, p)))
-		goto out_activate;
+	new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+		goto out_set_cpu;
 
 	/* Passive load balancing */
 	load = get_low_cpu_load(cpu);
 	this_load = get_high_cpu_load(this_cpu) + SCHED_LOAD_SCALE;
-	if (load > this_load) {
-		new_cpu = sched_balance_wake(this_cpu, p);
-		set_task_cpu(p, new_cpu);
-		goto repeat_lock_task;
-	}
+	if (load > this_load)
+		goto out_set_cpu;
 
 	now = sched_clock();
-	sd = cpu_sched_domain(this_cpu);
 
 	/*
-	 * Fast-migrate the task if it's not running or
-	 * runnable currently. Do not violate hard affinity.
+	 * Migrate the task to the waking domain.
+	 * Do not violate hard affinity.
 	 */
-	do {
-		if (!(sd->flags & SD_FLAG_FASTMIGRATE))
+	for_each_domain(this_cpu, sd) {
+		if (!(sd->flags & SD_WAKE_AFFINE))
 			break;
 		if (now - p->timestamp < sd->cache_hot_time)
 			break;
 
-		if (cpu_isset(cpu, sd->span)) {
-			new_cpu = sched_balance_wake(this_cpu, p);
-			set_task_cpu(p, new_cpu);
-			goto repeat_lock_task;
-		}
-		sd = sd->parent;
-	} while (sd);
+		if (cpu_isset(cpu, sd->span))
+			goto out_set_cpu;
+	}
 
-	new_cpu = sched_balance_wake(cpu, p);
-	if (new_cpu != cpu) {
+	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+out_set_cpu:
+	new_cpu = wake_idle(new_cpu, p);
+	if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
 		set_task_cpu(p, new_cpu);
 		goto repeat_lock_task;
 	}
@@ -778,6 +775,14 @@ out_activate:
 		p->activated = -1;
 	}
 
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption, if the woken up task will run on
+	 * this cpu. (in this case the 'I will reschedule' promise of
+	 * the waker guarantees that the freshly woken up task is going
+	 * to be considered on this CPU.)
+	 */
 	if (sync && cpu == this_cpu) {
 		__activate_task(p, rq);
 	} else {
@@ -794,6 +799,7 @@ out:
 
 	return success;
 }
+
 int fastcall wake_up_process(task_t * p)
 {
 	return try_to_wake_up(p, TASK_STOPPED |
@@ -897,7 +903,7 @@ void fastcall wake_up_forked_process(task_t * p)
 		list_add_tail(&p->run_list, &current->run_list);
 		p->array = current->array;
 		p->array->nr_active++;
-		nr_running_inc(rq);
+		rq->nr_running++;
 	}
 	task_rq_unlock(rq, &flags);
 }
@@ -1114,8 +1120,8 @@ enum idle_type
  */
 static void sched_migrate_task(task_t *p, int dest_cpu)
 {
-	runqueue_t *rq;
 	migration_req_t req;
+	runqueue_t *rq;
 	unsigned long flags;
 
 	lock_cpu_hotplug();
@@ -1136,6 +1142,7 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
 		 * the migration.
 		 */
 		tlb_migrate_prepare(current->mm);
+		unlock_cpu_hotplug();
 
 		return;
 	}
@@ -1146,9 +1153,9 @@ out:
 
 /*
  * Find the least loaded CPU.  Slightly favor the current CPU by
- * setting its runqueue length as the minimum to start.
+ * setting its load as the minimum to start.
  */
-static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain)
+static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
 {
 	cpumask_t tmp;
 	int i, min_load, this_cpu, best_cpu;
@@ -1156,7 +1163,7 @@ static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain)
 	best_cpu = this_cpu = task_cpu(p);
 	min_load = INT_MAX;
 
-	cpus_and(tmp, domain->span, cpu_online_map);
+	cpus_and(tmp, sd->span, cpu_online_map);
 	for_each_cpu_mask(i, tmp) {
 		unsigned long load;
 		if (i == this_cpu)
@@ -1168,30 +1175,42 @@ static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain)
 			best_cpu = i;
 			min_load = load;
 		}
-
 	}
 	return best_cpu;
 }
 
+/*
+ * sched_balance_exec(): find the highest-level, exec-balance-capable
+ * domain and try to migrate the task to the least loaded CPU.
+ *
+ * execve() is a valuable balancing opportunity, because at this point
+ * the task has the smallest effective memory and cache footprint.
+ */
 void sched_balance_exec(void)
 {
-	struct sched_domain *domain = this_sched_domain();
+	struct sched_domain *sd, *best_sd = NULL;
 	int new_cpu;
-	int this_cpu = smp_processor_id();
-	if (numnodes == 1)
-		return;
+	int this_cpu = get_cpu();
 
+	/* Prefer the current CPU if there's only this task running */
 	if (this_rq()->nr_running <= 1)
-		return;
+		goto out;
 
-	while (domain->parent && !(domain->flags & SD_FLAG_EXEC))
-		domain = domain->parent;
+	for_each_domain(this_cpu, sd) {
+		if (sd->flags & SD_BALANCE_EXEC)
+			best_sd = sd;
+	}
 
-	if (domain->flags & SD_FLAG_EXEC) {
-		new_cpu = sched_best_cpu(current, domain);
-		if (new_cpu != this_cpu)
+	if (best_sd) {
+		new_cpu = sched_best_cpu(current, best_sd);
+		if (new_cpu != this_cpu) {
+			put_cpu();
 			sched_migrate_task(current, new_cpu);
+			return;
+		}
 	}
+out:
+	put_cpu();
 }
 #endif /* CONFIG_NUMA */
 
@@ -1214,14 +1233,14 @@ static inline void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
-static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array,
-		task_t *p, runqueue_t *this_rq, prio_array_t *this_array,
-		int this_cpu)
+static inline
+void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
+	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	nr_running_dec(src_rq);
+	src_rq->nr_running--;
 	set_task_cpu(p, this_cpu);
-	nr_running_inc(this_rq);
+	this_rq->nr_running++;
 	enqueue_task(p, this_array);
 	p->timestamp = sched_clock() -
 				(src_rq->timestamp_last_tick - p->timestamp);
@@ -1238,7 +1257,7 @@ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array,
  */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-		struct sched_domain *domain, enum idle_type idle)
+		struct sched_domain *sd, enum idle_type idle)
 {
 	/*
 	 * We do not migrate tasks that are:
@@ -1253,9 +1272,9 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 
 	/* Aggressive migration if we've failed balancing */
 	if (idle == NEWLY_IDLE ||
-			domain->nr_balance_failed < domain->cache_nice_tries) {
+			sd->nr_balance_failed < sd->cache_nice_tries) {
 		if ((rq->timestamp_last_tick - p->timestamp)
-						< domain->cache_hot_time)
+						< sd->cache_hot_time)
 			return 0;
 	}
 
@@ -1270,7 +1289,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
  * Called with both runqueues locked.
 */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-			unsigned long max_nr_move, struct sched_domain *domain,
+			unsigned long max_nr_move, struct sched_domain *sd,
 			enum idle_type idle)
 {
 	int idx;
@@ -1305,7 +1324,7 @@ skip_bitmap:
 	else
 		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
 	if (idx >= MAX_PRIO) {
-		if (array == busiest->expired) {
+		if (array == busiest->expired && busiest->active->nr_active) {
 			array = busiest->active;
 			dst_array = this_rq->active;
 			goto new_array;
@@ -1320,7 +1339,7 @@ skip_queue:
 
 	curr = curr->prev;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, domain, idle)) {
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1346,20 +1365,16 @@ out:
 * moved to restore balance via the imbalance parameter.
 */
 static struct sched_group *
-find_busiest_group(struct sched_domain *domain, int this_cpu,
-		unsigned long *imbalance, enum idle_type idle)
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+		unsigned long *imbalance, enum idle_type idle)
 {
-	unsigned long max_load, avg_load, total_load, this_load;
-	unsigned int total_pwr;
-	struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups;
+	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 
-	max_load = 0;
-	this_load = 0;
-	total_load = 0;
-	total_pwr = 0;
+	if (unlikely(!group))
+		return NULL;
 
-	if (group == NULL)
-		goto out_balanced;
+	max_load = this_load = total_load = total_pwr = 0;
 
 	do {
 		cpumask_t tmp;
@@ -1372,6 +1387,11 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
 		/* Tally up the load of all CPUs in the group */
 		avg_load = 0;
 		cpus_and(tmp, group->cpumask, cpu_online_map);
+		if (unlikely(cpus_empty(tmp))) {
+			WARN_ON(1);
+			return NULL;
+		}
+
 		for_each_cpu_mask(i, tmp) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group) {
@@ -1390,7 +1410,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
 		total_pwr += group->cpu_power;
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power;
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -1403,7 +1423,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
 		}
 nextgroup:
 		group = group->next;
-	} while (group != domain->groups);
+	} while (group != sd->groups);
 
 	if (!busiest || this_load >= max_load)
 		goto out_balanced;
@@ -1412,7 +1432,7 @@ nextgroup:
 
 	if (idle == NOT_IDLE) {
 		if (this_load >= avg_load ||
-				100*max_load <= domain->imbalance_pct*this_load)
+				100*max_load <= sd->imbalance_pct*this_load)
 			goto out_balanced;
 	}
 
@@ -1441,7 +1461,7 @@ nextgroup:
 
 		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
 		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
-		pwr_now >>= SCHED_LOAD_SHIFT;
+		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
 		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
@@ -1452,7 +1472,7 @@ nextgroup:
 		/* Amount of load we'd add */
 		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
 		pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp);
-		pwr_move >>= SCHED_LOAD_SHIFT;
+		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain another 8th of a CPU worth of throughput */
 		if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
@@ -1463,9 +1483,9 @@ nextgroup:
 
 	/* How many tasks to actually move to equalise the imbalance */
 	*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
-				>> SCHED_LOAD_SHIFT;
+				/ SCHED_LOAD_SCALE;
 	/* Get rid of the scaling factor, rounding *up* as we divide */
-	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT;
+	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2) / SCHED_LOAD_SCALE;
 
 	return busiest;
 
@@ -1485,14 +1505,12 @@ out_balanced:
 static runqueue_t *find_busiest_queue(struct sched_group *group)
 {
 	cpumask_t tmp;
-	int i;
-	unsigned long max_load = 0;
+	unsigned long load, max_load = 0;
 	runqueue_t *busiest = NULL;
+	int i;
 
 	cpus_and(tmp, group->cpumask, cpu_online_map);
 	for_each_cpu_mask(i, tmp) {
-		unsigned long load;
-
 		load = get_low_cpu_load(i);
 
 		if (load > max_load) {
@@ -1511,68 +1529,72 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
 * Called with this_rq unlocked.
 */
 static int load_balance(int this_cpu, runqueue_t *this_rq,
-			struct sched_domain *domain, enum idle_type idle)
+			struct sched_domain *sd, enum idle_type idle)
 {
 	struct sched_group *group;
-	runqueue_t *busiest = NULL;
+	runqueue_t *busiest;
 	unsigned long imbalance;
-	int balanced = 0, failed = 0;
-	int nr_moved = 0;
+	int nr_moved;
 
 	spin_lock(&this_rq->lock);
 
-	group = find_busiest_group(domain, this_cpu, &imbalance, idle);
-	if (!group) {
-		balanced = 1;
-		goto out;
-	}
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+	if (!group)
+		goto out_balanced;
 
 	busiest = find_busiest_queue(group);
-	if (!busiest || busiest == this_rq) {
-		balanced = 1;
-		goto out;
+	if (!busiest)
+		goto out_balanced;
+	if (unlikely(busiest == this_rq)) {
+		WARN_ON(1);
+		goto out_balanced;
 	}
 
 	/* Attempt to move tasks */
 	double_lock_balance(this_rq, busiest);
 
-	nr_moved = move_tasks(this_rq, this_cpu, busiest,
-					imbalance, domain, idle);
-	spin_unlock(&busiest->lock);
-out:
+	nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle);
 	spin_unlock(&this_rq->lock);
+	spin_unlock(&busiest->lock);
 
-	if (!balanced && nr_moved == 0)
-		failed = 1;
+	if (!nr_moved) {
+		sd->nr_balance_failed++;
 
-	if (failed && busiest &&
-			domain->nr_balance_failed > domain->cache_nice_tries) {
-		int wake = 0;
+		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+			int wake = 0;
 
-		spin_lock(&busiest->lock);
-		if (!busiest->active_balance) {
-			busiest->active_balance = 1;
-			busiest->push_cpu = this_cpu;
-			wake = 1;
-		}
-		spin_unlock(&busiest->lock);
-		if (wake)
-			wake_up_process(busiest->migration_thread);
-	}
+			spin_lock(&busiest->lock);
+			if (!busiest->active_balance) {
+				busiest->active_balance = 1;
+				busiest->push_cpu = this_cpu;
+				wake = 1;
+			}
+			spin_unlock(&busiest->lock);
+			if (wake)
+				wake_up_process(busiest->migration_thread);
 
-	if (failed)
-		domain->nr_balance_failed++;
-	else
-		domain->nr_balance_failed = 0;
+			/*
+			 * We've kicked active balancing, reset the failure
+			 * counter.
+			 */
+			sd->nr_balance_failed = sd->cache_nice_tries;
+		}
+	} else
+		sd->nr_balance_failed = 0;
 
-	if (balanced) {
-		if (domain->balance_interval < domain->max_interval)
-			domain->balance_interval *= 2;
-	} else {
-		domain->balance_interval = domain->min_interval;
-	}
+	/* We were unbalanced, so reset the balancing interval */
+	sd->balance_interval = sd->min_interval;
 
 	return nr_moved;
+
+out_balanced:
+	spin_unlock(&this_rq->lock);
+
+	/* tune up the balancing interval */
+	if (sd->balance_interval < sd->max_interval)
+		sd->balance_interval *= 2;
+
+	return 0;
 }
 
 /*
@@ -1583,14 +1605,14 @@ out:
 * this_rq is locked.
 */
 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
-			struct sched_domain *domain)
+			struct sched_domain *sd)
 {
 	struct sched_group *group;
 	runqueue_t *busiest = NULL;
 	unsigned long imbalance;
 	int nr_moved = 0;
 
-	group = find_busiest_group(domain, this_cpu, &imbalance, NEWLY_IDLE);
+	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
 	if (!group)
 		goto out;
 
@@ -1602,7 +1624,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	double_lock_balance(this_rq, busiest);
 
 	nr_moved = move_tasks(this_rq, this_cpu, busiest,
-					imbalance, domain, NEWLY_IDLE);
+					imbalance, sd, NEWLY_IDLE);
 
 	spin_unlock(&busiest->lock);
 
@@ -1616,25 +1638,22 @@ out:
 */
 static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 {
-	struct sched_domain *domain = this_sched_domain();
+	struct sched_domain *sd;
 
 	if (unlikely(cpu_is_offline(this_cpu)))
 		return;
 
-	do {
-		if (unlikely(!domain->groups))
-			/* hasn't been setup yet */
-			break;
+	for_each_domain(this_cpu, sd) {
+		if (unlikely(!sd->groups))
+			return;
 
-		if (domain->flags & SD_FLAG_NEWIDLE) {
-			if (load_balance_newidle(this_cpu, this_rq, domain)) {
+		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			if (load_balance_newidle(this_cpu, this_rq, sd)) {
 				/* We've pulled tasks over so stop searching */
 				break;
 			}
 		}
-
-		domain = domain->parent;
-	} while (domain);
+	}
 }
 
 /*
@@ -1647,36 +1666,26 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 */
 static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
 {
-	int i;
-	struct sched_domain *sd = cpu_sched_domain(busiest_cpu);
+	struct sched_domain *sd;
 	struct sched_group *group, *busy_group;
+	int i;
 
 	if (busiest->nr_running <= 1)
 		return;
 
-	/* sd->parent should never cause a NULL dereference, if it did so,
-	 * then push_cpu was set to a buggy value */
-	while (!cpu_isset(busiest->push_cpu, sd->span)) {
-		sd = sd->parent;
-		if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
-			WARN_ON(1);
-			return;
-		}
+	for_each_domain(busiest_cpu, sd) {
+		if (cpu_isset(busiest->push_cpu, sd->span))
+			break;
 	}
 
-	if (!sd->groups) {
+	if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
 		WARN_ON(1);
 		return;
 	}
 
 	group = sd->groups;
-	while (!cpu_isset(busiest_cpu, group->cpumask)) {
+	while (!cpu_isset(busiest_cpu, group->cpumask))
 		group = group->next;
-		if (group == sd->groups) {
-			WARN_ON(1);
-			return;
-		}
-	}
 	busy_group = group;
 
 	group = sd->groups;
@@ -1719,59 +1728,60 @@ next_group:
 /* Don't have all balancing operations going off at once */
 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
 
-static void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle)
+static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
+			   enum idle_type idle)
 {
 	unsigned long old_load, this_load;
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
-	struct sched_domain *domain = this_sched_domain();
+	struct sched_domain *sd;
 
 	if (unlikely(cpu_is_offline(this_cpu)))
 		return;
 
 	/* Update our load */
 	old_load = this_rq->cpu_load;
-	this_load = this_rq->nr_running << SCHED_LOAD_SHIFT;
+	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
 	this_rq->cpu_load = (old_load + this_load) / 2;
 
-	/* Run through all this CPU's domains */
-	do {
-		unsigned long interval;
+	for_each_domain(this_cpu, sd) {
+		unsigned long interval = sd->balance_interval;
 
-		if (unlikely(!domain->groups))
-			break;
+		if (unlikely(!sd->groups))
+			return;
 
-		interval = domain->balance_interval;
 		if (idle != IDLE)
-			interval *= domain->busy_factor;
+			interval *= sd->busy_factor;
 
 		/* scale ms to jiffies */
-		interval = interval * HZ / 1000;
+		interval = MSEC_TO_JIFFIES(interval);
 		if (unlikely(interval == 0))
 			interval = 1;
 
-		if (j - domain->last_balance >= interval) {
-			if (load_balance(this_cpu, this_rq, domain, idle)) {
+		if (j - sd->last_balance >= interval) {
+			if (load_balance(this_cpu, this_rq, sd, idle)) {
 				/* We've pulled tasks over so no longer idle */
 				idle = NOT_IDLE;
 			}
-			domain->last_balance += interval;
+			sd->last_balance += interval;
 		}
-
-		domain = domain->parent;
-	} while (domain);
+	}
 }
 #else
 /*
 * on UP we do not need to balance between CPUs:
 */
-static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle)
+static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
+{
+}
+static inline void idle_balance(int cpu, runqueue_t *rq)
 {
 }
 #endif
 
-#ifdef CONFIG_SCHED_SMT
 static inline int wake_priority_sleeper(runqueue_t *rq)
-{	/*
+{
+#ifdef CONFIG_SCHED_SMT
+	/*
 	 * If an SMT sibling task has been put to sleep for priority
 	 * reasons reschedule the idle task to see if it can now run.
 	 */
@@ -1779,14 +1789,9 @@ static inline int wake_priority_sleeper(runqueue_t *rq)
 		resched_task(rq->idle);
 		return 1;
 	}
+#endif
 	return 0;
 }
-#else
-static inline int wake_priority_sleeper(runqueue_t *rq)
-{
-	return 0;
-}
-#endif
 
 DEFINE_PER_CPU(struct kernel_stat, kstat);
 
@@ -1937,10 +1942,8 @@ static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
 	struct sched_domain *sd = cpu_sched_domain(cpu);
 	cpumask_t sibling_map;
 
-	if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) {
-		/* Not SMT */
+	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return;
-	}
 
 	cpus_and(sibling_map, sd->span, cpu_online_map);
 	cpu_clear(cpu, sibling_map);
@@ -1960,14 +1963,12 @@ static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
 
 static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
 {
-	int ret = 0, i;
 	struct sched_domain *sd = cpu_sched_domain(cpu);
 	cpumask_t sibling_map;
+	int ret = 0, i;
 
-	if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) {
-		/* Not SMT */
+	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return 0;
-	}
 
 	cpus_and(sibling_map, sd->span, cpu_online_map);
 	cpu_clear(cpu, sibling_map);
@@ -1989,7 +1990,7 @@ static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
 		if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
 			task_timeslice(p) || rt_task(smt_curr)) &&
 			p->mm && smt_curr->mm && !rt_task(p))
-				ret |= 1;
+				ret = 1;
 
 		/*
 		 * Reschedule a lower priority task on the SMT sibling,
@@ -2079,9 +2080,7 @@ need_resched:
 	cpu = smp_processor_id();
 	if (unlikely(!rq->nr_running)) {
-#ifdef CONFIG_SMP
 		idle_balance(cpu, rq);
-#endif
 		if (!rq->nr_running) {
 			next = rq->idle;
 			rq->expired_timestamp = 0;
@@ -2627,7 +2626,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
-		} else if (p->prio < rq->curr->prio)
+		} else if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
 	}
 
@@ -3406,24 +3405,24 @@ static void __init arch_init_sched_domains(void)
 	for_each_cpu(i) {
 		int node = cpu_to_node(i);
 		cpumask_t nodemask = node_to_cpumask(node);
-		struct sched_domain *node_domain = &per_cpu(node_domains, i);
-		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *node_sd = &per_cpu(node_domains, i);
+		struct sched_domain *cpu_sd = cpu_sched_domain(i);
 
-		*node_domain = SD_NODE_INIT;
-		node_domain->span = cpu_possible_map;
+		*node_sd = SD_NODE_INIT;
+		node_sd->span = cpu_possible_map;
 
-		*cpu_domain = SD_CPU_INIT;
-		cpus_and(cpu_domain->span, nodemask, cpu_possible_map);
-		cpu_domain->parent = node_domain;
+		*cpu_sd = SD_CPU_INIT;
+		cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
+		cpu_sd->parent = node_sd;
 	}
 
 	/* Set up groups */
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		struct sched_group *first_cpu = NULL, *last_cpu = NULL;
-		int j;
+		cpumask_t tmp = node_to_cpumask(i);
 		cpumask_t nodemask;
+		struct sched_group *first_cpu = NULL, *last_cpu = NULL;
 		struct sched_group *node = &sched_group_nodes[i];
-		cpumask_t tmp = node_to_cpumask(i);
+		int j;
 
 		cpus_and(nodemask, tmp, cpu_possible_map);
 
@@ -3458,14 +3457,14 @@ static void __init arch_init_sched_domains(void)
 	mb();
 	for_each_cpu(i) {
-		struct sched_domain *node_domain = &per_cpu(node_domains, i);
-		struct sched_domain *cpu_domain = cpu_sched_domain(i);
-		node_domain->groups = &sched_group_nodes[cpu_to_node(i)];
-		cpu_domain->groups = &sched_group_cpus[i];
+		struct sched_domain *node_sd = &per_cpu(node_domains, i);
+		struct sched_domain *cpu_sd = cpu_sched_domain(i);
+		node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
+		cpu_sd->groups = &sched_group_cpus[i];
 	}
 }
 
-#else /* CONFIG_NUMA */
+#else /* !CONFIG_NUMA */
 static void __init arch_init_sched_domains(void)
 {
 	int i;
@@ -3473,10 +3472,10 @@ static void __init arch_init_sched_domains(void)
 
 	/* Set up domains */
 	for_each_cpu(i) {
-		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *cpu_sd = cpu_sched_domain(i);
 
-		*cpu_domain = SD_CPU_INIT;
-		cpu_domain->span = cpu_possible_map;
+		*cpu_sd = SD_CPU_INIT;
+		cpu_sd->span = cpu_possible_map;
 	}
 
 	/* Set up CPU groups */
@@ -3497,15 +3496,15 @@ static void __init arch_init_sched_domains(void)
 	mb();
 	for_each_cpu(i) {
-		struct sched_domain *cpu_domain = cpu_sched_domain(i);
-		cpu_domain->groups = &sched_group_cpus[i];
+		struct sched_domain *cpu_sd = cpu_sched_domain(i);
+		cpu_sd->groups = &sched_group_cpus[i];
 	}
 }
 #endif /* CONFIG_NUMA */
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 
-#undef SCHED_DOMAIN_DEBUG
+#define SCHED_DOMAIN_DEBUG
 #ifdef SCHED_DOMAIN_DEBUG
 void sched_domain_debug(void)
 {
@@ -3513,7 +3512,7 @@ void sched_domain_debug(void)
 	for_each_cpu(i) {
 		int level = 0;
-		struct sched_domain *cpu_domain = cpu_sched_domain(i);
+		struct sched_domain *cpu_sd = cpu_sched_domain(i);
 
 		printk(KERN_DEBUG "CPU%d: %s\n",
 				i, (cpu_online(i) ? " online" : "offline"));
@@ -3521,10 +3520,10 @@ void sched_domain_debug(void)
 		do {
 			int j;
 			char str[NR_CPUS];
-			struct sched_group *group = cpu_domain->groups;
+			struct sched_group *group = cpu_sd->groups;
 			cpumask_t groupmask, tmp;
 
-			cpumask_snprintf(str, NR_CPUS, cpu_domain->span);
+			cpumask_scnprintf(str, NR_CPUS, cpu_sd->span);
 			cpus_clear(groupmask);
 
 			printk(KERN_DEBUG);
@@ -3532,10 +3531,12 @@ void sched_domain_debug(void)
 				printk(" ");
 			printk("domain %d: span %s\n", level, str);
 
-			if (!cpu_isset(i, cpu_domain->span))
+			if (!cpu_isset(i, cpu_sd->span))
 				printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
 			if (!cpu_isset(i, group->cpumask))
 				printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
+			if (!group->cpu_power)
+				printk(KERN_DEBUG "ERROR domain->cpu_power not set\n");
 
 			printk(KERN_DEBUG);
 			for (j = 0; j < level + 2; j++)
@@ -3556,26 +3557,26 @@ void sched_domain_debug(void)
 
 				cpus_or(groupmask, groupmask, group->cpumask);
 
-				cpumask_snprintf(str, NR_CPUS, group->cpumask);
+				cpumask_scnprintf(str, NR_CPUS, group->cpumask);
 				printk(" %s", str);
 
 				group = group->next;
-			} while (group != cpu_domain->groups);
+			} while (group != cpu_sd->groups);
 			printk("\n");
 
-			if (!cpus_equal(cpu_domain->span, groupmask))
+			if (!cpus_equal(cpu_sd->span, groupmask))
 				printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
 
 			level++;
-			cpu_domain = cpu_domain->parent;
+			cpu_sd = cpu_sd->parent;
 
-			if (cpu_domain) {
-				cpus_and(tmp, groupmask, cpu_domain->span);
+			if (cpu_sd) {
+				cpus_and(tmp, groupmask, cpu_sd->span);
 				if (!cpus_equal(tmp, groupmask))
 					printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
 			}
-		} while (cpu_domain);
+		} while (cpu_sd);
 	}
 }
 #else
@@ -3635,8 +3636,6 @@ void __init sched_init(void)
 	set_task_cpu(current, smp_processor_id());
 	wake_up_forked_process(current);
 
-	init_timers();
-
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */