Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--    kernel/sched/fair.c    538
1 file changed, 308 insertions, 230 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03be0d1330a6b2..4214df32ba4534 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
-int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
- int _shift = 0;
-
- if (kstrtoint(str, 0, &_shift))
- pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
-
- sched_thermal_decay_shift = clamp(_shift, 0, 10);
+ pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
@@ -157,7 +151,6 @@ static struct ctl_table sched_fair_sysctls[] = {
.extra1 = SYSCTL_ZERO,
},
#endif /* CONFIG_NUMA_BALANCING */
- {}
};
static int __init sched_fair_sysctl_init(void)
@@ -388,8 +381,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
/*
* With cfs_rq being unthrottled/throttled during an enqueue,
- * it can happen the tmp_alone_branch points the a leaf that
- * we finally want to del. In this case, tmp_alone_branch moves
+ * it can happen the tmp_alone_branch points to the leaf that
+ * we finally want to delete. In this case, tmp_alone_branch moves
* to the prev element but it will point to rq->leaf_cfs_rq_list
* at the end of the enqueue.
*/
@@ -406,7 +399,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq)
SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
+/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
leaf_cfs_rq_list)
@@ -595,13 +588,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* [[ NOTE: this is only equal to the ideal scheduler under the condition
* that join/leave operations happen at lag_i = 0, otherwise the
- * virtual time has non-continguous motion equivalent to:
+ * virtual time has non-contiguous motion equivalent to:
*
* V +-= lag_i / W
*
* Also see the comment in place_entity() that deals with this. ]]
*
- * However, since v_i is u64, and the multiplcation could easily overflow
+ * However, since v_i is u64, and the multiplication could easily overflow
* transform it into a relative form that uses smaller quantities:
*
* Substitute: v_i == (v_i - v0) + v0
@@ -671,7 +664,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
}
if (load) {
- /* sign flips effective floor / ceil */
+ /* sign flips effective floor / ceiling */
if (avg < 0)
avg -= (load - 1);
avg = div_s64(avg, load);
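
[Illustration, not part of the diff] The "sign flips effective floor / ceiling" comment refers to the fact that C integer division truncates toward zero, so a negative weighted sum must be biased by (load - 1) before dividing to get a mathematical floor. A minimal userspace sketch of that trick, with plain division standing in for div_s64() and made-up values:

#include <stdio.h>
#include <stdint.h>

/* Round a signed sum down (toward -infinity) when dividing by a positive
 * load, mirroring the "avg -= (load - 1)" adjustment above. */
static int64_t div_floor(int64_t avg, int64_t load)
{
	if (avg < 0)
		avg -= (load - 1);
	return avg / load;			/* truncation now acts as floor() */
}

int main(void)
{
	printf("%lld\n", (long long)(-7 / 4));		/* -1: plain truncation */
	printf("%lld\n", (long long)div_floor(-7, 4));	/* -2: floor            */
	printf("%lld\n", (long long)div_floor(7, 4));	/*  1: unchanged        */
	return 0;
}
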
@@ -696,15 +689,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(u64 avruntime, struct sched_entity *se)
{
- s64 lag, limit;
+ s64 vlag, limit;
+ vlag = avruntime - se->vruntime;
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+ return clamp(vlag, -limit, limit);
+}
+
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
SCHED_WARN_ON(!se->on_rq);
- lag = avg_vruntime(cfs_rq) - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
- se->vlag = clamp(lag, -limit, limit);
+ se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
}
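
[Illustration, not part of the diff] entity_lag() is split out above so that update_entity_lag() and the reweight path can share the same clamping: the raw lag (avruntime - vruntime) is limited to a window derived from the entity's slice. A userspace restatement with explicit parameters; calc_delta_fair() is stubbed out (in the kernel it scales the window by the entity's inverse weight) and HZ=250 is assumed for TICK_NSEC:

#include <stdint.h>

#define TICK_NSEC	(1000000000ULL / 250)	/* assuming HZ=250 */

static int64_t clamp_s64(int64_t v, int64_t lo, int64_t hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Stub: the kernel additionally scales this by the entity's inverse weight. */
static int64_t calc_delta_fair_stub(uint64_t delta)
{
	return (int64_t)delta;
}

static int64_t entity_lag_demo(uint64_t avruntime, uint64_t vruntime, uint64_t slice)
{
	uint64_t window = (2 * slice > TICK_NSEC) ? 2 * slice : TICK_NSEC;
	int64_t vlag  = (int64_t)(avruntime - vruntime);
	int64_t limit = calc_delta_fair_stub(window);

	return clamp_s64(vlag, -limit, limit);
}
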
/*
@@ -721,7 +720,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
*
- * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
* to the loss in precision caused by the division.
*/
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -1024,7 +1023,7 @@ void init_entity_runnable_average(struct sched_entity *se)
if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
- /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
+ /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
/*
@@ -1616,7 +1615,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
- * which should be ok given the number of nodes rarely exceeds 8.
+ * which should be OK given the number of nodes rarely exceeds 8.
*/
for_each_online_node(node) {
unsigned long faults;
@@ -3290,7 +3289,7 @@ retry_pids:
/*
* Shared library pages mapped by multiple processes are not
* migrated as it is expected they are cache replicated. Avoid
- * hinting faults in read-only file-backed mappings or the vdso
+ * hinting faults in read-only file-backed mappings or the vDSO
* as migrating the pages will be of marginal benefit.
*/
if (!vma->vm_mm ||
@@ -3301,7 +3300,7 @@ retry_pids:
/*
* Skip inaccessible VMAs to avoid any confusion between
- * PROT_NONE and NUMA hinting ptes
+ * PROT_NONE and NUMA hinting PTEs
*/
if (!vma_is_accessible(vma)) {
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
@@ -3333,7 +3332,7 @@ retry_pids:
}
/*
- * Scanning the VMA's of short lived tasks add more overhead. So
+	 * Scanning the VMAs of short-lived tasks adds more overhead. So
* delay the scan for new VMAs.
*/
if (mm->numa_scan_seq && time_before(jiffies,
@@ -3377,7 +3376,7 @@ retry_pids:
/*
* Try to scan sysctl_numa_balancing_size worth of
* hpages that have at least one present PTE that
- * is not already pte-numa. If the VMA contains
+ * is not already PTE-numa. If the VMA contains
* areas that are unused or already full of prot_numa
* PTEs, scan up to virtpages, to skip through those
* areas faster.
@@ -3676,16 +3675,15 @@ static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
-static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
unsigned long weight)
{
unsigned long old_weight = se->load.weight;
- u64 avruntime = avg_vruntime(cfs_rq);
s64 vlag, vslice;
/*
* VRUNTIME
- * ========
+ * --------
*
* COROLLARY #1: The virtual runtime of the entity needs to be
* adjusted if re-weight at !0-lag point.
@@ -3761,14 +3759,14 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
* = V - vl'
*/
if (avruntime != se->vruntime) {
- vlag = (s64)(avruntime - se->vruntime);
+ vlag = entity_lag(avruntime, se);
vlag = div_s64(vlag * old_weight, weight);
se->vruntime = avruntime - vlag;
}
/*
* DEADLINE
- * ========
+ * --------
*
* When the weight changes, the virtual time slope changes and
* we should adjust the relative virtual deadline accordingly.
@@ -3787,25 +3785,26 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
+ u64 avruntime;
if (se->on_rq) {
/* commit outstanding execution time */
- if (curr)
- update_curr(cfs_rq);
- else
+ update_curr(cfs_rq);
+ avruntime = avg_vruntime(cfs_rq);
+ if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
- if (!se->on_rq) {
+ if (se->on_rq) {
+ reweight_eevdf(se, avruntime, weight);
+ } else {
/*
* Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
* we need to scale se->vlag when w_i changes.
*/
se->vlag = div_s64(se->vlag * se->load.weight, weight);
- } else {
- reweight_eevdf(cfs_rq, se, weight);
}
update_load_set(&se->load, weight);
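
[Illustration, not part of the diff] The comment above relies on the invariant lag_i = w_i * (V - v_i), while se->vlag stores only (V - v_i). A small sketch of the two rescaling paths taken by reweight_entity(), with plain 64-bit division standing in for div_s64() and hypothetical weights:

#include <stdint.h>

/* Entity is dequeued: keep lag_i = w_i * vlag constant across the weight
 * change by rescaling the stored virtual lag. */
static int64_t rescale_sleeper_vlag(int64_t vlag, uint64_t old_w, uint64_t new_w)
{
	return vlag * (int64_t)old_w / (int64_t)new_w;
}

/* Entity is on the runqueue: keep the lag constant by moving vruntime
 * relative to the average V instead (the reweight_eevdf() path). */
static uint64_t reweighted_vruntime(uint64_t avruntime, int64_t vlag,
				    uint64_t old_w, uint64_t new_w)
{
	int64_t scaled = vlag * (int64_t)old_w / (int64_t)new_w;

	return avruntime - scaled;
}
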
@@ -4739,7 +4738,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/*
* Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
+ * track group sched_entity load average for task_h_load calculation in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
@@ -4822,7 +4821,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
@@ -4965,13 +4964,22 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
+static inline unsigned long get_actual_cpu_capacity(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+ capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+
+ return capacity;
+}
+
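
[Illustration, not part of the diff] get_actual_cpu_capacity() replaces the open-coded "capacity_orig minus thermal pressure" pattern used in several places below: the original capacity is reduced by the larger of the HW (thermal) pressure and the cpufreq-imposed pressure, since both describe a cap on the highest reachable operating point rather than independent losses. A userspace sketch with made-up per-CPU values:

#include <stdint.h>

#define NR_CPUS_DEMO	4

/* Made-up inputs: original capacity, HW (thermal) pressure, cpufreq pressure. */
static const uint64_t capacity_orig[NR_CPUS_DEMO]    = { 1024, 1024, 512, 512 };
static const uint64_t hw_pressure[NR_CPUS_DEMO]      = {  200,    0,   0,  50 };
static const uint64_t cpufreq_pressure[NR_CPUS_DEMO] = {  100,  300,   0,   0 };

static uint64_t get_actual_cpu_capacity_demo(int cpu)
{
	uint64_t p = hw_pressure[cpu] > cpufreq_pressure[cpu] ?
		     hw_pressure[cpu] : cpufreq_pressure[cpu];

	/* Only the larger of the two pressures is subtracted. */
	return capacity_orig[cpu] - p;
}
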
static inline int util_fits_cpu(unsigned long util,
unsigned long uclamp_min,
unsigned long uclamp_max,
int cpu)
{
- unsigned long capacity_orig, capacity_orig_thermal;
unsigned long capacity = capacity_of(cpu);
+ unsigned long capacity_orig;
bool fits, uclamp_max_fits;
/*
@@ -4993,7 +5001,7 @@ static inline int util_fits_cpu(unsigned long util,
* Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
* should fit a little cpu even if there's some pressure.
*
- * Only exception is for thermal pressure since it has a direct impact
+ * Only exception is for HW or cpufreq pressure since it has a direct impact
* on available OPP of the system.
*
* We honour it for uclamp_min only as a drop in performance level
@@ -5003,7 +5011,6 @@ static inline int util_fits_cpu(unsigned long util,
* goal is to cap the task. So it's okay if it's getting less.
*/
capacity_orig = arch_scale_cpu_capacity(cpu);
- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
* We want to force a task to fit a cpu as implied by uclamp_max.
@@ -5020,14 +5027,14 @@ static inline int util_fits_cpu(unsigned long util,
* | | | | | | |
* | | | | | | |
* +----------------------------------------
- * cpu0 cpu1 cpu2
+ * CPU0 CPU1 CPU2
*
* In the above example if a task is capped to a specific performance
* point, y, then when:
*
- * * util = 80% of x then it does not fit on cpu0 and should migrate
- * to cpu1
- * * util = 80% of y then it is forced to fit on cpu1 to honour
+ * * util = 80% of x then it does not fit on CPU0 and should migrate
+ * to CPU1
+ * * util = 80% of y then it is forced to fit on CPU1 to honour
* uclamp_max request.
*
* which is what we're enforcing here. A task always fits if
@@ -5058,7 +5065,7 @@ static inline int util_fits_cpu(unsigned long util,
* | | | | | | |
* | | | | | | | (region c, boosted, util < uclamp_min)
* +----------------------------------------
- * cpu0 cpu1 cpu2
+ * CPU0 CPU1 CPU2
*
* a) If util > uclamp_max, then we're capped, we don't care about
* actual fitness value here. We only care if uclamp_max fits
@@ -5078,7 +5085,8 @@ static inline int util_fits_cpu(unsigned long util,
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
- if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ if (fits && (util < uclamp_min) &&
+ (uclamp_min > get_actual_cpu_capacity(cpu)))
return -1;
return fits;
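
[Illustration, not part of the diff] The return value of util_fits_cpu() is effectively three-valued: 1 (fits), 0 (does not fit) and -1 (fits by capacity, but the uclamp_min performance request cannot be met on this CPU under the current pressure). A compact userspace model of that last decision, with a hypothetical helper name:

#include <stdbool.h>

static int fits_result(bool fits, unsigned long util,
		       unsigned long uclamp_min, unsigned long uclamp_max,
		       unsigned long actual_capacity)
{
	/* An inverted request falls back to uclamp_max, as in the kernel. */
	if (uclamp_min > uclamp_max)
		uclamp_min = uclamp_max;

	/* -1: fits, but the requested minimum performance is unreachable. */
	if (fits && util < uclamp_min && uclamp_min > actual_capacity)
		return -1;

	return fits;
}
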
@@ -5098,15 +5106,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
+ int cpu = cpu_of(rq);
+
if (!sched_asym_cpucap_active())
return;
- if (!p || p->nr_cpus_allowed == 1) {
- rq->misfit_task_load = 0;
- return;
- }
+ /*
+	 * Affinity allows us to go somewhere higher? Or are we on the biggest
+	 * available CPU already? Or do we fit into this CPU?
+ */
+ if (!p || (p->nr_cpus_allowed == 1) ||
+ (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ task_fits_cpu(p, cpu)) {
- if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -5142,7 +5154,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -5248,7 +5260,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->vruntime = vruntime - lag;
/*
- * When joining the competition; the exisiting tasks will be,
+ * When joining the competition; the existing tasks will be,
* on average, halfway through their slice, as such start tasks
* off with half a slice to ease into the competition.
*/
@@ -5397,7 +5409,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Now advance min_vruntime if @se was the entity holding it back,
* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
* put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- ie. we'll be penalized.
+ * further than we started -- i.e. we'll be penalized.
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
@@ -5433,7 +5445,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* Track our maximum slice length, if the CPU's load is at
- * least twice that of our own weight (i.e. dont track it
+ * least twice that of our own weight (i.e. don't track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() &&
@@ -6669,22 +6681,47 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
- unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
- unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ unsigned long rq_util_min, rq_util_max;
+
+ if (!sched_energy_enabled())
+ return false;
+
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
/* Return true only if the utilization doesn't fit CPU's capacity */
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
-static inline void update_overutilized_status(struct rq *rq)
+/*
+ * The overutilized value makes sense only if EAS is enabled
+ */
+static inline bool is_rd_overutilized(struct root_domain *rd)
{
- if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
- WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
- }
+ return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
+}
+
+static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
+{
+ if (!sched_energy_enabled())
+ return;
+
+ WRITE_ONCE(rd->overutilized, flag);
+ trace_sched_overutilized_tp(rd, flag);
+}
+
+static inline void check_update_overutilized_status(struct rq *rq)
+{
+ /*
+ * overutilized field is used for load balancing decisions only
+ * if energy aware scheduler is being used
+ */
+
+ if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
+ set_rd_overutilized(rq->rd, 1);
}
#else
-static inline void update_overutilized_status(struct rq *rq) { }
+static inline void check_update_overutilized_status(struct rq *rq) { }
#endif
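
[Illustration, not part of the diff] The new helpers concentrate the "overutilized only means something under EAS" rule in one place: readers see a non-EAS root domain as overutilized, so the energy-aware fast paths are never taken, and writers become no-ops. A userspace model of that contract:

#include <stdbool.h>

struct root_domain_demo {
	bool overutilized;
};

static bool sched_energy_enabled_demo;		/* stand-in for the static key */

static bool is_rd_overutilized_demo(const struct root_domain_demo *rd)
{
	/* Without EAS the flag is meaningless: report "overutilized" so
	 * callers never take the energy-aware path. */
	return !sched_energy_enabled_demo || rd->overutilized;
}

static void set_rd_overutilized_demo(struct root_domain_demo *rd, bool flag)
{
	if (!sched_energy_enabled_demo)
		return;

	rd->overutilized = flag;
}
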
/* Runqueue only has SCHED_IDLE tasks enqueued */
@@ -6785,7 +6822,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* and the following generally works well enough in practice.
*/
if (!task_new)
- update_overutilized_status(rq);
+ check_update_overutilized_status(rq);
enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
@@ -6872,7 +6909,7 @@ dequeue_throttle:
#ifdef CONFIG_SMP
-/* Working cpumask for: load_balance, load_balance_newidle. */
+/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
@@ -7104,13 +7141,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
}
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
- * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
+ * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
*/
static int
-find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -7166,7 +7203,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
int cpu, int prev_cpu, int sd_flag)
{
int new_cpu = cpu;
@@ -7191,13 +7228,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
continue;
}
- group = find_idlest_group(sd, p, cpu);
+ group = sched_balance_find_dst_group(sd, p, cpu);
if (!group) {
sd = sd->child;
continue;
}
- new_cpu = find_idlest_group_cpu(group, p, cpu);
+ new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of 'cpu': */
sd = sd->child;
@@ -7465,7 +7502,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
* Look for the CPU with best capacity.
*/
else if (fits < 0)
- cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
+ cpu_cap = get_actual_cpu_capacity(cpu);
/*
* First, select CPU which fits better (-1 being better than 0).
@@ -7509,7 +7546,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
/*
* On asymmetric system, update task utilization because we will check
- * that the task fits with cpu's capacity.
+ * that the task fits with CPU's capacity.
*/
if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
@@ -7942,7 +7979,7 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
* NOTE: Forkees are not accepted in the energy-aware wake-up path because
* they don't have any useful utilization data yet and it's not possible to
* forecast their impact on energy consumption. Consequently, they will be
- * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
* to be energy-inefficient in some use-cases. The alternative would be to
* bias new tasks towards specific types of CPUs first, or to try to infer
* their util_avg from the parent task, but those heuristics could hurt
@@ -7958,15 +7995,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
int prev_fits = -1, best_fits = -1;
- unsigned long best_thermal_cap = 0;
- unsigned long prev_thermal_cap = 0;
+ unsigned long best_actual_cap = 0;
+ unsigned long prev_actual_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
- if (!pd || READ_ONCE(rd->overutilized))
+ if (!pd)
goto unlock;
/*
@@ -7989,7 +8026,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
- unsigned long cpu_cap, cpu_thermal_cap, util;
+ unsigned long cpu_cap, cpu_actual_cap, util;
long prev_spare_cap = -1, max_spare_cap = -1;
unsigned long rq_util_min, rq_util_max;
unsigned long cur_delta, base_energy;
@@ -8001,18 +8038,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (cpumask_empty(cpus))
continue;
- /* Account thermal pressure for the energy estimation */
+ /* Account external pressure for the energy estimation */
cpu = cpumask_first(cpus);
- cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
- cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+ cpu_actual_cap = get_actual_cpu_capacity(cpu);
- eenv.cpu_cap = cpu_thermal_cap;
+ eenv.cpu_cap = cpu_actual_cap;
eenv.pd_cap = 0;
for_each_cpu(cpu, cpus) {
struct rq *rq = cpu_rq(cpu);
- eenv.pd_cap += cpu_thermal_cap;
+ eenv.pd_cap += cpu_actual_cap;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
@@ -8033,7 +8069,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
/*
* Open code uclamp_rq_util_with() except for
- * the clamp() part. Ie: apply max aggregation
+ * the clamp() part. I.e.: apply max aggregation
* only. util_fits_cpu() logic requires to
* operate on non clamped util but must use the
* max-aggregated uclamp_{min, max}.
@@ -8083,7 +8119,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
- prev_thermal_cap = cpu_thermal_cap;
+ prev_actual_cap = cpu_actual_cap;
best_delta = min(best_delta, prev_delta);
}
@@ -8098,7 +8134,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* but best energy cpu has better capacity.
*/
if ((max_fits < 0) &&
- (cpu_thermal_cap <= best_thermal_cap))
+ (cpu_actual_cap <= best_actual_cap))
continue;
cur_delta = compute_energy(&eenv, pd, cpus, p,
@@ -8119,14 +8155,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
best_fits = max_fits;
- best_thermal_cap = cpu_thermal_cap;
+ best_actual_cap = cpu_actual_cap;
}
}
rcu_read_unlock();
if ((best_fits > prev_fits) ||
((best_fits > 0) && (best_delta < prev_delta)) ||
- ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
+ ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
target = best_energy_cpu;
return target;
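
[Illustration, not part of the diff] The final choice between prev_cpu and the best candidate reduces to the condition just above: a strictly better fitness class wins, candidates of the same fitting class are compared on energy delta, and when nothing fits (fits < 0) the larger actual (pressure-reduced) capacity wins. A userspace restatement with a hypothetical function name:

#include <stdbool.h>

static bool prefer_best_energy_cpu(int best_fits, int prev_fits,
				   unsigned long best_delta,
				   unsigned long prev_delta,
				   unsigned long best_actual_cap,
				   unsigned long prev_actual_cap)
{
	if (best_fits > prev_fits)
		return true;		/* strictly better fitness class     */
	if (best_fits > 0 && best_delta < prev_delta)
		return true;		/* both fit: lower energy cost wins  */
	if (best_fits < 0 && best_actual_cap > prev_actual_cap)
		return true;		/* nothing fits: more capacity wins  */
	return false;
}
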
@@ -8169,7 +8205,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
cpumask_test_cpu(cpu, p->cpus_ptr))
return cpu;
- if (sched_energy_enabled()) {
+ if (!is_rd_overutilized(this_rq()->rd)) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
return new_cpu;
@@ -8207,7 +8243,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
if (unlikely(sd)) {
/* Slow path */
- new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
@@ -8253,14 +8289,46 @@ static void task_dead_fair(struct task_struct *p)
remove_entity_load_avg(&p->se);
}
+/*
+ * Set the max capacity the task is allowed to run at for misfit detection.
+ */
+static void set_task_max_allowed_capacity(struct task_struct *p)
+{
+ struct asym_cap_data *entry;
+
+ if (!sched_asym_cpucap_active())
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
+
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
+}
+
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+{
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p);
+}
+
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
if (rq->nr_running)
return 1;
- return newidle_balance(rq, rf) != 0;
+ return sched_balance_newidle(rq, rf) != 0;
}
+#else
+static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
#endif /* CONFIG_SMP */
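
[Illustration, not part of the diff] set_task_max_allowed_capacity() caches, per task, the largest CPU capacity its affinity mask can still reach; update_misfit_status() compares it against the current CPU so that a task pinned to little CPUs is never flagged as misfit. A userspace sketch of the same walk over capacity groups, assuming (as the early break in the kernel loop suggests) that the list is ordered from highest to lowest capacity:

#include <stdint.h>
#include <stddef.h>

struct asym_cap_demo {
	unsigned long capacity;		/* capacity of this group of CPUs */
	uint64_t      cpu_mask;		/* bitmask of CPUs in the group   */
};

/* Assumed ordering: highest-capacity group first. */
static const struct asym_cap_demo asym_groups[] = {
	{ .capacity = 1024, .cpu_mask = 0xf0 },		/* big CPUs 4-7    */
	{ .capacity =  512, .cpu_mask = 0x0f },		/* little CPUs 0-3 */
};

static unsigned long max_allowed_capacity(uint64_t cpus_allowed)
{
	for (size_t i = 0; i < sizeof(asym_groups) / sizeof(asym_groups[0]); i++) {
		if (asym_groups[i].cpu_mask & cpus_allowed)
			return asym_groups[i].capacity;
	}
	return 0;	/* empty affinity mask: not expected */
}
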
static void set_next_buddy(struct sched_entity *se)
@@ -8511,10 +8579,10 @@ idle:
if (!rf)
return NULL;
- new_tasks = newidle_balance(rq, rf);
+ new_tasks = sched_balance_newidle(rq, rf);
/*
- * Because newidle_balance() releases (and re-acquires) rq->lock, it is
+ * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
@@ -8592,7 +8660,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
return false;
- /* Tell the scheduler that we'd really like pse to run next. */
+ /* Tell the scheduler that we'd really like se to run next. */
set_next_buddy(se);
yield_task_fair(rq);
@@ -8930,7 +8998,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- /* Disregard pcpu kthreads; they are where they need to be. */
+ /* Disregard percpu kthreads; they are where they need to be. */
if (kthread_is_per_cpu(p))
return 0;
@@ -9076,7 +9144,7 @@ static int detach_tasks(struct lb_env *env)
* We don't want to steal all, otherwise we may be treated likewise,
* which could at worst lead to a livelock crash.
*/
- if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ if (env->idle && env->src_rq->nr_running <= 1)
break;
env->loop++;
@@ -9255,7 +9323,7 @@ static inline bool others_have_blocked(struct rq *rq)
if (cpu_util_dl(rq))
return true;
- if (thermal_load_avg(rq))
+ if (hw_load_avg(rq))
return true;
if (cpu_util_irq(rq))
@@ -9285,7 +9353,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
{
const struct sched_class *curr_class;
u64 now = rq_clock_pelt(rq);
- unsigned long thermal_pressure;
+ unsigned long hw_pressure;
bool decayed;
/*
@@ -9294,11 +9362,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
*/
curr_class = rq->curr->sched_class;
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
+ update_hw_load_avg(now, rq, hw_pressure) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
@@ -9417,7 +9485,7 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
-static void update_blocked_averages(int cpu)
+static void sched_balance_update_blocked_averages(int cpu)
{
bool decayed = false, done = true;
struct rq *rq = cpu_rq(cpu);
@@ -9436,25 +9504,25 @@ static void update_blocked_averages(int cpu)
rq_unlock_irqrestore(rq, &rf);
}
-/********** Helpers for find_busiest_group ************************/
+/********** Helpers for sched_balance_find_src_group ************************/
/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
+ * sg_lb_stats - stats of a sched_group required for load-balancing:
*/
struct sg_lb_stats {
- unsigned long avg_load; /*Avg load across the CPUs of the group */
- unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long group_capacity;
- unsigned long group_util; /* Total utilization over the CPUs of the group */
- unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
- unsigned int sum_nr_running; /* Nr of tasks running in the group */
- unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
- unsigned int idle_cpus;
+ unsigned long avg_load; /* Avg load over the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long group_capacity; /* Capacity over the CPUs of the group */
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
+ unsigned int sum_nr_running; /* Nr of all tasks running in the group */
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
+ unsigned int idle_cpus; /* Nr of idle CPUs in the group */
unsigned int group_weight;
enum group_type group_type;
- unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
- unsigned int group_smt_balance; /* Task on busy SMT be moved */
- unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+ unsigned int group_smt_balance; /* Task on busy SMT be moved */
+ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -9462,19 +9530,18 @@ struct sg_lb_stats {
};
/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
+ * sd_lb_stats - stats of a sched_domain required for load-balancing:
*/
struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *local; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_capacity; /* Total capacity of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
- unsigned int prefer_sibling; /* tasks should go to sibling first */
-
- struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
- struct sg_lb_stats local_stat; /* Statistics of the local group */
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+ unsigned int prefer_sibling; /* Tasks should go to sibling first */
+
+ struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
};
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
@@ -9500,8 +9567,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
static unsigned long scale_rt_capacity(int cpu)
{
+ unsigned long max = get_actual_cpu_capacity(cpu);
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -9513,12 +9580,9 @@ static unsigned long scale_rt_capacity(int cpu)
/*
* avg_rt.util_avg and avg_dl.util_avg track binary signals
* (running and not running) with weights 0 and 1024 respectively.
- * avg_thermal.load_avg tracks thermal pressure and the weighted
- * average uses the actual delta max capacity(load).
*/
used = cpu_util_rt(rq);
used += cpu_util_dl(rq);
- used += thermal_load_avg(rq);
if (unlikely(used >= max))
return 1;
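
[Illustration, not part of the diff] With get_actual_cpu_capacity() as the starting point, the thermal term no longer has to be subtracted here: the capacity left for CFS is the pressure-adjusted maximum minus the RT and DL utilization (the surrounding function also discounts IRQ time, which this sketch omits):

static unsigned long cfs_capacity_left(unsigned long actual_max,
				       unsigned long rt_util,
				       unsigned long dl_util)
{
	unsigned long used = rt_util + dl_util;

	/* Never report zero capacity; the kernel clamps to 1 as well. */
	if (used >= actual_max)
		return 1;

	return actual_max - used;
}
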
@@ -9611,16 +9675,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq)
{
- return rq->misfit_task_load &&
- (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
- check_cpu_capacity(rq, sd));
+ return rq->misfit_task_load;
}
/*
@@ -9644,7 +9702,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
*
* When this is so detected; this group becomes a candidate for busiest; see
* update_sd_pick_busiest(). And calculate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it
* to create an effective group imbalance.
*
* This is a somewhat tricky proposition since the next run might not find the
@@ -9809,7 +9867,7 @@ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
struct sched_group *group)
{
- if (env->idle == CPU_NOT_IDLE)
+ if (!env->idle)
return false;
/*
@@ -9833,7 +9891,7 @@ static inline long sibling_imbalance(struct lb_env *env,
int ncores_busiest, ncores_local;
long imbalance;
- if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+ if (!env->idle || !busiest->sum_nr_running)
return 0;
ncores_busiest = sds->busiest->cores;
@@ -9879,13 +9937,15 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
- * @sg_status: Holds flag indicating the status of the sched_group
+ * @sg_overloaded: sched_group is overloaded
+ * @sg_overutilized: sched_group is overutilized
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
- int *sg_status)
+ bool *sg_overloaded,
+ bool *sg_overutilized)
{
int i, nr_running, local_group;
@@ -9906,10 +9966,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->sum_nr_running += nr_running;
if (nr_running > 1)
- *sg_status |= SG_OVERLOAD;
+ *sg_overloaded = 1;
if (cpu_overutilized(i))
- *sg_status |= SG_OVERUTILIZED;
+ *sg_overutilized = 1;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
@@ -9931,10 +9991,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/* Check for a misfit task on the cpu */
if (sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ *sg_overloaded = 1;
}
- } else if ((env->idle != CPU_NOT_IDLE) &&
- sched_reduced_capacity(rq, env->sd)) {
+ } else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
/* Check for a task running on a CPU with reduced capacity */
if (sgs->group_misfit_task_load < load)
sgs->group_misfit_task_load = load;
@@ -9946,7 +10005,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
/* Check if dst CPU is idle and preferred to this group */
- if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+ if (!local_group && env->idle && sgs->sum_h_nr_running &&
sched_group_asym(env, sgs, group))
sgs->group_asym_packing = 1;
@@ -10084,7 +10143,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
has_spare:
/*
- * Select not overloaded group with lowest number of idle cpus
+ * Select not overloaded group with lowest number of idle CPUs
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
* that the group has less spare capacity but finally more idle
@@ -10304,13 +10363,13 @@ static bool update_pick_idlest(struct sched_group *idlest,
}
/*
- * find_idlest_group() finds and returns the least busy CPU group within the
+ * sched_balance_find_dst_group() finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
struct sg_lb_stats local_sgs, tmp_sgs;
@@ -10558,7 +10617,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
unsigned long sum_util = 0;
- int sg_status = 0;
+ bool sg_overloaded = 0, sg_overutilized = 0;
do {
struct sg_lb_stats *sgs = &tmp_sgs;
@@ -10574,7 +10633,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -10602,19 +10661,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) {
- struct root_domain *rd = env->dst_rq->rd;
-
/* update overload indicator if we are at root domain */
- WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+ set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
/* Update over-utilization (tipping point, U >= 0) indicator */
- WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
- } else if (sg_status & SG_OVERUTILIZED) {
- struct root_domain *rd = env->dst_rq->rd;
-
- WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
+ } else if (sg_overutilized) {
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
}
update_idle_cpu_scan(env, sum_util);
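
[Illustration, not part of the diff] The int bitmask sg_status (SG_OVERLOAD / SG_OVERUTILIZED) becomes two plain bools that are OR-accumulated while walking the CPUs of each group and written back to the root domain only at the top-level domain (lower levels propagate just the overutilized bit). A userspace sketch of the per-CPU accumulation:

#include <stdbool.h>
#include <stddef.h>

struct cpu_stats_demo {
	unsigned int nr_running;	/* runnable tasks on this CPU  */
	bool	     overutilized;	/* util_fits_cpu() failed here */
};

static void aggregate_cpus(const struct cpu_stats_demo *cpus, size_t nr,
			   bool *sg_overloaded, bool *sg_overutilized)
{
	for (size_t i = 0; i < nr; i++) {
		if (cpus[i].nr_running > 1)
			*sg_overloaded = true;		/* a CPU has waiting tasks */
		if (cpus[i].overutilized)
			*sg_overutilized = true;
	}
}
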
@@ -10704,7 +10757,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* waiting task in this overloaded busiest group. Let's
* try to pull it.
*/
- if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+ if (env->idle && env->imbalance == 0) {
env->migration_type = migrate_task;
env->imbalance = 1;
}
@@ -10723,7 +10776,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* If there is no overload, we just want to even the number of
- * idle cpus.
+ * idle CPUs.
*/
env->migration_type = migrate_task;
env->imbalance = max_t(long, 0,
@@ -10796,7 +10849,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
) / SCHED_CAPACITY_SCALE;
}
-/******* find_busiest_group() helpers end here *********************/
+/******* sched_balance_find_src_group() helpers end here *********************/
/*
* Decision matrix according to the local and busiest group type:
@@ -10819,7 +10872,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
/**
- * find_busiest_group - Returns the busiest group within the sched_domain
+ * sched_balance_find_src_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
* @env: The load balancing environment.
*
@@ -10828,7 +10881,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*
* Return: - The busiest group if imbalance exists.
*/
-static struct sched_group *find_busiest_group(struct lb_env *env)
+static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
{
struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
@@ -10851,12 +10904,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_misfit_task)
goto force_balance;
- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
+ if (!is_rd_overutilized(env->dst_rq->rd) &&
+ rcu_dereference(env->dst_rq->rd->pd))
+ goto out_balanced;
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
@@ -10919,7 +10969,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE) {
+ if (!env->idle) {
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
@@ -10967,9 +11017,9 @@ out_balanced:
}
/*
- * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
+ * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
*/
-static struct rq *find_busiest_queue(struct lb_env *env,
+static struct rq *sched_balance_find_src_rq(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
@@ -11127,7 +11177,7 @@ asym_active_balance(struct lb_env *env)
* the lower priority @env::dst_cpu help it. Do not follow
* CPU priority.
*/
- return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
(sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
!sched_use_asym_prio(env->sd, env->src_cpu));
}
@@ -11165,7 +11215,7 @@ static int need_active_balance(struct lb_env *env)
* because of other sched_class or IRQs if more capacity stays
* available on dst_cpu.
*/
- if ((env->idle != CPU_NOT_IDLE) &&
+ if (env->idle &&
(env->src_rq->cfs.h_nr_running == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
@@ -11250,7 +11300,7 @@ static int should_we_balance(struct lb_env *env)
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
-static int load_balance(int this_cpu, struct rq *this_rq,
+static int sched_balance_rq(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
@@ -11282,13 +11332,13 @@ redo:
goto out_balanced;
}
- group = find_busiest_group(&env);
+ group = sched_balance_find_src_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
- busiest = find_busiest_queue(&env, group);
+ busiest = sched_balance_find_src_rq(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
@@ -11306,7 +11356,7 @@ redo:
env.flags |= LBF_ALL_PINNED;
if (busiest->nr_running > 1) {
/*
- * Attempt to move tasks. If find_busiest_group has found
+ * Attempt to move tasks. If sched_balance_find_src_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
@@ -11421,8 +11471,12 @@ more_balance:
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
+ *
+ * Similarly for migration_misfit which is not related to
+ * load/util migration, don't pollute nr_balance_failed.
*/
- if (idle != CPU_NEWLY_IDLE)
+ if (idle != CPU_NEWLY_IDLE &&
+ env.migration_type != migrate_misfit)
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
@@ -11501,12 +11555,17 @@ out_one_pinned:
ld_moved = 0;
/*
- * newidle_balance() disregards balance intervals, so we could
+ * sched_balance_newidle() disregards balance intervals, so we could
* repeatedly reach this code, which would lead to balance_interval
* skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
+ *
+	 * Similarly, misfit migration is not necessarily an indication of
+	 * the system being busy and does not require the load balancer to
+	 * back off to let things settle down.
*/
- if (env.idle == CPU_NEWLY_IDLE)
+ if (env.idle == CPU_NEWLY_IDLE ||
+ env.migration_type == migrate_misfit)
goto out;
/* tune up the balancing interval */
@@ -11639,10 +11698,23 @@ out_unlock:
return 0;
}
-static DEFINE_SPINLOCK(balancing);
+/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize rebalance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
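
[Illustration, not part of the diff] The global 'balancing' spinlock becomes a simple free/owned flag: a pass over an SD_SERIALIZE domain runs only if the flag can be flipped from 0 to 1, and is skipped (not retried) otherwise, as the comment above notes. A userspace analogue using C11 atomics in place of the kernel's atomic_cmpxchg_acquire()/atomic_set_release():

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int balance_running_demo;		/* 0 = free, 1 = pass in progress */

/* Returns true if the caller may run the serialized balancing pass. */
static bool serialize_begin(void)
{
	int expected = 0;

	return atomic_compare_exchange_strong_explicit(&balance_running_demo,
						       &expected, 1,
						       memory_order_acquire,
						       memory_order_relaxed);
}

static void serialize_end(void)
{
	atomic_store_explicit(&balance_running_demo, 0, memory_order_release);
}

/*
 * Usage, mirroring the SD_SERIALIZE handling further below:
 *
 *	if (need_serialize && !serialize_begin())
 *		goto out;		// another CPU already owns the pass
 *	...balance this domain...
 *	if (need_serialize)
 *		serialize_end();
 */
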
/*
- * Scale the max load_balance interval with the number of CPUs in the system.
+ * Scale the max sched_balance_rq interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
void update_max_interval(void)
@@ -11680,7 +11752,7 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
*
* Balancing parameters are set up in init_sched_domains.
*/
-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
@@ -11717,25 +11789,25 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
- if (!spin_trylock(&balancing))
+ if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+ if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
* The LBF_DST_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
- idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
- busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ idle = idle_cpu(cpu);
+ busy = !idle && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
- spin_unlock(&balancing);
+ atomic_set_release(&sched_balance_running, 0);
out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
@@ -11895,7 +11967,7 @@ static void nohz_balancer_kick(struct rq *rq)
* currently idle; in which case, kick the ILB to move tasks
* around.
*
- * When balancing betwen cores, all the SMT siblings of the
+ * When balancing between cores, all the SMT siblings of the
* preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
@@ -11912,7 +11984,7 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
* to run the misfit task on.
*/
- if (check_misfit_status(rq, sd)) {
+ if (check_misfit_status(rq)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12056,7 +12128,7 @@ void nohz_balance_enter_idle(int cpu)
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
- * enable the periodic update of the load of idle cpus
+ * enable the periodic update of the load of idle CPUs
*/
WRITE_ONCE(nohz.has_blocked, 1);
}
@@ -12074,13 +12146,13 @@ static bool update_nohz_stats(struct rq *rq)
if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
return true;
- update_blocked_averages(cpu);
+ sched_balance_update_blocked_averages(cpu);
return rq->has_blocked_load;
}
/*
- * Internal function that runs load balance for all idle cpus. The load balance
+ * Internal function that runs load balance for all idle CPUs. The load balance
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
*/
@@ -12156,7 +12228,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(rq, CPU_IDLE);
+ sched_balance_domains(rq, CPU_IDLE);
}
if (time_after(next_balance, rq->next_balance)) {
@@ -12185,7 +12257,7 @@ abort:
/*
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * rebalancing for all the CPUs for whom scheduler ticks are stopped.
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
@@ -12216,7 +12288,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* called from this function on (this) CPU that's not yet in the mask. That's
* OK because the goal of nohz_run_idle_balance() is to run ILB only for
* updating the blocked load of already idle CPUs without waking up one of
- * those idle CPUs and outside the preempt disable / irq off phase of the local
+ * those idle CPUs and outside the preempt disable / IRQ off phase of the local
* cpu about to enter idle, because it can take a long time.
*/
void nohz_run_idle_balance(int cpu)
@@ -12227,7 +12299,7 @@ void nohz_run_idle_balance(int cpu)
/*
* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
- * (ie NOHZ_STATS_KICK set) and will do the same.
+ * (i.e. NOHZ_STATS_KICK set) and will do the same.
*/
if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
@@ -12272,7 +12344,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
- * newidle_balance is called by schedule() if this_cpu is about to become
+ * sched_balance_newidle is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
@@ -12280,10 +12352,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
+ int continue_balancing = 1;
u64 t0, t1, curr_cost = 0;
struct sched_domain *sd;
int pulled_task = 0;
@@ -12298,8 +12371,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
return 0;
/*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
+ * We must set idle_stamp _before_ calling sched_balance_rq()
+	 * for CPU_NEWLY_IDLE, such that we measure this duration
+ * as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
@@ -12320,7 +12394,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (!READ_ONCE(this_rq->rd->overload) ||
+ if (!get_rd_overloaded(this_rq->rd) ||
(sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
if (sd)
@@ -12334,11 +12408,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
- update_blocked_averages(this_cpu);
+ sched_balance_update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
- int continue_balancing = 1;
u64 domain_cost;
update_next_balance(sd, &next_balance);
@@ -12348,7 +12421,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (sd->flags & SD_BALANCE_NEWIDLE) {
- pulled_task = load_balance(this_cpu, this_rq,
+ pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
@@ -12364,8 +12437,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
- if (pulled_task || this_rq->nr_running > 0 ||
- this_rq->ttwu_pending)
+ if (pulled_task || !continue_balancing)
break;
}
rcu_read_unlock();
@@ -12403,19 +12475,21 @@ out:
}
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
+ *
+ * - directly from the local scheduler_tick() for periodic load balancing
+ *
+ * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
+ * through the SMP cross-call nohz_csd_func()
*/
-static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
+static __latent_entropy void sched_balance_softirq(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
- enum cpu_idle_type idle = this_rq->idle_balance ?
- CPU_IDLE : CPU_NOT_IDLE;
-
+ enum cpu_idle_type idle = this_rq->idle_balance;
/*
- * If this CPU has a pending nohz_balance_kick, then do the
+ * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
* balancing on behalf of the other idle CPUs whose ticks are
- * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * stopped. Do nohz_idle_balance *before* sched_balance_domains to
* give the idle CPUs a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
@@ -12424,14 +12498,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
return;
/* normal load balance */
- update_blocked_averages(this_rq->cpu);
- rebalance_domains(this_rq, idle);
+ sched_balance_update_blocked_averages(this_rq->cpu);
+ sched_balance_domains(this_rq, idle);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-void trigger_load_balance(struct rq *rq)
+void sched_balance_trigger(struct rq *rq)
{
/*
* Don't need to rebalance while attached to NULL domain or
@@ -12615,7 +12689,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
- update_overutilized_status(task_rq(curr));
+ check_update_overutilized_status(task_rq(curr));
task_tick_core(rq, curr);
}
@@ -12635,6 +12709,8 @@ static void task_fork_fair(struct task_struct *p)
rq_lock(rq, &rf);
update_rq_clock(rq);
+ set_task_max_allowed_capacity(p);
+
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr)
@@ -12758,6 +12834,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
attach_task_cfs_rq(p);
+ set_task_max_allowed_capacity(p);
+
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
@@ -13129,7 +13207,7 @@ DEFINE_SCHED_CLASS(fair) = {
.rq_offline = rq_offline_fair,
.task_dead = task_dead_fair,
- .set_cpus_allowed = set_cpus_allowed_common,
+ .set_cpus_allowed = set_cpus_allowed_fair,
#endif
.task_tick = task_tick_fair,
@@ -13209,7 +13287,7 @@ __init void init_sched_fair_class(void)
#endif
}
- open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+ open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;