aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/base
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-01-08 19:49:17 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-01-08 19:49:17 -0800
commitbfe8eb3b85c571f7e94e1039f59b462505b8e0fc (patch)
tree2084624e1d6e2c7f570239aad1bbdd9741cfe5e5 /drivers/base
parentaac4de465af08ccec90ef47bdcc13435e48a7223 (diff)
parentcdb3033e191fd03da2d7da23b9cd448dfa180a8e (diff)
downloadlinux-bfe8eb3b85c571f7e94e1039f59b462505b8e0fc.tar.gz
Merge tag 'sched-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "Energy scheduling: - Consolidate how the max compute capacity is used in the scheduler and how we calculate the frequency for a level of utilization. - Rework interface between the scheduler and the schedutil governor - Simplify the util_est logic Deadline scheduler: - Work more towards reducing SCHED_DEADLINE starvation of low priority tasks (e.g., SCHED_OTHER) tasks when higher priority tasks monopolize CPU cycles, via the introduction of 'deadline servers' (nested/2-level scheduling). "Fair servers" to make use of this facility are not introduced yet. EEVDF: - Introduce O(1) fastpath for EEVDF task selection NUMA balancing: - Tune the NUMA-balancing vma scanning logic some more, to better distribute the probability of a particular vma getting scanned. Plus misc fixes, cleanups and updates" * tag 'sched-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (30 commits) sched/fair: Fix tg->load when offlining a CPU sched/fair: Remove unused 'next_buddy_marked' local variable in check_preempt_wakeup_fair() sched/fair: Use all little CPUs for CPU-bound workloads sched/fair: Simplify util_est sched/fair: Remove SCHED_FEAT(UTIL_EST_FASTUP, true) arm64/amu: Use capacity_ref_freq() to set AMU ratio cpufreq/cppc: Set the frequency used for computing the capacity cpufreq/cppc: Move and rename cppc_cpufreq_{perf_to_khz|khz_to_perf}() energy_model: Use a fixed reference frequency cpufreq/schedutil: Use a fixed reference frequency cpufreq: Use the fixed and coherent frequency for scaling capacity sched/topology: Add a new arch_scale_freq_ref() method freezer,sched: Clean saved_state when restoring it during thaw sched/fair: Update min_vruntime for reweight_entity() correctly sched/doc: Update documentation after renames and synchronize Chinese version sched/cpufreq: Rework iowait boost sched/cpufreq: Rework schedutil governor performance estimation sched/pelt: Avoid underestimation of task utilization sched/timers: Explain why idle task schedules out on remote timer enqueue sched/cpuidle: Comment about timers requirements VS idle handler ...
Diffstat (limited to 'drivers/base')
-rw-r--r--drivers/base/arch_topology.c56
1 files changed, 39 insertions, 17 deletions
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index b741b5ba82bd6..5aaa0865625d0 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/units.h>
#define CREATE_TRACE_POINTS
#include <trace/events/thermal_pressure.h>
@@ -26,7 +27,8 @@
static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
static struct cpumask scale_freq_counters_mask;
static bool scale_freq_invariant;
-static DEFINE_PER_CPU(u32, freq_factor) = 1;
+DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1;
+EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref);
static bool supports_scale_freq_counters(const struct cpumask *cpus)
{
@@ -170,9 +172,9 @@ DEFINE_PER_CPU(unsigned long, thermal_pressure);
* operating on stale data when hot-plug is used for some CPUs. The
* @capped_freq reflects the currently allowed max CPUs frequency due to
* thermal capping. It might be also a boost frequency value, which is bigger
- * than the internal 'freq_factor' max frequency. In such case the pressure
- * value should simply be removed, since this is an indication that there is
- * no thermal throttling. The @capped_freq must be provided in kHz.
+ * than the internal 'capacity_freq_ref' max frequency. In such case the
+ * pressure value should simply be removed, since this is an indication that
+ * there is no thermal throttling. The @capped_freq must be provided in kHz.
*/
void topology_update_thermal_pressure(const struct cpumask *cpus,
unsigned long capped_freq)
@@ -183,10 +185,7 @@ void topology_update_thermal_pressure(const struct cpumask *cpus,
cpu = cpumask_first(cpus);
max_capacity = arch_scale_cpu_capacity(cpu);
- max_freq = per_cpu(freq_factor, cpu);
-
- /* Convert to MHz scale which is used in 'freq_factor' */
- capped_freq /= 1000;
+ max_freq = arch_scale_freq_ref(cpu);
/*
* Handle properly the boost frequencies, which should simply clean
@@ -279,13 +278,13 @@ void topology_normalize_cpu_scale(void)
capacity_scale = 1;
for_each_possible_cpu(cpu) {
- capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
+ capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu);
capacity_scale = max(capacity, capacity_scale);
}
pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale);
for_each_possible_cpu(cpu) {
- capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
+ capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu);
capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
capacity_scale);
topology_set_cpu_scale(cpu, capacity);
@@ -321,15 +320,15 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
cpu_node, raw_capacity[cpu]);
/*
- * Update freq_factor for calculating early boot cpu capacities.
+ * Update capacity_freq_ref for calculating early boot CPU capacities.
* For non-clk CPU DVFS mechanism, there's no way to get the
* frequency value now, assuming they are running at the same
- * frequency (by keeping the initial freq_factor value).
+ * frequency (by keeping the initial capacity_freq_ref value).
*/
cpu_clk = of_clk_get(cpu_node, 0);
if (!PTR_ERR_OR_ZERO(cpu_clk)) {
- per_cpu(freq_factor, cpu) =
- clk_get_rate(cpu_clk) / 1000;
+ per_cpu(capacity_freq_ref, cpu) =
+ clk_get_rate(cpu_clk) / HZ_PER_KHZ;
clk_put(cpu_clk);
}
} else {
@@ -345,11 +344,16 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
return !ret;
}
+void __weak freq_inv_set_max_ratio(int cpu, u64 max_rate)
+{
+}
+
#ifdef CONFIG_ACPI_CPPC_LIB
#include <acpi/cppc_acpi.h>
void topology_init_cpu_capacity_cppc(void)
{
+ u64 capacity, capacity_scale = 0;
struct cppc_perf_caps perf_caps;
int cpu;
@@ -366,6 +370,10 @@ void topology_init_cpu_capacity_cppc(void)
(perf_caps.highest_perf >= perf_caps.nominal_perf) &&
(perf_caps.highest_perf >= perf_caps.lowest_perf)) {
raw_capacity[cpu] = perf_caps.highest_perf;
+ capacity_scale = max_t(u64, capacity_scale, raw_capacity[cpu]);
+
+ per_cpu(capacity_freq_ref, cpu) = cppc_perf_to_khz(&perf_caps, raw_capacity[cpu]);
+
pr_debug("cpu_capacity: CPU%d cpu_capacity=%u (raw).\n",
cpu, raw_capacity[cpu]);
continue;
@@ -376,7 +384,18 @@ void topology_init_cpu_capacity_cppc(void)
goto exit;
}
- topology_normalize_cpu_scale();
+ for_each_possible_cpu(cpu) {
+ freq_inv_set_max_ratio(cpu,
+ per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ);
+
+ capacity = raw_capacity[cpu];
+ capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
+ capacity_scale);
+ topology_set_cpu_scale(cpu, capacity);
+ pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
+ cpu, topology_get_cpu_scale(cpu));
+ }
+
schedule_work(&update_topology_flags_work);
pr_debug("cpu_capacity: cpu_capacity initialization done\n");
@@ -410,8 +429,11 @@ init_cpu_capacity_callback(struct notifier_block *nb,
cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
- for_each_cpu(cpu, policy->related_cpus)
- per_cpu(freq_factor, cpu) = policy->cpuinfo.max_freq / 1000;
+ for_each_cpu(cpu, policy->related_cpus) {
+ per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq;
+ freq_inv_set_max_ratio(cpu,
+ per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ);
+ }
if (cpumask_empty(cpus_to_visit)) {
topology_normalize_cpu_scale();