author    Stephen Rothwell <sfr@canb.auug.org.au>  2024-04-29 14:28:20 +1000
committer Stephen Rothwell <sfr@canb.auug.org.au>  2024-04-29 14:28:20 +1000
commit    d498d58cec6d6c5dea9970b7d9bfce00a5441fdb (patch)
tree      4ed8fbd414890c642038f3d904ff9d73504f81ee
parent    333c5792447c54438d145bdf2cd96271086d6819 (diff)
parent    0a2f1c12ac1323c78789ca3ff28d2f99447a30c9 (diff)
download  linux-next-history-d498d58cec6d6c5dea9970b7d9bfce00a5441fdb.tar.gz
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
# Conflicts:
#	arch/arm64/boot/dts/st/stm32mp251.dtsi
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt1
-rw-r--r--Documentation/arch/x86/resctrl.rst6
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml172
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/riscv,imsics.yaml172
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.yaml17
-rw-r--r--Documentation/process/maintainer-tip.rst18
-rw-r--r--Documentation/scheduler/sched-domains.rst12
-rw-r--r--Documentation/scheduler/sched-stats.rst37
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-domains.rst10
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-stats.rst30
-rw-r--r--MAINTAINERS20
-rw-r--r--arch/arm/boot/dts/st/stm32mp131.dtsi74
-rw-r--r--arch/arm/boot/dts/st/stm32mp151.dtsi75
-rw-r--r--arch/arm/include/asm/topology.h6
-rw-r--r--arch/arm/kernel/hw_breakpoint.c8
-rw-r--r--arch/arm/kernel/topology.c2
-rw-r--r--arch/arm64/Kconfig.platforms1
-rw-r--r--arch/arm64/boot/dts/st/stm32mp251.dtsi176
-rw-r--r--arch/arm64/include/asm/topology.h6
-rw-r--r--arch/arm64/kernel/hw_breakpoint.c4
-rw-r--r--arch/mips/dec/setup.c2
-rw-r--r--arch/parisc/kernel/smp.c2
-rw-r--r--arch/powerpc/include/asm/Kbuild1
-rw-r--r--arch/powerpc/include/asm/cputime.h13
-rw-r--r--arch/powerpc/include/asm/vdso/gettimeofday.h26
-rw-r--r--arch/powerpc/kernel/time.c22
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c2
-rw-r--r--arch/riscv/Kconfig2
-rw-r--r--arch/s390/include/asm/vdso/gettimeofday.h7
-rw-r--r--arch/s390/include/asm/vtime.h2
-rw-r--r--arch/s390/kernel/irq.c1
-rw-r--r--arch/s390/kernel/nmi.c1
-rw-r--r--arch/um/include/asm/cpufeature.h3
-rw-r--r--arch/x86/Kconfig43
-rw-r--r--arch/x86/boot/compressed/head_64.S5
-rw-r--r--arch/x86/boot/compressed/sev.c197
-rw-r--r--arch/x86/boot/main.c4
-rw-r--r--arch/x86/entry/Makefile2
-rw-r--r--arch/x86/entry/entry_64_compat.S1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/thunk.S (renamed from arch/x86/entry/thunk_64.S)0
-rw-r--r--arch/x86/entry/thunk_32.S18
-rw-r--r--arch/x86/events/amd/core.c37
-rw-r--r--arch/x86/events/amd/lbr.c13
-rw-r--r--arch/x86/events/intel/cstate.c144
-rw-r--r--arch/x86/events/intel/lbr.c3
-rw-r--r--arch/x86/events/intel/pt.c12
-rw-r--r--arch/x86/events/intel/uncore.c100
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c3
-rw-r--r--arch/x86/events/intel/uncore_snbep.c5
-rw-r--r--arch/x86/events/msr.c132
-rw-r--r--arch/x86/events/perf_event.h13
-rw-r--r--arch/x86/events/rapl.c7
-rw-r--r--arch/x86/hyperv/hv_vtl.c1
-rw-r--r--arch/x86/include/asm/alternative.h22
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/asm.h3
-rw-r--r--arch/x86/include/asm/atomic.h12
-rw-r--r--arch/x86/include/asm/atomic64_32.h81
-rw-r--r--arch/x86/include/asm/atomic64_64.h12
-rw-r--r--arch/x86/include/asm/boot.h5
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h205
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h8
-rw-r--r--arch/x86/include/asm/cpu_device_id.h101
-rw-r--r--arch/x86/include/asm/cpufeature.h16
-rw-r--r--arch/x86/include/asm/extable_fixup_types.h2
-rw-r--r--arch/x86/include/asm/ia32.h11
-rw-r--r--arch/x86/include/asm/ia32_unistd.h12
-rw-r--r--arch/x86/include/asm/intel-family.h84
-rw-r--r--arch/x86/include/asm/irq_stack.h2
-rw-r--r--arch/x86/include/asm/mce.h2
-rw-r--r--arch/x86/include/asm/msr-index.h9
-rw-r--r--arch/x86/include/asm/page_types.h8
-rw-r--r--arch/x86/include/asm/percpu.h157
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/include/asm/processor.h32
-rw-r--r--arch/x86/include/asm/prom.h9
-rw-r--r--arch/x86/include/asm/qspinlock.h13
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h7
-rw-r--r--arch/x86/include/asm/seccomp.h2
-rw-r--r--arch/x86/include/asm/sev.h2
-rw-r--r--arch/x86/include/asm/special_insns.h8
-rw-r--r--arch/x86/include/asm/string_64.h45
-rw-r--r--arch/x86/include/asm/text-patching.h2
-rw-r--r--arch/x86/include/asm/uaccess.h4
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h44
-rw-r--r--arch/x86/include/asm/vm86.h2
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/alternative.c133
-rw-r--r--arch/x86/kernel/apic/apic.c46
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c7
-rw-r--r--arch/x86/kernel/callthunks.c9
-rw-r--r--arch/x86/kernel/cpu/amd.c12
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c17
-rw-r--r--arch/x86/kernel/cpu/bugs.c30
-rw-r--r--arch/x86/kernel/cpu/common.c171
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c3
-rw-r--r--arch/x86/kernel/cpu/intel.c1
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c12
-rw-r--r--arch/x86/kernel/cpu/match.c5
-rw-r--r--arch/x86/kernel/cpu/mce/core.c24
-rw-r--r--arch/x86/kernel/cpu/mce/genpool.c40
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c21
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c27
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c5
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c65
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c40
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h5
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c11
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c24
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c12
-rw-r--r--arch/x86/kernel/cpu/resctrl/trace.h (renamed from arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h)24
-rw-r--r--arch/x86/kernel/cpu/topology_amd.c19
-rw-r--r--arch/x86/kernel/cpu/topology_ext.c15
-rw-r--r--arch/x86/kernel/devicetree.c24
-rw-r--r--arch/x86/kernel/dumpstack.c4
-rw-r--r--arch/x86/kernel/fpu/core.c4
-rw-r--r--arch/x86/kernel/fpu/xstate.c4
-rw-r--r--arch/x86/kernel/head_32.S13
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/rtc.c1
-rw-r--r--arch/x86/kernel/setup.c40
-rw-r--r--arch/x86/kernel/sev.c14
-rw-r--r--arch/x86/kernel/shstk.c4
-rw-r--r--arch/x86/kernel/signal_32.c2
-rw-r--r--arch/x86/kernel/signal_64.c6
-rw-r--r--arch/x86/kernel/smpboot.c28
-rw-r--r--arch/x86/kernel/topology.c43
-rw-r--r--arch/x86/kernel/tsc.c8
-rw-r--r--arch/x86/kernel/tsc_msr.c14
-rw-r--r--arch/x86/kernel/vmlinux.lds.S10
-rw-r--r--arch/x86/math-emu/fpu_etc.c9
-rw-r--r--arch/x86/math-emu/fpu_trig.c6
-rw-r--r--arch/x86/math-emu/reg_constant.c7
-rw-r--r--arch/x86/mm/extable.c9
-rw-r--r--arch/x86/mm/fault.c7
-rw-r--r--arch/x86/mm/init.c16
-rw-r--r--arch/x86/mm/pat/set_memory.c68
-rw-r--r--arch/x86/platform/ce4100/ce4100.c1
-rw-r--r--arch/x86/platform/iris/iris.c5
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-pm.c7
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-sci.c5
-rw-r--r--arch/x86/purgatory/Makefile3
-rw-r--r--arch/x86/tools/relocs.c371
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.c1
-rw-r--r--arch/x86/xen/enlighten.c3
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--drivers/base/arch_topology.c26
-rw-r--r--drivers/cpufreq/cpufreq.c36
-rw-r--r--drivers/cpufreq/qcom-cpufreq-hw.c4
-rw-r--r--drivers/cpuidle/coupled.c13
-rw-r--r--drivers/irqchip/Kconfig27
-rw-r--r--drivers/irqchip/Makefile3
-rw-r--r--drivers/irqchip/irq-alpine-msi.c2
-rw-r--r--drivers/irqchip/irq-bcm6345-l1.c6
-rw-r--r--drivers/irqchip/irq-brcmstb-l2.c17
-rw-r--r--drivers/irqchip/irq-gic-v3-its.c17
-rw-r--r--drivers/irqchip/irq-loongson-eiointc.c12
-rw-r--r--drivers/irqchip/irq-loongson-pch-msi.c2
-rw-r--r--drivers/irqchip/irq-loongson-pch-pic.c76
-rw-r--r--drivers/irqchip/irq-mxs.c2
-rw-r--r--drivers/irqchip/irq-renesas-rzg2l.c28
-rw-r--r--drivers/irqchip/irq-riscv-aplic-direct.c323
-rw-r--r--drivers/irqchip/irq-riscv-aplic-main.c211
-rw-r--r--drivers/irqchip/irq-riscv-aplic-main.h52
-rw-r--r--drivers/irqchip/irq-riscv-aplic-msi.c257
-rw-r--r--drivers/irqchip/irq-riscv-imsic-early.c201
-rw-r--r--drivers/irqchip/irq-riscv-imsic-platform.c375
-rw-r--r--drivers/irqchip/irq-riscv-imsic-state.c865
-rw-r--r--drivers/irqchip/irq-riscv-imsic-state.h108
-rw-r--r--drivers/irqchip/irq-sifive-plic.c7
-rw-r--r--drivers/irqchip/irq-stm32-exti.c139
-rw-r--r--drivers/irqchip/irq-sunxi-nmi.c1
-rw-r--r--drivers/irqchip/irq-tb10x.c1
-rw-r--r--drivers/thermal/cpufreq_cooling.c3
-rw-r--r--drivers/virt/coco/sev-guest/sev-guest.c28
-rw-r--r--include/asm-generic/sections.h5
-rw-r--r--include/asm-generic/vmlinux.lds.h8
-rw-r--r--include/asm-generic/vtime.h1
-rw-r--r--include/linux/arch_topology.h8
-rw-r--r--include/linux/cpufreq.h10
-rw-r--r--include/linux/cpuhotplug.h1
-rw-r--r--include/linux/cpumask.h17
-rw-r--r--include/linux/find.h27
-rw-r--r--include/linux/irq.h2
-rw-r--r--include/linux/irqchip/riscv-aplic.h145
-rw-r--r--include/linux/irqchip/riscv-imsic.h87
-rw-r--r--include/linux/irqdesc.h16
-rw-r--r--include/linux/jump_label.h3
-rw-r--r--include/linux/kernel_stat.h8
-rw-r--r--include/linux/math64.h8
-rw-r--r--include/linux/perf_event.h37
-rw-r--r--include/linux/sched.h3
-rw-r--r--include/linux/sched/idle.h2
-rw-r--r--include/linux/sched/topology.h10
-rw-r--r--include/linux/timerqueue.h5
-rw-r--r--include/linux/vtime.h5
-rw-r--r--include/trace/events/hw_pressure.h (renamed from include/trace/events/thermal_pressure.h)14
-rw-r--r--include/trace/events/mce.h25
-rw-r--r--include/trace/events/sched.h2
-rw-r--r--include/vdso/datapage.h4
-rw-r--r--include/vdso/math64.h38
-rw-r--r--init/Kconfig12
-rw-r--r--init/init_task.c1
-rw-r--r--init/main.c1
-rw-r--r--kernel/context_tracking.c2
-rw-r--r--kernel/events/core.c273
-rw-r--r--kernel/events/ring_buffer.c4
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/irq/cpuhotplug.c27
-rw-r--r--kernel/irq/internals.h9
-rw-r--r--kernel/irq/irqdesc.c65
-rw-r--r--kernel/irq/irqdomain.c5
-rw-r--r--kernel/irq/manage.c28
-rw-r--r--kernel/irq/proc.c9
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/jump_label.c53
-rw-r--r--kernel/locking/lock_events.h4
-rw-r--r--kernel/locking/qspinlock.c13
-rw-r--r--kernel/locking/qspinlock_paravirt.h49
-rw-r--r--kernel/sched/core.c14
-rw-r--r--kernel/sched/cputime.c13
-rw-r--r--kernel/sched/fair.c503
-rw-r--r--kernel/sched/loadavg.c2
-rw-r--r--kernel/sched/pelt.c22
-rw-r--r--kernel/sched/pelt.h16
-rw-r--r--kernel/sched/sched.h71
-rw-r--r--kernel/sched/stats.c5
-rw-r--r--kernel/sched/topology.c56
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c44
-rw-r--r--kernel/time/hrtimer.c41
-rw-r--r--kernel/time/timekeeping.c96
-rw-r--r--kernel/time/timer.c2
-rw-r--r--kernel/time/vsyscall.c6
-rw-r--r--kernel/watchdog.c215
-rw-r--r--kernel/workqueue.c2
-rw-r--r--lib/Kconfig.debug16
-rw-r--r--lib/find_bit.c12
-rw-r--r--lib/vdso/Kconfig7
-rw-r--r--lib/vdso/gettimeofday.c55
-rw-r--r--rust/kernel/time.rs60
-rw-r--r--scripts/gdb/linux/interrupts.py6
-rw-r--r--tools/testing/selftests/bpf/prog_tests/perf_skip.c137
-rw-r--r--tools/testing/selftests/bpf/progs/test_perf_skip.c15
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc2
-rw-r--r--tools/testing/selftests/perf_events/.gitignore1
-rw-r--r--tools/testing/selftests/perf_events/Makefile2
-rw-r--r--tools/testing/selftests/perf_events/watermark_signal.c146
-rw-r--r--tools/testing/selftests/x86/amx.c27
-rw-r--r--tools/testing/selftests/x86/lam.c2
253 files changed, 7133 insertions, 2487 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 514a469c0113aa..9fd52ca4911fda 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5817,6 +5817,7 @@
but is useful for debugging and performance tuning.
sched_thermal_decay_shift=
+ [Deprecated]
[KNL, SMP] Set a decay shift for scheduler thermal
pressure signal. Thermal pressure signal follows the
default decay period of other scheduler pelt
diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
index 6c245582d8fb16..627e23869bca83 100644
--- a/Documentation/arch/x86/resctrl.rst
+++ b/Documentation/arch/x86/resctrl.rst
@@ -446,6 +446,12 @@ during mkdir.
max_threshold_occupancy is a user configurable value to determine the
occupancy at which an RMID can be freed.
+The mon_llc_occupancy_limbo tracepoint gives the precise occupancy in bytes
+for a subset of RMIDs that are not immediately available for allocation.
+This can't be relied on to produce output every second; it may be necessary
+to attempt to create an empty monitor group to force an update. Output may
+only be produced if creation of a control or monitor group fails.
+
Schemata files - general concepts
---------------------------------
Each line in the file describes one resource. The line starts with
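As an aside for readers who want to watch the new tracepoint, the user-space sketch below enables it and streams whatever the kernel emits. It assumes tracefs is mounted at /sys/kernel/tracing and that the event appears as resctrl/mon_llc_occupancy_limbo; both paths are assumptions about the running system, not something this patch guarantees.

    #include <stdio.h>

    /* Sketch only: enable the limbo-occupancy tracepoint and stream its output. */
    int main(void)
    {
            const char *enable =
                    "/sys/kernel/tracing/events/resctrl/mon_llc_occupancy_limbo/enable";
            FILE *f = fopen(enable, "w");
            char line[512];

            if (!f || fputs("1", f) == EOF) {
                    perror("enable tracepoint");
                    return 1;
            }
            fclose(f);

            /* trace_pipe blocks until events arrive; see the caveats above. */
            f = fopen("/sys/kernel/tracing/trace_pipe", "r");
            if (!f) {
                    perror("trace_pipe");
                    return 1;
            }
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }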
diff --git a/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml b/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml
new file mode 100644
index 00000000000000..190a6499c932ea
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/interrupt-controller/riscv,aplic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RISC-V Advanced Platform Level Interrupt Controller (APLIC)
+
+maintainers:
+ - Anup Patel <anup@brainfault.org>
+
+description:
+ The RISC-V advanced interrupt architecture (AIA) defines an advanced
+ platform level interrupt controller (APLIC) for handling wired interrupts
+ in a RISC-V platform. The RISC-V AIA specification can be found at
+ https://github.com/riscv/riscv-aia.
+
+ The RISC-V APLIC is implemented as hierarchical APLIC domains where all
+ interrupt sources connect to the root APLIC domain and a parent APLIC
+ domain can delegate interrupt sources to its child APLIC domains. There
+ is one device tree node for each APLIC domain.
+
+allOf:
+ - $ref: /schemas/interrupt-controller.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - qemu,aplic
+ - const: riscv,aplic
+
+ reg:
+ maxItems: 1
+
+ interrupt-controller: true
+
+ "#interrupt-cells":
+ const: 2
+
+ interrupts-extended:
+ minItems: 1
+ maxItems: 16384
+ description:
+ Given APLIC domain directly injects external interrupts to a set of
+ RISC-V HARTS (or CPUs). Each node pointed to should be a riscv,cpu-intc
+ node, which has a CPU node (i.e. RISC-V HART) as parent.
+
+ msi-parent:
+ description:
+ Given APLIC domain forwards wired interrupts as MSIs to an AIA incoming
+ message signaled interrupt controller (IMSIC). If both "msi-parent" and
+ "interrupts-extended" properties are present then it means the APLIC
+ domain supports both MSI mode and Direct mode in HW. In this case, the
+ APLIC driver has to choose between MSI mode or Direct mode.
+
+ riscv,num-sources:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 1
+ maximum: 1023
+ description:
+ Specifies the number of wired interrupt sources supported by this
+ APLIC domain.
+
+ riscv,children:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ minItems: 1
+ maxItems: 1024
+ items:
+ maxItems: 1
+ description:
+ A list of child APLIC domains for the given APLIC domain. Each child
+ APLIC domain is assigned a child index in increasing order, with the
+ first child APLIC domain assigned child index 0. The APLIC domain child
+ index is used by firmware to delegate interrupts from the given APLIC
+ domain to a particular child APLIC domain.
+
+ riscv,delegation:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ minItems: 1
+ maxItems: 1024
+ items:
+ items:
+ - description: child APLIC domain phandle
+ - description: first interrupt number of the parent APLIC domain (inclusive)
+ - description: last interrupt number of the parent APLIC domain (inclusive)
+ description:
+ An interrupt delegation list where each entry is a triple consisting
+ of child APLIC domain phandle, first interrupt number of the parent
+ APLIC domain, and last interrupt number of the parent APLIC domain.
+ Firmware must configure interrupt delegation registers based on
+ interrupt delegation list.
+
+dependencies:
+ riscv,delegation: [ "riscv,children" ]
+
+required:
+ - compatible
+ - reg
+ - interrupt-controller
+ - "#interrupt-cells"
+ - riscv,num-sources
+
+anyOf:
+ - required:
+ - interrupts-extended
+ - required:
+ - msi-parent
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ // Example 1 (APLIC domains directly injecting interrupt to HARTs):
+
+ interrupt-controller@c000000 {
+ compatible = "qemu,aplic", "riscv,aplic";
+ interrupts-extended = <&cpu1_intc 11>,
+ <&cpu2_intc 11>,
+ <&cpu3_intc 11>,
+ <&cpu4_intc 11>;
+ reg = <0xc000000 0x4080>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ riscv,num-sources = <63>;
+ riscv,children = <&aplic1>, <&aplic2>;
+ riscv,delegation = <&aplic1 1 63>;
+ };
+
+ aplic1: interrupt-controller@d000000 {
+ compatible = "qemu,aplic", "riscv,aplic";
+ interrupts-extended = <&cpu1_intc 9>,
+ <&cpu2_intc 9>;
+ reg = <0xd000000 0x4080>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ riscv,num-sources = <63>;
+ };
+
+ aplic2: interrupt-controller@e000000 {
+ compatible = "qemu,aplic", "riscv,aplic";
+ interrupts-extended = <&cpu3_intc 9>,
+ <&cpu4_intc 9>;
+ reg = <0xe000000 0x4080>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ riscv,num-sources = <63>;
+ };
+
+ - |
+ // Example 2 (APLIC domains forwarding interrupts as MSIs):
+
+ interrupt-controller@c000000 {
+ compatible = "qemu,aplic", "riscv,aplic";
+ msi-parent = <&imsic_mlevel>;
+ reg = <0xc000000 0x4000>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ riscv,num-sources = <63>;
+ riscv,children = <&aplic3>;
+ riscv,delegation = <&aplic3 1 63>;
+ };
+
+ aplic3: interrupt-controller@d000000 {
+ compatible = "qemu,aplic", "riscv,aplic";
+ msi-parent = <&imsic_slevel>;
+ reg = <0xd000000 0x4000>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ riscv,num-sources = <63>;
+ };
+...
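For orientation only (this is an illustrative sketch, not the in-tree APLIC driver added elsewhere in this series), the properties defined by the schema above can be read with the generic OF helpers along these lines:

    #include <linux/errno.h>
    #include <linux/of.h>

    /* Sketch: read riscv,num-sources and walk the optional riscv,children list. */
    static int aplic_parse_node_sketch(struct device_node *np)
    {
            struct device_node *child;
            u32 nr_sources;
            int i, nr_children;

            /* Mandatory per the schema above (1..1023). */
            if (of_property_read_u32(np, "riscv,num-sources", &nr_sources))
                    return -EINVAL;

            /* Each riscv,children entry is a bare phandle, so no cells name. */
            nr_children = of_count_phandle_with_args(np, "riscv,children", NULL);
            for (i = 0; i < nr_children; i++) {
                    child = of_parse_phandle(np, "riscv,children", i);
                    if (!child)
                            continue;
                    /* The child index is simply i, as the binding text describes. */
                    of_node_put(child);
            }

            return 0;
    }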
diff --git a/Documentation/devicetree/bindings/interrupt-controller/riscv,imsics.yaml b/Documentation/devicetree/bindings/interrupt-controller/riscv,imsics.yaml
new file mode 100644
index 00000000000000..84976f17a4a1db
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/riscv,imsics.yaml
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/interrupt-controller/riscv,imsics.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RISC-V Incoming MSI Controller (IMSIC)
+
+maintainers:
+ - Anup Patel <anup@brainfault.org>
+
+description: |
+ The RISC-V advanced interrupt architecture (AIA) defines a per-CPU incoming
+ MSI controller (IMSIC) for handling MSIs in a RISC-V platform. The RISC-V
+ AIA specification can be found at https://github.com/riscv/riscv-aia.
+
+ The IMSIC is a per-CPU (or per-HART) device with separate interrupt file
+ for each privilege level (machine or supervisor). The configuration of
+ a IMSIC interrupt file is done using AIA CSRs and it also has a 4KB MMIO
+ space to receive MSIs from devices. Each IMSIC interrupt file supports a
+ fixed number of interrupt identities (to distinguish MSIs from devices)
+ which is same for given privilege level across CPUs (or HARTs).
+
+ The device tree of a RISC-V platform will have one IMSIC device tree node
+ for each privilege level (machine or supervisor) which collectively describe
+ IMSIC interrupt files at that privilege level across CPUs (or HARTs).
+
+ The arrangement of IMSIC interrupt files in MMIO space of a RISC-V platform
+ follows a particular scheme defined by the RISC-V AIA specification. A IMSIC
+ group is a set of IMSIC interrupt files co-located in MMIO space and we can
+ have multiple IMSIC groups (i.e. clusters, sockets, chiplets, etc) in a
+ RISC-V platform. The MSI target address of a IMSIC interrupt file at given
+ privilege level (machine or supervisor) encodes group index, HART index,
+ and guest index (shown below).
+
+ XLEN-1 > (HART Index MSB) 12 0
+ | | | |
+ -------------------------------------------------------------
+ |xxxxxx|Group Index|xxxxxxxxxxx|HART Index|Guest Index| 0 |
+ -------------------------------------------------------------
+
+allOf:
+ - $ref: /schemas/interrupt-controller.yaml#
+ - $ref: /schemas/interrupt-controller/msi-controller.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - qemu,imsics
+ - const: riscv,imsics
+
+ reg:
+ minItems: 1
+ maxItems: 16384
+ description:
+ Base address of each IMSIC group.
+
+ interrupt-controller: true
+
+ "#interrupt-cells":
+ const: 0
+
+ msi-controller: true
+
+ "#msi-cells":
+ const: 0
+
+ interrupts-extended:
+ minItems: 1
+ maxItems: 16384
+ description:
+ This property represents the set of CPUs (or HARTs) for which given
+ device tree node describes the IMSIC interrupt files. Each node pointed
+ to should be a riscv,cpu-intc node, which has a CPU node (i.e. RISC-V
+ HART) as parent.
+
+ riscv,num-ids:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 63
+ maximum: 2047
+ description:
+ Number of interrupt identities supported by IMSIC interrupt file.
+
+ riscv,num-guest-ids:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 63
+ maximum: 2047
+ description:
+ Number of interrupt identities supported by IMSIC guest interrupt
+ file. When not specified it is assumed to be same as specified by the
+ riscv,num-ids property.
+
+ riscv,guest-index-bits:
+ minimum: 0
+ maximum: 7
+ default: 0
+ description:
+ Number of guest index bits in the MSI target address.
+
+ riscv,hart-index-bits:
+ minimum: 0
+ maximum: 15
+ description:
+ Number of HART index bits in the MSI target address. When not
+ specified it is calculated based on the interrupts-extended property.
+
+ riscv,group-index-bits:
+ minimum: 0
+ maximum: 7
+ default: 0
+ description:
+ Number of group index bits in the MSI target address.
+
+ riscv,group-index-shift:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 0
+ maximum: 55
+ default: 24
+ description:
+ The least significant bit position of the group index bits in the
+ MSI target address.
+
+required:
+ - compatible
+ - reg
+ - interrupt-controller
+ - msi-controller
+ - "#msi-cells"
+ - interrupts-extended
+ - riscv,num-ids
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ // Example 1 (Machine-level IMSIC files with just one group):
+
+ interrupt-controller@24000000 {
+ compatible = "qemu,imsics", "riscv,imsics";
+ interrupts-extended = <&cpu1_intc 11>,
+ <&cpu2_intc 11>,
+ <&cpu3_intc 11>,
+ <&cpu4_intc 11>;
+ reg = <0x28000000 0x4000>;
+ interrupt-controller;
+ #interrupt-cells = <0>;
+ msi-controller;
+ #msi-cells = <0>;
+ riscv,num-ids = <127>;
+ };
+
+ - |
+ // Example 2 (Supervisor-level IMSIC files with two groups):
+
+ interrupt-controller@28000000 {
+ compatible = "qemu,imsics", "riscv,imsics";
+ interrupts-extended = <&cpu1_intc 9>,
+ <&cpu2_intc 9>,
+ <&cpu3_intc 9>,
+ <&cpu4_intc 9>;
+ reg = <0x28000000 0x2000>, /* Group0 IMSICs */
+ <0x29000000 0x2000>; /* Group1 IMSICs */
+ interrupt-controller;
+ #interrupt-cells = <0>;
+ msi-controller;
+ #msi-cells = <0>;
+ riscv,num-ids = <127>;
+ riscv,group-index-bits = <1>;
+ riscv,group-index-shift = <24>;
+ };
+...
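The MSI target address layout described above can be condensed into a small helper. The sketch below is derived only from the binding text (a base address with all index fields zero, one 4KB interrupt file per guest index) and is not taken from the in-tree IMSIC driver:

    #include <linux/types.h>

    /*
     * Sketch: MSI target address of one interrupt file, following the
     * |Group Index|...|HART Index|Guest Index|12-bit zero| layout above.
     */
    static u64 imsic_file_addr_sketch(u64 base, u32 group, u32 hart, u32 guest,
                                      u32 guest_index_bits, u32 group_index_shift)
    {
            u64 addr = base;

            addr |= (u64)group << group_index_shift;
            addr |= (u64)hart << (guest_index_bits + 12);
            addr |= (u64)guest << 12;       /* each interrupt file is 4KB */

            return addr;
    }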
diff --git a/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.yaml b/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.yaml
index 00c10a8258f138..9967e57b449b0a 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.yaml
+++ b/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.yaml
@@ -89,8 +89,23 @@ examples:
reg = <0x5000d000 0x400>;
};
+ - |
//Example 2
- exti2: interrupt-controller@40013c00 {
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ exti2: interrupt-controller@5000d000 {
+ compatible = "st,stm32mp1-exti", "syscon";
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ reg = <0x5000d000 0x400>;
+ interrupts-extended =
+ <&intc GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>;
+ };
+
+ - |
+ //Example 3
+ exti3: interrupt-controller@40013c00 {
compatible = "st,stm32-exti";
interrupt-controller;
#interrupt-cells = <2>;
diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst
index 497bb39727c8b0..64739968afa632 100644
--- a/Documentation/process/maintainer-tip.rst
+++ b/Documentation/process/maintainer-tip.rst
@@ -409,20 +409,20 @@ See :ref:`resend_reminders`.
Merge window
^^^^^^^^^^^^
-Please do not expect large patch series to be handled during the merge
-window or even during the week before. Such patches should be submitted in
-mergeable state *at* *least* a week before the merge window opens.
-Exceptions are made for bug fixes and *sometimes* for small standalone
-drivers for new hardware or minimally invasive patches for hardware
-enablement.
+Please do not expect patches to be reviewed or merged by tip
+maintainers around or during the merge window. The trees are closed
+to all but urgent fixes during this time. They reopen once the merge
+window closes and a new -rc1 kernel has been released.
+
+Large series should be submitted in mergeable state *at* *least* a week
+before the merge window opens. Exceptions are made for bug fixes and
+*sometimes* for small standalone drivers for new hardware or minimally
+invasive patches for hardware enablement.
During the merge window, the maintainers instead focus on following the
upstream changes, fixing merge window fallout, collecting bug fixes, and
allowing themselves a breath. Please respect that.
-The release candidate -rc1 is the starting point for new patches to be
-applied which are targeted for the next merge window.
-
So called _urgent_ branches will be merged into mainline during the
stabilization phase of each release.
diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst
index e57ad28301bde9..5e996fe973b1cd 100644
--- a/Documentation/scheduler/sched-domains.rst
+++ b/Documentation/scheduler/sched-domains.rst
@@ -31,21 +31,21 @@ is treated as one entity. The load of a group is defined as the sum of the
load of each of its member CPUs, and only when the load of a group becomes
out of balance are tasks moved between groups.
-In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU
-through scheduler_tick(). It raises a softirq after the next regularly scheduled
+In kernel/sched/core.c, sched_balance_trigger() is run periodically on each CPU
+through sched_tick(). It raises a softirq after the next regularly scheduled
rebalancing event for the current runqueue has arrived. The actual load
-balancing workhorse, run_rebalance_domains()->rebalance_domains(), is then run
+balancing workhorse, sched_balance_softirq()->sched_balance_domains(), is then run
in softirq context (SCHED_SOFTIRQ).
The latter function takes two arguments: the runqueue of current CPU and whether
-the CPU was idle at the time the scheduler_tick() happened and iterates over all
+the CPU was idle at the time the sched_tick() happened and iterates over all
sched domains our CPU is on, starting from its base domain and going up the ->parent
chain. While doing that, it checks to see if the current domain has exhausted its
-rebalance interval. If so, it runs load_balance() on that domain. It then checks
+rebalance interval. If so, it runs sched_balance_rq() on that domain. It then checks
the parent sched_domain (if it exists), and the parent of the parent and so
forth.
-Initially, load_balance() finds the busiest group in the current sched domain.
+Initially, sched_balance_rq() finds the busiest group in the current sched domain.
If it succeeds, it looks for the busiest runqueue of all the CPUs' runqueues in
that group. If it manages to find such a runqueue, it locks both our initial
CPU's runqueue and the newly found busiest one and starts moving tasks from it
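To make the call chain above concrete, here is a heavily condensed sketch of the domain walk. It is illustrative only: the real code in kernel/sched/fair.c also handles per-idle-type balance intervals, serialization and balancing cost, all omitted here.

    /* Sketch of the ->parent walk performed by sched_balance_domains(). */
    static void sched_balance_domains_sketch(struct rq *rq, enum cpu_idle_type idle)
    {
            int continue_balancing = 1;
            struct sched_domain *sd;

            for (sd = rq->sd; sd; sd = sd->parent) {
                    /* Only rebalance once this domain's interval has elapsed. */
                    if (!time_after_eq(jiffies, sd->last_balance + sd->balance_interval))
                            continue;

                    sched_balance_rq(cpu_of(rq), rq, sd, idle, &continue_balancing);
                    sd->last_balance = jiffies;

                    if (!continue_balancing)
                            break;
            }
    }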
diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst
index 03c06291599800..7c2b16c4729d33 100644
--- a/Documentation/scheduler/sched-stats.rst
+++ b/Documentation/scheduler/sched-stats.rst
@@ -2,6 +2,11 @@
Scheduler Statistics
====================
+Version 16 of schedstats changed the order of definitions within
+'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES]
+columns in show_schedstat(). In particular the position of CPU_IDLE
+and __CPU_NOT_IDLE changed places. The size of the array is unchanged.
+
Version 15 of schedstats dropped counters for some sched_yield:
yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is
identical to version 14.
@@ -72,53 +77,53 @@ domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
The first field is a bit mask indicating what cpus this domain operates over.
-The next 24 are a variety of load_balance() statistics in grouped into types
+The next 24 are a variety of sched_balance_rq() statistics in grouped into types
of idleness (idle, busy, and newly idle):
- 1) # of times in this domain load_balance() was called when the
+ 1) # of times in this domain sched_balance_rq() was called when the
cpu was idle
- 2) # of times in this domain load_balance() checked but found
+ 2) # of times in this domain sched_balance_rq() checked but found
the load did not require balancing when the cpu was idle
- 3) # of times in this domain load_balance() tried to move one or
+ 3) # of times in this domain sched_balance_rq() tried to move one or
more tasks and failed, when the cpu was idle
4) sum of imbalances discovered (if any) with each call to
- load_balance() in this domain when the cpu was idle
+ sched_balance_rq() in this domain when the cpu was idle
5) # of times in this domain pull_task() was called when the cpu
was idle
6) # of times in this domain pull_task() was called even though
the target task was cache-hot when idle
- 7) # of times in this domain load_balance() was called but did
+ 7) # of times in this domain sched_balance_rq() was called but did
not find a busier queue while the cpu was idle
8) # of times in this domain a busier queue was found while the
cpu was idle but no busier group was found
- 9) # of times in this domain load_balance() was called when the
+ 9) # of times in this domain sched_balance_rq() was called when the
cpu was busy
- 10) # of times in this domain load_balance() checked but found the
+ 10) # of times in this domain sched_balance_rq() checked but found the
load did not require balancing when busy
- 11) # of times in this domain load_balance() tried to move one or
+ 11) # of times in this domain sched_balance_rq() tried to move one or
more tasks and failed, when the cpu was busy
12) sum of imbalances discovered (if any) with each call to
- load_balance() in this domain when the cpu was busy
+ sched_balance_rq() in this domain when the cpu was busy
13) # of times in this domain pull_task() was called when busy
14) # of times in this domain pull_task() was called even though the
target task was cache-hot when busy
- 15) # of times in this domain load_balance() was called but did not
+ 15) # of times in this domain sched_balance_rq() was called but did not
find a busier queue while the cpu was busy
16) # of times in this domain a busier queue was found while the cpu
was busy but no busier group was found
- 17) # of times in this domain load_balance() was called when the
+ 17) # of times in this domain sched_balance_rq() was called when the
cpu was just becoming idle
- 18) # of times in this domain load_balance() checked but found the
+ 18) # of times in this domain sched_balance_rq() checked but found the
load did not require balancing when the cpu was just becoming idle
- 19) # of times in this domain load_balance() tried to move one or more
+ 19) # of times in this domain sched_balance_rq() tried to move one or more
tasks and failed, when the cpu was just becoming idle
20) sum of imbalances discovered (if any) with each call to
- load_balance() in this domain when the cpu was just becoming idle
+ sched_balance_rq() in this domain when the cpu was just becoming idle
21) # of times in this domain pull_task() was called when newly idle
22) # of times in this domain pull_task() was called even though the
target task was cache-hot when just becoming idle
- 23) # of times in this domain load_balance() was called but did not
+ 23) # of times in this domain sched_balance_rq() was called but did not
find a busier queue while the cpu was just becoming idle
24) # of times in this domain a busier queue was found while the cpu
was just becoming idle but no busier group was found
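A compact way to remember the 24 columns documented above: they are three blocks of eight counters, one block per idleness type, in the order just listed. The indexing helper below is purely illustrative; the enum names are invented for this sketch and do not come from the kernel.

    /* Illustrative indexing of the 24 per-domain schedstat columns above. */
    enum lb_column {
            LB_CALLED, LB_BALANCED, LB_FAILED, LB_IMBALANCE,
            LB_PULLED, LB_HOT_PULLED, LB_NO_BUSY_QUEUE, LB_NO_BUSY_GROUP,
            LB_NR_COLUMNS,
    };

    /* type: 0 = idle, 1 = busy, 2 = newly idle (order of the list above). */
    static inline int schedstat_column(int type, enum lb_column col)
    {
            return type * LB_NR_COLUMNS + col;      /* 0..23 maps to items 1)..24) */
    }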
diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst
index e814d4c0114194..06363169c56b43 100644
--- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst
+++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst
@@ -34,17 +34,17 @@ CPU共享。任æ„两个组的CPU掩ç çš„交集ä¸ä¸€å®šä¸ºç©ºï¼Œå¦‚果是这ç§
调度域中的负载å‡è¡¡å‘生在调度组中。也就是说,æ¯ä¸ªç»„被视为一个实体。组的负载被定义为它
管辖的æ¯ä¸ªCPU的负载之和。仅当组的负载ä¸å‡è¡¡åŽï¼Œä»»åŠ¡æ‰åœ¨ç»„之间å‘生è¿ç§»ã€‚
-在kernel/sched/core.c中,trigger_load_balance()在æ¯ä¸ªCPU上通过scheduler_tick()
+在kernel/sched/core.c中,sched_balance_trigger()在æ¯ä¸ªCPU上通过sched_tick()
周期执行。在当å‰è¿è¡Œé˜Ÿåˆ—下一个定期调度å†å¹³è¡¡äº‹ä»¶åˆ°è¾¾åŽï¼Œå®ƒå¼•å‘一个软中断。负载å‡è¡¡çœŸæ­£
-的工作由run_rebalance_domains()->rebalance_domains()完æˆï¼Œåœ¨è½¯ä¸­æ–­ä¸Šä¸‹æ–‡ä¸­æ‰§è¡Œ
+的工作由sched_balance_softirq()->sched_balance_domains()完æˆï¼Œåœ¨è½¯ä¸­æ–­ä¸Šä¸‹æ–‡ä¸­æ‰§è¡Œ
(SCHED_SOFTIRQ)。
-åŽä¸€ä¸ªå‡½æ•°æœ‰ä¸¤ä¸ªå…¥å‚:当å‰CPUçš„è¿è¡Œé˜Ÿåˆ—ã€å®ƒåœ¨scheduler_tick()调用时是å¦ç©ºé—²ã€‚函数会从
+åŽä¸€ä¸ªå‡½æ•°æœ‰ä¸¤ä¸ªå…¥å‚:当å‰CPUçš„è¿è¡Œé˜Ÿåˆ—ã€å®ƒåœ¨sched_tick()调用时是å¦ç©ºé—²ã€‚函数会从
当å‰CPU所在的基调度域开始迭代执行,并沿ç€parent指针链å‘上进入更高层级的调度域。在迭代
过程中,函数会检查当å‰è°ƒåº¦åŸŸæ˜¯å¦å·²ç»è€—尽了å†å¹³è¡¡çš„时间间隔,如果是,它在该调度域è¿è¡Œ
-load_balance()。接下æ¥å®ƒæ£€æŸ¥çˆ¶è°ƒåº¦åŸŸï¼ˆå¦‚果存在),å†åŽæ¥çˆ¶è°ƒåº¦åŸŸçš„父调度域,以此类推。
+sched_balance_rq()。接下æ¥å®ƒæ£€æŸ¥çˆ¶è°ƒåº¦åŸŸï¼ˆå¦‚果存在),å†åŽæ¥çˆ¶è°ƒåº¦åŸŸçš„父调度域,以此类推。
-èµ·åˆï¼Œload_balance()查找当å‰è°ƒåº¦åŸŸä¸­æœ€ç¹å¿™çš„调度组。如果æˆåŠŸï¼Œåœ¨è¯¥è°ƒåº¦ç»„管辖的全部CPU
+èµ·åˆï¼Œsched_balance_rq()查找当å‰è°ƒåº¦åŸŸä¸­æœ€ç¹å¿™çš„调度组。如果æˆåŠŸï¼Œåœ¨è¯¥è°ƒåº¦ç»„管辖的全部CPU
çš„è¿è¡Œé˜Ÿåˆ—中找出最ç¹å¿™çš„è¿è¡Œé˜Ÿåˆ—。如能找到,对当å‰çš„CPUè¿è¡Œé˜Ÿåˆ—和新找到的最ç¹å¿™è¿è¡Œ
队列å‡åŠ é”,并把任务从最ç¹å¿™é˜Ÿåˆ—中è¿ç§»åˆ°å½“å‰CPU上。被è¿ç§»çš„任务数é‡ç­‰äºŽåœ¨å…ˆå‰è¿­ä»£æ‰§è¡Œ
中计算出的该调度域的调度组的ä¸å‡è¡¡å€¼ã€‚
diff --git a/Documentation/translations/zh_CN/scheduler/sched-stats.rst b/Documentation/translations/zh_CN/scheduler/sched-stats.rst
index c5e0be6638373f..09eee251761072 100644
--- a/Documentation/translations/zh_CN/scheduler/sched-stats.rst
+++ b/Documentation/translations/zh_CN/scheduler/sched-stats.rst
@@ -75,42 +75,42 @@ domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
ç¹å¿™ï¼Œæ–°ç©ºé—²ï¼‰ï¼š
- 1) 当CPU空闲时,load_balance()在这个调度域中被调用了#次
- 2) 当CPU空闲时,load_balance()在这个调度域中被调用,但是å‘现负载无需
+ 1) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用了#次
+ 2) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,但是å‘现负载无需
å‡è¡¡#次
- 3) 当CPU空闲时,load_balance()在这个调度域中被调用,试图è¿ç§»1个或更多
+ 3) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,试图è¿ç§»1个或更多
任务且失败了#次
- 4) 当CPU空闲时,load_balance()在这个调度域中被调用,å‘现ä¸å‡è¡¡ï¼ˆå¦‚果有)
+ 4) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,å‘现ä¸å‡è¡¡ï¼ˆå¦‚果有)
#次
5) 当CPU空闲时,pull_task()在这个调度域中被调用#次
6) 当CPU空闲时,尽管目标任务是热缓存状æ€ï¼Œpull_task()ä¾ç„¶è¢«è°ƒç”¨#次
- 7) 当CPU空闲时,load_balance()在这个调度域中被调用,未能找到更ç¹å¿™çš„
+ 7) 当CPU空闲时,sched_balance_rq()在这个调度域中被调用,未能找到更ç¹å¿™çš„
队列#次
8) 当CPU空闲时,在调度域中找到了更ç¹å¿™çš„队列,但未找到更ç¹å¿™çš„调度组
#次
- 9) 当CPUç¹å¿™æ—¶ï¼Œload_balance()在这个调度域中被调用了#次
- 10) 当CPUç¹å¿™æ—¶ï¼Œload_balance()在这个调度域中被调用,但是å‘现负载无需
+ 9) 当CPUç¹å¿™æ—¶ï¼Œsched_balance_rq()在这个调度域中被调用了#次
+ 10) 当CPUç¹å¿™æ—¶ï¼Œsched_balance_rq()在这个调度域中被调用,但是å‘现负载无需
å‡è¡¡#次
- 11) 当CPUç¹å¿™æ—¶ï¼Œload_balance()在这个调度域中被调用,试图è¿ç§»1个或更多
+ 11) 当CPUç¹å¿™æ—¶ï¼Œsched_balance_rq()在这个调度域中被调用,试图è¿ç§»1个或更多
任务且失败了#次
- 12) 当CPUç¹å¿™æ—¶ï¼Œload_balance()在这个调度域中被调用,å‘现ä¸å‡è¡¡ï¼ˆå¦‚果有)
+ 12) 当CPUç¹å¿™æ—¶ï¼Œsched_balance_rq()在这个调度域中被调用,å‘现ä¸å‡è¡¡ï¼ˆå¦‚果有)
#次
13) 当CPUç¹å¿™æ—¶ï¼Œpull_task()在这个调度域中被调用#次
14) 当CPUç¹å¿™æ—¶ï¼Œå°½ç®¡ç›®æ ‡ä»»åŠ¡æ˜¯çƒ­ç¼“存状æ€ï¼Œpull_task()ä¾ç„¶è¢«è°ƒç”¨#次
- 15) 当CPUç¹å¿™æ—¶ï¼Œload_balance()在这个调度域中被调用,未能找到更ç¹å¿™çš„
+ 15) 当CPUç¹å¿™æ—¶ï¼Œsched_balance_rq()在这个调度域中被调用,未能找到更ç¹å¿™çš„
队列#次
16) 当CPUç¹å¿™æ—¶ï¼Œåœ¨è°ƒåº¦åŸŸä¸­æ‰¾åˆ°äº†æ›´ç¹å¿™çš„队列,但未找到更ç¹å¿™çš„调度组
#次
- 17) 当CPU新空闲时,load_balance()在这个调度域中被调用了#次
- 18) 当CPU新空闲时,load_balance()在这个调度域中被调用,但是å‘现负载无需
+ 17) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用了#次
+ 18) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,但是å‘现负载无需
å‡è¡¡#次
- 19) 当CPU新空闲时,load_balance()在这个调度域中被调用,试图è¿ç§»1个或更多
+ 19) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,试图è¿ç§»1个或更多
任务且失败了#次
- 20) 当CPU新空闲时,load_balance()在这个调度域中被调用,å‘现ä¸å‡è¡¡ï¼ˆå¦‚果有)
+ 20) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,å‘现ä¸å‡è¡¡ï¼ˆå¦‚果有)
#次
21) 当CPU新空闲时,pull_task()在这个调度域中被调用#次
22) 当CPU新空闲时,尽管目标任务是热缓存状æ€ï¼Œpull_task()ä¾ç„¶è¢«è°ƒç”¨#次
- 23) 当CPU新空闲时,load_balance()在这个调度域中被调用,未能找到更ç¹å¿™çš„
+ 23) 当CPU新空闲时,sched_balance_rq()在这个调度域中被调用,未能找到更ç¹å¿™çš„
队列#次
24) 当CPU新空闲时,在调度域中找到了更ç¹å¿™çš„队列,但未找到更ç¹å¿™çš„调度组
#次
diff --git a/MAINTAINERS b/MAINTAINERS
index aef24daa919e81..240a93bf13d486 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11516,6 +11516,7 @@ S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
F: Documentation/core-api/irq/irq-domain.rst
F: include/linux/irqdomain.h
+F: include/linux/irqdomain_defs.h
F: kernel/irq/irqdomain.c
F: kernel/irq/msi.c
@@ -11525,6 +11526,10 @@ L: linux-kernel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
F: include/linux/group_cpus.h
+F: include/linux/irq.h
+F: include/linux/irqhandler.h
+F: include/linux/irqnr.h
+F: include/linux/irqreturn.h
F: kernel/irq/
F: lib/group_cpus.c
@@ -11535,6 +11540,7 @@ S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
F: Documentation/devicetree/bindings/interrupt-controller/
F: drivers/irqchip/
+F: include/linux/irqchip.h
ISA
M: William Breathitt Gray <william.gray@linaro.org>
@@ -19078,6 +19084,20 @@ S: Maintained
F: drivers/mtd/nand/raw/r852.c
F: drivers/mtd/nand/raw/r852.h
+RISC-V AIA DRIVERS
+M: Anup Patel <anup@brainfault.org>
+L: linux-riscv@lists.infradead.org
+S: Maintained
+F: Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml
+F: Documentation/devicetree/bindings/interrupt-controller/riscv,imsics.yaml
+F: drivers/irqchip/irq-riscv-aplic-*.c
+F: drivers/irqchip/irq-riscv-aplic-*.h
+F: drivers/irqchip/irq-riscv-imsic-*.c
+F: drivers/irqchip/irq-riscv-imsic-*.h
+F: drivers/irqchip/irq-riscv-intc.c
+F: include/linux/irqchip/riscv-aplic.h
+F: include/linux/irqchip/riscv-imsic.h
+
RISC-V ARCHITECTURE
M: Paul Walmsley <paul.walmsley@sifive.com>
M: Palmer Dabbelt <palmer@dabbelt.com>
diff --git a/arch/arm/boot/dts/st/stm32mp131.dtsi b/arch/arm/boot/dts/st/stm32mp131.dtsi
index ecfa120827ba01..6704ceef284d31 100644
--- a/arch/arm/boot/dts/st/stm32mp131.dtsi
+++ b/arch/arm/boot/dts/st/stm32mp131.dtsi
@@ -783,10 +783,82 @@
};
exti: interrupt-controller@5000d000 {
- compatible = "st,stm32mp13-exti", "syscon";
+ compatible = "st,stm32mp1-exti", "syscon";
interrupt-controller;
#interrupt-cells = <2>;
reg = <0x5000d000 0x400>;
+ interrupts-extended =
+ <&intc GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_0 */
+ <&intc GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 66 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 68 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_10 */
+ <&intc GIC_SPI 43 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 78 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
+ <0>, /* EXTI_20 */
+ <&intc GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 114 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_30 */
+ <&intc GIC_SPI 54 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>, /* EXTI_40 */
+ <0>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 96 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_50 */
+ <0>,
+ <&intc GIC_SPI 118 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 119 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>, /* EXTI_60 */
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 63 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>; /* EXTI_70 */
};
syscfg: syscon@50020000 {
diff --git a/arch/arm/boot/dts/st/stm32mp151.dtsi b/arch/arm/boot/dts/st/stm32mp151.dtsi
index 16bd6eee32b4e2..90c5c72c87ab7e 100644
--- a/arch/arm/boot/dts/st/stm32mp151.dtsi
+++ b/arch/arm/boot/dts/st/stm32mp151.dtsi
@@ -176,6 +176,81 @@
interrupt-controller;
#interrupt-cells = <2>;
reg = <0x5000d000 0x400>;
+ interrupts-extended =
+ <&intc GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_0 */
+ <&intc GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 64 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 66 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_10 */
+ <&intc GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
+ <0>, /* EXTI_20 */
+ <&intc GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 95 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 71 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 52 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_30 */
+ <&intc GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>, /* EXTI_40 */
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 151 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 139 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_50 */
+ <0>,
+ <&intc GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 135 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>, /* EXTI_60 */
+ <&intc GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 144 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 62 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_70 */
+ <0>,
+ <0>,
+ <&intc GIC_SPI 129 IRQ_TYPE_LEVEL_HIGH>;
};
syscfg: syscon@50020000 {
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 853c4f81ba4a57..ad36b657006746 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -22,9 +22,9 @@
/* Enable topology flag updates */
#define arch_update_cpu_topology topology_update_cpu_topology
-/* Replace task scheduler's default thermal pressure API */
-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure topology_update_thermal_pressure
+/* Replace task scheduler's default HW pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressure topology_update_hw_pressure
#else
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index dc0fb7a8137150..054e9199f30db6 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -626,7 +626,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp,
hw->address &= ~alignment_mask;
hw->ctrl.len <<= offset;
- if (uses_default_overflow_handler(bp)) {
+ if (is_default_overflow_handler(bp)) {
/*
* Mismatch breakpoints are required for single-stepping
* breakpoints.
@@ -798,7 +798,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
* Otherwise, insert a temporary mismatch breakpoint so that
* we can single-step over the watchpoint trigger.
*/
- if (!uses_default_overflow_handler(wp))
+ if (!is_default_overflow_handler(wp))
continue;
step:
enable_single_step(wp, instruction_pointer(regs));
@@ -811,7 +811,7 @@ step:
info->trigger = addr;
pr_debug("watchpoint fired: address = 0x%x\n", info->trigger);
perf_bp_event(wp, regs);
- if (uses_default_overflow_handler(wp))
+ if (is_default_overflow_handler(wp))
enable_single_step(wp, instruction_pointer(regs));
}
@@ -886,7 +886,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
info->trigger = addr;
pr_debug("breakpoint fired: address = 0x%x\n", addr);
perf_bp_event(bp, regs);
- if (uses_default_overflow_handler(bp))
+ if (is_default_overflow_handler(bp))
enable_single_step(bp, addr);
goto unlock;
}
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index ef0058de432bcf..2336ee2aa44a20 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -42,7 +42,7 @@
* can take this difference into account during load balance. A per cpu
* structure is preferred because each CPU updates its own cpu_capacity field
* during the load balance except for idle cores. One idle core is selected
- * to run the rebalance_domains for all idle cores and the cpu_capacity can be
+ * to run the sched_balance_domains for all idle cores and the cpu_capacity can be
* updated during this sequence.
*/
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index a028ea31237899..a52618073de2f7 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -309,6 +309,7 @@ config ARCH_STM32
select GPIOLIB
select PINCTRL
select PINCTRL_STM32MP257
+ select STM32_EXTI
select ARM_SMC_MBOX
select ARM_SCMI_PROTOCOL
select COMMON_CLK_SCMI
diff --git a/arch/arm64/boot/dts/st/stm32mp251.dtsi b/arch/arm64/boot/dts/st/stm32mp251.dtsi
index 4b48e4ed2d284c..dcd0656d67a807 100644
--- a/arch/arm64/boot/dts/st/stm32mp251.dtsi
+++ b/arch/arm64/boot/dts/st/stm32mp251.dtsi
@@ -443,6 +443,99 @@
<&clk_dsi_txbyte>;
};
+ exti1: interrupt-controller@44220000 {
+ compatible = "st,stm32mp1-exti", "syscon";
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ reg = <0x44220000 0x400>;
+ interrupts-extended =
+ <&intc GIC_SPI 268 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_0 */
+ <&intc GIC_SPI 269 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 270 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 271 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 272 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 273 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 274 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 275 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 276 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 277 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 278 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_10 */
+ <&intc GIC_SPI 279 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 280 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 281 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 282 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 283 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 260 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 259 IRQ_TYPE_LEVEL_HIGH>,
+ <0>, /* EXTI_20 */
+ <&intc GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 137 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 181 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 114 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 136 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 126 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_30 */
+ <&intc GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 148 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 149 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 150 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 125 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 152 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 153 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_40 */
+ <&intc GIC_SPI 154 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 155 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 169 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 182 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 209 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 229 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 166 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 215 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 208 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 210 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_50 */
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 171 IRQ_TYPE_LEVEL_HIGH>,
+ <0>, /* EXTI_60 */
+ <&intc GIC_SPI 173 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 220 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 134 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_70 */
+ <0>,
+ <&intc GIC_SPI 224 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 202 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 253 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 254 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 255 IRQ_TYPE_LEVEL_HIGH>,
+ <0>, /* EXTI_80 */
+ <0>,
+ <0>,
+ <&intc GIC_SPI 257 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 258 IRQ_TYPE_LEVEL_HIGH>;
+ };
+
syscfg: syscon@44230000 {
compatible = "st,stm32mp25-syscfg", "syscon";
reg = <0x44230000 0x10000>;
@@ -453,6 +546,8 @@
#size-cells = <1>;
compatible = "st,stm32mp257-pinctrl";
ranges = <0 0x44240000 0xa0400>;
+ interrupt-parent = <&exti1>;
+ st,syscfg = <&exti1 0x60 0xff>;
pins-are-numbered;
gpioa: gpio@44240000 {
@@ -582,6 +677,8 @@
#size-cells = <1>;
compatible = "st,stm32mp257-z-pinctrl";
ranges = <0 0x46200000 0x400>;
+ interrupt-parent = <&exti1>;
+ st,syscfg = <&exti1 0x60 0xff>;
pins-are-numbered;
gpioz: gpio@46200000 {
@@ -597,5 +694,84 @@
};
};
+
+ exti2: interrupt-controller@46230000 {
+ compatible = "st,stm32mp1-exti", "syscon";
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ reg = <0x46230000 0x400>;
+ interrupts-extended =
+ <&intc GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_0 */
+ <&intc GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 26 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 27 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_10 */
+ <&intc GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <0>, /* EXTI_20 */
+ <&intc GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 212 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 151 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 216 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 217 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_30 */
+ <&intc GIC_SPI 218 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 207 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 175 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 177 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 199 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_40 */
+ <0>,
+ <0>,
+ <&intc GIC_SPI 200 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, /* EXTI_50 */
+ <&intc GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>, /* EXTI_60 */
+ <&intc GIC_SPI 221 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 246 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <&intc GIC_SPI 247 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 248 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 249 IRQ_TYPE_LEVEL_HIGH>,
+ <&intc GIC_SPI 256 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>,
+ <&intc GIC_SPI 213 IRQ_TYPE_LEVEL_HIGH>; /* EXTI_70 */
+ };
};
};
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index a323b109b9c44b..0f6ef432fb8403 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
/* Enable topology flag updates */
#define arch_update_cpu_topology topology_update_cpu_topology
-/* Replace task scheduler's default thermal pressure API */
-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure topology_update_thermal_pressure
+/* Replace task scheduler's default HW pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressure topology_update_hw_pressure
#include <asm-generic/topology.h>
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 2f5755192c2b43..722ac45f9f7b16 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -655,7 +655,7 @@ static int breakpoint_handler(unsigned long unused, unsigned long esr,
perf_bp_event(bp, regs);
/* Do we need to handle the stepping? */
- if (uses_default_overflow_handler(bp))
+ if (is_default_overflow_handler(bp))
step = 1;
unlock:
rcu_read_unlock();
@@ -734,7 +734,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
static int watchpoint_report(struct perf_event *wp, unsigned long addr,
struct pt_regs *regs)
{
- int step = uses_default_overflow_handler(wp);
+ int step = is_default_overflow_handler(wp);
struct arch_hw_breakpoint *info = counter_arch_bp(wp);
info->trigger = addr;
diff --git a/arch/mips/dec/setup.c b/arch/mips/dec/setup.c
index 6c3704f51d0de4..87f0a1436bf9cc 100644
--- a/arch/mips/dec/setup.c
+++ b/arch/mips/dec/setup.c
@@ -756,7 +756,7 @@ void __init arch_init_irq(void)
NULL))
pr_err("Failed to register fpu interrupt\n");
desc_fpu = irq_to_desc(irq_fpu);
- fpu_kstat_irq = this_cpu_ptr(desc_fpu->kstat_irqs);
+ fpu_kstat_irq = this_cpu_ptr(&desc_fpu->kstat_irqs->cnt);
}
if (dec_interrupt[DEC_IRQ_CASCADE] >= 0) {
if (request_irq(dec_interrupt[DEC_IRQ_CASCADE], no_action,
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 444154271f2370..800eb64e91ad0a 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -344,7 +344,7 @@ static int smp_boot_one_cpu(int cpuid, struct task_struct *idle)
struct irq_desc *desc = irq_to_desc(i);
if (desc && desc->kstat_irqs)
- *per_cpu_ptr(desc->kstat_irqs, cpuid) = 0;
+ *per_cpu_ptr(desc->kstat_irqs, cpuid) = (struct irqstat) { };
}
#endif
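Both hunks above stem from the same interrupt-core change in this series: desc->kstat_irqs is no longer a per-CPU unsigned int but a per-CPU structure. The sketch below shows only the shape these call sites rely on; it is not the real definition from kernel/irq/.

    /* Sketch of the per-CPU type implied by the two hunks above. */
    struct irqstat_sketch {
            unsigned int    cnt;    /* the old plain per-CPU count lives here now */
    };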
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 61a8d5555cd7eb..e5fdc336c9b225 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -6,5 +6,4 @@ generic-y += agp.h
generic-y += kvm_types.h
generic-y += mcs_spinlock.h
generic-y += qrwlock.h
-generic-y += vtime.h
generic-y += early_ioremap.h
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 4961fb38e4385f..aff858ca99c05d 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -32,23 +32,10 @@
#ifdef CONFIG_PPC64
#define get_accounting(tsk) (&get_paca()->accounting)
#define raw_get_accounting(tsk) (&local_paca->accounting)
-static inline void arch_vtime_task_switch(struct task_struct *tsk) { }
#else
#define get_accounting(tsk) (&task_thread_info(tsk)->accounting)
#define raw_get_accounting(tsk) get_accounting(tsk)
-/*
- * Called from the context switch with interrupts disabled, to charge all
- * accumulated times to the current process, and to prepare accounting on
- * the next process.
- */
-static inline void arch_vtime_task_switch(struct task_struct *prev)
-{
- struct cpu_accounting_data *acct = get_accounting(current);
- struct cpu_accounting_data *acct0 = get_accounting(prev);
-
- acct->starttime = acct0->starttime;
-}
#endif
/*
diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h
index 78302f6c258006..c6390890a60c2f 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -13,6 +13,17 @@
#define VDSO_HAS_TIME 1
+/*
+ * powerpc specific delta calculation.
+ *
+ * This variant removes the masking of the subtraction because the
+ * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
+ * which would result in a pointless operation. The compiler cannot
+ * optimize it away as the mask comes from the vdso data and is not compile
+ * time constant.
+ */
+#define VDSO_DELTA_NOMASK 1
+
static __always_inline int do_syscall_2(const unsigned long _r0, const unsigned long _r3,
const unsigned long _r4)
{
@@ -104,21 +115,6 @@ static inline bool vdso_clocksource_ok(const struct vdso_data *vd)
}
#define vdso_clocksource_ok vdso_clocksource_ok
-/*
- * powerpc specific delta calculation.
- *
- * This variant removes the masking of the subtraction because the
- * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
- * which would result in a pointless operation. The compiler cannot
- * optimize it away as the mask comes from the vdso data and is not compile
- * time constant.
- */
-static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
-{
- return (cycles - last) * mult;
-}
-#define vdso_calc_delta vdso_calc_delta
-
#ifndef __powerpc64__
static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
{
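
The comment above explains why powerpc (and s390 further down) can set VDSO_DELTA_NOMASK instead of open-coding vdso_calc_delta(). A standalone sketch, not taken from the kernel, contrasting the generic masked delta with the no-mask variant:

#include <stdint.h>
#include <stdio.h>

/* Generic form: mask the cycle delta before scaling. */
static uint64_t vdso_delta_masked(uint64_t cycles, uint64_t last,
				  uint64_t mask, uint32_t mult)
{
	return ((cycles - last) & mask) * mult;
}

/* NOMASK form: valid only when the clocksource mask is U64_MAX,
 * so the AND above is a no-op and can be dropped. */
static uint64_t vdso_delta_nomask(uint64_t cycles, uint64_t last, uint32_t mult)
{
	return (cycles - last) * mult;
}

int main(void)
{
	printf("%llu %llu\n",
	       (unsigned long long)vdso_delta_masked(1000, 400, UINT64_MAX, 3),
	       (unsigned long long)vdso_delta_nomask(1000, 400, 3));
	return 0;
}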
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index df20cf201f74d7..c0fdc6d94feee7 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -354,6 +354,28 @@ void vtime_flush(struct task_struct *tsk)
acct->hardirq_time = 0;
acct->softirq_time = 0;
}
+
+/*
+ * Called from the context switch with interrupts disabled, to charge all
+ * accumulated times to the current process, and to prepare accounting on
+ * the next process.
+ */
+void vtime_task_switch(struct task_struct *prev)
+{
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_kernel(prev);
+
+ vtime_flush(prev);
+
+ if (!IS_ENABLED(CONFIG_PPC64)) {
+ struct cpu_accounting_data *acct = get_accounting(current);
+ struct cpu_accounting_data *acct0 = get_accounting(prev);
+
+ acct->starttime = acct0->starttime;
+ }
+}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
void __no_kcsan __delay(unsigned long loops)
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index e429848785033c..f2636414d82a07 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -837,7 +837,7 @@ static inline void this_cpu_inc_rm(unsigned int __percpu *addr)
*/
static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
{
- this_cpu_inc_rm(desc->kstat_irqs);
+ this_cpu_inc_rm(&desc->kstat_irqs->cnt);
__this_cpu_inc(kstat.irqs_sum);
}
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 0010b15e05223d..f961449ca077d1 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -177,6 +177,8 @@ config RISCV
select PCI_DOMAINS_GENERIC if PCI
select PCI_MSI if PCI
select RISCV_ALTERNATIVE if !XIP_KERNEL
+ select RISCV_APLIC
+ select RISCV_IMSIC
select RISCV_INTC
select RISCV_TIMER if RISCV_SBI
select SIFIVE_PLIC
diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h
index db84942eb78ffb..7937765ccfa5b2 100644
--- a/arch/s390/include/asm/vdso/gettimeofday.h
+++ b/arch/s390/include/asm/vdso/gettimeofday.h
@@ -6,16 +6,13 @@
#define VDSO_HAS_CLOCK_GETRES 1
+#define VDSO_DELTA_NOMASK 1
+
#include <asm/syscall.h>
#include <asm/timex.h>
#include <asm/unistd.h>
#include <linux/compiler.h>
-#define vdso_calc_delta __arch_vdso_calc_delta
-static __always_inline u64 __arch_vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
-{
- return (cycles - last) * mult;
-}
static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
{
diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h
index fe17e448c0c571..561c91c1a87c3a 100644
--- a/arch/s390/include/asm/vtime.h
+++ b/arch/s390/include/asm/vtime.h
@@ -2,8 +2,6 @@
#ifndef _S390_VTIME_H
#define _S390_VTIME_H
-#define __ARCH_HAS_VTIME_TASK_SWITCH
-
static inline void update_timer_sys(void)
{
S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer;
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 6f71b0ce1068c4..259496fe0ef9fd 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -29,6 +29,7 @@
#include <asm/hw_irq.h>
#include <asm/stacktrace.h>
#include <asm/softirq_stack.h>
+#include <asm/vtime.h>
#include "entry.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index c77382a673257a..230d010bac9b3c 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -31,6 +31,7 @@
#include <asm/crw.h>
#include <asm/asm-offsets.h>
#include <asm/pai.h>
+#include <asm/vtime.h>
struct mcck_struct {
unsigned int kill_task : 1;
diff --git a/arch/um/include/asm/cpufeature.h b/arch/um/include/asm/cpufeature.h
index 66fe06db872f05..1eb8b834fbec31 100644
--- a/arch/um/include/asm/cpufeature.h
+++ b/arch/um/include/asm/cpufeature.h
@@ -38,8 +38,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define this_cpu_has(bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
- x86_this_cpu_test_bit(bit, \
- (unsigned long __percpu *)&cpu_info.x86_capability))
+ x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
/*
* This macro is for detection of features which need kernel
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 886e0c19ad592f..d99f54514232db 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -171,6 +171,7 @@ config X86
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
select GENERIC_VDSO_TIME_NS
+ select GENERIC_VDSO_OVERFLOW_PROTECT
select GUP_GET_PXX_LOW_HIGH if X86_PAE
select HARDIRQS_SW_RESEND
select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
@@ -503,12 +504,11 @@ config X86_FRED
When enabled, try to use Flexible Return and Event Delivery
instead of the legacy SYSCALL/SYSENTER/IDT architecture for
ring transitions and exception/interrupt handling if the
- system supports.
+ system supports it.
-if X86_32
config X86_BIGSMP
bool "Support for big SMP systems with more than 8 CPUs"
- depends on SMP
+ depends on SMP && X86_32
help
This option is needed for the systems that have more than 8 CPUs.
@@ -521,7 +521,10 @@ config X86_EXTENDED_PLATFORM
systems out there.)
If you enable this option then you'll be able to select support
- for the following (non-PC) 32 bit x86 platforms:
+ for the following non-PC x86 platforms, depending on the value of
+ CONFIG_64BIT.
+
+ 32-bit platforms (CONFIG_64BIT=n):
Goldfish (Android emulator)
AMD Elan
RDC R-321x SoC
@@ -529,28 +532,14 @@ config X86_EXTENDED_PLATFORM
STA2X11-based (e.g. Northville)
Moorestown MID devices
- If you have one of these systems, or if you want to build a
- generic distribution kernel, say Y here - otherwise say N.
-endif # X86_32
-
-if X86_64
-config X86_EXTENDED_PLATFORM
- bool "Support for extended (non-PC) x86 platforms"
- default y
- help
- If you disable this option then the kernel will only support
- standard PC platforms. (which covers the vast majority of
- systems out there.)
-
- If you enable this option then you'll be able to select support
- for the following (non-PC) 64 bit x86 platforms:
+ 64-bit platforms (CONFIG_64BIT=y):
Numascale NumaChip
ScaleMP vSMP
SGI Ultraviolet
If you have one of these systems, or if you want to build a
generic distribution kernel, say Y here - otherwise say N.
-endif # X86_64
+
# This is an alphabetically sorted list of 64 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
config X86_NUMACHIP
@@ -2432,18 +2421,20 @@ source "kernel/livepatch/Kconfig"
endmenu
config CC_HAS_NAMED_AS
- def_bool CC_IS_GCC && GCC_VERSION >= 120100
+ def_bool CC_IS_GCC && GCC_VERSION >= 90100
+
+config CC_HAS_NAMED_AS_FIXED_SANITIZERS
+ def_bool CC_IS_GCC && GCC_VERSION >= 130300
config USE_X86_SEG_SUPPORT
def_bool y
depends on CC_HAS_NAMED_AS
#
- # -fsanitize=kernel-address (KASAN) is at the moment incompatible
- # with named address spaces - see GCC PR sanitizer/111736.
+ # -fsanitize=kernel-address (KASAN) and -fsanitize=thread
+ # (KCSAN) are incompatible with named address spaces with
+ # GCC < 13.3 - see GCC PR sanitizer/111736.
#
- depends on !KASAN
- # -fsanitize=thread (KCSAN) is also incompatible.
- depends on !KCSAN
+ depends on !(KASAN || KCSAN) || CC_HAS_NAMED_AS_FIXED_SANITIZERS
config CC_HAS_SLS
def_bool $(cc-option,-mharden-sls=all)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index bf4a10a5794f1c..1dcb794c5479ed 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -398,6 +398,11 @@ SYM_CODE_START(startup_64)
call sev_enable
#endif
+ /* Preserve only the CR4 bits that must be preserved, and clear the rest */
+ movq %cr4, %rax
+ andl $(X86_CR4_PAE | X86_CR4_MCE | X86_CR4_LA57), %eax
+ movq %rax, %cr4
+
/*
* configure_5level_paging() updates the number of paging levels using
* a trampoline in 32-bit addressable memory if the current number does
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index ec71846d28c9ed..0457a9d7e5150f 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -335,26 +335,6 @@ finish:
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
}
-static void enforce_vmpl0(void)
-{
- u64 attrs;
- int err;
-
- /*
- * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically
- * higher) privilege level. Here, clear the VMPL1 permission mask of the
- * GHCB page. If the guest is not running at VMPL0, this will fail.
- *
- * If the guest is running at VMPL0, it will succeed. Even if that operation
- * modifies permission bits, it is still ok to do so currently because Linux
- * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks
- * changing is a don't-care.
- */
- attrs = 1;
- if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
-}
-
/*
* SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need
* guest side implementation for proper functioning of the guest. If any
@@ -413,6 +393,85 @@ void snp_check_features(void)
}
}
+/* Search for Confidential Computing blob in the EFI config table. */
+static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp)
+{
+ unsigned long cfg_table_pa;
+ unsigned int cfg_table_len;
+ int ret;
+
+ ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len);
+ if (ret)
+ return NULL;
+
+ return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa,
+ cfg_table_len,
+ EFI_CC_BLOB_GUID);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the boot kernel
+ * by firmware/bootloader in the following ways:
+ *
+ * - via an entry in the EFI config table
+ * - via a setup_data structure, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ cc_info = find_cc_blob_efi(bp);
+ if (cc_info)
+ goto found_cc_info;
+
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ return cc_info;
+}
+
+/*
+ * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks
+ * will verify the SNP CPUID/MSR bits.
+ */
+static bool early_snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ /*
+ * If a SNP-specific Confidential Computing blob is present, then
+ * firmware/bootloader have indicated SNP support. Verifying this
+ * involves CPUID checks which will be more reliable if the SNP
+ * CPUID table is used. See comments over snp_setup_cpuid_table() for
+ * more details.
+ */
+ setup_cpuid_table(cc_info);
+
+ /*
+ * Pass run-time kernel a pointer to CC info via boot_params so EFI
+ * config table doesn't need to be searched again during early startup
+ * phase.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
/*
* sev_check_cpu_support - Check for SEV support in the CPU capabilities
*
@@ -463,7 +522,7 @@ void sev_enable(struct boot_params *bp)
bp->cc_blob_address = 0;
/*
- * Do an initial SEV capability check before snp_init() which
+ * Do an initial SEV capability check before early_snp_init() which
* loads the CPUID page and the same checks afterwards are done
* without the hypervisor and are trustworthy.
*
@@ -478,7 +537,7 @@ void sev_enable(struct boot_params *bp)
* Setup/preliminary detection of SNP. This will be sanity-checked
* against CPUID/MSR values later.
*/
- snp = snp_init(bp);
+ snp = early_snp_init(bp);
/* Now repeat the checks with the SNP CPUID table. */
@@ -509,7 +568,20 @@ void sev_enable(struct boot_params *bp)
if (!(get_hv_features() & GHCB_HV_FT_SNP))
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
- enforce_vmpl0();
+ /*
+ * Enforce running at VMPL0.
+ *
+ * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically
+ * higher) privilege level. Here, clear the VMPL1 permission mask of the
+ * GHCB page. If the guest is not running at VMPL0, this will fail.
+ *
+ * If the guest is running at VMPL0, it will succeed. Even if that operation
+ * modifies permission bits, it is still ok to do so currently because Linux
+ * SNP guests only run at VMPL0, so VMPL1 or higher
+ * permission mask changes are a don't-care.
+ */
+ if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
}
if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
@@ -535,85 +607,6 @@ u64 sev_get_status(void)
return m.q;
}
-/* Search for Confidential Computing blob in the EFI config table. */
-static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp)
-{
- unsigned long cfg_table_pa;
- unsigned int cfg_table_len;
- int ret;
-
- ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len);
- if (ret)
- return NULL;
-
- return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa,
- cfg_table_len,
- EFI_CC_BLOB_GUID);
-}
-
-/*
- * Initial set up of SNP relies on information provided by the
- * Confidential Computing blob, which can be passed to the boot kernel
- * by firmware/bootloader in the following ways:
- *
- * - via an entry in the EFI config table
- * - via a setup_data structure, as defined by the Linux Boot Protocol
- *
- * Scan for the blob in that order.
- */
-static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
-{
- struct cc_blob_sev_info *cc_info;
-
- cc_info = find_cc_blob_efi(bp);
- if (cc_info)
- goto found_cc_info;
-
- cc_info = find_cc_blob_setup_data(bp);
- if (!cc_info)
- return NULL;
-
-found_cc_info:
- if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
-
- return cc_info;
-}
-
-/*
- * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks
- * will verify the SNP CPUID/MSR bits.
- */
-bool snp_init(struct boot_params *bp)
-{
- struct cc_blob_sev_info *cc_info;
-
- if (!bp)
- return false;
-
- cc_info = find_cc_blob(bp);
- if (!cc_info)
- return false;
-
- /*
- * If a SNP-specific Confidential Computing blob is present, then
- * firmware/bootloader have indicated SNP support. Verifying this
- * involves CPUID checks which will be more reliable if the SNP
- * CPUID table is used. See comments over snp_setup_cpuid_table() for
- * more details.
- */
- setup_cpuid_table(cc_info);
-
- /*
- * Pass run-time kernel a pointer to CC info via boot_params so EFI
- * config table doesn't need to be searched again during early startup
- * phase.
- */
- bp->cc_blob_address = (u32)(unsigned long)cc_info;
-
- return true;
-}
-
void sev_prep_identity_maps(unsigned long top_level_pgt)
{
/*
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index c4ea5258ab558f..9049f390d8347f 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -119,8 +119,8 @@ static void init_heap(void)
char *stack_end;
if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
- asm("leal %P1(%%esp),%0"
- : "=r" (stack_end) : "i" (-STACK_SIZE));
+ asm("leal %n1(%%esp),%0"
+ : "=r" (stack_end) : "i" (STACK_SIZE));
heap_end = (char *)
((size_t)boot_params.hdr.heap_end_ptr + 0x200);
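
The init_heap() change above relies on the inline-asm %n operand modifier, which emits the negated constant operand with no immediate prefix, so the negation moves out of the C expression. A hedged standalone sketch of the same idea on x86-64 with GCC-style asm; the constant and register choice are illustrative only.

#include <stdio.h>

int main(void)
{
	unsigned long base = 0x1000, end;

	/* "%n1" prints the negation of constant operand 1 (512 here) without
	 * a '$', so this assembles roughly as: lea -512(%rdi), %rax. */
	asm("lea %n1(%2), %0" : "=r" (end) : "i" (512), "r" (base));

	printf("%#lx\n", end);		/* prints 0xe00 */
	return 0;
}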
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index c93e7f5c2a0652..ce1cc1622385b3 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -17,7 +17,7 @@ obj-y += common.o
obj-y += vdso/
obj-y += vsyscall/
-obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o
+obj-$(CONFIG_PREEMPTION) += thunk.o
CFLAGS_entry_fred.o += -fno-stack-protector
CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE)
obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index c779046cc3fe79..11c9b8efdc4ccc 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -7,7 +7,6 @@
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
-#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 52df0dec70da6c..a396f6e6ab5bf9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -374,7 +374,7 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
-453 64 map_shadow_stack sys_map_shadow_stack
+453 common map_shadow_stack sys_map_shadow_stack
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk.S
index 119ebdc3d36239..119ebdc3d36239 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk.S
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
deleted file mode 100644
index da37f42f45498d..00000000000000
--- a/arch/x86/entry/thunk_32.S
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
- * Copyright 2008 by Steven Rostedt, Red Hat, Inc
- * (inspired by Andi Kleen's thunk_64.S)
- */
-
-#include <linux/export.h>
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-#include "calling.h"
-
-THUNK preempt_schedule_thunk, preempt_schedule
-THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
-EXPORT_SYMBOL(preempt_schedule_thunk)
-EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
-
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 985ef3b479191f..1fc4ce44e743c0 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -647,7 +647,7 @@ static void amd_pmu_cpu_dead(int cpu)
}
}
-static inline void amd_pmu_set_global_ctl(u64 ctl)
+static __always_inline void amd_pmu_set_global_ctl(u64 ctl)
{
wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
}
@@ -907,6 +907,37 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
return amd_pmu_adjust_nmi_window(handled);
}
+/*
+ * AMD-specific callback invoked through perf_snapshot_branch_stack static
+ * call, defined in include/linux/perf_event.h. See its definition for API
+ * details. It's up to the caller to provide enough space in *entries* to fit
+ * all LBR records; otherwise the returned result is truncated to *cnt* entries.
+ */
+static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ struct cpu_hw_events *cpuc;
+ unsigned long flags;
+
+ /*
+ * The sequence of steps to freeze LBR should be completely inlined
+ * and contain no branches to minimize contamination of the LBR snapshot.
+ */
+ local_irq_save(flags);
+ amd_pmu_core_disable_all();
+ __amd_pmu_lbr_disable();
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ amd_pmu_lbr_read();
+ cnt = min(cnt, x86_pmu.lbr_nr);
+ memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+
+ amd_pmu_v2_enable_all(0);
+ local_irq_restore(flags);
+
+ return cnt;
+}
+
static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1443,6 +1474,10 @@ static int __init amd_core_pmu_init(void)
static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
+
+ /* Only support branch_stack snapshot on perfmon v2 */
+ if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq)
+ static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack);
} else if (!amd_brs_init()) {
/*
* BRS requires special event constraints and flushing on ctxsw.
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 5149830c7c4fa6..19c7b76e21bcb4 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -310,10 +310,6 @@ int amd_pmu_lbr_hw_config(struct perf_event *event)
{
int ret = 0;
- /* LBR is not recommended in counting mode */
- if (!is_sampling_event(event))
- return -EINVAL;
-
ret = amd_pmu_lbr_setup_filter(event);
if (!ret)
event->attach_state |= PERF_ATTACH_SCHED_CB;
@@ -414,18 +410,11 @@ void amd_pmu_lbr_enable_all(void)
void amd_pmu_lbr_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- u64 dbg_ctl, dbg_extn_cfg;
if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
return;
- rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
- wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
-
- if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
- rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- }
+ __amd_pmu_lbr_disable();
}
__init int amd_pmu_lbr_init(void)
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 326c8cd5aa2d2e..54eb142810fbb6 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -696,78 +696,78 @@ static const struct cstate_model srf_cstates __initconst = {
static const struct x86_cpu_id intel_cstates_match[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhm_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhm_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hswult_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &slm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_D, &slm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &slm_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &hswult_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &hswult_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &hswult_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &hswult_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &cnl_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &srf_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &grr_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &icx_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &adl_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM_EP, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM_EX, &nhm_cstates),
+
+ X86_MATCH_VFM(INTEL_WESTMERE, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_WESTMERE_EP, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_WESTMERE_EX, &nhm_cstates),
+
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_HASWELL, &snb_cstates),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &snb_cstates),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_HASWELL_L, &hswult_cstates),
+
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &slm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &slm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &slm_cstates),
+
+ X86_MATCH_VFM(INTEL_BROADWELL, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SKYLAKE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &hswult_cstates),
+
+ X86_MATCH_VFM(INTEL_CANNONLAKE_L, &cnl_cstates),
+
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &knl_cstates),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &knl_cstates),
+
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates),
+
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &icx_cstates),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &icx_cstates),
+
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &icl_cstates),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ROCKETLAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_cstates),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &adl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 4367aa77cb8d9f..dc641b50814e21 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -2,6 +2,7 @@
#include <linux/perf_event.h>
#include <linux/types.h>
+#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
@@ -1457,7 +1458,7 @@ void __init intel_pmu_lbr_init_atom(void)
* to have an operational LBR which can freeze
* on PMU interrupt
*/
- if (boot_cpu_data.x86_model == 28
+ if (boot_cpu_data.x86_vfm == INTEL_ATOM_BONNELL
&& boot_cpu_data.x86_stepping < 10) {
pr_cont("LBR disabled due to erratum");
return;
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 8e2a12235e62c2..14db6d9d318b3c 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -22,7 +22,7 @@
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/intel_pt.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include "../perf_event.h"
#include "pt.h"
@@ -211,11 +211,11 @@ static int __init pt_pmu_hw_init(void)
}
/* model-specific quirks */
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_D:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_D:
+ case INTEL_BROADWELL_G:
+ case INTEL_BROADWELL_X:
/* not setting BRANCH_EN will #GP, erratum BDM106 */
pt_pmu.branch_en_always_on = true;
break;
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 258e2cdf28fadc..419c517b8594fb 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1829,56 +1829,56 @@ static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
};
static const struct x86_cpu_id intel_uncore_match[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &ivb_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &hsw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hsw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &hsw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &bdw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &bdw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snbep_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhmex_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhmex_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ivbep_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &hswep_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &bdx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &bdx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &mtl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &gnr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &gnr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &gnr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM_EP, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE_EP, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &snb_uncore_init),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &ivb_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_L, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL, &bdw_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &bdw_uncore_init),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &snbep_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM_EX, &nhmex_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE_EX, &nhmex_uncore_init),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ivbep_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &hswep_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &bdx_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &bdx_uncore_init),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &knl_uncore_init),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &knl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_uncore_init),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &icx_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &icx_uncore_init),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &tgl_l_uncore_init),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &tgl_uncore_init),
+ X86_MATCH_VFM(INTEL_ROCKETLAKE, &rkl_uncore_init),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &snr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_uncore_init),
{},
};
MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match);
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index 92da8aaa59660e..466833478e81fb 100644
--- a/arch/x86/events/intel/uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Nehalem-EX/Westmere-EX uncore support */
+#include <asm/cpu_device_id.h>
#include "uncore.h"
/* NHM-EX event control */
@@ -1217,7 +1218,7 @@ static struct intel_uncore_type *nhmex_msr_uncores[] = {
void nhmex_uncore_cpu_init(void)
{
- if (boot_cpu_data.x86_model == 46)
+ if (boot_cpu_data.x86_vfm == INTEL_NEHALEM_EX)
uncore_nhmex = true;
else
nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 2eaf0f339849b6..74b8b21e8990be 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* SandyBridge-EP/IvyTown uncore support */
+#include <asm/cpu_device_id.h>
#include "uncore.h"
#include "uncore_discovery.h"
@@ -3285,7 +3286,7 @@ void bdx_uncore_cpu_init(void)
uncore_msr_uncores = bdx_msr_uncores;
/* Detect systems with no SBOXes */
- if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID))
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_D || hswep_has_limit_sbox(BDX_PCU_DID))
uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
@@ -5394,7 +5395,7 @@ static int icx_iio_get_topology(struct intel_uncore_type *type)
static void icx_iio_set_mapping(struct intel_uncore_type *type)
{
/* Detect ICX-D system. This case is not supported */
- if (boot_cpu_data.x86_model == INTEL_FAM6_ICELAKE_D) {
+ if (boot_cpu_data.x86_vfm == INTEL_ICELAKE_D) {
pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group);
return;
}
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 9e237b30f01745..45b1866ff0513d 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -2,7 +2,7 @@
#include <linux/perf_event.h>
#include <linux/sysfs.h>
#include <linux/nospec.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include "probe.h"
enum perf_msr_id {
@@ -43,75 +43,75 @@ static bool test_intel(int idx, void *data)
boot_cpu_data.x86 != 6)
return false;
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_NEHALEM_G:
- case INTEL_FAM6_NEHALEM_EP:
- case INTEL_FAM6_NEHALEM_EX:
-
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_WESTMERE_EP:
- case INTEL_FAM6_WESTMERE_EX:
-
- case INTEL_FAM6_SANDYBRIDGE:
- case INTEL_FAM6_SANDYBRIDGE_X:
-
- case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_IVYBRIDGE_X:
-
- case INTEL_FAM6_HASWELL:
- case INTEL_FAM6_HASWELL_X:
- case INTEL_FAM6_HASWELL_L:
- case INTEL_FAM6_HASWELL_G:
-
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_D:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_BROADWELL_X:
- case INTEL_FAM6_SAPPHIRERAPIDS_X:
- case INTEL_FAM6_EMERALDRAPIDS_X:
- case INTEL_FAM6_GRANITERAPIDS_X:
- case INTEL_FAM6_GRANITERAPIDS_D:
-
- case INTEL_FAM6_ATOM_SILVERMONT:
- case INTEL_FAM6_ATOM_SILVERMONT_D:
- case INTEL_FAM6_ATOM_AIRMONT:
-
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_D:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
- case INTEL_FAM6_ATOM_TREMONT_D:
- case INTEL_FAM6_ATOM_TREMONT:
- case INTEL_FAM6_ATOM_TREMONT_L:
-
- case INTEL_FAM6_XEON_PHI_KNL:
- case INTEL_FAM6_XEON_PHI_KNM:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_WESTMERE_EX:
+
+ case INTEL_SANDYBRIDGE:
+ case INTEL_SANDYBRIDGE_X:
+
+ case INTEL_IVYBRIDGE:
+ case INTEL_IVYBRIDGE_X:
+
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_X:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_D:
+ case INTEL_BROADWELL_G:
+ case INTEL_BROADWELL_X:
+ case INTEL_SAPPHIRERAPIDS_X:
+ case INTEL_EMERALDRAPIDS_X:
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_GRANITERAPIDS_D:
+
+ case INTEL_ATOM_SILVERMONT:
+ case INTEL_ATOM_SILVERMONT_D:
+ case INTEL_ATOM_AIRMONT:
+
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_D:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ case INTEL_ATOM_TREMONT_D:
+ case INTEL_ATOM_TREMONT:
+ case INTEL_ATOM_TREMONT_L:
+
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
if (idx == PERF_MSR_SMI)
return true;
break;
- case INTEL_FAM6_SKYLAKE_L:
- case INTEL_FAM6_SKYLAKE:
- case INTEL_FAM6_SKYLAKE_X:
- case INTEL_FAM6_KABYLAKE_L:
- case INTEL_FAM6_KABYLAKE:
- case INTEL_FAM6_COMETLAKE_L:
- case INTEL_FAM6_COMETLAKE:
- case INTEL_FAM6_ICELAKE_L:
- case INTEL_FAM6_ICELAKE:
- case INTEL_FAM6_ICELAKE_X:
- case INTEL_FAM6_ICELAKE_D:
- case INTEL_FAM6_TIGERLAKE_L:
- case INTEL_FAM6_TIGERLAKE:
- case INTEL_FAM6_ROCKETLAKE:
- case INTEL_FAM6_ALDERLAKE:
- case INTEL_FAM6_ALDERLAKE_L:
- case INTEL_FAM6_ATOM_GRACEMONT:
- case INTEL_FAM6_RAPTORLAKE:
- case INTEL_FAM6_RAPTORLAKE_P:
- case INTEL_FAM6_RAPTORLAKE_S:
- case INTEL_FAM6_METEORLAKE:
- case INTEL_FAM6_METEORLAKE_L:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_SKYLAKE_X:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
+ case INTEL_COMETLAKE_L:
+ case INTEL_COMETLAKE:
+ case INTEL_ICELAKE_L:
+ case INTEL_ICELAKE:
+ case INTEL_ICELAKE_X:
+ case INTEL_ICELAKE_D:
+ case INTEL_TIGERLAKE_L:
+ case INTEL_TIGERLAKE:
+ case INTEL_ROCKETLAKE:
+ case INTEL_ALDERLAKE:
+ case INTEL_ALDERLAKE_L:
+ case INTEL_ATOM_GRACEMONT:
+ case INTEL_RAPTORLAKE:
+ case INTEL_RAPTORLAKE_P:
+ case INTEL_RAPTORLAKE_S:
+ case INTEL_METEORLAKE:
+ case INTEL_METEORLAKE_L:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
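
This switch, like the cstate and uncore tables earlier, now keys on boot_cpu_data.x86_vfm, a single value packing vendor, family and model. A rough standalone sketch of the idea; the bit layout below is an assumption made for illustration, the real encoding lives in <asm/cpu_device_id.h>.

#include <stdint.h>
#include <stdio.h>

/* Assumed packing, for illustration only: vendor in bits 16-23, family in
 * bits 8-15, model in bits 0-7. */
#define VFM(vendor, family, model) \
	(((uint32_t)(vendor) << 16) | ((uint32_t)(family) << 8) | (uint32_t)(model))

#define VENDOR_INTEL		0
#define FAKE_INTEL_SKYLAKE	VFM(VENDOR_INTEL, 6, 0x5e)

int main(void)
{
	uint32_t x86_vfm = VFM(VENDOR_INTEL, 6, 0x5e);

	/* One switch on the packed value replaces separate family/model tests. */
	switch (x86_vfm) {
	case FAKE_INTEL_SKYLAKE:
		puts("matched Skylake entry");
		break;
	default:
		puts("no match");
	}
	return 0;
}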
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index fb56518356ecfb..72b022a1e16c56 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1329,6 +1329,19 @@ void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);
+static __always_inline void __amd_pmu_lbr_disable(void)
+{
+ u64 dbg_ctl, dbg_extn_cfg;
+
+ rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+ wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
+
+ if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ }
+}
+
#ifdef CONFIG_PERF_EVENTS_AMD_BRS
#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index fb2b1961e5a33a..ca5f687fa4200f 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -675,10 +675,8 @@ static const struct attribute_group *rapl_attr_update[] = {
static int __init init_rapl_pmus(void)
{
int maxdie = topology_max_packages() * topology_max_dies_per_package();
- size_t size;
- size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
- rapl_pmus = kzalloc(size, GFP_KERNEL);
+ rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, maxdie), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
@@ -808,6 +806,9 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE_H, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(LUNARLAKE_M, &model_skl),
{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
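
init_rapl_pmus() above now sizes the allocation with struct_size() instead of the removed open-coded sum. A standalone sketch of the equivalent flexible-array arithmetic; the struct name and fields are illustrative, and the kernel helper additionally guards against overflow.

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for struct rapl_pmus and its trailing pointer array. */
struct pmus_box {
	unsigned int nr;
	void *pmus[];		/* flexible array member */
};

int main(void)
{
	unsigned int maxdie = 8;
	/* Same arithmetic as the removed lines: header plus maxdie pointers. */
	size_t size = sizeof(struct pmus_box) + maxdie * sizeof(void *);
	struct pmus_box *p = calloc(1, size);

	if (!p)
		return 1;
	p->nr = maxdie;
	printf("allocated %zu bytes for %u entries\n", size, p->nr);
	free(p);
	return 0;
}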
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 5c7de79423b8a5..04775346369c59 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -34,7 +34,6 @@ void __init hv_vtl_init_platform(void)
/* Avoid searching for BIOS MP tables */
x86_init.mpparse.find_mptable = x86_init_noop;
x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
- x86_init.mpparse.parse_smp_cfg = x86_init_noop;
x86_platform.get_wallclock = get_rtc_noop;
x86_platform.set_wallclock = set_rtc_noop;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 67b68d0d17d1ec..0cb2396de066d5 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -294,10 +294,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Otherwise, if CPU has feature1, newinstr1 is used.
* Otherwise, oldinstr is used.
*/
-#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2, \
- ft_flags2, input...) \
- asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, \
- newinstr2, ft_flags2) \
+#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2, \
+ ft_flags2, input...) \
+ asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, \
+ newinstr2, ft_flags2) \
: : "i" (0), ## input)
/* Like alternative_input, but with a single output argument */
@@ -307,7 +307,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
/* Like alternative_io, but for replacing a direct call with another one. */
#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \
- asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", ft_flags) \
+ asm_inline volatile (ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \
: output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
/*
@@ -316,12 +316,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Otherwise, if CPU has feature1, function1 is used.
* Otherwise, old function is used.
*/
-#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \
- output, input...) \
- asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", ft_flags1,\
- "call %P[new2]", ft_flags2) \
- : output, ASM_CALL_CONSTRAINT \
- : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
+#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \
+ output, input...) \
+ asm_inline volatile (ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \
+ "call %c[new2]", ft_flags2) \
+ : output, ASM_CALL_CONSTRAINT \
+ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
[new2] "i" (newfunc2), ## input)
/*
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index e6ab0cf15ed573..839c58f95114f5 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -92,7 +92,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
{
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
- alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
+ alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
ASM_OUTPUT2("=r" (v), "=m" (*addr)),
ASM_OUTPUT2("0" (v), "m" (*addr)));
}
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index ca8eed1d496ab4..2bec0c89a95c27 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -229,9 +229,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP);
#define _ASM_EXTABLE_UA(from, to) \
_ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS)
-#define _ASM_EXTABLE_CPY(from, to) \
- _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY)
-
#define _ASM_EXTABLE_FAULT(from, to) \
_ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 55a55ec0435020..55b4d24356eacc 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -86,11 +86,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
}
#define arch_atomic_add_return arch_atomic_add_return
-static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
-{
- return arch_atomic_add_return(-i, v);
-}
-#define arch_atomic_sub_return arch_atomic_sub_return
+#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v)
static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
@@ -98,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
}
#define arch_atomic_fetch_add arch_atomic_fetch_add
-static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v)
-{
- return xadd(&v->counter, -i);
-}
-#define arch_atomic_fetch_sub arch_atomic_fetch_sub
+#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v)
static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
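
arch_atomic_sub_return() and arch_atomic_fetch_sub() become thin macros over their add counterparts; the parentheses in -(i) matter as soon as the argument is an expression. A tiny standalone illustration:

#include <stdio.h>

#define NEG_BAD(i)	(-i)		/* wrong once i is an expression */
#define NEG_GOOD(i)	(-(i))

int main(void)
{
	int a = 5, b = 3;

	/* -a - b = -8 versus -(a - b) = -2: the extra parentheses are what
	 * makes arch_atomic_sub_return(a - b, v) subtract the right amount. */
	printf("%d %d\n", NEG_BAD(a - b), NEG_GOOD(a - b));
	return 0;
}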
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 3486d91b8595f1..8db2ec4d6cdac7 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,6 +14,32 @@ typedef struct {
#define ATOMIC64_INIT(val) { (val) }
+/*
+ * Read an atomic64_t non-atomically.
+ *
+ * This is intended to be used in cases where a subsequent atomic operation
+ * will handle the torn value, and can be used to prime the first iteration
+ * of unconditional try_cmpxchg() loops, e.g.:
+ *
+ * s64 val = arch_atomic64_read_nonatomic(v);
+ * do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
+ *
+ * This is NOT safe to use where the value is not always checked by a
+ * subsequent atomic operation, such as in conditional try_cmpxchg() loops
+ * that can break before the atomic operation, e.g.:
+ *
+ * s64 val = arch_atomic64_read_nonatomic(v);
+ * do {
+ * if (condition(val))
+ * break;
+ * } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
+ */
+static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v)
+{
+ /* See comment in arch_atomic_read(). */
+ return __READ_ONCE(v->counter);
+}
+
#define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...)
#ifndef ATOMIC64_EXPORT
#define ATOMIC64_DECL_ONE __ATOMIC64_DECL
@@ -24,7 +50,7 @@ typedef struct {
#ifdef CONFIG_X86_CMPXCHG64
#define __alternative_atomic64(f, g, out, in...) \
- asm volatile("call %P[func]" \
+ asm volatile("call %c[func]" \
: out : [func] "i" (atomic64_##g##_cx8), ## in)
#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8)
@@ -61,12 +87,18 @@ ATOMIC64_DECL(add_unless);
#undef __ATOMIC64_DECL
#undef ATOMIC64_EXPORT
-static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
- return arch_cmpxchg64(&v->counter, o, n);
+ return arch_cmpxchg64(&v->counter, old, new);
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
+static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
+{
+ return arch_try_cmpxchg64(&v->counter, old, new);
+}
+#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
+
static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
{
s64 o;
@@ -195,69 +227,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
}
static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
}
static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
}
static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
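
The rewritten 32-bit atomic64 helpers above prime an unconditional try_cmpxchg() loop with a non-atomic read and let the compare-exchange repair any torn value. A minimal C11 userspace sketch of the same pattern, not the kernel's arch_ helpers:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* fetch_or built from a compare-exchange loop, primed with a relaxed read;
 * a stale first read is harmless because compare_exchange reloads *v into
 * 'val' on failure, mirroring arch_atomic64_try_cmpxchg(). */
static int64_t fetch_or64(_Atomic int64_t *v, int64_t i)
{
	int64_t val = atomic_load_explicit(v, memory_order_relaxed);

	while (!atomic_compare_exchange_weak(v, &val, val | i))
		;
	return val;
}

int main(void)
{
	_Atomic int64_t v = 0x10;
	int64_t old = fetch_or64(&v, 0x01);

	printf("old=%#llx new=%#llx\n", (long long)old,
	       (long long)atomic_load(&v));
	return 0;
}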
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 3165c0feedf742..ae12acae5b06a0 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -80,11 +80,7 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
}
#define arch_atomic64_add_return arch_atomic64_add_return
-static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
-{
- return arch_atomic64_add_return(-i, v);
-}
-#define arch_atomic64_sub_return arch_atomic64_sub_return
+#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v)
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
@@ -92,11 +88,7 @@ static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
-static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
-{
- return xadd(&v->counter, -i);
-}
-#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
+#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v)
static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index a3e0be0470a400..3e5b111e619d4e 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -6,11 +6,6 @@
#include <asm/pgtable_types.h>
#include <uapi/asm/boot.h>
-/* Physical address where kernel should be loaded. */
-#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
- + (CONFIG_PHYSICAL_ALIGN - 1)) \
- & ~(CONFIG_PHYSICAL_ALIGN - 1))
-
/* Minimum kernel alignment, as a power of two */
#ifdef CONFIG_X86_64
# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index b5731c51f0f4cd..ed2797f132ce09 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -3,103 +3,150 @@
#define _ASM_X86_CMPXCHG_32_H
/*
- * Note: if you use set64_bit(), __cmpxchg64(), or their variants,
+ * Note: if you use __cmpxchg64() or its variants,
* you need to test for the feature in boot_cpu_data.
*/
-#ifdef CONFIG_X86_CMPXCHG64
-#define arch_cmpxchg64(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
- (unsigned long long)(n)))
-#define arch_cmpxchg64_local(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
- (unsigned long long)(n)))
-#define arch_try_cmpxchg64(ptr, po, n) \
- __try_cmpxchg64((ptr), (unsigned long long *)(po), \
- (unsigned long long)(n))
-#endif
+union __u64_halves {
+ u64 full;
+ struct {
+ u32 low, high;
+ };
+};
+
+#define __arch_cmpxchg64(_ptr, _old, _new, _lock) \
+({ \
+ union __u64_halves o = { .full = (_old), }, \
+ n = { .full = (_new), }; \
+ \
+ asm volatile(_lock "cmpxchg8b %[ptr]" \
+ : [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high) \
+ : "memory"); \
+ \
+ o.full; \
+})
+
+
+static __always_inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
+{
+ return __arch_cmpxchg64(ptr, old, new, LOCK_PREFIX);
+}
-static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
+static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
{
- u64 prev;
- asm volatile(LOCK_PREFIX "cmpxchg8b %1"
- : "=A" (prev),
- "+m" (*ptr)
- : "b" ((u32)new),
- "c" ((u32)(new >> 32)),
- "0" (old)
- : "memory");
- return prev;
+ return __arch_cmpxchg64(ptr, old, new,);
}
-static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
+#define __arch_try_cmpxchg64(_ptr, _oldp, _new, _lock) \
+({ \
+ union __u64_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm volatile(_lock "cmpxchg8b %[ptr]" \
+ CC_SET(e) \
+ : CC_OUT(e) (ret), \
+ [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
+
+static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new)
{
- u64 prev;
- asm volatile("cmpxchg8b %1"
- : "=A" (prev),
- "+m" (*ptr)
- : "b" ((u32)new),
- "c" ((u32)(new >> 32)),
- "0" (old)
- : "memory");
- return prev;
+ return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX);
}
-static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
+static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
{
- bool success;
- u64 old = *pold;
- asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]"
- CC_SET(z)
- : CC_OUT(z) (success),
- [ptr] "+m" (*ptr),
- "+A" (old)
- : "b" ((u32)new),
- "c" ((u32)(new >> 32))
- : "memory");
-
- if (unlikely(!success))
- *pold = old;
- return success;
+ return __arch_try_cmpxchg64(ptr, oldp, new,);
}
-#ifndef CONFIG_X86_CMPXCHG64
+#ifdef CONFIG_X86_CMPXCHG64
+
+#define arch_cmpxchg64 __cmpxchg64
+
+#define arch_cmpxchg64_local __cmpxchg64_local
+
+#define arch_try_cmpxchg64 __try_cmpxchg64
+
+#define arch_try_cmpxchg64_local __try_cmpxchg64_local
+
+#else
+
/*
 * Building a kernel capable of running on 80386 and 80486. It may be necessary
 * to simulate the cmpxchg8b instruction on the 80386 and 80486 CPUs.
*/
-#define arch_cmpxchg64(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- __typeof__(*(ptr)) __old = (o); \
- __typeof__(*(ptr)) __new = (n); \
- alternative_io(LOCK_PREFIX_HERE \
- "call cmpxchg8b_emu", \
- "lock; cmpxchg8b (%%esi)" , \
- X86_FEATURE_CX8, \
- "=A" (__ret), \
- "S" ((ptr)), "0" (__old), \
- "b" ((unsigned int)__new), \
- "c" ((unsigned int)(__new>>32)) \
- : "memory"); \
- __ret; })
-
-
-#define arch_cmpxchg64_local(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- __typeof__(*(ptr)) __old = (o); \
- __typeof__(*(ptr)) __new = (n); \
- alternative_io("call cmpxchg8b_emu", \
- "cmpxchg8b (%%esi)" , \
- X86_FEATURE_CX8, \
- "=A" (__ret), \
- "S" ((ptr)), "0" (__old), \
- "b" ((unsigned int)__new), \
- "c" ((unsigned int)(__new>>32)) \
- : "memory"); \
- __ret; })
+#define __arch_cmpxchg64_emu(_ptr, _old, _new, _lock_loc, _lock) \
+({ \
+ union __u64_halves o = { .full = (_old), }, \
+ n = { .full = (_new), }; \
+ \
+ asm volatile(ALTERNATIVE(_lock_loc \
+ "call cmpxchg8b_emu", \
+ _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \
+ : [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high), "S" (_ptr) \
+ : "memory"); \
+ \
+ o.full; \
+})
+
+static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
+{
+ return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; ");
+}
+#define arch_cmpxchg64 arch_cmpxchg64
+
+static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
+{
+ return __arch_cmpxchg64_emu(ptr, old, new, ,);
+}
+#define arch_cmpxchg64_local arch_cmpxchg64_local
+
+#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new, _lock_loc, _lock) \
+({ \
+ union __u64_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm volatile(ALTERNATIVE(_lock_loc \
+ "call cmpxchg8b_emu", \
+ _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \
+ CC_SET(e) \
+ : CC_OUT(e) (ret), \
+ [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high), "S" (_ptr) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
+
+static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; ");
+}
+#define arch_try_cmpxchg64 arch_try_cmpxchg64
+
+static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64_emu(ptr, oldp, new, ,);
+}
+#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local
#endif
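All of the new cmpxchg_32.h helpers rely on union __u64_halves to hand the two 32-bit halves of a u64 to cmpxchg8b: EDX:EAX carries the expected value and ECX:EBX the new one. A stand-alone user-space sketch, assuming a little-endian x86 target, showing how the halves alias the full value (illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	/* Same layout idea as the kernel's union __u64_halves. */
	union u64_halves {
		uint64_t full;
		struct {
			uint32_t low, high;
		};
	};

	int main(void)
	{
		union u64_halves v = { .full = 0x1122334455667788ULL };

		/* cmpxchg8b compares EDX:EAX (high:low) and stores ECX:EBX on success. */
		printf("low  = 0x%08x\n", v.low);	/* 0x55667788 */
		printf("high = 0x%08x\n", v.high);	/* 0x11223344 */
		return 0;
	}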
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 44b08b53ab32fc..5e241306db26a7 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,6 +20,12 @@
arch_try_cmpxchg((ptr), (po), (n)); \
})
+#define arch_try_cmpxchg64_local(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg_local((ptr), (po), (n)); \
+})
+
union __u128_halves {
u128 full;
struct {
@@ -62,7 +68,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old,
asm volatile(_lock "cmpxchg16b %[ptr]" \
CC_SET(e) \
: CC_OUT(e) (ret), \
- [ptr] "+m" (*ptr), \
+ [ptr] "+m" (*(_ptr)), \
"+a" (o.low), "+d" (o.high) \
: "b" (n.low), "c" (n.high) \
: "memory"); \
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index eb8fcede9e3bf4..970a232009c306 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -3,6 +3,39 @@
#define _ASM_X86_CPU_DEVICE_ID
/*
+ * Can't use <linux/bitfield.h> because it generates expressions that
+ * cannot be used in structure initializers. Bitfield construction
+ * here must match the union in struct cpuinfo_x86:
+ * union {
+ * struct {
+ * __u8 x86_model;
+ * __u8 x86;
+ * __u8 x86_vendor;
+ * __u8 x86_reserved;
+ * };
+ * __u32 x86_vfm;
+ * };
+ */
+#define VFM_MODEL_BIT 0
+#define VFM_FAMILY_BIT 8
+#define VFM_VENDOR_BIT 16
+#define VFM_RSVD_BIT 24
+
+#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT)
+#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT)
+#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT)
+
+#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT)
+#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT)
+#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT)
+
+#define VFM_MAKE(_vendor, _family, _model) ( \
+ ((_model) << VFM_MODEL_BIT) | \
+ ((_family) << VFM_FAMILY_BIT) | \
+ ((_vendor) << VFM_VENDOR_BIT) \
+)
+
+/*
* Declare drivers belonging to specific x86 CPUs
* Similar in spirit to pci_device_id and related PCI functions
*
@@ -49,6 +82,16 @@
.driver_data = (unsigned long) _data \
}
+#define X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \
+ _steppings, _feature, _data) { \
+ .vendor = _vendor, \
+ .family = _family, \
+ .model = _model, \
+ .steppings = _steppings, \
+ .feature = _feature, \
+ .driver_data = (unsigned long) _data \
+}
+
/**
* X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching
* @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
@@ -164,6 +207,56 @@
X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
steppings, X86_FEATURE_ANY, data)
+/**
+ * X86_MATCH_VFM - Match encoded vendor/family/model
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ *
+ * Stepping and feature are set to wildcards
+ */
+#define X86_MATCH_VFM(vfm, data) \
+ X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
+ VFM_VENDOR(vfm), \
+ VFM_FAMILY(vfm), \
+ VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, X86_FEATURE_ANY, data)
+
+/**
+ * X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @steppings: Bitmask of steppings to match
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ *
+ * feature is set to wildcard
+ */
+#define X86_MATCH_VFM_STEPPINGS(vfm, steppings, data) \
+ X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
+ VFM_VENDOR(vfm), \
+ VFM_FAMILY(vfm), \
+ VFM_MODEL(vfm), \
+ steppings, X86_FEATURE_ANY, data)
+
+/**
+ * X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @feature: A X86_FEATURE bit
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ *
+ * Steppings is set to wildcard
+ */
+#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \
+ X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
+ VFM_VENDOR(vfm), \
+ VFM_FAMILY(vfm), \
+ VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, feature, data)
+
/*
* Match specific microcode revisions.
*
@@ -190,6 +283,14 @@ struct x86_cpu_desc {
.x86_microcode_rev = (revision), \
}
+#define AMD_CPU_DESC(fam, model, stepping, revision) { \
+ .x86_family = (fam), \
+ .x86_vendor = X86_VENDOR_AMD, \
+ .x86_model = (model), \
+ .x86_stepping = (stepping), \
+ .x86_microcode_rev = (revision), \
+}
+
extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table);
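The VFM_* macros above are plain byte packing: model in bits 0-7, family in bits 8-15, vendor in bits 16-23. A small user-space sketch that re-states the layout locally and shows that packing and extraction round-trip (constants copied here for illustration rather than taken from the header):

	#include <stdint.h>
	#include <stdio.h>

	#define EX_MODEL_BIT	0
	#define EX_FAMILY_BIT	8
	#define EX_VENDOR_BIT	16

	#define EX_MAKE_VFM(vendor, family, model)		\
		(((model) << EX_MODEL_BIT) |			\
		 ((family) << EX_FAMILY_BIT) |			\
		 ((vendor) << EX_VENDOR_BIT))

	#define EX_VFM_MODEL(vfm)	(((vfm) >> EX_MODEL_BIT)  & 0xff)
	#define EX_VFM_FAMILY(vfm)	(((vfm) >> EX_FAMILY_BIT) & 0xff)
	#define EX_VFM_VENDOR(vfm)	(((vfm) >> EX_VENDOR_BIT) & 0xff)

	int main(void)
	{
		/* 0 stands in for X86_VENDOR_INTEL; family 6, model 0x8F. */
		uint32_t vfm = EX_MAKE_VFM(0, 6, 0x8f);

		printf("vendor=%u family=%u model=0x%x\n",
		       EX_VFM_VENDOR(vfm), EX_VFM_FAMILY(vfm), EX_VFM_MODEL(vfm));
		return 0;
	}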
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 686e92d2663eee..0b9611da6c53f1 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -129,8 +129,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define this_cpu_has(bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
- x86_this_cpu_test_bit(bit, \
- (unsigned long __percpu *)&cpu_info.x86_capability))
+ x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
/*
* This macro is for detection of features which need kernel
@@ -150,8 +149,12 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
-#define setup_force_cpu_cap(bit) do { \
- set_cpu_cap(&boot_cpu_data, bit); \
+#define setup_force_cpu_cap(bit) do { \
+ \
+ if (!boot_cpu_has(bit)) \
+ WARN_ON(alternatives_patched); \
+ \
+ set_cpu_cap(&boot_cpu_data, bit); \
set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
@@ -172,11 +175,10 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
*/
static __always_inline bool _static_cpu_has(u16 bit)
{
- asm goto(
- ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
+ asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
".pushsection .altinstr_aux,\"ax\"\n"
"6:\n"
- " testb %[bitnum]," _ASM_RIP(%P[cap_byte]) "\n"
+ " testb %[bitnum], %a[cap_byte]\n"
" jnz %l[t_yes]\n"
" jmp %l[t_no]\n"
".popsection\n"
diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
index 7acf0383be8022..906b0d5541e896 100644
--- a/arch/x86/include/asm/extable_fixup_types.h
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -36,7 +36,7 @@
#define EX_TYPE_DEFAULT 1
#define EX_TYPE_FAULT 2
#define EX_TYPE_UACCESS 3
-#define EX_TYPE_COPY 4
+/* unused, was: #define EX_TYPE_COPY 4 */
#define EX_TYPE_CLEAR_FS 5
#define EX_TYPE_FPU_RESTORE 6
#define EX_TYPE_BPF 7
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 4212c00c9708d4..9d69f3f8dbabc8 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -56,17 +56,6 @@ struct stat64 {
unsigned long long st_ino;
} __attribute__((packed));
-#define IA32_STACK_TOP IA32_PAGE_OFFSET
-
-#ifdef __KERNEL__
-struct linux_binprm;
-extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
- unsigned long stack_top, int exec_stack);
-struct mm_struct;
-extern void ia32_pick_mmap_layout(struct mm_struct *mm);
-
-#endif
-
extern bool __ia32_enabled;
static __always_inline bool ia32_enabled(void)
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
deleted file mode 100644
index aa065c94ccf540..00000000000000
--- a/arch/x86/include/asm/ia32_unistd.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_IA32_UNISTD_H
-#define _ASM_X86_IA32_UNISTD_H
-
-/*
- * This file contains the system call numbers of the ia32 compat ABI,
- * this is for the kernel only.
- */
-#define __SYSCALL_ia32_NR(x) (x)
-#include <asm/unistd_32_ia32.h>
-
-#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index d0941f4c272495..f81a851c46dca5 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -40,137 +40,221 @@
* their own names :-(
*/
+#define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model)
+
/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */
#define INTEL_FAM6_ANY X86_MODEL_ANY
+/* Wildcard match for FAM6 so X86_MATCH_VFM(ANY) works */
+#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY)
#define INTEL_FAM6_CORE_YONAH 0x0E
+#define INTEL_CORE_YONAH IFM(6, 0x0E)
#define INTEL_FAM6_CORE2_MEROM 0x0F
+#define INTEL_CORE2_MEROM IFM(6, 0x0F)
#define INTEL_FAM6_CORE2_MEROM_L 0x16
+#define INTEL_CORE2_MEROM_L IFM(6, 0x16)
#define INTEL_FAM6_CORE2_PENRYN 0x17
+#define INTEL_CORE2_PENRYN IFM(6, 0x17)
#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D
+#define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D)
#define INTEL_FAM6_NEHALEM 0x1E
+#define INTEL_NEHALEM IFM(6, 0x1E)
#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */
+#define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */
#define INTEL_FAM6_NEHALEM_EP 0x1A
+#define INTEL_NEHALEM_EP IFM(6, 0x1A)
#define INTEL_FAM6_NEHALEM_EX 0x2E
+#define INTEL_NEHALEM_EX IFM(6, 0x2E)
#define INTEL_FAM6_WESTMERE 0x25
+#define INTEL_WESTMERE IFM(6, 0x25)
#define INTEL_FAM6_WESTMERE_EP 0x2C
+#define INTEL_WESTMERE_EP IFM(6, 0x2C)
#define INTEL_FAM6_WESTMERE_EX 0x2F
+#define INTEL_WESTMERE_EX IFM(6, 0x2F)
#define INTEL_FAM6_SANDYBRIDGE 0x2A
+#define INTEL_SANDYBRIDGE IFM(6, 0x2A)
#define INTEL_FAM6_SANDYBRIDGE_X 0x2D
+#define INTEL_SANDYBRIDGE_X IFM(6, 0x2D)
#define INTEL_FAM6_IVYBRIDGE 0x3A
+#define INTEL_IVYBRIDGE IFM(6, 0x3A)
#define INTEL_FAM6_IVYBRIDGE_X 0x3E
+#define INTEL_IVYBRIDGE_X IFM(6, 0x3E)
#define INTEL_FAM6_HASWELL 0x3C
+#define INTEL_HASWELL IFM(6, 0x3C)
#define INTEL_FAM6_HASWELL_X 0x3F
+#define INTEL_HASWELL_X IFM(6, 0x3F)
#define INTEL_FAM6_HASWELL_L 0x45
+#define INTEL_HASWELL_L IFM(6, 0x45)
#define INTEL_FAM6_HASWELL_G 0x46
+#define INTEL_HASWELL_G IFM(6, 0x46)
#define INTEL_FAM6_BROADWELL 0x3D
+#define INTEL_BROADWELL IFM(6, 0x3D)
#define INTEL_FAM6_BROADWELL_G 0x47
+#define INTEL_BROADWELL_G IFM(6, 0x47)
#define INTEL_FAM6_BROADWELL_X 0x4F
+#define INTEL_BROADWELL_X IFM(6, 0x4F)
#define INTEL_FAM6_BROADWELL_D 0x56
+#define INTEL_BROADWELL_D IFM(6, 0x56)
#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */
+#define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */
#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */
+#define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */
#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */
+#define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */
/* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */
/* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */
#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */
+#define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */
/* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */
/* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */
/* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */
#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */
+#define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */
/* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */
#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */
+#define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */
#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */
+#define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */
#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */
+#define INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */
#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */
+#define INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */
#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */
+#define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */
#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */
+#define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */
#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */
+#define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */
#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */
+#define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */
#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */
+#define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */
#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */
+#define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */
#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */
+#define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */
#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */
+#define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */
#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF
+#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF)
#define INTEL_FAM6_GRANITERAPIDS_X 0xAD
+#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD)
#define INTEL_FAM6_GRANITERAPIDS_D 0xAE
+#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE)
/* "Hybrid" Processors (P-Core/E-Core) */
#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */
+#define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
+#define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
+#define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */
#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */
+#define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */
#define INTEL_FAM6_RAPTORLAKE_P 0xBA
+#define INTEL_RAPTORLAKE_P IFM(6, 0xBA)
#define INTEL_FAM6_RAPTORLAKE_S 0xBF
+#define INTEL_RAPTORLAKE_S IFM(6, 0xBF)
#define INTEL_FAM6_METEORLAKE 0xAC
+#define INTEL_METEORLAKE IFM(6, 0xAC)
#define INTEL_FAM6_METEORLAKE_L 0xAA
+#define INTEL_METEORLAKE_L IFM(6, 0xAA)
#define INTEL_FAM6_ARROWLAKE_H 0xC5
+#define INTEL_ARROWLAKE_H IFM(6, 0xC5)
#define INTEL_FAM6_ARROWLAKE 0xC6
+#define INTEL_ARROWLAKE IFM(6, 0xC6)
#define INTEL_FAM6_ARROWLAKE_U 0xB5
+#define INTEL_ARROWLAKE_U IFM(6, 0xB5)
#define INTEL_FAM6_LUNARLAKE_M 0xBD
+#define INTEL_LUNARLAKE_M IFM(6, 0xBD)
/* "Small Core" Processors (Atom/E-Core) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
+#define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */
#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
+#define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */
#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */
+#define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */
#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */
+#define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */
#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */
+#define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */
#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */
+#define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */
#define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */
+#define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */
#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */
+#define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */
#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */
+#define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */
#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */
+#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */
#define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */
+#define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
+#define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */
#define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */
+#define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */
/* Note: the micro-architecture is "Goldmont Plus" */
#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
+#define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */
#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */
+#define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */
#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */
+#define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */
#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */
+#define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */
#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */
+#define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */
#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */
+#define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */
#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */
+#define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */
#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */
+#define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */
/* Xeon Phi */
#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
+#define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */
#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */
+#define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */
/* Family 5 */
#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */
+#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */
#endif /* _ASM_X86_INTEL_FAMILY_H */
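With the IFM()-based names in place, match tables can migrate from X86_MATCH_INTEL_FAM6_MODEL() to X86_MATCH_VFM(). A hedged sketch of what a converted table might look like (driver name and NULL data are made up; the macros and x86_match_cpu() are the interfaces declared earlier in this patch):

	#include <asm/cpu_device_id.h>
	#include <asm/intel-family.h>

	/* Hypothetical driver table using the new VFM-encoded model names. */
	static const struct x86_cpu_id example_ids[] = {
		X86_MATCH_VFM(INTEL_SKYLAKE_X,		NULL),
		X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X,	NULL),
		X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,	NULL),
		{}
	};

	static bool example_cpu_supported(void)
	{
		return x86_match_cpu(example_ids) != NULL;
	}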
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 798183867d7896..b71ad173f8776f 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -100,7 +100,7 @@
}
#define ASM_CALL_ARG0 \
- "call %P[__func] \n" \
+ "call %c[__func] \n" \
ASM_REACHABLE
#define ASM_CALL_ARG1 \
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index de3118305838e9..dfd2e9699bd731 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -13,6 +13,7 @@
#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */
#define MCG_EXT_P BIT_ULL(9) /* Extended registers available */
#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */
+#define MCG_SEAM_NR BIT_ULL(12) /* MCG_STATUS_SEAM_NR supported */
#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
#define MCG_EXT_CNT_SHIFT 16
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
@@ -25,6 +26,7 @@
#define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */
#define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */
+#define MCG_STATUS_SEAM_NR BIT_ULL(12) /* Machine check inside SEAM non-root mode */
/* MCG_EXT_CTL register defines */
#define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e72c2b87295799..e022e6eb766c64 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -170,6 +170,10 @@
* CPU is not affected by Branch
* History Injection.
*/
+#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
+ * IA32_XAPIC_DISABLE_STATUS MSR
+ * supported
+ */
#define ARCH_CAP_PBRSB_NO BIT(24) /*
* Not susceptible to Post-Barrier
* Return Stack Buffer Predictions.
@@ -192,11 +196,6 @@
* File.
*/
-#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
- * IA32_XAPIC_DISABLE_STATUS MSR
- * supported
- */
-
#define MSR_IA32_FLUSH_CMD 0x0000010b
#define L1D_FLUSH BIT(0) /*
* Writeback and invalidate the
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 9da9c8a2f1df5d..52f1b4ff0cc16c 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -31,10 +31,12 @@
#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
-#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \
- CONFIG_PHYSICAL_ALIGN)
+/* Physical address where kernel should be loaded. */
+#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+ + (CONFIG_PHYSICAL_ALIGN - 1)) \
+ & ~(CONFIG_PHYSICAL_ALIGN - 1))
-#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
+#define __START_KERNEL (__START_KERNEL_map + LOAD_PHYSICAL_ADDR)
#ifdef CONFIG_X86_64
#include <asm/page_64_types.h>
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958ebaf626e2..3bedee1801e2a8 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -59,36 +59,24 @@
#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":"
#define __my_cpu_offset this_cpu_read(this_cpu_off)
-#ifdef CONFIG_USE_X86_SEG_SUPPORT
-/*
- * Efficient implementation for cases in which the compiler supports
- * named address spaces. Allows the compiler to perform additional
- * optimizations that can save more instructions.
- */
-#define arch_raw_cpu_ptr(ptr) \
-({ \
- unsigned long tcp_ptr__; \
- tcp_ptr__ = __raw_cpu_read(, this_cpu_off); \
- \
- tcp_ptr__ += (unsigned long)(ptr); \
- (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
-})
-#else /* CONFIG_USE_X86_SEG_SUPPORT */
/*
* Compared to the generic __my_cpu_offset version, the following
* saves one instruction and avoids clobbering a temp register.
+ *
+ * arch_raw_cpu_ptr should not be used in 32-bit VDSO for a 64-bit
+ * kernel, because games are played with CONFIG_X86_64 there and
+ * sizeof(this_cpu_off) becomes 4.
*/
-#define arch_raw_cpu_ptr(ptr) \
+#ifndef BUILD_VDSO32_64
+#define arch_raw_cpu_ptr(_ptr) \
({ \
- unsigned long tcp_ptr__; \
- asm ("mov " __percpu_arg(1) ", %0" \
- : "=r" (tcp_ptr__) \
- : "m" (__my_cpu_var(this_cpu_off))); \
- \
- tcp_ptr__ += (unsigned long)(ptr); \
- (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
+ unsigned long tcp_ptr__ = raw_cpu_read_long(this_cpu_off); \
+ tcp_ptr__ += (__force unsigned long)(_ptr); \
+ (typeof(*(_ptr)) __kernel __force *)tcp_ptr__; \
})
-#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+#else
+#define arch_raw_cpu_ptr(_ptr) ({ BUILD_BUG(); (typeof(_ptr))0; })
+#endif
#define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel
@@ -102,8 +90,8 @@
#endif /* CONFIG_SMP */
#define __my_cpu_type(var) typeof(var) __percpu_seg_override
-#define __my_cpu_ptr(ptr) (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
-#define __my_cpu_var(var) (*__my_cpu_ptr(&var))
+#define __my_cpu_ptr(ptr) (__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr)
+#define __my_cpu_var(var) (*__my_cpu_ptr(&(var)))
#define __percpu_arg(x) __percpu_prefix "%" #x
#define __force_percpu_arg(x) __force_percpu_prefix "%" #x
@@ -230,25 +218,26 @@ do { \
})
/*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix. The processor cannot prefetch
- * cachelines if xchg is used.
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
*/
-#define percpu_xchg_op(size, qual, _var, _nval) \
+#define raw_percpu_xchg_op(_var, _nval) \
({ \
- __pcpu_type_##size pxo_old__; \
- __pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval); \
- asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), \
- "%[oval]") \
- "\n1:\t" \
- __pcpu_op2_##size("cmpxchg", "%[nval]", \
- __percpu_arg([var])) \
- "\n\tjnz 1b" \
- : [oval] "=&a" (pxo_old__), \
- [var] "+m" (__my_cpu_var(_var)) \
- : [nval] __pcpu_reg_##size(, pxo_new__) \
- : "memory"); \
- (typeof(_var))(unsigned long) pxo_old__; \
+ typeof(_var) pxo_old__ = raw_cpu_read(_var); \
+ raw_cpu_write(_var, _nval); \
+ pxo_old__; \
+})
+
+/*
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
+ */
+#define this_percpu_xchg_op(_var, _nval) \
+({ \
+ typeof(_var) pxo_old__ = this_cpu_read(_var); \
+ do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \
+ pxo_old__; \
})
/*
@@ -428,10 +417,6 @@ do { \
* actually per-thread variables implemented as per-CPU variables and
* thus stable for the duration of the respective task.
*/
-#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp)
-#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp)
-#define this_cpu_read_stable_4(pcp) percpu_stable_op(4, "mov", pcp)
-#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp)
#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp)
#ifdef CONFIG_USE_X86_SEG_SUPPORT
@@ -500,6 +485,10 @@ do { \
#define this_cpu_read_const(pcp) ({ BUILD_BUG(); (typeof(pcp))0; })
#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp)
+#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp)
+#define this_cpu_read_stable_4(pcp) percpu_stable_op(4, "mov", pcp)
+
#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val)
#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val)
#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val)
@@ -509,18 +498,6 @@ do { \
#define raw_cpu_or_1(pcp, val) percpu_to_op(1, , "or", (pcp), val)
#define raw_cpu_or_2(pcp, val) percpu_to_op(2, , "or", (pcp), val)
#define raw_cpu_or_4(pcp, val) percpu_to_op(4, , "or", (pcp), val)
-
-/*
- * raw_cpu_xchg() can use a load-store since it is not required to be
- * IRQ-safe.
- */
-#define raw_percpu_xchg_op(var, nval) \
-({ \
- typeof(var) pxo_ret__ = raw_cpu_read(var); \
- raw_cpu_write(var, (nval)); \
- pxo_ret__; \
-})
-
#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)
@@ -534,9 +511,9 @@ do { \
#define this_cpu_or_1(pcp, val) percpu_to_op(1, volatile, "or", (pcp), val)
#define this_cpu_or_2(pcp, val) percpu_to_op(2, volatile, "or", (pcp), val)
#define this_cpu_or_4(pcp, val) percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) this_percpu_xchg_op(pcp, nval)
#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val)
#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val)
@@ -563,6 +540,8 @@ do { \
* 32 bit must fall back to generic operations.
*/
#ifdef CONFIG_X86_64
+#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp)
+
#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val)
#define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val)
#define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val)
@@ -575,41 +554,41 @@ do { \
#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val)
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval) this_percpu_xchg_op(pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
-#endif
-
-static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
- const unsigned long __percpu *addr)
-{
- unsigned long __percpu *a =
- (unsigned long __percpu *)addr + nr / BITS_PER_LONG;
-#ifdef CONFIG_X86_64
- return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
+#define raw_cpu_read_long(pcp) raw_cpu_read_8(pcp)
#else
- return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0;
-#endif
-}
+/* There is no generic 64 bit read stable operation for 32 bit targets. */
+#define this_cpu_read_stable_8(pcp) ({ BUILD_BUG(); (typeof(pcp))0; })
-static inline bool x86_this_cpu_variable_test_bit(int nr,
- const unsigned long __percpu *addr)
-{
- bool oldbit;
+#define raw_cpu_read_long(pcp) raw_cpu_read_4(pcp)
+#endif
- asm volatile("btl "__percpu_arg(2)",%1"
- CC_SET(c)
- : CC_OUT(c) (oldbit)
- : "m" (*__my_cpu_ptr((unsigned long __percpu *)(addr))), "Ir" (nr));
+#define x86_this_cpu_constant_test_bit(_nr, _var) \
+({ \
+ unsigned long __percpu *addr__ = \
+ (unsigned long __percpu *)&(_var) + ((_nr) / BITS_PER_LONG); \
+ !!((1UL << ((_nr) % BITS_PER_LONG)) & raw_cpu_read(*addr__)); \
+})
- return oldbit;
-}
+#define x86_this_cpu_variable_test_bit(_nr, _var) \
+({ \
+ bool oldbit; \
+ \
+ asm volatile("btl %[nr], " __percpu_arg([var]) \
+ CC_SET(c) \
+ : CC_OUT(c) (oldbit) \
+ : [var] "m" (__my_cpu_var(_var)), \
+ [nr] "rI" (_nr)); \
+ oldbit; \
+})
-#define x86_this_cpu_test_bit(nr, addr) \
- (__builtin_constant_p((nr)) \
- ? x86_this_cpu_constant_test_bit((nr), (addr)) \
- : x86_this_cpu_variable_test_bit((nr), (addr)))
+#define x86_this_cpu_test_bit(_nr, _var) \
+ (__builtin_constant_p(_nr) \
+ ? x86_this_cpu_constant_test_bit(_nr, _var) \
+ : x86_this_cpu_variable_test_bit(_nr, _var))
#include <asm-generic/percpu.h>
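The percpu.h changes split xchg into two flavours: raw_percpu_xchg_op() is a plain load/store for callers that already run with preemption or interrupts disabled, while this_percpu_xchg_op() loops on this_cpu_try_cmpxchg() so an interrupt between the read and the write is detected without a LOCK prefix. A minimal sketch of both flavours using the generic per-CPU API (variable and function names are illustrative):

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(unsigned long, example_state);

	/* Caller guarantees no preemption/IRQs: plain read + write is enough. */
	static unsigned long example_raw_xchg(unsigned long new)
	{
		unsigned long old = raw_cpu_read(example_state);

		raw_cpu_write(example_state, new);
		return old;
	}

	/* May be interrupted: retry with try_cmpxchg until the slot is unchanged. */
	static unsigned long example_this_xchg(unsigned long new)
	{
		unsigned long old = this_cpu_read(example_state);

		do { } while (!this_cpu_try_cmpxchg(example_state, &old, new));
		return old;
	}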
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 9abb8cc4cd4747..b786449626267e 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -567,6 +567,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 811548f131f4e3..56a202a94be2a6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -108,9 +108,23 @@ struct cpuinfo_topology {
};
struct cpuinfo_x86 {
- __u8 x86; /* CPU family */
- __u8 x86_vendor; /* CPU vendor */
- __u8 x86_model;
+ union {
+ /*
+ * The particular ordering (low-to-high: model,
+ * family, vendor) is chosen so that ranges of models,
+ * as is usually done on AMD, can be compared.
+ */
+ struct {
+ __u8 x86_model;
+ /* CPU family */
+ __u8 x86;
+ /* CPU vendor */
+ __u8 x86_vendor;
+ __u8 x86_reserved;
+ };
+ /* combined vendor, family, model */
+ __u32 x86_vfm;
+ };
__u8 x86_stepping;
#ifdef CONFIG_X86_64
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
@@ -587,7 +601,7 @@ extern char ignore_fpu_irq;
# define BASE_PREFETCH ""
# define ARCH_HAS_PREFETCH
#else
-# define BASE_PREFETCH "prefetcht0 %P1"
+# define BASE_PREFETCH "prefetcht0 %1"
#endif
/*
@@ -598,7 +612,7 @@ extern char ignore_fpu_irq;
*/
static inline void prefetch(const void *x)
{
- alternative_input(BASE_PREFETCH, "prefetchnta %P1",
+ alternative_input(BASE_PREFETCH, "prefetchnta %1",
X86_FEATURE_XMM,
"m" (*(const char *)x));
}
@@ -610,7 +624,7 @@ static inline void prefetch(const void *x)
*/
static __always_inline void prefetchw(const void *x)
{
- alternative_input(BASE_PREFETCH, "prefetchw %P1",
+ alternative_input(BASE_PREFETCH, "prefetchw %1",
X86_FEATURE_3DNOWPREFETCH,
"m" (*(const char *)x));
}
@@ -636,12 +650,10 @@ static __always_inline void prefetchw(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-extern unsigned long __end_init_task[];
+extern unsigned long __top_init_kernel_stack[];
#define INIT_THREAD { \
- .sp = (unsigned long)&__end_init_task - \
- TOP_OF_KERNEL_STACK_PADDING - \
- sizeof(struct pt_regs), \
+ .sp = (unsigned long)&__top_init_kernel_stack, \
}
extern unsigned long KSTK_ESP(struct task_struct *task);
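The new x86_vfm union member lets vendor, family and model be checked with a single 32-bit compare, and since the model occupies the low byte, ranges of models on the same vendor/family compare naturally as well. A hedged sketch of how such checks might look (function names are made up):

	#include <asm/processor.h>
	#include <asm/intel-family.h>

	/* One 32-bit compare instead of separate vendor/family/model tests. */
	static bool example_is_skylake_x(const struct cpuinfo_x86 *c)
	{
		return c->x86_vfm == INTEL_SKYLAKE_X;
	}

	/* Model sits in the low byte, so same-vendor/family ranges compare directly. */
	static bool example_in_vfm_range(const struct cpuinfo_x86 *c, u32 first, u32 last)
	{
		return c->x86_vfm >= first && c->x86_vfm <= last;
	}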
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 043758a2e62703..365798cb4408d9 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -23,19 +23,14 @@ extern int of_ioapic;
extern u64 initial_dtb;
extern void add_dtb(u64 data);
void x86_of_pci_init(void);
-void x86_dtb_parse_smp_config(void);
+void x86_flattree_get_config(void);
#else
static inline void add_dtb(u64 data) { }
static inline void x86_of_pci_init(void) { }
-static inline void x86_dtb_parse_smp_config(void) { }
+static inline void x86_flattree_get_config(void) { }
#define of_ioapic 0
#endif
-#ifdef CONFIG_OF_EARLY_FLATTREE
-void x86_flattree_get_config(void);
-#else
-static inline void x86_flattree_get_config(void) { }
-#endif
extern char cmd_line[COMMAND_LINE_SIZE];
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index cde8357bb226d1..a053c12939751f 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -85,6 +85,8 @@ DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
#define virt_spin_lock virt_spin_lock
static inline bool virt_spin_lock(struct qspinlock *lock)
{
+ int val;
+
if (!static_branch_likely(&virt_spin_lock_key))
return false;
@@ -94,10 +96,13 @@ static inline bool virt_spin_lock(struct qspinlock *lock)
* horrible lock 'holder' preemption issues.
*/
- do {
- while (atomic_read(&lock->val) != 0)
- cpu_relax();
- } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);
+ __retry:
+ val = atomic_read(&lock->val);
+
+ if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
+ cpu_relax();
+ goto __retry;
+ }
return true;
}
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index ef9697f20129af..0a985784be9bb7 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -25,9 +25,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
*
* void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock)
* {
- * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0);
+ * u8 lockval = _Q_LOCKED_VAL;
*
- * if (likely(lockval == _Q_LOCKED_VAL))
+ * if (try_cmpxchg(&lock->locked, &lockval, 0))
* return;
* pv_queued_spin_unlock_slowpath(lock, lockval);
* }
@@ -40,10 +40,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
#define PV_UNLOCK_ASM \
FRAME_BEGIN \
"push %rdx\n\t" \
- "mov $0x1,%eax\n\t" \
+ "mov $" __stringify(_Q_LOCKED_VAL) ",%eax\n\t" \
"xor %edx,%edx\n\t" \
LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \
- "cmp $0x1,%al\n\t" \
"jne .slowpath\n\t" \
"pop %rdx\n\t" \
FRAME_END \
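The updated comment block mirrors try_cmpxchg() semantics: seed the expected value, attempt the exchange, and fall back to the slow path with whatever byte was actually observed. A plain-C sketch of that fast path (slow-path call omitted; qspinlock fields as in the generic header):

	#include <linux/atomic.h>
	#include <asm-generic/qspinlock_types.h>

	/* Sketch of the paravirt unlock fast path expressed with try_cmpxchg(). */
	static void example_pv_unlock(struct qspinlock *lock)
	{
		u8 lockval = _Q_LOCKED_VAL;

		/* Fast path: the lock byte still holds _Q_LOCKED_VAL, clear it. */
		if (try_cmpxchg(&lock->locked, &lockval, 0))
			return;

		/*
		 * Otherwise 'lockval' now holds the observed byte; the real code
		 * hands it to pv_queued_spin_unlock_slowpath() at this point.
		 */
	}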
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index fef16e398161c4..42bcd42d70d123 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -9,7 +9,7 @@
#endif
#ifdef CONFIG_COMPAT
-#include <asm/ia32_unistd.h>
+#include <asm/unistd_32_ia32.h>
#define __NR_seccomp_read_32 __NR_ia32_read
#define __NR_seccomp_write_32 __NR_ia32_write
#define __NR_seccomp_exit_32 __NR_ia32_exit
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 7f57382afee417..48bc397db6493b 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -140,7 +140,7 @@ struct secrets_os_area {
#define VMPCK_KEY_LEN 32
/* See the SNP spec version 0.9 for secrets page format */
-struct snp_secrets_page_layout {
+struct snp_secrets_page {
u32 version;
u32 imien : 1,
rsvd1 : 31;
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 2e9fc5c400cdc3..aec6e2d3aa1d52 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -182,8 +182,8 @@ static __always_inline void clflush(volatile void *__p)
static inline void clflushopt(volatile void *__p)
{
- alternative_io(".byte 0x3e; clflush %P0",
- ".byte 0x66; clflush %P0",
+ alternative_io(".byte 0x3e; clflush %0",
+ ".byte 0x66; clflush %0",
X86_FEATURE_CLFLUSHOPT,
"+m" (*(volatile char __force *)__p));
}
@@ -205,9 +205,9 @@ static inline void clwb(volatile void *__p)
#ifdef CONFIG_X86_USER_SHADOW_STACK
static inline int write_user_shstk_64(u64 __user *addr, u64 val)
{
- asm goto("1: wrussq %[val], (%[addr])\n"
+ asm goto("1: wrussq %[val], %[addr]\n"
_ASM_EXTABLE(1b, %l[fail])
- :: [addr] "r" (addr), [val] "r" (val)
+ :: [addr] "m" (*addr), [val] "r" (val)
:: fail);
return 0;
fail:
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 857d364b98888a..9d0b324eab213a 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -30,37 +30,40 @@ void *__memset(void *s, int c, size_t n);
#define __HAVE_ARCH_MEMSET16
static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
{
- long d0, d1;
- asm volatile("rep\n\t"
- "stosw"
- : "=&c" (d0), "=&D" (d1)
- : "a" (v), "1" (s), "0" (n)
- : "memory");
- return s;
+ const __auto_type s0 = s;
+ asm volatile (
+ "rep stosw"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
}
#define __HAVE_ARCH_MEMSET32
static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
{
- long d0, d1;
- asm volatile("rep\n\t"
- "stosl"
- : "=&c" (d0), "=&D" (d1)
- : "a" (v), "1" (s), "0" (n)
- : "memory");
- return s;
+ const __auto_type s0 = s;
+ asm volatile (
+ "rep stosl"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
}
#define __HAVE_ARCH_MEMSET64
static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
{
- long d0, d1;
- asm volatile("rep\n\t"
- "stosq"
- : "=&c" (d0), "=&D" (d1)
- : "a" (v), "1" (s), "0" (n)
- : "memory");
- return s;
+ const __auto_type s0 = s;
+ asm volatile (
+ "rep stosq"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
}
#endif
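The rewritten memset16/32/64() helpers drop the dummy outputs and instead mark RDI and RCX as read-write operands ("+D", "+c"), returning a saved copy of the destination because rep stos advances RDI. A stand-alone user-space sketch of the same constraint pattern for the 16-bit case (x86 only, illustrative):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	static void *example_memset16(uint16_t *s, uint16_t v, size_t n)
	{
		uint16_t *s0 = s;	/* rep stosw advances RDI, keep the original */

		asm volatile("rep stosw"
			     : "+D" (s), "+c" (n)
			     : "a" (v)
			     : "memory");
		return s0;
	}

	int main(void)
	{
		uint16_t buf[4];

		example_memset16(buf, 0xabcd, 4);
		printf("0x%04x 0x%04x\n", buf[0], buf[3]);	/* 0xabcd 0xabcd */
		return 0;
	}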
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 345aafbc196488..6259f1937fe77e 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -15,7 +15,7 @@
extern void text_poke_early(void *addr, const void *opcode, size_t len);
-extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len);
+extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len);
/*
* Clear and restore the kernel write-protection flag on the local CPU.
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 237dc8cdd12b94..0f9bab92a43d76 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -78,7 +78,7 @@ extern int __get_user_bad(void);
int __ret_gu; \
register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
__chk_user_ptr(ptr); \
- asm volatile("call __" #fn "_%P4" \
+ asm volatile("call __" #fn "_%c4" \
: "=a" (__ret_gu), "=r" (__val_gu), \
ASM_CALL_CONSTRAINT \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
@@ -177,7 +177,7 @@ extern void __put_user_nocheck_8(void);
__chk_user_ptr(__ptr); \
__ptr_pu = __ptr; \
__val_pu = __x; \
- asm volatile("call __" #fn "_%P[size]" \
+ asm volatile("call __" #fn "_%c[size]" \
: "=c" (__ret_pu), \
ASM_CALL_CONSTRAINT \
: "0" (__ptr_pu), \
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index 8e048ca980df85..0ef36190abe613 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -300,7 +300,7 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
#define vdso_cycles_ok arch_vdso_cycles_ok
/*
- * x86 specific delta calculation.
+ * x86 specific calculation of nanoseconds for the current cycle count
*
* The regular implementation assumes that clocksource reads are globally
* monotonic. The TSC can be slightly off across sockets which can cause
@@ -308,8 +308,8 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
* jump.
*
* Therefore it needs to be verified that @cycles are greater than
- * @last. If not then use @last, which is the base time of the current
- * conversion period.
+ * @vd->cycles_last. If not then use @vd->cycles_last, which is the base
+ * time of the current conversion period.
*
* This variant also uses a custom mask because while the clocksource mask of
* all the VDSO capable clocksources on x86 is U64_MAX, the above code uses
@@ -317,25 +317,37 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
* declares everything with the MSB/Sign-bit set as invalid. Therefore the
* effective mask is S64_MAX.
*/
-static __always_inline
-u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base)
{
- /*
- * Due to the MSB/Sign-bit being used as invalid marker (see
- * arch_vdso_cycles_valid() above), the effective mask is S64_MAX.
- */
- u64 delta = (cycles - last) & S64_MAX;
+ u64 delta = cycles - vd->cycle_last;
/*
- * Due to the above mentioned TSC wobbles, filter out negative motion.
- * Per the above masking, the effective sign bit is now bit 62.
+ * Negative motion and deltas which can cause multiplication
+ * overflow require special treatment. This check covers both as
+ * negative motion is guaranteed to be greater than @vd::max_cycles
+ * due to unsigned comparison.
+ *
+ * Due to the MSB/Sign-bit being used as invalid marker (see
+ * arch_vdso_cycles_valid() above), the effective mask is S64_MAX,
+ * but that case is also unlikely and will also take the unlikely path
+ * here.
*/
- if (unlikely(delta & (1ULL << 62)))
- return 0;
+ if (unlikely(delta > vd->max_cycles)) {
+ /*
+ * Due to the above mentioned TSC wobbles, filter out
+ * negative motion. Per the above masking, the effective
+ * sign bit is now bit 62.
+ */
+ if (delta & (1ULL << 62))
+ return base >> vd->shift;
+
+ /* Handle multiplication overflow gracefully */
+ return mul_u64_u32_add_u64_shr(delta & S64_MAX, vd->mult, base, vd->shift);
+ }
- return delta * mult;
+ return ((delta * vd->mult) + base) >> vd->shift;
}
-#define vdso_calc_delta vdso_calc_delta
+#define vdso_calc_ns vdso_calc_ns
#endif /* !__ASSEMBLY__ */
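The slow path above uses mul_u64_u32_add_u64_shr(), which, per its use here, computes (delta * mult + base) >> shift with a wide intermediate so that large deltas cannot overflow. A user-space sketch of that arithmetic, assuming a compiler that provides __int128:

	#include <stdint.h>
	#include <stdio.h>

	/* (a * mul + add) >> shift, with a 128-bit intermediate product. */
	static uint64_t mul_add_shr(uint64_t a, uint32_t mul, uint64_t add, unsigned int shift)
	{
		return (uint64_t)((((unsigned __int128)a * mul) + add) >> shift);
	}

	int main(void)
	{
		/* A delta large enough that a plain 64-bit multiply would overflow. */
		uint64_t delta = 1ULL << 52, base = 123;
		uint32_t mult = 5000000;

		printf("%llu\n", (unsigned long long)mul_add_shr(delta, mult, base, 25));
		return 0;
	}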
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 9e8ac5073ecb8d..62ee1990990385 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -84,7 +84,7 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
-#define free_vm86(t) do { } while(0)
+#define free_vm86(task) do { (void)(task); } while(0)
#endif /* CONFIG_VM86 */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 74077694da7d9f..5d128167e2e248 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -62,7 +62,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
-obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
+obj-y += pci-dma.o quirks.o kdebugfs.o
obj-y += alternative.o i8253.o hw_breakpoint.o
obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += resource.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 45a280f2161cd3..7555c15b71830f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -125,6 +125,20 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
};
/*
+ * Nomenclature for variable names to simplify and clarify this code and ease
+ * any potential staring at it:
+ *
+ * @instr: source address of the original instructions in the kernel text as
+ * generated by the compiler.
+ *
+ * @buf: temporary buffer on which the patching operates. This buffer is
+ * eventually text-poked into the kernel image.
+ *
+ * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
+ * in the .altinstr_replacement section.
+ */
+
+/*
* Fill the buffer with a single effective instruction of size @len.
*
* In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
@@ -133,28 +147,28 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
* each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
* *jump* over instead of executing long and daft NOPs.
*/
-static void add_nop(u8 *instr, unsigned int len)
+static void add_nop(u8 *buf, unsigned int len)
{
- u8 *target = instr + len;
+ u8 *target = buf + len;
if (!len)
return;
if (len <= ASM_NOP_MAX) {
- memcpy(instr, x86_nops[len], len);
+ memcpy(buf, x86_nops[len], len);
return;
}
if (len < 128) {
- __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
- instr += JMP8_INSN_SIZE;
+ __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
+ buf += JMP8_INSN_SIZE;
} else {
- __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
- instr += JMP32_INSN_SIZE;
+ __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
+ buf += JMP32_INSN_SIZE;
}
- for (;instr < target; instr++)
- *instr = INT3_INSN_OPCODE;
+ for (;buf < target; buf++)
+ *buf = INT3_INSN_OPCODE;
}
extern s32 __retpoline_sites[], __retpoline_sites_end[];
@@ -187,12 +201,12 @@ static bool insn_is_nop(struct insn *insn)
* Find the offset of the first non-NOP instruction starting at @offset
* but no further than @len.
*/
-static int skip_nops(u8 *instr, int offset, int len)
+static int skip_nops(u8 *buf, int offset, int len)
{
struct insn insn;
for (; offset < len; offset += insn.length) {
- if (insn_decode_kernel(&insn, &instr[offset]))
+ if (insn_decode_kernel(&insn, &buf[offset]))
break;
if (!insn_is_nop(&insn))
@@ -203,66 +217,32 @@ static int skip_nops(u8 *instr, int offset, int len)
}
/*
- * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
- * to the end of the NOP sequence into a single NOP.
- */
-static bool
-__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
-{
- int i = *next - insn->length;
-
- switch (insn->opcode.bytes[0]) {
- case JMP8_INSN_OPCODE:
- case JMP32_INSN_OPCODE:
- *prev = i;
- *target = *next + insn->immediate.value;
- return false;
- }
-
- if (insn_is_nop(insn)) {
- int nop = i;
-
- *next = skip_nops(instr, *next, len);
- if (*target && *next == *target)
- nop = *prev;
-
- add_nop(instr + nop, *next - nop);
- DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
- return true;
- }
-
- *target = 0;
- return false;
-}
-
-/*
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
-static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
+static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
{
- int prev, target = 0;
-
for (int next, i = 0; i < len; i = next) {
struct insn insn;
- if (insn_decode_kernel(&insn, &instr[i]))
+ if (insn_decode_kernel(&insn, &buf[i]))
return;
next = i + insn.length;
- __optimize_nops(instr, len, &insn, &next, &prev, &target);
- }
-}
+ if (insn_is_nop(&insn)) {
+ int nop = i;
-static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len)
-{
- unsigned long flags;
+ /* Has the NOP already been optimized? */
+ if (i + insn.length == len)
+ return;
- local_irq_save(flags);
- optimize_nops(instr, len);
- sync_core();
- local_irq_restore(flags);
+ next = skip_nops(buf, next, len);
+
+ add_nop(buf + nop, next - nop);
+ DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
+ }
+ }
}
/*
@@ -335,11 +315,9 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
return (target < src || target > src + src_len);
}
-void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
- int prev, target = 0;
-
- for (int next, i = 0; i < len; i = next) {
+ for (int next, i = 0; i < instrlen; i = next) {
struct insn insn;
if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
@@ -347,9 +325,6 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
next = i + insn.length;
- if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
- continue;
-
switch (insn.opcode.bytes[0]) {
case 0x0f:
if (insn.opcode.bytes[1] < 0x80 ||
@@ -361,10 +336,10 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
case JMP8_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case CALL_INSN_OPCODE:
- if (need_reloc(next + insn.immediate.value, src, src_len)) {
+ if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
apply_reloc(insn.immediate.nbytes,
buf + i + insn_offset_immediate(&insn),
- src - dest);
+ repl - instr);
}
/*
@@ -372,7 +347,7 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
*/
if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
s32 imm = insn.immediate.value;
- imm += src - dest;
+ imm += repl - instr;
imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
if ((imm >> 31) == (imm >> 7)) {
buf[i+0] = JMP8_INSN_OPCODE;
@@ -385,15 +360,21 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
}
if (insn_rip_relative(&insn)) {
- if (need_reloc(next + insn.displacement.value, src, src_len)) {
+ if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
apply_reloc(insn.displacement.nbytes,
buf + i + insn_offset_displacement(&insn),
- src - dest);
+ repl - instr);
}
}
}
}
+void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+{
+ __apply_relocation(buf, instr, instrlen, repl, repl_len);
+ optimize_nops(instr, buf, repl_len);
+}
+
/* Low-level backend functions usable from alternative code replacements. */
DEFINE_ASM_FUNC(nop_func, "", .entry.text);
EXPORT_SYMBOL_GPL(nop_func);
@@ -464,9 +445,9 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
- struct alt_instr *a;
- u8 *instr, *replacement;
u8 insn_buff[MAX_PATCH_LEN];
+ u8 *instr, *replacement;
+ struct alt_instr *a;
DPRINTK(ALT, "alt table %px, -> %px", start, end);
@@ -504,7 +485,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- optimize_nops_inplace(instr, a->instrlen);
+ memcpy(insn_buff, instr, a->instrlen);
+ optimize_nops(instr, insn_buff, a->instrlen);
+ text_poke_early(instr, insn_buff, a->instrlen);
continue;
}
@@ -526,7 +509,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
- apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
+ apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
@@ -761,7 +744,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
len = patch_retpoline(addr, &insn, bytes);
if (len == insn.length) {
- optimize_nops(bytes, len);
+ optimize_nops(addr, bytes, len);
DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
text_poke_early(addr, bytes, len);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c342c4aa9c6848..8e0514eb552b95 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -497,32 +497,32 @@ static struct clock_event_device lapic_clockevent = {
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
static const struct x86_cpu_id deadline_match[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
+ X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
+ X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
+ X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
+ X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
+ X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17),
+ X86_MATCH_VFM(INTEL_HASWELL, 0x22),
+ X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),
+ X86_MATCH_VFM(INTEL_HASWELL_G, 0x17),
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25),
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17),
+ X86_MATCH_VFM(INTEL_BROADWELL, 0x25),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, 0x17),
- X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2),
- X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE, 0xb2),
- X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52),
- X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE, 0x52),
{},
};
@@ -631,7 +631,7 @@ void lapic_update_tsc_freq(void)
static __initdata int lapic_cal_loops = -1;
static __initdata long lapic_cal_t1, lapic_cal_t2;
static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
-static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata u32 lapic_cal_pm1, lapic_cal_pm2;
static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
/*
@@ -641,7 +641,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
{
unsigned long long tsc = 0;
long tapic = apic_read(APIC_TMCCT);
- unsigned long pm = acpi_pm_read_early();
+ u32 pm = acpi_pm_read_early();
if (boot_cpu_has(X86_FEATURE_TSC))
tsc = rdtsc();
@@ -666,7 +666,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
static int __init
-calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc)
{
const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
const long pm_thresh = pm_100ms / 100;
@@ -677,7 +677,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
return -1;
#endif
- apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
+ apic_printk(APIC_VERBOSE, "... PM-Timer delta = %u\n", deltapm);
/* Check, if the PM timer is available */
if (!deltapm)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 567dbd2fe4b697..7db83212effb0f 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -178,13 +178,16 @@ static int x2apic_prepare_cpu(unsigned int cpu)
u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
u32 cluster = apic_cluster(phys_apicid);
u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+ int node = cpu_to_node(cpu);
x86_cpu_to_logical_apicid[cpu] = logical_apicid;
- if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
+ if (alloc_clustermask(cpu, cluster, node) < 0)
return -ENOMEM;
- if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
+
+ if (!zalloc_cpumask_var_node(&per_cpu(ipi_mask, cpu), GFP_KERNEL, node))
return -ENOMEM;
+
return 0;
}
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index e92ff0c11db814..4656474567533b 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -185,8 +185,7 @@ static void *patch_dest(void *dest, bool direct)
u8 *pad = dest - tsize;
memcpy(insn_buff, skl_call_thunk_template, tsize);
- apply_relocation(insn_buff, tsize, pad,
- skl_call_thunk_template, tsize);
+ apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
/* Already patched? */
if (!bcmp(pad, insn_buff, tsize))
@@ -308,8 +307,7 @@ static bool is_callthunk(void *addr)
pad = (void *)(dest - tmpl_size);
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
- apply_relocation(insn_buff, tmpl_size, pad,
- skl_call_thunk_template, tmpl_size);
+ apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
return !bcmp(pad, insn_buff, tmpl_size);
}
@@ -327,8 +325,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip)
return 0;
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
- apply_relocation(insn_buff, tmpl_size, ip,
- skl_call_thunk_template, tmpl_size);
+ apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
memcpy(*pprog, insn_buff, tmpl_size);
*pprog += tmpl_size;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 307302af0aeee2..44df3f11e7319b 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -13,6 +13,7 @@
#include <asm/apic.h>
#include <asm/cacheinfo.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/smp.h>
#include <asm/numa.h>
@@ -794,6 +795,11 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
+static const struct x86_cpu_desc erratum_1386_microcode[] = {
+ AMD_CPU_DESC(0x17, 0x1, 0x2, 0x0800126e),
+ AMD_CPU_DESC(0x17, 0x31, 0x0, 0x08301052),
+};
+
static void fix_erratum_1386(struct cpuinfo_x86 *c)
{
/*
@@ -803,7 +809,13 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
*
* Affected parts all have no supervisor XSAVE states, meaning that
* the XSAVEC instruction (which works fine) is equivalent.
+ *
+ * Clear the feature flag only on microcode revisions which
+ * don't have the fix.
*/
+ if (x86_cpu_has_min_microcode_rev(erratum_1386_microcode))
+ return;
+
clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
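The fix_erratum_1386() change above keys the workaround off a minimum microcode revision per (family, model, stepping). A rough standalone sketch of that kind of lookup follows; the struct and helper names are hypothetical, not the kernel's x86_cpu_desc API.

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-in for a per-part minimum-revision table entry. */
struct ucode_min {
	unsigned char	family, model, stepping;
	unsigned int	min_rev;
};

/*
 * Return true when the running microcode revision is at least the
 * known-fixed revision for the matching part. Unknown parts are treated
 * as not fixed, so the caller still applies the workaround.
 */
static bool has_min_ucode(const struct ucode_min *tbl, size_t n,
			  unsigned char family, unsigned char model,
			  unsigned char stepping, unsigned int rev)
{
	for (size_t i = 0; i < n; i++) {
		if (tbl[i].family == family && tbl[i].model == model &&
		    tbl[i].stepping == stepping)
			return rev >= tbl[i].min_rev;
	}
	return false;
}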
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index fdbb5f07448fad..f9a8c7b7943fb2 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -124,25 +124,24 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
return true;
}
-#define X86_MATCH(model) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+#define X86_MATCH(vfm) \
+ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
- X86_MATCH(XEON_PHI_KNL),
- X86_MATCH(XEON_PHI_KNM),
+ X86_MATCH(INTEL_XEON_PHI_KNL),
+ X86_MATCH(INTEL_XEON_PHI_KNM),
{}
};
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
- X86_MATCH(SKYLAKE_X),
+ X86_MATCH(INTEL_SKYLAKE_X),
{}
};
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
- X86_MATCH(ATOM_GOLDMONT),
- X86_MATCH(ATOM_GOLDMONT_D),
- X86_MATCH(ATOM_GOLDMONT_PLUS),
+ X86_MATCH(INTEL_ATOM_GOLDMONT),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_D),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
{}
};
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ab18185894dfd5..b6f927f6c567e1 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -26,7 +26,7 @@
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
@@ -2391,20 +2391,20 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
if (c->x86 != 6)
return;
- switch (c->x86_model) {
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_SANDYBRIDGE:
- case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_HASWELL:
- case INTEL_FAM6_HASWELL_L:
- case INTEL_FAM6_HASWELL_G:
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_SKYLAKE_L:
- case INTEL_FAM6_SKYLAKE:
- case INTEL_FAM6_KABYLAKE_L:
- case INTEL_FAM6_KABYLAKE:
+ switch (c->x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_WESTMERE:
+ case INTEL_SANDYBRIDGE:
+ case INTEL_IVYBRIDGE:
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_G:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
if (c->x86_cache_bits < 44)
c->x86_cache_bits = 44;
break;
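The switch in override_cache_bits() above moves from separate family and model checks to a single comparison on c->x86_vfm, which works because vendor, family and model are packed into one integer. A sketch of that idea follows, with an assumed layout (model in the low byte, family next, vendor above that); this is illustrative only, see asm/cpu_device_id.h for the real VFM_MAKE() definition, and the vendor and model values below are assumptions.

#include <stdint.h>
#include <stdio.h>

/* Assumed packing: model in bits 0-7, family in bits 8-15, vendor in bits 16-23. */
#define VFM_PACK(vendor, family, model) \
	(((uint32_t)(vendor) << 16) | ((uint32_t)(family) << 8) | (uint32_t)(model))

#define MY_VENDOR_INTEL	0	/* assumed vendor id */
#define MY_SKYLAKE_X	VFM_PACK(MY_VENDOR_INTEL, 6, 0x55)	/* assumed model */

int main(void)
{
	uint32_t vfm = VFM_PACK(MY_VENDOR_INTEL, 6, 0x55);	/* e.g. read from cpuinfo */

	/* One comparison replaces the separate family == 6 && model == 0x55 pair. */
	if (vfm == MY_SKYLAKE_X)
		printf("matched Skylake-X in a single check\n");
	return 0;
}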
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 605c26c009c8ac..cdaa795a9371df 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -114,17 +114,17 @@ static const struct x86_cpu_id ppin_cpuids[] = {
X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]),
/* Legacy models without CPUID enumeration */
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
{}
};
@@ -1053,18 +1053,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
- bool vp_bits_from_cpuid = true;
if (!cpu_has(c, X86_FEATURE_CPUID) ||
- (c->extended_cpuid_level < 0x80000008))
- vp_bits_from_cpuid = false;
-
- if (vp_bits_from_cpuid) {
- cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
-
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- } else {
+ (c->extended_cpuid_level < 0x80000008)) {
if (IS_ENABLED(CONFIG_X86_64)) {
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1078,7 +1069,13 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c)
cpu_has(c, X86_FEATURE_PSE36))
c->x86_phys_bits = 36;
}
+ } else {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
}
+
c->x86_cache_bits = c->x86_phys_bits;
c->x86_cache_alignment = c->x86_clflush_size;
}
@@ -1125,8 +1122,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#define VULNWL(vendor, family, model, whitelist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
-#define VULNWL_INTEL(model, whitelist) \
- VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
+#define VULNWL_INTEL(vfm, whitelist) \
+ X86_MATCH_VFM(vfm, whitelist)
#define VULNWL_AMD(family, whitelist) \
VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
@@ -1143,32 +1140,32 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
- VULNWL_INTEL(TIGERLAKE, NO_MMIO),
- VULNWL_INTEL(TIGERLAKE_L, NO_MMIO),
- VULNWL_INTEL(ALDERLAKE, NO_MMIO),
- VULNWL_INTEL(ALDERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO),
- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(CORE_YONAH, NO_SSB),
+ VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1178,9 +1175,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
- VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB),
- VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB),
- VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
/* AMD Family 0xf - 0x12 */
VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
@@ -1201,10 +1198,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define VULNBL(vendor, family, model, blacklist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
-#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
- X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, steppings, \
- X86_FEATURE_ANY, issues)
+#define VULNBL_INTEL_STEPPINGS(vfm, steppings, issues) \
+ X86_MATCH_VFM_STEPPINGS(vfm, steppings, issues)
#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
@@ -1229,43 +1224,43 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define RFDS BIT(7)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
- VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_L, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 946813d816bfc2..b7d9f530ae1646 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -114,6 +114,9 @@ static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
if (WARN_ON(feature >= MAX_FEATURE_BITS))
return;
+ if (boot_cpu_has(feature))
+ WARN_ON(alternatives_patched);
+
clear_feature(c, feature);
/* Collect all features to disable, handling dependencies */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index be30d7fa2e66bc..3c3e7e5695ba41 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -228,6 +228,7 @@ static void detect_tme_early(struct cpuinfo_x86 *c)
if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
pr_info_once("x86/tme: not enabled by BIOS\n");
mktme_status = MKTME_DISABLED;
+ clear_cpu_cap(c, X86_FEATURE_TME);
return;
}
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index f18d35fe27a928..30b1d63b97f3a1 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -204,12 +204,12 @@ static int intel_epb_offline(unsigned int cpu)
}
static const struct x86_cpu_id intel_epb_normal[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
{}
};
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index ad6776081e60da..8651643bddae84 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -17,8 +17,7 @@
*
* A typical table entry would be to match a specific CPU
*
- * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
- * X86_FEATURE_ANY, NULL);
+ * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL);
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
* %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
@@ -26,7 +25,7 @@
* asm/cpu_device_id.h contains a set of useful macros which are shortcuts
* for various common selections. The above can be shortened to:
*
- * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
+ * X86_MATCH_VFM(INTEL_BROADWELL, NULL);
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 84d41be6d06ba4..ad0623b659ed5b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -47,7 +47,7 @@
#include <linux/kexec.h>
#include <asm/fred.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
@@ -1593,6 +1593,24 @@ noinstr void do_machine_check(struct pt_regs *regs)
else
queue_task_work(&m, msg, kill_me_maybe);
+ } else if (m.mcgstatus & MCG_STATUS_SEAM_NR) {
+ /*
+ * Saved RIP on stack makes it look like the machine check
+ * was taken in the kernel on the instruction following
+ * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
+ * that the machine check was taken inside SEAM non-root
+ * mode. CPU core has already marked that guest as dead.
+ * It is OK for the kernel to resume execution at the
+ * apparent point of the machine check as the fault did
+ * not occur there. Mark the page as poisoned so it won't
+ * be added to free list when the guest is terminated.
+ */
+ if (mce_usable_address(&m)) {
+ struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT);
+
+ if (p)
+ SetPageHWPoison(p);
+ }
} else {
/*
* Handle an MCE which has happened in kernel space but from
@@ -1930,14 +1948,14 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
cfg->bootlog = 0;
- if (c->x86 == 6 && c->x86_model == 45)
+ if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
mce_flags.snb_ifu_quirk = 1;
/*
* Skylake, Cascade Lake and Cooper Lake require a quirk on
* rep movs.
*/
- if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
+ if (c->x86_vfm == INTEL_SKYLAKE_X)
mce_flags.skx_repmov_quirk = 1;
}
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
index fbe8b61c341303..4284749ec80373 100644
--- a/arch/x86/kernel/cpu/mce/genpool.c
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -16,14 +16,14 @@
* used to save error information organized in a lock-less list.
*
* This memory pool is only to be used to save MCE records in MCE context.
- * MCE events are rare, so a fixed size memory pool should be enough. Use
- * 2 pages to save MCE events for now (~80 MCE records at most).
+ * MCE events are rare, so a fixed size memory pool should be enough.
+ * Allocate on a sliding scale based on number of CPUs.
*/
-#define MCE_POOLSZ (2 * PAGE_SIZE)
+#define MCE_MIN_ENTRIES 80
+#define MCE_PER_CPU 2
static struct gen_pool *mce_evt_pool;
static LLIST_HEAD(mce_event_llist);
-static char gen_pool_buf[MCE_POOLSZ];
/*
* Compare the record "t" with each of the records on list "l" to see if
@@ -118,22 +118,32 @@ int mce_gen_pool_add(struct mce *mce)
static int mce_gen_pool_create(void)
{
- struct gen_pool *tmpp;
+ int mce_numrecords, mce_poolsz, order;
+ struct gen_pool *gpool;
int ret = -ENOMEM;
-
- tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
- if (!tmpp)
- goto out;
-
- ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
+ void *mce_pool;
+
+ order = order_base_2(sizeof(struct mce_evt_llist));
+ gpool = gen_pool_create(order, -1);
+ if (!gpool)
+ return ret;
+
+ mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
+ mce_poolsz = mce_numrecords * (1 << order);
+ mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
+ if (!mce_pool) {
+ gen_pool_destroy(gpool);
+ return ret;
+ }
+ ret = gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1);
if (ret) {
- gen_pool_destroy(tmpp);
- goto out;
+ gen_pool_destroy(gpool);
+ kfree(mce_pool);
+ return ret;
}
- mce_evt_pool = tmpp;
+ mce_evt_pool = gpool;
-out:
return ret;
}
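To make the new sizing in mce_gen_pool_create() concrete, here is a small userspace sketch of the same arithmetic. The record size and CPU count are made-up example values, and order_base_2() is a simplified stand-in for the kernel helper.

#include <stdio.h>

#define MCE_MIN_ENTRIES	80
#define MCE_PER_CPU	2

/* Simplified stand-in: smallest order with 2^order >= n. */
static unsigned int order_base_2(unsigned int n)
{
	unsigned int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int evt_size = 136;	/* assumed sizeof(struct mce_evt_llist) */
	unsigned int num_cpus = 512;	/* example CPU count */
	unsigned int order = order_base_2(evt_size);		/* -> 8, 256-byte chunks */
	unsigned int records = num_cpus * MCE_PER_CPU > MCE_MIN_ENTRIES ?
			       num_cpus * MCE_PER_CPU : MCE_MIN_ENTRIES;	/* -> 1024 */
	unsigned int poolsz = records * (1u << order);		/* -> 262144 bytes */

	printf("order=%u records=%u poolsz=%u bytes\n", order, records, poolsz);
	return 0;
}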
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 399b62e223d2db..f6103e6bf69a8b 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -13,7 +13,7 @@
#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -455,10 +455,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
{
u64 error_control;
- switch (c->x86_model) {
- case INTEL_FAM6_SANDYBRIDGE_X:
- case INTEL_FAM6_IVYBRIDGE_X:
- case INTEL_FAM6_HASWELL_X:
+ switch (c->x86_vfm) {
+ case INTEL_SANDYBRIDGE_X:
+ case INTEL_IVYBRIDGE_X:
+ case INTEL_HASWELL_X:
if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
return;
error_control |= 2;
@@ -484,12 +484,11 @@ bool intel_filter_mce(struct mce *m)
struct cpuinfo_x86 *c = &boot_cpu_data;
/* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
- if ((c->x86 == 6) &&
- ((c->x86_model == INTEL_FAM6_HASWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_L) ||
- (c->x86_model == INTEL_FAM6_BROADWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_G) ||
- (c->x86_model == INTEL_FAM6_SKYLAKE_X)) &&
+ if ((c->x86_vfm == INTEL_HASWELL ||
+ c->x86_vfm == INTEL_HASWELL_L ||
+ c->x86_vfm == INTEL_BROADWELL ||
+ c->x86_vfm == INTEL_HASWELL_G ||
+ c->x86_vfm == INTEL_SKYLAKE_X) &&
(m->bank == 0) &&
((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
return true;
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index c4477162c07d13..5d31d5f68ff206 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -12,7 +12,7 @@
#include <linux/uaccess.h>
#include <asm/mce.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
@@ -39,20 +39,20 @@ static struct severity {
u64 mask;
u64 result;
unsigned char sev;
- unsigned char mcgmask;
- unsigned char mcgres;
+ unsigned short mcgmask;
+ unsigned short mcgres;
unsigned char ser;
unsigned char context;
unsigned char excp;
unsigned char covered;
- unsigned char cpu_model;
+ unsigned int cpu_vfm;
unsigned char cpu_minstepping;
unsigned char bank_lo, bank_hi;
char *msg;
} severities[] = {
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
-#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s
+#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s
#define KERNEL .context = IN_KERNEL
#define USER .context = IN_USER
#define KERNEL_RECOV .context = IN_KERNEL_RECOV
@@ -128,7 +128,7 @@ static struct severity {
MCESEV(
AO, "Uncorrected Patrol Scrub Error",
SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0),
- MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18)
+ VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18)
),
/* ignore OVER for UCNA */
@@ -174,6 +174,18 @@ static struct severity {
USER
),
MCESEV(
+ AR, "Data load error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ AR, "Instruction fetch error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
PANIC, "Data load in unrecoverable area of kernel",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
KERNEL
@@ -290,7 +302,6 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
switch (fixup_type) {
case EX_TYPE_UACCESS:
- case EX_TYPE_COPY:
if (!copy_user)
return IN_KERNEL;
m->kflags |= MCE_IN_KERNEL_COPYIN;
@@ -386,7 +397,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char
continue;
if (s->excp && excp != s->excp)
continue;
- if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model)
+ if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm)
continue;
if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping)
continue;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 13b45b9c806dae..c0d56c02b8da9f 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -84,8 +84,6 @@ struct microcode_amd {
unsigned int mpb[];
};
-#define PATCH_MAX_SIZE (3 * PAGE_SIZE)
-
static struct equiv_cpu_table {
unsigned int num_entries;
struct equiv_cpu_entry *entry;
@@ -465,7 +463,7 @@ static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, siz
return !__apply_microcode_amd(mc);
}
-static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp, u8 family)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct firmware fw;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 5f0414452b67e2..815fa67356a2dd 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -21,7 +21,7 @@
#include <linux/uio.h>
#include <linux/mm.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
@@ -577,8 +577,7 @@ static bool is_blacklisted(unsigned int cpu)
* This behavior is documented in item BDF90, #334165 (Intel Xeon
* Processor E7-8800/4800 v4 Product Family).
*/
- if (c->x86 == 6 &&
- c->x86_model == INTEL_FAM6_BROADWELL_X &&
+ if (c->x86_vfm == INTEL_BROADWELL_X &&
c->x86_stepping == 0x01 &&
llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 83e40341583e6f..a113d9aba553e0 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -22,7 +22,7 @@
#include <linux/cacheinfo.h>
#include <linux/cpuhotplug.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include "internal.h"
@@ -56,14 +56,9 @@ int max_name_width, max_data_width;
*/
bool rdt_alloc_capable;
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+static void mba_wrmsr_intel(struct msr_param *m);
+static void cat_wrmsr(struct msr_param *m);
+static void mba_wrmsr_amd(struct msr_param *m);
#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains)
@@ -309,12 +304,11 @@ static void rdt_get_cdp_l2_config(void)
rdt_get_cdp_config(RDT_RESOURCE_L2);
}
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void mba_wrmsr_amd(struct msr_param *m)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
@@ -334,25 +328,22 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
return r->default_ctrl;
}
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r)
+static void mba_wrmsr_intel(struct msr_param *m)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
/* Write the delay values for mba. */
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r));
+ wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
}
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void cat_wrmsr(struct msr_param *m)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
@@ -362,6 +353,8 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
{
struct rdt_domain *d;
+ lockdep_assert_cpus_held();
+
list_for_each_entry(d, &r->domains, list) {
/* Find the domain that contains this CPU */
if (cpumask_test_cpu(cpu, &d->cpu_mask))
@@ -378,19 +371,11 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
void rdt_ctrl_update(void *arg)
{
+ struct rdt_hw_resource *hw_res;
struct msr_param *m = arg;
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
- struct rdt_resource *r = m->res;
- int cpu = smp_processor_id();
- struct rdt_domain *d;
- d = get_domain_from_cpu(cpu, r);
- if (d) {
- hw_res->msr_update(d, m, r);
- return;
- }
- pr_warn_once("cpu %d not found in any domain for resource %s\n",
- cpu, r->name);
+ hw_res = resctrl_to_arch_res(m->res);
+ hw_res->msr_update(m);
}
/*
@@ -463,9 +448,11 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
hw_dom->ctrl_val = dc;
setup_default_ctrlval(r, dc);
+ m.res = r;
+ m.dom = d;
m.low = 0;
m.high = hw_res->num_closid;
- hw_res->msr_update(d, &m, r);
+ hw_res->msr_update(&m);
return 0;
}
@@ -821,18 +808,18 @@ static __init bool get_rdt_mon_resources(void)
static __init void __check_quirks_intel(void)
{
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_HASWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_HASWELL_X:
if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
cache_alloc_hsw_probe();
break;
- case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_SKYLAKE_X:
if (boot_cpu_data.x86_stepping <= 4)
set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
else
set_rdt_options("!l3cat");
fallthrough;
- case INTEL_FAM6_BROADWELL_X:
+ case INTEL_BROADWELL_X:
intel_rdt_mbm_apply_quirk();
break;
}
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 7997b47743a210..b7291f60399c0e 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -272,22 +272,6 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
}
}
-static bool apply_config(struct rdt_hw_domain *hw_dom,
- struct resctrl_staged_config *cfg, u32 idx,
- cpumask_var_t cpu_mask)
-{
- struct rdt_domain *dom = &hw_dom->d_resctrl;
-
- if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) {
- cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask);
- hw_dom->ctrl_val[idx] = cfg->new_ctrl;
-
- return true;
- }
-
- return false;
-}
-
int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
@@ -302,9 +286,10 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
hw_dom->ctrl_val[idx] = cfg_val;
msr_param.res = r;
+ msr_param.dom = d;
msr_param.low = idx;
msr_param.high = idx + 1;
- hw_res->msr_update(d, &msr_param, r);
+ hw_res->msr_update(&msr_param);
return 0;
}
@@ -315,48 +300,39 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
struct rdt_hw_domain *hw_dom;
struct msr_param msr_param;
enum resctrl_conf_type t;
- cpumask_var_t cpu_mask;
struct rdt_domain *d;
u32 idx;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
- msr_param.res = NULL;
list_for_each_entry(d, &r->domains, list) {
hw_dom = resctrl_to_arch_dom(d);
+ msr_param.res = NULL;
for (t = 0; t < CDP_NUM_TYPES; t++) {
cfg = &hw_dom->d_resctrl.staged_config[t];
if (!cfg->have_new_ctrl)
continue;
idx = get_config_index(closid, t);
- if (!apply_config(hw_dom, cfg, idx, cpu_mask))
+ if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
continue;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
if (!msr_param.res) {
msr_param.low = idx;
msr_param.high = msr_param.low + 1;
msr_param.res = r;
+ msr_param.dom = d;
} else {
msr_param.low = min(msr_param.low, idx);
msr_param.high = max(msr_param.high, idx + 1);
}
}
+ if (msr_param.res)
+ smp_call_function_any(&d->cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- if (cpumask_empty(cpu_mask))
- goto done;
-
- /* Update resource control msr on all the CPUs. */
- on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-
-done:
- free_cpumask_var(cpu_mask);
-
return 0;
}
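The rewritten resctrl_arch_update_domains() above collapses all staged indices within a domain into one contiguous [low, high) MSR range and then issues a single IPI per domain. A minimal sketch of that range coalescing follows; the struct and helper names are hypothetical.

#include <stdbool.h>
#include <stdio.h>

/* Accumulates a half-open index range [low, high), like msr_param does. */
struct idx_range {
	unsigned int	low, high;
	bool		valid;
};

static void range_add(struct idx_range *r, unsigned int idx)
{
	if (!r->valid) {
		r->low = idx;
		r->high = idx + 1;
		r->valid = true;
		return;
	}
	if (idx < r->low)
		r->low = idx;
	if (idx + 1 > r->high)
		r->high = idx + 1;
}

int main(void)
{
	struct idx_range r = { 0 };
	unsigned int staged[] = { 7, 3, 5 };	/* example changed indices */

	for (unsigned int i = 0; i < sizeof(staged) / sizeof(staged[0]); i++)
		range_add(&r, staged[i]);

	/* One MSR write loop then covers indices 3..7, i.e. [3, 8). */
	printf("low=%u high=%u\n", r.low, r.high);
	return 0;
}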
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 1a8687f8073a89..f1d926832ec8af 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -379,11 +379,13 @@ static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r)
/**
* struct msr_param - set a range of MSRs from a domain
* @res: The resource to use
+ * @dom: The domain to update
* @low: Beginning index from base MSR
* @high: End index
*/
struct msr_param {
struct rdt_resource *res;
+ struct rdt_domain *dom;
u32 low;
u32 high;
};
@@ -443,8 +445,7 @@ struct rdt_hw_resource {
struct rdt_resource r_resctrl;
u32 num_closid;
unsigned int msr_base;
- void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+ void (*msr_update)(struct msr_param *m);
unsigned int mon_scale;
unsigned int mbm_width;
unsigned int mbm_cfg_mask;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c34a35ec0f031a..2345e6836593fc 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -24,6 +24,7 @@
#include <asm/resctrl.h>
#include "internal.h"
+#include "trace.h"
/**
* struct rmid_entry - dirty tracking for all RMID.
@@ -354,6 +355,16 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
rmid_dirty = true;
} else {
rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+
+ /*
+ * x86's CLOSID and RMID are independent numbers, so the entry's
+ * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
+ * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
+ * used to select the configuration. It is thus necessary to track both
+ * CLOSID and RMID because there may be dependencies between them
+ * on some architectures.
+ */
+ trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->id, val);
}
if (force_free || !rmid_dirty) {
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 884b88e2514130..aacf236dfe3b9c 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -23,7 +23,7 @@
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include <asm/perf_event.h>
@@ -31,7 +31,7 @@
#include "internal.h"
#define CREATE_TRACE_POINTS
-#include "pseudo_lock_event.h"
+#include "trace.h"
/*
* The bits needed to disable hardware prefetching varies based on the
@@ -88,8 +88,8 @@ static u64 get_prefetch_disable_bits(void)
boot_cpu_data.x86 != 6)
return 0;
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -100,8 +100,8 @@ static u64 get_prefetch_disable_bits(void)
* 63:4 Reserved
*/
return 0xF;
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -1084,9 +1084,9 @@ static int measure_l2_residency(void *_plr)
* L2_HIT 02H
* L2_MISS 10H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
.umask = 0x10);
perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
@@ -1123,8 +1123,8 @@ static int measure_l3_residency(void *_plr)
* MISS 41H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/* On BDW the hit event counts references, not hits */
perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
.umask = 0x4f);
@@ -1142,7 +1142,7 @@ static int measure_l3_residency(void *_plr)
*/
counts.miss_after -= counts.miss_before;
- if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) {
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
/*
* On BDW references and misses are counted, need to adjust.
* Sometimes the "hits" counter is a bit more than the
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 011e17efb1a66e..02f213f1c51c51 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2813,16 +2813,12 @@ static int reset_all_ctrls(struct rdt_resource *r)
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
struct rdt_hw_domain *hw_dom;
struct msr_param msr_param;
- cpumask_var_t cpu_mask;
struct rdt_domain *d;
int i;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
msr_param.res = r;
msr_param.low = 0;
msr_param.high = hw_res->num_closid;
@@ -2834,17 +2830,13 @@ static int reset_all_ctrls(struct rdt_resource *r)
*/
list_for_each_entry(d, &r->domains, list) {
hw_dom = resctrl_to_arch_dom(d);
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
for (i = 0; i < hw_res->num_closid; i++)
hw_dom->ctrl_val[i] = r->default_ctrl;
+ msr_param.dom = d;
+ smp_call_function_any(&d->cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- /* Update CBM on all the CPUs in cpu_mask */
- on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-
- free_cpumask_var(cpu_mask);
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/trace.h
index 428ebbd4270b93..2a506316b30342 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h
+++ b/arch/x86/kernel/cpu/resctrl/trace.h
@@ -2,8 +2,8 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM resctrl
-#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_PSEUDO_LOCK_H
+#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RESCTRL_H
#include <linux/tracepoint.h>
@@ -35,9 +35,25 @@ TRACE_EVENT(pseudo_lock_l3,
TP_printk("hits=%llu miss=%llu",
__entry->l3_hits, __entry->l3_miss));
-#endif /* _TRACE_PSEUDO_LOCK_H */
+TRACE_EVENT(mon_llc_occupancy_limbo,
+ TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
+ TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
+ TP_STRUCT__entry(__field(u32, ctrl_hw_id)
+ __field(u32, mon_hw_id)
+ __field(int, domain_id)
+ __field(u64, llc_occupancy_bytes)),
+ TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
+ __entry->mon_hw_id = mon_hw_id;
+ __entry->domain_id = domain_id;
+ __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
+ TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
+ __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
+ __entry->llc_occupancy_bytes)
+ );
+
+#endif /* _TRACE_RESCTRL_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE pseudo_lock_event
+#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
index a7aa6eff4ae5ba..c3a3e9225c46d1 100644
--- a/arch/x86/kernel/cpu/topology_amd.c
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -58,7 +58,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id)
tscan->amd_node_id = node_id;
}
-static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
+static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext)
{
struct {
// eax
@@ -86,7 +86,7 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
* If leaf 0xb is available, then the domain shifts are set
* already and nothing to do here.
*/
- if (!has_0xb) {
+ if (!has_topoext) {
/*
* Leaf 0x80000008 set the CORE domain shift already.
* Update the SMT domain, but do not propagate it.
@@ -169,21 +169,24 @@ static void topoext_fixup(struct topo_scan *tscan)
static void parse_topology_amd(struct topo_scan *tscan)
{
- bool has_0xb = false;
+ bool has_topoext = false;
/*
* If the extended topology leaf 0x8000_001e is available
- * try to get SMT and CORE shift from leaf 0xb first, then
- * try to get the CORE shift from leaf 0x8000_0008.
+ * try to get SMT, CORE, TILE, and DIE shifts from extended
+ * CPUID leaf 0x8000_0026 on supported processors first. If
+ * extended CPUID leaf 0x8000_0026 is not supported, try to
+ * get SMT and CORE shift from leaf 0xb first, then try to
+ * get the CORE shift from leaf 0x8000_0008.
*/
if (cpu_feature_enabled(X86_FEATURE_TOPOEXT))
- has_0xb = cpu_parse_topology_ext(tscan);
+ has_topoext = cpu_parse_topology_ext(tscan);
- if (!has_0xb && !parse_8000_0008(tscan))
+ if (!has_topoext && !parse_8000_0008(tscan))
return;
/* Prefer leaf 0x8000001e if available */
- if (parse_8000_001e(tscan, has_0xb))
+ if (parse_8000_001e(tscan, has_topoext))
return;
/* Try the NODEID MSR */
diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c
index e477228cd5b2b4..467b0326bf1a1e 100644
--- a/arch/x86/kernel/cpu/topology_ext.c
+++ b/arch/x86/kernel/cpu/topology_ext.c
@@ -13,7 +13,10 @@ enum topo_types {
CORE_TYPE = 2,
MAX_TYPE_0B = 3,
MODULE_TYPE = 3,
+ AMD_CCD_TYPE = 3,
TILE_TYPE = 4,
+ AMD_SOCKET_TYPE = 4,
+ MAX_TYPE_80000026 = 5,
DIE_TYPE = 5,
DIEGRP_TYPE = 6,
MAX_TYPE_1F = 7,
@@ -32,6 +35,13 @@ static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = {
[DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN,
};
+static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN,
+ [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN,
+};
+
static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
unsigned int *last_dom)
{
@@ -56,6 +66,7 @@ static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
switch (leaf) {
case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break;
case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break;
+ case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break;
default: return false;
}
@@ -125,6 +136,10 @@ bool cpu_parse_topology_ext(struct topo_scan *tscan)
if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f))
return true;
+ /* AMD: Try leaf 0x80000026 first. */
+ if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026))
+ return true;
+
/* Intel/AMD: Fall back to leaf 0xB if available */
return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b);
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 003e0298f46a47..8e3c53b4d070e5 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -24,6 +24,7 @@
#include <asm/pci_x86.h>
#include <asm/setup.h>
#include <asm/i8259.h>
+#include <asm/numa.h>
#include <asm/prom.h>
__initdata u64 initial_dtb;
@@ -137,6 +138,7 @@ static void __init dtb_cpu_setup(void)
continue;
}
topology_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+ set_apicid_to_node(apic_id, of_node_to_nid(dn));
}
}
@@ -277,9 +279,18 @@ static void __init dtb_apic_setup(void)
dtb_ioapic_setup();
}
-#ifdef CONFIG_OF_EARLY_FLATTREE
+static void __init x86_dtb_parse_smp_config(void)
+{
+ if (!of_have_populated_dt())
+ return;
+
+ dtb_setup_hpet();
+ dtb_apic_setup();
+}
+
void __init x86_flattree_get_config(void)
{
+#ifdef CONFIG_OF_EARLY_FLATTREE
u32 size, map_len;
void *dt;
@@ -301,14 +312,7 @@ void __init x86_flattree_get_config(void)
if (initial_dtb)
early_memunmap(dt, map_len);
-}
#endif
-
-void __init x86_dtb_parse_smp_config(void)
-{
- if (!of_have_populated_dt())
- return;
-
- dtb_setup_hpet();
- dtb_apic_setup();
+ if (of_have_populated_dt())
+ x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 44a91ef5a23b8e..a7d562697e50e4 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -405,8 +405,8 @@ static void __die_header(const char *str, struct pt_regs *regs, long err)
pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
printk(KERN_DEFAULT
- "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
- pr,
+ "Oops: %s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff,
+ ++die_counter, pr,
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 520deb411a7025..1209c7aebb211f 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -145,8 +145,8 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
asm volatile(
"fnclex\n\t"
"emms\n\t"
- "fildl %P[addr]" /* set F?P to defined value */
- : : [addr] "m" (fpstate));
+ "fildl %[addr]" /* set F?P to defined value */
+ : : [addr] "m" (*fpstate));
}
if (use_xsave()) {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 33a214b1a4cec1..6276329f5e660f 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1434,8 +1434,8 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
return rstor;
/*
- * XSAVE(S): clone(), fpu_swap_kvm_fpu()
- * XRSTORS(S): fpu_swap_kvm_fpu()
+ * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
+ * XRSTORS(S): fpu_swap_kvm_fpstate()
*/
/*
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index b50f3641c4d6de..2e42056d230630 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -44,9 +44,6 @@
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
-
-#define SIZEOF_PTREGS 17*4
-
/*
* Worst-case size of the kernel mapping we need to make:
* a relocatable kernel can live anywhere in lowmem, so we need to be able
@@ -488,19 +485,13 @@ SYM_DATA_END(initial_page_table)
.data
.balign 4
-/*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
- * reliably detect the end of the stack.
- */
-SYM_DATA(initial_stack,
- .long init_thread_union + THREAD_SIZE -
- SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING)
+SYM_DATA(initial_stack, .long __top_init_kernel_stack)
__INITRODATA
int_msg:
.asciz "Unknown interrupt or fault at: %p %p %p\n"
-#include "../../x86/xen/xen-head.S"
+#include "../xen/xen-head.S"
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d8198fbd70e54e..330922b328bfed 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -66,7 +66,7 @@ SYM_CODE_START_NOALIGN(startup_64)
mov %rsi, %r15
/* Set up the stack for verify_cpu() */
- leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp
+ leaq __top_init_kernel_stack(%rip), %rsp
/* Setup GSBASE to allow stack canary access for C code */
movl $MSR_GS_BASE, %ecx
@@ -720,7 +720,7 @@ SYM_DATA(smpboot_control, .long 0)
SYM_DATA(phys_base, .quad 0x0)
EXPORT_SYMBOL(phys_base)
-#include "../../x86/xen/xen-head.S"
+#include "../xen/xen-head.S"
__PAGE_ALIGNED_BSS
SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7f0732bc0ccd23..263f8aed4e2cf8 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -44,7 +44,7 @@
#include <asm/svm.h>
#include <asm/e820/api.h>
-DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
static int kvmapf = 1;
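
DEFINE_STATIC_KEY_FALSE_RO() places the key in __ro_after_init, which is safe when the branch is only ever flipped during boot, as kvm_async_pf_enabled is. A minimal sketch of the pattern with made-up names (not taken from this patch):

#include <linux/init.h>
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE_RO(demo_feature_key);

static int __init demo_feature_detect(void)
{
	/* Stand-in for real detection; must run before rodata is sealed,
	 * since the key itself becomes read-only afterwards. */
	if (IS_ENABLED(CONFIG_X86_64))
		static_branch_enable(&demo_feature_key);

	return 0;
}
early_initcall(demo_feature_detect);

static inline bool demo_feature_active(void)
{
	/* Compiles to a patched jump/NOP, no memory load on the fast path. */
	return static_branch_unlikely(&demo_feature_key);
}
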
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 2e7066980f3e8b..51a849a79c985c 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -10,7 +10,6 @@
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/time.h>
-#include <asm/intel-mid.h>
#include <asm/setup.h>
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 47e7fcdbdacd5b..05c5aa951da746 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -7,6 +7,7 @@
*/
#include <linux/acpi.h>
#include <linux/console.h>
+#include <linux/cpu.h>
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
#include <linux/efi.h>
@@ -753,6 +754,22 @@ void __init setup_arch(char **cmdline_p)
boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
+ strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
/*
* If we have OLPC OFW, we might end up relocating the fixmap due to
* reserve_top(), so do this before touching the ioremap area.
@@ -832,22 +849,6 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
-#ifdef CONFIG_CMDLINE_BOOL
-#ifdef CONFIG_CMDLINE_OVERRIDE
- strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-#else
- if (builtin_cmdline[0]) {
- /* append boot loader cmdline to builtin */
- strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
- strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
- }
-#endif
-#endif
-
- strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
/*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
@@ -1216,3 +1217,10 @@ static int __init register_kernel_offset_dumper(void)
return 0;
}
__initcall(register_kernel_offset_dumper);
+
+#ifdef CONFIG_HOTPLUG_CPU
+bool arch_cpu_is_hotpluggable(int cpu)
+{
+ return cpu > 0;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 38ad066179d81f..3342ed58e168a3 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -648,7 +648,7 @@ static u64 __init get_secrets_page(void)
static u64 __init get_snp_jump_table_addr(void)
{
- struct snp_secrets_page_layout *layout;
+ struct snp_secrets_page *secrets;
void __iomem *mem;
u64 pa, addr;
@@ -662,9 +662,9 @@ static u64 __init get_snp_jump_table_addr(void)
return 0;
}
- layout = (__force struct snp_secrets_page_layout *)mem;
+ secrets = (__force struct snp_secrets_page *)mem;
- addr = layout->os_area.ap_jump_table_pa;
+ addr = secrets->os_area.ap_jump_table_pa;
iounmap(mem);
return addr;
@@ -938,7 +938,7 @@ static int snp_set_vmsa(void *va, bool vmsa)
#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2)
#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3)
-static void *snp_alloc_vmsa_page(void)
+static void *snp_alloc_vmsa_page(int cpu)
{
struct page *p;
@@ -950,7 +950,7 @@ static void *snp_alloc_vmsa_page(void)
*
* Allocate an 8k page which is also 8k-aligned.
*/
- p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ p = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
if (!p)
return NULL;
@@ -1019,7 +1019,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
* #VMEXIT of that vCPU would wipe out all of the settings being done
* here.
*/
- vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page();
+ vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(cpu);
if (!vmsa)
return -ENOMEM;
@@ -1341,7 +1341,7 @@ static void __init alloc_runtime_data(int cpu)
{
struct sev_es_runtime_data *data;
- data = memblock_alloc(sizeof(*data), PAGE_SIZE);
+ data = memblock_alloc_node(sizeof(*data), PAGE_SIZE, cpu_to_node(cpu));
if (!data)
panic("Can't allocate SEV-ES runtime data");
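
Both hunks switch to node-aware allocators so per-CPU SEV-ES data lands on the node of the CPU that will use it. A minimal sketch of the page-allocation side, with illustrative names:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>

static void *demo_alloc_for_cpu(int cpu, unsigned int order)
{
	struct page *p;

	/* Allocate on the CPU's own NUMA node instead of the caller's. */
	p = alloc_pages_node(cpu_to_node(cpu),
			     GFP_KERNEL_ACCOUNT | __GFP_ZERO, order);

	return p ? page_address(p) : NULL;
}
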
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 59e15dd8d0f866..6f1e9883f07425 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -163,8 +163,8 @@ static int shstk_setup(void)
if (features_enabled(ARCH_SHSTK_SHSTK))
return 0;
- /* Also not supported for 32 bit and x32 */
- if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall())
+ /* Also not supported for 32 bit */
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
return -EOPNOTSUPP;
size = adjust_shstk_size(0);
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index c12624bc82a31b..ef654530bf5a93 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -34,7 +34,7 @@
#include <asm/gsseg.h>
#ifdef CONFIG_IA32_EMULATION
-#include <asm/ia32_unistd.h>
+#include <asm/unistd_32_ia32.h>
static inline void reload_segments(struct sigcontext_32 *sc)
{
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 23d8aaf8d9fd19..8a94053c544465 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -315,6 +315,9 @@ int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
uc_flags = frame_uc_flags(regs);
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
@@ -377,6 +380,9 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
+ if (restore_signal_shadow_stack())
+ goto badframe;
+
if (compat_restore_altstack(&frame->uc.uc_stack))
goto badframe;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 76bb65045c649a..0c35207320cb4f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -438,9 +438,9 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
*/
static const struct x86_cpu_id intel_cod_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */
- X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */
+ X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */
{}
};
@@ -1033,20 +1033,22 @@ static __init void disable_smp(void)
void __init smp_prepare_cpus_common(void)
{
- unsigned int i;
+ unsigned int cpu, node;
/* Mark all except the boot CPU as hotpluggable */
- for_each_possible_cpu(i) {
- if (i)
- per_cpu(cpu_info.cpu_index, i) = nr_cpu_ids;
+ for_each_possible_cpu(cpu) {
+ if (cpu)
+ per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids;
}
- for_each_possible_cpu(i) {
- zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
+ for_each_possible_cpu(cpu) {
+ node = cpu_to_node(cpu);
+
+ zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node);
}
set_cpu_sibling_map(0);
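
zalloc_cpumask_var_node() only differs from zalloc_cpumask_var() when CONFIG_CPUMASK_OFFSTACK=y, in which case the mask storage is allocated on the given node. A minimal sketch with made-up names:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/topology.h>

static DEFINE_PER_CPU(cpumask_var_t, demo_mask);

static int __init demo_alloc_masks(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		/* Keep each CPU's mask on that CPU's own memory node. */
		if (!zalloc_cpumask_var_node(&per_cpu(demo_mask, cpu),
					     GFP_KERNEL, cpu_to_node(cpu)))
			return -ENOMEM;
	}

	return 0;
}
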
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
deleted file mode 100644
index d42c28b8bfd80c..00000000000000
--- a/arch/x86/kernel/topology.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Populate sysfs with topology information
- *
- * Written by: Matthew Dobson, IBM Corporation
- * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- */
-#include <linux/interrupt.h>
-#include <linux/nodemask.h>
-#include <linux/export.h>
-#include <linux/mmzone.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/irq.h>
-#include <asm/io_apic.h>
-#include <asm/cpu.h>
-
-#ifdef CONFIG_HOTPLUG_CPU
-bool arch_cpu_is_hotpluggable(int cpu)
-{
- return cpu > 0;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5a69a49acc963f..06b170759e5bf5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -26,7 +26,7 @@
#include <asm/x86_init.h>
#include <asm/geode.h>
#include <asm/apic.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/i8259.h>
#include <asm/uv/uv.h>
@@ -44,7 +44,7 @@ EXPORT_SYMBOL(tsc_khz);
static int __read_mostly tsc_unstable;
static unsigned int __initdata tsc_early_khz;
-static DEFINE_STATIC_KEY_FALSE(__use_tsc);
+static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc);
int tsc_clocksource_reliable;
@@ -682,7 +682,7 @@ unsigned long native_calibrate_tsc(void)
* clock.
*/
if (crystal_khz == 0 &&
- boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D)
+ boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
crystal_khz = 25000;
/*
@@ -713,7 +713,7 @@ unsigned long native_calibrate_tsc(void)
* For Atom SoCs TSC is the only reliable clocksource.
* Mark TSC reliable so no watchdog on it.
*/
- if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
+ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 6555a857a1e6ea..deeb028256709a 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -147,13 +147,13 @@ static const struct freq_desc freq_desc_lgm = {
};
static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm),
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_MID, &freq_desc_pnw),
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_TABLET, &freq_desc_clv),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm),
{}
};
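
X86_MATCH_VFM() keys the table off the combined vendor/family/model value that is also stored in boot_cpu_data.x86_vfm, replacing the Intel-only FAM6 helpers. A minimal sketch of matching against such a table (the table contents here are illustrative, not from this patch):

#include <asm/cpu_device_id.h>
#include <asm/processor.h>

static const struct x86_cpu_id demo_vfm_ids[] = {
	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT,	NULL),
	X86_MATCH_VFM(INTEL_HASWELL_X,		NULL),
	{}
};

static bool demo_cpu_needs_quirk(void)
{
	/* One lookup covers vendor, family and model at once; for a single
	 * model, boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT also works. */
	return x86_match_cpu(demo_vfm_ids) != NULL;
}
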
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 56451fd2099e71..3509afc6a672ef 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -15,11 +15,7 @@
* put it inside the section definition.
*/
-#ifdef CONFIG_X86_32
-#define LOAD_OFFSET __PAGE_OFFSET
-#else
#define LOAD_OFFSET __START_KERNEL_map
-#endif
#define RUNTIME_DISCARD_EXIT
#define EMITS_PT_NOTE
@@ -114,11 +110,10 @@ PHDRS {
SECTIONS
{
+ . = __START_KERNEL;
#ifdef CONFIG_X86_32
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET);
#else
- . = __START_KERNEL;
phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET);
#endif
@@ -172,6 +167,9 @@ SECTIONS
/* init_task */
INIT_TASK_DATA(THREAD_SIZE)
+ /* equivalent to task_pt_regs(&init_task) */
+ __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
+
#ifdef CONFIG_X86_32
/* 32 bit has nosave before _edata */
NOSAVE_DATA
diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c
index 1b118fd9314093..39423ec409e188 100644
--- a/arch/x86/math-emu/fpu_etc.c
+++ b/arch/x86/math-emu/fpu_etc.c
@@ -120,9 +120,14 @@ static void fxam(FPU_REG *st0_ptr, u_char st0tag)
setcc(c);
}
+static void FPU_ST0_illegal(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ FPU_illegal();
+}
+
static FUNC_ST0 const fp_etc_table[] = {
- fchs, fabs, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal,
- ftst_, fxam, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal
+ fchs, fabs, FPU_ST0_illegal, FPU_ST0_illegal,
+ ftst_, fxam, FPU_ST0_illegal, FPU_ST0_illegal,
};
void FPU_etc(void)
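
Casting FPU_illegal(), which takes no arguments, to FUNC_ST0 means every indirect call through the table goes through a mismatched prototype, which checkers such as kCFI and -Wcast-function-type-strict flag; the fix adds a wrapper with the exact prototype the table expects. A user-space sketch of the same pattern, with made-up names:

#include <stdio.h>

typedef void (*handler_fn)(int reg, unsigned char tag);

static void report_illegal(void)
{
	puts("illegal operation");
}

/* Correctly typed wrapper: the table entry's prototype matches the function
 * actually being called, so an indirect-call type check passes. */
static void handle_illegal(int reg, unsigned char tag)
{
	(void)reg;
	(void)tag;
	report_illegal();
}

static const handler_fn handlers[] = {
	handle_illegal,		/* not: (handler_fn)report_illegal */
};

int main(void)
{
	handlers[0](0, 0);
	return 0;
}
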
diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c
index 990d847ae902c6..85daf98c81c3ea 100644
--- a/arch/x86/math-emu/fpu_trig.c
+++ b/arch/x86/math-emu/fpu_trig.c
@@ -433,13 +433,13 @@ static void fxtract(FPU_REG *st0_ptr, u_char st0_tag)
#endif /* PARANOID */
}
-static void fdecstp(void)
+static void fdecstp(FPU_REG *st0_ptr, u_char st0_tag)
{
clear_C1();
top--;
}
-static void fincstp(void)
+static void fincstp(FPU_REG *st0_ptr, u_char st0_tag)
{
clear_C1();
top++;
@@ -1631,7 +1631,7 @@ static void fscale(FPU_REG *st0_ptr, u_char st0_tag)
static FUNC_ST0 const trig_table_a[] = {
f2xm1, fyl2x, fptan, fpatan,
- fxtract, fprem1, (FUNC_ST0) fdecstp, (FUNC_ST0) fincstp
+ fxtract, fprem1, fdecstp, fincstp,
};
void FPU_triga(void)
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
index 742619e94bdf28..003a0b2753e68b 100644
--- a/arch/x86/math-emu/reg_constant.c
+++ b/arch/x86/math-emu/reg_constant.c
@@ -108,8 +108,13 @@ static void fldz(int rc)
typedef void (*FUNC_RC) (int);
+static void FPU_RC_illegal(int unused)
+{
+ FPU_illegal();
+}
+
static FUNC_RC constants_table[] = {
- fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC) FPU_illegal
+ fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, FPU_RC_illegal
};
void fconst(void)
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index b522933bfa56e8..51986e8a9d3535 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -164,13 +164,6 @@ static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
return ex_handler_default(fixup, regs);
}
-static bool ex_handler_copy(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
-{
- WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
- return ex_handler_fault(fixup, regs, trapnr);
-}
-
static bool ex_handler_msr(const struct exception_table_entry *fixup,
struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
@@ -341,8 +334,6 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
return ex_handler_fault(e, regs, trapnr);
case EX_TYPE_UACCESS:
return ex_handler_uaccess(e, regs, trapnr, fault_addr);
- case EX_TYPE_COPY:
- return ex_handler_copy(e, regs, trapnr);
case EX_TYPE_CLEAR_FS:
return ex_handler_clear_fs(e, regs);
case EX_TYPE_FPU_RESTORE:
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 67b18adc75ddc1..4b2a0448e20a7b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -515,18 +515,19 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
if (error_code & X86_PF_INSTR) {
unsigned int level;
+ bool nx, rw;
pgd_t *pgd;
pte_t *pte;
pgd = __va(read_cr3_pa());
pgd += pgd_index(address);
- pte = lookup_address_in_pgd(pgd, address, &level);
+ pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw);
- if (pte && pte_present(*pte) && !pte_exec(*pte))
+ if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx))
pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
from_kuid(&init_user_ns, current_uid()));
- if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ if (pte && pte_present(*pte) && pte_exec(*pte) && !nx &&
(pgd_flags(*pgd) & _PAGE_USER) &&
(__read_cr4() & X86_CR4_SMEP))
pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 827f9df87d7dfb..eb503f53c3195c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -262,21 +262,17 @@ static void __init probe_page_size_mask(void)
}
}
-#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \
- .family = 6, \
- .model = _model, \
- }
/*
* INVLPG may not properly flush Global entries
* on these CPUs when PCIDs are enabled.
*/
static const struct x86_cpu_id invlpg_miss_ids[] = {
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE ),
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
- INTEL_MATCH(INTEL_FAM6_ATOM_GRACEMONT ),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, 0),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0),
{}
};
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 80c9037ffadffd..19fdfbb171ed6e 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -619,7 +619,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
* Validate strict W^X semantics.
*/
static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
- unsigned long pfn, unsigned long npg)
+ unsigned long pfn, unsigned long npg,
+ bool nx, bool rw)
{
unsigned long end;
@@ -641,6 +642,10 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
return new;
+ /* Non-leaf translation entries can disable writing or execution. */
+ if (!rw || nx)
+ return new;
+
end = start + npg * PAGE_SIZE - 1;
WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
(unsigned long long)pgprot_val(old),
@@ -657,20 +662,26 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
/*
* Lookup the page table entry for a virtual address in a specific pgd.
- * Return a pointer to the entry and the level of the mapping.
+ * Return a pointer to the entry, the level of the mapping, and the effective
+ * NX and RW bits of all page table levels.
*/
-pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
- unsigned int *level)
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
*level = PG_LEVEL_NONE;
+ *nx = false;
+ *rw = true;
if (pgd_none(*pgd))
return NULL;
+ *nx |= pgd_flags(*pgd) & _PAGE_NX;
+ *rw &= pgd_flags(*pgd) & _PAGE_RW;
+
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d))
return NULL;
@@ -679,6 +690,9 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (p4d_leaf(*p4d) || !p4d_present(*p4d))
return (pte_t *)p4d;
+ *nx |= p4d_flags(*p4d) & _PAGE_NX;
+ *rw &= p4d_flags(*p4d) & _PAGE_RW;
+
pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
@@ -687,6 +701,9 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (pud_leaf(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+ *nx |= pud_flags(*pud) & _PAGE_NX;
+ *rw &= pud_flags(*pud) & _PAGE_RW;
+
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return NULL;
@@ -695,12 +712,27 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (pmd_leaf(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
+ *nx |= pmd_flags(*pmd) & _PAGE_NX;
+ *rw &= pmd_flags(*pmd) & _PAGE_RW;
+
*level = PG_LEVEL_4K;
return pte_offset_kernel(pmd, address);
}
/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
+{
+ bool nx, rw;
+
+ return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw);
+}
+
+/*
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
@@ -715,13 +747,16 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
- unsigned int *level)
+ unsigned int *level, bool *nx, bool *rw)
{
- if (cpa->pgd)
- return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
- address, level);
+ pgd_t *pgd;
+
+ if (!cpa->pgd)
+ pgd = pgd_offset_k(address);
+ else
+ pgd = cpa->pgd + pgd_index(address);
- return lookup_address(address, level);
+ return lookup_address_in_pgd_attr(pgd, address, level, nx, rw);
}
/*
@@ -849,12 +884,13 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, *tmp;
enum pg_level level;
+ bool nx, rw;
/*
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte)
return 1;
@@ -965,7 +1001,8 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);
- new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
+ nx, rw);
/*
* If there is a conflict, split the large page.
@@ -1046,6 +1083,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
pte_t *pbase = (pte_t *)page_address(base);
unsigned int i, level;
pgprot_t ref_prot;
+ bool nx, rw;
pte_t *tmp;
spin_lock(&pgd_lock);
@@ -1053,7 +1091,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up for us already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte) {
spin_unlock(&pgd_lock);
return 1;
@@ -1594,10 +1632,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ bool nx, rw;
address = __cpa_addr(cpa, cpa->curpage);
repeat:
- kpte = _lookup_address_cpa(cpa, address, &level);
+ kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@@ -1619,7 +1658,8 @@ repeat:
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);
- new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);
+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
+ nx, rw);
new_prot = pgprot_clear_protnone_bits(new_prot);
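
verify_rwx() previously looked only at the leaf entry; a mapping whose upper-level entries already clear _PAGE_RW or set _PAGE_NX is not actually writable and executable, so warning on the leaf alone would be spurious. The new nx/rw outputs fold in the permissions of every level, roughly like this simplified model (struct and helper are illustrative, not kernel API):

#include <linux/types.h>

struct demo_eff_perms {
	bool nx;	/* true once any level sets _PAGE_NX     */
	bool rw;	/* false once any level clears _PAGE_RW   */
};

static void demo_fold_level(struct demo_eff_perms *p,
			    unsigned long level_flags,
			    unsigned long nx_bit, unsigned long rw_bit)
{
	/* NX is sticky: one non-executable level makes the whole path NX. */
	p->nx |= !!(level_flags & nx_bit);
	/* RW must be granted by every level on the walk. */
	p->rw &= !!(level_flags & rw_bit);
}
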
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index f32451bdcfdd5b..f8126821a94d76 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -139,7 +139,6 @@ void __init x86_ce4100_early_setup(void)
x86_init.resources.probe_roms = x86_init_noop;
x86_init.mpparse.find_mptable = x86_init_noop;
x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
- x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
x86_init.pci.init = ce4100_pci_init;
x86_init.pci.init_irq = sdv_pci_init;
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index b42bfdab01a99a..c5f3bbdbdcfef6 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -62,11 +62,10 @@ static int iris_probe(struct platform_device *pdev)
return 0;
}
-static int iris_remove(struct platform_device *pdev)
+static void iris_remove(struct platform_device *pdev)
{
pm_power_off = old_pm_power_off;
printk(KERN_INFO "Iris power_off handler uninstalled.\n");
- return 0;
}
static struct platform_driver iris_driver = {
@@ -74,7 +73,7 @@ static struct platform_driver iris_driver = {
.name = "iris",
},
.probe = iris_probe,
- .remove = iris_remove,
+ .remove_new = iris_remove,
};
static struct resource iris_resources[] = {
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index f067ac780ba7aa..6a9c42de74e74f 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -144,7 +144,7 @@ static int xo1_pm_probe(struct platform_device *pdev)
return 0;
}
-static int xo1_pm_remove(struct platform_device *pdev)
+static void xo1_pm_remove(struct platform_device *pdev)
{
if (strcmp(pdev->name, "cs5535-pms") == 0)
pms_base = 0;
@@ -152,7 +152,6 @@ static int xo1_pm_remove(struct platform_device *pdev)
acpi_base = 0;
pm_power_off = NULL;
- return 0;
}
static struct platform_driver cs5535_pms_driver = {
@@ -160,7 +159,7 @@ static struct platform_driver cs5535_pms_driver = {
.name = "cs5535-pms",
},
.probe = xo1_pm_probe,
- .remove = xo1_pm_remove,
+ .remove_new = xo1_pm_remove,
};
static struct platform_driver cs5535_acpi_driver = {
@@ -168,7 +167,7 @@ static struct platform_driver cs5535_acpi_driver = {
.name = "olpc-xo1-pm-acpi",
},
.probe = xo1_pm_probe,
- .remove = xo1_pm_remove,
+ .remove_new = xo1_pm_remove,
};
static int __init xo1_pm_init(void)
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 89f25af4b3c339..46d42ff6e18ab5 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -598,7 +598,7 @@ err_ebook:
return r;
}
-static int xo1_sci_remove(struct platform_device *pdev)
+static void xo1_sci_remove(struct platform_device *pdev)
{
free_irq(sci_irq, pdev);
cancel_work_sync(&sci_work);
@@ -608,7 +608,6 @@ static int xo1_sci_remove(struct platform_device *pdev)
free_ebook_switch();
free_power_button();
acpi_base = 0;
- return 0;
}
static struct platform_driver xo1_sci_driver = {
@@ -617,7 +616,7 @@ static struct platform_driver xo1_sci_driver = {
.dev_groups = lid_groups,
},
.probe = xo1_sci_probe,
- .remove = xo1_sci_remove,
+ .remove_new = xo1_sci_remove,
.suspend = xo1_sci_suspend,
.resume = xo1_sci_resume,
};
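
The Iris and OLPC conversions follow the tree-wide move to the void-returning platform remove callback: the platform core ignores the old int return value anyway, and at this point in the tree the void variant is spelled .remove_new. A minimal sketch of the convention with made-up driver names:

#include <linux/module.h>
#include <linux/platform_device.h>

static int demo_probe(struct platform_device *pdev)
{
	return 0;
}

static void demo_remove(struct platform_device *pdev)
{
	/* Undo probe() here; there is no error code to (wrongly) return. */
}

static struct platform_driver demo_driver = {
	.driver = {
		.name = "demo-device",
	},
	.probe		= demo_probe,
	.remove_new	= demo_remove,
};
module_platform_driver(demo_driver);

MODULE_DESCRIPTION("Sketch of a platform driver using .remove_new");
MODULE_LICENSE("GPL");
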
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index bc31863c5ee638..a18591f6e6d941 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -42,7 +42,8 @@ KCOV_INSTRUMENT := n
# make up the standalone purgatory.ro
PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
-PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss -g0
+PURGATORY_CFLAGS := -mcmodel=small -ffreestanding -fno-zero-initialized-in-bss -g0
+PURGATORY_CFLAGS += -fpic -fvisibility=hidden
PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
PURGATORY_CFLAGS += -fno-stack-protector
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index b029fb81ebeee0..c101bed6194000 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -11,41 +11,42 @@
#define Elf_Shdr ElfW(Shdr)
#define Elf_Sym ElfW(Sym)
-static Elf_Ehdr ehdr;
-static unsigned long shnum;
-static unsigned int shstrndx;
-static unsigned int shsymtabndx;
-static unsigned int shxsymtabndx;
+static Elf_Ehdr ehdr;
+static unsigned long shnum;
+static unsigned int shstrndx;
+static unsigned int shsymtabndx;
+static unsigned int shxsymtabndx;
static int sym_index(Elf_Sym *sym);
struct relocs {
- uint32_t *offset;
- unsigned long count;
- unsigned long size;
+ uint32_t *offset;
+ unsigned long count;
+ unsigned long size;
};
-static struct relocs relocs16;
-static struct relocs relocs32;
+static struct relocs relocs16;
+static struct relocs relocs32;
+
#if ELF_BITS == 64
-static struct relocs relocs32neg;
-static struct relocs relocs64;
-#define FMT PRIu64
+static struct relocs relocs32neg;
+static struct relocs relocs64;
+# define FMT PRIu64
#else
-#define FMT PRIu32
+# define FMT PRIu32
#endif
struct section {
- Elf_Shdr shdr;
- struct section *link;
- Elf_Sym *symtab;
- Elf32_Word *xsymtab;
- Elf_Rel *reltab;
- char *strtab;
+ Elf_Shdr shdr;
+ struct section *link;
+ Elf_Sym *symtab;
+ Elf32_Word *xsymtab;
+ Elf_Rel *reltab;
+ char *strtab;
};
-static struct section *secs;
+static struct section *secs;
-static const char * const sym_regex_kernel[S_NSYMTYPES] = {
+static const char * const sym_regex_kernel[S_NSYMTYPES] = {
/*
* Following symbols have been audited. There values are constant and do
* not change if bzImage is loaded at a different physical address than
@@ -115,13 +116,13 @@ static const char * const sym_regex_realmode[S_NSYMTYPES] = {
"^pa_",
};
-static const char * const *sym_regex;
+static const char * const *sym_regex;
+
+static regex_t sym_regex_c[S_NSYMTYPES];
-static regex_t sym_regex_c[S_NSYMTYPES];
static int is_reloc(enum symtype type, const char *sym_name)
{
- return sym_regex[type] &&
- !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0);
+ return sym_regex[type] && !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0);
}
static void regex_init(int use_real_mode)
@@ -139,8 +140,7 @@ static void regex_init(int use_real_mode)
if (!sym_regex[i])
continue;
- err = regcomp(&sym_regex_c[i], sym_regex[i],
- REG_EXTENDED|REG_NOSUB);
+ err = regcomp(&sym_regex_c[i], sym_regex[i], REG_EXTENDED|REG_NOSUB);
if (err) {
regerror(err, &sym_regex_c[i], errbuf, sizeof(errbuf));
@@ -163,9 +163,10 @@ static const char *sym_type(unsigned type)
#undef SYM_TYPE
};
const char *name = "unknown sym type name";
- if (type < ARRAY_SIZE(type_name)) {
+
+ if (type < ARRAY_SIZE(type_name))
name = type_name[type];
- }
+
return name;
}
@@ -179,9 +180,10 @@ static const char *sym_bind(unsigned bind)
#undef SYM_BIND
};
const char *name = "unknown sym bind name";
- if (bind < ARRAY_SIZE(bind_name)) {
+
+ if (bind < ARRAY_SIZE(bind_name))
name = bind_name[bind];
- }
+
return name;
}
@@ -196,9 +198,10 @@ static const char *sym_visibility(unsigned visibility)
#undef SYM_VISIBILITY
};
const char *name = "unknown sym visibility name";
- if (visibility < ARRAY_SIZE(visibility_name)) {
+
+ if (visibility < ARRAY_SIZE(visibility_name))
name = visibility_name[visibility];
- }
+
return name;
}
@@ -244,9 +247,10 @@ static const char *rel_type(unsigned type)
#undef REL_TYPE
};
const char *name = "unknown type rel type name";
- if (type < ARRAY_SIZE(type_name) && type_name[type]) {
+
+ if (type < ARRAY_SIZE(type_name) && type_name[type])
name = type_name[type];
- }
+
return name;
}
@@ -256,15 +260,14 @@ static const char *sec_name(unsigned shndx)
const char *name;
sec_strtab = secs[shstrndx].strtab;
name = "<noname>";
- if (shndx < shnum) {
+
+ if (shndx < shnum)
name = sec_strtab + secs[shndx].shdr.sh_name;
- }
- else if (shndx == SHN_ABS) {
+ else if (shndx == SHN_ABS)
name = "ABSOLUTE";
- }
- else if (shndx == SHN_COMMON) {
+ else if (shndx == SHN_COMMON)
name = "COMMON";
- }
+
return name;
}
@@ -272,18 +275,19 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym)
{
const char *name;
name = "<noname>";
- if (sym->st_name) {
+
+ if (sym->st_name)
name = sym_strtab + sym->st_name;
- }
- else {
+ else
name = sec_name(sym_index(sym));
- }
+
return name;
}
static Elf_Sym *sym_lookup(const char *symname)
{
int i;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
long nsyms;
@@ -309,14 +313,15 @@ static Elf_Sym *sym_lookup(const char *symname)
}
#if BYTE_ORDER == LITTLE_ENDIAN
-#define le16_to_cpu(val) (val)
-#define le32_to_cpu(val) (val)
-#define le64_to_cpu(val) (val)
+# define le16_to_cpu(val) (val)
+# define le32_to_cpu(val) (val)
+# define le64_to_cpu(val) (val)
#endif
+
#if BYTE_ORDER == BIG_ENDIAN
-#define le16_to_cpu(val) bswap_16(val)
-#define le32_to_cpu(val) bswap_32(val)
-#define le64_to_cpu(val) bswap_64(val)
+# define le16_to_cpu(val) bswap_16(val)
+# define le32_to_cpu(val) bswap_32(val)
+# define le64_to_cpu(val) bswap_64(val)
#endif
static uint16_t elf16_to_cpu(uint16_t val)
@@ -337,13 +342,13 @@ static uint64_t elf64_to_cpu(uint64_t val)
{
return le64_to_cpu(val);
}
-#define elf_addr_to_cpu(x) elf64_to_cpu(x)
-#define elf_off_to_cpu(x) elf64_to_cpu(x)
-#define elf_xword_to_cpu(x) elf64_to_cpu(x)
+# define elf_addr_to_cpu(x) elf64_to_cpu(x)
+# define elf_off_to_cpu(x) elf64_to_cpu(x)
+# define elf_xword_to_cpu(x) elf64_to_cpu(x)
#else
-#define elf_addr_to_cpu(x) elf32_to_cpu(x)
-#define elf_off_to_cpu(x) elf32_to_cpu(x)
-#define elf_xword_to_cpu(x) elf32_to_cpu(x)
+# define elf_addr_to_cpu(x) elf32_to_cpu(x)
+# define elf_off_to_cpu(x) elf32_to_cpu(x)
+# define elf_xword_to_cpu(x) elf32_to_cpu(x)
#endif
static int sym_index(Elf_Sym *sym)
@@ -365,22 +370,17 @@ static int sym_index(Elf_Sym *sym)
static void read_ehdr(FILE *fp)
{
- if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
- die("Cannot read ELF header: %s\n",
- strerror(errno));
- }
- if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
+ if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1)
+ die("Cannot read ELF header: %s\n", strerror(errno));
+ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0)
die("No ELF magic\n");
- }
- if (ehdr.e_ident[EI_CLASS] != ELF_CLASS) {
+ if (ehdr.e_ident[EI_CLASS] != ELF_CLASS)
die("Not a %d bit executable\n", ELF_BITS);
- }
- if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
+ if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB)
die("Not a LSB ELF executable\n");
- }
- if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
+ if (ehdr.e_ident[EI_VERSION] != EV_CURRENT)
die("Unknown ELF version\n");
- }
+
/* Convert the fields to native endian */
ehdr.e_type = elf_half_to_cpu(ehdr.e_type);
ehdr.e_machine = elf_half_to_cpu(ehdr.e_machine);
@@ -439,19 +439,18 @@ static void read_shdrs(FILE *fp)
Elf_Shdr shdr;
secs = calloc(shnum, sizeof(struct section));
- if (!secs) {
- die("Unable to allocate %ld section headers\n",
- shnum);
- }
- if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- ehdr.e_shoff, strerror(errno));
- }
+ if (!secs)
+ die("Unable to allocate %ld section headers\n", shnum);
+
+ if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", ehdr.e_shoff, strerror(errno));
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
+
if (fread(&shdr, sizeof(shdr), 1, fp) != 1)
- die("Cannot read ELF section headers %d/%ld: %s\n",
- i, shnum, strerror(errno));
+ die("Cannot read ELF section headers %d/%ld: %s\n", i, shnum, strerror(errno));
+
sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name);
sec->shdr.sh_type = elf_word_to_cpu(shdr.sh_type);
sec->shdr.sh_flags = elf_xword_to_cpu(shdr.sh_flags);
@@ -471,31 +470,28 @@ static void read_shdrs(FILE *fp)
static void read_strtabs(FILE *fp)
{
int i;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_STRTAB) {
+
+ if (sec->shdr.sh_type != SHT_STRTAB)
continue;
- }
+
sec->strtab = malloc(sec->shdr.sh_size);
- if (!sec->strtab) {
- die("malloc of %" FMT " bytes for strtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->strtab)
+ die("malloc of %" FMT " bytes for strtab failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read symbol table: %s\n", strerror(errno));
}
}
static void read_symtabs(FILE *fp)
{
- int i,j;
+ int i, j;
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
@@ -504,19 +500,15 @@ static void read_symtabs(FILE *fp)
switch (sec->shdr.sh_type) {
case SHT_SYMTAB_SHNDX:
sec->xsymtab = malloc(sec->shdr.sh_size);
- if (!sec->xsymtab) {
- die("malloc of %" FMT " bytes for xsymtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read extended symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->xsymtab)
+ die("malloc of %" FMT " bytes for xsymtab failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read extended symbol table: %s\n", strerror(errno));
+
shxsymtabndx = i;
continue;
@@ -524,19 +516,15 @@ static void read_symtabs(FILE *fp)
num_syms = sec->shdr.sh_size / sizeof(Elf_Sym);
sec->symtab = malloc(sec->shdr.sh_size);
- if (!sec->symtab) {
- die("malloc of %" FMT " bytes for symtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->symtab)
+ die("malloc of %" FMT " bytes for symtab failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read symbol table: %s\n", strerror(errno));
+
for (j = 0; j < num_syms; j++) {
Elf_Sym *sym = &sec->symtab[j];
@@ -557,28 +545,27 @@ static void read_symtabs(FILE *fp)
static void read_relocs(FILE *fp)
{
- int i,j;
+ int i, j;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_REL_TYPE) {
+
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
continue;
- }
+
sec->reltab = malloc(sec->shdr.sh_size);
- if (!sec->reltab) {
- die("malloc of %" FMT " bytes for relocs failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->reltab)
+ die("malloc of %" FMT " bytes for relocs failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read symbol table: %s\n", strerror(errno));
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
Elf_Rel *rel = &sec->reltab[j];
+
rel->r_offset = elf_addr_to_cpu(rel->r_offset);
rel->r_info = elf_xword_to_cpu(rel->r_info);
#if (SHT_REL_TYPE == SHT_RELA)
@@ -601,23 +588,27 @@ static void print_absolute_symbols(void)
printf("Absolute symbols\n");
printf(" Num: Value Size Type Bind Visibility Name\n");
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
char *sym_strtab;
int j;
- if (sec->shdr.sh_type != SHT_SYMTAB) {
+ if (sec->shdr.sh_type != SHT_SYMTAB)
continue;
- }
+
sym_strtab = sec->link->strtab;
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Sym); j++) {
Elf_Sym *sym;
const char *name;
+
sym = &sec->symtab[j];
name = sym_name(sym_strtab, sym);
- if (sym->st_shndx != SHN_ABS) {
+
+ if (sym->st_shndx != SHN_ABS)
continue;
- }
+
printf(format,
j, sym->st_value, sym->st_size,
sym_type(ELF_ST_TYPE(sym->st_info)),
@@ -645,34 +636,37 @@ static void print_absolute_relocs(void)
char *sym_strtab;
Elf_Sym *sh_symtab;
int j;
- if (sec->shdr.sh_type != SHT_REL_TYPE) {
+
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
continue;
- }
+
sec_symtab = sec->link;
sec_applies = &secs[sec->shdr.sh_info];
- if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+ if (!(sec_applies->shdr.sh_flags & SHF_ALLOC))
continue;
- }
+
/*
* Do not perform relocations in .notes section; any
* values there are meant for pre-boot consumption (e.g.
* startup_xen).
*/
- if (sec_applies->shdr.sh_type == SHT_NOTE) {
+ if (sec_applies->shdr.sh_type == SHT_NOTE)
continue;
- }
+
sh_symtab = sec_symtab->symtab;
sym_strtab = sec_symtab->link->strtab;
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
Elf_Rel *rel;
Elf_Sym *sym;
const char *name;
+
rel = &sec->reltab[j];
sym = &sh_symtab[ELF_R_SYM(rel->r_info)];
name = sym_name(sym_strtab, sym);
- if (sym->st_shndx != SHN_ABS) {
+
+ if (sym->st_shndx != SHN_ABS)
continue;
- }
/* Absolute symbols are not relocated if bzImage is
* loaded at a non-compiled address. Display a warning
@@ -691,10 +685,8 @@ static void print_absolute_relocs(void)
continue;
if (!printed) {
- printf("WARNING: Absolute relocations"
- " present\n");
- printf("Offset Info Type Sym.Value "
- "Sym.Name\n");
+ printf("WARNING: Absolute relocations present\n");
+ printf("Offset Info Type Sym.Value Sym.Name\n");
printed = 1;
}
@@ -718,8 +710,8 @@ static void add_reloc(struct relocs *r, uint32_t offset)
void *mem = realloc(r->offset, newsize * sizeof(r->offset[0]));
if (!mem)
- die("realloc of %ld entries for relocs failed\n",
- newsize);
+ die("realloc of %ld entries for relocs failed\n", newsize);
+
r->offset = mem;
r->size = newsize;
}
@@ -730,6 +722,7 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
Elf_Sym *sym, const char *symname))
{
int i;
+
/* Walk through the relocations */
for (i = 0; i < shnum; i++) {
char *sym_strtab;
@@ -738,16 +731,25 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
int j;
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_REL_TYPE) {
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
continue;
- }
+
sec_symtab = sec->link;
sec_applies = &secs[sec->shdr.sh_info];
- if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+ if (!(sec_applies->shdr.sh_flags & SHF_ALLOC))
continue;
- }
+
+ /*
+ * Do not perform relocations in .notes sections; any
+ * values there are meant for pre-boot consumption (e.g.
+ * startup_xen).
+ */
+ if (sec_applies->shdr.sh_type == SHT_NOTE)
+ continue;
+
sh_symtab = sec_symtab->symtab;
sym_strtab = sec_symtab->link->strtab;
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
Elf_Rel *rel = &sec->reltab[j];
Elf_Sym *sym = &sh_symtab[ELF_R_SYM(rel->r_info)];
@@ -781,14 +783,16 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
* kernel data and does not require special treatment.
*
*/
-static int per_cpu_shndx = -1;
+static int per_cpu_shndx = -1;
static Elf_Addr per_cpu_load_addr;
static void percpu_init(void)
{
int i;
+
for (i = 0; i < shnum; i++) {
ElfW(Sym) *sym;
+
if (strcmp(sec_name(i), ".data..percpu"))
continue;
@@ -801,6 +805,7 @@ static void percpu_init(void)
per_cpu_shndx = i;
per_cpu_load_addr = sym->st_value;
+
return;
}
}
@@ -871,8 +876,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
* Only used by jump labels
*/
if (is_percpu_sym(sym, symname))
- die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n",
- symname);
+ die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", symname);
break;
case R_X86_64_32:
@@ -892,8 +896,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
if (is_reloc(S_ABS, symname))
break;
- die("Invalid absolute %s relocation: %s\n",
- rel_type(r_type), symname);
+ die("Invalid absolute %s relocation: %s\n", rel_type(r_type), symname);
break;
}
@@ -913,8 +916,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
break;
default:
- die("Unsupported relocation type: %s (%d)\n",
- rel_type(r_type), r_type);
+ die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type);
break;
}
@@ -951,8 +953,7 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
if (is_reloc(S_ABS, symname))
break;
- die("Invalid absolute %s relocation: %s\n",
- rel_type(r_type), symname);
+ die("Invalid absolute %s relocation: %s\n", rel_type(r_type), symname);
break;
}
@@ -960,16 +961,14 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
break;
default:
- die("Unsupported relocation type: %s (%d)\n",
- rel_type(r_type), r_type);
+ die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type);
break;
}
return 0;
}
-static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
- const char *symname)
+static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname)
{
unsigned r_type = ELF32_R_TYPE(rel->r_info);
int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname);
@@ -1004,9 +1003,7 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
if (!is_reloc(S_LIN, symname))
break;
}
- die("Invalid %s %s relocation: %s\n",
- shn_abs ? "absolute" : "relative",
- rel_type(r_type), symname);
+ die("Invalid %s %s relocation: %s\n", shn_abs ? "absolute" : "relative", rel_type(r_type), symname);
break;
case R_386_32:
@@ -1027,14 +1024,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
add_reloc(&relocs32, rel->r_offset);
break;
}
- die("Invalid %s %s relocation: %s\n",
- shn_abs ? "absolute" : "relative",
- rel_type(r_type), symname);
+ die("Invalid %s %s relocation: %s\n", shn_abs ? "absolute" : "relative", rel_type(r_type), symname);
break;
default:
- die("Unsupported relocation type: %s (%d)\n",
- rel_type(r_type), r_type);
+ die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type);
break;
}
@@ -1046,7 +1040,10 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
static int cmp_relocs(const void *va, const void *vb)
{
const uint32_t *a, *b;
- a = va; b = vb;
+
+ a = va;
+ b = vb;
+
return (*a == *b)? 0 : (*a > *b)? 1 : -1;
}
@@ -1060,6 +1057,7 @@ static int write32(uint32_t v, FILE *f)
unsigned char buf[4];
put_unaligned_le32(v, buf);
+
return fwrite(buf, 1, 4, f) == 4 ? 0 : -1;
}
@@ -1072,8 +1070,7 @@ static void emit_relocs(int as_text, int use_real_mode)
{
int i;
int (*write_reloc)(uint32_t, FILE *) = write32;
- int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
- const char *symname);
+ int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname);
#if ELF_BITS == 64
if (!use_real_mode)
@@ -1160,6 +1157,7 @@ static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
rel_type(ELF_R_TYPE(rel->r_info)),
symname,
sec_name(sym_index(sym)));
+
return 0;
}
@@ -1185,19 +1183,24 @@ void process(FILE *fp, int use_real_mode, int as_text,
read_strtabs(fp);
read_symtabs(fp);
read_relocs(fp);
+
if (ELF_BITS == 64)
percpu_init();
+
if (show_absolute_syms) {
print_absolute_symbols();
return;
}
+
if (show_absolute_relocs) {
print_absolute_relocs();
return;
}
+
if (show_reloc_info) {
print_reloc_info();
return;
}
+
emit_relocs(as_text, use_real_mode);
}
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 4d6826a76f7881..49a1c6890b5560 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -27,7 +27,6 @@
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
-#include <linux/acpi.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a01ca255b0c649..5f3a69f6ec3401 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1,8 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-#include <linux/memblock.h>
-#endif
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/kexec.h>
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 04101b984f24da..758bcd47b72d32 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -49,7 +49,7 @@ SYM_CODE_START(startup_xen)
ANNOTATE_NOENDBR
cld
- leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp
+ leaq __top_init_kernel_stack(%rip), %rsp
/* Set up %gs.
*
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 024b78a0cfc11b..0248912ff68751 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -22,7 +22,7 @@
#include <linux/units.h>
#define CREATE_TRACE_POINTS
-#include <trace/events/thermal_pressure.h>
+#include <trace/events/hw_pressure.h>
static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
static struct cpumask scale_freq_counters_mask;
@@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
per_cpu(cpu_scale, cpu) = capacity;
}
-DEFINE_PER_CPU(unsigned long, thermal_pressure);
+DEFINE_PER_CPU(unsigned long, hw_pressure);
/**
- * topology_update_thermal_pressure() - Update thermal pressure for CPUs
+ * topology_update_hw_pressure() - Update HW pressure for CPUs
* @cpus : The related CPUs for which capacity has been reduced
* @capped_freq : The maximum allowed frequency that CPUs can run at
*
- * Update the value of thermal pressure for all @cpus in the mask. The
+ * Update the value of HW pressure for all @cpus in the mask. The
* cpumask should include all (online+offline) affected CPUs, to avoid
* operating on stale data when hot-plug is used for some CPUs. The
* @capped_freq reflects the currently allowed max CPUs frequency due to
- * thermal capping. It might be also a boost frequency value, which is bigger
+ * HW capping. It might be also a boost frequency value, which is bigger
* than the internal 'capacity_freq_ref' max frequency. In such case the
* pressure value should simply be removed, since this is an indication that
- * there is no thermal throttling. The @capped_freq must be provided in kHz.
+ * there is no HW throttling. The @capped_freq must be provided in kHz.
*/
-void topology_update_thermal_pressure(const struct cpumask *cpus,
+void topology_update_hw_pressure(const struct cpumask *cpus,
unsigned long capped_freq)
{
- unsigned long max_capacity, capacity, th_pressure;
+ unsigned long max_capacity, capacity, hw_pressure;
u32 max_freq;
int cpu;
@@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus,
/*
* Handle properly the boost frequencies, which should simply clean
- * the thermal pressure value.
+ * the HW pressure value.
*/
if (max_freq <= capped_freq)
capacity = max_capacity;
else
capacity = mult_frac(max_capacity, capped_freq, max_freq);
- th_pressure = max_capacity - capacity;
+ hw_pressure = max_capacity - capacity;
- trace_thermal_pressure_update(cpu, th_pressure);
+ trace_hw_pressure_update(cpu, hw_pressure);
for_each_cpu(cpu, cpus)
- WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
+ WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure);
}
-EXPORT_SYMBOL_GPL(topology_update_thermal_pressure);
+EXPORT_SYMBOL_GPL(topology_update_hw_pressure);
static ssize_t cpu_capacity_show(struct device *dev,
struct device_attribute *attr,
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index fd9c3ed21f49c4..a45aac17c20f0e 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2585,6 +2585,40 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
}
EXPORT_SYMBOL(cpufreq_get_policy);
+DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
+
+/**
+ * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
+ * @policy: cpufreq policy of the CPUs.
+ *
+ * Update the value of cpufreq pressure for all @cpus in the policy.
+ */
+static void cpufreq_update_pressure(struct cpufreq_policy *policy)
+{
+ unsigned long max_capacity, capped_freq, pressure;
+ u32 max_freq;
+ int cpu;
+
+ cpu = cpumask_first(policy->related_cpus);
+ max_freq = arch_scale_freq_ref(cpu);
+ capped_freq = policy->max;
+
+ /*
+ * Handle properly the boost frequencies, which should simply clean
+ * the cpufreq pressure value.
+ */
+ if (max_freq <= capped_freq) {
+ pressure = 0;
+ } else {
+ max_capacity = arch_scale_cpu_capacity(cpu);
+ pressure = max_capacity -
+ mult_frac(max_capacity, capped_freq, max_freq);
+ }
+
+ for_each_cpu(cpu, policy->related_cpus)
+ WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
+}
+
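
cpufreq_pressure is a per-CPU value that capacity-aware code can subtract from a CPU's maximum capacity. A minimal sketch of a reader, assuming a matching declaration is exported from the cpufreq headers elsewhere in this series (not shown in this excerpt); the helper name is made up:

#include <linux/cpufreq.h>
#include <linux/percpu.h>

/* Assumed to be declared in a header by the rest of this series. */
DECLARE_PER_CPU(unsigned long, cpufreq_pressure);

static unsigned long demo_effective_capacity(int cpu, unsigned long max_cap)
{
	/* Capacity that remains once the current frequency cap is applied. */
	return max_cap - READ_ONCE(per_cpu(cpufreq_pressure, cpu));
}
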
/**
* cpufreq_set_policy - Modify cpufreq policy parameters.
* @policy: Policy object to modify.
@@ -2640,6 +2674,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
trace_cpu_frequency_limits(policy);
+ cpufreq_update_pressure(policy);
+
policy->cached_target_freq = UINT_MAX;
pr_debug("new min and max freqs are %u - %u kHz\n",
diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index 70b0f21968a010..ec8df5496a0cf8 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
throttled_freq = freq_hz / HZ_PER_KHZ;
- /* Update thermal pressure (the boost frequencies are accepted) */
- arch_update_thermal_pressure(policy->related_cpus, throttled_freq);
+ /* Update HW pressure (the boost frequencies are accepted) */
+ arch_update_hw_pressure(policy->related_cpus, throttled_freq);
/*
* In the unlikely case policy is unregistered do not enable
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 9acde71558d510..bb8761c8a42e46 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -439,13 +439,8 @@ static int cpuidle_coupled_clear_pokes(int cpu)
static bool cpuidle_coupled_any_pokes_pending(struct cpuidle_coupled *coupled)
{
- cpumask_t cpus;
- int ret;
-
- cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus);
- ret = cpumask_and(&cpus, &cpuidle_coupled_poke_pending, &cpus);
-
- return ret;
+ return cpumask_first_and_and(cpu_online_mask, &coupled->coupled_cpus,
+ &cpuidle_coupled_poke_pending) < nr_cpu_ids;
}
/**
@@ -626,9 +621,7 @@ out:
static void cpuidle_coupled_update_online_cpus(struct cpuidle_coupled *coupled)
{
- cpumask_t cpus;
- cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus);
- coupled->online_count = cpumask_weight(&cpus);
+ coupled->online_count = cpumask_weight_and(cpu_online_mask, &coupled->coupled_cpus);
}
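
Both conversions in this driver use the newer cpumask helpers that take three masks, avoiding a cpumask_t temporary on the stack (which can be large with a big CONFIG_NR_CPUS). A minimal sketch with illustrative names:

#include <linux/cpumask.h>

static bool demo_any_poke_pending(const struct cpumask *coupled,
				  const struct cpumask *pending)
{
	/* First CPU in (online & coupled & pending), or >= nr_cpu_ids. */
	return cpumask_first_and_and(cpu_online_mask, coupled,
				     pending) < nr_cpu_ids;
}

static unsigned int demo_online_coupled_count(const struct cpumask *coupled)
{
	return cpumask_weight_and(cpu_online_mask, coupled);
}
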
/**
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 72c07a12f5e18e..14464716bacbb7 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -540,6 +540,31 @@ config RISCV_INTC
depends on RISCV
select IRQ_DOMAIN_HIERARCHY
+config RISCV_APLIC
+ bool
+ depends on RISCV
+ select IRQ_DOMAIN_HIERARCHY
+
+config RISCV_APLIC_MSI
+ bool
+ depends on RISCV_APLIC
+ select GENERIC_MSI_IRQ
+ default RISCV_APLIC
+
+config RISCV_IMSIC
+ bool
+ depends on RISCV
+ select IRQ_DOMAIN_HIERARCHY
+ select GENERIC_IRQ_MATRIX_ALLOCATOR
+ select GENERIC_MSI_IRQ
+
+config RISCV_IMSIC_PCI
+ bool
+ depends on RISCV_IMSIC
+ depends on PCI
+ depends on PCI_MSI
+ default RISCV_IMSIC
+
config SIFIVE_PLIC
bool
depends on RISCV
@@ -568,7 +593,7 @@ config IRQ_LOONGARCH_CPU
bool
select GENERIC_IRQ_CHIP
select IRQ_DOMAIN
- select GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
select LOONGSON_HTVEC
select LOONGSON_LIOINTC
select LOONGSON_EIOINTC
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index ec4a183809981d..d9dc3d99aaa863 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -95,6 +95,9 @@ obj-$(CONFIG_QCOM_MPM) += irq-qcom-mpm.o
obj-$(CONFIG_CSKY_MPINTC) += irq-csky-mpintc.o
obj-$(CONFIG_CSKY_APB_INTC) += irq-csky-apb-intc.o
obj-$(CONFIG_RISCV_INTC) += irq-riscv-intc.o
+obj-$(CONFIG_RISCV_APLIC) += irq-riscv-aplic-main.o irq-riscv-aplic-direct.o
+obj-$(CONFIG_RISCV_APLIC_MSI) += irq-riscv-aplic-msi.o
+obj-$(CONFIG_RISCV_IMSIC) += irq-riscv-imsic-state.o irq-riscv-imsic-early.o irq-riscv-imsic-platform.o
obj-$(CONFIG_SIFIVE_PLIC) += irq-sifive-plic.o
obj-$(CONFIG_STARFIVE_JH8100_INTC) += irq-starfive-jh8100-intc.o
obj-$(CONFIG_IMX_IRQSTEER) += irq-imx-irqsteer.o
diff --git a/drivers/irqchip/irq-alpine-msi.c b/drivers/irqchip/irq-alpine-msi.c
index 9c8b1349ee17b8..a1430ab60a8a3f 100644
--- a/drivers/irqchip/irq-alpine-msi.c
+++ b/drivers/irqchip/irq-alpine-msi.c
@@ -165,7 +165,7 @@ static int alpine_msix_middle_domain_alloc(struct irq_domain *domain,
return 0;
err_sgi:
- irq_domain_free_irqs_parent(domain, virq, i - 1);
+ irq_domain_free_irqs_parent(domain, virq, i);
alpine_msix_free_sgi(priv, sgi, nr_irqs);
return err;
}
diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c
index eb02d203c9634f..90daa274ef2395 100644
--- a/drivers/irqchip/irq-bcm6345-l1.c
+++ b/drivers/irqchip/irq-bcm6345-l1.c
@@ -192,14 +192,10 @@ static int bcm6345_l1_set_affinity(struct irq_data *d,
u32 mask = BIT(d->hwirq % IRQS_PER_WORD);
unsigned int old_cpu = cpu_for_irq(intc, d);
unsigned int new_cpu;
- struct cpumask valid;
unsigned long flags;
bool enabled;
- if (!cpumask_and(&valid, &intc->cpumask, dest))
- return -EINVAL;
-
- new_cpu = cpumask_any_and(&valid, cpu_online_mask);
+ new_cpu = cpumask_first_and_and(&intc->cpumask, dest, cpu_online_mask);
if (new_cpu >= nr_cpu_ids)
return -EINVAL;
diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c
index 2b0b3175cea068..c988886917f739 100644
--- a/drivers/irqchip/irq-brcmstb-l2.c
+++ b/drivers/irqchip/irq-brcmstb-l2.c
@@ -118,7 +118,7 @@ out:
chained_irq_exit(chip, desc);
}
-static void brcmstb_l2_intc_suspend(struct irq_data *d)
+static void __brcmstb_l2_intc_suspend(struct irq_data *d, bool save)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
@@ -127,7 +127,8 @@ static void brcmstb_l2_intc_suspend(struct irq_data *d)
irq_gc_lock_irqsave(gc, flags);
/* Save the current mask */
- b->saved_mask = irq_reg_readl(gc, ct->regs.mask);
+ if (save)
+ b->saved_mask = irq_reg_readl(gc, ct->regs.mask);
if (b->can_wake) {
/* Program the wakeup mask */
@@ -137,6 +138,16 @@ static void brcmstb_l2_intc_suspend(struct irq_data *d)
irq_gc_unlock_irqrestore(gc, flags);
}
+static void brcmstb_l2_intc_shutdown(struct irq_data *d)
+{
+ __brcmstb_l2_intc_suspend(d, false);
+}
+
+static void brcmstb_l2_intc_suspend(struct irq_data *d)
+{
+ __brcmstb_l2_intc_suspend(d, true);
+}
+
static void brcmstb_l2_intc_resume(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
@@ -252,7 +263,7 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np,
ct->chip.irq_suspend = brcmstb_l2_intc_suspend;
ct->chip.irq_resume = brcmstb_l2_intc_resume;
- ct->chip.irq_pm_shutdown = brcmstb_l2_intc_suspend;
+ ct->chip.irq_pm_shutdown = brcmstb_l2_intc_shutdown;
if (data->can_wake) {
/* This IRQ chip can wake the system, set all child interrupts
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 5f7d3db3afd824..40ebf1726393ce 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3832,9 +3832,9 @@ static int its_vpe_set_affinity(struct irq_data *d,
bool force)
{
struct its_vpe *vpe = irq_data_get_irq_chip_data(d);
- struct cpumask common, *table_mask;
+ unsigned int from, cpu = nr_cpu_ids;
+ struct cpumask *table_mask;
unsigned long flags;
- int from, cpu;
/*
* Changing affinity is mega expensive, so let's be as lazy as
@@ -3856,10 +3856,15 @@ static int its_vpe_set_affinity(struct irq_data *d,
* If we are offered another CPU in the same GICv4.1 ITS
* affinity, pick this one. Otherwise, any CPU will do.
*/
- if (table_mask && cpumask_and(&common, mask_val, table_mask))
- cpu = cpumask_test_cpu(from, &common) ? from : cpumask_first(&common);
- else
+ if (table_mask)
+ cpu = cpumask_any_and(mask_val, table_mask);
+ if (cpu < nr_cpu_ids) {
+ if (cpumask_test_cpu(from, mask_val) &&
+ cpumask_test_cpu(from, table_mask))
+ cpu = from;
+ } else {
cpu = cpumask_first(mask_val);
+ }
if (from == cpu)
goto out;
@@ -4527,8 +4532,6 @@ static int its_vpe_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
struct page *vprop_page;
int base, nr_ids, i, err = 0;
- BUG_ON(!vm);
-
bitmap = its_lpi_alloc(roundup_pow_of_two(nr_irqs), &base, &nr_ids);
if (!bitmap)
return -ENOMEM;
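The its_vpe_set_affinity() rework above drops the temporary cpumask and picks the target CPU directly. A sketch of that selection order, written as a hypothetical standalone helper (names are illustrative):

#include <linux/cpumask.h>

/*
 * Prefer the current CPU if it is still allowed by both masks,
 * otherwise any CPU in the intersection, otherwise any CPU in the
 * requested mask.
 */
static unsigned int pick_target_cpu(unsigned int from,
				    const struct cpumask *requested,
				    const struct cpumask *table_mask)
{
	unsigned int cpu = nr_cpu_ids;

	if (table_mask)
		cpu = cpumask_any_and(requested, table_mask);

	if (cpu < nr_cpu_ids) {
		if (cpumask_test_cpu(from, requested) &&
		    cpumask_test_cpu(from, table_mask))
			cpu = from;
	} else {
		cpu = cpumask_first(requested);
	}

	return cpu;
}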
diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c
index b64cbe3052e841..c7ddebf312ad2c 100644
--- a/drivers/irqchip/irq-loongson-eiointc.c
+++ b/drivers/irqchip/irq-loongson-eiointc.c
@@ -59,6 +59,7 @@ static int cpu_to_eio_node(int cpu)
return cpu_logical_map(cpu) / CORES_PER_EIO_NODE;
}
+#ifdef CONFIG_SMP
static void eiointc_set_irq_route(int pos, unsigned int cpu, unsigned int mnode, nodemask_t *node_map)
{
int i, node, cpu_node, route_node;
@@ -92,19 +93,15 @@ static int eiointc_set_irq_affinity(struct irq_data *d, const struct cpumask *af
unsigned int cpu;
unsigned long flags;
uint32_t vector, regaddr;
- struct cpumask intersect_affinity;
struct eiointc_priv *priv = d->domain->host_data;
raw_spin_lock_irqsave(&affinity_lock, flags);
- cpumask_and(&intersect_affinity, affinity, cpu_online_mask);
- cpumask_and(&intersect_affinity, &intersect_affinity, &priv->cpuspan_map);
-
- if (cpumask_empty(&intersect_affinity)) {
+ cpu = cpumask_first_and_and(&priv->cpuspan_map, affinity, cpu_online_mask);
+ if (cpu >= nr_cpu_ids) {
raw_spin_unlock_irqrestore(&affinity_lock, flags);
return -EINVAL;
}
- cpu = cpumask_first(&intersect_affinity);
vector = d->hwirq;
regaddr = EIOINTC_REG_ENABLE + ((vector >> 5) << 2);
@@ -126,6 +123,7 @@ static int eiointc_set_irq_affinity(struct irq_data *d, const struct cpumask *af
return IRQ_SET_MASK_OK;
}
+#endif
static int eiointc_index(int node)
{
@@ -238,7 +236,9 @@ static struct irq_chip eiointc_irq_chip = {
.irq_ack = eiointc_ack_irq,
.irq_mask = eiointc_mask_irq,
.irq_unmask = eiointc_unmask_irq,
+#ifdef CONFIG_SMP
.irq_set_affinity = eiointc_set_irq_affinity,
+#endif
};
static int eiointc_domain_alloc(struct irq_domain *domain, unsigned int virq,
diff --git a/drivers/irqchip/irq-loongson-pch-msi.c b/drivers/irqchip/irq-loongson-pch-msi.c
index 6e1e1f011bb292..dd4d699170f4ec 100644
--- a/drivers/irqchip/irq-loongson-pch-msi.c
+++ b/drivers/irqchip/irq-loongson-pch-msi.c
@@ -136,7 +136,7 @@ static int pch_msi_middle_domain_alloc(struct irq_domain *domain,
err_hwirq:
pch_msi_free_hwirq(priv, hwirq, nr_irqs);
- irq_domain_free_irqs_parent(domain, virq, i - 1);
+ irq_domain_free_irqs_parent(domain, virq, i);
return err;
}
diff --git a/drivers/irqchip/irq-loongson-pch-pic.c b/drivers/irqchip/irq-loongson-pch-pic.c
index 63db8e2172e017..cbaef65e804cec 100644
--- a/drivers/irqchip/irq-loongson-pch-pic.c
+++ b/drivers/irqchip/irq-loongson-pch-pic.c
@@ -33,6 +33,7 @@
#define PIC_COUNT (PIC_COUNT_PER_REG * PIC_REG_COUNT)
#define PIC_REG_IDX(irq_id) ((irq_id) / PIC_COUNT_PER_REG)
#define PIC_REG_BIT(irq_id) ((irq_id) % PIC_COUNT_PER_REG)
+#define PIC_UNDEF_VECTOR 255
static int nr_pics;
@@ -46,12 +47,19 @@ struct pch_pic {
u32 saved_vec_en[PIC_REG_COUNT];
u32 saved_vec_pol[PIC_REG_COUNT];
u32 saved_vec_edge[PIC_REG_COUNT];
+ u8 table[PIC_COUNT];
+ int inuse;
};
static struct pch_pic *pch_pic_priv[MAX_IO_PICS];
struct fwnode_handle *pch_pic_handle[MAX_IO_PICS];
+static inline u8 hwirq_to_bit(struct pch_pic *priv, int hirq)
+{
+ return priv->table[hirq];
+}
+
static void pch_pic_bitset(struct pch_pic *priv, int offset, int bit)
{
u32 reg;
@@ -80,45 +88,47 @@ static void pch_pic_mask_irq(struct irq_data *d)
{
struct pch_pic *priv = irq_data_get_irq_chip_data(d);
- pch_pic_bitset(priv, PCH_PIC_MASK, d->hwirq);
+ pch_pic_bitset(priv, PCH_PIC_MASK, hwirq_to_bit(priv, d->hwirq));
irq_chip_mask_parent(d);
}
static void pch_pic_unmask_irq(struct irq_data *d)
{
struct pch_pic *priv = irq_data_get_irq_chip_data(d);
+ int bit = hwirq_to_bit(priv, d->hwirq);
- writel(BIT(PIC_REG_BIT(d->hwirq)),
- priv->base + PCH_PIC_CLR + PIC_REG_IDX(d->hwirq) * 4);
+ writel(BIT(PIC_REG_BIT(bit)),
+ priv->base + PCH_PIC_CLR + PIC_REG_IDX(bit) * 4);
irq_chip_unmask_parent(d);
- pch_pic_bitclr(priv, PCH_PIC_MASK, d->hwirq);
+ pch_pic_bitclr(priv, PCH_PIC_MASK, bit);
}
static int pch_pic_set_type(struct irq_data *d, unsigned int type)
{
struct pch_pic *priv = irq_data_get_irq_chip_data(d);
+ int bit = hwirq_to_bit(priv, d->hwirq);
int ret = 0;
switch (type) {
case IRQ_TYPE_EDGE_RISING:
- pch_pic_bitset(priv, PCH_PIC_EDGE, d->hwirq);
- pch_pic_bitclr(priv, PCH_PIC_POL, d->hwirq);
+ pch_pic_bitset(priv, PCH_PIC_EDGE, bit);
+ pch_pic_bitclr(priv, PCH_PIC_POL, bit);
irq_set_handler_locked(d, handle_edge_irq);
break;
case IRQ_TYPE_EDGE_FALLING:
- pch_pic_bitset(priv, PCH_PIC_EDGE, d->hwirq);
- pch_pic_bitset(priv, PCH_PIC_POL, d->hwirq);
+ pch_pic_bitset(priv, PCH_PIC_EDGE, bit);
+ pch_pic_bitset(priv, PCH_PIC_POL, bit);
irq_set_handler_locked(d, handle_edge_irq);
break;
case IRQ_TYPE_LEVEL_HIGH:
- pch_pic_bitclr(priv, PCH_PIC_EDGE, d->hwirq);
- pch_pic_bitclr(priv, PCH_PIC_POL, d->hwirq);
+ pch_pic_bitclr(priv, PCH_PIC_EDGE, bit);
+ pch_pic_bitclr(priv, PCH_PIC_POL, bit);
irq_set_handler_locked(d, handle_level_irq);
break;
case IRQ_TYPE_LEVEL_LOW:
- pch_pic_bitclr(priv, PCH_PIC_EDGE, d->hwirq);
- pch_pic_bitset(priv, PCH_PIC_POL, d->hwirq);
+ pch_pic_bitclr(priv, PCH_PIC_EDGE, bit);
+ pch_pic_bitset(priv, PCH_PIC_POL, bit);
irq_set_handler_locked(d, handle_level_irq);
break;
default:
@@ -133,11 +143,12 @@ static void pch_pic_ack_irq(struct irq_data *d)
{
unsigned int reg;
struct pch_pic *priv = irq_data_get_irq_chip_data(d);
+ int bit = hwirq_to_bit(priv, d->hwirq);
- reg = readl(priv->base + PCH_PIC_EDGE + PIC_REG_IDX(d->hwirq) * 4);
- if (reg & BIT(PIC_REG_BIT(d->hwirq))) {
- writel(BIT(PIC_REG_BIT(d->hwirq)),
- priv->base + PCH_PIC_CLR + PIC_REG_IDX(d->hwirq) * 4);
+ reg = readl(priv->base + PCH_PIC_EDGE + PIC_REG_IDX(bit) * 4);
+ if (reg & BIT(PIC_REG_BIT(bit))) {
+ writel(BIT(PIC_REG_BIT(bit)),
+ priv->base + PCH_PIC_CLR + PIC_REG_IDX(bit) * 4);
}
irq_chip_ack_parent(d);
}
@@ -159,6 +170,8 @@ static int pch_pic_domain_translate(struct irq_domain *d,
{
struct pch_pic *priv = d->host_data;
struct device_node *of_node = to_of_node(fwspec->fwnode);
+ unsigned long flags;
+ int i;
if (of_node) {
if (fwspec->param_count < 2)
@@ -171,12 +184,33 @@ static int pch_pic_domain_translate(struct irq_domain *d,
return -EINVAL;
*hwirq = fwspec->param[0] - priv->gsi_base;
+
if (fwspec->param_count > 1)
*type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
else
*type = IRQ_TYPE_NONE;
}
+ raw_spin_lock_irqsave(&priv->pic_lock, flags);
+ /* Check the pic-table to confirm whether the hwirq has already been assigned */
+ for (i = 0; i < priv->inuse; i++) {
+ if (priv->table[i] == *hwirq) {
+ *hwirq = i;
+ break;
+ }
+ }
+ if (i == priv->inuse) {
+ /* Assign a new hwirq in pic-table */
+ if (priv->inuse >= PIC_COUNT) {
+ pr_err("pch-pic domain has no free vectors\n");
+ raw_spin_unlock_irqrestore(&priv->pic_lock, flags);
+ return -EINVAL;
+ }
+ priv->table[priv->inuse] = *hwirq;
+ *hwirq = priv->inuse++;
+ }
+ raw_spin_unlock_irqrestore(&priv->pic_lock, flags);
+
return 0;
}
@@ -194,6 +228,9 @@ static int pch_pic_alloc(struct irq_domain *domain, unsigned int virq,
if (err)
return err;
+ /* Write vector ID */
+ writeb(priv->ht_vec_base + hwirq, priv->base + PCH_INT_HTVEC(hwirq_to_bit(priv, hwirq)));
+
parent_fwspec.fwnode = domain->parent->fwnode;
parent_fwspec.param_count = 1;
parent_fwspec.param[0] = hwirq + priv->ht_vec_base;
@@ -222,7 +259,7 @@ static void pch_pic_reset(struct pch_pic *priv)
for (i = 0; i < PIC_COUNT; i++) {
/* Write vector ID */
- writeb(priv->ht_vec_base + i, priv->base + PCH_INT_HTVEC(i));
+ writeb(priv->ht_vec_base + i, priv->base + PCH_INT_HTVEC(hwirq_to_bit(priv, i)));
/* Hardcode route to HT0 Lo */
writeb(1, priv->base + PCH_INT_ROUTE(i));
}
@@ -284,6 +321,7 @@ static int pch_pic_init(phys_addr_t addr, unsigned long size, int vec_base,
u32 gsi_base)
{
struct pch_pic *priv;
+ int i;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (!priv)
@@ -294,6 +332,10 @@ static int pch_pic_init(phys_addr_t addr, unsigned long size, int vec_base,
if (!priv->base)
goto free_priv;
+ priv->inuse = 0;
+ for (i = 0; i < PIC_COUNT; i++)
+ priv->table[i] = PIC_UNDEF_VECTOR;
+
priv->ht_vec_base = vec_base;
priv->vec_count = ((readq(priv->base) >> 48) & 0xff) + 1;
priv->gsi_base = gsi_base;
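The pch-pic changes above introduce a small translation table so that GSIs are assigned hardware vector slots on first use and reuse the same slot afterwards. A compact sketch of the lookup-or-assign step (hypothetical helper, without the driver's locking and register writes):

#include <linux/errno.h>
#include <linux/types.h>

static int table_map(u8 *table, int *inuse, int count, unsigned long gsi)
{
	int i;

	/* Already assigned? Reuse the same slot. */
	for (i = 0; i < *inuse; i++)
		if (table[i] == gsi)
			return i;

	if (*inuse >= count)
		return -ENOSPC;		/* no free vectors left */

	table[*inuse] = gsi;		/* gsi is assumed to fit in a u8 */
	return (*inuse)++;
}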
diff --git a/drivers/irqchip/irq-mxs.c b/drivers/irqchip/irq-mxs.c
index be96806455458b..d67b5da389820d 100644
--- a/drivers/irqchip/irq-mxs.c
+++ b/drivers/irqchip/irq-mxs.c
@@ -130,7 +130,7 @@ static struct irq_chip asm9260_icoll_chip = {
IRQCHIP_SKIP_SET_WAKE,
};
-asmlinkage void __exception_irq_entry icoll_handle_irq(struct pt_regs *regs)
+static void __exception_irq_entry icoll_handle_irq(struct pt_regs *regs)
{
u32 irqnr;
diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c
index ae67fec2ab468f..f6484bf15e0b8f 100644
--- a/drivers/irqchip/irq-renesas-rzg2l.c
+++ b/drivers/irqchip/irq-renesas-rzg2l.c
@@ -138,7 +138,7 @@ static void rzg2l_irqc_eoi(struct irq_data *d)
irq_chip_eoi_parent(d);
}
-static void rzg2l_irqc_irq_disable(struct irq_data *d)
+static void rzg2l_tint_irq_endisable(struct irq_data *d, bool enable)
{
unsigned int hw_irq = irqd_to_hwirq(d);
@@ -151,30 +151,24 @@ static void rzg2l_irqc_irq_disable(struct irq_data *d)
raw_spin_lock(&priv->lock);
reg = readl_relaxed(priv->base + TSSR(tssr_index));
- reg &= ~(TIEN << TSSEL_SHIFT(tssr_offset));
+ if (enable)
+ reg |= TIEN << TSSEL_SHIFT(tssr_offset);
+ else
+ reg &= ~(TIEN << TSSEL_SHIFT(tssr_offset));
writel_relaxed(reg, priv->base + TSSR(tssr_index));
raw_spin_unlock(&priv->lock);
}
+}
+
+static void rzg2l_irqc_irq_disable(struct irq_data *d)
+{
+ rzg2l_tint_irq_endisable(d, false);
irq_chip_disable_parent(d);
}
static void rzg2l_irqc_irq_enable(struct irq_data *d)
{
- unsigned int hw_irq = irqd_to_hwirq(d);
-
- if (hw_irq >= IRQC_TINT_START && hw_irq < IRQC_NUM_IRQ) {
- struct rzg2l_irqc_priv *priv = irq_data_to_priv(d);
- u32 offset = hw_irq - IRQC_TINT_START;
- u32 tssr_offset = TSSR_OFFSET(offset);
- u8 tssr_index = TSSR_INDEX(offset);
- u32 reg;
-
- raw_spin_lock(&priv->lock);
- reg = readl_relaxed(priv->base + TSSR(tssr_index));
- reg |= TIEN << TSSEL_SHIFT(tssr_offset);
- writel_relaxed(reg, priv->base + TSSR(tssr_index));
- raw_spin_unlock(&priv->lock);
- }
+ rzg2l_tint_irq_endisable(d, true);
irq_chip_enable_parent(d);
}
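The rzg2l hunk above (and the brcmstb one earlier) folds the enable and disable paths into a single helper taking a bool, so the register sequence lives in one place. The shape of that pattern, with an illustrative register layout:

#include <linux/io.h>

static void ex_endisable(void __iomem *reg, u32 bit, bool enable)
{
	u32 val = readl(reg);

	if (enable)
		val |= bit;
	else
		val &= ~bit;
	writel(val, reg);
}

static void ex_enable(void __iomem *reg, u32 bit)  { ex_endisable(reg, bit, true);  }
static void ex_disable(void __iomem *reg, u32 bit) { ex_endisable(reg, bit, false); }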
diff --git a/drivers/irqchip/irq-riscv-aplic-direct.c b/drivers/irqchip/irq-riscv-aplic-direct.c
new file mode 100644
index 00000000000000..4a3ffe856d6c30
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-aplic-direct.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqchip/riscv-aplic.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/printk.h>
+#include <linux/smp.h>
+
+#include "irq-riscv-aplic-main.h"
+
+#define APLIC_DISABLE_IDELIVERY 0
+#define APLIC_ENABLE_IDELIVERY 1
+#define APLIC_DISABLE_ITHRESHOLD 1
+#define APLIC_ENABLE_ITHRESHOLD 0
+
+struct aplic_direct {
+ struct aplic_priv priv;
+ struct irq_domain *irqdomain;
+ struct cpumask lmask;
+};
+
+struct aplic_idc {
+ unsigned int hart_index;
+ void __iomem *regs;
+ struct aplic_direct *direct;
+};
+
+static unsigned int aplic_direct_parent_irq;
+static DEFINE_PER_CPU(struct aplic_idc, aplic_idcs);
+
+static void aplic_direct_irq_eoi(struct irq_data *d)
+{
+ /*
+ * The fasteoi_handler requires an irq_eoi() callback, hence
+ * we provide this dummy handler.
+ */
+}
+
+#ifdef CONFIG_SMP
+static int aplic_direct_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
+ bool force)
+{
+ struct aplic_priv *priv = irq_data_get_irq_chip_data(d);
+ struct aplic_direct *direct = container_of(priv, struct aplic_direct, priv);
+ struct aplic_idc *idc;
+ unsigned int cpu, val;
+ void __iomem *target;
+
+ if (force)
+ cpu = cpumask_first_and(&direct->lmask, mask_val);
+ else
+ cpu = cpumask_first_and_and(&direct->lmask, mask_val, cpu_online_mask);
+
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
+
+ idc = per_cpu_ptr(&aplic_idcs, cpu);
+ target = priv->regs + APLIC_TARGET_BASE + (d->hwirq - 1) * sizeof(u32);
+ val = FIELD_PREP(APLIC_TARGET_HART_IDX, idc->hart_index);
+ val |= FIELD_PREP(APLIC_TARGET_IPRIO, APLIC_DEFAULT_PRIORITY);
+ writel(val, target);
+
+ irq_data_update_effective_affinity(d, cpumask_of(cpu));
+
+ return IRQ_SET_MASK_OK_DONE;
+}
+#endif
+
+static struct irq_chip aplic_direct_chip = {
+ .name = "APLIC-DIRECT",
+ .irq_mask = aplic_irq_mask,
+ .irq_unmask = aplic_irq_unmask,
+ .irq_set_type = aplic_irq_set_type,
+ .irq_eoi = aplic_direct_irq_eoi,
+#ifdef CONFIG_SMP
+ .irq_set_affinity = aplic_direct_set_affinity,
+#endif
+ .flags = IRQCHIP_SET_TYPE_MASKED |
+ IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_MASK_ON_SUSPEND,
+};
+
+static int aplic_direct_irqdomain_translate(struct irq_domain *d, struct irq_fwspec *fwspec,
+ unsigned long *hwirq, unsigned int *type)
+{
+ struct aplic_priv *priv = d->host_data;
+
+ return aplic_irqdomain_translate(fwspec, priv->gsi_base, hwirq, type);
+}
+
+static int aplic_direct_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct aplic_priv *priv = domain->host_data;
+ struct aplic_direct *direct = container_of(priv, struct aplic_direct, priv);
+ struct irq_fwspec *fwspec = arg;
+ irq_hw_number_t hwirq;
+ unsigned int type;
+ int i, ret;
+
+ ret = aplic_irqdomain_translate(fwspec, priv->gsi_base, &hwirq, &type);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_domain_set_info(domain, virq + i, hwirq + i, &aplic_direct_chip,
+ priv, handle_fasteoi_irq, NULL, NULL);
+ irq_set_affinity(virq + i, &direct->lmask);
+ }
+
+ return 0;
+}
+
+static const struct irq_domain_ops aplic_direct_irqdomain_ops = {
+ .translate = aplic_direct_irqdomain_translate,
+ .alloc = aplic_direct_irqdomain_alloc,
+ .free = irq_domain_free_irqs_top,
+};
+
+/*
+ * To handle APLIC direct interrupts, we just read the CLAIMI register,
+ * which returns the highest-priority pending interrupt and clears its
+ * pending bit. This process is repeated until the CLAIMI register
+ * returns zero.
+ */
+static void aplic_direct_handle_irq(struct irq_desc *desc)
+{
+ struct aplic_idc *idc = this_cpu_ptr(&aplic_idcs);
+ struct irq_domain *irqdomain = idc->direct->irqdomain;
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ irq_hw_number_t hw_irq;
+ int irq;
+
+ chained_irq_enter(chip, desc);
+
+ while ((hw_irq = readl(idc->regs + APLIC_IDC_CLAIMI))) {
+ hw_irq = hw_irq >> APLIC_IDC_TOPI_ID_SHIFT;
+ irq = irq_find_mapping(irqdomain, hw_irq);
+
+ if (unlikely(irq <= 0)) {
+ dev_warn_ratelimited(idc->direct->priv.dev,
+ "hw_irq %lu mapping not found\n", hw_irq);
+ } else {
+ generic_handle_irq(irq);
+ }
+ }
+
+ chained_irq_exit(chip, desc);
+}
+
+static void aplic_idc_set_delivery(struct aplic_idc *idc, bool en)
+{
+ u32 de = (en) ? APLIC_ENABLE_IDELIVERY : APLIC_DISABLE_IDELIVERY;
+ u32 th = (en) ? APLIC_ENABLE_ITHRESHOLD : APLIC_DISABLE_ITHRESHOLD;
+
+ /* Priority must be less than threshold for interrupt triggering */
+ writel(th, idc->regs + APLIC_IDC_ITHRESHOLD);
+
+ /* Delivery must be set to 1 for interrupt triggering */
+ writel(de, idc->regs + APLIC_IDC_IDELIVERY);
+}
+
+static int aplic_direct_dying_cpu(unsigned int cpu)
+{
+ if (aplic_direct_parent_irq)
+ disable_percpu_irq(aplic_direct_parent_irq);
+
+ return 0;
+}
+
+static int aplic_direct_starting_cpu(unsigned int cpu)
+{
+ if (aplic_direct_parent_irq) {
+ enable_percpu_irq(aplic_direct_parent_irq,
+ irq_get_trigger_type(aplic_direct_parent_irq));
+ }
+
+ return 0;
+}
+
+static int aplic_direct_parse_parent_hwirq(struct device *dev, u32 index,
+ u32 *parent_hwirq, unsigned long *parent_hartid)
+{
+ struct of_phandle_args parent;
+ int rc;
+
+ /*
+ * Currently, only an OF fwnode is supported, so this function
+ * will need to be extended for ACPI support.
+ */
+ if (!is_of_node(dev->fwnode))
+ return -EINVAL;
+
+ rc = of_irq_parse_one(to_of_node(dev->fwnode), index, &parent);
+ if (rc)
+ return rc;
+
+ rc = riscv_of_parent_hartid(parent.np, parent_hartid);
+ if (rc)
+ return rc;
+
+ *parent_hwirq = parent.args[0];
+ return 0;
+}
+
+int aplic_direct_setup(struct device *dev, void __iomem *regs)
+{
+ int i, j, rc, cpu, current_cpu, setup_count = 0;
+ struct aplic_direct *direct;
+ struct irq_domain *domain;
+ struct aplic_priv *priv;
+ struct aplic_idc *idc;
+ unsigned long hartid;
+ u32 v, hwirq;
+
+ direct = devm_kzalloc(dev, sizeof(*direct), GFP_KERNEL);
+ if (!direct)
+ return -ENOMEM;
+ priv = &direct->priv;
+
+ rc = aplic_setup_priv(priv, dev, regs);
+ if (rc) {
+ dev_err(dev, "failed to create APLIC context\n");
+ return rc;
+ }
+
+ /* Setup per-CPU IDC and target CPU mask */
+ current_cpu = get_cpu();
+ for (i = 0; i < priv->nr_idcs; i++) {
+ rc = aplic_direct_parse_parent_hwirq(dev, i, &hwirq, &hartid);
+ if (rc) {
+ dev_warn(dev, "parent irq for IDC%d not found\n", i);
+ continue;
+ }
+
+ /*
+ * Skip interrupts other than external interrupts for
+ * the current privilege level.
+ */
+ if (hwirq != RV_IRQ_EXT)
+ continue;
+
+ cpu = riscv_hartid_to_cpuid(hartid);
+ if (cpu < 0) {
+ dev_warn(dev, "invalid cpuid for IDC%d\n", i);
+ continue;
+ }
+
+ cpumask_set_cpu(cpu, &direct->lmask);
+
+ idc = per_cpu_ptr(&aplic_idcs, cpu);
+ idc->hart_index = i;
+ idc->regs = priv->regs + APLIC_IDC_BASE + i * APLIC_IDC_SIZE;
+ idc->direct = direct;
+
+ aplic_idc_set_delivery(idc, true);
+
+ /*
+ * The boot CPU might not have APLIC hart_index 0, so check
+ * and update the target registers of all interrupts.
+ */
+ if (cpu == current_cpu && idc->hart_index) {
+ v = FIELD_PREP(APLIC_TARGET_HART_IDX, idc->hart_index);
+ v |= FIELD_PREP(APLIC_TARGET_IPRIO, APLIC_DEFAULT_PRIORITY);
+ for (j = 1; j <= priv->nr_irqs; j++)
+ writel(v, priv->regs + APLIC_TARGET_BASE + (j - 1) * sizeof(u32));
+ }
+
+ setup_count++;
+ }
+ put_cpu();
+
+ /* Find parent domain and register chained handler */
+ domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(),
+ DOMAIN_BUS_ANY);
+ if (!aplic_direct_parent_irq && domain) {
+ aplic_direct_parent_irq = irq_create_mapping(domain, RV_IRQ_EXT);
+ if (aplic_direct_parent_irq) {
+ irq_set_chained_handler(aplic_direct_parent_irq,
+ aplic_direct_handle_irq);
+
+ /*
+ * Setup CPUHP notifier to enable parent
+ * interrupt on all CPUs
+ */
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "irqchip/riscv/aplic:starting",
+ aplic_direct_starting_cpu,
+ aplic_direct_dying_cpu);
+ }
+ }
+
+ /* Fail if we were not able to setup IDC for any CPU */
+ if (!setup_count)
+ return -ENODEV;
+
+ /* Setup global config and interrupt delivery */
+ aplic_init_hw_global(priv, false);
+
+ /* Create irq domain instance for the APLIC */
+ direct->irqdomain = irq_domain_create_linear(dev->fwnode, priv->nr_irqs + 1,
+ &aplic_direct_irqdomain_ops, priv);
+ if (!direct->irqdomain) {
+ dev_err(dev, "failed to create direct irq domain\n");
+ return -ENOMEM;
+ }
+
+ /* Advertise the interrupt controller */
+ dev_info(dev, "%d interrupts directly connected to %d CPUs\n",
+ priv->nr_irqs, priv->nr_idcs);
+
+ return 0;
+}
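A conceptual sketch of the CLAIMI-based claim loop described in the new driver above; the register accessor and the 16-bit source-ID shift below are assumptions for illustration, not the driver's definitions:

#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

static void claim_loop(void __iomem *claimi, struct irq_domain *dom)
{
	u32 topi;

	/* Each read returns the highest-priority pending source and clears it. */
	while ((topi = readl(claimi))) {
		irq_hw_number_t hwirq = topi >> 16;	/* assumed ID field position */

		generic_handle_domain_irq(dom, hwirq);
	}
}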
diff --git a/drivers/irqchip/irq-riscv-aplic-main.c b/drivers/irqchip/irq-riscv-aplic-main.c
new file mode 100644
index 00000000000000..774a0c97fdab64
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-aplic-main.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/irqchip/riscv-aplic.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+
+#include "irq-riscv-aplic-main.h"
+
+void aplic_irq_unmask(struct irq_data *d)
+{
+ struct aplic_priv *priv = irq_data_get_irq_chip_data(d);
+
+ writel(d->hwirq, priv->regs + APLIC_SETIENUM);
+}
+
+void aplic_irq_mask(struct irq_data *d)
+{
+ struct aplic_priv *priv = irq_data_get_irq_chip_data(d);
+
+ writel(d->hwirq, priv->regs + APLIC_CLRIENUM);
+}
+
+int aplic_irq_set_type(struct irq_data *d, unsigned int type)
+{
+ struct aplic_priv *priv = irq_data_get_irq_chip_data(d);
+ void __iomem *sourcecfg;
+ u32 val = 0;
+
+ switch (type) {
+ case IRQ_TYPE_NONE:
+ val = APLIC_SOURCECFG_SM_INACTIVE;
+ break;
+ case IRQ_TYPE_LEVEL_LOW:
+ val = APLIC_SOURCECFG_SM_LEVEL_LOW;
+ break;
+ case IRQ_TYPE_LEVEL_HIGH:
+ val = APLIC_SOURCECFG_SM_LEVEL_HIGH;
+ break;
+ case IRQ_TYPE_EDGE_FALLING:
+ val = APLIC_SOURCECFG_SM_EDGE_FALL;
+ break;
+ case IRQ_TYPE_EDGE_RISING:
+ val = APLIC_SOURCECFG_SM_EDGE_RISE;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ sourcecfg = priv->regs + APLIC_SOURCECFG_BASE;
+ sourcecfg += (d->hwirq - 1) * sizeof(u32);
+ writel(val, sourcecfg);
+
+ return 0;
+}
+
+int aplic_irqdomain_translate(struct irq_fwspec *fwspec, u32 gsi_base,
+ unsigned long *hwirq, unsigned int *type)
+{
+ if (WARN_ON(fwspec->param_count < 2))
+ return -EINVAL;
+ if (WARN_ON(!fwspec->param[0]))
+ return -EINVAL;
+
+ /* For DT, gsi_base is always zero. */
+ *hwirq = fwspec->param[0] - gsi_base;
+ *type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+
+ WARN_ON(*type == IRQ_TYPE_NONE);
+
+ return 0;
+}
+
+void aplic_init_hw_global(struct aplic_priv *priv, bool msi_mode)
+{
+ u32 val;
+#ifdef CONFIG_RISCV_M_MODE
+ u32 valh;
+
+ if (msi_mode) {
+ val = lower_32_bits(priv->msicfg.base_ppn);
+ valh = FIELD_PREP(APLIC_xMSICFGADDRH_BAPPN, upper_32_bits(priv->msicfg.base_ppn));
+ valh |= FIELD_PREP(APLIC_xMSICFGADDRH_LHXW, priv->msicfg.lhxw);
+ valh |= FIELD_PREP(APLIC_xMSICFGADDRH_HHXW, priv->msicfg.hhxw);
+ valh |= FIELD_PREP(APLIC_xMSICFGADDRH_LHXS, priv->msicfg.lhxs);
+ valh |= FIELD_PREP(APLIC_xMSICFGADDRH_HHXS, priv->msicfg.hhxs);
+ writel(val, priv->regs + APLIC_xMSICFGADDR);
+ writel(valh, priv->regs + APLIC_xMSICFGADDRH);
+ }
+#endif
+
+ /* Setup APLIC domaincfg register */
+ val = readl(priv->regs + APLIC_DOMAINCFG);
+ val |= APLIC_DOMAINCFG_IE;
+ if (msi_mode)
+ val |= APLIC_DOMAINCFG_DM;
+ writel(val, priv->regs + APLIC_DOMAINCFG);
+ if (readl(priv->regs + APLIC_DOMAINCFG) != val)
+ dev_warn(priv->dev, "unable to write 0x%x in domaincfg\n", val);
+}
+
+static void aplic_init_hw_irqs(struct aplic_priv *priv)
+{
+ int i;
+
+ /* Disable all interrupts */
+ for (i = 0; i <= priv->nr_irqs; i += 32)
+ writel(-1U, priv->regs + APLIC_CLRIE_BASE + (i / 32) * sizeof(u32));
+
+ /* Set interrupt type and default priority for all interrupts */
+ for (i = 1; i <= priv->nr_irqs; i++) {
+ writel(0, priv->regs + APLIC_SOURCECFG_BASE + (i - 1) * sizeof(u32));
+ writel(APLIC_DEFAULT_PRIORITY,
+ priv->regs + APLIC_TARGET_BASE + (i - 1) * sizeof(u32));
+ }
+
+ /* Clear APLIC domaincfg */
+ writel(0, priv->regs + APLIC_DOMAINCFG);
+}
+
+int aplic_setup_priv(struct aplic_priv *priv, struct device *dev, void __iomem *regs)
+{
+ struct of_phandle_args parent;
+ int rc;
+
+ /*
+ * Currently, only an OF fwnode is supported, so this function
+ * will need to be extended for ACPI support.
+ */
+ if (!is_of_node(dev->fwnode))
+ return -EINVAL;
+
+ /* Save device pointer and register base */
+ priv->dev = dev;
+ priv->regs = regs;
+
+ /* Find out number of interrupt sources */
+ rc = of_property_read_u32(to_of_node(dev->fwnode), "riscv,num-sources",
+ &priv->nr_irqs);
+ if (rc) {
+ dev_err(dev, "failed to get number of interrupt sources\n");
+ return rc;
+ }
+
+ /*
+ * Find out number of IDCs based on parent interrupts
+ *
+ * If "msi-parent" property is present then we ignore the
+ * APLIC IDCs which forces the APLIC driver to use MSI mode.
+ */
+ if (!of_property_present(to_of_node(dev->fwnode), "msi-parent")) {
+ while (!of_irq_parse_one(to_of_node(dev->fwnode), priv->nr_idcs, &parent))
+ priv->nr_idcs++;
+ }
+
+ /* Set up the initial state of the APLIC interrupts */
+ aplic_init_hw_irqs(priv);
+
+ return 0;
+}
+
+static int aplic_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ bool msi_mode = false;
+ void __iomem *regs;
+ int rc;
+
+ /* Map the MMIO registers */
+ regs = devm_platform_ioremap_resource(pdev, 0);
+ if (!regs) {
+ dev_err(dev, "failed map MMIO registers\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * If the msi-parent property is present then set up APLIC MSI
+ * mode, otherwise set up APLIC direct mode.
+ */
+ if (is_of_node(dev->fwnode))
+ msi_mode = of_property_present(to_of_node(dev->fwnode), "msi-parent");
+ if (msi_mode)
+ rc = aplic_msi_setup(dev, regs);
+ else
+ rc = aplic_direct_setup(dev, regs);
+ if (rc)
+ dev_err(dev, "failed to setup APLIC in %s mode\n", msi_mode ? "MSI" : "direct");
+
+ return rc;
+}
+
+static const struct of_device_id aplic_match[] = {
+ { .compatible = "riscv,aplic" },
+ {}
+};
+
+static struct platform_driver aplic_driver = {
+ .driver = {
+ .name = "riscv-aplic",
+ .of_match_table = aplic_match,
+ },
+ .probe = aplic_probe,
+};
+builtin_platform_driver(aplic_driver);
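APLIC source numbers start at 1, so the per-source sourcecfg/target registers in the file above are addressed with an (hwirq - 1) slot. A tiny illustrative helper (names and the u32 stride are assumptions drawn from the code above):

#include <linux/io.h>

static void __iomem *source_reg(void __iomem *base, unsigned long array_off,
				unsigned long hwirq)
{
	return base + array_off + (hwirq - 1) * sizeof(u32);
}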
diff --git a/drivers/irqchip/irq-riscv-aplic-main.h b/drivers/irqchip/irq-riscv-aplic-main.h
new file mode 100644
index 00000000000000..4393927d8c8070
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-aplic-main.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#ifndef _IRQ_RISCV_APLIC_MAIN_H
+#define _IRQ_RISCV_APLIC_MAIN_H
+
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/fwnode.h>
+
+#define APLIC_DEFAULT_PRIORITY 1
+
+struct aplic_msicfg {
+ phys_addr_t base_ppn;
+ u32 hhxs;
+ u32 hhxw;
+ u32 lhxs;
+ u32 lhxw;
+};
+
+struct aplic_priv {
+ struct device *dev;
+ u32 gsi_base;
+ u32 nr_irqs;
+ u32 nr_idcs;
+ void __iomem *regs;
+ struct aplic_msicfg msicfg;
+};
+
+void aplic_irq_unmask(struct irq_data *d);
+void aplic_irq_mask(struct irq_data *d);
+int aplic_irq_set_type(struct irq_data *d, unsigned int type);
+int aplic_irqdomain_translate(struct irq_fwspec *fwspec, u32 gsi_base,
+ unsigned long *hwirq, unsigned int *type);
+void aplic_init_hw_global(struct aplic_priv *priv, bool msi_mode);
+int aplic_setup_priv(struct aplic_priv *priv, struct device *dev, void __iomem *regs);
+int aplic_direct_setup(struct device *dev, void __iomem *regs);
+#ifdef CONFIG_RISCV_APLIC_MSI
+int aplic_msi_setup(struct device *dev, void __iomem *regs);
+#else
+static inline int aplic_msi_setup(struct device *dev, void __iomem *regs)
+{
+ return -ENODEV;
+}
+#endif
+
+#endif
diff --git a/drivers/irqchip/irq-riscv-aplic-msi.c b/drivers/irqchip/irq-riscv-aplic-msi.c
new file mode 100644
index 00000000000000..028444af48bd57
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-aplic-msi.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/riscv-aplic.h>
+#include <linux/irqchip/riscv-imsic.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+#include <linux/smp.h>
+
+#include "irq-riscv-aplic-main.h"
+
+static void aplic_msi_irq_mask(struct irq_data *d)
+{
+ aplic_irq_mask(d);
+ irq_chip_mask_parent(d);
+}
+
+static void aplic_msi_irq_unmask(struct irq_data *d)
+{
+ irq_chip_unmask_parent(d);
+ aplic_irq_unmask(d);
+}
+
+static void aplic_msi_irq_eoi(struct irq_data *d)
+{
+ struct aplic_priv *priv = irq_data_get_irq_chip_data(d);
+
+ /*
+ * EOI handling is required only for level-triggered interrupts
+ * when APLIC is in MSI mode.
+ */
+
+ switch (irqd_get_trigger_type(d)) {
+ case IRQ_TYPE_LEVEL_LOW:
+ case IRQ_TYPE_LEVEL_HIGH:
+ /*
+ * The section "4.9.2 Special consideration for level-sensitive interrupt
+ * sources" of the RISC-V AIA specification says:
+ *
+ * A second option is for the interrupt service routine to write the
+ * APLIC’s source identity number for the interrupt to the domain’s
+ * setipnum register just before exiting. This will cause the interrupt’s
+ * pending bit to be set to one again if the source is still asserting
+ * an interrupt, but not if the source is not asserting an interrupt.
+ */
+ writel(d->hwirq, priv->regs + APLIC_SETIPNUM_LE);
+ break;
+ }
+}
+
+static void aplic_msi_write_msg(struct irq_data *d, struct msi_msg *msg)
+{
+ unsigned int group_index, hart_index, guest_index, val;
+ struct aplic_priv *priv = irq_data_get_irq_chip_data(d);
+ struct aplic_msicfg *mc = &priv->msicfg;
+ phys_addr_t tppn, tbppn, msg_addr;
+ void __iomem *target;
+
+ /* For zeroed MSI, simply write zero into the target register */
+ if (!msg->address_hi && !msg->address_lo && !msg->data) {
+ target = priv->regs + APLIC_TARGET_BASE;
+ target += (d->hwirq - 1) * sizeof(u32);
+ writel(0, target);
+ return;
+ }
+
+ /* Sanity check on message data */
+ WARN_ON(msg->data > APLIC_TARGET_EIID_MASK);
+
+ /* Compute target MSI address */
+ msg_addr = (((u64)msg->address_hi) << 32) | msg->address_lo;
+ tppn = msg_addr >> APLIC_xMSICFGADDR_PPN_SHIFT;
+
+ /* Compute target HART Base PPN */
+ tbppn = tppn;
+ tbppn &= ~APLIC_xMSICFGADDR_PPN_HART(mc->lhxs);
+ tbppn &= ~APLIC_xMSICFGADDR_PPN_LHX(mc->lhxw, mc->lhxs);
+ tbppn &= ~APLIC_xMSICFGADDR_PPN_HHX(mc->hhxw, mc->hhxs);
+ WARN_ON(tbppn != mc->base_ppn);
+
+ /* Compute target group and hart indexes */
+ group_index = (tppn >> APLIC_xMSICFGADDR_PPN_HHX_SHIFT(mc->hhxs)) &
+ APLIC_xMSICFGADDR_PPN_HHX_MASK(mc->hhxw);
+ hart_index = (tppn >> APLIC_xMSICFGADDR_PPN_LHX_SHIFT(mc->lhxs)) &
+ APLIC_xMSICFGADDR_PPN_LHX_MASK(mc->lhxw);
+ hart_index |= (group_index << mc->lhxw);
+ WARN_ON(hart_index > APLIC_TARGET_HART_IDX_MASK);
+
+ /* Compute target guest index */
+ guest_index = tppn & APLIC_xMSICFGADDR_PPN_HART(mc->lhxs);
+ WARN_ON(guest_index > APLIC_TARGET_GUEST_IDX_MASK);
+
+ /* Update IRQ TARGET register */
+ target = priv->regs + APLIC_TARGET_BASE;
+ target += (d->hwirq - 1) * sizeof(u32);
+ val = FIELD_PREP(APLIC_TARGET_HART_IDX, hart_index);
+ val |= FIELD_PREP(APLIC_TARGET_GUEST_IDX, guest_index);
+ val |= FIELD_PREP(APLIC_TARGET_EIID, msg->data);
+ writel(val, target);
+}
+
+static void aplic_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+ arg->desc = desc;
+ arg->hwirq = (u32)desc->data.icookie.value;
+}
+
+static int aplic_msi_translate(struct irq_domain *d, struct irq_fwspec *fwspec,
+ unsigned long *hwirq, unsigned int *type)
+{
+ struct msi_domain_info *info = d->host_data;
+ struct aplic_priv *priv = info->data;
+
+ return aplic_irqdomain_translate(fwspec, priv->gsi_base, hwirq, type);
+}
+
+static const struct msi_domain_template aplic_msi_template = {
+ .chip = {
+ .name = "APLIC-MSI",
+ .irq_mask = aplic_msi_irq_mask,
+ .irq_unmask = aplic_msi_irq_unmask,
+ .irq_set_type = aplic_irq_set_type,
+ .irq_eoi = aplic_msi_irq_eoi,
+#ifdef CONFIG_SMP
+ .irq_set_affinity = irq_chip_set_affinity_parent,
+#endif
+ .irq_write_msi_msg = aplic_msi_write_msg,
+ .flags = IRQCHIP_SET_TYPE_MASKED |
+ IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_MASK_ON_SUSPEND,
+ },
+
+ .ops = {
+ .set_desc = aplic_msi_set_desc,
+ .msi_translate = aplic_msi_translate,
+ },
+
+ .info = {
+ .bus_token = DOMAIN_BUS_WIRED_TO_MSI,
+ .flags = MSI_FLAG_USE_DEV_FWNODE,
+ .handler = handle_fasteoi_irq,
+ .handler_name = "fasteoi",
+ },
+};
+
+int aplic_msi_setup(struct device *dev, void __iomem *regs)
+{
+ const struct imsic_global_config *imsic_global;
+ struct aplic_priv *priv;
+ struct aplic_msicfg *mc;
+ phys_addr_t pa;
+ int rc;
+
+ priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ rc = aplic_setup_priv(priv, dev, regs);
+ if (rc) {
+ dev_err(dev, "failed to create APLIC context\n");
+ return rc;
+ }
+ mc = &priv->msicfg;
+
+ /*
+ * The APLIC outgoing MSI config registers assume the target MSI
+ * controller to be a RISC-V AIA IMSIC controller.
+ */
+ imsic_global = imsic_get_global_config();
+ if (!imsic_global) {
+ dev_err(dev, "IMSIC global config not found\n");
+ return -ENODEV;
+ }
+
+ /* Find number of guest index bits (LHXS) */
+ mc->lhxs = imsic_global->guest_index_bits;
+ if (APLIC_xMSICFGADDRH_LHXS_MASK < mc->lhxs) {
+ dev_err(dev, "IMSIC guest index bits big for APLIC LHXS\n");
+ return -EINVAL;
+ }
+
+ /* Find number of HART index bits (LHXW) */
+ mc->lhxw = imsic_global->hart_index_bits;
+ if (APLIC_xMSICFGADDRH_LHXW_MASK < mc->lhxw) {
+ dev_err(dev, "IMSIC hart index bits big for APLIC LHXW\n");
+ return -EINVAL;
+ }
+
+ /* Find number of group index bits (HHXW) */
+ mc->hhxw = imsic_global->group_index_bits;
+ if (APLIC_xMSICFGADDRH_HHXW_MASK < mc->hhxw) {
+ dev_err(dev, "IMSIC group index bits big for APLIC HHXW\n");
+ return -EINVAL;
+ }
+
+ /* Find first bit position of group index (HHXS) */
+ mc->hhxs = imsic_global->group_index_shift;
+ if (mc->hhxs < (2 * APLIC_xMSICFGADDR_PPN_SHIFT)) {
+ dev_err(dev, "IMSIC group index shift should be >= %d\n",
+ (2 * APLIC_xMSICFGADDR_PPN_SHIFT));
+ return -EINVAL;
+ }
+ mc->hhxs -= (2 * APLIC_xMSICFGADDR_PPN_SHIFT);
+ if (APLIC_xMSICFGADDRH_HHXS_MASK < mc->hhxs) {
+ dev_err(dev, "IMSIC group index shift big for APLIC HHXS\n");
+ return -EINVAL;
+ }
+
+ /* Compute PPN base */
+ mc->base_ppn = imsic_global->base_addr >> APLIC_xMSICFGADDR_PPN_SHIFT;
+ mc->base_ppn &= ~APLIC_xMSICFGADDR_PPN_HART(mc->lhxs);
+ mc->base_ppn &= ~APLIC_xMSICFGADDR_PPN_LHX(mc->lhxw, mc->lhxs);
+ mc->base_ppn &= ~APLIC_xMSICFGADDR_PPN_HHX(mc->hhxw, mc->hhxs);
+
+ /* Setup global config and interrupt delivery */
+ aplic_init_hw_global(priv, true);
+
+ /* Set the APLIC device MSI domain if not available */
+ if (!dev_get_msi_domain(dev)) {
+ /*
+ * The device MSI domain for OF devices is only set at the
+ * time of populating/creating the OF device. If the device MSI
+ * domain is discovered later, after the OF device is created,
+ * then we need to set it explicitly before using any platform
+ * MSI functions.
+ *
+ * In the case of the APLIC device, the parent MSI domain is
+ * always IMSIC and the IMSIC MSI domains are created later
+ * through platform driver probing, so we set it explicitly here.
+ */
+ if (is_of_node(dev->fwnode))
+ of_msi_configure(dev, to_of_node(dev->fwnode));
+ }
+
+ if (!msi_create_device_irq_domain(dev, MSI_DEFAULT_DOMAIN, &aplic_msi_template,
+ priv->nr_irqs + 1, priv, priv)) {
+ dev_err(dev, "failed to create MSI irq domain\n");
+ return -ENOMEM;
+ }
+
+ /* Advertise the interrupt controller */
+ pa = priv->msicfg.base_ppn << APLIC_xMSICFGADDR_PPN_SHIFT;
+ dev_info(dev, "%d interrupts forwarded to MSI base %pa\n", priv->nr_irqs, &pa);
+
+ return 0;
+}
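aplic_msi_write_msg() above recovers the hart and guest indexes from the IMSIC MSI target address. An illustrative decomposition with made-up example field widths (the EX_* values are not the AIA constants):

#include <linux/bits.h>
#include <linux/types.h>

#define EX_PPN_SHIFT	12	/* 4 KiB MSI pages (example value)  */
#define EX_LHXS		1	/* guest-index bits (example value) */
#define EX_LHXW		3	/* hart-index bits (example value)  */

static void decode_msi_addr(u64 msg_addr, u32 *hart_index, u32 *guest_index)
{
	u64 tppn = msg_addr >> EX_PPN_SHIFT;

	*guest_index = tppn & (BIT(EX_LHXS) - 1);
	*hart_index  = (tppn >> EX_LHXS) & (BIT(EX_LHXW) - 1);
}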
diff --git a/drivers/irqchip/irq-riscv-imsic-early.c b/drivers/irqchip/irq-riscv-imsic-early.c
new file mode 100644
index 00000000000000..886418ec06cb9b
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-imsic-early.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "riscv-imsic: " fmt
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+
+#include "irq-riscv-imsic-state.h"
+
+static int imsic_parent_irq;
+
+#ifdef CONFIG_SMP
+static void imsic_ipi_send(unsigned int cpu)
+{
+ struct imsic_local_config *local = per_cpu_ptr(imsic->global.local, cpu);
+
+ writel_relaxed(IMSIC_IPI_ID, local->msi_va);
+}
+
+static void imsic_ipi_starting_cpu(void)
+{
+ /* Enable IPIs for current CPU. */
+ __imsic_id_set_enable(IMSIC_IPI_ID);
+}
+
+static void imsic_ipi_dying_cpu(void)
+{
+ /* Disable IPIs for current CPU. */
+ __imsic_id_clear_enable(IMSIC_IPI_ID);
+}
+
+static int __init imsic_ipi_domain_init(void)
+{
+ int virq;
+
+ /* Create IMSIC IPI multiplexing */
+ virq = ipi_mux_create(IMSIC_NR_IPI, imsic_ipi_send);
+ if (virq <= 0)
+ return virq < 0 ? virq : -ENOMEM;
+
+ /* Set vIRQ range */
+ riscv_ipi_set_virq_range(virq, IMSIC_NR_IPI, true);
+
+ /* Announce that IMSIC is providing IPIs */
+ pr_info("%pfwP: providing IPIs using interrupt %d\n", imsic->fwnode, IMSIC_IPI_ID);
+
+ return 0;
+}
+#else
+static void imsic_ipi_starting_cpu(void) { }
+static void imsic_ipi_dying_cpu(void) { }
+static int __init imsic_ipi_domain_init(void) { return 0; }
+#endif
+
+/*
+ * To handle an interrupt, we read the TOPEI CSR and write zero in one
+ * instruction. If the TOPEI CSR is non-zero, we translate TOPEI.ID to a
+ * Linux interrupt number and let the Linux IRQ subsystem handle it.
+ */
+static void imsic_handle_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ int err, cpu = smp_processor_id();
+ struct imsic_vector *vec;
+ unsigned long local_id;
+
+ chained_irq_enter(chip, desc);
+
+ while ((local_id = csr_swap(CSR_TOPEI, 0))) {
+ local_id >>= TOPEI_ID_SHIFT;
+
+ if (local_id == IMSIC_IPI_ID) {
+ if (IS_ENABLED(CONFIG_SMP))
+ ipi_mux_process();
+ continue;
+ }
+
+ if (unlikely(!imsic->base_domain))
+ continue;
+
+ vec = imsic_vector_from_local_id(cpu, local_id);
+ if (!vec) {
+ pr_warn_ratelimited("vector not found for local ID 0x%lx\n", local_id);
+ continue;
+ }
+
+ err = generic_handle_domain_irq(imsic->base_domain, vec->hwirq);
+ if (unlikely(err))
+ pr_warn_ratelimited("hwirq 0x%x mapping not found\n", vec->hwirq);
+ }
+
+ chained_irq_exit(chip, desc);
+}
+
+static int imsic_starting_cpu(unsigned int cpu)
+{
+ /* Mark per-CPU IMSIC state as online */
+ imsic_state_online();
+
+ /* Enable per-CPU parent interrupt */
+ enable_percpu_irq(imsic_parent_irq, irq_get_trigger_type(imsic_parent_irq));
+
+ /* Setup IPIs */
+ imsic_ipi_starting_cpu();
+
+ /*
+ * Interrupt identities might have been enabled/disabled while
+ * this CPU was not running, so sync up the local enable/disable state.
+ */
+ imsic_local_sync_all();
+
+ /* Enable local interrupt delivery */
+ imsic_local_delivery(true);
+
+ return 0;
+}
+
+static int imsic_dying_cpu(unsigned int cpu)
+{
+ /* Cleanup IPIs */
+ imsic_ipi_dying_cpu();
+
+ /* Mark per-CPU IMSIC state as offline */
+ imsic_state_offline();
+
+ return 0;
+}
+
+static int __init imsic_early_probe(struct fwnode_handle *fwnode)
+{
+ struct irq_domain *domain;
+ int rc;
+
+ /* Find parent domain and register chained handler */
+ domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), DOMAIN_BUS_ANY);
+ if (!domain) {
+ pr_err("%pfwP: Failed to find INTC domain\n", fwnode);
+ return -ENOENT;
+ }
+ imsic_parent_irq = irq_create_mapping(domain, RV_IRQ_EXT);
+ if (!imsic_parent_irq) {
+ pr_err("%pfwP: Failed to create INTC mapping\n", fwnode);
+ return -ENOENT;
+ }
+
+ /* Initialize IPI domain */
+ rc = imsic_ipi_domain_init();
+ if (rc) {
+ pr_err("%pfwP: Failed to initialize IPI domain\n", fwnode);
+ return rc;
+ }
+
+ /* Setup chained handler to the parent domain interrupt */
+ irq_set_chained_handler(imsic_parent_irq, imsic_handle_irq);
+
+ /*
+ * Setup cpuhp state (must be done after setting imsic_parent_irq)
+ *
+ * Don't disable the per-CPU IMSIC file when a CPU goes offline,
+ * because this would affect IPIs; the masking/unmasking of
+ * virtual IPIs is done via the generic IPI mux.
+ */
+ cpuhp_setup_state(CPUHP_AP_IRQ_RISCV_IMSIC_STARTING, "irqchip/riscv/imsic:starting",
+ imsic_starting_cpu, imsic_dying_cpu);
+
+ return 0;
+}
+
+static int __init imsic_early_dt_init(struct device_node *node, struct device_node *parent)
+{
+ struct fwnode_handle *fwnode = &node->fwnode;
+ int rc;
+
+ /* Setup IMSIC state */
+ rc = imsic_setup_state(fwnode);
+ if (rc) {
+ pr_err("%pfwP: failed to setup state (error %d)\n", fwnode, rc);
+ return rc;
+ }
+
+ /* Do early setup of IPIs */
+ rc = imsic_early_probe(fwnode);
+ if (rc)
+ return rc;
+
+ /* Ensure that OF platform device gets probed */
+ of_node_clear_flag(node, OF_POPULATED);
+ return 0;
+}
+
+IRQCHIP_DECLARE(riscv_imsic, "riscv,imsics", imsic_early_dt_init);
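The early IMSIC setup above arms the chained parent interrupt on every CPU through CPU hotplug callbacks. A reduced sketch of that registration pattern (hypothetical names, dynamic hotplug state rather than the driver's dedicated one):

#include <linux/cpuhotplug.h>
#include <linux/interrupt.h>
#include <linux/irq.h>

static unsigned int ex_parent_irq;

static int ex_starting_cpu(unsigned int cpu)
{
	enable_percpu_irq(ex_parent_irq, irq_get_trigger_type(ex_parent_irq));
	return 0;
}

static int ex_dying_cpu(unsigned int cpu)
{
	disable_percpu_irq(ex_parent_irq);
	return 0;
}

static int __init ex_register_hotplug(void)
{
	int ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
				    "irqchip/example:starting",
				    ex_starting_cpu, ex_dying_cpu);

	return ret < 0 ? ret : 0;
}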
diff --git a/drivers/irqchip/irq-riscv-imsic-platform.c b/drivers/irqchip/irq-riscv-imsic-platform.c
new file mode 100644
index 00000000000000..11723a763c102a
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-imsic-platform.c
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "riscv-imsic: " fmt
+#include <linux/bitmap.h>
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+
+#include "irq-riscv-imsic-state.h"
+
+static bool imsic_cpu_page_phys(unsigned int cpu, unsigned int guest_index,
+ phys_addr_t *out_msi_pa)
+{
+ struct imsic_global_config *global;
+ struct imsic_local_config *local;
+
+ global = &imsic->global;
+ local = per_cpu_ptr(global->local, cpu);
+
+ if (BIT(global->guest_index_bits) <= guest_index)
+ return false;
+
+ if (out_msi_pa)
+ *out_msi_pa = local->msi_pa + (guest_index * IMSIC_MMIO_PAGE_SZ);
+
+ return true;
+}
+
+static void imsic_irq_mask(struct irq_data *d)
+{
+ imsic_vector_mask(irq_data_get_irq_chip_data(d));
+}
+
+static void imsic_irq_unmask(struct irq_data *d)
+{
+ imsic_vector_unmask(irq_data_get_irq_chip_data(d));
+}
+
+static int imsic_irq_retrigger(struct irq_data *d)
+{
+ struct imsic_vector *vec = irq_data_get_irq_chip_data(d);
+ struct imsic_local_config *local;
+
+ if (WARN_ON(!vec))
+ return -ENOENT;
+
+ local = per_cpu_ptr(imsic->global.local, vec->cpu);
+ writel_relaxed(vec->local_id, local->msi_va);
+ return 0;
+}
+
+static void imsic_irq_compose_vector_msg(struct imsic_vector *vec, struct msi_msg *msg)
+{
+ phys_addr_t msi_addr;
+
+ if (WARN_ON(!vec))
+ return;
+
+ if (WARN_ON(!imsic_cpu_page_phys(vec->cpu, 0, &msi_addr)))
+ return;
+
+ msg->address_hi = upper_32_bits(msi_addr);
+ msg->address_lo = lower_32_bits(msi_addr);
+ msg->data = vec->local_id;
+}
+
+static void imsic_irq_compose_msg(struct irq_data *d, struct msi_msg *msg)
+{
+ imsic_irq_compose_vector_msg(irq_data_get_irq_chip_data(d), msg);
+}
+
+#ifdef CONFIG_SMP
+static void imsic_msi_update_msg(struct irq_data *d, struct imsic_vector *vec)
+{
+ struct msi_msg msg = { };
+
+ imsic_irq_compose_vector_msg(vec, &msg);
+ irq_data_get_irq_chip(d)->irq_write_msi_msg(d, &msg);
+}
+
+static int imsic_irq_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
+ bool force)
+{
+ struct imsic_vector *old_vec, *new_vec;
+ struct irq_data *pd = d->parent_data;
+
+ old_vec = irq_data_get_irq_chip_data(pd);
+ if (WARN_ON(!old_vec))
+ return -ENOENT;
+
+ /* If old vector cpu belongs to the target cpumask then do nothing */
+ if (cpumask_test_cpu(old_vec->cpu, mask_val))
+ return IRQ_SET_MASK_OK_DONE;
+
+ /* If move is already in-flight then return failure */
+ if (imsic_vector_get_move(old_vec))
+ return -EBUSY;
+
+ /* Get a new vector on the desired set of CPUs */
+ new_vec = imsic_vector_alloc(old_vec->hwirq, mask_val);
+ if (!new_vec)
+ return -ENOSPC;
+
+ /* Point device to the new vector */
+ imsic_msi_update_msg(d, new_vec);
+
+ /* Update irq descriptors with the new vector */
+ pd->chip_data = new_vec;
+
+ /* Update effective affinity of parent irq data */
+ irq_data_update_effective_affinity(pd, cpumask_of(new_vec->cpu));
+
+ /* Move state of the old vector to the new vector */
+ imsic_vector_move(old_vec, new_vec);
+
+ return IRQ_SET_MASK_OK_DONE;
+}
+#endif
+
+static struct irq_chip imsic_irq_base_chip = {
+ .name = "IMSIC",
+ .irq_mask = imsic_irq_mask,
+ .irq_unmask = imsic_irq_unmask,
+ .irq_retrigger = imsic_irq_retrigger,
+ .irq_compose_msi_msg = imsic_irq_compose_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_MASK_ON_SUSPEND,
+};
+
+static int imsic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *args)
+{
+ struct imsic_vector *vec;
+
+ /* Multi-MSI is not supported yet. */
+ if (nr_irqs > 1)
+ return -EOPNOTSUPP;
+
+ vec = imsic_vector_alloc(virq, cpu_online_mask);
+ if (!vec)
+ return -ENOSPC;
+
+ irq_domain_set_info(domain, virq, virq, &imsic_irq_base_chip, vec,
+ handle_simple_irq, NULL, NULL);
+ irq_set_noprobe(virq);
+ irq_set_affinity(virq, cpu_online_mask);
+ irq_data_update_effective_affinity(irq_get_irq_data(virq), cpumask_of(vec->cpu));
+
+ return 0;
+}
+
+static void imsic_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+
+ imsic_vector_free(irq_data_get_irq_chip_data(d));
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+static int imsic_irq_domain_select(struct irq_domain *domain, struct irq_fwspec *fwspec,
+ enum irq_domain_bus_token bus_token)
+{
+ const struct msi_parent_ops *ops = domain->msi_parent_ops;
+ u32 busmask = BIT(bus_token);
+
+ if (fwspec->fwnode != domain->fwnode || fwspec->param_count != 0)
+ return 0;
+
+ /* Handle pure domain searches */
+ if (bus_token == ops->bus_select_token)
+ return 1;
+
+ return !!(ops->bus_select_mask & busmask);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static void imsic_irq_debug_show(struct seq_file *m, struct irq_domain *d,
+ struct irq_data *irqd, int ind)
+{
+ if (!irqd) {
+ imsic_vector_debug_show_summary(m, ind);
+ return;
+ }
+
+ imsic_vector_debug_show(m, irq_data_get_irq_chip_data(irqd), ind);
+}
+#endif
+
+static const struct irq_domain_ops imsic_base_domain_ops = {
+ .alloc = imsic_irq_domain_alloc,
+ .free = imsic_irq_domain_free,
+ .select = imsic_irq_domain_select,
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+ .debug_show = imsic_irq_debug_show,
+#endif
+};
+
+#ifdef CONFIG_RISCV_IMSIC_PCI
+
+static void imsic_pci_mask_irq(struct irq_data *d)
+{
+ pci_msi_mask_irq(d);
+ irq_chip_mask_parent(d);
+}
+
+static void imsic_pci_unmask_irq(struct irq_data *d)
+{
+ irq_chip_unmask_parent(d);
+ pci_msi_unmask_irq(d);
+}
+
+#define MATCH_PCI_MSI BIT(DOMAIN_BUS_PCI_MSI)
+
+#else
+
+#define MATCH_PCI_MSI 0
+
+#endif
+
+static bool imsic_init_dev_msi_info(struct device *dev,
+ struct irq_domain *domain,
+ struct irq_domain *real_parent,
+ struct msi_domain_info *info)
+{
+ const struct msi_parent_ops *pops = real_parent->msi_parent_ops;
+
+ /* MSI parent domain specific settings */
+ switch (real_parent->bus_token) {
+ case DOMAIN_BUS_NEXUS:
+ if (WARN_ON_ONCE(domain != real_parent))
+ return false;
+#ifdef CONFIG_SMP
+ info->chip->irq_set_affinity = imsic_irq_set_affinity;
+#endif
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* Is the target supported? */
+ switch (info->bus_token) {
+#ifdef CONFIG_RISCV_IMSIC_PCI
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ info->chip->irq_mask = imsic_pci_mask_irq;
+ info->chip->irq_unmask = imsic_pci_unmask_irq;
+ break;
+#endif
+ case DOMAIN_BUS_DEVICE_MSI:
+ /*
+ * Per-device MSI should never have any MSI feature bits
+ * set. Its sole purpose is to create a dumb interrupt
+ * chip which has a device-specific irq_write_msi_msg()
+ * callback.
+ */
+ if (WARN_ON_ONCE(info->flags))
+ return false;
+
+ /* Core managed MSI descriptors */
+ info->flags |= MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS |
+ MSI_FLAG_FREE_MSI_DESCS;
+ break;
+ case DOMAIN_BUS_WIRED_TO_MSI:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* Use hierarchical chip operations for re-trigger */
+ info->chip->irq_retrigger = irq_chip_retrigger_hierarchy;
+
+ /*
+ * Mask out the domain specific MSI feature flags which are not
+ * supported by the real parent.
+ */
+ info->flags &= pops->supported_flags;
+
+ /* Enforce the required flags */
+ info->flags |= pops->required_flags;
+
+ return true;
+}
+
+#define MATCH_PLATFORM_MSI BIT(DOMAIN_BUS_PLATFORM_MSI)
+
+static const struct msi_parent_ops imsic_msi_parent_ops = {
+ .supported_flags = MSI_GENERIC_FLAGS_MASK |
+ MSI_FLAG_PCI_MSIX,
+ .required_flags = MSI_FLAG_USE_DEF_DOM_OPS |
+ MSI_FLAG_USE_DEF_CHIP_OPS,
+ .bus_select_token = DOMAIN_BUS_NEXUS,
+ .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI,
+ .init_dev_msi_info = imsic_init_dev_msi_info,
+};
+
+int imsic_irqdomain_init(void)
+{
+ struct imsic_global_config *global;
+
+ if (!imsic || !imsic->fwnode) {
+ pr_err("early driver not probed\n");
+ return -ENODEV;
+ }
+
+ if (imsic->base_domain) {
+ pr_err("%pfwP: irq domain already created\n", imsic->fwnode);
+ return -ENODEV;
+ }
+
+ /* Create Base IRQ domain */
+ imsic->base_domain = irq_domain_create_tree(imsic->fwnode,
+ &imsic_base_domain_ops, imsic);
+ if (!imsic->base_domain) {
+ pr_err("%pfwP: failed to create IMSIC base domain\n", imsic->fwnode);
+ return -ENOMEM;
+ }
+ imsic->base_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+ imsic->base_domain->msi_parent_ops = &imsic_msi_parent_ops;
+
+ irq_domain_update_bus_token(imsic->base_domain, DOMAIN_BUS_NEXUS);
+
+ global = &imsic->global;
+ pr_info("%pfwP: hart-index-bits: %d, guest-index-bits: %d\n",
+ imsic->fwnode, global->hart_index_bits, global->guest_index_bits);
+ pr_info("%pfwP: group-index-bits: %d, group-index-shift: %d\n",
+ imsic->fwnode, global->group_index_bits, global->group_index_shift);
+ pr_info("%pfwP: per-CPU IDs %d at base PPN %pa\n",
+ imsic->fwnode, global->nr_ids, &global->base_addr);
+ pr_info("%pfwP: total %d interrupts available\n",
+ imsic->fwnode, num_possible_cpus() * (global->nr_ids - 1));
+
+ return 0;
+}
+
+static int imsic_platform_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+
+ if (imsic && imsic->fwnode != dev->fwnode) {
+ dev_err(dev, "fwnode mismatch\n");
+ return -ENODEV;
+ }
+
+ return imsic_irqdomain_init();
+}
+
+static const struct of_device_id imsic_platform_match[] = {
+ { .compatible = "riscv,imsics" },
+ {}
+};
+
+static struct platform_driver imsic_platform_driver = {
+ .driver = {
+ .name = "riscv-imsic",
+ .of_match_table = imsic_platform_match,
+ },
+ .probe = imsic_platform_probe,
+};
+builtin_platform_driver(imsic_platform_driver);
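The platform driver above composes an MSI message whose address selects a CPU's IMSIC interrupt file and whose data selects the interrupt identity. A stripped-down sketch of that composition with illustrative structures (ex_vector is not the driver's type):

#include <linux/kernel.h>
#include <linux/msi.h>
#include <linux/types.h>

struct ex_vector {
	unsigned int cpu;	/* target CPU                */
	u32 local_id;		/* identity within its file  */
};

static void ex_compose_msg(const struct ex_vector *vec,
			   phys_addr_t cpu_msi_pa, struct msi_msg *msg)
{
	msg->address_hi = upper_32_bits(cpu_msi_pa);
	msg->address_lo = lower_32_bits(cpu_msi_pa);
	msg->data = vec->local_id;
}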
diff --git a/drivers/irqchip/irq-riscv-imsic-state.c b/drivers/irqchip/irq-riscv-imsic-state.c
new file mode 100644
index 00000000000000..5479f872e62b42
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-imsic-state.c
@@ -0,0 +1,865 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "riscv-imsic: " fmt
+#include <linux/cpu.h>
+#include <linux/bitmap.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <asm/hwcap.h>
+
+#include "irq-riscv-imsic-state.h"
+
+#define IMSIC_DISABLE_EIDELIVERY 0
+#define IMSIC_ENABLE_EIDELIVERY 1
+#define IMSIC_DISABLE_EITHRESHOLD 1
+#define IMSIC_ENABLE_EITHRESHOLD 0
+
+static inline void imsic_csr_write(unsigned long reg, unsigned long val)
+{
+ csr_write(CSR_ISELECT, reg);
+ csr_write(CSR_IREG, val);
+}
+
+static inline unsigned long imsic_csr_read(unsigned long reg)
+{
+ csr_write(CSR_ISELECT, reg);
+ return csr_read(CSR_IREG);
+}
+
+static inline unsigned long imsic_csr_read_clear(unsigned long reg, unsigned long val)
+{
+ csr_write(CSR_ISELECT, reg);
+ return csr_read_clear(CSR_IREG, val);
+}
+
+static inline void imsic_csr_set(unsigned long reg, unsigned long val)
+{
+ csr_write(CSR_ISELECT, reg);
+ csr_set(CSR_IREG, val);
+}
+
+static inline void imsic_csr_clear(unsigned long reg, unsigned long val)
+{
+ csr_write(CSR_ISELECT, reg);
+ csr_clear(CSR_IREG, val);
+}
+
+struct imsic_priv *imsic;
+
+const struct imsic_global_config *imsic_get_global_config(void)
+{
+ return imsic ? &imsic->global : NULL;
+}
+EXPORT_SYMBOL_GPL(imsic_get_global_config);
+
+static bool __imsic_eix_read_clear(unsigned long id, bool pend)
+{
+ unsigned long isel, imask;
+
+ isel = id / BITS_PER_LONG;
+ isel *= BITS_PER_LONG / IMSIC_EIPx_BITS;
+ isel += pend ? IMSIC_EIP0 : IMSIC_EIE0;
+ imask = BIT(id & (__riscv_xlen - 1));
+
+ return !!(imsic_csr_read_clear(isel, imask) & imask);
+}
+
+static inline bool __imsic_id_read_clear_enabled(unsigned long id)
+{
+ return __imsic_eix_read_clear(id, false);
+}
+
+static inline bool __imsic_id_read_clear_pending(unsigned long id)
+{
+ return __imsic_eix_read_clear(id, true);
+}
+
+void __imsic_eix_update(unsigned long base_id, unsigned long num_id, bool pend, bool val)
+{
+ unsigned long id = base_id, last_id = base_id + num_id;
+ unsigned long i, isel, ireg;
+
+ while (id < last_id) {
+ isel = id / BITS_PER_LONG;
+ isel *= BITS_PER_LONG / IMSIC_EIPx_BITS;
+ isel += pend ? IMSIC_EIP0 : IMSIC_EIE0;
+
+ /*
+ * Prepare the ID mask to be programmed into the
+ * IMSIC EIEx and EIPx registers. These registers
+ * are XLEN-wide, and we must not touch IDs that
+ * are < base_id or >= (base_id + num_id).
+ */
+ ireg = 0;
+ for (i = id & (__riscv_xlen - 1); id < last_id && i < __riscv_xlen; i++) {
+ ireg |= BIT(i);
+ id++;
+ }
+
+ /*
+ * The IMSIC EIEx and EIPx registers are accessed
+ * indirectly through the ISELECT and IREG CSRs, so we
+ * must access these CSRs without getting preempted.
+ *
+ * All existing callers invoke this function with
+ * local IRQs disabled, so we don't need to do
+ * anything special here.
+ */
+ if (val)
+ imsic_csr_set(isel, ireg);
+ else
+ imsic_csr_clear(isel, ireg);
+ }
+}
+
+static void __imsic_local_sync(struct imsic_local_priv *lpriv)
+{
+ struct imsic_local_config *mlocal;
+ struct imsic_vector *vec, *mvec;
+ int i;
+
+ lockdep_assert_held(&lpriv->lock);
+
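+ /*
+ * Apply all vector enable/disable and movement updates recorded
+ * in the dirty bitmap since the last synchronization.
+ */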
+ for_each_set_bit(i, lpriv->dirty_bitmap, imsic->global.nr_ids + 1) {
+ if (!i || i == IMSIC_IPI_ID)
+ goto skip;
+ vec = &lpriv->vectors[i];
+
+ if (READ_ONCE(vec->enable))
+ __imsic_id_set_enable(i);
+ else
+ __imsic_id_clear_enable(i);
+
+ /*
+ * If the ID was being moved to a new ID on some other CPU,
+ * then we can get an MSI during the movement, so check the
+ * ID pending bit and re-trigger the new ID on the other CPU
+ * using an MMIO write.
+ */
+ mvec = READ_ONCE(vec->move);
+ WRITE_ONCE(vec->move, NULL);
+ if (mvec && mvec != vec) {
+ if (__imsic_id_read_clear_pending(i)) {
+ mlocal = per_cpu_ptr(imsic->global.local, mvec->cpu);
+ writel_relaxed(mvec->local_id, mlocal->msi_va);
+ }
+
+ imsic_vector_free(&lpriv->vectors[i]);
+ }
+
+skip:
+ bitmap_clear(lpriv->dirty_bitmap, i, 1);
+ }
+}
+
+void imsic_local_sync_all(void)
+{
+ struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&lpriv->lock, flags);
+ bitmap_fill(lpriv->dirty_bitmap, imsic->global.nr_ids + 1);
+ __imsic_local_sync(lpriv);
+ raw_spin_unlock_irqrestore(&lpriv->lock, flags);
+}
+
+void imsic_local_delivery(bool enable)
+{
+ if (enable) {
+ imsic_csr_write(IMSIC_EITHRESHOLD, IMSIC_ENABLE_EITHRESHOLD);
+ imsic_csr_write(IMSIC_EIDELIVERY, IMSIC_ENABLE_EIDELIVERY);
+ return;
+ }
+
+ imsic_csr_write(IMSIC_EIDELIVERY, IMSIC_DISABLE_EIDELIVERY);
+ imsic_csr_write(IMSIC_EITHRESHOLD, IMSIC_DISABLE_EITHRESHOLD);
+}
+
+#ifdef CONFIG_SMP
+static void imsic_local_timer_callback(struct timer_list *timer)
+{
+ struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&lpriv->lock, flags);
+ __imsic_local_sync(lpriv);
+ raw_spin_unlock_irqrestore(&lpriv->lock, flags);
+}
+
+static void __imsic_remote_sync(struct imsic_local_priv *lpriv, unsigned int cpu)
+{
+ lockdep_assert_held(&lpriv->lock);
+
+ /*
+ * The spinlock acquire/release semantics ensure that changes
+ * to vector enable, vector move and dirty bitmap are visible
+ * to the target CPU.
+ */
+
+ /*
+ * We schedule a timer on the target CPU if the target CPU is not
+ * the same as the current CPU. An offline CPU will unconditionally
+ * synchronize IDs through imsic_starting_cpu() when it is
+ * brought up.
+ */
+ if (cpu_online(cpu)) {
+ if (cpu == smp_processor_id()) {
+ __imsic_local_sync(lpriv);
+ return;
+ }
+
+ if (!timer_pending(&lpriv->timer)) {
+ lpriv->timer.expires = jiffies + 1;
+ add_timer_on(&lpriv->timer, cpu);
+ }
+ }
+}
+#else
+static void __imsic_remote_sync(struct imsic_local_priv *lpriv, unsigned int cpu)
+{
+ lockdep_assert_held(&lpriv->lock);
+ __imsic_local_sync(lpriv);
+}
+#endif
+
+void imsic_vector_mask(struct imsic_vector *vec)
+{
+ struct imsic_local_priv *lpriv;
+
+ lpriv = per_cpu_ptr(imsic->lpriv, vec->cpu);
+ if (WARN_ON_ONCE(&lpriv->vectors[vec->local_id] != vec))
+ return;
+
+ /*
+ * This function is called through the Linux irq subsystem with
+ * irqs disabled, so there is no need to save/restore irq flags.
+ */
+
+ raw_spin_lock(&lpriv->lock);
+
+ WRITE_ONCE(vec->enable, false);
+ bitmap_set(lpriv->dirty_bitmap, vec->local_id, 1);
+ __imsic_remote_sync(lpriv, vec->cpu);
+
+ raw_spin_unlock(&lpriv->lock);
+}
+
+void imsic_vector_unmask(struct imsic_vector *vec)
+{
+ struct imsic_local_priv *lpriv;
+
+ lpriv = per_cpu_ptr(imsic->lpriv, vec->cpu);
+ if (WARN_ON_ONCE(&lpriv->vectors[vec->local_id] != vec))
+ return;
+
+ /*
+ * This function is called through the Linux irq subsystem with
+ * irqs disabled, so there is no need to save/restore irq flags.
+ */
+
+ raw_spin_lock(&lpriv->lock);
+
+ WRITE_ONCE(vec->enable, true);
+ bitmap_set(lpriv->dirty_bitmap, vec->local_id, 1);
+ __imsic_remote_sync(lpriv, vec->cpu);
+
+ raw_spin_unlock(&lpriv->lock);
+}
+
+static bool imsic_vector_move_update(struct imsic_local_priv *lpriv, struct imsic_vector *vec,
+ bool new_enable, struct imsic_vector *new_move)
+{
+ unsigned long flags;
+ bool enabled;
+
+ raw_spin_lock_irqsave(&lpriv->lock, flags);
+
+ /* Update enable and move details */
+ enabled = READ_ONCE(vec->enable);
+ WRITE_ONCE(vec->enable, new_enable);
+ WRITE_ONCE(vec->move, new_move);
+
+ /* Mark the vector as dirty and synchronize */
+ bitmap_set(lpriv->dirty_bitmap, vec->local_id, 1);
+ __imsic_remote_sync(lpriv, vec->cpu);
+
+ raw_spin_unlock_irqrestore(&lpriv->lock, flags);
+
+ return enabled;
+}
+
+void imsic_vector_move(struct imsic_vector *old_vec, struct imsic_vector *new_vec)
+{
+ struct imsic_local_priv *old_lpriv, *new_lpriv;
+ bool enabled;
+
+ if (WARN_ON_ONCE(old_vec->cpu == new_vec->cpu))
+ return;
+
+ old_lpriv = per_cpu_ptr(imsic->lpriv, old_vec->cpu);
+ if (WARN_ON_ONCE(&old_lpriv->vectors[old_vec->local_id] != old_vec))
+ return;
+
+ new_lpriv = per_cpu_ptr(imsic->lpriv, new_vec->cpu);
+ if (WARN_ON_ONCE(&new_lpriv->vectors[new_vec->local_id] != new_vec))
+ return;
+
+ /*
+ * Move and re-trigger the new vector based on the pending
+ * state of the old vector because we might get a device
+ * interrupt on the old vector while the device was being
+ * moved to the new vector.
+ */
+ enabled = imsic_vector_move_update(old_lpriv, old_vec, false, new_vec);
+ imsic_vector_move_update(new_lpriv, new_vec, enabled, new_vec);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+void imsic_vector_debug_show(struct seq_file *m, struct imsic_vector *vec, int ind)
+{
+ struct imsic_local_priv *lpriv;
+ struct imsic_vector *mvec;
+ bool is_enabled;
+
+ lpriv = per_cpu_ptr(imsic->lpriv, vec->cpu);
+ if (WARN_ON_ONCE(&lpriv->vectors[vec->local_id] != vec))
+ return;
+
+ is_enabled = imsic_vector_isenabled(vec);
+ mvec = imsic_vector_get_move(vec);
+
+ seq_printf(m, "%*starget_cpu : %5u\n", ind, "", vec->cpu);
+ seq_printf(m, "%*starget_local_id : %5u\n", ind, "", vec->local_id);
+ seq_printf(m, "%*sis_reserved : %5u\n", ind, "",
+ (vec->local_id <= IMSIC_IPI_ID) ? 1 : 0);
+ seq_printf(m, "%*sis_enabled : %5u\n", ind, "", is_enabled ? 1 : 0);
+ seq_printf(m, "%*sis_move_pending : %5u\n", ind, "", mvec ? 1 : 0);
+ if (mvec) {
+ seq_printf(m, "%*smove_cpu : %5u\n", ind, "", mvec->cpu);
+ seq_printf(m, "%*smove_local_id : %5u\n", ind, "", mvec->local_id);
+ }
+}
+
+void imsic_vector_debug_show_summary(struct seq_file *m, int ind)
+{
+ irq_matrix_debug_show(m, imsic->matrix, ind);
+}
+#endif
+
+struct imsic_vector *imsic_vector_from_local_id(unsigned int cpu, unsigned int local_id)
+{
+ struct imsic_local_priv *lpriv = per_cpu_ptr(imsic->lpriv, cpu);
+
+ if (!lpriv || imsic->global.nr_ids < local_id)
+ return NULL;
+
+ return &lpriv->vectors[local_id];
+}
+
+struct imsic_vector *imsic_vector_alloc(unsigned int hwirq, const struct cpumask *mask)
+{
+ struct imsic_vector *vec = NULL;
+ struct imsic_local_priv *lpriv;
+ unsigned long flags;
+ unsigned int cpu;
+ int local_id;
+
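+ /* Let the matrix allocator pick a target CPU and a free local ID on that CPU */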
+ raw_spin_lock_irqsave(&imsic->matrix_lock, flags);
+ local_id = irq_matrix_alloc(imsic->matrix, mask, false, &cpu);
+ raw_spin_unlock_irqrestore(&imsic->matrix_lock, flags);
+ if (local_id < 0)
+ return NULL;
+
+ lpriv = per_cpu_ptr(imsic->lpriv, cpu);
+ vec = &lpriv->vectors[local_id];
+ vec->hwirq = hwirq;
+ vec->enable = false;
+ vec->move = NULL;
+
+ return vec;
+}
+
+void imsic_vector_free(struct imsic_vector *vec)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&imsic->matrix_lock, flags);
+ vec->hwirq = UINT_MAX;
+ irq_matrix_free(imsic->matrix, vec->cpu, vec->local_id, false);
+ raw_spin_unlock_irqrestore(&imsic->matrix_lock, flags);
+}
+
+static void __init imsic_local_cleanup(void)
+{
+ struct imsic_local_priv *lpriv;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ lpriv = per_cpu_ptr(imsic->lpriv, cpu);
+
+ bitmap_free(lpriv->dirty_bitmap);
+ kfree(lpriv->vectors);
+ }
+
+ free_percpu(imsic->lpriv);
+}
+
+static int __init imsic_local_init(void)
+{
+ struct imsic_global_config *global = &imsic->global;
+ struct imsic_local_priv *lpriv;
+ struct imsic_vector *vec;
+ int cpu, i;
+
+ /* Allocate per-CPU private state */
+ imsic->lpriv = alloc_percpu(typeof(*imsic->lpriv));
+ if (!imsic->lpriv)
+ return -ENOMEM;
+
+ /* Setup per-CPU private state */
+ for_each_possible_cpu(cpu) {
+ lpriv = per_cpu_ptr(imsic->lpriv, cpu);
+
+ raw_spin_lock_init(&lpriv->lock);
+
+ /* Allocate dirty bitmap */
+ lpriv->dirty_bitmap = bitmap_zalloc(global->nr_ids + 1, GFP_KERNEL);
+ if (!lpriv->dirty_bitmap)
+ goto fail_local_cleanup;
+
+#ifdef CONFIG_SMP
+ /* Setup lazy timer for synchronization */
+ timer_setup(&lpriv->timer, imsic_local_timer_callback, TIMER_PINNED);
+#endif
+
+ /* Allocate vector array */
+ lpriv->vectors = kcalloc(global->nr_ids + 1, sizeof(*lpriv->vectors),
+ GFP_KERNEL);
+ if (!lpriv->vectors)
+ goto fail_local_cleanup;
+
+ /* Setup vector array */
+ for (i = 0; i <= global->nr_ids; i++) {
+ vec = &lpriv->vectors[i];
+ vec->cpu = cpu;
+ vec->local_id = i;
+ vec->hwirq = UINT_MAX;
+ }
+ }
+
+ return 0;
+
+fail_local_cleanup:
+ imsic_local_cleanup();
+ return -ENOMEM;
+}
+
+void imsic_state_online(void)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&imsic->matrix_lock, flags);
+ irq_matrix_online(imsic->matrix);
+ raw_spin_unlock_irqrestore(&imsic->matrix_lock, flags);
+}
+
+void imsic_state_offline(void)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&imsic->matrix_lock, flags);
+ irq_matrix_offline(imsic->matrix);
+ raw_spin_unlock_irqrestore(&imsic->matrix_lock, flags);
+
+#ifdef CONFIG_SMP
+ struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv);
+
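+ /* The per-CPU sync timer must not be left pending on a CPU going offline */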
+ raw_spin_lock_irqsave(&lpriv->lock, flags);
+ WARN_ON_ONCE(try_to_del_timer_sync(&lpriv->timer) < 0);
+ raw_spin_unlock_irqrestore(&lpriv->lock, flags);
+#endif
+}
+
+static int __init imsic_matrix_init(void)
+{
+ struct imsic_global_config *global = &imsic->global;
+
+ raw_spin_lock_init(&imsic->matrix_lock);
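+ /* The matrix covers IDs 0 to nr_ids inclusive, hence the nr_ids + 1 size */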
+ imsic->matrix = irq_alloc_matrix(global->nr_ids + 1,
+ 0, global->nr_ids + 1);
+ if (!imsic->matrix)
+ return -ENOMEM;
+
+ /* Reserve ID#0 because it is special and never implemented */
+ irq_matrix_assign_system(imsic->matrix, 0, false);
+
+ /* Reserve IPI ID because it is special and used internally */
+ irq_matrix_assign_system(imsic->matrix, IMSIC_IPI_ID, false);
+
+ return 0;
+}
+
+static int __init imsic_get_parent_hartid(struct fwnode_handle *fwnode,
+ u32 index, unsigned long *hartid)
+{
+ struct of_phandle_args parent;
+ int rc;
+
+ /*
+ * Currently, only OF fwnode is supported; extend this
+ * function to add ACPI support.
+ */
+ if (!is_of_node(fwnode))
+ return -EINVAL;
+
+ rc = of_irq_parse_one(to_of_node(fwnode), index, &parent);
+ if (rc)
+ return rc;
+
+ /*
+ * Skip interrupts other than external interrupts for
+ * the current privilege level.
+ */
+ if (parent.args[0] != RV_IRQ_EXT)
+ return -EINVAL;
+
+ return riscv_of_parent_hartid(parent.np, hartid);
+}
+
+static int __init imsic_get_mmio_resource(struct fwnode_handle *fwnode,
+ u32 index, struct resource *res)
+{
+ /*
+ * Currently, only OF fwnode is supported; extend this
+ * function to add ACPI support.
+ */
+ if (!is_of_node(fwnode))
+ return -EINVAL;
+
+ return of_address_to_resource(to_of_node(fwnode), index, res);
+}
+
+static int __init imsic_parse_fwnode(struct fwnode_handle *fwnode,
+ struct imsic_global_config *global,
+ u32 *nr_parent_irqs,
+ u32 *nr_mmios)
+{
+ unsigned long hartid;
+ struct resource res;
+ int rc;
+ u32 i;
+
+ /*
+ * Currently, only OF fwnode is supported; extend this
+ * function to add ACPI support.
+ */
+ if (!is_of_node(fwnode))
+ return -EINVAL;
+
+ *nr_parent_irqs = 0;
+ *nr_mmios = 0;
+
+ /* Find number of parent interrupts */
+ while (!imsic_get_parent_hartid(fwnode, *nr_parent_irqs, &hartid))
+ (*nr_parent_irqs)++;
+ if (!*nr_parent_irqs) {
+ pr_err("%pfwP: no parent irqs available\n", fwnode);
+ return -EINVAL;
+ }
+
+ /* Find number of guest index bits in MSI address */
+ rc = of_property_read_u32(to_of_node(fwnode), "riscv,guest-index-bits",
+ &global->guest_index_bits);
+ if (rc)
+ global->guest_index_bits = 0;
+
+ /* Find number of HART index bits */
+ rc = of_property_read_u32(to_of_node(fwnode), "riscv,hart-index-bits",
+ &global->hart_index_bits);
+ if (rc) {
+ /* Not specified, so default to the smallest width that can index all parent irqs */
+ global->hart_index_bits = __fls(*nr_parent_irqs);
+ if (BIT(global->hart_index_bits) < *nr_parent_irqs)
+ global->hart_index_bits++;
+ }
+
+ /* Find number of group index bits */
+ rc = of_property_read_u32(to_of_node(fwnode), "riscv,group-index-bits",
+ &global->group_index_bits);
+ if (rc)
+ global->group_index_bits = 0;
+
+ /*
+ * Find the first bit position of the group index.
+ * If not specified, assume the default APLIC-IMSIC configuration.
+ */
+ rc = of_property_read_u32(to_of_node(fwnode), "riscv,group-index-shift",
+ &global->group_index_shift);
+ if (rc)
+ global->group_index_shift = IMSIC_MMIO_PAGE_SHIFT * 2;
+
+ /* Find number of interrupt identities */
+ rc = of_property_read_u32(to_of_node(fwnode), "riscv,num-ids",
+ &global->nr_ids);
+ if (rc) {
+ pr_err("%pfwP: number of interrupt identities not found\n", fwnode);
+ return rc;
+ }
+
+ /* Find number of guest interrupt identities */
+ rc = of_property_read_u32(to_of_node(fwnode), "riscv,num-guest-ids",
+ &global->nr_guest_ids);
+ if (rc)
+ global->nr_guest_ids = global->nr_ids;
+
+ /* Sanity check guest index bits */
+ i = BITS_PER_LONG - IMSIC_MMIO_PAGE_SHIFT;
+ if (i < global->guest_index_bits) {
+ pr_err("%pfwP: guest index bits too big\n", fwnode);
+ return -EINVAL;
+ }
+
+ /* Sanity check HART index bits */
+ i = BITS_PER_LONG - IMSIC_MMIO_PAGE_SHIFT - global->guest_index_bits;
+ if (i < global->hart_index_bits) {
+ pr_err("%pfwP: HART index bits too big\n", fwnode);
+ return -EINVAL;
+ }
+
+ /* Sanity check group index bits */
+ i = BITS_PER_LONG - IMSIC_MMIO_PAGE_SHIFT -
+ global->guest_index_bits - global->hart_index_bits;
+ if (i < global->group_index_bits) {
+ pr_err("%pfwP: group index bits too big\n", fwnode);
+ return -EINVAL;
+ }
+
+ /* Sanity check group index shift */
+ i = global->group_index_bits + global->group_index_shift - 1;
+ if (i >= BITS_PER_LONG) {
+ pr_err("%pfwP: group index shift too big\n", fwnode);
+ return -EINVAL;
+ }
+
+ /* Sanity check number of interrupt identities */
+ if (global->nr_ids < IMSIC_MIN_ID ||
+ global->nr_ids >= IMSIC_MAX_ID ||
+ (global->nr_ids & IMSIC_MIN_ID) != IMSIC_MIN_ID) {
+ pr_err("%pfwP: invalid number of interrupt identities\n", fwnode);
+ return -EINVAL;
+ }
+
+ /* Sanity check number of guest interrupt identities */
+ if (global->nr_guest_ids < IMSIC_MIN_ID ||
+ global->nr_guest_ids >= IMSIC_MAX_ID ||
+ (global->nr_guest_ids & IMSIC_MIN_ID) != IMSIC_MIN_ID) {
+ pr_err("%pfwP: invalid number of guest interrupt identities\n", fwnode);
+ return -EINVAL;
+ }
+
+ /* Compute base address */
+ rc = imsic_get_mmio_resource(fwnode, 0, &res);
+ if (rc) {
+ pr_err("%pfwP: first MMIO resource not found\n", fwnode);
+ return -EINVAL;
+ }
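+ /*
+ * Strip the guest, hart, and group index bits from the first MMIO
+ * address so that base_addr is the common base matching all target
+ * MSI addresses.
+ */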
+ global->base_addr = res.start;
+ global->base_addr &= ~(BIT(global->guest_index_bits +
+ global->hart_index_bits +
+ IMSIC_MMIO_PAGE_SHIFT) - 1);
+ global->base_addr &= ~((BIT(global->group_index_bits) - 1) <<
+ global->group_index_shift);
+
+ /* Find number of MMIO register sets */
+ while (!imsic_get_mmio_resource(fwnode, *nr_mmios, &res))
+ (*nr_mmios)++;
+
+ return 0;
+}
+
+int __init imsic_setup_state(struct fwnode_handle *fwnode)
+{
+ u32 i, j, index, nr_parent_irqs, nr_mmios, nr_handlers = 0;
+ struct imsic_global_config *global;
+ struct imsic_local_config *local;
+ void __iomem **mmios_va = NULL;
+ struct resource *mmios = NULL;
+ unsigned long reloff, hartid;
+ phys_addr_t base_addr;
+ int rc, cpu;
+
+ /*
+ * Only one IMSIC instance is allowed per platform so that SMP
+ * IRQ affinity and per-CPU IPIs can be implemented cleanly.
+ *
+ * This means that on a multi-socket (or multi-die) platform we
+ * will have multiple MMIO regions for the single IMSIC instance.
+ */
+ if (imsic) {
+ pr_err("%pfwP: already initialized hence ignoring\n", fwnode);
+ return -EALREADY;
+ }
+
+ if (!riscv_isa_extension_available(NULL, SxAIA)) {
+ pr_err("%pfwP: AIA support not available\n", fwnode);
+ return -ENODEV;
+ }
+
+ imsic = kzalloc(sizeof(*imsic), GFP_KERNEL);
+ if (!imsic)
+ return -ENOMEM;
+ imsic->fwnode = fwnode;
+ global = &imsic->global;
+
+ global->local = alloc_percpu(typeof(*global->local));
+ if (!global->local) {
+ rc = -ENOMEM;
+ goto out_free_priv;
+ }
+
+ /* Parse IMSIC fwnode */
+ rc = imsic_parse_fwnode(fwnode, global, &nr_parent_irqs, &nr_mmios);
+ if (rc)
+ goto out_free_local;
+
+ /* Allocate MMIO resource array */
+ mmios = kcalloc(nr_mmios, sizeof(*mmios), GFP_KERNEL);
+ if (!mmios) {
+ rc = -ENOMEM;
+ goto out_free_local;
+ }
+
+ /* Allocate MMIO virtual address array */
+ mmios_va = kcalloc(nr_mmios, sizeof(*mmios_va), GFP_KERNEL);
+ if (!mmios_va) {
+ rc = -ENOMEM;
+ goto out_iounmap;
+ }
+
+ /* Parse and map MMIO register sets */
+ for (i = 0; i < nr_mmios; i++) {
+ rc = imsic_get_mmio_resource(fwnode, i, &mmios[i]);
+ if (rc) {
+ pr_err("%pfwP: unable to parse MMIO regset %d\n", fwnode, i);
+ goto out_iounmap;
+ }
+
+ base_addr = mmios[i].start;
+ base_addr &= ~(BIT(global->guest_index_bits +
+ global->hart_index_bits +
+ IMSIC_MMIO_PAGE_SHIFT) - 1);
+ base_addr &= ~((BIT(global->group_index_bits) - 1) <<
+ global->group_index_shift);
+ if (base_addr != global->base_addr) {
+ rc = -EINVAL;
+ pr_err("%pfwP: address mismatch for regset %d\n", fwnode, i);
+ goto out_iounmap;
+ }
+
+ mmios_va[i] = ioremap(mmios[i].start, resource_size(&mmios[i]));
+ if (!mmios_va[i]) {
+ rc = -EIO;
+ pr_err("%pfwP: unable to map MMIO regset %d\n", fwnode, i);
+ goto out_iounmap;
+ }
+ }
+
+ /* Initialize local (or per-CPU) state */
+ rc = imsic_local_init();
+ if (rc) {
+ pr_err("%pfwP: failed to initialize local state\n",
+ fwnode);
+ goto out_iounmap;
+ }
+
+ /* Configure handlers for target CPUs */
+ for (i = 0; i < nr_parent_irqs; i++) {
+ rc = imsic_get_parent_hartid(fwnode, i, &hartid);
+ if (rc) {
+ pr_warn("%pfwP: hart ID for parent irq%d not found\n", fwnode, i);
+ continue;
+ }
+
+ cpu = riscv_hartid_to_cpuid(hartid);
+ if (cpu < 0) {
+ pr_warn("%pfwP: invalid cpuid for parent irq%d\n", fwnode, i);
+ continue;
+ }
+
+ /* Find MMIO location of MSI page */
+ index = nr_mmios;
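+ /* Each parent irq (hart) owns BIT(global->guest_index_bits) consecutive MSI pages */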
+ reloff = i * BIT(global->guest_index_bits) *
+ IMSIC_MMIO_PAGE_SZ;
+ for (j = 0; j < nr_mmios; j++) {
+ if (reloff < resource_size(&mmios[j])) {
+ index = j;
+ break;
+ }
+
+ /*
+ * MMIO region size may not be aligned to
+ * BIT(global->guest_index_bits) * IMSIC_MMIO_PAGE_SZ
+ * if holes are present.
+ */
+ reloff -= ALIGN(resource_size(&mmios[j]),
+ BIT(global->guest_index_bits) * IMSIC_MMIO_PAGE_SZ);
+ }
+ if (index >= nr_mmios) {
+ pr_warn("%pfwP: MMIO not found for parent irq%d\n", fwnode, i);
+ continue;
+ }
+
+ local = per_cpu_ptr(global->local, cpu);
+ local->msi_pa = mmios[index].start + reloff;
+ local->msi_va = mmios_va[index] + reloff;
+
+ nr_handlers++;
+ }
+
+ /* If no CPU handlers were found then we can't take interrupts */
+ if (!nr_handlers) {
+ pr_err("%pfwP: No CPU handlers found\n", fwnode);
+ rc = -ENODEV;
+ goto out_local_cleanup;
+ }
+
+ /* Initialize matrix allocator */
+ rc = imsic_matrix_init();
+ if (rc) {
+ pr_err("%pfwP: failed to create matrix allocator\n", fwnode);
+ goto out_local_cleanup;
+ }
+
+ /* We don't need the MMIO arrays anymore so free them up */
+ kfree(mmios_va);
+ kfree(mmios);
+
+ return 0;
+
+out_local_cleanup:
+ imsic_local_cleanup();
+out_iounmap:
+ for (i = 0; i < nr_mmios; i++) {
+ if (mmios_va[i])
+ iounmap(mmios_va[i]);
+ }
+ kfree(mmios_va);
+ kfree(mmios);
+out_free_local:
+ free_percpu(imsic->global.local);
+out_free_priv:
+ kfree(imsic);
+ imsic = NULL;
+ return rc;
+}
diff --git a/drivers/irqchip/irq-riscv-imsic-state.h b/drivers/irqchip/irq-riscv-imsic-state.h
new file mode 100644
index 00000000000000..5ae2f69b035b6e
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-imsic-state.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#ifndef _IRQ_RISCV_IMSIC_STATE_H
+#define _IRQ_RISCV_IMSIC_STATE_H
+
+#include <linux/irqchip/riscv-imsic.h>
+#include <linux/irqdomain.h>
+#include <linux/fwnode.h>
+#include <linux/timer.h>
+
+#define IMSIC_IPI_ID 1
+#define IMSIC_NR_IPI 8
+
+struct imsic_vector {
+ /* Fixed details of the vector */
+ unsigned int cpu;
+ unsigned int local_id;
+ /* Details saved by driver in the vector */
+ unsigned int hwirq;
+ /* Details accessed with the local lock held */
+ bool enable;
+ struct imsic_vector *move;
+};
+
+struct imsic_local_priv {
+ /* Local lock to protect vector enable/move variables and dirty bitmap */
+ raw_spinlock_t lock;
+
+ /* Local dirty bitmap for synchronization */
+ unsigned long *dirty_bitmap;
+
+#ifdef CONFIG_SMP
+ /* Local timer for synchronization */
+ struct timer_list timer;
+#endif
+
+ /* Local vector table */
+ struct imsic_vector *vectors;
+};
+
+struct imsic_priv {
+ /* Device details */
+ struct fwnode_handle *fwnode;
+
+ /* Global configuration common for all HARTs */
+ struct imsic_global_config global;
+
+ /* Per-CPU state */
+ struct imsic_local_priv __percpu *lpriv;
+
+ /* State of IRQ matrix allocator */
+ raw_spinlock_t matrix_lock;
+ struct irq_matrix *matrix;
+
+ /* IRQ domains (created by platform driver) */
+ struct irq_domain *base_domain;
+};
+
+extern struct imsic_priv *imsic;
+
+void __imsic_eix_update(unsigned long base_id, unsigned long num_id, bool pend, bool val);
+
+static inline void __imsic_id_set_enable(unsigned long id)
+{
+ __imsic_eix_update(id, 1, false, true);
+}
+
+static inline void __imsic_id_clear_enable(unsigned long id)
+{
+ __imsic_eix_update(id, 1, false, false);
+}
+
+void imsic_local_sync_all(void);
+void imsic_local_delivery(bool enable);
+
+void imsic_vector_mask(struct imsic_vector *vec);
+void imsic_vector_unmask(struct imsic_vector *vec);
+
+static inline bool imsic_vector_isenabled(struct imsic_vector *vec)
+{
+ return READ_ONCE(vec->enable);
+}
+
+static inline struct imsic_vector *imsic_vector_get_move(struct imsic_vector *vec)
+{
+ return READ_ONCE(vec->move);
+}
+
+void imsic_vector_move(struct imsic_vector *old_vec, struct imsic_vector *new_vec);
+
+struct imsic_vector *imsic_vector_from_local_id(unsigned int cpu, unsigned int local_id);
+
+struct imsic_vector *imsic_vector_alloc(unsigned int hwirq, const struct cpumask *mask);
+void imsic_vector_free(struct imsic_vector *vector);
+
+void imsic_vector_debug_show(struct seq_file *m, struct imsic_vector *vec, int ind);
+void imsic_vector_debug_show_summary(struct seq_file *m, int ind);
+
+void imsic_state_online(void);
+void imsic_state_offline(void);
+int imsic_setup_state(struct fwnode_handle *fwnode);
+int imsic_irqdomain_init(void);
+
+#endif
diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c
index f3d4cb9e34f7d8..8fb183ced1e778 100644
--- a/drivers/irqchip/irq-sifive-plic.c
+++ b/drivers/irqchip/irq-sifive-plic.c
@@ -164,15 +164,12 @@ static int plic_set_affinity(struct irq_data *d,
const struct cpumask *mask_val, bool force)
{
unsigned int cpu;
- struct cpumask amask;
struct plic_priv *priv = irq_data_get_irq_chip_data(d);
- cpumask_and(&amask, &priv->lmask, mask_val);
-
if (force)
- cpu = cpumask_first(&amask);
+ cpu = cpumask_first_and(&priv->lmask, mask_val);
else
- cpu = cpumask_any_and(&amask, cpu_online_mask);
+ cpu = cpumask_first_and_and(&priv->lmask, mask_val, cpu_online_mask);
if (cpu >= nr_cpu_ids)
return -EINVAL;
diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c
index 26a5193d0ae412..2cc9f3b7d6690c 100644
--- a/drivers/irqchip/irq-stm32-exti.c
+++ b/drivers/irqchip/irq-stm32-exti.c
@@ -19,13 +19,26 @@
#include <linux/of_address.h>
#include <linux/of_irq.h>
#include <linux/platform_device.h>
-#include <linux/syscore_ops.h>
+#include <linux/pm.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
-#define IRQS_PER_BANK 32
+#define IRQS_PER_BANK 32
-#define HWSPNLCK_TIMEOUT 1000 /* usec */
+#define HWSPNLCK_TIMEOUT 1000 /* usec */
+
+#define EXTI_EnCIDCFGR(n) (0x180 + (n) * 4)
+#define EXTI_HWCFGR1 0x3f0
+
+/* Register: EXTI_EnCIDCFGR(n) */
+#define EXTI_CIDCFGR_CFEN_MASK BIT(0)
+#define EXTI_CIDCFGR_CID_MASK GENMASK(6, 4)
+#define EXTI_CIDCFGR_CID_SHIFT 4
+
+/* Register: EXTI_HWCFGR1 */
+#define EXTI_HWCFGR1_CIDWIDTH_MASK GENMASK(27, 24)
+
+#define EXTI_CID1 1
struct stm32_exti_bank {
u32 imr_ofst;
@@ -36,6 +49,7 @@ struct stm32_exti_bank {
u32 rpr_ofst;
u32 fpr_ofst;
u32 trg_ofst;
+ u32 seccfgr_ofst;
};
#define UNDEF_REG ~0
@@ -54,17 +68,18 @@ struct stm32_exti_chip_data {
u32 mask_cache;
u32 rtsr_cache;
u32 ftsr_cache;
+ u32 event_reserved;
};
struct stm32_exti_host_data {
void __iomem *base;
+ struct device *dev;
struct stm32_exti_chip_data *chips_data;
const struct stm32_exti_drv_data *drv_data;
struct hwspinlock *hwlock;
+ bool dt_has_irqs_desc; /* skip internal desc_irqs array and get it from DT */
};
-static struct stm32_exti_host_data *stm32_host_data;
-
static const struct stm32_exti_bank stm32f4xx_exti_b1 = {
.imr_ofst = 0x00,
.emr_ofst = 0x04,
@@ -74,6 +89,7 @@ static const struct stm32_exti_bank stm32f4xx_exti_b1 = {
.rpr_ofst = 0x14,
.fpr_ofst = UNDEF_REG,
.trg_ofst = UNDEF_REG,
+ .seccfgr_ofst = UNDEF_REG,
};
static const struct stm32_exti_bank *stm32f4xx_exti_banks[] = {
@@ -94,6 +110,7 @@ static const struct stm32_exti_bank stm32h7xx_exti_b1 = {
.rpr_ofst = 0x88,
.fpr_ofst = UNDEF_REG,
.trg_ofst = UNDEF_REG,
+ .seccfgr_ofst = UNDEF_REG,
};
static const struct stm32_exti_bank stm32h7xx_exti_b2 = {
@@ -105,6 +122,7 @@ static const struct stm32_exti_bank stm32h7xx_exti_b2 = {
.rpr_ofst = 0x98,
.fpr_ofst = UNDEF_REG,
.trg_ofst = UNDEF_REG,
+ .seccfgr_ofst = UNDEF_REG,
};
static const struct stm32_exti_bank stm32h7xx_exti_b3 = {
@@ -116,6 +134,7 @@ static const struct stm32_exti_bank stm32h7xx_exti_b3 = {
.rpr_ofst = 0xA8,
.fpr_ofst = UNDEF_REG,
.trg_ofst = UNDEF_REG,
+ .seccfgr_ofst = UNDEF_REG,
};
static const struct stm32_exti_bank *stm32h7xx_exti_banks[] = {
@@ -138,6 +157,7 @@ static const struct stm32_exti_bank stm32mp1_exti_b1 = {
.rpr_ofst = 0x0C,
.fpr_ofst = 0x10,
.trg_ofst = 0x3EC,
+ .seccfgr_ofst = 0x14,
};
static const struct stm32_exti_bank stm32mp1_exti_b2 = {
@@ -149,6 +169,7 @@ static const struct stm32_exti_bank stm32mp1_exti_b2 = {
.rpr_ofst = 0x2C,
.fpr_ofst = 0x30,
.trg_ofst = 0x3E8,
+ .seccfgr_ofst = 0x34,
};
static const struct stm32_exti_bank stm32mp1_exti_b3 = {
@@ -160,6 +181,7 @@ static const struct stm32_exti_bank stm32mp1_exti_b3 = {
.rpr_ofst = 0x4C,
.fpr_ofst = 0x50,
.trg_ofst = 0x3E4,
+ .seccfgr_ofst = 0x54,
};
static const struct stm32_exti_bank *stm32mp1_exti_banks[] = {
@@ -322,7 +344,7 @@ static void stm32_irq_handler(struct irq_desc *desc)
while ((pending = stm32_exti_pending(gc))) {
for_each_set_bit(n, &pending, IRQS_PER_BANK)
generic_handle_domain_irq(domain, irq_base + n);
- }
+ }
}
chained_irq_exit(chip, desc);
@@ -621,50 +643,32 @@ static int stm32_exti_h_set_affinity(struct irq_data *d,
return IRQ_SET_MASK_OK_DONE;
}
-static int __maybe_unused stm32_exti_h_suspend(void)
+static int stm32_exti_h_suspend(struct device *dev)
{
+ struct stm32_exti_host_data *host_data = dev_get_drvdata(dev);
struct stm32_exti_chip_data *chip_data;
int i;
- for (i = 0; i < stm32_host_data->drv_data->bank_nr; i++) {
- chip_data = &stm32_host_data->chips_data[i];
- raw_spin_lock(&chip_data->rlock);
+ for (i = 0; i < host_data->drv_data->bank_nr; i++) {
+ chip_data = &host_data->chips_data[i];
stm32_chip_suspend(chip_data, chip_data->wake_active);
- raw_spin_unlock(&chip_data->rlock);
}
return 0;
}
-static void __maybe_unused stm32_exti_h_resume(void)
+static int stm32_exti_h_resume(struct device *dev)
{
+ struct stm32_exti_host_data *host_data = dev_get_drvdata(dev);
struct stm32_exti_chip_data *chip_data;
int i;
- for (i = 0; i < stm32_host_data->drv_data->bank_nr; i++) {
- chip_data = &stm32_host_data->chips_data[i];
- raw_spin_lock(&chip_data->rlock);
+ for (i = 0; i < host_data->drv_data->bank_nr; i++) {
+ chip_data = &host_data->chips_data[i];
stm32_chip_resume(chip_data, chip_data->mask_cache);
- raw_spin_unlock(&chip_data->rlock);
}
-}
-static struct syscore_ops stm32_exti_h_syscore_ops = {
-#ifdef CONFIG_PM_SLEEP
- .suspend = stm32_exti_h_suspend,
- .resume = stm32_exti_h_resume,
-#endif
-};
-
-static void stm32_exti_h_syscore_init(struct stm32_exti_host_data *host_data)
-{
- stm32_host_data = host_data;
- register_syscore_ops(&stm32_exti_h_syscore_ops);
-}
-
-static void stm32_exti_h_syscore_deinit(void)
-{
- unregister_syscore_ops(&stm32_exti_h_syscore_ops);
+ return 0;
}
static int stm32_exti_h_retrigger(struct irq_data *d)
@@ -725,12 +729,35 @@ static int stm32_exti_h_domain_alloc(struct irq_domain *dm,
bank = hwirq / IRQS_PER_BANK;
chip_data = &host_data->chips_data[bank];
+ /* Check if event is reserved (Secure) */
+ if (chip_data->event_reserved & BIT(hwirq % IRQS_PER_BANK)) {
+ dev_err(host_data->dev, "event %lu is reserved, secure\n", hwirq);
+ return -EPERM;
+ }
+
event_trg = readl_relaxed(host_data->base + chip_data->reg_bank->trg_ofst);
chip = (event_trg & BIT(hwirq % IRQS_PER_BANK)) ?
&stm32_exti_h_chip : &stm32_exti_h_chip_direct;
irq_domain_set_hwirq_and_chip(dm, virq, hwirq, chip, chip_data);
+ if (host_data->dt_has_irqs_desc) {
+ struct of_phandle_args out_irq;
+ int ret;
+
+ ret = of_irq_parse_one(host_data->dev->of_node, hwirq, &out_irq);
+ if (ret)
+ return ret;
+ /* we only support one parent, so far */
+ if (of_node_to_fwnode(out_irq.np) != dm->parent->fwnode)
+ return -EINVAL;
+
+ of_phandle_args_to_fwspec(out_irq.np, out_irq.args,
+ out_irq.args_count, &p_fwspec);
+
+ return irq_domain_alloc_irqs_parent(dm, virq, 1, &p_fwspec);
+ }
+
if (!host_data->drv_data->desc_irqs)
return -EINVAL;
@@ -771,8 +798,6 @@ stm32_exti_host_data *stm32_exti_host_init(const struct stm32_exti_drv_data *dd,
goto free_chips_data;
}
- stm32_host_data = host_data;
-
return host_data;
free_chips_data:
@@ -807,6 +832,10 @@ stm32_exti_chip_data *stm32_exti_chip_init(struct stm32_exti_host_data *h_data,
if (stm32_bank->emr_ofst != UNDEF_REG)
writel_relaxed(0, base + stm32_bank->emr_ofst);
+ /* reserve Secure events */
+ if (stm32_bank->seccfgr_ofst != UNDEF_REG)
+ chip_data->event_reserved = readl_relaxed(base + stm32_bank->seccfgr_ofst);
+
pr_info("%pOF: bank%d\n", node, bank_idx);
return chip_data;
@@ -891,6 +920,27 @@ static const struct irq_domain_ops stm32_exti_h_domain_ops = {
.xlate = irq_domain_xlate_twocell,
};
+static void stm32_exti_check_rif(struct stm32_exti_host_data *host_data)
+{
+ unsigned int bank, i, event;
+ u32 cid, cidcfgr, hwcfgr1;
+
+ /* Bail out if CID filtering is not supported (CIDWIDTH is zero) */
+ hwcfgr1 = readl_relaxed(host_data->base + EXTI_HWCFGR1);
+ if ((hwcfgr1 & EXTI_HWCFGR1_CIDWIDTH_MASK) == 0)
+ return;
+
+ for (bank = 0; bank < host_data->drv_data->bank_nr; bank++) {
+ for (i = 0; i < IRQS_PER_BANK; i++) {
+ event = bank * IRQS_PER_BANK + i;
+ cidcfgr = readl_relaxed(host_data->base + EXTI_EnCIDCFGR(event));
+ cid = (cidcfgr & EXTI_CIDCFGR_CID_MASK) >> EXTI_CIDCFGR_CID_SHIFT;
+ if ((cidcfgr & EXTI_CIDCFGR_CFEN_MASK) && cid != EXTI_CID1)
+ host_data->chips_data[bank].event_reserved |= BIT(i);
+ }
+ }
+}
+
static void stm32_exti_remove_irq(void *data)
{
struct irq_domain *domain = data;
@@ -898,11 +948,6 @@ static void stm32_exti_remove_irq(void *data)
irq_domain_remove(domain);
}
-static void stm32_exti_remove(struct platform_device *pdev)
-{
- stm32_exti_h_syscore_deinit();
-}
-
static int stm32_exti_probe(struct platform_device *pdev)
{
int ret, i;
@@ -916,6 +961,9 @@ static int stm32_exti_probe(struct platform_device *pdev)
if (!host_data)
return -ENOMEM;
+ dev_set_drvdata(dev, host_data);
+ host_data->dev = dev;
+
/* check for optional hwspinlock which may be not available yet */
ret = of_hwspin_lock_get_id(np, 0);
if (ret == -EPROBE_DEFER)
@@ -955,6 +1003,8 @@ static int stm32_exti_probe(struct platform_device *pdev)
for (i = 0; i < drv_data->bank_nr; i++)
stm32_exti_chip_init(host_data, i, np);
+ stm32_exti_check_rif(host_data);
+
parent_domain = irq_find_host(of_irq_find_parent(np));
if (!parent_domain) {
dev_err(dev, "GIC interrupt-parent not found\n");
@@ -975,7 +1025,8 @@ static int stm32_exti_probe(struct platform_device *pdev)
if (ret)
return ret;
- stm32_exti_h_syscore_init(host_data);
+ if (of_property_read_bool(np, "interrupts-extended"))
+ host_data->dt_has_irqs_desc = true;
return 0;
}
@@ -988,12 +1039,16 @@ static const struct of_device_id stm32_exti_ids[] = {
};
MODULE_DEVICE_TABLE(of, stm32_exti_ids);
+static const struct dev_pm_ops stm32_exti_dev_pm_ops = {
+ NOIRQ_SYSTEM_SLEEP_PM_OPS(stm32_exti_h_suspend, stm32_exti_h_resume)
+};
+
static struct platform_driver stm32_exti_driver = {
.probe = stm32_exti_probe,
- .remove_new = stm32_exti_remove,
.driver = {
.name = "stm32_exti",
.of_match_table = stm32_exti_ids,
+ .pm = &stm32_exti_dev_pm_ops,
},
};
diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c
index e760b1278143dd..bb92fd85e975f8 100644
--- a/drivers/irqchip/irq-sunxi-nmi.c
+++ b/drivers/irqchip/irq-sunxi-nmi.c
@@ -192,7 +192,6 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node,
gc->chip_types[0].regs.type = reg_offs->ctrl;
gc->chip_types[1].type = IRQ_TYPE_EDGE_BOTH;
- gc->chip_types[1].chip.name = gc->chip_types[0].chip.name;
gc->chip_types[1].chip.irq_ack = irq_gc_ack_set_bit;
gc->chip_types[1].chip.irq_mask = irq_gc_mask_clr_bit;
gc->chip_types[1].chip.irq_unmask = irq_gc_mask_set_bit;
diff --git a/drivers/irqchip/irq-tb10x.c b/drivers/irqchip/irq-tb10x.c
index 680586354d12ff..d59bfbe8c6d02a 100644
--- a/drivers/irqchip/irq-tb10x.c
+++ b/drivers/irqchip/irq-tb10x.c
@@ -150,7 +150,6 @@ static int __init of_tb10x_init_irq(struct device_node *ictl,
gc->chip_types[0].regs.mask = AB_IRQCTL_INT_ENABLE;
gc->chip_types[1].type = IRQ_TYPE_EDGE_BOTH;
- gc->chip_types[1].chip.name = gc->chip_types[0].chip.name;
gc->chip_types[1].chip.irq_ack = irq_gc_ack_set_bit;
gc->chip_types[1].chip.irq_mask = irq_gc_mask_clr_bit;
gc->chip_types[1].chip.irq_unmask = irq_gc_mask_set_bit;
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index 9d1b1459700df5..280071be30b157 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -477,7 +477,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
unsigned long state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
- struct cpumask *cpus;
unsigned int frequency;
int ret;
@@ -494,8 +493,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency);
if (ret >= 0) {
cpufreq_cdev->cpufreq_state = state;
- cpus = cpufreq_cdev->policy->related_cpus;
- arch_update_thermal_pressure(cpus, frequency);
ret = 0;
}
diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c
index 87f241825bc360..654290a8e1ba31 100644
--- a/drivers/virt/coco/sev-guest/sev-guest.c
+++ b/drivers/virt/coco/sev-guest/sev-guest.c
@@ -59,7 +59,7 @@ struct snp_guest_dev {
*/
struct snp_guest_msg secret_request, secret_response;
- struct snp_secrets_page_layout *layout;
+ struct snp_secrets_page *secrets;
struct snp_req_data input;
union {
struct snp_report_req report;
@@ -743,26 +743,26 @@ static const struct file_operations snp_guest_fops = {
.unlocked_ioctl = snp_guest_ioctl,
};
-static u8 *get_vmpck(int id, struct snp_secrets_page_layout *layout, u32 **seqno)
+static u8 *get_vmpck(int id, struct snp_secrets_page *secrets, u32 **seqno)
{
u8 *key = NULL;
switch (id) {
case 0:
- *seqno = &layout->os_area.msg_seqno_0;
- key = layout->vmpck0;
+ *seqno = &secrets->os_area.msg_seqno_0;
+ key = secrets->vmpck0;
break;
case 1:
- *seqno = &layout->os_area.msg_seqno_1;
- key = layout->vmpck1;
+ *seqno = &secrets->os_area.msg_seqno_1;
+ key = secrets->vmpck1;
break;
case 2:
- *seqno = &layout->os_area.msg_seqno_2;
- key = layout->vmpck2;
+ *seqno = &secrets->os_area.msg_seqno_2;
+ key = secrets->vmpck2;
break;
case 3:
- *seqno = &layout->os_area.msg_seqno_3;
- key = layout->vmpck3;
+ *seqno = &secrets->os_area.msg_seqno_3;
+ key = secrets->vmpck3;
break;
default:
break;
@@ -897,8 +897,8 @@ static void unregister_sev_tsm(void *data)
static int __init sev_guest_probe(struct platform_device *pdev)
{
- struct snp_secrets_page_layout *layout;
struct sev_guest_platform_data *data;
+ struct snp_secrets_page *secrets;
struct device *dev = &pdev->dev;
struct snp_guest_dev *snp_dev;
struct miscdevice *misc;
@@ -916,7 +916,7 @@ static int __init sev_guest_probe(struct platform_device *pdev)
if (!mapping)
return -ENODEV;
- layout = (__force void *)mapping;
+ secrets = (__force void *)mapping;
ret = -ENOMEM;
snp_dev = devm_kzalloc(&pdev->dev, sizeof(struct snp_guest_dev), GFP_KERNEL);
@@ -924,7 +924,7 @@ static int __init sev_guest_probe(struct platform_device *pdev)
goto e_unmap;
ret = -EINVAL;
- snp_dev->vmpck = get_vmpck(vmpck_id, layout, &snp_dev->os_area_msg_seqno);
+ snp_dev->vmpck = get_vmpck(vmpck_id, secrets, &snp_dev->os_area_msg_seqno);
if (!snp_dev->vmpck) {
dev_err(dev, "invalid vmpck id %d\n", vmpck_id);
goto e_unmap;
@@ -938,7 +938,7 @@ static int __init sev_guest_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, snp_dev);
snp_dev->dev = dev;
- snp_dev->layout = layout;
+ snp_dev->secrets = secrets;
/* Allocate the shared page used for the request and response message. */
snp_dev->request = alloc_shared_pages(dev, sizeof(struct snp_guest_msg));
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index db13bb620f527e..c768de6f19a9a9 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -180,6 +180,11 @@ static inline bool is_kernel_rodata(unsigned long addr)
addr < (unsigned long)__end_rodata;
}
+static inline bool is_kernel_ro_after_init(unsigned long addr)
+{
+ return addr >= (unsigned long)__start_ro_after_init &&
+ addr < (unsigned long)__end_ro_after_init;
+}
/**
* is_kernel_inittext - checks if the pointer address is located in the
* .init.text section
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index a6dc516a56bf9a..5703526d6ebf13 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -3,7 +3,7 @@
* linker scripts.
*
* A minimal linker scripts has following content:
- * [This is a sample, architectures may have special requiriements]
+ * [This is a sample, architectures may have special requirements]
*
* OUTPUT_FORMAT(...)
* OUTPUT_ARCH(...)
@@ -402,13 +402,13 @@
#define INIT_TASK_DATA(align) \
. = ALIGN(align); \
- __start_init_task = .; \
+ __start_init_stack = .; \
init_thread_union = .; \
init_stack = .; \
KEEP(*(.data..init_task)) \
KEEP(*(.data..init_thread_info)) \
- . = __start_init_task + THREAD_SIZE; \
- __end_init_task = .;
+ . = __start_init_stack + THREAD_SIZE; \
+ __end_init_stack = .;
#define JUMP_TABLE_DATA \
. = ALIGN(8); \
diff --git a/include/asm-generic/vtime.h b/include/asm-generic/vtime.h
deleted file mode 100644
index b1a49677fe2540..00000000000000
--- a/include/asm-generic/vtime.h
+++ /dev/null
@@ -1 +0,0 @@
-/* no content, but patch(1) dislikes empty files */
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index a63d61ca55afc8..b721f360d75923 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -60,14 +60,14 @@ void topology_scale_freq_tick(void);
void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus);
void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus);
-DECLARE_PER_CPU(unsigned long, thermal_pressure);
+DECLARE_PER_CPU(unsigned long, hw_pressure);
-static inline unsigned long topology_get_thermal_pressure(int cpu)
+static inline unsigned long topology_get_hw_pressure(int cpu)
{
- return per_cpu(thermal_pressure, cpu);
+ return per_cpu(hw_pressure, cpu);
}
-void topology_update_thermal_pressure(const struct cpumask *cpus,
+void topology_update_hw_pressure(const struct cpumask *cpus,
unsigned long capped_freq);
struct cpu_topology {
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 9956afb9acc233..20f7e98ee8af90 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
bool has_target_index(void);
+
+DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+ return READ_ONCE(per_cpu(cpufreq_pressure, cpu));
+}
#else
static inline unsigned int cpufreq_get(unsigned int cpu)
{
@@ -264,6 +270,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
}
static inline void disable_cpufreq(void) { }
static inline void cpufreq_update_limits(unsigned int cpu) { }
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+ return 0;
+}
#endif
#ifdef CONFIG_CPU_FREQ_STAT
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 35e78ddb2b37c7..7a5785f405b626 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -146,6 +146,7 @@ enum cpuhp_state {
CPUHP_AP_IRQ_MIPS_GIC_STARTING,
CPUHP_AP_IRQ_LOONGARCH_STARTING,
CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
+ CPUHP_AP_IRQ_RISCV_IMSIC_STARTING,
CPUHP_AP_ARM_MVEBU_COHERENCY,
CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
CPUHP_AP_PERF_X86_STARTING,
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 435bb1ffff6d8b..d66cdfdfcaa150 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -188,6 +188,23 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask
}
/**
+ * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3
+ * @srcp1: the first input
+ * @srcp2: the second input
+ * @srcp3: the third input
+ *
+ * Return: >= nr_cpu_ids if no cpus set in all.
+ */
+static inline
+unsigned int cpumask_first_and_and(const struct cpumask *srcp1,
+ const struct cpumask *srcp2,
+ const struct cpumask *srcp3)
+{
+ return find_first_and_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
+ cpumask_bits(srcp3), small_cpumask_bits);
+}
+
+/**
* cpumask_last - get the last CPU in a cpumask
* @srcp: - the cpumask pointer
*
diff --git a/include/linux/find.h b/include/linux/find.h
index c69598e383c161..28ec5a03393ade 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -29,6 +29,8 @@ unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsign
unsigned long n);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size);
+unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2,
+ const unsigned long *addr3, unsigned long size);
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
@@ -345,6 +347,31 @@ unsigned long find_first_and_bit(const unsigned long *addr1,
}
#endif
+/**
+ * find_first_and_and_bit - find the first set bit in 3 memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @addr3: The third address to base the search on
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the first set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_first_and_and_bit(const unsigned long *addr1,
+ const unsigned long *addr2,
+ const unsigned long *addr3,
+ unsigned long size)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0);
+
+ return val ? __ffs(val) : size;
+ }
+
+ return _find_first_and_and_bit(addr1, addr2, addr3, size);
+}
+
#ifndef find_first_zero_bit
/**
* find_first_zero_bit - find the first cleared bit in a memory region
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 97baa937ab5b99..a217e1029c1dbb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -115,7 +115,7 @@ enum {
* Return value for chip->irq_set_affinity()
*
* IRQ_SET_MASK_OK - OK, core updates irq_common_data.affinity
- * IRQ_SET_MASK_NOCPY - OK, chip did update irq_common_data.affinity
+ * IRQ_SET_MASK_NOCOPY - OK, chip did update irq_common_data.affinity
* IRQ_SET_MASK_OK_DONE - Same as IRQ_SET_MASK_OK for core. Special code to
* support stacked irqchips, which indicates skipping
* all descendant irqchips.
diff --git a/include/linux/irqchip/riscv-aplic.h b/include/linux/irqchip/riscv-aplic.h
new file mode 100644
index 00000000000000..ec8f7df505831f
--- /dev/null
+++ b/include/linux/irqchip/riscv-aplic.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+#ifndef __LINUX_IRQCHIP_RISCV_APLIC_H
+#define __LINUX_IRQCHIP_RISCV_APLIC_H
+
+#include <linux/bitops.h>
+
+#define APLIC_MAX_IDC BIT(14)
+#define APLIC_MAX_SOURCE 1024
+
+#define APLIC_DOMAINCFG 0x0000
+#define APLIC_DOMAINCFG_RDONLY 0x80000000
+#define APLIC_DOMAINCFG_IE BIT(8)
+#define APLIC_DOMAINCFG_DM BIT(2)
+#define APLIC_DOMAINCFG_BE BIT(0)
+
+#define APLIC_SOURCECFG_BASE 0x0004
+#define APLIC_SOURCECFG_D BIT(10)
+#define APLIC_SOURCECFG_CHILDIDX_MASK 0x000003ff
+#define APLIC_SOURCECFG_SM_MASK 0x00000007
+#define APLIC_SOURCECFG_SM_INACTIVE 0x0
+#define APLIC_SOURCECFG_SM_DETACH 0x1
+#define APLIC_SOURCECFG_SM_EDGE_RISE 0x4
+#define APLIC_SOURCECFG_SM_EDGE_FALL 0x5
+#define APLIC_SOURCECFG_SM_LEVEL_HIGH 0x6
+#define APLIC_SOURCECFG_SM_LEVEL_LOW 0x7
+
+#define APLIC_MMSICFGADDR 0x1bc0
+#define APLIC_MMSICFGADDRH 0x1bc4
+#define APLIC_SMSICFGADDR 0x1bc8
+#define APLIC_SMSICFGADDRH 0x1bcc
+
+#ifdef CONFIG_RISCV_M_MODE
+#define APLIC_xMSICFGADDR APLIC_MMSICFGADDR
+#define APLIC_xMSICFGADDRH APLIC_MMSICFGADDRH
+#else
+#define APLIC_xMSICFGADDR APLIC_SMSICFGADDR
+#define APLIC_xMSICFGADDRH APLIC_SMSICFGADDRH
+#endif
+
+#define APLIC_xMSICFGADDRH_L BIT(31)
+#define APLIC_xMSICFGADDRH_HHXS_MASK 0x1f
+#define APLIC_xMSICFGADDRH_HHXS_SHIFT 24
+#define APLIC_xMSICFGADDRH_HHXS (APLIC_xMSICFGADDRH_HHXS_MASK << \
+ APLIC_xMSICFGADDRH_HHXS_SHIFT)
+#define APLIC_xMSICFGADDRH_LHXS_MASK 0x7
+#define APLIC_xMSICFGADDRH_LHXS_SHIFT 20
+#define APLIC_xMSICFGADDRH_LHXS (APLIC_xMSICFGADDRH_LHXS_MASK << \
+ APLIC_xMSICFGADDRH_LHXS_SHIFT)
+#define APLIC_xMSICFGADDRH_HHXW_MASK 0x7
+#define APLIC_xMSICFGADDRH_HHXW_SHIFT 16
+#define APLIC_xMSICFGADDRH_HHXW (APLIC_xMSICFGADDRH_HHXW_MASK << \
+ APLIC_xMSICFGADDRH_HHXW_SHIFT)
+#define APLIC_xMSICFGADDRH_LHXW_MASK 0xf
+#define APLIC_xMSICFGADDRH_LHXW_SHIFT 12
+#define APLIC_xMSICFGADDRH_LHXW (APLIC_xMSICFGADDRH_LHXW_MASK << \
+ APLIC_xMSICFGADDRH_LHXW_SHIFT)
+#define APLIC_xMSICFGADDRH_BAPPN_MASK 0xfff
+#define APLIC_xMSICFGADDRH_BAPPN_SHIFT 0
+#define APLIC_xMSICFGADDRH_BAPPN (APLIC_xMSICFGADDRH_BAPPN_MASK << \
+ APLIC_xMSICFGADDRH_BAPPN_SHIFT)
+
+#define APLIC_xMSICFGADDR_PPN_SHIFT 12
+
+#define APLIC_xMSICFGADDR_PPN_HART(__lhxs) \
+ (BIT(__lhxs) - 1)
+
+#define APLIC_xMSICFGADDR_PPN_LHX_MASK(__lhxw) \
+ (BIT(__lhxw) - 1)
+#define APLIC_xMSICFGADDR_PPN_LHX_SHIFT(__lhxs) \
+ ((__lhxs))
+#define APLIC_xMSICFGADDR_PPN_LHX(__lhxw, __lhxs) \
+ (APLIC_xMSICFGADDR_PPN_LHX_MASK(__lhxw) << \
+ APLIC_xMSICFGADDR_PPN_LHX_SHIFT(__lhxs))
+
+#define APLIC_xMSICFGADDR_PPN_HHX_MASK(__hhxw) \
+ (BIT(__hhxw) - 1)
+#define APLIC_xMSICFGADDR_PPN_HHX_SHIFT(__hhxs) \
+ ((__hhxs) + APLIC_xMSICFGADDR_PPN_SHIFT)
+#define APLIC_xMSICFGADDR_PPN_HHX(__hhxw, __hhxs) \
+ (APLIC_xMSICFGADDR_PPN_HHX_MASK(__hhxw) << \
+ APLIC_xMSICFGADDR_PPN_HHX_SHIFT(__hhxs))
+
+#define APLIC_IRQBITS_PER_REG 32
+
+#define APLIC_SETIP_BASE 0x1c00
+#define APLIC_SETIPNUM 0x1cdc
+
+#define APLIC_CLRIP_BASE 0x1d00
+#define APLIC_CLRIPNUM 0x1ddc
+
+#define APLIC_SETIE_BASE 0x1e00
+#define APLIC_SETIENUM 0x1edc
+
+#define APLIC_CLRIE_BASE 0x1f00
+#define APLIC_CLRIENUM 0x1fdc
+
+#define APLIC_SETIPNUM_LE 0x2000
+#define APLIC_SETIPNUM_BE 0x2004
+
+#define APLIC_GENMSI 0x3000
+
+#define APLIC_TARGET_BASE 0x3004
+#define APLIC_TARGET_HART_IDX_SHIFT 18
+#define APLIC_TARGET_HART_IDX_MASK 0x3fff
+#define APLIC_TARGET_HART_IDX (APLIC_TARGET_HART_IDX_MASK << \
+ APLIC_TARGET_HART_IDX_SHIFT)
+#define APLIC_TARGET_GUEST_IDX_SHIFT 12
+#define APLIC_TARGET_GUEST_IDX_MASK 0x3f
+#define APLIC_TARGET_GUEST_IDX (APLIC_TARGET_GUEST_IDX_MASK << \
+ APLIC_TARGET_GUEST_IDX_SHIFT)
+#define APLIC_TARGET_IPRIO_SHIFT 0
+#define APLIC_TARGET_IPRIO_MASK 0xff
+#define APLIC_TARGET_IPRIO (APLIC_TARGET_IPRIO_MASK << \
+ APLIC_TARGET_IPRIO_SHIFT)
+#define APLIC_TARGET_EIID_SHIFT 0
+#define APLIC_TARGET_EIID_MASK 0x7ff
+#define APLIC_TARGET_EIID (APLIC_TARGET_EIID_MASK << \
+ APLIC_TARGET_EIID_SHIFT)
+
+#define APLIC_IDC_BASE 0x4000
+#define APLIC_IDC_SIZE 32
+
+#define APLIC_IDC_IDELIVERY 0x00
+
+#define APLIC_IDC_IFORCE 0x04
+
+#define APLIC_IDC_ITHRESHOLD 0x08
+
+#define APLIC_IDC_TOPI 0x18
+#define APLIC_IDC_TOPI_ID_SHIFT 16
+#define APLIC_IDC_TOPI_ID_MASK 0x3ff
+#define APLIC_IDC_TOPI_ID (APLIC_IDC_TOPI_ID_MASK << \
+ APLIC_IDC_TOPI_ID_SHIFT)
+#define APLIC_IDC_TOPI_PRIO_SHIFT 0
+#define APLIC_IDC_TOPI_PRIO_MASK 0xff
+#define APLIC_IDC_TOPI_PRIO (APLIC_IDC_TOPI_PRIO_MASK << \
+ APLIC_IDC_TOPI_PRIO_SHIFT)
+
+#define APLIC_IDC_CLAIMI 0x1c
+
+#endif
diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h
new file mode 100644
index 00000000000000..faf0b800b1b0ff
--- /dev/null
+++ b/include/linux/irqchip/riscv-imsic.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+#ifndef __LINUX_IRQCHIP_RISCV_IMSIC_H
+#define __LINUX_IRQCHIP_RISCV_IMSIC_H
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <asm/csr.h>
+
+#define IMSIC_MMIO_PAGE_SHIFT 12
+#define IMSIC_MMIO_PAGE_SZ BIT(IMSIC_MMIO_PAGE_SHIFT)
+#define IMSIC_MMIO_PAGE_LE 0x00
+#define IMSIC_MMIO_PAGE_BE 0x04
+
+#define IMSIC_MIN_ID 63
+#define IMSIC_MAX_ID 2048
+
+#define IMSIC_EIDELIVERY 0x70
+
+#define IMSIC_EITHRESHOLD 0x72
+
+#define IMSIC_EIP0 0x80
+#define IMSIC_EIP63 0xbf
+#define IMSIC_EIPx_BITS 32
+
+#define IMSIC_EIE0 0xc0
+#define IMSIC_EIE63 0xff
+#define IMSIC_EIEx_BITS 32
+
+#define IMSIC_FIRST IMSIC_EIDELIVERY
+#define IMSIC_LAST IMSIC_EIE63
+
+#define IMSIC_MMIO_SETIPNUM_LE 0x00
+#define IMSIC_MMIO_SETIPNUM_BE 0x04
+
+struct imsic_local_config {
+ phys_addr_t msi_pa;
+ void __iomem *msi_va;
+};
+
+struct imsic_global_config {
+ /*
+ * MSI Target Address Scheme
+ *
+ * XLEN-1 12 0
+ * | | |
+ * -------------------------------------------------------------
+ * |xxxxxx|Group Index|xxxxxxxxxxx|HART Index|Guest Index| 0 |
+ * -------------------------------------------------------------
+ */
+
+ /* Bits representing Guest index, HART index, and Group index */
+ u32 guest_index_bits;
+ u32 hart_index_bits;
+ u32 group_index_bits;
+ u32 group_index_shift;
+
+ /* Global base address matching all target MSI addresses */
+ phys_addr_t base_addr;
+
+ /* Number of interrupt identities */
+ u32 nr_ids;
+
+ /* Number of guest interrupt identities */
+ u32 nr_guest_ids;
+
+ /* Per-CPU IMSIC addresses */
+ struct imsic_local_config __percpu *local;
+};
+
+#ifdef CONFIG_RISCV_IMSIC
+
+const struct imsic_global_config *imsic_get_global_config(void);
+
+#else
+
+static inline const struct imsic_global_config *imsic_get_global_config(void)
+{
+ return NULL;
+}
+
+#endif
+
+#endif
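
As a reference for users of this header, here is a minimal sketch of how the fields above combine into a per-hart MSI target address following the layout documented in struct imsic_global_config. The helper name is illustrative only (it is not part of this header) and assumes a valid configuration obtained via imsic_get_global_config():

	/*
	 * Illustrative sketch: compose the MSI target address for a given
	 * group, hart and guest interrupt file using the documented layout.
	 */
	static inline phys_addr_t
	imsic_example_msi_addr(const struct imsic_global_config *g,
			       u32 group, u32 hart, u32 guest)
	{
		phys_addr_t addr = g->base_addr;

		/* Guest index selects the interrupt-file page within a hart */
		addr |= (phys_addr_t)guest << IMSIC_MMIO_PAGE_SHIFT;
		/* Hart index sits directly above the guest index bits */
		addr |= (phys_addr_t)hart << (IMSIC_MMIO_PAGE_SHIFT + g->guest_index_bits);
		/* Group index sits at its own configurable shift */
		addr |= (phys_addr_t)group << g->group_index_shift;

		return addr;
	}

Since the driver derives base_addr by masking these same bit fields out of the first MMIO resource, an address composed this way with in-range indices falls inside one of the IMSIC MMIO register sets.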
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index d9451d456a7333..fd091c35d5721e 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -18,6 +18,18 @@ struct irq_domain;
struct pt_regs;
/**
+ * struct irqstat - interrupt statistics
+ * @cnt: real-time interrupt count
+ * @ref: snapshot of interrupt count
+ */
+struct irqstat {
+ unsigned int cnt;
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+ unsigned int ref;
+#endif
+};
+
+/**
* struct irq_desc - interrupt descriptor
* @irq_common_data: per irq and chip data passed down to chip functions
* @kstat_irqs: irq stats per cpu
@@ -55,7 +67,7 @@ struct pt_regs;
struct irq_desc {
struct irq_common_data irq_common_data;
struct irq_data irq_data;
- unsigned int __percpu *kstat_irqs;
+ struct irqstat __percpu *kstat_irqs;
irq_flow_handler_t handle_irq;
struct irqaction *action; /* IRQ action list */
unsigned int status_use_accessors;
@@ -119,7 +131,7 @@ extern struct irq_desc irq_desc[NR_IRQS];
static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc,
unsigned int cpu)
{
- return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+ return desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
}
static inline struct irq_desc *irq_data_to_desc(struct irq_data *data)
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index f0a949b7c9733c..f5a2727ca4a9a9 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -216,6 +216,7 @@ extern struct jump_entry __start___jump_table[];
extern struct jump_entry __stop___jump_table[];
extern void jump_label_init(void);
+extern void jump_label_init_ro(void);
extern void jump_label_lock(void);
extern void jump_label_unlock(void);
extern void arch_jump_label_transform(struct jump_entry *entry,
@@ -265,6 +266,8 @@ static __always_inline void jump_label_init(void)
static_key_initialized = true;
}
+static __always_inline void jump_label_init_ro(void) { }
+
static __always_inline bool static_key_false(struct static_key *key)
{
if (unlikely_notrace(static_key_count(key) > 0))
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 9935f7ecbfb9e3..9c042c6384bb3d 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -79,6 +79,14 @@ static inline unsigned int kstat_cpu_softirqs_sum(int cpu)
return sum;
}
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+extern void kstat_snapshot_irqs(void);
+extern unsigned int kstat_get_irq_since_snapshot(unsigned int irq);
+#else
+static inline void kstat_snapshot_irqs(void) { }
+static inline unsigned int kstat_get_irq_since_snapshot(unsigned int irq) { return 0; }
+#endif
+
/*
* Number of interrupts per specific IRQ source, since bootup
*/
diff --git a/include/linux/math64.h b/include/linux/math64.h
index bf74478926d4ad..d34def7f9a8cd3 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -4,8 +4,8 @@
#include <linux/types.h>
#include <linux/math.h>
-#include <vdso/math64.h>
#include <asm/div64.h>
+#include <vdso/math64.h>
#if BITS_PER_LONG == 64
@@ -179,16 +179,12 @@ static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
#ifndef mul_u64_u32_shr
static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
{
- u32 ah, al;
+ u32 ah = a >> 32, al = a;
u64 ret;
- al = a;
- ah = a >> 32;
-
ret = mul_u32_u32(al, mul) >> shift;
if (ah)
ret += mul_u32_u32(ah, mul) << (32 - shift);
-
return ret;
}
#endif /* mul_u64_u32_shr */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2a15c0c6f8a9e..a5304ae8c654f0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -809,11 +809,8 @@ struct perf_event {
u64 (*clock)(void);
perf_overflow_handler_t overflow_handler;
void *overflow_handler_context;
-#ifdef CONFIG_BPF_SYSCALL
- perf_overflow_handler_t orig_overflow_handler;
struct bpf_prog *prog;
u64 bpf_cookie;
-#endif
#ifdef CONFIG_EVENT_TRACING
struct trace_event_call *tp_event;
@@ -883,6 +880,7 @@ struct perf_event_pmu_context {
unsigned int nr_events;
unsigned int nr_cgroups;
+ unsigned int nr_freq;
atomic_t refcount; /* event <-> epc */
struct rcu_head rcu_head;
@@ -897,6 +895,11 @@ struct perf_event_pmu_context {
int rotate_necessary;
};
+static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc)
+{
+ return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active);
+}
+
struct perf_event_groups {
struct rb_root tree;
u64 index;
@@ -1342,8 +1345,10 @@ extern int perf_event_output(struct perf_event *event,
struct pt_regs *regs);
static inline bool
-__is_default_overflow_handler(perf_overflow_handler_t overflow_handler)
+is_default_overflow_handler(struct perf_event *event)
{
+ perf_overflow_handler_t overflow_handler = event->overflow_handler;
+
if (likely(overflow_handler == perf_event_output_forward))
return true;
if (unlikely(overflow_handler == perf_event_output_backward))
@@ -1351,22 +1356,6 @@ __is_default_overflow_handler(perf_overflow_handler_t overflow_handler)
return false;
}
-#define is_default_overflow_handler(event) \
- __is_default_overflow_handler((event)->overflow_handler)
-
-#ifdef CONFIG_BPF_SYSCALL
-static inline bool uses_default_overflow_handler(struct perf_event *event)
-{
- if (likely(is_default_overflow_handler(event)))
- return true;
-
- return __is_default_overflow_handler(event->orig_overflow_handler);
-}
-#else
-#define uses_default_overflow_handler(event) \
- is_default_overflow_handler(event)
-#endif
-
extern void
perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
@@ -1697,6 +1686,14 @@ perf_event_addr_filters(struct perf_event *event)
return ifh;
}
+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+ /* Only the parent has fasync state */
+ if (event->parent)
+ event = event->parent;
+ return &event->fasync;
+}
+
extern void perf_event_addr_filters_sync(struct perf_event *event);
extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4118b3f959c324..8d1cf672ac4c0c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -301,7 +301,7 @@ enum {
TASK_COMM_LEN = 16,
};
-extern void scheduler_tick(void);
+extern void sched_tick(void);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
@@ -840,6 +840,7 @@ struct task_struct {
#endif
unsigned int policy;
+ unsigned long max_allowed_capacity;
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t *user_cpus_ptr;
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index 478084f9105e13..e670ac282333e9 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -5,8 +5,8 @@
#include <linux/sched.h>
enum cpu_idle_type {
+ __CPU_NOT_IDLE = 0,
CPU_IDLE,
- CPU_NOT_IDLE,
CPU_NEWLY_IDLE,
CPU_MAX_IDLE_TYPES
};
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 18572c9ea72468..4237daa5ac7a25 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -110,7 +110,7 @@ struct sched_domain {
unsigned long last_decay_max_lb_cost;
#ifdef CONFIG_SCHEDSTATS
- /* load_balance() stats */
+ /* sched_balance_rq() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
@@ -270,17 +270,17 @@ unsigned long arch_scale_cpu_capacity(int cpu)
}
#endif
-#ifndef arch_scale_thermal_pressure
+#ifndef arch_scale_hw_pressure
static __always_inline
-unsigned long arch_scale_thermal_pressure(int cpu)
+unsigned long arch_scale_hw_pressure(int cpu)
{
return 0;
}
#endif
-#ifndef arch_update_thermal_pressure
+#ifndef arch_update_hw_pressure
static __always_inline
-void arch_update_thermal_pressure(const struct cpumask *cpus,
+void arch_update_hw_pressure(const struct cpumask *cpus,
unsigned long capped_frequency)
{ }
#endif
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
index 62973f7d4610f9..d306d9dd22073f 100644
--- a/include/linux/timerqueue.h
+++ b/include/linux/timerqueue.h
@@ -37,11 +37,6 @@ static inline bool timerqueue_node_queued(struct timerqueue_node *node)
return !RB_EMPTY_NODE(&node->node);
}
-static inline bool timerqueue_node_expires(struct timerqueue_node *node)
-{
- return node->expires;
-}
-
static inline void timerqueue_init_head(struct timerqueue_head *head)
{
head->rb_root = RB_ROOT_CACHED;
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 3684487d01e1c6..29dd5b91dd7d66 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -5,10 +5,6 @@
#include <linux/context_tracking_state.h>
#include <linux/sched.h>
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-#include <asm/vtime.h>
-#endif
-
/*
* Common vtime APIs
*/
@@ -18,7 +14,6 @@ extern void vtime_account_idle(struct task_struct *tsk);
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-extern void arch_vtime_task_switch(struct task_struct *tsk);
extern void vtime_user_enter(struct task_struct *tsk);
extern void vtime_user_exit(struct task_struct *tsk);
extern void vtime_guest_enter(struct task_struct *tsk);
diff --git a/include/trace/events/thermal_pressure.h b/include/trace/events/hw_pressure.h
index b68680201360d8..b9cd68854128c7 100644
--- a/include/trace/events/thermal_pressure.h
+++ b/include/trace/events/hw_pressure.h
@@ -1,27 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
-#define TRACE_SYSTEM thermal_pressure
+#define TRACE_SYSTEM hw_pressure
#if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_THERMAL_PRESSURE_H
#include <linux/tracepoint.h>
-TRACE_EVENT(thermal_pressure_update,
- TP_PROTO(int cpu, unsigned long thermal_pressure),
- TP_ARGS(cpu, thermal_pressure),
+TRACE_EVENT(hw_pressure_update,
+ TP_PROTO(int cpu, unsigned long hw_pressure),
+ TP_ARGS(cpu, hw_pressure),
TP_STRUCT__entry(
- __field(unsigned long, thermal_pressure)
+ __field(unsigned long, hw_pressure)
__field(int, cpu)
),
TP_fast_assign(
- __entry->thermal_pressure = thermal_pressure;
+ __entry->hw_pressure = hw_pressure;
__entry->cpu = cpu;
),
- TP_printk("cpu=%d thermal_pressure=%lu", __entry->cpu, __entry->thermal_pressure)
+ TP_printk("cpu=%d hw_pressure=%lu", __entry->cpu, __entry->hw_pressure)
);
#endif /* _TRACE_THERMAL_PRESSURE_H */
diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h
index 1391ada0da3b95..f0f7b3cb204171 100644
--- a/include/trace/events/mce.h
+++ b/include/trace/events/mce.h
@@ -9,6 +9,14 @@
#include <linux/tracepoint.h>
#include <asm/mce.h>
+/*
+ * MCE Event Record.
+ *
+ * Only very relevant and transient information which cannot be
+ * gathered from a system by any other means or which can only be
+ * acquired arduously should be added to this record.
+ */
+
TRACE_EVENT(mce_record,
TP_PROTO(struct mce *m),
@@ -25,6 +33,7 @@ TRACE_EVENT(mce_record,
__field( u64, ipid )
__field( u64, ip )
__field( u64, tsc )
+ __field( u64, ppin )
__field( u64, walltime )
__field( u32, cpu )
__field( u32, cpuid )
@@ -33,6 +42,7 @@ TRACE_EVENT(mce_record,
__field( u8, cs )
__field( u8, bank )
__field( u8, cpuvendor )
+ __field( u32, microcode )
),
TP_fast_assign(
@@ -45,6 +55,7 @@ TRACE_EVENT(mce_record,
__entry->ipid = m->ipid;
__entry->ip = m->ip;
__entry->tsc = m->tsc;
+ __entry->ppin = m->ppin;
__entry->walltime = m->time;
__entry->cpu = m->extcpu;
__entry->cpuid = m->cpuid;
@@ -53,20 +64,26 @@ TRACE_EVENT(mce_record,
__entry->cs = m->cs;
__entry->bank = m->bank;
__entry->cpuvendor = m->cpuvendor;
+ __entry->microcode = m->microcode;
),
- TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x",
+ TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x",
__entry->cpu,
__entry->mcgcap, __entry->mcgstatus,
__entry->bank, __entry->status,
__entry->ipid,
- __entry->addr, __entry->misc, __entry->synd,
+ __entry->addr,
+ __entry->misc,
+ __entry->synd,
__entry->cs, __entry->ip,
__entry->tsc,
- __entry->cpuvendor, __entry->cpuid,
+ __entry->ppin,
+ __entry->cpuvendor,
+ __entry->cpuid,
__entry->walltime,
__entry->socketid,
- __entry->apicid)
+ __entry->apicid,
+ __entry->microcode)
);
#endif /* _TRACE_MCE_H */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dbb01b4b745108..d115d64c401134 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -752,7 +752,7 @@ DECLARE_TRACE(pelt_dl_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
-DECLARE_TRACE(pelt_thermal_tp,
+DECLARE_TRACE(pelt_hw_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
index c71ddb6d46914b..d04d394db064ae 100644
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -61,6 +61,7 @@ struct vdso_timestamp {
* @seq: timebase sequence counter
* @clock_mode: clock mode
* @cycle_last: timebase at clocksource init
+ * @max_cycles: maximum cycles which won't overflow 64bit multiplication
* @mask: clocksource mask
* @mult: clocksource multiplier
* @shift: clocksource shift
@@ -92,6 +93,9 @@ struct vdso_data {
s32 clock_mode;
u64 cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+ u64 max_cycles;
+#endif
u64 mask;
u32 mult;
u32 shift;
diff --git a/include/vdso/math64.h b/include/vdso/math64.h
index 7da703ee55614a..22ae212f8b28fe 100644
--- a/include/vdso/math64.h
+++ b/include/vdso/math64.h
@@ -21,4 +21,42 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
return ret;
}
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+#ifndef mul_u64_u32_add_u64_shr
+static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
+{
+ return (u64)((((unsigned __int128)a * mul) + b) >> shift);
+}
+#endif /* mul_u64_u32_add_u64_shr */
+
+#else
+
+#ifndef mul_u64_u32_add_u64_shr
+#ifndef mul_u32_u32
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+ return (u64)a * b;
+}
+#define mul_u32_u32 mul_u32_u32
+#endif
+static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
+{
+ u32 ah = a >> 32, al = a;
+ bool ovf;
+ u64 ret;
+
+ ovf = __builtin_add_overflow(mul_u32_u32(al, mul), b, &ret);
+ ret >>= shift;
+ if (ovf && shift)
+ ret += 1ULL << (64 - shift);
+ if (ah)
+ ret += mul_u32_u32(ah, mul) << (32 - shift);
+
+ return ret;
+}
+#endif /* mul_u64_u32_add_u64_shr */
+
+#endif
+
#endif /* __VDSO_MATH64_H */
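
The fallback branch is the subtle part: the carry out of a_l·m + b has to be re-injected above the shift. Below is a hedged, stand-alone user-space cross-check against a 128-bit reference (illustration only; it assumes a 64-bit GCC or Clang host, and fallback_shr simply mirrors the helper above):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t mul_u32_u32(uint32_t a, uint32_t b)
{
	return (uint64_t)a * b;
}

/* Mirror of the non-__int128 fallback above, for testing only. */
static uint64_t fallback_shr(uint64_t a, uint32_t mul, uint64_t b, unsigned int shift)
{
	uint32_t ah = a >> 32, al = a;
	bool ovf;
	uint64_t ret;

	ovf = __builtin_add_overflow(mul_u32_u32(al, mul), b, &ret);
	ret >>= shift;
	if (ovf && shift)
		ret += 1ULL << (64 - shift);
	if (ah)
		ret += mul_u32_u32(ah, mul) << (32 - shift);
	return ret;
}

int main(void)
{
	uint64_t a = 0xdeadbeefcafef00dULL, b = 0x123456789abcULL;
	uint32_t mul = 0x87654321;
	unsigned int shift = 17;
	uint64_t ref = (uint64_t)((((unsigned __int128)a * mul) + b) >> shift);

	printf("%s\n", fallback_shr(a, mul, b, shift) == ref ? "match" : "MISMATCH");
	return 0;
}

The two branches agree modulo 2^64 for shifts up to 32; larger shifts would need a different split.
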
diff --git a/init/Kconfig b/init/Kconfig
index 0a021d6b4939ae..2bc65904c4db78 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -547,24 +547,24 @@ config HAVE_SCHED_AVG_IRQ
depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
depends on SMP
-config SCHED_THERMAL_PRESSURE
+config SCHED_HW_PRESSURE
bool
default y if ARM && ARM_CPU_TOPOLOGY
default y if ARM64
depends on SMP
depends on CPU_FREQ_THERMAL
help
- Select this option to enable thermal pressure accounting in the
- scheduler. Thermal pressure is the value conveyed to the scheduler
+ Select this option to enable HW pressure accounting in the
+ scheduler. HW pressure is the value conveyed to the scheduler
that reflects the reduction in CPU compute capacity resulted from
- thermal throttling. Thermal throttling occurs when the performance of
- a CPU is capped due to high operating temperatures.
+ HW throttling. HW throttling occurs when the performance of
+ a CPU is capped, for example due to high operating temperatures.
If selected, the scheduler will be able to balance tasks accordingly,
i.e. put less load on throttled CPUs than on non/less throttled ones.
This requires the architecture to implement
- arch_update_thermal_pressure() and arch_scale_thermal_pressure().
+ arch_update_hw_pressure() and arch_scale_hw_pressure().
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
diff --git a/init/init_task.c b/init/init_task.c
index 4daee6d761c86c..2558b719e053c8 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
.cpus_mask = CPU_MASK_ALL,
+ .max_allowed_capacity = SCHED_CAPACITY_SCALE,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
diff --git a/init/main.c b/init/main.c
index 3f956c6beb3d8c..6406e9b6627bb1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1435,6 +1435,7 @@ static void mark_readonly(void)
* insecure pages which are W+X.
*/
flush_module_init_free_work();
+ jump_label_init_ro();
mark_rodata_ro();
debug_checkwx();
rodata_test();
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 70ae70d0382337..24b1e11432608e 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -432,7 +432,7 @@ static __always_inline void ct_kernel_enter(bool user, int offset) { }
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
-DEFINE_STATIC_KEY_FALSE(context_tracking_key);
+DEFINE_STATIC_KEY_FALSE_RO(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);
static noinstr bool context_tracking_recursion_enter(void)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c5a0dc1f135f02..f0128c5ff27896 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2302,8 +2302,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu--;
- if (event->attr.freq && event->attr.sample_freq)
+ if (event->attr.freq && event->attr.sample_freq) {
ctx->nr_freq--;
+ epc->nr_freq--;
+ }
if (event->attr.exclusive || !cpc->active_oncpu)
cpc->exclusive = 0;
@@ -2558,9 +2560,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu++;
- if (event->attr.freq && event->attr.sample_freq)
+ if (event->attr.freq && event->attr.sample_freq) {
ctx->nr_freq++;
-
+ epc->nr_freq++;
+ }
if (event->attr.exclusive)
cpc->exclusive = 1;
@@ -4123,30 +4126,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
}
}
-/*
- * combine freq adjustment with unthrottling to avoid two passes over the
- * events. At the same time, make sure, having freq events does not change
- * the rate of unthrottling as that would introduce bias.
- */
-static void
-perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+static void perf_adjust_freq_unthr_events(struct list_head *event_list)
{
struct perf_event *event;
struct hw_perf_event *hwc;
u64 now, period = TICK_NSEC;
s64 delta;
- /*
- * only need to iterate over all events iff:
- * - context have events in frequency mode (needs freq adjust)
- * - there are events to unthrottle on this cpu
- */
- if (!(ctx->nr_freq || unthrottle))
- return;
-
- raw_spin_lock(&ctx->lock);
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ list_for_each_entry(event, event_list, active_list) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
@@ -4154,18 +4141,17 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
if (!event_filter_match(event))
continue;
- perf_pmu_disable(event->pmu);
-
hwc = &event->hw;
if (hwc->interrupts == MAX_INTERRUPTS) {
hwc->interrupts = 0;
perf_log_throttle(event, 1);
- event->pmu->start(event, 0);
+ if (!event->attr.freq || !event->attr.sample_freq)
+ event->pmu->start(event, 0);
}
if (!event->attr.freq || !event->attr.sample_freq)
- goto next;
+ continue;
/*
* stop the event and update event->count
@@ -4187,8 +4173,41 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
perf_adjust_period(event, period, delta, false);
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
- next:
- perf_pmu_enable(event->pmu);
+ }
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure that having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ /*
+ * only need to iterate over all events iff:
+ * - context have events in frequency mode (needs freq adjust)
+ * - there are events to unthrottle on this cpu
+ */
+ if (!(ctx->nr_freq || unthrottle))
+ return;
+
+ raw_spin_lock(&ctx->lock);
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (!(pmu_ctx->nr_freq || unthrottle))
+ continue;
+ if (!perf_pmu_ctx_is_active(pmu_ctx))
+ continue;
+ if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+ continue;
+
+ perf_pmu_disable(pmu_ctx->pmu);
+ perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
+ perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
+ perf_pmu_enable(pmu_ctx->pmu);
}
raw_spin_unlock(&ctx->lock);
@@ -6684,14 +6703,6 @@ static const struct file_operations perf_fops = {
* to user-space before waking everybody up.
*/
-static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
-{
- /* only the parent has fasync state */
- if (event->parent)
- event = event->parent;
- return &event->fasync;
-}
-
void perf_event_wakeup(struct perf_event *event)
{
ring_buffer_wakeup(event);
@@ -9544,6 +9555,100 @@ static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *r
return true;
}
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct bpf_perf_event_data_kern ctx = {
+ .data = data,
+ .event = event,
+ };
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.regs = perf_arch_bpf_user_pt_regs(regs);
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
+ goto out;
+ rcu_read_lock();
+ prog = READ_ONCE(event->prog);
+ if (prog) {
+ perf_prepare_sample(data, event, regs);
+ ret = bpf_prog_run(prog, &ctx);
+ }
+ rcu_read_unlock();
+out:
+ __this_cpu_dec(bpf_prog_active);
+
+ return ret;
+}
+
+static inline int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ if (event->overflow_handler_context)
+ /* hw breakpoint or kernel counter */
+ return -EINVAL;
+
+ if (event->prog)
+ return -EEXIST;
+
+ if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
+ return -EINVAL;
+
+ if (event->attr.precise_ip &&
+ prog->call_get_stack &&
+ (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
+ event->attr.exclude_callchain_kernel ||
+ event->attr.exclude_callchain_user)) {
+ /*
+ * On perf_event with precise_ip, calling bpf_get_stack()
+ * may trigger unwinder warnings and occasional crashes.
+ * bpf_get_[stack|stackid] works around this issue by using
+ * callchain attached to perf_sample_data. If the
+ * perf_event does not have a full (kernel and user) callchain
+ * attached to perf_sample_data, do not allow attaching a BPF
+ * program that calls bpf_get_[stack|stackid].
+ */
+ return -EPROTO;
+ }
+
+ event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
+ return 0;
+}
+
+static inline void perf_event_free_bpf_handler(struct perf_event *event)
+{
+ struct bpf_prog *prog = event->prog;
+
+ if (!prog)
+ return;
+
+ event->prog = NULL;
+ bpf_prog_put(prog);
+}
+#else
+static inline int bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ return 1;
+}
+
+static inline int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void perf_event_free_bpf_handler(struct perf_event *event)
+{
+}
+#endif
+
/*
* Generic event overflow handling, sampling.
*/
@@ -9564,6 +9669,9 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
+ if (event->prog && !bpf_overflow_handler(event, data, regs))
+ return ret;
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
@@ -10422,97 +10530,6 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event);
}
-#ifdef CONFIG_BPF_SYSCALL
-static void bpf_overflow_handler(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- struct bpf_perf_event_data_kern ctx = {
- .data = data,
- .event = event,
- };
- struct bpf_prog *prog;
- int ret = 0;
-
- ctx.regs = perf_arch_bpf_user_pt_regs(regs);
- if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
- goto out;
- rcu_read_lock();
- prog = READ_ONCE(event->prog);
- if (prog) {
- perf_prepare_sample(data, event, regs);
- ret = bpf_prog_run(prog, &ctx);
- }
- rcu_read_unlock();
-out:
- __this_cpu_dec(bpf_prog_active);
- if (!ret)
- return;
-
- event->orig_overflow_handler(event, data, regs);
-}
-
-static int perf_event_set_bpf_handler(struct perf_event *event,
- struct bpf_prog *prog,
- u64 bpf_cookie)
-{
- if (event->overflow_handler_context)
- /* hw breakpoint or kernel counter */
- return -EINVAL;
-
- if (event->prog)
- return -EEXIST;
-
- if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
- return -EINVAL;
-
- if (event->attr.precise_ip &&
- prog->call_get_stack &&
- (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
- event->attr.exclude_callchain_kernel ||
- event->attr.exclude_callchain_user)) {
- /*
- * On perf_event with precise_ip, calling bpf_get_stack()
- * may trigger unwinder warnings and occasional crashes.
- * bpf_get_[stack|stackid] works around this issue by using
- * callchain attached to perf_sample_data. If the
- * perf_event does not full (kernel and user) callchain
- * attached to perf_sample_data, do not allow attaching BPF
- * program that calls bpf_get_[stack|stackid].
- */
- return -EPROTO;
- }
-
- event->prog = prog;
- event->bpf_cookie = bpf_cookie;
- event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
- WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
- return 0;
-}
-
-static void perf_event_free_bpf_handler(struct perf_event *event)
-{
- struct bpf_prog *prog = event->prog;
-
- if (!prog)
- return;
-
- WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
- event->prog = NULL;
- bpf_prog_put(prog);
-}
-#else
-static int perf_event_set_bpf_handler(struct perf_event *event,
- struct bpf_prog *prog,
- u64 bpf_cookie)
-{
- return -EOPNOTSUPP;
-}
-static void perf_event_free_bpf_handler(struct perf_event *event)
-{
-}
-#endif
-
/*
* returns true if the event is a tracepoint, or a kprobe/upprobe created
* with perf_event_open()
@@ -11971,13 +11988,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
- if (overflow_handler == bpf_overflow_handler) {
+ if (parent_event->prog) {
struct bpf_prog *prog = parent_event->prog;
bpf_prog_inc(prog);
event->prog = prog;
- event->orig_overflow_handler =
- parent_event->orig_overflow_handler;
}
#endif
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 60ed43d1c29e3e..4013408ce01236 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -22,6 +22,10 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
+
+ if (*perf_event_fasync(handle->event) && !handle->event->pending_kill)
+ handle->event->pending_kill = POLL_IN;
+
irq_work_queue(&handle->event->pending_irq);
}
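
This is the consumer-visible side of the perf_event_fasync() helper that the include/linux/perf_event.h hunk earlier in this diff exported: plain ring-buffer wakeups now also feed the SIGIO band. A hedged user-space sketch — perf_fd is assumed to come from perf_event_open() elsewhere, and error handling is trimmed:

#define _GNU_SOURCE
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <unistd.h>

static void on_sigio(int sig, siginfo_t *si, void *uctx)
{
	(void)sig;
	(void)uctx;
	/* With the change above, ring-buffer wakeups should report POLLIN here. */
	if (si->si_band & POLLIN)
		write(STDOUT_FILENO, "data ready\n", 11);
}

static void arm_async_io(int perf_fd)
{
	struct sigaction sa = { .sa_sigaction = on_sigio, .sa_flags = SA_SIGINFO };

	sigaction(SIGIO, &sa, NULL);
	fcntl(perf_fd, F_SETOWN, getpid());
	fcntl(perf_fd, F_SETSIG, SIGIO);		/* request si_band/si_fd details */
	fcntl(perf_fd, F_SETFL, fcntl(perf_fd, F_GETFL) | O_ASYNC);
}
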
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 2531f3496ab6d7..529adb1f58593c 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -108,6 +108,10 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR
config GENERIC_IRQ_RESERVATION_MODE
bool
+# Snapshot for interrupt statistics
+config GENERIC_IRQ_STAT_SNAPSHOT
+ bool
+
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 1ed2b1739363b8..75cadbc3c2320d 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -130,6 +130,22 @@ static bool migrate_one_irq(struct irq_desc *desc)
* CPU.
*/
err = irq_do_set_affinity(d, affinity, false);
+
+ /*
+ * If there are online CPUs in the affinity mask, but they have no
+ * vectors left to make the migration work, try to break the
+ * affinity by migrating to any online CPU.
+ */
+ if (err == -ENOSPC && !irqd_affinity_is_managed(d) && affinity != cpu_online_mask) {
+ pr_debug("IRQ%u: set affinity failed for %*pbl, re-try with online CPUs\n",
+ d->irq, cpumask_pr_args(affinity));
+
+ affinity = cpu_online_mask;
+ brokeaff = true;
+
+ err = irq_do_set_affinity(d, affinity, false);
+ }
+
if (err) {
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, err);
@@ -195,10 +211,15 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
!irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
return;
- if (irqd_is_managed_and_shutdown(data)) {
- irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+ /*
+ * Don't restore suspended interrupts here when a system comes back
+ * from S3. They are reenabled via resume_device_irqs().
+ */
+ if (desc->istate & IRQS_SUSPENDED)
return;
- }
+
+ if (irqd_is_managed_and_shutdown(data))
+ irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
/*
* If the interrupt can only be directed to a single target
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index bcc7f21db9eeb3..ed28059e984980 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -98,6 +98,8 @@ extern void mask_irq(struct irq_desc *desc);
extern void unmask_irq(struct irq_desc *desc);
extern void unmask_threaded_irq(struct irq_desc *desc);
+extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask);
+
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
#else
@@ -258,7 +260,7 @@ static inline void irq_state_set_masked(struct irq_desc *desc)
static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
- __this_cpu_inc(*desc->kstat_irqs);
+ __this_cpu_inc(desc->kstat_irqs->cnt);
__this_cpu_inc(kstat.irqs_sum);
}
@@ -278,6 +280,11 @@ static inline int irq_desc_is_chained(struct irq_desc *desc)
return (desc->action && desc->action == &chained_action);
}
+static inline bool irq_is_nmi(struct irq_desc *desc)
+{
+ return desc->istate & IRQS_NMI;
+}
+
#ifdef CONFIG_PM_SLEEP
bool irq_pm_check_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4c6b32318ce350..88ac3652fcf2f0 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -134,7 +134,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
- *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
+ *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
desc_smp_init(desc, node, affinity);
}
@@ -186,7 +186,7 @@ static int init_desc(struct irq_desc *desc, int irq, int node,
const struct cpumask *affinity,
struct module *owner)
{
- desc->kstat_irqs = alloc_percpu(unsigned int);
+ desc->kstat_irqs = alloc_percpu(struct irqstat);
if (!desc->kstat_irqs)
return -ENOMEM;
@@ -911,10 +911,7 @@ int irq_set_percpu_devid_partition(unsigned int irq,
{
struct irq_desc *desc = irq_to_desc(irq);
- if (!desc)
- return -EINVAL;
-
- if (desc->percpu_enabled)
+ if (!desc || desc->percpu_enabled)
return -EINVAL;
desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
@@ -922,10 +919,7 @@ int irq_set_percpu_devid_partition(unsigned int irq,
if (!desc->percpu_enabled)
return -ENOMEM;
- if (affinity)
- desc->percpu_affinity = affinity;
- else
- desc->percpu_affinity = cpu_possible_mask;
+ desc->percpu_affinity = affinity ? : cpu_possible_mask;
irq_set_percpu_devid_flags(irq);
return 0;
@@ -968,33 +962,58 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
- return desc && desc->kstat_irqs ?
- *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+ return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
}
-static bool irq_is_nmi(struct irq_desc *desc)
+unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask)
{
- return desc->istate & IRQS_NMI;
-}
-
-static unsigned int kstat_irqs(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned int sum = 0;
int cpu;
- if (!desc || !desc->kstat_irqs)
- return 0;
if (!irq_settings_is_per_cpu_devid(desc) &&
!irq_settings_is_per_cpu(desc) &&
!irq_is_nmi(desc))
return data_race(desc->tot_count);
- for_each_possible_cpu(cpu)
- sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu));
+ for_each_cpu(cpu, cpumask)
+ sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu));
return sum;
}
+static unsigned int kstat_irqs(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !desc->kstat_irqs)
+ return 0;
+ return kstat_irqs_desc(desc, cpu_possible_mask);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+
+void kstat_snapshot_irqs(void)
+{
+ struct irq_desc *desc;
+ unsigned int irq;
+
+ for_each_irq_desc(irq, desc) {
+ if (!desc->kstat_irqs)
+ continue;
+ this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt));
+ }
+}
+
+unsigned int kstat_get_irq_since_snapshot(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !desc->kstat_irqs)
+ return 0;
+ return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref);
+}
+
+#endif
+
/**
* kstat_irqs_usr - Get the statistics for an interrupt from thread context
* @irq: The interrupt number
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3dd1c871e0910f..aadc8891cc166f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -909,10 +909,11 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
*/
void irq_dispose_mapping(unsigned int virq)
{
- struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *irq_data;
struct irq_domain *domain;
- if (!virq || !irq_data)
+ irq_data = virq ? irq_get_irq_data(virq) : NULL;
+ if (!irq_data)
return;
domain = irq_data->domain;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bf9ae8a8686ff6..71b0fc2d0aeaa9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -564,7 +564,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
/* The release function is promised process context */
might_sleep();
- if (!desc || desc->istate & IRQS_NMI)
+ if (!desc || irq_is_nmi(desc))
return -EINVAL;
/* Complete initialisation of *notify */
@@ -800,10 +800,14 @@ void __enable_irq(struct irq_desc *desc)
irq_settings_set_noprobe(desc);
/*
* Call irq_startup() not irq_enable() here because the
- * interrupt might be marked NOAUTOEN. So irq_startup()
- * needs to be invoked when it gets enabled the first
- * time. If it was already started up, then irq_startup()
- * will invoke irq_enable() under the hood.
+ * interrupt might be marked NOAUTOEN so irq_startup()
+ * needs to be invoked when it gets enabled the first time.
+ * This is also required when __enable_irq() is invoked for
+ * a managed and shutdown interrupt from the S3 resume
+ * path.
+ *
+ * If it was already started up, then irq_startup() will
+ * invoke irq_enable() under the hood.
*/
irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
break;
@@ -898,7 +902,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
return -EINVAL;
/* Don't use NMIs as wake up interrupts please */
- if (desc->istate & IRQS_NMI) {
+ if (irq_is_nmi(desc)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -1624,7 +1628,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
unsigned int oldtype;
- if (desc->istate & IRQS_NMI) {
+ if (irq_is_nmi(desc)) {
pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
new->name, irq, desc->irq_data.chip->name);
ret = -EINVAL;
@@ -2082,7 +2086,7 @@ const void *free_nmi(unsigned int irq, void *dev_id)
unsigned long flags;
const void *devname;
- if (!desc || WARN_ON(!(desc->istate & IRQS_NMI)))
+ if (!desc || WARN_ON(!irq_is_nmi(desc)))
return NULL;
if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
@@ -2548,7 +2552,7 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
if (!desc || !irq_settings_is_per_cpu_devid(desc))
return;
- if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ if (WARN_ON(!irq_is_nmi(desc)))
return;
kfree(__free_percpu_irq(irq, dev_id));
@@ -2684,7 +2688,7 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
return -EINVAL;
/* The line cannot already be NMI */
- if (desc->istate & IRQS_NMI)
+ if (irq_is_nmi(desc))
return -EINVAL;
action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
@@ -2745,7 +2749,7 @@ int prepare_percpu_nmi(unsigned int irq)
if (!desc)
return -EINVAL;
- if (WARN(!(desc->istate & IRQS_NMI),
+ if (WARN(!irq_is_nmi(desc),
KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n",
irq)) {
ret = -EINVAL;
@@ -2787,7 +2791,7 @@ void teardown_percpu_nmi(unsigned int irq)
if (!desc)
return;
- if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ if (WARN_ON(!irq_is_nmi(desc)))
goto out;
irq_nmi_teardown(desc);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 623b8136e9af3b..5c320c3f10a746 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -488,18 +488,15 @@ int show_interrupts(struct seq_file *p, void *v)
if (!desc || irq_settings_is_hidden(desc))
goto outsparse;
- if (desc->kstat_irqs) {
- for_each_online_cpu(j)
- any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j));
- }
+ if (desc->kstat_irqs)
+ any_count = kstat_irqs_desc(desc, cpu_online_mask);
if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
goto outsparse;
seq_printf(p, "%*d: ", prec, i);
for_each_online_cpu(j)
- seq_printf(p, "%10u ", desc->kstat_irqs ?
- *per_cpu_ptr(desc->kstat_irqs, j) : 0);
+ seq_printf(p, "%10u ", desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0);
raw_spin_lock_irqsave(&desc->lock, flags);
if (desc->irq_data.chip) {
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 5f2c66860ac64f..b07a2d732ffbcb 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -190,7 +190,7 @@ int irq_inject_interrupt(unsigned int irq)
* - not NMI type
* - activated
*/
- if ((desc->istate & IRQS_NMI) || !irqd_is_activated(&desc->irq_data))
+ if (irq_is_nmi(desc) || !irqd_is_activated(&desc->irq_data))
err = -EINVAL;
else
err = check_irq_resend(desc, true);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index d9c822bbffb8d3..3218fa5688b939 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -530,6 +530,45 @@ void __init jump_label_init(void)
cpus_read_unlock();
}
+static inline bool static_key_sealed(struct static_key *key)
+{
+ return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK);
+}
+
+static inline void static_key_seal(struct static_key *key)
+{
+ unsigned long type = key->type & JUMP_TYPE_TRUE;
+ key->type = JUMP_TYPE_LINKED | type;
+}
+
+void jump_label_init_ro(void)
+{
+ struct jump_entry *iter_start = __start___jump_table;
+ struct jump_entry *iter_stop = __stop___jump_table;
+ struct jump_entry *iter;
+
+ if (WARN_ON_ONCE(!static_key_initialized))
+ return;
+
+ cpus_read_lock();
+ jump_label_lock();
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ struct static_key *iterk = jump_entry_key(iter);
+
+ if (!is_kernel_ro_after_init((unsigned long)iterk))
+ continue;
+
+ if (static_key_sealed(iterk))
+ continue;
+
+ static_key_seal(iterk);
+ }
+
+ jump_label_unlock();
+ cpus_read_unlock();
+}
+
#ifdef CONFIG_MODULES
enum jump_label_type jump_label_init_type(struct jump_entry *entry)
@@ -650,6 +689,15 @@ static int jump_label_add_module(struct module *mod)
static_key_set_entries(key, iter);
continue;
}
+
+ /*
+ * If the key was sealed at init, then there's no need to keep a
+ * reference to its module entries - just patch them now and be
+ * done with it.
+ */
+ if (static_key_sealed(key))
+ goto do_poke;
+
jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
if (!jlm)
return -ENOMEM;
@@ -675,6 +723,7 @@ static int jump_label_add_module(struct module *mod)
static_key_set_linked(key);
/* Only update if we've changed from our initial state */
+do_poke:
if (jump_label_type(iter) != jump_label_init_type(iter))
__jump_label_update(key, iter, iter_stop, true);
}
@@ -699,6 +748,10 @@ static void jump_label_del_module(struct module *mod)
if (within_module((unsigned long)key, mod))
continue;
+ /* No @jlm allocated because key was sealed at init. */
+ if (static_key_sealed(key))
+ continue;
+
/* No memory during module load */
if (WARN_ON(!static_key_linked(key)))
continue;
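
To make the sealing concrete, here is a hedged sketch of the kind of key this targets; my_feature_key and my_feature_hot_path() are made up, but DEFINE_STATIC_KEY_FALSE_RO is the same __ro_after_init variant the kernel/context_tracking.c hunk earlier in this diff switches to:

#include <linux/init.h>
#include <linux/jump_label.h>

/* Hypothetical key; its struct static_key lives in .data..ro_after_init. */
DEFINE_STATIC_KEY_FALSE_RO(my_feature_key);

static int __init my_feature_setup(char *str)
{
	/* Boot-time flip; once mark_readonly() runs the key is R/O and sealed. */
	static_branch_enable(&my_feature_key);
	return 1;
}
__setup("my_feature", my_feature_setup);

static void my_feature_hot_path(void)
{
	if (static_branch_unlikely(&my_feature_key)) {
		/* rarely enabled extra work */
	}
}

Once jump_label_init_ro() has sealed such a key, later module loads no longer allocate a static_key_mod reference for it — that is what the do_poke shortcut in jump_label_add_module() above implements.
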
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index a6016b91803d25..d2345e9c0190b7 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -53,8 +53,8 @@ static inline void __lockevent_add(enum lock_events event, int inc)
#else /* CONFIG_LOCK_EVENT_COUNTS */
#define lockevent_inc(ev)
-#define lockevent_add(ev, c)
-#define lockevent_cond_inc(ev, c)
+#define lockevent_add(ev, c) do { (void)(c); } while (0)
+#define lockevent_cond_inc(ev, c) do { (void)(c); } while (0)
#endif /* CONFIG_LOCK_EVENT_COUNTS */
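
A hedged reading of why the stubs now evaluate their argument (suggested by the companion kernel/locking/qspinlock_paravirt.h hunk below, which drops __maybe_unused from wait_early): with an empty expansion, a variable consumed only by the event macro becomes set-but-unused when CONFIG_LOCK_EVENT_COUNTS=n. A minimal sketch, assuming it sits next to kernel/locking/lock_events.h:

#include "lock_events.h"

static void wait_example(bool early)
{
	bool wait_early = early;	/* only read by the event macro */

	/*
	 * With the old empty stub this expanded to nothing and the
	 * compiler flagged wait_early as set-but-unused; "(void)(c)"
	 * keeps it referenced without generating any code.
	 */
	lockevent_cond_inc(pv_wait_early, wait_early);
}
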
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ebe6b8ec7cb380..1df5fef8a65616 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -220,21 +220,18 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
*/
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
- u32 old, new, val = atomic_read(&lock->val);
+ u32 old, new;
- for (;;) {
- new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+ old = atomic_read(&lock->val);
+ do {
+ new = (old & _Q_LOCKED_PENDING_MASK) | tail;
/*
* We can use relaxed semantics since the caller ensures that
* the MCS node is properly initialized before updating the
* tail.
*/
- old = atomic_cmpxchg_relaxed(&lock->val, val, new);
- if (old == val)
- break;
+ } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
- val = old;
- }
return old;
}
#endif /* _Q_PENDING_BITS == 8 */
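
The same conversion pattern recurs in the paravirt hunks below. In generic form — a sketch, not code from the patch — atomic_try_cmpxchg*() folds the re-read of the old value into the failure path, so the open-coded compare-and-retry disappears:

#include <linux/atomic.h>

/* Sketch: set 'bits' in *v with relaxed ordering, written new-style. */
static inline int fetch_or_relaxed_sketch(atomic_t *v, int bits)
{
	int old = atomic_read(v);

	do {
		/* On failure, try_cmpxchg refreshes 'old' with the current value. */
	} while (!atomic_try_cmpxchg_relaxed(v, &old, old | bits));

	return old;
}

On architectures where cmpxchg sets condition flags (x86 in particular) this also lets the compiler branch on the flags directly instead of comparing the returned value, which is the usual motivation cited for such conversions.
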
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index ae2b12f68b9080..f5a36e67b59355 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -86,9 +86,10 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
*/
for (;;) {
int val = atomic_read(&lock->val);
+ u8 old = 0;
if (!(val & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ try_cmpxchg_acquire(&lock->locked, &old, _Q_LOCKED_VAL)) {
lockevent_inc(pv_lock_stealing);
return true;
}
@@ -116,11 +117,12 @@ static __always_inline void set_pending(struct qspinlock *lock)
* barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
* lock just to be sure that it will get it.
*/
-static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+static __always_inline bool trylock_clear_pending(struct qspinlock *lock)
{
+ u16 old = _Q_PENDING_VAL;
+
return !READ_ONCE(lock->locked) &&
- (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
- _Q_LOCKED_VAL) == _Q_PENDING_VAL);
+ try_cmpxchg_acquire(&lock->locked_pending, &old, _Q_LOCKED_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
@@ -128,27 +130,21 @@ static __always_inline void set_pending(struct qspinlock *lock)
atomic_or(_Q_PENDING_VAL, &lock->val);
}
-static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+static __always_inline bool trylock_clear_pending(struct qspinlock *lock)
{
- int val = atomic_read(&lock->val);
-
- for (;;) {
- int old, new;
-
- if (val & _Q_LOCKED_MASK)
- break;
+ int old, new;
+ old = atomic_read(&lock->val);
+ do {
+ if (old & _Q_LOCKED_MASK)
+ return false;
/*
* Try to clear pending bit & set locked bit
*/
- old = val;
- new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
- val = atomic_cmpxchg_acquire(&lock->val, old, new);
+ new = (old & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));
- if (val == old)
- return 1;
- }
- return 0;
+ return true;
}
#endif /* _Q_PENDING_BITS == 8 */
@@ -216,8 +212,9 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
+ struct qspinlock *old = NULL;
hopcnt++;
- if (!cmpxchg(&he->lock, NULL, lock)) {
+ if (try_cmpxchg(&he->lock, &old, lock)) {
WRITE_ONCE(he->node, node);
lockevent_pv_hop(hopcnt);
return &he->lock;
@@ -294,7 +291,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
- bool __maybe_unused wait_early;
+ bool wait_early;
int loop;
for (;;) {
@@ -360,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
-
+ enum vcpu_state old = vcpu_halted;
/*
* If the vCPU is indeed halted, advance its state to match that of
* pv_wait_node(). If OTOH this fails, the vCPU was running and will
@@ -377,8 +374,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
* subsequent writes.
*/
smp_mb__before_atomic();
- if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
- != vcpu_halted)
+ if (!try_cmpxchg_relaxed(&pn->state, &old, vcpu_hashed))
return;
/*
@@ -546,15 +542,14 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
#ifndef __pv_queued_spin_unlock
__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
{
- u8 locked;
+ u8 locked = _Q_LOCKED_VAL;
/*
* We must not unlock if SLOW, because in that case we must first
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
- if (likely(locked == _Q_LOCKED_VAL))
+ if (try_cmpxchg_release(&lock->locked, &locked, 0))
return;
__pv_queued_spin_unlock_slowpath(lock, locked);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7019a40457a6da..1a914388144aeb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -108,7 +108,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
-EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -5662,13 +5662,13 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
-void scheduler_tick(void)
+void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
struct rq_flags rf;
- unsigned long thermal_pressure;
+ unsigned long hw_pressure;
u64 resched_latency;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
@@ -5679,8 +5679,8 @@ void scheduler_tick(void)
rq_lock(rq, &rf);
update_rq_clock(rq);
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
curr->sched_class->task_tick(rq, curr, 0);
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
@@ -5700,7 +5700,7 @@ void scheduler_tick(void)
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq);
+ sched_balance_trigger(rq);
#endif
}
@@ -6585,7 +6585,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* paths. For example, see arch/x86/entry_64.S.
*
* To drive preemption between tasks, the scheduler sets the flag in timer
- * interrupt handler scheduler_tick().
+ * interrupt handler sched_tick().
*
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index af7952f12e6cf1..aa48b2ec879df1 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -424,19 +424,6 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
-{
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_kernel(prev);
-
- vtime_flush(prev);
- arch_vtime_task_switch(prev);
-}
-# endif
-
void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
unsigned int pc = irq_count() - offset;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c62805dbd6088b..146ecf9cc3afe7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
-int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
- int _shift = 0;
-
- if (kstrtoint(str, 0, &_shift))
- pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
-
- sched_thermal_decay_shift = clamp(_shift, 0, 10);
+ pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
@@ -388,8 +382,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
/*
* With cfs_rq being unthrottled/throttled during an enqueue,
- * it can happen the tmp_alone_branch points the a leaf that
- * we finally want to del. In this case, tmp_alone_branch moves
+ * it can happen the tmp_alone_branch points to the leaf that
+ * we finally want to delete. In this case, tmp_alone_branch moves
* to the prev element but it will point to rq->leaf_cfs_rq_list
* at the end of the enqueue.
*/
@@ -406,7 +400,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq)
SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
+/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
leaf_cfs_rq_list)
@@ -595,13 +589,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* [[ NOTE: this is only equal to the ideal scheduler under the condition
* that join/leave operations happen at lag_i = 0, otherwise the
- * virtual time has non-continguous motion equivalent to:
+ * virtual time has non-contiguous motion equivalent to:
*
* V +-= lag_i / W
*
* Also see the comment in place_entity() that deals with this. ]]
*
- * However, since v_i is u64, and the multiplcation could easily overflow
+ * However, since v_i is u64, and the multiplication could easily overflow
* transform it into a relative form that uses smaller quantities:
*
* Substitute: v_i == (v_i - v0) + v0
@@ -671,7 +665,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
}
if (load) {
- /* sign flips effective floor / ceil */
+ /* sign flips effective floor / ceiling */
if (avg < 0)
avg -= (load - 1);
avg = div_s64(avg, load);
@@ -727,7 +721,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
*
- * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
* to the loss in precision caused by the division.
*/
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -1030,7 +1024,7 @@ void init_entity_runnable_average(struct sched_entity *se)
if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
- /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
+ /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
/*
@@ -1622,7 +1616,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
- * which should be ok given the number of nodes rarely exceeds 8.
+ * which should be OK given the number of nodes rarely exceeds 8.
*/
for_each_online_node(node) {
unsigned long faults;
@@ -3296,7 +3290,7 @@ retry_pids:
/*
* Shared library pages mapped by multiple processes are not
* migrated as it is expected they are cache replicated. Avoid
- * hinting faults in read-only file-backed mappings or the vdso
+ * hinting faults in read-only file-backed mappings or the vDSO
* as migrating the pages will be of marginal benefit.
*/
if (!vma->vm_mm ||
@@ -3307,7 +3301,7 @@ retry_pids:
/*
* Skip inaccessible VMAs to avoid any confusion between
- * PROT_NONE and NUMA hinting ptes
+ * PROT_NONE and NUMA hinting PTEs
*/
if (!vma_is_accessible(vma)) {
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
@@ -3339,7 +3333,7 @@ retry_pids:
}
/*
- * Scanning the VMA's of short lived tasks add more overhead. So
+ * Scanning the VMAs of short-lived tasks adds more overhead. So
* delay the scan for new VMAs.
*/
if (mm->numa_scan_seq && time_before(jiffies,
@@ -3383,7 +3377,7 @@ retry_pids:
/*
* Try to scan sysctl_numa_balancing_size worth of
* hpages that have at least one present PTE that
- * is not already pte-numa. If the VMA contains
+ * is not already PTE-numa. If the VMA contains
* areas that are unused or already full of prot_numa
* PTEs, scan up to virtpages, to skip through those
* areas faster.
@@ -3690,7 +3684,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
/*
* VRUNTIME
- * ========
+ * --------
*
* COROLLARY #1: The virtual runtime of the entity needs to be
* adjusted if re-weight at !0-lag point.
@@ -3773,7 +3767,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
/*
* DEADLINE
- * ========
+ * --------
*
* When the weight changes, the virtual time slope changes and
* we should adjust the relative virtual deadline accordingly.
@@ -4745,7 +4739,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/*
* Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
+ * track group sched_entity load average for task_h_load calculation in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
@@ -4828,7 +4822,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
@@ -4971,13 +4965,22 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
+static inline unsigned long get_actual_cpu_capacity(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+ capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+
+ return capacity;
+}
+
static inline int util_fits_cpu(unsigned long util,
unsigned long uclamp_min,
unsigned long uclamp_max,
int cpu)
{
- unsigned long capacity_orig, capacity_orig_thermal;
unsigned long capacity = capacity_of(cpu);
+ unsigned long capacity_orig;
bool fits, uclamp_max_fits;
/*
@@ -4999,7 +5002,7 @@ static inline int util_fits_cpu(unsigned long util,
* Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
* should fit a little cpu even if there's some pressure.
*
- * Only exception is for thermal pressure since it has a direct impact
+ * Only exception is for HW or cpufreq pressure since it has a direct impact
* on available OPP of the system.
*
* We honour it for uclamp_min only as a drop in performance level
@@ -5009,7 +5012,6 @@ static inline int util_fits_cpu(unsigned long util,
* goal is to cap the task. So it's okay if it's getting less.
*/
capacity_orig = arch_scale_cpu_capacity(cpu);
- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
* We want to force a task to fit a cpu as implied by uclamp_max.
@@ -5026,14 +5028,14 @@ static inline int util_fits_cpu(unsigned long util,
* | | | | | | |
* | | | | | | |
* +----------------------------------------
- * cpu0 cpu1 cpu2
+ * CPU0 CPU1 CPU2
*
* In the above example if a task is capped to a specific performance
* point, y, then when:
*
- * * util = 80% of x then it does not fit on cpu0 and should migrate
- * to cpu1
- * * util = 80% of y then it is forced to fit on cpu1 to honour
+ * * util = 80% of x then it does not fit on CPU0 and should migrate
+ * to CPU1
+ * * util = 80% of y then it is forced to fit on CPU1 to honour
* uclamp_max request.
*
* which is what we're enforcing here. A task always fits if
@@ -5064,7 +5066,7 @@ static inline int util_fits_cpu(unsigned long util,
* | | | | | | |
* | | | | | | | (region c, boosted, util < uclamp_min)
* +----------------------------------------
- * cpu0 cpu1 cpu2
+ * CPU0 CPU1 CPU2
*
* a) If util > uclamp_max, then we're capped, we don't care about
* actual fitness value here. We only care if uclamp_max fits
@@ -5084,7 +5086,8 @@ static inline int util_fits_cpu(unsigned long util,
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
- if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ if (fits && (util < uclamp_min) &&
+ (uclamp_min > get_actual_cpu_capacity(cpu)))
return -1;
return fits;
@@ -5104,15 +5107,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
+ int cpu = cpu_of(rq);
+
if (!sched_asym_cpucap_active())
return;
- if (!p || p->nr_cpus_allowed == 1) {
- rq->misfit_task_load = 0;
- return;
- }
+ /*
+ * Affinity allows us to go somewhere higher? Or are we on the biggest
+ * available CPU already? Or do we fit into this CPU?
+ */
+ if (!p || (p->nr_cpus_allowed == 1) ||
+ (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ task_fits_cpu(p, cpu)) {
- if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -5148,7 +5155,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -5254,7 +5261,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->vruntime = vruntime - lag;
/*
- * When joining the competition; the exisiting tasks will be,
+ * When joining the competition; the existing tasks will be,
* on average, halfway through their slice, as such start tasks
* off with half a slice to ease into the competition.
*/
@@ -5403,7 +5410,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Now advance min_vruntime if @se was the entity holding it back,
* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
* put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- ie. we'll be penalized.
+ * further than we started -- i.e. we'll be penalized.
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
@@ -5439,7 +5446,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* Track our maximum slice length, if the CPU's load is at
- * least twice that of our own weight (i.e. dont track it
+ * least twice that of our own weight (i.e. don't track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() &&
@@ -6675,22 +6682,47 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
- unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
- unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ unsigned long rq_util_min, rq_util_max;
+
+ if (!sched_energy_enabled())
+ return false;
+
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
/* Return true only if the utilization doesn't fit CPU's capacity */
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
-static inline void update_overutilized_status(struct rq *rq)
+/*
+ * The overutilized value makes sense only if EAS is enabled
+ */
+static inline bool is_rd_overutilized(struct root_domain *rd)
+{
+ return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
+}
+
+static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
{
- if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
- WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
- }
+ if (!sched_energy_enabled())
+ return;
+
+ WRITE_ONCE(rd->overutilized, flag);
+ trace_sched_overutilized_tp(rd, flag);
+}
+
+static inline void check_update_overutilized_status(struct rq *rq)
+{
+ /*
+ * The overutilized field is used for load-balancing decisions only
+ * if the energy-aware scheduler is in use.
+ */
+
+ if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
+ set_rd_overutilized(rq->rd, 1);
}
#else
-static inline void update_overutilized_status(struct rq *rq) { }
+static inline void check_update_overutilized_status(struct rq *rq) { }
#endif
/* Runqueue only has SCHED_IDLE tasks enqueued */
@@ -6791,7 +6823,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* and the following generally works well enough in practice.
*/
if (!task_new)
- update_overutilized_status(rq);
+ check_update_overutilized_status(rq);
enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
@@ -6878,7 +6910,7 @@ dequeue_throttle:
#ifdef CONFIG_SMP
-/* Working cpumask for: load_balance, load_balance_newidle. */
+/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
@@ -7110,13 +7142,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
}
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
- * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
+ * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
*/
static int
-find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -7172,7 +7204,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
int cpu, int prev_cpu, int sd_flag)
{
int new_cpu = cpu;
@@ -7197,13 +7229,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
continue;
}
- group = find_idlest_group(sd, p, cpu);
+ group = sched_balance_find_dst_group(sd, p, cpu);
if (!group) {
sd = sd->child;
continue;
}
- new_cpu = find_idlest_group_cpu(group, p, cpu);
+ new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of 'cpu': */
sd = sd->child;
@@ -7471,7 +7503,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
* Look for the CPU with best capacity.
*/
else if (fits < 0)
- cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
+ cpu_cap = get_actual_cpu_capacity(cpu);
/*
* First, select CPU which fits better (-1 being better than 0).
@@ -7515,7 +7547,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
/*
* On asymmetric system, update task utilization because we will check
- * that the task fits with cpu's capacity.
+ * that the task fits with CPU's capacity.
*/
if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
@@ -7948,7 +7980,7 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
* NOTE: Forkees are not accepted in the energy-aware wake-up path because
* they don't have any useful utilization data yet and it's not possible to
* forecast their impact on energy consumption. Consequently, they will be
- * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
* to be energy-inefficient in some use-cases. The alternative would be to
* bias new tasks towards specific types of CPUs first, or to try to infer
* their util_avg from the parent task, but those heuristics could hurt
@@ -7964,15 +7996,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
int prev_fits = -1, best_fits = -1;
- unsigned long best_thermal_cap = 0;
- unsigned long prev_thermal_cap = 0;
+ unsigned long best_actual_cap = 0;
+ unsigned long prev_actual_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
- if (!pd || READ_ONCE(rd->overutilized))
+ if (!pd)
goto unlock;
/*
@@ -7995,7 +8027,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
- unsigned long cpu_cap, cpu_thermal_cap, util;
+ unsigned long cpu_cap, cpu_actual_cap, util;
long prev_spare_cap = -1, max_spare_cap = -1;
unsigned long rq_util_min, rq_util_max;
unsigned long cur_delta, base_energy;
@@ -8007,18 +8039,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (cpumask_empty(cpus))
continue;
- /* Account thermal pressure for the energy estimation */
+ /* Account external pressure for the energy estimation */
cpu = cpumask_first(cpus);
- cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
- cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+ cpu_actual_cap = get_actual_cpu_capacity(cpu);
- eenv.cpu_cap = cpu_thermal_cap;
+ eenv.cpu_cap = cpu_actual_cap;
eenv.pd_cap = 0;
for_each_cpu(cpu, cpus) {
struct rq *rq = cpu_rq(cpu);
- eenv.pd_cap += cpu_thermal_cap;
+ eenv.pd_cap += cpu_actual_cap;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
@@ -8039,7 +8070,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
/*
* Open code uclamp_rq_util_with() except for
- * the clamp() part. Ie: apply max aggregation
+ * the clamp() part. I.e.: apply max aggregation
* only. util_fits_cpu() logic requires to
* operate on non clamped util but must use the
* max-aggregated uclamp_{min, max}.
@@ -8089,7 +8120,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
- prev_thermal_cap = cpu_thermal_cap;
+ prev_actual_cap = cpu_actual_cap;
best_delta = min(best_delta, prev_delta);
}
@@ -8104,7 +8135,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* but best energy cpu has better capacity.
*/
if ((max_fits < 0) &&
- (cpu_thermal_cap <= best_thermal_cap))
+ (cpu_actual_cap <= best_actual_cap))
continue;
cur_delta = compute_energy(&eenv, pd, cpus, p,
@@ -8125,14 +8156,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
best_fits = max_fits;
- best_thermal_cap = cpu_thermal_cap;
+ best_actual_cap = cpu_actual_cap;
}
}
rcu_read_unlock();
if ((best_fits > prev_fits) ||
((best_fits > 0) && (best_delta < prev_delta)) ||
- ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
+ ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
target = best_energy_cpu;
return target;
@@ -8175,7 +8206,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
cpumask_test_cpu(cpu, p->cpus_ptr))
return cpu;
- if (sched_energy_enabled()) {
+ if (!is_rd_overutilized(this_rq()->rd)) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
return new_cpu;
@@ -8213,7 +8244,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
if (unlikely(sd)) {
/* Slow path */
- new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
@@ -8259,14 +8290,46 @@ static void task_dead_fair(struct task_struct *p)
remove_entity_load_avg(&p->se);
}
+/*
+ * Set the max capacity the task is allowed to run at for misfit detection.
+ */
+static void set_task_max_allowed_capacity(struct task_struct *p)
+{
+ struct asym_cap_data *entry;
+
+ if (!sched_asym_cpucap_active())
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
+
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
+}
+
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+{
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p);
+}
+
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
if (rq->nr_running)
return 1;
- return newidle_balance(rq, rf) != 0;
+ return sched_balance_newidle(rq, rf) != 0;
}
+#else
+static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
#endif /* CONFIG_SMP */
static void set_next_buddy(struct sched_entity *se)
@@ -8517,10 +8580,10 @@ idle:
if (!rf)
return NULL;
- new_tasks = newidle_balance(rq, rf);
+ new_tasks = sched_balance_newidle(rq, rf);
/*
- * Because newidle_balance() releases (and re-acquires) rq->lock, it is
+ * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
@@ -8598,7 +8661,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
return false;
- /* Tell the scheduler that we'd really like pse to run next. */
+ /* Tell the scheduler that we'd really like se to run next. */
set_next_buddy(se);
yield_task_fair(rq);
@@ -8936,7 +8999,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- /* Disregard pcpu kthreads; they are where they need to be. */
+ /* Disregard percpu kthreads; they are where they need to be. */
if (kthread_is_per_cpu(p))
return 0;
@@ -9082,7 +9145,7 @@ static int detach_tasks(struct lb_env *env)
* We don't want to steal all, otherwise we may be treated likewise,
* which could at worst lead to a livelock crash.
*/
- if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ if (env->idle && env->src_rq->nr_running <= 1)
break;
env->loop++;
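
The 'env->idle != CPU_NOT_IDLE' checks collapse to plain 'if (env->idle)' tests because this series renumbers enum cpu_idle_type; the layout below is an assumption (the enum change itself is not in these hunks), shown only to make the boolean checks readable:

enum cpu_idle_type {
	__CPU_NOT_IDLE = 0,	/* tests false: the CPU is busy */
	CPU_IDLE,
	CPU_NEWLY_IDLE,
	CPU_MAX_IDLE_TYPES
};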
@@ -9261,7 +9324,7 @@ static inline bool others_have_blocked(struct rq *rq)
if (cpu_util_dl(rq))
return true;
- if (thermal_load_avg(rq))
+ if (hw_load_avg(rq))
return true;
if (cpu_util_irq(rq))
@@ -9291,7 +9354,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
{
const struct sched_class *curr_class;
u64 now = rq_clock_pelt(rq);
- unsigned long thermal_pressure;
+ unsigned long hw_pressure;
bool decayed;
/*
@@ -9300,11 +9363,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
*/
curr_class = rq->curr->sched_class;
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
+ update_hw_load_avg(now, rq, hw_pressure) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
@@ -9423,7 +9486,7 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
-static void update_blocked_averages(int cpu)
+static void sched_balance_update_blocked_averages(int cpu)
{
bool decayed = false, done = true;
struct rq *rq = cpu_rq(cpu);
@@ -9442,25 +9505,25 @@ static void update_blocked_averages(int cpu)
rq_unlock_irqrestore(rq, &rf);
}
-/********** Helpers for find_busiest_group ************************/
+/********** Helpers for sched_balance_find_src_group ************************/
/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
+ * sg_lb_stats - stats of a sched_group required for load-balancing:
*/
struct sg_lb_stats {
- unsigned long avg_load; /*Avg load across the CPUs of the group */
- unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long group_capacity;
- unsigned long group_util; /* Total utilization over the CPUs of the group */
- unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
- unsigned int sum_nr_running; /* Nr of tasks running in the group */
- unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
- unsigned int idle_cpus;
+ unsigned long avg_load; /* Avg load over the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long group_capacity; /* Capacity over the CPUs of the group */
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
+ unsigned int sum_nr_running; /* Nr of all tasks running in the group */
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
+ unsigned int idle_cpus; /* Nr of idle CPUs in the group */
unsigned int group_weight;
enum group_type group_type;
- unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
- unsigned int group_smt_balance; /* Task on busy SMT be moved */
- unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+ unsigned int group_smt_balance; /* Task on busy SMT be moved */
+ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -9468,19 +9531,18 @@ struct sg_lb_stats {
};
/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
+ * sd_lb_stats - stats of a sched_domain required for load-balancing:
*/
struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *local; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_capacity; /* Total capacity of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
- unsigned int prefer_sibling; /* tasks should go to sibling first */
-
- struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
- struct sg_lb_stats local_stat; /* Statistics of the local group */
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+ unsigned int prefer_sibling; /* Tasks should go to sibling first */
+
+ struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
};
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
@@ -9506,8 +9568,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
static unsigned long scale_rt_capacity(int cpu)
{
+ unsigned long max = get_actual_cpu_capacity(cpu);
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -9519,12 +9581,9 @@ static unsigned long scale_rt_capacity(int cpu)
/*
* avg_rt.util_avg and avg_dl.util_avg track binary signals
* (running and not running) with weights 0 and 1024 respectively.
- * avg_thermal.load_avg tracks thermal pressure and the weighted
- * average uses the actual delta max capacity(load).
*/
used = cpu_util_rt(rq);
used += cpu_util_dl(rq);
- used += thermal_load_avg(rq);
if (unlikely(used >= max))
return 1;
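
With the HW pressure already folded into 'max' via get_actual_cpu_capacity(), dropping thermal_load_avg() from 'used' leaves the computed free capacity unchanged; a worked example with illustrative numbers, ignoring the IRQ scaling for simplicity:

	unsigned long orig = 1024, hw_pressure = 200, rt_dl_util = 100;

	unsigned long free_old = orig - (rt_dl_util + hw_pressure);	/* 1024 - 300 = 724 */
	unsigned long free_new = (orig - hw_pressure) - rt_dl_util;	/*  824 - 100 = 724 */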
@@ -9617,16 +9676,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq)
{
- return rq->misfit_task_load &&
- (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
- check_cpu_capacity(rq, sd));
+ return rq->misfit_task_load;
}
/*
@@ -9650,7 +9703,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
*
* When this is so detected; this group becomes a candidate for busiest; see
* update_sd_pick_busiest(). And calculate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it
* to create an effective group imbalance.
*
* This is a somewhat tricky proposition since the next run might not find the
@@ -9815,7 +9868,7 @@ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
struct sched_group *group)
{
- if (env->idle == CPU_NOT_IDLE)
+ if (!env->idle)
return false;
/*
@@ -9839,7 +9892,7 @@ static inline long sibling_imbalance(struct lb_env *env,
int ncores_busiest, ncores_local;
long imbalance;
- if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+ if (!env->idle || !busiest->sum_nr_running)
return 0;
ncores_busiest = sds->busiest->cores;
@@ -9885,13 +9938,15 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
- * @sg_status: Holds flag indicating the status of the sched_group
+ * @sg_overloaded: sched_group is overloaded
+ * @sg_overutilized: sched_group is overutilized
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
- int *sg_status)
+ bool *sg_overloaded,
+ bool *sg_overutilized)
{
int i, nr_running, local_group;
@@ -9912,10 +9967,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->sum_nr_running += nr_running;
if (nr_running > 1)
- *sg_status |= SG_OVERLOAD;
+ *sg_overloaded = 1;
if (cpu_overutilized(i))
- *sg_status |= SG_OVERUTILIZED;
+ *sg_overutilized = 1;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
@@ -9937,10 +9992,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/* Check for a misfit task on the cpu */
if (sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ *sg_overloaded = 1;
}
- } else if ((env->idle != CPU_NOT_IDLE) &&
- sched_reduced_capacity(rq, env->sd)) {
+ } else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
/* Check for a task running on a CPU with reduced capacity */
if (sgs->group_misfit_task_load < load)
sgs->group_misfit_task_load = load;
@@ -9952,7 +10006,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
/* Check if dst CPU is idle and preferred to this group */
- if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+ if (!local_group && env->idle && sgs->sum_h_nr_running &&
sched_group_asym(env, sgs, group))
sgs->group_asym_packing = 1;
@@ -10090,7 +10144,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
has_spare:
/*
- * Select not overloaded group with lowest number of idle cpus
+ * Select not overloaded group with lowest number of idle CPUs
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
* that the group has less spare capacity but finally more idle
@@ -10310,13 +10364,13 @@ static bool update_pick_idlest(struct sched_group *idlest,
}
/*
- * find_idlest_group() finds and returns the least busy CPU group within the
+ * sched_balance_find_dst_group() finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
struct sg_lb_stats local_sgs, tmp_sgs;
@@ -10564,7 +10618,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
unsigned long sum_util = 0;
- int sg_status = 0;
+ bool sg_overloaded = 0, sg_overutilized = 0;
do {
struct sg_lb_stats *sgs = &tmp_sgs;
@@ -10580,7 +10634,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -10608,19 +10662,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) {
- struct root_domain *rd = env->dst_rq->rd;
-
/* update overload indicator if we are at root domain */
- WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+ set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
/* Update over-utilization (tipping point, U >= 0) indicator */
- WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
- } else if (sg_status & SG_OVERUTILIZED) {
- struct root_domain *rd = env->dst_rq->rd;
-
- WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
+ } else if (sg_overutilized) {
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
}
update_idle_cpu_scan(env, sum_util);
@@ -10710,7 +10758,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* waiting task in this overloaded busiest group. Let's
* try to pull it.
*/
- if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+ if (env->idle && env->imbalance == 0) {
env->migration_type = migrate_task;
env->imbalance = 1;
}
@@ -10729,7 +10777,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* If there is no overload, we just want to even the number of
- * idle cpus.
+ * idle CPUs.
*/
env->migration_type = migrate_task;
env->imbalance = max_t(long, 0,
@@ -10802,7 +10850,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
) / SCHED_CAPACITY_SCALE;
}
-/******* find_busiest_group() helpers end here *********************/
+/******* sched_balance_find_src_group() helpers end here *********************/
/*
* Decision matrix according to the local and busiest group type:
@@ -10825,7 +10873,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
/**
- * find_busiest_group - Returns the busiest group within the sched_domain
+ * sched_balance_find_src_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
* @env: The load balancing environment.
*
@@ -10834,7 +10882,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*
* Return: - The busiest group if imbalance exists.
*/
-static struct sched_group *find_busiest_group(struct lb_env *env)
+static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
{
struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
@@ -10857,12 +10905,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_misfit_task)
goto force_balance;
- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
+ if (!is_rd_overutilized(env->dst_rq->rd) &&
+ rcu_dereference(env->dst_rq->rd->pd))
+ goto out_balanced;
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
@@ -10925,7 +10970,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE) {
+ if (!env->idle) {
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
@@ -10973,9 +11018,9 @@ out_balanced:
}
/*
- * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
+ * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
*/
-static struct rq *find_busiest_queue(struct lb_env *env,
+static struct rq *sched_balance_find_src_rq(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
@@ -11133,7 +11178,7 @@ asym_active_balance(struct lb_env *env)
* the lower priority @env::dst_cpu help it. Do not follow
* CPU priority.
*/
- return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
(sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
!sched_use_asym_prio(env->sd, env->src_cpu));
}
@@ -11171,7 +11216,7 @@ static int need_active_balance(struct lb_env *env)
* because of other sched_class or IRQs if more capacity stays
* available on dst_cpu.
*/
- if ((env->idle != CPU_NOT_IDLE) &&
+ if (env->idle &&
(env->src_rq->cfs.h_nr_running == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
@@ -11256,7 +11301,7 @@ static int should_we_balance(struct lb_env *env)
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
-static int load_balance(int this_cpu, struct rq *this_rq,
+static int sched_balance_rq(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
@@ -11288,13 +11333,13 @@ redo:
goto out_balanced;
}
- group = find_busiest_group(&env);
+ group = sched_balance_find_src_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
- busiest = find_busiest_queue(&env, group);
+ busiest = sched_balance_find_src_rq(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
@@ -11312,7 +11357,7 @@ redo:
env.flags |= LBF_ALL_PINNED;
if (busiest->nr_running > 1) {
/*
- * Attempt to move tasks. If find_busiest_group has found
+ * Attempt to move tasks. If sched_balance_find_src_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
@@ -11427,8 +11472,12 @@ more_balance:
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
+ *
+ * Similarly for migrate_misfit, which is not related to
+ * load/util migration: don't pollute nr_balance_failed.
*/
- if (idle != CPU_NEWLY_IDLE)
+ if (idle != CPU_NEWLY_IDLE &&
+ env.migration_type != migrate_misfit)
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
@@ -11507,12 +11556,17 @@ out_one_pinned:
ld_moved = 0;
/*
- * newidle_balance() disregards balance intervals, so we could
+ * sched_balance_newidle() disregards balance intervals, so we could
* repeatedly reach this code, which would lead to balance_interval
* skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
+ *
+ * Similarly for misfit migration, which is not necessarily an indication
+ * of the system being busy and does not require the load balancer to
+ * back off to let things settle down.
*/
- if (env.idle == CPU_NEWLY_IDLE)
+ if (env.idle == CPU_NEWLY_IDLE ||
+ env.migration_type == migrate_misfit)
goto out;
/* tune up the balancing interval */
@@ -11645,10 +11699,23 @@ out_unlock:
return 0;
}
-static DEFINE_SPINLOCK(balancing);
+/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize sched_balance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
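
The atomic flag keeps the spin_trylock() semantics of the old 'balancing' lock; a sketch of the pattern as used further below (the wrapper function here is hypothetical):

static void serialized_balance_pass_example(void)
{
	/* A contender that loses the cmpxchg skips the pass instead of spinning. */
	if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
		return;

	/* ... run the SD_SERIALIZE load-balancing pass ... */

	atomic_set_release(&sched_balance_running, 0);
}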
/*
- * Scale the max load_balance interval with the number of CPUs in the system.
+ * Scale the max sched_balance_rq interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
void update_max_interval(void)
@@ -11686,7 +11753,7 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
*
* Balancing parameters are set up in init_sched_domains.
*/
-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
@@ -11723,25 +11790,25 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
- if (!spin_trylock(&balancing))
+ if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+ if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
* The LBF_DST_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
- idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
- busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ idle = idle_cpu(cpu);
+ busy = !idle && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
- spin_unlock(&balancing);
+ atomic_set_release(&sched_balance_running, 0);
out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
@@ -11901,7 +11968,7 @@ static void nohz_balancer_kick(struct rq *rq)
* currently idle; in which case, kick the ILB to move tasks
* around.
*
- * When balancing betwen cores, all the SMT siblings of the
+ * When balancing between cores, all the SMT siblings of the
* preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
@@ -11918,7 +11985,7 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
* to run the misfit task on.
*/
- if (check_misfit_status(rq, sd)) {
+ if (check_misfit_status(rq)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12062,7 +12129,7 @@ void nohz_balance_enter_idle(int cpu)
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
- * enable the periodic update of the load of idle cpus
+ * enable the periodic update of the load of idle CPUs
*/
WRITE_ONCE(nohz.has_blocked, 1);
}
@@ -12080,13 +12147,13 @@ static bool update_nohz_stats(struct rq *rq)
if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
return true;
- update_blocked_averages(cpu);
+ sched_balance_update_blocked_averages(cpu);
return rq->has_blocked_load;
}
/*
- * Internal function that runs load balance for all idle cpus. The load balance
+ * Internal function that runs load balance for all idle CPUs. The load balance
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
*/
@@ -12162,7 +12229,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(rq, CPU_IDLE);
+ sched_balance_domains(rq, CPU_IDLE);
}
if (time_after(next_balance, rq->next_balance)) {
@@ -12191,7 +12258,7 @@ abort:
/*
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * rebalancing for all the CPUs for whom scheduler ticks are stopped.
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
@@ -12222,7 +12289,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* called from this function on (this) CPU that's not yet in the mask. That's
* OK because the goal of nohz_run_idle_balance() is to run ILB only for
* updating the blocked load of already idle CPUs without waking up one of
- * those idle CPUs and outside the preempt disable / irq off phase of the local
+ * those idle CPUs and outside the preempt disable / IRQ off phase of the local
* cpu about to enter idle, because it can take a long time.
*/
void nohz_run_idle_balance(int cpu)
@@ -12233,7 +12300,7 @@ void nohz_run_idle_balance(int cpu)
/*
* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
- * (ie NOHZ_STATS_KICK set) and will do the same.
+ * (i.e. NOHZ_STATS_KICK set) and will do the same.
*/
if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
@@ -12278,7 +12345,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
- * newidle_balance is called by schedule() if this_cpu is about to become
+ * sched_balance_newidle is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
@@ -12286,10 +12353,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
+ int continue_balancing = 1;
u64 t0, t1, curr_cost = 0;
struct sched_domain *sd;
int pulled_task = 0;
@@ -12304,8 +12372,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
return 0;
/*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
+ * We must set idle_stamp _before_ calling sched_balance_rq()
+ * for CPU_NEWLY_IDLE, such that we measure this duration
+ * as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
@@ -12326,7 +12395,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (!READ_ONCE(this_rq->rd->overload) ||
+ if (!get_rd_overloaded(this_rq->rd) ||
(sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
if (sd)
@@ -12340,11 +12409,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
- update_blocked_averages(this_cpu);
+ sched_balance_update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
- int continue_balancing = 1;
u64 domain_cost;
update_next_balance(sd, &next_balance);
@@ -12354,7 +12422,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (sd->flags & SD_BALANCE_NEWIDLE) {
- pulled_task = load_balance(this_cpu, this_rq,
+ pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
@@ -12370,8 +12438,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
- if (pulled_task || this_rq->nr_running > 0 ||
- this_rq->ttwu_pending)
+ if (pulled_task || !continue_balancing)
break;
}
rcu_read_unlock();
@@ -12409,19 +12476,21 @@ out:
}
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
+ *
+ * - directly from the local scheduler_tick() for periodic load balancing
+ *
+ * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
+ * through the SMP cross-call nohz_csd_func()
*/
-static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
+static __latent_entropy void sched_balance_softirq(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
- enum cpu_idle_type idle = this_rq->idle_balance ?
- CPU_IDLE : CPU_NOT_IDLE;
-
+ enum cpu_idle_type idle = this_rq->idle_balance;
/*
- * If this CPU has a pending nohz_balance_kick, then do the
+ * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
* balancing on behalf of the other idle CPUs whose ticks are
- * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * stopped. Do nohz_idle_balance *before* sched_balance_domains to
* give the idle CPUs a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
@@ -12430,14 +12499,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
return;
/* normal load balance */
- update_blocked_averages(this_rq->cpu);
- rebalance_domains(this_rq, idle);
+ sched_balance_update_blocked_averages(this_rq->cpu);
+ sched_balance_domains(this_rq, idle);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-void trigger_load_balance(struct rq *rq)
+void sched_balance_trigger(struct rq *rq)
{
/*
* Don't need to rebalance while attached to NULL domain or
@@ -12621,7 +12690,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
- update_overutilized_status(task_rq(curr));
+ check_update_overutilized_status(task_rq(curr));
task_tick_core(rq, curr);
}
@@ -12641,6 +12710,8 @@ static void task_fork_fair(struct task_struct *p)
rq_lock(rq, &rf);
update_rq_clock(rq);
+ set_task_max_allowed_capacity(p);
+
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr)
@@ -12764,6 +12835,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
attach_task_cfs_rq(p);
+ set_task_max_allowed_capacity(p);
+
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
@@ -13135,7 +13208,7 @@ DEFINE_SCHED_CLASS(fair) = {
.rq_offline = rq_offline_fair,
.task_dead = task_dead_fair,
- .set_cpus_allowed = set_cpus_allowed_common,
+ .set_cpus_allowed = set_cpus_allowed_fair,
#endif
.task_tick = task_tick_fair,
@@ -13215,7 +13288,7 @@ __init void init_sched_fair_class(void)
#endif
}
- open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+ open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 52c8f8226b0d35..ca9da66cc89477 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -379,7 +379,7 @@ void calc_global_load(void)
}
/*
- * Called from scheduler_tick() to periodically update this CPU's
+ * Called from sched_tick() to periodically update this CPU's
* active count.
*/
void calc_global_load_tick(struct rq *this_rq)
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 63b6cf8982201d..ef00382de595f8 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -208,8 +208,8 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* se has been already dequeued but cfs_rq->curr still points to it.
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
- * this happens during idle_balance() which calls
- * update_blocked_averages().
+ * this happens during sched_balance_newidle() which calls
+ * sched_balance_update_blocked_averages().
*
* Also see the comment in accumulate_sum().
*/
@@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+#ifdef CONFIG_SCHED_HW_PRESSURE
/*
- * thermal:
+ * hardware:
*
* load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
*
* util_avg and runnable_load_avg are not supported and meaningless.
*
* Unlike rt/dl utilization tracking that track time spent by a cpu
- * running a rt/dl task through util_avg, the average thermal pressure is
- * tracked through load_avg. This is because thermal pressure signal is
+ * running a rt/dl task through util_avg, the average HW pressure is
+ * tracked through load_avg. This is because HW pressure signal is
* time weighted "delta" capacity unlike util_avg which is binary.
* "delta capacity" = actual capacity -
- * capped capacity a cpu due to a thermal event.
+ * capped capacity a cpu due to a HW event.
*/
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
- if (___update_load_sum(now, &rq->avg_thermal,
+ if (___update_load_sum(now, &rq->avg_hw,
capacity,
capacity,
capacity)) {
- ___update_load_avg(&rq->avg_thermal, 1);
- trace_pelt_thermal_tp(rq);
+ ___update_load_avg(&rq->avg_hw, 1);
+ trace_pelt_hw_tp(rq);
return 1;
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 9e1083465fbc3d..2150062949d43f 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
+#ifdef CONFIG_SCHED_HW_PRESSURE
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
- return READ_ONCE(rq->avg_thermal.load_avg);
+ return READ_ONCE(rq->avg_hw.load_avg);
}
#else
static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
@@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
}
static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ae50f212775e5c..a831af1020700c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,20 @@ extern int sysctl_sched_rt_runtime;
extern int sched_rr_timeslice;
/*
+ * Asymmetric CPU capacity bits
+ */
+struct asym_cap_data {
+ struct list_head link;
+ struct rcu_head rcu;
+ unsigned long capacity;
+ unsigned long cpus[];
+};
+
+extern struct list_head asym_cap_list;
+
+#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+
+/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -701,7 +715,7 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- int overloaded;
+ bool overloaded;
struct plist_head pushable_tasks;
#endif /* CONFIG_SMP */
@@ -745,7 +759,7 @@ struct dl_rq {
u64 next;
} earliest_dl;
- int overloaded;
+ bool overloaded;
/*
* Tasks on this rq that can be pushed away. They are kept in
@@ -838,10 +852,6 @@ struct perf_domain {
struct rcu_head rcu;
};
-/* Scheduling group status flags */
-#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */
-#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
-
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -862,10 +872,10 @@ struct root_domain {
* - More than one runnable task
* - Running task is misfit
*/
- int overload;
+ bool overloaded;
/* Indicate one or more cpus over-utilized (tipping point) */
- int overutilized;
+ bool overutilized;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
@@ -905,8 +915,6 @@ struct root_domain {
cpumask_var_t rto_mask;
struct cpupri cpupri;
- unsigned long max_cpu_capacity;
-
/*
* NULL-terminated list of performance domains intersecting with the
* CPUs of the rd. Protected by RCU.
@@ -920,6 +928,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
extern void sched_get_rd(struct root_domain *rd);
extern void sched_put_rd(struct root_domain *rd);
+static inline int get_rd_overloaded(struct root_domain *rd)
+{
+ return READ_ONCE(rd->overloaded);
+}
+
+static inline void set_rd_overloaded(struct root_domain *rd, int status)
+{
+ if (get_rd_overloaded(rd) != status)
+ WRITE_ONCE(rd->overloaded, status);
+}
+
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
@@ -1091,8 +1110,8 @@ struct rq {
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
- struct sched_avg avg_thermal;
+#ifdef CONFIG_SCHED_HW_PRESSURE
+ struct sched_avg avg_hw;
#endif
u64 idle_stamp;
u64 avg_idle;
@@ -1533,24 +1552,6 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
-/**
- * By default the decay is the default pelt decay period.
- * The decay shift can change the decay period in
- * multiples of 32.
- * Decay shift Decay period(ms)
- * 0 32
- * 1 64
- * 2 128
- * 3 256
- * 4 512
- */
-extern int sched_thermal_decay_shift;
-
-static inline u64 rq_clock_thermal(struct rq *rq)
-{
- return rq_clock_task(rq) >> sched_thermal_decay_shift;
-}
-
static inline void rq_clock_skip_update(struct rq *rq)
{
lockdep_assert_rq_held(rq);
@@ -2399,7 +2400,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
-extern void trigger_load_balance(struct rq *rq);
+extern void sched_balance_trigger(struct rq *rq);
extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
@@ -2519,10 +2520,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
}
#ifdef CONFIG_SMP
- if (prev_nr < 2 && rq->nr_running >= 2) {
- if (!READ_ONCE(rq->rd->overload))
- WRITE_ONCE(rq->rd->overload, 1);
- }
+ if (prev_nr < 2 && rq->nr_running >= 2)
+ set_rd_overloaded(rq->rd, 1);
#endif
sched_update_tick_dependency(rq);
@@ -2906,7 +2905,7 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_NEWILB_KICK_BIT 2
#define NOHZ_NEXT_KICK_BIT 3
-/* Run rebalance_domains() */
+/* Run sched_balance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
/* Update blocked load */
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 857f837f52cbed..78e48f5426ee1f 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -113,7 +113,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
* Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 15
+#define SCHEDSTAT_VERSION 16
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -150,8 +150,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "domain%d %*pb", dcount++,
cpumask_pr_args(sched_domain_span(sd)));
- for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
- itype++) {
+ for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
seq_printf(seq, " %u %u %u %u %u %u %u %u",
sd->lb_count[itype],
sd->lb_balanced[itype],
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 99ea5986038ce4..63aecd2a7a9f35 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1330,23 +1330,12 @@ next:
}
/*
- * Asymmetric CPU capacity bits
- */
-struct asym_cap_data {
- struct list_head link;
- unsigned long capacity;
- unsigned long cpus[];
-};
-
-/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
* capacity.
* The lifespan of data is unlimited.
*/
-static LIST_HEAD(asym_cap_list);
-
-#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+LIST_HEAD(asym_cap_list);
/*
* Verify whether there is any CPU capacity asymmetry in a given sched domain.
@@ -1386,21 +1375,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span,
}
+static void free_asym_cap_entry(struct rcu_head *head)
+{
+ struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu);
+ kfree(entry);
+}
+
static inline void asym_cpu_capacity_update_data(int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(cpu);
- struct asym_cap_data *entry = NULL;
+ struct asym_cap_data *insert_entry = NULL;
+ struct asym_cap_data *entry;
+ /*
+ * Search if the capacity already exists. If not, track the entry
+ * after which we should insert to keep the list sorted in descending order.
+ */
list_for_each_entry(entry, &asym_cap_list, link) {
if (capacity == entry->capacity)
goto done;
+ else if (!insert_entry && capacity > entry->capacity)
+ insert_entry = list_prev_entry(entry, link);
}
entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
return;
entry->capacity = capacity;
- list_add(&entry->link, &asym_cap_list);
+
+ /* If NULL then the new capacity is the smallest, add last. */
+ if (!insert_entry)
+ list_add_tail_rcu(&entry->link, &asym_cap_list);
+ else
+ list_add_rcu(&entry->link, &insert_entry->link);
done:
__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}
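
Keeping asym_cap_list sorted by descending capacity lets readers stop at the first entry that intersects a CPU mask; a hypothetical helper mirroring set_task_max_allowed_capacity() from the fair.c hunks above illustrates the idea:

static unsigned long max_capacity_for_mask(const struct cpumask *mask)
{
	struct asym_cap_data *entry;
	unsigned long cap = SCHED_CAPACITY_SCALE;

	rcu_read_lock();
	list_for_each_entry_rcu(entry, &asym_cap_list, link) {
		if (cpumask_intersects(mask, cpu_capacity_span(entry))) {
			/* First hit is the largest capacity reachable by @mask. */
			cap = entry->capacity;
			break;
		}
	}
	rcu_read_unlock();

	return cap;
}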
@@ -1423,8 +1430,8 @@ static void asym_cpu_capacity_scan(void)
list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
if (cpumask_empty(cpu_capacity_span(entry))) {
- list_del(&entry->link);
- kfree(entry);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
}
}
@@ -1434,8 +1441,8 @@ static void asym_cpu_capacity_scan(void)
*/
if (list_is_singular(&asym_cap_list)) {
entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
- list_del(&entry->link);
- kfree(entry);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
}
}
@@ -2507,16 +2514,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- unsigned long capacity;
-
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
- capacity = arch_scale_cpu_capacity(i);
- /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
-
cpu_attach_domain(sd, d.rd, i);
if (lowest_flag_domain(i, SD_CLUSTER))
@@ -2530,10 +2530,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (has_cluster)
static_branch_inc_cpuslocked(&sched_cluster_active);
- if (rq && sched_debug_verbose) {
- pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
- cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
- }
+ if (rq && sched_debug_verbose)
+ pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
ret = 0;
error:
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index a7ca458cdd9cd6..60a6484831b173 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -677,7 +677,7 @@ static ssize_t current_device_show(struct device *dev,
raw_spin_lock_irq(&clockevents_lock);
td = tick_get_tick_dev(dev);
if (td && td->evtdev)
- count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name);
+ count = sysfs_emit(buf, "%s\n", td->evtdev->name);
raw_spin_unlock_irq(&clockevents_lock);
return count;
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e5b260aa0e02c2..d25ba49e313cca 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -20,6 +20,16 @@
#include "tick-internal.h"
#include "timekeeping_internal.h"
+static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
+{
+ u64 delta = clocksource_delta(end, start, cs->mask);
+
+ if (likely(delta < cs->max_cycles))
+ return clocksource_cyc2ns(delta, cs->mult, cs->shift);
+
+ return mul_u64_u32_shr(delta, cs->mult, cs->shift);
+}
+
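A hypothetical caller, to illustrate the intended use of the new helper: deltas below cs->max_cycles take the fast clocksource_cyc2ns() path, larger ones fall back to the slower but overflow-safe multiply:

static u64 measure_elapsed_ns(struct clocksource *cs)
{
	u64 start = cs->read(cs);

	/* ... section being timed ... */

	return cycles_to_nsec_safe(cs, start, cs->read(cs));
}
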
/**
* clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
* @mult: pointer to mult variable
@@ -222,8 +232,8 @@ enum wd_read_status {
static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
{
unsigned int nretries, max_retries;
- u64 wd_end, wd_end2, wd_delta;
int64_t wd_delay, wd_seq_delay;
+ u64 wd_end, wd_end2;
max_retries = clocksource_get_max_watchdog_retry();
for (nretries = 0; nretries <= max_retries; nretries++) {
@@ -234,9 +244,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
wd_end2 = watchdog->read(watchdog);
local_irq_enable();
- wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
- wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
- watchdog->shift);
+ wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
if (wd_delay <= WATCHDOG_MAX_SKEW) {
if (nretries > 1 || nretries >= max_retries) {
pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
@@ -254,8 +262,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
* report system busy, reinit the watchdog and skip the current
* watchdog test.
*/
- wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
- wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
+ wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
goto skip_test;
}
@@ -366,8 +373,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
delta = (csnow_end - csnow_mid) & cs->mask;
if (delta < 0)
cpumask_set_cpu(cpu, &cpus_ahead);
- delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
- cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
if (cs_nsec > cs_nsec_max)
cs_nsec_max = cs_nsec;
if (cs_nsec < cs_nsec_min)
@@ -398,8 +404,8 @@ static inline void clocksource_reset_watchdog(void)
static void clocksource_watchdog(struct timer_list *unused)
{
- u64 csnow, wdnow, cslast, wdlast, delta;
int64_t wd_nsec, cs_nsec, interval;
+ u64 csnow, wdnow, cslast, wdlast;
int next_cpu, reset_pending;
struct clocksource *cs;
enum wd_read_status read_ret;
@@ -456,12 +462,8 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
- wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
- watchdog->shift);
-
- delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
- cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
+ cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
wdlast = cs->wd_last; /* save these in case we print them */
cslast = cs->cs_last;
cs->cs_last = csnow;
@@ -832,7 +834,7 @@ void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
*/
u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
{
- u64 now, delta, nsec = 0;
+ u64 now, nsec = 0;
if (!suspend_clocksource)
return 0;
@@ -847,12 +849,8 @@ u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
else
now = suspend_clocksource->read(suspend_clocksource);
- if (now > suspend_start) {
- delta = clocksource_delta(now, suspend_start,
- suspend_clocksource->mask);
- nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult,
- suspend_clocksource->shift);
- }
+ if (now > suspend_start)
+ nsec = cycles_to_nsec_safe(suspend_clocksource, suspend_start, now);
/*
* Disable the suspend timer to save power if current clocksource is
@@ -1336,7 +1334,7 @@ static ssize_t current_clocksource_show(struct device *dev,
ssize_t count = 0;
mutex_lock(&clocksource_mutex);
- count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
+ count = sysfs_emit(buf, "%s\n", curr_clocksource->name);
mutex_unlock(&clocksource_mutex);
return count;
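
The clocksource.c hunks above repeatedly fold an open-coded clocksource_delta() plus clocksource_cyc2ns() pair into a single cycles_to_nsec_safe() call; the helper itself is introduced earlier in the patch and its definition is not visible in this excerpt. Purely for orientation, a minimal userspace sketch of that masked-delta-then-scale conversion might look as follows; the names, the max_cycles cap and the exact overflow handling are assumptions, not the kernel implementation.

#include <stdint.h>

/* Hedged model: wrap-safe cycle delta, then a mult/shift scale to ns.
 * How the real helper avoids 64-bit multiply overflow (capping vs. a
 * widened multiply) is not shown in this excerpt. */
static uint64_t cycles_to_nsec_model(uint64_t start, uint64_t end,
                                     uint64_t mask, uint64_t max_cycles,
                                     uint32_t mult, uint32_t shift)
{
        uint64_t delta = (end - start) & mask;  /* handles counter wrap */

        if (delta > max_cycles)                 /* keep delta * mult within 64 bits */
                delta = max_cycles;

        return (delta * mult) >> shift;         /* cycles -> nanoseconds */
}

Call sites such as cs_watchdog_read() then only pass a clocksource and two raw readouts, which is why the three conversions above collapse to one line each.
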
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 70625dff62ce10..492c14aac642bb 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -644,17 +644,12 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
/*
* Is the high resolution mode active ?
*/
-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
cpu_base->hres_active : 0;
}
-static inline int hrtimer_hres_active(void)
-{
- return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
-}
-
static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
struct hrtimer *next_timer,
ktime_t expires_next)
@@ -678,7 +673,7 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
* set. So we'd effectively block all timers until the T2 event
* fires.
*/
- if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
+ if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
tick_program_event(expires_next, 1);
@@ -789,12 +784,12 @@ static void retrigger_next_event(void *arg)
* function call will take care of the reprogramming in case the
* CPU was in a NOHZ idle sleep.
*/
- if (!__hrtimer_hres_active(base) && !tick_nohz_active)
+ if (!hrtimer_hres_active(base) && !tick_nohz_active)
return;
raw_spin_lock(&base->lock);
hrtimer_update_base(base);
- if (__hrtimer_hres_active(base))
+ if (hrtimer_hres_active(base))
hrtimer_force_reprogram(base, 0);
else
hrtimer_update_next_event(base);
@@ -951,7 +946,7 @@ void clock_was_set(unsigned int bases)
cpumask_var_t mask;
int cpu;
- if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active)
+ if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active)
goto out_timerfd;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1491,7 +1486,7 @@ u64 hrtimer_get_next_event(void)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (!__hrtimer_hres_active(cpu_base))
+ if (!hrtimer_hres_active(cpu_base))
expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1514,7 +1509,7 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (__hrtimer_hres_active(cpu_base)) {
+ if (hrtimer_hres_active(cpu_base)) {
unsigned int active;
if (!cpu_base->softirq_activated) {
@@ -1875,25 +1870,7 @@ retry:
tick_program_event(expires_next, 1);
pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}
-
-/* called with interrupts disabled */
-static inline void __hrtimer_peek_ahead_timers(void)
-{
- struct tick_device *td;
-
- if (!hrtimer_hres_active())
- return;
-
- td = this_cpu_ptr(&tick_cpu_device);
- if (td && td->evtdev)
- hrtimer_interrupt(td->evtdev);
-}
-
-#else /* CONFIG_HIGH_RES_TIMERS */
-
-static inline void __hrtimer_peek_ahead_timers(void) { }
-
-#endif /* !CONFIG_HIGH_RES_TIMERS */
+#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
* Called from run_local_timers in hardirq context every jiffy
@@ -1904,7 +1881,7 @@ void hrtimer_run_queues(void)
unsigned long flags;
ktime_t now;
- if (__hrtimer_hres_active(cpu_base))
+ if (hrtimer_hres_active(cpu_base))
return;
/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b58dffc58a8f40..4e18db1819f842 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -237,7 +237,9 @@ static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
}
}
-static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
+static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles);
+
+static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
{
struct timekeeper *tk = &tk_core.timekeeper;
u64 now, last, mask, max, delta;
@@ -264,34 +266,23 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
* Try to catch underflows by checking if we are seeing small
* mask-relative negative values.
*/
- if (unlikely((~delta & mask) < (mask >> 3))) {
+ if (unlikely((~delta & mask) < (mask >> 3)))
tk->underflow_seen = 1;
- delta = 0;
- }
- /* Cap delta value to the max_cycles values to avoid mult overflows */
- if (unlikely(delta > max)) {
+ /* Check for multiplication overflows */
+ if (unlikely(delta > max))
tk->overflow_seen = 1;
- delta = tkr->clock->max_cycles;
- }
- return delta;
+ /* timekeeping_cycles_to_ns() handles both underflow and overflow */
+ return timekeeping_cycles_to_ns(tkr, now);
}
#else
static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
-static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
+static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
{
- u64 cycle_now, delta;
-
- /* read clocksource */
- cycle_now = tk_clock_read(tkr);
-
- /* calculate the delta since the last update_wall_time */
- delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
-
- return delta;
+ BUG();
}
#endif
@@ -370,32 +361,46 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
}
/* Timekeeper helper functions. */
+static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
+{
+ return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
+}
-static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta)
+static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
- u64 nsec;
+ /* Calculate the delta since the last update_wall_time() */
+ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;
- nsec = delta * tkr->mult + tkr->xtime_nsec;
- nsec >>= tkr->shift;
+ /*
+ * This detects both negative motion and the case where the delta
+ * overflows the multiplication with tkr->mult.
+ */
+ if (unlikely(delta > tkr->clock->max_cycles)) {
+ /*
+ * Handle clocksource inconsistency between CPUs to prevent
+ * time from going backwards by checking for the MSB of the
+ * mask being set in the delta.
+ */
+ if (delta & ~(mask >> 1))
+ return tkr->xtime_nsec >> tkr->shift;
+
+ return delta_to_ns_safe(tkr, delta);
+ }
- return nsec;
+ return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
}
-static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
+static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
{
- u64 delta;
-
- delta = timekeeping_get_delta(tkr);
- return timekeeping_delta_to_ns(tkr, delta);
+ return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
}
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
+static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
- u64 delta;
+ if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING))
+ return timekeeping_debug_get_ns(tkr);
- /* calculate the delta since the last update_wall_time */
- delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
- return timekeeping_delta_to_ns(tkr, delta);
+ return __timekeeping_get_ns(tkr);
}
/**
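
The new timekeeping_cycles_to_ns() above distinguishes two slow-path cases: a delta larger than clock->max_cycles, where the multiplication with tkr->mult would overflow 64 bits and delta_to_ns_safe() is used instead, and a delta with the mask's most significant bit set, which is treated as a small backward step of the clocksource rather than an enormous forward one. A minimal userspace check mirroring that second test, illustrative only:

#include <stdbool.h>
#include <stdint.h>

/* With mask = 2^N - 1, a wrapped delta whose top bit is set is far more
 * plausibly "time went backwards a little" than "time jumped forward by
 * almost the full counter width". */
static bool delta_is_negative_motion(uint64_t delta, uint64_t mask)
{
        return (delta & ~(mask >> 1)) != 0;     /* MSB of the mask set in delta */
}

For a full 64-bit mask this reduces to delta >= 2^63; in that case the function returns only tkr->xtime_nsec >> tkr->shift, i.e. it refuses to move time forward.
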
@@ -431,14 +436,6 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
memcpy(base + 1, base, sizeof(*base));
}
-static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr)
-{
- u64 delta, cycles = tk_clock_read(tkr);
-
- delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
- return timekeeping_delta_to_ns(tkr, delta);
-}
-
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
struct tk_read_base *tkr;
@@ -449,7 +446,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
- now += fast_tk_get_delta_ns(tkr);
+ now += __timekeeping_get_ns(tkr);
} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
return now;
@@ -565,7 +562,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
tkr = tkf->base + (seq & 0x01);
basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
- delta = fast_tk_get_delta_ns(tkr);
+ delta = __timekeeping_get_ns(tkr);
} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
if (mono)
@@ -800,10 +797,15 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
- tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
- tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
+ while (delta > 0) {
+ u64 max = tk->tkr_mono.clock->max_cycles;
+ u64 incr = delta < max ? delta : max;
- tk_normalize_xtime(tk);
+ tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult;
+ tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult;
+ tk_normalize_xtime(tk);
+ delta -= incr;
+ }
}
/**
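
timekeeping_forward_now() above now consumes a potentially huge cycle delta in slices of at most clock->max_cycles, so that each incr * mult product stays within 64 bits, normalizing xtime after every slice. The sketch below only illustrates the chunking pattern; unlike the kernel code it shifts each slice immediately instead of accumulating into xtime_nsec, so it trades a little precision for brevity.

#include <stdint.h>

/* Illustrative only: consume delta in overflow-safe slices. The kernel
 * instead adds incr * mult into xtime_nsec and normalizes, which keeps
 * the sub-nanosecond remainder across slices. */
static uint64_t forward_ns_chunked(uint64_t delta, uint64_t max_cycles,
                                   uint32_t mult, uint32_t shift)
{
        uint64_t nsec = 0;

        while (delta > 0) {
                uint64_t incr = delta < max_cycles ? delta : max_cycles;

                nsec += (incr * mult) >> shift;
                delta -= incr;
        }
        return nsec;
}
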
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3baf2fbe6848f0..e394d6d5b9b542 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2488,7 +2488,7 @@ void update_process_times(int user_tick)
if (in_irq())
irq_work_tick();
#endif
- scheduler_tick();
+ sched_tick();
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
run_posix_cpu_timers();
}
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index f0d5062d9cbc62..9193d6133e5d68 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -22,10 +22,16 @@ static inline void update_vdso_data(struct vdso_data *vdata,
u64 nsec, sec;
vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+ vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
+#endif
vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+ vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
+#endif
vdata[CS_RAW].mask = tk->tkr_raw.mask;
vdata[CS_RAW].mult = tk->tkr_raw.mult;
vdata[CS_RAW].shift = tk->tkr_raw.shift;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d7b2125503af3d..d12ff74889ed54 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -12,20 +12,25 @@
#define pr_fmt(fmt) "watchdog: " fmt
-#include <linux/mm.h>
#include <linux/cpu.h>
-#include <linux/nmi.h>
#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/kernel_stat.h>
+#include <linux/kvm_para.h>
+#include <linux/math64.h>
+#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/tick.h>
+
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
-#include <linux/stop_machine.h>
#include <asm/irq_regs.h>
-#include <linux/kvm_para.h>
static DEFINE_MUTEX(watchdog_mutex);
@@ -35,6 +40,8 @@ static DEFINE_MUTEX(watchdog_mutex);
# define WATCHDOG_HARDLOCKUP_DEFAULT 0
#endif
+#define NUM_SAMPLE_PERIODS 5
+
unsigned long __read_mostly watchdog_enabled;
int __read_mostly watchdog_user_enabled = 1;
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
@@ -333,6 +340,188 @@ __setup("watchdog_thresh=", watchdog_thresh_setup);
static void __lockup_detector_cleanup(void);
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
+enum stats_per_group {
+ STATS_SYSTEM,
+ STATS_SOFTIRQ,
+ STATS_HARDIRQ,
+ STATS_IDLE,
+ NUM_STATS_PER_GROUP,
+};
+
+static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = {
+ CPUTIME_SYSTEM,
+ CPUTIME_SOFTIRQ,
+ CPUTIME_IRQ,
+ CPUTIME_IDLE,
+};
+
+static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]);
+static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]);
+static DEFINE_PER_CPU(u8, cpustat_tail);
+
+/*
+ * We don't need nanosecond resolution. A granularity of 16ms is
+ * sufficient for our purposes, which allows us to store cpustats in a
+ * u16 that rolls over roughly every 1000 seconds.
+ * 2^24 ~= 16 * 10^6
+ */
+static u16 get_16bit_precision(u64 data_ns)
+{
+ return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */
+}
+
+static void update_cpustat(void)
+{
+ int i;
+ u8 util;
+ u16 old_stat, new_stat;
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+ u8 tail = __this_cpu_read(cpustat_tail);
+ u16 sample_period_16 = get_16bit_precision(sample_period);
+
+ kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
+
+ for (i = 0; i < NUM_STATS_PER_GROUP; i++) {
+ old_stat = __this_cpu_read(cpustat_old[i]);
+ new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
+ util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16);
+ __this_cpu_write(cpustat_util[tail][i], util);
+ __this_cpu_write(cpustat_old[i], new_stat);
+ }
+
+ __this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS);
+}
+
+static void print_cpustat(void)
+{
+ int i, group;
+ u8 tail = __this_cpu_read(cpustat_tail);
+ u64 sample_period_second = sample_period;
+
+ do_div(sample_period_second, NSEC_PER_SEC);
+
+ /*
+ * Printing the "watchdog" prefix on every line would be redundant; the
+ * original lockup report is enough to locate these lines in the log,
+ * so printk() is used here instead of pr_crit().
+ */
+ printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
+ smp_processor_id(), sample_period_second);
+
+ for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
+ group = (tail + i) % NUM_SAMPLE_PERIODS;
+ printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t"
+ "%3u%% hardirq,\t%3u%% idle\n", i + 1,
+ __this_cpu_read(cpustat_util[group][STATS_SYSTEM]),
+ __this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]),
+ __this_cpu_read(cpustat_util[group][STATS_HARDIRQ]),
+ __this_cpu_read(cpustat_util[group][STATS_IDLE]));
+ }
+}
+
+#define HARDIRQ_PERCENT_THRESH 50
+#define NUM_HARDIRQ_REPORT 5
+struct irq_counts {
+ int irq;
+ u32 counts;
+};
+
+static DEFINE_PER_CPU(bool, snapshot_taken);
+
+/* Tabulate the most frequent interrupts. */
+static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank)
+{
+ int i;
+ struct irq_counts new_count = {irq, counts};
+
+ for (i = 0; i < rank; i++) {
+ if (counts > irq_counts[i].counts)
+ swap(new_count, irq_counts[i]);
+ }
+}
+
+/*
+ * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period,
+ * then the cause of the softlockup might be an interrupt storm. In that
+ * case it is useful to start counting interrupts.
+ */
+static bool need_counting_irqs(void)
+{
+ u8 util;
+ int tail = __this_cpu_read(cpustat_tail);
+
+ tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS;
+ util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
+ return util > HARDIRQ_PERCENT_THRESH;
+}
+
+static void start_counting_irqs(void)
+{
+ if (!__this_cpu_read(snapshot_taken)) {
+ kstat_snapshot_irqs();
+ __this_cpu_write(snapshot_taken, true);
+ }
+}
+
+static void stop_counting_irqs(void)
+{
+ __this_cpu_write(snapshot_taken, false);
+}
+
+static void print_irq_counts(void)
+{
+ unsigned int i, count;
+ struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = {
+ {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}
+ };
+
+ if (__this_cpu_read(snapshot_taken)) {
+ for_each_active_irq(i) {
+ count = kstat_get_irq_since_snapshot(i);
+ tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT);
+ }
+
+ /*
+ * Printing the "watchdog" prefix on every line would be redundant; the
+ * original lockup report is enough to locate these lines in the log,
+ * so printk() is used here instead of pr_crit().
+ */
+ printk(KERN_CRIT "CPU#%d: HardIRQ time exceeds %d%%. Most frequent HardIRQs:\n",
+ smp_processor_id(), HARDIRQ_PERCENT_THRESH);
+
+ for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
+ if (irq_counts_sorted[i].irq == -1)
+ break;
+
+ printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
+ i + 1, irq_counts_sorted[i].counts,
+ irq_counts_sorted[i].irq);
+ }
+
+ /*
+ * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last
+ * sample_period, then we suspect the interrupt storm might be subsiding.
+ */
+ if (!need_counting_irqs())
+ stop_counting_irqs();
+ }
+}
+
+static void report_cpu_status(void)
+{
+ print_cpustat();
+ print_irq_counts();
+}
+#else
+static inline void update_cpustat(void) { }
+static inline void report_cpu_status(void) { }
+static inline bool need_counting_irqs(void) { return false; }
+static inline void start_counting_irqs(void) { }
+static inline void stop_counting_irqs(void) { }
+#endif
+
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
* lockups can have false positives under extreme conditions. So we generally
@@ -364,7 +553,7 @@ static void set_sample_period(void)
* and hard thresholds) to increment before the
* hardlockup detector generates a warning
*/
- sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS);
watchdog_update_hrtimer_threshold(sample_period);
}
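
To make the sampling arithmetic above concrete: assuming the default watchdog_thresh of 10 seconds and a softlockup threshold of twice that (both defined outside this excerpt), sample_period becomes 20s / NUM_SAMPLE_PERIODS = 4s. In 16-bit units of 2^24 ns (~16.8 ms) that is about 238, and a CPU that spent 2.4s of one period in hardirq context yields roughly 143 units, i.e. a reported utilization of about 61%, above the 50% storm threshold. A small standalone program doing the same arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Assumed defaults, not taken from this excerpt: softlockup
         * threshold = 2 * watchdog_thresh = 20s, NUM_SAMPLE_PERIODS = 5. */
        uint64_t sample_period = 20ULL * 1000000000ULL / 5;     /* 4s in ns */
        uint16_t period16 = sample_period >> 24;                /* ~238 units of ~16.8ms */
        uint16_t hardirq16 = 2400000000ULL >> 24;               /* 2.4s of hardirq time -> ~143 */
        unsigned int util = (100u * hardirq16 + period16 - 1) / period16;

        printf("hardirq utilization ~= %u%%\n", util);          /* prints 61 */
        return 0;
}
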
@@ -434,6 +623,18 @@ static int is_softlockup(unsigned long touch_ts,
unsigned long now)
{
if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
+ /*
+ * If period_ts has not been updated during one sample_period, it may
+ * stay unchanged for the next few sample_periods as well, which can
+ * indicate a developing softlockup. If an interrupt storm is suspected
+ * as the cause, start counting interrupts now so the storming interrupt
+ * can be identified later.
+ */
+ if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) &&
+ need_counting_irqs())
+ start_counting_irqs();
+
/* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
@@ -456,6 +657,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
static int softlockup_fn(void *data)
{
update_touch_ts();
+ stop_counting_irqs();
complete(this_cpu_ptr(&softlockup_completion));
return 0;
@@ -504,6 +706,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
+ update_cpustat();
+
/* Reset the interval when touched by known problematic code. */
if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -539,6 +743,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+ report_cpu_status();
print_modules();
print_irqtrace_events(current);
if (regs)
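
tabulate_irq_count() above keeps the per-CPU top-NUM_HARDIRQ_REPORT list sorted in descending order by bubbling each new (irq, count) pair through the array and displacing smaller entries as it goes. A self-contained userspace model of that ranking step; names and sample values are illustrative only.

#include <stdio.h>

struct irq_count { int irq; unsigned int counts; };

/* Same shape as tabulate_irq_count(): compare the incoming count against
 * each slot and swap the carried entry in when it is larger. */
static void rank(struct irq_count *top, int n, int irq, unsigned int counts)
{
        struct irq_count cur = { irq, counts };

        for (int i = 0; i < n; i++) {
                if (counts > top[i].counts) {
                        struct irq_count tmp = top[i];

                        top[i] = cur;
                        cur = tmp;
                }
        }
}

int main(void)
{
        struct irq_count top[3] = { {-1, 0}, {-1, 0}, {-1, 0} };

        rank(top, 3, 16, 120);
        rank(top, 3, 19, 4000);
        rank(top, 3, 23, 800);
        rank(top, 3, 42, 50);

        for (int i = 0; i < 3; i++)
                printf("#%d: irq %d, count %u\n", i + 1, top[i].irq, top[i].counts);
        return 0;
}

Slots left at irq == -1 mark unused entries, which is why print_irq_counts() stops printing at the first -1.
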
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0066c8f6c15442..f397510edc9b6e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1464,7 +1464,7 @@ void wq_worker_sleeping(struct task_struct *task)
* wq_worker_tick - a scheduler tick occurred while a kworker is running
* @task: task currently running
*
- * Called from scheduler_tick(). We're in the IRQ context and the current
+ * Called from sched_tick(). We're in the IRQ context and the current
* worker's fields which follow the 'K' locking rule can be accessed safely.
*/
void wq_worker_tick(struct task_struct *task)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c4b6b77f36bf81..3856e2d7e7ff59 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1061,6 +1061,20 @@ config SOFTLOCKUP_DETECTOR
chance to run. The current stack trace is displayed upon
detection and the system will stay locked up.
+config SOFTLOCKUP_DETECTOR_INTR_STORM
+ bool "Detect Interrupt Storm in Soft Lockups"
+ depends on SOFTLOCKUP_DETECTOR && IRQ_TIME_ACCOUNTING
+ select GENERIC_IRQ_STAT_SNAPSHOT
+ default y if NR_CPUS <= 128
+ help
+ Say Y here to let the kernel detect an interrupt storm
+ during a "soft lockup".
+
+ "Soft lockups" can have a variety of causes. If one is caused by
+ an interrupt storm, the storming interrupts will not show up on
+ the call stack. To detect this case, the kernel reports the CPU
+ utilization stats and the interrupt counts during the "soft lockup".
+
config BOOTPARAM_SOFTLOCKUP_PANIC
bool "Panic (Reboot) On Soft Lockups"
depends on SOFTLOCKUP_DETECTOR
@@ -1282,7 +1296,7 @@ config SCHED_INFO
config SCHEDSTATS
bool "Collect scheduler statistics"
- depends on DEBUG_KERNEL && PROC_FS
+ depends on PROC_FS
select SCHED_INFO
help
If you say Y here, additional code will be inserted into the
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 32f99e9a670e64..dacadd904250fb 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -116,6 +116,18 @@ unsigned long _find_first_and_bit(const unsigned long *addr1,
EXPORT_SYMBOL(_find_first_and_bit);
#endif
+/*
+ * Find the first set bit in three memory regions.
+ */
+unsigned long _find_first_and_and_bit(const unsigned long *addr1,
+ const unsigned long *addr2,
+ const unsigned long *addr3,
+ unsigned long size)
+{
+ return FIND_FIRST_BIT(addr1[idx] & addr2[idx] & addr3[idx], /* nop */, size);
+}
+EXPORT_SYMBOL(_find_first_and_and_bit);
+
#ifndef find_first_zero_bit
/*
* Find the first cleared bit in a memory region.
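
_find_first_and_and_bit() above returns the index of the first bit set in all three bitmaps, or size if there is none, ANDing the regions word by word inside FIND_FIRST_BIT(). A single-word userspace model of the same idea, illustrative only:

#include <stdint.h>

static unsigned int first_and_and_bit64(uint64_t a, uint64_t b, uint64_t c,
                                        unsigned int size)
{
        uint64_t word = a & b & c;
        unsigned int bit;

        if (!word)
                return size;            /* convention: "not found" == size */

        bit = (unsigned int)__builtin_ctzll(word);
        return bit < size ? bit : size;
}

The multi-word kernel version presumably backs cpumask helpers that pick a CPU in the intersection of three masks without building a temporary mask.
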
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
index d883ac29950897..c46c2300517c94 100644
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -30,4 +30,11 @@ config GENERIC_VDSO_TIME_NS
Selected by architectures which support time namespaces in the
VDSO
+config GENERIC_VDSO_OVERFLOW_PROTECT
+ bool
+ help
+ Select to add multiplication overflow protection to the VDSO
+ time getter functions for the price of an extra conditional
+ in the hotpath.
+
endif
diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index ce2f6955200321..899850bd6f0bee 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -5,15 +5,23 @@
#include <vdso/datapage.h>
#include <vdso/helpers.h>
-#ifndef vdso_calc_delta
-/*
- * Default implementation which works for all sane clocksources. That
- * obviously excludes x86/TSC.
- */
-static __always_inline
-u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+#ifndef vdso_calc_ns
+
+#ifdef VDSO_DELTA_NOMASK
+# define VDSO_DELTA_MASK(vd) ULLONG_MAX
+#else
+# define VDSO_DELTA_MASK(vd) (vd->mask)
+#endif
+
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+static __always_inline bool vdso_delta_ok(const struct vdso_data *vd, u64 delta)
+{
+ return delta < vd->max_cycles;
+}
+#else
+static __always_inline bool vdso_delta_ok(const struct vdso_data *vd, u64 delta)
{
- return ((cycles - last) & mask) * mult;
+ return true;
}
#endif
@@ -24,6 +32,21 @@ static __always_inline u64 vdso_shift_ns(u64 ns, u32 shift)
}
#endif
+/*
+ * Default implementation which works for all sane clocksources. That
+ * obviously excludes x86/TSC.
+ */
+static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base)
+{
+ u64 delta = (cycles - vd->cycle_last) & VDSO_DELTA_MASK(vd);
+
+ if (likely(vdso_delta_ok(vd, delta)))
+ return vdso_shift_ns((delta * vd->mult) + base, vd->shift);
+
+ return mul_u64_u32_add_u64_shr(delta, vd->mult, base, vd->shift);
+}
+#endif /* vdso_calc_ns */
+
#ifndef __arch_vdso_hres_capable
static inline bool __arch_vdso_hres_capable(void)
{
@@ -49,10 +72,10 @@ static inline bool vdso_cycles_ok(u64 cycles)
static __always_inline int do_hres_timens(const struct vdso_data *vdns, clockid_t clk,
struct __kernel_timespec *ts)
{
- const struct vdso_data *vd;
const struct timens_offset *offs = &vdns->offset[clk];
const struct vdso_timestamp *vdso_ts;
- u64 cycles, last, ns;
+ const struct vdso_data *vd;
+ u64 cycles, ns;
u32 seq;
s64 sec;
@@ -73,10 +96,7 @@ static __always_inline int do_hres_timens(const struct vdso_data *vdns, clockid_
cycles = __arch_get_hw_counter(vd->clock_mode, vd);
if (unlikely(!vdso_cycles_ok(cycles)))
return -1;
- ns = vdso_ts->nsec;
- last = vd->cycle_last;
- ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
- ns = vdso_shift_ns(ns, vd->shift);
+ ns = vdso_calc_ns(vd, cycles, vdso_ts->nsec);
sec = vdso_ts->sec;
} while (unlikely(vdso_read_retry(vd, seq)));
@@ -111,7 +131,7 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
struct __kernel_timespec *ts)
{
const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
- u64 cycles, last, sec, ns;
+ u64 cycles, sec, ns;
u32 seq;
/* Allows to compile the high resolution parts out */
@@ -144,10 +164,7 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
cycles = __arch_get_hw_counter(vd->clock_mode, vd);
if (unlikely(!vdso_cycles_ok(cycles)))
return -1;
- ns = vdso_ts->nsec;
- last = vd->cycle_last;
- ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
- ns = vdso_shift_ns(ns, vd->shift);
+ ns = vdso_calc_ns(vd, cycles, vdso_ts->nsec);
sec = vdso_ts->sec;
} while (unlikely(vdso_read_retry(vd, seq)));
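
vdso_calc_ns() above takes the fast (delta * mult + base) >> shift path only while delta stays below max_cycles; otherwise it falls back to mul_u64_u32_add_u64_shr(), which evaluates the same expression with a wide intermediate so the multiplication cannot overflow. A userspace approximation of both paths, assuming a compiler that provides __uint128_t (the kernel helper lives in math64.h and is not part of this diff):

#include <stdint.h>

/* Wide-multiply fallback: (a * mul + add) >> shift without 64-bit overflow. */
static uint64_t mul_u64_u32_add_u64_shr_model(uint64_t a, uint32_t mul,
                                              uint64_t add, unsigned int shift)
{
        return (uint64_t)(((__uint128_t)a * mul + add) >> shift);
}

static uint64_t vdso_calc_ns_model(uint64_t cycles, uint64_t cycle_last,
                                   uint64_t mask, uint64_t max_cycles,
                                   uint32_t mult, uint32_t shift, uint64_t base)
{
        uint64_t delta = (cycles - cycle_last) & mask;

        if (delta < max_cycles)                 /* fast path, fits in 64 bits */
                return ((delta * mult) + base) >> shift;

        return mul_u64_u32_add_u64_shr_model(delta, mult, base, shift);
}

With CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT disabled, vdso_delta_ok() is constantly true and only the fast path remains, which is the "price of an extra conditional" mentioned in the new Kconfig help text.
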
diff --git a/rust/kernel/time.rs b/rust/kernel/time.rs
index 25a896eed4689f..6811d5cadbd40d 100644
--- a/rust/kernel/time.rs
+++ b/rust/kernel/time.rs
@@ -5,6 +5,9 @@
//! This module contains the kernel APIs related to time and timers that
//! have been ported or wrapped for usage by Rust code in the kernel.
+/// The number of nanoseconds per millisecond.
+pub const NSEC_PER_MSEC: i64 = bindings::NSEC_PER_MSEC as i64;
+
/// The time unit of Linux kernel. One jiffy equals (1/HZ) second.
pub type Jiffies = core::ffi::c_ulong;
@@ -18,3 +21,60 @@ pub fn msecs_to_jiffies(msecs: Msecs) -> Jiffies {
// matter what the argument is.
unsafe { bindings::__msecs_to_jiffies(msecs) }
}
+
+/// A Rust wrapper around a `ktime_t`.
+#[repr(transparent)]
+#[derive(Copy, Clone)]
+pub struct Ktime {
+ inner: bindings::ktime_t,
+}
+
+impl Ktime {
+ /// Create a `Ktime` from a raw `ktime_t`.
+ #[inline]
+ pub fn from_raw(inner: bindings::ktime_t) -> Self {
+ Self { inner }
+ }
+
+ /// Get the current time using `CLOCK_MONOTONIC`.
+ #[inline]
+ pub fn ktime_get() -> Self {
+ // SAFETY: It is always safe to call `ktime_get` outside of NMI context.
+ Self::from_raw(unsafe { bindings::ktime_get() })
+ }
+
+ /// Divide the number of nanoseconds by a compile-time constant.
+ #[inline]
+ fn divns_constant<const DIV: i64>(self) -> i64 {
+ self.to_ns() / DIV
+ }
+
+ /// Returns the number of nanoseconds.
+ #[inline]
+ pub fn to_ns(self) -> i64 {
+ self.inner
+ }
+
+ /// Returns the number of milliseconds.
+ #[inline]
+ pub fn to_ms(self) -> i64 {
+ self.divns_constant::<NSEC_PER_MSEC>()
+ }
+}
+
+/// Returns the number of milliseconds between two ktimes.
+#[inline]
+pub fn ktime_ms_delta(later: Ktime, earlier: Ktime) -> i64 {
+ (later - earlier).to_ms()
+}
+
+impl core::ops::Sub for Ktime {
+ type Output = Ktime;
+
+ #[inline]
+ fn sub(self, other: Ktime) -> Ktime {
+ Self {
+ inner: self.inner - other.inner,
+ }
+ }
+}
diff --git a/scripts/gdb/linux/interrupts.py b/scripts/gdb/linux/interrupts.py
index 66ae5c7690cf17..616a5f26377a8c 100644
--- a/scripts/gdb/linux/interrupts.py
+++ b/scripts/gdb/linux/interrupts.py
@@ -37,7 +37,7 @@ def show_irq_desc(prec, irq):
any_count = 0
if desc['kstat_irqs']:
for cpu in cpus.each_online_cpu():
- any_count += cpus.per_cpu(desc['kstat_irqs'], cpu)
+ any_count += cpus.per_cpu(desc['kstat_irqs'], cpu)['cnt']
if (desc['action'] == 0 or irq_desc_is_chained(desc)) and any_count == 0:
return text;
@@ -45,7 +45,7 @@ def show_irq_desc(prec, irq):
text += "%*d: " % (prec, irq)
for cpu in cpus.each_online_cpu():
if desc['kstat_irqs']:
- count = cpus.per_cpu(desc['kstat_irqs'], cpu)
+ count = cpus.per_cpu(desc['kstat_irqs'], cpu)['cnt']
else:
count = 0
text += "%10u" % (count)
@@ -177,7 +177,7 @@ def arm_common_show_interrupts(prec):
if desc == 0:
continue
for cpu in cpus.each_online_cpu():
- text += "%10u" % (cpus.per_cpu(desc['kstat_irqs'], cpu))
+ text += "%10u" % (cpus.per_cpu(desc['kstat_irqs'], cpu)['cnt'])
text += " %s" % (ipi_types[ipi].string())
text += "\n"
return text
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_skip.c b/tools/testing/selftests/bpf/prog_tests/perf_skip.c
new file mode 100644
index 00000000000000..37d8618800e477
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/perf_skip.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <test_progs.h>
+#include "test_perf_skip.skel.h"
+#include <linux/compiler.h>
+#include <linux/hw_breakpoint.h>
+#include <sys/mman.h>
+
+#ifndef TRAP_PERF
+#define TRAP_PERF 6
+#endif
+
+int sigio_count, sigtrap_count;
+
+static void handle_sigio(int sig __always_unused)
+{
+ ++sigio_count;
+}
+
+static void handle_sigtrap(int signum __always_unused,
+ siginfo_t *info,
+ void *ucontext __always_unused)
+{
+ ASSERT_EQ(info->si_code, TRAP_PERF, "si_code");
+ ++sigtrap_count;
+}
+
+static noinline int test_function(void)
+{
+ asm volatile ("");
+ return 0;
+}
+
+void serial_test_perf_skip(void)
+{
+ struct sigaction action = {};
+ struct sigaction previous_sigtrap;
+ sighandler_t previous_sigio = SIG_ERR;
+ struct test_perf_skip *skel = NULL;
+ struct perf_event_attr attr = {};
+ int perf_fd = -1;
+ int err;
+ struct f_owner_ex owner;
+ struct bpf_link *prog_link = NULL;
+
+ action.sa_flags = SA_SIGINFO | SA_NODEFER;
+ action.sa_sigaction = handle_sigtrap;
+ sigemptyset(&action.sa_mask);
+ if (!ASSERT_OK(sigaction(SIGTRAP, &action, &previous_sigtrap), "sigaction"))
+ return;
+
+ previous_sigio = signal(SIGIO, handle_sigio);
+ if (!ASSERT_NEQ(previous_sigio, SIG_ERR, "signal"))
+ goto cleanup;
+
+ skel = test_perf_skip__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_load"))
+ goto cleanup;
+
+ attr.type = PERF_TYPE_BREAKPOINT;
+ attr.size = sizeof(attr);
+ attr.bp_type = HW_BREAKPOINT_X;
+ attr.bp_addr = (uintptr_t)test_function;
+ attr.bp_len = sizeof(long);
+ attr.sample_period = 1;
+ attr.sample_type = PERF_SAMPLE_IP;
+ attr.pinned = 1;
+ attr.exclude_kernel = 1;
+ attr.exclude_hv = 1;
+ attr.precise_ip = 3;
+ attr.sigtrap = 1;
+ attr.remove_on_exec = 1;
+
+ perf_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
+ if (perf_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) {
+ printf("SKIP:no PERF_TYPE_BREAKPOINT/HW_BREAKPOINT_X\n");
+ test__skip();
+ goto cleanup;
+ }
+ if (!ASSERT_OK(perf_fd < 0, "perf_event_open"))
+ goto cleanup;
+
+ /* Configure the perf event to signal on sample. */
+ err = fcntl(perf_fd, F_SETFL, O_ASYNC);
+ if (!ASSERT_OK(err, "fcntl(F_SETFL, O_ASYNC)"))
+ goto cleanup;
+
+ owner.type = F_OWNER_TID;
+ owner.pid = syscall(__NR_gettid);
+ err = fcntl(perf_fd, F_SETOWN_EX, &owner);
+ if (!ASSERT_OK(err, "fcntl(F_SETOWN_EX)"))
+ goto cleanup;
+
+ /* Allow at most one sample. A sample rejected by bpf should
+ * not count against this.
+ */
+ err = ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1);
+ if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_REFRESH)"))
+ goto cleanup;
+
+ prog_link = bpf_program__attach_perf_event(skel->progs.handler, perf_fd);
+ if (!ASSERT_OK_PTR(prog_link, "bpf_program__attach_perf_event"))
+ goto cleanup;
+
+ /* Configure the bpf program to suppress the sample. */
+ skel->bss->ip = (uintptr_t)test_function;
+ test_function();
+
+ ASSERT_EQ(sigio_count, 0, "sigio_count");
+ ASSERT_EQ(sigtrap_count, 0, "sigtrap_count");
+
+ /* Configure the bpf program to allow the sample. */
+ skel->bss->ip = 0;
+ test_function();
+
+ ASSERT_EQ(sigio_count, 1, "sigio_count");
+ ASSERT_EQ(sigtrap_count, 1, "sigtrap_count");
+
+ /* Test that the sample above is the only one allowed (by perf, not
+ * by bpf)
+ */
+ test_function();
+
+ ASSERT_EQ(sigio_count, 1, "sigio_count");
+ ASSERT_EQ(sigtrap_count, 1, "sigtrap_count");
+
+cleanup:
+ bpf_link__destroy(prog_link);
+ if (perf_fd >= 0)
+ close(perf_fd);
+ test_perf_skip__destroy(skel);
+
+ if (previous_sigio != SIG_ERR)
+ signal(SIGIO, previous_sigio);
+ sigaction(SIGTRAP, &previous_sigtrap, NULL);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_perf_skip.c b/tools/testing/selftests/bpf/progs/test_perf_skip.c
new file mode 100644
index 00000000000000..7eb8b6de7a5708
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_perf_skip.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+uintptr_t ip;
+
+SEC("perf_event")
+int handler(struct bpf_perf_event_data *data)
+{
+ /* Skip events that have the correct ip. */
+ return ip != PT_REGS_IP(&data->regs);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
index 25432b8cd5bd25..073a748b9380a1 100644
--- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
@@ -19,7 +19,7 @@ fail() { # mesg
FILTER=set_ftrace_filter
FUNC1="schedule"
-FUNC2="scheduler_tick"
+FUNC2="sched_tick"
ALL_FUNCS="#### all functions enabled ####"
diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore
index 790c47001e77e3..ee93dc4969b8b5 100644
--- a/tools/testing/selftests/perf_events/.gitignore
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
sigtrap_threads
remove_on_exec
+watermark_signal
diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile
index db93c4ff081a45..70e3ff21127890 100644
--- a/tools/testing/selftests/perf_events/Makefile
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -2,5 +2,5 @@
CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
LDFLAGS += -lpthread
-TEST_GEN_PROGS := sigtrap_threads remove_on_exec
+TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal
include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c
new file mode 100644
index 00000000000000..49dc1e83117493
--- /dev/null
+++ b/tools/testing/selftests/perf_events/watermark_signal.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/perf_event.h>
+#include <stddef.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#define __maybe_unused __attribute__((__unused__))
+
+static int sigio_count;
+
+static void handle_sigio(int signum __maybe_unused,
+ siginfo_t *oh __maybe_unused,
+ void *uc __maybe_unused)
+{
+ ++sigio_count;
+}
+
+static void do_child(void)
+{
+ raise(SIGSTOP);
+
+ for (int i = 0; i < 20; ++i)
+ sleep(1);
+
+ raise(SIGSTOP);
+
+ exit(0);
+}
+
+TEST(watermark_signal)
+{
+ struct perf_event_attr attr;
+ struct perf_event_mmap_page *p = NULL;
+ struct sigaction previous_sigio, sigio = { 0 };
+ pid_t child = -1;
+ int child_status;
+ int fd = -1;
+ long page_size = sysconf(_SC_PAGE_SIZE);
+
+ sigio.sa_sigaction = handle_sigio;
+ EXPECT_EQ(sigaction(SIGIO, &sigio, &previous_sigio), 0);
+
+ memset(&attr, 0, sizeof(attr));
+ attr.size = sizeof(attr);
+ attr.type = PERF_TYPE_SOFTWARE;
+ attr.config = PERF_COUNT_SW_DUMMY;
+ attr.sample_period = 1;
+ attr.disabled = 1;
+ attr.watermark = 1;
+ attr.context_switch = 1;
+ attr.wakeup_watermark = 1;
+
+ child = fork();
+ EXPECT_GE(child, 0);
+ if (child == 0)
+ do_child();
+ else if (child < 0) {
+ perror("fork()");
+ goto cleanup;
+ }
+
+ if (waitpid(child, &child_status, WSTOPPED) != child ||
+ !(WIFSTOPPED(child_status) && WSTOPSIG(child_status) == SIGSTOP)) {
+ fprintf(stderr,
+ "failed to synchronize with child errno=%d status=%x\n",
+ errno,
+ child_status);
+ goto cleanup;
+ }
+
+ fd = syscall(__NR_perf_event_open, &attr, child, -1, -1,
+ PERF_FLAG_FD_CLOEXEC);
+ if (fd < 0) {
+ fprintf(stderr, "failed opening event %llx\n", attr.config);
+ goto cleanup;
+ }
+
+ if (fcntl(fd, F_SETFL, FASYNC)) {
+ perror("F_SETFL FASYNC");
+ goto cleanup;
+ }
+
+ if (fcntl(fd, F_SETOWN, getpid())) {
+ perror("F_SETOWN getpid()");
+ goto cleanup;
+ }
+
+ if (fcntl(fd, F_SETSIG, SIGIO)) {
+ perror("F_SETSIG SIGIO");
+ goto cleanup;
+ }
+
+ p = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (p == MAP_FAILED) {
+ perror("mmap");
+ goto cleanup;
+ }
+
+ if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) {
+ perror("PERF_EVENT_IOC_ENABLE");
+ goto cleanup;
+ }
+
+ if (kill(child, SIGCONT) < 0) {
+ perror("SIGCONT");
+ goto cleanup;
+ }
+
+ if (waitpid(child, &child_status, WSTOPPED) != -1 || errno != EINTR)
+ fprintf(stderr,
+ "expected SIGIO to terminate wait errno=%d status=%x sigio_count=%d\n",
+ errno,
+ child_status,
+ sigio_count);
+
+ EXPECT_GE(sigio_count, 1);
+
+cleanup:
+ if (p != NULL)
+ munmap(p, 2 * page_size);
+
+ if (fd >= 0)
+ close(fd);
+
+ if (child > 0) {
+ kill(child, SIGKILL);
+ waitpid(child, NULL, 0);
+ }
+
+ sigaction(SIGIO, &previous_sigio, NULL);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c
index d884fd69dd510b..95aad6d8849beb 100644
--- a/tools/testing/selftests/x86/amx.c
+++ b/tools/testing/selftests/x86/amx.c
@@ -103,21 +103,6 @@ static void clearhandler(int sig)
#define CPUID_LEAF1_ECX_XSAVE_MASK (1 << 26)
#define CPUID_LEAF1_ECX_OSXSAVE_MASK (1 << 27)
-static inline void check_cpuid_xsave(void)
-{
- uint32_t eax, ebx, ecx, edx;
-
- /*
- * CPUID.1:ECX.XSAVE[bit 26] enumerates general
- * support for the XSAVE feature set, including
- * XGETBV.
- */
- __cpuid_count(1, 0, eax, ebx, ecx, edx);
- if (!(ecx & CPUID_LEAF1_ECX_XSAVE_MASK))
- fatal_error("cpuid: no CPU xsave support");
- if (!(ecx & CPUID_LEAF1_ECX_OSXSAVE_MASK))
- fatal_error("cpuid: no OS xsave support");
-}
static uint32_t xbuf_size;
@@ -350,6 +335,7 @@ enum expected_result { FAIL_EXPECTED, SUCCESS_EXPECTED };
/* arch_prctl() and sigaltstack() test */
+#define ARCH_GET_XCOMP_SUPP 0x1021
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
@@ -928,8 +914,15 @@ static void test_ptrace(void)
int main(void)
{
- /* Check hardware availability at first */
- check_cpuid_xsave();
+ unsigned long features;
+ long rc;
+
+ rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &features);
+ if (rc || (features & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE) {
+ ksft_print_msg("no AMX support\n");
+ return KSFT_SKIP;
+ }
+
check_cpuid_xtiledata();
init_stashed_xsave();
diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c
index 215b8150b7ccad..c0f016f45ee172 100644
--- a/tools/testing/selftests/x86/lam.c
+++ b/tools/testing/selftests/x86/lam.c
@@ -1183,7 +1183,7 @@ int main(int argc, char **argv)
if (!cpu_has_lam()) {
ksft_print_msg("Unsupported LAM feature!\n");
- return -1;
+ return KSFT_SKIP;
}
while ((c = getopt(argc, argv, "ht:")) != -1) {