aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-03-14 12:35:02 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-03-14 12:35:02 -0700
commit9d0c8e793f0eb0613efe81d2cdca8c2efa0ad33c (patch)
tree2483edd3c314bb118fb76f1d8c9d9bd73f9c2259
parent50eb842fe517b2765b7748c3016082b484a6dbb8 (diff)
parent35737d2db2f4567106c90060ad110b27cb354fa4 (diff)
downloadlinux-9d0c8e793f0eb0613efe81d2cdca8c2efa0ad33c.tar.gz
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini: "More fixes for ARM and x86" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: LAPIC: Advancing the timer expiration on guest initiated write KVM: x86/mmu: Skip !MMU-present SPTEs when removing SP in exclusive mode KVM: kvmclock: Fix vCPUs > 64 can't be online/hotpluged kvm: x86: annotate RCU pointers KVM: arm64: Fix exclusive limit for IPA size KVM: arm64: Reject VM creation when the default IPA size is unsupported KVM: arm64: Ensure I-cache isolation between vcpus of a same VM KVM: arm64: Don't use cbz/adr with external symbols KVM: arm64: Fix range alignment when walking page tables KVM: arm64: Workaround firmware wrongly advertising GICv2-on-v3 compatibility KVM: arm64: Rename __vgic_v3_get_ich_vtr_el2() to __vgic_v3_get_gic_config() KVM: arm64: Don't access PMSELR_EL0/PMUSERENR_EL0 when no PMU is available KVM: arm64: Turn kvm_arm_support_pmu_v3() into a static key KVM: arm64: Fix nVHE hyp panic host context restore KVM: arm64: Avoid corrupting vCPU context register in guest exit KVM: arm64: nvhe: Save the SPE context early kvm: x86: use NULL instead of using plain integer as pointer KVM: SVM: Connect 'npt' module param to KVM's internal 'npt_enabled' KVM: x86: Ensure deadline timer has truly expired before posting its IRQ
-rw-r--r--Documentation/virt/kvm/api.rst3
-rw-r--r--arch/arm64/include/asm/kvm_asm.h8
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h8
-rw-r--r--arch/arm64/kernel/image-vars.h3
-rw-r--r--arch/arm64/kvm/arm.c7
-rw-r--r--arch/arm64/kvm/hyp/entry.S8
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h9
-rw-r--r--arch/arm64/kvm/hyp/nvhe/debug-sr.c12
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S15
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c12
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c14
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c3
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c1
-rw-r--r--arch/arm64/kvm/hyp/vgic-v3-sr.c40
-rw-r--r--arch/arm64/kvm/hyp/vhe/tlb.c3
-rw-r--r--arch/arm64/kvm/mmu.c3
-rw-r--r--arch/arm64/kvm/perf.c10
-rw-r--r--arch/arm64/kvm/pmu-emul.c10
-rw-r--r--arch/arm64/kvm/reset.c12
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c12
-rw-r--r--arch/x86/include/asm/kvm_host.h4
-rw-r--r--arch/x86/kernel/kvmclock.c19
-rw-r--r--arch/x86/kvm/lapic.c12
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c11
-rw-r--r--arch/x86/kvm/svm/svm.c25
-rw-r--r--arch/x86/kvm/x86.c2
-rw-r--r--include/kvm/arm_pmu.h9
27 files changed, 194 insertions, 81 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 1a2b5210cdbf58..38e327d4b47903 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -182,6 +182,9 @@ is dependent on the CPU capability and the kernel configuration. The limit can
be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION
ioctl() at run-time.
+Creation of the VM will fail if the requested IPA size (whether it is
+implicit or explicit) is unsupported on the host.
+
Please note that configuring the IPA size does not affect the capability
exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects
size of the address translated by the stage2 level (guest physical to
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 22d933e9b59e52..a7ab84f781f73d 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -47,10 +47,10 @@
#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4
-#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5
+#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5
#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6
#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7
-#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config 8
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11
@@ -183,16 +183,16 @@ DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
#define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
extern void __kvm_flush_vm_context(void);
+extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
int level);
extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
-extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu);
extern void __kvm_timer_set_cntvoff(u64 cntvoff);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
-extern u64 __vgic_v3_get_ich_vtr_el2(void);
+extern u64 __vgic_v3_get_gic_config(void);
extern u64 __vgic_v3_read_vmcr(void);
extern void __vgic_v3_write_vmcr(u32 vmcr);
extern void __vgic_v3_init_lrs(void);
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index c0450828378b52..32ae676236b6b4 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -83,6 +83,11 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt);
void __debug_switch_to_guest(struct kvm_vcpu *vcpu);
void __debug_switch_to_host(struct kvm_vcpu *vcpu);
+#ifdef __KVM_NVHE_HYPERVISOR__
+void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu);
+void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu);
+#endif
+
void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
@@ -97,7 +102,8 @@ bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
void __noreturn hyp_panic(void);
#ifdef __KVM_NVHE_HYPERVISOR__
-void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
+void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
+ u64 elr, u64 par);
#endif
#endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 23f1a557bd9f04..5aa9ed1e9ec618 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -101,6 +101,9 @@ KVM_NVHE_ALIAS(__stop___kvm_ex_table);
/* Array containing bases of nVHE per-CPU memory regions. */
KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
+/* PMU available static key */
+KVM_NVHE_ALIAS(kvm_arm_pmu_available);
+
#endif /* CONFIG_KVM */
#endif /* __ARM64_KERNEL_IMAGE_VARS_H */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index fc4c95dd2d2617..7f06ba76698d84 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -385,11 +385,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
/*
+ * We guarantee that both TLBs and I-cache are private to each
+ * vcpu. If detecting that a vcpu from the same VM has
+ * previously run on the same physical CPU, call into the
+ * hypervisor code to nuke the relevant contexts.
+ *
* We might get preempted before the vCPU actually runs, but
* over-invalidation doesn't affect correctness.
*/
if (*last_ran != vcpu->vcpu_id) {
- kvm_call_hyp(__kvm_tlb_flush_local_vmid, mmu);
+ kvm_call_hyp(__kvm_flush_cpu_context, mmu);
*last_ran = vcpu->vcpu_id;
}
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index b0afad7a99c6e8..e831d3dfd50d7c 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -85,8 +85,10 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
// If the hyp context is loaded, go straight to hyp_panic
get_loaded_vcpu x0, x1
- cbz x0, hyp_panic
+ cbnz x0, 1f
+ b hyp_panic
+1:
// The hyp context is saved so make sure it is restored to allow
// hyp_panic to run at hyp and, subsequently, panic to run in the host.
// This makes use of __guest_exit to avoid duplication but sets the
@@ -94,7 +96,7 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
// current state is saved to the guest context but it will only be
// accurate if the guest had been completely restored.
adr_this_cpu x0, kvm_hyp_ctxt, x1
- adr x1, hyp_panic
+ adr_l x1, hyp_panic
str x1, [x0, #CPU_XREG_OFFSET(30)]
get_vcpu_ptr x1, x0
@@ -146,7 +148,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// Now restore the hyp regs
restore_callee_saved_regs x2
- set_loaded_vcpu xzr, x1, x2
+ set_loaded_vcpu xzr, x2, x3
alternative_if ARM64_HAS_RAS_EXTN
// If we have the RAS extensions we can consume a pending error
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 54f4860cd87c06..6c1f51f25eb340 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -90,15 +90,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
* counter, which could make a PMXEVCNTR_EL0 access UNDEF at
* EL1 instead of being trapped to EL2.
*/
- write_sysreg(0, pmselr_el0);
- write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
+ if (kvm_arm_support_pmu_v3()) {
+ write_sysreg(0, pmselr_el0);
+ write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
+ }
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
}
static inline void __deactivate_traps_common(void)
{
write_sysreg(0, hstr_el2);
- write_sysreg(0, pmuserenr_el0);
+ if (kvm_arm_support_pmu_v3())
+ write_sysreg(0, pmuserenr_el0);
}
static inline void ___activate_traps(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index 91a711aa8382e1..f401724f12ef75 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -58,16 +58,24 @@ static void __debug_restore_spe(u64 pmscr_el1)
write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
}
-void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
+void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
/* Disable and flush SPE data generation */
__debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
+}
+
+void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
+{
__debug_switch_to_guest_common(vcpu);
}
-void __debug_switch_to_host(struct kvm_vcpu *vcpu)
+void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
__debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+}
+
+void __debug_switch_to_host(struct kvm_vcpu *vcpu)
+{
__debug_switch_to_host_common(vcpu);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 6585a7cbbc5662..5d94584840ccfe 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -71,7 +71,8 @@ SYM_FUNC_START(__host_enter)
SYM_FUNC_END(__host_enter)
/*
- * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
+ * void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
+ * u64 elr, u64 par);
*/
SYM_FUNC_START(__hyp_do_panic)
/* Prepare and exit to the host's panic funciton. */
@@ -82,9 +83,11 @@ SYM_FUNC_START(__hyp_do_panic)
hyp_kimg_va lr, x6
msr elr_el2, lr
- /* Set the panic format string. Use the, now free, LR as scratch. */
- ldr lr, =__hyp_panic_string
- hyp_kimg_va lr, x6
+ mov x29, x0
+
+ /* Load the format string into x0 and arguments into x1-7 */
+ ldr x0, =__hyp_panic_string
+ hyp_kimg_va x0, x6
/* Load the format arguments into x1-7. */
mov x6, x3
@@ -94,9 +97,7 @@ SYM_FUNC_START(__hyp_do_panic)
mrs x5, hpfar_el2
/* Enter the host, conditionally restoring the host context. */
- cmp x0, xzr
- mov x0, lr
- b.eq __host_enter_without_restoring
+ cbz x29, __host_enter_without_restoring
b __host_enter_for_panic
SYM_FUNC_END(__hyp_do_panic)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index f012f8665ecc1c..936328207bde02 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -46,11 +46,11 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
__kvm_tlb_flush_vmid(kern_hyp_va(mmu));
}
-static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt)
+static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
- __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
+ __kvm_flush_cpu_context(kern_hyp_va(mmu));
}
static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt)
@@ -67,9 +67,9 @@ static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt)
write_sysreg_el2(tmp, SYS_SCTLR);
}
-static void handle___vgic_v3_get_ich_vtr_el2(struct kvm_cpu_context *host_ctxt)
+static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt)
{
- cpu_reg(host_ctxt, 1) = __vgic_v3_get_ich_vtr_el2();
+ cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config();
}
static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt)
@@ -115,10 +115,10 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
HANDLE_FUNC(__kvm_tlb_flush_vmid),
- HANDLE_FUNC(__kvm_tlb_flush_local_vmid),
+ HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
HANDLE_FUNC(__kvm_enable_ssbs),
- HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2),
+ HANDLE_FUNC(__vgic_v3_get_gic_config),
HANDLE_FUNC(__vgic_v3_read_vmcr),
HANDLE_FUNC(__vgic_v3_write_vmcr),
HANDLE_FUNC(__vgic_v3_init_lrs),
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index f3d0e9eca56cd2..68ab6b4d514140 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -192,6 +192,14 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
pmu_switch_needed = __pmu_switch_to_guest(host_ctxt);
__sysreg_save_state_nvhe(host_ctxt);
+ /*
+ * We must flush and disable the SPE buffer for nVHE, as
+ * the translation regime(EL1&0) is going to be loaded with
+ * that of the guest. And we must do this before we change the
+ * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and
+ * before we load guest Stage1.
+ */
+ __debug_save_host_buffers_nvhe(vcpu);
__adjust_pc(vcpu);
@@ -234,11 +242,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED)
__fpsimd_save_fpexc32(vcpu);
+ __debug_switch_to_host(vcpu);
/*
* This must come after restoring the host sysregs, since a non-VHE
* system may enable SPE here and make use of the TTBRs.
*/
- __debug_switch_to_host(vcpu);
+ __debug_restore_host_buffers_nvhe(vcpu);
if (pmu_switch_needed)
__pmu_switch_to_host(host_ctxt);
@@ -257,7 +266,6 @@ void __noreturn hyp_panic(void)
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg_par();
- bool restore_host = true;
struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
@@ -271,7 +279,7 @@ void __noreturn hyp_panic(void)
__sysreg_restore_state_nvhe(host_ctxt);
}
- __hyp_do_panic(restore_host, spsr, elr, par);
+ __hyp_do_panic(host_ctxt, spsr, elr, par);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index fbde89a2c6e838..229b06748c2084 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -123,7 +123,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
__tlb_switch_to_host(&cxt);
}
-void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
+void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
@@ -131,6 +131,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
__tlb_switch_to_guest(mmu, &cxt);
__tlbi(vmalle1);
+ asm volatile("ic iallu");
dsb(nsh);
isb();
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 4d177ce1d536f3..926fc07074f57b 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -223,6 +223,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
goto out;
if (!table) {
+ data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
data->addr += kvm_granule_size(level);
goto out;
}
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 80406f463c28fd..ee3682b9873c44 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -405,9 +405,45 @@ void __vgic_v3_init_lrs(void)
__gic_v3_set_lr(0, i);
}
-u64 __vgic_v3_get_ich_vtr_el2(void)
+/*
+ * Return the GIC CPU configuration:
+ * - [31:0] ICH_VTR_EL2
+ * - [62:32] RES0
+ * - [63] MMIO (GICv2) capable
+ */
+u64 __vgic_v3_get_gic_config(void)
{
- return read_gicreg(ICH_VTR_EL2);
+ u64 val, sre = read_gicreg(ICC_SRE_EL1);
+ unsigned long flags = 0;
+
+ /*
+ * To check whether we have a MMIO-based (GICv2 compatible)
+ * CPU interface, we need to disable the system register
+ * view. To do that safely, we have to prevent any interrupt
+ * from firing (which would be deadly).
+ *
+ * Note that this only makes sense on VHE, as interrupts are
+ * already masked for nVHE as part of the exception entry to
+ * EL2.
+ */
+ if (has_vhe())
+ flags = local_daif_save();
+
+ write_gicreg(0, ICC_SRE_EL1);
+ isb();
+
+ val = read_gicreg(ICC_SRE_EL1);
+
+ write_gicreg(sre, ICC_SRE_EL1);
+ isb();
+
+ if (has_vhe())
+ local_daif_restore(flags);
+
+ val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63);
+ val |= read_gicreg(ICH_VTR_EL2);
+
+ return val;
}
u64 __vgic_v3_read_vmcr(void)
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index fd7895945bbc6f..66f17349f0c369 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -127,7 +127,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
__tlb_switch_to_host(&cxt);
}
-void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
+void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
@@ -135,6 +135,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
__tlb_switch_to_guest(mmu, &cxt);
__tlbi(vmalle1);
+ asm volatile("ic iallu");
dsb(nsh);
isb();
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 77cb2d28f2a43a..8711894db8c224 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1312,8 +1312,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
* Prevent userspace from creating a memory region outside of the IPA
* space addressable by the KVM guest IPA space.
*/
- if (memslot->base_gfn + memslot->npages >=
- (kvm_phys_size(kvm) >> PAGE_SHIFT))
+ if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
return -EFAULT;
mmap_read_lock(current->mm);
diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c
index d45b8b9a441518..739164324afedb 100644
--- a/arch/arm64/kvm/perf.c
+++ b/arch/arm64/kvm/perf.c
@@ -11,6 +11,8 @@
#include <asm/kvm_emulate.h>
+DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available);
+
static int kvm_is_in_guest(void)
{
return kvm_get_running_vcpu() != NULL;
@@ -48,6 +50,14 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
int kvm_perf_init(void)
{
+ /*
+ * Check if HW_PERF_EVENTS are supported by checking the number of
+ * hardware performance counters. This could ensure the presence of
+ * a physical PMU and CONFIG_PERF_EVENT is selected.
+ */
+ if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0)
+ static_branch_enable(&kvm_arm_pmu_available);
+
return perf_register_guest_info_callbacks(&kvm_guest_cbs);
}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index e9ec08b0b07033..e32c6e139a09d3 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -823,16 +823,6 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
return val & mask;
}
-bool kvm_arm_support_pmu_v3(void)
-{
- /*
- * Check if HW_PERF_EVENTS are supported by checking the number of
- * hardware performance counters. This could ensure the presence of
- * a physical PMU and CONFIG_PERF_EVENT is selected.
- */
- return (perf_num_counters() > 0);
-}
-
int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
{
if (!kvm_vcpu_has_pmu(vcpu))
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e81c7ec9e10202..bd354cd45d2860 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -326,10 +326,9 @@ int kvm_set_ipa_limit(void)
}
kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
- WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
- "KVM IPA Size Limit (%d bits) is smaller than default size\n",
- kvm_ipa_limit);
- kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
+ kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit,
+ ((kvm_ipa_limit < KVM_PHYS_SHIFT) ?
+ " (Reduced IPA size, limited VM/VMM compatibility)" : ""));
return 0;
}
@@ -358,6 +357,11 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
return -EINVAL;
} else {
phys_shift = KVM_PHYS_SHIFT;
+ if (phys_shift > kvm_ipa_limit) {
+ pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
+ current->comm);
+ return -EINVAL;
+ }
}
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 52915b34235143..6f530925a23144 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -574,9 +574,13 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
*/
int vgic_v3_probe(const struct gic_kvm_info *info)
{
- u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2);
+ u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
+ bool has_v2;
int ret;
+ has_v2 = ich_vtr_el2 >> 63;
+ ich_vtr_el2 = (u32)ich_vtr_el2;
+
/*
* The ListRegs field is 5 bits, but there is an architectural
* maximum of 16 list registers. Just ignore bit 4...
@@ -594,13 +598,15 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
gicv4_enable ? "en" : "dis");
}
+ kvm_vgic_global_state.vcpu_base = 0;
+
if (!info->vcpu.start) {
kvm_info("GICv3: no GICV resource entry\n");
- kvm_vgic_global_state.vcpu_base = 0;
+ } else if (!has_v2) {
+ pr_warn(FW_BUG "CPU interface incapable of MMIO access\n");
} else if (!PAGE_ALIGNED(info->vcpu.start)) {
pr_warn("GICV physical address 0x%llx not page aligned\n",
(unsigned long long)info->vcpu.start);
- kvm_vgic_global_state.vcpu_base = 0;
} else {
kvm_vgic_global_state.vcpu_base = info->vcpu.start;
kvm_vgic_global_state.can_emulate_gicv2 = true;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 877a4025d8da08..9bc091ecaaeb97 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -963,7 +963,7 @@ struct kvm_arch {
struct kvm_pit *vpit;
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
- struct kvm_apic_map *apic_map;
+ struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
bool apic_access_page_done;
@@ -1036,7 +1036,7 @@ struct kvm_arch {
bool bus_lock_detection_enabled;
- struct kvm_pmu_event_filter *pmu_event_filter;
+ struct kvm_pmu_event_filter __rcu *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index aa593743acf679..1fc0962c89c082 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void)
static int __init kvm_setup_vsyscall_timeinfo(void)
{
-#ifdef CONFIG_X86_64
- u8 flags;
+ kvmclock_init_mem();
- if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall)
- return 0;
+#ifdef CONFIG_X86_64
+ if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) {
+ u8 flags;
- flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
- if (!(flags & PVCLOCK_TSC_STABLE_BIT))
- return 0;
+ flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+ if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+ return 0;
- kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
+ kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
+ }
#endif
- kvmclock_init_mem();
-
return 0;
}
early_initcall(kvm_setup_vsyscall_timeinfo);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 45d40bfacb7cbe..cc369b9ad8f11c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1642,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
}
if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
- kvm_wait_lapic_expire(vcpu);
+ /*
+ * Ensure the guest's timer has truly expired before posting an
+ * interrupt. Open code the relevant checks to avoid querying
+ * lapic_timer_int_injected(), which will be false since the
+ * interrupt isn't yet injected. Waiting until after injecting
+ * is not an option since that won't help a posted interrupt.
+ */
+ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
+ __kvm_wait_lapic_expire(vcpu);
kvm_apic_inject_pending_timer_irqs(apic);
return;
}
@@ -2595,6 +2604,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
+ apic->lapic_timer.expired_tscdeadline = 0;
apic_update_lvtt(apic);
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c926c6b899a106..d78915019b082f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -337,7 +337,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
cpu_relax();
}
} else {
+ /*
+ * If the SPTE is not MMU-present, there is no backing
+ * page associated with the SPTE and so no side effects
+ * that need to be recorded, and exclusive ownership of
+ * mmu_lock ensures the SPTE can't be made present.
+ * Note, zapping MMIO SPTEs is also unnecessary as they
+ * are guarded by the memslots generation, not by being
+ * unreachable.
+ */
old_child_spte = READ_ONCE(*sptep);
+ if (!is_shadow_present_pte(old_child_spte))
+ continue;
/*
* Marking the SPTE as a removed SPTE is not
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index baee91c1e93643..58a45bb139f88a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_INVALID, .always = false },
};
-/* enable NPT for AMD64 and X86 with PAE */
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-bool npt_enabled = true;
-#else
-bool npt_enabled;
-#endif
-
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* pause_filter_count: On processors that support Pause filtering(indicated
@@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444);
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);
-/* allow nested paging (virtualized MMU) for all guests */
-static int npt = true;
-module_param(npt, int, S_IRUGO);
+/*
+ * Use nested page tables by default. Note, NPT may get forced off by
+ * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
+ */
+bool npt_enabled = true;
+module_param_named(npt, npt_enabled, bool, 0444);
/* allow nested virtualization in KVM/SVM */
static int nested = true;
@@ -988,10 +984,15 @@ static __init int svm_hardware_setup(void)
goto err;
}
- if (!boot_cpu_has(X86_FEATURE_NPT))
+ /*
+ * KVM's MMU doesn't support using 2-level paging for itself, and thus
+ * NPT isn't supported if the host is using 2-level paging since host
+ * CR4 is unchanged on VMRUN.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
npt_enabled = false;
- if (npt_enabled && !npt)
+ if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2a20ce60152ea9..47e021bdcc94ab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10601,7 +10601,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
return (void __user *)hva;
} else {
if (!slot || !slot->npages)
- return 0;
+ return NULL;
old_npages = slot->npages;
hva = slot->userspace_addr;
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 8dcb3e1477bc98..6fd3cda608e4a6 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -13,6 +13,13 @@
#define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1)
#define ARMV8_PMU_MAX_COUNTER_PAIRS ((ARMV8_PMU_MAX_COUNTERS + 1) >> 1)
+DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available);
+
+static __always_inline bool kvm_arm_support_pmu_v3(void)
+{
+ return static_branch_likely(&kvm_arm_pmu_available);
+}
+
#ifdef CONFIG_HW_PERF_EVENTS
struct kvm_pmc {
@@ -47,7 +54,6 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val);
void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val);
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
u64 select_idx);
-bool kvm_arm_support_pmu_v3(void);
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu,
@@ -87,7 +93,6 @@ static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {}
static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {}
static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu,
u64 data, u64 select_idx) {}
-static inline bool kvm_arm_support_pmu_v3(void) { return false; }
static inline int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{