diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2024-04-29 14:47:08 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2024-04-29 14:47:08 +1000 |
commit | 8af591e31104714c34309fe5a1ed2786cefae7aa (patch) | |
tree | 26e77be99de1e5bf4d370d27285c3d2a1c2dd1e2 | |
parent | d2b414442dff384b3933f6986f79177e92fdaa1e (diff) | |
parent | 7b076c6a308ec5bce9fc96e2935443ed228b9148 (diff) | |
download | linux-next-history-8af591e31104714c34309fe5a1ed2786cefae7aa.tar.gz |
Merge branch 'next' of https://github.com/kvm-x86/linux.git
Notice: this object is not reachable from any branch.
Notice: this object is not reachable from any branch.
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 41 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_emulate.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/page_track.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 75 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 30 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 6 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/aarch64/page_fault_test.c | 4 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/demand_paging_test.c | 90 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/include/userfaultfd_util.h | 16 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/lib/userfaultfd_util.c | 153 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/steal_time.c | 47 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 16 |
17 files changed, 331 insertions, 176 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3cb7647ea2f20d..8307714cb2f6f9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -994,9 +994,6 @@ struct kvm_vcpu_arch { u64 msr_kvm_poll_control; - /* set at EPT violation at this point */ - unsigned long exit_qualification; - /* pv related host specific info */ struct { bool pv_unhalted; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1851b3870a9c05..f2f2be5d11415d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1232,9 +1232,22 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = 0; break; case 0x80000008: { - unsigned g_phys_as = (entry->eax >> 16) & 0xff; - unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); - unsigned phys_as = entry->eax & 0xff; + /* + * GuestPhysAddrSize (EAX[23:16]) is intended for software + * use. + * + * KVM's ABI is to report the effective MAXPHYADDR for the + * guest in PhysAddrSize (phys_as), and the maximum + * *addressable* GPA in GuestPhysAddrSize (g_phys_as). + * + * GuestPhysAddrSize is valid if and only if TDP is enabled, + * in which case the max GPA that can be addressed by KVM may + * be less than the max GPA that can be legally generated by + * the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't + * support 5-level TDP. + */ + unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U); + unsigned int phys_as, g_phys_as; /* * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as @@ -1242,16 +1255,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * reductions in MAXPHYADDR for memory encryption affect shadow * paging, too. * - * If TDP is enabled but an explicit guest MAXPHYADDR is not - * provided, use the raw bare metal MAXPHYADDR as reductions to - * the HPAs do not affect GPAs. + * If TDP is enabled, use the raw bare metal MAXPHYADDR as + * reductions to the HPAs do not affect GPAs. The max + * addressable GPA is the same as the max effective GPA, except + * that it's capped at 48 bits if 5-level TDP isn't supported + * (hardware processes bits 51:48 only when walking the fifth + * level page table). */ - if (!tdp_enabled) - g_phys_as = boot_cpu_data.x86_phys_bits; - else if (!g_phys_as) + if (!tdp_enabled) { + phys_as = boot_cpu_data.x86_phys_bits; + g_phys_as = 0; + } else { + phys_as = entry->eax & 0xff; g_phys_as = phys_as; + if (kvm_mmu_get_max_tdp_level() < 5) + g_phys_as = min(g_phys_as, 48); + } - entry->eax = g_phys_as | (virt_as << 8); + entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16); entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 5382646162a387..29ea4313e1bb1e 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -26,6 +26,7 @@ struct x86_exception { bool nested_page_fault; u64 address; /* cr2 or nested page fault gpa */ u8 async_page_fault; + unsigned long exit_qualification; }; /* diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 60f21bb4c27b19..b410a227c6018d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -100,6 +100,8 @@ static inline u8 kvm_get_shadow_phys_bits(void) return boot_cpu_data.x86_phys_bits; } +u8 kvm_mmu_get_max_tdp_level(void); + void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 69a872d151a9be..553d51cababa75 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5269,6 +5269,11 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) return max_tdp_level; } +u8 kvm_mmu_get_max_tdp_level(void) +{ + return tdp_root_level ? tdp_root_level : max_tdp_level; +} + static union kvm_mmu_page_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index f6448284c18e3e..561c331fd6ecc7 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -41,7 +41,7 @@ bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { - kvfree(slot->arch.gfn_write_track); + vfree(slot->arch.gfn_write_track); slot->arch.gfn_write_track = NULL; } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4d4e98fe4f3548..7a87097cb45b52 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -497,21 +497,21 @@ error: * The other bits are set to 0. */ if (!(errcode & PFERR_RSVD_MASK)) { - vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID | - EPT_VIOLATION_GVA_TRANSLATED); + walker->fault.exit_qualification = 0; + if (write_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE; if (user_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; if (fetch_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR; /* * Note, pte_access holds the raw RWX bits from the EPTE, not * ACC_*_MASK flags! */ - vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) << - EPT_VIOLATION_RWX_SHIFT; + walker->fault.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) << + EPT_VIOLATION_RWX_SHIFT; } #endif walker->fault.address = addr; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index af21703c848c16..c3dc18fac4f7cc 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -530,6 +530,31 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } +static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte) +{ + u64 *sptep = rcu_dereference(iter->sptep); + + /* + * The caller is responsible for ensuring the old SPTE is not a REMOVED + * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, + * and pre-checking before inserting a new SPTE is advantageous as it + * avoids unnecessary work. + */ + WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); + + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and + * does not hold the mmu_lock. On failure, i.e. if a different logical + * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with + * the current value, so the caller operates on fresh data, e.g. if it + * retries tdp_mmu_set_spte_atomic() + */ + if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) + return -EBUSY; + + return 0; +} + /* * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically * and handle the associated bookkeeping. Do not mark the page dirty @@ -551,27 +576,13 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - u64 *sptep = rcu_dereference(iter->sptep); - - /* - * The caller is responsible for ensuring the old SPTE is not a REMOVED - * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, - * and pre-checking before inserting a new SPTE is advantageous as it - * avoids unnecessary work. - */ - WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); + int ret; lockdep_assert_held_read(&kvm->mmu_lock); - /* - * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and - * does not hold the mmu_lock. On failure, i.e. if a different logical - * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with - * the current value, so the caller operates on fresh data, e.g. if it - * retries tdp_mmu_set_spte_atomic() - */ - if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) - return -EBUSY; + ret = __tdp_mmu_set_spte_atomic(iter, new_spte); + if (ret) + return ret; handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level, true); @@ -584,13 +595,17 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, { int ret; + lockdep_assert_held_read(&kvm->mmu_lock); + /* - * Freeze the SPTE by setting it to a special, - * non-present value. This will stop other threads from - * immediately installing a present entry in its place - * before the TLBs are flushed. + * Freeze the SPTE by setting it to a special, non-present value. This + * will stop other threads from immediately installing a present entry + * in its place before the TLBs are flushed. + * + * Delay processing of the zapped SPTE until after TLBs are flushed and + * the REMOVED_SPTE is replaced (see below). */ - ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE); + ret = __tdp_mmu_set_spte_atomic(iter, REMOVED_SPTE); if (ret) return ret; @@ -599,12 +614,20 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, /* * No other thread can overwrite the removed SPTE as they must either * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not - * overwrite the special removed SPTE value. No bookkeeping is needed - * here since the SPTE is going from non-present to non-present. Use - * the raw write helper to avoid an unnecessary check on volatile bits. + * overwrite the special removed SPTE value. Use the raw write helper to + * avoid an unnecessary check on volatile bits. */ __kvm_tdp_mmu_write_spte(iter->sptep, 0); + /* + * Process the zapped SPTE after flushing TLBs, and after replacing + * REMOVED_SPTE with 0. This minimizes the amount of time vCPUs are + * blocked by the REMOVED_SPTE and reduces contention on the child + * SPTEs. + */ + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + 0, iter->level, true); + return 0; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d05ddf7514915c..d5b832126e3458 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -409,18 +409,40 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification; u32 vm_exit_reason; - unsigned long exit_qualification = vcpu->arch.exit_qualification; if (vmx->nested.pml_full) { vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; - exit_qualification &= INTR_INFO_UNBLOCK_NMI; + + /* + * It should be impossible to trigger a nested PML Full VM-Exit + * for anything other than an EPT Violation from L2. KVM *can* + * trigger nEPT page fault injection in response to an EPT + * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT + * tables also changed, but KVM should not treat EPT Misconfig + * VM-Exits as writes. + */ + WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION); + + /* + * PML Full and EPT Violation VM-Exits both use bit 12 to report + * "NMI unblocking due to IRET", i.e. the bit can be propagated + * as-is from the original EXIT_QUALIFICATION. + */ + exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; } else { - if (fault->error_code & PFERR_RSVD_MASK) + if (fault->error_code & PFERR_RSVD_MASK) { vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; - else + exit_qualification = 0; + } else { + exit_qualification = fault->exit_qualification; + exit_qualification |= vmx_get_exit_qual(vcpu) & + (EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED); vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + } /* * Although the caller (kvm_inject_emulated_page_fault) would diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 94db9e4db2ceaa..f10b5f8f364b40 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5765,8 +5765,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - vcpu->arch.exit_qualification = exit_qualification; - /* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. We have to emulate the instruction here, because diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2369956638deea..ae10da510f76b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12798,7 +12798,7 @@ static void memslot_rmap_free(struct kvm_memory_slot *slot) int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); + vfree(slot->arch.rmap[i]); slot->arch.rmap[i] = NULL; } } @@ -12810,7 +12810,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) memslot_rmap_free(slot); for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.lpage_info[i - 1]); + vfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } @@ -12902,7 +12902,7 @@ out_free: memslot_rmap_free(slot); for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.lpage_info[i - 1]); + vfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } return -ENOMEM; diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 5972905275cfac..a2a158e2c0b85a 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -375,14 +375,14 @@ static void setup_uffd(struct kvm_vm *vm, struct test_params *p, *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0, pt_args.hva, pt_args.paging_size, - test->uffd_pt_handler); + 1, test->uffd_pt_handler); *data_uffd = NULL; if (test->uffd_data_handler) *data_uffd = uffd_setup_demand_paging(uffd_mode, 0, data_args.hva, data_args.paging_size, - test->uffd_data_handler); + 1, test->uffd_data_handler); } static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd, diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index bf3609f718544f..056ff1c873450a 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -13,7 +13,6 @@ #include <stdio.h> #include <stdlib.h> #include <time.h> -#include <poll.h> #include <pthread.h> #include <linux/userfaultfd.h> #include <sys/syscall.h> @@ -77,8 +76,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, copy.mode = 0; r = ioctl(uffd, UFFDIO_COPY, ©); - if (r == -1) { - pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d with errno: %d\n", + /* + * With multiple vCPU threads fault on a single page and there are + * multiple readers for the UFFD, at least one of the UFFDIO_COPYs + * will fail with EEXIST: handle that case without signaling an + * error. + * + * Note that this also suppress any EEXISTs occurring from, + * e.g., the first UFFDIO_COPY/CONTINUEs on a page. That never + * happens here, but a realistic VMM might potentially maintain + * some external state to correctly surface EEXISTs to userspace + * (or prevent duplicate COPY/CONTINUEs in the first place). + */ + if (r == -1 && errno != EEXIST) { + pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d, errno = %d\n", addr, tid, errno); return r; } @@ -89,8 +100,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, cont.range.len = demand_paging_size; r = ioctl(uffd, UFFDIO_CONTINUE, &cont); - if (r == -1) { - pr_info("Failed UFFDIO_CONTINUE in 0x%lx from thread %d with errno: %d\n", + /* + * With multiple vCPU threads fault on a single page and there are + * multiple readers for the UFFD, at least one of the UFFDIO_COPYs + * will fail with EEXIST: handle that case without signaling an + * error. + * + * Note that this also suppress any EEXISTs occurring from, + * e.g., the first UFFDIO_COPY/CONTINUEs on a page. That never + * happens here, but a realistic VMM might potentially maintain + * some external state to correctly surface EEXISTs to userspace + * (or prevent duplicate COPY/CONTINUEs in the first place). + */ + if (r == -1 && errno != EEXIST) { + pr_info("Failed UFFDIO_CONTINUE in 0x%lx, thread %d, errno = %d\n", addr, tid, errno); return r; } @@ -110,7 +133,9 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, struct test_params { int uffd_mode; + bool single_uffd; useconds_t uffd_delay; + int readers_per_uffd; enum vm_mem_backing_src_type src_type; bool partition_vcpu_memory_access; }; @@ -131,10 +156,12 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct memstress_vcpu_args *vcpu_args; struct test_params *p = arg; struct uffd_desc **uffd_descs = NULL; + uint64_t uffd_region_size; struct timespec start; struct timespec ts_diff; + double vcpu_paging_rate; struct kvm_vm *vm; - int i; + int i, num_uffds = 0; vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type, p->partition_vcpu_memory_access); @@ -147,7 +174,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) memset(guest_data_prototype, 0xAB, demand_paging_size); if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { - for (i = 0; i < nr_vcpus; i++) { + num_uffds = p->single_uffd ? 1 : nr_vcpus; + for (i = 0; i < num_uffds; i++) { vcpu_args = &memstress_args.vcpu_args[i]; prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa), vcpu_args->pages * memstress_args.guest_page_size); @@ -155,9 +183,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) } if (p->uffd_mode) { - uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *)); + num_uffds = p->single_uffd ? 1 : nr_vcpus; + uffd_region_size = nr_vcpus * guest_percpu_mem_size / num_uffds; + + uffd_descs = malloc(num_uffds * sizeof(struct uffd_desc *)); TEST_ASSERT(uffd_descs, "Memory allocation failed"); - for (i = 0; i < nr_vcpus; i++) { + for (i = 0; i < num_uffds; i++) { + struct memstress_vcpu_args *vcpu_args; void *vcpu_hva; vcpu_args = &memstress_args.vcpu_args[i]; @@ -170,7 +202,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) */ uffd_descs[i] = uffd_setup_demand_paging( p->uffd_mode, p->uffd_delay, vcpu_hva, - vcpu_args->pages * memstress_args.guest_page_size, + uffd_region_size, + p->readers_per_uffd, &handle_uffd_page_request); } } @@ -187,15 +220,19 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (p->uffd_mode) { /* Tell the user fault fd handler threads to quit */ - for (i = 0; i < nr_vcpus; i++) + for (i = 0; i < num_uffds; i++) uffd_stop_demand_paging(uffd_descs[i]); } - pr_info("Total guest execution time: %ld.%.9lds\n", + pr_info("Total guest execution time:\t%ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); - pr_info("Overall demand paging rate: %f pgs/sec\n", - memstress_args.vcpu_args[0].pages * nr_vcpus / - ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC)); + + vcpu_paging_rate = memstress_args.vcpu_args[0].pages / + ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC); + pr_info("Per-vcpu demand paging rate:\t%f pgs/sec/vcpu\n", + vcpu_paging_rate); + pr_info("Overall demand paging rate:\t%f pgs/sec\n", + vcpu_paging_rate * nr_vcpus); memstress_destroy_vm(vm); @@ -207,15 +244,20 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" - " [-b memory] [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name); + printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-a]\n" + " [-d uffd_delay_usec] [-r readers_per_uffd] [-b memory]\n" + " [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name); guest_modes_help(); printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); kvm_print_vcpu_pinning_help(); + printf(" -a: Use a single userfaultfd for all of guest memory, instead of\n" + " creating one for each region paged by a unique vCPU\n" + " Set implicitly with -o, and no effect without -u.\n"); printf(" -d: add a delay in usec to the User Fault\n" " FD handler to simulate demand paging\n" " overheads. Ignored without -u.\n"); + printf(" -r: Set the number of reader threads per uffd.\n"); printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); @@ -234,12 +276,14 @@ int main(int argc, char *argv[]) struct test_params p = { .src_type = DEFAULT_VM_MEM_SRC, .partition_vcpu_memory_access = true, + .readers_per_uffd = 1, + .single_uffd = false, }; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:c:o")) != -1) { + while ((opt = getopt(argc, argv, "ahom:u:d:b:s:v:c:r:")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -251,6 +295,9 @@ int main(int argc, char *argv[]) p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR; TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'."); break; + case 'a': + p.single_uffd = true; + break; case 'd': p.uffd_delay = strtoul(optarg, NULL, 0); TEST_ASSERT(p.uffd_delay >= 0, "A negative UFFD delay is not supported."); @@ -271,6 +318,13 @@ int main(int argc, char *argv[]) break; case 'o': p.partition_vcpu_memory_access = false; + p.single_uffd = true; + break; + case 'r': + p.readers_per_uffd = atoi(optarg); + TEST_ASSERT(p.readers_per_uffd >= 1, + "Invalid number of readers per uffd %d: must be >=1", + p.readers_per_uffd); break; case 'h': default: diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h index 877449c345928b..24f2cc5f429281 100644 --- a/tools/testing/selftests/kvm/include/userfaultfd_util.h +++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h @@ -17,17 +17,27 @@ typedef int (*uffd_handler_t)(int uffd_mode, int uffd, struct uffd_msg *msg); -struct uffd_desc { +struct uffd_reader_args { int uffd_mode; int uffd; - int pipefds[2]; useconds_t delay; uffd_handler_t handler; - pthread_t thread; + /* Holds the read end of the pipe for killing the reader. */ + int pipe; +}; + +struct uffd_desc { + int uffd; + uint64_t num_readers; + /* Holds the write ends of the pipes for killing the readers. */ + int *pipefds; + pthread_t *readers; + struct uffd_reader_args *reader_args; }; struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void *hva, uint64_t len, + uint64_t num_readers, uffd_handler_t handler); void uffd_stop_demand_paging(struct uffd_desc *uffd); diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c index f4eef6eb2dc2cc..0ba866c4af694f 100644 --- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -16,6 +16,7 @@ #include <poll.h> #include <pthread.h> #include <linux/userfaultfd.h> +#include <sys/epoll.h> #include <sys/syscall.h> #include "kvm_util.h" @@ -27,76 +28,69 @@ static void *uffd_handler_thread_fn(void *arg) { - struct uffd_desc *uffd_desc = (struct uffd_desc *)arg; - int uffd = uffd_desc->uffd; - int pipefd = uffd_desc->pipefds[0]; - useconds_t delay = uffd_desc->delay; + struct uffd_reader_args *reader_args = (struct uffd_reader_args *)arg; + int uffd = reader_args->uffd; int64_t pages = 0; struct timespec start; struct timespec ts_diff; + struct epoll_event evt; + int epollfd; + + epollfd = epoll_create(1); + TEST_ASSERT(epollfd >= 0, "Failed to create epollfd."); + + evt.events = EPOLLIN | EPOLLEXCLUSIVE; + evt.data.u32 = 0; + TEST_ASSERT(!epoll_ctl(epollfd, EPOLL_CTL_ADD, uffd, &evt), + "Failed to add uffd to epollfd"); + + evt.events = EPOLLIN; + evt.data.u32 = 1; + TEST_ASSERT(!epoll_ctl(epollfd, EPOLL_CTL_ADD, reader_args->pipe, &evt), + "Failed to add pipe to epollfd"); clock_gettime(CLOCK_MONOTONIC, &start); while (1) { struct uffd_msg msg; - struct pollfd pollfd[2]; - char tmp_chr; int r; - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd; - pollfd[1].events = POLLIN; - - r = poll(pollfd, 2, -1); - switch (r) { - case -1: - pr_info("poll err"); - continue; - case 0: - continue; - case 1: - break; - default: - pr_info("Polling uffd returned %d", r); - return NULL; - } + r = epoll_wait(epollfd, &evt, 1, -1); + TEST_ASSERT(r == 1, + "Unexpected number of events (%d) from epoll, errno = %d", + r, errno); - if (pollfd[0].revents & POLLERR) { - pr_info("uffd revents has POLLERR"); - return NULL; - } + if (evt.data.u32 == 1) { + char tmp_chr; - if (pollfd[1].revents & POLLIN) { - r = read(pollfd[1].fd, &tmp_chr, 1); + TEST_ASSERT(!(evt.events & (EPOLLERR | EPOLLHUP)), + "Reader thread received EPOLLERR or EPOLLHUP on pipe."); + r = read(reader_args->pipe, &tmp_chr, 1); TEST_ASSERT(r == 1, - "Error reading pipefd in UFFD thread"); + "Error reading pipefd in uffd reader thread"); break; } - if (!(pollfd[0].revents & POLLIN)) - continue; + TEST_ASSERT(!(evt.events & (EPOLLERR | EPOLLHUP)), + "Reader thread received EPOLLERR or EPOLLHUP on uffd."); r = read(uffd, &msg, sizeof(msg)); if (r == -1) { - if (errno == EAGAIN) - continue; - pr_info("Read of uffd got errno %d\n", errno); - return NULL; + TEST_ASSERT(errno == EAGAIN, + "Error reading from UFFD: errno = %d", errno); + continue; } - if (r != sizeof(msg)) { - pr_info("Read on uffd returned unexpected size: %d bytes", r); - return NULL; - } + TEST_ASSERT(r == sizeof(msg), + "Read on uffd returned unexpected number of bytes (%d)", r); if (!(msg.event & UFFD_EVENT_PAGEFAULT)) continue; - if (delay) - usleep(delay); - r = uffd_desc->handler(uffd_desc->uffd_mode, uffd, &msg); - if (r < 0) - return NULL; + if (reader_args->delay) + usleep(reader_args->delay); + r = reader_args->handler(reader_args->uffd_mode, uffd, &msg); + TEST_ASSERT(r >= 0, + "Reader thread handler fn returned negative value %d", r); pages++; } @@ -110,6 +104,7 @@ static void *uffd_handler_thread_fn(void *arg) struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void *hva, uint64_t len, + uint64_t num_readers, uffd_handler_t handler) { struct uffd_desc *uffd_desc; @@ -118,14 +113,25 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; - int ret; + int ret, i; PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", is_minor ? "MINOR" : "MISSING", is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); uffd_desc = malloc(sizeof(struct uffd_desc)); - TEST_ASSERT(uffd_desc, "malloc failed"); + TEST_ASSERT(uffd_desc, "Failed to malloc uffd descriptor"); + + uffd_desc->pipefds = calloc(sizeof(int), num_readers); + TEST_ASSERT(uffd_desc->pipefds, "Failed to alloc pipes"); + + uffd_desc->readers = calloc(sizeof(pthread_t), num_readers); + TEST_ASSERT(uffd_desc->readers, "Failed to alloc reader threads"); + + uffd_desc->reader_args = calloc(sizeof(struct uffd_reader_args), num_readers); + TEST_ASSERT(uffd_desc->reader_args, "Failed to alloc reader_args"); + + uffd_desc->num_readers = num_readers; /* In order to get minor faults, prefault via the alias. */ if (is_minor) @@ -148,18 +154,28 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == expected_ioctls, "missing userfaultfd ioctls"); - ret = pipe2(uffd_desc->pipefds, O_CLOEXEC | O_NONBLOCK); - TEST_ASSERT(!ret, "Failed to set up pipefd"); - - uffd_desc->uffd_mode = uffd_mode; uffd_desc->uffd = uffd; - uffd_desc->delay = delay; - uffd_desc->handler = handler; - pthread_create(&uffd_desc->thread, NULL, uffd_handler_thread_fn, - uffd_desc); + for (i = 0; i < uffd_desc->num_readers; ++i) { + int pipes[2]; + + ret = pipe2((int *) &pipes, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(!ret, "Failed to set up pipefd %i for uffd_desc %p", + i, uffd_desc); - PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", - hva, hva + len); + uffd_desc->pipefds[i] = pipes[1]; + + uffd_desc->reader_args[i].uffd_mode = uffd_mode; + uffd_desc->reader_args[i].uffd = uffd; + uffd_desc->reader_args[i].delay = delay; + uffd_desc->reader_args[i].handler = handler; + uffd_desc->reader_args[i].pipe = pipes[0]; + + pthread_create(&uffd_desc->readers[i], NULL, uffd_handler_thread_fn, + &uffd_desc->reader_args[i]); + + PER_VCPU_DEBUG("Created uffd thread %i for HVA range [%p, %p)\n", + i, hva, hva + len); + } return uffd_desc; } @@ -167,19 +183,26 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void uffd_stop_demand_paging(struct uffd_desc *uffd) { char c = 0; - int ret; + int i; - ret = write(uffd->pipefds[1], &c, 1); - TEST_ASSERT(ret == 1, "Unable to write to pipefd"); + for (i = 0; i < uffd->num_readers; ++i) + TEST_ASSERT(write(uffd->pipefds[i], &c, 1) == 1, + "Unable to write to pipefd %i for uffd_desc %p", i, uffd); - ret = pthread_join(uffd->thread, NULL); - TEST_ASSERT(ret == 0, "Pthread_join failed."); + for (i = 0; i < uffd->num_readers; ++i) + TEST_ASSERT(!pthread_join(uffd->readers[i], NULL), + "Pthread_join failed on reader %i for uffd_desc %p", i, uffd); close(uffd->uffd); - close(uffd->pipefds[1]); - close(uffd->pipefds[0]); + for (i = 0; i < uffd->num_readers; ++i) { + close(uffd->pipefds[i]); + close(uffd->reader_args[i].pipe); + } + free(uffd->pipefds); + free(uffd->readers); + free(uffd->reader_args); free(uffd); } diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 2ff82c7fd92659..9999173b054536 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -85,20 +85,18 @@ static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - int i; - pr_info("VCPU%d:\n", vcpu_idx); - pr_info(" steal: %lld\n", st->steal); - pr_info(" version: %d\n", st->version); - pr_info(" flags: %d\n", st->flags); - pr_info(" preempted: %d\n", st->preempted); - pr_info(" u8_pad: "); - for (i = 0; i < 3; ++i) - pr_info("%d", st->u8_pad[i]); - pr_info("\n pad: "); - for (i = 0; i < 11; ++i) - pr_info("%d", st->pad[i]); - pr_info("\n"); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" steal: %lld\n", st->steal); + ksft_print_msg(" version: %d\n", st->version); + ksft_print_msg(" flags: %d\n", st->flags); + ksft_print_msg(" preempted: %d\n", st->preempted); + ksft_print_msg(" u8_pad: %d %d %d\n", + st->u8_pad[0], st->u8_pad[1], st->u8_pad[2]); + ksft_print_msg(" pad: %d %d %d %d %d %d %d %d %d %d %d\n", + st->pad[0], st->pad[1], st->pad[2], st->pad[3], + st->pad[4], st->pad[5], st->pad[6], st->pad[7], + st->pad[8], st->pad[9], st->pad[10]); } #elif defined(__aarch64__) @@ -201,10 +199,10 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - pr_info("VCPU%d:\n", vcpu_idx); - pr_info(" rev: %d\n", st->rev); - pr_info(" attr: %d\n", st->attr); - pr_info(" st_time: %ld\n", st->st_time); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" rev: %d\n", st->rev); + ksft_print_msg(" attr: %d\n", st->attr); + ksft_print_msg(" st_time: %ld\n", st->st_time); } #elif defined(__riscv) @@ -368,7 +366,9 @@ int main(int ac, char **av) vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); + ksft_print_header(); TEST_REQUIRE(is_steal_time_supported(vcpus[0])); + ksft_set_plan(NR_VCPUS); /* Run test on each VCPU */ for (i = 0; i < NR_VCPUS; ++i) { @@ -409,14 +409,15 @@ int main(int ac, char **av) run_delay, stolen_time); if (verbose) { - pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i, - guest_stolen_time[i], stolen_time); - if (stolen_time == run_delay) - pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)"); - pr_info("\n"); + ksft_print_msg("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld%s\n", + i, guest_stolen_time[i], stolen_time, + stolen_time == run_delay ? + " (BONUS: guest test-stolen-time even exactly matches test-run_delay)" : ""); steal_time_dump(vm, i); } + ksft_test_result_pass("vcpu%d\n", i); } - return 0; + /* Print results and exit() accordingly */ + ksft_finished(); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ceab74b8ff5bf0..3508a9037d6435 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -971,7 +971,7 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) if (!memslot->dirty_bitmap) return; - kvfree(memslot->dirty_bitmap); + vfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; } @@ -2920,7 +2920,7 @@ out: /* * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest - * @atomic: whether this function can sleep + * @atomic: whether this function is forbidden from sleeping * @interruptible: whether the process can be interrupted by non-fatal signals * @async: whether this function need to wait IO complete if the * host page is not in the memory @@ -2992,16 +2992,12 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, if (hva) *hva = addr; - if (addr == KVM_HVA_ERR_RO_BAD) { - if (writable) - *writable = false; - return KVM_PFN_ERR_RO_FAULT; - } - if (kvm_is_error_hva(addr)) { if (writable) *writable = false; - return KVM_PFN_NOSLOT; + + return addr == KVM_HVA_ERR_RO_BAD ? KVM_PFN_ERR_RO_FAULT : + KVM_PFN_NOSLOT; } /* Do not map writable pfn in the readonly memslot. */ @@ -3265,6 +3261,7 @@ static int next_segment(unsigned long len, int offset) return len; } +/* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */ static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, int len) { @@ -3366,6 +3363,7 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); +/* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */ static int __kvm_write_guest_page(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn, const void *data, int offset, int len) |