about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author	Paul Mackerras <paulus@samba.org>	2014-05-08 13:31:23 +1000
committer	Eli Qiao <taget@linux.vnet.ibm.com>	2014-05-08 12:31:17 +0800
commit	afe34f689bff47714a18fd92c544f4318a8fc586 (patch)
tree	39188943862162629f9bdde2aa557501d7eb78c5
parent	1beb8571b373cf5c6019029071a52028807287f1 (diff)
download	powerkvm-afe34f689bff47714a18fd92c544f4318a8fc586.tar.gz
KVM: PPC: Book3S HV: Fix two bugs in dirty-page tracking
The first bug is that we are testing the C (changed) bit in the hashed page table without first doing a tlbie. The architecture allows the update of the C bit to happen at any time up until we do a tlbie for the page. However, we don't want to do a tlbie for every page on every pass of a migration operation. Thus we do the tlbie if there are no vcpus currently running, which would indicate the final phase of migration. If any vcpus are running then reading the dirty log is already racy because pages could get dirtied immediately after we check them. Also, we don't need to do the tlbie if the HPT entry doesn't allow writing, since in that case the C bit can not get set. The second bug is that in the case where we see a dirty 16MB page followed by a dirty 4kB page (both mapping to the same guest real address), we return 1 rather than 16MB / PAGE_SIZE. The return value, indicating the number of dirty pages, needs to reflect the largest dirty page we come across, not the last dirty page we see. Fixes: 109551 (this time for sure) Signed-off-by: Paul Mackerras <paulus@samba.org>
-rw-r--r--	arch/powerpc/kvm/book3s_64_mmu_hv.c	51
1 file changed, 40 insertions(+), 11 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index bccc8c39ce5065..4464e485de342a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1037,10 +1037,16 @@ void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}
+static int vcpus_running(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.vcpus_running) != 0;
+}
+
static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
{
struct revmap_entry *rev = kvm->arch.revmap;
unsigned long head, i, j;
+ unsigned long v, r, n;
unsigned long *hptep;
int ret = 0;
@@ -1060,7 +1066,22 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
j = rev[i].forw;
- if (!(hptep[1] & HPTE_R_C))
+ /*
+ * Checking the C (changed) bit here is racy since there
+ * is no guarantee about when the hardware writes it back.
+ * If the HPTE is not writable then it is stable since the
+ * page can't be written to, and we would have done a tlbie
+ * (which forces the hardware to complete any writeback)
+ * when making the HPTE read-only.
+ * If vcpus are running then this call is racy anyway
+ * since the page could get dirtied subsequently, so we
+ * expect there to be a further call which would pick up
+ * any delayed C bit writeback.
+ * Otherwise we need to do the tlbie even if C==0 in
+ * order to pick up any delayed writeback of C.
+ */
+ if (!(hptep[1] & HPTE_R_C) &&
+ (!hpte_is_writable(hptep[1]) || vcpus_running(kvm)))
continue;
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
@@ -1072,21 +1093,29 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
}
/* Now check and modify the HPTE */
- if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
- /* need to make it temporarily absent to clear C */
- hptep[0] |= HPTE_V_ABSENT;
- kvmppc_invalidate_hpte(kvm, hptep, i);
- hptep[1] &= ~HPTE_R_C;
- eieio();
- hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+ if (!(hptep[0] & HPTE_V_VALID))
+ continue;
+
+ /* need to make it temporarily absent so C is stable */
+ hptep[0] |= HPTE_V_ABSENT;
+ kvmppc_invalidate_hpte(kvm, hptep, i);
+ v = hptep[0];
+ r = hptep[1];
+ if (r & HPTE_R_C) {
+ hptep[1] = r & ~HPTE_R_C;
if (!(rev[i].guest_rpte & HPTE_R_C)) {
rev[i].guest_rpte |= HPTE_R_C;
note_hpte_modification(kvm, &rev[i]);
}
- ret = max(1UL, hpte_page_size(hptep[0], hptep[1]) >>
- PAGE_SHIFT);
+ n = hpte_page_size(v, r);
+ n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (n > ret)
+ ret = n;
+ eieio();
}
- hptep[0] &= ~HPTE_V_HVLOCK;
+ v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+ v |= HPTE_V_VALID;
+ hptep[0] = v;
} while ((i = j) != head);
unlock_rmap(rmapp);