On Mon, May 19, 2025, Rick P Edgecombe wrote:
> On Mon, 2025-05-19 at 06:33 -0700, Sean Christopherson wrote:
> > Was this hit by a real VMM? If so, why is a TDX VMM removing a memslot without
> > kicking vCPUs out of KVM?
> >
> > Regardless, I would prefer not to add a new RET_PF_* flag for this. At a glance,
> > KVM can simply drop and reacquire SRCU in the relevant paths.
>
> During the initial debugging and kicking around stage, this is the first
> direction we looked. But kvm_gmem_populate() doesn't have srcu locked, so then
> kvm_tdp_map_page() tries to unlock without it being held. (although that version
> didn't check r == RET_PF_RETRY like you had). Yan had the following concerns and
> came up with the version in this series, which we held review on for the list:

Ah, I missed the kvm_gmem_populate() => kvm_tdp_map_page() chain.

> > However, upon further consideration, I am reluctant to implement this fix for

Which fix?

> > the following reasons:
> > - kvm_gmem_populate() already holds the kvm->slots_lock.
> > - While retrying with srcu unlock and lock can workaround the
> >   KVM_MEMSLOT_INVALID deadlock, it results in each kvm_vcpu_pre_fault_memory()
> >   and tdx_handle_ept_violation() faulting with different memslot layouts.

This behavior has existed since pretty much the beginning of KVM time. TDX is
the oddball that doesn't re-enter the guest. All other flavors re-enter the
guest on RET_PF_RETRY, which means dropping and reacquiring SRCU. Which is why
I don't like RET_PF_RETRY_INVALID_SLOT; it's simply handling the case we know
about. Arguably, _TDX_ is buggy by not providing this behavior.

> I'm not sure why the second one is really a problem. For the first one I think
> that path could just take the srcu lock in the proper order with
> kvm->slots_lock?

Acquiring SRCU inside slots_lock should be fine. The reverse order would be
problematic, as KVM synchronizes SRCU while holding slots_lock. (A toy sketch
of the two orderings is appended after the patch below.)

That said, I don't love the idea of grabbing SRCU, because it's so obviously a
hack. What about something like this?

---
 arch/x86/kvm/mmu.h     |  2 ++
 arch/x86/kvm/mmu/mmu.c | 49 +++++++++++++++++++++++++++---------------
 arch/x86/kvm/vmx/tdx.c |  7 ++++--
 virt/kvm/kvm_main.c    |  5 ++---
 4 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index b4b6860ab971..0fc68f0fe80e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -259,6 +259,8 @@ extern bool tdp_mmu_enabled;
 
 bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
 int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+int kvm_tdp_prefault_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
+			  u8 *level);
 
 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index cbc84c6abc2e..4f16fe95173c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4851,24 +4851,15 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
 {
 	int r;
 
-	/*
-	 * Restrict to TDP page fault, since that's the only case where the MMU
-	 * is indexed by GPA.
-	 */
-	if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
-		return -EOPNOTSUPP;
+	if (signal_pending(current))
+		return -EINTR;
 
-	do {
-		if (signal_pending(current))
-			return -EINTR;
+	if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
+		return -EIO;
 
-		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
-			return -EIO;
-
-		cond_resched();
-		r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
-	} while (r == RET_PF_RETRY);
+	cond_resched();
+	r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
 
 	if (r < 0)
 		return r;
@@ -4878,10 +4869,12 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
 	case RET_PF_WRITE_PROTECTED:
 		return 0;
 
+	case RET_PF_RETRY:
+		return -EAGAIN;
+
 	case RET_PF_EMULATE:
 		return -ENOENT;
 
-	case RET_PF_RETRY:
 	case RET_PF_CONTINUE:
 	case RET_PF_INVALID:
 	default:
@@ -4891,6 +4884,28 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
 }
 EXPORT_SYMBOL_GPL(kvm_tdp_map_page);
 
+int kvm_tdp_prefault_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level)
+{
+	int r;
+
+	/*
+	 * Restrict to TDP page fault, since that's the only case where the MMU
+	 * is indexed by GPA.
+	 */
+	if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
+		return -EOPNOTSUPP;
+
+	for (;;) {
+		r = kvm_tdp_map_page(vcpu, gpa, error_code, level);
+		if (r != -EAGAIN)
+			break;
+
+		/* Comment goes here. */
+		kvm_vcpu_srcu_read_unlock(vcpu);
+		kvm_vcpu_srcu_read_lock(vcpu);
+	}
+}
+
 long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 				    struct kvm_pre_fault_memory *range)
 {
@@ -4918,7 +4933,7 @@
 	 * Shadow paging uses GVA for kvm page fault, so restrict to
 	 * two-dimensional paging.
 	 */
-	r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+	r = kvm_tdp_prefault_page(vcpu, range->gpa, error_code, &level);
 	if (r < 0)
 		return r;
 
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b952bc673271..1a232562080d 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3075,8 +3075,11 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 	if (ret != 1)
 		return -ENOMEM;
 
-	ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
-	if (ret < 0)
+	do {
+		ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
+	} while (ret == -EAGAIN);
+
+	if (ret)
 		goto out;
 
 	/*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b24db92e98f3..21a3fa7476dd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4266,7 +4266,6 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 				     struct kvm_pre_fault_memory *range)
 {
-	int idx;
 	long r;
 	u64 full_size;
 
@@ -4279,7 +4278,7 @@ static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 
 	vcpu_load(vcpu);
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
+	kvm_vcpu_srcu_read_lock(vcpu);
 
 	full_size = range->size;
 	do {
@@ -4300,7 +4299,7 @@ static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 		cond_resched();
 	} while (range->size);
 
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	kvm_vcpu_srcu_read_unlock(vcpu);
 	vcpu_put(vcpu);
 
 	/* Return success if at least one page was mapped successfully. */

base-commit: 12ca5c63556bbfcd77fe890fcdd1cd1adfb31fdd
--
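
As an aside, here is the toy sketch of the lock-ordering point referenced
above. It is illustrative only, not KVM code: the two wrapper functions are
hypothetical, while kvm_vcpu_srcu_read_lock()/unlock(), synchronize_srcu()
and kvm->slots_lock are the real primitives being discussed.

#include <linux/kvm_host.h>

/* Fine: the SRCU read side is entered with slots_lock already held. */
static void srcu_inside_slots_lock(struct kvm_vcpu *vcpu)
{
	mutex_lock(&vcpu->kvm->slots_lock);
	kvm_vcpu_srcu_read_lock(vcpu);
	/* ... walk memslots, fault in pages, etc. ... */
	kvm_vcpu_srcu_read_unlock(vcpu);
	mutex_unlock(&vcpu->kvm->slots_lock);
}

/*
 * Deadlock-prone: memslot updates call synchronize_srcu(&kvm->srcu) while
 * holding slots_lock, so blocking on slots_lock from inside an SRCU read
 * section can wait on a writer that is itself waiting for this reader to
 * exit its read-side critical section.
 */
static void slots_lock_inside_srcu(struct kvm_vcpu *vcpu)
{
	kvm_vcpu_srcu_read_lock(vcpu);
	mutex_lock(&vcpu->kvm->slots_lock);	/* may block forever */
	mutex_unlock(&vcpu->kvm->slots_lock);
	kvm_vcpu_srcu_read_unlock(vcpu);
}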