On 22/04/25 11:13, Adrian Hunter wrote:
> On 19/04/25 04:12, Sean Christopherson wrote:
>> On Thu, Apr 17, 2025, Adrian Hunter wrote:
>>> From: Sean Christopherson <seanjc@xxxxxxxxxx>
>>>
>>> Add sub-ioctl KVM_TDX_TERMINATE_VM to release the HKID prior to shutdown,
>>> which enables more efficient reclaim of private memory.
>>>
>>> Private memory is removed from MMU/TDP when guest_memfds are closed. If
>>> the HKID has not been released, the TDX VM is still in RUNNABLE state,
>>> so pages must be removed using the "Dynamic Page Removal" procedure
>>> (refer to the TDX Module Base spec), which involves a number of steps:
>>>     Block further address translation
>>>     Exit each VCPU
>>>     Clear Secure EPT entry
>>>     Flush/write-back/invalidate relevant caches
>>>
>>> However, when the HKID is released, the TDX VM moves to TD_TEARDOWN state,
>>> where all TDX VM pages are effectively unmapped, so pages can be reclaimed
>>> directly.
>>>
>>> Reclaiming TD pages in TD_TEARDOWN state was seen to decrease the total
>>> reclaim time. For example:
>>>
>>>     VCPUs   Size (GB)   Before (secs)   After (secs)
>>>       4        18             72             24
>>>      32       107            517            134
>>>      64       400           5539            467
>>>
>>> [Adrian: wrote commit message, added KVM_TDX_TERMINATE_VM documentation,
>>> and moved cpus_read_lock() inside kvm->lock for consistency as reported
>>> by lockdep]
>>
>> /facepalm
>>
>> I over-thought this. We've had a long-standing battle with kvm_lock vs.
>> cpus_read_lock(), but this is kvm->lock, not kvm_lock. /sigh
>>
>>> +static int tdx_terminate_vm(struct kvm *kvm)
>>> +{
>>> +	int r = 0;
>>> +
>>> +	guard(mutex)(&kvm->lock);
>>
>> With kvm->lock taken outside cpus_read_lock(), just handle KVM_TDX_TERMINATE_VM
>> in the switch statement, i.e. let tdx_vm_ioctl() deal with kvm->lock.
>
> Ok, also cpus_read_lock() can go back where it was in __tdx_release_hkid().
>
> But also in __tdx_release_hkid(), there is
>
> 	if (KVM_BUG_ON(refcount_read(&kvm->users_count) && !terminate, kvm))
> 		return;
>
> However, __tdx_td_init() calls tdx_mmu_release_hkid() on the
> error path, so that is not correct.

So it ends up like this:

diff --git a/Documentation/virt/kvm/x86/intel-tdx.rst b/Documentation/virt/kvm/x86/intel-tdx.rst
index de41d4c01e5c..e5d4d9cf4cf2 100644
--- a/Documentation/virt/kvm/x86/intel-tdx.rst
+++ b/Documentation/virt/kvm/x86/intel-tdx.rst
@@ -38,6 +38,7 @@ ioctl with TDX specific sub-ioctl() commands.
 	KVM_TDX_INIT_MEM_REGION,
 	KVM_TDX_FINALIZE_VM,
 	KVM_TDX_GET_CPUID,
+	KVM_TDX_TERMINATE_VM,
 
 	KVM_TDX_CMD_NR_MAX,
 };
@@ -214,6 +215,21 @@ struct kvm_cpuid2.
         __u32 padding[3];
   };
 
+KVM_TDX_TERMINATE_VM
+--------------------
+:Type: vm ioctl
+:Returns: 0 on success, <0 on error
+
+Release Host Key ID (HKID) to allow more efficient reclaim of private memory.
+After this, the TD is no longer in a runnable state.
+
+Using KVM_TDX_TERMINATE_VM is optional.
+
+- id: KVM_TDX_TERMINATE_VM
+- flags: must be 0
+- data: must be 0
+- hw_error: must be 0
+
 KVM TDX creation flow
 =====================
 In addition to the standard KVM flow, new TDX ioctls need to be called.
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 225a12e0d5d6..a2f973e1d75d 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -939,6 +939,7 @@ enum kvm_tdx_cmd_id {
 	KVM_TDX_INIT_MEM_REGION,
 	KVM_TDX_FINALIZE_VM,
 	KVM_TDX_GET_CPUID,
+	KVM_TDX_TERMINATE_VM,
 
 	KVM_TDX_CMD_NR_MAX,
 };
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b952bc673271..5161f6f891d7 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -500,14 +500,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
 	 */
 	mutex_lock(&tdx_lock);
 
-	/*
-	 * Releasing HKID is in vm_destroy().
-	 * After the above flushing vps, there should be no more vCPU
-	 * associations, as all vCPU fds have been released at this stage.
-	 */
 	err = tdh_mng_vpflushdone(&kvm_tdx->td);
-	if (err == TDX_FLUSHVP_NOT_DONE)
-		goto out;
 	if (KVM_BUG_ON(err, kvm)) {
 		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
 		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
@@ -515,6 +508,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
 		goto out;
 	}
 
+	write_lock(&kvm->mmu_lock);
 	for_each_online_cpu(i) {
 		if (packages_allocated &&
 		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
@@ -539,7 +533,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
 	} else {
 		tdx_hkid_free(kvm_tdx);
 	}
-
+	write_unlock(&kvm->mmu_lock);
 out:
 	mutex_unlock(&tdx_lock);
 	cpus_read_unlock();
@@ -1789,13 +1783,13 @@ int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 	struct page *page = pfn_to_page(pfn);
 	int ret;
 
-	/*
-	 * HKID is released after all private pages have been removed, and set
-	 * before any might be populated. Warn if zapping is attempted when
-	 * there can't be anything populated in the private EPT.
-	 */
-	if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
-		return -EINVAL;
+	if (!is_hkid_assigned(to_kvm_tdx(kvm))) {
+		WARN_ON_ONCE(!kvm->vm_dead);
+		ret = tdx_reclaim_page(page);
+		if (!ret)
+			tdx_unpin(kvm, page);
+		return ret;
+	}
 
 	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
 	if (ret <= 0)
@@ -2790,6 +2784,20 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 	return 0;
 }
 
+static int tdx_terminate_vm(struct kvm *kvm)
+{
+	if (!kvm_trylock_all_vcpus(kvm))
+		return -EBUSY;
+
+	kvm_vm_dead(kvm);
+
+	kvm_unlock_all_vcpus(kvm);
+
+	tdx_mmu_release_hkid(kvm);
+
+	return 0;
+}
+
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_tdx_cmd tdx_cmd;
@@ -2817,6 +2825,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	case KVM_TDX_FINALIZE_VM:
 		r = tdx_td_finalize(kvm, &tdx_cmd);
 		break;
+	case KVM_TDX_TERMINATE_VM:
+		r = tdx_terminate_vm(kvm);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
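
For completeness, invoking the new sub-ioctl from userspace goes through
the usual TDX command path. Below is a minimal sketch, not part of the
patch: it assumes a VM fd for an already-created TD and the standard
KVM_MEMORY_ENCRYPT_OP entry point used by the other KVM_TDX_* commands;
the helper name tdx_terminate() is hypothetical, chosen for illustration.

/*
 * Userspace sketch (illustrative only): terminate the TD to release the
 * HKID before closing guest_memfds, so private pages are reclaimed in
 * TD_TEARDOWN state instead of via Dynamic Page Removal.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <asm/kvm.h>

static int tdx_terminate(int vm_fd)
{
	struct kvm_tdx_cmd cmd;

	/* flags, data and hw_error must all be zero. */
	memset(&cmd, 0, sizeof(cmd));
	cmd.id = KVM_TDX_TERMINATE_VM;

	/* After this succeeds, the TD is no longer runnable. */
	return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
}

Note that tdx_terminate_vm() uses kvm_trylock_all_vcpus() and returns
-EBUSY rather than sleeping, so the VMM should only issue this once its
vCPU threads have quiesced.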