On Fri, 2025-05-02 at 16:08 +0300, Kirill A. Shutemov wrote:
> Introduce a pair of helpers to allocate and free memory for a given 2M
> range. The range is represented by struct page for any memory in the
> range and the PAMT memory by a list of pages.
> 
> Use per-2M refcounts to detect when PAMT memory has to be allocated and
> when it can be freed.
> 
> pamt_lock spinlock serializes against races between multiple
> tdx_pamt_add() as well as tdx_pamt_add() vs tdx_pamt_put().

Maybe elaborate a little bit on _why_ a spinlock is used?

> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> ---
>  arch/x86/include/asm/tdx.h   |   2 +
>  arch/x86/kvm/vmx/tdx.c       | 123 +++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/tdx_errno.h |   1 +
>  3 files changed, 126 insertions(+)
> 
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index 8091bf5b43cc..42449c054938 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -135,6 +135,8 @@ static inline int tdx_nr_pamt_pages(const struct tdx_sys_info *sysinfo)
>  	return sysinfo->tdmr.pamt_4k_entry_size * PTRS_PER_PTE / PAGE_SIZE;
>  }
> 
> +atomic_t *tdx_get_pamt_refcount(unsigned long hpa);
> +

This at least needs to be in the same patch which exports it.  But as
replied to patch 2, I think we should just move the code in this patch
to the TDX core code.

>  int tdx_guest_keyid_alloc(void);
>  u32 tdx_get_nr_guest_keyids(void);
>  void tdx_guest_keyid_free(unsigned int keyid);
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index b952bc673271..ea7e2d93fb44 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -207,6 +207,10 @@ static bool tdx_operand_busy(u64 err)
>  	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
>  }
> 
> +static bool tdx_hpa_range_not_free(u64 err)
> +{
> +	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_HPA_RANGE_NOT_FREE;
> +}
>  /*
>   * A per-CPU list of TD vCPUs associated with a given CPU.
> @@ -276,6 +280,125 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
>  	vcpu->cpu = -1;
>  }
> 
> +static DEFINE_SPINLOCK(pamt_lock);
> +
> +static void tdx_free_pamt_pages(struct list_head *pamt_pages)
> +{
> +	struct page *page;
> +
> +	while ((page = list_first_entry_or_null(pamt_pages, struct page, lru))) {
> +		list_del(&page->lru);
> +		__free_page(page);
> +	}
> +}
> +
> +static int tdx_alloc_pamt_pages(struct list_head *pamt_pages)
> +{
> +	for (int i = 0; i < tdx_nr_pamt_pages(tdx_sysinfo); i++) {
> +		struct page *page = alloc_page(GFP_KERNEL);
> +		if (!page)
> +			goto fail;
> +		list_add(&page->lru, pamt_pages);
> +	}
> +	return 0;
> +fail:
> +	tdx_free_pamt_pages(pamt_pages);
> +	return -ENOMEM;
> +}
> +
> +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> +			struct list_head *pamt_pages)
> +{
> +	u64 err;
> +
> +	hpa = ALIGN_DOWN(hpa, SZ_2M);
> +
> +	spin_lock(&pamt_lock);

Just curious, can the lock be per-2M-range?

> +
> +	/* Lost race to other tdx_pamt_add() */
> +	if (atomic_read(pamt_refcount) != 0) {
> +		atomic_inc(pamt_refcount);
> +		spin_unlock(&pamt_lock);
> +		tdx_free_pamt_pages(pamt_pages);

It's unfortunate that multiple callers of tdx_pamt_add() need to first
allocate the PAMT pages outside of the spinlock, only for the losers to
free them here.

I am thinking, if we make tdx_pamt_add() return:

 * > 0: PAMT pages already added (another tdx_pamt_add() won)
 * = 0: PAMT pages added successfully
 * < 0: error code

... then we could at least move tdx_free_pamt_pages() to the caller too.
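Something like below, perhaps (completely untested, just to illustrate the
idea; it only reuses names already introduced in this patch and changes the
return convention):

static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
			struct list_head *pamt_pages)
{
	u64 err;

	hpa = ALIGN_DOWN(hpa, SZ_2M);

	spin_lock(&pamt_lock);

	/* Another tdx_pamt_add() won the race: just take a reference. */
	if (atomic_read(pamt_refcount) != 0) {
		atomic_inc(pamt_refcount);
		spin_unlock(&pamt_lock);
		return 1;
	}

	err = tdh_phymem_pamt_add(hpa | TDX_PS_2M, pamt_pages);

	/*
	 * Winning the race against tdx_pamt_put() means the old PAMT
	 * pages are still installed: report "already added" so that
	 * the caller frees the unused pages.
	 */
	if (err && tdx_hpa_range_not_free(err)) {
		atomic_set(pamt_refcount, 1);
		spin_unlock(&pamt_lock);
		return 1;
	}

	if (err) {
		spin_unlock(&pamt_lock);
		pr_tdx_error(TDH_PHYMEM_PAMT_ADD, err);
		return -EIO;
	}

	atomic_set(pamt_refcount, 1);
	spin_unlock(&pamt_lock);
	return 0;
}

and tdx_pamt_get() would end with:

	ret = tdx_pamt_add(pamt_refcount, hpa, &pamt_pages);
	if (ret)
		tdx_free_pamt_pages(&pamt_pages);

	return ret < 0 ? ret : 0;

That way the freeing happens in exactly one place, and always outside the
lock.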
> +		return 0;
> +	}
> +
> +	err = tdh_phymem_pamt_add(hpa | TDX_PS_2M, pamt_pages);
> +
> +	if (err)
> +		tdx_free_pamt_pages(pamt_pages);

Seems we are calling tdx_free_pamt_pages() with the spinlock held here,
which is not consistent with the case above where another tdx_pamt_add()
has won the race.

> +
> +	/*
> +	 * tdx_hpa_range_not_free() is true if current task won race
> +	 * against tdx_pamt_put().
> +	 */
> +	if (err && !tdx_hpa_range_not_free(err)) {
> +		spin_unlock(&pamt_lock);
> +		pr_tdx_error(TDH_PHYMEM_PAMT_ADD, err);
> +		return -EIO;
> +	}

I had a hard time figuring out why we need to handle
tdx_hpa_range_not_free() explicitly.

IIUC, it is because atomic_dec_and_test() is used in tdx_pamt_put(), in
which case the atomic_t can reach 0 outside of the spinlock, thus
tdh_phymem_pamt_add() can be called while there are still PAMT pages
populated.  But ...

> +
> +	atomic_set(pamt_refcount, 1);
> +	spin_unlock(&pamt_lock);
> +	return 0;
> +}
> +
> +static int tdx_pamt_get(struct page *page)
> +{
> +	unsigned long hpa = page_to_phys(page);
> +	atomic_t *pamt_refcount;
> +	LIST_HEAD(pamt_pages);
> +
> +	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
> +		return 0;
> +
> +	pamt_refcount = tdx_get_pamt_refcount(hpa);
> +	WARN_ON_ONCE(atomic_read(pamt_refcount) < 0);
> +
> +	if (atomic_inc_not_zero(pamt_refcount))
> +		return 0;

... if we set the initial value of pamt_refcount to -1, and use
atomic_inc_unless_negative() here:

	if (atomic_inc_unless_negative(pamt_refcount))
		return 0;

	if (tdx_alloc_pamt_pages(&pamt_pages))
		return -ENOMEM;

	spin_lock(&pamt_lock);

	ret = tdx_pamt_add(hpa, &pamt_pages);
	if (ret >= 0)
		atomic_inc(pamt_refcount);

	spin_unlock(&pamt_lock);

	/*
	 * If another tdx_pamt_get() won the race, or in case of
	 * error, PAMT pages are not used and can be freed.
	 */
	if (ret)
		tdx_free_pamt_pages(&pamt_pages);

	return ret >= 0 ? 0 : ret;

and ...

> +
> +	if (tdx_alloc_pamt_pages(&pamt_pages))
> +		return -ENOMEM;
> +
> +	return tdx_pamt_add(pamt_refcount, hpa, &pamt_pages);
> +}
> +
> +static void tdx_pamt_put(struct page *page)
> +{
> +	unsigned long hpa = page_to_phys(page);
> +	atomic_t *pamt_refcount;
> +	LIST_HEAD(pamt_pages);
> +	u64 err;
> +
> +	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
> +		return;
> +
> +	hpa = ALIGN_DOWN(hpa, SZ_2M);
> +
> +	pamt_refcount = tdx_get_pamt_refcount(hpa);
> +	if (!atomic_dec_and_test(pamt_refcount))
> +		return;

... use atomic_dec_if_positive() here, then we should be able to avoid
the special handling of tdx_hpa_range_not_free() in tdx_pamt_get().
Something like:

	if (atomic_dec_if_positive(pamt_refcount) >= 0)
		return;

	spin_lock(&pamt_lock);

	/* tdx_pamt_get() called more than once */
	if (atomic_read(pamt_refcount) > 0) {
		spin_unlock(&pamt_lock);
		return;
	}

	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);

	atomic_set(pamt_refcount, -1);
	spin_unlock(&pamt_lock);

	tdx_free_pamt_pages(&pamt_pages);

Hmm.. am I missing anything?

> +
> +	spin_lock(&pamt_lock);
> +
> +	/* Lost race against tdx_pamt_add()? */
> +	if (atomic_read(pamt_refcount) != 0) {
> +		spin_unlock(&pamt_lock);
> +		return;
> +	}
> +
> +	err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
> +	spin_unlock(&pamt_lock);
> +
> +	if (err) {
> +		pr_tdx_error(TDH_PHYMEM_PAMT_REMOVE, err);
> +		return;
> +	}
> +
> +	tdx_free_pamt_pages(&pamt_pages);
> +}
> +
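One more implication: with this scheme the per-2M refcounts have to start
at -1, so whatever sets up the refcount storage in patch 2 would need to
initialize every entry explicitly instead of relying on zeroed memory.
Roughly (untested; pamt_refcounts and nr_pamt_refcounts are just
placeholder names for whatever patch 2 actually uses):

	/* -1 == "no PAMT pages installed for this 2M range" */
	for (i = 0; i < nr_pamt_refcounts; i++)
		atomic_set(&pamt_refcounts[i], -1);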