On Mon, May 05, 2025 at 11:05:12AM +0000, Huang, Kai wrote: > > > +static atomic_t *pamt_refcounts; > > + > > static enum tdx_module_status_t tdx_module_status; > > static DEFINE_MUTEX(tdx_module_lock); > > > > @@ -1035,9 +1038,108 @@ static int config_global_keyid(void) > > return ret; > > } > > > > +atomic_t *tdx_get_pamt_refcount(unsigned long hpa) > > +{ > > + return &pamt_refcounts[hpa / PMD_SIZE]; > > +} > > +EXPORT_SYMBOL_GPL(tdx_get_pamt_refcount); > > It's not quite clear why this function needs to be exported in this patch. IMO > it's better to move the export to the patch which actually needs it. > > Looking at patch 5, tdx_pamt_get()/put() use it, and they are in KVM code. But > I think we should just put them here in this file. tdx_alloc_page() and > tdx_free_page() should be in this file too. > > And instead of exporting tdx_get_pamt_refcount(), the TDX core code here can > export tdx_alloc_page() and tdx_free_page(), providing two high level helpers to > allow the TDX users (e.g., KVM) to allocate/free TDX private pages. How PAMT > pages are allocated is then hidden in the core TDX code. We would still need tdx_get_pamt_refcount() to handle the case when we need to bump the refcount for a page allocated elsewhere. 
> > + > > +static int pamt_refcount_populate(pte_t *pte, unsigned long addr, void *data) > > +{ > > + unsigned long vaddr; > > + pte_t entry; > > + > > + if (!pte_none(ptep_get(pte))) > > + return 0; > > + > > + vaddr = __get_free_page(GFP_KERNEL | __GFP_ZERO); > > + if (!vaddr) > > + return -ENOMEM; > > + > > + entry = pfn_pte(PFN_DOWN(__pa(vaddr)), PAGE_KERNEL); > > + > > + spin_lock(&init_mm.page_table_lock); > > + if (pte_none(ptep_get(pte))) > > + set_pte_at(&init_mm, addr, pte, entry); > > + else > > + free_page(vaddr); > > + spin_unlock(&init_mm.page_table_lock); > > + > > + return 0; > > +} > > + > > +static int pamt_refcount_depopulate(pte_t *pte, unsigned long addr, > > + void *data) > > +{ > > + unsigned long vaddr; > > + > > + vaddr = (unsigned long)__va(PFN_PHYS(pte_pfn(ptep_get(pte)))); > > + > > + spin_lock(&init_mm.page_table_lock); > > + if (!pte_none(ptep_get(pte))) { > > + pte_clear(&init_mm, addr, pte); > > + free_page(vaddr); > > + } > > + spin_unlock(&init_mm.page_table_lock); > > + > > + return 0; > > +} > > + > > +static int alloc_tdmr_pamt_refcount(struct tdmr_info *tdmr) > > +{ > > + unsigned long start, end; > > + > > + start = (unsigned long)tdx_get_pamt_refcount(tdmr->base); > > + end = (unsigned long)tdx_get_pamt_refcount(tdmr->base + tdmr->size); > > + start = round_down(start, PAGE_SIZE); > > + end = round_up(end, PAGE_SIZE); > > + > > + return apply_to_page_range(&init_mm, start, end - start, > > + pamt_refcount_populate, NULL); > > +} > > IIUC, populating refcount based on TDMR will slightly waste memory. The reason > is IIUC we don't need to populate the refcount for a 2M range if the range is > completely marked as reserved in TDMR, because it's not possible for the kernel > to use such range for TDX. > > Populating based on the list of TDX memory blocks should be better. In > practice, the difference should be unnoticeable, but conceptually, using TDX > memory blocks is better. 
Okay, I will look into this after dealing with huge pages. > > + > > +static int init_pamt_metadata(void) > > +{ > > + size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts); > > + struct vm_struct *area; > > + > > + if (!tdx_supports_dynamic_pamt(&tdx_sysinfo)) > > + return 0; > > + > > + /* > > + * Reserve vmalloc range for PAMT reference counters. It covers all > > + * physical address space up to max_pfn. It is going to be populated > > + * from init_tdmr() only for present memory that available for TDX use. > > + */ > > + area = get_vm_area(size, VM_IOREMAP); > > + if (!area) > > + return -ENOMEM; > > + > > + pamt_refcounts = area->addr; > > + return 0; > > +} > > + > > +static void free_pamt_metadata(void) > > +{ > > + size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts); > > + > > + size = round_up(size, PAGE_SIZE); > > + apply_to_existing_page_range(&init_mm, > > + (unsigned long)pamt_refcounts, > > + size, pamt_refcount_depopulate, > > + NULL); > > + vfree(pamt_refcounts); > > + pamt_refcounts = NULL; > > +} > > + > > static int init_tdmr(struct tdmr_info *tdmr) > > { > > u64 next; > > + int ret; > > + > > + ret = alloc_tdmr_pamt_refcount(tdmr); > > + if (ret) > > + return ret; > > > > /* > > * Initializing a TDMR can be time consuming. 
To avoid long > > @@ -1048,7 +1150,6 @@ static int init_tdmr(struct tdmr_info *tdmr) > > struct tdx_module_args args = { > > .rcx = tdmr->base, > > }; > > - int ret; > > > > ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args); > > if (ret) > > @@ -1134,10 +1235,15 @@ static int init_tdx_module(void) > > if (ret) > > goto err_reset_pamts; > > > > + /* Reserve vmalloc range for PAMT reference counters */ > > + ret = init_pamt_metadata(); > > + if (ret) > > + goto err_reset_pamts; > > + > > /* Initialize TDMRs to complete the TDX module initialization */ > > ret = init_tdmrs(&tdx_tdmr_list); > > if (ret) > > - goto err_reset_pamts; > > + goto err_free_pamt_metadata; > > > > pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list)); > > > > @@ -1149,6 +1255,9 @@ static int init_tdx_module(void) > > put_online_mems(); > > return ret; > > > > +err_free_pamt_metadata: > > + free_pamt_metadata(); > > + > > err_reset_pamts: > > /* > > * Part of PAMTs may already have been initialized by the > -- Kiryl Shutsemau / Kirill A. Shutemov