During the TD build phase (i.e., before the TD becomes RUNNABLE), enforce a
4KB mapping level both in the S-EPT managed by the TDX module and the mirror
page table managed by KVM.

During this phase, the TD's memory is added via tdh_mem_page_add(), which
only accepts 4KB granularity. Therefore, return PG_LEVEL_4K in TDX's
.private_max_mapping_level hook to ensure KVM maps at the 4KB level in the
mirror page table. Meanwhile, iterate over each 4KB page of a large gmem
backend page in tdx_gmem_post_populate() and invoke tdh_mem_page_add() to
map at the 4KB level in the S-EPT.

Still allow huge pages in the gmem backend during TD build time. Based on
the gmem series [1], which allows 2MB THP and non-in-place conversion, pass
region.nr_pages to kvm_gmem_populate() in tdx_vcpu_init_mem_region(). This
enables kvm_gmem_populate() to allocate huge pages from the gmem backend
when the remaining nr_pages, GFN alignment, and page private/shared
attribute permit. KVM is then able to promote the initial 4KB mappings to
huge pages after the TD is RUNNABLE.

Disallow any private huge pages during TD build time. Use KVM_BUG_ON() in
tdx_mem_page_record_premap_cnt() and tdx_is_sept_zap_err_due_to_premap() to
assert that the mapping level is 4KB.

Opportunistically, remove unused parameters in
tdx_mem_page_record_premap_cnt().

Link: https://lore.kernel.org/all/20241212063635.712877-1-michael.roth@xxxxxxx [1]
Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
---
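Note (not part of the patch, for illustration only): a rough userspace-side
sketch of what this enables, assuming the TDX UAPI from the base series
(struct kvm_tdx_cmd, struct kvm_tdx_init_mem_region, KVM_TDX_INIT_MEM_REGION
issued through KVM_MEMORY_ENCRYPT_OP on the vCPU fd); the helper name and
flag usage below are hypothetical. A single 2MB-aligned region with
nr_pages = 512 can now go to kvm_gmem_populate() in one call, letting the
gmem backend use a huge folio while the S-EPT is still populated with 4KB
entries that may be promoted once the TD is RUNNABLE.

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Illustrative helper, not from this patch. */
	static int tdx_init_2m_region(int vcpu_fd, uint64_t gpa, void *src)
	{
		struct kvm_tdx_init_mem_region region = {
			.source_addr = (uint64_t)(uintptr_t)src,
			.gpa = gpa,		/* 2MB-aligned GPA */
			.nr_pages = 512,	/* one 2MB worth of 4KB pages */
		};
		struct kvm_tdx_cmd cmd = {
			.id = KVM_TDX_INIT_MEM_REGION,
			.flags = KVM_TDX_MEASURE_MEMORY_REGION,
			.data = (uint64_t)(uintptr_t)&region,
		};

		/*
		 * KVM loops tdh_mem_page_add() over all 512 4KB pages; the
		 * backend may still satisfy the range with a single huge folio.
		 */
		return ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
	}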
 arch/x86/kvm/vmx/tdx.c | 45 ++++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 98cde20f14da..03885cb2869b 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1530,14 +1530,16 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
  * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
  * are no half-initialized shared EPT pages.
  */
-static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
-					  enum pg_level level, kvm_pfn_t pfn)
+static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, enum pg_level level)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 
 	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
 		return -EINVAL;
 
+	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+		return -EINVAL;
+
 	/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
 	atomic64_inc(&kvm_tdx->nr_premapped);
 	return 0;
 }
@@ -1571,7 +1573,7 @@ int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
 		return tdx_mem_page_aug(kvm, gfn, level, page);
 
-	return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
+	return tdx_mem_page_record_premap_cnt(kvm, level);
 }
 
 static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
@@ -1666,7 +1668,7 @@ int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
 static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
 					     u64 entry, int level)
 {
-	if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
+	if (!err || kvm_tdx->state == TD_STATE_RUNNABLE || level > PG_LEVEL_4K)
 		return false;
 
 	if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
@@ -3052,8 +3054,8 @@ struct tdx_gmem_post_populate_arg {
 	__u32 flags;
 };
 
-static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
-				  void __user *src, int order, void *_arg)
+static int tdx_gmem_post_populate_4k(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+				     void __user *src, void *_arg)
 {
 	u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -3120,6 +3122,21 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 	return ret;
 }
 
+static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+				  void __user *src, int order, void *_arg)
+{
+	unsigned long i, npages = 1 << order;
+	int ret;
+
+	for (i = 0; i < npages; i++) {
+		ret = tdx_gmem_post_populate_4k(kvm, gfn + i, pfn + i,
+						src + i * PAGE_SIZE, _arg);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
 {
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
@@ -3166,20 +3183,15 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
 		};
 
 		gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
 					     u64_to_user_ptr(region.source_addr),
-					     1, tdx_gmem_post_populate, &arg);
+					     region.nr_pages, tdx_gmem_post_populate, &arg);
 		if (gmem_ret < 0) {
 			ret = gmem_ret;
 			break;
 		}
 
-		if (gmem_ret != 1) {
-			ret = -EIO;
-			break;
-		}
-
-		region.source_addr += PAGE_SIZE;
-		region.gpa += PAGE_SIZE;
-		region.nr_pages--;
+		region.source_addr += PAGE_SIZE * gmem_ret;
+		region.gpa += PAGE_SIZE * gmem_ret;
+		region.nr_pages -= gmem_ret;
 
 		cond_resched();
 	}
@@ -3224,6 +3236,9 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
 
 int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
 {
+	if (unlikely(to_kvm_tdx(kvm)->state != TD_STATE_RUNNABLE))
+		return PG_LEVEL_4K;
+
 	return PG_LEVEL_4K;
 }
-- 
2.43.2