On Thu, 2025-04-24 at 11:08 +0800, Yan Zhao wrote:
> Introduce kvm_split_boundary_leafs() to manage the splitting of boundary
> leafs within the mirror root.
>
> Before zapping a specific GFN range in the mirror root, split any huge leaf
> that intersects with the boundary of the GFN range to ensure that the
> subsequent zap operation does not impact any GFN outside the specified
> range. This is crucial for the mirror root as the private page table
> requires the guest's ACCEPT operation after faulting back a GFN.
>
> This function should be called while kvm->mmu_lock is held for writing. The
> kvm->mmu_lock is temporarily released to allocate memory for sp for split.
> The only expected error is -ENOMEM.
>
> Opportunistically, WARN in tdp_mmu_zap_leafs() if zapping a huge leaf in
> the mirror root affects a GFN outside the specified range.
>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@xxxxxxxxx>
> Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
> ---
>  arch/x86/kvm/mmu/mmu.c     |  21 +++++++
>  arch/x86/kvm/mmu/tdp_mmu.c | 116 ++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/mmu/tdp_mmu.h |   1 +
>  include/linux/kvm_host.h   |   1 +
>  4 files changed, 136 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0e227199d73e..0d49c69b6b55 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1640,6 +1640,27 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
>  				     start, end - 1, can_yield, true, flush);
>  }
>  
> +/*
> + * Split large leafs at the boundary of the specified range for the mirror root
> + *
> + * Return value:
> + * 0 : success, no flush is required;
> + * 1 : success, flush is required;
> + * <0: failure.
> + */
> +int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> +	bool ret = 0;
> +
> +	lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
> +			    lockdep_is_held(&kvm->slots_lock));
> +
> +	if (tdp_mmu_enabled)
> +		ret = kvm_tdp_mmu_gfn_range_split_boundary(kvm, range);
> +
> +	return ret;
> +}
> +
>  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  {
>  	bool flush = false;
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 0f683753a7bb..d3fba5d11ea2 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -324,6 +324,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  				u64 old_spte, u64 new_spte, int level,
>  				bool shared);
>  
> +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> +				   struct kvm_mmu_page *sp, bool shared);
>  static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(bool mirror);
>  static void *get_external_spt(gfn_t gfn, u64 new_spte, int level);
>  
> @@ -962,6 +964,19 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  	return true;
>  }
>  
> +static inline bool iter_split_required(struct kvm *kvm, struct kvm_mmu_page *root,
> +				       struct tdp_iter *iter, gfn_t start, gfn_t end)
> +{
> +	if (!is_mirror_sp(root) || !is_large_pte(iter->old_spte))
> +		return false;
> +
> +	/* Fully contained, no need to split */
> +	if (iter->gfn >= start && iter->gfn + KVM_PAGES_PER_HPAGE(iter->level) <= end)
> +		return false;
> +
> +	return true;
> +}
> +
>  /*
>   * If can_yield is true, will release the MMU lock and reschedule if the
>   * scheduler needs the CPU or there is contention on the MMU lock. If this
> @@ -991,6 +1006,8 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>  		    !is_last_spte(iter.old_spte, iter.level))
>  			continue;
>  
> +		WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
> +

Kind of unrelated change? But good idea. Maybe for another patch.

>  		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
>  
>  		/*
> @@ -1246,9 +1263,6 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
>  	return 0;
>  }
>  
> -static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> -				   struct kvm_mmu_page *sp, bool shared);
> -
>  /*
>   * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
>   * page tables and SPTEs to translate the faulting guest physical address.
> @@ -1341,6 +1355,102 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>  	return ret;
>  }
>  
> +/*
> + * Split large leafs at the boundary of the specified range for the mirror root
> + */
> +static int tdp_mmu_split_boundary_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> +					gfn_t start, gfn_t end, bool can_yield, bool *flush)
> +{
> +	struct kvm_mmu_page *sp = NULL;
> +	struct tdp_iter iter;
> +
> +	WARN_ON_ONCE(!can_yield);

Why pass this in then?

> +
> +	if (!is_mirror_sp(root))
> +		return 0;

What is special about mirror roots here?

> +
> +	end = min(end, tdp_mmu_max_gfn_exclusive());
> +
> +	lockdep_assert_held_write(&kvm->mmu_lock);
> +
> +	rcu_read_lock();
> +
> +	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
> +retry:
> +		if (can_yield &&

Do we need this part of the conditional based on the above?

> +		    tdp_mmu_iter_cond_resched(kvm, &iter, *flush, false)) {
> +			*flush = false;
> +			continue;
> +		}
> +
> +		if (!is_shadow_present_pte(iter.old_spte) ||
> +		    !is_last_spte(iter.old_spte, iter.level) ||
> +		    !iter_split_required(kvm, root, &iter, start, end))
> +			continue;
> +
> +		if (!sp) {
> +			rcu_read_unlock();
> +
> +			write_unlock(&kvm->mmu_lock);
> +
> +			sp = tdp_mmu_alloc_sp_for_split(true);
> +
> +			write_lock(&kvm->mmu_lock);
> +
> +			if (!sp) {
> +				trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte,
> +							      iter.level, -ENOMEM);
> +				return -ENOMEM;
> +			}
> +			rcu_read_lock();
> +
> +			iter.yielded = true;
> +			continue;
> +		}
> +		tdp_mmu_init_child_sp(sp, &iter);
> +
> +		if (tdp_mmu_split_huge_page(kvm, &iter, sp, false))

I think it can't fail when you hold mmu write lock.

> +			goto retry;
> +
> +		sp = NULL;
> +		/*
> +		 * Set yielded in case after splitting to a lower level,
> +		 * the new iter requires further splitting.
> +		 */
> +		iter.yielded = true;
> +		*flush = true;
> +	}
> +
> +	rcu_read_unlock();
> +
> +	/* Leave it here though it should be impossible for the mirror root */
> +	if (sp)
> +		tdp_mmu_free_sp(sp);

What do you think about relying on tdp_mmu_split_huge_pages_root() and moving
this to an optimization patch at the end? Or what about just two calls to
tdp_mmu_split_huge_pages_root() at the boundaries? (Rough sketch at the bottom
of this mail.)

> +	return 0;
> +}
> +
> +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> +	enum kvm_tdp_mmu_root_types types;
> +	struct kvm_mmu_page *root;
> +	bool flush = false;
> +	int ret;
> +
> +	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;

What is the reason for KVM_INVALID_ROOTS in this case?

> +
> +	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {

It would be better to check for mirror roots here, instead of inside
tdp_mmu_split_boundary_leafs().
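Untested, but something along these lines, using only what is already in this
patch:

	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {
		/* Only the mirror root needs its boundary leafs split before zapping. */
		if (!is_mirror_sp(root))
			continue;

		ret = tdp_mmu_split_boundary_leafs(kvm, root, range->start, range->end,
						   range->may_block, &flush);
		...
	}

Then the !is_mirror_sp() early return inside the helper (and my question about
it above) can go away.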
> +		ret = tdp_mmu_split_boundary_leafs(kvm, root, range->start, range->end,
> +						   range->may_block, &flush);
> +		if (ret < 0) {
> +			if (flush)
> +				kvm_flush_remote_tlbs(kvm);
> +
> +			return ret;
> +		}
> +	}
> +	return flush;
> +}
> +
>  /* Used by mmu notifier via kvm_unmap_gfn_range() */
>  bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
>  				 bool flush)
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 52acf99d40a0..806a21d4f0e3 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -69,6 +69,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
>  void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
>  				  enum kvm_tdp_mmu_root_types root_types);
>  void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
> +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range);
>  
>  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>  
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 655d36e1f4db..19d7a577e7ed 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -272,6 +272,7 @@ struct kvm_gfn_range {
>  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> +int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range);
>  #endif
>  
>  enum {
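To expand on the "two calls to tdp_mmu_split_huge_pages_root() at the
boundaries" question above: I had something roughly like the below in mind,
inside the per-root loop. Completely untested, and it assumes
tdp_mmu_split_huge_pages_root() keeps its current
(kvm, root, start, end, target_level, shared) signature:

	/*
	 * Split any huge leaf mapping the first or the last GFN of the range
	 * down to 4K, so that no leaf straddles the range boundary. Note this
	 * also splits a boundary leaf that is fully contained in the range
	 * (which iter_split_required() avoids), trading a bit of unnecessary
	 * splitting for not having to add a dedicated walker.
	 */
	ret = tdp_mmu_split_huge_pages_root(kvm, root, range->start,
					    range->start + 1, PG_LEVEL_4K, false);
	if (!ret)
		ret = tdp_mmu_split_huge_pages_root(kvm, root, range->end - 1,
						    range->end, PG_LEVEL_4K, false);

Just wondering whether that ends up simpler than the new
tdp_mmu_split_boundary_leafs() walker.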