On Thu, 2025-04-24 at 11:08 +0800, Yan Zhao wrote:
> Introduce kvm_split_boundary_leafs() to manage the splitting of boundary
> leafs within the mirror root.
>
> Before zapping a specific GFN range in the mirror root, split any huge leaf
> that intersects with the boundary of the GFN range to ensure that the
> subsequent zap operation does not impact any GFN outside the specified
> range. This is crucial for the mirror root as the private page table
> requires the guest's ACCEPT operation after faulting back a GFN.
>
> This function should be called while kvm->mmu_lock is held for writing. The
> kvm->mmu_lock is temporarily released to allocate memory for sp for split.
> The only expected error is -ENOMEM.
>
> Opportunistically, WARN in tdp_mmu_zap_leafs() if zapping a huge leaf in
> the mirror root affects a GFN outside the specified range.
>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@xxxxxxxxx>
> Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
> ---
>  arch/x86/kvm/mmu/mmu.c     |  21 +++++++
>  arch/x86/kvm/mmu/tdp_mmu.c | 116 ++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/mmu/tdp_mmu.h |   1 +
>  include/linux/kvm_host.h   |   1 +
>  4 files changed, 136 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0e227199d73e..0d49c69b6b55 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1640,6 +1640,27 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
>  				     start, end - 1, can_yield, true, flush);
>  }
>  
> +/*
> + * Split large leafs at the boundary of the specified range for the mirror root
> + *
> + * Return value:
> + * 0 : success, no flush is required;
> + * 1 : success, flush is required;
> + * <0: failure.
> + */
> +int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> +	bool ret = 0;
> +
> +	lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
> +			    lockdep_is_held(&kvm->slots_lock));
> +
> +	if (tdp_mmu_enabled)
> +		ret = kvm_tdp_mmu_gfn_range_split_boundary(kvm, range);
> +
> +	return ret;
> +}
> +
>  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  {
>  	bool flush = false;
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 0f683753a7bb..d3fba5d11ea2 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -324,6 +324,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  				u64 old_spte, u64 new_spte, int level,
>  				bool shared);
>  
> +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> +				   struct kvm_mmu_page *sp, bool shared);
>  static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(bool mirror);
>  static void *get_external_spt(gfn_t gfn, u64 new_spte, int level);
>  
> @@ -962,6 +964,19 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  	return true;
>  }
>  
> +static inline bool iter_split_required(struct kvm *kvm, struct kvm_mmu_page *root,
> +				       struct tdp_iter *iter, gfn_t start, gfn_t end)
> +{
> +	if (!is_mirror_sp(root) || !is_large_pte(iter->old_spte))
> +		return false;
> +
> +	/* Fully contained, no need to split */
> +	if (iter->gfn >= start && iter->gfn + KVM_PAGES_PER_HPAGE(iter->level) <= end)
> +		return false;
> +
> +	return true;
> +}
> +
>  /*
>   * If can_yield is true, will release the MMU lock and reschedule if the
>   * scheduler needs the CPU or there is contention on the MMU lock. If this
> @@ -991,6 +1006,8 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>  		    !is_last_spte(iter.old_spte, iter.level))
>  			continue;
>  
> +		WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
> +

Kind of unrelated change? But good idea. Maybe for another patch.

>  		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
>  
>  		/*
> @@ -1246,9 +1263,6 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
>  	return 0;
>  }
>  
> -static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> -				   struct kvm_mmu_page *sp, bool shared);
> -
>  /*
>   * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
>   * page tables and SPTEs to translate the faulting guest physical address.
> @@ -1341,6 +1355,102 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>  	return ret;
>  }
>  
> +/*
> + * Split large leafs at the boundary of the specified range for the mirror root
> + */
> +static int tdp_mmu_split_boundary_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> +					gfn_t start, gfn_t end, bool can_yield, bool *flush)
> +{
> +	struct kvm_mmu_page *sp = NULL;
> +	struct tdp_iter iter;
> +
> +	WARN_ON_ONCE(!can_yield);

Why pass this in then?

> +
> +	if (!is_mirror_sp(root))
> +		return 0;

What is special about mirror roots here?

> +
> +	end = min(end, tdp_mmu_max_gfn_exclusive());
> +
> +	lockdep_assert_held_write(&kvm->mmu_lock);
> +
> +	rcu_read_lock();
> +
> +	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
> +retry:
> +		if (can_yield &&

Do we need this part of the conditional based on the above?

> +		    tdp_mmu_iter_cond_resched(kvm, &iter, *flush, false)) {
> +			*flush = false;
> +			continue;
> +		}
> +
> +		if (!is_shadow_present_pte(iter.old_spte) ||
> +		    !is_last_spte(iter.old_spte, iter.level) ||
> +		    !iter_split_required(kvm, root, &iter, start, end))
> +			continue;
> +
> +		if (!sp) {
> +			rcu_read_unlock();
> +
> +			write_unlock(&kvm->mmu_lock);
> +
> +			sp = tdp_mmu_alloc_sp_for_split(true);
> +
> +			write_lock(&kvm->mmu_lock);
> +
> +			if (!sp) {
> +				trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte,
> +							      iter.level, -ENOMEM);
> +				return -ENOMEM;
> +			}
> +			rcu_read_lock();
> +
> +			iter.yielded = true;
> +			continue;
> +		}
> +		tdp_mmu_init_child_sp(sp, &iter);
> +
> +		if (tdp_mmu_split_huge_page(kvm, &iter, sp, false))

I think it can't fail when you hold mmu write lock.

> +			goto retry;
> +
> +		sp = NULL;
> +		/*
> +		 * Set yielded in case after splitting to a lower level,
> +		 * the new iter requires further splitting.
> +		 */
> +		iter.yielded = true;
> +		*flush = true;
> +	}
> +
> +	rcu_read_unlock();
> +
> +	/* Leave it here though it should be impossible for the mirror root */
> +	if (sp)
> +		tdp_mmu_free_sp(sp);

What do you think about relying on tdp_mmu_split_huge_pages_root() and moving
this to an optimization patch at the end? Or what about just two calls to
tdp_mmu_split_huge_pages_root() at the boundaries? (Rough sketch at the bottom
of this mail.)

> +	return 0;
> +}
> +
> +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> +	enum kvm_tdp_mmu_root_types types;
> +	struct kvm_mmu_page *root;
> +	bool flush = false;
> +	int ret;
> +
> +	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;

What is the reason for KVM_INVALID_ROOTS in this case?

> +
> +	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {

It would be better to check for mirror roots here, instead of inside
tdp_mmu_split_boundary_leafs().
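Untested, but something along these lines, using only what is already in this
patch:

	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {
		/* Only the mirror root needs its boundary leafs split before zapping. */
		if (!is_mirror_sp(root))
			continue;

		ret = tdp_mmu_split_boundary_leafs(kvm, root, range->start, range->end,
						   range->may_block, &flush);
		...
	}

Then the !is_mirror_sp() early return inside the helper (and my question about
it above) can go away.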
> +		ret = tdp_mmu_split_boundary_leafs(kvm, root, range->start, range->end,
> +						   range->may_block, &flush);
> +		if (ret < 0) {
> +			if (flush)
> +				kvm_flush_remote_tlbs(kvm);
> +
> +			return ret;
> +		}
> +	}
> +	return flush;
> +}
> +
>  /* Used by mmu notifier via kvm_unmap_gfn_range() */
>  bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
>  				 bool flush)
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 52acf99d40a0..806a21d4f0e3 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -69,6 +69,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
>  void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
>  				  enum kvm_tdp_mmu_root_types root_types);
>  void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
> +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range);
>  
>  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>  
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 655d36e1f4db..19d7a577e7ed 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -272,6 +272,7 @@ struct kvm_gfn_range {
>  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> +int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range);
>  #endif
>  
>  enum {
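To expand on the "two calls to tdp_mmu_split_huge_pages_root() at the
boundaries" question above: I had something roughly like the below in mind,
inside the per-root loop. Completely untested, and it assumes
tdp_mmu_split_huge_pages_root() keeps its current
(kvm, root, start, end, target_level, shared) signature:

	/*
	 * Split any huge leaf mapping the first or the last GFN of the range
	 * down to 4K, so that no leaf straddles the range boundary. Note this
	 * also splits a boundary leaf that is fully contained in the range
	 * (which iter_split_required() avoids), trading a bit of unnecessary
	 * splitting for not having to add a dedicated walker.
	 */
	ret = tdp_mmu_split_huge_pages_root(kvm, root, range->start,
					    range->start + 1, PG_LEVEL_4K, false);
	if (!ret)
		ret = tdp_mmu_split_huge_pages_root(kvm, root, range->end - 1,
						    range->end, PG_LEVEL_4K, false);

Just wondering whether that ends up simpler than the new
tdp_mmu_split_boundary_leafs() walker.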