Fuad Tabba <tabba@xxxxxxxxxx> writes:

> Until now, faults to private memory backed by guest_memfd are always
> consumed from guest_memfd whereas faults to shared memory are consumed
> from anonymous memory. Subsequent patches will allow sharing guest_memfd
> backed memory in-place, and mapping it by the host. Faults to in-place
> shared memory should be consumed from guest_memfd as well.
>
> In order to facilitate that, generalize the fault lookups. Currently,
> only private memory is consumed from guest_memfd and therefore as it
> stands, this patch does not change the behavior.
>
> Co-developed-by: David Hildenbrand <david@xxxxxxxxxx>
> Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
> Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> ---
>  arch/x86/kvm/mmu/mmu.c   | 19 +++++++++----------
>  include/linux/kvm_host.h |  6 ++++++
>  2 files changed, 15 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d5dd869c890..08eebd24a0e1 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3258,7 +3258,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
>
>  static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
>  				       const struct kvm_memory_slot *slot,
> -				       gfn_t gfn, int max_level, bool is_private)
> +				       gfn_t gfn, int max_level, bool is_gmem)
>  {
>  	struct kvm_lpage_info *linfo;
>  	int host_level;
> @@ -3270,7 +3270,7 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
>  			break;
>  	}
>
> -	if (is_private)
> +	if (is_gmem)
>  		return max_level;

I think this renaming isn't quite accurate. IIUC,
__kvm_mmu_max_mapping_level() skips host_pfn_mapping_level() when the
gfn is private because private memory is never mapped into userspace,
so there are no userspace page tables to query. Renaming is_private to
is_gmem in this function implies that whenever gmem is used, including
for shared pages from gmem, lpage_info alone is sufficient and there is
no need to query userspace page tables.

>
>  	if (max_level == PG_LEVEL_4K)
> @@ -3283,10 +3283,9 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
>  int kvm_mmu_max_mapping_level(struct kvm *kvm,
>  			      const struct kvm_memory_slot *slot, gfn_t gfn)
>  {
> -	bool is_private = kvm_slot_has_gmem(slot) &&
> -			  kvm_mem_is_private(kvm, gfn);
> +	bool is_gmem = kvm_slot_has_gmem(slot) && kvm_mem_from_gmem(kvm, gfn);

This renaming should probably be undone too.
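In other words, undoing the two renames against the quoted hunks would
look roughly like this (just a sketch to spell out the suggestion, not
tested; the call site below would then keep passing is_private as
before):

-	if (is_gmem)
+	if (is_private)
 		return max_level;

and

-	bool is_gmem = kvm_slot_has_gmem(slot) && kvm_mem_from_gmem(kvm, gfn);
+	bool is_private = kvm_slot_has_gmem(slot) &&
+			  kvm_mem_is_private(kvm, gfn);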
>
> -	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
> +	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_gmem);
>  }
>
>  void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> @@ -4465,7 +4464,7 @@ static inline u8 kvm_max_level_for_order(int order)
>  	return PG_LEVEL_4K;
>  }
>
> -static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
> +static u8 kvm_max_gmem_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
>  					u8 max_level, int gmem_order)
>  {
>  	u8 req_max_level;
> @@ -4491,7 +4490,7 @@ static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
>  				      r == RET_PF_RETRY, fault->map_writable);
>  }
>
> -static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
> +static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu,
>  				       struct kvm_page_fault *fault)
>  {
>  	int max_order, r;
> @@ -4509,8 +4508,8 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
>  	}
>
>  	fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
> -	fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
> -							 fault->max_level, max_order);
> +	fault->max_level = kvm_max_gmem_mapping_level(vcpu->kvm, fault->pfn,
> +						      fault->max_level, max_order);
>
>  	return RET_PF_CONTINUE;
>  }
> @@ -4521,7 +4520,7 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
>  	unsigned int foll = fault->write ? FOLL_WRITE : 0;
>
>  	if (fault->is_private)
> -		return kvm_mmu_faultin_pfn_private(vcpu, fault);
> +		return kvm_mmu_faultin_pfn_gmem(vcpu, fault);
>
>  	foll |= FOLL_NOWAIT;
>  	fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index d9616ee6acc7..cdcd7ac091b5 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2514,6 +2514,12 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>  }
>  #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
>
> +static inline bool kvm_mem_from_gmem(struct kvm *kvm, gfn_t gfn)
> +{
> +	/* For now, only private memory gets consumed from guest_memfd. */
> +	return kvm_mem_is_private(kvm, gfn);
> +}

Can I understand this function as "should fault from gmem"? And hence
also "was faulted from gmem"?

After this entire patch series, for arm64, KVM will always service
stage 2 faults from gmem. Perhaps this function should retain your
suggested name of kvm_mem_from_gmem() but only depend on
kvm_arch_gmem_supports_shared_mem(), since this patch series doesn't
update the x86 MMU. So something like this,

+static inline bool kvm_mem_from_gmem(struct kvm *kvm, gfn_t gfn)
+{
+	return kvm_arch_gmem_supports_shared_mem(kvm);
+}

with its only usage in arm64. When the MMU code for x86 is updated, we
could then update the above with

 static inline bool kvm_mem_from_gmem(struct kvm *kvm, gfn_t gfn)
 {
-	return kvm_arch_gmem_supports_shared_mem(kvm);
+	return kvm_arch_gmem_supports_shared_mem(kvm) ||
+	       kvm_gmem_should_always_use_gmem(gfn_to_memslot(kvm, gfn)->gmem.file) ||
+	       kvm_mem_is_private(kvm, gfn);
 }

where kvm_gmem_should_always_use_gmem() will read a guest_memfd flag?

> +
>  #ifdef CONFIG_KVM_GMEM
>  int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
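To make the kvm_gmem_should_always_use_gmem() part above concrete, I
was thinking of something along these lines (entirely a sketch: the
flag name is made up, and I'm assuming the guest_memfd creation flags
stay reachable from the file via its inode, which may not match how
the new flag ends up being stored):

/* Hypothetical helper; GUEST_MEMFD_FLAG_SHARED is a placeholder name. */
static inline bool kvm_gmem_should_always_use_gmem(struct file *file)
{
	unsigned long flags = (unsigned long)file_inode(file)->i_private;

	return flags & GUEST_MEMFD_FLAG_SHARED;
}

i.e. the "always fault from gmem" policy would be a property of the
guest_memfd instance itself rather than of the per-gfn memory
attributes.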