Re: [PATCH v12 10/18] KVM: x86/mmu: Handle guest page faults for guest_memfd with shared memory

On Wed, Jun 11, 2025, Fuad Tabba wrote:
> From: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> 
> For memslots backed by guest_memfd with shared mem support, the KVM MMU
> must always fault in pages from guest_memfd, and not from the host
> userspace_addr. Update the fault handler to do so.

And with a KVM_MEMSLOT_GUEST_MEMFD_ONLY flag, this becomes super obvious.

> This patch also refactors related function names for accuracy:

Avoid "This patch".  And phrase changelogs as commands.

> kvm_mem_is_private() returns true only when the current private/shared
> state (in the CoCo sense) of the memory is private, and returns false if
> the current state is shared explicitly or implicitly, e.g., belongs to a
> non-CoCo VM.

Again, state changes as commands.  For the above, it's not obvious whether you're
talking about the existing code or the state of things after "this patch".


> kvm_mmu_faultin_pfn_gmem() is updated to indicate that it can be used to
> fault in not just private memory, but more generally, from guest_memfd.

> +static inline u8 kvm_max_level_for_order(int order)

Do not use "inline" for functions that are visible only to the local compilation
unit.  "inline" is just a hint, and modern compilers are smart enough to inline
functions when appropriate without a hint.

A longer explanation/rant here: https://lore.kernel.org/all/ZAdfX+S323JVWNZC@xxxxxxxxxx
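
E.g. the helper below needs nothing more than "static" (body modeled on the
existing upstream kvm_max_level_for_order(), trimmed down for illustration):

static u8 kvm_max_level_for_order(int order)
{
	BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);

	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
		return PG_LEVEL_1G;
	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
		return PG_LEVEL_2M;
	return PG_LEVEL_4K;
}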

> +static inline int kvm_gmem_max_mapping_level(const struct kvm_memory_slot *slot,
> +					     gfn_t gfn, int max_level)
> +{
> +	int max_order;
>  
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;

This is dead code; the one and only caller has *just* checked for this condition.
>  
> -	host_level = host_pfn_mapping_level(kvm, gfn, slot);
> -	return min(host_level, max_level);
> +	max_order = kvm_gmem_mapping_order(slot, gfn);
> +	return min(max_level, kvm_max_level_for_order(max_order));
>  }

...

> -static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
> -					u8 max_level, int gmem_order)
> +static u8 kvm_max_level_for_fault_and_order(struct kvm *kvm,

This is comically verbose.  C ain't Java.  And having two separate helpers makes
it *really* hard to (a) even see there are TWO helpers in the first place, and
(b) understand how they differ.

Gah, and not your bug, but completely ignoring the RMP in kvm_mmu_max_mapping_level()
is wrong.  It "works" only because guest_memfd doesn't (yet) support dirty
logging, and no one enables the NX hugepage mitigation on AMD hosts.

We could plumb in the pfn and private info, but I don't really see the point,
at least not at this time.

> +					    struct kvm_page_fault *fault,
> +					    int order)
>  {
> -	u8 req_max_level;
> +	u8 max_level = fault->max_level;
>  
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;
>  
> -	max_level = min(kvm_max_level_for_order(gmem_order), max_level);
> +	max_level = min(kvm_max_level_for_order(order), max_level);
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;
>  
> -	req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
> -	if (req_max_level)
> -		max_level = min(max_level, req_max_level);
> +	if (fault->is_private) {
> +		u8 level = kvm_x86_call(private_max_mapping_level)(kvm, fault->pfn);

Hmm, so the interesting thing here is that (IIRC) the RMP restrictions aren't
just on the private pages; they also apply to the HYPERVISOR/SHARED pages.
(Don't quote me on that.)

Regardless, I'm leaning toward dropping the "private" part, and making SNP deal
with the intricacies of the RMP:

	/* Some VM types have additional restrictions, e.g. SNP's RMP. */
	req_max_level = kvm_x86_call(max_mapping_level)(kvm, fault);
	if (req_max_level)
		max_level = min(max_level, req_max_level);

Then we can get to something like:

static int kvm_gmem_max_mapping_level(struct kvm *kvm, int order,
				      struct kvm_page_fault *fault)
{
	int max_level, req_max_level;

	max_level = kvm_max_level_for_order(order);
	if (max_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;

	req_max_level = kvm_x86_call(max_mapping_level)(kvm, fault);
	if (req_max_level)
		max_level = min(max_level, req_max_level);

	return max_level;
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
			      const struct kvm_memory_slot *slot, gfn_t gfn)
{
	int max_level;

	max_level = kvm_lpage_info_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM);
	if (max_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;

	/* TODO: Comment goes here about KVM not supporting this path (yet). */
	if (kvm_mem_is_private(kvm, gfn))
		return PG_LEVEL_4K;

	if (kvm_is_memslot_gmem_only(slot)) {
		int order = kvm_gmem_mapping_order(slot, gfn);

		return min(max_level, kvm_gmem_max_mapping_level(kvm, order, NULL));
	}

	return min(max_level, host_pfn_mapping_level(kvm, gfn, slot));
}

static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu,
				    struct kvm_page_fault *fault)
{
	struct kvm *kvm = vcpu->kvm;
	int order, r;

	if (!kvm_slot_has_gmem(fault->slot)) {
		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
		return -EFAULT;
	}

	r = kvm_gmem_get_pfn(kvm, fault->slot, fault->gfn, &fault->pfn,
			     &fault->refcounted_page, &order);
	if (r) {
		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
		return r;
	}

	fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
	fault->max_level = kvm_gmem_max_mapping_level(kvm, order, fault);

	return RET_PF_CONTINUE;
}

int sev_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault)
{
	int level, rc;
	bool assigned;

	if (!sev_snp_guest(kvm))
		return 0;

	/* fault is NULL when called outside of fault context, don't WARN. */
	if (!fault || !fault->is_private)
		return 0;

	rc = snp_lookup_rmpentry(fault->pfn, &assigned, &level);
	if (rc || !assigned)
		return PG_LEVEL_4K;

	return level;
}
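
The kvm_x86_call(max_mapping_level) above assumes the private_max_mapping_level
hook gets renamed; presumably the svm_x86_ops wiring then becomes something
like:

	.max_mapping_level = sev_max_mapping_level,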
> +/*
> + * Returns true if the given gfn's private/shared status (in the CoCo sense) is
> + * private.
> + *
> + * A return value of false indicates that the gfn is explicitly or implicitly
> + * shared (i.e., non-CoCo VMs).
> + */
>  static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>  {
> -	return IS_ENABLED(CONFIG_KVM_GMEM) &&
> -	       kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +	struct kvm_memory_slot *slot;
> +
> +	if (!IS_ENABLED(CONFIG_KVM_GMEM))
> +		return false;
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (kvm_slot_has_gmem(slot) && kvm_gmem_memslot_supports_shared(slot)) {
> +		/*
> +		 * Without in-place conversion support, if a guest_memfd memslot
> +		 * supports shared memory, then all the slot's memory is
> +		 * considered not private, i.e., implicitly shared.
> +		 */
> +		return false;

Why!?!?  Just make sure KVM_MEMORY_ATTRIBUTE_PRIVATE is mutually exclusive with
mappable guest_memfd.  You need to do that no matter what.  Then you don't need
to sprinkle special case code all over the place.
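
E.g. a sketch of the enforcement (kvm_range_has_mappable_gmem() is a made-up
name; kvm_slot_has_gmem() and kvm_gmem_memslot_supports_shared() are this
series' helpers, and multiple address spaces are ignored for brevity):

static bool kvm_range_has_mappable_gmem(struct kvm *kvm, gfn_t start, gfn_t end)
{
	struct kvm_memslot_iter iter;

	/* Walk only the memslots that overlap the gfn range in question. */
	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
		if (kvm_slot_has_gmem(iter.slot) &&
		    kvm_gmem_memslot_supports_shared(iter.slot))
			return true;
	}

	return false;
}

Then kvm_vm_ioctl_set_mem_attributes() can simply reject
KVM_MEMORY_ATTRIBUTE_PRIVATE whenever that returns true for the target range.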

> +	}
> +
> +	return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
>  }
>  #else
>  static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> -- 
> 2.50.0.rc0.642.g800a2b2222-goog
> 