On Wed, Apr 30, 2025 at 9:57 AM Fuad Tabba <tabba@xxxxxxxxxx> wrote:
>
> Add arm64 support for handling guest page faults on guest_memfd
> backed memslots.
>
> For now, the fault granule is restricted to PAGE_SIZE.
>
> Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> ---
>  arch/arm64/kvm/mmu.c     | 65 +++++++++++++++++++++++++++-------------
>  include/linux/kvm_host.h |  5 ++++
>  virt/kvm/kvm_main.c      |  5 ----
>  3 files changed, 50 insertions(+), 25 deletions(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 148a97c129de..d1044c7f78bb 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1466,6 +1466,30 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>         return vma->vm_flags & VM_MTE_ALLOWED;
>  }
>
> +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> +                            gfn_t gfn, bool write_fault, bool *writable,
> +                            struct page **page, bool is_gmem)
> +{
> +        kvm_pfn_t pfn;
> +        int ret;
> +
> +        if (!is_gmem)
> +                return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
> +
> +        *writable = false;
> +
> +        ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
> +        if (!ret) {
> +                *writable = !memslot_is_readonly(slot);
> +                return pfn;
> +        }
> +
> +        if (ret == -EHWPOISON)
> +                return KVM_PFN_ERR_HWPOISON;
> +
> +        return KVM_PFN_ERR_NOSLOT_MASK;
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                            struct kvm_s2_trans *nested,
>                            struct kvm_memory_slot *memslot, unsigned long hva,
> @@ -1473,19 +1497,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  {
>          int ret = 0;
>          bool write_fault, writable;
> -        bool exec_fault, mte_allowed;
> +        bool exec_fault, mte_allowed = false;
>          bool device = false, vfio_allow_any_uc = false;
>          unsigned long mmu_seq;
>          phys_addr_t ipa = fault_ipa;
>          struct kvm *kvm = vcpu->kvm;
> -        struct vm_area_struct *vma;
> +        struct vm_area_struct *vma = NULL;
>          short vma_shift;
>          void *memcache;
> -        gfn_t gfn;
> +        gfn_t gfn = ipa >> PAGE_SHIFT;
>          kvm_pfn_t pfn;
>          bool logging_active = memslot_is_logging(memslot);
> -        bool force_pte = logging_active || is_protected_kvm_enabled();
> -        long vma_pagesize, fault_granule;
> +        bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
> +        bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
> +        long vma_pagesize, fault_granule = PAGE_SIZE;
>          enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>          struct kvm_pgtable *pgt;
>          struct page *page;
> @@ -1522,16 +1547,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                  return ret;
>          }
>
> +        mmap_read_lock(current->mm);

We don't have to take the mmap_lock for gmem faults, right?

I think we should reorganize user_mem_abort() a bit (and I think
vma_pagesize and maybe vma_shift should be renamed) given the changes
we're making here. Below is a diff that I think might be a little
cleaner. Let me know what you think.

> +
>          /*
>           * Let's check if we will get back a huge page backed by hugetlbfs, or
>           * get block mapping for device MMIO region.
>           */
> -        mmap_read_lock(current->mm);
> -        vma = vma_lookup(current->mm, hva);
> -        if (unlikely(!vma)) {
> -                kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> -                mmap_read_unlock(current->mm);
> -                return -EFAULT;
> +        if (!is_gmem) {
> +                vma = vma_lookup(current->mm, hva);
> +                if (unlikely(!vma)) {
> +                        kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> +                        mmap_read_unlock(current->mm);
> +                        return -EFAULT;
> +                }
> +
> +                vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> +                mte_allowed = kvm_vma_mte_allowed(vma);
>          }
>
>          if (force_pte)
> @@ -1602,18 +1633,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                  ipa &= ~(vma_pagesize - 1);
>          }
>
> -        gfn = ipa >> PAGE_SHIFT;
> -        mte_allowed = kvm_vma_mte_allowed(vma);
> -
> -        vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> -
>          /* Don't use the VMA after the unlock -- it may have vanished */
>          vma = NULL;
>
>          /*
>           * Read mmu_invalidate_seq so that KVM can detect if the results of
> -         * vma_lookup() or __kvm_faultin_pfn() become stale prior to
> -         * acquiring kvm->mmu_lock.
> +         * vma_lookup() or faultin_pfn() become stale prior to acquiring
> +         * kvm->mmu_lock.
>           *
>           * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
>           * with the smp_wmb() in kvm_mmu_invalidate_end().
> @@ -1621,8 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>          mmu_seq = vcpu->kvm->mmu_invalidate_seq;
>          mmap_read_unlock(current->mm);
>
> -        pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> -                                &writable, &page);
> +        pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
>          if (pfn == KVM_PFN_ERR_HWPOISON) {

I think we need to take care to handle HWPOISON properly. I know that
GUP(hva) --> pfn is (or will most likely be) the case, but with gmem
it *might* not be. So the following line isn't right.

I think we need to handle HWPOISON for gmem using memory fault exits
instead of sending a SIGBUS to userspace. This would be consistent
with how KVM/x86 today handles getting a HWPOISON page back from
kvm_gmem_get_pfn(). I'm not entirely sure how KVM/x86 is meant to
handle HWPOISON on shared gmem pages yet; I need to keep reading your
series.

The reorganization diff below leaves this unfixed.

>                  kvm_send_hwpoison_signal(hva, vma_shift);
>                  return 0;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f3af6bff3232..1b2e4e9a7802 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
>          return gfn_to_memslot(kvm, gfn)->id;
>  }
>
> +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> +{
> +        return slot->flags & KVM_MEM_READONLY;
> +}
> +
>  static inline gfn_t
>  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
>  {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index c75d8e188eb7..d9bca5ba19dc 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2640,11 +2640,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
>          return size;
>  }
>
> -static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> -{
> -        return slot->flags & KVM_MEM_READONLY;
> -}
> -
>  static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
>                                         gfn_t *nr_pages, bool write)
>  {
> --
> 2.49.0.901.g37484f566f-goog

Thanks, Fuad!
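To be a bit more concrete about the HWPOISON point above, the shape I
have in mind for the gmem case is something like the following. This
is an untested sketch only: it assumes the generic
kvm_prepare_memory_fault_exit() helper is the right thing to reuse on
arm64 (the way KVM/x86 uses it when kvm_gmem_get_pfn() fails), and
that reporting the fault as non-private is correct for this series.

        if (pfn == KVM_PFN_ERR_HWPOISON) {
                if (is_gmem) {
                        /*
                         * There may be no meaningful hva to send SIGBUS on;
                         * report the poisoned gpa to userspace instead.
                         */
                        kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
                                                      write_fault, exec_fault,
                                                      false);
                        return -EFAULT;
                }
                kvm_send_hwpoison_signal(hva, vma_shift);
                return 0;
        }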
Here's the reorganization/rename diff:

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d1044c7f78bba..c9eb72fe9013b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1502,7 +1502,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         unsigned long mmu_seq;
         phys_addr_t ipa = fault_ipa;
         struct kvm *kvm = vcpu->kvm;
-        struct vm_area_struct *vma = NULL;
         short vma_shift;
         void *memcache;
         gfn_t gfn = ipa >> PAGE_SHIFT;
@@ -1510,7 +1509,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         bool logging_active = memslot_is_logging(memslot);
         bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
         bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
-        long vma_pagesize, fault_granule = PAGE_SIZE;
+        long target_size = PAGE_SIZE, fault_granule = PAGE_SIZE;
         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
         struct kvm_pgtable *pgt;
         struct page *page;
@@ -1547,13 +1546,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                 return ret;
         }
 
-        mmap_read_lock(current->mm);
-
         /*
          * Let's check if we will get back a huge page backed by hugetlbfs, or
          * get block mapping for device MMIO region.
          */
         if (!is_gmem) {
+                struct vm_area_struct *vma = NULL;
+
+                mmap_read_lock(current->mm);
+
                 vma = vma_lookup(current->mm, hva);
                 if (unlikely(!vma)) {
                         kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
@@ -1563,38 +1564,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
                 mte_allowed = kvm_vma_mte_allowed(vma);
-        }
-
-        if (force_pte)
-                vma_shift = PAGE_SHIFT;
-        else
-                vma_shift = get_vma_page_shift(vma, hva);
+                vma_shift = force_pte ? PAGE_SHIFT : get_vma_page_shift(vma, hva);
 
-        switch (vma_shift) {
+                switch (vma_shift) {
 #ifndef __PAGETABLE_PMD_FOLDED
-        case PUD_SHIFT:
-                if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
-                        break;
-                fallthrough;
+                case PUD_SHIFT:
+                        if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+                                break;
+                        fallthrough;
 #endif
-        case CONT_PMD_SHIFT:
-                vma_shift = PMD_SHIFT;
-                fallthrough;
-        case PMD_SHIFT:
-                if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+                case CONT_PMD_SHIFT:
+                        vma_shift = PMD_SHIFT;
+                        fallthrough;
+                case PMD_SHIFT:
+                        if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+                                break;
+                        fallthrough;
+                case CONT_PTE_SHIFT:
+                        vma_shift = PAGE_SHIFT;
+                        force_pte = true;
+                        fallthrough;
+                case PAGE_SHIFT:
                         break;
-                fallthrough;
-        case CONT_PTE_SHIFT:
-                vma_shift = PAGE_SHIFT;
-                force_pte = true;
-                fallthrough;
-        case PAGE_SHIFT:
-                break;
-        default:
-                WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
-        }
+                default:
+                        WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
+                }
 
-        vma_pagesize = 1UL << vma_shift;
+                /*
+                 * Read mmu_invalidate_seq so that KVM can detect if the results of
+                 * vma_lookup() or faultin_pfn() become stale prior to acquiring
+                 * kvm->mmu_lock.
+                 *
+                 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+                 * with the smp_wmb() in kvm_mmu_invalidate_end().
+                 */
+                mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+                mmap_read_unlock(current->mm);
+
+                target_size = 1UL << vma_shift;
+        }
 
         if (nested) {
                 unsigned long max_map_size;
@@ -1620,7 +1628,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                         max_map_size = PAGE_SIZE;
 
                 force_pte = (max_map_size == PAGE_SIZE);
-                vma_pagesize = min(vma_pagesize, (long)max_map_size);
+                target_size = min(target_size, (long)max_map_size);
         }
 
         /*
@@ -1628,27 +1636,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
          * ensure we find the right PFN and lay down the mapping in the right
          * place.
          */
-        if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
-                fault_ipa &= ~(vma_pagesize - 1);
-                ipa &= ~(vma_pagesize - 1);
+        if (target_size == PMD_SIZE || target_size == PUD_SIZE) {
+                fault_ipa &= ~(target_size - 1);
+                ipa &= ~(target_size - 1);
         }
 
-        /* Don't use the VMA after the unlock -- it may have vanished */
-        vma = NULL;
-
-        /*
-         * Read mmu_invalidate_seq so that KVM can detect if the results of
-         * vma_lookup() or faultin_pfn() become stale prior to acquiring
-         * kvm->mmu_lock.
-         *
-         * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
-         * with the smp_wmb() in kvm_mmu_invalidate_end().
-         */
-        mmu_seq = vcpu->kvm->mmu_invalidate_seq;
-        mmap_read_unlock(current->mm);
-
         pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
         if (pfn == KVM_PFN_ERR_HWPOISON) {
+                // TODO: Handle gmem properly. vma_shift
+                // intentionally left uninitialized.
                 kvm_send_hwpoison_signal(hva, vma_shift);
                 return 0;
         }
@@ -1658,9 +1654,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         if (kvm_is_device_pfn(pfn)) {
                 /*
                  * If the page was identified as device early by looking at
-                 * the VMA flags, vma_pagesize is already representing the
+                 * the VMA flags, target_size is already representing the
                  * largest quantity we can map. If instead it was mapped
-                 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
+                 * via __kvm_faultin_pfn(), target_size is set to PAGE_SIZE
                  * and must not be upgraded.
                  *
                  * In both cases, we don't let transparent_hugepage_adjust()
@@ -1699,7 +1695,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         kvm_fault_lock(kvm);
         pgt = vcpu->arch.hw_mmu->pgt;
-        if (mmu_invalidate_retry(kvm, mmu_seq)) {
+        if (!is_gmem && mmu_invalidate_retry(kvm, mmu_seq)) {
                 ret = -EAGAIN;
                 goto out_unlock;
         }
 
@@ -1708,16 +1704,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
          * If we are not forced to use page mapping, check if we are
          * backed by a THP and thus use block mapping if possible.
          */
-        if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
+        if (target_size == PAGE_SIZE && !(force_pte || device)) {
                 if (fault_is_perm && fault_granule > PAGE_SIZE)
-                        vma_pagesize = fault_granule;
-                else
-                        vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
+                        target_size = fault_granule;
+                else if (!is_gmem)
+                        target_size = transparent_hugepage_adjust(kvm, memslot,
                                                                    hva, &pfn, &fault_ipa);
 
-                if (vma_pagesize < 0) {
-                        ret = vma_pagesize;
+                if (target_size < 0) {
+                        ret = target_size;
                         goto out_unlock;
                 }
         }
@@ -1725,7 +1721,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
                 /* Check the VMM hasn't introduced a new disallowed VMA */
                 if (mte_allowed) {
-                        sanitise_mte_tags(kvm, pfn, vma_pagesize);
+                        sanitise_mte_tags(kvm, pfn, target_size);
                 } else {
                         ret = -EFAULT;
                         goto out_unlock;
                 }
@@ -1750,10 +1746,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         /*
          * Under the premise of getting a FSC_PERM fault, we just need to relax
-         * permissions only if vma_pagesize equals fault_granule. Otherwise,
+         * permissions only if target_size equals fault_granule. Otherwise,
          * kvm_pgtable_stage2_map() should be called to change block size.
          */
-        if (fault_is_perm && vma_pagesize == fault_granule) {
+        if (fault_is_perm && target_size == fault_granule) {
                 /*
                  * Drop the SW bits in favour of those stored in the
                  * PTE, which will be preserved.
@@ -1761,7 +1757,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                 prot &= ~KVM_NV_GUEST_MAP_SZ;
                 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
         } else {
-                ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
+                ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, target_size,
                                              __pfn_to_phys(pfn), prot, memcache, flags);
         }
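To spell out what the reorg is going for, the gmem path through
user_mem_abort() would boil down to roughly the following. This is a
condensed sketch of the diff above, not a literal patch, and the
HWPOISON question is still the open item:

        /* gmem-backed slots never touch the VMA or the mmap_lock. */
        if (!is_gmem) {
                mmap_read_lock(current->mm);
                /* vma_lookup(), vma_shift/target_size selection, ... */
                mmu_seq = vcpu->kvm->mmu_invalidate_seq;
                mmap_read_unlock(current->mm);
                target_size = 1UL << vma_shift;
        }
        /*
         * else: target_size stays PAGE_SIZE, and mte_allowed and
         * vfio_allow_any_uc keep their false defaults.
         */

        pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);

        kvm_fault_lock(kvm);
        /* mmu_seq was never read on the gmem path, so don't check it there. */
        if (!is_gmem && mmu_invalidate_retry(kvm, mmu_seq)) {
                ret = -EAGAIN;
                goto out_unlock;
        }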