Re: [PATCH v8 10/13] KVM: arm64: Handle guest_memfd()-backed guest page faults

Hi James,


On Fri, 9 May 2025 at 21:15, James Houghton <jthoughton@xxxxxxxxxx> wrote:
>
> On Wed, Apr 30, 2025 at 9:57 AM Fuad Tabba <tabba@xxxxxxxxxx> wrote:
> >
> > Add arm64 support for handling guest page faults on guest_memfd
> > backed memslots.
> >
> > For now, the fault granule is restricted to PAGE_SIZE.
> >
> > Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> > ---
> >  arch/arm64/kvm/mmu.c     | 65 +++++++++++++++++++++++++++-------------
> >  include/linux/kvm_host.h |  5 ++++
> >  virt/kvm/kvm_main.c      |  5 ----
> >  3 files changed, 50 insertions(+), 25 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 148a97c129de..d1044c7f78bb 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -1466,6 +1466,30 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
> >         return vma->vm_flags & VM_MTE_ALLOWED;
> >  }
> >
> > +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > +                            gfn_t gfn, bool write_fault, bool *writable,
> > +                            struct page **page, bool is_gmem)
> > +{
> > +       kvm_pfn_t pfn;
> > +       int ret;
> > +
> > +       if (!is_gmem)
> > +               return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
> > +
> > +       *writable = false;
> > +
> > +       ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
> > +       if (!ret) {
> > +               *writable = !memslot_is_readonly(slot);
> > +               return pfn;
> > +       }
> > +
> > +       if (ret == -EHWPOISON)
> > +               return KVM_PFN_ERR_HWPOISON;
> > +
> > +       return KVM_PFN_ERR_NOSLOT_MASK;
> > +}
> > +
> >  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                           struct kvm_s2_trans *nested,
> >                           struct kvm_memory_slot *memslot, unsigned long hva,
> > @@ -1473,19 +1497,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  {
> >         int ret = 0;
> >         bool write_fault, writable;
> > -       bool exec_fault, mte_allowed;
> > +       bool exec_fault, mte_allowed = false;
> >         bool device = false, vfio_allow_any_uc = false;
> >         unsigned long mmu_seq;
> >         phys_addr_t ipa = fault_ipa;
> >         struct kvm *kvm = vcpu->kvm;
> > -       struct vm_area_struct *vma;
> > +       struct vm_area_struct *vma = NULL;
> >         short vma_shift;
> >         void *memcache;
> > -       gfn_t gfn;
> > +       gfn_t gfn = ipa >> PAGE_SHIFT;
> >         kvm_pfn_t pfn;
> >         bool logging_active = memslot_is_logging(memslot);
> > -       bool force_pte = logging_active || is_protected_kvm_enabled();
> > -       long vma_pagesize, fault_granule;
> > +       bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
> > +       bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
> > +       long vma_pagesize, fault_granule = PAGE_SIZE;
> >         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> >         struct kvm_pgtable *pgt;
> >         struct page *page;
> > @@ -1522,16 +1547,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                         return ret;
> >         }
> >
> > +       mmap_read_lock(current->mm);
>
> We don't have to take the mmap_lock for gmem faults, right?
>
> I think we should reorganize user_mem_abort() a bit (and I think vma_pagesize
> and maybe vma_shift should be renamed) given the changes we're making here.

Good point.

> Below is a diff that I think might be a little cleaner. Let me know what you
> think.
>
> > +
> >         /*
> >          * Let's check if we will get back a huge page backed by hugetlbfs, or
> >          * get block mapping for device MMIO region.
> >          */
> > -       mmap_read_lock(current->mm);
> > -       vma = vma_lookup(current->mm, hva);
> > -       if (unlikely(!vma)) {
> > -               kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > -               mmap_read_unlock(current->mm);
> > -               return -EFAULT;
> > +       if (!is_gmem) {
> > +               vma = vma_lookup(current->mm, hva);
> > +               if (unlikely(!vma)) {
> > +                       kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > +                       mmap_read_unlock(current->mm);
> > +                       return -EFAULT;
> > +               }
> > +
> > +               vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > +               mte_allowed = kvm_vma_mte_allowed(vma);
> >         }
> >
> >         if (force_pte)
> > @@ -1602,18 +1633,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                 ipa &= ~(vma_pagesize - 1);
> >         }
> >
> > -       gfn = ipa >> PAGE_SHIFT;
> > -       mte_allowed = kvm_vma_mte_allowed(vma);
> > -
> > -       vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > -
> >         /* Don't use the VMA after the unlock -- it may have vanished */
> >         vma = NULL;
> >
> >         /*
> >          * Read mmu_invalidate_seq so that KVM can detect if the results of
> > -        * vma_lookup() or __kvm_faultin_pfn() become stale prior to
> > -        * acquiring kvm->mmu_lock.
> > +        * vma_lookup() or faultin_pfn() become stale prior to acquiring
> > +        * kvm->mmu_lock.
> >          *
> >          * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> >          * with the smp_wmb() in kvm_mmu_invalidate_end().
> > @@ -1621,8 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >         mmu_seq = vcpu->kvm->mmu_invalidate_seq;
> >         mmap_read_unlock(current->mm);
> >
> > -       pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> > -                               &writable, &page);
> > +       pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
> >         if (pfn == KVM_PFN_ERR_HWPOISON) {
>
> I think we need to take care to handle HWPOISON properly. I know that it is
> (or will most likely be) the case that GUP(hva) --> pfn, but with gmem,
> it *might* not be the case. So the following line isn't right.
>
> I think we need to handle HWPOISON for gmem using memory fault exits instead of
> sending a SIGBUS to userspace. This would be consistent with how KVM/x86
> today handles getting a HWPOISON page back from kvm_gmem_get_pfn(). I'm not
> entirely sure how KVM/x86 is meant to handle HWPOISON on shared gmem pages yet;
> I need to keep reading your series.

You're right. In the next respin (coming soon), Ackerley has added a
patch that performs a best-effort check to ensure that hva matches the
gfn.
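
In the meantime, to make the suggested direction concrete, the gmem leg
of the HWPOISON path could take roughly the shape below. This is only a
sketch, not the actual respin: it assumes kvm_prepare_memory_fault_exit()
is usable as-is from the arm64 abort path with its gpa/size/write/exec/
private arguments, and the final "private" flag is just a placeholder.

	pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		if (is_gmem) {
			/*
			 * There is no hva we can trust to signal on for a
			 * gmem fault; report the poisoned gpa range to
			 * userspace instead, mirroring what x86 does when
			 * kvm_gmem_get_pfn() fails.
			 */
			kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
						      write_fault, exec_fault,
						      false /* placeholder */);
			return -EFAULT;
		}
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}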

> The reorganization diff below leaves this unfixed.
>
> >                 kvm_send_hwpoison_signal(hva, vma_shift);
> >                 return 0;
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index f3af6bff3232..1b2e4e9a7802 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
> >         return gfn_to_memslot(kvm, gfn)->id;
> >  }
> >
> > +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > +{
> > +       return slot->flags & KVM_MEM_READONLY;
> > +}
> > +
> >  static inline gfn_t
> >  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
> >  {
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index c75d8e188eb7..d9bca5ba19dc 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2640,11 +2640,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
> >         return size;
> >  }
> >
> > -static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > -{
> > -       return slot->flags & KVM_MEM_READONLY;
> > -}
> > -
> >  static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
> >                                        gfn_t *nr_pages, bool write)
> >  {
> > --
> > 2.49.0.901.g37484f566f-goog
>
> Thanks, Fuad! Here's the reorganization/rename diff:

Thank you, James. This is very helpful.

Cheers,
/fuad

>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index d1044c7f78bba..c9eb72fe9013b 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1502,7 +1502,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         unsigned long mmu_seq;
>         phys_addr_t ipa = fault_ipa;
>         struct kvm *kvm = vcpu->kvm;
> -       struct vm_area_struct *vma = NULL;
>         short vma_shift;
>         void *memcache;
>         gfn_t gfn = ipa >> PAGE_SHIFT;
> @@ -1510,7 +1509,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         bool logging_active = memslot_is_logging(memslot);
>         bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
>         bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
> -       long vma_pagesize, fault_granule = PAGE_SIZE;
> +       long target_size = PAGE_SIZE, fault_granule = PAGE_SIZE;
>         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>         struct kvm_pgtable *pgt;
>         struct page *page;
> @@ -1547,13 +1546,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                         return ret;
>         }
>
> -       mmap_read_lock(current->mm);
> -
>         /*
>          * Let's check if we will get back a huge page backed by hugetlbfs, or
>          * get block mapping for device MMIO region.
>          */
>         if (!is_gmem) {
> +               struct vm_area_struct *vma = NULL;
> +
> +               mmap_read_lock(current->mm);
> +
>                 vma = vma_lookup(current->mm, hva);
>                 if (unlikely(!vma)) {
>                         kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> @@ -1563,38 +1564,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>
>                 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
>                 mte_allowed = kvm_vma_mte_allowed(vma);
> -       }
> -
> -       if (force_pte)
> -               vma_shift = PAGE_SHIFT;
> -       else
> -               vma_shift = get_vma_page_shift(vma, hva);
> +               vma_shift = force_pte ? PAGE_SHIFT : get_vma_page_shift(vma, hva);
>
> -       switch (vma_shift) {
> +               switch (vma_shift) {
>  #ifndef __PAGETABLE_PMD_FOLDED
> -       case PUD_SHIFT:
> -               if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
> -                       break;
> -               fallthrough;
> +               case PUD_SHIFT:
> +                       if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
> +                               break;
> +                       fallthrough;
>  #endif
> -       case CONT_PMD_SHIFT:
> -               vma_shift = PMD_SHIFT;
> -               fallthrough;
> -       case PMD_SHIFT:
> -               if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
> +               case CONT_PMD_SHIFT:
> +                       vma_shift = PMD_SHIFT;
> +                       fallthrough;
> +               case PMD_SHIFT:
> +                       if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
> +                               break;
> +                       fallthrough;
> +               case CONT_PTE_SHIFT:
> +                       vma_shift = PAGE_SHIFT;
> +                       force_pte = true;
> +                       fallthrough;
> +               case PAGE_SHIFT:
>                         break;
> -               fallthrough;
> -       case CONT_PTE_SHIFT:
> -               vma_shift = PAGE_SHIFT;
> -               force_pte = true;
> -               fallthrough;
> -       case PAGE_SHIFT:
> -               break;
> -       default:
> -               WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
> -       }
> +               default:
> +                       WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
> +               }
>
> -       vma_pagesize = 1UL << vma_shift;
> +               /*
> +                * Read mmu_invalidate_seq so that KVM can detect if the results of
> +                * vma_lookup() or faultin_pfn() become stale prior to acquiring
> +                * kvm->mmu_lock.
> +                *
> +                * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> +                * with the smp_wmb() in kvm_mmu_invalidate_end().
> +                */
> +               mmu_seq = vcpu->kvm->mmu_invalidate_seq;
> +               mmap_read_unlock(current->mm);
> +
> +               target_size = 1UL << vma_shift;
> +       }
>
>         if (nested) {
>                 unsigned long max_map_size;
> @@ -1620,7 +1628,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                         max_map_size = PAGE_SIZE;
>
>                 force_pte = (max_map_size == PAGE_SIZE);
> -               vma_pagesize = min(vma_pagesize, (long)max_map_size);
> +               target_size = min(target_size, (long)max_map_size);
>         }
>
>         /*
> @@ -1628,27 +1636,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>          * ensure we find the right PFN and lay down the mapping in the right
>          * place.
>          */
> -       if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
> -               fault_ipa &= ~(vma_pagesize - 1);
> -               ipa &= ~(vma_pagesize - 1);
> +       if (target_size == PMD_SIZE || target_size == PUD_SIZE) {
> +               fault_ipa &= ~(target_size - 1);
> +               ipa &= ~(target_size - 1);
>         }
>
> -       /* Don't use the VMA after the unlock -- it may have vanished */
> -       vma = NULL;
> -
> -       /*
> -        * Read mmu_invalidate_seq so that KVM can detect if the results of
> -        * vma_lookup() or faultin_pfn() become stale prior to acquiring
> -        * kvm->mmu_lock.
> -        *
> -        * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> -        * with the smp_wmb() in kvm_mmu_invalidate_end().
> -        */
> -       mmu_seq = vcpu->kvm->mmu_invalidate_seq;
> -       mmap_read_unlock(current->mm);
> -
>         pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
>         if (pfn == KVM_PFN_ERR_HWPOISON) {
> +               // TODO: Handle gmem properly. vma_shift
> +               // intentionally left uninitialized.
>                 kvm_send_hwpoison_signal(hva, vma_shift);
>                 return 0;
>         }
> @@ -1658,9 +1654,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         if (kvm_is_device_pfn(pfn)) {
>                 /*
>                  * If the page was identified as device early by looking at
> -                * the VMA flags, vma_pagesize is already representing the
> +                * the VMA flags, target_size is already representing the
>                  * largest quantity we can map.  If instead it was mapped
> -                * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
> +                * via __kvm_faultin_pfn(), target_size is set to PAGE_SIZE
>                  * and must not be upgraded.
>                  *
>                  * In both cases, we don't let transparent_hugepage_adjust()
> @@ -1699,7 +1695,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>
>         kvm_fault_lock(kvm);
>         pgt = vcpu->arch.hw_mmu->pgt;
> -       if (mmu_invalidate_retry(kvm, mmu_seq)) {
> +       if (!is_gmem && mmu_invalidate_retry(kvm, mmu_seq)) {
>                 ret = -EAGAIN;
>                 goto out_unlock;
>         }
> @@ -1708,16 +1704,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>          * If we are not forced to use page mapping, check if we are
>          * backed by a THP and thus use block mapping if possible.
>          */
> -       if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
> +       if (target_size == PAGE_SIZE && !(force_pte || device)) {
>                 if (fault_is_perm && fault_granule > PAGE_SIZE)
> -                       vma_pagesize = fault_granule;
> -               else
> -                       vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
> +                       target_size = fault_granule;
> +               else if (!is_gmem)
> +                       target_size = transparent_hugepage_adjust(kvm, memslot,
>                                                                    hva, &pfn,
>                                                                    &fault_ipa);
>
> -               if (vma_pagesize < 0) {
> -                       ret = vma_pagesize;
> +               if (target_size < 0) {
> +                       ret = target_size;
>                         goto out_unlock;
>                 }
>         }
> @@ -1725,7 +1721,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
>                 /* Check the VMM hasn't introduced a new disallowed VMA */
>                 if (mte_allowed) {
> -                       sanitise_mte_tags(kvm, pfn, vma_pagesize);
> +                       sanitise_mte_tags(kvm, pfn, target_size);
>                 } else {
>                         ret = -EFAULT;
>                         goto out_unlock;
> @@ -1750,10 +1746,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>
>         /*
>          * Under the premise of getting a FSC_PERM fault, we just need to relax
> -        * permissions only if vma_pagesize equals fault_granule. Otherwise,
> +        * permissions only if target_size equals fault_granule. Otherwise,
>          * kvm_pgtable_stage2_map() should be called to change block size.
>          */
> -       if (fault_is_perm && vma_pagesize == fault_granule) {
> +       if (fault_is_perm && target_size == fault_granule) {
>                 /*
>                  * Drop the SW bits in favour of those stored in the
>                  * PTE, which will be preserved.
> @@ -1761,7 +1757,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                 prot &= ~KVM_NV_GUEST_MAP_SZ;
>                 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
>         } else {
> -               ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
> +               ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, target_size,
>                                              __pfn_to_phys(pfn), prot,
>                                              memcache, flags);
>         }
