On Wed, Apr 30, 2025 at 9:57 AM Fuad Tabba <tabba@xxxxxxxxxx> wrote:
>
> Add arm64 support for handling guest page faults on guest_memfd
> backed memslots.
>
> For now, the fault granule is restricted to PAGE_SIZE.
>
> Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> ---
>  arch/arm64/kvm/mmu.c     | 65 +++++++++++++++++++++++++++-------------
>  include/linux/kvm_host.h |  5 ++++
>  virt/kvm/kvm_main.c      |  5 ----
>  3 files changed, 50 insertions(+), 25 deletions(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 148a97c129de..d1044c7f78bb 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1466,6 +1466,30 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>         return vma->vm_flags & VM_MTE_ALLOWED;
>  }
>
> +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> +                            gfn_t gfn, bool write_fault, bool *writable,
> +                            struct page **page, bool is_gmem)
> +{
> +        kvm_pfn_t pfn;
> +        int ret;
> +
> +        if (!is_gmem)
> +                return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
> +
> +        *writable = false;
> +
> +        ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
> +        if (!ret) {
> +                *writable = !memslot_is_readonly(slot);
> +                return pfn;
> +        }
> +
> +        if (ret == -EHWPOISON)
> +                return KVM_PFN_ERR_HWPOISON;
> +
> +        return KVM_PFN_ERR_NOSLOT_MASK;
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                            struct kvm_s2_trans *nested,
>                            struct kvm_memory_slot *memslot, unsigned long hva,
> @@ -1473,19 +1497,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  {
>          int ret = 0;
>          bool write_fault, writable;
> -        bool exec_fault, mte_allowed;
> +        bool exec_fault, mte_allowed = false;
>          bool device = false, vfio_allow_any_uc = false;
>          unsigned long mmu_seq;
>          phys_addr_t ipa = fault_ipa;
>          struct kvm *kvm = vcpu->kvm;
> -        struct vm_area_struct *vma;
> +        struct vm_area_struct *vma = NULL;
>          short vma_shift;
>          void *memcache;
> -        gfn_t gfn;
> +        gfn_t gfn = ipa >> PAGE_SHIFT;
>          kvm_pfn_t pfn;
>          bool logging_active = memslot_is_logging(memslot);
> -        bool force_pte = logging_active || is_protected_kvm_enabled();
> -        long vma_pagesize, fault_granule;
> +        bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
> +        bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
> +        long vma_pagesize, fault_granule = PAGE_SIZE;
>          enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>          struct kvm_pgtable *pgt;
>          struct page *page;
> @@ -1522,16 +1547,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                  return ret;
>          }
>
> +        mmap_read_lock(current->mm);

We don't have to take the mmap_lock for gmem faults, right?

I think we should reorganize user_mem_abort() a bit (and I think
vma_pagesize and maybe vma_shift should be renamed) given the changes
we're making here. Below is a diff that I think might be a little
cleaner. Let me know what you think.

> +
>          /*
>           * Let's check if we will get back a huge page backed by hugetlbfs, or
>           * get block mapping for device MMIO region.
>           */
> -        mmap_read_lock(current->mm);
> -        vma = vma_lookup(current->mm, hva);
> -        if (unlikely(!vma)) {
> -                kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> -                mmap_read_unlock(current->mm);
> -                return -EFAULT;
> +        if (!is_gmem) {
> +                vma = vma_lookup(current->mm, hva);
> +                if (unlikely(!vma)) {
> +                        kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> +                        mmap_read_unlock(current->mm);
> +                        return -EFAULT;
> +                }
> +
> +                vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> +                mte_allowed = kvm_vma_mte_allowed(vma);
>          }
>
>          if (force_pte)
> @@ -1602,18 +1633,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                  ipa &= ~(vma_pagesize - 1);
>          }
>
> -        gfn = ipa >> PAGE_SHIFT;
> -        mte_allowed = kvm_vma_mte_allowed(vma);
> -
> -        vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> -
>          /* Don't use the VMA after the unlock -- it may have vanished */
>          vma = NULL;
>
>          /*
>           * Read mmu_invalidate_seq so that KVM can detect if the results of
> -         * vma_lookup() or __kvm_faultin_pfn() become stale prior to
> -         * acquiring kvm->mmu_lock.
> +         * vma_lookup() or faultin_pfn() become stale prior to acquiring
> +         * kvm->mmu_lock.
>           *
>           * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
>           * with the smp_wmb() in kvm_mmu_invalidate_end().
> @@ -1621,8 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>          mmu_seq = vcpu->kvm->mmu_invalidate_seq;
>          mmap_read_unlock(current->mm);
>
> -        pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> -                                &writable, &page);
> +        pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
>          if (pfn == KVM_PFN_ERR_HWPOISON) {

I think we need to take care to handle HWPOISON properly. I know that
GUP(hva) --> pfn is (or will most likely be) the case, but with gmem
it *might* not be. So the following line isn't right.

I think we need to handle HWPOISON for gmem using memory fault exits
instead of sending a SIGBUS to userspace. This would be consistent
with how KVM/x86 today handles getting a HWPOISON page back from
kvm_gmem_get_pfn(). I'm not entirely sure how KVM/x86 is meant to
handle HWPOISON on shared gmem pages yet; I need to keep reading your
series.

The reorganization diff below leaves this unfixed.

>                  kvm_send_hwpoison_signal(hva, vma_shift);
>                  return 0;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f3af6bff3232..1b2e4e9a7802 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
>          return gfn_to_memslot(kvm, gfn)->id;
>  }
>
> +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> +{
> +        return slot->flags & KVM_MEM_READONLY;
> +}
> +
>  static inline gfn_t
>  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
>  {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index c75d8e188eb7..d9bca5ba19dc 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2640,11 +2640,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
>          return size;
>  }
>
> -static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> -{
> -        return slot->flags & KVM_MEM_READONLY;
> -}
> -
>  static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
>                                         gfn_t *nr_pages, bool write)
>  {
> --
> 2.49.0.901.g37484f566f-goog

Thanks, Fuad!
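To be a bit more concrete about the HWPOISON point above, the shape I
have in mind for the gmem case is something like the following. This
is an untested sketch only: it assumes the generic
kvm_prepare_memory_fault_exit() helper is the right thing to reuse on
arm64 (the way KVM/x86 uses it when kvm_gmem_get_pfn() fails), and
that reporting the fault as non-private is correct for this series.

        if (pfn == KVM_PFN_ERR_HWPOISON) {
                if (is_gmem) {
                        /*
                         * There may be no meaningful hva to send SIGBUS on;
                         * report the poisoned gpa to userspace instead.
                         */
                        kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
                                                      write_fault, exec_fault,
                                                      false);
                        return -EFAULT;
                }
                kvm_send_hwpoison_signal(hva, vma_shift);
                return 0;
        }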
Here's the reorganization/rename diff:

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d1044c7f78bba..c9eb72fe9013b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1502,7 +1502,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         unsigned long mmu_seq;
         phys_addr_t ipa = fault_ipa;
         struct kvm *kvm = vcpu->kvm;
-        struct vm_area_struct *vma = NULL;
         short vma_shift;
         void *memcache;
         gfn_t gfn = ipa >> PAGE_SHIFT;
@@ -1510,7 +1509,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         bool logging_active = memslot_is_logging(memslot);
         bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
         bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
-        long vma_pagesize, fault_granule = PAGE_SIZE;
+        long target_size = PAGE_SIZE, fault_granule = PAGE_SIZE;
         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
         struct kvm_pgtable *pgt;
         struct page *page;
@@ -1547,13 +1546,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                 return ret;
         }
 
-        mmap_read_lock(current->mm);
-
         /*
          * Let's check if we will get back a huge page backed by hugetlbfs, or
          * get block mapping for device MMIO region.
          */
         if (!is_gmem) {
+                struct vm_area_struct *vma = NULL;
+
+                mmap_read_lock(current->mm);
+
                 vma = vma_lookup(current->mm, hva);
                 if (unlikely(!vma)) {
                         kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
@@ -1563,38 +1564,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
                 mte_allowed = kvm_vma_mte_allowed(vma);
-        }
-
-        if (force_pte)
-                vma_shift = PAGE_SHIFT;
-        else
-                vma_shift = get_vma_page_shift(vma, hva);
+                vma_shift = force_pte ? PAGE_SHIFT : get_vma_page_shift(vma, hva);
 
-        switch (vma_shift) {
+                switch (vma_shift) {
 #ifndef __PAGETABLE_PMD_FOLDED
-        case PUD_SHIFT:
-                if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
-                        break;
-                fallthrough;
+                case PUD_SHIFT:
+                        if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+                                break;
+                        fallthrough;
 #endif
-        case CONT_PMD_SHIFT:
-                vma_shift = PMD_SHIFT;
-                fallthrough;
-        case PMD_SHIFT:
-                if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+                case CONT_PMD_SHIFT:
+                        vma_shift = PMD_SHIFT;
+                        fallthrough;
+                case PMD_SHIFT:
+                        if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+                                break;
+                        fallthrough;
+                case CONT_PTE_SHIFT:
+                        vma_shift = PAGE_SHIFT;
+                        force_pte = true;
+                        fallthrough;
+                case PAGE_SHIFT:
                         break;
-                fallthrough;
-        case CONT_PTE_SHIFT:
-                vma_shift = PAGE_SHIFT;
-                force_pte = true;
-                fallthrough;
-        case PAGE_SHIFT:
-                break;
-        default:
-                WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
-        }
+                default:
+                        WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
+                }
 
-        vma_pagesize = 1UL << vma_shift;
+                /*
+                 * Read mmu_invalidate_seq so that KVM can detect if the results of
+                 * vma_lookup() or faultin_pfn() become stale prior to acquiring
+                 * kvm->mmu_lock.
+                 *
+                 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+                 * with the smp_wmb() in kvm_mmu_invalidate_end().
+                 */
+                mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+                mmap_read_unlock(current->mm);
+
+                target_size = 1UL << vma_shift;
+        }
 
         if (nested) {
                 unsigned long max_map_size;
@@ -1620,7 +1628,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                         max_map_size = PAGE_SIZE;
 
                 force_pte = (max_map_size == PAGE_SIZE);
-                vma_pagesize = min(vma_pagesize, (long)max_map_size);
+                target_size = min(target_size, (long)max_map_size);
         }
 
         /*
@@ -1628,27 +1636,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
          * ensure we find the right PFN and lay down the mapping in the right
          * place.
          */
-        if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
-                fault_ipa &= ~(vma_pagesize - 1);
-                ipa &= ~(vma_pagesize - 1);
+        if (target_size == PMD_SIZE || target_size == PUD_SIZE) {
+                fault_ipa &= ~(target_size - 1);
+                ipa &= ~(target_size - 1);
         }
 
-        /* Don't use the VMA after the unlock -- it may have vanished */
-        vma = NULL;
-
-        /*
-         * Read mmu_invalidate_seq so that KVM can detect if the results of
-         * vma_lookup() or faultin_pfn() become stale prior to acquiring
-         * kvm->mmu_lock.
-         *
-         * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
-         * with the smp_wmb() in kvm_mmu_invalidate_end().
-         */
-        mmu_seq = vcpu->kvm->mmu_invalidate_seq;
-        mmap_read_unlock(current->mm);
-
         pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
         if (pfn == KVM_PFN_ERR_HWPOISON) {
+                // TODO: Handle gmem properly. vma_shift
+                // intentionally left uninitialized.
                 kvm_send_hwpoison_signal(hva, vma_shift);
                 return 0;
         }
@@ -1658,9 +1654,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         if (kvm_is_device_pfn(pfn)) {
                 /*
                  * If the page was identified as device early by looking at
-                 * the VMA flags, vma_pagesize is already representing the
+                 * the VMA flags, target_size is already representing the
                  * largest quantity we can map. If instead it was mapped
-                 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
+                 * via __kvm_faultin_pfn(), target_size is set to PAGE_SIZE
                  * and must not be upgraded.
                  *
                  * In both cases, we don't let transparent_hugepage_adjust()
@@ -1699,7 +1695,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         kvm_fault_lock(kvm);
         pgt = vcpu->arch.hw_mmu->pgt;
-        if (mmu_invalidate_retry(kvm, mmu_seq)) {
+        if (!is_gmem && mmu_invalidate_retry(kvm, mmu_seq)) {
                 ret = -EAGAIN;
                 goto out_unlock;
         }
 
@@ -1708,16 +1704,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
          * If we are not forced to use page mapping, check if we are
          * backed by a THP and thus use block mapping if possible.
          */
-        if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
+        if (target_size == PAGE_SIZE && !(force_pte || device)) {
                 if (fault_is_perm && fault_granule > PAGE_SIZE)
-                        vma_pagesize = fault_granule;
-                else
-                        vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
+                        target_size = fault_granule;
+                else if (!is_gmem)
+                        target_size = transparent_hugepage_adjust(kvm, memslot,
                                                                    hva, &pfn, &fault_ipa);
 
-                if (vma_pagesize < 0) {
-                        ret = vma_pagesize;
+                if (target_size < 0) {
+                        ret = target_size;
                         goto out_unlock;
                 }
         }
@@ -1725,7 +1721,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
                 /* Check the VMM hasn't introduced a new disallowed VMA */
                 if (mte_allowed) {
-                        sanitise_mte_tags(kvm, pfn, vma_pagesize);
+                        sanitise_mte_tags(kvm, pfn, target_size);
                 } else {
                         ret = -EFAULT;
                         goto out_unlock;
                 }
@@ -1750,10 +1746,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         /*
          * Under the premise of getting a FSC_PERM fault, we just need to relax
-         * permissions only if vma_pagesize equals fault_granule. Otherwise,
+         * permissions only if target_size equals fault_granule. Otherwise,
          * kvm_pgtable_stage2_map() should be called to change block size.
          */
-        if (fault_is_perm && vma_pagesize == fault_granule) {
+        if (fault_is_perm && target_size == fault_granule) {
                 /*
                  * Drop the SW bits in favour of those stored in the
                  * PTE, which will be preserved.
@@ -1761,7 +1757,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                 prot &= ~KVM_NV_GUEST_MAP_SZ;
                 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
         } else {
-                ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
+                ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, target_size,
                                              __pfn_to_phys(pfn), prot, memcache, flags);
         }
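To spell out what the reorg is going for, the gmem path through
user_mem_abort() would boil down to roughly the following. This is a
condensed sketch of the diff above, not a literal patch, and the
HWPOISON question is still the open item:

        /* gmem-backed slots never touch the VMA or the mmap_lock. */
        if (!is_gmem) {
                mmap_read_lock(current->mm);
                /* vma_lookup(), vma_shift/target_size selection, ... */
                mmu_seq = vcpu->kvm->mmu_invalidate_seq;
                mmap_read_unlock(current->mm);
                target_size = 1UL << vma_shift;
        }
        /*
         * else: target_size stays PAGE_SIZE, and mte_allowed and
         * vfio_allow_any_uc keep their false defaults.
         */

        pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);

        kvm_fault_lock(kvm);
        /* mmu_seq was never read on the gmem path, so don't check it there. */
        if (!is_gmem && mmu_invalidate_retry(kvm, mmu_seq)) {
                ret = -EAGAIN;
                goto out_unlock;
        }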