On Wed, Jun 11, 2025 at 09:17:36AM -0700, Sean Christopherson wrote: > On Wed, Jun 11, 2025, Andrew Jones wrote: > > On Wed, Jun 11, 2025 at 05:51:40PM +0800, zhouquan@xxxxxxxxxxx wrote: > > > From: Quan Zhou <zhouquan@xxxxxxxxxxx> > > > > > > The caller has already passed in the memslot, and there are > > > two instances `{kvm_faultin_pfn/mark_page_dirty}` of retrieving > > > the memslot again in `kvm_riscv_gstage_map`, we can replace them > > > with `{__kvm_faultin_pfn/mark_page_dirty_in_slot}`. > > > > > > Signed-off-by: Quan Zhou <zhouquan@xxxxxxxxxxx> > > > --- > > > arch/riscv/kvm/mmu.c | 5 +++-- > > > 1 file changed, 3 insertions(+), 2 deletions(-) > > > > > > diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c > > > index 1087ea74567b..f9059dac3ba3 100644 > > > --- a/arch/riscv/kvm/mmu.c > > > +++ b/arch/riscv/kvm/mmu.c > > > @@ -648,7 +648,8 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, > > > return -EFAULT; > > > } > > > > > > - hfn = kvm_faultin_pfn(vcpu, gfn, is_write, &writable, &page); > > > + hfn = __kvm_faultin_pfn(memslot, gfn, is_write ? FOLL_WRITE : 0, > > > + &writable, &page); > > > > I think introducing another function with the following diff would be > > better than duplicating the is_write to foll translation. > > NAK, I don't want an explosion of wrapper APIs (especially with boolean params). > > I 100% agree that it's mildly annoying to force arch code to do convert "write" > to FOLL_WRITE, but that's a symptom of KVM not providing a common structure for > passing page fault information. > > What I want to get to is a set of APIs that look something the below (very off > the cuff), not add more wrappers and put KVM back in a situation where there are > a bajillion ways to do the same basic thing. > > struct kvm_page_fault { > const gpa_t addr; > const bool exec; > const bool write; > const bool present; > > gfn_t gfn; > > /* The memslot containing gfn. May be NULL. */ > struct kvm_memory_slot *slot; > > /* Outputs */ > unsigned long mmu_seq; > kvm_pfn_t pfn; > struct page *refcounted_page; > bool map_writable; > }; > > kvm_pfn_t __kvm_faultin_pfn(struct kvm_page_fault *fault, unsigned int flags) > { > struct kvm_follow_pfn kfp = { > .slot = fault->slot, > .gfn = fault->gfn, > .flags = flags | fault->write ? FOLL_WRITE : 0, > .map_writable = &fault->writable, > .refcounted_page = &fault->refcounted_page, > }; > > fault->writable = false; > fault->refcounted_page = NULL; > > return kvm_follow_pfn(&kfp); > } > EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); > > kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool write, > bool *writable, struct page **refcounted_page) > { > struct kvm_follow_pfn kfp = { > .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),, > .gfn = gfn, > .flags = write ? FOLL_WRITE : 0, > .map_writable = writable, > .refcounted_page = refcounted_page, > }; > > if (WARN_ON_ONCE(!writable || !refcounted_page)) > return KVM_PFN_ERR_FAULT; > > *writable = false; > *refcounted_page = NULL; > > return kvm_follow_pfn(&kfp); > } > EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); > > > To get things started, I proposed moving "struct kvm_page_fault" to common code > so that it can be shared by x86 and arm64 as part of the KVM userfault series. > But I'd be more than happy to acclerate the standardization of "struct kvm_page_fault" > if we want to get there sooner than later. > > [*] https://lore.kernel.org/all/aBqlkz1bqhu-9toV@xxxxxxxxxx > > In the meantime, RISC-V can start preparing for that future, and clean up its > code in the process. > > E.g. "fault_addr" should be "gpa_t", not "unsigned long". If 32-bit RISC-V > is strictly limited to 32-bit _physical_ addresses in the *architecture*, then > gpa_t should probably be tweaked accordingly. 32-bit riscv supports 34-bit physical addresses, so fault_addr should indeed be gpa_t. > > And vma_pageshift should be "unsigned int", not "short". Yes, particularly because huge_page_shift() returns unsigned int which may be used to assign vma_pageshift. > > Looks like y'all also have a bug where an -EEXIST will be returned to userspace, > and will generate what's probably a spurious kvm_err() message. On 32-bit riscv, due to losing the upper bits of the physical address? Or is there yet another thing to fix? > > E.g. in the short term: The diff looks good to me, should I test and post it for you? Thanks, drew > > --- > arch/riscv/include/asm/kvm_host.h | 5 ++-- > arch/riscv/kvm/mmu.c | 49 +++++++++++++++++++++---------- > arch/riscv/kvm/vcpu_exit.c | 40 +------------------------ > 3 files changed, 36 insertions(+), 58 deletions(-) > > diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h > index 85cfebc32e4c..84c5db715ba5 100644 > --- a/arch/riscv/include/asm/kvm_host.h > +++ b/arch/riscv/include/asm/kvm_host.h > @@ -361,9 +361,8 @@ int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, > bool writable, bool in_atomic); > void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, > unsigned long size); > -int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, > - struct kvm_memory_slot *memslot, > - gpa_t gpa, unsigned long hva, bool is_write); > +int kvm_riscv_gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, > + struct kvm_cpu_trap *trap); > int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm); > void kvm_riscv_gstage_free_pgd(struct kvm *kvm); > void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu); > diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c > index 1087ea74567b..3b0afc1c0832 100644 > --- a/arch/riscv/kvm/mmu.c > +++ b/arch/riscv/kvm/mmu.c > @@ -586,22 +586,37 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) > return pte_young(ptep_get(ptep)); > } > > -int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, > - struct kvm_memory_slot *memslot, > - gpa_t gpa, unsigned long hva, bool is_write) > +int kvm_riscv_gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, > + struct kvm_cpu_trap *trap) > { > - int ret; > - kvm_pfn_t hfn; > - bool writable; > - short vma_pageshift; > + > + struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache; > + gpa_t gpa = (trap->htval << 2) | (trap->stval & 0x3); > gfn_t gfn = gpa >> PAGE_SHIFT; > - struct vm_area_struct *vma; > + struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); > + bool logging = memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); > + bool write = trap->scause == EXC_STORE_GUEST_PAGE_FAULT; > + bool read = trap->scause == EXC_LOAD_GUEST_PAGE_FAULT; > + unsigned int flags = write ? FOLL_WRITE : 0; > + unsigned long hva, vma_pagesize, mmu_seq; > struct kvm *kvm = vcpu->kvm; > - struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache; > - bool logging = (memslot->dirty_bitmap && > - !(memslot->flags & KVM_MEM_READONLY)) ? true : false; > - unsigned long vma_pagesize, mmu_seq; > + unsigned int vma_pageshift; > + struct vm_area_struct *vma; > struct page *page; > + kvm_pfn_t hfn; > + bool writable; > + int ret; > + > + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); > + if (kvm_is_error_hva(hva) || (write && !writable)) { > + if (read) > + return kvm_riscv_vcpu_mmio_load(vcpu, run, gpa, > + trap->htinst); > + if (write) > + return kvm_riscv_vcpu_mmio_store(vcpu, run, gpa, > + trap->htinst); > + return -EOPNOTSUPP; > + } > > /* We need minimum second+third level pages */ > ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels); > @@ -648,7 +663,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, > return -EFAULT; > } > > - hfn = kvm_faultin_pfn(vcpu, gfn, is_write, &writable, &page); > + hfn = __kvm_faultin_pfn(memslot, gfn, flags, &writable, &page); > if (hfn == KVM_PFN_ERR_HWPOISON) { > send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, > vma_pageshift, current); > @@ -661,7 +676,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, > * If logging is active then we allow writable pages only > * for write faults. > */ > - if (logging && !is_write) > + if (logging && !write) > writable = false; > > spin_lock(&kvm->mmu_lock); > @@ -677,14 +692,16 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, > ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, > vma_pagesize, true, true); > } > + if (ret == -EEXIST) > + ret = 0; > > if (ret) > kvm_err("Failed to map in G-stage\n"); > > out_unlock: > - kvm_release_faultin_page(kvm, page, ret && ret != -EEXIST, writable); > + kvm_release_faultin_page(kvm, page, ret, writable); > spin_unlock(&kvm->mmu_lock); > - return ret; > + return ret ? ret : 1; > } > > int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm) > diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c > index 6e0c18412795..6f07077068f6 100644 > --- a/arch/riscv/kvm/vcpu_exit.c > +++ b/arch/riscv/kvm/vcpu_exit.c > @@ -10,44 +10,6 @@ > #include <asm/csr.h> > #include <asm/insn-def.h> > > -static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, > - struct kvm_cpu_trap *trap) > -{ > - struct kvm_memory_slot *memslot; > - unsigned long hva, fault_addr; > - bool writable; > - gfn_t gfn; > - int ret; > - > - fault_addr = (trap->htval << 2) | (trap->stval & 0x3); > - gfn = fault_addr >> PAGE_SHIFT; > - memslot = gfn_to_memslot(vcpu->kvm, gfn); > - hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); > - > - if (kvm_is_error_hva(hva) || > - (trap->scause == EXC_STORE_GUEST_PAGE_FAULT && !writable)) { > - switch (trap->scause) { > - case EXC_LOAD_GUEST_PAGE_FAULT: > - return kvm_riscv_vcpu_mmio_load(vcpu, run, > - fault_addr, > - trap->htinst); > - case EXC_STORE_GUEST_PAGE_FAULT: > - return kvm_riscv_vcpu_mmio_store(vcpu, run, > - fault_addr, > - trap->htinst); > - default: > - return -EOPNOTSUPP; > - }; > - } > - > - ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva, > - (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false); > - if (ret < 0) > - return ret; > - > - return 1; > -} > - > /** > * kvm_riscv_vcpu_unpriv_read -- Read machine word from Guest memory > * > @@ -229,7 +191,7 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, > case EXC_LOAD_GUEST_PAGE_FAULT: > case EXC_STORE_GUEST_PAGE_FAULT: > if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) > - ret = gstage_page_fault(vcpu, run, trap); > + ret = kvm_riscv_gstage_page_fault(vcpu, run, trap); > break; > case EXC_SUPERVISOR_SYSCALL: > if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) > > base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494 > --