On Wed, 21 May 2025 16:55:18 +0200
Nina Schoetterl-Glausch <nsg@xxxxxxxxxxxxx> wrote:

> On Wed, 2025-05-14 at 18:38 +0200, Claudio Imbrenda wrote:
> > Refactor some gmap functions; move the implementation into a separate
> > file with only helper functions. The new helper functions work on vm
> > addresses, leaving all gmap logic in the gmap functions, which mostly
> > become just wrappers.
> > 
> > The whole gmap handling is going to be moved inside KVM soon, but the
> > helper functions need to touch core mm functions, and thus need to
> > stay in the core of kernel.
> > 
> > Signed-off-by: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
> > ---
> >  MAINTAINERS                          |   2 +
> >  arch/s390/include/asm/gmap_helpers.h |  18 ++
> >  arch/s390/kvm/diag.c                 |  11 +-
> >  arch/s390/kvm/kvm-s390.c             |   3 +-
> >  arch/s390/mm/Makefile                |   2 +
> >  arch/s390/mm/gmap.c                  |  46 ++---
> >  arch/s390/mm/gmap_helpers.c          | 266 +++++++++++++++++++++++++++
> >  7 files changed, 307 insertions(+), 41 deletions(-)
> >  create mode 100644 arch/s390/include/asm/gmap_helpers.h
> >  create mode 100644 arch/s390/mm/gmap_helpers.c
> > 
> 
> [...]
> 
> > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> > index 4869555ff403..17a2a57de3a2 100644
> > --- a/arch/s390/mm/gmap.c
> > +++ b/arch/s390/mm/gmap.c
> 
> [...]
> 
> >  void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
> >  {
> > -        unsigned long gaddr, vmaddr, size;
> > -        struct vm_area_struct *vma;
> > +        unsigned long vmaddr, next, start, end;
> >  
> >          mmap_read_lock(gmap->mm);
> > -        for (gaddr = from; gaddr < to;
> > -             gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
> > -                /* Find the vm address for the guest address */
> > -                vmaddr = (unsigned long)
> > -                        radix_tree_lookup(&gmap->guest_to_host,
> > -                                          gaddr >> PMD_SHIFT);
> > +        for ( ; from < to; from = next) {
> > +                next = ALIGN(from + 1, PMD_SIZE);
> 
> I think you can use pmd_addr_end(from, to) here...

I guess
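
For reference, the generic fallback definition in include/linux/pgtable.h
is roughly:

        #define pmd_addr_end(addr, end)                                         \
        ({      unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;      \
                (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
        })

i.e. the next PMD boundary after addr, capped at end.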
> 
> > +                vmaddr = (unsigned long)radix_tree_lookup(&gmap->guest_to_host, from >> PMD_SHIFT);
> >                  if (!vmaddr)
> >                          continue;
> > -                vmaddr |= gaddr & ~PMD_MASK;
> > -                /* Find vma in the parent mm */
> > -                vma = find_vma(gmap->mm, vmaddr);
> > -                if (!vma)
> > -                        continue;
> > -                /*
> > -                 * We do not discard pages that are backed by
> > -                 * hugetlbfs, so we don't have to refault them.
> > -                 */
> > -                if (is_vm_hugetlb_page(vma))
> > -                        continue;
> > -                size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
> > -                zap_page_range_single(vma, vmaddr, size, NULL);
> > +                start = vmaddr | (from & ~PMD_MASK);
> > +                end = (vmaddr | (min(to - 1, next - 1) & ~PMD_MASK)) + 1;
> 
> ... then simply do:
> end = vmaddr + (next - from);

looks cleaner, yes

> 
> > +                __gmap_helper_discard(gmap->mm, start, end);
> >          }
> >          mmap_read_unlock(gmap->mm);
> >  }
> > diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
> > new file mode 100644
> > index 000000000000..8e66586718f6
> > --- /dev/null
> > +++ b/arch/s390/mm/gmap_helpers.c
> > @@ -0,0 +1,266 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Helper functions for KVM guest address space mapping code
> > + *
> > + * Copyright IBM Corp. 2007, 2025
> > + * Author(s): Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
> > + *            Martin Schwidefsky <schwidefsky@xxxxxxxxxx>
> > + *            David Hildenbrand <david@xxxxxxxxxx>
> > + *            Janosch Frank <frankja@xxxxxxxxxxxxxxxxxx>
> > + */
> > +#include <linux/mm_types.h>
> > +#include <linux/mmap_lock.h>
> > +#include <linux/mm.h>
> > +#include <linux/hugetlb.h>
> > +#include <linux/pagewalk.h>
> > +#include <linux/ksm.h>
> > +#include <asm/gmap_helpers.h>
> 
> Please add documentation to all these functions for those of us that
> don't live and breath mm code :)

eh... yes I think it's a good idea
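
Something like this maybe, just as a sketch of what a kernel-doc comment
for gmap_helper_discard() could look like:

        /**
         * gmap_helper_discard() - discard user pages in the given range
         * @mm: the mm_struct of the process whose pages get discarded
         * @vmaddr: starting userspace address of the range
         * @end: end (exclusive) of the range
         *
         * All pages in the given range are discarded; pages backed by
         * hugetlbfs are skipped. Takes and releases the mmap read lock.
         */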
> 
> > +
> > +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
> > +{
> > +        if (!non_swap_entry(entry))
> > +                dec_mm_counter(mm, MM_SWAPENTS);
> > +        else if (is_migration_entry(entry))
> > +                dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
> > +        free_swap_and_cache(entry);
> > +}
> > +
> > +void __gmap_helper_zap_one(struct mm_struct *mm, unsigned long vmaddr)
> 
> __gmap_helper_zap_mapping_pte ?

but I'm not taking a pte as parameter

> 
> > +{
> > +        struct vm_area_struct *vma;
> > +        spinlock_t *ptl;
> > +        pte_t *ptep;
> > +
> > +        mmap_assert_locked(mm);
> > +
> > +        /* Find the vm address for the guest address */
> > +        vma = vma_lookup(mm, vmaddr);
> > +        if (!vma || is_vm_hugetlb_page(vma))
> > +                return;
> > +
> > +        /* Get pointer to the page table entry */
> > +        ptep = get_locked_pte(mm, vmaddr, &ptl);
> > +        if (!likely(ptep))
> 
> if (unlikely(!ptep)) reads nicer to me.

ok

> 
> > +                return;
> > +        if (pte_swap(*ptep))
> > +                ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
> > +        pte_unmap_unlock(ptep, ptl);
> > +}
> > +EXPORT_SYMBOL_GPL(__gmap_helper_zap_one);
> 
> Looks reasonable, but I'm not well versed enough in mm code to evaluate
> that with confidence.
> 
> > +
> > +void __gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
> 
> Maybe call this gmap_helper_discard_nolock or something.

maybe __gmap_helper_discard_unlocked? the __ prefix often implies lack
of locking

> 
> > +{
> > +        struct vm_area_struct *vma;
> > +        unsigned long next;
> > +
> > +        mmap_assert_locked(mm);
> > +
> > +        while (vmaddr < end) {
> > +                vma = find_vma_intersection(mm, vmaddr, end);
> > +                if (!vma)
> > +                        break;
> > +                vmaddr = max(vmaddr, vma->vm_start);
> > +                next = min(end, vma->vm_end);
> > +                if (!is_vm_hugetlb_page(vma))
> > +                        zap_page_range_single(vma, vmaddr, next - vmaddr, NULL);
> > +                vmaddr = next;
> > +        }
> > +}
> > +
> > +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
> > +{
> > +        mmap_read_lock(mm);
> > +        __gmap_helper_discard(mm, vmaddr, end);
> > +        mmap_read_unlock(mm);
> > +}
> > +EXPORT_SYMBOL_GPL(gmap_helper_discard);
> > +
> > +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
> > +{
> > +        struct vm_area_struct *vma;
> > +        pgd_t *pgd;
> > +        p4d_t *p4d;
> > +        pud_t *pud;
> > +
> > +        /* We need a valid VMA, otherwise this is clearly a fault. */
> > +        vma = vma_lookup(mm, addr);
> > +        if (!vma)
> > +                return -EFAULT;
> > +
> > +        pgd = pgd_offset(mm, addr);
> > +        if (!pgd_present(*pgd))
> > +                return -ENOENT;
> 
> What about pgd_bad?

I don't know, this code is copied verbatim from mm/gmap.c

> 
> > +
> > +        p4d = p4d_offset(pgd, addr);
> > +        if (!p4d_present(*p4d))
> > +                return -ENOENT;
> > +
> > +        pud = pud_offset(p4d, addr);
> > +        if (!pud_present(*pud))
> > +                return -ENOENT;
> > +
> > +        /* Large PUDs are not supported yet. */
> > +        if (pud_leaf(*pud))
> > +                return -EFAULT;
> > +
> > +        *pmdp = pmd_offset(pud, addr);
> > +        return 0;
> > +}
> > +
> > +void __gmap_helper_set_unused(struct mm_struct *mm, unsigned long vmaddr)
> 
> What is this function for? Why do you introduce it now?

this is for cmma handling, I'm introducing it now because it needs to
be in this file and I would like to put all the stuff in at once. I
will not need to touch this file anymore if I get this in now.

> 
> > +{
> > +        spinlock_t *ptl;
> > +        pmd_t *pmdp;
> > +        pte_t *ptep;
> > +
> > +        mmap_assert_locked(mm);
> > +
> > +        if (pmd_lookup(mm, vmaddr, &pmdp))
> > +                return;
> > +        ptl = pmd_lock(mm, pmdp);
> > +        if (!pmd_present(*pmdp) || pmd_leaf(*pmdp)) {
> > +                spin_unlock(ptl);
> > +                return;
> > +        }
> > +        spin_unlock(ptl);
> > +
> > +        ptep = pte_offset_map_lock(mm, pmdp, vmaddr, &ptl);
> > +        if (!ptep)
> > +                return;
> > +        /* The last byte of a pte can be changed freely without ipte */
> > +        __atomic64_or(_PAGE_UNUSED, (long *)ptep);
> > +        pte_unmap_unlock(ptep, ptl);
> > +}
> > +EXPORT_SYMBOL_GPL(__gmap_helper_set_unused);
> 
> The stuff below is from arch/s390/mm/gmap.c right?
> Are you going to delete it from there?

not in this series, but the next series will remove mm/gmap.c
altogether

> 
> > +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
> > +                                   unsigned long end, struct mm_walk *walk)
> > +{
> 
> [...]
> 
> > +}
> > +
> > +static const struct mm_walk_ops find_zeropage_ops = {
> > +        .pte_entry      = find_zeropage_pte_entry,
> > +        .walk_lock      = PGWALK_WRLOCK,
> > +};
> > +
> > +/*
> > + * Unshare all shared zeropages, replacing them by anonymous pages. Note that
> > + * we cannot simply zap all shared zeropages, because this could later
> > + * trigger unexpected userfaultfd missing events.
> > + *
> > + * This must be called after mm->context.allow_cow_sharing was
> > + * set to 0, to avoid future mappings of shared zeropages.
> > + *
> > + * mm contracts with s390, that even if mm were to remove a page table,
> > + * and racing with walk_page_range_vma() calling pte_offset_map_lock()
> > + * would fail, it will never insert a page table containing empty zero
> > + * pages once mm_forbids_zeropage(mm) i.e.
> > + * mm->context.allow_cow_sharing is set to 0.
> > + */
> > +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
> > +{
> 
> [...]
> 
> > +}
> > +
> > +static int __gmap_helper_disable_cow_sharing(struct mm_struct *mm)
> > +{
> 
> [...]
> 
> > +}
> > +
> > +/*
> > + * Disable most COW-sharing of memory pages for the whole process:
> > + * (1) Disable KSM and unmerge/unshare any KSM pages.
> > + * (2) Disallow shared zeropages and unshare any zerpages that are mapped.
> > + *
> > + * Not that we currently don't bother with COW-shared pages that are shared
> > + * with parent/child processes due to fork().
> > + */
> > +int gmap_helper_disable_cow_sharing(void)
> > +{
> 
> [...]
> 
> > +}
> > +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
> 