On Wed, 2025-05-14 at 18:38 +0200, Claudio Imbrenda wrote:
> Refactor some gmap functions; move the implementation into a separate
> file with only helper functions. The new helper functions work on vm
> addresses, leaving all gmap logic in the gmap functions, which mostly
> become just wrappers.
>
> The whole gmap handling is going to be moved inside KVM soon, but the
> helper functions need to touch core mm functions, and thus need to
> stay in the core of kernel.
>
> Signed-off-by: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
> ---
>  MAINTAINERS                          |   2 +
>  arch/s390/include/asm/gmap_helpers.h |  18 ++
>  arch/s390/kvm/diag.c                 |  11 +-
>  arch/s390/kvm/kvm-s390.c             |   3 +-
>  arch/s390/mm/Makefile                |   2 +
>  arch/s390/mm/gmap.c                  |  46 ++---
>  arch/s390/mm/gmap_helpers.c          | 266 +++++++++++++++++++++++++++
>  7 files changed, 307 insertions(+), 41 deletions(-)
>  create mode 100644 arch/s390/include/asm/gmap_helpers.h
>  create mode 100644 arch/s390/mm/gmap_helpers.c
> [...]
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 4869555ff403..17a2a57de3a2 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> [...]
>  void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
>  {
> -        unsigned long gaddr, vmaddr, size;
> -        struct vm_area_struct *vma;
> +        unsigned long vmaddr, next, start, end;
>
>          mmap_read_lock(gmap->mm);
> -        for (gaddr = from; gaddr < to;
> -             gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
> -                /* Find the vm address for the guest address */
> -                vmaddr = (unsigned long)
> -                        radix_tree_lookup(&gmap->guest_to_host,
> -                                          gaddr >> PMD_SHIFT);
> +        for ( ; from < to; from = next) {
> +                next = ALIGN(from + 1, PMD_SIZE);

I think you can use pmd_addr_end(from, to) here...

> +                vmaddr = (unsigned long)radix_tree_lookup(&gmap->guest_to_host, from >> PMD_SHIFT);
>                  if (!vmaddr)
>                          continue;
> -                vmaddr |= gaddr & ~PMD_MASK;
> -                /* Find vma in the parent mm */
> -                vma = find_vma(gmap->mm, vmaddr);
> -                if (!vma)
> -                        continue;
> -                /*
> -                 * We do not discard pages that are backed by
> -                 * hugetlbfs, so we don't have to refault them.
> -                 */
> -                if (is_vm_hugetlb_page(vma))
> -                        continue;
> -                size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
> -                zap_page_range_single(vma, vmaddr, size, NULL);
> +                start = vmaddr | (from & ~PMD_MASK);
> +                end = (vmaddr | (min(to - 1, next - 1) & ~PMD_MASK)) + 1;

... then simply do:

        end = vmaddr + (next - from);
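For reference, unless the arch overrides it, the generic fallback in
include/linux/pgtable.h looks roughly like this (quoting from memory):

        /* next PMD boundary after addr, clamped to end */
        #define pmd_addr_end(addr, end)                                         \
        ({      unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;      \
                (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
        })

so next can never overshoot to, and the min(to - 1, next - 1) above is
no longer needed.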
> +                __gmap_helper_discard(gmap->mm, start, end);
>          }
>          mmap_read_unlock(gmap->mm);
>  }
> diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
> new file mode 100644
> index 000000000000..8e66586718f6
> --- /dev/null
> +++ b/arch/s390/mm/gmap_helpers.c
> @@ -0,0 +1,266 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Helper functions for KVM guest address space mapping code
> + *
> + * Copyright IBM Corp. 2007, 2025
> + * Author(s): Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
> + *            Martin Schwidefsky <schwidefsky@xxxxxxxxxx>
> + *            David Hildenbrand <david@xxxxxxxxxx>
> + *            Janosch Frank <frankja@xxxxxxxxxxxxxxxxxx>
> + */
> +#include <linux/mm_types.h>
> +#include <linux/mmap_lock.h>
> +#include <linux/mm.h>
> +#include <linux/hugetlb.h>
> +#include <linux/pagewalk.h>
> +#include <linux/ksm.h>
> +#include <asm/gmap_helpers.h>

Please add documentation to all these functions for those of us that
don't live and breathe mm code :)
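Something along these lines, for example (just a sketch of the kind of
kernel-doc I have in mind, based on what the code below does; the exact
wording is up to you):

        /**
         * gmap_helper_discard() - discard user pages in the given range
         * @mm: the mm_struct of the target process
         * @vmaddr: starting userspace address
         * @end: end of the range (first address outside the range)
         *
         * All pages in the given userspace range are discarded (zapped);
         * pages backed by hugetlbfs are skipped, so that they do not need
         * to be refaulted.
         *
         * Context: takes and releases the mmap read lock of @mm.
         */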
> +
> +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
> +{
> +        if (!non_swap_entry(entry))
> +                dec_mm_counter(mm, MM_SWAPENTS);
> +        else if (is_migration_entry(entry))
> +                dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
> +        free_swap_and_cache(entry);
> +}
> +
> +void __gmap_helper_zap_one(struct mm_struct *mm, unsigned long vmaddr)

__gmap_helper_zap_mapping_pte ?

> +{
> +        struct vm_area_struct *vma;
> +        spinlock_t *ptl;
> +        pte_t *ptep;
> +
> +        mmap_assert_locked(mm);
> +
> +        /* Find the vm address for the guest address */
> +        vma = vma_lookup(mm, vmaddr);
> +        if (!vma || is_vm_hugetlb_page(vma))
> +                return;
> +
> +        /* Get pointer to the page table entry */
> +        ptep = get_locked_pte(mm, vmaddr, &ptl);
> +        if (!likely(ptep))

if (unlikely(!ptep)) reads nicer to me.

> +                return;
> +        if (pte_swap(*ptep))
> +                ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
> +        pte_unmap_unlock(ptep, ptl);
> +}
> +EXPORT_SYMBOL_GPL(__gmap_helper_zap_one);

Looks reasonable, but I'm not well versed enough in mm code to evaluate
that with confidence.

> +
> +void __gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)

Maybe call this gmap_helper_discard_nolock or something.

> +{
> +        struct vm_area_struct *vma;
> +        unsigned long next;
> +
> +        mmap_assert_locked(mm);
> +
> +        while (vmaddr < end) {
> +                vma = find_vma_intersection(mm, vmaddr, end);
> +                if (!vma)
> +                        break;
> +                vmaddr = max(vmaddr, vma->vm_start);
> +                next = min(end, vma->vm_end);
> +                if (!is_vm_hugetlb_page(vma))
> +                        zap_page_range_single(vma, vmaddr, next - vmaddr, NULL);
> +                vmaddr = next;
> +        }
> +}
> +
> +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
> +{
> +        mmap_read_lock(mm);
> +        __gmap_helper_discard(mm, vmaddr, end);
> +        mmap_read_unlock(mm);
> +}
> +EXPORT_SYMBOL_GPL(gmap_helper_discard);
> +
> +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
> +{
> +        struct vm_area_struct *vma;
> +        pgd_t *pgd;
> +        p4d_t *p4d;
> +        pud_t *pud;
> +
> +        /* We need a valid VMA, otherwise this is clearly a fault. */
> +        vma = vma_lookup(mm, addr);
> +        if (!vma)
> +                return -EFAULT;
> +
> +        pgd = pgd_offset(mm, addr);
> +        if (!pgd_present(*pgd))
> +                return -ENOENT;

What about pgd_bad?

> +
> +        p4d = p4d_offset(pgd, addr);
> +        if (!p4d_present(*p4d))
> +                return -ENOENT;
> +
> +        pud = pud_offset(p4d, addr);
> +        if (!pud_present(*pud))
> +                return -ENOENT;
> +
> +        /* Large PUDs are not supported yet. */
> +        if (pud_leaf(*pud))
> +                return -EFAULT;
> +
> +        *pmdp = pmd_offset(pud, addr);
> +        return 0;
> +}
> +
> +void __gmap_helper_set_unused(struct mm_struct *mm, unsigned long vmaddr)

What is this function for? Why do you introduce it now?

> +{
> +        spinlock_t *ptl;
> +        pmd_t *pmdp;
> +        pte_t *ptep;
> +
> +        mmap_assert_locked(mm);
> +
> +        if (pmd_lookup(mm, vmaddr, &pmdp))
> +                return;
> +        ptl = pmd_lock(mm, pmdp);
> +        if (!pmd_present(*pmdp) || pmd_leaf(*pmdp)) {
> +                spin_unlock(ptl);
> +                return;
> +        }
> +        spin_unlock(ptl);
> +
> +        ptep = pte_offset_map_lock(mm, pmdp, vmaddr, &ptl);
> +        if (!ptep)
> +                return;
> +        /* The last byte of a pte can be changed freely without ipte */
> +        __atomic64_or(_PAGE_UNUSED, (long *)ptep);
> +        pte_unmap_unlock(ptep, ptl);
> +}
> +EXPORT_SYMBOL_GPL(__gmap_helper_set_unused);

The stuff below is from arch/s390/mm/gmap.c right? Are you going to
delete it from there?

> +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
> +                                   unsigned long end, struct mm_walk *walk)
> +{
[...]
> +}
> +
> +static const struct mm_walk_ops find_zeropage_ops = {
> +        .pte_entry = find_zeropage_pte_entry,
> +        .walk_lock = PGWALK_WRLOCK,
> +};
> +
> +/*
> + * Unshare all shared zeropages, replacing them by anonymous pages. Note that
> + * we cannot simply zap all shared zeropages, because this could later
> + * trigger unexpected userfaultfd missing events.
> + *
> + * This must be called after mm->context.allow_cow_sharing was
> + * set to 0, to avoid future mappings of shared zeropages.
> + *
> + * mm contracts with s390, that even if mm were to remove a page table,
> + * and racing with walk_page_range_vma() calling pte_offset_map_lock()
> + * would fail, it will never insert a page table containing empty zero
> + * pages once mm_forbids_zeropage(mm) i.e.
> + * mm->context.allow_cow_sharing is set to 0.
> + */
> +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
> +{
[...]
> +}
> +
> +static int __gmap_helper_disable_cow_sharing(struct mm_struct *mm)
> +{
[...]
> +}
> +
> +/*
> + * Disable most COW-sharing of memory pages for the whole process:
> + * (1) Disable KSM and unmerge/unshare any KSM pages.
> + * (2) Disallow shared zeropages and unshare any zerpages that are mapped.
> + *
> + * Not that we currently don't bother with COW-shared pages that are shared
> + * with parent/child processes due to fork().
> + */
> +int gmap_helper_disable_cow_sharing(void)
> +{
[...]
> +}
> +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);

--
IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Wolfgang Wendt
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart, HRB 243294