On Wed, 21 May 2025 16:55:18 +0200
Nina Schoetterl-Glausch <nsg@xxxxxxxxxxxxx> wrote:

> On Wed, 2025-05-14 at 18:38 +0200, Claudio Imbrenda wrote:
> > Refactor some gmap functions; move the implementation into a separate
> > file with only helper functions. The new helper functions work on vm
> > addresses, leaving all gmap logic in the gmap functions, which mostly
> > become just wrappers.
> > 
> > The whole gmap handling is going to be moved inside KVM soon, but the
> > helper functions need to touch core mm functions, and thus need to
> > stay in the core of kernel.
> > 
> > Signed-off-by: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
> > ---
> >  MAINTAINERS                          |   2 +
> >  arch/s390/include/asm/gmap_helpers.h |  18 ++
> >  arch/s390/kvm/diag.c                 |  11 +-
> >  arch/s390/kvm/kvm-s390.c             |   3 +-
> >  arch/s390/mm/Makefile                |   2 +
> >  arch/s390/mm/gmap.c                  |  46 ++---
> >  arch/s390/mm/gmap_helpers.c          | 266 +++++++++++++++++++++++++++
> >  7 files changed, 307 insertions(+), 41 deletions(-)
> >  create mode 100644 arch/s390/include/asm/gmap_helpers.h
> >  create mode 100644 arch/s390/mm/gmap_helpers.c
> > 
> 
> [...]
> 
> > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> > index 4869555ff403..17a2a57de3a2 100644
> > --- a/arch/s390/mm/gmap.c
> > +++ b/arch/s390/mm/gmap.c
> 
> [...]
> 
> >  void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
> >  {
> > -        unsigned long gaddr, vmaddr, size;
> > -        struct vm_area_struct *vma;
> > +        unsigned long vmaddr, next, start, end;
> >  
> >          mmap_read_lock(gmap->mm);
> > -        for (gaddr = from; gaddr < to;
> > -             gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
> > -                /* Find the vm address for the guest address */
> > -                vmaddr = (unsigned long)
> > -                        radix_tree_lookup(&gmap->guest_to_host,
> > -                                          gaddr >> PMD_SHIFT);
> > +        for ( ; from < to; from = next) {
> > +                next = ALIGN(from + 1, PMD_SIZE);
> 
> I think you can use pmd_addr_end(from, to) here...

I guess
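
For reference, the generic fallback definition in include/linux/pgtable.h
is roughly:

        #define pmd_addr_end(addr, end)                                         \
        ({      unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;      \
                (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
        })

i.e. the next PMD boundary after addr, capped at end.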
> 
> > +                vmaddr = (unsigned long)radix_tree_lookup(&gmap->guest_to_host, from >> PMD_SHIFT);
> >                  if (!vmaddr)
> >                          continue;
> > -                vmaddr |= gaddr & ~PMD_MASK;
> > -                /* Find vma in the parent mm */
> > -                vma = find_vma(gmap->mm, vmaddr);
> > -                if (!vma)
> > -                        continue;
> > -                /*
> > -                 * We do not discard pages that are backed by
> > -                 * hugetlbfs, so we don't have to refault them.
> > -                 */
> > -                if (is_vm_hugetlb_page(vma))
> > -                        continue;
> > -                size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
> > -                zap_page_range_single(vma, vmaddr, size, NULL);
> > +                start = vmaddr | (from & ~PMD_MASK);
> > +                end = (vmaddr | (min(to - 1, next - 1) & ~PMD_MASK)) + 1;
> 
> ... then simply do:
> end = vmaddr + (next - from);

looks cleaner, yes

> 
> > +                __gmap_helper_discard(gmap->mm, start, end);
> >          }
> >          mmap_read_unlock(gmap->mm);
> >  }
> > diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
> > new file mode 100644
> > index 000000000000..8e66586718f6
> > --- /dev/null
> > +++ b/arch/s390/mm/gmap_helpers.c
> > @@ -0,0 +1,266 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Helper functions for KVM guest address space mapping code
> > + *
> > + * Copyright IBM Corp. 2007, 2025
> > + * Author(s): Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
> > + *            Martin Schwidefsky <schwidefsky@xxxxxxxxxx>
> > + *            David Hildenbrand <david@xxxxxxxxxx>
> > + *            Janosch Frank <frankja@xxxxxxxxxxxxxxxxxx>
> > + */
> > +#include <linux/mm_types.h>
> > +#include <linux/mmap_lock.h>
> > +#include <linux/mm.h>
> > +#include <linux/hugetlb.h>
> > +#include <linux/pagewalk.h>
> > +#include <linux/ksm.h>
> > +#include <asm/gmap_helpers.h>
> 
> Please add documentation to all these functions for those of us that
> don't live and breath mm code :)

eh... yes I think it's a good idea
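
Something like this maybe, just as a sketch of what a kernel-doc comment
for gmap_helper_discard() could look like:

        /**
         * gmap_helper_discard() - discard user pages in the given range
         * @mm: the mm_struct of the process whose pages get discarded
         * @vmaddr: starting userspace address of the range
         * @end: end (exclusive) of the range
         *
         * All pages in the given range are discarded; pages backed by
         * hugetlbfs are skipped. Takes and releases the mmap read lock.
         */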
> 
> > +
> > +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
> > +{
> > +        if (!non_swap_entry(entry))
> > +                dec_mm_counter(mm, MM_SWAPENTS);
> > +        else if (is_migration_entry(entry))
> > +                dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
> > +        free_swap_and_cache(entry);
> > +}
> > +
> > +void __gmap_helper_zap_one(struct mm_struct *mm, unsigned long vmaddr)
> 
> __gmap_helper_zap_mapping_pte ?

but I'm not taking a pte as parameter

> 
> > +{
> > +        struct vm_area_struct *vma;
> > +        spinlock_t *ptl;
> > +        pte_t *ptep;
> > +
> > +        mmap_assert_locked(mm);
> > +
> > +        /* Find the vm address for the guest address */
> > +        vma = vma_lookup(mm, vmaddr);
> > +        if (!vma || is_vm_hugetlb_page(vma))
> > +                return;
> > +
> > +        /* Get pointer to the page table entry */
> > +        ptep = get_locked_pte(mm, vmaddr, &ptl);
> > +        if (!likely(ptep))
> 
> if (unlikely(!ptep)) reads nicer to me.

ok

> 
> > +                return;
> > +        if (pte_swap(*ptep))
> > +                ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
> > +        pte_unmap_unlock(ptep, ptl);
> > +}
> > +EXPORT_SYMBOL_GPL(__gmap_helper_zap_one);
> 
> Looks reasonable, but I'm not well versed enough in mm code to evaluate
> that with confidence.
> 
> > +
> > +void __gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
> 
> Maybe call this gmap_helper_discard_nolock or something.

maybe __gmap_helper_discard_unlocked? the __ prefix often implies lack
of locking

> 
> > +{
> > +        struct vm_area_struct *vma;
> > +        unsigned long next;
> > +
> > +        mmap_assert_locked(mm);
> > +
> > +        while (vmaddr < end) {
> > +                vma = find_vma_intersection(mm, vmaddr, end);
> > +                if (!vma)
> > +                        break;
> > +                vmaddr = max(vmaddr, vma->vm_start);
> > +                next = min(end, vma->vm_end);
> > +                if (!is_vm_hugetlb_page(vma))
> > +                        zap_page_range_single(vma, vmaddr, next - vmaddr, NULL);
> > +                vmaddr = next;
> > +        }
> > +}
> > +
> > +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
> > +{
> > +        mmap_read_lock(mm);
> > +        __gmap_helper_discard(mm, vmaddr, end);
> > +        mmap_read_unlock(mm);
> > +}
> > +EXPORT_SYMBOL_GPL(gmap_helper_discard);
> > +
> > +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
> > +{
> > +        struct vm_area_struct *vma;
> > +        pgd_t *pgd;
> > +        p4d_t *p4d;
> > +        pud_t *pud;
> > +
> > +        /* We need a valid VMA, otherwise this is clearly a fault. */
> > +        vma = vma_lookup(mm, addr);
> > +        if (!vma)
> > +                return -EFAULT;
> > +
> > +        pgd = pgd_offset(mm, addr);
> > +        if (!pgd_present(*pgd))
> > +                return -ENOENT;
> 
> What about pgd_bad?

I don't know, this code is copied verbatim from mm/gmap.c

> 
> > +
> > +        p4d = p4d_offset(pgd, addr);
> > +        if (!p4d_present(*p4d))
> > +                return -ENOENT;
> > +
> > +        pud = pud_offset(p4d, addr);
> > +        if (!pud_present(*pud))
> > +                return -ENOENT;
> > +
> > +        /* Large PUDs are not supported yet. */
> > +        if (pud_leaf(*pud))
> > +                return -EFAULT;
> > +
> > +        *pmdp = pmd_offset(pud, addr);
> > +        return 0;
> > +}
> > +
> > +void __gmap_helper_set_unused(struct mm_struct *mm, unsigned long vmaddr)
> 
> What is this function for? Why do you introduce it now?

this is for cmma handling, I'm introducing it now because it needs to
be in this file and I would like to put all the stuff in at once. I
will not need to touch this file anymore if I get this in now.

> 
> > +{
> > +        spinlock_t *ptl;
> > +        pmd_t *pmdp;
> > +        pte_t *ptep;
> > +
> > +        mmap_assert_locked(mm);
> > +
> > +        if (pmd_lookup(mm, vmaddr, &pmdp))
> > +                return;
> > +        ptl = pmd_lock(mm, pmdp);
> > +        if (!pmd_present(*pmdp) || pmd_leaf(*pmdp)) {
> > +                spin_unlock(ptl);
> > +                return;
> > +        }
> > +        spin_unlock(ptl);
> > +
> > +        ptep = pte_offset_map_lock(mm, pmdp, vmaddr, &ptl);
> > +        if (!ptep)
> > +                return;
> > +        /* The last byte of a pte can be changed freely without ipte */
> > +        __atomic64_or(_PAGE_UNUSED, (long *)ptep);
> > +        pte_unmap_unlock(ptep, ptl);
> > +}
> > +EXPORT_SYMBOL_GPL(__gmap_helper_set_unused);
> 
> The stuff below is from arch/s390/mm/gmap.c right?
> Are you going to delete it from there?

not in this series, but the next series will remove mm/gmap.c
altogether

> 
> > +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
> > +                                   unsigned long end, struct mm_walk *walk)
> > +{
> 
> [...]
> 
> > +}
> > +
> > +static const struct mm_walk_ops find_zeropage_ops = {
> > +        .pte_entry      = find_zeropage_pte_entry,
> > +        .walk_lock      = PGWALK_WRLOCK,
> > +};
> > +
> > +/*
> > + * Unshare all shared zeropages, replacing them by anonymous pages. Note that
> > + * we cannot simply zap all shared zeropages, because this could later
> > + * trigger unexpected userfaultfd missing events.
> > + *
> > + * This must be called after mm->context.allow_cow_sharing was
> > + * set to 0, to avoid future mappings of shared zeropages.
> > + *
> > + * mm contracts with s390, that even if mm were to remove a page table,
> > + * and racing with walk_page_range_vma() calling pte_offset_map_lock()
> > + * would fail, it will never insert a page table containing empty zero
> > + * pages once mm_forbids_zeropage(mm) i.e.
> > + * mm->context.allow_cow_sharing is set to 0.
> > + */
> > +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
> > +{
> 
> [...]
> 
> > +}
> > +
> > +static int __gmap_helper_disable_cow_sharing(struct mm_struct *mm)
> > +{
> 
> [...]
> 
> > +}
> > +
> > +/*
> > + * Disable most COW-sharing of memory pages for the whole process:
> > + * (1) Disable KSM and unmerge/unshare any KSM pages.
> > + * (2) Disallow shared zeropages and unshare any zerpages that are mapped.
> > + *
> > + * Not that we currently don't bother with COW-shared pages that are shared
> > + * with parent/child processes due to fork().
> > + */
> > +int gmap_helper_disable_cow_sharing(void)
> > +{
> 
> [...]
> 
> > +}
> > +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
> 