On Wed, Jul 16, 2025 at 7:53 AM David Hildenbrand <david@xxxxxxxxxx> wrote:
>
> On 14.07.25 02:31, Nico Pache wrote:
> > Generalize the order of the __collapse_huge_page_* functions
> > to support future mTHP collapse.
> >
> > mTHP collapse can suffer from inconsistent behavior and memory waste
> > "creep". Disable swapin and shared support for mTHP collapse.
> >
> > No functional changes in this patch.
> >
> > Reviewed-by: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>
> > Co-developed-by: Dev Jain <dev.jain@xxxxxxx>
> > Signed-off-by: Dev Jain <dev.jain@xxxxxxx>
> > Signed-off-by: Nico Pache <npache@xxxxxxxxxx>
> > ---
> >   mm/khugepaged.c | 49 +++++++++++++++++++++++++++++++------------------
> >   1 file changed, 31 insertions(+), 18 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index cc9a35185604..ee54e3c1db4e 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -552,15 +552,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  					unsigned long address,
> >  					pte_t *pte,
> >  					struct collapse_control *cc,
> > -					struct list_head *compound_pagelist)
> > +					struct list_head *compound_pagelist,
> > +					u8 order)
>
> u8 ... (applies to all instances)

Fixed all instances of this (other than those that need to stay)

>
> >  {
> >  	struct page *page = NULL;
> >  	struct folio *folio = NULL;
> >  	pte_t *_pte;
> >  	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> >  	bool writable = false;
> > +	int scaled_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>
> "scaled_max_ptes_none" maybe?

done!

> >
> > -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> > +	for (_pte = pte; _pte < pte + (1 << order);
> >  	     _pte++, address += PAGE_SIZE) {
> >  		pte_t pteval = ptep_get(_pte);
> >  		if (pte_none(pteval) || (pte_present(pteval) &&
> > @@ -568,7 +570,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  			++none_or_zero;
> >  			if (!userfaultfd_armed(vma) &&
> >  			    (!cc->is_khugepaged ||
> > -			     none_or_zero <= khugepaged_max_ptes_none)) {
> > +			     none_or_zero <= scaled_none)) {
> >  				continue;
> >  			} else {
> >  				result = SCAN_EXCEED_NONE_PTE;
> > @@ -596,8 +598,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  		/* See hpage_collapse_scan_pmd(). */
> >  		if (folio_maybe_mapped_shared(folio)) {
> >  			++shared;
> > -			if (cc->is_khugepaged &&
> > -			    shared > khugepaged_max_ptes_shared) {
> > +			if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> > +			    shared > khugepaged_max_ptes_shared)) {
>
> Please add a comment why we do something different with PMD. As
> commented below, does this deserve a TODO?
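To make the max_ptes_none scaling above concrete, here is a minimal
userspace sketch. It assumes the common x86-64 defaults
(HPAGE_PMD_ORDER = 9, khugepaged_max_ptes_none = 511); it illustrates
the shift only and is not the kernel code:

/*
 * Sketch of the scaling in __collapse_huge_page_isolate: the PMD-level
 * max_ptes_none tunable is shifted down so that roughly the same
 * fraction of none/zero PTEs is tolerated at every collapse order.
 * Assumed defaults: HPAGE_PMD_ORDER = 9, khugepaged_max_ptes_none = 511.
 */
#include <stdio.h>

#define HPAGE_PMD_ORDER 9

static unsigned int khugepaged_max_ptes_none = 511; /* default: HPAGE_PMD_NR - 1 */

int main(void)
{
	for (unsigned int order = 2; order <= HPAGE_PMD_ORDER; order++) {
		unsigned int scaled_max_ptes_none =
			khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
		printf("order %u: up to %u of %u PTEs may be none/zero\n",
		       order, scaled_max_ptes_none, 1u << order);
	}
	return 0;
}

For example, at order 4 this allows 15 of 16 PTEs to be none/zero,
mirroring the 511-of-512 default at PMD order.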
>
> >  				result = SCAN_EXCEED_SHARED_PTE;
> >  				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> >  				goto out;
> > @@ -698,13 +700,14 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> >  						struct vm_area_struct *vma,
> >  						unsigned long address,
> >  						spinlock_t *ptl,
> > -						struct list_head *compound_pagelist)
> > +						struct list_head *compound_pagelist,
> > +						u8 order)
> >  {
> >  	struct folio *src, *tmp;
> >  	pte_t *_pte;
> >  	pte_t pteval;
> >
> > -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> > +	for (_pte = pte; _pte < pte + (1 << order);
> >  	     _pte++, address += PAGE_SIZE) {
> >  		pteval = ptep_get(_pte);
> >  		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
> > @@ -751,7 +754,8 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> >  					pmd_t *pmd,
> >  					pmd_t orig_pmd,
> >  					struct vm_area_struct *vma,
> > -					struct list_head *compound_pagelist)
> > +					struct list_head *compound_pagelist,
> > +					u8 order)
> >  {
> >  	spinlock_t *pmd_ptl;
> >
> > @@ -768,7 +772,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> >  	 * Release both raw and compound pages isolated
> >  	 * in __collapse_huge_page_isolate.
> >  	 */
> > -	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> > +	release_pte_pages(pte, pte + (1 << order), compound_pagelist);
> >  }
> >
> >  /*
> > @@ -789,7 +793,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> >  static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
> >  		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> >  		unsigned long address, spinlock_t *ptl,
> > -		struct list_head *compound_pagelist)
> > +		struct list_head *compound_pagelist, u8 order)
> >  {
> >  	unsigned int i;
> >  	int result = SCAN_SUCCEED;
> > @@ -797,7 +801,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
> >  	/*
> >  	 * Copying pages' contents is subject to memory poison at any iteration.
> >  	 */
> > -	for (i = 0; i < HPAGE_PMD_NR; i++) {
> > +	for (i = 0; i < (1 << order); i++) {
> >  		pte_t pteval = ptep_get(pte + i);
> >  		struct page *page = folio_page(folio, i);
> >  		unsigned long src_addr = address + i * PAGE_SIZE;
> >
> > @@ -816,10 +820,10 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
> >
> >  	if (likely(result == SCAN_SUCCEED))
> >  		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> > -						    compound_pagelist);
> > +						    compound_pagelist, order);
> >  	else
> >  		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> > -						 compound_pagelist);
> > +						 compound_pagelist, order);
> >
> >  	return result;
> >  }
> > @@ -994,11 +998,11 @@ static int check_pmd_still_valid(struct mm_struct *mm,
> >  static int __collapse_huge_page_swapin(struct mm_struct *mm,
> >  					struct vm_area_struct *vma,
> >  					unsigned long haddr, pmd_t *pmd,
> > -					int referenced)
> > +					int referenced, u8 order)
> >  {
> >  	int swapped_in = 0;
> >  	vm_fault_t ret = 0;
> > -	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
> > +	unsigned long address, end = haddr + (PAGE_SIZE << order);
> >  	int result;
> >  	pte_t *pte = NULL;
> >  	spinlock_t *ptl;
> > @@ -1029,6 +1033,15 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> >  		if (!is_swap_pte(vmf.orig_pte))
> >  			continue;
> >
> > +		/* Don't swapin for mTHP collapse */
>
> Should we turn this into a TODO, because it's something to figure out
> regarding the scaling etc?
Good idea, I changed both of these into TODOs

>
> > +		if (order != HPAGE_PMD_ORDER) {
> > +			count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> > +			pte_unmap(pte);
> > +			mmap_read_unlock(mm);
> > +			result = SCAN_EXCEED_SWAP_PTE;
> > +			goto out;
> > +		}
> > +
> >  		vmf.pte = pte;
> >  		vmf.ptl = ptl;
> >  		ret = do_swap_page(&vmf);
> > @@ -1149,7 +1162,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >  		 * that case. Continuing to collapse causes inconsistency.
> >  		 */
> >  		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> > -						     referenced);
> > +							referenced, HPAGE_PMD_ORDER);
>
> Indent messed up. Feel free to exceed 80 chars if it aids readability.

Fixed!

> >  		if (result != SCAN_SUCCEED)
> >  			goto out_nolock;
> >  	}
> > @@ -1197,7 +1210,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >  	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> >  	if (pte) {
> >  		result = __collapse_huge_page_isolate(vma, address, pte, cc,
> > -						      &compound_pagelist);
> > +					&compound_pagelist, HPAGE_PMD_ORDER);
>
> Ditto.

Fixed!

>
> Apart from that, nothing jumped at me
>
> Acked-by: David Hildenbrand <david@xxxxxxxxxx>

Thanks for the ack! I fixed the compile issue you noted too.

>
> --
> Cheers,
>
> David / dhildenb
>
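As a footnote for readers skimming the thread, the two mTHP policy
gates discussed above (reject any shared PTE and any swap PTE for
sub-PMD orders, while PMD collapse keeps the existing tunables) can be
sketched standalone as below. The enum values, the tunable stand-in,
and the helper name are invented for illustration; this is not the
kernel code:

/*
 * Sketch of the policy difference between PMD and mTHP collapse in
 * this patch: sub-PMD (mTHP) collapse bails out on any shared or
 * swapped-out PTE, while PMD collapse keeps the tunable-based checks.
 */
#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_ORDER	9
#define MAX_PTES_SHARED	256	/* stand-in for khugepaged_max_ptes_shared */

enum scan_result { SCAN_SUCCEED, SCAN_EXCEED_SHARED_PTE, SCAN_EXCEED_SWAP_PTE };

static enum scan_result check_pte_policy(unsigned int order, bool is_khugepaged,
					 int shared, bool is_swap_pte)
{
	/* TODO in the patch: no swapin support for mTHP collapse yet */
	if (is_swap_pte && order != HPAGE_PMD_ORDER)
		return SCAN_EXCEED_SWAP_PTE;

	/* TODO in the patch: no shared-page support for mTHP collapse yet */
	if (shared && (order != HPAGE_PMD_ORDER ||
		       (is_khugepaged && shared > MAX_PTES_SHARED)))
		return SCAN_EXCEED_SHARED_PTE;

	return SCAN_SUCCEED;
}

int main(void)
{
	/* mTHP (order 4) with one shared PTE: rejected outright */
	printf("%d\n", check_pte_policy(4, true, 1, false));
	/* PMD order with one shared PTE: allowed, under the tunable */
	printf("%d\n", check_pte_policy(HPAGE_PMD_ORDER, true, 1, false));
	return 0;
}

The reason for the hard gate is that the existing tunables are
PMD-scaled; until the scaling question is settled (hence the TODOs),
mTHP collapse simply refuses shared and swapped-out PTEs.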