On Mon, Apr 28, 2025 at 12:13 PM Liam R. Howlett <Liam.Howlett@xxxxxxxxxx> wrote:
>
> * Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> [250428 11:28]:
> > This is a key step in our being able to abstract and isolate VMA allocation
> > and destruction logic.
> >
> > This function is the last one where vm_area_free() and vm_area_dup() are
> > directly referenced outside of mmap, so having this in mm allows us to
> > isolate these.
> >
> > We do the same for the nommu version which is substantially simpler.
> >
> > We place the declaration for dup_mmap() in mm/internal.h and have
> > kernel/fork.c import this in order to prevent improper use of this
> > functionality elsewhere in the kernel.
> >
> > While we're here, we remove the useless #ifdef CONFIG_MMU check around
> > mmap_read_lock_maybe_expand() in mmap.c, as mmap.c is compiled only if
> > CONFIG_MMU is set.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
> > Suggested-by: Pedro Falcato <pfalcato@xxxxxxx>
> > Reviewed-by: Pedro Falcato <pfalcato@xxxxxxx>
> > Reviewed-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>

Reviewed-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>

> >
> > ---
> >  kernel/fork.c | 189 ++------------------------------------------------
> >  mm/internal.h |   2 +
> >  mm/mmap.c     | 181 +++++++++++++++++++++++++++++++++++++++++++++--
> >  mm/nommu.c    |   8 +++
> >  4 files changed, 189 insertions(+), 191 deletions(-)
> >
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 168681fc4b25..ac9f9267a473 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -112,6 +112,9 @@
> >  #include <asm/cacheflush.h>
> >  #include <asm/tlbflush.h>
> >
> > +/* For dup_mmap(). */
> > +#include "../mm/internal.h"
> > +
> >  #include <trace/events/sched.h>
> >
> >  #define CREATE_TRACE_POINTS
> > @@ -589,7 +592,7 @@ void free_task(struct task_struct *tsk)
> >  }
> >  EXPORT_SYMBOL(free_task);
> >
> > -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
> > +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
> >  {
> >         struct file *exe_file;
> >
> > @@ -604,183 +607,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
> >  }
> >
> >  #ifdef CONFIG_MMU
> > -static __latent_entropy int dup_mmap(struct mm_struct *mm,
> > -                                    struct mm_struct *oldmm)
> > -{
> > -       struct vm_area_struct *mpnt, *tmp;
> > -       int retval;
> > -       unsigned long charge = 0;
> > -       LIST_HEAD(uf);
> > -       VMA_ITERATOR(vmi, mm, 0);
> > -
> > -       if (mmap_write_lock_killable(oldmm))
> > -               return -EINTR;
> > -       flush_cache_dup_mm(oldmm);
> > -       uprobe_dup_mmap(oldmm, mm);
> > -       /*
> > -        * Not linked in yet - no deadlock potential:
> > -        */
> > -       mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
> > -
> > -       /* No ordering required: file already has been exposed. */
> > -       dup_mm_exe_file(mm, oldmm);
> > -
> > -       mm->total_vm = oldmm->total_vm;
> > -       mm->data_vm = oldmm->data_vm;
> > -       mm->exec_vm = oldmm->exec_vm;
> > -       mm->stack_vm = oldmm->stack_vm;
> > -
> > -       /* Use __mt_dup() to efficiently build an identical maple tree. */
> > -       retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
> > -       if (unlikely(retval))
> > -               goto out;
> > -
> > -       mt_clear_in_rcu(vmi.mas.tree);
> > -       for_each_vma(vmi, mpnt) {
> > -               struct file *file;
> > -
> > -               vma_start_write(mpnt);
> > -               if (mpnt->vm_flags & VM_DONTCOPY) {
> > -                       retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
> > -                                                   mpnt->vm_end, GFP_KERNEL);
> > -                       if (retval)
> > -                               goto loop_out;
> > -
> > -                       vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
> > -                       continue;
> > -               }
> > -               charge = 0;
> > -               /*
> > -                * Don't duplicate many vmas if we've been oom-killed (for
> > -                * example)
> > -                */
> > -               if (fatal_signal_pending(current)) {
> > -                       retval = -EINTR;
> > -                       goto loop_out;
> > -               }
> > -               if (mpnt->vm_flags & VM_ACCOUNT) {
> > -                       unsigned long len = vma_pages(mpnt);
> > -
> > -                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
> > -                               goto fail_nomem;
> > -                       charge = len;
> > -               }
> > -               tmp = vm_area_dup(mpnt);
> > -               if (!tmp)
> > -                       goto fail_nomem;
> > -
> > -               /* track_pfn_copy() will later take care of copying internal state. */
> > -               if (unlikely(tmp->vm_flags & VM_PFNMAP))
> > -                       untrack_pfn_clear(tmp);
> > -
> > -               retval = vma_dup_policy(mpnt, tmp);
> > -               if (retval)
> > -                       goto fail_nomem_policy;
> > -               tmp->vm_mm = mm;
> > -               retval = dup_userfaultfd(tmp, &uf);
> > -               if (retval)
> > -                       goto fail_nomem_anon_vma_fork;
> > -               if (tmp->vm_flags & VM_WIPEONFORK) {
> > -                       /*
> > -                        * VM_WIPEONFORK gets a clean slate in the child.
> > -                        * Don't prepare anon_vma until fault since we don't
> > -                        * copy page for current vma.
> > -                        */
> > -                       tmp->anon_vma = NULL;
> > -               } else if (anon_vma_fork(tmp, mpnt))
> > -                       goto fail_nomem_anon_vma_fork;
> > -               vm_flags_clear(tmp, VM_LOCKED_MASK);
> > -               /*
> > -                * Copy/update hugetlb private vma information.
> > -                */
> > -               if (is_vm_hugetlb_page(tmp))
> > -                       hugetlb_dup_vma_private(tmp);
> > -
> > -               /*
> > -                * Link the vma into the MT. After using __mt_dup(), memory
> > -                * allocation is not necessary here, so it cannot fail.
> > -                */
> > -               vma_iter_bulk_store(&vmi, tmp);
> > -
> > -               mm->map_count++;
> > -
> > -               if (tmp->vm_ops && tmp->vm_ops->open)
> > -                       tmp->vm_ops->open(tmp);
> > -
> > -               file = tmp->vm_file;
> > -               if (file) {
> > -                       struct address_space *mapping = file->f_mapping;
> > -
> > -                       get_file(file);
> > -                       i_mmap_lock_write(mapping);
> > -                       if (vma_is_shared_maywrite(tmp))
> > -                               mapping_allow_writable(mapping);
> > -                       flush_dcache_mmap_lock(mapping);
> > -                       /* insert tmp into the share list, just after mpnt */
> > -                       vma_interval_tree_insert_after(tmp, mpnt,
> > -                                                      &mapping->i_mmap);
> > -                       flush_dcache_mmap_unlock(mapping);
> > -                       i_mmap_unlock_write(mapping);
> > -               }
> > -
> > -               if (!(tmp->vm_flags & VM_WIPEONFORK))
> > -                       retval = copy_page_range(tmp, mpnt);
> > -
> > -               if (retval) {
> > -                       mpnt = vma_next(&vmi);
> > -                       goto loop_out;
> > -               }
> > -       }
> > -       /* a new mm has just been created */
> > -       retval = arch_dup_mmap(oldmm, mm);
> > -loop_out:
> > -       vma_iter_free(&vmi);
> > -       if (!retval) {
> > -               mt_set_in_rcu(vmi.mas.tree);
> > -               ksm_fork(mm, oldmm);
> > -               khugepaged_fork(mm, oldmm);
> > -       } else {
> > -
> > -               /*
> > -                * The entire maple tree has already been duplicated. If the
> > -                * mmap duplication fails, mark the failure point with
> > -                * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> > -                * stop releasing VMAs that have not been duplicated after this
> > -                * point.
> > -                */
> > -               if (mpnt) {
> > -                       mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> > -                       mas_store(&vmi.mas, XA_ZERO_ENTRY);
> > -                       /* Avoid OOM iterating a broken tree */
> > -                       set_bit(MMF_OOM_SKIP, &mm->flags);
> > -               }
> > -               /*
> > -                * The mm_struct is going to exit, but the locks will be dropped
> > -                * first. Set the mm_struct as unstable is advisable as it is
> > -                * not fully initialised.
> > -                */
> > -               set_bit(MMF_UNSTABLE, &mm->flags);
> > -       }
> > -out:
> > -       mmap_write_unlock(mm);
> > -       flush_tlb_mm(oldmm);
> > -       mmap_write_unlock(oldmm);
> > -       if (!retval)
> > -               dup_userfaultfd_complete(&uf);
> > -       else
> > -               dup_userfaultfd_fail(&uf);
> > -       return retval;
> > -
> > -fail_nomem_anon_vma_fork:
> > -       mpol_put(vma_policy(tmp));
> > -fail_nomem_policy:
> > -       vm_area_free(tmp);
> > -fail_nomem:
> > -       retval = -ENOMEM;
> > -       vm_unacct_memory(charge);
> > -       goto loop_out;
> > -}
> > -
> >  static inline int mm_alloc_pgd(struct mm_struct *mm)
> >  {
> >         mm->pgd = pgd_alloc(mm);
> > @@ -794,13 +620,6 @@ static inline void mm_free_pgd(struct mm_struct *mm)
> >         pgd_free(mm, mm->pgd);
> >  }
> >  #else
> > -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> > -{
> > -       mmap_write_lock(oldmm);
> > -       dup_mm_exe_file(mm, oldmm);
> > -       mmap_write_unlock(oldmm);
> > -       return 0;
> > -}
> >  #define mm_alloc_pgd(mm)       (0)
> >  #define mm_free_pgd(mm)
> >  #endif /* CONFIG_MMU */
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 40464f755092..b3e011976f74 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -1631,5 +1631,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
> >  }
> >  #endif /* CONFIG_PT_RECLAIM */
> >
> > +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
> > +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
> >
> >  #endif /* __MM_INTERNAL_H */
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 9e09eac0021c..5259df031e15 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1675,7 +1675,6 @@ static int __meminit init_reserve_notifier(void)
> >  }
> >  subsys_initcall(init_reserve_notifier);
> >
> > -#ifdef CONFIG_MMU
> >  /*
> >   * Obtain a read lock on mm->mmap_lock, if the specified address is below the
> >   * start of the VMA, the intent is to perform a write, and it is a
> > @@ -1719,10 +1718,180 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
> >         mmap_write_downgrade(mm);
> >         return true;
> >  }
> > -#else
> > -bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> > -                                unsigned long addr, bool write)
> > +
> > +__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> >  {
> > -       return false;
> > +       struct vm_area_struct *mpnt, *tmp;
> > +       int retval;
> > +       unsigned long charge = 0;
> > +       LIST_HEAD(uf);
> > +       VMA_ITERATOR(vmi, mm, 0);
> > +
> > +       if (mmap_write_lock_killable(oldmm))
> > +               return -EINTR;
> > +       flush_cache_dup_mm(oldmm);
> > +       uprobe_dup_mmap(oldmm, mm);
> > +       /*
> > +        * Not linked in yet - no deadlock potential:
> > +        */
> > +       mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
> > +
> > +       /* No ordering required: file already has been exposed. */
> > +       dup_mm_exe_file(mm, oldmm);
> > +
> > +       mm->total_vm = oldmm->total_vm;
> > +       mm->data_vm = oldmm->data_vm;
> > +       mm->exec_vm = oldmm->exec_vm;
> > +       mm->stack_vm = oldmm->stack_vm;
> > +
> > +       /* Use __mt_dup() to efficiently build an identical maple tree. */
> > +       retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
> > +       if (unlikely(retval))
> > +               goto out;
> > +
> > +       mt_clear_in_rcu(vmi.mas.tree);
> > +       for_each_vma(vmi, mpnt) {
> > +               struct file *file;
> > +
> > +               vma_start_write(mpnt);
> > +               if (mpnt->vm_flags & VM_DONTCOPY) {
> > +                       retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
> > +                                                   mpnt->vm_end, GFP_KERNEL);
> > +                       if (retval)
> > +                               goto loop_out;
> > +
> > +                       vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
> > +                       continue;
> > +               }
> > +               charge = 0;
> > +               /*
> > +                * Don't duplicate many vmas if we've been oom-killed (for
> > +                * example)
> > +                */
> > +               if (fatal_signal_pending(current)) {
> > +                       retval = -EINTR;
> > +                       goto loop_out;
> > +               }
> > +               if (mpnt->vm_flags & VM_ACCOUNT) {
> > +                       unsigned long len = vma_pages(mpnt);
> > +
> > +                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
> > +                               goto fail_nomem;
> > +                       charge = len;
> > +               }
> > +
> > +               tmp = vm_area_dup(mpnt);
> > +               if (!tmp)
> > +                       goto fail_nomem;
> > +
> > +               /* track_pfn_copy() will later take care of copying internal state. */
> > +               if (unlikely(tmp->vm_flags & VM_PFNMAP))
> > +                       untrack_pfn_clear(tmp);
> > +
> > +               retval = vma_dup_policy(mpnt, tmp);
> > +               if (retval)
> > +                       goto fail_nomem_policy;
> > +               tmp->vm_mm = mm;
> > +               retval = dup_userfaultfd(tmp, &uf);
> > +               if (retval)
> > +                       goto fail_nomem_anon_vma_fork;
> > +               if (tmp->vm_flags & VM_WIPEONFORK) {
> > +                       /*
> > +                        * VM_WIPEONFORK gets a clean slate in the child.
> > +                        * Don't prepare anon_vma until fault since we don't
> > +                        * copy page for current vma.
> > +                        */
> > +                       tmp->anon_vma = NULL;
> > +               } else if (anon_vma_fork(tmp, mpnt))
> > +                       goto fail_nomem_anon_vma_fork;
> > +               vm_flags_clear(tmp, VM_LOCKED_MASK);
> > +               /*
> > +                * Copy/update hugetlb private vma information.
> > +                */
> > +               if (is_vm_hugetlb_page(tmp))
> > +                       hugetlb_dup_vma_private(tmp);
> > +
> > +               /*
> > +                * Link the vma into the MT. After using __mt_dup(), memory
> > +                * allocation is not necessary here, so it cannot fail.
> > +                */
> > +               vma_iter_bulk_store(&vmi, tmp);
> > +
> > +               mm->map_count++;
> > +
> > +               if (tmp->vm_ops && tmp->vm_ops->open)
> > +                       tmp->vm_ops->open(tmp);
> > +
> > +               file = tmp->vm_file;
> > +               if (file) {
> > +                       struct address_space *mapping = file->f_mapping;
> > +
> > +                       get_file(file);
> > +                       i_mmap_lock_write(mapping);
> > +                       if (vma_is_shared_maywrite(tmp))
> > +                               mapping_allow_writable(mapping);
> > +                       flush_dcache_mmap_lock(mapping);
> > +                       /* insert tmp into the share list, just after mpnt */
> > +                       vma_interval_tree_insert_after(tmp, mpnt,
> > +                                                      &mapping->i_mmap);
> > +                       flush_dcache_mmap_unlock(mapping);
> > +                       i_mmap_unlock_write(mapping);
> > +               }
> > +
> > +               if (!(tmp->vm_flags & VM_WIPEONFORK))
> > +                       retval = copy_page_range(tmp, mpnt);
> > +
> > +               if (retval) {
> > +                       mpnt = vma_next(&vmi);
> > +                       goto loop_out;
> > +               }
> > +       }
> > +       /* a new mm has just been created */
> > +       retval = arch_dup_mmap(oldmm, mm);
> > +loop_out:
> > +       vma_iter_free(&vmi);
> > +       if (!retval) {
> > +               mt_set_in_rcu(vmi.mas.tree);
> > +               ksm_fork(mm, oldmm);
> > +               khugepaged_fork(mm, oldmm);
> > +       } else {
> > +
> > +               /*
> > +                * The entire maple tree has already been duplicated. If the
> > +                * mmap duplication fails, mark the failure point with
> > +                * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> > +                * stop releasing VMAs that have not been duplicated after this
> > +                * point.
> > +                */
> > +               if (mpnt) {
> > +                       mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> > +                       mas_store(&vmi.mas, XA_ZERO_ENTRY);
> > +                       /* Avoid OOM iterating a broken tree */
> > +                       set_bit(MMF_OOM_SKIP, &mm->flags);
> > +               }
> > +               /*
> > +                * The mm_struct is going to exit, but the locks will be dropped
> > +                * first. Set the mm_struct as unstable is advisable as it is
> > +                * not fully initialised.
> > +                */
> > +               set_bit(MMF_UNSTABLE, &mm->flags);
> > +       }
> > +out:
> > +       mmap_write_unlock(mm);
> > +       flush_tlb_mm(oldmm);
> > +       mmap_write_unlock(oldmm);
> > +       if (!retval)
> > +               dup_userfaultfd_complete(&uf);
> > +       else
> > +               dup_userfaultfd_fail(&uf);
> > +       return retval;
> > +
> > +fail_nomem_anon_vma_fork:
> > +       mpol_put(vma_policy(tmp));
> > +fail_nomem_policy:
> > +       vm_area_free(tmp);
> > +fail_nomem:
> > +       retval = -ENOMEM;
> > +       vm_unacct_memory(charge);
> > +       goto loop_out;
> >  }
> > -#endif
> > diff --git a/mm/nommu.c b/mm/nommu.c
> > index 2b4d304c6445..a142fc258d39 100644
> > --- a/mm/nommu.c
> > +++ b/mm/nommu.c
> > @@ -1874,3 +1874,11 @@ static int __meminit init_admin_reserve(void)
> >         return 0;
> >  }
> >  subsys_initcall(init_admin_reserve);
> > +
> > +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> > +{
> > +       mmap_write_lock(oldmm);
> > +       dup_mm_exe_file(mm, oldmm);
> > +       mmap_write_unlock(oldmm);
> > +       return 0;
> > +}
> > --
> > 2.49.0
> >
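
For reference, the caller of dup_mmap() is untouched by this change: dup_mm() in
kernel/fork.c continues to call it, now resolved through the declaration placed
in mm/internal.h. A rough, simplified sketch of that caller (written from memory,
error handling abbreviated, not part of this diff):

static struct mm_struct *dup_mm(struct task_struct *tsk,
				struct mm_struct *oldmm)
{
	struct mm_struct *mm;
	int err;

	/* allocate_mm() is the fork.c-local slab allocation helper. */
	mm = allocate_mm();
	if (!mm)
		return NULL;

	memcpy(mm, oldmm, sizeof(*mm));

	if (!mm_init(mm, tsk, mm->user_ns))
		return NULL;

	/* Now provided by mm/mmap.c (or mm/nommu.c) via mm/internal.h. */
	err = dup_mmap(mm, oldmm);
	if (err)
		goto free_pt;

	mm->hiwater_rss = get_mm_rss(mm);
	mm->hiwater_vm = mm->total_vm;

	return mm;

free_pt:
	/* Simplified teardown; the real code also clears ->binfmt etc. */
	mmput(mm);
	return NULL;
}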