* Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> [250428 11:28]:
> This is a key step in our being able to abstract and isolate VMA allocation
> and destruction logic.
>
> This function is the last one where vm_area_free() and vm_area_dup() are
> directly referenced outside of mmap, so having this in mm allows us to
> isolate these.
>
> We do the same for the nommu version which is substantially simpler.
>
> We place the declaration for dup_mmap() in mm/internal.h and have
> kernel/fork.c import this in order to prevent improper use of this
> functionality elsewhere in the kernel.
>
> While we're here, we remove the useless #ifdef CONFIG_MMU check around
> mmap_read_lock_maybe_expand() in mmap.c, mmap.c is compiled only if
> CONFIG_MMU is set.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
> Suggested-by: Pedro Falcato <pfalcato@xxxxxxx>
> Reviewed-by: Pedro Falcato <pfalcato@xxxxxxx>

Reviewed-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>

> ---
>  kernel/fork.c | 189 ++------------------------------------------------
>  mm/internal.h |   2 +
>  mm/mmap.c     | 181 +++++++++++++++++++++++++++++++++++++++++++++--
>  mm/nommu.c    |   8 +++
>  4 files changed, 189 insertions(+), 191 deletions(-)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 168681fc4b25..ac9f9267a473 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -112,6 +112,9 @@
>  #include <asm/cacheflush.h>
>  #include <asm/tlbflush.h>
>
> +/* For dup_mmap(). */
> +#include "../mm/internal.h"
> +
>  #include <trace/events/sched.h>
>
>  #define CREATE_TRACE_POINTS
> @@ -589,7 +592,7 @@ void free_task(struct task_struct *tsk)
>  }
>  EXPORT_SYMBOL(free_task);
>
> -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
> +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
>  {
>  	struct file *exe_file;
>
> @@ -604,183 +607,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
>  }
>
>  #ifdef CONFIG_MMU
> -static __latent_entropy int dup_mmap(struct mm_struct *mm,
> -					struct mm_struct *oldmm)
> -{
> -	struct vm_area_struct *mpnt, *tmp;
> -	int retval;
> -	unsigned long charge = 0;
> -	LIST_HEAD(uf);
> -	VMA_ITERATOR(vmi, mm, 0);
> -
> -	if (mmap_write_lock_killable(oldmm))
> -		return -EINTR;
> -	flush_cache_dup_mm(oldmm);
> -	uprobe_dup_mmap(oldmm, mm);
> -	/*
> -	 * Not linked in yet - no deadlock potential:
> -	 */
> -	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
> -
> -	/* No ordering required: file already has been exposed. */
> -	dup_mm_exe_file(mm, oldmm);
> -
> -	mm->total_vm = oldmm->total_vm;
> -	mm->data_vm = oldmm->data_vm;
> -	mm->exec_vm = oldmm->exec_vm;
> -	mm->stack_vm = oldmm->stack_vm;
> -
> -	/* Use __mt_dup() to efficiently build an identical maple tree. */
> -	retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
> -	if (unlikely(retval))
> -		goto out;
> -
> -	mt_clear_in_rcu(vmi.mas.tree);
> -	for_each_vma(vmi, mpnt) {
> -		struct file *file;
> -
> -		vma_start_write(mpnt);
> -		if (mpnt->vm_flags & VM_DONTCOPY) {
> -			retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
> -						    mpnt->vm_end, GFP_KERNEL);
> -			if (retval)
> -				goto loop_out;
> -
> -			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
> -			continue;
> -		}
> -		charge = 0;
> -		/*
> -		 * Don't duplicate many vmas if we've been oom-killed (for
> -		 * example)
> -		 */
> -		if (fatal_signal_pending(current)) {
> -			retval = -EINTR;
> -			goto loop_out;
> -		}
> -		if (mpnt->vm_flags & VM_ACCOUNT) {
> -			unsigned long len = vma_pages(mpnt);
> -
> -			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
> -				goto fail_nomem;
> -			charge = len;
> -		}
> -		tmp = vm_area_dup(mpnt);
> -		if (!tmp)
> -			goto fail_nomem;
> -
> -		/* track_pfn_copy() will later take care of copying internal state. */
> -		if (unlikely(tmp->vm_flags & VM_PFNMAP))
> -			untrack_pfn_clear(tmp);
> -
> -		retval = vma_dup_policy(mpnt, tmp);
> -		if (retval)
> -			goto fail_nomem_policy;
> -		tmp->vm_mm = mm;
> -		retval = dup_userfaultfd(tmp, &uf);
> -		if (retval)
> -			goto fail_nomem_anon_vma_fork;
> -		if (tmp->vm_flags & VM_WIPEONFORK) {
> -			/*
> -			 * VM_WIPEONFORK gets a clean slate in the child.
> -			 * Don't prepare anon_vma until fault since we don't
> -			 * copy page for current vma.
> -			 */
> -			tmp->anon_vma = NULL;
> -		} else if (anon_vma_fork(tmp, mpnt))
> -			goto fail_nomem_anon_vma_fork;
> -		vm_flags_clear(tmp, VM_LOCKED_MASK);
> -		/*
> -		 * Copy/update hugetlb private vma information.
> -		 */
> -		if (is_vm_hugetlb_page(tmp))
> -			hugetlb_dup_vma_private(tmp);
> -
> -		/*
> -		 * Link the vma into the MT. After using __mt_dup(), memory
> -		 * allocation is not necessary here, so it cannot fail.
> -		 */
> -		vma_iter_bulk_store(&vmi, tmp);
> -
> -		mm->map_count++;
> -
> -		if (tmp->vm_ops && tmp->vm_ops->open)
> -			tmp->vm_ops->open(tmp);
> -
> -		file = tmp->vm_file;
> -		if (file) {
> -			struct address_space *mapping = file->f_mapping;
> -
> -			get_file(file);
> -			i_mmap_lock_write(mapping);
> -			if (vma_is_shared_maywrite(tmp))
> -				mapping_allow_writable(mapping);
> -			flush_dcache_mmap_lock(mapping);
> -			/* insert tmp into the share list, just after mpnt */
> -			vma_interval_tree_insert_after(tmp, mpnt,
> -					&mapping->i_mmap);
> -			flush_dcache_mmap_unlock(mapping);
> -			i_mmap_unlock_write(mapping);
> -		}
> -
> -		if (!(tmp->vm_flags & VM_WIPEONFORK))
> -			retval = copy_page_range(tmp, mpnt);
> -
> -		if (retval) {
> -			mpnt = vma_next(&vmi);
> -			goto loop_out;
> -		}
> -	}
> -	/* a new mm has just been created */
> -	retval = arch_dup_mmap(oldmm, mm);
> -loop_out:
> -	vma_iter_free(&vmi);
> -	if (!retval) {
> -		mt_set_in_rcu(vmi.mas.tree);
> -		ksm_fork(mm, oldmm);
> -		khugepaged_fork(mm, oldmm);
> -	} else {
> -
> -		/*
> -		 * The entire maple tree has already been duplicated. If the
> -		 * mmap duplication fails, mark the failure point with
> -		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> -		 * stop releasing VMAs that have not been duplicated after this
> -		 * point.
> -		 */
> -		if (mpnt) {
> -			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> -			mas_store(&vmi.mas, XA_ZERO_ENTRY);
> -			/* Avoid OOM iterating a broken tree */
> -			set_bit(MMF_OOM_SKIP, &mm->flags);
> -		}
> -		/*
> -		 * The mm_struct is going to exit, but the locks will be dropped
> -		 * first.  Set the mm_struct as unstable is advisable as it is
> -		 * not fully initialised.
> -		 */
> -		set_bit(MMF_UNSTABLE, &mm->flags);
> -	}
> -out:
> -	mmap_write_unlock(mm);
> -	flush_tlb_mm(oldmm);
> -	mmap_write_unlock(oldmm);
> -	if (!retval)
> -		dup_userfaultfd_complete(&uf);
> -	else
> -		dup_userfaultfd_fail(&uf);
> -	return retval;
> -
> -fail_nomem_anon_vma_fork:
> -	mpol_put(vma_policy(tmp));
> -fail_nomem_policy:
> -	vm_area_free(tmp);
> -fail_nomem:
> -	retval = -ENOMEM;
> -	vm_unacct_memory(charge);
> -	goto loop_out;
> -}
> -
>  static inline int mm_alloc_pgd(struct mm_struct *mm)
>  {
>  	mm->pgd = pgd_alloc(mm);
> @@ -794,13 +620,6 @@ static inline void mm_free_pgd(struct mm_struct *mm)
>  	pgd_free(mm, mm->pgd);
>  }
>  #else
> -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> -{
> -	mmap_write_lock(oldmm);
> -	dup_mm_exe_file(mm, oldmm);
> -	mmap_write_unlock(oldmm);
> -	return 0;
> -}
>  #define mm_alloc_pgd(mm)	(0)
>  #define mm_free_pgd(mm)
>  #endif /* CONFIG_MMU */
> diff --git a/mm/internal.h b/mm/internal.h
> index 40464f755092..b3e011976f74 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1631,5 +1631,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
>  }
>  #endif /* CONFIG_PT_RECLAIM */
>
> +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
> +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
>
>  #endif /* __MM_INTERNAL_H */
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 9e09eac0021c..5259df031e15 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1675,7 +1675,6 @@ static int __meminit init_reserve_notifier(void)
>  }
>  subsys_initcall(init_reserve_notifier);
>
> -#ifdef CONFIG_MMU
>  /*
>   * Obtain a read lock on mm->mmap_lock, if the specified address is below the
>   * start of the VMA, the intent is to perform a write, and it is a
> @@ -1719,10 +1718,180 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
>  	mmap_write_downgrade(mm);
>  	return true;
>  }
> -#else
> -bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> -		unsigned long addr, bool write)
> +
> +__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>  {
> -	return false;
> +	struct vm_area_struct *mpnt, *tmp;
> +	int retval;
> +	unsigned long charge = 0;
> +	LIST_HEAD(uf);
> +	VMA_ITERATOR(vmi, mm, 0);
> +
> +	if (mmap_write_lock_killable(oldmm))
> +		return -EINTR;
> +	flush_cache_dup_mm(oldmm);
> +	uprobe_dup_mmap(oldmm, mm);
> +	/*
> +	 * Not linked in yet - no deadlock potential:
> +	 */
> +	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
> +
> +	/* No ordering required: file already has been exposed. */
> +	dup_mm_exe_file(mm, oldmm);
> +
> +	mm->total_vm = oldmm->total_vm;
> +	mm->data_vm = oldmm->data_vm;
> +	mm->exec_vm = oldmm->exec_vm;
> +	mm->stack_vm = oldmm->stack_vm;
> +
> +	/* Use __mt_dup() to efficiently build an identical maple tree. */
> +	retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
> +	if (unlikely(retval))
> +		goto out;
> +
> +	mt_clear_in_rcu(vmi.mas.tree);
> +	for_each_vma(vmi, mpnt) {
> +		struct file *file;
> +
> +		vma_start_write(mpnt);
> +		if (mpnt->vm_flags & VM_DONTCOPY) {
> +			retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
> +						    mpnt->vm_end, GFP_KERNEL);
> +			if (retval)
> +				goto loop_out;
> +
> +			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
> +			continue;
> +		}
> +		charge = 0;
> +		/*
> +		 * Don't duplicate many vmas if we've been oom-killed (for
> +		 * example)
> +		 */
> +		if (fatal_signal_pending(current)) {
> +			retval = -EINTR;
> +			goto loop_out;
> +		}
> +		if (mpnt->vm_flags & VM_ACCOUNT) {
> +			unsigned long len = vma_pages(mpnt);
> +
> +			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
> +				goto fail_nomem;
> +			charge = len;
> +		}
> +
> +		tmp = vm_area_dup(mpnt);
> +		if (!tmp)
> +			goto fail_nomem;
> +
> +		/* track_pfn_copy() will later take care of copying internal state. */
> +		if (unlikely(tmp->vm_flags & VM_PFNMAP))
> +			untrack_pfn_clear(tmp);
> +
> +		retval = vma_dup_policy(mpnt, tmp);
> +		if (retval)
> +			goto fail_nomem_policy;
> +		tmp->vm_mm = mm;
> +		retval = dup_userfaultfd(tmp, &uf);
> +		if (retval)
> +			goto fail_nomem_anon_vma_fork;
> +		if (tmp->vm_flags & VM_WIPEONFORK) {
> +			/*
> +			 * VM_WIPEONFORK gets a clean slate in the child.
> +			 * Don't prepare anon_vma until fault since we don't
> +			 * copy page for current vma.
> +			 */
> +			tmp->anon_vma = NULL;
> +		} else if (anon_vma_fork(tmp, mpnt))
> +			goto fail_nomem_anon_vma_fork;
> +		vm_flags_clear(tmp, VM_LOCKED_MASK);
> +		/*
> +		 * Copy/update hugetlb private vma information.
> +		 */
> +		if (is_vm_hugetlb_page(tmp))
> +			hugetlb_dup_vma_private(tmp);
> +
> +		/*
> +		 * Link the vma into the MT. After using __mt_dup(), memory
> +		 * allocation is not necessary here, so it cannot fail.
> +		 */
> +		vma_iter_bulk_store(&vmi, tmp);
> +
> +		mm->map_count++;
> +
> +		if (tmp->vm_ops && tmp->vm_ops->open)
> +			tmp->vm_ops->open(tmp);
> +
> +		file = tmp->vm_file;
> +		if (file) {
> +			struct address_space *mapping = file->f_mapping;
> +
> +			get_file(file);
> +			i_mmap_lock_write(mapping);
> +			if (vma_is_shared_maywrite(tmp))
> +				mapping_allow_writable(mapping);
> +			flush_dcache_mmap_lock(mapping);
> +			/* insert tmp into the share list, just after mpnt */
> +			vma_interval_tree_insert_after(tmp, mpnt,
> +					&mapping->i_mmap);
> +			flush_dcache_mmap_unlock(mapping);
> +			i_mmap_unlock_write(mapping);
> +		}
> +
> +		if (!(tmp->vm_flags & VM_WIPEONFORK))
> +			retval = copy_page_range(tmp, mpnt);
> +
> +		if (retval) {
> +			mpnt = vma_next(&vmi);
> +			goto loop_out;
> +		}
> +	}
> +	/* a new mm has just been created */
> +	retval = arch_dup_mmap(oldmm, mm);
> +loop_out:
> +	vma_iter_free(&vmi);
> +	if (!retval) {
> +		mt_set_in_rcu(vmi.mas.tree);
> +		ksm_fork(mm, oldmm);
> +		khugepaged_fork(mm, oldmm);
> +	} else {
> +
> +		/*
> +		 * The entire maple tree has already been duplicated. If the
> +		 * mmap duplication fails, mark the failure point with
> +		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> +		 * stop releasing VMAs that have not been duplicated after this
> +		 * point.
> +		 */
> +		if (mpnt) {
> +			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> +			mas_store(&vmi.mas, XA_ZERO_ENTRY);
> +			/* Avoid OOM iterating a broken tree */
> +			set_bit(MMF_OOM_SKIP, &mm->flags);
> +		}
> +		/*
> +		 * The mm_struct is going to exit, but the locks will be dropped
> +		 * first.  Set the mm_struct as unstable is advisable as it is
> +		 * not fully initialised.
> +		 */
> +		set_bit(MMF_UNSTABLE, &mm->flags);
> +	}
> +out:
> +	mmap_write_unlock(mm);
> +	flush_tlb_mm(oldmm);
> +	mmap_write_unlock(oldmm);
> +	if (!retval)
> +		dup_userfaultfd_complete(&uf);
> +	else
> +		dup_userfaultfd_fail(&uf);
> +	return retval;
> +
> +fail_nomem_anon_vma_fork:
> +	mpol_put(vma_policy(tmp));
> +fail_nomem_policy:
> +	vm_area_free(tmp);
> +fail_nomem:
> +	retval = -ENOMEM;
> +	vm_unacct_memory(charge);
> +	goto loop_out;
>  }
> -#endif
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 2b4d304c6445..a142fc258d39 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -1874,3 +1874,11 @@ static int __meminit init_admin_reserve(void)
>  	return 0;
>  }
>  subsys_initcall(init_admin_reserve);
> +
> +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> +{
> +	mmap_write_lock(oldmm);
> +	dup_mm_exe_file(mm, oldmm);
> +	mmap_write_unlock(oldmm);
> +	return 0;
> +}
> --
> 2.49.0
>