* Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> [250428 11:28]:
> There are peculiarities within the kernel where what is very clearly mm
> code is performed elsewhere arbitrarily.
>
> This violates separation of concerns and makes it harder to refactor code
> to make changes to how fundamental initialisation and operation of mm logic
> is performed.
>
> One such case is the creation of the VMA containing the initial stack upon
> execve()'ing a new process. This is currently performed in __bprm_mm_init()
> in fs/exec.c.
>
> Abstract this operation to create_init_stack_vma(). This allows us to limit
> use of vma allocation and free code to fork and mm only.
>
> We previously did the same for the step at which we relocate the initial
> stack VMA downwards via relocate_vma_down(); now we move the initial VMA
> establishment too.
>
> Take the opportunity to also move insert_vm_struct() to mm/vma.c as it's no
> longer needed anywhere outside of mm.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
> Acked-by: David Hildenbrand <david@xxxxxxxxxx>
> Reviewed-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>

Reviewed-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>

> ---
>  fs/exec.c                        | 66 +++---------------------------
>  mm/mmap.c                        | 42 -------------------
>  mm/vma.c                         | 43 ++++++++++++++++++++
>  mm/vma.h                         |  4 ++
>  mm/vma_exec.c                    | 69 ++++++++++++++++++++++++++++++++
>  tools/testing/vma/vma_internal.h | 32 +++++++++++++++
>  6 files changed, 153 insertions(+), 103 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 477bc3f2e966..f9bbcf0016a4 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -245,60 +245,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
>  	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
>  }
>
> -static int __bprm_mm_init(struct linux_binprm *bprm)
> -{
> -	int err;
> -	struct vm_area_struct *vma = NULL;
> -	struct mm_struct *mm = bprm->mm;
> -
> -	bprm->vma = vma = vm_area_alloc(mm);
> -	if (!vma)
> -		return -ENOMEM;
> -	vma_set_anonymous(vma);
> -
> -	if (mmap_write_lock_killable(mm)) {
> -		err = -EINTR;
> -		goto err_free;
> -	}
> -
> -	/*
> -	 * Need to be called with mmap write lock
> -	 * held, to avoid race with ksmd.
> -	 */
> -	err = ksm_execve(mm);
> -	if (err)
> -		goto err_ksm;
> -
> -	/*
> -	 * Place the stack at the largest stack address the architecture
> -	 * supports. Later, we'll move this to an appropriate place. We don't
> -	 * use STACK_TOP because that can depend on attributes which aren't
> -	 * configured yet.
> -	 */
> -	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
> -	vma->vm_end = STACK_TOP_MAX;
> -	vma->vm_start = vma->vm_end - PAGE_SIZE;
> -	vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
> -	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> -
> -	err = insert_vm_struct(mm, vma);
> -	if (err)
> -		goto err;
> -
> -	mm->stack_vm = mm->total_vm = 1;
> -	mmap_write_unlock(mm);
> -	bprm->p = vma->vm_end - sizeof(void *);
> -	return 0;
> -err:
> -	ksm_exit(mm);
> -err_ksm:
> -	mmap_write_unlock(mm);
> -err_free:
> -	bprm->vma = NULL;
> -	vm_area_free(vma);
> -	return err;
> -}
> -
>  static bool valid_arg_len(struct linux_binprm *bprm, long len)
>  {
>  	return len <= MAX_ARG_STRLEN;
> @@ -351,12 +297,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
>  {
>  }
>
> -static int __bprm_mm_init(struct linux_binprm *bprm)
> -{
> -	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
> -	return 0;
> -}
> -
>  static bool valid_arg_len(struct linux_binprm *bprm, long len)
>  {
>  	return len <= bprm->p;
> @@ -385,9 +325,13 @@ static int bprm_mm_init(struct linux_binprm *bprm)
>  	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
>  	task_unlock(current->group_leader);
>
> -	err = __bprm_mm_init(bprm);
> +#ifndef CONFIG_MMU
> +	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
> +#else
> +	err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
>  	if (err)
>  		goto err;
> +#endif
>
>  	return 0;
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 1794bf6f4dc0..9e09eac0021c 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1321,48 +1321,6 @@ void exit_mmap(struct mm_struct *mm)
>  		vm_unacct_memory(nr_accounted);
>  }
>
> -/* Insert vm structure into process list sorted by address
> - * and into the inode's i_mmap tree.  If vm_file is non-NULL
> - * then i_mmap_rwsem is taken here.
> - */
> -int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
> -{
> -	unsigned long charged = vma_pages(vma);
> -
> -
> -	if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
> -		return -ENOMEM;
> -
> -	if ((vma->vm_flags & VM_ACCOUNT) &&
> -	     security_vm_enough_memory_mm(mm, charged))
> -		return -ENOMEM;
> -
> -	/*
> -	 * The vm_pgoff of a purely anonymous vma should be irrelevant
> -	 * until its first write fault, when page's anon_vma and index
> -	 * are set.  But now set the vm_pgoff it will almost certainly
> -	 * end up with (unless mremap moves it elsewhere before that
> -	 * first wfault), so /proc/pid/maps tells a consistent story.
> -	 *
> -	 * By setting it to reflect the virtual start address of the
> -	 * vma, merges and splits can happen in a seamless way, just
> -	 * using the existing file pgoff checks and manipulations.
> -	 * Similarly in do_mmap and in do_brk_flags.
> -	 */
> -	if (vma_is_anonymous(vma)) {
> -		BUG_ON(vma->anon_vma);
> -		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
> -	}
> -
> -	if (vma_link(mm, vma)) {
> -		if (vma->vm_flags & VM_ACCOUNT)
> -			vm_unacct_memory(charged);
> -		return -ENOMEM;
> -	}
> -
> -	return 0;
> -}
> -
>  /*
>   * Return true if the calling process may expand its vm space by the passed
>   * number of pages
> diff --git a/mm/vma.c b/mm/vma.c
> index 8a6c5e835759..1f2634b29568 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -3052,3 +3052,46 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock)
>  	userfaultfd_unmap_complete(mm, &uf);
>  	return ret;
>  }
> +
> +
> +/* Insert vm structure into process list sorted by address
> + * and into the inode's i_mmap tree.  If vm_file is non-NULL
> + * then i_mmap_rwsem is taken here.
> + */
> +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
> +{
> +	unsigned long charged = vma_pages(vma);
> +
> +
> +	if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
> +		return -ENOMEM;
> +
> +	if ((vma->vm_flags & VM_ACCOUNT) &&
> +	     security_vm_enough_memory_mm(mm, charged))
> +		return -ENOMEM;
> +
> +	/*
> +	 * The vm_pgoff of a purely anonymous vma should be irrelevant
> +	 * until its first write fault, when page's anon_vma and index
> +	 * are set.  But now set the vm_pgoff it will almost certainly
> +	 * end up with (unless mremap moves it elsewhere before that
> +	 * first wfault), so /proc/pid/maps tells a consistent story.
> +	 *
> +	 * By setting it to reflect the virtual start address of the
> +	 * vma, merges and splits can happen in a seamless way, just
> +	 * using the existing file pgoff checks and manipulations.
> +	 * Similarly in do_mmap and in do_brk_flags.
> +	 */
> +	if (vma_is_anonymous(vma)) {
> +		BUG_ON(vma->anon_vma);
> +		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
> +	}
> +
> +	if (vma_link(mm, vma)) {
> +		if (vma->vm_flags & VM_ACCOUNT)
> +			vm_unacct_memory(charged);
> +		return -ENOMEM;
> +	}
> +
> +	return 0;
> +}
> diff --git a/mm/vma.h b/mm/vma.h
> index 1ce3e18f01b7..94307a2e4ab6 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -548,8 +548,12 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
>
>  int __vm_munmap(unsigned long start, size_t len, bool unlock);
>
> +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
> +
>  /* vma_exec.h */
>  #ifdef CONFIG_MMU
> +int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
> +			  unsigned long *top_mem_p);
>  int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
>  #endif
>
> diff --git a/mm/vma_exec.c b/mm/vma_exec.c
> index 6736ae37f748..2dffb02ed6a2 100644
> --- a/mm/vma_exec.c
> +++ b/mm/vma_exec.c
> @@ -90,3 +90,72 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
>  	/* Shrink the vma to just the new range */
>  	return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
>  }
> +
> +/*
> + * Establish the stack VMA in an execve'd process, located temporarily at the
> + * maximum stack address provided by the architecture.
> + *
> + * We later relocate this downwards in relocate_vma_down().
> + *
> + * This function is almost certainly NOT what you want for anything other than
> + * early executable initialisation.
> + *
> + * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
> + * maximum addressable location in the stack (that is capable of storing a
> + * system word of data).
> + */
> +int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
> +			  unsigned long *top_mem_p)
> +{
> +	int err;
> +	struct vm_area_struct *vma = vm_area_alloc(mm);
> +
> +	if (!vma)
> +		return -ENOMEM;
> +
> +	vma_set_anonymous(vma);
> +
> +	if (mmap_write_lock_killable(mm)) {
> +		err = -EINTR;
> +		goto err_free;
> +	}
> +
> +	/*
> +	 * Need to be called with mmap write lock
> +	 * held, to avoid race with ksmd.
> +	 */
> +	err = ksm_execve(mm);
> +	if (err)
> +		goto err_ksm;
> +
> +	/*
> +	 * Place the stack at the largest stack address the architecture
> +	 * supports. Later, we'll move this to an appropriate place. We don't
> +	 * use STACK_TOP because that can depend on attributes which aren't
> +	 * configured yet.
> +	 */
> +	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
> +	vma->vm_end = STACK_TOP_MAX;
> +	vma->vm_start = vma->vm_end - PAGE_SIZE;
> +	vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
> +	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> +
> +	err = insert_vm_struct(mm, vma);
> +	if (err)
> +		goto err;
> +
> +	mm->stack_vm = mm->total_vm = 1;
> +	mmap_write_unlock(mm);
> +	*vmap = vma;
> +	*top_mem_p = vma->vm_end - sizeof(void *);
> +	return 0;
> +
> +err:
> +	ksm_exit(mm);
> +err_ksm:
> +	mmap_write_unlock(mm);
> +err_free:
> +	*vmap = NULL;
> +	vm_area_free(vma);
> +	return err;
> +}
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 0df19ca0000a..32e990313158 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -56,6 +56,8 @@ extern unsigned long dac_mmap_min_addr;
>  #define VM_PFNMAP	0x00000400
>  #define VM_LOCKED	0x00002000
>  #define VM_IO		0x00004000
> +#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
> +#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */
>  #define VM_DONTEXPAND	0x00040000
>  #define VM_LOCKONFAULT	0x00080000
>  #define VM_ACCOUNT	0x00100000
> @@ -70,6 +72,20 @@ extern unsigned long dac_mmap_min_addr;
>  #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
>  #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
>
> +#ifdef CONFIG_STACK_GROWSUP
> +#define VM_STACK	VM_GROWSUP
> +#define VM_STACK_EARLY	VM_GROWSDOWN
> +#else
> +#define VM_STACK	VM_GROWSDOWN
> +#define VM_STACK_EARLY	0
> +#endif
> +
> +#define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
> +#define TASK_SIZE_LOW		DEFAULT_MAP_WINDOW
> +#define TASK_SIZE_MAX		DEFAULT_MAP_WINDOW
> +#define STACK_TOP		TASK_SIZE_LOW
> +#define STACK_TOP_MAX		TASK_SIZE_MAX
> +
>  /* This mask represents all the VMA flag bits used by mlock */
>  #define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)
>
> @@ -82,6 +98,10 @@ extern unsigned long dac_mmap_min_addr;
>
>  #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
>
> +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
> +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
> +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
> +
>  #define RLIMIT_STACK		3	/* max stack size */
>  #define RLIMIT_MEMLOCK		8	/* max locked-in-memory address space */
>
> @@ -1280,4 +1300,16 @@ static inline void free_pgd_range(struct mmu_gather *tlb,
>  	(void)ceiling;
>  }
>
> +static inline int ksm_execve(struct mm_struct *mm)
> +{
> +	(void)mm;
> +
> +	return 0;
> +}
> +
> +static inline void ksm_exit(struct mm_struct *mm)
> +{
> +	(void)mm;
> +}
> +
>  #endif /* __MM_VMA_INTERNAL_H */
> --
> 2.49.0
>
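For anyone skimming the series, the call-site change is easiest to see in
isolation. This is only a recap condensed from the fs/exec.c hunks above
(surrounding code elided), not anything beyond the patch: bprm_mm_init()
now delegates stack setup to mm entirely:

#ifndef CONFIG_MMU
	/* No MMU: arguments occupy a fixed-size window, no stack VMA. */
	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
#else
	/*
	 * MMU: mm establishes the initial stack VMA at STACK_TOP_MAX; on
	 * success bprm->vma is that VMA and bprm->p is the topmost address
	 * within it capable of holding a system word.
	 */
	err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
	if (err)
		goto err;
#endif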