Andrew - I managed to typo vma_init.c to vma_init.h here in mm/vma.h (at least
I was consistent in these silly typos...). Would you prefer a fixpatch or is it
ok for you to quickly fix that up?

Thanks!

On Mon, Apr 28, 2025 at 04:28:17PM +0100, Lorenzo Stoakes wrote:
> Right now these are performed in kernel/fork.c which is odd and a violation
> of separation of concerns, as well as preventing us from integrating this
> and related logic into userland VMA testing going forward, and perhaps more
> importantly - enabling us to, in a subsequent commit, make VMA
> allocation/freeing a purely internal mm operation.
>
> There is a fly in the ointment - nommu - mmap.c is not compiled if
> CONFIG_MMU is not set, and neither is vma.c.
>
> To square the circle, let's add a new file - vma_init.c. This will be
> compiled for both CONFIG_MMU and nommu builds, and will also form part of
> the VMA userland testing.
>
> This allows us to de-duplicate code, while maintaining separation of
> concerns and the ability for us to userland test this logic.
>
> Update the VMA userland tests accordingly, additionally adding a
> detach_free_vma() helper function to correctly detach VMAs before freeing
> them in test code, as this change was triggering the assert for this.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
> ---
>  MAINTAINERS                      |   1 +
>  kernel/fork.c                    |  88 ------------------
>  mm/Makefile                      |   2 +-
>  mm/mmap.c                        |   3 +-
>  mm/nommu.c                       |   4 +-
>  mm/vma.h                         |   7 ++
>  mm/vma_init.c                    | 101 ++++++++++++++++++++++
>  tools/testing/vma/Makefile       |   2 +-
>  tools/testing/vma/vma.c          |  26 ++--
>  tools/testing/vma/vma_internal.h | 143 +++++++++++++++++++++++++------
>  10 files changed, 251 insertions(+), 126 deletions(-)
>  create mode 100644 mm/vma_init.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1ee1c22e6e36..d274e6802ba5 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -15656,6 +15656,7 @@ F:	mm/mseal.c
>  F:	mm/vma.c
>  F:	mm/vma.h
>  F:	mm/vma_exec.c
> +F:	mm/vma_init.c
>  F:	mm/vma_internal.h
>  F:	tools/testing/selftests/mm/merge.c
>  F:	tools/testing/vma/
> diff --git a/kernel/fork.c b/kernel/fork.c
> index ac9f9267a473..9e4616dacd82 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -431,88 +431,9 @@ struct kmem_cache *files_cachep;
>  /* SLAB cache for fs_struct structures (tsk->fs) */
>  struct kmem_cache *fs_cachep;
>
> -/* SLAB cache for vm_area_struct structures */
> -static struct kmem_cache *vm_area_cachep;
> -
>  /* SLAB cache for mm_struct structures (tsk->mm) */
>  static struct kmem_cache *mm_cachep;
>
> -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> -{
> -	struct vm_area_struct *vma;
> -
> -	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> -	if (!vma)
> -		return NULL;
> -
> -	vma_init(vma, mm);
> -
> -	return vma;
> -}
> -
> -static void vm_area_init_from(const struct vm_area_struct *src,
> -			      struct vm_area_struct *dest)
> -{
> -	dest->vm_mm = src->vm_mm;
> -	dest->vm_ops = src->vm_ops;
> -	dest->vm_start = src->vm_start;
> -	dest->vm_end = src->vm_end;
> -	dest->anon_vma = src->anon_vma;
> -	dest->vm_pgoff = src->vm_pgoff;
> -	dest->vm_file = src->vm_file;
> -	dest->vm_private_data = src->vm_private_data;
> -	vm_flags_init(dest, src->vm_flags);
> -	memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> -	       sizeof(dest->vm_page_prot));
> -	/*
> -	 * src->shared.rb may be modified concurrently when called from
> -	 * dup_mmap(), but the clone will reinitialize it.
> -	 */
> -	data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> -	memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> -	       sizeof(dest->vm_userfaultfd_ctx));
> -#ifdef CONFIG_ANON_VMA_NAME
> -	dest->anon_name = src->anon_name;
> -#endif
> -#ifdef CONFIG_SWAP
> -	memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> -	       sizeof(dest->swap_readahead_info));
> -#endif
> -#ifndef CONFIG_MMU
> -	dest->vm_region = src->vm_region;
> -#endif
> -#ifdef CONFIG_NUMA
> -	dest->vm_policy = src->vm_policy;
> -#endif
> -}
> -
> -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> -{
> -	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> -
> -	if (!new)
> -		return NULL;
> -
> -	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> -	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> -	vm_area_init_from(orig, new);
> -	vma_lock_init(new, true);
> -	INIT_LIST_HEAD(&new->anon_vma_chain);
> -	vma_numab_state_init(new);
> -	dup_anon_vma_name(orig, new);
> -
> -	return new;
> -}
> -
> -void vm_area_free(struct vm_area_struct *vma)
> -{
> -	/* The vma should be detached while being destroyed. */
> -	vma_assert_detached(vma);
> -	vma_numab_state_free(vma);
> -	free_anon_vma_name(vma);
> -	kmem_cache_free(vm_area_cachep, vma);
> -}
> -
>  static void account_kernel_stack(struct task_struct *tsk, int account)
>  {
>  	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
> @@ -3033,11 +2954,6 @@ void __init mm_cache_init(void)
>
>  void __init proc_caches_init(void)
>  {
> -	struct kmem_cache_args args = {
> -		.use_freeptr_offset = true,
> -		.freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> -	};
> -
>  	sighand_cachep = kmem_cache_create("sighand_cache",
>  			sizeof(struct sighand_struct), 0,
>  			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> @@ -3054,10 +2970,6 @@ void __init proc_caches_init(void)
>  			sizeof(struct fs_struct), 0,
>  			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
>  			NULL);
> -	vm_area_cachep = kmem_cache_create("vm_area_struct",
> -			sizeof(struct vm_area_struct), &args,
> -			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> -			SLAB_ACCOUNT);
>  	mmap_init();
>  	nsproxy_cache_init();
>  }
> diff --git a/mm/Makefile b/mm/Makefile
> index 15a901bb431a..690ddcf7d9a1 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
>  			   mm_init.o percpu.o slab_common.o \
>  			   compaction.o show_mem.o \
>  			   interval_tree.o list_lru.o workingset.o \
> -			   debug.o gup.o mmap_lock.o $(mmu-y)
> +			   debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
>
>  # Give 'page_alloc' its own module-parameter namespace
>  page-alloc-y := page_alloc.o
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 5259df031e15..81dd962a1cfc 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1554,7 +1554,7 @@ static const struct ctl_table mmap_table[] = {
>  #endif /* CONFIG_SYSCTL */
>
>  /*
> - * initialise the percpu counter for VM
> + * initialise the percpu counter for VM, initialise VMA state.
>   */
>  void __init mmap_init(void)
>  {
> @@ -1565,6 +1565,7 @@ void __init mmap_init(void)
>  #ifdef CONFIG_SYSCTL
>  	register_sysctl_init("vm", mmap_table);
>  #endif
> +	vma_state_init();
>  }
>
>  /*
> diff --git a/mm/nommu.c b/mm/nommu.c
> index a142fc258d39..0bf4849b8204 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -399,7 +399,8 @@ static const struct ctl_table nommu_table[] = {
>  };
>
>  /*
> - * initialise the percpu counter for VM and region record slabs
> + * initialise the percpu counter for VM and region record slabs, initialise VMA
> + * state.
>   */
>  void __init mmap_init(void)
>  {
> @@ -409,6 +410,7 @@ void __init mmap_init(void)
>  	VM_BUG_ON(ret);
>  	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
>  	register_sysctl_init("vm", nommu_table);
> +	vma_state_init();
>  }
>
>  /*
> diff --git a/mm/vma.h b/mm/vma.h
> index 94307a2e4ab6..4a1e1768ca46 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -548,8 +548,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
>
>  int __vm_munmap(unsigned long start, size_t len, bool unlock);
>
> +
>  int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
>
> +/* vma_init.h, shared between CONFIG_MMU and nommu. */
> +void __init vma_state_init(void);
> +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
> +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
> +void vm_area_free(struct vm_area_struct *vma);
> +
>  /* vma_exec.h */
>  #ifdef CONFIG_MMU
>  int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
> diff --git a/mm/vma_init.c b/mm/vma_init.c
> new file mode 100644
> index 000000000000..967ca8517986
> --- /dev/null
> +++ b/mm/vma_init.c
> @@ -0,0 +1,101 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +/*
> + * Functions for initialising, allocating, freeing and duplicating VMAs. Shared
> + * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
> + */
> +
> +#include "vma_internal.h"
> +#include "vma.h"
> +
> +/* SLAB cache for vm_area_struct structures */
> +static struct kmem_cache *vm_area_cachep;
> +
> +void __init vma_state_init(void)
> +{
> +	struct kmem_cache_args args = {
> +		.use_freeptr_offset = true,
> +		.freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> +	};
> +
> +	vm_area_cachep = kmem_cache_create("vm_area_struct",
> +			sizeof(struct vm_area_struct), &args,
> +			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> +			SLAB_ACCOUNT);
> +}
> +
> +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> +{
> +	struct vm_area_struct *vma;
> +
> +	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> +	if (!vma)
> +		return NULL;
> +
> +	vma_init(vma, mm);
> +
> +	return vma;
> +}
> +
> +static void vm_area_init_from(const struct vm_area_struct *src,
> +			      struct vm_area_struct *dest)
> +{
> +	dest->vm_mm = src->vm_mm;
> +	dest->vm_ops = src->vm_ops;
> +	dest->vm_start = src->vm_start;
> +	dest->vm_end = src->vm_end;
> +	dest->anon_vma = src->anon_vma;
> +	dest->vm_pgoff = src->vm_pgoff;
> +	dest->vm_file = src->vm_file;
> +	dest->vm_private_data = src->vm_private_data;
> +	vm_flags_init(dest, src->vm_flags);
> +	memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> +	       sizeof(dest->vm_page_prot));
> +	/*
> +	 * src->shared.rb may be modified concurrently when called from
> +	 * dup_mmap(), but the clone will reinitialize it.
> +	 */
> +	data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> +	memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> +	       sizeof(dest->vm_userfaultfd_ctx));
> +#ifdef CONFIG_ANON_VMA_NAME
> +	dest->anon_name = src->anon_name;
> +#endif
> +#ifdef CONFIG_SWAP
> +	memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> +	       sizeof(dest->swap_readahead_info));
> +#endif
> +#ifndef CONFIG_MMU
> +	dest->vm_region = src->vm_region;
> +#endif
> +#ifdef CONFIG_NUMA
> +	dest->vm_policy = src->vm_policy;
> +#endif
> +}
> +
> +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> +{
> +	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> +
> +	if (!new)
> +		return NULL;
> +
> +	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> +	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> +	vm_area_init_from(orig, new);
> +	vma_lock_init(new, true);
> +	INIT_LIST_HEAD(&new->anon_vma_chain);
> +	vma_numab_state_init(new);
> +	dup_anon_vma_name(orig, new);
> +
> +	return new;
> +}
> +
> +void vm_area_free(struct vm_area_struct *vma)
> +{
> +	/* The vma should be detached while being destroyed. */
> +	vma_assert_detached(vma);
> +	vma_numab_state_free(vma);
> +	free_anon_vma_name(vma);
> +	kmem_cache_free(vm_area_cachep, vma);
> +}
> diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
> index 624040fcf193..66f3831a668f 100644
> --- a/tools/testing/vma/Makefile
> +++ b/tools/testing/vma/Makefile
> @@ -9,7 +9,7 @@ include ../shared/shared.mk
>  OFILES = $(SHARED_OFILES) vma.o maple-shim.o
>  TARGETS = vma
>
> -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
> +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
>
>  vma: $(OFILES)
>  	$(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
> diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> index 5832ae5d797d..2be7597a2ac2 100644
> --- a/tools/testing/vma/vma.c
> +++ b/tools/testing/vma/vma.c
> @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
>   * Directly import the VMA implementation here. Our vma_internal.h wrapper
>   * provides userland-equivalent functionality for everything vma.c uses.
>   */
> +#include "../../../mm/vma_init.c"
>  #include "../../../mm/vma_exec.c"
>  #include "../../../mm/vma.c"
>
> @@ -91,6 +92,12 @@ static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
>  	return res;
>  }
>
> +static void detach_free_vma(struct vm_area_struct *vma)
> +{
> +	vma_mark_detached(vma);
> +	vm_area_free(vma);
> +}
> +
>  /* Helper function to allocate a VMA and link it to the tree. */
>  static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
>  						 unsigned long start,
> @@ -104,7 +111,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
>  		return NULL;
>
>  	if (attach_vma(mm, vma)) {
> -		vm_area_free(vma);
> +		detach_free_vma(vma);
>  		return NULL;
>  	}
>
> @@ -249,7 +256,7 @@ static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
>
>  	vma_iter_set(vmi, 0);
>  	for_each_vma(*vmi, vma) {
> -		vm_area_free(vma);
> +		detach_free_vma(vma);
>  		count++;
>  	}
>
> @@ -319,7 +326,7 @@ static bool test_simple_merge(void)
>  	ASSERT_EQ(vma->vm_pgoff, 0);
>  	ASSERT_EQ(vma->vm_flags, flags);
>
> -	vm_area_free(vma);
> +	detach_free_vma(vma);
>  	mtree_destroy(&mm.mm_mt);
>
>  	return true;
> @@ -361,7 +368,7 @@ static bool test_simple_modify(void)
>  	ASSERT_EQ(vma->vm_end, 0x1000);
>  	ASSERT_EQ(vma->vm_pgoff, 0);
>
> -	vm_area_free(vma);
> +	detach_free_vma(vma);
>  	vma_iter_clear(&vmi);
>
>  	vma = vma_next(&vmi);
> @@ -370,7 +377,7 @@
>  	ASSERT_EQ(vma->vm_start, 0x1000);
>  	ASSERT_EQ(vma->vm_end, 0x2000);
>  	ASSERT_EQ(vma->vm_pgoff, 1);
>
> -	vm_area_free(vma);
> +	detach_free_vma(vma);
>  	vma_iter_clear(&vmi);
>
>  	vma = vma_next(&vmi);
> @@ -379,7 +386,7 @@
>  	ASSERT_EQ(vma->vm_start, 0x2000);
>  	ASSERT_EQ(vma->vm_end, 0x3000);
>  	ASSERT_EQ(vma->vm_pgoff, 2);
>
> -	vm_area_free(vma);
> +	detach_free_vma(vma);
>  	mtree_destroy(&mm.mm_mt);
>
>  	return true;
> @@ -407,7 +414,7 @@ static bool test_simple_expand(void)
>  	ASSERT_EQ(vma->vm_end, 0x3000);
>  	ASSERT_EQ(vma->vm_pgoff, 0);
>
> -	vm_area_free(vma);
> +	detach_free_vma(vma);
>  	mtree_destroy(&mm.mm_mt);
>
>  	return true;
> @@ -428,7 +435,7 @@ static bool test_simple_shrink(void)
>  	ASSERT_EQ(vma->vm_end, 0x1000);
>  	ASSERT_EQ(vma->vm_pgoff, 0);
>
> -	vm_area_free(vma);
> +	detach_free_vma(vma);
>  	mtree_destroy(&mm.mm_mt);
>
>  	return true;
> @@ -619,7 +626,7 @@ static bool test_merge_new(void)
>  	ASSERT_EQ(vma->vm_pgoff, 0);
>  	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
>
> -	vm_area_free(vma);
> +	detach_free_vma(vma);
>  	count++;
>  }
>
> @@ -1668,6 +1675,7 @@ int main(void)
>  	int num_tests = 0, num_fail = 0;
>
>  	maple_tree_init();
> +	vma_state_init();
>
>  #define TEST(name)	\
>  	do { \
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 32e990313158..198abe66de5a 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -155,6 +155,10 @@ typedef __bitwise unsigned int vm_fault_t;
>   */
>  #define pr_warn_once pr_err
>
> +#define data_race(expr) expr
> +
> +#define ASSERT_EXCLUSIVE_WRITER(x)
> +
>  struct kref {
>  	refcount_t refcount;
>  };
> @@ -255,6 +259,8 @@ struct file {
>
>  #define VMA_LOCK_OFFSET	0x40000000
>
> +typedef struct { unsigned long v; } freeptr_t;
> +
>  struct vm_area_struct {
>  	/* The first cache line has the info for VMA tree walking. */
>
> @@ -264,9 +270,7 @@
>  			unsigned long vm_start;
>  			unsigned long vm_end;
>  		};
> -#ifdef CONFIG_PER_VMA_LOCK
> -	struct rcu_head vm_rcu;	/* Used for deferred freeing. */
> -#endif
> +	freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
>  	};
>
>  	struct mm_struct *vm_mm; /* The address space we belong to. */
>
> @@ -463,6 +467,65 @@ struct pagetable_move_control {
>  		.len_in = len_,	\
>  	}
>
> +struct kmem_cache_args {
> +	/**
> +	 * @align: The required alignment for the objects.
> +	 *
> +	 * %0 means no specific alignment is requested.
> +	 */
> +	unsigned int align;
> +	/**
> +	 * @useroffset: Usercopy region offset.
> +	 *
> +	 * %0 is a valid offset, when @usersize is non-%0
> +	 */
> +	unsigned int useroffset;
> +	/**
> +	 * @usersize: Usercopy region size.
> +	 *
> +	 * %0 means no usercopy region is specified.
> +	 */
> +	unsigned int usersize;
> +	/**
> +	 * @freeptr_offset: Custom offset for the free pointer
> +	 *	in &SLAB_TYPESAFE_BY_RCU caches
> +	 *
> +	 * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
> +	 * outside of the object. This might cause the object to grow in size.
> +	 * Cache creators that have a reason to avoid this can specify a custom
> +	 * free pointer offset in their struct where the free pointer will be
> +	 * placed.
> +	 *
> +	 * Note that placing the free pointer inside the object requires the
> +	 * caller to ensure that no fields are invalidated that are required to
> +	 * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
> +	 * details).
> +	 *
> +	 * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
> +	 * is specified, %use_freeptr_offset must be set %true.
> +	 *
> +	 * Note that @ctor currently isn't supported with custom free pointers
> +	 * as a @ctor requires an external free pointer.
> +	 */
> +	unsigned int freeptr_offset;
> +	/**
> +	 * @use_freeptr_offset: Whether a @freeptr_offset is used.
> +	 */
> +	bool use_freeptr_offset;
> +	/**
> +	 * @ctor: A constructor for the objects.
> +	 *
> +	 * The constructor is invoked for each object in a newly allocated slab
> +	 * page. It is the cache user's responsibility to free object in the
> +	 * same state as after calling the constructor, or deal appropriately
> +	 * with any differences between a freshly constructed and a reallocated
> +	 * object.
> +	 *
> +	 * %NULL means no constructor.
> +	 */
> +	void (*ctor)(void *);
> +};
> +
>  static inline void vma_iter_invalidate(struct vma_iterator *vmi)
>  {
>  	mas_pause(&vmi->mas);
> @@ -547,31 +610,38 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  	vma->vm_lock_seq = UINT_MAX;
>  }
>
> -static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> -{
> -	struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct));
> +struct kmem_cache {
> +	const char *name;
> +	size_t object_size;
> +	struct kmem_cache_args *args;
> +};
>
> -	if (!vma)
> -		return NULL;
> +static inline struct kmem_cache *__kmem_cache_create(const char *name,
> +						     size_t object_size,
> +						     struct kmem_cache_args *args)
> +{
> +	struct kmem_cache *ret = malloc(sizeof(struct kmem_cache));
>
> -	vma_init(vma, mm);
> +	ret->name = name;
> +	ret->object_size = object_size;
> +	ret->args = args;
>
> -	return vma;
> +	return ret;
>  }
>
> -static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> -{
> -	struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct));
> +#define kmem_cache_create(__name, __object_size, __args, ...) \
> +	__kmem_cache_create((__name), (__object_size), (__args))
>
> -	if (!new)
> -		return NULL;
> +static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
> +{
> +	(void)gfpflags;
>
> -	memcpy(new, orig, sizeof(*new));
> -	refcount_set(&new->vm_refcnt, 0);
> -	new->vm_lock_seq = UINT_MAX;
> -	INIT_LIST_HEAD(&new->anon_vma_chain);
> +	return calloc(s->object_size, 1);
> +}
>
> -	return new;
> +static inline void kmem_cache_free(struct kmem_cache *s, void *x)
> +{
> +	free(x);
>  }
>
>  /*
> @@ -738,11 +808,6 @@ static inline void mpol_put(struct mempolicy *)
>  {
>  }
>
> -static inline void vm_area_free(struct vm_area_struct *vma)
> -{
> -	free(vma);
> -}
> -
>  static inline void lru_add_drain(void)
>  {
>  }
> @@ -1312,4 +1377,32 @@ static inline void ksm_exit(struct mm_struct *mm)
>  	(void)mm;
>  }
>
> +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
> +{
> +	(void)vma;
> +	(void)reset_refcnt;
> +}
> +
> +static inline void vma_numab_state_init(struct vm_area_struct *vma)
> +{
> +	(void)vma;
> +}
> +
> +static inline void vma_numab_state_free(struct vm_area_struct *vma)
> +{
> +	(void)vma;
> +}
> +
> +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
> +				     struct vm_area_struct *new_vma)
> +{
> +	(void)orig_vma;
> +	(void)new_vma;
> +}
> +
> +static inline void free_anon_vma_name(struct vm_area_struct *vma)
> +{
> +	(void)vma;
> +}
> +
>  #endif /* __MM_VMA_INTERNAL_H */
> --
> 2.49.0
>
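
As an aside, for anyone who wants to poke at this via the userland harness:
with this patch the tests exercise the real vm_area_alloc()/vm_area_free()
from mm/vma_init.c on top of the calloc()-backed kmem_cache shim, and
vm_area_free() asserts that the VMA has been detached first - which is what
the new detach_free_vma() helper wraps. A minimal sketch of that lifecycle
follows. It is illustrative only, not part of the patch: example_lifecycle()
is hypothetical, and it assumes the existing
alloc_and_link_vma(mm, start, end, pgoff, flags) helper signature from
tools/testing/vma/vma.c, whose trailing parameters are truncated in the hunk
above.

	/* Hypothetical test, not part of this series. */
	static bool example_lifecycle(void)
	{
		struct mm_struct mm = {};
		struct vm_area_struct *vma;

		/*
		 * Allocates via vm_area_alloc() from the shared
		 * mm/vma_init.c and attaches the VMA to the userland
		 * maple tree.
		 */
		vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, VM_READ | VM_WRITE);
		if (!vma)
			return false;

		/*
		 * vm_area_free() runs vma_assert_detached(), so the VMA
		 * must be marked detached before it is freed -
		 * detach_free_vma() does both steps.
		 */
		detach_free_vma(vma);
		mtree_destroy(&mm.mm_mt);

		return true;
	}

Note that vma_state_init() must run first (main() now does this per the hunk
above), since vm_area_alloc() allocates from the vm_area_cachep it sets up.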