On Fri, May 09, 2025 at 01:13:34PM +0100, Lorenzo Stoakes wrote: > Provide a means by which drivers can specify which fields of those > permitted to be changed should be altered to prior to mmap()'ing a > range (which may either result from a merge or from mapping an entirely new > VMA). > > Doing so is substantially safer than the existing .mmap() calback which > provides unrestricted access to the part-constructed VMA and permits > drivers and file systems to do 'creative' things which makes it hard to > reason about the state of the VMA after the function returns. > > The existing .mmap() callback's freedom has caused a great deal of issues, > especially in error handling, as unwinding the mmap() state has proven to > be non-trivial and caused significant issues in the past, for instance > those addressed in commit 5de195060b2e ("mm: resolve faulty mmap_region() > error path behaviour"). > > It also necessitates a second attempt at merge once the .mmap() callback > has completed, which has caused issues in the past, is awkward, adds > overhead and is difficult to reason about. > > The .mmap_prepare() callback eliminates this requirement, as we can update > fields prior to even attempting the first merge. It is safer, as we heavily > restrict what can actually be modified, and being invoked very early in the > mmap() process, error handling can be performed safely with very little > unwinding of state required. > > The .mmap_prepare() and deprecated .mmap() callbacks are mutually > exclusive, so we permit only one to be invoked at a time. > > Update vma userland test stubs to account for changes. > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> > --- > include/linux/fs.h | 25 ++++++++++++ > include/linux/mm_types.h | 24 +++++++++++ > mm/memory.c | 3 +- > mm/mmap.c | 2 +- > mm/vma.c | 68 +++++++++++++++++++++++++++++++- > tools/testing/vma/vma_internal.h | 66 ++++++++++++++++++++++++++++--- > 6 files changed, 180 insertions(+), 8 deletions(-) > > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 016b0fe1536e..e2721a1ff13d 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -2169,6 +2169,7 @@ struct file_operations { > int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); > int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, > unsigned int poll_flags); > + int (*mmap_prepare)(struct vm_area_desc *); > } __randomize_layout; > > /* Supports async buffered reads */ > @@ -2238,11 +2239,35 @@ struct inode_operations { > struct offset_ctx *(*get_offset_ctx)(struct inode *inode); > } ____cacheline_aligned; > > +/* Did the driver provide valid mmap hook configuration? */ > +static inline bool file_has_valid_mmap_hooks(struct file *file) > +{ > + bool has_mmap = file->f_op->mmap; > + bool has_mmap_prepare = file->f_op->mmap_prepare; > + > + /* Hooks are mutually exclusive. */ > + if (WARN_ON_ONCE(has_mmap && has_mmap_prepare)) > + return false; > + if (WARN_ON_ONCE(!has_mmap && !has_mmap_prepare)) > + return false; > + > + return true; > +} > + > static inline int call_mmap(struct file *file, struct vm_area_struct *vma) > { > + if (WARN_ON_ONCE(file->f_op->mmap_prepare)) > + return -EINVAL; > + > return file->f_op->mmap(file, vma); > } > > +static inline int __call_mmap_prepare(struct file *file, > + struct vm_area_desc *desc) > +{ > + return file->f_op->mmap_prepare(desc); > +} nit: I would prefer if we could rename this to vfs_mmap() and vfs_mmap_prepare() as this is in line with all the other vfs related helpers we expose. > + > extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); > extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); > extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index e76bade9ebb1..15808cad2bc1 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -763,6 +763,30 @@ struct vma_numab_state { > int prev_scan_seq; > }; > > +/* > + * Describes a VMA that is about to be mmap()'ed. Drivers may choose to > + * manipulate mutable fields which will cause those fields to be updated in the > + * resultant VMA. > + * > + * Helper functions are not required for manipulating any field. > + */ > +struct vm_area_desc { > + /* Immutable state. */ > + struct mm_struct *mm; > + unsigned long start; > + unsigned long end; > + > + /* Mutable fields. Populated with initial state. */ > + pgoff_t pgoff; > + struct file *file; > + vm_flags_t vm_flags; > + pgprot_t page_prot; > + > + /* Write-only fields. */ > + const struct vm_operations_struct *vm_ops; > + void *private_data; > +}; > + > /* > * This struct describes a virtual memory area. There is one of these > * per VM-area/task. A VM area is any part of the process virtual memory > diff --git a/mm/memory.c b/mm/memory.c > index 68c1d962d0ad..99af83434e7c 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -527,10 +527,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, > dump_page(page, "bad pte"); > pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", > (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); > - pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", > + pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", > vma->vm_file, > vma->vm_ops ? vma->vm_ops->fault : NULL, > vma->vm_file ? vma->vm_file->f_op->mmap : NULL, > + vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL, > mapping ? mapping->a_ops->read_folio : NULL); > dump_stack(); > add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); > diff --git a/mm/mmap.c b/mm/mmap.c > index 81dd962a1cfc..50f902c08341 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -475,7 +475,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > vm_flags &= ~VM_MAYEXEC; > } > > - if (!file->f_op->mmap) > + if (!file_has_valid_mmap_hooks(file)) > return -ENODEV; > if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) > return -EINVAL; > diff --git a/mm/vma.c b/mm/vma.c > index 1f2634b29568..3f32e04bb6cc 100644 > --- a/mm/vma.c > +++ b/mm/vma.c > @@ -17,6 +17,11 @@ struct mmap_state { > unsigned long pglen; > unsigned long flags; > struct file *file; > + pgprot_t page_prot; > + > + /* User-defined fields, perhaps updated by .mmap_prepare(). */ > + const struct vm_operations_struct *vm_ops; > + void *vm_private_data; > > unsigned long charged; > bool retry_merge; > @@ -40,6 +45,7 @@ struct mmap_state { > .pglen = PHYS_PFN(len_), \ > .flags = flags_, \ > .file = file_, \ > + .page_prot = vm_get_page_prot(flags_), \ > } > > #define VMG_MMAP_STATE(name, map_, vma_) \ > @@ -2385,6 +2391,10 @@ static int __mmap_new_file_vma(struct mmap_state *map, > int error; > > vma->vm_file = get_file(map->file); > + > + if (!map->file->f_op->mmap) > + return 0; > + > error = mmap_file(vma->vm_file, vma); > if (error) { > fput(vma->vm_file); > @@ -2441,7 +2451,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) > vma_iter_config(vmi, map->addr, map->end); > vma_set_range(vma, map->addr, map->end, map->pgoff); > vm_flags_init(vma, map->flags); > - vma->vm_page_prot = vm_get_page_prot(map->flags); > + vma->vm_page_prot = map->page_prot; > > if (vma_iter_prealloc(vmi, vma)) { > error = -ENOMEM; > @@ -2528,6 +2538,56 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) > vma_set_page_prot(vma); > } > > +/* > + * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that > + * specifies it. > + * > + * This is called prior to any merge attempt, and updates whitelisted fields > + * that are permitted to be updated by the caller. > + * > + * All but user-defined fields will be pre-populated with original values. > + * > + * Returns 0 on success, or an error code otherwise. > + */ > +static int call_mmap_prepare(struct mmap_state *map) > +{ > + int err; > + struct vm_area_desc desc = { > + .mm = map->mm, > + .start = map->addr, > + .end = map->end, > + > + .pgoff = map->pgoff, > + .file = map->file, > + .vm_flags = map->flags, > + .page_prot = map->page_prot, > + }; > + > + /* Invoke the hook. */ > + err = __call_mmap_prepare(map->file, &desc); > + if (err) > + return err; > + > + /* Update fields permitted to be changed. */ > + map->pgoff = desc.pgoff; > + map->file = desc.file; > + map->flags = desc.vm_flags; > + map->page_prot = desc.page_prot; > + /* User-defined fields. */ > + map->vm_ops = desc.vm_ops; > + map->vm_private_data = desc.private_data; > + > + return 0; > +} > + > +static void set_vma_user_defined_fields(struct vm_area_struct *vma, > + struct mmap_state *map) > +{ > + if (map->vm_ops) > + vma->vm_ops = map->vm_ops; > + vma->vm_private_data = map->vm_private_data; > +} > + > static unsigned long __mmap_region(struct file *file, unsigned long addr, > unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, > struct list_head *uf) > @@ -2535,10 +2595,13 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma = NULL; > int error; > + bool have_mmap_prepare = file && file->f_op->mmap_prepare; > VMA_ITERATOR(vmi, mm, addr); > MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); > > error = __mmap_prepare(&map, uf); > + if (!error && have_mmap_prepare) > + error = call_mmap_prepare(&map); > if (error) > goto abort_munmap; > > @@ -2556,6 +2619,9 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, > goto unacct_error; > } > > + if (have_mmap_prepare) > + set_vma_user_defined_fields(vma, &map); > + > /* If flags changed, we might be able to merge, so try again. */ > if (map.retry_merge) { > struct vm_area_struct *merged; > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h > index 198abe66de5a..f6e45e62da3a 100644 > --- a/tools/testing/vma/vma_internal.h > +++ b/tools/testing/vma/vma_internal.h > @@ -253,8 +253,40 @@ struct mm_struct { > unsigned long flags; /* Must use atomic bitops to access */ > }; > > +struct vm_area_struct; > + > +/* > + * Describes a VMA that is about to be mmap()'ed. Drivers may choose to > + * manipulate mutable fields which will cause those fields to be updated in the > + * resultant VMA. > + * > + * Helper functions are not required for manipulating any field. > + */ > +struct vm_area_desc { > + /* Immutable state. */ > + struct mm_struct *mm; > + unsigned long start; > + unsigned long end; > + > + /* Mutable fields. Populated with initial state. */ > + pgoff_t pgoff; > + struct file *file; > + vm_flags_t vm_flags; > + pgprot_t page_prot; > + > + /* Write-only fields. */ > + const struct vm_operations_struct *vm_ops; > + void *private_data; > +}; > + > +struct file_operations { > + int (*mmap)(struct file *, struct vm_area_struct *); > + int (*mmap_prepare)(struct vm_area_desc *); > +}; > + > struct file { > struct address_space *f_mapping; > + const struct file_operations *f_op; > }; > > #define VMA_LOCK_OFFSET 0x40000000 > @@ -1125,11 +1157,6 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, > vma->__vm_flags &= ~flags; > } > > -static inline int call_mmap(struct file *, struct vm_area_struct *) > -{ > - return 0; > -} > - > static inline int shmem_zero_setup(struct vm_area_struct *) > { > return 0; > @@ -1405,4 +1432,33 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) > (void)vma; > } > > +/* Did the driver provide valid mmap hook configuration? */ > +static inline bool file_has_valid_mmap_hooks(struct file *file) > +{ > + bool has_mmap = file->f_op->mmap; > + bool has_mmap_prepare = file->f_op->mmap_prepare; > + > + /* Hooks are mutually exclusive. */ > + if (WARN_ON_ONCE(has_mmap && has_mmap_prepare)) > + return false; > + if (WARN_ON_ONCE(!has_mmap && !has_mmap_prepare)) > + return false; > + > + return true; > +} > + > +static inline int call_mmap(struct file *file, struct vm_area_struct *vma) > +{ > + if (WARN_ON_ONCE(file->f_op->mmap_prepare)) > + return -EINVAL; > + > + return file->f_op->mmap(file, vma); > +} > + > +static inline int __call_mmap_prepare(struct file *file, > + struct vm_area_desc *desc) > +{ > + return file->f_op->mmap_prepare(desc); > +} > + > #endif /* __MM_VMA_INTERNAL_H */ > -- > 2.49.0 >