On Mon, Sep 8, 2025 at 4:11 AM Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> wrote:
>
> We have introduced the f_op->mmap_prepare hook to allow for setting up a
> VMA far earlier in the process of mapping memory, reducing problematic
> error handling paths, but this does not provide what all
> drivers/filesystems need.
>
> In order to supply this, and to be able to move forward with removing
> f_op->mmap altogether, introduce f_op->mmap_complete.
>
> This hook is called once the VMA is fully mapped and everything is done,
> however with the mmap write lock and VMA write locks held.
>
> The hook is then provided with a fully initialised VMA which it can do what
> it needs with, though the mmap and VMA write locks must remain held
> throughout.
>
> It is not intended that the VMA be modified at this point; attempts to do
> so will end in tears.
>
> This allows for operations such as pre-population, typically via a remap, or
> really anything that requires access to the VMA once initialised.
>
> In addition, a caller may need to take a lock in mmap_prepare, when it is
> possible to modify the VMA, and release it on mmap_complete. In order to
> handle errors which may arise between the two operations, f_op->mmap_abort
> is provided.
>
> This hook should be used to drop any lock and clean up anything before the
> VMA mapping operation is aborted. After this point the VMA will not be
> added to any mapping and will not exist.
>
> We also add a new mmap_context field to the vm_area_desc type which can be
> used to pass information pertinent to any locks which are held or any state
> which is required for mmap_complete or mmap_abort to operate correctly.
>
> We also update the compatibility layer for nested filesystems which
> currently still only specify an f_op->mmap() handler so that it correctly
> invokes f_op->mmap_complete as necessary (note that no error can occur
> between mmap_prepare and mmap_complete so mmap_abort will never be called
> in this case).
>
> Also update the VMA tests to account for the changes.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
> ---
>  include/linux/fs.h               |  4 ++
>  include/linux/mm_types.h         |  5 ++
>  mm/util.c                        | 18 +++++--
>  mm/vma.c                         | 82 ++++++++++++++++++++++++++++++--
>  tools/testing/vma/vma_internal.h | 31 ++++++++++--
>  5 files changed, 129 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 594bd4d0521e..bb432924993a 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2195,6 +2195,10 @@ struct file_operations {
>          int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
>                                  unsigned int poll_flags);
>          int (*mmap_prepare)(struct vm_area_desc *);
> +        int (*mmap_complete)(struct file *, struct vm_area_struct *,
> +                        const void *context);
> +        void (*mmap_abort)(const struct file *, const void *vm_private_data,
> +                        const void *context);
>  } __randomize_layout;
>
>  /* Supports async buffered reads */
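
To check my understanding of the new hooks: a converted driver would end up
looking roughly like the below, right? (All mydrv_* names here are
hypothetical; only the hook signatures come from the patch.)

static const struct vm_operations_struct mydrv_vm_ops = {};

static int mydrv_mmap_prepare(struct vm_area_desc *desc)
{
        /* Early stage: the descriptor may still be modified here. */
        desc->vm_ops = &mydrv_vm_ops;
        return 0;
}

static int mydrv_mmap_complete(struct file *file, struct vm_area_struct *vma,
                               const void *context)
{
        /*
         * Late stage: the VMA is fully initialised and the mmap/VMA write
         * locks are held; the VMA itself must not be modified.
         */
        return 0;
}

static void mydrv_mmap_abort(const struct file *file,
                             const void *vm_private_data, const void *context)
{
        /* Clean up state from mmap_prepare; the VMA will never exist. */
}

static const struct file_operations mydrv_fops = {
        .mmap_prepare   = mydrv_mmap_prepare,
        .mmap_complete  = mydrv_mmap_complete,
        .mmap_abort     = mydrv_mmap_abort,
};
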
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index cf759fe08bb3..052db1f31fb3 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -793,6 +793,11 @@ struct vm_area_desc {
>          /* Write-only fields. */
>          const struct vm_operations_struct *vm_ops;
>          void *private_data;
> +        /*
> +         * A user-defined field, whose value will be passed to mmap_complete
> +         * and mmap_abort.
> +         */
> +        void *mmap_context;
>  };
>
>  /*
> diff --git a/mm/util.c b/mm/util.c
> index 248f877f629b..f5bcac140cb9 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -1161,17 +1161,26 @@ int __compat_vma_mmap_prepare(const struct file_operations *f_op,
>          err = f_op->mmap_prepare(&desc);
>          if (err)
>                  return err;
> +
>          set_vma_from_desc(vma, &desc);
>
> -        return 0;
> +        /*
> +         * No error can occur between mmap_prepare() and mmap_complete() so no
> +         * need to invoke mmap_abort().
> +         */
> +
> +        if (f_op->mmap_complete)
> +                err = f_op->mmap_complete(file, vma, desc.mmap_context);
> +
> +        return err;
>  }
>  EXPORT_SYMBOL(__compat_vma_mmap_prepare);
>
>  /**
>   * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
> - * existing VMA.
> + * existing VMA and invoke .mmap_complete() if provided.
>   * @file: The file which possesss an f_op->mmap_prepare() hook.

nit: "possesss" is misspelled (should be "possesses"). Maybe we can fix it
here as well?

> - * @vma: The VMA to apply the .mmap_prepare() hook to.
> + * @vma: The VMA to apply the hooks to.
>   *
>   * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
>   * stacked filesystems invoke a nested mmap hook of an underlying file.
> @@ -1188,6 +1197,9 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare);
>   * establishes a struct vm_area_desc descriptor, passes to the underlying
>   * .mmap_prepare() hook and applies any changes performed by it.
>   *
> + * If the relevant hooks are provided, it also invokes .mmap_complete() upon
> + * successful completion.
> + *
>   * Once the conversion of filesystems is complete this function will no longer
>   * be required and will be removed.
>   *
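
The mmap_context plumbing makes sense to me. If I follow the intent, the
lock-passing case from the commit message would look something like this
(a rough sketch; myfs_* and the state structure are hypothetical):

struct myfs_state {
        struct mutex lock;
};

static int myfs_mmap_prepare(struct vm_area_desc *desc)
{
        struct myfs_state *state = desc->file->private_data;

        /* Taken while the VMA can still be modified... */
        mutex_lock(&state->lock);
        /* ...and handed to mmap_complete()/mmap_abort() via the context. */
        desc->mmap_context = state;
        return 0;
}

static int myfs_mmap_complete(struct file *file, struct vm_area_struct *vma,
                              const void *context)
{
        struct myfs_state *state = (struct myfs_state *)context;

        mutex_unlock(&state->lock);
        return 0;
}

static void myfs_mmap_abort(const struct file *file,
                            const void *vm_private_data, const void *context)
{
        struct myfs_state *state = (struct myfs_state *)context;

        /* The mapping was abandoned between prepare and complete. */
        mutex_unlock(&state->lock);
}
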
> diff --git a/mm/vma.c b/mm/vma.c
> index 0efa4288570e..a0b568fe9e8d 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -22,6 +22,7 @@ struct mmap_state {
>          /* User-defined fields, perhaps updated by .mmap_prepare(). */
>          const struct vm_operations_struct *vm_ops;
>          void *vm_private_data;
> +        void *mmap_context;
>
>          unsigned long charged;
>
> @@ -2343,6 +2344,23 @@ static int __mmap_prelude(struct mmap_state *map, struct list_head *uf)
>          int error;
>          struct vma_iterator *vmi = map->vmi;
>          struct vma_munmap_struct *vms = &map->vms;
> +        struct file *file = map->file;
> +
> +        if (file) {
> +                /* f_op->mmap_complete requires f_op->mmap_prepare. */
> +                if (file->f_op->mmap_complete && !file->f_op->mmap_prepare)
> +                        return -EINVAL;
> +
> +                /*
> +                 * It's not valid to provide an f_op->mmap_abort hook without also
> +                 * providing the f_op->mmap_prepare and f_op->mmap_complete hooks it is
> +                 * used with.
> +                 */
> +                if (file->f_op->mmap_abort &&
> +                    (!file->f_op->mmap_prepare ||
> +                     !file->f_op->mmap_complete))
> +                        return -EINVAL;
> +        }
>
>          /* Find the first overlapping VMA and initialise unmap state. */
>          vms->vma = vma_find(vmi, map->end);
> @@ -2595,6 +2613,7 @@ static int call_mmap_prepare(struct mmap_state *map)
>          /* User-defined fields. */
>          map->vm_ops = desc.vm_ops;
>          map->vm_private_data = desc.private_data;
> +        map->mmap_context = desc.mmap_context;
>
>          return 0;
>  }
> @@ -2636,16 +2655,61 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
>          return false;
>  }
>
> +/*
> + * Invoke the f_op->mmap_complete hook, providing it with a fully initialised
> + * VMA to operate upon.
> + *
> + * The mmap and VMA write locks must be held prior to and after the hook has
> + * been invoked.
> + */
> +static int call_mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
> +{
> +        struct file *file = map->file;
> +        void *context = map->mmap_context;
> +        int error;
> +        size_t len;
> +
> +        if (!file || !file->f_op->mmap_complete)
> +                return 0;
> +
> +        error = file->f_op->mmap_complete(file, vma, context);
> +        /* The hook must NOT drop the write locks. */
> +        vma_assert_write_locked(vma);
> +        mmap_assert_write_locked(current->mm);
> +        if (!error)
> +                return 0;
> +
> +        /*
> +         * If an error occurs, unmap the VMA altogether and return an error. We
> +         * only clear the newly allocated VMA, since this function is only
> +         * invoked if we do NOT merge, so we only clean up the VMA we created.
> +         */
> +        len = vma_pages(vma) << PAGE_SHIFT;
> +        do_munmap(current->mm, vma->vm_start, len, NULL);
> +        return error;
> +}
> +
> +static void call_mmap_abort(struct mmap_state *map)
> +{
> +        struct file *file = map->file;
> +        void *vm_private_data = map->vm_private_data;
> +
> +        VM_WARN_ON_ONCE(!file || !file->f_op);
> +        file->f_op->mmap_abort(file, vm_private_data, map->mmap_context);
> +}
> +
>  static unsigned long __mmap_region(struct file *file, unsigned long addr,
>                  unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
>                  struct list_head *uf)
>  {
> -        struct mm_struct *mm = current->mm;
> -        struct vm_area_struct *vma = NULL;
> -        int error;
>          bool have_mmap_prepare = file && file->f_op->mmap_prepare;
> +        bool have_mmap_abort = file && file->f_op->mmap_abort;
> +        struct mm_struct *mm = current->mm;
>          VMA_ITERATOR(vmi, mm, addr);
>          MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
> +        struct vm_area_struct *vma = NULL;
> +        bool allocated_new = false;
> +        int error;
>
>          map.check_ksm_early = can_set_ksm_flags_early(&map);
>
> @@ -2668,8 +2732,12 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
>          /* ...but if we can't, allocate a new VMA. */
>          if (!vma) {
>                  error = __mmap_new_vma(&map, &vma);
> -                if (error)
> +                if (error) {
> +                        if (have_mmap_abort)
> +                                call_mmap_abort(&map);
>                          goto unacct_error;
> +                }
> +                allocated_new = true;
>          }
>
>          if (have_mmap_prepare)
> @@ -2677,6 +2745,12 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
>
>          __mmap_epilogue(&map, vma);
>
> +        if (allocated_new) {
> +                error = call_mmap_complete(&map, vma);
> +                if (error)
> +                        return error;
> +        }
> +
>          return addr;
>
>          /* Accounting was done by __mmap_prelude(). */
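
And the pre-population case mentioned in the commit message would then be
something like the below, I think (mydev_* is hypothetical; note that plain
remap_pfn_range() updates vm_flags, so presumably the relevant flags would
have to be set in mmap_prepare for this to respect the "no VMA modification"
rule):

struct mydev_region {
        phys_addr_t phys;
        size_t size;
};

static int mydev_mmap_complete(struct file *file, struct vm_area_struct *vma,
                               const void *context)
{
        const struct mydev_region *region = context;
        unsigned long len = vma->vm_end - vma->vm_start;

        if (len > region->size)
                return -EINVAL;

        /* On failure, call_mmap_complete() munmaps the new VMA for us. */
        return remap_pfn_range(vma, vma->vm_start, PHYS_PFN(region->phys),
                               len, vma->vm_page_prot);
}
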
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 07167446dcf4..566cef1c0e0b 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -297,11 +297,20 @@ struct vm_area_desc {
>          /* Write-only fields. */
>          const struct vm_operations_struct *vm_ops;
>          void *private_data;
> +        /*
> +         * A user-defined field, whose value will be passed to mmap_complete
> +         * and mmap_abort.
> +         */
> +        void *mmap_context;
>  };
>
>  struct file_operations {
>          int (*mmap)(struct file *, struct vm_area_struct *);
>          int (*mmap_prepare)(struct vm_area_desc *);
> +        void (*mmap_abort)(const struct file *, const void *vm_private_data,
> +                        const void *context);
> +        int (*mmap_complete)(struct file *, struct vm_area_struct *,
> +                        const void *context);
>  };
>
>  struct file {
> @@ -1471,7 +1480,7 @@ static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op,
>  {
>          struct vm_area_desc desc = {
>                  .mm = vma->vm_mm,
> -                .file = vma->vm_file,
> +                .file = file,
>                  .start = vma->vm_start,
>                  .end = vma->vm_end,
>
> @@ -1485,13 +1494,21 @@ static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op,
>          err = f_op->mmap_prepare(&desc);
>          if (err)
>                  return err;
> +
>          set_vma_from_desc(vma, &desc);
>
> -        return 0;
> +        /*
> +         * No error can occur between mmap_prepare() and mmap_complete() so no
> +         * need to invoke mmap_abort().
> +         */
> +
> +        if (f_op->mmap_complete)
> +                err = f_op->mmap_complete(file, vma, desc.mmap_context);
> +
> +        return err;
>  }
>
> -static inline int compat_vma_mmap_prepare(struct file *file,
> -                struct vm_area_struct *vma)
> +static inline int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
>  {
>          return __compat_vma_mmap_prepare(file->f_op, file, vma);
>  }
> @@ -1548,4 +1565,10 @@ static inline vm_flags_t ksm_vma_flags(const struct mm_struct *, const struct fi
>          return vm_flags;
>  }
>
> +static inline int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
> +                struct list_head *uf)
> +{
> +        return 0;
> +}
> +
>  #endif /* __MM_VMA_INTERNAL_H */
> --
> 2.51.0
>
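
FWIW, with the do_munmap() stub in place it should also be easy to cover the
compat path in the VMA tests directly, along these lines (stub_* names are
mine, test harness glue omitted):

static int stub_prepare(struct vm_area_desc *desc)
{
        desc->mmap_context = (void *)0x1234;    /* arbitrary marker */
        return 0;
}

static int stub_complete(struct file *file, struct vm_area_struct *vma,
                         const void *context)
{
        /* The context stashed in stub_prepare() should arrive intact. */
        return context == (void *)0x1234 ? 0 : -EINVAL;
}

static const struct file_operations stub_fops = {
        .mmap_prepare   = stub_prepare,
        .mmap_complete  = stub_complete,
};

/* In a test, with file/vma set up by the harness: */
/*        err = __compat_vma_mmap_prepare(&stub_fops, &file, &vma);        */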