Re: [PATCH 2/2] fanotify: introduce restartable permission events

Amir Goldstein <amir73il@xxxxxxxxx> · Fri, 8 Aug 2025 18:30:12 +0200

On Thu, Aug 7, 2025 at 12:06 AM Ibrahim Jirdeh <ibrahimjirdeh@xxxxxxxx> wrote:
>
> This add support for restarting permission events. The main goal of
> the change is to provide better handling for pending events for lazy
> file loading use cases which may back fanotify events by a long-lived
> daemon. For prior discussion of approaches see [1][2].
>
> In terms of implementation, we add a new control-fd/queue-fd api.
> Control fd returned by fanotify_init keeps fanotify group alive and
> supports operations like fanotify_mark as well as a new ioctl
> FAN_IOC_OPEN_QUEUE_FD to issue user a queue fd. Queue fd is used
> for reading events and writing back responses. Upon release of
> queue fd, pending permission events are reinserted back into
> notification queue for reprocessing.
>
> Control-fd/queue-fd api is guarded by FAN_RESTARTABLE_EVENTS flag.
> In addition FAN_RESTARTABLE_EVENTS can only be used in conjunction
> with FAN_CLASS_CONTENT or FAN_CLASS_PRE_CONTENT, and only permission
> events can added to the mark mask if a group initialize with
> FAN_RESTARTABLE_EVENTS.
>
> [1] https://lore.kernel.org/linux-fsdevel/6za2mngeqslmqjg3icoubz37hbbxi6bi44canfsg2aajgkialt@c3ujlrjzkppr
> [2] https://lore.kernel.org/linux-fsdevel/20250623192503.2673076-1-ibrahimjirdeh@xxxxxxxx
>
> Suggested-by: Amir Goldstein <amir73il@xxxxxxxxx>
> Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxhN6ok6BCBGbxeUt9ULq6g=qL6=_2_QGi8MqTHv5ZN7Vg@xxxxxxxxxxxxxx
> Signed-off-by: Ibrahim Jirdeh <ibrahimjirdeh@xxxxxxxx>
> ---

Looking good overall.
I have some implementation comments, but they are mostly minor technical
details.

>  fs/notify/fanotify/fanotify.h       |   4 +
>  fs/notify/fanotify/fanotify_user.c  | 111 ++++++++++++++++++++++++++--
>  fs/notify/group.c                   |   2 +
>  include/linux/fanotify.h            |   1 +
>  include/linux/fsnotify_backend.h    |   2 +
>  include/uapi/linux/fanotify.h       |   6 ++
>  tools/include/uapi/linux/fanotify.h |   6 ++
>  7 files changed, 125 insertions(+), 7 deletions(-)
>
> diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
> index c0dffbc3370d..5cf25e7ad2d8 100644
> --- a/fs/notify/fanotify/fanotify.h
> +++ b/fs/notify/fanotify/fanotify.h
> @@ -556,3 +556,7 @@ extern void fanotify_insert_event(struct fsnotify_group *group,
>
>  extern int fanotify_merge(struct fsnotify_group *group,
>         struct fsnotify_event *event);
> +
> +extern const struct file_operations fanotify_fops;
> +extern const struct file_operations fanotify_control_fops;
> +extern const struct file_operations fanotify_queue_fops;
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 01d273d35936..8d5266be78a2 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -1011,6 +1011,7 @@ static void clear_queue(struct file *file, bool restart_events)
>          * restart is requested, move them back into the notification queue
>          * for reprocessing, otherwise simulate a reply from userspace.
>          */
> +       mutex_lock(&group->queue_mutex);
>         spin_lock(&group->notification_lock);
>         while (!list_empty(&group->fanotify_data.access_list)) {
>                 struct fanotify_perm_event *event;
> @@ -1043,8 +1044,17 @@ static void clear_queue(struct file *file, bool restart_events)
>                 spin_lock(&group->notification_lock);
>         }
>         spin_unlock(&group->notification_lock);
> +       group->queue_opened = false;
> +       mutex_unlock(&group->queue_mutex);
>  }
>
> +static int fanotify_queue_release(struct inode *ignored, struct file *file)
> +{
> +       clear_queue(file, true);
> +       return 0;
> +}
> +
> +
>  static int fanotify_release(struct inode *ignored, struct file *file)
>  {
>         struct fsnotify_group *group = file->private_data;
> @@ -1092,6 +1102,47 @@ static int fanotify_release(struct inode *ignored, struct file *file)
>         return 0;
>  }
>
> +static int fanotify_open_queue_fd(struct file *file)
> +{
> +       struct fsnotify_group *group = file->private_data;
> +       int f_flags, fd;
> +       struct file *queue_file;
> +
> +       if (!FAN_GROUP_FLAG(group, FAN_RESTARTABLE_EVENTS))
> +               return -EINVAL;
> +
> +       mutex_lock(&group->queue_mutex);
> +       if (group->queue_opened) {
> +               fd = -EEXIST;

This is more an exclusive open than an exclusive creation,
so EBUSY feels more appropriate.

> +               goto out_unlock;
> +       }
> +
> +       f_flags = O_RDWR;
> +       if (group->fanotify_data.flags & FAN_CLOEXEC)
> +               f_flags |= O_CLOEXEC;
> +       if (group->fanotify_data.flags & FAN_NONBLOCK)
> +               f_flags |= O_NONBLOCK;
> +
> +       fd = get_unused_fd_flags(f_flags);
> +       if (fd < 0)
> +               goto out_unlock;
> +
> +       queue_file = anon_inode_getfile_fmode("[fanotify]",

This is mostly for nice printing in /proc so how about [fanotify-queue]?

I believe that CRIU will actually try to restore [fanotify] fd from
the init flags and marks, so printing this as fd as [fanotify] could
mess up CRIU.
OTOH, CRIU that doesn't know about FAN_IOC_OPEN_QUEUE_FD,
has no chance of restoring this group.
I hope they look at init flags and bail out for unknown flags,
but not sure they do.

> +                                             &fanotify_queue_fops, group,
> +                                             f_flags, FMODE_NONOTIFY);
> +       if (IS_ERR(queue_file)) {
> +               put_unused_fd(fd);
> +               fd = PTR_ERR(queue_file);
> +               goto out_unlock;
> +       }
> +       fd_install(fd, queue_file);
> +       group->queue_opened = true;
> +
> +out_unlock:
> +       mutex_unlock(&group->queue_mutex);
> +       return fd;
> +}
> +
>  static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>  {
>         struct fsnotify_group *group;
> @@ -1112,12 +1163,15 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
>                 spin_unlock(&group->notification_lock);
>                 ret = put_user(send_len, (int __user *) p);
>                 break;
> +       case FAN_IOC_OPEN_QUEUE_FD:
> +               ret = fanotify_open_queue_fd(file);
> +               break;
>         }
>
>         return ret;
>  }
>
> -static const struct file_operations fanotify_fops = {
> +const struct file_operations fanotify_fops = {
>         .show_fdinfo    = fanotify_show_fdinfo,
>         .poll           = fanotify_poll,
>         .read           = fanotify_read,
> @@ -1129,6 +1183,30 @@ static const struct file_operations fanotify_fops = {
>         .llseek         = noop_llseek,
>  };
>
> +const struct file_operations fanotify_control_fops = {
> +       .show_fdinfo    = fanotify_show_fdinfo,
> +       .poll           = NULL,
> +       .read           = NULL,
> +       .write          = NULL,
> +       .fasync         = NULL,

no need to add the NULL fields.
I have no idea why .fasync  = NULL exists in fanotify_fops
It must be an oversight(?)

> +       .release        = fanotify_release,
> +       .unlocked_ioctl = fanotify_ioctl,
> +       .compat_ioctl   = compat_ptr_ioctl,

I'm afraid that FIONREAD ioctl only applied to queue fd
(it's a query about expected bytes returned from read())
so you'd need fanotify_control_ioctl() or something like that.
There is no need for compat_ioctl since open fd ioctl takes no
ptr argument.

> +       .llseek         = noop_llseek,
> +};
> +
> +const struct file_operations fanotify_queue_fops = {
> +       .show_fdinfo    = fanotify_show_fdinfo,

This prints info about group and marks
I don't think it is appropriate for queue fd.
It would have been nice to print some info that would allow
userspace to associate this [fanotify-queue] fd with the [fanotify] fd,
but I can't think of a valid identifier that we can use, so just leave
unassigned.

> +       .poll           = fanotify_poll,
> +       .read           = fanotify_read,
> +       .write          = fanotify_write,
> +       .fasync         = NULL,
> +       .release        = fanotify_queue_release,
> +       .unlocked_ioctl = NULL,
> +       .compat_ioctl   = compat_ptr_ioctl,
> +       .llseek         = noop_llseek,
> +};
> +
>  static int fanotify_find_path(int dfd, const char __user *filename,
>                               struct path *path, unsigned int flags, __u64 mask,
>                               unsigned int obj_type)
> @@ -1541,6 +1619,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>         int f_flags, fd;
>         unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
>         unsigned int class = flags & FANOTIFY_CLASS_BITS;
> +       unsigned int restartable_events = flags & FAN_RESTARTABLE_EVENTS;
>         unsigned int internal_flags = 0;
>         struct file *file;
>
> @@ -1620,10 +1699,17 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>             (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
>                 return -EINVAL;
>
> -       f_flags = O_RDWR;
> +       /*
> +        * FAN_RESTARTABLE_EVENTS requires FAN_CLASS_CONTENT or
> +        * FAN_CLASS_PRE_CONTENT
> +        */
> +       if (restartable_events && class == FAN_CLASS_NOTIF)
> +               return -EINVAL;
> +
> +       f_flags = restartable_events ? O_RDONLY : O_RDWR;
>         if (flags & FAN_CLOEXEC)
>                 f_flags |= O_CLOEXEC;
> -       if (flags & FAN_NONBLOCK)
> +       if (!restartable_events && (flags & FAN_NONBLOCK))
>                 f_flags |= O_NONBLOCK;
>
>         /* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
> @@ -1694,8 +1780,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>         if (fd < 0)
>                 goto out_destroy_group;
>
> -       file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group,
> -                                       f_flags, FMODE_NONOTIFY);
> +       file = anon_inode_getfile_fmode("[fanotify]",
> +                                       (restartable_events ? &fanotify_control_fops :
> +                                       &fanotify_fops),
> +                                       group, f_flags, FMODE_NONOTIFY);
>         if (IS_ERR(file)) {
>                 put_unused_fd(fd);
>                 fd = PTR_ERR(file);
> @@ -1920,7 +2008,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
>                 return -EBADF;
>
>         /* verify that this is indeed an fanotify instance */
> -       if (unlikely(fd_file(f)->f_op != &fanotify_fops))
> +       if (unlikely(fd_file(f)->f_op != &fanotify_fops &&
> +               fd_file(f)->f_op != &fanotify_control_fops))
>                 return -EINVAL;
>         group = fd_file(f)->private_data;
>
> @@ -1937,6 +2026,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
>                         return -EINVAL;
>         }
>
> +       /*
> +        * With FAN_RESTARTABLE_EVENTS, a user is only allowed to setup
> +        * permission events
> +        */
> +       if (FAN_GROUP_FLAG(group, FAN_RESTARTABLE_EVENTS) &&
> +               !fanotify_is_perm_event(mask))
> +               return -EINVAL;
> +
>         /*
>          * A user is allowed to setup sb/mount/mntns marks only if it is
>          * capable in the user ns where the group was created.
> @@ -2142,7 +2239,7 @@ static int __init fanotify_user_setup(void)
>                                      FANOTIFY_DEFAULT_MAX_USER_MARKS);
>
>         BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
> -       BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14);
> +       BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 15);
>         BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
>
>         fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
> diff --git a/fs/notify/group.c b/fs/notify/group.c
> index 18446b7b0d49..949a8023a7e4 100644
> --- a/fs/notify/group.c
> +++ b/fs/notify/group.c
> @@ -25,6 +25,7 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
>                 group->ops->free_group_priv(group);
>
>         mem_cgroup_put(group->memcg);
> +       mutex_destroy(&group->queue_mutex);
>         mutex_destroy(&group->mark_mutex);
>
>         kfree(group);
> @@ -130,6 +131,7 @@ static struct fsnotify_group *__fsnotify_alloc_group(
>         init_waitqueue_head(&group->notification_waitq);
>         group->max_events = UINT_MAX;
>
> +       mutex_init(&group->queue_mutex);
>         mutex_init(&group->mark_mutex);
>         INIT_LIST_HEAD(&group->marks_list);
>
> diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
> index 879cff5eccd4..38854a1d6485 100644
> --- a/include/linux/fanotify.h
> +++ b/include/linux/fanotify.h
> @@ -37,6 +37,7 @@
>                                          FAN_REPORT_TID | \
>                                          FAN_REPORT_PIDFD | \
>                                          FAN_REPORT_FD_ERROR | \
> +                                        FAN_RESTARTABLE_EVENTS | \
>                                          FAN_UNLIMITED_QUEUE | \
>                                          FAN_UNLIMITED_MARKS)
>
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index d4034ddaf392..1203124dc9e8 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -231,6 +231,8 @@ struct fsnotify_group {
>         unsigned int max_events;                /* maximum events allowed on the list */
>         enum fsnotify_group_prio priority;      /* priority for sending events */
>         bool shutdown;          /* group is being shut down, don't queue more events */
> +       bool queue_opened; /* whether or not a queue fd has been issued */
> +       struct mutex queue_mutex; /* protects event queue during open / release */

Semantically, these belong in fanotify_data as they are only referenced in
fanotify code (except for the init/destroy that could be moved).
But TBH, it doesn't matter so much to me, so please move them
only if it's not too painful.

Thanks,
Amir.