This adds support for restarting permission events. The main goal of the change is to provide better handling of pending events for lazy file loading use cases in which fanotify events are backed by a long-lived daemon. For prior discussion of approaches see [1][2]. In terms of implementation, we add a new control-fd/queue-fd api. The control fd returned by fanotify_init keeps the fanotify group alive and supports operations like fanotify_mark as well as a new ioctl FAN_IOC_OPEN_QUEUE_FD to issue the user a queue fd. The queue fd is used for reading events and writing back responses. Upon release of the queue fd, pending permission events are reinserted into the notification queue for reprocessing. The control-fd/queue-fd api is guarded by the FAN_RESTARTABLE_EVENTS flag. In addition, FAN_RESTARTABLE_EVENTS can only be used in conjunction with FAN_CLASS_CONTENT or FAN_CLASS_PRE_CONTENT, and only permission events can be added to the mark mask if a group is initialized with FAN_RESTARTABLE_EVENTS. [1] https://lore.kernel.org/linux-fsdevel/6za2mngeqslmqjg3icoubz37hbbxi6bi44canfsg2aajgkialt@c3ujlrjzkppr [2] https://lore.kernel.org/linux-fsdevel/20250623192503.2673076-1-ibrahimjirdeh@xxxxxxxx Suggested-by: Amir Goldstein <amir73il@xxxxxxxxx> Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxhN6ok6BCBGbxeUt9ULq6g=qL6=_2_QGi8MqTHv5ZN7Vg@xxxxxxxxxxxxxx Signed-off-by: Ibrahim Jirdeh <ibrahimjirdeh@xxxxxxxx> --- fs/notify/fanotify/fanotify.h | 4 + fs/notify/fanotify/fanotify_user.c | 111 ++++++++++++++++++++++++++-- fs/notify/group.c | 2 + include/linux/fanotify.h | 1 + include/linux/fsnotify_backend.h | 2 + include/uapi/linux/fanotify.h | 6 ++ tools/include/uapi/linux/fanotify.h | 6 ++ 7 files changed, 125 insertions(+), 7 deletions(-) diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index c0dffbc3370d..5cf25e7ad2d8 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -556,3 +556,7 @@ extern void fanotify_insert_event(struct fsnotify_group *group, extern 
int fanotify_merge(struct fsnotify_group *group, struct fsnotify_event *event); + +extern const struct file_operations fanotify_fops; +extern const struct file_operations fanotify_control_fops; +extern const struct file_operations fanotify_queue_fops; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 01d273d35936..8d5266be78a2 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1011,6 +1011,7 @@ static void clear_queue(struct file *file, bool restart_events) * restart is requested, move them back into the notification queue * for reprocessing, otherwise simulate a reply from userspace. */ + mutex_lock(&group->queue_mutex); spin_lock(&group->notification_lock); while (!list_empty(&group->fanotify_data.access_list)) { struct fanotify_perm_event *event; @@ -1043,8 +1044,17 @@ static void clear_queue(struct file *file, bool restart_events) spin_lock(&group->notification_lock); } spin_unlock(&group->notification_lock); + group->queue_opened = false; + mutex_unlock(&group->queue_mutex); } +static int fanotify_queue_release(struct inode *ignored, struct file *file) +{ + clear_queue(file, true); + return 0; +} + + static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; @@ -1092,6 +1102,47 @@ static int fanotify_release(struct inode *ignored, struct file *file) return 0; } +static int fanotify_open_queue_fd(struct file *file) +{ + struct fsnotify_group *group = file->private_data; + int f_flags, fd; + struct file *queue_file; + + if (!FAN_GROUP_FLAG(group, FAN_RESTARTABLE_EVENTS)) + return -EINVAL; + + mutex_lock(&group->queue_mutex); + if (group->queue_opened) { + fd = -EEXIST; + goto out_unlock; + } + + f_flags = O_RDWR; + if (group->fanotify_data.flags & FAN_CLOEXEC) + f_flags |= O_CLOEXEC; + if (group->fanotify_data.flags & FAN_NONBLOCK) + f_flags |= O_NONBLOCK; + + fd = get_unused_fd_flags(f_flags); + if (fd < 0) + goto 
out_unlock; + + queue_file = anon_inode_getfile_fmode("[fanotify]", + &fanotify_queue_fops, group, + f_flags, FMODE_NONOTIFY); + if (IS_ERR(queue_file)) { + put_unused_fd(fd); + fd = PTR_ERR(queue_file); + goto out_unlock; + } + fd_install(fd, queue_file); + group->queue_opened = true; + +out_unlock: + mutex_unlock(&group->queue_mutex); + return fd; +} + static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; @@ -1112,12 +1163,15 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; + case FAN_IOC_OPEN_QUEUE_FD: + ret = fanotify_open_queue_fd(file); + break; } return ret; } -static const struct file_operations fanotify_fops = { +const struct file_operations fanotify_fops = { .show_fdinfo = fanotify_show_fdinfo, .poll = fanotify_poll, .read = fanotify_read, @@ -1129,6 +1183,30 @@ static const struct file_operations fanotify_fops = { .llseek = noop_llseek, }; +const struct file_operations fanotify_control_fops = { + .show_fdinfo = fanotify_show_fdinfo, + .poll = NULL, + .read = NULL, + .write = NULL, + .fasync = NULL, + .release = fanotify_release, + .unlocked_ioctl = fanotify_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +const struct file_operations fanotify_queue_fops = { + .show_fdinfo = fanotify_show_fdinfo, + .poll = fanotify_poll, + .read = fanotify_read, + .write = fanotify_write, + .fasync = NULL, + .release = fanotify_queue_release, + .unlocked_ioctl = NULL, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + static int fanotify_find_path(int dfd, const char __user *filename, struct path *path, unsigned int flags, __u64 mask, unsigned int obj_type) @@ -1541,6 +1619,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) int f_flags, fd; unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = 
flags & FANOTIFY_CLASS_BITS; + unsigned int restartable_events = flags & FAN_RESTARTABLE_EVENTS; unsigned int internal_flags = 0; struct file *file; @@ -1620,10 +1699,17 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) return -EINVAL; - f_flags = O_RDWR; + /* + * FAN_RESTARTABLE_EVENTS requires FAN_CLASS_CONTENT or + * FAN_CLASS_PRE_CONTENT + */ + if (restartable_events && class == FAN_CLASS_NOTIF) + return -EINVAL; + + f_flags = restartable_events ? O_RDONLY : O_RDWR; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; - if (flags & FAN_NONBLOCK) + if (!restartable_events && (flags & FAN_NONBLOCK)) f_flags |= O_NONBLOCK; /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ @@ -1694,8 +1780,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (fd < 0) goto out_destroy_group; - file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group, - f_flags, FMODE_NONOTIFY); + file = anon_inode_getfile_fmode("[fanotify]", + (restartable_events ? 
&fanotify_control_fops : + &fanotify_fops), + group, f_flags, FMODE_NONOTIFY); if (IS_ERR(file)) { put_unused_fd(fd); fd = PTR_ERR(file); @@ -1920,7 +2008,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EBADF; /* verify that this is indeed an fanotify instance */ - if (unlikely(fd_file(f)->f_op != &fanotify_fops)) + if (unlikely(fd_file(f)->f_op != &fanotify_fops && + fd_file(f)->f_op != &fanotify_control_fops)) return -EINVAL; group = fd_file(f)->private_data; @@ -1937,6 +2026,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; } + /* + * With FAN_RESTARTABLE_EVENTS, a user is only allowed to setup + * permission events + */ + if (FAN_GROUP_FLAG(group, FAN_RESTARTABLE_EVENTS) && + !fanotify_is_perm_event(mask)) + return -EINVAL; + /* * A user is allowed to setup sb/mount/mntns marks only if it is * capable in the user ns where the group was created. @@ -2142,7 +2239,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 15); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fanotify_mark, diff --git a/fs/notify/group.c b/fs/notify/group.c index 18446b7b0d49..949a8023a7e4 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -25,6 +25,7 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) group->ops->free_group_priv(group); mem_cgroup_put(group->memcg); + mutex_destroy(&group->queue_mutex); mutex_destroy(&group->mark_mutex); kfree(group); @@ -130,6 +131,7 @@ static struct fsnotify_group *__fsnotify_alloc_group( init_waitqueue_head(&group->notification_waitq); group->max_events = UINT_MAX; + mutex_init(&group->queue_mutex); mutex_init(&group->mark_mutex); INIT_LIST_HEAD(&group->marks_list); diff --git 
a/include/linux/fanotify.h b/include/linux/fanotify.h index 879cff5eccd4..38854a1d6485 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -37,6 +37,7 @@ FAN_REPORT_TID | \ FAN_REPORT_PIDFD | \ FAN_REPORT_FD_ERROR | \ + FAN_RESTARTABLE_EVENTS | \ FAN_UNLIMITED_QUEUE | \ FAN_UNLIMITED_MARKS) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index d4034ddaf392..1203124dc9e8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -231,6 +231,8 @@ struct fsnotify_group { unsigned int max_events; /* maximum events allowed on the list */ enum fsnotify_group_prio priority; /* priority for sending events */ bool shutdown; /* group is being shut down, don't queue more events */ + bool queue_opened; /* whether or not a queue fd has been issued */ + struct mutex queue_mutex; /* protects event queue during open / release */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ #define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index e710967c7c26..008097628279 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -67,6 +67,7 @@ #define FAN_REPORT_TARGET_FID 0x00001000 /* Report dirent target id */ #define FAN_REPORT_FD_ERROR 0x00002000 /* event->fd can report error */ #define FAN_REPORT_MNT 0x00004000 /* Report mount events */ +#define FAN_RESTARTABLE_EVENTS 0x00008000 /* enable control-fd/queue-api */ /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */ #define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME) @@ -271,4 +272,9 @@ struct fanotify_response_info_audit_rule { (long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \ (long)(meta)->event_len <= (long)(len)) +/* fanotify ioctls */ + +/* Issue a queue fd used in control-fd api to read and respond to events */ +#define FAN_IOC_OPEN_QUEUE_FD _IO('F', 0xF0) + #endif /* 
_UAPI_LINUX_FANOTIFY_H */ diff --git a/tools/include/uapi/linux/fanotify.h b/tools/include/uapi/linux/fanotify.h index e710967c7c26..008097628279 100644 --- a/tools/include/uapi/linux/fanotify.h +++ b/tools/include/uapi/linux/fanotify.h @@ -67,6 +67,7 @@ #define FAN_REPORT_TARGET_FID 0x00001000 /* Report dirent target id */ #define FAN_REPORT_FD_ERROR 0x00002000 /* event->fd can report error */ #define FAN_REPORT_MNT 0x00004000 /* Report mount events */ +#define FAN_RESTARTABLE_EVENTS 0x00008000 /* enable control-fd/queue-api */ /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */ #define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME) @@ -271,4 +272,9 @@ struct fanotify_response_info_audit_rule { (long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \ (long)(meta)->event_len <= (long)(len)) +/* fanotify ioctls */ + +/* Issue a queue fd used in control-fd api to read and respond to events */ +#define FAN_IOC_OPEN_QUEUE_FD _IO('F', 0xF0) + #endif /* _UAPI_LINUX_FANOTIFY_H */ -- 2.47.3