Re: [PATCH v2 2/2] fanotify: support watching filesystems and mounts inside userns

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sat, Apr 19, 2025 at 12:06:57PM +0200, Amir Goldstein wrote:
> An unprivileged user is allowed to create an fanotify group and add
> inode marks, but not filesystem, mntns and mount marks.
> 
> Add limited support for setting up filesystem, mntns and mount marks by
> an unprivileged user under the following conditions:
> 
> 1.   User has CAP_SYS_ADMIN in the user ns where the group was created
> 2.a. User has CAP_SYS_ADMIN in the user ns where the filesystem was
>      mounted (implies FS_USERNS_MOUNT)
>   OR (in case setting up a mntns or mount mark)
> 2.b. User has CAP_SYS_ADMIN in the user ns associated with the mntns

So the crux of the problem is that we need to be sure that for all
options we need to be sure that the scope of the permission guarantees
necessary privileges over all the associated objects are held.

CAP_SYS_ADMIN in the owning user namespace of the filesystem (1.) seems
trivially ok because it means that the caller has privileges to mount
that filesystem.

If the caller is just privileged over the owning user namespace of the
mount namespace (2.b) then they are able to listen for mount
notifications starting with v6.15. Note how that the permissions are
specifically scoped to mount objects in that api.

But what you're trying to do here is not scoped to mounts. You're using
that permission check to delegate privileges over non-mount objects such
as directories accessible from that mount as well.

IOW, if I set up a mount mark on a mount based on the fact that I have
privileges over that mount's owning users namespace then it reads to me
that if I have:

mount --bind-into-unprvileged-container /etc /my/container/rootfs/etc

such that the new bind-mount for /etc that gets plugging into the
container is owned by the unprivileged containers's mount namespace then
the container can see write/open/read events on /etc/passwd and
/etc/shadow? But that bind-mount exposes the host's /etc/shadow and
/etc/passwd. That seems like a no go to me.

> 
> Signed-off-by: Amir Goldstein <amir73il@xxxxxxxxx>
> ---
>  fs/notify/fanotify/fanotify.c      |  1 +
>  fs/notify/fanotify/fanotify_user.c | 36 +++++++++++++++++++++---------
>  include/linux/fanotify.h           |  5 ++---
>  include/linux/fsnotify_backend.h   |  1 +
>  4 files changed, 30 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index 6d386080faf2..060d9bee34bd 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -1009,6 +1009,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
>  
>  static void fanotify_free_group_priv(struct fsnotify_group *group)
>  {
> +	put_user_ns(group->user_ns);
>  	kfree(group->fanotify_data.merge_hash);
>  	if (group->fanotify_data.ucounts)
>  		dec_ucount(group->fanotify_data.ucounts,
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 471c57832357..b4255b661bda 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -1499,6 +1499,7 @@ static struct hlist_head *fanotify_alloc_merge_hash(void)
>  /* fanotify syscalls */
>  SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>  {
> +	struct user_namespace *user_ns = current_user_ns();
>  	struct fsnotify_group *group;
>  	int f_flags, fd;
>  	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
> @@ -1513,10 +1514,11 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>  		/*
>  		 * An unprivileged user can setup an fanotify group with
>  		 * limited functionality - an unprivileged group is limited to
> -		 * notification events with file handles and it cannot use
> -		 * unlimited queue/marks.
> +		 * notification events with file handles or mount ids and it
> +		 * cannot use unlimited queue/marks.
>  		 */
> -		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
> +		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) ||
> +		    !(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT)))
>  			return -EPERM;
>  
>  		/*
> @@ -1595,8 +1597,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>  	}
>  
>  	/* Enforce groups limits per user in all containing user ns */
> -	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
> -						  current_euid(),
> +	group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(),
>  						  UCOUNT_FANOTIFY_GROUPS);
>  	if (!group->fanotify_data.ucounts) {
>  		fd = -EMFILE;
> @@ -1605,6 +1606,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>  
>  	group->fanotify_data.flags = flags | internal_flags;
>  	group->memcg = get_mem_cgroup_from_mm(current->mm);
> +	group->user_ns = get_user_ns(user_ns);
>  
>  	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
>  	if (!group->fanotify_data.merge_hash) {
> @@ -1804,6 +1806,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
>  	struct fsnotify_group *group;
>  	struct path path;
>  	struct fan_fsid __fsid, *fsid = NULL;
> +	struct user_namespace *user_ns = NULL;
>  	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
>  	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
>  	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
> @@ -1897,12 +1900,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
>  	}
>  
>  	/*
> -	 * An unprivileged user is not allowed to setup mount nor filesystem
> -	 * marks.  This also includes setting up such marks by a group that
> -	 * was initialized by an unprivileged user.
> +	 * A user is allowed to setup sb/mount/mntns marks only if it is
> +	 * capable in the user ns where the group was created.
>  	 */
> -	if ((!capable(CAP_SYS_ADMIN) ||
> -	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
> +	if (!ns_capable(group->user_ns, CAP_SYS_ADMIN) &&
>  	    mark_type != FAN_MARK_INODE)
>  		return -EPERM;
>  
> @@ -1987,12 +1988,27 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
>  		obj = inode;
>  	} else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
>  		obj = path.mnt;
> +		user_ns = real_mount(obj)->mnt_ns->user_ns;
>  	} else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
>  		obj = path.mnt->mnt_sb;
> +		user_ns = path.mnt->mnt_sb->s_user_ns;
>  	} else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
>  		obj = mnt_ns_from_dentry(path.dentry);
> +		user_ns = ((struct mnt_namespace *)obj)->user_ns;
>  	}
>  
> +	/*
> +	 * In addition to being capable in the user ns where group was created,
> +	 * the user also needs to be capable in the user ns associated with
> +	 * the marked filesystem (for FS_USERNS_MOUNT filesystems) or in the
> +	 * user ns associated with the mntns (when marking a mount or mntns).
> +	 * This is aligned with the required permissions to open_by_handle_at()
> +	 * a directory fid provided with the events.
> +	 */
> +	ret = -EPERM;
> +	if (user_ns && !ns_capable(user_ns, CAP_SYS_ADMIN))
> +		goto path_put_and_out;
> +
>  	ret = -EINVAL;
>  	if (!obj)
>  		goto path_put_and_out;
> diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
> index 3c817dc6292e..879cff5eccd4 100644
> --- a/include/linux/fanotify.h
> +++ b/include/linux/fanotify.h
> @@ -38,8 +38,7 @@
>  					 FAN_REPORT_PIDFD | \
>  					 FAN_REPORT_FD_ERROR | \
>  					 FAN_UNLIMITED_QUEUE | \
> -					 FAN_UNLIMITED_MARKS | \
> -					 FAN_REPORT_MNT)
> +					 FAN_UNLIMITED_MARKS)
>  
>  /*
>   * fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN.
> @@ -48,7 +47,7 @@
>   * so one of the flags for reporting file handles is required.
>   */
>  #define FANOTIFY_USER_INIT_FLAGS	(FAN_CLASS_NOTIF | \
> -					 FANOTIFY_FID_BITS | \
> +					 FANOTIFY_FID_BITS | FAN_REPORT_MNT | \
>  					 FAN_CLOEXEC | FAN_NONBLOCK)
>  
>  #define FANOTIFY_INIT_FLAGS	(FANOTIFY_ADMIN_INIT_FLAGS | \
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index fc27b53c58c2..d4034ddaf392 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -250,6 +250,7 @@ struct fsnotify_group {
>  						 * full */
>  
>  	struct mem_cgroup *memcg;	/* memcg to charge allocations */
> +	struct user_namespace *user_ns;	/* user ns where group was created */
>  
>  	/* groups can define private fields here or use the void *private */
>  	union {
> -- 
> 2.34.1
> 




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux