Re: [PATCH 27/32] nsfs: support file handles

Amir Goldstein <amir73il@xxxxxxxxx> · Wed, 10 Sep 2025 19:21:22 +0200

On Wed, Sep 10, 2025 at 4:39 PM Christian Brauner <brauner@xxxxxxxxxx> wrote:
>
> A while ago we added support for file handles to pidfs so pidfds can be
> encoded and decoded as file handles. Userspace has adopted this quickly
> and it's proven very useful.

> Pidfd file handles are exhaustive meaning
> they don't require a handle on another pidfd to pass to
> open_by_handle_at() so it can derive the filesystem to decode in.
>
> Implement the exhaustive file handles for namespaces as well.

I think you decide to split the "exhaustive" part to another patch,
so better drop this paragraph?

I am missing an explanation about the permissions for
opening these file handles.

My understanding of the code is that the opener needs to meet one of
the conditions:
1. user has CAP_SYS_ADMIN in the userns owning the opened namespace
2. current task is in the opened namespace

But I do not fully understand the rationale behind the 2nd condition,
that is, when is it useful?
And as far as I can tell, your selftest does not cover this condition
(only both true or both false)?

I suggest to start with allowing only the useful and important
cases, so if cond #1 is useful enough, drop cond #2 and we can add
it later if needed and then your selftests already cover cond #1 true and false.

>
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>

After documenting the permissions, with ot without dropping cond #2
feel free to add:

Reviewed-by: Amir Goldstein <amir73il@xxxxxxxxx>

> ---
>  fs/nsfs.c                | 176 +++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/exportfs.h |   6 ++
>  2 files changed, 182 insertions(+)
>
> diff --git a/fs/nsfs.c b/fs/nsfs.c
> index 6f8008177133..a1585a2f4f03 100644
> --- a/fs/nsfs.c
> +++ b/fs/nsfs.c
> @@ -13,6 +13,12 @@
>  #include <linux/nsfs.h>
>  #include <linux/uaccess.h>
>  #include <linux/mnt_namespace.h>
> +#include <linux/ipc_namespace.h>
> +#include <linux/time_namespace.h>
> +#include <linux/utsname.h>
> +#include <linux/exportfs.h>
> +#include <linux/nstree.h>
> +#include <net/net_namespace.h>
>
>  #include "mount.h"
>  #include "internal.h"
> @@ -417,12 +423,182 @@ static const struct stashed_operations nsfs_stashed_ops = {
>         .put_data = nsfs_put_data,
>  };
>
> +struct nsfs_fid {
> +       u64 ns_id;
> +       u32 ns_type;
> +       u32 ns_inum;
> +} __attribute__ ((packed));
> +
> +#define NSFS_FID_SIZE (sizeof(struct nsfs_fid) / sizeof(u32))
> +
> +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
> +                         struct inode *parent)
> +{
> +       struct nsfs_fid *fid = (struct nsfs_fid *)fh;
> +       struct ns_common *ns = inode->i_private;
> +       int len = *max_len;
> +
> +       /*
> +        * TODO:
> +        * For hierarchical namespaces we should start to encode the
> +        * parent namespace. Then userspace can walk a namespace
> +        * hierarchy purely based on file handles.
> +        */
> +       if (parent)
> +               return FILEID_INVALID;
> +
> +       if (len < NSFS_FID_SIZE) {
> +               *max_len = NSFS_FID_SIZE;
> +               return FILEID_INVALID;
> +       }
> +
> +       len  = NSFS_FID_SIZE;
> +
> +       fid->ns_id = ns->ns_id;
> +       fid->ns_type = ns->ops->type;
> +       fid->ns_inum = inode->i_ino;
> +       *max_len = len;
> +       return FILEID_NSFS;
> +}
> +
> +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
> +                                       int fh_len, int fh_type)
> +{
> +       struct path path __free(path_put) = {};
> +       struct nsfs_fid *fid = (struct nsfs_fid *)fh;
> +       struct user_namespace *owning_ns = NULL;
> +       struct ns_common *ns;
> +       int ret;
> +
> +       if (fh_len < NSFS_FID_SIZE)
> +               return NULL;
> +
> +       switch (fh_type) {
> +       case FILEID_NSFS:
> +               break;
> +       default:
> +               return NULL;
> +       }
> +
> +       scoped_guard(rcu) {
> +               ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
> +               if (!ns)
> +                       return NULL;
> +
> +               VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
> +               VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type);
> +               VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum);
> +
> +               if (!refcount_inc_not_zero(&ns->count))
> +                       return NULL;
> +       }
> +
> +       switch (ns->ops->type) {
> +#ifdef CONFIG_CGROUPS
> +       case CLONE_NEWCGROUP:
> +               if (!current_in_namespace(to_cg_ns(ns)))
> +                       owning_ns = to_cg_ns(ns)->user_ns;
> +               break;
> +#endif
> +#ifdef CONFIG_IPC_NS
> +       case CLONE_NEWIPC:
> +               if (!current_in_namespace(to_ipc_ns(ns)))
> +                       owning_ns = to_ipc_ns(ns)->user_ns;
> +               break;
> +#endif
> +       case CLONE_NEWNS:
> +               if (!current_in_namespace(to_mnt_ns(ns)))
> +                       owning_ns = to_mnt_ns(ns)->user_ns;
> +               break;
> +#ifdef CONFIG_NET_NS
> +       case CLONE_NEWNET:
> +               if (!current_in_namespace(to_net_ns(ns)))
> +                       owning_ns = to_net_ns(ns)->user_ns;
> +               break;
> +#endif
> +#ifdef CONFIG_PID_NS
> +       case CLONE_NEWPID:
> +               if (!current_in_namespace(to_pid_ns(ns))) {
> +                       owning_ns = to_pid_ns(ns)->user_ns;
> +               } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) {
> +                       ns->ops->put(ns);
> +                       return ERR_PTR(-EPERM);
> +               }
> +               break;
> +#endif
> +#ifdef CONFIG_TIME_NS
> +       case CLONE_NEWTIME:
> +               if (!current_in_namespace(to_time_ns(ns)))
> +                       owning_ns = to_time_ns(ns)->user_ns;
> +               break;
> +#endif
> +#ifdef CONFIG_USER_NS
> +       case CLONE_NEWUSER:
> +               if (!current_in_namespace(to_user_ns(ns)))
> +                       owning_ns = to_user_ns(ns);
> +               break;
> +#endif
> +#ifdef CONFIG_UTS_NS
> +       case CLONE_NEWUTS:
> +               if (!current_in_namespace(to_uts_ns(ns)))
> +                       owning_ns = to_uts_ns(ns)->user_ns;
> +               break;
> +#endif
> +       default:
> +               return ERR_PTR(-EOPNOTSUPP);
> +       }
> +
> +       if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) {
> +               ns->ops->put(ns);
> +               return ERR_PTR(-EPERM);
> +       }
> +
> +       /* path_from_stashed() unconditionally consumes the reference. */
> +       ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
> +       if (ret)
> +               return ERR_PTR(ret);
> +
> +       return no_free_ptr(path.dentry);
> +}
> +
> +/*
> + * Make sure that we reject any nonsensical flags that users pass via
> + * open_by_handle_at().
> + */
> +#define VALID_FILE_HANDLE_OPEN_FLAGS \
> +       (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
> +
> +static int nsfs_export_permission(struct handle_to_path_ctx *ctx,
> +                                  unsigned int oflags)
> +{
> +       if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
> +               return -EINVAL;
> +
> +       /* nsfs_fh_to_dentry() is performs further permission checks. */
> +       return 0;
> +}
> +
> +static struct file *nsfs_export_open(struct path *path, unsigned int oflags)
> +{
> +       /* Clear O_LARGEFILE as open_by_handle_at() forces it. */
> +       oflags &= ~O_LARGEFILE;
> +       return file_open_root(path, "", oflags, 0);
> +}
> +
> +static const struct export_operations nsfs_export_operations = {
> +       .encode_fh      = nsfs_encode_fh,
> +       .fh_to_dentry   = nsfs_fh_to_dentry,
> +       .open           = nsfs_export_open,
> +       .permission     = nsfs_export_permission,
> +};
> +
>  static int nsfs_init_fs_context(struct fs_context *fc)
>  {
>         struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
>         if (!ctx)
>                 return -ENOMEM;
>         ctx->ops = &nsfs_ops;
> +       ctx->eops = &nsfs_export_operations;
>         ctx->dops = &ns_dentry_operations;
>         fc->s_fs_info = (void *)&nsfs_stashed_ops;
>         return 0;
> diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
> index cfb0dd1ea49c..3aac58a520c7 100644
> --- a/include/linux/exportfs.h
> +++ b/include/linux/exportfs.h
> @@ -122,6 +122,12 @@ enum fid_type {
>         FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1,
>         FILEID_BCACHEFS_WITH_PARENT = 0xb2,
>
> +       /*
> +        *
> +        * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number.
> +        */
> +       FILEID_NSFS = 0xf1,
> +
>         /*
>          * 64 bit unique kernfs id
>          */
>
> --
> 2.47.3
>