On Wed, Sep 10, 2025 at 4:39 PM Christian Brauner <brauner@xxxxxxxxxx> wrote: > > A while ago we added support for file handles to pidfs so pidfds can be > encoded and decoded as file handles. Userspace has adopted this quickly > and it's proven very useful. > Pidfd file handles are exhaustive meaning > they don't require a handle on another pidfd to pass to > open_by_handle_at() so it can derive the filesystem to decode in. > > Implement the exhaustive file handles for namespaces as well. I think you decide to split the "exhaustive" part to another patch, so better drop this paragraph? I am missing an explanation about the permissions for opening these file handles. My understanding of the code is that the opener needs to meet one of the conditions: 1. user has CAP_SYS_ADMIN in the userns owning the opened namespace 2. current task is in the opened namespace But I do not fully understand the rationale behind the 2nd condition, that is, when is it useful? And as far as I can tell, your selftest does not cover this condition (only both true or both false)? I suggest to start with allowing only the useful and important cases, so if cond #1 is useful enough, drop cond #2 and we can add it later if needed and then your selftests already cover cond #1 true and false. > > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> After documenting the permissions, with ot without dropping cond #2 feel free to add: Reviewed-by: Amir Goldstein <amir73il@xxxxxxxxx> > --- > fs/nsfs.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/exportfs.h | 6 ++ > 2 files changed, 182 insertions(+) > > diff --git a/fs/nsfs.c b/fs/nsfs.c > index 6f8008177133..a1585a2f4f03 100644 > --- a/fs/nsfs.c > +++ b/fs/nsfs.c > @@ -13,6 +13,12 @@ > #include <linux/nsfs.h> > #include <linux/uaccess.h> > #include <linux/mnt_namespace.h> > +#include <linux/ipc_namespace.h> > +#include <linux/time_namespace.h> > +#include <linux/utsname.h> > +#include <linux/exportfs.h> > +#include <linux/nstree.h> > +#include <net/net_namespace.h> > > #include "mount.h" > #include "internal.h" > @@ -417,12 +423,182 @@ static const struct stashed_operations nsfs_stashed_ops = { > .put_data = nsfs_put_data, > }; > > +struct nsfs_fid { > + u64 ns_id; > + u32 ns_type; > + u32 ns_inum; > +} __attribute__ ((packed)); > + > +#define NSFS_FID_SIZE (sizeof(struct nsfs_fid) / sizeof(u32)) > + > +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, > + struct inode *parent) > +{ > + struct nsfs_fid *fid = (struct nsfs_fid *)fh; > + struct ns_common *ns = inode->i_private; > + int len = *max_len; > + > + /* > + * TODO: > + * For hierarchical namespaces we should start to encode the > + * parent namespace. Then userspace can walk a namespace > + * hierarchy purely based on file handles. > + */ > + if (parent) > + return FILEID_INVALID; > + > + if (len < NSFS_FID_SIZE) { > + *max_len = NSFS_FID_SIZE; > + return FILEID_INVALID; > + } > + > + len = NSFS_FID_SIZE; > + > + fid->ns_id = ns->ns_id; > + fid->ns_type = ns->ops->type; > + fid->ns_inum = inode->i_ino; > + *max_len = len; > + return FILEID_NSFS; > +} > + > +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, > + int fh_len, int fh_type) > +{ > + struct path path __free(path_put) = {}; > + struct nsfs_fid *fid = (struct nsfs_fid *)fh; > + struct user_namespace *owning_ns = NULL; > + struct ns_common *ns; > + int ret; > + > + if (fh_len < NSFS_FID_SIZE) > + return NULL; > + > + switch (fh_type) { > + case FILEID_NSFS: > + break; > + default: > + return NULL; > + } > + > + scoped_guard(rcu) { > + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); > + if (!ns) > + return NULL; > + > + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); > + VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); > + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); > + > + if (!refcount_inc_not_zero(&ns->count)) > + return NULL; > + } > + > + switch (ns->ops->type) { > +#ifdef CONFIG_CGROUPS > + case CLONE_NEWCGROUP: > + if (!current_in_namespace(to_cg_ns(ns))) > + owning_ns = to_cg_ns(ns)->user_ns; > + break; > +#endif > +#ifdef CONFIG_IPC_NS > + case CLONE_NEWIPC: > + if (!current_in_namespace(to_ipc_ns(ns))) > + owning_ns = to_ipc_ns(ns)->user_ns; > + break; > +#endif > + case CLONE_NEWNS: > + if (!current_in_namespace(to_mnt_ns(ns))) > + owning_ns = to_mnt_ns(ns)->user_ns; > + break; > +#ifdef CONFIG_NET_NS > + case CLONE_NEWNET: > + if (!current_in_namespace(to_net_ns(ns))) > + owning_ns = to_net_ns(ns)->user_ns; > + break; > +#endif > +#ifdef CONFIG_PID_NS > + case CLONE_NEWPID: > + if (!current_in_namespace(to_pid_ns(ns))) { > + owning_ns = to_pid_ns(ns)->user_ns; > + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { > + ns->ops->put(ns); > + return ERR_PTR(-EPERM); > + } > + break; > +#endif > +#ifdef CONFIG_TIME_NS > + case CLONE_NEWTIME: > + if (!current_in_namespace(to_time_ns(ns))) > + owning_ns = to_time_ns(ns)->user_ns; > + break; > +#endif > +#ifdef CONFIG_USER_NS > + case CLONE_NEWUSER: > + if (!current_in_namespace(to_user_ns(ns))) > + owning_ns = to_user_ns(ns); > + break; > +#endif > +#ifdef CONFIG_UTS_NS > + case CLONE_NEWUTS: > + if (!current_in_namespace(to_uts_ns(ns))) > + owning_ns = to_uts_ns(ns)->user_ns; > + break; > +#endif > + default: > + return ERR_PTR(-EOPNOTSUPP); > + } > + > + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { > + ns->ops->put(ns); > + return ERR_PTR(-EPERM); > + } > + > + /* path_from_stashed() unconditionally consumes the reference. */ > + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); > + if (ret) > + return ERR_PTR(ret); > + > + return no_free_ptr(path.dentry); > +} > + > +/* > + * Make sure that we reject any nonsensical flags that users pass via > + * open_by_handle_at(). > + */ > +#define VALID_FILE_HANDLE_OPEN_FLAGS \ > + (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) > + > +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, > + unsigned int oflags) > +{ > + if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) > + return -EINVAL; > + > + /* nsfs_fh_to_dentry() is performs further permission checks. */ > + return 0; > +} > + > +static struct file *nsfs_export_open(struct path *path, unsigned int oflags) > +{ > + /* Clear O_LARGEFILE as open_by_handle_at() forces it. */ > + oflags &= ~O_LARGEFILE; > + return file_open_root(path, "", oflags, 0); > +} > + > +static const struct export_operations nsfs_export_operations = { > + .encode_fh = nsfs_encode_fh, > + .fh_to_dentry = nsfs_fh_to_dentry, > + .open = nsfs_export_open, > + .permission = nsfs_export_permission, > +}; > + > static int nsfs_init_fs_context(struct fs_context *fc) > { > struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); > if (!ctx) > return -ENOMEM; > ctx->ops = &nsfs_ops; > + ctx->eops = &nsfs_export_operations; > ctx->dops = &ns_dentry_operations; > fc->s_fs_info = (void *)&nsfs_stashed_ops; > return 0; > diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h > index cfb0dd1ea49c..3aac58a520c7 100644 > --- a/include/linux/exportfs.h > +++ b/include/linux/exportfs.h > @@ -122,6 +122,12 @@ enum fid_type { > FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, > FILEID_BCACHEFS_WITH_PARENT = 0xb2, > > + /* > + * > + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. > + */ > + FILEID_NSFS = 0xf1, > + > /* > * 64 bit unique kernfs id > */ > > -- > 2.47.3 >