On Thu, May 22, 2025 at 2:03 AM Darrick J. Wong <djwong@xxxxxxxxxx> wrote: > > From: Darrick J. Wong <djwong@xxxxxxxxxx> > > Add a new notification so that fuse servers can add extra block devices > to use with iomap. > > Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx> > --- > fs/fuse/fuse_i.h | 19 +++++++ > fs/fuse/fuse_trace.h | 36 ++++++++++++++ > include/uapi/linux/fuse.h | 8 +++ > fs/fuse/dev.c | 23 +++++++++ > fs/fuse/file_iomap.c | 119 ++++++++++++++++++++++++++++++++++++++++++++- > fs/fuse/inode.c | 9 +++ > 6 files changed, 211 insertions(+), 3 deletions(-) > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index aa51f25856697d..4eb75ed90db300 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -619,6 +619,12 @@ struct fuse_sync_bucket { > struct rcu_head rcu; > }; > > +struct fuse_iomap { > + /* array of file objects that reference block devices for iomap */ > + struct file **files; > + unsigned int nr_files; > +}; > + > /** > * A Fuse connection. > * > @@ -970,6 +976,10 @@ struct fuse_conn { > struct fuse_ring *ring; > #endif > > +#ifdef CONFIG_FUSE_IOMAP > + struct fuse_iomap iomap_conn; > +#endif > + > /** Only used if the connection opts into request timeouts */ > struct { > /* Worker for checking if any requests have timed out */ > @@ -1610,9 +1620,18 @@ static inline bool fuse_has_iomap(const struct inode *inode) > { > return get_fuse_conn_c(inode)->iomap; > } > + > +void fuse_iomap_init_reply(struct fuse_mount *fm); > +void fuse_iomap_conn_put(struct fuse_conn *fc); > + > +int fuse_iomap_add_device(struct fuse_conn *fc, > + const struct fuse_iomap_add_device_out *outarg); > #else > # define fuse_iomap_enabled(...) (false) > # define fuse_has_iomap(...) (false) > +# define fuse_iomap_init_reply(...) ((void)0) > +# define fuse_iomap_conn_put(...) ((void)0) > +# define fuse_iomap_add_device(...) 
(-ENOSYS) > #endif > > #endif /* _FS_FUSE_I_H */ > diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h > index f9a316c9788e06..e1a2e491d2581a 100644 > --- a/fs/fuse/fuse_trace.h > +++ b/fs/fuse/fuse_trace.h > @@ -380,6 +380,42 @@ TRACE_EVENT(fuse_iomap_end_error, > __entry->pos, __entry->count, __entry->written, > __entry->error) > ); > + > +TRACE_EVENT(fuse_iomap_dev_class, > + TP_PROTO(const struct fuse_conn *fc, unsigned int idx, > + const struct file *file), > + > + TP_ARGS(fc, idx, file), > + > + TP_STRUCT__entry( > + __field(dev_t, connection) > + __field(unsigned int, idx) > + __field(dev_t, bdev) > + ), > + > + TP_fast_assign( > + struct inode *inode = file_inode(file); > + > + __entry->connection = fc->dev; > + __entry->idx = idx; > + if (S_ISBLK(inode->i_mode)) { > + __entry->bdev = inode->i_rdev; > + } else > + __entry->bdev = 0; > + ), > + > + TP_printk("connection %u idx %u dev %u:%u", > + __entry->connection, > + __entry->idx, > + MAJOR(__entry->bdev), MINOR(__entry->bdev)) > +); > +#define DEFINE_FUSE_IOMAP_DEV_EVENT(name) \ > +DEFINE_EVENT(fuse_iomap_dev_class, name, \ > + TP_PROTO(const struct fuse_conn *fc, unsigned int idx, \ > + const struct file *file), \ > + TP_ARGS(fc, idx, file)) > +DEFINE_FUSE_IOMAP_DEV_EVENT(fuse_iomap_add_dev); > +DEFINE_FUSE_IOMAP_DEV_EVENT(fuse_iomap_remove_dev); > #endif /* CONFIG_FUSE_IOMAP */ > > #endif /* _TRACE_FUSE_H */ > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index ce6c9960f2418f..ea8992e980a015 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -236,6 +236,7 @@ > * 7.44 > * - add FUSE_IOMAP and iomap_{begin,end,ioend} handlers for FIEMAP and > * SEEK_{DATA,HOLE} support > + * - add FUSE_NOTIFY_ADD_IOMAP_DEVICE for multi-device filesystems > */ > > #ifndef _LINUX_FUSE_H > @@ -681,6 +682,7 @@ enum fuse_notify_code { > FUSE_NOTIFY_RETRIEVE = 5, > FUSE_NOTIFY_DELETE = 6, > FUSE_NOTIFY_RESEND = 7, > + FUSE_NOTIFY_ADD_IOMAP_DEVICE = 8, > 
FUSE_NOTIFY_CODE_MAX, > }; > > @@ -1371,4 +1373,10 @@ struct fuse_iomap_end_in { > uint32_t map_dev; /* device cookie * */ > }; > > +struct fuse_iomap_add_device_out { > + int32_t fd; /* fd of the open device to add */ > + uint32_t reserved; /* must be zero */ > + uint32_t *map_dev; /* location to receive device cookie */ > +}; > + > #endif /* _LINUX_FUSE_H */ > diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c > index 6dcbaa218b7a16..9d7064ec170cf6 100644 > --- a/fs/fuse/dev.c > +++ b/fs/fuse/dev.c > @@ -1824,6 +1824,26 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, > return err; > } > > +static int fuse_notify_add_iomap_device(struct fuse_conn *fc, unsigned int size, > + struct fuse_copy_state *cs) > +{ > + struct fuse_iomap_add_device_out outarg; > + int err = -EINVAL; > + > + if (size != sizeof(outarg)) > + goto err; > + > + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); > + if (err) > + goto err; > + fuse_copy_finish(cs); > + > + return fuse_iomap_add_device(fc, &outarg); > +err: > + fuse_copy_finish(cs); > + return err; > +} > + > struct fuse_retrieve_args { > struct fuse_args_pages ap; > struct fuse_notify_retrieve_in inarg; > @@ -2049,6 +2069,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, > case FUSE_NOTIFY_RESEND: > return fuse_notify_resend(fc); > > + case FUSE_NOTIFY_ADD_IOMAP_DEVICE: > + return fuse_notify_add_iomap_device(fc, size, cs); > + > default: > fuse_copy_finish(cs); > return -EINVAL; > diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c > index dfa0c309803113..faefd29a273bf3 100644 > --- a/fs/fuse/file_iomap.c > +++ b/fs/fuse/file_iomap.c > @@ -142,6 +142,26 @@ static inline int fuse_iomap_validate(const struct fuse_iomap_begin_out *outarg, > return 0; > } > > +static inline struct block_device *fuse_iomap_bdev(struct fuse_mount *fm, > + unsigned int idx) > +{ > + struct fuse_conn *fc = fm->fc; > + struct file *file = NULL; > + > + spin_lock(&fc->lock); > + if (idx < 
fc->iomap_conn.nr_files) > + file = fc->iomap_conn.files[idx]; > + spin_unlock(&fc->lock); > + > + if (!file) > + return NULL; > + > + if (!S_ISBLK(file_inode(file)->i_mode)) > + return NULL; > + > + return I_BDEV(file->f_mapping->host); > +} > + > static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count, > unsigned opflags, struct iomap *iomap, > struct iomap *srcmap) > @@ -155,6 +175,7 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count, > }; > struct fuse_iomap_begin_out outarg = { }; > struct fuse_mount *fm = get_fuse_mount(inode); > + struct block_device *read_bdev; > FUSE_ARGS(args); > int err; > > @@ -181,8 +202,18 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count, > if (err) > return err; > > + read_bdev = fuse_iomap_bdev(fm, outarg.read_dev); > + if (!read_bdev) > + return -ENODEV; > + > if ((opflags & IOMAP_WRITE) && > outarg.write_type != FUSE_IOMAP_TYPE_PURE_OVERWRITE) { > + struct block_device *write_bdev = > + fuse_iomap_bdev(fm, outarg.write_dev); > + > + if (!write_bdev) > + return -ENODEV; > + > /* > * For an out of place write, we must supply the write mapping > * via @iomap, and the read mapping via @srcmap. 
> @@ -192,14 +223,14 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count, > iomap->length = outarg.length; > iomap->type = outarg.write_type; > iomap->flags = outarg.write_flags; > - iomap->bdev = inode->i_sb->s_bdev; > + iomap->bdev = write_bdev; > > srcmap->addr = outarg.read_addr; > srcmap->offset = outarg.offset; > srcmap->length = outarg.length; > srcmap->type = outarg.read_type; > srcmap->flags = outarg.read_flags; > - srcmap->bdev = inode->i_sb->s_bdev; > + srcmap->bdev = read_bdev; > } else { > /* > * For everything else (reads, reporting, and pure overwrites), > @@ -211,7 +242,7 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count, > iomap->length = outarg.length; > iomap->type = outarg.read_type; > iomap->flags = outarg.read_flags; > - iomap->bdev = inode->i_sb->s_bdev; > + iomap->bdev = read_bdev; > } > > return 0; > @@ -278,3 +309,85 @@ const struct iomap_ops fuse_iomap_ops = { > .iomap_begin = fuse_iomap_begin, > .iomap_end = fuse_iomap_end, > }; > + > +void fuse_iomap_conn_put(struct fuse_conn *fc) > +{ > + unsigned int i; > + > + for (i = 0; i < fc->iomap_conn.nr_files; i++) { > + struct file *file = fc->iomap_conn.files[i]; > + > + trace_fuse_iomap_remove_dev(fc, i, file); > + > + fc->iomap_conn.files[i] = NULL; > + fput(file); > + } > + > + kfree(fc->iomap_conn.files); > + fc->iomap_conn.nr_files = 0; > +} > + > +/* Add a bdev to the fuse connection, returns the index or a negative errno */ > +static int __fuse_iomap_add_device(struct fuse_conn *fc, struct file *file) > +{ > + struct file **new_files; > + int ret; > + > + if (fc->iomap_conn.nr_files >= PAGE_SIZE / sizeof(unsigned int)) > + return -EMFILE; > + > + new_files = krealloc_array(fc->iomap_conn.files, > + fc->iomap_conn.nr_files + 1, > + sizeof(struct file *), > + GFP_KERNEL | __GFP_ZERO); > + if (!new_files) > + return -ENOMEM; > + > + spin_lock(&fc->lock); > + fc->iomap_conn.files = new_files; > + 
fc->iomap_conn.files[fc->iomap_conn.nr_files] = get_file(file); > + ret = fc->iomap_conn.nr_files++; > + spin_unlock(&fc->lock); > + > + trace_fuse_iomap_add_dev(fc, ret, file); > + > + return ret; > +} > + > +void fuse_iomap_init_reply(struct fuse_mount *fm) > +{ > + struct fuse_conn *fc = fm->fc; > + struct super_block *sb = fm->sb; > + > + if (sb->s_bdev) > + __fuse_iomap_add_device(fc, sb->s_bdev_file); > +} > + > +int fuse_iomap_add_device(struct fuse_conn *fc, > + const struct fuse_iomap_add_device_out *outarg) > +{ > + struct file *file; > + int ret; > + > + if (!fc->iomap) > + return -EINVAL; > + > + if (outarg->reserved) > + return -EINVAL; > + > + CLASS(fd, somefd)(outarg->fd); > + if (fd_empty(somefd)) > + return -EBADF; > + file = fd_file(somefd); > + > + if (!S_ISBLK(file_inode(file)->i_mode)) > + return -ENODEV; > + > + down_read(&fc->killsb); > + ret = __fuse_iomap_add_device(fc, file); > + up_read(&fc->killsb); > + if (ret < 0) > + return ret; > + > + return put_user(ret, outarg->map_dev); > +} This very much reminds me of FUSE_DEV_IOC_BACKING_OPEN, which gives the kernel an fd to remember for later file operations. FUSE_DEV_IOC_BACKING_OPEN was implemented as an ioctl because of security concerns about passing an fd to the kernel via write(). Speaking of security concerns, we need to consider whether this requires some privileges to allow setting up direct access to a blockdev. But also, apart from the fact that those are block device fds, how does iomap_conn.files[] differ from fc->backing_files_map? Miklos had envisioned this (backing blockdev) use case as one of the special cases of fuse passthrough. Instead of an identity mapping to a backing file created at open time, it's an extent mapping to a backing blockdev created at data access time. I am not saying that you need to reuse anything from fuse passthrough code, because the use cases probably do not overlap, but hopefully, you can avoid falling into the same pitfalls that we have already managed to avoid. Thanks, Amir.