From: Darrick J. Wong <djwong@xxxxxxxxxx>

Add an ioctl that allows fuse servers to register block devices for use
with iomap.  This is (for now) separate from the backing file open/close
ioctl (despite using the same struct) to keep the codepaths separate.

Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx>
---
 fs/fuse/fuse_i.h          |   27 +++++
 fs/fuse/fuse_trace.h      |   62 +++++++++++
 include/uapi/linux/fuse.h |    3 +
 fs/fuse/dev.c             |   21 ++++
 fs/fuse/file_iomap.c      |  252 +++++++++++++++++++++++++++++++++++++++++++-
 fs/fuse/inode.c           |   13 ++
 6 files changed, 370 insertions(+), 8 deletions(-)

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b6dc9226f3d77f..12c462a29fe0c4 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -616,6 +616,19 @@ struct fuse_sync_bucket {
 	struct rcu_head rcu;
 };
 
+struct fuse_iomap_conn {
+	struct idr device_map;
+};
+
+struct fuse_iomap_dev {
+	struct file *file;
+	struct block_device *bdev;
+
+	/** refcount */
+	refcount_t count;
+	struct rcu_head rcu;
+};
+
 /**
  * A Fuse connection.
  *
@@ -970,6 +983,10 @@ struct fuse_conn {
 	struct fuse_ring *ring;
 #endif
 
+#ifdef CONFIG_FUSE_IOMAP
+	struct fuse_iomap_conn iomap_conn;
+#endif
+
 	/** Only used if the connection opts into request timeouts */
 	struct {
 		/* Worker for checking if any requests have timed out */
@@ -1616,9 +1633,19 @@ static inline bool fuse_has_iomap(const struct inode *inode)
 {
 	return get_fuse_conn_c(inode)->iomap;
 }
+
+bool fuse_iomap_fill_super(struct fuse_mount *fm);
+int fuse_iomap_conn_alloc(struct fuse_conn *fc);
+void fuse_iomap_conn_put(struct fuse_conn *fc);
+
+int fuse_iomap_dev_add(struct fuse_conn *fc, const struct fuse_backing_map *map);
 #else
 # define fuse_iomap_enabled(...)		(false)
 # define fuse_has_iomap(...)			(false)
+# define fuse_iomap_fill_super(...)		(true)
+# define fuse_iomap_conn_alloc(...)		(0)
+# define fuse_iomap_conn_put(...)		((void)0)
+# define fuse_iomap_dev_add(...)		(-ENOSYS)
 #endif
 
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h
index ecf9332321a1e6..5c8533053f8eed 100644
--- a/fs/fuse/fuse_trace.h
+++ b/fs/fuse/fuse_trace.h
@@ -410,6 +410,68 @@ TRACE_EVENT(fuse_iomap_end_error,
 		  __entry->pos, __entry->count, __entry->written,
 		  __entry->error)
 );
+
+TRACE_EVENT(fuse_iomap_dev_add,
+	TP_PROTO(const struct fuse_conn *fc,
+		 const struct fuse_backing_map *map),
+
+	TP_ARGS(fc, map),
+
+	TP_STRUCT__entry(
+		__field(dev_t, connection)
+		__field(int, fd)
+		__field(unsigned int, flags)
+	),
+
+	TP_fast_assign(
+		__entry->connection = fc->dev;
+		__entry->fd = map->fd;
+		__entry->flags = map->flags;
+	),
+
+	TP_printk("connection %u fd %d flags 0x%x",
+		  __entry->connection,
+		  __entry->fd,
+		  __entry->flags)
+);
+
+DECLARE_EVENT_CLASS(fuse_iomap_dev_class,
+	TP_PROTO(const struct fuse_conn *fc, unsigned int idx,
+		 const struct fuse_iomap_dev *fb),
+
+	TP_ARGS(fc, idx, fb),
+
+	TP_STRUCT__entry(
+		__field(dev_t, connection)
+		__field(unsigned int, idx)
+		__field(dev_t, bdev)
+	),
+
+	TP_fast_assign(
+		__entry->connection = fc->dev;
+		__entry->idx = idx;
+
+		if (fb) {
+			struct inode *inode = file_inode(fb->file);
+
+			__entry->bdev = inode->i_rdev;
+		} else {
+			__entry->bdev = 0;
+		}
+	),
+
+	TP_printk("connection %u idx %u dev %u:%u",
+		  __entry->connection,
+		  __entry->idx,
+		  MAJOR(__entry->bdev), MINOR(__entry->bdev))
+);
+#define DEFINE_FUSE_IOMAP_DEV_EVENT(name)		\
+DEFINE_EVENT(fuse_iomap_dev_class, name,		\
+	TP_PROTO(const struct fuse_conn *fc, unsigned int idx, \
+		 const struct fuse_iomap_dev *fb), \
+	TP_ARGS(fc, idx, fb))
+DEFINE_FUSE_IOMAP_DEV_EVENT(fuse_iomap_add_dev);
+DEFINE_FUSE_IOMAP_DEV_EVENT(fuse_iomap_remove_dev);
 #endif /* CONFIG_FUSE_IOMAP */
 
 #endif /* _TRACE_FUSE_H */
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 501f4d838e654f..2fe83fc196b021 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -239,6 +239,7 @@
  *  7.99
  *  - add FUSE_IOMAP and iomap_{begin,end,ioend} handlers for FIEMAP and
  *    SEEK_{DATA,HOLE} support
+ *  - add FUSE_DEV_IOC_IOMAP_DEV_ADD to configure block devices for iomap
  */
 
 #ifndef _LINUX_FUSE_H
@@ -1136,6 +1137,8 @@ struct fuse_backing_map {
 #define FUSE_DEV_IOC_BACKING_OPEN	_IOW(FUSE_DEV_IOC_MAGIC, 1, \
 					     struct fuse_backing_map)
 #define FUSE_DEV_IOC_BACKING_CLOSE	_IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t)
+#define FUSE_DEV_IOC_IOMAP_DEV_ADD	_IOW(FUSE_DEV_IOC_MAGIC, 3, \
+					     struct fuse_backing_map)
 
 struct fuse_lseek_in {
 	uint64_t	fh;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8dd74cbfbcc6fc..49ff2c6654e768 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2633,6 +2633,24 @@ static long fuse_dev_ioctl_backing_open(struct file *file,
 	return fuse_backing_open(fud->fc, &map);
 }
 
+static long fuse_dev_ioctl_iomap_dev_add(struct file *file,
+					 struct fuse_backing_map __user *argp)
+{
+	struct fuse_dev *fud = fuse_get_dev(file);
+	struct fuse_backing_map map;
+
+	if (!fud)
+		return -EPERM;
+
+	if (!IS_ENABLED(CONFIG_FUSE_IOMAP))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&map, argp, sizeof(map)))
+		return -EFAULT;
+
+	return fuse_iomap_dev_add(fud->fc, &map);
+}
+
 static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
 {
 	struct fuse_dev *fud = fuse_get_dev(file);
@@ -2665,6 +2683,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 	case FUSE_DEV_IOC_BACKING_CLOSE:
 		return fuse_dev_ioctl_backing_close(file, argp);
 
+	case FUSE_DEV_IOC_IOMAP_DEV_ADD:
+		return fuse_dev_ioctl_iomap_dev_add(file, argp);
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/fuse/file_iomap.c b/fs/fuse/file_iomap.c
index a206a9254df3fe..535429023d37e7 100644
--- a/fs/fuse/file_iomap.c
+++ b/fs/fuse/file_iomap.c
@@ -189,9 +189,6 @@ fuse_iomap_begin_validate(const struct fuse_iomap_begin_out *outarg,
 		return -EIO;
 	}
 
-	/* XXX: Check the device cookie */
-	ASSERT(outarg->read_dev == 0);
-
 	/* No overflows in the device range, if supplied */
 	if (outarg->read_addr != FUSE_IOMAP_NULL_ADDR &&
 	    BAD_DATA(check_add_overflow(outarg->read_addr, outarg->length, &end)))
@@ -220,6 +217,98 @@ static inline bool fuse_is_iomap_file_write(unsigned int opflags)
 	return opflags & (IOMAP_WRITE | IOMAP_ZERO | IOMAP_UNSHARE);
 }
 
+static struct fuse_iomap_dev *fuse_iomap_dev_get(struct fuse_iomap_dev *fb)
+{
+	if (fb && refcount_inc_not_zero(&fb->count))
+		return fb;
+	return NULL;
+}
+
+static void fuse_iomap_dev_free(struct fuse_iomap_dev *fb)
+{
+	if (fb->file)
+		fput(fb->file);
+	kfree_rcu(fb, rcu);
+}
+
+static void fuse_iomap_dev_put(struct fuse_iomap_dev *fb)
+{
+	if (fb && refcount_dec_and_test(&fb->count))
+		fuse_iomap_dev_free(fb);
+}
+
+static int fuse_iomap_dev_id_alloc(struct fuse_conn *fc,
+				   struct fuse_iomap_dev *fb)
+{
+	int id;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&fc->lock);
+	id = idr_alloc_cyclic(&fc->iomap_conn.device_map, fb, 1, 0,
+			      GFP_ATOMIC);
+	spin_unlock(&fc->lock);
+	idr_preload_end();
+
+	trace_fuse_iomap_add_dev(fc, id, fb);
+
+	return id;
+}
+
+static struct fuse_iomap_dev *fuse_iomap_dev_id_remove(struct fuse_conn *fc,
+						       int id)
+{
+	struct fuse_iomap_dev *fb;
+
+	spin_lock(&fc->lock);
+	fb = idr_remove(&fc->iomap_conn.device_map, id);
+	spin_unlock(&fc->lock);
+
+	if (fb)
+		trace_fuse_iomap_remove_dev(fc, id, fb);
+
+	return fb;
+}
+
+static inline struct fuse_iomap_dev *
+fuse_iomap_dev_id_find(struct fuse_conn *fc, int idx)
+{
+	struct fuse_iomap_dev *fb;
+
+	rcu_read_lock();
+	fb = idr_find(&fc->iomap_conn.device_map, idx);
+	fb = fuse_iomap_dev_get(fb);
+	rcu_read_unlock();
+
+	return fb;
+}
+
+static inline struct fuse_iomap_dev *
+fuse_iomap_find_dev(struct fuse_conn *fc, uint16_t map_type, uint32_t map_dev)
+{
+	struct fuse_iomap_dev *ret = NULL;
+
+	if (map_dev != FUSE_IOMAP_DEV_NULL && map_dev < INT_MAX)
+		ret = fuse_iomap_dev_id_find(fc, map_dev);
+
+	switch (map_type) {
+	case FUSE_IOMAP_TYPE_MAPPED:
+	case FUSE_IOMAP_TYPE_UNWRITTEN:
+		/* Mappings backed by space must have a device/addr */
+		if (BAD_DATA(ret == NULL))
+			return ERR_PTR(-EIO);
+		break;
+	}
+
+	return ret;
+}
+
+static inline void
+fuse_iomap_set_device(struct iomap *iomap, const struct fuse_iomap_dev *fb)
+{
+	iomap->bdev = fb ? fb->bdev : NULL;
+	iomap->dax_dev = NULL;
+}
+
 static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count,
 			    unsigned opflags, struct iomap *iomap,
 			    struct iomap *srcmap)
@@ -233,6 +322,8 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count,
 	};
 	struct fuse_iomap_begin_out outarg = { };
 	struct fuse_mount *fm = get_fuse_mount(inode);
+	struct fuse_iomap_dev *read_dev = NULL;
+	struct fuse_iomap_dev *write_dev = NULL;
 	FUSE_ARGS(args);
 	int err;
 
@@ -259,8 +350,21 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count,
 	if (err)
 		return err;
 
+	read_dev = fuse_iomap_find_dev(fm->fc, outarg.read_type,
+				       outarg.read_dev);
+	if (IS_ERR(read_dev))
+		return PTR_ERR(read_dev);
+
 	if (fuse_is_iomap_file_write(opflags) &&
 	    outarg.write_type != FUSE_IOMAP_TYPE_PURE_OVERWRITE) {
+
+		write_dev = fuse_iomap_find_dev(fm->fc, outarg.write_type,
+						outarg.write_dev);
+		if (IS_ERR(write_dev)) {
+			err = PTR_ERR(write_dev);
+			goto out_read_dev;
+		}
+
 		/*
 		 * For an out of place write, we must supply the write mapping
 		 * via @iomap, and the read mapping via @srcmap.
@@ -270,14 +374,14 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count,
 		iomap->length = outarg.length;
 		iomap->type = outarg.write_type;
 		iomap->flags = outarg.write_flags;
-		iomap->bdev = inode->i_sb->s_bdev;
+		fuse_iomap_set_device(iomap, write_dev);
 
 		srcmap->addr = outarg.read_addr;
 		srcmap->offset = outarg.offset;
 		srcmap->length = outarg.length;
 		srcmap->type = outarg.read_type;
 		srcmap->flags = outarg.read_flags;
-		srcmap->bdev = inode->i_sb->s_bdev;
+		fuse_iomap_set_device(srcmap, read_dev);
 	} else {
 		/*
 		 * For everything else (reads, reporting, and pure overwrites),
@@ -289,10 +393,19 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t count,
 		iomap->length = outarg.length;
 		iomap->type = outarg.read_type;
 		iomap->flags = outarg.read_flags;
-		iomap->bdev = inode->i_sb->s_bdev;
+		fuse_iomap_set_device(iomap, read_dev);
 	}
 
-	return 0;
+	/*
+	 * XXX: if we ever want to support closing devices, we need a way to
+	 * track the fuse_iomap_dev refcount all the way through bio endios.
+	 * For now we put the refcount here because you can't remove an iomap
+	 * device until unmount time.
+	 */
+	fuse_iomap_dev_put(write_dev);
+out_read_dev:
+	fuse_iomap_dev_put(read_dev);
+	return err;
 }
 
 static bool fuse_want_iomap_end(const struct iomap *iomap, unsigned int opflags,
@@ -356,3 +469,128 @@ const struct iomap_ops fuse_iomap_ops = {
 	.iomap_begin		= fuse_iomap_begin,
 	.iomap_end		= fuse_iomap_end,
 };
+
+int fuse_iomap_conn_alloc(struct fuse_conn *fc)
+{
+	idr_init(&fc->iomap_conn.device_map);
+	return 0;
+}
+
+static int fuse_iomap_dev_id_free(int id, void *p, void *data)
+{
+	struct fuse_iomap_dev *fb = p;
+	struct fuse_conn *fc = data;
+
+	trace_fuse_iomap_remove_dev(fc, id, fb);
+
+	WARN_ON_ONCE(refcount_read(&fb->count) != 1);
+	fuse_iomap_dev_free(fb);
+	return 0;
+}
+
+void fuse_iomap_conn_put(struct fuse_conn *fc)
+{
+	idr_for_each(&fc->iomap_conn.device_map, fuse_iomap_dev_id_free, fc);
+	idr_destroy(&fc->iomap_conn.device_map);
+}
+
+static struct fuse_iomap_dev *fuse_iomap_dev_alloc(struct file *file)
+{
+	struct fuse_iomap_dev *fb =
+		kmalloc(sizeof(struct fuse_iomap_dev), GFP_KERNEL);
+
+	if (!fb)
+		return NULL;
+
+	fb->file = file;
+	fb->bdev = I_BDEV(file->f_mapping->host);
+	refcount_set(&fb->count, 1);
+
+	return fb;
+}
+
+bool fuse_iomap_fill_super(struct fuse_mount *fm)
+{
+	struct fuse_conn *fc = fm->fc;
+	struct super_block *sb = fm->sb;
+	int res;
+
+	if (sb->s_bdev) {
+		/*
+		 * Try to install s_bdev as the first iomap device, if this
+		 * is a block-device filesystem.  fuse_iomap_dev_free() drops
+		 * a file reference, so take our own instead of borrowing
+		 * the one owned by the superblock.
+		 */
+		struct file *bdev_file = get_file(sb->s_bdev_file);
+		struct fuse_iomap_dev *fb = fuse_iomap_dev_alloc(bdev_file);
+
+		if (!fb) {
+			fput(bdev_file);
+			return false;
+		}
+
+		res = fuse_iomap_dev_id_alloc(fc, fb);
+		if (res < 0) {
+			/* Never published in the idr; free it directly */
+			fuse_iomap_dev_free(fb);
+			return false;
+		}
+		if (res != 1) {
+			struct fuse_iomap_dev *bad =
+				fuse_iomap_dev_id_remove(fc, res);
+
+			ASSERT(res == 1);
+			ASSERT(bad == fb);
+			fuse_iomap_dev_put(bad);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+int fuse_iomap_dev_add(struct fuse_conn *fc, const struct fuse_backing_map *map)
+{
+	struct file *file;
+	struct fuse_iomap_dev *fb = NULL;
+	int res;
+
+	trace_fuse_iomap_dev_add(fc, map);
+
+	res = -EPERM;
+	if (!fc->iomap)
+		goto out;
+
+	res = -EINVAL;
+	if (map->flags || map->padding)
+		goto out;
+
+	/* O_PATH fds were never opened, so f_mapping is not the bdev's */
+	file = fget(map->fd);
+	res = -EBADF;
+	if (!file)
+		goto out;
+
+	res = -ENODEV;
+	if (!S_ISBLK(file_inode(file)->i_mode))
+		goto out_fput;
+
+	res = -ENOMEM;
+	fb = fuse_iomap_dev_alloc(file);
+	if (!fb)
+		goto out_fput;
+
+	res = fuse_iomap_dev_id_alloc(fc, fb);
+	if (res < 0) {
+		fuse_iomap_dev_free(fb);
+		goto out;
+	}
+
+	return res;
+
+out_fput:
+	fput(file);
+out:
+	return res;
+}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6173795d3826d0..8266f30bc8a954 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1015,6 +1015,7 @@ void fuse_conn_put(struct fuse_conn *fc)
 		struct fuse_iqueue *fiq = &fc->iq;
 		struct fuse_sync_bucket *bucket;
 
+		fuse_iomap_conn_put(fc);
 		if (IS_ENABLED(CONFIG_FUSE_DAX))
 			fuse_dax_conn_free(fc);
 		if (fc->timeout.req_timeout)
@@ -1454,6 +1455,9 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 
 		init_server_timeout(fc, timeout);
 
+		if (fc->iomap && !fuse_iomap_fill_super(fm))
+			ok = false;
+
 		fm->sb->s_bdi->ra_pages =
 				min(fm->sb->s_bdi->ra_pages, ra_pages);
 		fc->minor = arg->minor;
@@ -1823,10 +1827,15 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 
 	sb->s_subtype = ctx->subtype;
 	ctx->subtype = NULL;
+
+	err = fuse_iomap_conn_alloc(fc);
+	if (err)
+		goto err;
+
 	if (IS_ENABLED(CONFIG_FUSE_DAX)) {
 		err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev);
 		if (err)
-			goto err;
+			goto err_free_iomap;
 	}
 
 	if (ctx->fudptr) {
@@ -1888,6 +1897,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
  err_dev_free:
 	if (fud)
 		fuse_dev_free(fud);
+ err_free_iomap:
+	fuse_iomap_conn_put(fc);
  err_free_dax:
 	if (IS_ENABLED(CONFIG_FUSE_DAX))
 		fuse_dax_conn_free(fc);