On Mon, 2025-04-21 at 13:49 +0200, Bernd Schubert wrote:
>
> On 4/19/25 02:06, Joanne Koong wrote:
> > For direct io writes, splice is disabled when forwarding pages from the
> > client to the server. This is because the pages in the pipe buffer are
> > user pages, which are controlled by the client. Thus if a server replies
> > to the request and then keeps accessing the pages afterwards, there is
> > the possibility that the client may have modified the contents of the
> > pages in the meantime. More context on this can be found in commit
> > 0c4bcfdecb1a ("fuse: fix pipe buffer lifetime for direct_io").
> >
> > For servers that do not need to access pages after answering the
> > request, splice gives a non-trivial improvement in performance.
> > Benchmarks show roughly a 40% speedup.
> >
> > Allow servers to enable zero-copy splice for servicing client direct io
> > writes. By enabling this, the server acknowledges that it must not
> > continue accessing the pipe buffer after completing the request, or it
> > may see incorrect data if it does so.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@xxxxxxxxx>
> > ---
> > fs/fuse/dev.c | 18 ++++++++++--------
> > fs/fuse/dev_uring.c | 4 ++--
> > fs/fuse/fuse_dev_i.h | 5 +++--
> > fs/fuse/fuse_i.h | 3 +++
> > fs/fuse/inode.c | 5 ++++-
> > include/uapi/linux/fuse.h | 8 ++++++++
> > 6 files changed, 30 insertions(+), 13 deletions(-)
> >
> > diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> > index 67d07b4c778a..1b0ea8593f74 100644
> > --- a/fs/fuse/dev.c
> > +++ b/fs/fuse/dev.c
> > @@ -816,12 +816,13 @@ static int unlock_request(struct fuse_req *req)
> > return err;
> > }
> >
> > -void fuse_copy_init(struct fuse_copy_state *cs, bool write,
> > - struct iov_iter *iter)
> > +void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
> > + bool write, struct iov_iter *iter)
> > {
> > memset(cs, 0, sizeof(*cs));
> > cs->write = write;
> > cs->iter = iter;
> > + cs->splice_read_user_pages = fc->splice_read_user_pages;
> > }
> >
> > /* Unmap and put previous page of userspace buffer */
> > @@ -1105,9 +1106,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
> > if (cs->write && cs->pipebufs && page) {
> > /*
> > * Can't control lifetime of pipe buffers, so always
> > - * copy user pages.
> > + * copy user pages if server does not support splice
> > + * for reading user pages.
> > */
> > - if (cs->req->args->user_pages) {
> > + if (cs->req->args->user_pages && !cs->splice_read_user_pages) {
> > err = fuse_copy_fill(cs);
> > if (err)
> > return err;
> > @@ -1538,7 +1540,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
> > if (!user_backed_iter(to))
> > return -EINVAL;
> >
> > - fuse_copy_init(&cs, true, to);
> > + fuse_copy_init(&cs, fud->fc, true, to);
> >
> > return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to));
> > }
> > @@ -1561,7 +1563,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
> > if (!bufs)
> > return -ENOMEM;
> >
> > - fuse_copy_init(&cs, true, NULL);
> > + fuse_copy_init(&cs, fud->fc, true, NULL);
> > cs.pipebufs = bufs;
> > cs.pipe = pipe;
> > ret = fuse_dev_do_read(fud, in, &cs, len);
> > @@ -2227,7 +2229,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
> > if (!user_backed_iter(from))
> > return -EINVAL;
> >
> > - fuse_copy_init(&cs, false, from);
> > + fuse_copy_init(&cs, fud->fc, false, from);
> >
> > return fuse_dev_do_write(fud, &cs, iov_iter_count(from));
> > }
> > @@ -2301,7 +2303,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
> > }
> > pipe_unlock(pipe);
> >
> > - fuse_copy_init(&cs, false, NULL);
> > + fuse_copy_init(&cs, fud->fc, false, NULL);
> > cs.pipebufs = bufs;
> > cs.nr_segs = nbuf;
> > cs.pipe = pipe;
> > diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> > index ef470c4a9261..52b883a6a79d 100644
> > --- a/fs/fuse/dev_uring.c
> > +++ b/fs/fuse/dev_uring.c
> > @@ -593,7 +593,7 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
> > if (err)
> > return err;
> >
> > - fuse_copy_init(&cs, false, &iter);
> > + fuse_copy_init(&cs, ring->fc, false, &iter);
> > cs.is_uring = true;
> > cs.req = req;
> >
> > @@ -623,7 +623,7 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
> > return err;
> > }
> >
> > - fuse_copy_init(&cs, true, &iter);
> > + fuse_copy_init(&cs, ring->fc, true, &iter);
> > cs.is_uring = true;
> > cs.req = req;
> >
> > diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
> > index db136e045925..25e593e64c67 100644
> > --- a/fs/fuse/fuse_dev_i.h
> > +++ b/fs/fuse/fuse_dev_i.h
> > @@ -32,6 +32,7 @@ struct fuse_copy_state {
> > bool write:1;
> > bool move_pages:1;
> > bool is_uring:1;
> > + bool splice_read_user_pages:1;
> > struct {
> > unsigned int copied_sz; /* copied size into the user buffer */
> > } ring;
> > @@ -51,8 +52,8 @@ struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);
> >
> > void fuse_dev_end_requests(struct list_head *head);
> >
> > -void fuse_copy_init(struct fuse_copy_state *cs, bool write,
> > - struct iov_iter *iter);
> > +void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *conn,
> > + bool write, struct iov_iter *iter);
> > int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
> > unsigned int argpages, struct fuse_arg *args,
> > int zeroing);
> > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > index 3d5289cb82a5..e21875f16220 100644
> > --- a/fs/fuse/fuse_i.h
> > +++ b/fs/fuse/fuse_i.h
> > @@ -898,6 +898,9 @@ struct fuse_conn {
> > /* Use io_uring for communication */
> > bool io_uring:1;
> >
> > + /* Allow splice for reading user pages */
> > + bool splice_read_user_pages:1;
> > +
> > /** Maximum stack depth for passthrough backing files */
> > int max_stack_depth;
> >
> > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > index 43b6643635ee..e82e96800fde 100644
> > --- a/fs/fuse/inode.c
> > +++ b/fs/fuse/inode.c
> > @@ -1439,6 +1439,9 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> >
> > if (flags & FUSE_REQUEST_TIMEOUT)
> > timeout = arg->request_timeout;
> > +
> > + if (flags & FUSE_SPLICE_READ_USER_PAGES)
> > + fc->splice_read_user_pages = true;
>
>
> Shouldn't that check for capable(CAP_SYS_ADMIN)? Isn't the issue
> that one can access file content although the write is already
> marked as completed? I.e. a fuse file system might get data it was
> never exposed to, possibly secret data?
> A more complex version could check for CAP_SYS_ADMIN, but could
> also later allow reads/writes to files that have the same uid as
> the fuse server process?
>
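Just to make sure I follow, you mean gating the new flag in
process_init_reply() on the server's privileges, something like this?
(A sketch of my reading of your suggestion, not something the patch
does.)

	if (flags & FUSE_SPLICE_READ_USER_PAGES) {
		/* hypothetical: only privileged servers may opt in */
		if (capable(CAP_SYS_ADMIN))
			fc->splice_read_user_pages = true;
	}
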
IDGI. I don't see how this allows the server access to something it
didn't have access to before.

This patchset seems to be about a "contract" between the kernel and the
userland server. The server is agreeing to be very careful about not
touching pages after a write request completes, and the kernel allows
splicing the pages if that's the case.

Can you explain the potential attack vector?
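
To make sure we're talking about the same contract, here's roughly how
I read the server side of it. This is only an illustrative sketch (raw
fuse_init_out handling, made-up fill_init_reply() helper, flags only;
a real server also fills max_write, max_readahead, etc.), not any
particular library's API:

#include <stdint.h>
#include <string.h>
#include <linux/fuse.h>

#ifndef FUSE_SPLICE_READ_USER_PAGES
/* added by this patch; define locally when building against older uapi */
#define FUSE_SPLICE_READ_USER_PAGES (1ULL << 43)
#endif

/*
 * Build the INIT reply flags.  kernel_flags is init_in.flags |
 * ((uint64_t)init_in.flags2 << 32) as received from the kernel.
 */
static void fill_init_reply(struct fuse_init_out *out, uint64_t kernel_flags)
{
	/*
	 * Opt in to zero-copy splice of user pages for direct io writes.
	 * In exchange the server promises to finish consuming a spliced
	 * WRITE payload before it sends the reply; reading the pipe
	 * buffer afterwards may observe data the client has since
	 * modified.
	 */
	uint64_t want = FUSE_SPLICE_READ | FUSE_SPLICE_WRITE |
			FUSE_INIT_EXT | FUSE_SPLICE_READ_USER_PAGES;

	/* only ack flags the kernel actually offered */
	want &= kernel_flags;

	memset(out, 0, sizeof(*out));
	out->major = FUSE_KERNEL_VERSION;
	out->minor = FUSE_KERNEL_MINOR_VERSION;

	/* bits 32..63 live in flags2 and need FUSE_INIT_EXT to be honored */
	out->flags  = (uint32_t)want;
	out->flags2 = (uint32_t)(want >> 32);
}

IOW the spliced pages become fair game for the client again the moment
the reply goes out, but at no point does the server see pages it
couldn't already read while servicing the request.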
>
> > } else {
> > ra_pages = fc->max_read / PAGE_SIZE;
> > fc->no_lock = true;
> > @@ -1489,7 +1492,7 @@ void fuse_send_init(struct fuse_mount *fm)
> > FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
> > FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
> > FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP |
> > - FUSE_REQUEST_TIMEOUT;
> > + FUSE_REQUEST_TIMEOUT | FUSE_SPLICE_READ_USER_PAGES;
> > #ifdef CONFIG_FUSE_DAX
> > if (fm->fc->dax)
> > flags |= FUSE_MAP_ALIGNMENT;
> > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > index 122d6586e8d4..b35f6bbcb322 100644
> > --- a/include/uapi/linux/fuse.h
> > +++ b/include/uapi/linux/fuse.h
> > @@ -235,6 +235,9 @@
> > *
> > * 7.44
> > * - add FUSE_NOTIFY_INC_EPOCH
> > + *
> > + * 7.45
> > + * - add FUSE_SPLICE_READ_USER_PAGES
> > */
> >
> > #ifndef _LINUX_FUSE_H
> > @@ -443,6 +446,10 @@ struct fuse_file_lock {
> > * FUSE_OVER_IO_URING: Indicate that client supports io-uring
> > * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests.
> > * init_out.request_timeout contains the timeout (in secs)
> > + * FUSE_SPLICE_READ_USER_PAGES: kernel supports splice on the device for reading
> > + * user pages. If the server enables this, then the
> > + * server should not access the pipe buffer after
> > + * completing the request.
> > */
> > #define FUSE_ASYNC_READ (1 << 0)
> > #define FUSE_POSIX_LOCKS (1 << 1)
> > @@ -490,6 +497,7 @@ struct fuse_file_lock {
> > #define FUSE_ALLOW_IDMAP (1ULL << 40)
> > #define FUSE_OVER_IO_URING (1ULL << 41)
> > #define FUSE_REQUEST_TIMEOUT (1ULL << 42)
> > +#define FUSE_SPLICE_READ_USER_PAGES (1ULL << 43)
> >
> > /**
> > * CUSE INIT request/reply flags
>
--
Jeff Layton <jlayton@xxxxxxxxxx>