Re: [PATCH v1] fuse: use splice for reading user pages on servers that enable it

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, 2025-04-21 at 13:49 +0200, Bernd Schubert wrote:
> 
> On 4/19/25 02:06, Joanne Koong wrote:
> > For direct io writes, splice is disabled when forwarding pages from the
> > client to the server. This is because the pages in the pipe buffer are
> > user pages, which is controlled by the client. Thus if a server replies
> > to the request and then keeps accessing the pages afterwards, there is
> > the possibility that the client may have modified the contents of the
> > pages in the meantime. More context on this can be found in commit
> > 0c4bcfdecb1a ("fuse: fix pipe buffer lifetime for direct_io").
> > 
> > For servers that do not need to access pages after answering the
> > request, splice gives a non-trivial improvement in performance.
> > Benchmarks show roughly a 40% speedup.
> > 
> > Allow servers to enable zero-copy splice for servicing client direct io
> > writes. By enabling this, the server understands that they should not
> > continue accessing the pipe buffer after completing the request or may
> > face incorrect data if they do so.
> > 
> > Signed-off-by: Joanne Koong <joannelkoong@xxxxxxxxx>
> > ---
> >  fs/fuse/dev.c             | 18 ++++++++++--------
> >  fs/fuse/dev_uring.c       |  4 ++--
> >  fs/fuse/fuse_dev_i.h      |  5 +++--
> >  fs/fuse/fuse_i.h          |  3 +++
> >  fs/fuse/inode.c           |  5 ++++-
> >  include/uapi/linux/fuse.h |  8 ++++++++
> >  6 files changed, 30 insertions(+), 13 deletions(-)
> > 
> > diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> > index 67d07b4c778a..1b0ea8593f74 100644
> > --- a/fs/fuse/dev.c
> > +++ b/fs/fuse/dev.c
> > @@ -816,12 +816,13 @@ static int unlock_request(struct fuse_req *req)
> >  	return err;
> >  }
> >  
> > -void fuse_copy_init(struct fuse_copy_state *cs, bool write,
> > -		    struct iov_iter *iter)
> > +void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
> > +		    bool write, struct iov_iter *iter)
> >  {
> >  	memset(cs, 0, sizeof(*cs));
> >  	cs->write = write;
> >  	cs->iter = iter;
> > +	cs->splice_read_user_pages = fc->splice_read_user_pages;
> >  }
> >  
> >  /* Unmap and put previous page of userspace buffer */
> > @@ -1105,9 +1106,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
> >  		if (cs->write && cs->pipebufs && page) {
> >  			/*
> >  			 * Can't control lifetime of pipe buffers, so always
> > -			 * copy user pages.
> > +			 * copy user pages if server does not support splice
> > +			 * for reading user pages.
> >  			 */
> > -			if (cs->req->args->user_pages) {
> > +			if (cs->req->args->user_pages && !cs->splice_read_user_pages) {
> >  				err = fuse_copy_fill(cs);
> >  				if (err)
> >  					return err;
> > @@ -1538,7 +1540,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
> >  	if (!user_backed_iter(to))
> >  		return -EINVAL;
> >  
> > -	fuse_copy_init(&cs, true, to);
> > +	fuse_copy_init(&cs, fud->fc, true, to);
> >  
> >  	return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to));
> >  }
> > @@ -1561,7 +1563,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
> >  	if (!bufs)
> >  		return -ENOMEM;
> >  
> > -	fuse_copy_init(&cs, true, NULL);
> > +	fuse_copy_init(&cs, fud->fc, true, NULL);
> >  	cs.pipebufs = bufs;
> >  	cs.pipe = pipe;
> >  	ret = fuse_dev_do_read(fud, in, &cs, len);
> > @@ -2227,7 +2229,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
> >  	if (!user_backed_iter(from))
> >  		return -EINVAL;
> >  
> > -	fuse_copy_init(&cs, false, from);
> > +	fuse_copy_init(&cs, fud->fc, false, from);
> >  
> >  	return fuse_dev_do_write(fud, &cs, iov_iter_count(from));
> >  }
> > @@ -2301,7 +2303,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
> >  	}
> >  	pipe_unlock(pipe);
> >  
> > -	fuse_copy_init(&cs, false, NULL);
> > +	fuse_copy_init(&cs, fud->fc, false, NULL);
> >  	cs.pipebufs = bufs;
> >  	cs.nr_segs = nbuf;
> >  	cs.pipe = pipe;
> > diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> > index ef470c4a9261..52b883a6a79d 100644
> > --- a/fs/fuse/dev_uring.c
> > +++ b/fs/fuse/dev_uring.c
> > @@ -593,7 +593,7 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
> >  	if (err)
> >  		return err;
> >  
> > -	fuse_copy_init(&cs, false, &iter);
> > +	fuse_copy_init(&cs, ring->fc, false, &iter);
> >  	cs.is_uring = true;
> >  	cs.req = req;
> >  
> > @@ -623,7 +623,7 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
> >  		return err;
> >  	}
> >  
> > -	fuse_copy_init(&cs, true, &iter);
> > +	fuse_copy_init(&cs, ring->fc, true, &iter);
> >  	cs.is_uring = true;
> >  	cs.req = req;
> >  
> > diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
> > index db136e045925..25e593e64c67 100644
> > --- a/fs/fuse/fuse_dev_i.h
> > +++ b/fs/fuse/fuse_dev_i.h
> > @@ -32,6 +32,7 @@ struct fuse_copy_state {
> >  	bool write:1;
> >  	bool move_pages:1;
> >  	bool is_uring:1;
> > +	bool splice_read_user_pages:1;
> >  	struct {
> >  		unsigned int copied_sz; /* copied size into the user buffer */
> >  	} ring;
> > @@ -51,8 +52,8 @@ struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);
> >  
> >  void fuse_dev_end_requests(struct list_head *head);
> >  
> > -void fuse_copy_init(struct fuse_copy_state *cs, bool write,
> > -			   struct iov_iter *iter);
> > +void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *conn,
> > +		    bool write, struct iov_iter *iter);
> >  int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
> >  		   unsigned int argpages, struct fuse_arg *args,
> >  		   int zeroing);
> > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > index 3d5289cb82a5..e21875f16220 100644
> > --- a/fs/fuse/fuse_i.h
> > +++ b/fs/fuse/fuse_i.h
> > @@ -898,6 +898,9 @@ struct fuse_conn {
> >  	/* Use io_uring for communication */
> >  	bool io_uring:1;
> >  
> > +	/* Allow splice for reading user pages */
> > +	bool splice_read_user_pages:1;
> > +
> >  	/** Maximum stack depth for passthrough backing files */
> >  	int max_stack_depth;
> >  
> > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > index 43b6643635ee..e82e96800fde 100644
> > --- a/fs/fuse/inode.c
> > +++ b/fs/fuse/inode.c
> > @@ -1439,6 +1439,9 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> >  
> >  			if (flags & FUSE_REQUEST_TIMEOUT)
> >  				timeout = arg->request_timeout;
> > +
> > +			if (flags & FUSE_SPLICE_READ_USER_PAGES)
> > +				fc->splice_read_user_pages = true;
> 
> 
> Shouldn't that check for capable(CAP_SYS_ADMIN)? Isn't the issue
> that one can access file content although the write is already
> marked as completed? I.e. a fuse file system might get data
> it was never exposed to and possibly secret data?
> A more complex version version could check for CAP_SYS_ADMIN, but
> also allow later on read/write to files that have the same uid as
> the fuser-server process?
> 

IDGI. I don't see how this allows the server access to something it
didn't have access to before.

This patchset seems to be about a "contract" between the kernel and the
userland server. The server is agreeing to be very careful about not
touching pages after a write request completes, and the kernel allows
splicing the pages if that's the case.

Can you explain the potential attack vector?


> 
> >  		} else {
> >  			ra_pages = fc->max_read / PAGE_SIZE;
> >  			fc->no_lock = true;
> > @@ -1489,7 +1492,7 @@ void fuse_send_init(struct fuse_mount *fm)
> >  		FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
> >  		FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
> >  		FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP |
> > -		FUSE_REQUEST_TIMEOUT;
> > +		FUSE_REQUEST_TIMEOUT | FUSE_SPLICE_READ_USER_PAGES;
> >  #ifdef CONFIG_FUSE_DAX
> >  	if (fm->fc->dax)
> >  		flags |= FUSE_MAP_ALIGNMENT;
> > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > index 122d6586e8d4..b35f6bbcb322 100644
> > --- a/include/uapi/linux/fuse.h
> > +++ b/include/uapi/linux/fuse.h
> > @@ -235,6 +235,9 @@
> >   *
> >   *  7.44
> >   *  - add FUSE_NOTIFY_INC_EPOCH
> > + *
> > + *  7.45
> > + *  - add FUSE_SPLICE_READ_USER_PAGES
> >   */
> >  
> >  #ifndef _LINUX_FUSE_H
> > @@ -443,6 +446,10 @@ struct fuse_file_lock {
> >   * FUSE_OVER_IO_URING: Indicate that client supports io-uring
> >   * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests.
> >   *			 init_out.request_timeout contains the timeout (in secs)
> > + * FUSE_SPLICE_READ_USER_PAGES: kernel supports splice on the device for reading
> > + *				user pages. If the server enables this, then the
> > + *				server should not access the pipe buffer after
> > + *				completing the request.
> >   */
> >  #define FUSE_ASYNC_READ		(1 << 0)
> >  #define FUSE_POSIX_LOCKS	(1 << 1)
> > @@ -490,6 +497,7 @@ struct fuse_file_lock {
> >  #define FUSE_ALLOW_IDMAP	(1ULL << 40)
> >  #define FUSE_OVER_IO_URING	(1ULL << 41)
> >  #define FUSE_REQUEST_TIMEOUT	(1ULL << 42)
> > +#define FUSE_SPLICE_READ_USER_PAGES (1ULL << 43)
> >  
> >  /**
> >   * CUSE INIT request/reply flags
> 

-- 
Jeff Layton <jlayton@xxxxxxxxxx>





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux