Re: [PATCH v1 3/3] NFSD: Implement NFSD_IO_DIRECT for NFS READ

"NeilBrown" <neil@xxxxxxxxxx> · Wed, 10 Sep 2025 11:52:38 +1000

On Wed, 10 Sep 2025, Chuck Lever wrote:
> On 9/9/25 7:37 PM, NeilBrown wrote:
> > On Wed, 10 Sep 2025, Chuck Lever wrote:
> >> From: Chuck Lever <chuck.lever@xxxxxxxxxx>
> >>
> >> Add an experimental option that forces NFS READ operations to use
> >> direct I/O instead of reading through the NFS server's page cache.
> >>
> >> There are already other layers of caching:
> >>  - The page cache on NFS clients
> >>  - The block device underlying the exported file system
> >>
> >> The server's page cache, in many cases, is unlikely to provide
> >> additional benefit. Some benchmarks have demonstrated that the
> >> server's page cache is actively detrimental for workloads whose
> >> working set is larger than the server's available physical memory.
> >>
> >> For instance, on small NFS servers, cached NFS file content can
> >> squeeze out local memory consumers. For large sequential workloads,
> >> an enormous amount of data flows into and out of the page cache
> >> and is consumed by NFS clients exactly once -- caching that data
> >> is expensive to do and totally valueless.
> >>
> >> For now this is a hidden option that can be enabled on test
> >> systems for benchmarking. In the longer term, this option might
> >> be enabled persistently or per-export. When the exported file
> >> system does not support direct I/O, NFSD falls back to using
> >> either DONTCACHE or buffered I/O to fulfill NFS READ requests.
> >>
> >> Suggested-by: Mike Snitzer <snitzer@xxxxxxxxxx>
> >> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
> >> ---
> >>  fs/nfsd/debugfs.c |  2 ++
> >>  fs/nfsd/nfsd.h    |  1 +
> >>  fs/nfsd/trace.h   |  1 +
> >>  fs/nfsd/vfs.c     | 78 +++++++++++++++++++++++++++++++++++++++++++++++
> >>  4 files changed, 82 insertions(+)
> >>
> >> diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
> >> index ed2b9e066206..00eb1ecef6ac 100644
> >> --- a/fs/nfsd/debugfs.c
> >> +++ b/fs/nfsd/debugfs.c
> >> @@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
> >>   * Contents:
> >>   *   %0: NFS READ will use buffered IO
> >>   *   %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
> >> + *   %2: NFS READ will use direct IO
> >>   *
> >>   * This setting takes immediate effect for all NFS versions,
> >>   * all exports, and in all NFSD net namespaces.
> >> @@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
> >>  		nfsd_io_cache_read = NFSD_IO_BUFFERED;
> >>  		break;
> >>  	case NFSD_IO_DONTCACHE:
> >> +	case NFSD_IO_DIRECT:
> >>  		/*
> >>  		 * Must disable splice_read when enabling
> >>  		 * NFSD_IO_DONTCACHE.
> >> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> >> index ea87b42894dd..bdb60ee1f1a4 100644
> >> --- a/fs/nfsd/nfsd.h
> >> +++ b/fs/nfsd/nfsd.h
> >> @@ -157,6 +157,7 @@ enum {
> >>  	/* Any new NFSD_IO enum value must be added at the end */
> >>  	NFSD_IO_BUFFERED,
> >>  	NFSD_IO_DONTCACHE,
> >> +	NFSD_IO_DIRECT,
> >>  };
> >>  
> >>  extern u64 nfsd_io_cache_read __read_mostly;
> >> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> >> index e5af0d058fd0..88901df5fbb1 100644
> >> --- a/fs/nfsd/trace.h
> >> +++ b/fs/nfsd/trace.h
> >> @@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name,	\
> >>  DEFINE_NFSD_IO_EVENT(read_start);
> >>  DEFINE_NFSD_IO_EVENT(read_splice);
> >>  DEFINE_NFSD_IO_EVENT(read_vector);
> >> +DEFINE_NFSD_IO_EVENT(read_direct);
> >>  DEFINE_NFSD_IO_EVENT(read_io_done);
> >>  DEFINE_NFSD_IO_EVENT(read_done);
> >>  DEFINE_NFSD_IO_EVENT(write_start);
> >> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >> index 441267d877f9..9665454743eb 100644
> >> --- a/fs/nfsd/vfs.c
> >> +++ b/fs/nfsd/vfs.c
> >> @@ -1074,6 +1074,79 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >>  	return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
> >>  }
> >>  
> >> +/*
> >> + * The byte range of the client's READ request is expanded on both
> >> + * ends until it meets the underlying file system's direct I/O
> >> + * alignment requirements. After the internal read is complete, the
> >> + * byte range of the NFS READ payload is reduced to the byte range
> >> + * that was originally requested.
> >> + *
> >> + * Note that a direct read can be done only when the xdr_buf
> >> + * containing the NFS READ reply does not already have contents in
> >> + * its .pages array. This is due to potentially restrictive
> >> + * alignment requirements on the read buffer. When .page_len and
> >> + * @base are zero, the .pages array is guaranteed to be page-
> >> + * aligned.
> > 
> > Where do we test that this condition is met?
> > 
> > I can see that nfsd_direct_read() is only called if "base" is zero, but
> > that means the current contents of the .pages array are page-aligned,
> > not that there are now.
> > 
> >> + */
> >> +static noinline_for_stack __be32
> >> +nfsd_direct_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >> +		 struct nfsd_file *nf, loff_t offset, unsigned long *count,
> >> +		 u32 *eof)
> >> +{
> >> +	loff_t dio_start, dio_end;
> >> +	unsigned long v, total;
> >> +	struct iov_iter iter;
> >> +	struct kiocb kiocb;
> >> +	ssize_t host_err;
> >> +	size_t len;
> >> +
> >> +	init_sync_kiocb(&kiocb, nf->nf_file);
> >> +	kiocb.ki_flags |= IOCB_DIRECT;
> >> +
> >> +	/* Read a properly-aligned region of bytes into rq_bvec */
> >> +	dio_start = round_down(offset, nf->nf_dio_read_offset_align);
> >> +	dio_end = round_up(offset + *count, nf->nf_dio_read_offset_align);
> >> +
> >> +	kiocb.ki_pos = dio_start;
> >> +
> >> +	v = 0;
> >> +	total = dio_end - dio_start;
> >> +	while (total) {
> >> +		len = min_t(size_t, total, PAGE_SIZE);
> >> +		bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++),
> >> +			      len, 0);
> >> +		total -= len;
> >> +		++v;
> >> +	}
> >> +	WARN_ON_ONCE(v > rqstp->rq_maxpages);
> > 
> > I would rather we had an early test rather than a late warn-on.
> > e.g.
> >   if (total > (rqstp->rq_maxpages >> PAGE_SHIFT))
> >      return -EINVAL /* or whatever */;
> > 
> > Otherwise it seems to be making unstated assumptions about how big the
> > alignment requirements could be.
> 
> This is the same warn-on test that nfsd_iter_read does for buffered and
> dontcache reads. It's done late because the final value of v is computed
> here, not known before the loop.

True, but in this case "total" could be larger than "*count" which was
size-checked in e.g.  nfsd4_encode_read.  So it could now be larger than
the available space.

> 
> I think we might be able to turn this into a short read, for all I/O
> modes?

Yes, that could be a clean way to handle the unlikely case that the
reads doesn't fit any more.

Thanks,
NeilBrown

> 
> 
> > Thanks,
> > NeilBrown
> > 
> >> +
> >> +	trace_nfsd_read_direct(rqstp, fhp, offset, *count);
> >> +	iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, dio_end - dio_start);
> >> +
> >> +	host_err = vfs_iocb_iter_read(nf->nf_file, &kiocb, &iter);
> >> +	if (host_err >= 0) {
> >> +		unsigned int pad = offset - dio_start;
> >> +
> >> +		/* The returned payload starts after the pad */
> >> +		rqstp->rq_res.page_base = pad;
> >> +
> >> +		/* Compute the count of bytes to be returned */
> >> +		if (host_err > pad + *count) {
> >> +			host_err = *count;
> >> +		} else if (host_err > pad) {
> >> +			host_err -= pad;
> >> +		} else {
> >> +			host_err = 0;
> >> +		}
> >> +	} else if (unlikely(host_err == -EINVAL)) {
> >> +		pr_info_ratelimited("nfsd: Unexpected direct I/O alignment failure\n");
> >> +		host_err = -ESERVERFAULT;
> >> +	}
> >> +
> >> +	return nfsd_finish_read(rqstp, fhp, nf->nf_file, offset, count,
> >> +				eof, host_err);
> >> +}
> >> +
> >>  /**
> >>   * nfsd_iter_read - Perform a VFS read using an iterator
> >>   * @rqstp: RPC transaction context
> >> @@ -1106,6 +1179,11 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >>  	switch (nfsd_io_cache_read) {
> >>  	case NFSD_IO_BUFFERED:
> >>  		break;
> >> +	case NFSD_IO_DIRECT:
> >> +		if (nf->nf_dio_read_offset_align && !base)
> >> +			return nfsd_direct_read(rqstp, fhp, nf, offset,
> >> +						count, eof);
> >> +		fallthrough;
> >>  	case NFSD_IO_DONTCACHE:
> >>  		if (file->f_op->fop_flags & FOP_DONTCACHE)
> >>  			kiocb.ki_flags = IOCB_DONTCACHE;
> >> -- 
> >> 2.50.0
> >>
> >>
> >>
> > 
> 
> 
> -- 
> Chuck Lever
>