On 8/7/25 12:25 PM, Mike Snitzer wrote: > Add 'io_cache_read' to NFSD's debugfs interface so that: Any data > read by NFSD will either be: > - cached using page cache (NFSD_IO_BUFFERED=1) > - cached but removed from the page cache upon completion > (NFSD_IO_DONTCACHE=2). > - not cached (NFSD_IO_DIRECT=3) > > io_cache_read may be set by writing to: > /sys/kernel/debug/nfsd/io_cache_read > > If NFSD_IO_DONTCACHE is specified using 2, FOP_DONTCACHE must be > advertised as supported by the underlying filesystem (e.g. XFS), > otherwise all IO flagged with RWF_DONTCACHE will fail with > -EOPNOTSUPP. > > If NFSD_IO_DIRECT is specified using 3, the IO must be aligned > relative to the underlying block device's logical_block_size. Also the > memory buffer used to store the read must be aligned relative to the > underlying block device's dma_alignment. > > Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx> > --- > fs/nfsd/debugfs.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++ > fs/nfsd/nfsd.h | 9 ++++++++ > fs/nfsd/vfs.c | 19 +++++++++++++--- > 3 files changed, 83 insertions(+), 3 deletions(-) > > diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c > index 84b0c8b559dc9..c07f71d4e84f4 100644 > --- a/fs/nfsd/debugfs.c > +++ b/fs/nfsd/debugfs.c > @@ -27,11 +27,66 @@ static int nfsd_dsr_get(void *data, u64 *val) > static int nfsd_dsr_set(void *data, u64 val) > { > nfsd_disable_splice_read = (val > 0) ? true : false; > + if (!nfsd_disable_splice_read) { > + /* > + * Cannot use NFSD_IO_DONTCACHE or NFSD_IO_DIRECT > + * if splice_read is enabled. 
> + */ > + nfsd_io_cache_read = NFSD_IO_BUFFERED; > + } > return 0; > } > > DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n"); > > +/* > + * /sys/kernel/debug/nfsd/io_cache_read > + * > + * Contents: > + * %1: NFS READ will use buffered IO > + * %2: NFS READ will use dontcache (buffered IO w/ dropbehind) > + * %3: NFS READ will use direct IO > + * > + * The default value of this setting is zero (UNSPECIFIED). > + * This setting takes immediate effect for all NFS versions, > + * all exports, and in all NFSD net namespaces. > + */ > + > +static int nfsd_io_cache_read_get(void *data, u64 *val) > +{ > + *val = nfsd_io_cache_read; > + return 0; > +} > + > +static int nfsd_io_cache_read_set(void *data, u64 val) > +{ > + int ret = 0; > + > + switch (val) { > + case NFSD_IO_BUFFERED: > + nfsd_io_cache_read = NFSD_IO_BUFFERED; > + break; > + case NFSD_IO_DONTCACHE: > + case NFSD_IO_DIRECT: > + /* > + * Must disable splice_read when enabling > + * NFSD_IO_DONTCACHE or NFSD_IO_DIRECT. 
> + */ > + nfsd_disable_splice_read = true; > + nfsd_io_cache_read = val; > + break; > + default: > + nfsd_io_cache_read = NFSD_IO_UNSPECIFIED; > + ret = -EINVAL; > + break; > + } > + > + return ret; > +} > + > +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_io_cache_read_fops, nfsd_io_cache_read_get, > + nfsd_io_cache_read_set, "%llu\n"); > + > void nfsd_debugfs_exit(void) > { > debugfs_remove_recursive(nfsd_top_dir); > @@ -44,4 +99,7 @@ void nfsd_debugfs_init(void) > > debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO, > nfsd_top_dir, NULL, &nfsd_dsr_fops); > + > + debugfs_create_file("io_cache_read", S_IWUSR | S_IRUGO, > + nfsd_top_dir, NULL, &nfsd_io_cache_read_fops); > } > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h > index 1cd0bed57bc2f..6ef799405145f 100644 > --- a/fs/nfsd/nfsd.h > +++ b/fs/nfsd/nfsd.h > @@ -153,6 +153,15 @@ static inline void nfsd_debugfs_exit(void) {} > > extern bool nfsd_disable_splice_read __read_mostly; > > +enum { > + NFSD_IO_UNSPECIFIED = 0, > + NFSD_IO_BUFFERED, > + NFSD_IO_DONTCACHE, > + NFSD_IO_DIRECT, > +}; > + > +extern u64 nfsd_io_cache_read __read_mostly; > + > extern int nfsd_max_blksize; > > static inline int nfsd_v4client(struct svc_rqst *rq) > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c > index 79439ad93880a..26b6d96258711 100644 > --- a/fs/nfsd/vfs.c > +++ b/fs/nfsd/vfs.c > @@ -49,6 +49,7 @@ > #define NFSDDBG_FACILITY NFSDDBG_FILEOP > > bool nfsd_disable_splice_read __read_mostly; > +u64 nfsd_io_cache_read __read_mostly; > > /** > * nfserrno - Map Linux errnos to NFS errnos > @@ -1099,17 +1100,29 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, > size_t len; > > init_sync_kiocb(&kiocb, file); > + > + if (nfsd_io_cache_read == NFSD_IO_DIRECT) { > + /* Verify ondisk DIO alignment, memory addrs checked below */ > + if (nf->nf_dio_mem_align && nf->nf_dio_read_offset_align && > + (((offset | *count) & (nf->nf_dio_read_offset_align - 1)) == 0)) > + kiocb.ki_flags = IOCB_DIRECT; > + } else if (nfsd_io_cache_read == 
NFSD_IO_DONTCACHE)
> +		kiocb.ki_flags = IOCB_DONTCACHE;
> +

Personal style: let's make this a switch statement like it will be for the write path.

>  	kiocb.ki_pos = offset;
>
>  	v = 0;
>  	total = *count;
>  	while (total) {
>  		len = min_t(size_t, total, PAGE_SIZE - base);
> -		bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++),
> -			      len, base);
> +		bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++), len, base);

Nit: changing this line does not appear to be necessary.

> +		/* No need to verify memory is DIO-aligned since bv_offset is 0 */
> +		if (unlikely((kiocb.ki_flags & IOCB_DIRECT) && base &&
> +			     (base & (nf->nf_dio_mem_align - 1))))
> +			kiocb.ki_flags &= ~IOCB_DIRECT;
>  		total -= len;
> -		++v;
>  		base = 0;
> +		v++;

Nit: I've actually measured pre-incrementing to be slightly faster than post-incrementing, so I'd like to keep "++v;" here.

>  	}
>  	WARN_ON_ONCE(v > rqstp->rq_maxpages);
>

--
Chuck Lever