If enable-dontcache is used, expand any misaligned READ to the next
DIO-aligned block (on either end of the READ).

Reserve one more page in svc_serv_maxpages() because nfsd_iter_read()
might need two extra pages when a READ payload is not DIO-aligned.
Since nfsd_iter_read() and nfsd_splice_actor() are mutually exclusive,
the page already reserved for nfsd_splice_actor() is reused, so only
one additional page is needed.

Also add the nfsd_read_vector_dio trace event. This combination of
trace events is useful:

 echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector/enable
 echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector_dio/enable
 echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_io_done/enable
 echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable

Which for this dd command:

 dd if=/mnt/share1/test of=/dev/null bs=47008 count=2 iflag=direct

Results in:

 nfsd-16580 [001] ..... 5672.403130: nfsd_read_vector_dio: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47008 start=0+0 end=47104-96
 nfsd-16580 [001] ..... 5672.403131: nfsd_read_vector: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47104
 nfsd-16580 [001] ..... 5672.403134: xfs_file_direct_read: dev 253:0 ino 0x1c2388c1 disize 0x16f40 pos 0x0 bytecount 0xb800
 nfsd-16580 [001] ..... 5672.404380: nfsd_read_io_done: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47008

 nfsd-16580 [001] ..... 5672.404672: nfsd_read_vector_dio: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=47008 len=47008 start=46592+416 end=94208-192
 nfsd-16580 [001] ..... 5672.404672: nfsd_read_vector: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=46592 len=47616
 nfsd-16580 [001] ..... 5672.404673: xfs_file_direct_read: dev 253:0 ino 0x1c2388c1 disize 0x16f40 pos 0xb600 bytecount 0xba00
 nfsd-16580 [001] ..... 
5672.405771: nfsd_read_io_done: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=47008 len=47008 Suggested-by: Jeff Layton <jlayton@xxxxxxxxxx> Suggested-by: Chuck Lever <chuck.lever@xxxxxxxxxx> Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx> --- fs/nfsd/trace.h | 37 ++++++++++++++++++++++ fs/nfsd/vfs.c | 65 ++++++++++++++++++++++++++++---------- include/linux/sunrpc/svc.h | 5 ++- 3 files changed, 90 insertions(+), 17 deletions(-) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 3c5505ef5e3a..a46515b953f4 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -473,6 +473,43 @@ DEFINE_NFSD_IO_EVENT(write_done); DEFINE_NFSD_IO_EVENT(commit_start); DEFINE_NFSD_IO_EVENT(commit_done); +TRACE_EVENT(nfsd_read_vector_dio, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + u64 offset, + u32 len, + loff_t start, + loff_t start_extra, + loff_t end, + loff_t end_extra), + TP_ARGS(rqstp, fhp, offset, len, start, start_extra, end, end_extra), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, fh_hash) + __field(u64, offset) + __field(u32, len) + __field(loff_t, start) + __field(loff_t, start_extra) + __field(loff_t, end) + __field(loff_t, end_extra) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->offset = offset; + __entry->len = len; + __entry->start = start; + __entry->start_extra = start_extra; + __entry->end = end; + __entry->end_extra = end_extra; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u start=%llu+%llu end=%llu-%llu", + __entry->xid, __entry->fh_hash, + __entry->offset, __entry->len, + __entry->start, __entry->start_extra, + __entry->end, __entry->end_extra) +); + DECLARE_EVENT_CLASS(nfsd_err_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a942609e3ab9..be5d025b4680 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -19,6 +19,7 @@ #include <linux/splice.h> #include <linux/falloc.h> #include 
<linux/fcntl.h> +#include <linux/math.h> #include <linux/namei.h> #include <linux/delay.h> #include <linux/fsnotify.h> @@ -1101,15 +1102,41 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int base, u32 *eof) { struct file *file = nf->nf_file; - unsigned long v, total; + unsigned long v, total, in_count = *count; + loff_t start_extra = 0, end_extra = 0; struct iov_iter iter; - loff_t ppos = offset; + loff_t ppos; rwf_t flags = 0; ssize_t host_err; size_t len; + /* + * If dontcache enabled, expand any misaligned READ to + * the next DIO-aligned block (on either end of the READ). + */ + if (nfsd_enable_dontcache && nf->nf_dio_mem_align && + (base & (nf->nf_dio_mem_align-1)) == 0) { + const u32 dio_blocksize = nf->nf_dio_read_offset_align; + loff_t orig_end = offset + *count; + loff_t start = round_down(offset, dio_blocksize); + loff_t end = round_up(orig_end, dio_blocksize); + + WARN_ON_ONCE(dio_blocksize > PAGE_SIZE); + start_extra = offset - start; + end_extra = end - orig_end; + + /* Show original offset and count, and how it was expanded for DIO */ + trace_nfsd_read_vector_dio(rqstp, fhp, offset, *count, + start, start_extra, end, end_extra); + + /* trace_nfsd_read_vector() will reflect larger DIO-aligned READ */ + offset = start; + in_count = end - start; + flags |= RWF_DIRECT; + } + v = 0; - total = *count; + total = in_count; while (total) { len = min_t(size_t, total, PAGE_SIZE - base); bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++), @@ -1120,21 +1147,27 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } WARN_ON_ONCE(v > rqstp->rq_maxpages); - trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count); - - if (nfsd_enable_dontcache) { - if (is_dio_aligned(&iter, offset, nf->nf_dio_read_offset_align)) - flags |= RWF_DIRECT; - /* FIXME: not using RWF_DONTCACHE for misaligned IO because it works - * against us (due to RMW needing to read without 
benefit of cache), - * whereas buffered IO enables misaligned IO to be more performant. - */ - //else - // flags |= RWF_DONTCACHE; - } + trace_nfsd_read_vector(rqstp, fhp, offset, in_count); + iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, in_count); + ppos = offset; host_err = vfs_iter_read(file, &iter, &ppos, flags); + + if ((start_extra || end_extra) && host_err >= 0) { + rqstp->rq_bvec[0].bv_offset += start_extra; + rqstp->rq_bvec[0].bv_len -= start_extra; + rqstp->rq_bvec[v].bv_len -= end_extra; + /* Must adjust returned read size to reflect original extent */ + offset += start_extra; + if (likely(host_err >= start_extra)) { + host_err -= start_extra; + if (host_err > *count) + host_err = *count; + } else { + /* Short read that didn't read any of requested data */ + host_err = 0; + } + } return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 46f7991cea58..52f5c9ec35aa 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -163,10 +163,13 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * pages, one for the request, and one for the reply. * nfsd_splice_actor() might need an extra page when a READ payload * is not page-aligned. + * nfsd_iter_read() might need two extra pages when a READ payload + * is not DIO-aligned -- but nfsd_iter_read() and nfsd_splice_actor() + * are mutually exclusive. */ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) { - return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1; + return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1 + 1; } /* -- 2.44.0