[PATCH 6/6] NFSD: issue READs using O_DIRECT even if IO is misaligned

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



If enable-dontcache is used, expand any misaligned READ to the next
DIO-aligned block (on either end of the READ).

Reserve one more page in svc_serv_maxpages(): nfsd_iter_read() might
need two extra pages when a READ payload is not DIO-aligned, but
nfsd_iter_read() and nfsd_splice_actor() are mutually exclusive, so the
page already reserved for nfsd_splice_actor() is reused as one of them.

Also add the nfsd_read_vector_dio trace event. The following
combination of trace events is useful:

 echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector/enable
 echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector_dio/enable
 echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_io_done/enable
 echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable

Which for this dd command:

 dd if=/mnt/share1/test of=/dev/null bs=47008 count=2 iflag=direct

Results in:

 nfsd-16580   [001] .....  5672.403130: nfsd_read_vector_dio: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47008 start=0+0 end=47104-96
 nfsd-16580   [001] .....  5672.403131: nfsd_read_vector: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47104
 nfsd-16580   [001] .....  5672.403134: xfs_file_direct_read: dev 253:0 ino 0x1c2388c1 disize 0x16f40 pos 0x0 bytecount 0xb800
 nfsd-16580   [001] .....  5672.404380: nfsd_read_io_done: xid=0x5ccf019c fh_hash=0xe4dadb60 offset=0 len=47008

 nfsd-16580   [001] .....  5672.404672: nfsd_read_vector_dio: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=47008 len=47008 start=46592+416 end=94208-192
 nfsd-16580   [001] .....  5672.404672: nfsd_read_vector: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=46592 len=47616
 nfsd-16580   [001] .....  5672.404673: xfs_file_direct_read: dev 253:0 ino 0x1c2388c1 disize 0x16f40 pos 0xb600 bytecount 0xba00
 nfsd-16580   [001] .....  5672.405771: nfsd_read_io_done: xid=0x5dcf019c fh_hash=0xe4dadb60 offset=47008 len=47008

Suggested-by: Jeff Layton <jlayton@xxxxxxxxxx>
Suggested-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx>
---
 fs/nfsd/trace.h            | 37 ++++++++++++++++++++++
 fs/nfsd/vfs.c              | 65 ++++++++++++++++++++++++++++----------
 include/linux/sunrpc/svc.h |  5 ++-
 3 files changed, 90 insertions(+), 17 deletions(-)

diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 3c5505ef5e3a..a46515b953f4 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -473,6 +473,43 @@ DEFINE_NFSD_IO_EVENT(write_done);
 DEFINE_NFSD_IO_EVENT(commit_start);
 DEFINE_NFSD_IO_EVENT(commit_done);
 
+TRACE_EVENT(nfsd_read_vector_dio,
+	TP_PROTO(struct svc_rqst *rqstp,
+		 struct svc_fh	*fhp,
+		 u64		offset,
+		 u32		len,
+		 loff_t		start,		/* offset rounded down for DIO */
+		 loff_t		start_extra,	/* offset - start */
+		 loff_t		end,		/* offset + len rounded up for DIO */
+		 loff_t		end_extra),	/* end - (offset + len) */
+	TP_ARGS(rqstp, fhp, offset, len, start, start_extra, end, end_extra),
+	TP_STRUCT__entry(
+		__field(u32, xid)
+		__field(u32, fh_hash)
+		__field(u64, offset)
+		__field(u32, len)
+		__field(loff_t, start)
+		__field(loff_t, start_extra)
+		__field(loff_t, end)
+		__field(loff_t, end_extra)
+	),
+	TP_fast_assign(
+		__entry->xid = be32_to_cpu(rqstp->rq_xid);
+		__entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+		__entry->offset = offset;
+		__entry->len = len;
+		__entry->start = start;
+		__entry->start_extra = start_extra;
+		__entry->end = end;
+		__entry->end_extra = end_extra;
+	),
+	TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u start=%lld+%lld end=%lld-%lld",
+		  __entry->xid, __entry->fh_hash,
+		  __entry->offset, __entry->len,
+		  __entry->start, __entry->start_extra,
+		  __entry->end, __entry->end_extra)
+);
+
 DECLARE_EVENT_CLASS(nfsd_err_class,
 	TP_PROTO(struct svc_rqst *rqstp,
 		 struct svc_fh	*fhp,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a942609e3ab9..be5d025b4680 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -19,6 +19,7 @@
 #include <linux/splice.h>
 #include <linux/falloc.h>
 #include <linux/fcntl.h>
+#include <linux/math.h>
 #include <linux/namei.h>
 #include <linux/delay.h>
 #include <linux/fsnotify.h>
@@ -1101,15 +1102,41 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		      unsigned int base, u32 *eof)
 {
 	struct file *file = nf->nf_file;
-	unsigned long v, total;
+	unsigned long v, total, in_count = *count;
+	loff_t start_extra = 0, end_extra = 0;
 	struct iov_iter iter;
-	loff_t ppos = offset;
+	loff_t ppos;
 	rwf_t flags = 0;
 	ssize_t host_err;
 	size_t len;
 
+	/*
+	 * If dontcache enabled, expand any misaligned READ to
+	 * the next DIO-aligned block (on either end of the READ).
+	 */
+	if (nfsd_enable_dontcache && nf->nf_dio_mem_align &&
+	    (base & (nf->nf_dio_mem_align-1)) == 0) {
+		const u32 dio_blocksize = nf->nf_dio_read_offset_align;
+		loff_t orig_end = offset + *count;
+		loff_t start = round_down(offset, dio_blocksize);
+		loff_t end = round_up(orig_end, dio_blocksize);
+
+		WARN_ON_ONCE(dio_blocksize > PAGE_SIZE);
+		start_extra = offset - start;
+		end_extra = end - orig_end;
+
+		/* Show original offset and count, and how it was expanded for DIO */
+		trace_nfsd_read_vector_dio(rqstp, fhp, offset, *count,
+					   start, start_extra, end, end_extra);
+
+		/* trace_nfsd_read_vector() will reflect larger DIO-aligned READ */
+		offset = start;
+		in_count = end - start;
+		flags |= RWF_DIRECT;
+	}
+
 	v = 0;
-	total = *count;
+	total = in_count;
 	while (total) {
 		len = min_t(size_t, total, PAGE_SIZE - base);
 		bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++),
@@ -1120,21 +1147,27 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	}
 	WARN_ON_ONCE(v > rqstp->rq_maxpages);
 
-	trace_nfsd_read_vector(rqstp, fhp, offset, *count);
-	iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count);
-
-	if (nfsd_enable_dontcache) {
-		if (is_dio_aligned(&iter, offset, nf->nf_dio_read_offset_align))
-			flags |= RWF_DIRECT;
-		/* FIXME: not using RWF_DONTCACHE for misaligned IO because it works
-		 * against us (due to RMW needing to read without benefit of cache),
-		 * whereas buffered IO enables misaligned IO to be more performant.
-		 */
-		//else
-		//	flags |= RWF_DONTCACHE;
-	}
+	trace_nfsd_read_vector(rqstp, fhp, offset, in_count);
+	iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, in_count);
 
+	ppos = offset;
 	host_err = vfs_iter_read(file, &iter, &ppos, flags);
+
+	if ((start_extra || end_extra) && host_err >= 0) {
+		/* Trim DIO padding; v is the bvec count, so the last bvec is v - 1 */
+		rqstp->rq_bvec[0].bv_offset += start_extra;
+		rqstp->rq_bvec[0].bv_len -= start_extra;
+		rqstp->rq_bvec[v - 1].bv_len -= end_extra;
+		offset += start_extra;	/* back to the originally requested offset */
+		if (likely(host_err >= start_extra)) {
+			host_err -= start_extra;
+			if (host_err > *count)
+				host_err = *count;
+		} else {
+			/* Short read that didn't read any of requested data */
+			host_err = 0;
+		}
+	}
 	return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
 }
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 46f7991cea58..52f5c9ec35aa 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -163,10 +163,13 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
  * pages, one for the request, and one for the reply.
  * nfsd_splice_actor() might need an extra page when a READ payload
  * is not page-aligned.
+ * nfsd_iter_read() might need two extra pages when a READ payload
+ * is not DIO-aligned; it is mutually exclusive with nfsd_splice_actor(),
+ * so that page is reused and only one additional page is reserved.
  */
 static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv)
 {
-	return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1;
+	return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1 + 1;
 }
 
 /*
-- 
2.44.0





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux