On 8/7/25 10:30 PM, Mike Snitzer wrote: > On Thu, Aug 07, 2025 at 12:25:44PM -0400, Mike Snitzer wrote: >> If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start, >> middle and end as needed. The large middle extent is DIO-aligned and >> the start and/or end are misaligned. Buffered IO is used for the >> misaligned extents and O_DIRECT is used for the middle DIO-aligned >> extent. >> >> The nfsd_analyze_write_dio trace event shows how NFSD splits a given >> misaligned WRITE into a mix of misaligned extent(s) and a DIO-aligned >> extent. >> >> This combination of trace events is useful: >> >> echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_opened/enable >> echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_analyze_write_dio/enable >> echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_io_done/enable >> echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_write/enable >> >> Which for this dd command: >> >> dd if=/dev/zero of=/mnt/share1/test bs=47008 count=2 oflag=direct >> >> Results in: >> >> nfsd-23908 [010] ..... 10374.902333: nfsd_write_opened: xid=0x7fc5923b fh_hash=0x857ca4fc offset=0 len=47008 >> nfsd-23908 [010] ..... 10374.902335: nfsd_analyze_write_dio: xid=0x7fc5923b fh_hash=0x857ca4fc offset=0 len=47008 start=0+0 middle=0+46592 end=46592+416 >> nfsd-23908 [010] ..... 10374.902343: xfs_file_direct_write: dev 259:2 ino 0xc00116 disize 0x0 pos 0x0 bytecount 0xb600 >> nfsd-23908 [010] ..... 10374.902697: nfsd_write_io_done: xid=0x7fc5923b fh_hash=0x857ca4fc offset=0 len=47008 >> >> nfsd-23908 [010] ..... 10374.902925: nfsd_write_opened: xid=0x80c5923b fh_hash=0x857ca4fc offset=47008 len=47008 >> nfsd-23908 [010] ..... 10374.902926: nfsd_analyze_write_dio: xid=0x80c5923b fh_hash=0x857ca4fc offset=47008 len=47008 start=47008+96 middle=47104+46592 end=93696+320 >> nfsd-23908 [010] ..... 10374.903010: xfs_file_direct_write: dev 259:2 ino 0xc00116 disize 0xb800 pos 0xb800 bytecount 0xb600 >> nfsd-23908 [010] ..... 
10374.903239: nfsd_write_io_done: xid=0x80c5923b fh_hash=0x857ca4fc offset=47008 len=47008 >> >> Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx> >> --- >> fs/nfsd/vfs.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++---- >> 1 file changed, 170 insertions(+), 13 deletions(-) >> >> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c >> index be083a8812717..1b5aa3e6e6623 100644 >> --- a/fs/nfsd/vfs.c >> +++ b/fs/nfsd/vfs.c >> @@ -1315,6 +1315,167 @@ static int wait_for_concurrent_writes(struct file *file) >> return err; >> } >> >> +struct nfsd_write_dio { >> + loff_t middle_offset; /* Offset for start of DIO-aligned middle */ >> + loff_t end_offset; /* Offset for start of DIO-aligned end */ >> + ssize_t start_len; /* Length for misaligned first extent */ >> + ssize_t middle_len; /* Length for DIO-aligned middle extent */ >> + ssize_t end_len; /* Length for misaligned last extent */ >> +}; >> + >> +static bool >> +nfsd_analyze_write_dio(struct svc_rqst *rqstp, struct svc_fh *fhp, >> + struct nfsd_file *nf, loff_t offset, >> + unsigned long len, struct nfsd_write_dio *write_dio) >> +{ >> + const u32 dio_blocksize = nf->nf_dio_offset_align; >> + loff_t orig_end, middle_end, start_end, start_offset = offset; >> + ssize_t start_len = len; >> + >> + if (WARN_ONCE(!nf->nf_dio_mem_align || !dio_blocksize, >> + "%s: underlying filesystem has not provided DIO alignment info\n", >> + __func__)) >> + return false; >> + if (WARN_ONCE(dio_blocksize > PAGE_SIZE, >> + "%s: underlying storage's dio_blocksize=%u > PAGE_SIZE=%lu\n", >> + __func__, dio_blocksize, PAGE_SIZE)) >> + return false; >> + if (unlikely(len < dio_blocksize)) >> + return false; >> + >> + memset(write_dio, 0, sizeof(*write_dio)); >> + >> + if (((offset | len) & (dio_blocksize-1)) == 0) { >> + /* already DIO-aligned, no misaligned head or tail */ >> + write_dio->middle_offset = offset; >> + write_dio->middle_len = len; >> + /* clear these for the benefit of trace_nfsd_analyze_write_dio */ >> + start_offset = 0; >> + 
start_len = 0; >> + goto out; >> + } >> + >> + start_end = round_up(offset, dio_blocksize); >> + start_len = start_end - offset; >> + orig_end = offset + len; >> + middle_end = round_down(orig_end, dio_blocksize); >> + >> + write_dio->start_len = start_len; >> + write_dio->middle_offset = start_end; >> + write_dio->middle_len = middle_end - start_end; >> + write_dio->end_offset = middle_end; >> + write_dio->end_len = orig_end - middle_end; >> +out: >> + trace_nfsd_analyze_write_dio(rqstp, fhp, offset, len, start_offset, start_len, >> + write_dio->middle_offset, write_dio->middle_len, >> + write_dio->end_offset, write_dio->end_len); >> + return true; >> +} >> + >> +/* >> + * Setup as many as 3 iov_iter based on extents described by @write_dio. >> + * @iterp: pointer to pointer to onstack array of 3 iov_iter structs from caller. >> + * @iter_is_dio_aligned: pointer to onstack array of 3 bools from caller. >> + * @rq_bvec: backing bio_vec used to setup all 3 iov_iter permutations. >> + * @nvecs: number of segments in @rq_bvec >> + * @cnt: size of the request in bytes >> + * @write_dio: nfsd_write_dio struct that describes start, middle and end extents. >> + * >> + * Returns the number of iov_iter that were setup. >> + */ >> +static int >> +nfsd_setup_write_dio_iters(struct iov_iter **iterp, bool *iter_is_dio_aligned, >> + struct bio_vec *rq_bvec, unsigned int nvecs, >> + unsigned long cnt, struct nfsd_write_dio *write_dio) >> +{ >> + int n_iters = 0; >> + struct iov_iter *iters = *iterp; >> + >> + /* Setup misaligned start? 
*/ >> + if (write_dio->start_len) { >> + iter_is_dio_aligned[n_iters] = false; >> + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt); >> + iters[n_iters].count = write_dio->start_len; >> + n_iters++; >> + } >> + >> + /* Setup DIO-aligned middle */ >> + iter_is_dio_aligned[n_iters] = true; >> + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt); >> + if (write_dio->start_len) >> + iov_iter_advance(&iters[n_iters], write_dio->start_len); >> + iters[n_iters].count -= write_dio->end_len; >> + n_iters++; >> + >> + /* Setup misaligned end? */ >> + if (write_dio->end_len) { >> + iter_is_dio_aligned[n_iters] = false; >> + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt); >> + iov_iter_advance(&iters[n_iters], >> + write_dio->start_len + write_dio->middle_len); >> + n_iters++; >> + } >> + >> + return n_iters; >> +} >> + >> +static int >> +nfsd_issue_write_buffered(struct svc_rqst *rqstp, struct file *file, >> + unsigned int nvecs, unsigned long *cnt, >> + struct kiocb *kiocb) >> +{ >> + struct iov_iter iter; >> + int host_err; >> + >> + iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); >> + host_err = vfs_iocb_iter_write(file, kiocb, &iter); >> + if (host_err < 0) >> + return host_err; >> + *cnt = host_err; >> + >> + return 0; >> +} >> + >> +static noinline int >> +nfsd_issue_write_dio(struct svc_rqst *rqstp, struct svc_fh *fhp, >> + struct nfsd_file *nf, loff_t offset, >> + unsigned int nvecs, unsigned long *cnt, >> + struct kiocb *kiocb) >> +{ >> + struct nfsd_write_dio write_dio; >> + struct file *file = nf->nf_file; >> + >> + if (!nfsd_analyze_write_dio(rqstp, fhp, nf, offset, >> + *cnt, &write_dio)) { >> + return nfsd_issue_write_buffered(rqstp, file, >> + nvecs, cnt, kiocb); >> + } else { >> + bool iter_is_dio_aligned[3]; >> + struct iov_iter iter_stack[3]; >> + struct iov_iter *iter = iter_stack; >> + unsigned int n_iters = 0; >> + int host_err; >> + >> + n_iters = nfsd_setup_write_dio_iters(&iter, >> + 
iter_is_dio_aligned, rqstp->rq_bvec, >> + nvecs, *cnt, &write_dio); >> + *cnt = 0; >> + for (int i = 0; i < n_iters; i++) { >> + if (iter_is_dio_aligned[i]) >> + kiocb->ki_flags |= IOCB_DIRECT; >> + else >> + kiocb->ki_flags &= ~IOCB_DIRECT; >> + host_err = vfs_iocb_iter_write(file, kiocb, >> + &iter[i]); >> + if (host_err < 0) >> + return host_err; >> + *cnt += host_err; >> + } >> + } >> + >> + return 0; >> +} >> + >> /** >> * nfsd_vfs_write - write data to an already-open file >> * @rqstp: RPC execution context >> @@ -1342,7 +1503,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, >> struct super_block *sb = file_inode(file)->i_sb; >> struct kiocb kiocb; >> struct svc_export *exp; >> - struct iov_iter iter; >> errseq_t since; >> __be32 nfserr; >> int host_err; >> @@ -1379,31 +1539,28 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, >> kiocb.ki_flags |= IOCB_DSYNC; >> >> nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); >> - iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); >> + >> + since = READ_ONCE(file->f_wb_err); >> + if (verf) >> + nfsd_copy_write_verifier(verf, nn); >> >> switch (nfsd_io_cache_write) { >> case NFSD_IO_DIRECT: >> - /* direct I/O must be aligned to device logical sector size */ >> - if (nf->nf_dio_mem_align && nf->nf_dio_offset_align && >> - (((offset | *cnt) & (nf->nf_dio_offset_align-1)) == 0)) >> - kiocb.ki_flags |= IOCB_DIRECT; >> + host_err = nfsd_issue_write_dio(rqstp, fhp, nf, offset, >> + nvecs, cnt, &kiocb); >> break; >> case NFSD_IO_DONTCACHE: >> kiocb.ki_flags |= IOCB_DONTCACHE; >> - break; >> + fallthrough; >> case NFSD_IO_BUFFERED: >> + host_err = nfsd_issue_write_buffered(rqstp, file, >> + nvecs, cnt, &kiocb); >> break; >> } >> - >> - since = READ_ONCE(file->f_wb_err); >> - if (verf) >> - nfsd_copy_write_verifier(verf, nn); >> - host_err = vfs_iocb_iter_write(file, &kiocb, &iter); >> if (host_err < 0) { >> commit_reset_write_verifier(nn, rqstp, host_err); >> goto 
out_nfserr; >> } >> - *cnt = host_err; >> nfsd_stats_io_write_add(nn, exp, *cnt); >> fsnotify_modify(file); >> host_err = filemap_check_wb_err(file->f_mapping, since); >> -- >> 2.44.0 >> >> > > Embarrassingly, it turns out I only tested the NFSD_IO_DIRECT case prior > to submitting this v5. If the 'nfsd_io_cache_write' debugfs file is > never written it defaults to NFSD_IO_UNSPECIFIED. But even if that's > the case we need to treat NFSD_IO_UNSPECIFIED like NFSD_IO_BUFFERED > (that is how nfsd_vfs_read behaves, but I missed this in > nfsd_vfs_write when I refactored the code for v5). > > This incremental patch fixes this oversight. Chuck, should I submit a > v6, or are you OK with folding in this fixup (if the rest of the code is > OK)? Hold off on v6 for now. Let's wait for more review comments. > Thanks, > Mike > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c > index 1b5aa3e6e6623..b529754a20bd5 100644 > --- a/fs/nfsd/vfs.c > +++ b/fs/nfsd/vfs.c > @@ -1505,7 +1505,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, > struct svc_export *exp; > errseq_t since; > __be32 nfserr; > - int host_err; > + int host_err = 0; > unsigned long exp_op_flags = 0; > unsigned int pflags = current->flags; > bool restore_flags = false; > @@ -1552,6 +1552,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, > case NFSD_IO_DONTCACHE: > kiocb.ki_flags |= IOCB_DONTCACHE; > fallthrough; > + case NFSD_IO_UNSPECIFIED: > case NFSD_IO_BUFFERED: > host_err = nfsd_issue_write_buffered(rqstp, file, > nvecs, cnt, &kiocb); -- Chuck Lever