If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start, middle and end as needed. The large middle extent is DIO-aligned and the start and/or end are misaligned. Buffered IO (with preference towards using DONTCACHE) is used for the misaligned extents and O_DIRECT is used for the middle DIO-aligned extent. If vfs_iocb_iter_write() returns -ENOTBLK, due to its inability to invalidate the page cache on behalf of the DIO WRITE, then nfsd_issue_write_dio() will fall back to using buffered IO. Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx> --- fs/nfsd/debugfs.c | 1 + fs/nfsd/trace.h | 1 + fs/nfsd/vfs.c | 215 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 212 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c index 00eb1ecef6ac1..7f44689e0a53e 100644 --- a/fs/nfsd/debugfs.c +++ b/fs/nfsd/debugfs.c @@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val) switch (val) { case NFSD_IO_BUFFERED: case NFSD_IO_DONTCACHE: + case NFSD_IO_DIRECT: nfsd_io_cache_write = val; break; default: diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 88901df5fbb11..4e560427f6e1e 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -469,6 +469,7 @@ DEFINE_NFSD_IO_EVENT(read_io_done); DEFINE_NFSD_IO_EVENT(read_done); DEFINE_NFSD_IO_EVENT(write_start); DEFINE_NFSD_IO_EVENT(write_opened); +DEFINE_NFSD_IO_EVENT(write_direct); DEFINE_NFSD_IO_EVENT(write_io_done); DEFINE_NFSD_IO_EVENT(write_done); DEFINE_NFSD_IO_EVENT(commit_start); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 239266c241d2c..e2039feaf95c8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1246,6 +1246,208 @@ static int wait_for_concurrent_writes(struct file *file) return err; } +struct nfsd_write_dio { + ssize_t start_len; /* Length for misaligned first extent */ + ssize_t middle_len; /* Length for DIO-aligned middle extent */ + ssize_t end_len; /* Length for misaligned last extent */ +}; + +static bool +nfsd_is_write_dio_possible(loff_t offset, unsigned long len, + struct nfsd_file *nf, struct nfsd_write_dio *write_dio) +{ + const u32 dio_blocksize = nf->nf_dio_offset_align; + loff_t start_end, orig_end, middle_end; + + if (unlikely(!nf->nf_dio_mem_align || !dio_blocksize)) + return false; + if (unlikely(dio_blocksize > PAGE_SIZE)) + return false; + if (unlikely(len < dio_blocksize)) + return false; + + start_end = round_up(offset, dio_blocksize); + orig_end = offset + len; + middle_end = round_down(orig_end, dio_blocksize); + + write_dio->start_len = start_end - offset; + write_dio->middle_len = middle_end - start_end; + write_dio->end_len = orig_end - middle_end; + + return true; +} + +static bool nfsd_iov_iter_aligned_bvec(const struct iov_iter *i, + unsigned int addr_mask, unsigned int len_mask) +{ + const struct bio_vec *bvec = i->bvec; + size_t skip = i->iov_offset; + size_t size = i->count; + + if (size & len_mask) + return false; + do { + size_t len = bvec->bv_len; + + if (len > size) + len = size; + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) + return false; + bvec++; + size -= len; + skip = 0; + } while (size); + + return true; +} + +/* + * Setup as many as 3 iov_iter based on extents described by @write_dio. + * @iterp: pointer to pointer to onstack array of 3 iov_iter structs from caller. + * @iter_is_dio_aligned: pointer to onstack array of 3 bools from caller. + * @rq_bvec: backing bio_vec used to setup all 3 iov_iter permutations. + * @nvecs: number of segments in @rq_bvec + * @cnt: size of the request in bytes + * @write_dio: nfsd_write_dio struct that describes start, middle and end extents. + * + * Returns the number of iov_iter that were setup. + */ +static int +nfsd_setup_write_dio_iters(struct iov_iter **iterp, bool *iter_is_dio_aligned, + struct bio_vec *rq_bvec, unsigned int nvecs, + unsigned long cnt, struct nfsd_write_dio *write_dio, + struct nfsd_file *nf) +{ + int n_iters = 0; + struct iov_iter *iters = *iterp; + + /* Setup misaligned start? */ + if (write_dio->start_len) { + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt); + iters[n_iters].count = write_dio->start_len; + iter_is_dio_aligned[n_iters] = false; + ++n_iters; + } + + /* Setup DIO-aligned middle */ + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt); + if (write_dio->start_len) + iov_iter_advance(&iters[n_iters], write_dio->start_len); + iters[n_iters].count -= write_dio->end_len; + iter_is_dio_aligned[n_iters] = + nfsd_iov_iter_aligned_bvec(&iters[n_iters], + nf->nf_dio_mem_align-1, nf->nf_dio_offset_align-1); + if (unlikely(!iter_is_dio_aligned[n_iters])) + return 0; /* no DIO-aligned IO possible */ + ++n_iters; + + /* Setup misaligned end? */ + if (write_dio->end_len) { + iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt); + iov_iter_advance(&iters[n_iters], + write_dio->start_len + write_dio->middle_len); + iter_is_dio_aligned[n_iters] = false; + ++n_iters; + } + + return n_iters; +} + +static int +nfsd_buffered_write(struct svc_rqst *rqstp, struct file *file, + unsigned int nvecs, unsigned long *cnt, + struct kiocb *kiocb) +{ + struct iov_iter iter; + int host_err; + + iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); + host_err = vfs_iocb_iter_write(file, kiocb, &iter); + if (host_err < 0) + return host_err; + *cnt = host_err; + + return 0; +} + +static int +nfsd_issue_write_dio(struct svc_rqst *rqstp, struct nfsd_file *nf, + loff_t offset, unsigned int nvecs, + unsigned long *cnt, struct kiocb *kiocb, + struct nfsd_write_dio *write_dio) +{ + struct file *file = nf->nf_file; + bool iter_is_dio_aligned[3]; + struct iov_iter iter_stack[3]; + struct iov_iter *iter = iter_stack; + unsigned int n_iters = 0; + unsigned long in_count = *cnt; + loff_t in_offset = kiocb->ki_pos; + ssize_t host_err; + + n_iters = nfsd_setup_write_dio_iters(&iter, iter_is_dio_aligned, + rqstp->rq_bvec, nvecs, *cnt, write_dio, nf); + if (unlikely(!n_iters)) + return nfsd_buffered_write(rqstp, file, nvecs, cnt, kiocb); + + *cnt = 0; + for (int i = 0; i < n_iters; i++) { + if (iter_is_dio_aligned[i]) + kiocb->ki_flags |= IOCB_DIRECT; + else + kiocb->ki_flags &= ~IOCB_DIRECT; + + host_err = vfs_iocb_iter_write(file, kiocb, &iter[i]); + if (host_err < 0) { + /* VFS will return -ENOTBLK if DIO WRITE fails to + * invalidate the page cache. Retry using buffered IO. + */ + if (unlikely(host_err == -ENOTBLK)) { + kiocb->ki_flags &= ~IOCB_DIRECT; + *cnt = in_count; + kiocb->ki_pos = in_offset; + return nfsd_buffered_write(rqstp, file, + nvecs, cnt, kiocb); + } else if (unlikely(host_err == -EINVAL)) { + pr_info_ratelimited("nfsd: Unexpected direct I/O alignment failure\n"); + host_err = -ESERVERFAULT; + } + return host_err; + } + *cnt += host_err; + if (host_err < iter[i].count) /* partial write? */ + break; + } + + return 0; +} + +static noinline_for_stack int +nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, loff_t offset, unsigned int nvecs, + unsigned long *cnt, struct kiocb *kiocb) +{ + struct nfsd_write_dio write_dio; + + /* Any buffered IO issued here will be misaligned, use + * IOCB_SYNC to ensure it has completed before returning. + */ + kiocb->ki_flags |= IOCB_SYNC; + /* Check if IOCB_DONTCACHE should be used when issuing buffered IO; + * if so, it will be ignored for any DIO issued here. + */ + if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE) + kiocb->ki_flags |= IOCB_DONTCACHE; + + if (nfsd_is_write_dio_possible(offset, *cnt, nf, &write_dio)) { + trace_nfsd_write_direct(rqstp, fhp, offset, *cnt); + return nfsd_issue_write_dio(rqstp, nf, offset, nvecs, + cnt, kiocb, &write_dio); + } + + return nfsd_buffered_write(rqstp, nf->nf_file, nvecs, cnt, kiocb); +} + /** * nfsd_vfs_write - write data to an already-open file * @rqstp: RPC execution context @@ -1273,7 +1475,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct super_block *sb = file_inode(file)->i_sb; struct kiocb kiocb; struct svc_export *exp; - struct iov_iter iter; errseq_t since; __be32 nfserr; int host_err; @@ -1310,25 +1511,29 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, kiocb.ki_flags |= IOCB_DSYNC; nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); - iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); + since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); switch (nfsd_io_cache_write) { - case NFSD_IO_BUFFERED: + case NFSD_IO_DIRECT: + host_err = nfsd_direct_write(rqstp, fhp, nf, offset, + nvecs, cnt, &kiocb); break; case NFSD_IO_DONTCACHE: if (file->f_op->fop_flags & FOP_DONTCACHE) kiocb.ki_flags |= IOCB_DONTCACHE; + fallthrough; /* must call nfsd_buffered_write */ + case NFSD_IO_BUFFERED: + host_err = nfsd_buffered_write(rqstp, file, + nvecs, cnt, &kiocb); break; } - host_err = vfs_iocb_iter_write(file, &kiocb, &iter); if (host_err < 0) { commit_reset_write_verifier(nn, rqstp, host_err); goto out_nfserr; } - *cnt = host_err; nfsd_stats_io_write_add(nn, exp, *cnt); fsnotify_modify(file); host_err = filemap_check_wb_err(file->f_mapping, since); -- 2.44.0