Re: [PATCH v5 7/7] NFSD: issue WRITEs using O_DIRECT even if IO is misaligned

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 8/7/25 10:30 PM, Mike Snitzer wrote:
> On Thu, Aug 07, 2025 at 12:25:44PM -0400, Mike Snitzer wrote:
>> If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start,
>> middle and end as needed. The large middle extent is DIO-aligned and
>> the start and/or end are misaligned. Buffered IO is used for the
>> misaligned extents and O_DIRECT is used for the middle DIO-aligned
>> extent.
>>
>> The nfsd_analyze_write_dio trace event shows how NFSD splits a given
>> misaligned WRITE into a mix of misaligned extent(s) and a DIO-aligned
>> extent.
>>
>> This combination of trace events is useful:
>>
>>   echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_opened/enable
>>   echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_analyze_write_dio/enable
>>   echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_io_done/enable
>>   echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_write/enable
>>
>> Which for this dd command:
>>
>>   dd if=/dev/zero of=/mnt/share1/test bs=47008 count=2 oflag=direct
>>
>> Results in:
>>
>>   nfsd-23908   [010] ..... 10374.902333: nfsd_write_opened: xid=0x7fc5923b fh_hash=0x857ca4fc offset=0 len=47008
>>   nfsd-23908   [010] ..... 10374.902335: nfsd_analyze_write_dio: xid=0x7fc5923b fh_hash=0x857ca4fc offset=0 len=47008 start=0+0 middle=0+46592 end=46592+416
>>   nfsd-23908   [010] ..... 10374.902343: xfs_file_direct_write: dev 259:2 ino 0xc00116 disize 0x0 pos 0x0 bytecount 0xb600
>>   nfsd-23908   [010] ..... 10374.902697: nfsd_write_io_done: xid=0x7fc5923b fh_hash=0x857ca4fc offset=0 len=47008
>>
>>   nfsd-23908   [010] ..... 10374.902925: nfsd_write_opened: xid=0x80c5923b fh_hash=0x857ca4fc offset=47008 len=47008
>>   nfsd-23908   [010] ..... 10374.902926: nfsd_analyze_write_dio: xid=0x80c5923b fh_hash=0x857ca4fc offset=47008 len=47008 start=47008+96 middle=47104+46592 end=93696+320
>>   nfsd-23908   [010] ..... 10374.903010: xfs_file_direct_write: dev 259:2 ino 0xc00116 disize 0xb800 pos 0xb800 bytecount 0xb600
>>   nfsd-23908   [010] ..... 10374.903239: nfsd_write_io_done: xid=0x80c5923b fh_hash=0x857ca4fc offset=47008 len=47008
>>
>> Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx>
>> ---
>>  fs/nfsd/vfs.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 170 insertions(+), 13 deletions(-)
>>
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index be083a8812717..1b5aa3e6e6623 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -1315,6 +1315,167 @@ static int wait_for_concurrent_writes(struct file *file)
>>  	return err;
>>  }
>>  
>> +struct nfsd_write_dio {
>> +	loff_t middle_offset;	/* Offset for start of DIO-aligned middle */
>> +	loff_t end_offset;	/* Offset for start of misaligned last extent */
>> +	ssize_t	start_len;	/* Length for misaligned first extent */
>> +	ssize_t	middle_len;	/* Length for DIO-aligned middle extent */
>> +	ssize_t	end_len;	/* Length for misaligned last extent */
>> +};
>> +
>> +static bool
>> +nfsd_analyze_write_dio(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> +		       struct nfsd_file *nf, loff_t offset,
>> +		       unsigned long len, struct nfsd_write_dio *write_dio)
>> +{
>> +	const u32 dio_blocksize = nf->nf_dio_offset_align;
>> +	loff_t orig_end, middle_end, start_end, start_offset = offset;
>> +	ssize_t start_len = len;
>> +
>> +	if (WARN_ONCE(!nf->nf_dio_mem_align || !dio_blocksize,
>> +		      "%s: underlying filesystem has not provided DIO alignment info\n",
>> +		      __func__))
>> +		return false;
>> +	if (WARN_ONCE(dio_blocksize > PAGE_SIZE,
>> +		      "%s: underlying storage's dio_blocksize=%u > PAGE_SIZE=%lu\n",
>> +		      __func__, dio_blocksize, PAGE_SIZE))
>> +		return false;
>> +	if (unlikely(len < dio_blocksize))
>> +		return false;
>> +
>> +	memset(write_dio, 0, sizeof(*write_dio));
>> +
>> +	if (((offset | len) & (dio_blocksize-1)) == 0) {
>> +		/* already DIO-aligned, no misaligned head or tail */
>> +		write_dio->middle_offset = offset;
>> +		write_dio->middle_len = len;
>> +		/* clear these for the benefit of trace_nfsd_analyze_write_dio */
>> +		start_offset = 0;
>> +		start_len = 0;
>> +		goto out;
>> +	}
>> +
>> +	start_end = round_up(offset, dio_blocksize);
>> +	start_len = start_end - offset;
>> +	orig_end = offset + len;
>> +	middle_end = round_down(orig_end, dio_blocksize);
>> +
>> +	write_dio->start_len = start_len;
>> +	write_dio->middle_offset = start_end;
>> +	write_dio->middle_len = middle_end - start_end;
>> +	write_dio->end_offset = middle_end;
>> +	write_dio->end_len = orig_end - middle_end;
>> +out:
>> +	trace_nfsd_analyze_write_dio(rqstp, fhp, offset, len, start_offset, start_len,
>> +				     write_dio->middle_offset, write_dio->middle_len,
>> +				     write_dio->end_offset, write_dio->end_len);
>> +	return true;
>> +}
>> +
>> +/*
>> + * Setup as many as 3 iov_iter based on extents described by @write_dio.
>> + * @iterp: pointer to pointer to onstack array of 3 iov_iter structs from caller.
>> + * @iter_is_dio_aligned: pointer to onstack array of 3 bools from caller.
>> + * @rq_bvec: backing bio_vec used to setup all 3 iov_iter permutations.
>> + * @nvecs: number of segments in @rq_bvec
>> + * @cnt: size of the request in bytes
>> + * @write_dio: nfsd_write_dio struct that describes start, middle and end extents.
>> + *
>> + * Returns the number of iov_iter that were setup.
>> + */
>> +static int
>> +nfsd_setup_write_dio_iters(struct iov_iter **iterp, bool *iter_is_dio_aligned,
>> +			   struct bio_vec *rq_bvec, unsigned int nvecs,
>> +			   unsigned long cnt, struct nfsd_write_dio *write_dio)
>> +{
>> +	int n_iters = 0;
>> +	struct iov_iter *iters = *iterp;
>> +
>> +	/* Setup misaligned start? */
>> +	if (write_dio->start_len) {
>> +		iter_is_dio_aligned[n_iters] = false;
>> +		iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
>> +		iters[n_iters].count = write_dio->start_len;
>> +		n_iters++;
>> +	}
>> +
>> +	/* Setup DIO-aligned middle */
>> +	iter_is_dio_aligned[n_iters] = true;
>> +	iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
>> +	if (write_dio->start_len)
>> +		iov_iter_advance(&iters[n_iters], write_dio->start_len);
>> +	iters[n_iters].count -= write_dio->end_len;
>> +	n_iters++;
>> +
>> +	/* Setup misaligned end? */
>> +	if (write_dio->end_len) {
>> +		iter_is_dio_aligned[n_iters] = false;
>> +		iov_iter_bvec(&iters[n_iters], ITER_SOURCE, rq_bvec, nvecs, cnt);
>> +		iov_iter_advance(&iters[n_iters],
>> +				 write_dio->start_len + write_dio->middle_len);
>> +		n_iters++;
>> +	}
>> +
>> +	return n_iters;
>> +}
>> +
>> +static int
>> +nfsd_issue_write_buffered(struct svc_rqst *rqstp, struct file *file,
>> +			  unsigned int nvecs, unsigned long *cnt,
>> +			  struct kiocb *kiocb)
>> +{
>> +	struct iov_iter iter;
>> +	int host_err;
>> +
>> +	iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
>> +	host_err = vfs_iocb_iter_write(file, kiocb, &iter);
>> +	if (host_err < 0)
>> +		return host_err;
>> +	*cnt = host_err;
>> +
>> +	return 0;
>> +}
>> +
>> +static noinline int
>> +nfsd_issue_write_dio(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> +		     struct nfsd_file *nf, loff_t offset,
>> +		     unsigned int nvecs, unsigned long *cnt,
>> +		     struct kiocb *kiocb)
>> +{
>> +	struct nfsd_write_dio write_dio;
>> +	struct file *file = nf->nf_file;
>> +
>> +	if (!nfsd_analyze_write_dio(rqstp, fhp, nf, offset,
>> +				    *cnt, &write_dio)) {
>> +		return nfsd_issue_write_buffered(rqstp, file,
>> +					nvecs, cnt, kiocb);
>> +	} else {
>> +		bool iter_is_dio_aligned[3];
>> +		struct iov_iter iter_stack[3];
>> +		struct iov_iter *iter = iter_stack;
>> +		unsigned int n_iters = 0;
>> +		int host_err;
>> +
>> +		n_iters = nfsd_setup_write_dio_iters(&iter,
>> +				iter_is_dio_aligned, rqstp->rq_bvec,
>> +				nvecs, *cnt, &write_dio);
>> +		*cnt = 0;
>> +		for (int i = 0; i < n_iters; i++) {
>> +			if (iter_is_dio_aligned[i])
>> +				kiocb->ki_flags |= IOCB_DIRECT;
>> +			else
>> +				kiocb->ki_flags &= ~IOCB_DIRECT;
>> +			host_err = vfs_iocb_iter_write(file, kiocb,
>> +						       &iter[i]);
>> +			if (host_err < 0)
>> +				return host_err;
>> +			*cnt += host_err;
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  /**
>>   * nfsd_vfs_write - write data to an already-open file
>>   * @rqstp: RPC execution context
>> @@ -1342,7 +1503,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>  	struct super_block	*sb = file_inode(file)->i_sb;
>>  	struct kiocb		kiocb;
>>  	struct svc_export	*exp;
>> -	struct iov_iter		iter;
>>  	errseq_t		since;
>>  	__be32			nfserr;
>>  	int			host_err;
>> @@ -1379,31 +1539,28 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>  		kiocb.ki_flags |= IOCB_DSYNC;
>>  
>>  	nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
>> -	iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
>> +
>> +	since = READ_ONCE(file->f_wb_err);
>> +	if (verf)
>> +		nfsd_copy_write_verifier(verf, nn);
>>  
>>  	switch (nfsd_io_cache_write) {
>>  	case NFSD_IO_DIRECT:
>> -		/* direct I/O must be aligned to device logical sector size */
>> -		if (nf->nf_dio_mem_align && nf->nf_dio_offset_align &&
>> -		    (((offset | *cnt) & (nf->nf_dio_offset_align-1)) == 0))
>> -			kiocb.ki_flags |= IOCB_DIRECT;
>> +		host_err = nfsd_issue_write_dio(rqstp, fhp, nf, offset,
>> +						nvecs, cnt, &kiocb);
>>  		break;
>>  	case NFSD_IO_DONTCACHE:
>>  		kiocb.ki_flags |= IOCB_DONTCACHE;
>> -		break;
>> +		fallthrough;
>>  	case NFSD_IO_BUFFERED:
>> +		host_err = nfsd_issue_write_buffered(rqstp, file,
>> +						nvecs, cnt, &kiocb);
>>  		break;
>>  	}
>> -
>> -	since = READ_ONCE(file->f_wb_err);
>> -	if (verf)
>> -		nfsd_copy_write_verifier(verf, nn);
>> -	host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
>>  	if (host_err < 0) {
>>  		commit_reset_write_verifier(nn, rqstp, host_err);
>>  		goto out_nfserr;
>>  	}
>> -	*cnt = host_err;
>>  	nfsd_stats_io_write_add(nn, exp, *cnt);
>>  	fsnotify_modify(file);
>>  	host_err = filemap_check_wb_err(file->f_mapping, since);
>> -- 
>> 2.44.0
>>
>>
> 
> Embarrassingly, it turns out I only tested the NFSD_IO_DIRECT case
> prior to submitting this v5. If the 'nfsd_io_cache_write' debugfs file
> is never written, it defaults to NFSD_IO_UNSPECIFIED, and in that case
> we need to treat NFSD_IO_UNSPECIFIED like NFSD_IO_BUFFERED.
> (That is how nfsd_vfs_read behaves, but I missed this in
> nfsd_vfs_write when I refactored the code for v5.)
> 
> This incremental patch fixes this oversight. Chuck, should I submit a
> v6, or are you OK with folding in this fixup (if the rest of the code
> is OK)?

Hold off on v6 for now. Let's wait for more review comments.


> Thanks,
> Mike
> 
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 1b5aa3e6e6623..b529754a20bd5 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -1505,7 +1505,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  	struct svc_export	*exp;
>  	errseq_t		since;
>  	__be32			nfserr;
> -	int			host_err;
> +	int			host_err = 0;
>  	unsigned long		exp_op_flags = 0;
>  	unsigned int		pflags = current->flags;
>  	bool			restore_flags = false;
> @@ -1552,6 +1552,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  	case NFSD_IO_DONTCACHE:
>  		kiocb.ki_flags |= IOCB_DONTCACHE;
>  		fallthrough;
> +	case NFSD_IO_UNSPECIFIED:
>  	case NFSD_IO_BUFFERED:
>  		host_err = nfsd_issue_write_buffered(rqstp, file,
>  						nvecs, cnt, &kiocb);


-- 
Chuck Lever




[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux