On 9/11/25 12:02 PM, Sergey Bashirov wrote: > This patch allows the pNFS server to respond with multiple extents > in a layoutget request. As a result, the number of layoutget requests > is significantly reduced for various file access patterns, including > random and parallel writes, avoiding unnecessary load to the server. > On the client side, this improves the performance of writing large > files and allows requesting layouts with minimum length greater than > PAGE_SIZE. > > Signed-off-by: Sergey Bashirov <sergeybashirov@xxxxxxxxx> > --- > Checked with smatch, tested on pNFS block volume setup. > > fs/nfsd/blocklayout.c | 167 +++++++++++++++++++++++++++++---------- > fs/nfsd/blocklayoutxdr.c | 36 ++++++--- > fs/nfsd/blocklayoutxdr.h | 5 ++ > 3 files changed, 157 insertions(+), 51 deletions(-) > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index fde5539cf6a6..d53f3ec8823a 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -17,48 +17,39 @@ > #define NFSDDBG_FACILITY NFSDDBG_PNFS > > > +/** > + * nfsd4_block_map_extent - get extent that covers the start of segment > + * @inode: inode of the file requested by the client > + * @fhp: handle of the file requested by the client > + * @seg: layout subrange requested by the client > + * @minlength: layout min length requested by the client > + * @bex: output block extent structure > + * > + * Get an extent from the file system that starts at @seg->offset or below, > + * but may be shorter than @seg->length. > + * > + * Return values: > + * %nfs_ok: Success, @bex is initialized and valid > + * %nfserr_layoutunavailable: Failed to get extent for requested @seg > + * OS errors converted to NFS errors > + */ > static __be32 > -nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, > - const struct svc_fh *fhp, struct nfsd4_layoutget *args) > +nfsd4_block_map_extent(struct inode *inode, const struct svc_fh *fhp, > + const struct nfsd4_layout_seg *seg, u64 minlength, > + struct pnfs_block_extent *bex) > { > - struct nfsd4_layout_seg *seg = &args->lg_seg; > struct super_block *sb = inode->i_sb; > - u32 block_size = i_blocksize(inode); > - struct pnfs_block_extent *bex; > struct iomap iomap; > u32 device_generation = 0; > int error; > > - if (locks_in_grace(SVC_NET(rqstp))) > - return nfserr_grace; > - > - if (seg->offset & (block_size - 1)) { > - dprintk("pnfsd: I/O misaligned\n"); > - goto out_layoutunavailable; > - } > - > - /* > - * Some clients barf on non-zero block numbers for NONE or INVALID > - * layouts, so make sure to zero the whole structure. > - */ > - error = -ENOMEM; > - bex = kzalloc(sizeof(*bex), GFP_KERNEL); > - if (!bex) > - goto out_error; > - args->lg_content = bex; > - > error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length, > &iomap, seg->iomode != IOMODE_READ, > &device_generation); > if (error) { > if (error == -ENXIO) > - goto out_layoutunavailable; > - goto out_error; > - } > - > - if (iomap.length < args->lg_minlength) { > - dprintk("pnfsd: extent smaller than minlength\n"); > - goto out_layoutunavailable; > + return nfserr_layoutunavailable; > + return nfserrno(error); > } > > switch (iomap.type) { > @@ -74,9 +65,9 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, > /* > * Crack monkey special case from section 2.3.1. > */ > - if (args->lg_minlength == 0) { > + if (minlength == 0) { > dprintk("pnfsd: no soup for you!\n"); > - goto out_layoutunavailable; > + return nfserr_layoutunavailable; > } > > bex->es = PNFS_BLOCK_INVALID_DATA; > @@ -93,27 +84,119 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, > case IOMAP_DELALLOC: > default: > WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type); > - goto out_layoutunavailable; > + return nfserr_layoutunavailable; > } > > error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation); > if (error) > - goto out_error; > + return nfserrno(error); > + > bex->foff = iomap.offset; > bex->len = iomap.length; > + return nfs_ok; > +} > > - seg->offset = iomap.offset; > - seg->length = iomap.length; > +/** > + * nfsd4_block_map_segment - get extent array for the requested layout > + * @inode: inode of the file requested by the client > + * @fhp: handle of the file requested by the client > + * @seg: layout range requested by the client > + * @minlength: layout min length requested by the client > + * @bl: output array of block extents > + * > + * Get an array of consecutive block extents that span the requested > + * layout range. The resulting range may be shorter than requested if > + * all preallocated block extents are used. > + * > + * Return values: > + * %nfs_ok: Success, @bl initialized and valid > + * %nfserr_layoutunavailable: Failed to get extents for requested @seg > + * OS errors converted to NFS errors > + */ > +static __be32 > +nfsd4_block_map_segment(struct inode *inode, const struct svc_fh *fhp, > + const struct nfsd4_layout_seg *seg, u64 minlength, > + struct pnfs_block_layout *bl) > +{ > + struct nfsd4_layout_seg subseg = *seg; > + u32 i; > + __be32 nfserr; > > - dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es); > - return 0; > + for (i = 0; i < bl->nr_extents; i++) { > + struct pnfs_block_extent *extent = bl->extents + i; > + u64 extent_len; > + > + nfserr = nfsd4_block_map_extent(inode, fhp, &subseg, > + minlength, extent); > + if (nfserr != nfs_ok) > + return nfserr; > + > + extent_len = extent->len - (subseg.offset - extent->foff); > + if (extent_len >= subseg.length) { > + bl->nr_extents = i + 1; > + return nfs_ok; > + } > + > + subseg.offset = extent->foff + extent->len; > + subseg.length -= extent_len; > + } > + > + return nfs_ok; > +} > + > +static __be32 > +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, > + const struct svc_fh *fhp, struct nfsd4_layoutget *args) > +{ > + struct nfsd4_layout_seg *seg = &args->lg_seg; > + u64 seg_length; > + struct pnfs_block_extent *first_bex, *last_bex; > + struct pnfs_block_layout *bl; > + u32 nr_extents_max = PAGE_SIZE / sizeof(bl->extents[0]) - 1; > + u32 block_size = i_blocksize(inode); > + __be32 nfserr; > + > + if (locks_in_grace(SVC_NET(rqstp))) > + return nfserr_grace; > + > + nfserr = nfserr_layoutunavailable; > + if (seg->offset & (block_size - 1)) { > + dprintk("pnfsd: I/O misaligned\n"); > + goto out_error; > + } > + > + /* > + * Some clients barf on non-zero block numbers for NONE or INVALID > + * layouts, so make sure to zero the whole structure. > + */ > + nfserr = nfserrno(-ENOMEM); > + bl = kzalloc(struct_size(bl, extents, nr_extents_max), GFP_KERNEL); > + if (!bl) > + goto out_error; > + bl->nr_extents = nr_extents_max; > + args->lg_content = bl; > + > + nfserr = nfsd4_block_map_segment(inode, fhp, seg, > + args->lg_minlength, bl); > + if (nfserr != nfs_ok) > + goto out_error; > + first_bex = bl->extents; > + last_bex = bl->extents + bl->nr_extents - 1; > + > + nfserr = nfserr_layoutunavailable; > + seg_length = last_bex->foff + last_bex->len - seg->offset; > + if (seg_length < args->lg_minlength) { > + dprintk("pnfsd: extent smaller than minlength\n"); > + goto out_error; > + } > + > + seg->offset = first_bex->foff; > + seg->length = last_bex->foff - first_bex->foff + last_bex->len; > + return nfs_ok; > > out_error: > seg->length = 0; > - return nfserrno(error); > -out_layoutunavailable: > - seg->length = 0; > - return nfserr_layoutunavailable; > + return nfserr; > } > > static __be32 > diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c > index e50afe340737..68c112d47cee 100644 > --- a/fs/nfsd/blocklayoutxdr.c > +++ b/fs/nfsd/blocklayoutxdr.c > @@ -14,12 +14,25 @@ > #define NFSDDBG_FACILITY NFSDDBG_PNFS > > > +/** > + * nfsd4_block_encode_layoutget - encode block/scsi layout extent array > + * @xdr: stream for data encoding > + * @lgp: layoutget content, actually an array of extents to encode > + * > + * This function encodes the opaque loc_body field in the layoutget response. > + * Since the pnfs_block_layout4 and pnfs_scsi_layout4 structures on the wire > + * are the same, this function is used by both layout drivers. > + * > + * Return values: > + * %nfs_ok: Success, all extents encoded into @xdr > + * %nfserr_toosmall: Not enough space in @xdr to encode all the data > + */ > __be32 > nfsd4_block_encode_layoutget(struct xdr_stream *xdr, > const struct nfsd4_layoutget *lgp) > { > - const struct pnfs_block_extent *b = lgp->lg_content; > - int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32); > + const struct pnfs_block_layout *bl = lgp->lg_content; > + u32 i, len = sizeof(__be32) + bl->nr_extents * PNFS_BLOCK_EXTENT_SIZE; > __be32 *p; > > p = xdr_reserve_space(xdr, sizeof(__be32) + len); > @@ -27,14 +40,19 @@ nfsd4_block_encode_layoutget(struct xdr_stream *xdr, > return nfserr_toosmall; > > *p++ = cpu_to_be32(len); > - *p++ = cpu_to_be32(1); /* we always return a single extent */ > + *p++ = cpu_to_be32(bl->nr_extents); > > - p = svcxdr_encode_deviceid4(p, &b->vol_id); > - p = xdr_encode_hyper(p, b->foff); > - p = xdr_encode_hyper(p, b->len); > - p = xdr_encode_hyper(p, b->soff); > - *p++ = cpu_to_be32(b->es); > - return 0; > + for (i = 0; i < bl->nr_extents; i++) { > + const struct pnfs_block_extent *bex = bl->extents + i; > + > + p = svcxdr_encode_deviceid4(p, &bex->vol_id); > + p = xdr_encode_hyper(p, bex->foff); > + p = xdr_encode_hyper(p, bex->len); > + p = xdr_encode_hyper(p, bex->soff); > + *p++ = cpu_to_be32(bex->es); > + } > + > + return nfs_ok; > } > > static int > diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h > index 7d25ef689671..54fe7f517a94 100644 > --- a/fs/nfsd/blocklayoutxdr.h > +++ b/fs/nfsd/blocklayoutxdr.h > @@ -21,6 +21,11 @@ struct pnfs_block_range { > u64 len; > }; > > +struct pnfs_block_layout { > + u32 nr_extents; > + struct pnfs_block_extent extents[] __counted_by(nr_extents); > +}; > + > /* > * Random upper cap for the uuid length to avoid unbounded allocation. > * Not actually limited by the protocol. Dai, Christoph - please review and/or test. -- Chuck Lever