On Sat, 2025-06-21 at 19:52 +0300, Sergey Bashirov wrote: > When pNFS client in the block or scsi layout mode sends layoutcommit > to MDS, a variable length array of modified extents is supplied within > the request. This patch allows the server to accept such extent arrays > if they do not fit within single memory page. > > The issue can be reproduced when writing to a 1GB file using FIO with > O_DIRECT, 4K block and large I/O depth without preallocation of the > file. In this case, the server returns NFSERR_BADXDR to the client. > > Co-developed-by: Konstantin Evtushenko <koevtushenko@xxxxxxxxxx> > Signed-off-by: Konstantin Evtushenko <koevtushenko@xxxxxxxxxx> > Signed-off-by: Sergey Bashirov <sergeybashirov@xxxxxxxxx> > --- > fs/nfsd/blocklayout.c | 20 ++++++---- > fs/nfsd/blocklayoutxdr.c | 86 +++++++++++++++++++++++++++------------- > fs/nfsd/blocklayoutxdr.h | 4 +- > fs/nfsd/nfs4proc.c | 2 +- > fs/nfsd/nfs4xdr.c | 11 +++-- > fs/nfsd/pnfs.h | 1 + > fs/nfsd/xdr4.h | 3 +- > 7 files changed, 80 insertions(+), 47 deletions(-) > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index 19078a043e85..54fbe157f84a 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -173,16 +173,18 @@ nfsd4_block_proc_getdeviceinfo(struct super_block *sb, > } > > static __be32 > -nfsd4_block_proc_layoutcommit(struct inode *inode, > +nfsd4_block_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, > struct nfsd4_layoutcommit *lcp) > { > struct iomap *iomaps; > int nr_iomaps; > __be32 nfserr; > > - nfserr = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, &nr_iomaps, > - i_blocksize(inode)); > + memcpy(&rqstp->rq_arg, &lcp->lc_up_layout, sizeof(struct xdr_buf)); Huh? rqstp->rq_arg is an xdr_buf. This is going to end up scri > + svcxdr_init_decode(rqstp); > + > + nfserr = nfsd4_block_decode_layoutupdate(&rqstp->rq_arg_stream, > + &iomaps, &nr_iomaps, i_blocksize(inode)); > if (nfserr != nfs_ok) > return nfserr; > > @@ -313,16 +315,18 @@ nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, > return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp)); > } > static __be32 > -nfsd4_scsi_proc_layoutcommit(struct inode *inode, > +nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, > struct nfsd4_layoutcommit *lcp) > { > struct iomap *iomaps; > int nr_iomaps; > __be32 nfserr; > > - nfserr = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, &nr_iomaps, > - i_blocksize(inode)); > + memcpy(&rqstp->rq_arg, &lcp->lc_up_layout, sizeof(struct xdr_buf)); > + svcxdr_init_decode(rqstp); > + > + nfserr = nfsd4_scsi_decode_layoutupdate(&rqstp->rq_arg_stream, > + &iomaps, &nr_iomaps, i_blocksize(inode)); > if (nfserr != nfs_ok) > return nfserr; > > diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c > index bcf21fde9120..266b2737882e 100644 > --- a/fs/nfsd/blocklayoutxdr.c > +++ b/fs/nfsd/blocklayoutxdr.c > @@ -114,8 +114,7 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > > /** > * nfsd4_block_decode_layoutupdate - decode the block layout extent array > - * @p: pointer to the xdr data > - * @len: number of bytes to decode > + * @xdr: subbuf set to the encoded array > * @iomapp: pointer to store the decoded extent array > * @nr_iomapsp: pointer to store the number of extents > * @block_size: alignment of extent offset and length > @@ -128,25 +127,24 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > * > * Return values: > * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid > - * %nfserr_bad_xdr: The encoded array in @p is invalid > + * %nfserr_bad_xdr: The encoded array in @xdr is invalid > * %nfserr_inval: An unaligned extent found > * %nfserr_delay: Failed to allocate memory for @iomapp > */ > __be32 > -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > +nfsd4_block_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, > int *nr_iomapsp, u32 block_size) > { > struct iomap *iomaps; > - u32 nr_iomaps, i; > + u32 nr_iomaps, expected, len, i; > + __be32 nfserr; > > - if (len < sizeof(u32)) > - return nfserr_bad_xdr; > - len -= sizeof(u32); > - if (len % PNFS_BLOCK_EXTENT_SIZE) > + if (xdr_stream_decode_u32(xdr, &nr_iomaps)) > return nfserr_bad_xdr; > > - nr_iomaps = be32_to_cpup(p++); > - if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) > + len = sizeof(__be32) + xdr_stream_remaining(xdr); > + expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; > + if (len != expected) > return nfserr_bad_xdr; > > iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); > @@ -155,24 +153,48 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > > for (i = 0; i < nr_iomaps; i++) { > struct pnfs_block_extent bex; > + ssize_t ret; > > - memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); > - p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); > + ret = xdr_stream_decode_opaque_fixed(xdr, > + &bex.vol_id, sizeof(bex.vol_id)); > + if (ret < sizeof(bex.vol_id)) { > + nfserr = nfserr_bad_xdr; > + goto fail; > + } > > - p = xdr_decode_hyper(p, &bex.foff); > + if (xdr_stream_decode_u64(xdr, &bex.foff)) { > + nfserr = nfserr_bad_xdr; > + goto fail; > + } > if (bex.foff & (block_size - 1)) { > + nfserr = nfserr_inval; > + goto fail; > + } > + > + if (xdr_stream_decode_u64(xdr, &bex.len)) { > + nfserr = nfserr_bad_xdr; > goto fail; > } > - p = xdr_decode_hyper(p, &bex.len); > if (bex.len & (block_size - 1)) { > + nfserr = nfserr_inval; > + goto fail; > + } > + > + if (xdr_stream_decode_u64(xdr, &bex.soff)) { > + nfserr = nfserr_bad_xdr; > goto fail; > } > - p = xdr_decode_hyper(p, &bex.soff); > if (bex.soff & (block_size - 1)) { > + nfserr = nfserr_inval; > + goto fail; > + } > + > + if (xdr_stream_decode_u32(xdr, &bex.es)) { > + nfserr = nfserr_bad_xdr; > goto fail; > } > - bex.es = be32_to_cpup(p++); > if (bex.es != PNFS_BLOCK_READWRITE_DATA) { > + nfserr = nfserr_inval; > goto fail; > } > > @@ -185,13 +207,12 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > return nfs_ok; > fail: > kfree(iomaps); > - return nfserr_inval; > + return nfserr; > } > > /** > * nfsd4_scsi_decode_layoutupdate - decode the scsi layout extent array > - * @p: pointer to the xdr data > - * @len: number of bytes to decode > + * @xdr: subbuf set to the encoded array > * @iomapp: pointer to store the decoded extent array > * @nr_iomapsp: pointer to store the number of extents > * @block_size: alignment of extent offset and length > @@ -203,21 +224,22 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > * > * Return values: > * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid > - * %nfserr_bad_xdr: The encoded array in @p is invalid > + * %nfserr_bad_xdr: The encoded array in @xdr is invalid > * %nfserr_inval: An unaligned extent found > * %nfserr_delay: Failed to allocate memory for @iomapp > */ > __be32 > -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > +nfsd4_scsi_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, > int *nr_iomapsp, u32 block_size) > { > struct iomap *iomaps; > - u32 nr_iomaps, expected, i; > + u32 nr_iomaps, expected, len, i; > + __be32 nfserr; > > - if (len < sizeof(u32)) > + if (xdr_stream_decode_u32(xdr, &nr_iomaps)) > return nfserr_bad_xdr; > > - nr_iomaps = be32_to_cpup(p++); > + len = sizeof(__be32) + xdr_stream_remaining(xdr); > expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; > if (len != expected) > return nfserr_bad_xdr; > @@ -229,14 +251,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > for (i = 0; i < nr_iomaps; i++) { > u64 val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(xdr, &val)) { > + nfserr = nfserr_bad_xdr; > + goto fail; > + } > if (val & (block_size - 1)) { > + nfserr = nfserr_inval; > goto fail; > } > iomaps[i].offset = val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(xdr, &val)) { > + nfserr = nfserr_bad_xdr; > + goto fail; > + } > if (val & (block_size - 1)) { > + nfserr = nfserr_inval; > goto fail; > } > iomaps[i].length = val; > @@ -247,5 +277,5 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > return nfs_ok; > fail: > kfree(iomaps); > - return nfserr_inval; > + return nfserr; > } > diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h > index 15b3569f3d9a..7d25ef689671 100644 > --- a/fs/nfsd/blocklayoutxdr.h > +++ b/fs/nfsd/blocklayoutxdr.h > @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > const struct nfsd4_getdeviceinfo *gdp); > __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, > const struct nfsd4_layoutget *lgp); > -__be32 nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, > +__be32 nfsd4_block_decode_layoutupdate(struct xdr_stream *xdr, > struct iomap **iomapp, int *nr_iomapsp, u32 block_size); > -__be32 nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, > +__be32 nfsd4_scsi_decode_layoutupdate(struct xdr_stream *xdr, > struct iomap **iomapp, int *nr_iomapsp, u32 block_size); > > #endif /* _NFSD_BLOCKLAYOUTXDR_H */ > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c > index f13abbb13b38..873cd667477c 100644 > --- a/fs/nfsd/nfs4proc.c > +++ b/fs/nfsd/nfs4proc.c > @@ -2533,7 +2533,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, > lcp->lc_size_chg = false; > } > > - nfserr = ops->proc_layoutcommit(inode, lcp); > + nfserr = ops->proc_layoutcommit(inode, rqstp, lcp); > nfs4_put_stid(&ls->ls_stid); > out: > return nfserr; > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > index 3afcdbed6e14..659e60b85d5f 100644 > --- a/fs/nfsd/nfs4xdr.c > +++ b/fs/nfsd/nfs4xdr.c > @@ -604,6 +604,8 @@ static __be32 > nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, > struct nfsd4_layoutcommit *lcp) > { > + u32 len; > + > if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0) > return nfserr_bad_xdr; > if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES) > @@ -611,13 +613,10 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, > if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX) > return nfserr_bad_xdr; > > - if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) > + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) > + return nfserr_bad_xdr; > + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len)) > return nfserr_bad_xdr; > - if (lcp->lc_up_len > 0) { > - lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); > - if (!lcp->lc_up_layout) > - return nfserr_bad_xdr; > - } > > return nfs_ok; > } > diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h > index 925817f66917..dfd411d1f363 100644 > --- a/fs/nfsd/pnfs.h > +++ b/fs/nfsd/pnfs.h > @@ -35,6 +35,7 @@ struct nfsd4_layout_ops { > const struct nfsd4_layoutget *lgp); > > __be32 (*proc_layoutcommit)(struct inode *inode, > + struct svc_rqst *rqstp, > struct nfsd4_layoutcommit *lcp); > > void (*fence_client)(struct nfs4_layout_stateid *ls, > diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h > index aa2a356da784..02887029a81c 100644 > --- a/fs/nfsd/xdr4.h > +++ b/fs/nfsd/xdr4.h > @@ -630,8 +630,7 @@ struct nfsd4_layoutcommit { > u64 lc_last_wr; /* request */ > struct timespec64 lc_mtime; /* request */ > u32 lc_layout_type; /* request */ > - u32 lc_up_len; /* layout length */ > - void *lc_up_layout; /* decoded by callback */ > + struct xdr_buf lc_up_layout; /* decoded by callback */ > bool lc_size_chg; /* response */ > u64 lc_newsize; /* response */ > }; LGTM. Nice work! Reviewed-by: Jeff Layton <jlayton@xxxxxxxxxx>