On 6/14/25 11:59 AM, Sergey Bashirov wrote: > When pNFS client in the block or scsi layout mode sends layoutcommit > to MDS, a variable length array of modified extents is supplied within > the request. This patch allows the server to accept such extent arrays > if they do not fit within single memory page. > > The issue can be reproduced when writing to a 1GB file using FIO with > O_DIRECT, 4K block and large I/O depth without preallocation of the > file. In this case, the server returns NFSERR_BADXDR to the client. > > Co-developed-by: Konstantin Evtushenko <koevtushenko@xxxxxxxxxx> > Signed-off-by: Konstantin Evtushenko <koevtushenko@xxxxxxxxxx> > Signed-off-by: Sergey Bashirov <sergeybashirov@xxxxxxxxx> > --- > Changes in v3: > - Prerequisite: [v3] nfsd: Use correct error code when decoding extents > - Drop dprintk() > - Use svcxdr_init_decode() to init subbuf > - Tested manually the block layout driver using FIO > > fs/nfsd/blocklayout.c | 20 ++++--- > fs/nfsd/blocklayoutxdr.c | 118 ++++++++++++++++++++------------------- > fs/nfsd/blocklayoutxdr.h | 4 +- > fs/nfsd/nfs4proc.c | 2 +- > fs/nfsd/nfs4xdr.c | 11 ++-- > fs/nfsd/pnfs.h | 1 + > fs/nfsd/xdr4.h | 3 +- > 7 files changed, 83 insertions(+), 76 deletions(-) > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index 19078a043e85..54fbe157f84a 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -173,16 +173,18 @@ nfsd4_block_proc_getdeviceinfo(struct super_block *sb, > } > > static __be32 > -nfsd4_block_proc_layoutcommit(struct inode *inode, > +nfsd4_block_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, > struct nfsd4_layoutcommit *lcp) > { > struct iomap *iomaps; > int nr_iomaps; > __be32 nfserr; > > - nfserr = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, &nr_iomaps, > - i_blocksize(inode)); > + memcpy(&rqstp->rq_arg, &lcp->lc_up_layout, sizeof(struct xdr_buf)); > + svcxdr_init_decode(rqstp); > + > + nfserr = nfsd4_block_decode_layoutupdate(&rqstp->rq_arg_stream, > + &iomaps, &nr_iomaps, i_blocksize(inode)); > if (nfserr != nfs_ok) > return nfserr; > > @@ -313,16 +315,18 @@ nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, > return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp)); > } > static __be32 > -nfsd4_scsi_proc_layoutcommit(struct inode *inode, > +nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, > struct nfsd4_layoutcommit *lcp) > { > struct iomap *iomaps; > int nr_iomaps; > __be32 nfserr; > > - nfserr = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, &nr_iomaps, > - i_blocksize(inode)); > + memcpy(&rqstp->rq_arg, &lcp->lc_up_layout, sizeof(struct xdr_buf)); > + svcxdr_init_decode(rqstp); > + > + nfserr = nfsd4_scsi_decode_layoutupdate(&rqstp->rq_arg_stream, > + &iomaps, &nr_iomaps, i_blocksize(inode)); > if (nfserr != nfs_ok) > return nfserr; > > diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c > index 669ff8e6e966..266b2737882e 100644 > --- a/fs/nfsd/blocklayoutxdr.c > +++ b/fs/nfsd/blocklayoutxdr.c > @@ -114,8 +114,7 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > > /** > * nfsd4_block_decode_layoutupdate - decode the block layout extent array > - * @p: pointer to the xdr data > - * @len: number of bytes to decode > + * @xdr: subbuf set to the encoded array > * @iomapp: pointer to store the decoded extent array > * @nr_iomapsp: pointer to store the number of extents > * @block_size: alignment of extent offset and length > @@ -128,68 +127,74 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > * > * Return values: > * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid > - * %nfserr_bad_xdr: The encoded array in @p is invalid > + * %nfserr_bad_xdr: The encoded array in @xdr is invalid > * %nfserr_inval: An unaligned extent found > * %nfserr_delay: Failed to allocate memory for @iomapp > */ > __be32 > -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > +nfsd4_block_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, > int *nr_iomapsp, u32 block_size) > { > struct iomap *iomaps; > - u32 nr_iomaps, i; > + u32 nr_iomaps, expected, len, i; > + __be32 nfserr; > > - if (len < sizeof(u32)) { > - dprintk("%s: extent array too small: %u\n", __func__, len); > + if (xdr_stream_decode_u32(xdr, &nr_iomaps)) > return nfserr_bad_xdr; > - } > - len -= sizeof(u32); > - if (len % PNFS_BLOCK_EXTENT_SIZE) { > - dprintk("%s: extent array invalid: %u\n", __func__, len); > - return nfserr_bad_xdr; > - } > > - nr_iomaps = be32_to_cpup(p++); > - if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) { > - dprintk("%s: extent array size mismatch: %u/%u\n", > - __func__, len, nr_iomaps); > + len = sizeof(__be32) + xdr_stream_remaining(xdr); > + expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; > + if (len != expected) > return nfserr_bad_xdr; > - } > > iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); > - if (!iomaps) { > - dprintk("%s: failed to allocate extent array\n", __func__); > + if (!iomaps) > return nfserr_delay; > - } > > for (i = 0; i < nr_iomaps; i++) { > struct pnfs_block_extent bex; > + ssize_t ret; > > - memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); > - p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); > + ret = xdr_stream_decode_opaque_fixed(xdr, > + &bex.vol_id, sizeof(bex.vol_id)); > + if (ret < sizeof(bex.vol_id)) { > + nfserr = nfserr_bad_xdr; > + goto fail; > + } > > - p = xdr_decode_hyper(p, &bex.foff); > + if (xdr_stream_decode_u64(xdr, &bex.foff)) { > + nfserr = nfserr_bad_xdr; > + goto fail; > + } > if (bex.foff & (block_size - 1)) { > - dprintk("%s: unaligned offset 0x%llx\n", > - __func__, bex.foff); > + nfserr = nfserr_inval; > + goto fail; > + } > + > + if (xdr_stream_decode_u64(xdr, &bex.len)) { > + nfserr = nfserr_bad_xdr; > goto fail; > } > - p = xdr_decode_hyper(p, &bex.len); > if (bex.len & (block_size - 1)) { > - dprintk("%s: unaligned length 0x%llx\n", > - __func__, bex.foff); > + nfserr = nfserr_inval; > + goto fail; > + } > + > + if (xdr_stream_decode_u64(xdr, &bex.soff)) { > + nfserr = nfserr_bad_xdr; > goto fail; > } > - p = xdr_decode_hyper(p, &bex.soff); > if (bex.soff & (block_size - 1)) { > - dprintk("%s: unaligned disk offset 0x%llx\n", > - __func__, bex.soff); > + nfserr = nfserr_inval; > + goto fail; > + } > + > + if (xdr_stream_decode_u32(xdr, &bex.es)) { > + nfserr = nfserr_bad_xdr; > goto fail; > } > - bex.es = be32_to_cpup(p++); > if (bex.es != PNFS_BLOCK_READWRITE_DATA) { > - dprintk("%s: incorrect extent state %d\n", > - __func__, bex.es); > + nfserr = nfserr_inval; > goto fail; > } > > @@ -202,13 +207,12 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > return nfs_ok; > fail: > kfree(iomaps); > - return nfserr_inval; > + return nfserr; > } > > /** > * nfsd4_scsi_decode_layoutupdate - decode the scsi layout extent array > - * @p: pointer to the xdr data > - * @len: number of bytes to decode > + * @xdr: subbuf set to the encoded array > * @iomapp: pointer to store the decoded extent array > * @nr_iomapsp: pointer to store the number of extents > * @block_size: alignment of extent offset and length > @@ -220,49 +224,49 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > * > * Return values: > * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid > - * %nfserr_bad_xdr: The encoded array in @p is invalid > + * %nfserr_bad_xdr: The encoded array in @xdr is invalid > * %nfserr_inval: An unaligned extent found > * %nfserr_delay: Failed to allocate memory for @iomapp > */ > __be32 > -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > +nfsd4_scsi_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, > int *nr_iomapsp, u32 block_size) > { > struct iomap *iomaps; > - u32 nr_iomaps, expected, i; > + u32 nr_iomaps, expected, len, i; > + __be32 nfserr; > > - if (len < sizeof(u32)) { > - dprintk("%s: extent array too small: %u\n", __func__, len); > + if (xdr_stream_decode_u32(xdr, &nr_iomaps)) > return nfserr_bad_xdr; > - } > > - nr_iomaps = be32_to_cpup(p++); > + len = sizeof(__be32) + xdr_stream_remaining(xdr); > expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; > - if (len != expected) { > - dprintk("%s: extent array size mismatch: %u/%u\n", > - __func__, len, expected); > + if (len != expected) > return nfserr_bad_xdr; > - } > > iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); > - if (!iomaps) { > - dprintk("%s: failed to allocate extent array\n", __func__); > + if (!iomaps) > return nfserr_delay; > - } > > for (i = 0; i < nr_iomaps; i++) { > u64 val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(xdr, &val)) { > + nfserr = nfserr_bad_xdr; > + goto fail; > + } > if (val & (block_size - 1)) { > - dprintk("%s: unaligned offset 0x%llx\n", __func__, val); > + nfserr = nfserr_inval; > goto fail; > } > iomaps[i].offset = val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(xdr, &val)) { > + nfserr = nfserr_bad_xdr; > + goto fail; > + } > if (val & (block_size - 1)) { > - dprintk("%s: unaligned length 0x%llx\n", __func__, val); > + nfserr = nfserr_inval; > goto fail; > } > iomaps[i].length = val; > @@ -273,5 +277,5 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > return nfs_ok; > fail: > kfree(iomaps); > - return nfserr_inval; > + return nfserr; > } > diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h > index 15b3569f3d9a..7d25ef689671 100644 > --- a/fs/nfsd/blocklayoutxdr.h > +++ b/fs/nfsd/blocklayoutxdr.h > @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > const struct nfsd4_getdeviceinfo *gdp); > __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, > const struct nfsd4_layoutget *lgp); > -__be32 nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, > +__be32 nfsd4_block_decode_layoutupdate(struct xdr_stream *xdr, > struct iomap **iomapp, int *nr_iomapsp, u32 block_size); > -__be32 nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, > +__be32 nfsd4_scsi_decode_layoutupdate(struct xdr_stream *xdr, > struct iomap **iomapp, int *nr_iomapsp, u32 block_size); > > #endif /* _NFSD_BLOCKLAYOUTXDR_H */ > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c > index f13abbb13b38..873cd667477c 100644 > --- a/fs/nfsd/nfs4proc.c > +++ b/fs/nfsd/nfs4proc.c > @@ -2533,7 +2533,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, > lcp->lc_size_chg = false; > } > > - nfserr = ops->proc_layoutcommit(inode, lcp); > + nfserr = ops->proc_layoutcommit(inode, rqstp, lcp); > nfs4_put_stid(&ls->ls_stid); > out: > return nfserr; > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > index 3afcdbed6e14..659e60b85d5f 100644 > --- a/fs/nfsd/nfs4xdr.c > +++ b/fs/nfsd/nfs4xdr.c > @@ -604,6 +604,8 @@ static __be32 > nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, > struct nfsd4_layoutcommit *lcp) > { > + u32 len; > + > if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0) > return nfserr_bad_xdr; > if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES) > @@ -611,13 +613,10 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, > if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX) > return nfserr_bad_xdr; > > - if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) > + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) > + return nfserr_bad_xdr; > + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len)) > return nfserr_bad_xdr; > - if (lcp->lc_up_len > 0) { > - lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); > - if (!lcp->lc_up_layout) > - return nfserr_bad_xdr; > - } > > return nfs_ok; > } > diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h > index 925817f66917..dfd411d1f363 100644 > --- a/fs/nfsd/pnfs.h > +++ b/fs/nfsd/pnfs.h > @@ -35,6 +35,7 @@ struct nfsd4_layout_ops { > const struct nfsd4_layoutget *lgp); > > __be32 (*proc_layoutcommit)(struct inode *inode, > + struct svc_rqst *rqstp, > struct nfsd4_layoutcommit *lcp); > > void (*fence_client)(struct nfs4_layout_stateid *ls, > diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h > index aa2a356da784..02887029a81c 100644 > --- a/fs/nfsd/xdr4.h > +++ b/fs/nfsd/xdr4.h > @@ -630,8 +630,7 @@ struct nfsd4_layoutcommit { > u64 lc_last_wr; /* request */ > struct timespec64 lc_mtime; /* request */ > u32 lc_layout_type; /* request */ > - u32 lc_up_len; /* layout length */ > - void *lc_up_layout; /* decoded by callback */ > + struct xdr_buf lc_up_layout; /* decoded by callback */ > bool lc_size_chg; /* response */ > u64 lc_newsize; /* response */ > }; Hi Sergey - Typically we separate the clean-ups from the substantive changes. So removing the dprintk call sites would be done in a pre-requisite patch. I'm asking you to do it because if I split this patch up, I'm likely to get something wrong, and you have a convenient test case. Also, reposting means your tested version of the series is what gets archived on lore. Would you mind splitting this one up and posting a v4 ? -- Chuck Lever