On 6/9/25 9:18 PM, Sergey Bashirov wrote: > When pNFS client in block layout mode sends layoutcommit RPC to MDS, > a variable length array of modified extents is supplied within request. > This patch allows NFS server to accept such extent arrays if they do not > fit within single memory page. > > The issue can be reproduced when writing to a 1GB file using FIO with small > block size and large I/O depth. Server returns NFSERR_BADXDR to the client. > > Co-developed-by: Konstantin Evtushenko <koevtushenko@xxxxxxxxxx> > Signed-off-by: Konstantin Evtushenko <koevtushenko@xxxxxxxxxx> > Signed-off-by: Sergey Bashirov <sergeybashirov@xxxxxxxxx> > --- > Changes in v2: > - Drop the len argument in *_decode_layoutupdate() > - Remove lc_up_len field from nfsd4_layoutcommit structure > - Fix code formatting > > fs/nfsd/blocklayout.c | 8 ++-- > fs/nfsd/blocklayoutxdr.c | 89 +++++++++++++++++++++++++++++----------- > fs/nfsd/blocklayoutxdr.h | 8 ++-- > fs/nfsd/nfs4xdr.c | 11 +++-- > fs/nfsd/xdr4.h | 3 +- > 5 files changed, 78 insertions(+), 41 deletions(-) > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index 08a20e5bcf7f..e74c3a19aeb9 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -179,8 +179,8 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, > struct iomap *iomaps; > int nr_iomaps; > > - nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, i_blocksize(inode)); > + nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout, > + &iomaps, i_blocksize(inode)); > if (nr_iomaps < 0) > return nfserrno(nr_iomaps); > > @@ -317,8 +317,8 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, > struct iomap *iomaps; > int nr_iomaps; > > - nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, i_blocksize(inode)); > + nr_iomaps = nfsd4_scsi_decode_layoutupdate(&lcp->lc_up_layout, > + &iomaps, i_blocksize(inode)); > if (nr_iomaps < 0) > return nfserrno(nr_iomaps); > > diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c > index ce78f74715ee..b76dbb119e19 100644 > --- a/fs/nfsd/blocklayoutxdr.c > +++ b/fs/nfsd/blocklayoutxdr.c > @@ -113,26 +113,27 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > } > > int > -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > +nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, struct iomap **iomapp, > u32 block_size) > { > struct iomap *iomaps; > - u32 nr_iomaps, i; > + u32 nr_iomaps, expected, len, i; > + struct xdr_stream xdr; > + char scratch[sizeof(struct pnfs_block_extent)]; Using "sizeof(struct yada)" for an XDR decoding buffer is incorrect. struct pnfs_block_extent is a C structure, with padding and architecture-dependent field sizes. It does not represent the size of the XDR data items on the wire. > - if (len < sizeof(u32)) { > - dprintk("%s: extent array too small: %u\n", __func__, len); > - return -EINVAL; > - } > - len -= sizeof(u32); > - if (len % PNFS_BLOCK_EXTENT_SIZE) { > - dprintk("%s: extent array invalid: %u\n", __func__, len); > + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); > + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); Consider using svcxdr_init_decode() instead. > + > + if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) { > + dprintk("%s: failed to decode extent array size\n", __func__); > return -EINVAL; > } Throughout, please drop the dprintk's. At least don't add any new ones. I find the use of -EINVAL to signal an XDR decoding error to be somewhat inappropriate, but I guess it's historical (ie, its usage is not introduced by your patch). I would have expected perhaps NFS4ERR_BADXDR. Perhaps that should be corrected in a prerequisite patch. NFS4ERR_EINVAL means the server was able to decode the request but the decoded values are invalid. Here, the decoding has failed, and that's really a different problem. > - nr_iomaps = be32_to_cpup(p++); > - if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) { > + len = sizeof(__be32) + xdr_stream_remaining(&xdr); > + expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; > + if (len != expected) { > dprintk("%s: extent array size mismatch: %u/%u\n", > - __func__, len, nr_iomaps); > + __func__, len, expected); > return -EINVAL; > } > > @@ -144,29 +145,54 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > > for (i = 0; i < nr_iomaps; i++) { > struct pnfs_block_extent bex; > + ssize_t ret; > > - memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); > - p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); > + ret = xdr_stream_decode_opaque_fixed(&xdr, > + &bex.vol_id, sizeof(bex.vol_id)); > + if (ret < sizeof(bex.vol_id)) { > + dprintk("%s: failed to decode device id for entry %u\n", > + __func__, i); > + goto fail; > + } > > - p = xdr_decode_hyper(p, &bex.foff); > + if (xdr_stream_decode_u64(&xdr, &bex.foff)) { > + dprintk("%s: failed to decode offset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.foff & (block_size - 1)) { > dprintk("%s: unaligned offset 0x%llx\n", > __func__, bex.foff); > goto fail; > } > - p = xdr_decode_hyper(p, &bex.len); > + > + if (xdr_stream_decode_u64(&xdr, &bex.len)) { > + dprintk("%s: failed to decode length for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.len & (block_size - 1)) { > dprintk("%s: unaligned length 0x%llx\n", > __func__, bex.foff); > goto fail; > } > - p = xdr_decode_hyper(p, &bex.soff); > + > + if (xdr_stream_decode_u64(&xdr, &bex.soff)) { > + dprintk("%s: failed to decode soffset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.soff & (block_size - 1)) { > dprintk("%s: unaligned disk offset 0x%llx\n", > __func__, bex.soff); > goto fail; > } > - bex.es = be32_to_cpup(p++); > + > + if (xdr_stream_decode_u32(&xdr, &bex.es)) { > + dprintk("%s: failed to decode estate for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.es != PNFS_BLOCK_READWRITE_DATA) { > dprintk("%s: incorrect extent state %d\n", > __func__, bex.es); > @@ -185,18 +211,23 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > } > > int > -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > +nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, struct iomap **iomapp, > u32 block_size) > { > struct iomap *iomaps; > - u32 nr_iomaps, expected, i; > + u32 nr_iomaps, expected, len, i; > + struct xdr_stream xdr; > + char scratch[sizeof(struct pnfs_block_range)]; > > - if (len < sizeof(u32)) { > - dprintk("%s: extent array too small: %u\n", __func__, len); > + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); > + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); > + > + if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) { > + dprintk("%s: failed to decode extent array size\n", __func__); > return -EINVAL; > } > > - nr_iomaps = be32_to_cpup(p++); > + len = sizeof(__be32) + xdr_stream_remaining(&xdr); > expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; > if (len != expected) { > dprintk("%s: extent array size mismatch: %u/%u\n", > @@ -213,14 +244,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > for (i = 0; i < nr_iomaps; i++) { > u64 val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(&xdr, &val)) { > + dprintk("%s: failed to decode offset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (val & (block_size - 1)) { > dprintk("%s: unaligned offset 0x%llx\n", __func__, val); > goto fail; > } > iomaps[i].offset = val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(&xdr, &val)) { > + dprintk("%s: failed to decode length for entry %u\n", > + __func__, i); > + goto fail; > + } > if (val & (block_size - 1)) { > dprintk("%s: unaligned length 0x%llx\n", __func__, val); > goto fail; > diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h > index 4e28ac8f1127..6e603cfec0a7 100644 > --- a/fs/nfsd/blocklayoutxdr.h > +++ b/fs/nfsd/blocklayoutxdr.h > @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > const struct nfsd4_getdeviceinfo *gdp); > __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, > const struct nfsd4_layoutget *lgp); > -int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size); > -int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size); > +int nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, > + struct iomap **iomapp, u32 block_size); > +int nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, > + struct iomap **iomapp, u32 block_size); > > #endif /* _NFSD_BLOCKLAYOUTXDR_H */ > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > index 3afcdbed6e14..659e60b85d5f 100644 > --- a/fs/nfsd/nfs4xdr.c > +++ b/fs/nfsd/nfs4xdr.c > @@ -604,6 +604,8 @@ static __be32 > nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, > struct nfsd4_layoutcommit *lcp) > { > + u32 len; > + > if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0) > return nfserr_bad_xdr; > if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES) > @@ -611,13 +613,10 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, > if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX) > return nfserr_bad_xdr; > > - if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) > + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) > + return nfserr_bad_xdr; > + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len)) > return nfserr_bad_xdr; The layout is effectively an opaque payload at this point, so using xdr_stream_subsegment() makes sense to me. > - if (lcp->lc_up_len > 0) { > - lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); > - if (!lcp->lc_up_layout) > - return nfserr_bad_xdr; > - } > > return nfs_ok; > } > diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h > index aa2a356da784..02887029a81c 100644 > --- a/fs/nfsd/xdr4.h > +++ b/fs/nfsd/xdr4.h > @@ -630,8 +630,7 @@ struct nfsd4_layoutcommit { > u64 lc_last_wr; /* request */ > struct timespec64 lc_mtime; /* request */ > u32 lc_layout_type; /* request */ > - u32 lc_up_len; /* layout length */ > - void *lc_up_layout; /* decoded by callback */ > + struct xdr_buf lc_up_layout; /* decoded by callback */ > bool lc_size_chg; /* response */ > u64 lc_newsize; /* response */ > }; -- Chuck Lever