On 6/4/25 9:07 AM, Sergey Bashirov wrote: > When pNFS client in block layout mode sends layoutcommit RPC to MDS, > a variable length array of modified extents is supplied within request. > This patch allows NFS server to accept such extent arrays if they do not > fit within single memory page. > > Co-developed-by: Konstantin Evtushenko <koevtushenko@xxxxxxxxxx> > Signed-off-by: Konstantin Evtushenko <koevtushenko@xxxxxxxxxx> > Signed-off-by: Sergey Bashirov <sergeybashirov@xxxxxxxxx> > --- > fs/nfsd/blocklayout.c | 12 ++++--- > fs/nfsd/blocklayoutxdr.c | 78 ++++++++++++++++++++++++++++++++-------- > fs/nfsd/blocklayoutxdr.h | 8 ++--- > fs/nfsd/nfs4xdr.c | 7 ++-- > fs/nfsd/xdr4.h | 2 +- > 5 files changed, 79 insertions(+), 28 deletions(-) > > diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c > index e5c0982a381d..d40a0860fcf6 100644 > --- a/fs/nfsd/blocklayout.c > +++ b/fs/nfsd/blocklayout.c > @@ -179,8 +179,10 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, > struct iomap *iomaps; > int nr_iomaps; > > - nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, i_blocksize(inode)); > + nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout, > + lcp->lc_up_len, > + &iomaps, > + i_blocksize(inode)); > if (nr_iomaps < 0) > return nfserrno(nr_iomaps); > > @@ -317,8 +319,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, > struct iomap *iomaps; > int nr_iomaps; > > - nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, > - lcp->lc_up_len, &iomaps, i_blocksize(inode)); > + nr_iomaps = nfsd4_scsi_decode_layoutupdate(&lcp->lc_up_layout, > + lcp->lc_up_len, > + &iomaps, > + i_blocksize(inode)); > if (nr_iomaps < 0) > return nfserrno(nr_iomaps); > > diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c > index 442543304930..e3e3d79c8b4f 100644 > --- a/fs/nfsd/blocklayoutxdr.c > +++ b/fs/nfsd/blocklayoutxdr.c > @@ -103,11 +103,13 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > } > > 
int > -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size) > +nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len, > + struct iomap **iomapp, u32 block_size) > { > + struct xdr_stream xdr; > struct iomap *iomaps; > u32 nr_iomaps, i; > + char scratch[sizeof(struct pnfs_block_extent)]; > > if (len < sizeof(u32)) { > dprintk("%s: extent array too small: %u\n", __func__, len); > @@ -119,7 +121,15 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > return -EINVAL; > } > > - nr_iomaps = be32_to_cpup(p++); > + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); > + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); > + > + if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) { > + dprintk("%s: failed to decode extent array length\n", > + __func__); > + return -EINVAL; > + } > + > if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) { > dprintk("%s: extent array size mismatch: %u/%u\n", > __func__, len, nr_iomaps); > @@ -135,28 +145,51 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > for (i = 0; i < nr_iomaps; i++) { > struct pnfs_block_extent bex; > > - memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); > - p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); > + if (xdr_stream_decode_opaque_fixed(&xdr, &bex.vol_id, sizeof(bex.vol_id)) < > + sizeof(bex.vol_id)) { > + dprintk("%s: failed to decode device id for entry %u\n", > + __func__, i); > + goto fail; > + } > > - p = xdr_decode_hyper(p, &bex.foff); > + if (xdr_stream_decode_u64(&xdr, &bex.foff)) { > + dprintk("%s: failed to decode offset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.foff & (block_size - 1)) { > dprintk("%s: unaligned offset 0x%llx\n", > __func__, bex.foff); > goto fail; > } > - p = xdr_decode_hyper(p, &bex.len); > + > + if (xdr_stream_decode_u64(&xdr, &bex.len)) { > + dprintk("%s: failed to decode length for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.len & 
(block_size - 1)) { > dprintk("%s: unaligned length 0x%llx\n", > __func__, bex.foff); > goto fail; > } > - p = xdr_decode_hyper(p, &bex.soff); > + > + if (xdr_stream_decode_u64(&xdr, &bex.soff)) { > + dprintk("%s: failed to decode soffset for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.soff & (block_size - 1)) { > dprintk("%s: unaligned disk offset 0x%llx\n", > __func__, bex.soff); > goto fail; > } > - bex.es = be32_to_cpup(p++); > + > + if (xdr_stream_decode_u32(&xdr, &bex.es)) { > + dprintk("%s: failed to decode estate for entry %u\n", > + __func__, i); > + goto fail; > + } > if (bex.es != PNFS_BLOCK_READWRITE_DATA) { > dprintk("%s: incorrect extent state %d\n", > __func__, bex.es); > @@ -175,18 +208,27 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > } > > int > -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size) > +nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, u32 len, > + struct iomap **iomapp, u32 block_size) > { > + struct xdr_stream xdr; > struct iomap *iomaps; > u32 nr_iomaps, expected, i; > + char scratch[sizeof(u64)]; > > if (len < sizeof(u32)) { > dprintk("%s: extent array too small: %u\n", __func__, len); > return -EINVAL; > } > > - nr_iomaps = be32_to_cpup(p++); > + xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL); > + xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch)); > + > + if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) { > + dprintk("%s: failed to decode extent array length\n", __func__); > + return -EINVAL; > + } > + > expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; > if (len != expected) { > dprintk("%s: extent array size mismatch: %u/%u\n", > @@ -203,14 +245,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > for (i = 0; i < nr_iomaps; i++) { > u64 val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(&xdr, &val)) { > + dprintk("%s: failed to decode offset for entry %u\n", > + 
__func__, i); > + goto fail; > + } > if (val & (block_size - 1)) { > dprintk("%s: unaligned offset 0x%llx\n", __func__, val); > goto fail; > } > iomaps[i].offset = val; > > - p = xdr_decode_hyper(p, &val); > + if (xdr_stream_decode_u64(&xdr, &val)) { > + dprintk("%s: failed to decode length for entry %u\n", > + __func__, i); > + goto fail; > + } > if (val & (block_size - 1)) { > dprintk("%s: unaligned length 0x%llx\n", __func__, val); > goto fail; > diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h > index bc5166bfe46b..c4c8139b8e96 100644 > --- a/fs/nfsd/blocklayoutxdr.h > +++ b/fs/nfsd/blocklayoutxdr.h > @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, > struct nfsd4_getdeviceinfo *gdp); > __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, > struct nfsd4_layoutget *lgp); > -int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size); > -int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, > - u32 block_size); > +int nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, u32 len, > + struct iomap **iomapp, u32 block_size); > +int nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, u32 len, > + struct iomap **iomapp, u32 block_size); > > #endif /* _NFSD_BLOCKLAYOUTXDR_H */ > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > index 5a93a5db4fb0..81f42dc75b95 100644 > --- a/fs/nfsd/nfs4xdr.c > +++ b/fs/nfsd/nfs4xdr.c > @@ -592,11 +592,8 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, > > if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) > return nfserr_bad_xdr; > - if (lcp->lc_up_len > 0) { > - lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); > - if (!lcp->lc_up_layout) > - return nfserr_bad_xdr; > - } > + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, lcp->lc_up_len)) > + return nfserr_bad_xdr; > > return nfs_ok; > } > diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h > index 
846ab6df9d48..8516a1a6b46d 100644 > --- a/fs/nfsd/xdr4.h > +++ b/fs/nfsd/xdr4.h > @@ -492,7 +492,7 @@ struct nfsd4_layoutcommit { > struct timespec64 lc_mtime; /* request */ > u32 lc_layout_type; /* request */ > u32 lc_up_len; /* layout length */ > - void *lc_up_layout; /* decoded by callback */ > + struct xdr_buf lc_up_layout; /* request, decoded by callback */ > u32 lc_size_chg; /* boolean for response */ > u64 lc_newsize; /* response */ > };

Thanks for the suggestion, Sergey!

Note the MAINTAINERS entry for NFSD:

$ scripts/get_maintainer.pl fs/nfsd/vfs.c
Chuck Lever <chuck.lever@xxxxxxxxxx> (maintainer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS)
Jeff Layton <jlayton@xxxxxxxxxx> (maintainer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS)
Neil Brown <neilb@xxxxxxx> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS)
Olga Kornievskaia <okorniev@xxxxxxxxxx> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS)
Dai Ngo <Dai.Ngo@xxxxxxxxxx> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS)
Tom Talpey <tom@xxxxxxxxxx> (reviewer:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS)
linux-nfs@xxxxxxxxxxxxxxx (open list:KERNEL NFSD, SUNRPC, AND LOCKD SERVERS)
linux-kernel@xxxxxxxxxxxxxxx (open list)
KERNEL NFSD, SUNRPC, AND LOCKD SERVERS status: Supported

In particular, Dai is looking at the Linux NFS server's pNFS with iSCSI implementation right at the moment and might have some thoughts about expanding the number of extents in block layouts.

Can you repost your patch with the current reviewers and maintainers copied as appropriate?

--
Chuck Lever