On Mon, Jun 23, 2025 at 07:21:22PM -0700, Joanne Koong wrote: > From: Christoph Hellwig <hch@xxxxxx> > > Replace ->map_blocks with a new ->writeback_range, which differs in the > following ways: > > - it must also queue up the I/O for writeback, that is called into the > slightly refactored and extended in scope iomap_add_to_ioend for > each region > - can handle only a part of the requested region, that is the retry > loop for partial mappings moves to the caller > - handles cleanup on failures as well, and thus also replaces the > discard_folio method only implemented by XFS. > > This will allow to use the iomap writeback code also for file systems > that are not block based like fuse. Sounds like a good goal. ;) > Co-developed-by: Joanne Koong <joannelkoong@xxxxxxxxx> > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > --- > .../filesystems/iomap/operations.rst | 23 +-- > block/fops.c | 25 ++- > fs/gfs2/bmap.c | 26 +-- > fs/iomap/buffered-io.c | 93 +++++------ > fs/iomap/trace.h | 2 +- > fs/xfs/xfs_aops.c | 154 ++++++++++-------- > fs/zonefs/file.c | 28 ++-- > include/linux/iomap.h | 20 +-- > 8 files changed, 187 insertions(+), 184 deletions(-) > > diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst > index 3b628e370d88..b28f215db6e5 100644 > --- a/Documentation/filesystems/iomap/operations.rst > +++ b/Documentation/filesystems/iomap/operations.rst <snip> > @@ -283,15 +283,14 @@ The ``ops`` structure must be specified and is as follows: > .. code-block:: c > > struct iomap_writeback_ops { > - int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, > - loff_t offset, unsigned len); > - int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); > - void (*discard_folio)(struct folio *folio, loff_t pos); > + int (*writeback_range)(struct iomap_writepage_ctx *wpc, > + struct folio *folio, u64 pos, unsigned int len, u64 end_pos); Why does @pos change from loff_t to u64 here? Are we expecting filesystems that set FOP_UNSIGNED_OFFSET? > + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); Nit: ^^ indenting change here. > }; > > The fields are as follows: > > - - ``map_blocks``: Sets ``wpc->iomap`` to the space mapping of the file > + - ``writeback_range``: Sets ``wpc->iomap`` to the space mapping of the file > range (in bytes) given by ``offset`` and ``len``. > iomap calls this function for each dirty fs block in each dirty folio, > though it will `reuse mappings <snip> > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c > index 11a55da26a6f..80d8acfaa068 100644 > --- a/fs/iomap/buffered-io.c > +++ b/fs/iomap/buffered-io.c > @@ -1748,63 +1764,34 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, > * Note that this defeats the ability to chain the ioends of > * appending writes. > */ > - wpc->ioend->io_size += len; > + wpc->ioend->io_size += map_len; > if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) > wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; > > - wbc_account_cgroup_owner(wpc->wbc, folio, len); > - return 0; > + wbc_account_cgroup_owner(wpc->wbc, folio, map_len); > + return map_len; > } > +EXPORT_SYMBOL_GPL(iomap_add_to_ioend); > > -static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, > - struct folio *folio, u64 pos, u64 end_pos, unsigned dirty_len, > +static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, > + struct folio *folio, u64 pos, u32 rlen, u64 end_pos, > bool *wb_pending) > { > - int error; > - > do { > - unsigned map_len; > - > - error = wpc->ops->map_blocks(wpc, wpc->inode, pos, dirty_len); > - if (error) > - break; > - trace_iomap_writepage_map(wpc->inode, pos, dirty_len, > - &wpc->iomap); > - > - map_len = min_t(u64, dirty_len, > - wpc->iomap.offset + wpc->iomap.length - pos); > - WARN_ON_ONCE(!folio->private && map_len < dirty_len); > + ssize_t ret; > > - switch (wpc->iomap.type) { > - case IOMAP_INLINE: > - WARN_ON_ONCE(1); > - error = -EIO; > - break; > - case IOMAP_HOLE: > - break; > - default: > - error = iomap_add_to_ioend(wpc, folio, pos, end_pos, > - map_len); > - if (!error) > - *wb_pending = true; > - break; > - } > - dirty_len -= map_len; > - pos += map_len; > - } while (dirty_len && !error); > + ret = wpc->ops->writeback_range(wpc, folio, pos, rlen, end_pos); > + if (WARN_ON_ONCE(ret == 0 || ret > rlen)) > + return -EIO; > + if (ret < 0) > + return ret; > + rlen -= ret; > + pos += ret; > + if (wpc->iomap.type != IOMAP_HOLE) > + *wb_pending = true; /me wonders if this should be an outparam of ->writeback_range to signal that it actually added the folio to the writeback ioend chain? Or maybe just a boolean in iomap_writepage_ctx that we clear before calling ->writeback_range and iomap_add_to_ioend can set it as appropriate? > + } while (rlen); > > - /* > - * We cannot cancel the ioend directly here on error. We may have > - * already set other pages under writeback and hence we have to run I/O > - * completion to mark the error state of the pages under writeback > - * appropriately. > - * > - * Just let the file system know what portion of the folio failed to > - * map. > - */ > - if (error && wpc->ops->discard_folio) > - wpc->ops->discard_folio(folio, pos); > - return error; > + return 0; > } > > /* <snip> > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index 65485a52df3b..8157b6d92c8e 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -233,6 +233,47 @@ xfs_end_bio( > spin_unlock_irqrestore(&ip->i_ioend_lock, flags); > } > > +/* > + * We cannot cancel the ioend directly on error. We may have already set other > + * pages under writeback and hence we have to run I/O completion to mark the > + * error state of the pages under writeback appropriately. > + * > + * If the folio has delalloc blocks on it, the caller is asking us to punch them > + * out. If we don't, we can leave a stale delalloc mapping covered by a clean > + * page that needs to be dirtied again before the delalloc mapping can be > + * converted. This stale delalloc mapping can trip up a later direct I/O read > + * operation on the same region. > + * > + * We prevent this by truncating away the delalloc regions on the folio. Because > + * they are delalloc, we can do this without needing a transaction. Indeed - if > + * we get ENOSPC errors, we have to be able to do this truncation without a > + * transaction as there is no space left for block reservation (typically why > + * we see a ENOSPC in writeback). > + */ > +static void > +xfs_discard_folio( Nice, one less indirect call. :) > + struct folio *folio, > + loff_t pos) > +{ > + struct xfs_inode *ip = XFS_I(folio->mapping->host); > + struct xfs_mount *mp = ip->i_mount; > + > + if (xfs_is_shutdown(mp)) > + return; > + > + xfs_alert_ratelimited(mp, > + "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", > + folio, ip->i_ino, pos); > + > + /* > + * The end of the punch range is always the offset of the first > + * byte of the next folio. Hence the end offset is only dependent on the > + * folio itself and not the start offset that is passed in. > + */ > + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, > + folio_pos(folio) + folio_size(folio), NULL); > +} > + > /* > * Fast revalidation of the cached writeback mapping. Return true if the current > * mapping is valid, false otherwise. <snip> > @@ -389,7 +431,12 @@ xfs_map_blocks( > > xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); > trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); > - return 0; > +map_blocks: Should this jump label should be named add_to_ioend or something? We already mapped the blocks. The same applies to the zoned version of this function. --D