Re: [PATCH 03/11] iomap: refactor the writeback interface

Joanne Koong <joannelkoong@xxxxxxxxx> · Tue, 17 Jun 2025 11:33:31 -0700

On Tue, Jun 17, 2025 at 3:55 AM Christoph Hellwig <hch@xxxxxx> wrote:
>
> Replace ->map_blocks with a new ->writeback_range, which differs in the
> following ways:
>
>  - it must also queue up the I/O for writeback, that is called into the
>    slightly refactored and extended in scope iomap_add_to_ioend for
>    each region
>  - can handle only a part of the requested region, that is the retry
>    loop for partial mappings moves to the caller
>  - handles cleanup on failures as well, and thus also replaces the
>    discard_folio method only implemented by XFS.
>
> This will allow to use the iomap writeback code also for file systems
> that are not block based like fuse.
>
> Co-developed-by: Joanne Koong <joannelkoong@xxxxxxxxx>
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> ---
>  .../filesystems/iomap/operations.rst          |  23 +--
>  block/fops.c                                  |  25 ++-
>  fs/gfs2/bmap.c                                |  26 +--
>  fs/iomap/buffered-io.c                        |  93 +++++------
>  fs/iomap/trace.h                              |   2 +-
>  fs/xfs/xfs_aops.c                             | 154 ++++++++++--------
>  fs/zonefs/file.c                              |  28 ++--
>  include/linux/iomap.h                         |  20 +--
>  8 files changed, 187 insertions(+), 184 deletions(-)
>
> diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
> index 3b628e370d88..b28f215db6e5 100644
> --- a/Documentation/filesystems/iomap/operations.rst
> +++ b/Documentation/filesystems/iomap/operations.rst
> @@ -271,7 +271,7 @@ writeback.
>  It does not lock ``i_rwsem`` or ``invalidate_lock``.
>
>  The dirty bit will be cleared for all folios run through the
> -``->map_blocks`` machinery described below even if the writeback fails.
> +``->writeback_range`` machinery described below even if the writeback fails.
>  This is to prevent dirty folio clots when storage devices fail; an
>  ``-EIO`` is recorded for userspace to collect via ``fsync``.
>
> @@ -283,15 +283,14 @@ The ``ops`` structure must be specified and is as follows:
>  .. code-block:: c
>
>   struct iomap_writeback_ops {
> -     int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
> -                       loff_t offset, unsigned len);
> -     int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
> -     void (*discard_folio)(struct folio *folio, loff_t pos);
> +    int (*writeback_range)(struct iomap_writepage_ctx *wpc,
> +               struct folio *folio, u64 pos, unsigned int len, u64 end_pos);

end_pos only gets used in iomap_add_to_ioend() but it looks like
end_pos can be deduced there by doing something like "end_pos =
min(folio_pos(folio) + folio_size(folio), i_size_read(wpc->inode))".
Would it be cleaner for ->writeback_range() to just pass in pos and
len instead of also passing in end_pos? I find the end_pos arg kind of
confusing anyways, like I think most people would think end_pos is the
end of the dirty range (eg pos + len), not the end position of the
folio.

> +    int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
>   };
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 65485a52df3b..8157b6d92c8e 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> -static int
> -xfs_map_blocks(
> +static ssize_t
> +xfs_writeback_range(
>         struct iomap_writepage_ctx *wpc,
> -       struct inode            *inode,
> -       loff_t                  offset,
> -       unsigned int            len)
> +       struct folio            *folio,
> +       u64                     offset,
> +       unsigned int            len,
> +       u64                     end_pos)
>  {
> -       struct xfs_inode        *ip = XFS_I(inode);
> +       struct xfs_inode        *ip = XFS_I(wpc->inode);
>         struct xfs_mount        *mp = ip->i_mount;
> -       ssize_t                 count = i_blocksize(inode);
> +       ssize_t                 count = i_blocksize(wpc->inode);
>         xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
>         xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
>         xfs_fileoff_t           cow_fsb;
> @@ -292,7 +334,7 @@ xfs_map_blocks(
>         struct xfs_bmbt_irec    imap;
>         struct xfs_iext_cursor  icur;
>         int                     retries = 0;
> -       int                     error = 0;
> +       ssize_t                 ret = 0;
>         unsigned int            *seq;
>
>         if (xfs_is_shutdown(mp))
> @@ -316,7 +358,7 @@ xfs_map_blocks(
>          * out that ensures that we always see the current value.
>          */
>         if (xfs_imap_valid(wpc, ip, offset))
> -               return 0;
> +               goto map_blocks;
>
>         /*
>          * If we don't have a valid map, now it's time to get a new one for this
> @@ -351,7 +393,7 @@ xfs_map_blocks(
>          */
>         if (xfs_imap_valid(wpc, ip, offset)) {
>                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
> -               return 0;
> +               goto map_blocks;
>         }
>
>         /*
> @@ -389,7 +431,12 @@ xfs_map_blocks(
>
>         xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
>         trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
> -       return 0;
> +map_blocks:

nit: should this be called map_blocks or changed to something like
"add_to_ioend"? afaict, the mapping has already been done by this
point?

> +       ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
> +       if (ret < 0)
> +               goto out_error;
> +       return ret;
> +
>  allocate_blocks:
>         /*
>          * Convert a dellalloc extent to a real one. The current page is held
> @@ -402,9 +449,9 @@ xfs_map_blocks(
>         else
>
> -static int
> -xfs_zoned_map_blocks(
> +static ssize_t
> +xfs_zoned_writeback_range(
>         struct iomap_writepage_ctx *wpc,
> -       struct inode            *inode,
> -       loff_t                  offset,
> -       unsigned int            len)
> +       struct folio            *folio,
> +       u64                     offset,
> +       unsigned int            len,
> +       u64                     end_pos)
>  {
> -       struct xfs_inode        *ip = XFS_I(inode);
> +       struct xfs_inode        *ip = XFS_I(wpc->inode);
>         struct xfs_mount        *mp = ip->i_mount;
>         xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
>         xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + len);
>         xfs_filblks_t           count_fsb;
>         struct xfs_bmbt_irec    imap, del;
>         struct xfs_iext_cursor  icur;
> +       ssize_t                 ret;
>
>         if (xfs_is_shutdown(mp))
>                 return -EIO;
> @@ -586,7 +601,7 @@ xfs_zoned_map_blocks(
>                 imap.br_state = XFS_EXT_NORM;
>                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
>                 xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
> -               return 0;
> +               goto map_blocks;
>         }
>         end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
>         count_fsb = end_fsb - offset_fsb;
> @@ -603,9 +618,13 @@ xfs_zoned_map_blocks(
>         wpc->iomap.offset = offset;
>         wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
>         wpc->iomap.flags = IOMAP_F_ANON_WRITE;
> -
>         trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
> -       return 0;
> +
> +map_blocks:

Same question here

> +       ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
> +       if (ret < 0)
> +               xfs_discard_folio(folio, offset);
> +       return ret;
>  }