iomap_file_buffered_write has its own private read path for reading in
folios that are only partially overwritten.  This not only adds extra
code, but also causes extra problems when e.g. we want reads to go
through a file system method to support checksums or RAID, or when we
want to support non-block based file systems.

Switch to using ->read_folio instead, which has a few up- and
downsides:

 - ->read_folio always reads the entire folio and not just the start
   and the tail that are not being overwritten.  Historically this was
   seen as a downside because it reads more data than needed, but with
   modern file systems and modern storage devices it is probably a
   benefit.  If the folio is stored contiguously on disk, the single
   read will be more efficient than two small reads on almost all
   current hardware.  If the folio is backed by two blocks, we at least
   pipeline the two reads instead of doing two synchronous ones.  And
   if the file system fragmented the folio so badly that we now need
   more than two reads, they are still pipelined, although that should
   basically never happen with modern file systems.

 - ->read_folio unlocks the folio on completion.  This adds extra
   atomics to the write fast path, but the actual signaling by taking
   the folio lock again after ->read_folio is not any slower than the
   completion wakeup.  We just have to recheck the mapping in this case
   to lock out truncates and other mapping manipulations.

 - ->read_folio starts another, nested, iomap iteration, with an extra
   lookup of the extent at the current file position.  For in-place
   update file systems this is extra work, although if they use a good
   data structure like the xfs iext btree there is very little overhead
   in the additional lookup.  For file systems that write out of place
   this actually implements the desired semantics, as they do not care
   about the existing data for the write iteration at all, although
   untangling this and removing the srcmap member in the iomap_iter
   will require additional work to turn the block zeroing and unshare
   helpers upside down.

Signed-off-by: Christoph Hellwig <hch@xxxxxx>
---
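Notes for reviewers, not intended for the commit message:

With this change the read for a partially overwritten folio goes
through the file system's ->read_folio address_space operation, which
for iomap users is typically a thin wrapper around iomap_read_folio().
A minimal sketch of such a wrapper; "example_iomap_ops" and the
"example_" names stand in for the file system's real structures:

static int example_read_folio(struct file *file, struct folio *folio)
{
	/* example only: hand the read off to the iomap iterator */
	return iomap_read_folio(folio, &example_iomap_ops);
}

static const struct address_space_operations example_aops = {
	.read_folio	= example_read_folio,
	/* other operations elided */
};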
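The revalidation done by the new iomap_validate() helper relies on the
file system's ->iomap_valid hook.  A rough sketch of such a hook,
loosely modeled on the sequence cookie scheme XFS uses: the file system
stamps a cookie into the iomap when the mapping is looked up and fails
revalidation if the extent map has changed since.  "example_inode" and
"ei_mapping_seq" are made-up names:

static bool example_iomap_valid(struct inode *inode,
		const struct iomap *iomap)
{
	struct example_inode *ei = EXAMPLE_I(inode);

	/*
	 * ei_mapping_seq is a hypothetical per-inode counter bumped on
	 * every extent map change; ->iomap_begin would have sampled it
	 * into validity_cookie.
	 */
	return iomap->validity_cookie == READ_ONCE(ei->ei_mapping_seq);
}

static const struct iomap_folio_ops example_folio_ops = {
	.iomap_valid	= example_iomap_valid,
};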
 fs/iomap/buffered-io.c | 116 ++++++++++++++++-------------------------
 1 file changed, 45 insertions(+), 71 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3729391a18f3..52b4040208dd 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -667,30 +667,34 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 			pos + len - 1);
 }
 
-static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
-		size_t poff, size_t plen, const struct iomap *iomap)
+/*
+ * Now that we have a locked folio, check that the iomap we have cached is not
+ * stale before we do anything.
+ *
+ * The extent mapping can change due to concurrent IO in flight, e.g. the
+ * IOMAP_UNWRITTEN state can change and memory reclaim could have reclaimed a
+ * previously partially written page at this index after IO completion before
+ * this write reaches this file offset, and hence we could do the wrong thing
+ * here (zero a page range incorrectly or fail to zero) and corrupt data.
+ */
+static bool iomap_validate(struct iomap_iter *iter)
 {
-	struct bio_vec bvec;
-	struct bio bio;
+	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
 
-	bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
-	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
-	bio_add_folio_nofail(&bio, folio, plen, poff);
-	return submit_bio_wait(&bio);
+	if (folio_ops && folio_ops->iomap_valid &&
+	    !folio_ops->iomap_valid(iter->inode, &iter->iomap)) {
+		iter->iomap.flags |= IOMAP_F_STALE;
+		return false;
+	}
+
+	return true;
 }
 
-static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
+static int __iomap_write_begin(struct iomap_iter *iter, size_t len,
 		struct folio *folio)
 {
-	const struct iomap *srcmap = iomap_iter_srcmap(iter);
+	struct inode *inode = iter->inode;
 	struct iomap_folio_state *ifs;
-	loff_t pos = iter->pos;
-	loff_t block_size = i_blocksize(iter->inode);
-	loff_t block_start = round_down(pos, block_size);
-	loff_t block_end = round_up(pos + len, block_size);
-	unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
-	size_t from = offset_in_folio(folio, pos), to = from + len;
-	size_t poff, plen;
 
 	/*
 	 * If the write or zeroing completely overlaps the current folio, then
@@ -699,45 +703,29 @@ static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
 	 * For the unshare case, we must read in the ondisk contents because we
 	 * are not changing pagecache contents.
 	 */
-	if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
-	    pos + len >= folio_pos(folio) + folio_size(folio))
+	if (!(iter->flags & IOMAP_UNSHARE) &&
+	    iter->pos <= folio_pos(folio) &&
+	    iter->pos + len >= folio_pos(folio) + folio_size(folio))
 		return 0;
 
-	ifs = ifs_alloc(iter->inode, folio, iter->flags);
-	if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1)
+	ifs = ifs_alloc(inode, folio, iter->flags);
+	if ((iter->flags & IOMAP_NOWAIT) && !ifs &&
+	    i_blocks_per_folio(inode, folio) > 1)
 		return -EAGAIN;
 
-	if (folio_test_uptodate(folio))
-		return 0;
-
-	do {
-		iomap_adjust_read_range(iter->inode, folio, &block_start,
-				block_end - block_start, &poff, &plen);
-		if (plen == 0)
-			break;
+	if (!folio_test_uptodate(folio)) {
+		inode->i_mapping->a_ops->read_folio(NULL, folio);
 
-		if (!(iter->flags & IOMAP_UNSHARE) &&
-		    (from <= poff || from >= poff + plen) &&
-		    (to <= poff || to >= poff + plen))
-			continue;
-
-		if (iomap_block_needs_zeroing(iter, block_start)) {
-			if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
-				return -EIO;
-			folio_zero_segments(folio, poff, from, to, poff + plen);
-		} else {
-			int status;
-
-			if (iter->flags & IOMAP_NOWAIT)
-				return -EAGAIN;
-
-			status = iomap_read_folio_sync(block_start, folio,
-					poff, plen, srcmap);
-			if (status)
-				return status;
-		}
-		iomap_set_range_uptodate(folio, poff, plen);
-	} while ((block_start += plen) < block_end);
+		/*
+		 * ->read_folio unlocks the folio. Relock and revalidate the
+		 * folio.
+		 */
+		folio_lock(folio);
+		if (unlikely(folio->mapping != inode->i_mapping))
+			return 1;
+		if (unlikely(!iomap_validate(iter)))
+			return 1;
+	}
 
 	return 0;
 }
@@ -803,7 +791,6 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
 static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
 		size_t *poffset, u64 *plen)
 {
-	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
 	loff_t pos = iter->pos;
 	u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
@@ -818,28 +805,14 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
 	if (fatal_signal_pending(current))
 		return -EINTR;
 
+lookup_again:
 	folio = __iomap_get_folio(iter, len);
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
 
-	/*
-	 * Now we have a locked folio, before we do anything with it we need to
-	 * check that the iomap we have cached is not stale. The inode extent
-	 * mapping can change due to concurrent IO in flight (e.g.
-	 * IOMAP_UNWRITTEN state can change and memory reclaim could have
-	 * reclaimed a previously partially written page at this index after IO
-	 * completion before this write reaches this file offset) and hence we
-	 * could do the wrong thing here (zero a page range incorrectly or fail
-	 * to zero) and corrupt data.
-	 */
-	if (folio_ops && folio_ops->iomap_valid) {
-		bool iomap_valid = folio_ops->iomap_valid(iter->inode,
-						&iter->iomap);
-		if (!iomap_valid) {
-			iter->iomap.flags |= IOMAP_F_STALE;
-			status = 0;
-			goto out_unlock;
-		}
+	if (unlikely(!iomap_validate(iter))) {
+		status = 0;
+		goto out_unlock;
 	}
 
 	pos = iomap_trim_folio_range(iter, folio, poffset, &len);
@@ -860,7 +833,8 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
 
 out_unlock:
 	__iomap_put_folio(iter, 0, folio);
-
+	if (status == 1)
+		goto lookup_again;
 	return status;
 }
-- 
2.47.2