Improve writeback performance to RAID-4/5/6 by aligning writes to stripe
boundaries. This relies on io_opt being set to the stripe size (or a
multiple) when BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE is set.

Benchmark of sequential writing to a large file on XFS using io_uring
with 8-disk md-raid6:
Before:      601.0 MB/s
After:       614.5 MB/s
Improvement: +2.3%

Signed-off-by: Tony Battersby <tonyb@xxxxxxxxxxxxxxx>
---
 fs/iomap/buffered-io.c | 175 +++++++++++++++++++++++++----------------
 1 file changed, 106 insertions(+), 69 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index fb4519158f3a..f9020f916268 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1685,81 +1685,118 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t pos, loff_t end_pos, unsigned len) { - struct iomap_folio_state *ifs = folio->private; - size_t poff = offset_in_folio(folio, pos); - unsigned int ioend_flags = 0; - int error; - - if (wpc->iomap.type == IOMAP_UNWRITTEN) - ioend_flags |= IOMAP_IOEND_UNWRITTEN; - if (wpc->iomap.flags & IOMAP_F_SHARED) - ioend_flags |= IOMAP_IOEND_SHARED; - if (folio_test_dropbehind(folio)) - ioend_flags |= IOMAP_IOEND_DONTCACHE; - if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) - ioend_flags |= IOMAP_IOEND_BOUNDARY; + struct queue_limits *lim = bdev_limits(wpc->iomap.bdev); + unsigned int io_align = + (lim->features & BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) ?
+ lim->io_opt >> SECTOR_SHIFT : 0; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { + do { + struct iomap_folio_state *ifs = folio->private; + size_t poff = offset_in_folio(folio, pos); + unsigned int ioend_flags = 0; + unsigned int rem_len = 0; + int error; + + if (wpc->iomap.type == IOMAP_UNWRITTEN) + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + if (wpc->iomap.flags & IOMAP_F_SHARED) + ioend_flags |= IOMAP_IOEND_SHARED; + if (folio_test_dropbehind(folio)) + ioend_flags |= IOMAP_IOEND_DONTCACHE; + if (pos == wpc->iomap.offset && + (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + ioend_flags |= IOMAP_IOEND_BOUNDARY; + + if (!wpc->ioend || + !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { new_ioend: - error = iomap_submit_ioend(wpc, 0); - if (error) - return error; - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, - ioend_flags); - } + error = iomap_submit_ioend(wpc, 0); + if (error) + return error; + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, + ioend_flags); + } - if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) - goto new_ioend; + /* Align writes to io_align if given. */ + if (io_align && !(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) { + sector_t lba = bio_end_sector(&wpc->ioend->io_bio); + unsigned int mod = lba % io_align; + unsigned int max_len; - if (ifs) - atomic_add(len, &ifs->write_bytes_pending); + /* + * If the end sector is already aligned and the bio is + * nonempty, then start a new bio for the remainder. + */ + if (!mod && wpc->ioend->io_bio.bi_iter.bi_size) + goto new_ioend; - /* - * Clamp io_offset and io_size to the incore EOF so that ondisk - * file size updates in the ioend completion are byte-accurate. 
- * This avoids recovering files with zeroed tail regions when - * writeback races with appending writes: - * - * Thread 1: Thread 2: - * ------------ ----------- - * write [A, A+B] - * update inode size to A+B - * submit I/O [A, A+BS] - * write [A+B, A+B+C] - * update inode size to A+B+C - * <I/O completes, updates disk size to min(A+B+C, A+BS)> - * <power failure> - * - * After reboot: - * 1) with A+B+C < A+BS, the file has zero padding in range - * [A+B, A+B+C] - * - * |< Block Size (BS) >| - * |DDDDDDDDDDDD0000000000000| - * ^ ^ ^ - * A A+B A+B+C - * (EOF) - * - * 2) with A+B+C > A+BS, the file has zero padding in range - * [A+B, A+BS] - * - * |< Block Size (BS) >|< Block Size (BS) >| - * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| - * ^ ^ ^ ^ - * A A+B A+BS A+B+C - * (EOF) - * - * D = Valid Data - * 0 = Zero Padding - * - * Note that this defeats the ability to chain the ioends of - * appending writes. - */ - wpc->ioend->io_size += len; - if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) - wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; + /* + * Clip the end of the bio to the alignment boundary. + */ + max_len = (io_align - mod) << SECTOR_SHIFT; + if (len > max_len) { + rem_len = len - max_len; + len = max_len; + } + } + + if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) + goto new_ioend; + + if (ifs) + atomic_add(len, &ifs->write_bytes_pending); + + /* + * Clamp io_offset and io_size to the incore EOF so that ondisk + * file size updates in the ioend completion are byte-accurate. 
+ * This avoids recovering files with zeroed tail regions when + * writeback races with appending writes: + * + * Thread 1: Thread 2: + * ------------ ----------- + * write [A, A+B] + * update inode size to A+B + * submit I/O [A, A+BS] + * write [A+B, A+B+C] + * update inode size to A+B+C + * <I/O completes, updates disk size to min(A+B+C, A+BS)> + * <power failure> + * + * After reboot: + * 1) with A+B+C < A+BS, the file has zero padding in range + * [A+B, A+B+C] + * + * |< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000| + * ^ ^ ^ + * A A+B A+B+C + * (EOF) + * + * 2) with A+B+C > A+BS, the file has zero padding in range + * [A+B, A+BS] + * + * |< Block Size (BS) >|< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| + * ^ ^ ^ ^ + * A A+B A+BS A+B+C + * (EOF) + * + * D = Valid Data + * 0 = Zero Padding + * + * Note that this defeats the ability to chain the ioends of + * appending writes. + */ + wpc->ioend->io_size += len; + if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) + wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; + + wbc_account_cgroup_owner(wbc, folio, len); + + pos += len; + len = rem_len; + } while (len); - wbc_account_cgroup_owner(wbc, folio, len); return 0; } -- 2.43.0