md-raid currently sets io_min and io_opt to the RAID chunk and stripe sizes and then calls queue_limits_stack_bdev() to combine the io_min and io_opt values with those of the component devices. The io_opt size is notably combined using the least common multiple (lcm), which does not work well in practice for some drives (1), resulting in overflow or unreasonable values. dm-raid, on the other hand, sets io_min and io_opt through the raid_io_hints() function, which is called after stacking all the queue limits of the component drives, so the RAID chunk and stripe sizes override the values produced by the stacking. Change md-raid to be more like dm-raid by setting io_min and io_opt to the RAID chunk and stripe sizes after stacking the queue limits of the component devices. This changes /sys/block/md0/queue/optimal_io_size from a bogus value like 3221127168 to the correct RAID stripe size. (1) SATA disks attached to mpt3sas report io_opt = 16776704, or 2^24 - 512. See also commit 9c0ba14828d6 ("blk-settings: round down io_opt to physical_block_size"). 
Signed-off-by: Tony Battersby <tonyb@xxxxxxxxxxxxxxx> --- drivers/md/md.c | 15 +++++++++++++++ drivers/md/raid0.c | 4 ++-- drivers/md/raid10.c | 4 ++-- drivers/md/raid5.c | 4 ++-- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0f03b21e66e4..decf593d3bd7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5837,11 +5837,15 @@ EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) { struct queue_limits lim; + unsigned int io_min; + unsigned int io_opt; if (mddev_is_dm(mddev)) return 0; lim = queue_limits_start_update(mddev->gendisk->queue); + io_min = lim.io_min; + io_opt = lim.io_opt; queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); @@ -5851,6 +5855,17 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) queue_limits_cancel_update(mddev->gendisk->queue); return -ENXIO; } + switch (mddev->level) { + case 0: + case 4: + case 5: + case 6: + case 10: + /* Preserve original chunk size and stripe size. 
*/ + lim.io_min = io_min; + lim.io_opt = io_opt; + break; + } return queue_limits_commit_update(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d8f639f4ae12..657e66e92e14 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -382,12 +382,12 @@ static int raid0_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; - lim.io_min = mddev->chunk_sectors << 9; - lim.io_opt = lim.io_min * mddev->raid_disks; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) return err; + lim.io_min = mddev->chunk_sectors << 9; + lim.io_opt = lim.io_min * mddev->raid_disks; return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c9bd2005bfd0..ea5147531ceb 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4011,12 +4011,12 @@ static int raid10_set_queue_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; - lim.io_min = mddev->chunk_sectors << 9; - lim.io_opt = lim.io_min * raid10_nr_stripes(conf); lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) return err; + lim.io_min = mddev->chunk_sectors << 9; + lim.io_opt = lim.io_min * raid10_nr_stripes(conf); return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ca5b0e8ba707..bba647c38cff 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7727,8 +7727,6 @@ static int raid5_set_limits(struct mddev *mddev) stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); md_init_stacking_limits(&lim); - lim.io_min = mddev->chunk_sectors << 9; - lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; 
lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; @@ -7736,6 +7734,8 @@ static int raid5_set_limits(struct mddev *mddev) rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, mddev->gendisk->disk_name); + lim.io_min = mddev->chunk_sectors << 9; + lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); /* * Zeroing is required for discard, otherwise data could be lost. base-commit: 038d61fd642278bab63ee8ef722c50d10ab01e8f -- 2.43.0