[PATCH 1/2] md/raid0,raid4,raid5,raid6,raid10: fix bogus io_opt value

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



md-raid currently sets io_min and io_opt to the RAID chunk and stripe
sizes and then calls queue_limits_stack_bdev() to combine the io_min and
io_opt values with those of the component devices.  The io_opt size is
notably combined using the least common multiple (lcm), which does not
work well in practice for some drives (1), resulting in overflow or
unreasonable values.

dm-raid, on the other hand, sets io_min and io_opt through the
raid_io_hints() function, which is called after stacking all the queue
limits of the component drives, so the RAID chunk and stripe sizes
override the values of the stacking.

Change md-raid to be more like dm-raid by setting io_min and io_opt to
the RAID chunk and stripe sizes after stacking the queue limits of the
component devies.  This fixes /sys/block/md0/queue/optimal_io_size from
being a bogus value like 3221127168 to being the correct RAID stripe
size.

(1) SATA disks attached to mpt3sas report io_opt = 16776704, or
2^24 - 512.  See also commit 9c0ba14828d6 ("blk-settings: round down
io_opt to physical_block_size").

Signed-off-by: Tony Battersby <tonyb@xxxxxxxxxxxxxxx>
---
 drivers/md/md.c     | 15 +++++++++++++++
 drivers/md/raid0.c  |  4 ++--
 drivers/md/raid10.c |  4 ++--
 drivers/md/raid5.c  |  4 ++--
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0f03b21e66e4..decf593d3bd7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5837,11 +5837,15 @@ EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct queue_limits lim;
+	unsigned int io_min;
+	unsigned int io_opt;
 
 	if (mddev_is_dm(mddev))
 		return 0;
 
 	lim = queue_limits_start_update(mddev->gendisk->queue);
+	io_min = lim.io_min;
+	io_opt = lim.io_opt;
 	queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
 				mddev->gendisk->disk_name);
 
@@ -5851,6 +5855,17 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
 		queue_limits_cancel_update(mddev->gendisk->queue);
 		return -ENXIO;
 	}
+	switch (mddev->level) {
+	case 0:
+	case 4:
+	case 5:
+	case 6:
+	case 10:
+		/* Preserve original chunk size and stripe size. */
+		lim.io_min = io_min;
+		lim.io_opt = io_opt;
+		break;
+	}
 
 	return queue_limits_commit_update(mddev->gendisk->queue, &lim);
 }
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d8f639f4ae12..657e66e92e14 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -382,12 +382,12 @@ static int raid0_set_limits(struct mddev *mddev)
 	md_init_stacking_limits(&lim);
 	lim.max_hw_sectors = mddev->chunk_sectors;
 	lim.max_write_zeroes_sectors = mddev->chunk_sectors;
-	lim.io_min = mddev->chunk_sectors << 9;
-	lim.io_opt = lim.io_min * mddev->raid_disks;
 	lim.features |= BLK_FEAT_ATOMIC_WRITES;
 	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
 	if (err)
 		return err;
+	lim.io_min = mddev->chunk_sectors << 9;
+	lim.io_opt = lim.io_min * mddev->raid_disks;
 	return queue_limits_set(mddev->gendisk->queue, &lim);
 }
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c9bd2005bfd0..ea5147531ceb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4011,12 +4011,12 @@ static int raid10_set_queue_limits(struct mddev *mddev)
 
 	md_init_stacking_limits(&lim);
 	lim.max_write_zeroes_sectors = 0;
-	lim.io_min = mddev->chunk_sectors << 9;
-	lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
 	lim.features |= BLK_FEAT_ATOMIC_WRITES;
 	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
 	if (err)
 		return err;
+	lim.io_min = mddev->chunk_sectors << 9;
+	lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
 	return queue_limits_set(mddev->gendisk->queue, &lim);
 }
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ca5b0e8ba707..bba647c38cff 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7727,8 +7727,6 @@ static int raid5_set_limits(struct mddev *mddev)
 	stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
 
 	md_init_stacking_limits(&lim);
-	lim.io_min = mddev->chunk_sectors << 9;
-	lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
 	lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE;
 	lim.discard_granularity = stripe;
 	lim.max_write_zeroes_sectors = 0;
@@ -7736,6 +7734,8 @@ static int raid5_set_limits(struct mddev *mddev)
 	rdev_for_each(rdev, mddev)
 		queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
 				mddev->gendisk->disk_name);
+	lim.io_min = mddev->chunk_sectors << 9;
+	lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
 
 	/*
 	 * Zeroing is required for discard, otherwise data could be lost.

base-commit: 038d61fd642278bab63ee8ef722c50d10ab01e8f
-- 
2.43.0





[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux