CC Heinz, who is a dm-raid expert, but he is on holiday this week.

On Tue, May 27, 2025 at 4:24 PM Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
>
> Hi, Zdenek Kabelac
>
> On 2025/05/27 16:14, Yu Kuai wrote:
> > From: Yu Kuai <yukuai3@xxxxxxxxxx>
> >
> > IO with REQ_RAHEAD or REQ_NOWAIT can fail early, even if the storage
> > medium is fine, hence recording badblocks or removing the disk from
> > the array does not make sense.
> >
> > This problem was found by the lvm2 test lvcreate-large-raid, where
> > dm-zero will fail read-ahead IO directly.
> >
> > Reported-and-tested-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>
> > Closes: https://lore.kernel.org/all/34fa755d-62c8-4588-8ee1-33cb1249bdf2@xxxxxxxxxx/
> > Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
> > ---
> > Changes in v2:
> > - handle REQ_NOWAIT as well.
> >
> >  drivers/md/raid1-10.c | 10 ++++++++++
> >  drivers/md/raid1.c    | 19 ++++++++++---------
> >  drivers/md/raid10.c   | 11 ++++++-----
> >  3 files changed, 26 insertions(+), 14 deletions(-)
> >
>
> Just to let you know that while testing lvcreate-large-raid, the test
> can sometimes fail:
>
> [ 0:12.021] ## DEBUG0: 08:11:43.596775 lvcreate[8576]
> device_mapper/ioctl/libdm-iface.c:2118 device-mapper: create ioctl on
> LVMTEST8371vg1-LV1_rmeta_0
> LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf0yQN83foinyxHaPoZqGEnX4rRUN7i0kH
> failed: Device or resource busy
>
> And with this debug info added in the kernel:
>
> diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
> index 4165fef4c170..04b66b804365 100644
> --- a/drivers/md/dm-ioctl.c
> +++ b/drivers/md/dm-ioctl.c
> @@ -263,6 +263,7 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
>  {
>          struct hash_cell *cell, *hc;
>  
> +        printk("%s: %s %s\n", __func__, name, uuid);
>          /*
>           * Allocate the new cells.
>           */
> @@ -310,6 +311,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc)
>          struct dm_table *table;
>          int srcu_idx;
>  
> +        printk("%s: %s %s\n", __func__, hc->name, hc->uuid);
>          lockdep_assert_held(&_hash_lock);
>  
>          /* remove from the dev trees */
> @@ -882,18 +884,23 @@ static int dev_create(struct file *filp, struct dm_ioctl *param, size_t param_si
>          struct mapped_device *md;
>  
>          r = check_name(param->name);
> -        if (r)
> +        if (r) {
> +                printk("%s: check_name failed %d\n", __func__, r);
>                  return r;
> +        }
>  
>          if (param->flags & DM_PERSISTENT_DEV_FLAG)
>                  m = MINOR(huge_decode_dev(param->dev));
>  
>          r = dm_create(m, &md);
> -        if (r)
> +        if (r) {
> +                printk("%s: dm_create failed %d\n", __func__, r);
>                  return r;
> +        }
>  
>          r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
>          if (r) {
> +                printk("%s: dm_hash_insert failed %d\n", __func__, r);
>                  dm_put(md);
>                  dm_destroy(md);
>                  return r;
>
> I got:
> [ 2265.277214] dm_hash_insert: LVMTEST8371vg1-LV1_rmeta_0
> LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf0yQN83foinyxHaPoZqGEnX4rRUN7i0kH^M
> [ 2265.279666] dm_hash_insert: LVMTEST8371vg1-LV1_rmeta_0
> LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf0yQN83foinyxHaPoZqGEnX4rRUN7i0kH^M
> [ 2265.281043] dev_create: dm_hash_insert failed -16
>
> Looks like the test somehow inserts the same target twice? Is this a
> known issue?
>
> Thanks,
> Kuai
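A note on that error code: -16 is -EBUSY, which dev_create() returns
when dm_hash_insert() finds the requested name or uuid already
registered in its hash. Roughly, as a simplified userspace sketch of
just that duplicate check (illustration only, not the real dm-ioctl
code; name and uuid are copied from the log above):

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define MAX_DEVS 16

static const char *names[MAX_DEVS];
static const char *uuids[MAX_DEVS];
static int ndevs;

/* Model of dm_hash_insert()'s check: -EBUSY if name or uuid exists. */
static int hash_insert(const char *name, const char *uuid)
{
        int i;

        for (i = 0; i < ndevs; i++)
                if (!strcmp(names[i], name) || !strcmp(uuids[i], uuid))
                        return -EBUSY;

        names[ndevs] = name;
        uuids[ndevs] = uuid;
        ndevs++;
        return 0;
}

int main(void)
{
        const char *name = "LVMTEST8371vg1-LV1_rmeta_0";
        const char *uuid = "LVM-iqJjW9HItbME2d4DC2S7D58zd2omecjf"
                           "0yQN83foinyxHaPoZqGEnX4rRUN7i0kH";

        printf("first create:  %d\n", hash_insert(name, uuid)); /* 0 */
        printf("second create: %d\n", hash_insert(name, uuid)); /* -16 */
        return 0;
}

So the two back-to-back dm_hash_insert lines in the log mean the second
create ioctl arrived while the first instance of that name/uuid was
still registered in the hash.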
> > diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
> > index c7efd8aab675..b8b3a9069701 100644
> > --- a/drivers/md/raid1-10.c
> > +++ b/drivers/md/raid1-10.c
> > @@ -293,3 +293,13 @@ static inline bool raid1_should_read_first(struct mddev *mddev,
> >  
> >          return false;
> >  }
> > +
> > +/*
> > + * bio with REQ_RAHEAD or REQ_NOWAIT can fail at any time, before such IO is
> > + * submitted to the underlying disks, hence don't record badblocks or retry
> > + * in this case.
> > + */
> > +static inline bool raid1_should_handle_error(struct bio *bio)
> > +{
> > +        return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT));
> > +}
> > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> > index 657d481525be..19c5a0ce5a40 100644
> > --- a/drivers/md/raid1.c
> > +++ b/drivers/md/raid1.c
> > @@ -373,14 +373,16 @@ static void raid1_end_read_request(struct bio *bio)
> >           */
> >          update_head_pos(r1_bio->read_disk, r1_bio);
> >  
> > -        if (uptodate)
> > +        if (uptodate) {
> >                  set_bit(R1BIO_Uptodate, &r1_bio->state);
> > -        else if (test_bit(FailFast, &rdev->flags) &&
> > -                 test_bit(R1BIO_FailFast, &r1_bio->state))
> > +        } else if (test_bit(FailFast, &rdev->flags) &&
> > +                   test_bit(R1BIO_FailFast, &r1_bio->state)) {
> >                  /* This was a fail-fast read so we definitely
> >                   * want to retry */
> >                  ;
> > -        else {
> > +        } else if (!raid1_should_handle_error(bio)) {
> > +                uptodate = 1;
> > +        } else {
> >                  /* If all other devices have failed, we want to return
> >                   * the error upwards rather than fail the last device.
> >                   * Here we redefine "uptodate" to mean "Don't want to retry"
> > @@ -451,16 +453,15 @@ static void raid1_end_write_request(struct bio *bio)
> >          struct bio *to_put = NULL;
> >          int mirror = find_bio_disk(r1_bio, bio);
> >          struct md_rdev *rdev = conf->mirrors[mirror].rdev;
> > -        bool discard_error;
> >          sector_t lo = r1_bio->sector;
> >          sector_t hi = r1_bio->sector + r1_bio->sectors;
> > -
> > -        discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
> > +        bool ignore_error = !raid1_should_handle_error(bio) ||
> > +                            (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
> >  
> >          /*
> >           * 'one mirror IO has finished' event handler:
> >           */
> > -        if (bio->bi_status && !discard_error) {
> > +        if (bio->bi_status && !ignore_error) {
> >                  set_bit(WriteErrorSeen, &rdev->flags);
> >                  if (!test_and_set_bit(WantReplacement, &rdev->flags))
> >                          set_bit(MD_RECOVERY_NEEDED, &
> > @@ -511,7 +512,7 @@ static void raid1_end_write_request(struct bio *bio)
> >  
> >          /* Maybe we can clear some bad blocks. */
> >          if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
> > -            !discard_error) {
> > +            !ignore_error) {
> >                  r1_bio->bios[mirror] = IO_MADE_GOOD;
> >                  set_bit(R1BIO_MadeGood, &r1_bio->state);
> >          }
> > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> > index dce06bf65016..b74780af4c22 100644
> > --- a/drivers/md/raid10.c
> > +++ b/drivers/md/raid10.c
> > @@ -399,6 +399,8 @@ static void raid10_end_read_request(struct bio *bio)
> >                   * wait for the 'master' bio.
> >                   */
> >                  set_bit(R10BIO_Uptodate, &r10_bio->state);
> > +        } else if (!raid1_should_handle_error(bio)) {
> > +                uptodate = 1;
> >          } else {
> >                  /* If all other devices that store this block have
> >                   * failed, we want to return the error upwards rather
> > @@ -456,9 +458,8 @@ static void raid10_end_write_request(struct bio *bio)
> >          int slot, repl;
> >          struct md_rdev *rdev = NULL;
> >          struct bio *to_put = NULL;
> > -        bool discard_error;
> > -
> > -        discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
> > +        bool ignore_error = !raid1_should_handle_error(bio) ||
> > +                            (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
> >  
> >          dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
> >  
> > @@ -472,7 +473,7 @@ static void raid10_end_write_request(struct bio *bio)
> >          /*
> >           * this branch is our 'one mirror IO has finished' event handler:
> >           */
> > -        if (bio->bi_status && !discard_error) {
> > +        if (bio->bi_status && !ignore_error) {
> >                  if (repl)
> >                          /* Never record new bad blocks to replacement,
> >                           * just fail it.
> > @@ -527,7 +528,7 @@ static void raid10_end_write_request(struct bio *bio)
> >          /* Maybe we can clear some bad blocks. */
> >          if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
> >                                r10_bio->sectors) &&
> > -            !discard_error) {
> > +            !ignore_error) {
> >                  bio_put(bio);
> >                  if (repl)
> >                          r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
> >
> >
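As background on why these errors must not be handled: a REQ_NOWAIT bio
is completed with BLK_STS_AGAIN whenever serving it would have to
block, and read-ahead is best-effort by definition, so neither failure
says anything about the medium. The userspace counterpart of REQ_NOWAIT
is preadv2(2) with RWF_NOWAIT; a small illustrative program (not part
of the patch, and the file path is arbitrary):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        char buf[4096];
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        ssize_t ret;
        int fd;

        fd = open(argc > 1 ? argv[1] : "/tmp/testfile", O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Drop the page cache for this file so the read cannot be
         * served immediately and RWF_NOWAIT has something to refuse. */
        posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);

        ret = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
        if (ret < 0 && errno == EAGAIN)
                printf("EAGAIN: read would block, the medium is fine\n");
        else if (ret < 0)
                perror("preadv2");
        else
                printf("read %zd bytes from cache\n", ret);

        close(fd);
        return 0;
}

The raid1/raid10 end_request handlers see the same kind of failure as a
bio that errored before it ever reached the member disks, which is
exactly the case raid1_should_handle_error() filters out so that no
badblocks are recorded and no disk is kicked from the array.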