Hi Kuai Appreciate your thoughts on V3. Thanks in advance. On 8/19/25 10:38, Meir Elisha wrote: > When operating in write-through journal mode, a journal device failure > can lead to parity corruption and silent data loss. > This occurs because the current implementation continues to update > parity even when journal writes fail, violating the write-through > consistency guarantee. > > Signed-off-by: Meir Elisha <meir.elisha@xxxxxxxxxxx> > --- > Changes in v3: > - Remove unnecessary braces in ops_run_io() > - Adding comment on clearing the R5_Want* flags > Changes in v2: > - Drop writes only when s->failed > conf->max_degraded > - Remove changes in handle_stripe() > > Wrote a script for showcasing the issue. > > #!/bin/bash > > set -e > > RAID_DEV="/dev/md127" > DATA_OFFSET=32 > > # Arrays to store disk states > declare -a BEFORE_BYTES > declare -a DISK_NAMES > > cleanup() { > mdadm --stop $RAID_DEV 2>/dev/null || true > dmsetup remove disk0-flakey disk1-flakey journal-flakey 2>/dev/null || true > for i in {10..15}; do > losetup -d /dev/loop$i 2>/dev/null || true > done > rm -f /tmp/disk*.img 2>/dev/null || true > } > > # Function to read first byte from device at offset > read_first_byte() { > local device=$1 > local offset=$2 > dd if=$device bs=32k skip=$offset count=4 iflag=direct status=none | head -c 1 | xxd -p > } > > # Function to calculate which disk holds parity for a given stripe > # RAID5 left-symmetric algorithm (default) > get_parity_disk() { > local stripe_num=$1 > local n_disks=$2 > local pd_idx=$((($n_disks - 1) - ($stripe_num % $n_disks))) > echo $pd_idx > } > > cleanup > echo "=== RAID5 Parity Bug Test ===" > echo > > # Create backing files > for i in {0..5}; do > dd if=/dev/zero of=/tmp/disk$i.img bs=1M count=100 status=none > losetup /dev/loop$((10+i)) /tmp/disk$i.img > done > > SIZE=$(blockdev --getsz /dev/loop10) > > # Create normal linear targets first > dmsetup create disk0-flakey --table "0 $SIZE linear /dev/loop10 0" > dmsetup create disk1-flakey --table "0 $SIZE linear /dev/loop11 0" > dmsetup create journal-flakey --table "0 $SIZE linear /dev/loop15 0" > > # Create RAID5 using the dm devices > echo "1. Creating RAID5 array..." > mdadm --create $RAID_DEV --chunk=32K --level=5 --raid-devices=5 \ > /dev/mapper/disk0-flakey \ > /dev/mapper/disk1-flakey \ > /dev/loop12 /dev/loop13 /dev/loop14 \ > --write-journal /dev/mapper/journal-flakey \ > --assume-clean --force > > echo "write-through" > /sys/block/md127/md/journal_mode > echo 0 > /sys/block/md127/md/safe_mode_delay > > # Write test pattern > echo "2. Writing test pattern..." > for i in 0 1 2 3; do > VAL=$((1 << i)) > echo "VAL:$VAL" > perl -e "print chr($VAL) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=$i oflag=direct status=none > done > sync > sleep 1 # Give time for writes to settle > > echo "3. Reading disk states before failure..." > > # Calculate parity disk for stripe 0 (first 32k chunk) > STRIPE_NUM=0 > N_DISKS=5 > PARITY_INDEX=$(get_parity_disk $STRIPE_NUM $N_DISKS) > echo "Calculated parity disk index for stripe $STRIPE_NUM: $PARITY_INDEX" > > # Map RAID device index to loop device > PARITY_DISK=$((10 + $PARITY_INDEX)) > echo "Parity is on loop$PARITY_DISK" > echo > > for i in {10..14}; do > # Read first byte from device > BYTE=$(read_first_byte /dev/loop$i $DATA_OFFSET) > BEFORE_BYTES[$i]=$BYTE > DISK_NAMES[$i]="loop$i" > echo -n "loop$i: 0x$BYTE" > if [ "$i" = "$PARITY_DISK" ]; then > echo " <-- PARITY disk" > else > echo > fi > done > > echo > echo "4. Fail the first disk..." > > dmsetup suspend disk0-flakey > dmsetup reload disk0-flakey --table "0 $SIZE flakey /dev/loop10 0 0 4294967295 2 error_reads error_writes" > dmsetup resume disk0-flakey > > perl -e "print chr(4) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=2 oflag=direct status=none > sync > sleep 1 > > dmsetup suspend journal-flakey > dmsetup reload journal-flakey --table "0 $SIZE flakey /dev/loop15 0 0 4294967295 2 error_reads error_writes" > dmsetup resume journal-flakey > > dmsetup suspend disk1-flakey > dmsetup reload disk1-flakey --table "0 $SIZE flakey /dev/loop11 0 0 4294967295 2 error_reads error_writes" > dmsetup resume disk1-flakey > > echo "5. Attempting write (should fail the 2nd disk and the journal)..." > dd if=/dev/zero of=$RAID_DEV bs=32k count=1 seek=0 oflag=direct 2>&1 || echo "Write failed (expected)" > sync > sleep 1 > > echo > echo "6. Checking if parity was incorrectly updated:" > CHANGED=0 > for i in {10..14}; do > # Read current state from device > BYTE_AFTER=$(read_first_byte /dev/loop$i $DATA_OFFSET) > BYTE_BEFORE=${BEFORE_BYTES[$i]} > > if [ "$BYTE_BEFORE" != "$BYTE_AFTER" ]; then > echo "*** loop$i CHANGED: 0x$BYTE_BEFORE -> 0x$BYTE_AFTER ***" > CHANGED=$((CHANGED + 1)) > > if [ "$i" = "$PARITY_DISK" ]; then > echo " ^^ PARITY WAS UPDATED - BUG DETECTED!" > fi > else > echo "loop$i unchanged: 0x$BYTE_BEFORE" > fi > done > > echo > echo "RESULT:" > if [ $CHANGED -gt 0 ]; then > echo "*** BUG DETECTED: $CHANGED disk(s) changed despite journal failure ***" > else > echo "✓ GOOD: No disks changed" > fi > > cleanup > > drivers/md/raid5.c | 19 +++++++++++++++++++ > 1 file changed, 19 insertions(+) > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > index 023649fe2476..da7756cc6a2a 100644 > --- a/drivers/md/raid5.c > +++ b/drivers/md/raid5.c > @@ -1146,9 +1146,21 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) > > might_sleep(); > > + /* Successfully logged to journal */ > if (log_stripe(sh, s) == 0) > return; > > + /* > + * Journal device failed. Only abort writes if we have > + * too many failed devices to maintain consistency. > + */ > + if (conf->log && r5l_log_disk_error(conf) && > + s->failed > conf->max_degraded && > + (s->to_write || s->written)) { > + set_bit(STRIPE_HANDLE, &sh->state); > + return; > + } > + > should_defer = conf->batch_bio_dispatch && conf->group_cnt; > > for (i = disks; i--; ) { > @@ -3672,6 +3684,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, > * still be locked - so just clear all R5_LOCKED flags > */ > clear_bit(R5_LOCKED, &sh->dev[i].flags); > + /* Clear R5_Want* flags to prevent stale operations > + * from executing on retry. > + */ > + clear_bit(R5_Wantwrite, &sh->dev[i].flags); > + clear_bit(R5_Wantcompute, &sh->dev[i].flags); > + clear_bit(R5_WantFUA, &sh->dev[i].flags); > + clear_bit(R5_Wantdrain, &sh->dev[i].flags); > } > s->to_write = 0; > s->written = 0;