When operating in write-through journal mode, a journal device failure
can lead to parity corruption and silent data loss. This occurs because
the current implementation continues to update parity even when journal
writes fail, violating the write-through consistency guarantee.

Signed-off-by: Meir Elisha <meir.elisha@xxxxxxxxxxx>
---
Changes in v2:
- Drop writes only when s->failed > conf->max_degraded
- Remove changes in handle_stripe()

When log_stripe() fails in ops_run_io() we keep writing to the parity
disk, causing parity to get updated as if the write had succeeded.
Shouldn't we fail the write if the journal is down? Am I missing
something here? Thanks in advance for reviewing.

I wrote a script to showcase the issue:

#!/bin/bash
set -e

RAID_DEV="/dev/md127"
DATA_OFFSET=32

# Arrays to store disk states
declare -a BEFORE_BYTES
declare -a DISK_NAMES

cleanup() {
    mdadm --stop $RAID_DEV 2>/dev/null || true
    dmsetup remove disk0-flakey disk1-flakey journal-flakey 2>/dev/null || true
    for i in {10..15}; do
        losetup -d /dev/loop$i 2>/dev/null || true
    done
    rm -f /tmp/disk*.img 2>/dev/null || true
}

# Read the first byte of a device at the given 32k-block offset
read_first_byte() {
    local device=$1
    local offset=$2
    dd if=$device bs=32k skip=$offset count=4 iflag=direct status=none | head -c 1 | xxd -p
}

# Calculate which disk holds parity for a given stripe
# (RAID5 left-symmetric algorithm, the default)
get_parity_disk() {
    local stripe_num=$1
    local n_disks=$2
    local pd_idx=$((($n_disks - 1) - ($stripe_num % $n_disks)))
    echo $pd_idx
}

cleanup

echo "=== RAID5 Parity Bug Test ==="
echo

# Create backing files
for i in {0..5}; do
    dd if=/dev/zero of=/tmp/disk$i.img bs=1M count=100 status=none
    losetup /dev/loop$((10+i)) /tmp/disk$i.img
done

SIZE=$(blockdev --getsz /dev/loop10)

# Create normal linear targets first
dmsetup create disk0-flakey --table "0 $SIZE linear /dev/loop10 0"
dmsetup create disk1-flakey --table "0 $SIZE linear /dev/loop11 0"
dmsetup create journal-flakey --table "0 $SIZE linear /dev/loop15 0"

# Create RAID5 using the dm devices
echo "1. Creating RAID5 array..."
mdadm --create $RAID_DEV --chunk=32K --level=5 --raid-devices=5 \
    /dev/mapper/disk0-flakey \
    /dev/mapper/disk1-flakey \
    /dev/loop12 /dev/loop13 /dev/loop14 \
    --write-journal /dev/mapper/journal-flakey \
    --assume-clean --force

echo "write-through" > /sys/block/md127/md/journal_mode
echo 0 > /sys/block/md127/md/safe_mode_delay

# Write test pattern
echo "2. Writing test pattern..."
for i in 0 1 2 3; do
    VAL=$((1 << i))
    echo "VAL:$VAL"
    perl -e "print chr($VAL) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=$i oflag=direct status=none
done
sync
sleep 1 # Give time for writes to settle

echo "3. Reading disk states before failure..."

# Calculate parity disk for stripe 0 (first 32k chunk)
STRIPE_NUM=0
N_DISKS=5
PARITY_INDEX=$(get_parity_disk $STRIPE_NUM $N_DISKS)
echo "Calculated parity disk index for stripe $STRIPE_NUM: $PARITY_INDEX"

# Map RAID device index to loop device
PARITY_DISK=$((10 + $PARITY_INDEX))
echo "Parity is on loop$PARITY_DISK"
echo

for i in {10..14}; do
    # Read first byte from device
    BYTE=$(read_first_byte /dev/loop$i $DATA_OFFSET)
    BEFORE_BYTES[$i]=$BYTE
    DISK_NAMES[$i]="loop$i"
    echo -n "loop$i: 0x$BYTE"
    if [ "$i" = "$PARITY_DISK" ]; then
        echo " <-- PARITY disk"
    else
        echo
    fi
done
echo

echo "4. Fail the first disk..."
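# The flakey tables used below follow the dm-flakey format
# "<dev> <offset> <up_interval> <down_interval> <num_features> <features...>":
# up_interval=0 with a very large down_interval keeps the device permanently
# "down", and the error_reads/error_writes features make every read and write
# fail with an error instead of being silently dropped, simulating a hard
# device failure.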
dmsetup suspend disk0-flakey
dmsetup reload disk0-flakey --table "0 $SIZE flakey /dev/loop10 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk0-flakey

perl -e "print chr(4) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=2 oflag=direct status=none
sync
sleep 1

dmsetup suspend journal-flakey
dmsetup reload journal-flakey --table "0 $SIZE flakey /dev/loop15 0 0 4294967295 2 error_reads error_writes"
dmsetup resume journal-flakey

dmsetup suspend disk1-flakey
dmsetup reload disk1-flakey --table "0 $SIZE flakey /dev/loop11 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk1-flakey

echo "5. Attempting write (should fail the 2nd disk and the journal)..."
dd if=/dev/zero of=$RAID_DEV bs=32k count=1 seek=0 oflag=direct 2>&1 || echo "Write failed (expected)"
sync
sleep 1
echo

echo "6. Checking if parity was incorrectly updated:"
CHANGED=0
for i in {10..14}; do
    # Read current state from device
    BYTE_AFTER=$(read_first_byte /dev/loop$i $DATA_OFFSET)
    BYTE_BEFORE=${BEFORE_BYTES[$i]}
    if [ "$BYTE_BEFORE" != "$BYTE_AFTER" ]; then
        echo "*** loop$i CHANGED: 0x$BYTE_BEFORE -> 0x$BYTE_AFTER ***"
        CHANGED=$((CHANGED + 1))
        if [ "$i" = "$PARITY_DISK" ]; then
            echo "    ^^ PARITY WAS UPDATED - BUG DETECTED!"
        fi
    else
        echo "loop$i unchanged: 0x$BYTE_BEFORE"
    fi
done
echo

echo "RESULT:"
if [ $CHANGED -gt 0 ]; then
    echo "*** BUG DETECTED: $CHANGED disk(s) changed despite journal failure ***"
else
    echo "✓ GOOD: No disks changed"
fi

cleanup

 drivers/md/raid5.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 023649fe2476..509ab5673cf8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1146,8 +1146,21 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
-	if (log_stripe(sh, s) == 0)
+	if (log_stripe(sh, s) == 0) {
+		/* Successfully logged to journal */
 		return;
+	}
+
+	if (conf->log && r5l_log_disk_error(conf)) {
+		/*
+		 * Journal device failed. Only abort writes if we have
+		 * too many failed devices to maintain consistency.
+		 */
+		if (s->failed > conf->max_degraded && (s->to_write || s->written)) {
+			set_bit(STRIPE_HANDLE, &sh->state);
+			return;
+		}
+	}
 
 	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
 
@@ -3672,6 +3685,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			 * still be locked - so just clear all R5_LOCKED flags
 			 */
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
+			clear_bit(R5_Wantwrite, &sh->dev[i].flags);
+			clear_bit(R5_Wantcompute, &sh->dev[i].flags);
+			clear_bit(R5_WantFUA, &sh->dev[i].flags);
+			clear_bit(R5_Wantdrain, &sh->dev[i].flags);
 		}
 		s->to_write = 0;
 		s->written = 0;
-- 
2.34.1