A journal device failure can lead to parity corruption and silent data
loss in degraded arrays. This occurs because the current implementation
continues to update the parity even when journal writes fail.

Fix this by:

1. In ops_run_io(), checking whether the journal write failed before
   proceeding with the disk writes, and aborting write operations when
   the journal is faulty.
2. In handle_failed_stripe(), clearing all R5_Want* flags to ensure a
   clean state for the stripe retry after a journal failure.
3. In handle_stripe(), correctly identifying write operations that must
   be failed when the journal is unavailable.

Signed-off-by: Meir Elisha <meir.elisha@xxxxxxxxxxx>
---
When log_stripe() fails in ops_run_io() we keep writing to the parity
disk, causing the parity to be updated as if the write had succeeded.
Shouldn't we fail the write when the journal is down? Am I missing
something here? Thanks in advance for reviewing.

I wrote a script to showcase the issue:

#!/bin/bash
set -e

RAID_DEV="/dev/md127"
DATA_OFFSET=32

# Arrays to store disk states
declare -a BEFORE_BYTES
declare -a DISK_NAMES

cleanup() {
    mdadm --stop $RAID_DEV 2>/dev/null || true
    dmsetup remove disk0-flakey disk1-flakey journal-flakey 2>/dev/null || true
    for i in {10..15}; do
        losetup -d /dev/loop$i 2>/dev/null || true
    done
    rm -f /tmp/disk*.img 2>/dev/null || true
}

# Function to read first byte from device at offset
read_first_byte() {
    local device=$1
    local offset=$2
    dd if=$device bs=32k skip=$offset count=4 iflag=direct status=none | head -c 1 | xxd -p
}

# Function to calculate which disk holds parity for a given stripe
# RAID5 left-symmetric algorithm (default)
get_parity_disk() {
    local stripe_num=$1
    local n_disks=$2
    local pd_idx=$((($n_disks - 1) - ($stripe_num % $n_disks)))
    echo $pd_idx
}

cleanup

echo "=== RAID5 Parity Bug Test ==="
echo

# Create backing files
for i in {0..5}; do
    dd if=/dev/zero of=/tmp/disk$i.img bs=1M count=100 status=none
    losetup /dev/loop$((10+i)) /tmp/disk$i.img
done

SIZE=$(blockdev --getsz /dev/loop10)

# Create normal linear targets first
dmsetup create disk0-flakey --table "0 $SIZE linear /dev/loop10 0"
dmsetup create disk1-flakey --table "0 $SIZE linear /dev/loop11 0"
dmsetup create journal-flakey --table "0 $SIZE linear /dev/loop15 0"

# Create RAID5 using the dm devices
echo "1. Creating RAID5 array..."
mdadm --create $RAID_DEV --chunk=32K --level=5 --raid-devices=5 \
    /dev/mapper/disk0-flakey \
    /dev/mapper/disk1-flakey \
    /dev/loop12 /dev/loop13 /dev/loop14 \
    --write-journal /dev/mapper/journal-flakey \
    --assume-clean --force

echo "write-through" > /sys/block/md127/md/journal_mode
echo 0 > /sys/block/md127/md/safe_mode_delay

# Write test pattern
echo "2. Writing test pattern..."
for i in 0 1 2 3; do
    VAL=$((1 << i))
    echo "VAL:$VAL"
    perl -e "print chr($VAL) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=$i oflag=direct status=none
done
sync
sleep 1 # Give time for writes to settle

echo "3. Reading disk states before failure..."

# Calculate parity disk for stripe 0 (first 32k chunk)
STRIPE_NUM=0
N_DISKS=5
PARITY_INDEX=$(get_parity_disk $STRIPE_NUM $N_DISKS)
echo "Calculated parity disk index for stripe $STRIPE_NUM: $PARITY_INDEX"

# Map RAID device index to loop device
PARITY_DISK=$((10 + $PARITY_INDEX))
echo "Parity is on loop$PARITY_DISK"
echo

for i in {10..14}; do
    # Read first byte from device
    BYTE=$(read_first_byte /dev/loop$i $DATA_OFFSET)
    BEFORE_BYTES[$i]=$BYTE
    DISK_NAMES[$i]="loop$i"
    echo -n "loop$i: 0x$BYTE"
    if [ "$i" = "$PARITY_DISK" ]; then
        echo " <-- PARITY disk"
    else
        echo
    fi
done
echo

echo "4. Fail the first disk..."
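# Note for reviewers: the failure injection below reloads each dm target
# with "flakey <dev> <offset> <up_interval> <down_interval> <num_features>
# <features...>". With up_interval=0 and a huge down_interval the device is
# permanently "down", so error_reads/error_writes make every I/O fail
# without hot-removing the device from the array. error_reads is a newer
# flakey feature than error_writes, so this needs a reasonably recent kernel.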
dmsetup suspend disk0-flakey
dmsetup reload disk0-flakey --table "0 $SIZE flakey /dev/loop10 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk0-flakey

perl -e "print chr(4) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=2 oflag=direct status=none
sync
sleep 1

dmsetup suspend journal-flakey
dmsetup reload journal-flakey --table "0 $SIZE flakey /dev/loop15 0 0 4294967295 2 error_reads error_writes"
dmsetup resume journal-flakey

dmsetup suspend disk1-flakey
dmsetup reload disk1-flakey --table "0 $SIZE flakey /dev/loop11 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk1-flakey

echo "5. Attempting write (should fail the 2nd disk and the journal)..."
dd if=/dev/zero of=$RAID_DEV bs=32k count=1 seek=0 oflag=direct 2>&1 || echo "Write failed (expected)"
sync
sleep 1
echo

echo "6. Checking if parity was incorrectly updated:"
CHANGED=0
for i in {10..14}; do
    # Read current state from device
    BYTE_AFTER=$(read_first_byte /dev/loop$i $DATA_OFFSET)
    BYTE_BEFORE=${BEFORE_BYTES[$i]}
    if [ "$BYTE_BEFORE" != "$BYTE_AFTER" ]; then
        echo "*** loop$i CHANGED: 0x$BYTE_BEFORE -> 0x$BYTE_AFTER ***"
        CHANGED=$((CHANGED + 1))
        if [ "$i" = "$PARITY_DISK" ]; then
            echo "    ^^ PARITY WAS UPDATED - BUG DETECTED!"
        fi
    else
        echo "loop$i unchanged: 0x$BYTE_BEFORE"
    fi
done
echo

echo "RESULT:"
if [ $CHANGED -gt 0 ]; then
    echo "*** BUG DETECTED: $CHANGED disk(s) changed despite journal failure ***"
else
    echo "✓ GOOD: No disks changed"
fi

cleanup

 drivers/md/raid5.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 023649fe2476..856dd3f0907f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1146,8 +1146,25 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
-	if (log_stripe(sh, s) == 0)
+	if (log_stripe(sh, s) == 0) {
+		/* Successfully logged to journal */
 		return;
+	}
+
+	if (conf->log && r5l_log_disk_error(conf)) {
+		/*
+		 * Journal device failed. We must not proceed with writes
+		 * to prevent a write hole.
+		 * The RAID write hole occurs when parity is updated
+		 * without successfully updating all data blocks.
+		 * If the journal write fails, we must abort the entire
+		 * stripe operation to maintain consistency.
+		 */
+		if (s->to_write || s->written) {
+			set_bit(STRIPE_HANDLE, &sh->state);
+			return;
+		}
+	}
 
 	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
 
@@ -3672,6 +3689,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		 * still be locked - so just clear all R5_LOCKED flags
 		 */
 		clear_bit(R5_LOCKED, &sh->dev[i].flags);
+		clear_bit(R5_Wantwrite, &sh->dev[i].flags);
+		clear_bit(R5_Wantcompute, &sh->dev[i].flags);
+		clear_bit(R5_WantFUA, &sh->dev[i].flags);
+		clear_bit(R5_Wantdrain, &sh->dev[i].flags);
 	}
 	s->to_write = 0;
 	s->written = 0;
@@ -4966,12 +4987,9 @@ static void handle_stripe(struct stripe_head *sh)
 	/*
 	 * check if the array has lost more than max_degraded devices and,
 	 * if so, some requests might need to be failed.
-	 *
-	 * When journal device failed (log_failed), we will only process
-	 * the stripe if there is data need write to raid disks
 	 */
 	if (s.failed > conf->max_degraded ||
-	    (s.log_failed && s.injournal == 0)) {
+	    (s.log_failed && (s.to_write || s.written || s.syncing))) {
 		sh->check_state = 0;
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
-- 
2.34.1
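P.S. For completeness, a possible extension of the step-6 check (a sketch
only, not part of the patch; it reuses the script's read_first_byte helper
and the same left-symmetric layout assumption, and could be dropped in just
before cleanup): recompute the expected parity byte for stripe 0 by XOR-ing
the first data byte of each data disk, and compare it with the byte actually
stored on the parity disk. With the write hole, the stored parity reflects
the failed write while the data does not, so the two should disagree.

# Hypothetical extra check: recompute stripe 0's parity byte from the
# on-disk data bytes and compare it with the stored parity byte.
EXPECTED=0
for i in {10..14}; do
    [ "$i" = "$PARITY_DISK" ] && continue
    EXPECTED=$((EXPECTED ^ 0x$(read_first_byte /dev/loop$i $DATA_OFFSET)))
done
STORED=$((0x$(read_first_byte /dev/loop$PARITY_DISK $DATA_OFFSET)))
printf 'stripe 0 parity byte: stored=0x%02x expected=0x%02x (%s)\n' \
    "$STORED" "$EXPECTED" \
    "$([ "$STORED" -eq "$EXPECTED" ] && echo consistent || echo MISMATCH)"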