[PATCH] md/raid5: Fix parity corruption on journal failure

A journal device failure can lead to parity corruption and silent data
loss in degraded arrays. This occurs because the current
implementation continues to update the parity even when journal
writes fail.

Fix this by:
1. In ops_run_io(), check whether the journal write failed before
   proceeding with disk writes, and abort write operations when the
   journal is faulty.
2. In handle_failed_stripe(), clear all R5_Want* flags to ensure a
   clean state when the stripe is retried after a journal failure.
3. In handle_stripe(), correctly identify write operations that must
   be failed when the journal is unavailable.

Signed-off-by: Meir Elisha <meir.elisha@xxxxxxxxxxx>
---

When log_stripe() fails in ops_run_io() we keep writing to the parity
disk, so the parity gets updated as if the write had succeeded.
Shouldn't we fail the write if the journal is down? Am I missing
something here?
Thanks in advance for reviewing.
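
To make the write hole concrete, here is a minimal userspace sketch
(plain XOR arithmetic, nothing taken from the md code) of what happens
when parity is updated while the corresponding data block never reaches
the raid disks:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* 4 data blocks + parity, like one chunk row of the 5-device array */
	uint8_t d0 = 0x01, d1 = 0x02, d2 = 0x04, d3 = 0x08;
	uint8_t p  = d0 ^ d1 ^ d2 ^ d3;         /* consistent parity */

	/* A write to d2 never reaches the raid disks (journal/disk faulty),
	 * but parity is updated anyway as if the write had succeeded. */
	uint8_t d2_new = 0x00;
	uint8_t p_bad  = p ^ d2 ^ d2_new;

	/* Later, disk 0 is missing and is reconstructed from the survivors:
	 * d2 still holds the old data while parity is the new one. */
	uint8_t d0_rebuilt = p_bad ^ d1 ^ d2 ^ d3;

	printf("d0 was 0x%02x, rebuilt as 0x%02x\n", d0, d0_rebuilt);
	return 0;
}

This prints "d0 was 0x01, rebuilt as 0x05": the reconstructed block no
longer matches what was written, which is the silent corruption the
script below demonstrates on a real array.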

I wrote a script to showcase the issue.

#!/bin/bash

set -e

RAID_DEV="/dev/md127"
DATA_OFFSET=32

# Arrays to store disk states
declare -a BEFORE_BYTES
declare -a DISK_NAMES

cleanup() {
    mdadm --stop $RAID_DEV 2>/dev/null || true
    dmsetup remove disk0-flakey disk1-flakey journal-flakey 2>/dev/null || true
    for i in {10..15}; do
        losetup -d /dev/loop$i 2>/dev/null || true
    done
    rm -f /tmp/disk*.img 2>/dev/null || true
}

# Read the first byte of the 32k chunk at the given offset (in 32k units)
read_first_byte() {
    local device=$1
    local offset=$2
    dd if=$device bs=32k skip=$offset count=4 iflag=direct status=none | head -c 1 | xxd -p
}

# Function to calculate which disk holds parity for a given stripe
# RAID5 left-symmetric algorithm (default)
get_parity_disk() {
    local stripe_num=$1
    local n_disks=$2
    local pd_idx=$((($n_disks - 1) - ($stripe_num % $n_disks)))
    echo $pd_idx
}
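# With the 5-device array created below, stripes 0..4 map to parity on
# raid slots 4,3,2,1,0 respectively, e.g. get_parity_disk 0 5 -> 4.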

cleanup
echo "=== RAID5 Parity Bug Test ==="
echo

# Create backing files
for i in {0..5}; do
    dd if=/dev/zero of=/tmp/disk$i.img bs=1M count=100 status=none
    losetup /dev/loop$((10+i)) /tmp/disk$i.img
done

SIZE=$(blockdev --getsz /dev/loop10)

# Create normal linear targets first
dmsetup create disk0-flakey --table "0 $SIZE linear /dev/loop10 0"
dmsetup create disk1-flakey --table "0 $SIZE linear /dev/loop11 0"
dmsetup create journal-flakey --table "0 $SIZE linear /dev/loop15 0"

# Create RAID5 using the dm devices
echo "1. Creating RAID5 array..."
mdadm --create $RAID_DEV --chunk=32K --level=5 --raid-devices=5 \
    /dev/mapper/disk0-flakey \
    /dev/mapper/disk1-flakey \
    /dev/loop12 /dev/loop13 /dev/loop14 \
    --write-journal /dev/mapper/journal-flakey \
    --assume-clean --force

echo "write-through" > /sys/block/md127/md/journal_mode
echo 0 > /sys/block/md127/md/safe_mode_delay
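# write-through: each stripe is logged to the journal before data/parity are
# written to the raid disks, so a failed journal is hit on the write path;
# safe_mode_delay=0 disables md's safe-mode delay.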

# Write test pattern
echo "2. Writing test pattern..."
for i in 0 1 2 3; do
    VAL=$((1 << i))
    echo "VAL:$VAL"
    perl -e "print chr($VAL) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=$i oflag=direct status=none
done
sync
sleep 1  # Give time for writes to settle

echo "3. Reading disk states before failure..."

# Calculate parity disk for stripe 0 (first 32k chunk)
STRIPE_NUM=0
N_DISKS=5
PARITY_INDEX=$(get_parity_disk $STRIPE_NUM $N_DISKS)
echo "Calculated parity disk index for stripe $STRIPE_NUM: $PARITY_INDEX"

# Map RAID device index to loop device
PARITY_DISK=$((10 + $PARITY_INDEX))
echo "Parity is on loop$PARITY_DISK"
echo

for i in {10..14}; do
    # Read first byte from device
    BYTE=$(read_first_byte /dev/loop$i $DATA_OFFSET)
    BEFORE_BYTES[$i]=$BYTE
    DISK_NAMES[$i]="loop$i"
    
    echo -n "loop$i: 0x$BYTE"
    if [ "$i" = "$PARITY_DISK" ]; then
        echo " <-- PARITY disk"
    else
        echo
    fi
done

echo
echo "4. Fail the first disk..."

dmsetup suspend disk0-flakey
dmsetup reload disk0-flakey --table "0 $SIZE flakey /dev/loop10 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk0-flakey

perl -e "print chr(4) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=2 oflag=direct status=none
sync
sleep 1

dmsetup suspend journal-flakey
dmsetup reload journal-flakey --table "0 $SIZE flakey /dev/loop15 0 0 4294967295 2 error_reads error_writes"
dmsetup resume journal-flakey

dmsetup suspend disk1-flakey
dmsetup reload disk1-flakey --table "0 $SIZE flakey /dev/loop11 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk1-flakey

echo "5. Attempting write (should fail the 2nd disk and the journal)..."
dd if=/dev/zero of=$RAID_DEV bs=32k count=1 seek=0 oflag=direct 2>&1 || echo "Write failed (expected)"
sync
sleep 1

echo
echo "6. Checking if parity was incorrectly updated:"
CHANGED=0
for i in {10..14}; do
    # Read current state from device
    BYTE_AFTER=$(read_first_byte /dev/loop$i $DATA_OFFSET)
    BYTE_BEFORE=${BEFORE_BYTES[$i]}

    if [ "$BYTE_BEFORE" != "$BYTE_AFTER" ]; then
        echo "*** loop$i CHANGED: 0x$BYTE_BEFORE -> 0x$BYTE_AFTER ***"
        CHANGED=$((CHANGED + 1))

        if [ "$i" = "$PARITY_DISK" ]; then
            echo "  ^^ PARITY WAS UPDATED - BUG DETECTED!"
        fi
    else
        echo "loop$i unchanged: 0x$BYTE_BEFORE"
    fi
done

echo
echo "RESULT:"
if [ $CHANGED -gt 0 ]; then
    echo "*** BUG DETECTED: $CHANGED disk(s) changed despite journal failure ***"
else
    echo "✓ GOOD: No disks changed
fi

cleanup

 drivers/md/raid5.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 023649fe2476..856dd3f0907f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1146,8 +1146,25 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
-	if (log_stripe(sh, s) == 0)
+	if (log_stripe(sh, s) == 0) {
+		/* Successfully logged to journal */
 		return;
+	}
+
+	if (conf->log && r5l_log_disk_error(conf)) {
+		/*
+		 * Journal device failed. We must not proceed with writes
+		 * to prevent a write hole.
+		 * The RAID write hole occurs when parity is updated
+		 * without successfully updating all data blocks.
+		 * If the journal write fails, we must abort the entire
+		 * stripe operation to maintain consistency.
+		 */
+		if (s->to_write || s->written) {
+			set_bit(STRIPE_HANDLE, &sh->state);
+			return;
+		}
+	}
 
 	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
 
@@ -3672,6 +3689,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		 * still be locked - so just clear all R5_LOCKED flags
 		 */
 		clear_bit(R5_LOCKED, &sh->dev[i].flags);
+		clear_bit(R5_Wantwrite, &sh->dev[i].flags);
+		clear_bit(R5_Wantcompute, &sh->dev[i].flags);
+		clear_bit(R5_WantFUA, &sh->dev[i].flags);
+		clear_bit(R5_Wantdrain, &sh->dev[i].flags);
 	}
 	s->to_write = 0;
 	s->written = 0;
@@ -4966,12 +4987,9 @@ static void handle_stripe(struct stripe_head *sh)
 	/*
 	 * check if the array has lost more than max_degraded devices and,
 	 * if so, some requests might need to be failed.
-	 *
-	 * When journal device failed (log_failed), we will only process
-	 * the stripe if there is data need write to raid disks
 	 */
 	if (s.failed > conf->max_degraded ||
-	    (s.log_failed && s.injournal == 0)) {
+	    (s.log_failed && (s.to_write || s.written || s.syncing))) {
 		sh->check_state = 0;
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
-- 
2.34.1
