Hi,
On 2025/08/14 22:28, Meir Elisha wrote:
On 13/08/2025 4:33, Yu Kuai wrote:
Hi,
On 2025/08/12 21:08, Meir Elisha wrote:
A journal device failure can lead to parity corruption and silent data
loss in degraded arrays. This occurs because the current
implementation continues to update the parity even when journal
writes fail.
Fix this by:
1. In ops_run_io(), check if journal writes failed before proceeding
with disk writes. Abort write operations when journal is faulty.
2. In handle_failed_stripe(), clear all R5_Want* flags to ensure
clean state for stripe retry after journal failure.
3. In handle_stripe(), correctly identify write operations that must
be failed when journal is unavailable.
Signed-off-by: Meir Elisha <meir.elisha@xxxxxxxxxxx>
---
When log_stripe() fails in ops_run_io() we keep writing to the parity
disk, causing the parity to get updated as if the write had succeeded.
Shouldn't we fail the write if the journal is down? Am I missing something here?
Thanks in advance for reviewing.
I wrote a script to showcase the issue.
#!/bin/bash
set -e
RAID_DEV="/dev/md127"
DATA_OFFSET=32
# Arrays to store disk states
declare -a BEFORE_BYTES
declare -a DISK_NAMES
cleanup() {
mdadm --stop $RAID_DEV 2>/dev/null || true
dmsetup remove disk0-flakey disk1-flakey journal-flakey 2>/dev/null || true
for i in {10..15}; do
losetup -d /dev/loop$i 2>/dev/null || true
done
rm -f /tmp/disk*.img 2>/dev/null || true
}
# Function to read first byte from device at offset
read_first_byte() {
local device=$1
local offset=$2
dd if=$device bs=32k skip=$offset count=4 iflag=direct status=none | head -c 1 | xxd -p
}
# Function to calculate which disk holds parity for a given stripe
# RAID5 left-symmetric algorithm (default)
get_parity_disk() {
local stripe_num=$1
local n_disks=$2
local pd_idx=$((($n_disks - 1) - ($stripe_num % $n_disks)))
echo $pd_idx
}
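# For example, with the 5-member array created below, stripe 0 maps to
# pd_idx = (5 - 1) - (0 % 5) = 4 (the last member), stripe 1 to
# pd_idx = 3, and so on, rotating one member to the left per stripe.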
cleanup
echo "=== RAID5 Parity Bug Test ==="
echo
# Create backing files
for i in {0..5}; do
dd if=/dev/zero of=/tmp/disk$i.img bs=1M count=100 status=none
losetup /dev/loop$((10+i)) /tmp/disk$i.img
done
SIZE=$(blockdev --getsz /dev/loop10)
# Create normal linear targets first
dmsetup create disk0-flakey --table "0 $SIZE linear /dev/loop10 0"
dmsetup create disk1-flakey --table "0 $SIZE linear /dev/loop11 0"
dmsetup create journal-flakey --table "0 $SIZE linear /dev/loop15 0"
# Create RAID5 using the dm devices
echo "1. Creating RAID5 array..."
mdadm --create $RAID_DEV --chunk=32K --level=5 --raid-devices=5 \
/dev/mapper/disk0-flakey \
/dev/mapper/disk1-flakey \
/dev/loop12 /dev/loop13 /dev/loop14 \
--write-journal /dev/mapper/journal-flakey \
--assume-clean --force
echo "write-through" > /sys/block/md127/md/journal_mode
echo 0 > /sys/block/md127/md/safe_mode_delay
# Write test pattern
echo "2. Writing test pattern..."
for i in 0 1 2 3; do
VAL=$((1 << i))
echo "VAL:$VAL"
perl -e "print chr($VAL) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=$i oflag=direct status=none
done
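# With a 32K chunk on 5 members, these four 32K writes fill the four
# data chunks of the first stripe (0x01, 0x02, 0x04, 0x08), so the
# parity chunk for that stripe should end up as 0x01^0x02^0x04^0x08 = 0x0f.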
sync
sleep 1 # Give time for writes to settle
echo "3. Reading disk states before failure..."
# Calculate parity disk for stripe 0 (first 32k chunk)
STRIPE_NUM=0
N_DISKS=5
PARITY_INDEX=$(get_parity_disk $STRIPE_NUM $N_DISKS)
echo "Calculated parity disk index for stripe $STRIPE_NUM: $PARITY_INDEX"
# Map RAID device index to loop device
PARITY_DISK=$((10 + $PARITY_INDEX))
echo "Parity is on loop$PARITY_DISK"
echo
for i in {10..14}; do
# Read first byte from device
BYTE=$(read_first_byte /dev/loop$i $DATA_OFFSET)
BEFORE_BYTES[$i]=$BYTE
DISK_NAMES[$i]="loop$i"
echo -n "loop$i: 0x$BYTE"
if [ "$i" = "$PARITY_DISK" ]; then
echo " <-- PARITY disk"
else
echo
fi
done
echo
echo "4. Fail the first disk..."
dmsetup suspend disk0-flakey
dmsetup reload disk0-flakey --table "0 $SIZE flakey /dev/loop10 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk0-flakey
perl -e "print chr(4) x 32768" | dd of=$RAID_DEV bs=32k count=1 seek=2 oflag=direct status=none
sync
sleep 1
dmsetup suspend journal-flakey
dmsetup reload journal-flakey --table "0 $SIZE flakey /dev/loop15 0 0 4294967295 2 error_reads error_writes"
dmsetup resume journal-flakey
dmsetup suspend disk1-flakey
dmsetup reload disk1-flakey --table "0 $SIZE flakey /dev/loop11 0 0 4294967295 2 error_reads error_writes"
dmsetup resume disk1-flakey
echo "5. Attempting write (should fail the 2nd disk and the journal)..."
dd if=/dev/zero of=$RAID_DEV bs=32k count=1 seek=0 oflag=direct 2>&1 || echo "Write failed (expected)"
sync
sleep 1
echo
echo "6. Checking if parity was incorrectly updated:"
CHANGED=0
for i in {10..14}; do
# Read current state from device
BYTE_AFTER=$(read_first_byte /dev/loop$i $DATA_OFFSET)
BYTE_BEFORE=${BEFORE_BYTES[$i]}
if [ "$BYTE_BEFORE" != "$BYTE_AFTER" ]; then
echo "*** loop$i CHANGED: 0x$BYTE_BEFORE -> 0x$BYTE_AFTER ***"
CHANGED=$((CHANGED + 1))
if [ "$i" = "$PARITY_DISK" ]; then
echo " ^^ PARITY WAS UPDATED - BUG DETECTED!"
fi
else
echo "loop$i unchanged: 0x$BYTE_BEFORE"
fi
done
echo
echo "RESULT:"
if [ $CHANGED -gt 0 ]; then
echo "*** BUG DETECTED: $CHANGED disk(s) changed despite journal failure ***"
else
echo "✓ GOOD: No disks changed
fi
cleanup
drivers/md/raid5.c | 28 +++++++++++++++++++++++-----
1 file changed, 23 insertions(+), 5 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 023649fe2476..856dd3f0907f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1146,8 +1146,25 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
- if (log_stripe(sh, s) == 0)
+ if (log_stripe(sh, s) == 0) {
+ /* Successfully logged to journal */
return;
+ }
+
+ if (conf->log && r5l_log_disk_error(conf)) {
+ /*
+ * Journal device failed. We must not proceed with writes
+ * to prevent a write hole.
+ * The RAID write hole occurs when parity is updated
+ * without successfully updating all data blocks.
+ * If the journal write fails, we must abort the entire
+ * stripe operation to maintain consistency.
+ */
+ if (s->to_write || s->written) {
+ set_bit(STRIPE_HANDLE, &sh->state);
+ return;
+ }
I think failing all writes to the array is too radical. Even if the
log disk fails, everything will be fine as long as there is no power
failure, which should be unlikely to happen.
And yes, if a power failure does happen, data atomicity can't be
guaranteed; however, please note that we will still make sure the data
is consistent, by a resync based on the bitmap, or a full array resync
if there is no bitmap.
I think, if the log disk is down, let's keep the current behavior so the
user can still use the array; the user should be aware that data atomicity
is no longer guaranteed. If you really want to stop writing to such an
array to guarantee data atomicity after a power failure, I can accept a
switch to enable this behaviour manually.
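Just to illustrate what I mean by such a switch, a rough sketch (the
attribute and the conf field are made-up names, not an existing
interface, and locking is omitted):

static ssize_t
raid5_show_fail_writes_on_log_error(struct mddev *mddev, char *page)
{
	struct r5conf *conf = mddev->private;

	/* hypothetical bool field on r5conf, default off */
	return sprintf(page, "%d\n", conf ? conf->fail_writes_on_log_error : 0);
}

static ssize_t
raid5_store_fail_writes_on_log_error(struct mddev *mddev, const char *page,
				     size_t len)
{
	struct r5conf *conf = mddev->private;
	bool val;

	if (kstrtobool(page, &val))
		return -EINVAL;
	if (!conf)
		return -ENODEV;
	conf->fail_writes_on_log_error = val;
	return len;
}

static struct md_sysfs_entry raid5_fail_writes_on_log_error =
__ATTR(fail_writes_on_log_error, 0644,
       raid5_show_fail_writes_on_log_error,
       raid5_store_fail_writes_on_log_error);

Then the early return you add in ops_run_io() would only trigger when
that knob is set.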
Thanks,
Kuai
Hi Kuai,
Thanks for reviewing. I want to reiterate the scenario I've tested.
I have a RAID5 (4 data disks + 1 parity) and a journal (consistency policy=journal).
I start by failing one of the disks so the array becomes degraded.
Then I fail the 2nd disk and the journal at the same time (using dm-flakey)
when I initiate the write.
While examining the parity disk I noticed that the parity got calculated as
if the write had succeeded; this is shown by the reproduce script above.
I'm not sure I understand your comment on resync, since we are using the journal policy.
If the parity is invalid, how can we guarantee the data will be valid?
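As a simplified single-byte illustration of the inconsistency (using the
values the script writes, not modelling the exact failure pattern): the
first stripe holds D0=0x01, D1=0x02, D2=0x04, D3=0x08, so P =
0x01^0x02^0x04^0x08 = 0x0f. If a write that should set D0=0x00 updates
only the parity (P becomes 0x0e) while the data writes fail, then
reconstructing D1 from the other blocks gives 0x01^0x04^0x08^0x0e = 0x03
instead of 0x02, i.e. silently corrupted data.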
I appreciate your response.
OK, I haven't looked at your test in detail yet; just from your code, I don't
see how you handle the case where more disks than max_degraded have failed.
It looks like you're failing all writes if the log disk failed. What I'm
worried about is: if only the log disk failed, or the log disk failed
together with fewer than max_degraded failed data disks, are you changing
behaviour?
Thanks,
Kuai
+ }
should_defer = conf->batch_bio_dispatch && conf->group_cnt;
@@ -3672,6 +3689,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
* still be locked - so just clear all R5_LOCKED flags
*/
clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ clear_bit(R5_Wantwrite, &sh->dev[i].flags);
+ clear_bit(R5_Wantcompute, &sh->dev[i].flags);
+ clear_bit(R5_WantFUA, &sh->dev[i].flags);
+ clear_bit(R5_Wantdrain, &sh->dev[i].flags);
}
s->to_write = 0;
s->written = 0;
@@ -4966,12 +4987,9 @@ static void handle_stripe(struct stripe_head *sh)
/*
* check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
- *
- * When journal device failed (log_failed), we will only process
- * the stripe if there is data need write to raid disks
*/
if (s.failed > conf->max_degraded ||
- (s.log_failed && s.injournal == 0)) {
+ (s.log_failed && (s.to_write || s.written || s.syncing))) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
I think the changes in handle_stripe() are wrong here.