[PATCH 3/4] mdcheck: simplify start / continue logic and add "--restart" logic

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The current logic of "mdcheck" is susceptible to races when multiple
mdcheck instances run simultaneously, as checks can be initiated from both
"mdcheck_start.service" and "mdcheck_continue.service".

This patch changes the logic as follows.

* When `mdcheck` has finished checking a RAID array, it will create a
  marker `/var/lib/mdcheckChecked_$UUID`.
* A new option `--restart` is introduced. `mdcheck --restart` removes all
  `/var/lib/mdcheck/Checked_*` markers.
  This is called from `mdcheck_start.service`, which is typically started
  by a timer in large time intervals (default once per month).
* `mdcheck --continue` works as it used to. It continues previously started
  checks (where the `/var/lib/mdcheck/MD_UUID_$UUID` file is present and
  contains a start position) for the check.
  This usage is *not recommended any more*.
* `mdcheck` with no arguments is like `--continue`, but it also starts new
  checks for all arrays for which no check has previously been
  started, *except* for arrays for which a marker
  `/var/lib/mdcheck/Checked_$UUID` exists.
  `mdcheck_continue.service` calls `mdcheck` this way. It is called in
  short time intervals, by default once per day.
* Combining `--restart` and `--continue` is an error.

This way, the only systemd service that actually triggers a kernel-level
RAID check is `mdcheck_continue.service`, which avoids races.

When all checks have finished, `mdcheck_continue.service` is no-op.
When `mdcheck_start.service` runs, the checks re re-enabled and will be
started from 0 by the next `mdcheck_continue.service` invocation.

Signed-off-by: Martin Wilck <mwilck@xxxxxxxx>
---
 misc/mdcheck                     | 61 ++++++++++++++++++++++----------
 systemd/mdcheck_continue.service |  6 ++--
 systemd/mdcheck_start.service    |  3 +-
 systemd/mdcheck_start.timer      |  2 +-
 4 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/misc/mdcheck b/misc/mdcheck
index 5ea26cd..df4fd3b 100644
--- a/misc/mdcheck
+++ b/misc/mdcheck
@@ -21,15 +21,21 @@
 #
 # It supports a 'time budget' such that any incomplete 'check'
 # will be checkpointed when that time has expired.
-# A subsequent invocation can allow the 'check' to continue.
+# A subsequent invocation will allow the 'check' to continue.
 #
 # Options are:
-#   --continue    Don't start new checks, only continue old ones.
+#   --continue    Don't start new checks, only continue previously started ones.
+#   --restart:    Enable restarting the array checks
 #   --duration    This is passed to "date --date=$duration" to find out
 #		  when to finish
 #
-# To support '--continue', arrays are identified by UUID and the 'sync_completed'
-# value is stored  in /var/lib/mdcheck/$UUID
+# Arrays are identified by UUID and the 'sync_completed' value is stored
+# in /var/lib/mdcheck/MD_UUID_$UUID. If this file exists on startup of
+# the script, the check will continue where the previous check left off.
+# After the check completes, /var/lib/mdcheck/Checked_$UUID will be created.
+# Another full check will be started after this file is removed.
+# Use "mdcheck --restart" to remove these markers and re-enable checking
+# all arrays.
 
 # get device name from sysfs
 devname() {
@@ -40,24 +46,29 @@ devname() {
     echo -n "/dev/$dev"
 }
 
-args=$(getopt -o hcd: -l help,continue,duration: -n mdcheck -- "$@")
+args=$(getopt -o hcrd: -l help,continue,restart,duration: -n mdcheck -- "$@")
 rv=$?
 if [ $rv -ne 0 ]; then exit $rv; fi
 
 eval set -- $args
 
 cont=
+restart=
 endtime=
 while [ " $1" != " --" ]
 do
     case $1 in
 	--help )
-		echo >&2 'Usage: mdcheck [--continue] [--duration time-offset]'
+		echo >&2 'Usage: mdcheck [--restart|--continue] [--duration time-offset]'
 		echo >&2 '  time-offset must be understood by "date --date"'
 		exit 0
 		;;
-	--continue ) cont=yes ;;
-	--duration ) shift; dur=$1
+	--continue )
+		cont=yes ;;
+	--restart )
+		restart=yes ;;
+	--duration )
+		shift; dur=$1
 		endtime=$(date --date "$dur" "+%s")
 		;;
     esac
@@ -65,6 +76,17 @@ do
 done
 shift
 
+if [ "$cont" = yes ]; then
+	if [ "$restart" = yes ]; then
+		echo 'ERROR: --restart and --continue cannot be combined' >&2
+		exit 1
+	fi
+elif [ "$restart" = yes ]; then
+	echo 'Re-enabling array checks for all arrays' >&2
+	rm -f /var/lib/mdcheck/Checked_*
+	exit $?
+fi
+
 # We need a temp file occasionally...
 tmp=/var/lib/mdcheck/.md-check-$$
 cnt=0
@@ -116,17 +138,16 @@ do
 	[[ "$MD_UUID" ]] || continue
 
 	fl="/var/lib/mdcheck/MD_UUID_$MD_UUID"
-	if [ -z "$cont" ]
-	then
-		start=0
-		logger -p daemon.info mdcheck start checking $dev
-	elif [ -z "$MD_UUID" -o ! -f "$fl" ]
-	then
-		# Nothing to continue here
-		continue
-	else
+	checked="${fl/MD_UUID_/Checked_}"
+	if [[ -f "$fl" ]]; then
+		[[ ! -f "$checked" ]] || {
+		    echo "WARNING: $checked exists, continuing anyway" >&2
+		}
 		start=`cat "$fl"`
-		logger -p daemon.info mdcheck continue checking $dev from $start
+	elif [[ ! -f "$checked" && -z "$cont" ]]; then
+		start=0
+	else # nothing to do
+		continue
 	fi
 
 	: "$((cnt+=1))"
@@ -136,6 +157,7 @@ do
 	echo $start > $fl
 	echo $start > $sys/md/sync_min
 	echo check > $sys/md/sync_action
+	logger -p daemon.info mdcheck checking $dev from $start
 done
 
 if [ -z "$endtime" ]
@@ -158,7 +180,8 @@ do
 		then
 			logger -p daemon.info mdcheck finished checking $dev
 			eval MD_${i}_fl=
-			rm -f $fl
+			rm -f "$fl"
+			touch "${fl/MD_UUID_/Checked_}"
 			continue;
 		fi
 		read a rest < $sys/md/sync_completed
diff --git a/systemd/mdcheck_continue.service b/systemd/mdcheck_continue.service
index cd12db8..8eb97cf 100644
--- a/systemd/mdcheck_continue.service
+++ b/systemd/mdcheck_continue.service
@@ -7,10 +7,12 @@
 
 [Unit]
 Description=MD array scrubbing - continuation
-ConditionPathExistsGlob=/var/lib/mdcheck/MD_UUID_*
 Documentation=man:mdadm(8)
 
 [Service]
 Type=simple
 Environment="MDADM_CHECK_DURATION=6 hours"
-ExecStart=/usr/share/mdadm/mdcheck --continue --duration ${MDADM_CHECK_DURATION}
+# Note that we're not calling "mdcheck --continue" here.
+# We want previously started checks to be continued, and new ones
+# to be started.
+ExecStart=/usr/share/mdadm/mdcheck --duration ${MDADM_CHECK_DURATION}
diff --git a/systemd/mdcheck_start.service b/systemd/mdcheck_start.service
index 16ba6b6..c7ddd4f 100644
--- a/systemd/mdcheck_start.service
+++ b/systemd/mdcheck_start.service
@@ -12,5 +12,4 @@ Documentation=man:mdadm(8)
 
 [Service]
 Type=simple
-Environment="MDADM_CHECK_DURATION=6 hours"
-ExecStart=/usr/share/mdadm/mdcheck --duration ${MDADM_CHECK_DURATION}
+ExecStart=/usr/share/mdadm/mdcheck --restart
diff --git a/systemd/mdcheck_start.timer b/systemd/mdcheck_start.timer
index 1b8f3f2..8d09b3f 100644
--- a/systemd/mdcheck_start.timer
+++ b/systemd/mdcheck_start.timer
@@ -9,7 +9,7 @@
 Description=MD array scrubbing
 
 [Timer]
-OnCalendar=Sun *-*-1..7 1:05:00
+OnCalendar=Sun *-*-1..7 0:45:00
 
 [Install]
 WantedBy= mdmonitor.service
-- 
2.51.0





[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux