[PATCH V2] ssdd: mitigate tracee starvation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



When ssdd is invoked with nforks > 100 && niters == 10000 on a tuned,
realtime kernel, the following error messages can be seen:

forktest#4/8719: EXITING, ERROR: wait on PTRACE_SINGLESTEP #385: no SIGCHLD seen (signal count == 0), signo 5
forktest#1/8716: EXITING, ERROR: wait on PTRACE_SINGLESTEP #398: no SIGCHLD seen (signal count == 0), signo 5
forktest#6/8721: EXITING, ERROR: wait on PTRACE_SINGLESTEP #385: no SIGCHLD seen (signal count == 0), signo 5
forktest#10/8725: EXITING, ERROR: wait on PTRACE_SINGLESTEP #388: no SIGCHLD seen (signal count == 0), signo 5
forktest#11/8726: EXITING, ERROR: wait on PTRACE_SINGLESTEP #388: no SIGCHLD seen (signal count == 0), signo 5
forktest#12/8727: EXITING, ERROR: wait on PTRACE_SINGLESTEP #389: no SIGCHLD seen (signal count == 0), signo 5
forktest#14/8729: EXITING, ERROR: wait on PTRACE_SINGLESTEP #389: no SIGCHLD seen (signal count == 0), signo 5
forktest#15/8730: EXITING, ERROR: wait on PTRACE_SINGLESTEP #389: no SIGCHLD seen (signal count == 0), signo 5

This behavior is caused by ptrace_stop() being unable to sleep after
taking tasklist_lock.

As forktest() generates "niter" PTRACE_SINGLESTEPs for nforks, in the
rare event where nforks exceeds the defaults by a large order of
magnitude, the sporadic test failures caused by missing SIGCHLDs
indicate that the tracees are unable to effectively wait for their
asynchronous signals to arrive -- as denoted in the previous sleeps for
check_sigchld().

Therefore, by performing a sigtimedwait() in check_sigchld(), we
give the tracee enough CPU time to call
do_notify_parent_cldstop()->send_signal_locked().

Applying this patch has been observed to mitigate the
aforementioned issue in scenarios with a high number of nforks.

Suggested-by: Oleg Nesterov <oleg@xxxxxxxxxx>
Suggested-by: Crystal Wood <crwood@xxxxxxxxxx>
Signed-off-by: Derek Barbosa <debarbos@xxxxxxxxxx>
---
V1 -> V2: Addressed review comments, removed usleep() in favor of
sigtimedwait().

 src/ssdd/ssdd.c | 72 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 19 deletions(-)

diff --git a/src/ssdd/ssdd.c b/src/ssdd/ssdd.c
index 50f7424..0089678 100644
--- a/src/ssdd/ssdd.c
+++ b/src/ssdd/ssdd.c
@@ -30,6 +30,7 @@
 #include <getopt.h>
 #include <string.h>
 #include <signal.h>
+#include <time.h>
 #include <errno.h>
 
 #include <sys/types.h>
@@ -67,7 +68,7 @@ static const char *get_state_name(int state)
 static int quiet;
 static char jsonfile[MAX_PATH];
 
-static int got_sigchld;
+volatile int got_sigchld;
 
 enum option_value { OPT_NFORKS=1, OPT_NITERS, OPT_HELP, OPT_JSON, OPT_QUIET };
 
@@ -127,24 +128,36 @@ static int do_wait(pid_t *wait_pid, int *ret_sig)
 	return STATE_UNKNOWN;
 }
 
-static int check_sigchld(void)
+static int check_sigchld(sigset_t* set)
 {
-	int i;
-	/*
-	 * The signal is asynchronous so give it some
-	 * time to arrive.
-	 */
-	for (i = 0; i < 10 && !got_sigchld; i++)
-		usleep(1000); /* 10 msecs */
-	for (i = 0; i < 10 && !got_sigchld; i++)
-		usleep(2000); /* 20 + 10 = 30 msecs */
-	for (i = 0; i < 10 && !got_sigchld; i++)
-		usleep(4000); /* 40 + 30 = 70 msecs */
-	for (i = 0; i < 10 && !got_sigchld; i++)
-		usleep(8000); /* 80 + 70 = 150 msecs */
-	for (i = 0; i < 10 && !got_sigchld; i++)
-		usleep(16000); /* 160 + 150 = 310 msecs */
 
+	struct timespec timeout;
+	timeout.tv_sec = 10;
+	timeout.tv_nsec = 0;
+	int recv_sig = 0;
+
+        /*
+         * Check the handler flag, then if need be, wait for the signal to
+         * arrive
+         */
+        if(!got_sigchld) {
+		recv_sig = sigtimedwait(set, NULL, &timeout);
+	}
+
+	if(sigprocmask(SIG_UNBLOCK, set, NULL) == -1){
+		printf("EXITING, ERROR: "
+		       "unable to mask signal set\n");
+		exit(1);
+	}
+
+	if (recv_sig == -1){
+		printf("EXITING, ERROR: "
+		       "Timeout: no signal recieved in 10 seconds\n");
+		exit(1);
+	} else if (recv_sig == SIGCHLD){
+		got_sigchld = 1;
+	}
+	
 	return got_sigchld;
 }
 
@@ -195,6 +208,20 @@ static int forktests(int testid)
 		exit(1);
 	}
 
+	/*
+	 * Block the signal before it is generated
+	 * Ensures we can synchronously wait for it.
+	 */
+	sigset_t set;
+	sigemptyset(&set);
+	sigaddset(&set, SIGCHLD);
+
+	if (sigprocmask(SIG_BLOCK, &set, NULL) == -1){
+		printf("EXITING, ERROR: "
+		       "unable to mask signal set\n");
+		exit(1);
+	}
+
 	/*
 	 * Attach to the child.
 	 */
@@ -224,7 +251,7 @@ static int forktests(int testid)
 		       ret_sig);
 		exit(1);
 	}
-	if (!check_sigchld()) {
+	if (!check_sigchld(&set)) {
 		printf("forktest#%d/%d: EXITING, ERROR: "
 		       "wait on PTRACE_ATTACH saw a SIGCHLD count of %d, should be 1\n",
 		       testid, getpid(), got_sigchld);
@@ -238,6 +265,13 @@ static int forktests(int testid)
 	 * step the tracee.
 	 */
 	for (i = 0; i < nsteps; i++) {
+
+		if (sigprocmask(SIG_BLOCK, &set, NULL) == -1){
+			printf("EXITING, ERROR: "
+			       "unable to mask signal set\n");
+			exit(1);
+		}
+
 		pstatus = ptrace(PTRACE_SINGLESTEP, child, NULL, NULL);
 
 		if (pstatus) {
@@ -271,7 +305,7 @@ static int forktests(int testid)
 			       testid, getpid(), i, ret_sig);
 			exit(1);
 		}
-		if (!check_sigchld()) {
+		if (!check_sigchld(&set)) {
 			printf("forktest#%d/%d: EXITING, ERROR: "
 			       "wait on PTRACE_SINGLESTEP #%d: no SIGCHLD seen "
 			       "(signal count == 0), signo %d\n",
-- 
2.50.0





[Index of Archives]     [RT Stable]     [Kernel Newbies]     [IDE]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux ATA RAID]     [Samba]     [Video 4 Linux]     [Device Mapper]

  Powered by Linux