When ssdd is invoked with nforks > 100 && niters == 10000 on a tuned, realtime kernel, the following error messages can be seen:

forktest#4/8719: EXITING, ERROR: wait on PTRACE_SINGLESTEP #385: no SIGCHLD seen (signal count == 0), signo 5
forktest#1/8716: EXITING, ERROR: wait on PTRACE_SINGLESTEP #398: no SIGCHLD seen (signal count == 0), signo 5
forktest#6/8721: EXITING, ERROR: wait on PTRACE_SINGLESTEP #385: no SIGCHLD seen (signal count == 0), signo 5
forktest#10/8725: EXITING, ERROR: wait on PTRACE_SINGLESTEP #388: no SIGCHLD seen (signal count == 0), signo 5
forktest#11/8726: EXITING, ERROR: wait on PTRACE_SINGLESTEP #388: no SIGCHLD seen (signal count == 0), signo 5
forktest#12/8727: EXITING, ERROR: wait on PTRACE_SINGLESTEP #389: no SIGCHLD seen (signal count == 0), signo 5
forktest#14/8729: EXITING, ERROR: wait on PTRACE_SINGLESTEP #389: no SIGCHLD seen (signal count == 0), signo 5
forktest#15/8730: EXITING, ERROR: wait on PTRACE_SINGLESTEP #389: no SIGCHLD seen (signal count == 0), signo 5

This behavior is caused by ptrace_stop() being unable to sleep after taking tasklist_lock(). As forktest() generates "niter" PTRACE_SINGLESTEPs for nforks, in the rare event where nforks exceeds the defaults by a large order of magnitude, the sporadic test failures caused by missing SIGCHLDs indicate that the tracees are unable to effectively wait for their asynchronous signals to arrive -- as denoted in the previous sleeps for check_sigchld().

Therefore, by performing a sigtimedwait() in check_sigchld(), we give the tracee enough CPU time to call do_notify_parent_cldstop()->send_signal_locked().

After applying this patch, the aforementioned issue is observed to be mitigated in scenarios with a high number of nforks.

Suggested-by: Oleg Nesterov <oleg@xxxxxxxxxx>
Suggested-by: Crystal Wood <crwood@xxxxxxxxxx>
Signed-off-by: Derek Barbosa <debarbos@xxxxxxxxxx>
---
V1 -> V2: Addressed review comments, removed usleep() in favor of sigtimedwait().
src/ssdd/ssdd.c | 72 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/src/ssdd/ssdd.c b/src/ssdd/ssdd.c index 50f7424..0089678 100644 --- a/src/ssdd/ssdd.c +++ b/src/ssdd/ssdd.c @@ -30,6 +30,7 @@ #include <getopt.h> #include <string.h> #include <signal.h> +#include <time.h> #include <errno.h> #include <sys/types.h> @@ -67,7 +68,7 @@ static const char *get_state_name(int state) static int quiet; static char jsonfile[MAX_PATH]; -static int got_sigchld; +volatile int got_sigchld; enum option_value { OPT_NFORKS=1, OPT_NITERS, OPT_HELP, OPT_JSON, OPT_QUIET }; @@ -127,24 +128,36 @@ static int do_wait(pid_t *wait_pid, int *ret_sig) return STATE_UNKNOWN; } -static int check_sigchld(void) +static int check_sigchld(sigset_t* set) { - int i; - /* - * The signal is asynchronous so give it some - * time to arrive. - */ - for (i = 0; i < 10 && !got_sigchld; i++) - usleep(1000); /* 10 msecs */ - for (i = 0; i < 10 && !got_sigchld; i++) - usleep(2000); /* 20 + 10 = 30 msecs */ - for (i = 0; i < 10 && !got_sigchld; i++) - usleep(4000); /* 40 + 30 = 70 msecs */ - for (i = 0; i < 10 && !got_sigchld; i++) - usleep(8000); /* 80 + 70 = 150 msecs */ - for (i = 0; i < 10 && !got_sigchld; i++) - usleep(16000); /* 160 + 150 = 310 msecs */ + struct timespec timeout; + timeout.tv_sec = 10; + timeout.tv_nsec = 0; + int recv_sig = 0; + + /* + * Check the handler flag, then if need be, wait for the signal to + * arrive + */ + if(!got_sigchld) { + recv_sig = sigtimedwait(set, NULL, &timeout); + } + + if(sigprocmask(SIG_UNBLOCK, set, NULL) == -1){ + printf("EXITING, ERROR: " + "unable to mask signal set\n"); + exit(1); + } + + if (recv_sig == -1){ + printf("EXITING, ERROR: " + "Timeout: no signal recieved in 10 seconds\n"); + exit(1); + } else if (recv_sig == SIGCHLD){ + got_sigchld = 1; + } + return got_sigchld; } @@ -195,6 +208,20 @@ static int forktests(int testid) exit(1); } + /* + * Block the signal before it is generated + * 
Ensures we can synchronously wait for it. + */ + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGCHLD); + + if (sigprocmask(SIG_BLOCK, &set, NULL) == -1){ + printf("EXITING, ERROR: " + "unable to mask signal set\n"); + exit(1); + } + /* * Attach to the child. */ @@ -224,7 +251,7 @@ static int forktests(int testid) ret_sig); exit(1); } - if (!check_sigchld()) { + if (!check_sigchld(&set)) { printf("forktest#%d/%d: EXITING, ERROR: " "wait on PTRACE_ATTACH saw a SIGCHLD count of %d, should be 1\n", testid, getpid(), got_sigchld); @@ -238,6 +265,13 @@ static int forktests(int testid) * step the tracee. */ for (i = 0; i < nsteps; i++) { + + if (sigprocmask(SIG_BLOCK, &set, NULL) == -1){ + printf("EXITING, ERROR: " + "unable to mask signal set\n"); + exit(1); + } + pstatus = ptrace(PTRACE_SINGLESTEP, child, NULL, NULL); if (pstatus) { @@ -271,7 +305,7 @@ static int forktests(int testid) testid, getpid(), i, ret_sig); exit(1); } - if (!check_sigchld()) { + if (!check_sigchld(&set)) { printf("forktest#%d/%d: EXITING, ERROR: " "wait on PTRACE_SINGLESTEP #%d: no SIGCHLD seen " "(signal count == 0), signo %d\n", -- 2.50.0