From: Ernie Petrides <petrides@redhat.com>

There is a long-standing locking hole in the kernel's handling of the
signals related to stopping and resuming processes.  When a process
handles SIGSTOP, SIGTSTP, SIGTTIN, or SIGTTOU, the "sighand" lock is
held while the signal is dequeued and appropriate masks are updated.
But the "sighand" lock is dropped in several cases before the task's
state is changed to TASK_STOPPED (or before a group-stop is initiated).

If a process running on another cpu posts a SIGCONT or SIGKILL just after
the "victim" process releases the lock but before its state is set to
TASK_STOPPED, the corresponding wakeup will be lost and the victim will
remain stopped despite the successive SIGCONT or SIGKILL.  In this case,
a repeated posting of SIGCONT or SIGKILL will have no effect, since the
original one is already pending (and so causes a repeated posting to be
discarded).  The occurrence of a SIGSTOP/SIGKILL race where the victim
has blocked all other signals will result in an unkillable process.

Although a fabricated test program can reproduce a SIGSTOP/SIGCONT race
hang in less than a minute (on a 2-cpu Dell Precision 450), the scenario
that has been most frequently encountered is a hang during reboot or
shutdown.  This occurs because /sbin/killall5 brackets the scanning of
/proc/* and associated signal posting to (most) of the processes still
running with kill(-1, SIGSTOP) and kill(-1, SIGCONT) calls to temporarily
freeze every process except for "init".  Occasionally, its parent (running
the /etc/rc6.d/S01reboot shell script) gets stuck in TASK_STOPPED state
with pending SIGCONT and SIGCLD signals, but with no other process left
to wake it up.

In order to fix the race condition, the locking in do_signal_stop()
and get_signal_to_deliver() needed reworking to close the hole.  Due
to lock ordering issues between the "sighand" lock and tasklist_lock,
there are two cases where the former lock needs to be released and
then reacquired, thus allowing a tiny hole for a SIGCONT/SIGKILL to
be posted.  These two cases are resolved by rechecking for a pending
SIGCONT/SIGKILL after the locks are (re)acquired in the proper order.

Anyone wanting a copy of the test program may e-mail me off-list.


 kernel/signal.c |  103 ++++++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 74 insertions(+), 29 deletions(-)

diff -puN kernel/signal.c~signal-race-fix kernel/signal.c
--- 25/kernel/signal.c~signal-race-fix	2003-07-27 12:24:44.000000000 -0700
+++ 25-akpm/kernel/signal.c	2003-07-27 12:24:44.000000000 -0700
@@ -155,6 +155,11 @@ int max_queued_signals = 1024;
 	(!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
 	 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
 
+#define sig_avoid_stop_race() \
+	(sigtestsetmask(&current->pending.signal, M(SIGCONT) | M(SIGKILL)) || \
+	 sigtestsetmask(&current->signal->shared_pending.signal, \
+						  M(SIGCONT) | M(SIGKILL)))
+
 static inline int sig_ignored(struct task_struct *t, int sig)
 {
 	void * handler;
@@ -1550,17 +1555,13 @@ do_signal_stop(int signr)
 	struct sighand_struct *sighand = current->sighand;
 	int stop_count = -1;
 
+	/* spin_lock_irq(&sighand->siglock) is now done in caller */
+
 	if (sig->group_stop_count > 0) {
 		/*
 		 * There is a group stop in progress.  We don't need to
 		 * start another one.
 		 */
-		spin_lock_irq(&sighand->siglock);
-		if (unlikely(sig->group_stop_count == 0)) {
-			BUG_ON(!sig->group_exit);
-			spin_unlock_irq(&sighand->siglock);
-			return;
-		}
 		signr = sig->group_exit_code;
 		stop_count = --sig->group_stop_count;
 		current->exit_code = signr;
@@ -1569,17 +1570,27 @@ do_signal_stop(int signr)
 	}
 	else if (thread_group_empty(current)) {
 		/*
-		 * No locks needed in this case.
+		 * Lock must be held through transition to stopped state.
 		 */
 		current->exit_code = signr;
 		set_current_state(TASK_STOPPED);
+		spin_unlock_irq(&sighand->siglock);
 	}
 	else {
 		/*
 		 * There is no group stop already in progress.
-		 * We must initiate one now.
+		 * We must initiate one now, but that requires
+		 * dropping siglock to get both the tasklist lock
+		 * and siglock again in the proper order.  Note that
+		 * this allows an intervening SIGCONT to be posted.
+		 * We need to check for that and bail out if necessary.
 		 */
 		struct task_struct *t;
+
+		spin_unlock_irq(&sighand->siglock);
+
+		/* signals can be posted during this window */
+
 		read_lock(&tasklist_lock);
 		spin_lock_irq(&sighand->siglock);
 
@@ -1594,6 +1605,16 @@ do_signal_stop(int signr)
 			return;
 		}
 
+		if (unlikely(sig_avoid_stop_race())) {
+			/*
+			 * Either a SIGCONT or a SIGKILL signal was
+			 * posted in the siglock-not-held window.
+			 */
+			spin_unlock_irq(&sighand->siglock);
+			read_unlock(&tasklist_lock);
+			return;
+		}
+
 		if (sig->group_stop_count == 0) {
 			sig->group_exit_code = signr;
 			stop_count = 0;
@@ -1668,20 +1689,21 @@ static inline int handle_group_stop(void
 int get_signal_to_deliver(siginfo_t *info, struct pt_regs *regs, void *cookie)
 {
 	sigset_t *mask = &current->blocked;
+	int signr = 0;
 
+relock:
+	spin_lock_irq(&current->sighand->siglock);
 	for (;;) {
-		unsigned long signr = 0;
 		struct k_sigaction *ka;
 
-		spin_lock_irq(&current->sighand->siglock);
 		if (unlikely(current->signal->group_stop_count > 0) &&
 		    handle_group_stop())
-			continue;
+			goto relock;
+
 		signr = dequeue_signal(current, mask, info);
-		spin_unlock_irq(&current->sighand->siglock);
 
 		if (!signr)
-			break;
+			break; /* will return 0 */
 
 		if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) {
 			ptrace_signal_deliver(regs, cookie);
@@ -1690,25 +1712,25 @@ int get_signal_to_deliver(siginfo_t *inf
 			 * If there is a group stop in progress,
 			 * we must participate in the bookkeeping.
 			 */
-			if (current->signal->group_stop_count > 0) {
-				spin_lock_irq(&current->sighand->siglock);
+			if (current->signal->group_stop_count > 0)
 				--current->signal->group_stop_count;
-				spin_unlock_irq(&current->sighand->siglock);
-			}
 
 			/* Let the debugger run.  */
 			current->exit_code = signr;
 			current->last_siginfo = info;
 			set_current_state(TASK_STOPPED);
+			spin_unlock_irq(&current->sighand->siglock);
 			notify_parent(current, SIGCHLD);
 			schedule();
 
 			current->last_siginfo = NULL;
 
 			/* We're back.  Did the debugger cancel the sig?  */
+			spin_lock_irq(&current->sighand->siglock);
 			signr = current->exit_code;
 			if (signr == 0)
 				continue;
+
 			current->exit_code = 0;
 
 			/* Update the siginfo structure if the signal has
@@ -1725,9 +1747,7 @@ int get_signal_to_deliver(siginfo_t *inf
 
 			/* If the (new) signal is now blocked, requeue it.  */
 			if (sigismember(&current->blocked, signr)) {
-				spin_lock_irq(&current->sighand->siglock);
 				specific_send_sig_info(signr, info, current);
-				spin_unlock_irq(&current->sighand->siglock);
 				continue;
 			}
 		}
@@ -1736,7 +1756,7 @@ int get_signal_to_deliver(siginfo_t *inf
 		if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
 			continue;
 		if (ka->sa.sa_handler != SIG_DFL) /* Run the handler.  */
-			return signr;
+			break; /* will return non-zero "signr" value */
 
 		/*
 		 * Now we are doing the default action for this signal.
@@ -1753,20 +1773,44 @@ int get_signal_to_deliver(siginfo_t *inf
 			 * The default action is to stop all threads in
 			 * the thread group.  The job control signals
 			 * do nothing in an orphaned pgrp, but SIGSTOP
-			 * always works.
+			 * always works.  Note that siglock needs to be
+			 * dropped during the call to is_orphaned_pgrp()
+			 * because of lock ordering with tasklist_lock.
+			 * This allows an intervening SIGCONT to be posted.
+			 * We need to check for that and bail out if necessary.
 			 */
-			if (signr == SIGSTOP ||
-			    !is_orphaned_pgrp(current->pgrp))
-				do_signal_stop(signr);
-			continue;
+			if (signr == SIGSTOP) {
+				do_signal_stop(signr); /* releases siglock */
+				goto relock;
+			}
+			spin_unlock_irq(&current->sighand->siglock);
+
+			/* signals can be posted during this window */
+
+			if (is_orphaned_pgrp(current->pgrp))
+				goto relock;
+
+			spin_lock_irq(&current->sighand->siglock);
+			if (unlikely(sig_avoid_stop_race())) {
+				/*
+				 * Either a SIGCONT or a SIGKILL signal was
+				 * posted in the siglock-not-held window.
+				 */
+				continue;
+			}
+
+			do_signal_stop(signr); /* releases siglock */
+			goto relock;
 		}
 
+		spin_unlock_irq(&current->sighand->siglock);
+
 		/*
 		 * Anything else is fatal, maybe with a core dump.
 		 */
 		current->flags |= PF_SIGNALED;
 		if (sig_kernel_coredump(signr) &&
-		    do_coredump(signr, signr, regs)) {
+		    do_coredump((long)signr, signr, regs)) {
 			/*
 			 * That killed all other threads in the group and
 			 * synchronized with their demise, so there can't
@@ -1780,8 +1824,8 @@ int get_signal_to_deliver(siginfo_t *inf
 			BUG_ON(!current->signal->group_exit);
 			BUG_ON(current->signal->group_exit_code != code);
 			do_exit(code);
-				/* NOTREACHED */
-			}
+			/* NOTREACHED */
+		}
 
 		/*
 		 * Death signals, no core dump.
@@ -1789,7 +1833,8 @@ int get_signal_to_deliver(siginfo_t *inf
 		do_group_exit(signr);
 		/* NOTREACHED */
 	}
-	return 0;
+	spin_unlock_irq(&current->sighand->siglock);
+	return signr;
 }
 
 #endif

_