aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDarrick J. Wong <djwong@kernel.org>2024-01-11 18:07:07 -0800
committerDarrick J. Wong <djwong@kernel.org>2024-01-11 18:08:47 -0800
commit1c95c17c8857223d05e8c4516af42c6d41ae579a (patch)
tree093d875659829af9e5eb80629941d2d20bc5756b
parent0c22427fe07e22e25991742f7948945a039272fc (diff)
downloadxfsprogs-dev-1c95c17c8857223d05e8c4516af42c6d41ae579a.tar.gz
xfs_scrub_all: fix termination signal handling
Currently, xfs_scrub_all does not handle termination signals well. SIGTERM and SIGINT are left to their default handlers, which are immediate termination of the process group in the case of SIGTERM and raising KeyboardInterrupt in the case of SIGINT. Terminating the process group is fine when the xfs_scrub processes are direct children, but this completely doesn't work if we're farming the work out to systemd services since we don't terminate the child service. Instead, they keep going. Raising KeyboardInterrupt doesn't work because once the main thread calls sys.exit at the bottom of main(), it blocks in the python runtime waiting for child threads to terminate. There's no longer any context to handle an exception, so the signal is ignored and no child processes are killed. In other words, if you try to kill a running xfs_scrub_all, chances are good it won't kill the child xfs_scrub processes. This is undesirable and egregious since we actually have the ability to track and kill all the subprocesses that we create. Solve the subproblem of getting stuck in the python runtime by calling it repeatedly until we no longer have subprocesses. This means that the main thread loops until all threads have exited. Solve the subproblem of the signals doing the wrong thing by setting up our own signal handler that can wake up the main thread and initiate subprocess shutdown, no matter whether the subprocesses are systemd services or directly fork/exec'd. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de>
-rw-r--r--scrub/xfs_scrub_all.in64
1 files changed, 52 insertions, 12 deletions
diff --git a/scrub/xfs_scrub_all.in b/scrub/xfs_scrub_all.in
index 2c20b91fdb..d0ab27fd30 100644
--- a/scrub/xfs_scrub_all.in
+++ b/scrub/xfs_scrub_all.in
@@ -14,6 +14,7 @@ import time
import sys
import os
import argparse
+import signal
from io import TextIOWrapper
retcode = 0
@@ -196,6 +197,45 @@ def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs):
cond.notify()
cond.release()
+def signal_scrubs(signum, cond):
+ '''Handle termination signals by killing xfs_scrub children.'''
+ global debug, terminate
+
+ if debug:
+ print('Signal handler called with signal', signum)
+ sys.stdout.flush()
+
+ terminate = True
+ cond.acquire()
+ cond.notify()
+ cond.release()
+
+def wait_for_termination(cond, killfuncs):
+ '''Wait for a child thread to terminate. Returns True if we should
+ abort the program, False otherwise.'''
+ global debug, terminate
+
+ if debug:
+ print('waiting for threads to terminate')
+ sys.stdout.flush()
+
+ cond.acquire()
+ try:
+ cond.wait()
+ except KeyboardInterrupt:
+ terminate = True
+ cond.release()
+
+ if not terminate:
+ return False
+
+ print("Terminating...")
+ sys.stdout.flush()
+ while len(killfuncs) > 0:
+ fn = killfuncs.pop()
+ fn()
+ return True
+
def main():
'''Find mounts, schedule scrub runs.'''
def thr(mnt, devs):
@@ -231,6 +271,10 @@ def main():
running_devs = set()
killfuncs = set()
cond = threading.Condition()
+
+ signal.signal(signal.SIGINT, lambda s, f: signal_scrubs(s, cond))
+ signal.signal(signal.SIGTERM, lambda s, f: signal_scrubs(s, cond))
+
while len(fs) > 0:
if len(running_devs) == 0:
mnt, devs = fs.popitem()
@@ -250,18 +294,14 @@ def main():
thr(mnt, devs)
for p in poppers:
fs.pop(p)
- cond.acquire()
- try:
- cond.wait()
- except KeyboardInterrupt:
- terminate = True
- print("Terminating...")
- sys.stdout.flush()
- while len(killfuncs) > 0:
- fn = killfuncs.pop()
- fn()
- fs = []
- cond.release()
+
+ # Wait for one thread to finish
+ if wait_for_termination(cond, killfuncs):
+ break
+
+ # Wait for the rest of the threads to finish
+ while len(killfuncs) > 0:
+ wait_for_termination(cond, killfuncs)
if journalthread is not None:
journalthread.terminate()