diff options
author | Darrick J. Wong <djwong@kernel.org> | 2024-01-11 18:07:06 -0800 |
---|---|---|
committer | Darrick J. Wong <djwong@kernel.org> | 2024-01-11 18:08:47 -0800 |
commit | 3abc6a0c3979bf19fa3e1d1126e363ea42c48971 (patch) | |
tree | 47d2dbd177c952f276ee2d03a5ee14ce4f3f7753 | |
parent | 27df677a7b31c51c4595d2ae9078927b790d94b9 (diff) | |
download | xfsprogs-dev-3abc6a0c3979bf19fa3e1d1126e363ea42c48971.tar.gz |
xfs_scrub_all: survive systemd restarts when waiting for services
If xfs_scrub_all detects a running systemd, it will use it to invoke
xfs_scrub subprocesses in a sandboxed and resource-controlled
environment. Unfortunately, if you happen to restart dbus or systemd
while it's running, you get this:
systemd[1]: Reexecuting.
xfs_scrub_all[9958]: Warning! D-Bus connection terminated.
xfs_scrub_all[9956]: Warning! D-Bus connection terminated.
xfs_scrub_all[9956]: Failed to wait for response: Connection reset by peer
xfs_scrub_all[9958]: Failed to wait for response: Connection reset by peer
xfs_scrub_all[9930]: Scrubbing / done, (err=1)
xfs_scrub_all[9930]: Scrubbing /storage done, (err=1)
The xfs_scrub units themselves are still running, it's just that the
`systemctl start' command that xfs_scrub_all uses to start and wait for
the unit lost its connection to dbus and hence is no longer monitoring
sub-services.
When this happens, we don't have great options -- systemctl doesn't have
a command to wait on an activating (aka running) unit. Emulate the
functionality we normally get by polling the failed/active statuses.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
-rw-r--r-- | scrub/xfs_scrub_all.in | 78 |
1 files changed, 65 insertions, 13 deletions
diff --git a/scrub/xfs_scrub_all.in b/scrub/xfs_scrub_all.in index 671d588177..ab9b491fb4 100644 --- a/scrub/xfs_scrub_all.in +++ b/scrub/xfs_scrub_all.in @@ -14,6 +14,7 @@ import time import sys import os import argparse +from io import TextIOWrapper retcode = 0 terminate = False @@ -58,12 +59,18 @@ def find_mounts(): return fs -def kill_systemd(unit, proc): - '''Kill systemd unit.''' - proc.terminate() - cmd=['systemctl', 'stop', unit] - x = subprocess.Popen(cmd) - x.wait() +def backtick(cmd): + '''Generator function that yields lines of a program's stdout.''' + p = subprocess.Popen(cmd, stdout = subprocess.PIPE) + for line in TextIOWrapper(p.stdout, encoding="utf-8"): + yield line.strip() + +def remove_killfunc(killfuncs, fn): + '''Ensure fn is not in killfuncs.''' + try: + killfuncs.remove(fn) + except: + pass def run_killable(cmd, stdout, killfuncs, kill_fn): '''Run a killable program. Returns program retcode or -1 if we can't start it.''' @@ -72,10 +79,7 @@ def run_killable(cmd, stdout, killfuncs, kill_fn): real_kill_fn = lambda: kill_fn(proc) killfuncs.add(real_kill_fn) proc.wait() - try: - killfuncs.remove(real_kill_fn) - except: - pass + remove_killfunc(killfuncs, real_kill_fn) return proc.returncode except: return -1 @@ -96,6 +100,56 @@ def path_to_serviceunit(path): except: return None +def systemctl_stop(unitname): + '''Stop a systemd unit.''' + cmd = ['systemctl', 'stop', unitname] + x = subprocess.Popen(cmd) + x.wait() + +def systemctl_start(unitname, killfuncs): + '''Start a systemd unit and wait for it to complete.''' + stop_fn = None + cmd = ['systemctl', 'start', unitname] + try: + proc = subprocess.Popen(cmd, stdout = DEVNULL()) + stop_fn = lambda: systemctl_stop(unitname) + killfuncs.add(stop_fn) + proc.wait() + ret = proc.returncode + except: + if stop_fn is not None: + remove_killfunc(killfuncs, stop_fn) + return -1 + + if ret != 1: + remove_killfunc(killfuncs, stop_fn) + return ret + + # If systemctl-start returns 1, it's possible that the service failed + # or that dbus/systemd restarted and the client program lost its + # connection -- according to the systemctl man page, 1 means "unit not + # failed". + # + # Either way, we switch to polling the service status to try to wait + # for the service to end. As of systemd 249, the is-active command + # returns any of the following states: active, reloading, inactive, + # failed, activating, deactivating, or maintenance. Apparently these + # strings are not localized. + while True: + try: + for l in backtick(['systemctl', 'is-active', unitname]): + if l == 'failed': + remove_killfunc(killfuncs, stop_fn) + return 1 + if l == 'inactive': + remove_killfunc(killfuncs, stop_fn) + return 0 + except: + remove_killfunc(killfuncs, stop_fn) + return -1 + + time.sleep(1) + def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs): '''Run a scrub process.''' global retcode, terminate @@ -110,9 +164,7 @@ def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs): # Try it the systemd way unitname = path_to_serviceunit(path) if unitname is not None: - cmd=['systemctl', 'start', unitname] - ret = run_killable(cmd, DEVNULL(), killfuncs, \ - lambda proc: kill_systemd(unitname, proc)) + ret = systemctl_start(unitname, killfuncs) if ret == 0 or ret == 1: print("Scrubbing %s done, (err=%d)" % (mnt, ret)) sys.stdout.flush() |