about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Darrick J. Wong <djwong@kernel.org> 2024-01-11 18:07:06 -0800
committer: Darrick J. Wong <djwong@kernel.org> 2024-01-11 18:08:47 -0800
commit: 3abc6a0c3979bf19fa3e1d1126e363ea42c48971 (patch)
tree: 47d2dbd177c952f276ee2d03a5ee14ce4f3f7753
parent: 27df677a7b31c51c4595d2ae9078927b790d94b9 (diff)
download: xfsprogs-dev-3abc6a0c3979bf19fa3e1d1126e363ea42c48971.tar.gz
xfs_scrub_all: survive systemd restarts when waiting for services

If xfs_scrub_all detects a running systemd, it will use it to invoke
xfs_scrub subprocesses in a sandboxed and resource-controlled
environment. Unfortunately, if you happen to restart dbus or systemd
while it's running, you get this:

    systemd[1]: Reexecuting.
    xfs_scrub_all[9958]: Warning! D-Bus connection terminated.
    xfs_scrub_all[9956]: Warning! D-Bus connection terminated.
    xfs_scrub_all[9956]: Failed to wait for response: Connection reset by peer
    xfs_scrub_all[9958]: Failed to wait for response: Connection reset by peer
    xfs_scrub_all[9930]: Scrubbing / done, (err=1)
    xfs_scrub_all[9930]: Scrubbing /storage done, (err=1)

The xfs_scrub units themselves are still running, it's just that the
`systemctl start' command that xfs_scrub_all uses to start and wait for
the unit lost its connection to dbus and hence is no longer monitoring
sub-services.

When this happens, we don't have great options -- systemctl doesn't have
a command to wait on an activating (aka running) unit. Emulate the
functionality we normally get by polling the failed/active statuses.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>

-rw-r--r--  scrub/xfs_scrub_all.in  78
1 files changed, 65 insertions, 13 deletions
diff --git a/scrub/xfs_scrub_all.in b/scrub/xfs_scrub_all.in
index 671d588177..ab9b491fb4 100644
--- a/scrub/xfs_scrub_all.in
+++ b/scrub/xfs_scrub_all.in
@@ -14,6 +14,7 @@ import time
import sys
import os
import argparse
+from io import TextIOWrapper
retcode = 0
terminate = False
@@ -58,12 +59,18 @@ def find_mounts():
return fs
-def kill_systemd(unit, proc):
- '''Kill systemd unit.'''
- proc.terminate()
- cmd=['systemctl', 'stop', unit]
- x = subprocess.Popen(cmd)
- x.wait()
+def backtick(cmd):
+ '''Generator function that yields lines of a program's stdout.'''
+ p = subprocess.Popen(cmd, stdout = subprocess.PIPE)
+ for line in TextIOWrapper(p.stdout, encoding="utf-8"):
+ yield line.strip()
+
+def remove_killfunc(killfuncs, fn):
+ '''Ensure fn is not in killfuncs.'''
+ try:
+ killfuncs.remove(fn)
+ except:
+ pass
def run_killable(cmd, stdout, killfuncs, kill_fn):
'''Run a killable program. Returns program retcode or -1 if we can't start it.'''
@@ -72,10 +79,7 @@ def run_killable(cmd, stdout, killfuncs, kill_fn):
real_kill_fn = lambda: kill_fn(proc)
killfuncs.add(real_kill_fn)
proc.wait()
- try:
- killfuncs.remove(real_kill_fn)
- except:
- pass
+ remove_killfunc(killfuncs, real_kill_fn)
return proc.returncode
except:
return -1
@@ -96,6 +100,56 @@ def path_to_serviceunit(path):
except:
return None
+def systemctl_stop(unitname):
+ '''Stop a systemd unit.'''
+ cmd = ['systemctl', 'stop', unitname]
+ x = subprocess.Popen(cmd)
+ x.wait()
+
+def systemctl_start(unitname, killfuncs):
+ '''Start a systemd unit and wait for it to complete.'''
+ stop_fn = None
+ cmd = ['systemctl', 'start', unitname]
+ try:
+ proc = subprocess.Popen(cmd, stdout = DEVNULL())
+ stop_fn = lambda: systemctl_stop(unitname)
+ killfuncs.add(stop_fn)
+ proc.wait()
+ ret = proc.returncode
+ except:
+ if stop_fn is not None:
+ remove_killfunc(killfuncs, stop_fn)
+ return -1
+
+ if ret != 1:
+ remove_killfunc(killfuncs, stop_fn)
+ return ret
+
+ # If systemctl-start returns 1, it's possible that the service failed
+ # or that dbus/systemd restarted and the client program lost its
+ # connection -- according to the systemctl man page, 1 means "unit not
+ # failed".
+ #
+ # Either way, we switch to polling the service status to try to wait
+ # for the service to end. As of systemd 249, the is-active command
+ # returns any of the following states: active, reloading, inactive,
+ # failed, activating, deactivating, or maintenance. Apparently these
+ # strings are not localized.
+ while True:
+ try:
+ for l in backtick(['systemctl', 'is-active', unitname]):
+ if l == 'failed':
+ remove_killfunc(killfuncs, stop_fn)
+ return 1
+ if l == 'inactive':
+ remove_killfunc(killfuncs, stop_fn)
+ return 0
+ except:
+ remove_killfunc(killfuncs, stop_fn)
+ return -1
+
+ time.sleep(1)
+
def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs):
'''Run a scrub process.'''
global retcode, terminate
@@ -110,9 +164,7 @@ def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs):
# Try it the systemd way
unitname = path_to_serviceunit(path)
if unitname is not None:
- cmd=['systemctl', 'start', unitname]
- ret = run_killable(cmd, DEVNULL(), killfuncs, \
- lambda proc: kill_systemd(unitname, proc))
+ ret = systemctl_start(unitname, killfuncs)
if ret == 0 or ret == 1:
print("Scrubbing %s done, (err=%d)" % (mnt, ret))
sys.stdout.flush()