author     Theodore Ts'o <tytso@mit.edu>    2023-07-01 00:51:06 -0400
committer  Theodore Ts'o <tytso@mit.edu>    2023-07-01 00:51:06 -0400
commit     df7a7d1c5d784d9b1ef539b756357cf10d4f105f (patch)
tree       f9741ec2ed69b75a2986a7833ba3fcac5ca3946c
parent     0f389da9f980e0bb51a45c7d464bf2b0edc09fdd (diff)
parent     9e67ddbbede1eb9fc766fc2c2e642a890a4db915 (diff)
download   xfstests-bld-df7a7d1c5d784d9b1ef539b756357cf10d4f105f.tar.gz
Merge remote-tracking branch 'leah/ltm-auto-resume'
21 files changed, 516 insertions, 92 deletions
diff --git a/fstests-bld/misc/syncfs.c b/fstests-bld/misc/syncfs.c
index 6bb8a9ae..60e52f24 100644
--- a/fstests-bld/misc/syncfs.c
+++ b/fstests-bld/misc/syncfs.c
@@ -1,5 +1,5 @@
 /*
- * syncfs.c -- issue
+ * syncfs.c -- issue syncfs on a file or directory
  */

 #define _GNU_SOURCE
@@ -16,14 +16,14 @@ const char *progname;

 static void usage(void)
 {
-	fprintf(stderr, "Usage: %s <file>\n");
+	fprintf(stderr, "Usage: %s <file>\n", progname);
 	exit(1);
 }

 int main(int argc, char **argv)
 {
 	int fd;
-
+	progname = argv[0];
 	if (argc != 2)
 		usage();
@@ -38,4 +38,3 @@ int main(int argc, char **argv)
 	}
 	return 0;
 }
-
diff --git a/run-fstests/gce-xfstests b/run-fstests/gce-xfstests
index d3732128..4373813a 100755
--- a/run-fstests/gce-xfstests
+++ b/run-fstests/gce-xfstests
@@ -3,6 +3,7 @@
 XFSTESTS_FLAVOR=gce
 RUN_ON_LTM=
 RUN_ON_KCS=
+GCE_IMAGE_PROJECT=
 t=$(echo ${XFSTESTS_FLAVOR}_xfstests_dir | tr "[:lower:]" "[:upper:]")
 eval DIR="\$$t"
 if test -z "$DIR"
@@ -232,6 +233,14 @@ case "$1" in
	    deldisks="--delete-disks all"
	fi
	shift
+
+	bg="&"
+	if test "$1" = "--wait"
+	then
+	    bg=
+	    shift
+	fi
+
	for i in "$@"
	do
	    if test -n "$deldisks"
@@ -244,8 +253,8 @@ case "$1" in
	    run_gcloud compute -q instances add-metadata "$i" \
		--metadata "shutdown_reason=$reason" \
		--zone "$zone" > /dev/null
-	    run_gcloud compute -q instances delete "$i" \
-		--zone "$zone" $deldisks &
+	    eval run_gcloud compute -q instances delete "$i" \
+		--zone "$zone" $deldisks $bg
	done
	exit 0
	;;
@@ -408,13 +417,20 @@ case "$1" in
	exit $?
	;;
    ssh)
+	# gce-xfstests ssh --user <user> <host> -- <cmd>
	user=root
	shift
	while (( $# >= 1 )); do
	    case $1 in
		--user|-u) shift
		    user="$1"
+		    echo "user=$user"
		    ;;
+		--) shift
+		    ssh_cmd="$@"
+		    CMD="--command="
+		    break
+		    ;;
		-*) echo "Unknown option $1"
		    exit 1
		    ;;
@@ -423,8 +439,17 @@ case "$1" in
	    esac
	    shift
	done
-	run_gcloud compute -q ssh $user@"$host" \
-	    --zone $(get_gce_zone "$host") $RUN_INTERNAL
+
+	# ssh_cmd must be quoted but passing "" when there is no command
+	# causes gcloud to complain (even if --command= lumped into ssh_cmd)
+	if test -n "$ssh_cmd"; then
+	    run_gcloud compute -q ssh $user@"$host" \
+		--zone $(get_gce_zone $host) $RUN_INTERNAL --command="$ssh_cmd"
+	    exit $?
+	fi
+
+	run_gcloud compute -q ssh $user@"$host" \
+	    --zone $(get_gce_zone $host) $RUN_INTERNAL
	exit $?
	;;
    scp)
@@ -1148,7 +1173,9 @@ fi
 if test -n "$RUN_ON_LTM"; then
     . "$DIR/util/gce-ltm-funcs"
-    send_to_ltm $ORIG_CMDLINE_B64
+    if ! send_to_ltm $ORIG_CMDLINE_B64; then
+	exit 1
+    fi
     exit 0
 elif test -n "$RUN_ON_KCS"; then
     if ! gsutil -q stat "gs://$GS_BUCKET/build_config" &> /dev/null
@@ -1158,7 +1185,9 @@ elif test -n "$RUN_ON_KCS"; then
	gsutil cp "$DIR/../kernel-build/kernel-configs/x86_64-config-5.4" "gs://$GS_BUCKET/build_config"
     fi
     . "$DIR/util/gce-kcs-funcs"
-    send_to_kcs $ORIG_CMDLINE_B64
+    if ! send_to_kcs $ORIG_CMDLINE_B64; then
+	exit 1
+    fi
     exit 0
 fi
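For context, a sketch of the two invocation forms this hunk adds; the instance name is illustrative, and the subcommand owning the delete loop is assumed to be the usual rm/abort arm:

    # run a one-off command on a test VM via the new "--" separator
    gce-xfstests ssh --user root xfstests-tytso-20230701 -- "uname -a"

    # block until instance deletion completes instead of backgrounding it
    gce-xfstests rm --wait xfstests-tytso-20230701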
grep "a.$GCE_PROJECT.gce-xfstests" "$DIR/.ltm_cookie_$GCE_PROJECT" &> /dev/null + then + echo "login attempt " >> /tmp/ltm-auto-resume.debug # just create a new login session and store it in the cookie ltm_post_json -c $DIR/.ltm_cookie_$GCE_PROJECT -d "{\"password\":\"$GCE_LTM_PWD\"}" \ "https://$LTM_HOSTNAME/login" @@ -97,6 +104,9 @@ function send_to_ltm() { if [ -n "$ARCH" ]; then LTM_OPTS="${LTM_OPTS:+$LTM_OPTS, }\"arch\":\"$ARCH\"" fi + if [ -n "$MONITOR_TIMEOUT" ]; then + LTM_OPTS="${LTM_OPTS:+$LTM_OPTS, }\"monitor_timeout\":\"$MONITOR_TIMEOUT\"" + fi if [ -n "$LTM_OPTS" ]; then LTM_OPTS="\"options\": {$LTM_OPTS}" fi diff --git a/run-fstests/util/get-config b/run-fstests/util/get-config index 310f6c9b..52c86ef2 100644 --- a/run-fstests/util/get-config +++ b/run-fstests/util/get-config @@ -26,8 +26,14 @@ export KBUILD_DIR="$(dirname $DIR)/kernel-build" # Source custom configs in ~/.config/ if present [ -f "$HOME/.config/xfstests-common" ] && . "$HOME/.config/xfstests-common" -[ -f "$HOME/.config/${XFSTESTS_FLAVOR}-xfstests" ] && \ - . "$HOME/.config/${XFSTESTS_FLAVOR}-xfstests" + +# If XFSTESTS_CONFIG is set, use that +# otherwise, look for config in default location ~/.config/ +if [ -n "$XFSTESTS_CONFIG" -a -f "$XFSTESTS_CONFIG" ]; then + . "$XFSTESTS_CONFIG" +elif [ -f "$HOME/.config/${XFSTESTS_FLAVOR}-xfstests" ]; then + . "$HOME/.config/${XFSTESTS_FLAVOR}-xfstests" +fi # For gce-xfstests, source the config for the active account if present if test "$XFSTESTS_FLAVOR" = "gce" -a -z "$GCE_ACCOUNT" -a \ diff --git a/run-fstests/util/parse_cli b/run-fstests/util/parse_cli index 7b66fb32..efad590e 100644 --- a/run-fstests/util/parse_cli +++ b/run-fstests/util/parse_cli @@ -87,6 +87,11 @@ print_help () echo " - Don't shard test VMs into other GCE zones" echo " --bucket-subdir - Use the next argument as a bucket subdir" fi + if flavor_in gce ; then + echo " --monitor-timeout time - LTM option to reboot test VM if no" + echo " status update after specified time. Accepted time" + echo " suffixes include \"h\", \"m\", \"s\"." 
diff --git a/run-fstests/util/parse_cli b/run-fstests/util/parse_cli
index 7b66fb32..efad590e 100644
--- a/run-fstests/util/parse_cli
+++ b/run-fstests/util/parse_cli
@@ -87,6 +87,11 @@ print_help ()
	echo "	- Don't shard test VMs into other GCE zones"
	echo "	--bucket-subdir - Use the next argument as a bucket subdir"
     fi
+    if flavor_in gce ; then
+	echo "	--monitor-timeout time - LTM option to reboot test VM if no"
+	echo "	  status update after specified time. Accepted time"
+	echo "	  suffixes include \"h\", \"m\", \"s\"."
+    fi
     echo ""
     echo "Common file system configurations are:"
     echo "	4k 1k ext3 nojournal ext3conv metacsum dioread_nolock "
@@ -119,7 +124,7 @@ validate_test_name()
     if test -z "$DO_BLKTESTS" ; then
	case "$1" in
	    btrfs*|ceph*|cifs*|ext4*|f2fs*|generic*|nfs*) ;;
-	    ocfs2*|overlay*|perf*|shared*|udf*|xfs*) ;;
+	    ocfs2*|overlay*|perf*|shared*|udf*|xfs*|selftest*) ;;
	    *)
		echo -e "Invalid xfstests test name: $1\n"
		print_help
@@ -269,6 +274,7 @@
 local-ssd-nvme
 log
 machtype:
 modules:
+monitor-timeout:
 nfssrv:
 note:
 no-action
@@ -759,6 +765,9 @@ while (( $# >= 1 )); do
	--skip-kernel-arch-probe)
	    SKIP_KERNEL_ARCH_PROBE=YES
	    ;;
+	--monitor-timeout) shift
+	    MONITOR_TIMEOUT="$1"
+	    ;;
	--) shift
	    break
diff --git a/test-appliance/files/root/runtests.sh b/test-appliance/files/root/runtests.sh
index c4ddb739..d2a0e6ef 100755
--- a/test-appliance/files/root/runtests.sh
+++ b/test-appliance/files/root/runtests.sh
@@ -36,6 +36,8 @@ function copy_xunit_results()
	fi
	rm "$RESULT"
     fi
+
+    /root/xfstests/bin/syncfs $RESULT_BASE
 }

 # check to see if a device is assigned to be used
@@ -303,6 +305,7 @@ else
 fi

 touch "$RESULTS/fstest-completed"
+rm -f /run/last_logged

 ./check --help > /tmp/check-help
 report_fmt=xunit
@@ -607,13 +610,33 @@ do
	    show_mount_opts
	fi
	gce_run_hooks fs-config-begin $TC
-	for j in $(seq 1 $RPT_COUNT) ; do
+	RPT_START=1
+	if test -f "$RESULT_BASE/rpt_status"; then
+	    RPT_START=$(cat "$RESULT_BASE/rpt_status" | sed 's:/.*::g')
+	fi
+	for j in $(seq $RPT_START $RPT_COUNT) ; do
+	    echo "$j/$RPT_COUNT" > "$RESULT_BASE/rpt_status"
+	    /root/xfstests/bin/syncfs "$RESULT_BASE"
	    gce_run_hooks pre-xfstests $TC $j
	    if test -n "$RUN_ONCE" ; then
		if test -f "$RESULT_BASE/completed"
		then
-		    head -n -2 "$RESULT_BASE/completed" > /tmp/completed
-		    mv /tmp/completed "$RESULT_BASE/completed"
+		    last_test="$(tail -n 1 "$RESULT_BASE/completed")"
+
+		    if test -f "$RESULT_BASE/results.xml"; then
+			add_error_xunit "$RESULT_BASE/results.xml" "$last_test" "xfstests.global"
+		    else
+			# if first test crashes, make sure results.xml gets
+			# setup correctly via copy_xunit_results
+			add_error_xunit "$RESULT_BASE/result.xml" "$last_test" "xfstests.global"
+			copy_xunit_results
+		    fi
+		    /root/xfstests/bin/syncfs $RESULT_BASE
+
+		    # this was part of the in-progress preemption work,
+		    # removing for now as it conflicts with the crash recovery stuff
+		    # head -n -2 "$RESULT_BASE/completed" > /tmp/completed
+		    # mv /tmp/completed "$RESULT_BASE/completed"
		else
		    touch "$RESULT_BASE/completed"
		fi
@@ -627,7 +650,7 @@ do
	    then
		echo ./check -R $report_fmt $fail_test_loop -T $EXTRA_OPT \
		    $AEX $TEST_SET_EXCLUDE $(cat /tmp/tests-to-run) \
-		    > "$RESULT_BASE/check-cmd"
+		    >> "$RESULT_BASE/check-cmd"
		bash ./check -R $report_fmt $fail_test_loop -T $EXTRA_OPT \
		    $AEX $TEST_SET_EXCLUDE $(cat /tmp/tests-to-run)
		copy_xunit_results
@@ -642,6 +665,7 @@ do
	    fi
	    rm -f "$RESULT_BASE/completed"
	done
+	rm -f "$RESULT_BASE/rpt_status"
	if test -n "$RUN_ON_GCE"
	then
	    gsutil cp "gs://$GS_BUCKET/check-time.tar.gz" /tmp >& /dev/null
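To make the crash-recovery bookkeeping concrete, a worked example of how runtests.sh resumes after a reboot (values are made up):

    $ cat "$RESULT_BASE/rpt_status"
    3/10
    $ sed 's:/.*::g' "$RESULT_BASE/rpt_status"    # becomes RPT_START
    3
    # the repeat loop then resumes with: for j in $(seq 3 10)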
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/diff_stats.py b/test-appliance/files/usr/lib/python3/dist-packages/diff_stats.py
new file mode 100644
index 00000000..7cd218ea
--- /dev/null
+++ b/test-appliance/files/usr/lib/python3/dist-packages/diff_stats.py
@@ -0,0 +1,105 @@
+#!/usr/bin/python3
+
+import argparse
+import sys
+from gen_results_summary import TestStats
+import xml.etree.ElementTree as ET
+from junitparser import JUnitXml, Property, Properties, Failure, Error, Skipped
+
+
+# s[cfg] = cfg_stats
+# cfg_stats[test] = TestStats()
+# consider s1 the baseline
+def diff_stats(s1, s2, threshold, output_file, input_file1, input_file2):
+    """Compare the statistics between two Stats, report regressions and unexpected results"""
+    print(f"Writing results to {output_file}")
+
+    skip_str=""
+    error_str=""
+    file = open(output_file, 'w')
+    file.write(f'Regression check {input_file1} -> {input_file2}:\n\n')
+    for cfg in s1.keys():
+        if cfg not in s2.keys():
+            file.write(f'***Warning: missing config {cfg} in {input_file2}***\n')
+
+    for cfg in s2.keys():
+        file.write(f'{cfg:-^45}\n')
+        if cfg not in s1.keys():
+            file.write(f'***Warning: missing config {cfg} in {input_file1}***\n')
+            continue
+        for test_name in s2[cfg]:
+            test = s2[cfg][test_name]
+            if test_name not in s1[cfg]:
+                file.write(f'***Warning: {cfg}:{test_name} run on {input_file2} but not on {input_file1}***\n')
+                continue
+            if test.failed > 0:
+                test_1 = s1[cfg][test_name]
+                fail_rate_1 = 100.0 * test_1.failed / test_1.total
+                fail_rate_2 = 100.0 * test.failed / test.total
+                if fail_rate_2 >= fail_rate_1 + threshold:
+                    file.write(f'{test_name}: {test_1.failed}/{test_1.total} ({fail_rate_1:.2f}%) -> {test.failed}/{test.total} ({fail_rate_2:.2f}%)\n')
+
+            test_1 = s1[cfg][test_name]
+            skip_rate_1 = 100.0 * test_1.skipped / test_1.total
+            skip_rate_2 = 100.0 * test.skipped / test.total
+            if skip_rate_1 != skip_rate_2:
+                skip_str += f'{cfg}:{test_name} skip rate changed {test_1.skipped}/{test_1.total} ({skip_rate_1:.2f}%) -> {test.skipped}/{test.total} ({skip_rate_2:.2f}%)\n'
+
+            if test.error > 0:
+                test_1 = s1[cfg][test_name]
+                error_rate_1 = 100.0 * test_1.error / test_1.total
+                error_rate_2 = 100.0 * test.error / test.total
+                # always print error stats
+                error_str += f'{cfg}:{test_name} ERROR {test_1.error}/{test_1.total} ({error_rate_1:.2f})% -> {test.error}/{test.total} ({error_rate_2:.2f}%)\n'
+        file.write('\n')
+
+    if len(error_str) > 0:
+        file.write('\n*** ERROR(S) occurred in new test set: ***\n')
+        file.write(error_str)
+
+    if len(skip_str) > 0:
+        file.write('\n*** WARNING: skip rate changed between test sets: ***\n')
+        file.write(skip_str)
+    file.close()
+
+
+def read_stats(input_file):
+    """Read test statistics from file"""
+    stats = {}
+    tree = ET.parse(input_file)
+    root = tree.getroot()
+
+    for cfg_element in root.findall('config'):
+        cfg = cfg_element.get('name')
+        if cfg not in stats:
+            stats[cfg] = {}
+        for test_element in cfg_element.findall('test'):
+            test = TestStats()
+
+            name = test_element.get('name')
+            test.failed = int(test_element.get('failed'))
+            test.skipped = int(test_element.get('skipped'))
+            test.error = int(test_element.get('error'))
+            test.total = int(test_element.get('total'))
+
+            stats[cfg][name] = test
+
+    return stats
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('stats_file1', help='First stats file (baseline)', type=str)
+    parser.add_argument('stats_file2', help='Second stats file (file to compare to baseline)', type=str)
+    parser.add_argument('--outfile', help='Diff output file', default="stats.diff", type=str)
+    parser.add_argument('--regression_threshold', help='Percent (int) increase needed in fail rate to determine regression', type=int, default=5)
+    args = parser.parse_args()
+
+    stats1 = read_stats(args.stats_file1)
+    stats2 = read_stats(args.stats_file2)
+
+    diff_stats(stats1, stats2, args.regression_threshold, args.outfile, args.stats_file1, args.stats_file2)
+
+
+if __name__ == "__main__":
+    main()
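Typical usage of the new script, per its argparse definitions (file names are illustrative):

    # flag tests whose failure rate grew by >= 5 percentage points
    python3 diff_stats.py baseline-stats.xml new-stats.xml \
        --outfile stats.diff --regression_threshold 5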
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/gen_results_summary.py b/test-appliance/files/usr/lib/python3/dist-packages/gen_results_summary.py
index 44fb07d2..fe37e64d 100644
--- a/test-appliance/files/usr/lib/python3/dist-packages/gen_results_summary.py
+++ b/test-appliance/files/usr/lib/python3/dist-packages/gen_results_summary.py
@@ -135,6 +135,35 @@ def sum_testsuites(testsuites):
         errors += testsuite.errors
     return (tests, skipped, failures, errors, runtime)

+def get_testsuite_stats(testsuite):
+    """Aggregate stats on individual tests"""
+    Stats = {}
+    for test_case in testsuite:
+        isFail = False
+        isSkipped = False
+        isError = False
+        for entry in test_case.result:
+            if isinstance(entry, Failure):
+                isFail = True
+            if isinstance(entry, Skipped):
+                isSkipped = True
+            if isinstance(entry, Error):
+                isError = True
+        if test_case.name in Stats:
+            s = Stats[test_case.name]
+        else:
+            s = TestStats()
+            Stats[test_case.name] = s
+        s.total += 1
+        if isFail:
+            s.failed += 1
+        if isSkipped:
+            s.skipped += 1
+        if isError:
+            s.error += 1
+
+    return Stats
+
 def print_summary(out_f, testsuite, verbose):
     """Print a summary for a particular test suite
@@ -179,30 +208,7 @@ def print_summary(out_f, testsuite, verbose):
             out_f.write("  %-12s %-8s %ds\n" %
                         (test_case.name, status, test_case.time))
     else:
-        Stats = {}
-        for test_case in testsuite:
-            isFail = False
-            isSkipped = False
-            isError = False
-            for entry in test_case.result:
-                if isinstance(entry, Failure):
-                    isFail = True
-                if isinstance(entry, Skipped):
-                    isSkipped = True
-                if isinstance(entry, Error):
-                    isError = True
-            if test_case.name in Stats:
-                s = Stats[test_case.name]
-            else:
-                s = TestStats()
-                Stats[test_case.name] = s
-            s.total += 1
-            if isFail:
-                s.failed += 1
-            if isSkipped:
-                s.skipped += 1
-            if isError:
-                s.error += 1
+        Stats = get_testsuite_stats(testsuite)

         wp = wrapped_print(out_f, 'Failures', ' ')
         for t in Stats:
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/get_stats.py b/test-appliance/files/usr/lib/python3/dist-packages/get_stats.py
new file mode 100644
index 00000000..4cd62815
--- /dev/null
+++ b/test-appliance/files/usr/lib/python3/dist-packages/get_stats.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3
+
+import argparse
+import sys
+from gen_results_summary import get_property, get_testsuite_stats, get_results
+from junitparser import JUnitXml, Property, Properties, Failure, Error, Skipped
+
+try:
+    from lxml import etree
+except ImportError:
+    from xml.etree import ElementTree as etree
+
+
+# reports is list of results from each xml file
+# stats[cfg] = cfg_stats
+# cfg_stats[test] = TestStats()
+def get_stats_from_dir(results_dir):
+    """From a results dir, return a list of reports and test statistics"""
+    reports = []
+    stats = {}
+    for filename in get_results(results_dir):
+        reports.append(JUnitXml.fromfile(filename))
+
+    if len(reports) == 0:
+        sys.stderr.write(f'Error: could not find any reports in {results_dir}')
+        return None
+
+    for testsuite in reports:
+        cfg = get_property(testsuite.properties(), 'TESTCFG') or get_property(testsuite.properties(), 'FSTESTCFG')
+        if cfg in stats:
+            sys.stderr.write(f'Found duplicate config {cfg}')
+            return None
+        stats[cfg] = get_testsuite_stats(testsuite)
+
+    return stats
+
+# writes all configs into single output file
+# condensing into entries of test->(failed, skipped, error, total)
+# this will let us store stats and easily merge from other runs
+# without having to reprocess everything
+def write_stats(s, output_file):
+    """Write the test statistics to a file"""
+    root = etree.Element("configs")
+    for cfg in s:
+        cfg_element = etree.SubElement(root, "config", name=cfg)
+        for test_name in s[cfg]:
+            test = s[cfg][test_name]
+            etree.SubElement(cfg_element, "test", name=test_name, failed=str(test.failed), skipped=str(test.skipped), error=str(test.error), total=str(test.total))
+
+    tree = etree.ElementTree(root)
+    etree.indent(tree, space="\t", level=0)
+    tree.write(output_file, encoding='utf-8')
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('results_dir', help='Results directory to process', type=str)
+    parser.add_argument('--outfile', help='Diff output file', default='./stats.xml', type=str)
+    args = parser.parse_args()
+
+    stats = get_stats_from_dir(args.results_dir)
+
+    if stats == None:
+        return -1
+
+    write_stats(stats, args.outfile)
+
+if __name__ == "__main__":
+    main()
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/junitparser/__init__.py b/test-appliance/files/usr/lib/python3/dist-packages/junitparser/__init__.py
index e3d8da0c..55b9ddbc 100644
--- a/test-appliance/files/usr/lib/python3/dist-packages/junitparser/__init__.py
+++ b/test-appliance/files/usr/lib/python3/dist-packages/junitparser/__init__.py
@@ -8,6 +8,7 @@ from .junitparser import (
     Skipped,
     Failure,
     Error,
+    Result,
     TestCase,
     Properties,
     IntAttr,
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/junitparser/junitparser.py b/test-appliance/files/usr/lib/python3/dist-packages/junitparser/junitparser.py
index eb38b298..b3bbd853 100644
--- a/test-appliance/files/usr/lib/python3/dist-packages/junitparser/junitparser.py
+++ b/test-appliance/files/usr/lib/python3/dist-packages/junitparser/junitparser.py
@@ -310,7 +310,11 @@ class JUnitXml(Element):
         if parse_func:
             tree = parse_func(filepath)
         else:
-            tree = etree.parse(filepath)  # nosec
+            try:
+                tree = etree.parse(filepath)  # nosec
+            except etree.XMLSyntaxError:
+                p = etree.XMLParser(huge_tree=True)
+                tree = etree.parse(filepath, parser=p)
         root_elem = tree.getroot()
         if root_elem.tag == "testsuites":
             instance = cls()
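Typical usage of get_stats.py (the results path is illustrative):

    # condense a directory of xUnit reports into one per-config stats file
    python3 get_stats.py /results/results-20230701 --outfile stats.xml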
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/merge_stats.py b/test-appliance/files/usr/lib/python3/dist-packages/merge_stats.py
new file mode 100644
index 00000000..a3148142
--- /dev/null
+++ b/test-appliance/files/usr/lib/python3/dist-packages/merge_stats.py
@@ -0,0 +1,45 @@
+#!/usr/bin/python3
+
+import argparse
+import sys
+import xml.etree.ElementTree as ET
+import get_stats
+import diff_stats
+from gen_results_summary import TestStats
+from junitparser import JUnitXml, Property, Properties, Failure, Error, Skipped
+
+
+def merge_stats(stats1, stats2):
+    """Merges stats2 into stats1"""
+    for cfg in stats2:
+        if cfg not in stats1:
+            stats1[cfg] = {}
+
+        for test_name in stats2[cfg]:
+            if test_name not in stats1[cfg]:
+                stats1[cfg][test_name] = TestStats()
+            stats1[cfg][test_name].failed += stats2[cfg][test_name].failed
+            stats1[cfg][test_name].skipped += stats2[cfg][test_name].skipped
+            stats1[cfg][test_name].error += stats2[cfg][test_name].error
+            stats1[cfg][test_name].total += stats2[cfg][test_name].total
+
+    return stats1
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('stats_file', help='First stats file', type=str)
+    parser.add_argument('stats_files_merge', nargs='+', help='List of stats files to merge', type=str)
+    parser.add_argument('--outfile', default='merged_stats.xml', help='Output xml file', type=str)
+    args = parser.parse_args()
+
+    stats = diff_stats.read_stats(args.stats_file)
+
+    for file in args.stats_files_merge:
+        stats_merge = diff_stats.read_stats(file)
+        stats = merge_stats(stats, stats_merge)
+
+    get_stats.write_stats(stats, args.outfile)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test-appliance/files/usr/local/bin/add_error_xunit b/test-appliance/files/usr/local/bin/add_error_xunit
new file mode 100755
index 00000000..0f12e983
--- /dev/null
+++ b/test-appliance/files/usr/local/bin/add_error_xunit
@@ -0,0 +1,48 @@
+#!/usr/bin/python3
+import argparse
+import os
+import sys
+from junitparser import JUnitXml, TestSuite, TestCase, Result, Error
+
+def get_test_suite(filename):
+    if not os.path.exists(filename):
+        ts = TestSuite()
+    else:
+        try:
+            ts = JUnitXml.fromfile(filename)
+        except IOError as e:
+            sys.exit("Couldn't open %s: %s" % (filename, e[1]))
+
+    if type(ts) != TestSuite:
+        sys.exit('%s is not a xUnit report file' % filename)
+    return ts
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('input_file', help='input xUnit result file')
+parser.add_argument('testname', help='name of test causing error')
+parser.add_argument('classname', help='classname for test case')
+args = parser.parse_args()
+
+ts = get_test_suite(args.input_file)
+
+result = Result()
+
+error = Error(result)
+error.message = 'Machine rebooted (crash or test timeout)'
+error.type = 'TestFail'
+
+tc = TestCase()
+tc.classname = args.classname
+tc.name = args.testname
+tc.time = 0
+tc.result = [error]
+
+# this also updates the statistics
+ts.add_testcase(tc)
+
+ts.write(args.input_file + '.new', pretty=True)
+if os.path.exists(args.input_file):
+    os.rename(args.input_file, args.input_file + '.error.bak')
+os.rename(args.input_file + '.new', args.input_file)
diff --git a/test-appliance/files/usr/local/lib/gce-add-metadata b/test-appliance/files/usr/local/lib/gce-add-metadata
index 672975b1..d77f50f1 100755
--- a/test-appliance/files/usr/local/lib/gce-add-metadata
+++ b/test-appliance/files/usr/local/lib/gce-add-metadata
@@ -7,4 +7,4 @@ then
 fi

 flock /run/xattr.lock gcloud compute instances -q add-metadata \
-    --zone $ZONE $(hostname) --metadata "$@" >& /dev/null
+    --zone $ZONE $(hostname) --metadata "^##^$@" >& /dev/null
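Sketches of how the two new tools above would be invoked (file and test names are illustrative):

    # fold several runs' stats into one file for later diffing
    python3 merge_stats.py stats-run1.xml stats-run2.xml --outfile merged_stats.xml

    # record a crashed test as an <error> entry in an xUnit report
    add_error_xunit results.xml generic/475 xfstests.global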
diff --git a/test-appliance/files/usr/local/lib/gce-logger b/test-appliance/files/usr/local/lib/gce-logger
index 68a1970e..4eadca31 100755
--- a/test-appliance/files/usr/local/lib/gce-logger
+++ b/test-appliance/files/usr/local/lib/gce-logger
@@ -6,10 +6,31 @@ then
     run_hooks logger "$*"
 fi

+is_test=
 status=$(echo "$*" | sed -e 's/^run xfstest //')
 if echo "$*" | grep -q "^run xfstest "
 then
+    is_test="y"
     echo "$status" >> $RESULT_BASE/completed
+
+    if test ! -f $RESULT_BASE/rpt_status -o \
+	    ! -f $RESULT_BASE/tests-to-run -o \
+	    ! -f $RESULT_BASE/completed
+    then
+	status="--% $status"
+    else
+	rpt_status=$(cat $RESULT_BASE/rpt_status)
+	current_rpt=${rpt_status%%/*}
+	total_rpt=${rpt_status##*/}
+	total_tests=$(cat $RESULT_BASE/tests-to-run | sort | uniq | wc -l)
+	count_completed=$(cat $RESULT_BASE/completed | sort | uniq | wc -l)
+
+	progress=$(( ( ( $current_rpt - 1 ) * $total_tests + $count_completed ) * 100 / ( $total_tests * $total_rpt ) ))
+	status="$progress% $status"
+    fi
+
+    # sync completed file
+    /root/xfstests/bin/syncfs $RESULT_BASE
 fi

 if test -f /run/fstest-config
@@ -18,7 +39,11 @@ then
     status="$cfg $status"
 fi

-if test -z "$(find /run/last_logged -mmin -1 -print 2> /dev/null)"
+# force first test to upload its test status and wait for it to finish
+if test -n "$is_test" -a ! -s /run/last_logged; then
+    /usr/local/lib/gce-add-metadata "status=$(date +%H:%M) $status"
+    echo "Started testing" > /run/last_logged
+elif test -z "$(find /run/last_logged -mmin -1 -print 2> /dev/null)"
 then
     /usr/local/lib/gce-add-metadata "status=$(date +%H:%M) $status" &
     touch /run/last_logged
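A worked example of the progress calculation above, with made-up numbers: on repetition 2 of 3, with 100 distinct tests and 50 completed so far,

    progress=$(( ( (2 - 1) * 100 + 50 ) * 100 / ( 100 * 3 ) ))    # 50
    # the reported status then reads e.g. "50% generic/001"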
diff --git a/test-appliance/files/usr/local/lib/gce-server/ltm/shard.go b/test-appliance/files/usr/local/lib/gce-server/ltm/shard.go
index 25af29cd..bef49236 100644
--- a/test-appliance/files/usr/local/lib/gce-server/ltm/shard.go
+++ b/test-appliance/files/usr/local/lib/gce-server/ltm/shard.go
@@ -38,6 +38,7 @@ type ShardWorker struct {
	vmStatus    string
	vmtestStart time.Time
	testResult  server.ResultType
+	vmReset     bool

	log     *logrus.Entry
	logPath string
@@ -48,9 +49,9 @@ type ShardWorker struct {
 }

 const (
-	monitorTimeout  = 1 * time.Hour
-	noStatusTimeout = 5 * time.Minute
+	noStatusTimeout = 10 * time.Minute
	monitorInterval = 60 * time.Second
+	resetTimeout    = 10 * time.Minute
	gsInterval      = 10 * time.Second
	maxAttempts     = 5
 )
@@ -70,6 +71,7 @@ func NewShardWorker(sharder *ShardScheduler, shardID string, config string, zone
		vmStatus:    "waiting for launch",
		vmtestStart: time.Now(),
		testResult:  server.DefaultResult,
+		vmReset:     false,

		log:     sharder.log.WithField("shardID", shardID),
		logPath: logPath,
@@ -91,23 +93,25 @@ func NewShardWorker(sharder *ShardScheduler, shardID string, config string, zone
		"--no-email",
		"-c", config,
	}
+
	if sharder.arch != "" {
		shard.args = append(shard.args, "--arch", sharder.arch)
	}
-	shard.args = append(shard.args, sharder.validArgs...)

-	var defaultProj bool = true
-	for _, arg := range shard.args {
+	var imgProjFlag bool = false
+	for _, arg := range sharder.validArgs {
		if arg == "--image-project" {
-			defaultProj = false
+			imgProjFlag = true
			break
		}
	}
-	if defaultProj {
-		shard.args = append(shard.args, "--image-project", sharder.projID)
+	if !imgProjFlag && len(sharder.imgProjID) > 0 {
+		shard.args = append(shard.args, "--image-project", sharder.imgProjID)
	}

+	shard.args = append(shard.args, sharder.validArgs...)
+
	return &shard
 }
@@ -185,6 +189,7 @@ func (shard *ShardWorker) monitor() {
			if *metaData.Value != shard.vmStatus {
				shard.vmStatus = *metaData.Value
				shard.vmtestStart = time.Now()
+				shard.vmReset = false
				break
			}
		}
@@ -203,20 +208,30 @@ func (shard *ShardWorker) monitor() {
			log.Debug("waiting to get test status metadata")
		}

-		if time.Since(shard.vmtestStart) > monitorTimeout {
-			if !shard.sharder.keepDeadVM {
-				shard.shutdownOnTimeout(instanceInfo.Metadata)
-			}
-			shard.vmStatus = "timeout on one test"
-			shard.testResult = server.Hang
-			log.WithFields(logrus.Fields{
-				"status": shard.vmStatus,
-				"start":  shard.vmtestStart.Format(time.Stamp),
-			}).Errorf("Instance seems to have wedged, no status update for %s", monitorTimeout.Round(time.Minute))
+		if shard.vmReset && time.Since(shard.vmtestStart) > resetTimeout {
+			log.Errorf("VM did not come back online after reset, exiting")
			return
		}

+		// Reset VM if we don't get a status update
+		// Skip check if we are already performing a reset
+		// Selftests may limit monitorTimeout to shorter than noStatusTimeout
+		// so skip check if we are still launching
+		if time.Since(shard.vmtestStart) > shard.sharder.monitorTimeout &&
+			!shard.vmReset && shard.vmStatus != "launching" {
+			log.Debug("Resetting VM")
+			err := shard.sharder.gce.ResetVM(shard.sharder.projID, shard.zone, shard.name)
+			if err != nil {
+				log.Errorf("Failed to reset %s", shard.name)
+				shard.vmStatus = "failed to reset after timeout"
+				shard.testResult = server.Error
+				return
+			}
+			shard.vmReset = true
+			shard.vmtestStart = time.Now()
+		}
+
		log.WithFields(logrus.Fields{
			"status": shard.vmStatus,
			"start":  shard.vmtestStart.Format(time.Stamp),
@@ -283,8 +298,7 @@ func (shard *ShardWorker) shutdownOnTimeout(metadata *compute.Metadata) {

 /*
 finish calls gce-xfstests scripts to fetch and unpack test result files.
-It deletes the results in gs bucket and local serial port output.
-It also determines testResult:
+It deletes the results in gs bucket and determines testResult:

	Default	VM finishes without issues, test result is found;
	Crash	VM started running tests but no test result is found;
@@ -321,11 +335,6 @@ func (shard *ShardWorker) finish() {
		shard.log.Panic("Failed to find unpacked result files")
	}

-	if check.FileExists(shard.serialOutputPath) && !shard.vmTimeout {
-		err = os.Remove(shard.serialOutputPath)
-		check.NoError(err, shard.log, "Failed to remove dir")
-	}
-
	prefix := fmt.Sprintf("%s/results.%s", shard.sharder.bucketSubdir, shard.resultsName)
	_, err = shard.sharder.gce.DeleteFiles(prefix)
	check.NoError(err, shard.log, "Failed to delete file")
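The new ResetVM helper (added to gcp.go later in this patch) issues the Compute Engine instances.reset call; the manual equivalent would be something like (instance and zone names illustrative):

    gcloud compute instances reset xfstests-ltm-shard-abc --zone us-central1-c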
diff --git a/test-appliance/files/usr/local/lib/gce-server/ltm/sharder.go b/test-appliance/files/usr/local/lib/gce-server/ltm/sharder.go
index 957efa98..95d1f9e6 100644
--- a/test-appliance/files/usr/local/lib/gce-server/ltm/sharder.go
+++ b/test-appliance/files/usr/local/lib/gce-server/ltm/sharder.go
@@ -23,6 +23,7 @@ import (
	"sort"
	"strings"
	"sync"
+	"time"

	"thunk.org/gce-server/util/check"
	"thunk.org/gce-server/util/email"
@@ -36,12 +37,14 @@ import (
 )

 const genResultsSummaryPath = "/usr/local/bin/gen_results_summary"
+const defaultMonitorTimeout = 1 * time.Hour

 // ShardScheduler schedules tests and aggregates reports.
 type ShardScheduler struct {
-	testID  string
-	projID  string
-	origCmd string
+	testID    string
+	projID    string
+	imgProjID string
+	origCmd   string

	zone   string
	region string
@@ -53,6 +56,7 @@ type ShardScheduler struct {
	reportReceiver string
	maxShards      int
	keepDeadVM     bool
+	monitorTimeout time.Duration
	reportKCS      bool

	testRequest server.TaskRequest
@@ -101,6 +105,9 @@ func NewShardScheduler(c server.TaskRequest, testID string) *ShardScheduler {
	projID, err := gcp.GceConfig.Get("GCE_PROJECT")
	check.Panic(err, log, "Failed to get project config")

+	imgProjID, err := gcp.GceConfig.Get("GCE_IMAGE_PROJECT")
+	check.Panic(err, log, "Failed to get image project")
+
	gsBucket, err := gcp.GceConfig.Get("GS_BUCKET")
	check.Panic(err, log, "Failed to get gs bucket config")

@@ -108,9 +115,10 @@ func NewShardScheduler(c server.TaskRequest, testID string) *ShardScheduler {
	log.Info("Initiating test sharder")

	sharder := ShardScheduler{
-		testID:  testID,
-		projID:  projID,
-		origCmd: origCmd,
+		testID:    testID,
+		projID:    projID,
+		imgProjID: imgProjID,
+		origCmd:   origCmd,

		zone:   zone,
		region: region,
@@ -122,6 +130,7 @@ func NewShardScheduler(c server.TaskRequest, testID string) *ShardScheduler {
		reportReceiver: c.Options.ReportEmail,
		maxShards:      0,
		keepDeadVM:     false,
+		monitorTimeout: defaultMonitorTimeout,
		reportKCS:      false,

		testRequest: c,
@@ -143,6 +152,15 @@ func NewShardScheduler(c server.TaskRequest, testID string) *ShardScheduler {
	if sharder.bucketSubdir == "" {
		sharder.bucketSubdir = "results"
	}
+	if c.Options.MonitorTimeout != "" {
+		sharder.monitorTimeout, err = time.ParseDuration(c.Options.MonitorTimeout)
+		if err != nil {
+			sharder.monitorTimeout = defaultMonitorTimeout
+			sharder.log.WithField("MonitorTimeout", c.Options.MonitorTimeout).Error("Unable to parse --monitor-timeout option, using default value")
+		} else {
+			sharder.log.WithField("MonitorTimeout", sharder.monitorTimeout).Info("Parsed monitor timeout argument")
+		}
+	}

	sharder.validArgs, sharder.configs, err = getConfigs(sharder.origCmd)
	check.Panic(err, log, "Failed to parse config from origCmd")
@@ -368,6 +386,7 @@ func (sharder *ShardScheduler) aggResults() {
			"unpackedResultsDir": shard.unpackedResultsDir,
		})
		log.Debug("Moving shard result files into aggregate folder")
+		shardHasResults := false

		if check.DirExists(shard.unpackedResultsDir) {
			err := os.RemoveAll(sharder.aggDir + shard.shardID)
@@ -376,16 +395,22 @@ func (sharder *ShardScheduler) aggResults() {
			err = os.Rename(shard.unpackedResultsDir, sharder.aggDir+shard.shardID)
			check.Panic(err, log, "Failed to move dir")

+			shardHasResults = true
			hasResults = true
-		} else if check.FileExists(shard.serialOutputPath) {
+		}
+
+		if check.FileExists(shard.serialOutputPath) {
			err := os.RemoveAll(sharder.aggDir + shard.shardID + ".serial")
			check.Panic(err, log, "Failed to remove dir")

			err = os.Rename(shard.serialOutputPath, sharder.aggDir+shard.shardID+".serial")
			check.Panic(err, log, "Failed to move dir")

+			shardHasResults = true
			hasResults = true
-		} else {
+		}
+
+		if !shardHasResults {
			log.Warn("Shard has no results available")
		}
	}
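Because the value is parsed with Go's time.ParseDuration, --monitor-timeout accepts strings such as "30m", "90s", or "1h30m". A plausible invocation (config and test group are illustrative):

    gce-xfstests ltm -c ext4/4k -g auto --monitor-timeout 30m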
diff --git a/test-appliance/files/usr/local/lib/gce-server/util/gcp/gcp.go b/test-appliance/files/usr/local/lib/gce-server/util/gcp/gcp.go
index 65480d6d..da8d7401 100644
--- a/test-appliance/files/usr/local/lib/gce-server/util/gcp/gcp.go
+++ b/test-appliance/files/usr/local/lib/gce-server/util/gcp/gcp.go
@@ -286,3 +286,10 @@ func NotFound(err error) bool {
	}
	return false
 }
+
+func (gce *Service) ResetVM(project string, zone string, instance string) error {
+	instancesService := compute.NewInstancesService(gce.service)
+	call := instancesService.Reset(project, zone, instance)
+	_, err := call.Do()
+	return err
+}
diff --git a/test-appliance/files/usr/local/lib/gce-server/util/parser/parser.go b/test-appliance/files/usr/local/lib/gce-server/util/parser/parser.go
index 81ff5fa0..045ab86c 100644
--- a/test-appliance/files/usr/local/lib/gce-server/util/parser/parser.go
+++ b/test-appliance/files/usr/local/lib/gce-server/util/parser/parser.go
@@ -40,6 +40,7 @@ var invalidOpts = []string{
	"--watch",
	"--bisect-good",
	"--bisect-bad",
+	"--monitor-timeout",
 }

 /*
diff --git a/test-appliance/files/usr/local/lib/gce-server/util/server/server.go b/test-appliance/files/usr/local/lib/gce-server/util/server/server.go
index 4c9122e8..d7e3690e 100644
--- a/test-appliance/files/usr/local/lib/gce-server/util/server/server.go
+++ b/test-appliance/files/usr/local/lib/gce-server/util/server/server.go
@@ -116,20 +116,21 @@ const (

 // UserOptions contains configs user sends to LTM or KCS.
 type UserOptions struct {
-	NoRegionShard bool   `json:"no_region_shard"`
-	BucketSubdir  string `json:"bucket_subdir"`
-	GsKernel      string `json:"gs_kernel"`
-	ReportEmail   string `json:"report_email"`
-	CommitID      string `json:"commit_id"`
-	GitRepo       string `json:"git_repo"`
-	BranchName    string `json:"branch_name"`
-	UnWatch       string `json:"unwatch"`
-	BadCommit     string `json:"bad_commit"`
-	GoodCommit    string `json:"good_commit"`
-	KConfig       string `json:"kconfig"`
-	KConfigOpts   string `json:"kconfig_opts"`
-	KbuildOpts    string `json:"kbuild_opts"`
-	Arch          string `json:"arch"`
+	NoRegionShard  bool   `json:"no_region_shard"`
+	BucketSubdir   string `json:"bucket_subdir"`
+	GsKernel       string `json:"gs_kernel"`
+	ReportEmail    string `json:"report_email"`
+	CommitID       string `json:"commit_id"`
+	GitRepo        string `json:"git_repo"`
+	BranchName     string `json:"branch_name"`
+	UnWatch        string `json:"unwatch"`
+	BadCommit      string `json:"bad_commit"`
+	GoodCommit     string `json:"good_commit"`
+	KConfig        string `json:"kconfig"`
+	KConfigOpts    string `json:"kconfig_opts"`
+	KbuildOpts     string `json:"kbuild_opts"`
+	Arch           string `json:"arch"`
+	MonitorTimeout string `json:"monitor_timeout"`
 }

 // InternalOptions contains configs used by LTM and KCS internally.
diff --git a/test-appliance/gce-xfstests-bld.sh b/test-appliance/gce-xfstests-bld.sh
index 4c54f8ce..2c054d0f 100644
--- a/test-appliance/gce-xfstests-bld.sh
+++ b/test-appliance/gce-xfstests-bld.sh
@@ -372,6 +372,8 @@ sed -i -e '/ExecStart/s/agetty/agetty -a root/' \
     -e 's/After=rc.local.service/After=network.target/' \
     /etc/systemd/system/telnet-getty@.service

+echo "kernel.panic=60" >> /etc/sysctl.conf
+
 systemctl enable kvm-xfstests.service
 systemctl enable gce-fetch-gs-files.service
 systemctl enable gce-finalize-wait.service
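The kernel.panic=60 sysctl makes a crashed appliance reboot itself after a minute, which, together with the LTM reset logic and the rpt_status resume in runtests.sh, is what lets a run survive panics and hangs. A quick check from inside the appliance:

    # verify the panic auto-reboot window is in effect
    sysctl kernel.panic    # expected output: kernel.panic = 60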