author     Theodore Ts'o <tytso@mit.edu>    2023-07-01 00:51:06 -0400
committer  Theodore Ts'o <tytso@mit.edu>    2023-07-01 00:51:06 -0400
commit     df7a7d1c5d784d9b1ef539b756357cf10d4f105f (patch)
tree       f9741ec2ed69b75a2986a7833ba3fcac5ca3946c
parent     0f389da9f980e0bb51a45c7d464bf2b0edc09fdd (diff)
parent     9e67ddbbede1eb9fc766fc2c2e642a890a4db915 (diff)
download   xfstests-bld-df7a7d1c5d784d9b1ef539b756357cf10d4f105f.tar.gz
Merge remote-tracking branch 'leah/ltm-auto-resume'
21 files changed, 516 insertions, 92 deletions
diff --git a/fstests-bld/misc/syncfs.c b/fstests-bld/misc/syncfs.c
index 6bb8a9ae..60e52f24 100644
--- a/fstests-bld/misc/syncfs.c
+++ b/fstests-bld/misc/syncfs.c
@@ -1,5 +1,5 @@
 /*
- * syncfs.c -- issue
+ * syncfs.c -- issue syncfs on a file or directory
  */

 #define _GNU_SOURCE
@@ -16,14 +16,14 @@ const char *progname;

 static void usage(void)
 {
-	fprintf(stderr, "Usage: %s <file>\n");
+	fprintf(stderr, "Usage: %s <file>\n", progname);
 	exit(1);
 }

 int main(int argc, char **argv)
 {
 	int fd;
-
+	progname = argv[0];
 	if (argc != 2)
 		usage();
@@ -38,4 +38,3 @@ int main(int argc, char **argv)
 	}
 	return 0;
 }
-
diff --git a/run-fstests/gce-xfstests b/run-fstests/gce-xfstests
index d3732128..4373813a 100755
--- a/run-fstests/gce-xfstests
+++ b/run-fstests/gce-xfstests
@@ -3,6 +3,7 @@
 XFSTESTS_FLAVOR=gce
 RUN_ON_LTM=
 RUN_ON_KCS=
+GCE_IMAGE_PROJECT=
 t=$(echo ${XFSTESTS_FLAVOR}_xfstests_dir | tr "[:lower:]" "[:upper:]")
 eval DIR="\$$t"
 if test -z "$DIR"
@@ -232,6 +233,14 @@ case "$1" in
	    deldisks="--delete-disks all"
	fi
	shift
+
+	bg="&"
+	if test "$1" = "--wait"
+	then
+	    bg=
+	    shift
+	fi
+
	for i in "$@"
	do
	    if test -n "$deldisks"
@@ -244,8 +253,8 @@ case "$1" in
	    run_gcloud compute -q instances add-metadata "$i" \
		--metadata "shutdown_reason=$reason" \
		--zone "$zone" > /dev/null
-	    run_gcloud compute -q instances delete "$i" \
-		--zone "$zone" $deldisks &
+	    eval run_gcloud compute -q instances delete "$i" \
+		--zone "$zone" $deldisks $bg
	done
	exit 0
	;;
@@ -408,13 +417,20 @@ case "$1" in
	exit $?
	;;
    ssh)
+	# gce-xfstests ssh --user <user> <host> -- <cmd>
	user=root
	shift
	while (( $# >= 1 )); do
	    case $1 in
		--user|-u) shift
		    user="$1"
+		    echo "user=$user"
		    ;;
+		--) shift
+		    ssh_cmd="$@"
+		    CMD="--command="
+		    break
+		    ;;
		-*) echo "Unknown option $1"
		    exit 1
		    ;;
@@ -423,8 +439,17 @@ case "$1" in
	    esac
	    shift
	done
-	run_gcloud compute -q ssh $user@"$host" \
-	    --zone $(get_gce_zone "$host") $RUN_INTERNAL
+
+	# ssh_cmd must be quoted but passing "" when there is no command
+	# causes gcloud to complain (even if --command= lumped into ssh_cmd)
+	if test -n "$ssh_cmd"; then
+	    run_gcloud compute -q ssh $user@"$host" \
+		--zone $(get_gce_zone $host) $RUN_INTERNAL --command="$ssh_cmd"
+	    exit $?
+	fi
+
+	run_gcloud compute -q ssh $user@"$host" \
+	    --zone $(get_gce_zone $host) $RUN_INTERNAL
	exit $?
	;;
    scp)
@@ -1148,7 +1173,9 @@ fi
 if test -n "$RUN_ON_LTM"; then
     . "$DIR/util/gce-ltm-funcs"
-    send_to_ltm $ORIG_CMDLINE_B64
+    if ! send_to_ltm $ORIG_CMDLINE_B64; then
+	exit 1
+    fi
     exit 0
 elif test -n "$RUN_ON_KCS"; then
     if ! gsutil -q stat "gs://$GS_BUCKET/build_config" &> /dev/null
@@ -1158,7 +1185,9 @@ elif test -n "$RUN_ON_KCS"; then
	gsutil cp "$DIR/../kernel-build/kernel-configs/x86_64-config-5.4" "gs://$GS_BUCKET/build_config"
     fi
     . "$DIR/util/gce-kcs-funcs"
-    send_to_kcs $ORIG_CMDLINE_B64
+    if ! send_to_kcs $ORIG_CMDLINE_B64; then
+	exit 1
+    fi
     exit 0
 fi
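For context, a sketch of the two invocation forms this hunk adds; the instance name is illustrative, and the subcommand owning the delete loop is assumed to be the usual rm/abort arm:

    # run a one-off command on a test VM via the new "--" separator
    gce-xfstests ssh --user root xfstests-tytso-20230701 -- "uname -a"

    # block until instance deletion completes instead of backgrounding it
    gce-xfstests rm --wait xfstests-tytso-20230701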
grep "a.$GCE_PROJECT.gce-xfstests" "$DIR/.ltm_cookie_$GCE_PROJECT" &> /dev/null + then + echo "login attempt " >> /tmp/ltm-auto-resume.debug # just create a new login session and store it in the cookie ltm_post_json -c $DIR/.ltm_cookie_$GCE_PROJECT -d "{\"password\":\"$GCE_LTM_PWD\"}" \ "https://$LTM_HOSTNAME/login" @@ -97,6 +104,9 @@ function send_to_ltm() { if [ -n "$ARCH" ]; then LTM_OPTS="${LTM_OPTS:+$LTM_OPTS, }\"arch\":\"$ARCH\"" fi + if [ -n "$MONITOR_TIMEOUT" ]; then + LTM_OPTS="${LTM_OPTS:+$LTM_OPTS, }\"monitor_timeout\":\"$MONITOR_TIMEOUT\"" + fi if [ -n "$LTM_OPTS" ]; then LTM_OPTS="\"options\": {$LTM_OPTS}" fi diff --git a/run-fstests/util/get-config b/run-fstests/util/get-config index 310f6c9b..52c86ef2 100644 --- a/run-fstests/util/get-config +++ b/run-fstests/util/get-config @@ -26,8 +26,14 @@ export KBUILD_DIR="$(dirname $DIR)/kernel-build" # Source custom configs in ~/.config/ if present [ -f "$HOME/.config/xfstests-common" ] && . "$HOME/.config/xfstests-common" -[ -f "$HOME/.config/${XFSTESTS_FLAVOR}-xfstests" ] && \ - . "$HOME/.config/${XFSTESTS_FLAVOR}-xfstests" + +# If XFSTESTS_CONFIG is set, use that +# otherwise, look for config in default location ~/.config/ +if [ -n "$XFSTESTS_CONFIG" -a -f "$XFSTESTS_CONFIG" ]; then + . "$XFSTESTS_CONFIG" +elif [ -f "$HOME/.config/${XFSTESTS_FLAVOR}-xfstests" ]; then + . "$HOME/.config/${XFSTESTS_FLAVOR}-xfstests" +fi # For gce-xfstests, source the config for the active account if present if test "$XFSTESTS_FLAVOR" = "gce" -a -z "$GCE_ACCOUNT" -a \ diff --git a/run-fstests/util/parse_cli b/run-fstests/util/parse_cli index 7b66fb32..efad590e 100644 --- a/run-fstests/util/parse_cli +++ b/run-fstests/util/parse_cli @@ -87,6 +87,11 @@ print_help () echo " - Don't shard test VMs into other GCE zones" echo " --bucket-subdir - Use the next argument as a bucket subdir" fi + if flavor_in gce ; then + echo " --monitor-timeout time - LTM option to reboot test VM if no" + echo " status update after specified time. Accepted time" + echo " suffixes include \"h\", \"m\", \"s\"." 
diff --git a/run-fstests/util/parse_cli b/run-fstests/util/parse_cli
index 7b66fb32..efad590e 100644
--- a/run-fstests/util/parse_cli
+++ b/run-fstests/util/parse_cli
@@ -87,6 +87,11 @@ print_help ()
	echo "	- Don't shard test VMs into other GCE zones"
	echo "	--bucket-subdir - Use the next argument as a bucket subdir"
     fi
+    if flavor_in gce ; then
+	echo "	--monitor-timeout time - LTM option to reboot test VM if no"
+	echo "	  status update after specified time. Accepted time"
+	echo "	  suffixes include \"h\", \"m\", \"s\"."
+    fi
     echo ""
     echo "Common file system configurations are:"
     echo "	4k 1k ext3 nojournal ext3conv metacsum dioread_nolock "
@@ -119,7 +124,7 @@ validate_test_name()
     if test -z "$DO_BLKTESTS" ; then
	case "$1" in
	    btrfs*|ceph*|cifs*|ext4*|f2fs*|generic*|nfs*) ;;
-	    ocfs2*|overlay*|perf*|shared*|udf*|xfs*) ;;
+	    ocfs2*|overlay*|perf*|shared*|udf*|xfs*|selftest*) ;;
	    *)
		echo -e "Invalid xfstests test name: $1\n"
		print_help
@@ -269,6 +274,7 @@
 local-ssd-nvme
 log
 machtype:
 modules:
+monitor-timeout:
 nfssrv:
 note:
 no-action
@@ -759,6 +765,9 @@ while (( $# >= 1 )); do
	--skip-kernel-arch-probe)
	    SKIP_KERNEL_ARCH_PROBE=YES
	    ;;
+	--monitor-timeout) shift
+	    MONITOR_TIMEOUT="$1"
+	    ;;
	--) shift
	    break
diff --git a/test-appliance/files/root/runtests.sh b/test-appliance/files/root/runtests.sh
index c4ddb739..d2a0e6ef 100755
--- a/test-appliance/files/root/runtests.sh
+++ b/test-appliance/files/root/runtests.sh
@@ -36,6 +36,8 @@ function copy_xunit_results()
	fi
	rm "$RESULT"
     fi
+
+    /root/xfstests/bin/syncfs $RESULT_BASE
 }

 # check to see if a device is assigned to be used
@@ -303,6 +305,7 @@ else
 fi

 touch "$RESULTS/fstest-completed"
+rm -f /run/last_logged

 ./check --help > /tmp/check-help
 report_fmt=xunit
@@ -607,13 +610,33 @@ do
	    show_mount_opts
	fi
	gce_run_hooks fs-config-begin $TC
-	for j in $(seq 1 $RPT_COUNT) ; do
+	RPT_START=1
+	if test -f "$RESULT_BASE/rpt_status"; then
+	    RPT_START=$(cat "$RESULT_BASE/rpt_status" | sed 's:/.*::g')
+	fi
+	for j in $(seq $RPT_START $RPT_COUNT) ; do
+	    echo "$j/$RPT_COUNT" > "$RESULT_BASE/rpt_status"
+	    /root/xfstests/bin/syncfs "$RESULT_BASE"
	    gce_run_hooks pre-xfstests $TC $j
	    if test -n "$RUN_ONCE" ; then
		if test -f "$RESULT_BASE/completed"
		then
-		    head -n -2 "$RESULT_BASE/completed" > /tmp/completed
-		    mv /tmp/completed "$RESULT_BASE/completed"
+		    last_test="$(tail -n 1 "$RESULT_BASE/completed")"
+
+		    if test -f "$RESULT_BASE/results.xml"; then
+			add_error_xunit "$RESULT_BASE/results.xml" "$last_test" "xfstests.global"
+		    else
+			# if first test crashes, make sure results.xml gets
+			# setup correctly via copy_xunit_results
+			add_error_xunit "$RESULT_BASE/result.xml" "$last_test" "xfstests.global"
+			copy_xunit_results
+		    fi
+		    /root/xfstests/bin/syncfs $RESULT_BASE
+
+		    # this was part of the in-progress preemption work,
+		    # removing for now as it conflicts with the crash recovery stuff
+		    # head -n -2 "$RESULT_BASE/completed" > /tmp/completed
+		    # mv /tmp/completed "$RESULT_BASE/completed"
		else
		    touch "$RESULT_BASE/completed"
		fi
@@ -627,7 +650,7 @@ do
	    then
		echo ./check -R $report_fmt $fail_test_loop -T $EXTRA_OPT \
		    $AEX $TEST_SET_EXCLUDE $(cat /tmp/tests-to-run) \
-		    > "$RESULT_BASE/check-cmd"
+		    >> "$RESULT_BASE/check-cmd"
		bash ./check -R $report_fmt $fail_test_loop -T $EXTRA_OPT \
		    $AEX $TEST_SET_EXCLUDE $(cat /tmp/tests-to-run)
		copy_xunit_results
@@ -642,6 +665,7 @@ do
	    fi
	    rm -f "$RESULT_BASE/completed"
	done
+	rm -f "$RESULT_BASE/rpt_status"
	if test -n "$RUN_ON_GCE"
	then
	    gsutil cp "gs://$GS_BUCKET/check-time.tar.gz" /tmp >& /dev/null
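To make the crash-recovery bookkeeping concrete, a worked example of how runtests.sh resumes after a reboot (values are made up):

    $ cat "$RESULT_BASE/rpt_status"
    3/10
    $ sed 's:/.*::g' "$RESULT_BASE/rpt_status"    # becomes RPT_START
    3
    # the repeat loop then resumes with: for j in $(seq 3 10)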
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/diff_stats.py b/test-appliance/files/usr/lib/python3/dist-packages/diff_stats.py
new file mode 100644
index 00000000..7cd218ea
--- /dev/null
+++ b/test-appliance/files/usr/lib/python3/dist-packages/diff_stats.py
@@ -0,0 +1,105 @@
+#!/usr/bin/python3
+
+import argparse
+import sys
+from gen_results_summary import TestStats
+import xml.etree.ElementTree as ET
+from junitparser import JUnitXml, Property, Properties, Failure, Error, Skipped
+
+
+# s[cfg] = cfg_stats
+# cfg_stats[test] = TestStats()
+# consider s1 the baseline
+def diff_stats(s1, s2, threshold, output_file, input_file1, input_file2):
+    """Compare the statistics between two Stats, report regressions and unexpected results"""
+    print(f"Writing results to {output_file}")
+
+    skip_str=""
+    error_str=""
+    file = open(output_file, 'w')
+    file.write(f'Regression check {input_file1} -> {input_file2}:\n\n')
+    for cfg in s1.keys():
+        if cfg not in s2.keys():
+            file.write(f'***Warning: missing config {cfg} in {input_file2}***\n')
+
+    for cfg in s2.keys():
+        file.write(f'{cfg:-^45}\n')
+        if cfg not in s1.keys():
+            file.write(f'***Warning: missing config {cfg} in {input_file1}***\n')
+            continue
+        for test_name in s2[cfg]:
+            test = s2[cfg][test_name]
+            if test_name not in s1[cfg]:
+                file.write(f'***Warning: {cfg}:{test_name} run on {input_file2} but not on {input_file1}***\n')
+                continue
+            if test.failed > 0:
+                test_1 = s1[cfg][test_name]
+                fail_rate_1 = 100.0 * test_1.failed / test_1.total
+                fail_rate_2 = 100.0 * test.failed / test.total
+                if fail_rate_2 >= fail_rate_1 + threshold:
+                    file.write(f'{test_name}: {test_1.failed}/{test_1.total} ({fail_rate_1:.2f}%) -> {test.failed}/{test.total} ({fail_rate_2:.2f}%)\n')
+
+            test_1 = s1[cfg][test_name]
+            skip_rate_1 = 100.0 * test_1.skipped / test_1.total
+            skip_rate_2 = 100.0 * test.skipped / test.total
+            if skip_rate_1 != skip_rate_2:
+                skip_str += f'{cfg}:{test_name} skip rate changed {test_1.skipped}/{test_1.total} ({skip_rate_1:.2f}%) -> {test.skipped}/{test.total} ({skip_rate_2:.2f}%)\n'
+
+            if test.error > 0:
+                test_1 = s1[cfg][test_name]
+                error_rate_1 = 100.0 * test_1.error / test_1.total
+                error_rate_2 = 100.0 * test.error / test.total
+                # always print error stats
+                error_str += f'{cfg}:{test_name} ERROR {test_1.error}/{test_1.total} ({error_rate_1:.2f})% -> {test.error}/{test.total} ({error_rate_2:.2f}%)\n'
+        file.write('\n')
+
+    if len(error_str) > 0:
+        file.write('\n*** ERROR(S) occurred in new test set: ***\n')
+        file.write(error_str)
+
+    if len(skip_str) > 0:
+        file.write('\n*** WARNING: skip rate changed between test sets: ***\n')
+        file.write(skip_str)
+    file.close()
+
+
+def read_stats(input_file):
+    """Read test statistics from file"""
+    stats = {}
+    tree = ET.parse(input_file)
+    root = tree.getroot()
+
+    for cfg_element in root.findall('config'):
+        cfg = cfg_element.get('name')
+        if cfg not in stats:
+            stats[cfg] = {}
+        for test_element in cfg_element.findall('test'):
+            test = TestStats()
+
+            name = test_element.get('name')
+            test.failed = int(test_element.get('failed'))
+            test.skipped = int(test_element.get('skipped'))
+            test.error = int(test_element.get('error'))
+            test.total = int(test_element.get('total'))
+
+            stats[cfg][name] = test
+
+    return stats
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('stats_file1', help='First stats file (baseline)', type=str)
+    parser.add_argument('stats_file2', help='Second stats file (file to compare to baseline)', type=str)
+    parser.add_argument('--outfile', help='Diff output file', default="stats.diff", type=str)
+    parser.add_argument('--regression_threshold', help='Percent (int) increase needed in fail rate to determine regression', type=int, default=5)
+    args = parser.parse_args()
+
+    stats1 = read_stats(args.stats_file1)
+    stats2 = read_stats(args.stats_file2)
+
+    diff_stats(stats1, stats2, args.regression_threshold, args.outfile, args.stats_file1, args.stats_file2)
+
+
+if __name__ == "__main__":
+    main()
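Typical usage of the new script, per its argparse definitions (file names are illustrative):

    # flag tests whose failure rate grew by >= 5 percentage points
    python3 diff_stats.py baseline-stats.xml new-stats.xml \
        --outfile stats.diff --regression_threshold 5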
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/gen_results_summary.py b/test-appliance/files/usr/lib/python3/dist-packages/gen_results_summary.py
index 44fb07d2..fe37e64d 100644
--- a/test-appliance/files/usr/lib/python3/dist-packages/gen_results_summary.py
+++ b/test-appliance/files/usr/lib/python3/dist-packages/gen_results_summary.py
@@ -135,6 +135,35 @@ def sum_testsuites(testsuites):
         errors += testsuite.errors
     return (tests, skipped, failures, errors, runtime)

+def get_testsuite_stats(testsuite):
+    """Aggregate stats on individual tests"""
+    Stats = {}
+    for test_case in testsuite:
+        isFail = False
+        isSkipped = False
+        isError = False
+        for entry in test_case.result:
+            if isinstance(entry, Failure):
+                isFail = True
+            if isinstance(entry, Skipped):
+                isSkipped = True
+            if isinstance(entry, Error):
+                isError = True
+        if test_case.name in Stats:
+            s = Stats[test_case.name]
+        else:
+            s = TestStats()
+            Stats[test_case.name] = s
+        s.total += 1
+        if isFail:
+            s.failed += 1
+        if isSkipped:
+            s.skipped += 1
+        if isError:
+            s.error += 1
+
+    return Stats
+
 def print_summary(out_f, testsuite, verbose):
     """Print a summary for a particular test suite
@@ -179,30 +208,7 @@ def print_summary(out_f, testsuite, verbose):
             out_f.write("  %-12s %-8s %ds\n" %
                         (test_case.name, status, test_case.time))
     else:
-        Stats = {}
-        for test_case in testsuite:
-            isFail = False
-            isSkipped = False
-            isError = False
-            for entry in test_case.result:
-                if isinstance(entry, Failure):
-                    isFail = True
-                if isinstance(entry, Skipped):
-                    isSkipped = True
-                if isinstance(entry, Error):
-                    isError = True
-            if test_case.name in Stats:
-                s = Stats[test_case.name]
-            else:
-                s = TestStats()
-                Stats[test_case.name] = s
-            s.total += 1
-            if isFail:
-                s.failed += 1
-            if isSkipped:
-                s.skipped += 1
-            if isError:
-                s.error += 1
+        Stats = get_testsuite_stats(testsuite)

         wp = wrapped_print(out_f, 'Failures', ' ')
         for t in Stats:
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/get_stats.py b/test-appliance/files/usr/lib/python3/dist-packages/get_stats.py
new file mode 100644
index 00000000..4cd62815
--- /dev/null
+++ b/test-appliance/files/usr/lib/python3/dist-packages/get_stats.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3
+
+import argparse
+import sys
+from gen_results_summary import get_property, get_testsuite_stats, get_results
+from junitparser import JUnitXml, Property, Properties, Failure, Error, Skipped
+
+try:
+    from lxml import etree
+except ImportError:
+    from xml.etree import ElementTree as etree
+
+
+# reports is list of results from each xml file
+# stats[cfg] = cfg_stats
+# cfg_stats[test] = TestStats()
+def get_stats_from_dir(results_dir):
+    """From a results dir, return a list of reports and test statistics"""
+    reports = []
+    stats = {}
+    for filename in get_results(results_dir):
+        reports.append(JUnitXml.fromfile(filename))
+
+    if len(reports) == 0:
+        sys.stderr.write(f'Error: could not find any reports in {results_dir}')
+        return None
+
+    for testsuite in reports:
+        cfg = get_property(testsuite.properties(), 'TESTCFG') or get_property(testsuite.properties(), 'FSTESTCFG')
+        if cfg in stats:
+            sys.stderr.write(f'Found duplicate config {cfg}')
+            return None
+        stats[cfg] = get_testsuite_stats(testsuite)
+
+    return stats
+
+# writes all configs into single output file
+# condensing into entries of test->(failed, skipped, error, total)
+# this will let us store stats and easily merge from other runs
+# without having to reprocess everything
+def write_stats(s, output_file):
+    """Write the test statistics to a file"""
+    root = etree.Element("configs")
+    for cfg in s:
+        cfg_element = etree.SubElement(root, "config", name=cfg)
+        for test_name in s[cfg]:
+            test = s[cfg][test_name]
+            etree.SubElement(cfg_element, "test", name=test_name, failed=str(test.failed), skipped=str(test.skipped), error=str(test.error), total=str(test.total))
+
+    tree = etree.ElementTree(root)
+    etree.indent(tree, space="\t", level=0)
+    tree.write(output_file, encoding='utf-8')
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('results_dir', help='Results directory to process', type=str)
+    parser.add_argument('--outfile', help='Diff output file', default='./stats.xml', type=str)
+    args = parser.parse_args()
+
+    stats = get_stats_from_dir(args.results_dir)
+
+    if stats == None:
+        return -1
+
+    write_stats(stats, args.outfile)
+
+if __name__ == "__main__":
+    main()
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/junitparser/__init__.py b/test-appliance/files/usr/lib/python3/dist-packages/junitparser/__init__.py
index e3d8da0c..55b9ddbc 100644
--- a/test-appliance/files/usr/lib/python3/dist-packages/junitparser/__init__.py
+++ b/test-appliance/files/usr/lib/python3/dist-packages/junitparser/__init__.py
@@ -8,6 +8,7 @@ from .junitparser import (
     Skipped,
     Failure,
     Error,
+    Result,
     TestCase,
     Properties,
     IntAttr,
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/junitparser/junitparser.py b/test-appliance/files/usr/lib/python3/dist-packages/junitparser/junitparser.py
index eb38b298..b3bbd853 100644
--- a/test-appliance/files/usr/lib/python3/dist-packages/junitparser/junitparser.py
+++ b/test-appliance/files/usr/lib/python3/dist-packages/junitparser/junitparser.py
@@ -310,7 +310,11 @@ class JUnitXml(Element):
         if parse_func:
             tree = parse_func(filepath)
         else:
-            tree = etree.parse(filepath)  # nosec
+            try:
+                tree = etree.parse(filepath)  # nosec
+            except etree.XMLSyntaxError:
+                p = etree.XMLParser(huge_tree=True)
+                tree = etree.parse(filepath, parser=p)
         root_elem = tree.getroot()
         if root_elem.tag == "testsuites":
             instance = cls()
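Typical usage of get_stats.py (the results path is illustrative):

    # condense a directory of xUnit reports into one per-config stats file
    python3 get_stats.py /results/results-20230701 --outfile stats.xml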
diff --git a/test-appliance/files/usr/lib/python3/dist-packages/merge_stats.py b/test-appliance/files/usr/lib/python3/dist-packages/merge_stats.py
new file mode 100644
index 00000000..a3148142
--- /dev/null
+++ b/test-appliance/files/usr/lib/python3/dist-packages/merge_stats.py
@@ -0,0 +1,45 @@
+#!/usr/bin/python3
+
+import argparse
+import sys
+import xml.etree.ElementTree as ET
+import get_stats
+import diff_stats
+from gen_results_summary import TestStats
+from junitparser import JUnitXml, Property, Properties, Failure, Error, Skipped
+
+
+def merge_stats(stats1, stats2):
+    """Merges stats2 into stats1"""
+    for cfg in stats2:
+        if cfg not in stats1:
+            stats1[cfg] = {}
+
+        for test_name in stats2[cfg]:
+            if test_name not in stats1[cfg]:
+                stats1[cfg][test_name] = TestStats()
+            stats1[cfg][test_name].failed += stats2[cfg][test_name].failed
+            stats1[cfg][test_name].skipped += stats2[cfg][test_name].skipped
+            stats1[cfg][test_name].error += stats2[cfg][test_name].error
+            stats1[cfg][test_name].total += stats2[cfg][test_name].total
+
+    return stats1
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('stats_file', help='First stats file', type=str)
+    parser.add_argument('stats_files_merge', nargs='+', help='List of stats files to merge', type=str)
+    parser.add_argument('--outfile', default='merged_stats.xml', help='Output xml file', type=str)
+    args = parser.parse_args()
+
+    stats = diff_stats.read_stats(args.stats_file)
+
+    for file in args.stats_files_merge:
+        stats_merge = diff_stats.read_stats(file)
+        stats = merge_stats(stats, stats_merge)
+
+    get_stats.write_stats(stats, args.outfile)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test-appliance/files/usr/local/bin/add_error_xunit b/test-appliance/files/usr/local/bin/add_error_xunit
new file mode 100755
index 00000000..0f12e983
--- /dev/null
+++ b/test-appliance/files/usr/local/bin/add_error_xunit
@@ -0,0 +1,48 @@
+#!/usr/bin/python3
+import argparse
+import os
+import sys
+from junitparser import JUnitXml, TestSuite, TestCase, Result, Error
+
+def get_test_suite(filename):
+    if not os.path.exists(filename):
+        ts = TestSuite()
+    else:
+        try:
+            ts = JUnitXml.fromfile(filename)
+        except IOError as e:
+            sys.exit("Couldn't open %s: %s" % (filename, e[1]))
+
+    if type(ts) != TestSuite:
+        sys.exit('%s is not a xUnit report file' % filename)
+    return ts
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('input_file', help='input xUnit result file')
+parser.add_argument('testname', help='name of test causing error')
+parser.add_argument('classname', help='classname for test case')
+args = parser.parse_args()
+
+ts = get_test_suite(args.input_file)
+
+result = Result()
+
+error = Error(result)
+error.message = 'Machine rebooted (crash or test timeout)'
+error.type = 'TestFail'
+
+tc = TestCase()
+tc.classname = args.classname
+tc.name = args.testname
+tc.time = 0
+tc.result = [error]
+
+# this also updates the statistics
+ts.add_testcase(tc)
+
+ts.write(args.input_file + '.new', pretty=True)
+if os.path.exists(args.input_file):
+    os.rename(args.input_file, args.input_file + '.error.bak')
+os.rename(args.input_file + '.new', args.input_file)
diff --git a/test-appliance/files/usr/local/lib/gce-add-metadata b/test-appliance/files/usr/local/lib/gce-add-metadata
index 672975b1..d77f50f1 100755
--- a/test-appliance/files/usr/local/lib/gce-add-metadata
+++ b/test-appliance/files/usr/local/lib/gce-add-metadata
@@ -7,4 +7,4 @@ then
 fi

 flock /run/xattr.lock gcloud compute instances -q add-metadata \
-    --zone $ZONE $(hostname) --metadata "$@" >& /dev/null
+    --zone $ZONE $(hostname) --metadata "^##^$@" >& /dev/null
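Sketches of how the two new tools above would be invoked (file and test names are illustrative):

    # fold several runs' stats into one file for later diffing
    python3 merge_stats.py stats-run1.xml stats-run2.xml --outfile merged_stats.xml

    # record a crashed test as an <error> entry in an xUnit report
    add_error_xunit results.xml generic/475 xfstests.global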
diff --git a/test-appliance/files/usr/local/lib/gce-logger b/test-appliance/files/usr/local/lib/gce-logger
index 68a1970e..4eadca31 100755
--- a/test-appliance/files/usr/local/lib/gce-logger
+++ b/test-appliance/files/usr/local/lib/gce-logger
@@ -6,10 +6,31 @@ then
     run_hooks logger "$*"
 fi

+is_test=
 status=$(echo "$*" | sed -e 's/^run xfstest //')
 if echo "$*" | grep -q "^run xfstest "
 then
+    is_test="y"
     echo "$status" >> $RESULT_BASE/completed
+
+    if test ! -f $RESULT_BASE/rpt_status -o \
+	    ! -f $RESULT_BASE/tests-to-run -o \
+	    ! -f $RESULT_BASE/completed
+    then
+	status="--% $status"
+    else
+	rpt_status=$(cat $RESULT_BASE/rpt_status)
+	current_rpt=${rpt_status%%/*}
+	total_rpt=${rpt_status##*/}
+	total_tests=$(cat $RESULT_BASE/tests-to-run | sort | uniq | wc -l)
+	count_completed=$(cat $RESULT_BASE/completed | sort | uniq | wc -l)
+
+	progress=$(( ( ( $current_rpt - 1 ) * $total_tests + $count_completed ) * 100 / ( $total_tests * $total_rpt ) ))
+	status="$progress% $status"
+    fi
+
+    # sync completed file
+    /root/xfstests/bin/syncfs $RESULT_BASE
 fi

 if test -f /run/fstest-config
@@ -18,7 +39,11 @@ then
     status="$cfg $status"
 fi

-if test -z "$(find /run/last_logged -mmin -1 -print 2> /dev/null)"
+# force first test to upload its test status and wait for it to finish
+if test -n "$is_test" -a ! -s /run/last_logged; then
+    /usr/local/lib/gce-add-metadata "status=$(date +%H:%M) $status"
+    echo "Started testing" > /run/last_logged
+elif test -z "$(find /run/last_logged -mmin -1 -print 2> /dev/null)"
 then
     /usr/local/lib/gce-add-metadata "status=$(date +%H:%M) $status" &
     touch /run/last_logged
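A worked example of the progress calculation above, with made-up numbers: on repetition 2 of 3, with 100 distinct tests and 50 completed so far,

    progress=$(( ( (2 - 1) * 100 + 50 ) * 100 / ( 100 * 3 ) ))    # 50
    # the reported status then reads e.g. "50% generic/001"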
diff --git a/test-appliance/files/usr/local/lib/gce-server/ltm/shard.go b/test-appliance/files/usr/local/lib/gce-server/ltm/shard.go
index 25af29cd..bef49236 100644
--- a/test-appliance/files/usr/local/lib/gce-server/ltm/shard.go
+++ b/test-appliance/files/usr/local/lib/gce-server/ltm/shard.go
@@ -38,6 +38,7 @@ type ShardWorker struct {
	vmStatus    string
	vmtestStart time.Time
	testResult  server.ResultType
+	vmReset     bool

	log     *logrus.Entry
	logPath string
@@ -48,9 +49,9 @@ type ShardWorker struct {
 }

 const (
-	monitorTimeout  = 1 * time.Hour
-	noStatusTimeout = 5 * time.Minute
+	noStatusTimeout = 10 * time.Minute
	monitorInterval = 60 * time.Second
+	resetTimeout    = 10 * time.Minute
	gsInterval      = 10 * time.Second
	maxAttempts     = 5
 )
@@ -70,6 +71,7 @@ func NewShardWorker(sharder *ShardScheduler, shardID string, config string, zone
		vmStatus:    "waiting for launch",
		vmtestStart: time.Now(),
		testResult:  server.DefaultResult,
+		vmReset:     false,

		log:     sharder.log.WithField("shardID", shardID),
		logPath: logPath,
@@ -91,23 +93,25 @@ func NewShardWorker(sharder *ShardScheduler, shardID string, config string, zone
		"--no-email",
		"-c", config,
	}
+
	if sharder.arch != "" {
		shard.args = append(shard.args, "--arch", sharder.arch)
	}
-	shard.args = append(shard.args, sharder.validArgs...)

-	var defaultProj bool = true
-	for _, arg := range shard.args {
+	var imgProjFlag bool = false
+	for _, arg := range sharder.validArgs {
		if arg == "--image-project" {
-			defaultProj = false
+			imgProjFlag = true
			break
		}
	}
-	if defaultProj {
-		shard.args = append(shard.args, "--image-project", sharder.projID)
+	if !imgProjFlag && len(sharder.imgProjID) > 0 {
+		shard.args = append(shard.args, "--image-project", sharder.imgProjID)
	}

+	shard.args = append(shard.args, sharder.validArgs...)
+
	return &shard
 }
@@ -185,6 +189,7 @@ func (shard *ShardWorker) monitor() {
			if *metaData.Value != shard.vmStatus {
				shard.vmStatus = *metaData.Value
				shard.vmtestStart = time.Now()
+				shard.vmReset = false
				break
			}
		}
@@ -203,20 +208,30 @@ func (shard *ShardWorker) monitor() {
			log.Debug("waiting to get test status metadata")
		}

-		if time.Since(shard.vmtestStart) > monitorTimeout {
-			if !shard.sharder.keepDeadVM {
-				shard.shutdownOnTimeout(instanceInfo.Metadata)
-			}
-			shard.vmStatus = "timeout on one test"
-			shard.testResult = server.Hang
-			log.WithFields(logrus.Fields{
-				"status": shard.vmStatus,
-				"start":  shard.vmtestStart.Format(time.Stamp),
-			}).Errorf("Instance seems to have wedged, no status update for %s", monitorTimeout.Round(time.Minute))
+		if shard.vmReset && time.Since(shard.vmtestStart) > resetTimeout {
+			log.Errorf("VM did not come back online after reset, exiting")
			return
		}

+		// Reset VM if we don't get a status update
+		// Skip check if we are already performing a reset
+		// Selftests may limit monitorTimeout to shorter than noStatusTimeout
+		// so skip check if we are still launching
+		if time.Since(shard.vmtestStart) > shard.sharder.monitorTimeout &&
+			!shard.vmReset && shard.vmStatus != "launching" {
+			log.Debug("Resetting VM")
+			err := shard.sharder.gce.ResetVM(shard.sharder.projID, shard.zone, shard.name)
+			if err != nil {
+				log.Errorf("Failed to reset %s", shard.name)
+				shard.vmStatus = "failed to reset after timeout"
+				shard.testResult = server.Error
+				return
+			}
+			shard.vmReset = true
+			shard.vmtestStart = time.Now()
+		}
+
		log.WithFields(logrus.Fields{
			"status": shard.vmStatus,
			"start":  shard.vmtestStart.Format(time.Stamp),
@@ -283,8 +298,7 @@ func (shard *ShardWorker) shutdownOnTimeout(metadata *compute.Metadata) {

 /*
 finish calls gce-xfstests scripts to fetch and unpack test result files.
-It deletes the results in gs bucket and local serial port output.
-It also determines testResult:
+It deletes the results in gs bucket and determines testResult:

	Default	VM finishes without issues, test result is found;
	Crash	VM started running tests but no test result is found;
@@ -321,11 +335,6 @@ func (shard *ShardWorker) finish() {
		shard.log.Panic("Failed to find unpacked result files")
	}

-	if check.FileExists(shard.serialOutputPath) && !shard.vmTimeout {
-		err = os.Remove(shard.serialOutputPath)
-		check.NoError(err, shard.log, "Failed to remove dir")
-	}
-
	prefix := fmt.Sprintf("%s/results.%s", shard.sharder.bucketSubdir, shard.resultsName)
	_, err = shard.sharder.gce.DeleteFiles(prefix)
	check.NoError(err, shard.log, "Failed to delete file")
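The new ResetVM helper (added to gcp.go later in this patch) issues the Compute Engine instances.reset call; the manual equivalent would be something like (instance and zone names illustrative):

    gcloud compute instances reset xfstests-ltm-shard-abc --zone us-central1-c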
diff --git a/test-appliance/files/usr/local/lib/gce-server/ltm/sharder.go b/test-appliance/files/usr/local/lib/gce-server/ltm/sharder.go
index 957efa98..95d1f9e6 100644
--- a/test-appliance/files/usr/local/lib/gce-server/ltm/sharder.go
+++ b/test-appliance/files/usr/local/lib/gce-server/ltm/sharder.go
@@ -23,6 +23,7 @@ import (
	"sort"
	"strings"
	"sync"
+	"time"

	"thunk.org/gce-server/util/check"
	"thunk.org/gce-server/util/email"
@@ -36,12 +37,14 @@ import (
 )

 const genResultsSummaryPath = "/usr/local/bin/gen_results_summary"
+const defaultMonitorTimeout = 1 * time.Hour

 // ShardScheduler schedules tests and aggregates reports.
 type ShardScheduler struct {
-	testID  string
-	projID  string
-	origCmd string
+	testID    string
+	projID    string
+	imgProjID string
+	origCmd   string

	zone   string
	region string
@@ -53,6 +56,7 @@ type ShardScheduler struct {
	reportReceiver string
	maxShards      int
	keepDeadVM     bool
+	monitorTimeout time.Duration
	reportKCS      bool

	testRequest server.TaskRequest
@@ -101,6 +105,9 @@ func NewShardScheduler(c server.TaskRequest, testID string) *ShardScheduler {
	projID, err := gcp.GceConfig.Get("GCE_PROJECT")
	check.Panic(err, log, "Failed to get project config")

+	imgProjID, err := gcp.GceConfig.Get("GCE_IMAGE_PROJECT")
+	check.Panic(err, log, "Failed to get image project")
+
	gsBucket, err := gcp.GceConfig.Get("GS_BUCKET")
	check.Panic(err, log, "Failed to get gs bucket config")

@@ -108,9 +115,10 @@ func NewShardScheduler(c server.TaskRequest, testID string) *ShardScheduler {
	log.Info("Initiating test sharder")

	sharder := ShardScheduler{
-		testID:  testID,
-		projID:  projID,
-		origCmd: origCmd,
+		testID:    testID,
+		projID:    projID,
+		imgProjID: imgProjID,
+		origCmd:   origCmd,

		zone:   zone,
		region: region,
@@ -122,6 +130,7 @@ func NewShardScheduler(c server.TaskRequest, testID string) *ShardScheduler {
		reportReceiver: c.Options.ReportEmail,
		maxShards:      0,
		keepDeadVM:     false,
+		monitorTimeout: defaultMonitorTimeout,
		reportKCS:      false,

		testRequest: c,
@@ -143,6 +152,15 @@ func NewShardScheduler(c server.TaskRequest, testID string) *ShardScheduler {
	if sharder.bucketSubdir == "" {
		sharder.bucketSubdir = "results"
	}
+	if c.Options.MonitorTimeout != "" {
+		sharder.monitorTimeout, err = time.ParseDuration(c.Options.MonitorTimeout)
+		if err != nil {
+			sharder.monitorTimeout = defaultMonitorTimeout
+			sharder.log.WithField("MonitorTimeout", c.Options.MonitorTimeout).Error("Unable to parse --monitor-timeout option, using default value")
+		} else {
+			sharder.log.WithField("MonitorTimeout", sharder.monitorTimeout).Info("Parsed monitor timeout argument")
+		}
+	}

	sharder.validArgs, sharder.configs, err = getConfigs(sharder.origCmd)
	check.Panic(err, log, "Failed to parse config from origCmd")
@@ -368,6 +386,7 @@ func (sharder *ShardScheduler) aggResults() {
			"unpackedResultsDir": shard.unpackedResultsDir,
		})
		log.Debug("Moving shard result files into aggregate folder")
+		shardHasResults := false

		if check.DirExists(shard.unpackedResultsDir) {
			err := os.RemoveAll(sharder.aggDir + shard.shardID)
@@ -376,16 +395,22 @@ func (sharder *ShardScheduler) aggResults() {
			err = os.Rename(shard.unpackedResultsDir, sharder.aggDir+shard.shardID)
			check.Panic(err, log, "Failed to move dir")

+			shardHasResults = true
			hasResults = true
-		} else if check.FileExists(shard.serialOutputPath) {
+		}
+
+		if check.FileExists(shard.serialOutputPath) {
			err := os.RemoveAll(sharder.aggDir + shard.shardID + ".serial")
			check.Panic(err, log, "Failed to remove dir")

			err = os.Rename(shard.serialOutputPath, sharder.aggDir+shard.shardID+".serial")
			check.Panic(err, log, "Failed to move dir")

+			shardHasResults = true
			hasResults = true
-		} else {
+		}
+
+		if !shardHasResults {
			log.Warn("Shard has no results available")
		}
	}
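Because the value is parsed with Go's time.ParseDuration, --monitor-timeout accepts strings such as "30m", "90s", or "1h30m". A plausible invocation (config and test group are illustrative):

    gce-xfstests ltm -c ext4/4k -g auto --monitor-timeout 30m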
diff --git a/test-appliance/files/usr/local/lib/gce-server/util/gcp/gcp.go b/test-appliance/files/usr/local/lib/gce-server/util/gcp/gcp.go
index 65480d6d..da8d7401 100644
--- a/test-appliance/files/usr/local/lib/gce-server/util/gcp/gcp.go
+++ b/test-appliance/files/usr/local/lib/gce-server/util/gcp/gcp.go
@@ -286,3 +286,10 @@ func NotFound(err error) bool {
	}
	return false
 }
+
+func (gce *Service) ResetVM(project string, zone string, instance string) error {
+	instancesService := compute.NewInstancesService(gce.service)
+	call := instancesService.Reset(project, zone, instance)
+	_, err := call.Do()
+	return err
+}
diff --git a/test-appliance/files/usr/local/lib/gce-server/util/parser/parser.go b/test-appliance/files/usr/local/lib/gce-server/util/parser/parser.go
index 81ff5fa0..045ab86c 100644
--- a/test-appliance/files/usr/local/lib/gce-server/util/parser/parser.go
+++ b/test-appliance/files/usr/local/lib/gce-server/util/parser/parser.go
@@ -40,6 +40,7 @@ var invalidOpts = []string{
	"--watch",
	"--bisect-good",
	"--bisect-bad",
+	"--monitor-timeout",
 }

 /*
diff --git a/test-appliance/files/usr/local/lib/gce-server/util/server/server.go b/test-appliance/files/usr/local/lib/gce-server/util/server/server.go
index 4c9122e8..d7e3690e 100644
--- a/test-appliance/files/usr/local/lib/gce-server/util/server/server.go
+++ b/test-appliance/files/usr/local/lib/gce-server/util/server/server.go
@@ -116,20 +116,21 @@ const (

 // UserOptions contains configs user sends to LTM or KCS.
 type UserOptions struct {
-	NoRegionShard bool   `json:"no_region_shard"`
-	BucketSubdir  string `json:"bucket_subdir"`
-	GsKernel      string `json:"gs_kernel"`
-	ReportEmail   string `json:"report_email"`
-	CommitID      string `json:"commit_id"`
-	GitRepo       string `json:"git_repo"`
-	BranchName    string `json:"branch_name"`
-	UnWatch       string `json:"unwatch"`
-	BadCommit     string `json:"bad_commit"`
-	GoodCommit    string `json:"good_commit"`
-	KConfig       string `json:"kconfig"`
-	KConfigOpts   string `json:"kconfig_opts"`
-	KbuildOpts    string `json:"kbuild_opts"`
-	Arch          string `json:"arch"`
+	NoRegionShard  bool   `json:"no_region_shard"`
+	BucketSubdir   string `json:"bucket_subdir"`
+	GsKernel       string `json:"gs_kernel"`
+	ReportEmail    string `json:"report_email"`
+	CommitID       string `json:"commit_id"`
+	GitRepo        string `json:"git_repo"`
+	BranchName     string `json:"branch_name"`
+	UnWatch        string `json:"unwatch"`
+	BadCommit      string `json:"bad_commit"`
+	GoodCommit     string `json:"good_commit"`
+	KConfig        string `json:"kconfig"`
+	KConfigOpts    string `json:"kconfig_opts"`
+	KbuildOpts     string `json:"kbuild_opts"`
+	Arch           string `json:"arch"`
+	MonitorTimeout string `json:"monitor_timeout"`
 }

 // InternalOptions contains configs used by LTM and KCS internally.
diff --git a/test-appliance/gce-xfstests-bld.sh b/test-appliance/gce-xfstests-bld.sh
index 4c54f8ce..2c054d0f 100644
--- a/test-appliance/gce-xfstests-bld.sh
+++ b/test-appliance/gce-xfstests-bld.sh
@@ -372,6 +372,8 @@ sed -i -e '/ExecStart/s/agetty/agetty -a root/' \
     -e 's/After=rc.local.service/After=network.target/' \
     /etc/systemd/system/telnet-getty@.service

+echo "kernel.panic=60" >> /etc/sysctl.conf
+
 systemctl enable kvm-xfstests.service
 systemctl enable gce-fetch-gs-files.service
 systemctl enable gce-finalize-wait.service
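The kernel.panic=60 sysctl makes a crashed appliance reboot itself after a minute, which, together with the LTM reset logic and the rpt_status resume in runtests.sh, is what lets a run survive panics and hangs. A quick check from inside the appliance:

    # verify the panic auto-reboot window is in effect
    sysctl kernel.panic    # expected output: kernel.panic = 60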