Bug 1455316 - sometimes when test-verify finds a failure, all future tests are marked as failing also. r=gbrown
☠☠ backed out by 3bf2be0ef465 ☠☠
author: Joel Maher <jmaher@mozilla.com>
date: Tue, 01 May 2018 11:50:46 -0400
changeset 472589 5cdb10268f47f90d0e5610d7e714f3a4d4be16d6
parent 472588 de23cd2fcc4af28ff67e24411e0282001a9a94c2
child 472590 d92d104b4b5f8f91937b0a25779b1c9256ea345e
push id: 1728
push user: jlund@mozilla.com
push date: Mon, 18 Jun 2018 21:12:27 +0000
treeherder: mozilla-release@c296fde26f5f
reviewers: gbrown
bugs: 1455316
milestone: 61.0a1
testing/mozharness/mozharness/mozilla/structuredlog.py
testing/mozharness/mozharness/mozilla/testing/unittest.py
testing/mozharness/scripts/android_emulator_unittest.py
testing/mozharness/scripts/desktop_unittest.py
testing/mozharness/scripts/marionette.py
testing/mozharness/scripts/web_platform_tests.py
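
The commit message describes the symptom this patch addresses: in per-test (test-verify) runs the parser is evaluated once per test, but the summary it evaluates is cumulative, so a single failure keeps the status at warning/failure for every test that follows. A minimal stand-alone sketch of that symptom (hypothetical names, not mozharness code):

cumulative_unexpected = {}          # e.g. mozlog unexpected-status counts
results = []
for test, failed in [("test_a.html", True), ("test_b.html", False)]:
    if failed:
        cumulative_unexpected[test] = 1
    # Pre-patch behaviour: the status is derived from the cumulative counts,
    # so test_b.html is reported as a warning even though it passed.
    status = "warning" if sum(cumulative_unexpected.values()) else "success"
    results.append((test, status))

print(results)   # [('test_a.html', 'warning'), ('test_b.html', 'warning')]

The patch threads a previous_summary value through evaluate_parser so each evaluation can subtract what was already reported, while still handing back a joined summary for the final, cumulative report.
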
--- a/testing/mozharness/mozharness/mozilla/structuredlog.py
+++ b/testing/mozharness/mozharness/mozilla/structuredlog.py
@@ -6,16 +6,21 @@
 import json
 
 from mozharness.base import log
 from mozharness.base.log import OutputParser, WARNING, INFO, ERROR
 from mozharness.mozilla.buildbot import TBPL_WARNING, TBPL_FAILURE
 from mozharness.mozilla.buildbot import TBPL_SUCCESS, TBPL_WORST_LEVEL_TUPLE
 from mozharness.mozilla.testing.unittest import tbox_print_summary
 
+from collections import (
+    defaultdict,
+    namedtuple,
+)
+
 
 class StructuredOutputParser(OutputParser):
     # The script class using this must inherit the MozbaseMixin to ensure
     # that mozlog is available.
     def __init__(self, **kwargs):
         """Object that tracks the overall status of the test run"""
         # The 'strict' argument dictates whether the presence of output
         # from the harness process other than line-delimited json indicates
@@ -94,20 +99,58 @@ class StructuredOutputParser(OutputParse
             if error_level is not None:
                 level = self.worst_level(error_level, level)
 
         log_data = self.formatter(data)
         if log_data is not None:
             self.log(log_data, level=level)
             self.update_levels(tbpl_level, level)
 
-    def evaluate_parser(self, return_code, success_codes=None):
+    def _subtract_tuples(self, old, new):
+        items = set(old.keys() + new.keys())
+        merged = defaultdict(int)
+        for item in items:
+            merged[item] = new.get(item, 0) - old.get(item, 0)
+            if merged[item] <= 0:
+                del merged[item]
+        return merged
+
+    def evaluate_parser(self, return_code, success_codes=None, previous_summary=None):
         success_codes = success_codes or [0]
         summary = self.handler.summarize()
 
+        """
+          evaluate_parser can run multiple times; without adjustment it would
+          re-count earlier failures and statuses, making later tests appear to fail too.
+          When we have a previous summary, we want to do 2 things:
+            1) Remove the previous data from the new summary so only new results are evaluated
+            2) Build a joined summary that includes the previous + new data
+        """
+        if previous_summary:
+            RunSummary = namedtuple("RunSummary",
+                                    ("unexpected_statuses",
+                                     "expected_statuses",
+                                     "log_level_counts",
+                                     "action_counts"))
+
+            self.tbpl_status = TBPL_SUCCESS
+
+            joined_summary = summary
+
+            # Remove previously known status messages
+            summary = RunSummary(self._subtract_tuples(previous_summary.unexpected_statuses, summary.unexpected_statuses),
+                                 self._subtract_tuples(previous_summary.expected_statuses, summary.expected_statuses),
+                                 summary.log_level_counts,
+                                 summary.action_counts)
+
+            # If we have previous data to ignore, cache it so we don't parse the log multiple times
+            self.summary = summary
+        else:
+            joined_summary = summary
+
         fail_pair = TBPL_WARNING, WARNING
         error_pair = TBPL_FAILURE, ERROR
 
         # These are warning/orange statuses.
         failure_conditions = [
             sum(summary.unexpected_statuses.values()) > 0,
             summary.action_counts.get('crash', 0) > summary.expected_statuses.get('CRASH', 0),
             summary.action_counts.get('valgrind_error', 0) > 0
@@ -137,44 +180,52 @@ class StructuredOutputParser(OutputParse
         if self.num_errors:
             self.update_levels(*error_pair)
 
         # Harnesses typically return non-zero on test failure, so don't promote
         # to error if we already have a failing status.
         if return_code not in success_codes and self.tbpl_status == TBPL_SUCCESS:
             self.update_levels(*error_pair)
 
-        return self.tbpl_status, self.worst_log_level
+        return self.tbpl_status, self.worst_log_level, joined_summary
 
     def update_levels(self, tbpl_level, log_level):
         self.worst_log_level = self.worst_level(log_level, self.worst_log_level)
         self.tbpl_status = self.worst_level(tbpl_level, self.tbpl_status,
                                             levels=TBPL_WORST_LEVEL_TUPLE)
 
     def print_summary(self, suite_name):
         # Summary text provided for compatibility. Counts are currently
         # in the format <pass count>/<fail count>/<todo count>,
         # <expected count>/<unexpected count>/<expected fail count> will yield the
         # expected info from a structured log (fail count from the prior implementation
         # includes unexpected passes from "todo" assertions).
-        summary = self.handler.summarize()
+        try:
+            summary = self.summary
+        except AttributeError:
+            summary = self.handler.summarize()
+
         unexpected_count = sum(summary.unexpected_statuses.values())
         expected_count = sum(summary.expected_statuses.values())
         expected_failures = summary.expected_statuses.get('FAIL', 0)
 
         if unexpected_count:
             fail_text = '<em class="testfail">%s</em>' % unexpected_count
         else:
             fail_text = '0'
 
         text_summary = "%s/%s/%s" % (expected_count, fail_text, expected_failures)
         self.info("TinderboxPrint: %s<br/>%s\n" % (suite_name, text_summary))
 
     def append_tinderboxprint_line(self, suite_name):
-        summary = self.handler.summarize()
+        try:
+            summary = self.summary
+        except AttributeError:
+            summary = self.handler.summarize()
+
         unexpected_count = sum(summary.unexpected_statuses.values())
         expected_count = sum(summary.expected_statuses.values())
         expected_failures = summary.expected_statuses.get('FAIL', 0)
         crashed = 0
         if 'crash' in summary.action_counts:
             crashed = summary.action_counts['crash']
         text_summary = tbox_print_summary(expected_count,
                                           unexpected_count,
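
The core of the structuredlog.py change is subtracting the previous run's counts from the cumulative summary before deciding the status. An illustrative stand-alone version of that subtraction (the helper name subtract_counts is made up; the patch's equivalent is _subtract_tuples above):

from collections import defaultdict


def subtract_counts(old, new):
    """Return only the per-key counts in `new` that exceed those in `old`."""
    merged = defaultdict(int)
    for key in set(old) | set(new):
        diff = new.get(key, 0) - old.get(key, 0)
        if diff > 0:
            merged[key] = diff
    return dict(merged)


previous = {"FAIL": 1}                        # run 1 ended with one unexpected FAIL
cumulative = {"FAIL": 1, "TIMEOUT": 1}        # run 2's summary still contains it
print(subtract_counts(previous, cumulative))  # {'TIMEOUT': 1} -- only the new failure

Iterating over set(old) | set(new) also works on Python 3, whereas the patch's old.keys() + new.keys() relies on Python 2 returning lists from keys().
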
--- a/testing/mozharness/mozharness/mozilla/testing/unittest.py
+++ b/testing/mozharness/mozharness/mozilla/testing/unittest.py
@@ -60,26 +60,49 @@ class TestSummaryOutputParserHelper(Outp
         m = self.regex.search(line)
         if m:
             try:
                 setattr(self, m.group(1), int(m.group(2)))
             except ValueError:
                 # ignore bad values
                 pass
 
-    def evaluate_parser(self, return_code, success_codes=None):
+    def evaluate_parser(self, return_code, success_codes=None, previous_summary=None):
+        """
+          evaluate_parser can run multiple times; without adjustment it would
+          re-count earlier failures and statuses, making later tests appear to fail too.
+          When we have a previous summary, we want to do 2 things:
+            1) Remove the previous data from the new summary so only new results are evaluated
+            2) Build a joined summary that includes the previous + new data
+        """
+        keys = ['passed', 'failed']
+        joined_summary = {}
+        for key in keys:
+            joined_summary[key] = getattr(self, key)
+
+        if previous_summary:
+            for key in keys:
+                joined_summary[key] += previous_summary[key]
+                value = getattr(self, key) - previous_summary[key]
+                if value < 0:
+                    value = 0
+                setattr(self, key, value)
+            self.tbpl_status = TBPL_SUCCESS
+            self.worst_log_level = INFO
+
+
         if return_code == 0 and self.passed > 0 and self.failed == 0:
             self.tbpl_status = TBPL_SUCCESS
         elif return_code == 10 and self.failed > 0:
             self.tbpl_status = TBPL_WARNING
         else:
             self.tbpl_status = TBPL_FAILURE
             self.worst_log_level = ERROR
 
-        return (self.tbpl_status, self.worst_log_level)
+        return (self.tbpl_status, self.worst_log_level, joined_summary)
 
     def print_summary(self, suite_name):
         # generate the TinderboxPrint line for TBPL
         emphasize_fail_text = '<em class="testfail">%s</em>'
         failed = "0"
         if self.passed == 0 and self.failed == 0:
             self.tsummary = emphasize_fail_text % "T-FAIL"
         else:
@@ -165,23 +191,45 @@ class DesktopUnittestOutputParser(Output
         if self.harness_retry_re.search(line):
             self.critical(' %s' % line)
             self.worst_log_level = self.worst_level(CRITICAL, self.worst_log_level)
             self.tbpl_status = self.worst_level(TBPL_RETRY, self.tbpl_status,
                                                 levels=TBPL_WORST_LEVEL_TUPLE)
             return  # skip base parse_single_line
         super(DesktopUnittestOutputParser, self).parse_single_line(line)
 
-    def evaluate_parser(self, return_code, success_codes=None):
+    def evaluate_parser(self, return_code, success_codes=None, previous_summary=None):
         success_codes = success_codes or [0]
 
         if self.num_errors:  # mozharness ran into a script error
             self.tbpl_status = self.worst_level(TBPL_FAILURE, self.tbpl_status,
                                                 levels=TBPL_WORST_LEVEL_TUPLE)
 
+        """
+          evaluate_parser can run multiple times; without adjustment it would
+          re-count earlier failures and statuses, making later tests appear to fail too.
+          When we have a previous summary, we want to do 2 things:
+            1) Remove the previous data from the new summary so only new results are evaluated
+            2) Build a joined summary that includes the previous + new data
+        """
+        keys = ['pass_count', 'fail_count', 'known_fail_count', 'crashed', 'leaked']
+        joined_summary = {}
+        for key in keys:
+            joined_summary[key] = getattr(self, key)
+
+        if previous_summary:
+            for key in keys:
+                joined_summary[key] += previous_summary[key]
+                value = getattr(self, key) - previous_summary[key]
+                if value < 0:
+                    value = 0
+                setattr(self, key, value)
+            self.tbpl_status = TBPL_SUCCESS
+            self.worst_log_level = INFO
+
         # I have to put this outside of parse_single_line because this checks not
         # only if fail_count was more then 0 but also if fail_count is still -1
         # (no fail summary line was found)
         if self.fail_count != 0:
             self.worst_log_level = self.worst_level(WARNING, self.worst_log_level)
             self.tbpl_status = self.worst_level(TBPL_WARNING, self.tbpl_status,
                                                 levels=TBPL_WORST_LEVEL_TUPLE)
 
@@ -195,17 +243,17 @@ class DesktopUnittestOutputParser(Output
                                                 self.tbpl_status,
                                                 levels=TBPL_WORST_LEVEL_TUPLE)
 
         if return_code not in success_codes:
             self.tbpl_status = self.worst_level(TBPL_FAILURE, self.tbpl_status,
                                                 levels=TBPL_WORST_LEVEL_TUPLE)
 
         # we can trust in parser.worst_log_level in either case
-        return (self.tbpl_status, self.worst_log_level)
+        return (self.tbpl_status, self.worst_log_level, joined_summary)
 
     def append_tinderboxprint_line(self, suite_name):
         # We are duplicating a condition (fail_count) from evaluate_parser and
         # parse parse_single_line but at little cost since we are not parsing
         # the log more then once.  I figured this method should stay isolated as
         # it is only here for tbpl highlighted summaries and is not part of
         # buildbot evaluation or result status IIUC.
         summary = tbox_print_summary(self.pass_count,
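
Unlike the structured-log parser, the two parsers above in unittest.py track plain integer counters (passed/failed, pass_count/fail_count, and so on). A simplified sketch of the same subtract-then-join bookkeeping for scalar counters, assuming the counters reflect everything seen so far across per-test runs (CountingParser is hypothetical, not the mozharness class):

class CountingParser(object):
    def __init__(self):
        self.pass_count = 0
        self.fail_count = 0

    def evaluate(self, previous_summary=None):
        keys = ["pass_count", "fail_count"]
        # The joined summary keeps cumulative totals for reporting.
        joined = {key: getattr(self, key) for key in keys}
        if previous_summary:
            for key in keys:
                # Keep only this run's delta for the status decision.
                delta = getattr(self, key) - previous_summary.get(key, 0)
                setattr(self, key, max(delta, 0))
        status = "warning" if self.fail_count else "success"
        return status, joined


parser = CountingParser()
parser.pass_count, parser.fail_count = 1, 1   # run 1: one pass, one failure
status, summary = parser.evaluate()
print(status, summary)                        # warning {'pass_count': 1, 'fail_count': 1}

parser.pass_count, parser.fail_count = 2, 1   # run 2 adds a pass, no new failure
status, summary = parser.evaluate(previous_summary=summary)
print(status, summary)                        # success {'pass_count': 2, 'fail_count': 1}
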
--- a/testing/mozharness/scripts/android_emulator_unittest.py
+++ b/testing/mozharness/scripts/android_emulator_unittest.py
@@ -785,16 +785,17 @@ class AndroidEmulatorTest(TestingMixin, 
                 self.fatal("Don't know how to run --test-suite '%s'!" % self.test_suite)
             env = self.query_env()
             if minidump:
                 env['MINIDUMP_STACKWALK'] = minidump
             env['MOZ_UPLOAD_DIR'] = self.query_abs_dirs()['abs_blob_upload_dir']
             env['MINIDUMP_SAVE_PATH'] = self.query_abs_dirs()['abs_blob_upload_dir']
             env['RUST_BACKTRACE'] = 'full'
 
+            summary = None
             for per_test_args in self.query_args(per_test_suite):
                 if (datetime.datetime.now() - self.start_time) > max_per_test_time:
                     # Running tests has run out of time. That is okay! Stop running
                     # them so that a task timeout is not triggered, and so that
                     # (partial) results are made available in a timely manner.
                     self.info("TinderboxPrint: Running tests took too long: "
                               "Not all tests were executed.<br/>")
                     # Signal per-test time exceeded, to break out of suites and
@@ -815,17 +816,17 @@ class AndroidEmulatorTest(TestingMixin, 
 
                 suite_category = self.test_suite
                 parser = self.get_test_output_parser(
                     suite_category,
                     config=self.config,
                     log_obj=self.log_obj,
                     error_list=[])
                 self.run_command(final_cmd, cwd=cwd, env=env, output_parser=parser)
-                tbpl_status, log_level = parser.evaluate_parser(0)
+                tbpl_status, log_level, summary = parser.evaluate_parser(0, previous_summary=summary)
                 parser.append_tinderboxprint_line(self.test_suite)
 
                 self.info("##### %s log ends" % self.test_suite)
 
                 if len(per_test_args) > 0:
                     self.buildbot_status(tbpl_status, level=log_level)
                     self.log_per_test_status(per_test_args[-1], tbpl_status, log_level)
                 else:
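
The test scripts (android_emulator_unittest.py above, desktop_unittest.py and web_platform_tests.py below) all follow the same per-test calling pattern: initialise summary = None, then on each iteration pass the previous joined summary back into evaluate_parser, as a keyword argument so it cannot be mistaken for success_codes. A self-contained sketch of that loop (FakeParser stands in for the real output parsers):

class FakeParser(object):
    """Stand-in whose summary reflects everything seen so far, like a log
    that keeps growing across per-test runs."""

    def __init__(self, cumulative_failures):
        self.failures = dict(cumulative_failures)

    def evaluate_parser(self, return_code, success_codes=None, previous_summary=None):
        previous = previous_summary or {}
        new = {t: n for t, n in self.failures.items() if n > previous.get(t, 0)}
        status = "warning" if new else "success"
        level = "WARNING" if new else "INFO"
        # Hand back the joined (cumulative) summary so the caller can feed it
        # in as previous_summary on the next iteration.
        return status, level, dict(self.failures)


cumulative = {}
summary = None
for test, failed in [("test_a", True), ("test_b", False), ("test_c", False)]:
    if failed:
        cumulative[test] = cumulative.get(test, 0) + 1
    parser = FakeParser(cumulative)
    status, log_level, summary = parser.evaluate_parser(0, previous_summary=summary)
    print("%s: %s" % (test, status))   # only test_a is reported as a warning
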
--- a/testing/mozharness/scripts/desktop_unittest.py
+++ b/testing/mozharness/scripts/desktop_unittest.py
@@ -849,16 +849,17 @@ class DesktopUnittest(TestingMixin, Merc
                 if self.config['single_stylo_traversal']:
                     env['STYLO_THREADS'] = '1'
                 else:
                     env['STYLO_THREADS'] = '4'
 
                 env = self.query_env(partial_env=env, log_level=INFO)
                 cmd_timeout = self.get_timeout_for_category(suite_category)
 
+                summary = None
                 for per_test_args in self.query_args(suite):
                     if (datetime.now() - self.start_time) > max_per_test_time:
                         # Running tests has run out of time. That is okay! Stop running
                         # them so that a task timeout is not triggered, and so that
                         # (partial) results are made available in a timely manner.
                         self.info("TinderboxPrint: Running tests took too long: Not all tests "
                                   "were executed.<br/>")
                         # Signal per-test time exceeded, to break out of suites and
@@ -900,18 +901,19 @@ class DesktopUnittest(TestingMixin, Merc
                     #    findings for harness/suite errors <- DesktopUnittestOutputParser
                     # 3) checking to see if the return code is in success_codes
 
                     success_codes = None
                     if self._is_windows() and suite_category != 'gtest':
                         # bug 1120644
                         success_codes = [0, 1]
 
-                    tbpl_status, log_level = parser.evaluate_parser(return_code,
-                                                                    success_codes=success_codes)
+                    tbpl_status, log_level, summary = parser.evaluate_parser(return_code,
+                                                                             success_codes,
+                                                                             summary)
                     parser.append_tinderboxprint_line(suite_name)
 
                     self.buildbot_status(tbpl_status, level=log_level)
                     if len(per_test_args) > 0:
                         self.log_per_test_status(per_test_args[-1], tbpl_status, log_level)
                     else:
                         self.log("The %s suite: %s ran with return status: %s" %
                                  (suite_category, suite, tbpl_status), level=log_level)
--- a/testing/mozharness/scripts/marionette.py
+++ b/testing/mozharness/scripts/marionette.py
@@ -349,17 +349,17 @@ class MarionetteTest(TestingMixin, Mercu
                                               error_list=BaseErrorList + HarnessErrorList,
                                               strict=False)
         return_code = self.run_command(cmd,
                                        cwd=cwd,
                                        output_timeout=1000,
                                        output_parser=marionette_parser,
                                        env=env)
         level = INFO
-        tbpl_status, log_level = marionette_parser.evaluate_parser(
+        tbpl_status, log_level, summary = marionette_parser.evaluate_parser(
             return_code=return_code)
         marionette_parser.append_tinderboxprint_line("marionette")
 
         qemu = os.path.join(dirs['abs_work_dir'], 'qemu.log')
         if os.path.isfile(qemu):
             self.copyfile(qemu, os.path.join(dirs['abs_blob_upload_dir'],
                                              'qemu.log'))
 
--- a/testing/mozharness/scripts/web_platform_tests.py
+++ b/testing/mozharness/scripts/web_platform_tests.py
@@ -332,16 +332,17 @@ class WebPlatformTest(TestingMixin, Merc
                     self.info("Skipping 'wdspec' tests - no geckodriver")
         else:
             test_types = self.config.get("test_type", [])
             suites = [None]
         for suite in suites:
             if suite:
                 test_types = [suite]
 
+            summary = None
             for per_test_args in self.query_args(suite):
                 if (datetime.now() - start_time) > max_per_test_time:
                     # Running tests has run out of time. That is okay! Stop running
                     # them so that a task timeout is not triggered, and so that
                     # (partial) results are made available in a timely manner.
                     self.info("TinderboxPrint: Running tests took too long: Not all tests "
                               "were executed.<br/>")
                     return
@@ -365,17 +366,17 @@ class WebPlatformTest(TestingMixin, Merc
                                                cwd=dirs['abs_work_dir'],
                                                output_timeout=1000,
                                                output_parser=parser,
                                                env=env)
 
                 if self.per_test_coverage:
                     self.add_per_test_coverage_report(gcov_dir, jsvm_dir, suite, per_test_args[-1])
 
-                tbpl_status, log_level = parser.evaluate_parser(return_code)
+                tbpl_status, log_level, summary = parser.evaluate_parser(return_code, previous_summary=summary)
                 self.buildbot_status(tbpl_status, level=log_level)
 
                 if len(per_test_args) > 0:
                     self.log_per_test_status(per_test_args[-1], tbpl_status, log_level)
 
 
 # main {{{1
 if __name__ == '__main__':