Bug 1561306 - [test isolation] repeat directory tests 5 times, individual tests 20 times, r=jmaher,tomprince
author Bob Clary <bclary@bclary.com>
Fri, 05 Jul 2019 18:40:20 +0000
changeset 544332 0ec05244de2d0ea003a49cb9a0d7c29c617e9ba0
parent 544331 4ea815152fd7e5d353cfdaebe21ae6840dc69900
child 544333 4d37f42cecafac5c14d3dbc20a9204643b5cc3ea
push id 2131
push user ffxbld-merge
push date Mon, 26 Aug 2019 18:30:20 +0000
treeherder mozilla-release@b19ffb3ca153
reviewers jmaher, tomprince
bugs 1561306
milestone 69.0a1
Bug 1561306 - [test isolation] repeat directory tests 5 times, individual tests 20 times, r=jmaher,tomprince

Differential Revision: https://phabricator.services.mozilla.com/D36013
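Aside (not part of the patch): the cap of 5 distinct tests introduced in get_failures below exists because the action's fan-out is multiplicative, as the in-code comment explains. A minimal sketch of that arithmetic, using the formula from the patch with illustrative counts (the helper name is ours, not from the source):

    def total_isolation_tasks(times, num_tests, num_dirs):
        # One batch of verbatim re-runs of the original task, plus one
        # batch per distinct failing test and per distinct failing
        # directory, each scheduled `times` times.
        return times * (1 + num_tests + num_dirs)

    # At the Treeherder UI maximum of times=100, 10 distinct tests in 10
    # distinct directories would already schedule 2100 tasks...
    print(total_isolation_tasks(100, 10, 10))  # 2100
    # ...while capping at 5 tests keeps the worst case at 1100.
    print(total_isolation_tasks(100, 5, 5))    # 1100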
taskcluster/taskgraph/actions/isolate_test.py
--- a/taskcluster/taskgraph/actions/isolate_test.py
+++ b/taskcluster/taskgraph/actions/isolate_test.py
@@ -1,27 +1,26 @@
 # -*- coding: utf-8 -*-
 
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 from __future__ import absolute_import, print_function, unicode_literals
 
-import copy
 import json
 import logging
 import os
 import re
 
 from slugid import nice as slugid
 from taskgraph.util.taskcluster import list_artifacts, get_artifact, get_task_definition
 from ..util.parameterization import resolve_task_references
 from .registry import register_callback_action
-from .util import create_task_from_def, fetch_graph_and_labels
+from .util import create_task_from_def, fetch_graph_and_labels, add_args_to_command
 
 logger = logging.getLogger(__name__)
 
 
 def get_failures(task_id):
     """Returns a dict containing properties containing a list of
     directories containing test failures and a separate list of
     individual test failures from the errorsummary.log artifact for
@@ -61,68 +60,110 @@ def get_failures(task_id):
     for artifact in artifacts:
         if 'name' in artifact and artifact['name'].endswith('errorsummary.log'):
             stream = get_artifact(task_id, artifact['name'])
             if stream:
                 # Read all of the content from the stream and split
                 # the lines out since on macosx and windows, the first
                 # line is empty.
                 for line in stream.read().split('\n'):
+                    if len(tests) > 4:
+                        # The number of tasks created is determined by
+                        # the `times` value and the number of distinct
+                        # tests and directories as:
+                        # times * (1 + len(tests) + len(dirs)).
+                        # Since the maximum `times` that can be
+                        # selected in the Treeherder UI is 100, the
+                        # number of tasks created can grow very large
+                        # depending on the number of unique tests.
+                        # During testing, 10 distinct tests were
+                        # enough to push the action task past its
+                        # maxRunTime of 1800 seconds, causing it to be
+                        # aborted. We therefore limit the number of
+                        # distinct tests, and thereby the number of
+                        # distinct test directories, to a maximum of 5
+                        # to keep the action task from timing out.
+                        break
                     line = line.strip()
                     match = re_test.search(line)
                     if match:
                         test_path = munge_test_path(match.group(1))
                         if test_path:
                             tests.add(test_path)
                             test_dir = os.path.dirname(test_path)
                             if test_dir:
                                 dirs.add(test_dir)
+            break
     return {'dirs': sorted(dirs), 'tests': sorted(tests)}
 
 
-def create_isolate_failure_tasks(task_definition, failures, level):
+def create_isolate_failure_tasks(task_definition, failures, level, times):
     """
-    Create tasks to re-run the original tasks plus tasks to test
+    Create tasks to re-run the original task plus tasks to test
     each failing test directory and individual path.
 
     """
-    # redo the original task...
-    logger.info("create_isolate_failure_tasks: task_definition: {},"
-                "failures: {}".format(task_definition, failures))
-    new_task_id = slugid()
-    new_task_definition = copy.deepcopy(task_definition)
-    th_dict = new_task_definition['extra']['treeherder']
+    logger.info("Isolate task:\n{}".format(json.dumps(task_definition, indent=2)))
+
+    task_name = task_definition['metadata']['name']
+    repeatable_task = False
+    if ('crashtest' in task_name or 'mochitest' in task_name or
+        ('reftest' in task_name and 'jsreftest' not in task_name)):
+        repeatable_task = True
+
+    th_dict = task_definition['extra']['treeherder']
+    symbol = th_dict['symbol']
+    is_windows = 'windows' in th_dict['machine']['platform']
+
+    suite = task_definition['extra']['suite']
+    if '-chunked' in suite:
+        suite = suite[:suite.index('-chunked')]
+    if '-coverage' in suite:
+        suite = suite[:suite.index('-coverage')]
+
+    command = task_definition['payload']['command']
+
     th_dict['groupSymbol'] = th_dict['groupSymbol'] + '-I'
     th_dict['tier'] = 3
 
-    logger.info('Cloning original task')
-    create_task_from_def(new_task_id, new_task_definition, level)
+    for _ in range(times):
+        create_task_from_def(slugid(), task_definition, level)
+
+    if repeatable_task:
+        task_definition['payload']['maxRunTime'] = 3600 * 3
 
     for failure_group in failures:
-        failure_group_suffix = '-id' if failure_group == 'dirs' else '-it'
+        if failure_group == 'dirs':
+            failure_group_suffix = '-id'
+            # --repeat=4 yields 5 runs in total
+            repeat_args = ['--repeat=4'] if repeatable_task else []
+        else:
+            failure_group_suffix = '-it'
+            # --repeat=19 yields 20 runs in total
+            repeat_args = ['--repeat=19'] if repeatable_task else []
+
+        if repeat_args:
+            task_definition['payload']['command'] = add_args_to_command(command,
+                                                                        extra_args=repeat_args)
+        else:
+            task_definition['payload']['command'] = command
+
         for failure_path in failures[failure_group]:
-            new_task_id = slugid()
-            new_task_definition = copy.deepcopy(task_definition)
-            th_dict = new_task_definition['extra']['treeherder']
-            th_dict['groupSymbol'] = th_dict['groupSymbol'] + '-I'
-            th_dict['symbol'] = th_dict['symbol'] + failure_group_suffix
-            th_dict['tier'] = 3
-            suite = new_task_definition['extra']['suite']
-            if '-chunked' in suite:
-                suite = suite[:suite.index('-chunked')]
-            if '-coverage' in suite:
-                suite = suite[:suite.index('-coverage')]
-            env_dict = new_task_definition['payload']['env']
-            if 'MOZHARNESS_TEST_PATHS' not in env_dict:
-                env_dict['MOZHARNESS_TEST_PATHS'] = {}
-            if 'windows' in th_dict['machine']['platform']:
+            th_dict['symbol'] = symbol + failure_group_suffix
+            if is_windows:
                 failure_path = '\\'.join(failure_path.split('/'))
-            env_dict['MOZHARNESS_TEST_PATHS'] = json.dumps({suite: [failure_path]})
-            logger.info('Creating task for {}'.format(failure_path))
-            create_task_from_def(new_task_id, new_task_definition, level)
+            task_definition['payload']['env']['MOZHARNESS_TEST_PATHS'] = json.dumps(
+                {suite: [failure_path]})
+
+            logger.info("Creating task for path {} with command {}".format(
+                failure_path,
+                task_definition['payload']['command']))
+            for _ in range(times):
+                create_task_from_def(slugid(), task_definition, level)
 
 
 @register_callback_action(
     name='isolate-test-failures',
     title='Isolate test failures in job',
     generic=True,
     symbol='it',
     description="Re-run Tests for original manifest, directories and tests for failing tests.",
@@ -157,10 +198,12 @@ def isolate_test_failures(parameters, gr
     dependencies = {name: label_to_taskid[label]
                     for name, label in pre_task.dependencies.iteritems()}
 
     task_definition = resolve_task_references(pre_task.label, pre_task.task, dependencies)
     task_definition.setdefault('dependencies', []).extend(dependencies.itervalues())
 
     failures = get_failures(task_id)
     logger.info('isolate_test_failures: %s' % failures)
-    for i in range(input['times']):
-        create_isolate_failure_tasks(task_definition, failures, parameters['level'])
+    create_isolate_failure_tasks(task_definition,
+                                 failures,
+                                 parameters['level'],
+                                 input['times'])
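Taken together, the rewritten action first schedules `times` verbatim re-runs of the failing task, then, for each failure group, schedules `times` tasks per path with a suffixed Treeherder symbol and, for repeatable suites, a --repeat argument. A condensed sketch of that dispatch, with illustrative failure paths (not from the source):

    failures = {'dirs': ['dom/base/test'],
                'tests': ['dom/base/test/test_example.html']}

    for group, suffix, repeats in (('dirs', '-id', 4), ('tests', '-it', 19)):
        for path in failures[group]:
            # Each scheduled task runs its payload 1 + repeats times in
            # total (5 for directories, 20 for individual tests) under
            # the original Treeherder group symbol plus '-I'.
            print('{}{} --repeat={} {}'.format(group, suffix, repeats, path))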