Bug 1280570 - Retrigger tc tests when mozharness returns TBPL_RETRY; r=dustin
author Geoff Brown <gbrown@mozilla.com>
Thu, 06 Oct 2016 19:03:13 -0600
changeset 862926 36987accb00f03a65982876f952ef5059cf4afb8
parent 860089 42c95d88aaaa7c2eca1d278399421d437441ac4d
child 862927 18e3e9ddc05db7de935f89694f66a47db78491eb
push id 148512
push user gbrown@mozilla.com
push date Fri, 07 Oct 2016 01:03:52 +0000
treeherder try@18e3e9ddc05d [default view] [failures only]
reviewers dustin
bugs 1280570
milestone 52.0a1
Bug 1280570 - Retrigger tc tests when mozharness returns TBPL_RETRY; r=dustin
taskcluster/taskgraph/transforms/job/mozharness.py
taskcluster/taskgraph/transforms/task.py
taskcluster/taskgraph/transforms/tests/all_kinds.py
taskcluster/taskgraph/transforms/tests/make_task_description.py
taskcluster/taskgraph/transforms/tests/test_description.py
testing/mozharness/mozharness/mozilla/testing/errors.py
--- a/taskcluster/taskgraph/transforms/job/mozharness.py
+++ b/taskcluster/taskgraph/transforms/job/mozharness.py
@@ -135,16 +135,19 @@ def mozharness_on_docker_worker_setup(co
         ])
         if run['tooltool-downloads'] == 'internal':
             taskdesc['scopes'].append(
                 'docker-worker:relengapi-proxy:tooltool.download.internal')
         env['TOOLTOOL_CACHE'] = '/home/worker/tooltool-cache'
         env['TOOLTOOL_REPO'] = 'https://github.com/mozilla/build-tooltool'
         env['TOOLTOOL_REV'] = 'master'
 
+    # Retry if mozharness returns TBPL_RETRY (4)
+    worker['retry-exit-status'] = 4
+
     docker_worker_setup_secrets(config, job, taskdesc)
 
     command = [
         '/home/worker/bin/run-task',
         # Various caches/volumes are default owned by root:root.
         '--chown-recursive', '/home/worker/workspace',
         '--chown-recursive', '/home/worker/tooltool-cache',
         '--vcs-checkout', '/home/worker/workspace/build/src',
--- a/taskcluster/taskgraph/transforms/task.py
+++ b/taskcluster/taskgraph/transforms/task.py
@@ -191,16 +191,20 @@ task_description_schema = Schema({
         # environment variables
         Required('env', default={}): {basestring: taskref_or_string},
 
         # the command to run
         'command': [taskref_or_string],
 
         # the maximum time to run, in seconds
         'max-run-time': int,
+
+        # the exit status code that indicates the task should be retried
+        Optional('retry-exit-status'): int,
+
     }, {
         Required('implementation'): 'generic-worker',
 
         # command is a list of commands to run, sequentially
         'command': [basestring],
 
         # artifacts to extract from the task image after completion; note that artifacts
         # for the generic worker cannot have names
@@ -339,16 +343,19 @@ def build_docker_worker_payload(config, 
         'command': worker['command'],
         'image': image,
         'env': worker['env'],
     }
 
     if 'max-run-time' in worker:
         payload['maxRunTime'] = worker['max-run-time']
 
+    if 'retry-exit-status' in worker:
+        payload['onExitStatus'] = {'retry': [worker['retry-exit-status']]}
+
     if 'artifacts' in worker:
         artifacts = {}
         for artifact in worker['artifacts']:
             artifacts[artifact['name']] = {
                 'path': artifact['path'],
                 'type': artifact['type'],
                 'expires': task_def['expires'],  # always expire with the task
             }
@@ -389,16 +396,19 @@ def build_generic_worker_payload(config,
 
     task_def['payload'] = {
         'command': worker['command'],
         'artifacts': artifacts,
         'env': worker['env'],
         'maxRunTime': worker['max-run-time'],
     }
 
+    if 'retry-exit-status' in worker:
+        raise Exception("retry-exit-status not supported in generic-worker")
+
 
 transforms = TransformSequence()
 
 
 @transforms.add
 def validate(config, tasks):
     for task in tasks:
         yield validate_schema(
--- a/taskcluster/taskgraph/transforms/tests/all_kinds.py
+++ b/taskcluster/taskgraph/transforms/tests/all_kinds.py
@@ -117,8 +117,17 @@ def split_chunks(config, tests):
             chunked['this-chunk'] = this_chunk
 
             # add the chunk number to the TH symbol
             group, symbol = split_symbol(chunked['treeherder-symbol'])
             symbol += str(this_chunk)
             chunked['treeherder-symbol'] = join_symbol(group, symbol)
 
             yield chunked
+
+
+@transforms.add
+def set_retry_exit_status(config, tests):
+    """Set the retry exit status to TBPL_RETRY, the value returned by mozharness
+       scripts to indicate a transient failure that should be retried."""
+    for test in tests:
+        test['retry-exit-status'] = 4
+        yield test
--- a/taskcluster/taskgraph/transforms/tests/make_task_description.py
+++ b/taskcluster/taskgraph/transforms/tests/make_task_description.py
@@ -141,16 +141,17 @@ def docker_worker_setup(config, test, ta
     worker['implementation'] = test['worker-implementation']
     worker['docker-image'] = test['docker-image']
 
     worker['allow-ptrace'] = True  # required for all tests, for crashreporter
     worker['relengapi-proxy'] = False  # but maybe enabled for tooltool below
     worker['loopback-video'] = test['loopback-video']
     worker['loopback-audio'] = test['loopback-audio']
     worker['max-run-time'] = test['max-run-time']
+    worker['retry-exit-status'] = test['retry-exit-status']
 
     worker['artifacts'] = [{
         'name': prefix,
         'path': path,
         'type': 'directory',
     } for (prefix, path) in ARTIFACTS]
 
     worker['caches'] = [{
--- a/taskcluster/taskgraph/transforms/tests/test_description.py
+++ b/taskcluster/taskgraph/transforms/tests/test_description.py
@@ -125,16 +125,19 @@ test_description_schema = Schema({
 
     # seconds of runtime after which the task will be killed.  Like 'chunks',
     # this can be keyed by test pltaform.
     Required('max-run-time', default=3600): Any(
         int,
         {'by-test-platform': {basestring: int}},
     ),
 
+    # the exit status code that indicates the task should be retried
+    Optional('retry-exit-status'): int,
+
     # Whether to perform a gecko checkout.
     Required('checkout', default=False): bool,
 
     # What to run
     Required('mozharness'): Any({
         # the mozharness script used to run this task
         Required('script'): basestring,
 
--- a/testing/mozharness/mozharness/mozilla/testing/errors.py
+++ b/testing/mozharness/mozharness/mozilla/testing/errors.py
@@ -93,17 +93,17 @@ TinderBoxPrintRe = {
         'pass_group': "Passed",
         'fail_group': "Failed",
         'known_fail_group': "Skipped",
     },
 
     "harness_error": {
         'full_regex': re.compile(r"(?:TEST-UNEXPECTED-FAIL|PROCESS-CRASH) \| .* \| (application crashed|missing output line for total leaks!|negative leaks caught!|\d+ bytes leaked)"),
         'minimum_regex': re.compile(r'''(TEST-UNEXPECTED|PROCESS-CRASH)'''),
-        'retry_regex': re.compile(r'''FAIL-SHOULD-RETRY''')
+        'retry_regex': re.compile(r'''(FAIL-SHOULD-RETRY|No space left on device|DMError|Connection to the other side was lost in a non-clean fashion|program finished with exit code 80|INFRA-ERROR|twisted.spread.pb.PBConnectionLost)''')
     },
 }
 
 TestPassed = [
     {'regex': re.compile('''(TEST-INFO|TEST-KNOWN-FAIL|TEST-PASS|INFO \| )'''), 'level': INFO},
 ]
 
 LogcatErrorList = [