Bug 1357753: optionally run linux Talos on native-engine; r=wcosta
author Dustin J. Mitchell <dustin@mozilla.com>
Wed, 10 May 2017 21:12:02 +0000
changeset 409437 d267e93bfec417089317acc7a1602dd003fc04e6
parent 409436 cc714eb0adb27f9c5e9f464eb0862e04f4098bab
child 409438 1f135333d095525ffebafd7f774f246995207b3d
push id 7391
push user mtabara@mozilla.com
push date Mon, 12 Jun 2017 13:08:53 +0000
treeherder mozilla-beta@2191d7f87e2e [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers wcosta
bugs 1357753
milestone 55.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1357753: optionally run linux Talos on native-engine; r=wcosta

This makes talos obey the `-w` try syntax flag.

MozReview-Commit-ID: 2v6X3ko2t9K
taskcluster/scripts/tester/test-linux.sh
taskcluster/taskgraph/transforms/job/mozharness_test.py
taskcluster/taskgraph/transforms/task.py
taskcluster/taskgraph/transforms/tests.py
--- a/taskcluster/scripts/tester/test-linux.sh
+++ b/taskcluster/scripts/tester/test-linux.sh
@@ -7,37 +7,36 @@ echo "running as" $(id)
 # Detect release version.
 . /etc/lsb-release
 if [ "${DISTRIB_RELEASE}" == "12.04" ]; then
     UBUNTU_1204=1
 elif [ "${DISTRIB_RELEASE}" == "16.04" ]; then
     UBUNTU_1604=1
 fi
 
-. /home/worker/scripts/xvfb.sh
-
 ####
 # Taskcluster friendly wrapper for performing fx desktop tests via mozharness.
 ####
 
 # Inputs, with defaults
 
 : MOZHARNESS_PATH               ${MOZHARNESS_PATH}
 : MOZHARNESS_URL                ${MOZHARNESS_URL}
 : MOZHARNESS_SCRIPT             ${MOZHARNESS_SCRIPT}
 : MOZHARNESS_CONFIG             ${MOZHARNESS_CONFIG}
 : NEED_XVFB                     ${NEED_XVFB:=true}
 : NEED_WINDOW_MANAGER           ${NEED_WINDOW_MANAGER:=false}
 : NEED_PULSEAUDIO               ${NEED_PULSEAUDIO:=false}
 : START_VNC                     ${START_VNC:=false}
 : TASKCLUSTER_INTERACTIVE       ${TASKCLUSTER_INTERACTIVE:=false}
-: WORKSPACE                     ${WORKSPACE:=/home/worker/workspace}
+: WORKSPACE                     ${WORKSPACE:=$HOME/workspace}
 : mozharness args               "${@}"
 
 set -v
+mkdir -p $WORKSPACE
 cd $WORKSPACE
 
 fail() {
     echo # make sure error message is on a new line
     echo "[test-linux.sh:error]" "${@}"
     exit 1
 }
 
@@ -55,21 +54,23 @@ fi
 
 if [[ -z ${MOZHARNESS_SCRIPT} ]]; then fail "MOZHARNESS_SCRIPT is not set"; fi
 if [[ -z ${MOZHARNESS_CONFIG} ]]; then fail "MOZHARNESS_CONFIG is not set"; fi
 
 mkdir -p ~/artifacts/public
 
 cleanup() {
     local rv=$?
-    if [[ -s /home/worker/.xsession-errors ]]; then
+    if [[ -s $HOME/.xsession-errors ]]; then
       # To share X issues
-      cp /home/worker/.xsession-errors ~/artifacts/public/xsession-errors.log
+      cp $HOME/.xsession-errors ~/artifacts/public/xsession-errors.log
     fi
-    cleanup_xvfb
+    if $NEED_XVFB; then
+        cleanup_xvfb
+    fi
     exit $rv
 }
 trap cleanup EXIT INT
 
 # Download mozharness with exponential backoff
 # curl already applies exponential backoff, but not for all
 # failed cases, apparently, as we keep getting failed downloads
 # with 404 code.
@@ -113,26 +114,28 @@ fi
 
 # pulseaudio daemon must be started before xvfb on Ubuntu 12.04.
 if [ "${UBUNTU_1204}" ]; then
     maybe_start_pulse
 fi
 
 # run XVfb in the background, if necessary
 if $NEED_XVFB; then
+    # note that this file is not available when run under native-worker
+    . $HOME/scripts/xvfb.sh
     start_xvfb '1600x1200x24' 0
 fi
 
 if $START_VNC; then
     x11vnc > ~/artifacts/public/x11vnc.log 2>&1 &
 fi
 
 if $NEED_WINDOW_MANAGER; then
     # This is read by xsession to select the window manager
-    echo DESKTOP_SESSION=ubuntu > /home/worker/.xsessionrc
+    echo DESKTOP_SESSION=ubuntu > $HOME/.xsessionrc
 
     # note that doing anything with this display before running Xsession will cause sadness (like,
     # crashes in compiz). Make sure that X has enough time to start
     sleep 15
     # DISPLAY has already been set above
     # XXX: it would be ideal to add a semaphore logic to make sure that the
     # window manager is ready
     /etc/X11/Xsession 2>&1 &
@@ -162,17 +165,18 @@ export MOZ_SOURCE_REPO="${GECKO_HEAD_REP
 export MOZ_SOURCE_CHANGESET="${GECKO_HEAD_REV}"
 
 # support multiple, space delimited, config files
 config_cmds=""
 for cfg in $MOZHARNESS_CONFIG; do
   config_cmds="${config_cmds} --config-file ${MOZHARNESS_PATH}/configs/${cfg}"
 done
 
-mozharness_bin="/home/worker/bin/run-mozharness"
+mozharness_bin="$HOME/bin/run-mozharness"
+mkdir -p $(dirname $mozharness_bin)
 
 # Save the computed mozharness command to a binary which is useful
 # for interactive mode.
 echo -e "#!/usr/bin/env bash
 # Some mozharness scripts assume base_work_dir is in
 # the current working directory, see bug 1279237
 cd $WORKSPACE
 cmd=\"python2.7 ${MOZHARNESS_PATH}/scripts/${MOZHARNESS_SCRIPT} ${config_cmds} ${@} \${@}\"
@@ -185,11 +189,11 @@ if ! $TASKCLUSTER_INTERACTIVE; then
   # run the given mozharness script and configs, but pass the rest of the
   # arguments in from our own invocation
   ${mozharness_bin};
 fi
 
 # Run a custom mach command (this is typically used by action tasks to run
 # harnesses in a particular way)
 if [ "$CUSTOM_MACH_COMMAND" ]; then
-    eval "/home/worker/workspace/build/tests/mach ${CUSTOM_MACH_COMMAND}"
+    eval "$HOME/workspace/build/tests/mach ${CUSTOM_MACH_COMMAND}"
     exit $?
 fi
--- a/taskcluster/taskgraph/transforms/job/mozharness_test.py
+++ b/taskcluster/taskgraph/transforms/job/mozharness_test.py
@@ -111,16 +111,19 @@ def mozharness_test_on_docker(config, jo
             'name': 'tooltool-cache',
             'mount-point': '/home/worker/tooltool-cache',
         })
         taskdesc['scopes'].extend([
             'docker-worker:relengapi-proxy:tooltool.download.internal',
             'docker-worker:relengapi-proxy:tooltool.download.public',
         ])
 
+    if test['reboot']:
+        raise Exception('reboot: {} not supported on docker-worker'.format(test['reboot']))
+
     # assemble the command line
     command = [
         '/home/worker/bin/run-task',
         # The workspace cache/volume is default owned by root:root.
         '--chown', '/home/worker/workspace',
     ]
 
     # Support vcs checkouts regardless of whether the task runs from
@@ -203,16 +206,19 @@ def mozharness_test_on_generic_worker(co
     test_packages_url = get_artifact_url(
         '<build>', 'public/build/{}.test_packages.json'.format(target))
 
     taskdesc['scopes'].extend(
         ['generic-worker:os-group:{}'.format(group) for group in test['os-groups']])
 
     worker['os-groups'] = test['os-groups']
 
+    if test['reboot']:
+        raise Exception('reboot: {} not supported on generic-worker'.format(test['reboot']))
+
     worker['max-run-time'] = test['max-run-time']
     worker['artifacts'] = artifacts
 
     # this list will get cleaned up / reduced / removed in bug 1354088
     if build_platform.startswith('macosx'):
         worker['env'] = {
             'IDLEIZER_DISABLE_SHUTDOWN': 'true',
             'LANG': 'en_US.UTF-8',
@@ -303,16 +309,17 @@ def mozharness_test_on_generic_worker(co
         ]
 
 
 @run_job_using('native-engine', 'mozharness-test', schema=mozharness_test_run_schema)
 def mozharness_test_on_native_engine(config, job, taskdesc):
     test = taskdesc['run']['test']
     mozharness = test['mozharness']
     worker = taskdesc['worker']
+    is_talos = test['suite'] == 'talos'
 
     build_platform = taskdesc['attributes']['build_platform']
     build_type = taskdesc['attributes']['build_type']
     target = 'firefox-{}.en-US.{}'.format(get_firefox_version(), 'mac') \
         if build_platform == 'macosx64' and build_type == 'opt' else 'target'
 
     installer_url = get_artifact_url('<build>', mozharness['build-artifact-name'])
     test_packages_url = get_artifact_url('<build>',
@@ -321,37 +328,43 @@ def mozharness_test_on_native_engine(con
                                       'public/build/mozharness.zip')
 
     worker['artifacts'] = [{
         'name': prefix.rstrip('/'),
         'path': path.rstrip('/'),
         'type': 'directory',
     } for (prefix, path) in ARTIFACTS]
 
-    worker['reboot'] = test['reboot']
-    worker['env'] = {
+    if test['reboot']:
+        worker['reboot'] = test['reboot']
+
+    worker['env'] = env = {
         'GECKO_HEAD_REPOSITORY': config.params['head_repository'],
         'GECKO_HEAD_REV': config.params['head_rev'],
         'MOZHARNESS_CONFIG': ' '.join(mozharness['config']),
         'MOZHARNESS_SCRIPT': mozharness['script'],
         'MOZHARNESS_URL': {'task-reference': mozharness_url},
         'MOZILLA_BUILD_URL': {'task-reference': installer_url},
         "MOZ_NO_REMOTE": '1',
         "NO_EM_RESTART": '1',
         "XPCOM_DEBUG_BREAK": 'warn',
         "NO_FAIL_ON_TEST_ERRORS": '1',
         "MOZ_HIDE_RESULTS_TABLE": '1',
         "MOZ_NODE_PATH": "/usr/local/bin/node",
     }
+    # talos tests don't need Xvfb
+    if is_talos:
+        env['NEED_XVFB'] = 'false'
 
-    worker['context'] = '{}/raw-file/{}/taskcluster/scripts/tester/test-macosx.sh'.format(
-        config.params['head_repository'], config.params['head_rev']
+    script = 'test-macosx.sh' if test['test-platform'].startswith('macosx') else 'test-linux.sh'
+    worker['context'] = '{}/raw-file/{}/taskcluster/scripts/tester/{}'.format(
+        config.params['head_repository'], config.params['head_rev'], script
     )
 
-    command = worker['command'] = ["./test-macosx.sh"]
+    command = worker['command'] = ["./{}".format(script)]
     if mozharness.get('no-read-buildbot-config'):
         command.append("--no-read-buildbot-config")
     command.extend([
         {"task-reference": "--installer-url=" + installer_url},
         {"task-reference": "--test-packages-url=" + test_packages_url},
     ])
     if mozharness.get('include-blob-upload-branch'):
         command.append('--blob-upload-branch=' + config.params['project'])
--- a/taskcluster/taskgraph/transforms/task.py
+++ b/taskcluster/taskgraph/transforms/task.py
@@ -309,17 +309,18 @@ task_description_schema = Schema({
     }, {
         Required('implementation'): 'native-engine',
 
         # A link for an executable to download
         Optional('context'): basestring,
 
         # Tells the worker whether machine should reboot
         # after the task is finished.
-        Optional('reboot'): bool,
+        Optional('reboot'):
+            Any('always', 'on-exception', 'on-failure'),
 
         # the command to run
         Optional('command'): [taskref_or_string],
 
         # environment variables
         Optional('env'): {basestring: taskref_or_string},
 
         # artifacts to extract from the task image after completion
@@ -749,19 +750,20 @@ def build_macosx_engine_payload(config, 
         'type': artifact['type'],
         'expires': task_def['expires'],
     }, worker.get('artifacts', []))
 
     task_def['payload'] = {
         'context': worker['context'],
         'command': worker['command'],
         'env': worker['env'],
-        'reboot': worker.get('reboot', False),
         'artifacts': artifacts,
     }
+    if worker.get('reboot'):
+        task_def['payload']['reboot'] = worker['reboot']
 
     if task.get('needs-sccache'):
         raise Exception('needs-sccache not supported in native-engine')
 
 
 @payload_builder('buildbot-bridge')
 def build_buildbot_bridge_payload(config, task, task_def):
     del task['extra']['treeherder']
--- a/taskcluster/taskgraph/transforms/tests.py
+++ b/taskcluster/taskgraph/transforms/tests.py
@@ -142,17 +142,17 @@ test_description_schema = Schema({
     # Whether the task requires loopback audio or video (whatever that may mean
     # on the platform)
     Required('loopback-audio', default=False): bool,
     Required('loopback-video', default=False): bool,
 
     # Whether the test can run using a software GL implementation on Linux
     # using the GL compositor. May not be used with "legacy" sized instances
     # due to poor LLVMPipe performance (bug 1296086).  Defaults to true for
-    # linux platforms and false otherwise
+    # unit tests on linux platforms and false otherwise
     Optional('allow-software-gl-layers'): bool,
 
     # The worker implementation for this test, as dictated by policy and by the
     # test platform.
     Optional('worker-implementation'): Any(
         'docker-worker',
         'native-engine',
         'generic-worker',
@@ -183,17 +183,18 @@ test_description_schema = Schema({
 
     # the exit status code that indicates the task should be retried
     Optional('retry-exit-status'): int,
 
     # Whether to perform a gecko checkout.
     Required('checkout', default=False): bool,
 
     # Wheter to perform a machine reboot after test is done
-    Optional('reboot', default=True): bool,
+    Optional('reboot', default=False):
+        Any(False, 'always', 'on-exception', 'on-failure'),
 
     # What to run
     Required('mozharness'): optionally_keyed_by(
         'test-platform', 'test-platform-phylum', {
             # the mozharness script used to run this task
             Required('script'): basestring,
 
             # the config files required for the task
@@ -323,18 +324,18 @@ def set_defaults(config, tests):
             # loopback-video is always true for Android, but false for other
             # platform phyla
             test['loopback-video'] = True
         else:
             # all non-android tests want to run the bits that require node
             test['mozharness']['set-moz-node-path'] = True
             test.setdefault('e10s', 'both')
 
-        # software-gl-layers is only meaningful on linux, where it defaults to True
-        if test['test-platform'].startswith('linux'):
+        # software-gl-layers is only meaningful on linux unittests, where it defaults to True
+        if test['test-platform'].startswith('linux') and test['suite'] != 'talos':
             test.setdefault('allow-software-gl-layers', True)
         else:
             test['allow-software-gl-layers'] = False
 
         # Enable WebRender by default on the QuantumRender test platform, since
         # the whole point of QuantumRender is to run with WebRender enabled.
         # If other *-qr test platforms are added they should also be checked for
         # here; currently linux64-qr is the only one.
@@ -351,16 +352,38 @@ def set_defaults(config, tests):
         test.setdefault('instance-size', 'default')
         test.setdefault('max-run-time', 3600)
-        test.setdefault('reboot', True)
+        test.setdefault('reboot', False)
         test['mozharness'].setdefault('extra-options', [])
         yield test
 
 
 @transforms.add
+def setup_talos(config, tests):
+    """Add options that are specific to talos jobs (identified by suite=talos)"""
+    for test in tests:
+        if test['suite'] != 'talos':
+            yield test
+            continue
+
+        extra_options = test.setdefault('mozharness', {}).setdefault('extra-options', [])
+        extra_options.append('--add-option')
+        extra_options.append('--webServer,localhost')
+        extra_options.append('--use-talos-json')
+
+        # Per https://bugzilla.mozilla.org/show_bug.cgi?id=1357753#c3, branch
+        # name is only required for try
+        if config.params['project'] == 'try':
+            extra_options.append('--branch-name')
+            extra_options.append('Try')
+
+        yield test
+
+
+@transforms.add
 def set_target(config, tests):
     for test in tests:
         build_platform = test['build-platform']
         if build_platform.startswith('macosx'):
             if build_platform.split('/')[1] == 'opt':
                 target = 'firefox-{}.en-US.{}.dmg'.format(
                     get_firefox_version(),
                     'mac',
@@ -410,17 +433,20 @@ def set_worker_implementation(config, te
     for test in tests:
         test_platform = test['test-platform']
         if test_platform.startswith('macosx'):
             if config.config['args'].taskcluster_worker:
                 test['worker-implementation'] = 'native-engine'
             else:
                 test['worker-implementation'] = 'generic-worker'
         elif test.get('suite', '') == 'talos':
-            test['worker-implementation'] = 'buildbot-bridge'
+            if config.config['args'].taskcluster_worker:
+                test['worker-implementation'] = 'native-engine'
+            else:
+                test['worker-implementation'] = 'buildbot-bridge'
         elif test_platform.startswith('win'):
             test['worker-implementation'] = 'generic-worker'
         else:
             test['worker-implementation'] = 'docker-worker'
 
         yield test
 
 
@@ -796,20 +822,24 @@ def make_job_description(config, tests):
             optimizations.append(['seta'])
 
         run = jobdesc['run'] = {}
         run['using'] = 'mozharness-test'
         run['test'] = test
         worker = jobdesc['worker'] = {}
         implementation = worker['implementation'] = test['worker-implementation']
 
+        # TODO: need some better way to express this...
         if implementation == 'buildbot-bridge':
             jobdesc['worker-type'] = 'buildbot-bridge/buildbot-bridge'
         elif implementation == 'native-engine':
-            jobdesc['worker-type'] = 'tc-worker-provisioner/gecko-t-osx-10-10'
+            if test['test-platform'].startswith('linux'):
+                jobdesc['worker-type'] = 'releng-hardware/gecko-t-linux-talos'
+            else:
+                jobdesc['worker-type'] = 'tc-worker-provisioner/gecko-t-osx-10-10'
         elif implementation == 'generic-worker':
             test_platform = test['test-platform'].split('/')[0]
             jobdesc['worker-type'] = WORKER_TYPE[test_platform]
         elif implementation == 'docker-worker' or implementation == 'docker-engine':
             jobdesc['worker-type'] = WORKER_TYPE[test['instance-size']]
             worker = jobdesc['worker']
             worker['docker-image'] = test['docker-image']
             worker['allow-ptrace'] = True  # required for all tests, for crashreporter