Bug 1471905: [taskgraph] Consistently include the cache digests of parent tasks in downstream cached tasks; r=dustin
author: Tom Prince <mozilla@hocat.ca>
Wed, 05 Dec 2018 02:15:56 +0000
changeset 508581 ac2e2c7315db3d47700dcc5ea095cd79ec55220e
parent 508580 38acabd3b5b871068948d649b822e8d8a2e583d8
child 508582 18faf33e3dc1d324dbc4566e18d4e20cf13b941f
push id: 1905
push user: ffxbld-merge
push date: Mon, 21 Jan 2019 12:33:13 +0000
treeherder: mozilla-release@c2fca1944d8c [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: dustin
bugs: 1471905
milestone: 65.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1471905: [taskgraph] Consistently include the cache digests of parent tasks in downstream cached tasks; r=dustin There are several kinds that cache tasks based on the inputs that go into the task. Historically, these inputs included the names of upstream tasks. This changes these tasks to include the digests of the upstream tasks instead. This also bumps the cache version of the docker and toolchain kinds, as every digest is changed for them. Differential Revision: https://phabricator.services.mozilla.com/D11949
taskcluster/ci/docker-image/kind.yml
taskcluster/ci/packages/kind.yml
taskcluster/ci/toolchain/kind.yml
taskcluster/docs/attributes.rst
taskcluster/taskgraph/transforms/cached_tasks.py
taskcluster/taskgraph/transforms/docker_image.py
taskcluster/taskgraph/transforms/job/debian_package.py
taskcluster/taskgraph/transforms/job/toolchain.py
taskcluster/taskgraph/util/cached_tasks.py
--- a/taskcluster/ci/docker-image/kind.yml
+++ b/taskcluster/ci/docker-image/kind.yml
@@ -4,16 +4,17 @@
 
 loader: taskgraph.loader.transform:loader
 
 kind-dependencies:
   - packages
 
 transforms:
   - taskgraph.transforms.docker_image:transforms
+  - taskgraph.transforms.cached_tasks:transforms
   - taskgraph.transforms.task:transforms
 
 # make a task for each docker-image we might want.  For the moment, since we
 # write artifacts for each, these are whitelisted, but ideally that will change
 # (to use subdirectory clones of the proper directory), at which point we can
 # generate tasks for every docker image in the directory, secure in the
 # knowledge that unnecessary images will be omitted from the target task graph
 jobs:
--- a/taskcluster/ci/packages/kind.yml
+++ b/taskcluster/ci/packages/kind.yml
@@ -2,16 +2,17 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 loader: taskgraph.loader.transform:loader
 
 transforms:
   - taskgraph.transforms.try_job:transforms
   - taskgraph.transforms.job:transforms
+  - taskgraph.transforms.cached_tasks:transforms
   - taskgraph.transforms.task:transforms
 
 job-defaults:
   treeherder:
     kind: build
     platform: packages/opt
     tier: 1
   worker-type: aws-provisioner-v1/gecko-{level}-b-linux
--- a/taskcluster/ci/toolchain/kind.yml
+++ b/taskcluster/ci/toolchain/kind.yml
@@ -6,16 +6,17 @@ loader: taskgraph.loader.transform:loade
 
 kind-dependencies:
    - fetch
 
 transforms:
    - taskgraph.transforms.try_job:transforms
    - taskgraph.transforms.use_toolchains:transforms
    - taskgraph.transforms.job:transforms
+   - taskgraph.transforms.cached_tasks:transforms
    - taskgraph.transforms.task:transforms
 
 job-defaults:
    run-on-projects: []
 
 jobs-from:
    - linux.yml
    - macosx.yml
--- a/taskcluster/docs/attributes.rst
+++ b/taskcluster/docs/attributes.rst
@@ -256,25 +256,28 @@ build kinds where the full crashsymbols 
 to True. The full symbol packages will then be generated and uploaded on
 release branches and on try.
 
 cron
 ====
 Indicates that a task is meant to be run via cron tasks, and should not be run
 on push.
 
-cache_digest
-============
-Some tasks generate artifacts that are cached between pushes. This is the unique string used
-to identify the current version of the artifacts. See :py:mod:`taskgraph.util.cached_task`.
+cached_task
+===========
+Some tasks generate artifacts that are cached between pushes. This is a
+dictionary with the type and name of the cache, and the unique string used to
+identify the current version of the artifacts. See :py:mod:`taskgraph.util.cached_task`.
 
-cache_type
-==========
-Some tasks generate artifacts that are cached between pushes. This is the type of cache that is
-used for the this task. See :py:mod:`taskgraph.util.cached_task`.
+.. code:: yaml
+
+   cached_task:
+       digest: 66dfc2204600b48d92a049b6a18b83972bb9a92f9504c06608a9c20eb4c9d8ae
+       name: debian7-base
+       type: docker-images.v2
 
 required_signoffs
 =================
 A list of release signoffs that this kind requires, should the release also
 require these signoffs. For example, ``mar-signing`` signoffs may be required
 by some releases in the future; for any releases that require ``mar-signing``
 signoffs, the kinds that also require that signoff are marked with this
 attribute.
new file mode 100644
--- /dev/null
+++ b/taskcluster/taskgraph/transforms/cached_tasks.py
@@ -0,0 +1,78 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import absolute_import, print_function, unicode_literals
+
+from collections import deque
+import taskgraph
+from taskgraph.transforms.base import TransformSequence
+from taskgraph.util.cached_tasks import add_optimization
+
+transforms = TransformSequence()
+
+
+def order_tasks(config, tasks):
+    """Iterate image tasks in an order where parent tasks come first."""
+    if config.kind == 'docker-image':
+        kind_prefix = 'build-docker-image-'
+    else:
+        kind_prefix = config.kind + '-'
+
+    pending = deque(tasks)
+    task_labels = {task['label'] for task in pending}
+    emitted = set()
+    while True:
+        try:
+            task = pending.popleft()
+        except IndexError:
+            break
+        parents = {
+            task for task in task.get('dependencies', {}).values()
+            if task.startswith(kind_prefix)
+        }
+        if parents and not emitted.issuperset(parents):
+            if not task_labels.issuperset(parents):
+                raise Exception('Missing parent tasks for {}: {}'.format(
+                    task['label'], set(parents) - task_labels))
+            pending.append(task)
+            continue
+        emitted.add(task['label'])
+        yield task
+
+
+def format_task_digest(cached_task):
+    return "/".join([
+        cached_task['type'],
+        cached_task['name'],
+        cached_task['digest'],
+    ])
+
+
+@transforms.add
+def cache_task(config, tasks):
+    if taskgraph.fast:
+        for task in tasks:
+            yield task
+        return
+
+    digests = {}
+    for task in config.kind_dependencies_tasks:
+        if 'cached_task' in task.attributes:
+            digests[task.label] = format_task_digest(task.attributes['cached_task'])
+
+    for task in order_tasks(config, tasks):
+        cache = task.pop('cache')
+        dependency_digests = []
+        for p in task.get('dependencies', {}).values():
+            if p in digests:
+                dependency_digests.append(digests[p])
+            else:
+                raise Exception('Cached task {} has uncached parent task: {}'.format(
+                    task['label'], p))
+        digest_data = cache['digest-data'] + sorted(dependency_digests)
+        add_optimization(config, task, cache_type=cache['type'],
+                         cache_name=cache['name'], digest_data=digest_data)
+        digests[task['label']] = format_task_digest(task['attributes']['cached_task'])
+
+        yield task
--- a/taskcluster/taskgraph/transforms/docker_image.py
+++ b/taskcluster/taskgraph/transforms/docker_image.py
@@ -10,17 +10,16 @@ import re
 from collections import deque
 import taskgraph
 from taskgraph.transforms.base import TransformSequence
 from taskgraph.transforms.task import _run_task_suffix
 from .. import GECKO
 from taskgraph.util.docker import (
     generate_context_hash,
 )
-from taskgraph.util.cached_tasks import add_optimization
 from taskgraph.util.schema import (
     Schema,
 )
 from voluptuous import (
     Optional,
     Required,
 )
 
@@ -75,25 +74,24 @@ def order_image_tasks(config, tasks):
             pending.append(task)
             continue
         emitted.add(task['name'])
         yield task
 
 
 @transforms.add
 def fill_template(config, tasks):
-    available_packages = {}
+    available_packages = set()
     for task in config.kind_dependencies_tasks:
         if task.kind != 'packages':
             continue
         name = task.label.replace('packages-', '')
-        available_packages[name] = task.attributes['cache_digest']
+        available_packages.add(name)
 
     context_hashes = {}
-    image_digests = {}
 
     for task in order_image_tasks(config, tasks):
         image_name = task.pop('name')
         job_symbol = task.pop('symbol')
         args = task.pop('args', {})
         definition = task.pop('definition', image_name)
         packages = task.pop('packages', [])
         parent = task.pop('parent', None)
@@ -117,19 +115,16 @@ def fill_template(config, tasks):
             context_path = os.path.join('taskcluster', 'docker', definition)
             context_hash = generate_context_hash(
                 GECKO, context_path, image_name, args)
         else:
             context_hash = '0'*40
         digest_data = [context_hash]
         context_hashes[image_name] = context_hash
 
-        if parent:
-            digest_data += [image_digests[parent]]
-
         description = 'Build the docker image {} for use by dependent tasks'.format(
             image_name)
 
         # Adjust the zstandard compression level based on the execution level.
         # We use faster compression for level 1 because we care more about
         # end-to-end times. We use slower/better compression for other levels
         # because images are read more often and it is worth the trade-off to
         # burn more CPU once to reduce image size.
@@ -218,31 +213,24 @@ def fill_template(config, tasks):
                 worker['env'][k] = {'task-reference': v}
             else:
                 worker['env'][k] = v
 
         if packages:
             deps = taskdesc.setdefault('dependencies', {})
             for p in sorted(packages):
                 deps[p] = 'packages-{}'.format(p)
-                digest_data.append(available_packages[p])
 
         if parent:
             deps = taskdesc.setdefault('dependencies', {})
             deps[parent] = 'build-docker-image-{}'.format(parent)
             worker['env']['DOCKER_IMAGE_PARENT_TASK'] = {
                 'task-reference': '<{}>'.format(parent),
             }
 
-        if len(digest_data) > 1:
-            kwargs = {'digest_data': digest_data}
-        else:
-            kwargs = {'digest': digest_data[0]}
-        add_optimization(
-            config, taskdesc,
-            cache_type="docker-images.v1",
-            cache_name=image_name,
-            **kwargs
-        )
-
-        image_digests[image_name] = taskdesc['attributes']['cache_digest']
+        if not taskgraph.fast:
+            taskdesc['cache'] = {
+                'type': 'docker-images.v2',
+                'name': image_name,
+                'digest-data': digest_data,
+            }
 
         yield taskdesc
--- a/taskcluster/taskgraph/transforms/job/debian_package.py
+++ b/taskcluster/taskgraph/transforms/job/debian_package.py
@@ -13,17 +13,17 @@ import re
 from taskgraph.util.schema import Schema
 from voluptuous import Any, Optional, Required
 
 from taskgraph.transforms.job import run_job_using
 from taskgraph.transforms.job.common import add_artifacts
 
 from taskgraph.util.hash import hash_path
 from taskgraph import GECKO
-from taskgraph.util.cached_tasks import add_optimization
+import taskgraph
 
 DSC_PACKAGE_RE = re.compile('.*(?=_)')
 SOURCE_PACKAGE_RE = re.compile('.*(?=[-_]\d)')
 
 source_definition = {
     Required('url'): basestring,
     Required('sha256'): basestring,
 }
@@ -207,30 +207,34 @@ def docker_worker_debian_package(config,
             unpack=unpack,
             adjust=adjust,
             artifacts='/tmp/artifacts',
             base_deps=' '.join(base_deps),
             resolver=resolver,
         )
     ]
 
-    # Use the command generated above as the base for the index hash.
-    # We rely on it not varying depending on the head_repository or head_rev.
-    data = list(worker['command'])
-    if 'patch' in run:
-        data.append(hash_path(os.path.join(GECKO, 'build', 'debian-packages', run['patch'])))
-
-    if docker_repo != 'debian':
-        data.append(docker_repo)
-
     if run.get('packages'):
         env = worker.setdefault('env', {})
         env['PACKAGES'] = {
             'task-reference': ' '.join('<{}>'.format(p)
                                        for p in run['packages'])
         }
         deps = taskdesc.setdefault('dependencies', {})
         for p in run['packages']:
             deps[p] = 'packages-{}'.format(p)
-            data.append(p)
+
+    # Use the command generated above as the base for the index hash.
+    # We rely on it not varying depending on the head_repository or head_rev.
+    digest_data = list(worker['command'])
+    if 'patch' in run:
+        digest_data.append(
+            hash_path(os.path.join(GECKO, 'build', 'debian-packages', run['patch'])))
 
-    add_optimization(config, taskdesc, cache_type='packages.v1',
-                     cache_name=name, digest_data=data)
+    if docker_repo != 'debian':
+        digest_data.append(docker_repo)
+
+    if not taskgraph.fast:
+        taskdesc['cache'] = {
+            'type': 'packages.v1',
+            'name': name,
+            'digest-data': digest_data
+        }
--- a/taskcluster/taskgraph/transforms/job/toolchain.py
+++ b/taskcluster/taskgraph/transforms/job/toolchain.py
@@ -16,21 +16,20 @@ from taskgraph.transforms.job import run
 from taskgraph.transforms.job.common import (
     docker_worker_add_artifacts,
     docker_worker_add_tooltool,
     generic_worker_hg_commands,
     support_vcs_checkout,
 )
 from taskgraph.util.hash import hash_paths
 from taskgraph import GECKO
-from taskgraph.util.cached_tasks import add_optimization
 import taskgraph
 
 
-CACHE_TYPE = 'toolchains.v2'
+CACHE_TYPE = 'toolchains.v3'
 
 toolchain_run_schema = Schema({
     Required('using'): 'toolchain-script',
 
     # The script (in taskcluster/scripts/misc) to run.
     # Python scripts are invoked with `mach python` so vendored libraries
     # are available.
     Required('script'): basestring,
@@ -78,34 +77,26 @@ def get_digest_data(config, run, taskdes
     # Tooltool manifest if any is defined:
     tooltool_manifest = taskdesc['worker']['env'].get('TOOLTOOL_MANIFEST')
     if tooltool_manifest:
         files.append(tooltool_manifest)
 
     # Accumulate dependency hashes for index generation.
     data = [hash_paths(GECKO, files)]
 
-    # If the task has dependencies, we need those dependencies to influence
-    # the index path. So take the digest from the files above, add the list
-    # of its dependencies, and hash the aggregate.
-    # If the task has no dependencies, just use the digest from above.
-    deps = taskdesc['dependencies']
-    if deps:
-        data.extend(sorted(deps.values()))
-
     # If the task uses an in-tree docker image, we want it to influence
     # the index path as well. Ideally, the content of the docker image itself
     # should have an influence, but at the moment, we can't get that
     # information here. So use the docker image name as a proxy. Not a lot of
     # changes to docker images actually have an impact on the resulting
     # toolchain artifact, so we'll just rely on such important changes to be
     # accompanied with a docker image name change.
     image = taskdesc['worker'].get('docker-image', {}).get('in-tree')
     if image:
-        data.extend(image)
+        data.append(image)
 
     # Likewise script arguments should influence the index.
     args = run.get('arguments')
     if args:
         data.extend(args)
     return data
 
 
@@ -178,22 +169,21 @@ def docker_worker_toolchain(config, job,
 
     attributes = taskdesc.setdefault('attributes', {})
     attributes['toolchain-artifact'] = run['toolchain-artifact']
     if 'toolchain-alias' in run:
         attributes['toolchain-alias'] = run['toolchain-alias']
 
     if not taskgraph.fast:
         name = taskdesc['label'].replace('{}-'.format(config.kind), '', 1)
-        add_optimization(
-            config, taskdesc,
-            cache_type=CACHE_TYPE,
-            cache_name=name,
-            digest_data=get_digest_data(config, run, taskdesc),
-        )
+        taskdesc['cache'] = {
+            'type': CACHE_TYPE,
+            'name': name,
+            'digest-data': get_digest_data(config, run, taskdesc),
+        }
 
 
 @run_job_using("generic-worker", "toolchain-script",
                schema=toolchain_run_schema, defaults=toolchain_defaults)
 def windows_toolchain(config, job, taskdesc):
     run = job['run']
 
     worker = taskdesc['worker']
@@ -237,14 +227,13 @@ def windows_toolchain(config, job, taskd
 
     attributes = taskdesc.setdefault('attributes', {})
     attributes['toolchain-artifact'] = run['toolchain-artifact']
     if 'toolchain-alias' in run:
         attributes['toolchain-alias'] = run['toolchain-alias']
 
     if not taskgraph.fast:
         name = taskdesc['label'].replace('{}-'.format(config.kind), '', 1)
-        add_optimization(
-            config, taskdesc,
-            cache_type=CACHE_TYPE,
-            cache_name=name,
-            digest_data=get_digest_data(config, run, taskdesc),
-        )
+        taskdesc['cache'] = {
+            'type': CACHE_TYPE,
+            'name': name,
+            'digest-data': get_digest_data(config, run, taskdesc),
+        }
--- a/taskcluster/taskgraph/util/cached_tasks.py
+++ b/taskcluster/taskgraph/util/cached_tasks.py
@@ -63,10 +63,13 @@ def add_optimization(config, taskdesc, c
     # ... and add some extra routes for humans
     subs['build_date_long'] = time.strftime("%Y.%m.%d.%Y%m%d%H%M%S",
                                             time.gmtime(config.params['build_date']))
     taskdesc['routes'].extend([
         'index.{}'.format(route.format(**subs))
         for route in EXTRA_CACHE_INDEXES
     ])
 
-    taskdesc['attributes']['cache_digest'] = digest
-    taskdesc['attributes']['cache_type'] = cache_type
+    taskdesc['attributes']['cached_task'] = {
+        'type': cache_type,
+        'name': cache_name,
+        'digest': digest,
+    }