Bug 1318200 - Introduce task graph filtering; r?dustin draft
authorGregory Szorc <gps@mozilla.com>
Thu, 17 Nov 2016 15:53:30 -0800
changeset 441327 4e499341a65666cf20149c0dd0d95f1766f6a59f
parent 440726 36fd26a7c3ad4ca143549ffe0377637046060bb8
child 441328 06242e612265680a1dbc28f47b126146b1be87cc
child 441389 3e9123ffce5ba99efc1d5a0375b9e53540ac9182
child 441390 98a31bdc375aa153c3d78a062fa0a54d56666c9b
push id36394
push userbmo:gps@mozilla.com
push dateFri, 18 Nov 2016 18:38:31 +0000
Bug 1318200 - Introduce task graph filtering; r?dustin Previously, we ran a single "target task" function to mutate the full task graph into a subset based on input parameters (try syntax, repository being built for, etc.). This concept is useful. But the implementation was limiting because we could only have a single "target tasks" function. This commit introduces the concept of "filters." They conceptually do the same thing as "target tasks methods" but you can run more than one of them. Filters are simply functions that examine an input graph+parameters and emit nodes that should be retained. Filters, like target tasks methods, are defined via decorated functions in a module. TaskGraphGenerator has been converted to use filters. The list of filters to run can be specified in the parameters dict passed into TaskGraphGenerator. A default filter list is provided in decision.py. The intent is to eventually convert target tasks to filters. Until that happens, we always run the registered target tasks method via a filter proxy function. No new tests have been added because we don't yet have any functionality relying explicitly on filters. Tests will be added in a subsequent commit once we add a new filter. While I was here, I also snuck in some logging on the size of the graphs. MozReview-Commit-ID: ERn2hIYbMRp
--- a/taskcluster/docs/parameters.rst
+++ b/taskcluster/docs/parameters.rst
@@ -84,14 +84,19 @@ Target Set
 The "target set" is the set of task labels which must be included in a task
 graph.  The task graph generation process will include any tasks required by
 those in the target set, recursively.  In a decision task, this set can be
 specified programmatically using one of a variety of methods (e.g., parsing try
 syntax or reading a project-specific configuration file).
+    List of filter functions (from ``taskcluster/taskgraph/filter_tasks.py``) to
+    apply. This is usually defined internally, as filters are typically
+    global.
     The method to use to determine the target task set.  This is the suffix of
     one of the functions in ``taskcluster/taskgraph/target_tasks.py``.
    If true, then target tasks are eligible for optimization.
--- a/taskcluster/docs/taskgraph.rst
+++ b/taskcluster/docs/taskgraph.rst
@@ -86,18 +86,18 @@ decision task does except the command-li
 Graph Generation
 Graph generation, as run via ``mach taskgraph decision``, proceeds as follows:
 #. For all kinds, generate all tasks.  The result is the "full task set"
 #. Create dependency links between tasks using kind-specific mechanisms.  The
    result is the "full task graph".
-#. Select the target tasks (based on try syntax or a tree-specific
-   specification).  The result is the "target task set".
+#. Filter the full task set through a series of filters (such as try syntax,
+   tree-specific specifications, etc.). The result is the "target task set".
 #. Based on the full task graph, calculate the transitive closure of the target
    task set.  That is, the target tasks and all requirements of those tasks.
    The result is the "target task graph".
 #. Optimize the target task graph based on kind-specific optimization methods.
    The result is the "optimized task graph" with fewer nodes than the target
    task graph.
 #. Create tasks for all tasks in the optimized task graph.
--- a/taskcluster/taskgraph/decision.py
+++ b/taskcluster/taskgraph/decision.py
@@ -118,16 +118,22 @@ def get_decision_parameters(options):
     ] if n in options}
+    # Define default filter list, as most configurations shouldn't need
+    # custom filters.
+    parameters['filters'] = [
+        'target_tasks_method',
+    ]
     # owner must be an email, but sometimes (e.g., for ffxbld) it is not, in which
     # case, fake it
     if '@' not in parameters['owner']:
         parameters['owner'] += '@noreply.mozilla.org'
     # use the pushdate as build_date if given, else use current time
     parameters['build_date'] = parameters['pushdate'] or int(time.time())
     # moz_build_date is the build identifier based on build_date
new file mode 100644
--- /dev/null
+++ b/taskcluster/taskgraph/filter_tasks.py
@@ -0,0 +1,32 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+from __future__ import absolute_import, unicode_literals
+from . import (
+    target_tasks,
+filter_task_functions = {}
+def filter_task(name):
+    """Decorator to declare a task filter function, registered under ``name``."""
+    def wrap(func):
+        filter_task_functions[name] = func
+        return func
+    return wrap
+def filter_target_tasks(graph, parameters):
+    """Proxy filter to use legacy target tasks code.
+    This should go away once target_tasks are converted to filters.
+    """
+    attr = parameters.get('target_tasks_method', 'all_tasks')
+    fn = target_tasks.get_method(attr)
+    return fn(graph, parameters)
--- a/taskcluster/taskgraph/generator.py
+++ b/taskcluster/taskgraph/generator.py
@@ -2,18 +2,18 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 from __future__ import absolute_import, print_function, unicode_literals
 import logging
 import os
 import yaml
+from . import filter_tasks
 from .graph import Graph
-from .target_tasks import get_method
 from .taskgraph import TaskGraph
 from .optimize import optimize_task_graph
 from .util.python_path import find_object
 logger = logging.getLogger(__name__)
 class Kind(object):
@@ -56,18 +56,23 @@ class TaskGraphGenerator(object):
         @param root_dir: root directory, with subdirectories for each kind
         @param parameters: parameters for this task-graph generation
         @type parameters: dict
         self.root_dir = root_dir
         self.parameters = parameters
-        target_tasks_method = parameters.get('target_tasks_method', 'all_tasks')
-        self.target_tasks_method = get_method(target_tasks_method)
+        filters = parameters.get('filters', [])
+        # Always add legacy target tasks method until we deprecate that API.
+        if 'target_tasks_method' not in filters:
+            filters.insert(0, 'target_tasks_method')
+        self.filters = [filter_tasks.filter_task_functions[f] for f in filters]
         # this can be set up until the time the target task set is generated;
         # it defaults to parameters['target_tasks']
         self._target_tasks = parameters.get('target_tasks')
         # start the generator
         self._run = self._run()
         self._run_results = {}
@@ -175,23 +180,34 @@ class TaskGraphGenerator(object):
         logger.info("Generating full task graph")
         edges = set()
         for t in full_task_set:
             for dep, depname in t.get_dependencies(full_task_set):
                 edges.add((t.label, dep, depname))
         full_task_graph = TaskGraph(all_tasks,
                                     Graph(full_task_set.graph.nodes, edges))
+        logger.info("Full task graph contains %d tasks and %d dependencies" % (
+            len(full_task_set.graph.nodes), len(edges)))
         yield 'full_task_graph', full_task_graph
         logger.info("Generating target task set")
-        target_tasks = set(self.target_tasks_method(full_task_graph, self.parameters))
-        target_task_set = TaskGraph(
-            {l: all_tasks[l] for l in target_tasks},
-            Graph(target_tasks, set()))
+        target_task_set = TaskGraph(dict(all_tasks),
+                                    Graph(set(all_tasks.keys()), set()))
+        for fltr in self.filters:
+            old_len = len(target_task_set.graph.nodes)
+            target_tasks = set(fltr(target_task_set, self.parameters))
+            target_task_set = TaskGraph(
+                {l: all_tasks[l] for l in target_tasks},
+                Graph(target_tasks, set()))
+            logger.info('Filter %s pruned %d tasks (%d remain)' % (
+                fltr.__name__,
+                old_len - len(target_tasks),
+                len(target_tasks)))
         yield 'target_task_set', target_task_set
         logger.info("Generating target task graph")
         target_graph = full_task_graph.graph.transitive_closure(target_tasks)
         target_task_graph = TaskGraph(
             {l: all_tasks[l] for l in target_graph.nodes},
         yield 'target_task_graph', target_task_graph
--- a/taskcluster/taskgraph/parameters.py
+++ b/taskcluster/taskgraph/parameters.py
@@ -9,16 +9,17 @@ from __future__ import absolute_import, 
 import json
 import yaml
 from mozbuild.util import ReadOnlyDict
 # Please keep this list sorted and in sync with taskcluster/docs/parameters.rst
+    'filters',