taskcluster/taskgraph/util/templates.py
author Gregory Szorc <gps@mozilla.com>
Thu, 18 Aug 2016 08:58:59 -0700
changeset 413298 84dfa535f53ae5ec7a03a318e945684a2f8ca519
parent 413222 ddce0af84cbd65950a71e9851e086f3e0fc0050b
permissions -rw-r--r--
Bug 1295486 - Decode YAML files to UTF-8 at read time; r=dustin Before, we'd open files and feed bytes to yaml.load(). When a str is fed to yaml.load(), it attempts to guess the encoding. It defaults to UTF-8 unless somebody set us up the BOM. This is probably OK. Except if the file isn't valid UTF-8, the exception will be raised in the bowels of YAML parsing and it may not be obvious the failure is due to invalid UTF-8 input versus say Python str/unicode coercion foo. We change all call sites that load YAML from a file to use codecs.open() to open the file in UTF-8 and perform UTF-8 decoding/validation at file read time. This should make any UTF-8 failures more obvious. Furthermore, it reinforces that our YAML files are UTF-8 and not some other encoding. I discovered this issue as part of trying to get emoji symbols to render on Treeherder. Unfortunately, it appears pyyaml detects many emoji as unprintable characters and refuses to load them. This makes me sad and makes me want to abandon pyyaml/YAML in favor of something that supports emoji :P MozReview-Commit-ID: AOvAruZFfnK

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from __future__ import absolute_import, print_function, unicode_literals

import codecs
import os

import pystache
import yaml
import copy

# Top-level key of a parsed yaml template that triggers inheritance. Its value
# is read with .get('from') (parent template path) and .get('variables')
# (parameters used to render the parent).
INHERITS_KEY = '$inherits'


def merge_to(source, dest):
    '''
    Merge dict and arrays (override scalar values)

    Keys from source override keys from dest, and elements from lists in source
    are appended to lists in dest.  Nested dicts present on both sides are
    merged recursively.  When the two values for a key have different types
    (including the key being absent from dest) the source value wins.

    Every value taken from source is deep-copied before being stored in dest,
    so dest never shares mutable state with source: later in-place merges
    into dest cannot leak back into source.

    :param dict source: to copy from (never modified)
    :param dict dest: to copy to (modified in place)
    :returns: dest, to allow chaining
    '''

    for key, value in source.items():
        existing = dest.get(key)

        # Containers are only combined when both sides have exactly the same
        # type; anything else (scalars, mismatching or missing values) is
        # overridden below.
        if type(value) is type(existing):
            if isinstance(value, dict):
                merge_to(value, existing)
                continue

            if isinstance(value, list):
                # Build a new list instead of extending in place, and copy the
                # appended elements so they are not aliased with source.
                dest[key] = existing + copy.deepcopy(value)
                continue

        # Copy rather than alias: dest may be merged into again later.
        dest[key] = copy.deepcopy(value)

    return dest


def merge(*objects):
    '''
    Combine several objects into a new one using the merge_to semantics.

    Objects are applied left to right, so later objects take precedence over
    earlier ones.  In inheritance terms, list "parents" first and "children"
    last.

    The first object is deep-copied before anything is merged into it, so it
    is never modified; each following object is handed to merge_to as the
    source of the merge.
    '''
    merged = copy.deepcopy(objects[0])
    for overlay in objects[1:]:
        merged = merge_to(overlay, merged)
    return merged


class TemplatesException(Exception):
    '''
    Error raised by Templates for an invalid root or path and for failures
    while expanding template inheritance.
    '''


class Templates(object):
    '''
    The taskcluster integration makes heavy use of yaml to describe tasks;
    this class handles the loading/rendering.

    A template is read as UTF-8, rendered with mustache (pystache) and then
    parsed as yaml.  If the parsed document is a dict containing the
    '$inherits' key, it is expanded on top of the parent template it names.
    '''

    def __init__(self, root):
        '''
        Initialize the template render.

        :param str root: Root path where to load yaml files.
        :raises TemplatesException: if root is empty or is not a directory.
        '''
        if not root:
            raise TemplatesException('Root is required')

        if not os.path.isdir(root):
            raise TemplatesException('Root must be a directory')

        self.root = root

    def _inherits(self, path, obj, properties, seen):
        '''
        Expand the '$inherits' directive of an already parsed template.

        :param str path: Resolved path of the template being expanded.
        :param dict obj: Parsed template; its '$inherits' key is removed.
        :param dict properties: Parameters the template was rendered with
            (may be None).  They override the blueprint's own 'variables'.
        :param set seen: Resolved paths already visited in this chain
            (updated in place).
        :returns: The rendered parent with what is left of obj merged over it.
        :raises TemplatesException: if no parent is named, if the parent was
            already visited, or if loading the parent fails.
        '''
        blueprint = obj.pop(INHERITS_KEY)
        seen.add(path)

        # Validate before resolving: resolve_path() never returns a falsy
        # value, so a missing 'from' has to be detected on the raw value.
        parent = blueprint.get('from')
        if not parent:
            msg = '"{}" inheritance template missing'.format(path)
            raise TemplatesException(msg)

        # Resolve the path here so we can detect circular references.
        template = self.resolve_path(parent)
        if template in seen:
            msg = 'Error while handling "{}" in "{}" circular template ' + \
                  'inheritance seen \n  {}'
            raise TemplatesException(msg.format(path, template, seen))

        # 'variables' may be absent or parsed as null; work on a copy so the
        # parsed blueprint itself is left alone.
        variables = dict(blueprint.get('variables') or {})

        # Passed parameters override anything in the task itself.
        variables.update(properties or {})

        try:
            # load() resolves relative to root itself, so hand it the
            # unresolved parent path; passing the resolved one would apply
            # root twice whenever root is a relative path.
            out = self.load(parent, variables, seen)
        except TemplatesException as e:
            msg = 'Error expanding parent ("{}") of "{}" original error {}'
            raise TemplatesException(msg.format(template, path, str(e)))

        # Anything left in obj is merged into final results (and overrides)
        return merge_to(obj, out)

    def render(self, path, content, parameters, seen):
        '''
        Renders a given yaml string.

        :param str path:  used to prevent infinite recursion in inheritance.
        :param str content: Of yaml file.
        :param dict parameters: For mustache templates.
        :param set seen: Seen files (used for inheritance)
        :returns: The parsed yaml document, with inheritance expanded when
            the document is a dict containing the '$inherits' key.
        '''
        content = pystache.render(content, parameters)
        # NOTE(review): yaml.load() with the default loader can construct
        # arbitrary Python objects.  Presumably acceptable because templates
        # are trusted files under root -- confirm before feeding it anything
        # else, and consider yaml.safe_load().
        result = yaml.load(content)

        # In addition to the usual template logic done by mustache we also
        # handle special '$inherits' dict keys.
        if isinstance(result, dict) and INHERITS_KEY in result:
            return self._inherits(path, result, parameters, seen)

        return result

    def resolve_path(self, path):
        '''
        Return path joined onto the root directory.

        :param str path: Location of a yaml file, relative to root.
        '''
        return os.path.join(self.root, path)

    def load(self, path, parameters=None, seen=None):
        '''
        Load and render the given yaml path.

        :param str path: Location of yaml file to load (relative to root).
        :param dict parameters: To template yaml file with.
        :param set seen: Resolved paths already visited (used for
            inheritance); a fresh set is created when omitted.
        :returns: The rendered and parsed template.
        :raises TemplatesException: if path is empty or is not a file.
        '''
        # Test against None so an explicitly passed (even empty) set is the
        # one that gets updated.
        if seen is None:
            seen = set()

        if not path:
            raise TemplatesException('path is required')

        path = self.resolve_path(path)

        if not os.path.isfile(path):
            raise TemplatesException('"{}" is not a file'.format(path))

        # pystache.render() converts str to unicode. So just feed it a
        # unicode so it doesn't have to guess the encoding. By verifying
        # the file is UTF-8 at read time, we also make tracebacks saner.
        with codecs.open(path, 'rb', 'utf-8') as fh:
            content = fh.read()
        return self.render(path, content, parameters, seen)