Bug 1295486 - Decode YAML files to UTF-8 at read time; r=dustin
☠☠ backed out by 0944984daec9 ☠ ☠
author: Gregory Szorc <gps@mozilla.com>
Thu, 18 Aug 2016 08:58:59 -0700
changeset 309957 7a3a65ff58653cfa96f39c2ab4e458111602b92c
parent 309956 f2ea401ab10ce254c22d7ec6ec715b55fbb73998
child 309958 213a4986ccfd71f04f822e56b1a9ad9505ed5e98
push id: 30575
push user: ryanvm@gmail.com
push date: Fri, 19 Aug 2016 13:46:06 +0000
treeherder: mozilla-central@3da4d64410c0 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: dustin
bugs: 1295486
milestone: 51.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1295486 - Decode YAML files to UTF-8 at read time; r=dustin

Before, we'd open files and feed bytes to yaml.load(). When a str is fed to yaml.load(), it attempts to guess the encoding. It defaults to UTF-8 unless somebody set us up the BOM. This is probably OK. Except if the file isn't valid UTF-8, the exception will be raised in the bowels of YAML parsing and it may not be obvious the failure is due to invalid UTF-8 input versus say Python str/unicode coercion foo.

We change all call sites that load YAML from a file to use codecs.open() to open the file in UTF-8 and perform UTF-8 decoding/validation at file read time. This should make any UTF-8 failures more obvious. Furthermore, it reinforces that our YAML files are UTF-8 and not some other encoding.

I discovered this issue as part of trying to get emoji symbols to render on Treeherder. Unfortunately, it appears pyyaml detects many emoji as unprintable characters and refuses to load them. This makes me sad and makes me want to abandon pyyaml/YAML in favor of something that supports emoji :P

MozReview-Commit-ID: AOvAruZFfnK
taskcluster/taskgraph/generator.py
taskcluster/taskgraph/parameters.py
taskcluster/taskgraph/task/test.py
taskcluster/taskgraph/task/transform.py
taskcluster/taskgraph/util/templates.py
--- a/taskcluster/taskgraph/generator.py
+++ b/taskcluster/taskgraph/generator.py
@@ -1,13 +1,15 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 from __future__ import absolute_import, print_function, unicode_literals
+
+import codecs
 import logging
 import os
 import yaml
 
 from .graph import Graph
 from .taskgraph import TaskGraph
 from .optimize import optimize_task_graph
 from .util.python_path import find_object
@@ -136,17 +138,17 @@ class TaskGraphGenerator(object):
         for path in os.listdir(self.root_dir):
             path = os.path.join(self.root_dir, path)
             if not os.path.isdir(path):
                 continue
             kind_name = os.path.basename(path)
             logger.debug("loading kind `{}` from `{}`".format(kind_name, path))
 
             kind_yml = os.path.join(path, 'kind.yml')
-            with open(kind_yml) as f:
+            with codecs.open(kind_yml, 'rb', 'utf-8') as f:
                 config = yaml.load(f)
 
             yield Kind(kind_name, path, config)
 
     def _run(self):
         logger.info("Loading kinds")
         # put the kinds into a graph and sort topologically so that kinds are loaded
         # in post-order
--- a/taskcluster/taskgraph/parameters.py
+++ b/taskcluster/taskgraph/parameters.py
@@ -1,16 +1,17 @@
 # -*- coding: utf-8 -*-
 
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 from __future__ import absolute_import, print_function, unicode_literals
 
+import codecs
 import json
 import yaml
 from mozbuild.util import ReadOnlyDict
 
 
 class Parameters(ReadOnlyDict):
     """An immutable dictionary with nicer KeyError messages on failure"""
     def __getitem__(self, k):
@@ -22,15 +23,15 @@ class Parameters(ReadOnlyDict):
 
 def load_parameters_file(options):
     """
     Load parameters from the --parameters option
     """
     filename = options['parameters']
     if not filename:
         return Parameters()
-    with open(filename) as f:
+    with codecs.open(filename, 'rb', 'utf-8') as f:
         if filename.endswith('.yml'):
             return Parameters(**yaml.safe_load(f))
         elif filename.endswith('.json'):
             return Parameters(**json.load(f))
         else:
             raise TypeError("Parameters file `{}` is not JSON or YAML".format(filename))
--- a/taskcluster/taskgraph/task/test.py
+++ b/taskcluster/taskgraph/task/test.py
@@ -1,14 +1,15 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 from __future__ import absolute_import, print_function, unicode_literals
 
+import codecs
 import copy
 import logging
 import os
 import yaml
 
 from . import base
 from taskgraph.util.python_path import find_object
 from taskgraph.transforms.base import TransformSequence, TransformConfig
@@ -156,10 +157,10 @@ class TestTask(base.Task):
 
     def optimize(self):
         return False, None
 
 
 def load_yaml(path, name):
     """Convenience method to load a YAML file in the kind directory"""
     filename = os.path.join(path, name)
-    with open(filename, "rb") as f:
+    with codecs.open(filename, 'rb', 'utf-8') as f:
         return yaml.load(f)
--- a/taskcluster/taskgraph/task/transform.py
+++ b/taskcluster/taskgraph/task/transform.py
@@ -1,14 +1,15 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 from __future__ import absolute_import, print_function, unicode_literals
 
+import codecs
 import logging
 import os
 import yaml
 
 from . import base
 from ..util.python_path import find_object
 from ..transforms.base import TransformSequence, TransformConfig
 
@@ -72,10 +73,10 @@ class TransformTask(base.Task):
 
     def optimize(self):
         return False, None
 
 
 def load_yaml(path, name):
     """Convenience method to load a YAML file in the kind directory"""
     filename = os.path.join(path, name)
-    with open(filename, "rb") as f:
+    with codecs.open(filename, 'rb', 'utf-8') as f:
         return yaml.load(f)
--- a/taskcluster/taskgraph/util/templates.py
+++ b/taskcluster/taskgraph/util/templates.py
@@ -1,8 +1,9 @@
+import codecs
 import os
 
 import pystache
 import yaml
 
 # Key used in template inheritance...
 INHERITS_KEY = '$inherits'
 
@@ -123,10 +124,16 @@ class Templates():
         if not path:
             raise TemplatesException('path is required')
 
         path = self.resolve_path(path)
 
         if not os.path.isfile(path):
             raise TemplatesException('"{}" is not a file'.format(path))
 
-        content = open(path).read()
+        # pystache.render() converts str to unicode. So just feed it a
+        # unicode so it doesn't have to guess the encoding. By verifying
+        # the file is UTF-8 at read time, we also make tracebacks easier
+        # to debug since it is obvious the failure is due to the file content
+        # and not a Python str/unicode issue.
+        with codecs.open(path, 'rb', 'utf-8') as fh:
+            content = fh.read()
         return self.render(path, content, parameters, seen)