Bug 1065306 - Post: Handle utf-8 files in DotProperties. r=gps
author Nick Alexander <nalexander@mozilla.com>
Wed, 01 Oct 2014 23:23:30 -0700
changeset 208297 fa409031b7233ca464ce8bed2d485f7cb4ad2ddf
parent 208296 f2901a47e53d27bcda68a23795ca4ba41f0b346e
child 208298 e5c4d68db526a9ffc83c60fa60d371623da01fc7
push id 9113
push user nalexander@mozilla.com
push date Thu, 02 Oct 2014 06:23:47 +0000
treeherder fx-team@fa409031b723 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers gps
bugs 1065306
milestone 35.0a1
Bug 1065306 - Post: Handle utf-8 files in DotProperties. r=gps
python/mozbuild/mozbuild/dotproperties.py
python/mozbuild/mozbuild/test/data/bad.properties
python/mozbuild/mozbuild/test/data/valid.properties
python/mozbuild/mozbuild/test/test_dotproperties.py
--- a/python/mozbuild/mozbuild/dotproperties.py
+++ b/python/mozbuild/mozbuild/dotproperties.py
@@ -3,16 +3,17 @@
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 # This file contains utility functions for reading .properties files, like
 # region.properties.
 
 
 from __future__ import unicode_literals
 
+import codecs
 import re
 import os
 import sys
 
 if sys.version_info[0] == 3:
     str_type = str
 else:
     str_type = basestring
@@ -26,17 +27,17 @@ class DotProperties:
             self.update(file)
 
     def update(self, file):
         '''Updates properties from a file name or file-like object.
 
         Ignores empty lines and comment lines.'''
 
         if isinstance(file, str_type):
-            f = open(file, 'rt')
+            f = codecs.open(file, 'r', 'utf-8')
         else:
             f = file
 
         for l in f.readlines():
             line = l.strip()
             if not line or line.startswith('#'):
                 continue
             (k, v) = re.split('\s*=\s*', line, 1)
new file mode 100644
--- /dev/null
+++ b/python/mozbuild/mozbuild/test/data/bad.properties
@@ -0,0 +1,12 @@
+# A region.properties file with invalid unicode byte sequences.  The
+# sequences were cribbed from Markus Kuhn's "UTF-8 decoder capability
+# and stress test", available at
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+
+# 3.5  Impossible bytes                                                         |
+#                                                                               |
+# The following two bytes cannot appear in a correct UTF-8 string               |
+#                                                                               |
+# 3.5.1  fe = ""                                                               |
+# 3.5.2  ff = ""                                                               |
+# 3.5.3  fe fe ff ff = ""                                                   |
new file mode 100644
--- /dev/null
+++ b/python/mozbuild/mozbuild/test/data/valid.properties
@@ -0,0 +1,11 @@
+# A region.properties file with unicode characters.
+
+# Danish.
+# ####  ~~ Søren Munk Skrøder, sskroeder - 2009-05-30 @ #mozmae
+
+# Korean.
+A.title=한메일
+
+# Russian.
+list.0 = test
+list.1 = Яндекс
--- a/python/mozbuild/mozbuild/test/test_dotproperties.py
+++ b/python/mozbuild/mozbuild/test/test_dotproperties.py
@@ -1,32 +1,29 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+# -*- coding: utf-8 -*-
 
 from __future__ import unicode_literals
 
 import os
-import sys
 import unittest
 
 from StringIO import StringIO
 
+import mozpack.path as mozpath
+
 from mozbuild.dotproperties import (
     DotProperties,
 )
 
 from mozunit import (
     main,
 )
 
-if sys.version_info[0] == 3:
-    str_type = 'str'
-else:
-    str_type = 'unicode'
+test_data_path = mozpath.abspath(mozpath.dirname(__file__))
+test_data_path = mozpath.join(test_data_path, 'data')
 
 
 class TestDotProperties(unittest.TestCase):
     def test_get(self):
         contents = StringIO('''
 key=value
 ''')
         p = DotProperties(contents)
@@ -79,11 +76,40 @@ B.url=url B
         self.assertEqual(p.get_dict('missing'), {})
         self.assertEqual(p.get_dict('A'), {'title': 'title A'})
         self.assertEqual(p.get_dict('B'), {'title': 'title B', 'url': 'url B'})
         with self.assertRaises(ValueError):
             p.get_dict('A', required_keys=['title', 'url'])
         with self.assertRaises(ValueError):
             p.get_dict('missing', required_keys=['key'])
 
+    def test_unicode(self):
+        contents = StringIO('''
+# Danish.
+# ####  ~~ Søren Munk Skrøder, sskroeder - 2009-05-30 @ #mozmae
+
+# Korean.
+A.title=한메일
+
+# Russian.
+list.0 = test
+list.1 = Яндекс
+''')
+        p = DotProperties(contents)
+        self.assertEqual(p.get_dict('A'), {'title': '한메일'})
+        self.assertEqual(p.get_list('list'), ['test', 'Яндекс'])
+
+    def test_valid_unicode_from_file(self):
+        # The contents of valid.properties is identical to the contents of the
+        # test above.  This specifically exercises reading from a file.
+        p = DotProperties(os.path.join(test_data_path, 'valid.properties'))
+        self.assertEqual(p.get_dict('A'), {'title': '한메일'})
+        self.assertEqual(p.get_list('list'), ['test', 'Яндекс'])
+
+    def test_bad_unicode_from_file(self):
+        # The contents of bad.properties is not valid Unicode; see the comments
+        # in the file itself for details.
+        with self.assertRaises(UnicodeDecodeError):
+            DotProperties(os.path.join(test_data_path, 'bad.properties'))
+
 
 if __name__ == '__main__':
     main()