Bug 940103 - Import compare-locales in the tree. r=gps
authorAxel Hecht <axel@pike.org>
Wed, 18 Mar 2015 18:34:15 +0100
changeset 264199 153252a2e6bc742281997365fa0005316d7c558e
parent 264198 3e26bf9400c82b2c3187cd7666c4b2cc445a6c94
child 264200 84b451463f065af1643e310322e6665f6fbf5983
push id4718
push userraliiev@mozilla.com
push dateMon, 11 May 2015 18:39:53 +0000
treeherdermozilla-beta@c20c4ef55f08 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersgps
bugs940103
milestone39.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 940103 - Import compare-locales in the tree. r=gps This is http://hg.mozilla.org/l10n/compare-locales/file/48445f53a274 in the upstream repo. To review future updates, using the github mirror will be easier, you can check https://github.com/Pike/compare-locales/compare/873e557...master for future commits.
python/compare-locales/compare_locales/__init__.py
python/compare-locales/compare_locales/checks.py
python/compare-locales/compare_locales/commands.py
python/compare-locales/compare_locales/compare.py
python/compare-locales/compare_locales/parser.py
python/compare-locales/compare_locales/paths.py
python/compare-locales/compare_locales/tests/__init__.py
python/compare-locales/compare_locales/tests/data/bug121341.properties
python/compare-locales/compare_locales/tests/data/test.properties
python/compare-locales/compare_locales/tests/data/triple-license.dtd
python/compare-locales/compare_locales/tests/test_checks.py
python/compare-locales/compare_locales/tests/test_dtd.py
python/compare-locales/compare_locales/tests/test_ini.py
python/compare-locales/compare_locales/tests/test_merge.py
python/compare-locales/compare_locales/tests/test_properties.py
python/compare-locales/compare_locales/tests/test_util.py
python/compare-locales/compare_locales/tests/test_webapps.py
python/compare-locales/compare_locales/util.py
python/compare-locales/compare_locales/webapps.py
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/__init__.py
@@ -0,0 +1,1 @@
+version = "0.10.1a"  # version of the imported compare-locales package
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/checks.py
@@ -0,0 +1,420 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import re
+from difflib import SequenceMatcher
+from xml import sax
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
+
+from compare_locales.parser import DTDParser, PropertiesParser
+
+
+class Checker(object):
+    '''Abstract class to implement checks per file type.
+    '''
+    # compiled regex matched against File.file; set by subclasses
+    pattern = None
+
+    @classmethod
+    def use(cls, file):
+        '''Return a truthy match object if this checker handles `file`.'''
+        return cls.pattern.match(file.file)
+
+    def check(self, refEnt, l10nEnt):
+        '''Given the reference and localized Entities, performs checks.
+
+        This is a generator yielding tuples of
+        - "warning" or "error", depending on what should be reported,
+        - tuple of line, column info for the error within the string
+        - description string to be shown in the report
+        - category name of the check (e.g. "printf", "xmlparse")
+        '''
+        if True:
+            # the guarded raise keeps this method a generator: the yield
+            # below is unreachable, but its presence makes check() iterable
+            raise NotImplementedError("Need to subclass")
+        yield ("error", (0, 0), "This is an example error", "example")
+
+
+class PrintfException(Exception):
+    '''Raised by getPrintfSpecs for malformed printf-style values.'''
+    def __init__(self, msg, pos):
+        self.pos = pos  # character offset of the problem in the value
+        self.msg = msg  # human-readable description for the report
+
+
+class PropertiesChecker(Checker):
+    '''Tests to run on .properties files.
+
+    Covers printf-style format specifiers, PluralForm.jsm-style #1
+    variables, and escape sequences.
+    '''
+    pattern = re.compile('.*\.properties$')
+    # One printf-style conversion. 'good' is None for a stray '%' and '%'
+    # for an escaped '%%'; otherwise the spec splits into 'number'
+    # (ordered argument), 'width', 'prec' and 'spec' (conversion char).
+    printf = re.compile(r'%(?P<good>%|'
+                        r'(?:(?P<number>[1-9][0-9]*)\$)?'
+                        r'(?P<width>\*|[0-9]+)?'
+                        r'(?P<prec>\.(?:\*|[0-9]+)?)?'
+                        r'(?P<spec>[duxXosScpfg]))?')
+
+    def check(self, refEnt, l10nEnt):
+        '''Test for the different variable formats.
+        '''
+        refValue, l10nValue = refEnt.val, l10nEnt.val
+        refSpecs = None
+        # check for PluralForm.jsm stuff, should have the docs in the
+        # comment
+        if 'Localization_and_Plurals' in refEnt.pre_comment:
+            # For plurals, common variable pattern is #1. Try that.
+            pats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)',
+                                                            refValue))
+            if len(pats) == 0:
+                return
+            lpats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)',
+                                                             l10nValue))
+            if pats - lpats:
+                yield ('warning', 0, 'not all variables used in l10n',
+                       'plural')
+                return
+            if lpats - pats:
+                yield ('error', 0, 'unreplaced variables in l10n',
+                       'plural')
+                return
+            return
+        # check for lost escapes
+        raw_val = l10nEnt.raw_val
+        for m in PropertiesParser.escape.finditer(raw_val):
+            if m.group('single') and \
+               m.group('single') not in PropertiesParser.known_escapes:
+                yield ('warning', m.start(),
+                       'unknown escape sequence, \\' + m.group('single'),
+                       'escape')
+        try:
+            refSpecs = self.getPrintfSpecs(refValue)
+        except PrintfException:
+            # reference isn't printf-formatted, nothing to compare against
+            refSpecs = []
+        if refSpecs:
+            for t in self.checkPrintf(refSpecs, l10nValue):
+                yield t
+            return
+
+    def checkPrintf(self, refSpecs, l10nValue):
+        '''Compare the printf specs of l10nValue against refSpecs.
+
+        Missing trailing arguments only yield a warning; any other
+        mismatch yields an error.
+        '''
+        try:
+            l10nSpecs = self.getPrintfSpecs(l10nValue)
+        except PrintfException, e:
+            yield ('error', e.pos, e.msg, 'printf')
+            return
+        if refSpecs != l10nSpecs:
+            sm = SequenceMatcher()
+            sm.set_seqs(refSpecs, l10nSpecs)
+            msgs = []
+            warn = None
+            for action, i1, i2, j1, j2 in sm.get_opcodes():
+                if action == 'equal':
+                    continue
+                if action == 'delete':
+                    # missing argument in l10n
+                    if i2 == len(refSpecs):
+                        # trailing specs missing, that's just a warning
+                        warn = ', '.join('trailing argument %d `%s` missing' %
+                                         (i+1, refSpecs[i])
+                                         for i in xrange(i1, i2))
+                    else:
+                        for i in xrange(i1, i2):
+                            msgs.append('argument %d `%s` missing' %
+                                        (i+1, refSpecs[i]))
+                    continue
+                if action == 'insert':
+                    # obsolete argument in l10n
+                    for i in xrange(j1, j2):
+                        msgs.append('argument %d `%s` obsolete' %
+                                    (i+1, l10nSpecs[i]))
+                    continue
+                if action == 'replace':
+                    for i, j in zip(xrange(i1, i2), xrange(j1, j2)):
+                        msgs.append('argument %d `%s` should be `%s`' %
+                                    (j+1, l10nSpecs[j], refSpecs[i]))
+            if msgs:
+                yield ('error', 0, ', '.join(msgs), 'printf')
+            if warn is not None:
+                yield ('warning', 0, warn, 'printf')
+
+    def getPrintfSpecs(self, val):
+        '''Extract the list of conversion characters found in val.
+
+        With ordered arguments (%1$s), the list is indexed by argument
+        number. Raises PrintfException for a stray '%', mixed ordered
+        and non-ordered args, doubled or missing ordered arguments.
+        '''
+        hasNumber = False
+        specs = []
+        for m in self.printf.finditer(val):
+            if m.group("good") is None:
+                # found just a '%', signal an error
+                raise PrintfException('Found single %', m.start())
+            if m.group("good") == '%':
+                # escaped %
+                continue
+            if ((hasNumber and m.group('number') is None) or
+                    (not hasNumber and specs and
+                     m.group('number') is not None)):
+                # mixed style, numbered and not
+                raise PrintfException('Mixed ordered and non-ordered args',
+                                      m.start())
+            hasNumber = m.group('number') is not None
+            if hasNumber:
+                pos = int(m.group('number')) - 1
+                ls = len(specs)
+                if pos >= ls:
+                    # pad specs with None up to pos, then append this spec
+                    nones = pos - ls
+                    specs[ls:pos] = nones*[None]
+                    specs.append(m.group('spec'))
+                else:
+                    if specs[pos] is not None:
+                        raise PrintfException('Double ordered argument %d' %
+                                              (pos+1),
+                                              m.start())
+                    specs[pos] = m.group('spec')
+            else:
+                specs.append(m.group('spec'))
+        # check for missing args
+        if hasNumber and not all(specs):
+            raise PrintfException('Ordered argument missing', 0)
+        return specs
+
+
+class DTDChecker(Checker):
+    """Tests to run on DTD files.
+
+    Uses xml.sax for the heavy lifting of xml parsing.
+
+    The code tries to parse until it doesn't find any unresolved entities
+    anymore. If it finds one, it tries to grab the key, and adds an empty
+    <!ENTITY key ""> definition to the header.
+
+    Also checks for some CSS and number heuristics in the values.
+    """
+    pattern = re.compile('.*\.dtd$')
+
+    # entity reference, &name;, reusing the Name pattern from DTDParser
+    eref = re.compile('&(%s);' % DTDParser.Name)
+    # dummy document wrapping a value: (entity definitions, value)
+    tmpl = '''<!DOCTYPE elem [%s]>
+<elem>%s</elem>
+'''
+    # predefined XML entities, never reported as unknown
+    xmllist = set(('amp', 'lt', 'gt', 'apos', 'quot'))
+
+    def __init__(self, reference):
+        self.reference = reference
+        self.__known_entities = None  # lazily-built cache, see below
+
+    def known_entities(self, refValue):
+        '''Entity names referenced anywhere in the reference document.
+
+        Cached on first use; falls back to just the entities in refValue
+        when no reference document was given.
+        '''
+        if self.__known_entities is None and self.reference is not None:
+            self.__known_entities = set()
+            for ent in self.reference:
+                self.__known_entities.update(self.entities_for_value(ent.val))
+        return self.__known_entities if self.__known_entities is not None \
+            else self.entities_for_value(refValue)
+
+    def entities_for_value(self, value):
+        '''Return the set of non-predefined entity names in value.'''
+        reflist = set(m.group(1).encode('utf-8')
+                      for m in self.eref.finditer(value))
+        reflist -= self.xmllist
+        return reflist
+
+    # Setup for XML parser, with default and text-only content handler
+    class TextContent(sax.handler.ContentHandler):
+        textcontent = ''
+
+        def characters(self, content):
+            self.textcontent += content
+
+    defaulthandler = sax.handler.ContentHandler()
+    texthandler = TextContent()
+
+    # heuristics for plain numbers and CSS length / style values
+    numPattern = r'([0-9]+|[0-9]*\.[0-9]+)'
+    num = re.compile('^%s$' % numPattern)
+    lengthPattern = '%s(em|px|ch|cm|in)' % numPattern
+    length = re.compile('^%s$' % lengthPattern)
+    spec = re.compile(r'((?:min\-)?(?:width|height))\s*:\s*%s' %
+                      lengthPattern)
+    style = re.compile(r'^%(spec)s\s*(;\s*%(spec)s\s*)*;?$' %
+                       {'spec': spec.pattern})
+
+    # subclasses may set this to a generator method that post-processes
+    # the parsed text content (see PrincessAndroid)
+    processContent = None
+
+    def check(self, refEnt, l10nEnt):
+        """Try to parse the refvalue inside a dummy element, and keep
+        track of entities that we need to define to make that work.
+
+        Return a checker that offers just those entities.
+        """
+        refValue, l10nValue = refEnt.val, l10nEnt.val
+        # find entities the refValue references,
+        # reusing markup from DTDParser.
+        reflist = self.known_entities(refValue)
+        entities = ''.join('<!ENTITY %s "">' % s for s in sorted(reflist))
+        parser = sax.make_parser()
+        parser.setFeature(sax.handler.feature_external_ges, False)
+
+        parser.setContentHandler(self.defaulthandler)
+        try:
+            parser.parse(StringIO(self.tmpl %
+                                  (entities, refValue.encode('utf-8'))))
+            # also catch stray %
+            parser.parse(StringIO(self.tmpl %
+                                  (refEnt.all.encode('utf-8') + entities,
+                                   '&%s;' % refEnt.key.encode('utf-8'))))
+        except sax.SAXParseException, e:
+            # broken reference value: warn, but keep checking the l10n side
+            yield ('warning',
+                   (0, 0),
+                   "can't parse en-US value", 'xmlparse')
+
+        # find entities the l10nValue references,
+        # reusing markup from DTDParser.
+        l10nlist = self.entities_for_value(l10nValue)
+        missing = sorted(l10nlist - reflist)
+        _entities = entities + ''.join('<!ENTITY %s "">' % s for s in missing)
+        warntmpl = u'Referencing unknown entity `%s`'
+        if reflist:
+            warntmpl += ' (%s known)' % ', '.join(sorted(reflist))
+        if self.processContent is not None:
+            self.texthandler.textcontent = ''
+            parser.setContentHandler(self.texthandler)
+        try:
+            parser.parse(StringIO(self.tmpl % (_entities,
+                         l10nValue.encode('utf-8'))))
+            # also catch stray %
+            # if this fails, we need to subtract the entity definition
+            parser.setContentHandler(self.defaulthandler)
+            parser.parse(StringIO(self.tmpl % (
+                l10nEnt.all.encode('utf-8') + _entities,
+                '&%s;' % l10nEnt.key.encode('utf-8'))))
+        except sax.SAXParseException, e:
+            # xml parse error, yield error
+            # sometimes, the error is reported on our fake closing
+            # element, make that the end of the last line
+            lnr = e.getLineNumber() - 1
+            lines = l10nValue.splitlines()
+            if lnr > len(lines):
+                lnr = len(lines)
+                col = len(lines[lnr-1])
+            else:
+                col = e.getColumnNumber()
+                if lnr == 1:
+                    # first line starts with <elem>, subtract
+                    col -= len("<elem>")
+                elif lnr == 0:
+                    col -= len("<!DOCTYPE elem [")  # first line is DOCTYPE
+            yield ('error', (lnr, col), ' '.join(e.args), 'xmlparse')
+
+        for key in missing:
+            yield ('warning', (0, 0), warntmpl % key.decode('utf-8'),
+                   'xmlparse')
+
+        # Number check
+        if self.num.match(refValue) and not self.num.match(l10nValue):
+            yield ('warning', 0, 'reference is a number', 'number')
+        # CSS checks
+        # just a length, width="100em"
+        if self.length.match(refValue) and not self.length.match(l10nValue):
+            yield ('error', 0, 'reference is a CSS length', 'css')
+        # real CSS spec, style="width:100px;"
+        if self.style.match(refValue):
+            if not self.style.match(l10nValue):
+                yield ('error', 0, 'reference is a CSS spec', 'css')
+            else:
+                # warn if different properties or units
+                refMap = dict((s, u) for s, _, u in
+                              self.spec.findall(refValue))
+                msgs = []
+                for s, _, u in self.spec.findall(l10nValue):
+                    if s not in refMap:
+                        msgs.insert(0, '%s only in l10n' % s)
+                        continue
+                    else:
+                        ru = refMap.pop(s)
+                        if u != ru:
+                            msgs.append("units for %s don't match "
+                                        "(%s != %s)" % (s, u, ru))
+                for s in refMap.iterkeys():
+                    msgs.insert(0, '%s only in reference' % s)
+                if msgs:
+                    yield ('warning', 0, ', '.join(msgs), 'css')
+
+        if self.processContent is not None:
+            for t in self.processContent(self.texthandler.textcontent):
+                yield t
+
+
<br>
+class PrincessAndroid(DTDChecker):
+    """Checker for the string values that Android puts into an XML container.
+
+    http://developer.android.com/guide/topics/resources/string-resource.html#FormattingAndStyling  # noqa
+    has more info. Check for unescaped apostrophes and bad unicode escapes.
+    """
+    # matches a value completely wrapped in single or double quotes
+    quoted = re.compile("(?P<q>[\"']).*(?P=q)$")
+
+    def unicode_escape(self, str):
+        """Helper method to try to decode all unicode escapes in a string.
+
+        This code uses the standard python decode for unicode-escape, but
+        that's somewhat tricky, as its input needs to be ascii. To get to
+        ascii, the unicode string gets converted to ascii with
+        backslashreplace, i.e., all non-ascii unicode chars get unicode
+        escaped. And then we try to roll all of that back.
+        Now, when that hits an error, that's from the original string, and we
+        need to search for the actual error position in the original string,
+        as the backslashreplace code changes string positions quite badly.
+        See also the last check in TestAndroid.test_android_dtd, with a
+        lengthy chinese string.
+        """
+        # NOTE(review): the parameter name shadows the builtin str
+        val = str.encode('ascii', 'backslashreplace')
+        try:
+            val.decode('unicode-escape')
+        except UnicodeDecodeError, e:
+            # re-raise with positions mapped back to the original string
+            args = list(e.args)
+            badstring = args[1][args[2]:args[3]]
+            i = len(args[1][:args[2]].decode('unicode-escape'))
+            args[2] = i
+            args[3] = i + len(badstring)
+            raise UnicodeDecodeError(*args)
+
+    @classmethod
+    def use(cls, file):
+        """Use this Checker only for DTD files in embedding/android."""
+        return (file.module in ("embedding/android",
+                                "mobile/android/base")
+                and cls.pattern.match(file.file))
+
+    def processContent(self, val):
+        """Actual check code.
+        Check for unicode escapes and unescaped quotes and apostrophes,
+        if string's not quoted.
+        """
+        # first, try to decode unicode escapes
+        try:
+            self.unicode_escape(val)
+        except UnicodeDecodeError, e:
+            yield ('error', e.args[2], e.args[4], 'android')
+        # check for unescaped single or double quotes.
+        # first, see if the complete string is single or double quoted,
+        # that changes the rules
+        m = self.quoted.match(val)
+        if m:
+            q = m.group('q')
+            offset = 0
+            val = val[1:-1]  # strip quotes
+        else:
+            q = "[\"']"
+            offset = -1
+        # a quote is stray if preceded by an even number of backslashes
+        stray_quot = re.compile(r"[\\\\]*(%s)" % q)
+
+        for m in stray_quot.finditer(val):
+            if len(m.group(0)) % 2:
+                # found an unescaped single or double quote, which message?
+                if m.group(1) == '"':
+                    msg = u"Quotes in Android DTDs need escaping with \\\" "\
+                          u"or \\u0022, or put string in apostrophes."
+                else:
+                    msg = u"Apostrophes in Android DTDs need escaping with "\
+                          u"\\' or \\u0027, or use \u2019, or put string in "\
+                          u"quotes."
+                yield ('error', m.end(0)+offset, msg, 'android')
+
+
+def getChecker(file, reference=None):
+    '''Return the Checker instance handling the given file, or None.
+
+    Order matters: PrincessAndroid specializes DTDChecker and must be
+    tried before the generic DTD check.
+    '''
+    if PropertiesChecker.use(file):
+        return PropertiesChecker()
+    if PrincessAndroid.use(file):
+        return PrincessAndroid(reference)
+    if DTDChecker.use(file):
+        return DTDChecker(reference)
+    return None
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/commands.py
@@ -0,0 +1,154 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'Commands exposed to commandlines'
+
+import logging
+from optparse import OptionParser, make_option
+
+from compare_locales.paths import EnumerateApp
+from compare_locales.compare import compareApp, compareDirs
+from compare_locales.webapps import compare_web_app
+
+
+class BaseCommand(object):
+    """Base class for compare-locales commands.
+    This handles command line parsing, and general sugar for setuptools
+    entry_points.
+    """
+    # options common to all commands; subclasses extend this list
+    options = [
+        make_option('-v', '--verbose', action='count', dest='v', default=0,
+                    help='Make more noise'),
+        make_option('-q', '--quiet', action='count', dest='q', default=0,
+                    help='Make less noise'),
+        make_option('-m', '--merge',
+                    help='''Use this directory to stage merged files,
+use {ab_CD} to specify a different directory for each locale'''),
+    ]
+    # shared --data option, appended by subclasses that report results
+    data_option = make_option('--data', choices=['text', 'exhibit', 'json'],
+                              default='text',
+                              help='''Choose data and format (one of text,
+exhibit, json); text: (default) Show which files miss which strings, together
+with warnings and errors. Also prints a summary; json: Serialize the internal
+tree, useful for tools. Also always succeeds; exhibit: Serialize the summary
+data in a json useful for Exhibit
+''')
+
+    def __init__(self):
+        self.parser = None  # set in handle_()
+
+    def get_parser(self):
+        """Get an OptionParser, with class docstring as usage, and
+        self.options.
+        """
+        parser = OptionParser()
+        parser.set_usage(self.__doc__)
+        for option in self.options:
+            parser.add_option(option)
+        return parser
+
+    @classmethod
+    def call(cls):
+        """Entry_point for setuptools.
+        The actual command handling is done in the handle() method of the
+        subclasses.
+        """
+        cmd = cls()
+        cmd.handle_()
+
+    def handle_(self):
+        """The instance part of the classmethod call."""
+        self.parser = self.get_parser()
+        (options, args) = self.parser.parse_args()
+        # log as verbose or quiet as we want, warn by default
+        logging.basicConfig()
+        logging.getLogger().setLevel(logging.WARNING -
+                                     (options.v - options.q)*10)
+        observer = self.handle(args, options)
+        # serialize in the requested format and write to stdout
+        print observer.serialize(type=options.data).encode('utf-8', 'replace')
+
+    def handle(self, args, options):
+        """Subclasses need to implement this method for the actual
+        command handling.
+        """
+        raise NotImplementedError
+
+
+class CompareLocales(BaseCommand):
+    """usage: %prog [options] l10n.ini l10n_base_dir [locale ...]
+
+Check the localization status of a gecko application.
+The first argument is a path to the l10n.ini file for the application,
+followed by the base directory of the localization repositories.
+Then you pass in the list of locale codes you want to compare. If there are
+not locales given, the list of locales will be taken from the all-locales file
+of the application\'s l10n.ini."""
+
+    options = BaseCommand.options + [
+        make_option('--clobber-merge', action="store_true", default=False,
+                    dest='clobber',
+                    help="""WARNING: DATALOSS.
+Use this option with care. If specified, the merge directory will
+be clobbered for each module. That means, the subdirectory will
+be completely removed, any files that were there are lost.
+Be careful to specify the right merge directory when using this option."""),
+        make_option('-r', '--reference', default='en-US', dest='reference',
+                    help='Explicitly set the reference '
+                    'localization. [default: en-US]'),
+        BaseCommand.data_option
+    ]
+
+    def handle(self, args, options):
+        if len(args) < 2:
+            self.parser.error('Need to pass in list of languages')
+        inipath, l10nbase = args[:2]
+        locales = args[2:]
+        app = EnumerateApp(inipath, l10nbase, locales)
+        app.reference = options.reference
+        try:
+            observer = compareApp(app, merge_stage=options.merge,
+                                  clobber=options.clobber)
+        except (OSError, IOError), exc:
+            print "FAIL: " + str(exc)
+            self.parser.exit(2)
+        return observer
+
+
+class CompareDirs(BaseCommand):
+    """usage: %prog [options] reference localization
+
+Check the localization status of a directory tree.
+The first argument is a path to the reference data,the second is the
+localization to be tested."""
+
+    options = BaseCommand.options + [
+        BaseCommand.data_option
+    ]
+
+    def handle(self, args, options):
+        if len(args) != 2:
+            self.parser.error('Reference and localizatino required')
+        reference, locale = args
+        observer = compareDirs(reference, locale, merge_stage=options.merge)
+        return observer
+
+
+class CompareWebApp(BaseCommand):
+    """usage: %prog [options] webapp [locale locale]
+
+Check the localization status of a gaia-style web app.
+The first argument is the directory of the web app.
+Following arguments explicitly state the locales to test.
+If none are given, test all locales in manifest.webapp or files."""
+
+    # drop the last base option (-m/--merge); compare_web_app below
+    # takes no merge argument
+    options = BaseCommand.options[:-1] + [
+        BaseCommand.data_option]
+
+    def handle(self, args, options):
+        """Compare the web app in args[0] for the given locales."""
+        if len(args) < 1:
+            self.parser.error('Webapp directory required')
+        basedir = args[0]
+        locales = args[1:]
+        observer = compare_web_app(basedir, locales)
+        return observer
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/compare.py
@@ -0,0 +1,635 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'Mozilla l10n compare locales tool'
+
+import codecs
+import os
+import os.path
+import shutil
+import re
+from difflib import SequenceMatcher
+from collections import defaultdict
+
+try:
+    from json import dumps
+except:
+    from simplejson import dumps
+
+from compare_locales import parser
+from compare_locales import paths
+from compare_locales.checks import getChecker
+
+
+class Tree(object):
+    '''Tree of values keyed on path segments.
+
+    Branch keys are tuples of path segments; common prefixes between
+    keys are compressed into intermediate Tree nodes on insertion.
+    Indexing with a new path creates the leaf and its valuetype() value.
+    '''
+    def __init__(self, valuetype):
+        self.branches = dict()
+        self.valuetype = valuetype  # factory for leaf values
+        self.value = None
+
+    def __getitem__(self, leaf):
+        # accept either a paths.File (locale/module/file) or a
+        # '/'-separated path string
+        parts = []
+        if isinstance(leaf, paths.File):
+            parts = [p for p in [leaf.locale, leaf.module] if p] + \
+                leaf.file.split('/')
+        else:
+            parts = leaf.split('/')
+        return self.__get(parts)
+
+    def __get(self, parts):
+        # Find a branch key sharing a prefix with parts, split it there,
+        # and recurse/create until we reach the node for parts.
+        common = None
+        old = None
+        new = tuple(parts)
+        t = self
+        for k, v in self.branches.iteritems():
+            # determine the length i of the common prefix of k and parts
+            for i, part in enumerate(zip(k, parts)):
+                if part[0] != part[1]:
+                    i -= 1
+                    break
+            if i < 0:
+                continue
+            i += 1
+            common = tuple(k[:i])
+            old = tuple(k[i:])
+            new = tuple(parts[i:])
+            break
+        if old:
+            # split branch k into common prefix -> (old remainder subtree)
+            self.branches.pop(k)
+            t = Tree(self.valuetype)
+            t.branches[old] = v
+            self.branches[common] = t
+        elif common:
+            # parts extends an existing key exactly
+            t = self.branches[common]
+        if new:
+            if common:
+                return t.__get(new)
+            # no overlap with any branch: add a fresh leaf
+            t2 = t
+            t = Tree(self.valuetype)
+            t2.branches[new] = t
+        if t.value is None:
+            t.value = t.valuetype()
+        return t.value
+
+    indent = '  '
+
+    def getContent(self, depth=0):
+        '''
+        Returns iterator of (depth, flag, key_or_value) tuples.
+        If flag is 'value', key_or_value is a value object, otherwise
+        (flag is 'key') it's a key string.
+        '''
+        keys = self.branches.keys()
+        keys.sort()
+        if self.value is not None:
+            yield (depth, 'value', self.value)
+        for key in keys:
+            yield (depth, 'key', key)
+            for child in self.branches[key].getContent(depth + 1):
+                yield child
+
+    def toJSON(self):
+        '''
+        Returns this Tree as a JSON-able tree of hashes.
+        Only the values need to take care that they're JSON-able.
+        '''
+        json = {}
+        keys = self.branches.keys()
+        keys.sort()
+        if self.value is not None:
+            json['value'] = self.value
+        children = [('/'.join(key), self.branches[key].toJSON())
+                    for key in keys]
+        if children:
+            json['children'] = children
+        return json
+
+    def getStrRows(self):
+        '''Render getContent() as indented text rows.'''
+        def tostr(t):
+            if t[1] == 'key':
+                return self.indent * t[0] + '/'.join(t[2])
+            return self.indent * (t[0] + 1) + str(t[2])
+
+        return map(tostr, self.getContent())
+
+    def __str__(self):
+        return '\n'.join(self.getStrRows())
+
+
+class AddRemove(SequenceMatcher):
+    '''SequenceMatcher iterating the diff of its two sequences as
+    ('equal', pair), ('delete', item) and ('add', item) tuples;
+    'replace' opcodes are expanded into deletes followed by adds.
+    '''
+    def __init__(self):
+        SequenceMatcher.__init__(self, None, None, None)
+
+    def set_left(self, left):
+        # SequenceMatcher needs indexable sequences, materialize iterables
+        if not isinstance(left, list):
+            left = [l for l in left]
+        self.set_seq1(left)
+
+    def set_right(self, right):
+        if not isinstance(right, list):
+            right = [l for l in right]
+        self.set_seq2(right)
+
+    def __iter__(self):
+        for tag, i1, i2, j1, j2 in self.get_opcodes():
+            if tag == 'equal':
+                for pair in zip(self.a[i1:i2], self.b[j1:j2]):
+                    yield ('equal', pair)
+            elif tag == 'delete':
+                for item in self.a[i1:i2]:
+                    yield ('delete', item)
+            elif tag == 'insert':
+                for item in self.b[j1:j2]:
+                    yield ('add', item)
+            else:
+                # tag == 'replace'
+                for item in self.a[i1:i2]:
+                    yield ('delete', item)
+                for item in self.b[j1:j2]:
+                    yield ('add', item)
+
+
+class DirectoryCompare(SequenceMatcher):
+    '''Compare a reference file listing (seq1) against another listing
+    (seq2), dispatching compare/add/remove calls to a watcher.
+    '''
+    def __init__(self, reference):
+        SequenceMatcher.__init__(self, None, [i for i in reference],
+                                 [])
+        self.watcher = None
+
+    def setWatcher(self, watcher):
+        self.watcher = watcher
+
+    def compareWith(self, other):
+        # without a watcher there is nobody to notify, bail out early
+        if not self.watcher:
+            return
+        self.set_seq2([i for i in other])
+        for tag, i1, i2, j1, j2 in self.get_opcodes():
+            if tag == 'equal':
+                # file exists on both sides, compare contents
+                for i, j in zip(xrange(i1, i2), xrange(j1, j2)):
+                    self.watcher.compare(self.a[i], self.b[j])
+            elif tag == 'delete':
+                # only in the reference: report as to-be-added
+                for i in xrange(i1, i2):
+                    self.watcher.add(self.a[i], other.cloneFile(self.a[i]))
+            elif tag == 'insert':
+                # only in the other side: report as to-be-removed
+                for j in xrange(j1, j2):
+                    self.watcher.remove(self.b[j])
+            else:
+                # replace: removals first, then additions
+                for j in xrange(j1, j2):
+                    self.watcher.remove(self.b[j])
+                for i in xrange(i1, i2):
+                    self.watcher.add(self.a[i], other.cloneFile(self.a[i]))
+
+
class Observer(object):
    '''Collects the results of a comparison run.

    Per-locale counters are gathered in ``summary``, per-file details in a
    Tree of dicts.  An optional ``filter`` callback decides whether a file
    or entity is treated as an error ("error"), merely counted ("report"),
    or dropped ("ignore").
    '''
    # categories that are pure statistics, see notify()
    stat_cats = ['missing', 'obsolete', 'missingInFiles', 'report',
                 'changed', 'unchanged', 'keys']

    def __init__(self):
        class intdict(defaultdict):
            def __init__(self):
                defaultdict.__init__(self, int)

        self.summary = defaultdict(intdict)
        self.details = Tree(dict)
        self.filter = None

    # support pickling
    def __getstate__(self):
        return dict(summary=self.getSummary(), details=self.details)

    def __setstate__(self, state):
        class intdict(defaultdict):
            def __init__(self):
                defaultdict.__init__(self, int)

        self.summary = defaultdict(intdict)
        if 'summary' in state:
            for loc, stats in state['summary'].iteritems():
                self.summary[loc].update(stats)
        self.details = state['details']
        self.filter = None

    def getSummary(self):
        '''Return the summary as a plain dict of dicts (e.g. for pickling).'''
        plaindict = {}
        for k, v in self.summary.iteritems():
            plaindict[k] = dict(v)
        return plaindict

    def toJSON(self):
        return dict(summary=self.getSummary(), details=self.details.toJSON())

    def notify(self, category, file, data):
        '''Record a single comparison event.

        Returns "error", "report" or "ignore" to tell the caller how to
        handle the event; "error" is the default.
        '''
        rv = "error"
        if category in self.stat_cats:
            # these get called post reporting just for stats
            # return "error" to forward them to other_observers
            self.summary[file.locale][category] += data
            # keep track of how many strings are in a missing file
            # we got the {'missingFile': 'error'} from the first pass
            if category == 'missingInFiles':
                self.details[file]['strings'] = data
            return "error"
        if category in ['missingFile', 'obsoleteFile']:
            if self.filter is not None:
                rv = self.filter(file)
            if rv != "ignore":
                self.details[file][category] = rv
            return rv
        if category in ['missingEntity', 'obsoleteEntity']:
            if self.filter is not None:
                rv = self.filter(file, data)
            if rv == "ignore":
                return rv
            v = self.details[file]
            try:
                v[category].append(data)
            except KeyError:
                v[category] = [data]
            return rv
        if category == 'error':
            try:
                self.details[file][category].append(data)
            except KeyError:
                self.details[file][category] = [data]
            self.summary[file.locale]['errors'] += 1
        elif category == 'warning':
            try:
                self.details[file][category].append(data)
            except KeyError:
                self.details[file][category] = [data]
            self.summary[file.locale]['warnings'] += 1
        return rv

    def toExhibit(self):
        '''Serialize the summary to Exhibit-style JSON.'''
        items = []
        for locale in sorted(self.summary.iterkeys()):
            summary = self.summary[locale]
            if locale is not None:
                item = {'id': 'xxx/' + locale,
                        'label': locale,
                        'locale': locale}
            else:
                item = {'id': 'xxx',
                        'label': 'xxx',
                        'locale': 'xxx'}
            item['type'] = 'Build'
            total = sum([summary[k]
                         for k in ('changed', 'unchanged', 'report', 'missing',
                                   'missingInFiles')
                         if k in summary])
            # guard against division by zero for empty runs, matching
            # the equivalent computation in serialize() below
            rate = 0
            if total:
                rate = (('changed' in summary and summary['changed'] * 100)
                        or 0) / total
            item.update((k, summary.get(k, 0))
                        for k in ('changed', 'unchanged'))
            item.update((k, summary[k])
                        for k in ('report', 'errors', 'warnings')
                        if k in summary)
            item['missing'] = summary.get('missing', 0) + \
                summary.get('missingInFiles', 0)
            item['completion'] = rate
            item['total'] = total
            result = 'success'
            if item.get('warnings', 0):
                result = 'warning'
            if item.get('errors', 0) or item.get('missing', 0):
                result = 'failure'
            item['result'] = result
            items.append(item)
        data = {
            "properties": dict.fromkeys(
                ("completion", "errors", "warnings", "missing", "report",
                 "unchanged", "changed", "obsolete"),
                {"valueType": "number"}),
            "types": {
                "Build": {"pluralLabel": "Builds"}
            }}
        data['items'] = items
        return dumps(data, indent=2)

    def serialize(self, type="text"):
        '''Serialize the observed results; type is "text", "json"
        or "exhibit".
        '''
        if type == "exhibit":
            return self.toExhibit()
        if type == "json":
            return dumps(self.toJSON())

        def tostr(t):
            # t is a (depth, 'key'|'value', payload) tuple from Tree
            if t[1] == 'key':
                return '  ' * t[0] + '/'.join(t[2])
            o = []
            indent = '  ' * (t[0] + 1)
            if 'error' in t[2]:
                o += [indent + 'ERROR: ' + e for e in t[2]['error']]
            if 'warning' in t[2]:
                o += [indent + 'WARNING: ' + e for e in t[2]['warning']]
            if 'missingEntity' in t[2] or 'obsoleteEntity' in t[2]:
                missingEntities = ('missingEntity' in t[2] and
                                   t[2]['missingEntity']) or []
                obsoleteEntities = ('obsoleteEntity' in t[2] and
                                    t[2]['obsoleteEntity']) or []
                entities = missingEntities + obsoleteEntities
                entities.sort()
                for entity in entities:
                    op = '+'
                    if entity in obsoleteEntities:
                        op = '-'
                    o.append(indent + op + entity)
            elif 'missingFile' in t[2]:
                o.append(indent + '// add and localize this file')
            elif 'obsoleteFile' in t[2]:
                o.append(indent + '// remove this file')
            return '\n'.join(o)

        out = []
        for locale, summary in sorted(self.summary.iteritems()):
            if locale is not None:
                out.append(locale + ':')
            out += [k + ': ' + str(v) for k, v in sorted(summary.iteritems())]
            total = sum([summary[k]
                         for k in ['changed', 'unchanged', 'report', 'missing',
                                   'missingInFiles']
                         if k in summary])
            rate = 0
            if total:
                rate = (('changed' in summary and summary['changed'] * 100)
                        or 0) / total
            out.append('%d%% of entries changed' % rate)
        return '\n'.join(map(tostr, self.details.getContent()) + out)

    def __str__(self):
        return 'observer'
+
+
class ContentComparer:
    '''Compares localization files against reference files entity by entity.

    Used as the watcher of DirectoryCompare; results are funnelled into
    the built-in Observer and any additional non-filtering observers.
    '''
    # entity keys matching this are keyboard shortcuts, counted separately
    keyRE = re.compile('[kK]ey')
    nl = re.compile('\n', re.M)

    def __init__(self):
        '''Create a ContentComparer.
        observer is usually an instance of Observer. The return values
        of the notify method are used to control the handling of missing
        entities.
        '''
        self.reference = dict()
        self.observer = Observer()
        self.other_observers = []
        self.merge_stage = None

    def add_observer(self, obs):
        '''Add a non-filtering observer.
        Results from the notify calls are ignored.
        '''
        self.other_observers.append(obs)

    def set_merge_stage(self, merge_stage):
        self.merge_stage = merge_stage

    def merge(self, ref_entities, ref_map, ref_file, l10n_file, missing,
              skips, p):
        '''Write the l10n-merge output for one file.

        Copies the l10n file (minus entities in skips) and appends the
        reference versions of missing and skipped entities.  Formats that
        can't be merged (p.canMerge False) get the reference file verbatim.
        '''
        outfile = os.path.join(self.merge_stage, l10n_file.module,
                               l10n_file.file)
        outdir = os.path.dirname(outfile)
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
        if not p.canMerge:
            shutil.copyfile(ref_file.fullpath, outfile)
            print "copied reference to " + outfile
            return
        if skips:
            # skips come in ordered by key name, we need them in file order
            skips.sort(key=lambda s: s.span[0])
        # reference text to append: a newline, then missing and skipped
        # entities in their reference form
        trailing = (['\n'] +
                    [ref_entities[ref_map[key]].all for key in missing] +
                    [ref_entities[ref_map[skip.key]].all for skip in skips])
        if skips:
            # we need to skip a few erroneous blocks in the input, copy by hand
            f = codecs.open(outfile, 'wb', p.encoding)
            offset = 0
            for skip in skips:
                chunk = skip.span
                f.write(p.contents[offset:chunk[0]])
                offset = chunk[1]
            f.write(p.contents[offset:])
        else:
            shutil.copyfile(l10n_file.fullpath, outfile)
            f = codecs.open(outfile, 'ab', p.encoding)
        print "adding to " + outfile

        def ensureNewline(s):
            if not s.endswith('\n'):
                return s + '\n'
            return s

        f.write(''.join(map(ensureNewline, trailing)))
        f.close()

    def notify(self, category, file, data):
        """Check observer for the found data, and if it's
        not to ignore, notify other_observers.
        """
        rv = self.observer.notify(category, file, data)
        if rv == 'ignore':
            return rv
        for obs in self.other_observers:
            # non-filtering other_observers, ignore results
            obs.notify(category, file, data)
        return rv

    def remove(self, obsolete):
        # file exists in l10n but not in the reference
        self.notify('obsoleteFile', obsolete, None)
        pass

    def compare(self, ref_file, l10n):
        '''Compare a localization file against its reference.

        Parses both sides (caching the reference parse), reports missing,
        obsolete, changed and unchanged entities, runs content checks, and
        triggers l10n-merge when a merge stage is configured.
        '''
        try:
            p = parser.getParser(ref_file.file)
        except UserWarning:
            # no comparison, XXX report?
            return
        if ref_file not in self.reference:
            # we didn't parse this before
            try:
                p.readContents(ref_file.getContents())
            except Exception, e:
                self.notify('error', ref_file, str(e))
                return
            self.reference[ref_file] = p.parse()
        ref = self.reference[ref_file]
        ref_list = ref[1].keys()
        ref_list.sort()
        try:
            p.readContents(l10n.getContents())
            l10n_entities, l10n_map = p.parse()
        except Exception, e:
            self.notify('error', l10n, str(e))
            return
        # lazily-built table of line start offsets in the l10n file
        lines = []

        def _getLine(offset):
            # map a character offset to a (line, column) pair, 1-based line
            if not lines:
                lines.append(0)
                for m in self.nl.finditer(p.contents):
                    lines.append(m.end())
            for i in xrange(len(lines), 0, -1):
                if offset >= lines[i - 1]:
                    return (i, offset - lines[i - 1])
            return (1, offset)

        l10n_list = l10n_map.keys()
        l10n_list.sort()
        ar = AddRemove()
        ar.set_left(ref_list)
        ar.set_right(l10n_list)
        report = missing = obsolete = changed = unchanged = keys = 0
        missings = []
        skips = []
        checker = getChecker(l10n, reference=ref[0])
        for action, item_or_pair in ar:
            if action == 'delete':
                # missing entity
                _rv = self.notify('missingEntity', l10n, item_or_pair)
                if _rv == "ignore":
                    continue
                if _rv == "error":
                    # only add to missing entities for l10n-merge on error,
                    # not report
                    missings.append(item_or_pair)
                    missing += 1
                else:
                    # just report
                    report += 1
            elif action == 'add':
                # obsolete entity or junk
                if isinstance(l10n_entities[l10n_map[item_or_pair]],
                              parser.Junk):
                    junk = l10n_entities[l10n_map[item_or_pair]]
                    params = (junk.val,) + junk.span
                    self.notify('error', l10n,
                                'Unparsed content "%s" at %d-%d' % params)
                elif self.notify('obsoleteEntity', l10n,
                                 item_or_pair) != 'ignore':
                    obsolete += 1
            else:
                # entity found in both ref and l10n, check for changed
                entity = item_or_pair[0]
                refent = ref[0][ref[1][entity]]
                l10nent = l10n_entities[l10n_map[entity]]
                if self.keyRE.search(entity):
                    # keyboard shortcuts are keys, not translated text
                    keys += 1
                else:
                    if refent.val == l10nent.val:
                        self.doUnchanged(l10nent)
                        unchanged += 1
                    else:
                        self.doChanged(ref_file, refent, l10nent)
                        changed += 1
                        # run checks:
                if checker:
                    for tp, pos, msg, cat in checker.check(refent, l10nent):
                        # compute real src position, if first line,
                        # col needs adjustment
                        _l, _offset = _getLine(l10nent.val_span[0])
                        if isinstance(pos, tuple):
                            # line, column
                            if pos[0] == 1:
                                col = pos[1] + _offset
                            else:
                                col = pos[1]
                            _l += pos[0] - 1
                        else:
                            _l, col = _getLine(l10nent.val_span[0] + pos)
                            # skip error entities when merging
                        if tp == 'error' and self.merge_stage is not None:
                            skips.append(l10nent)
                        self.notify(tp, l10n,
                                    u"%s at line %d, column %d for %s" %
                                    (msg, _l, col, refent.key))
                pass
        if missing:
            self.notify('missing', l10n, missing)
        if self.merge_stage is not None and (missings or skips):
            self.merge(ref[0], ref[1], ref_file, l10n, missings, skips, p)
        if report:
            self.notify('report', l10n, report)
        if obsolete:
            self.notify('obsolete', l10n, obsolete)
        if changed:
            self.notify('changed', l10n, changed)
        if unchanged:
            self.notify('unchanged', l10n, unchanged)
        if keys:
            self.notify('keys', l10n, keys)
        pass

    def add(self, orig, missing):
        '''Report a file that's missing in the localization.

        Parses the reference to count how many strings the missing file
        would contribute.
        '''
        if self.notify('missingFile', missing, None) == "ignore":
            # filter said that we don't need this file, don't count it
            return
        f = orig
        try:
            p = parser.getParser(f.file)
        except UserWarning:
            return
        try:
            p.readContents(f.getContents())
            entities, map = p.parse()
        except Exception, e:
            self.notify('error', f, str(e))
            return
        self.notify('missingInFiles', missing, len(map))

    def doUnchanged(self, entity):
        # overload this if needed
        pass

    def doChanged(self, file, ref_entity, l10n_entity):
        # overload this if needed
        pass
+
+
+def compareApp(app, other_observer=None, merge_stage=None, clobber=False):
+    '''Compare locales set in app.
+
+    Optional arguments are:
+    - other_observer. A object implementing
+        notify(category, _file, data)
+      The return values of that callback are ignored.
+    - merge_stage. A directory to be used for staging the output of
+      l10n-merge.
+    - clobber. Clobber the module subdirectories of the merge dir as we go.
+      Use wisely, as it might cause data loss.
+    '''
+    comparer = ContentComparer()
+    if other_observer is not None:
+        comparer.add_observer(other_observer)
+    comparer.observer.filter = app.filter
+    for module, reference, locales in app:
+        dir_comp = DirectoryCompare(reference)
+        dir_comp.setWatcher(comparer)
+        for _, localization in locales:
+            if merge_stage is not None:
+                locale_merge = merge_stage.format(ab_CD=localization.locale)
+                comparer.set_merge_stage(locale_merge)
+                if clobber:
+                    # if clobber on, remove the stage for the module if it exists
+                    clobberdir = os.path.join(locale_merge, module)
+                    if os.path.exists(clobberdir):
+                        shutil.rmtree(clobberdir)
+                        print "clobbered " + clobberdir
+            dir_comp.compareWith(localization)
+    return comparer.observer
+
+
def compareDirs(reference, locale, other_observer=None, merge_stage=None):
    '''Compare reference and locale dir.

    Returns the Observer with the collected results.

    Optional arguments are:
    - other_observer. A object implementing
        notify(category, _file, data)
      The return values of that callback are ignored.
    - merge_stage. A directory to be used for staging the output of
      l10n-merge.
    '''
    cc = ContentComparer()
    if other_observer is not None:
        cc.add_observer(other_observer)
    cc.set_merge_stage(merge_stage)
    comparison = DirectoryCompare(paths.EnumerateDir(reference))
    comparison.setWatcher(cc)
    comparison.compareWith(paths.EnumerateDir(locale))
    return cc.observer
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/parser.py
@@ -0,0 +1,521 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import re
+import codecs
+import logging
+from HTMLParser import HTMLParser
+
+__constructors = []
+
+
class Entity(object):
    '''
    Abstraction layer for a localizable entity.
    Currently supported are grammars of the form:

    1: pre white space
    2: pre comments
    3: entity definition
    4: entity key (name)
    5: entity value
    6: post comment (and white space) in the same line (dtd only)
                                                 <--[1]
    <!-- pre comments -->                        <--[2]
    <!ENTITY key "value"> <!-- comment -->

    <-------[3]---------><------[6]------>
    '''
    def __init__(self, contents, pp,
                 span, pre_ws_span, pre_comment_span, def_span,
                 key_span, val_span, post_span):
        self.contents = contents
        self.pp = pp
        self.span = span
        self.pre_ws_span = pre_ws_span
        self.pre_comment_span = pre_comment_span
        self.def_span = def_span
        self.key_span = key_span
        self.val_span = val_span
        self.post_span = post_span

    def _slice(self, span):
        # cut the given (start, end) span out of the file contents
        return self.contents[span[0]:span[1]]

    # getter helpers

    def get_all(self):
        return self._slice(self.span)

    def get_pre_ws(self):
        return self._slice(self.pre_ws_span)

    def get_pre_comment(self):
        return self._slice(self.pre_comment_span)

    def get_def(self):
        return self._slice(self.def_span)

    def get_key(self):
        return self._slice(self.key_span)

    def get_val(self):
        # the value is run through the parser's post-processing hook
        return self.pp(self._slice(self.val_span))

    def get_raw_val(self):
        return self._slice(self.val_span)

    def get_post(self):
        return self._slice(self.post_span)

    # getters

    all = property(get_all)
    pre_ws = property(get_pre_ws)
    pre_comment = property(get_pre_comment)
    definition = property(get_def)
    key = property(get_key)
    val = property(get_val)
    raw_val = property(get_raw_val)
    post = property(get_post)

    def __repr__(self):
        return self.key
+
+
class Junk(object):
    '''
    An almost-Entity, representing junk data that we didn't parse.
    This way, we can signal bad content as stuff we don't understand.
    And then either fix that, or report real bugs in localizations.
    '''
    junkid = 0

    def __init__(self, contents, span):
        self.contents = contents
        self.span = span
        # junk has no surrounding structure, all of those are empty
        self.pre_ws = self.pre_comment = self.definition = self.post = ''
        # hand out a unique, position-tagged pseudo key
        cls = self.__class__
        cls.junkid += 1
        self.key = '_junk_%d_%d-%d' % (cls.junkid, span[0], span[1])

    # getter helpers
    def get_all(self):
        start, end = self.span
        return self.contents[start:end]

    # getters
    all = property(get_all)
    val = property(get_all)

    def __repr__(self):
        return self.key
+
+
class Parser:
    '''Base class for all content parsers.

    A parser is fed content via readFile()/readContents() and then yields
    Entity and Junk objects when iterated.  Subclasses provide the regular
    expressions reKey, reHeader and reFooter.
    '''
    # whether l10n-merge may append missing entities to this format
    canMerge = True

    def __init__(self):
        if not hasattr(self, 'encoding'):
            self.encoding = 'utf-8'
        pass

    def readFile(self, file):
        f = codecs.open(file, 'r', self.encoding)
        try:
            self.contents = f.read()
        except UnicodeDecodeError, e:
            # log and continue with empty contents rather than aborting
            (logging.getLogger('locales')
                    .error("Can't read file: " + file + '; ' + str(e)))
            self.contents = u''
        f.close()

    def readContents(self, contents):
        # decode raw bytes with this parser's encoding
        (self.contents, length) = codecs.getdecoder(self.encoding)(contents)

    def parse(self):
        '''Return (entities, map); map points entity keys at list indices.'''
        l = []
        m = {}
        for e in self:
            m[e.key] = len(l)
            l.append(e)
        return (l, m)

    def postProcessValue(self, val):
        # hook for subclasses, e.g. to unescape values
        return val

    def __iter__(self):
        contents = self.contents
        offset = 0
        self.header, offset = self.getHeader(contents, offset)
        self.footer = ''
        entity, offset = self.getEntity(contents, offset)
        while entity:
            yield entity
            entity, offset = self.getEntity(contents, offset)
        f = self.reFooter.match(contents, offset)
        if f:
            self.footer = f.group()
            offset = f.end()
        if len(contents) > offset:
            # anything left after the footer is unparsed junk
            yield Junk(contents, (offset, len(contents)))
        pass

    def getHeader(self, contents, offset):
        header = ''
        h = self.reHeader.match(contents)
        if h:
            header = h.group()
            offset = h.end()
        return (header, offset)

    def getEntity(self, contents, offset):
        m = self.reKey.match(contents, offset)
        if m:
            offset = m.end()
            entity = self.createEntity(contents, m)
            return (entity, offset)
        # first check if footer has a non-empty match,
        # 'cause then we don't find junk
        m = self.reFooter.match(contents, offset)
        if m and m.end() > offset:
            return (None, offset)
        m = self.reKey.search(contents, offset)
        if m:
            # we didn't match, but search, so there's junk between offset
            # and start. We'll match() on the next turn
            junkend = m.start()
            return (Junk(contents, (offset, junkend)), junkend)
        return (None, offset)

    def createEntity(self, contents, m):
        # groups 1-6 of reKey map onto the Entity span arguments
        return Entity(contents, self.postProcessValue,
                      *[m.span(i) for i in xrange(7)])
+
+
def getParser(path):
    '''Return the parser instance registered for the given file path.

    Raises UserWarning if no registered pattern matches.
    '''
    for pattern, parser_instance in __constructors:
        if re.search(pattern, path):
            return parser_instance
    raise UserWarning("Cannot find Parser")
+
+
+# Subgroups of the match will:
+# 1: pre white space
+# 2: pre comments
+# 3: entity definition
+# 4: entity key (name)
+# 5: entity value
+# 6: post comment (and white space) in the same line (dtd only)
+#                                            <--[1]
+# <!-- pre comments -->                      <--[2]
+# <!ENTITY key "value"> <!-- comment -->
+#
+# <-------[3]---------><------[6]------>
+
+
class DTDParser(Parser):
    '''Parser for DTD files, following the grammar shown in Entity.'''
    # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
    # ":" | [A-Z] | "_" | [a-z] |
    # [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
    # | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
    # [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
    # [#x10000-#xEFFFF]
    CharMinusDash = u'\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD'
    XmlComment = '<!--(?:-?[%s])*?-->' % CharMinusDash
    NameStartChar = u':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF' + \
        u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
        u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
    # + \U00010000-\U000EFFFF seems to be unsupported in python

    # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
    #     [#x0300-#x036F] | [#x203F-#x2040]
    NameChar = NameStartChar + ur'\-\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
    Name = '[' + NameStartChar + '][' + NameChar + ']*'
    # one entity: leading whitespace, comments, <!ENTITY key "value">,
    # optional trailing comment on the same line
    reKey = re.compile('(?:(?P<pre>\s*)(?P<precomment>(?:' + XmlComment +
                       '\s*)*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
                       ')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)'
                       '(?P<post>[ \t]*(?:' + XmlComment + '\s*)*\n?)?)',
                       re.DOTALL)
    # add BOM to DTDs, details in bug 435002
    reHeader = re.compile(u'^\ufeff?'
                          u'(\s*<!--.*(http://mozilla.org/MPL/2.0/|'
                          u'LICENSE BLOCK)([^-]+-)*[^-]+-->)?', re.S)
    reFooter = re.compile('\s*(<!--([^-]+-)*[^-]+-->\s*)*$')
    # parameter entity declaration plus its immediate use: %name;
    rePE = re.compile('(?:(\s*)((?:' + XmlComment + '\s*)*)'
                      '(<!ENTITY\s+%\s+(' + Name +
                      ')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
                      ';)([ \t]*(?:' + XmlComment + '\s*)*\n?)?)')

    def getEntity(self, contents, offset):
        '''
        Overload Parser.getEntity to special-case ParsedEntities.
        Just check for a parsed entity if that method claims junk.

        <!ENTITY % foo SYSTEM "url">
        %foo;
        '''
        entity, inneroffset = Parser.getEntity(self, contents, offset)
        if (entity and isinstance(entity, Junk)) or entity is None:
            m = self.rePE.match(contents, offset)
            if m:
                inneroffset = m.end()
                entity = Entity(contents, self.postProcessValue,
                                *[m.span(i) for i in xrange(7)])
        return (entity, inneroffset)

    def createEntity(self, contents, m):
        # strip the surrounding quotes off the value span
        valspan = m.span('val')
        valspan = (valspan[0]+1, valspan[1]-1)
        return Entity(contents, self.postProcessValue, m.span(),
                      m.span('pre'), m.span('precomment'),
                      m.span('entity'), m.span('key'), valspan,
                      m.span('post'))
+
+
class PropertiesParser(Parser):
    '''Parser for .properties files, with \\-escape handling.'''
    # \\uNNNN unicode escapes, escaped newlines (line continuations),
    # and single-char escapes like \\n, \\t
    escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
                        '(?P<nl>\n\s*)|(?P<single>.))', re.M)
    known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}

    def __init__(self):
        # key: leading whitespace, comments, then "name = " up to the value
        self.reKey = re.compile('^(\s*)'
                                '((?:[#!].*?\n\s*)*)'
                                '([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
        self.reHeader = re.compile('^\s*([#!].*\s*)+')
        self.reFooter = re.compile('\s*([#!].*\s*)*$')
        self._escapedEnd = re.compile(r'\\+$')
        self._trailingWS = re.compile(r'[ \t]*$')
        Parser.__init__(self)

    def getHeader(self, contents, offset):
        # only treat a leading comment as header if it's a license block
        header = ''
        h = self.reHeader.match(contents, offset)
        if h:
            candidate = h.group()
            if 'http://mozilla.org/MPL/2.0/' in candidate or \
                    'LICENSE BLOCK' in candidate:
                header = candidate
                offset = h.end()
        return (header, offset)

    def getEntity(self, contents, offset):
        # overwritten to parse values line by line
        m = self.reKey.match(contents, offset)
        if m:
            offset = m.end()
            while True:
                endval = nextline = contents.find('\n', offset)
                if nextline == -1:
                    endval = offset = len(contents)
                    break
                # is newline escaped?
                _e = self._escapedEnd.search(contents, offset, nextline)
                offset = nextline + 1
                if _e is None:
                    break
                # backslashes at end of line, if 2*n, not escaped
                if len(_e.group()) % 2 == 0:
                    break
            # strip trailing whitespace
            ws = self._trailingWS.search(contents, m.end(), offset)
            if ws:
                endval -= ws.end() - ws.start()
            entity = Entity(contents, self.postProcessValue,
                            (m.start(), offset),   # full span
                            m.span(1),  # leading whitespan
                            m.span(2),  # leading comment span
                            (m.start(3), offset),   # entity def span
                            m.span(3),   # key span
                            (m.end(), endval),   # value span
                            (offset, offset))  # post comment span, empty
            return (entity, offset)
        m = self.reKey.search(contents, offset)
        if m:
            # we didn't match, but search, so there's junk between offset
            # and start. We'll match() on the next turn
            junkend = m.start()
            return (Junk(contents, (offset, junkend)), junkend)
        return (None, offset)

    def postProcessValue(self, val):
        '''Unescape the raw value: \\uNNNN, line continuations, and
        known single-character escapes.'''

        def unescape(m):
            found = m.groupdict()
            if found['uni']:
                return unichr(int(found['uni'][1:], 16))
            if found['nl']:
                # escaped newline: continuation, drop it and the indent
                return ''
            return self.known_escapes.get(found['single'], found['single'])
        val = self.escape.sub(unescape, val)
        return val
+
+
class DefinesParser(Parser):
    '''Parser for preprocessor defines.inc files (#define NAME value).'''
    # can't merge, #unfilter needs to be the last item, which we don't support
    canMerge = False

    def __init__(self):
        # key: whitespace, non-#define comment lines, then
        # "#define NAME value" up to end of line
        self.reKey = re.compile('^(\s*)((?:^#(?!define\s).*\s*)*)'
                                '(#define[ \t]+(\w+)[ \t]+(.*?))([ \t]*$\n?)',
                                re.M)
        self.reHeader = re.compile('^\s*(#(?!define\s).*\s*)*')
        self.reFooter = re.compile('\s*(#(?!define\s).*\s*)*$', re.M)
        Parser.__init__(self)
+
+
class IniParser(Parser):
    '''
    Parse files of the form:
    # initial comment
    [cat]
    whitespace*
    #comment
    string=value
    ...
    '''
    def __init__(self):
        # header swallows everything up to and including the [section] line
        self.reHeader = re.compile('^((?:\s*|[;#].*)\n)*\[.+?\]\n', re.M)
        # key: whitespace, ;/# comments, then "name=value" and the newline
        self.reKey = re.compile('(\s*)((?:[;#].*\n\s*)*)((.+?)=(.*))(\n?)')
        self.reFooter = re.compile('\s*')
        Parser.__init__(self)
+
+
# token type tags
DECL, COMMENT, START, END, CONTENT = range(5)


class BookmarksParserInner(HTMLParser):
    '''HTMLParser subclass that tokenizes a bookmarks.html file.

    parse() returns the list of Token objects in document order; each
    token carries its type in _type (DECL, COMMENT, START, END, CONTENT).
    '''

    class Token(object):
        _type = None
        content = ''

        def __str__(self):
            return self.content

    class DeclToken(Token):
        _type = DECL

        def __init__(self, decl):
            self.content = decl

        def __str__(self):
            return '<!%s>' % self.content

    class CommentToken(Token):
        _type = COMMENT

        def __init__(self, comment):
            self.content = comment

        def __str__(self):
            return '<!--%s-->' % self.content

    class StartToken(Token):
        _type = START

        def __init__(self, tag, attrs, content):
            self.tag = tag
            self.attrs = dict(attrs)
            # content is the verbatim start tag text
            self.content = content

    class EndToken(Token):
        _type = END

        def __init__(self, tag):
            self.tag = tag

        def __str__(self):
            return '</%s>' % self.tag.upper()

    class ContentToken(Token):
        _type = CONTENT

        def __init__(self, content):
            self.content = content

    def __init__(self):
        HTMLParser.__init__(self)
        self.tokens = []

    def parse(self, contents):
        '''Tokenize contents and return the list of tokens.'''
        self.tokens = []
        self.feed(contents)
        self.close()
        return self.tokens

    # Called for <!...> declarations, e.g. the DOCTYPE
    def handle_decl(self, decl):
        self.tokens.append(self.DeclToken(decl))

    # Called for <!--...--> comments
    def handle_comment(self, comment):
        self.tokens.append(self.CommentToken(comment))

    def handle_starttag(self, tag, attrs):
        self.tokens.append(self.StartToken(tag, attrs,
                                           self.get_starttag_text()))

    # Called when text data is encountered
    def handle_data(self, data):
        # merge consecutive data chunks into one content token; guard
        # against data arriving before any other token (e.g. leading
        # whitespace before the DOCTYPE), which used to raise IndexError
        if self.tokens and self.tokens[-1]._type == CONTENT:
            self.tokens[-1].content += data
        else:
            self.tokens.append(self.ContentToken(data))

    def handle_charref(self, data):
        # keep character references verbatim in the content
        self.handle_data('&#%s;' % data)

    def handle_entityref(self, data):
        # keep entity references verbatim in the content
        self.handle_data('&%s;' % data)

    def handle_endtag(self, tag):
        self.tokens.append(self.EndToken(tag))
+
+
class BookmarksParser(Parser):
    '''Parser for bookmarks.html; yields one entity per attribute and
    per non-empty text node, keyed by the dotted path of open tags.'''
    # appending entities to HTML isn't meaningful, don't merge
    canMerge = False

    class BMEntity(object):
        '''Minimal Entity stand-in with just a key and a value.'''
        def __init__(self, key, val):
            self.key = key
            self.val = val

    def __iter__(self):
        p = BookmarksParserInner()
        tks = p.parse(self.contents)
        i = 0
        k = []  # stack of open tag names, joined into dotted keys
        for i in xrange(len(tks)):
            t = tks[i]
            if t._type == START:
                k.append(t.tag)
                # sort attributes so the yielded order is stable
                keys = t.attrs.keys()
                keys.sort()
                for attrname in keys:
                    yield self.BMEntity('.'.join(k) + '.@' + attrname,
                                        t.attrs[attrname])
                # NOTE(review): incrementing i here does not affect the
                # for loop; the content token is visited again on the
                # next iteration, but harmlessly so, since only START
                # and END tokens are acted upon
                if i + 1 < len(tks) and tks[i+1]._type == CONTENT:
                    i += 1
                    t = tks[i]
                    v = t.content.strip()
                    if v:
                        yield self.BMEntity('.'.join(k), v)
            elif t._type == END:
                k.pop()
+
+
# file-pattern to parser-instance registry consulted by getParser();
# the instances are shared, parser state does not carry across files
__constructors = [('\\.dtd$', DTDParser()),
                  ('\\.properties$', PropertiesParser()),
                  ('\\.ini$', IniParser()),
                  ('\\.inc$', DefinesParser()),
                  ('bookmarks\\.html$', BookmarksParser())]
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/paths.py
@@ -0,0 +1,398 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import os.path
+import os
+from ConfigParser import ConfigParser, NoSectionError, NoOptionError
+from urlparse import urlparse, urljoin
+from urllib import pathname2url, url2pathname
+from urllib2 import urlopen
+from collections import defaultdict
+from compare_locales import util
+
+
class L10nConfigParser(object):
    '''Helper class to gather application information from ini files.

    This class is working on synchronous open to read files or web data.
    Subclass this and overwrite loadConfigs and addChild if you need async.
    '''
    def __init__(self, inipath, **kwargs):
        """Constructor for L10nConfigParsers

        inipath -- l10n.ini path
        Optional keyword arguments are forwarded to the inner ConfigParser as
        defaults.
        """
        # keep the ini location as a file: URL so includes can be
        # resolved relative to it with urljoin
        if os.path.isabs(inipath):
            self.inipath = 'file:%s' % pathname2url(inipath)
        else:
            pwdurl = 'file:%s/' % pathname2url(os.getcwd())
            self.inipath = urljoin(pwdurl, inipath)
        # l10n.ini files can import other l10n.ini files, store the
        # corresponding L10nConfigParsers
        self.children = []
        # we really only care about the l10n directories described in l10n.ini
        self.dirs = []
        # optional defaults to be passed to the inner ConfigParser (unused?)
        self.defaults = kwargs

    def getDepth(self, cp):
        '''Get the depth for the comparison from the parsed l10n.ini.

        Overloadable to get the source depth for fennec and friends.
        '''
        try:
            depth = cp.get('general', 'depth')
        except:
            # NOTE(review): bare except; (NoSectionError, NoOptionError)
            # would suffice and not mask unrelated errors
            depth = '.'
        return depth

    def getFilters(self):
        '''Get the test functions from this ConfigParser and all children.

        Only works with synchronous loads, used by compare-locales, which
        is local anyway.
        '''
        # execute the filter.py next to this l10n.ini, if present, and
        # pick up the "test" callable it defines
        filterurl = urljoin(self.inipath, 'filter.py')
        try:
            l = {}
            execfile(url2pathname(urlparse(filterurl).path), {}, l)
            if 'test' in l and callable(l['test']):
                filters = [l['test']]
            else:
                filters = []
        except:
            # filter.py is optional, so a missing file is expected here;
            # NOTE(review): the bare except also hides broken filter.py
            filters = []

        for c in self.children:
            filters += c.getFilters()

        return filters

    def loadConfigs(self):
        """Entry point to load the l10n.ini file this Parser refers to.

        This implementation uses synchronous loads, subclasses might overload
        this behaviour. If you do, make sure to pass a file-like object
        to onLoadConfig.
        """
        self.onLoadConfig(urlopen(self.inipath))

    def onLoadConfig(self, inifile):
        """Parse a file-like object for the loaded l10n.ini file.

        Sets baseurl, tld, all_path/all_url, fills dirs and recursively
        loads included l10n.ini files. Returns the inner ConfigParser.
        """
        cp = ConfigParser(self.defaults)
        cp.readfp(inifile)
        depth = self.getDepth(cp)
        # base URL against which all relative paths in this ini resolve
        self.baseurl = urljoin(self.inipath, depth)
        # create child loaders for any other l10n.ini files to be included
        try:
            for title, path in cp.items('includes'):
                # skip default items
                if title in self.defaults:
                    continue
                # add child config parser
                self.addChild(title, path, cp)
        except NoSectionError:
            pass
        # try to load the "dirs" defined in the "compare" section
        try:
            self.dirs.extend(cp.get('compare', 'dirs').split())
        except (NoOptionError, NoSectionError):
            pass
        # try getting a top level compare dir, as used for fennec
        try:
            self.tld = cp.get('compare', 'tld')
            # remove tld from comparison dirs
            if self.tld in self.dirs:
                self.dirs.remove(self.tld)
        except (NoOptionError, NoSectionError):
            self.tld = None
        # try to set "all_path" and "all_url"
        try:
            self.all_path = cp.get('general', 'all')
            self.all_url = urljoin(self.baseurl, self.all_path)
        except (NoOptionError, NoSectionError):
            self.all_path = None
            self.all_url = None
        return cp

    def addChild(self, title, path, orig_cp):
        """Create a child L10nConfigParser and load it.

        title -- indicates the module's name
        path -- indicates the path to the module's l10n.ini file
        orig_cp -- the configuration parser of this l10n.ini
        """
        cp = L10nConfigParser(urljoin(self.baseurl, path), **self.defaults)
        cp.loadConfigs()
        self.children.append(cp)

    def getTLDPathsTuple(self, basepath):
        """Given the basepath, return the path fragments to be used for
        self.tld. For build runs, this is (basepath, self.tld), for
        source runs, just (basepath,).

        @see overwritten method in SourceTreeConfigParser.
        """
        return (basepath, self.tld)

    def dirsIter(self):
        """Iterate over all dirs and our base path for this l10n.ini"""
        # requires onLoadConfig to have run (sets baseurl and tld)
        url = urlparse(self.baseurl)
        basepath = url2pathname(url.path)
        if self.tld is not None:
            yield self.tld, self.getTLDPathsTuple(basepath)
        for dir in self.dirs:
            yield dir, (basepath, dir)

    def directories(self):
        """Iterate over all dirs and base paths for this l10n.ini as well
        as the included ones.
        """
        for t in self.dirsIter():
            yield t
        for child in self.children:
            for t in child.directories():
                yield t

    def allLocales(self):
        """Return a list of all the locales of this project"""
        # reads the all-locales file referenced by the "general.all" option
        return util.parseLocales(urlopen(self.all_url).read())
+
+
class SourceTreeConfigParser(L10nConfigParser):
    '''Subclassing L10nConfigParser to work with just the repos
    checked out next to each other instead of intermingled like
    we do for real builds.
    '''

    def __init__(self, inipath, basepath, **kwargs):
        '''Add an additional argument, basepath.

        basepath is used to resolve local paths via branchnames.
        Optional keyword arguments are forwarded to L10nConfigParser
        (and from there to the inner ConfigParser as defaults).
        '''
        L10nConfigParser.__init__(self, inipath, **kwargs)
        self.basepath = basepath
        # no top-level compare dir until onLoadConfig finds one
        self.tld = None

    def getDepth(self, cp):
        '''Get the depth for the comparison from the parsed l10n.ini.

        Overloaded to get the source depth for fennec and friends.
        '''
        # prefer the source tree depth, fall back to the build depth
        try:
            depth = cp.get('general', 'source-depth')
        except (NoOptionError, NoSectionError):
            try:
                depth = cp.get('general', 'depth')
            except (NoOptionError, NoSectionError):
                depth = '.'
        return depth

    def addChild(self, title, path, orig_cp):
        '''Create and load a child SourceTreeConfigParser.

        title -- the included module's name
        path -- path to the module's l10n.ini, relative to baseurl
        orig_cp -- the configuration parser of this l10n.ini
        '''
        # check if there's a section with details for this include
        # we might have to check a different repo, or even VCS
        # for example, projects like "mail" indicate in
        # an "include_" section where to find the l10n.ini for "toolkit"
        details = 'include_' + title
        if orig_cp.has_section(details):
            branch = orig_cp.get(details, 'mozilla')
            inipath = orig_cp.get(details, 'l10n.ini')
            path = self.basepath + '/' + branch + '/' + inipath
        else:
            path = urljoin(self.baseurl, path)
        # __init__ accepts **kwargs now, so forwarding self.defaults
        # no longer raises TypeError when defaults are set
        cp = SourceTreeConfigParser(path, self.basepath, **self.defaults)
        cp.loadConfigs()
        self.children.append(cp)

    def getTLDPathsTuple(self, basepath):
        """Overwrite L10nConfigParser's getTLDPathsTuple to just return
        the basepath.
        """
        return (basepath, )
+
+
class File(object):
    '''A file in the comparison, identified by its module-relative path.'''

    def __init__(self, fullpath, file, module=None, locale=None):
        '''
        fullpath -- path of the file on disk
        file -- path relative to the module
        module -- owning module, optional
        locale -- locale code this file belongs to, optional
        '''
        self.fullpath = fullpath
        self.file = file
        self.module = module
        self.locale = locale

    def getContents(self):
        # open with universal line ending support and read; the context
        # manager closes the handle deterministically (was leaked before)
        with open(self.fullpath, 'rU') as f:
            return f.read()

    def __hash__(self):
        # hash on the module-qualified relative path, consistent with
        # the ordering in __cmp__
        f = self.file
        if self.module:
            f = self.module + '/' + f
        return hash(f)

    def __str__(self):
        return self.fullpath

    def __cmp__(self, other):
        # Python 2 comparison: order by module first, then by file
        if not isinstance(other, File):
            raise NotImplementedError
        rv = cmp(self.module, other.module)
        if rv != 0:
            return rv
        return cmp(self.file, other.file)
+
+
class EnumerateDir(object):
    '''Enumerate all files below a base directory as File objects.'''

    # VCS bookkeeping dirs, never part of a localization
    ignore_dirs = ['CVS', '.svn', '.hg', '.git']

    def __init__(self, basepath, module='', locale=None, ignore_subdirs=None):
        '''
        basepath -- directory to walk
        module -- module name attached to yielded File objects
        locale -- locale code attached to yielded File objects
        ignore_subdirs -- subdir paths relative to basepath to skip
        '''
        self.basepath = basepath
        self.module = module
        self.locale = locale
        # guard against the shared-mutable-default-argument pitfall
        self.ignore_subdirs = [] if ignore_subdirs is None else ignore_subdirs

    def cloneFile(self, other):
        '''
        Return a File object that this enumerator would return, if it had it.
        '''
        return File(os.path.join(self.basepath, other.file), other.file,
                    self.module, self.locale)

    def __iter__(self):
        # our local dirs are given as a tuple of path segments, starting off
        # with an empty sequence for the basepath.
        dirs = [()]
        # resolve ignored subdirs to full paths once, not per entry
        ignored = [os.path.join(self.basepath, d)
                   for d in self.ignore_subdirs]
        while dirs:
            dir = dirs.pop(0)
            fulldir = os.path.join(self.basepath, *dir)
            try:
                entries = os.listdir(fulldir)
            except OSError:
                # we probably just started off in a non-existing dir, ignore
                continue
            entries.sort()
            for entry in entries:
                leaf = os.path.join(fulldir, entry)
                if os.path.isdir(leaf):
                    if entry not in self.ignore_dirs and leaf not in ignored:
                        dirs.append(dir + (entry,))
                    continue
                yield File(leaf, '/'.join(dir + (entry,)),
                           self.module, self.locale)
+
+
class LocalesWrap(object):
    '''Pair each locale with an enumerator of its module directory.'''

    def __init__(self, base, module, locales, ignore_subdirs=None):
        '''
        base -- l10n base dir, with one subdirectory per locale
        module -- module path below each locale dir
        locales -- locale codes to iterate over
        ignore_subdirs -- subdirs to skip, forwarded to EnumerateDir
        '''
        self.base = base
        self.module = module
        self.locales = locales
        # guard against the shared-mutable-default-argument pitfall
        self.ignore_subdirs = [] if ignore_subdirs is None else ignore_subdirs

    def __iter__(self):
        for locale in self.locales:
            path = os.path.join(self.base, locale, self.module)
            yield (locale, EnumerateDir(path, self.module, locale,
                                        self.ignore_subdirs))
+
+
class EnumerateApp(object):
    '''Enumerate all modules and locales of an application, driven by
    its l10n.ini.
    '''
    reference = 'en-US'

    def __init__(self, inipath, l10nbase, locales=None):
        '''
        inipath -- path to the application's l10n.ini
        l10nbase -- directory holding one subdirectory per locale
        locales -- restrict the comparison to these locales; defaults to
            all locales listed in the app's all-locales file
        '''
        self.setupConfigParser(inipath)
        self.modules = defaultdict(dict)
        self.l10nbase = os.path.abspath(l10nbase)
        self.filters = []
        # (removed an unused os.path.splitdrive(inipath) call here)
        self.addFilters(*self.config.getFilters())
        self.locales = locales or self.config.allLocales()
        self.locales.sort()

    def setupConfigParser(self, inipath):
        # overridden by EnumerateSourceTreeApp
        self.config = L10nConfigParser(inipath)
        self.config.loadConfigs()

    def addFilters(self, *args):
        self.filters += args

    # map filter return values to severity ranks; the highest rank wins
    value_map = {None: None, 'error': 0, 'ignore': 1, 'report': 2}

    def filter(self, l10n_file, entity=None):
        '''Go through all added filters, and,
        - map "error" -> 0, "ignore" -> 1, "report" -> 2
        - if filter.test returns a bool, map that to
            False -> "ignore" (1), True -> "error" (0)
        - take the max of all reported
        '''
        rv = 0
        for f in reversed(self.filters):
            try:
                _r = f(l10n_file.module, l10n_file.file, entity)
            except Exception:
                # XXX error handling: stay best-effort on broken filters,
                # but no longer swallow KeyboardInterrupt/SystemExit
                continue
            if isinstance(_r, bool):
                _r = [1, 0][_r]
            else:
                # map string return value to int, default to 'error',
                # None is None
                _r = self.value_map.get(_r, 0)
            if _r is not None:
                rv = max(rv, _r)
        return ['error', 'ignore', 'report'][rv]

    def __iter__(self):
        '''
        Iterate over all modules, return en-US directory enumerator, and an
        iterator over all locales in each iteration. Per locale, the locale
        code and an directory enumerator will be given.
        '''
        dirmap = dict(self.config.directories())
        mods = dirmap.keys()
        mods.sort()
        for mod in mods:
            if self.reference == 'en-US':
                base = os.path.join(*(dirmap[mod] + ('locales', 'en-US')))
            else:
                base = os.path.join(self.l10nbase, self.reference, mod)
            # nested modules ("mod/sub") are walked on their own, so
            # skip them inside their parent's enumeration
            yield (mod, EnumerateDir(base, mod, self.reference),
                   LocalesWrap(self.l10nbase, mod, self.locales,
                   [m[len(mod)+1:] for m in mods if m.startswith(mod+'/')]))
+
+
class EnumerateSourceTreeApp(EnumerateApp):
    '''Subclass EnumerateApp to work on side-by-side checked out
    repos, and to not pay attention to how the source would actually
    be checked out for building.

    It's supporting applications like Fennec, too, which have
    'locales/en-US/...' in their root dir, but claim to be 'mobile'.
    '''

    def __init__(self, inipath, basepath, l10nbase, locales=None):
        # basepath must be set before EnumerateApp.__init__ runs, as
        # that calls setupConfigParser below
        self.basepath = basepath
        EnumerateApp.__init__(self, inipath, l10nbase, locales)

    def setupConfigParser(self, inipath):
        # use the source-tree aware config parser instead
        self.config = SourceTreeConfigParser(inipath, self.basepath)
        self.config.loadConfigs()
+
+
def get_base_path(mod, loc):
    '''Return the repository-relative base path for a module and locale.

    en-US lives in the source tree under mozilla/, every other locale
    in the l10n/ tree.
    '''
    if loc == 'en-US':
        return 'mozilla/' + mod + '/locales/en-US'
    return 'l10n/' + loc + '/' + mod
+
+
def get_path(mod, loc, leaf):
    '''Return the repository-relative path of a leaf file in a module.'''
    return '/'.join((get_base_path(mod, loc), leaf))
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/__init__.py
@@ -0,0 +1,49 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'''Mixins for parser tests.
+'''
+
+from itertools import izip_longest
+from pkg_resources import resource_string
+import re
+
+from compare_locales.parser import getParser
+
+
class ParserTestMixin():
    '''Utility methods used by the parser tests.
    '''
    # subclasses set this; its extension selects the parser under test
    filename = None

    def setUp(self):
        '''Create a parser for this test.
        '''
        self.parser = getParser(self.filename)

    def tearDown(self):
        'tear down this test'
        del self.parser

    def resource(self, name):
        '''Load a test fixture from the data/ directory.'''
        testcontent = resource_string(__name__, 'data/' + name)
        # fake universal line endings
        testcontent = re.sub('\r\n?', lambda m: '\n', testcontent)
        return testcontent

    def _test(self, content, refs):
        '''Helper to test the parser.
        Compares the result of parsing content with the given list
        of reference keys and values.
        '''
        self.parser.readContents(content)
        entities = [entity for entity in self.parser]
        for entity, ref in izip_longest(entities, refs):
            # izip_longest pads the shorter sequence with None
            self.assertTrue(entity, 'excess reference entity')
            self.assertTrue(ref, 'excess parsed entity')
            self.assertEqual(entity.val, ref[1])
            if ref[0].startswith('_junk'):
                # junk keys carry a generated id, match by pattern
                self.assertTrue(re.match(ref[0], entity.key))
            else:
                self.assertEqual(entity.key, ref[0])
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/data/bug121341.properties
@@ -0,0 +1,68 @@
+# simple check
+1=abc
+# test whitespace trimming in key and value
+  2	=   xy	
+# test parsing of escaped values
+3 = \u1234\t\r\n\uAB\
+\u1\n
+# test multiline properties
+4 = this is \
+multiline property
+5 = this is \
+	   another multiline property
+# property with DOS EOL
+6 = test\u0036
+# test multiline property with with DOS EOL
+7 = yet another multi\
+    line propery
+# trimming should not trim escaped whitespaces
+8 =	\ttest5\u0020	
+# another variant of #8
+9 =     \ test6\t	    
+# test UTF-8 encoded property/value
+10aሴb = c췯d
+# next property should test unicode escaping at the boundary of parsing buffer
+# buffer size is expected to be 4096 so add comments to get to this offset
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+###############################################################################
+11 = \uABCD
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/data/test.properties
@@ -0,0 +1,14 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+1=1
+ 2=2
+3 =3
+ 4 =4
+5=5
+6= 6
+7=7 
+8= 8 
+# this is a comment
+9=this is the first part of a continued line \
+ and here is the 2nd part
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/data/triple-license.dtd
@@ -0,0 +1,38 @@
+<!-- ***** BEGIN LICENSE BLOCK *****
+#if 0
+   - Version: MPL 1.1/GPL 2.0/LGPL 2.1
+   -
+   - The contents of this file are subject to the Mozilla Public License Version
+   - 1.1 (the "License"); you may not use this file except in compliance with
+   - the License. You may obtain a copy of the License at
+   - http://www.mozilla.org/MPL/
+   -
+   - Software distributed under the License is distributed on an "AS IS" basis,
+   - WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+   - for the specific language governing rights and limitations under the
+   - License.
+   -
+   - The Original Code is mozilla.org Code.
+   -
+   - The Initial Developer of the Original Code is dummy.
+   - Portions created by the Initial Developer are Copyright (C) 2005
+   - the Initial Developer. All Rights Reserved.
+   -
+   - Contributor(s):
+   -
+   - Alternatively, the contents of this file may be used under the terms of
+   - either the GNU General Public License Version 2 or later (the "GPL"), or
+   - the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+   - in which case the provisions of the GPL or the LGPL are applicable instead
+   - of those above. If you wish to allow use of your version of this file only
+   - under the terms of either the GPL or the LGPL, and not to allow others to
+   - use your version of this file under the terms of the MPL, indicate your
+   - decision by deleting the provisions above and replace them with the notice
+   - and other provisions required by the LGPL or the GPL. If you do not delete
+   - the provisions above, a recipient may use your version of this file under
+   - the terms of any one of the MPL, the GPL or the LGPL.
+   -
+#endif
+   - ***** END LICENSE BLOCK ***** -->
+
+<!ENTITY foo "value">
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_checks.py
@@ -0,0 +1,345 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales.checks import getChecker
+from compare_locales.parser import getParser, Entity
+from compare_locales.paths import File
+
+
class BaseHelper(unittest.TestCase):
    # subclasses provide the File under test and its reference content
    file = None
    refContent = None

    def setUp(self):
        '''Parse the reference content once per test.'''
        p = getParser(self.file.file)
        p.readContents(self.refContent)
        self.refList, self.refMap = p.parse()

    def _test(self, content, refWarnOrErrors):
        '''Check one localized entity against the reference.

        content -- localization content holding exactly one entity
        refWarnOrErrors -- expected tuple of checker results
        '''
        p = getParser(self.file.file)
        p.readContents(content)
        l10n = [e for e in p]
        assert len(l10n) == 1
        l10n = l10n[0]
        checker = getChecker(self.file)
        ref = self.refList[self.refMap[l10n.key]]
        found = tuple(checker.check(ref, l10n))
        self.assertEqual(found, refWarnOrErrors)
+
+
class TestProperties(BaseHelper):
    # escape-sequence checks on .properties values
    file = File('foo.properties', 'foo.properties')
    refContent = '''some = value
'''

    def testGood(self):
        # a plain translation raises no warnings or errors
        self._test('''some = localized''',
                   tuple())

    def testMissedEscape(self):
        # only \e is reported as unknown; expect one warning at column 20
        self._test(r'''some = \u67ood escape, bad \escape''',
                   (('warning', 20, r'unknown escape sequence, \e',
                     'escape'),))
+
+
class TestPlurals(BaseHelper):
    # checks on semi-colon separated plural-form strings
    file = File('foo.properties', 'foo.properties')
    refContent = '''\
# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
# #1 number of files
# example: 111 files - Downloads
downloadsTitleFiles=#1 file - Downloads;#1 files - #2
'''

    def testGood(self):
        # extra plural forms are fine as long as all variables are used
        self._test('''\
# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
# #1 number of files
# example: 111 files - Downloads
downloadsTitleFiles=#1 file - Downloads;#1 files - #2;#1 filers
''',
                   tuple())

    def testNotUsed(self):
        # dropping #2 from the translation only warns
        self._test('''\
# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
# #1 number of files
# example: 111 files - Downloads
downloadsTitleFiles=#1 file - Downloads;#1 files - Downloads;#1 filers
''',
                   (('warning', 0, 'not all variables used in l10n',
                     'plural'),))

    def testNotDefined(self):
        # referencing #3, which the reference doesn't define, is an error
        self._test('''\
# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
# #1 number of files
# example: 111 files - Downloads
downloadsTitleFiles=#1 file - Downloads;#1 files - #2;#1 #3
''',
                   (('error', 0, 'unreplaced variables in l10n', 'plural'),))
+
+
class TestDTDs(BaseHelper):
    # checks on DTD values: entity refs, well-formedness, numbers, CSS
    file = File('foo.dtd', 'foo.dtd')
    refContent = '''<!ENTITY foo "This is &apos;good&apos;">
<!ENTITY width "10ch">
<!ENTITY style "width: 20ch; height: 280px;">
<!ENTITY minStyle "min-height: 50em;">
<!ENTITY ftd "0">
<!ENTITY formatPercent "This is 100&#037; correct">
<!ENTITY some.key "K">
'''

    def testWarning(self):
        # unknown entity references only warn, not error
        self._test('''<!ENTITY foo "This is &not; good">
''',
                   (('warning', (0, 0), 'Referencing unknown entity `not`',
                     'xmlparse'),))
        # make sure we only handle translated entity references
        self._test(u'''<!ENTITY foo "This is &ƞǿŧ; good">
'''.encode('utf-8'),
            (('warning', (0, 0), u'Referencing unknown entity `ƞǿŧ`',
              'xmlparse'),))

    def testErrorFirstLine(self):
        self._test('''<!ENTITY foo "This is </bad> stuff">
''',
                   (('error', (1, 10), 'mismatched tag', 'xmlparse'),))

    def testErrorSecondLine(self):
        self._test('''<!ENTITY foo "This is
  </bad>
stuff">
''',
                   (('error', (2, 4), 'mismatched tag', 'xmlparse'),))

    def testKeyErrorSingleAmpersand(self):
        self._test('''<!ENTITY some.key "&">
''',
                   (('error', (1, 1), 'not well-formed (invalid token)',
                     'xmlparse'),))

    def testXMLEntity(self):
        # predefined XML entities are always fine
        self._test('''<!ENTITY foo "This is &quot;good&quot;">
''',
                   tuple())

    def testPercentEntity(self):
        # &#037; is fine, a raw % is not well-formed
        self._test('''<!ENTITY formatPercent "Another 100&#037;">
''',
                   tuple())
        self._test('''<!ENTITY formatPercent "Bad 100% should fail">
''',
                   (('error', (0, 32), 'not well-formed (invalid token)',
                     'xmlparse'),))

    def testNoNumber(self):
        self._test('''<!ENTITY ftd "foo">''',
                   (('warning', 0, 'reference is a number', 'number'),))

    def testNoLength(self):
        self._test('''<!ENTITY width "15miles">''',
                   (('error', 0, 'reference is a CSS length', 'css'),))

    def testNoStyle(self):
        self._test('''<!ENTITY style "15ch">''',
                   (('error', 0, 'reference is a CSS spec', 'css'),))
        self._test('''<!ENTITY style "junk">''',
                   (('error', 0, 'reference is a CSS spec', 'css'),))

    def testStyleWarnings(self):
        self._test('''<!ENTITY style "width:15ch">''',
                   (('warning', 0, 'height only in reference', 'css'),))
        self._test('''<!ENTITY style "width:15em;height:200px;">''',
                   (('warning', 0, "units for width don't match (em != ch)",
                     'css'),))

    def testNoWarning(self):
        self._test('''<!ENTITY width "12em">''', tuple())
        self._test('''<!ENTITY style "width:12ch;height:200px;">''', tuple())
        self._test('''<!ENTITY ftd "0">''', tuple())
+
+
+class TestAndroid(unittest.TestCase):
+    """Test Android checker
+
+    Make sure we're hitting our extra rules only if
+    we're passing in a DTD file in the embedding/android module.
+    """
+    apos_msg = u"Apostrophes in Android DTDs need escaping with \\' or " + \
+               u"\\u0027, or use \u2019, or put string in quotes."
+    quot_msg = u"Quotes in Android DTDs need escaping with \\\" or " + \
+               u"\\u0022, or put string in apostrophes."
+
    def getEntity(self, v):
        # Build a bare Entity whose value span covers all of v.
        # NOTE(review): the positional tuple arguments mirror the Entity
        # constructor's span parameters -- confirm against parser.Entity.
        return Entity(v, lambda s: s, (0, len(v)), (), (0, 0), (), (),
                      (0, len(v)), ())
+
+    def getDTDEntity(self, v):
+        v = v.replace('"', '&quot;')
+        return Entity('<!ENTITY foo "%s">' % v,
+                      lambda s: s,
+                      (0, len(v) + 16), (), (0, 0), (), (9, 12),
+                      (14, len(v) + 14), ())
+
    def test_android_dtd(self):
        """Testing the actual android checks. The logic is involved,
        so this is a lot of nitty gritty detail tests.
        """
        f = File("embedding/android/strings.dtd", "strings.dtd",
                 "embedding/android")
        checker = getChecker(f)
        # good string
        ref = self.getDTDEntity("plain string")
        l10n = self.getDTDEntity("plain localized string")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         ())
        # dtd warning
        l10n = self.getDTDEntity("plain localized string &ref;")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         (('warning', (0, 0),
                           'Referencing unknown entity `ref`', 'xmlparse'),))
        # no report on stray ampersand or quote, if not completely quoted
        for i in xrange(3):
            # make sure we're catching unescaped apostrophes,
            # try 0..5 backslashes
            l10n = self.getDTDEntity("\\"*(2*i) + "'")
            self.assertEqual(tuple(checker.check(ref, l10n)),
                             (('error', 2*i, self.apos_msg, 'android'),))
            l10n = self.getDTDEntity("\\"*(2*i + 1) + "'")
            self.assertEqual(tuple(checker.check(ref, l10n)),
                             ())
            # make sure we don't report if apos string is quoted
            l10n = self.getDTDEntity('"' + "\\"*(2*i) + "'\"")
            tpl = tuple(checker.check(ref, l10n))
            self.assertEqual(tpl, (),
                             "`%s` shouldn't fail but got %s"
                             % (l10n.val, str(tpl)))
            l10n = self.getDTDEntity('"' + "\\"*(2*i+1) + "'\"")
            tpl = tuple(checker.check(ref, l10n))
            self.assertEqual(tpl, (),
                             "`%s` shouldn't fail but got %s"
                             % (l10n.val, str(tpl)))
            # make sure we're catching unescaped quotes, try 0..5 backslashes
            l10n = self.getDTDEntity("\\"*(2*i) + "\"")
            self.assertEqual(tuple(checker.check(ref, l10n)),
                             (('error', 2*i, self.quot_msg, 'android'),))
            l10n = self.getDTDEntity("\\"*(2*i + 1) + "'")
            self.assertEqual(tuple(checker.check(ref, l10n)),
                             ())
            # make sure we don't report if quote string is single quoted
            l10n = self.getDTDEntity("'" + "\\"*(2*i) + "\"'")
            tpl = tuple(checker.check(ref, l10n))
            self.assertEqual(tpl, (),
                             "`%s` shouldn't fail but got %s" %
                             (l10n.val, str(tpl)))
            l10n = self.getDTDEntity('"' + "\\"*(2*i+1) + "'\"")
            tpl = tuple(checker.check(ref, l10n))
            self.assertEqual(tpl, (),
                             "`%s` shouldn't fail but got %s" %
                             (l10n.val, str(tpl)))
        # check for mixed quotes and ampersands
        l10n = self.getDTDEntity("'\"")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         (('error', 0, self.apos_msg, 'android'),
                          ('error', 1, self.quot_msg, 'android')))
        l10n = self.getDTDEntity("''\"'")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         (('error', 1, self.apos_msg, 'android'),))
        l10n = self.getDTDEntity('"\'""')
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         (('error', 2, self.quot_msg, 'android'),))

        # broken unicode escape
        l10n = self.getDTDEntity("Some broken \u098 unicode")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         (('error', 12, 'truncated \\uXXXX escape',
                           'android'),))
        # broken unicode escape, try to set the error off
        l10n = self.getDTDEntity(u"\u9690"*14+"\u006"+"  "+"\u0064")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         (('error', 14, 'truncated \\uXXXX escape',
                           'android'),))
+
    def test_android_prop(self):
        '''Android checker on a .properties file under embedding/android:
        no DTD checks apply, but android quoting and printf checks do.'''
        f = File("embedding/android/strings.properties", "strings.properties",
                 "embedding/android")
        checker = getChecker(f)
        # good plain string
        ref = self.getEntity("plain string")
        l10n = self.getEntity("plain localized string")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         ())
        # no dtd warning: entity references are not XML-parsed in
        # .properties files
        ref = self.getEntity("plain string")
        l10n = self.getEntity("plain localized string &ref;")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         ())
        # no report on stray ampersand
        ref = self.getEntity("plain string")
        l10n = self.getEntity("plain localized string with apos: '")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         ())
        # report on bad printf: %S in the l10n string doesn't match
        # the %s placeholder of the reference
        ref = self.getEntity("string with %s")
        l10n = self.getEntity("string with %S")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         (('error', 0, 'argument 1 `S` should be `s`',
                           'printf'),))
+
    def test_non_android_dtd(self):
        '''DTD checker outside embedding/android: XML parse checks
        apply, android quoting checks do not.'''
        f = File("browser/strings.dtd", "strings.dtd", "browser")
        checker = getChecker(f)
        # good string
        ref = self.getDTDEntity("plain string")
        l10n = self.getDTDEntity("plain localized string")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         ())
        # dtd warning: &ref; is not a known entity
        ref = self.getDTDEntity("plain string")
        l10n = self.getDTDEntity("plain localized string &ref;")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         (('warning', (0, 0),
                          'Referencing unknown entity `ref`', 'xmlparse'),))
        # no report on stray ampersand (apostrophes are only flagged
        # by the android checks)
        ref = self.getDTDEntity("plain string")
        l10n = self.getDTDEntity("plain localized string with apos: '")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         ())
+
    def test_entities_across_dtd(self):
        '''A checker constructed with a reference entity knows about
        the entities that reference declares (here `good.ref`).'''
        f = File("browser/strings.dtd", "strings.dtd", "browser")
        p = getParser(f.file)
        p.readContents('<!ENTITY other "some &good.ref;">')
        ref = p.parse()
        checker = getChecker(f, reference=ref[0])
        # good string
        ref = self.getDTDEntity("plain string")
        l10n = self.getDTDEntity("plain localized string")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         ())
        # dtd warning, with the known entities listed in the message
        ref = self.getDTDEntity("plain string")
        l10n = self.getDTDEntity("plain localized string &ref;")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         (('warning', (0, 0),
                           'Referencing unknown entity `ref` (good.ref known)',
                           'xmlparse'),))
        # no report on referencing an entity known from the reference
        ref = self.getDTDEntity("plain string")
        l10n = self.getDTDEntity("plain localized string with &good.ref;")
        self.assertEqual(tuple(checker.check(ref, l10n)),
                         ())
+
+
+if __name__ == '__main__':
+    unittest.main()
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_dtd.py
@@ -0,0 +1,86 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'''Tests for the DTD parser.
+'''
+
+import unittest
+import re
+
+from compare_locales.parser import getParser
+from compare_locales.tests import ParserTestMixin
+
+
class TestDTD(ParserTestMixin, unittest.TestCase):
    '''Tests for the DTD Parser.'''
    # The mixin picks the parser under test from this file name.
    filename = 'foo.dtd'

    def test_one_entity(self):
        # A single well-formed entity yields one (key, value) pair.
        self._test('''<!ENTITY foo.label "stuff">''',
                   (('foo.label', 'stuff'),))

    # Mix of well-formed and broken entities: unescaped double quotes
    # inside a double-quoted value break the entity, single quotes are
    # fine inside double quotes.
    quoteContent = '''<!ENTITY good.one "one">
<!ENTITY bad.one "bad " quote">
<!ENTITY good.two "two">
<!ENTITY bad.two "bad "quoted" word">
<!ENTITY good.three "three">
<!ENTITY good.four "good ' quote">
<!ENTITY good.five "good 'quoted' word">
'''
    # Expected pairs; broken entities come back as junk entries whose
    # key pattern encodes the byte offsets of the skipped region.
    quoteRef = (
        ('good.one', 'one'),
        ('_junk_\\d_25-56$', '<!ENTITY bad.one "bad " quote">'),
        ('good.two', 'two'),
        ('_junk_\\d_82-119$', '<!ENTITY bad.two "bad "quoted" word">'),
        ('good.three', 'three'),
        ('good.four', 'good \' quote'),
        ('good.five', 'good \'quoted\' word'),)

    def test_quotes(self):
        self._test(self.quoteContent, self.quoteRef)

    def test_apos(self):
        # Swap single and double quotes throughout the fixture and its
        # expectations; the parser should treat both quoting styles
        # symmetrically.
        qr = re.compile('[\'"]', re.M)

        def quot2apos(s):
            return qr.sub(lambda m: m.group(0) == '"' and "'" or '"', s)

        self._test(quot2apos(self.quoteContent),
                   map(lambda t: (t[0], quot2apos(t[1])), self.quoteRef))

    def test_parsed_ref(self):
        # A parameter entity declaration followed by its expansion.
        self._test('''<!ENTITY % fooDTD SYSTEM "chrome://brand.dtd">
  %fooDTD;
''',
                   (('fooDTD', '"chrome://brand.dtd"'),))

    def test_trailing_comment(self):
        # Entities inside a trailing XML comment must not be parsed.
        self._test('''<!ENTITY first "string">
<!ENTITY second "string">
<!--
<!ENTITY commented "out">
-->
''',
                   (('first', 'string'), ('second', 'string')))

    def test_license_header(self):
        # Both the triple-license fixture and a plain MPL2 comment are
        # recognized as the file header, not as junk.
        p = getParser('foo.dtd')
        p.readContents(self.resource('triple-license.dtd'))
        for e in p:
            self.assertEqual(e.key, 'foo')
            self.assertEqual(e.val, 'value')
        self.assert_('MPL' in p.header)
        p.readContents('''\
<!-- This Source Code Form is subject to the terms of the Mozilla Public
   - License, v. 2.0. If a copy of the MPL was not distributed with this file,
   - You can obtain one at http://mozilla.org/MPL/2.0/.  -->
<!ENTITY foo "value">
''')
        for e in p:
            self.assertEqual(e.key, 'foo')
            self.assertEqual(e.val, 'value')
        self.assert_('MPL' in p.header)
+
+if __name__ == '__main__':
+    unittest.main()
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_ini.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales.tests import ParserTestMixin
+
+
# MPL2 license block in ini comment syntax, shared by the header
# detection tests below.
mpl2 = '''\
; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this file,
; You can obtain one at http://mozilla.org/MPL/2.0/.
'''
+
+
class TestIniParser(ParserTestMixin, unittest.TestCase):
    '''Tests for the ini parser, mostly around what counts as the
    file header versus junk.'''

    # The mixin picks the parser under test from this file name.
    filename = 'foo.ini'

    def testSimpleHeader(self):
        # A leading comment line ends up in the parser's header.
        self._test('''; This file is in the UTF-8 encoding
[Strings]
TitleText=Some Title
''', (('TitleText', 'Some Title'),))
        self.assert_('UTF-8' in self.parser.header)

    def testMPL2_Space_UTF(self):
        # License block, blank line, another comment: all header.
        self._test(mpl2 + '''
; This file is in the UTF-8 encoding
[Strings]
TitleText=Some Title
''', (('TitleText', 'Some Title'),))
        self.assert_('MPL' in self.parser.header)

    def testMPL2_Space(self):
        self._test(mpl2 + '''
[Strings]
TitleText=Some Title
''', (('TitleText', 'Some Title'),))
        self.assert_('MPL' in self.parser.header)

    def testMPL2_MultiSpace(self):
        # Multiple blank lines and comments between the license and
        # the first section are still part of the header.
        self._test(mpl2 + '''\

; more comments

[Strings]
TitleText=Some Title
''', (('TitleText', 'Some Title'),))
        self.assert_('MPL' in self.parser.header)

    def testMPL2_Junk(self):
        # Non-comment junk between the license and the section makes
        # the whole leading region junk (offsets 0-213 are bytes into
        # the fixture), and the license is no longer a header.
        self._test(mpl2 + '''\
Junk
[Strings]
TitleText=Some Title
''', (('_junk_\\d+_0-213$', mpl2 + '''\
Junk
[Strings]'''), ('TitleText', 'Some Title')))
        self.assert_('MPL' not in self.parser.header)
+
+if __name__ == '__main__':
+    unittest.main()
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_merge.py
@@ -0,0 +1,105 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+import os
+from tempfile import mkdtemp
+import shutil
+
+from compare_locales.parser import getParser
+from compare_locales.paths import File
+from compare_locales.compare import ContentComparer
+
+
+class TestProperties(unittest.TestCase):
+
+    def setUp(self):
+        self.tmp = mkdtemp()
+        os.mkdir(os.path.join(self.tmp, "merge"))
+        self.ref = os.path.join(self.tmp, "en-reference.properties")
+        open(self.ref, "w").write("""foo = fooVal
+bar = barVal
+eff = effVal""")
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp)
+        del self.tmp
+
+    def testGood(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        l10n = os.path.join(self.tmp, "l10n.properties")
+        open(l10n, "w").write("""foo = lFoo
+bar = lBar
+eff = lEff
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.properties", ""),
+                   File(l10n, "l10n.properties", ""))
+        print cc.observer.serialize()
+
+    def testMissing(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        l10n = os.path.join(self.tmp, "l10n.properties")
+        open(l10n, "w").write("""bar = lBar
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.properties", ""),
+                   File(l10n, "l10n.properties", ""))
+        print cc.observer.serialize()
+        mergefile = os.path.join(self.tmp, "merge", "l10n.properties")
+        self.assertTrue(os.path.isfile(mergefile))
+        p = getParser(mergefile)
+        p.readFile(mergefile)
+        [m, n] = p.parse()
+        self.assertEqual(map(lambda e: e.key,  m), ["bar", "eff", "foo"])
+
+
+class TestDTD(unittest.TestCase):
+
+    def setUp(self):
+        self.tmp = mkdtemp()
+        os.mkdir(os.path.join(self.tmp, "merge"))
+        self.ref = os.path.join(self.tmp, "en-reference.dtd")
+        open(self.ref, "w").write("""<!ENTITY foo 'fooVal'>
+<!ENTITY bar 'barVal'>
+<!ENTITY eff 'effVal'>""")
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp)
+        del self.tmp
+
+    def testGood(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        l10n = os.path.join(self.tmp, "l10n.dtd")
+        open(l10n, "w").write("""<!ENTITY foo 'lFoo'>
+<!ENTITY bar 'lBar'>
+<!ENTITY eff 'lEff'>
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.dtd", ""),
+                   File(l10n, "l10n.dtd", ""))
+        print cc.observer.serialize()
+
+    def testMissing(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        l10n = os.path.join(self.tmp, "l10n.dtd")
+        open(l10n, "w").write("""<!ENTITY bar 'lBar'>
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.dtd", ""),
+                   File(l10n, "l10n.dtd", ""))
+        print cc.observer.serialize()
+        mergefile = os.path.join(self.tmp, "merge", "l10n.dtd")
+        self.assertTrue(os.path.isfile(mergefile))
+        p = getParser(mergefile)
+        p.readFile(mergefile)
+        [m, n] = p.parse()
+        self.assertEqual(map(lambda e: e.key,  m), ["bar", "eff", "foo"])
+
+if __name__ == '__main__':
+    unittest.main()
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_properties.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales.tests import ParserTestMixin
+
+
class TestPropertiesParser(ParserTestMixin, unittest.TestCase):
    '''Tests for the .properties parser: continuation lines, escape
    sequences, headers, and ports of legacy xpcom/netwerk tests.'''

    # The mixin picks the parser under test from this file name.
    filename = 'foo.properties'

    def testBackslashes(self):
        # A single trailing backslash continues the line; an escaped
        # backslash (\\) does not, so the following line is junk.
        self._test(r'''one_line = This is one line
two_line = This is the first \
of two lines
one_line_trailing = This line ends in \\
and has junk
two_lines_triple = This line is one of two and ends in \\\
and still has another line coming
''', (
            ('one_line', 'This is one line'),
            ('two_line', u'This is the first of two lines'),
            ('one_line_trailing', u'This line ends in \\'),
            ('_junk_\\d+_113-126$', 'and has junk\n'),
            ('two_lines_triple', 'This line is one of two and ends in \\'
             'and still has another line coming')))

    def testProperties(self):
        # port of netwerk/test/PropertiesTest.cpp
        self.parser.readContents(self.resource('test.properties'))
        ref = ['1', '2', '3', '4', '5', '6', '7', '8',
               'this is the first part of a continued line '
               'and here is the 2nd part']
        i = iter(self.parser)
        for r, e in zip(ref, i):
            self.assertEqual(e.val, r)

    def test_bug121341(self):
        # port of xpcom/tests/unit/test_bug121341.js
        self.parser.readContents(self.resource('bug121341.properties'))
        ref = ['abc', 'xy', u"\u1234\t\r\n\u00AB\u0001\n",
               "this is multiline property",
               "this is another multiline property", u"test\u0036",
               "yet another multiline propery", u"\ttest5\u0020", " test6\t",
               u"c\uCDEFd", u"\uABCD"]
        i = iter(self.parser)
        for r, e in zip(ref, i):
            self.assertEqual(e.val, r)

    def test_comment_in_multi(self):
        # A line starting with # inside a continuation is value text,
        # not a comment.
        self._test(r'''bar=one line with a \
# part that looks like a comment \
and an end''', (('bar', 'one line with a # part that looks like a comment '
                'and an end'),))

    def test_license_header(self):
        self._test('''\
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

foo=value
''', (('foo', 'value'),))
        self.assert_('MPL' in self.parser.header)

    def test_escapes(self):
        # \uNNNN consumes up to four hex digits (shorter runs are
        # allowed); for unknown escapes like \a the backslash is
        # dropped.
        self.parser.readContents(r'''
# unicode escapes
zero = some \unicode
one = \u0
two = \u41
three = \u042
four = \u0043
five = \u0044a
six = \a
seven = \n\r\t\\
''')
        ref = ['some unicode', chr(0), 'A', 'B', 'C', 'Da', 'a', '\n\r\t\\']
        for r, e in zip(ref, self.parser):
            self.assertEqual(e.val, r)

    def test_trailing_comment(self):
        # Trailing comments don't produce entities or junk.
        self._test('''first = string
second = string

#
#commented out
''', (('first', 'string'), ('second', 'string')))
+
+
+if __name__ == '__main__':
+    unittest.main()
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_util.py
@@ -0,0 +1,29 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales import util
+
+
class ParseLocalesTest(unittest.TestCase):
    '''Tests for util.parseLocales on all-locales/shipped-locales
    style content.

    Uses assertEqual rather than the deprecated assertEquals alias,
    for consistency with the other test modules.
    '''

    def test_empty(self):
        # No content yields no locales.
        self.assertEqual(util.parseLocales(''), [])

    def test_all(self):
        # Plain one-locale-per-line content.
        self.assertEqual(util.parseLocales('''af
de'''), ['af', 'de'])

    def test_shipped(self):
        # Platform annotations after the locale code are dropped, and
        # the result comes back sorted.
        self.assertEqual(util.parseLocales('''af
ja win mac
de'''), ['af', 'de', 'ja'])

    def test_sparse(self):
        # Empty lines are skipped.
        self.assertEqual(util.parseLocales('''
af

de

'''), ['af', 'de'])
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_webapps.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales import webapps
+
+
class TestFileComparison(unittest.TestCase):
    '''Tests for webapps.FileComparison, with disk access mocked out.'''

    def mock_FileComparison(self, mock_listdir):
        # Subclass FileComparison so the only disk access, _listdir,
        # is replaced by the given callable.
        class Target(webapps.FileComparison):
            def _listdir(self):
                return mock_listdir()
        return Target('.', 'en-US')

    def test_just_reference(self):
        # Only an en-US file present: no l10n locales, one reference
        # entry keyed by the file's base name.
        def _listdir():
            return ['my_app.en-US.properties']
        filecomp = self.mock_FileComparison(_listdir)
        filecomp.files()
        self.assertEqual(filecomp.locales(), [])
        self.assertEqual(filecomp._reference.keys(), ['my_app'])
        file_ = filecomp._reference['my_app']
        self.assertEqual(file_.file, 'locales/my_app.en-US.properties')

    def test_just_locales(self):
        # Locale codes are letters and hyphens only, so
        # my_app.po_SI.properties doesn't match and is ignored.
        def _listdir():
            return ['my_app.ar.properties',
                    'my_app.sr-Latn.properties',
                    'my_app.sv-SE.properties',
                    'my_app.po_SI.properties']
        filecomp = self.mock_FileComparison(_listdir)
        filecomp.files()
        self.assertEqual(filecomp.locales(),
                         ['ar', 'sr-Latn', 'sv-SE'])
        self.assertEqual(filecomp._files['ar'].keys(), ['my_app'])
        file_ = filecomp._files['ar']['my_app']
        self.assertEqual(file_.file, 'locales/my_app.ar.properties')
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/util.py
@@ -0,0 +1,11 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# This file is shared between compare-locales and locale-inspector
+# test_util is in compare-locales only, for the sake of easy
+# development.
+
+
def parseLocales(content):
    """Parse all-locales/shipped-locales style content.

    Each non-blank line starts with a locale code, optionally followed
    by whitespace-separated platform annotations, which are dropped.

    :param content: the file content as a string
    :returns: sorted list of locale codes

    Lines are filtered with strip() so that whitespace-only lines are
    skipped instead of raising IndexError on split()[0].
    """
    return sorted(l.split()[0] for l in content.splitlines() if l.strip())
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/webapps.py
@@ -0,0 +1,235 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'''gaia-style web apps support
+
+This variant supports manifest.webapp localization as well as
+.properties files with a naming scheme of locales/foo.*.properties.
+'''
+
+from collections import defaultdict
+import json
+import os
+import os.path
+import re
+
+from compare_locales.paths import File, EnumerateDir
+from compare_locales.compare import AddRemove, ContentComparer
+
+
class WebAppCompare(object):
    '''For a given directory, analyze
    /manifest.webapp
    /locales/*.*.properties

    Deduce the present locale codes.
    '''
    # Reuse the directory blacklist of the regular directory walker.
    ignore_dirs = EnumerateDir.ignore_dirs
    reference_locale = 'en-US'

    def __init__(self, basedir):
        '''Constructor
        :param basedir: Directory of the web app to inspect
        '''
        self.basedir = basedir
        self.manifest = Manifest(basedir, self.reference_locale)
        self.files = FileComparison(basedir, self.reference_locale)
        # Set via setWatcher; receives the comparison callbacks.
        self.watcher = None

    def compare(self, locales):
        '''Compare the manifest.webapp and the locales/*.*.properties
        :param locales: locale codes to compare; if empty, all locales
            found on disk are used.
        '''
        if not locales:
            locales = self.locales()
        self.manifest.compare(locales)
        self.files.compare(locales)

    def setWatcher(self, watcher):
        # Propagate the watcher to the two helpers doing the work.
        self.watcher = watcher
        self.manifest.watcher = watcher
        self.files.watcher = watcher

    def locales(self):
        '''Inspect files on disk to find present languages.
        :rtype: List of locales, sorted, including reference.
        '''
        # Union of locales in the manifest and on-disk .properties.
        locales = set(self.manifest.strings.keys())
        locales.update(self.files.locales())
        locales = list(sorted(locales))
        return locales
+
+
class Manifest(object):
    '''Class that helps with parsing and inspection of manifest.webapp.
    '''

    def __init__(self, basedir, reference_locale):
        self.file = File(os.path.join(basedir, 'manifest.webapp'),
                         'manifest.webapp')
        self.reference_locale = reference_locale
        # Lazily populated by the `strings` property.
        self._strings = None
        self.watcher = None

    @property
    def strings(self):
        # Parse on first access and cache the result, including a
        # failed parse (False), so errors are only reported once.
        if self._strings is None:
            self._strings = self.load_and_parse()
        return self._strings

    def load_and_parse(self):
        '''Load manifest.webapp from disk and extract its strings.

        :returns: two-level dict as per extract_manifest_strings, or
            False if the file is missing or not valid JSON (the
            watcher, if any, is notified of the error).
        '''
        try:
            manifest = json.load(open(self.file.fullpath))
        except (ValueError, IOError) as e:
            if self.watcher:
                self.watcher.notify('error', self.file, str(e))
            return False
        return self.extract_manifest_strings(manifest)

    def extract_manifest_strings(self, manifest_fragment):
        '''Extract localizable strings from a manifest dict.
        This method is recursive, and returns a two-level dict,
        first level being locale codes, second level being generated
        key and localized value. Keys are generated by concatenating
        each level in the json with a ".".

        NOTE: pops 'locales' keys from the fragment, i.e. mutates its
        argument.
        '''
        rv = defaultdict(dict)
        localizable = manifest_fragment.pop('locales', {})
        if localizable:
            for locale, keyvalue in localizable.items():
                for key, value in keyvalue.items():
                    # AB_CD stands in for the locale code so that keys
                    # are comparable across locales.
                    key = '.'.join(['locales', 'AB_CD', key])
                    rv[locale][key] = value
        for key, sub_manifest in manifest_fragment.items():
            if not isinstance(sub_manifest, dict):
                continue
            subdict = self.extract_manifest_strings(sub_manifest)
            if subdict:
                # Iterate items(), not the dict itself: iterating the
                # dict would only yield the locale codes, and the
                # two-name unpacking below would misfire.
                for locale, keyvalue in subdict.items():
                    rv[locale].update((key + '.' + subkey, value)
                                      for subkey, value
                                      in keyvalue.items())
        return rv

    def compare(self, locales):
        '''Compare each requested locale's strings against the
        reference locale, notifying the watcher of differences.
        '''
        strings = self.strings
        if not strings:
            return
        # create a copy so that we can mock around with it
        strings = strings.copy()
        reference = strings.pop(self.reference_locale)
        for locale in locales:
            if locale == self.reference_locale:
                continue
            self.compare_strings(reference,
                                 strings.get(locale, {}),
                                 locale)

    def compare_strings(self, reference, l10n, locale):
        '''Notify the watcher about entities that are missing from or
        obsolete in one locale, keyed with the real locale code
        substituted for the AB_CD placeholder.

        (Earlier revisions also tallied missing/obsolete/changed/
        unchanged counts here, but never used them; dropped.)
        '''
        add_remove = AddRemove()
        add_remove.set_left(sorted(reference.keys()))
        add_remove.set_right(sorted(l10n.keys()))
        for op, item_or_pair in add_remove:
            if op == 'equal':
                # Present in both locales, nothing to report.
                continue
            key = item_or_pair.replace('.AB_CD.',
                                       '.%s.' % locale)
            if op == 'add':
                # obsolete entry
                self.watcher.notify('obsoleteEntity', self.file, key)
            else:
                # missing entry
                self.watcher.notify('missingEntity', self.file, key)
+
+
class FileComparison(object):
    '''Compare the locales/*.*.properties files inside a webapp.
    '''
    # Matches e.g. my_app.sr-Latn.properties; locale codes are letters
    # and hyphens only.
    prop = re.compile('(?P<base>.*)\\.'
                      '(?P<locale>[a-zA-Z]+(?:-[a-zA-Z]+)*)'
                      '\\.properties$')

    def __init__(self, basedir, reference_locale):
        self.basedir = basedir
        self.reference_locale = reference_locale
        self.watcher = None
        # Populated by files(): base name -> File for the reference
        # locale, and locale -> base name -> File for the others.
        self._reference = self._files = None

    def locales(self):
        '''Get the locales present in the webapp
        :rtype: sorted list of locale codes, reference excluded
        '''
        self.files()
        return sorted(self._files)

    def compare(self, locales):
        '''Compare each locale's files against the reference files,
        dispatching compare/remove/add to the watcher.
        '''
        self.files()
        for locale in locales:
            l10n = self._files[locale]
            filecmp = AddRemove()
            filecmp.set_left(sorted(self._reference.keys()))
            filecmp.set_right(sorted(l10n.keys()))
            for op, item_or_pair in filecmp:
                if op == 'equal':
                    self.watcher.compare(self._reference[item_or_pair[0]],
                                         l10n[item_or_pair[1]])
                elif op == 'add':
                    # obsolete file
                    self.watcher.remove(l10n[item_or_pair])
                else:
                    # missing file
                    _path = '.'.join([item_or_pair, locale, 'properties'])
                    missingFile = File(
                        os.path.join(self.basedir, 'locales', _path),
                        'locales/' + _path)
                    self.watcher.add(self._reference[item_or_pair],
                                     missingFile)

    def files(self):
        '''Read the list of locales from disk.
        Idempotent: only hits the disk on the first call.
        '''
        # Compare against None explicitly: an empty reference dict is
        # a valid cached result and must not trigger a re-scan.
        if self._reference is not None:
            return
        self._reference = {}
        self._files = defaultdict(dict)
        for path in self._listdir():
            match = self.prop.match(path)
            if match is None:
                continue
            locale = match.group('locale')
            if locale == self.reference_locale:
                target = self._reference
            else:
                target = self._files[locale]
            fullpath = os.path.join(self.basedir, 'locales', path)
            target[match.group('base')] = File(fullpath, 'locales/' + path)

    def _listdir(self):
        'Monkey-patch this for testing.'
        return os.listdir(os.path.join(self.basedir, 'locales'))
+
+
def compare_web_app(basedir, locales, other_observer=None):
    '''Compare gaia-style web app.

    :param basedir: directory of the web app to compare
    :param locales: locale codes to compare; if empty, all locales
        found on disk are used
    Optional arguments are:
    - other_observer. A object implementing
        notify(category, _file, data)
      The return values of that callback are ignored.
    :returns: the ContentComparer's observer with the results
    '''
    comparer = ContentComparer()
    if other_observer is not None:
        comparer.add_observer(other_observer)
    webapp_comp = WebAppCompare(basedir)
    webapp_comp.setWatcher(comparer)
    webapp_comp.compare(locales)
    return comparer.observer