bug 1315976, update compare-locales to 1.2.1, r=flod
authorAxel Hecht <axel@pike.org>
Thu, 24 Nov 2016 14:30:31 +0100
changeset 324196 626c5d74b74cc7bcd3625f57d6df1929401ad971
parent 324195 37a877e840fc012433e1aaba208a1c13f9ce22f1
child 324197 2e262535fc7358975b86240798c75cf624f39870
push id24
push usermaklebus@msu.edu
push dateTue, 20 Dec 2016 03:11:33 +0000
reviewersflod
bugs1315976
milestone53.0a1
bug 1315976, update compare-locales to 1.2.1, r=flod MozReview-Commit-ID: 6sZtRiw0mQs
python/compare-locales/compare_locales/__init__.py
python/compare-locales/compare_locales/commands.py
python/compare-locales/compare_locales/compare.py
python/compare-locales/compare_locales/parser.py
python/compare-locales/compare_locales/tests/__init__.py
python/compare-locales/compare_locales/tests/test_checks.py
python/compare-locales/compare_locales/tests/test_defines.py
python/compare-locales/compare_locales/tests/test_dtd.py
python/compare-locales/compare_locales/tests/test_ini.py
python/compare-locales/compare_locales/tests/test_merge.py
python/compare-locales/compare_locales/tests/test_parser.py
python/compare-locales/compare_locales/tests/test_properties.py
python/compare-locales/compare_locales/webapps.py
--- a/python/compare-locales/compare_locales/__init__.py
+++ b/python/compare-locales/compare_locales/__init__.py
@@ -1,1 +1,1 @@
-version = "1.1"
+version = "1.2.1"
--- a/python/compare-locales/compare_locales/commands.py
+++ b/python/compare-locales/compare_locales/commands.py
@@ -1,154 +1,155 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 'Commands exposed to commandlines'
 
 import logging
-from optparse import OptionParser, make_option
+from argparse import ArgumentParser
 
+from compare_locales import version
 from compare_locales.paths import EnumerateApp
 from compare_locales.compare import compareApp, compareDirs
 from compare_locales.webapps import compare_web_app
 
 
 class BaseCommand(object):
     """Base class for compare-locales commands.
     This handles command line parsing, and general sugar for setuptools
     entry_points.
     """
-    options = [
-        make_option('-v', '--verbose', action='count', dest='v', default=0,
-                    help='Make more noise'),
-        make_option('-q', '--quiet', action='count', dest='q', default=0,
-                    help='Make less noise'),
-        make_option('-m', '--merge',
-                    help='''Use this directory to stage merged files,
-use {ab_CD} to specify a different directory for each locale'''),
-    ]
-    data_option = make_option('--data', choices=['text', 'exhibit', 'json'],
-                              default='text',
-                              help='''Choose data and format (one of text,
+
+    def __init__(self):
+        self.parser = None
+
+    def get_parser(self):
+        """Get an ArgumentParser, with class docstring as description.
+        """
+        parser = ArgumentParser(description=self.__doc__)
+        parser.add_argument('--version', action='version',
+                            version='%(prog)s ' + version)
+        parser.add_argument('-v', '--verbose', action='count', dest='v',
+                            default=0, help='Make more noise')
+        parser.add_argument('-q', '--quiet', action='count', dest='q',
+                            default=0, help='Make less noise')
+        parser.add_argument('-m', '--merge',
+                            help='''Use this directory to stage merged files,
+use {ab_CD} to specify a different directory for each locale''')
+        return parser
+
+    def add_data_argument(self, parser):
+        parser.add_argument('--data', choices=['text', 'exhibit', 'json'],
+                            default='text',
+                            help='''Choose data and format (one of text,
 exhibit, json); text: (default) Show which files miss which strings, together
 with warnings and errors. Also prints a summary; json: Serialize the internal
 tree, useful for tools. Also always succeeds; exhibit: Serialize the summary
 data in a json useful for Exhibit
 ''')
 
-    def __init__(self):
-        self.parser = None
-
-    def get_parser(self):
-        """Get an OptionParser, with class docstring as usage, and
-        self.options.
-        """
-        parser = OptionParser()
-        parser.set_usage(self.__doc__)
-        for option in self.options:
-            parser.add_option(option)
-        return parser
-
     @classmethod
     def call(cls):
         """Entry_point for setuptools.
         The actual command handling is done in the handle() method of the
         subclasses.
         """
         cmd = cls()
         cmd.handle_()
 
     def handle_(self):
         """The instance part of the classmethod call."""
         self.parser = self.get_parser()
-        (options, args) = self.parser.parse_args()
+        args = self.parser.parse_args()
         # log as verbose or quiet as we want, warn by default
         logging.basicConfig()
         logging.getLogger().setLevel(logging.WARNING -
-                                     (options.v - options.q)*10)
-        observer = self.handle(args, options)
-        print observer.serialize(type=options.data).encode('utf-8', 'replace')
+                                     (args.v - args.q) * 10)
+        observer = self.handle(args)
+        print observer.serialize(type=args.data).encode('utf-8', 'replace')
 
-    def handle(self, args, options):
+    def handle(self, args):
         """Subclasses need to implement this method for the actual
         command handling.
         """
         raise NotImplementedError
 
 
 class CompareLocales(BaseCommand):
-    """usage: %prog [options] l10n.ini l10n_base_dir [locale ...]
-
-Check the localization status of a gecko application.
+    """Check the localization status of a gecko application.
 The first argument is a path to the l10n.ini file for the application,
 followed by the base directory of the localization repositories.
 Then you pass in the list of locale codes you want to compare. If there are
 not locales given, the list of locales will be taken from the all-locales file
 of the application\'s l10n.ini."""
 
-    options = BaseCommand.options + [
-        make_option('--clobber-merge', action="store_true", default=False,
-                    dest='clobber',
-                    help="""WARNING: DATALOSS.
+    def get_parser(self):
+        parser = super(CompareLocales, self).get_parser()
+        parser.add_argument('ini_file', metavar='l10n.ini',
+                            help='INI file for the project')
+        parser.add_argument('l10n_base_dir', metavar='l10n-base-dir',
+                            help='Parent directory of localizations')
+        parser.add_argument('locales', nargs='*', metavar='locale-code',
+                            help='Locale code and top-level directory of '
+                                 'each localization')
+        parser.add_argument('--clobber-merge', action="store_true",
+                            default=False, dest='clobber',
+                            help="""WARNING: DATALOSS.
 Use this option with care. If specified, the merge directory will
 be clobbered for each module. That means, the subdirectory will
 be completely removed, any files that were there are lost.
-Be careful to specify the right merge directory when using this option."""),
-        make_option('-r', '--reference', default='en-US', dest='reference',
-                    help='Explicitly set the reference '
-                    'localization. [default: en-US]'),
-        BaseCommand.data_option
-    ]
+Be careful to specify the right merge directory when using this option.""")
+        parser.add_argument('-r', '--reference', default='en-US',
+                            dest='reference',
+                            help='Explicitly set the reference '
+                            'localization. [default: en-US]')
+        self.add_data_argument(parser)
+        return parser
 
-    def handle(self, args, options):
-        if len(args) < 2:
-            self.parser.error('Need to pass in list of languages')
-        inipath, l10nbase = args[:2]
-        locales = args[2:]
-        app = EnumerateApp(inipath, l10nbase, locales)
-        app.reference = options.reference
+    def handle(self, args):
+        app = EnumerateApp(args.ini_file, args.l10n_base_dir, args.locales)
+        app.reference = args.reference
         try:
-            observer = compareApp(app, merge_stage=options.merge,
-                                  clobber=options.clobber)
+            observer = compareApp(app, merge_stage=args.merge,
+                                  clobber=args.clobber)
         except (OSError, IOError), exc:
             print "FAIL: " + str(exc)
             self.parser.exit(2)
         return observer
 
 
 class CompareDirs(BaseCommand):
-    """usage: %prog [options] reference localization
-
-Check the localization status of a directory tree.
+    """Check the localization status of a directory tree.
 The first argument is a path to the reference data,the second is the
 localization to be tested."""
 
-    options = BaseCommand.options + [
-        BaseCommand.data_option
-    ]
+    def get_parser(self):
+        parser = super(CompareDirs, self).get_parser()
+        parser.add_argument('reference')
+        parser.add_argument('localization')
+        self.add_data_argument(parser)
+        return parser
 
-    def handle(self, args, options):
-        if len(args) != 2:
-            self.parser.error('Reference and localizatino required')
-        reference, locale = args
-        observer = compareDirs(reference, locale, merge_stage=options.merge)
+    def handle(self, args):
+        observer = compareDirs(args.reference, args.localization,
+                               merge_stage=args.merge)
         return observer
 
 
 class CompareWebApp(BaseCommand):
-    """usage: %prog [options] webapp [locale locale]
-
-Check the localization status of a gaia-style web app.
+    """Check the localization status of a gaia-style web app.
 The first argument is the directory of the web app.
 Following arguments explicitly state the locales to test.
 If none are given, test all locales in manifest.webapp or files."""
 
-    options = BaseCommand.options[:-1] + [
-        BaseCommand.data_option]
+    def get_parser(self):
+        parser = super(CompareWebApp, self).get_parser()
+        parser.add_argument('webapp')
+        parser.add_argument('locales', nargs='*', metavar='locale-code',
+                            help='Locale code and top-level directory of '
+                                 'each localization')
+        self.add_data_argument(parser)
+        return parser
 
-    def handle(self, args, options):
-        if len(args) < 1:
-            self.parser.error('Webapp directory required')
-        basedir = args[0]
-        locales = args[1:]
-        observer = compare_web_app(basedir, locales)
+    def handle(self, args):
+        observer = compare_web_app(args.webapp, args.locales)
         return observer
--- a/python/compare-locales/compare_locales/compare.py
+++ b/python/compare-locales/compare_locales/compare.py
@@ -378,45 +378,45 @@ class ContentComparer:
         Results from the notify calls are ignored.
         '''
         self.other_observers.append(obs)
 
     def set_merge_stage(self, merge_stage):
         self.merge_stage = merge_stage
 
     def merge(self, ref_entities, ref_map, ref_file, l10n_file, missing,
-              skips, p):
+              skips, ctx, canMerge, encoding):
         outfile = os.path.join(self.merge_stage, l10n_file.module,
                                l10n_file.file)
         outdir = os.path.dirname(outfile)
         if not os.path.isdir(outdir):
             os.makedirs(outdir)
-        if not p.canMerge:
+        if not canMerge:
             shutil.copyfile(ref_file.fullpath, outfile)
             print "copied reference to " + outfile
             return
         if skips:
             # skips come in ordered by key name, we need them in file order
             skips.sort(key=lambda s: s.span[0])
         trailing = (['\n'] +
                     [ref_entities[ref_map[key]].all for key in missing] +
                     [ref_entities[ref_map[skip.key]].all for skip in skips
                      if not isinstance(skip, parser.Junk)])
         if skips:
             # we need to skip a few errornous blocks in the input, copy by hand
-            f = codecs.open(outfile, 'wb', p.encoding)
+            f = codecs.open(outfile, 'wb', encoding)
             offset = 0
             for skip in skips:
                 chunk = skip.span
-                f.write(p.contents[offset:chunk[0]])
+                f.write(ctx.contents[offset:chunk[0]])
                 offset = chunk[1]
-            f.write(p.contents[offset:])
+            f.write(ctx.contents[offset:])
         else:
             shutil.copyfile(l10n_file.fullpath, outfile)
-            f = codecs.open(outfile, 'ab', p.encoding)
+            f = codecs.open(outfile, 'ab', encoding)
         print "adding to " + outfile
 
         def ensureNewline(s):
             if not s.endswith('\n'):
                 return s + '\n'
             return s
 
         f.write(''.join(map(ensureNewline, trailing)))
@@ -453,30 +453,20 @@ class ContentComparer:
                 return
             self.reference[ref_file] = p.parse()
         ref = self.reference[ref_file]
         ref_list = ref[1].keys()
         ref_list.sort()
         try:
             p.readContents(l10n.getContents())
             l10n_entities, l10n_map = p.parse()
+            l10n_ctx = p.ctx
         except Exception, e:
             self.notify('error', l10n, str(e))
             return
-        lines = []
-
-        def _getLine(offset):
-            if not lines:
-                lines.append(0)
-                for m in self.nl.finditer(p.contents):
-                    lines.append(m.end())
-            for i in xrange(len(lines), 0, -1):
-                if offset >= lines[i - 1]:
-                    return (i, offset - lines[i - 1])
-            return (1, offset)
 
         l10n_list = l10n_map.keys()
         l10n_list.sort()
         ar = AddRemove()
         ar.set_left(ref_list)
         ar.set_right(l10n_list)
         report = missing = obsolete = changed = unchanged = keys = 0
         missings = []
@@ -496,19 +486,20 @@ class ContentComparer:
                 else:
                     # just report
                     report += 1
             elif action == 'add':
                 # obsolete entity or junk
                 if isinstance(l10n_entities[l10n_map[item_or_pair]],
                               parser.Junk):
                     junk = l10n_entities[l10n_map[item_or_pair]]
-                    params = (junk.val,) + junk.span
+                    params = (junk.val,) + junk.position() + junk.position(-1)
                     self.notify('error', l10n,
-                                'Unparsed content "%s" at %d-%d' % params)
+                                'Unparsed content "%s" from line %d colum %d'
+                                ' to line %d column %d' % params)
                     if self.merge_stage is not None:
                         skips.append(junk)
                 elif self.notify('obsoleteEntity', l10n,
                                  item_or_pair) != 'ignore':
                     obsolete += 1
             else:
                 # entity found in both ref and l10n, check for changed
                 entity = item_or_pair[0]
@@ -523,37 +514,40 @@ class ContentComparer:
                     else:
                         self.doChanged(ref_file, refent, l10nent)
                         changed += 1
                         # run checks:
                 if checker:
                     for tp, pos, msg, cat in checker.check(refent, l10nent):
                         # compute real src position, if first line,
                         # col needs adjustment
-                        _l, _offset = _getLine(l10nent.val_span[0])
                         if isinstance(pos, tuple):
+                            _l, col = l10nent.value_position()
                             # line, column
                             if pos[0] == 1:
-                                col = pos[1] + _offset
+                                col = col + pos[1]
                             else:
                                 col = pos[1]
-                            _l += pos[0] - 1
+                                _l += pos[0] - 1
                         else:
-                            _l, col = _getLine(l10nent.val_span[0] + pos)
-                            # skip error entities when merging
+                            _l, col = l10nent.value_position(pos)
+                        # skip error entities when merging
                         if tp == 'error' and self.merge_stage is not None:
                             skips.append(l10nent)
                         self.notify(tp, l10n,
                                     u"%s at line %d, column %d for %s" %
                                     (msg, _l, col, refent.key))
                 pass
         if missing:
             self.notify('missing', l10n, missing)
         if self.merge_stage is not None and (missings or skips):
-            self.merge(ref[0], ref[1], ref_file, l10n, missings, skips, p)
+            self.merge(
+                ref[0], ref[1], ref_file,
+                l10n, missings, skips, l10n_ctx,
+                p.canMerge, p.encoding)
         if report:
             self.notify('report', l10n, report)
         if obsolete:
             self.notify('obsolete', l10n, obsolete)
         if changed:
             self.notify('changed', l10n, changed)
         if unchanged:
             self.notify('unchanged', l10n, unchanged)
--- a/python/compare-locales/compare_locales/parser.py
+++ b/python/compare-locales/compare_locales/parser.py
@@ -1,199 +1,298 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import re
+import bisect
 import codecs
 import logging
-from HTMLParser import HTMLParser
 
 __constructors = []
 
 
-class Entity(object):
+class EntityBase(object):
     '''
     Abstraction layer for a localizable entity.
     Currently supported are grammars of the form:
 
     1: pre white space
-    2: pre comments
-    3: entity definition
-    4: entity key (name)
-    5: entity value
-    6: post comment (and white space) in the same line (dtd only)
+    2: entity definition
+    3: entity key (name)
+    4: entity value
+    5: post white space
                                                  <--[1]
-    <!-- pre comments -->                        <--[2]
-    <!ENTITY key "value"> <!-- comment -->
+    <!ENTITY key "value">
 
-    <-------[3]---------><------[6]------>
+    <-------[2]--------->
     '''
-    def __init__(self, contents, pp,
-                 span, pre_ws_span, pre_comment_span, def_span,
+    def __init__(self, ctx, pp, pre_comment,
+                 span, pre_ws_span, def_span,
                  key_span, val_span, post_span):
-        self.contents = contents
+        self.ctx = ctx
         self.span = span
         self.pre_ws_span = pre_ws_span
-        self.pre_comment_span = pre_comment_span
         self.def_span = def_span
         self.key_span = key_span
         self.val_span = val_span
         self.post_span = post_span
         self.pp = pp
+        self.pre_comment = pre_comment
         pass
 
+    def position(self, offset=0):
+        """Get the 1-based line and column of the character
+        with given offset into the Entity.
+
+        If offset is negative, return the end of the Entity.
+        """
+        if offset < 0:
+            pos = self.span[1]
+        else:
+            pos = self.span[0] + offset
+        return self.ctx.lines(pos)[0]
+
+    def value_position(self, offset=0):
+        """Get the 1-based line and column of the character
+        with given offset into the value.
+
+        If offset is negative, return the end of the value.
+        """
+        if offset < 0:
+            pos = self.val_span[1]
+        else:
+            pos = self.val_span[0] + offset
+        return self.ctx.lines(pos)[0]
+
     # getter helpers
 
     def get_all(self):
-        return self.contents[self.span[0]:self.span[1]]
+        return self.ctx.contents[self.span[0]:self.span[1]]
 
     def get_pre_ws(self):
-        return self.contents[self.pre_ws_span[0]:self.pre_ws_span[1]]
-
-    def get_pre_comment(self):
-        return self.contents[self.pre_comment_span[0]:
-                             self.pre_comment_span[1]]
+        return self.ctx.contents[self.pre_ws_span[0]:self.pre_ws_span[1]]
 
     def get_def(self):
-        return self.contents[self.def_span[0]:self.def_span[1]]
+        return self.ctx.contents[self.def_span[0]:self.def_span[1]]
 
     def get_key(self):
-        return self.contents[self.key_span[0]:self.key_span[1]]
+        return self.ctx.contents[self.key_span[0]:self.key_span[1]]
 
     def get_val(self):
-        return self.pp(self.contents[self.val_span[0]:self.val_span[1]])
+        return self.pp(self.ctx.contents[self.val_span[0]:self.val_span[1]])
 
     def get_raw_val(self):
-        return self.contents[self.val_span[0]:self.val_span[1]]
+        return self.ctx.contents[self.val_span[0]:self.val_span[1]]
 
     def get_post(self):
-        return self.contents[self.post_span[0]:self.post_span[1]]
+        return self.ctx.contents[self.post_span[0]:self.post_span[1]]
 
     # getters
 
     all = property(get_all)
     pre_ws = property(get_pre_ws)
-    pre_comment = property(get_pre_comment)
     definition = property(get_def)
     key = property(get_key)
     val = property(get_val)
     raw_val = property(get_raw_val)
     post = property(get_post)
 
     def __repr__(self):
         return self.key
 
 
+class Entity(EntityBase):
+    pass
+
+
+class Comment(EntityBase):
+    def __init__(self, ctx, span, pre_ws_span, def_span,
+                 post_span):
+        self.ctx = ctx
+        self.span = span
+        self.pre_ws_span = pre_ws_span
+        self.def_span = def_span
+        self.post_span = post_span
+        self.pp = lambda v: v
+
+    @property
+    def key(self):
+        return None
+
+    @property
+    def val(self):
+        return None
+
+    def __repr__(self):
+        return self.all
+
+
 class Junk(object):
     '''
     An almost-Entity, representing junk data that we didn't parse.
     This way, we can signal bad content as stuff we don't understand.
     And the either fix that, or report real bugs in localizations.
     '''
     junkid = 0
 
-    def __init__(self, contents, span):
-        self.contents = contents
+    def __init__(self, ctx, span):
+        self.ctx = ctx
         self.span = span
-        self.pre_ws = self.pre_comment = self.definition = self.post = ''
+        self.pre_ws = self.definition = self.post = ''
         self.__class__.junkid += 1
         self.key = '_junk_%d_%d-%d' % (self.__class__.junkid, span[0], span[1])
 
+    def position(self, offset=0):
+        """Get the 1-based line and column of the character
+        with given offset into the Entity.
+
+        If offset is negative, return the end of the Entity.
+        """
+        if offset < 0:
+            pos = self.span[1]
+        else:
+            pos = self.span[0] + offset
+        return self.ctx.lines(pos)[0]
+
     # getter helpers
     def get_all(self):
-        return self.contents[self.span[0]:self.span[1]]
+        return self.ctx.contents[self.span[0]:self.span[1]]
 
     # getters
     all = property(get_all)
     val = property(get_all)
 
     def __repr__(self):
         return self.key
 
 
+class Whitespace(EntityBase):
+    '''Entity-like object representing an empty file with whitespace,
+    if allowed
+    '''
+    def __init__(self, ctx, span):
+        self.ctx = ctx
+        self.key_span = self.val_span = self.span = span
+        self.def_span = self.pre_ws_span = (span[0], span[0])
+        self.post_span = (span[1], span[1])
+        self.pp = lambda v: v
+
+    def __repr__(self):
+        return self.raw_val
+
+
 class Parser:
     canMerge = True
+    tail = re.compile('\s+\Z')
+
+    class Context(object):
+        "Fixture for content and line numbers"
+        def __init__(self, contents):
+            self.contents = contents
+            self._lines = None
+
+        def lines(self, *positions):
+            # return line and column tuples, 1-based
+            if self._lines is None:
+                nl = re.compile('\n', re.M)
+                self._lines = [m.end()
+                               for m in nl.finditer(self.contents)]
+            line_nrs = [bisect.bisect(self._lines, p) for p in positions]
+            # compute columns
+            pos_ = [
+                (1 + line, 1 + p - (self._lines[line-1] if line else 0))
+                for line, p in zip(line_nrs, positions)]
+            return pos_
 
     def __init__(self):
         if not hasattr(self, 'encoding'):
             self.encoding = 'utf-8'
-        pass
+        self.ctx = None
+        self.last_comment = None
 
     def readFile(self, file):
-        f = codecs.open(file, 'r', self.encoding)
-        try:
-            self.contents = f.read()
-        except UnicodeDecodeError, e:
-            (logging.getLogger('locales')
-                    .error("Can't read file: " + file + '; ' + str(e)))
-            self.contents = u''
-        f.close()
+        with open(file, 'rU') as f:
+            try:
+                self.readContents(f.read())
+            except UnicodeDecodeError, e:
+                (logging.getLogger('locales')
+                        .error("Can't read file: " + file + '; ' + str(e)))
 
     def readContents(self, contents):
-        (self.contents, length) = codecs.getdecoder(self.encoding)(contents)
+        '''Read contents and create parsing context.
+
+        contents are in native encoding, but with normalized line endings.
+        '''
+        (contents, length) = codecs.getdecoder(self.encoding)(contents)
+        self.ctx = Parser.Context(contents)
 
     def parse(self):
         l = []
         m = {}
         for e in self:
             m[e.key] = len(l)
             l.append(e)
         return (l, m)
 
     def postProcessValue(self, val):
         return val
 
     def __iter__(self):
-        contents = self.contents
+        return self.walk(onlyEntities=True)
+
+    def walk(self, onlyEntities=False):
+        if not self.ctx:
+            # loading file failed, or we just didn't load anything
+            return
+        ctx = self.ctx
+        contents = ctx.contents
         offset = 0
-        self.header, offset = self.getHeader(contents, offset)
-        self.footer = ''
-        entity, offset = self.getEntity(contents, offset)
+        entity, offset = self.getEntity(ctx, offset)
         while entity:
-            yield entity
-            entity, offset = self.getEntity(contents, offset)
-        f = self.reFooter.match(contents, offset)
-        if f:
-            self.footer = f.group()
-            offset = f.end()
+            if (not onlyEntities or
+                    type(entity) is Entity or
+                    type(entity) is Junk):
+                yield entity
+            entity, offset = self.getEntity(ctx, offset)
         if len(contents) > offset:
-            yield Junk(contents, (offset, len(contents)))
-        pass
+            yield Junk(ctx, (offset, len(contents)))
 
-    def getHeader(self, contents, offset):
-        header = ''
-        h = self.reHeader.match(contents)
-        if h:
-            header = h.group()
-            offset = h.end()
-        return (header, offset)
-
-    def getEntity(self, contents, offset):
-        m = self.reKey.match(contents, offset)
+    def getEntity(self, ctx, offset):
+        m = self.reKey.match(ctx.contents, offset)
         if m:
             offset = m.end()
-            entity = self.createEntity(contents, m)
+            entity = self.createEntity(ctx, m)
             return (entity, offset)
-        # first check if footer has a non-empty match,
-        # 'cause then we don't find junk
-        m = self.reFooter.match(contents, offset)
-        if m and m.end() > offset:
-            return (None, offset)
-        m = self.reKey.search(contents, offset)
+        m = self.reComment.match(ctx.contents, offset)
         if m:
-            # we didn't match, but search, so there's junk between offset
-            # and start. We'll match() on the next turn
-            junkend = m.start()
-            return (Junk(contents, (offset, junkend)), junkend)
-        return (None, offset)
+            offset = m.end()
+            self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(4)])
+            return (self.last_comment, offset)
+        return self.getTrailing(ctx, offset, self.reKey, self.reComment)
 
-    def createEntity(self, contents, m):
-        return Entity(contents, self.postProcessValue,
-                      *[m.span(i) for i in xrange(7)])
+    def getTrailing(self, ctx, offset, *expressions):
+        junkend = None
+        for exp in expressions:
+            m = exp.search(ctx.contents, offset)
+            if m:
+                junkend = min(junkend, m.start()) if junkend else m.start()
+        if junkend is None:
+            if self.tail.match(ctx.contents, offset):
+                white_end = len(ctx.contents)
+                return (Whitespace(ctx, (offset, white_end)), white_end)
+            else:
+                return (None, offset)
+        return (Junk(ctx, (offset, junkend)), junkend)
+
+    def createEntity(self, ctx, m):
+        pre_comment = unicode(self.last_comment) if self.last_comment else ''
+        self.last_comment = ''
+        return Entity(ctx, self.postProcessValue, pre_comment,
+                      *[m.span(i) for i in xrange(6)])
 
 
 def getParser(path):
     for item in __constructors:
         if re.search(item[0], path):
             return item[1]
     raise UserWarning("Cannot find Parser")
 
@@ -225,297 +324,249 @@ class DTDParser(Parser):
         u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
         u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
     # + \U00010000-\U000EFFFF seems to be unsupported in python
 
     # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
     #     [#x0300-#x036F] | [#x203F-#x2040]
     NameChar = NameStartChar + ur'\-\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
     Name = '[' + NameStartChar + '][' + NameChar + ']*'
-    reKey = re.compile('(?:(?P<pre>\s*)(?P<precomment>(?:' + XmlComment +
-                       '\s*)*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
+    reKey = re.compile('(?:(?P<pre>\s*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
                        ')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)'
-                       '(?P<post>[ \t]*(?:' + XmlComment + '\s*)*\n?)?)',
-                       re.DOTALL)
+                       '(?P<post>\s*)?)',
+                       re.DOTALL | re.M)
     # add BOM to DTDs, details in bug 435002
-    reHeader = re.compile(u'^\ufeff?'
-                          u'(\s*<!--.*(http://mozilla.org/MPL/2.0/|'
-                          u'LICENSE BLOCK)([^-]+-)*[^-]+-->)?', re.S)
-    reFooter = re.compile('\s*(<!--([^-]+-)*[^-]+-->\s*)*$')
-    rePE = re.compile('(?:(\s*)((?:' + XmlComment + '\s*)*)'
-                      '(<!ENTITY\s+%\s+(' + Name +
-                      ')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
-                      ';)([ \t]*(?:' + XmlComment + '\s*)*\n?)?)')
+    reHeader = re.compile(u'^\ufeff')
+    reComment = re.compile('(\s*)(<!--(-?[%s])*?-->)(\s*)' % CharMinusDash,
+                           re.S)
+    rePE = re.compile(u'(?:(\s*)'
+                      u'(<!ENTITY\s+%\s+(' + Name +
+                      u')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
+                      u';)([ \t]*(?:' + XmlComment + u'\s*)*\n?)?)')
 
-    def getEntity(self, contents, offset):
+    def getEntity(self, ctx, offset):
         '''
         Overload Parser.getEntity to special-case ParsedEntities.
         Just check for a parsed entity if that method claims junk.
 
         <!ENTITY % foo SYSTEM "url">
         %foo;
         '''
-        entity, inneroffset = Parser.getEntity(self, contents, offset)
+        if offset is 0 and self.reHeader.match(ctx.contents):
+            offset += 1
+        entity, inneroffset = Parser.getEntity(self, ctx, offset)
         if (entity and isinstance(entity, Junk)) or entity is None:
-            m = self.rePE.match(contents, offset)
+            m = self.rePE.match(ctx.contents, offset)
             if m:
                 inneroffset = m.end()
-                entity = Entity(contents, self.postProcessValue,
-                                *[m.span(i) for i in xrange(7)])
+                self.last_comment = ''
+                entity = Entity(ctx, self.postProcessValue, '',
+                                *[m.span(i) for i in xrange(6)])
         return (entity, inneroffset)
 
-    def createEntity(self, contents, m):
+    def createEntity(self, ctx, m):
         valspan = m.span('val')
         valspan = (valspan[0]+1, valspan[1]-1)
-        return Entity(contents, self.postProcessValue, m.span(),
-                      m.span('pre'), m.span('precomment'),
+        pre_comment = unicode(self.last_comment) if self.last_comment else ''
+        self.last_comment = ''
+        return Entity(ctx, self.postProcessValue, pre_comment,
+                      m.span(),
+                      m.span('pre'),
                       m.span('entity'), m.span('key'), valspan,
                       m.span('post'))
 
 
 class PropertiesParser(Parser):
     escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
                         '(?P<nl>\n\s*)|(?P<single>.))', re.M)
     known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}
 
     def __init__(self):
         self.reKey = re.compile('^(\s*)'
-                                '((?:[#!].*?\n\s*)*)'
                                 '([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
-        self.reHeader = re.compile('^\s*([#!].*\s*)+')
-        self.reFooter = re.compile('\s*([#!].*\s*)*$')
+        self.reComment = re.compile('(\s*)(((?:[#!][^\n]*\n?)+))', re.M)
         self._escapedEnd = re.compile(r'\\+$')
-        self._trailingWS = re.compile(r'[ \t]*$')
+        self._trailingWS = re.compile(r'\s*(?:\n|\Z)', re.M)
         Parser.__init__(self)
 
-    def getHeader(self, contents, offset):
-        header = ''
-        h = self.reHeader.match(contents, offset)
-        if h:
-            candidate = h.group()
-            if 'http://mozilla.org/MPL/2.0/' in candidate or \
-                    'LICENSE BLOCK' in candidate:
-                header = candidate
-                offset = h.end()
-        return (header, offset)
-
-    def getEntity(self, contents, offset):
+    def getEntity(self, ctx, offset):
         # overwritten to parse values line by line
+        contents = ctx.contents
+        m = self.reComment.match(contents, offset)
+        if m:
+            spans = [m.span(i) for i in xrange(3)]
+            start_trailing = offset = m.end()
+            while offset < len(contents):
+                m = self._trailingWS.match(contents, offset)
+                if not m:
+                    break
+                offset = m.end()
+            spans.append((start_trailing, offset))
+            self.last_comment = Comment(ctx, *spans)
+            return (self.last_comment, offset)
         m = self.reKey.match(contents, offset)
         if m:
-            offset = m.end()
+            startline = offset = m.end()
             while True:
                 endval = nextline = contents.find('\n', offset)
                 if nextline == -1:
                     endval = offset = len(contents)
                     break
                 # is newline escaped?
                 _e = self._escapedEnd.search(contents, offset, nextline)
                 offset = nextline + 1
                 if _e is None:
                     break
                 # backslashes at end of line, if 2*n, not escaped
                 if len(_e.group()) % 2 == 0:
                     break
+                startline = offset
             # strip trailing whitespace
-            ws = self._trailingWS.search(contents, m.end(), offset)
+            ws = self._trailingWS.search(contents, startline)
             if ws:
-                endval -= ws.end() - ws.start()
-            entity = Entity(contents, self.postProcessValue,
+                endval = ws.start()
+                offset = ws.end()
+            pre_comment = (unicode(self.last_comment) if self.last_comment
+                           else '')
+            self.last_comment = ''
+            entity = Entity(ctx, self.postProcessValue, pre_comment,
                             (m.start(), offset),   # full span
                             m.span(1),  # leading whitespan
-                            m.span(2),  # leading comment span
-                            (m.start(3), offset),   # entity def span
-                            m.span(3),   # key span
+                            (m.start(2), offset),   # entity def span
+                            m.span(2),   # key span
                             (m.end(), endval),   # value span
                             (offset, offset))  # post comment span, empty
             return (entity, offset)
-        m = self.reKey.search(contents, offset)
-        if m:
-            # we didn't match, but search, so there's junk between offset
-            # and start. We'll match() on the next turn
-            junkend = m.start()
-            return (Junk(contents, (offset, junkend)), junkend)
-        return (None, offset)
+        return self.getTrailing(ctx, offset, self.reKey, self.reComment)
 
     def postProcessValue(self, val):
 
         def unescape(m):
             found = m.groupdict()
             if found['uni']:
                 return unichr(int(found['uni'][1:], 16))
             if found['nl']:
                 return ''
             return self.known_escapes.get(found['single'], found['single'])
         val = self.escape.sub(unescape, val)
         return val
 
 
+class DefinesInstruction(EntityBase):
+    '''Entity-like object representing processing instructions in inc files
+    '''
+    def __init__(self, ctx, span, pre_ws_span, def_span, val_span, post_span):
+        self.ctx = ctx
+        self.span = span
+        self.pre_ws_span = pre_ws_span
+        self.def_span = def_span
+        self.key_span = self.val_span = val_span
+        self.post_span = post_span
+        self.pp = lambda v: v
+
+    def __repr__(self):
+        return self.raw_val
+
+
 class DefinesParser(Parser):
     # can't merge, #unfilter needs to be the last item, which we don't support
     canMerge = False
+    tail = re.compile(r'(?!)')  # never match
 
     def __init__(self):
-        self.reKey = re.compile('^(\s*)((?:^#(?!define\s).*\s*)*)'
-                                '(#define[ \t]+(\w+)[ \t]+(.*?))([ \t]*$\n?)',
+        self.reComment = re.compile(
+            '((?:[ \t]*\n)*)'
+            '((?:^# .*?(?:\n|\Z))+)'
+            '((?:[ \t]*(?:\n|\Z))*)', re.M)
+        self.reKey = re.compile('((?:[ \t]*\n)*)'
+                                '(#define[ \t]+(\w+)(?:[ \t](.*?))?(?:\n|\Z))'
+                                '((?:[ \t]*(?:\n|\Z))*)',
                                 re.M)
-        self.reHeader = re.compile('^\s*(#(?!define\s).*\s*)*')
-        self.reFooter = re.compile('\s*(#(?!define\s).*\s*)*$', re.M)
+        self.rePI = re.compile('((?:[ \t]*\n)*)'
+                               '(#(\w+)[ \t]+(.*?)(?:\n|\Z))'
+                               '((?:[ \t]*(?:\n|\Z))*)',
+                               re.M)
         Parser.__init__(self)
 
+    def getEntity(self, ctx, offset):
+        contents = ctx.contents
+        m = self.reComment.match(contents, offset)
+        if m:
+            offset = m.end()
+            self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(4)])
+            return (self.last_comment, offset)
+        m = self.reKey.match(contents, offset)
+        if m:
+            offset = m.end()
+            return (self.createEntity(ctx, m), offset)
+        m = self.rePI.match(contents, offset)
+        if m:
+            offset = m.end()
+            return (DefinesInstruction(ctx, *[m.span(i) for i in xrange(5)]),
+                    offset)
+        return self.getTrailing(ctx, offset,
+                                self.reComment, self.reKey, self.rePI)
+
+
+class IniSection(EntityBase):
+    '''Entity-like object representing sections in ini files
+    '''
+    def __init__(self, ctx, span, pre_ws_span, def_span, val_span, post_span):
+        self.ctx = ctx
+        self.span = span
+        self.pre_ws_span = pre_ws_span
+        self.def_span = def_span
+        self.key_span = self.val_span = val_span
+        self.post_span = post_span
+        self.pp = lambda v: v
+
+    def __repr__(self):
+        return self.raw_val
+
 
 class IniParser(Parser):
     '''
     Parse files of the form:
     # initial comment
     [cat]
     whitespace*
     #comment
     string=value
     ...
     '''
     def __init__(self):
-        self.reHeader = re.compile('^((?:\s*|[;#].*)\n)*\[.+?\]\n', re.M)
-        self.reKey = re.compile('(\s*)((?:[;#].*\n\s*)*)((.+?)=(.*))(\n?)')
-        self.reFooter = re.compile('\s*([;#].*\s*)*$')
+        self.reComment = re.compile(
+            '((?:[ \t]*\n)*)'
+            '((?:^[;#].*?(?:\n|\Z))+)'
+            '((?:[ \t]*(?:\n|\Z))*)', re.M)
+        self.reSection = re.compile(
+            '((?:[ \t]*\n)*)'
+            '(\[(.*?)\])'
+            '((?:[ \t]*(?:\n|\Z))*)', re.M)
+        self.reKey = re.compile(
+            '((?:[ \t]*\n)*)'
+            '((.+?)=(.*))'
+            '((?:[ \t]*(?:\n|\Z))*)', re.M)
         Parser.__init__(self)
 
-
-DECL, COMMENT, START, END, CONTENT = range(5)
-
-
-class BookmarksParserInner(HTMLParser):
-
-    class Token(object):
-        _type = None
-        content = ''
-
-        def __str__(self):
-            return self.content
-
-    class DeclToken(Token):
-        _type = DECL
-
-        def __init__(self, decl):
-            self.content = decl
-            pass
-
-        def __str__(self):
-            return '<!%s>' % self.content
-        pass
-
-    class CommentToken(Token):
-        _type = COMMENT
-
-        def __init__(self, comment):
-            self.content = comment
-            pass
-
-        def __str__(self):
-            return '<!--%s-->' % self.content
-        pass
-
-    class StartToken(Token):
-        _type = START
-
-        def __init__(self, tag, attrs, content):
-            self.tag = tag
-            self.attrs = dict(attrs)
-            self.content = content
-            pass
-        pass
-
-    class EndToken(Token):
-        _type = END
-
-        def __init__(self, tag):
-            self.tag = tag
-            pass
-
-        def __str__(self):
-            return '</%s>' % self.tag.upper()
-        pass
-
-    class ContentToken(Token):
-        _type = CONTENT
-
-        def __init__(self, content):
-            self.content = content
-            pass
-        pass
-
-    def __init__(self):
-        HTMLParser.__init__(self)
-        self.tokens = []
-
-    def parse(self, contents):
-        self.tokens = []
-        self.feed(contents)
-        self.close()
-        return self.tokens
-
-    # Called when we hit an end DL tag to reset the folder selections
-    def handle_decl(self, decl):
-        self.tokens.append(self.DeclToken(decl))
-
-    # Called when we hit an end DL tag to reset the folder selections
-    def handle_comment(self, comment):
-        self.tokens.append(self.CommentToken(comment))
-
-    def handle_starttag(self, tag, attrs):
-        self.tokens.append(self.StartToken(tag, attrs,
-                                           self.get_starttag_text()))
-
-    # Called when text data is encountered
-    def handle_data(self, data):
-        if self.tokens[-1]._type == CONTENT:
-            self.tokens[-1].content += data
-        else:
-            self.tokens.append(self.ContentToken(data))
-
-    def handle_charref(self, data):
-        self.handle_data('&#%s;' % data)
-
-    def handle_entityref(self, data):
-        self.handle_data('&%s;' % data)
-
-    # Called when we hit an end DL tag to reset the folder selections
-    def handle_endtag(self, tag):
-        self.tokens.append(self.EndToken(tag))
-
-
-class BookmarksParser(Parser):
-    canMerge = False
-
-    class BMEntity(object):
-        def __init__(self, key, val):
-            self.key = key
-            self.val = val
-
-    def __iter__(self):
-        p = BookmarksParserInner()
-        tks = p.parse(self.contents)
-        i = 0
-        k = []
-        for i in xrange(len(tks)):
-            t = tks[i]
-            if t._type == START:
-                k.append(t.tag)
-                keys = t.attrs.keys()
-                keys.sort()
-                for attrname in keys:
-                    yield self.BMEntity('.'.join(k) + '.@' + attrname,
-                                        t.attrs[attrname])
-                if i + 1 < len(tks) and tks[i+1]._type == CONTENT:
-                    i += 1
-                    t = tks[i]
-                    v = t.content.strip()
-                    if v:
-                        yield self.BMEntity('.'.join(k), v)
-            elif t._type == END:
-                k.pop()
+    def getEntity(self, ctx, offset):
+        contents = ctx.contents
+        m = self.reComment.match(contents, offset)
+        if m:
+            offset = m.end()
+            self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(4)])
+            return (self.last_comment, offset)
+        m = self.reSection.match(contents, offset)
+        if m:
+            offset = m.end()
+            return (IniSection(ctx, *[m.span(i) for i in xrange(5)]), offset)
+        m = self.reKey.match(contents, offset)
+        if m:
+            offset = m.end()
+            return (self.createEntity(ctx, m), offset)
+        return self.getTrailing(ctx, offset,
+                                self.reComment, self.reSection, self.reKey)
 
 
 __constructors = [('\\.dtd$', DTDParser()),
                   ('\\.properties$', PropertiesParser()),
                   ('\\.ini$', IniParser()),
-                  ('\\.inc$', DefinesParser()),
-                  ('bookmarks\\.html$', BookmarksParser())]
+                  ('\\.inc$', DefinesParser())]
--- a/python/compare-locales/compare_locales/tests/__init__.py
+++ b/python/compare-locales/compare_locales/tests/__init__.py
@@ -4,28 +4,28 @@
 
 '''Mixins for parser tests.
 '''
 
 from itertools import izip_longest
 from pkg_resources import resource_string
 import re
 
-from compare_locales.parser import getParser
+from compare_locales import parser
 
 
 class ParserTestMixin():
     '''Utility methods used by the parser tests.
     '''
     filename = None
 
     def setUp(self):
         '''Create a parser for this test.
         '''
-        self.parser = getParser(self.filename)
+        self.parser = parser.getParser(self.filename)
 
     def tearDown(self):
         'tear down this test'
         del self.parser
 
     def resource(self, name):
         testcontent = resource_string(__name__, 'data/' + name)
         # fake universal line endings
@@ -33,17 +33,18 @@ class ParserTestMixin():
         return testcontent
 
     def _test(self, content, refs):
         '''Helper to test the parser.
         Compares the result of parsing content with the given list
         of reference keys and values.
         '''
         self.parser.readContents(content)
-        entities = [entity for entity in self.parser]
+        entities = list(self.parser.walk())
         for entity, ref in izip_longest(entities, refs):
-            self.assertTrue(entity, 'excess reference entity')
-            self.assertTrue(ref, 'excess parsed entity')
-            self.assertEqual(entity.val, ref[1])
-            if ref[0].startswith('_junk'):
-                self.assertTrue(re.match(ref[0], entity.key))
+            self.assertTrue(entity, 'excess reference entity ' + unicode(ref))
+            self.assertTrue(ref, 'excess parsed entity ' + unicode(entity))
+            if isinstance(entity, parser.Entity):
+                self.assertEqual(entity.key, ref[0])
+                self.assertEqual(entity.val, ref[1])
             else:
-                self.assertEqual(entity.key, ref[0])
+                self.assertEqual(type(entity).__name__, ref[0])
+                self.assertIn(ref[1], entity.all)
--- a/python/compare-locales/compare_locales/tests/test_checks.py
+++ b/python/compare-locales/compare_locales/tests/test_checks.py
@@ -1,17 +1,17 @@
 # -*- coding: utf-8 -*-
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import unittest
 
 from compare_locales.checks import getChecker
-from compare_locales.parser import getParser, Entity
+from compare_locales.parser import getParser, Parser, Entity
 from compare_locales.paths import File
 
 
 class BaseHelper(unittest.TestCase):
     file = None
     refContent = None
 
     def setUp(self):
@@ -234,24 +234,26 @@ class TestAndroid(unittest.TestCase):
     we're passing in a DTD file in the embedding/android module.
     """
     apos_msg = u"Apostrophes in Android DTDs need escaping with \\' or " + \
                u"\\u0027, or use \u2019, or put string in quotes."
     quot_msg = u"Quotes in Android DTDs need escaping with \\\" or " + \
                u"\\u0022, or put string in apostrophes."
 
     def getEntity(self, v):
-        return Entity(v, lambda s: s, (0, len(v)), (), (0, 0), (), (),
+        ctx = Parser.Context(v)
+        return Entity(ctx, lambda s: s, '', (0, len(v)), (), (), (),
                       (0, len(v)), ())
 
     def getDTDEntity(self, v):
         v = v.replace('"', '&quot;')
-        return Entity('<!ENTITY foo "%s">' % v,
-                      lambda s: s,
-                      (0, len(v) + 16), (), (0, 0), (), (9, 12),
+        ctx = Parser.Context('<!ENTITY foo "%s">' % v)
+        return Entity(ctx,
+                      lambda s: s, '',
+                      (0, len(v) + 16), (), (), (9, 12),
                       (14, len(v) + 14), ())
 
     def test_android_dtd(self):
         """Testing the actual android checks. The logic is involved,
         so this is a lot of nitty gritty detail tests.
         """
         f = File("embedding/android/strings.dtd", "strings.dtd",
                  "embedding/android")
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_defines.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales.tests import ParserTestMixin
+
+
+mpl2 = '''\
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+'''
+
+
+class TestDefinesParser(ParserTestMixin, unittest.TestCase):
+
+    filename = 'defines.inc'
+
+    def testBrowser(self):
+        self._test(mpl2 + '''#filter emptyLines
+
+#define MOZ_LANGPACK_CREATOR mozilla.org
+
+# If non-English locales wish to credit multiple contributors, uncomment this
+# variable definition and use the format specified.
+# #define MOZ_LANGPACK_CONTRIBUTORS <em:contributor>Joe Solon</em:contributor>
+
+#unfilter emptyLines
+
+''', (
+            ('Comment', mpl2),
+            ('DefinesInstruction', 'filter emptyLines'),
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
+            ('Comment', '#define'),
+            ('DefinesInstruction', 'unfilter emptyLines')))
+
+    def testBrowserWithContributors(self):
+        self._test(mpl2 + '''#filter emptyLines
+
+#define MOZ_LANGPACK_CREATOR mozilla.org
+
+# If non-English locales wish to credit multiple contributors, uncomment this
+# variable definition and use the format specified.
+#define MOZ_LANGPACK_CONTRIBUTORS <em:contributor>Joe Solon</em:contributor>
+
+#unfilter emptyLines
+
+''', (
+            ('Comment', mpl2),
+            ('DefinesInstruction', 'filter emptyLines'),
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
+            ('Comment', 'non-English'),
+            ('MOZ_LANGPACK_CONTRIBUTORS',
+             '<em:contributor>Joe Solon</em:contributor>'),
+            ('DefinesInstruction', 'unfilter emptyLines')))
+
+    def testCommentWithNonAsciiCharacters(self):
+        self._test(mpl2 + '''#filter emptyLines
+
+# e.g. #define seamonkey_l10n <DT><A HREF="http://www.mozilla.cz/produkty/seamonkey/">SeaMonkey v češtině</a>
+#define seamonkey_l10n_long
+
+#unfilter emptyLines
+
+''', (
+            ('Comment', mpl2),
+            ('DefinesInstruction', 'filter emptyLines'),
+            ('Comment', u'češtině'),
+            ('seamonkey_l10n_long', ''),
+            ('DefinesInstruction', 'unfilter emptyLines')))
+
+    def testToolkit(self):
+        self._test('''#define MOZ_LANG_TITLE English (US)
+''', (
+            ('MOZ_LANG_TITLE', 'English (US)'),))
+
+    def testToolkitEmpty(self):
+        self._test('', tuple())
+
+    def test_empty_file(self):
+        '''Test that empty files generate errors
+
+        defines.inc are interesting that way, as their
+        content is added to the generated file.
+        '''
+        self._test('\n', (('Junk', '\n'),))
+        self._test('\n\n', (('Junk', '\n\n'),))
+        self._test(' \n\n', (('Junk', ' \n\n'),))
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/compare-locales/compare_locales/tests/test_dtd.py
+++ b/python/compare-locales/compare_locales/tests/test_dtd.py
@@ -3,17 +3,17 @@
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 '''Tests for the DTD parser.
 '''
 
 import unittest
 import re
 
-from compare_locales.parser import getParser
+from compare_locales import parser
 from compare_locales.tests import ParserTestMixin
 
 
 class TestDTD(ParserTestMixin, unittest.TestCase):
     '''Tests for the DTD Parser.'''
     filename = 'foo.dtd'
 
     def test_one_entity(self):
@@ -25,19 +25,19 @@ class TestDTD(ParserTestMixin, unittest.
 <!ENTITY good.two "two">
 <!ENTITY bad.two "bad "quoted" word">
 <!ENTITY good.three "three">
 <!ENTITY good.four "good ' quote">
 <!ENTITY good.five "good 'quoted' word">
 '''
     quoteRef = (
         ('good.one', 'one'),
-        ('_junk_\\d_25-56$', '<!ENTITY bad.one "bad " quote">'),
+        ('Junk', '<!ENTITY bad.one "bad " quote">'),
         ('good.two', 'two'),
-        ('_junk_\\d_82-119$', '<!ENTITY bad.two "bad "quoted" word">'),
+        ('Junk', '<!ENTITY bad.two "bad "quoted" word">'),
         ('good.three', 'three'),
         ('good.four', 'good \' quote'),
         ('good.five', 'good \'quoted\' word'),)
 
     def test_quotes(self):
         self._test(self.quoteContent, self.quoteRef)
 
     def test_apos(self):
@@ -57,30 +57,73 @@ class TestDTD(ParserTestMixin, unittest.
 
     def test_trailing_comment(self):
         self._test('''<!ENTITY first "string">
 <!ENTITY second "string">
 <!--
 <!ENTITY commented "out">
 -->
 ''',
-                   (('first', 'string'), ('second', 'string')))
+                   (('first', 'string'), ('second', 'string'),
+                    ('Comment', 'out')))
 
     def test_license_header(self):
-        p = getParser('foo.dtd')
+        p = parser.getParser('foo.dtd')
         p.readContents(self.resource('triple-license.dtd'))
-        for e in p:
-            self.assertEqual(e.key, 'foo')
-            self.assertEqual(e.val, 'value')
-        self.assert_('MPL' in p.header)
+        entities = list(p.walk())
+        self.assert_(isinstance(entities[0], parser.Comment))
+        self.assertIn('MPL', entities[0].all)
+        e = entities[1]
+        self.assert_(isinstance(e, parser.Entity))
+        self.assertEqual(e.key, 'foo')
+        self.assertEqual(e.val, 'value')
+        self.assertEqual(len(entities), 2)
         p.readContents('''\
 <!-- This Source Code Form is subject to the terms of the Mozilla Public
    - License, v. 2.0. If a copy of the MPL was not distributed with this file,
    - You can obtain one at http://mozilla.org/MPL/2.0/.  -->
 <!ENTITY foo "value">
 ''')
-        for e in p:
-            self.assertEqual(e.key, 'foo')
-            self.assertEqual(e.val, 'value')
-        self.assert_('MPL' in p.header)
+        entities = list(p.walk())
+        self.assert_(isinstance(entities[0], parser.Comment))
+        self.assertIn('MPL', entities[0].all)
+        e = entities[1]
+        self.assert_(isinstance(e, parser.Entity))
+        self.assertEqual(e.key, 'foo')
+        self.assertEqual(e.val, 'value')
+        self.assertEqual(len(entities), 2)
+
+    def testBOM(self):
+        self._test(u'\ufeff<!ENTITY foo.label "stuff">'.encode('utf-8'),
+                   (('foo.label', 'stuff'),))
+
+    def test_trailing_whitespace(self):
+        self._test('<!ENTITY foo.label "stuff">\n  \n',
+                   (('foo.label', 'stuff'),))
+
+    def test_unicode_comment(self):
+        self._test('<!-- \xe5\x8f\x96 -->',
+                   (('Comment', u'\u53d6'),))
+
+    def test_empty_file(self):
+        self._test('', tuple())
+        self._test('\n', (('Whitespace', '\n'),))
+        self._test('\n\n', (('Whitespace', '\n\n'),))
+        self._test(' \n\n', (('Whitespace', ' \n\n'),))
+
+    def test_positions(self):
+        self.parser.readContents('''\
+<!ENTITY one  "value">
+<!ENTITY  two "other
+escaped value">
+''')
+        one, two = list(self.parser)
+        self.assertEqual(one.position(), (1, 1))
+        self.assertEqual(one.value_position(), (1, 16))
+        self.assertEqual(one.position(-1), (2, 1))
+        self.assertEqual(two.position(), (2, 1))
+        self.assertEqual(two.value_position(), (2, 16))
+        self.assertEqual(two.value_position(-1), (3, 14))
+        self.assertEqual(two.value_position(10), (3, 5))
+
 
 if __name__ == '__main__':
     unittest.main()
--- a/python/compare-locales/compare_locales/tests/test_ini.py
+++ b/python/compare-locales/compare_locales/tests/test_ini.py
@@ -18,98 +18,122 @@ mpl2 = '''\
 class TestIniParser(ParserTestMixin, unittest.TestCase):
 
     filename = 'foo.ini'
 
     def testSimpleHeader(self):
         self._test('''; This file is in the UTF-8 encoding
 [Strings]
 TitleText=Some Title
-''', (('TitleText', 'Some Title'),))
-        self.assert_('UTF-8' in self.parser.header)
+''', (
+            ('Comment', 'UTF-8 encoding'),
+            ('IniSection', 'Strings'),
+            ('TitleText', 'Some Title'),))
 
     def testMPL2_Space_UTF(self):
         self._test(mpl2 + '''
 ; This file is in the UTF-8 encoding
 [Strings]
 TitleText=Some Title
-''', (('TitleText', 'Some Title'),))
-        self.assert_('MPL' in self.parser.header)
+''', (
+            ('Comment', mpl2),
+            ('Comment', 'UTF-8'),
+            ('IniSection', 'Strings'),
+            ('TitleText', 'Some Title'),))
 
     def testMPL2_Space(self):
         self._test(mpl2 + '''
 [Strings]
 TitleText=Some Title
-''', (('TitleText', 'Some Title'),))
-        self.assert_('MPL' in self.parser.header)
+''', (
+            ('Comment', mpl2),
+            ('IniSection', 'Strings'),
+            ('TitleText', 'Some Title'),))
 
     def testMPL2_MultiSpace(self):
         self._test(mpl2 + '''\
 
 ; more comments
 
 [Strings]
 TitleText=Some Title
-''', (('TitleText', 'Some Title'),))
-        self.assert_('MPL' in self.parser.header)
+''', (
+            ('Comment', mpl2),
+            ('Comment', 'more comments'),
+            ('IniSection', 'Strings'),
+            ('TitleText', 'Some Title'),))
 
     def testMPL2_JunkBeforeCategory(self):
         self._test(mpl2 + '''\
 Junk
 [Strings]
 TitleText=Some Title
-''', (('_junk_\\d+_0-213$', mpl2 + '''\
-Junk
-[Strings]'''), ('TitleText', 'Some Title')))
-        self.assert_('MPL' not in self.parser.header)
+''', (
+            ('Comment', mpl2),
+            ('Junk', 'Junk'),
+            ('IniSection', 'Strings'),
+            ('TitleText', 'Some Title')))
 
     def test_TrailingComment(self):
         self._test(mpl2 + '''
 [Strings]
 TitleText=Some Title
 ;Stray trailing comment
-''', (('TitleText', 'Some Title'),))
-        self.assert_('MPL' in self.parser.header)
+''', (
+            ('Comment', mpl2),
+            ('IniSection', 'Strings'),
+            ('TitleText', 'Some Title'),
+            ('Comment', 'Stray trailing')))
 
     def test_SpacedTrailingComments(self):
         self._test(mpl2 + '''
 [Strings]
 TitleText=Some Title
 
 ;Stray trailing comment
 ;Second stray comment
 
-''', (('TitleText', 'Some Title'),))
-        self.assert_('MPL' in self.parser.header)
+''', (
+            ('Comment', mpl2),
+            ('IniSection', 'Strings'),
+            ('TitleText', 'Some Title'),
+            ('Comment', 'Second stray comment')))
 
     def test_TrailingCommentsAndJunk(self):
         self._test(mpl2 + '''
 [Strings]
 TitleText=Some Title
 
 ;Stray trailing comment
 Junk
 ;Second stray comment
 
-''', (('TitleText', 'Some Title'), ('_junk_\\d+_231-284$', '''\
-
-;Stray trailing comment
-Junk
-;Second stray comment
-
-''')))
-        self.assert_('MPL' in self.parser.header)
+''', (
+            ('Comment', mpl2),
+            ('IniSection', 'Strings'),
+            ('TitleText', 'Some Title'),
+            ('Comment', 'Stray trailing'),
+            ('Junk', 'Junk'),
+            ('Comment', 'Second stray comment')))
 
     def test_JunkInbetweenEntries(self):
         self._test(mpl2 + '''
 [Strings]
 TitleText=Some Title
 
 Junk
 
 Good=other string
-''', (('TitleText', 'Some Title'), ('_junk_\\d+_231-236$', '''\
+''', (
+            ('Comment', mpl2),
+            ('IniSection', 'Strings'),
+            ('TitleText', 'Some Title'),
+            ('Junk', 'Junk'),
+            ('Good', 'other string')))
 
-Junk'''), ('Good', 'other string')))
-        self.assert_('MPL' in self.parser.header)
+    def test_empty_file(self):
+        self._test('', tuple())
+        self._test('\n', (('Whitespace', '\n'),))
+        self._test('\n\n', (('Whitespace', '\n\n'),))
+        self._test(' \n\n', (('Whitespace', ' \n\n'),))
 
 if __name__ == '__main__':
     unittest.main()
--- a/python/compare-locales/compare_locales/tests/test_merge.py
+++ b/python/compare-locales/compare_locales/tests/test_merge.py
@@ -8,32 +8,32 @@ from tempfile import mkdtemp
 import shutil
 
 from compare_locales.parser import getParser
 from compare_locales.paths import File
 from compare_locales.compare import ContentComparer
 
 
 class ContentMixin(object):
-    maxDiff = None  # we got big dictionaries to compare
     extension = None  # OVERLOAD
 
     def reference(self, content):
         self.ref = os.path.join(self.tmp, "en-reference" + self.extension)
         open(self.ref, "w").write(content)
 
     def localized(self, content):
         self.l10n = os.path.join(self.tmp, "l10n" + self.extension)
         open(self.l10n, "w").write(content)
 
 
 class TestProperties(unittest.TestCase, ContentMixin):
     extension = '.properties'
 
     def setUp(self):
+        self.maxDiff = None
         self.tmp = mkdtemp()
         os.mkdir(os.path.join(self.tmp, "merge"))
 
     def tearDown(self):
         shutil.rmtree(self.tmp)
         del self.tmp
 
     def testGood(self):
@@ -93,17 +93,18 @@ eff = effVal""")
         [m, n] = p.parse()
         self.assertEqual(map(lambda e: e.key,  m), ["bar", "eff", "foo"])
 
     def testError(self):
         self.assertTrue(os.path.isdir(self.tmp))
         self.reference("""foo = fooVal
 bar = %d barVal
 eff = effVal""")
-        self.localized("""bar = %S lBar
+        self.localized("""\
+bar = %S lBar
 eff = leffVal
 """)
         cc = ContentComparer()
         cc.set_merge_stage(os.path.join(self.tmp, "merge"))
         cc.compare(File(self.ref, "en-reference.properties", ""),
                    File(self.l10n, "l10n.properties", ""))
         self.assertDictEqual(
             cc.observer.toJSON(),
@@ -111,17 +112,17 @@ eff = leffVal
                 {None: {
                     'changed': 2, 'errors': 1, 'missing': 1
                 }},
              'details': {
                  'children': [
                      ('l10n.properties',
                          {'value': {
                           'error': [u'argument 1 `S` should be `d` '
-                                    u'at line 1, column 6 for bar'],
+                                    u'at line 1, column 7 for bar'],
                           'missingEntity': [u'foo']}}
                       )
                  ]}
              }
         )
         mergefile = os.path.join(self.tmp, "merge", "l10n.properties")
         self.assertTrue(os.path.isfile(mergefile))
         p = getParser(mergefile)
@@ -155,16 +156,17 @@ eff = leffVal
              }
         )
 
 
 class TestDTD(unittest.TestCase, ContentMixin):
     extension = '.dtd'
 
     def setUp(self):
+        self.maxDiff = None
         self.tmp = mkdtemp()
         os.mkdir(os.path.join(self.tmp, "merge"))
 
     def tearDown(self):
         shutil.rmtree(self.tmp)
         del self.tmp
 
     def testGood(self):
@@ -243,17 +245,19 @@ class TestDTD(unittest.TestCase, Content
                 {None: {
                     'errors': 1, 'missing': 1, 'unchanged': 2
                 }},
              'details': {
                  'children': [
                      ('l10n.dtd',
                          {'value': {
                              'error': [u'Unparsed content "<!ENTY bar '
-                                       u'\'gimmick\'>" at 23-44'],
+                                       u'\'gimmick\'>" '
+                                       u'from line 2 colum 1 to '
+                                       u'line 2 column 22'],
                              'missingEntity': [u'bar']}}
                       )
                  ]}
              }
         )
         mergefile = os.path.join(self.tmp, "merge", "l10n.dtd")
         self.assertTrue(os.path.isfile(mergefile))
         p = getParser(mergefile)
new file mode 100644
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_parser.py
@@ -0,0 +1,44 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales import parser
+
+
+class TestParserContext(unittest.TestCase):
+    def test_lines(self):
+        "Test that Parser.Context.lines returns 1-based tuples"
+        ctx = parser.Parser.Context('''first line
+second line
+third line
+''')
+        self.assertEqual(
+            ctx.lines(0, 1),
+            [(1, 1), (1, 2)]
+        )
+        self.assertEqual(
+            ctx.lines(len('first line')),
+            [(1, len('first line') + 1)]
+        )
+        self.assertEqual(
+            ctx.lines(len('first line') + 1),
+            [(2, 1)]
+        )
+        self.assertEqual(
+            ctx.lines(len(ctx.contents)),
+            [(4, 1)]
+        )
+
+    def test_empty_parser(self):
+        p = parser.Parser()
+        entities, _map = p.parse()
+        self.assertListEqual(
+            entities,
+            []
+        )
+        self.assertDictEqual(
+            _map,
+            {}
+        )
--- a/python/compare-locales/compare_locales/tests/test_properties.py
+++ b/python/compare-locales/compare_locales/tests/test_properties.py
@@ -19,17 +19,17 @@ of two lines
 one_line_trailing = This line ends in \\
 and has junk
 two_lines_triple = This line is one of two and ends in \\\
 and still has another line coming
 ''', (
             ('one_line', 'This is one line'),
             ('two_line', u'This is the first of two lines'),
             ('one_line_trailing', u'This line ends in \\'),
-            ('_junk_\\d+_113-126$', 'and has junk\n'),
+            ('Junk', 'and has junk\n'),
             ('two_lines_triple', 'This line is one of two and ends in \\'
              'and still has another line coming')))
 
     def testProperties(self):
         # port of netwerk/test/PropertiesTest.cpp
         self.parser.readContents(self.resource('test.properties'))
         ref = ['1', '2', '3', '4', '5', '6', '7', '8',
                'this is the first part of a continued line '
@@ -58,18 +58,17 @@ and an end''', (('bar', 'one line with a
 
     def test_license_header(self):
         self._test('''\
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 foo=value
-''', (('foo', 'value'),))
-        self.assert_('MPL' in self.parser.header)
+''', (('Comment', 'MPL'), ('foo', 'value')))
 
     def test_escapes(self):
         self.parser.readContents(r'''
 # unicode escapes
 zero = some \unicode
 one = \u0
 two = \u41
 three = \u042
@@ -83,13 +82,69 @@ seven = \n\r\t\\
             self.assertEqual(e.val, r)
 
     def test_trailing_comment(self):
         self._test('''first = string
 second = string
 
 #
 #commented out
-''', (('first', 'string'), ('second', 'string')))
+''', (('first', 'string'), ('second', 'string'),
+            ('Comment', 'commented out')))
+
+    def test_trailing_newlines(self):
+        self._test('''\
+foo = bar
+
+\x20\x20
+  ''', (('foo', 'bar'),))
+
+    def test_just_comments(self):
+        self._test('''\
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# LOCALIZATION NOTE These strings are used inside the Promise debugger
+# which is available as a panel in the Debugger.
+''', (('Comment', 'MPL'), ('Comment', 'LOCALIZATION NOTE')))
+
+    def test_just_comments_without_trailing_newline(self):
+        self._test('''\
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+# LOCALIZATION NOTE These strings are used inside the Promise debugger
+# which is available as a panel in the Debugger.''', (
+            ('Comment', 'MPL'), ('Comment', 'LOCALIZATION NOTE')))
+
+    def test_trailing_comment_and_newlines(self):
+        self._test('''\
+# LOCALIZATION NOTE These strings are used inside the Promise debugger
+# which is available as a panel in the Debugger.
+
+
+
+''',  (('Comment', 'LOCALIZATION NOTE'),))
+
+    def test_empty_file(self):
+        self._test('', tuple())
+        self._test('\n', (('Whitespace', '\n'),))
+        self._test('\n\n', (('Whitespace', '\n\n'),))
+        self._test(' \n\n', (('Whitespace', ' \n\n'),))
+
+    def test_positions(self):
+        self.parser.readContents('''\
+one = value
+two = other \\
+escaped value
+''')
+        one, two = list(self.parser)
+        self.assertEqual(one.position(), (1, 1))
+        self.assertEqual(one.value_position(), (1, 7))
+        self.assertEqual(two.position(), (2, 1))
+        self.assertEqual(two.value_position(), (2, 7))
+        self.assertEqual(two.value_position(-1), (3, 14))
+        self.assertEqual(two.value_position(10), (3, 3))
 
 if __name__ == '__main__':
     unittest.main()
--- a/python/compare-locales/compare_locales/webapps.py
+++ b/python/compare-locales/compare_locales/webapps.py
@@ -78,17 +78,17 @@ class Manifest(object):
         return self._strings
 
     def load_and_parse(self):
         try:
             manifest = json.load(open(self.file.fullpath))
         except (ValueError, IOError), e:
             if self.watcher:
                 self.watcher.notify('error', self.file, str(e))
-            return False
+            return {}
         return self.extract_manifest_strings(manifest)
 
     def extract_manifest_strings(self, manifest_fragment):
         '''Extract localizable strings from a manifest dict.
         This method is recursive, and returns a two-level dict,
         first level being locale codes, second level being generated
         key and localized value. Keys are generated by concatenating
         each level in the json with a ".".