Bug 1497898 - Update the gitignore implementation to work as an iterator filter, r=ato
☠☠ backed out by fe1c2bb6cfbc ☠ ☠
authorJames Graham <james@hoppipolla.co.uk>
Wed, 10 Oct 2018 17:51:21 +0000
changeset 500621 c3cb0408498cd890dba6038de0debf247ba03324
parent 500620 a6a89509add7962cae05fc9b7e504c42910a7147
child 500622 184bc31c33a6edba5ddb13d1daef81c39f87370f
push id1864
push userffxbld-merge
push dateMon, 03 Dec 2018 15:51:40 +0000
treeherdermozilla-release@f040763d99ad [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersato
bugs1497898
milestone64.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1497898 - Update the gitignore implementation to work as an iterator filter, r=ato This updates the gitignore implemenation to take input like os.walk but with additional stat data for the files. It also makes several useful optimistaions: * Avoid using regex when just matching a literal * Identify patterns that can only match the final component of a path and run those against that component rather than the full path. * Add the possibility of providing a dictionary of paths to gitignore statuses as a cache. This dramatically reduces the amount of time we spend in gitignore processing when updating the manifest. Depends on D8223 Differential Revision: https://phabricator.services.mozilla.com/D8224
testing/web-platform/tests/tools/gitignore/gitignore.py
testing/web-platform/tests/tools/gitignore/tests/test_gitignore.py
--- a/testing/web-platform/tests/tools/gitignore/gitignore.py
+++ b/testing/web-platform/tests/tools/gitignore/gitignore.py
@@ -1,158 +1,247 @@
 import re
 import os
+import itertools
+from six import itervalues, iteritems
+from collections import defaultdict
+
 
 end_space = re.compile(r"([^\\]\s)*$")
 
 
-def fnmatch_translate(pat, allow_component_only=True):
+def fnmatch_translate(pat):
     parts = []
-    seq = False
+    seq = None
     i = 0
-    component_pattern = False
+    any_char = "[^/]"
     if pat[0] == "/":
         parts.append("^")
-        any_char = "[^/]"
-        if pat[0] == "/":
-            pat = pat[1:]
+        pat = pat[1:]
     else:
-        any_char = "."
-        if allow_component_only and "/" not in pat:
-            component_pattern = True
-            parts.append("^")
-        else:
-            parts.append("^(?:.*/)?")
+        # By default match the entire path up to a /
+        # but if / doesn't appear in the pattern we will mark is as
+        # a name pattern and just produce a pattern that matches against
+        # the filename
+        parts.append("^(?:.*/)?")
+
+    name_pattern = True
     if pat[-1] == "/":
         # If the last character is / match this directory or any subdirectory
         pat = pat[:-1]
         suffix = "(?:/|$)"
     else:
         suffix = "$"
     while i < len(pat):
         c = pat[i]
         if c == "\\":
             if i < len(pat) - 1:
                 i += 1
                 c = pat[i]
                 parts.append(re.escape(c))
             else:
                 raise ValueError
-        elif seq:
+        elif seq is not None:
+            # TODO: this doesn't really handle invalid sequences in the right way
             if c == "]":
-                seq = False
-                # First two cases are to deal with the case where / is the only character
-                # in the sequence but path_name is True so it shouldn't match anything
+                seq = None
                 if parts[-1] == "[":
                     parts = parts[:-1]
                 elif parts[-1] == "^" and parts[-2] == "[":
                     parts = parts[:-2]
                 else:
                     parts.append(c)
             elif c == "-":
                 parts.append(c)
             else:
                 parts += re.escape(c)
         elif c == "[":
             parts.append("[")
             if i < len(pat) - 1 and pat[i+1] in ("!", "^"):
                 parts.append("^")
                 i += 1
-            seq = True
+            seq = i
         elif c == "*":
             if i < len(pat) - 1 and pat[i+1] == "*":
-                parts.append(any_char + "*")
+                if i > 0 and pat[i-1] != "/":
+                    raise ValueError
+                parts.append(".*")
                 i += 1
-                if i < len(pat) - 1 and pat[i+1] == "*":
+                if i < len(pat) - 1 and pat[i+1] != "/":
                     raise ValueError
             else:
                 parts.append(any_char + "*")
         elif c == "?":
             parts.append(any_char)
+        elif c == "/" and not seq:
+            name_pattern = False
+            parts.append(c)
         else:
             parts.append(re.escape(c))
         i += 1
 
-    if seq:
+    if name_pattern:
+        parts[0] = "^"
+
+    if seq is not None:
         raise ValueError
     parts.append(suffix)
     try:
-        return component_pattern, re.compile("".join(parts))
+        return name_pattern, re.compile("".join(parts))
     except Exception:
-        raise
+        raise ValueError
+
+# Regexp matching rules that have to be converted to patterns
+pattern_re = re.compile(r".*[\*\[\?]")
 
 
 def parse_line(line):
     line = line.rstrip()
     if not line or line[0] == "#":
         return
 
     invert = line[0] == "!"
     if invert:
         line = line[1:]
 
     dir_only = line[-1] == "/"
 
     if dir_only:
         line = line[:-1]
 
-    return invert, dir_only, fnmatch_translate(line, dir_only)
+    # Could make a special case for **/foo, but we don't have any patterns like that
+    if not invert and not pattern_re.match(line):
+        literal = True
+        pattern = tuple(line.rsplit("/", 1))
+    else:
+        pattern = fnmatch_translate(line)
+        literal = False
+
+    return invert, dir_only, literal, pattern
 
 
 class PathFilter(object):
-    def __init__(self, root, extras=None):
+    def __init__(self, root, extras=None, cache=None):
         if root:
             ignore_path = os.path.join(root, ".gitignore")
         else:
             ignore_path = None
         if not ignore_path and not extras:
             self.trivial = True
             return
         self.trivial = False
 
-        self.rules_file = []
-        self.rules_dir = []
+        self.literals_file = defaultdict(dict)
+        self.literals_dir = defaultdict(dict)
+        self.patterns_file = []
+        self.patterns_dir = []
+        self.cache = cache or {}
 
         if extras is None:
             extras = []
 
         if ignore_path and os.path.exists(ignore_path):
-            self._read_ignore(ignore_path)
-
-        for item in extras:
-            self._read_line(item)
+            args = ignore_path, extras
+        else:
+            args = None, extras
+        self._read_ignore(*args)
 
-    def _read_ignore(self, ignore_path):
-        with open(ignore_path) as f:
-            for line in f:
-                self._read_line(line)
+    def _read_ignore(self, ignore_path, extras):
+        if ignore_path is not None:
+            with open(ignore_path) as f:
+                for line in f:
+                    self._read_line(line)
+        for line in extras:
+            self._read_line(line)
 
     def _read_line(self, line):
         parsed = parse_line(line)
         if not parsed:
             return
-        invert, dir_only, regexp = parsed
-        if dir_only:
-            self.rules_dir.append((regexp, invert))
-        else:
-            self.rules_file.append((regexp, invert))
+        invert, dir_only, literal, rule = parsed
+
+        if invert:
+            # For exclude rules, we attach the rules to all preceeding patterns, so
+            # that we can match patterns out of order and check if they were later
+            # overriden by an exclude rule
+            assert not literal
+            if not dir_only:
+                rules_iter = itertools.chain(
+                    itertools.chain(*(iteritems(item) for item in itervalues(self.literals_dir))),
+                    itertools.chain(*(iteritems(item) for item in itervalues(self.literals_file))),
+                    self.patterns_dir,
+                    self.patterns_file)
+            else:
+                rules_iter = itertools.chain(
+                    itertools.chain(*(iteritems(item) for item in itervalues(self.literals_dir))),
+                    self.patterns_dir)
 
-    def __call__(self, path):
-        if os.path.sep != "/":
-            path = path.replace(os.path.sep, "/")
+            for rules in rules_iter:
+                rules[1].append(rule)
+        else:
+            if literal:
+                if len(rule) == 1:
+                    dir_name, pattern = None, rule[0]
+                else:
+                    dir_name, pattern = rule
+                self.literals_dir[dir_name][pattern] = []
+                if not dir_only:
+                    self.literals_file[dir_name][pattern] = []
+            else:
+                self.patterns_dir.append((rule, []))
+                if not dir_only:
+                    self.patterns_file.append((rule, []))
 
-        if self.trivial:
-            return True
+    def filter(self, iterator):
+        empty = {}
+        for dirpath, dirnames, filenames in iterator:
+            orig_dirpath = dirpath
+            if os.path.sep != "/":
+                dirpath = dirpath.replace(os.path.sep, "/")
+
+            keep_dirs = []
+            keep_files = []
 
-        path_is_dir = path[-1] == "/"
-        if path_is_dir:
-            path = path[:-1]
-            rules = self.rules_dir
-        else:
-            rules = self.rules_file
+            for iter_items, literals, patterns, target, suffix in [
+                    (dirnames, self.literals_dir, self.patterns_dir, keep_dirs, "/"),
+                    (filenames, self.literals_file, self.patterns_file, keep_files, "")]:
+                for item in iter_items:
+                    name = item[0]
+                    if dirpath:
+                        path = "%s/%s" % (dirpath, name) + suffix
+                    else:
+                        path = name + suffix
+                    if path in self.cache:
+                        if not self.cache[path]:
+                            target.append(item)
+                        continue
+                    for rule_dir in [None, dirpath]:
+                        if name in literals.get(rule_dir, empty):
+                            exclude = literals[rule_dir][name]
+                            if not any(rule.match(path) for rule in exclude):
+                                # Skip this item
+                                self.cache[path] = True
+                                break
+                    else:
+                        for (component_only, pattern), exclude in patterns:
+                            if component_only:
+                                match = pattern.match(name)
+                            else:
+                                match = pattern.match(path)
+                            if match:
+                                if not any(rule.match(name if name_only else path)
+                                           for name_only, rule in exclude):
+                                    # Skip this item
+                                    self.cache[path] = True
+                                    break
+                        else:
+                            self.cache[path] = False
+                            target.append(item)
 
-        include = True
-        for regexp, invert in rules:
-            if not include and invert and regexp.match(path):
-                include = True
-            elif include and not invert and regexp.match(path):
-                include = False
-        return include
+            dirnames[:] = keep_dirs
+            assert ".git" not in dirnames
+            yield orig_dirpath, dirnames, keep_files
+
+    def __call__(self, iterator):
+        if self.trivial:
+            return iterator
+
+        return self.filter(iterator)
--- a/testing/web-platform/tests/tools/gitignore/tests/test_gitignore.py
+++ b/testing/web-platform/tests/tools/gitignore/tests/test_gitignore.py
@@ -1,82 +1,100 @@
 import pytest
 
 from ..gitignore import fnmatch_translate, PathFilter
 
 match_data = [
-    ("foo", False, ["a/foo", "foo"]),
-    ("*.a", False, ["foo.a", "a/foo.a", "a/b/foo.a", "a.a/foo.a"]),
-    ("*.py[co]", False, ["a.pyc", "a.pyo", "a/b/c.pyc"]),
-    ("\\#*", False, ["#a", "a/#b"]),
-    ("*#", False, ["a#", "a/b#", "#a#"]),
-    ("/*.c", False, ["a.c", ".c"]),
+    ("foo", True, ["a/foo", "foo"]),
+    ("*.a", True, ["foo.a", "a/foo.a", "a/b/foo.a", "a.a/foo.a"]),
+    ("*.py[co]", True, ["a.pyc", "a.pyo", "a/b/c.pyc"]),
+    ("\\#*", True, ["#a", "a/#b"]),
+    ("*#", True, ["a#", "a/b#", "#a#"]),
+    ("/*.c", True, ["a.c", ".c"]),
     ("**/b", False, ["a/b", "a/c/b"]),
     ("*b", True, ["ab"]),
-    ("**/b", True, ["a/b"]),
-    ("a/", True, ["a", "a/b", "a/b/c"])
+    ("*b", True, ["a/b"]),
+    ("**/b", False, ["a/b"]),
+    ("a/", True, ["a"]),
+    ("a[/]b", True, []),
+    ("**/b", False, ["a/c/b"]),
+    ("a?c", True, ["abc"]),
+    ("a[^b]c", True, ["acc"]),
+    ("a[b-c]c", True, ["abc", "acc"]),
+    ("a[^]c", True, ["ac"]),  # This is probably wrong
+    ("a[^]c", True, ["ac"]),  # This is probably wrong
 ]
 
 mismatch_data = [
-    ("foo", False, ["foob", "afoo"]),
-    ("*.a", False, ["a", "foo:a", "a.a/foo"]),
-    ("*.py[co]", False, ["a.pyd", "pyo"]),
-    ("/*.c", False, ["a/b.c"]),
-    ("*b", True, ["a/b"]),
-    ("**b", True, ["a/b"]),
-    ("a[/]b", True, ["a/b"]),
-    ("**/b", True, ["a/c/b"]),
-    ("a", True, ["ab"])
+    ("foo", True, ["foob", "afoo"]),
+    ("*.a", True, ["a", "foo:a", "a.a/foo"]),
+    ("*.py[co]", True, ["a.pyd", "pyo", "a.py"]),
+    ("a", True, ["ab"]),
+    ("a?c", True, ["ac", "abbc"]),
+    ("a[^b]c", True, ["abc"]),
+    ("a[b-c]c", True, ["adc"]),
 ]
 
 invalid_data = [
     "[a",
     "***/foo",
     "a\\",
+    "**b",
+    "b**/",
+    "[[]"
 ]
 
 filter_data = [
-    ("foo", True),
-    ("a", False),
-    ("a/b", False),
-    ("a/c", True),
-    ("a/c/", False),
-    ("c/b", True)
+    (["foo", "bar/", "/a", "*.py"],
+     [("", ["foo", "bar", "baz"], ["a"]),
+      ("baz", ["a"], ["foo", "bar"])],
+     [(["baz"], []),
+      (["a"], ["bar"])]),
+    (["#foo", "", "a*", "!a.py"],
+     [("", ["foo"], ["a", "a.foo", "a.py"])],
+     [(["foo"], ["a.py"])]),
 ]
 
 
 def expand_data(compact_data):
-    for pattern, path_name, inputs in compact_data:
+    for pattern, name_only, inputs in compact_data:
         for input in inputs:
-            yield pattern, input, path_name
+            yield pattern, name_only, input
 
 
-@pytest.mark.parametrize("pattern, input, path_name", expand_data(match_data))
-def tests_match(pattern, input, path_name):
-    regexp = fnmatch_translate(pattern, path_name)
+@pytest.mark.parametrize("pattern, name_only, input", expand_data(match_data))
+def tests_match(pattern, name_only, input):
+    name_only_result, regexp = fnmatch_translate(pattern)
+    assert name_only_result == name_only
+    if name_only:
+        input = input.rsplit("/", 1)[-1]
     assert regexp.match(input) is not None
 
 
-@pytest.mark.parametrize("pattern, input, path_name", expand_data(mismatch_data))
-def tests_no_match(pattern, input, path_name):
-    regexp = fnmatch_translate(pattern, path_name)
+@pytest.mark.parametrize("pattern, name_only, input", expand_data(mismatch_data))
+def tests_no_match(pattern, name_only, input):
+    name_only_result, regexp = fnmatch_translate(pattern)
+    assert name_only_result == name_only
+    if name_only:
+        input = input.rsplit("/", 1)[-1]
     assert regexp.match(input) is None
 
 
 @pytest.mark.parametrize("pattern", invalid_data)
 def tests_invalid(pattern):
     with pytest.raises(ValueError):
-        fnmatch_translate(pattern, False)
-    with pytest.raises(ValueError):
-        fnmatch_translate(pattern, True)
+        fnmatch_translate(pattern)
 
 
-@pytest.mark.parametrize("path, expected", filter_data)
-def test_path_filter(path, expected):
-    extras = [
-        "#foo",
-        "a  ",
-        "**/b",
-        "a/c/",
-        "!c/b",
-    ]
-    f = PathFilter(None, extras)
-    assert f(path) == expected
+@pytest.mark.parametrize("rules, input, expected", filter_data)
+def test_path_filter(rules, input, expected):
+    f = PathFilter(None, rules)
+    # Add some fake stat data
+    for i, item in enumerate(input):
+        repl = [input[i][0]]
+        for j in [1, 2]:
+            repl.append([(name, None) for name in input[i][j]])
+        input[i] = tuple(repl)
+
+    for i, output in enumerate(f(input)):
+        assert output[0] == input[i][0]
+        for j in [1, 2]:
+            assert [item[0] for item in output[j]] == expected[i][j-1]