Bug 1497898 - Update the gitignore implementation to work as an iterator filter, r=ato
☠☠ backed out by fe1c2bb6cfbc ☠ ☠
authorJames Graham <james@hoppipolla.co.uk>
Wed, 10 Oct 2018 17:51:21 +0000
changeset 490477 c3cb0408498cd890dba6038de0debf247ba03324
parent 490476 a6a89509add7962cae05fc9b7e504c42910a7147
child 490478 184bc31c33a6edba5ddb13d1daef81c39f87370f
push id247
push userfmarier@mozilla.com
push dateSat, 27 Oct 2018 01:06:44 +0000
reviewersato
bugs1497898
milestone64.0a1
Bug 1497898 - Update the gitignore implementation to work as an iterator filter, r=ato This updates the gitignore implemenation to take input like os.walk but with additional stat data for the files. It also makes several useful optimistaions: * Avoid using regex when just matching a literal * Identify patterns that can only match the final component of a path and run those against that component rather than the full path. * Add the possibility of providing a dictionary of paths to gitignore statuses as a cache. This dramatically reduces the amount of time we spend in gitignore processing when updating the manifest. Depends on D8223 Differential Revision: https://phabricator.services.mozilla.com/D8224
testing/web-platform/tests/tools/gitignore/gitignore.py
testing/web-platform/tests/tools/gitignore/tests/test_gitignore.py
--- a/testing/web-platform/tests/tools/gitignore/gitignore.py
+++ b/testing/web-platform/tests/tools/gitignore/gitignore.py
@@ -1,158 +1,247 @@
 import re
 import os
+import itertools
+from six import itervalues, iteritems
+from collections import defaultdict
+
 
 end_space = re.compile(r"([^\\]\s)*$")
 
 
-def fnmatch_translate(pat, allow_component_only=True):
+def fnmatch_translate(pat):
     parts = []
-    seq = False
+    seq = None
     i = 0
-    component_pattern = False
+    any_char = "[^/]"
     if pat[0] == "/":
         parts.append("^")
-        any_char = "[^/]"
-        if pat[0] == "/":
-            pat = pat[1:]
+        pat = pat[1:]
     else:
-        any_char = "."
-        if allow_component_only and "/" not in pat:
-            component_pattern = True
-            parts.append("^")
-        else:
-            parts.append("^(?:.*/)?")
+        # By default match the entire path up to a /
+        # but if / doesn't appear in the pattern we will mark is as
+        # a name pattern and just produce a pattern that matches against
+        # the filename
+        parts.append("^(?:.*/)?")
+
+    name_pattern = True
     if pat[-1] == "/":
         # If the last character is / match this directory or any subdirectory
         pat = pat[:-1]
         suffix = "(?:/|$)"
     else:
         suffix = "$"
     while i < len(pat):
         c = pat[i]
         if c == "\\":
             if i < len(pat) - 1:
                 i += 1
                 c = pat[i]
                 parts.append(re.escape(c))
             else:
                 raise ValueError
-        elif seq:
+        elif seq is not None:
+            # TODO: this doesn't really handle invalid sequences in the right way
             if c == "]":
-                seq = False
-                # First two cases are to deal with the case where / is the only character
-                # in the sequence but path_name is True so it shouldn't match anything
+                seq = None
                 if parts[-1] == "[":
                     parts = parts[:-1]
                 elif parts[-1] == "^" and parts[-2] == "[":
                     parts = parts[:-2]
                 else:
                     parts.append(c)
             elif c == "-":
                 parts.append(c)
             else:
                 parts += re.escape(c)
         elif c == "[":
             parts.append("[")
             if i < len(pat) - 1 and pat[i+1] in ("!", "^"):
                 parts.append("^")
                 i += 1
-            seq = True
+            seq = i
         elif c == "*":
             if i < len(pat) - 1 and pat[i+1] == "*":
-                parts.append(any_char + "*")
+                if i > 0 and pat[i-1] != "/":
+                    raise ValueError
+                parts.append(".*")
                 i += 1
-                if i < len(pat) - 1 and pat[i+1] == "*":
+                if i < len(pat) - 1 and pat[i+1] != "/":
                     raise ValueError
             else:
                 parts.append(any_char + "*")
         elif c == "?":
             parts.append(any_char)
+        elif c == "/" and not seq:
+            name_pattern = False
+            parts.append(c)
         else:
             parts.append(re.escape(c))
         i += 1
 
-    if seq:
+    if name_pattern:
+        parts[0] = "^"
+
+    if seq is not None:
         raise ValueError
     parts.append(suffix)
     try:
-        return component_pattern, re.compile("".join(parts))
+        return name_pattern, re.compile("".join(parts))
     except Exception:
-        raise
+        raise ValueError
+
+# Regexp matching rules that have to be converted to patterns
+pattern_re = re.compile(r".*[\*\[\?]")
 
 
 def parse_line(line):
     line = line.rstrip()
     if not line or line[0] == "#":
         return
 
     invert = line[0] == "!"
     if invert:
         line = line[1:]
 
     dir_only = line[-1] == "/"
 
     if dir_only:
         line = line[:-1]
 
-    return invert, dir_only, fnmatch_translate(line, dir_only)
+    # Could make a special case for **/foo, but we don't have any patterns like that
+    if not invert and not pattern_re.match(line):
+        literal = True
+        pattern = tuple(line.rsplit("/", 1))
+    else:
+        pattern = fnmatch_translate(line)
+        literal = False
+
+    return invert, dir_only, literal, pattern
 
 
 class PathFilter(object):
-    def __init__(self, root, extras=None):
+    def __init__(self, root, extras=None, cache=None):
         if root:
             ignore_path = os.path.join(root, ".gitignore")
         else:
             ignore_path = None
         if not ignore_path and not extras:
             self.trivial = True
             return
         self.trivial = False
 
-        self.rules_file = []
-        self.rules_dir = []
+        self.literals_file = defaultdict(dict)
+        self.literals_dir = defaultdict(dict)
+        self.patterns_file = []
+        self.patterns_dir = []
+        self.cache = cache or {}
 
         if extras is None:
             extras = []
 
         if ignore_path and os.path.exists(ignore_path):
-            self._read_ignore(ignore_path)
-
-        for item in extras:
-            self._read_line(item)
+            args = ignore_path, extras
+        else:
+            args = None, extras
+        self._read_ignore(*args)
 
-    def _read_ignore(self, ignore_path):
-        with open(ignore_path) as f:
-            for line in f:
-                self._read_line(line)
+    def _read_ignore(self, ignore_path, extras):
+        if ignore_path is not None:
+            with open(ignore_path) as f:
+                for line in f:
+                    self._read_line(line)
+        for line in extras:
+            self._read_line(line)
 
     def _read_line(self, line):
         parsed = parse_line(line)
         if not parsed:
             return
-        invert, dir_only, regexp = parsed
-        if dir_only:
-            self.rules_dir.append((regexp, invert))
-        else:
-            self.rules_file.append((regexp, invert))
+        invert, dir_only, literal, rule = parsed
+
+        if invert:
+            # For exclude rules, we attach the rules to all preceeding patterns, so
+            # that we can match patterns out of order and check if they were later
+            # overriden by an exclude rule
+            assert not literal
+            if not dir_only:
+                rules_iter = itertools.chain(
+                    itertools.chain(*(iteritems(item) for item in itervalues(self.literals_dir))),
+                    itertools.chain(*(iteritems(item) for item in itervalues(self.literals_file))),
+                    self.patterns_dir,
+                    self.patterns_file)
+            else:
+                rules_iter = itertools.chain(
+                    itertools.chain(*(iteritems(item) for item in itervalues(self.literals_dir))),
+                    self.patterns_dir)
 
-    def __call__(self, path):
-        if os.path.sep != "/":
-            path = path.replace(os.path.sep, "/")
+            for rules in rules_iter:
+                rules[1].append(rule)
+        else:
+            if literal:
+                if len(rule) == 1:
+                    dir_name, pattern = None, rule[0]
+                else:
+                    dir_name, pattern = rule
+                self.literals_dir[dir_name][pattern] = []
+                if not dir_only:
+                    self.literals_file[dir_name][pattern] = []
+            else:
+                self.patterns_dir.append((rule, []))
+                if not dir_only:
+                    self.patterns_file.append((rule, []))
 
-        if self.trivial:
-            return True
+    def filter(self, iterator):
+        empty = {}
+        for dirpath, dirnames, filenames in iterator:
+            orig_dirpath = dirpath
+            if os.path.sep != "/":
+                dirpath = dirpath.replace(os.path.sep, "/")
+
+            keep_dirs = []
+            keep_files = []
 
-        path_is_dir = path[-1] == "/"
-        if path_is_dir:
-            path = path[:-1]
-            rules = self.rules_dir
-        else:
-            rules = self.rules_file
+            for iter_items, literals, patterns, target, suffix in [
+                    (dirnames, self.literals_dir, self.patterns_dir, keep_dirs, "/"),
+                    (filenames, self.literals_file, self.patterns_file, keep_files, "")]:
+                for item in iter_items:
+                    name = item[0]
+                    if dirpath:
+                        path = "%s/%s" % (dirpath, name) + suffix
+                    else:
+                        path = name + suffix
+                    if path in self.cache:
+                        if not self.cache[path]:
+                            target.append(item)
+                        continue
+                    for rule_dir in [None, dirpath]:
+                        if name in literals.get(rule_dir, empty):
+                            exclude = literals[rule_dir][name]
+                            if not any(rule.match(path) for rule in exclude):
+                                # Skip this item
+                                self.cache[path] = True
+                                break
+                    else:
+                        for (component_only, pattern), exclude in patterns:
+                            if component_only:
+                                match = pattern.match(name)
+                            else:
+                                match = pattern.match(path)
+                            if match:
+                                if not any(rule.match(name if name_only else path)
+                                           for name_only, rule in exclude):
+                                    # Skip this item
+                                    self.cache[path] = True
+                                    break
+                        else:
+                            self.cache[path] = False
+                            target.append(item)
 
-        include = True
-        for regexp, invert in rules:
-            if not include and invert and regexp.match(path):
-                include = True
-            elif include and not invert and regexp.match(path):
-                include = False
-        return include
+            dirnames[:] = keep_dirs
+            assert ".git" not in dirnames
+            yield orig_dirpath, dirnames, keep_files
+
+    def __call__(self, iterator):
+        if self.trivial:
+            return iterator
+
+        return self.filter(iterator)
--- a/testing/web-platform/tests/tools/gitignore/tests/test_gitignore.py
+++ b/testing/web-platform/tests/tools/gitignore/tests/test_gitignore.py
@@ -1,82 +1,100 @@
 import pytest
 
 from ..gitignore import fnmatch_translate, PathFilter
 
 match_data = [
-    ("foo", False, ["a/foo", "foo"]),
-    ("*.a", False, ["foo.a", "a/foo.a", "a/b/foo.a", "a.a/foo.a"]),
-    ("*.py[co]", False, ["a.pyc", "a.pyo", "a/b/c.pyc"]),
-    ("\\#*", False, ["#a", "a/#b"]),
-    ("*#", False, ["a#", "a/b#", "#a#"]),
-    ("/*.c", False, ["a.c", ".c"]),
+    ("foo", True, ["a/foo", "foo"]),
+    ("*.a", True, ["foo.a", "a/foo.a", "a/b/foo.a", "a.a/foo.a"]),
+    ("*.py[co]", True, ["a.pyc", "a.pyo", "a/b/c.pyc"]),
+    ("\\#*", True, ["#a", "a/#b"]),
+    ("*#", True, ["a#", "a/b#", "#a#"]),
+    ("/*.c", True, ["a.c", ".c"]),
     ("**/b", False, ["a/b", "a/c/b"]),
     ("*b", True, ["ab"]),
-    ("**/b", True, ["a/b"]),
-    ("a/", True, ["a", "a/b", "a/b/c"])
+    ("*b", True, ["a/b"]),
+    ("**/b", False, ["a/b"]),
+    ("a/", True, ["a"]),
+    ("a[/]b", True, []),
+    ("**/b", False, ["a/c/b"]),
+    ("a?c", True, ["abc"]),
+    ("a[^b]c", True, ["acc"]),
+    ("a[b-c]c", True, ["abc", "acc"]),
+    ("a[^]c", True, ["ac"]),  # This is probably wrong
+    ("a[^]c", True, ["ac"]),  # This is probably wrong
 ]
 
 mismatch_data = [
-    ("foo", False, ["foob", "afoo"]),
-    ("*.a", False, ["a", "foo:a", "a.a/foo"]),
-    ("*.py[co]", False, ["a.pyd", "pyo"]),
-    ("/*.c", False, ["a/b.c"]),
-    ("*b", True, ["a/b"]),
-    ("**b", True, ["a/b"]),
-    ("a[/]b", True, ["a/b"]),
-    ("**/b", True, ["a/c/b"]),
-    ("a", True, ["ab"])
+    ("foo", True, ["foob", "afoo"]),
+    ("*.a", True, ["a", "foo:a", "a.a/foo"]),
+    ("*.py[co]", True, ["a.pyd", "pyo", "a.py"]),
+    ("a", True, ["ab"]),
+    ("a?c", True, ["ac", "abbc"]),
+    ("a[^b]c", True, ["abc"]),
+    ("a[b-c]c", True, ["adc"]),
 ]
 
 invalid_data = [
     "[a",
     "***/foo",
     "a\\",
+    "**b",
+    "b**/",
+    "[[]"
 ]
 
 filter_data = [
-    ("foo", True),
-    ("a", False),
-    ("a/b", False),
-    ("a/c", True),
-    ("a/c/", False),
-    ("c/b", True)
+    (["foo", "bar/", "/a", "*.py"],
+     [("", ["foo", "bar", "baz"], ["a"]),
+      ("baz", ["a"], ["foo", "bar"])],
+     [(["baz"], []),
+      (["a"], ["bar"])]),
+    (["#foo", "", "a*", "!a.py"],
+     [("", ["foo"], ["a", "a.foo", "a.py"])],
+     [(["foo"], ["a.py"])]),
 ]
 
 
 def expand_data(compact_data):
-    for pattern, path_name, inputs in compact_data:
+    for pattern, name_only, inputs in compact_data:
         for input in inputs:
-            yield pattern, input, path_name
+            yield pattern, name_only, input
 
 
-@pytest.mark.parametrize("pattern, input, path_name", expand_data(match_data))
-def tests_match(pattern, input, path_name):
-    regexp = fnmatch_translate(pattern, path_name)
+@pytest.mark.parametrize("pattern, name_only, input", expand_data(match_data))
+def tests_match(pattern, name_only, input):
+    name_only_result, regexp = fnmatch_translate(pattern)
+    assert name_only_result == name_only
+    if name_only:
+        input = input.rsplit("/", 1)[-1]
     assert regexp.match(input) is not None
 
 
-@pytest.mark.parametrize("pattern, input, path_name", expand_data(mismatch_data))
-def tests_no_match(pattern, input, path_name):
-    regexp = fnmatch_translate(pattern, path_name)
+@pytest.mark.parametrize("pattern, name_only, input", expand_data(mismatch_data))
+def tests_no_match(pattern, name_only, input):
+    name_only_result, regexp = fnmatch_translate(pattern)
+    assert name_only_result == name_only
+    if name_only:
+        input = input.rsplit("/", 1)[-1]
     assert regexp.match(input) is None
 
 
 @pytest.mark.parametrize("pattern", invalid_data)
 def tests_invalid(pattern):
     with pytest.raises(ValueError):
-        fnmatch_translate(pattern, False)
-    with pytest.raises(ValueError):
-        fnmatch_translate(pattern, True)
+        fnmatch_translate(pattern)
 
 
-@pytest.mark.parametrize("path, expected", filter_data)
-def test_path_filter(path, expected):
-    extras = [
-        "#foo",
-        "a  ",
-        "**/b",
-        "a/c/",
-        "!c/b",
-    ]
-    f = PathFilter(None, extras)
-    assert f(path) == expected
+@pytest.mark.parametrize("rules, input, expected", filter_data)
+def test_path_filter(rules, input, expected):
+    f = PathFilter(None, rules)
+    # Add some fake stat data
+    for i, item in enumerate(input):
+        repl = [input[i][0]]
+        for j in [1, 2]:
+            repl.append([(name, None) for name in input[i][j]])
+        input[i] = tuple(repl)
+
+    for i, output in enumerate(f(input)):
+        assert output[0] == input[i][0]
+        for j in [1, 2]:
+            assert [item[0] for item in output[j]] == expected[i][j-1]