Bug 1497898 - Add manifest caches for the mtime and gitignore rules, r=ato
author: James Graham <james@hoppipolla.co.uk>
Fri, 16 Nov 2018 18:48:30 +0000
changeset 503252 9d61715037440d58adf237622245bb847b9a898c
parent 503251 166466f3a5fe137e02a710012babe8a14756afee
child 503253 10e90d3295ee7a29b094b42912cdb584dfd47e98
push id: 10290
push user: ffxbld-merge
push date: Mon, 03 Dec 2018 16:23:23 +0000
treeherder: mozilla-beta@700bed2445e6 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: ato
bugs: 1497898
milestone: 65.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1497898 - Add manifest caches for the mtime and gitignore rules, r=ato. When processing the manifest using the worktree, rather than reading all files to see if the content changed, only process files whose mtime has been updated since the previous run. Also cache the result of running gitignore, so we can save a couple of seconds processing the gitignore rules. Depends on D8225. Differential Revision: https://phabricator.services.mozilla.com/D8226
testing/web-platform/tests/tools/manifest/vcs.py
--- a/testing/web-platform/tests/tools/manifest/vcs.py
+++ b/testing/web-platform/tests/tools/manifest/vcs.py
@@ -1,41 +1,45 @@
 import json
 import os
 import platform
+import stat
 import subprocess
+from collections import deque
 
 from .sourcefile import SourceFile
 
 
 class Git(object):
-    def __init__(self, repo_root, url_base, filters=None):
+    def __init__(self, repo_root, url_base, cache_path, rebuild=False):
         self.root = os.path.abspath(repo_root)
         self.git = Git.get_func(repo_root)
         self.url_base = url_base
+        # rebuild is a noop for now since we don't cache anything
 
     @staticmethod
     def get_func(repo_path):
         def git(cmd, *args):
             full_cmd = ["git", cmd] + list(args)
             try:
                 return subprocess.check_output(full_cmd, cwd=repo_path, stderr=subprocess.STDOUT)
             except Exception as e:
                 if platform.uname()[0] == "Windows" and isinstance(e, WindowsError):
                         full_cmd[0] = "git.bat"
                         return subprocess.check_output(full_cmd, cwd=repo_path, stderr=subprocess.STDOUT)
                 else:
                     raise
         return git
 
     @classmethod
-    def for_path(cls, path, url_base):
+    def for_path(cls, path, url_base, cache_path, rebuild=False):
         git = Git.get_func(path)
         try:
-            return cls(git("rev-parse", "--show-toplevel").rstrip(), url_base)
+            return cls(git("rev-parse", "--show-toplevel").rstrip(), url_base, cache_path,
+                       rebuild=rebuild)
         except subprocess.CalledProcessError:
             return None
 
     def _local_changes(self):
         changes = {}
         cmd = ["status", "-z", "--ignore-submodules=all"]
         data = self.git(*cmd)
 
@@ -72,24 +76,34 @@ class Git(object):
                 else:
                     contents = None
                 yield SourceFile(self.root,
                                  rel_path,
                                  self.url_base,
                                  hash,
                                  contents=contents), True
 
+    def dump_caches(self):
+        pass
+
 
 class FileSystem(object):
-    def __init__(self, root, url_base, mtime_filter):
+    def __init__(self, root, url_base, cache_path, rebuild=False):
         self.root = root
         self.url_base = url_base
+        if cache_path is not None:
+            self.mtime_cache = MtimeCache(cache_path, rebuild)
+            self.ignore_cache = GitIgnoreCache(cache_path, root, rebuild)
+        else:
+            self.mtime_cache = None
+            self.ignore_cache = None
         from gitignore import gitignore
-        self.path_filter = gitignore.PathFilter(self.root, extras=[".git/"])
-        self.mtime_filter = mtime_filter
+        self.path_filter = gitignore.PathFilter(self.root,
+                                                extras=[".git/"],
+                                                cache=self.ignore_cache)
 
     def __iter__(self):
         mtime_cache = self.mtime_cache
         for dirpath, dirnames, filenames in self.path_filter(walk(".")):
             for filename, path_stat in filenames:
                 # We strip the ./ prefix off the path
                 path = os.path.join(dirpath, filename)
                 if mtime_cache is None or mtime_cache.updated(path, path_stat):
@@ -110,38 +124,70 @@ class CacheFile(object):
     def __init__(self, cache_root, rebuild=False):
         if not os.path.exists(cache_root):
             os.makedirs(cache_root)
         self.path = os.path.join(cache_root, self.file_name)
         self.data = self.load(rebuild)
         self.modified = False
 
     def dump(self):
-        missing = set(self.data.keys()) - self.updated
-        if not missing or not self.modified:
+        if not self.modified:
             return
-        for item in missing:
-            del self.data[item]
         with open(self.path, 'w') as f:
             json.dump(self.data, f, indent=1)
 
-    def load(self):
+    def load(self, rebuild=False):
+        data = {}
         try:
-            with open(self.path, 'r') as f:
-                return json.load(f)
+            if not rebuild:
+                with open(self.path, 'r') as f:
+                    data = json.load(f)
+                data = self.check_valid(data)
         except IOError:
-            return {}
+            pass
+        return data
 
-    def update(self, rel_path, stat=None):
-        self.updated.add(rel_path)
-        try:
-            if stat is None:
-                stat = os.stat(os.path.join(self.root,
-                                            rel_path))
-        except Exception:
-            return True
+    def check_valid(self, data):
+        """Check if the cached data is valid and return an updated copy of the
+        cache containing only data that can be used."""
+        return data
+
 
+class MtimeCache(CacheFile):
+    file_name = "mtime.json"
+
+    def updated(self, rel_path, stat):
+        """Return a boolean indicating whether the file changed since the cache was last updated.
+
+        This implicitly updates the cache with the new mtime data."""
         mtime = stat.st_mtime
         if mtime != self.data.get(rel_path):
             self.modified = True
             self.data[rel_path] = mtime
             return True
         return False
+
+
+class GitIgnoreCache(CacheFile):
+    file_name = "gitignore.json"
+
+    def __init__(self, cache_root, ignore_path, rebuild=False):
+        self.ignore_path = ignore_path
+        super(GitIgnoreCache, self).__init__(cache_root, rebuild=False)
+
+    def check_valid(self, data):
+        mtime = os.path.getmtime(self.ignore_path)
+        if data.get("/gitignore_file") != [self.ignore_path, mtime]:
+            self.modified = True
+            data = {}
+            data["/gitignore_file"] = [self.ignore_path, mtime]
+        return data
+
+    def __contains__(self, key):
+        return key in self.data
+
+    def __getitem__(self, key):
+        return self.data[key]
+
+    def __setitem__(self, key, value):
+        if self.data.get(key) != value:
+            self.modified = True
+            self.data[key] = value