Bug 1636797 - In `hash.py`, enumerate files from the VCS rather than searching the filesystem directly r=ahal
authorRicky Stewart <rstewart@mozilla.com>
Mon, 17 Aug 2020 15:19:34 +0000
changeset 609591 98fddfef58ceb784e742575e9c010d04e6f82843
parent 609590 b0b25fca6075f3ae22eaa54f035a7f3331e14ba6
child 609592 4ac5156712d7aac5ae9c7c479131720bb928fae1
push id13553
push userffxbld-merge
push dateMon, 24 Aug 2020 12:51:36 +0000
treeherdermozilla-beta@a54f8b5d0977 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersahal
bugs1636797
milestone81.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1636797 - In `hash.py`, enumerate files from the VCS rather than searching the filesystem directly r=ahal This resolves a long-standing issue in development where `mach artifact` (and therefore `mach bootstrap`) would fail unpredictably if you had dirty, but ignored, files in your checkout. Resolving this problem often required unwieldy `hg purge`/`git ignore` incantations that are easy to get wrong. This patch addresses the problem by doing what we "should" have been doing all along, and consulting the VCS to list tracked files rather than listing EVERY file on disk and applying heuristics to determine whether they should be included in the hash. Differential Revision: https://phabricator.services.mozilla.com/D86780
config/check_js_msg_encoding.py
python/mozbuild/mozpack/files.py
python/mozversioncontrol/mozversioncontrol/__init__.py
taskcluster/taskgraph/util/hash.py
--- a/config/check_js_msg_encoding.py
+++ b/config/check_js_msg_encoding.py
@@ -49,21 +49,20 @@ def check_single_file(filename):
 
 
 def check_files():
     result = True
 
     with get_repository_from_env() as repo:
         root = repo.path
 
-        for filename in repo.get_files_in_working_directory():
-            if filename.endswith('.msg'):
-                if filename not in ignore_files:
-                    if not check_single_file(os.path.join(root, filename)):
-                        result = False
+        for filename, _ in repo.get_tracked_files_finder().find('**/*.msg'):
+            if filename not in ignore_files:
+                if not check_single_file(os.path.join(root, filename)):
+                    result = False
 
     return result
 
 
 def main():
     if not check_files():
         sys.exit(1)
 
--- a/python/mozbuild/mozpack/files.py
+++ b/python/mozbuild/mozpack/files.py
@@ -1,30 +1,32 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 from __future__ import absolute_import, print_function, unicode_literals
 
+import bisect
 import codecs
 import errno
 import inspect
 import os
 import platform
 import shutil
 import six
 import stat
 import subprocess
 import uuid
 import mozbuild.makeutil as makeutil
-from itertools import chain
+from itertools import chain, takewhile
 from mozbuild.preprocessor import Preprocessor
 from mozbuild.util import (
     FileAvoidWrite,
     ensure_unicode,
+    memoize
 )
 from mozpack.executables import (
     is_executable,
     may_strip,
     strip,
     may_elfhack,
     elfhack,
     xz_compress,
@@ -1233,8 +1235,44 @@ class MercurialRevisionFinder(BaseFinder
         # thousands of MercurialFile instances for every file in the repo is
         # inefficient.
         f = self._files[path]
         if not f:
             f = MercurialFile(self._client, self._rev, path)
             self._files[path] = f
 
         return f
+
+
+class FileListFinder(BaseFinder):
+    """Finder for a literal list of file names."""
+
+    def __init__(self, files):
+        """files must be a sorted list."""
+        self._files = files
+
+    @memoize
+    def _match(self, pattern):
+        """Return a sorted list of all files matching the given pattern."""
+        # We don't use the utility _find_helper method because it's not tuned
+        # for performance in the way that we would like this class to be. That's
+        # a possible avenue for refactoring here.
+        ret = []
+        # We do this as an optimization to figure out where in the sorted list
+        # to search and where to stop searching.
+        components = pattern.split('/')
+        prefix = '/'.join(takewhile(lambda s: '*' not in s, components))
+        start = bisect.bisect_left(self._files, prefix)
+        for i in six.moves.range(start, len(self._files)):
+            f = self._files[i]
+            if not f.startswith(prefix):
+                break
+            # Skip hidden files while scanning.
+            if '/.' in f[len(prefix):]:
+                continue
+            if mozpath.match(f, pattern):
+                ret.append(f)
+        return ret
+
+    def find(self, pattern):
+        pattern = pattern.strip('/')
+        for path in self._match(pattern):
+            yield path, File(path)
--- a/python/mozversioncontrol/mozversioncontrol/__init__.py
+++ b/python/mozversioncontrol/mozversioncontrol/__init__.py
@@ -9,16 +9,17 @@ import errno
 import os
 import re
 import shutil
 import subprocess
 import sys
 
 from mozbuild.util import ensure_subprocess_env
 from mozfile import which
+from mozpack.files import FileListFinder
 
 
 class MissingVCSTool(Exception):
     """Represents a failure to find a version control tool binary."""
 
 
 class MissingVCSInfo(Exception):
     """Represents a general failure to resolve a VCS interface."""
@@ -211,18 +212,24 @@ class Repository(object):
         '''
 
     @abc.abstractmethod
     def forget_add_remove_files(self, *paths):
         '''Undo the effects of a previous add_remove_files call for `paths`.
         '''
 
     @abc.abstractmethod
-    def get_files_in_working_directory(self):
-        """Obtain a list of managed files in the working directory."""
+    def get_tracked_files_finder(self):
+        """Obtain a mozpack.files.BaseFinder of managed files in the working
+        directory.
+
+        The Finder will have its list of all files in the repo cached for its
+        entire lifetime, so operations on the Finder will not track with, for
+        example, commits to the repo during the Finder's lifetime.
+        """
 
     @abc.abstractmethod
     def working_directory_clean(self, untracked=False, ignored=False):
         """Determine if the working directory is free of modifications.
 
         Returns True if the working directory does not have any file
         modifications. False otherwise.
 
@@ -414,20 +421,21 @@ class HgRepository(Repository):
             args = ['--config', 'extensions.automv='] + args
         self._run(*args)
 
     def forget_add_remove_files(self, *paths):
         if not paths:
             return
         self._run('forget', *paths)
 
-    def get_files_in_working_directory(self):
+    def get_tracked_files_finder(self):
         # Can return backslashes on Windows. Normalize to forward slashes.
-        return list(p.replace('\\', '/') for p in
-                    self._run(b'files', b'-0').split('\0') if p)
+        files = list(p.replace('\\', '/') for p in
+                     self._run(b'files', b'-0').split('\0') if p)
+        return FileListFinder(files)
 
     def working_directory_clean(self, untracked=False, ignored=False):
         args = ['status', '--modified', '--added', '--removed',
                 '--deleted']
         if untracked:
             args.append('--unknown')
         if ignored:
             args.append('--ignored')
@@ -544,18 +552,19 @@ class GitRepository(Repository):
             return
         self._run('add', *paths)
 
     def forget_add_remove_files(self, *paths):
         if not paths:
             return
         self._run('reset', *paths)
 
-    def get_files_in_working_directory(self):
-        return [p for p in self._run('ls-files', '-z').split('\0') if p]
+    def get_tracked_files_finder(self):
+        files = [p for p in self._run('ls-files', '-z').split('\0') if p]
+        return FileListFinder(files)
 
     def working_directory_clean(self, untracked=False, ignored=False):
         args = ['status', '--porcelain']
 
         # Even in --porcelain mode, behavior is affected by the
         # ``status.showUntrackedFiles`` option, which means we need to be
         # explicit about how to treat untracked files.
         if untracked:
--- a/taskcluster/taskgraph/util/hash.py
+++ b/taskcluster/taskgraph/util/hash.py
@@ -1,41 +1,46 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 from __future__ import absolute_import, print_function, unicode_literals
 from mozbuild.util import memoize
-from mozpack.files import FileFinder
 import mozpack.path as mozpath
+from mozversioncontrol import get_repository_object
 import hashlib
 import io
 import six
 
 
 @memoize
 def hash_path(path):
     """Hash a single file.
 
     Returns the SHA-256 hash in hex form.
     """
     with io.open(path, mode='rb') as fh:
         return hashlib.sha256(fh.read()).hexdigest()
 
 
+@memoize
+def get_file_finder(base_path):
+    return get_repository_object(base_path).get_tracked_files_finder()
+
+
 def hash_paths(base_path, patterns):
     """
     Give a list of path patterns, return a digest of the contents of all
     the corresponding files, similarly to git tree objects or mercurial
     manifests.
 
     Each file is hashed. The list of all hashes and file paths is then
     itself hashed to produce the result.
     """
-    finder = FileFinder(base_path)
+    finder = get_file_finder(base_path)
     h = hashlib.sha256()
     files = {}
     for pattern in patterns:
         found = list(finder.find(pattern))
         if found:
             files.update(found)
         else:
             raise Exception('%s did not match anything' % pattern)