Bug 1343718 - Limit artifact downloads by size rather than by number. r=chmanchester
author Mike Hommey <mh+mozilla@glandium.org>
Wed, 12 Apr 2017 16:17:52 +0900
changeset 353080 3717bedc62f9fb6b68dd8618ff2e70a02184428f
parent 353079 b5a3e4cb50908c4893bc9252cc19d9bf019ff04c
child 353081 25b64d100bda232ad31a4e72fdb7b92e3220955f
push id 31656
push user ihsiao@mozilla.com
push date Fri, 14 Apr 2017 09:10:41 +0000
treeherder mozilla-central@cda24082bff8
reviewers chmanchester
bugs 1343718
milestone 55.0a1
Bug 1343718 - Limit artifact downloads by size rather than by number. r=chmanchester

This adds a unit test for the expected behavior, and adds the modules from mach_bootstrap.py that were missing from the virtualenv and are needed for the test to run.
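For context, a minimal standalone sketch of the size-bounded, least-recently-used purge this patch moves to (the names purge_lru, cache_dir, max_bytes and min_files are illustrative only, not from artifacts.py; the real logic lives in ArtifactPersistLimit.remove_old_files in the diff below):

import os

def purge_lru(cache_dir, max_bytes=1024 * 1024 * 1024, min_files=6):
    # Collect regular files with their access time and size.
    entries = []
    for name in os.listdir(cache_dir):
        path = os.path.join(cache_dir, name)
        if os.path.isfile(path):
            st = os.stat(path)
            entries.append((st.st_atime, st.st_size, path))
    # Least recently used first, as in ArtifactPersistLimit.remove_old_files.
    entries.sort()
    total = sum(size for _, size, _ in entries)
    # Evict until we fit in the byte budget, but always keep min_files around.
    while len(entries) > min_files and total >= max_bytes:
        _, size, path = entries.pop(0)
        os.remove(path)
        total -= size

Unlike this sketch, the patch ties the purge into dlmanager's PersistLimit so it happens as part of the normal download flow.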
build/virtualenv_packages.txt
python/mozbuild/mozbuild/artifacts.py
python/mozbuild/mozbuild/test/python.ini
python/mozbuild/mozbuild/test/test_artifacts.py
--- a/build/virtualenv_packages.txt
+++ b/build/virtualenv_packages.txt
@@ -33,8 +33,11 @@ pyasn1_modules.pth:python/pyasn1-modules
 redo.pth:python/redo
 requests.pth:python/requests
 rsa.pth:python/rsa
 futures.pth:python/futures
 ecc.pth:python/PyECC
 xpcshell.pth:testing/xpcshell
 pyyaml.pth:python/pyyaml/lib
 pytoml.pth:python/pytoml
+pylru.pth:python/pylru
+taskcluster.pth:taskcluster
+dlmanager.pth:python/dlmanager
--- a/python/mozbuild/mozbuild/artifacts.py
+++ b/python/mozbuild/mozbuild/artifacts.py
@@ -93,19 +93,22 @@ NUM_PUSHHEADS_TO_QUERY_PER_PARENT = 50  
 # There isn't really such a thing as a reasonable default here, because we don't
 # know how many pushheads we'll need to look at to find a build with our artifacts,
 # and we don't know how many changesets will be in each push. For now we assume
 # we'll find a build in the last 50 pushes, assuming each push contains 10 changesets.
 NUM_REVISIONS_TO_QUERY = 500
 
 MAX_CACHED_TASKS = 400  # Number of pushheads to cache Task Cluster task data for.
 
-# Number of downloaded artifacts to cache.  Each artifact can be very large,
-# so don't make this to large!  TODO: make this a size (like 500 megs) rather than an artifact count.
-MAX_CACHED_ARTIFACTS = 6
+# Minimum number of downloaded artifacts to keep. Each artifact can be very large,
+# so don't make this too large!
+MIN_CACHED_ARTIFACTS = 6
+
+# Maximum size of the downloaded artifacts to keep in cache, in bytes (1GiB).
+MAX_CACHED_ARTIFACTS_SIZE = 1024 * 1024 * 1024
 
 # Downloaded artifacts are cached, and a subset of their contents extracted for
 # easy installation.  This is most noticeable on Mac OS X: since mounting and
 # copying from DMG files is very slow, we extract the desired binaries to a
 # separate archive for fast re-installation.
 PROCESSED_SUFFIX = '.processed.jar'
 
 CANDIDATE_TREES = (
@@ -659,47 +662,112 @@ class TaskCache(CacheManager):
             # public/build/buildprops.json for this purpose.
             url = get_artifact_url(taskId, artifact_name)
             urls.append(url)
         if not urls:
             raise ValueError('Task for {namespace} existed, but no artifacts found!'.format(namespace=namespace))
         return urls
 
 
-class ArtifactCache(CacheManager):
+class ArtifactPersistLimit(PersistLimit):
+    '''Handle persistence for the artifacts cache.
+
+    When instantiating a DownloadManager, it starts by filling the
+    PersistLimit instance it's given with register_dir_content.
+    In practice, this registers all the files already in the cache directory.
+    After a download finishes, the newly downloaded file is registered, and the
+    oldest files registered to the PersistLimit instance are removed depending
+    on the size and file limits it's configured for.
+    This is all good, but there are a few tweaks we want here:
+    - We have pickle files in the cache directory that we don't want purged.
+    - Files that were just downloaded in the same session shouldn't be purged.
+      (if for some reason we end up downloading more than the default max size,
+       we don't want the files to be purged)
+    To achieve this, this subclass of PersistLimit inhibits the register_file
+    method for pickle files and tracks what files were downloaded in the same
+    session to avoid removing them.
+
+    The register_file method may be used to register cache matches too, so that
+    later sessions know they were freshly used.
+    '''
+
+    def __init__(self, log=None):
+        super(ArtifactPersistLimit, self).__init__(
+            size_limit=MAX_CACHED_ARTIFACTS_SIZE,
+            file_limit=MIN_CACHED_ARTIFACTS)
+        self._log = log
+        self._registering_dir = False
+        self._downloaded_now = set()
+
+    def log(self, *args, **kwargs):
+        if self._log:
+            self._log(*args, **kwargs)
+
+    def register_file(self, path):
+        if path.endswith('.pickle'):
+            return
+        if not self._registering_dir:
+            # Touch the file so that subsequent calls to a mach artifact
+            # command know it was recently used. While remove_old_files
+            # is based on access time, in various cases, the access time is not
+            # updated when just reading the file, so we force an update.
+            try:
+                os.utime(path, None)
+            except OSError:
+                pass
+            self._downloaded_now.add(path)
+        super(ArtifactPersistLimit, self).register_file(path)
+
+    def register_dir_content(self, directory, pattern="*"):
+        self._registering_dir = True
+        super(ArtifactPersistLimit, self).register_dir_content(
+            directory, pattern)
+        self._registering_dir = False
+
+    def remove_old_files(self):
+        from dlmanager import fs
+        files = sorted(self.files, key=lambda f: f.stat.st_atime)
+        kept = []
+        while len(files) > self.file_limit and \
+                self._files_size >= self.size_limit:
+            f = files.pop(0)
+            if f.path in self._downloaded_now:
+                kept.append(f)
+                continue
+            fs.remove(f.path)
+            self.log(logging.INFO, 'artifact',
+                {'filename': f.path},
+                'Purged artifact {filename}')
+            self._files_size -= f.stat.st_size
+        self.files = files + kept
+
+    def remove_all(self):
+        from dlmanager import fs
+        for f in self.files:
+            fs.remove(f.path)
+        self._files_size = 0
+        self.files = []
+
+
+class ArtifactCache(object):
     '''Fetch Task Cluster artifact URLs and purge least recently used artifacts from disk.'''
 
     def __init__(self, cache_dir, log=None, skip_cache=False):
-        # TODO: instead of storing N artifact packages, store M megabytes.
-        CacheManager.__init__(self, cache_dir, 'fetch', MAX_CACHED_ARTIFACTS, cache_callback=self.delete_file, log=log, skip_cache=skip_cache)
         self._cache_dir = cache_dir
-        size_limit = 1024 * 1024 * 1024 # 1Gb in bytes.
-        file_limit = 4 # But always keep at least 4 old artifacts around.
-        persist_limit = PersistLimit(size_limit, file_limit)
-        self._download_manager = DownloadManager(self._cache_dir, persist_limit=persist_limit)
+        self._log = log
+        self._skip_cache = skip_cache
+        self._persist_limit = ArtifactPersistLimit(log)
+        self._download_manager = DownloadManager(
+            self._cache_dir, persist_limit=self._persist_limit)
         self._last_dl_update = -1
 
-    def delete_file(self, key, value):
-        try:
-            os.remove(value)
-            self.log(logging.INFO, 'artifact',
-                {'filename': value},
-                'Purged artifact {filename}')
-        except (OSError, IOError):
-            pass
+    def log(self, *args, **kwargs):
+        if self._log:
+            self._log(*args, **kwargs)
 
-        try:
-            os.remove(value + PROCESSED_SUFFIX)
-            self.log(logging.INFO, 'artifact',
-                {'filename': value + PROCESSED_SUFFIX},
-                'Purged processed artifact {filename}')
-        except (OSError, IOError):
-            pass
-
-    @cachedmethod(operator.attrgetter('_cache'))
     def fetch(self, url, force=False):
         # We download to a temporary name like HASH[:16]-basename to
         # differentiate among URLs with the same basenames.  We used to then
         # extract the build ID from the downloaded artifact and use it to make a
         # human readable unique name, but extracting build IDs is time consuming
         # (especially on Mac OS X, where we must mount a large DMG file).
         hash = hashlib.sha256(url).hexdigest()[:16]
         fname = hash + '-' + os.path.basename(url)
@@ -727,24 +795,38 @@ class ArtifactCache(CacheManager):
                 self._last_dl_update = now
                 self.log(logging.INFO, 'artifact',
                          {'bytes_so_far': bytes_so_far, 'total_size': total_size, 'percent': percent},
                          'Downloading... {percent:02.1f} %')
 
             if dl:
                 dl.set_progress(download_progress)
                 dl.wait()
+            else:
+                # Avoid the file being removed if it was in the cache already.
+                path = os.path.join(self._cache_dir, fname)
+                self._persist_limit.register_file(path)
+
             self.log(logging.INFO, 'artifact',
                 {'path': os.path.abspath(mozpath.join(self._cache_dir, fname))},
                 'Downloaded artifact to {path}')
             return os.path.abspath(mozpath.join(self._cache_dir, fname))
         finally:
             # Cancel any background downloads in progress.
             self._download_manager.cancel()
 
+    def clear_cache(self):
+        if self._skip_cache:
+            self.log(logging.DEBUG, 'artifact',
+                {},
+                'Skipping cache: ignoring clear_cache!')
+            return
+
+        self._persist_limit.remove_all()
+
 
 class Artifacts(object):
     '''Maintain state to efficiently fetch build artifacts from a Firefox tree.'''
 
     def __init__(self, tree, substs, defines, job=None, log=None,
                  cache_dir='.', hg=None, git=None, skip_cache=False,
                  topsrcdir=None):
         if (hg and git) or (not hg and not git):
@@ -936,16 +1018,18 @@ class Artifacts(object):
             self.log(logging.INFO, 'artifact',
                 {'filename': filename},
                 'Processing contents of {filename}')
             self.log(logging.INFO, 'artifact',
                 {'processed_filename': processed_filename},
                 'Writing processed {processed_filename}')
             self._artifact_job.process_artifact(filename, processed_filename)
 
+        self._artifact_cache._persist_limit.register_file(processed_filename)
+
         self.log(logging.INFO, 'artifact',
             {'processed_filename': processed_filename},
             'Installing from processed {processed_filename}')
 
         # Copy all .so files, avoiding modification where possible.
         ensureParentDir(mozpath.join(distdir, '.dummy'))
 
         with zipfile.ZipFile(processed_filename) as zf:
@@ -966,18 +1050,17 @@ class Artifacts(object):
                     perms |= stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH # u+w, a+r.
                     os.chmod(n, perms)
         return 0
 
     def install_from_url(self, url, distdir):
         self.log(logging.INFO, 'artifact',
             {'url': url},
             'Installing from {url}')
-        with self._artifact_cache as artifact_cache:  # The with block handles persistence.
-            filename = artifact_cache.fetch(url)
+        filename = self._artifact_cache.fetch(url)
         return self.install_from_file(filename, distdir)
 
     def _install_from_hg_pushheads(self, hg_pushheads, distdir):
         """Iterate pairs (hg_hash, {tree-set}) associating hg revision hashes
         and tree-sets they are known to be in, trying to download and
         install from each.
         """
 
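Since ArtifactCache no longer derives from CacheManager, callers no longer need the "with self._artifact_cache as artifact_cache:" block for persistence, as the change to install_from_url above shows; constructing the cache and calling fetch is enough, and the ArtifactPersistLimit described in the docstring earlier handles purging. A rough sketch of the resulting call pattern (the cache directory and URL below are made up for illustration):

from mozbuild.artifacts import ArtifactCache

cache = ArtifactCache('/tmp/artifact-cache')  # illustrative cache directory
path = cache.fetch('http://server/target.tar.bz2')  # hypothetical artifact URL
# fetch() registers the file with ArtifactPersistLimit, so a later session can
# purge it once the 1 GiB budget is exceeded while keeping at least
# MIN_CACHED_ARTIFACTS files around.
cache.clear_cache()  # removes all cached artifacts unless skip_cache was set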
--- a/python/mozbuild/mozbuild/test/python.ini
+++ b/python/mozbuild/mozbuild/test/python.ini
@@ -21,16 +21,17 @@
 [configure/test_util.py]
 [controller/test_ccachestats.py]
 [controller/test_clobber.py]
 [frontend/test_context.py]
 [frontend/test_emitter.py]
 [frontend/test_namespaces.py]
 [frontend/test_reader.py]
 [frontend/test_sandbox.py]
+[test_artifacts.py]
 [test_base.py]
 [test_containers.py]
 [test_dotproperties.py]
 [test_expression.py]
 [test_jarmaker.py]
 [test_line_endings.py]
 [test_makeutil.py]
 [test_mozconfig.py]
new file mode 100644
--- /dev/null
+++ b/python/mozbuild/mozbuild/test/test_artifacts.py
@@ -0,0 +1,145 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import unicode_literals
+
+import os
+import mozunit
+import time
+import unittest
+from tempfile import mkdtemp
+from shutil import rmtree
+
+from mozbuild.artifacts import ArtifactCache
+from mozbuild import artifacts
+
+
+CONTENTS = {
+    'http://server/foo': b'foo',
+    'http://server/bar': b'bar' * 400,
+    'http://server/qux': b'qux' * 400,
+    'http://server/fuga': b'fuga' * 300,
+    'http://server/hoge': b'hoge' * 300,
+    'http://server/larger': b'larger' * 3000,
+}
+
+class FakeResponse(object):
+    def __init__(self, content):
+        self._content = content
+
+    @property
+    def headers(self):
+        return {
+            'Content-length': str(len(self._content))
+        }
+
+    def iter_content(self, chunk_size):
+        content = memoryview(self._content)
+        while content:
+            yield content[:chunk_size]
+            content = content[chunk_size:]
+
+    def raise_for_status(self):
+        pass
+
+    def close(self):
+        pass
+
+
+class FakeSession(object):
+    def get(self, url, stream=True):
+        assert stream is True
+        return FakeResponse(CONTENTS[url])
+
+
+class TestArtifactCache(unittest.TestCase):
+    def setUp(self):
+        self.min_cached_artifacts = artifacts.MIN_CACHED_ARTIFACTS
+        self.max_cached_artifacts_size = artifacts.MAX_CACHED_ARTIFACTS_SIZE
+        artifacts.MIN_CACHED_ARTIFACTS = 2
+        artifacts.MAX_CACHED_ARTIFACTS_SIZE = 4096
+
+        self._real_utime = os.utime
+        os.utime = self.utime
+        self.timestamp = time.time() - 86400
+
+        self.tmpdir = mkdtemp()
+
+    def tearDown(self):
+        rmtree(self.tmpdir)
+        artifacts.MIN_CACHED_ARTIFACTS = self.min_cached_artifacts
+        artifacts.MAX_CACHED_ARTIFACTS_SIZE = self.max_cached_artifacts_size
+        os.utime = self._real_utime
+
+    def utime(self, path, times):
+        if times is None:
+            # Ensure all downloaded files have a different timestamp
+            times = (self.timestamp, self.timestamp)
+            self.timestamp += 2
+        self._real_utime(path, times)
+
+    def test_artifact_cache_persistence(self):
+        cache = ArtifactCache(self.tmpdir)
+        cache._download_manager.session = FakeSession()
+
+        path = cache.fetch('http://server/foo')
+        expected = [os.path.basename(path)]
+        self.assertEqual(os.listdir(self.tmpdir), expected)
+
+        path = cache.fetch('http://server/bar')
+        expected.append(os.path.basename(path))
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        # We're downloading more than the cache allows us, but since it's all
+        # in the same session, no purge happens.
+        path = cache.fetch('http://server/qux')
+        expected.append(os.path.basename(path))
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        path = cache.fetch('http://server/fuga')
+        expected.append(os.path.basename(path))
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        cache = ArtifactCache(self.tmpdir)
+        cache._download_manager.session = FakeSession()
+
+        # Downloading a new file in a new session purges the oldest files in
+        # the cache.
+        path = cache.fetch('http://server/hoge')
+        expected.append(os.path.basename(path))
+        expected = expected[2:]
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        # Downloading a file already in the cache leaves the cache untouched
+        cache = ArtifactCache(self.tmpdir)
+        cache._download_manager.session = FakeSession()
+
+        path = cache.fetch('http://server/qux')
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        # bar was purged earlier, so re-downloading it should purge the oldest
+        # downloaded file, which at this point would be qux; but we also
+        # re-downloaded qux in the meantime, so the next one (fuga) should be
+        # the purged one.
+        cache = ArtifactCache(self.tmpdir)
+        cache._download_manager.session = FakeSession()
+
+        path = cache.fetch('http://server/bar')
+        expected.append(os.path.basename(path))
+        expected = [p for p in expected if 'fuga' not in p]
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        # Downloading one file larger than the cache size should still leave
+        # MIN_CACHED_ARTIFACTS files.
+        cache = ArtifactCache(self.tmpdir)
+        cache._download_manager.session = FakeSession()
+
+        path = cache.fetch('http://server/larger')
+        expected.append(os.path.basename(path))
+        expected = expected[-2:]
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+
+if __name__ == '__main__':
+    mozunit.main()
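Note: once registered in python.ini above, the new test can presumably be run on its own with mach's Python test runner, e.g. ./mach python-test python/mozbuild/mozbuild/test/test_artifacts.py, or directly as python test_artifacts.py thanks to the mozunit.main() entry point, assuming the build virtualenv provides the pylru, taskcluster and dlmanager packages added in virtualenv_packages.txt.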