Bug 1409876 - Record cache history to aid failure analysis; r=dustin
author Gregory Szorc <gps@mozilla.com>
Wed, 25 Oct 2017 16:58:37 -0700
changeset 388453 994704c5337e8adf1bf92043bcaa5d4c75ff5c90
parent 388452 1402f70041d9b22128b7d8819c7529cccb2b749f
child 388454 2f54b783114c413577487f40d70abaf04a10bece
push id 32750
push user archaeopteryx@coole-files.de
push date Thu, 26 Oct 2017 21:56:27 +0000
treeherder mozilla-central@a6d7be4ac1e3
reviewers dustin
bugs 1409876
milestone 58.0a1
Bug 1409876 - Record cache history to aid failure analysis; r=dustin

We introduce a per-cache .cachelog file containing important events in the cache's history, such as creation, requirements adjustment, and utilization.

If cache validation fails, we print the cache log. If a previous task was responsible for getting the cache into a bad state, its TASK_ID should be printed, allowing us to more easily identify mis-configured tasks.

MozReview-Commit-ID: BJun5Hi5w0s
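For context, each .cachelog entry added by this patch is a single appended line of the form "[<UTC timestamp>Z <TASK_ID>] <message>". The sketch below is an illustrative, self-contained rendering of that helper: it opens the log in text mode so it runs as-is under Python 3, whereas the helper in the diff writes bytes, and the example task IDs shown in the trailing comment are hypothetical.

    import datetime
    import os


    def write_audit_entry(path, msg):
        # Append one event per line: "[<ISO-8601 UTC>Z <TASK_ID>] <message>".
        # TASK_ID falls back to "UNKNOWN" when run outside of Taskcluster.
        now = datetime.datetime.utcnow().isoformat()
        task_id = os.environ.get('TASK_ID', 'UNKNOWN')
        with open(path, 'a') as fh:
            fh.write('[%sZ %s] %s\n' % (now, task_id, msg))


    # A cache's .cachelog might then accumulate lines such as (hypothetical IDs):
    #   [2017-10-25T23:58:37.123456Z QxT3...] created; requirements: gid=1000, uid=1000, version=1
    #   [2017-10-26T00:12:02.654321Z ZpW9...] used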
taskcluster/docker/recipes/run-task
--- a/taskcluster/docker/recipes/run-task
+++ b/taskcluster/docker/recipes/run-task
@@ -290,16 +290,19 @@ def main(args):
     # validate that use of the cache is acceptable.
     #
     # We /could/ blow away the cache data on requirements mismatch.
     # While this would be convenient, this could result in "competing" tasks
     # effectively undoing the other's work. This would slow down task
     # execution in aggregate. Without monitoring for this, people may not notice
     # the problem and tasks would be slower than they could be. We follow the
     # principle of "fail fast" to ensure optimal task execution.
+    #
+    # We also write an audit log of who used the caches. This log is printed
+    # during failures to aid debugging.
 
     if 'TASKCLUSTER_CACHES' in os.environ:
         caches = os.environ['TASKCLUSTER_CACHES'].split(';')
         del os.environ['TASKCLUSTER_CACHES']
     else:
         caches = []
 
     if 'TASKCLUSTER_UNTRUSTED_CACHES' in os.environ:
@@ -316,37 +319,48 @@ def main(args):
         # to force the use of a new cache.
         b'version=1',
         # Include the UID and GID the task will run as to ensure that tasks
         # with different UID and GID don't share the same cache.
         b'uid=%d' % uid,
         b'gid=%d' % gid,
     }
 
+    def write_audit_entry(path, msg):
+        now = datetime.datetime.utcnow().isoformat()
+        with open(path, 'ab') as fh:
+            fh.write(b'[%sZ %s] %s\n' % (
+                     now, os.environ.get('TASK_ID', 'UNKNOWN'), msg))
+
     for cache in caches:
         if not os.path.isdir(cache):
             print('error: cache %s is not a directory; this should never '
                   'happen' % cache)
             return 1
 
         requires_path = os.path.join(cache, '.cacherequires')
+        audit_path = os.path.join(cache, '.cachelog')
 
         # The cache is empty. Configure it.
         if not os.listdir(cache):
             print_line(b'cache', b'cache %s is empty; writing requirements: '
                        b'%s\n' % (cache, b' '.join(sorted(our_requirements))))
 
             # We write a requirements file so future invocations know what the
             # requirements are.
             with open(requires_path, 'wb') as fh:
                 fh.write(b'\n'.join(sorted(our_requirements)))
 
             # And make it read-only as a precaution against deletion.
             os.chmod(requires_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
 
+            write_audit_entry(audit_path,
+                              'created; requirements: %s' %
+                              ', '.join(sorted(our_requirements)))
+
             set_dir_permissions(cache, uid, gid)
 
         # The cache has content and we have a requirements file. Validate
         # requirements alignment.
         elif os.path.exists(requires_path):
             with open(requires_path, 'rb') as fh:
                 wanted_requirements = set(fh.read().splitlines())
 
@@ -369,39 +383,57 @@ def main(args):
                            b'changing ownership to facilitate cache use\n' %
                            cache)
                 chown_recursive(cache, user.pw_name, group.gr_name, uid, gid)
 
                 # And write out the updated reality.
                 with open(requires_path, 'wb') as fh:
                     fh.write(b'\n'.join(sorted(our_requirements)))
 
+                write_audit_entry(audit_path,
+                                  'chown; requirements: %s' %
+                                  ', '.join(sorted(our_requirements)))
+
             elif missing:
                 print('error: requirements for populated cache %s differ from '
                       'this task' % cache)
                 print('cache requirements: %s' % ' '.join(sorted(
                     wanted_requirements)))
                 print('our requirements:   %s' % ' '.join(sorted(
                     our_requirements)))
                 if any(s.startswith(('uid=', 'gid=')) for s in missing):
                     print(CACHE_UID_GID_MISMATCH)
+
+                write_audit_entry(audit_path,
+                                  'requirements mismatch; wanted: %s' %
+                                  ', '.join(sorted(our_requirements)))
+
+                print('')
+                print('audit log:')
+                with open(audit_path, 'rb') as fh:
+                    print(fh.read())
+
                 return 1
+            else:
+                write_audit_entry(audit_path, 'used')
 
             # We don't need to adjust permissions here because the cache is
             # associated with a uid/gid and the first task should have set
             # a proper owner/group.
 
         # The cache has content and no requirements file. This shouldn't
         # happen because run-task should be the first thing that touches a
         # cache.
         else:
             print('error: cache %s is not empty and is missing a '
                   '.cacherequires file; the cache names for this task are '
                   'likely mis-configured or TASKCLUSTER_CACHES is not set '
                   'properly' % cache)
+
+            write_audit_entry(audit_path, 'missing .cacherequires')
             return 1
 
     if 'TASKCLUSTER_VOLUMES' in os.environ:
         volumes = os.environ['TASKCLUSTER_VOLUMES'].split(';')
         del os.environ['TASKCLUSTER_VOLUMES']
     else:
         volumes = []
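As a companion to the fail-fast comment near the top of the diff, the following condensed sketch summarizes the per-cache decision that the new audit entries instrument. It is illustrative only: classify_cache is a hypothetical name, the requirements comparison is collapsed to an equality check, and the uid/gid chown path for untrusted caches is omitted; the diff above remains the authoritative logic.

    import os


    def classify_cache(cache, our_requirements):
        # Condensed, illustrative view of run-task's per-cache validation.
        requires_path = os.path.join(cache, '.cacherequires')

        if not os.listdir(cache):
            # Empty cache: write .cacherequires and log "created" in .cachelog.
            return 'create'

        if not os.path.exists(requires_path):
            # Populated but unmanaged: log "missing .cacherequires", fail fast.
            return 'fail'

        with open(requires_path, 'rb') as fh:
            wanted_requirements = set(fh.read().splitlines())

        if wanted_requirements != our_requirements:
            # Mismatch: log it, dump the .cachelog so the offending TASK_ID is
            # visible, and fail.
            return 'fail'

        # Requirements align: log "used" and let the task proceed.
        return 'use'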