Bug 1288610 - Add functions for creating deterministic tar archives; r=ted
author Gregory Szorc <gps@mozilla.com>
Mon, 25 Jul 2016 12:46:07 -0700
changeset 346522 6f6d78bc38f446209efbb666a917046911f7e9e5
parent 346521 9e9ba7b85410185a32eb2448508439b756325091
child 346523 f36727c412bc19f62cf0ad54a8149db9c32f88b5
push id 6389
push user raliiev@mozilla.com
push date Mon, 19 Sep 2016 13:38:22 +0000
treeherder mozilla-beta@01d67bfe6c81 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers ted
bugs 1288610
milestone 50.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1288610 - Add functions for creating deterministic tar archives; r=ted I have a need to create tar archives deterministically and reproducibly. Since we already have similar functionality in mozpack for producing zip/jar archives, I figured it made sense for this functionality to live in mozpack. I made the functionality as simple as possible: we only accept files from the filesystem and the set of files must be known in advance. No class to hold/buffer state: just a simple function that takes a mapping of files and writes to a stream. MozReview-Commit-ID: If0NTcA7wpc
python/mozbuild/mozpack/archive.py
python/mozbuild/mozpack/test/test_archive.py
new file mode 100644
--- /dev/null
+++ b/python/mozbuild/mozpack/archive.py
@@ -0,0 +1,107 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import absolute_import
+
+import bz2
+import gzip
+import stat
+import tarfile
+
+
+# 2016-01-01T00:00:00+0000
+DEFAULT_MTIME = 1451606400
+
+
+def create_tar_from_files(fp, files):
+    """Create a tar file deterministically.
+
+    ``files`` is a dict mapping names of files in the archive to local
+    filesystem paths; entries are written in sorted order of archive name.
+
+    The archive is written to ``fp``, a file handle opened for writing. Every
+    entry gets uid/gid 0, empty uname/gname, and mtime ``DEFAULT_MTIME``.
+
+    Raises ``ValueError`` for non-regular files and setuid/setgid files.
+
+    FUTURE accept mozpack.files classes for writing
+    FUTURE accept a filename argument (or create APIs to write files)
+    """
+    with tarfile.open(name='', mode='w', fileobj=fp, dereference=True) as tf:
+        for archive_path, fs_path in sorted(files.items()):  # stable order
+            ti = tf.gettarinfo(fs_path, archive_path)
+
+            if not ti.isreg():
+                raise ValueError('not a regular file: %s' % fs_path)
+
+            # Disallow setuid and setgid bits. This is an arbitrary restriction.
+            # However, since we set uid/gid to root:root, setuid and setgid
+            # would be a glaring security hole if the archive were
+            # extracted as root.
+            if ti.mode & (stat.S_ISUID | stat.S_ISGID):
+                raise ValueError('cannot add file with setuid or setgid set: '
+                                 '%s' % fs_path)
+
+            # Set uid, gid, username, and group as deterministic values.
+            ti.uid = 0
+            ti.gid = 0
+            ti.uname = ''
+            ti.gname = ''
+
+            # Set mtime to a constant value.
+            ti.mtime = DEFAULT_MTIME
+
+            with open(fs_path, 'rb') as fh:
+                tf.addfile(ti, fh)
+
+
+def create_tar_gz_from_files(fp, files, filename=None, compresslevel=9):
+    """Create a tar.gz file deterministically from files.
+
+    Wraps ``create_tar_from_files`` with gzip compression. ``filename`` (or
+    ``''`` when falsy) and ``compresslevel`` are passed to ``gzip.GzipFile``.
+
+    The passed file handle should be opened for writing in binary mode.
+    When the function returns, all data has been written to the handle.
+    """
+    # Bytes 4-7 of the gzip header contain an mtime. Pin it to a known
+    # value so output is deterministic.
+    gf = gzip.GzipFile(filename=filename or '', mode='wb', fileobj=fp,
+                       compresslevel=compresslevel, mtime=DEFAULT_MTIME)
+    with gf:
+        create_tar_from_files(gf, files)
+
+
+class _BZ2Proxy(object):
+    """Write-only file object that bz2-compresses data into ``fp``."""
+    def __init__(self, fp, compresslevel=9):
+        self.fp = fp
+        self.compressor = bz2.BZ2Compressor(compresslevel=compresslevel)
+        self.pos = 0  # compressed bytes written to fp so far
+
+    def tell(self):
+        return self.pos
+
+    def write(self, data):
+        data = self.compressor.compress(data)  # may be empty while buffering
+        self.pos += len(data)
+        self.fp.write(data)
+
+    def close(self):
+        data = self.compressor.flush()  # drain what the compressor still holds
+        self.pos += len(data)
+        self.fp.write(data)  # note: fp itself is not closed here
+
+
+def create_tar_bz2_from_files(fp, files, compresslevel=9):
+    """Create a tar.bz2 file deterministically from files.
+
+    This is a glorified wrapper around ``create_tar_from_files`` that
+    adds bzip2 compression.
+
+    This function is similar to ``create_tar_gz_from_files()``.
+    """
+    proxy = _BZ2Proxy(fp, compresslevel=compresslevel)
+    create_tar_from_files(proxy, files)
+    proxy.close()  # flush the compressor's remaining output into fp
new file mode 100644
--- /dev/null
+++ b/python/mozbuild/mozpack/test/test_archive.py
@@ -0,0 +1,190 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import absolute_import
+
+import hashlib
+import os
+import shutil
+import stat
+import tarfile
+import tempfile
+import unittest
+
+from mozpack.archive import (
+    DEFAULT_MTIME,
+    create_tar_from_files,
+    create_tar_gz_from_files,
+    create_tar_bz2_from_files,
+)
+
+from mozunit import main
+
+
+MODE_STANDARD = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH
+
+
+def file_hash(path):  # returns the hex SHA-1 digest of the file at ``path``
+    h = hashlib.sha1()
+    with open(path, 'rb') as fh:
+        while True:
+            data = fh.read(8192)  # hash in 8 KiB chunks
+            if not data:
+                break
+            h.update(data)
+
+    return h.hexdigest()
+
+
+class TestArchive(unittest.TestCase):  # tests for the mozpack.archive tar helpers
+    def _create_files(self, root):  # file0..file9 in root -> {name: path}
+        files = {}
+        for i in range(10):
+            p = os.path.join(root, b'file%d' % i)
+            with open(p, 'wb') as fh:
+                fh.write(b'file%d' % i)
+            # Need to set permissions or umask may influence testing.
+            os.chmod(p, MODE_STANDARD)
+            files[b'file%d' % i] = p
+
+        return files
+
+    def _verify_basic_tarfile(self, tf):  # checks names and pinned metadata
+        self.assertEqual(len(tf.getmembers()), 10)
+
+        names = ['file%d' % i for i in range(10)]
+        self.assertEqual(tf.getnames(), names)
+
+        for ti in tf.getmembers():
+            self.assertEqual(ti.uid, 0)
+            self.assertEqual(ti.gid, 0)
+            self.assertEqual(ti.uname, '')
+            self.assertEqual(ti.gname, '')
+            self.assertEqual(ti.mode, MODE_STANDARD)
+            self.assertEqual(ti.mtime, DEFAULT_MTIME)
+
+    def test_dirs_refused(self):  # a directory is not a regular file
+        d = tempfile.mkdtemp()
+        try:
+            tp = os.path.join(d, 'test.tar')
+            with open(tp, 'wb') as fh:
+                with self.assertRaisesRegexp(ValueError, 'not a regular'):
+                    create_tar_from_files(fh, {'test': d})
+        finally:
+            shutil.rmtree(d)
+
+    def test_setuid_setgid_refused(self):
+        d = tempfile.mkdtemp()
+        try:
+            uid = os.path.join(d, 'setuid')
+            gid = os.path.join(d, 'setgid')
+            with open(uid, 'a'):
+                pass
+            with open(gid, 'a'):
+                pass
+
+            os.chmod(uid, MODE_STANDARD | stat.S_ISUID)
+            os.chmod(gid, MODE_STANDARD | stat.S_ISGID)
+
+            tp = os.path.join(d, 'test.tar')
+            with open(tp, 'wb') as fh:
+                with self.assertRaisesRegexp(ValueError, 'cannot add file with setuid'):
+                    create_tar_from_files(fh, {'test': uid})
+                with self.assertRaisesRegexp(ValueError, 'cannot add file with setuid'):
+                    create_tar_from_files(fh, {'test': gid})  # same message covers setgid
+        finally:
+            shutil.rmtree(d)
+
+    def test_create_tar_basic(self):
+        d = tempfile.mkdtemp()
+        try:
+            files = self._create_files(d)
+
+            tp = os.path.join(d, 'test.tar')
+            with open(tp, 'wb') as fh:
+                create_tar_from_files(fh, files)
+
+            # Output should be deterministic.
+            self.assertEqual(file_hash(tp), 'cd16cee6f13391abd94dfa435d2633b61ed727f1')
+
+            with tarfile.open(tp, 'r') as tf:
+                self._verify_basic_tarfile(tf)
+
+        finally:
+            shutil.rmtree(d)
+
+    def test_executable_preserved(self):  # the user-execute bit must survive archiving
+        d = tempfile.mkdtemp()
+        try:
+            p = os.path.join(d, 'exec')
+            with open(p, 'wb') as fh:
+                fh.write('#!/bin/bash\n')
+            os.chmod(p, MODE_STANDARD | stat.S_IXUSR)
+
+            tp = os.path.join(d, 'test.tar')
+            with open(tp, 'wb') as fh:
+                create_tar_from_files(fh, {'exec': p})
+
+            self.assertEqual(file_hash(tp), '357e1b81c0b6cfdfa5d2d118d420025c3c76ee93')
+
+            with tarfile.open(tp, 'r') as tf:
+                m = tf.getmember('exec')
+                self.assertEqual(m.mode, MODE_STANDARD | stat.S_IXUSR)
+
+        finally:
+            shutil.rmtree(d)
+
+    def test_create_tar_gz_basic(self):
+        d = tempfile.mkdtemp()
+        try:
+            files = self._create_files(d)
+
+            gp = os.path.join(d, 'test.tar.gz')
+            with open(gp, 'wb') as fh:
+                create_tar_gz_from_files(fh, files)
+
+            self.assertEqual(file_hash(gp), 'acb602239c1aeb625da5e69336775609516d60f5')
+
+            with tarfile.open(gp, 'r:gz') as tf:
+                self._verify_basic_tarfile(tf)
+
+        finally:
+            shutil.rmtree(d)
+
+    def test_tar_gz_name(self):  # non-default filename and compresslevel
+        d = tempfile.mkdtemp()
+        try:
+            files = self._create_files(d)
+
+            gp = os.path.join(d, 'test.tar.gz')
+            with open(gp, 'wb') as fh:
+                create_tar_gz_from_files(fh, files, filename='foobar', compresslevel=1)
+
+            self.assertEqual(file_hash(gp), 'fd099f96480cc1100f37baa8e89a6b820dbbcbd3')
+
+            with tarfile.open(gp, 'r:gz') as tf:
+                self._verify_basic_tarfile(tf)
+
+        finally:
+            shutil.rmtree(d)
+
+    def test_create_tar_bz2_basic(self):
+        d = tempfile.mkdtemp()
+        try:
+            files = self._create_files(d)
+
+            bp = os.path.join(d, 'test.tar.bz2')
+            with open(bp, 'wb') as fh:
+                create_tar_bz2_from_files(fh, files)
+
+            self.assertEqual(file_hash(bp), '1827ad00dfe7acf857b7a1c95ce100361e3f6eea')
+
+            with tarfile.open(bp, 'r:bz2') as tf:
+                self._verify_basic_tarfile(tf)
+        finally:
+            shutil.rmtree(d)
+
+
+if __name__ == '__main__':
+    main()