Bug 1210538 - Add antivirus checks to release promotion graph a=rail
author: Kim Moir <kmoir@mozilla.com>
date: Mon, 22 Feb 2016 15:51:22 -0500
changeset: 304338 56e25919ea0ccf7127ecf44de904ad9b08f75b08
parent: 304337 cc1e2772fff8bedc6833817076ab3e23d892df3a
child: 304339 f4fcca4a042f0b20d51e05c794348b25c25be991
push id: 9175
push user: raliiev@mozilla.com
push date: Thu, 03 Mar 2016 03:39:52 +0000
treeherder: mozilla-aurora@0bee186afe5a
reviewers: rail
bugs: 1210538
milestone: 46.0a2
Bug 1210538 - Add antivirus checks to release promotion graph a=rail MozReview-Commit-ID: 7ERzTbXJsiB
release/docker/beet-mover/Dockerfile
testing/mozharness/external_tools/extract_and_run_command.py
testing/mozharness/scripts/release/beet_mover.py
new file mode 100644
--- /dev/null
+++ b/release/docker/beet-mover/Dockerfile
@@ -0,0 +1,19 @@
+FROM ubuntu:vivid
+
+RUN apt-get -q update \
+    && apt-get install --yes -q \
+    mercurial \
+    python-dev \
+    python-pip \
+    python-virtualenv \
+    libffi-dev \
+    libssl-dev \
+    libyaml-dev \
+    libmysqlclient-dev \
+    clamav \
+    clamav-freshclam \
+    curl \
+    wget \
+    && apt-get clean
+
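+# Fetch the latest ClamAV virus definitions at image build time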
+RUN freshclam --verbose
new file mode 100644
--- /dev/null
+++ b/testing/mozharness/external_tools/extract_and_run_command.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+"""\
+Usage: extract_and_run_command.py [-j N] [command to run] -- [files and/or directories]
+    -j is the number of workers to start, defaulting to 1.
+    [command to run] must be a command that can accept one or many files
+    to process as arguments.
+
+WARNING: This script does NOT respond to SIGINT. You must use SIGQUIT or SIGKILL to
+         terminate it early.
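+
+Example: scan a tree of files with ClamAV using 4 workers (this mirrors how
+beet_mover.py invokes the script; the path is illustrative):
+    extract_and_run_command.py -j4 clamscan --no-summary -- /path/to/files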
+ """
+
+### The canonical location for this file is
+###   https://hg.mozilla.org/build/tools/file/default/stage/extract_and_run_command.py
+###
+### Please update the copy in puppet to deploy new changes to
+### stage.mozilla.org, see
+### https://wiki.mozilla.org/ReleaseEngineering/How_To/Modify_scripts_on_stage
+
+import logging
+import os
+from os import path
+import sys
+from Queue import Queue
+import shutil
+import subprocess
+import tempfile
+from threading import Thread
+import time
+
+logging.basicConfig(
+    stream=sys.stdout, level=logging.INFO, format="%(message)s")
+log = logging.getLogger(__name__)
+
+try:
+    # the future - https://github.com/mozilla/build-mar via a venv
+    from mardor.marfile import BZ2MarFile
+except ImportError:
+    # the past - http://hg.mozilla.org/build/tools/file/default/buildfarm/utils/mar.py
+    sys.path.append(
+        path.join(path.dirname(path.realpath(__file__)), "../buildfarm/utils"))
+    from mar import BZ2MarFile
+
+SEVENZIP = "7za"
+
+
+def extractMar(filename, tempdir):
+    m = BZ2MarFile(filename)
+    m.extractall(path=tempdir)
+
+
+def extractExe(filename, tempdir):
+    try:
+        # We don't actually care about the output, but we redirect to a tempfile
+        # to avoid deadlocking in wait() when stdout=PIPE
+        fd = tempfile.TemporaryFile()
+        proc = subprocess.Popen([SEVENZIP, 'x', '-o%s' % tempdir, filename],
+                                stdout=fd, stderr=subprocess.STDOUT)
+        proc.wait()
+    except subprocess.CalledProcessError:
+        # Not all EXEs are 7-zip files, so we have to ignore extraction errors
+        pass
+
+# The keys here are matched against the filename's extension (as returned by
+# os.path.splitext, dot included).
+# The values are callables that accept two string arguments.
+EXTRACTORS = {
+    '.mar': extractMar,
+    '.exe': extractExe,
+}
+
+
+def find_files(d):
+    """yields all of the files in `d'"""
+    for root, dirs, files in os.walk(d):
+        for f in files:
+            yield path.abspath(path.join(root, f))
+
+
+def rchmod(d, mode=0755):
+    """chmods everything in `d' to `mode', including `d' itself"""
+    os.chmod(d, mode)
+    for root, dirs, files in os.walk(d):
+        for item in dirs:
+            os.chmod(path.join(root, item), mode)
+        for item in files:
+            os.chmod(path.join(root, item), mode)
+
+
+def maybe_extract(filename):
+    """If an extractor is found for `filename', extracts it to a temporary
+       directory and chmods it. The consumer is responsible for removing
+       the extracted files, if desired."""
+    ext = path.splitext(filename)[1]
+    if ext not in EXTRACTORS:
+        return None
+    # Append the full filepath to the tempdir
+    tempdir_root = tempfile.mkdtemp()
+    tempdir = path.join(tempdir_root, filename.lstrip('/'))
+    os.makedirs(tempdir)
+    EXTRACTORS[ext](filename, tempdir)
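+    # 0755 the whole extracted tree so the command being run can read the files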
+    rchmod(tempdir_root)
+    return tempdir_root
+
+
+def process(item, command):
+    def format_time(t):
+        return time.strftime("%H:%M:%S", time.localtime(t))
+    # Buffer output to avoid interleaving of multiple workers' logs
+    logs = []
+    args = [item]
+    proc = None
+    start = time.time()
+    logs.append("START %s: %s" % (format_time(start), item))
+    # If the file was extracted, we need to process all of its files, too.
+    tempdir = maybe_extract(item)
+    if tempdir:
+        for f in find_files(tempdir):
+            args.append(f)
+
+    # Create the tempfile outside the try block so the finally clause below
+    # can always reference fd.
+    fd = tempfile.TemporaryFile()
+    try:
+        proc = subprocess.Popen(command + args, stdout=fd)
+        proc.wait()
+        if proc.returncode != 0:
+            raise Exception("returned %s" % proc.returncode)
+    finally:
+        if tempdir:
+            shutil.rmtree(tempdir)
+        fd.seek(0)
+        # rstrip() here to avoid an unnecessary newline, if it exists.
+        logs.append(fd.read().rstrip())
+        end = time.time()
+        elapsed = end - start
+        logs.append("END %s (%d seconds elapsed): %s\n" % (
+            format_time(end), elapsed, item))
+        # Now that we've got all of our output, print it. It's important that
+        # the logging module is used for this, because "print" is not
+        # thread-safe.
+        log.info("\n".join(logs))
+
+
+def worker(command, errors):
+    item = q.get()
+    while item is not None:
+        try:
+            process(item, command)
+        except:
+            errors.put(item)
+        item = q.get()
+
+if __name__ == '__main__':
+    # getopt is used in favour of optparse to enable "--" as a separator
+    # between the command and list of files. optparse doesn't allow that.
+    from getopt import getopt
+    options, args = getopt(sys.argv[1:], 'j:h', ['help'])
+
+    concurrency = 1
+    for o, a in options:
+        if o == '-j':
+            concurrency = int(a)
+        elif o in ('-h', '--help'):
+            log.info(__doc__)
+            sys.exit(0)
+
+    if len(args) < 3 or '--' not in args:
+        log.error(__doc__)
+        sys.exit(1)
+
+    command = []
+    while args[0] != "--":
+        command.append(args.pop(0))
+    args.pop(0)
+
+    q = Queue()
+    errors = Queue()
+    threads = []
+    for i in range(concurrency):
+        t = Thread(target=worker, args=(command, errors))
+        t.start()
+        threads.append(t)
+
+    # find_files is a generator, so work will begin prior to it finding
+    # all of the files
+    for arg in args:
+        if path.isfile(arg):
+            q.put(arg)
+        else:
+            for f in find_files(arg):
+                q.put(f)
+    # Because the workers are started before we start populating the queue,
+    # they can't use .empty() to determine whether or not they're done.
+    # We also can't use q.join() or q.task_done(), because we need to
+    # support Python 2.4. We know that find_files won't yield None,
+    # so we can detect doneness by having workers exit when they get None
+    # as an item.
+    for i in range(concurrency):
+        q.put(None)
+
+    for t in threads:
+        t.join()
+
+    if not errors.empty():
+        log.error("Command failed for the following files:")
+        while not errors.empty():
+            log.error("  %s" % errors.get())
+        sys.exit(1)
--- a/testing/mozharness/scripts/release/beet_mover.py
+++ b/testing/mozharness/scripts/release/beet_mover.py
@@ -1,27 +1,31 @@
 #!/usr/bin/env python
 # ***** BEGIN LICENSE BLOCK *****
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 # ***** END LICENSE BLOCK *****
 """beet_mover.py.
 
-downloads artifacts and uploads them to s3
+downloads artifacts, scans them, and uploads them to s3
 """
 import hashlib
 import sys
 import os
 import pprint
+import re
+from os import listdir
+from os.path import isfile, join
 
 sys.path.insert(1, os.path.dirname(os.path.dirname(sys.path[0])))
 from mozharness.base.log import FATAL
 from mozharness.base.python import VirtualenvMixin
 from mozharness.base.script import BaseScript
+import mozharness
 
 
 def get_hash(content, hash_type="md5"):
     h = hashlib.new(hash_type)
     h.update(content)
     return h.hexdigest()
 
 
@@ -80,59 +84,99 @@ CONFIG_OPTIONS = [
         "dest": "taskid",
         "help": "taskcluster task id to download artifacts from",
     }],
     [["--production"], {
         "dest": "production",
         "default": False,
         "help": "taskcluster task id to download artifacts from",
     }],
+    [["--exclude"], {
+        "dest": "excludes",
+        "action": "append",
+        "help": "List of filename patterns to exclude. See script source for default",
+    }],
+    [["-s", "--scan-parallelization"], {
+        "dest": "scan_parallelization",
+        "default": 4,
+        "type": "int",
+        "help": "Number of concurrent file scans",
+    }],
 ]
 
+DEFAULT_EXCLUDES = [
+    r"^.*tests.*$",
+    r"^.*crashreporter.*$",
+    r"^.*\.zip(\.asc)?$",
+    r"^.*\.log$",
+    r"^.*\.txt$",
+    r"^.*\.asc$",
+    r"^.*/partner-repacks.*$",
+    r"^.*\.checksums(\.asc)?$",
+    r"^.*/logs/.*$",
+    r"^.*/jsshell.*$",
+    r"^.*\.json$",
+    r"^.*/host.*$",
+    r"^.*/mar-tools/.*$",
+    r"^.*gecko-unsigned-unaligned\.apk$",
+    r"^.*robocop\.apk$",
+    r"^.*contrib.*$",
+]
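+# The patterns above anchor themselves with ^ and $ and are applied with
+# re.search() against each downloaded filename (see _matches_exclude below).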
+CACHE_DIR = 'cache'
+
 
 class BeetMover(BaseScript, VirtualenvMixin, object):
     def __init__(self, aws_creds):
         beetmover_kwargs = {
             'config_options': CONFIG_OPTIONS,
             'all_actions': [
                 # 'clobber',
                 'create-virtualenv',
                 'activate-virtualenv',
                 'generate-candidates-manifest',
                 'verify-bits',  # beets
+                'download-bits',  # beets
+                'scan-bits',      # beets
                 'upload-bits',  # beets
             ],
             'require_config_file': False,
             # Default configuration
             'config': {
                 # base index url where to find taskcluster artifact based on taskid
                 # TODO - find out if we need to support taskcluster run number other than 0.
                 # e.g. maybe we could end up with artifacts in > 'run 0' in a re-trigger situation?
                 "artifact_base_url": 'https://queue.taskcluster.net/v1/task/{taskid}/runs/0/artifacts/public/{subdir}',
                 "virtualenv_modules": [
                     "boto",
                     "PyYAML",
                     "Jinja2",
+                    "redo",
+                    "mar",
                 ],
                 "virtualenv_path": "venv",
                 'buckets': {
                     'development': "mozilla-releng-beet-mover-dev",
                     'production': "mozilla-releng-beet-mover",
                 },
                 'product': 'firefox',
             },
         }
+        # TODO: do excludes need to be configured via command line for specific builds?
         super(BeetMover, self).__init__(**beetmover_kwargs)
 
         c = self.config
         self.manifest = {}
         # assigned in _post_create_virtualenv
         self.virtualenv_imports = None
         self.bucket = c['buckets']['production'] if c['production'] else c['buckets']['development']
         self.aws_key_id, self.aws_secret_key = aws_creds
+        # if excludes is set from the command line, use it; otherwise use defaults
+        self.excludes = self.config.get('excludes', DEFAULT_EXCLUDES)
+        dirs = self.query_abs_dirs()
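+        # non-excluded artifacts are copied into this dir for scanning (see scan_bits)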
+        self.dest_dir = os.path.join(dirs['abs_work_dir'], CACHE_DIR)
 
     def activate_virtualenv(self):
         """
         activates virtualenv and adds module imports to an instance-wide namespace.
 
         creating and activating a virtualenv onto the currently executing python interpreter is a
         bit of black magic. Rather than having import statements added in various places within the
         script, we import them here immediately after we activate the newly created virtualenv
@@ -167,84 +211,133 @@ class BeetMover(BaseScript, VirtualenvMi
             "locales": self.config['locales'],
             "version": self.config['version'],
             "app_version": self.config.get('app_version', ''),
             "partial_version": self.config.get('partial_version', ''),
             "build_num": self.config['build_num'],
             # mirror current release folder structure
             "s3_prefix": 'pub/{}/candidates'.format(self.config['product']),
             "artifact_base_url": self.config['artifact_base_url'].format(
-                    taskid=self.config['taskid'], subdir=self.config['artifact_sudbir']
+                    taskid=self.config['taskid'], subdir=self.config['artifact_subdir']
             )
         }
         self.manifest = yaml.safe_load(template.render(**template_vars))
 
         self.log("manifest generated:")
         self.log(pprint.pformat(self.manifest['mapping']))
 
     def verify_bits(self):
         """
         inspects each artifact and verifies that they were created by trustworthy tasks
         """
         # TODO
         self.log('skipping verification. unimplemented...')
 
+    def download_bits(self):
+        """
+        downloads the list of artifacts described by the manifest into the working dir
+        """
+        dirs = self.query_abs_dirs()
+        self.log('downloading artifacts to {}...'.format(dirs['abs_work_dir']))
+
+        for locale in self.manifest['mapping']:
+            for deliverable in self.manifest['mapping'][locale]:
+                self.log("downloading '{}' deliverable for '{}' locale".format(deliverable, locale))
+                # download locally to working dir
+                source = self.manifest['mapping'][locale][deliverable]['artifact']
+                self.retry(self.download_file,
+                           args=[source],
+                           kwargs={'parent_dir': dirs['abs_work_dir']},
+                           error_level=FATAL)
+        self.log('Success!')
+
     def upload_bits(self):
         """
-        downloads and uploads list of artifacts to s3 candidates dir based on a given manifest
+        uploads list of artifacts to s3 candidates dir based on a given manifest
         """
-        self.log('downloading and uploading artifacts to s3...')
+        self.log('uploading artifacts to s3...')
+        dirs = self.query_abs_dirs()
 
         # connect to s3
         boto = self.virtualenv_imports['boto']
         conn = boto.connect_s3(self.aws_key_id, self.aws_secret_key)
         bucket = conn.get_bucket(self.bucket)
 
+        # TODO: exclude deliverables that failed the virus scan rather than
+        # uploading every entry in the manifest - not sure how to determine this yet
         for locale in self.manifest['mapping']:
             for deliverable in self.manifest['mapping'][locale]:
                 self.log("uploading '{}' deliverable for '{}' locale".format(deliverable, locale))
+                # we have already downloaded the files locally, so we can use that version
+                source = self.manifest['mapping'][locale][deliverable]['artifact']
+                downloaded_file = os.path.join(dirs['abs_work_dir'], self.get_filename_from_url(source))
                 self.upload_bit(
-                    source=self.manifest['mapping'][locale][deliverable]['artifact'],
+                    source=downloaded_file,
                     s3_key=self.manifest['mapping'][locale][deliverable]['s3_key'],
                     bucket=bucket,
                 )
         self.log('Success!')
 
     def upload_bit(self, source, s3_key, bucket):
         # TODO - do we want to mirror/upload to more than one region?
         dirs = self.query_abs_dirs()
         boto = self.virtualenv_imports['boto']
 
-        # download locally
-        file_name = self.retry(self.download_file,
-                               args=[source],
-                               kwargs={'parent_dir': dirs['abs_work_dir']},
-                               error_level=FATAL)
+        # TODO: need to copy from dir to s3
 
         self.info('uploading to s3 with key: {}'.format(s3_key))
         key = boto.s3.key.Key(bucket)  # create new key
         key.key = s3_key  # set key name
 
         self.info("Checking if `{}` already exists".format(s3_key))
         key = bucket.get_key(s3_key)
         if not key:
             self.info("Uploading to `{}`".format(s3_key))
             key = bucket.new_key(s3_key)
 
             # set key value
-            self.retry(key.set_contents_from_filename, args=[file_name], error_level=FATAL),
+            self.retry(key.set_contents_from_filename, args=[source], error_level=FATAL)
 
             # key.make_public() may lead to race conditions, because
             # it doesn't pass version_id, so it may not set permissions
             bucket.set_canned_acl(acl_str='public-read', key_name=s3_key,
                                   version_id=key.version_id)
         else:
-            if not get_hash(key.get_contents_as_string()) == get_hash(open(file_name).read()):
+            if not get_hash(key.get_contents_as_string()) == get_hash(open(source).read()):
                 # for now, let's halt. If necessary, we can revisit this and allow for overwrites
                 #  to the same buildnum release with different bits
                 self.fatal("`{}` already exists with different checksum.".format(s3_key))
             self.log("`{}` has the same MD5 checksum, not uploading".format(s3_key))
 
+    def scan_bits(self):
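+        """Copies any artifact that isn't excluded into a cache dir, runs
+        the virus scan over it, then removes the dir."""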
 
+        dirs = self.query_abs_dirs()
+
+        filenames = [f for f in listdir(dirs['abs_work_dir']) if isfile(join(dirs['abs_work_dir'], f))]
+        self.mkdir_p(self.dest_dir)
+        for file_name in filenames:
+            if self._matches_exclude(file_name):
+                self.info("Excluding {} from virus scan".format(file_name))
+            else:
+                self.info('Copying {} to {}'.format(file_name, self.dest_dir))
+                self.copyfile(os.path.join(dirs['abs_work_dir'], file_name),
+                              os.path.join(self.dest_dir, file_name))
+        self._scan_files()
+        self.info('Removing {}'.format(self.dest_dir))
+        self.rmtree(self.dest_dir)
+
+    def _scan_files(self):
+        """Scan the files we've collected. The scans run concurrently (see
+        --scan-parallelization) and each file's output is buffered, which
+        keeps the combined log coherent. Uses the venv python."""
+        external_tools_path = os.path.join(
+            os.path.abspath(os.path.dirname(os.path.dirname(mozharness.__file__))),
+            'external_tools')
+        self.run_command([self.query_python_path(),
+                          os.path.join(external_tools_path, 'extract_and_run_command.py'),
+                          '-j{}'.format(self.config['scan_parallelization']),
+                          'clamscan', '--no-summary', '--', self.dest_dir])
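+        # Illustrative resulting invocation (exact paths assumed):
+        #   <venv python> .../external_tools/extract_and_run_command.py \
+        #       -j4 clamscan --no-summary -- <abs_work_dir>/cache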
+
+    def _matches_exclude(self, keyname):
+        return any(re.search(exclude, keyname) for exclude in self.excludes)
 
 if __name__ == '__main__':
     beet_mover = BeetMover(get_aws_auth())
     beet_mover.run_and_exit()