scripts/generate-hg-s3-bundles
author Dan Minor <dminor@mozilla.com>
Fri, 18 Dec 2015 15:12:07 -0500
changeset 362083 2248273cff1c9789e941b14cb0a303dbfca0ff98
parent 362016 4092c234bf8c9957f41fb4ca38014a43407dd3f6
child 362306 24d0f9419046d4c108685ea5d6a252545b8f0efc
permissions -rwxr-xr-x
mozreview: people besides review requesters should be able to autoland (bug 1205018) r=mdoglio

This removes the check for whether or not the review is mutable by the
current user, which allows people other than the review requester to
autoland, provided they have sufficient privileges.

#!/usr/bin/env python2
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

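"""Generate static bundles for Mercurial repositories and upload them to S3.

Also writes bundleclone and clonebundles manifests advertising the uploaded
bundles and uploads an HTML index of available bundles.
"""
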
from __future__ import absolute_import

import argparse
import datetime
import errno
import os
import subprocess

from boto.s3.connection import S3Connection
import concurrent.futures as futures


HG = '/repo/hg/venv_tools/bin/hg'

# The types of bundles to generate.
#
# Defined in the order bundles should be listed in the manifest.
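#
# Each entry maps a bundle type name to the `hg` arguments used to produce
# it; generate_bundle() appends the output path to these arguments.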
CREATES = [
    ('gzip', ['bundle', '-a', '-t', 'gzip-v1']),
    ('bzip2', ['bundle', '-a', '-t', 'bzip2-v1']),
    ('stream-legacy', ['streambundle']),
    ('packed1', ['streambundle', '--type', 's1']),
]

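# Attributes advertised in the bundleclone manifest for each bundle type,
# in the order entries should appear.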
BUNDLECLONE_ORDER = [
    ('gzip', 'compression=gzip'),
    ('bzip2', 'compression=bzip2'),
    ('stream-legacy', 'stream=revlogv1'),
]

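# BUNDLESPEC values advertised in the built-in clone bundles manifest, in
# the order entries should appear. "%3D" is a URL-encoded "=", since raw
# "=" characters can't appear inside manifest attribute values.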
CLONEBUNDLES_ORDER = [
    ('gzip', 'BUNDLESPEC=gzip-v1'),
    ('bzip2', 'BUNDLESPEC=bzip2-v1'),
    ('packed1', 'BUNDLESPEC=none-packed1;requirements%3Drevlogv1'),
]

# Defines the S3 endpoint hostname, bucket, and region where uploads should go.
HOSTS = (
    ('s3-us-west-2.amazonaws.com', 'moz-hg-bundles-us-west-2', 'us-west-2'),
    ('s3-external-1.amazonaws.com', 'moz-hg-bundles-us-east-1', 'us-east-1'),
)

CDN = 'https://hg.cdn.mozilla.net'

# We store bundles on the local filesystem because they may cause chaos
# with volume snapshots. See bug 1167935.
BUNDLE_ROOT = '/repo/bundles'

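# Maximum number of bundle generation or upload operations to perform
# concurrently.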
CONCURRENT_THREADS = 4

HTML_INDEX = '''
<html>
  <head>
    <title>Mercurial Bundles</title>
    <style>
      .numeric {
        text-align: right;
        font-variant-numeric: tabular-nums;
      }
    </style>
  </head>
  <body>
    <h1>Mercurial Bundles</h1>
    <p>
       This server contains Mercurial bundle files that can be used to seed
       repository clones. If your Mercurial client is configured properly,
       it should fetch one of these bundles automatically.
    </p>
    <p>
      The table below lists all available repositories and their bundles.
      Only the most recent bundle is shown. Previous bundles are expired 7 days
      after they are superseded.
    </p>
    <p>
       For more information, see
       <a href="https://mozilla-version-control-tools.readthedocs.org/en/latest/hgmo/bundleclone.html">the official docs</a>.
    </p>
    <table border="1">
      <tr>
        <th>Repository</th>
        <th>gzip</th>
        <th>bzip2</th>
        <th>packed1</th>
        <th>stream-legacy</th>
      </tr>
      %s
    </table>
    <p>This page generated at %s.</p>
  </body>
</html>
'''.strip()

HTML_ENTRY = '''
<tr>
  <td>{repo}</td>
  <td class="numeric"><a href="{gzip_path}">{gzip_size:,}</a></td>
  <td class="numeric">{bzip2_entry}</td>
  <td class="numeric"><a href="{packed1_path}">{packed1_size:,}</a></td>
  <td class="numeric"><a href="{stream_path}">{stream_size:,}</a></td>
</tr>
'''.strip()


def upload_to_s3(host, bucket_name, local_path, remote_path):
    """Upload a file to S3."""

    c = S3Connection(host=host)
    # We assume buckets exist already.
    b = c.get_bucket(bucket_name, validate=False)

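    # validate=False skips a HEAD request here; key.exists() below performs
    # the actual existence check.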
    key = b.get_key(remote_path, validate=False)

    # There is a lifecycle policy on the buckets that expires objects after a
    # few days. If we did nothing here and a repository didn't change for a
    # few days, the bundle objects could expire and become unavailable.
    #
    # Copying the object resets the modification time to current and prevents
    # unwanted early expiration.
    if key.exists():
        print('copying object to reset expiration time')
        key.copy(bucket_name, remote_path, validate_dst_bucket=False)
    else:
        key.set_contents_from_filename(local_path)

def bundle_paths(root, repo, tag, typ):
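    """Compute the local and remote paths for a bundle.

    Returns a 2-tuple of (local path under ``root``, S3 object path).
    """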
    basename = '%s.%s.hg' % (tag, typ)
    final_path = os.path.join(root, basename)
    remote_path = '%s/%s' % (repo, basename)

    return final_path, remote_path

def generate_bundle(repo, temp_path, final_path, extra_args):
    """Generate a single bundle from arguments.

    Runs the command specified by ``extra_args``, writing the bundle to
    ``temp_path`` before moving the finished file to ``final_path``.
    """
    args = [HG, '-R', repo] + extra_args + [temp_path]
    subprocess.check_call(args)
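    # The temporary file lives next to the final path, so the rename is
    # atomic and consumers never observe a partially written bundle.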
    os.rename(temp_path, final_path)


def generate_bundles(repo, upload=True, bzip2=False):
    """Generate bundle files for a repository at a path."""
    assert not os.path.isabs(repo)
    repo_full = os.path.join('/repo/hg/mozilla', repo)

    hg_stat = os.stat(os.path.join(repo_full, '.hg'))
    uid = hg_stat.st_uid
    gid = hg_stat.st_gid

    # Bundle files are named after the tip revision in the repository at
    # the time the bundle was created. This is the easiest way to name
    # bundle files.
    tip = subprocess.check_output([HG, '-R', repo_full, 'log', '-r', 'tip', '-T', '{node}'])
    print('tip is %s' % tip)

    bundle_path = os.path.join(BUNDLE_ROOT, repo)

    # Create directory to hold bundle files.
    try:
        os.makedirs(bundle_path, 0755)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # We keep the last bundle files around so we can reuse them if necessary.
    # Prune irrelevant files.
    for p in os.listdir(bundle_path):
        if p.startswith('.') or p.startswith(tip):
            continue

        full = os.path.join(bundle_path, p)
        print('removing old bundle file: %s' % full)
        os.unlink(full)

    # Bundle generation is pretty straightforward. We simply invoke the
    # `hg` command defined for each bundle type we're producing. For
    # `hg bundle`, we use ``-a`` to bundle all revisions currently in the
    # repository.
    #
    # There is a race condition between discovering the tip revision and
    # bundling: it's possible for extra revisions beyond observed tip to
    # sneak into the bundles. This is acceptable. Bundles are best effort
    # to offload clone load from the server. They don't have to be exactly
    # identical nor as advertised.
    #
    # We write to temporary files then move them into place after generation.
    # This is because an aborted bundle process may result in a partial file,
    # which may confuse our don't-write-if-file-exists logic.

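    # Exiting the executor context waits for all submitted bundle jobs to
    # complete before we move on.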
    with futures.ThreadPoolExecutor(CONCURRENT_THREADS) as e:
        for t, args in CREATES:
            if t == 'bzip2' and not bzip2:
                continue

            final_path, remote_path = bundle_paths(bundle_path, repo, tip, t)
            temp_path = '%s.tmp' % final_path

            if os.path.exists(final_path):
                print('bundle already exists, skipping: %s' % final_path)
                continue

            e.submit(generate_bundle, repo_full, temp_path, final_path, args)

    # Object path is keyed off the repository name so we can easily see what
    # is taking up space on the server.
    #
    # We upload directly to each EC2 region. This is worth explaining.
    #
    # S3 supports replication. However, replication occurs asynchronously
    # with the upload. This means there is a window between when upload
    # completes and when the bundle is available in the other region. We
    # don't want to advertise the bundle until it is distributed, as this
    # would result in a 404 and client failure. We could poll and wait for
    # replication to complete. However, there are similar issues with
    # using COPY...
    #
    # There is a COPY API on S3 that allows you to perform a remote copy
    # between regions. This seems like a perfect API, as it saves the
    # client from having to upload the same data to Amazon multiple times.
    # However, we've seen COPY operations take longer to complete than a
    # raw upload. See bug 1167732. Since bundles are being generated in a
    # datacenter that has plentiful bandwidth to S3 and because we
    # generally like operations to complete faster, we choose to simply
    # upload the bundle to multiple regions instead of employing COPY.
    if upload:
        with futures.ThreadPoolExecutor(CONCURRENT_THREADS) as e:
            for host, bucket, name in HOSTS:
                for t, _args in CREATES:
                    if t == 'bzip2' and not bzip2:
                        continue

                    final_path, remote_path = bundle_paths(bundle_path, repo,
                                                           tip, t)
                    print('uploading to %s/%s/%s' % (host, bucket, remote_path))
                    e.submit(upload_to_s3, host, bucket, final_path, remote_path)

    # Now assemble a manifest listing each bundle.
    paths = {}
    for t, _args in CREATES:
        if t == 'bzip2' and not bzip2:
            continue

        final_path, remote_path = bundle_paths(bundle_path, repo, tip, t)
        paths[t] = (remote_path, os.path.getsize(final_path))

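    # Each manifest line is a bundle URL followed by space-separated
    # key=value attributes that clients use to pick an appropriate entry.
    # The CDN URL is advertised first, before the per-region S3 URLs.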
    bundleclone_manifest = []
    for t, params in BUNDLECLONE_ORDER:
        if t == 'bzip2' and not bzip2:
            continue

        final_path, remote_path = bundle_paths(bundle_path, repo, tip, t)

        bundleclone_manifest.append('%s/%s %s cdn=true requiresni=true' % (
            CDN, remote_path, params))

        for host, bucket, name in HOSTS:
            entry = 'https://%s/%s/%s ec2region=%s %s' % (
                host, bucket, remote_path, name, params)
            bundleclone_manifest.append(entry)

    clonebundles_manifest = []
    for t, params in CLONEBUNDLES_ORDER:
        if t == 'bzip2' and not bzip2:
            continue

        final_path, remote_path = bundle_paths(bundle_path, repo, tip, t)
        clonebundles_manifest.append('%s/%s %s REQUIRESNI=true cdn=true' % (
            CDN, remote_path, params))

        for host, bucket, name in HOSTS:
            entry = 'https://%s/%s/%s %s ec2region=%s' % (
                host, bucket, remote_path, params, name)
            clonebundles_manifest.append(entry)

    bundleclone_path = os.path.join(repo_full, '.hg', 'bundleclone.manifest')
    clonebundles_path = os.path.join(repo_full, '.hg', 'clonebundles.manifest')
    with open(bundleclone_path, 'wb') as fh:
        fh.write('\n'.join(bundleclone_manifest))
    with open(clonebundles_path, 'wb') as fh:
        fh.write('\n'.join(clonebundles_manifest))

    # Ensure manifests are owned by the same user who owns the repo and
    # have sane permissions.
    # TODO we can't do this yet since the "hg" user isn't a member of the
    # scm_* groups.
    #os.chown(bundleclone_path, uid, gid)
    #os.chown(clonebundles_path, uid, gid)
    os.chmod(bundleclone_path, 0664)
    os.chmod(clonebundles_path, 0664)

    # Replicate manifest to mirrors.
    subprocess.check_call(['/repo/hg/scripts/push-repo.sh'], cwd=repo_full)

    return paths


def upload_index(repos):
    """Upload an index HTML page describing available bundles."""
    entries = []

    for repo in sorted(repos):
        p = repos[repo]
        basename = os.path.basename(p['gzip'][0])

        opts = dict(
            repo=repo,
            basename=basename,
            gzip_path=p['gzip'][0],
            gzip_size=p['gzip'][1],
            packed1_path=p['packed1'][0],
            packed1_size=p['packed1'][1],
            stream_path=p['stream-legacy'][0],
            stream_size=p['stream-legacy'][1],
        )

        if 'bzip2' in p:
            opts['bzip2_entry'] = '<a href="{path}">{size:,}</a>'.format(
                path=p['bzip2'][0],
                size=p['bzip2'][1],
            )
        else:
            opts['bzip2_entry'] = '-'

        entries.append(HTML_ENTRY.format(**opts))

    html = HTML_INDEX % ('\n'.join(entries),
                         datetime.datetime.utcnow().isoformat())

    for host, bucket, region in HOSTS:
        c = S3Connection(host=host)
        b = c.get_bucket(bucket, validate=False)
        k = b.get_key('index.html', validate=False)
        k.content_type = 'text/html'
        # Without this, the CDN caches objects for an indeterminate amount of
        # time. We want this page to be fairly current, so establish a less
        # aggressive caching policy.
        k.cache_control = 'max-age=60'
        k.set_contents_from_string(html)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', help='file to read repository list from')
    parser.add_argument('--no-upload', action='store_true',
                        help='do not upload to servers (useful for testing)')

    repos = []

    args, remaining = parser.parse_known_args()
    if args.f:
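        # Each line names a repository path, optionally followed by flags
        # (e.g. "bzip2") that become keyword arguments to generate_bundles().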
        with open(args.f, 'rb') as fh:
            for line in fh:
                fields = line.strip().split()
                repos.append((fields[0], {x: True for x in fields[1:]}))

    for repo in remaining:
        repos.append((repo, {}))

    upload = not args.no_upload

    paths = {}

    for repo, opts in repos:
        paths[repo] = generate_bundles(repo, upload=upload, **opts)

    if upload:
        upload_index(paths)