aws/aws_sanity_checker.py
author Chris AtLee <catlee@mozilla.com>
Tue, 21 Jan 2014 13:45:08 -0500
changeset 261 9ce063fb875d63412d3639312c8bac2f4bbba59f
parent 192 ca6fc71f67c9b1ee0954375415c1c1f1040d9766
child 280 ffbe4068757d26c1ea2da2f30487e57872d9f3c7
permissions -rw-r--r--
Print out long running instances first r=me

#!/usr/bin/env python

import argparse
import json
import logging
import time
import calendar
import collections
import re
from boto.ec2 import connect_to_region

# Module-level logger; its level is set from the --quiet option when this
# file is run as a script.
log = logging.getLogger(__name__)
# Regions scanned when no --region option is given on the command line.
REGIONS = ('us-east-1', 'us-west-2')
# Accepted values of the "moz-type" instance tag; get_bad_type() reports any
# instance whose tag is missing or not listed here.
KNOWN_TYPES = ('puppetmaster', 'buildbot-master', 'dev-linux64', 'bld-linux64',
               'try-linux64', 'tst-linux32', 'tst-linux64', 'tst-win64', 'dev',
               'servo-linux64', 'packager', 'vcssync', 'infra')

# Per moz-type threshold, in hours since launch, above which get_stale()
# reports a non-stopped instance as long running.
# The string "meh" marks types that get_stale() never reports.
# The "default" entry applies to instances that carry no moz-type tag.
EXPECTED_MAX_UPTIME = {
    "puppetmaster": "meh",
    "buildbot-master": "meh",
    "dev": "meh",
    "infra": "meh",
    "vcssync": "meh",
    "dev-linux64": 8,
    "bld-linux64": 24,
    "try-linux64": 12,
    "tst-linux32": 12,
    "tst-linux64": 12,
    "servo-linux64": 8,
    "default": 4
}

# Per moz-type threshold, in hours, above which get_stale() reports a
# stopped instance. Note that get_stale() compares it against the time
# elapsed since the instance's launch_time (see get_uptime()).
# The string "meh" marks types that get_stale() never reports.
# The "default" entry applies to instances that carry no moz-type tag.
EXPECTED_MAX_DOWNTIME = {
    "puppetmaster": 0,
    "buildbot-master": 0,
    "dev": 0,
    "infra": 0,
    "vcssync": 0,
    "dev-linux64": 72,
    "bld-linux64": 72,
    "try-linux64": 72,
    "tst-linux32": 72,
    "tst-linux64": 72,
    "servo-linux64": 72,
    "packager": "meh",
    "default": 24
}


def is_beanstalk_instance(i):
    """Tell whether instance `i` carries an Elastic Beanstalk environment tag.

    Returns True when `i.tags` has an "elasticbeanstalk:environment-name"
    entry whose value is not None, False otherwise.
    """
    environment = i.tags.get("elasticbeanstalk:environment-name")
    return environment is not None


def get_connection(region, secrets):
    """Open a boto EC2 connection to `region`.

    `secrets` is either falsy, in which case no explicit credentials are
    passed to boto, or a mapping providing the 'aws_access_key_id' and
    'aws_secret_access_key' entries to connect with.
    """
    if not secrets:
        return connect_to_region(region)
    return connect_to_region(
        region,
        aws_access_key_id=secrets['aws_access_key_id'],
        aws_secret_access_key=secrets['aws_secret_access_key']
    )


def get_all_instances(conn):
    """Return all instances reachable through `conn`, minus Beanstalk ones.

    `conn.get_all_instances()` is expected to return an iterable of objects
    that each expose an `instances` list; those lists are flattened into a
    single list. An empty or None result yields an empty list.
    """
    reservations = conn.get_all_instances() or []
    # Flatten in one pass. The previous reduce()-based concatenation copied
    # the accumulated list for every reservation and depended on `reduce`
    # being a builtin, which is only true on Python 2.
    # Instances managed by Elastic Beanstalk are skipped.
    return [i for r in reservations for i in r.instances
            if not is_beanstalk_instance(i)]


def parse_launch_time(launch_time):
    """Convert a launch time string to seconds since the epoch.

    Only the first 19 characters ("YYYY-MM-DDTHH:MM:SS") are parsed, so any
    trailing fraction or zone suffix is ignored; the value is interpreted as
    UTC by calendar.timegm().
    """
    timestamp = launch_time[:19]
    parsed = time.strptime(timestamp, '%Y-%m-%dT%H:%M:%S')
    return calendar.timegm(parsed)


def get_bad_type(instances):
    """List the instances whose "moz-type" tag is not in KNOWN_TYPES.

    Returns (instance, message) tuples in input order; a missing tag is
    reported as type "None".
    """
    tagged = ((inst, inst.tags.get('moz-type')) for inst in instances)
    return [(inst, 'Unknown type "%s"' % moz_type)
            for inst, moz_type in tagged
            if moz_type not in KNOWN_TYPES]


def get_bad_state(instances):
    """List the instances whose "moz-state" tag is anything but "ready".

    Returns (instance, message) tuples in input order; a missing tag is
    reported as state "None".
    """
    tagged = ((inst, inst.tags.get('moz-state')) for inst in instances)
    return [(inst, 'Unknown state "%s"' % moz_state)
            for inst, moz_state in tagged
            if moz_state != "ready"]


def get_uptime(instance):
    """Return the hours elapsed since `instance.launch_time`, as a float."""
    now = time.time()
    launched_at = parse_launch_time(instance.launch_time)
    elapsed_seconds = now - launched_at
    return elapsed_seconds / 3600


def get_stale(instances, expected_stale_time, running_only=True):
    """Find instances that have been up (or stopped) longer than expected.

    Args:
        instances: objects exposing `state`, `tags` and `launch_time`.
        expected_stale_time: mapping of moz-type to a threshold in hours, or
            to the string "meh" for types that must never be reported. Its
            "default" entry is used for instances without a moz-type tag and
            for moz-types that have no entry of their own.
        running_only: when true, only instances whose state is not "stopped"
            are examined; when false, only "stopped" instances are.

    Returns:
        A list of (instance, message) tuples for every examined instance
        whose time since launch exceeds its threshold. Stopped instances are
        measured from launch_time as well, since that is what get_uptime()
        reports.
    """
    want_stopped = not running_only
    up_down = "down" if want_stopped else "up"
    stale = []
    for i in instances:
        if (i.state == "stopped") != want_stopped:
            continue

        moz_type = i.tags.get('moz-type', 'default')
        # Fall back to the "default" threshold for types missing from the
        # table. Previously such types produced a None threshold, which
        # compares as "always exceeded" on Python 2 and raises TypeError on
        # Python 3.
        expected_max = expected_stale_time.get(
            moz_type, expected_stale_time.get('default'))
        if expected_max is None or expected_max == "meh":
            continue

        uptime = get_uptime(i)
        if uptime > expected_max:
            stale.append((i, "%s for %i hours" % (up_down, uptime)))
    return stale


def format_instance_list(instances):
    """Print one numbered line per (instance, message) tuple.

    Each line reads "<index> <Name tag> (<id>, <region name>): <message>",
    with the index starting at 0.
    """
    for n, (i, msg) in enumerate(instances):
        line = "{name} ({id}, {region}): {msg}".format(
            name=i.tags.get('Name'), id=i.id, region=i.region.name,
            msg=msg)
        # Single-argument print: same output as the old `print n, line`
        # statement on Python 2, and valid syntax on Python 3 too.
        print("%s %s" % (n, line))


def _print_section(title, entries, **sort_args):
    """Print `title`, the sorted `entries` and a blank line; skip if empty."""
    if not entries:
        return
    print(title)
    format_instance_list(sorted(entries, **sort_args))
    print("")


def instance_sanity_check(instances):
    """Print a report of suspicious instances.

    Four sections are printed, each only when it has entries: long running
    instances (longest first), instances with an unknown moz-type, instances
    with an unknown moz-state (both ordered by region name), and instances
    stopped for a while (longest first).
    """
    bad_type = get_bad_type(instances=instances)
    bad_state = get_bad_state(instances=instances)
    long_running = get_stale(instances=instances,
                             expected_stale_time=EXPECTED_MAX_UPTIME)
    long_stopped = get_stale(instances=instances,
                             expected_stale_time=EXPECTED_MAX_DOWNTIME,
                             running_only=False)

    def by_uptime(entry):
        return get_uptime(entry[0])

    def by_region(entry):
        return entry[0].region.name

    # print(...) with a single argument behaves the same on Python 2 and 3,
    # unlike the print statements this replaces.
    _print_section("==== Long running instances ====", long_running,
                   key=by_uptime, reverse=True)
    _print_section("==== Instances with unknown type ====", bad_type,
                   key=by_region)
    _print_section("==== Instances with unknown state ====", bad_state,
                   key=by_region)
    _print_section("==== Instances stopped for a while ====", long_stopped,
                   key=by_uptime, reverse=True)


def get_not_attached(volumes):
    """List the volumes whose status is not "in-use".

    Returns (volume, "Not attached") tuples in input order.
    """
    return [(vol, "Not attached") for vol in volumes
            if vol.status != "in-use"]


def volume_sanity_check(volumes):
    """Print the summed size of `volumes` and list the unattached ones.

    The unattached volumes (as reported by get_not_attached()) are printed
    ordered by region name, one numbered line each, followed by a blank
    line. That section is omitted when every volume is in use.
    """
    total = sum(v.size for v in volumes)
    not_attached = get_not_attached(volumes)
    # Single-argument print(...) gives the same output on Python 2 as the
    # print statements it replaces and is also valid on Python 3.
    print("Volume usage: %sG" % total)
    if not not_attached:
        return
    print("==== Not attached volumes ====")
    by_region = sorted(not_attached, key=lambda x: x[0].region.name)
    for n, (v, msg) in enumerate(by_region):
        print("%s %s %s: %s" % (n, v.id, v.region.name, msg))
    print("")


def instance_stats(instances):
    """Print instance counts per state and a running/stopped count per type.

    The "type" of an instance is its Name tag with any trailing digits (and
    one optional preceding dash) removed; instances without a Name tag are
    grouped under "unknown". Any state other than "stopped" counts as
    running. Types are listed by number of running instances, highest first.
    """
    states = collections.defaultdict(int)
    # type name -> one boolean per instance, True meaning "not stopped"
    types = collections.defaultdict(list)
    type_regexp = re.compile(r"(.*?)-?\d+$")
    for i in instances:
        states[i.state] += 1
        name = i.tags.get("Name")
        if not name:
            type_name = "unknown"
        else:
            # Try to remove trailing digits or use the whole name
            m = type_regexp.match(name)
            type_name = m.group(1) if m else name
        types[type_name].append(i.state != "stopped")

    # items() and single-argument print(...) work on both Python 2 and 3,
    # unlike the iteritems() calls and print statements they replace.
    print("==== %s instances in total ====" % len(instances))
    for state, n in states.items():
        print("%s: %s" % (state, n))
    print("")
    print("==== Type breakdown ====")
    # Sort by amount of running instances
    for t, flags in sorted(types.items(), key=lambda x: x[1].count(True),
                           reverse=True):
        print("%s: running: %s, stopped: %s" % (t, flags.count(True),
                                                flags.count(False)))
    print("")

if __name__ == '__main__':
    # Script entry point: collect instances and volumes from every selected
    # region, then print the statistics and sanity reports.

    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--secrets", type=argparse.FileType('r'),
                        help="optional file where secrets can be found")
    parser.add_argument("-r", "--region", dest="regions", action="append",
                        help="optional list of regions")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Suppress logging messages")

    args = parser.parse_args()
    secrets = None
    if args.secrets:
        # argparse.FileType hands over an already opened file; close it as
        # soon as the JSON has been read instead of leaking the handle.
        with args.secrets as secrets_file:
            secrets = json.load(secrets_file)

    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    log.setLevel(logging.ERROR if args.quiet else logging.DEBUG)

    # Fall back to the built-in region list when no -r option was given.
    regions = args.regions or REGIONS
    all_instances = []
    all_volumes = []
    for region in regions:
        conn = get_connection(region, secrets)
        all_instances.extend(get_all_instances(conn))
        all_volumes.extend(conn.get_all_volumes())
    instance_stats(all_instances)
    instance_sanity_check(all_instances)
    volume_sanity_check(all_volumes)