Bug 1103272 - aws_stop_idle.py should use ssh keys for authentication
authorRail Aliiev <rail@mozilla.com>
Tue, 25 Nov 2014 12:14:21 -0500
changeset 567 c94086d038ef1bf3afd162b6ac9dea19a5e48f9a
parent 566 250d3a865c33cc0ac20f11fdb55801ba79a518d7
child 568 a52ccd1188015ff2f32435769ff6c6f86b5c0eaf
push id 558
push user raliiev@mozilla.com
push date Tue, 25 Nov 2014 17:15:13 +0000
bugs1103272
Bug 1103272 - aws_stop_idle.py should use ssh keys for authentication
.coveragerc
.gitignore
.hgignore
.travis.yml
cloudtools/aws/__init__.py
cloudtools/aws/sanity.py
cloudtools/aws/spot.py
cloudtools/buildbot.py
cloudtools/fabric/__init__.py
cloudtools/graphite.py
cloudtools/jacuzzi.py
cloudtools/ssh.py
requirements-tests.txt
requirements.txt
scripts/aws_create_ami.py
scripts/aws_create_win_ami.py
scripts/aws_manage_routingtables.py
scripts/aws_manage_securitygroups.py
scripts/aws_sanity_checker.py
scripts/aws_stop_idle.py
scripts/check_dns.py
scripts/ec22ip.py
scripts/yaml_includes.py
tests/test_cloudtools_fabric.py
tests/test_cloudtools_graphite.py
tests/test_cloudtools_jacuzzi.py
tests/test_cloudtools_ssh.py
tests/test_yaml_includes.py
tox.ini
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,7 @@
 [run]
 branch = True
 include =
     cloudtools/*
     scripts/*
+omit=
+    .tox
new file mode 120000
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,1 @@
+.hgignore
\ No newline at end of file
--- a/.hgignore
+++ b/.hgignore
@@ -1,3 +1,5 @@
 \.pyc$
 \..*\.swp
 .*\.log
+\.tox/
+\.ropeproject/
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,8 +1,8 @@
 language: python
 python:
   - "2.7"
 
 install:
-    - travis_retry pip install -r requirements-tests.txt
-script: nosetests --with-coverage --cover-erase -v
-after_success: coveralls
+    - travis_retry pip install tox==1.8
+script:
+    - tox -e py27,py27-coverage
--- a/cloudtools/aws/__init__.py
+++ b/cloudtools/aws/__init__.py
@@ -246,8 +246,32 @@ def load_instance_config(moz_instance_ty
                                        moz_instance_type)))
 
 
 def jacuzzi_suffix(slaveset):
     if slaveset:
         return "jacuzzied"
     else:
         return "not_jacuzzied"
+
+
+def get_buildslave_instances(region, moz_types):
+    # Look for running `moz_types` instances with moz-state=ready
+    conn = get_aws_connection(region)
+    instances = conn.get_only_instances(filters={
+        'tag:moz-state': 'ready',
+        'instance-state-name': 'running',
+    })
+
+    retval = []
+    for i in instances:
+        if i.tags.get("moz-type") in moz_types and \
+                not i.tags.get("moz-loaned-to"):
+            retval.append(i)
+
+    return retval
+
+
+def get_impaired_instance_ids(region):
+    conn = get_aws_connection(region)
+    impaired = conn.get_all_instance_status(
+        filters={'instance-status.status': 'impaired'})
+    return [i.id for i in impaired]
--- a/cloudtools/aws/sanity.py
+++ b/cloudtools/aws/sanity.py
@@ -10,17 +10,18 @@ import calendar
 from datetime import timedelta
 from cloudtools.aws import parse_aws_time
 
 log = logging.getLogger(__name__)
 
 BUILDAPI_URL_JSON = "http://buildapi.pvt.build.mozilla.org/buildapi/recent/{slave_name}?format=json"
 BUILDAPI_URL = "http://buildapi.pvt.build.mozilla.org/buildapi/recent/{slave_name}"
 
-SLAVE_TAGS = ('try-linux64', 'tst-linux32', 'tst-linux64', 'tst-emulator64', 'bld-linux64')
+SLAVE_TAGS = ('try-linux64', 'tst-linux32', 'tst-linux64', 'tst-emulator64',
+              'bld-linux64')
 
 KNOWN_TYPES = ('puppetmaster', 'buildbot-master', 'dev-linux64', 'infra',
                'bld-linux64', 'try-linux64', 'tst-linux32', 'tst-linux64',
                'tst-emulator64', 'tst-win64', 'dev', 'packager',
                'vcssync', "signing")
 
 EXPECTED_MAX_UPTIME = {
     "puppetmaster": "meh",
@@ -67,17 +68,18 @@ def timedelta_to_time_string(timeout):
     if days != 0:
         time_string = "{days}d {time_string}".format(days=days,
                                                      time_string=time_string)
     return time_string
 
 
 def launch_time_to_epoch(launch_time):
     """converts a lunch_time into a timestamp"""
-    return calendar.timegm(time.strptime(launch_time[:19], '%Y-%m-%dT%H:%M:%S'))
+    return calendar.timegm(
+        time.strptime(launch_time[:19], '%Y-%m-%dT%H:%M:%S'))
 
 
 class AWSInstance(object):
     """AWS AWSInstance"""
     def __init__(self, instance, events_dir=None):
         self.instance = instance
         self.now = time.time()
         self.timeout = None
@@ -155,27 +157,27 @@ class AWSInstance(object):
         return region.name
 
     def is_long_running(self):
         """returns True is this instance is running for a long time"""
         if not self.is_running():
             return False
         if self.is_loaned():
             return False
-        my_uptime =  self._get_uptime_timestamp()
+        my_uptime = self._get_uptime_timestamp()
         return my_uptime > self.max_uptime
 
     def is_long_stopped(self):
         """returns True is this instance is running for a long time"""
         if self.is_running():
             return False
         if self.is_loaned():
             return False
         # get the uptime and assume it has been always down...
-        my_downtime =  self._get_uptime_timestamp()
+        my_downtime = self._get_uptime_timestamp()
         if self.events_dir:
             # ... unless we have the local logs
             my_downtime = self.get_stop_time_from_logs()
         return my_downtime > self.max_downtime
 
     def is_lazy(self):
         """returns True if this instance is on line for a while and it's not
            getting any jobs. It makes sense only if this machine is a slave.
@@ -271,17 +273,16 @@ class AWSInstance(object):
             self.get_name(), self.get_id(), self.get_region(),
             self._get_moz_type())
 
     def longrunning_message(self):
         """returns the running_message and appends (no info from buildapi)"""
         message = self.running_message()
         return " ".join([message, "(no info from buildapi)"])
 
-
     def _event_log_file(self, event):
         """returns the json file from the event directory"""
         if not self.events_dir:
             return
         instance_json = os.path.join(self.events_dir, event, self.get_id())
         if os.path.exists(instance_json):
             return instance_json
         return
--- a/cloudtools/aws/spot.py
+++ b/cloudtools/aws/spot.py
@@ -25,17 +25,17 @@ log = logging.getLogger(__name__)
 def populate_spot_requests_cache(region, request_ids=None):
     log.debug("Caching spot requests in %s", region)
     kwargs = {}
     if request_ids:
         kwargs["request_ids"] = request_ids
     conn = get_aws_connection(region)
     try:
         reqs = conn.get_all_spot_instance_requests(**kwargs)
-    except boto.exception.EC2ResponseError, e:
+    except boto.exception.EC2ResponseError:
         log.debug("Some of the requests not found, requesting all")
         reqs = conn.get_all_spot_instance_requests()
     for req in reqs:
         _spot_requests[region, req.id] = req
 
 
 def get_spot_request(region, request_id):
     if (region, request_id) in _spot_requests:
--- a/cloudtools/buildbot.py
+++ b/cloudtools/buildbot.py
@@ -1,18 +1,20 @@
 import time
 import sqlalchemy as sa
 import re
 import logging
+import requests
 from sqlalchemy.engine.reflection import Inspector
 from collections import defaultdict
 
 from .jacuzzi import get_allocated_slaves
 
 log = logging.getLogger(__name__)
+ACTIVITY_BOOTING, ACTIVITY_STOPPED = ("booting", "stopped")
 
 
 def find_pending(dburl):
     db = sa.create_engine(dburl)
     inspector = Inspector(db)
     # Newer buildbot has a "buildrequest_claims" table
     if "buildrequest_claims" in inspector.get_table_names():
         query = sa.text("""
@@ -52,8 +54,117 @@ def map_builders(pending, builder_map):
                 log.debug("%s instance type %s slaveset %s",
                           pending_buildername, moz_instance_type, slaveset)
                 type_map[moz_instance_type, slaveset] += 1
                 break
         else:
             log.debug("%s has pending jobs, but no instance types defined",
                       pending_buildername)
     return type_map
+
+
+def get_tacfile(ssh_client):
+    return ssh_client.get_stdout("cat /builds/slave/buildbot.tac")
+
+
+def get_buildbot_master(ssh_client, masters_json):
+    tacfile = get_tacfile(ssh_client)
+    host = re.search("^buildmaster_host = '(.*?)'$", tacfile, re.M)
+    host = host.group(1)
+    port = None
+    for master in masters_json:
+        if master["hostname"] == host:
+            port = master["http_port"]
+            break
+    assert host and port
+    return host, port
+
+
+def graceful_shutdown(ssh_client, masters_json):
+    # Find out which master we're attached to by looking at buildbot.tac
+    log.debug("%s - looking up which master we're attached to",
+              ssh_client.name)
+    host, port = get_buildbot_master(ssh_client, masters_json)
+
+    url = "http://{host}:{port}/buildslaves/{name}/shutdown".format(
+        host=host, port=port, name=ssh_client.name)
+    log.debug("%s - POSTing to %s", ssh_client.name, url)
+    requests.post(url, allow_redirects=False)
+
+
+def get_last_activity(ssh_client):
+    slave_time = ssh_client.get_stdout("date +%Y%m%d%H%M%S").strip()
+    slave_time = time.mktime(time.strptime(slave_time, "%Y%m%d%H%M%S"))
+    uptime = float(ssh_client.get_stdout("cat /proc/uptime").split()[0])
+
+    if uptime < 3 * 60:
+        # Assume we're still booting
+        log.debug("%s - uptime is %.2f; assuming we're still booting up",
+                  ssh_client.name, uptime)
+        return ACTIVITY_BOOTING
+
+    stdout = ssh_client.get_stdout(
+        "tail -n 100 /builds/slave/twistd.log.1 /builds/slave/twistd.log")
+
+    last_activity = None
+    running_command = False
+    t = time.time()
+    line = ""
+    for line in stdout.splitlines():
+        m = re.search(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})", line)
+        if m:
+            t = time.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
+            t = time.mktime(t)
+        else:
+            # Not sure what to do with this line...
+            continue
+
+        # uncomment to dump out ALL the lines
+        # log.debug("%s - %s", name, line.strip())
+
+        if "RunProcess._startCommand" in line or "using PTY: " in line:
+            log.debug("%s - started command - %s", ssh_client.name,
+                      line.strip())
+            running_command = True
+        elif "commandComplete" in line or "stopCommand" in line:
+            log.debug("%s - done command - %s", ssh_client.name, line.strip())
+            running_command = False
+
+        if "Shut Down" in line:
+            # Check if this happened before we booted, i.e. we're still booting
+            # up
+            if (slave_time - t) > uptime:
+                log.debug(
+                    "%s - shutdown line is older than uptime; assuming we're "
+                    "still booting %s", ssh_client.name, line.strip())
+                last_activity = ACTIVITY_BOOTING
+            else:
+                last_activity = ACTIVITY_STOPPED
+        elif "I have a leftover directory" in line:
+            # Ignore this, it doesn't indicate anything
+            continue
+        elif running_command:
+            # We're in the middle of running something, so say that our last
+            # activity is now (0 seconds ago)
+            last_activity = 0
+        else:
+            last_activity = slave_time - t
+
+    # If the last lines from the log are over 10 minutes ago, and are from
+    # before our reboot, then try rebooting
+    if (slave_time - t) > 10 * 60 and (slave_time - t) > uptime:
+        log.debug(
+            "%s - shut down happened %ss ago, but we've been up for %ss - %s",
+            ssh_client.name, slave_time - t, uptime, line.strip())
+        # If longer than 30 minutes, try rebooting
+        if (slave_time - t) > 30 * 60:
+            log.debug("%s - rebooting", ssh_client.name)
+            ssh_client.reboot()
+
+    # If there's *no* activity (e.g. no twistd.log files), and we've been up a
+    # while, then reboot
+    if last_activity is None and uptime > 15 * 60:
+        log.debug("%s - no activity; rebooting", ssh_client.name)
+        # If longer than 30 minutes, try rebooting
+        ssh_client.reboot()
+
+    log.debug("%s - %s - %s", ssh_client.name, last_activity, line.strip())
+    return last_activity
--- a/cloudtools/fabric/__init__.py
+++ b/cloudtools/fabric/__init__.py
@@ -1,13 +1,13 @@
 from fabric.api import env
 
 
 def setup_fabric_env(host_string=None, user="root", abort_on_prompts=True,
                      disable_known_hosts=True, key_filename=None):
     env.abort_on_prompts = abort_on_prompts
     env.disable_known_hosts = disable_known_hosts
-    if host_string:
+    if host_string:  # pragma: no branch
         env.host_string = host_string
-    if user:
+    if user:  # pragma: no branch
         env.user = user
-    if key_filename:
+    if key_filename:  # pragma: no branch
         env.key_filename = key_filename
--- a/cloudtools/graphite.py
+++ b/cloudtools/graphite.py
@@ -38,17 +38,17 @@ class GraphiteLogger(object):
     def generate_data(self, prefix):
         data = []
         for name, (value, timestamp) in sorted(self._data.iteritems()):
             data.append(self._generate_line(prefix, name, value, timestamp))
         return "".join(data)
 
     def sendall(self):
         if not self._data:
-            log.warning("Nothing to submit to graphite")
+            log.debug("Nothing to submit to graphite")
             return
 
         for host, port, prefix in self._servers:
             data = self.generate_data(prefix)
             log.debug("Graphite send: \n%s", data)
             try:
                 log.debug("Connecting to graphite at %s:%s", host, port)
                 sock = socket.create_connection((host, port), timeout=10)
--- a/cloudtools/jacuzzi.py
+++ b/cloudtools/jacuzzi.py
@@ -2,17 +2,17 @@ import logging
 import requests
 
 JACUZZI_BASE_URL = "http://jacuzzi-allocator.pub.build.mozilla.org/v1"
 log = logging.getLogger(__name__)
 _jacuzzi_allocated_cache = {}
 
 
 def get_allocated_slaves(buildername):
-    if buildername in _jacuzzi_allocated_cache:
+    if buildername in _jacuzzi_allocated_cache:  # pragma: no branch
         return _jacuzzi_allocated_cache[buildername]
 
     if buildername is None:
         log.debug("getting set of all allocated slaves")
         r = requests.get("{0}/allocated/all".format(JACUZZI_BASE_URL))
         _jacuzzi_allocated_cache[buildername] = frozenset(r.json()['machines'])
         return _jacuzzi_allocated_cache[buildername]
 
new file mode 100644
--- /dev/null
+++ b/cloudtools/ssh.py
@@ -0,0 +1,43 @@
+import logging
+import paramiko
+from .graphite import get_graphite_logger
+
+log = logging.getLogger(__name__)
+gr_log = get_graphite_logger()
+
+
+class SSHClient(paramiko.SSHClient):
+
+    def __init__(self, instance, username, key_filename):
+        super(SSHClient, self).__init__()
+        self.set_missing_host_key_policy(paramiko.MissingHostKeyPolicy())
+        self.instance = instance
+        self.username = username
+        self.key_filename = key_filename
+        self.ip = instance.private_ip_address
+        self.name = instance.tags.get("Name")
+
+    def connect(self, *args, **kwargs):
+        try:
+            super(SSHClient, self).connect(*args, hostname=self.ip,
+                                           username=self.username,
+                                           key_filename=self.key_filename,
+                                           **kwargs)
+            return self
+        except Exception:
+            log.debug("Couldn't log into %s at %s", self.name, self.ip)
+            return None
+
+    def get_stdout(self, command):
+        stdin, stdout, _ = self.exec_command(command)
+        stdin.close()
+        data = stdout.read()
+        return data
+
+    def reboot(self, command=None):
+        if not command:
+            command = "sudo reboot"
+        self.get_stdout(command)
+        gr_log.add(
+            "rebooted.{}".format(self.instance.tags.get("moz-type", "none")),
+            1, collect=True)
--- a/requirements-tests.txt
+++ b/requirements-tests.txt
@@ -1,9 +1,9 @@
 Fabric==1.8.0
+PyYAML==3.11
 boto==2.27.0
-coverage==3.7.1
 iso8601==0.1.10
 mock==1.0.1
 nose==1.3.3
-python-coveralls==2.4.3
+flake8==2.2.5
 repoze.lru==0.6
 requests==2.0.1
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,12 @@
 Fabric==1.8.0
 IPy==0.81
 MySQL-python==1.2.4
+PyYAML==3.11
 SQLAlchemy==0.8.3
 argparse==1.2.1
 boto==2.27.0
 docopt==0.6.1
 ecdsa==0.10
 invtool==0.1.0
 iso8601==0.1.10
 paramiko==1.12.0
--- a/scripts/aws_create_ami.py
+++ b/scripts/aws_create_ami.py
@@ -42,16 +42,17 @@ def install_packages(packages_file, dist
         chroot_prefix = ""
 
     if distro in ("debian", "ubuntu"):
         run("{}apt-get update".format(chroot_prefix))
         run("DEBIAN_FRONTEND=noninteractive {}apt-get install -y "
             "--force-yes {}".format(chroot_prefix, packages))
         run("{}apt-get clean".format(chroot_prefix))
 
+
 def sync(src, dst):
     for local_directory, _, files in os.walk(src, followlinks=True):
         directory = os.path.relpath(local_directory, src)
         if directory == '.':
             directory = ''
 
         remote_directory = os.path.join(dst, directory)
         if directory != '':
--- a/scripts/aws_create_win_ami.py
+++ b/scripts/aws_create_win_ami.py
@@ -10,20 +10,19 @@ import random
 import json
 import uuid
 import time
 import logging
 import site
 import os
 
 import boto
-from boto.ec2 import connect_to_region
 from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType
 from boto.ec2.networkinterface import NetworkInterfaceSpecification, \
-     NetworkInterfaceCollection
+    NetworkInterfaceCollection
 site.addsitedir(os.path.join(os.path.dirname(__file__), ".."))
 from cloudtools.aws import AMI_CONFIGS_DIR, wait_for_status, get_aws_connection
 log = logging.getLogger(__name__)
 
 
 def create_instance(connection, instance_name, config, key_name):
     bdm = None
     if 'device_map' in config:
@@ -73,26 +72,24 @@ def create_instance(connection, instance
     wait_for_status(instance, 'state', 'stopped', 'update')
     log.info("clearing userData")
     instance.modify_attribute("userData", None)
     return instance
 
 
 def create_ami(host_instance, config_name, config):
     connection = host_instance.connection
-    host_img = connection.get_image(config['ami'])
-    target_name = config_name
-    config_dir = "%s/%s" % (AMI_CONFIGS_DIR, target_name)
     dated_target_name = "%s-%s" % (
         config_name, time.strftime("%Y-%m-%d-%H-%M", time.gmtime()))
 
     log.info('Creating AMI')
-    host_img = connection.get_image(config['ami'])
 
-    ami_id = connection.create_image(host_instance.id, name=dated_target_name, description='%s EBS AMI' % dated_target_name,)
+    ami_id = connection.create_image(host_instance.id, name=dated_target_name,
+                                     description='%s EBS AMI' %
+                                     dated_target_name,)
     while True:
         try:
             ami = connection.get_image(ami_id)
             ami.add_tag('Name', dated_target_name)
             log.info('AMI created')
             log.info('ID: {id}, name: {name}'.format(id=ami.id, name=ami.name))
             break
         except boto.exception.EC2ResponseError:
@@ -108,17 +105,19 @@ def create_ami(host_instance, config_nam
 if __name__ == '__main__':
     from docopt import docopt
 
     args = docopt(__doc__)
 
     logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
 
     try:
-        config = json.load(open("%s/%s.json" % (AMI_CONFIGS_DIR, args['--config'])))[args['--region']]
+        config = json.load(
+            open("%s/%s.json" % (AMI_CONFIGS_DIR,
+                                 args['--config'])))[args['--region']]
     except KeyError:
-        parser.error("unknown configuration for region %s" % options.region)
+        log.error("unknown configuration")
+        exit(1)
 
     connection = get_aws_connection(args['--region'])
     host_instance = create_instance(connection, args['INSTANCE_NAME'], config,
                                     args['--key-name'])
-    #host_instance = connection.get_all_instances(["i-1c743476"])[0].instances[0]
     target_ami = create_ami(host_instance, args['--config'], config)
--- a/scripts/aws_manage_routingtables.py
+++ b/scripts/aws_manage_routingtables.py
@@ -1,28 +1,26 @@
 #!/usr/bin/env python
-import socket
-
 import boto.vpc
 import yaml
 import dns.resolver
 
 import logging
 log = logging.getLogger(__name__)
+_dns_cache = {}
 
 
 def get_connection(region):
     return boto.vpc.connect_to_region(region)
 
 
 def load_config(filename):
     return yaml.load(open(filename))
 
 
-_dns_cache = {}
 def resolve_host(hostname):
     if hostname in _dns_cache:
         return _dns_cache[hostname]
     log.info("resolving host %s", hostname)
     ips = dns.resolver.query(hostname, "A")
     ips = [i.to_text() for i in ips]
     _dns_cache[hostname] = ips
     return ips
@@ -76,17 +74,16 @@ def sync_tables(conn, my_tables, remote_
         to_add = set()
         for cidr, dest in my_t['routes'].iteritems():
             if "/" not in cidr:
                 for ip in resolve_host(cidr):
                     log.info("adding %s for %s", ip, cidr)
                     to_add.add(("%s/32" % ip, dest))
                 to_delete.add(cidr)
 
-
         for d in to_delete:
             del my_t['routes'][d]
 
         for cidr, dest in to_add:
             my_t['routes'][cidr] = dest
 
         for cidr, dest in my_t['routes'].iteritems():
             assert "/" in cidr
--- a/scripts/aws_manage_securitygroups.py
+++ b/scripts/aws_manage_securitygroups.py
@@ -6,16 +6,17 @@ import dns.resolver
 import yaml_includes
 
 # see http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Appendix_Limits.html
 # note that "Rules" in that document actually refers to grants
 MAX_GRANTS_PER_SG = 125
 
 import logging
 log = logging.getLogger(__name__)
+port_re = re.compile(r'^(\d+)-(\d+)$')
 
 
 def get_connection(region):
     return boto.ec2.connect_to_region(region)
 
 
 def load_config(filename):
     return yaml_includes.process_includes(yaml.load(open(filename)))
@@ -38,34 +39,33 @@ def resolve_host(hostname):
         return _dns_cache[hostname]
     log.info("resolving host %s", hostname)
     ips = dns.resolver.query(hostname, "A")
     ips = [i.to_text() for i in ips]
     _dns_cache[hostname] = ips
     return ips
 
 
-port_re = re.compile(r'^(\d+)-(\d+)$')
 def make_rules_for_def(rule):
     """Returns a set of rules for a given config definition. A rule is a
     (proto, from_port, to_port, hosts) tuple
     """
     retval = []
     proto = str(rule['proto'])
     if 'ports' in rule:
         ports = []
         for p in rule['ports']:
             p = str(p)
             mo = port_re.match(p)
             if mo:
                 ports.append(tuple(mo.groups()))
             else:
                 ports.append((p, p))
     else:
-        ports = [(None,None)]
+        ports = [(None, None)]
     hosts = rule['hosts']
     # Resolve the hostnames
     log.debug("%s %s %s", proto, ports, hosts)
     log.debug("Resolving hostnames")
     for h in hosts[:]:
         if '/' not in h:
             ips = resolve_host(h)
             hosts.remove(h)
@@ -102,19 +102,19 @@ def rules_from_sg(sg):
             continue
         rules.setdefault(('inbound', rule.ip_protocol, rule.from_port,
                           rule.to_port), set()).update(cidr_grants)
     for rule in sg.rules_egress:
         # ignore non-cidr grants (to other sg's)
         cidr_grants = set(g.cidr_ip for g in rule.grants if g.cidr_ip)
         if not cidr_grants:
             continue
-        rules.setdefault(('outbound', rule.ip_protocol, rule.from_port,
-                          rule.to_port), set()).update(set(g.cidr_ip for g in
-                                                           rule.grants if g.cidr_ip))
+        rules.setdefault(
+            ('outbound', rule.ip_protocol, rule.from_port, rule.to_port),
+            set()).update(set(g.cidr_ip for g in rule.grants if g.cidr_ip))
 
     return rules
 
 
 def add_hosts(sg, rule_key, hosts):
     if rule_key[0] == 'inbound':
         auth_func = sg.connection.authorize_security_group
     else:
@@ -227,17 +227,17 @@ def main():
         conn = get_connection(region)
         all_groups = conn.get_all_security_groups()
         conns_by_region[region] = conn
         security_groups_by_region[region] = all_groups
 
     prompt = True
 
     # look for too-big security groups
-    ok =  True
+    ok = True
     for sg_name, sg_config in sg_defs.iteritems():
         rules = make_rules(sg_config)
         total_grants = sum([len(hosts) for hosts in rules.itervalues()])
         if total_grants > MAX_GRANTS_PER_SG:
             log.warning("Group %s has %d rules, more than the allowed %d",
                         sg_name, total_grants, MAX_GRANTS_PER_SG)
             ok = False
     if not ok:
--- a/scripts/aws_sanity_checker.py
+++ b/scripts/aws_sanity_checker.py
@@ -9,16 +9,17 @@ import site
 import os
 
 site.addsitedir(os.path.join(os.path.dirname(__file__), ".."))
 from cloudtools.aws.sanity import AWSInstance, aws_instance_factory, SLAVE_TAGS
 from cloudtools.aws import get_aws_connection, DEFAULT_REGIONS
 
 log = logging.getLogger(__name__)
 
+
 def is_beanstalk_instance(i):
     """returns True if this is a beanstalk instance"""
     return i.tags.get("elasticbeanstalk:environment-name") is not None
 
 
 def get_all_instances(connection):
     """gets all the instances from a connection"""
     res = connection.get_all_instances()
@@ -36,28 +37,29 @@ def report(items, message):
         for num, item in enumerate(items):
             print "{num} {item}".format(num=num, item=item)
         print
 
 
 def _report_lazy_running_instances(lazy):
     """reports the lazy long running instances"""
     message = 'Lazy long running instances'
-    lazy = sorted(lazy, reverse=True, key=lambda x:x.get_uptime())
+    lazy = sorted(lazy, reverse=True, key=lambda x: x.get_uptime())
     lazy = [i.longrunning_message() for i in lazy]
     report(lazy, message)
 
 
 def _report_long_running_instances(long_running):
     """reports the long running instances"""
     message = 'Long running instances'
     # remove lazy instances...
     long_running = [i for i in long_running if not i.is_lazy()]
     if long_running:
-        items = sorted(long_running, reverse=True, key=lambda x:x.get_uptime())
+        items = sorted(long_running, reverse=True,
+                       key=lambda x: x.get_uptime())
         items = [i.longrunning_message() for i in items]
         report(items, message)
     else:
         print "==== No long running instances ===="
         print
 
 
 def _report_loaned(loaned):
@@ -65,16 +67,17 @@ def _report_loaned(loaned):
     if loaned:
         items = [i.loaned_message() for i in loaned]
         message = "Loaned"
         report(items, message)
     else:
         print "==== No loaned instances ===="
         print
 
+
 def _report_bad_type(bad_type):
     """reports the instances with a bad type"""
     if bad_type:
         message = "Instances with unknown type"
         # sort the instances by region
         items = sorted(bad_type, key=lambda x: x.get_region())
         # we need the unknown_type_message...
         items = [i.unknown_type_message() for i in items]
@@ -82,31 +85,32 @@ def _report_bad_type(bad_type):
     else:
         print "==== No instances with unknown type ===="
         print
 
 
 def _report_bad_state(bad_state):
     """reports the instances with a bad state"""
     if bad_state:
-        message =  "Instances with unknown state"
+        message = "Instances with unknown state"
         items = sorted(bad_state, key=lambda x: x.get_region())
         items = [i.unknown_state_message() for i in items]
         report(items, message)
     else:
         print "==== No instances with unknown state ===="
         print
 
 
 def _report_long_stopped(long_stopped):
     """reports the instances stopped for a while"""
     if long_stopped:
         message = "Instances stopped for a while"
-        items = sorted(long_stopped, reverse=True, key=lambda x: x.get_uptime())
-        items = [i.stopped_message() for i in items ]
+        items = sorted(long_stopped, reverse=True,
+                       key=lambda x: x.get_uptime())
+        items = [i.stopped_message() for i in items]
         report(items, message)
     else:
         print "==== No long stopped instances ===="
         print
 
 
 def _report_impaired(impaired):
     """reports the impaired instances"""
@@ -139,18 +143,18 @@ def get_impaired(connection, instances):
 
 def _report_volume_sanity_check(volumes):
     """prints the Volume info"""
     total = sum(v.size for v in volumes)
     not_attached = get_not_attached(volumes)
     print "Volume usage: %sG" % total
     if not_attached:
         print "==== Not attached volumes ===="
-        for instance, (volume, msg) in enumerate(sorted(not_attached,
-            key=lambda x: x[0].region.name)):
+        for instance, (volume, msg) in enumerate(
+                sorted(not_attached, key=lambda x: x[0].region.name)):
             print instance, "%s %s: %s" % (volume.id, volume.region.name, msg)
         print
 
 
 def _report_instance_stats(instances, regions):
     """prints the instances stats"""
     states = collections.defaultdict(int)
     types = collections.defaultdict(list)
--- a/scripts/aws_stop_idle.py
+++ b/scripts/aws_stop_idle.py
@@ -1,307 +1,156 @@
 #!/usr/bin/env python
 """
 Watches running EC2 instances and shuts them down when idle
 """
 # lint_ignore=E501,C901
-import re
 import time
 import calendar
-try:
-    import simplejson as json
-    assert json
-except ImportError:
-    import json
-
 import random
 import threading
 import boto.ec2
 import requests
 import logging
 import site
 import os
-from paramiko import SSHClient
+import json
 from Queue import Queue, Empty
 
 site.addsitedir(os.path.join(os.path.dirname(__file__), ".."))
-from cloudtools.aws import get_aws_connection
+from cloudtools.aws import get_impaired_instance_ids, get_buildslave_instances
+from cloudtools.buildbot import graceful_shutdown, get_last_activity, \
+    ACTIVITY_STOPPED, ACTIVITY_BOOTING
+from cloudtools.ssh import SSHClient
+import cloudtools.graphite
 
-log = logging.getLogger()
+log = logging.getLogger(__name__)
+gr_log = cloudtools.graphite.get_graphite_logger()
 
 # Instances running less than STOP_THRESHOLD_MINS minutes within 1 hour
 # boundary won't be stopped.
 STOP_THRESHOLD_MINS_SPOT = 45
 STOP_THRESHOLD_MINS_ONDEMAND = 30
 
 
-def get_buildbot_instances(conn, moz_types):
-    # Look for running `moz_types` instances with moz-state=ready
-    instances = conn.get_only_instances(filters={
-        'tag:moz-state': 'ready',
-        'instance-state-name': 'running',
-    })
-
-    retval = []
-    for i in instances:
-        if i.tags.get("moz-type") in moz_types and \
-                not i.tags.get("moz-loaned-to"):
-            retval.append(i)
-
-    return retval
-
-
-class IgnorePolicy:
-
-    def missing_host_key(self, client, hostname, key):
-        pass
-
-
-def get_ssh_client(name, ip, credentials):
-    client = SSHClient()
-    client.set_missing_host_key_policy(IgnorePolicy())
-    for u, passwords in credentials.iteritems():
-        for p in passwords:
-            try:
-                client.connect(hostname=ip, username=u, password=p)
-                return client
-            except Exception:
-                pass
-
-    log.debug("Couldn't log into %s at %s with any known passwords", name, ip)
-    return None
-
-
-def get_last_activity(name, client):
-    stdin, stdout, stderr = client.exec_command("date +%Y%m%d%H%M%S")
-    slave_time = stdout.read().strip()
-    slave_time = time.mktime(time.strptime(slave_time, "%Y%m%d%H%M%S"))
-
-    stdin, stdout, stderr = client.exec_command("cat /proc/uptime")
-    uptime = float(stdout.read().split()[0])
-
-    if uptime < 3 * 60:
-        # Assume we're still booting
-        log.debug("%s - uptime is %.2f; assuming we're still booting up", name,
-                  uptime)
-        return "booting"
-
-    stdin, stdout, stderr = client.exec_command(
-        "tail -n 100 /builds/slave/twistd.log.1 /builds/slave/twistd.log")
-    stdin.close()
-
-    last_activity = None
-    running_command = False
-    t = time.time()
-    line = ""
-    for line in stdout:
-        m = re.search(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})", line)
-        if m:
-            t = time.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
-            t = time.mktime(t)
-        else:
-            # Not sure what to do with this line...
-            continue
-
-        # uncomment to dump out ALL the lines
-        # log.debug("%s - %s", name, line.strip())
-
-        if "RunProcess._startCommand" in line or "using PTY: " in line:
-            log.debug("%s - started command - %s", name, line.strip())
-            running_command = True
-        elif "commandComplete" in line or "stopCommand" in line:
-            log.debug("%s - done command - %s", name, line.strip())
-            running_command = False
-
-        if "Shut Down" in line:
-            # Check if this happened before we booted, i.e. we're still booting
-            # up
-            if (slave_time - t) > uptime:
-                log.debug(
-                    "%s - shutdown line is older than uptime; assuming we're "
-                    "still booting %s", name, line.strip())
-                last_activity = "booting"
-            else:
-                last_activity = "stopped"
-        elif "I have a leftover directory" in line:
-            # Ignore this, it doesn't indicate anything
-            continue
-        elif running_command:
-            # We're in the middle of running something, so say that our last
-            # activity is now (0 seconds ago)
-            last_activity = 0
-        else:
-            last_activity = slave_time - t
-
-    # If the last lines from the log are over 10 minutes ago, and are from
-    # before our reboot, then try rebooting
-    if (slave_time - t) > 10 * 60 and (slave_time - t) > uptime:
-        log.warning(
-            "%s - shut down happened %ss ago, but we've been up for %ss - %s",
-            name, slave_time - t, uptime, line.strip())
-        # If longer than 30 minutes, try rebooting
-        if (slave_time - t) > 30 * 60:
-            log.warning("%s - rebooting", name)
-            stdin, stdout, stderr = client.exec_command("sudo reboot")
-            stdin.close()
-
-    # If there's *no* activity (e.g. no twistd.log files), and we've been up a
-    # while, then reboot
-    if last_activity is None and uptime > 15 * 60:
-        log.warning("%s - no activity; rebooting", name)
-        # If longer than 30 minutes, try rebooting
-        stdin, stdout, stderr = client.exec_command("sudo reboot")
-        stdin.close()
-
-    log.debug("%s - %s - %s", name, last_activity, line.strip())
-    return last_activity
-
-
-def get_tacfile(client):
-    stdin, stdout, stderr = client.exec_command(
-        "cat /builds/slave/buildbot.tac")
-    stdin.close()
-    data = stdout.read()
-    return data
-
-
-def get_buildbot_master(client, masters_json):
-    tacfile = get_tacfile(client)
-    host = re.search("^buildmaster_host = '(.*?)'$", tacfile, re.M)
-    host = host.group(1)
-    port = None
-    for master in masters_json:
-        if master["hostname"] == host:
-            port = master["http_port"]
-            break
-    assert host and port
-    return host, port
-
-
-def graceful_shutdown(name, ip, client, masters_json):
-    # Find out which master we're attached to by looking at buildbot.tac
-    log.debug("%s - looking up which master we're attached to", name)
-    host, port = get_buildbot_master(client, masters_json)
-
-    url = "http://{host}:{port}/buildslaves/{name}/shutdown".format(
-        host=host, port=port, name=name)
-    log.debug("%s - POSTing to %s", name, url)
-    requests.post(url, allow_redirects=False)
-
-
-def aws_safe_stop_instance(i, impaired_ids, credentials, masters_json,
+def aws_safe_stop_instance(i, impaired_ids, user, key_filename, masters_json,
                            dryrun=False):
     "Returns True if stopped"
-    name = i.tags['Name']
     # TODO: Check with slavealloc
 
-    ip = i.private_ip_address
-    ssh_client = get_ssh_client(name, ip, credentials)
+    ssh_client = SSHClient(instance=i, username=user,
+                           key_filename=key_filename).connect()
     stopped = False
     launch_time = calendar.timegm(time.strptime(
         i.launch_time[:19], '%Y-%m-%dT%H:%M:%S'))
     if not ssh_client:
         if i.id in impaired_ids:
             if time.time() - launch_time > 60 * 10:
                 stopped = True
                 if not dryrun:
                     log.debug(
                         "%s - shut down an instance with impaired status",
-                        name)
+                        i.tags.get('Name'))
                     i.terminate()
+                    gr_log.add("impaired.{moz_type}".format(
+                        moz_type=i.tags.get("moz-type", "none")), 1,
+                        collect=True)
                 else:
-                    log.debug("%s - would have stopped", name)
+                    log.debug("%s - would have stopped", i.tags.get('Name'))
         return stopped
 
     uptime_min = int((time.time() - launch_time) / 60)
     # Don't try to stop spot instances until after STOP_THRESHOLD_MINS_SPOT
     # minutes into each hour
     if i.spot_instance_request_id:
         threshold = STOP_THRESHOLD_MINS_SPOT
         if uptime_min % 60 < threshold:
-            log.debug("Skipping %s, with uptime %s", name, uptime_min)
+            log.debug("Skipping %s, with uptime %s", ssh_client.name,
+                      uptime_min)
             return False
     else:
         # On demand instances can be stopped after STOP_THRESHOLD_MINS_ONDEMAND
         threshold = STOP_THRESHOLD_MINS_ONDEMAND
         if uptime_min < threshold:
-            log.debug("Skipping %s, with updtime %s", name, uptime_min)
+            log.debug("Skipping %s, with uptime %s", ssh_client.name,
+                      uptime_min)
             return False
 
-    last_activity = get_last_activity(name, ssh_client)
-    if last_activity == "stopped":
+    last_activity = get_last_activity(ssh_client)
+    if last_activity == ACTIVITY_STOPPED:
         stopped = True
         if not dryrun:
-            log.debug("%s - stopping instance (launched %s)", name,
+            log.debug("%s - stopping instance (launched %s)", ssh_client.name,
                       i.launch_time)
             i.terminate()
         else:
-            log.debug("%s - would have stopped", name)
+            log.debug("%s - would have stopped", ssh_client.name)
         return stopped
 
-    if last_activity == "booting":
+    if last_activity == ACTIVITY_BOOTING:
         # Wait harder
         return stopped
 
-    log.debug("%s - last activity %s", name, last_activity)
+    log.debug("%s - last activity %s", ssh_client.name, last_activity)
 
     # If it looks like we're idle for more than 8 hours, kill the machine
     if last_activity > 8 * 3600:
-        log.warning("%s - last activity more than 8 hours ago; shutting down",
-                    name)
+        log.debug("%s - last activity more than 8 hours ago; shutting down",
+                  ssh_client.name)
         if not dryrun:
-            log.debug("%s - starting graceful shutdown", name)
-            graceful_shutdown(name, ip, ssh_client, masters_json)
+            log.debug("%s - starting graceful shutdown", ssh_client.name)
+            graceful_shutdown(ssh_client, masters_json)
             # Stop the instance
-            log.debug("%s - stopping instance", name)
+            log.debug("%s - stopping instance", ssh_client.name)
             i.terminate()
             stopped = True
 
     # If the machine is idle for more than 5 minutes, shut it down
     elif last_activity > 300:
         if not dryrun:
             # Hit graceful shutdown on the master
-            log.debug("%s - starting graceful shutdown", name)
-            graceful_shutdown(name, ip, ssh_client, masters_json)
+            log.debug("%s - starting graceful shutdown", ssh_client.name)
+            graceful_shutdown(ssh_client, masters_json)
 
             # Check if we've exited right away
-            if get_last_activity(name, ssh_client) == "stopped":
-                log.debug("%s - stopping instance", name)
+            if get_last_activity(ssh_client) == ACTIVITY_STOPPED:
+                log.debug("%s - stopping instance", ssh_client.name)
                 i.terminate()
                 stopped = True
             else:
                 log.debug(
-                    "%s - not stopping, waiting for graceful shutdown", name)
+                    "%s - not stopping, waiting for graceful shutdown",
+                    ssh_client.name)
         else:
-            log.debug("%s - would have started graceful shutdown", name)
+            log.debug("%s - would have started graceful shutdown",
+                      ssh_client.name)
             stopped = True
     else:
-        log.debug("%s - not stopping", name)
+        log.debug("%s - not stopping", ssh_client.name)
     return stopped
 
 
-def aws_stop_idle(credentials, regions, masters_json, moz_types,
+def aws_stop_idle(user, key_filename, regions, masters_json, moz_types,
                   dryrun=False, concurrency=8):
     if not regions:
         # Look at all regions
         log.debug("loading all regions")
         regions = [r.name for r in boto.ec2.regions()]
 
     min_running_by_type = 0
 
     all_instances = []
     impaired_ids = []
 
     for r in regions:
         log.debug("looking at region %s", r)
-        conn = get_aws_connection(r)
-        instances = get_buildbot_instances(conn, moz_types)
-        impaired = conn.get_all_instance_status(
-            filters={'instance-status.status': 'impaired'})
-        impaired_ids.extend(i.id for i in impaired)
+        instances = get_buildslave_instances(r, moz_types)
+        log.debug("Got %s buildslave instances", len(instances))
+        impaired_ids.extend(get_impaired_instance_ids(r))
+        log.debug("Got %s impaired instances", len(impaired_ids))
         instances_by_type = {}
         for i in instances:
             # TODO: Check if launch_time is too old, and terminate the instance
             # if it is
             # NB can't turn this on until aws_create_instance is working
             # properly (with ssh keys)
             instances_by_type.setdefault(i.tags['moz-type'], []).append(i)
 
@@ -323,22 +172,22 @@ def aws_stop_idle(credentials, regions, 
 
     def worker():
         while True:
             try:
                 i = q.get(timeout=0.1)
             except Empty:
                 return
             try:
-                if aws_safe_stop_instance(i, impaired_ids, credentials,
+                if aws_safe_stop_instance(i, impaired_ids, user, key_filename,
                                           masters_json, dryrun=dryrun):
                     to_stop.put(i)
             except Exception:
-                log.warning("%s - unable to stop" % i.tags.get('Name'),
-                            exc_info=True)
+                log.debug("%s - unable to stop" % i.tags.get('Name'),
+                          exc_info=True)
 
     for i in all_instances:
         q.put(i)
 
     # Workaround for http://bugs.python.org/issue11108
     time.strptime("19000102030405", "%Y%m%d%H%M%S")
     threads = []
     for i in range(concurrency):
@@ -358,71 +207,84 @@ def aws_stop_idle(credentials, regions, 
                 raise SystemExit(1)
 
     total_stopped = {}
     while not to_stop.empty():
         i = to_stop.get()
         if not dryrun:
             i.update()
         if 'moz-type' not in i.tags:
-            log.warn("%s - has no moz-type! (%s)" % (i.tags.get('Name'), i.id))
+            log.debug("%s - has no moz-type! (%s)" % (i.tags.get('Name'),
+                                                      i.id))
 
-        t = i.tags.get('moz-type', 'notype')
+        t = i.tags.get('moz-type', 'none')
         if t not in total_stopped:
             total_stopped[t] = 0
         total_stopped[t] += 1
 
     for t, c in sorted(total_stopped.items()):
         log.debug("%s - stopped %s", t, c)
+        gr_log.add("stopped.%s" % t, c)
 
 if __name__ == '__main__':
     import argparse
     import logging.handlers
     parser = argparse.ArgumentParser()
     parser.add_argument("-r", "--region", action="append", dest="regions",
                         required=True)
     parser.add_argument("-v", "--verbose", action="store_const",
                         dest="loglevel", const=logging.DEBUG,
-                        default=logging.INFO)
-    parser.add_argument("-c", "--credentials", type=argparse.FileType('r'),
+                        default=logging.WARNING)
+    parser.add_argument("-k", "--secrets", type=argparse.FileType('r'),
                         required=True)
+    parser.add_argument("-u", "--user", required=True, help="SSH user name")
+    parser.add_argument("--ssh-key", required=True,
+                        help="Private SSH key path")
     parser.add_argument("-t", "--moz-type", action="append", dest="moz_types",
                         required=True,
                         help="moz-type tag values to be checked")
     parser.add_argument("-j", "--concurrency", type=int, default=8)
-    parser.add_argument("--masters-json",
-                        default="https://hg.mozilla.org/build/tools/raw-file/default/buildfarm/maintenance/production-masters.json")
+    parser.add_argument(
+        "--masters-json",
+        default="https://hg.mozilla.org/build/tools/raw-file/default/buildfarm"
+        "/maintenance/production-masters.json")
     parser.add_argument("--dry-run", action="store_true")
     parser.add_argument("-l", "--logfile", dest="logfile",
                         help="log file for full debug log")
 
     args = parser.parse_args()
 
     logging.getLogger().setLevel(logging.DEBUG)
     logging.getLogger("boto").setLevel(logging.WARN)
     logging.getLogger("paramiko").setLevel(logging.WARN)
     logging.getLogger('requests').setLevel(logging.WARN)
 
-    formatter = logging.Formatter("%(asctime)s - %(message)s")
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s -  %(message)s")
     handler = logging.StreamHandler()
     handler.setFormatter(formatter)
     handler.setLevel(args.loglevel)
     logging.getLogger().addHandler(handler)
 
     if args.logfile:
         handler = logging.handlers.RotatingFileHandler(
             args.logfile, maxBytes=10 * (1024 ** 2), backupCount=100)
         handler.setLevel(logging.DEBUG)
         handler.setFormatter(formatter)
         logging.getLogger().addHandler(handler)
 
     log.debug("starting")
-    credentials = json.load(args.credentials)
+
+    masters_json = requests.get(args.masters_json).json()
+    secrets = json.load(args.secrets)
 
-    try:
-        masters_json = json.load(open(args.masters_json))
-    except IOError:
-        masters_json = requests.get(args.masters_json).json()
+    aws_stop_idle(user=args.user, key_filename=args.ssh_key,
+                  regions=args.regions, masters_json=masters_json,
+                  moz_types=args.moz_types, dryrun=args.dry_run,
+                  concurrency=args.concurrency)
+    for entry in secrets.get("graphite_hosts", []):
+        host = entry.get("host")
+        port = entry.get("port")
+        prefix = "aws_stop_idle"
+        if all([host, port, prefix]):
+            gr_log.add_destination(host, port, prefix)
 
-    aws_stop_idle(credentials, args.regions, masters_json,
-                  args.moz_types, dryrun=args.dry_run,
-                  concurrency=args.concurrency)
+    gr_log.sendall()
     log.debug("done")
--- a/scripts/check_dns.py
+++ b/scripts/check_dns.py
@@ -1,11 +1,10 @@
 #!/usr/bin/env python
 import argparse
-import json
 import logging
 from multiprocessing import Pool
 import site
 import os
 
 site.addsitedir(os.path.join(os.path.dirname(__file__), ".."))
 from cloudtools.aws import get_aws_connection
 from cloudtools.dns import get_ip, get_ptr, get_cname
--- a/scripts/ec22ip.py
+++ b/scripts/ec22ip.py
@@ -1,12 +1,11 @@
 #!/usr/bin/env python
 
 import re
-import json
 import site
 import os
 
 site.addsitedir(os.path.join(os.path.dirname(__file__), ".."))
 from cloudtools.aws import get_aws_connection
 
 
 if __name__ == '__main__':
--- a/scripts/yaml_includes.py
+++ b/scripts/yaml_includes.py
@@ -1,20 +1,22 @@
 import copy
 
+
 def process_includes(data):
     """
     Iterate over a de-YAML'd data structure.  A top-level 'includes'
     is treated as a dictionary of includable chunks.  Anywhere else,
     a dictionary containing only {'include': 'somename'} will include the
     chunk named 'somename' in its place.
     """
     if not isinstance(data, dict) or 'includes' not in data:
         return data
     includes = data.pop('includes')
+
     def iter(d):
         if isinstance(d, dict):
             if len(d) == 1 and 'include' in d and d['include'] in includes:
                 return includes[d['include']]
             return {k: iter(v) for (k, v) in d.iteritems()}
         elif isinstance(d, list):
             return [iter(v) for v in d]
         else:
new file mode 100644
--- /dev/null
+++ b/tests/test_cloudtools_fabric.py
@@ -0,0 +1,11 @@
+import unittest
+from fabric.api import env
+from cloudtools.fabric import setup_fabric_env
+
+
+class TestSetupFabricEnv(unittest.TestCase):
+
+    def test_generic(self):
+        setup_fabric_env(host_string="h1", user="u1", key_filename="k1")
+        self.assertTrue(env.abort_on_prompts)
+        self.assertTrue(env.disable_known_hosts)
--- a/tests/test_cloudtools_graphite.py
+++ b/tests/test_cloudtools_graphite.py
@@ -1,8 +1,9 @@
+import socket
 import unittest
 import mock
 import cloudtools.graphite
 from cloudtools.graphite import get_graphite_logger
 
 
 class TestGraphiteLogger(unittest.TestCase):
 
@@ -103,8 +104,45 @@ class TestGraphiteLogger(unittest.TestCa
             sock = mock.MagicMock()
             conn.return_value = sock
             gl.sendall()
             expected_calls = [
                 mock.call("prefix1.name 44 9999\n"),
                 mock.call("prefix2.name 44 9999\n"),
             ]
             sock.sendall.assert_has_calls(expected_calls)
+
+    @mock.patch.object(socket, "create_connection")
+    def test_sendall_no_data(self, m_conn):
+        gl = get_graphite_logger()
+        gl.add_destination("host1", 1111, "prefix1")
+        gl.sendall()
+        self.assertEqual(m_conn.call_count, 0)
+
+    @mock.patch.object(socket, "create_connection")
+    def test_sendall_exception(self, m_create_connection):
+        gl = get_graphite_logger()
+        gl.add_destination("host1", 1111, "prefix1")
+        gl.add("name", 44)
+        m_create_connection.side_effect = Exception("oops")
+        # No exception should be raised
+        gl.sendall()
+        self.assertDictEqual({}, gl._data)
+
+
+class TestGenerateInstanceStats(unittest.TestCase):
+
+    def test_running_only(self):
+        i1 = mock.Mock()
+        i1.state = "running"
+        i2 = mock.Mock()
+        i2.state = "stopped"
+        for i in [i1, i2]:
+            i.region.name = "r1"
+            i.tags = {"moz-type": "m1"}
+            i.instance_type = "i1"
+            i.spot_instance_request_id = "r1"
+            i.virtualization_type = "v1"
+            i.root_device_type = "d1"
+        with mock.patch("cloudtools.graphite._graphite_logger") as m_l:
+            cloudtools.graphite.generate_instance_stats([i1, i2])
+            m_l.add.assert_called_once_with("running.r1.m1.i1.spot.v1.d1",
+                                            1, collect=True)
--- a/tests/test_cloudtools_jacuzzi.py
+++ b/tests/test_cloudtools_jacuzzi.py
@@ -1,11 +1,13 @@
 import unittest
 import mock
-from cloudtools.jacuzzi import filter_instances_by_slaveset
+import cloudtools.jacuzzi
+from cloudtools.jacuzzi import filter_instances_by_slaveset, \
+    get_allocated_slaves
 
 
 class TestFilterSlaveset(unittest.TestCase):
 
     def test_None(self):
 
         i1 = mock.Mock()
         i1.tags = {"Name": "name1"}
@@ -24,8 +26,47 @@ class TestFilterSlaveset(unittest.TestCa
         i2 = mock.Mock()
         i2.tags = {"Name": "name2"}
 
         with mock.patch("cloudtools.jacuzzi.get_allocated_slaves") as mock_gas:
             # ensure it's not called
             self.assertEqual(filter_instances_by_slaveset([i1, i2], ["name1"]),
                              [i1])
             self.assertEquals(mock_gas.mock_calls, [])
+
+
+class TestGetAllocatedSlaves(unittest.TestCase):
+
+    @mock.patch("requests.get")
+    def test_cache(self, m_get):
+        cloudtools.jacuzzi._jacuzzi_allocated_cache = {"b1": "ret1"}
+        slaves = get_allocated_slaves("b1")
+        self.assertEqual(slaves, "ret1")
+        self.assertEqual(m_get.call_count, 0)
+
+    @mock.patch("requests.get")
+    def test_caching_no_buildername(self, m_get):
+        m_get.return_value.json.return_value = {"machines": ["m1", "m2"]}
+        get_allocated_slaves(None)
+        self.assertTrue(
+            cloudtools.jacuzzi._jacuzzi_allocated_cache[None].issuperset(
+                set(["m1", "m2"])))
+
+    @mock.patch("requests.get")
+    def test_no_buildername(self, m_get):
+        m_get.return_value.json.return_value = {"machines": ["m1", "m2"]}
+        self.assertEqual(get_allocated_slaves(None), frozenset(["m1", "m2"]))
+
+    @mock.patch("requests.get")
+    def test_404(self, m_get):
+        m_get.return_value.status_code = 404
+        self.assertEqual(get_allocated_slaves("b1"), None)
+        self.assertIsNone(cloudtools.jacuzzi._jacuzzi_allocated_cache["b1"])
+
+    @mock.patch("requests.get")
+    def test_buildername(self, m_get):
+        cloudtools.jacuzzi._jacuzzi_allocated_cache = {}
+        m_get.return_value.json.return_value = {"machines": ["m1", "m2"]}
+        get_allocated_slaves("b1")
+        self.assertTrue(
+            cloudtools.jacuzzi._jacuzzi_allocated_cache["b1"].issuperset(
+                set(["m1", "m2"])))
+        self.assertEqual(get_allocated_slaves("b1"), frozenset(["m1", "m2"]))
new file mode 100644
--- /dev/null
+++ b/tests/test_cloudtools_ssh.py
@@ -0,0 +1,64 @@
+import unittest
+import mock
+import paramiko
+from cloudtools.ssh import SSHClient
+
+
+class TestSSHClient(unittest.TestCase):
+
+    def test_policy(self):
+        instance = mock.MagicMock()
+        client = SSHClient(instance, "user", "key")
+        self.assertIsInstance(client._policy, paramiko.MissingHostKeyPolicy)
+
+    @mock.patch.object(paramiko.SSHClient, "connect")
+    def test_connect(self, m_connect):
+        instance = mock.Mock()
+        instance.private_ip_address = "ip1"
+        instance.tags = {"Name": "n1"}
+        ssh_client = SSHClient(instance, "u1", "k1")
+        ssh_client.connect()
+        m_connect.assert_called_once_with(hostname="ip1", username="u1",
+                                          key_filename="k1")
+
+    @mock.patch.object(paramiko.SSHClient, "exec_command")
+    def test_get_stdout(self, m_exec_command):
+        instance = mock.Mock()
+        instance.private_ip_address = "ip1"
+        instance.tags = {"Name": "n1"}
+        ssh_client = SSHClient(instance, "u1", "k1")
+        stdin, stdout = mock.Mock(), mock.Mock()
+        stdout.read.return_value = "out1"
+        m_exec_command.return_value = stdin, stdout, None
+        out = ssh_client.get_stdout("my command")
+        m_exec_command.assert_called_once_with("my command")
+        stdin.close.assert_called_once_with()
+        stdout.read.assert_called_once_with()
+        self.assertEqual("out1", out)
+
+    @mock.patch.object(paramiko.SSHClient, "connect")
+    def test_connect_returns_None(self, m_connect):
+        instance = mock.Mock()
+        instance.private_ip_address = "ip1"
+        instance.tags = {"Name": "n1"}
+        ssh_client = SSHClient(instance, "u1", "k1")
+        m_connect.side_effect = Exception("Ooops")
+        self.assertIsNone(ssh_client.connect())
+
+    @mock.patch.object(SSHClient, "get_stdout")
+    def test_reboot_no_command(self, m_get_stdout):
+        instance = mock.Mock()
+        instance.private_ip_address = "ip1"
+        instance.tags = {"Name": "n1", "moz-type": "t1"}
+        ssh_client = SSHClient(instance, "u1", "k1")
+        ssh_client.reboot()
+        m_get_stdout.assert_called_once_with("sudo reboot")
+
+    @mock.patch.object(SSHClient, "get_stdout")
+    def test_reboot_with_command(self, m_get_stdout):
+        instance = mock.Mock()
+        instance.private_ip_address = "ip1"
+        instance.tags = {"Name": "n1", "moz-type": "t1"}
+        ssh_client = SSHClient(instance, "u1", "k1")
+        ssh_client.reboot("cmd1")
+        m_get_stdout.assert_called_once_with("cmd1")
--- a/tests/test_yaml_includes.py
+++ b/tests/test_yaml_includes.py
@@ -3,16 +3,17 @@ import unittest
 import sys
 import os
 from cStringIO import StringIO
 import yaml
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../scripts'))
 from yaml_includes import process_includes
 
+
 class TestFreshInstances(unittest.TestCase):
 
     def test_no_change(self):
         self.assertEqual(process_includes({}), {})
         self.assertEqual(process_includes([]), [])
         self.assertEqual(process_includes('x'), 'x')
         complex = {
             'a': [
@@ -46,17 +47,16 @@ class TestFreshInstances(unittest.TestCa
                 - - a
                   - b
             x:
               a: 1
               b: 2
             """)))
         self.assertEqual(process_includes(input), exp)
 
-
     def test_recursion(self):
         input = yaml.load(StringIO(textwrap.dedent("""\
             includes:
                 inc:
                   a: b
                 meta-inc:
                   - include: inc
                   - include: inc
new file mode 100644
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,24 @@
+[tox]
+envlist = py27
+skipsdist = True
+
+[testenv]
+deps =
+    -r{toxinidir}/requirements-tests.txt
+commands=
+    flake8
+    nosetests -v
+
+[testenv:py27-coverage]
+deps=
+    {[testenv]deps}
+    coverage==3.7.1
+    python-coveralls==2.4.3
+commands=
+    nosetests --with-coverage --cover-erase -v
+    coveralls
+
+[flake8]
+max-line-length = 160
+exclude = .ropeproject,.tox
+show-source = True