Consider which unique network interfaces are used by existing spot requests when
author Chris AtLee <catlee@mozilla.com>
Fri, 14 Mar 2014 15:55:12 -0400
changeset 346 879e7baced0255502941eb6f0c6737354672d179
parent 345 242375e8e7ea9d08d874c26a6b4faf512a44230a
child 347 9c6b68f16d2adc94883358b046c98b8ccd904ff0
push id 346
push user catlee@mozilla.com
push date Fri, 14 Mar 2014 19:55:23 +0000
Consider which unique network interfaces are used by existing spot requests when counting number of active requests.
requirements.txt
scripts/aws_watch_pending.py
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,14 @@
 Fabric==1.8.0
 IPy==0.81
 MySQL-python==1.2.4
 SQLAlchemy==0.8.3
 argparse==1.2.1
-boto==2.16.0
+boto==2.27.0
 docopt==0.6.1
 ecdsa==0.10
 invtool==0.1.0
-iso8601=0.1.10
+iso8601==0.1.10
 paramiko==1.12.0
 pycrypto==2.6.1
 repoze.lru==0.6
 requests==2.0.1
--- a/scripts/aws_watch_pending.py
+++ b/scripts/aws_watch_pending.py
@@ -396,28 +396,33 @@ def request_spot_instances(moz_instance_
         conn = get_aws_connection(region)
         connections.append(conn)
     spot_choices = get_spot_choices(connections, spot_rules, "Linux/UNIX (Amazon VPC)")
     if not spot_choices:
         log.warn("No spot choices for %s", moz_instance_type)
         return 0
 
     to_start = {}
+    active_network_ids = {}
     for region in regions:
         # Check if spots are enabled in this region for this type
         region_limit = spot_config.get("limits", {}).get(region, {}).get(
             moz_instance_type)
         if not region_limit:
             log.debug("No spot limits defined for %s in %s, skipping...",
                       moz_instance_type, region)
             continue
 
         # check the limits
-        active_count = len(aws_get_spot_requests(
-            region=region, moz_instance_type=moz_instance_type))
+        # Count how many unique network interfaces are active
+        # Sometimes we have multiple requests for the same interface
+        active_requests = aws_get_spot_requests(region=region, moz_instance_type=moz_instance_type)
+        active_network_ids[region] = set(r.launch_specification.networkInterfaceId for r in active_requests)
+        active_count = len(active_network_ids[region])
+        log.debug("%s: %i running spot instances in %s", moz_instance_type, active_count, region)
         can_be_started = region_limit - active_count
         if can_be_started < 1:
             log.debug("Not starting. Active spot request count in %s region "
                       "hit limit of %s. Active count: %s", region,
                       region_limit, active_count)
             continue
 
         to_be_started = min(can_be_started, start_count - started)
@@ -444,17 +449,19 @@ def request_spot_instances(moz_instance_
         launched = do_request_spot_instances(
             amount=need,
             region=region, secrets=secrets,
             moz_instance_type=moz_instance_type,
             ami=to_start[region]["ami"],
             instance_config=instance_config, dryrun=dryrun,
             cached_cert_dir=cached_cert_dir,
             spot_choice=choice,
-            slaveset=slaveset)
+            slaveset=slaveset,
+            active_network_ids=active_network_ids[region],
+        )
         started += launched
 
         if started >= start_count:
             break
 
     return started
 
 
@@ -477,44 +484,45 @@ def get_puppet_certs(ip, secrets, cached
         raise RuntimeError("Cannot retrieve puppet cert")
     with open(cert_file, "wb") as f:
         f.write(cert_data)
     return cert_data
 
 
 def do_request_spot_instances(amount, region, secrets, moz_instance_type, ami,
                               instance_config, cached_cert_dir, spot_choice,
-                              slaveset, dryrun):
+                              slaveset, active_network_ids, dryrun):
     started = 0
     for _ in range(amount):
         try:
             r = do_request_spot_instance(
                 region=region, secrets=secrets,
                 moz_instance_type=moz_instance_type,
                 price=spot_choice.bid_price,
                 availability_zone=spot_choice.availability_zone,
                 ami=ami, instance_config=instance_config,
                 cached_cert_dir=cached_cert_dir,
                 instance_type=spot_choice.instance_type, slaveset=slaveset,
-                dryrun=dryrun)
+                active_network_ids=active_network_ids, dryrun=dryrun)
             if r:
                 started += 1
-        except (RuntimeError):
+        except Exception:
             log.warn("Cannot start", exc_info=True)
     return started
 
 
 def do_request_spot_instance(region, secrets, moz_instance_type, price, ami,
                              instance_config, cached_cert_dir, instance_type,
-                             availability_zone, slaveset, dryrun):
+                             availability_zone, slaveset, active_network_ids, dryrun):
     conn = get_aws_connection(region)
     interface = get_available_interface(
         conn=conn, moz_instance_type=moz_instance_type,
         availability_zone=availability_zone,
-        slaveset=slaveset)
+        slaveset=slaveset,
+        active_network_ids=active_network_ids)
     if not interface:
         log.warn("No free network interfaces left in %s" % region)
         return False
 
     # TODO: check DNS
     fqdn = interface.tags.get("FQDN")
     if not fqdn:
         raise RuntimeError("Skipping %s without FQDN" % interface)
@@ -581,17 +589,17 @@ EOF
     )
     sir[0].add_tag("moz-type", moz_instance_type)
     return True
 
 
 _cached_interfaces = {}
 
 
-def get_available_interface(conn, moz_instance_type, availability_zone, slaveset):
+def get_available_interface(conn, moz_instance_type, availability_zone, slaveset, active_network_ids):
     global _cached_interfaces
     if not _cached_interfaces.get(availability_zone):
         _cached_interfaces[availability_zone] = {}
     if _cached_interfaces[availability_zone].get(moz_instance_type) is None:
         filters = {
             "status": "available",
             "tag:moz-type": moz_instance_type,
             "availability-zone": availability_zone,
@@ -603,23 +611,31 @@ def get_available_interface(conn, moz_in
 
     log.debug("%s interfaces in %s",
               len(_cached_interfaces[availability_zone][moz_instance_type]),
               availability_zone)
     if _cached_interfaces[availability_zone][moz_instance_type]:
         # Find one in our slaveset
         if slaveset:
             for i in _cached_interfaces[availability_zone][moz_instance_type]:
+                # An active spot request already targets this interface
+                if i.id in active_network_ids:
+                    log.debug("skipping %s since it's active", i.id)
+                    continue
                 if i.tags.get("FQDN").split(".")[0] in slaveset:
                     _cached_interfaces[availability_zone][moz_instance_type].remove(i)
                     log.debug("using %s", i.tags.get("FQDN"))
                     return i
         else:
             allocated_slaves = get_allocated_slaves(None)
             for i in _cached_interfaces[availability_zone][moz_instance_type]:
+                # An active spot request already targets this interface
+                if i.id in active_network_ids:
+                    log.debug("skipping %s since it's active", i.id)
+                    continue
                 if i.tags.get("FQDN").split(".")[0] not in allocated_slaves:
                     _cached_interfaces[availability_zone][moz_instance_type].remove(i)
                     log.debug("using %s", i.tags.get("FQDN"))
                     return i
     return None
 
 
 def get_ami(region, moz_instance_type):