Bug 983939 - Allow to run retried jobs on spot instances. r=catlee
author Rail Aliiev <rail@mozilla.com>
Mon, 17 Mar 2014 09:24:00 -0400
changeset 357 45ffbf7839de07b4b32bdb5a271f7998402e22cd
parent 356 9fa182b4bcfd59ba350d8481420c51feb24e3daa
child 358 0ce75ee78ae76026f2c22f6db7df78ca50540c37
push id 356
push user raliiev@mozilla.com
push date Mon, 17 Mar 2014 13:24:05 +0000
reviewers catlee
bugs 983939
Bug 983939 - Allow to run retried jobs on spot instances. r=catlee
scripts/aws_watch_pending.py
--- a/scripts/aws_watch_pending.py
+++ b/scripts/aws_watch_pending.py
@@ -31,16 +31,21 @@ import site
 site.addsitedir(os.path.join(os.path.dirname(__file__), ".."))
 from cloudtools.aws import get_aws_connection, INSTANCE_CONFIGS_DIR, \
     aws_time_to_datetime
 from cloudtools.aws.spot import CANCEL_STATUS_CODES, \
     TERMINATED_BY_AWS_STATUS_CODES
 
 log = logging.getLogger()
 
+# Number of job retries allowed to run on spot instances. We stop using spot
+# instances if number of retries is larger than this number. If you update this
+# number, you also need to update the same variable in buildbotcustom/misc.py
+MAX_SPOT_RETRIES = 1
+
 
 @lru_cache(10)
 def get_all_spot_requests(region):
     log.info("getting all spot requests for %s", region)
     conn = get_aws_connection(region)
     spot_requests = conn.get_all_spot_instance_requests()
     return spot_requests
 
@@ -694,20 +699,20 @@ def aws_watch_pending(dburl, regions, se
     to_create_ondemand = defaultdict(int)
     to_create_spot = defaultdict(int)
 
     # Then match them to the builder_map
     for pending_buildername, brid in pending:
         for buildername_exp, instance_type in builder_map.items():
             if re.match(buildername_exp, pending_buildername):
                 slaveset = get_allocated_slaves(pending_buildername)
-                if find_retries(db, brid) == 0:
+                if find_retries(db, brid) > MAX_SPOT_RETRIES:
+                    to_create_ondemand[instance_type, slaveset] += 1
+                else:
                     to_create_spot[instance_type, slaveset] += 1
-                else:
-                    to_create_ondemand[instance_type, slaveset] += 1
                 break
         else:
             log.debug("%s has pending jobs, but no instance types defined",
                       pending_buildername)
 
     if not to_create_spot and not to_create_ondemand:
         log.debug("no pending jobs we can do anything about! all done!")
         return