Bug 983939 - Allow to run retried jobs on spot instances. r=catlee
--- a/scripts/aws_watch_pending.py
+++ b/scripts/aws_watch_pending.py
@@ -31,16 +31,21 @@ import site
site.addsitedir(os.path.join(os.path.dirname(__file__), ".."))
from cloudtools.aws import get_aws_connection, INSTANCE_CONFIGS_DIR, \
aws_time_to_datetime
from cloudtools.aws.spot import CANCEL_STATUS_CODES, \
TERMINATED_BY_AWS_STATUS_CODES
log = logging.getLogger()
+# Number of job retries allowed to run on spot instances. We stop using spot
+# instances if the number of retries is larger than this number. If you update
+# it, you also need to update the same variable in buildbotcustom/misc.py
+MAX_SPOT_RETRIES = 1
+
@lru_cache(10)
def get_all_spot_requests(region):
log.info("getting all spot requests for %s", region)
conn = get_aws_connection(region)
spot_requests = conn.get_all_spot_instance_requests()
return spot_requests
@@ -694,20 +699,20 @@ def aws_watch_pending(dburl, regions, se
to_create_ondemand = defaultdict(int)
to_create_spot = defaultdict(int)
# Then match them to the builder_map
for pending_buildername, brid in pending:
for buildername_exp, instance_type in builder_map.items():
if re.match(buildername_exp, pending_buildername):
slaveset = get_allocated_slaves(pending_buildername)
- if find_retries(db, brid) == 0:
+ if find_retries(db, brid) > MAX_SPOT_RETRIES:
+ to_create_ondemand[instance_type, slaveset] += 1
+ else:
to_create_spot[instance_type, slaveset] += 1
- else:
- to_create_ondemand[instance_type, slaveset] += 1
break
else:
log.debug("%s has pending jobs, but no instance types defined",
pending_buildername)
if not to_create_spot and not to_create_ondemand:
log.debug("no pending jobs we can do anything about! all done!")
return