TICKscripts: increase thresholds in `vcsreplicator_consumer_lag.tick`
author Connor Sheehan <sheehan@mozilla.com>
Fri, 17 May 2019 10:50:36 -0400
changeset 7030 0a3096a899ce9c623af12b650b510282a19f2459
parent 7029 fdb441baf69aa69512336fbf50d243962242e7e4
child 7031 fb153bf90ccbcf641fe26e6f08895a28449fbc88
push id 3498
push user cosheehan@mozilla.com
push date Fri, 17 May 2019 14:51:23 +0000
treeherder version-control-tools@0a3096a899ce [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
TICKscripts: increase thresholds in `vcsreplicator_consumer_lag.tick` The Slack alerts were happening too frequently to be actionable. Here we increase the thresholds to a level high enough that an alert should require action.
scripts/TICKscripts/vcsreplicator_consumer_lag.tick
--- a/scripts/TICKscripts/vcsreplicator_consumer_lag.tick
+++ b/scripts/TICKscripts/vcsreplicator_consumer_lag.tick
@@ -1,63 +1,60 @@
-var alerts_log = '/var/log/kapacitor/alerts.log'
-
 // Dataframe
 var dataframe_consumer = stream
     |from()
         .measurement('exec_vcsreplicator_consumer')
         .groupBy('host', 'partition')
     |eval(lambda: int("available") - int("offset"))
         .as('message_lag')
 
 var dataframe_heads = stream
     |from()
         .measurement('exec_vcsreplicator_headsconsumer')
         .groupBy('host', 'partition')
     |eval(lambda: int("available") - int("offset"))
         .as('message_lag')
 
-// Thresholds
+// Threshold
 var alert_message_lag_consumer = dataframe_consumer
     |alert()
-        .warn(lambda: "message_lag" > 5)
-        .crit(lambda: "message_lag" > 10)
+        .warn(lambda: "message_lag" > 30)
+        .crit(lambda: "message_lag" > 50)
         .warnReset(lambda: "message_lag" < 2)
         .critReset(lambda: "message_lag" < 2)
-        .message('[{{ index .Tags "host" }}] vcsreplicator {{ index .Tags "partition" }} is {{ .Level }}: lagging by {{ index .Fields "message_lag" }} messages')
+        .message('*[{{ index .Tags "host" }}]* vcsreplicator-consumer partition {{ index .Tags "partition" }} is *{{ .Level }}*: lagging by {{ index .Fields "message_lag" }} messages')
 
 var alert_time_lag_consumer = dataframe_consumer
     |alert()
         .warn(lambda: "lag_time" > 30)
         .crit(lambda: "lag_time" > 60)
         .warnReset(lambda: "lag_time" <= 3)
         .critReset(lambda: "lag_time" <= 3)
-        .message('[{{ index .Tags "host" }}] vcsreplicator-consumer partition {{ index .Tags "partition" }} is {{ .Level }}: lagging by {{ .lag_time }} messages')
+        .message('*[{{ index .Tags "host" }}]* vcsreplicator-consumer partition {{ index .Tags "partition" }} is *{{ .Level }}*: lagging by {{ index .Fields "lag_time" }} messages')
 
 var alert_message_lag_heads = dataframe_heads
     |alert()
-        .warn(lambda: "message_lag" > 5)
-        .crit(lambda: "message_lag" > 10)
+        .warn(lambda: "message_lag" > 30)
+        .crit(lambda: "message_lag" > 50)
         .warnReset(lambda: "message_lag" < 2)
         .critReset(lambda: "message_lag" < 2)
-        .message('[{{ index .Tags "host" }}] vcsreplicator-heads partition {{ index .Tags "partition" }} is {{ .Level }}: lagging by {{ index .Fields "message_lag" }} messages')
+        .message('*[{{ index .Tags "host" }}]* vcsreplicator-heads partition {{ index .Tags "partition" }} is *{{ .Level }}*: lagging by {{ index .Fields "message_lag" }} messages')
 
 var alert_time_lag_heads = dataframe_heads
     |alert()
         .warn(lambda: "lag_time" > 30)
         .crit(lambda: "lag_time" > 60)
         .warnReset(lambda: "lag_time" <= 3)
         .critReset(lambda: "lag_time" <= 3)
-        .message('[{{ index .Tags "host" }}] vcsreplicator-heads partition {{ index .Tags "partition" }} is {{ .Level }}: lagging by {{ index .Fields "lag_time" }} messages')
+        .message('*[{{ index .Tags "host" }}]* vcsreplicator-heads partition {{ index .Tags "partition" }} is *{{ .Level }}*: lagging by {{ index .Fields "lag_time" }} messages')
 
 // Alert
 alert_time_lag_consumer
-    .log(alerts_log)
-    .slack()
+        .slack()
+
 alert_message_lag_consumer
-    .log(alerts_log)
-    .slack()
+        .slack()
+
 alert_time_lag_heads
-    .log(alerts_log)
-    .slack()
+        .slack()
+
 alert_message_lag_heads
-    .log(alerts_log)
-    .slack()
+        .slack()