Bug 1271077 - Print system resource utilization so it appears in Treeherder; r?jlund draft
authorGregory Szorc <gps@mozilla.com>
Mon, 09 May 2016 22:30:18 -0700
changeset 365176 620b4e2a6459269eba35bce0563acf8839db1ba2
parent 364815 043082cb7bd8490c60815f67fbd1f33323ad7663
child 520470 596189b94e734313cfe1c8e2d033a990a28a9a5c
push id17652
push usergszorc@mozilla.com
push dateTue, 10 May 2016 05:30:35 +0000
reviewersjlund
bugs1271077, 1271035
milestone49.0a1
Bug 1271077 - Print system resource utilization so it appears in Treeherder; r?jlund The system resource utilization during job execution is important: it gives us an idea of the efficiency (or lack thereof) of activities. As bug 1271035 showed us, there can be some really wonky things going on during job execution. To help us notice these things, this commit prints some overall resource utilization data with the special "TinderboxPrint" syntax so it appears in Treeherder. This should hopefully draw the attention of more eye balls and cause people to ask questions about what jobs are doing. This supplements the existing printing of total resource usage in the logs. Unfortunately nobody was really looking at that data because it wasn't exposed that well. This commit should change that. MozReview-Commit-ID: AXNRDS9lrOd
testing/mozharness/mozharness/base/python.py
--- a/testing/mozharness/mozharness/base/python.py
+++ b/testing/mozharness/mozharness/base/python.py
@@ -516,17 +516,20 @@ class ResourceMonitoringMixin(object):
         if rm.start_time is None:
             return
 
         def resources(phase):
             cpu_percent = rm.aggregate_cpu_percent(phase=phase, per_cpu=False)
             cpu_times = rm.aggregate_cpu_times(phase=phase, per_cpu=False)
             io = rm.aggregate_io(phase=phase)
 
-            return cpu_percent, cpu_times, io
+            swap_in = sum(m.swap.sin for m in rm.measurements)
+            swap_out = sum(m.swap.sout for m in rm.measurements)
+
+            return cpu_percent, cpu_times, io, (swap_in, swap_out)
 
         def log_usage(prefix, duration, cpu_percent, cpu_times, io):
             message = '{prefix} - Wall time: {duration:.0f}s; ' \
                 'CPU: {cpu_percent}; ' \
                 'Read bytes: {io_read_bytes}; Write bytes: {io_write_bytes}; ' \
                 'Read time: {io_read_time}; Write time: {io_write_time}'
 
             # XXX Some test harnesses are complaining about a string being
@@ -538,30 +541,63 @@ class ResourceMonitoringMixin(object):
                 self.info(
                     message.format(
                         prefix=prefix, duration=duration,
                         cpu_percent=cpu_percent_str, io_read_bytes=io.read_bytes,
                         io_write_bytes=io.write_bytes, io_read_time=io.read_time,
                         io_write_time=io.write_time
                     )
                 )
+
             except ValueError:
                 self.warning("Exception when formatting: %s" %
                              traceback.format_exc())
 
-        cpu_percent, cpu_times, io = resources(None)
+        cpu_percent, cpu_times, io, (swap_in, swap_out) = resources(None)
         duration = rm.end_time - rm.start_time
 
         log_usage('Total resource usage', duration, cpu_percent, cpu_times, io)
 
+        # Print special messages so usage shows up in Treeherder.
+        if cpu_percent:
+            self._tinderbox_print('CPU usage<br/>{:,.1f}%'.format(
+                                  cpu_percent))
+
+        self._tinderbox_print('I/O read bytes / time<br/>{:,} / {:,}'.format(
+                              io.read_bytes, io.read_time))
+        self._tinderbox_print('I/O write bytes / time<br/>{:,} / {:,}'.format(
+                              io.write_bytes, io.write_time))
+
+        # Print CPU components having >1%.
+        for attr in sorted(dir(cpu_times)):
+            if attr.startswith('_'):
+                continue
+
+            if attr in ('count', 'index'):
+                continue
+
+            value = getattr(cpu_times, attr)
+            percent = value / duration * 100.0
+            if percent > 1.00:
+                self._tinderbox_print('CPU {}<br/>{:,.1f} ({:.1f}%)'.format(
+                                      attr, value, percent))
+
+        # Swap on Windows isn't reported by psutil.
+        if os.name not in ('nt', 'ce'):
+            self._tinderbox_print('Swap in / out<br/>{:,} / {:,}'.format(
+                                  swap_in, swap_out))
+
         for phase in rm.phases.keys():
             start_time, end_time = rm.phases[phase]
             cpu_percent, cpu_times, io = resources(phase)
             log_usage(phase, end_time - start_time, cpu_percent, cpu_times, io)
 
+    def _tinderbox_print(self, message):
+        self.info('TinderboxPrint: %s' % message)
+
 
 class InfluxRecordingMixin(object):
     """Provides InfluxDB stat recording to scripts.
 
     This class records stats to an InfluxDB server, if enabled. Stat recording
     is enabled in a script by inheriting from this class, and adding an
     influxdb_credentials line to the influx_credentials_file (usually oauth.txt
     in automation).  This line should look something like: