Backed out changeset fe3893891ade (bug 1380081)
author Sebastian Hengst <archaeopteryx@coole-files.de>
Fri, 11 Aug 2017 09:28:58 +0200
changeset 374270 2145dba678d8f69a35149a120908445e55f02587
parent 374269 e6d57bb2b33e6b37585b44e0bfd3f76d45f808d2
child 374271 d7a7569e436a330711c0cd8b622c399b14fb3190
push id 32318
push user kwierso@gmail.com
push date Fri, 11 Aug 2017 20:16:01 +0000
treeherder mozilla-central@80ff3f300e05 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
bugs 1380081
milestone 57.0a1
backs out fe3893891adea4475aad9703e9740f64a8ba2117
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Backed out changeset fe3893891ade (bug 1380081)
toolkit/components/telemetry/TelemetrySession.jsm
--- a/toolkit/components/telemetry/TelemetrySession.jsm
+++ b/toolkit/components/telemetry/TelemetrySession.jsm
@@ -54,16 +54,18 @@ const ENVIRONMENT_CHANGE_LISTENER = "Tel
 
 const MS_IN_ONE_HOUR  = 60 * 60 * 1000;
 const MIN_SUBSESSION_LENGTH_MS = Services.prefs.getIntPref("toolkit.telemetry.minSubsessionLength", 5 * 60) * 1000;
 
 const LOGGER_NAME = "Toolkit.Telemetry";
 const LOGGER_PREFIX = "TelemetrySession" + (Utils.isContentProcess ? "#content::" : "::");
 
 const MESSAGE_TELEMETRY_PAYLOAD = "Telemetry:Payload";
+const MESSAGE_TELEMETRY_THREAD_HANGS = "Telemetry:ChildThreadHangs";
+const MESSAGE_TELEMETRY_GET_CHILD_THREAD_HANGS = "Telemetry:GetChildThreadHangs";
 const MESSAGE_TELEMETRY_USS = "Telemetry:USS";
 const MESSAGE_TELEMETRY_GET_CHILD_USS = "Telemetry:GetChildUSS";
 
 const DATAREPORTING_DIRECTORY = "datareporting";
 const ABORTED_SESSION_FILE_NAME = "aborted-session-ping";
 
 // Whether the FHR/Telemetry unification features are enabled.
 // Changing this pref requires a restart.
@@ -513,16 +515,27 @@ this.TelemetrySession = Object.freeze({
    * @param reason Optional, the reason to trigger the payload.
    * @param clearSubsession Optional, whether to clear subsession specific data.
    * @returns Object
    */
   getPayload(reason, clearSubsession = false) {
     return Impl.getPayload(reason, clearSubsession);
   },
   /**
+   * Returns a promise that resolves to an array of thread hang stats from content processes, one entry per process.
+   * The structure of each entry is identical to that of "threadHangStats" in nsITelemetry.
+   * While thread hang stats are also part of the child payloads, this function retrieves them cheaply,
+   * which makes it suitable for realtime hang monitoring.
+   * Child processes that do not respond, or spawn/die during execution of this function are excluded from the result.
+   * @returns Promise
+   */
+  getChildThreadHangs() {
+    return Impl.getChildThreadHangs();
+  },
+  /**
    * Save the session state to a pending file.
    * Used only for testing purposes.
    */
   testSavePendingPing() {
     return Impl.testSavePendingPing();
   },
   /**
    * Collect and store information about startup.
@@ -641,16 +654,27 @@ var Impl = {
   // The previous build ID, if this is the first run with a new build.
   // Null if this is the first run, or the previous build ID is unknown.
   _previousBuildId: null,
   // Telemetry payloads sent by child processes.
   // Each element is in the format {source: <weak-ref>, payload: <object>},
   // where source is a weak reference to the child process,
   // and payload is the telemetry payload from that child process.
   _childTelemetry: [],
+  // Thread hangs from child processes.
+  // Used for TelemetrySession.getChildThreadHangs(); not sent with Telemetry pings.
+  // TelemetrySession.getChildThreadHangs() is used by extensions such as Statuser (https://github.com/chutten/statuser).
+  // Each element is in the format {source: <weak-ref>, payload: <object>},
+  // where source is a weak reference to the child process,
+  // and payload contains the thread hang stats from that child process.
+  _childThreadHangs: [],
+  // Array of the resolve functions of all the promises that are waiting for the child thread hang stats to arrive, used to resolve all those promises at once.
+  _childThreadHangsResolveFunctions: [],
+  // Handle of the timeout (as returned by setTimeout) that bounds child thread hang stats retrieval.
+  _childThreadHangsTimeout: null,
   // Unique id that identifies this session so the server can cope with duplicate
   // submissions, orphaning and other oddities. The id is shared across subsessions.
   _sessionId: null,
   // Random subsession id.
   _subsessionId: null,
   // Session id of the previous session, null on first run.
   _previousSessionId: null,
   // Subsession id of the previous subsession (even if it was in a different session),
@@ -1471,16 +1495,17 @@ var Impl = {
     if (previousBuildId != thisBuildID) {
       this._previousBuildId = previousBuildId;
       Services.prefs.setStringPref(TelemetryUtils.Preferences.PreviousBuildID, thisBuildID);
     }
 
     this.attachEarlyObservers();
 
     ppml.addMessageListener(MESSAGE_TELEMETRY_PAYLOAD, this);
+    ppml.addMessageListener(MESSAGE_TELEMETRY_THREAD_HANGS, this);
     ppml.addMessageListener(MESSAGE_TELEMETRY_USS, this);
   },
 
   /**
    * Does the "heavy" Telemetry initialization later on, so we
    * don't impact startup performance.
    * @return {Promise} Resolved when the initialization completes.
    */
@@ -1557,16 +1582,17 @@ var Impl = {
     this._testing = testing;
 
     if (!Telemetry.canRecordBase) {
       this._log.trace("setupContentProcess - base recording is disabled, not initializing");
       return;
     }
 
     this.addObserver("content-child-shutdown");
+    cpml.addMessageListener(MESSAGE_TELEMETRY_GET_CHILD_THREAD_HANGS, this);
     cpml.addMessageListener(MESSAGE_TELEMETRY_GET_CHILD_USS, this);
 
     let delayedTask = new DeferredTask(() => {
       this._initialized = true;
 
       this.attachObservers();
       this.gatherMemory();
 
@@ -1606,16 +1632,45 @@ var Impl = {
 
       if (this._childTelemetry.length == MAX_NUM_CONTENT_PAYLOADS + 1) {
         this._childTelemetry.shift();
         Telemetry.getHistogramById("TELEMETRY_DISCARDED_CONTENT_PINGS_COUNT").add();
       }
 
       break;
     }
+    case MESSAGE_TELEMETRY_THREAD_HANGS:
+    {
+      // Accumulate child thread hang stats from this child
+      this._childThreadHangs.push(message.data);
+
+      // Check whether we have heard from every child we can still expect a reply from.
+      // A child process may die before replying; as long as no new child process is spawned at the same time,
+      // the drop in ppmm.childCount lets us resolve early instead of waiting for the timeout to expire.
+      // The number of replies is at most the number of requests sent out (this._childCount)
+      // and also at most the number of child processes that currently exist.
+      if (this._childThreadHangs.length === Math.min(this._childCount, ppmm.childCount)) {
+        clearTimeout(this._childThreadHangsTimeout);
+
+        // Resolve all the promises that are waiting on these thread hang stats
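+        // We resolve here instead of rejecting because children that died are simply excluded from the result; callers only want stats from the processes that responded.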
+        for (let resolve of this._childThreadHangsResolveFunctions) {
+          resolve(this._childThreadHangs);
+        }
+        this._childThreadHangsResolveFunctions = [];
+      }
+
+      break;
+    }
+    case MESSAGE_TELEMETRY_GET_CHILD_THREAD_HANGS:
+    {
+      // In child process, send the requested child thread hangs
+      this.sendContentProcessThreadHangs();
+      break;
+    }
     case MESSAGE_TELEMETRY_USS:
     {
       // In parent process, receive the USS report from the child
       if (this._totalMemoryTimeout && this._childrenToHearFrom.delete(message.data.id)) {
         let uss = message.data.bytes;
         this._totalMemory += uss;
         this._USSFromChildProcesses.push(uss);
         if (this._childrenToHearFrom.size == 0) {
@@ -1695,16 +1750,25 @@ var Impl = {
   sendContentProcessPing: function sendContentProcessPing(reason) {
     this._log.trace("sendContentProcessPing - Reason " + reason);
     const isSubsession = !this._isClassicReason(reason);
     let payload = this.getSessionPayload(reason, isSubsession);
     payload.childUUID = this._processUUID;
     cpmm.sendAsyncMessage(MESSAGE_TELEMETRY_PAYLOAD, payload);
   },
 
+  sendContentProcessThreadHangs: function sendContentProcessThreadHangs() {
+    this._log.trace("sendContentProcessThreadHangs");
+    let payload = {
+      childUUID: this._processUUID,
+      hangs: Telemetry.threadHangStats,
+    };
+    cpmm.sendAsyncMessage(MESSAGE_TELEMETRY_THREAD_HANGS, payload);
+  },
+
    /**
     * Save both the "saved-session" and the "shutdown" pings to disk.
     * This needs to be called after TelemetrySend shuts down otherwise pings
     * would be sent instead of getting persisted to disk.
     */
   saveShutdownPings() {
     this._log.trace("saveShutdownPings");
 
@@ -1784,16 +1848,60 @@ var Impl = {
     // We only gather startup info once.
     if (Object.keys(this._slowSQLStartup).length == 0) {
       this._slowSQLStartup = Telemetry.slowSQL;
     }
     this.gatherMemory();
     return this.getSessionPayload(reason, clearSubsession);
   },
 
+  getChildThreadHangs: function getChildThreadHangs() {
+    return new Promise((resolve) => {
+      // Return immediately if there are no child processes to get stats from
+      if (ppmm.childCount === 0) {
+        resolve([]);
+        return;
+      }
+
+      // Register our promise so it will be resolved when we receive the child thread hang stats on the parent process
+      // The resolve functions will all be called from "receiveMessage" when a MESSAGE_TELEMETRY_THREAD_HANGS message comes in
+      this._childThreadHangsResolveFunctions.push((threadHangStats) => {
+        let hangs = threadHangStats.map(child => child.hangs);
+        return resolve(hangs);
+      });
+
+      // If we (the parent) are not currently in the process of requesting child thread hangs, request them
+      // If we are, then the resolve function we registered above will receive the results without needing to request them again
+      if (this._childThreadHangsResolveFunctions.length === 1) {
+        // We have to cache the number of children we send messages to, in case the child count changes while waiting for messages to arrive
+        // This handles the case where the child count increases later on, in which case the new processes won't respond since we never sent messages to them
+        this._childCount = ppmm.childCount;
+
+        this._childThreadHangs = []; // Clear the child hangs
+        for (let i = 0; i < this._childCount; i++) {
+          // If a child dies exactly while we're running this loop, the message sending will fail but we won't get an exception
+          // In this case, since we won't know this has happened, we will simply rely on the timeout to handle it
+          ppmm.getChildAt(i).sendAsyncMessage(MESSAGE_TELEMETRY_GET_CHILD_THREAD_HANGS);
+        }
+
+        // Set up a timeout in case one or more of the content processes never responds
+        this._childThreadHangsTimeout = setTimeout(() => {
+          // Resolve all the promises that are waiting on these thread hang stats
+          // We resolve here instead of rejecting because the purpose of this function is
+          // to retrieve the BHR stats from all processes that will give us stats
+          // As a result, one process failing simply means it doesn't get included in the result.
+          for (let resolve of this._childThreadHangsResolveFunctions) {
+            resolve(this._childThreadHangs);
+          }
+          this._childThreadHangsResolveFunctions = [];
+        }, 200);
+      }
+    });
+  },
+
   gatherStartup: function gatherStartup() {
     this._log.trace("gatherStartup");
     let counters = processInfo.getCounters();
     if (counters) {
       [this._startupIO.startupSessionRestoreReadBytes,
         this._startupIO.startupSessionRestoreWriteBytes] = counters;
     }
     this._slowSQLStartup = Telemetry.slowSQL;