Bug 909974 - Add a hang monitor for background threads; r=froydnj
author Jim Chen <nchen@mozilla.com>
Fri, 22 Nov 2013 14:17:29 -0500
changeset 157074 febf82ab059650839546c76433000dc83870091e
parent 157073 e58ce8839585a598ef47a139ea8adacfdc54eeb9
child 157075 6174eab68f5a2773c98c9d4b3ee56b12c8873db5
push id 36633
push user nchen@mozilla.com
push date Fri, 22 Nov 2013 19:20:41 +0000
treeherder mozilla-inbound@5365478bdea9 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers froydnj
bugs 909974
milestone 28.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 909974 - Add a hang monitor for background threads; r=froydnj
xpcom/build/nsXPComInit.cpp
xpcom/threads/BackgroundHangMonitor.cpp
xpcom/threads/BackgroundHangMonitor.h
xpcom/threads/moz.build
--- a/xpcom/build/nsXPComInit.cpp
+++ b/xpcom/build/nsXPComInit.cpp
@@ -101,16 +101,17 @@ extern nsresult nsStringInputStreamConst
 #include "nsSecurityConsoleMessage.h"
 #include "nsMessageLoop.h"
 
 #include <locale.h>
 #include "mozilla/Services.h"
 #include "mozilla/Omnijar.h"
 #include "mozilla/HangMonitor.h"
 #include "mozilla/Telemetry.h"
+#include "mozilla/BackgroundHangMonitor.h"
 
 #include "nsChromeRegistry.h"
 #include "nsChromeProtocolHandler.h"
 #include "mozilla/IOInterposer.h"
 #include "mozilla/PoisonIOInterposer.h"
 #include "mozilla/LateWriteChecks.h"
 
 #include "mozilla/scache/StartupCache.h"
@@ -582,16 +583,17 @@ NS_InitXPCOM2(nsIServiceManager* *result
 
     // The memory reporter manager is up and running -- register a reporter for
     // ICU's memory usage.
     NS_RegisterMemoryReporter(new ICUReporter());
 
     mozilla::Telemetry::Init();
 
     mozilla::HangMonitor::Startup();
+    mozilla::BackgroundHangMonitor::Startup();
 
 #ifdef MOZ_VISUAL_EVENT_TRACER
     mozilla::eventtracer::Init();
 #endif
 
     return NS_OK;
 }
 
@@ -827,16 +829,17 @@ ShutdownXPCOM(nsIServiceManager* servMgr
     if (sExitManager) {
         delete sExitManager;
         sExitManager = nullptr;
     }
 
     Omnijar::CleanUp();
 
     HangMonitor::Shutdown();
+    BackgroundHangMonitor::Shutdown();
 
 #ifdef MOZ_VISUAL_EVENT_TRACER
     eventtracer::Shutdown();
 #endif
 
     profiler_shutdown();
 
     NS_LogTerm();
new file mode 100644
--- /dev/null
+++ b/xpcom/threads/BackgroundHangMonitor.cpp
@@ -0,0 +1,393 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/BackgroundHangMonitor.h"
+#include "mozilla/LinkedList.h"
+#include "mozilla/Monitor.h"
+#include "mozilla/StaticPtr.h"
+#include "mozilla/ThreadLocal.h"
+
+#include "prinrval.h"
+#include "prthread.h"
+
+#include <algorithm>
+
+namespace mozilla {
+
+/**
+ * BackgroundHangManager is the global object that
+ * manages all instances of BackgroundHangThread.
+ */
+class BackgroundHangManager : public AtomicRefCounted<BackgroundHangManager>
+{
+private:
+  // Background hang monitor thread function
+  static void MonitorThread(void* aData)
+  {
+    PR_SetCurrentThreadName("BgHangManager");
+    // Keep a strong reference throughout thread lifetime
+    RefPtr<BackgroundHangManager>(
+      static_cast<BackgroundHangManager*>(aData))->RunMonitorThread();
+  }
+
+  // Hang monitor thread
+  PRThread* mHangMonitorThread;
+  // Stop hang monitoring
+  bool mShutdown;
+
+  BackgroundHangManager(const BackgroundHangManager&);
+  BackgroundHangManager& operator=(const BackgroundHangManager&);
+  void RunMonitorThread();
+
+public:
+  static StaticRefPtr<BackgroundHangManager> sInstance;
+
+  // Lock for access to members of this class
+  Monitor mLock;
+  // Current time as seen by hang monitors
+  PRIntervalTime mIntervalNow;
+  // List of BackgroundHangThread instances associated with each thread
+  LinkedList<BackgroundHangThread> mHangThreads;
+
+  void Shutdown()
+  {
+    MonitorAutoLock autoLock(mLock);
+    mShutdown = true;
+    autoLock.Notify();
+  }
+
+  void Wakeup()
+  {
+    // Use PR_Interrupt to avoid potentially taking a lock
+    PR_Interrupt(mHangMonitorThread);
+  }
+
+  BackgroundHangManager();
+  ~BackgroundHangManager();
+};
+
+/**
+ * BackgroundHangThread is a per-thread object that is used
+ * by all instances of BackgroundHangMonitor to monitor hangs.
+ */
+class BackgroundHangThread : public RefCounted<BackgroundHangThread>
+                           , public LinkedListElement<BackgroundHangThread>
+{
+private:
+  static ThreadLocal<BackgroundHangThread*> sTlsKey;
+
+  BackgroundHangThread(const BackgroundHangThread&);
+  BackgroundHangThread& operator=(const BackgroundHangThread&);
+
+  /* Keep a reference to the manager, so we can keep going even
+     after BackgroundHangManager::Shutdown is called. */
+  const RefPtr<BackgroundHangManager> mManager;
+  // Unique thread ID for identification
+  const PRThread* mThreadID;
+
+public:
+  static BackgroundHangThread* FindThread();
+
+  static void Startup()
+  {
+    /* We can tolerate init() failing.
+       The if block turns off warn_unused_result. */
+    if (!sTlsKey.init()) {}
+  }
+
+  // Name of the thread
+  const nsAutoCString mThreadName;
+  // Hang timeout in ticks
+  const PRIntervalTime mTimeout;
+  // PermaHang timeout in ticks
+  const PRIntervalTime mMaxTimeout;
+  // Time at last activity
+  PRIntervalTime mInterval;
+  // Is the thread in a waiting state
+  bool mWaiting;
+
+  BackgroundHangThread(const char* aName,
+                       uint32_t aTimeoutMs,
+                       uint32_t aMaxTimeoutMs);
+  ~BackgroundHangThread();
+
+  // Report a hang; mManager->mLock is NOT locked
+  void ReportHang(PRIntervalTime aHangTime) const;
+  // Report a permanent hang; mManager->mLock IS locked
+  void ReportPermaHang() const;
+  // Called by BackgroundHangMonitor::NotifyActivity
+  void NotifyActivity();
+  // Called by BackgroundHangMonitor::NotifyWait
+  void NotifyWait()
+  {
+    NotifyActivity();
+    mWaiting = true;
+  }
+};
+
+
+StaticRefPtr<BackgroundHangManager> BackgroundHangManager::sInstance;
+
+ThreadLocal<BackgroundHangThread*> BackgroundHangThread::sTlsKey;
+
+
+BackgroundHangManager::BackgroundHangManager()
+  : mShutdown(false)
+  , mLock("BackgroundHangManager")
+  , mIntervalNow(0)
+{
+  // Spawn the monitor thread; mLock is held so it cannot race with us
+  MonitorAutoLock autoLock(mLock);
+  mHangMonitorThread = PR_CreateThread(PR_USER_THREAD, MonitorThread, this,
+    PR_PRIORITY_LOW, PR_GLOBAL_THREAD, PR_UNJOINABLE_THREAD, 0);
+  MOZ_ASSERT(mHangMonitorThread, "Failed to create monitor thread");
+}
+
+BackgroundHangManager::~BackgroundHangManager()
+{
+  MOZ_ASSERT(mShutdown,
+    "Destruction without Shutdown call");
+  MOZ_ASSERT(mHangThreads.isEmpty(),
+    "Destruction with outstanding monitors");
+}
+
+void
+BackgroundHangManager::RunMonitorThread()
+{
+  // Keep us locked except when waiting
+  MonitorAutoLock autoLock(mLock);
+
+  /* mIntervalNow is updated at various intervals determined by waitTime.
+     However, if an update latency is too long (due to CPU scheduling, system
+     sleep, etc.), we don't update mIntervalNow at all. This is done so that
+     long latencies in our timing are not detected as hangs. systemTime is
+     used to track PR_IntervalNow() and determine our latency. */
+
+  PRIntervalTime systemTime = PR_IntervalNow();
+  // Default values for the first iteration of thread loop
+  PRIntervalTime waitTime = PR_INTERVAL_NO_WAIT;
+  PRIntervalTime permaHangTimeout = PR_INTERVAL_NO_WAIT;
+
+  while (!mShutdown) {
+
+    PR_ClearInterrupt();
+    nsresult rv = autoLock.Wait(waitTime);
+
+    PRIntervalTime newTime = PR_IntervalNow();
+    PRIntervalTime systemInterval = newTime - systemTime;
+    systemTime = newTime;
+
+    /* waitTime is a quarter of the shortest timeout value. If our timing
+       latency is low enough (less than half the shortest timeout value),
+       we can update mIntervalNow. */
+    if (MOZ_LIKELY(waitTime != PR_INTERVAL_NO_TIMEOUT &&
+                   systemInterval < 2 * waitTime)) {
+      mIntervalNow += systemInterval;
+    }
+
+    /* If it's before the next permahang timeout, and our wait did not
+       get interrupted (either through Notify or PR_Interrupt), we can
+       keep the current waitTime and skip iterating through hang monitors. */
+    if (MOZ_LIKELY(systemInterval < permaHangTimeout &&
+                   systemInterval >= waitTime &&
+                   rv == NS_OK)) {
+      permaHangTimeout -= systemInterval;
+      continue;
+    }
+
+    /* We are in one of the following scenarios,
+     - Permahang timeout
+     - Thread added/removed
+     - Thread wait ended
+       In all cases, we want to go through our list of hang
+       monitors and update waitTime and permaHangTimeout. */
+    waitTime = PR_INTERVAL_NO_TIMEOUT;
+    permaHangTimeout = PR_INTERVAL_NO_TIMEOUT;
+
+    // Locally hold mIntervalNow
+    PRIntervalTime intervalNow = mIntervalNow;
+
+    // iterate through hang monitors
+    for (BackgroundHangThread* currentThread = mHangThreads.getFirst();
+         currentThread; currentThread = currentThread->getNext()) {
+
+      if (currentThread->mWaiting) {
+        // Thread is waiting, not hanging
+        continue;
+      }
+      PRIntervalTime hangTime = intervalNow - currentThread->mInterval;
+      if (MOZ_UNLIKELY(hangTime >= currentThread->mMaxTimeout)) {
+        // Skip subsequent iterations and tolerate a race on mWaiting here
+        currentThread->mWaiting = true;
+        currentThread->ReportPermaHang();
+        continue;
+      }
+      /* We wait for a quarter of the shortest timeout
+         value to give mIntervalNow enough granularity. */
+      waitTime = std::min(waitTime, currentThread->mTimeout / 4);
+      permaHangTimeout = std::min(
+        permaHangTimeout, currentThread->mMaxTimeout - hangTime);
+    }
+  }
+
+  /* We are shutting down now.
+     Wait for all outstanding monitors to unregister. */
+  while (!mHangThreads.isEmpty()) {
+    autoLock.Wait(PR_INTERVAL_NO_TIMEOUT);
+  }
+}
+
+
+BackgroundHangThread::BackgroundHangThread(const char* aName,
+                                           uint32_t aTimeoutMs,
+                                           uint32_t aMaxTimeoutMs)
+  : mManager(BackgroundHangManager::sInstance)
+  , mThreadID(PR_GetCurrentThread())
+  , mThreadName(aName)
+  , mTimeout(PR_MillisecondsToInterval(aTimeoutMs))
+  , mMaxTimeout(PR_MillisecondsToInterval(aMaxTimeoutMs))
+  , mInterval(mManager->mIntervalNow)
+  , mWaiting(true)
+{
+  if (sTlsKey.initialized()) {
+    sTlsKey.set(this);
+  }
+  // Lock here because LinkedList is not thread-safe
+  MonitorAutoLock autoLock(mManager->mLock);
+  // Add to thread list
+  mManager->mHangThreads.insertBack(this);
+  // Wake up monitor thread to process new thread
+  autoLock.Notify();
+}
+
+BackgroundHangThread::~BackgroundHangThread()
+{
+  // Lock here because LinkedList is not thread-safe
+  MonitorAutoLock autoLock(mManager->mLock);
+  // Remove from thread list
+  remove();
+  // Wake up monitor thread to process removed thread
+  autoLock.Notify();
+
+  // We no longer have a thread
+  if (sTlsKey.initialized()) {
+    sTlsKey.set(nullptr);
+  }
+}
+
+void
+BackgroundHangThread::ReportHang(PRIntervalTime aHangTime) const
+{
+  // Recovered from a hang of aHangTime ticks; called on the hung thread
+  // from NotifyActivity, which does not take mManager->mLock
+
+  // TODO: Add telemetry reporting for hangs
+}
+
+void
+BackgroundHangThread::ReportPermaHang() const
+{
+  // Permanently hung; called on the monitor thread from RunMonitorThread
+  // while mManager->mLock IS locked
+
+  // TODO: Add telemetry reporting for perma-hangs
+}
+
+MOZ_ALWAYS_INLINE void
+BackgroundHangThread::NotifyActivity()
+{
+  PRIntervalTime intervalNow = mManager->mIntervalNow;
+  if (mWaiting) {
+    mInterval = intervalNow;
+    mWaiting = false;
+    /* We have to wake up the manager thread because when all threads
+       are waiting, the manager thread waits indefinitely as well. */
+    mManager->Wakeup();
+  } else {
+    PRIntervalTime duration = intervalNow - mInterval;
+    if (MOZ_UNLIKELY(duration >= mTimeout)) {
+      ReportHang(duration);
+    }
+    mInterval = intervalNow;
+  }
+}
+
+BackgroundHangThread*
+BackgroundHangThread::FindThread()
+{
+  if (sTlsKey.initialized()) {
+    // Use TLS if available
+    return sTlsKey.get();
+  }
+  // If TLS is unavailable, we can search through the thread list
+  RefPtr<BackgroundHangManager> manager(BackgroundHangManager::sInstance);
+  MOZ_ASSERT(manager, "Creating BackgroundHangMonitor after shutdown");
+
+  PRThread* threadID = PR_GetCurrentThread();
+  // Lock thread list for traversal
+  MonitorAutoLock autoLock(manager->mLock);
+  for (BackgroundHangThread* thread = manager->mHangThreads.getFirst();
+       thread; thread = thread->getNext()) {
+    if (thread->mThreadID == threadID) {
+      return thread;
+    }
+  }
+  // Current thread is not initialized
+  return nullptr;
+}
+
+
+void
+BackgroundHangMonitor::Startup()
+{
+  MOZ_ASSERT(!BackgroundHangManager::sInstance, "Already initialized");
+  BackgroundHangThread::Startup();
+  BackgroundHangManager::sInstance = new BackgroundHangManager();
+}
+
+void
+BackgroundHangMonitor::Shutdown()
+{
+  MOZ_ASSERT(BackgroundHangManager::sInstance, "Not initialized");
+  /* Scope our lock inside Shutdown() because the sInstance object can
+     be destroyed as soon as we set sInstance to nullptr below, and
+     we don't want to hold the lock when it's being destroyed. */
+  BackgroundHangManager::sInstance->Shutdown();
+  BackgroundHangManager::sInstance = nullptr;
+}
+
+BackgroundHangMonitor::BackgroundHangMonitor(const char* aName,
+                                             uint32_t aTimeoutMs,
+                                             uint32_t aMaxTimeoutMs)
+  : mThread(BackgroundHangThread::FindThread())
+{
+  if (!mThread) {
+    mThread = new BackgroundHangThread(aName, aTimeoutMs, aMaxTimeoutMs);
+  }
+}
+
+BackgroundHangMonitor::BackgroundHangMonitor()
+  : mThread(BackgroundHangThread::FindThread())
+{
+  MOZ_ASSERT(mThread, "Thread not initialized for hang monitoring");
+}
+
+BackgroundHangMonitor::~BackgroundHangMonitor()
+{
+}
+
+void
+BackgroundHangMonitor::NotifyActivity()
+{
+  mThread->NotifyActivity();
+}
+
+void
+BackgroundHangMonitor::NotifyWait()
+{
+  mThread->NotifyWait();
+}
+
+} // namespace mozilla
new file mode 100644
--- /dev/null
+++ b/xpcom/threads/BackgroundHangMonitor.h
@@ -0,0 +1,158 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_BackgroundHangMonitor_h
+#define mozilla_BackgroundHangMonitor_h
+
+#include "mozilla/RefPtr.h"
+
+#include <stdint.h>
+
+namespace mozilla {
+
+class BackgroundHangThread;
+
+/**
+ * The background hang monitor is responsible for detecting and reporting
+ * hangs in background (non-main) threads. A thread registers itself using
+ * the BackgroundHangMonitor object and periodically calls its methods to
+ * inform the hang monitor of the thread's activity. Each thread is given
+ * a thread name, a timeout, and a maximum timeout. If one of the thread's
+ * tasks runs for longer than the timeout duration but shorter than the
+ * maximum timeout, a (transient) hang is reported. On the other hand, if
+ * a task runs for longer than the maximum timeout duration or never
+ * finishes (e.g. in a deadlock), a permahang is reported.
+ *
+ * Tasks are defined arbitrarily, but are typically represented by events
+ * in an event loop -- processing one event is equivalent to running one
+ * task. To ensure responsiveness, tasks in a thread often have a target
+ * running time. This is a good starting point for determining the timeout
+ * and maximum timeout values. For example, the Compositor thread has a
+ * responsiveness goal of 60Hz or 17ms, so a starting timeout could be
+ * 100ms. Considering some platforms (e.g. Android) can terminate the app
+ * when a critical thread hangs for longer than a few seconds, a good
+ * starting maximum timeout is 4 or 5 seconds.
+ *
+ * A thread registers itself through the BackgroundHangMonitor constructor.
+ * Multiple BackgroundHangMonitor objects can be used in one thread. The
+ * constructor without arguments can be used when it is known that the thread
+ * already has a BackgroundHangMonitor registered. When all instances of
+ * BackgroundHangMonitor are destroyed, the thread is unregistered.
+ *
+ * The thread then uses two methods to inform BackgroundHangMonitor of the
+ * thread's activity:
+ *
+ *  > BackgroundHangMonitor::NotifyActivity should be called *before*
+ *    starting a task. The task run time is determined by the interval
+ *    between this call and the next NotifyActivity call.
+ *
+ *  > BackgroundHangMonitor::NotifyWait should be called *before* the
+ *    thread enters a wait state (e.g. to wait for a new event). This
+ *    prevents a waiting thread from being detected as hanging. The wait
+ *    state is automatically cleared at the next NotifyActivity call.
+ *
+ * The following example shows hang monitoring in a simple event loop:
+ *
+ *  void thread_main()
+ *  {
+ *    mozilla::BackgroundHangMonitor hangMonitor("example1", 100, 1000);
+ *    while (!exiting) {
+ *      hangMonitor.NotifyActivity();
+ *      process_next_event();
+ *      hangMonitor.NotifyWait();
+ *      wait_for_next_event();
+ *    }
+ *  }
+ *
+ * The following example shows reentrancy in nested event loops:
+ *
+ *  void thread_main()
+ *  {
+ *    mozilla::BackgroundHangMonitor hangMonitor("example2", 100, 1000);
+ *    while (!exiting) {
+ *      hangMonitor.NotifyActivity();
+ *      process_next_event();
+ *      hangMonitor.NotifyWait();
+ *      wait_for_next_event();
+ *    }
+ *  }
+ *
+ *  void process_next_event()
+ *  {
+ *    mozilla::BackgroundHangMonitor hangMonitor;
+ *    if (is_sync_event) {
+ *      while (!finished_event) {
+ *        hangMonitor.NotifyActivity();
+ *        process_next_event();
+ *        hangMonitor.NotifyWait();
+ *        wait_for_next_event();
+ *      }
+ *    } else {
+ *      process_nonsync_event();
+ *    }
+ *  }
+ *
+ */
+class BackgroundHangMonitor
+{
+private:
+  RefPtr<BackgroundHangThread> mThread;
+
+public:
+  /**
+   * Enable hang monitoring.
+   * Must have returned before any BackgroundHangMonitor is constructed.
+   */
+  static void Startup();
+
+  /**
+   * Disable hang monitoring.
+   * Can be called without destroying all BackgroundHangMonitors first.
+   */
+  static void Shutdown();
+
+  /**
+   * Start monitoring hangs for the current thread.
+   *
+   * @param aName Name to identify the thread with
+   * @param aTimeoutMs Amount of time in milliseconds without
+   *  activity before registering a hang
+   * @param aMaxTimeoutMs Amount of time in milliseconds without
+   *  activity before registering a permanent hang
+   */
+  BackgroundHangMonitor(const char* aName,
+                        uint32_t aTimeoutMs,
+                        uint32_t aMaxTimeoutMs);
+
+  /**
+   * Monitor hangs using an existing monitor
+   * associated with the current thread.
+   */
+  BackgroundHangMonitor();
+
+  /**
+   * Destroys the hang monitor; hang monitoring for a thread stops
+   * when all monitors associated with the thread are destroyed.
+   */
+  ~BackgroundHangMonitor();
+
+  /**
+   * Notify the hang monitor of pending current thread activity.
+   * Call this method before starting an "activity" or after
+   * exiting from a wait state.
+   */
+  void NotifyActivity();
+
+  /**
+   * Notify the hang monitor of current thread wait.
+   * Call this method before entering a wait state; call
+   * NotifyActivity when subsequently exiting the wait state.
+   */
+  void NotifyWait();
+};
+
+} // namespace mozilla
+
+#endif // mozilla_BackgroundHangMonitor_h
--- a/xpcom/threads/moz.build
+++ b/xpcom/threads/moz.build
@@ -23,22 +23,24 @@ XPIDL_MODULE = 'xpcom_threads'
 EXPORTS += [
     'nsEventQueue.h',
     'nsMemoryPressure.h',
     'nsProcess.h',
     'nsThread.h',
 ]
 
 EXPORTS.mozilla += [
+    'BackgroundHangMonitor.h',
     'HangMonitor.h',
     'LazyIdleThread.h',
     'SyncRunnable.h',
 ]
 
 UNIFIED_SOURCES += [
+    'BackgroundHangMonitor.cpp',
     'HangMonitor.cpp',
     'LazyIdleThread.cpp',
     'nsEnvironment.cpp',
     'nsEventQueue.cpp',
     'nsMemoryPressure.cpp',
     'nsProcessCommon.cpp',
     'nsThread.cpp',
     'nsThreadManager.cpp',