Bug 872496 - Allow early registration of stack tops, to improve native unwind quality. r=bgirard.
authorJulian Seward <jseward@acm.org>
Tue, 28 May 2013 14:03:38 +0200
changeset 145460 383bed640c7b37061e25ff168be1455d531bbd03
parent 145459 44bda40fd2b798e100aebd3a186da945e62dad22
child 145461 778368babc940c2d9a4f584f8efcc15542adb132
push id: 368
push user: bbajaj@mozilla.com
push date: Mon, 09 Sep 2013 22:57:58 +0000
treeherder: mozilla-release@5a4f47ae1217 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: bgirard
bugs: 872496
milestone: 24.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 872496 - Allow early registration of stack tops, to improve native unwind quality. r=bgirard.
dom/indexedDB/TransactionThreadPool.cpp
dom/workers/RuntimeService.cpp
ipc/chromium/src/base/thread.cc
toolkit/xre/nsAppRunner.cpp
toolkit/xre/nsEmbedFunctions.cpp
tools/profiler/GeckoProfiler.h
tools/profiler/GeckoProfilerFunc.h
tools/profiler/GeckoProfilerImpl.h
tools/profiler/TableTicker.cpp
tools/profiler/UnwinderThread2.cpp
tools/profiler/UnwinderThread2.h
tools/profiler/platform-linux.cc
tools/profiler/platform-macos.cc
tools/profiler/platform-win32.cc
tools/profiler/platform.cpp
tools/profiler/platform.h
xpcom/build/nsXPComInit.cpp
xpcom/threads/LazyIdleThread.cpp
--- a/dom/indexedDB/TransactionThreadPool.cpp
+++ b/dom/indexedDB/TransactionThreadPool.cpp
@@ -667,17 +667,18 @@ FinishTransactionRunnable::Run()
 
 NS_IMPL_THREADSAFE_ISUPPORTS1(TransactionThreadPoolListener,
                               nsIThreadPoolListener)
 
 NS_IMETHODIMP
 TransactionThreadPoolListener::OnThreadCreated()
 {
   MOZ_ASSERT(!NS_IsMainThread());
-  profiler_register_thread("IndexedDB Transaction");
+  char aLocal;
+  profiler_register_thread("IndexedDB Transaction", &aLocal);
   return NS_OK;
 }
 
 NS_IMETHODIMP
 TransactionThreadPoolListener::OnThreadShuttingDown()
 {
   MOZ_ASSERT(!NS_IsMainThread());
   profiler_unregister_thread();
--- a/dom/workers/RuntimeService.cpp
+++ b/dom/workers/RuntimeService.cpp
@@ -509,17 +509,18 @@ public:
     if (!cx) {
       // XXX need to fire an error at parent.
       NS_ERROR("Failed to create runtime and context!");
       return NS_ERROR_FAILURE;
     }
 
     JSRuntime* rt = JS_GetRuntime(cx);
 
-    profiler_register_thread("WebWorker");
+    char aLocal;
+    profiler_register_thread("WebWorker", &aLocal);
 #ifdef MOZ_ENABLE_PROFILER_SPS
     if (PseudoStack* stack = mozilla_get_pseudo_stack())
       stack->sampleRuntime(rt);
 #endif
 
     {
       JSAutoRequest ar(cx);
       workerPrivate->DoRunLoop(cx);
--- a/ipc/chromium/src/base/thread.cc
+++ b/ipc/chromium/src/base/thread.cc
@@ -132,17 +132,18 @@ void Thread::StopSoon() {
   // most likely means that the thread terminated unexpectedly, probably due
   // to someone calling Quit() on our message loop directly.
   DCHECK(message_loop_);
 
   message_loop_->PostTask(FROM_HERE, new ThreadQuitTask());
 }
 
 void Thread::ThreadMain() {
-  profiler_register_thread(name_.c_str());
+  char aLocal;
+  profiler_register_thread(name_.c_str(), &aLocal);
 
   // The message loop for this thread.
   MessageLoop message_loop(startup_data_->options.message_loop_type);
 
   // Complete the initialization of our Thread object.
   thread_id_ = PlatformThread::CurrentId();
   PlatformThread::SetName(name_.c_str());
   message_loop.set_thread_name(name_);
--- a/toolkit/xre/nsAppRunner.cpp
+++ b/toolkit/xre/nsAppRunner.cpp
@@ -3880,17 +3880,18 @@ XREMain::XRE_mainRun()
 }
 
 /*
  * XRE_main - A class based main entry point used by most platforms.
  */
 int
 XREMain::XRE_main(int argc, char* argv[], const nsXREAppData* aAppData)
 {
-  GeckoProfilerInitRAII profilerGuard;
+  char aLocal;
+  GeckoProfilerInitRAII profilerGuard(&aLocal);
   PROFILER_LABEL("Startup", "XRE_Main");
 
   nsresult rv = NS_OK;
 
   gArgc = argc;
   gArgv = argv;
 
   NS_ENSURE_TRUE(aAppData, 2);
@@ -4075,17 +4076,18 @@ public:
     }
   }
   HRESULT mResult;
 };
 
 int
 XRE_mainMetro(int argc, char* argv[], const nsXREAppData* aAppData)
 {
-  GeckoProfilerInitRAII profilerGuard;
+  char aLocal;
+  GeckoProfilerInitRAII profilerGuard(&aLocal);
   PROFILER_LABEL("Startup", "XRE_Main");
 
   nsresult rv = NS_OK;
 
   xreMainPtr = new XREMain();
   if (!xreMainPtr) {
     return 1;
   }
--- a/toolkit/xre/nsEmbedFunctions.cpp
+++ b/toolkit/xre/nsEmbedFunctions.cpp
@@ -277,17 +277,18 @@ SetTaskbarGroupId(const nsString& aId)
 nsresult
 XRE_InitChildProcess(int aArgc,
                      char* aArgv[],
                      GeckoProcessType aProcess)
 {
   NS_ENSURE_ARG_MIN(aArgc, 2);
   NS_ENSURE_ARG_POINTER(aArgv);
   NS_ENSURE_ARG_POINTER(aArgv[0]);
-  profiler_init();
+  char aLocal;
+  profiler_init(&aLocal);
   PROFILER_LABEL("Startup", "XRE_InitChildProcess");
 
   sChildProcessType = aProcess;
 
   // Complete 'task_t' exchange for Mac OS X. This structure has the same size
   // regardless of architecture so we don't have any cross-arch issues here.
 #ifdef XP_MACOSX
   if (aArgc < 1)
--- a/tools/profiler/GeckoProfiler.h
+++ b/tools/profiler/GeckoProfiler.h
@@ -74,17 +74,17 @@
 
 // Main thread specilization to avoid TLS lookup for performance critical use.
 #define PROFILER_MAIN_THREAD_LABEL(name_space, info) do {} while (0)
 #define PROFILER_MAIN_THREAD_LABEL_PRINTF(name_space, info, format, ...) do {} while (0)
 
 // Initilize the profiler TLS, signal handlers on linux. If MOZ_PROFILER_STARTUP
 // is set the profiler will be started. This call must happen before any other
 // sampler calls. Particularly sampler_label/sampler_marker.
-static inline void profiler_init() {};
+static inline void profiler_init(void* stackTop) {};
 
 // Clean up the profiler module, stopping it if required. This function may
 // also save a shutdown profile if requested. No profiler calls should happen
 // after this point and all pseudo labels should have been popped.
 static inline void profiler_shutdown() {};
 
 // Start the profiler with the selected options. The samples will be
 // recorded in a circular buffer.
@@ -130,34 +130,34 @@ static inline void profiler_print_locati
 // Discard the profile, throw away the profile and notify 'profiler-locked'.
 // This function is to be used when entering private browsing to prevent
 // the profiler from collecting sensitive data.
 static inline void profiler_lock() {}
 
 // Re-enable the profiler and notify 'profiler-unlocked'.
 static inline void profiler_unlock() {}
 
-static inline void profiler_register_thread(const char* name) {}
+static inline void profiler_register_thread(const char* name, void* stackTop) {}
 static inline void profiler_unregister_thread() {}
 
 // Call by the JSRuntime's operation callback. This is used to enable
 // profiling on auxilerary threads.
 static inline void profiler_js_operation_callback() {}
 
 static inline double profiler_time() { return 0; }
 
 #else
 
 #include "GeckoProfilerImpl.h"
 
 #endif
 
 class GeckoProfilerInitRAII {
 public:
-  GeckoProfilerInitRAII() {
-    profiler_init();
+  GeckoProfilerInitRAII(void* stackTop) {
+    profiler_init(stackTop);
   }
   ~GeckoProfilerInitRAII() {
     profiler_shutdown();
   }
 };
 
 #endif // ifndef SAMPLER_H
--- a/tools/profiler/GeckoProfilerFunc.h
+++ b/tools/profiler/GeckoProfilerFunc.h
@@ -37,33 +37,33 @@ const double* mozilla_sampler_get_respon
 void mozilla_sampler_save();
 
 char* mozilla_sampler_get_profile();
 
 JSObject *mozilla_sampler_get_profile_data(JSContext *aCx);
 
 const char** mozilla_sampler_get_features();
 
-void mozilla_sampler_init();
+void mozilla_sampler_init(void* stackTop);
 
 void mozilla_sampler_shutdown();
 
 void mozilla_sampler_print_location1();
 void mozilla_sampler_print_location2();
 
 // Lock the profiler. When locked the profiler is (1) stopped,
 // (2) profile data is cleared, (3) profiler-locked is fired.
 // This is used to lock down the profiler during private browsing
 void mozilla_sampler_lock();
 
 // Unlock the profiler, leaving it stopped and fires profiler-unlocked.
 void mozilla_sampler_unlock();
 
 // Register/unregister threads with the profiler
-bool mozilla_sampler_register_thread(const char* name);
+bool mozilla_sampler_register_thread(const char* name, void* stackTop);
 void mozilla_sampler_unregister_thread();
 
 double mozilla_sampler_time();
 
 /* Returns true if env var SPS_NEW is set to anything, else false. */
 extern bool sps_version2();
 
 #endif
--- a/tools/profiler/GeckoProfilerImpl.h
+++ b/tools/profiler/GeckoProfilerImpl.h
@@ -47,19 +47,19 @@ extern bool stack_key_initialized;
 # elif defined(_MSC_VER)
 #  define SAMPLE_FUNCTION_NAME __FUNCTION__
 # else
 #  define SAMPLE_FUNCTION_NAME __func__  // defined in C99, supported in various C++ compilers. Just raw function name.
 # endif
 #endif
 
 static inline
-void profiler_init()
+void profiler_init(void* stackTop)
 {
-  mozilla_sampler_init();
+  mozilla_sampler_init(stackTop);
 }
 
 static inline
 void profiler_shutdown()
 {
   mozilla_sampler_shutdown();
 }
 
@@ -136,19 +136,19 @@ void profiler_lock()
 
 static inline
 void profiler_unlock()
 {
   return mozilla_sampler_unlock();
 }
 
 static inline
-void profiler_register_thread(const char* name)
+void profiler_register_thread(const char* name, void* stackTop)
 {
-  mozilla_sampler_register_thread(name);
+  mozilla_sampler_register_thread(name, stackTop);
 }
 
 static inline
 void profiler_unregister_thread()
 {
   mozilla_sampler_unregister_thread();
 }
 
--- a/tools/profiler/TableTicker.cpp
+++ b/tools/profiler/TableTicker.cpp
@@ -557,17 +557,17 @@ static void print_callback(const Profile
     case 'c':
       printf_stderr("  %s\n", tagStringData);
   }
 }
 
 void mozilla_sampler_print_location1()
 {
   if (!stack_key_initialized)
-    profiler_init();
+    profiler_init(NULL);
 
   PseudoStack *stack = tlsPseudoStack.get();
   if (!stack) {
     MOZ_ASSERT(false);
     return;
   }
 
   ThreadProfile threadProfile("Temp", PROFILE_DEFAULT_ENTRY, stack,
--- a/tools/profiler/UnwinderThread2.cpp
+++ b/tools/profiler/UnwinderThread2.cpp
@@ -78,16 +78,20 @@ void uwt__stop()
 void uwt__deinit()
 {
 }
 
 void uwt__register_thread_for_profiling ( void* stackTop )
 {
 }
 
+void uwt__unregister_thread_for_profiling()
+{
+}
+
 // RUNS IN SIGHANDLER CONTEXT
 UnwinderThreadBuffer* uwt__acquire_empty_buffer()
 {
   return NULL;
 }
 
 // RUNS IN SIGHANDLER CONTEXT
 void
@@ -117,16 +121,19 @@ static void* unwind_thr_fn ( void* exit_
 static pthread_t unwind_thr;
 static int       unwind_thr_exit_now = 0; // RACED ON
 
 // Threads must be registered with this file before they can be
 // sampled.  So that we know the max safe stack address for each
 // registered thread.
 static void thread_register_for_profiling ( void* stackTop );
 
+// Unregister a thread.
+static void thread_unregister_for_profiling();
+
 // Frees some memory when the unwinder thread is shut down.
 static void do_breakpad_unwind_Buffer_free_singletons();
 
 // RUNS IN SIGHANDLER CONTEXT
 // Acquire an empty buffer and mark it as FILLING
 static UnwinderThreadBuffer* acquire_empty_buffer();
 
 // RUNS IN SIGHANDLER CONTEXT
@@ -171,16 +178,21 @@ void uwt__deinit()
   do_breakpad_unwind_Buffer_free_singletons();
 }
 
 void uwt__register_thread_for_profiling(void* stackTop)
 {
   thread_register_for_profiling(stackTop);
 }
 
+void uwt__unregister_thread_for_profiling()
+{
+  thread_unregister_for_profiling();
+}
+
 // RUNS IN SIGHANDLER CONTEXT
 UnwinderThreadBuffer* uwt__acquire_empty_buffer()
 {
   return acquire_empty_buffer();
 }
 
 // RUNS IN SIGHANDLER CONTEXT
 void
@@ -343,31 +355,39 @@ typedef
   StackLimit;
 
 /* Globals -- the buffer array */
 #define N_UNW_THR_BUFFERS 10
 /*SL*/ static UnwinderThreadBuffer** g_buffers     = NULL;
 /*SL*/ static uint64_t               g_seqNo       = 0;
 /*SL*/ static SpinLock               g_spinLock    = { 0 };
 
-/* Globals -- the thread array */
-#define N_SAMPLING_THREADS 10
-/*SL*/ static StackLimit g_stackLimits[N_SAMPLING_THREADS];
-/*SL*/ static int        g_stackLimitsUsed = 0;
+/* Globals -- the thread array.  The array is dynamically expanded on
+   demand.  The spinlock must be held when accessing g_stackLimits,
+   g_stackLimits[some index], g_stackLimitsUsed and g_stackLimitsSize.
+   However, the spinlock must not be held when calling malloc to
+   allocate or expand the array, as that would risk deadlock against a
+   sampling thread that holds the malloc lock and is trying to acquire
+   the spinlock. */
+/*SL*/ static StackLimit* g_stackLimits     = NULL;
+/*SL*/ static size_t      g_stackLimitsUsed = 0;
+/*SL*/ static size_t      g_stackLimitsSize = 0;
 
 /* Stats -- atomically incremented, no lock needed */
 static uintptr_t g_stats_totalSamples = 0; // total # sample attempts
 static uintptr_t g_stats_noBuffAvail  = 0; // # failed due to no buffer avail
+static uintptr_t g_stats_thrUnregd    = 0; // # failed due to unregistered thr
 
 /* We must be VERY CAREFUL what we do with the spinlock held.  The
    only thing it is safe to do with it held is modify (viz, read or
    write) g_buffers, g_buffers[], g_seqNo, g_buffers[]->state,
-   g_stackLimits[] and g_stackLimitsUsed.  No arbitrary computations,
-   no syscalls, no printfs, no file IO, and absolutely no dynamic
-   memory allocation (else we WILL eventually deadlock).
+   g_stackLimits, g_stackLimits[], g_stackLimitsUsed and
+   g_stackLimitsSize.  No arbitrary computations, no syscalls, no
+   printfs, no file IO, and absolutely no dynamic memory allocation
+   (else we WILL eventually deadlock).
 
    This applies both to the signal handler and to the unwinder thread.
 */
 
 //// END type UnwindThreadBuffer
 //////////////////////////////////////////////////////////
 
 // fwds
@@ -471,70 +491,201 @@ static void atomic_INC(uintptr_t* loc)
   while (1) {
     uintptr_t old = *loc;
     uintptr_t nyu = old + 1;
     bool ok = do_CASW( loc, old, nyu );
     if (ok) break;
   }
 }
 
-/* Register a thread for profiling.  It must not be allowed to receive
-   signals before this is done, else the signal handler will
-   MOZ_ASSERT. */
+// Registers a thread for profiling.  Detects and ignores duplicate
+// registration.
 static void thread_register_for_profiling(void* stackTop)
 {
-  int i;
-  /* Minimal sanity check on stackTop */
-  MOZ_ASSERT( (void*)&i < stackTop );
+  pthread_t me = pthread_self();
 
   spinLock_acquire(&g_spinLock);
 
-  pthread_t me = pthread_self();
-  for (i = 0; i < g_stackLimitsUsed; i++) {
-    /* check for duplicate registration */
-    MOZ_ASSERT(g_stackLimits[i].thrId != me);
+  // tmp copy of g_stackLimitsUsed, to avoid racing in message printing
+  int n_used;
+
+  // Ignore spurious calls which aren't really registering anything.
+  if (stackTop == NULL) {
+    n_used = g_stackLimitsUsed;
+    spinLock_release(&g_spinLock);
+    LOGF("BPUnw: [%d total] thread_register_for_profiling"
+         "(me=%p, stacktop=NULL) (IGNORED)", n_used, (void*)me);
+    return;
+  }
+
+  /* Minimal sanity check on stackTop */
+  MOZ_ASSERT((void*)&n_used/*any auto var will do*/ < stackTop);
+
+  bool is_dup = false;
+  for (size_t i = 0; i < g_stackLimitsUsed; i++) {
+    if (g_stackLimits[i].thrId == me) {
+      is_dup = true;
+      break;
+    }
+  }
+
+  if (is_dup) {
+    /* It's a duplicate registration.  Ignore it: drop the lock and
+       return. */
+    n_used = g_stackLimitsUsed;
+    spinLock_release(&g_spinLock);
+
+    LOGF("BPUnw: [%d total] thread_register_for_profiling"
+         "(me=%p, stacktop=%p) (DUPLICATE)", n_used, (void*)me, stackTop);
+    return;
   }
-  if (!(g_stackLimitsUsed < N_SAMPLING_THREADS))
-    MOZ_CRASH();  // Don't continue -- we'll get memory corruption.
+
+  /* Make sure the g_stackLimits array is large enough to accommodate
+     this new entry.  This is tricky.  If it isn't large enough, we
+     can malloc a larger version, but we have to do that without
+     holding the spinlock, else we risk deadlock.  The deadlock
+     scenario is:
+
+     Some other thread that is being sampled
+                                        This thread
+
+     call malloc                        call this function
+     acquire malloc lock                acquire the spinlock
+     (sampling signal)                  discover thread array not big enough,
+     call uwt__acquire_empty_buffer       call malloc to make it larger
+     acquire the spinlock               acquire malloc lock
+
+     This gives an inconsistent lock acquisition order on the malloc
+     lock and spinlock, hence risk of deadlock.
+
+     Allocating more space for the array without holding the spinlock
+     implies tolerating races against other thread(s) who are also
+     trying to expand the array.  How can we detect if we have been
+     out-raced?  Every successful expansion of g_stackLimits[] results
+     in an increase in g_stackLimitsSize.  Hence we can detect if we
+     got out-raced by remembering g_stackLimitsSize before we dropped
+     the spinlock and checking if it has changed after the spinlock is
+     reacquired. */
+
+  MOZ_ASSERT(g_stackLimitsUsed <= g_stackLimitsSize);
+
+  if (g_stackLimitsUsed == g_stackLimitsSize) {
+    /* g_stackLimits[] is full; resize it. */
+
+    size_t old_size = g_stackLimitsSize;
+    size_t new_size = old_size == 0 ? 4 : (2 * old_size);
+
+    spinLock_release(&g_spinLock);
+    StackLimit* new_arr  = (StackLimit*)malloc(new_size * sizeof(StackLimit));
+    if (!new_arr)
+      return;
+
+    spinLock_acquire(&g_spinLock);
+
+    if (old_size != g_stackLimitsSize) {
+      /* We've been outraced.  Instead of trying to deal in-line with
+         this extremely rare case, just start all over again by
+         tail-calling this routine. */
+      spinLock_release(&g_spinLock);
+      free(new_arr);
+      thread_register_for_profiling(stackTop);
+      return;
+    }
+
+    memcpy(new_arr, g_stackLimits, old_size * sizeof(StackLimit));
+    if (g_stackLimits)
+      free(g_stackLimits);
+
+    g_stackLimits = new_arr;
+
+    MOZ_ASSERT(g_stackLimitsSize < new_size);
+    g_stackLimitsSize = new_size;
+  }
+
+  MOZ_ASSERT(g_stackLimitsUsed < g_stackLimitsSize);
+
+  /* Finally, we have a safe place to put the new entry. */
+
+  // Round |stackTop| up to the end of the containing page.  We may
+  // as well do this -- there's no danger of a fault, and we might
+  // get a few more base-of-the-stack frames as a result.  This
+  // assumes that no target has a page size smaller than 4096.
+  uintptr_t stackTopR = (uintptr_t)stackTop;
+  stackTopR = (stackTopR & ~(uintptr_t)4095) + (uintptr_t)4095;
+
   g_stackLimits[g_stackLimitsUsed].thrId    = me;
-  g_stackLimits[g_stackLimitsUsed].stackTop = stackTop;
+  g_stackLimits[g_stackLimitsUsed].stackTop = (void*)stackTopR;
   g_stackLimits[g_stackLimitsUsed].nSamples = 0;
   g_stackLimitsUsed++;
 
+  n_used = g_stackLimitsUsed;
   spinLock_release(&g_spinLock);
-  LOGF("BPUnw: thread_register_for_profiling(stacktop %p, me %p)", 
-       stackTop, (void*)me);
+
+  LOGF("BPUnw: [%d total] thread_register_for_profiling"
+       "(me=%p, stacktop=%p)", n_used, (void*)me, stackTop);
+}
+
+// Deregisters a thread from profiling.  Detects and ignores attempts
+// to deregister a not-registered thread.
+static void thread_unregister_for_profiling()
+{
+  spinLock_acquire(&g_spinLock);
+
+  // tmp copy of g_stackLimitsUsed, to avoid racing in message printing
+  size_t n_used;
+
+  size_t i;
+  bool found = false;
+  pthread_t me = pthread_self();
+  for (i = 0; i < g_stackLimitsUsed; i++) {
+    if (g_stackLimits[i].thrId == me)
+      break;
+  }
+  if (i < g_stackLimitsUsed) {
+    // found this entry.  Slide the remaining ones down one place.
+    for (; i+1 < g_stackLimitsUsed; i++) {
+      g_stackLimits[i] = g_stackLimits[i+1];
+    }
+    g_stackLimitsUsed--;
+    found = true;
+  }
+
+  n_used = g_stackLimitsUsed;
+
+  spinLock_release(&g_spinLock);
+  LOGF("BPUnw: [%d total] thread_unregister_for_profiling(me=%p) %s", 
+       (int)n_used, (void*)me, found ? "" : " (NOT REGISTERED) ");
 }
 
 
 __attribute__((unused))
 static void show_registered_threads()
 {
-  int i;
+  size_t i;
   spinLock_acquire(&g_spinLock);
   for (i = 0; i < g_stackLimitsUsed; i++) {
     LOGF("[%d]  pthread_t=%p  nSamples=%lld",
-         i, (void*)g_stackLimits[i].thrId, 
-            (unsigned long long int)g_stackLimits[i].nSamples);
+         (int)i, (void*)g_stackLimits[i].thrId, 
+                 (unsigned long long int)g_stackLimits[i].nSamples);
   }
   spinLock_release(&g_spinLock);
 }
 
 
 // RUNS IN SIGHANDLER CONTEXT
 static UnwinderThreadBuffer* acquire_empty_buffer()
 {
   /* acq lock
      if buffers == NULL { rel lock; exit }
      scan to find a free buff; if none { rel lock; exit }
      set buff state to S_FILLING
      fillseqno++; and remember it
      rel lock
   */
-  int i;
+  size_t i;
 
   atomic_INC( &g_stats_totalSamples );
 
   /* This code is critical.  We are in a signal handler and possibly
      with the malloc lock held.  So we can't allocate any heap, and
      can't safely call any C library functions, not even the pthread_
      functions.  And we certainly can't do any syscalls.  In short,
      this function needs to be self contained, not do any allocation,
@@ -544,21 +695,30 @@ static UnwinderThreadBuffer* acquire_emp
   spinLock_acquire(&g_spinLock);
 
   /* First of all, look for this thread's entry in g_stackLimits[].
      We need to find it in order to figure out how much stack we can
      safely copy into the sample.  This assumes that pthread_self()
      is safe to call in a signal handler, which strikes me as highly
      likely. */
   pthread_t me = pthread_self();
-  MOZ_ASSERT(g_stackLimitsUsed >= 0 && g_stackLimitsUsed <= N_SAMPLING_THREADS);
+  MOZ_ASSERT(g_stackLimitsUsed <= g_stackLimitsSize);
   for (i = 0; i < g_stackLimitsUsed; i++) {
     if (g_stackLimits[i].thrId == me)
       break;
   }
+
+  /* If the thread isn't registered for profiling, just ignore the call
+     and return NULL. */
+  if (i == g_stackLimitsUsed) {
+    spinLock_release(&g_spinLock);
+    atomic_INC( &g_stats_thrUnregd );
+    return NULL;
+  }
+
   /* "this thread is registered for profiling" */
   MOZ_ASSERT(i < g_stackLimitsUsed);
 
   /* The furthest point that we can safely scan back up the stack. */
   void* myStackTop = g_stackLimits[i].stackTop;
   g_stackLimits[i].nSamples++;
 
   /* Try to find a free buffer to use. */
@@ -569,17 +729,17 @@ static UnwinderThreadBuffer* acquire_emp
     atomic_INC( &g_stats_noBuffAvail );
     return NULL;
   }
 
   for (i = 0; i < N_UNW_THR_BUFFERS; i++) {
     if (g_buffers[i]->state == S_EMPTY)
       break;
   }
-  MOZ_ASSERT(i >= 0 && i <= N_UNW_THR_BUFFERS);
+  MOZ_ASSERT(i <= N_UNW_THR_BUFFERS);
 
   if (i == N_UNW_THR_BUFFERS) {
     /* Again, no free buffers .. give up. */
     spinLock_release(&g_spinLock);
     atomic_INC( &g_stats_noBuffAvail );
     if (LOGLEVEL >= 3)
       LOG("BPUnw: handler:  no free buffers");
     return NULL;
@@ -1779,19 +1939,21 @@ void do_breakpad_unwind_Buffer(/*OUT*/PC
   if (LOGLEVEL >= 3) {
     LOGF("BPUnw: unwinder: seqNo %llu, buf %d: got %u frames "
          "(%u trustworthy)", 
          (unsigned long long int)buff->seqNo, buffNo, n_frames, n_frames_good);
   }
 
   if (LOGLEVEL >= 2) {
     if (0 == (g_stats_totalSamples % 1000))
-      LOGF("BPUnw: %llu total samples, %llu failed due to buffer unavail",
+      LOGF("BPUnw: %llu total samples, %llu failed (buffer unavail), "
+                   "%llu failed (thread unreg'd), ",
            (unsigned long long int)g_stats_totalSamples,
-           (unsigned long long int)g_stats_noBuffAvail);
+           (unsigned long long int)g_stats_noBuffAvail,
+           (unsigned long long int)g_stats_thrUnregd);
   }
 
   delete stack;
   delete sw;
   delete memory;
   delete context;
 }
 
--- a/tools/profiler/UnwinderThread2.h
+++ b/tools/profiler/UnwinderThread2.h
@@ -32,28 +32,36 @@ void uwt__init();
 // reference to the profile which is owned by the profiler.
 void uwt__stop();
 
 // Release the unwinder resources. This must be called after profiling
 // has stop. At this point we know the profiler doesn't hold any buffer
 // and can safely release any resources.
 void uwt__deinit();
 
-// Registers a sampler thread for profiling.  Threads must be registered
-// before they are allowed to call utb__acquire_empty_buffer or
-// utb__release_full_buffer.
+// Registers a sampler thread for profiling.  Threads must be
+// registered before calls to utb__acquire_empty_buffer or
+// utb__release_full_buffer have any effect.  If stackTop is
+// NULL, the call is ignored.
 void uwt__register_thread_for_profiling(void* stackTop);
 
-// RUNS IN SIGHANDLER CONTEXT
+// Deregister a sampler thread for profiling.
+void uwt__unregister_thread_for_profiling();
+
+// RUNS IN SIGHANDLER CONTEXT
 // Called in the sampled thread (signal) context.  Get an empty buffer
 // into which ProfileEntries can be put.  It may return NULL if no
 // empty buffers can be found, which will be the case if the unwinder
 // thread(s) have fallen behind for some reason.  In this case the
-// sampled thread must simply give up and return from the signal handler
-// immediately, else it risks deadlock.
+// sampled thread must simply give up and return from the signal
+// handler immediately, else it risks deadlock.
+//
+// If the calling thread has not previously registered itself for
+// profiling via uwt__register_thread_for_profiling, this routine
+// returns NULL.
 UnwinderThreadBuffer* uwt__acquire_empty_buffer();
 
 // RUNS IN SIGHANDLER CONTEXT
 // Called in the sampled thread (signal) context.  Release a buffer
 // that the sampled thread has acquired, handing the contents to
 // the unwinder thread, and, if necessary, passing sufficient
 // information (stack top chunk, + registers) to also do a native
 // unwind.  If 'ucV' is NULL, no native unwind is done.  If non-NULL,
--- a/tools/profiler/platform-linux.cc
+++ b/tools/profiler/platform-linux.cc
@@ -60,16 +60,17 @@
 #include <errno.h>
 #include <stdarg.h>
 #include "platform.h"
 #include "GeckoProfilerImpl.h"
 #include "mozilla/Mutex.h"
 #include "ProfileEntry.h"
 #include "nsThreadUtils.h"
 #include "TableTicker.h"
+#include "UnwinderThread2.h"
 
 #include <string.h>
 #include <stdio.h>
 #include <list>
 
 #define SIGNAL_SAVE_PROFILE SIGUSR2
 
 #if defined(__GLIBC__)
@@ -353,17 +354,19 @@ void Sampler::Stop() {
   // Restore old signal handler
   if (signal_handler_installed_) {
     sigaction(SIGNAL_SAVE_PROFILE, &old_sigsave_signal_handler_, 0);
     sigaction(SIGPROF, &old_sigprof_signal_handler_, 0);
     signal_handler_installed_ = false;
   }
 }
 
-bool Sampler::RegisterCurrentThread(const char* aName, PseudoStack* aPseudoStack, bool aIsMainThread)
+bool Sampler::RegisterCurrentThread(const char* aName,
+                                    PseudoStack* aPseudoStack,
+                                    bool aIsMainThread, void* stackTop)
 {
   if (!Sampler::sRegisteredThreadsMutex)
     return false;
 
   mozilla::MutexAutoLock lock(*Sampler::sRegisteredThreadsMutex);
 
   ThreadInfo* info = new ThreadInfo(aName, gettid(),
     aIsMainThread, aPseudoStack);
@@ -380,16 +383,18 @@ bool Sampler::RegisterCurrentThread(cons
                                        info->GetPlatformData(),
                                        aIsMainThread));
     if (sActiveSampler->ProfileJS()) {
       info->Profile()->GetPseudoStack()->enableJSSampling();
     }
   }
 
   sRegisteredThreads->push_back(info);
+
+  uwt__register_thread_for_profiling(stackTop);
   return true;
 }
 
 void Sampler::UnregisterCurrentThread()
 {
   if (!Sampler::sRegisteredThreadsMutex)
     return;
 
@@ -400,16 +405,18 @@ void Sampler::UnregisterCurrentThread()
   for (uint32_t i = 0; i < sRegisteredThreads->size(); i++) {
     ThreadInfo* info = sRegisteredThreads->at(i);
     if (info->ThreadId() == id) {
       delete info;
       sRegisteredThreads->erase(sRegisteredThreads->begin() + i);
       break;
     }
   }
+
+  uwt__unregister_thread_for_profiling();
 }
 
 #ifdef ANDROID
 static struct sigaction old_sigstart_signal_handler;
 const int SIGSTART = SIGUSR1;
 
 static void StartSignalHandler(int signal, siginfo_t* info, void* context) {
   profiler_start(PROFILE_DEFAULT_ENTRY, PROFILE_DEFAULT_INTERVAL,
--- a/tools/profiler/platform-macos.cc
+++ b/tools/profiler/platform-macos.cc
@@ -336,17 +336,19 @@ Sampler::GetProfiledThread(PlatformData*
 }
 
 #include <sys/syscall.h>
 pid_t gettid()
 {
   return (pid_t) syscall(SYS_thread_selfid);
 }
 
-bool Sampler::RegisterCurrentThread(const char* aName, PseudoStack* aPseudoStack, bool aIsMainThread)
+bool Sampler::RegisterCurrentThread(const char* aName,
+                                    PseudoStack* aPseudoStack,
+                                    bool aIsMainThread, void* stackTop)
 {
   if (!Sampler::sRegisteredThreadsMutex)
     return false;
 
   mozilla::MutexAutoLock lock(*Sampler::sRegisteredThreadsMutex);
 
   ThreadInfo* info = new ThreadInfo(aName, gettid(),
     aIsMainThread, aPseudoStack);
@@ -363,16 +365,18 @@ bool Sampler::RegisterCurrentThread(cons
                                        info->GetPlatformData(),
                                        aIsMainThread));
     if (sActiveSampler->ProfileJS()) {
       info->Profile()->GetPseudoStack()->enableJSSampling();
     }
   }
 
   sRegisteredThreads->push_back(info);
+
+  uwt__register_thread_for_profiling(stackTop);
   return true;
 }
 
 void Sampler::UnregisterCurrentThread()
 {
   if (!Sampler::sRegisteredThreadsMutex)
     return;
 
--- a/tools/profiler/platform-win32.cc
+++ b/tools/profiler/platform-win32.cc
@@ -27,16 +27,17 @@
 // SUCH DAMAGE.
 
 #include <windows.h>
 #include <mmsystem.h>
 #include <process.h>
 #include "platform.h"
 #include "TableTicker.h"
 #include "ProfileEntry.h"
+#include "UnwinderThread2.h"
 
 class PlatformData : public Malloced {
  public:
   // Get a handle to the calling thread. This is the thread that we are
   // going to profile. We need to make a copy of the handle because we are
   // going to use it in the sampler thread. Using GetThreadHandle() will
   // not work in this case. We're using OpenThread because DuplicateHandle
   // for some reason doesn't work in Chrome's sandbox.
@@ -256,17 +257,19 @@ void Thread::Join() {
     WaitForSingleObject(thread_, INFINITE);
   }
 }
 
 void OS::Sleep(int milliseconds) {
   ::Sleep(milliseconds);
 }
 
-bool Sampler::RegisterCurrentThread(const char* aName, PseudoStack* aPseudoStack, bool aIsMainThread)
+bool Sampler::RegisterCurrentThread(const char* aName,
+                                    PseudoStack* aPseudoStack,
+                                    bool aIsMainThread, void* stackTop)
 {
   if (!Sampler::sRegisteredThreadsMutex)
     return false;
 
   mozilla::MutexAutoLock lock(*Sampler::sRegisteredThreadsMutex);
 
   ThreadInfo* info = new ThreadInfo(aName, GetCurrentThreadId(),
     aIsMainThread, aPseudoStack);
@@ -283,16 +286,18 @@ bool Sampler::RegisterCurrentThread(cons
                                        info->GetPlatformData(),
                                        aIsMainThread));
     if (sActiveSampler->ProfileJS()) {
       info->Profile()->GetPseudoStack()->enableJSSampling();
     }
   }
 
   sRegisteredThreads->push_back(info);
+
+  uwt__register_thread_for_profiling(stackTop);
   return true;
 }
 
 void Sampler::UnregisterCurrentThread()
 {
   if (!Sampler::sRegisteredThreadsMutex)
     return;
 
--- a/tools/profiler/platform.cpp
+++ b/tools/profiler/platform.cpp
@@ -247,17 +247,17 @@ void read_profiler_env_vars()
   LOG( "SPS:");
 
   return;
 }
 
 ////////////////////////////////////////////////////////////////////////
 // BEGIN externally visible functions
 
-void mozilla_sampler_init()
+void mozilla_sampler_init(void* stackTop)
 {
   sInitCount++;
 
   if (stack_key_initialized)
     return;
 
   LOG("BEGIN mozilla_sampler_init");
   if (!tlsPseudoStack.init() || !tlsTicker.init()) {
@@ -266,17 +266,17 @@ void mozilla_sampler_init()
   }
   stack_key_initialized = true;
 
   Sampler::Startup();
 
   PseudoStack *stack = new PseudoStack();
   tlsPseudoStack.set(stack);
 
-  Sampler::RegisterCurrentThread("Gecko", stack, true);
+  Sampler::RegisterCurrentThread("Gecko", stack, true, stackTop);
 
   // Read mode settings from MOZ_PROFILER_MODE and interval
   // settings from MOZ_PROFILER_INTERVAL and stack-scan threshhold
   // from MOZ_PROFILER_STACK_SCAN.
   read_profiler_env_vars();
 
   // Allow the profiler to be started using signals
   OS::RegisterStartHandler();
@@ -397,17 +397,17 @@ const char** mozilla_sampler_get_feature
   return features;
 }
 
 // Values are only honored on the first start
 void mozilla_sampler_start(int aProfileEntries, int aInterval,
                            const char** aFeatures, uint32_t aFeatureCount)
 {
   if (!stack_key_initialized)
-    profiler_init();
+    profiler_init(NULL);
 
   /* If the sampling interval was set using env vars, use that
      in preference to anything else. */
   if (sUnwindInterval > 0)
     aInterval = sUnwindInterval;
 
   PseudoStack *stack = tlsPseudoStack.get();
   if (!stack) {
@@ -418,19 +418,16 @@ void mozilla_sampler_start(int aProfileE
   // Reset the current state if the profiler is running
   profiler_stop();
 
   TableTicker* t;
   t = new TableTicker(aInterval ? aInterval : PROFILE_DEFAULT_INTERVAL,
                       aProfileEntries ? aProfileEntries : PROFILE_DEFAULT_ENTRY,
                       aFeatures, aFeatureCount);
   if (t->HasUnwinderThread()) {
-    int aLocal;
-    uwt__register_thread_for_profiling( &aLocal );
-
     // Create the unwinder thread.  ATM there is only one.
     uwt__init();
   }
 
   tlsTicker.set(t);
   t->Start();
   if (t->ProfileJS()) {
       mozilla::MutexAutoLock lock(*Sampler::sRegisteredThreadsMutex);
@@ -462,17 +459,17 @@ void mozilla_sampler_start(int aProfileE
   nsCOMPtr<nsIObserverService> os = mozilla::services::GetObserverService();
   if (os)
     os->NotifyObservers(nullptr, "profiler-started", nullptr);
 }
 
 void mozilla_sampler_stop()
 {
   if (!stack_key_initialized)
-    profiler_init();
+    profiler_init(NULL);
 
   TableTicker *t = tlsTicker.get();
   if (!t) {
     return;
   }
 
   bool disableJS = t->ProfileJS();
   bool unwinderThreader = t->HasUnwinderThread();
@@ -554,22 +551,22 @@ void mozilla_sampler_lock()
 
 void mozilla_sampler_unlock()
 {
   nsCOMPtr<nsIObserverService> os = mozilla::services::GetObserverService();
   if (os)
     os->NotifyObservers(nullptr, "profiler-unlocked", nullptr);
 }
 
-bool mozilla_sampler_register_thread(const char* aName)
+bool mozilla_sampler_register_thread(const char* aName, void* stackTop)
 {
   PseudoStack* stack = new PseudoStack();
   tlsPseudoStack.set(stack);
 
-  return Sampler::RegisterCurrentThread(aName, stack, false);
+  return Sampler::RegisterCurrentThread(aName, stack, false, stackTop);
 }
 
 void mozilla_sampler_unregister_thread()
 {
   Sampler::UnregisterCurrentThread();
 }
 
 double mozilla_sampler_time()
--- a/tools/profiler/platform.h
+++ b/tools/profiler/platform.h
@@ -330,17 +330,19 @@ class Sampler {
 #ifdef XP_MACOSX
   static pthread_t GetProfiledThread(PlatformData*);
 #endif
 
   static std::vector<ThreadInfo*> GetRegisteredThreads() {
     return *sRegisteredThreads;
   }
 
-  static bool RegisterCurrentThread(const char* aName, PseudoStack* aPseudoStack, bool aIsMainThread);
+  static bool RegisterCurrentThread(const char* aName,
+                                    PseudoStack* aPseudoStack,
+                                    bool aIsMainThread, void* stackTop);
   static void UnregisterCurrentThread();
 
   static void Startup();
   // Should only be called on shutdown
   static void Shutdown();
 
   static TableTicker* GetActiveSampler() { return sActiveSampler; }
   static void SetActiveSampler(TableTicker* sampler) { sActiveSampler = sampler; }
--- a/xpcom/build/nsXPComInit.cpp
+++ b/xpcom/build/nsXPComInit.cpp
@@ -327,17 +327,18 @@ NS_InitXPCOM(nsIServiceManager* *result,
 
 EXPORT_XPCOM_API(nsresult)
 NS_InitXPCOM2(nsIServiceManager* *result,
               nsIFile* binDirectory,
               nsIDirectoryServiceProvider* appFileLocationProvider)
 {
     mozPoisonValueInit();
 
-    profiler_init();
+    char aLocal;
+    profiler_init(&aLocal);
     nsresult rv = NS_OK;
 
      // We are not shutting down
     gXPCOMShuttingDown = false;
 
     // Initialize the available memory tracker before other threads have had a
     // chance to start up, because the initialization is not thread-safe.
     mozilla::AvailableMemoryTracker::Init();
--- a/xpcom/threads/LazyIdleThread.cpp
+++ b/xpcom/threads/LazyIdleThread.cpp
@@ -164,17 +164,18 @@ LazyIdleThread::EnsureThread()
   NS_ENSURE_SUCCESS(rv, rv);
 
   return NS_OK;
 }
 
 void
 LazyIdleThread::InitThread()
 {
-  profiler_register_thread(mName.get());
+  char aLocal;
+  profiler_register_thread(mName.get(), &aLocal);
 
   PR_SetCurrentThreadName(mName.get());
 
   // Happens on mThread but mThread may not be set yet...
 
   nsCOMPtr<nsIThreadInternal> thread(do_QueryInterface(NS_GetCurrentThread()));
   MOZ_ASSERT(thread, "This should always succeed!");