Bug 478195 - '[Mac] Same-thread deadlock with trace-malloc (causing random red on OS X 10.5.2 mozilla-central leak test build)'. r=dbaron.
author Robert O'Callahan <rocallahan@mozilla.com>
Tue, 17 Feb 2009 21:11:29 -0800
changeset 25120 90d9e594332487dc5151653e88ba20856d414a2f
parent 25119 503b36e90c422f1adbd5f89c4517d5208d7e8f97
child 25121 65a461a480b7749e6a4c04d2300e5e38caf17869
push id unknown
push user unknown
push date unknown
reviewers dbaron
bugs 478195
milestone 1.9.2a1pre
tools/trace-malloc/lib/nsTraceMalloc.c
xpcom/base/nsStackWalk.cpp
xpcom/base/nsStackWalk.h
--- a/tools/trace-malloc/lib/nsTraceMalloc.c
+++ b/tools/trace-malloc/lib/nsTraceMalloc.c
@@ -908,39 +908,49 @@ stack_callback(void *pc, void *closure)
      */
     if (info->entries < info->size)
         info->buffer[info->entries] = pc;
     ++info->entries;
 }
 
 /*
  * The caller MUST NOT be holding tmlock when calling backtrace.
+ * On return, if *immediate_abort is set, then the return value is NULL
+ * and the thread is in a very dangerous situation (e.g. holding
+ * sem_pool_lock in Mac OS X pthreads); the caller should bail out
+ * without doing anything (such as acquiring locks).
  */
 callsite *
-backtrace(tm_thread *t, int skip)
+backtrace(tm_thread *t, int skip, int *immediate_abort)
 {
     callsite *site;
     stack_buffer_info *info = &t->backtrace_buf;
     void ** new_stack_buffer;
     size_t new_stack_buffer_size;
+    nsresult rv;
 
     t->suppress_tracing++;
 
     /*
      * NS_StackWalk can (on Windows) acquire a lock in the shared library
      * loader.  Another thread might call malloc while holding that lock
      * (when loading a shared library).  So we can't be in tmlock during
      * this call.  For details, see
      * https://bugzilla.mozilla.org/show_bug.cgi?id=374829#c8
      */
 
     /* skip == 0 means |backtrace| should show up, so don't use skip + 1 */
     /* NB: this call is repeated below if the buffer is too small */
     info->entries = 0;
-    NS_StackWalk(stack_callback, skip, info);
+    rv = NS_StackWalk(stack_callback, skip, info);
+    *immediate_abort = rv == NS_ERROR_UNEXPECTED;
+    if (rv == NS_ERROR_UNEXPECTED || info->entries == 0) {
+        t->suppress_tracing--;
+        return NULL;
+    }
 
     /*
      * To avoid allocating in stack_callback (which, on Windows, is
      * called on a different thread from the one we're running on here),
      * reallocate here if it didn't have a big enough buffer (which
      * includes the first call on any thread), and call it again.
      */
     if (info->entries > info->size) {
@@ -954,21 +964,16 @@ backtrace(tm_thread *t, int skip)
 
         /* and call NS_StackWalk again */
         info->entries = 0;
         NS_StackWalk(stack_callback, skip, info);
 
         PR_ASSERT(info->entries * 2 == new_stack_buffer_size); /* same stack */
     }
 
-    if (info->entries == 0) {
-        t->suppress_tracing--;
-        return NULL;
-    }
-
     site = calltree(info->buffer, info->entries, t);
 
     TM_ENTER_LOCK(t);
     tmstats.backtrace_calls++;
     if (!site) {
         tmstats.backtrace_failures++;
         PR_ASSERT(tmstats.backtrace_failures < 100);
     }
@@ -1702,18 +1707,19 @@ allocation_enumerator(PLHashEntry *he, P
     return HT_ENUMERATE_NEXT;
 }
 
 PR_IMPLEMENT(void)
 NS_TraceStack(int skip, FILE *ofp)
 {
     callsite *site;
     tm_thread *t = tm_get_thread();
+    int immediate_abort;
 
-    site = backtrace(t, skip + 1);
+    site = backtrace(t, skip + 1, &immediate_abort);
     while (site) {
         if (site->name || site->parent) {
             fprintf(ofp, "%s[%s +0x%X]\n",
                     site->name, site->library, site->offset);
         }
         site = site->parent;
     }
 }
@@ -1783,21 +1789,24 @@ NS_TrackAllocation(void* ptr, FILE *ofp)
 }
 
 PR_IMPLEMENT(void)
 MallocCallback(void *ptr, size_t size, PRUint32 start, PRUint32 end, tm_thread *t)
 {
     callsite *site;
     PLHashEntry *he;
     allocation *alloc;
+    int immediate_abort;
 
     if (!tracing_enabled || t->suppress_tracing != 0)
         return;
 
-    site = backtrace(t, 2);
+    site = backtrace(t, 2, &immediate_abort);
+    if (immediate_abort)
+        return;
 
     TM_SUPPRESS_TRACING_AND_ENTER_LOCK(t);
     tmstats.malloc_calls++;
     if (!ptr) {
         tmstats.malloc_failures++;
     } else {
         if (site) {
             log_event5(logfp, TM_EVENT_MALLOC,
@@ -1817,21 +1826,24 @@ MallocCallback(void *ptr, size_t size, P
 }
 
 PR_IMPLEMENT(void)
 CallocCallback(void *ptr, size_t count, size_t size, PRUint32 start, PRUint32 end, tm_thread *t)
 {
     callsite *site;
     PLHashEntry *he;
     allocation *alloc;
+    int immediate_abort;
 
     if (!tracing_enabled || t->suppress_tracing != 0)
         return;
 
-    site = backtrace(t, 2);
+    site = backtrace(t, 2, &immediate_abort);
+    if (immediate_abort)
+        return;
 
     TM_SUPPRESS_TRACING_AND_ENTER_LOCK(t);
     tmstats.calloc_calls++;
     if (!ptr) {
         tmstats.calloc_failures++;
     } else {
         size *= count;
         if (site) {
@@ -1856,21 +1868,24 @@ ReallocCallback(void * oldptr, void *ptr
                 PRUint32 start, PRUint32 end, tm_thread *t)
 {
     callsite *oldsite, *site;
     size_t oldsize;
     PLHashNumber hash;
     PLHashEntry **hep, *he;
     allocation *alloc;
     FILE *trackfp = NULL;
+    int immediate_abort;
 
     if (!tracing_enabled || t->suppress_tracing != 0)
         return;
 
-    site = backtrace(t, 2);
+    site = backtrace(t, 2, &immediate_abort);
+    if (immediate_abort)
+        return;
 
     TM_SUPPRESS_TRACING_AND_ENTER_LOCK(t);
     tmstats.realloc_calls++;
     oldsite = NULL;
     oldsize = 0;
     hep = NULL;
     he = NULL;
     if (oldptr && get_allocations()) {
@@ -1939,16 +1954,20 @@ FreeCallback(void * ptr, PRUint32 start,
 {
     PLHashEntry **hep, *he;
     callsite *site;
     allocation *alloc;
 
     if (!tracing_enabled || t->suppress_tracing != 0)
         return;
 
+    // XXX Perhaps we should call backtrace() so we can check for
+    // immediate_abort. However, the only current contexts where
+    // immediate_abort will be true do not call free(), so for now,
+    // let's avoid the cost of backtrace().
     TM_SUPPRESS_TRACING_AND_ENTER_LOCK(t);
     tmstats.free_calls++;
     if (!ptr) {
         tmstats.null_free_calls++;
     } else {
         if (get_allocations()) {
             hep = PL_HashTableRawLookup(allocations, hash_pointer(ptr), ptr);
             he = *hep;
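
The deadlock this guards against has a simple same-thread shape: code that already holds a non-reentrant lock (sem_pool_lock inside Mac OS X pthreads' new_sem_from_pool) calls realloc, the trace-malloc hook runs, and anything in that hook that needs the same lock hangs the thread against itself. A toy illustration of that shape, not part of the patch, with a plain pthread mutex standing in for sem_pool_lock:

/* deadlock_shape.c - illustrative only; names do not come from the patch.
 * Relocking a default (non-recursive) pthread mutex from the same thread
 * typically hangs, mirroring the sem_pool_lock self-deadlock.
 * Build: cc -pthread deadlock_shape.c -o deadlock_shape */
#include <pthread.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER; /* stand-in for sem_pool_lock */

/* Stand-in for trace-malloc work that ends up needing the pool again. */
static void allocation_hook(void)
{
    pthread_mutex_lock(&pool_lock);   /* hangs: this thread already holds it */
    pthread_mutex_unlock(&pool_lock);
}

int main(void)
{
    pthread_mutex_lock(&pool_lock);   /* like new_sem_from_pool taking its lock... */
    allocation_hook();                /* ...then calling realloc, which is hooked */
    pthread_mutex_unlock(&pool_lock);
    return 0;
}
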
--- a/xpcom/base/nsStackWalk.cpp
+++ b/xpcom/base/nsStackWalk.cpp
@@ -1401,21 +1401,68 @@ NS_FormatCodeAddressDetails(void *aPC, c
 #else
 #define HAVE___LIBC_STACK_END 0
 #endif
 
 #if HAVE___LIBC_STACK_END
 extern void *__libc_stack_end; // from ld-linux.so
 #endif
 
+#ifdef XP_MACOSX
+struct AddressRange {
+  void* mStart;
+  void* mEnd;
+};
+// Addresses in this range must stop the stack walk
+static AddressRange gCriticalRange;
+
+static void FindFunctionAddresses(const char* aName, AddressRange* aRange)
+{
+  aRange->mStart = dlsym(RTLD_DEFAULT, aName);
+  if (!aRange->mStart)
+    return;
+  aRange->mEnd = aRange->mStart;
+  while (PR_TRUE) {
+    Dl_info info;
+    if (!dladdr(aRange->mEnd, &info))
+      break;
+    if (strcmp(info.dli_sname, aName))
+      break;
+    aRange->mEnd = (char*)aRange->mEnd + 1;
+  }
+}
+
+static void InitCriticalRanges()
+{
+  if (gCriticalRange.mStart)
+    return;
+  // We must not do work when 'new_sem_from_pool' calls realloc, since
+  // it holds a non-reentrant spin-lock and we will quickly deadlock.
+  // new_sem_from_pool is not directly accessible using dladdr but its
+  // code is bundled with pthread_cond_wait$UNIX2003 (on
+  // Leopard anyway).
+  FindFunctionAddresses("pthread_cond_wait$UNIX2003", &gCriticalRange);
+}
+
+static PRBool InCriticalRange(void* aPC)
+{
+  return gCriticalRange.mStart &&
+    gCriticalRange.mStart <= aPC && aPC < gCriticalRange.mEnd;
+}
+#else
+static void InitCriticalRanges() {}
+static PRBool InCriticalRange(void* aPC) { return PR_FALSE; }
+#endif
+
 EXPORT_XPCOM_API(nsresult)
 NS_StackWalk(NS_WalkStackCallback aCallback, PRUint32 aSkipFrames,
              void *aClosure)
 {
   // Stack walking code courtesy Kipp's "leaky".
+  InitCriticalRanges();
 
   // Get the frame pointer
   void **bp;
 #if defined(__i386) 
   __asm__( "movl %%ebp, %0" : "=g"(bp));
 #else
   // It would be nice if this worked uniformly, but at least on i386 and
   // x86_64, it stopped working with gcc 4.1, because it points to the
@@ -1438,16 +1485,20 @@ NS_StackWalk(NS_WalkStackCallback aCallb
       break;
     }
 #if (defined(__ppc__) && defined(XP_MACOSX)) || defined(__powerpc64__)
     // ppc mac or powerpc64 linux
     void *pc = *(bp+2);
 #else // i386 or powerpc32 linux
     void *pc = *(bp+1);
 #endif
+    if (InCriticalRange(pc)) {
+      printf("Aborting stack trace, PC in critical range\n");
+      return NS_ERROR_UNEXPECTED;
+    }
     if (--skip < 0) {
       (*aCallback)(pc, aClosure);
     }
     bp = next;
   }
   return NS_OK;
 }
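
FindFunctionAddresses above locates a symbol with dlsym and then walks forward one byte at a time, asking dladdr which symbol each address belongs to, until the name changes; InCriticalRange then simply range-checks the PC. A standalone sketch of that probe (hypothetical file name and probe target; assumes a platform where dlsym(RTLD_DEFAULT, ...) and dladdr behave as on Mac OS X):

/* symbol_extent.c - standalone sketch of the dladdr-based extent scan;
 * not part of the patch. Build on Mac OS X: cc symbol_extent.c
 * (on Linux add -ldl and define _GNU_SOURCE for RTLD_DEFAULT). */
#include <dlfcn.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *name = "pthread_cond_wait";            /* assumed probe target */
    char *start = (char *) dlsym(RTLD_DEFAULT, name);
    char *end;

    if (!start) {
        fprintf(stderr, "symbol %s not found\n", name);
        return 1;
    }
    /* Advance until dladdr attributes the address to some other symbol
     * (or fails); that marks the end of the function's code range. */
    end = start;
    for (;;) {
        Dl_info info;
        if (!dladdr(end, &info) || !info.dli_sname ||
            strcmp(info.dli_sname, name) != 0)
            break;
        ++end;
    }
    printf("%s spans %p..%p (%ld bytes)\n",
           name, (void *) start, (void *) end, (long) (end - start));
    return 0;
}
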
 
--- a/xpcom/base/nsStackWalk.h
+++ b/xpcom/base/nsStackWalk.h
@@ -56,16 +56,19 @@ typedef void
  * @param aCallback    Callback function, called once per frame.
  * @param aSkipFrames  Number of initial frames to skip.  0 means that
  *                     the first callback will be for the caller of
  *                     NS_StackWalk.
  * @param aClosure     Caller-supplied data passed through to aCallback.
  *
  * Returns NS_ERROR_NOT_IMPLEMENTED on platforms where it is
  * unimplemented.
+ * Returns NS_ERROR_UNEXPECTED when the stack indicates that the thread
+ * is in a very dangerous situation (e.g., holding sem_pool_lock in 
+ * Mac OS X pthreads code). Callers should then bail out immediately.
  *
  * May skip some stack frames due to compiler optimizations or code
  * generation.
  */
 XPCOM_API(nsresult)
 NS_StackWalk(NS_WalkStackCallback aCallback, PRUint32 aSkipFrames,
              void *aClosure);
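
A minimal sketch of a caller honoring this contract; the callback and wrapper names below are hypothetical and not part of the patch:

#include "nsStackWalk.h"

/* Hypothetical callback: must not allocate or take locks, since it may run
 * while the interrupted code holds something like sem_pool_lock. */
static void
RecordFrame(void *aPC, void *aClosure)
{
    /* stash aPC in a preallocated, lock-free buffer (not shown) */
}

static void
CaptureStackIfSafe()
{
    nsresult rv = NS_StackWalk(RecordFrame, /* aSkipFrames = */ 0, nsnull);
    if (rv == NS_ERROR_UNEXPECTED)
        return;   /* dangerous stack: bail out without doing anything else */
    /* safe to process the recorded frames here */
}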