Bug 1594577 - Record hangs which precede forced shutdowns r=froydnj
☠☠ backed out by d7e41714afd1 ☠ ☠
authorDoug Thayer <dothayer@mozilla.com>
Thu, 14 Nov 2019 21:35:42 +0000
changeset 502283 bd42216f7b6309c683bcc8d9d63c26a834d08d04
parent 502282 ae8d3569d4b4f2a2877c640bb61d62b14113d43f
child 502284 284910a66370a61a378e10f667256b5c5b59607f
push id114172
push userdluca@mozilla.com
push dateTue, 19 Nov 2019 11:31:10 +0000
treeherdermozilla-inbound@b5c5ba07d3db [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersfroydnj
bugs1594577
milestone72.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1594577 - Record hangs which precede forced shutdowns r=froydnj In short - if a user forcibly terminates the browser because it seems to be permanently hung, we currently do not get a chance to record the hang. This is unfortunate, because these likely represent the most egregious hangs in terms of user frustration. This patch seeks to address that. If a hang exceeds 8192ms (the current definition of a "permahang" in existing BHR terms), then we decide to immediately persist it to disk, in case we never get a chance to return to the main thread and submit it. On the next start of the browser, we read the file from disk on a background thread, and just submit it using the normal mechanism. Regarding the handling of the file itself, I tried to do the simplest thing I could - as far as I can tell there is no standard simple serialization mechanism available directly to C++ in Gecko, so I just serialized it by hand. I didn't take any special care with endianness or anything as I can't think of a situation in which we really care at all about these files being transferable between architectures. I directly used PR_Write / PR_Read instead of doing something fancy like memory mapping the file, because I don't think performance is a critical concern here and it offers a simple protection against reading out of bounds. Differential Revision: https://phabricator.services.mozilla.com/D52566
dom/ipc/ContentParent.cpp
gfx/ipc/GPUChild.cpp
toolkit/components/backgroundhangmonitor/BHRTelemetryService.jsm
toolkit/components/backgroundhangmonitor/BackgroundHangMonitor.cpp
toolkit/components/backgroundhangmonitor/HangDetails.cpp
toolkit/components/backgroundhangmonitor/HangDetails.h
toolkit/components/backgroundhangmonitor/nsIHangDetails.idl
toolkit/components/telemetry/docs/data/backgroundhangmonitor-ping.rst
--- a/dom/ipc/ContentParent.cpp
+++ b/dom/ipc/ContentParent.cpp
@@ -5787,17 +5787,17 @@ mozilla::ipc::IPCResult ContentParent::R
     const HangDetails& aDetails) {
   nsCOMPtr<nsIObserverService> obs = mozilla::services::GetObserverService();
   if (obs) {
     // Copy the HangDetails recieved over the network into a nsIHangDetails, and
     // then fire our own observer notification.
     // XXX: We should be able to avoid this potentially expensive copy here by
     // moving our deserialized argument.
     nsCOMPtr<nsIHangDetails> hangDetails =
-        new nsHangDetails(HangDetails(aDetails));
+        new nsHangDetails(HangDetails(aDetails), PersistedToDisk::No);
     obs->NotifyObservers(hangDetails, "bhr-thread-hang", nullptr);
   }
   return IPC_OK();
 }
 
 mozilla::ipc::IPCResult ContentParent::RecvAutomaticStorageAccessCanBeGranted(
     const Principal& aPrincipal,
     AutomaticStorageAccessCanBeGrantedResolver&& aResolver) {
--- a/gfx/ipc/GPUChild.cpp
+++ b/gfx/ipc/GPUChild.cpp
@@ -264,17 +264,17 @@ mozilla::ipc::IPCResult GPUChild::RecvBH
     const HangDetails& aDetails) {
   nsCOMPtr<nsIObserverService> obs = mozilla::services::GetObserverService();
   if (obs) {
     // Copy the HangDetails recieved over the network into a nsIHangDetails, and
     // then fire our own observer notification.
     // XXX: We should be able to avoid this potentially expensive copy here by
     // moving our deserialized argument.
     nsCOMPtr<nsIHangDetails> hangDetails =
-        new nsHangDetails(HangDetails(aDetails));
+        new nsHangDetails(HangDetails(aDetails), PersistedToDisk::No);
     obs->NotifyObservers(hangDetails, "bhr-thread-hang", nullptr);
   }
   return IPC_OK();
 }
 
 class DeferredDeleteGPUChild : public Runnable {
  public:
   explicit DeferredDeleteGPUChild(UniquePtr<GPUChild>&& aChild)
--- a/toolkit/components/backgroundhangmonitor/BHRTelemetryService.jsm
+++ b/toolkit/components/backgroundhangmonitor/BHRTelemetryService.jsm
@@ -3,16 +3,21 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 "use strict";
 
 ChromeUtils.import("resource://gre/modules/Services.jsm", this);
 
 ChromeUtils.defineModuleGetter(
   this,
+  "OS",
+  "resource://gre/modules/osfile.jsm"
+);
+ChromeUtils.defineModuleGetter(
+  this,
   "TelemetryController",
   "resource://gre/modules/TelemetryController.jsm"
 );
 
 function BHRTelemetryService() {
   // Allow tests to get access to this object to verify it works correctly.
   this.wrappedJSObject = this;
 
@@ -30,27 +35,29 @@ BHRTelemetryService.prototype = Object.f
   TRANSMIT_HANG_COUNT: 50,
 
   resetPayload() {
     this.startTime = +new Date();
     this.payload = {
       modules: [],
       hangs: [],
     };
+    this.clearPermahangFile = false;
   },
 
   recordHang({
     duration,
     thread,
     runnableName,
     process,
     stack,
     remoteType,
     modules,
     annotations,
+    wasPersisted,
   }) {
     if (!Services.telemetry.canRecordExtended) {
       return;
     }
 
     // Create a mapping from module indicies in the original nsIHangDetails
     // object to this.payload.modules indicies.
     let moduleIdxs = modules.map(module => {
@@ -94,24 +101,35 @@ BHRTelemetryService.prototype = Object.f
       thread,
       runnableName,
       process,
       remoteType,
       annotations,
       stack,
     });
 
+    if (wasPersisted) {
+      this.clearPermahangFile = true;
+    }
+
     // If we have collected enough hangs, we can submit the hangs we have
     // collected to telemetry.
     if (this.payload.hangs.length > this.TRANSMIT_HANG_COUNT) {
       this.submit();
     }
   },
 
   submit() {
+    if (this.clearPermahangFile) {
+      OS.File.remove(
+        OS.Path.join(OS.Constants.Path.profileDir, "last_permahang.bin"),
+        { ignoreAbsent: true }
+      );
+    }
+
     if (!Services.telemetry.canRecordExtended) {
       return;
     }
 
     // NOTE: We check a separate bhrPing.enabled pref here. This pref is unset
     // when running tests so that we run as much of BHR as possible (to catch
     // errors) while avoiding timeouts caused by invoking `pingsender` during
     // testing.
--- a/toolkit/components/backgroundhangmonitor/BackgroundHangMonitor.cpp
+++ b/toolkit/components/backgroundhangmonitor/BackgroundHangMonitor.cpp
@@ -15,16 +15,17 @@
 #include "mozilla/Telemetry.h"
 #include "mozilla/ThreadLocal.h"
 #include "mozilla/SystemGroup.h"
 #include "mozilla/Unused.h"
 
 #include "prinrval.h"
 #include "prthread.h"
 #include "ThreadStackHelper.h"
+#include "nsAppDirectoryServiceDefs.h"
 #include "nsIObserverService.h"
 #include "nsIObserver.h"
 #include "mozilla/Services.h"
 #include "nsThreadUtils.h"
 #include "nsXULAppAPI.h"
 #include "GeckoProfiler.h"
 #include "HangDetails.h"
 
@@ -99,16 +100,20 @@ class BackgroundHangManager : public nsI
   // Current time as seen by hang monitors
   TimeStamp mNow;
   // List of BackgroundHangThread instances associated with each thread
   LinkedList<BackgroundHangThread> mHangThreads;
 
   // Unwinding and reporting of hangs is despatched to this thread.
   nsCOMPtr<nsIThread> mHangProcessingThread;
 
+  // Used for recording a permahang in case we don't ever make it back to
+  // the main thread to record/send it.
+  nsCOMPtr<nsIFile> mPermahangFile;
+
   // Allows us to watch CPU usage and annotate hangs when the system is
   // under high external load.
   CPUUsageWatcher mCPUUsageWatcher;
 
   void Shutdown() {
     MonitorAutoLock autoLock(mLock);
     mShutdown = true;
     autoLock.Notify();
@@ -126,23 +131,41 @@ class BackgroundHangManager : public nsI
   virtual ~BackgroundHangManager();
 };
 
 NS_IMPL_ISUPPORTS(BackgroundHangManager, nsIObserver)
 
 NS_IMETHODIMP
 BackgroundHangManager::Observe(nsISupports* aSubject, const char* aTopic,
                                const char16_t* aData) {
-  NS_ENSURE_TRUE(!strcmp(aTopic, "profile-after-change"), NS_ERROR_UNEXPECTED);
-  BackgroundHangMonitor::DisableOnBeta();
+  if (!strcmp(aTopic, "browser-delayed-startup-finished")) {
+    MonitorAutoLock autoLock(mLock);
+    nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR,
+                                         getter_AddRefs(mPermahangFile));
+    if (NS_SUCCEEDED(rv)) {
+      mPermahangFile->AppendNative(NS_LITERAL_CSTRING("last_permahang.bin"));
+    } else {
+      mPermahangFile = nullptr;
+    }
 
-  nsCOMPtr<nsIObserverService> observerService =
-      mozilla::services::GetObserverService();
-  MOZ_ASSERT(observerService);
-  observerService->RemoveObserver(this, "profile-after-change");
+    if (mHangProcessingThread && mPermahangFile) {
+      nsCOMPtr<nsIRunnable> submitRunnable =
+          new SubmitPersistedPermahangRunnable(mPermahangFile);
+      mHangProcessingThread->Dispatch(submitRunnable.forget());
+    }
+  } else if (!strcmp(aTopic, "profile-after-change")) {
+    BackgroundHangMonitor::DisableOnBeta();
+    nsCOMPtr<nsIObserverService> observerService =
+        mozilla::services::GetObserverService();
+    MOZ_ASSERT(observerService);
+    observerService->RemoveObserver(BackgroundHangManager::sInstance,
+                                    "profile-after-change");
+  } else {
+    return NS_ERROR_UNEXPECTED;
+  }
 
   return NS_OK;
 }
 
 /**
  * BackgroundHangThread is a per-thread object that is used
  * by all instances of BackgroundHangMonitor to monitor hangs.
  */
@@ -211,17 +234,18 @@ class BackgroundHangThread : public Link
 
   BackgroundHangThread(const char* aName, uint32_t aTimeoutMs,
                        uint32_t aMaxTimeoutMs,
                        BackgroundHangMonitor::ThreadType aThreadType =
                            BackgroundHangMonitor::THREAD_SHARED);
 
   // Report a hang; aManager->mLock IS locked. The hang will be processed
   // off-main-thread, and will then be submitted back.
-  void ReportHang(TimeDuration aHangTime);
+  void ReportHang(TimeDuration aHangTime,
+                  PersistedToDisk aPersistedToDisk = PersistedToDisk::No);
   // Report a permanent hang; aManager->mLock IS locked
   void ReportPermaHang();
   // Called by BackgroundHangMonitor::NotifyActivity
   void NotifyActivity() {
     MonitorAutoLock autoLock(mManager->mLock);
     Update();
   }
   // Called by BackgroundHangMonitor::NotifyWait
@@ -463,37 +487,45 @@ BackgroundHangThread::~BackgroundHangThr
   autoLock.Notify();
 
   // We no longer have a thread
   if (sTlsKeyInitialized && IsShared()) {
     sTlsKey.set(nullptr);
   }
 }
 
-void BackgroundHangThread::ReportHang(TimeDuration aHangTime) {
+void BackgroundHangThread::ReportHang(TimeDuration aHangTime,
+                                      PersistedToDisk aPersistedToDisk) {
   // Recovered from a hang; called on the monitor thread
   // mManager->mLock IS locked
 
   HangDetails hangDetails(
       aHangTime,
       nsDependentCString(XRE_ChildProcessTypeToString(XRE_GetProcessType())),
       VoidString(), mThreadName, mRunnableName, std::move(mHangStack),
       std::move(mAnnotations));
 
+  PersistedToDisk persistedToDisk = aPersistedToDisk;
+  if (aPersistedToDisk == PersistedToDisk::Yes && XRE_IsParentProcess()) {
+    auto res = WriteHangDetailsToFile(hangDetails, mManager->mPermahangFile);
+    persistedToDisk = res.isOk() ? PersistedToDisk::Yes : PersistedToDisk::No;
+  }
+
   // If the hang processing thread exists, we can process the native stack
   // on it. Otherwise, we are unable to report a native stack, so we just
   // report without one.
   if (mManager->mHangProcessingThread) {
     nsCOMPtr<nsIRunnable> processHangStackRunnable =
-        new ProcessHangStackRunnable(std::move(hangDetails));
+        new ProcessHangStackRunnable(std::move(hangDetails), persistedToDisk);
     mManager->mHangProcessingThread->Dispatch(
         processHangStackRunnable.forget());
   } else {
     NS_WARNING("Unable to report native stack without a BHR processing thread");
-    RefPtr<nsHangDetails> hd = new nsHangDetails(std::move(hangDetails));
+    RefPtr<nsHangDetails> hd =
+        new nsHangDetails(std::move(hangDetails), persistedToDisk);
     hd->Submit();
   }
 
   // If the profiler is enabled, add a marker.
 #ifdef MOZ_GECKO_PROFILER
   if (profiler_can_accept_markers()) {
     TimeStamp endTime = TimeStamp::Now();
     TimeStamp startTime = endTime - aHangTime;
@@ -504,23 +536,21 @@ void BackgroundHangThread::ReportHang(Ti
   }
 #endif
 }
 
 void BackgroundHangThread::ReportPermaHang() {
   // Permanently hanged; called on the monitor thread
   // mManager->mLock IS locked
 
-  // NOTE: We used to capture a native stack in this situation if one had not
-  // already been captured, but with the new ReportHang design that is less
-  // practical.
-  //
-  // We currently don't look at hang reports outside of nightly, and already
-  // collect native stacks eagerly on nightly, so this should be OK.
-  ReportHang(mMaxTimeout);
+  // The significance of a permahang is that it's likely that we won't ever
+  // recover and be allowed to submit this hang. In the parent process, we
+  // compensate for this by writing the hang details to disk on this thread,
+  // and in our next session we'll try to read those details.
+  ReportHang(mMaxTimeout, PersistedToDisk::Yes);
 }
 
 MOZ_ALWAYS_INLINE void BackgroundHangThread::Update() {
   TimeStamp now = mManager->mNow;
   if (mWaiting) {
     mLastActivity = now;
     mWaiting = false;
     /* We have to wake up the manager thread because when all threads
@@ -608,51 +638,59 @@ void BackgroundHangMonitor::Startup() {
 #ifdef MOZ_ENABLE_BACKGROUND_HANG_MONITOR
   MOZ_ASSERT(!BackgroundHangManager::sInstance, "Already initialized");
 
   if (XRE_IsContentProcess() && IsDisabled()) {
     BackgroundHangManager::sDisabled = true;
     return;
   }
 
+  nsCOMPtr<nsIObserverService> observerService =
+      mozilla::services::GetObserverService();
+  MOZ_ASSERT(observerService);
+
   if (!strcmp(MOZ_STRINGIFY(MOZ_UPDATE_CHANNEL), "beta")) {
     if (XRE_IsParentProcess()) {  // cached ClientID hasn't been read yet
       BackgroundHangThread::Startup();
       BackgroundHangManager::sInstance = new BackgroundHangManager();
       Unused << NS_WARN_IF(
           BackgroundHangManager::sInstance->mCPUUsageWatcher.Init().isErr());
-
-      nsCOMPtr<nsIObserverService> observerService =
-          mozilla::services::GetObserverService();
-      MOZ_ASSERT(observerService);
-
       observerService->AddObserver(BackgroundHangManager::sInstance,
                                    "profile-after-change", false);
       return;
     } else if (DisableOnBeta()) {
       return;
     }
   }
 
   BackgroundHangThread::Startup();
   BackgroundHangManager::sInstance = new BackgroundHangManager();
   Unused << NS_WARN_IF(
       BackgroundHangManager::sInstance->mCPUUsageWatcher.Init().isErr());
+  if (XRE_IsParentProcess()) {
+    observerService->AddObserver(BackgroundHangManager::sInstance,
+                                 "browser-delayed-startup-finished", false);
+  }
 #endif
 }
 
 void BackgroundHangMonitor::Shutdown() {
 #ifdef MOZ_ENABLE_BACKGROUND_HANG_MONITOR
   if (BackgroundHangManager::sDisabled) {
     MOZ_ASSERT(!BackgroundHangManager::sInstance, "Initialized");
     return;
   }
 
   MOZ_ASSERT(BackgroundHangManager::sInstance, "Not initialized");
   BackgroundHangManager::sInstance->mCPUUsageWatcher.Uninit();
+  nsCOMPtr<nsIObserverService> observerService =
+      mozilla::services::GetObserverService();
+  MOZ_ASSERT(observerService);
+  observerService->RemoveObserver(BackgroundHangManager::sInstance,
+                                  "browser-delayed-startup-finished");
   /* Scope our lock inside Shutdown() because the sInstance object can
      be destroyed as soon as we set sInstance to nullptr below, and
      we don't want to hold the lock when it's being destroyed. */
   BackgroundHangManager::sInstance->Shutdown();
   BackgroundHangManager::sInstance = nullptr;
   BackgroundHangManager::sDisabled = true;
 #endif
 }
--- a/toolkit/components/backgroundhangmonitor/HangDetails.cpp
+++ b/toolkit/components/backgroundhangmonitor/HangDetails.cpp
@@ -7,24 +7,33 @@
 #include "HangDetails.h"
 #include "nsIHangDetails.h"
 #include "nsPrintfCString.h"
 #include "mozilla/gfx/GPUParent.h"
 #include "mozilla/dom/ContentChild.h"
 #include "mozilla/dom/ContentParent.h"  // For RemoteTypePrefix
 #include "mozilla/Unused.h"
 #include "mozilla/GfxMessageUtils.h"  // For ParamTraits<GeckoProcessType>
+#include "mozilla/ResultExtensions.h"
 
 #ifdef MOZ_GECKO_PROFILER
 #  include "shared-libraries.h"
 #endif
 
+static const char MAGIC[] = "permahangsavev1";
+
 namespace mozilla {
 
 NS_IMETHODIMP
+nsHangDetails::GetWasPersisted(bool* aWasPersisted) {
+  *aWasPersisted = mPersistedToDisk == PersistedToDisk::Yes;
+  return NS_OK;
+}
+
+NS_IMETHODIMP
 nsHangDetails::GetDuration(double* aDuration) {
   *aDuration = mDetails.duration().ToMilliseconds();
   return NS_OK;
 }
 
 NS_IMETHODIMP
 nsHangDetails::GetThread(nsACString& aName) {
   aName.Assign(mDetails.threadName());
@@ -371,22 +380,327 @@ void ReadModuleInformation(HangStack& st
     if (moduleReferenced) {
       HangModule module(info.GetDebugName(), info.GetBreakpadId());
       stack.modules().AppendElement(module);
     }
   }
 #endif
 }
 
+Result<Ok, nsresult> WriteUint(PRFileDesc* aFile, const CheckedUint32& aInt) {
+  if (!aInt.isValid()) {
+    MOZ_ASSERT_UNREACHABLE("Integer value out of bounds.");
+    return Err(NS_ERROR_UNEXPECTED);
+  }
+  uint32_t value = aInt.value();  // Written as the raw in-memory 4 bytes.
+  if (PR_Write(aFile, (void*)&value, sizeof(value)) != sizeof(value)) {
+    return Err(NS_ERROR_FAILURE);
+  }
+  return Ok();
+}
+
+Result<uint32_t, nsresult> ReadUint(PRFileDesc* aFile) {
+  uint32_t value;  // Unsigned, matching the declared result type.
+  if (PR_Read(aFile, (void*)&value, sizeof(value)) != sizeof(value)) {
+    return Err(NS_ERROR_FAILURE);
+  }
+  return value;
+}
+
+Result<Ok, nsresult> WriteCString(PRFileDesc* aFile, const char* aString) {
+  size_t length = strlen(aString);
+  MOZ_TRY(WriteUint(aFile, CheckedUint32(length)));
+  if (PR_Write(aFile, (void*)aString, length) != length) {
+    return Err(NS_ERROR_FAILURE);
+  }
+  return Ok();
+}
+
+template <typename CharT>
+Result<Ok, nsresult> WriteTString(PRFileDesc* aFile,
+                                  const nsTString<CharT>& aString) {
+  MOZ_TRY(WriteUint(aFile, CheckedUint32(aString.Length())));
+  size_t size = aString.Length() * sizeof(CharT);
+  if (PR_Write(aFile, (void*)aString.get(), size) != size) {
+    return Err(NS_ERROR_FAILURE);
+  }
+  return Ok();
+}
+
+template <typename CharT>
+Result<nsTString<CharT>, nsresult> ReadTString(PRFileDesc* aFile) {
+  uint32_t length;
+  MOZ_TRY_VAR(length, ReadUint(aFile));
+  nsTString<CharT> result;
+  CharT buffer[512];
+  size_t bufferLength = sizeof(buffer) / sizeof(CharT);
+  while (length != 0) {
+    size_t toRead = std::min(bufferLength, size_t(length));
+    size_t toReadSize = toRead * sizeof(CharT);
+    if (PR_Read(aFile, (void*)buffer, toReadSize) != toReadSize) {
+      return Err(NS_ERROR_FAILURE);
+    }
+
+    if (!result.Append(buffer, toRead, mozilla::fallible)) {
+      return Err(NS_ERROR_FAILURE);
+    }
+
+    if (length > bufferLength) {
+      length -= bufferLength;
+    } else {
+      length = 0;
+    }
+  }
+  return result;
+}
+
+Result<Ok, nsresult> WriteEntry(PRFileDesc* aFile, const HangStack& aStack,
+                                const HangEntry& aEntry) {
+  MOZ_TRY(WriteUint(aFile, uint32_t(aEntry.type())));
+  switch (aEntry.type()) {
+    case HangEntry::TnsCString: {
+      MOZ_TRY(WriteTString(aFile, aEntry.get_nsCString()));
+      break;
+    }
+    case HangEntry::THangEntryBufOffset: {
+      uint32_t offset = aEntry.get_HangEntryBufOffset().index();
+
+      if (NS_WARN_IF(aStack.strbuffer().IsEmpty() ||
+                     offset >= aStack.strbuffer().Length())) {
+        MOZ_ASSERT_UNREACHABLE("Corrupted offset data");
+        return Err(NS_ERROR_FAILURE);
+      }
+
+      if (aStack.strbuffer().LastElement() != '\0') {
+        MOZ_ASSERT_UNREACHABLE("Corrupted strbuffer data");
+        return Err(NS_ERROR_FAILURE);
+      }
+
+      const char* start = (const char*)aStack.strbuffer().Elements() + offset;
+      MOZ_TRY(WriteCString(aFile, start));
+      break;
+    }
+    case HangEntry::THangEntryModOffset: {
+      const HangEntryModOffset& mo = aEntry.get_HangEntryModOffset();
+
+      MOZ_TRY(WriteUint(aFile, CheckedUint32(mo.module())));
+      MOZ_TRY(WriteUint(aFile, CheckedUint32(mo.offset())));
+      break;
+    }
+    case HangEntry::THangEntryProgCounter:
+    case HangEntry::THangEntryContent:
+    case HangEntry::THangEntryJit:
+    case HangEntry::THangEntryWasm:
+    case HangEntry::THangEntryChromeScript:
+    case HangEntry::THangEntrySuppressed: {
+      break;
+    }
+    default:
+      MOZ_CRASH("Unsupported HangEntry type?");
+  }
+  return Ok();
+}
+
+Result<Ok, nsresult> ReadEntry(PRFileDesc* aFile, HangStack& aStack) {
+  uint32_t type;
+  MOZ_TRY_VAR(type, ReadUint(aFile));
+  HangEntry::Type entryType = HangEntry::Type(type);
+  switch (entryType) {
+    case HangEntry::TnsCString:
+    case HangEntry::THangEntryBufOffset: {
+      nsCString str;
+      MOZ_TRY_VAR(str, ReadTString<char>(aFile));
+      aStack.stack().AppendElement(std::move(str));
+      break;
+    }
+    case HangEntry::THangEntryModOffset: {
+      uint32_t module;
+      MOZ_TRY_VAR(module, ReadUint(aFile));
+      uint32_t offset;
+      MOZ_TRY_VAR(offset, ReadUint(aFile));
+      aStack.stack().AppendElement(HangEntryModOffset(module, offset));
+      break;
+    }
+    case HangEntry::THangEntryProgCounter: {
+      aStack.stack().AppendElement(HangEntryProgCounter());
+      break;
+    }
+    case HangEntry::THangEntryContent: {
+      aStack.stack().AppendElement(HangEntryContent());
+      break;
+    }
+    case HangEntry::THangEntryJit: {
+      aStack.stack().AppendElement(HangEntryJit());
+      break;
+    }
+    case HangEntry::THangEntryWasm: {
+      aStack.stack().AppendElement(HangEntryWasm());
+      break;
+    }
+    case HangEntry::THangEntryChromeScript: {
+      aStack.stack().AppendElement(HangEntryChromeScript());
+      break;
+    }
+    case HangEntry::THangEntrySuppressed: {
+      aStack.stack().AppendElement(HangEntrySuppressed());
+      break;
+    }
+    default:  // Unknown tag: corrupt or incompatible file, so fail the read.
+      return Err(NS_ERROR_FAILURE);
+  }
+  return Ok();
+}
+
+Result<HangDetails, nsresult> ReadHangDetailsFromFile(nsIFile* aFile) {
+  AutoFDClose fd;
+  nsresult rv = aFile->OpenNSPRFileDesc(PR_RDONLY, 0644, &fd.rwget());
+  if (NS_FAILED(rv)) {
+    return Err(rv);
+  }
+
+  uint8_t magicBuffer[sizeof(MAGIC)];
+  if (PR_Read(fd, (void*)magicBuffer, sizeof(MAGIC)) != sizeof(MAGIC)) {
+    return Err(NS_ERROR_FAILURE);
+  }
+  if (memcmp(magicBuffer, MAGIC, sizeof(MAGIC)) != 0) {
+    return Err(NS_ERROR_FAILURE);
+  }
+
+  HangDetails result;
+  uint32_t duration;
+  MOZ_TRY_VAR(duration, ReadUint(fd));
+  result.duration() = TimeDuration::FromMilliseconds(double(duration));
+  MOZ_TRY_VAR(result.threadName(), ReadTString<char>(fd));
+  MOZ_TRY_VAR(result.runnableName(), ReadTString<char>(fd));
+  MOZ_TRY_VAR(result.process(), ReadTString<char>(fd));
+  MOZ_TRY_VAR(result.remoteType(), ReadTString<char16_t>(fd));
+
+  uint32_t numAnnotations;
+  MOZ_TRY_VAR(numAnnotations, ReadUint(fd));
+  auto& annotations = result.annotations();
+
+  // Add an "Unrecovered" annotation so we can know when processing this that
+  // the hang persisted until the process was closed.
+  if (!annotations.SetCapacity(numAnnotations + 1, mozilla::fallible)) {
+    return Err(NS_ERROR_FAILURE);
+  }
+  annotations.AppendElement(HangAnnotation(NS_LITERAL_STRING("Unrecovered"),
+                                           NS_LITERAL_STRING("true")));
+
+  for (size_t i = 0; i < numAnnotations; ++i) {
+    HangAnnotation annot;
+    MOZ_TRY_VAR(annot.name(), ReadTString<char16_t>(fd));
+    MOZ_TRY_VAR(annot.value(), ReadTString<char16_t>(fd));
+    annotations.AppendElement(std::move(annot));
+  }
+
+  auto& stack = result.stack();
+  uint32_t numFrames;
+  MOZ_TRY_VAR(numFrames, ReadUint(fd));
+  if (!stack.stack().SetCapacity(numFrames, mozilla::fallible)) {
+    return Err(NS_ERROR_FAILURE);
+  }
+
+  for (size_t i = 0; i < numFrames; ++i) {
+    MOZ_TRY(ReadEntry(fd, stack));
+  }
+
+  uint32_t numModules;
+  MOZ_TRY_VAR(numModules, ReadUint(fd));
+  auto& modules = stack.modules();
+  if (!modules.SetCapacity(numModules, mozilla::fallible)) {
+    return Err(NS_ERROR_FAILURE);
+  }
+
+  for (size_t i = 0; i < numModules; ++i) {
+    HangModule module;
+    MOZ_TRY_VAR(module.name(), ReadTString<char16_t>(fd));
+    MOZ_TRY_VAR(module.breakpadId(), ReadTString<char>(fd));
+    modules.AppendElement(std::move(module));
+  }
+
+  return result;
+}
+
+Result<Ok, nsresult> WriteHangDetailsToFile(HangDetails& aDetails,
+                                            nsIFile* aFile) {
+  if (!aFile) {  // No target file was supplied, so there is nowhere to write.
+    return Err(NS_ERROR_NOT_INITIALIZED);
+  }
+  AutoFDClose fd;
+  MOZ_TRY(aFile->OpenNSPRFileDesc(PR_WRONLY | PR_CREATE_FILE | PR_TRUNCATE,
+                                  0644, &fd.rwget()));
+  // Layout: MAGIC, duration (ms), four strings, annotations, frames, modules.
+  if (PR_Write(fd, (void*)MAGIC, sizeof(MAGIC)) != sizeof(MAGIC)) {
+    return Err(NS_ERROR_FAILURE);
+  }
+
+  double duration = aDetails.duration().ToMilliseconds();
+  if (duration > double(MaxValue<uint32_t>::value)) {
+    // Something has gone terribly wrong if we've hung for more than 2^32 ms.
+    return Err(NS_ERROR_FAILURE);
+  }
+
+  MOZ_TRY(WriteUint(fd, uint32_t(duration)));
+  MOZ_TRY(WriteTString(fd, aDetails.threadName()));
+  MOZ_TRY(WriteTString(fd, aDetails.runnableName()));
+  MOZ_TRY(WriteTString(fd, aDetails.process()));
+  MOZ_TRY(WriteTString(fd, aDetails.remoteType()));
+  MOZ_TRY(WriteUint(fd, CheckedUint32(aDetails.annotations().Length())));
+
+  for (auto& annot : aDetails.annotations()) {
+    MOZ_TRY(WriteTString(fd, annot.name()));
+    MOZ_TRY(WriteTString(fd, annot.value()));
+  }
+
+  auto& stack = aDetails.stack();
+  ReadModuleInformation(stack);
+
+  MOZ_TRY(WriteUint(fd, CheckedUint32(stack.stack().Length())));
+  for (auto& entry : stack.stack()) {
+    MOZ_TRY(WriteEntry(fd, stack, entry));
+  }
+
+  auto& modules = stack.modules();
+  MOZ_TRY(WriteUint(fd, CheckedUint32(modules.Length())));
+
+  for (auto& module : modules) {
+    MOZ_TRY(WriteTString(fd, module.name()));
+    MOZ_TRY(WriteTString(fd, module.breakpadId()));
+  }
+
+  return Ok();
+}
+
 NS_IMETHODIMP
 ProcessHangStackRunnable::Run() {
   // NOTE: Reading module information can take a long time, which is why we do
   // it off-main-thread.
-  ReadModuleInformation(mHangDetails.stack());
+  if (mHangDetails.stack().modules().IsEmpty()) {
+    ReadModuleInformation(mHangDetails.stack());
+  }
 
   RefPtr<nsHangDetails> hangDetails =
-      new nsHangDetails(std::move(mHangDetails));
+      new nsHangDetails(std::move(mHangDetails), mPersistedToDisk);
+  hangDetails->Submit();
+
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+SubmitPersistedPermahangRunnable::Run() {
+  auto hangDetailsResult = ReadHangDetailsFromFile(mPermahangFile);
+  if (hangDetailsResult.isErr()) {
+    // If we somehow failed in trying to deserialize the hang file, go ahead
+    // and delete it to prevent future runs from having to go through the
+    // same thing. If we succeeded, however, the file should be cleaned up
+    // once the hang is submitted.
+    Unused << mPermahangFile->Remove(false);
+    return hangDetailsResult.unwrapErr();
+  }
+  RefPtr<nsHangDetails> hangDetails =
+      new nsHangDetails(hangDetailsResult.unwrap(), PersistedToDisk::Yes);
   hangDetails->Submit();
 
   return NS_OK;
 }
 
 }  // namespace mozilla
--- a/toolkit/components/backgroundhangmonitor/HangDetails.h
+++ b/toolkit/components/backgroundhangmonitor/HangDetails.h
@@ -5,64 +5,95 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #ifndef mozilla_HangDetails_h
 #define mozilla_HangDetails_h
 
 #include "ipc/IPCMessageUtils.h"
 #include "mozilla/ProcessedStack.h"
 #include "mozilla/RefPtr.h"
+#include "mozilla/Result.h"
 #include "mozilla/Move.h"
 #include "mozilla/HangTypes.h"
 #include "mozilla/HangAnnotations.h"
 #include "nsTArray.h"
 #include "nsIHangDetails.h"
 #include "mozilla/TimeStamp.h"
 
 namespace mozilla {
 
+enum class PersistedToDisk {
+  No,
+  Yes,
+};
+
 /**
  * HangDetails is the concrete implementaion of nsIHangDetails, and contains the
  * infromation which we want to expose to observers of the bhr-thread-hang
  * observer notification.
  */
 class nsHangDetails : public nsIHangDetails {
  public:
   NS_DECL_THREADSAFE_ISUPPORTS
   NS_DECL_NSIHANGDETAILS
 
-  explicit nsHangDetails(HangDetails&& aDetails)
-      : mDetails(std::move(aDetails)) {}
+  explicit nsHangDetails(HangDetails&& aDetails,
+                         PersistedToDisk aPersistedToDisk)
+      : mDetails(std::move(aDetails)), mPersistedToDisk(aPersistedToDisk) {}
 
   // Submit these HangDetails to the main thread. This will dispatch a runnable
   // to the main thread which will fire off the bhr-thread-hang observer
   // notification with this HangDetails as the subject.
   void Submit();
 
  private:
   virtual ~nsHangDetails() {}
 
   HangDetails mDetails;
+  PersistedToDisk mPersistedToDisk;
 };
 
+Result<Ok, nsresult> WriteHangDetailsToFile(HangDetails& aDetails,
+                                            nsIFile* aFile);
+
 /**
  * This runnable is run on the StreamTransportService threadpool in order to
  * process the stack off main thread before submitting it to the main thread as
  * an observer notification.
  *
  * This object should have the only remaining reference to aHangDetails, as it
  * will access its fields without synchronization.
  */
 class ProcessHangStackRunnable final : public Runnable {
  public:
-  explicit ProcessHangStackRunnable(HangDetails&& aHangDetails)
+  explicit ProcessHangStackRunnable(HangDetails&& aHangDetails,
+                                    PersistedToDisk aPersistedToDisk)
       : Runnable("ProcessHangStackRunnable"),
-        mHangDetails(std::move(aHangDetails)) {}
+        mHangDetails(std::move(aHangDetails)),
+        mPersistedToDisk(aPersistedToDisk) {}
 
   NS_IMETHOD Run() override;
 
  private:
   HangDetails mHangDetails;
+  PersistedToDisk mPersistedToDisk;
+};
+
+/**
+ * This runnable handles checking whether our last session wrote a permahang to
+ * disk which we were unable to submit through telemetry. If so, we read the
+ * permahang out and try again to submit it.
+ */
+class SubmitPersistedPermahangRunnable final : public Runnable {
+ public:
+  explicit SubmitPersistedPermahangRunnable(nsIFile* aPermahangFile)
+      : Runnable("SubmitPersistedPermahangRunnable"),
+        mPermahangFile(aPermahangFile) {}
+
+  NS_IMETHOD Run() override;
+
+ private:
+  nsCOMPtr<nsIFile> mPermahangFile;
 };
 
 }  // namespace mozilla
 
 #endif  // mozilla_HangDetails_h
--- a/toolkit/components/backgroundhangmonitor/nsIHangDetails.idl
+++ b/toolkit/components/backgroundhangmonitor/nsIHangDetails.idl
@@ -16,16 +16,22 @@ class HangDetails;
 /**
  * A scriptable interface for getting information about a BHR detected hang.
  * This is the type of the subject of the "bhr-thread-hang" observer topic.
  */
 [scriptable, uuid(23d63fff-38d6-4003-9c57-2c90aca1180a)]
 interface nsIHangDetails : nsISupports
 {
   /**
+   * The hang was persisted to disk as a permahang, so we can clear the
+   * permahang file once we submit this.
+   */
+  readonly attribute bool wasPersisted;
+
+  /**
    * The detected duration of the hang in milliseconds.
    */
   readonly attribute double duration;
 
   /**
    * The name of the thread which hung.
    */
   readonly attribute ACString thread;
--- a/toolkit/components/telemetry/docs/data/backgroundhangmonitor-ping.rst
+++ b/toolkit/components/telemetry/docs/data/backgroundhangmonitor-ping.rst
@@ -148,8 +148,10 @@ The following annotations are currently 
 | pluginVersion   | Version of the currently running plugin         |
 +-----------------+-------------------------------------------------+
 | HangUIShown     | "true" if the hang UI was shown                 |
 +-----------------+-------------------------------------------------+
 | HangUIContinued | "true" if continue was selected in the hang UI  |
 +-----------------+-------------------------------------------------+
 | HangUIDontShow  | "true" if the hang UI was not shown             |
 +-----------------+-------------------------------------------------+
+| Unrecovered     | "true" if the hang persisted until process exit |
++-----------------+-------------------------------------------------+