Bug 1181175 - Use RDTSC for Performance Monitoring instead of getrusage. r=jandem
☠☠ backed out by 3ccebe8a8e61 ☠ ☠
author David Rajchenbach-Teller <dteller@mozilla.com>
Tue, 07 Jul 2015 18:45:54 +0200
changeset 288459 0a92886f497a3f79023195fb9e35a0a302c0f285
parent 288458 3e6d4d0f2e268ce676c0d9c77b523e79aed7c394
child 288460 21b660154b4a7388fdb68a456f9b5c7b874d1000
push id 5067
push user raliiev@mozilla.com
push date Mon, 21 Sep 2015 14:04:52 +0000
treeherder mozilla-beta@14221ffe5b2f [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jandem
bugs 1181175
milestone 42.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1181175 - Use RDTSC for Performance Monitoring instead of getrusage. r=jandem
js/ipc/CPOWTimer.cpp
js/src/jsapi.h
js/src/vm/Interpreter.cpp
js/src/vm/Runtime.cpp
js/src/vm/Runtime.h
js/xpconnect/src/nsXPConnect.cpp
js/xpconnect/src/xpcprivate.h
toolkit/components/perfmonitoring/tests/xpcshell/test_compartments.js
--- a/js/ipc/CPOWTimer.cpp
+++ b/js/ipc/CPOWTimer.cpp
@@ -36,11 +36,10 @@ CPOWTimer::~CPOWTimer()
     }
 
     const int64_t endInterval = JS_Now();
     if (endInterval <= startInterval_) {
         // Do not assume monotonicity.
         return;
     }
 
-    js::PerformanceData* performance = js::GetPerformanceData(runtime);
-    performance->totalCPOWTime += endInterval - startInterval_;
+    js::AddCPOWPerformanceDelta(runtime, endInterval - startInterval_);
 }
--- a/js/src/jsapi.h
+++ b/js/src/jsapi.h
@@ -5477,16 +5477,17 @@ BuildStackString(JSContext* cx, HandleOb
 /* Stopwatch-based CPU monitoring. */
 
 namespace js {
 
 class AutoStopwatch;
 
 // Container for performance data
 // All values are monotonic.
+// All values are updated after running to completion.
 struct PerformanceData {
     // Number of times we have spent at least 2^n consecutive
     // milliseconds executing code in this group.
     // durations[0] is increased whenever we spend at least 1 ms
     // executing code in this group
     // durations[1] whenever we spend 2ms+
     //
     // durations[i] whenever we spend 2^ims+
@@ -5543,40 +5544,78 @@ struct PerformanceData {
 struct PerformanceGroup {
 
     // Performance data for this group.
     PerformanceData data;
 
     // An id unique to this runtime.
     const uint64_t uid;
 
+    // The number of cycles spent in this group during this iteration
+    // of the event loop. Note that cycles are not a reliable measure,
+    // especially over short intervals. See Runtime.cpp for a more
+    // complete discussion on the imprecision of cycle measurement.
+    uint64_t recentCycles;
+
+    // The number of times this group has been activated during this
+    // iteration of the event loop.
+    uint64_t recentTicks;
+
+    // The number of microseconds spent doing CPOW during this
+    // iteration of the event loop.
+    uint64_t recentCPOW;
+
+    // The current iteration of the event loop.
+    uint64_t iteration() const {
+        return iteration_;
+    }
+
     // `true` if an instance of `AutoStopwatch` is already monitoring
     // the performance of this performance group for this iteration
     // of the event loop, `false` otherwise.
-    bool hasStopwatch(uint64_t iteration) const {
-        return stopwatch_ != nullptr && iteration_ == iteration;
+    bool hasStopwatch(uint64_t it) const {
+        return stopwatch_ != nullptr && iteration_ == it;
+    }
+
+    // `true` if a specific instance of `AutoStopwatch` is already monitoring
+    // the performance of this performance group for this iteration
+    // of the event loop, `false` otherwise.
+    bool hasStopwatch(uint64_t it, const AutoStopwatch* stopwatch) const {
+        return stopwatch_ == stopwatch && iteration_ == it;
     }
 
     // Mark that an instance of `AutoStopwatch` is monitoring
     // the performance of this group for a given iteration.
-    void acquireStopwatch(uint64_t iteration, const AutoStopwatch* stopwatch) {
-        iteration_ = iteration;
+    void acquireStopwatch(uint64_t it, const AutoStopwatch* stopwatch) {
+        if (iteration_ != it) {
+            // Any data that pretends to be recent is actually bound
+            // to an older iteration and therefore stale.
+            resetRecentData();
+        }
+        iteration_ = it;
         stopwatch_ = stopwatch;
     }
 
     // Mark that no `AutoStopwatch` is monitoring the
     // performance of this group for the iteration.
-    void releaseStopwatch(uint64_t iteration, const AutoStopwatch* stopwatch) {
-        if (iteration_ != iteration)
+    void releaseStopwatch(uint64_t it, const AutoStopwatch* stopwatch) {
+        if (iteration_ != it)
             return;
 
         MOZ_ASSERT(stopwatch == stopwatch_ || stopwatch_ == nullptr);
         stopwatch_ = nullptr;
     }
 
+    // Get rid of any data that pretends to be recent.
+    void resetRecentData() {
+        recentCycles = 0;
+        recentTicks = 0;
+        recentCPOW = 0;
+    }
+
     // Refcounting. For use with mozilla::RefPtr.
     void AddRef();
     void Release();
 
     // Construct a PerformanceGroup for a single compartment.
     explicit PerformanceGroup(JSRuntime* rt);
 
     // Construct a PerformanceGroup for a group of compartments.
@@ -5594,20 +5633,19 @@ private:
 
     // The current iteration of the event loop. If necessary,
     // may safely overflow.
     uint64_t iteration_;
 
     // The hash key for this PerformanceGroup.
     void* const key_;
 
-    // A reference counter.
+    // Refcounter.
     uint64_t refCount_;
 
-
     // `true` if this PerformanceGroup may be shared by several
     // compartments, `false` if it is dedicated to a single
     // compartment.
     const bool isSharedGroup_;
 };
 
 //
 // Each PerformanceGroupHolder handles:
@@ -5657,22 +5695,29 @@ struct PerformanceGroupHolder {
     // The PerformanceGroups held by this object.
     // Initially set to `nullptr` until the first call to `getGroup`.
     // May be reset to `nullptr` by a call to `unlink`.
     mozilla::RefPtr<js::PerformanceGroup> sharedGroup_;
     mozilla::RefPtr<js::PerformanceGroup> ownGroup_;
 };
 
 /**
- * Reset any stopwatch currently measuring.
+ * Commit any Performance Monitoring data.
  *
- * This function is designed to be called when we process a new event.
+ * Until `FlushPerformanceMonitoring` has been called, all Performance Monitoring data is invisible
+ * to the outside world and can be cancelled with a call to `ResetPerformanceMonitoring`.
  */
 extern JS_PUBLIC_API(void)
-ResetStopwatches(JSRuntime*);
+FlushPerformanceMonitoring(JSRuntime*);
+
+/**
+ * Cancel any measurement that hasn't been committed.
+ */
+extern JS_PUBLIC_API(void)
+ResetPerformanceMonitoring(JSRuntime*);
 
 /**
  * Turn on/off stopwatch-based CPU monitoring.
  *
  * `SetStopwatchIsMonitoringCPOW` or `SetStopwatchIsMonitoringJank`
  * may return `false` if monitoring could not be activated, which may
  * happen if we are out of memory.
  */
@@ -5688,20 +5733,21 @@ extern JS_PUBLIC_API(bool)
 SetStopwatchIsMonitoringPerCompartment(JSRuntime*, bool);
 extern JS_PUBLIC_API(bool)
 GetStopwatchIsMonitoringPerCompartment(JSRuntime*);
 
 extern JS_PUBLIC_API(bool)
 IsStopwatchActive(JSRuntime*);
 
 /**
- * Access the performance information stored in a compartment.
- */
-extern JS_PUBLIC_API(PerformanceData*)
-GetPerformanceData(JSRuntime*);
+ * Add a number of microseconds to the time spent waiting on CPOWs
+ * since process start.
+ */
+extern JS_PUBLIC_API(void)
+AddCPOWPerformanceDelta(JSRuntime*, uint64_t delta);
 
 typedef bool
 (PerformanceStatsWalker)(JSContext* cx,
                          const PerformanceData& stats, uint64_t uid,
                          const uint64_t* parentId, void* closure);
 
 /**
  * Extract the performance statistics.
--- a/js/src/vm/Interpreter.cpp
+++ b/js/src/vm/Interpreter.cpp
@@ -51,24 +51,20 @@
 #include "jit/AtomicOperations-inl.h"
 #include "jit/JitFrames-inl.h"
 #include "vm/Debugger-inl.h"
 #include "vm/NativeObject-inl.h"
 #include "vm/Probes-inl.h"
 #include "vm/ScopeObject-inl.h"
 #include "vm/Stack-inl.h"
 
-#if defined(XP_MACOSX)
-#include <mach/mach.h>
-#elif defined(XP_UNIX)
-#include <sys/resource.h>
-#elif defined(XP_WIN)
-#include <processthreadsapi.h>
-#include <windows.h>
-#endif // defined(XP_MACOSX) || defined(XP_UNIX) || defined(XP_WIN)
+#if defined(XP_WIN)
+#include <Windows.h>
+#include <Processthreadsapi.h>
+#endif // defined(XP_WIN)
 
 using namespace js;
 using namespace js::gc;
 
 using mozilla::ArrayLength;
 using mozilla::DebugOnly;
 using mozilla::NumberEqualsInt32;
 using mozilla::PodCopy;
@@ -390,142 +386,166 @@ class AutoStopwatch final
     uint64_t iteration_;
 
     // `true` if we are monitoring jank, `false` otherwise.
     bool isMonitoringJank_;
     // `true` if we are monitoring CPOW, `false` otherwise.
     bool isMonitoringCPOW_;
 
     // Timestamps captured while starting the stopwatch.
-    uint64_t userTimeStart_;
-    uint64_t systemTimeStart_;
+    uint64_t cyclesStart_;
     uint64_t CPOWTimeStart_;
 
-   // The performance group shared by this compartment and possibly
-   // others, or `nullptr` if another AutoStopwatch is already in
-   // charge of monitoring that group.
-   mozilla::RefPtr<js::PerformanceGroup> sharedGroup_;
-
-   // The toplevel group, representing the entire process, or `nullptr`
-   // if another AutoStopwatch is already in charge of monitoring that group.
-   mozilla::RefPtr<js::PerformanceGroup> topGroup_;
-
-   // The performance group specific to this compartment, or
-   // `nullptr` if another AutoStopwatch is already in charge of
-   // monitoring that group.
-   mozilla::RefPtr<js::PerformanceGroup> ownGroup_;
-
-   public:
+    // The CPU on which we started the measure. Defined only
+    // if `isMonitoringJank_` is `true`.
+#if defined(XP_WIN) && _WIN32_WINNT >= 0x0601
+    struct cpuid_t {
+        WORD group_;
+        BYTE number_;
+        cpuid_t(WORD group, BYTE number)
+          : group_(group),
+            number_(number)
+        { }
+        cpuid_t()
+          : group_(0),
+            number_(0)
+        { }
+    };
+#elif defined(XP_LINUX)
+    typedef int cpuid_t;
+#else
+    typedef struct {} cpuid_t;
+#endif // defined(XP_WIN) || defined(XP_LINUX)
+
+    cpuid_t cpuStart_;
+
+    // The performance group shared by this compartment and possibly
+    // others, or `nullptr` if another AutoStopwatch is already in
+    // charge of monitoring that group.
+    mozilla::RefPtr<js::PerformanceGroup> sharedGroup_;
+
+    // The toplevel group, representing the entire process, or `nullptr`
+    // if another AutoStopwatch is already in charge of monitoring that group.
+    mozilla::RefPtr<js::PerformanceGroup> topGroup_;
+
+    // The performance group specific to this compartment, or
+    // `nullptr` if another AutoStopwatch is already in charge of
+    // monitoring that group.
+    mozilla::RefPtr<js::PerformanceGroup> ownGroup_;
+
+ public:
     // If the stopwatch is active, constructing an instance of
     // AutoStopwatch causes it to become the current owner of the
     // stopwatch.
     //
     // Previous owner is restored upon destruction.
     explicit inline AutoStopwatch(JSContext* cx MOZ_GUARD_OBJECT_NOTIFIER_PARAM)
       : cx_(cx)
       , iteration_(0)
       , isMonitoringJank_(false)
       , isMonitoringCPOW_(false)
-      , userTimeStart_(0)
-      , systemTimeStart_(0)
+      , cyclesStart_(0)
       , CPOWTimeStart_(0)
     {
         MOZ_GUARD_OBJECT_NOTIFIER_INIT;
 
         JSCompartment* compartment = cx_->compartment();
         if (compartment->scheduledForDestruction)
             return;
 
         JSRuntime* runtime = cx_->runtime();
-        iteration_ = runtime->stopwatch.iteration;
+        iteration_ = runtime->stopwatch.iteration();
 
         sharedGroup_ = acquireGroup(compartment->performanceMonitoring.getSharedGroup(cx));
         if (sharedGroup_)
             topGroup_ = acquireGroup(runtime->stopwatch.performance.getOwnGroup());
 
         if (runtime->stopwatch.isMonitoringPerCompartment())
             ownGroup_ = acquireGroup(compartment->performanceMonitoring.getOwnGroup());
 
         if (!sharedGroup_ && !ownGroup_) {
             // We are not in charge of monitoring anything.
             return;
         }
 
+        // Now that we are sure that JS code is being executed,
+        // initialize the stopwatch for this iteration, lazily.
+        runtime->stopwatch.start();
         enter();
     }
     ~AutoStopwatch()
     {
         if (!sharedGroup_ && !ownGroup_) {
             // We are not in charge of monitoring anything.
-            // (isMonitoringForTop_ implies isMonitoringForGroup_,
-            // so we do not need to check it)
             return;
         }
 
         JSCompartment* compartment = cx_->compartment();
         if (compartment->scheduledForDestruction)
             return;
 
         JSRuntime* runtime = cx_->runtime();
-        if (iteration_ != runtime->stopwatch.iteration) {
+        if (iteration_ != runtime->stopwatch.iteration()) {
             // We have entered a nested event loop at some point.
             // Any information we may have is obsolete.
             return;
         }
 
+        // Finish and commit measures
+        exit();
+
         releaseGroup(sharedGroup_);
         releaseGroup(topGroup_);
         releaseGroup(ownGroup_);
-
-        // Finish and commit measures
-        exit();
     }
    private:
     void enter() {
         JSRuntime* runtime = cx_->runtime();
 
         if (runtime->stopwatch.isMonitoringCPOW()) {
-            CPOWTimeStart_ = runtime->stopwatch.performance.getOwnGroup()->data.totalCPOWTime;
+            CPOWTimeStart_ = runtime->stopwatch.totalCPOWTime;
             isMonitoringCPOW_ = true;
         }
 
         if (runtime->stopwatch.isMonitoringJank()) {
-            if (this->getTimes(runtime, &userTimeStart_, &systemTimeStart_)) {
-                isMonitoringJank_ = true;
-            }
+            cyclesStart_ = this->getCycles();
+            cpuStart_ = this->getCPU();
+            isMonitoringJank_ = true;
         }
 
     }
 
     void exit() {
         JSRuntime* runtime = cx_->runtime();
 
-        uint64_t userTimeDelta = 0;
-        uint64_t systemTimeDelta = 0;
+        uint64_t cyclesDelta = 0;
         if (isMonitoringJank_ && runtime->stopwatch.isMonitoringJank()) {
             // We were monitoring jank when we entered and we still are.
-            uint64_t userTimeEnd, systemTimeEnd;
-            if (!this->getTimes(runtime, &userTimeEnd, &systemTimeEnd)) {
-                // We make no attempt to recover from this error. If
-                // we bail out here, we lose nothing of value, plus
-                // I'm nearly sure that this error cannot happen in
-                // practice.
-                return;
+
+            // If possible, discard results when we don't end on the
+            // same CPU as we started.  Note that we can be
+            // rescheduled to another CPU between `getCycles()` and
+            // `getCPU()`.  We hope that this will happen rarely
+            // enough that the impact on our statistics will remain
+            // limited.
+            const cpuid_t cpuEnd = this->getCPU();
+            if (isSameCPU(cpuStart_, cpuEnd)) {
+                const uint64_t cyclesEnd = getCycles();
+                cyclesDelta = getDelta(cyclesEnd, cyclesStart_);
             }
-            userTimeDelta = userTimeEnd - userTimeStart_;
-            systemTimeDelta = systemTimeEnd - systemTimeStart_;
         }
 
         uint64_t CPOWTimeDelta = 0;
         if (isMonitoringCPOW_ && runtime->stopwatch.isMonitoringCPOW()) {
             // We were monitoring CPOW when we entered and we still are.
-            CPOWTimeDelta = runtime->stopwatch.performance.getOwnGroup()->data.totalCPOWTime - CPOWTimeStart_;
+            const uint64_t CPOWTimeEnd = runtime->stopwatch.totalCPOWTime;
+            CPOWTimeDelta = getDelta(CPOWTimeEnd, CPOWTimeStart_);
 
         }
-        commitDeltasToGroups(userTimeDelta, systemTimeDelta, CPOWTimeDelta);
+        addToGroups(cyclesDelta, CPOWTimeDelta);
     }
 
     // Attempt to acquire a group
     // If the group is `null` or if the group already has a stopwatch,
     // do nothing and return `null`.
     // Otherwise, bind the group to `this` for the current iteration
     // and return `group`.
     PerformanceGroup* acquireGroup(PerformanceGroup* group) {
@@ -542,150 +562,114 @@ class AutoStopwatch final
     // Release a group.
     // Noop if `group` is null or if `this` is not the stopwatch
     // of `group` for the current iteration.
     void releaseGroup(PerformanceGroup* group) {
         if (group)
             group->releaseStopwatch(iteration_, this);
     }
 
-    void commitDeltasToGroups(uint64_t userTimeDelta, uint64_t systemTimeDelta,
-                              uint64_t CPOWTimeDelta) const {
-        applyDeltas(userTimeDelta, systemTimeDelta, CPOWTimeDelta, sharedGroup_);
-        applyDeltas(userTimeDelta, systemTimeDelta, CPOWTimeDelta, topGroup_);
-        applyDeltas(userTimeDelta, systemTimeDelta, CPOWTimeDelta, ownGroup_);
+    // Add recent changes to all the groups owned by this stopwatch.
+    // Mark the groups as changed recently.
+    void addToGroups(uint64_t cyclesDelta, uint64_t CPOWTimeDelta) {
+        addToGroup(cyclesDelta, CPOWTimeDelta, sharedGroup_);
+        addToGroup(cyclesDelta, CPOWTimeDelta, topGroup_);
+        addToGroup(cyclesDelta, CPOWTimeDelta, ownGroup_);
     }
 
-    void applyDeltas(uint64_t userTimeDelta, uint64_t systemTimeDelta,
-                     uint64_t CPOWTimeDelta, PerformanceGroup* group) const {
+    // Add recent changes to a single group. Mark the group as changed recently.
+    void addToGroup(uint64_t cyclesDelta, uint64_t CPOWTimeDelta, PerformanceGroup* group) {
         if (!group)
             return;
 
-        group->data.ticks++;
-
-        uint64_t totalTimeDelta = userTimeDelta + systemTimeDelta;
-        group->data.totalUserTime += userTimeDelta;
-        group->data.totalSystemTime += systemTimeDelta;
-        group->data.totalCPOWTime += CPOWTimeDelta;
-
-        // Update an array containing the number of times we have missed
-        // at least 2^0 successive ms, 2^1 successive ms, ...
-        // 2^i successive ms.
-
-        // Duration of one frame, i.e. 16ms in museconds
-        size_t i = 0;
-        uint64_t duration = 1000;
-        for (i = 0, duration = 1000;
-             i < ArrayLength(group->data.durations) && duration < totalTimeDelta;
-             ++i, duration *= 2)
-        {
-            group->data.durations[i]++;
+        MOZ_ASSERT(group->hasStopwatch(iteration_, this));
+
+        if (group->recentTicks == 0) {
+            // First time we meet this group during the tick,
+            // mark it as needing updates.
+            JSRuntime* runtime = cx_->runtime();
+            runtime->stopwatch.addChangedGroup(group);
         }
+        group->recentTicks++;
+        group->recentCycles += cyclesDelta;
+        group->recentCPOW += CPOWTimeDelta;
+    }
+
+    // Perform a subtraction for a quantity that should be monotonic
+    // but is not guaranteed to be so.
+    //
+    // If `start <= end`, return `end - start`.
+    // Otherwise, return `0`.
+    uint64_t getDelta(const uint64_t end, const uint64_t start) const
+    {
+        if (start >= end)
+            return 0;
+        return end - start;
     }
 
-    // Get the OS-reported time spent in userland/systemland, in
-    // microseconds. On most platforms, this data is per-thread,
-    // but on some platforms we need to fall back to per-process.
-    bool getTimes(JSRuntime* runtime, uint64_t* userTime, uint64_t* systemTime) const {
-        MOZ_ASSERT(userTime);
-        MOZ_ASSERT(systemTime);
-
-#if defined(XP_MACOSX)
-        // On MacOS X, to get we per-thread data, we need to
-        // reach into the kernel.
-
-        mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
-        thread_basic_info_data_t info;
-        mach_port_t port = mach_thread_self();
-        kern_return_t err =
-            thread_info(/* [in] targeted thread*/ port,
-                        /* [in] nature of information*/ THREAD_BASIC_INFO,
-                        /* [out] thread information */  (thread_info_t)&info,
-                        /* [inout] number of items */   &count);
-
-        // We do not need ability to communicate with the thread, so
-        // let's release the port.
-        mach_port_deallocate(mach_task_self(), port);
-
-        if (err != KERN_SUCCESS)
-            return false;
-
-        *userTime = info.user_time.microseconds + info.user_time.seconds * 1000000;
-        *systemTime = info.system_time.microseconds + info.system_time.seconds * 1000000;
-
-#elif defined(XP_UNIX)
-        struct rusage rusage;
-#if defined(RUSAGE_THREAD)
-        // Under Linux, we can obtain per-thread statistics
-        int err = getrusage(RUSAGE_THREAD, &rusage);
+    // Return the value of the Timestamp Counter, as provided by the CPU.
+    // 0 on platforms for which we do not have access to a Timestamp Counter.
+    uint64_t getCycles() const
+    {
+#if defined(MOZ_HAVE_RDTSC)
+        return ReadTimestampCounter();
 #else
-        // Under other Unices, we need to do with more noisy
-        // per-process statistics.
-        int err = getrusage(RUSAGE_SELF, &rusage);
-#endif // defined(RUSAGE_THREAD)
-
-        if (err)
-            return false;
-
-        *userTime = rusage.ru_utime.tv_usec + rusage.ru_utime.tv_sec * 1000000;
-        *systemTime = rusage.ru_stime.tv_usec + rusage.ru_stime.tv_sec * 1000000;
-
-#elif defined(XP_WIN)
-        // Under Windows, we can obtain per-thread statistics,
-        // although experience seems to suggest that they are
-        // not very good under Windows XP.
-        FILETIME creationFileTime; // Ignored
-        FILETIME exitFileTime; // Ignored
-        FILETIME kernelFileTime;
-        FILETIME userFileTime;
-        BOOL success = GetThreadTimes(GetCurrentThread(),
-                                      &creationFileTime, &exitFileTime,
-                                      &kernelFileTime, &userFileTime);
-
-        if (!success)
-            return false;
-
-        ULARGE_INTEGER kernelTimeInt;
-        ULARGE_INTEGER userTimeInt;
-        kernelTimeInt.LowPart = kernelFileTime.dwLowDateTime;
-        kernelTimeInt.HighPart = kernelFileTime.dwHighDateTime;
-        // Convert 100 ns to 1 us, make sure that the result is monotonic
-        *systemTime = runtime->stopwatch.systemTimeFix.monotonize(kernelTimeInt.QuadPart / 10);
-
-        userTimeInt.LowPart = userFileTime.dwLowDateTime;
-        userTimeInt.HighPart = userFileTime.dwHighDateTime;
-        // Convert 100 ns to 1 us, make sure that the result is monotonic
-        *userTime = runtime->stopwatch.userTimeFix.monotonize(userTimeInt.QuadPart / 10);
-
-#endif // defined(XP_MACOSX) || defined(XP_UNIX) || defined(XP_WIN)
-
+        return 0;
+#endif // defined(MOZ_HAVE_RDTSC)
+    }
+
+
+    // Return the identifier of the current CPU, on platforms for which we have
+    // access to the current CPU.
+    cpuid_t inline getCPU() const
+    {
+#if defined(XP_WIN)
+        PROCESSOR_NUMBER proc;
+        GetCurrentProcessorNumberEx(&proc);
+
+        cpuid_t result(proc.Group, proc.Number);
+        return result;
+#elif defined(XP_LINUX)
+        return sched_getcpu();
+#else
+        return {};
+#endif // defined(XP_WIN) || defined(XP_LINUX)
+    }
+
+    // Compare two CPU identifiers.
+    bool inline isSameCPU(const cpuid_t& a, const cpuid_t& b) const
+    {
+#if defined(XP_WIN)
+        return a.group_ == b.group_ && a.number_ == b.number_;
+#elif defined(XP_LINUX)
+        return a == b;
+#else
         return true;
+#endif
     }
-
-
-private:
+ private:
     MOZ_DECL_USE_GUARD_OBJECT_NOTIFIER;
 };
 
 } // namespace js
 
 // MSVC with PGO inlines a lot of functions in RunScript, resulting in large
 // stack frames and stack overflow issues, see bug 1167883. Turn off PGO to
 // avoid this.
 #ifdef _MSC_VER
 # pragma optimize("g", off)
 #endif
 bool
 js::RunScript(JSContext* cx, RunState& state)
 {
     JS_CHECK_RECURSION(cx, return false);
 
-#if defined(NIGHTLY_BUILD)
+#if defined(NIGHTLY_BUILD) && defined(MOZ_HAVE_RDTSC)
     js::AutoStopwatch stopwatch(cx);
-#endif // defined(NIGHTLY_BUILD)
+#endif // defined(NIGHTLY_BUILD) && defined(MOZ_HAVE_RDTSC)
 
     SPSEntryMarker marker(cx->runtime(), state.script());
 
     state.script()->ensureNonLazyCanonicalFunction(cx);
 
     if (jit::IsIonEnabled(cx)) {
         jit::MethodStatus status = jit::CanEnter(cx, state);
         if (status == jit::Method_Error)
--- a/js/src/vm/Runtime.cpp
+++ b/js/src/vm/Runtime.cpp
@@ -7,16 +7,25 @@
 #include "vm/Runtime-inl.h"
 
 #include "mozilla/ArrayUtils.h"
 #include "mozilla/Atomics.h"
 #include "mozilla/DebugOnly.h"
 #include "mozilla/MemoryReporting.h"
 #include "mozilla/ThreadLocal.h"
 
+#if defined(XP_MACOSX)
+#include <mach/mach.h>
+#elif defined(XP_UNIX)
+#include <sys/resource.h>
+#elif defined(XP_WIN)
+#include <processthreadsapi.h>
+#include <windows.h>
+#endif // defined(XP_MACOSX) || defined(XP_UNIX) || defined(XP_WIN)
+
 #include <locale.h>
 #include <string.h>
 
 #ifdef JS_CAN_CHECK_THREADSAFE_ACCESSES
 # include <sys/mman.h>
 #endif
 
 #include "jsatom.h"
@@ -867,23 +876,310 @@ JS::UpdateJSRuntimeProfilerSampleBufferG
 
 JS_FRIEND_API(bool)
 JS::IsProfilingEnabledForRuntime(JSRuntime* runtime)
 {
     MOZ_ASSERT(runtime);
     return runtime->spsProfiler.enabled();
 }
 
+JS_PUBLIC_API(void)
+js::FlushPerformanceMonitoring(JSRuntime* runtime)
+{
+    MOZ_ASSERT(runtime);
+    return runtime->stopwatch.commit();
+}
+JS_PUBLIC_API(void)
+js::ResetPerformanceMonitoring(JSRuntime* runtime)
+{
+    MOZ_ASSERT(runtime);
+    return runtime->stopwatch.reset();
+}
+
 void
-js::ResetStopwatches(JSRuntime* rt)
+JSRuntime::Stopwatch::reset()
+{
+    // All ongoing measures are dependent on the current iteration#.
+    // By incrementing it, we mark all data as stale. Stale data will
+    // be overwritten progressively during the execution.
+    ++iteration_;
+    touchedGroups.clear();
+}
+
+void
+JSRuntime::Stopwatch::start()
+{
+    if (!isMonitoringJank_) {
+        return;
+    }
+
+    if (iteration_ == startedAtIteration_) {
+        // The stopwatch is already started for this iteration.
+        return;
+    }
+
+    startedAtIteration_ = iteration_;
+    if (!getResources(&userTimeStart_, &systemTimeStart_))
+        return;
+}
+
+// Commit the data that has been collected during the iteration
+// into the actual `PerformanceData`.
+//
+// We use the proportion of cycles-spent-in-group over
+// cycles-spent-in-toplevel-group as an approximation to allocate
+// system (kernel) time and user (CPU) time to each group. Note
+// that cycles are not an exact measure:
+//
+// 1. if the computer has gone to sleep, the clock may be reset to 0;
+// 2. if the process is moved between CPUs/cores, it may end up on a CPU
+//    or core with an unsynchronized clock;
+// 3. the mapping between clock cycles and walltime varies with the current
+//    frequency of the CPU;
+// 4. other threads/processes using the same CPU will also increment
+//    the counter.
+//
+// ** Effect of 1. (computer going to sleep)
+//
+// We assume that this will happen very seldom. Since the final numbers
+// are bounded by the CPU time and Kernel time reported by `getResources`,
+// the effect will be contained to a single iteration of the event loop.
+//
+// ** Effect of 2. (moving between CPUs/cores)
+//
+// On platforms that support it, we only measure the number of cycles
+// if we start and end execution of a group on the same
+// CPU/core. While there is a small window (a few cycles) during which
+// the thread can be migrated without us noticing, we expect that this
+// will happen rarely enough that this won't affect the statistics
+// meaningfully.
+//
+// On other platforms, assuming that the probability of jumping
+// between CPUs/cores during a given (real) cycle is constant, and
+// that the distribution of differences between clocks is even, the
+// probability that the number of cycles reported by a measure is
+// modified by X cycles should be a gaussian distribution, with groups
+// with longer execution having a larger amplitude than groups with
+// shorter execution. Since we discard measures that result in a
+// negative number of cycles, this distribution is actually skewed
+// towards over-estimating the number of cycles of groups that already
+// have many cycles and under-estimating the number of cycles of groups
+// that already have fewer cycles.
+//
+// Since the final numbers are bounded by the CPU time and Kernel time
+// reported by `getResources`, we accept this bias.
+//
+// ** Effect of 3. (mapping between clock cycles and walltime)
+//
+// Assuming that this is evenly distributed, we expect that this will
+// eventually balance out.
+//
+// ** Effect of 4. (cycles increase with system activity)
+//
+// Assuming that, within an iteration of the event loop, this happens
+// uniformly over time, this will skew towards over-estimating the number
+// of cycles of groups that already have many cycles and under-estimating
+// the number of cycles of groups that already have fewer cycles.
+//
+// Since the final numbers are bounded by the CPU time and Kernel time
+// reported by `getResources`, we accept this bias.
+//
+// ** Big picture
+//
+// Computing the number of cycles is fast and should be accurate
+// enough in practice. Alternatives (such as calling `getResources`
+// all the time or sampling from another thread) are very expensive
+// in system calls and/or battery and not necessarily more accurate.
+void
+JSRuntime::Stopwatch::commit()
 {
-    MOZ_ASSERT(rt);
-    rt->stopwatch.reset();
+#if !defined(MOZ_HAVE_RDTSC)
+    // The AutoStopwatch is only executed if `MOZ_HAVE_RDTSC`.
+    return;
+#endif // !defined(MOZ_HAVE_RDTSC)
+
+    if (!isMonitoringJank_) {
+        // Either we have not started monitoring or monitoring has
+        // been cancelled during the iteration.
+        return;
+    }
+
+    if (startedAtIteration_ != iteration_) {
+        // No JS code has been monitored during this iteration.
+        return;
+    }
+
+    uint64_t userTimeStop, systemTimeStop;
+    if (!getResources(&userTimeStop, &systemTimeStop))
+        return;
+
+    // `getResources` is not guaranteed to be monotonic, so clamp
+    // any negative result to 0 microseconds.
+    uint64_t userTimeDelta = 0;
+    if (userTimeStop > userTimeStart_)
+        userTimeDelta = userTimeStop - userTimeStart_;
+
+    uint64_t systemTimeDelta = 0;
+    if (systemTimeStop > systemTimeStart_)
+        systemTimeDelta = systemTimeStop - systemTimeStart_;
+
+    mozilla::RefPtr<js::PerformanceGroup> group = performance.getOwnGroup();
+    const uint64_t totalRecentCycles = group->recentCycles;
+
+    mozilla::Vector<mozilla::RefPtr<js::PerformanceGroup>> recentGroups;
+    touchedGroups.swap(recentGroups);
+    MOZ_ASSERT(recentGroups.length() > 0);
+
+    // We should only reach this stage if `group` has had some activity.
+    MOZ_ASSERT(group->recentTicks > 0);
+    for (mozilla::RefPtr<js::PerformanceGroup>* iter = recentGroups.begin(); iter != recentGroups.end(); ++iter) {
+        transferDeltas(userTimeDelta, systemTimeDelta, totalRecentCycles, *iter);
+    }
+
+    // Make sure that `group` was treated along with the other items of `recentGroups`.
+    MOZ_ASSERT(group->recentTicks == 0);
+
+    // Finally, reset immediately, to make sure that we're not hit by the
+    // end of a nested event loop (which would cause `commit` to be called
+    // twice in succession).
+    reset();
 }
 
+void
+JSRuntime::Stopwatch::transferDeltas(uint64_t totalUserTimeDelta, uint64_t totalSystemTimeDelta,
+                                     uint64_t totalCyclesDelta, js::PerformanceGroup* group) {
+    // Commit `group`'s recent counters, plus a cycles-proportional share of the total time deltas.
+    const uint64_t ticksDelta = group->recentTicks;
+    const uint64_t cpowTimeDelta = group->recentCPOW;
+    const uint64_t cyclesDelta = group->recentCycles;
+    group->resetRecentData();
+
+    // We have now performed all cleanup and may `return` at any time without fear of leaks.
+
+    if (group->iteration() != iteration_) {
+        // Stale data, don't commit.
+        return;
+    }
+
+    // When we add a group as changed, we immediately set its
+    // `recentTicks` from 0 to 1.  If we have `ticksDelta == 0` at
+    // this stage, we have already called `resetRecentData` but we
+    // haven't removed it from the list.
+    MOZ_ASSERT(ticksDelta != 0);
+    MOZ_ASSERT(cyclesDelta <= totalCyclesDelta);
+    if (cyclesDelta == 0 || totalCyclesDelta == 0) {
+        // Nothing useful, don't commit.
+        return;
+    }
+
+    double proportion = (double)cyclesDelta / (double)totalCyclesDelta;
+    MOZ_ASSERT(proportion <= 1);
+
+    const uint64_t userTimeDelta = proportion * totalUserTimeDelta;
+    const uint64_t systemTimeDelta = proportion * totalSystemTimeDelta;
+
+    group->data.totalUserTime += userTimeDelta;
+    group->data.totalSystemTime += systemTimeDelta;
+    group->data.totalCPOWTime += cpowTimeDelta;
+    group->data.ticks += ticksDelta;
+
+    const uint64_t totalTimeDelta = userTimeDelta + systemTimeDelta;
+    // Bump every bucket whose threshold (1ms, 2ms, 4ms, ...) is strictly below `totalTimeDelta`.
+    size_t i = 0;
+    uint64_t duration = 1000; // 1ms in µs
+    for (i = 0, duration = 1000;
+         i < ArrayLength(group->data.durations) && duration < totalTimeDelta;
+         ++i, duration *= 2) {
+        group->data.durations[i]++;
+    }
+}
+
+// Get the OS-reported time spent in userland/systemland, in
+// microseconds. Per-thread on most platforms, per-process on
+// some. Data is not guaranteed to be monotonic. Returns `false`
+// (leaving the outputs untouched) if the OS query fails.
+bool
+JSRuntime::Stopwatch::getResources(uint64_t* userTime,
+                                   uint64_t* systemTime) const {
+    MOZ_ASSERT(userTime);
+    MOZ_ASSERT(systemTime);
+
+#if defined(XP_MACOSX)
+    // On MacOS X, to get per-thread data, we need to
+    // reach into the kernel.
+
+    mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
+    thread_basic_info_data_t info;
+    mach_port_t port = mach_thread_self();
+    kern_return_t err =
+        thread_info(/* [in] targeted thread*/ port,
+                    /* [in] nature of information*/ THREAD_BASIC_INFO,
+                    /* [out] thread information */  (thread_info_t)&info,
+                    /* [inout] number of items */   &count);
+
+    // We do not need ability to communicate with the thread, so
+    // let's release the port.
+    mach_port_deallocate(mach_task_self(), port);
+
+    if (err != KERN_SUCCESS)
+        return false;
+    // Widen to 64 bits before scaling: `seconds` is a 32-bit integer and the product would overflow.
+    *userTime = info.user_time.microseconds + static_cast<uint64_t>(info.user_time.seconds) * 1000000;
+    *systemTime = info.system_time.microseconds + static_cast<uint64_t>(info.system_time.seconds) * 1000000;
+
+#elif defined(XP_UNIX)
+    struct rusage rusage;
+#if defined(RUSAGE_THREAD)
+    // Under Linux, we can obtain per-thread statistics
+    int err = getrusage(RUSAGE_THREAD, &rusage);
+#else
+    // Under other Unices, we need to do with more noisy
+    // per-process statistics.
+    int err = getrusage(RUSAGE_SELF, &rusage);
+#endif // defined(RUSAGE_THREAD)
+
+    if (err)
+        return false;
+    // Widen to 64 bits before scaling: `tv_sec` may be only 32 bits wide on some platforms.
+    *userTime = rusage.ru_utime.tv_usec + static_cast<uint64_t>(rusage.ru_utime.tv_sec) * 1000000;
+    *systemTime = rusage.ru_stime.tv_usec + static_cast<uint64_t>(rusage.ru_stime.tv_sec) * 1000000;
+
+#elif defined(XP_WIN)
+    // Under Windows, we can obtain per-thread statistics,
+    // although experience seems to suggest that they are
+    // not very good under Windows XP.
+    FILETIME creationFileTime; // Ignored
+    FILETIME exitFileTime; // Ignored
+    FILETIME kernelFileTime;
+    FILETIME userFileTime;
+    BOOL success = GetThreadTimes(GetCurrentThread(),
+                                  &creationFileTime, &exitFileTime,
+                                  &kernelFileTime, &userFileTime);
+
+    if (!success)
+        return false;
+
+    ULARGE_INTEGER kernelTimeInt;
+    kernelTimeInt.LowPart = kernelFileTime.dwLowDateTime;
+    kernelTimeInt.HighPart = kernelFileTime.dwHighDateTime;
+    // Convert 100 ns to 1 us.
+    *systemTime = kernelTimeInt.QuadPart / 10;
+
+    ULARGE_INTEGER userTimeInt;
+    userTimeInt.LowPart = userFileTime.dwLowDateTime;
+    userTimeInt.HighPart = userFileTime.dwHighDateTime;
+    // Convert 100 ns to 1 us.
+    *userTime = userTimeInt.QuadPart / 10;
+
+#endif // defined(XP_MACOSX) || defined(XP_UNIX) || defined(XP_WIN)
+
+    return true;
+}
+
+
 bool
 js::SetStopwatchIsMonitoringJank(JSRuntime* rt, bool value)
 {
     return rt->stopwatch.setIsMonitoringJank(value);
 }
 bool
 js::GetStopwatchIsMonitoringJank(JSRuntime* rt)
 {
@@ -956,49 +1252,56 @@ js::PerformanceGroupHolder::getSharedGro
     void* key = getHashKey(cx);
     JSRuntime::Stopwatch::Groups::AddPtr ptr = runtime_->stopwatch.groups().lookupForAdd(key);
     if (ptr) {
         sharedGroup_ = ptr->value();
         MOZ_ASSERT(sharedGroup_);
     } else {
         sharedGroup_ = runtime_->new_<PerformanceGroup>(cx, key);
         if (!sharedGroup_)
-            return nullptr;
-
+          return nullptr;
         runtime_->stopwatch.groups().add(ptr, key, sharedGroup_);
     }
 
     return sharedGroup_;
 }
 
-PerformanceData*
-js::GetPerformanceData(JSRuntime* rt)
+void
+js::AddCPOWPerformanceDelta(JSRuntime* rt, uint64_t delta)
 {
-    return &rt->stopwatch.performance.getOwnGroup()->data;
+    rt->stopwatch.totalCPOWTime += delta; // Runtime-wide CPOW total; the field is documented as microseconds.
 }
 
 js::PerformanceGroup::PerformanceGroup(JSRuntime* rt)
   : uid(rt->stopwatch.uniqueId()),
+    recentCycles(0),
+    recentTicks(0),
+    recentCPOW(0),
     runtime_(rt),
     stopwatch_(nullptr),
     iteration_(0),
     key_(nullptr),
     refCount_(0),
     isSharedGroup_(false)
-{ }
+{
+}
 
- js::PerformanceGroup::PerformanceGroup(JSContext* cx, void* key)
-   : uid(cx->runtime()->stopwatch.uniqueId()),
-     runtime_(cx->runtime()),
-     stopwatch_(nullptr),
-     iteration_(0),
-     key_(key),
-     refCount_(0),
-     isSharedGroup_(true)
-{ }
+js::PerformanceGroup::PerformanceGroup(JSContext* cx, void* key)
+  : uid(cx->runtime()->stopwatch.uniqueId()),
+    recentCycles(0),
+    recentTicks(0),
+    recentCPOW(0),
+    runtime_(cx->runtime()),
+    stopwatch_(nullptr),
+    iteration_(0),
+    key_(key),
+    refCount_(0),
+    isSharedGroup_(true)
+{
+}
 
 void
 js::PerformanceGroup::AddRef()
 {
     ++refCount_;
 }
 
 void
--- a/js/src/vm/Runtime.h
+++ b/js/src/vm/Runtime.h
@@ -10,16 +10,17 @@
 #include "mozilla/Atomics.h"
 #include "mozilla/Attributes.h"
 #include "mozilla/LinkedList.h"
 #include "mozilla/MemoryReporting.h"
 #include "mozilla/PodOperations.h"
 #include "mozilla/Scoped.h"
 #include "mozilla/ThreadLocal.h"
 #include "mozilla/UniquePtr.h"
+#include "mozilla/Vector.h"
 
 #include <setjmp.h>
 
 #include "jsatom.h"
 #include "jsclist.h"
 #include "jsscript.h"
 
 #ifdef XP_MACOSX
@@ -1513,63 +1514,77 @@ struct JSRuntime : public JS::shadow::Ru
         }
 
         /**
          * Performance data on the entire runtime.
          */
         js::PerformanceGroupHolder performance;
 
         /**
-         * The number of times we have entered the event loop.
-         * Used to reset counters whenever we enter the loop,
-         * which may be caused either by having completed the
-         * previous run of the event loop, or by entering a
-         * nested loop.
-         *
-         * Always incremented by 1, may safely overflow.
-         */
-        uint64_t iteration;
-
-        /**
          * Callback used to ask the embedding to determine in which
          * Performance Group the current execution belongs. Typically, this is
          * used to regroup JSCompartments from several iframes from the same
          * page or from several compartments of the same addon into a single
          * Performance Group.
          *
          * May be `nullptr`, in which case we put all the JSCompartments
          * in the same PerformanceGroup.
          */
         JSCurrentPerfGroupCallback currentPerfGroupCallback;
 
+        /**
+         * The number of the current iteration of the event loop.
+         */
+        uint64_t iteration() const {
+            return iteration_;
+        }
+
         explicit Stopwatch(JSRuntime* runtime)
           : performance(runtime)
-          , iteration(0)
           , currentPerfGroupCallback(nullptr)
+          , totalCPOWTime(0)
           , isMonitoringJank_(false)
           , isMonitoringCPOW_(false)
           , isMonitoringPerCompartment_(false)
+          , iteration_(0)
+          , startedAtIteration_(0) // NOTE(review): userTimeStart_/systemTimeStart_ are not initialized here; confirm start() always sets them before commit() reads them.
           , idCounter_(0)
         { }
 
         /**
          * Reset the stopwatch.
          *
-         * This method is meant to be called whenever we start processing
-         * an event, to ensure that stop any ongoing measurement that would
-         * otherwise provide irrelevant results.
+         * This method is meant to be called whenever we start
+         * processing an event, to ensure that we stop any ongoing
+         * measurement that would otherwise provide irrelevant
+         * results.
          */
-        void reset() {
-            ++iteration;
-        }
+        void reset();
+
+        /**
+         * Start the stopwatch.
+         *
+         * This method is meant to be called once we know that the
+         * current event contains JavaScript code to execute. Calling
+         * this several times during the same iteration is idempotent.
+         */
+        void start();
+
+        /**
+         * Commit the performance data collected since the last call
+         * to `start()`, unless `reset()` has been called since then.
+         */
+        void commit();
+
         /**
          * Activate/deactivate stopwatch measurement of jank.
          *
-         * Noop if `value` is `true` and the stopwatch is already active,
-         * or if `value` is `false` and the stopwatch is already inactive.
+         * Noop if `value` is `true` and the stopwatch is already
+         * measuring jank, or if `value` is `false` and the stopwatch
+         * is not measuring jank.
          *
          * Otherwise, any pending measurements are dropped, but previous
          * measurements remain stored.
          *
          * May return `false` if the underlying hashtable cannot be allocated.
          */
         bool setIsMonitoringJank(bool value) {
             if (isMonitoringJank_ != value)
@@ -1582,16 +1597,28 @@ struct JSRuntime : public JS::shadow::Ru
 
             isMonitoringJank_ = value;
             return true;
         }
         bool isMonitoringJank() const {
             return isMonitoringJank_;
         }
 
+        /**
+         * Activate/deactivate stopwatch measurement per compartment.
+         *
+         * Noop if `value` is `true` and the stopwatch is already
+         * measuring per compartment, or if `value` is `false` and the
+         * stopwatch is not measuring per compartment.
+         *
+         * Otherwise, any pending measurements are dropped, but previous
+         * measurements remain stored.
+         *
+         * May return `false` if the underlying hashtable cannot be allocated.
+         */
         bool setIsMonitoringPerCompartment(bool value) {
             if (isMonitoringPerCompartment_ != value)
                 reset();
 
             if (value && !groups_.initialized()) {
                 if (!groups_.init(128))
                     return false;
             }
@@ -1600,73 +1627,136 @@ struct JSRuntime : public JS::shadow::Ru
             return true;
         }
         bool isMonitoringPerCompartment() const {
             return isMonitoringPerCompartment_;
         }
 
         /**
          * Activate/deactivate stopwatch measurement of CPOW.
+         *
+         * Noop if `value` is `true` and the stopwatch is already
+         * measuring CPOW, or if `value` is `false` and the stopwatch
+         * is not measuring CPOW.
+         *
+         * Otherwise, any pending measurements are dropped, but previous
+         * measurements remain stored.
+         *
+         * May return `false` if the underlying hashtable cannot be allocated.
          */
         bool setIsMonitoringCPOW(bool value) {
+            if (isMonitoringCPOW_ != value)
+                reset();
+
+            if (value && !groups_.initialized()) {
+                if (!groups_.init(128))
+                    return false;
+            }
+
             isMonitoringCPOW_ = value;
             return true;
         }
 
         bool isMonitoringCPOW() const {
             return isMonitoringCPOW_;
         }
 
         /**
          * Return a identifier for a group, unique to the runtime.
          */
         uint64_t uniqueId() {
             return idCounter_++;
         }
 
-        // Some systems have non-monotonic clocks. While we cannot
-        // improve the precision, we can make sure that our measures
-        // are monotonic nevertheless. We do this by storing the
-        // result of the latest call to the clock and making sure
-        // that the next timestamp is greater or equal.
-        struct MonotonicTimeStamp {
-            MonotonicTimeStamp()
-              : latestGood_(0)
-            {}
-            inline uint64_t monotonize(uint64_t stamp)
-            {
-                if (stamp <= latestGood_)
-                    return latestGood_;
-                latestGood_ = stamp;
-                return stamp;
-            }
-          private:
-            uint64_t latestGood_;
-        };
-        MonotonicTimeStamp systemTimeFix;
-        MonotonicTimeStamp userTimeFix;
+        /**
+         * Mark a group as changed during the current iteration.
+         *
+         * Recent data from this group will be post-processed and
+         * committed at the end of the iteration.
+         */
+        void addChangedGroup(js::PerformanceGroup* group) {
+            MOZ_ASSERT(group->recentTicks == 0); // The group must not already carry recent ticks when it is registered.
+            touchedGroups.append(group); // NOTE(review): the result of `append` is ignored; confirm an allocation failure cannot leave `group` untracked.
+        }
 
+        // The total amount of time spent waiting on CPOWs since the
+        // start of the process, in microseconds.
+        uint64_t totalCPOWTime;
     private:
         Stopwatch(const Stopwatch&) = delete;
         Stopwatch& operator=(const Stopwatch&) = delete;
 
+        // Commit a piece of data to a single group.
+        // `totalUserTimeDelta`, `totalSystemTimeDelta`, `totalCyclesDelta`
+        // represent the outer measures, taken for the entire runtime.
+        void transferDeltas(uint64_t totalUserTimeDelta,
+                            uint64_t totalSystemTimeDelta,
+                            uint64_t totalCyclesDelta,
+                            js::PerformanceGroup* destination);
+
+        // Query the OS for the time spent in CPU/kernel since process
+        // launch.
+        bool getResources(uint64_t* userTime, uint64_t* systemTime) const;
+
+    private:
         Groups groups_;
         friend struct js::PerformanceGroupHolder;
 
         /**
-         * `true` if stopwatch monitoring is active, `false` otherwise.
+         * `true` if stopwatch monitoring is active for Jank, `false` otherwise.
          */
         bool isMonitoringJank_;
+        /**
+         * `true` if stopwatch monitoring is active for CPOW, `false` otherwise.
+         */
         bool isMonitoringCPOW_;
+        /**
+         * `true` if the stopwatch should update data per-compartment, in
+         * addition to data per-group.
+         */
         bool isMonitoringPerCompartment_;
 
         /**
+         * The number of times we have entered the event loop.
+         * Used to reset counters whenever we enter the loop,
+         * which may be caused either by having completed the
+         * previous run of the event loop, or by entering a
+         * nested loop.
+         *
+         * Always incremented by 1, may safely overflow.
+         */
+        uint64_t iteration_;
+
+        /**
+         * The iteration at which the stopwatch was last started.
+         *
+         * Used both to avoid starting the stopwatch several times
+         * during the same event loop and to avoid committing stale
+         * stopwatch results.
+         */
+        uint64_t startedAtIteration_;
+
+        /**
          * A counter used to generate unique identifiers for groups.
          */
         uint64_t idCounter_;
+
+        /**
+         * The timestamps returned by `getResources()` during the call to
+         * `start()` in the current iteration of the event loop.
+         */
+        uint64_t userTimeStart_;
+        uint64_t systemTimeStart_;
+
+        /**
+         * Performance groups used during the current event.
+         *
+         * They are cleared by `commit()` and `reset()`.
+         */
+        mozilla::Vector<mozilla::RefPtr<js::PerformanceGroup>> touchedGroups;
     };
     Stopwatch stopwatch;
 };
 
 namespace js {
 
 // When entering JIT code, the calling JSContext* is stored into the thread's
 // PerThreadData. This function retrieves the JSContext with the pre-condition
--- a/js/xpconnect/src/nsXPConnect.cpp
+++ b/js/xpconnect/src/nsXPConnect.cpp
@@ -943,16 +943,18 @@ public:
 } // namespace
 
 NS_IMETHODIMP
 nsXPConnect::OnProcessNextEvent(nsIThreadInternal* aThread, bool aMayWait,
                                 uint32_t aRecursionDepth)
 {
     MOZ_ASSERT(NS_IsMainThread());
 
+    mRuntime->OnBeforeProcessNextEvent();
+
     // If ProcessNextEvent was called during a Promise "then" callback, we
     // must process any pending microtasks before blocking in the event loop,
     // otherwise we may deadlock until an event enters the queue later.
     if (aMayWait) {
         if (Promise::PerformMicroTaskCheckpoint()) {
             // If any microtask was processed, we post a dummy event in order to
             // force the ProcessNextEvent call not to block.  This is required
             // to support nested event loops implemented using a pattern like
@@ -991,16 +993,18 @@ nsXPConnect::AfterProcessNextEvent(nsITh
     // Call cycle collector occasionally.
     MOZ_ASSERT(NS_IsMainThread());
     nsJSContext::MaybePokeCC();
 
     nsContentUtils::PerformMainThreadMicroTaskCheckpoint();
 
     Promise::PerformMicroTaskCheckpoint();
 
+    mRuntime->OnAfterMicroTaskCheckPoint();
+
     PopNullJSContext();
 
     return NS_OK;
 }
 
 NS_IMETHODIMP
 nsXPConnect::OnDispatchedEvent(nsIThreadInternal* aThread)
 {
--- a/js/xpconnect/src/xpcprivate.h
+++ b/js/xpconnect/src/xpcprivate.h
@@ -610,20 +610,34 @@ public:
     JSObject* PrivilegedJunkScope() { return mPrivilegedJunkScope; }
     JSObject* CompilationScope() { return mCompilationScope; }
 
     void InitSingletonScopes();
     void DeleteSingletonScopes();
 
     PRTime GetWatchdogTimestamp(WatchdogTimestampCategory aCategory);
 
+    // Called before we start processing the next event on the main
+    // thread.
+    void OnBeforeProcessNextEvent() {
+        // As we may be entering a nested event loop, we need to
+        // cancel any ongoing performance measurement.
+        js::ResetPerformanceMonitoring(Get()->Runtime());
+    }
+
+    // Called after we have finished processing the next event,
+    // including micro-tasks.
+    void OnAfterMicroTaskCheckPoint() {
+        // Now that we are certain that the event is complete,
+        // we can flush any ongoing performance measurement.
+        js::FlushPerformanceMonitoring(Get()->Runtime());
+    }
     void OnProcessNextEvent() {
         mSlowScriptCheckpoint = mozilla::TimeStamp::NowLoRes();
         mSlowScriptSecondHalf = false;
-        js::ResetStopwatches(Get()->Runtime());
     }
     void OnAfterProcessNextEvent() {
         mSlowScriptCheckpoint = mozilla::TimeStamp();
         mSlowScriptSecondHalf = false;
     }
 
     nsTArray<nsXPCWrappedJS*>& WrappedJSToReleaseArray() { return mWrappedJSToReleaseArray; }
 
--- a/toolkit/components/perfmonitoring/tests/xpcshell/test_compartments.js
+++ b/toolkit/components/perfmonitoring/tests/xpcshell/test_compartments.js
@@ -6,47 +6,48 @@ Cu.import("resource://gre/modules/Task.j
 Cu.import("resource://gre/modules/Services.jsm", this);
 Cu.import("resource://gre/modules/PerformanceStats.jsm", this);
 
 function run_test() {
   run_next_test();
 }
 
 let promiseStatistics = Task.async(function*(name) {
-  yield Promise.resolve(); // Make sure that we wait until
-  // statistics have been updated.
+  yield new Promise(resolve => do_execute_soon(resolve));
+  // Make sure that we wait until statistics have been updated.
   let service = Cc["@mozilla.org/toolkit/performance-stats-service;1"].
     getService(Ci.nsIPerformanceStatsService);
   let snapshot = service.getSnapshot();
   let componentsData = [];
   let componentsEnum = snapshot.getComponentsData().enumerate();
   while (componentsEnum.hasMoreElements()) {
     let data = componentsEnum.getNext().QueryInterface(Ci.nsIPerformanceStats);
     let normalized = JSON.parse(JSON.stringify(data));
     componentsData.push(data);
   }
+  yield new Promise(resolve => do_execute_soon(resolve));
   return {
     processData: JSON.parse(JSON.stringify(snapshot.getProcessData())),
     componentsData
   };
 });
 
 let promiseSetMonitoring = Task.async(function*(to) {
   let service = Cc["@mozilla.org/toolkit/performance-stats-service;1"].
     getService(Ci.nsIPerformanceStatsService);
   service.isMonitoringJank = to;
   service.isMonitoringCPOW = to;
-  yield Promise.resolve();
+  yield new Promise(resolve => do_execute_soon(resolve));
 });
 
 let promiseSetPerCompartment = Task.async(function*(to) {
   let service = Cc["@mozilla.org/toolkit/performance-stats-service;1"].
     getService(Ci.nsIPerformanceStatsService);
   service.isMonitoringPerCompartment = to;
-  yield Promise.resolve();
+  yield new Promise(resolve => do_execute_soon(resolve));
 });
 
 function getBuiltinStatistics(name, snapshot) {
   let stats = snapshot.componentsData.find(stats =>
     stats.isSystem && !stats.addonId
   );
   do_print(`Built-in statistics for ${name} were ${stats?"":"not "}found`);
   do_print(JSON.stringify(snapshot.componentsData, null, "\t"));
@@ -58,17 +59,17 @@ function burnCPU(ms) {
   let counter = 0;
   let ignored = [];
   let start = Date.now();
   while (Date.now() - start < ms) {
     ignored.push(0);
     ignored.shift();
     ++counter;
   }
-  do_print("Burning CPU over, after " + counter + " iterations");
+  do_print(`Burning CPU over, after ${counter} iterations and ${Date.now() - start} milliseconds.`);
 }
 
 function ensureEquals(snap1, snap2, name) {
   for (let k of Object.keys(snap1.processData)) {
     if (k == "ticks") {
       // Ticks monitoring cannot be deactivated
       continue;
     }
@@ -128,16 +129,17 @@ add_task(function* test_measure() {
   ensureEquals(stats0, stats1, "Initial state vs. Initial state + burn, without stopwatch");
   let process1 = stats1.processData;
   let process2 = stats2.processData;
   let process3 = stats3.processData;
   let process4 = stats4.processData;
   if (skipPrecisionTests) {
     do_print("Skipping totalUserTime check under Windows XP, as timer is not always updated by the OS.")
   } else {
+    do_print(JSON.stringify(process2));
     Assert.ok(process2.totalUserTime - process1.totalUserTime >= 10000, `At least 10ms counted for process time (${process2.totalUserTime - process1.totalUserTime})`);
   }
   Assert.equal(process2.totalCPOWTime, process1.totalCPOWTime, "We haven't used any CPOW time during the first burn");
   Assert.equal(process4.totalUserTime, process3.totalUserTime, "After deactivating the stopwatch, we didn't count any time");
   Assert.equal(process4.totalCPOWTime, process3.totalCPOWTime, "After deactivating the stopwatch, we didn't count any CPOW time");
 
   let builtin1 = getBuiltinStatistics("Built-ins 1", stats1) || { totalUserTime: 0, totalCPOWTime: 0 };
   let builtin2 = getBuiltinStatistics("Built-ins 2", stats2);
@@ -157,13 +159,14 @@ add_task(function* test_measure() {
   Assert.equal(builtin4.totalUserTime, builtin3.totalUserTime, "After deactivating the stopwatch, we didn't count any time for the built-in");
   Assert.equal(builtin4.totalCPOWTime, builtin3.totalCPOWTime, "After deactivating the stopwatch, we didn't count any CPOW time for the built-in");
 
   // Ideally, we should be able to look for test_compartments.js, but
   // it doesn't have its own compartment.
   for (let stats of [stats1, stats2, stats3, stats4]) {
     Assert.ok(!stats.componentsData.find(x => x.name.includes("Task.jsm")), "At this stage, Task.jsm doesn't show up in the components data");
   }
+  yield promiseSetMonitoring(true);
   yield promiseSetPerCompartment(true);
   burnCPU(300);
   let stats5 = yield promiseStatistics("With per-compartment monitoring");
-  Assert.ok(stats5.componentsData.find(x => x.name.includes("Task.jsm")), "With per-compartment monitoring, test_compartments.js shows up");
+  Assert.ok(stats5.componentsData.find(x => x.name.indexOf("Task.jsm") != -1), "With per-compartment monitoring, Task.jsm shows up");
 });