Bug 784859 - Part 2: Avoid locking to store the computed result in the global variable in CalibratedPerformanceCounter; r=bbondy
author Ehsan Akhgari <ehsan@mozilla.com>
Thu, 06 Sep 2012 11:01:06 -0400
changeset 104647 2a8acb221714
parent 104646 d291e2674dc6
child 104648 9b7184f6e154
push id 14615
push user eakhgari@mozilla.com
push date 2012-09-08 18:13 +0000
treeherder mozilla-inbound@d4a8c7efef04
reviewers bbondy
bugs 784859
milestone 18.0a1
xpcom/ds/TimeStamp_windows.cpp
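The substantive change is in CalibratedPerformanceCounter: instead of holding sTimeStampLock while storing the largest value seen so far into sLastResult, the patch performs that final step with a compare-and-swap loop. As a rough illustration of the pattern, here is a minimal standalone sketch; it uses std::atomic, which the patch itself does not (sLastResult is a plain ULONGLONG, so the patch uses the Windows Interlocked intrinsics shown in the diff below).

#include <atomic>
#include <cstdint>

// Sketch only, not part of the patch: keep `destination` at the maximum of
// every value ever offered to it, without taking a lock, and return that
// maximum.
uint64_t StoreIfGreaterThan(std::atomic<uint64_t>& destination, uint64_t newValue)
{
  uint64_t readValue = destination.load();
  while (readValue < newValue &&
         !destination.compare_exchange_weak(readValue, newValue)) {
    // On failure, compare_exchange_weak reloads readValue with the current
    // value; keep retrying until either our store lands or another thread has
    // already published a value at least as large.
  }
  return readValue < newValue ? newValue : readValue;
}
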
--- a/xpcom/ds/TimeStamp_windows.cpp
+++ b/xpcom/ds/TimeStamp_windows.cpp
@@ -24,22 +24,37 @@
 #include "prlog.h"
 #include <stdio.h>
 
 #include <intrin.h>
 
 static bool
 HasStableTSC()
 {
+  union {
+    int regs[4];
+    struct {
+      int nIds;
+      char cpuString[12];
+    };
+  } cpuInfo;
+
+  __cpuid(cpuInfo.regs, 0);
+  // Only allow Intel CPUs for now
+  // The vendor string reads in regs[1], regs[3], regs[2] order (EBX, EDX, ECX);
+  // the literal below is rearranged to match memory order so we can compare in one go.
+  if (_strnicmp(cpuInfo.cpuString, "GenuntelineI", sizeof(cpuInfo.cpuString)))
+    return false;
+
   int regs[4];
 
   // detect if the Advanced Power Management feature is supported
   __cpuid(regs, 0x80000000);
   if (regs[0] < 0x80000007)
-          return false;
+    return false;
 
   __cpuid(regs, 0x80000007);
 // if bit 8 is set then TSC will run at a constant rate
   // in all ACPI P-state, C-states and T-states
   return regs[3] & (1 << 8);
 }
 
 
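
The vendor check added above compares against "GenuntelineI" rather than "GenuineIntel" because __cpuid writes EAX, EBX, ECX, EDX into regs[0..3], while the vendor string reads in EBX, EDX, ECX order; the literal is pre-scrambled to match the raw memory layout of regs[1..3] so that a single _strnicmp suffices. A standalone sketch (not part of the patch, MSVC/x86-specific) that reassembles the string in readable order:

#include <intrin.h>
#include <cstdio>
#include <cstring>

int main()
{
  int regs[4];
  __cpuid(regs, 0);                        // leaf 0: EAX = max leaf, EBX/EDX/ECX = vendor id
  char vendor[13];
  std::memcpy(vendor + 0, &regs[1], 4);    // EBX = "Genu"
  std::memcpy(vendor + 4, &regs[3], 4);    // EDX = "ineI"
  std::memcpy(vendor + 8, &regs[2], 4);    // ECX = "ntel"
  vendor[12] = '\0';
  std::printf("%s\n", vendor);             // prints "GenuineIntel" on Intel hardware
  return 0;
}
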
@@ -196,16 +211,28 @@ static bool sForceRecalibrate = false;
 
 namespace mozilla {
 
 
 static ULONGLONG
 CalibratedPerformanceCounter();
 
 
+static inline ULONGLONG
+InterlockedRead64(volatile ULONGLONG* destination)
+{
+#ifdef _WIN64
+  // Aligned 64-bit reads on x86-64 are atomic
+  return *destination;
+#else
+  // Dirty hack since Windows doesn't provide an atomic 64-bit read function
+  return _InterlockedCompareExchange64(reinterpret_cast<volatile __int64*> (destination), 0, 0);
+#endif
+}
+
 // ----------------------------------------------------------------------------
 // Critical Section helper class
 // ----------------------------------------------------------------------------
 
 class AutoCriticalSection
 {
 public:
   AutoCriticalSection(LPCRITICAL_SECTION section)
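
The InterlockedRead64 helper added above relies on a compare-exchange whose exchange and comparand are equal behaving as an atomic read. A short sketch (not part of the patch) spelling out why that works:

#include <intrin.h>

// _InterlockedCompareExchange64(destination, exchange, comparand) atomically
// performs:
//   old = *destination;
//   if (old == comparand) *destination = exchange;
//   return old;
// With exchange == comparand == 0, *destination is either left untouched
// (old != 0) or rewritten with the value it already holds (old == 0), so the
// only observable effect is a single atomic 64-bit read; this matters on
// 32-bit x86, where a plain 64-bit load may be split into two 32-bit loads.
static __int64 AtomicRead64Sketch(volatile __int64* destination)
{
  return _InterlockedCompareExchange64(destination, 0, 0);
}
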
@@ -489,62 +516,92 @@ CheckCalibration(LONGLONG overflow, ULON
       mt2ms_d(sSkew), sForceRecalibrate));
 
     sForceRecalibrate = false;
   }
 
   return true;
 }
 
+// AtomicStoreIfGreaterThan tries to store the maximum of two values in one of them
+// without locking.  The only scenario in which two racing threads may corrupt the
+// maximum value is when they both try to increase the value without knowing about
+// each other, like below:
+//
+// Thread 1 reads 1000.  newValue in thread 1 is 1005.
+// Thread 2 reads 1000.  newValue in thread 2 is 1001.
+// Thread 1 tries to store.  Its value is less than newValue, so the store happens.
+//                           *destination is now 1005.
+// Thread 2 tries to store.  Its value is less than newValue, so the store happens.
+//                           *destination is now 1001.
+//
+// The correct value to be stored if this was happening serially is 1005.  The
+// following algorithm achieves that.
+//
+// The return value is the maximum value.
+ULONGLONG
+AtomicStoreIfGreaterThan(ULONGLONG* destination, ULONGLONG newValue)
+{
+  ULONGLONG readValue;
+  do {
+    readValue = InterlockedRead64(destination);
+    if (readValue > newValue)
+      return readValue;
+  } while (readValue != _InterlockedCompareExchange64(reinterpret_cast<volatile __int64*> (destination),
+                                                      newValue, readValue));
+
+  return newValue;
+}
+
 // The main function.  The result is in [mt] units; it is guaranteed not to go
 // backwards and is mostly reliable at the highest possible resolution.
 static ULONGLONG
 CalibratedPerformanceCounter()
 {
   // XXX This uses the ObserverService and cannot be instantiated during static
   // startup; it really needs better initialization code here.
   StandbyObserver::Ensure();
 
   // Don't hold the lock over the call to QueryPerformanceCounter, since it is
   // the largest bottleneck; let threads read the value concurrently for
   // possibly better performance.
 
   ULONGLONG qpc = PerformanceCounter();
   DWORD gtcw = GetTickCount();
 
+  ULONGLONG result;
+  {
   AutoCriticalSection lock(&sTimeStampLock);
 
   // Rollover protection
   ULONGLONG gtc = TickCount64(gtcw);
 
   LONGLONG diff = qpc - ms2mt(gtc) - sSkew;
   LONGLONG overflow = 0;
 
   if (diff < sUnderrunThreshold) {
     overflow = sUnderrunThreshold - diff;
   }
   else if (diff > sOverrunThreshold) {
     overflow = diff - sOverrunThreshold;
   }
 
-  ULONGLONG result = qpc;
+  result = qpc;
   if (!CheckCalibration(overflow, qpc, gtc)) {
     // We are back on GTC, QPC has been observed unreliable
     result = ms2mt(gtc) + sSkew;
   }
 
 #if 0
   LOG(("TimeStamp: result = %1.2fms, diff = %1.4fms",
       mt2ms_d(result), mt2ms_d(diff)));
 #endif
+  }
 
-  if (result > sLastResult)
-    sLastResult = result;
-
-  return sLastResult;
+  return AtomicStoreIfGreaterThan(&sLastResult, result);
 }
 
 // ----------------------------------------------------------------------------
 // TimeDuration and TimeStamp implementation
 // ----------------------------------------------------------------------------
 
 double
 TimeDuration::ToSeconds() const