Bug 836869 - Make QueryPerformanceCounter bad leap detection heuristic smarter, r=tellrob, sr=benjamin
author: Honza Bambas <honzab.moz@firemni.cz>
Wed, 01 May 2013 18:57:58 +0200
changeset 141437 83be5398e9074a9ba446ab882f22b38237cf8b07
parent 141436 74431003f8cc1ac129899253e263f0b1f4392026
child 141438 c19ddcb42f6e2150347d5a1228d23a64fb09a19e
push id: 2579
push user: akeybl@mozilla.com
push date: Mon, 24 Jun 2013 18:52:47 +0000
treeherdermozilla-beta@b69b7de8a05a [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: tellrob, benjamin
bugs: 836869
milestone: 23.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 836869 - Make QueryPerformanceCounter bad leap detection heuristic smarter, r=tellrob, sr=benjamin
xpcom/ds/TimeStamp_windows.cpp
xpcom/ds/TimeStamp_windows.h
--- a/xpcom/ds/TimeStamp_windows.cpp
+++ b/xpcom/ds/TimeStamp_windows.cpp
@@ -50,41 +50,26 @@ static volatile ULONGLONG sResolutionSig
 static const double   kNsPerSecd  = 1000000000.0;
 static const LONGLONG kNsPerSec   = 1000000000;
 static const LONGLONG kNsPerMillisec = 1000000;
 
 // ----------------------------------------------------------------------------
 // Global constants
 // ----------------------------------------------------------------------------
 
-// If QPC is found faulty for two stamps in this interval, we disable it
-// completely.
-//
-// Values is in [ms].
-static const uint32_t kQPCHardFailureDetectionInterval = 2000;
-
-// On every use of QPC values we check the overflow of skew difference of the
-// two stamps doesn't go over this number of milliseconds.  Both timer
-// functions jitter so we have to have some limit.  The value is based on tests.
+// Tolerance to failures settings.
 //
-// Changing kQPCHardFailureDetectionInterval influences this limit: prolonging
-// just kQPCHardFailureDetectionInterval means to be more sensitive to threshold
-// overflows.
-//
-// How this constant is used (see CheckQPC function):
-// First, adjust the limit linearly to the check interval:
-//   LIMIT = (GTC_now - GTC_epoch) / kQPCHardFailureDetectionInterval
-// Then, check the skew difference overflow is in this adjusted limit:
-//   ABS( (QPC_now - GTC_now) - (QPC_epoch - GTC_epoch) ) - THRESHOLD < LIMIT
-//
-// Thresholds are calculated dynamically, see sUnderrunThreshold and
-// sOverrunThreshold below.
-//
-// Limit is in number of [ms].
-static const ULONGLONG kOverflowLimit = 50;
+// What is the interval we want to have failure free.
+// in [ms]
+static const uint32_t kFailureFreeInterval = 5000;
+// How many failures we are willing to tolerate in the interval.
+static const uint32_t kMaxFailuresPerInterval = 4;
+// What is the threshold to treat fluctuations as actual failures.
+// in [ms]
+static const uint32_t kFailureThreshold = 50;
 
 // If we are not able to get the value of GTC time increment, use this value
 // which is the most usual increment.
 static const DWORD kDefaultTimeIncrement = 156001;
 
 // ----------------------------------------------------------------------------
 // Global variables, not changing at runtime
 // ----------------------------------------------------------------------------
@@ -103,33 +88,51 @@ static const DWORD kDefaultTimeIncrement
 
 #define ms2mt(x) ((x) * sFrequencyPerSec)
 #define mt2ms(x) ((x) / sFrequencyPerSec)
 #define mt2ms_f(x) (double(x) / sFrequencyPerSec)
 
 // Result of QueryPerformanceFrequency
 static LONGLONG sFrequencyPerSec = 0;
 
-// Lower and upper bound that QueryPerformanceCounter - GetTickCount must not
-// go under or over when compared to any older QPC - GTC difference (skew).
-// Values are based on the GetTickCount update interval.
+// How tolerant we are of GTC's occasional loss of resolution.
+// This number says how many multiples of the minimal GTC resolution
+// detected on the system are acceptable.  This number is empirical.
+static const LONGLONG kGTCTickLeapTolerance = 4;
+
+// Base tolerance (more: "inability of detection" range) threshold is calculated
+// dynamically, and kept in sGTCResulutionThreshold.
 //
-// Schematically, QPC works correctly if ((QPC_now - GTC_now) -
-// (QPC_epoch - GTC_epoch)) is in  [sUnderrunThreshold, sOverrunThreshold]
-// interval every time we compare two time stamps.
+// Schematically, QPC worked "100%" correctly if ((GTC_now - GTC_epoch) -
+// (QPC_now - QPC_epoch)) was in  [-sGTCResulutionThreshold, sGTCResulutionThreshold]
+// interval every time we'd compared two time stamps.
+// If not, then we check that the overflow beyond this basic threshold
+// is within kFailureThreshold.  If not, we consider it a QPC failure.  If too many
+// failures are detected in a short time, QPC is considered faulty and disabled.
 //
 // Kept in [mt]
-static LONGLONG sUnderrunThreshold;
-static LONGLONG sOverrunThreshold;
+static LONGLONG sGTCResulutionThreshold;
 
-// Interval to return duration using QPC.  When two time stamps
-// are within this interval, perform QPC check first.
+// If QPC is found faulty for two stamps in this interval, we engage
+// the fault detection algorithm.  For durations larger than this limit
+// we bypass using durations calculated from QPC when jitter is detected,
+// but don't touch the sUseQPC flag.
 //
-// Kept in [mt]
-static LONGLONG sQPCHardFailureDetectionInterval;
+// Value is in [ms].
+static const uint32_t kHardFailureLimit = 2000;
+// Conversion to [mt]
+static LONGLONG sHardFailureLimit;
+
+// Conversion of kFailureFreeInterval and kFailureThreshold to [mt]
+static LONGLONG sFailureFreeInterval;
+static LONGLONG sFailureThreshold;
+
+// ----------------------------------------------------------------------------
+// System status flags
+// ----------------------------------------------------------------------------
 
 // Flag for stable TSC that indicates platform where QPC is stable.
 static bool sHasStableTSC = false;
 
 // ----------------------------------------------------------------------------
 // Global state variables, changing at runtime
 // ----------------------------------------------------------------------------
 
@@ -145,16 +148,31 @@ static bool volatile sUseQPC = true;
 // Inspired by Rob Arnold's work on PRMJ_Now().
 static const DWORD kLockSpinCount = 4096;
 
 // Common mutex (thanks the relative complexity of the logic, this is better
 // then using CMPXCHG8B.)
 // It is protecting the globals bellow.
 static CRITICAL_SECTION sTimeStampLock;
 
+// ----------------------------------------------------------------------------
+// Global lock protected variables
+// ----------------------------------------------------------------------------
+
+// Timestamp in the future until which QPC must behave correctly.
+// Set to now + kFailureFreeInterval on first QPC failure detection.
+// Set to now + E * kFailureFreeInterval on following errors,
+//   where E is number of errors detected during last kFailureFreeInterval
+//   milliseconds, calculated simply as:
+//   E = (sFaultIntoleranceCheckpoint - now) / kFailureFreeInterval + 1.
+// When E > kMaxFailuresPerInterval -> disable QPC.
+//
+// Kept in [mt]
+static ULONGLONG sFaultIntoleranceCheckpoint = 0;
+
 // Used only when GetTickCount64 is not available on the platform.
 // Last result of GetTickCount call.
 //
 // Kept in [ms]
 static DWORD sLastGTCResult = 0;
 
 // Higher part of the 64-bit value of MozGetTickCount64,
 // incremented atomically.
@@ -234,37 +252,27 @@ InitThresholds()
   timeIncrementCeil -= 1;
   // Convert to ms, values will be: 15, 20
   timeIncrementCeil /= 10000;
   // Round up, values will be: 16, 21
   timeIncrementCeil += 1;
   // Convert back to 100ns, values will be: 160000, 210000
   timeIncrementCeil *= 10000;
 
-  // How many milli-ticks has the interval
-  LONGLONG ticksPerGetTickCountResolution =
-    (int64_t(timeIncrement) * sFrequencyPerSec) / 10000LL;
-
   // How many milli-ticks has the interval rounded up
   LONGLONG ticksPerGetTickCountResolutionCeiling =
     (int64_t(timeIncrementCeil) * sFrequencyPerSec) / 10000LL;
 
-  // I observed differences about 2 times of the GTC resolution.  GTC may
-  // jump by 32 ms in two steps, therefor use the ceiling value.
-  // Having 64 (15.6 or 16 * 4 exactly) is used to avoid false negatives
-  // for very short times where QPC and GTC may jitter even more.
-  sUnderrunThreshold =
-    LONGLONG((-4) * ticksPerGetTickCountResolutionCeiling);
+  // GTC may jump by 32 (2*16) ms in two steps, therefore use the ceiling value.
+  sGTCResulutionThreshold =
+    LONGLONG(kGTCTickLeapTolerance * ticksPerGetTickCountResolutionCeiling);
 
-  // QPC should go no further than 2 * GTC resolution.
-  sOverrunThreshold =
-    LONGLONG((+4) * ticksPerGetTickCountResolution);
-
-  sQPCHardFailureDetectionInterval =
-    LONGLONG(kQPCHardFailureDetectionInterval) * sFrequencyPerSec;
+  sHardFailureLimit = ms2mt(kHardFailureLimit);
+  sFailureFreeInterval = ms2mt(kFailureFreeInterval);
+  sFailureThreshold = ms2mt(kFailureThreshold);
 }
 
 static void
 InitResolution()
 {
   // 10 total trials is arbitrary: what we're trying to avoid by
   // looping is getting unlucky and being interrupted by a context
   // switch or signal, or being bitten by paging/cache effects
@@ -336,88 +344,94 @@ TimeStampValue::operator+=(const int64_t
 TimeStampValue&
 TimeStampValue::operator-=(const int64_t aOther)
 {
   mGTC -= aOther;
   mQPC -= aOther;
   return *this;
 }
 
-// If the duration is less then one second, perform check of QPC stability
-// by comparing both 'epoch' and 'now' skew (=GTC - QPC) values.
-bool
-TimeStampValue::CheckQPC(int64_t aDuration, const TimeStampValue &aOther) const
+// If the duration is less than two seconds, perform check of QPC stability
+// by comparing both GTC and QPC calculated durations of this and aOther.
+uint64_t
+TimeStampValue::CheckQPC(const TimeStampValue &aOther) const
 {
-  if (!mHasQPC || !aOther.mHasQPC) // Not both holding QPC
-    return false;
+  uint64_t deltaGTC = mGTC - aOther.mGTC;
+
+  if (!mHasQPC || !aOther.mHasQPC) // Both not holding QPC
+    return deltaGTC;
+
+  uint64_t deltaQPC = mQPC - aOther.mQPC;
 
   if (sHasStableTSC) // For stable TSC there is no need to check
-    return true;
+    return deltaQPC;
 
   if (!sUseQPC) // QPC globally disabled
-    return false;
-
-  // Treat absolutely for calibration purposes
-  aDuration = DeprecatedAbs(aDuration);
+    return deltaGTC;
 
   // Check QPC is sane before using it.
+  int64_t diff = DeprecatedAbs(int64_t(deltaQPC) - int64_t(deltaGTC));
+  if (diff <= sGTCResulutionThreshold)
+    return deltaQPC;
 
-  LONGLONG skew1 = mGTC - mQPC;
-  LONGLONG skew2 = aOther.mGTC - aOther.mQPC;
+  // Treat absolutely for calibration purposes
+  int64_t duration = DeprecatedAbs(int64_t(deltaGTC));
+  int64_t overflow = diff - sGTCResulutionThreshold;
 
-  LONGLONG diff = skew1 - skew2;
-  LONGLONG overflow;
+  LOG(("TimeStamp: QPC check after %llums with overflow %1.4fms",
+       mt2ms(duration), mt2ms_f(overflow)));
 
-  if (diff < sUnderrunThreshold)
-    overflow = sUnderrunThreshold - diff;
-  else if (diff > sOverrunThreshold)
-    overflow = diff - sOverrunThreshold;
-  else
-    return true;
+  if (overflow <= sFailureThreshold) // We are in the limit, let go.
+    return deltaQPC; // XXX Should we return GTC here?
+
+  // QPC deviates, don't use it, since now this method may only return deltaGTC.
+  LOG(("TimeStamp: QPC jittered over failure threshold"));
+
+  if (duration < sHardFailureLimit) {
+    // Interval between the two time stamps is very short, consider
+    // QPC as unstable and record a failure.
+    uint64_t now = ms2mt(sGetTickCount64());
+
+    AutoCriticalSection lock(&sTimeStampLock);
 
-  ULONGLONG trend;
-  if (aDuration)
-    trend = LONGLONG(overflow * (double(sQPCHardFailureDetectionInterval) / aDuration));
-  else
-    trend = overflow;
-
-  LOG(("TimeStamp: QPC check after %llums with overflow %1.4fms"
-       ", adjusted trend per interval is %1.4fms",
-       mt2ms(aDuration),
-       mt2ms_f(overflow),
-       mt2ms_f(trend)));
-
-  if (trend <= ms2mt(kOverflowLimit)) {
-    // We are in the limit, let go.
-    return true;
+    if (sFaultIntoleranceCheckpoint && sFaultIntoleranceCheckpoint > now) {
+      // There's already been an error in the last fault-intolerant interval.
+      // Time since now to the checkpoint actually holds information on how many
+      // failures there were in the failure free interval we have defined.
+      uint64_t failureCount = (sFaultIntoleranceCheckpoint - now + sFailureFreeInterval - 1) /
+                               sFailureFreeInterval;
+      if (failureCount > kMaxFailuresPerInterval) {
+        sUseQPC = false;
+        LOG(("TimeStamp: QPC disabled"));
+      }
+      else {
+        // Move the fault intolerance checkpoint more to the future, prolong it
+        // to reflect the number of detected failures.
+        ++failureCount;
+        sFaultIntoleranceCheckpoint = now + failureCount * sFailureFreeInterval;
+        LOG(("TimeStamp: recording %dth QPC failure", failureCount));
+      }
+    }
+    else {
+      // Setup fault intolerance checkpoint in the future for first detected error.
+      sFaultIntoleranceCheckpoint = now + sFailureFreeInterval;
+      LOG(("TimeStamp: recording 1st QPC failure"));
+    }
   }
 
-  // QPC deviates, don't use it.
-  LOG(("TimeStamp: QPC found highly jittering"));
-
-  if (aDuration < sQPCHardFailureDetectionInterval) {
-    // Interval between the two time stamps is very short, consider
-    // QPC as unstable and disable it completely.
-    sUseQPC = false;
-    LOG(("TimeStamp: QPC disabled"));
-  }
-
-  return false;
+  return deltaGTC;
 }
 
 uint64_t
 TimeStampValue::operator-(const TimeStampValue &aOther) const
 {
   if (mIsNull && aOther.mIsNull)
     return uint64_t(0);
 
-  if (CheckQPC(int64_t(mGTC - aOther.mGTC), aOther))
-    return mQPC - aOther.mQPC;
-
-  return mGTC - aOther.mGTC;
+  return CheckQPC(aOther);
 }
 
 // ----------------------------------------------------------------------------
 // TimeDuration and TimeStamp implementation
 // ----------------------------------------------------------------------------
 
 double
 TimeDuration::ToSeconds() const
--- a/xpcom/ds/TimeStamp_windows.h
+++ b/xpcom/ds/TimeStamp_windows.h
@@ -19,17 +19,17 @@ class TimeStampValue
   // Both QPC and GTC are kept in [mt] units.
   uint64_t mGTC;
   uint64_t mQPC;
   bool mHasQPC;
   bool mIsNull;
 
   TimeStampValue(uint64_t GTC, uint64_t QPC, bool hasQPC);
 
-  bool CheckQPC(int64_t aDuration, const TimeStampValue &aOther) const;
+  uint64_t CheckQPC(const TimeStampValue &aOther) const;
 
 public:
   struct _SomethingVeryRandomHere;
   TimeStampValue(_SomethingVeryRandomHere* nullValue);
 
   uint64_t operator-(const TimeStampValue &aOther) const;
 
   TimeStampValue operator+(const int64_t aOther) const