Bug 1345032 - Further cost reductions for ProfileBuffer::FindLastSampleOfThread. r=n.nethercote.
authorJulian Seward <jseward@acm.org>
Wed, 22 Mar 2017 11:18:31 +0100
changeset 348941 6235799ad4fd1e408a6c348c41f04d9ff7582e3d
parent 348940 3ccb231829a94c8d5aff22222ecfe0ef1f1d50a7
child 348942 d13479bfbc633160a96f032d0ae23554910b3d75
push id88365
push userjseward@mozilla.com
push dateThu, 23 Mar 2017 07:19:01 +0000
treeherdermozilla-inbound@6235799ad4fd [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersn.nethercote
bugs1345032, 1344118, 1344258
milestone55.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1345032 - Further cost reductions for ProfileBuffer::FindLastSampleOfThread. r=n.nethercote. ProfileBuffer::FindLastSampleOfThread currently involves a linear search backwards through the sample buffer. Profiling showed that to be the largest profiler cost by far, at least on Linux. Bugs 1344118 and 1344258 significantly improve the situation, collectively reducing the cost by a factor of at least 5 and often much more. But the linear search is still present and still dominant. The worst of it is that it's unnecessary: we could achieve the same by recording the start point of the most recent sample for each thread in that thread's ThreadInfo record. This patch does exactly that, adding the type ProfileBuffer::LastSample to store the start points. LastSample also includes the ID of the thread it pertains to as a read-only field, as that is needed in various places. addTag doesn't check whether we're overwriting buffer entries containing start points. Instead, FindLastSample checks whether the entry pointed to the LastSample it is given still contains a marker.
tools/profiler/core/ProfileBuffer.cpp
tools/profiler/core/ProfileBuffer.h
tools/profiler/core/ProfileBufferEntry.cpp
tools/profiler/core/ThreadInfo.cpp
tools/profiler/core/ThreadInfo.h
tools/profiler/core/platform-linux-android.cpp
tools/profiler/core/platform-macos.cpp
tools/profiler/core/platform-win32.cpp
tools/profiler/core/platform.cpp
--- a/tools/profiler/core/ProfileBuffer.cpp
+++ b/tools/profiler/core/ProfileBuffer.cpp
@@ -34,16 +34,24 @@ void ProfileBuffer::addTag(const Profile
   }
   if (mWritePos == mReadPos) {
     // Keep one slot open.
     mEntries[mReadPos] = ProfileBufferEntry();
     mReadPos = (mReadPos + 1) % mEntrySize;
   }
 }
 
+void ProfileBuffer::addTagThreadId(LastSample& aLS)
+{
+  // This is the start of a sample, so make a note of its location in |aLS|.
+  aLS.mGeneration = mGeneration;
+  aLS.mPos = mWritePos;
+  addTag(ProfileBufferEntry::ThreadId(aLS.mThreadId));
+}
+
 void ProfileBuffer::addStoredMarker(ProfilerMarker *aStoredMarker) {
   aStoredMarker->SetGeneration(mGeneration);
   mStoredMarkers.insert(aStoredMarker);
 }
 
 void ProfileBuffer::deleteExpiredStoredMarkers() {
   // Delete markers of samples that have been overwritten due to circular
   // buffer wraparound.
--- a/tools/profiler/core/ProfileBuffer.h
+++ b/tools/profiler/core/ProfileBuffer.h
@@ -14,36 +14,63 @@
 
 class ProfileBuffer final
 {
 public:
   explicit ProfileBuffer(int aEntrySize);
 
   ~ProfileBuffer();
 
+  // LastSample is used to record the buffer location of the most recent
+  // sample for each thread.
+  struct LastSample {
+    explicit LastSample(int aThreadId)
+      : mThreadId(aThreadId)
+      , mGeneration(0)
+      , mPos(-1)
+    {}
+
+    // The thread to which this LastSample pertains.
+    const int mThreadId;
+    // The profiler-buffer generation number at which the sample was created.
+    uint32_t mGeneration;
+    // And its position in the buffer, or -1 meaning "invalid".
+    int mPos;
+  };
+
+  // Add |aTag| to the buffer, ignoring what kind of entry it is.
   void addTag(const ProfileBufferEntry& aTag);
+
+  // Add to the buffer, a sample start (ThreadId) entry, for the thread that
+  // |aLS| belongs to, and record the resulting generation and index in |aLS|.
+  void addTagThreadId(LastSample& aLS);
+
   void StreamSamplesToJSON(SpliceableJSONWriter& aWriter, int aThreadId, double aSinceTime,
                            JSContext* cx, UniqueStacks& aUniqueStacks);
   void StreamMarkersToJSON(SpliceableJSONWriter& aWriter, int aThreadId,
                            const mozilla::TimeStamp& aStartTime,
                            double aSinceTime,
                            UniqueStacks& aUniqueStacks);
-  bool DuplicateLastSample(int aThreadId, const mozilla::TimeStamp& aStartTime);
+
+  // Find the most recent sample for the thread denoted by |aLS| and clone it,
+  // patching in |aStartTime| as appropriate.
+  bool DuplicateLastSample(const mozilla::TimeStamp& aStartTime,
+                           LastSample& aLS);
 
   void addStoredMarker(ProfilerMarker* aStoredMarker);
 
   // The following two methods are not signal safe! They delete markers.
   void deleteExpiredStoredMarkers();
   void reset();
 
   size_t SizeOfIncludingThis(mozilla::MallocSizeOf aMallocSizeOf) const;
 
 protected:
   char* processDynamicTag(int readPos, int* tagsConsumed, char* tagBuff);
-  int FindLastSampleOfThread(int aThreadId);
+  int FindLastSampleOfThread(const LastSample& aLS);
 
 public:
   // Circular buffer 'Keep One Slot Open' implementation for simplicity
   mozilla::UniquePtr<ProfileBufferEntry[]> mEntries;
 
   // Points to the next entry we will write to, which is also the one at which
   // we need to stop reading.
   int mWritePos;
--- a/tools/profiler/core/ProfileBufferEntry.cpp
+++ b/tools/profiler/core/ProfileBufferEntry.cpp
@@ -737,50 +737,59 @@ ProfileBuffer::StreamMarkersToJSON(Splic
       if (marker->GetTime() >= aSinceTime) {
         entry.getMarker()->StreamJSON(aWriter, aStartTime, aUniqueStacks);
       }
     }
     readPos = (readPos + 1) % mEntrySize;
   }
 }
 
-int ProfileBuffer::FindLastSampleOfThread(int aThreadId)
+int
+ProfileBuffer::FindLastSampleOfThread(const LastSample& aLS)
 {
-  // We search backwards from mWritePos-1 to mReadPos.
-  // Adding mEntrySize makes the result of the modulus positive.
-  int currPos = (mWritePos + mEntrySize - 1) % mEntrySize;
-  int stopPos = (mReadPos + mEntrySize - 1) % mEntrySize;
-  while (currPos != stopPos) {
-    ProfileBufferEntry entry = mEntries[currPos];
-    if (entry.isThreadId() && entry.mTagInt == aThreadId) {
-      return currPos;
+  // |aLS| has a valid generation number if either it matches the buffer's
+  // generation, or is one behind the buffer's generation, since the buffer's
+  // generation is incremented on wraparound.  There's no ambiguity relative to
+  // ProfileBuffer::reset, since that increments mGeneration by two.
+  if (aLS.mGeneration == mGeneration ||
+      (mGeneration > 0 && aLS.mGeneration == mGeneration - 1)) {
+    int ix = aLS.mPos;
+
+    if (ix == -1) {
+      // There's no record of |aLS|'s thread ever having recorded a sample in
+      // the buffer.
+      return -1;
     }
-    currPos--;
-    if (currPos < 0) {
-      // This almost never happens, so is perfectly predicted, and hence
-      // almost free.
-      currPos = mEntrySize - 1;
-    }
+
+    // It might be that the sample has since been overwritten, so check that it
+    // is still valid.
+    MOZ_RELEASE_ASSERT(0 <= ix && ix < mEntrySize);
+    ProfileBufferEntry& entry = mEntries[ix];
+    bool isStillValid = entry.isThreadId() && entry.mTagInt == aLS.mThreadId;
+    return isStillValid ? ix : -1;
   }
 
-  // This is rare. It typically happens after ProfileBuffer::reset() occurs.
+  // |aLS| denotes a sample which is older than either two wraparounds or one
+  // call to ProfileBuffer::reset.  In either case it is no longer valid.
+  MOZ_ASSERT(aLS.mGeneration <= mGeneration - 2);
   return -1;
 }
 
 bool
-ProfileBuffer::DuplicateLastSample(int aThreadId, const TimeStamp& aStartTime)
+ProfileBuffer::DuplicateLastSample(const TimeStamp& aStartTime, LastSample& aLS)
 {
-  int lastSampleStartPos = FindLastSampleOfThread(aThreadId);
+  int lastSampleStartPos = FindLastSampleOfThread(aLS);
   if (lastSampleStartPos == -1) {
     return false;
   }
 
-  MOZ_ASSERT(mEntries[lastSampleStartPos].isThreadId());
+  MOZ_ASSERT(mEntries[lastSampleStartPos].isThreadId() &&
+             mEntries[lastSampleStartPos].mTagInt == aLS.mThreadId);
 
-  addTag(mEntries[lastSampleStartPos]);
+  addTagThreadId(aLS);
 
   // Go through the whole entry and duplicate it, until we find the next one.
   for (int readPos = (lastSampleStartPos + 1) % mEntrySize;
        readPos != mWritePos;
        readPos = (readPos + 1) % mEntrySize) {
     switch (mEntries[readPos].kind()) {
       case ProfileBufferEntry::Kind::ThreadId:
         // We're done.
@@ -789,17 +798,17 @@ ProfileBuffer::DuplicateLastSample(int a
         // Copy with new time
         addTag(ProfileBufferEntry::Time((TimeStamp::Now() -
                                          aStartTime).ToMilliseconds()));
         break;
       case ProfileBufferEntry::Kind::Marker:
         // Don't copy markers
         break;
       default:
-        // Copy anything else we don't know about
+        // Copy anything else we don't know about.
         addTag(mEntries[readPos]);
         break;
     }
   }
   return true;
 }
 
 // END ProfileBuffer
--- a/tools/profiler/core/ThreadInfo.cpp
+++ b/tools/profiler/core/ThreadInfo.cpp
@@ -18,16 +18,17 @@ ThreadInfo::ThreadInfo(const char* aName
   : mName(strdup(aName))
   , mThreadId(aThreadId)
   , mIsMainThread(aIsMainThread)
   , mPseudoStack(aPseudoStack)
   , mPlatformData(AllocPlatformData(aThreadId))
   , mStackTop(aStackTop)
   , mPendingDelete(false)
   , mHasProfile(false)
+  , mLastSample(aThreadId)
 {
   MOZ_COUNT_CTOR(ThreadInfo);
   mThread = NS_GetCurrentThread();
 
   // We don't have to guess on mac
 #if defined(GP_OS_darwin)
   pthread_t self = pthread_self();
   mStackTop = pthread_get_stackaddr_np(self);
--- a/tools/profiler/core/ThreadInfo.h
+++ b/tools/profiler/core/ThreadInfo.h
@@ -33,16 +33,18 @@ public:
 
   virtual void SetPendingDelete();
   bool IsPendingDelete() const { return mPendingDelete; }
 
   bool CanInvokeJS() const;
 
   size_t SizeOfIncludingThis(mozilla::MallocSizeOf aMallocSizeOf) const;
 
+  ProfileBuffer::LastSample& LastSample() { return mLastSample; }
+
  private:
   mozilla::UniqueFreePtr<char> mName;
   int mThreadId;
   const bool mIsMainThread;
 
   // The thread's PseudoStack. This is an owning pointer iff mIsPendingDelete
   // is set.
   mozilla::NotNull<PseudoStack*> mPseudoStack;
@@ -99,11 +101,17 @@ private:
   // stringifying JIT frames). In the case of JSRuntime destruction,
   // FlushSamplesAndMarkers should be called to save them. These are spliced
   // into the final stream.
   mozilla::UniquePtr<char[]> mSavedStreamedSamples;
   mozilla::UniquePtr<char[]> mSavedStreamedMarkers;
   mozilla::Maybe<UniqueStacks> mUniqueStacks;
 
   ThreadResponsiveness mRespInfo;
+
+  // When sampling, this holds the generation number and offset in the
+  // ProfileBuffer of the most recent sample for this thread.
+  // mLastSample.mThreadId duplicates mThreadId in this structure, which
+  // simplifies some uses of mLastSample.
+  ProfileBuffer::LastSample mLastSample;
 };
 
 #endif
--- a/tools/profiler/core/platform-linux-android.cpp
+++ b/tools/profiler/core/platform-linux-android.cpp
@@ -385,18 +385,18 @@ public:
             ThreadInfo* info = threads[i];
 
             if (!info->HasProfile() || info->IsPendingDelete()) {
               // We are not interested in profiling this thread.
               continue;
             }
 
             if (info->Stack()->CanDuplicateLastSampleDueToSleep() &&
-                gPS->Buffer(lock)->DuplicateLastSample(info->ThreadId(),
-                                                       gPS->StartTime(lock))) {
+                gPS->Buffer(lock)->DuplicateLastSample(gPS->StartTime(lock),
+                                                       info->LastSample())) {
               continue;
             }
 
             info->UpdateThreadResponsiveness();
 
             int threadId = info->ThreadId();
             MOZ_ASSERT(threadId != my_tid);
 
--- a/tools/profiler/core/platform-macos.cpp
+++ b/tools/profiler/core/platform-macos.cpp
@@ -141,18 +141,18 @@ public:
             ThreadInfo* info = threads[i];
 
             if (!info->HasProfile() || info->IsPendingDelete()) {
               // We are not interested in profiling this thread.
               continue;
             }
 
             if (info->Stack()->CanDuplicateLastSampleDueToSleep() &&
-                gPS->Buffer(lock)->DuplicateLastSample(info->ThreadId(),
-                                                       gPS->StartTime(lock))) {
+                gPS->Buffer(lock)->DuplicateLastSample(gPS->StartTime(lock),
+                                                       info->LastSample())) {
               continue;
             }
 
             info->UpdateThreadResponsiveness();
 
             SampleContext(lock, info, isFirstProfiledThread);
 
             isFirstProfiledThread = false;
--- a/tools/profiler/core/platform-win32.cpp
+++ b/tools/profiler/core/platform-win32.cpp
@@ -169,18 +169,18 @@ public:
             ThreadInfo* info = threads[i];
 
             if (!info->HasProfile() || info->IsPendingDelete()) {
               // We are not interested in profiling this thread.
               continue;
             }
 
             if (info->Stack()->CanDuplicateLastSampleDueToSleep() &&
-                gPS->Buffer(lock)->DuplicateLastSample(info->ThreadId(),
-                                                       gPS->StartTime(lock))) {
+                gPS->Buffer(lock)->DuplicateLastSample(gPS->StartTime(lock),
+                                                       info->LastSample())) {
               continue;
             }
 
             info->UpdateThreadResponsiveness();
 
             SampleContext(lock, info, isFirstProfiledThread);
 
             isFirstProfiledThread = false;
--- a/tools/profiler/core/platform.cpp
+++ b/tools/profiler/core/platform.cpp
@@ -944,17 +944,18 @@ DoSampleStackTrace(PS::LockRef aLock, Pr
 
 // This function is called for each sampling period with the current program
 // counter. It is called within a signal and so must be re-entrant.
 static void
 Tick(PS::LockRef aLock, ProfileBuffer* aBuffer, TickSample* aSample)
 {
   ThreadInfo& threadInfo = *aSample->threadInfo;
 
-  aBuffer->addTag(ProfileBufferEntry::ThreadId(threadInfo.ThreadId()));
+  MOZ_ASSERT(threadInfo.LastSample().mThreadId == threadInfo.ThreadId());
+  aBuffer->addTagThreadId(threadInfo.LastSample());
 
   mozilla::TimeDuration delta = aSample->timestamp - gPS->StartTime(aLock);
   aBuffer->addTag(ProfileBufferEntry::Time(delta.ToMilliseconds()));
 
   NotNull<PseudoStack*> stack = threadInfo.Stack();
 
 #if defined(USE_NS_STACKWALK) || defined(USE_EHABI_STACKWALK) || \
     defined(USE_LUL_STACKWALK)