Bug 1414168 - Change how run sizes are calculated. r=njn
author: Mike Hommey <mh+mozilla@glandium.org>
Wed, 08 Nov 2017 14:04:10 +0900
changeset 444165 11bc33786fd378b717fc4ea358c12e6f96320432
parent 444164 29a122463ffcc549f3ccccbc5779ea13ca68abfe
child 444166 182b840c89560a0d4a53bcd5bc337749e92fdd23
push id: 1618
push user: Callek@gmail.com
push date: Thu, 11 Jan 2018 17:45:48 +0000
treeherder: mozilla-release@882ca853e05a [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: njn
bugs: 1414168
milestone: 58.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1414168 - Change how run sizes are calculated. r=njn

There are multiple flaws to the current code:
- The loop calculating the right parameters for a given run size is repeated.
- The loop trying different run sizes doesn't actually work to fulfil the overhead constraint: while it stops when the constraint is fulfilled, the values that are kept are those from the previous iteration, which may be well over the constraint.

In practice, the latter resulted in a few surprising results:
- most size classes had an overhead slightly over the constraint (1.562%), which, while not terribly bad, doesn't match the set expectations.
- some size classes ended up with relatively good overheads only because of the additional constraint that run sizes had to be larger than the run size of smaller size classes. Without this constraint, some size classes would end up with overheads well over 2% just because that happens to be the last overhead value before reaching below the 1.5% constraint.

Furthermore, for higher-level fragmentation concerns, smaller run sizes are better than larger run sizes, and in many cases, smaller run sizes can yield the same (or even sometimes, better) overhead as larger run sizes. For example, the current code chooses 8 KiB for runs of size class 112, but using 4 KiB runs would actually yield the same number of regions, and the same overhead.

We thus change the calculation to:
- not force runs to be at least as large as those of smaller classes.
- avoid the code repetition.
- actually enforce its overhead constraint, but make it 1.6%.
- for especially small size classes, relax the overhead constraint to 2.4%.
This leads to an uneven set of run sizes:

  size class   before    after
           4    4 KiB    4 KiB
           8    4 KiB    4 KiB
          16    4 KiB    4 KiB
          32    4 KiB    4 KiB
          48    4 KiB    4 KiB
          64    4 KiB    4 KiB
          80    4 KiB    4 KiB
          96    4 KiB    4 KiB
         112    8 KiB    4 KiB
         128    8 KiB    8 KiB
         144    8 KiB    4 KiB
         160    8 KiB    8 KiB
         176    8 KiB    4 KiB
         192   12 KiB    4 KiB
         208   12 KiB    8 KiB
         224   12 KiB    4 KiB
         240   12 KiB    4 KiB
         256   16 KiB   16 KiB
         272   16 KiB    4 KiB
         288   16 KiB    4 KiB
         304   16 KiB   12 KiB
         320   20 KiB   12 KiB
         336   20 KiB    4 KiB
         352   20 KiB    8 KiB
         368   20 KiB    4 KiB
         384   24 KiB    8 KiB
         400   24 KiB   20 KiB
         416   24 KiB   16 KiB
         432   24 KiB   12 KiB
         448   28 KiB    4 KiB
         464   28 KiB   16 KiB
         480   28 KiB    8 KiB
         496   28 KiB   20 KiB
         512   32 KiB   32 KiB
        1024   64 KiB   64 KiB
        2048  132 KiB  128 KiB

* Note: "before" is before this change only, not before the set of changes from this bug; before that, the run size for 96 could be 8 KiB in some configurations.

In most cases, the overhead hasn't changed, with a few exceptions:

- Improvements:
  size class   before    after
         208   1.823%   0.977%
         304   1.660%   1.042%
         320   1.562%   1.042%
         400   0.716%   0.391%
         464   1.283%   0.879%
         480   1.228%   0.391%
         496   1.395%   0.703%

- Regressions:
  size class   before    after
         352   0.312%   1.172%
         416   0.130%   0.977%
        2048   1.515%   1.562%

For the regressions, the values are either still well within the constraint or close enough to the previous value that I don't feel it's worth trying to avoid them, at the risk of making things worse for other size classes.
memory/build/mozjemalloc.cpp
--- a/memory/build/mozjemalloc.cpp
+++ b/memory/build/mozjemalloc.cpp
@@ -894,17 +894,19 @@ struct arena_bin_t
   uint32_t mRunNumRegionsMask;
 
   // Offset of first region in a run for this bin's size class.
   uint32_t mRunFirstRegionOffset;
 
   // Current number of runs in this bin, full or otherwise.
   unsigned long mNumRuns;
 
-  static constexpr long double kRunOverhead = 1.5_percent;
+  // Amount of overhead runs are allowed to have.
+  static constexpr long double kRunOverhead = 1.6_percent;
+  static constexpr long double kRunRelaxedOverhead = 2.4_percent;
 };
 
 struct arena_t
 {
 #if defined(MOZ_DIAGNOSTIC_ASSERT_ENABLED)
   uint32_t mMagic;
 #define ARENA_MAGIC 0x947d3d24
 #endif
@@ -2944,69 +2946,50 @@ arena_t::MallocBinHard(arena_bin_t* aBin
     return nullptr;
   }
   MOZ_DIAGNOSTIC_ASSERT(aBin->mCurrentRun->magic == ARENA_RUN_MAGIC);
   MOZ_DIAGNOSTIC_ASSERT(aBin->mCurrentRun->nfree > 0);
 
   return MallocBinEasy(aBin, aBin->mCurrentRun);
 }
 
-// Calculate bin->mRunSize such that it meets the following constraints:
-//
-//   *) bin->mRunSize >= min_run_size
-//   *) bin->mRunSize <= gMaxLargeClass
-//   *) bin->mRunSize <= gMaxBinClass
-//   *) run header overhead <= kRunOverhead
-//
+// Calculate bin->mRunSize such that it meets a set of constraints.
 // bin->mRunNumRegions, bin->mRunNumRegionsMask, and bin->mRunFirstRegionOffset are
 // also calculated here, since these settings are all interdependent.
-static size_t
-arena_bin_run_size_calc(arena_bin_t* bin, size_t min_run_size)
+// The generated run sizes, for a page size of 4 KiB, are:
+//   size|run       size|run       size|run       size|run
+//  class|size     class|size     class|size     class|size
+//     4   4 KiB      8   4 KiB     16   4 KiB     32   4 KiB
+//    48   4 KiB     64   4 KiB     80   4 KiB     96   4 KiB
+//   112   4 KiB    128   8 KiB    144   4 KiB    160   8 KiB
+//   176   4 KiB    192   4 KiB    208   8 KiB    224   4 KiB
+//   240   4 KiB    256  16 KiB    272   4 KiB    288   4 KiB
+//   304  12 KiB    320  12 KiB    336   4 KiB    352   8 KiB
+//   368   4 KiB    384   8 KiB    400  20 KiB    416  16 KiB
+//   432  12 KiB    448   4 KiB    464  16 KiB    480   8 KiB
+//   496  20 KiB    512  32 KiB   1024  64 KiB   2048 128 KiB
+static void
+arena_bin_run_size_calc(arena_bin_t* bin)
 {
-  size_t try_run_size, good_run_size;
-  unsigned good_nregs, good_mask_nelms, good_reg0_offset;
+  size_t try_run_size;
   unsigned try_nregs, try_mask_nelms, try_reg0_offset;
   // Size of the run header, excluding regs_mask.
   static const size_t kFixedHeaderSize = offsetof(arena_run_t, regs_mask);
 
-  MOZ_ASSERT(min_run_size >= gPageSize);
-  MOZ_ASSERT(min_run_size <= gMaxLargeClass);
-
-  // Calculate known-valid settings before entering the mRunSize
-  // expansion loop, so that the first part of the loop always copies
-  // valid settings.
-  //
-  // The do..while loop iteratively reduces the number of regions until
-  // the run header and the regions no longer overlap.  A closed formula
-  // would be quite messy, since there is an interdependency between the
-  // header's mask length and the number of regions.
-  try_run_size = min_run_size;
-  try_nregs = ((try_run_size - kFixedHeaderSize) / bin->mSizeClass) +
-              1; // Counter-act try_nregs-- in loop.
-  do {
-    try_nregs--;
-    try_mask_nelms =
-      (try_nregs >> (LOG2(sizeof(int)) + 3)) +
-      ((try_nregs & ((1U << (LOG2(sizeof(int)) + 3)) - 1)) ? 1 : 0);
-    try_reg0_offset = try_run_size - (try_nregs * bin->mSizeClass);
-  } while (kFixedHeaderSize + (sizeof(unsigned) * try_mask_nelms) >
-           try_reg0_offset);
+  try_run_size = gPageSize;
 
   // mRunSize expansion loop.
   while (true) {
-    // Copy valid settings before trying more aggressive settings.
-    good_run_size = try_run_size;
-    good_nregs = try_nregs;
-    good_mask_nelms = try_mask_nelms;
-    good_reg0_offset = try_reg0_offset;
-
-    // Try more aggressive settings.
-    try_run_size += gPageSize;
     try_nregs = ((try_run_size - kFixedHeaderSize) / bin->mSizeClass) +
                 1; // Counter-act try_nregs-- in loop.
+
+    // The do..while loop iteratively reduces the number of regions until
+    // the run header and the regions no longer overlap.  A closed formula
+    // would be quite messy, since there is an interdependency between the
+    // header's mask length and the number of regions.
     do {
       try_nregs--;
       try_mask_nelms =
         (try_nregs >> (LOG2(sizeof(int)) + 3)) +
         ((try_nregs & ((1U << (LOG2(sizeof(int)) + 3)) - 1)) ? 1 : 0);
       try_reg0_offset = try_run_size - (try_nregs * bin->mSizeClass);
     } while (kFixedHeaderSize + (sizeof(unsigned) * try_mask_nelms) >
              try_reg0_offset);
@@ -3016,41 +2999,50 @@ arena_bin_run_size_calc(arena_bin_t* bin
       break;
     }
 
     // Try to keep the run overhead below kRunOverhead.
     if (Fraction(try_reg0_offset, try_run_size) <= arena_bin_t::kRunOverhead) {
       break;
     }
 
+    // If the overhead is larger than the size class, it means the size class
+    // is small and doesn't align very well with the header. It's desirable to
+    // have smaller run sizes for them, so relax the overhead requirement.
+    if (try_reg0_offset > bin->mSizeClass) {
+      if (Fraction(try_reg0_offset, try_run_size) <= arena_bin_t::kRunRelaxedOverhead) {
+        break;
+      }
+    }
+
     // The run header includes one bit per region of the given size. For sizes
     // small enough, the number of regions is large enough that growing the run
     // size barely moves the needle for the overhead because of all those bits.
     // For example, for a size of 8 bytes, adding 4KiB to the run size adds
     // close to 512 bits to the header, which is 64 bytes.
     // With such overhead, there is no way to get to the wanted overhead above,
     // so we give up if the required size for regs_mask more than doubles the
     // size of the run header.
     if (try_mask_nelms * sizeof(unsigned) >= kFixedHeaderSize) {
       break;
     }
 
-  }
-
-  MOZ_ASSERT(kFixedHeaderSize + (sizeof(unsigned) * good_mask_nelms) <=
-             good_reg0_offset);
-  MOZ_ASSERT((good_mask_nelms << (LOG2(sizeof(int)) + 3)) >= good_nregs);
+    // Try more aggressive settings.
+    try_run_size += gPageSize;
+  }
+
+  MOZ_ASSERT(kFixedHeaderSize + (sizeof(unsigned) * try_mask_nelms) <=
+             try_reg0_offset);
+  MOZ_ASSERT((try_mask_nelms << (LOG2(sizeof(int)) + 3)) >= try_nregs);
 
   // Copy final settings.
-  bin->mRunSize = good_run_size;
-  bin->mRunNumRegions = good_nregs;
-  bin->mRunNumRegionsMask = good_mask_nelms;
-  bin->mRunFirstRegionOffset = good_reg0_offset;
-
-  return good_run_size;
+  bin->mRunSize = try_run_size;
+  bin->mRunNumRegions = try_nregs;
+  bin->mRunNumRegionsMask = try_mask_nelms;
+  bin->mRunFirstRegionOffset = try_reg0_offset;
 }
 
 void*
 arena_t::MallocSmall(size_t aSize, bool aZero)
 {
   void* ret;
   arena_bin_t* bin;
   arena_run_t* run;
@@ -3808,17 +3800,16 @@ iralloc(void* aPtr, size_t aSize, arena_
   return (aSize <= gMaxLargeClass) ? arena_ralloc(aPtr, aSize, oldsize, aArena)
                                    : huge_ralloc(aPtr, aSize, oldsize);
 }
 
 arena_t::arena_t()
 {
   unsigned i;
   arena_bin_t* bin;
-  size_t prev_run_size;
 
   MOZ_RELEASE_ASSERT(mLock.Init());
 
   memset(&mLink, 0, sizeof(mLink));
   memset(&mStats, 0, sizeof(arena_stats_t));
 
   // Initialize chunks.
   mChunksDirty.Init();
@@ -3830,27 +3821,26 @@ arena_t::arena_t()
   mNumDirty = 0;
   // Reduce the maximum amount of dirty pages we allow to be kept on
   // thread local arenas. TODO: make this more flexible.
   mMaxDirty = opt_dirty_max >> 3;
 
   mRunsAvail.Init();
 
   // Initialize bins.
-  prev_run_size = gPageSize;
   SizeClass sizeClass(1);
 
   for (i = 0;; i++) {
     bin = &mBins[i];
     bin->mCurrentRun = nullptr;
     bin->mNonFullRuns.Init();
 
     bin->mSizeClass = sizeClass.Size();
 
-    prev_run_size = arena_bin_run_size_calc(bin, prev_run_size);
+    arena_bin_run_size_calc(bin);
 
     bin->mNumRuns = 0;
 
     // SizeClass doesn't want sizes larger than gMaxSubPageClass for now.
     if (sizeClass.Size() == gMaxSubPageClass) {
       break;
     }
     sizeClass = sizeClass.Next();