Bug 1380033 - Tiering policy with space proxy. r=luke
authorLars T Hansen <lhansen@mozilla.com>
Thu, 31 Aug 2017 12:24:13 +0200
changeset 382495 a414ed3ef9e596556f88b94fd76a470bee9cd2fd
parent 382494 56083ad02b19e622ae3a141b62e3cdcb58dd09df
child 382496 3d11e27057a37db169e3e8f166fed60b854152e4
push id32559
push userkwierso@gmail.com
push dateFri, 22 Sep 2017 21:56:17 +0000
treeherdermozilla-central@3d72fdb0e561 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersluke
bugs1380033
milestone58.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1380033 - Tiering policy with space proxy. r=luke
js/src/jit/ProcessExecutableMemory.cpp
js/src/jit/ProcessExecutableMemory.h
js/src/wasm/WasmCompile.cpp
js/src/wasm/WasmGenerator.h
js/src/wasm/WasmValidate.cpp
js/src/wasm/WasmValidate.h
--- a/js/src/jit/ProcessExecutableMemory.cpp
+++ b/js/src/jit/ProcessExecutableMemory.cpp
@@ -614,16 +614,23 @@ js::jit::InitProcessExecutableMemory()
 }
 
 void
 js::jit::ReleaseProcessExecutableMemory()
 {
     execMemory.release();
 }
 
+size_t
+js::jit::LikelyAvailableExecutableMemory()
+{
+    // Round down available memory to the nearest MB.
+    return MaxCodeBytesPerProcess - AlignBytes(execMemory.bytesAllocated(), 0x100000U);
+}
+
 bool
 js::jit::CanLikelyAllocateMoreExecutableMemory()
 {
     // Use a 8 MB buffer.
     static const size_t BufferSize = 8 * 1024 * 1024;
 
     MOZ_ASSERT(execMemory.bytesAllocated() <= MaxCodeBytesPerProcess);
 
--- a/js/src/jit/ProcessExecutableMemory.h
+++ b/js/src/jit/ProcessExecutableMemory.h
@@ -45,12 +45,17 @@ extern void DeallocateExecutableMemory(v
 // Returns true if we can allocate a few more MB of executable code without
 // hitting our code limit. This function can be used to stop compiling things
 // that are optional (like Baseline and Ion code) when we're about to reach the
 // limit, so we are less likely to OOM or crash. Note that the limit is
 // per-process, so other threads can also allocate code after we call this
 // function.
 extern bool CanLikelyAllocateMoreExecutableMemory();
 
+// Returns a rough guess of how much executable memory remains available,
+// rounded down to the nearest MB.  Note this can fluctuate as other threads within
+// the process allocate executable memory.
+extern size_t LikelyAvailableExecutableMemory();
+
 } // namespace jit
 } // namespace js
 
 #endif // jit_ProcessExecutableMemory_h
--- a/js/src/wasm/WasmCompile.cpp
+++ b/js/src/wasm/WasmCompile.cpp
@@ -18,16 +18,17 @@
 
 #include "wasm/WasmCompile.h"
 
 #include "mozilla/Maybe.h"
 #include "mozilla/Unused.h"
 
 #include "jsprf.h"
 
+#include "jit/ProcessExecutableMemory.h"
 #include "wasm/WasmBaselineCompile.h"
 #include "wasm/WasmBinaryIterator.h"
 #include "wasm/WasmGenerator.h"
 #include "wasm/WasmSignalHandlers.h"
 #include "wasm/WasmValidate.h"
 
 using namespace js;
 using namespace js::jit;
@@ -98,50 +99,322 @@ CompileArgs::initFromContext(JSContext* 
     // only enable it when a developer actually cares: when the debugger tab
     // is open.
     debugEnabled = cx->compartment()->debuggerObservesAsmJS();
 
     this->scriptedCaller = Move(scriptedCaller);
     return assumptions.initBuildIdFromContext(cx);
 }
 
+// Classify the current system as one of a set of recognizable classes.  This
+// really needs to get our tier-1 systems right.
+//
+// TODO: We don't yet have a good measure of how fast a system is.  We
+// distinguish between mobile and desktop because these are very different kinds
+// of systems, but we could further distinguish between low / medium / high end
+// within those major classes.  If we do so, then constants below would be
+// provided for each (class, architecture, system-tier) combination, not just
+// (class, architecture) as now.
+//
+// CPU clock speed is not by itself a good predictor of system performance, as
+// there are high-performance systems with slow clocks (recent Intel) and
+// low-performance systems with fast clocks (older AMD).  We can also use
+// physical memory, core configuration, OS details, CPU class and family, and
+// CPU manufacturer to disambiguate.
+
+enum class SystemClass
+{
+    DesktopX86,
+    DesktopX64,
+    DesktopUnknown32,
+    DesktopUnknown64,
+    MobileX86,
+    MobileArm32,
+    MobileArm64,
+    MobileUnknown32,
+    MobileUnknown64
+};
+
+static SystemClass
+Classify()
+{
+    bool isDesktop;
+
+#if defined(ANDROID) || defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
+    isDesktop = false;
+#else
+    isDesktop = true;
+#endif
+
+    if (isDesktop) {
+#if defined(JS_CODEGEN_X64)
+        return SystemClass::DesktopX64;
+#elif defined(JS_CODEGEN_X86)
+        return SystemClass::DesktopX86;
+#elif defined(JS_64BIT)
+        return SystemClass::DesktopUnknown64;
+#else
+        return SystemClass::DesktopUnknown32;
+#endif
+    } else {
+#if defined(JS_CODEGEN_X86)
+        return SystemClass::MobileX86;
+#elif defined(JS_CODEGEN_ARM)
+        return SystemClass::MobileArm32;
+#elif defined(JS_CODEGEN_ARM64)
+        return SystemClass::MobileArm64;
+#elif defined(JS_64BIT)
+        return SystemClass::MobileUnknown64;
+#else
+        return SystemClass::MobileUnknown32;
+#endif
+    }
+}
+
+#ifndef JS_64BIT
+
+// Code sizes in machine code bytes per bytecode byte, again empirical except
+// where marked as "Guess".
+
+static const double x64Tox86Inflation = 1.25;
+
+static const double x64IonBytesPerBytecode = 2.45;
+static const double x86IonBytesPerBytecode = x64IonBytesPerBytecode * x64Tox86Inflation;
+static const double arm32IonBytesPerBytecode = 3.3;
+static const double arm64IonBytesPerBytecode = 3.0; // Guess
+
+static const double x64BaselineBytesPerBytecode = x64IonBytesPerBytecode * 1.43;
+static const double x86BaselineBytesPerBytecode = x64BaselineBytesPerBytecode * x64Tox86Inflation;
+static const double arm32BaselineBytesPerBytecode = arm32IonBytesPerBytecode * 1.39;
+static const double arm64BaselineBytesPerBytecode = arm64IonBytesPerBytecode * 1.39; // Guess
+
+static double
+IonBytesPerBytecode(SystemClass cls)
+{
+    switch (cls) {
+      case SystemClass::DesktopX86:
+      case SystemClass::MobileX86:
+      case SystemClass::DesktopUnknown32:
+        return x86IonBytesPerBytecode;
+      case SystemClass::DesktopX64:
+      case SystemClass::DesktopUnknown64:
+        return x64IonBytesPerBytecode;
+      case SystemClass::MobileArm32:
+      case SystemClass::MobileUnknown32:
+        return arm32IonBytesPerBytecode;
+      case SystemClass::MobileArm64:
+      case SystemClass::MobileUnknown64:
+        return arm64IonBytesPerBytecode;
+      default:
+        MOZ_CRASH();
+    }
+}
+
+static double
+BaselineBytesPerBytecode(SystemClass cls)
+{
+    switch (cls) {
+      case SystemClass::DesktopX86:
+      case SystemClass::MobileX86:
+      case SystemClass::DesktopUnknown32:
+        return x86BaselineBytesPerBytecode;
+      case SystemClass::DesktopX64:
+      case SystemClass::DesktopUnknown64:
+        return x64BaselineBytesPerBytecode;
+      case SystemClass::MobileArm32:
+      case SystemClass::MobileUnknown32:
+        return arm32BaselineBytesPerBytecode;
+      case SystemClass::MobileArm64:
+      case SystemClass::MobileUnknown64:
+        return arm64BaselineBytesPerBytecode;
+      default:
+        MOZ_CRASH();
+    }
+}
+
+#endif // !JS_64BIT
+
+// If parallel Ion compilation is going to take longer than this, we should tier.
+
+static const double tierCutoffMs = 250;
+
+// Compilation rate values are empirical except when noted, the reference
+// systems are:
+//
+// Late-2013 MacBook Pro (2.6GHz quad hyperthreaded Haswell)
+// Late-2015 Nexus 5X (1.4GHz quad Cortex-A53 + 1.8GHz dual Cortex-A57)
+
+static const double x64BytecodesPerMs = 2100;
+static const double x86BytecodesPerMs = 1500;
+static const double arm32BytecodesPerMs = 450;
+static const double arm64BytecodesPerMs = 650; // Guess
+
+// Tiering cutoff values: if code section sizes are below these values (when
+// divided by the effective number of cores) we do not tier, because we guess
+// that parallel Ion compilation will be fast enough.
+
+static const double x64DesktopTierCutoff = x64BytecodesPerMs * tierCutoffMs;
+static const double x86DesktopTierCutoff = x86BytecodesPerMs * tierCutoffMs;
+static const double x86MobileTierCutoff = x86DesktopTierCutoff / 2; // Guess
+static const double arm32MobileTierCutoff = arm32BytecodesPerMs * tierCutoffMs;
+static const double arm64MobileTierCutoff = arm64BytecodesPerMs * tierCutoffMs;
+
+static double
+CodesizeCutoff(SystemClass cls, uint32_t codeSize)
+{
+    switch (cls) {
+      case SystemClass::DesktopX86:
+      case SystemClass::DesktopUnknown32:
+        return x86DesktopTierCutoff;
+      case SystemClass::DesktopX64:
+      case SystemClass::DesktopUnknown64:
+        return x64DesktopTierCutoff;
+      case SystemClass::MobileX86:
+        return x86MobileTierCutoff;
+      case SystemClass::MobileArm32:
+      case SystemClass::MobileUnknown32:
+        return arm32MobileTierCutoff;
+      case SystemClass::MobileArm64:
+      case SystemClass::MobileUnknown64:
+        return arm64MobileTierCutoff;
+      default:
+        MOZ_CRASH();
+    }
+}
+
+// As the number of cores grows the effectiveness of each core dwindles (on the
+// systems we care about for SpiderMonkey).
+//
+// The data are empirical, computed from the observed compilation time of the
+// Tanks demo code on a variable number of cores.
+//
+// The heuristic may fail on NUMA systems where the core count is high but the
+// performance increase is nil or negative once the program moves beyond one
+// socket.  However, few browser users have such systems.
+
+static double
+EffectiveCores(SystemClass cls, uint32_t cores)
+{
+    if (cores <= 3)
+        return pow(cores, 0.9);
+    return pow(cores, 0.75);
+}
+
+#ifndef JS_64BIT
+// Don't tier if tiering will fill code memory to more than this
+// fraction.
+
+static const double spaceCutoffPct = 0.9;
+#endif
+
+// Figure out whether we should use tiered compilation or not.
 static bool
-BackgroundWorkPossible()
+GetTieringEnabled(uint32_t codeSize)
 {
-    return CanUseExtraThreads() && HelperThreadState().cpuCount > 1;
+    if (!CanUseExtraThreads())
+        return false;
+
+    uint32_t cpuCount = HelperThreadState().cpuCount;
+    MOZ_ASSERT(cpuCount > 0);
+
+    // It's mostly sensible not to background compile when there's only one
+    // hardware thread as we want foreground computation to have access to that.
+    // However, if wasm background compilation helper threads can be given lower
+    // priority then background compilation on single-core systems still makes
+    // some kind of sense.  That said, this is a non-issue: as of September 2017
+    // 1-core was down to 3.5% of our population and falling.
+
+    if (cpuCount == 1)
+        return false;
+
+    MOZ_ASSERT(HelperThreadState().threadCount >= cpuCount);
+
+    // Compute the max number of threads available to do actual background
+    // compilation work.
+
+    uint32_t workers = HelperThreadState().maxWasmCompilationThreads();
+
+    // The number of cores we will use is bounded both by the CPU count and the
+    // worker count.
+
+    uint32_t cores = Min(cpuCount, workers);
+
+    SystemClass cls = Classify();
+
+    // Ion compilation on available cores must take long enough to be worth the
+    // bother.
+
+    double cutoffSize = CodesizeCutoff(cls, codeSize);
+    double effectiveCores = EffectiveCores(cls, cores);
+
+    if ((codeSize / effectiveCores) < cutoffSize)
+        return false;
+
+    // Do not implement a size cutoff for 64-bit systems since the code size
+    // budget for 64 bit is so large that it will hardly ever be an issue.
+    // (Also the cutoff percentage might be different on 64-bit.)
+
+#ifndef JS_64BIT
+    // If the amount of executable code for baseline compilation jeopardizes the
+    // availability of executable memory for ion code then do not tier, for now.
+    //
+    // TODO: For now we consider this module in isolation.  We should really
+    // worry about what else is going on in this process and might be filling up
+    // the code memory.  It's as if we need some kind of code memory reservation
+    // system or JIT compilation for large modules.
+
+    double ionRatio = IonBytesPerBytecode(cls);
+    double baselineRatio = BaselineBytesPerBytecode(cls);
+    double needMemory = codeSize * (ionRatio + baselineRatio);
+    double availMemory = LikelyAvailableExecutableMemory();
+    double cutoff = spaceCutoffPct * MaxCodeBytesPerProcess;
+
+    // If the sum of baseline and ion code makes us exceed some set percentage
+    // of the executable memory then disable tiering.
+
+    if ((MaxCodeBytesPerProcess - availMemory) + needMemory > cutoff)
+        return false;
+#endif
+
+    return true;
 }
 
 SharedModule
 wasm::CompileInitialTier(const ShareableBytes& bytecode, const CompileArgs& args, UniqueChars* error)
 {
     MOZ_RELEASE_ASSERT(wasm::HaveSignalHandlers());
 
     bool baselineEnabled = BaselineCanCompile() && args.baselineEnabled;
     bool debugEnabled = BaselineCanCompile() && args.debugEnabled;
     bool ionEnabled = args.ionEnabled || !baselineEnabled;
 
-    CompileMode mode;
-    Tier tier;
-    DebugEnabled debug;
-    if (BackgroundWorkPossible() && baselineEnabled && ionEnabled && !debugEnabled) {
-        mode = CompileMode::Tier1;
-        tier = Tier::Baseline;
-        debug = DebugEnabled::False;
-    } else {
-        mode = CompileMode::Once;
-        tier = debugEnabled || !ionEnabled ? Tier::Baseline : Tier::Ion;
-        debug = debugEnabled ? DebugEnabled::True : DebugEnabled::False;
-    }
+    DebugEnabled debug = debugEnabled ? DebugEnabled::True : DebugEnabled::False;
 
-    ModuleEnvironment env(mode, tier, debug);
+    ModuleEnvironment env(ModuleEnvironment::UnknownMode, ModuleEnvironment::UnknownTier, debug);
 
     Decoder d(bytecode.bytes, error);
     if (!DecodeModuleEnvironment(d, &env))
         return nullptr;
 
+    uint32_t codeSize;
+    if (!d.peekSectionSize(SectionId::Code, &env, "code", &codeSize))
+        codeSize = 0;
+
+    CompileMode mode;
+    Tier tier;
+    if (baselineEnabled && ionEnabled && !debugEnabled && GetTieringEnabled(codeSize)) {
+        mode = CompileMode::Tier1;
+        tier = Tier::Baseline;
+    } else {
+        mode = CompileMode::Once;
+        tier = debugEnabled || !ionEnabled ? Tier::Baseline : Tier::Ion;
+    }
+
+    env.setModeAndTier(mode, tier);
+
     ModuleGenerator mg(args, &env, nullptr, error);
     if (!mg.init())
         return nullptr;
 
     if (!DecodeCodeSection(d, mg, &env))
         return nullptr;
 
     if (!DecodeModuleTail(d, &env))
--- a/js/src/wasm/WasmGenerator.h
+++ b/js/src/wasm/WasmGenerator.h
@@ -139,20 +139,20 @@ class CompileTask
     }
     jit::MacroAssembler& masm() {
         return *masm_;
     }
     FuncCompileUnitVector& units() {
         return units_;
     }
     Tier tier() const {
-        return env_.tier;
+        return env_.tier();
     }
     CompileMode mode() const {
-        return env_.mode;
+        return env_.mode();
     }
     bool debugEnabled() const {
         return env_.debug == DebugEnabled::True;
     }
     bool reset() {
         units_.clear();
         masm_.reset();
         alloc_.reset();
@@ -238,18 +238,18 @@ class MOZ_STACK_CLASS ModuleGenerator
     MOZ_MUST_USE bool compileFuncDef(uint32_t funcIndex, uint32_t lineOrBytecode,
                                      Bytes&& bytes, const uint8_t* begin, const uint8_t* end,
                                      Uint32Vector&& lineNums);
 
     MOZ_MUST_USE bool initAsmJS(Metadata* asmJSMetadata);
     MOZ_MUST_USE bool initWasm();
 
     bool isAsmJS() const { return env_->isAsmJS(); }
-    Tier tier() const { return env_->tier; }
-    CompileMode mode() const { return env_->mode; }
+    Tier tier() const { return env_->tier(); }
+    CompileMode mode() const { return env_->mode(); }
     bool debugEnabled() const { return env_->debugEnabled(); }
 
   public:
     ModuleGenerator(const CompileArgs& args, ModuleEnvironment* env,
                     Atomic<bool>* cancelled, UniqueChars* error);
     ~ModuleGenerator();
 
     MOZ_MUST_USE bool init(Metadata* maybeAsmJSMetadata = nullptr);
--- a/js/src/wasm/WasmValidate.cpp
+++ b/js/src/wasm/WasmValidate.cpp
@@ -55,17 +55,17 @@ Decoder::fail(size_t errorOffset, const 
         return false;
 
     *error_ = Move(strWithOffset);
     return false;
 }
 
 bool
 Decoder::startSection(SectionId id, ModuleEnvironment* env, uint32_t* sectionStart,
-                      uint32_t* sectionSize, const char* sectionName)
+                      uint32_t* sectionSize, const char* sectionName, bool peeking)
 {
     // Record state at beginning of section to allow rewinding to this point
     // if, after skipping through several custom sections, we don't find the
     // section 'id'.
     const uint8_t* const initialCur = cur_;
     const size_t initialCustomSectionsLength = env->customSections.length();
 
     // Maintain a pointer to the current section that gets updated as custom
@@ -80,45 +80,65 @@ Decoder::startSection(SectionId id, Modu
 
     while (idValue != uint8_t(id)) {
         if (idValue != uint8_t(SectionId::Custom))
             goto rewind;
 
         // Rewind to the beginning of the current section since this is what
         // skipCustomSection() assumes.
         cur_ = currentSectionStart;
-        if (!skipCustomSection(env))
+        if (!skipCustomSection(env)) {
+            if (peeking)
+                goto rewind;
             return false;
+        }
 
         // Having successfully skipped a custom section, consider the next
         // section.
         currentSectionStart = cur_;
         if (!readFixedU8(&idValue))
             goto rewind;
     }
 
     // Found it, now start the section.
 
-    if (!readVarU32(sectionSize) || bytesRemain() < *sectionSize)
+    if (!readVarU32(sectionSize) || bytesRemain() < *sectionSize) {
+        if (peeking)
+            goto rewind;
         goto fail;
+    }
 
     *sectionStart = cur_ - beg_;
+    if (peeking)
+        goto rewind_peeking;
     return true;
 
   rewind:
+    peeking = false;
+  rewind_peeking:
     cur_ = initialCur;
     env->customSections.shrinkTo(initialCustomSectionsLength);
-    *sectionStart = NotStarted;
+    if (!peeking)
+        *sectionStart = NotStarted;
     return true;
 
   fail:
     return failf("failed to start %s section", sectionName);
 }
 
 bool
+Decoder::peekSectionSize(SectionId id, ModuleEnvironment* env, const char* sectionName, uint32_t* sectionSize)
+{
+    uint32_t sectionStart;
+    if (!startSection(id, env, &sectionStart, sectionSize, sectionName, /*peeking=*/true))
+        return false;
+    return sectionStart != NotStarted;
+}
+
+bool
 Decoder::finishSection(uint32_t sectionStart, uint32_t sectionSize, const char* sectionName)
 {
     if (resilientMode_)
         return true;
     if (sectionSize != (cur_ - beg_) - sectionStart)
         return failf("byte size mismatch in %s section", sectionName);
     return true;
 }
--- a/js/src/wasm/WasmValidate.h
+++ b/js/src/wasm/WasmValidate.h
@@ -31,21 +31,24 @@ namespace wasm {
 // ModuleGenerator, the ModuleEnvironment holds state shared between the
 // ModuleGenerator thread and background compile threads. All the threads
 // are given a read-only view of the ModuleEnvironment, thus preventing race
 // conditions.
 
 struct ModuleEnvironment
 {
     // Constant parameters for the entire compilation:
-    const CompileMode         mode;
-    const Tier                tier;
     const DebugEnabled        debug;
     const ModuleKind          kind;
 
+    // Constant parameters determined no later than at the start of the code
+    // section:
+    CompileMode               mode_;
+    Tier                      tier_;
+
     // Module fields filled out incrementally during decoding:
     MemoryUsage               memoryUsage;
     Atomic<uint32_t>          minMemoryLength;
     Maybe<uint32_t>           maxMemoryLength;
     SigWithIdVector           sigs;
     SigWithIdPtrVector        funcSigs;
     Uint32Vector              funcImportGlobalDataOffsets;
     GlobalDescVector          globals;
@@ -54,28 +57,45 @@ struct ModuleEnvironment
     ImportVector              imports;
     ExportVector              exports;
     Maybe<uint32_t>           startFuncIndex;
     ElemSegmentVector         elemSegments;
     DataSegmentVector         dataSegments;
     NameInBytecodeVector      funcNames;
     CustomSectionVector       customSections;
 
+    static const CompileMode UnknownMode = (CompileMode)-1;
+    static const Tier        UnknownTier = (Tier)-1;
+
     explicit ModuleEnvironment(CompileMode mode = CompileMode::Once,
                                Tier tier = Tier::Ion,
                                DebugEnabled debug = DebugEnabled::False,
                                ModuleKind kind = ModuleKind::Wasm)
-      : mode(mode),
-        tier(tier),
-        debug(debug),
+      : debug(debug),
         kind(kind),
+        mode_(mode),
+        tier_(tier),
         memoryUsage(MemoryUsage::None),
         minMemoryLength(0)
     {}
 
+    CompileMode mode() const {
+        MOZ_ASSERT(mode_ != UnknownMode);
+        return mode_;
+    }
+    Tier tier() const {
+        MOZ_ASSERT(tier_ != UnknownTier);
+        return tier_;
+    }
+    void setModeAndTier(CompileMode mode, Tier tier) {
+        MOZ_ASSERT(mode_ == UnknownMode);
+        MOZ_ASSERT(tier_ == UnknownTier);
+        mode_ = mode;
+        tier_ = tier;
+    }
     size_t numTables() const {
         return tables.length();
     }
     size_t numSigs() const {
         return sigs.length();
     }
     size_t numFuncs() const {
         // asm.js pre-reserves a bunch of function index space which is
@@ -541,20 +561,25 @@ class Decoder
     // See "section" description in Encoder.
 
     static const uint32_t NotStarted = UINT32_MAX;
 
     MOZ_MUST_USE bool startSection(SectionId id,
                                    ModuleEnvironment* env,
                                    uint32_t* sectionStart,
                                    uint32_t* sectionSize,
-                                   const char* sectionName);
+                                   const char* sectionName,
+                                   bool peeking = false);
     MOZ_MUST_USE bool finishSection(uint32_t sectionStart,
                                     uint32_t sectionSize,
                                     const char* sectionName);
+    MOZ_MUST_USE bool peekSectionSize(SectionId id,
+                                      ModuleEnvironment* env,
+                                      const char* sectionName,
+                                      uint32_t* sectionSize);
 
     // Custom sections do not cause validation errors unless the error is in
     // the section header itself.
 
     MOZ_MUST_USE bool startCustomSection(const char* expected,
                                          size_t expectedLength,
                                          ModuleEnvironment* env,
                                          uint32_t* sectionStart,