Bug 1442540 - Empirical ARM64 policy values for wasm tiering. r=luke
author      Lars T Hansen <lhansen@mozilla.com>
date        Tue, 08 Jan 2019 20:43:01 +0100
changeset   510131  16819308720e1aa1228ff31f84bd233599fb5e23
parent      510115  a6cc9b822c1c1051961f8a293bc3e0331b8739d6
child       510132  0985217ed619be2fc84eeefc895321f4c88dc21e
reviewers   luke
bugs        1442540
milestone   66.0a1
Bug 1442540 - Empirical ARM64 policy values for wasm tiering. r=luke

These values were computed by measuring compilations of a set of programs on
ARM64 reference hardware using --no-threads and then applying sane multipliers
to estimate Ion values where we don't yet have them (because we don't have Ion
on ARM64). See the bug for a table of raw data. See comments in the code for
more information about what the values mean.
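To make the multipliers concrete, the derived ARM64 constants in the patch below
work out as follows (the raw Baseline measurements themselves are tabulated in
the bug and are not reproduced here):

  arm64IonBytesPerBytecode = 3.0 / 1.4                  // measured Baseline ratio scaled for optimized code, ~2.14
  arm64IonBytecodesPerMs   = (SoftIron Baseline rate) / 5 / 2 = 750
  arm64MobileTierCutoff    = 750 * 250 ms = 187,500     // compared against code size per effective core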
js/src/wasm/WasmCompile.cpp
--- a/js/src/wasm/WasmCompile.cpp
+++ b/js/src/wasm/WasmCompile.cpp
@@ -157,33 +157,35 @@ static SystemClass ClassifySystem() {
     return SystemClass::MobileUnknown64;
 #else
     return SystemClass::MobileUnknown32;
 #endif
   }
 }
 
 // Code sizes in machine code bytes per bytecode byte, again empirical except
-// where marked as "Guess".
+// where marked.
+//
+// The Ion estimate for ARM64 is the measured Baseline value scaled by a
+// plausible factor for optimized code.
 
 static const double x64Tox86Inflation = 1.25;
 
 static const double x64IonBytesPerBytecode = 2.45;
 static const double x86IonBytesPerBytecode =
     x64IonBytesPerBytecode * x64Tox86Inflation;
 static const double arm32IonBytesPerBytecode = 3.3;
-static const double arm64IonBytesPerBytecode = 3.0;  // Guess
+static const double arm64IonBytesPerBytecode = 3.0 / 1.4;  // Estimate
 
 static const double x64BaselineBytesPerBytecode = x64IonBytesPerBytecode * 1.43;
 static const double x86BaselineBytesPerBytecode =
     x64BaselineBytesPerBytecode * x64Tox86Inflation;
 static const double arm32BaselineBytesPerBytecode =
     arm32IonBytesPerBytecode * 1.39;
-static const double arm64BaselineBytesPerBytecode =
-    arm64IonBytesPerBytecode * 1.39;  // Guess
+static const double arm64BaselineBytesPerBytecode = 3.0;
 
 static double OptimizedBytesPerBytecode(SystemClass cls) {
   switch (cls) {
     case SystemClass::DesktopX86:
     case SystemClass::MobileX86:
     case SystemClass::DesktopUnknown32:
       return x86IonBytesPerBytecode;
     case SystemClass::DesktopX64:
@@ -234,35 +236,42 @@ double wasm::EstimateCompiledCodeSize(Ti
 // If parallel Ion compilation is going to take longer than this, we should
 // tier.
 
 static const double tierCutoffMs = 250;
 
 // Compilation rate values are empirical except when noted, the reference
 // systems are:
 //
-// Late-2013 MacBook Pro (2.6GHz quad hyperthreaded Haswell)
-// Late-2015 Nexus 5X (1.4GHz quad Cortex-A53 + 1.8GHz dual Cortex-A57)
+// Late-2013 MacBook Pro (2.6GHz 4 x hyperthreaded Haswell, Mac OS X)
+// Late-2015 Nexus 5X (1.4GHz 4 x Cortex-A53 + 1.8GHz 2 x Cortex-A57, Android)
+// Ca-2016 SoftIron Overdrive 1000 (1.7GHz 4 x Cortex-A57, Fedora)
+//
+// The rates are always per core.
+//
+// The estimate for ARM64 is the Baseline compilation rate on the SoftIron
+// (because we have no Ion yet), divided by 5 to estimate Ion compile rate and
+// then divided by 2 to make it more reasonable for consumer ARM64 systems.
 
-static const double x64BytecodesPerMs = 2100;
-static const double x86BytecodesPerMs = 1500;
-static const double arm32BytecodesPerMs = 450;
-static const double arm64BytecodesPerMs = 650;  // Guess
+static const double x64IonBytecodesPerMs = 2100;
+static const double x86IonBytecodesPerMs = 1500;
+static const double arm32IonBytecodesPerMs = 450;
+static const double arm64IonBytecodesPerMs = 750;  // Estimate
 
 // Tiering cutoff values: if code section sizes are below these values (when
 // divided by the effective number of cores) we do not tier, because we guess
 // that parallel Ion compilation will be fast enough.
 
-static const double x64DesktopTierCutoff = x64BytecodesPerMs * tierCutoffMs;
-static const double x86DesktopTierCutoff = x86BytecodesPerMs * tierCutoffMs;
+static const double x64DesktopTierCutoff = x64IonBytecodesPerMs * tierCutoffMs;
+static const double x86DesktopTierCutoff = x86IonBytecodesPerMs * tierCutoffMs;
 static const double x86MobileTierCutoff = x86DesktopTierCutoff / 2;  // Guess
-static const double arm32MobileTierCutoff = arm32BytecodesPerMs * tierCutoffMs;
-static const double arm64MobileTierCutoff = arm64BytecodesPerMs * tierCutoffMs;
+static const double arm32MobileTierCutoff = arm32IonBytecodesPerMs * tierCutoffMs;
+static const double arm64MobileTierCutoff = arm64IonBytecodesPerMs * tierCutoffMs;
 
-static double CodesizeCutoff(SystemClass cls, uint32_t codeSize) {
+static double CodesizeCutoff(SystemClass cls) {
   switch (cls) {
     case SystemClass::DesktopX86:
     case SystemClass::DesktopUnknown32:
       return x86DesktopTierCutoff;
     case SystemClass::DesktopX64:
     case SystemClass::DesktopUnknown64:
       return x64DesktopTierCutoff;
     case SystemClass::MobileX86:
@@ -330,17 +339,17 @@ static bool TieringBeneficial(uint32_t c
 
   uint32_t cores = Min(cpuCount, workers);
 
   SystemClass cls = ClassifySystem();
 
   // Ion compilation on available cores must take long enough to be worth the
   // bother.
 
-  double cutoffSize = CodesizeCutoff(cls, codeSize);
+  double cutoffSize = CodesizeCutoff(cls);
   double effectiveCores = EffectiveCores(cls, cores);
 
   if ((codeSize / effectiveCores) < cutoffSize) {
     return false;
   }
 
   // Do not implement a size cutoff for 64-bit systems since the code size
   // budget for 64 bit is so large that it will hardly ever be an issue.
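For illustration, here is a minimal standalone sketch (not part of the patch) of
how the new ARM64 constants feed the per-core size test in TieringBeneficial()
above. The 1 MiB code section and quad-core CPU are hypothetical values chosen
for the example, and the raw core count stands in for EffectiveCores(), whose
definition is outside the hunks shown here:

  // Standalone sketch: ARM64 tiering size test with the patch's constants.
  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  int main() {
    // Constants from the patch.
    const double tierCutoffMs = 250;
    const double arm64IonBytecodesPerMs = 750;  // Estimate
    const double arm64MobileTierCutoff =
        arm64IonBytecodesPerMs * tierCutoffMs;  // 187,500 bytecode bytes

    // Hypothetical inputs for the example.
    const uint32_t codeSize = 1u << 20;  // 1 MiB wasm code section
    const uint32_t cpuCount = 4;         // quad-core ARM64 phone
    const uint32_t workers = 4;          // helper threads available

    // Simplification: use the raw core count where the real code calls
    // EffectiveCores(cls, cores).
    double cores = std::min(cpuCount, workers);

    bool tier = (codeSize / cores) >= arm64MobileTierCutoff;
    printf("per-core size %.0f vs cutoff %.0f -> %s\n", codeSize / cores,
           arm64MobileTierCutoff, tier ? "tier" : "Ion-only");
    return 0;
  }

With these numbers the per-core load is 262,144 bytecode bytes, which exceeds
the 187,500-byte cutoff, so a module of this size would still be tiered on a
quad-core ARM64 phone.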