Bug 1576567 part 1 - Optimize table address loads in interpreter code. r=lth
author: Jan de Mooij <jdemooij@mozilla.com>
Wed, 28 Aug 2019 06:30:56 +0000
changeset 554080 dae1e9839adce6f2129c3593b2c2991dd34a0f35
parent 554079 06a3816a2a49d95c16f13c6cb64ba0f66a95c398
child 554081 feec09fd96eb7c212844f4dce66e78864ca36606
push id: 2165
push user: ffxbld-merge
push date: Mon, 14 Oct 2019 16:30:58 +0000
treeherder: mozilla-release@0eae18af659f [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: lth
bugs: 1576567
milestone: 70.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1576567 part 1 - Optimize table address loads in interpreter code. r=lth

This affects the following platforms:

* x64: use a RIP-relative LEA instead of an immediate MOV. This saves a few
  hundred bytes total and seems to be a little bit faster on interpreter
  micro-benchmarks.
* arm64: use ADR instead of LDR. Seems to be a measurable speedup running
  Speedometer on Pixel 2 with the JITs disabled.

Differential Revision: https://phabricator.services.mozilla.com/D43398
js/src/jit/BaselineCodeGen.cpp
js/src/jit/MacroAssembler.h
js/src/jit/arm/MacroAssembler-arm.cpp
js/src/jit/arm64/MacroAssembler-arm64.cpp
js/src/jit/mips-shared/MacroAssembler-mips-shared.cpp
js/src/jit/x64/MacroAssembler-x64.cpp
js/src/jit/x86/MacroAssembler-x86.cpp
--- a/js/src/jit/BaselineCodeGen.cpp
+++ b/js/src/jit/BaselineCodeGen.cpp
@@ -6904,17 +6904,17 @@ bool BaselineInterpreterGenerator::emitI
   masm.bind(&interpretOpAfterDebugTrap);
 
   // Load pc, bytecode op.
   Register pcReg = LoadBytecodePC(masm, scratch1);
   masm.load8ZeroExtend(Address(pcReg, 0), scratch1);
 
   // Jump to table[op].
   {
-    CodeOffset label = masm.movWithPatch(ImmWord(uintptr_t(-1)), scratch2);
+    CodeOffset label = masm.moveNearAddressWithPatch(scratch2);
     if (!tableLabels_.append(label)) {
       return false;
     }
     BaseIndex pointer(scratch2, scratch1, ScalePointer);
     masm.branchToComputedAddress(pointer);
   }
 
   // At the end of each op, emit code to bump the pc and jump to the
@@ -6945,17 +6945,17 @@ bool BaselineInterpreterGenerator::emitI
     }
 
     if (!emitDebugTrap()) {
       return false;
     }
 
     // Load the opcode, jump to table[op].
     masm.load8ZeroExtend(Address(InterpreterPCRegAtDispatch, 0), scratch1);
-    CodeOffset label = masm.movWithPatch(ImmWord(uintptr_t(-1)), scratch2);
+    CodeOffset label = masm.moveNearAddressWithPatch(scratch2);
     if (!tableLabels_.append(label)) {
       return false;
     }
     BaseIndex pointer(scratch2, scratch1, ScalePointer);
     masm.branchToComputedAddress(pointer);
     return true;
   };
 
@@ -7105,20 +7105,20 @@ bool BaselineInterpreterGenerator::gener
         ReportOutOfMemory(cx);
         return false;
       }
 
       code->setHasBytecodeMap();
     }
 
     // Patch loads now that we know the tableswitch base address.
+    CodeLocationLabel tableLoc(code, CodeOffset(tableOffset_));
     for (CodeOffset off : tableLabels_) {
-      Assembler::PatchDataWithValueCheck(CodeLocationLabel(code, off),
-                                         ImmPtr(code->raw() + tableOffset_),
-                                         ImmPtr((void*)-1));
+      MacroAssembler::patchNearAddressMove(CodeLocationLabel(code, off),
+                                           tableLoc);
     }
 
 #ifdef JS_ION_PERF
     writePerfSpewerJitCodeProfile(code, "BaselineInterpreter");
 #endif
 
 #ifdef MOZ_VTUNE
     vtune::MarkStub(code, "BaselineInterpreter");
--- a/js/src/jit/MacroAssembler.h
+++ b/js/src/jit/MacroAssembler.h
@@ -473,16 +473,28 @@ class MacroAssembler : public MacroAssem
 
   // Emit a nop that can be patched to and from a nop and a call with int32
   // relative displacement.
   CodeOffset nopPatchableToCall(const wasm::CallSiteDesc& desc) PER_SHARED_ARCH;
   static void patchNopToCall(uint8_t* callsite,
                              uint8_t* target) PER_SHARED_ARCH;
   static void patchCallToNop(uint8_t* callsite) PER_SHARED_ARCH;
 
+  // These methods are like movWithPatch/PatchDataWithValueCheck but allow
+  // using pc-relative addressing on certain platforms (RIP-relative LEA on x64,
+  // ADR instruction on arm64).
+  //
+  // Note: "Near" applies to ARM64 where the target must be within 1 MB (this is
+  // release-asserted).
+  CodeOffset moveNearAddressWithPatch(Register dest)
+      DEFINED_ON(x86, x64, arm, arm64, mips_shared);
+  static void patchNearAddressMove(CodeLocationLabel loc,
+                                   CodeLocationLabel target)
+      DEFINED_ON(x86, x64, arm, arm64, mips_shared);
+
  public:
   // ===============================================================
   // [SMDOC] JIT-to-C++ Function Calls (callWithABI)
   //
   // callWithABI is used to make a call using the standard C/C++ system ABI.
   //
   // callWithABI is a low level interface for making calls, as such every call
   // made with callWithABI should be organized with 6 steps: spilling live
--- a/js/src/jit/arm/MacroAssembler-arm.cpp
+++ b/js/src/jit/arm/MacroAssembler-arm.cpp
@@ -5740,16 +5740,25 @@ void MacroAssembler::flexibleDivMod32(Re
 
     LiveRegisterSet ignore;
     ignore.add(remOutput);
     ignore.add(lhsOutput);
     PopRegsInMaskIgnore(volatileLiveRegs, ignore);
   }
 }
 
+CodeOffset MacroAssembler::moveNearAddressWithPatch(Register dest) {
+  return movWithPatch(ImmPtr(nullptr), dest);
+}
+
+void MacroAssembler::patchNearAddressMove(CodeLocationLabel loc,
+                                          CodeLocationLabel target) {
+  PatchDataWithValueCheck(loc, ImmPtr(target.raw()), ImmPtr(nullptr));
+}
+
 // ========================================================================
 // Spectre Mitigations.
 
 void MacroAssembler::speculationBarrier() {
   // Spectre mitigation recommended by ARM for cases where csel/cmov cannot be
   // used.
   as_csdb();
 }
--- a/js/src/jit/arm64/MacroAssembler-arm64.cpp
+++ b/js/src/jit/arm64/MacroAssembler-arm64.cpp
@@ -2018,16 +2018,36 @@ void MacroAssembler::flexibleDivMod32(Re
   } else {
     Sdiv(ARMRegister(srcDest, 32), src, ARMRegister(rhs, 32));
   }
   // Compute remainder
   Mul(scratch, ARMRegister(srcDest, 32), ARMRegister(rhs, 32));
   Sub(ARMRegister(remOutput, 32), src, scratch);
 }
 
+CodeOffset MacroAssembler::moveNearAddressWithPatch(Register dest) {
+  AutoForbidPoolsAndNops afp(this,
+                             /* max number of instructions in scope = */ 1);
+  CodeOffset offset(currentOffset());
+  adr(ARMRegister(dest, 64), 0, LabelDoc());
+  return offset;
+}
+
+void MacroAssembler::patchNearAddressMove(CodeLocationLabel loc,
+                                          CodeLocationLabel target) {
+  ptrdiff_t off = target - loc;
+  MOZ_RELEASE_ASSERT(vixl::IsInt21(off));
+
+  Instruction* cur = reinterpret_cast<Instruction*>(loc.raw());
+  MOZ_ASSERT(cur->IsADR());
+
+  vixl::Register rd = vixl::Register::XRegFromCode(cur->Rd());
+  adr(cur, rd, off);
+}
+
 // ========================================================================
 // Spectre Mitigations.
 
 void MacroAssembler::speculationBarrier() {
   // Conditional speculation barrier.
   csdb();
 }
 
--- a/js/src/jit/mips-shared/MacroAssembler-mips-shared.cpp
+++ b/js/src/jit/mips-shared/MacroAssembler-mips-shared.cpp
@@ -2764,13 +2764,22 @@ void MacroAssembler::flexibleDivMod32(Re
     as_divu(srcDest, rhs);
   } else {
     as_div(srcDest, rhs);
   }
   as_mfhi(remOutput);
   as_mflo(srcDest);
 }
 
+CodeOffset MacroAssembler::moveNearAddressWithPatch(Register dest) {
+  return movWithPatch(ImmPtr(nullptr), dest);
+}
+
+void MacroAssembler::patchNearAddressMove(CodeLocationLabel loc,
+                                          CodeLocationLabel target) {
+  PatchDataWithValueCheck(loc, ImmPtr(target.raw()), ImmPtr(nullptr));
+}
+
 // ========================================================================
 // Spectre Mitigations.
 
 void MacroAssembler::speculationBarrier() { MOZ_CRASH(); }
 //}}} check_macroassembler_style
--- a/js/src/jit/x64/MacroAssembler-x64.cpp
+++ b/js/src/jit/x64/MacroAssembler-x64.cpp
@@ -1049,9 +1049,21 @@ void MacroAssembler::atomicExchange64(co
 }
 
 void MacroAssembler::atomicFetchOp64(const Synchronization& sync, AtomicOp op,
                                      Register64 value, const Address& mem,
                                      Register64 temp, Register64 output) {
   AtomicFetchOp64(*this, nullptr, op, value.reg, mem, temp.reg, output.reg);
 }
 
+CodeOffset MacroAssembler::moveNearAddressWithPatch(Register dest) {
+  return leaRipRelative(dest);
+}
+
+void MacroAssembler::patchNearAddressMove(CodeLocationLabel loc,
+                                          CodeLocationLabel target) {
+  ptrdiff_t off = target - loc;
+  MOZ_ASSERT(off > ptrdiff_t(INT32_MIN));
+  MOZ_ASSERT(off < ptrdiff_t(INT32_MAX));
+  PatchWrite_Imm32(loc, Imm32(off));
+}
+
 //}}} check_macroassembler_style
--- a/js/src/jit/x86/MacroAssembler-x86.cpp
+++ b/js/src/jit/x86/MacroAssembler-x86.cpp
@@ -1238,9 +1238,18 @@ void MacroAssembler::convertInt64ToFloat
 
   fstp32(Operand(esp, 0));
   vmovss(Address(esp, 0), output);
   freeStack(2 * sizeof(intptr_t));
 }
 
 void MacroAssembler::PushBoxed(FloatRegister reg) { Push(reg); }
 
+CodeOffset MacroAssembler::moveNearAddressWithPatch(Register dest) {
+  return movWithPatch(ImmPtr(nullptr), dest);
+}
+
+void MacroAssembler::patchNearAddressMove(CodeLocationLabel loc,
+                                          CodeLocationLabel target) {
+  PatchDataWithValueCheck(loc, ImmPtr(target.raw()), ImmPtr(nullptr));
+}
+
 //}}} check_macroassembler_style