Bug 1316820 - [WASM] The way stack locals are initialized has been optimized. r?lth draft
author Michelangelo De Simone <mdesimone@mozilla.com>
Fri, 17 Mar 2017 17:30:34 -0700
changeset 501763 145e47885a227a481e191381e534a37deabde5de
parent 487583 c7b015c488cfb2afbcff295a9639acd85df332f8
child 550001 4f9c28ebfa503e99f905dea36b5d56d3d6bd6cf5
push id 50117
push user bmo:mdesimone@mozilla.com
push date Mon, 20 Mar 2017 23:20:56 +0000
reviewers lth
bugs 1316820
milestone 54.0a1
Bug 1316820 - [WASM] The way stack locals are initialized has been optimized. r?lth MozReview-Commit-ID: 8llQmMPfGeB
js/src/wasm/WasmBaselineCompile.cpp
--- a/js/src/wasm/WasmBaselineCompile.cpp
+++ b/js/src/wasm/WasmBaselineCompile.cpp
@@ -556,16 +556,17 @@ class BaseCompiler
                  TempAllocator* alloc,
                  MacroAssembler* masm);
 
     MOZ_MUST_USE bool init();
 
     FuncOffsets finish();
 
     MOZ_MUST_USE bool emitFunction();
+    void emitInitStackLocals();
 
     // Used by some of the ScratchRegister implementations.
     operator MacroAssembler&() const { return masm; }
 
 #ifdef DEBUG
     bool scratchRegisterTaken() const {
         return scratchRegisterTaken_;
     }
@@ -622,16 +623,24 @@ class BaseCompiler
     void storeToFrameF64(FloatRegister r, int32_t offset) {
         masm.storeDouble(r, Address(StackPointer, localOffsetToSPOffset(offset)));
     }
 
     void storeToFrameF32(FloatRegister r, int32_t offset) {
         masm.storeFloat32(r, Address(StackPointer, localOffsetToSPOffset(offset)));
     }
 
+    // Stores the pointer-sized register `r` into the frame slot named by the
+    // local offset `offset` (translated to an SP-relative address by
+    // localOffsetToSPOffset).  storePtr writes exactly one machine word on
+    // every target, so no JS_64BIT switch or Register64 wrapper is required.
+    void storeToFrameWord(Register r, int32_t offset) {
+        masm.storePtr(r, Address(StackPointer, localOffsetToSPOffset(offset)));
+    }
+
     void loadFromFrameI32(Register r, int32_t offset) {
         masm.load32(Address(StackPointer, localOffsetToSPOffset(offset)), r);
     }
 
     void loadFromFrameI64(Register64 r, int32_t offset) {
         masm.load64(Address(StackPointer, localOffsetToSPOffset(offset)), r);
     }
 
@@ -2172,36 +2181,18 @@ class BaseCompiler
                 if (i->argInRegister())
                     storeToFrameF32(i->fpu(), l.offs());
                 break;
               default:
                 MOZ_CRASH("Function argument type");
             }
         }
 
-        // Initialize the stack locals to zero.
-        //
-        // The following are all Bug 1316820:
-        //
-        // TODO / OPTIMIZE: on x64, at least, scratch will be a 64-bit
-        // register and we can move 64 bits at a time.
-        //
-        // TODO / OPTIMIZE: On SSE2 or better SIMD systems we may be
-        // able to store 128 bits at a time.  (I suppose on some
-        // systems we have 512-bit SIMD for that matter.)
-        //
-        // TODO / OPTIMIZE: if we have only one initializing store
-        // then it's better to store a zero literal, probably.
-
-        if (varLow_ < varHigh_) {
-            ScratchI32 scratch(*this);
-            masm.mov(ImmWord(0), scratch);
-            for (int32_t i = varLow_ ; i < varHigh_ ; i += 4)
-                storeToFrameI32(scratch, i + 4);
-        }
+        if (varHigh_ != varLow_)
+            emitInitStackLocals();
 
         if (debugEnabled_)
             insertBreakablePoint(CallSiteDesc::EnterFrame);
     }
 
     void saveResult() {
         MOZ_ASSERT(debugEnabled_);
         size_t debugFrameOffset = masm.framePushed() - DebugFrame::offsetOfFrame();
@@ -7313,16 +7304,99 @@ BaseCompiler::emitFunction()
         return false;
 
     if (!endFunction())
         return false;
 
     return true;
 }
 
+void
+BaseCompiler::emitInitStackLocals()
+{
+    // Emits code that zeroes the stack-local area: frame offsets in
+    // (varLow_, varHigh_], which the caller guarantees is non-empty.  A slot
+    // is named by the frame offset of its far edge, so the K-byte slot that
+    // begins at offset `o` is stored through localOffsetToSPOffset(o + K).
+    MOZ_ASSERT(varLow_ < varHigh_, "there should be stack locals to initialize");
+
+    const int32_t wordsize = sizeof(void*);
+    const int32_t unroll_limit = 32;
+
+    // The code this replaces zeroed the area with 32-bit stores, so assume only
+    // 4-byte granularity: peel a leading and/or trailing 32-bit slot so that
+    // the word-sized stores below cover exactly [low, high) and never spill
+    // outside the area.  (With 4-byte words and offsets neither peel fires.)
+    int32_t low = varLow_;
+    int32_t high = varHigh_;
+    if (low % wordsize) {
+        masm.store32(Imm32(0), Address(StackPointer, localOffsetToSPOffset(low + 4)));
+        low += 4;
+    }
+    if ((high - low) % wordsize) {
+        masm.store32(Imm32(0), Address(StackPointer, localOffsetToSPOffset(high)));
+        high -= 4;
+    }
+
+    const int32_t init_words = (high - low) / wordsize;
+    const int32_t tail_words = init_words % unroll_limit;
+    if (init_words == 0)
+        return;
+
+    // A single word is cheapest as an immediate store; no register needed.
+    if (init_words == 1) {
+        masm.storePtr(ImmWord(0), Address(StackPointer, localOffsetToSPOffset(low + wordsize)));
+        return;
+    }
+
+    // Everything else stores from a register holding a word-sized zero.
+    Register zero = needI32();
+    masm.mov(ImmWord(0), zero);
+
+    // Fewer than unroll_limit words: straight-line stores, no loop overhead.
+    if (init_words < unroll_limit) {
+        for (int32_t i = low; i < high; i += wordsize)
+            storeToFrameWord(zero, i + wordsize);
+
+        freeGPR(zero);
+        return;
+    }
+
+    // Otherwise: a loop of unroll_limit stores per trip, then a tail of
+    // tail_words stores.  p starts at the first slot (offset low + wordsize)
+    // and steps to later slots by subtracting from the address.
+    Register p = needI32();
+    masm.computeEffectiveAddress(Address(StackPointer, localOffsetToSPOffset(low + wordsize)), p);
+
+    // lim is the value p holds once every *full* group has been stored.  It
+    // must not be derived from varHigh_ directly: the body always writes
+    // unroll_limit words, so a bound at the last slot lets the final trip run
+    // past the local area whenever tail_words > 1.
+    const int32_t loop_bytes = (init_words - tail_words) * wordsize;
+    RegI32 lim = needI32();
+    masm.computeEffectiveAddress(Address(StackPointer, localOffsetToSPOffset(low + wordsize + loop_bytes)), lim);
+
+    Label again;
+    masm.bind(&again);
+    for (int32_t i = 0; i < unroll_limit; ++i)
+        masm.storePtr(zero, Address(p, -(wordsize * i)));
+    masm.subPtr(Imm32(unroll_limit * wordsize), p);
+
+    // Addresses are unsigned: loop while lim is below p; it exits with p == lim.
+    masm.branchPtr(Assembler::Below, lim, p, &again);
+
+    // The tail: the remaining tail_words slots, continuing from p.
+    for (int32_t i = 0; i < tail_words; ++i)
+        masm.storePtr(zero, Address(p, -(wordsize * i)));
+
+    freeGPR(p);
+    freeGPR(lim);
+    freeGPR(zero);
+}
+
 BaseCompiler::BaseCompiler(const ModuleEnvironment& env,
                            Decoder& decoder,
                            const FuncBytes& func,
                            const ValTypeVector& locals,
                            bool debugEnabled,
                            TempAllocator* alloc,
                            MacroAssembler* masm)
     : env_(env),