Bug 1096684 - IonMonkey: Optimize with SSE4.1 insertps and blendps r=bbouvier
author Dan Gohman <sunfish@mozilla.com>
Wed, 12 Nov 2014 12:38:32 -0800
changeset 215258 191a52db5011606e73899b3782929f6a5b237419
parent 215257 9bd6e299c84a9dbaf3468866380609db76d475d6
child 215259 cf5beb0f5d259693e71046f40b3997f13c2ac079
push id 51728
push user dgohman@mozilla.com
push date Wed, 12 Nov 2014 20:41:59 +0000
treeherder mozilla-inbound@4c04203003c2 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers bbouvier
bugs 1096684
milestone 36.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1096684 - IonMonkey: Optimize with SSE4.1 insertps and blendps r=bbouvier
js/src/jit-test/tests/asm.js/testSIMD.js
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/BaseAssembler-x86-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
--- a/js/src/jit-test/tests/asm.js/testSIMD.js
+++ b/js/src/jit-test/tests/asm.js/testSIMD.js
@@ -901,17 +901,26 @@ const LANE_SELECTORS = [
     [0, 1, 0, 1],
     [0, 0, 1, 1],
     [2, 2, 3, 3],
     // Impl-specific special cases for shuffle (case and swapped case)
     [2, 3, 6, 7], [6, 7, 2, 3],
     [0, 1, 4, 5], [4, 5, 0, 1],
     [0, 4, 1, 5], [4, 0, 5, 1],
     [2, 6, 3, 7], [6, 2, 7, 3],
-    [4, 1, 2, 3], [0, 5, 6, 7]
+    [4, 1, 2, 3], [0, 5, 6, 7],
+    // Insert one element from rhs into lhs keeping other elements unchanged
+    [7, 1, 2, 3], // rhs lane 3 inserted into lane x
+    [0, 7, 2, 3], // rhs lane 3 inserted into lane y
+    [0, 1, 7, 2], // NOTE(review): w is 2 here, so lane w is not left unchanged; [0, 1, 7, 3] may have been intended, and no entry inserts into lane w - confirm
+    // These are effectively vector selects
+    [0, 5, 2, 3], // lane y taken from rhs, the rest from lhs
+    [0, 1, 6, 3], // lane z taken from rhs, the rest from lhs
+    [4, 5, 2, 3], // lanes x and y taken from rhs
+    [4, 1, 6, 3] // lanes x and z taken from rhs
 ];
 
 for (var lanes of LANE_SELECTORS) {
     CheckI4('var shuffle=i4.shuffle;', 'var x=i4(1,2,3,4); var y=i4(5,6,7,8); x=shuffle(x, y, ' + lanes.join(',') + ')', shuffle([1,2,3,4], [5,6,7,8], lanes));
     CheckF4('var shuffle=f4.shuffle;', 'var x=f4(1,2,3,4); var y=f4(5,6,7,8); x=shuffle(x, y, ' + lanes.join(',') + ')', shuffle([1,2,3,4], [5,6,7,8], lanes));
 }
 DEBUG && print('time for checking all shuffles:', Date.now() - before);
 
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -2152,16 +2152,36 @@ class AssemblerX86Shared : public Assemb
         ret |= unsigned(sourceLane) << 6;
         MOZ_ASSERT(ret < 256);
         return ret;
     }
     void insertps(FloatRegister src, FloatRegister dest, unsigned mask) {
         MOZ_ASSERT(HasSSE41());
         masm.insertps_irr(mask, src.code(), dest.code());
     }
+    unsigned blendpsMask(bool x, bool y, bool z, bool w) { // Packs four per-lane flags into the 4-bit immediate taken by blendps().
+        return x | (y << 1) | (z << 2) | (w << 3); // x -> bit 0, y -> bit 1, z -> bit 2, w -> bit 3
+    }
+    void blendps(FloatRegister src, FloatRegister dest, unsigned mask) { // Register-source form; mask is forwarded unchanged as the immediate.
+        MOZ_ASSERT(HasSSE41()); // blendps is an SSE4.1 instruction
+        masm.blendps_irr(mask, src.code(), dest.code());
+    }
+    void blendps(const Operand &src, FloatRegister dest, unsigned mask) { // Operand form: chooses the encoder from src.kind().
+        MOZ_ASSERT(HasSSE41()); // blendps is an SSE4.1 instruction
+        switch (src.kind()) {
+          case Operand::FPREG: // register source
+            masm.blendps_irr(mask, src.fpu(), dest.code());
+            break;
+          case Operand::MEM_REG_DISP: // memory source given as base register plus displacement
+            masm.blendps_imr(mask, src.disp(), src.base(), dest.code());
+            break;
+          default: // every other operand kind is rejected by this overload
+            MOZ_CRASH("unexpected operand kind");
+        }
+    }
     void minsd(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.minsd_rr(src.code(), dest.code());
     }
     void minsd(const Operand &src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src.kind()) {
           case Operand::FPREG:
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@@ -375,26 +375,28 @@ private:
         OP2_PSLLD_VdqWdq    = 0xF2,
         OP2_PSUBD_VdqWdq    = 0xFA,
         OP2_PADDD_VdqWdq    = 0xFE
     } TwoByteOpcodeID;
 
     typedef enum {
         OP3_ROUNDSS_VsdWsd  = 0x0A,
         OP3_ROUNDSD_VsdWsd  = 0x0B,
+        OP3_BLENDPS_VpsWpsIb = 0x0C, // opcode byte used by blendps_irr / blendps_imr
         OP3_PTEST_VdVd      = 0x17,
         OP3_INSERTPS_VpsUps = 0x21,
         OP3_PINSRD_VdqEdIb  = 0x22
     } ThreeByteOpcodeID;
 
     typedef enum {
         ESCAPE_PTEST        = 0x38,
         ESCAPE_PINSRD       = 0x3A,
         ESCAPE_ROUNDSD      = 0x3A,
-        ESCAPE_INSERTPS     = 0x3A
+        ESCAPE_INSERTPS     = 0x3A,
+        ESCAPE_BLENDPS      = 0x3A // same escape byte as the pinsrd, roundsd and insertps entries
     } ThreeByteEscape;
 
     TwoByteOpcodeID jccRel32(Condition cond)
     {
         return (TwoByteOpcodeID)(OP2_JCC_rel32 + cond);
     }
 
     TwoByteOpcodeID setccOpcode(Condition cond)
@@ -3692,41 +3694,60 @@ public:
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_ROUNDSS_VsdWsd, ESCAPE_ROUNDSD, (RegisterID)dst, (RegisterID)src);
         m_formatter.immediate8(mode); // modes are the same for roundsd and roundss
     }
 
     void insertps_irr(unsigned mask, XMMRegisterID src, XMMRegisterID dst)
     {
         MOZ_ASSERT(mask < 256);
-        spew("insertps   $%u, %s, %s", mask, nameFPReg(src), nameFPReg(dst));
+        spew("insertps   $%x, %s, %s", mask, nameFPReg(src), nameFPReg(dst)); // mask is now logged in hex (no 0x prefix) instead of decimal
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_INSERTPS_VpsUps, ESCAPE_INSERTPS, (RegisterID)dst, (RegisterID)src);
         m_formatter.immediate8(uint8_t(mask));
     }
 
     void pinsrd_irr(unsigned lane, RegisterID src, XMMRegisterID dst)
     {
         MOZ_ASSERT(lane < 4);
-        spew("pinsrd     $%u, %s, %s", lane, nameIReg(src), nameFPReg(dst));
+        spew("pinsrd     $%x, %s, %s", lane, nameIReg(src), nameFPReg(dst)); // lane is now logged with %x; identical output to %u for the asserted range 0..3
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, (RegisterID)dst, (RegisterID)src);
         m_formatter.immediate8(uint8_t(lane));
     }
 
     void pinsrd_imr(unsigned lane, int offset, RegisterID base, XMMRegisterID dst)
     {
         MOZ_ASSERT(lane < 4);
-        spew("pinsrd     $%u, %s0x%x(%s), %s", lane, PRETTY_PRINT_OFFSET(offset),
+        spew("pinsrd     $%x, %s0x%x(%s), %s", lane, PRETTY_PRINT_OFFSET(offset), // lane is now logged with %x; identical output to %u for the asserted range 0..3
              nameIReg(base), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, (RegisterID)dst, base, offset);
         m_formatter.immediate8(uint8_t(lane));
     }
 
+    void blendps_irr(unsigned imm, XMMRegisterID src, XMMRegisterID dst) // Emits blendps with a register source and an immediate lane mask.
+    {
+        MOZ_ASSERT(imm < 16); // only a 4-bit immediate is accepted
+        spew("blendps    $%x, %s, %s", imm, nameFPReg(src), nameFPReg(dst)); // immediate is logged in hex
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, (RegisterID)dst, (RegisterID)src); // dst is passed before src, as in insertps_irr
+        m_formatter.immediate8(uint8_t(imm)); // the immediate byte is emitted last
+    }
+
+    void blendps_imr(unsigned imm, int offset, RegisterID base, XMMRegisterID dst) // Emits blendps with a memory source at offset(base) and an immediate lane mask.
+    {
+        MOZ_ASSERT(imm < 16); // only a 4-bit immediate is accepted
+        spew("blendps    $%x, %s0x%x(%s), %s", imm, PRETTY_PRINT_OFFSET(offset), nameIReg(base), // immediate is logged in hex
+             nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, (RegisterID)dst, base, offset); // memory operand given as base register plus offset
+        m_formatter.immediate8(uint8_t(imm)); // the immediate byte is emitted last
+    }
+
     void minsd_rr(XMMRegisterID src, XMMRegisterID dst)
     {
         spew("minsd      %s, %s", nameFPReg(src), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_F2);
         m_formatter.twoByteOp(OP2_MINSD_VsdWsd, (RegisterID)dst, (RegisterID)src);
     }
 
     void minsd_mr(int offset, RegisterID base, XMMRegisterID dst)
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -2470,25 +2470,57 @@ CodeGeneratorX86Shared::visitSimdShuffle
     // - T: temporary
 
     uint32_t mask;
 
     // If all lanes came from a single vector, we should have constructed a
     // MSimdSwizzle instead.
     MOZ_ASSERT(numLanesFromLHS < 4);
 
+    // If all values stay in their lane, this is a blend.
+    if (AssemblerX86Shared::HasSSE41()) {
+        if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
+            masm.blendps(rhs, out, masm.blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4));
+            return true;
+        }
+    }
+
     // One element of the second, all other elements of the first
     if (numLanesFromLHS == 3) {
         unsigned firstMask = -1, secondMask = -1;
 
+        // register-register movss preserves the high lanes.
         if (ins->lanesMatch(4, 1, 2, 3)) {
             masm.movss(rhs, out);
             return true;
         }
 
+        // SSE4.1 insertps can handle any single element.
+        unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
+        if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
+            SimdLane srcLane;
+            SimdLane dstLane;
+            if (x >= 4) {
+                srcLane = SimdLane(x - 4);
+                dstLane = LaneX;
+            } else if (y >= 4) {
+                srcLane = SimdLane(y - 4);
+                dstLane = LaneY;
+            } else if (z >= 4) {
+                srcLane = SimdLane(z - 4);
+                dstLane = LaneZ;
+            } else {
+                MOZ_ASSERT(w >= 4);
+                srcLane = SimdLane(w - 4);
+                dstLane = LaneW;
+            }
+            masm.insertps(rhs, out, masm.insertpsMask(srcLane, dstLane));
+            return true;
+        }
+
         FloatRegister rhsCopy = ToFloatRegister(ins->temp());
 
         if (x < 4 && y < 4) {
             if (w >= 4) {
                 w %= 4;
                 // T = (Rw Rw Lz Lz) = shufps(firstMask, lhs, rhs)
                 firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
                 // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = shufps(secondMask, T, lhs)