Bug 1025100: Use INSERTPS for inserting float elements when SSE4.1 is available; r=sunfish
author: Benjamin Bouvier <benj@benj.me>
Fri, 19 Sep 2014 14:56:05 +0200
changeset 206246 02e8c6942c85da6d1dae60a4455c9f87687486db
parent 206245 d418a4d0f8d5840b51c875599926074ee120ef97
child 206247 4df302f6b71995dd1a9010dc1bfecf1639e6f810
push id: 8859
push user: ryanvm@gmail.com
push date: Fri, 19 Sep 2014 18:19:29 +0000
treeherder: fx-team@42f7ab962b31 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: sunfish
bugs: 1025100
milestone: 35.0a1
Bug 1025100: Use INSERTPS for inserting float elements when SSE4.1 is available; r=sunfish
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/BaseAssembler-x86-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -1414,21 +1414,21 @@ class AssemblerX86Shared : public Assemb
         masm.divl_r(divisor.code());
     }
 
     void unpcklps(FloatRegister src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.unpcklps_rr(src.code(), dest.code());
     }
     void pinsrd(unsigned lane, Register src, FloatRegister dest) {
-        JS_ASSERT(HasSSE2());
+        JS_ASSERT(HasSSE41());
         masm.pinsrd_irr(lane, src.code(), dest.code());
     }
     void pinsrd(unsigned lane, const Operand &src, FloatRegister dest) {
-        JS_ASSERT(HasSSE2());
+        JS_ASSERT(HasSSE41());
         switch (src.kind()) {
           case Operand::REG:
             masm.pinsrd_irr(lane, src.reg(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
             masm.pinsrd_imr(lane, src.disp(), src.base(), dest.code());
             break;
           default:
@@ -1953,28 +1953,39 @@ class AssemblerX86Shared : public Assemb
     void sqrtsd(FloatRegister src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.sqrtsd_rr(src.code(), dest.code());
     }
     void sqrtss(FloatRegister src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.sqrtss_rr(src.code(), dest.code());
     }
-    void roundsd(FloatRegister src, FloatRegister dest,
-                 X86Assembler::RoundingMode mode)
-    {
+    void roundsd(FloatRegister src, FloatRegister dest, X86Assembler::RoundingMode mode) {
         JS_ASSERT(HasSSE41());
         masm.roundsd_rr(src.code(), dest.code(), mode);
     }
-    void roundss(FloatRegister src, FloatRegister dest,
-                 X86Assembler::RoundingMode mode)
-    {
+    void roundss(FloatRegister src, FloatRegister dest, X86Assembler::RoundingMode mode) {
         JS_ASSERT(HasSSE41());
         masm.roundss_rr(src.code(), dest.code(), mode);
     }
+    unsigned insertpsMask(SimdLane sourceLane, SimdLane destLane, unsigned zeroMask = 0)
+    {
+        // Note that the sourceLane bits are ignored in the case of a source
+        // memory operand, and the source is the given 32-bits memory location.
+        MOZ_ASSERT(zeroMask < 16);
+        unsigned ret = zeroMask ;
+        ret |= unsigned(destLane) << 4;
+        ret |= unsigned(sourceLane) << 6;
+        MOZ_ASSERT(ret < 256);
+        return ret;
+    }
+    void insertps(FloatRegister src, FloatRegister dest, unsigned mask) {
+        JS_ASSERT(HasSSE41());
+        masm.insertps_irr(mask, src.code(), dest.code());
+    }
     void minsd(FloatRegister src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.minsd_rr(src.code(), dest.code());
     }
     void minsd(const Operand &src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         switch (src.kind()) {
           case Operand::FPREG:
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@@ -347,23 +347,25 @@ private:
         OP2_PSUBD_VdqWdq    = 0xFA,
         OP2_PADDD_VdqWdq    = 0xFE
     } TwoByteOpcodeID;
 
     typedef enum {
         OP3_ROUNDSS_VsdWsd  = 0x0A,
         OP3_ROUNDSD_VsdWsd  = 0x0B,
         OP3_PTEST_VdVd      = 0x17,
+        OP3_INSERTPS_VpsUps = 0x21,
         OP3_PINSRD_VdqEdIb  = 0x22
     } ThreeByteOpcodeID;
 
     typedef enum {
         ESCAPE_PTEST        = 0x38,
         ESCAPE_PINSRD       = 0x3A,
-        ESCAPE_ROUNDSD      = 0x3A
+        ESCAPE_ROUNDSD      = 0x3A,
+        ESCAPE_INSERTPS     = 0x3A
     } ThreeByteEscape;
 
     TwoByteOpcodeID jccRel32(Condition cond)
     {
         return (TwoByteOpcodeID)(OP2_JCC_rel32 + cond);
     }
 
     TwoByteOpcodeID setccOpcode(Condition cond)
@@ -3599,16 +3601,26 @@ public:
     {
         spew("roundss    %s, %s, %d",
              nameFPReg(src), nameFPReg(dst), (int)mode);
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_ROUNDSS_VsdWsd, ESCAPE_ROUNDSD, (RegisterID)dst, (RegisterID)src);
         m_formatter.immediate8(mode); // modes are the same for roundsd and roundss
     }
 
+    void insertps_irr(unsigned mask, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(mask < 256);
+        spew("insertps     $%u, %s, %s",
+             mask, nameFPReg(src), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_INSERTPS_VpsUps, ESCAPE_INSERTPS, (RegisterID)dst, (RegisterID)src);
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
     void pinsrd_irr(unsigned lane, RegisterID src, XMMRegisterID dst)
     {
         MOZ_ASSERT(lane < 4);
         spew("pinsrd     $%u, %s, %s",
              lane, nameIReg(src), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, (RegisterID)dst, (RegisterID)src);
         m_formatter.immediate8(uint8_t(lane));
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -2321,16 +2321,22 @@ CodeGeneratorX86Shared::visitSimdInsertE
     if (ins->lane() == SimdLane::LaneX) {
         // As both operands are registers, movss doesn't modify the upper bits
         // of the destination operand.
         if (value != output)
             masm.movss(value, output);
         return true;
     }
 
+    if (AssemblerX86Shared::HasSSE41()) {
+        // The input value is in the low float32 of the 'value' FloatRegister.
+        masm.insertps(value, output, masm.insertpsMask(SimdLane::LaneX, ins->lane()));
+        return true;
+    }
+
     unsigned component = unsigned(ins->lane());
     masm.reserveStack(Simd128DataSize);
     masm.storeAlignedFloat32x4(vector, Address(StackPointer, 0));
     masm.storeFloat32(value, Address(StackPointer, component * sizeof(int32_t)));
     masm.loadAlignedFloat32x4(Address(StackPointer, 0), output);
     masm.freeStack(Simd128DataSize);
     return true;
 }