Bug 1025100: Use INSERTPS for inserting float elements when SSE4.1 is available; r=sunfish
author Benjamin Bouvier <benj@benj.me>
Fri, 19 Sep 2014 14:56:05 +0200
changeset 206231 02e8c6942c85da6d1dae60a4455c9f87687486db
parent 206230 d418a4d0f8d5840b51c875599926074ee120ef97
child 206232 4df302f6b71995dd1a9010dc1bfecf1639e6f810
push id 27516
push user ryanvm@gmail.com
push date Fri, 19 Sep 2014 17:54:48 +0000
treeherder mozilla-central@b00bdb144e06 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers sunfish
bugs 1025100
milestone 35.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1025100: Use INSERTPS for inserting float elements when SSE4.1 is available; r=sunfish
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/BaseAssembler-x86-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -1414,21 +1414,21 @@ class AssemblerX86Shared : public Assemb
         masm.divl_r(divisor.code());
     }
 
     void unpcklps(FloatRegister src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.unpcklps_rr(src.code(), dest.code());
     }
     void pinsrd(unsigned lane, Register src, FloatRegister dest) {
-        JS_ASSERT(HasSSE2());
+        JS_ASSERT(HasSSE41());
         masm.pinsrd_irr(lane, src.code(), dest.code());
     }
     void pinsrd(unsigned lane, const Operand &src, FloatRegister dest) {
-        JS_ASSERT(HasSSE2());
+        JS_ASSERT(HasSSE41());
         switch (src.kind()) {
           case Operand::REG:
             masm.pinsrd_irr(lane, src.reg(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
             masm.pinsrd_imr(lane, src.disp(), src.base(), dest.code());
             break;
           default:
@@ -1953,28 +1953,39 @@ class AssemblerX86Shared : public Assemb
     void sqrtsd(FloatRegister src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.sqrtsd_rr(src.code(), dest.code());
     }
     void sqrtss(FloatRegister src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.sqrtss_rr(src.code(), dest.code());
     }
-    void roundsd(FloatRegister src, FloatRegister dest,
-                 X86Assembler::RoundingMode mode)
-    {
+    void roundsd(FloatRegister src, FloatRegister dest, X86Assembler::RoundingMode mode) {
         JS_ASSERT(HasSSE41());
         masm.roundsd_rr(src.code(), dest.code(), mode);
     }
-    void roundss(FloatRegister src, FloatRegister dest,
-                 X86Assembler::RoundingMode mode)
-    {
+    void roundss(FloatRegister src, FloatRegister dest, X86Assembler::RoundingMode mode) {
         JS_ASSERT(HasSSE41());
         masm.roundss_rr(src.code(), dest.code(), mode);
     }
+    unsigned insertpsMask(SimdLane sourceLane, SimdLane destLane, unsigned zeroMask = 0)
+    {
+        // Builds the INSERTPS immediate: zeroMask in bits 0-3, destLane shifted to bit 4, sourceLane to bit 6.
+        // For a memory source operand the sourceLane bits are ignored: the source is the 32-bit memory location.
+        MOZ_ASSERT(zeroMask < 16); // zeroMask must fit in the low four bits.
+        unsigned ret = zeroMask ;
+        ret |= unsigned(destLane) << 4;
+        ret |= unsigned(sourceLane) << 6;
+        MOZ_ASSERT(ret < 256); // The combined value must fit in a single byte.
+        return ret;
+    }
+    void insertps(FloatRegister src, FloatRegister dest, unsigned mask) {
+        JS_ASSERT(HasSSE41()); // INSERTPS may only be emitted when SSE4.1 is available.
+        masm.insertps_irr(mask, src.code(), dest.code()); // mask: see insertpsMask() for the bit layout.
+    }
     void minsd(FloatRegister src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.minsd_rr(src.code(), dest.code());
     }
     void minsd(const Operand &src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         switch (src.kind()) {
           case Operand::FPREG:
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@@ -347,23 +347,25 @@ private:
         OP2_PSUBD_VdqWdq    = 0xFA,
         OP2_PADDD_VdqWdq    = 0xFE
     } TwoByteOpcodeID;
 
     typedef enum {
         OP3_ROUNDSS_VsdWsd  = 0x0A,
         OP3_ROUNDSD_VsdWsd  = 0x0B,
         OP3_PTEST_VdVd      = 0x17,
+        OP3_INSERTPS_VpsUps = 0x21,
         OP3_PINSRD_VdqEdIb  = 0x22
     } ThreeByteOpcodeID;
 
     typedef enum {
         ESCAPE_PTEST        = 0x38,
         ESCAPE_PINSRD       = 0x3A,
-        ESCAPE_ROUNDSD      = 0x3A
+        ESCAPE_ROUNDSD      = 0x3A,
+        ESCAPE_INSERTPS     = 0x3A
     } ThreeByteEscape;
 
     TwoByteOpcodeID jccRel32(Condition cond)
     {
         return (TwoByteOpcodeID)(OP2_JCC_rel32 + cond);
     }
 
     TwoByteOpcodeID setccOpcode(Condition cond)
@@ -3599,16 +3601,26 @@ public:
     {
         spew("roundss    %s, %s, %d",
              nameFPReg(src), nameFPReg(dst), (int)mode);
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_ROUNDSS_VsdWsd, ESCAPE_ROUNDSD, (RegisterID)dst, (RegisterID)src);
         m_formatter.immediate8(mode); // modes are the same for roundsd and roundss
     }
 
+    void insertps_irr(unsigned mask, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(mask < 256); // mask is emitted below as an 8-bit immediate.
+        spew("insertps     $%u, %s, %s",
+             mask, nameFPReg(src), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_INSERTPS_VpsUps, ESCAPE_INSERTPS, (RegisterID)dst, (RegisterID)src);
+        m_formatter.immediate8(uint8_t(mask)); // Trailing immediate byte carrying the lane/zero mask.
+    }
+
     void pinsrd_irr(unsigned lane, RegisterID src, XMMRegisterID dst)
     {
         MOZ_ASSERT(lane < 4);
         spew("pinsrd     $%u, %s, %s",
              lane, nameIReg(src), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, (RegisterID)dst, (RegisterID)src);
         m_formatter.immediate8(uint8_t(lane));
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -2321,16 +2321,22 @@ CodeGeneratorX86Shared::visitSimdInsertE
     if (ins->lane() == SimdLane::LaneX) {
         // As both operands are registers, movss doesn't modify the upper bits
         // of the destination operand.
         if (value != output)
             masm.movss(value, output);
         return true;
     }
 
+    if (AssemblerX86Shared::HasSSE41()) {
+        // The input value is in the low float32 of the 'value' FloatRegister.
+        masm.insertps(value, output, masm.insertpsMask(SimdLane::LaneX, ins->lane()));
+        return true;
+    }
+
     unsigned component = unsigned(ins->lane());
     masm.reserveStack(Simd128DataSize);
     masm.storeAlignedFloat32x4(vector, Address(StackPointer, 0));
     masm.storeFloat32(value, Address(StackPointer, component * sizeof(int32_t)));
     masm.loadAlignedFloat32x4(Address(StackPointer, 0), output);
     masm.freeStack(Simd128DataSize);
     return true;
 }