Bug 1096684 - IonMonkey: Optimize with SSE3 movsldup and movshdup r=bbouvier
author: Dan Gohman <sunfish@mozilla.com>
Wed, 12 Nov 2014 12:38:32 -0800
changeset 215368 4c04203003c22c9b725bb9ecbe2b5731f5fc601f
parent 215367 cf5beb0f5d259693e71046f40b3997f13c2ac079
child 215369 f991f366f66e8d0938e292b1fd57e495d6503f37
push id: 12118
push user: kwierso@gmail.com
push date: Thu, 13 Nov 2014 01:27:56 +0000
treeherder: b2g-inbound@bb35fdd01744 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: bbouvier
bugs: 1096684
milestone: 36.0a1
Bug 1096684 - IonMonkey: Optimize with SSE3 movsldup and movshdup r=bbouvier
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/BaseAssembler-x86-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -2189,16 +2189,50 @@ class AssemblerX86Shared : public Assemb
             break;
           case Operand::MEM_REG_DISP:
             masm.blendps_imr(mask, src.disp(), src.base(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
+    void movsldup(FloatRegister src, FloatRegister dest) {  // Emit movsldup with a register source.
+        MOZ_ASSERT(HasSSE3());  // Precondition: the SSE3 feature check must pass.
+        masm.movsldup_rr(src.code(), dest.code());  // Hand the raw register codes to the encoder.
+    }
+    void movsldup(const Operand &src, FloatRegister dest) {  // Emit movsldup with an Operand source.
+        MOZ_ASSERT(HasSSE3());  // Precondition: the SSE3 feature check must pass.
+        switch (src.kind()) {  // Choose the encoder entry point that matches the operand kind.
+          case Operand::FPREG:  // Register source: use the register-to-register form.
+            masm.movsldup_rr(src.fpu(), dest.code());
+            break;
+          case Operand::MEM_REG_DISP:  // Memory source given as displacement + base register.
+            masm.movsldup_mr(src.disp(), src.base(), dest.code());
+            break;
+          default:  // Only the two operand kinds above are handled; anything else crashes.
+            MOZ_CRASH("unexpected operand kind");
+        }
+    }
+    void movshdup(FloatRegister src, FloatRegister dest) {  // Emit movshdup with a register source.
+        MOZ_ASSERT(HasSSE3());  // Precondition: the SSE3 feature check must pass.
+        masm.movshdup_rr(src.code(), dest.code());  // Hand the raw register codes to the encoder.
+    }
+    void movshdup(const Operand &src, FloatRegister dest) {  // Emit movshdup with an Operand source.
+        MOZ_ASSERT(HasSSE3());  // Precondition: the SSE3 feature check must pass.
+        switch (src.kind()) {  // Choose the encoder entry point that matches the operand kind.
+          case Operand::FPREG:  // Register source: use the register-to-register form.
+            masm.movshdup_rr(src.fpu(), dest.code());
+            break;
+          case Operand::MEM_REG_DISP:  // Memory source given as displacement + base register.
+            masm.movshdup_mr(src.disp(), src.base(), dest.code());
+            break;
+          default:  // Only the two operand kinds above are handled; anything else crashes.
+            MOZ_CRASH("unexpected operand kind");
+        }
+    }
     void minsd(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.minsd_rr(src.code(), dest.code());
     }
     void minsd(const Operand &src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src.kind()) {
           case Operand::FPREG:
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@@ -295,19 +295,21 @@ private:
 
     typedef enum {
         OP2_UD2             = 0x0B,
         OP2_MOVSD_VsdWsd    = 0x10,
         OP2_MOVPS_VpsWps    = 0x10,
         OP2_MOVSD_WsdVsd    = 0x11,
         OP2_MOVPS_WpsVps    = 0x11,
         OP2_MOVHLPS_VqUq    = 0x12,
+        OP2_MOVSLDUP_VpsWps = 0x12,
         OP2_UNPCKLPS_VsdWsd = 0x14,
         OP2_UNPCKHPS_VsdWsd = 0x15,
         OP2_MOVLHPS_VqUq    = 0x16,
+        OP2_MOVSHDUP_VpsWps = 0x16,
         OP2_MOVAPD_VsdWsd   = 0x28,
         OP2_MOVAPS_VsdWsd   = 0x28,
         OP2_MOVAPS_WsdVsd   = 0x29,
         OP2_CVTSI2SD_VsdEd  = 0x2A,
         OP2_CVTTSD2SI_GdWsd = 0x2C,
         OP2_UCOMISD_VsdWsd  = 0x2E,
         OP2_MOVMSKPD_EdVd   = 0x50,
         OP2_ANDPS_VpsWps    = 0x54,
@@ -3759,16 +3761,46 @@ public:
         MOZ_ASSERT(imm < 16);
         spew("blendps    $%x, %s0x%x(%s), %s", imm, PRETTY_PRINT_OFFSET(offset), nameIReg(base),
              nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, (RegisterID)dst, base, offset);
         m_formatter.immediate8(uint8_t(imm));
     }
 
+    void movsldup_rr(XMMRegisterID src, XMMRegisterID dst)  // Encode movsldup, register source.
+    {
+        spew("movsldup   %s, %s", nameFPReg(src), nameFPReg(dst));  // Spew the textual form.
+        m_formatter.prefix(PRE_SSE_F3);  // The prefix is emitted ahead of the two-byte opcode.
+        m_formatter.twoByteOp(OP2_MOVSLDUP_VpsWps, (RegisterID)dst, (RegisterID)src);  // Opcode, then dst and src.
+    }
+
+    void movsldup_mr(int offset, RegisterID base, XMMRegisterID dst)  // Encode movsldup, memory source.
+    {
+        spew("movsldup   %s0x%x(%s), %s", PRETTY_PRINT_OFFSET(offset), nameIReg(base),
+             nameFPReg(dst));  // Spew the textual form: offset(base), then the destination register.
+        m_formatter.prefix(PRE_SSE_F3);  // The prefix is emitted ahead of the two-byte opcode.
+        m_formatter.twoByteOp(OP2_MOVSLDUP_VpsWps, (RegisterID)dst, base, offset);  // Opcode, dst, then base + offset.
+    }
+
+    void movshdup_rr(XMMRegisterID src, XMMRegisterID dst)  // Encode movshdup, register source.
+    {
+        spew("movshdup   %s, %s", nameFPReg(src), nameFPReg(dst));  // Spew the textual form.
+        m_formatter.prefix(PRE_SSE_F3);  // The prefix is emitted ahead of the two-byte opcode.
+        m_formatter.twoByteOp(OP2_MOVSHDUP_VpsWps, (RegisterID)dst, (RegisterID)src);  // Opcode, then dst and src.
+    }
+
+    void movshdup_mr(int offset, RegisterID base, XMMRegisterID dst)  // Encode movshdup, memory source.
+    {
+        spew("movshdup   %s0x%x(%s), %s", PRETTY_PRINT_OFFSET(offset), nameIReg(base),
+             nameFPReg(dst));  // Spew the textual form: offset(base), then the destination register.
+        m_formatter.prefix(PRE_SSE_F3);  // The prefix is emitted ahead of the two-byte opcode.
+        m_formatter.twoByteOp(OP2_MOVSHDUP_VpsWps, (RegisterID)dst, base, offset);  // Opcode, dst, then base + offset.
+    }
+
     void minsd_rr(XMMRegisterID src, XMMRegisterID dst)
     {
         spew("minsd      %s, %s", nameFPReg(src), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_F2);
         m_formatter.twoByteOp(OP2_MINSD_VsdWsd, (RegisterID)dst, (RegisterID)src);
     }
 
     void minsd_mr(int offset, RegisterID base, XMMRegisterID dst)
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -2409,16 +2409,27 @@ CodeGeneratorX86Shared::visitSimdSwizzle
     FloatRegister input = ToFloatRegister(ins->input());
     FloatRegister output = ToFloatRegister(ins->output());
 
     uint32_t x = ins->laneX();
     uint32_t y = ins->laneY();
     uint32_t z = ins->laneZ();
     uint32_t w = ins->laneW();
 
+    if (AssemblerX86Shared::HasSSE3()) {
+        if (ins->lanesMatch(0, 0, 2, 2)) {
+            masm.movsldup(input, output);
+            return true;
+        }
+        if (ins->lanesMatch(1, 1, 3, 3)) {
+            masm.movshdup(input, output);
+            return true;
+        }
+    }
+
     // TODO Here and below, arch specific lowering could identify this pattern
     // and use defineReuseInput to avoid this move (bug 1084404)
     if (ins->lanesMatch(2, 3, 2, 3)) {
         masm.movaps(input, output);
         masm.movhlps(input, output);
         return true;
     }