Bug 1136226 - Implement MSimdInsertElement for small integer types. r=bbouvier
author Jakob Olesen <jolesen@mozilla.com>
Tue, 31 May 2016 09:00:18 -0700
changeset 338720 b45c0a42f19cc86dbeb58f0f0282b788fe132d46
parent 338719 f0779cf0f83dc854e97c87466f7109d9b264b48c
child 338721 62c28a8f7ebf20927d97f809313d3e0c567a064a
push id 6249
push user jlund@mozilla.com
push date Mon, 01 Aug 2016 13:59:36 +0000
treeherder mozilla-beta@bad9d4f5bf7e [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers bbouvier
bugs 1136226
milestone 49.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1136226 - Implement MSimdInsertElement for small integer types. r=bbouvier

Use vpinsrw to insert 16x8 lanes. This instruction is available since SSE2, so it can be used unconditionally. Move visitSimdInsertElement into x86-specific code in order to set the proper register allocation constraints.
js/src/jit/Lowering.cpp
js/src/jit/Lowering.h
js/src/jit/shared/LIR-shared.h
js/src/jit/shared/Lowering-shared.h
js/src/jit/x86-shared/Assembler-x86-shared.h
js/src/jit/x86-shared/BaseAssembler-x86-shared.h
js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
js/src/jit/x86-shared/Encoding-x86-shared.h
js/src/jit/x86-shared/Lowering-x86-shared.cpp
js/src/jit/x86-shared/Lowering-x86-shared.h
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@@ -4381,36 +4381,16 @@ LIRGenerator::visitSimdReinterpretCast(M
     LUse use = useRegisterAtStart(input);
     // :TODO: (Bug 1132894) We have to allocate a different register as redefine
     // and/or defineReuseInput are not yet capable of reusing the same register
     // with a different register type.
     define(new(alloc()) LSimdReinterpretCast(use), ins);
 }
 
 void
-LIRGenerator::visitSimdInsertElement(MSimdInsertElement* ins)
-{
-    MOZ_ASSERT(IsSimdType(ins->type()));
-
-    LUse vec = useRegisterAtStart(ins->vector());
-    LUse val = useRegister(ins->value());
-    switch (ins->type()) {
-      case MIRType::Int32x4:
-      case MIRType::Bool32x4:
-        defineReuseInput(new(alloc()) LSimdInsertElementI(vec, val), ins, 0);
-        break;
-      case MIRType::Float32x4:
-        defineReuseInput(new(alloc()) LSimdInsertElementF(vec, val), ins, 0);
-        break;
-      default:
-        MOZ_CRASH("Unknown SIMD kind when generating constant");
-    }
-}
-
-void
 LIRGenerator::visitSimdAllTrue(MSimdAllTrue* ins)
 {
     MDefinition* input = ins->input();
     MOZ_ASSERT(IsBooleanSimdType(input->type()));
 
     LUse use = useRegisterAtStart(input);
     define(new(alloc()) LSimdAllTrue(use), ins);
 }
--- a/js/src/jit/Lowering.h
+++ b/js/src/jit/Lowering.h
@@ -289,17 +289,16 @@ class LIRGenerator : public LIRGenerator
     void visitAsmJSPassStackArg(MAsmJSPassStackArg* ins);
     void visitAsmJSCall(MAsmJSCall* ins);
     void visitSetDOMProperty(MSetDOMProperty* ins);
     void visitGetDOMProperty(MGetDOMProperty* ins);
     void visitGetDOMMember(MGetDOMMember* ins);
     void visitRecompileCheck(MRecompileCheck* ins);
     void visitSimdBox(MSimdBox* ins);
     void visitSimdUnbox(MSimdUnbox* ins);
-    void visitSimdInsertElement(MSimdInsertElement* ins);
     void visitSimdSwizzle(MSimdSwizzle* ins);
     void visitSimdGeneralShuffle(MSimdGeneralShuffle* ins);
     void visitSimdShuffle(MSimdShuffle* ins);
     void visitSimdUnaryArith(MSimdUnaryArith* ins);
     void visitSimdBinaryComp(MSimdBinaryComp* ins);
     void visitSimdBinaryBitwise(MSimdBinaryBitwise* ins);
     void visitSimdShift(MSimdShift* ins);
     void visitSimdConstant(MSimdConstant* ins);
--- a/js/src/jit/shared/LIR-shared.h
+++ b/js/src/jit/shared/LIR-shared.h
@@ -303,16 +303,19 @@ class LSimdInsertElementBase : public LI
         return getOperand(0);
     }
     const LAllocation* value() {
         return getOperand(1);
     }
     unsigned lane() const {
         return mir_->toSimdInsertElement()->lane();
     }
+    unsigned length() const {
+        return SimdTypeToLength(mir_->toSimdInsertElement()->type());
+    }
 };
 
 // Replace an element from a given SIMD integer or boolean lane with a given value.
 // The value inserted into a boolean lane should be 0 or -1.
 class LSimdInsertElementI : public LSimdInsertElementBase
 {
   public:
     LIR_HEADER(SimdInsertElementI);
--- a/js/src/jit/shared/Lowering-shared.h
+++ b/js/src/jit/shared/Lowering-shared.h
@@ -266,16 +266,18 @@ class LIRGeneratorShared : public MDefin
         return false;
     }
 
     // Provide NYI default implementations of the SIMD visitor functions.
     // Many targets don't implement SIMD at all, and we don't want to duplicate
     // these stubs in the specific sub-classes.
     // Some SIMD visitors are implemented in LIRGenerator in Lowering.cpp. These
     // shared implementations are not included here.
+    void visitSimdInsertElement(MSimdInsertElement*) override { MOZ_CRASH("NYI"); }
+    void visitSimdExtractElement(MSimdExtractElement*) override { MOZ_CRASH("NYI"); }
     void visitSimdBinaryArith(MSimdBinaryArith*) override { MOZ_CRASH("NYI"); }
     void visitSimdSelect(MSimdSelect*) override { MOZ_CRASH("NYI"); }
     void visitSimdSplat(MSimdSplat*) override { MOZ_CRASH("NYI"); }
     void visitSimdValueX4(MSimdValueX4*) override { MOZ_CRASH("NYI"); }
 };
 
 } // namespace jit
 } // namespace js
--- a/js/src/jit/x86-shared/Assembler-x86-shared.h
+++ b/js/src/jit/x86-shared/Assembler-x86-shared.h
@@ -2101,33 +2101,29 @@ class AssemblerX86Shared : public Assemb
     }
     void idiv(Register divisor) {
         masm.idivl_r(divisor.encoding());
     }
     void udiv(Register divisor) {
         masm.divl_r(divisor.encoding());
     }
 
+    void vpinsrb(unsigned lane, Register src1, FloatRegister src0, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE41());
+        masm.vpinsrb_irr(lane, src1.encoding(), src0.encoding(), dest.encoding());
+    }
+    void vpinsrw(unsigned lane, Register src1, FloatRegister src0, FloatRegister dest) {
+        masm.vpinsrw_irr(lane, src1.encoding(), src0.encoding(), dest.encoding());
+    }
+
     void vpinsrd(unsigned lane, Register src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE41());
         masm.vpinsrd_irr(lane, src1.encoding(), src0.encoding(), dest.encoding());
     }
-    void vpinsrd(unsigned lane, const Operand& src1, FloatRegister src0, FloatRegister dest) {
-        MOZ_ASSERT(HasSSE41());
-        switch (src1.kind()) {
-          case Operand::REG:
-            masm.vpinsrd_irr(lane, src1.reg(), src0.encoding(), dest.encoding());
-            break;
-          case Operand::MEM_REG_DISP:
-            masm.vpinsrd_imr(lane, src1.disp(), src1.base(), src0.encoding(), dest.encoding());
-            break;
-          default:
-            MOZ_CRASH("unexpected operand kind");
-        }
-    }
+
     void vpextrb(unsigned lane, FloatRegister src, Register dest) {
         MOZ_ASSERT(HasSSE41());
         masm.vpextrb_irr(lane, src.encoding(), dest.encoding());
     }
     void vpextrw(unsigned lane, FloatRegister src, Register dest) {
         masm.vpextrw_irr(lane, src.encoding(), dest.encoding());
     }
     void vpextrd(unsigned lane, FloatRegister src, Register dest) {
--- a/js/src/jit/x86-shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/BaseAssembler-x86-shared.h
@@ -3080,16 +3080,22 @@ public:
         twoByteOpSimd("vmulsd", VEX_SD, OP2_MULSD_VsdWsd, offset, base, src0, dst);
     }
 
     void vmulss_mr(int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vmulss", VEX_SS, OP2_MULSD_VsdWsd, offset, base, src0, dst);
     }
 
+    void vpinsrw_irr(uint32_t whichWord, RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(whichWord < 8);
+        twoByteOpImmInt32Simd("vpinsrw", VEX_PD, OP2_PINSRW, whichWord, src1, src0, dst);
+    }
+
     void vpextrw_irr(uint32_t whichWord, XMMRegisterID src, RegisterID dst)
     {
         MOZ_ASSERT(whichWord < 8);
         twoByteOpImmSimdInt32("vpextrw", VEX_PD, OP2_PEXTRW_GdUdIb, whichWord, src, dst);
     }
 
     void vsubsd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
@@ -3245,28 +3251,28 @@ public:
     {
         threeByteOpImmSimd("vinsertps", VEX_PD, OP3_INSERTPS_VpsUps, ESCAPE_3A, mask, src1, src0, dst);
     }
     void vinsertps_imr(uint32_t mask, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         threeByteOpImmSimd("vinsertps", VEX_PD, OP3_INSERTPS_VpsUps, ESCAPE_3A, mask, offset, base, src0, dst);
     }
 
+    void vpinsrb_irr(unsigned lane, RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(lane < 16);
+        threeByteOpImmInt32Simd("vpinsrb", VEX_PD, OP3_PINSRB_VdqEdIb, ESCAPE_3A, lane, src1, src0, dst);
+    }
+
     void vpinsrd_irr(unsigned lane, RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         MOZ_ASSERT(lane < 4);
         threeByteOpImmInt32Simd("vpinsrd", VEX_PD, OP3_PINSRD_VdqEdIb, ESCAPE_3A, lane, src1, src0, dst);
     }
 
-    void vpinsrd_imr(unsigned lane, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
-    {
-        MOZ_ASSERT(lane < 4);
-        threeByteOpImmInt32Simd("vpinsrd", VEX_PD, OP3_PINSRD_VdqEdIb, ESCAPE_3A, lane, offset, base, src0, dst);
-    }
-
     void vpextrb_irr(unsigned lane, XMMRegisterID src, RegisterID dst)
     {
         MOZ_ASSERT(lane < 16);
         threeByteOpImmSimdInt32("vpextrb", VEX_PD, OP3_PEXTRB_EdVdqIb, ESCAPE_3A, lane, (XMMRegisterID)dst, (RegisterID)src);
     }
 
     void vpextrd_irr(unsigned lane, XMMRegisterID src, RegisterID dst)
     {
@@ -3848,16 +3854,32 @@ threeByteOpImmSimd("vblendps", VEX_PD, O
             return;
         }
 
         spew("%-11s$0x%x, %s, %s", name, imm, XMMRegName(rm), GPReg32Name(dst));
         m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, invalid_xmm, dst);
         m_formatter.immediate8u(imm);
     }
 
+    void twoByteOpImmInt32Simd(const char* name, VexOperandType ty, TwoByteOpcodeID opcode,
+                               uint32_t imm, RegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncodingForOtherOutput()) {
+            spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, GPReg32Name(rm), XMMRegName(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.twoByteOp(opcode, rm, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        spew("%-11s$0x%x, %s, %s", name, imm, GPReg32Name(rm), XMMRegName(dst));
+        m_formatter.twoByteOpVex(ty, opcode, rm, src0, dst);
+        m_formatter.immediate8u(imm);
+    }
+
     void twoByteOpSimdFlags(const char* name, VexOperandType ty, TwoByteOpcodeID opcode,
                             XMMRegisterID rm, XMMRegisterID reg)
     {
         if (useLegacySSEEncodingForOtherOutput()) {
             spew("%-11s%s, %s", legacySSEOpName(name), XMMRegName(rm), XMMRegName(reg));
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteOp(opcode, (RegisterID)rm, reg);
             return;
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -2696,17 +2696,18 @@ CodeGeneratorX86Shared::emitSimdExtractL
             masm.shrl(Imm32(8), output);
             // The shrl handles the zero-extension. Don't repeat it.
             if (signedness == SimdSign::Unsigned)
                 signedness = SimdSign::NotApplicable;
         }
     }
 
     // We have the right low 8 bits in |output|, but we may need to fix the high
-    // bits.
+    // bits. Note that this requires |output| to be one of the %eax-%edx
+    // registers.
     switch (signedness) {
       case SimdSign::Signed:
         masm.movsbl(output, output);
         break;
       case SimdSign::Unsigned:
         masm.movzbl(output, output);
         break;
       case SimdSign::NotApplicable:
@@ -2805,30 +2806,54 @@ CodeGeneratorX86Shared::visitSimdExtract
 void
 CodeGeneratorX86Shared::visitSimdInsertElementI(LSimdInsertElementI* ins)
 {
     FloatRegister vector = ToFloatRegister(ins->vector());
     Register value = ToRegister(ins->value());
     FloatRegister output = ToFloatRegister(ins->output());
     MOZ_ASSERT(vector == output); // defineReuseInput(0)
 
-    unsigned component = unsigned(ins->lane());
+    unsigned lane = ins->lane();
+    unsigned length = ins->length();
+
+    if (length == 8) {
+        // Available in SSE 2.
+        masm.vpinsrw(lane, value, vector, output);
+        return;
+    }
 
     // Note that, contrarily to float32x4, we cannot use vmovd if the inserted
     // value goes into the first component, as vmovd clears out the higher lanes
     // of the output.
     if (AssemblerX86Shared::HasSSE41()) {
         // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
-        masm.vpinsrd(component, value, vector, output);
-        return;
+        switch (length) {
+          case 4:
+            masm.vpinsrd(lane, value, vector, output);
+            return;
+          case 16:
+            masm.vpinsrb(lane, value, vector, output);
+            return;
+        }
     }
 
     masm.reserveStack(Simd128DataSize);
     masm.storeAlignedSimd128Int(vector, Address(StackPointer, 0));
-    masm.store32(value, Address(StackPointer, component * sizeof(int32_t)));
+    switch (length) {
+      case 4:
+        masm.store32(value, Address(StackPointer, lane * sizeof(int32_t)));
+        break;
+      case 16:
+        // Note that this requires `value` to be in one of the registers where the
+        // low 8 bits are addressable (%eax - %edx on x86, all of them on x86-64).
+        masm.store8(value, Address(StackPointer, lane * sizeof(int8_t)));
+        break;
+      default:
+        MOZ_CRASH("Unsupported SIMD length");
+    }
     masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
     masm.freeStack(Simd128DataSize);
 }
 
 void
 CodeGeneratorX86Shared::visitSimdInsertElementF(LSimdInsertElementF* ins)
 {
     FloatRegister vector = ToFloatRegister(ins->vector());
--- a/js/src/jit/x86-shared/Encoding-x86-shared.h
+++ b/js/src/jit/x86-shared/Encoding-x86-shared.h
@@ -242,16 +242,17 @@ enum TwoByteOpcodeID {
     OP2_BSR_GvEv        = 0xBD,
     OP2_MOVSX_GvEb      = 0xBE,
     OP2_MOVSX_GvEw      = 0xBF,
     OP2_MOVZX_GvEb      = 0xB6,
     OP2_MOVZX_GvEw      = 0xB7,
     OP2_XADD_EbGb       = 0xC0,
     OP2_XADD_EvGv       = 0xC1,
     OP2_CMPPS_VpsWps    = 0xC2,
+    OP2_PINSRW          = 0xC4,
     OP2_PEXTRW_GdUdIb   = 0xC5,
     OP2_SHUFPS_VpsWpsIb = 0xC6,
     OP2_PSRLD_VdqWdq    = 0xD2,
     OP2_MOVQ_WdVd       = 0xD6,
     OP2_PANDDQ_VdqWdq   = 0xDB,
     OP2_PANDNDQ_VdqWdq  = 0xDF,
     OP2_PSRAD_VdqWdq    = 0xE2,
     OP2_PORDQ_VdqWdq    = 0xEB,
@@ -265,16 +266,17 @@ enum TwoByteOpcodeID {
 enum ThreeByteOpcodeID {
     OP3_ROUNDSS_VsdWsd  = 0x0A,
     OP3_ROUNDSD_VsdWsd  = 0x0B,
     OP3_BLENDVPS_VdqWdq = 0x14,
     OP3_PEXTRB_EdVdqIb  = 0x14,
     OP3_PEXTRD_EdVdqIb  = 0x16,
     OP3_BLENDPS_VpsWpsIb = 0x0C,
     OP3_PTEST_VdVd      = 0x17,
+    OP3_PINSRB_VdqEdIb  = 0x20,
     OP3_INSERTPS_VpsUps = 0x21,
     OP3_PINSRD_VdqEdIb  = 0x22,
     OP3_PMULLD_VdqWdq   = 0x40,
     OP3_VBLENDVPS_VdqWdq = 0x4A
 };
 
 // Test whether the given opcode should be printed with its operands reversed.
 inline bool IsXMMReversedOperands(TwoByteOpcodeID opcode)
--- a/js/src/jit/x86-shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@@ -630,16 +630,49 @@ LIRGeneratorX86Shared::lowerAtomicTypedA
         defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
     else if (reuseInput)
         defineReuseInput(lir, ins, LAtomicTypedArrayElementBinop::valueOp);
     else
         define(lir, ins);
 }
 
 void
+LIRGeneratorX86Shared::visitSimdInsertElement(MSimdInsertElement* ins)
+{
+    MOZ_ASSERT(IsSimdType(ins->type()));
+
+    LUse vec = useRegisterAtStart(ins->vector());
+    LUse val = useRegister(ins->value());
+    switch (ins->type()) {
+      case MIRType::Int8x16:
+      case MIRType::Bool8x16:
+        // When SSE 4.1 is not available, we need to go via the stack.
+        // This requires the value to be inserted to be in %eax-%edx.
+        // Pick %ebx since other instructions use %eax or %ecx hard-wired.
+#if defined(JS_CODEGEN_X86)
+        if (!AssemblerX86Shared::HasSSE41())
+            val = useFixed(ins->value(), ebx);
+#endif
+        defineReuseInput(new(alloc()) LSimdInsertElementI(vec, val), ins, 0);
+        break;
+      case MIRType::Int16x8:
+      case MIRType::Int32x4:
+      case MIRType::Bool16x8:
+      case MIRType::Bool32x4:
+        defineReuseInput(new(alloc()) LSimdInsertElementI(vec, val), ins, 0);
+        break;
+      case MIRType::Float32x4:
+        defineReuseInput(new(alloc()) LSimdInsertElementF(vec, val), ins, 0);
+        break;
+      default:
+        MOZ_CRASH("Unknown SIMD kind when generating constant");
+    }
+}
+
+void
 LIRGeneratorX86Shared::visitSimdExtractElement(MSimdExtractElement* ins)
 {
     MOZ_ASSERT(IsSimdType(ins->input()->type()));
     MOZ_ASSERT(!IsSimdType(ins->type()));
 
     switch (ins->input()->type()) {
       case MIRType::Int8x16:
       case MIRType::Int16x8:
--- a/js/src/jit/x86-shared/Lowering-x86-shared.h
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.h
@@ -51,16 +51,17 @@ class LIRGeneratorX86Shared : public LIR
     void lowerMulI(MMul* mul, MDefinition* lhs, MDefinition* rhs);
     void lowerDivI(MDiv* div);
     void lowerModI(MMod* mod);
     void lowerUDiv(MDiv* div);
     void lowerUMod(MMod* mod);
     void lowerUrshD(MUrsh* mir);
     void lowerTruncateDToInt32(MTruncateToInt32* ins);
     void lowerTruncateFToInt32(MTruncateToInt32* ins);
+    void visitSimdInsertElement(MSimdInsertElement* ins);
     void visitSimdExtractElement(MSimdExtractElement* ins);
     void visitSimdBinaryArith(MSimdBinaryArith* ins);
     void visitSimdSelect(MSimdSelect* ins);
     void visitSimdSplat(MSimdSplat* ins);
     void visitSimdValueX4(MSimdValueX4* ins);
     void lowerCompareExchangeTypedArrayElement(MCompareExchangeTypedArrayElement* ins,
                                                bool useI386ByteRegisters);
     void lowerAtomicExchangeTypedArrayElement(MAtomicExchangeTypedArrayElement* ins,