Bug 1136226 - Implement Bool8x16.splat and Bool16x8.splat. r=bbouvier
author: Jakob Stoklund Olesen <jolesen@mozilla.com>
Tue, 31 May 2016 09:00:19 -0700
changeset 340740 a17bc6fab38f7beaffa3608ca8ec4a7d660a2bd4
parent 340739 4117a5326ded2a0f6543da9070e7c68c0ba0a172
child 340741 fa42a25f4124566158f812c0b796360dd239f814
push id: 1183
push user: raliiev@mozilla.com
push date: Mon, 05 Sep 2016 20:01:49 +0000
treeherder: mozilla-release@3148731bed45 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: bbouvier
bugs: 1136226
milestone: 49.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1136226 - Implement Bool8x16.splat and Bool16x8.splat. r=bbouvier. The scalar argument to this operation is expanded into MIR as either -1 or 0 in an Int32, so the 4-lane splat produces the correct result for 8-lane and 16-lane splats too: either an all-zeroes vector or an all-ones vector.
js/src/jit/MIR.cpp
js/src/jit/shared/LIR-shared.h
js/src/jit/shared/LOpcodes-shared.h
js/src/jit/x86-shared/Assembler-x86-shared.h
js/src/jit/x86-shared/BaseAssembler-x86-shared.h
js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
js/src/jit/x86-shared/CodeGenerator-x86-shared.h
js/src/jit/x86-shared/Encoding-x86-shared.h
js/src/jit/x86-shared/Lowering-x86-shared.cpp
--- a/js/src/jit/MIR.cpp
+++ b/js/src/jit/MIR.cpp
@@ -1156,16 +1156,26 @@ MSimdSplat::foldsTo(TempAllocator& alloc
 
     SimdConstant cst;
     switch (type()) {
       case MIRType::Bool32x4: {
         int32_t v = op->toConstant()->valueToBooleanInfallible() ? -1 : 0;
         cst = SimdConstant::SplatX4(v);
         break;
       }
+      case MIRType::Int8x16: {
+        int32_t v = op->toConstant()->toInt32();
+        cst = SimdConstant::SplatX16(v);
+        break;
+      }
+      case MIRType::Int16x8: {
+        int32_t v = op->toConstant()->toInt32();
+        cst = SimdConstant::SplatX8(v);
+        break;
+      }
       case MIRType::Int32x4: {
         int32_t v = op->toConstant()->toInt32();
         cst = SimdConstant::SplatX4(v);
         break;
       }
       case MIRType::Float32x4: {
         float v = op->toConstant()->numberToDouble();
         cst = SimdConstant::SplatX4(v);
--- a/js/src/jit/shared/LIR-shared.h
+++ b/js/src/jit/shared/LIR-shared.h
@@ -190,16 +190,46 @@ class LSimdUnbox : public LInstructionHe
         return getTemp(0);
     }
 
     MSimdUnbox* mir() const {
         return mir_->toSimdUnbox();
     }
 };
 
+// Splats the scalar in operand 0 into a SIMD value with 16 equal components (int8x16).
+class LSimdSplatX16 : public LInstructionHelper<1, 1, 0>
+{
+  public:
+    LIR_HEADER(SimdSplatX16)
+    explicit LSimdSplatX16(const LAllocation& v)
+    {
+        setOperand(0, v);
+    }
+
+    MSimdSplat* mir() const {
+        return mir_->toSimdSplat();
+    }
+};
+
+// Splats the scalar in operand 0 into a SIMD value with 8 equal components (int16x8).
+class LSimdSplatX8 : public LInstructionHelper<1, 1, 0>
+{
+  public:
+    LIR_HEADER(SimdSplatX8)
+    explicit LSimdSplatX8(const LAllocation& v)
+    {
+        setOperand(0, v);
+    }
+
+    MSimdSplat* mir() const {
+        return mir_->toSimdSplat();
+    }
+};
+
 // Constructs a SIMD value with 4 equal components (e.g. int32x4, float32x4).
 class LSimdSplatX4 : public LInstructionHelper<1, 1, 0>
 {
   public:
     LIR_HEADER(SimdSplatX4)
     explicit LSimdSplatX4(const LAllocation& v)
     {
         setOperand(0, v);
--- a/js/src/jit/shared/LOpcodes-shared.h
+++ b/js/src/jit/shared/LOpcodes-shared.h
@@ -15,16 +15,18 @@
     _(MoveGroup)                    \
     _(Integer)                      \
     _(Integer64)                    \
     _(Pointer)                      \
     _(Double)                       \
     _(Float32)                      \
     _(SimdBox)                      \
     _(SimdUnbox)                    \
+    _(SimdSplatX16)                 \
+    _(SimdSplatX8)                  \
     _(SimdSplatX4)                  \
     _(Simd128Int)                   \
     _(Simd128Float)                 \
     _(SimdAllTrue)                  \
     _(SimdAnyTrue)                  \
     _(SimdReinterpretCast)          \
     _(SimdExtractElementI)          \
     _(SimdExtractElementU2D)        \
--- a/js/src/jit/x86-shared/Assembler-x86-shared.h
+++ b/js/src/jit/x86-shared/Assembler-x86-shared.h
@@ -1080,16 +1080,17 @@ class AssemblerX86Shared : public Assemb
     }
 
     void breakpoint() {
         masm.int3();
     }
 
     static bool HasSSE2() { return CPUInfo::IsSSE2Present(); }
     static bool HasSSE3() { return CPUInfo::IsSSE3Present(); }
+    static bool HasSSSE3() { return CPUInfo::IsSSSE3Present(); }
     static bool HasSSE41() { return CPUInfo::IsSSE41Present(); }
     static bool HasPOPCNT() { return CPUInfo::IsPOPCNTPresent(); }
     static bool SupportsFloatingPoint() { return CPUInfo::IsSSE2Present(); }
     static bool SupportsSimd() { return CPUInfo::IsSSE2Present(); }
     static bool HasAVX() { return CPUInfo::IsAVXPresent(); }
 
     void cmpl(Register rhs, Register lhs) {
         masm.cmpl_rr(rhs.encoding(), lhs.encoding());
@@ -2991,16 +2992,29 @@ class AssemblerX86Shared : public Assemb
             break;
           case Operand::MEM_ADDRESS32:
             masm.vpshufd_imr(mask, src1.address(), dest.encoding());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
+
+    void vpshuflw(uint32_t mask, FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpshuflw_irr(mask, src.encoding(), dest.encoding());
+    }
+    void vpshufhw(uint32_t mask, FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpshufhw_irr(mask, src.encoding(), dest.encoding());
+    }
+    void vpshufb(FloatRegister mask, FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSSE3());
+        masm.vpshufb_rr(mask.encoding(), src.encoding(), dest.encoding());
+    }
     void vmovddup(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE3());
         masm.vmovddup_rr(src.encoding(), dest.encoding());
     }
     void vmovhlps(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vmovhlps_rr(src1.encoding(), src0.encoding(), dest.encoding());
     }
--- a/js/src/jit/x86-shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/BaseAssembler-x86-shared.h
@@ -2820,16 +2820,31 @@ public:
     {
         twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, offset, base, invalid_xmm, dst);
     }
     void vpshufd_imr(uint32_t mask, const void* address, XMMRegisterID dst)
     {
         twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, address, invalid_xmm, dst);
     }
 
+    void vpshuflw_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vpshuflw", VEX_SD, OP2_PSHUFLW_VdqWdqIb, mask, src, invalid_xmm, dst);
+    }
+
+    void vpshufhw_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vpshufhw", VEX_SS, OP2_PSHUFHW_VdqWdqIb, mask, src, invalid_xmm, dst);
+    }
+
+    void vpshufb_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        threeByteOpSimd("vpshufb", VEX_PD, OP3_PSHUFB_VdqWdq, ESCAPE_38, src1, src0, dst);
+    }
+
     void vshufps_irr(uint32_t mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, src1, src0, dst);
     }
     void vshufps_imr(uint32_t mask, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, offset, base, src0, dst);
     }
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -2594,40 +2594,65 @@ CodeGeneratorX86Shared::visitSimdValueFl
     FloatRegister r1Copy = masm.reusedInputFloat32x4(r1, tmp);
 
     masm.vunpcklps(r3, r1Copy, tmp);
     masm.vunpcklps(r2, r0Copy, output);
     masm.vunpcklps(tmp, output, output);
 }
 
 void
+CodeGeneratorX86Shared::visitSimdSplatX16(LSimdSplatX16* ins)
+{
+    MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 16);
+    Register input = ToRegister(ins->getOperand(0));
+    FloatRegister output = ToFloatRegister(ins->output());
+    masm.vmovd(input, output);
+    if (AssemblerX86Shared::HasSSSE3()) {
+        masm.zeroSimd128Int(ScratchSimd128Reg);
+        masm.vpshufb(ScratchSimd128Reg, output, output);
+    } else {
+        // No SSSE3: use two shifts and an OR to duplicate the low 8 bits into the low 16 bits.
+        masm.vpsllw(Imm32(8), output, output);
+        masm.vmovdqa(output, ScratchSimd128Reg);
+        masm.vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
+        masm.vpor(ScratchSimd128Reg, output, output);
+        // Then do an X8 splat of that low 16-bit word (vpshuflw followed by vpshufd).
+        masm.vpshuflw(0, output, output);
+        masm.vpshufd(0, output, output);
+    }
+}
+
+void
+CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins)
+{
+    MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8);
+    Register input = ToRegister(ins->getOperand(0));
+    FloatRegister output = ToFloatRegister(ins->output());
+    masm.vmovd(input, output);
+    masm.vpshuflw(0, output, output);
+    masm.vpshufd(0, output, output);
+}
+
+void
 CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins)
 {
     FloatRegister output = ToFloatRegister(ins->output());
 
     MSimdSplat* mir = ins->mir();
     MOZ_ASSERT(IsSimdType(mir->type()));
     JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
 
-    switch (mir->type()) {
-      case MIRType::Int32x4:
-      case MIRType::Bool32x4: {
+    if (mir->type() == MIRType::Float32x4) {
+        FloatRegister r = ToFloatRegister(ins->getOperand(0));
+        FloatRegister rCopy = masm.reusedInputFloat32x4(r, output);
+        masm.vshufps(0, rCopy, rCopy, output);
+    } else {
         Register r = ToRegister(ins->getOperand(0));
         masm.vmovd(r, output);
         masm.vpshufd(0, output, output);
-        break;
-      }
-      case MIRType::Float32x4: {
-        FloatRegister r = ToFloatRegister(ins->getOperand(0));
-        FloatRegister rCopy = masm.reusedInputFloat32x4(r, output);
-        masm.vshufps(0, rCopy, rCopy, output);
-        break;
-      }
-      default:
-        MOZ_CRASH("Unknown SIMD kind");
     }
 }
 
 void
 CodeGeneratorX86Shared::visitSimdReinterpretCast(LSimdReinterpretCast* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     FloatRegister output = ToFloatRegister(ins->output());
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.h
@@ -286,16 +286,18 @@ class CodeGeneratorX86Shared : public Co
     void visitNegD(LNegD* lir);
     void visitNegF(LNegF* lir);
 
     void visitOutOfLineWasmTruncateCheck(OutOfLineWasmTruncateCheck* ool);
 
     // SIMD operators
     void visitSimdValueInt32x4(LSimdValueInt32x4* lir);
     void visitSimdValueFloat32x4(LSimdValueFloat32x4* lir);
+    void visitSimdSplatX16(LSimdSplatX16* lir);
+    void visitSimdSplatX8(LSimdSplatX8* lir);
     void visitSimdSplatX4(LSimdSplatX4* lir);
     void visitSimd128Int(LSimd128Int* ins);
     void visitSimd128Float(LSimd128Float* ins);
     void visitInt32x4ToFloat32x4(LInt32x4ToFloat32x4* ins);
     void visitFloat32x4ToInt32x4(LFloat32x4ToInt32x4* ins);
     void visitFloat32x4ToUint32x4(LFloat32x4ToUint32x4* ins);
     void visitSimdReinterpretCast(LSimdReinterpretCast* lir);
     void visitSimdExtractElementB(LSimdExtractElementB* lir);
--- a/js/src/jit/x86-shared/Encoding-x86-shared.h
+++ b/js/src/jit/x86-shared/Encoding-x86-shared.h
@@ -212,16 +212,18 @@ enum TwoByteOpcodeID {
     OP2_ORPD_VpdWpd     = 0x56,
     OP2_XORPD_VpdWpd    = 0x57,
     OP2_PUNPCKLDQ       = 0x62,
     OP2_PCMPGTD_VdqWdq  = 0x66,
     OP2_MOVD_VdEd       = 0x6E,
     OP2_MOVDQ_VsdWsd    = 0x6F,
     OP2_MOVDQ_VdqWdq    = 0x6F,
     OP2_PSHUFD_VdqWdqIb = 0x70,
+    OP2_PSHUFLW_VdqWdqIb = 0x70,
+    OP2_PSHUFHW_VdqWdqIb = 0x70,
     OP2_PSLLW_UdqIb     = 0x71,
     OP2_PSRAW_UdqIb     = 0x71,
     OP2_PSRLW_UdqIb     = 0x71,
     OP2_PSLLD_UdqIb     = 0x72,
     OP2_PSRAD_UdqIb     = 0x72,
     OP2_PSRLD_UdqIb     = 0x72,
     OP2_PSRLDQ_Vd       = 0x73,
     OP2_PCMPEQW         = 0x75,
@@ -276,16 +278,17 @@ enum TwoByteOpcodeID {
     OP2_PSUBW_VdqWdq    = 0xF9,
     OP2_PSUBD_VdqWdq    = 0xFA,
     OP2_PADDB_VdqWdq    = 0xFC,
     OP2_PADDW_VdqWdq    = 0xFD,
     OP2_PADDD_VdqWdq    = 0xFE
 };
 
 enum ThreeByteOpcodeID {
+    OP3_PSHUFB_VdqWdq   = 0x00,
     OP3_ROUNDSS_VsdWsd  = 0x0A,
     OP3_ROUNDSD_VsdWsd  = 0x0B,
     OP3_BLENDVPS_VdqWdq = 0x14,
     OP3_PEXTRB_EdVdqIb  = 0x14,
     OP3_PEXTRD_EdVdqIb  = 0x16,
     OP3_BLENDPS_VpsWpsIb = 0x0C,
     OP3_PTEST_VdVd      = 0x17,
     OP3_PINSRB_VdqEdIb  = 0x20,
--- a/js/src/jit/x86-shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@@ -809,29 +809,37 @@ LIRGeneratorX86Shared::visitSimdSelect(M
 
     define(lins, ins);
 }
 
 void
 LIRGeneratorX86Shared::visitSimdSplat(MSimdSplat* ins)
 {
     LAllocation x = useRegisterAtStart(ins->getOperand(0));
-    LSimdSplatX4* lir = new(alloc()) LSimdSplatX4(x);
 
     switch (ins->type()) {
+      case MIRType::Int8x16:
+        define(new (alloc()) LSimdSplatX16(x), ins);
+        break;
+      case MIRType::Int16x8:
+        define(new (alloc()) LSimdSplatX8(x), ins);
+        break;
       case MIRType::Int32x4:
-      case MIRType::Bool32x4:
-        define(lir, ins);
-        break;
       case MIRType::Float32x4:
-        // (Non-AVX) codegen actually wants the input and the output to be in
-        // the same register, but we can't currently use defineReuseInput
-        // because they have different types (scalar vs vector), so a spill slot
-        // for one may not be suitable for the other.
-        define(lir, ins);
+      case MIRType::Bool8x16:
+      case MIRType::Bool16x8:
+      case MIRType::Bool32x4:
+        // These all lower to SplatX4, including every boolean splat: the
+        // boolean input is a 32-bit int that is either 0 or -1, so the X4
+        // splat gives the right result for all boolean geometries.
+        // For floats, (Non-AVX) codegen actually wants the input and the output
+        // to be in the same register, but we can't currently use
+        // defineReuseInput because they have different types (scalar vs
+        // vector), so a spill slot for one may not be suitable for the other.
+        define(new (alloc()) LSimdSplatX4(x), ins);
         break;
       default:
         MOZ_CRASH("Unknown SIMD kind");
     }
 }
 
 void
 LIRGeneratorX86Shared::visitSimdValueX4(MSimdValueX4* ins)