Bug 1136226 - Implement 16x8 SIMD shift operators. r=bbouvier
author: Jakob Stoklund Olesen <jolesen@mozilla.com>
date: Tue, 31 May 2016 09:00:18 -0700
changeset: 340736 25dc50270a77116f3e679f451e143b0031382cdd
parent: 340735 69ce6e7501086e2492274e352752eee5eeea447d
child: 340737 6ea5fb073f4be87096890646bf58061c90b22fcd
push id: 1183
push user: raliiev@mozilla.com
push date: Mon, 05 Sep 2016 20:01:49 +0000
treeherder: mozilla-release@3148731bed45
reviewers: bbouvier
bugs: 1136226
milestone: 49.0a1
Bug 1136226 - Implement 16x8 SIMD shift operators. r=bbouvier

These all have corresponding SSE instructions. The 8x16 shifts don't have SSE instructions, so they will be added by the next commit.
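The codegen change below generalizes the old hard-coded 5-bit shift mask to any integer lane width: a 128-bit vector with N lanes has 128/N bits per lane, and shift counts are masked to laneBits - 1, mirroring x86 scalar shift semantics. A minimal standalone sketch of that arithmetic (ShiftMask is a hypothetical helper, not part of the patch; it mirrors the (128u / SimdTypeToLength(ins->type())) - 1 expression in visitSimdShift):

    #include <cassert>
    #include <cstdint>

    // Mask applied to a shift count for a 128-bit vector with the
    // given number of lanes.
    constexpr uint32_t ShiftMask(uint32_t numLanes) {
        return (128u / numLanes) - 1;
    }

    int main() {
        assert(ShiftMask(8) == 15); // Int16x8: 16-bit lanes, counts masked to 0..15
        assert(ShiftMask(4) == 31); // Int32x4: 32-bit lanes, counts masked to 0..31
        return 0;
    }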
js/src/jit/Lowering.cpp
js/src/jit/shared/LIR-shared.h
js/src/jit/x86-shared/Assembler-x86-shared.h
js/src/jit/x86-shared/BaseAssembler-x86-shared.h
js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
js/src/jit/x86-shared/Encoding-x86-shared.h
js/src/jit/x86-shared/MacroAssembler-x86-shared.h
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@@ -4539,18 +4539,18 @@ LIRGenerator::visitSimdBinaryBitwise(MSi
     ReorderCommutative(&lhs, &rhs, ins);
     LSimdBinaryBitwise* lir = new(alloc()) LSimdBinaryBitwise;
     lowerForFPU(lir, ins, lhs, rhs);
 }
 
 void
 LIRGenerator::visitSimdShift(MSimdShift* ins)
 {
-    MOZ_ASSERT(ins->type() == MIRType::Int32x4);
-    MOZ_ASSERT(ins->lhs()->type() == MIRType::Int32x4);
+    MOZ_ASSERT(IsIntegerSimdType(ins->type()));
+    MOZ_ASSERT(ins->lhs()->type() == ins->type());
     MOZ_ASSERT(ins->rhs()->type() == MIRType::Int32);
 
     LUse vector = useRegisterAtStart(ins->lhs());
     LAllocation value = useRegisterOrConstant(ins->rhs());
     // We need a temp register to mask the shift amount, but not if the shift
     // amount is a constant.
     LDefinition tempReg = value.isConstant() ? LDefinition::BogusTemp() : temp();
     LSimdShift* lir = new(alloc()) LSimdShift(vector, value, tempReg);
--- a/js/src/jit/shared/LIR-shared.h
+++ b/js/src/jit/shared/LIR-shared.h
@@ -620,16 +620,19 @@ class LSimdShift : public LInstructionHe
         return mir_->toSimdShift()->operation();
     }
     const char* extraName() const {
         return MSimdShift::OperationName(operation());
     }
     MSimdShift* mir() const {
         return mir_->toSimdShift();
     }
+    MIRType type() const {
+        return mir_->type();
+    }
 };
 
 // SIMD selection of lanes from two int32x4 or float32x4 arguments based on an
 // int32x4 argument.
 class LSimdSelect : public LInstructionHelper<1, 3, 1>
 {
   public:
     LIR_HEADER(SimdSelect);
--- a/js/src/jit/x86-shared/Assembler-x86-shared.h
+++ b/js/src/jit/x86-shared/Assembler-x86-shared.h
@@ -2162,16 +2162,41 @@ class AssemblerX86Shared : public Assemb
         MOZ_ASSERT(HasSSE2());
         masm.vpsrld_rr(src1.encoding(), src0.encoding(), dest.encoding());
     }
     void vpsrld(Imm32 count, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vpsrld_ir(count.value, src0.encoding(), dest.encoding());
     }
 
+    void vpsllw(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpsllw_rr(src1.encoding(), src0.encoding(), dest.encoding());
+    }
+    void vpsllw(Imm32 count, FloatRegister src0, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpsllw_ir(count.value, src0.encoding(), dest.encoding());
+    }
+    void vpsraw(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpsraw_rr(src1.encoding(), src0.encoding(), dest.encoding());
+    }
+    void vpsraw(Imm32 count, FloatRegister src0, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpsraw_ir(count.value, src0.encoding(), dest.encoding());
+    }
+    void vpsrlw(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpsrlw_rr(src1.encoding(), src0.encoding(), dest.encoding());
+    }
+    void vpsrlw(Imm32 count, FloatRegister src0, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpsrlw_ir(count.value, src0.encoding(), dest.encoding());
+    }
+
     void vcvtsi2sd(const Operand& src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src1.kind()) {
           case Operand::REG:
             masm.vcvtsi2sd_rr(src1.reg(), src0.encoding(), dest.encoding());
             break;
           case Operand::MEM_REG_DISP:
             masm.vcvtsi2sd_mr(src1.disp(), src1.base(), src0.encoding(), dest.encoding());
--- a/js/src/jit/x86-shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/BaseAssembler-x86-shared.h
@@ -2753,34 +2753,34 @@ public:
     {
         MOZ_ASSERT(count < 16);
         shiftOpImmSimd("vpsrldq", OP2_PSRLDQ_Vd, ShiftID::vpsrldq, count, src, dst);
     }
 
     void vpsllq_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
     {
         MOZ_ASSERT(count < 64);
-        shiftOpImmSimd("vpsllq", OP2_PSRLDQ_Vd, ShiftID::vpsllq, count, src, dst);
+        shiftOpImmSimd("vpsllq", OP2_PSRLDQ_Vd, ShiftID::vpsllx, count, src, dst);
     }
 
     void vpsrlq_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
     {
         MOZ_ASSERT(count < 64);
-        shiftOpImmSimd("vpsrlq", OP2_PSRLDQ_Vd, ShiftID::vpsrlq, count, src, dst);
+        shiftOpImmSimd("vpsrlq", OP2_PSRLDQ_Vd, ShiftID::vpsrlx, count, src, dst);
     }
 
     void vpslld_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vpslld", VEX_PD, OP2_PSLLD_VdqWdq, src1, src0, dst);
     }
 
     void vpslld_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
     {
         MOZ_ASSERT(count < 32);
-        shiftOpImmSimd("vpslld", OP2_PSLLD_UdqIb, ShiftID::vpslld, count, src, dst);
+        shiftOpImmSimd("vpslld", OP2_PSLLD_UdqIb, ShiftID::vpsllx, count, src, dst);
     }
 
     void vpsrad_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vpsrad", VEX_PD, OP2_PSRAD_VdqWdq, src1, src0, dst);
     }
 
     void vpsrad_ir(int32_t count, XMMRegisterID src, XMMRegisterID dst)
@@ -2792,17 +2792,50 @@ public:
     void vpsrld_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vpsrld", VEX_PD, OP2_PSRLD_VdqWdq, src1, src0, dst);
     }
 
     void vpsrld_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
     {
         MOZ_ASSERT(count < 32);
-        shiftOpImmSimd("vpsrld", OP2_PSRLD_UdqIb, ShiftID::vpsrld, count, src, dst);
+        shiftOpImmSimd("vpsrld", OP2_PSRLD_UdqIb, ShiftID::vpsrlx, count, src, dst);
+    }
+
+    void vpsllw_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vpsllw", VEX_PD, OP2_PSLLW_VdqWdq, src1, src0, dst);
+    }
+
+    void vpsllw_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(count < 16);
+        shiftOpImmSimd("vpsllw", OP2_PSLLW_UdqIb, ShiftID::vpsllx, count, src, dst);
+    }
+
+    void vpsraw_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vpsraw", VEX_PD, OP2_PSRAW_VdqWdq, src1, src0, dst);
+    }
+
+    void vpsraw_ir(int32_t count, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(count < 16);
+        shiftOpImmSimd("vpsraw", OP2_PSRAW_UdqIb, ShiftID::vpsrad, count, src, dst);
+    }
+
+    void vpsrlw_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vpsrlw", VEX_PD, OP2_PSRLW_VdqWdq, src1, src0, dst);
+    }
+
+    void vpsrlw_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(count < 16);
+        shiftOpImmSimd("vpsrlw", OP2_PSRLW_UdqIb, ShiftID::vpsrlx, count, src, dst);
     }
 
     void vmovmskpd_rr(XMMRegisterID src, RegisterID dst)
     {
         twoByteOpSimdInt32("vmovmskpd", VEX_PD, OP2_MOVMSKPD_EdVd, src, dst);
     }
 
     void vmovmskps_rr(XMMRegisterID src, RegisterID dst)
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -3745,53 +3745,95 @@ CodeGeneratorX86Shared::visitSimdBinaryB
 }
 
 void
 CodeGeneratorX86Shared::visitSimdShift(LSimdShift* ins)
 {
     FloatRegister out = ToFloatRegister(ins->output());
     MOZ_ASSERT(ToFloatRegister(ins->vector()) == out); // defineReuseInput(0);
 
-    // If the shift count is out of range, only use the low 5 bits.
+    // The shift amount is masked to the number of bits in a lane.
+    uint32_t shiftmask = (128u / SimdTypeToLength(ins->type())) - 1;
+
+    // Note that SSE doesn't have instructions for shifting 8x16 vectors.
+    // These shifts are synthesized by the MSimdShift::AddLegalized() function.
     const LAllocation* val = ins->value();
     if (val->isConstant()) {
         MOZ_ASSERT(ins->temp()->isBogusTemp());
-        Imm32 count(uint32_t(ToInt32(val)) % 32);
-        switch (ins->operation()) {
-          case MSimdShift::lsh:
-            masm.packedLeftShiftByScalar(count, out);
-            return;
-          case MSimdShift::rsh:
-            masm.packedRightShiftByScalar(count, out);
-            return;
-          case MSimdShift::ursh:
-            masm.packedUnsignedRightShiftByScalar(count, out);
-            return;
+        Imm32 count(uint32_t(ToInt32(val)) & shiftmask);
+        switch (ins->type()) {
+          case MIRType::Int16x8:
+            switch (ins->operation()) {
+              case MSimdShift::lsh:
+                masm.packedLeftShiftByScalarInt16x8(count, out);
+                return;
+              case MSimdShift::rsh:
+                masm.packedRightShiftByScalarInt16x8(count, out);
+                return;
+              case MSimdShift::ursh:
+                masm.packedUnsignedRightShiftByScalarInt16x8(count, out);
+                return;
+            }
+            break;
+          case MIRType::Int32x4:
+            switch (ins->operation()) {
+              case MSimdShift::lsh:
+                masm.packedLeftShiftByScalarInt32x4(count, out);
+                return;
+              case MSimdShift::rsh:
+                masm.packedRightShiftByScalarInt32x4(count, out);
+                return;
+              case MSimdShift::ursh:
+                masm.packedUnsignedRightShiftByScalarInt32x4(count, out);
+                return;
+            }
+            break;
+          default:
+            MOZ_CRASH("unsupported type for SIMD shifts");
         }
         MOZ_CRASH("unexpected SIMD bitwise op");
     }
 
-    // Truncate val to 5 bits. We should have a temp register for that.
+    // Mask the shift amount to the lane width; we have a temp register for that.
     MOZ_ASSERT(val->isRegister());
     Register count = ToRegister(ins->temp());
     masm.mov(ToRegister(val), count);
-    masm.andl(Imm32(31), count);
+    masm.andl(Imm32(shiftmask), count);
     ScratchFloat32Scope scratch(masm);
     masm.vmovd(count, scratch);
 
-    switch (ins->operation()) {
-      case MSimdShift::lsh:
-        masm.packedLeftShiftByScalar(scratch, out);
-        return;
-      case MSimdShift::rsh:
-        masm.packedRightShiftByScalar(scratch, out);
-        return;
-      case MSimdShift::ursh:
-        masm.packedUnsignedRightShiftByScalar(scratch, out);
-        return;
+    switch (ins->type()) {
+      case MIRType::Int16x8:
+        switch (ins->operation()) {
+          case MSimdShift::lsh:
+            masm.packedLeftShiftByScalarInt16x8(scratch, out);
+            return;
+          case MSimdShift::rsh:
+            masm.packedRightShiftByScalarInt16x8(scratch, out);
+            return;
+          case MSimdShift::ursh:
+            masm.packedUnsignedRightShiftByScalarInt16x8(scratch, out);
+            return;
+        }
+        break;
+      case MIRType::Int32x4:
+        switch (ins->operation()) {
+          case MSimdShift::lsh:
+            masm.packedLeftShiftByScalarInt32x4(scratch, out);
+            return;
+          case MSimdShift::rsh:
+            masm.packedRightShiftByScalarInt32x4(scratch, out);
+            return;
+          case MSimdShift::ursh:
+            masm.packedUnsignedRightShiftByScalarInt32x4(scratch, out);
+            return;
+        }
+        break;
+      default:
+        MOZ_CRASH("unsupported type for SIMD shifts");
     }
     MOZ_CRASH("unexpected SIMD bitwise op");
 }
 
 void
 CodeGeneratorX86Shared::visitSimdSelect(LSimdSelect* ins)
 {
     FloatRegister mask = ToFloatRegister(ins->mask());
@@ -3812,17 +3854,17 @@ CodeGeneratorX86Shared::visitSimdSelect(
         return;
     }
 
     // SSE4.1 has plain blendvps which can do this, but it is awkward
     // to use because it requires the mask to be in xmm0.
 
     // Propagate sign to all bits of mask vector, if necessary.
     if (!mir->mask()->isSimdBinaryComp())
-        masm.packedRightShiftByScalar(Imm32(31), temp);
+        masm.packedRightShiftByScalarInt32x4(Imm32(31), temp);
 
     masm.bitwiseAndSimd128(Operand(temp), output);
     masm.bitwiseAndNotSimd128(Operand(onFalse), temp);
     masm.bitwiseOrSimd128(Operand(temp), output);
 }
 
 void
 CodeGeneratorX86Shared::visitCompareExchangeTypedArrayElement(LCompareExchangeTypedArrayElement* lir)
--- a/js/src/jit/x86-shared/Encoding-x86-shared.h
+++ b/js/src/jit/x86-shared/Encoding-x86-shared.h
@@ -149,22 +149,20 @@ enum OneByteOpcodeID {
     OP_HLT                          = 0xF4,
     OP_GROUP3_EbIb                  = 0xF6,
     OP_GROUP3_Ev                    = 0xF7,
     OP_GROUP3_EvIz                  = 0xF7, // OP_GROUP3_Ev has an immediate, when instruction is a test.
     OP_GROUP5_Ev                    = 0xFF
 };
 
 enum class ShiftID {
-    vpsrld = 2,
-    vpsrlq = 2,
+    vpsrlx = 2,
     vpsrldq = 3,
     vpsrad = 4,
-    vpslld = 6,
-    vpsllq = 6
+    vpsllx = 6
 };
 
 enum TwoByteOpcodeID {
     OP2_UD2             = 0x0B,
     OP2_MOVSD_VsdWsd    = 0x10,
     OP2_MOVPS_VpsWps    = 0x10,
     OP2_MOVSD_WsdVsd    = 0x11,
     OP2_MOVPS_WpsVps    = 0x11,
@@ -214,16 +212,19 @@ enum TwoByteOpcodeID {
     OP2_ORPD_VpdWpd     = 0x56,
     OP2_XORPD_VpdWpd    = 0x57,
     OP2_PUNPCKLDQ       = 0x62,
     OP2_PCMPGTD_VdqWdq  = 0x66,
     OP2_MOVD_VdEd       = 0x6E,
     OP2_MOVDQ_VsdWsd    = 0x6F,
     OP2_MOVDQ_VdqWdq    = 0x6F,
     OP2_PSHUFD_VdqWdqIb = 0x70,
+    OP2_PSLLW_UdqIb     = 0x71,
+    OP2_PSRAW_UdqIb     = 0x71,
+    OP2_PSRLW_UdqIb     = 0x71,
     OP2_PSLLD_UdqIb     = 0x72,
     OP2_PSRAD_UdqIb     = 0x72,
     OP2_PSRLD_UdqIb     = 0x72,
     OP2_PSRLDQ_Vd       = 0x73,
     OP2_PCMPEQW         = 0x75,
     OP2_PCMPEQD_VdqWdq  = 0x76,
     OP2_HADDPD          = 0x7C,
     OP2_MOVD_EdVd       = 0x7E,
@@ -245,24 +246,27 @@ enum TwoByteOpcodeID {
     OP2_MOVZX_GvEb      = 0xB6,
     OP2_MOVZX_GvEw      = 0xB7,
     OP2_XADD_EbGb       = 0xC0,
     OP2_XADD_EvGv       = 0xC1,
     OP2_CMPPS_VpsWps    = 0xC2,
     OP2_PINSRW          = 0xC4,
     OP2_PEXTRW_GdUdIb   = 0xC5,
     OP2_SHUFPS_VpsWpsIb = 0xC6,
+    OP2_PSRLW_VdqWdq    = 0xD1,
     OP2_PSRLD_VdqWdq    = 0xD2,
     OP2_PMULLW_VdqWdq   = 0xD5,
     OP2_MOVQ_WdVd       = 0xD6,
     OP2_PANDDQ_VdqWdq   = 0xDB,
     OP2_PANDNDQ_VdqWdq  = 0xDF,
+    OP2_PSRAW_VdqWdq    = 0xE1,
     OP2_PSRAD_VdqWdq    = 0xE2,
     OP2_PORDQ_VdqWdq    = 0xEB,
     OP2_PXORDQ_VdqWdq   = 0xEF,
+    OP2_PSLLW_VdqWdq    = 0xF1,
     OP2_PSLLD_VdqWdq    = 0xF2,
     OP2_PMULUDQ_VdqWdq  = 0xF4,
     OP2_PSUBB_VdqWdq    = 0xF8,
     OP2_PSUBW_VdqWdq    = 0xF9,
     OP2_PSUBD_VdqWdq    = 0xFA,
     OP2_PADDB_VdqWdq    = 0xFC,
     OP2_PADDW_VdqWdq    = 0xFD,
     OP2_PADDD_VdqWdq    = 0xFE
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
@@ -966,32 +966,51 @@ class MacroAssemblerX86Shared : public A
     void packedRcpSqrtApproximationFloat32x4(const Operand& src, FloatRegister dest) {
         // TODO See comment above. See also bug 1068028.
         vrsqrtps(src, dest);
     }
     void packedSqrtFloat32x4(const Operand& src, FloatRegister dest) {
         vsqrtps(src, dest);
     }
 
-    void packedLeftShiftByScalar(FloatRegister src, FloatRegister dest) {
+    void packedLeftShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) {
+        vpsllw(src, dest, dest);
+    }
+    void packedLeftShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+        vpsllw(count, dest, dest);
+    }
+    void packedRightShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) {
+        vpsraw(src, dest, dest);
+    }
+    void packedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+        vpsraw(count, dest, dest);
+    }
+    void packedUnsignedRightShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) {
+        vpsrlw(src, dest, dest);
+    }
+    void packedUnsignedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+        vpsrlw(count, dest, dest);
+    }
+
+    void packedLeftShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) {
         vpslld(src, dest, dest);
     }
-    void packedLeftShiftByScalar(Imm32 count, FloatRegister dest) {
+    void packedLeftShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
         vpslld(count, dest, dest);
     }
-    void packedRightShiftByScalar(FloatRegister src, FloatRegister dest) {
+    void packedRightShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) {
         vpsrad(src, dest, dest);
     }
-    void packedRightShiftByScalar(Imm32 count, FloatRegister dest) {
+    void packedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
         vpsrad(count, dest, dest);
     }
-    void packedUnsignedRightShiftByScalar(FloatRegister src, FloatRegister dest) {
+    void packedUnsignedRightShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) {
         vpsrld(src, dest, dest);
     }
-    void packedUnsignedRightShiftByScalar(Imm32 count, FloatRegister dest) {
+    void packedUnsignedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
         vpsrld(count, dest, dest);
     }
 
     void loadFloat32x3(const Address& src, FloatRegister dest) {
         Address srcZ(src);
         srcZ.offset += 2 * sizeof(float);
         vmovsd(src, dest);
         ScratchSimd128Scope scratch(asMasm());
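For reference, the lane-wise behavior the new vpsllw/vpsraw/vpsrlw lowerings implement can be emulated in scalar code. A minimal sketch (illustration only, with hypothetical helper names; it assumes the masked shift counts established in visitSimdShift above):

    #include <array>
    #include <cassert>
    #include <cstdint>

    using Int16x8 = std::array<int16_t, 8>;

    // lsh: each lane shifted left; lowered to vpsllw by this patch.
    Int16x8 lsh(Int16x8 v, uint32_t count) {
        count &= 15; // same masking as the generated code
        for (auto& lane : v)
            lane = int16_t(uint16_t(lane) << count);
        return v;
    }

    // rsh: arithmetic right shift (sign-filling); lowered to vpsraw.
    Int16x8 rsh(Int16x8 v, uint32_t count) {
        count &= 15;
        for (auto& lane : v)
            lane = int16_t(lane >> count);
        return v;
    }

    // ursh: logical right shift (zero-filling); lowered to vpsrlw.
    Int16x8 ursh(Int16x8 v, uint32_t count) {
        count &= 15;
        for (auto& lane : v)
            lane = int16_t(uint16_t(lane) >> count);
        return v;
    }

    int main() {
        Int16x8 v{-32768, -2, -1, 0, 1, 2, 3, 0x4000};
        assert(lsh(v, 1)[0] == 0);       // 0x8000 << 1 overflows the lane to 0
        assert(rsh(v, 1)[1] == -1);      // sign bit is replicated
        assert(ursh(v, 1)[2] == 0x7FFF); // zero-filled from the left
        return 0;
    }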