Bug 1136226 - Implement shuffle for 8x16 and 16x8 SIMD types. r=sunfish
author Jakob Stoklund Olesen <jolesen@mozilla.com>
Tue, 31 May 2016 09:00:19 -0700
changeset 338731 7be2feba720f43f6c5df652f9908f7a8c8a39be1
parent 338730 018f7422c57ec03d65f58802e4cbb6ee2fc25418
child 338732 e5479106a7ab7033580c4114d96cd5d0d3c062d2
push id 6249
push user jlund@mozilla.com
push date Mon, 01 Aug 2016 13:59:36 +0000
treeherder mozilla-beta@bad9d4f5bf7e [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers sunfish
bugs 1136226
milestone 49.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1136226 - Implement shuffle for 8x16 and 16x8 SIMD types. r=sunfish

When SSSE3 is available, two pshufb instructions can be combined to form any shuffle. Old machines without SSSE3 bounce the two vectors through the stack.
js/src/jit/Lowering.cpp
js/src/jit/Lowering.h
js/src/jit/none/Lowering-none.h
js/src/jit/shared/LIR-shared.h
js/src/jit/shared/LOpcodes-shared.h
js/src/jit/shared/Lowering-shared.h
js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
js/src/jit/x86-shared/CodeGenerator-x86-shared.h
js/src/jit/x86-shared/Lowering-x86-shared.cpp
js/src/jit/x86-shared/Lowering-x86-shared.h
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@@ -4431,36 +4431,16 @@ LIRGenerator::visitSimdGeneralShuffle(MS
         lir->setOperand(i + ins->numVectors(), useRegister(ins->lane(i)));
     }
 
     assignSnapshot(lir, Bailout_BoundsCheck);
     define(lir, ins);
 }
 
 void
-LIRGenerator::visitSimdShuffle(MSimdShuffle* ins)
-{
-    MOZ_ASSERT(IsSimdType(ins->lhs()->type()));
-    MOZ_ASSERT(IsSimdType(ins->rhs()->type()));
-    MOZ_ASSERT(IsSimdType(ins->type()));
-    MOZ_ASSERT(ins->type() == MIRType::Int32x4 || ins->type() == MIRType::Float32x4);
-
-    bool zFromLHS = ins->lane(2) < 4;
-    bool wFromLHS = ins->lane(3) < 4;
-    uint32_t lanesFromLHS = (ins->lane(0) < 4) + (ins->lane(1) < 4) + zFromLHS + wFromLHS;
-
-    LSimdShuffle* lir = new (alloc()) LSimdShuffle();
-    lowerForFPU(lir, ins, ins->lhs(), ins->rhs());
-
-    // See codegen for requirements details.
-    LDefinition temp = (lanesFromLHS == 3) ? tempCopy(ins->rhs(), 1) : LDefinition::BogusTemp();
-    lir->setTemp(0, temp);
-}
-
-void
 LIRGenerator::visitSimdUnaryArith(MSimdUnaryArith* ins)
 {
     MOZ_ASSERT(IsSimdType(ins->input()->type()));
     MOZ_ASSERT(IsSimdType(ins->type()));
 
     // Cannot be at start, as the ouput is used as a temporary to store values.
     LUse in = use(ins->input());
 
--- a/js/src/jit/Lowering.h
+++ b/js/src/jit/Lowering.h
@@ -290,17 +290,16 @@ class LIRGenerator : public LIRGenerator
     void visitAsmJSCall(MAsmJSCall* ins);
     void visitSetDOMProperty(MSetDOMProperty* ins);
     void visitGetDOMProperty(MGetDOMProperty* ins);
     void visitGetDOMMember(MGetDOMMember* ins);
     void visitRecompileCheck(MRecompileCheck* ins);
     void visitSimdBox(MSimdBox* ins);
     void visitSimdUnbox(MSimdUnbox* ins);
     void visitSimdGeneralShuffle(MSimdGeneralShuffle* ins);
-    void visitSimdShuffle(MSimdShuffle* ins);
     void visitSimdUnaryArith(MSimdUnaryArith* ins);
     void visitSimdBinaryComp(MSimdBinaryComp* ins);
     void visitSimdBinaryBitwise(MSimdBinaryBitwise* ins);
     void visitSimdShift(MSimdShift* ins);
     void visitSimdConstant(MSimdConstant* ins);
     void visitSimdConvert(MSimdConvert* ins);
     void visitSimdReinterpretCast(MSimdReinterpretCast* ins);
     void visitSimdAllTrue(MSimdAllTrue* ins);
--- a/js/src/jit/none/Lowering-none.h
+++ b/js/src/jit/none/Lowering-none.h
@@ -87,16 +87,17 @@ class LIRGeneratorNone : public LIRGener
     void visitAsmJSAtomicBinopHeap(MAsmJSAtomicBinopHeap* ins) { MOZ_CRASH(); }
     void visitAsmSelect(MAsmSelect*) { MOZ_CRASH(); }
 
     LTableSwitch* newLTableSwitch(LAllocation, LDefinition, MTableSwitch*) { MOZ_CRASH(); }
     LTableSwitchV* newLTableSwitchV(MTableSwitch*) { MOZ_CRASH(); }
     void visitSimdSelect(MSimdSelect* ins) { MOZ_CRASH(); }
     void visitSimdSplat(MSimdSplat* ins) { MOZ_CRASH(); }
     void visitSimdSwizzle(MSimdSwizzle* ins) { MOZ_CRASH(); }
+    void visitSimdShuffle(MSimdShuffle* ins) { MOZ_CRASH(); }
     void visitSimdValueX4(MSimdValueX4* lir) { MOZ_CRASH(); }
     void visitSubstr(MSubstr*) { MOZ_CRASH(); }
     void visitSimdBinaryArith(js::jit::MSimdBinaryArith*) { MOZ_CRASH(); }
     void visitSimdBinarySaturating(MSimdBinarySaturating* ins) { MOZ_CRASH(); }
     void visitRandom(js::jit::MRandom*) { MOZ_CRASH(); }
     void visitWasmTruncateToInt64(MWasmTruncateToInt64*) { MOZ_CRASH(); }
     void visitInt64ToFloatingPoint(MInt64ToFloatingPoint*) { MOZ_CRASH(); }
 };
--- a/js/src/jit/shared/LIR-shared.h
+++ b/js/src/jit/shared/LIR-shared.h
@@ -438,21 +438,21 @@ class LSimdGeneralShuffleF : public LSim
   public:
     LIR_HEADER(SimdGeneralShuffleF);
     explicit LSimdGeneralShuffleF(const LDefinition& temp)
       : LSimdGeneralShuffleBase(temp)
     {}
 };
 
 // Base class for both int32x4 and float32x4 shuffle instructions.
-class LSimdShuffle : public LInstructionHelper<1, 2, 1>
-{
-  public:
-    LIR_HEADER(SimdShuffle);
-    LSimdShuffle()
+class LSimdShuffleX4 : public LInstructionHelper<1, 2, 1>
+{
+  public:
+    LIR_HEADER(SimdShuffleX4);
+    LSimdShuffleX4()
     {}
 
     const LAllocation* lhs() {
         return getOperand(0);
     }
     const LAllocation* rhs() {
         return getOperand(1);
     }
@@ -462,16 +462,38 @@ class LSimdShuffle : public LInstruction
 
     uint32_t lane(unsigned i) const { return mir_->toSimdShuffle()->lane(i); }
 
     bool lanesMatch(uint32_t x, uint32_t y, uint32_t z, uint32_t w) const {
         return mir_->toSimdShuffle()->lanesMatch(x, y, z, w);
     }
 };
 
+// Shuffle of two 8x16 or 16x8 vectors; the 32x4 types use LSimdShuffleX4 instead.
+class LSimdShuffle : public LInstructionHelper<1, 2, 1>
+{
+  public:
+    LIR_HEADER(SimdShuffle);
+    LSimdShuffle()
+    {}
+
+    const LAllocation* lhs() {
+        return getOperand(0);
+    }
+    const LAllocation* rhs() {
+        return getOperand(1);
+    }
+    const LDefinition* temp() {
+        return getTemp(0);
+    }
+
+    unsigned numLanes() const { return mir_->toSimdShuffle()->numLanes(); }
+    unsigned lane(unsigned i) const { return mir_->toSimdShuffle()->lane(i); }
+};
+
 // Binary SIMD comparison operation between two SIMD operands
 class LSimdBinaryComp: public LInstructionHelper<1, 2, 0>
 {
   protected:
     LSimdBinaryComp() {}
 
 public:
     const LAllocation* lhs() {
--- a/js/src/jit/shared/LOpcodes-shared.h
+++ b/js/src/jit/shared/LOpcodes-shared.h
@@ -34,16 +34,17 @@
     _(SimdExtractElementF)          \
     _(SimdInsertElementI)           \
     _(SimdInsertElementF)           \
     _(SimdGeneralShuffleI)          \
     _(SimdGeneralShuffleF)          \
     _(SimdSwizzleI)                 \
     _(SimdSwizzleF)                 \
     _(SimdShuffle)                  \
+    _(SimdShuffleX4)                \
     _(SimdUnaryArithIx16)           \
     _(SimdUnaryArithIx8)            \
     _(SimdUnaryArithIx4)            \
     _(SimdUnaryArithFx4)            \
     _(SimdBinaryCompIx4)            \
     _(SimdBinaryCompFx4)            \
     _(SimdBinaryArithIx16)          \
     _(SimdBinaryArithIx8)           \
--- a/js/src/jit/shared/Lowering-shared.h
+++ b/js/src/jit/shared/Lowering-shared.h
@@ -274,14 +274,15 @@ class LIRGeneratorShared : public MDefin
     void visitSimdInsertElement(MSimdInsertElement*) override { MOZ_CRASH("NYI"); }
     void visitSimdExtractElement(MSimdExtractElement*) override { MOZ_CRASH("NYI"); }
     void visitSimdBinaryArith(MSimdBinaryArith*) override { MOZ_CRASH("NYI"); }
     void visitSimdSelect(MSimdSelect*) override { MOZ_CRASH("NYI"); }
     void visitSimdSplat(MSimdSplat*) override { MOZ_CRASH("NYI"); }
     void visitSimdValueX4(MSimdValueX4*) override { MOZ_CRASH("NYI"); }
     void visitSimdBinarySaturating(MSimdBinarySaturating*) override { MOZ_CRASH("NYI"); }
     void visitSimdSwizzle(MSimdSwizzle*) override { MOZ_CRASH("NYI"); }
+    void visitSimdShuffle(MSimdShuffle*) override { MOZ_CRASH("NYI"); }
 };
 
 } // namespace jit
 } // namespace js
 
 #endif /* jit_shared_Lowering_shared_h */
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -3099,16 +3099,74 @@ CodeGeneratorX86Shared::visitSimdSwizzle
     uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
     masm.shuffleFloat32(mask, input, output);
 }
 
 void
 CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle* ins)
 {
     FloatRegister lhs = ToFloatRegister(ins->lhs());
+    FloatRegister rhs = ToFloatRegister(ins->rhs());
+    FloatRegister output = ToFloatRegister(ins->output());
+    const unsigned numLanes = ins->numLanes();
+    const unsigned bytesPerLane = 16 / numLanes;
+
+    // Convert the lane shuffle to a byte-wise shuffle: bLane[k] < 16 picks from lhs, else rhs.
+    uint8_t bLane[16];
+    for (unsigned i = 0; i < numLanes; i++) {
+        for (unsigned b = 0; b < bytesPerLane; b++) {
+            bLane[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b;
+        }
+    }
+
+    // Use pshufb if it is available (requires SSSE3).
+    if (AssemblerX86Shared::HasSSSE3()) {
+        FloatRegister scratch1 = ToFloatRegister(ins->temp());
+        ScratchSimd128Scope scratch2(masm);
+
+        // Use pshufb instructions to gather the bytes from each source vector.
+        // A negative index creates a zero byte, so the two results can be OR'ed together.
+
+        // Set scratch2 = bytes gathered from lhs, zero where the byte comes from rhs.
+        int8_t idx[16];
+        for (unsigned i = 0; i < 16; i++)
+            idx[i] = bLane[i] < 16 ? bLane[i] : -1;
+        masm.loadConstantSimd128Int(SimdConstant::CreateX16(idx), scratch1);
+        FloatRegister lhsCopy = masm.reusedInputInt32x4(lhs, scratch2);
+        masm.vpshufb(scratch1, lhsCopy, scratch2);
+
+        // Set output = bytes gathered from rhs, zero where the byte comes from lhs.
+        for (unsigned i = 0; i < 16; i++)
+            idx[i] = bLane[i] >= 16 ? bLane[i] - 16 : -1;
+        masm.loadConstantSimd128Int(SimdConstant::CreateX16(idx), scratch1);
+        FloatRegister rhsCopy = masm.reusedInputInt32x4(rhs, output);
+        masm.vpshufb(scratch1, rhsCopy, output);
+
+        // Combine the two partial results.
+        masm.vpor(scratch2, output, output);
+        return;
+    }
+
+    // Worst-case fallback for pre-SSSE3 machines. Bounce through the stack: result, lhs, rhs slots.
+    Register temp = ToRegister(ins->getTemp(0));
+    masm.reserveStack(3 * Simd128DataSize);
+    masm.storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize));
+    masm.storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize));
+    for (unsigned i = 0; i < 16; i++) {
+        masm.load8ZeroExtend(Address(StackPointer, Simd128DataSize + bLane[i]), temp);
+        masm.store8(temp, Address(StackPointer, i));
+    }
+    masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
+    masm.freeStack(3 * Simd128DataSize);
+}
+
+void
+CodeGeneratorX86Shared::visitSimdShuffleX4(LSimdShuffleX4* ins)
+{
+    FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     FloatRegister out = ToFloatRegister(ins->output());
 
     uint32_t x = ins->lane(0);
     uint32_t y = ins->lane(1);
     uint32_t z = ins->lane(2);
     uint32_t w = ins->lane(3);
 
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.h
@@ -303,16 +303,17 @@ class CodeGeneratorX86Shared : public Co
     void visitSimdExtractElementB(LSimdExtractElementB* lir);
     void visitSimdExtractElementI(LSimdExtractElementI* lir);
     void visitSimdExtractElementU2D(LSimdExtractElementU2D* lir);
     void visitSimdExtractElementF(LSimdExtractElementF* lir);
     void visitSimdInsertElementI(LSimdInsertElementI* lir);
     void visitSimdInsertElementF(LSimdInsertElementF* lir);
     void visitSimdSwizzleI(LSimdSwizzleI* lir);
     void visitSimdSwizzleF(LSimdSwizzleF* lir);
+    void visitSimdShuffleX4(LSimdShuffleX4* lir);
     void visitSimdShuffle(LSimdShuffle* lir);
     void visitSimdUnaryArithIx16(LSimdUnaryArithIx16* lir);
     void visitSimdUnaryArithIx8(LSimdUnaryArithIx8* lir);
     void visitSimdUnaryArithIx4(LSimdUnaryArithIx4* lir);
     void visitSimdUnaryArithFx4(LSimdUnaryArithFx4* lir);
     void visitSimdBinaryCompIx4(LSimdBinaryCompIx4* lir);
     void visitSimdBinaryCompFx4(LSimdBinaryCompFx4* lir);
     void visitSimdBinaryArithIx16(LSimdBinaryArithIx16* lir);
--- a/js/src/jit/x86-shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@@ -895,8 +895,48 @@ LIRGeneratorX86Shared::visitSimdSwizzle(
         LUse use = useRegisterAtStart(ins->input());
         LSimdSwizzleF* lir = new (alloc()) LSimdSwizzleF(use);
         define(lir, ins);
         lir->setTemp(0, LDefinition::BogusTemp());
     } else {
         MOZ_CRASH("Unknown SIMD kind when getting lane");
     }
 }
+
+void
+LIRGeneratorX86Shared::visitSimdShuffle(MSimdShuffle* ins)
+{
+    MOZ_ASSERT(IsSimdType(ins->lhs()->type()));
+    MOZ_ASSERT(IsSimdType(ins->rhs()->type()));
+    MOZ_ASSERT(IsSimdType(ins->type()));
+    if (ins->type() == MIRType::Int32x4 || ins->type() == MIRType::Float32x4) {
+        bool zFromLHS = ins->lane(2) < 4;
+        bool wFromLHS = ins->lane(3) < 4;
+        uint32_t lanesFromLHS = (ins->lane(0) < 4) + (ins->lane(1) < 4) + zFromLHS + wFromLHS;
+
+        LSimdShuffleX4* lir = new (alloc()) LSimdShuffleX4();
+        lowerForFPU(lir, ins, ins->lhs(), ins->rhs());
+
+        // A temp copy of rhs is only requested when exactly three lanes come from lhs; see codegen.
+        LDefinition temp =
+          (lanesFromLHS == 3) ? tempCopy(ins->rhs(), 1) : LDefinition::BogusTemp();
+        lir->setTemp(0, temp);
+    } else {
+        MOZ_ASSERT(ins->type() == MIRType::Int8x16 || ins->type() == MIRType::Int16x8);
+        LSimdShuffle* lir = new (alloc()) LSimdShuffle();
+        lir->setOperand(0, useRegister(ins->lhs()));
+        lir->setOperand(1, useRegister(ins->rhs()));
+        define(lir, ins);
+        // The pre-SSSE3 codegen needs a GPR temp to move bytes through, while the
+        // pshufb codegen needs a SIMD temp to hold the byte-index constants.
+        if (Assembler::HasSSSE3()) {
+            lir->setTemp(0, temp(LDefinition::SIMD128INT));
+        } else {
+            // The temp must be a GPR usable with 8-bit loads and stores (ebx on JS_CODEGEN_X86).
+#if defined(JS_CODEGEN_X86)
+            lir->setTemp(0, tempFixed(ebx));
+#else
+            lir->setTemp(0, temp());
+#endif
+        }
+    }
+}
+
--- a/js/src/jit/x86-shared/Lowering-x86-shared.h
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.h
@@ -58,16 +58,17 @@ class LIRGeneratorX86Shared : public LIR
     void lowerTruncateFToInt32(MTruncateToInt32* ins);
     void visitSimdInsertElement(MSimdInsertElement* ins);
     void visitSimdExtractElement(MSimdExtractElement* ins);
     void visitSimdBinaryArith(MSimdBinaryArith* ins);
     void visitSimdBinarySaturating(MSimdBinarySaturating* ins);
     void visitSimdSelect(MSimdSelect* ins);
     void visitSimdSplat(MSimdSplat* ins);
     void visitSimdSwizzle(MSimdSwizzle* ins);
+    void visitSimdShuffle(MSimdShuffle* ins);
     void visitSimdValueX4(MSimdValueX4* ins);
     void lowerCompareExchangeTypedArrayElement(MCompareExchangeTypedArrayElement* ins,
                                                bool useI386ByteRegisters);
     void lowerAtomicExchangeTypedArrayElement(MAtomicExchangeTypedArrayElement* ins,
                                               bool useI386ByteRegisters);
     void lowerAtomicTypedArrayElementBinop(MAtomicTypedArrayElementBinop* ins,
                                            bool useI386ByteRegisters);
 };