Bug 1136226 - Implement swizzle for 8x16 and 16x8 SIMD types. r=sunfish
author: Jakob Stoklund Olesen <jolesen@mozilla.com>
Tue, 31 May 2016 09:00:19 -0700
changeset 338730 018f7422c57ec03d65f58802e4cbb6ee2fc25418
parent 338729 5472bbbb12079f4ca2da7fb8048fd0787ef6f200
child 338731 7be2feba720f43f6c5df652f9908f7a8c8a39be1
push id: 6249
push user: jlund@mozilla.com
push date: Mon, 01 Aug 2016 13:59:36 +0000
treeherder: mozilla-beta@bad9d4f5bf7e [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: sunfish
bugs: 1136226
milestone: 49.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1136226 - Implement swizzle for 8x16 and 16x8 SIMD types. r=sunfish

When we have SSSE3 available, the pshufb instruction can perform any byte-wise swizzle. Without SSSE3, fall back to using byte-wise loads and stores to simulate the swizzle. This fallback path only applies to CPUs from before 2006.
js/src/asmjs/AsmJS.cpp
js/src/jit/Lowering.cpp
js/src/jit/Lowering.h
js/src/jit/MIR.h
js/src/jit/none/Lowering-none.h
js/src/jit/shared/LIR-shared.h
js/src/jit/shared/Lowering-shared.h
js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
js/src/jit/x86-shared/Lowering-x86-shared.cpp
js/src/jit/x86-shared/Lowering-x86-shared.h
--- a/js/src/asmjs/AsmJS.cpp
+++ b/js/src/asmjs/AsmJS.cpp
@@ -5213,85 +5213,90 @@ CheckSimdCast(FunctionValidator& f, Pars
         return false;
     *type = toType;
     return true;
 }
 
 } // namespace
 
 static bool
-CheckSimdShuffleSelectors(FunctionValidator& f, ParseNode* lane, int32_t lanes[4], uint32_t maxLane)
-{
-    for (unsigned i = 0; i < 4; i++, lane = NextNode(lane)) {
+CheckSimdShuffleSelectors(FunctionValidator& f, ParseNode* lane,
+                          mozilla::Array<uint8_t, 16>& lanes, unsigned numLanes, unsigned maxLane)
+{
+    for (unsigned i = 0; i < numLanes; i++, lane = NextNode(lane)) {
         uint32_t u32;
         if (!IsLiteralInt(f.m(), lane, &u32))
             return f.failf(lane, "lane selector should be a constant integer literal");
         if (u32 >= maxLane)
             return f.failf(lane, "lane selector should be less than %u", maxLane);
-        lanes[i] = int32_t(u32);
+        lanes[i] = uint8_t(u32);
     }
     return true;
 }
 
 static bool
 CheckSimdSwizzle(FunctionValidator& f, ParseNode* call, SimdType opType, Type* type)
 {
+    const unsigned numLanes = GetSimdLanes(opType);
     unsigned numArgs = CallArgListLength(call);
-    if (numArgs != 5)
-        return f.failf(call, "expected 5 arguments to SIMD swizzle, got %u", numArgs);
+    if (numArgs != 1 + numLanes)
+        return f.failf(call, "expected %u arguments to SIMD swizzle, got %u", 1 + numLanes,
+                       numArgs);
 
     Type retType = opType;
     ParseNode* vec = CallArgList(call);
     Type vecType;
     if (!CheckExpr(f, vec, &vecType))
         return false;
     if (!(vecType <= retType))
         return f.failf(vec, "%s is not a subtype of %s", vecType.toChars(), retType.toChars());
 
     if (!f.writeSimdOp(opType, SimdOperation::Fn_swizzle))
         return false;
 
-    int32_t lanes[4];
-    if (!CheckSimdShuffleSelectors(f, NextNode(vec), lanes, 4))
-        return false;
-
-    for (unsigned i = 0; i < 4; i++) {
-        if (!f.encoder().writeFixedU8(uint8_t(lanes[i])))
+    mozilla::Array<uint8_t, 16> lanes;
+    if (!CheckSimdShuffleSelectors(f, NextNode(vec), lanes, numLanes, numLanes))
+        return false;
+
+    for (unsigned i = 0; i < numLanes; i++) {
+        if (!f.encoder().writeFixedU8(lanes[i]))
             return false;
     }
 
     *type = retType;
     return true;
 }
 
 static bool
 CheckSimdShuffle(FunctionValidator& f, ParseNode* call, SimdType opType, Type* type)
 {
+    const unsigned numLanes = GetSimdLanes(opType);
     unsigned numArgs = CallArgListLength(call);
-    if (numArgs != 6)
-        return f.failf(call, "expected 6 arguments to SIMD shuffle, got %u", numArgs);
+    if (numArgs != 2 + numLanes)
+        return f.failf(call, "expected %u arguments to SIMD shuffle, got %u", 2 + numLanes,
+                       numArgs);
 
     Type retType = opType;
     ParseNode* arg = CallArgList(call);
     for (unsigned i = 0; i < 2; i++, arg = NextNode(arg)) {
         Type type;
         if (!CheckExpr(f, arg, &type))
             return false;
         if (!(type <= retType))
             return f.failf(arg, "%s is not a subtype of %s", type.toChars(), retType.toChars());
     }
 
     if (!f.writeSimdOp(opType, SimdOperation::Fn_shuffle))
         return false;
 
-    int32_t lanes[4];
-    if (!CheckSimdShuffleSelectors(f, arg, lanes, 8))
-        return false;
-
-    for (unsigned i = 0; i < 4; i++) {
+    mozilla::Array<uint8_t, 16> lanes;
+    if (!CheckSimdShuffleSelectors(f, arg, lanes, numLanes, 2 * numLanes))
+        return false;
+
+    for (unsigned i = 0; i < numLanes; i++) {
         if (!f.encoder().writeFixedU8(uint8_t(lanes[i])))
             return false;
     }
 
     *type = retType;
     return true;
 }
 
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@@ -4401,35 +4401,16 @@ LIRGenerator::visitSimdAnyTrue(MSimdAnyT
     MDefinition* input = ins->input();
     MOZ_ASSERT(IsBooleanSimdType(input->type()));
 
     LUse use = useRegisterAtStart(input);
     define(new(alloc()) LSimdAnyTrue(use), ins);
 }
 
 void
-LIRGenerator::visitSimdSwizzle(MSimdSwizzle* ins)
-{
-    MOZ_ASSERT(IsSimdType(ins->input()->type()));
-    MOZ_ASSERT(IsSimdType(ins->type()));
-
-    if (ins->input()->type() == MIRType::Int32x4) {
-        LUse use = useRegisterAtStart(ins->input());
-        LSimdSwizzleI* lir = new (alloc()) LSimdSwizzleI(use);
-        define(lir, ins);
-    } else if (ins->input()->type() == MIRType::Float32x4) {
-        LUse use = useRegisterAtStart(ins->input());
-        LSimdSwizzleF* lir = new (alloc()) LSimdSwizzleF(use);
-        define(lir, ins);
-    } else {
-        MOZ_CRASH("Unknown SIMD kind when getting lane");
-    }
-}
-
-void
 LIRGenerator::visitSimdGeneralShuffle(MSimdGeneralShuffle*ins)
 {
     MOZ_ASSERT(IsSimdType(ins->type()));
 
     LSimdGeneralShuffleBase* lir;
     if (ins->type() == MIRType::Int32x4)
         lir = new (alloc()) LSimdGeneralShuffleI(temp());
     else if (ins->type() == MIRType::Float32x4)
--- a/js/src/jit/Lowering.h
+++ b/js/src/jit/Lowering.h
@@ -289,17 +289,16 @@ class LIRGenerator : public LIRGenerator
     void visitAsmJSPassStackArg(MAsmJSPassStackArg* ins);
     void visitAsmJSCall(MAsmJSCall* ins);
     void visitSetDOMProperty(MSetDOMProperty* ins);
     void visitGetDOMProperty(MGetDOMProperty* ins);
     void visitGetDOMMember(MGetDOMMember* ins);
     void visitRecompileCheck(MRecompileCheck* ins);
     void visitSimdBox(MSimdBox* ins);
     void visitSimdUnbox(MSimdUnbox* ins);
-    void visitSimdSwizzle(MSimdSwizzle* ins);
     void visitSimdGeneralShuffle(MSimdGeneralShuffle* ins);
     void visitSimdShuffle(MSimdShuffle* ins);
     void visitSimdUnaryArith(MSimdUnaryArith* ins);
     void visitSimdBinaryComp(MSimdBinaryComp* ins);
     void visitSimdBinaryBitwise(MSimdBinaryBitwise* ins);
     void visitSimdShift(MSimdShift* ins);
     void visitSimdConstant(MSimdConstant* ins);
     void visitSimdConvert(MSimdConvert* ins);
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@@ -1979,16 +1979,20 @@ class MSimdShuffleBase
     }
 
     bool sameLanes(const MSimdShuffleBase* other) const {
         return arity_ == other->arity_ &&
                memcmp(&lane_[0], &other->lane_[0], arity_) == 0;
     }
 
   public:
+    unsigned numLanes() const {
+        return arity_;
+    }
+
     unsigned lane(unsigned i) const {
         MOZ_ASSERT(i < arity_);
         return lane_[i];
     }
 
     bool lanesMatch(uint32_t x, uint32_t y, uint32_t z, uint32_t w) const {
         return arity_ == 4 && lane(0) == x && lane(1) == y && lane(2) == z &&
                lane(3) == w;
--- a/js/src/jit/none/Lowering-none.h
+++ b/js/src/jit/none/Lowering-none.h
@@ -86,16 +86,17 @@ class LIRGeneratorNone : public LIRGener
     void visitAsmJSAtomicExchangeHeap(MAsmJSAtomicExchangeHeap* ins) { MOZ_CRASH(); }
     void visitAsmJSAtomicBinopHeap(MAsmJSAtomicBinopHeap* ins) { MOZ_CRASH(); }
     void visitAsmSelect(MAsmSelect*) { MOZ_CRASH(); }
 
     LTableSwitch* newLTableSwitch(LAllocation, LDefinition, MTableSwitch*) { MOZ_CRASH(); }
     LTableSwitchV* newLTableSwitchV(MTableSwitch*) { MOZ_CRASH(); }
     void visitSimdSelect(MSimdSelect* ins) { MOZ_CRASH(); }
     void visitSimdSplat(MSimdSplat* ins) { MOZ_CRASH(); }
+    void visitSimdSwizzle(MSimdSwizzle* ins) { MOZ_CRASH(); }
     void visitSimdValueX4(MSimdValueX4* lir) { MOZ_CRASH(); }
     void visitSubstr(MSubstr*) { MOZ_CRASH(); }
     void visitSimdBinaryArith(js::jit::MSimdBinaryArith*) { MOZ_CRASH(); }
     void visitSimdBinarySaturating(MSimdBinarySaturating* ins) { MOZ_CRASH(); }
     void visitRandom(js::jit::MRandom*) { MOZ_CRASH(); }
     void visitWasmTruncateToInt64(MWasmTruncateToInt64*) { MOZ_CRASH(); }
     void visitInt64ToFloatingPoint(MInt64ToFloatingPoint*) { MOZ_CRASH(); }
 };
--- a/js/src/jit/shared/LIR-shared.h
+++ b/js/src/jit/shared/LIR-shared.h
@@ -360,28 +360,29 @@ class LSimdInsertElementF : public LSimd
   public:
     LIR_HEADER(SimdInsertElementF);
     LSimdInsertElementF(const LAllocation& vec, const LAllocation& val)
       : LSimdInsertElementBase(vec, val)
     {}
 };
 
 // Base class for both int32x4 and float32x4 shuffle instructions.
-class LSimdSwizzleBase : public LInstructionHelper<1, 1, 0>
+class LSimdSwizzleBase : public LInstructionHelper<1, 1, 1>
 {
   public:
     explicit LSimdSwizzleBase(const LAllocation& base)
     {
         setOperand(0, base);
     }
 
     const LAllocation* getBase() {
         return getOperand(0);
     }
 
+    unsigned numLanes() const { return mir_->toSimdSwizzle()->numLanes(); }
     uint32_t lane(unsigned i) const { return mir_->toSimdSwizzle()->lane(i); }
 
     bool lanesMatch(uint32_t x, uint32_t y, uint32_t z, uint32_t w) const {
         return mir_->toSimdSwizzle()->lanesMatch(x, y, z, w);
     }
 };
 
 // Shuffles a int32x4 into another int32x4 vector.
--- a/js/src/jit/shared/Lowering-shared.h
+++ b/js/src/jit/shared/Lowering-shared.h
@@ -273,14 +273,15 @@ class LIRGeneratorShared : public MDefin
     // shared implementations are not included here.
     void visitSimdInsertElement(MSimdInsertElement*) override { MOZ_CRASH("NYI"); }
     void visitSimdExtractElement(MSimdExtractElement*) override { MOZ_CRASH("NYI"); }
     void visitSimdBinaryArith(MSimdBinaryArith*) override { MOZ_CRASH("NYI"); }
     void visitSimdSelect(MSimdSelect*) override { MOZ_CRASH("NYI"); }
     void visitSimdSplat(MSimdSplat*) override { MOZ_CRASH("NYI"); }
     void visitSimdValueX4(MSimdValueX4*) override { MOZ_CRASH("NYI"); }
     void visitSimdBinarySaturating(MSimdBinarySaturating*) override { MOZ_CRASH("NYI"); }
+    void visitSimdSwizzle(MSimdSwizzle*) override { MOZ_CRASH("NYI"); }
 };
 
 } // namespace jit
 } // namespace js
 
 #endif /* jit_shared_Lowering_shared_h */
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -2993,31 +2993,67 @@ CodeGeneratorX86Shared::visitSimdGeneral
     visitSimdGeneralShuffle<float, FloatRegister>(ins, scratch);
 }
 
 void
 CodeGeneratorX86Shared::visitSimdSwizzleI(LSimdSwizzleI* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     FloatRegister output = ToFloatRegister(ins->output());
-
-    uint32_t x = ins->lane(0);
-    uint32_t y = ins->lane(1);
-    uint32_t z = ins->lane(2);
-    uint32_t w = ins->lane(3);
-
-    uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
-    masm.shuffleInt32(mask, input, output);
+    const unsigned numLanes = ins->numLanes();
+
+    switch (numLanes) {
+        case 4: {
+            uint32_t x = ins->lane(0);
+            uint32_t y = ins->lane(1);
+            uint32_t z = ins->lane(2);
+            uint32_t w = ins->lane(3);
+
+            uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
+            masm.shuffleInt32(mask, input, output);
+            return;
+        }
+    }
+
+    // In the general case, use pshufb if it is available. Convert to a
+    // byte-wise swizzle.
+    const unsigned bytesPerLane = 16 / numLanes;
+    int8_t bLane[16];
+    for (unsigned i = 0; i < numLanes; i++) {
+        for (unsigned b = 0; b < bytesPerLane; b++) {
+            bLane[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b;
+        }
+    }
+
+    if (AssemblerX86Shared::HasSSSE3()) {
+        ScratchSimd128Scope scratch(masm);
+        masm.loadConstantSimd128Int(SimdConstant::CreateX16(bLane), scratch);
+        FloatRegister inputCopy = masm.reusedInputInt32x4(input, output);
+        masm.vpshufb(scratch, inputCopy, output);
+        return;
+    }
+
+    // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
+    Register temp = ToRegister(ins->getTemp(0));
+    masm.reserveStack(2 * Simd128DataSize);
+    masm.storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
+    for (unsigned i = 0; i < 16; i++) {
+        masm.load8ZeroExtend(Address(StackPointer, Simd128DataSize + bLane[i]), temp);
+        masm.store8(temp, Address(StackPointer, i));
+    }
+    masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
+    masm.freeStack(2 * Simd128DataSize);
 }
 
 void
 CodeGeneratorX86Shared::visitSimdSwizzleF(LSimdSwizzleF* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     FloatRegister output = ToFloatRegister(ins->output());
+    MOZ_ASSERT(ins->numLanes() == 4);
 
     uint32_t x = ins->lane(0);
     uint32_t y = ins->lane(1);
     uint32_t z = ins->lane(2);
     uint32_t w = ins->lane(3);
 
     if (AssemblerX86Shared::HasSSE3()) {
         if (ins->lanesMatch(0, 0, 2, 2)) {
--- a/js/src/jit/x86-shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@@ -864,8 +864,39 @@ LIRGeneratorX86Shared::visitSimdValueX4(
         LAllocation w = useRegisterAtStart(ins->getOperand(3));
         define(new(alloc()) LSimdValueInt32x4(x, y, z, w), ins);
         break;
       }
       default:
         MOZ_CRASH("Unknown SIMD kind");
     }
 }
+
+void
+LIRGeneratorX86Shared::visitSimdSwizzle(MSimdSwizzle* ins)
+{
+    MOZ_ASSERT(IsSimdType(ins->input()->type()));
+    MOZ_ASSERT(IsSimdType(ins->type()));
+
+    if (IsIntegerSimdType(ins->input()->type())) {
+        LUse use = useRegisterAtStart(ins->input());
+        LSimdSwizzleI* lir = new (alloc()) LSimdSwizzleI(use);
+        define(lir, ins);
+        // We need a GPR temp register for pre-SSSE3 codegen (no vpshufb).
+        if (Assembler::HasSSSE3()) {
+            lir->setTemp(0, LDefinition::BogusTemp());
+        } else {
+            // The temp must be a GPR usable with 8-bit loads and stores.
+#if defined(JS_CODEGEN_X86)
+            lir->setTemp(0, tempFixed(ebx));
+#else
+            lir->setTemp(0, temp());
+#endif
+        }
+    } else if (ins->input()->type() == MIRType::Float32x4) {
+        LUse use = useRegisterAtStart(ins->input());
+        LSimdSwizzleF* lir = new (alloc()) LSimdSwizzleF(use);
+        define(lir, ins);
+        lir->setTemp(0, LDefinition::BogusTemp());
+    } else {
+        MOZ_CRASH("Unknown SIMD kind when getting lane");
+    }
+}
--- a/js/src/jit/x86-shared/Lowering-x86-shared.h
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.h
@@ -57,16 +57,17 @@ class LIRGeneratorX86Shared : public LIR
     void lowerTruncateDToInt32(MTruncateToInt32* ins);
     void lowerTruncateFToInt32(MTruncateToInt32* ins);
     void visitSimdInsertElement(MSimdInsertElement* ins);
     void visitSimdExtractElement(MSimdExtractElement* ins);
     void visitSimdBinaryArith(MSimdBinaryArith* ins);
     void visitSimdBinarySaturating(MSimdBinarySaturating* ins);
     void visitSimdSelect(MSimdSelect* ins);
     void visitSimdSplat(MSimdSplat* ins);
+    void visitSimdSwizzle(MSimdSwizzle* ins);
     void visitSimdValueX4(MSimdValueX4* ins);
     void lowerCompareExchangeTypedArrayElement(MCompareExchangeTypedArrayElement* ins,
                                                bool useI386ByteRegisters);
     void lowerAtomicExchangeTypedArrayElement(MAtomicExchangeTypedArrayElement* ins,
                                               bool useI386ByteRegisters);
     void lowerAtomicTypedArrayElementBinop(MAtomicTypedArrayElementBinop* ins,
                                            bool useI386ByteRegisters);
 };