Bug 1136226 - Add general shuffle support for 8x16 and 16x8 shuffles. r=bbouvier
author: Jakob Stoklund Olesen <jolesen@mozilla.com>
Tue, 31 May 2016 09:00:19 -0700
changeset 338732 e5479106a7ab7033580c4114d96cd5d0d3c062d2
parent 338731 7be2feba720f43f6c5df652f9908f7a8c8a39be1
child 338733 8e4f48388c600edd5ee5292b014cedb0b8f7672f
push id: 6249
push user: jlund@mozilla.com
push date: Mon, 01 Aug 2016 13:59:36 +0000
treeherder: mozilla-beta@bad9d4f5bf7e [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: bbouvier
bugs: 1136226
milestone: 49.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1136226 - Add general shuffle support for 8x16 and 16x8 shuffles. r=bbouvier This instruction is used when the shuffle indexes are not compile time constants. Give the register allocator permission to spill the index arguments to GeneralShuffle instructions. There can be up to 16 of them, and they can't all be registers. Move visitSimdGeneralShuffle into the x86-specific lowering code since it has special register allocation requirements for 8-bit lanes.
js/src/jit/Lowering.cpp
js/src/jit/Lowering.h
js/src/jit/shared/Lowering-shared.h
js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
js/src/jit/x86-shared/Lowering-x86-shared.cpp
js/src/jit/x86-shared/Lowering-x86-shared.h
js/src/jit/x86-shared/MacroAssembler-x86-shared.h
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@@ -4401,46 +4401,16 @@ LIRGenerator::visitSimdAnyTrue(MSimdAnyT
     MDefinition* input = ins->input();
     MOZ_ASSERT(IsBooleanSimdType(input->type()));
 
     LUse use = useRegisterAtStart(input);
     define(new(alloc()) LSimdAnyTrue(use), ins);
 }
 
 void
-LIRGenerator::visitSimdGeneralShuffle(MSimdGeneralShuffle* ins)
-{
-    MOZ_ASSERT(IsSimdType(ins->type()));
-
-    LSimdGeneralShuffleBase* lir;
-    if (ins->type() == MIRType::Int32x4)
-        lir = new (alloc()) LSimdGeneralShuffleI(temp());
-    else if (ins->type() == MIRType::Float32x4)
-        lir = new (alloc()) LSimdGeneralShuffleF(temp());
-    else
-        MOZ_CRASH("Unknown SIMD kind when doing a shuffle");
-
-    if (!lir->init(alloc(), ins->numVectors() + ins->numLanes()))
-        return;
-
-    for (unsigned i = 0; i < ins->numVectors(); i++) {
-        MOZ_ASSERT(IsSimdType(ins->vector(i)->type()));
-        lir->setOperand(i, useRegister(ins->vector(i)));
-    }
-
-    for (unsigned i = 0; i < ins->numLanes(); i++) {
-        MOZ_ASSERT(ins->lane(i)->type() == MIRType::Int32);
-        lir->setOperand(i + ins->numVectors(), useRegister(ins->lane(i)));
-    }
-
-    assignSnapshot(lir, Bailout_BoundsCheck);
-    define(lir, ins);
-}
-
-void
 LIRGenerator::visitSimdUnaryArith(MSimdUnaryArith* ins)
 {
     MOZ_ASSERT(IsSimdType(ins->input()->type()));
     MOZ_ASSERT(IsSimdType(ins->type()));
 
     // Cannot be at start, as the ouput is used as a temporary to store values.
     LUse in = use(ins->input());
 
--- a/js/src/jit/Lowering.h
+++ b/js/src/jit/Lowering.h
@@ -289,17 +289,16 @@ class LIRGenerator : public LIRGenerator
     void visitAsmJSPassStackArg(MAsmJSPassStackArg* ins);
     void visitAsmJSCall(MAsmJSCall* ins);
     void visitSetDOMProperty(MSetDOMProperty* ins);
     void visitGetDOMProperty(MGetDOMProperty* ins);
     void visitGetDOMMember(MGetDOMMember* ins);
     void visitRecompileCheck(MRecompileCheck* ins);
     void visitSimdBox(MSimdBox* ins);
     void visitSimdUnbox(MSimdUnbox* ins);
-    void visitSimdGeneralShuffle(MSimdGeneralShuffle* ins);
     void visitSimdUnaryArith(MSimdUnaryArith* ins);
     void visitSimdBinaryComp(MSimdBinaryComp* ins);
     void visitSimdBinaryBitwise(MSimdBinaryBitwise* ins);
     void visitSimdShift(MSimdShift* ins);
     void visitSimdConstant(MSimdConstant* ins);
     void visitSimdConvert(MSimdConvert* ins);
     void visitSimdReinterpretCast(MSimdReinterpretCast* ins);
     void visitSimdAllTrue(MSimdAllTrue* ins);
--- a/js/src/jit/shared/Lowering-shared.h
+++ b/js/src/jit/shared/Lowering-shared.h
@@ -275,14 +275,15 @@ class LIRGeneratorShared : public MDefin
     void visitSimdExtractElement(MSimdExtractElement*) override { MOZ_CRASH("NYI"); }
     void visitSimdBinaryArith(MSimdBinaryArith*) override { MOZ_CRASH("NYI"); }
     void visitSimdSelect(MSimdSelect*) override { MOZ_CRASH("NYI"); }
     void visitSimdSplat(MSimdSplat*) override { MOZ_CRASH("NYI"); }
     void visitSimdValueX4(MSimdValueX4*) override { MOZ_CRASH("NYI"); }
     void visitSimdBinarySaturating(MSimdBinarySaturating*) override { MOZ_CRASH("NYI"); }
     void visitSimdSwizzle(MSimdSwizzle*) override { MOZ_CRASH("NYI"); }
     void visitSimdShuffle(MSimdShuffle*) override { MOZ_CRASH("NYI"); }
+    void visitSimdGeneralShuffle(MSimdGeneralShuffle*) override { MOZ_CRASH("NYI"); }
 };
 
 } // namespace jit
 } // namespace js
 
 #endif /* jit_shared_Lowering_shared_h */
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -2941,29 +2941,30 @@ CodeGeneratorX86Shared::visitSimdGeneral
     masm.reserveStack(stackSpace);
 
     for (unsigned i = 0; i < numVectors; i++) {
         masm.storeAlignedVector<T>(ToFloatRegister(ins->vector(i)),
                                    Address(StackPointer, Simd128DataSize * (1 + i)));
     }
 
     Label bail;
+    const Scale laneScale = ScaleFromElemWidth(sizeof(T));
 
     for (size_t i = 0; i < mir->numLanes(); i++) {
         Operand lane = ToOperand(ins->lane(i));
 
         masm.cmp32(lane, Imm32(numVectors * mir->numLanes() - 1));
         masm.j(Assembler::Above, &bail);
 
         if (lane.kind() == Operand::REG) {
-            masm.loadScalar<T>(Operand(StackPointer, ToRegister(ins->lane(i)), TimesFour, Simd128DataSize),
+            masm.loadScalar<T>(Operand(StackPointer, ToRegister(ins->lane(i)), laneScale, Simd128DataSize),
                                tempRegister);
         } else {
             masm.load32(lane, laneTemp);
-            masm.loadScalar<T>(Operand(StackPointer, laneTemp, TimesFour, Simd128DataSize), tempRegister);
+            masm.loadScalar<T>(Operand(StackPointer, laneTemp, laneScale, Simd128DataSize), tempRegister);
         }
 
         masm.storeScalar<T>(tempRegister, Address(StackPointer, i * sizeof(T)));
     }
 
     FloatRegister output = ToFloatRegister(ins->output());
     masm.loadAlignedVector<T>(Address(StackPointer, 0), output);
 
@@ -2979,17 +2980,26 @@ CodeGeneratorX86Shared::visitSimdGeneral
     masm.bind(&join);
     masm.setFramePushed(masm.framePushed() + stackSpace);
     masm.freeStack(stackSpace);
 }
 
 void
 CodeGeneratorX86Shared::visitSimdGeneralShuffleI(LSimdGeneralShuffleI* ins)
 {
-    visitSimdGeneralShuffle<int32_t, Register>(ins, ToRegister(ins->temp()));
+    switch (ins->mir()->type()) {
+      case MIRType::Int8x16:
+        return visitSimdGeneralShuffle<int8_t, Register>(ins, ToRegister(ins->temp()));
+      case MIRType::Int16x8:
+        return visitSimdGeneralShuffle<int16_t, Register>(ins, ToRegister(ins->temp()));
+      case MIRType::Int32x4:
+        return visitSimdGeneralShuffle<int32_t, Register>(ins, ToRegister(ins->temp()));
+      default:
+        MOZ_CRASH("unsupported type for general shuffle");
+    }
 }
 void
 CodeGeneratorX86Shared::visitSimdGeneralShuffleF(LSimdGeneralShuffleF* ins)
 {
     ScratchFloat32Scope scratch(masm);
     visitSimdGeneralShuffle<float, FloatRegister>(ins, scratch);
 }
 
--- a/js/src/jit/x86-shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@@ -935,8 +935,51 @@ LIRGeneratorX86Shared::visitSimdShuffle(
             lir->setTemp(0, tempFixed(ebx));
 #else
             lir->setTemp(0, temp());
 #endif
         }
     }
 }
 
+void
+LIRGeneratorX86Shared::visitSimdGeneralShuffle(MSimdGeneralShuffle* ins)
+{
+    MOZ_ASSERT(IsSimdType(ins->type()));
+
+    LSimdGeneralShuffleBase* lir;
+    if (IsIntegerSimdType(ins->type())) {
+#if defined(JS_CODEGEN_X86)
+        // The temp register must be usable with 8-bit load and store
+        // instructions, so one of %eax-%edx.
+        LDefinition t;
+        if (ins->type() == MIRType::Int8x16)
+            t = tempFixed(ebx);
+        else
+            t = temp();
+#else
+        LDefinition t = temp();
+#endif
+        lir = new (alloc()) LSimdGeneralShuffleI(t);
+    } else if (ins->type() == MIRType::Float32x4) {
+        lir = new (alloc()) LSimdGeneralShuffleF(temp());
+    } else {
+        MOZ_CRASH("Unknown SIMD kind when doing a shuffle");
+    }
+
+    if (!lir->init(alloc(), ins->numVectors() + ins->numLanes()))
+        return;
+
+    for (unsigned i = 0; i < ins->numVectors(); i++) {
+        MOZ_ASSERT(IsSimdType(ins->vector(i)->type()));
+        lir->setOperand(i, useRegister(ins->vector(i)));
+    }
+
+    for (unsigned i = 0; i < ins->numLanes(); i++) {
+        MOZ_ASSERT(ins->lane(i)->type() == MIRType::Int32);
+        // Note that there can be up to 16 lane arguments, so we can't assume
+        // that they all get an allocated register.
+        lir->setOperand(i + ins->numVectors(), use(ins->lane(i)));
+    }
+
+    assignSnapshot(lir, Bailout_BoundsCheck);
+    define(lir, ins);
+}
--- a/js/src/jit/x86-shared/Lowering-x86-shared.h
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.h
@@ -59,16 +59,17 @@ class LIRGeneratorX86Shared : public LIR
     void visitSimdInsertElement(MSimdInsertElement* ins);
     void visitSimdExtractElement(MSimdExtractElement* ins);
     void visitSimdBinaryArith(MSimdBinaryArith* ins);
     void visitSimdBinarySaturating(MSimdBinarySaturating* ins);
     void visitSimdSelect(MSimdSelect* ins);
     void visitSimdSplat(MSimdSplat* ins);
     void visitSimdSwizzle(MSimdSwizzle* ins);
     void visitSimdShuffle(MSimdShuffle* ins);
+    void visitSimdGeneralShuffle(MSimdGeneralShuffle* ins);
     void visitSimdValueX4(MSimdValueX4* ins);
     void lowerCompareExchangeTypedArrayElement(MCompareExchangeTypedArrayElement* ins,
                                                bool useI386ByteRegisters);
     void lowerAtomicExchangeTypedArrayElement(MAtomicExchangeTypedArrayElement* ins,
                                               bool useI386ByteRegisters);
     void lowerAtomicTypedArrayElementBinop(MAtomicTypedArrayElementBinop* ins,
                                            bool useI386ByteRegisters);
 };
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
@@ -624,22 +624,28 @@ class MacroAssemblerX86Shared : public A
                 masm->pop(substitute_);
         }
 
         Register reg() {
             return substitute_;
         }
     };
 
+    void load8ZeroExtend(const Operand& src, Register dest) {
+        movzbl(src, dest);
+    }
     void load8ZeroExtend(const Address& src, Register dest) {
         movzbl(Operand(src), dest);
     }
     void load8ZeroExtend(const BaseIndex& src, Register dest) {
         movzbl(Operand(src), dest);
     }
+    void load8SignExtend(const Operand& src, Register dest) {
+        movsbl(src, dest);
+    }
     void load8SignExtend(const Address& src, Register dest) {
         movsbl(Operand(src), dest);
     }
     void load8SignExtend(const BaseIndex& src, Register dest) {
         movsbl(Operand(src), dest);
     }
     template <typename T>
     void store8(Imm32 src, const T& dest) {
@@ -677,16 +683,19 @@ class MacroAssemblerX86Shared : public A
     }
     template <typename T>
     void atomicExchange8SignExtend(const T& mem, Register value, Register output) {
         if (value != output)
             movl(value, output);
         xchgb(output, Operand(mem));
         movsbl(output, output);
     }
+    void load16ZeroExtend(const Operand& src, Register dest) {
+        movzwl(src, dest);
+    }
     void load16ZeroExtend(const Address& src, Register dest) {
         movzwl(Operand(src), dest);
     }
     void load16ZeroExtend(const BaseIndex& src, Register dest) {
         movzwl(Operand(src), dest);
     }
     template <typename S, typename T>
     void store16(const S& src, const T& dest) {
@@ -717,16 +726,19 @@ class MacroAssemblerX86Shared : public A
     }
     template <typename T>
     void atomicExchange16SignExtend(const T& mem, Register value, Register output) {
         if (value != output)
             movl(value, output);
         xchgw(output, Operand(mem));
         movswl(output, output);
     }
+    void load16SignExtend(const Operand& src, Register dest) {
+        movswl(src, dest);
+    }
     void load16SignExtend(const Address& src, Register dest) {
         movswl(Operand(src), dest);
     }
     void load16SignExtend(const BaseIndex& src, Register dest) {
         movswl(Operand(src), dest);
     }
     void load32(const Address& address, Register dest) {
         movl(Operand(address), dest);
@@ -1327,44 +1339,72 @@ class MacroAssemblerX86Shared : public A
     template<typename T>
     void atomicExchangeToTypedIntArray(Scalar::Type arrayType, const T& mem, Register value,
                                        Register temp, AnyRegister output);
 
   protected:
     bool buildOOLFakeExitFrame(void* fakeReturnAddr);
 };
 
-template <> inline void
-MacroAssemblerX86Shared::loadAlignedVector<int32_t>(const Address& src, FloatRegister dest) {
-    loadAlignedSimd128Int(src, dest);
-}
-template <> inline void
-MacroAssemblerX86Shared::loadAlignedVector<float>(const Address& src, FloatRegister dest) {
+// Specialize for float to use movaps. Use movdqa for everything else.
+template <>
+inline void
+MacroAssemblerX86Shared::loadAlignedVector<float>(const Address& src, FloatRegister dest)
+{
     loadAlignedSimd128Float(src, dest);
 }
 
-template <> inline void
-MacroAssemblerX86Shared::storeAlignedVector<int32_t>(FloatRegister src, const Address& dest) {
-    storeAlignedSimd128Int(src, dest);
+template <typename T>
+inline void
+MacroAssemblerX86Shared::loadAlignedVector(const Address& src, FloatRegister dest)
+{
+    loadAlignedSimd128Int(src, dest);
 }
-template <> inline void
-MacroAssemblerX86Shared::storeAlignedVector<float>(FloatRegister src, const Address& dest) {
+
+// Specialize for float to use movaps. Use movdqa for everything else.
+template <>
+inline void
+MacroAssemblerX86Shared::storeAlignedVector<float>(FloatRegister src, const Address& dest)
+{
     storeAlignedSimd128Float(src, dest);
 }
 
+template <typename T>
+inline void
+MacroAssemblerX86Shared::storeAlignedVector(FloatRegister src, const Address& dest)
+{
+    storeAlignedSimd128Int(src, dest);
+}
+
+template <> inline void
+MacroAssemblerX86Shared::loadScalar<int8_t>(const Operand& src, Register dest) {
+    load8ZeroExtend(src, dest);
+}
+template <> inline void
+MacroAssemblerX86Shared::loadScalar<int16_t>(const Operand& src, Register dest) {
+    load16ZeroExtend(src, dest);
+}
 template <> inline void
 MacroAssemblerX86Shared::loadScalar<int32_t>(const Operand& src, Register dest) {
     load32(src, dest);
 }
 template <> inline void
 MacroAssemblerX86Shared::loadScalar<float>(const Operand& src, FloatRegister dest) {
     loadFloat32(src, dest);
 }
 
 template <> inline void
+MacroAssemblerX86Shared::storeScalar<int8_t>(Register src, const Address& dest) {
+    store8(src, dest);
+}
+template <> inline void
+MacroAssemblerX86Shared::storeScalar<int16_t>(Register src, const Address& dest) {
+    store16(src, dest);
+}
+template <> inline void
 MacroAssemblerX86Shared::storeScalar<int32_t>(Register src, const Address& dest) {
     store32(src, dest);
 }
 template <> inline void
 MacroAssemblerX86Shared::storeScalar<float>(FloatRegister src, const Address& dest) {
     vmovss(src, dest);
 }