Bug 1135039: Generalize SimdGeneralShuffle codegen; r=sunfish
author: Benjamin Bouvier <benj@benj.me>
date: Thu, 12 Mar 2015 14:58:59 +0100
changeset 234684 6abbe0f834790dd4580b23bfa40fbc0ec0cb5909
parent 234683 71a8d6e735ef718ea9aed9d29e4153907c8a3e10
child 234685 62b55b1f9b7e19be8a69d45076c391bc57439561
push id: 57209
push user: benj@benj.me
push date: Fri, 20 Mar 2015 15:21:39 +0000
reviewers: sunfish
bugs: 1135039
milestone: 39.0a1
files:
js/src/jit/shared/CodeGenerator-x86-shared.cpp
js/src/jit/shared/CodeGenerator-x86-shared.h
js/src/jit/shared/MacroAssembler-x86-shared.h
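
This patch folds the two near-identical lane-by-lane shuffle visitors (int32x4 and float32x4) into a single template, visitSimdGeneralShuffle<T, Reg>, parameterized on the lane's scalar type and on the register class used to move one lane at a time. The type-specific instruction selection moves into four explicitly specialized MacroAssembler helpers (loadScalar, storeScalar, loadAlignedVector, storeAlignedVector). A minimal standalone sketch of that dispatch pattern, with hypothetical names rather than the SpiderMonkey API:

    #include <cstdint>
    #include <cstdio>

    struct MiniMasm {
        // Declared here, explicitly specialized per scalar type below.
        template <class T> void loadScalar(const char *srcName);
    };

    template <> void MiniMasm::loadScalar<int32_t>(const char *srcName) {
        std::printf("load32      %s\n", srcName);  // integer lane: GPR load
    }
    template <> void MiniMasm::loadScalar<float>(const char *srcName) {
        std::printf("loadFloat32 %s\n", srcName);  // float lane: XMM load
    }

    // Generic codegen driver, analogous to visitSimdGeneralShuffle<T, Reg>:
    // the T parameter picks the right typed "instruction" at compile time.
    template <class T> void emitLaneCopy(MiniMasm &masm, const char *lane) {
        masm.loadScalar<T>(lane);
    }

    int main() {
        MiniMasm masm;
        emitLaneCopy<int32_t>(masm, "lane0");  // emits load32
        emitLaneCopy<float>(masm, "lane0");    // emits loadFloat32
    }
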
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -2380,120 +2380,79 @@ CodeGeneratorX86Shared::visitSimdSignMas
 {
     FloatRegister input = ToFloatRegister(ins->input());
     Register output = ToRegister(ins->output());
 
     // For Float32x4 and Int32x4.
     masm.vmovmskps(input, output);
 }
 
-void
-CodeGeneratorX86Shared::visitSimdGeneralShuffleI(LSimdGeneralShuffleI *ins)
+template <class T, class Reg> void
+CodeGeneratorX86Shared::visitSimdGeneralShuffle(LSimdGeneralShuffleBase *ins, Reg tempRegister)
 {
     MSimdGeneralShuffle *mir = ins->mir();
     unsigned numVectors = mir->numVectors();
 
-    Register temp = ToRegister(ins->temp());
+    Register laneTemp = ToRegister(ins->temp());
 
     // This won't generate fast code, but it's fine because we expect users
     // to have used constant indices (and thus MSimdGeneralShuffle to be
     // folded into MSimdSwizzle/MSimdShuffle, which are fast).
     masm.reserveStack(Simd128DataSize * numVectors);
 
-    for (unsigned i = 0; i < numVectors; i++)
-        masm.storeAlignedInt32x4(ToFloatRegister(ins->vector(i)),
-                                 Address(StackPointer, Simd128DataSize * (1 + i)));
+    for (unsigned i = 0; i < numVectors; i++) {
+        masm.storeAlignedVector<T>(ToFloatRegister(ins->vector(i)),
+                                   Address(StackPointer, Simd128DataSize * (1 + i)));
+    }
 
     Label bail;
 
     for (size_t i = 0; i < mir->numLanes(); i++) {
         Operand lane = ToOperand(ins->lane(i));
 
         masm.cmp32(lane, Imm32(mir->numVectors() * mir->numLanes() - 1));
         masm.j(Assembler::Above, &bail);
 
         if (lane.kind() == Operand::REG) {
-            masm.load32(Operand(StackPointer, ToRegister(ins->lane(i)), TimesFour, Simd128DataSize),
-                        temp);
+            masm.loadScalar<T>(Operand(StackPointer, ToRegister(ins->lane(i)), TimesFour, Simd128DataSize),
+                               tempRegister);
         } else {
-            masm.load32(lane, temp);
-            masm.load32(Operand(StackPointer, temp, TimesFour, Simd128DataSize), temp);
+            masm.load32(lane, laneTemp);
+            masm.loadScalar<T>(Operand(StackPointer, laneTemp, TimesFour, Simd128DataSize), tempRegister);
         }
 
-        masm.store32(temp, Address(StackPointer, i * sizeof(int32_t)));
+        masm.storeScalar<T>(tempRegister, Address(StackPointer, i * sizeof(T)));
     }
 
     FloatRegister output = ToFloatRegister(ins->output());
-    masm.loadAlignedInt32x4(Address(StackPointer, 0), output);
+    masm.loadAlignedVector<T>(Address(StackPointer, 0), output);
 
     Label join;
     masm.jump(&join);
 
     {
         masm.bind(&bail);
         masm.freeStack(Simd128DataSize * numVectors);
         bailout(ins->snapshot());
     }
 
     masm.bind(&join);
     masm.setFramePushed(masm.framePushed() + Simd128DataSize * numVectors);
     masm.freeStack(Simd128DataSize * numVectors);
 }
 
 void
+CodeGeneratorX86Shared::visitSimdGeneralShuffleI(LSimdGeneralShuffleI *ins)
+{
+    visitSimdGeneralShuffle<int32_t, Register>(ins, ToRegister(ins->temp()));
+}
+
+void
 CodeGeneratorX86Shared::visitSimdGeneralShuffleF(LSimdGeneralShuffleF *ins)
 {
-    MSimdGeneralShuffle *mir = ins->mir();
-    unsigned numVectors = mir->numVectors();
-
-    Register temp = ToRegister(ins->temp());
-
-    // This won't generate fast code, but it's fine because we expect users
-    // to have used constant indices (and thus MSimdGeneralShuffle to be fold
-    // into MSimdSwizzle/MSimdShuffle, which are fast).
-    masm.reserveStack(Simd128DataSize * numVectors);
-
-    for (unsigned i = 0; i < numVectors; i++)
-        masm.storeAlignedFloat32x4(ToFloatRegister(ins->vector(i)),
-                                   Address(StackPointer, Simd128DataSize * (1 + i)));
-
-    Label bail;
-
-    for (size_t i = 0; i < mir->numLanes(); i++) {
-        Operand lane = ToOperand(ins->lane(i));
-
-        masm.cmp32(lane, Imm32(mir->numVectors() * mir->numLanes() - 1));
-        masm.j(Assembler::Above, &bail);
-
-        if (lane.kind() == Operand::REG) {
-            masm.loadFloat32(Operand(StackPointer, ToRegister(ins->lane(i)), TimesFour, Simd128DataSize),
-                            ScratchFloat32Reg);
-        } else {
-            masm.load32(lane, temp);
-            masm.loadFloat32(Operand(StackPointer, temp, TimesFour, Simd128DataSize), ScratchFloat32Reg);
-        }
-
-        masm.storeFloat32(ScratchFloat32Reg, Address(StackPointer, i * sizeof(int32_t)));
-    }
-
-    FloatRegister output = ToFloatRegister(ins->output());
-    masm.loadAlignedFloat32x4(Address(StackPointer, 0), output);
-
-    Label join;
-    masm.jump(&join);
-
-    {
-        masm.bind(&bail);
-        masm.freeStack(Simd128DataSize * numVectors);
-        bailout(ins->snapshot());
-    }
-
-    masm.bind(&join);
-    masm.setFramePushed(masm.framePushed() + Simd128DataSize * numVectors);
-    masm.freeStack(Simd128DataSize * numVectors);
+    visitSimdGeneralShuffle<float, FloatRegister>(ins, ScratchFloat32Reg);
 }
 
 void
 CodeGeneratorX86Shared::visitSimdSwizzleI(LSimdSwizzleI *ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     FloatRegister output = ToFloatRegister(ins->output());
 
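
Note the asymmetry between the two thin wrappers above: the integer variant passes ToRegister(ins->temp()) as the value register, so the same GPR temp serves both as the lane-index scratch and as the lane value being copied, while the float variant keeps the GPR temp for lane indices only and routes lane values through ScratchFloat32Reg, because loadScalar<float>/storeScalar<float> operate on a FloatRegister.
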
--- a/js/src/jit/shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.h
@@ -253,31 +253,33 @@ class CodeGeneratorX86Shared : public Co
     void visitInt32x4ToFloat32x4(LInt32x4ToFloat32x4 *ins);
     void visitFloat32x4ToInt32x4(LFloat32x4ToInt32x4 *ins);
     void visitSimdReinterpretCast(LSimdReinterpretCast *lir);
     void visitSimdExtractElementI(LSimdExtractElementI *lir);
     void visitSimdExtractElementF(LSimdExtractElementF *lir);
     void visitSimdInsertElementI(LSimdInsertElementI *lir);
     void visitSimdInsertElementF(LSimdInsertElementF *lir);
     void visitSimdSignMaskX4(LSimdSignMaskX4 *ins);
-    void visitSimdGeneralShuffleI(LSimdGeneralShuffleI *lir);
-    void visitSimdGeneralShuffleF(LSimdGeneralShuffleF *lir);
     void visitSimdSwizzleI(LSimdSwizzleI *lir);
     void visitSimdSwizzleF(LSimdSwizzleF *lir);
     void visitSimdShuffle(LSimdShuffle *lir);
     void visitSimdUnaryArithIx4(LSimdUnaryArithIx4 *lir);
     void visitSimdUnaryArithFx4(LSimdUnaryArithFx4 *lir);
     void visitSimdBinaryCompIx4(LSimdBinaryCompIx4 *lir);
     void visitSimdBinaryCompFx4(LSimdBinaryCompFx4 *lir);
     void visitSimdBinaryArithIx4(LSimdBinaryArithIx4 *lir);
     void visitSimdBinaryArithFx4(LSimdBinaryArithFx4 *lir);
     void visitSimdBinaryBitwiseX4(LSimdBinaryBitwiseX4 *lir);
     void visitSimdShift(LSimdShift *lir);
     void visitSimdSelect(LSimdSelect *ins);
 
+    template <class T, class Reg> void visitSimdGeneralShuffle(LSimdGeneralShuffleBase *lir, Reg temp);
+    void visitSimdGeneralShuffleI(LSimdGeneralShuffleI *lir);
+    void visitSimdGeneralShuffleF(LSimdGeneralShuffleF *lir);
+
     // Out of line visitors.
     void visitOutOfLineBailout(OutOfLineBailout *ool);
     void visitOutOfLineUndoALUOperation(OutOfLineUndoALUOperation *ool);
     void visitMulNegativeZeroCheck(MulNegativeZeroCheck *ool);
     void visitModOverflowCheck(ModOverflowCheck *ool);
     void visitReturnZero(ReturnZero *ool);
     void visitOutOfLineTableSwitch(OutOfLineTableSwitch *ool);
     void generateInvalidateEpilogue();
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@@ -969,16 +969,21 @@ class MacroAssemblerX86Shared : public A
     }
     void zeroFloat32x4(FloatRegister dest) {
         vxorps(dest, dest, dest);
     }
     void zeroInt32x4(FloatRegister dest) {
         vpxor(dest, dest, dest);
     }
 
+    template <class T, class Reg> inline void loadScalar(const Operand &src, Reg dest);
+    template <class T, class Reg> inline void storeScalar(Reg src, const Address &dest);
+    template <class T> inline void loadAlignedVector(const Address &src, FloatRegister dest);
+    template <class T> inline void storeAlignedVector(FloatRegister src, const Address &dest);
+
     void loadAlignedInt32x4(const Address &src, FloatRegister dest) {
         vmovdqa(Operand(src), dest);
     }
     void loadAlignedInt32x4(const Operand &src, FloatRegister dest) {
         vmovdqa(src, dest);
     }
     void storeAlignedInt32x4(FloatRegister src, const Address &dest) {
         vmovdqa(src, Operand(dest));
@@ -1407,15 +1412,51 @@ class MacroAssemblerX86Shared : public A
     void abiret() {
         ret();
     }
 
   protected:
     bool buildOOLFakeExitFrame(void *fakeReturnAddr);
 };
 
+template <> inline void
+MacroAssemblerX86Shared::loadAlignedVector<int32_t>(const Address &src, FloatRegister dest) {
+    loadAlignedInt32x4(src, dest);
+}
+template <> inline void
+MacroAssemblerX86Shared::loadAlignedVector<float>(const Address &src, FloatRegister dest) {
+    loadAlignedFloat32x4(src, dest);
+}
+
+template <> inline void
+MacroAssemblerX86Shared::storeAlignedVector<int32_t>(FloatRegister src, const Address &dest) {
+    storeAlignedInt32x4(src, dest);
+}
+template <> inline void
+MacroAssemblerX86Shared::storeAlignedVector<float>(FloatRegister src, const Address &dest) {
+    storeAlignedFloat32x4(src, dest);
+}
+
+template <> inline void
+MacroAssemblerX86Shared::loadScalar<int32_t>(const Operand &src, Register dest) {
+    load32(src, dest);
+}
+template <> inline void
+MacroAssemblerX86Shared::loadScalar<float>(const Operand &src, FloatRegister dest) {
+    loadFloat32(src, dest);
+}
+
+template <> inline void
+MacroAssemblerX86Shared::storeScalar<int32_t>(Register src, const Address &dest) {
+    store32(src, dest);
+}
+template <> inline void
+MacroAssemblerX86Shared::storeScalar<float>(FloatRegister src, const Address &dest) {
+    storeFloat32(src, dest);
+}
+
 } // namespace jit
 } // namespace js
 
 #undef CHECK_BYTEREG
 #undef CHECK_BYTEREGS
 
 #endif /* jit_shared_MacroAssembler_x86_shared_h */
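
For reference, here is a plain C++ model of what the emitted int32x4 sequence does at runtime: spill all input vectors to a contiguous stack area, guard each (possibly dynamic) lane index against numVectors * numLanes, copy the selected scalars into the result slot at offset 0, then reload the result as a vector. This is an illustrative model, not SpiderMonkey code. One caveat it makes visible: the patch stores vector i at offset Simd128DataSize * (1 + i) but reserves only Simd128DataSize * numVectors bytes, so the last input vector appears to land one 16-byte slot above the reservation; the model therefore sizes its area as numVectors + 1 slots.

    #include <array>
    #include <cstdint>
    #include <cstring>
    #include <stdexcept>
    #include <vector>

    using I32x4 = std::array<int32_t, 4>;
    constexpr size_t kLanes = 4;

    I32x4 generalShuffle(const std::vector<I32x4> &vectors,
                         const std::array<uint32_t, kLanes> &lanes)
    {
        // Stack-area model: slot 0 is the result being assembled; slots
        // 1..numVectors hold the spilled inputs (storeAlignedVector<T>
        // writes vector i at offset Simd128DataSize * (1 + i)).
        std::vector<int32_t> area((vectors.size() + 1) * kLanes);
        for (size_t i = 0; i < vectors.size(); i++)
            std::memcpy(&area[(1 + i) * kLanes], vectors[i].data(), sizeof(I32x4));

        for (size_t i = 0; i < kLanes; i++) {
            // cmp32 + j(Above, &bail): any index above
            // numVectors * numLanes - 1 bails out of compiled code;
            // modeled here as an exception.
            if (lanes[i] > vectors.size() * kLanes - 1)
                throw std::out_of_range("lane index out of range (bailout)");
            // loadScalar<T> from the indexed input slot, then
            // storeScalar<T> into the result slot at offset i * sizeof(T).
            area[i] = area[kLanes + lanes[i]];
        }

        // loadAlignedVector<T> of the assembled result at offset 0.
        I32x4 out;
        std::memcpy(out.data(), area.data(), sizeof(I32x4));
        return out;
    }

    int main() {
        I32x4 a{0, 1, 2, 3}, b{4, 5, 6, 7};
        I32x4 r = generalShuffle({a, b}, {7, 0, 3, 4});  // r == {7, 0, 3, 4}
        return r[0] == 7 ? 0 : 1;
    }
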