Bug 1021716: SIMD x86-x64: Implement MSimdExtractElement; r=sunfish
author: Benjamin Bouvier <benj@benj.me>
date: Wed, 13 Aug 2014 11:18:00 +0200
changeset: 199209 0cab1edaa959c017c9317f6ebd96ee734f059a88
parent: 199208 f6970e7f17e1dc0a113764c5c70222a430f59c10
child: 199210 228b3416dace505da68efa8f5fa7b390c77cc934
push id: 8185
push user: emorley@mozilla.com
push date: Wed, 13 Aug 2014 16:01:50 +0000
treeherder: fx-team@1601bb449713 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: sunfish
bugs: 1021716
milestone: 34.0a1
Bug 1021716: SIMD x86-x64: Implement MSimdExtractElement; r=sunfish
js/src/assembler/assembler/X86Assembler.h
js/src/jit/IonTypes.h
js/src/jit/LIR-Common.h
js/src/jit/LOpcodes.h
js/src/jit/Lowering.cpp
js/src/jit/Lowering.h
js/src/jit/MIR.h
js/src/jit/MOpcodes.h
js/src/jit/ParallelSafetyAnalysis.cpp
js/src/jit/arm/CodeGenerator-arm.h
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
js/src/jit/shared/CodeGenerator-x86-shared.h
js/src/jit/shared/MacroAssembler-x86-shared.h
--- a/js/src/assembler/assembler/X86Assembler.h
+++ b/js/src/assembler/assembler/X86Assembler.h
@@ -307,30 +307,32 @@ private:
         OP2_SQRTSD_VsdWsd   = 0x51,
         OP2_SQRTSS_VssWss   = 0x51,
         OP2_ANDPD_VpdWpd    = 0x54,
         OP2_ORPD_VpdWpd     = 0x56,
         OP2_XORPD_VpdWpd    = 0x57,
         OP2_MOVD_VdEd       = 0x6E,
         OP2_MOVDQ_VsdWsd    = 0x6F,
         OP2_MOVDQ_VdqWdq    = 0x6F,
+        OP2_PSHUFD_VdqWdqIb = 0x70,
         OP2_PSRLDQ_Vd       = 0x73,
         OP2_PCMPEQW         = 0x75,
         OP2_MOVD_EdVd       = 0x7E,
         OP2_MOVDQ_WdqVdq    = 0x7F,
         OP2_JCC_rel32       = 0x80,
         OP_SETCC            = 0x90,
         OP2_IMUL_GvEv       = 0xAF,
         OP2_CMPXCHG_GvEw    = 0xB1,
         OP2_MOVSX_GvEb      = 0xBE,
         OP2_MOVSX_GvEw      = 0xBF,
         OP2_MOVZX_GvEb      = 0xB6,
         OP2_MOVZX_GvEw      = 0xB7,
         OP2_XADD_EvGv       = 0xC1,
-        OP2_PEXTRW_GdUdIb   = 0xC5
+        OP2_PEXTRW_GdUdIb   = 0xC5,
+        OP2_SHUFPS_VpsWpsIb = 0xC6
     } TwoByteOpcodeID;
 
     typedef enum {
         OP3_ROUNDSS_VsdWsd  = 0x0A,
         OP3_ROUNDSD_VsdWsd  = 0x0B,
         OP3_PTEST_VdVd      = 0x17,
         OP3_PINSRD_VsdWsd   = 0x22
     } ThreeByteOpcodeID;
@@ -2582,16 +2584,35 @@ public:
     void movd_rr(RegisterID src, XMMRegisterID dst)
     {
         spew("movd       %s, %s",
              nameIReg(src), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.twoByteOp(OP2_MOVD_VdEd, (RegisterID)dst, src);
     }
 
+    void pshufd_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst) // Encodes pshufd: PRE_SSE_66 prefix, two-byte opcode, then mask as an imm8.
+    {
+        JS_ASSERT(mask < 256); // The mask is narrowed to uint8_t below, so it must fit in one byte.
+        spew("pshufd      0x%x, %s, %s",
+             mask, nameFPReg(src), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66); // Unlike shufps_irr, this encoding is preceded by a prefix.
+        m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, (RegisterID)dst, (RegisterID)src); // dst is passed first, src second.
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
+    void shufps_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst) // Encodes shufps: two-byte opcode (no prefix), then mask as an imm8.
+    {
+        JS_ASSERT(mask < 256); // The mask is narrowed to uint8_t below, so it must fit in one byte.
+        spew("shufps     0x%x, %s, %s",
+             mask, nameFPReg(src), nameFPReg(dst));
+        m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, (RegisterID)dst, (RegisterID)src); // dst is passed first, src second.
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
     void psrldq_ir(int shift, XMMRegisterID dest)
     {
         spew("psrldq     $%d, %s",
              shift, nameFPReg(dest));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.twoByteOp(OP2_PSRLDQ_Vd, (RegisterID)3, (RegisterID)dest);
         m_formatter.immediate8(shift);
     }
--- a/js/src/jit/IonTypes.h
+++ b/js/src/jit/IonTypes.h
@@ -446,16 +446,25 @@ SimdTypeToScalarType(MIRType type)
         return MIRType_Int32;
       case MIRType_Float32x4:
         return MIRType_Float32;
       default: break;
     }
     MOZ_ASSUME_UNREACHABLE("unexpected SIMD kind");
 }
 
+// Indicates a lane in a SIMD register: X for the first lane, Y for the second,
+// Z for the third (if any), W for the fourth (if any).
+enum SimdLane { // The numeric values matter: the x86 ComputeShuffleMask packs them as 2-bit selectors.
+    LaneX = 0x0,
+    LaneY = 0x1,
+    LaneZ = 0x2,
+    LaneW = 0x3
+};
+
 #ifdef DEBUG
 
 // Track the pipeline of opcodes which has produced a snapshot.
 #define TRACK_SNAPSHOTS 1
 
 // Make sure registers are not modified between an instruction and
 // its OsiPoint.
 #define CHECK_OSIPOINT_REGISTERS 1
--- a/js/src/jit/LIR-Common.h
+++ b/js/src/jit/LIR-Common.h
@@ -123,16 +123,54 @@ class LMoveGroup : public LInstructionHe
     size_t numMoves() const {
         return moves_.length();
     }
     const LMove &getMove(size_t i) const {
         return moves_[i];
     }
 };
 
+// Extracts the element at a given lane of a SIMD int32x4 vector.
+class LSimdExtractElementI : public LInstructionHelper<1, 1, 0>
+{
+    SimdLane lane_; // Lane to extract; fixed at construction.
+
+  public:
+    LIR_HEADER(SimdExtractElementI);
+
+    LSimdExtractElementI(const LAllocation &base, SimdLane lane) : lane_(lane) {
+        setOperand(0, base); // Operand 0 is the vector being read.
+    }
+    const LAllocation *getBase() { // Returns operand 0, i.e. the allocation given to the constructor.
+        return getOperand(0);
+    }
+    SimdLane lane() const {
+        return lane_;
+    }
+};
+
+// Extracts the element at a given lane of a SIMD float32x4 vector.
+class LSimdExtractElementF : public LInstructionHelper<1, 1, 0>
+{
+    SimdLane lane_; // Lane to extract; fixed at construction.
+
+  public:
+    LIR_HEADER(SimdExtractElementF);
+
+    LSimdExtractElementF(const LAllocation &base, SimdLane lane) : lane_(lane) {
+        setOperand(0, base); // Operand 0 is the vector being read.
+    }
+    const LAllocation *getBase() { // Returns operand 0, i.e. the allocation given to the constructor.
+        return getOperand(0);
+    }
+    SimdLane lane() const {
+        return lane_;
+    }
+};
+
 // Constant 32-bit integer.
 class LInteger : public LInstructionHelper<1, 0, 0>
 {
     int32_t i32_;
 
   public:
     LIR_HEADER(Integer)
 
--- a/js/src/jit/LOpcodes.h
+++ b/js/src/jit/LOpcodes.h
@@ -11,16 +11,18 @@
     _(Label)                        \
     _(Nop)                          \
     _(OsiPoint)                     \
     _(MoveGroup)                    \
     _(Integer)                      \
     _(Pointer)                      \
     _(Double)                       \
     _(Float32)                      \
+    _(SimdExtractElementI)          \
+    _(SimdExtractElementF)          \
     _(Value)                        \
     _(CloneLiteral)                 \
     _(Parameter)                    \
     _(Callee)                       \
     _(TableSwitch)                  \
     _(TableSwitchV)                 \
     _(Goto)                         \
     _(NewArray)                     \
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@@ -3633,16 +3633,38 @@ bool
 LIRGenerator::visitRecompileCheck(MRecompileCheck *ins)
 {
     LRecompileCheck *lir = new(alloc()) LRecompileCheck(temp());
     if (!add(lir, ins))
         return false;
     return assignSafepoint(lir, ins);
 }
 
+bool
+LIRGenerator::visitSimdExtractElement(MSimdExtractElement *ins) // Lowers to LSimdExtractElementI or LSimdExtractElementF, chosen by the input vector type.
+{
+    JS_ASSERT(IsSimdType(ins->input()->type())); // Input must be a SIMD vector...
+    JS_ASSERT(!IsSimdType(ins->type())); // ...and the result must be a scalar.
+
+    if (ins->input()->type() == MIRType_Int32x4) {
+        // Note: there could be int16x8 in the future, which doesn't use the
+        // same instruction. We either need to pass the arity or create new LIns.
+        LUse use = useRegisterAtStart(ins->input());
+        return define(new(alloc()) LSimdExtractElementI(use, ins->lane()), ins);
+    }
+
+    if (ins->input()->type() == MIRType_Float32x4) {
+        LUse use = useRegisterAtStart(ins->input());
+        return define(new(alloc()) LSimdExtractElementF(use, ins->lane()), ins);
+    }
+
+    MOZ_ASSUME_UNREACHABLE("Unknown SIMD kind when extracting element"); // Only Int32x4 and Float32x4 inputs are handled above.
+    return false; // NOTE(review): presumably only here to satisfy compilers that want a return; confirm.
+}
+
 static void
 SpewResumePoint(MBasicBlock *block, MInstruction *ins, MResumePoint *resumePoint)
 {
     fprintf(IonSpewFile, "Current resume point %p details:\n", (void *)resumePoint);
     fprintf(IonSpewFile, "    frame count: %u\n", resumePoint->frameCount());
 
     if (ins) {
         fprintf(IonSpewFile, "    taken after: ");
--- a/js/src/jit/Lowering.h
+++ b/js/src/jit/Lowering.h
@@ -258,14 +258,15 @@ class LIRGenerator : public LIRGenerator
     bool visitAsmJSReturn(MAsmJSReturn *ins);
     bool visitAsmJSVoidReturn(MAsmJSVoidReturn *ins);
     bool visitAsmJSPassStackArg(MAsmJSPassStackArg *ins);
     bool visitAsmJSCall(MAsmJSCall *ins);
     bool visitSetDOMProperty(MSetDOMProperty *ins);
     bool visitGetDOMProperty(MGetDOMProperty *ins);
     bool visitGetDOMMember(MGetDOMMember *ins);
     bool visitRecompileCheck(MRecompileCheck *ins);
+    bool visitSimdExtractElement(MSimdExtractElement *ins);
 };
 
 } // namespace jit
 } // namespace js
 
 #endif /* jit_Lowering_h */
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@@ -1228,16 +1228,57 @@ class MConstant : public MNullaryInstruc
     void computeRange(TempAllocator &alloc);
     bool truncate(TruncateKind kind);
 
     bool canProduceFloat32() const;
 
     ALLOW_CLONE(MConstant)
 };
 
+// Extracts the scalar element at a given lane of a SIMD vector value.
+class MSimdExtractElement : public MUnaryInstruction
+{
+  protected:
+    SimdLane lane_; // Lane read from the vector operand.
+
+    MSimdExtractElement(MDefinition *obj, MIRType type, SimdLane lane) // obj: the vector operand; type: the scalar result type.
+      : MUnaryInstruction(obj), lane_(lane)
+    {
+        JS_ASSERT(IsSimdType(obj->type()));
+        JS_ASSERT(uint32_t(lane) < SimdTypeToLength(obj->type())); // The lane index must be within the vector's length.
+        JS_ASSERT(!IsSimdType(type));
+        JS_ASSERT(SimdTypeToScalarType(obj->type()) == type); // The result type must be the vector's scalar type.
+        setResultType(type);
+    }
+
+  public:
+    INSTRUCTION_HEADER(SimdExtractElement);
+    static MSimdExtractElement *NewAsmJS(TempAllocator &alloc, MDefinition *obj, MIRType type,
+                                         SimdLane lane)
+    {
+        return new(alloc) MSimdExtractElement(obj, type, lane);
+    }
+
+    SimdLane lane() const {
+        return lane_;
+    }
+
+    AliasSet getAliasSet() const { // Reports that this instruction aliases nothing.
+        return AliasSet::None();
+    }
+    bool congruentTo(const MDefinition *ins) const { // Congruent only to another extract of the same lane with equal operands.
+        if (!ins->isSimdExtractElement())
+            return false;
+        const MSimdExtractElement *other = ins->toSimdExtractElement();
+        if (other->lane_ != lane_)
+            return false;
+        return congruentIfOperandsEqual(other);
+    }
+};
+
 // Deep clone a constant JSObject.
 class MCloneLiteral
   : public MUnaryInstruction,
     public ObjectPolicy<0>
 {
   protected:
     explicit MCloneLiteral(MDefinition *obj)
       : MUnaryInstruction(obj)
--- a/js/src/jit/MOpcodes.h
+++ b/js/src/jit/MOpcodes.h
@@ -7,16 +7,17 @@
 #ifndef jit_MOpcodes_h
 #define jit_MOpcodes_h
 
 namespace js {
 namespace jit {
 
 #define MIR_OPCODE_LIST(_)                                                  \
     _(Constant)                                                             \
+    _(SimdExtractElement)                                                   \
     _(CloneLiteral)                                                         \
     _(Parameter)                                                            \
     _(Callee)                                                               \
     _(TableSwitch)                                                          \
     _(Goto)                                                                 \
     _(Test)                                                                 \
     _(TypeObjectDispatch)                                                   \
     _(FunctionDispatch)                                                     \
--- a/js/src/jit/ParallelSafetyAnalysis.cpp
+++ b/js/src/jit/ParallelSafetyAnalysis.cpp
@@ -107,16 +107,17 @@ class ParallelSafetyVisitor : public MDe
     }
 
     bool convertToBailout(MInstructionIterator &iter);
 
     // I am taking the policy of blacklisting everything that's not
     // obviously safe for now.  We can loosen as we need.
 
     SAFE_OP(Constant)
+    SAFE_OP(SimdExtractElement)
     UNSAFE_OP(CloneLiteral)
     SAFE_OP(Parameter)
     SAFE_OP(Callee)
     SAFE_OP(TableSwitch)
     SAFE_OP(Goto)
     SAFE_OP(Test)
     SAFE_OP(Compare)
     SAFE_OP(Phi)
--- a/js/src/jit/arm/CodeGenerator-arm.h
+++ b/js/src/jit/arm/CodeGenerator-arm.h
@@ -219,16 +219,21 @@ class CodeGeneratorARM : public CodeGene
             }
         }
     }
 
     bool visitEffectiveAddress(LEffectiveAddress *ins);
     bool visitUDiv(LUDiv *ins);
     bool visitUMod(LUMod *ins);
     bool visitSoftUDivOrMod(LSoftUDivOrMod *ins);
+
+  public:
+    // Unimplemented SIMD instructions: these stubs only satisfy the visitor interface and must never be reached on ARM.
+    bool visitSimdExtractElementI(LSimdExtractElementI *ins) { MOZ_ASSUME_UNREACHABLE("NYI"); }
+    bool visitSimdExtractElementF(LSimdExtractElementF *ins) { MOZ_ASSUME_UNREACHABLE("NYI"); }
 };
 
 typedef CodeGeneratorARM CodeGeneratorSpecific;
 
 // An out-of-line bailout thunk.
 class OutOfLineBailout : public OutOfLineCodeBase<CodeGeneratorARM>
 {
   protected: // Silence Clang warning.
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -1458,16 +1458,24 @@ class AssemblerX86Shared : public Assemb
     void movd(Register src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.movd_rr(src.code(), dest.code());
     }
     void movd(FloatRegister src, Register dest) {
         JS_ASSERT(HasSSE2());
         masm.movd_rr(src.code(), dest.code());
     }
+    void pshufd(uint32_t mask, FloatRegister src, FloatRegister dest) { // Forwards to the raw encoder masm.pshufd_irr using the registers' codes.
+        JS_ASSERT(HasSSE2());
+        masm.pshufd_irr(mask, src.code(), dest.code());
+    }
+    void shufps(uint32_t mask, FloatRegister src, FloatRegister dest) { // Forwards to the raw encoder masm.shufps_irr using the registers' codes.
+        JS_ASSERT(HasSSE2());
+        masm.shufps_irr(mask, src.code(), dest.code());
+    }
     void addsd(FloatRegister src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.addsd_rr(src.code(), dest.code());
     }
     void addss(FloatRegister src, FloatRegister dest) {
         JS_ASSERT(HasSSE2());
         masm.addss_rr(src.code(), dest.code());
     }
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -2049,16 +2049,53 @@ CodeGeneratorX86Shared::visitNegF(LNegF 
     FloatRegister input = ToFloatRegister(ins->input());
     JS_ASSERT(input == ToFloatRegister(ins->output()));
 
     masm.negateFloat(input);
     return true;
 }
 
 bool
+CodeGeneratorX86Shared::visitSimdExtractElementI(LSimdExtractElementI *ins) // Moves one int32 lane of a SIMD register into a general-purpose register.
+{
+    FloatRegister input = ToFloatRegister(ins->input());
+    Register output = ToRegister(ins->output());
+
+    SimdLane lane = ins->lane();
+    if (lane == LaneX) {
+        // The value we want to extract is in the low double-word
+        masm.moveLowInt32(input, output);
+    } else {
+        uint32_t mask = MacroAssembler::ComputeShuffleMask(lane); // Requested lane goes in the lowest selector; the others default to LaneX.
+        masm.shuffleInt32(mask, input, ScratchSimdReg); // Shuffle into the scratch register so input itself is not written.
+        masm.moveLowInt32(ScratchSimdReg, output);
+    }
+    return true;
+}
+
+bool
+CodeGeneratorX86Shared::visitSimdExtractElementF(LSimdExtractElementF *ins) // Places one float32 lane of a SIMD register into the output float register.
+{
+    FloatRegister input = ToFloatRegister(ins->input());
+    FloatRegister output = ToFloatRegister(ins->output());
+
+    SimdLane lane = ins->lane();
+    if (lane == LaneX) {
+        // The value we want to extract is in the low double-word
+        if (input != output) // No move is needed when both allocations are the same register.
+            masm.moveFloat32(input, output);
+    } else {
+        uint32_t mask = MacroAssembler::ComputeShuffleMask(lane); // Requested lane goes in the lowest selector; the others default to LaneX.
+        masm.shuffleFloat32(mask, input, output); // shuffleFloat32 copies input into output first when they differ.
+    }
+    masm.canonicalizeFloat(output); // NOTE(review): runs on both paths; presumably normalizes NaN bit patterns - confirm.
+    return true;
+}
+
+bool
 CodeGeneratorX86Shared::visitForkJoinGetSlice(LForkJoinGetSlice *ins)
 {
     MOZ_ASSERT(gen->info().executionMode() == ParallelExecution);
     MOZ_ASSERT(ToRegister(ins->forkJoinContext()) == ForkJoinGetSliceReg_cx);
     MOZ_ASSERT(ToRegister(ins->temp1()) == eax);
     MOZ_ASSERT(ToRegister(ins->temp2()) == edx);
     MOZ_ASSERT(ToRegister(ins->temp3()) == ForkJoinGetSliceReg_temp0);
     MOZ_ASSERT(ToRegister(ins->temp4()) == ForkJoinGetSliceReg_temp1);
--- a/js/src/jit/shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.h
@@ -199,16 +199,20 @@ class CodeGeneratorX86Shared : public Co
     bool visitOutOfLineLoadTypedArrayOutOfBounds(OutOfLineLoadTypedArrayOutOfBounds *ool);
 
     bool visitForkJoinGetSlice(LForkJoinGetSlice *ins);
 
     bool visitNegI(LNegI *lir);
     bool visitNegD(LNegD *lir);
     bool visitNegF(LNegF *lir);
 
+    // SIMD operators
+    bool visitSimdExtractElementI(LSimdExtractElementI *lir);
+    bool visitSimdExtractElementF(LSimdExtractElementF *lir);
+
     // Out of line visitors.
     bool visitOutOfLineBailout(OutOfLineBailout *ool);
     bool visitOutOfLineUndoALUOperation(OutOfLineUndoALUOperation *ool);
     bool visitMulNegativeZeroCheck(MulNegativeZeroCheck *ool);
     bool visitModOverflowCheck(ModOverflowCheck *ool);
     bool visitReturnZero(ReturnZero *ool);
     bool visitOutOfLineTableSwitch(OutOfLineTableSwitch *ool);
     bool generateInvalidateEpilogue();
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@@ -494,16 +494,45 @@ class MacroAssemblerX86Shared : public A
     }
     void loadUnalignedFloat32x4(const Address &src, FloatRegister dest) {
         movups(Operand(src), dest);
     }
     void storeUnalignedFloat32x4(FloatRegister src, const Address &dest) {
         movups(src, Operand(dest));
     }
 
+    static uint32_t ComputeShuffleMask(SimdLane x, SimdLane y = LaneX, // Packs four lane selectors into one byte: x in bits 0-1, y in 2-3, z in 4-5, w in 6-7.
+                                       SimdLane z = LaneX, SimdLane w = LaneX)
+    {
+        uint32_t r = (uint32_t(w) << 6) |
+                     (uint32_t(z) << 4) |
+                     (uint32_t(y) << 2) |
+                     uint32_t(x);
+        JS_ASSERT(r < 256); // Holds as long as every argument is one of the four SimdLane values (0-3).
+        return r;
+    }
+
+    void shuffleInt32(uint32_t mask, FloatRegister src, FloatRegister dest) { // Thin wrapper: emits pshufd with the given mask, reading src and writing dest.
+        pshufd(mask, src, dest);
+    }
+    void moveLowInt32(FloatRegister src, Register dest) { // Thin wrapper: emits movd from the float/SIMD register src to the general-purpose register dest.
+        movd(src, dest);
+    }
+
+    void shuffleFloat32(uint32_t mask, FloatRegister src, FloatRegister dest) { // Shuffles the lanes of src into dest according to mask, via shufps.
+        // The shuffle instruction on x86 is such that it moves 2 words from
+        // the dest and 2 words from the src operands. To simplify things, just
+        // clobber the output with the input and apply the instruction
+        // afterwards.
+        // Note: this is useAtStart-safe because src isn't read afterwards.
+        if (src != dest) // Skip the copy when the two registers are already the same.
+            moveAlignedFloat32x4(src, dest);
+        shufps(mask, dest, dest); // Both operands are dest, so all four result lanes are drawn from the copied input.
+    }
+
     void moveFloatAsDouble(Register src, FloatRegister dest) {
         movd(src, dest);
         cvtss2sd(dest, dest);
     }
     void loadFloatAsDouble(const Address &src, FloatRegister dest) {
         movss(src, dest);
         cvtss2sd(dest, dest);
     }