Bug 1108825: Implement SIMD.int32x4.mul in Odin; r=sunfish
authorBenjamin Bouvier <benj@benj.me>
Thu, 11 Dec 2014 12:10:35 +0100
changeset 219300 354a878fcc9f2703fcc3930a59e49611e411307d
parent 219299 e114f15ec1bd1316bf20009b22ea53d36cbf3863
child 219301 c2659bf5793d2093d573f47551125e28be94d34e
push id10368
push userkwierso@gmail.com
push dateFri, 12 Dec 2014 01:38:39 +0000
treeherderfx-team@5288b15d22de [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewerssunfish
bugs1108825
milestone37.0a1
Bug 1108825: Implement SIMD.int32x4.mul in Odin; r=sunfish
js/src/builtin/SIMD.h
js/src/jit-test/tests/asm.js/testSIMD.js
js/src/jit/LIR-Common.h
js/src/jit/Lowering.cpp
js/src/jit/Lowering.h
js/src/jit/MIR.h
js/src/jit/arm/Lowering-arm.cpp
js/src/jit/arm/Lowering-arm.h
js/src/jit/mips/Lowering-mips.cpp
js/src/jit/mips/Lowering-mips.h
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/BaseAssembler-x86-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
js/src/jit/shared/Lowering-shared.cpp
js/src/jit/shared/Lowering-shared.h
js/src/jit/shared/Lowering-x86-shared.cpp
js/src/jit/shared/Lowering-x86-shared.h
--- a/js/src/builtin/SIMD.h
+++ b/js/src/builtin/SIMD.h
@@ -135,25 +135,25 @@
     _(shiftRightLogicalByScalar)
 #define FOREACH_FLOAT32X4_SIMD_OP(_) \
     _(abs)                           \
     _(sqrt)                          \
     _(reciprocal)                    \
     _(reciprocalSqrt)                \
     _(fromInt32x4)                   \
     _(fromInt32x4Bits)               \
-    _(mul)                           \
     _(div)                           \
     _(max)                           \
     _(min)                           \
     _(maxNum)                        \
     _(minNum)
 #define FOREACH_COMMONX4_SIMD_OP(_)  \
     _(add)                           \
     _(sub)                           \
+    _(mul)                           \
     _(lessThan)                      \
     _(lessThanOrEqual)               \
     _(equal)                         \
     _(notEqual)                      \
     _(greaterThan)                   \
     _(greaterThanOrEqual)            \
     _(and)                           \
     _(or)                            \
--- a/js/src/jit-test/tests/asm.js/testSIMD.js
+++ b/js/src/jit-test/tests/asm.js/testSIMD.js
@@ -7,16 +7,17 @@ const DEBUG = false;
 if (!isSimdAvailable() || typeof SIMD === 'undefined') {
     DEBUG && print("won't run tests as simd extensions aren't activated yet");
     quit(0);
 }
 
 const I32 = 'var i4 = glob.SIMD.int32x4;'
 const I32A = 'var i4a = i4.add;'
 const I32S = 'var i4s = i4.sub;'
+const I32M = 'var i4m = i4.mul;'
 const F32 = 'var f4 = glob.SIMD.float32x4;'
 const F32A = 'var f4a = f4.add;'
 const F32S = 'var f4s = f4.sub;'
 const F32M = 'var f4m = f4.mul;'
 const F32D = 'var f4d = f4.div;'
 const FROUND = 'var f32=glob.Math.fround;'
 
 const INT32_MAX = Math.pow(2, 31) - 1;
@@ -448,19 +449,29 @@ CheckI4(I32S, 'var x=i4(' + INT32_MIN + 
 CheckI4(I32S, 'var x=i4(' + INT32_MIN + ',2,3,4); var y=i4(1,1,0,3); x=i4(i4s(x,y))', [INT32_MAX,1,3,1]);
 
 CheckF4(F32S, 'var x=f4(1,2,3,4); x=f4s(x,x)', [0,0,0,0]);
 CheckF4(F32S, 'var x=f4(1,2,3,4); var y=f4(4,3,5,2); x=f4s(x,y)', [-3,-1,-2,2]);
 CheckF4(F32S, 'var x=f4(13.37,2,3,4); var y=f4(4,3,5,2); x=f4s(x,y)', [Math.fround(13.37) - 4,-1,-2,2]);
 CheckF4(F32S, 'var x=f4(13.37,2,3,4); var y=f4(4,3,5,2); x=f4(f4s(x,y))', [Math.fround(13.37) - 4,-1,-2,2]);
 
 // 2.3.3. Multiplications / Divisions
-assertAsmTypeFail('glob', USE_ASM + I32 + "var f4m=i4.mul; function f() {} return f");
 assertAsmTypeFail('glob', USE_ASM + I32 + "var f4d=i4.div; function f() {} return f");
 
+CheckI4(I32M, 'var x=i4(1,2,3,4); var y=i4(-1,1,0,2); x=i4m(x,y)', [-1,2,0,8]);
+CheckI4(I32M, 'var x=i4(5,4,3,2); var y=i4(1,2,3,4); x=i4m(x,y)', [5,8,9,8]);
+CheckI4(I32M, 'var x=i4(1,2,3,4); x=i4m(x,x)', [1,4,9,16]);
+(function() {
+    var m = INT32_MIN, M = INT32_MAX, imul = Math.imul;
+    CheckI4(I32M, `var x=i4(${m},${m}, ${M}, ${M}); var y=i4(2,-3,4,-5); x=i4m(x,y)`,
+            [imul(m, 2), imul(m, -3), imul(M, 4), imul(M, -5)]);
+    CheckI4(I32M, `var x=i4(${m},${m}, ${M}, ${M}); var y=i4(${m}, ${M}, ${m}, ${M}); x=i4m(x,y)`,
+            [imul(m, m), imul(m, M), imul(M, m), imul(M, M)]);
+})();
+
 CheckF4(F32M, 'var x=f4(1,2,3,4); x=f4m(x,x)', [1,4,9,16]);
 CheckF4(F32M, 'var x=f4(1,2,3,4); var y=f4(4,3,5,2); x=f4m(x,y)', [4,6,15,8]);
 CheckF4(F32M, 'var x=f4(13.37,2,3,4); var y=f4(4,3,5,2); x=f4m(x,y)', [Math.fround(13.37) * 4,6,15,8]);
 CheckF4(F32M, 'var x=f4(13.37,2,3,4); var y=f4(4,3,5,2); x=f4(f4m(x,y))', [Math.fround(13.37) * 4,6,15,8]);
 
 var f32x4 = SIMD.float32x4(0, NaN, -0, NaN);
 var another = SIMD.float32x4(NaN, -1, -0, NaN);
 assertEqX4(asmLink(asmCompile('glob', USE_ASM + F32 + F32M + "function f(x, y) {x=f4(x); y=f4(y); x=f4m(x,y); return f4(x);} return f"), this)(f32x4, another), [NaN, NaN, 0, NaN]);
--- a/js/src/jit/LIR-Common.h
+++ b/js/src/jit/LIR-Common.h
@@ -361,55 +361,53 @@ class LSimdBinaryCompIx4 : public LSimdB
 class LSimdBinaryCompFx4 : public LSimdBinaryComp
 {
   public:
     LIR_HEADER(SimdBinaryCompFx4);
     LSimdBinaryCompFx4() : LSimdBinaryComp() {}
 };
 
 // Binary SIMD arithmetic operation between two SIMD operands
-template<size_t Temps>
-class LSimdBinaryArith : public LInstructionHelper<1, 2, Temps>
+class LSimdBinaryArith : public LInstructionHelper<1, 2, 1>
 {
   public:
     LSimdBinaryArith() {}
 
     const LAllocation *lhs() {
         return this->getOperand(0);
     }
     const LAllocation *rhs() {
         return this->getOperand(1);
     }
+    const LDefinition *temp() {
+        return getTemp(0);
+    }
 
     MSimdBinaryArith::Operation operation() const {
         return this->mir_->toSimdBinaryArith()->operation();
     }
     const char *extraName() const {
         return MSimdBinaryArith::OperationName(operation());
     }
 };
 
 // Binary SIMD arithmetic operation between two Int32x4 operands
-class LSimdBinaryArithIx4 : public LSimdBinaryArith<0>
+class LSimdBinaryArithIx4 : public LSimdBinaryArith
 {
   public:
     LIR_HEADER(SimdBinaryArithIx4);
-    LSimdBinaryArithIx4() : LSimdBinaryArith<0>() {}
+    LSimdBinaryArithIx4() : LSimdBinaryArith() {}
 };
 
 // Binary SIMD arithmetic operation between two Float32x4 operands
-class LSimdBinaryArithFx4 : public LSimdBinaryArith<1>
+class LSimdBinaryArithFx4 : public LSimdBinaryArith
 {
   public:
     LIR_HEADER(SimdBinaryArithFx4);
-    LSimdBinaryArithFx4() : LSimdBinaryArith<1>() {}
-
-    const LDefinition *temp() {
-        return getTemp(0);
-    }
+    LSimdBinaryArithFx4() : LSimdBinaryArith() {}
 };
 
 // Unary SIMD arithmetic operation on a SIMD operand
 class LSimdUnaryArith : public LInstructionHelper<1, 1, 0>
 {
   public:
     explicit LSimdUnaryArith(const LAllocation &in) {
         setOperand(0, in);
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@@ -626,73 +626,16 @@ ReorderComparison(JSOp op, MDefinition *
     if (lhs->isConstant()) {
         *rhsp = lhs;
         *lhsp = rhs;
         return ReverseCompareOp(op);
     }
     return op;
 }
 
-static bool
-ShouldReorderCommutative(MDefinition *lhs, MDefinition *rhs, MInstruction *ins)
-{
-    // lhs and rhs are used by the commutative operator.
-    MOZ_ASSERT(lhs->hasDefUses());
-    MOZ_ASSERT(rhs->hasDefUses());
-
-    // Ensure that if there is a constant, then it is in rhs.
-    if (rhs->isConstant())
-        return false;
-    if (lhs->isConstant())
-        return true;
-
-    // Since clobbering binary operations clobber the left operand, prefer a
-    // non-constant lhs operand with no further uses. To be fully precise, we
-    // should check whether this is the *last* use, but checking hasOneDefUse()
-    // is a decent approximation which doesn't require any extra analysis.
-    bool rhsSingleUse = rhs->hasOneDefUse();
-    bool lhsSingleUse = lhs->hasOneDefUse();
-    if (rhsSingleUse) {
-        if (!lhsSingleUse)
-            return true;
-    } else {
-        if (lhsSingleUse)
-            return false;
-    }
-
-    // If this is a reduction-style computation, such as
-    //
-    //   sum = 0;
-    //   for (...)
-    //      sum += ...;
-    //
-    // put the phi on the left to promote coalescing. This is fairly specific.
-    if (rhsSingleUse &&
-        rhs->isPhi() &&
-        rhs->block()->isLoopHeader() &&
-        ins == rhs->toPhi()->getLoopBackedgeOperand())
-    {
-        return true;
-    }
-
-    return false;
-}
-
-static void
-ReorderCommutative(MDefinition **lhsp, MDefinition **rhsp, MInstruction *ins)
-{
-    MDefinition *lhs = *lhsp;
-    MDefinition *rhs = *rhsp;
-
-    if (ShouldReorderCommutative(lhs, rhs, ins)) {
-        *rhsp = lhs;
-        *lhsp = rhs;
-    }
-}
-
 void
 LIRGenerator::visitTest(MTest *test)
 {
     MDefinition *opd = test->getOperand(0);
     MBasicBlock *ifTrue = test->ifTrue();
     MBasicBlock *ifFalse = test->ifFalse();
 
     // String is converted to length of string in the type analysis phase (see
@@ -4079,44 +4022,16 @@ LIRGenerator::visitSimdBinaryComp(MSimdB
         LSimdBinaryCompFx4 *add = new(alloc()) LSimdBinaryCompFx4();
         lowerForCompFx4(add, ins, ins->lhs(), ins->rhs());
     } else {
         MOZ_CRASH("Unknown compare type when comparing values");
     }
 }
 
 void
-LIRGenerator::visitSimdBinaryArith(MSimdBinaryArith *ins)
-{
-    MOZ_ASSERT(IsSimdType(ins->type()));
-
-    MDefinition *lhs = ins->lhs();
-    MDefinition *rhs = ins->rhs();
-
-    if (ins->isCommutative())
-        ReorderCommutative(&lhs, &rhs, ins);
-
-    if (ins->type() == MIRType_Int32x4) {
-        lowerForFPU(new(alloc()) LSimdBinaryArithIx4(), ins, lhs, rhs);
-        return;
-    }
-
-    MOZ_ASSERT(ins->type() == MIRType_Float32x4, "unknown simd type on binary arith operation");
-
-    LSimdBinaryArithFx4 *lir = new(alloc()) LSimdBinaryArithFx4();
-
-    bool needsTemp = ins->operation() == MSimdBinaryArith::Max ||
-                     ins->operation() == MSimdBinaryArith::MinNum ||
-                     ins->operation() == MSimdBinaryArith::MaxNum;
-    lir->setTemp(0, needsTemp ? temp(LDefinition::FLOAT32X4) : LDefinition::BogusTemp());
-
-    lowerForFPU(lir, ins, lhs, rhs);
-}
-
-void
 LIRGenerator::visitSimdBinaryBitwise(MSimdBinaryBitwise *ins)
 {
     MOZ_ASSERT(IsSimdType(ins->type()));
 
     MDefinition *lhs = ins->lhs();
     MDefinition *rhs = ins->rhs();
     ReorderCommutative(&lhs, &rhs, ins);
 
--- a/js/src/jit/Lowering.h
+++ b/js/src/jit/Lowering.h
@@ -278,17 +278,16 @@ class LIRGenerator : public LIRGenerator
     void visitMemoryBarrier(MMemoryBarrier *ins);
     void visitSimdExtractElement(MSimdExtractElement *ins);
     void visitSimdInsertElement(MSimdInsertElement *ins);
     void visitSimdSignMask(MSimdSignMask *ins);
     void visitSimdSwizzle(MSimdSwizzle *ins);
     void visitSimdShuffle(MSimdShuffle *ins);
     void visitSimdUnaryArith(MSimdUnaryArith *ins);
     void visitSimdBinaryComp(MSimdBinaryComp *ins);
-    void visitSimdBinaryArith(MSimdBinaryArith *ins);
     void visitSimdBinaryBitwise(MSimdBinaryBitwise *ins);
     void visitSimdShift(MSimdShift *ins);
     void visitSimdConstant(MSimdConstant *ins);
     void visitSimdConvert(MSimdConvert *ins);
     void visitSimdReinterpretCast(MSimdReinterpretCast *ins);
     void visitPhi(MPhi *ins);
     void visitBeta(MBeta *ins);
     void visitObjectState(MObjectState *ins);
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@@ -1878,17 +1878,17 @@ class MSimdBinaryArith : public MBinaryI
     }
 
   private:
     Operation operation_;
 
     MSimdBinaryArith(MDefinition *left, MDefinition *right, Operation op, MIRType type)
       : MBinaryInstruction(left, right), operation_(op)
     {
-        MOZ_ASSERT_IF(type == MIRType_Int32x4, op == Add || op == Sub);
+        MOZ_ASSERT_IF(type == MIRType_Int32x4, op == Add || op == Sub || op == Mul);
         MOZ_ASSERT(IsSimdType(type));
         MOZ_ASSERT(left->type() == right->type());
         MOZ_ASSERT(left->type() == type);
         setResultType(type);
         setMovable();
         if (op == Add || op == Mul || op == Min || op == Max)
             setCommutative();
     }
--- a/js/src/jit/arm/Lowering-arm.cpp
+++ b/js/src/jit/arm/Lowering-arm.cpp
@@ -552,16 +552,22 @@ LIRGeneratorARM::visitStoreTypedArrayEle
 
 void
 LIRGeneratorARM::visitForkJoinGetSlice(MForkJoinGetSlice *ins)
 {
     MOZ_CRASH("NYI");
 }
 
 void
+LIRGeneratorARM::visitSimdBinaryArith(MSimdBinaryArith *ins)
+{
+    MOZ_CRASH("NYI");
+}
+
+void
 LIRGeneratorARM::visitSimdTernaryBitwise(MSimdTernaryBitwise *ins)
 {
     MOZ_CRASH("NYI");
 }
 
 void
 LIRGeneratorARM::visitSimdSplatX4(MSimdSplatX4 *ins)
 {
--- a/js/src/jit/arm/Lowering-arm.h
+++ b/js/src/jit/arm/Lowering-arm.h
@@ -102,16 +102,17 @@ class LIRGeneratorARM : public LIRGenera
     void visitAsmJSUnsignedToFloat32(MAsmJSUnsignedToFloat32 *ins);
     void visitAsmJSLoadHeap(MAsmJSLoadHeap *ins);
     void visitAsmJSStoreHeap(MAsmJSStoreHeap *ins);
     void visitAsmJSLoadFuncPtr(MAsmJSLoadFuncPtr *ins);
     void visitAsmJSCompareExchangeHeap(MAsmJSCompareExchangeHeap *ins);
     void visitAsmJSAtomicBinopHeap(MAsmJSAtomicBinopHeap *ins);
     void visitStoreTypedArrayElementStatic(MStoreTypedArrayElementStatic *ins);
     void visitForkJoinGetSlice(MForkJoinGetSlice *ins);
+    void visitSimdBinaryArith(MSimdBinaryArith *ins);
     void visitSimdTernaryBitwise(MSimdTernaryBitwise *ins);
     void visitSimdSplatX4(MSimdSplatX4 *ins);
     void visitSimdValueX4(MSimdValueX4 *ins);
     void visitCompareExchangeTypedArrayElement(MCompareExchangeTypedArrayElement *ins);
     void visitAtomicTypedArrayElementBinop(MAtomicTypedArrayElementBinop *ins);
     void visitSubstr(MSubstr *ins);
 };
 
--- a/js/src/jit/mips/Lowering-mips.cpp
+++ b/js/src/jit/mips/Lowering-mips.cpp
@@ -542,16 +542,22 @@ LIRGeneratorMIPS::visitStoreTypedArrayEl
 
 void
 LIRGeneratorMIPS::visitForkJoinGetSlice(MForkJoinGetSlice *ins)
 {
     MOZ_CRASH("NYI");
 }
 
 void
+LIRGeneratorMIPS::visitSimdBinaryArith(MSimdBinaryArith *ins)
+{
+    MOZ_CRASH("NYI");
+}
+
+void
 LIRGeneratorMIPS::visitSimdTernaryBitwise(MSimdTernaryBitwise *ins)
 {
     MOZ_CRASH("NYI");
 }
 
 void
 LIRGeneratorMIPS::visitSimdSplatX4(MSimdSplatX4 *ins)
 {
--- a/js/src/jit/mips/Lowering-mips.h
+++ b/js/src/jit/mips/Lowering-mips.h
@@ -102,16 +102,17 @@ class LIRGeneratorMIPS : public LIRGener
     void visitAsmJSUnsignedToFloat32(MAsmJSUnsignedToFloat32 *ins);
     void visitAsmJSLoadHeap(MAsmJSLoadHeap *ins);
     void visitAsmJSStoreHeap(MAsmJSStoreHeap *ins);
     void visitAsmJSCompareExchangeHeap(MAsmJSCompareExchangeHeap *ins);
     void visitAsmJSAtomicBinopHeap(MAsmJSAtomicBinopHeap *ins);
     void visitAsmJSLoadFuncPtr(MAsmJSLoadFuncPtr *ins);
     void visitStoreTypedArrayElementStatic(MStoreTypedArrayElementStatic *ins);
     void visitForkJoinGetSlice(MForkJoinGetSlice *ins);
+    void visitSimdBinaryArith(MSimdBinaryArith *ins);
     void visitSimdTernaryBitwise(MSimdTernaryBitwise *ins);
     void visitSimdSplatX4(MSimdSplatX4 *ins);
     void visitSimdValueX4(MSimdValueX4 *ins);
     void visitCompareExchangeTypedArrayElement(MCompareExchangeTypedArrayElement *ins);
     void visitAtomicTypedArrayElementBinop(MAtomicTypedArrayElementBinop *ins);
     void visitSubstr(MSubstr *ins);
 };
 
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -597,16 +597,19 @@ class AssemblerX86Shared : public Assemb
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void movdqa(const Operand &src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src.kind()) {
+          case Operand::FPREG:
+            masm.movdqa_rr(src.fpu(), dest.code());
+            break;
           case Operand::MEM_REG_DISP:
             masm.movdqa_mr(src.disp(), src.base(), dest.code());
             break;
           case Operand::MEM_SCALE:
             masm.movdqa_mr(src.disp(), src.base(), src.index(), src.scale(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
@@ -1807,16 +1810,36 @@ class AssemblerX86Shared : public Assemb
             break;
           case Operand::MEM_ADDRESS32:
             masm.psubd_mr(src.address(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
+    void pmuludq(FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.pmuludq_rr(src.code(), dest.code());
+    }
+    void pmulld(const Operand &src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE41());
+        switch (src.kind()) {
+          case Operand::FPREG:
+            masm.pmulld_rr(src.fpu(), dest.code());
+            break;
+          case Operand::MEM_REG_DISP:
+            masm.pmulld_mr(src.disp(), src.base(), dest.code());
+            break;
+          case Operand::MEM_ADDRESS32:
+            masm.pmulld_mr(src.address(), dest.code());
+            break;
+          default:
+            MOZ_CRASH("unexpected operand kind");
+        }
+    }
     void vaddps(const Operand &src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src1.kind()) {
           case Operand::FPREG:
             masm.vaddps_rr(src1.fpu(), src0.code(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
             masm.vaddps_mr(src1.disp(), src1.base(), src0.code(), dest.code());
@@ -1976,16 +1999,32 @@ class AssemblerX86Shared : public Assemb
     void pxor(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.pxor_rr(src.code(), dest.code());
     }
     void pshufd(uint32_t mask, FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.pshufd_irr(mask, src.code(), dest.code());
     }
+    void pshufd(uint32_t mask, const Operand &src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        switch (src.kind()) {
+          case Operand::FPREG:
+            masm.pshufd_irr(mask, src.fpu(), dest.code());
+            break;
+          case Operand::MEM_REG_DISP:
+            masm.pshufd_imr(mask, src.disp(), src.base(), dest.code());
+            break;
+          case Operand::MEM_ADDRESS32:
+            masm.pshufd_imr(mask, src.address(), dest.code());
+            break;
+          default:
+            MOZ_CRASH("unexpected operand kind");
+        }
+    }
     void movhlps(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.movhlps_rr(src.code(), dest.code());
     }
     void movlhps(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.movlhps_rr(src.code(), dest.code());
     }
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@@ -382,34 +382,37 @@ private:
         OP2_XADD_EvGv       = 0xC1,
         OP2_CMPPS_VpsWps    = 0xC2,
         OP2_PEXTRW_GdUdIb   = 0xC5,
         OP2_SHUFPS_VpsWpsIb = 0xC6,
         OP2_PSRLD_VdqWdq    = 0xD2,
         OP2_PSRAD_VdqWdq    = 0xE2,
         OP2_PXORDQ_VdqWdq   = 0xEF,
         OP2_PSLLD_VdqWdq    = 0xF2,
+        OP2_PMULUDQ_VdqWdq  = 0xF4,
         OP2_PSUBD_VdqWdq    = 0xFA,
         OP2_PADDD_VdqWdq    = 0xFE
     } TwoByteOpcodeID;
 
     typedef enum {
         OP3_ROUNDSS_VsdWsd  = 0x0A,
         OP3_ROUNDSD_VsdWsd  = 0x0B,
         OP3_BLENDVPS_VdqWdq = 0x14,
         OP3_PEXTRD_EdVdqIb  = 0x16,
         OP3_BLENDPS_VpsWpsIb = 0x0C,
         OP3_PTEST_VdVd      = 0x17,
         OP3_INSERTPS_VpsUps = 0x21,
         OP3_PINSRD_VdqEdIb  = 0x22,
+        OP3_PMULLD_VdqWdq   = 0x40,
         OP3_VBLENDVPS_VdqWdq = 0x4A
     } ThreeByteOpcodeID;
 
     typedef enum {
         ESCAPE_BLENDVPS     = 0x38,
+        ESCAPE_PMULLD       = 0x38,
         ESCAPE_PTEST        = 0x38,
         ESCAPE_PINSRD       = 0x3A,
         ESCAPE_PEXTRD       = 0x3A,
         ESCAPE_ROUNDSD      = 0x3A,
         ESCAPE_INSERTPS     = 0x3A,
         ESCAPE_BLENDPS      = 0x3A,
         ESCAPE_VBLENDVPS    = 0x3A
     } ThreeByteEscape;
@@ -797,16 +800,43 @@ public:
     }
     void psubd_mr(const void* address, XMMRegisterID dst)
     {
         spew("psubd      %p, %s", address, nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.twoByteOp(OP2_PSUBD_VdqWdq, address, (RegisterID)dst);
     }
 
+    void pmuludq_rr(XMMRegisterID src, XMMRegisterID dst)
+    {
+        spew("pmuludq     %s, %s", nameFPReg(src), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.twoByteOp(OP2_PMULUDQ_VdqWdq, (RegisterID)src, (RegisterID)dst);
+    }
+
+    void pmulld_rr(XMMRegisterID src, XMMRegisterID dst)
+    {
+        spew("pmulld      %s, %s", nameFPReg(src), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_PMULLD_VdqWdq, ESCAPE_PMULLD, (RegisterID)src, (RegisterID)dst);
+    }
+    void pmulld_mr(int offset, RegisterID base, XMMRegisterID dst)
+    {
+        spew("pmulld      %s0x%x(%s), %s",
+             PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_PMULLD_VdqWdq, ESCAPE_PMULLD, offset, base, (RegisterID)dst);
+    }
+    void pmulld_mr(const void* address, XMMRegisterID dst)
+    {
+        spew("pmulld      %p, %s", address, nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_PMULLD_VdqWdq, ESCAPE_PMULLD, address, (RegisterID)dst);
+    }
+
     void vaddps_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vaddps", VEX_PS, OP2_ADDPS_VpsWps, src1, src0, dst);
     }
     void vaddps_mr(int offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vaddps", VEX_PS, OP2_ADDPS_VpsWps, offset, base, src0, dst);
     }
@@ -2936,16 +2966,34 @@ public:
     {
         MOZ_ASSERT(mask < 256);
         spew("pshufd     0x%x, %s, %s", mask, nameFPReg(src), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, (RegisterID)src, (RegisterID)dst);
         m_formatter.immediate8(uint8_t(mask));
     }
 
+    void pshufd_imr(uint32_t mask, int offset, RegisterID base, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(mask < 256);
+        spew("pshufd     0x%x, %s0x%x(%s), %s",
+             mask, PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, offset, base, (RegisterID)dst);
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
+    void pshufd_imr(uint32_t mask, const void* address, XMMRegisterID dst)
+    {
+        spew("pshufd     0x%x, %p, %s", mask, address, nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, address, (RegisterID)dst);
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
     void shufps_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
     {
         MOZ_ASSERT(mask < 256);
         spew("shufps     0x%x, %s, %s", mask, nameFPReg(src), nameFPReg(dst));
         m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, (RegisterID)src, (RegisterID)dst);
         m_formatter.immediate8(uint8_t(mask));
     }
 
@@ -2956,17 +3004,16 @@ public:
              mask, PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
         m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, offset, base, (RegisterID)dst);
         m_formatter.immediate8(uint8_t(mask));
     }
 
     void shufps_imr(uint32_t mask, const void* address, XMMRegisterID dst)
     {
         spew("shufps     %x, %p, %s", mask, address, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F3);
         m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, address, (RegisterID)dst);
         m_formatter.immediate8(uint8_t(mask));
     }
 
     void movhlps_rr(XMMRegisterID src, XMMRegisterID dst)
     {
         spew("movhlps    %s, %s", nameFPReg(src), nameFPReg(dst));
         m_formatter.twoByteOp(OP2_MOVHLPS_VqUq, (RegisterID)src, (RegisterID)dst);
@@ -4735,16 +4782,26 @@ private:
               case 0x38: m = 2; break; // 0x0F 0x38
               case 0x3A: m = 3; break; // 0x0F 0x3A
               default: MOZ_CRASH("unexpected escape");
             }
             threeOpVex(ty, r, x, b, m, w, v, l, opcode);
             memoryModRM(offset, base, reg);
         }
 
+        void threeByteOp(ThreeByteOpcodeID opcode, ThreeByteEscape escape, const void* address, int reg)
+        {
+            m_buffer.ensureSpace(maxInstructionSize);
+            emitRexIfNeeded(reg, 0, 0);
+            m_buffer.putByteUnchecked(OP_2BYTE_ESCAPE);
+            m_buffer.putByteUnchecked(escape);
+            m_buffer.putByteUnchecked(opcode);
+            memoryModRM(address, reg);
+        }
+
         void vblendvOpVex(VexOperandType ty, ThreeByteOpcodeID opcode, ThreeByteEscape escape,
                           XMMRegisterID mask, RegisterID rm, XMMRegisterID src0, int reg)
         {
             int r = (reg >> 3), x = 0, b = (rm >> 3);
             int m = 0, w = 0, v = src0, l = 0;
             switch (escape) {
               case 0x38: m = 2; break; // 0x0F 0x38
               case 0x3A: m = 3; break; // 0x0F 0x3A
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -2621,19 +2621,37 @@ CodeGeneratorX86Shared::visitSimdBinaryA
     MSimdBinaryArith::Operation op = ins->operation();
     switch (op) {
       case MSimdBinaryArith::Add:
         masm.packedAddInt32(rhs, lhs);
         return;
       case MSimdBinaryArith::Sub:
         masm.packedSubInt32(rhs, lhs);
         return;
-      case MSimdBinaryArith::Mul:
-        // we can do mul with a single instruction only if we have SSE4.1
-        // using the PMULLD instruction.
+      case MSimdBinaryArith::Mul: {
+        if (AssemblerX86Shared::HasSSE41()) {
+            masm.pmulld(rhs, lhs);
+            return;
+        }
+
+        masm.loadAlignedInt32x4(rhs, ScratchSimdReg);
+        masm.pmuludq(lhs, ScratchSimdReg);
+        // ScratchSimdReg contains (Rx, _, Rz, _) where R is the resulting vector.
+
+        FloatRegister temp = ToFloatRegister(ins->temp());
+        masm.pshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), lhs, lhs);
+        masm.pshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), rhs, temp);
+        masm.pmuludq(temp, lhs);
+        // lhs contains (Ry, _, Rw, _) where R is the resulting vector.
+
+        masm.shufps(MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, LaneX, LaneZ), ScratchSimdReg, lhs);
+        // lhs contains (Ry, Rw, Rx, Rz)
+        masm.shufps(MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, LaneW, LaneY), lhs, lhs);
+        return;
+      }
       case MSimdBinaryArith::Div:
         // x86 doesn't have SIMD i32 div.
         break;
       case MSimdBinaryArith::Max:
         // we can do max with a single instruction only if we have SSE4.1
         // using the PMAXSD instruction.
         break;
       case MSimdBinaryArith::Min:
--- a/js/src/jit/shared/Lowering-shared.cpp
+++ b/js/src/jit/shared/Lowering-shared.cpp
@@ -9,16 +9,73 @@
 #include "jit/LIR.h"
 #include "jit/MIR.h"
 
 #include "vm/Symbol.h"
 
 using namespace js;
 using namespace jit;
 
+bool
+LIRGeneratorShared::ShouldReorderCommutative(MDefinition *lhs, MDefinition *rhs, MInstruction *ins)
+{
+    // lhs and rhs are used by the commutative operator.
+    MOZ_ASSERT(lhs->hasDefUses());
+    MOZ_ASSERT(rhs->hasDefUses());
+
+    // Ensure that if there is a constant, then it is in rhs.
+    if (rhs->isConstant())
+        return false;
+    if (lhs->isConstant())
+        return true;
+
+    // Since clobbering binary operations clobber the left operand, prefer a
+    // non-constant lhs operand with no further uses. To be fully precise, we
+    // should check whether this is the *last* use, but checking hasOneDefUse()
+    // is a decent approximation which doesn't require any extra analysis.
+    bool rhsSingleUse = rhs->hasOneDefUse();
+    bool lhsSingleUse = lhs->hasOneDefUse();
+    if (rhsSingleUse) {
+        if (!lhsSingleUse)
+            return true;
+    } else {
+        if (lhsSingleUse)
+            return false;
+    }
+
+    // If this is a reduction-style computation, such as
+    //
+    //   sum = 0;
+    //   for (...)
+    //      sum += ...;
+    //
+    // put the phi on the left to promote coalescing. This is fairly specific.
+    if (rhsSingleUse &&
+        rhs->isPhi() &&
+        rhs->block()->isLoopHeader() &&
+        ins == rhs->toPhi()->getLoopBackedgeOperand())
+    {
+        return true;
+    }
+
+    return false;
+}
+
+void
+LIRGeneratorShared::ReorderCommutative(MDefinition **lhsp, MDefinition **rhsp, MInstruction *ins)
+{
+    MDefinition *lhs = *lhsp;
+    MDefinition *rhs = *rhsp;
+
+    if (ShouldReorderCommutative(lhs, rhs, ins)) {
+        *rhsp = lhs;
+        *lhsp = rhs;
+    }
+}
+
 void
 LIRGeneratorShared::visitConstant(MConstant *ins)
 {
     const Value &v = ins->value();
     switch (ins->type()) {
       case MIRType_Boolean:
         define(new(alloc()) LInteger(v.toBoolean()), ins);
         break;
--- a/js/src/jit/shared/Lowering-shared.h
+++ b/js/src/jit/shared/Lowering-shared.h
@@ -45,16 +45,20 @@ class LIRGeneratorShared : public MDefin
         osiPoint_(nullptr)
     { }
 
     MIRGenerator *mir() {
         return gen;
     }
 
   protected:
+
+    static void ReorderCommutative(MDefinition **lhsp, MDefinition **rhsp, MInstruction *ins);
+    static bool ShouldReorderCommutative(MDefinition *lhs, MDefinition *rhs, MInstruction *ins);
+
     // A backend can decide that an instruction should be emitted at its uses,
     // rather than at its definition. To communicate this, set the
     // instruction's virtual register set to 0. When using the instruction,
     // its virtual register is temporarily reassigned. To know to clear it
     // after constructing the use information, the worklist bit is temporarily
     // unset.
     //
     // The backend can use the worklist bit to determine whether or not a
--- a/js/src/jit/shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/shared/Lowering-x86-shared.cpp
@@ -651,16 +651,47 @@ LIRGeneratorX86Shared::visitAsmJSAtomicB
 
     LAsmJSAtomicBinopHeap *lir =
         new(alloc()) LAsmJSAtomicBinopHeap(useRegister(ptr), value, tempDef);
 
     defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
 }
 
 void
+LIRGeneratorX86Shared::visitSimdBinaryArith(MSimdBinaryArith *ins)
+{
+    MOZ_ASSERT(IsSimdType(ins->type()));
+
+    MDefinition *lhs = ins->lhs();
+    MDefinition *rhs = ins->rhs();
+
+    if (ins->isCommutative())
+        ReorderCommutative(&lhs, &rhs, ins);
+
+    if (ins->type() == MIRType_Int32x4) {
+        LSimdBinaryArithIx4 *lir = new(alloc()) LSimdBinaryArithIx4();
+        bool needsTemp = ins->operation() == MSimdBinaryArith::Mul && !MacroAssembler::HasSSE41();
+        lir->setTemp(0, needsTemp ? temp(LDefinition::INT32X4) : LDefinition::BogusTemp());
+        lowerForFPU(lir, ins, lhs, rhs);
+        return;
+    }
+
+    MOZ_ASSERT(ins->type() == MIRType_Float32x4, "unknown simd type on binary arith operation");
+
+    LSimdBinaryArithFx4 *lir = new(alloc()) LSimdBinaryArithFx4();
+
+    bool needsTemp = ins->operation() == MSimdBinaryArith::Max ||
+                     ins->operation() == MSimdBinaryArith::MinNum ||
+                     ins->operation() == MSimdBinaryArith::MaxNum;
+    lir->setTemp(0, needsTemp ? temp(LDefinition::FLOAT32X4) : LDefinition::BogusTemp());
+
+    lowerForFPU(lir, ins, lhs, rhs);
+}
+
+void
 LIRGeneratorX86Shared::visitSimdTernaryBitwise(MSimdTernaryBitwise *ins)
 {
     MOZ_ASSERT(IsSimdType(ins->type()));
 
     if (ins->type() == MIRType_Int32x4 || ins->type() == MIRType_Float32x4) {
         LSimdSelect *lins = new(alloc()) LSimdSelect;
 
         // This must be useRegisterAtStart() because it is destroyed.
--- a/js/src/jit/shared/Lowering-x86-shared.h
+++ b/js/src/jit/shared/Lowering-x86-shared.h
@@ -48,16 +48,17 @@ class LIRGeneratorX86Shared : public LIR
     void lowerUDiv(MDiv *div);
     void lowerUMod(MMod *mod);
     void lowerUrshD(MUrsh *mir);
     void lowerConstantDouble(double d, MInstruction *ins);
     void lowerConstantFloat32(float d, MInstruction *ins);
     void lowerTruncateDToInt32(MTruncateToInt32 *ins);
     void lowerTruncateFToInt32(MTruncateToInt32 *ins);
     void visitForkJoinGetSlice(MForkJoinGetSlice *ins);
+    void visitSimdBinaryArith(MSimdBinaryArith *ins);
     void visitSimdTernaryBitwise(MSimdTernaryBitwise *ins);
     void visitSimdSplatX4(MSimdSplatX4 *ins);
     void visitSimdValueX4(MSimdValueX4 *ins);
     void visitCompareExchangeTypedArrayElement(MCompareExchangeTypedArrayElement *ins);
     void visitAtomicTypedArrayElementBinop(MAtomicTypedArrayElementBinop *ins);
     void visitAsmJSCompareExchangeHeap(MAsmJSCompareExchangeHeap *ins);
     void visitAsmJSAtomicBinopHeap(MAsmJSAtomicBinopHeap *ins);
 };