Bug 1135042: Optimize SIMD.storeX/storeXY/storeXYZ in Ion; r=bhackett
author Benjamin Bouvier <benj@benj.me>
Thu, 19 Mar 2015 13:50:56 +0100
changeset 234711 44aa05cc400f82eabbb9a9e3880dda178eca5633
parent 234710 266fac2f7b2a571959cd0684d8799e9a4f9cff50
child 234712 38c35f7b2b7d16c97f3ecacf6f6f27a42612a897
push id 57227
push user benj@benj.me
push date Fri, 20 Mar 2015 18:46:08 +0000
treeherder mozilla-inbound@44aa05cc400f [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers bhackett
bugs 1135042
milestone 39.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1135042: Optimize SIMD.storeX/storeXY/storeXYZ in Ion; r=bhackett
js/src/builtin/SIMD.h
js/src/jit-test/tests/SIMD/store.js
js/src/jit/CodeGenerator.cpp
js/src/jit/IonBuilder.h
js/src/jit/MCallOptimize.cpp
js/src/jit/MIR.h
js/src/jit/MacroAssembler.cpp
js/src/jit/MacroAssembler.h
js/src/jit/arm/MacroAssembler-arm.h
js/src/jit/mips/MacroAssembler-mips.h
js/src/jit/none/MacroAssembler-none.h
js/src/jit/shared/MacroAssembler-x86-shared.h
--- a/js/src/builtin/SIMD.h
+++ b/js/src/builtin/SIMD.h
@@ -243,23 +243,23 @@
     _(neg)                           \
     _(swizzle)                       \
     _(shuffle)                       \
     _(load)                          \
     _(loadX)                         \
     _(loadXY)                        \
     _(loadXYZ)                       \
     _(store)                         \
+    _(storeX)                        \
+    _(storeXY)                       \
+    _(storeXYZ)                      \
     _(check)
 #define FOREACH_COMMONX4_SIMD_OP(_)  \
     ION_COMMONX4_SIMD_OP(_)          \
-    COMP_COMMONX4_TO_INT32X4_SIMD_OP(_) \
-    _(storeX)                        \
-    _(storeXY)                       \
-    _(storeXYZ)
+    COMP_COMMONX4_TO_INT32X4_SIMD_OP(_)
 #define FORALL_SIMD_OP(_)            \
     FOREACH_INT32X4_SIMD_OP(_)       \
     FOREACH_FLOAT32X4_SIMD_OP(_)     \
     FOREACH_COMMONX4_SIMD_OP(_)
 
 namespace js {
 
 class SIMDObject : public JSObject
--- a/js/src/jit-test/tests/SIMD/store.js
+++ b/js/src/jit-test/tests/SIMD/store.js
@@ -12,45 +12,109 @@ function f() {
     var u32 = new Uint32Array(f32.buffer);
     var i16 = new Int16Array(f32.buffer);
     var u16 = new Uint16Array(f32.buffer);
     var i8  = new Int8Array(f32.buffer);
     var u8  = new Uint8Array(f32.buffer);
 
     var f4 = SIMD.float32x4(42, 43, 44, 45);
 
-    function check() {
+    function check(n) {
         assertEq(f32[0], 42);
-        assertEq(f32[1], 43);
-        assertEq(f32[2], 44);
-        assertEq(f32[3], 45);
+        assertEq(f32[1], n > 1 ? 43 : 2);
+        assertEq(f32[2], n > 2 ? 44 : 3);
+        assertEq(f32[3], n > 3 ? 45 : 4);
 
         f32[0] = 1;
         f32[1] = 2;
         f32[2] = 3;
         f32[3] = 4;
     }
 
-    for (var i = 0; i < 150; i++) {
+    function testStore() {
         SIMD.float32x4.store(f64, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(f32, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(i32, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(u32, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(i16, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(u16, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(i8, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(u8, 0, f4);
-        check();
+        check(4);
+    }
+
+    function testStoreX() {
+        SIMD.float32x4.storeX(f64, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(f32, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(i32, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(u32, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(i16, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(u16, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(i8, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(u8, 0, f4);
+        check(1);
+    }
+
+    function testStoreXY() {
+        SIMD.float32x4.storeXY(f64, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(f32, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(i32, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(u32, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(i16, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(u16, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(i8, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(u8, 0, f4);
+        check(2);
+    }
+
+    function testStoreXYZ() {
+        SIMD.float32x4.storeXYZ(f64, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(f32, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(i32, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(u32, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(i16, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(u16, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(i8, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(u8, 0, f4);
+        check(3);
+    }
+
+    for (var i = 0; i < 150; i++) {
+        testStore();
+        testStoreX();
+        testStoreXY();
+        testStoreXYZ();
     }
 }
 
 f();
 
 function testBailout(uglyDuckling) {
     var f32 = new Float32Array(16);
     for (var i = 0; i < 16; i++)
--- a/js/src/jit/CodeGenerator.cpp
+++ b/js/src/jit/CodeGenerator.cpp
@@ -8754,47 +8754,52 @@ CodeGenerator::visitLoadTypedArrayElemen
     if (fail.used())
         bailoutFrom(&fail, lir->snapshot());
 
     masm.bind(&done);
 }
 
 template <typename T>
 static inline void
-StoreToTypedArray(MacroAssembler &masm, Scalar::Type writeType, const LAllocation *value, const T &dest)
+StoreToTypedArray(MacroAssembler &masm, Scalar::Type writeType, const LAllocation *value,
+                  const T &dest, unsigned numElems = 0)
 {
     if (Scalar::isSimdType(writeType) ||
         writeType == Scalar::Float32 ||
         writeType == Scalar::Float64)
     {
-        masm.storeToTypedFloatArray(writeType, ToFloatRegister(value), dest);
+        masm.storeToTypedFloatArray(writeType, ToFloatRegister(value), dest, numElems);
     } else {
         if (value->isConstant())
             masm.storeToTypedIntArray(writeType, Imm32(ToInt32(value)), dest);
         else
             masm.storeToTypedIntArray(writeType, ToRegister(value), dest);
     }
 }
 
 void
 CodeGenerator::visitStoreUnboxedScalar(LStoreUnboxedScalar *lir)
 {
     Register elements = ToRegister(lir->elements());
     const LAllocation *value = lir->value();
 
-    Scalar::Type writeType = lir->mir()->writeType();
-    int width = Scalar::byteSize(lir->mir()->indexType());
+    const MStoreUnboxedScalar *mir = lir->mir();
+
+    Scalar::Type writeType = mir->writeType();
+    unsigned numElems = mir->numElems();
+
+    int width = Scalar::byteSize(mir->indexType());
 
     if (lir->index()->isConstant()) {
-        Address dest(elements, ToInt32(lir->index()) * width + lir->mir()->offsetAdjustment());
-        StoreToTypedArray(masm, writeType, value, dest);
+        Address dest(elements, ToInt32(lir->index()) * width + mir->offsetAdjustment());
+        StoreToTypedArray(masm, writeType, value, dest, numElems);
     } else {
         BaseIndex dest(elements, ToRegister(lir->index()), ScaleFromElemWidth(width),
-                       lir->mir()->offsetAdjustment());
-        StoreToTypedArray(masm, writeType, value, dest);
+                       mir->offsetAdjustment());
+        StoreToTypedArray(masm, writeType, value, dest, numElems);
     }
 }
 
 void
 CodeGenerator::visitStoreTypedArrayElementHole(LStoreTypedArrayElementHole *lir)
 {
     Register elements = ToRegister(lir->elements());
     const LAllocation *value = lir->value();
--- a/js/src/jit/IonBuilder.h
+++ b/js/src/jit/IonBuilder.h
@@ -839,17 +839,18 @@ class IonBuilder
                                      SimdTypeDescr::Type from, SimdTypeDescr::Type to);
     InliningStatus inlineSimdSelect(CallInfo &callInfo, JSNative native, bool isElementWise,
                                     SimdTypeDescr::Type type);
 
     bool prepareForSimdLoadStore(CallInfo &callInfo, Scalar::Type simdType, MInstruction **elements,
                                  MDefinition **index, Scalar::Type *arrayType);
     InliningStatus inlineSimdLoad(CallInfo &callInfo, JSNative native, SimdTypeDescr::Type type,
                                   unsigned numElems);
-    InliningStatus inlineSimdStore(CallInfo &callInfo, JSNative native, SimdTypeDescr::Type type);
+    InliningStatus inlineSimdStore(CallInfo &callInfo, JSNative native, SimdTypeDescr::Type type,
+                                   unsigned numElems);
 
     // Utility intrinsics.
     InliningStatus inlineIsCallable(CallInfo &callInfo);
     InliningStatus inlineIsObject(CallInfo &callInfo);
     InliningStatus inlineToObject(CallInfo &callInfo);
     InliningStatus inlineToInteger(CallInfo &callInfo);
     InliningStatus inlineToString(CallInfo &callInfo);
     InliningStatus inlineDump(CallInfo &callInfo);
--- a/js/src/jit/MCallOptimize.cpp
+++ b/js/src/jit/MCallOptimize.cpp
@@ -373,19 +373,31 @@ IonBuilder::inlineNativeCall(CallInfo &c
     if (native == js::simd_float32x4_loadX)
         return inlineSimdLoad(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 1);
     if (native == js::simd_float32x4_loadXY)
         return inlineSimdLoad(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 2);
     if (native == js::simd_float32x4_loadXYZ)
         return inlineSimdLoad(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 3);
 
     if (native == js::simd_int32x4_store)
-        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_INT32);
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_INT32, 4);
+    if (native == js::simd_int32x4_storeX)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_INT32, 1);
+    if (native == js::simd_int32x4_storeXY)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_INT32, 2);
+    if (native == js::simd_int32x4_storeXYZ)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_INT32, 3);
     if (native == js::simd_float32x4_store)
-        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_FLOAT32);
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 4);
+    if (native == js::simd_float32x4_storeX)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 1);
+    if (native == js::simd_float32x4_storeXY)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 2);
+    if (native == js::simd_float32x4_storeXYZ)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 3);
 
     return InliningStatus_NotInlined;
 }
 
 IonBuilder::InliningStatus
 IonBuilder::inlineNativeGetter(CallInfo &callInfo, JSFunction *target)
 {
     MOZ_ASSERT(target->isNative());
@@ -3225,34 +3237,35 @@ IonBuilder::inlineSimdLoad(CallInfo &cal
     MLoadUnboxedScalar *load = MLoadUnboxedScalar::New(alloc(), elements, index, arrayType);
     load->setResultType(SimdTypeDescrToMIRType(type));
     load->setSimdRead(simdType, numElems);
 
     return boxSimd(callInfo, load, templateObj);
 }
 
 IonBuilder::InliningStatus
-IonBuilder::inlineSimdStore(CallInfo &callInfo, JSNative native, SimdTypeDescr::Type type)
+IonBuilder::inlineSimdStore(CallInfo &callInfo, JSNative native, SimdTypeDescr::Type type,
+                            unsigned numElems)
 {
     InlineTypedObject *templateObj = nullptr;
     if (!checkInlineSimd(callInfo, native, type, 3, &templateObj))
         return InliningStatus_NotInlined;
 
     Scalar::Type simdType = SimdTypeToScalarType(type);
 
     MDefinition *index = nullptr;
     MInstruction *elements = nullptr;
     Scalar::Type arrayType;
     if (!prepareForSimdLoadStore(callInfo, simdType, &elements, &index, &arrayType))
         return InliningStatus_NotInlined;
 
     MDefinition *valueToWrite = callInfo.getArg(2);
     MStoreUnboxedScalar *store = MStoreUnboxedScalar::New(alloc(), elements, index,
                                                           valueToWrite, arrayType);
-    store->setWriteType(simdType);
+    store->setSimdWrite(simdType, numElems);
 
     current->add(store);
     current->push(valueToWrite);
 
     callInfo.setImplicitlyUsedUnchecked();
 
     if (!resumeAfter(store))
         return InliningStatus_Error;
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@@ -9142,25 +9142,27 @@ class StoreUnboxedScalarBase
 class MStoreUnboxedScalar
   : public MTernaryInstruction,
     public StoreUnboxedScalarBase,
     public StoreUnboxedScalarPolicy::Data
 {
     Scalar::Type indexType_;
     bool requiresBarrier_;
     int32_t offsetAdjustment_;
+    unsigned numElems_; // used only for SIMD
 
     MStoreUnboxedScalar(MDefinition *elements, MDefinition *index, MDefinition *value,
                         Scalar::Type indexType, MemoryBarrierRequirement requiresBarrier,
                         int32_t offsetAdjustment)
       : MTernaryInstruction(elements, index, value),
         StoreUnboxedScalarBase(indexType),
         indexType_(indexType),
         requiresBarrier_(requiresBarrier == DoesRequireMemoryBarrier),
-        offsetAdjustment_(offsetAdjustment)
+        offsetAdjustment_(offsetAdjustment),
+        numElems_(1)
     {
         if (requiresBarrier_)
             setGuard();         // Not removable or movable
         else
             setMovable();
         MOZ_ASSERT(IsValidElementsType(elements, offsetAdjustment));
         MOZ_ASSERT(index->type() == MIRType_Int32);
         MOZ_ASSERT(indexType >= 0 && indexType < Scalar::MaxTypedArrayViewType);
@@ -9175,16 +9177,24 @@ class MStoreUnboxedScalar
                                     MemoryBarrierRequirement requiresBarrier =
                                         DoesNotRequireMemoryBarrier,
                                     int32_t offsetAdjustment = 0)
     {
         return new(alloc) MStoreUnboxedScalar(elements, index, value, indexType,
                                               requiresBarrier, offsetAdjustment);
     }
 
+    void setSimdWrite(Scalar::Type writeType, unsigned numElems) {
+        MOZ_ASSERT(Scalar::isSimdType(writeType));
+        setWriteType(writeType);
+        numElems_ = numElems;
+    }
+    unsigned numElems() const {
+        return numElems_;
+    }
     Scalar::Type indexType() const {
         return indexType_;
     }
     MDefinition *elements() const {
         return getOperand(0);
     }
     MDefinition *index() const {
         return getOperand(1);
--- a/js/src/jit/MacroAssembler.cpp
+++ b/js/src/jit/MacroAssembler.cpp
@@ -275,51 +275,80 @@ template void MacroAssembler::guardObjec
 
 template void MacroAssembler::guardType(const Address &address, TypeSet::Type type,
                                         Register scratch, Label *miss);
 template void MacroAssembler::guardType(const ValueOperand &value, TypeSet::Type type,
                                         Register scratch, Label *miss);
 
 template<typename S, typename T>
 static void
-StoreToTypedFloatArray(MacroAssembler &masm, int arrayType, const S &value, const T &dest)
+StoreToTypedFloatArray(MacroAssembler &masm, int arrayType, const S &value, const T &dest,
+                       unsigned numElems)
 {
     switch (arrayType) {
       case Scalar::Float32:
         masm.storeFloat32(value, dest);
         break;
       case Scalar::Float64:
 #ifdef JS_MORE_DETERMINISTIC
         // See the comment in TypedArrayObjectTemplate::doubleToNative.
         masm.canonicalizeDouble(value);
 #endif
         masm.storeDouble(value, dest);
         break;
       case Scalar::Float32x4:
-        masm.storeUnalignedFloat32x4(value, dest);
+        switch (numElems) {
+          case 1:
+            masm.storeFloat32(value, dest);
+            break;
+          case 2:
+            masm.storeDouble(value, dest);
+            break;
+          case 3:
+            masm.storeFloat32x3(value, dest);
+            break;
+          case 4:
+            masm.storeUnalignedFloat32x4(value, dest);
+            break;
+          default: MOZ_CRASH("unexpected number of elements in simd write");
+        }
         break;
       case Scalar::Int32x4:
-        masm.storeUnalignedInt32x4(value, dest);
+        switch (numElems) {
+          case 1:
+            masm.storeInt32x1(value, dest);
+            break;
+          case 2:
+            masm.storeInt32x2(value, dest);
+            break;
+          case 3:
+            masm.storeInt32x3(value, dest);
+            break;
+          case 4:
+            masm.storeUnalignedInt32x4(value, dest);
+            break;
+          default: MOZ_CRASH("unexpected number of elements in simd write");
+        }
         break;
       default:
         MOZ_CRASH("Invalid typed array type");
     }
 }
 
 void
 MacroAssembler::storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value,
-                                       const BaseIndex &dest)
+                                       const BaseIndex &dest, unsigned numElems)
 {
-    StoreToTypedFloatArray(*this, arrayType, value, dest);
+    StoreToTypedFloatArray(*this, arrayType, value, dest, numElems);
 }
 void
 MacroAssembler::storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value,
-                                       const Address &dest)
+                                       const Address &dest, unsigned numElems)
 {
-    StoreToTypedFloatArray(*this, arrayType, value, dest);
+    StoreToTypedFloatArray(*this, arrayType, value, dest, numElems);
 }
 
 template<typename T>
 void
 MacroAssembler::loadFromTypedArray(Scalar::Type arrayType, const T &src, AnyRegister dest, Register temp,
                                    Label *fail, bool canonicalizeDoubles, unsigned numElems)
 {
     switch (arrayType) {
--- a/js/src/jit/MacroAssembler.h
+++ b/js/src/jit/MacroAssembler.h
@@ -752,18 +752,20 @@ class MacroAssembler : public MacroAssem
     template<typename S, typename T>
     void atomicBinopToTypedIntArray(AtomicOp op, Scalar::Type arrayType, const S &value,
                                     const T &mem, Register temp1, Register temp2, AnyRegister output);
 
     // Generating no result.
     template<typename S, typename T>
     void atomicBinopToTypedIntArray(AtomicOp op, Scalar::Type arrayType, const S &value, const T &mem);
 
-    void storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value, const BaseIndex &dest);
-    void storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value, const Address &dest);
+    void storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value, const BaseIndex &dest,
+                                unsigned numElems = 0);
+    void storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value, const Address &dest,
+                                unsigned numElems = 0);
 
     // Load a property from an UnboxedPlainObject.
     template <typename T>
     void loadUnboxedProperty(T address, JSValueType type, TypedOrValueRegister output);
 
     // Store a property to an UnboxedPlainObject, without triggering barriers.
     // If failure is null, the value definitely has a type suitable for storing
     // in the property.
--- a/js/src/jit/arm/MacroAssembler-arm.h
+++ b/js/src/jit/arm/MacroAssembler-arm.h
@@ -1396,25 +1396,33 @@ class MacroAssemblerARMCompat : public M
     void loadPrivate(const Address &address, Register dest);
 
     void loadInt32x1(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x1(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x2(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x2(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x3(const Address &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x3(const BaseIndex &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x1(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x1(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x2(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x2(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x3(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x3(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
     void loadAlignedInt32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeAlignedInt32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void loadUnalignedInt32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadUnalignedInt32x4(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeUnalignedInt32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void storeUnalignedInt32x4(FloatRegister src, BaseIndex addr) { MOZ_CRASH("NYI"); }
 
     void loadFloat32x3(const Address &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadFloat32x3(const BaseIndex &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
+    void storeFloat32x3(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeFloat32x3(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
     void loadAlignedFloat32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeAlignedFloat32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void loadUnalignedFloat32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadUnalignedFloat32x4(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeUnalignedFloat32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void storeUnalignedFloat32x4(FloatRegister src, BaseIndex addr) { MOZ_CRASH("NYI"); }
 
     void loadDouble(const Address &addr, FloatRegister dest);
--- a/js/src/jit/mips/MacroAssembler-mips.h
+++ b/js/src/jit/mips/MacroAssembler-mips.h
@@ -1314,25 +1314,33 @@ public:
     void loadPrivate(const Address &address, Register dest);
 
     void loadInt32x1(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x1(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x2(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x2(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x3(const Address &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x3(const BaseIndex &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x1(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x1(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x2(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x2(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x3(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x3(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
     void loadAlignedInt32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeAlignedInt32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void loadUnalignedInt32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadUnalignedInt32x4(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeUnalignedInt32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void storeUnalignedInt32x4(FloatRegister src, BaseIndex addr) { MOZ_CRASH("NYI"); }
 
     void loadFloat32x3(const Address &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadFloat32x3(const BaseIndex &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
+    void storeFloat32x3(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeFloat32x3(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
     void loadAlignedFloat32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeAlignedFloat32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void loadUnalignedFloat32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadUnalignedFloat32x4(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeUnalignedFloat32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void storeUnalignedFloat32x4(FloatRegister src, BaseIndex addr) { MOZ_CRASH("NYI"); }
 
     void loadDouble(const Address &addr, FloatRegister dest);
--- a/js/src/jit/none/MacroAssembler-none.h
+++ b/js/src/jit/none/MacroAssembler-none.h
@@ -311,16 +311,20 @@ class MacroAssemblerNone : public Assemb
     template <typename T, typename S> void storeFloat32(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void storeDouble(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void storeAlignedInt32x4(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void storeUnalignedInt32x4(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void storeAlignedFloat32x4(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void storeUnalignedFloat32x4(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void store8(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void store16(T, S) { MOZ_CRASH(); }
+    template <typename T, typename S> void storeInt32x1(T, S) { MOZ_CRASH(); }
+    template <typename T, typename S> void storeInt32x2(T, S) { MOZ_CRASH(); }
+    template <typename T, typename S> void storeInt32x3(T, S) { MOZ_CRASH(); }
+    template <typename T, typename S> void storeFloat32x3(T, S) { MOZ_CRASH(); }
 
     template <typename T> void computeEffectiveAddress(T, Register) { MOZ_CRASH(); }
 
     template <typename T> void compareExchange8SignExtend(const T &mem, Register oldval, Register newval, Register output) { MOZ_CRASH(); }
     template <typename T> void compareExchange8ZeroExtend(const T &mem, Register oldval, Register newval, Register output) { MOZ_CRASH(); }
     template <typename T> void compareExchange16SignExtend(const T &mem, Register oldval, Register newval, Register output) { MOZ_CRASH(); }
     template <typename T> void compareExchange16ZeroExtend(const T &mem, Register oldval, Register newval, Register output) { MOZ_CRASH(); }
     template <typename T> void compareExchange32(const T &mem, Register oldval, Register newval, Register output) { MOZ_CRASH(); }
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@@ -1036,16 +1036,44 @@ class MacroAssemblerX86Shared : public A
         vmovdqu(Operand(src), dest);
     }
     void loadUnalignedInt32x4(const BaseIndex &src, FloatRegister dest) {
         vmovdqu(Operand(src), dest);
     }
     void loadUnalignedInt32x4(const Operand &src, FloatRegister dest) {
         vmovdqu(src, dest);
     }
+
+    void storeInt32x1(FloatRegister src, const Address &dest) {
+        vmovd(src, Operand(dest));
+    }
+    void storeInt32x1(FloatRegister src, const BaseIndex &dest) {
+        vmovd(src, Operand(dest));
+    }
+    void storeInt32x2(FloatRegister src, const Address &dest) {
+        vmovq(src, Operand(dest));
+    }
+    void storeInt32x2(FloatRegister src, const BaseIndex &dest) {
+        vmovq(src, Operand(dest));
+    }
+    void storeInt32x3(FloatRegister src, const Address &dest) {
+        Address destZ(dest);
+        destZ.offset += 2 * sizeof(int32_t);
+        vmovq(src, Operand(dest));
+        vmovhlps(src, ScratchSimdReg, ScratchSimdReg);
+        vmovd(ScratchSimdReg, Operand(destZ));
+    }
+    void storeInt32x3(FloatRegister src, const BaseIndex &dest) {
+        BaseIndex destZ(dest);
+        destZ.offset += 2 * sizeof(int32_t);
+        vmovq(src, Operand(dest));
+        vmovhlps(src, ScratchSimdReg, ScratchSimdReg);
+        vmovd(ScratchSimdReg, Operand(destZ));
+    }
+
     void storeUnalignedInt32x4(FloatRegister src, const Address &dest) {
         vmovdqu(src, Operand(dest));
     }
     void storeUnalignedInt32x4(FloatRegister src, const BaseIndex &dest) {
         vmovdqu(src, Operand(dest));
     }
     void storeUnalignedInt32x4(FloatRegister src, const Operand &dest) {
         vmovdqu(src, dest);
@@ -1111,16 +1139,31 @@ class MacroAssemblerX86Shared : public A
     }
 
     void loadAlignedFloat32x4(const Address &src, FloatRegister dest) {
         vmovaps(Operand(src), dest);
     }
     void loadAlignedFloat32x4(const Operand &src, FloatRegister dest) {
         vmovaps(src, dest);
     }
+
+    void storeFloat32x3(FloatRegister src, const Address &dest) {
+        Address destZ(dest);
+        destZ.offset += 2 * sizeof(int32_t);
+        storeDouble(src, dest);
+        vmovhlps(src, ScratchSimdReg, ScratchSimdReg);
+        storeFloat32(ScratchSimdReg, destZ);
+    }
+    void storeFloat32x3(FloatRegister src, const BaseIndex &dest) {
+        BaseIndex destZ(dest);
+        destZ.offset += 2 * sizeof(int32_t);
+        storeDouble(src, dest);
+        vmovhlps(src, ScratchSimdReg, ScratchSimdReg);
+        storeFloat32(ScratchSimdReg, destZ);
+    }
     void storeAlignedFloat32x4(FloatRegister src, const Address &dest) {
         vmovaps(src, Operand(dest));
     }
     void moveFloat32x4(FloatRegister src, FloatRegister dest) {
         vmovaps(src, dest);
     }
     FloatRegister reusedInputFloat32x4(FloatRegister src, FloatRegister dest) {
         if (HasAVX())