Bug 1135042: Optimize SIMD.storeX/storeXY/storeXYZ in Ion; r=bhackett
authorBenjamin Bouvier <benj@benj.me>
Thu, 19 Mar 2015 13:50:56 +0100
changeset 252089 44aa05cc400f82eabbb9a9e3880dda178eca5633
parent 252088 266fac2f7b2a571959cd0684d8799e9a4f9cff50
child 252090 38c35f7b2b7d16c97f3ecacf6f6f27a42612a897
push id1174
push usernsm.nikhil@gmail.com
push dateSun, 22 Mar 2015 01:24:25 +0000
reviewersbhackett
bugs1135042
milestone39.0a1
Bug 1135042: Optimize SIMD.storeX/storeXY/storeXYZ in Ion; r=bhackett
js/src/builtin/SIMD.h
js/src/jit-test/tests/SIMD/store.js
js/src/jit/CodeGenerator.cpp
js/src/jit/IonBuilder.h
js/src/jit/MCallOptimize.cpp
js/src/jit/MIR.h
js/src/jit/MacroAssembler.cpp
js/src/jit/MacroAssembler.h
js/src/jit/arm/MacroAssembler-arm.h
js/src/jit/mips/MacroAssembler-mips.h
js/src/jit/none/MacroAssembler-none.h
js/src/jit/shared/MacroAssembler-x86-shared.h
--- a/js/src/builtin/SIMD.h
+++ b/js/src/builtin/SIMD.h
@@ -243,23 +243,23 @@
     _(neg)                           \
     _(swizzle)                       \
     _(shuffle)                       \
     _(load)                          \
     _(loadX)                         \
     _(loadXY)                        \
     _(loadXYZ)                       \
     _(store)                         \
+    _(storeX)                        \
+    _(storeXY)                       \
+    _(storeXYZ)                      \
     _(check)
 #define FOREACH_COMMONX4_SIMD_OP(_)  \
     ION_COMMONX4_SIMD_OP(_)          \
-    COMP_COMMONX4_TO_INT32X4_SIMD_OP(_) \
-    _(storeX)                        \
-    _(storeXY)                       \
-    _(storeXYZ)
+    COMP_COMMONX4_TO_INT32X4_SIMD_OP(_)
 #define FORALL_SIMD_OP(_)            \
     FOREACH_INT32X4_SIMD_OP(_)       \
     FOREACH_FLOAT32X4_SIMD_OP(_)     \
     FOREACH_COMMONX4_SIMD_OP(_)
 
 namespace js {
 
 class SIMDObject : public JSObject
--- a/js/src/jit-test/tests/SIMD/store.js
+++ b/js/src/jit-test/tests/SIMD/store.js
@@ -12,45 +12,109 @@ function f() {
     var u32 = new Uint32Array(f32.buffer);
     var i16 = new Int16Array(f32.buffer);
     var u16 = new Uint16Array(f32.buffer);
     var i8  = new Int8Array(f32.buffer);
     var u8  = new Uint8Array(f32.buffer);
 
     var f4 = SIMD.float32x4(42, 43, 44, 45);
 
-    function check() {
+    function check(n) {
         assertEq(f32[0], 42);
-        assertEq(f32[1], 43);
-        assertEq(f32[2], 44);
-        assertEq(f32[3], 45);
+        assertEq(f32[1], n > 1 ? 43 : 2);
+        assertEq(f32[2], n > 2 ? 44 : 3);
+        assertEq(f32[3], n > 3 ? 45 : 4);
 
         f32[0] = 1;
         f32[1] = 2;
         f32[2] = 3;
         f32[3] = 4;
     }
 
-    for (var i = 0; i < 150; i++) {
+    function testStore() {
         SIMD.float32x4.store(f64, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(f32, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(i32, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(u32, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(i16, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(u16, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(i8, 0, f4);
-        check();
+        check(4);
         SIMD.float32x4.store(u8, 0, f4);
-        check();
+        check(4);
+    }
+
+    function testStoreX() {
+        SIMD.float32x4.storeX(f64, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(f32, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(i32, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(u32, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(i16, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(u16, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(i8, 0, f4);
+        check(1);
+        SIMD.float32x4.storeX(u8, 0, f4);
+        check(1);
+    }
+
+    function testStoreXY() {
+        SIMD.float32x4.storeXY(f64, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(f32, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(i32, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(u32, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(i16, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(u16, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(i8, 0, f4);
+        check(2);
+        SIMD.float32x4.storeXY(u8, 0, f4);
+        check(2);
+    }
+
+    function testStoreXYZ() {
+        SIMD.float32x4.storeXYZ(f64, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(f32, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(i32, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(u32, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(i16, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(u16, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(i8, 0, f4);
+        check(3);
+        SIMD.float32x4.storeXYZ(u8, 0, f4);
+        check(3);
+    }
+
+    for (var i = 0; i < 150; i++) {
+        testStore();
+        testStoreX();
+        testStoreXY();
+        testStoreXYZ();
     }
 }
 
 f();
 
 function testBailout(uglyDuckling) {
     var f32 = new Float32Array(16);
     for (var i = 0; i < 16; i++)
--- a/js/src/jit/CodeGenerator.cpp
+++ b/js/src/jit/CodeGenerator.cpp
@@ -8754,47 +8754,52 @@ CodeGenerator::visitLoadTypedArrayElemen
     if (fail.used())
         bailoutFrom(&fail, lir->snapshot());
 
     masm.bind(&done);
 }
 
 template <typename T>
 static inline void
-StoreToTypedArray(MacroAssembler &masm, Scalar::Type writeType, const LAllocation *value, const T &dest)
+StoreToTypedArray(MacroAssembler &masm, Scalar::Type writeType, const LAllocation *value,
+                  const T &dest, unsigned numElems = 0)
 {
     if (Scalar::isSimdType(writeType) ||
         writeType == Scalar::Float32 ||
         writeType == Scalar::Float64)
     {
-        masm.storeToTypedFloatArray(writeType, ToFloatRegister(value), dest);
+        masm.storeToTypedFloatArray(writeType, ToFloatRegister(value), dest, numElems);
     } else {
         if (value->isConstant())
             masm.storeToTypedIntArray(writeType, Imm32(ToInt32(value)), dest);
         else
             masm.storeToTypedIntArray(writeType, ToRegister(value), dest);
     }
 }
 
 void
 CodeGenerator::visitStoreUnboxedScalar(LStoreUnboxedScalar *lir)
 {
     Register elements = ToRegister(lir->elements());
     const LAllocation *value = lir->value();
 
-    Scalar::Type writeType = lir->mir()->writeType();
-    int width = Scalar::byteSize(lir->mir()->indexType());
+    const MStoreUnboxedScalar *mir = lir->mir();
+
+    Scalar::Type writeType = mir->writeType();
+    unsigned numElems = mir->numElems();
+
+    int width = Scalar::byteSize(mir->indexType());
 
     if (lir->index()->isConstant()) {
-        Address dest(elements, ToInt32(lir->index()) * width + lir->mir()->offsetAdjustment());
-        StoreToTypedArray(masm, writeType, value, dest);
+        Address dest(elements, ToInt32(lir->index()) * width + mir->offsetAdjustment());
+        StoreToTypedArray(masm, writeType, value, dest, numElems);
     } else {
         BaseIndex dest(elements, ToRegister(lir->index()), ScaleFromElemWidth(width),
-                       lir->mir()->offsetAdjustment());
-        StoreToTypedArray(masm, writeType, value, dest);
+                       mir->offsetAdjustment());
+        StoreToTypedArray(masm, writeType, value, dest, numElems);
     }
 }
 
 void
 CodeGenerator::visitStoreTypedArrayElementHole(LStoreTypedArrayElementHole *lir)
 {
     Register elements = ToRegister(lir->elements());
     const LAllocation *value = lir->value();
--- a/js/src/jit/IonBuilder.h
+++ b/js/src/jit/IonBuilder.h
@@ -839,17 +839,18 @@ class IonBuilder
                                      SimdTypeDescr::Type from, SimdTypeDescr::Type to);
     InliningStatus inlineSimdSelect(CallInfo &callInfo, JSNative native, bool isElementWise,
                                     SimdTypeDescr::Type type);
 
     bool prepareForSimdLoadStore(CallInfo &callInfo, Scalar::Type simdType, MInstruction **elements,
                                  MDefinition **index, Scalar::Type *arrayType);
     InliningStatus inlineSimdLoad(CallInfo &callInfo, JSNative native, SimdTypeDescr::Type type,
                                   unsigned numElems);
-    InliningStatus inlineSimdStore(CallInfo &callInfo, JSNative native, SimdTypeDescr::Type type);
+    InliningStatus inlineSimdStore(CallInfo &callInfo, JSNative native, SimdTypeDescr::Type type,
+                                   unsigned numElems);
 
     // Utility intrinsics.
     InliningStatus inlineIsCallable(CallInfo &callInfo);
     InliningStatus inlineIsObject(CallInfo &callInfo);
     InliningStatus inlineToObject(CallInfo &callInfo);
     InliningStatus inlineToInteger(CallInfo &callInfo);
     InliningStatus inlineToString(CallInfo &callInfo);
     InliningStatus inlineDump(CallInfo &callInfo);
--- a/js/src/jit/MCallOptimize.cpp
+++ b/js/src/jit/MCallOptimize.cpp
@@ -373,19 +373,31 @@ IonBuilder::inlineNativeCall(CallInfo &c
     if (native == js::simd_float32x4_loadX)
         return inlineSimdLoad(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 1);
     if (native == js::simd_float32x4_loadXY)
         return inlineSimdLoad(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 2);
     if (native == js::simd_float32x4_loadXYZ)
         return inlineSimdLoad(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 3);
 
     if (native == js::simd_int32x4_store)
-        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_INT32);
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_INT32, 4);
+    if (native == js::simd_int32x4_storeX)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_INT32, 1);
+    if (native == js::simd_int32x4_storeXY)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_INT32, 2);
+    if (native == js::simd_int32x4_storeXYZ)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_INT32, 3);
     if (native == js::simd_float32x4_store)
-        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_FLOAT32);
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 4);
+    if (native == js::simd_float32x4_storeX)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 1);
+    if (native == js::simd_float32x4_storeXY)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 2);
+    if (native == js::simd_float32x4_storeXYZ)
+        return inlineSimdStore(callInfo, native, SimdTypeDescr::TYPE_FLOAT32, 3);
 
     return InliningStatus_NotInlined;
 }
 
 IonBuilder::InliningStatus
 IonBuilder::inlineNativeGetter(CallInfo &callInfo, JSFunction *target)
 {
     MOZ_ASSERT(target->isNative());
@@ -3225,34 +3237,35 @@ IonBuilder::inlineSimdLoad(CallInfo &cal
     MLoadUnboxedScalar *load = MLoadUnboxedScalar::New(alloc(), elements, index, arrayType);
     load->setResultType(SimdTypeDescrToMIRType(type));
     load->setSimdRead(simdType, numElems);
 
     return boxSimd(callInfo, load, templateObj);
 }
 
 IonBuilder::InliningStatus
-IonBuilder::inlineSimdStore(CallInfo &callInfo, JSNative native, SimdTypeDescr::Type type)
+IonBuilder::inlineSimdStore(CallInfo &callInfo, JSNative native, SimdTypeDescr::Type type,
+                            unsigned numElems)
 {
     InlineTypedObject *templateObj = nullptr;
     if (!checkInlineSimd(callInfo, native, type, 3, &templateObj))
         return InliningStatus_NotInlined;
 
     Scalar::Type simdType = SimdTypeToScalarType(type);
 
     MDefinition *index = nullptr;
     MInstruction *elements = nullptr;
     Scalar::Type arrayType;
     if (!prepareForSimdLoadStore(callInfo, simdType, &elements, &index, &arrayType))
         return InliningStatus_NotInlined;
 
     MDefinition *valueToWrite = callInfo.getArg(2);
     MStoreUnboxedScalar *store = MStoreUnboxedScalar::New(alloc(), elements, index,
                                                           valueToWrite, arrayType);
-    store->setWriteType(simdType);
+    store->setSimdWrite(simdType, numElems);
 
     current->add(store);
     current->push(valueToWrite);
 
     callInfo.setImplicitlyUsedUnchecked();
 
     if (!resumeAfter(store))
         return InliningStatus_Error;
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@@ -9142,25 +9142,27 @@ class StoreUnboxedScalarBase
 class MStoreUnboxedScalar
   : public MTernaryInstruction,
     public StoreUnboxedScalarBase,
     public StoreUnboxedScalarPolicy::Data
 {
     Scalar::Type indexType_;
     bool requiresBarrier_;
     int32_t offsetAdjustment_;
+    unsigned numElems_; // used only for SIMD
 
     MStoreUnboxedScalar(MDefinition *elements, MDefinition *index, MDefinition *value,
                         Scalar::Type indexType, MemoryBarrierRequirement requiresBarrier,
                         int32_t offsetAdjustment)
       : MTernaryInstruction(elements, index, value),
         StoreUnboxedScalarBase(indexType),
         indexType_(indexType),
         requiresBarrier_(requiresBarrier == DoesRequireMemoryBarrier),
-        offsetAdjustment_(offsetAdjustment)
+        offsetAdjustment_(offsetAdjustment),
+        numElems_(1)
     {
         if (requiresBarrier_)
             setGuard();         // Not removable or movable
         else
             setMovable();
         MOZ_ASSERT(IsValidElementsType(elements, offsetAdjustment));
         MOZ_ASSERT(index->type() == MIRType_Int32);
         MOZ_ASSERT(indexType >= 0 && indexType < Scalar::MaxTypedArrayViewType);
@@ -9175,16 +9177,24 @@ class MStoreUnboxedScalar
                                     MemoryBarrierRequirement requiresBarrier =
                                         DoesNotRequireMemoryBarrier,
                                     int32_t offsetAdjustment = 0)
     {
         return new(alloc) MStoreUnboxedScalar(elements, index, value, indexType,
                                               requiresBarrier, offsetAdjustment);
     }
 
+    void setSimdWrite(Scalar::Type writeType, unsigned numElems) {
+        MOZ_ASSERT(Scalar::isSimdType(writeType));
+        setWriteType(writeType);
+        numElems_ = numElems;
+    }
+    unsigned numElems() const {
+        return numElems_;
+    }
     Scalar::Type indexType() const {
         return indexType_;
     }
     MDefinition *elements() const {
         return getOperand(0);
     }
     MDefinition *index() const {
         return getOperand(1);
--- a/js/src/jit/MacroAssembler.cpp
+++ b/js/src/jit/MacroAssembler.cpp
@@ -275,51 +275,80 @@ template void MacroAssembler::guardObjec
 
 template void MacroAssembler::guardType(const Address &address, TypeSet::Type type,
                                         Register scratch, Label *miss);
 template void MacroAssembler::guardType(const ValueOperand &value, TypeSet::Type type,
                                         Register scratch, Label *miss);
 
 template<typename S, typename T>
 static void
-StoreToTypedFloatArray(MacroAssembler &masm, int arrayType, const S &value, const T &dest)
+StoreToTypedFloatArray(MacroAssembler &masm, int arrayType, const S &value, const T &dest,
+                       unsigned numElems)
 {
     switch (arrayType) {
       case Scalar::Float32:
         masm.storeFloat32(value, dest);
         break;
       case Scalar::Float64:
 #ifdef JS_MORE_DETERMINISTIC
         // See the comment in TypedArrayObjectTemplate::doubleToNative.
         masm.canonicalizeDouble(value);
 #endif
         masm.storeDouble(value, dest);
         break;
       case Scalar::Float32x4:
-        masm.storeUnalignedFloat32x4(value, dest);
+        switch (numElems) {
+          case 1:
+            masm.storeFloat32(value, dest);
+            break;
+          case 2:
+            masm.storeDouble(value, dest);
+            break;
+          case 3:
+            masm.storeFloat32x3(value, dest);
+            break;
+          case 4:
+            masm.storeUnalignedFloat32x4(value, dest);
+            break;
+          default: MOZ_CRASH("unexpected number of elements in simd write");
+        }
         break;
       case Scalar::Int32x4:
-        masm.storeUnalignedInt32x4(value, dest);
+        switch (numElems) {
+          case 1:
+            masm.storeInt32x1(value, dest);
+            break;
+          case 2:
+            masm.storeInt32x2(value, dest);
+            break;
+          case 3:
+            masm.storeInt32x3(value, dest);
+            break;
+          case 4:
+            masm.storeUnalignedInt32x4(value, dest);
+            break;
+          default: MOZ_CRASH("unexpected number of elements in simd write");
+        }
         break;
       default:
         MOZ_CRASH("Invalid typed array type");
     }
 }
 
 void
 MacroAssembler::storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value,
-                                       const BaseIndex &dest)
+                                       const BaseIndex &dest, unsigned numElems)
 {
-    StoreToTypedFloatArray(*this, arrayType, value, dest);
+    StoreToTypedFloatArray(*this, arrayType, value, dest, numElems);
 }
 void
 MacroAssembler::storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value,
-                                       const Address &dest)
+                                       const Address &dest, unsigned numElems)
 {
-    StoreToTypedFloatArray(*this, arrayType, value, dest);
+    StoreToTypedFloatArray(*this, arrayType, value, dest, numElems);
 }
 
 template<typename T>
 void
 MacroAssembler::loadFromTypedArray(Scalar::Type arrayType, const T &src, AnyRegister dest, Register temp,
                                    Label *fail, bool canonicalizeDoubles, unsigned numElems)
 {
     switch (arrayType) {
--- a/js/src/jit/MacroAssembler.h
+++ b/js/src/jit/MacroAssembler.h
@@ -752,18 +752,20 @@ class MacroAssembler : public MacroAssem
     template<typename S, typename T>
     void atomicBinopToTypedIntArray(AtomicOp op, Scalar::Type arrayType, const S &value,
                                     const T &mem, Register temp1, Register temp2, AnyRegister output);
 
     // Generating no result.
     template<typename S, typename T>
     void atomicBinopToTypedIntArray(AtomicOp op, Scalar::Type arrayType, const S &value, const T &mem);
 
-    void storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value, const BaseIndex &dest);
-    void storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value, const Address &dest);
+    void storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value, const BaseIndex &dest,
+                                unsigned numElems = 0);
+    void storeToTypedFloatArray(Scalar::Type arrayType, FloatRegister value, const Address &dest,
+                                unsigned numElems = 0);
 
     // Load a property from an UnboxedPlainObject.
     template <typename T>
     void loadUnboxedProperty(T address, JSValueType type, TypedOrValueRegister output);
 
     // Store a property to an UnboxedPlainObject, without triggering barriers.
     // If failure is null, the value definitely has a type suitable for storing
     // in the property.
--- a/js/src/jit/arm/MacroAssembler-arm.h
+++ b/js/src/jit/arm/MacroAssembler-arm.h
@@ -1396,25 +1396,33 @@ class MacroAssemblerARMCompat : public M
     void loadPrivate(const Address &address, Register dest);
 
     void loadInt32x1(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x1(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x2(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x2(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x3(const Address &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x3(const BaseIndex &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x1(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x1(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x2(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x2(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x3(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x3(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
     void loadAlignedInt32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeAlignedInt32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void loadUnalignedInt32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadUnalignedInt32x4(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeUnalignedInt32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void storeUnalignedInt32x4(FloatRegister src, BaseIndex addr) { MOZ_CRASH("NYI"); }
 
     void loadFloat32x3(const Address &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadFloat32x3(const BaseIndex &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
+    void storeFloat32x3(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeFloat32x3(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
     void loadAlignedFloat32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeAlignedFloat32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void loadUnalignedFloat32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadUnalignedFloat32x4(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeUnalignedFloat32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void storeUnalignedFloat32x4(FloatRegister src, BaseIndex addr) { MOZ_CRASH("NYI"); }
 
     void loadDouble(const Address &addr, FloatRegister dest);
--- a/js/src/jit/mips/MacroAssembler-mips.h
+++ b/js/src/jit/mips/MacroAssembler-mips.h
@@ -1314,25 +1314,33 @@ public:
     void loadPrivate(const Address &address, Register dest);
 
     void loadInt32x1(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x1(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x2(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x2(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x3(const Address &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadInt32x3(const BaseIndex &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x1(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x1(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x2(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x2(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x3(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeInt32x3(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
     void loadAlignedInt32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeAlignedInt32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void loadUnalignedInt32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadUnalignedInt32x4(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeUnalignedInt32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void storeUnalignedInt32x4(FloatRegister src, BaseIndex addr) { MOZ_CRASH("NYI"); }
 
     void loadFloat32x3(const Address &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadFloat32x3(const BaseIndex &src, FloatRegister dest) { MOZ_CRASH("NYI"); }
+    void storeFloat32x3(FloatRegister src, const Address &dest) { MOZ_CRASH("NYI"); }
+    void storeFloat32x(FloatRegister src, const BaseIndex &dest) { MOZ_CRASH("NYI"); }
     void loadAlignedFloat32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeAlignedFloat32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void loadUnalignedFloat32x4(const Address &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void loadUnalignedFloat32x4(const BaseIndex &addr, FloatRegister dest) { MOZ_CRASH("NYI"); }
     void storeUnalignedFloat32x4(FloatRegister src, Address addr) { MOZ_CRASH("NYI"); }
     void storeUnalignedFloat32x4(FloatRegister src, BaseIndex addr) { MOZ_CRASH("NYI"); }
 
     void loadDouble(const Address &addr, FloatRegister dest);
--- a/js/src/jit/none/MacroAssembler-none.h
+++ b/js/src/jit/none/MacroAssembler-none.h
@@ -311,16 +311,20 @@ class MacroAssemblerNone : public Assemb
     template <typename T, typename S> void storeFloat32(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void storeDouble(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void storeAlignedInt32x4(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void storeUnalignedInt32x4(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void storeAlignedFloat32x4(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void storeUnalignedFloat32x4(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void store8(T, S) { MOZ_CRASH(); }
     template <typename T, typename S> void store16(T, S) { MOZ_CRASH(); }
+    template <typename T, typename S> void storeInt32x1(T, S) { MOZ_CRASH(); }
+    template <typename T, typename S> void storeInt32x2(T, S) { MOZ_CRASH(); }
+    template <typename T, typename S> void storeInt32x3(T, S) { MOZ_CRASH(); }
+    template <typename T, typename S> void storeFloat32x3(T, S) { MOZ_CRASH(); }
 
     template <typename T> void computeEffectiveAddress(T, Register) { MOZ_CRASH(); }
 
     template <typename T> void compareExchange8SignExtend(const T &mem, Register oldval, Register newval, Register output) { MOZ_CRASH(); }
     template <typename T> void compareExchange8ZeroExtend(const T &mem, Register oldval, Register newval, Register output) { MOZ_CRASH(); }
     template <typename T> void compareExchange16SignExtend(const T &mem, Register oldval, Register newval, Register output) { MOZ_CRASH(); }
     template <typename T> void compareExchange16ZeroExtend(const T &mem, Register oldval, Register newval, Register output) { MOZ_CRASH(); }
     template <typename T> void compareExchange32(const T &mem, Register oldval, Register newval, Register output) { MOZ_CRASH(); }
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@@ -1036,16 +1036,44 @@ class MacroAssemblerX86Shared : public A
         vmovdqu(Operand(src), dest);
     }
     void loadUnalignedInt32x4(const BaseIndex &src, FloatRegister dest) {
         vmovdqu(Operand(src), dest);
     }
     void loadUnalignedInt32x4(const Operand &src, FloatRegister dest) {
         vmovdqu(src, dest);
     }
+
+    void storeInt32x1(FloatRegister src, const Address &dest) {
+        vmovd(src, Operand(dest));
+    }
+    void storeInt32x1(FloatRegister src, const BaseIndex &dest) {
+        vmovd(src, Operand(dest));
+    }
+    void storeInt32x2(FloatRegister src, const Address &dest) {
+        vmovq(src, Operand(dest));
+    }
+    void storeInt32x2(FloatRegister src, const BaseIndex &dest) {
+        vmovq(src, Operand(dest));
+    }
+    void storeInt32x3(FloatRegister src, const Address &dest) {
+        Address destZ(dest);
+        destZ.offset += 2 * sizeof(int32_t);
+        vmovq(src, Operand(dest));
+        vmovhlps(src, ScratchSimdReg, ScratchSimdReg);
+        vmovd(ScratchSimdReg, Operand(destZ));
+    }
+    void storeInt32x3(FloatRegister src, const BaseIndex &dest) {
+        BaseIndex destZ(dest);
+        destZ.offset += 2 * sizeof(int32_t);
+        vmovq(src, Operand(dest));
+        vmovhlps(src, ScratchSimdReg, ScratchSimdReg);
+        vmovd(ScratchSimdReg, Operand(destZ));
+    }
+
     void storeUnalignedInt32x4(FloatRegister src, const Address &dest) {
         vmovdqu(src, Operand(dest));
     }
     void storeUnalignedInt32x4(FloatRegister src, const BaseIndex &dest) {
         vmovdqu(src, Operand(dest));
     }
     void storeUnalignedInt32x4(FloatRegister src, const Operand &dest) {
         vmovdqu(src, dest);
@@ -1111,16 +1139,31 @@ class MacroAssemblerX86Shared : public A
     }
 
     void loadAlignedFloat32x4(const Address &src, FloatRegister dest) {
         vmovaps(Operand(src), dest);
     }
     void loadAlignedFloat32x4(const Operand &src, FloatRegister dest) {
         vmovaps(src, dest);
     }
+
+    void storeFloat32x3(FloatRegister src, const Address &dest) {
+        Address destZ(dest);
+        destZ.offset += 2 * sizeof(int32_t);
+        storeDouble(src, dest);
+        vmovhlps(src, ScratchSimdReg, ScratchSimdReg);
+        storeFloat32(ScratchSimdReg, destZ);
+    }
+    void storeFloat32x3(FloatRegister src, const BaseIndex &dest) {
+        BaseIndex destZ(dest);
+        destZ.offset += 2 * sizeof(int32_t);
+        storeDouble(src, dest);
+        vmovhlps(src, ScratchSimdReg, ScratchSimdReg);
+        storeFloat32(ScratchSimdReg, destZ);
+    }
     void storeAlignedFloat32x4(FloatRegister src, const Address &dest) {
         vmovaps(src, Operand(dest));
     }
     void moveFloat32x4(FloatRegister src, FloatRegister dest) {
         vmovaps(src, dest);
     }
     FloatRegister reusedInputFloat32x4(FloatRegister src, FloatRegister dest) {
         if (HasAVX())