Bug 1416723: Move SIMD code generation to masm methods; r=lth
☠☠ backed out by 0fd93c0985bb ☠☠
author      Benjamin Bouvier <benj@benj.me>
date        Tue, 24 Jul 2018 19:34:06 +0200
changeset   428572 bfaf82051dfd4a6605fed0da60f4f934d938bc50
parent      428571 0b0036d13e81d6f1630fa08ab9f02955372380eb
child       428573 b2242216d11b7aff2b1549ce3f717a98de6cc892
push id     34337
push user   ncsoregi@mozilla.com
push date   Thu, 26 Jul 2018 21:58:45 +0000
treeherder  mozilla-central@8f2f847b2f9d
reviewers   lth
bugs        1416723
milestone   63.0a1
Bug 1416723: Move SIMD code generation to masm methods; r=lth
js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
js/src/jit/x86-shared/CodeGenerator-x86-shared.h
js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
js/src/jit/x86-shared/MacroAssembler-x86-shared.h
js/src/moz.build
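
For context, every hunk below follows the same pattern: an instruction sequence that CodeGenerator-x86-shared.cpp used to emit inline is moved into a named MacroAssembler method (added in the new MacroAssembler-x86-shared-SIMD.cpp and MacroAssembler-x86-shared-inl.h files), and the visitor shrinks to a single call. A minimal sketch of that pattern, using the splatX8 case from the diff below; the visitor side is taken from the patch itself, while the masm-side class name and signature are assumptions, since that file is not part of this excerpt:

    // MacroAssembler-x86-shared-SIMD.cpp (sketch; exact class and signature assumed)
    void
    MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output)
    {
        // The same SSE2 sequence the old visitor emitted inline: move the
        // scalar into the low lane, then broadcast it across all eight lanes.
        vmovd(input, output);
        vpshuflw(0, output, output);
        vpshufd(0, output, output);
    }

    // CodeGenerator-x86-shared.cpp, after the change:
    void
    CodeGenerator::visitSimdSplatX8(LSimdSplatX8* ins)
    {
        MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8);
        masm.splatX8(ToRegister(ins->getOperand(0)), ToFloatRegister(ins->output()));
    }
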
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -24,16 +24,17 @@
 using namespace js;
 using namespace js::jit;
 
 using mozilla::Abs;
 using mozilla::BitwiseCast;
 using mozilla::DebugOnly;
 using mozilla::FloatingPoint;
 using mozilla::FloorLog2;
+using mozilla::Maybe;
 using mozilla::NegativeInfinity;
 using mozilla::SpecificNaN;
 
 using JS::GenericNaN;
 
 namespace js {
 namespace jit {
 
@@ -2510,524 +2511,217 @@ CodeGenerator::visitInt32x4ToFloat32x4(L
 }
 
 void
 CodeGenerator::visitFloat32x4ToInt32x4(LFloat32x4ToInt32x4* ins)
 {
     FloatRegister in = ToFloatRegister(ins->input());
     FloatRegister out = ToFloatRegister(ins->output());
     Register temp = ToRegister(ins->temp());
-
-    masm.convertFloat32x4ToInt32x4(in, out);
-
     auto* ool = new(alloc()) OutOfLineSimdFloatToIntCheck(temp, in, ins,
                                                           ins->mir()->bytecodeOffset());
     addOutOfLineCode(ool, ins->mir());
-
-    static const SimdConstant InvalidResult = SimdConstant::SplatX4(int32_t(-2147483648));
-
-    ScratchSimd128Scope scratch(masm);
-    masm.loadConstantSimd128Int(InvalidResult, scratch);
-    masm.packedEqualInt32x4(Operand(out), scratch);
-    // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
-    // the two following instructions.
-    masm.vmovmskps(scratch, temp);
-    masm.cmp32(temp, Imm32(0));
-    masm.j(Assembler::NotEqual, ool->entry());
-
-    masm.bind(ool->rejoin());
+    masm.checkedConvertFloat32x4ToInt32x4(in, out, temp, ool->entry(), ool->rejoin());
 }
 
 void
-CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIntCheck *ool)
+CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIntCheck* ool)
 {
-    static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
-    static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);
-
     Label onConversionError;
-
-    FloatRegister input = ool->input();
-    Register temp = ool->temp();
-
-    ScratchSimd128Scope scratch(masm);
-    masm.loadConstantSimd128Float(Int32MinX4, scratch);
-    masm.vcmpleps(Operand(input), scratch, scratch);
-    masm.vmovmskps(scratch, temp);
-    masm.cmp32(temp, Imm32(15));
-    masm.j(Assembler::NotEqual, &onConversionError);
-
-    masm.loadConstantSimd128Float(Int32MaxX4, scratch);
-    masm.vcmpleps(Operand(input), scratch, scratch);
-    masm.vmovmskps(scratch, temp);
-    masm.cmp32(temp, Imm32(0));
-    masm.j(Assembler::NotEqual, &onConversionError);
-
-    masm.jump(ool->rejoin());
-
+    masm.oolConvertFloat32x4ToInt32x4(ool->input(), ool->temp(), ool->rejoin(), &onConversionError);
     masm.bind(&onConversionError);
     if (gen->compilingWasm())
         masm.wasmTrap(wasm::Trap::ImpreciseSimdConversion, ool->bytecodeOffset());
     else
         bailout(ool->ins()->snapshot());
 }
 
 // Convert Float32x4 to Uint32x4.
-//
 // If any input lane value is out of range or NaN, bail out.
 void
 CodeGenerator::visitFloat32x4ToUint32x4(LFloat32x4ToUint32x4* ins)
 {
-    const MSimdConvert* mir = ins->mir();
     FloatRegister in = ToFloatRegister(ins->input());
     FloatRegister out = ToFloatRegister(ins->output());
     Register temp = ToRegister(ins->tempR());
     FloatRegister tempF = ToFloatRegister(ins->tempF());
 
-    // Classify lane values into 4 disjoint classes:
-    //
-    //   N-lanes:             in <= -1.0
-    //   A-lanes:      -1.0 < in <= 0x0.ffffffp31
-    //   B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32
-    //   V-lanes: 0x1.0p32 <= in, or isnan(in)
-    //
-    // We need to bail out to throw a RangeError if we see any N-lanes or
-    // V-lanes.
-    //
-    // For A-lanes and B-lanes, we make two float -> int32 conversions:
-    //
-    //   A = cvttps2dq(in)
-    //   B = cvttps2dq(in - 0x1.0p31f)
-    //
-    // Note that the subtraction for the B computation is exact for B-lanes.
-    // There is no rounding, so B is the low 31 bits of the correctly converted
-    // result.
-    //
-    // The cvttps2dq instruction produces 0x80000000 when the input is NaN or
-    // out of range for a signed int32_t. This conveniently provides the missing
-    // high bit for B, so the desired result is A for A-lanes and A|B for
-    // B-lanes.
-
-    ScratchSimd128Scope scratch(masm);
-
-    // TODO: If the majority of lanes are A-lanes, it could be faster to compute
-    // A first, use vmovmskps to check for any non-A-lanes and handle them in
-    // ool code. OTOH, we we're wrong about the lane distribution, that would be
-    // slower.
-
-    // Compute B in |scratch|.
-    static const float Adjust = 0x80000000; // 0x1.0p31f for the benefit of MSVC.
-    static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
-    masm.loadConstantSimd128Float(Bias, scratch);
-    masm.packedAddFloat32(Operand(in), scratch);
-    masm.convertFloat32x4ToInt32x4(scratch, scratch);
-
-    // Compute A in |out|. This is the last time we use |in| and the first time
-    // we use |out|, so we can tolerate if they are the same register.
-    masm.convertFloat32x4ToInt32x4(in, out);
-
-    // We can identify A-lanes by the sign bits in A: Any A-lanes will be
-    // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
-    // mask of non-A-lanes into |tempF|.
-    masm.zeroSimd128Float(tempF);
-    masm.packedGreaterThanInt32x4(Operand(out), tempF);
-
-    // Clear the A-lanes in B.
-    masm.bitwiseAndSimd128(Operand(tempF), scratch);
-
-    // Compute the final result: A for A-lanes, A|B for B-lanes.
-    masm.bitwiseOrSimd128(Operand(scratch), out);
-
-    // We still need to filter out the V-lanes. They would show up as 0x80000000
-    // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
-    // the remaining negative lanes in B.
-    masm.vmovmskps(scratch, temp);
-    masm.cmp32(temp, Imm32(0));
-
-    if (gen->compilingWasm()) {
-        Label ok;
-        masm.j(Assembler::Equal, &ok);
-        masm.wasmTrap(wasm::Trap::ImpreciseSimdConversion, mir->bytecodeOffset());
-        masm.bind(&ok);
-    } else {
-        bailoutIf(Assembler::NotEqual, ins->snapshot());
-    }
+    Label failed;
+    masm.checkedConvertFloat32x4ToUint32x4(in, out, temp, tempF, &failed);
+
+    Label ok;
+    masm.jump(&ok);
+    masm.bind(&failed);
+    if (gen->compilingWasm())
+        masm.wasmTrap(wasm::Trap::ImpreciseSimdConversion, ins->mir()->bytecodeOffset());
+    else
+        bailout(ins->snapshot());
+    masm.bind(&ok);
 }
 
 void
 CodeGenerator::visitSimdValueInt32x4(LSimdValueInt32x4* ins)
 {
     MOZ_ASSERT(ins->mir()->type() == MIRType::Int32x4 || ins->mir()->type() == MIRType::Bool32x4);
-
-    FloatRegister output = ToFloatRegister(ins->output());
-    if (AssemblerX86Shared::HasSSE41()) {
-        masm.vmovd(ToRegister(ins->getOperand(0)), output);
-        for (size_t i = 1; i < 4; ++i) {
-            Register r = ToRegister(ins->getOperand(i));
-            masm.vpinsrd(i, r, output, output);
-        }
-        return;
-    }
-
-    masm.reserveStack(Simd128DataSize);
-    for (size_t i = 0; i < 4; ++i) {
-        Register r = ToRegister(ins->getOperand(i));
-        masm.store32(r, Address(StackPointer, i * sizeof(int32_t)));
-    }
-    masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
-    masm.freeStack(Simd128DataSize);
+    masm.createInt32x4(ToRegister(ins->getOperand(0)),
+                       ToRegister(ins->getOperand(1)),
+                       ToRegister(ins->getOperand(2)),
+                       ToRegister(ins->getOperand(3)),
+                       ToFloatRegister(ins->output())
+                      );
 }
 
 void
 CodeGenerator::visitSimdValueFloat32x4(LSimdValueFloat32x4* ins)
 {
     MOZ_ASSERT(ins->mir()->type() == MIRType::Float32x4);
 
     FloatRegister r0 = ToFloatRegister(ins->getOperand(0));
     FloatRegister r1 = ToFloatRegister(ins->getOperand(1));
     FloatRegister r2 = ToFloatRegister(ins->getOperand(2));
     FloatRegister r3 = ToFloatRegister(ins->getOperand(3));
     FloatRegister tmp = ToFloatRegister(ins->getTemp(0));
     FloatRegister output = ToFloatRegister(ins->output());
 
-    FloatRegister r0Copy = masm.reusedInputFloat32x4(r0, output);
-    FloatRegister r1Copy = masm.reusedInputFloat32x4(r1, tmp);
-
-    masm.vunpcklps(r3, r1Copy, tmp);
-    masm.vunpcklps(r2, r0Copy, output);
-    masm.vunpcklps(tmp, output, output);
+    masm.createFloat32x4(r0, r1, r2, r3, tmp, output);
 }
 
 void
 CodeGenerator::visitSimdSplatX16(LSimdSplatX16* ins)
 {
     MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 16);
     Register input = ToRegister(ins->getOperand(0));
     FloatRegister output = ToFloatRegister(ins->output());
-    masm.vmovd(input, output);
-    if (AssemblerX86Shared::HasSSSE3()) {
-        masm.zeroSimd128Int(ScratchSimd128Reg);
-        masm.vpshufb(ScratchSimd128Reg, output, output);
-    } else {
-        // Use two shifts to duplicate the low 8 bits into the low 16 bits.
-        masm.vpsllw(Imm32(8), output, output);
-        masm.vmovdqa(output, ScratchSimd128Reg);
-        masm.vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
-        masm.vpor(ScratchSimd128Reg, output, output);
-        // Then do an X8 splat.
-        masm.vpshuflw(0, output, output);
-        masm.vpshufd(0, output, output);
-    }
+    masm.splatX16(input, output);
 }
 
 void
 CodeGenerator::visitSimdSplatX8(LSimdSplatX8* ins)
 {
     MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8);
     Register input = ToRegister(ins->getOperand(0));
     FloatRegister output = ToFloatRegister(ins->output());
-    masm.vmovd(input, output);
-    masm.vpshuflw(0, output, output);
-    masm.vpshufd(0, output, output);
+    masm.splatX8(input, output);
 }
 
 void
 CodeGenerator::visitSimdSplatX4(LSimdSplatX4* ins)
 {
     FloatRegister output = ToFloatRegister(ins->output());
-
     MSimdSplat* mir = ins->mir();
     MOZ_ASSERT(IsSimdType(mir->type()));
     JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
-
-    if (mir->type() == MIRType::Float32x4) {
-        FloatRegister r = ToFloatRegister(ins->getOperand(0));
-        FloatRegister rCopy = masm.reusedInputFloat32x4(r, output);
-        masm.vshufps(0, rCopy, rCopy, output);
-    } else {
-        Register r = ToRegister(ins->getOperand(0));
-        masm.vmovd(r, output);
-        masm.vpshufd(0, output, output);
-    }
+    if (mir->type() == MIRType::Float32x4)
+        masm.splatX4(ToFloatRegister(ins->getOperand(0)), output);
+    else
+        masm.splatX4(ToRegister(ins->getOperand(0)), output);
 }
 
 void
 CodeGenerator::visitSimdReinterpretCast(LSimdReinterpretCast* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     FloatRegister output = ToFloatRegister(ins->output());
-
-    if (input.aliases(output))
-        return;
-
-    if (IsIntegerSimdType(ins->mir()->type()))
-        masm.vmovdqa(input, output);
-    else
-        masm.vmovaps(input, output);
-}
-
-// Extract an integer lane from the 32x4 vector register |input| and place it in
-// |output|.
-void
-CodeGeneratorX86Shared::emitSimdExtractLane32x4(FloatRegister input, Register output, unsigned lane)
-{
-    if (lane == 0) {
-        // The value we want to extract is in the low double-word
-        masm.moveLowInt32(input, output);
-    } else if (AssemblerX86Shared::HasSSE41()) {
-        masm.vpextrd(lane, input, output);
-    } else {
-        uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
-        masm.shuffleInt32(mask, input, ScratchSimd128Reg);
-        masm.moveLowInt32(ScratchSimd128Reg, output);
-    }
-}
-
-// Extract an integer lane from the 16x8 vector register |input|, sign- or
-// zero-extend to 32 bits and place the result in |output|.
-void
-CodeGeneratorX86Shared::emitSimdExtractLane16x8(FloatRegister input, Register output,
-                                                unsigned lane, SimdSign signedness)
-{
-    // Unlike pextrd and pextrb, this is available in SSE2.
-    masm.vpextrw(lane, input, output);
-
-    if (signedness == SimdSign::Signed)
-        masm.movswl(output, output);
-}
-
-// Extract an integer lane from the 8x16 vector register |input|, sign- or
-// zero-extend to 32 bits and place the result in |output|.
-void
-CodeGeneratorX86Shared::emitSimdExtractLane8x16(FloatRegister input, Register output,
-                                                unsigned lane, SimdSign signedness)
-{
-    if (AssemblerX86Shared::HasSSE41()) {
-        masm.vpextrb(lane, input, output);
-        // vpextrb clears the high bits, so no further extension required.
-        if (signedness == SimdSign::Unsigned)
-            signedness = SimdSign::NotApplicable;
-    } else {
-        // Extract the relevant 16 bits containing our lane, then shift the
-        // right 8 bits into place.
-        emitSimdExtractLane16x8(input, output, lane / 2, SimdSign::Unsigned);
-        if (lane % 2) {
-            masm.shrl(Imm32(8), output);
-            // The shrl handles the zero-extension. Don't repeat it.
-            if (signedness == SimdSign::Unsigned)
-                signedness = SimdSign::NotApplicable;
-        }
-    }
-
-    // We have the right low 8 bits in |output|, but we may need to fix the high
-    // bits. Note that this requires |output| to be one of the %eax-%edx
-    // registers.
-    switch (signedness) {
-      case SimdSign::Signed:
-        masm.movsbl(output, output);
-        break;
-      case SimdSign::Unsigned:
-        masm.movzbl(output, output);
-        break;
-      case SimdSign::NotApplicable:
-        // No adjustment needed.
-        break;
-    }
+    bool isIntLaneType = IsIntegerSimdType(ins->mir()->type());
+    masm.reinterpretSimd(isIntLaneType, input, output);
 }
 
 void
 CodeGenerator::visitSimdExtractElementB(LSimdExtractElementB* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     Register output = ToRegister(ins->output());
     MSimdExtractElement* mir = ins->mir();
-    unsigned length = SimdTypeToLength(mir->specialization());
-
-    switch (length) {
-      case 4:
-        emitSimdExtractLane32x4(input, output, mir->lane());
-        break;
-      case 8:
-        // Get a lane, don't bother fixing the high bits since we'll mask below.
-        emitSimdExtractLane16x8(input, output, mir->lane(), SimdSign::NotApplicable);
-        break;
-      case 16:
-        emitSimdExtractLane8x16(input, output, mir->lane(), SimdSign::NotApplicable);
-        break;
-      default:
-        MOZ_CRASH("Unhandled SIMD length");
-    }
-
-    // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits.
-    masm.and32(Imm32(1), output);
+    unsigned numLanes = SimdTypeToLength(mir->specialization());
+    masm.extractLaneSimdBool(input, output, numLanes, mir->lane());
 }
 
 void
 CodeGenerator::visitSimdExtractElementI(LSimdExtractElementI* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     Register output = ToRegister(ins->output());
     MSimdExtractElement* mir = ins->mir();
-    unsigned length = SimdTypeToLength(mir->specialization());
-
-    switch (length) {
+    unsigned numLanes = SimdTypeToLength(mir->specialization());
+    switch (numLanes) {
       case 4:
-        emitSimdExtractLane32x4(input, output, mir->lane());
+        masm.extractLaneInt32x4(input, output, mir->lane());
         break;
       case 8:
-        emitSimdExtractLane16x8(input, output, mir->lane(), mir->signedness());
+        masm.extractLaneInt16x8(input, output, mir->lane(), mir->signedness());
         break;
       case 16:
-        emitSimdExtractLane8x16(input, output, mir->lane(), mir->signedness());
+        masm.extractLaneInt8x16(input, output, mir->lane(), mir->signedness());
         break;
       default:
         MOZ_CRASH("Unhandled SIMD length");
     }
 }
 
 void
 CodeGenerator::visitSimdExtractElementU2D(LSimdExtractElementU2D* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     FloatRegister output = ToFloatRegister(ins->output());
     Register temp = ToRegister(ins->temp());
     MSimdExtractElement* mir = ins->mir();
     MOZ_ASSERT(mir->specialization() == MIRType::Int32x4);
-    emitSimdExtractLane32x4(input, temp, mir->lane());
+    masm.extractLaneInt32x4(input, temp, mir->lane());
     masm.convertUInt32ToDouble(temp, output);
 }
 
 void
 CodeGenerator::visitSimdExtractElementF(LSimdExtractElementF* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     FloatRegister output = ToFloatRegister(ins->output());
-
     unsigned lane = ins->mir()->lane();
-    if (lane == 0) {
-        // The value we want to extract is in the low double-word
-        if (input != output)
-            masm.moveFloat32(input, output);
-    } else if (lane == 2) {
-        masm.moveHighPairToLowPairFloat32(input, output);
-    } else {
-        uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
-        masm.shuffleFloat32(mask, input, output);
-    }
-    // NaNs contained within SIMD values are not enforced to be canonical, so
-    // when we extract an element into a "regular" scalar JS value, we have to
-    // canonicalize. In wasm code, we can skip this, as wasm only has to
-    // canonicalize NaNs at FFI boundaries.
-    if (!gen->compilingWasm())
-        masm.canonicalizeFloat(output);
+    bool canonicalize = !gen->compilingWasm();
+    masm.extractLaneFloat32x4(input, output, lane, canonicalize);
 }
 
 void
 CodeGenerator::visitSimdInsertElementI(LSimdInsertElementI* ins)
 {
-    FloatRegister vector = ToFloatRegister(ins->vector());
+    FloatRegister input = ToFloatRegister(ins->vector());
     Register value = ToRegister(ins->value());
     FloatRegister output = ToFloatRegister(ins->output());
-    MOZ_ASSERT(vector == output); // defineReuseInput(0)
-
+    MOZ_ASSERT(input == output); // defineReuseInput(0)
     unsigned lane = ins->lane();
     unsigned length = ins->length();
-
-    if (length == 8) {
-        // Available in SSE 2.
-        masm.vpinsrw(lane, value, vector, output);
-        return;
-    }
-
-    // Note that, contrarily to float32x4, we cannot use vmovd if the inserted
-    // value goes into the first component, as vmovd clears out the higher lanes
-    // of the output.
-    if (AssemblerX86Shared::HasSSE41()) {
-        // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
-        switch (length) {
-          case 4:
-            masm.vpinsrd(lane, value, vector, output);
-            return;
-          case 16:
-            masm.vpinsrb(lane, value, vector, output);
-            return;
-        }
-    }
-
-    masm.reserveStack(Simd128DataSize);
-    masm.storeAlignedSimd128Int(vector, Address(StackPointer, 0));
-    switch (length) {
-      case 4:
-        masm.store32(value, Address(StackPointer, lane * sizeof(int32_t)));
-        break;
-      case 16:
-        // Note that this requires `value` to be in one the registers where the
-        // low 8 bits are addressible (%eax - %edx on x86, all of them on x86-64).
-        masm.store8(value, Address(StackPointer, lane * sizeof(int8_t)));
-        break;
-      default:
-        MOZ_CRASH("Unsupported SIMD length");
-    }
-    masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
-    masm.freeStack(Simd128DataSize);
+    masm.insertLaneSimdInt(input, value, output, lane, length);
 }
 
 void
 CodeGenerator::visitSimdInsertElementF(LSimdInsertElementF* ins)
 {
-    FloatRegister vector = ToFloatRegister(ins->vector());
+    FloatRegister input = ToFloatRegister(ins->vector());
     FloatRegister value = ToFloatRegister(ins->value());
     FloatRegister output = ToFloatRegister(ins->output());
-    MOZ_ASSERT(vector == output); // defineReuseInput(0)
-
-    if (ins->lane() == 0) {
-        // As both operands are registers, vmovss doesn't modify the upper bits
-        // of the destination operand.
-        if (value != output)
-            masm.vmovss(value, vector, output);
-        return;
-    }
-
-    if (AssemblerX86Shared::HasSSE41()) {
-        // The input value is in the low float32 of the 'value' FloatRegister.
-        masm.vinsertps(masm.vinsertpsMask(0, ins->lane()), value, output, output);
-        return;
-    }
-
-    unsigned component = unsigned(ins->lane());
-    masm.reserveStack(Simd128DataSize);
-    masm.storeAlignedSimd128Float(vector, Address(StackPointer, 0));
-    masm.storeFloat32(value, Address(StackPointer, component * sizeof(int32_t)));
-    masm.loadAlignedSimd128Float(Address(StackPointer, 0), output);
-    masm.freeStack(Simd128DataSize);
+    MOZ_ASSERT(input == output); // defineReuseInput(0)
+    masm.insertLaneFloat32x4(input, value, output, ins->lane());
 }
 
 void
 CodeGenerator::visitSimdAllTrue(LSimdAllTrue* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     Register output = ToRegister(ins->output());
-
-    // We know that the input lanes are boolean, so they are either 0 or -1.
-    // The all-true vector has all 128 bits set, no matter the lane geometry.
-    masm.vpmovmskb(input, output);
-    masm.cmp32(output, Imm32(0xffff));
-    masm.emitSet(Assembler::Zero, output);
+    masm.allTrueSimdBool(input, output);
 }
 
 void
 CodeGenerator::visitSimdAnyTrue(LSimdAnyTrue* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     Register output = ToRegister(ins->output());
-
-    masm.vpmovmskb(input, output);
-    masm.cmp32(output, Imm32(0x0));
-    masm.emitSet(Assembler::NonZero, output);
+    masm.anyTrueSimdBool(input, output);
 }
 
+// XXX note for reviewer: this is SIMD.js only, no need to keep it for wasm.
 template <class T, class Reg> void
 CodeGeneratorX86Shared::visitSimdGeneralShuffle(LSimdGeneralShuffleBase* ins, Reg tempRegister)
 {
     MSimdGeneralShuffle* mir = ins->mir();
     unsigned numVectors = mir->numVectors();
 
     Register laneTemp = ToRegister(ins->temp());
 
@@ -3076,16 +2770,17 @@ CodeGeneratorX86Shared::visitSimdGeneral
         bailout(ins->snapshot());
     }
 
     masm.bind(&join);
     masm.setFramePushed(masm.framePushed() + stackSpace);
     masm.freeStack(stackSpace);
 }
 
+// XXX SIMD.js only
 void
 CodeGenerator::visitSimdGeneralShuffleI(LSimdGeneralShuffleI* ins)
 {
     switch (ins->mir()->type()) {
       case MIRType::Int8x16:
         return visitSimdGeneralShuffle<int8_t, Register>(ins, ToRegister(ins->temp()));
       case MIRType::Int16x8:
         return visitSimdGeneralShuffle<int16_t, Register>(ins, ToRegister(ins->temp()));
@@ -3106,628 +2801,160 @@ void
 CodeGenerator::visitSimdSwizzleI(LSimdSwizzleI* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     FloatRegister output = ToFloatRegister(ins->output());
     const unsigned numLanes = ins->numLanes();
 
     switch (numLanes) {
         case 4: {
-            uint32_t x = ins->lane(0);
-            uint32_t y = ins->lane(1);
-            uint32_t z = ins->lane(2);
-            uint32_t w = ins->lane(3);
-
-            uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
-            masm.shuffleInt32(mask, input, output);
+            unsigned lanes[4];
+            for (unsigned i = 0; i < 4; i++)
+                lanes[i] = ins->lane(i);
+            masm.swizzleInt32x4(input, output, lanes);
             return;
         }
     }
 
     // In the general case, use pshufb if it is available. Convert to a
     // byte-wise swizzle.
     const unsigned bytesPerLane = 16 / numLanes;
-    int8_t bLane[16];
+    int8_t lanes[16];
     for (unsigned i = 0; i < numLanes; i++) {
-        for (unsigned b = 0; b < bytesPerLane; b++) {
-            bLane[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b;
-        }
+        for (unsigned b = 0; b < bytesPerLane; b++)
+            lanes[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b;
     }
 
-    if (AssemblerX86Shared::HasSSSE3()) {
-        ScratchSimd128Scope scratch(masm);
-        masm.loadConstantSimd128Int(SimdConstant::CreateX16(bLane), scratch);
-        FloatRegister inputCopy = masm.reusedInputInt32x4(input, output);
-        masm.vpshufb(scratch, inputCopy, output);
-        return;
-    }
-
-    // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
-    Register temp = ToRegister(ins->getTemp(0));
-    masm.reserveStack(2 * Simd128DataSize);
-    masm.storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
-    for (unsigned i = 0; i < 16; i++) {
-        masm.load8ZeroExtend(Address(StackPointer, Simd128DataSize + bLane[i]), temp);
-        masm.store8(temp, Address(StackPointer, i));
-    }
-    masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
-    masm.freeStack(2 * Simd128DataSize);
+    Maybe<Register> maybeTemp;
+    if (!ins->getTemp(0)->isBogusTemp())
+        maybeTemp.emplace(ToRegister(ins->getTemp(0)));
+
+    masm.swizzleInt8x16(input, output, maybeTemp, lanes);
 }
 
 void
 CodeGenerator::visitSimdSwizzleF(LSimdSwizzleF* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     FloatRegister output = ToFloatRegister(ins->output());
     MOZ_ASSERT(ins->numLanes() == 4);
-
-    uint32_t x = ins->lane(0);
-    uint32_t y = ins->lane(1);
-    uint32_t z = ins->lane(2);
-    uint32_t w = ins->lane(3);
-
-    if (AssemblerX86Shared::HasSSE3()) {
-        if (ins->lanesMatch(0, 0, 2, 2)) {
-            masm.vmovsldup(input, output);
-            return;
-        }
-        if (ins->lanesMatch(1, 1, 3, 3)) {
-            masm.vmovshdup(input, output);
-            return;
-        }
-    }
-
-    // TODO Here and below, arch specific lowering could identify this pattern
-    // and use defineReuseInput to avoid this move (bug 1084404)
-    if (ins->lanesMatch(2, 3, 2, 3)) {
-        FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
-        masm.vmovhlps(input, inputCopy, output);
-        return;
-    }
-
-    if (ins->lanesMatch(0, 1, 0, 1)) {
-        if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
-            masm.vmovddup(input, output);
-            return;
-        }
-        FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
-        masm.vmovlhps(input, inputCopy, output);
-        return;
-    }
-
-    if (ins->lanesMatch(0, 0, 1, 1)) {
-        FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
-        masm.vunpcklps(input, inputCopy, output);
-        return;
-    }
-
-    if (ins->lanesMatch(2, 2, 3, 3)) {
-        FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
-        masm.vunpckhps(input, inputCopy, output);
-        return;
-    }
-
-    uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
-    masm.shuffleFloat32(mask, input, output);
+    unsigned lanes[4];
+    for (unsigned i = 0; i < 4; i++)
+        lanes[i] = ins->lane(i);
+    masm.swizzleFloat32x4(input, output, lanes);
 }
 
 void
 CodeGenerator::visitSimdShuffle(LSimdShuffle* ins)
 {
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     FloatRegister rhs = ToFloatRegister(ins->rhs());
     FloatRegister output = ToFloatRegister(ins->output());
     const unsigned numLanes = ins->numLanes();
     const unsigned bytesPerLane = 16 / numLanes;
 
     // Convert the shuffle to a byte-wise shuffle.
-    uint8_t bLane[16];
+    uint8_t lanes[16];
     for (unsigned i = 0; i < numLanes; i++) {
         for (unsigned b = 0; b < bytesPerLane; b++) {
-            bLane[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b;
+            lanes[i * bytesPerLane + b] = ins->lane(i) * bytesPerLane + b;
         }
     }
 
-    // Use pshufb if it is available.
-    if (AssemblerX86Shared::HasSSSE3()) {
-        FloatRegister scratch1 = ToFloatRegister(ins->temp());
-        ScratchSimd128Scope scratch2(masm);
-
-        // Use pshufb instructions to gather the lanes from each source vector.
-        // A negative index creates a zero lane, so the two vectors can be combined.
-
-        // Set scratch2 = lanes from lhs.
-        int8_t idx[16];
-        for (unsigned i = 0; i < 16; i++)
-            idx[i] = bLane[i] < 16 ? bLane[i] : -1;
-        masm.loadConstantSimd128Int(SimdConstant::CreateX16(idx), scratch1);
-        FloatRegister lhsCopy = masm.reusedInputInt32x4(lhs, scratch2);
-        masm.vpshufb(scratch1, lhsCopy, scratch2);
-
-        // Set output = lanes from rhs.
-        for (unsigned i = 0; i < 16; i++)
-            idx[i] = bLane[i] >= 16 ? bLane[i] - 16 : -1;
-        masm.loadConstantSimd128Int(SimdConstant::CreateX16(idx), scratch1);
-        FloatRegister rhsCopy = masm.reusedInputInt32x4(rhs, output);
-        masm.vpshufb(scratch1, rhsCopy, output);
-
-        // Combine.
-        masm.vpor(scratch2, output, output);
-        return;
-    }
-
-    // Worst-case fallback for pre-SSE3 machines. Bounce through memory.
-    Register temp = ToRegister(ins->getTemp(0));
-    masm.reserveStack(3 * Simd128DataSize);
-    masm.storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize));
-    masm.storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize));
-    for (unsigned i = 0; i < 16; i++) {
-        masm.load8ZeroExtend(Address(StackPointer, Simd128DataSize + bLane[i]), temp);
-        masm.store8(temp, Address(StackPointer, i));
-    }
-    masm.loadAlignedSimd128Int(Address(StackPointer, 0), output);
-    masm.freeStack(3 * Simd128DataSize);
+    Maybe<FloatRegister> maybeFloatTemp;
+    Maybe<Register> maybeTemp;
+    if (AssemblerX86Shared::HasSSSE3())
+        maybeFloatTemp.emplace(ToFloatRegister(ins->temp()));
+    else
+        maybeTemp.emplace(ToRegister(ins->temp()));
+
+    masm.shuffleInt8x16(lhs, rhs, output, maybeFloatTemp, maybeTemp, lanes);
 }
 
 void
 CodeGenerator::visitSimdShuffleX4(LSimdShuffleX4* ins)
 {
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     FloatRegister out = ToFloatRegister(ins->output());
-
-    uint32_t x = ins->lane(0);
-    uint32_t y = ins->lane(1);
-    uint32_t z = ins->lane(2);
-    uint32_t w = ins->lane(3);
-
-    // Check that lanes come from LHS in majority:
-    unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
-    MOZ_ASSERT(numLanesFromLHS >= 2);
-
-    // When reading this method, remember that vshufps takes the two first
-    // inputs of the destination operand (right operand) and the two last
-    // inputs of the source operand (left operand).
-    //
-    // Legend for explanations:
-    // - L: LHS
-    // - R: RHS
-    // - T: temporary
-
-    uint32_t mask;
-
-    // If all lanes came from a single vector, we should have constructed a
-    // MSimdSwizzle instead.
-    MOZ_ASSERT(numLanesFromLHS < 4);
-
-    // If all values stay in their lane, this is a blend.
-    if (AssemblerX86Shared::HasSSE41()) {
-        if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
-            masm.vblendps(masm.blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
-            return;
-        }
+    unsigned lanes[4];
+    for (unsigned i = 0; i < 4; i++)
+        lanes[i] = ins->lane(i);
+    Maybe<FloatRegister> maybeTemp;
+    if (!ins->temp()->isBogusTemp())
+        maybeTemp.emplace(ToFloatRegister(ins->temp()));
+    masm.shuffleX4(lhs, rhs, out, maybeTemp, lanes);
+}
+
+static inline Assembler::Condition
+ToCondition(MSimdBinaryComp::Operation op)
+{
+    switch (op) {
+      case MSimdBinaryComp::greaterThan: return Assembler::GreaterThan;
+      case MSimdBinaryComp::equal: return Assembler::Equal;
+      case MSimdBinaryComp::lessThan: return Assembler::LessThan;
+      case MSimdBinaryComp::notEqual: return Assembler::NotEqual;
+      case MSimdBinaryComp::greaterThanOrEqual: return Assembler::GreaterThanOrEqual;
+      case MSimdBinaryComp::lessThanOrEqual: return Assembler::LessThanOrEqual;
     }
-
-    // One element of the second, all other elements of the first
-    if (numLanesFromLHS == 3) {
-        unsigned firstMask = -1, secondMask = -1;
-
-        // register-register vmovss preserves the high lanes.
-        if (ins->lanesMatch(4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
-            masm.vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
-            return;
-        }
-
-        // SSE4.1 vinsertps can handle any single element.
-        unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
-        if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
-            unsigned srcLane;
-            unsigned dstLane;
-            if (x >= 4) {
-                srcLane = x - 4;
-                dstLane = 0;
-            } else if (y >= 4) {
-                srcLane = y - 4;
-                dstLane = 1;
-            } else if (z >= 4) {
-                srcLane = z - 4;
-                dstLane = 2;
-            } else {
-                MOZ_ASSERT(w >= 4);
-                srcLane = w - 4;
-                dstLane = 3;
-            }
-            masm.vinsertps(masm.vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
-            return;
-        }
-
-        FloatRegister rhsCopy = ToFloatRegister(ins->temp());
-
-        if (x < 4 && y < 4) {
-            if (w >= 4) {
-                w %= 4;
-                // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
-                firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
-                // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
-                secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0);
-            } else {
-                MOZ_ASSERT(z >= 4);
-                z %= 4;
-                // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
-                firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
-                // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
-                secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2);
-            }
-
-            masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy);
-            masm.vshufps(secondMask, rhsCopy, lhs, out);
-            return;
-        }
-
-        MOZ_ASSERT(z < 4 && w < 4);
-
-        if (y >= 4) {
-            y %= 4;
-            // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
-            firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
-            // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
-            secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w);
-        } else {
-            MOZ_ASSERT(x >= 4);
-            x %= 4;
-            // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
-            firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
-            // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
-            secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w);
-        }
-
-        masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy);
-        if (AssemblerX86Shared::HasAVX()) {
-            masm.vshufps(secondMask, lhs, rhsCopy, out);
-        } else {
-            masm.vshufps(secondMask, lhs, rhsCopy, rhsCopy);
-            masm.moveSimd128Float(rhsCopy, out);
-        }
-        return;
-    }
-
-    // Two elements from one vector, two other elements from the other
-    MOZ_ASSERT(numLanesFromLHS == 2);
-
-    // TODO Here and below, symmetric case would be more handy to avoid a move,
-    // but can't be reached because operands would get swapped (bug 1084404).
-    if (ins->lanesMatch(2, 3, 6, 7)) {
-        ScratchSimd128Scope scratch(masm);
-        if (AssemblerX86Shared::HasAVX()) {
-            FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch);
-            masm.vmovhlps(lhs, rhsCopy, out);
-        } else {
-            masm.loadAlignedSimd128Float(rhs, scratch);
-            masm.vmovhlps(lhs, scratch, scratch);
-            masm.moveSimd128Float(scratch, out);
-        }
-        return;
-    }
-
-    if (ins->lanesMatch(0, 1, 4, 5)) {
-        FloatRegister rhsCopy;
-        ScratchSimd128Scope scratch(masm);
-        if (rhs.kind() == Operand::FPREG) {
-            // No need to make an actual copy, since the operand is already
-            // in a register, and it won't be clobbered by the vmovlhps.
-            rhsCopy = FloatRegister::FromCode(rhs.fpu());
-        } else {
-            masm.loadAlignedSimd128Float(rhs, scratch);
-            rhsCopy = scratch;
-        }
-        masm.vmovlhps(rhsCopy, lhs, out);
-        return;
-    }
-
-    if (ins->lanesMatch(0, 4, 1, 5)) {
-        masm.vunpcklps(rhs, lhs, out);
-        return;
-    }
-
-    // TODO swapped case would be better (bug 1084404)
-    if (ins->lanesMatch(4, 0, 5, 1)) {
-        ScratchSimd128Scope scratch(masm);
-        if (AssemblerX86Shared::HasAVX()) {
-            FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch);
-            masm.vunpcklps(lhs, rhsCopy, out);
-        } else {
-            masm.loadAlignedSimd128Float(rhs, scratch);
-            masm.vunpcklps(lhs, scratch, scratch);
-            masm.moveSimd128Float(scratch, out);
-        }
-        return;
-    }
-
-    if (ins->lanesMatch(2, 6, 3, 7)) {
-        masm.vunpckhps(rhs, lhs, out);
-        return;
-    }
-
-    // TODO swapped case would be better (bug 1084404)
-    if (ins->lanesMatch(6, 2, 7, 3)) {
-        ScratchSimd128Scope scratch(masm);
-        if (AssemblerX86Shared::HasAVX()) {
-            FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch);
-            masm.vunpckhps(lhs, rhsCopy, out);
-        } else {
-            masm.loadAlignedSimd128Float(rhs, scratch);
-            masm.vunpckhps(lhs, scratch, scratch);
-            masm.moveSimd128Float(scratch, out);
-        }
-        return;
-    }
-
-    // In one vshufps
-    if (x < 4 && y < 4) {
-        mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
-        masm.vshufps(mask, rhs, lhs, out);
-        return;
-    }
-
-    // At creation, we should have explicitly swapped in this case.
-    MOZ_ASSERT(!(z >= 4 && w >= 4));
-
-    // In two vshufps, for the most generic case:
-    uint32_t firstMask[4], secondMask[4];
-    unsigned i = 0, j = 2, k = 0;
-
-#define COMPUTE_MASK(lane)       \
-    if (lane >= 4) {             \
-        firstMask[j] = lane % 4; \
-        secondMask[k++] = j++;   \
-    } else {                     \
-        firstMask[i] = lane;     \
-        secondMask[k++] = i++;   \
-    }
-
-    COMPUTE_MASK(x)
-    COMPUTE_MASK(y)
-    COMPUTE_MASK(z)
-    COMPUTE_MASK(w)
-#undef COMPUTE_MASK
-
-    MOZ_ASSERT(i == 2 && j == 4 && k == 4);
-
-    mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
-                                              firstMask[2], firstMask[3]);
-    masm.vshufps(mask, rhs, lhs, lhs);
-
-    mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
-                                              secondMask[2], secondMask[3]);
-    masm.vshufps(mask, lhs, lhs, lhs);
+    MOZ_CRASH("unexpected cond");
 }
 
 void
 CodeGenerator::visitSimdBinaryCompIx16(LSimdBinaryCompIx16* ins)
 {
-    static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
-
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     FloatRegister output = ToFloatRegister(ins->output());
     MOZ_ASSERT_IF(!Assembler::HasAVX(), output == lhs);
-
-    ScratchSimd128Scope scratch(masm);
-
-    MSimdBinaryComp::Operation op = ins->operation();
-    switch (op) {
-      case MSimdBinaryComp::greaterThan:
-        masm.vpcmpgtb(rhs, lhs, output);
-        return;
-      case MSimdBinaryComp::equal:
-        masm.vpcmpeqb(rhs, lhs, output);
-        return;
-      case MSimdBinaryComp::lessThan:
-        // src := rhs
-        if (rhs.kind() == Operand::FPREG)
-            masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
-        else
-            masm.loadAlignedSimd128Int(rhs, scratch);
-
-        // src := src > lhs (i.e. lhs < rhs)
-        // Improve by doing custom lowering (rhs is tied to the output register)
-        masm.vpcmpgtb(ToOperand(ins->lhs()), scratch, scratch);
-        masm.moveSimd128Int(scratch, output);
-        return;
-      case MSimdBinaryComp::notEqual:
-        // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
-        // should invert the comparison by, e.g. swapping the arms of a select
-        // if that's what it's used in.
-        masm.loadConstantSimd128Int(allOnes, scratch);
-        masm.vpcmpeqb(rhs, lhs, output);
-        masm.bitwiseXorSimd128(Operand(scratch), output);
-        return;
-      case MSimdBinaryComp::greaterThanOrEqual:
-        // src := rhs
-        if (rhs.kind() == Operand::FPREG)
-            masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
-        else
-            masm.loadAlignedSimd128Int(rhs, scratch);
-        masm.vpcmpgtb(ToOperand(ins->lhs()), scratch, scratch);
-        masm.loadConstantSimd128Int(allOnes, output);
-        masm.bitwiseXorSimd128(Operand(scratch), output);
-        return;
-      case MSimdBinaryComp::lessThanOrEqual:
-        // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
-        masm.loadConstantSimd128Int(allOnes, scratch);
-        masm.vpcmpgtb(rhs, lhs, output);
-        masm.bitwiseXorSimd128(Operand(scratch), output);
-        return;
-    }
-    MOZ_CRASH("unexpected SIMD op");
+    masm.compareInt8x16(lhs, rhs, ToCondition(ins->operation()), output);
 }
 
 void
 CodeGenerator::visitSimdBinaryCompIx8(LSimdBinaryCompIx8* ins)
 {
-    static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
-
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     FloatRegister output = ToFloatRegister(ins->output());
     MOZ_ASSERT_IF(!Assembler::HasAVX(), output == lhs);
-
-    ScratchSimd128Scope scratch(masm);
-
-    MSimdBinaryComp::Operation op = ins->operation();
-    switch (op) {
-      case MSimdBinaryComp::greaterThan:
-        masm.vpcmpgtw(rhs, lhs, output);
-        return;
-      case MSimdBinaryComp::equal:
-        masm.vpcmpeqw(rhs, lhs, output);
-        return;
-      case MSimdBinaryComp::lessThan:
-        // src := rhs
-        if (rhs.kind() == Operand::FPREG)
-            masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
-        else
-            masm.loadAlignedSimd128Int(rhs, scratch);
-
-        // src := src > lhs (i.e. lhs < rhs)
-        // Improve by doing custom lowering (rhs is tied to the output register)
-        masm.vpcmpgtw(ToOperand(ins->lhs()), scratch, scratch);
-        masm.moveSimd128Int(scratch, output);
-        return;
-      case MSimdBinaryComp::notEqual:
-        // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
-        // should invert the comparison by, e.g. swapping the arms of a select
-        // if that's what it's used in.
-        masm.loadConstantSimd128Int(allOnes, scratch);
-        masm.vpcmpeqw(rhs, lhs, output);
-        masm.bitwiseXorSimd128(Operand(scratch), output);
-        return;
-      case MSimdBinaryComp::greaterThanOrEqual:
-        // src := rhs
-        if (rhs.kind() == Operand::FPREG)
-            masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
-        else
-            masm.loadAlignedSimd128Int(rhs, scratch);
-        masm.vpcmpgtw(ToOperand(ins->lhs()), scratch, scratch);
-        masm.loadConstantSimd128Int(allOnes, output);
-        masm.bitwiseXorSimd128(Operand(scratch), output);
-        return;
-      case MSimdBinaryComp::lessThanOrEqual:
-        // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
-        masm.loadConstantSimd128Int(allOnes, scratch);
-        masm.vpcmpgtw(rhs, lhs, output);
-        masm.bitwiseXorSimd128(Operand(scratch), output);
-        return;
-    }
-    MOZ_CRASH("unexpected SIMD op");
+    masm.compareInt16x8(lhs, rhs, ToCondition(ins->operation()), output);
 }
 
 void
 CodeGenerator::visitSimdBinaryCompIx4(LSimdBinaryCompIx4* ins)
 {
-    static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
-
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     MOZ_ASSERT(ToFloatRegister(ins->output()) == lhs);
-
-    ScratchSimd128Scope scratch(masm);
-
-    MSimdBinaryComp::Operation op = ins->operation();
-    switch (op) {
-      case MSimdBinaryComp::greaterThan:
-        masm.packedGreaterThanInt32x4(rhs, lhs);
-        return;
-      case MSimdBinaryComp::equal:
-        masm.packedEqualInt32x4(rhs, lhs);
-        return;
-      case MSimdBinaryComp::lessThan:
-        // src := rhs
-        if (rhs.kind() == Operand::FPREG)
-            masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
-        else
-            masm.loadAlignedSimd128Int(rhs, scratch);
-
-        // src := src > lhs (i.e. lhs < rhs)
-        // Improve by doing custom lowering (rhs is tied to the output register)
-        masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), scratch);
-        masm.moveSimd128Int(scratch, lhs);
-        return;
-      case MSimdBinaryComp::notEqual:
-        // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
-        // should invert the comparison by, e.g. swapping the arms of a select
-        // if that's what it's used in.
-        masm.loadConstantSimd128Int(allOnes, scratch);
-        masm.packedEqualInt32x4(rhs, lhs);
-        masm.bitwiseXorSimd128(Operand(scratch), lhs);
-        return;
-      case MSimdBinaryComp::greaterThanOrEqual:
-        // src := rhs
-        if (rhs.kind() == Operand::FPREG)
-            masm.moveSimd128Int(ToFloatRegister(ins->rhs()), scratch);
-        else
-            masm.loadAlignedSimd128Int(rhs, scratch);
-        masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), scratch);
-        masm.loadConstantSimd128Int(allOnes, lhs);
-        masm.bitwiseXorSimd128(Operand(scratch), lhs);
-        return;
-      case MSimdBinaryComp::lessThanOrEqual:
-        // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
-        masm.loadConstantSimd128Int(allOnes, scratch);
-        masm.packedGreaterThanInt32x4(rhs, lhs);
-        masm.bitwiseXorSimd128(Operand(scratch), lhs);
-        return;
-    }
-    MOZ_CRASH("unexpected SIMD op");
+    masm.compareInt32x4(lhs, rhs, ToCondition(ins->operation()), lhs);
 }
 
 void
 CodeGenerator::visitSimdBinaryCompFx4(LSimdBinaryCompFx4* ins)
 {
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     FloatRegister output = ToFloatRegister(ins->output());
-
-    MSimdBinaryComp::Operation op = ins->operation();
-    switch (op) {
-      case MSimdBinaryComp::equal:
-        masm.vcmpeqps(rhs, lhs, output);
-        return;
-      case MSimdBinaryComp::lessThan:
-        masm.vcmpltps(rhs, lhs, output);
-        return;
-      case MSimdBinaryComp::lessThanOrEqual:
-        masm.vcmpleps(rhs, lhs, output);
-        return;
-      case MSimdBinaryComp::notEqual:
-        masm.vcmpneqps(rhs, lhs, output);
-        return;
-      case MSimdBinaryComp::greaterThanOrEqual:
-      case MSimdBinaryComp::greaterThan:
-        // We reverse these before register allocation so that we don't have to
-        // copy into and out of temporaries after codegen.
-        MOZ_CRASH("lowering should have reversed this");
-    }
-    MOZ_CRASH("unexpected SIMD op");
+    masm.compareFloat32x4(lhs, rhs, ToCondition(ins->operation()), output);
 }
 
 void
 CodeGenerator::visitSimdBinaryArithIx16(LSimdBinaryArithIx16* ins)
 {
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     FloatRegister output = ToFloatRegister(ins->output());
 
     MSimdBinaryArith::Operation op = ins->operation();
     switch (op) {
       case MSimdBinaryArith::Op_add:
-        masm.vpaddb(rhs, lhs, output);
+        masm.addInt8x16(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_sub:
-        masm.vpsubb(rhs, lhs, output);
+        masm.subInt8x16(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_mul:
         // 8x16 mul is a valid operation, but not supported in SSE or AVX.
         // The operation is synthesized from 16x8 multiplies by
         // MSimdBinaryArith::AddLegalized().
         break;
       case MSimdBinaryArith::Op_div:
       case MSimdBinaryArith::Op_max:
@@ -3744,23 +2971,23 @@ CodeGenerator::visitSimdBinaryArithIx8(L
 {
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     FloatRegister output = ToFloatRegister(ins->output());
 
     MSimdBinaryArith::Operation op = ins->operation();
     switch (op) {
       case MSimdBinaryArith::Op_add:
-        masm.vpaddw(rhs, lhs, output);
+        masm.addInt16x8(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_sub:
-        masm.vpsubw(rhs, lhs, output);
+        masm.subInt16x8(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_mul:
-        masm.vpmullw(rhs, lhs, output);
+        masm.mulInt16x8(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_div:
       case MSimdBinaryArith::Op_max:
       case MSimdBinaryArith::Op_min:
       case MSimdBinaryArith::Op_minNum:
       case MSimdBinaryArith::Op_maxNum:
         break;
     }
@@ -3769,45 +2996,29 @@ CodeGenerator::visitSimdBinaryArithIx8(L
 
 void
 CodeGenerator::visitSimdBinaryArithIx4(LSimdBinaryArithIx4* ins)
 {
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     FloatRegister output = ToFloatRegister(ins->output());
 
-    ScratchSimd128Scope scratch(masm);
-
     MSimdBinaryArith::Operation op = ins->operation();
     switch (op) {
       case MSimdBinaryArith::Op_add:
-        masm.vpaddd(rhs, lhs, output);
+        masm.addInt32x4(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_sub:
-        masm.vpsubd(rhs, lhs, output);
+        masm.subInt32x4(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_mul: {
-        if (AssemblerX86Shared::HasSSE41()) {
-            masm.vpmulld(rhs, lhs, output);
-            return;
-        }
-
-        masm.loadAlignedSimd128Int(rhs, scratch);
-        masm.vpmuludq(lhs, scratch, scratch);
-        // scratch contains (Rx, _, Rz, _) where R is the resulting vector.
-
-        FloatRegister temp = ToFloatRegister(ins->temp());
-        masm.vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs);
-        masm.vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, temp);
-        masm.vpmuludq(temp, lhs, lhs);
-        // lhs contains (Ry, _, Rw, _) where R is the resulting vector.
-
-        masm.vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs);
-        // lhs contains (Ry, Rw, Rx, Rz)
-        masm.vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs);
+        Maybe<FloatRegister> maybeTemp;
+        if (!AssemblerX86Shared::HasSSE41())
+            maybeTemp.emplace(ToFloatRegister(ins->getTemp(0)));
+        masm.mulInt32x4(lhs, rhs, maybeTemp, output);
         return;
       }
       case MSimdBinaryArith::Op_div:
         // x86 doesn't have SIMD i32 div.
         break;
       case MSimdBinaryArith::Op_max:
         // we can do max with a single instruction only if we have SSE4.1
         // using the PMAXSD instruction.
@@ -3825,114 +3036,44 @@ CodeGenerator::visitSimdBinaryArithIx4(L
 
 void
 CodeGenerator::visitSimdBinaryArithFx4(LSimdBinaryArithFx4* ins)
 {
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     FloatRegister output = ToFloatRegister(ins->output());
 
-    ScratchSimd128Scope scratch(masm);
-
     MSimdBinaryArith::Operation op = ins->operation();
     switch (op) {
       case MSimdBinaryArith::Op_add:
-        masm.vaddps(rhs, lhs, output);
+        masm.addFloat32x4(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_sub:
-        masm.vsubps(rhs, lhs, output);
+        masm.subFloat32x4(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_mul:
-        masm.vmulps(rhs, lhs, output);
+        masm.mulFloat32x4(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_div:
-        masm.vdivps(rhs, lhs, output);
+        masm.divFloat32x4(lhs, rhs, output);
         return;
       case MSimdBinaryArith::Op_max: {
-        FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, scratch);
-        masm.vcmpunordps(rhs, lhsCopy, scratch);
-
-        FloatRegister tmp = ToFloatRegister(ins->temp());
-        FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, tmp);
-        masm.vmaxps(Operand(lhs), rhsCopy, tmp);
-        masm.vmaxps(rhs, lhs, output);
-
-        masm.vandps(tmp, output, output);
-        masm.vorps(scratch, output, output); // or in the all-ones NaNs
+        masm.maxFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output);
         return;
       }
       case MSimdBinaryArith::Op_min: {
-        FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, scratch);
-        masm.vminps(Operand(lhs), rhsCopy, scratch);
-        masm.vminps(rhs, lhs, output);
-        masm.vorps(scratch, output, output); // NaN or'd with arbitrary bits is NaN
+        masm.minFloat32x4(lhs, rhs, output);
         return;
       }
       case MSimdBinaryArith::Op_minNum: {
-        FloatRegister tmp = ToFloatRegister(ins->temp());
-        masm.loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), tmp);
-
-        FloatRegister mask = scratch;
-        FloatRegister tmpCopy = masm.reusedInputFloat32x4(tmp, scratch);
-        masm.vpcmpeqd(Operand(lhs), tmpCopy, mask);
-        masm.vandps(tmp, mask, mask);
-
-        FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp);
-        masm.vminps(rhs, lhsCopy, tmp);
-        masm.vorps(mask, tmp, tmp);
-
-        FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask);
-        masm.vcmpneqps(rhs, rhsCopy, mask);
-
-        if (AssemblerX86Shared::HasAVX()) {
-            masm.vblendvps(mask, lhs, tmp, output);
-        } else {
-            // Emulate vblendvps.
-            // With SSE.4.1 we could use blendvps, however it's awkward since
-            // it requires the mask to be in xmm0.
-            if (lhs != output)
-                masm.moveSimd128Float(lhs, output);
-            masm.vandps(Operand(mask), output, output);
-            masm.vandnps(Operand(tmp), mask, mask);
-            masm.vorps(Operand(mask), output, output);
-        }
+        masm.minNumFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output);
         return;
       }
       case MSimdBinaryArith::Op_maxNum: {
-        FloatRegister mask = scratch;
-        masm.loadConstantSimd128Int(SimdConstant::SplatX4(0), mask);
-        masm.vpcmpeqd(Operand(lhs), mask, mask);
-
-        FloatRegister tmp = ToFloatRegister(ins->temp());
-        masm.loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), tmp);
-        masm.vandps(tmp, mask, mask);
-
-        FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp);
-        masm.vmaxps(rhs, lhsCopy, tmp);
-        masm.vandnps(Operand(tmp), mask, mask);
-
-        // Ensure tmp always contains the temporary result
-        mask = tmp;
-        tmp = scratch;
-
-        FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask);
-        masm.vcmpneqps(rhs, rhsCopy, mask);
-
-        if (AssemblerX86Shared::HasAVX()) {
-            masm.vblendvps(mask, lhs, tmp, output);
-        } else {
-            // Emulate vblendvps.
-            // With SSE.4.1 we could use blendvps, however it's awkward since
-            // it requires the mask to be in xmm0.
-            if (lhs != output)
-                masm.moveSimd128Float(lhs, output);
-            masm.vandps(Operand(mask), output, output);
-            masm.vandnps(Operand(tmp), mask, mask);
-            masm.vorps(Operand(mask), output, output);
-        }
+        masm.maxNumFloat32x4(lhs, rhs, ToFloatRegister(ins->temp()), output);
         return;
       }
     }
     MOZ_CRASH("unexpected SIMD op");
 }
 
 void
 CodeGenerator::visitSimdBinarySaturating(LSimdBinarySaturating* ins)
@@ -3943,160 +3084,119 @@ CodeGenerator::visitSimdBinarySaturating
 
     SimdSign sign = ins->signedness();
     MOZ_ASSERT(sign != SimdSign::NotApplicable);
 
     switch (ins->type()) {
       case MIRType::Int8x16:
         switch (ins->operation()) {
           case MSimdBinarySaturating::add:
-            if (sign == SimdSign::Signed)
-                masm.vpaddsb(rhs, lhs, output);
-            else
-                masm.vpaddusb(rhs, lhs, output);
+            masm.addSatInt8x16(lhs, rhs, sign, output);
             return;
           case MSimdBinarySaturating::sub:
-            if (sign == SimdSign::Signed)
-                masm.vpsubsb(rhs, lhs, output);
-            else
-                masm.vpsubusb(rhs, lhs, output);
+            masm.subSatInt8x16(lhs, rhs, sign, output);
             return;
         }
         break;
 
       case MIRType::Int16x8:
         switch (ins->operation()) {
           case MSimdBinarySaturating::add:
-            if (sign == SimdSign::Signed)
-                masm.vpaddsw(rhs, lhs, output);
-            else
-                masm.vpaddusw(rhs, lhs, output);
+            masm.addSatInt16x8(lhs, rhs, sign, output);
             return;
           case MSimdBinarySaturating::sub:
-            if (sign == SimdSign::Signed)
-                masm.vpsubsw(rhs, lhs, output);
-            else
-                masm.vpsubusw(rhs, lhs, output);
+            masm.subSatInt16x8(lhs, rhs, sign, output);
             return;
         }
         break;
 
       default:
         break;
     }
     MOZ_CRASH("unsupported type for SIMD saturating arithmetic");
 }
 
 void
 CodeGenerator::visitSimdUnaryArithIx16(LSimdUnaryArithIx16* ins)
 {
     Operand in = ToOperand(ins->input());
     FloatRegister out = ToFloatRegister(ins->output());
-
-    static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
-
     switch (ins->operation()) {
       case MSimdUnaryArith::neg:
-        masm.zeroSimd128Int(out);
-        masm.packedSubInt8(in, out);
+        masm.negInt8x16(in, out);
         return;
       case MSimdUnaryArith::not_:
-        masm.loadConstantSimd128Int(allOnes, out);
-        masm.bitwiseXorSimd128(in, out);
+        masm.notInt8x16(in, out);
         return;
       case MSimdUnaryArith::abs:
       case MSimdUnaryArith::reciprocalApproximation:
       case MSimdUnaryArith::reciprocalSqrtApproximation:
       case MSimdUnaryArith::sqrt:
         break;
     }
     MOZ_CRASH("unexpected SIMD op");
 }
 
 void
 CodeGenerator::visitSimdUnaryArithIx8(LSimdUnaryArithIx8* ins)
 {
     Operand in = ToOperand(ins->input());
     FloatRegister out = ToFloatRegister(ins->output());
-
-    static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
-
     switch (ins->operation()) {
       case MSimdUnaryArith::neg:
-        masm.zeroSimd128Int(out);
-        masm.packedSubInt16(in, out);
+        masm.negInt16x8(in, out);
         return;
       case MSimdUnaryArith::not_:
-        masm.loadConstantSimd128Int(allOnes, out);
-        masm.bitwiseXorSimd128(in, out);
+        masm.notInt16x8(in, out);
         return;
       case MSimdUnaryArith::abs:
       case MSimdUnaryArith::reciprocalApproximation:
       case MSimdUnaryArith::reciprocalSqrtApproximation:
       case MSimdUnaryArith::sqrt:
         break;
     }
     MOZ_CRASH("unexpected SIMD op");
 }
 
 void
 CodeGenerator::visitSimdUnaryArithIx4(LSimdUnaryArithIx4* ins)
 {
     Operand in = ToOperand(ins->input());
     FloatRegister out = ToFloatRegister(ins->output());
-
-    static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
-
     switch (ins->operation()) {
       case MSimdUnaryArith::neg:
-        masm.zeroSimd128Int(out);
-        masm.packedSubInt32(in, out);
+        masm.negInt32x4(in, out);
         return;
       case MSimdUnaryArith::not_:
-        masm.loadConstantSimd128Int(allOnes, out);
-        masm.bitwiseXorSimd128(in, out);
+        masm.notInt32x4(in, out);
         return;
       case MSimdUnaryArith::abs:
       case MSimdUnaryArith::reciprocalApproximation:
       case MSimdUnaryArith::reciprocalSqrtApproximation:
       case MSimdUnaryArith::sqrt:
         break;
     }
     MOZ_CRASH("unexpected SIMD op");
 }
 
 void
 CodeGenerator::visitSimdUnaryArithFx4(LSimdUnaryArithFx4* ins)
 {
     Operand in = ToOperand(ins->input());
     FloatRegister out = ToFloatRegister(ins->output());
 
-    // All ones but the sign bit
-    float signMask = SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits);
-    static const SimdConstant signMasks = SimdConstant::SplatX4(signMask);
-
-    // All ones including the sign bit
-    float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits);
-    static const SimdConstant allOnes = SimdConstant::SplatX4(ones);
-
-    // All zeros but the sign bit
-    static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f);
-
     switch (ins->operation()) {
       case MSimdUnaryArith::abs:
-        masm.loadConstantSimd128Float(signMasks, out);
-        masm.bitwiseAndSimd128(in, out);
+        masm.absFloat32x4(in, out);
         return;
       case MSimdUnaryArith::neg:
-        masm.loadConstantSimd128Float(minusZero, out);
-        masm.bitwiseXorSimd128(in, out);
+        masm.negFloat32x4(in, out);
         return;
       case MSimdUnaryArith::not_:
-        masm.loadConstantSimd128Float(allOnes, out);
-        masm.bitwiseXorSimd128(in, out);
+        masm.notFloat32x4(in, out);
         return;
       case MSimdUnaryArith::reciprocalApproximation:
         masm.packedRcpApproximationFloat32x4(in, out);
         return;
       case MSimdUnaryArith::reciprocalSqrtApproximation:
         masm.packedRcpSqrtApproximationFloat32x4(in, out);
         return;
       case MSimdUnaryArith::sqrt:
@@ -4112,51 +3212,48 @@ CodeGenerator::visitSimdBinaryBitwise(LS
     FloatRegister lhs = ToFloatRegister(ins->lhs());
     Operand rhs = ToOperand(ins->rhs());
     FloatRegister output = ToFloatRegister(ins->output());
 
     MSimdBinaryBitwise::Operation op = ins->operation();
     switch (op) {
       case MSimdBinaryBitwise::and_:
         if (ins->type() == MIRType::Float32x4)
-            masm.vandps(rhs, lhs, output);
+            masm.bitwiseAndFloat32x4(lhs, rhs, output);
         else
-            masm.vpand(rhs, lhs, output);
+            masm.bitwiseAndSimdInt(lhs, rhs, output);
         return;
       case MSimdBinaryBitwise::or_:
         if (ins->type() == MIRType::Float32x4)
-            masm.vorps(rhs, lhs, output);
+            masm.bitwiseOrFloat32x4(lhs, rhs, output);
         else
-            masm.vpor(rhs, lhs, output);
+            masm.bitwiseOrSimdInt(lhs, rhs, output);
         return;
       case MSimdBinaryBitwise::xor_:
         if (ins->type() == MIRType::Float32x4)
-            masm.vxorps(rhs, lhs, output);
+            masm.bitwiseXorFloat32x4(lhs, rhs, output);
         else
-            masm.vpxor(rhs, lhs, output);
+            masm.bitwiseXorSimdInt(lhs, rhs, output);
         return;
     }
     MOZ_CRASH("unexpected SIMD bitwise op");
 }
 
 void
 CodeGenerator::visitSimdShift(LSimdShift* ins)
 {
     FloatRegister out = ToFloatRegister(ins->output());
     MOZ_ASSERT(ToFloatRegister(ins->vector()) == out); // defineReuseInput(0);
 
-    // The shift amount is masked to the number of bits in a lane.
-    uint32_t shiftmask = (128u / SimdTypeToLength(ins->type())) - 1;
-
     // Note that SSE doesn't have instructions for shifting 8x16 vectors.
     // These shifts are synthesized by the MSimdShift::AddLegalized() function.
     const LAllocation* val = ins->value();
     if (val->isConstant()) {
         MOZ_ASSERT(ins->temp()->isBogusTemp());
-        Imm32 count(uint32_t(ToInt32(val)) & shiftmask);
+        Imm32 count(uint32_t(ToInt32(val)));
         switch (ins->type()) {
           case MIRType::Int16x8:
             switch (ins->operation()) {
               case MSimdShift::lsh:
                 masm.packedLeftShiftByScalarInt16x8(count, out);
                 return;
               case MSimdShift::rsh:
                 masm.packedRightShiftByScalarInt16x8(count, out);
@@ -4180,48 +3277,43 @@ CodeGenerator::visitSimdShift(LSimdShift
             }
             break;
           default:
             MOZ_CRASH("unsupported type for SIMD shifts");
         }
         MOZ_CRASH("unexpected SIMD bitwise op");
     }
 
-    // Truncate val to 5 bits. We should have a temp register for that.
-    MOZ_ASSERT(val->isRegister());
-    Register count = ToRegister(ins->temp());
-    masm.mov(ToRegister(val), count);
-    masm.andl(Imm32(shiftmask), count);
-    ScratchFloat32Scope scratch(masm);
-    masm.vmovd(count, scratch);
+    Register temp = ToRegister(ins->temp());
+    Register count = ToRegister(val);
 
     switch (ins->type()) {
       case MIRType::Int16x8:
         switch (ins->operation()) {
           case MSimdShift::lsh:
-            masm.packedLeftShiftByScalarInt16x8(scratch, out);
+            masm.packedLeftShiftByScalarInt16x8(out, count, temp, out);
             return;
           case MSimdShift::rsh:
-            masm.packedRightShiftByScalarInt16x8(scratch, out);
+            masm.packedRightShiftByScalarInt16x8(out, count, temp, out);
             return;
           case MSimdShift::ursh:
-            masm.packedUnsignedRightShiftByScalarInt16x8(scratch, out);
+            masm.packedUnsignedRightShiftByScalarInt16x8(out, count, temp, out);
             return;
         }
         break;
       case MIRType::Int32x4:
         switch (ins->operation()) {
           case MSimdShift::lsh:
-            masm.packedLeftShiftByScalarInt32x4(scratch, out);
+            masm.packedLeftShiftByScalarInt32x4(out, count, temp, out);
             return;
           case MSimdShift::rsh:
-            masm.packedRightShiftByScalarInt32x4(scratch, out);
+            masm.packedRightShiftByScalarInt32x4(out, count, temp, out);
             return;
           case MSimdShift::ursh:
-            masm.packedUnsignedRightShiftByScalarInt32x4(scratch, out);
+            masm.packedUnsignedRightShiftByScalarInt32x4(out, count, temp, out);
             return;
         }
         break;
       default:
         MOZ_CRASH("unsupported type for SIMD shifts");
     }
     MOZ_CRASH("unexpected SIMD bitwise op");
 }
@@ -4230,36 +3322,22 @@ void
 CodeGenerator::visitSimdSelect(LSimdSelect* ins)
 {
     FloatRegister mask = ToFloatRegister(ins->mask());
     FloatRegister onTrue = ToFloatRegister(ins->lhs());
     FloatRegister onFalse = ToFloatRegister(ins->rhs());
     FloatRegister output = ToFloatRegister(ins->output());
     FloatRegister temp = ToFloatRegister(ins->temp());
 
-    if (onTrue != output)
-        masm.vmovaps(onTrue, output);
-    if (mask != temp)
-        masm.vmovaps(mask, temp);
-
     MSimdSelect* mir = ins->mir();
     unsigned lanes = SimdTypeToLength(mir->type());
-
-    if (AssemblerX86Shared::HasAVX() && lanes == 4) {
-        // TBD: Use vpblendvb for lanes > 4, HasAVX.
-        masm.vblendvps(mask, onTrue, onFalse, output);
-        return;
-    }
-
-    // SSE4.1 has plain blendvps which can do this, but it is awkward
-    // to use because it requires the mask to be in xmm0.
-
-    masm.bitwiseAndSimd128(Operand(temp), output);
-    masm.bitwiseAndNotSimd128(Operand(onFalse), temp);
-    masm.bitwiseOrSimd128(Operand(temp), output);
+    if (lanes == 4)
+        masm.selectX4(mask, onTrue, onFalse, temp, output);
+    else
+        masm.selectSimd128(mask, onTrue, onFalse, temp, output);
 }
 
 void
 CodeGenerator::visitCompareExchangeTypedArrayElement(LCompareExchangeTypedArrayElement* lir)
 {
     Register elements = ToRegister(lir->elements());
     AnyRegister output = ToAnyRegister(lir->output());
     Register temp = lir->temp()->isBogusTemp() ? InvalidReg : ToRegister(lir->temp());
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.h
@@ -168,22 +168,16 @@ class CodeGeneratorX86Shared : public Co
     {
         MOZ_ASSERT(cond == Assembler::Equal || cond == Assembler::NotEqual);
         masm.cmpPtr(reg, ImmWord(0));
         emitBranch(cond, ifTrue, ifFalse);
     }
 
     void emitTableSwitchDispatch(MTableSwitch* mir, Register index, Register base);
 
-    void emitSimdExtractLane8x16(FloatRegister input, Register output, unsigned lane,
-                                 SimdSign signedness);
-    void emitSimdExtractLane16x8(FloatRegister input, Register output, unsigned lane,
-                                 SimdSign signedness);
-    void emitSimdExtractLane32x4(FloatRegister input, Register output, unsigned lane);
-
     template <class T, class Reg> void visitSimdGeneralShuffle(LSimdGeneralShuffleBase* lir, Reg temp);
 
     void generateInvalidateEpilogue();
 
     void canonicalizeIfDeterministic(Scalar::Type type, const LAllocation* value);
 
   public:
     // Out of line visitors.
new file mode 100644
--- /dev/null
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
@@ -0,0 +1,1226 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "jit/MacroAssembler.h"
+#include "jit/x86-shared/MacroAssembler-x86-shared.h"
+
+#include "jit/MacroAssembler-inl.h"
+
+using namespace js;
+using namespace js::jit;
+
+using mozilla::DebugOnly;
+using mozilla::FloatingPoint;
+using mozilla::Maybe;
+using mozilla::SpecificNaN;
+
+void
+MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest,
+                                                          Register temp, Label* oolEntry,
+                                                          Label* rejoin)
+{
+    // Does the conversion and jumps to the OOL entry if the result value
+    // is the undefined integer pattern.
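+    // cvttps2dq yields 0x80000000 for NaN or out-of-range inputs, so any lane
+    // equal to that pattern takes the OOL path, which tells a genuine
+    // INT32_MIN apart from a failed conversion.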
+    static const SimdConstant InvalidResult = SimdConstant::SplatX4(int32_t(-2147483648));
+    convertFloat32x4ToInt32x4(src, dest);
+
+    ScratchSimd128Scope scratch(asMasm());
+    asMasm().loadConstantSimd128Int(InvalidResult, scratch);
+    packedEqualInt32x4(Operand(dest), scratch);
+    // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
+    // the two following instructions.
+    vmovmskps(scratch, temp);
+    cmp32(temp, Imm32(0));
+    j(Assembler::NotEqual, oolEntry);
+    bind(rejoin);
+}
+
+void
+MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(FloatRegister src, Register temp,
+                                                      Label* rejoin, Label* onConversionError)
+{
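+    // The fast path saw the invalid-result pattern in at least one lane. If
+    // every lane is actually within the valid int32 range (NaN lanes fail both
+    // comparisons below), the pattern was a genuine INT32_MIN and we rejoin;
+    // otherwise we report a conversion error.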
+    static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
+    static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);
+
+    ScratchSimd128Scope scratch(asMasm());
+    asMasm().loadConstantSimd128Float(Int32MinX4, scratch);
+    vcmpleps(Operand(src), scratch, scratch);
+    vmovmskps(scratch, temp);
+    cmp32(temp, Imm32(15));
+    j(Assembler::NotEqual, onConversionError);
+
+    asMasm().loadConstantSimd128Float(Int32MaxX4, scratch);
+    vcmpleps(Operand(src), scratch, scratch);
+    vmovmskps(scratch, temp);
+    cmp32(temp, Imm32(0));
+    j(Assembler::NotEqual, onConversionError);
+
+    jump(rejoin);
+}
+
+void
+MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(FloatRegister in, FloatRegister out, Register temp, FloatRegister tempF, Label* failed)
+{
+    // Classify lane values into 4 disjoint classes:
+    //
+    //   N-lanes:             in <= -1.0
+    //   A-lanes:      -1.0 < in <= 0x0.ffffffp31
+    //   B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32
+    //   V-lanes: 0x1.0p32 <= in, or isnan(in)
+    //
+    // We need to bail out to throw a RangeError if we see any N-lanes or
+    // V-lanes.
+    //
+    // For A-lanes and B-lanes, we make two float -> int32 conversions:
+    //
+    //   A = cvttps2dq(in)
+    //   B = cvttps2dq(in - 0x1.0p31f)
+    //
+    // Note that the subtraction for the B computation is exact for B-lanes.
+    // There is no rounding, so B is the low 31 bits of the correctly converted
+    // result.
+    //
+    // The cvttps2dq instruction produces 0x80000000 when the input is NaN or
+    // out of range for a signed int32_t. This conveniently provides the missing
+    // high bit for B, so the desired result is A for A-lanes and A|B for
+    // B-lanes.
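+    //
+    // For example, for the B-lane value 0x1.8p31 (3221225472.0f):
+    //   A = cvttps2dq(0x1.8p31)            = 0x80000000 (out of int32 range)
+    //   B = cvttps2dq(0x1.8p31 - 0x1.0p31) = 0x40000000
+    //   A|B = 0xC0000000, which is 3221225472 as a uint32.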
+
+    ScratchSimd128Scope scratch(asMasm());
+
+    // TODO: If the majority of lanes are A-lanes, it could be faster to compute
+    // A first, use vmovmskps to check for any non-A-lanes and handle them in
+    // ool code. OTOH, if we're wrong about the lane distribution, that would be
+    // slower.
+
+    // Compute B in |scratch|.
+    static const float Adjust = 0x80000000; // 0x1.0p31f for the benefit of MSVC.
+    static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
+    asMasm().loadConstantSimd128Float(Bias, scratch);
+    packedAddFloat32(Operand(in), scratch);
+    convertFloat32x4ToInt32x4(scratch, scratch);
+
+    // Compute A in |out|. This is the last time we use |in| and the first time
+    // we use |out|, so we can tolerate if they are the same register.
+    convertFloat32x4ToInt32x4(in, out);
+
+    // We can identify A-lanes by the sign bits in A: Any A-lanes will be
+    // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
+    // mask of non-A-lanes into |tempF|.
+    zeroSimd128Float(tempF);
+    packedGreaterThanInt32x4(Operand(out), tempF);
+
+    // Clear the A-lanes in B.
+    bitwiseAndSimdInt(scratch, Operand(tempF), scratch);
+
+    // Compute the final result: A for A-lanes, A|B for B-lanes.
+    bitwiseOrSimdInt(out, Operand(scratch), out);
+
+    // We still need to filter out the V-lanes. They would show up as 0x80000000
+    // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
+    // the remaining negative lanes in B.
+    vmovmskps(scratch, temp);
+    cmp32(temp, Imm32(0));
+    j(Assembler::NotEqual, failed);
+}
+
+void
+MacroAssemblerX86Shared::createInt32x4(Register lane0, Register lane1, Register lane2,
+                                       Register lane3, FloatRegister dest)
+{
+    if (AssemblerX86Shared::HasSSE41()) {
+        vmovd(lane0, dest);
+        vpinsrd(1, lane1, dest, dest);
+        vpinsrd(2, lane2, dest, dest);
+        vpinsrd(3, lane3, dest, dest);
+        return;
+    }
+
+    asMasm().reserveStack(Simd128DataSize);
+    store32(lane0, Address(StackPointer, 0 * sizeof(int32_t)));
+    store32(lane1, Address(StackPointer, 1 * sizeof(int32_t)));
+    store32(lane2, Address(StackPointer, 2 * sizeof(int32_t)));
+    store32(lane3, Address(StackPointer, 3 * sizeof(int32_t)));
+    loadAlignedSimd128Int(Address(StackPointer, 0), dest);
+    asMasm().freeStack(Simd128DataSize);
+}
+
+void
+MacroAssemblerX86Shared::createFloat32x4(FloatRegister lane0, FloatRegister lane1,
+                                         FloatRegister lane2, FloatRegister lane3,
+                                         FloatRegister temp, FloatRegister output)
+{
+    FloatRegister lane0Copy = reusedInputFloat32x4(lane0, output);
+    FloatRegister lane1Copy = reusedInputFloat32x4(lane1, temp);
+    vunpcklps(lane3, lane1Copy, temp);
+    vunpcklps(lane2, lane0Copy, output);
+    vunpcklps(temp, output, output);
+}
+
+void
+MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output)
+{
+    vmovd(input, output);
+    if (AssemblerX86Shared::HasSSSE3()) {
+        zeroSimd128Int(ScratchSimd128Reg);
+        vpshufb(ScratchSimd128Reg, output, output);
+    } else {
+        // Use two shifts to duplicate the low 8 bits into the low 16 bits.
+        vpsllw(Imm32(8), output, output);
+        vmovdqa(output, ScratchSimd128Reg);
+        vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
+        vpor(ScratchSimd128Reg, output, output);
+        // Then do an X8 splat.
+        vpshuflw(0, output, output);
+        vpshufd(0, output, output);
+    }
+}
+
+void
+MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output)
+{
+    vmovd(input, output);
+    vpshuflw(0, output, output);
+    vpshufd(0, output, output);
+}
+
+void
+MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output)
+{
+    vmovd(input, output);
+    vpshufd(0, output, output);
+}
+
+void
+MacroAssemblerX86Shared::splatX4(FloatRegister input, FloatRegister output)
+{
+    FloatRegister inputCopy = reusedInputFloat32x4(input, output);
+    vshufps(0, inputCopy, inputCopy, output);
+}
+
+void
+MacroAssemblerX86Shared::reinterpretSimd(bool isIntegerLaneType, FloatRegister input,
+                                         FloatRegister output)
+{
+    if (input.aliases(output))
+        return;
+    if (isIntegerLaneType)
+        vmovdqa(input, output);
+    else
+        vmovaps(input, output);
+}
+
+void
+MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input, Register output, unsigned lane)
+{
+    if (lane == 0) {
+        // The value we want to extract is in the low double-word
+        moveLowInt32(input, output);
+    } else if (AssemblerX86Shared::HasSSE41()) {
+        vpextrd(lane, input, output);
+    } else {
+        uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
+        shuffleInt32(mask, input, ScratchSimd128Reg);
+        moveLowInt32(ScratchSimd128Reg, output);
+    }
+}
+
+void
+MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input, FloatRegister output,
+                                              unsigned lane, bool canonicalize)
+{
+    if (lane == 0) {
+        // The value we want to extract is in the low double-word
+        if (input != output)
+            moveFloat32(input, output);
+    } else if (lane == 2) {
+        moveHighPairToLowPairFloat32(input, output);
+    } else {
+        uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
+        shuffleFloat32(mask, input, output);
+    }
+    // NaNs contained within SIMD values are not enforced to be canonical, so
+    // when we extract an element into a "regular" scalar JS value, we have to
+    // canonicalize. In wasm code, we can skip this, as wasm only has to
+    // canonicalize NaNs at FFI boundaries.
+    if (canonicalize)
+        asMasm().canonicalizeFloat(output);
+}
+
+void
+MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input, Register output, unsigned lane,
+                                            SimdSign sign)
+{
+    // Unlike pextrd and pextrb, this is available in SSE2.
+    vpextrw(lane, input, output);
+    if (sign == SimdSign::Signed)
+        movswl(output, output);
+}
+
+void
+MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input, Register output, unsigned lane,
+                                            SimdSign sign)
+{
+    if (AssemblerX86Shared::HasSSE41()) {
+        vpextrb(lane, input, output);
+        // vpextrb clears the high bits, so no further extension required.
+        if (sign == SimdSign::Unsigned)
+            sign = SimdSign::NotApplicable;
+    } else {
+        // Extract the 16-bit lane that contains our byte, then, if it is the
+        // high byte, shift right by 8 bits to move it into place.
+        extractLaneInt16x8(input, output, lane / 2, SimdSign::Unsigned);
+        if (lane % 2) {
+            shrl(Imm32(8), output);
+            // The shrl handles the zero-extension. Don't repeat it.
+            if (sign == SimdSign::Unsigned)
+                sign = SimdSign::NotApplicable;
+        }
+    }
+
+    // We have the right low 8 bits in |output|, but we may need to fix the high
+    // bits. Note that this requires |output| to be one of the %eax-%edx
+    // registers.
+    switch (sign) {
+      case SimdSign::Signed:
+        movsbl(output, output);
+        break;
+      case SimdSign::Unsigned:
+        movzbl(output, output);
+        break;
+      case SimdSign::NotApplicable:
+        // No adjustment needed.
+        break;
+    }
+}
+
+void
+MacroAssemblerX86Shared::extractLaneSimdBool(FloatRegister input, Register output, unsigned numLanes,
+                                             unsigned lane)
+{
+    switch (numLanes) {
+      case 4:
+        extractLaneInt32x4(input, output, lane);
+        break;
+      case 8:
+        // Get a lane, don't bother fixing the high bits since we'll mask below.
+        extractLaneInt16x8(input, output, lane, SimdSign::NotApplicable);
+        break;
+      case 16:
+        extractLaneInt8x16(input, output, lane, SimdSign::NotApplicable);
+        break;
+      default:
+        MOZ_CRASH("Unhandled SIMD number of lanes");
+    }
+    // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits.
+    asMasm().and32(Imm32(1), output);
+}
+
+void
+MacroAssemblerX86Shared::insertLaneSimdInt(FloatRegister input, Register value, FloatRegister output,
+                                           unsigned lane, unsigned numLanes)
+{
+    if (numLanes == 8) {
+        // Available in SSE 2.
+        vpinsrw(lane, value, input, output);
+        return;
+    }
+
+    // Note that, unlike float32x4, we cannot use vmovd if the inserted
+    // value goes into the first component, as vmovd clears out the higher lanes
+    // of the output.
+    if (AssemblerX86Shared::HasSSE41()) {
+        // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
+        switch (numLanes) {
+          case 4:
+            vpinsrd(lane, value, input, output);
+            return;
+          case 16:
+            vpinsrb(lane, value, input, output);
+            return;
+        }
+    }
+
+    asMasm().reserveStack(Simd128DataSize);
+    storeAlignedSimd128Int(input, Address(StackPointer, 0));
+    switch (numLanes) {
+      case 4:
+        store32(value, Address(StackPointer, lane * sizeof(int32_t)));
+        break;
+      case 16:
+        // Note that this requires `value` to be in one of the registers whose
+        // low 8 bits are addressable (%eax-%edx on x86, all of them on x86-64).
+        store8(value, Address(StackPointer, lane * sizeof(int8_t)));
+        break;
+      default:
+        MOZ_CRASH("Unsupported SIMD numLanes");
+    }
+    loadAlignedSimd128Int(Address(StackPointer, 0), output);
+    asMasm().freeStack(Simd128DataSize);
+}
+
+void
+MacroAssemblerX86Shared::insertLaneFloat32x4(FloatRegister input, FloatRegister value,
+                                             FloatRegister output, unsigned lane)
+{
+    if (lane == 0) {
+        // As both operands are registers, vmovss doesn't modify the upper bits
+        // of the destination operand.
+        if (value != output)
+            vmovss(value, input, output);
+        return;
+    }
+
+    if (AssemblerX86Shared::HasSSE41()) {
+        // The input value is in the low float32 of the 'value' FloatRegister.
+        vinsertps(vinsertpsMask(0, lane), value, output, output);
+        return;
+    }
+
+    asMasm().reserveStack(Simd128DataSize);
+    storeAlignedSimd128Float(input, Address(StackPointer, 0));
+    asMasm().storeFloat32(value, Address(StackPointer, lane * sizeof(int32_t)));
+    loadAlignedSimd128Float(Address(StackPointer, 0), output);
+    asMasm().freeStack(Simd128DataSize);
+}
+
+void
+MacroAssemblerX86Shared::allTrueSimdBool(FloatRegister input, Register output)
+{
+    // We know that the input lanes are boolean, so they are either 0 or -1.
+    // The all-true vector has all 128 bits set, no matter the lane geometry.
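+    // vpmovmskb collects the sign bit of each of the 16 bytes, so an all-true
+    // vector yields exactly 0xffff.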
+    vpmovmskb(input, output);
+    cmp32(output, Imm32(0xffff));
+    emitSet(Assembler::Zero, output);
+}
+
+void
+MacroAssemblerX86Shared::anyTrueSimdBool(FloatRegister input, Register output)
+{
+    vpmovmskb(input, output);
+    cmp32(output, Imm32(0x0));
+    emitSet(Assembler::NonZero, output);
+}
+
+void
+MacroAssemblerX86Shared::swizzleInt32x4(FloatRegister input, FloatRegister output,
+                                        unsigned lanes[4])
+{
+    uint32_t mask = MacroAssembler::ComputeShuffleMask(lanes[0], lanes[1], lanes[2], lanes[3]);
+    shuffleInt32(mask, input, output);
+}
+
+void
+MacroAssemblerX86Shared::swizzleInt8x16(FloatRegister input, FloatRegister output,
+                                        const Maybe<Register>& temp, int8_t lanes[16])
+{
+    if (AssemblerX86Shared::HasSSSE3()) {
+        ScratchSimd128Scope scratch(asMasm());
+        asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(lanes), scratch);
+        FloatRegister inputCopy = reusedInputInt32x4(input, output);
+        vpshufb(scratch, inputCopy, output);
+        return;
+    }
+
+    // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
+    MOZ_ASSERT(!!temp, "needs a temp for the memory fallback");
+    asMasm().reserveStack(2 * Simd128DataSize);
+    storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
+    for (unsigned i = 0; i < 16; i++) {
+        load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *temp);
+        store8(*temp, Address(StackPointer, i));
+    }
+    loadAlignedSimd128Int(Address(StackPointer, 0), output);
+    asMasm().freeStack(2 * Simd128DataSize);
+}
+
+static inline bool
+LanesMatch(unsigned lanes[4], unsigned x, unsigned y, unsigned z, unsigned w)
+{
+    return lanes[0] == x && lanes[1] == y && lanes[2] == z && lanes[3] == w;
+}
+
+void
+MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input, FloatRegister output,
+                                          unsigned lanes[4])
+{
+    if (AssemblerX86Shared::HasSSE3()) {
+        if (LanesMatch(lanes, 0, 0, 2, 2)) {
+            vmovsldup(input, output);
+            return;
+        }
+        if (LanesMatch(lanes, 1, 1, 3, 3)) {
+            vmovshdup(input, output);
+            return;
+        }
+    }
+
+    // TODO Here and below, arch-specific lowering could identify this pattern
+    // and use defineReuseInput to avoid this move (bug 1084404)
+    if (LanesMatch(lanes, 2, 3, 2, 3)) {
+        FloatRegister inputCopy = reusedInputFloat32x4(input, output);
+        vmovhlps(input, inputCopy, output);
+        return;
+    }
+
+    if (LanesMatch(lanes, 0, 1, 0, 1)) {
+        if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
+            vmovddup(input, output);
+            return;
+        }
+        FloatRegister inputCopy = reusedInputFloat32x4(input, output);
+        vmovlhps(input, inputCopy, output);
+        return;
+    }
+
+    if (LanesMatch(lanes, 0, 0, 1, 1)) {
+        FloatRegister inputCopy = reusedInputFloat32x4(input, output);
+        vunpcklps(input, inputCopy, output);
+        return;
+    }
+
+    if (LanesMatch(lanes, 2, 2, 3, 3)) {
+        FloatRegister inputCopy = reusedInputFloat32x4(input, output);
+        vunpckhps(input, inputCopy, output);
+        return;
+    }
+
+    uint32_t x = lanes[0];
+    uint32_t y = lanes[1];
+    uint32_t z = lanes[2];
+    uint32_t w = lanes[3];
+
+    uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
+    shuffleFloat32(mask, input, output);
+}
+
+void
+MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs, FloatRegister rhs, FloatRegister output,
+                                        const Maybe<FloatRegister>& maybeFloatTemp,
+                                        const Maybe<Register>& maybeTemp, uint8_t lanes[16])
+{
+    DebugOnly<bool> hasSSSE3 = AssemblerX86Shared::HasSSSE3();
+    MOZ_ASSERT(hasSSSE3 == !!maybeFloatTemp);
+    MOZ_ASSERT(!hasSSSE3 == !!maybeTemp);
+
+    // Use pshufb if it is available.
+    if (AssemblerX86Shared::HasSSSE3()) {
+        ScratchSimd128Scope scratch(asMasm());
+
+        // Use pshufb instructions to gather the lanes from each source vector.
+        // A negative index creates a zero lane, so the two vectors can be combined.
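+        // (pshufb zeroes an output byte whenever bit 7 of its control byte is
+        // set, which is what the -1 indices below rely on.)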
+
+        // Set scratch = lanes from lhs.
+        int8_t idx[16];
+        for (unsigned i = 0; i < 16; i++)
+            idx[i] = lanes[i] < 16 ? lanes[i] : -1;
+        asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx), *maybeFloatTemp);
+        FloatRegister lhsCopy = reusedInputInt32x4(lhs, scratch);
+        vpshufb(*maybeFloatTemp, lhsCopy, scratch);
+
+        // Set output = lanes from rhs.
+        for (unsigned i = 0; i < 16; i++)
+            idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
+        asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx), *maybeFloatTemp);
+        FloatRegister rhsCopy = reusedInputInt32x4(rhs, output);
+        vpshufb(*maybeFloatTemp, rhsCopy, output);
+
+        // Combine.
+        vpor(scratch, output, output);
+        return;
+    }
+
+    // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
+    asMasm().reserveStack(3 * Simd128DataSize);
+    storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize));
+    storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize));
+    for (unsigned i = 0; i < 16; i++) {
+        load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *maybeTemp);
+        store8(*maybeTemp, Address(StackPointer, i));
+    }
+    loadAlignedSimd128Int(Address(StackPointer, 0), output);
+    asMasm().freeStack(3 * Simd128DataSize);
+}
+
+void
+MacroAssemblerX86Shared::shuffleX4(FloatRegister lhs, Operand rhs, FloatRegister out,
+                                   const Maybe<FloatRegister>& maybeTemp, unsigned lanes[4])
+{
+    uint32_t x = lanes[0];
+    uint32_t y = lanes[1];
+    uint32_t z = lanes[2];
+    uint32_t w = lanes[3];
+
+    // Check that lanes come from LHS in majority:
+    unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
+    MOZ_ASSERT(numLanesFromLHS >= 2);
+
+    // When reading this method, remember that vshufps takes the two first
+    // inputs of the destination operand (right operand) and the two last
+    // inputs of the source operand (left operand).
+    //
+    // Legend for explanations:
+    // - L: LHS
+    // - R: RHS
+    // - T: temporary
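+    //
+    // For example, with this convention,
+    //   vshufps(ComputeShuffleMask(a, b, c, d), A, B, out)
+    // yields out = (B[a], B[b], A[c], A[d]).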
+
+    uint32_t mask;
+
+    // If all lanes came from a single vector, we should use swizzle instead.
+    MOZ_ASSERT(numLanesFromLHS < 4);
+
+    // If all values stay in their lane, this is a blend.
+    if (AssemblerX86Shared::HasSSE41()) {
+        if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
+            vblendps(blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
+            return;
+        }
+    }
+
+    // One element of the second, all other elements of the first
+    if (numLanesFromLHS == 3) {
+        unsigned firstMask = -1, secondMask = -1;
+
+        // register-register vmovss preserves the high lanes.
+        if (LanesMatch(lanes, 4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
+            vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
+            return;
+        }
+
+        // SSE4.1 vinsertps can handle any single element.
+        unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
+        if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
+            unsigned srcLane;
+            unsigned dstLane;
+            if (x >= 4) {
+                srcLane = x - 4;
+                dstLane = 0;
+            } else if (y >= 4) {
+                srcLane = y - 4;
+                dstLane = 1;
+            } else if (z >= 4) {
+                srcLane = z - 4;
+                dstLane = 2;
+            } else {
+                MOZ_ASSERT(w >= 4);
+                srcLane = w - 4;
+                dstLane = 3;
+            }
+            vinsertps(vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
+            return;
+        }
+
+        MOZ_ASSERT(!!maybeTemp);
+        FloatRegister rhsCopy = *maybeTemp;
+        loadAlignedSimd128Float(rhs, rhsCopy);
+
+        if (x < 4 && y < 4) {
+            if (w >= 4) {
+                w %= 4;
+                // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
+                firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
+                // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
+                secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0);
+            } else {
+                MOZ_ASSERT(z >= 4);
+                z %= 4;
+                // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
+                firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
+                // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
+                secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2);
+            }
+
+            vshufps(firstMask, lhs, rhsCopy, rhsCopy);
+            vshufps(secondMask, rhsCopy, lhs, out);
+            return;
+        }
+
+        MOZ_ASSERT(z < 4 && w < 4);
+
+        if (y >= 4) {
+            y %= 4;
+            // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
+            firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
+            // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
+            secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w);
+        } else {
+            MOZ_ASSERT(x >= 4);
+            x %= 4;
+            // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
+            firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
+            // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
+            secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w);
+        }
+
+        vshufps(firstMask, lhs, rhsCopy, rhsCopy);
+        if (AssemblerX86Shared::HasAVX()) {
+            vshufps(secondMask, lhs, rhsCopy, out);
+        } else {
+            vshufps(secondMask, lhs, rhsCopy, rhsCopy);
+            moveSimd128Float(rhsCopy, out);
+        }
+        return;
+    }
+
+    // Two elements from one vector, two other elements from the other
+    MOZ_ASSERT(numLanesFromLHS == 2);
+
+    // TODO Here and below, the symmetric case would be handier, to avoid a move,
+    // but can't be reached because operands would get swapped (bug 1084404).
+    if (LanesMatch(lanes, 2, 3, 6, 7)) {
+        ScratchSimd128Scope scratch(asMasm());
+        if (AssemblerX86Shared::HasAVX()) {
+            FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch);
+            vmovhlps(lhs, rhsCopy, out);
+        } else {
+            loadAlignedSimd128Float(rhs, scratch);
+            vmovhlps(lhs, scratch, scratch);
+            moveSimd128Float(scratch, out);
+        }
+        return;
+    }
+
+    if (LanesMatch(lanes, 0, 1, 4, 5)) {
+        FloatRegister rhsCopy;
+        ScratchSimd128Scope scratch(asMasm());
+        if (rhs.kind() == Operand::FPREG) {
+            // No need to make an actual copy, since the operand is already
+            // in a register, and it won't be clobbered by the vmovlhps.
+            rhsCopy = FloatRegister::FromCode(rhs.fpu());
+        } else {
+            loadAlignedSimd128Float(rhs, scratch);
+            rhsCopy = scratch;
+        }
+        vmovlhps(rhsCopy, lhs, out);
+        return;
+    }
+
+    if (LanesMatch(lanes, 0, 4, 1, 5)) {
+        vunpcklps(rhs, lhs, out);
+        return;
+    }
+
+    // TODO swapped case would be better (bug 1084404)
+    if (LanesMatch(lanes, 4, 0, 5, 1)) {
+        ScratchSimd128Scope scratch(asMasm());
+        if (AssemblerX86Shared::HasAVX()) {
+            FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch);
+            vunpcklps(lhs, rhsCopy, out);
+        } else {
+            loadAlignedSimd128Float(rhs, scratch);
+            vunpcklps(lhs, scratch, scratch);
+            moveSimd128Float(scratch, out);
+        }
+        return;
+    }
+
+    if (LanesMatch(lanes, 2, 6, 3, 7)) {
+        vunpckhps(rhs, lhs, out);
+        return;
+    }
+
+    // TODO swapped case would be better (bug 1084404)
+    if (LanesMatch(lanes, 6, 2, 7, 3)) {
+        ScratchSimd128Scope scratch(asMasm());
+        if (AssemblerX86Shared::HasAVX()) {
+            FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch);
+            vunpckhps(lhs, rhsCopy, out);
+        } else {
+            loadAlignedSimd128Float(rhs, scratch);
+            vunpckhps(lhs, scratch, scratch);
+            moveSimd128Float(scratch, out);
+        }
+        return;
+    }
+
+    // In one vshufps
+    if (x < 4 && y < 4) {
+        mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
+        vshufps(mask, rhs, lhs, out);
+        return;
+    }
+
+    // At creation, we should have explicitly swapped in this case.
+    MOZ_ASSERT(!(z >= 4 && w >= 4));
+
+    // In two vshufps, for the most generic case:
+    uint32_t firstMask[4], secondMask[4];
+    unsigned i = 0, j = 2, k = 0;
+
+#define COMPUTE_MASK(lane)       \
+    if (lane >= 4) {             \
+        firstMask[j] = lane % 4; \
+        secondMask[k++] = j++;   \
+    } else {                     \
+        firstMask[i] = lane;     \
+        secondMask[k++] = i++;   \
+    }
+
+    COMPUTE_MASK(x)
+    COMPUTE_MASK(y)
+    COMPUTE_MASK(z)
+    COMPUTE_MASK(w)
+#undef COMPUTE_MASK
+
+    MOZ_ASSERT(i == 2 && j == 4 && k == 4);
+
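+    // firstMask gathers the two LHS lanes we need into the low half of lhs and
+    // the two RHS lanes into the high half; secondMask then moves those four
+    // lanes into their final positions.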
+    mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
+                                              firstMask[2], firstMask[3]);
+    vshufps(mask, rhs, lhs, lhs);
+
+    mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
+                                              secondMask[2], secondMask[3]);
+    vshufps(mask, lhs, lhs, lhs);
+}
+
+static inline FloatRegister
+ToSimdFloatRegister(const Operand& op)
+{
+    return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
+}
+
+void
+MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+                                        FloatRegister output)
+{
+    static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
+    ScratchSimd128Scope scratch(asMasm());
+    switch (cond) {
+      case Assembler::Condition::GreaterThan:
+        vpcmpgtb(rhs, lhs, output);
+        break;
+      case Assembler::Condition::Equal:
+        vpcmpeqb(rhs, lhs, output);
+        break;
+      case Assembler::Condition::LessThan:
+        // src := rhs
+        if (rhs.kind() == Operand::FPREG)
+            moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+        else
+            loadAlignedSimd128Int(rhs, scratch);
+
+        // src := src > lhs (i.e. lhs < rhs)
+        // Improve by doing custom lowering (rhs is tied to the output register)
+        vpcmpgtb(Operand(lhs), scratch, scratch);
+        moveSimd128Int(scratch, output);
+        break;
+      case Assembler::Condition::NotEqual:
+        // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
+        // should invert the comparison by, e.g. swapping the arms of a select
+        // if that's what it's used in.
+        asMasm().loadConstantSimd128Int(allOnes, scratch);
+        vpcmpeqb(rhs, lhs, output);
+        bitwiseXorSimdInt(output, Operand(scratch), output);
+        break;
+      case Assembler::Condition::GreaterThanOrEqual:
+        // src := rhs
+        if (rhs.kind() == Operand::FPREG)
+            moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+        else
+            loadAlignedSimd128Int(rhs, scratch);
+        vpcmpgtb(Operand(lhs), scratch, scratch);
+        asMasm().loadConstantSimd128Int(allOnes, output);
+        bitwiseXorSimdInt(output, Operand(scratch), output);
+        break;
+      case Assembler::Condition::LessThanOrEqual:
+        // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
+        asMasm().loadConstantSimd128Int(allOnes, scratch);
+        vpcmpgtb(rhs, lhs, output);
+        bitwiseXorSimdInt(output, Operand(scratch), output);
+        break;
+      default:
+        MOZ_CRASH("unexpected condition op");
+    }
+}
+
+void
+MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+                                        FloatRegister output)
+{
+    static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
+
+    ScratchSimd128Scope scratch(asMasm());
+    switch (cond) {
+      case Assembler::Condition::GreaterThan:
+        vpcmpgtw(rhs, lhs, output);
+        break;
+      case Assembler::Condition::Equal:
+        vpcmpeqw(rhs, lhs, output);
+        break;
+      case Assembler::Condition::LessThan:
+        // src := rhs
+        if (rhs.kind() == Operand::FPREG)
+            moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+        else
+            loadAlignedSimd128Int(rhs, scratch);
+
+        // src := src > lhs (i.e. lhs < rhs)
+        // Improve by doing custom lowering (rhs is tied to the output register)
+        vpcmpgtw(Operand(lhs), scratch, scratch);
+        moveSimd128Int(scratch, output);
+        break;
+      case Assembler::Condition::NotEqual:
+        // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
+        // should invert the comparison by, e.g. swapping the arms of a select
+        // if that's what it's used in.
+        asMasm().loadConstantSimd128Int(allOnes, scratch);
+        vpcmpeqw(rhs, lhs, output);
+        bitwiseXorSimdInt(output, Operand(scratch), output);
+        break;
+      case Assembler::Condition::GreaterThanOrEqual:
+        // src := rhs
+        if (rhs.kind() == Operand::FPREG)
+            moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+        else
+            loadAlignedSimd128Int(rhs, scratch);
+        vpcmpgtw(Operand(lhs), scratch, scratch);
+        asMasm().loadConstantSimd128Int(allOnes, output);
+        bitwiseXorSimdInt(output, Operand(scratch), output);
+        break;
+      case Assembler::Condition::LessThanOrEqual:
+        // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
+        asMasm().loadConstantSimd128Int(allOnes, scratch);
+        vpcmpgtw(rhs, lhs, output);
+        bitwiseXorSimdInt(output, Operand(scratch), output);
+        break;
+      default:
+        MOZ_CRASH("unexpected condition op");
+    }
+}
+
+void
+MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+                                        FloatRegister output)
+{
+    static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
+    ScratchSimd128Scope scratch(asMasm());
+    switch (cond) {
+      case Assembler::Condition::GreaterThan:
+        packedGreaterThanInt32x4(rhs, lhs);
+        break;
+      case Assembler::Condition::Equal:
+        packedEqualInt32x4(rhs, lhs);
+        break;
+      case Assembler::Condition::LessThan:
+        // src := rhs
+        if (rhs.kind() == Operand::FPREG)
+            moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+        else
+            loadAlignedSimd128Int(rhs, scratch);
+
+        // src := src > lhs (i.e. lhs < rhs)
+        // Improve by doing custom lowering (rhs is tied to the output register)
+        packedGreaterThanInt32x4(Operand(lhs), scratch);
+        moveSimd128Int(scratch, lhs);
+        break;
+      case Assembler::Condition::NotEqual:
+        // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
+        // should invert the comparison by, e.g. swapping the arms of a select
+        // if that's what it's used in.
+        asMasm().loadConstantSimd128Int(allOnes, scratch);
+        packedEqualInt32x4(rhs, lhs);
+        bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
+        break;
+      case Assembler::Condition::GreaterThanOrEqual:
+        // src := rhs
+        if (rhs.kind() == Operand::FPREG)
+            moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+        else
+            loadAlignedSimd128Int(rhs, scratch);
+        packedGreaterThanInt32x4(Operand(lhs), scratch);
+        asMasm().loadConstantSimd128Int(allOnes, lhs);
+        bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
+        break;
+      case Assembler::Condition::LessThanOrEqual:
+        // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
+        asMasm().loadConstantSimd128Int(allOnes, scratch);
+        packedGreaterThanInt32x4(rhs, lhs);
+        bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
+        break;
+      default:
+        MOZ_CRASH("unexpected condition op");
+    }
+}
+
+void
+MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+                                          FloatRegister output)
+{
+    switch (cond) {
+      case Assembler::Condition::Equal:
+        vcmpeqps(rhs, lhs, output);
+        break;
+      case Assembler::Condition::LessThan:
+        vcmpltps(rhs, lhs, output);
+        break;
+      case Assembler::Condition::LessThanOrEqual:
+        vcmpleps(rhs, lhs, output);
+        break;
+      case Assembler::Condition::NotEqual:
+        vcmpneqps(rhs, lhs, output);
+        break;
+      case Assembler::Condition::GreaterThanOrEqual:
+      case Assembler::Condition::GreaterThan:
+        // We reverse these before register allocation so that we don't have to
+        // copy into and out of temporaries after codegen.
+        MOZ_CRASH("should have reversed this");
+      default:
+        MOZ_CRASH("unexpected condition op");
+    }
+}
+
+void
+MacroAssemblerX86Shared::mulInt32x4(FloatRegister lhs, Operand rhs,
+                                    const Maybe<FloatRegister>& temp, FloatRegister output)
+{
+    if (AssemblerX86Shared::HasSSE41()) {
+        vpmulld(rhs, lhs, output);
+        return;
+    }
+
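+    // Without SSE4.1's pmulld, synthesize the multiply with pmuludq, which
+    // multiplies lanes 0 and 2 into 64-bit products whose low halves are the
+    // 32-bit results we want; the odd lanes are shuffled down, multiplied the
+    // same way, and the four results are interleaved back into order.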
+    ScratchSimd128Scope scratch(asMasm());
+    loadAlignedSimd128Int(rhs, scratch);
+    vpmuludq(lhs, scratch, scratch);
+    // scratch contains (Rx, _, Rz, _) where R is the resulting vector.
+
+    MOZ_ASSERT(!!temp);
+    vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs);
+    vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, *temp);
+    vpmuludq(*temp, lhs, lhs);
+    // lhs contains (Ry, _, Rw, _) where R is the resulting vector.
+
+    vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs);
+    // lhs contains (Ry, Rw, Rx, Rz)
+    vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs);
+}
+
+void
+MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output)
+{
+    ScratchSimd128Scope scratch(asMasm());
+    FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, scratch);
+    vminps(Operand(lhs), rhsCopy, scratch);
+    vminps(rhs, lhs, output);
+    vorps(scratch, output, output); // NaN or'd with arbitrary bits is NaN
+}
+
+void
+MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp,
+                                      FloatRegister output)
+{
+    ScratchSimd128Scope scratch(asMasm());
+    FloatRegister lhsCopy = reusedInputFloat32x4(lhs, scratch);
+    vcmpunordps(rhs, lhsCopy, scratch);
+
+    FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, temp);
+    vmaxps(Operand(lhs), rhsCopy, temp);
+    vmaxps(rhs, lhs, output);
+
+    vandps(temp, output, output);
+    vorps(scratch, output, output); // or in the all-ones NaNs
+}
+
+void
+MacroAssemblerX86Shared::minNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp,
+                                         FloatRegister output)
+{
+    ScratchSimd128Scope scratch(asMasm());
+    asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), temp);
+
+    FloatRegister mask = scratch;
+    FloatRegister tmpCopy = reusedInputFloat32x4(temp, scratch);
+    vpcmpeqd(Operand(lhs), tmpCopy, mask);
+    vandps(temp, mask, mask);
+
+    FloatRegister lhsCopy = reusedInputFloat32x4(lhs, temp);
+    vminps(rhs, lhsCopy, temp);
+    vorps(mask, temp, temp);
+
+    FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, mask);
+    vcmpneqps(rhs, rhsCopy, mask);
+
+    if (AssemblerX86Shared::HasAVX()) {
+        vblendvps(mask, lhs, temp, output);
+    } else {
+        // Emulate vblendvps.
+        // With SSE4.1 we could use blendvps; however, it's awkward since
+        // it requires the mask to be in xmm0.
+        if (lhs != output)
+            moveSimd128Float(lhs, output);
+        vandps(Operand(mask), output, output);
+        vandnps(Operand(temp), mask, mask);
+        vorps(Operand(mask), output, output);
+    }
+}
+
+void
+MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp,
+                                         FloatRegister output)
+{
+    ScratchSimd128Scope scratch(asMasm());
+    FloatRegister mask = scratch;
+
+    asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(0), mask);
+    vpcmpeqd(Operand(lhs), mask, mask);
+
+    asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)), temp);
+    vandps(temp, mask, mask);
+
+    FloatRegister lhsCopy = reusedInputFloat32x4(lhs, temp);
+    vmaxps(rhs, lhsCopy, temp);
+    vandnps(Operand(temp), mask, mask);
+
+    // Ensure temp always contains the temporary result
+    mask = temp;
+    temp = scratch;
+
+    FloatRegister rhsCopy = reusedInputAlignedFloat32x4(rhs, mask);
+    vcmpneqps(rhs, rhsCopy, mask);
+
+    if (AssemblerX86Shared::HasAVX()) {
+        vblendvps(mask, lhs, temp, output);
+    } else {
+        // Emulate vblendvps.
+        // With SSE4.1 we could use blendvps; however, it's awkward since
+        // it requires the mask to be in xmm0.
+        if (lhs != output)
+            moveSimd128Float(lhs, output);
+        vandps(Operand(mask), output, output);
+        vandnps(Operand(temp), mask, mask);
+        vorps(Operand(mask), output, output);
+    }
+}
+
+void
+MacroAssemblerX86Shared::negFloat32x4(Operand in, FloatRegister out)
+{
+    // All zeros but the sign bit
+    static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f);
+    asMasm().loadConstantSimd128Float(minusZero, out);
+    bitwiseXorFloat32x4(out, in, out);
+}
+
+void
+MacroAssemblerX86Shared::notInt8x16(Operand in, FloatRegister out)
+{
+    static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
+    asMasm().loadConstantSimd128Int(allOnes, out);
+    bitwiseXorSimdInt(out, in, out);
+}
+
+void
+MacroAssemblerX86Shared::notInt16x8(Operand in, FloatRegister out)
+{
+    static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
+    asMasm().loadConstantSimd128Int(allOnes, out);
+    bitwiseXorSimdInt(out, in, out);
+}
+
+void
+MacroAssemblerX86Shared::notInt32x4(Operand in, FloatRegister out)
+{
+    static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
+    asMasm().loadConstantSimd128Int(allOnes, out);
+    bitwiseXorSimdInt(out, in, out);
+}
+
+void
+MacroAssemblerX86Shared::notFloat32x4(Operand in, FloatRegister out)
+{
+    float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits);
+    static const SimdConstant allOnes = SimdConstant::SplatX4(ones);
+    asMasm().loadConstantSimd128Float(allOnes, out);
+    bitwiseXorFloat32x4(out, in, out);
+}
+
+void
+MacroAssemblerX86Shared::absFloat32x4(Operand in, FloatRegister out)
+{
+    // All ones but the sign bit
+    float signMask = SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits);
+    static const SimdConstant signMasks = SimdConstant::SplatX4(signMask);
+    asMasm().loadConstantSimd128Float(signMasks, out);
+    bitwiseAndFloat32x4(out, in, out);
+}
+
+static inline void
+MaskSimdShiftCount(MacroAssembler& masm, unsigned shiftmask, Register count, Register temp,
+                   FloatRegister dest)
+{
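+    // The shift count is masked to the number of bits in a lane, then moved
+    // into a vector register for the variable-count shift below.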
+    masm.mov(count, temp);
+    masm.andl(Imm32(shiftmask), temp);
+    masm.vmovd(temp, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(FloatRegister in, Register count,
+                                                        Register temp, FloatRegister dest)
+{
+    ScratchSimd128Scope scratch(asMasm());
+    MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
+    vpsllw(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(FloatRegister in, Register count,
+                                                         Register temp, FloatRegister dest)
+{
+    ScratchSimd128Scope scratch(asMasm());
+    MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
+    vpsraw(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count,
+                                                                 Register temp, FloatRegister dest)
+{
+    ScratchSimd128Scope scratch(asMasm());
+    MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
+    vpsrlw(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(FloatRegister in, Register count,
+                                                        Register temp, FloatRegister dest)
+{
+    ScratchSimd128Scope scratch(asMasm());
+    MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
+    vpslld(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(FloatRegister in, Register count,
+                                                         Register temp, FloatRegister dest)
+{
+    ScratchSimd128Scope scratch(asMasm());
+    MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
+    vpsrad(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(FloatRegister in, Register count,
+                                                                 Register temp, FloatRegister dest)
+{
+    ScratchSimd128Scope scratch(asMasm());
+    MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
+    vpsrld(scratch, in, dest);
+}
+
+void
+MacroAssemblerX86Shared::selectSimd128(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
+                                       FloatRegister temp, FloatRegister output)
+{
+    if (onTrue != output)
+        vmovaps(onTrue, output);
+    if (mask != temp)
+        vmovaps(mask, temp);
+
+    // SSE4.1 has plain blendvps which can do this, but it is awkward
+    // to use because it requires the mask to be in xmm0.
+
+    bitwiseAndSimdInt(output, Operand(temp), output);
+    bitwiseAndNotSimdInt(temp, Operand(onFalse), temp);
+    bitwiseOrSimdInt(output, Operand(temp), output);
+}
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
@@ -1122,19 +1122,19 @@ MacroAssembler::canonicalizeFloat32x4(Fl
 
     FloatRegister mask = scratch;
     vcmpordps(Operand(reg), reg, mask);
 
     FloatRegister ifFalse = scratch2;
     float nanf = float(JS::GenericNaN());
     loadConstantSimd128Float(SimdConstant::SplatX4(nanf), ifFalse);
 
-    bitwiseAndSimd128(Operand(mask), reg);
-    bitwiseAndNotSimd128(Operand(ifFalse), mask);
-    bitwiseOrSimd128(Operand(mask), reg);
+    bitwiseAndFloat32x4(reg, Operand(mask), reg);
+    bitwiseAndNotFloat32x4(mask, Operand(ifFalse), mask);
+    bitwiseOrFloat32x4(reg, Operand(mask), reg);
 }
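
The rewritten canonicalizeFloat32x4 above is the same and/andn/or blend: vcmpordps builds a mask of
lanes that are ordered (i.e. not NaN), and every NaN lane is replaced with the canonical NaN
pattern. A one-lane scalar sketch (standalone C++, illustrative only; quiet_NaN() stands in for the
engine's JS::GenericNaN()):

    #include <cassert>
    #include <cmath>
    #include <limits>

    // x == x is false only for NaN, mirroring what vcmpordps computes per lane.
    static float canonicalizeLane(float x) {
        return (x == x) ? x : std::numeric_limits<float>::quiet_NaN();
    }

    int main() {
        assert(canonicalizeLane(2.0f) == 2.0f);
        assert(std::isnan(canonicalizeLane(std::nanf(""))));
        return 0;
    }
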
 
 // ========================================================================
 // Memory access primitives.
 void
 MacroAssembler::storeUncanonicalizedDouble(FloatRegister src, const Address& dest)
 {
     vmovsd(src, dest);
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
@@ -101,17 +101,18 @@ class MacroAssemblerX86Shared : public A
 
     void compareFloat(DoubleCondition cond, FloatRegister lhs, FloatRegister rhs) {
         if (cond & DoubleConditionBitInvert)
             vucomiss(lhs, rhs);
         else
             vucomiss(rhs, lhs);
     }
 
-    void branchNegativeZero(FloatRegister reg, Register scratch, Label* label, bool  maybeNonZero = true);
+    void branchNegativeZero(FloatRegister reg, Register scratch, Label* label,
+                            bool maybeNonZero = true);
     void branchNegativeZeroFloat32(FloatRegister reg, Register scratch, Label* label);
 
     void move32(Imm32 imm, Register dest) {
         // Use the ImmWord version of mov to register, which has special
         // optimizations. Casting to uint32_t here ensures that the value
         // is zero-extended.
         mov(ImmWord(uint32_t(imm.value)), dest);
     }
@@ -405,37 +406,206 @@ class MacroAssemblerX86Shared : public A
         // least signed int32, or NaN), this will return the undefined integer
         // value (0x80000000).
         vcvttps2dq(src, dest);
     }
     void convertInt32x4ToFloat32x4(FloatRegister src, FloatRegister dest) {
         vcvtdq2ps(src, dest);
     }
 
-    void bitwiseAndSimd128(const Operand& src, FloatRegister dest) {
-        // TODO Using the "ps" variant for all types incurs a domain crossing
-        // penalty for integer types and double.
-        vandps(src, dest, dest);
+    // SIMD methods, defined in MacroAssembler-x86-shared-SIMD.cpp.
+    void checkedConvertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest, Register temp,
+                                          Label* oolCheck, Label* rejoin);
+    void oolConvertFloat32x4ToInt32x4(FloatRegister src, Register temp, Label* rejoin,
+                                      Label* onConversionError);
+    void checkedConvertFloat32x4ToUint32x4(FloatRegister src, FloatRegister dest, Register temp,
+                                           FloatRegister tempF, Label* failed);
+
+    void createInt32x4(Register lane0, Register lane1, Register lane2, Register lane3,
+                       FloatRegister dest);
+    void createFloat32x4(FloatRegister lane0, FloatRegister lane1, FloatRegister lane2,
+                         FloatRegister lane3, FloatRegister temp, FloatRegister output);
+
+    void splatX16(Register input, FloatRegister output);
+    void splatX8(Register input, FloatRegister output);
+    void splatX4(Register input, FloatRegister output);
+    void splatX4(FloatRegister input, FloatRegister output);
+
+    void reinterpretSimd(bool isIntegerLaneType, FloatRegister input, FloatRegister output);
+
+    void extractLaneInt32x4(FloatRegister input, Register output, unsigned lane);
+    void extractLaneFloat32x4(FloatRegister input, FloatRegister output, unsigned lane,
+                              bool canonicalize);
+    void extractLaneInt16x8(FloatRegister input, Register output, unsigned lane, SimdSign sign);
+    void extractLaneInt8x16(FloatRegister input, Register output, unsigned lane, SimdSign sign);
+    void extractLaneSimdBool(FloatRegister input, Register output, unsigned numLanes, unsigned lane);
+
+    void insertLaneSimdInt(FloatRegister input, Register value, FloatRegister output,
+                           unsigned lane, unsigned numLanes);
+    void insertLaneFloat32x4(FloatRegister input, FloatRegister value, FloatRegister output,
+                             unsigned lane);
+
+    void allTrueSimdBool(FloatRegister input, Register output);
+    void anyTrueSimdBool(FloatRegister input, Register output);
+
+    void swizzleInt32x4(FloatRegister input, FloatRegister output, unsigned lanes[4]);
+    void swizzleFloat32x4(FloatRegister input, FloatRegister output, unsigned lanes[4]);
+    void swizzleInt8x16(FloatRegister input, FloatRegister output,
+                        const mozilla::Maybe<Register>& temp, int8_t lanes[16]);
+
+    void shuffleX4(FloatRegister lhs, Operand rhs, FloatRegister out,
+                   const mozilla::Maybe<FloatRegister>& maybeTemp, unsigned lanes[4]);
+    void shuffleInt8x16(FloatRegister lhs, FloatRegister rhs, FloatRegister output,
+                        const mozilla::Maybe<FloatRegister>& maybeFloatTemp,
+                        const mozilla::Maybe<Register>& maybeTemp, uint8_t lanes[16]);
+
+    void compareInt8x16(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+                        FloatRegister output);
+    void compareInt16x8(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+                        FloatRegister output);
+    void compareInt32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+                        FloatRegister output);
+    void compareFloat32x4(FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+                          FloatRegister output);
+
+    void addInt8x16(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vpaddb(rhs, lhs, output);
+    }
+    void addInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vpaddw(rhs, lhs, output);
+    }
+    void addInt32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vpaddd(rhs, lhs, output);
+    }
+    void addFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vaddps(rhs, lhs, output);
+    }
+
+    void addSatInt8x16(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) {
+        if (sign == SimdSign::Signed)
+            vpaddsb(rhs, lhs, output);
+        else
+            vpaddusb(rhs, lhs, output);
+    }
+    void addSatInt16x8(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) {
+        if (sign == SimdSign::Signed)
+            vpaddsw(rhs, lhs, output);
+        else
+            vpaddusw(rhs, lhs, output);
+    }
+
+    void subInt8x16(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vpsubb(rhs, lhs, output);
+    }
+    void subInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vpsubw(rhs, lhs, output);
     }
-    void bitwiseAndNotSimd128(const Operand& src, FloatRegister dest) {
-        vandnps(src, dest, dest);
+    void subInt32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vpsubd(rhs, lhs, output);
+    }
+    void subFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vsubps(rhs, lhs, output);
+    }
+
+    void subSatInt8x16(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) {
+        if (sign == SimdSign::Signed)
+            vpsubsb(rhs, lhs, output);
+        else
+            vpsubusb(rhs, lhs, output);
+    }
+    void subSatInt16x8(FloatRegister lhs, Operand rhs, SimdSign sign, FloatRegister output) {
+        if (sign == SimdSign::Signed)
+            vpsubsw(rhs, lhs, output);
+        else
+            vpsubusw(rhs, lhs, output);
+    }
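
The addSat*/subSat* wrappers above only choose between the signed and the unsigned saturating
instruction (vpaddsb/vpaddusb, vpsubsw/vpsubusw, and so on); the arithmetic itself clamps to the
lane's range instead of wrapping. A one-lane scalar model for signed 8-bit lanes (standalone C++,
illustrative only):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // One-lane model of vpaddsb / vpsubsb: clamp the exact result to [-128, 127].
    static int8_t satToInt8(int v) {
        return static_cast<int8_t>(std::min(127, std::max(-128, v)));
    }

    int main() {
        assert(satToInt8(100 + 100) == 127);    // a wrapping add would give -56
        assert(satToInt8(-100 - 100) == -128);  // a wrapping sub would give 56
        assert(satToInt8(3 + 4) == 7);          // in range: same as ordinary arithmetic
        return 0;
    }
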
+
+    void mulInt16x8(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vpmullw(rhs, lhs, output);
+    }
+    void mulInt32x4(FloatRegister lhs, Operand rhs, const mozilla::Maybe<FloatRegister>& temp,
+                    FloatRegister output);
+    void mulFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vmulps(rhs, lhs, output);
+    }
+
+    void negInt8x16(Operand in, FloatRegister out) {
+        zeroSimd128Int(out);
+        packedSubInt8(in, out);
+    }
+    void negInt16x8(Operand in, FloatRegister out) {
+        zeroSimd128Int(out);
+        packedSubInt16(in, out);
+    }
+    void negInt32x4(Operand in, FloatRegister out) {
+        zeroSimd128Int(out);
+        packedSubInt32(in, out);
     }
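
The negInt* helpers immediately above use the identity -x == 0 - x: the destination is zeroed and
the operand subtracted, with the usual two's-complement wrap for INT_MIN. A one-lane scalar sketch
(standalone C++, illustrative only):

    #include <cassert>
    #include <cstdint>

    // Scalar model of negInt32x4: negate by subtracting from zero, wrapping like
    // the packed subtract does.
    static int32_t negLane(int32_t x) {
        return static_cast<int32_t>(0u - static_cast<uint32_t>(x));
    }

    int main() {
        assert(negLane(5) == -5);
        assert(negLane(INT32_MIN) == INT32_MIN);  // wraps; there is no +2147483648
        return 0;
    }
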
-    void bitwiseOrSimd128(const Operand& src, FloatRegister dest) {
-        vorps(src, dest, dest);
+    void negFloat32x4(Operand in, FloatRegister out);
+
+    void notInt8x16(Operand in, FloatRegister out);
+    void notInt16x8(Operand in, FloatRegister out);
+    void notInt32x4(Operand in, FloatRegister out);
+    void notFloat32x4(Operand in, FloatRegister out);
+
+    void divFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output) {
+        vdivps(rhs, lhs, output);
+    }
+    void minFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister output);
+    void maxFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output);
+    void minNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output);
+    void maxNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp, FloatRegister output);
+
+    void absFloat32x4(Operand in, FloatRegister out);
+
+    void bitwiseAndFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+        vandps(rhs, lhs, dest);
+    }
+    void bitwiseAndSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+        vpand(rhs, lhs, dest);
     }
-    void bitwiseXorSimd128(const Operand& src, FloatRegister dest) {
-        vxorps(src, dest, dest);
+
+    void bitwiseOrFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+        vorps(rhs, lhs, dest);
+    }
+    void bitwiseOrSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+        vpor(rhs, lhs, dest);
+    }
+
+    void bitwiseXorFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+        vxorps(rhs, lhs, dest);
     }
+    void bitwiseXorSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+        vpxor(rhs, lhs, dest);
+    }
+
+    void bitwiseAndNotFloat32x4(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+        vandnps(rhs, lhs, dest);
+    }
+    void bitwiseAndNotSimdInt(FloatRegister lhs, const Operand& rhs, FloatRegister dest) {
+        vpandn(rhs, lhs, dest);
+    }
+
     void zeroSimd128Float(FloatRegister dest) {
         vxorps(dest, dest, dest);
     }
     void zeroSimd128Int(FloatRegister dest) {
         vpxor(dest, dest, dest);
     }
 
+    void selectSimd128(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
+                       FloatRegister temp, FloatRegister output);
+    void selectX4(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
+                  FloatRegister temp, FloatRegister output) {
+        if (AssemblerX86Shared::HasAVX())
+            vblendvps(mask, onTrue, onFalse, output);
+        else
+            selectSimd128(mask, onTrue, onFalse, temp, output);
+    }
+
     template <class T, class Reg> inline void loadScalar(const Operand& src, Reg dest);
     template <class T, class Reg> inline void storeScalar(Reg src, const Address& dest);
     template <class T> inline void loadAlignedVector(const Address& src, FloatRegister dest);
     template <class T> inline void storeAlignedVector(FloatRegister src, const Address& dest);
 
     void loadInt32x1(const Address& src, FloatRegister dest) {
         vmovd(Operand(src), dest);
     }
@@ -572,51 +742,48 @@ class MacroAssemblerX86Shared : public A
     void packedRcpSqrtApproximationFloat32x4(const Operand& src, FloatRegister dest) {
         // TODO See comment above. See also bug 1068028.
         vrsqrtps(src, dest);
     }
     void packedSqrtFloat32x4(const Operand& src, FloatRegister dest) {
         vsqrtps(src, dest);
     }
 
-    void packedLeftShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) {
-        vpsllw(src, dest, dest);
-    }
+  public:
+    void packedLeftShiftByScalarInt16x8(FloatRegister in, Register count, Register temp,
+                                        FloatRegister dest);
+    void packedRightShiftByScalarInt16x8(FloatRegister in, Register count, Register temp,
+                                         FloatRegister dest);
+    void packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count, Register temp,
+                                                 FloatRegister dest);
+
     void packedLeftShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+        count.value &= 15;
         vpsllw(count, dest, dest);
     }
-    void packedRightShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) {
-        vpsraw(src, dest, dest);
-    }
     void packedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+        count.value &= 15;
         vpsraw(count, dest, dest);
     }
-    void packedUnsignedRightShiftByScalarInt16x8(FloatRegister src, FloatRegister dest) {
-        vpsrlw(src, dest, dest);
-    }
     void packedUnsignedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+        count.value &= 15;
         vpsrlw(count, dest, dest);
     }
 
-    void packedLeftShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) {
-        vpslld(src, dest, dest);
-    }
+    void packedLeftShiftByScalarInt32x4(FloatRegister in, Register count, Register temp,
+                                        FloatRegister dest);
+    void packedRightShiftByScalarInt32x4(FloatRegister in, Register count, Register temp,
+                                         FloatRegister dest);
+    void packedUnsignedRightShiftByScalarInt32x4(FloatRegister in, Register count, Register temp,
+                                                 FloatRegister dest);
+
     void packedLeftShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
+        count.value &= 31;
         vpslld(count, dest, dest);
     }
-    void packedRightShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) {
-        vpsrad(src, dest, dest);
-    }
     void packedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
+        count.value &= 31;
         vpsrad(count, dest, dest);
     }
-    void packedUnsignedRightShiftByScalarInt32x4(FloatRegister src, FloatRegister dest) {
-        vpsrld(src, dest, dest);
-    }
     void packedUnsignedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
+        count.value &= 31;
         vpsrld(count, dest, dest);
     }
 
     void loadFloat32x3(const Address& src, FloatRegister dest) {
         Address srcZ(src);
         srcZ.offset += 2 * sizeof(float);
         vmovsd(src, dest);
         ScratchSimd128Scope scratch(asMasm());
--- a/js/src/moz.build
+++ b/js/src/moz.build
@@ -496,16 +496,17 @@ elif CONFIG['JS_CODEGEN_X86'] or CONFIG[
     UNIFIED_SOURCES += [
         'jit/x86-shared/Architecture-x86-shared.cpp',
         'jit/x86-shared/Assembler-x86-shared.cpp',
         'jit/x86-shared/AssemblerBuffer-x86-shared.cpp',
         'jit/x86-shared/BaselineCompiler-x86-shared.cpp',
         'jit/x86-shared/BaselineIC-x86-shared.cpp',
         'jit/x86-shared/CodeGenerator-x86-shared.cpp',
         'jit/x86-shared/Lowering-x86-shared.cpp',
+        'jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp',
         'jit/x86-shared/MacroAssembler-x86-shared.cpp',
         'jit/x86-shared/MoveEmitter-x86-shared.cpp',
     ]
     SOURCES += [
         'jit/x86-shared/Disassembler-x86-shared.cpp',  # using namespace js::jit::X86Encoding;
     ]
     if CONFIG['JS_CODEGEN_X64']:
         LOpcodesGenerated.inputs += ['jit/x64/LIR-x64.h']