Bug 1656229 - move unused SIMD code. r=jseward
author Lars T Hansen <lhansen@mozilla.com>
Thu, 20 Aug 2020 14:43:13 +0000
changeset 610154 da2ca8466508070950b870a6906a595e690abdd3
parent 610153 3da7d219a9e4e4f6e23892e2ef27192de1954f9d
child 610155 500009648a7cfe02b1534f092d5e148782c8b768
push id 13553
push user ffxbld-merge
push date Mon, 24 Aug 2020 12:51:36 +0000
reviewers jseward
bugs 1656229
milestone 81.0a1
Bug 1656229 - move unused SIMD code. r=jseward

This separates unused SIMD code (from asm.js) from code that's being actively
maintained by placing out-of-line definitions in a new file, and creating
separate sections in the header for the declarations and in-line definitions.
The code is included in the build so that it doesn't go completely stale, but
this is not technically required.

Differential Revision: https://phabricator.services.mozilla.com/D87307
js/src/jit/moz.build
js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD-unused.cpp
js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
js/src/jit/x86-shared/MacroAssembler-x86-shared.h
--- a/js/src/jit/moz.build
+++ b/js/src/jit/moz.build
@@ -109,16 +109,17 @@ elif CONFIG['JS_CODEGEN_X86'] or CONFIG[
     lir_inputs += ['x86-shared/LIR-x86-shared.h']
     UNIFIED_SOURCES += [
         'shared/AtomicOperations-shared-jit.cpp',
         'x86-shared/Architecture-x86-shared.cpp',
         'x86-shared/Assembler-x86-shared.cpp',
         'x86-shared/AssemblerBuffer-x86-shared.cpp',
         'x86-shared/CodeGenerator-x86-shared.cpp',
         'x86-shared/Lowering-x86-shared.cpp',
+        'x86-shared/MacroAssembler-x86-shared-SIMD-unused.cpp',
         'x86-shared/MacroAssembler-x86-shared-SIMD.cpp',
         'x86-shared/MacroAssembler-x86-shared.cpp',
         'x86-shared/MoveEmitter-x86-shared.cpp',
     ]
     if CONFIG['JS_CODEGEN_X64']:
         lir_inputs += ['x64/LIR-x64.h']
         UNIFIED_SOURCES += [
             'x64/Assembler-x64.cpp',
new file mode 100644
--- /dev/null
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD-unused.cpp
@@ -0,0 +1,616 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "jit/MacroAssembler.h"
+#include "jit/x86-shared/MacroAssembler-x86-shared.h"
+
+#include "jit/MacroAssembler-inl.h"
+
+using namespace js;
+using namespace js::jit;
+
+using mozilla::DebugOnly;
+using mozilla::FloatingPoint;
+using mozilla::Maybe;
+using mozilla::SpecificNaN;
+
+// The following routines are from the old asm.js implementation but are UNUSED
+// in the wasm implementation currently.  They are preserved here because it's
+// sad to throw out working code.  They are declared in the header file.
+//
+// Before using these, they should minimally be moved back to
+// MacroAssembler-x86-shared-SIMD.cpp, and it should not be assumed that they
+// are correct according to the wasm spec.
+
+void MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(
+    FloatRegister src, FloatRegister dest, Register temp, Label* oolEntry,
+    Label* rejoin) {
+  // Does the conversion and jumps to the OOL entry if the result value
+  // is the undefined integer pattern.
+  static const SimdConstant InvalidResult =
+      SimdConstant::SplatX4(int32_t(-2147483648));
+  convertFloat32x4ToInt32x4(src, dest);
+
+  ScratchSimd128Scope scratch(asMasm());
+  asMasm().loadConstantSimd128Int(InvalidResult, scratch);
+  packedEqualInt32x4(Operand(dest), scratch);
+  // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
+  // the two following instructions.
+  vmovmskps(scratch, temp);
+  cmp32(temp, Imm32(0));
+  j(Assembler::NotEqual, oolEntry);
+  bind(rejoin);
+}
+
+void MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(
+    FloatRegister src, Register temp, Label* rejoin, Label* onConversionError) {
+  static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
+  static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);
+
+  ScratchSimd128Scope scratch(asMasm());
+  asMasm().loadConstantSimd128Float(Int32MinX4, scratch);
+  vcmpleps(Operand(src), scratch);
+  vmovmskps(scratch, temp);
+  cmp32(temp, Imm32(15));
+  j(Assembler::NotEqual, onConversionError);
+
+  asMasm().loadConstantSimd128Float(Int32MaxX4, scratch);
+  vcmpleps(Operand(src), scratch);
+  vmovmskps(scratch, temp);
+  cmp32(temp, Imm32(0));
+  j(Assembler::NotEqual, onConversionError);
+
+  jump(rejoin);
+}
+
+void MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(
+    FloatRegister in, FloatRegister out, Register temp, FloatRegister tempF,
+    Label* failed) {
+  // Classify lane values into 4 disjoint classes:
+  //
+  //   N-lanes:             in <= -1.0
+  //   A-lanes:      -1.0 < in <= 0x0.ffffffp31
+  //   B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32
+  //   V-lanes: 0x1.0p32 <= in, or isnan(in)
+  //
+  // We need to bail out to throw a RangeError if we see any N-lanes or
+  // V-lanes.
+  //
+  // For A-lanes and B-lanes, we make two float -> int32 conversions:
+  //
+  //   A = cvttps2dq(in)
+  //   B = cvttps2dq(in - 0x1.0p31f)
+  //
+  // Note that the subtraction for the B computation is exact for B-lanes.
+  // There is no rounding, so B is the low 31 bits of the correctly converted
+  // result.
+  //
+  // The cvttps2dq instruction produces 0x80000000 when the input is NaN or
+  // out of range for a signed int32_t. This conveniently provides the missing
+  // high bit for B, so the desired result is A for A-lanes and A|B for
+  // B-lanes.
+
+  ScratchSimd128Scope scratch(asMasm());
+
+  // TODO: If the majority of lanes are A-lanes, it could be faster to compute
+  // A first, use vmovmskps to check for any non-A-lanes and handle them in
+  // ool code. OTOH, if we're wrong about the lane distribution, that would be
+  // slower.
+
+  // Compute B in |scratch|.
+  static const float Adjust = 0x80000000;  // 0x1.0p31f for the benefit of MSVC.
+  static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
+  asMasm().loadConstantSimd128Float(Bias, scratch);
+  packedAddFloat32(Operand(in), scratch);
+  convertFloat32x4ToInt32x4(scratch, scratch);
+
+  // Compute A in |out|. This is the last time we use |in| and the first time
+  // we use |out|, so we can tolerate if they are the same register.
+  convertFloat32x4ToInt32x4(in, out);
+
+  // We can identify A-lanes by the sign bits in A: Any A-lanes will be
+  // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
+  // mask of non-A-lanes into |tempF|.
+  zeroSimd128Float(tempF);
+  packedGreaterThanInt32x4(Operand(out), tempF);
+
+  // Clear the A-lanes in B.
+  bitwiseAndSimdInt(scratch, Operand(tempF), scratch);
+
+  // Compute the final result: A for A-lanes, A|B for B-lanes.
+  bitwiseOrSimdInt(out, Operand(scratch), out);
+
+  // We still need to filter out the V-lanes. They would show up as 0x80000000
+  // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
+  // the remaining negative lanes in B.
+  vmovmskps(scratch, temp);
+  cmp32(temp, Imm32(0));
+  j(Assembler::NotEqual, failed);
+}
+
+void MacroAssemblerX86Shared::createInt32x4(Register lane0, Register lane1,
+                                            Register lane2, Register lane3,
+                                            FloatRegister dest) {
+  if (AssemblerX86Shared::HasSSE41()) {
+    vmovd(lane0, dest);
+    vpinsrd(1, lane1, dest, dest);
+    vpinsrd(2, lane2, dest, dest);
+    vpinsrd(3, lane3, dest, dest);
+    return;
+  }
+
+  asMasm().reserveStack(Simd128DataSize);
+  store32(lane0, Address(StackPointer, 0 * sizeof(int32_t)));
+  store32(lane1, Address(StackPointer, 1 * sizeof(int32_t)));
+  store32(lane2, Address(StackPointer, 2 * sizeof(int32_t)));
+  store32(lane3, Address(StackPointer, 3 * sizeof(int32_t)));
+  loadAlignedSimd128Int(Address(StackPointer, 0), dest);
+  asMasm().freeStack(Simd128DataSize);
+}
+
+void MacroAssemblerX86Shared::createFloat32x4(
+    FloatRegister lane0, FloatRegister lane1, FloatRegister lane2,
+    FloatRegister lane3, FloatRegister temp, FloatRegister output) {
+  FloatRegister lane0Copy = reusedInputSimd128Float(lane0, output);
+  FloatRegister lane1Copy = reusedInputSimd128Float(lane1, temp);
+  vunpcklps(lane3, lane1Copy, temp);
+  vunpcklps(lane2, lane0Copy, output);
+  vunpcklps(temp, output, output);
+}
+
+void MacroAssemblerX86Shared::reinterpretSimd(bool isIntegerLaneType,
+                                              FloatRegister input,
+                                              FloatRegister output) {
+  if (input.aliases(output)) {
+    return;
+  }
+  if (isIntegerLaneType) {
+    vmovdqa(input, output);
+  } else {
+    vmovaps(input, output);
+  }
+}
+
+void MacroAssemblerX86Shared::extractLaneSimdBool(FloatRegister input,
+                                                  Register output,
+                                                  unsigned numLanes,
+                                                  unsigned lane) {
+  switch (numLanes) {
+    case 4:
+      extractLaneInt32x4(input, output, lane);
+      break;
+    case 8:
+      // Get a lane, don't bother fixing the high bits since we'll mask below.
+      extractLaneInt16x8(input, output, lane, SimdSign::NotApplicable);
+      break;
+    case 16:
+      extractLaneInt8x16(input, output, lane, SimdSign::NotApplicable);
+      break;
+    default:
+      MOZ_CRASH("Unhandled SIMD number of lanes");
+  }
+  // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits.
+  asMasm().and32(Imm32(1), output);
+}
+
+void MacroAssemblerX86Shared::allTrueSimdBool(FloatRegister input,
+                                              Register output) {
+  // We know that the input lanes are boolean, so they are either 0 or -1.
+  // The all-true vector has all 128 bits set, no matter the lane geometry.
+  vpmovmskb(input, output);
+  cmp32(output, Imm32(0xffff));
+  emitSet(Assembler::Zero, output);
+}
+
+void MacroAssemblerX86Shared::anyTrueSimdBool(FloatRegister input,
+                                              Register output) {
+  vpmovmskb(input, output);
+  cmp32(output, Imm32(0x0));
+  emitSet(Assembler::NonZero, output);
+}
+
+void MacroAssemblerX86Shared::swizzleInt32x4(FloatRegister input,
+                                             FloatRegister output,
+                                             unsigned lanes[4]) {
+  uint32_t mask = MacroAssembler::ComputeShuffleMask(lanes[0], lanes[1],
+                                                     lanes[2], lanes[3]);
+  shuffleInt32(mask, input, output);
+}
+
+// For SIMD.js
+void MacroAssemblerX86Shared::oldSwizzleInt8x16(FloatRegister input,
+                                                FloatRegister output,
+                                                const Maybe<Register>& temp,
+                                                int8_t lanes[16]) {
+  if (AssemblerX86Shared::HasSSSE3()) {
+    ScratchSimd128Scope scratch(asMasm());
+    asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(lanes), scratch);
+    FloatRegister inputCopy = reusedInputInt32x4(input, output);
+    vpshufb(scratch, inputCopy, output);
+    return;
+  }
+
+  // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
+  MOZ_ASSERT(!!temp, "needs a temp for the memory fallback");
+  asMasm().reserveStack(2 * Simd128DataSize);
+  storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
+  for (unsigned i = 0; i < 16; i++) {
+    load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *temp);
+    store8(*temp, Address(StackPointer, i));
+  }
+  loadAlignedSimd128Int(Address(StackPointer, 0), output);
+  asMasm().freeStack(2 * Simd128DataSize);
+}
+
+static inline bool LanesMatch(unsigned lanes[4], unsigned x, unsigned y,
+                              unsigned z, unsigned w) {
+  return lanes[0] == x && lanes[1] == y && lanes[2] == z && lanes[3] == w;
+}
+
+void MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input,
+                                               FloatRegister output,
+                                               unsigned lanes[4]) {
+  if (AssemblerX86Shared::HasSSE3()) {
+    if (LanesMatch(lanes, 0, 0, 2, 2)) {
+      vmovsldup(input, output);
+      return;
+    }
+    if (LanesMatch(lanes, 1, 1, 3, 3)) {
+      vmovshdup(input, output);
+      return;
+    }
+  }
+
+  // TODO Here and below, arch specific lowering could identify this pattern
+  // and use defineReuseInput to avoid this move (bug 1084404)
+  if (LanesMatch(lanes, 2, 3, 2, 3)) {
+    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
+    vmovhlps(input, inputCopy, output);
+    return;
+  }
+
+  if (LanesMatch(lanes, 0, 1, 0, 1)) {
+    if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
+      vmovddup(input, output);
+      return;
+    }
+    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
+    vmovlhps(input, inputCopy, output);
+    return;
+  }
+
+  if (LanesMatch(lanes, 0, 0, 1, 1)) {
+    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
+    vunpcklps(input, inputCopy, output);
+    return;
+  }
+
+  if (LanesMatch(lanes, 2, 2, 3, 3)) {
+    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
+    vunpckhps(input, inputCopy, output);
+    return;
+  }
+
+  uint32_t x = lanes[0];
+  uint32_t y = lanes[1];
+  uint32_t z = lanes[2];
+  uint32_t w = lanes[3];
+
+  uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
+  shuffleFloat32(mask, input, output);
+}
+
+void MacroAssemblerX86Shared::shuffleX4(FloatRegister lhs, Operand rhs,
+                                        FloatRegister out,
+                                        const Maybe<FloatRegister>& maybeTemp,
+                                        unsigned lanes[4]) {
+  uint32_t x = lanes[0];
+  uint32_t y = lanes[1];
+  uint32_t z = lanes[2];
+  uint32_t w = lanes[3];
+
+  // Check that lanes come from LHS in majority:
+  unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
+  MOZ_ASSERT(numLanesFromLHS >= 2);
+
+  // When reading this method, remember that vshufps takes its first two
+  // output lanes from the destination operand (right operand) and its last
+  // two output lanes from the source operand (left operand).
+  //
+  // Legend for explanations:
+  // - L: LHS
+  // - R: RHS
+  // - T: temporary
+
+  uint32_t mask;
+
+  // If all lanes came from a single vector, we should use swizzle instead.
+  MOZ_ASSERT(numLanesFromLHS < 4);
+
+  // If all values stay in their lane, this is a blend.
+  if (AssemblerX86Shared::HasSSE41()) {
+    if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
+      vblendps(blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
+      return;
+    }
+  }
+
+  // One element of the second, all other elements of the first
+  if (numLanesFromLHS == 3) {
+    unsigned firstMask = -1, secondMask = -1;
+
+    // register-register vmovss preserves the high lanes.
+    if (LanesMatch(lanes, 4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
+      vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
+      return;
+    }
+
+    // SSE4.1 vinsertps can handle any single element.
+    unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
+    if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
+      unsigned srcLane;
+      unsigned dstLane;
+      if (x >= 4) {
+        srcLane = x - 4;
+        dstLane = 0;
+      } else if (y >= 4) {
+        srcLane = y - 4;
+        dstLane = 1;
+      } else if (z >= 4) {
+        srcLane = z - 4;
+        dstLane = 2;
+      } else {
+        MOZ_ASSERT(w >= 4);
+        srcLane = w - 4;
+        dstLane = 3;
+      }
+      vinsertps(vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
+      return;
+    }
+
+    MOZ_ASSERT(!!maybeTemp);
+    FloatRegister rhsCopy = *maybeTemp;
+    loadAlignedSimd128Float(rhs, rhsCopy);
+
+    if (x < 4 && y < 4) {
+      if (w >= 4) {
+        w %= 4;
+        // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
+        firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
+        // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
+        secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0);
+      } else {
+        MOZ_ASSERT(z >= 4);
+        z %= 4;
+        // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
+        firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
+        // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
+        secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2);
+      }
+
+      vshufps(firstMask, lhs, rhsCopy, rhsCopy);
+      vshufps(secondMask, rhsCopy, lhs, out);
+      return;
+    }
+
+    MOZ_ASSERT(z < 4 && w < 4);
+
+    if (y >= 4) {
+      y %= 4;
+      // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
+      firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
+      // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
+      secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w);
+    } else {
+      MOZ_ASSERT(x >= 4);
+      x %= 4;
+      // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
+      firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
+      // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
+      secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w);
+    }
+
+    vshufps(firstMask, lhs, rhsCopy, rhsCopy);
+    if (AssemblerX86Shared::HasAVX()) {
+      vshufps(secondMask, lhs, rhsCopy, out);
+    } else {
+      vshufps(secondMask, lhs, rhsCopy, rhsCopy);
+      moveSimd128Float(rhsCopy, out);
+    }
+    return;
+  }
+
+  // Two elements from one vector, two other elements from the other
+  MOZ_ASSERT(numLanesFromLHS == 2);
+
+  // TODO Here and below, symmetric case would be more handy to avoid a move,
+  // but can't be reached because operands would get swapped (bug 1084404).
+  if (LanesMatch(lanes, 2, 3, 6, 7)) {
+    ScratchSimd128Scope scratch(asMasm());
+    if (AssemblerX86Shared::HasAVX()) {
+      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
+      vmovhlps(lhs, rhsCopy, out);
+    } else {
+      loadAlignedSimd128Float(rhs, scratch);
+      vmovhlps(lhs, scratch, scratch);
+      moveSimd128Float(scratch, out);
+    }
+    return;
+  }
+
+  if (LanesMatch(lanes, 0, 1, 4, 5)) {
+    FloatRegister rhsCopy;
+    ScratchSimd128Scope scratch(asMasm());
+    if (rhs.kind() == Operand::FPREG) {
+      // No need to make an actual copy, since the operand is already
+      // in a register, and it won't be clobbered by the vmovlhps.
+      rhsCopy = FloatRegister::FromCode(rhs.fpu());
+    } else {
+      loadAlignedSimd128Float(rhs, scratch);
+      rhsCopy = scratch;
+    }
+    vmovlhps(rhsCopy, lhs, out);
+    return;
+  }
+
+  if (LanesMatch(lanes, 0, 4, 1, 5)) {
+    vunpcklps(rhs, lhs, out);
+    return;
+  }
+
+  // TODO swapped case would be better (bug 1084404)
+  if (LanesMatch(lanes, 4, 0, 5, 1)) {
+    ScratchSimd128Scope scratch(asMasm());
+    if (AssemblerX86Shared::HasAVX()) {
+      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
+      vunpcklps(lhs, rhsCopy, out);
+    } else {
+      loadAlignedSimd128Float(rhs, scratch);
+      vunpcklps(lhs, scratch, scratch);
+      moveSimd128Float(scratch, out);
+    }
+    return;
+  }
+
+  if (LanesMatch(lanes, 2, 6, 3, 7)) {
+    vunpckhps(rhs, lhs, out);
+    return;
+  }
+
+  // TODO swapped case would be better (bug 1084404)
+  if (LanesMatch(lanes, 6, 2, 7, 3)) {
+    ScratchSimd128Scope scratch(asMasm());
+    if (AssemblerX86Shared::HasAVX()) {
+      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
+      vunpckhps(lhs, rhsCopy, out);
+    } else {
+      loadAlignedSimd128Float(rhs, scratch);
+      vunpckhps(lhs, scratch, scratch);
+      moveSimd128Float(scratch, out);
+    }
+    return;
+  }
+
+  // In one vshufps
+  if (x < 4 && y < 4) {
+    mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
+    vshufps(mask, rhs, lhs, out);
+    return;
+  }
+
+  // At creation, we should have explicitly swapped in this case.
+  MOZ_ASSERT(!(z >= 4 && w >= 4));
+
+  // In two vshufps, for the most generic case:
+  uint32_t firstMask[4], secondMask[4];
+  unsigned i = 0, j = 2, k = 0;
+
+#define COMPUTE_MASK(lane)   \
+  if (lane >= 4) {           \
+    firstMask[j] = lane % 4; \
+    secondMask[k++] = j++;   \
+  } else {                   \
+    firstMask[i] = lane;     \
+    secondMask[k++] = i++;   \
+  }
+
+  COMPUTE_MASK(x)
+  COMPUTE_MASK(y)
+  COMPUTE_MASK(z)
+  COMPUTE_MASK(w)
+#undef COMPUTE_MASK
+
+  MOZ_ASSERT(i == 2 && j == 4 && k == 4);
+
+  mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
+                                            firstMask[2], firstMask[3]);
+  vshufps(mask, rhs, lhs, lhs);
+
+  mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
+                                            secondMask[2], secondMask[3]);
+  vshufps(mask, lhs, lhs, lhs);
+}
+
+void MacroAssemblerX86Shared::minNumFloat32x4(FloatRegister lhs, Operand rhs,
+                                              FloatRegister temp,
+                                              FloatRegister output) {
+  ScratchSimd128Scope scratch(asMasm());
+  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
+                                  temp);
+
+  FloatRegister mask = scratch;
+  FloatRegister tmpCopy = reusedInputSimd128Float(temp, scratch);
+  vpcmpeqd(Operand(lhs), tmpCopy, mask);
+  vandps(temp, mask, mask);
+
+  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
+  vminps(rhs, lhsCopy, temp);
+  vorps(mask, temp, temp);
+
+  if (AssemblerX86Shared::HasAVX()) {
+    MOZ_CRASH("Can do better by avoiding the movaps");
+  } else {
+    vmovaps(rhs, mask);
+    vcmpneqps(rhs, mask);
+  }
+
+  if (AssemblerX86Shared::HasAVX()) {
+    vblendvps(mask, lhs, temp, output);
+  } else {
+    // Emulate vblendvps.
+    // With SSE 4.1 we could use blendvps; however, it's awkward since
+    // it requires the mask to be in xmm0.
+    if (lhs != output) {
+      moveSimd128Float(lhs, output);
+    }
+    vandps(Operand(mask), output, output);
+    vandnps(Operand(temp), mask, mask);
+    vorps(Operand(mask), output, output);
+  }
+}
+
+void MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs,
+                                              FloatRegister temp,
+                                              FloatRegister output) {
+  ScratchSimd128Scope scratch(asMasm());
+  FloatRegister mask = scratch;
+
+  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(0), mask);
+  vpcmpeqd(Operand(lhs), mask, mask);
+
+  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
+                                  temp);
+  vandps(temp, mask, mask);
+
+  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
+  vmaxps(rhs, lhsCopy, temp);
+  vandnps(Operand(temp), mask, mask);
+
+  // Ensure temp always contains the temporary result
+  mask = temp;
+  temp = scratch;
+
+  if (AssemblerX86Shared::HasAVX()) {
+    MOZ_CRASH("Can do better by avoiding the movaps");
+  } else {
+    vmovaps(rhs, mask);
+    vcmpneqps(rhs, mask);
+  }
+
+  if (AssemblerX86Shared::HasAVX()) {
+    vblendvps(mask, lhs, temp, output);
+  } else {
+    // Emulate vblendvps.
+    // With SSE 4.1 we could use blendvps; however, it's awkward since
+    // it requires the mask to be in xmm0.
+    if (lhs != output) {
+      moveSimd128Float(lhs, output);
+    }
+    vandps(Operand(mask), output, output);
+    vandnps(Operand(temp), mask, mask);
+    vorps(Operand(mask), output, output);
+  }
+}
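
For reference, the lane-classification scheme documented in
checkedConvertFloat32x4ToUint32x4 above can be modeled one lane at a time in
scalar C++. This is an illustrative sketch only, not part of the patch: the
helper name checkedFloatToUint32 is made up, and cvttps2dq is approximated by a
guarded cast.

#include <cmath>
#include <cstdint>

// Scalar model of one lane of checkedConvertFloat32x4ToUint32x4.
// Returns false where the vector code would branch to |failed|.
static bool checkedFloatToUint32(float in, uint32_t* out) {
  // cvttps2dq truncates, and yields 0x80000000 for NaN or out-of-range input.
  auto cvttps2dq = [](float f) -> int32_t {
    if (std::isnan(f) || f >= 2147483648.0f || f < -2147483648.0f) {
      return INT32_MIN;
    }
    return static_cast<int32_t>(f);
  };
  int32_t A = cvttps2dq(in);                  // valid for A-lanes
  int32_t B = cvttps2dq(in - 2147483648.0f);  // low 31 bits for B-lanes
  if (A >= 0) {  // A-lane: the result is simply A
    *out = uint32_t(A);
    return true;
  }
  if (B >= 0) {  // B-lane: A (= 0x80000000) supplies the missing high bit
    *out = uint32_t(A) | uint32_t(B);
    return true;
  }
  return false;  // N-lane or V-lane: the vector code bails out to |failed|
}
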
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
@@ -12,152 +12,16 @@
 using namespace js;
 using namespace js::jit;
 
 using mozilla::DebugOnly;
 using mozilla::FloatingPoint;
 using mozilla::Maybe;
 using mozilla::SpecificNaN;
 
-void MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(
-    FloatRegister src, FloatRegister dest, Register temp, Label* oolEntry,
-    Label* rejoin) {
-  // Does the conversion and jumps to the OOL entry if the result value
-  // is the undefined integer pattern.
-  static const SimdConstant InvalidResult =
-      SimdConstant::SplatX4(int32_t(-2147483648));
-  convertFloat32x4ToInt32x4(src, dest);
-
-  ScratchSimd128Scope scratch(asMasm());
-  asMasm().loadConstantSimd128Int(InvalidResult, scratch);
-  packedEqualInt32x4(Operand(dest), scratch);
-  // TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
-  // the two following instructions.
-  vmovmskps(scratch, temp);
-  cmp32(temp, Imm32(0));
-  j(Assembler::NotEqual, oolEntry);
-  bind(rejoin);
-}
-
-void MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(
-    FloatRegister src, Register temp, Label* rejoin, Label* onConversionError) {
-  static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
-  static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);
-
-  ScratchSimd128Scope scratch(asMasm());
-  asMasm().loadConstantSimd128Float(Int32MinX4, scratch);
-  vcmpleps(Operand(src), scratch);
-  vmovmskps(scratch, temp);
-  cmp32(temp, Imm32(15));
-  j(Assembler::NotEqual, onConversionError);
-
-  asMasm().loadConstantSimd128Float(Int32MaxX4, scratch);
-  vcmpleps(Operand(src), scratch);
-  vmovmskps(scratch, temp);
-  cmp32(temp, Imm32(0));
-  j(Assembler::NotEqual, onConversionError);
-
-  jump(rejoin);
-}
-
-void MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(
-    FloatRegister in, FloatRegister out, Register temp, FloatRegister tempF,
-    Label* failed) {
-  // Classify lane values into 4 disjoint classes:
-  //
-  //   N-lanes:             in <= -1.0
-  //   A-lanes:      -1.0 < in <= 0x0.ffffffp31
-  //   B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32
-  //   V-lanes: 0x1.0p32 <= in, or isnan(in)
-  //
-  // We need to bail out to throw a RangeError if we see any N-lanes or
-  // V-lanes.
-  //
-  // For A-lanes and B-lanes, we make two float -> int32 conversions:
-  //
-  //   A = cvttps2dq(in)
-  //   B = cvttps2dq(in - 0x1.0p31f)
-  //
-  // Note that the subtraction for the B computation is exact for B-lanes.
-  // There is no rounding, so B is the low 31 bits of the correctly converted
-  // result.
-  //
-  // The cvttps2dq instruction produces 0x80000000 when the input is NaN or
-  // out of range for a signed int32_t. This conveniently provides the missing
-  // high bit for B, so the desired result is A for A-lanes and A|B for
-  // B-lanes.
-
-  ScratchSimd128Scope scratch(asMasm());
-
-  // TODO: If the majority of lanes are A-lanes, it could be faster to compute
-  // A first, use vmovmskps to check for any non-A-lanes and handle them in
-  // ool code. OTOH, we we're wrong about the lane distribution, that would be
-  // slower.
-
-  // Compute B in |scratch|.
-  static const float Adjust = 0x80000000;  // 0x1.0p31f for the benefit of MSVC.
-  static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
-  asMasm().loadConstantSimd128Float(Bias, scratch);
-  packedAddFloat32(Operand(in), scratch);
-  convertFloat32x4ToInt32x4(scratch, scratch);
-
-  // Compute A in |out|. This is the last time we use |in| and the first time
-  // we use |out|, so we can tolerate if they are the same register.
-  convertFloat32x4ToInt32x4(in, out);
-
-  // We can identify A-lanes by the sign bits in A: Any A-lanes will be
-  // positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
-  // mask of non-A-lanes into |tempF|.
-  zeroSimd128Float(tempF);
-  packedGreaterThanInt32x4(Operand(out), tempF);
-
-  // Clear the A-lanes in B.
-  bitwiseAndSimdInt(scratch, Operand(tempF), scratch);
-
-  // Compute the final result: A for A-lanes, A|B for B-lanes.
-  bitwiseOrSimdInt(out, Operand(scratch), out);
-
-  // We still need to filter out the V-lanes. They would show up as 0x80000000
-  // in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
-  // the remaining negative lanes in B.
-  vmovmskps(scratch, temp);
-  cmp32(temp, Imm32(0));
-  j(Assembler::NotEqual, failed);
-}
-
-void MacroAssemblerX86Shared::createInt32x4(Register lane0, Register lane1,
-                                            Register lane2, Register lane3,
-                                            FloatRegister dest) {
-  if (AssemblerX86Shared::HasSSE41()) {
-    vmovd(lane0, dest);
-    vpinsrd(1, lane1, dest, dest);
-    vpinsrd(2, lane2, dest, dest);
-    vpinsrd(3, lane3, dest, dest);
-    return;
-  }
-
-  asMasm().reserveStack(Simd128DataSize);
-  store32(lane0, Address(StackPointer, 0 * sizeof(int32_t)));
-  store32(lane1, Address(StackPointer, 1 * sizeof(int32_t)));
-  store32(lane2, Address(StackPointer, 2 * sizeof(int32_t)));
-  store32(lane3, Address(StackPointer, 3 * sizeof(int32_t)));
-  loadAlignedSimd128Int(Address(StackPointer, 0), dest);
-  asMasm().freeStack(Simd128DataSize);
-}
-
-void MacroAssemblerX86Shared::createFloat32x4(
-    FloatRegister lane0, FloatRegister lane1, FloatRegister lane2,
-    FloatRegister lane3, FloatRegister temp, FloatRegister output) {
-  FloatRegister lane0Copy = reusedInputSimd128Float(lane0, output);
-  FloatRegister lane1Copy = reusedInputSimd128Float(lane1, temp);
-  vunpcklps(lane3, lane1Copy, temp);
-  vunpcklps(lane2, lane0Copy, output);
-  vunpcklps(temp, output, output);
-}
-
 void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
   ScratchSimd128Scope scratch(asMasm());
 
   vmovd(input, output);
   if (AssemblerX86Shared::HasSSSE3()) {
     zeroSimd128Int(scratch);
     vpshufb(scratch, output, output);
   } else {
@@ -190,29 +54,16 @@ void MacroAssemblerX86Shared::splatX4(Fl
 }
 
 void MacroAssemblerX86Shared::splatX2(FloatRegister input,
                                       FloatRegister output) {
   FloatRegister inputCopy = reusedInputSimd128Float(input, output);
   vshufpd(0, inputCopy, inputCopy, output);
 }
 
-void MacroAssemblerX86Shared::reinterpretSimd(bool isIntegerLaneType,
-                                              FloatRegister input,
-                                              FloatRegister output) {
-  if (input.aliases(output)) {
-    return;
-  }
-  if (isIntegerLaneType) {
-    vmovdqa(input, output);
-  } else {
-    vmovaps(input, output);
-  }
-}
-
 void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
                                                  Register output,
                                                  unsigned lane) {
   if (lane == 0) {
     // The value we want to extract is in the low double-word
     moveLowInt32(input, output);
   } else if (AssemblerX86Shared::HasSSE41()) {
     vpextrd(lane, input, output);
@@ -296,38 +147,16 @@ void MacroAssemblerX86Shared::extractLan
       movzbl(output, output);
       break;
     case SimdSign::NotApplicable:
       // No adjustment needed.
       break;
   }
 }
 
-void MacroAssemblerX86Shared::extractLaneSimdBool(FloatRegister input,
-                                                  Register output,
-                                                  unsigned numLanes,
-                                                  unsigned lane) {
-  switch (numLanes) {
-    case 4:
-      extractLaneInt32x4(input, output, lane);
-      break;
-    case 8:
-      // Get a lane, don't bother fixing the high bits since we'll mask below.
-      extractLaneInt16x8(input, output, lane, SimdSign::NotApplicable);
-      break;
-    case 16:
-      extractLaneInt8x16(input, output, lane, SimdSign::NotApplicable);
-      break;
-    default:
-      MOZ_CRASH("Unhandled SIMD number of lanes");
-  }
-  // We need to generate a 0/1 value. We have 0/-1 and possibly dirty high bits.
-  asMasm().and32(Imm32(1), output);
-}
-
 void MacroAssemblerX86Shared::insertLaneSimdInt(FloatRegister input,
                                                 Register value,
                                                 FloatRegister output,
                                                 unsigned lane,
                                                 unsigned numLanes) {
   if (numLanes == 8) {
     // Available in SSE 2.
     vpinsrw(lane, value, input, output);
@@ -428,123 +257,16 @@ void MacroAssemblerX86Shared::insertLane
       // move low qword of output into high qword of output
       vmovddup(output, output);
       // move low qword of input into low qword of output
       vmovsd(input, output, output);
     }
   }
 }
 
-void MacroAssemblerX86Shared::allTrueSimdBool(FloatRegister input,
-                                              Register output) {
-  // We know that the input lanes are boolean, so they are either 0 or -1.
-  // The all-true vector has all 128 bits set, no matter the lane geometry.
-  vpmovmskb(input, output);
-  cmp32(output, Imm32(0xffff));
-  emitSet(Assembler::Zero, output);
-}
-
-void MacroAssemblerX86Shared::anyTrueSimdBool(FloatRegister input,
-                                              Register output) {
-  vpmovmskb(input, output);
-  cmp32(output, Imm32(0x0));
-  emitSet(Assembler::NonZero, output);
-}
-
-void MacroAssemblerX86Shared::swizzleInt32x4(FloatRegister input,
-                                             FloatRegister output,
-                                             unsigned lanes[4]) {
-  uint32_t mask = MacroAssembler::ComputeShuffleMask(lanes[0], lanes[1],
-                                                     lanes[2], lanes[3]);
-  shuffleInt32(mask, input, output);
-}
-
-// For SIMD.js
-void MacroAssemblerX86Shared::oldSwizzleInt8x16(FloatRegister input,
-                                                FloatRegister output,
-                                                const Maybe<Register>& temp,
-                                                int8_t lanes[16]) {
-  if (AssemblerX86Shared::HasSSSE3()) {
-    ScratchSimd128Scope scratch(asMasm());
-    asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(lanes), scratch);
-    FloatRegister inputCopy = reusedInputInt32x4(input, output);
-    vpshufb(scratch, inputCopy, output);
-    return;
-  }
-
-  // Worst-case fallback for pre-SSSE3 machines. Bounce through memory.
-  MOZ_ASSERT(!!temp, "needs a temp for the memory fallback");
-  asMasm().reserveStack(2 * Simd128DataSize);
-  storeAlignedSimd128Int(input, Address(StackPointer, Simd128DataSize));
-  for (unsigned i = 0; i < 16; i++) {
-    load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]), *temp);
-    store8(*temp, Address(StackPointer, i));
-  }
-  loadAlignedSimd128Int(Address(StackPointer, 0), output);
-  asMasm().freeStack(2 * Simd128DataSize);
-}
-
-static inline bool LanesMatch(unsigned lanes[4], unsigned x, unsigned y,
-                              unsigned z, unsigned w) {
-  return lanes[0] == x && lanes[1] == y && lanes[2] == z && lanes[3] == w;
-}
-
-void MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input,
-                                               FloatRegister output,
-                                               unsigned lanes[4]) {
-  if (AssemblerX86Shared::HasSSE3()) {
-    if (LanesMatch(lanes, 0, 0, 2, 2)) {
-      vmovsldup(input, output);
-      return;
-    }
-    if (LanesMatch(lanes, 1, 1, 3, 3)) {
-      vmovshdup(input, output);
-      return;
-    }
-  }
-
-  // TODO Here and below, arch specific lowering could identify this pattern
-  // and use defineReuseInput to avoid this move (bug 1084404)
-  if (LanesMatch(lanes, 2, 3, 2, 3)) {
-    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
-    vmovhlps(input, inputCopy, output);
-    return;
-  }
-
-  if (LanesMatch(lanes, 0, 1, 0, 1)) {
-    if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
-      vmovddup(input, output);
-      return;
-    }
-    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
-    vmovlhps(input, inputCopy, output);
-    return;
-  }
-
-  if (LanesMatch(lanes, 0, 0, 1, 1)) {
-    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
-    vunpcklps(input, inputCopy, output);
-    return;
-  }
-
-  if (LanesMatch(lanes, 2, 2, 3, 3)) {
-    FloatRegister inputCopy = reusedInputSimd128Float(input, output);
-    vunpckhps(input, inputCopy, output);
-    return;
-  }
-
-  uint32_t x = lanes[0];
-  uint32_t y = lanes[1];
-  uint32_t z = lanes[2];
-  uint32_t w = lanes[3];
-
-  uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
-  shuffleFloat32(mask, input, output);
-}
-
 void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
                                            FloatRegister output,
                                            FloatRegister temp,
                                            const uint8_t lanes[16]) {
   MOZ_ASSERT(AssemblerX86Shared::HasSSSE3());
   MOZ_ASSERT(lhs == output);
   MOZ_ASSERT(lhs == rhs || !temp.isInvalid());
 
@@ -634,247 +356,16 @@ void MacroAssemblerX86Shared::shuffleInt
     load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]),
                     *maybeTemp);
     store8(*maybeTemp, Address(StackPointer, i));
   }
   loadAlignedSimd128Int(Address(StackPointer, 0), output);
   asMasm().freeStack(3 * Simd128DataSize);
 }
 
-void MacroAssemblerX86Shared::shuffleX4(FloatRegister lhs, Operand rhs,
-                                        FloatRegister out,
-                                        const Maybe<FloatRegister>& maybeTemp,
-                                        unsigned lanes[4]) {
-  uint32_t x = lanes[0];
-  uint32_t y = lanes[1];
-  uint32_t z = lanes[2];
-  uint32_t w = lanes[3];
-
-  // Check that lanes come from LHS in majority:
-  unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
-  MOZ_ASSERT(numLanesFromLHS >= 2);
-
-  // When reading this method, remember that vshufps takes the two first
-  // inputs of the destination operand (right operand) and the two last
-  // inputs of the source operand (left operand).
-  //
-  // Legend for explanations:
-  // - L: LHS
-  // - R: RHS
-  // - T: temporary
-
-  uint32_t mask;
-
-  // If all lanes came from a single vector, we should use swizzle instead.
-  MOZ_ASSERT(numLanesFromLHS < 4);
-
-  // If all values stay in their lane, this is a blend.
-  if (AssemblerX86Shared::HasSSE41()) {
-    if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
-      vblendps(blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
-      return;
-    }
-  }
-
-  // One element of the second, all other elements of the first
-  if (numLanesFromLHS == 3) {
-    unsigned firstMask = -1, secondMask = -1;
-
-    // register-register vmovss preserves the high lanes.
-    if (LanesMatch(lanes, 4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
-      vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
-      return;
-    }
-
-    // SSE4.1 vinsertps can handle any single element.
-    unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
-    if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
-      unsigned srcLane;
-      unsigned dstLane;
-      if (x >= 4) {
-        srcLane = x - 4;
-        dstLane = 0;
-      } else if (y >= 4) {
-        srcLane = y - 4;
-        dstLane = 1;
-      } else if (z >= 4) {
-        srcLane = z - 4;
-        dstLane = 2;
-      } else {
-        MOZ_ASSERT(w >= 4);
-        srcLane = w - 4;
-        dstLane = 3;
-      }
-      vinsertps(vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
-      return;
-    }
-
-    MOZ_ASSERT(!!maybeTemp);
-    FloatRegister rhsCopy = *maybeTemp;
-    loadAlignedSimd128Float(rhs, rhsCopy);
-
-    if (x < 4 && y < 4) {
-      if (w >= 4) {
-        w %= 4;
-        // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
-        firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
-        // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
-        secondMask = MacroAssembler::ComputeShuffleMask(x, y, 2, 0);
-      } else {
-        MOZ_ASSERT(z >= 4);
-        z %= 4;
-        // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
-        firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
-        // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
-        secondMask = MacroAssembler::ComputeShuffleMask(x, y, 0, 2);
-      }
-
-      vshufps(firstMask, lhs, rhsCopy, rhsCopy);
-      vshufps(secondMask, rhsCopy, lhs, out);
-      return;
-    }
-
-    MOZ_ASSERT(z < 4 && w < 4);
-
-    if (y >= 4) {
-      y %= 4;
-      // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
-      firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
-      // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
-      secondMask = MacroAssembler::ComputeShuffleMask(2, 0, z, w);
-    } else {
-      MOZ_ASSERT(x >= 4);
-      x %= 4;
-      // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
-      firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
-      // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
-      secondMask = MacroAssembler::ComputeShuffleMask(0, 2, z, w);
-    }
-
-    vshufps(firstMask, lhs, rhsCopy, rhsCopy);
-    if (AssemblerX86Shared::HasAVX()) {
-      vshufps(secondMask, lhs, rhsCopy, out);
-    } else {
-      vshufps(secondMask, lhs, rhsCopy, rhsCopy);
-      moveSimd128Float(rhsCopy, out);
-    }
-    return;
-  }
-
-  // Two elements from one vector, two other elements from the other
-  MOZ_ASSERT(numLanesFromLHS == 2);
-
-  // TODO Here and below, symmetric case would be more handy to avoid a move,
-  // but can't be reached because operands would get swapped (bug 1084404).
-  if (LanesMatch(lanes, 2, 3, 6, 7)) {
-    ScratchSimd128Scope scratch(asMasm());
-    if (AssemblerX86Shared::HasAVX()) {
-      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
-      vmovhlps(lhs, rhsCopy, out);
-    } else {
-      loadAlignedSimd128Float(rhs, scratch);
-      vmovhlps(lhs, scratch, scratch);
-      moveSimd128Float(scratch, out);
-    }
-    return;
-  }
-
-  if (LanesMatch(lanes, 0, 1, 4, 5)) {
-    FloatRegister rhsCopy;
-    ScratchSimd128Scope scratch(asMasm());
-    if (rhs.kind() == Operand::FPREG) {
-      // No need to make an actual copy, since the operand is already
-      // in a register, and it won't be clobbered by the vmovlhps.
-      rhsCopy = FloatRegister::FromCode(rhs.fpu());
-    } else {
-      loadAlignedSimd128Float(rhs, scratch);
-      rhsCopy = scratch;
-    }
-    vmovlhps(rhsCopy, lhs, out);
-    return;
-  }
-
-  if (LanesMatch(lanes, 0, 4, 1, 5)) {
-    vunpcklps(rhs, lhs, out);
-    return;
-  }
-
-  // TODO swapped case would be better (bug 1084404)
-  if (LanesMatch(lanes, 4, 0, 5, 1)) {
-    ScratchSimd128Scope scratch(asMasm());
-    if (AssemblerX86Shared::HasAVX()) {
-      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
-      vunpcklps(lhs, rhsCopy, out);
-    } else {
-      loadAlignedSimd128Float(rhs, scratch);
-      vunpcklps(lhs, scratch, scratch);
-      moveSimd128Float(scratch, out);
-    }
-    return;
-  }
-
-  if (LanesMatch(lanes, 2, 6, 3, 7)) {
-    vunpckhps(rhs, lhs, out);
-    return;
-  }
-
-  // TODO swapped case would be better (bug 1084404)
-  if (LanesMatch(lanes, 6, 2, 7, 3)) {
-    ScratchSimd128Scope scratch(asMasm());
-    if (AssemblerX86Shared::HasAVX()) {
-      FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, scratch);
-      vunpckhps(lhs, rhsCopy, out);
-    } else {
-      loadAlignedSimd128Float(rhs, scratch);
-      vunpckhps(lhs, scratch, scratch);
-      moveSimd128Float(scratch, out);
-    }
-    return;
-  }
-
-  // In one vshufps
-  if (x < 4 && y < 4) {
-    mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
-    vshufps(mask, rhs, lhs, out);
-    return;
-  }
-
-  // At creation, we should have explicitly swapped in this case.
-  MOZ_ASSERT(!(z >= 4 && w >= 4));
-
-  // In two vshufps, for the most generic case:
-  uint32_t firstMask[4], secondMask[4];
-  unsigned i = 0, j = 2, k = 0;
-
-#define COMPUTE_MASK(lane)   \
-  if (lane >= 4) {           \
-    firstMask[j] = lane % 4; \
-    secondMask[k++] = j++;   \
-  } else {                   \
-    firstMask[i] = lane;     \
-    secondMask[k++] = i++;   \
-  }
-
-  COMPUTE_MASK(x)
-  COMPUTE_MASK(y)
-  COMPUTE_MASK(z)
-  COMPUTE_MASK(w)
-#undef COMPUTE_MASK
-
-  MOZ_ASSERT(i == 2 && j == 4 && k == 4);
-
-  mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
-                                            firstMask[2], firstMask[3]);
-  vshufps(mask, rhs, lhs, lhs);
-
-  mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
-                                            secondMask[2], secondMask[3]);
-  vshufps(mask, lhs, lhs, lhs);
-}
-
 static inline FloatRegister ToSimdFloatRegister(const Operand& op) {
   return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
 }
 
 void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
                                              Assembler::Condition cond,
                                              FloatRegister output) {
   static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
@@ -1530,97 +1021,16 @@ void MacroAssemblerX86Shared::minFloat64
 
 void MacroAssemblerX86Shared::maxFloat64x2(FloatRegister lhs, Operand rhs,
                                            FloatRegister temp1,
                                            FloatRegister temp2,
                                            FloatRegister output) {
   minMaxFloat64x2(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
 }
 
-void MacroAssemblerX86Shared::minNumFloat32x4(FloatRegister lhs, Operand rhs,
-                                              FloatRegister temp,
-                                              FloatRegister output) {
-  ScratchSimd128Scope scratch(asMasm());
-  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
-                                  temp);
-
-  FloatRegister mask = scratch;
-  FloatRegister tmpCopy = reusedInputSimd128Float(temp, scratch);
-  vpcmpeqd(Operand(lhs), tmpCopy, mask);
-  vandps(temp, mask, mask);
-
-  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
-  vminps(rhs, lhsCopy, temp);
-  vorps(mask, temp, temp);
-
-  if (AssemblerX86Shared::HasAVX()) {
-    MOZ_CRASH("Can do better by avoiding the movaps");
-  } else {
-    vmovaps(rhs, mask);
-    vcmpneqps(rhs, mask);
-  }
-
-  if (AssemblerX86Shared::HasAVX()) {
-    vblendvps(mask, lhs, temp, output);
-  } else {
-    // Emulate vblendvps.
-    // With SSE.4.1 we could use blendvps, however it's awkward since
-    // it requires the mask to be in xmm0.
-    if (lhs != output) {
-      moveSimd128Float(lhs, output);
-    }
-    vandps(Operand(mask), output, output);
-    vandnps(Operand(temp), mask, mask);
-    vorps(Operand(mask), output, output);
-  }
-}
-
-void MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs,
-                                              FloatRegister temp,
-                                              FloatRegister output) {
-  ScratchSimd128Scope scratch(asMasm());
-  FloatRegister mask = scratch;
-
-  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(0), mask);
-  vpcmpeqd(Operand(lhs), mask, mask);
-
-  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x80000000)),
-                                  temp);
-  vandps(temp, mask, mask);
-
-  FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
-  vmaxps(rhs, lhsCopy, temp);
-  vandnps(Operand(temp), mask, mask);
-
-  // Ensure temp always contains the temporary result
-  mask = temp;
-  temp = scratch;
-
-  if (AssemblerX86Shared::HasAVX()) {
-    MOZ_CRASH("Can do better by avoiding the movaps");
-  } else {
-    vmovaps(rhs, mask);
-    vcmpneqps(rhs, mask);
-  }
-
-  if (AssemblerX86Shared::HasAVX()) {
-    vblendvps(mask, lhs, temp, output);
-  } else {
-    // Emulate vblendvps.
-    // With SSE.4.1 we could use blendvps, however it's awkward since
-    // it requires the mask to be in xmm0.
-    if (lhs != output) {
-      moveSimd128Float(lhs, output);
-    }
-    vandps(Operand(mask), output, output);
-    vandnps(Operand(temp), mask, mask);
-    vorps(Operand(mask), output, output);
-  }
-}
-
 void MacroAssemblerX86Shared::negFloat32x4(Operand in, FloatRegister out) {
   ScratchSimd128Scope scratch(asMasm());
   FloatRegister result = out;
   if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
     result = scratch;
   }
   // All zeros but the sign bit
   static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f);
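
The swizzle and shuffle helpers moved by this patch funnel their lane selection
through MacroAssembler::ComputeShuffleMask, which packs four 2-bit lane indices
into the 8-bit immediate used by pshufd/shufps. Below is a minimal scalar sketch
of that packing and of what swizzleInt32x4 computes, assuming the first lane
index lands in the low two bits (the lowercase helper names are made up and not
part of the patch).

#include <cstdint>

// Pack four lane indices (each 0..3) into a shuffle immediate, two bits per
// output lane, with the index for output lane 0 in the low bits.
static uint32_t computeShuffleMask(uint32_t x, uint32_t y, uint32_t z,
                                   uint32_t w) {
  return (x << 0) | (y << 2) | (z << 4) | (w << 6);
}

// Scalar model of swizzleInt32x4: output lane i takes input lane lanes[i].
static void swizzleInt32x4Model(const int32_t in[4], const unsigned lanes[4],
                                int32_t out[4]) {
  uint32_t mask = computeShuffleMask(lanes[0], lanes[1], lanes[2], lanes[3]);
  for (int i = 0; i < 4; i++) {
    out[i] = in[(mask >> (2 * i)) & 3];
  }
}
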
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
@@ -380,80 +380,47 @@ class MacroAssemblerX86Shared : public A
     // value (0x8000000).
     vcvttps2dq(src, dest);
   }
   void convertInt32x4ToFloat32x4(FloatRegister src, FloatRegister dest) {
     vcvtdq2ps(src, dest);
   }
 
   // SIMD methods, defined in MacroAssembler-x86-shared-SIMD.cpp.
-  void checkedConvertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest,
-                                        Register temp, Label* oolCheck,
-                                        Label* rejoin);
-  void oolConvertFloat32x4ToInt32x4(FloatRegister src, Register temp,
-                                    Label* rejoin, Label* onConversionError);
-  void checkedConvertFloat32x4ToUint32x4(FloatRegister src, FloatRegister dest,
-                                         Register temp, FloatRegister tempF,
-                                         Label* failed);
 
   void unsignedConvertInt32x4ToFloat32x4(FloatRegister src, FloatRegister dest);
 
   void truncSatFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest);
   void unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src, FloatRegister temp,
                                           FloatRegister dest);
 
-  void createInt32x4(Register lane0, Register lane1, Register lane2,
-                     Register lane3, FloatRegister dest);
-  void createFloat32x4(FloatRegister lane0, FloatRegister lane1,
-                       FloatRegister lane2, FloatRegister lane3,
-                       FloatRegister temp, FloatRegister output);
-
   void splatX16(Register input, FloatRegister output);
   void splatX8(Register input, FloatRegister output);
   void splatX4(Register input, FloatRegister output);
   void splatX4(FloatRegister input, FloatRegister output);
   void splatX2(FloatRegister input, FloatRegister output);
 
-  void reinterpretSimd(bool isIntegerLaneType, FloatRegister input,
-                       FloatRegister output);
-
   void extractLaneInt32x4(FloatRegister input, Register output, unsigned lane);
   void extractLaneFloat32x4(FloatRegister input, FloatRegister output,
                             unsigned lane);
   void extractLaneFloat64x2(FloatRegister input, FloatRegister output,
                             unsigned lane);
   void extractLaneInt16x8(FloatRegister input, Register output, unsigned lane,
                           SimdSign sign);
   void extractLaneInt8x16(FloatRegister input, Register output, unsigned lane,
                           SimdSign sign);
-  void extractLaneSimdBool(FloatRegister input, Register output,
-                           unsigned numLanes, unsigned lane);
 
   void insertLaneSimdInt(FloatRegister input, Register value,
                          FloatRegister output, unsigned lane,
                          unsigned numLanes);
   void insertLaneFloat32x4(FloatRegister input, FloatRegister value,
                            FloatRegister output, unsigned lane);
   void insertLaneFloat64x2(FloatRegister input, FloatRegister value,
                            FloatRegister output, unsigned lane);
 
-  void allTrueSimdBool(FloatRegister input, Register output);
-  void anyTrueSimdBool(FloatRegister input, Register output);
-
-  void swizzleInt32x4(FloatRegister input, FloatRegister output,
-                      unsigned lanes[4]);
-  void swizzleFloat32x4(FloatRegister input, FloatRegister output,
-                        unsigned lanes[4]);
-  void oldSwizzleInt8x16(FloatRegister input, FloatRegister output,
-                         const mozilla::Maybe<Register>& temp,
-                         int8_t lanes[16]);
-
-  void shuffleX4(FloatRegister lhs, Operand rhs, FloatRegister out,
-                 const mozilla::Maybe<FloatRegister>& maybeTemp,
-                 unsigned lanes[4]);
   void shuffleInt8x16(FloatRegister lhs, FloatRegister rhs,
                       FloatRegister output,
                       const mozilla::Maybe<FloatRegister>& maybeFloatTemp,
                       const mozilla::Maybe<Register>& maybeTemp,
                       const uint8_t lanes[16]);
   void blendInt8x16(FloatRegister lhs, FloatRegister rhs, FloatRegister output,
                     FloatRegister temp, const uint8_t lanes[16]);
   void blendInt16x8(FloatRegister lhs, FloatRegister rhs, FloatRegister output,
@@ -496,80 +463,108 @@ class MacroAssemblerX86Shared : public A
                        FloatRegister output);
   void minMaxFloat64x2(bool isMin, FloatRegister lhs, Operand rhs,
                        FloatRegister temp1, FloatRegister temp2,
                        FloatRegister output);
   void minFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp1,
                     FloatRegister temp2, FloatRegister output);
   void maxFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp1,
                     FloatRegister temp2, FloatRegister output);
-  void minNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp,
-                       FloatRegister output);
-  void maxNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp,
-                       FloatRegister output);
 
   void minFloat64x2(FloatRegister lhs, Operand rhs, FloatRegister temp1,
                     FloatRegister temp2, FloatRegister output);
   void maxFloat64x2(FloatRegister lhs, Operand rhs, FloatRegister temp1,
                     FloatRegister temp2, FloatRegister output);
 
   void absFloat32x4(Operand in, FloatRegister out);
   void absFloat64x2(Operand in, FloatRegister out);
 
+  void packedShiftByScalarInt8x16(
+      FloatRegister in, Register count, Register temp, FloatRegister xtmp,
+      FloatRegister dest,
+      void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister,
+                                             FloatRegister),
+      void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister));
+
+  void packedLeftShiftByScalarInt8x16(FloatRegister in, Register count,
+                                      Register temp, FloatRegister xtmp,
+                                      FloatRegister dest);
+  void packedLeftShiftByScalarInt8x16(Imm32 count, FloatRegister src,
+                                      FloatRegister dest);
+  void packedRightShiftByScalarInt8x16(FloatRegister in, Register count,
+                                       Register temp, FloatRegister xtmp,
+                                       FloatRegister dest);
+  void packedRightShiftByScalarInt8x16(Imm32 count, FloatRegister src,
+                                       FloatRegister temp, FloatRegister dest);
+  void packedUnsignedRightShiftByScalarInt8x16(FloatRegister in, Register count,
+                                               Register temp,
+                                               FloatRegister xtmp,
+                                               FloatRegister dest);
+  void packedUnsignedRightShiftByScalarInt8x16(Imm32 count, FloatRegister src,
+                                               FloatRegister dest);
+
+  void packedLeftShiftByScalarInt16x8(FloatRegister in, Register count,
+                                      Register temp, FloatRegister dest);
+  void packedRightShiftByScalarInt16x8(FloatRegister in, Register count,
+                                       Register temp, FloatRegister dest);
+  void packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count,
+                                               Register temp,
+                                               FloatRegister dest);
+
+  void packedLeftShiftByScalarInt32x4(FloatRegister in, Register count,
+                                      Register temp, FloatRegister dest);
+  void packedRightShiftByScalarInt32x4(FloatRegister in, Register count,
+                                       Register temp, FloatRegister dest);
+  void packedUnsignedRightShiftByScalarInt32x4(FloatRegister in, Register count,
+                                               Register temp,
+                                               FloatRegister dest);
+  void packedLeftShiftByScalarInt64x2(FloatRegister in, Register count,
+                                      Register temp, FloatRegister dest);
+  void packedRightShiftByScalarInt64x2(FloatRegister in, Register count,
+                                       Register temp1, FloatRegister temp2,
+                                       FloatRegister dest);
+  void packedRightShiftByScalarInt64x2(Imm32 count, FloatRegister src,
+                                       FloatRegister dest);
+  void packedUnsignedRightShiftByScalarInt64x2(FloatRegister in, Register count,
+                                               Register temp,
+                                               FloatRegister dest);
+  void selectSimd128(FloatRegister mask, FloatRegister onTrue,
+                     FloatRegister onFalse, FloatRegister temp,
+                     FloatRegister output);
+
+  // SIMD inline methods private to the implementation that appear to be used.
+
   void bitwiseAndFloat32x4(FloatRegister lhs, const Operand& rhs,
                            FloatRegister dest) {
     vandps(rhs, lhs, dest);
   }
   void bitwiseAndSimdInt(FloatRegister lhs, const Operand& rhs,
                          FloatRegister dest) {
     vpand(rhs, lhs, dest);
   }
-
-  void bitwiseOrFloat32x4(FloatRegister lhs, const Operand& rhs,
-                          FloatRegister dest) {
-    vorps(rhs, lhs, dest);
-  }
   void bitwiseOrSimdInt(FloatRegister lhs, const Operand& rhs,
                         FloatRegister dest) {
     vpor(rhs, lhs, dest);
   }
-
   void bitwiseXorFloat32x4(FloatRegister lhs, const Operand& rhs,
                            FloatRegister dest) {
     vxorps(rhs, lhs, dest);
   }
   void bitwiseXorSimdInt(FloatRegister lhs, const Operand& rhs,
                          FloatRegister dest) {
     vpxor(rhs, lhs, dest);
   }
-
-  void bitwiseAndNotFloat32x4(FloatRegister lhs, const Operand& rhs,
-                              FloatRegister dest) {
-    vandnps(rhs, lhs, dest);
-  }
   void bitwiseAndNotSimdInt(FloatRegister lhs, const Operand& rhs,
                             FloatRegister dest) {
     vpandn(rhs, lhs, dest);
   }
 
   void zeroSimd128Float(FloatRegister dest) { vxorps(dest, dest, dest); }
   void zeroSimd128Int(FloatRegister dest) { vpxor(dest, dest, dest); }
 
-  void selectSimd128(FloatRegister mask, FloatRegister onTrue,
-                     FloatRegister onFalse, FloatRegister temp,
-                     FloatRegister output);
-  void selectX4(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
-                FloatRegister temp, FloatRegister output) {
-    if (AssemblerX86Shared::HasAVX()) {
-      vblendvps(mask, onTrue, onFalse, output);
-    } else {
-      selectSimd128(mask, onTrue, onFalse, temp, output);
-    }
-  }
-
   template <class T, class Reg>
   inline void loadScalar(const Operand& src, Reg dest);
   template <class T, class Reg>
   inline void storeScalar(Reg src, const Address& dest);
   template <class T>
   inline void loadAlignedVector(const Address& src, FloatRegister dest);
   template <class T>
   inline void storeAlignedVector(FloatRegister src, const Address& dest);
@@ -588,24 +583,16 @@ class MacroAssemblerX86Shared : public A
   }
   FloatRegister reusedInputInt32x4(FloatRegister src, FloatRegister dest) {
     if (HasAVX()) {
       return src;
     }
     moveSimd128Int(src, dest);
     return dest;
   }
-  FloatRegister reusedInputAlignedInt32x4(const Operand& src,
-                                          FloatRegister dest) {
-    if (HasAVX() && src.kind() == Operand::FPREG) {
-      return FloatRegister::FromCode(src.fpu());
-    }
-    loadAlignedSimd128Int(src, dest);
-    return dest;
-  }
   void loadUnalignedSimd128Int(const Address& src, FloatRegister dest) {
     vmovdqu(Operand(src), dest);
   }
   void loadUnalignedSimd128Int(const BaseIndex& src, FloatRegister dest) {
     vmovdqu(Operand(src), dest);
   }
   void loadUnalignedSimd128Int(const Operand& src, FloatRegister dest) {
     vmovdqu(src, dest);
@@ -620,16 +607,150 @@ class MacroAssemblerX86Shared : public A
     vmovdqu(src, dest);
   }
   void packedEqualInt32x4(const Operand& src, FloatRegister dest) {
     vpcmpeqd(src, dest, dest);
   }
   void packedGreaterThanInt32x4(const Operand& src, FloatRegister dest) {
     vpcmpgtd(src, dest, dest);
   }
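+  // The immediate shift counts below are masked to the lane width, matching
+  // wasm's shift semantics (the count is taken modulo the lane width);
+  // unmasked x86 immediate shifts would instead zero the lanes, or fill them
+  // with the sign bit for arithmetic shifts, when the count is too large.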
+  void packedLeftShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+    count.value &= 15;
+    vpsllw(count, dest, dest);
+  }
+  void packedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
+    count.value &= 15;
+    vpsraw(count, dest, dest);
+  }
+  void packedUnsignedRightShiftByScalarInt16x8(Imm32 count,
+                                               FloatRegister dest) {
+    count.value &= 15;
+    vpsrlw(count, dest, dest);
+  }
+  void packedLeftShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
+    count.value &= 31;
+    vpslld(count, dest, dest);
+  }
+  void packedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
+    count.value &= 31;
+    vpsrad(count, dest, dest);
+  }
+  void packedUnsignedRightShiftByScalarInt32x4(Imm32 count,
+                                               FloatRegister dest) {
+    count.value &= 31;
+    vpsrld(count, dest, dest);
+  }
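+  // The Aligned helpers below use vmovaps, which faults if the address is
+  // not 16-byte aligned; the Unaligned helpers use vmovups/vmovdqu and
+  // accept any address.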
+  void loadAlignedSimd128Float(const Address& src, FloatRegister dest) {
+    vmovaps(Operand(src), dest);
+  }
+  void loadAlignedSimd128Float(const Operand& src, FloatRegister dest) {
+    vmovaps(src, dest);
+  }
+  void storeAlignedSimd128Float(FloatRegister src, const Address& dest) {
+    vmovaps(src, Operand(dest));
+  }
+  void moveSimd128Float(FloatRegister src, FloatRegister dest) {
+    vmovaps(src, dest);
+  }
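+  // With AVX the non-destructive three-operand forms can read src in place,
+  // so it is returned directly; without AVX the value is first copied into
+  // dest, which the caller then uses as both input and output.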
+  FloatRegister reusedInputSimd128Float(FloatRegister src, FloatRegister dest) {
+    if (HasAVX()) {
+      return src;
+    }
+    moveSimd128Float(src, dest);
+    return dest;
+  }
+  void loadUnalignedSimd128(const Operand& src, FloatRegister dest) {
+    vmovups(src, dest);
+  }
+  void storeUnalignedSimd128(FloatRegister src, const Operand& dest) {
+    vmovups(src, dest);
+  }
+
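+  // ComputeShuffleMask packs four 2-bit lane indices into the 8-bit
+  // immediate used by vpshufd/vshufps; for vpshufd, result lane i is source
+  // lane (mask >> (2 * i)) & 3. For example, the default arguments yield
+  // 0xE4 (the identity shuffle) and ComputeShuffleMask(3, 2, 1, 0) yields
+  // 0x1B, which reverses the four lanes.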
+  static uint32_t ComputeShuffleMask(uint32_t x = 0, uint32_t y = 1,
+                                     uint32_t z = 2, uint32_t w = 3) {
+    MOZ_ASSERT(x < 4 && y < 4 && z < 4 && w < 4);
+    uint32_t r = (w << 6) | (z << 4) | (y << 2) | (x << 0);
+    MOZ_ASSERT(r < 256);
+    return r;
+  }
+
+  void shuffleInt32(uint32_t mask, FloatRegister src, FloatRegister dest) {
+    vpshufd(mask, src, dest);
+  }
+  void moveLowInt32(FloatRegister src, Register dest) { vmovd(src, dest); }
+
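+  // vmovhlps copies the high pair of floats in src to the low pair of dest,
+  // leaving the high half of dest unchanged.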
+  void moveHighPairToLowPairFloat32(FloatRegister src, FloatRegister dest) {
+    vmovhlps(src, dest, dest);
+  }
+  void shuffleFloat32(uint32_t mask, FloatRegister src, FloatRegister dest) {
+    // The x86 shuffle instruction moves 2 words from the dest operand and 2
+    // words from the src operand. To simplify things, just clobber the
+    // output with the input and apply the instruction afterwards.
+    // Note: this is useAtStart-safe because src isn't read afterwards.
+    FloatRegister srcCopy = reusedInputSimd128Float(src, dest);
+    vshufps(mask, srcCopy, srcCopy, dest);
+  }
+
+  // Unused SIMD methods, defined in MacroAssembler-x86-shared-SIMD-unused.cpp.
+  // Don't use these without moving them out of that file and moving the
+  // declarations into the list above.
+
+  void checkedConvertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest,
+                                        Register temp, Label* oolCheck,
+                                        Label* rejoin);
+  void oolConvertFloat32x4ToInt32x4(FloatRegister src, Register temp,
+                                    Label* rejoin, Label* onConversionError);
+  void checkedConvertFloat32x4ToUint32x4(FloatRegister src, FloatRegister dest,
+                                         Register temp, FloatRegister tempF,
+                                         Label* failed);
+  void createInt32x4(Register lane0, Register lane1, Register lane2,
+                     Register lane3, FloatRegister dest);
+  void createFloat32x4(FloatRegister lane0, FloatRegister lane1,
+                       FloatRegister lane2, FloatRegister lane3,
+                       FloatRegister temp, FloatRegister output);
+  void reinterpretSimd(bool isIntegerLaneType, FloatRegister input,
+                       FloatRegister output);
+  void extractLaneSimdBool(FloatRegister input, Register output,
+                           unsigned numLanes, unsigned lane);
+  void allTrueSimdBool(FloatRegister input, Register output);
+  void anyTrueSimdBool(FloatRegister input, Register output);
+  void swizzleInt32x4(FloatRegister input, FloatRegister output,
+                      unsigned lanes[4]);
+  void swizzleFloat32x4(FloatRegister input, FloatRegister output,
+                        unsigned lanes[4]);
+  void oldSwizzleInt8x16(FloatRegister input, FloatRegister output,
+                         const mozilla::Maybe<Register>& temp,
+                         int8_t lanes[16]);
+  void shuffleX4(FloatRegister lhs, Operand rhs, FloatRegister out,
+                 const mozilla::Maybe<FloatRegister>& maybeTemp,
+                 unsigned lanes[4]);
+  void minNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp,
+                       FloatRegister output);
+  void maxNumFloat32x4(FloatRegister lhs, Operand rhs, FloatRegister temp,
+                       FloatRegister output);
+
+  // Unused inline methods; the same caveat applies.
+
+  void bitwiseOrFloat32x4(FloatRegister lhs, const Operand& rhs,
+                          FloatRegister dest) {
+    vorps(rhs, lhs, dest);
+  }
+  void bitwiseAndNotFloat32x4(FloatRegister lhs, const Operand& rhs,
+                              FloatRegister dest) {
+    vandnps(rhs, lhs, dest);
+  }
+  FloatRegister reusedInputAlignedInt32x4(const Operand& src,
+                                          FloatRegister dest) {
+    if (HasAVX() && src.kind() == Operand::FPREG) {
+      return FloatRegister::FromCode(src.fpu());
+    }
+    loadAlignedSimd128Int(src, dest);
+    return dest;
+  }
   void packedAddInt8(const Operand& src, FloatRegister dest) {
     vpaddb(src, dest, dest);
   }
   void packedSubInt8(const Operand& src, FloatRegister dest) {
     vpsubb(src, dest, dest);
   }
   void packedAddInt16(const Operand& src, FloatRegister dest) {
     vpaddw(src, dest, dest);
@@ -649,174 +770,51 @@ class MacroAssemblerX86Shared : public A
     // TODO See also bug 1068028.
     vrcpps(src, dest);
   }
   void packedRcpSqrtApproximationFloat32x4(const Operand& src,
                                            FloatRegister dest) {
     // TODO See comment above. See also bug 1068028.
     vrsqrtps(src, dest);
   }
-
- private:
-  void packedShiftByScalarInt8x16(
-      FloatRegister in, Register count, Register temp, FloatRegister xtmp,
-      FloatRegister dest,
-      void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister,
-                                             FloatRegister),
-      void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister));
-
- public:
-  void packedLeftShiftByScalarInt8x16(FloatRegister in, Register count,
-                                      Register temp, FloatRegister xtmp,
-                                      FloatRegister dest);
-  void packedLeftShiftByScalarInt8x16(Imm32 count, FloatRegister src,
-                                      FloatRegister dest);
-  void packedRightShiftByScalarInt8x16(FloatRegister in, Register count,
-                                       Register temp, FloatRegister xtmp,
-                                       FloatRegister dest);
-  void packedRightShiftByScalarInt8x16(Imm32 count, FloatRegister src,
-                                       FloatRegister temp, FloatRegister dest);
-  void packedUnsignedRightShiftByScalarInt8x16(FloatRegister in, Register count,
-                                               Register temp,
-                                               FloatRegister xtmp,
-                                               FloatRegister dest);
-  void packedUnsignedRightShiftByScalarInt8x16(Imm32 count, FloatRegister src,
-                                               FloatRegister dest);
-
-  void packedLeftShiftByScalarInt16x8(FloatRegister in, Register count,
-                                      Register temp, FloatRegister dest);
-  void packedRightShiftByScalarInt16x8(FloatRegister in, Register count,
-                                       Register temp, FloatRegister dest);
-  void packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count,
-                                               Register temp,
-                                               FloatRegister dest);
-
-  void packedLeftShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
-    count.value &= 15;
-    vpsllw(count, dest, dest);
-  }
-  void packedRightShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
-    count.value &= 15;
-    vpsraw(count, dest, dest);
-  }
-  void packedUnsignedRightShiftByScalarInt16x8(Imm32 count,
-                                               FloatRegister dest) {
-    count.value &= 15;
-    vpsrlw(count, dest, dest);
-  }
-
-  void packedLeftShiftByScalarInt32x4(FloatRegister in, Register count,
-                                      Register temp, FloatRegister dest);
-  void packedRightShiftByScalarInt32x4(FloatRegister in, Register count,
-                                       Register temp, FloatRegister dest);
-  void packedUnsignedRightShiftByScalarInt32x4(FloatRegister in, Register count,
-                                               Register temp,
-                                               FloatRegister dest);
-  void packedLeftShiftByScalarInt64x2(FloatRegister in, Register count,
-                                      Register temp, FloatRegister dest);
-  void packedRightShiftByScalarInt64x2(FloatRegister in, Register count,
-                                       Register temp1, FloatRegister temp2,
-                                       FloatRegister dest);
-  void packedRightShiftByScalarInt64x2(Imm32 count, FloatRegister src,
-                                       FloatRegister dest);
-  void packedUnsignedRightShiftByScalarInt64x2(FloatRegister in, Register count,
-                                               Register temp,
-                                               FloatRegister dest);
-
-  void packedLeftShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
-    count.value &= 31;
-    vpslld(count, dest, dest);
-  }
-  void packedRightShiftByScalarInt32x4(Imm32 count, FloatRegister dest) {
-    count.value &= 31;
-    vpsrad(count, dest, dest);
-  }
-  void packedUnsignedRightShiftByScalarInt32x4(Imm32 count,
-                                               FloatRegister dest) {
-    count.value &= 31;
-    vpsrld(count, dest, dest);
-  }
-
-  void loadAlignedSimd128Float(const Address& src, FloatRegister dest) {
-    vmovaps(Operand(src), dest);
-  }
-  void loadAlignedSimd128Float(const Operand& src, FloatRegister dest) {
-    vmovaps(src, dest);
-  }
-
-  void storeAlignedSimd128Float(FloatRegister src, const Address& dest) {
-    vmovaps(src, Operand(dest));
-  }
-  void moveSimd128Float(FloatRegister src, FloatRegister dest) {
-    vmovaps(src, dest);
-  }
-  FloatRegister reusedInputSimd128Float(FloatRegister src, FloatRegister dest) {
-    if (HasAVX()) {
-      return src;
-    }
-    moveSimd128Float(src, dest);
-    return dest;
-  }
   FloatRegister reusedInputAlignedSimd128Float(const Operand& src,
                                                FloatRegister dest) {
     if (HasAVX() && src.kind() == Operand::FPREG) {
       return FloatRegister::FromCode(src.fpu());
     }
     loadAlignedSimd128Float(src, dest);
     return dest;
   }
-  void loadUnalignedSimd128(const Operand& src, FloatRegister dest) {
-    vmovups(src, dest);
-  }
-  void storeUnalignedSimd128(FloatRegister src, const Operand& dest) {
-    vmovups(src, dest);
-  }
   void packedAddFloat32(const Operand& src, FloatRegister dest) {
     vaddps(src, dest, dest);
   }
   void packedSubFloat32(const Operand& src, FloatRegister dest) {
     vsubps(src, dest, dest);
   }
   void packedMulFloat32(const Operand& src, FloatRegister dest) {
     vmulps(src, dest, dest);
   }
   void packedDivFloat32(const Operand& src, FloatRegister dest) {
     vdivps(src, dest, dest);
   }
-
-  static uint32_t ComputeShuffleMask(uint32_t x = 0, uint32_t y = 1,
-                                     uint32_t z = 2, uint32_t w = 3) {
-    MOZ_ASSERT(x < 4 && y < 4 && z < 4 && w < 4);
-    uint32_t r = (w << 6) | (z << 4) | (y << 2) | (x << 0);
-    MOZ_ASSERT(r < 256);
-    return r;
-  }
-
-  void shuffleInt32(uint32_t mask, FloatRegister src, FloatRegister dest) {
-    vpshufd(mask, src, dest);
-  }
-  void moveLowInt32(FloatRegister src, Register dest) { vmovd(src, dest); }
-
-  void moveHighPairToLowPairFloat32(FloatRegister src, FloatRegister dest) {
-    vmovhlps(src, dest, dest);
-  }
-  void shuffleFloat32(uint32_t mask, FloatRegister src, FloatRegister dest) {
-    // The shuffle instruction on x86 is such that it moves 2 words from
-    // the dest and 2 words from the src operands. To simplify things, just
-    // clobber the output with the input and apply the instruction
-    // afterwards.
-    // Note: this is useAtStart-safe because src isn't read afterwards.
-    FloatRegister srcCopy = reusedInputSimd128Float(src, dest);
-    vshufps(mask, srcCopy, srcCopy, dest);
-  }
   void shuffleMix(uint32_t mask, const Operand& src, FloatRegister dest) {
     // Note this uses vshufps, which is a cross-domain penalty on CPU where it
     // applies, but that's the way clang and gcc do it.
     vshufps(mask, src, dest, dest);
   }
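+  // With AVX, vblendvps does the lane select in one instruction, keyed off
+  // the sign bit of each mask lane; without AVX we fall back to the
+  // out-of-line selectSimd128 helper.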
+  void selectX4(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
+                FloatRegister temp, FloatRegister output) {
+    if (AssemblerX86Shared::HasAVX()) {
+      vblendvps(mask, onTrue, onFalse, output);
+    } else {
+      selectSimd128(mask, onTrue, onFalse, temp, output);
+    }
+  }
+
+  // End unused SIMD.
 
   void moveFloatAsDouble(Register src, FloatRegister dest) {
     vmovd(src, dest);
     vcvtss2sd(dest, dest, dest);
   }
   void loadFloatAsDouble(const Address& src, FloatRegister dest) {
     vmovss(src, dest);
     vcvtss2sd(dest, dest, dest);