Bug 1656229 - Use scratch scopes for v128. r=jseward
author: Lars T Hansen <lhansen@mozilla.com>
Thu, 20 Aug 2020 14:42:49 +0000
changeset 610152 a475ddc1aeec7006fab0c9d28660e6c90a8c6d2e
parent 610151 acc09aef3e0784f9e2023cac0b755461c32e594c
child 610153 3da7d219a9e4e4f6e23892e2ef27192de1954f9d
push id: 13553
push user: ffxbld-merge
push date: Mon, 24 Aug 2020 12:51:36 +0000
treeherder: mozilla-beta@a54f8b5d0977
reviewers: jseward
bugs: 1656229
milestone: 81.0a1
Bug 1656229 - Use scratch scopes for v128. r=jseward

Always use ScratchSimd128Scope to claim ScratchSimd128Reg. The only hard part is that the register was claimed deep in the assembler, in what appears to be a late non-AVX bugfix working around the fact that compare operations are not three-address on non-AVX. I fixed this by making the compare operations two-address and moving the code that shuffles registers for this case into the macroassembler, where the scratch can be claimed correctly.

As a result we have less support for AVX, but since AVX is neither supported nor tested, this does not actually matter; a MOZ_CRASH ensures we will hit the issue if we ever test with AVX. A couple of similar cases elsewhere get the same local fix: MOZ_CRASH for AVX, two-address code for the normal case.

Differential Revision: https://phabricator.services.mozilla.com/D87284
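For context, ScratchSimd128Scope is an RAII claim on the single SIMD scratch register: constructing the scope claims the register and destroying it releases it, so an overlapping claim asserts instead of silently clobbering a live scratch value. The sketch below is a minimal toy model of that idea only; it is not SpiderMonkey's actual implementation, and ToyAssembler / ToyScratchSimd128Scope are illustrative names.

#include <cassert>

// Stand-in for the assembler object that owns the "scratch is claimed" flag.
struct ToyAssembler {
  bool simdScratchClaimed = false;
};

// Minimal RAII scope mirroring the shape of ScratchSimd128Scope: claim on
// construction, release on destruction, assert on an overlapping claim.
class ToyScratchSimd128Scope {
 public:
  explicit ToyScratchSimd128Scope(ToyAssembler& masm) : masm_(masm) {
    assert(!masm_.simdScratchClaimed && "SIMD scratch register already claimed");
    masm_.simdScratchClaimed = true;
  }
  ~ToyScratchSimd128Scope() { masm_.simdScratchClaimed = false; }
  ToyScratchSimd128Scope(const ToyScratchSimd128Scope&) = delete;
  ToyScratchSimd128Scope& operator=(const ToyScratchSimd128Scope&) = delete;

 private:
  ToyAssembler& masm_;
};

int main() {
  ToyAssembler masm;
  {
    ToyScratchSimd128Scope scratch(masm);  // claim the scratch register
    // ... emit code that uses the scratch here ...
  }                                        // released at the end of the scope
  ToyScratchSimd128Scope again(masm);      // OK: the previous scope has ended
  return 0;
}

With the compares made two-address, the macroassembler now does the register shuffling before emitting the instruction: lhs is copied into the output register, and rhs is first spilled to the scratch when it aliases the output, all under a properly claimed scope (see the compareFloat32x4 and compareFloat64x2 hunks below).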
js/src/jit/x86-shared/Assembler-x86-shared.h
js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
js/src/jit/x86/MacroAssembler-x86.cpp
--- a/js/src/jit/x86-shared/Assembler-x86-shared.h
+++ b/js/src/jit/x86-shared/Assembler-x86-shared.h
@@ -2473,111 +2473,80 @@ class AssemblerX86Shared : public Assemb
       case Operand::FPREG:
         masm.vpcmpgtq_rr(rhs.fpu(), lhs.encoding(), dest.encoding());
         break;
       default:
         MOZ_CRASH("unexpected operand kind");
     }
   }
 
-  void vcmpps(uint8_t order, Operand src1, FloatRegister src0,
-              FloatRegister dest) {
+  void vcmpps(uint8_t order, Operand rhs, FloatRegister srcDest) {
     MOZ_ASSERT(HasSSE2());
-    // :TODO: (Bug 1132894) See LIRGeneratorX86Shared::lowerForFPU
-    // FIXME: This logic belongs in the MacroAssembler.
-    //
-    // Also FIXME: Want to use ScratchSimd128Scope here, not the register
-    // directly.
-    if (!HasAVX() && !src0.aliases(dest)) {
-      if (src1.kind() == Operand::FPREG &&
-          dest.aliases(FloatRegister::FromCode(src1.fpu()))) {
-        vmovdqa(src1, ScratchSimd128Reg);
-        src1 = Operand(ScratchSimd128Reg);
-      }
-      vmovdqa(src0, dest);
-      src0 = dest;
-    }
-    switch (src1.kind()) {
+    switch (rhs.kind()) {
       case Operand::FPREG:
-        masm.vcmpps_rr(order, src1.fpu(), src0.encoding(), dest.encoding());
+        masm.vcmpps_rr(order, rhs.fpu(), srcDest.encoding(),
+                       srcDest.encoding());
         break;
       case Operand::MEM_REG_DISP:
-        masm.vcmpps_mr(order, src1.disp(), src1.base(), src0.encoding(),
-                       dest.encoding());
+        masm.vcmpps_mr(order, rhs.disp(), rhs.base(), srcDest.encoding(),
+                       srcDest.encoding());
         break;
       case Operand::MEM_ADDRESS32:
-        masm.vcmpps_mr(order, src1.address(), src0.encoding(), dest.encoding());
+        masm.vcmpps_mr(order, rhs.address(), srcDest.encoding(),
+                       srcDest.encoding());
         break;
       default:
         MOZ_CRASH("unexpected operand kind");
     }
   }
-  void vcmpeqps(const Operand& src1, FloatRegister src0, FloatRegister dest) {
-    vcmpps(X86Encoding::ConditionCmp_EQ, src1, src0, dest);
-  }
-  void vcmpltps(const Operand& src1, FloatRegister src0, FloatRegister dest) {
-    vcmpps(X86Encoding::ConditionCmp_LT, src1, src0, dest);
-  }
-  void vcmpleps(const Operand& src1, FloatRegister src0, FloatRegister dest) {
-    vcmpps(X86Encoding::ConditionCmp_LE, src1, src0, dest);
-  }
-  void vcmpunordps(const Operand& src1, FloatRegister src0,
-                   FloatRegister dest) {
-    vcmpps(X86Encoding::ConditionCmp_UNORD, src1, src0, dest);
-  }
-  void vcmpneqps(const Operand& src1, FloatRegister src0, FloatRegister dest) {
-    vcmpps(X86Encoding::ConditionCmp_NEQ, src1, src0, dest);
-  }
-  void vcmpordps(const Operand& src1, FloatRegister src0, FloatRegister dest) {
-    vcmpps(X86Encoding::ConditionCmp_ORD, src1, src0, dest);
-  }
-  void vcmppd(uint8_t order, Operand src1, FloatRegister src0,
-              FloatRegister dest) {
-    // Pre-AVX we require src0 == dest but logic in the macroassembler
-    // invalidates this and should be changed.
-    //
-    // FIXME: See further comments at vcmpps.
-    //
-    // Also FIXME: Want to use ScratchSimd128Scope here, not the register
-    // directly.
-    if (!HasAVX() && !src0.aliases(dest)) {
-      if (src1.kind() == Operand::FPREG &&
-          dest.aliases(FloatRegister::FromCode(src1.fpu()))) {
-        vmovdqa(src1, ScratchSimd128Reg);
-        src1 = Operand(ScratchSimd128Reg);
-      }
-      vmovdqa(src0, dest);
-      src0 = dest;
-    }
-    switch (src1.kind()) {
+  void vcmpeqps(const Operand& rhs, FloatRegister srcDest) {
+    vcmpps(X86Encoding::ConditionCmp_EQ, rhs, srcDest);
+  }
+  void vcmpltps(const Operand& rhs, FloatRegister srcDest) {
+    vcmpps(X86Encoding::ConditionCmp_LT, rhs, srcDest);
+  }
+  void vcmpleps(const Operand& rhs, FloatRegister srcDest) {
+    vcmpps(X86Encoding::ConditionCmp_LE, rhs, srcDest);
+  }
+  void vcmpunordps(const Operand& rhs, FloatRegister srcDest) {
+    vcmpps(X86Encoding::ConditionCmp_UNORD, rhs, srcDest);
+  }
+  void vcmpneqps(const Operand& rhs, FloatRegister srcDest) {
+    vcmpps(X86Encoding::ConditionCmp_NEQ, rhs, srcDest);
+  }
+  void vcmpordps(const Operand& rhs, FloatRegister srcDest) {
+    vcmpps(X86Encoding::ConditionCmp_ORD, rhs, srcDest);
+  }
+  void vcmppd(uint8_t order, Operand rhs, FloatRegister srcDest) {
+    switch (rhs.kind()) {
       case Operand::FPREG:
-        masm.vcmppd_rr(order, src1.fpu(), src0.encoding(), dest.encoding());
+        masm.vcmppd_rr(order, rhs.fpu(), srcDest.encoding(),
+                       srcDest.encoding());
         break;
       default:
         MOZ_CRASH("NYI");
     }
   }
-  void vcmpeqpd(const Operand& src1, FloatRegister src0, FloatRegister dest) {
-    vcmppd(X86Encoding::ConditionCmp_EQ, src1, src0, dest);
-  }
-  void vcmpltpd(const Operand& src1, FloatRegister src0, FloatRegister dest) {
-    vcmppd(X86Encoding::ConditionCmp_LT, src1, src0, dest);
-  }
-  void vcmplepd(const Operand& src1, FloatRegister src0, FloatRegister dest) {
-    vcmppd(X86Encoding::ConditionCmp_LE, src1, src0, dest);
-  }
-  void vcmpneqpd(const Operand& src1, FloatRegister src0, FloatRegister dest) {
-    vcmppd(X86Encoding::ConditionCmp_NEQ, src1, src0, dest);
-  }
-  void vcmpordpd(const Operand& src1, FloatRegister src0, FloatRegister dest) {
-    vcmppd(X86Encoding::ConditionCmp_ORD, src1, src0, dest);
-  }
-  void vcmpunordpd(const Operand& src1, FloatRegister src0,
-                   FloatRegister dest) {
-    vcmppd(X86Encoding::ConditionCmp_UNORD, src1, src0, dest);
+  void vcmpeqpd(const Operand& rhs, FloatRegister srcDest) {
+    vcmppd(X86Encoding::ConditionCmp_EQ, rhs, srcDest);
+  }
+  void vcmpltpd(const Operand& rhs, FloatRegister srcDest) {
+    vcmppd(X86Encoding::ConditionCmp_LT, rhs, srcDest);
+  }
+  void vcmplepd(const Operand& rhs, FloatRegister srcDest) {
+    vcmppd(X86Encoding::ConditionCmp_LE, rhs, srcDest);
+  }
+  void vcmpneqpd(const Operand& rhs, FloatRegister srcDest) {
+    vcmppd(X86Encoding::ConditionCmp_NEQ, rhs, srcDest);
+  }
+  void vcmpordpd(const Operand& rhs, FloatRegister srcDest) {
+    vcmppd(X86Encoding::ConditionCmp_ORD, rhs, srcDest);
+  }
+  void vcmpunordpd(const Operand& rhs, FloatRegister srcDest) {
+    vcmppd(X86Encoding::ConditionCmp_UNORD, rhs, srcDest);
   }
   void vrcpps(const Operand& src, FloatRegister dest) {
     MOZ_ASSERT(HasSSE2());
     switch (src.kind()) {
       case Operand::FPREG:
         masm.vrcpps_rr(src.fpu(), dest.encoding());
         break;
       case Operand::MEM_REG_DISP:
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
@@ -39,23 +39,23 @@ void MacroAssemblerX86Shared::checkedCon
 
 void MacroAssemblerX86Shared::oolConvertFloat32x4ToInt32x4(
     FloatRegister src, Register temp, Label* rejoin, Label* onConversionError) {
   static const SimdConstant Int32MaxX4 = SimdConstant::SplatX4(2147483647.f);
   static const SimdConstant Int32MinX4 = SimdConstant::SplatX4(-2147483648.f);
 
   ScratchSimd128Scope scratch(asMasm());
   asMasm().loadConstantSimd128Float(Int32MinX4, scratch);
-  vcmpleps(Operand(src), scratch, scratch);
+  vcmpleps(Operand(src), scratch);
   vmovmskps(scratch, temp);
   cmp32(temp, Imm32(15));
   j(Assembler::NotEqual, onConversionError);
 
   asMasm().loadConstantSimd128Float(Int32MaxX4, scratch);
-  vcmpleps(Operand(src), scratch, scratch);
+  vcmpleps(Operand(src), scratch);
   vmovmskps(scratch, temp);
   cmp32(temp, Imm32(0));
   j(Assembler::NotEqual, onConversionError);
 
   jump(rejoin);
 }
 
 void MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(
@@ -149,26 +149,28 @@ void MacroAssemblerX86Shared::createFloa
   FloatRegister lane0Copy = reusedInputSimd128Float(lane0, output);
   FloatRegister lane1Copy = reusedInputSimd128Float(lane1, temp);
   vunpcklps(lane3, lane1Copy, temp);
   vunpcklps(lane2, lane0Copy, output);
   vunpcklps(temp, output, output);
 }
 
 void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
+  ScratchSimd128Scope scratch(asMasm());
+
   vmovd(input, output);
   if (AssemblerX86Shared::HasSSSE3()) {
-    zeroSimd128Int(ScratchSimd128Reg);
-    vpshufb(ScratchSimd128Reg, output, output);
+    zeroSimd128Int(scratch);
+    vpshufb(scratch, output, output);
   } else {
     // Use two shifts to duplicate the low 8 bits into the low 16 bits.
     vpsllw(Imm32(8), output, output);
-    vmovdqa(output, ScratchSimd128Reg);
-    vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
-    vpor(ScratchSimd128Reg, output, output);
+    vmovdqa(output, scratch);
+    vpsrlw(Imm32(8), scratch, scratch);
+    vpor(scratch, output, output);
     // Then do an X8 splat.
     vpshuflw(0, output, output);
     vpshufd(0, output, output);
   }
 }
 
 void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
   vmovd(input, output);
@@ -211,18 +213,19 @@ void MacroAssemblerX86Shared::extractLan
                                                  unsigned lane) {
   if (lane == 0) {
     // The value we want to extract is in the low double-word
     moveLowInt32(input, output);
   } else if (AssemblerX86Shared::HasSSE41()) {
     vpextrd(lane, input, output);
   } else {
     uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
-    shuffleInt32(mask, input, ScratchSimd128Reg);
-    moveLowInt32(ScratchSimd128Reg, output);
+    ScratchSimd128Scope scratch(asMasm());
+    shuffleInt32(mask, input, scratch);
+    moveLowInt32(scratch, output);
   }
 }
 
 void MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input,
                                                    FloatRegister output,
                                                    unsigned lane) {
   if (lane == 0) {
     // The value we want to extract is in the low double-word
@@ -939,17 +942,16 @@ void MacroAssemblerX86Shared::unsignedCo
 
   MOZ_ASSERT(AssemblerX86Shared::HasSSE41());  // PMOVZX, PMOVSX
   MOZ_ASSERT(lhs == output);
   MOZ_ASSERT(lhs != tmp1 && lhs != tmp2);
   MOZ_ASSERT_IF(rhs.kind() == Operand::FPREG,
                 ToSimdFloatRegister(rhs) != lhs &&
                     ToSimdFloatRegister(rhs) != tmp1 &&
                     ToSimdFloatRegister(rhs) != tmp2);
-  MOZ_ASSERT(tmp1 != ScratchSimd128Reg && tmp2 != ScratchSimd128Reg);
 
   bool complement = false;
   switch (cond) {
     case Assembler::Above:
     case Assembler::BelowOrEqual:
       complement = cond == Assembler::BelowOrEqual;
 
       // Low eight bytes of inputs widened to words
@@ -1247,54 +1249,90 @@ void MacroAssemblerX86Shared::unsignedCo
     vpcmpeqd(Operand(tmp1), tmp1, tmp1);
     vpxor(Operand(tmp1), output, output);
   }
 }
 
 void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
                                                Assembler::Condition cond,
                                                FloatRegister output) {
+  if (HasAVX()) {
+    MOZ_CRASH("Can do better here with three-address compares");
+  }
+
+  // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
+  //
+  // TODO: The front end really needs to set things up so that this hack is not
+  // necessary.
+  ScratchSimd128Scope scratch(asMasm());
+  if (!lhs.aliases(output)) {
+    if (rhs.kind() == Operand::FPREG &&
+        output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
+      vmovdqa(rhs, scratch);
+      rhs = Operand(scratch);
+    }
+    vmovdqa(lhs, output);
+  }
+
   switch (cond) {
     case Assembler::Condition::Equal:
-      vcmpeqps(rhs, lhs, output);
+      vcmpeqps(rhs, output);
       break;
     case Assembler::Condition::LessThan:
-      vcmpltps(rhs, lhs, output);
+      vcmpltps(rhs, output);
       break;
     case Assembler::Condition::LessThanOrEqual:
-      vcmpleps(rhs, lhs, output);
+      vcmpleps(rhs, output);
       break;
     case Assembler::Condition::NotEqual:
-      vcmpneqps(rhs, lhs, output);
+      vcmpneqps(rhs, output);
       break;
     case Assembler::Condition::GreaterThanOrEqual:
     case Assembler::Condition::GreaterThan:
       // We reverse these before register allocation so that we don't have to
       // copy into and out of temporaries after codegen.
       MOZ_CRASH("should have reversed this");
     default:
       MOZ_CRASH("unexpected condition op");
   }
 }
 
 void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
                                                Assembler::Condition cond,
                                                FloatRegister output) {
+  if (HasAVX()) {
+    MOZ_CRASH("Can do better here with three-address compares");
+  }
+
+  // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
+  //
+  // TODO: The front end really needs to set things up so that this hack is not
+  // necessary.
+  ScratchSimd128Scope scratch(asMasm());
+  if (!lhs.aliases(output)) {
+    if (rhs.kind() == Operand::FPREG &&
+        output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
+      vmovdqa(rhs, scratch);
+      rhs = Operand(scratch);
+    }
+    vmovdqa(lhs, output);
+  }
+
   switch (cond) {
     case Assembler::Condition::Equal:
-      vcmpeqpd(rhs, lhs, output);
+      vcmpeqpd(rhs, output);
       break;
     case Assembler::Condition::LessThan:
-      vcmpltpd(rhs, lhs, output);
+      vcmpltpd(rhs, output);
       break;
     case Assembler::Condition::LessThanOrEqual:
-      vcmplepd(rhs, lhs, output);
+      vcmplepd(rhs, output);
       break;
     case Assembler::Condition::NotEqual:
-      vcmpneqpd(rhs, lhs, output);
+      vcmpneqpd(rhs, output);
       break;
     case Assembler::Condition::GreaterThanOrEqual:
     case Assembler::Condition::GreaterThan:
       // We reverse these before register allocation so that we don't have to
       // copy into and out of temporaries after codegen.
       MOZ_CRASH("should have reversed this");
     default:
       MOZ_CRASH("unexpected condition op");
@@ -1381,37 +1419,37 @@ void MacroAssemblerX86Shared::minMaxFloa
   } else {
     vmovaps(lhs, output);                    // compute
     vmaxps(rhs, output, output);             //   max lhs, rhs
     vmovaps(rhs, temp1);                     // compute
     vmaxps(Operand(lhs), temp1, temp1);      //   max rhs, lhs
     vandps(temp1, output, output);           // fix max(-0, 0) with AND
   }
   vmovaps(lhs, temp1);                       // compute
-  vcmpunordps(rhs, temp1, temp1);            //   lhs UNORD rhs
+  vcmpunordps(rhs, temp1);                   //   lhs UNORD rhs
   vptest(temp1, temp1);                      // check if any unordered
   j(Assembler::Equal, &l);                   //   and exit if not
 
   // Slow path.
   // output has result for non-NaN lanes, garbage in NaN lanes.
   // temp1 has lhs UNORD rhs.
   // temp2 is dead.
 
   vmovaps(temp1, temp2);                     // clear NaN lanes of result
   vpandn(output, temp2, temp2);              //   result now in temp2
   asMasm().loadConstantSimd128Float(quietBits, output);
   vandps(output, temp1, temp1);              // setup QNaN bits in NaN lanes
   vorps(temp1, temp2, temp2);                //   and OR into result
   vmovaps(lhs, temp1);                       // find NaN lanes
-  vcmpunordps(Operand(temp1), temp1, temp1); //   in lhs
+  vcmpunordps(Operand(temp1), temp1);        //   in lhs
   vmovaps(temp1, output);                    //     (and save them for later)
   vandps(lhs, temp1, temp1);                 //       and extract the NaNs
   vorps(temp1, temp2, temp2);                //         and add to the result
   vmovaps(rhs, temp1);                       // find NaN lanes
-  vcmpunordps(Operand(temp1), temp1, temp1); //   in rhs
+  vcmpunordps(Operand(temp1), temp1);        //   in rhs
   vpandn(temp1, output, output);             //     except if they were in lhs
   vandps(rhs, output, output);               //       and extract the NaNs
   vorps(temp2, output, output);              //         and add to the result
 
   bind(&l);
   /* clang-format on */
 }
 
@@ -1435,37 +1473,37 @@ void MacroAssemblerX86Shared::minMaxFloa
   } else {
     vmovapd(lhs, output);                    // compute
     vmaxpd(rhs, output, output);             //   max lhs, rhs
     vmovapd(rhs, temp1);                     // compute
     vmaxpd(Operand(lhs), temp1, temp1);      //   max rhs, lhs
     vandpd(temp1, output, output);           // fix max(-0, 0) with AND
   }
   vmovapd(lhs, temp1);                       // compute
-  vcmpunordpd(rhs, temp1, temp1);            //   lhs UNORD rhs
+  vcmpunordpd(rhs, temp1);                   //   lhs UNORD rhs
   vptest(temp1, temp1);                      // check if any unordered
   j(Assembler::Equal, &l);                   //   and exit if not
 
   // Slow path.
   // output has result for non-NaN lanes, garbage in NaN lanes.
   // temp1 has lhs UNORD rhs.
   // temp2 is dead.
 
   vmovapd(temp1, temp2);                     // clear NaN lanes of result
   vpandn(output, temp2, temp2);              //   result now in temp2
   asMasm().loadConstantSimd128Float(quietBits, output);
   vandpd(output, temp1, temp1);              // setup QNaN bits in NaN lanes
   vorpd(temp1, temp2, temp2);                //   and OR into result
   vmovapd(lhs, temp1);                       // find NaN lanes
-  vcmpunordpd(Operand(temp1), temp1, temp1); //   in lhs
+  vcmpunordpd(Operand(temp1), temp1);        //   in lhs
   vmovapd(temp1, output);                    //     (and save them for later)
   vandpd(lhs, temp1, temp1);                 //       and extract the NaNs
   vorpd(temp1, temp2, temp2);                //         and add to the result
   vmovapd(rhs, temp1);                       // find NaN lanes
-  vcmpunordpd(Operand(temp1), temp1, temp1); //   in rhs
+  vcmpunordpd(Operand(temp1), temp1);        //   in rhs
   vpandn(temp1, output, output);             //     except if they were in lhs
   vandpd(rhs, output, output);               //       and extract the NaNs
   vorpd(temp2, output, output);              //         and add to the result
 
   bind(&l);
   /* clang-format on */
 }
 
@@ -1508,18 +1546,22 @@ void MacroAssemblerX86Shared::minNumFloa
   FloatRegister tmpCopy = reusedInputSimd128Float(temp, scratch);
   vpcmpeqd(Operand(lhs), tmpCopy, mask);
   vandps(temp, mask, mask);
 
   FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
   vminps(rhs, lhsCopy, temp);
   vorps(mask, temp, temp);
 
-  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, mask);
-  vcmpneqps(rhs, rhsCopy, mask);
+  if (AssemblerX86Shared::HasAVX()) {
+    MOZ_CRASH("Can do better by avoiding the movaps");
+  } else {
+    vmovaps(rhs, mask);
+    vcmpneqps(rhs, mask);
+  }
 
   if (AssemblerX86Shared::HasAVX()) {
     vblendvps(mask, lhs, temp, output);
   } else {
     // Emulate vblendvps.
     // With SSE.4.1 we could use blendvps, however it's awkward since
     // it requires the mask to be in xmm0.
     if (lhs != output) {
@@ -1547,18 +1589,22 @@ void MacroAssemblerX86Shared::maxNumFloa
   FloatRegister lhsCopy = reusedInputSimd128Float(lhs, temp);
   vmaxps(rhs, lhsCopy, temp);
   vandnps(Operand(temp), mask, mask);
 
   // Ensure temp always contains the temporary result
   mask = temp;
   temp = scratch;
 
-  FloatRegister rhsCopy = reusedInputAlignedSimd128Float(rhs, mask);
-  vcmpneqps(rhs, rhsCopy, mask);
+  if (AssemblerX86Shared::HasAVX()) {
+    MOZ_CRASH("Can do better by avoiding the movaps");
+  } else {
+    vmovaps(rhs, mask);
+    vcmpneqps(rhs, mask);
+  }
 
   if (AssemblerX86Shared::HasAVX()) {
     vblendvps(mask, lhs, temp, output);
   } else {
     // Emulate vblendvps.
     // With SSE.4.1 we could use blendvps, however it's awkward since
     // it requires the mask to be in xmm0.
     if (lhs != output) {
@@ -1928,17 +1974,17 @@ void MacroAssemblerX86Shared::truncSatFl
 
   // The cvttps2dq instruction is the workhorse but does not handle NaN or out
   // of range values as we need it to.  We want to saturate too-large positive
   // values to 7FFFFFFFh and too-large negative values to 80000000h.  NaN and -0
   // become 0.
 
   // Convert NaN to 0 by masking away values that compare unordered to itself.
   vmovaps(dest, scratch);
-  vcmpeqps(Operand(scratch), scratch, scratch);
+  vcmpeqps(Operand(scratch), scratch);
   vpand(Operand(scratch), dest, dest);
 
   // Compute the complement of each non-NaN lane's sign bit, we'll need this to
   // correct the result of cvttps2dq.  All other output bits are garbage.
   vpxor(Operand(dest), scratch, scratch);
 
   // Convert.  This will make the output 80000000h if the input is out of range.
   vcvttps2dq(dest, dest);
@@ -1978,17 +2024,17 @@ void MacroAssemblerX86Shared::unsignedTr
   vcvtdq2ps(scratch, scratch);
 
   // temp = dest - 7FFFFFFFh (as floating), this brings integers in the unsigned
   // range but above the signed range into the signed range; 0 => -7FFFFFFFh.
   vmovaps(dest, temp);
   vsubps(Operand(scratch), temp, temp);
 
   // scratch = mask of biased values that are greater than 7FFFFFFFh.
-  vcmpleps(Operand(temp), scratch, scratch);
+  vcmpleps(Operand(temp), scratch);
 
   // Convert the biased values to integer.  Positive values above 7FFFFFFFh will
   // have been converted to 80000000h, all others become the expected integer.
   vcvttps2dq(temp, temp);
 
   // As lanes of scratch are ~0 where the result overflows, this computes
   // 7FFFFFFF in lanes of temp that are 80000000h, and leaves other lanes
   // untouched as the biased integer.
--- a/js/src/jit/x86/MacroAssembler-x86.cpp
+++ b/js/src/jit/x86/MacroAssembler-x86.cpp
@@ -1111,69 +1111,71 @@ void MacroAssembler::convertUInt64ToDoub
     bind(&notNegative);
 
     fstp(Operand(esp, 0));
     vmovsd(Address(esp, 0), dest);
     freeStack(2 * sizeof(intptr_t));
     return;
   }
 
+  ScratchSimd128Scope scratch(*this);
+
   // Following operation uses entire 128-bit of dest XMM register.
   // Currently higher 64-bit is free when we have access to lower 64-bit.
   MOZ_ASSERT(dest.size() == 8);
   FloatRegister dest128 =
       FloatRegister(dest.encoding(), FloatRegisters::Simd128);
 
   // Assume that src is represented as following:
   //   src      = 0x HHHHHHHH LLLLLLLL
 
   // Move src to dest (=dest128) and ScratchInt32x4Reg (=scratch):
   //   dest     = 0x 00000000 00000000  00000000 LLLLLLLL
   //   scratch  = 0x 00000000 00000000  00000000 HHHHHHHH
   vmovd(src.low, dest128);
-  vmovd(src.high, ScratchSimd128Reg);
+  vmovd(src.high, scratch);
 
   // Unpack and interleave dest and scratch to dest:
   //   dest     = 0x 00000000 00000000  HHHHHHHH LLLLLLLL
-  vpunpckldq(ScratchSimd128Reg, dest128, dest128);
+  vpunpckldq(scratch, dest128, dest128);
 
   // Unpack and interleave dest and a constant C1 to dest:
   //   C1       = 0x 00000000 00000000  45300000 43300000
   //   dest     = 0x 45300000 HHHHHHHH  43300000 LLLLLLLL
   // here, each 64-bit part of dest represents following double:
   //   HI(dest) = 0x 1.00000HHHHHHHH * 2**84 == 2**84 + 0x HHHHHHHH 00000000
   //   LO(dest) = 0x 1.00000LLLLLLLL * 2**52 == 2**52 + 0x 00000000 LLLLLLLL
   // See convertUInt64ToDouble for the details.
   static const int32_t CST1[4] = {
       0x43300000,
       0x45300000,
       0x0,
       0x0,
   };
 
-  loadConstantSimd128Int(SimdConstant::CreateX4(CST1), ScratchSimd128Reg);
-  vpunpckldq(ScratchSimd128Reg, dest128, dest128);
+  loadConstantSimd128Int(SimdConstant::CreateX4(CST1), scratch);
+  vpunpckldq(scratch, dest128, dest128);
 
   // Subtract a constant C2 from dest, for each 64-bit part:
   //   C2       = 0x 45300000 00000000  43300000 00000000
   // here, each 64-bit part of C2 represents following double:
   //   HI(C2)   = 0x 1.0000000000000 * 2**84 == 2**84
   //   LO(C2)   = 0x 1.0000000000000 * 2**52 == 2**52
   // after the operation each 64-bit part of dest represents following:
   //   HI(dest) = double(0x HHHHHHHH 00000000)
   //   LO(dest) = double(0x 00000000 LLLLLLLL)
   static const int32_t CST2[4] = {
       0x0,
       0x43300000,
       0x0,
       0x45300000,
   };
 
-  loadConstantSimd128Int(SimdConstant::CreateX4(CST2), ScratchSimd128Reg);
-  vsubpd(ScratchSimd128Reg, dest128, dest128);
+  loadConstantSimd128Int(SimdConstant::CreateX4(CST2), scratch);
+  vsubpd(scratch, dest128, dest128);
 
   // Add HI(dest) and LO(dest) in double and store it into LO(dest),
   //   LO(dest) = double(0x HHHHHHHH 00000000) + double(0x 00000000 LLLLLLLL)
   //            = double(0x HHHHHHHH LLLLLLLL)
   //            = double(src)
   vhaddpd(dest128, dest128);
 }
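The convertUInt64ToDouble hunk above uses a classic bit trick: each 32-bit half of the source is OR-ed into the mantissa of a double whose exponent is 2^52 (low half) or 2^84 (high half), the biases are subtracted, and the two partial values are added. Below is a host-side C++ sketch of the same arithmetic, for illustration only (the function name is made up; this is not the JIT code):

#include <cstdint>
#include <cstring>
#include <cstdio>

// Convert a uint64_t to double with the 2^52 / 2^84 bias trick described in
// the comments of convertUInt64ToDouble.
static double Uint64ToDoubleViaBias(uint64_t src) {
  uint32_t lo = static_cast<uint32_t>(src);
  uint32_t hi = static_cast<uint32_t>(src >> 32);

  // Bit pattern 0x43300000'LLLLLLLL is the double 2^52 + lo.
  uint64_t loBits = 0x4330000000000000ULL | lo;
  // Bit pattern 0x45300000'HHHHHHHH is the double 2^84 + hi * 2^32.
  uint64_t hiBits = 0x4530000000000000ULL | hi;

  double loD, hiD;
  std::memcpy(&loD, &loBits, sizeof loD);
  std::memcpy(&hiD, &hiBits, sizeof hiD);

  // Subtract the biases (vsubpd in the patch), then add the halves (vhaddpd).
  return (hiD - 0x1.0p84) + (loD - 0x1.0p52);
}

int main() {
  uint64_t x = 0x8000000000000001ULL;  // requires the unsigned interpretation
  std::printf("%.1f\n", Uint64ToDoubleViaBias(x));  // prints ~9.2e18
  return 0;
}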