Bug 1111241 - SpiderMonkey: Use VEX encodings for several more instructions r=jandem
author Dan Gohman <sunfish@mozilla.com>
Mon, 15 Dec 2014 20:54:00 -0800
changeset 219867 8e00d242e6a28189d229eec0ca229fb6a927ecea
parent 219866 9148324616e86ae941cfd49bcb105aff1148c774
child 219868 9f0d701d11c21dd8b817a531eed55727513b0d7d
push id 12723
push user cbook@mozilla.com
push date Tue, 16 Dec 2014 12:53:09 +0000
treeherder b2g-inbound@468d98a46fdb [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jandem
bugs 1111241
milestone 37.0a1
Bug 1111241 - SpiderMonkey: Use VEX encodings for several more instructions r=jandem
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/BaseAssembler-x86-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
js/src/jit/shared/CodeGenerator-x86-shared.h
js/src/jit/shared/MacroAssembler-x86-shared.cpp
js/src/jit/shared/MacroAssembler-x86-shared.h
js/src/jit/x64/Assembler-x64.h
js/src/jit/x64/CodeGenerator-x64.cpp
js/src/jit/x64/MacroAssembler-x64.h
js/src/jit/x86/CodeGenerator-x86.cpp
js/src/jit/x86/MacroAssembler-x86.h
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -627,23 +627,23 @@ class AssemblerX86Shared : public Assemb
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void movdqa(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.movdqa_rr(src.code(), dest.code());
     }
-    void cvtss2sd(FloatRegister src, FloatRegister dest) {
+    void vcvtss2sd(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.cvtss2sd_rr(src.code(), dest.code());
+        masm.vcvtss2sd_rr(src1.code(), src0.code(), dest.code());
     }
-    void cvtsd2ss(FloatRegister src, FloatRegister dest) {
+    void vcvtsd2ss(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.cvtsd2ss_rr(src.code(), dest.code());
+        masm.vcvtsd2ss_rr(src1.code(), src0.code(), dest.code());
     }
     void movzbl(const Operand &src, Register dest) {
         switch (src.kind()) {
           case Operand::MEM_REG_DISP:
             masm.movzbl_mr(src.disp(), src.base(), dest.code());
             break;
           case Operand::MEM_SCALE:
             masm.movzbl_mr(src.disp(), src.base(), src.index(), src.scale(), dest.code());
@@ -1562,71 +1562,71 @@ class AssemblerX86Shared : public Assemb
         MOZ_ASSERT(HasSSE2());
         masm.psrld_rr(src.code(), dest.code());
     }
     void psrld(Imm32 count, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.psrld_ir(count.value, dest.code());
     }
 
-    void cvtsi2sd(const Operand &src, FloatRegister dest) {
+    void vcvtsi2sd(const Operand &src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        switch (src.kind()) {
+        switch (src1.kind()) {
           case Operand::REG:
-            masm.cvtsi2sd_rr(src.reg(), dest.code());
+            masm.vcvtsi2sd_rr(src1.reg(), src0.code(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
-            masm.cvtsi2sd_mr(src.disp(), src.base(), dest.code());
+            masm.vcvtsi2sd_mr(src1.disp(), src1.base(), src0.code(), dest.code());
             break;
           case Operand::MEM_SCALE:
-            masm.cvtsi2sd_mr(src.disp(), src.base(), src.index(), src.scale(), dest.code());
+            masm.vcvtsi2sd_mr(src1.disp(), src1.base(), src1.index(), src1.scale(), src0.code(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void cvttsd2si(FloatRegister src, Register dest) {
+    void vcvttsd2si(FloatRegister src, Register dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.cvttsd2si_rr(src.code(), dest.code());
+        masm.vcvttsd2si_rr(src.code(), dest.code());
     }
-    void cvttss2si(FloatRegister src, Register dest) {
+    void vcvttss2si(FloatRegister src, Register dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.cvttss2si_rr(src.code(), dest.code());
+        masm.vcvttss2si_rr(src.code(), dest.code());
     }
-    void cvtsi2ss(const Operand &src, FloatRegister dest) {
+    void vcvtsi2ss(const Operand &src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        switch (src.kind()) {
+        switch (src1.kind()) {
           case Operand::REG:
-            masm.cvtsi2ss_rr(src.reg(), dest.code());
+            masm.vcvtsi2ss_rr(src1.reg(), src0.code(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
-            masm.cvtsi2ss_mr(src.disp(), src.base(), dest.code());
+            masm.vcvtsi2ss_mr(src1.disp(), src1.base(), src0.code(), dest.code());
             break;
           case Operand::MEM_SCALE:
-            masm.cvtsi2ss_mr(src.disp(), src.base(), src.index(), src.scale(), dest.code());
+            masm.vcvtsi2ss_mr(src1.disp(), src1.base(), src1.index(), src1.scale(), src0.code(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void cvtsi2ss(Register src, FloatRegister dest) {
+    void vcvtsi2ss(Register src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.cvtsi2ss_rr(src.code(), dest.code());
+        masm.vcvtsi2ss_rr(src1.code(), src0.code(), dest.code());
     }
-    void cvtsi2sd(Register src, FloatRegister dest) {
+    void vcvtsi2sd(Register src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.cvtsi2sd_rr(src.code(), dest.code());
+        masm.vcvtsi2sd_rr(src1.code(), src0.code(), dest.code());
     }
-    void cvttps2dq(FloatRegister src, FloatRegister dest) {
+    void vcvttps2dq(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.cvttps2dq_rr(src.code(), dest.code());
+        masm.vcvttps2dq_rr(src.code(), dest.code());
     }
-    void cvtdq2ps(FloatRegister src, FloatRegister dest) {
+    void vcvtdq2ps(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.cvtdq2ps_rr(src.code(), dest.code());
+        masm.vcvtdq2ps_rr(src.code(), dest.code());
     }
     void movmskpd(FloatRegister src, Register dest) {
         MOZ_ASSERT(HasSSE2());
         masm.movmskpd_rr(src.code(), dest.code());
     }
     void movmskps(FloatRegister src, Register dest) {
         MOZ_ASSERT(HasSSE2());
         masm.movmskps_rr(src.code(), dest.code());
@@ -1638,47 +1638,47 @@ class AssemblerX86Shared : public Assemb
     void ucomisd(FloatRegister rhs, FloatRegister lhs) {
         MOZ_ASSERT(HasSSE2());
         masm.ucomisd_rr(rhs.code(), lhs.code());
     }
     void ucomiss(FloatRegister rhs, FloatRegister lhs) {
         MOZ_ASSERT(HasSSE2());
         masm.ucomiss_rr(rhs.code(), lhs.code());
     }
-    void pcmpeqw(FloatRegister rhs, FloatRegister lhs) {
+    void vpcmpeqw(FloatRegister rhs, FloatRegister lhs, FloatRegister dst) {
         MOZ_ASSERT(HasSSE2());
-        masm.pcmpeqw_rr(rhs.code(), lhs.code());
+        masm.vpcmpeqw_rr(rhs.code(), lhs.code(), dst.code());
     }
-    void pcmpeqd(const Operand &src, FloatRegister dest) {
+    void vpcmpeqd(const Operand &rhs, FloatRegister lhs, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        switch (src.kind()) {
+        switch (rhs.kind()) {
           case Operand::FPREG:
-            masm.pcmpeqd_rr(src.fpu(), dest.code());
+            masm.vpcmpeqd_rr(rhs.fpu(), lhs.code(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
-            masm.pcmpeqd_mr(src.disp(), src.base(), dest.code());
+            masm.vpcmpeqd_mr(rhs.disp(), rhs.base(), lhs.code(), dest.code());
             break;
           case Operand::MEM_ADDRESS32:
-            masm.pcmpeqd_mr(src.address(), dest.code());
+            masm.vpcmpeqd_mr(rhs.address(), lhs.code(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void pcmpgtd(const Operand &src, FloatRegister dest) {
+    void vpcmpgtd(const Operand &rhs, FloatRegister lhs, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        switch (src.kind()) {
+        switch (rhs.kind()) {
           case Operand::FPREG:
-            masm.pcmpgtd_rr(src.fpu(), dest.code());
+            masm.vpcmpgtd_rr(rhs.fpu(), lhs.code(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
-            masm.pcmpgtd_mr(src.disp(), src.base(), dest.code());
+            masm.vpcmpgtd_mr(rhs.disp(), rhs.base(), lhs.code(), dest.code());
             break;
           case Operand::MEM_ADDRESS32:
-            masm.pcmpgtd_mr(src.address(), dest.code());
+            masm.vpcmpgtd_mr(rhs.address(), lhs.code(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void vcmpps(uint8_t order, const Operand &src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src1.kind()) {
@@ -1705,71 +1705,71 @@ class AssemblerX86Shared : public Assemb
         vcmpps(X86Assembler::ConditionCmp_LE, src1, src0, dest);
     }
     void vcmpunordps(const Operand &src1, FloatRegister src0, FloatRegister dest) {
         vcmpps(X86Assembler::ConditionCmp_UNORD, src1, src0, dest);
     }
     void vcmpneqps(const Operand &src1, FloatRegister src0, FloatRegister dest) {
         vcmpps(X86Assembler::ConditionCmp_NEQ, src1, src0, dest);
     }
-    void rcpps(const Operand &src, FloatRegister dest) {
+    void vrcpps(const Operand &src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src.kind()) {
           case Operand::FPREG:
-            masm.rcpps_rr(src.fpu(), dest.code());
+            masm.vrcpps_rr(src.fpu(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
-            masm.rcpps_mr(src.disp(), src.base(), dest.code());
+            masm.vrcpps_mr(src.disp(), src.base(), dest.code());
             break;
           case Operand::MEM_ADDRESS32:
-            masm.rcpps_mr(src.address(), dest.code());
+            masm.vrcpps_mr(src.address(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void sqrtps(const Operand &src, FloatRegister dest) {
+    void vsqrtps(const Operand &src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src.kind()) {
           case Operand::FPREG:
-            masm.sqrtps_rr(src.fpu(), dest.code());
+            masm.vsqrtps_rr(src.fpu(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
-            masm.sqrtps_mr(src.disp(), src.base(), dest.code());
+            masm.vsqrtps_mr(src.disp(), src.base(), dest.code());
             break;
           case Operand::MEM_ADDRESS32:
-            masm.sqrtps_mr(src.address(), dest.code());
+            masm.vsqrtps_mr(src.address(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void rsqrtps(const Operand &src, FloatRegister dest) {
+    void vrsqrtps(const Operand &src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src.kind()) {
           case Operand::FPREG:
-            masm.rsqrtps_rr(src.fpu(), dest.code());
+            masm.vrsqrtps_rr(src.fpu(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
-            masm.rsqrtps_mr(src.disp(), src.base(), dest.code());
+            masm.vrsqrtps_mr(src.disp(), src.base(), dest.code());
             break;
           case Operand::MEM_ADDRESS32:
-            masm.rsqrtps_mr(src.address(), dest.code());
+            masm.vrsqrtps_mr(src.address(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void movd(Register src, FloatRegister dest) {
+    void vmovd(Register src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movd_rr(src.code(), dest.code());
+        masm.vmovd_rr(src.code(), dest.code());
     }
-    void movd(FloatRegister src, Register dest) {
+    void vmovd(FloatRegister src, Register dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movd_rr(src.code(), dest.code());
+        masm.vmovd_rr(src.code(), dest.code());
     }
     void vpaddd(const Operand &src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src1.kind()) {
           case Operand::FPREG:
             masm.vpaddd_rr(src1.fpu(), src0.code(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
@@ -2083,23 +2083,23 @@ class AssemblerX86Shared : public Assemb
     void movhlps(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.movhlps_rr(src.code(), dest.code());
     }
     void movlhps(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.movlhps_rr(src.code(), dest.code());
     }
-    void unpcklps(FloatRegister src, FloatRegister dest) {
+    void vunpcklps(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.unpcklps_rr(src.code(), dest.code());
+        masm.vunpcklps_rr(src1.code(), src0.code(), dest.code());
     }
-    void unpckhps(FloatRegister src, FloatRegister dest) {
+    void vunpckhps(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.unpckhps_rr(src.code(), dest.code());
+        masm.vunpckhps_rr(src1.code(), src0.code(), dest.code());
     }
     void shufps(uint32_t mask, FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.shufps_irr(mask, src.code(), dest.code());
     }
     void shufps(uint32_t mask, const Operand &src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src.kind()) {
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@@ -2713,65 +2713,45 @@ public:
         JmpSrc r = m_formatter.immediateRel32();
         spew("j%s        ((%d))",
              nameCC(cond), r.m_offset);
         return r;
     }
 
     // SSE operations:
 
-    void pcmpeqw_rr(XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("pcmpeqw    %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PCMPEQW, (RegisterID)src, (RegisterID)dst); /* right order ? */
-    }
-
-    void pcmpeqd_rr(XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("pcmpeqd    %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PCMPEQD_VdqWdq, (RegisterID)src, (RegisterID)dst);
-    }
-
-    void pcmpeqd_mr(int offset, RegisterID base, XMMRegisterID dst)
-    {
-        spew("pcmpeqd    %s0x%x(%s), %s",
-             PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PCMPEQD_VdqWdq, offset, base, (RegisterID)dst);
-    }
-
-    void pcmpeqd_mr(const void* address, XMMRegisterID dst)
-    {
-        spew("pcmpeqd    %p, %s", address, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PCMPEQD_VdqWdq, address, (RegisterID)dst);
-    }
-
-    void pcmpgtd_rr(XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("pcmpgtd    %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PCMPGTD_VdqWdq, (RegisterID)src, (RegisterID)dst);
-    }
-
-    void pcmpgtd_mr(int offset, RegisterID base, XMMRegisterID dst)
-    {
-        spew("pcmpgtd    %s0x%x(%s), %s",
-             PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PCMPGTD_VdqWdq, offset, base, (RegisterID)dst);
-    }
-
-    void pcmpgtd_mr(const void* address, XMMRegisterID dst)
-    {
-        spew("pcmpgtd    %p, %s",  address, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PCMPGTD_VdqWdq, address, (RegisterID)dst);
+    void vpcmpeqw_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vpcmpeqw", VEX_PD, OP2_PCMPEQW, src1, src0, dst);
+    }
+
+    void vpcmpeqd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vpcmpeqd", VEX_PD, OP2_PCMPEQD_VdqWdq, src1, src0, dst);
+    }
+    void vpcmpeqd_mr(int offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vpcmpeqd", VEX_PD, OP2_PCMPEQD_VdqWdq, offset, base, src0, dst);
+    }
+    void vpcmpeqd_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vpcmpeqd", VEX_PD, OP2_PCMPEQD_VdqWdq, address, src0, dst);
+    }
+
+    void vpcmpgtd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vpcmpgtd", VEX_PD, OP2_PCMPGTD_VdqWdq, src1, src0, dst);
+    }
+    void vpcmpgtd_mr(int offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vpcmpgtd", VEX_PD, OP2_PCMPGTD_VdqWdq, offset, base, src0, dst);
+    }
+    void vpcmpgtd_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vpcmpgtd", VEX_PD, OP2_PCMPGTD_VdqWdq, address, src0, dst);
     }
 
     void vcmpps_rr(uint8_t order, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, src1, src0, dst);
         m_formatter.immediate8(order);
     }
     void vcmpps_mr(uint8_t order, int offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
@@ -2780,65 +2760,44 @@ public:
         m_formatter.immediate8(order);
     }
     void vcmpps_mr(uint8_t order, const void* address, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, address, src0, dst);
         m_formatter.immediate8(order);
     }
 
-    void rcpps_rr(XMMRegisterID src, XMMRegisterID dst){
-        spew("rcpps      %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_RCPPS_VpsWps, (RegisterID)src, (RegisterID)dst);
-    }
-    void rcpps_mr(int offset, RegisterID base, XMMRegisterID dst){
-        spew("rcpps      %s0x%x(%s), %s",
-             PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_RCPPS_VpsWps, offset, base, (RegisterID)dst);
-    }
-    void rcpps_mr(const void* address, XMMRegisterID dst){
-        spew("rcpps      %p, %s", address, nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_RCPPS_VpsWps, address, (RegisterID)dst);
-    }
-
-    void rsqrtps_rr(XMMRegisterID src, XMMRegisterID dst){
-        spew("rsqrtps    %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_RSQRTPS_VpsWps, (RegisterID)src, (RegisterID)dst);
-    }
-    void rsqrtps_mr(int offset, RegisterID base, XMMRegisterID dst){
-        spew("rsqrtps    %s0x%x(%s), %s",
-             PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_RSQRTPS_VpsWps, offset, base, (RegisterID)dst);
-    }
-    void rsqrtps_mr(const void* address, XMMRegisterID dst){
-        spew("rsqrtps    %p, %s", address, nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_RSQRTPS_VpsWps, address, (RegisterID)dst);
-    }
-
-    void sqrtps_rr(XMMRegisterID src, XMMRegisterID dst){
-        spew("sqrtps     %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_SQRTPS_VpsWps, (RegisterID)src, (RegisterID)dst);
-    }
-
-    void sqrtps_mr(int offset, RegisterID base, XMMRegisterID dst){
-        spew("sqrtps     %s0x%x(%s), %s",
-             PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_SQRTPS_VpsWps, offset, base, (RegisterID)dst);
-    }
-
-    void sqrtps_mr(const void* address, XMMRegisterID dst){
-        spew("sqrtps     %p, %s", address, nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_SQRTPS_VpsWps, address, (RegisterID)dst);
-    }
-
-    void addsd_rr(XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("addsd      %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp(OP2_ADDSD_VsdWsd, (RegisterID)src, (RegisterID)dst);
+    void vrcpps_rr(XMMRegisterID src, XMMRegisterID dst) {
+        twoByteOpSimd("vrcpps", VEX_PS, OP2_RCPPS_VpsWps, src, X86Registers::invalid_xmm, dst);
+    }
+    void vrcpps_mr(int offset, RegisterID base, XMMRegisterID dst) {
+        twoByteOpSimd("vrcpps", VEX_PS, OP2_RCPPS_VpsWps, offset, base, X86Registers::invalid_xmm, dst);
+    }
+    void vrcpps_mr(const void* address, XMMRegisterID dst) {
+        twoByteOpSimd("vrcpps", VEX_PS, OP2_RCPPS_VpsWps, address, X86Registers::invalid_xmm, dst);
+    }
+
+    void vrsqrtps_rr(XMMRegisterID src, XMMRegisterID dst) {
+        twoByteOpSimd("vrsqrtps", VEX_PS, OP2_RSQRTPS_VpsWps, src, X86Registers::invalid_xmm, dst);
+    }
+    void vrsqrtps_mr(int offset, RegisterID base, XMMRegisterID dst) {
+        twoByteOpSimd("vrsqrtps", VEX_PS, OP2_RSQRTPS_VpsWps, offset, base, X86Registers::invalid_xmm, dst);
+    }
+    void vrsqrtps_mr(const void* address, XMMRegisterID dst) {
+        twoByteOpSimd("vrsqrtps", VEX_PS, OP2_RSQRTPS_VpsWps, address, X86Registers::invalid_xmm, dst);
+    }
+
+    void vsqrtps_rr(XMMRegisterID src, XMMRegisterID dst) {
+        twoByteOpSimd("vsqrtps", VEX_PS, OP2_SQRTPS_VpsWps, src, X86Registers::invalid_xmm, dst);
+    }
+    void vsqrtps_mr(int offset, RegisterID base, XMMRegisterID dst) {
+        twoByteOpSimd("vsqrtps", VEX_PS, OP2_SQRTPS_VpsWps, offset, base, X86Registers::invalid_xmm, dst);
+    }
+    void vsqrtps_mr(const void* address, XMMRegisterID dst) {
+        twoByteOpSimd("vsqrtps", VEX_PS, OP2_SQRTPS_VpsWps, address, X86Registers::invalid_xmm, dst);
     }
 
     void vaddsd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vaddsd", VEX_SD, OP2_ADDSD_VsdWsd, src1, src0, dst);
     }
 
     void vaddss_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
@@ -2860,160 +2819,119 @@ public:
     {
         twoByteOpSimd("vaddsd", VEX_SD, OP2_ADDSD_VsdWsd, address, src0, dst);
     }
     void vaddss_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vaddss", VEX_SS, OP2_ADDSD_VsdWsd, address, src0, dst);
     }
 
-    void cvtss2sd_rr(XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("cvtss2sd   %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_CVTSS2SD_VsdEd, (RegisterID)src, (RegisterID)dst);
-    }
-
-    void cvtsd2ss_rr(XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("cvtsd2ss   %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp(OP2_CVTSD2SS_VsdEd, (RegisterID)src, (RegisterID)dst);
-    }
-
-    void cvtsi2ss_rr(RegisterID src, XMMRegisterID dst)
-    {
-        spew("cvtsi2ss   %s, %s", nameIReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_CVTSI2SD_VsdEd, src, (RegisterID)dst);
-    }
-
-    void cvtsi2sd_rr(RegisterID src, XMMRegisterID dst)
-    {
-        spew("cvtsi2sd   %s, %s", nameIReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp(OP2_CVTSI2SD_VsdEd, src, (RegisterID)dst);
-    }
-
-    void cvttps2dq_rr(XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("cvttps2dq  %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_CVTTPS2DQ_VdqWps, (RegisterID)src, (RegisterID)dst);
-    }
-
-    void cvtdq2ps_rr(XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("cvtdq2ps   %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_CVTDQ2PS_VpsWdq, (RegisterID)src, (RegisterID)dst);
+    void vcvtss2sd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vcvtss2sd", VEX_SS, OP2_CVTSS2SD_VsdEd, src1, src0, dst);
+    }
+
+    void vcvtsd2ss_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vcvtsd2ss", VEX_SD, OP2_CVTSD2SS_VsdEd, src1, src0, dst);
+    }
+
+    void vcvtsi2ss_rr(RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpInt32Simd("vcvtsi2ss", VEX_SS, OP2_CVTSI2SD_VsdEd, src1, src0, dst);
+    }
+
+    void vcvtsi2sd_rr(RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpInt32Simd("vcvtsi2sd", VEX_SD, OP2_CVTSI2SD_VsdEd, src1, src0, dst);
+    }
+
+    void vcvttps2dq_rr(XMMRegisterID src, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vcvttps2dq", VEX_SS, OP2_CVTTPS2DQ_VdqWps, src, X86Registers::invalid_xmm, dst);
+    }
+
+    void vcvtdq2ps_rr(XMMRegisterID src, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vcvtdq2ps", VEX_PS, OP2_CVTDQ2PS_VpsWdq, src, X86Registers::invalid_xmm, dst);
     }
 
 #ifdef JS_CODEGEN_X64
-    void cvtsq2sd_rr(RegisterID src, XMMRegisterID dst)
-    {
-        spew("cvtsq2sd   %s, %s", nameIReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp64(OP2_CVTSI2SD_VsdEd, src, (RegisterID)dst);
-    }
-    void cvtsq2ss_rr(RegisterID src, XMMRegisterID dst)
-    {
-        spew("cvtsq2ss   %s, %s", nameIReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp64(OP2_CVTSI2SD_VsdEd, src, (RegisterID)dst);
+    void vcvtsq2sd_rr(RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpInt64Simd("vcvtsi2sd", VEX_SD, OP2_CVTSI2SD_VsdEd, src1, src0, dst);
+    }
+    void vcvtsq2ss_rr(RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpInt64Simd("vcvtsi2ss", VEX_SS, OP2_CVTSI2SD_VsdEd, src1, src0, dst);
     }
 #endif
 
-    void cvtsi2sd_mr(int offset, RegisterID base, XMMRegisterID dst)
-    {
-        spew("cvtsi2sd   %s0x%x(%s), %s",
-             PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp(OP2_CVTSI2SD_VsdEd, offset, base, (RegisterID)dst);
-    }
-
-    void cvtsi2sd_mr(int offset, RegisterID base, RegisterID index, int scale, XMMRegisterID dst)
-    {
-        spew("cvtsi2sd   %d(%s,%s,%d), %s",
-             offset, nameIReg(base), nameIReg(index), 1<<scale, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp(OP2_CVTSI2SD_VsdEd, offset, base, index, scale, (RegisterID)dst);
-    }
-
-    void cvtsi2ss_mr(int offset, RegisterID base, XMMRegisterID dst)
-    {
-        spew("cvtsi2ss   %s0x%x(%s), %s",
-             PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_CVTSI2SD_VsdEd, offset, base, (RegisterID)dst);
-    }
-
-    void cvtsi2ss_mr(int offset, RegisterID base, RegisterID index, int scale, XMMRegisterID dst)
-    {
-        spew("cvtsi2ss   %d(%s,%s,%d), %s",
-             offset, nameIReg(base), nameIReg(index), 1<<scale, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_CVTSI2SD_VsdEd, offset, base, index, scale, (RegisterID)dst);
+    void vcvtsi2sd_mr(int offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vcvtsi2sd", VEX_SD, OP2_CVTSI2SD_VsdEd, offset, base, src0, dst);
+    }
+
+    void vcvtsi2sd_mr(int offset, RegisterID base, RegisterID index, int scale, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vcvtsi2sd", VEX_SD, OP2_CVTSI2SD_VsdEd, offset, base, index, scale, src0, dst);
+    }
+
+    void vcvtsi2ss_mr(int offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vcvtsi2ss", VEX_SS, OP2_CVTSI2SD_VsdEd, offset, base, src0, dst);
+    }
+
+    void vcvtsi2ss_mr(int offset, RegisterID base, RegisterID index, int scale, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vcvtsi2ss", VEX_SS, OP2_CVTSI2SD_VsdEd, offset, base, index, scale, src0, dst);
     }
 
 #ifdef JS_CODEGEN_X86
-    void cvtsi2sd_mr(const void* address, XMMRegisterID dst)
-    {
-        spew("cvtsi2sd   %p, %s", address, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp(OP2_CVTSI2SD_VsdEd, address, (RegisterID)dst);
+    void vcvtsi2sd_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vcvtsi2sd", VEX_SD, OP2_CVTSI2SD_VsdEd, address, src0, dst);
     }
 #endif
 
-    void cvttsd2si_rr(XMMRegisterID src, RegisterID dst)
-    {
-        spew("cvttsd2si  %s, %s", nameFPReg(src), nameIReg(4, dst));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp(OP2_CVTTSD2SI_GdWsd, (RegisterID)src, dst);
-    }
-
-    void cvttss2si_rr(XMMRegisterID src, RegisterID dst)
-    {
-        spew("cvttss2si  %s, %s", nameFPReg(src), nameIReg(4, dst));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_CVTTSD2SI_GdWsd, (RegisterID)src, dst);
+    void vcvttsd2si_rr(XMMRegisterID src, RegisterID dst)
+    {
+        twoByteOpSimdInt32("vcvttsd2si", VEX_SD, OP2_CVTTSD2SI_GdWsd, src, dst);
+    }
+
+    void vcvttss2si_rr(XMMRegisterID src, RegisterID dst)
+    {
+        twoByteOpSimdInt32("vcvttss2si", VEX_SS, OP2_CVTTSD2SI_GdWsd, src, dst);
     }
 
 #ifdef JS_CODEGEN_X64
-    void cvttsd2sq_rr(XMMRegisterID src, RegisterID dst)
-    {
-        spew("cvttsd2si  %s, %s", nameFPReg(src), nameIReg(dst));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp64(OP2_CVTTSD2SI_GdWsd, (RegisterID)src, dst);
-    }
-
-    void cvttss2sq_rr(XMMRegisterID src, RegisterID dst)
-    {
-        spew("cvttss2si  %s, %s", nameFPReg(src), nameIReg(dst));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp64(OP2_CVTTSD2SI_GdWsd, (RegisterID)src, dst);
+    void vcvttsd2sq_rr(XMMRegisterID src, RegisterID dst)
+    {
+        twoByteOpSimdInt64("vcvttsd2si", VEX_SD, OP2_CVTTSD2SI_GdWsd, src, dst);
+    }
+
+    void vcvttss2sq_rr(XMMRegisterID src, RegisterID dst)
+    {
+        twoByteOpSimdInt64("vcvttss2si", VEX_SS, OP2_CVTTSD2SI_GdWsd, src, dst);
     }
 #endif
 
-    void unpcklps_rr(XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("unpcklps   %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_UNPCKLPS_VsdWsd, (RegisterID)src, (RegisterID)dst);
-    }
-
-    void unpckhps_rr(XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("unpckhps   %s, %s", nameFPReg(src), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_UNPCKHPS_VsdWsd, (RegisterID)src, (RegisterID)dst);
-    }
-
-    void movd_rr(RegisterID src, XMMRegisterID dst)
-    {
-        spew("movd       %s, %s", nameIReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_MOVD_VdEd, src, (RegisterID)dst);
+    void vunpcklps_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vunpcklps", VEX_PS, OP2_UNPCKLPS_VsdWsd, src1, src0, dst);
+    }
+
+    void vunpckhps_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpSimd("vunpckhps", VEX_PS, OP2_UNPCKHPS_VsdWsd, src1, src0, dst);
+    }
+
+    void vmovd_rr(RegisterID src, XMMRegisterID dst)
+    {
+        movdOpSimd(src, dst);
     }
 
     void vpand_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vpand", VEX_PD, OP2_PANDDQ_VdqWdq, src1, src0, dst);
     }
     void vpand_mr(int offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
@@ -3206,21 +3124,19 @@ public:
     }
 
     void ptest_rr(XMMRegisterID rhs, XMMRegisterID lhs) {
         spew("ptest      %s, %s", nameFPReg(rhs), nameFPReg(lhs));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_PTEST_VdVd, ESCAPE_PTEST, (RegisterID)rhs, (RegisterID)lhs);
     }
 
-    void movd_rr(XMMRegisterID src, RegisterID dst)
-    {
-        spew("movd       %s, %s", nameFPReg(src), nameIReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_MOVD_EdVd, dst, (RegisterID)src);
+    void vmovd_rr(XMMRegisterID src, RegisterID dst)
+    {
+        movdOpSimd(src, dst);
     }
 
 #ifdef JS_CODEGEN_X64
     void movq_rr(XMMRegisterID src, RegisterID dst)
     {
         spew("movq       %s, %s", nameFPReg(src), nameIReg(dst));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.twoByteOp64(OP2_MOVD_EdVd, dst, (RegisterID)src);
@@ -4343,17 +4259,17 @@ public:
 private:
     // Methods for encoding SIMD instructions via either legacy SSE encoding or
     // VEX encoding.
 
     bool useLegacySSEEncoding(XMMRegisterID src0, XMMRegisterID dst)
     {
         // If we don't have AVX or it's disabled, use the legacy SSE encoding.
         if (!useVEX_) {
-            MOZ_ASSERT(src0 == dst);
+            MOZ_ASSERT(src0 == X86Registers::invalid_xmm || src0 == dst);
             return true;
         }
 
         // If src0 is the same as the output register, we might as well use
         // the legacy SSE encoding, since it is smaller. However, this is only
         // beneficial as long as we're not using ymm registers anywhere.
         return src0 == dst;
     }
@@ -4367,16 +4283,21 @@ private:
             MOZ_ASSERT(src0 == dst);
             MOZ_ASSERT(mask == X86Registers::xmm0);
             return true;
         }
 
         return src0 == dst && mask == X86Registers::xmm0;
     }
 
+    bool useLegacySSEEncodingForOtherOutput()
+    {
+        return !useVEX_;
+    }
+
     const char *legacySSEOpName(const char *name)
     {
         MOZ_ASSERT(name[0] == 'v');
         return name + 1;
     }
 
 #ifdef JS_CODEGEN_X64
     void twoByteRipOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
@@ -4467,16 +4388,76 @@ private:
             m_formatter.twoByteOp(opcode, address, dst);
             return;
         }
 
         spew("%-11s%p, %s, %s", name, address, nameFPReg(src0), nameFPReg(dst));
         m_formatter.twoByteOpVex(ty, opcode, address, src0, dst);
     }
 
+    void twoByteOpInt32Simd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
+                            RegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncoding(src0, dst)) {
+            spew("%-11s%s, %s", legacySSEOpName(name), nameIReg(4, rm), nameFPReg(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.twoByteOp(opcode, rm, dst);
+            return;
+        }
+
+        spew("%-11s%s, %s, %s", name, nameIReg(4, rm), nameFPReg(src0), nameFPReg(dst));
+        m_formatter.twoByteOpVex(ty, opcode, rm, src0, dst);
+    }
+
+#ifdef JS_CODEGEN_X64
+    void twoByteOpInt64Simd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
+                            RegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncoding(src0, dst)) {
+            spew("%-11s%s, %s", legacySSEOpName(name), nameIReg(rm), nameFPReg(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.twoByteOp64(opcode, rm, dst);
+            return;
+        }
+
+        spew("%-11s%s, %s, %s", name, nameIReg(rm), nameFPReg(src0), nameFPReg(dst));
+        m_formatter.twoByteOpVex64(ty, opcode, rm, src0, dst);
+    }
+#endif
+
+    void twoByteOpSimdInt32(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
+                            XMMRegisterID rm, RegisterID dst)
+    {
+        if (useLegacySSEEncodingForOtherOutput()) {
+            spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(rm), nameIReg(4, dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.twoByteOp(opcode, (RegisterID)rm, dst);
+            return;
+        }
+
+        spew("%-11s%s, %s", name, nameFPReg(rm), nameIReg(4, dst));
+        m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, X86Registers::invalid_xmm, dst);
+    }
+
+#ifdef JS_CODEGEN_X64
+    void twoByteOpSimdInt64(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
+                            XMMRegisterID rm, RegisterID dst)
+    {
+        if (useLegacySSEEncodingForOtherOutput()) {
+            spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(rm), nameIReg(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.twoByteOp64(opcode, (RegisterID)rm, dst);
+            return;
+        }
+
+        spew("%-11s%s, %s", name, nameFPReg(rm), nameIReg(dst));
+        m_formatter.twoByteOpVex64(ty, opcode, (RegisterID)rm, X86Registers::invalid_xmm, (XMMRegisterID)dst);
+    }
+#endif
+
     void threeByteOpSimd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
                          ThreeByteEscape escape,
                          XMMRegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
             spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(rm), nameFPReg(dst));
             m_formatter.legacySSEPrefix(ty);
             m_formatter.threeByteOp(opcode, escape, (RegisterID)rm, dst);
@@ -4537,31 +4518,43 @@ private:
         spew("vblendvps  %s, %s0x%x(%s), %s, %s",
              nameFPReg(mask), PRETTY_PRINT_OFFSET(offset), nameIReg(base),
              nameFPReg(src0), nameFPReg(dst));
         // Even though a "ps" instruction, vblendv is encoded with the "pd" prefix.
         m_formatter.vblendvOpVex(VEX_PD, OP3_VBLENDVPS_VdqWdq, ESCAPE_VBLENDVPS,
                                  mask, offset, base, src0, dst);
     }
 
-#ifdef JS_CODEGEN_X64
-    void twoByteOpSimd64(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
-                         XMMRegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
-    {
-        if (useLegacySSEEncoding(src0, dst)) {
-            spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(rm), nameFPReg(src0));
-            m_formatter.legacySSEPrefix(ty);
-            m_formatter.twoByteOp64(opcode, (RegisterID)rm, src0);
+    // XMM-to-GPR movd is a two-byte op, but the operands are encoded in reverse
+    // order, so we handle it specially.
+    void movdOpSimd(XMMRegisterID src, RegisterID dst) {
+        if (useLegacySSEEncodingForOtherOutput()) {
+            spew("movd       %s, %s", nameFPReg(src), nameIReg(4, dst));
+            m_formatter.prefix(PRE_SSE_66);
+            m_formatter.twoByteOp(OP2_MOVD_EdVd, dst, (RegisterID)src);
             return;
         }
 
-        spew("%-11s%s, %s, %s", name, nameFPReg(rm), nameFPReg(src0), nameFPReg(dst));
-        m_formatter.twoByteOpVex64(ty, opcode, (RegisterID)rm, src0, dst);
-    }
-#endif
+        spew("vmovd      %s, %s", nameFPReg(src), nameIReg(4, dst));
+        m_formatter.twoByteOpVex(VEX_PD, OP2_MOVD_EdVd, dst, X86Registers::invalid_xmm, src);
+    }
+
+    // GPR-to-XMM movd is a two-byte op, but it doesn't have an extra XMM
+    // input, so we handle it specially.
+    void movdOpSimd(RegisterID src, XMMRegisterID dst) {
+        if (useLegacySSEEncodingForOtherOutput()) {
+            spew("movd       %s, %s", nameIReg(4, src), nameFPReg(dst));
+            m_formatter.prefix(PRE_SSE_66);
+            m_formatter.twoByteOp(OP2_MOVD_VdEd, src, dst);
+            return;
+        }
+
+        spew("vmovd      %s, %s", nameIReg(4, src), nameFPReg(dst));
+        m_formatter.twoByteOpVex(VEX_PD, OP2_MOVD_VdEd, src, X86Registers::invalid_xmm, dst);
+    }
 
     static int32_t getInt32(void* where)
     {
         return reinterpret_cast<int32_t*>(where)[-1];
     }
 
     class X86InstructionFormatter {
 
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -2085,17 +2085,17 @@ CodeGeneratorX86Shared::visitFloat32x4To
 
 void
 CodeGeneratorX86Shared::visitSimdValueInt32x4(LSimdValueInt32x4 *ins)
 {
     MOZ_ASSERT(ins->mir()->type() == MIRType_Int32x4);
 
     FloatRegister output = ToFloatRegister(ins->output());
     if (AssemblerX86Shared::HasSSE41()) {
-        masm.movd(ToRegister(ins->getOperand(0)), output);
+        masm.vmovd(ToRegister(ins->getOperand(0)), output);
         for (size_t i = 1; i < 4; ++i) {
             Register r = ToRegister(ins->getOperand(i));
             masm.pinsrd(i, r, output);
         }
         return;
     }
 
     masm.reserveStack(Simd128DataSize);
@@ -2114,34 +2114,34 @@ CodeGeneratorX86Shared::visitSimdValueFl
 
     FloatRegister r0 = ToFloatRegister(ins->getOperand(0));
     MOZ_ASSERT(r0 == ToFloatRegister(ins->output())); // defineReuseInput(0)
 
     FloatRegister r1 = ToFloatRegister(ins->getTemp(0));
     FloatRegister r2 = ToFloatRegister(ins->getOperand(2));
     FloatRegister r3 = ToFloatRegister(ins->getOperand(3));
 
-    masm.unpcklps(r3, r1);
-    masm.unpcklps(r2, r0);
-    masm.unpcklps(r1, r0);
+    masm.vunpcklps(r3, r1, r1);
+    masm.vunpcklps(r2, r0, r0);
+    masm.vunpcklps(r1, r0, r0);
 }
 
 void
 CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4 *ins)
 {
     FloatRegister output = ToFloatRegister(ins->output());
 
     MSimdSplatX4 *mir = ins->mir();
     MOZ_ASSERT(IsSimdType(mir->type()));
     JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
 
     switch (mir->type()) {
       case MIRType_Int32x4: {
         Register r = ToRegister(ins->getOperand(0));
-        masm.movd(r, output);
+        masm.vmovd(r, output);
         masm.pshufd(0, output, output);
         break;
       }
       case MIRType_Float32x4: {
         FloatRegister r = ToFloatRegister(ins->getOperand(0));
         MOZ_ASSERT(r == output);
         masm.shufps(0, r, output);
         break;
@@ -2195,18 +2195,18 @@ CodeGeneratorX86Shared::visitSimdInsertE
 {
     FloatRegister vector = ToFloatRegister(ins->vector());
     Register value = ToRegister(ins->value());
     FloatRegister output = ToFloatRegister(ins->output());
     MOZ_ASSERT(vector == output); // defineReuseInput(0)
 
     unsigned component = unsigned(ins->lane());
 
-    // Note that, contrarily to float32x4, we cannot use movd if the inserted
-    // value goes into the first component, as movd clears out the higher lanes
+    // Note that, contrary to float32x4, we cannot use vmovd if the inserted
+    // value goes into the first component, as vmovd clears out the higher lanes
     // of the output.
     if (AssemblerX86Shared::HasSSE41()) {
         masm.pinsrd(component, value, output);
         return;
     }
 
     masm.reserveStack(Simd128DataSize);
     masm.storeAlignedInt32x4(vector, Address(StackPointer, 0));
@@ -2302,24 +2302,24 @@ CodeGeneratorX86Shared::visitSimdSwizzle
 
     if (ins->lanesMatch(0, 1, 0, 1)) {
         masm.movaps(input, output);
         masm.movlhps(input, output);
         return;
     }
 
     if (ins->lanesMatch(0, 0, 1, 1)) {
-        masm.movaps(input, output);
-        masm.unpcklps(input, output);
+        FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
+        masm.vunpcklps(input, inputCopy, output);
         return;
     }
 
     if (ins->lanesMatch(2, 2, 3, 3)) {
-        masm.movaps(input, output);
-        masm.unpckhps(input, output);
+        FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
+        masm.vunpckhps(input, inputCopy, output);
         return;
     }
 
     uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
     masm.shuffleFloat32(mask, input, output);
 }
 
 void
@@ -2454,38 +2454,46 @@ CodeGeneratorX86Shared::visitSimdShuffle
     }
 
     if (ins->lanesMatch(0, 1, 4, 5)) {
         masm.movlhps(rhs, lhs);
         return;
     }
 
     if (ins->lanesMatch(0, 4, 1, 5)) {
-        masm.unpcklps(rhs, lhs);
+        masm.vunpcklps(rhs, lhs, lhs);
         return;
     }
 
     // TODO swapped case would be better (bug 1084404)
     if (ins->lanesMatch(4, 0, 5, 1)) {
-        masm.movaps(rhs, ScratchSimdReg);
-        masm.unpcklps(lhs, ScratchSimdReg);
-        masm.movaps(ScratchSimdReg, out);
+        if (AssemblerX86Shared::HasAVX()) {
+            masm.vunpcklps(lhs, rhs, out);
+        } else {
+            masm.movaps(rhs, ScratchSimdReg);
+            masm.vunpcklps(lhs, ScratchSimdReg, ScratchSimdReg);
+            masm.movaps(ScratchSimdReg, out);
+        }
         return;
     }
 
     if (ins->lanesMatch(2, 6, 3, 7)) {
-        masm.unpckhps(rhs, lhs);
+        masm.vunpckhps(rhs, lhs, lhs);
         return;
     }
 
     // TODO swapped case would be better (bug 1084404)
     if (ins->lanesMatch(6, 2, 7, 3)) {
-        masm.movaps(rhs, ScratchSimdReg);
-        masm.unpckhps(lhs, ScratchSimdReg);
-        masm.movaps(ScratchSimdReg, out);
+        if (AssemblerX86Shared::HasAVX()) {
+            masm.vunpckhps(lhs, rhs, out);
+        } else {
+            masm.movaps(rhs, ScratchSimdReg);
+            masm.vunpckhps(lhs, ScratchSimdReg, ScratchSimdReg);
+            masm.movaps(ScratchSimdReg, out);
+        }
         return;
     }
 
     // In one shufps
     if (x < 4 && y < 4) {
         mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
         masm.shufps(mask, rhs, out);
         return;
@@ -2703,21 +2711,21 @@ CodeGeneratorX86Shared::visitSimdBinaryA
         FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, ScratchSimdReg);
         masm.vminps(Operand(lhs), rhsCopy, ScratchSimdReg);
         masm.vminps(rhs, lhs, output);
         masm.vorps(ScratchSimdReg, output, output); // NaN or'd with arbitrary bits is NaN
         return;
       }
       case MSimdBinaryArith::MinNum: {
         FloatRegister tmp = ToFloatRegister(ins->temp());
-        masm.loadConstantInt32x4(SimdConstant::SplatX4(int32_t(0x80000000)), ScratchSimdReg);
-        masm.movdqa(ScratchSimdReg, tmp);
+        masm.loadConstantInt32x4(SimdConstant::SplatX4(int32_t(0x80000000)), tmp);
 
         FloatRegister mask = ScratchSimdReg;
-        masm.pcmpeqd(Operand(lhs), mask);
+        FloatRegister tmpCopy = masm.reusedInputFloat32x4(tmp, ScratchSimdReg);
+        masm.vpcmpeqd(Operand(lhs), tmpCopy, mask);
         masm.vandps(tmp, mask, mask);
 
         FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp);
         masm.vminps(rhs, lhsCopy, tmp);
         masm.vorps(mask, tmp, tmp);
 
         FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask);
         masm.vcmpneqps(rhs, rhsCopy, mask);
@@ -2734,17 +2742,17 @@ CodeGeneratorX86Shared::visitSimdBinaryA
             masm.vandnps(Operand(tmp), mask, mask);
             masm.vorps(Operand(mask), output, output);
         }
         return;
       }
       case MSimdBinaryArith::MaxNum: {
         FloatRegister mask = ScratchSimdReg;
         masm.loadConstantInt32x4(SimdConstant::SplatX4(0), mask);
-        masm.pcmpeqd(Operand(lhs), mask);
+        masm.vpcmpeqd(Operand(lhs), mask, mask);
 
         FloatRegister tmp = ToFloatRegister(ins->temp());
         masm.loadConstantInt32x4(SimdConstant::SplatX4(int32_t(0x80000000)), tmp);
         masm.vandps(tmp, mask, mask);
 
         FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp);
         masm.vmaxps(rhs, lhsCopy, tmp);
         masm.vandnps(Operand(tmp), mask, mask);
@@ -2900,17 +2908,17 @@ CodeGeneratorX86Shared::visitSimdShift(L
             masm.packedUnsignedRightShiftByScalar(count, out);
             return;
         }
         MOZ_CRASH("unexpected SIMD bitwise op");
     }
 
     MOZ_ASSERT(val->isRegister());
     FloatRegister tmp = ScratchFloat32Reg;
-    masm.movd(ToRegister(val), tmp);
+    masm.vmovd(ToRegister(val), tmp);
 
     switch (ins->operation()) {
       case MSimdShift::lsh:
         masm.packedLeftShiftByScalar(tmp, out);
         return;
       case MSimdShift::rsh:
         masm.packedRightShiftByScalar(tmp, out);
         return;
--- a/js/src/jit/shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.h
@@ -95,27 +95,27 @@ class CodeGeneratorX86Shared : public Co
         masm.test32(lhs, rhs);
         bailoutIf(c, snapshot);
     }
     void bailoutIfFalseBool(Register reg, LSnapshot *snapshot) {
         masm.test32(reg, Imm32(0xFF));
         bailoutIf(Assembler::Zero, snapshot);
     }
     void bailoutCvttsd2si(FloatRegister src, Register dest, LSnapshot *snapshot) {
-        // cvttsd2si returns 0x80000000 on failure. Test for it by
+        // vcvttsd2si returns 0x80000000 on failure. Test for it by
         // subtracting 1 and testing overflow. The other possibility is to test
         // equality for INT_MIN after a comparison, but 1 costs fewer bytes to
         // materialize.
-        masm.cvttsd2si(src, dest);
+        masm.vcvttsd2si(src, dest);
         masm.cmp32(dest, Imm32(1));
         bailoutIf(Assembler::Overflow, snapshot);
     }
     void bailoutCvttss2si(FloatRegister src, Register dest, LSnapshot *snapshot) {
         // Same trick as explained in the above comment.
-        masm.cvttss2si(src, dest);
+        masm.vcvttss2si(src, dest);
         masm.cmp32(dest, Imm32(1));
         bailoutIf(Assembler::Overflow, snapshot);
     }
 
   protected:
     bool generatePrologue();
     bool generateEpilogue();
     bool generateOutOfLineCode();
--- a/js/src/jit/shared/MacroAssembler-x86-shared.cpp
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.cpp
@@ -117,17 +117,17 @@ MacroAssembler::clampDoubleToUint8(Float
     loadConstantDouble(0.5, ScratchDoubleReg);
     addDouble(ScratchDoubleReg, input);
 
     Label outOfRange;
 
     // Truncate to int32 and ensure the result <= 255. This relies on the
     // processor setting output to a value > 255 for doubles outside the int32
     // range (for instance 0x80000000).
-    cvttsd2si(input, output);
+    vcvttsd2si(input, output);
     branch32(Assembler::Above, output, Imm32(255), &outOfRange);
     {
         // Check if we had a tie.
         convertInt32ToDouble(output, ScratchDoubleReg);
         branchDouble(DoubleNotEqual, input, ScratchDoubleReg, &done);
 
         // It was a tie. Mask out the ones bit to get an even value.
         // See also js_TypedArray_uint8_clamp_double.
@@ -237,12 +237,12 @@ MacroAssemblerX86Shared::branchNegativeZ
 #endif
 }
 
 void
 MacroAssemblerX86Shared::branchNegativeZeroFloat32(FloatRegister reg,
                                                    Register scratch,
                                                    Label *label)
 {
-    movd(reg, scratch);
+    vmovd(reg, scratch);
     cmp32(scratch, Imm32(1));
     j(Overflow, label);
 }
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@@ -606,45 +606,45 @@ class MacroAssemblerX86Shared : public A
     void jump(Register reg) {
         jmp(Operand(reg));
     }
     void jump(const Address &addr) {
         jmp(Operand(addr));
     }
 
     void convertInt32ToDouble(Register src, FloatRegister dest) {
-        // cvtsi2sd and friends write only part of their output register, which
+        // vcvtsi2sd and friends write only part of their output register, which
         // causes slowdowns on out-of-order processors. Explicitly break
         // dependencies with vxorpd (and vxorps elsewhere), which are handled
         // specially in modern CPUs, for this purpose. See sections 8.14, 9.8,
         // 10.8, 12.9, 13.16, 14.14, and 15.8 of Agner's Microarchitecture
         // document.
         zeroDouble(dest);
-        cvtsi2sd(src, dest);
+        vcvtsi2sd(src, dest, dest);
     }
     void convertInt32ToDouble(const Address &src, FloatRegister dest) {
         convertInt32ToDouble(Operand(src), dest);
     }
     void convertInt32ToDouble(const Operand &src, FloatRegister dest) {
         // Clear the output register first to break dependencies; see above;
         zeroDouble(dest);
-        cvtsi2sd(Operand(src), dest);
+        vcvtsi2sd(Operand(src), dest, dest);
     }
     void convertInt32ToFloat32(Register src, FloatRegister dest) {
         // Clear the output register first to break dependencies; see above;
         zeroFloat32(dest);
-        cvtsi2ss(src, dest);
+        vcvtsi2ss(src, dest, dest);
     }
     void convertInt32ToFloat32(const Address &src, FloatRegister dest) {
         convertInt32ToFloat32(Operand(src), dest);
     }
     void convertInt32ToFloat32(const Operand &src, FloatRegister dest) {
         // Clear the output register first to break dependencies; see above;
         zeroFloat32(dest);
-        cvtsi2ss(src, dest);
+        vcvtsi2ss(src, dest, dest);
     }
     Condition testDoubleTruthy(bool truthy, FloatRegister reg) {
         zeroDouble(ScratchDoubleReg);
         ucomisd(reg, ScratchDoubleReg);
         return truthy ? NonZero : Zero;
     }
     void branchTestDoubleTruthy(bool truthy, FloatRegister reg, Label *label) {
         Condition cond = testDoubleTruthy(truthy, reg);
@@ -783,24 +783,24 @@ class MacroAssemblerX86Shared : public A
     void zeroDouble(FloatRegister reg) {
         vxorpd(reg, reg, reg);
     }
     void zeroFloat32(FloatRegister reg) {
         vxorps(reg, reg, reg);
     }
     void negateDouble(FloatRegister reg) {
         // From MacroAssemblerX86Shared::maybeInlineDouble
-        pcmpeqw(ScratchDoubleReg, ScratchDoubleReg);
+        vpcmpeqw(ScratchDoubleReg, ScratchDoubleReg, ScratchDoubleReg);
         psllq(Imm32(63), ScratchDoubleReg);
 
         // XOR the float in a float register with -0.0.
         vxorpd(ScratchDoubleReg, reg, reg); // s ^ 0x80000000000000
     }
     void negateFloat(FloatRegister reg) {
-        pcmpeqw(ScratchFloat32Reg, ScratchFloat32Reg);
+        vpcmpeqw(ScratchFloat32Reg, ScratchFloat32Reg, ScratchFloat32Reg);
         psllq(Imm32(31), ScratchFloat32Reg);
 
         // XOR the float in a float register with -0.0.
         vxorps(ScratchFloat32Reg, reg, reg); // s ^ 0x80000000
     }
     void addDouble(FloatRegister src, FloatRegister dest) {
         vaddsd(src, dest, dest);
     }
@@ -812,32 +812,32 @@ class MacroAssemblerX86Shared : public A
     }
     void divDouble(FloatRegister src, FloatRegister dest) {
         vdivsd(src, dest, dest);
     }
     void addFloat32(FloatRegister src, FloatRegister dest) {
         vaddss(src, dest, dest);
     }
     void convertFloat32ToDouble(FloatRegister src, FloatRegister dest) {
-        cvtss2sd(src, dest);
+        vcvtss2sd(src, dest, dest);
     }
     void convertDoubleToFloat32(FloatRegister src, FloatRegister dest) {
-        cvtsd2ss(src, dest);
+        vcvtsd2ss(src, dest, dest);
     }
 
     void convertFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest) {
         // TODO: Note that if the conversion failed (because the converted
         // result is larger than the maximum signed int32, or less than the
         // least signed int32, or NaN), this will return the undefined integer
         // value (0x8000000). Spec should define what to do in such cases. See
         // also bug 1068020.
-        cvttps2dq(src, dest);
+        vcvttps2dq(src, dest);
     }
     void convertInt32x4ToFloat32x4(FloatRegister src, FloatRegister dest) {
-        cvtdq2ps(src, dest);
+        vcvtdq2ps(src, dest);
     }
 
     void bitwiseAndX4(const Operand &src, FloatRegister dest) {
         // TODO Using the "ps" variant for all types incurs a domain crossing
         // penalty for integer types and double.
         vandps(src, dest, dest);
     }
     void bitwiseAndNotX4(const Operand &src, FloatRegister dest) {
@@ -888,39 +888,39 @@ class MacroAssemblerX86Shared : public A
     }
     void storeUnalignedInt32x4(FloatRegister src, const Address &dest) {
         movdqu(src, Operand(dest));
     }
     void storeUnalignedInt32x4(FloatRegister src, const Operand &dest) {
         movdqu(src, dest);
     }
     void packedEqualInt32x4(const Operand &src, FloatRegister dest) {
-        pcmpeqd(src, dest);
+        vpcmpeqd(src, dest, dest);
     }
     void packedGreaterThanInt32x4(const Operand &src, FloatRegister dest) {
-        pcmpgtd(src, dest);
+        vpcmpgtd(src, dest, dest);
     }
     void packedAddInt32(const Operand &src, FloatRegister dest) {
         vpaddd(src, dest, dest);
     }
     void packedSubInt32(const Operand &src, FloatRegister dest) {
         vpsubd(src, dest, dest);
     }
     void packedReciprocalFloat32x4(const Operand &src, FloatRegister dest) {
         // This function is an approximation of the result, this might need
         // fix up if the spec requires a given precision for this operation.
         // TODO See also bug 1068028.
-        rcpps(src, dest);
+        vrcpps(src, dest);
     }
     void packedReciprocalSqrtFloat32x4(const Operand &src, FloatRegister dest) {
         // TODO See comment above. See also bug 1068028.
-        rsqrtps(src, dest);
+        vrsqrtps(src, dest);
     }
     void packedSqrtFloat32x4(const Operand &src, FloatRegister dest) {
-        sqrtps(src, dest);
+        vsqrtps(src, dest);
     }
 
     void packedLeftShiftByScalar(FloatRegister src, FloatRegister dest) {
         pslld(src, dest);
     }
     void packedLeftShiftByScalar(Imm32 count, FloatRegister dest) {
         pslld(count, dest);
     }
@@ -994,17 +994,17 @@ class MacroAssemblerX86Shared : public A
         MOZ_ASSERT(r < 256);
         return r;
     }
 
     void shuffleInt32(uint32_t mask, FloatRegister src, FloatRegister dest) {
         pshufd(mask, src, dest);
     }
     void moveLowInt32(FloatRegister src, Register dest) {
-        movd(src, dest);
+        vmovd(src, dest);
     }
 
     void moveHighPairToLowPairFloat32(FloatRegister src, FloatRegister dest) {
         movhlps(src, dest);
     }
     void shuffleFloat32(uint32_t mask, FloatRegister src, FloatRegister dest) {
         // The shuffle instruction on x86 is such that it moves 2 words from
         // the dest and 2 words from the src operands. To simplify things, just
@@ -1017,30 +1017,30 @@ class MacroAssemblerX86Shared : public A
     }
     void shuffleMix(uint32_t mask, const Operand &src, FloatRegister dest) {
         // Note this uses shufps, which is a cross-domain penaly on CPU where it
         // applies, but that's the way clang and gcc do it.
         shufps(mask, src, dest);
     }
 
     void moveFloatAsDouble(Register src, FloatRegister dest) {
-        movd(src, dest);
-        cvtss2sd(dest, dest);
+        vmovd(src, dest);
+        vcvtss2sd(dest, dest, dest);
     }
     void loadFloatAsDouble(const Address &src, FloatRegister dest) {
         movss(src, dest);
-        cvtss2sd(dest, dest);
+        vcvtss2sd(dest, dest, dest);
     }
     void loadFloatAsDouble(const BaseIndex &src, FloatRegister dest) {
         movss(src, dest);
-        cvtss2sd(dest, dest);
+        vcvtss2sd(dest, dest, dest);
     }
     void loadFloatAsDouble(const Operand &src, FloatRegister dest) {
         loadFloat32(src, dest);
-        cvtss2sd(dest, dest);
+        vcvtss2sd(dest, dest, dest);
     }
     void loadFloat32(const Address &src, FloatRegister dest) {
         movss(src, dest);
     }
     void loadFloat32(const BaseIndex &src, FloatRegister dest) {
         movss(src, dest);
     }
     void loadFloat32(const Operand &src, FloatRegister dest) {
@@ -1083,35 +1083,35 @@ class MacroAssemblerX86Shared : public A
     // the given snapshot. This function overwrites the scratch float register.
     void convertDoubleToInt32(FloatRegister src, Register dest, Label *fail,
                               bool negativeZeroCheck = true)
     {
         // Check for -0.0
         if (negativeZeroCheck)
             branchNegativeZero(src, dest, fail);
 
-        cvttsd2si(src, dest);
-        cvtsi2sd(dest, ScratchDoubleReg);
+        vcvttsd2si(src, dest);
+        convertInt32ToDouble(dest, ScratchDoubleReg);
         ucomisd(ScratchDoubleReg, src);
         j(Assembler::Parity, fail);
         j(Assembler::NotEqual, fail);
 
     }
 
     // Checks whether a float32 is representable as a 32-bit integer. If so, the
     // integer is written to the output register. Otherwise, a bailout is taken to
     // the given snapshot. This function overwrites the scratch float register.
     void convertFloat32ToInt32(FloatRegister src, Register dest, Label *fail,
                                bool negativeZeroCheck = true)
     {
         // Check for -0.0
         if (negativeZeroCheck)
             branchNegativeZeroFloat32(src, dest, fail);
 
-        cvttss2si(src, dest);
+        vcvttss2si(src, dest);
         convertInt32ToFloat32(dest, ScratchFloat32Reg);
         ucomiss(ScratchFloat32Reg, src);
         j(Assembler::Parity, fail);
         j(Assembler::NotEqual, fail);
     }
 
     void clampIntToUint8(Register reg) {
         Label inRange;
@@ -1128,17 +1128,17 @@ class MacroAssemblerX86Shared : public A
         uint64_t u = mozilla::BitwiseCast<uint64_t>(d);
 
         // Loading zero with xor is specially optimized in hardware.
         if (u == 0) {
             zeroDouble(dest);
             return true;
         }
 
-        // It is also possible to load several common constants using pcmpeqw
+        // It is also possible to load several common constants using vpcmpeqw
         // to get all ones and then psllq and psrlq to get zeros at the ends,
         // as described in "13.4 Generating constants" of
         // "2. Optimizing subroutines in assembly language" by Agner Fog, and as
         // previously implemented here. However, with x86 and x64 both using
         // constant pool loads for double constants, this is probably only
         // worthwhile in cases where a load is likely to be delayed.
 
         return false;
@@ -1158,17 +1158,17 @@ class MacroAssemblerX86Shared : public A
     bool maybeInlineInt32x4(const SimdConstant &v, const FloatRegister &dest) {
         static const SimdConstant zero = SimdConstant::CreateX4(0, 0, 0, 0);
         static const SimdConstant minusOne = SimdConstant::CreateX4(-1, -1, -1, -1);
         if (v == zero) {
             zeroInt32x4(dest);
             return true;
         }
         if (v == minusOne) {
-            pcmpeqw(dest, dest);
+            vpcmpeqw(dest, dest, dest);
             return true;
         }
         return false;
     }
     bool maybeInlineFloat32x4(const SimdConstant &v, const FloatRegister &dest) {
         static const SimdConstant zero = SimdConstant::CreateX4(0.f, 0.f, 0.f, 0.f);
         if (v == zero) {
             // This won't get inlined if the SimdConstant v contains -0 in any
--- a/js/src/jit/x64/Assembler-x64.h
+++ b/js/src/jit/x64/Assembler-x64.h
@@ -742,27 +742,27 @@ class Assembler : public AssemblerX86Sha
     static size_t ToggledCallSize(uint8_t *code) {
         // Size of a call instruction.
         return 5;
     }
 
     // Do not mask shared implementations.
     using AssemblerX86Shared::call;
 
-    void cvttsd2sq(FloatRegister src, Register dest) {
-        masm.cvttsd2sq_rr(src.code(), dest.code());
+    void vcvttsd2sq(FloatRegister src, Register dest) {
+        masm.vcvttsd2sq_rr(src.code(), dest.code());
     }
-    void cvttss2sq(FloatRegister src, Register dest) {
-        masm.cvttss2sq_rr(src.code(), dest.code());
+    void vcvttss2sq(FloatRegister src, Register dest) {
+        masm.vcvttss2sq_rr(src.code(), dest.code());
     }
-    void cvtsq2sd(Register src, FloatRegister dest) {
-        masm.cvtsq2sd_rr(src.code(), dest.code());
+    void vcvtsq2sd(Register src1, FloatRegister src0, FloatRegister dest) {
+        masm.vcvtsq2sd_rr(src1.code(), src0.code(), dest.code());
     }
-    void cvtsq2ss(Register src, FloatRegister dest) {
-        masm.cvtsq2ss_rr(src.code(), dest.code());
+    void vcvtsq2ss(Register src1, FloatRegister src0, FloatRegister dest) {
+        masm.vcvtsq2ss_rr(src1.code(), src0.code(), dest.code());
     }
 };
 
 static inline void
 PatchJump(CodeLocationJump jump, CodeLocationLabel label)
 {
     if (X86Assembler::canRelinkJump(jump.raw(), label.raw())) {
         X86Assembler::setRel32(jump.raw(), label.raw());
--- a/js/src/jit/x64/CodeGenerator-x64.cpp
+++ b/js/src/jit/x64/CodeGenerator-x64.cpp
@@ -554,25 +554,25 @@ DispatchIonCache::initializeAddCacheStat
 }
 
 void
 CodeGeneratorX64::visitTruncateDToInt32(LTruncateDToInt32 *ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     Register output = ToRegister(ins->output());
 
-    // On x64, branchTruncateDouble uses cvttsd2sq. Unlike the x86
+    // On x64, branchTruncateDouble uses vcvttsd2sq. Unlike the x86
     // implementation, this should handle most doubles and we can just
     // call a stub if it fails.
     emitTruncateDouble(input, output, ins->mir());
 }
 
 void
 CodeGeneratorX64::visitTruncateFToInt32(LTruncateFToInt32 *ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     Register output = ToRegister(ins->output());
 
-    // On x64, branchTruncateFloat32 uses cvttss2sq. Unlike the x86
+    // On x64, branchTruncateFloat32 uses vcvttss2sq. Unlike the x86
     // implementation, this should handle most floats and we can just
     // call a stub if it fails.
     emitTruncateFloat32(input, output, ins->mir());
 }
--- a/js/src/jit/x64/MacroAssembler-x64.h
+++ b/js/src/jit/x64/MacroAssembler-x64.h
@@ -1273,28 +1273,28 @@ class MacroAssemblerX64 : public MacroAs
     void loadConstantFloat32(float f, FloatRegister dest);
   private:
     SimdData *getSimdData(const SimdConstant &v);
   public:
     void loadConstantInt32x4(const SimdConstant &v, FloatRegister dest);
     void loadConstantFloat32x4(const SimdConstant &v, FloatRegister dest);
 
     void branchTruncateDouble(FloatRegister src, Register dest, Label *fail) {
-        cvttsd2sq(src, dest);
+        vcvttsd2sq(src, dest);
 
-        // cvttsd2sq returns 0x8000000000000000 on failure. Test for it by
+        // vcvttsd2sq returns 0x8000000000000000 on failure. Test for it by
         // subtracting 1 and testing overflow (this avoids the need to
         // materialize that value in a register).
         cmpPtr(dest, Imm32(1));
         j(Assembler::Overflow, fail);
 
         movl(dest, dest); // Zero upper 32-bits.
     }
     void branchTruncateFloat32(FloatRegister src, Register dest, Label *fail) {
-        cvttss2sq(src, dest);
+        vcvttss2sq(src, dest);
 
         // Same trick as for Doubles
         cmpPtr(dest, Imm32(1));
         j(Assembler::Overflow, fail);
 
         movl(dest, dest); // Zero upper 32-bits.
     }
 
@@ -1343,21 +1343,21 @@ class MacroAssemblerX64 : public MacroAs
     template <typename T>
     void storeUnboxedValue(ConstantOrRegister value, MIRType valueType, const T &dest, MIRType slotType);
 
     void loadInstructionPointerAfterCall(Register dest) {
         loadPtr(Address(StackPointer, 0x0), dest);
     }
 
     void convertUInt32ToDouble(Register src, FloatRegister dest) {
-        cvtsq2sd(src, dest);
+        vcvtsq2sd(src, dest, dest);
     }
 
     void convertUInt32ToFloat32(Register src, FloatRegister dest) {
-        cvtsq2ss(src, dest);
+        vcvtsq2ss(src, dest, dest);
     }
 
     void inc64(AbsoluteAddress dest) {
         if (X86Assembler::isAddressImmediate(dest.addr)) {
             addPtr(Imm32(1), Operand(dest));
         } else {
             mov(ImmPtr(dest.addr), ScratchReg);
             addPtr(Imm32(1), Address(ScratchReg, 0));
--- a/js/src/jit/x86/CodeGenerator-x86.cpp
+++ b/js/src/jit/x86/CodeGenerator-x86.cpp
@@ -858,18 +858,18 @@ CodeGeneratorX86::visitOutOfLineTruncate
             masm.jmp(&skip);
 
             masm.bind(&positive);
             masm.loadConstantDouble(-4294967296.0, temp);
             masm.bind(&skip);
         }
 
         masm.addDouble(input, temp);
-        masm.cvttsd2si(temp, output);
-        masm.cvtsi2sd(output, ScratchDoubleReg);
+        masm.vcvttsd2si(temp, output);
+        masm.vcvtsi2sd(output, ScratchDoubleReg, ScratchDoubleReg);
 
         masm.ucomisd(ScratchDoubleReg, temp);
         masm.j(Assembler::Parity, &fail);
         masm.j(Assembler::Equal, ool->rejoin());
     }
 
     masm.bind(&fail);
     {
@@ -947,31 +947,31 @@ CodeGeneratorX86::visitOutOfLineTruncate
             masm.jmp(&skip);
 
             masm.bind(&positive);
             masm.loadConstantFloat32(-4294967296.f, temp);
             masm.bind(&skip);
         }
 
         masm.addFloat32(input, temp);
-        masm.cvttss2si(temp, output);
-        masm.cvtsi2ss(output, ScratchFloat32Reg);
+        masm.vcvttss2si(temp, output);
+        masm.vcvtsi2ss(output, ScratchFloat32Reg, ScratchFloat32Reg);
 
         masm.ucomiss(ScratchFloat32Reg, temp);
         masm.j(Assembler::Parity, &fail);
         masm.j(Assembler::Equal, ool->rejoin());
     }
 
     masm.bind(&fail);
     {
         saveVolatile(output);
 
         masm.push(input);
         masm.setupUnalignedABICall(1, output);
-        masm.cvtss2sd(input, input);
+        masm.vcvtss2sd(input, input, input);
         masm.passABIArg(input, MoveOp::DOUBLE);
 
         if (gen->compilingAsmJS())
             masm.callWithABI(AsmJSImm_ToInt32);
         else
             masm.callWithABI(JS_FUNC_TO_DATA_PTR(void *, js::ToInt32));
 
         masm.storeCallResult(output);
--- a/js/src/jit/x86/MacroAssembler-x86.h
+++ b/js/src/jit/x86/MacroAssembler-x86.h
@@ -862,22 +862,22 @@ class MacroAssemblerX86 : public MacroAs
     {
         MOZ_ASSERT(cond == Equal || cond == NotEqual);
         branchTestValue(cond, val, MagicValue(why), label);
     }
 
     // Note: this function clobbers the source register.
     void boxDouble(FloatRegister src, const ValueOperand &dest) {
         if (Assembler::HasSSE41()) {
-            movd(src, dest.payloadReg());
+            vmovd(src, dest.payloadReg());
             pextrd(1, src, dest.typeReg());
         } else {
-            movd(src, dest.payloadReg());
+            vmovd(src, dest.payloadReg());
             psrldq(Imm32(4), src);
-            movd(src, dest.typeReg());
+            vmovd(src, dest.typeReg());
         }
     }
     void boxNonDouble(JSValueType type, Register src, const ValueOperand &dest) {
         if (src != dest.payloadReg())
             movl(src, dest.payloadReg());
         movl(ImmType(type), dest.typeReg());
     }
 
@@ -899,38 +899,38 @@ class MacroAssemblerX86 : public MacroAs
     void unboxObject(const ValueOperand &src, Register dest) { unboxNonDouble(src, dest); }
     void unboxObject(const Address &src, Register dest) { unboxNonDouble(src, dest); }
     void unboxDouble(const Address &src, FloatRegister dest) {
         loadDouble(Operand(src), dest);
     }
     void unboxDouble(const ValueOperand &src, FloatRegister dest) {
         MOZ_ASSERT(dest != ScratchDoubleReg);
         if (Assembler::HasSSE41()) {
-            movd(src.payloadReg(), dest);
+            vmovd(src.payloadReg(), dest);
             pinsrd(1, src.typeReg(), dest);
         } else {
-            movd(src.payloadReg(), dest);
-            movd(src.typeReg(), ScratchDoubleReg);
-            unpcklps(ScratchDoubleReg, dest);
+            vmovd(src.payloadReg(), dest);
+            vmovd(src.typeReg(), ScratchDoubleReg);
+            vunpcklps(ScratchDoubleReg, dest, dest);
         }
     }
     void unboxDouble(const Operand &payload, const Operand &type,
                      Register scratch, FloatRegister dest) {
         MOZ_ASSERT(dest != ScratchDoubleReg);
         if (Assembler::HasSSE41()) {
             movl(payload, scratch);
-            movd(scratch, dest);
+            vmovd(scratch, dest);
             movl(type, scratch);
             pinsrd(1, scratch, dest);
         } else {
             movl(payload, scratch);
-            movd(scratch, dest);
+            vmovd(scratch, dest);
             movl(type, scratch);
-            movd(scratch, ScratchDoubleReg);
-            unpcklps(ScratchDoubleReg, dest);
+            vmovd(scratch, ScratchDoubleReg);
+            vunpcklps(ScratchDoubleReg, dest, dest);
         }
     }
     void unboxValue(const ValueOperand &src, AnyRegister dest) {
         if (dest.isFloat()) {
             Label notInt32, end;
             branchTestInt32(Assembler::NotEqual, src, &notInt32);
             convertInt32ToDouble(src.payloadReg(), dest.fpu());
             jump(&end);
@@ -991,28 +991,28 @@ class MacroAssemblerX86 : public MacroAs
     void loadConstantDouble(double d, FloatRegister dest);
     void addConstantDouble(double d, FloatRegister dest);
     void loadConstantFloat32(float f, FloatRegister dest);
     void addConstantFloat32(float f, FloatRegister dest);
     void loadConstantInt32x4(const SimdConstant &v, FloatRegister dest);
     void loadConstantFloat32x4(const SimdConstant &v, FloatRegister dest);
 
     void branchTruncateDouble(FloatRegister src, Register dest, Label *fail) {
-        cvttsd2si(src, dest);
+        vcvttsd2si(src, dest);
 
-        // cvttsd2si returns 0x80000000 on failure. Test for it by
+        // vcvttsd2si returns 0x80000000 on failure. Test for it by
         // subtracting 1 and testing overflow (this permits the use of a
         // smaller immediate field).
         cmp32(dest, Imm32(1));
         j(Assembler::Overflow, fail);
     }
     void branchTruncateFloat32(FloatRegister src, Register dest, Label *fail) {
-        cvttss2si(src, dest);
+        vcvttss2si(src, dest);
 
-        // cvttss2si returns 0x80000000 on failure. Test for it by
+        // vcvttss2si returns 0x80000000 on failure. Test for it by
         // subtracting 1 and testing overflow (this permits the use of a
         // smaller immediate field).
         cmp32(dest, Imm32(1));
         j(Assembler::Overflow, fail);
     }
 
     Condition testInt32Truthy(bool truthy, const ValueOperand &operand) {
         test32(operand.payloadReg(), operand.payloadReg());