Bug 1115752 - SpiderMonkey: VEX-encodings for store instructions r=jandem
author Dan Gohman <sunfish@mozilla.com>
Sun, 28 Dec 2014 07:04:13 -0800
changeset 221464 8623d3a5edbeb63de43c472bc50e930501729807
parent 221463 0ea34b180725246de813cc4f9846628980973e81
child 221465 736d53322a1d91210f65e66e8d5254ddd791a370
push id 12945
push user cbook@mozilla.com
push date Mon, 29 Dec 2014 15:20:49 +0000
treeherder b2g-inbound@eeea99fa576e
reviewers jandem
bugs 1115752
milestone 37.0a1
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/BaseAssembler-x86-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
js/src/jit/shared/MacroAssembler-x86-shared.h
js/src/jit/x64/Assembler-x64.h
js/src/jit/x64/MacroAssembler-x64.cpp
js/src/jit/x64/Trampoline-x64.cpp
js/src/jit/x86/Assembler-x86.h
js/src/jit/x86/CodeGenerator-x86.cpp
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -467,17 +467,17 @@ class AssemblerX86Shared : public Assemb
             MOZ_CRASH("unexpected operand kind");
         }
     }
 
     void xchgl(Register src, Register dest) {
         masm.xchgl_rr(src.code(), dest.code());
     }
 
-    // Eventually movapd should be overloaded to support loads and
+    // Eventually vmovapd should be overloaded to support loads and
     // stores too.
     void vmovapd(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vmovapd_rr(src.code(), dest.code());
     }
 
     void vmovaps(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
@@ -494,24 +494,24 @@ class AssemblerX86Shared : public Assemb
             break;
           case Operand::FPREG:
             masm.vmovaps_rr(src.fpu(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void movaps(FloatRegister src, const Operand &dest) {
+    void vmovaps(FloatRegister src, const Operand &dest) {
         MOZ_ASSERT(HasSSE2());
         switch (dest.kind()) {
           case Operand::MEM_REG_DISP:
-            masm.movaps_rm(src.code(), dest.disp(), dest.base());
+            masm.vmovaps_rm(src.code(), dest.disp(), dest.base());
             break;
           case Operand::MEM_SCALE:
-            masm.movaps_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
+            masm.vmovaps_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void vmovups(const Operand &src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src.kind()) {
@@ -520,60 +520,60 @@ class AssemblerX86Shared : public Assemb
             break;
           case Operand::MEM_SCALE:
             masm.vmovups_mr(src.disp(), src.base(), src.index(), src.scale(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void movups(FloatRegister src, const Operand &dest) {
+    void vmovups(FloatRegister src, const Operand &dest) {
         MOZ_ASSERT(HasSSE2());
         switch (dest.kind()) {
           case Operand::MEM_REG_DISP:
-            masm.movups_rm(src.code(), dest.disp(), dest.base());
+            masm.vmovups_rm(src.code(), dest.disp(), dest.base());
             break;
           case Operand::MEM_SCALE:
-            masm.movups_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
+            masm.vmovups_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
 
     // vmovsd is only provided in load/store form since the
     // register-to-register form has different semantics (it doesn't clobber
     // the whole output register) and isn't needed currently.
     void vmovsd(const Address &src, FloatRegister dest) {
         masm.vmovsd_mr(src.offset, src.base.code(), dest.code());
     }
     void vmovsd(const BaseIndex &src, FloatRegister dest) {
         masm.vmovsd_mr(src.offset, src.base.code(), src.index.code(), src.scale, dest.code());
     }
-    void movsd(FloatRegister src, const Address &dest) {
-        masm.movsd_rm(src.code(), dest.offset, dest.base.code());
+    void vmovsd(FloatRegister src, const Address &dest) {
+        masm.vmovsd_rm(src.code(), dest.offset, dest.base.code());
     }
-    void movsd(FloatRegister src, const BaseIndex &dest) {
-        masm.movsd_rm(src.code(), dest.offset, dest.base.code(), dest.index.code(), dest.scale);
+    void vmovsd(FloatRegister src, const BaseIndex &dest) {
+        masm.vmovsd_rm(src.code(), dest.offset, dest.base.code(), dest.index.code(), dest.scale);
     }
    // Unlike vmovsd, vmovss is also provided in a register-to-register form
    // (in addition to the load/store forms), but that form should only be
    // used in contexts where we specifically want to preserve, rather than
    // clear, the higher lanes of the FloatRegister.
     void vmovss(const Address &src, FloatRegister dest) {
         masm.vmovss_mr(src.offset, src.base.code(), dest.code());
     }
     void vmovss(const BaseIndex &src, FloatRegister dest) {
         masm.vmovss_mr(src.offset, src.base.code(), src.index.code(), src.scale, dest.code());
     }
-    void movss(FloatRegister src, const Address &dest) {
-        masm.movss_rm(src.code(), dest.offset, dest.base.code());
+    void vmovss(FloatRegister src, const Address &dest) {
+        masm.vmovss_rm(src.code(), dest.offset, dest.base.code());
     }
-    void movss(FloatRegister src, const BaseIndex &dest) {
-        masm.movss_rm(src.code(), dest.offset, dest.base.code(), dest.index.code(), dest.scale);
+    void vmovss(FloatRegister src, const BaseIndex &dest) {
+        masm.vmovss_rm(src.code(), dest.offset, dest.base.code(), dest.index.code(), dest.scale);
     }
     void vmovss(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         masm.vmovss_rr(src1.code(), src0.code(), dest.code());
     }
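
[Editorial note: a concrete sketch of the lane semantics described in the two comments above. The register contents are assumed, lanes listed low to high:]

    // Assume xmm1 = {1.0f, 2.0f, 3.0f, 4.0f} and xmm2 = {5.0f, 6.0f, 7.0f, 8.0f}.
    masm.vmovss(xmm1, xmm2, xmm0);  // xmm0 = {1.0f, 6.0f, 7.0f, 8.0f}:
                                    // lane X from xmm1, lanes Y/Z/W from xmm2.
    masm.vmovaps(xmm1, xmm0);       // xmm0 = {1.0f, 2.0f, 3.0f, 4.0f}: all lanes copied.

This merging behavior is why the scalar register-to-register forms take an explicit src0 operand, and why plain copies go through vmovaps/vmovapd instead.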
     void vmovdqu(const Operand &src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src.kind()) {
           case Operand::MEM_REG_DISP:
@@ -581,24 +581,24 @@ class AssemblerX86Shared : public Assemb
             break;
           case Operand::MEM_SCALE:
             masm.vmovdqu_mr(src.disp(), src.base(), src.index(), src.scale(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void movdqu(FloatRegister src, const Operand &dest) {
+    void vmovdqu(FloatRegister src, const Operand &dest) {
         MOZ_ASSERT(HasSSE2());
         switch (dest.kind()) {
           case Operand::MEM_REG_DISP:
-            masm.movdqu_rm(src.code(), dest.disp(), dest.base());
+            masm.vmovdqu_rm(src.code(), dest.disp(), dest.base());
             break;
           case Operand::MEM_SCALE:
-            masm.movdqu_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
+            masm.vmovdqu_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void vmovdqa(const Operand &src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src.kind()) {
@@ -610,24 +610,24 @@ class AssemblerX86Shared : public Assemb
             break;
           case Operand::MEM_SCALE:
             masm.vmovdqa_mr(src.disp(), src.base(), src.index(), src.scale(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void movdqa(FloatRegister src, const Operand &dest) {
+    void vmovdqa(FloatRegister src, const Operand &dest) {
         MOZ_ASSERT(HasSSE2());
         switch (dest.kind()) {
           case Operand::MEM_REG_DISP:
-            masm.movdqa_rm(src.code(), dest.disp(), dest.base());
+            masm.vmovdqa_rm(src.code(), dest.disp(), dest.base());
             break;
           case Operand::MEM_SCALE:
-            masm.movdqa_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
+            masm.vmovdqa_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void vmovdqa(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vmovdqa_rr(src.code(), dest.code());
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@@ -417,16 +417,29 @@ private:
         OP2_PORDQ_VdqWdq    = 0xEB,
         OP2_PXORDQ_VdqWdq   = 0xEF,
         OP2_PSLLD_VdqWdq    = 0xF2,
         OP2_PMULUDQ_VdqWdq  = 0xF4,
         OP2_PSUBD_VdqWdq    = 0xFA,
         OP2_PADDD_VdqWdq    = 0xFE
     };
 
+    // Test whether the given opcode should be printed with its operands reversed.
+    static inline bool IsXMMReversedOperands(TwoByteOpcodeID opcode) {
+        switch (opcode) {
+          case OP2_MOVSD_WsdVsd: // also OP2_MOVPS_WpsVps
+          case OP2_MOVAPS_WsdVsd:
+          case OP2_MOVDQ_WdqVdq:
+            return true;
+          default:
+            break;
+        }
+        return false;
+    }
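
[Editorial note: the effect on the disassembly spew, with illustrative operands. Load-form opcodes keep the usual memory-then-register AT&T order; store-form (reversed) opcodes print the register first:]

    // OP2_MOVSD_VsdWsd (load):   vmovsd 0x8(%rax), %xmm1
    // OP2_MOVSD_WsdVsd (store):  vmovsd %xmm1, 0x8(%rax)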
+
     enum ThreeByteOpcodeID {
         OP3_ROUNDSS_VsdWsd  = 0x0A,
         OP3_ROUNDSD_VsdWsd  = 0x0B,
         OP3_BLENDVPS_VdqWdq = 0x14,
         OP3_PEXTRD_EdVdqIb  = 0x16,
         OP3_BLENDPS_VpsWpsIb = 0x0C,
         OP3_PTEST_VdVd      = 0x17,
         OP3_INSERTPS_VpsUps = 0x21,
@@ -3114,66 +3127,54 @@ public:
     }
 
     void vmovq_rr(RegisterID src, XMMRegisterID dst)
     {
         twoByteOpInt64Simd("vmovq", VEX_PD, OP2_MOVD_VdEd, src, X86Registers::invalid_xmm, dst);
     }
 #endif
 
-    void movsd_rm(XMMRegisterID src, int32_t offset, RegisterID base)
-    {
-        spew("movsd      %s, " MEM_ob, nameFPReg(src), ADDR_ob(offset, base));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp(OP2_MOVSD_WsdVsd, offset, base, (RegisterID)src);
-    }
-
-    void movsd_rm_disp32(XMMRegisterID src, int32_t offset, RegisterID base)
-    {
-        spew("movsd      %s, " MEM_o32b, nameFPReg(src), ADDR_o32b(offset, base));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp_disp32(OP2_MOVSD_WsdVsd, offset, base, (RegisterID)src);
-    }
-
-    void movss_rm(XMMRegisterID src, int32_t offset, RegisterID base)
-    {
-        spew("movss      %s, " MEM_ob, nameFPReg(src), ADDR_ob(offset, base));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_MOVSD_WsdVsd, offset, base, (RegisterID)src);
-    }
-
-    void movss_rm_disp32(XMMRegisterID src, int32_t offset, RegisterID base)
-    {
-        spew("movss      %s, " MEM_o32b, nameFPReg(src), ADDR_o32b(offset, base));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp_disp32(OP2_MOVSD_WsdVsd, offset, base, (RegisterID)src);
+    void vmovsd_rm(XMMRegisterID src, int32_t offset, RegisterID base)
+    {
+        twoByteOpSimd("vmovsd", VEX_SD, OP2_MOVSD_WsdVsd, offset, base, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovsd_rm_disp32(XMMRegisterID src, int32_t offset, RegisterID base)
+    {
+        twoByteOpSimd_disp32("vmovsd", VEX_SD, OP2_MOVSD_WsdVsd, offset, base, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovss_rm(XMMRegisterID src, int32_t offset, RegisterID base)
+    {
+        twoByteOpSimd("vmovss", VEX_SS, OP2_MOVSD_WsdVsd, offset, base, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovss_rm_disp32(XMMRegisterID src, int32_t offset, RegisterID base)
+    {
+        twoByteOpSimd_disp32("vmovss", VEX_SS, OP2_MOVSD_WsdVsd, offset, base, X86Registers::invalid_xmm, src);
     }
 
     void vmovss_mr(int32_t offset, RegisterID base, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovss", VEX_SS, OP2_MOVSD_VsdWsd, offset, base, X86Registers::invalid_xmm, dst);
     }
 
     void vmovss_mr_disp32(int32_t offset, RegisterID base, XMMRegisterID dst)
     {
         twoByteOpSimd_disp32("vmovss", VEX_SS, OP2_MOVSD_VsdWsd, offset, base, X86Registers::invalid_xmm, dst);
     }
 
-    void movsd_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
-    {
-        spew("movsd      %s, " MEM_obs, nameFPReg(src), ADDR_obs(offset, base, index, scale));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp(OP2_MOVSD_WsdVsd, offset, base, index, scale, (RegisterID)src);
-    }
-
-    void movss_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
-    {
-        spew("movss      %s, " MEM_obs, nameFPReg(src), ADDR_obs(offset, base, index, scale));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_MOVSD_WsdVsd, offset, base, index, scale, (RegisterID)src);
+    void vmovsd_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
+    {
+        twoByteOpSimd("vmovsd", VEX_SD, OP2_MOVSD_WsdVsd, offset, base, index, scale, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovss_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
+    {
+        twoByteOpSimd("vmovss", VEX_SS, OP2_MOVSD_WsdVsd, offset, base, index, scale, X86Registers::invalid_xmm, src);
     }
 
     void vmovss_mr(int32_t offset, RegisterID base, RegisterID index, int scale, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovss", VEX_SS, OP2_MOVSD_VsdWsd, offset, base, index, scale, X86Registers::invalid_xmm, dst);
     }
 
     void vmovsd_mr(int32_t offset, RegisterID base, XMMRegisterID dst)
@@ -3186,26 +3187,26 @@ public:
         twoByteOpSimd_disp32("vmovsd", VEX_SD, OP2_MOVSD_VsdWsd, offset, base, X86Registers::invalid_xmm, dst);
     }
 
     void vmovsd_mr(int32_t offset, RegisterID base, RegisterID index, int scale, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovsd", VEX_SD, OP2_MOVSD_VsdWsd, offset, base, index, scale, X86Registers::invalid_xmm, dst);
     }
 
-    // Note that the register-to-register form of movsd does not write to the
+    // Note that the register-to-register form of vmovsd does not write to the
     // entire output register. For general-purpose register-to-register moves,
-    // use movapd instead.
+    // use vmovapd instead.
     void vmovsd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovsd", VEX_SD, OP2_MOVSD_VsdWsd, src1, src0, dst);
     }
 
-    // The register-to-register form of movss has the same problem as movsd
-    // above. Prefer movaps for register-to-register moves.
+    // The register-to-register form of vmovss has the same problem as vmovsd
+    // above. Prefer vmovaps for register-to-register moves.
     void vmovss_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovss", VEX_SS, OP2_MOVSD_VsdWsd, src1, src0, dst);
     }
 
     void vmovsd_mr(const void* address, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovsd", VEX_SD, OP2_MOVSD_VsdWsd, address, X86Registers::invalid_xmm, dst);
@@ -3221,136 +3222,104 @@ public:
         twoByteOpSimd("vmovups", VEX_PS, OP2_MOVPS_VpsWps, address, X86Registers::invalid_xmm, dst);
     }
 
     void vmovdqu_mr(const void* address, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovdqu", VEX_SS, OP2_MOVDQ_VdqWdq, address, X86Registers::invalid_xmm, dst);
     }
 
-    void movsd_rm(XMMRegisterID src, const void* address)
-    {
-        spew("movsd      %s, %p", nameFPReg(src), address);
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteOp(OP2_MOVSD_WsdVsd, address, (RegisterID)src);
-    }
-
-    void movss_rm(XMMRegisterID src, const void* address)
-    {
-        spew("movss      %s, %p", nameFPReg(src), address);
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_MOVSD_WsdVsd, address, (RegisterID)src);
-    }
-
-    void movdqa_rm(XMMRegisterID src, const void* address)
-    {
-        spew("movdqa     %s, %p", nameFPReg(src), address);
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_MOVDQ_WdqVdq, address, (RegisterID)src);
-    }
-
-    void movaps_rm(XMMRegisterID src, const void* address)
-    {
-        spew("movaps     %s, %p", nameFPReg(src), address);
-        m_formatter.twoByteOp(OP2_MOVAPS_WsdVsd, address, (RegisterID)src);
-    }
-
-    void movdqu_rm(XMMRegisterID src, const void* address)
-    {
-        spew("movdqu     %s, %p", nameFPReg(src), address);
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_MOVDQ_WdqVdq, address, (RegisterID)src);
-    }
-
-    void movups_rm(XMMRegisterID src, const void* address)
-    {
-        spew("movups     %s, %p", nameFPReg(src), address);
-        m_formatter.twoByteOp(OP2_MOVPS_WpsVps, address, (RegisterID)src);
+    void vmovsd_rm(XMMRegisterID src, const void* address)
+    {
+        twoByteOpSimd("vmovsd", VEX_SD, OP2_MOVSD_WsdVsd, address, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovss_rm(XMMRegisterID src, const void* address)
+    {
+        twoByteOpSimd("vmovss", VEX_SS, OP2_MOVSD_WsdVsd, address, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovdqa_rm(XMMRegisterID src, const void* address)
+    {
+        twoByteOpSimd("vmovdqa", VEX_PD, OP2_MOVDQ_WdqVdq, address, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovaps_rm(XMMRegisterID src, const void* address)
+    {
+        twoByteOpSimd("vmovaps", VEX_PS, OP2_MOVAPS_WsdVsd, address, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovdqu_rm(XMMRegisterID src, const void* address)
+    {
+        twoByteOpSimd("vmovdqu", VEX_SS, OP2_MOVDQ_WdqVdq, address, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovups_rm(XMMRegisterID src, const void* address)
+    {
+        twoByteOpSimd("vmovups", VEX_PS, OP2_MOVPS_WpsVps, address, X86Registers::invalid_xmm, src);
     }
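
[Editorial note on the X86Registers::invalid_xmm argument threaded through these store forms, a sketch based on the encoder's conventions; the emitter internals are outside this hunk. These are two-operand instructions with no second SIMD source, so there is nothing to put in the VEX vvvv field, which must then encode all-ones:]

    // vmovups %xmm2, (%rax)  ->  C5 F8 11 /r, with vvvv = 1111b (unused)

Passing invalid_xmm as src0 selects this two-operand form and the matching two-operand spew.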
 #ifdef JS_CODEGEN_X64
-    JmpSrc movsd_ripr(XMMRegisterID dst)
-    {
-        spew("movsd      ?(%%rip), %s", nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteRipOp(OP2_MOVSD_VsdWsd, 0, (RegisterID)dst);
-        return JmpSrc(m_formatter.size());
-    }
-    JmpSrc movss_ripr(XMMRegisterID dst)
-    {
-        spew("movss      ?(%%rip), %s", nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteRipOp(OP2_MOVSD_VsdWsd, 0, (RegisterID)dst);
-        return JmpSrc(m_formatter.size());
-    }
-    JmpSrc movsd_rrip(XMMRegisterID src)
-    {
-        spew("movsd      %s, ?(%%rip)", nameFPReg(src));
-        m_formatter.prefix(PRE_SSE_F2);
-        m_formatter.twoByteRipOp(OP2_MOVSD_WsdVsd, 0, (RegisterID)src);
-        return JmpSrc(m_formatter.size());
-    }
-    JmpSrc movss_rrip(XMMRegisterID src)
-    {
-        spew("movss      %s, ?(%%rip)", nameFPReg(src));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteRipOp(OP2_MOVSD_WsdVsd, 0, (RegisterID)src);
-        return JmpSrc(m_formatter.size());
-    }
-    JmpSrc movdqa_rrip(XMMRegisterID src)
-    {
-        spew("movdqa     %s, ?(%%rip)", nameFPReg(src));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteRipOp(OP2_MOVDQ_WdqVdq, 0, (RegisterID)src);
-        return JmpSrc(m_formatter.size());
-    }
-    JmpSrc movaps_rrip(XMMRegisterID src)
-    {
-        spew("movaps     %s, ?(%%rip)", nameFPReg(src));
-        m_formatter.twoByteRipOp(OP2_MOVPS_WpsVps, 0, (RegisterID)src);
-        return JmpSrc(m_formatter.size());
+    JmpSrc vmovsd_ripr(XMMRegisterID dst)
+    {
+        return twoByteRipOpSimd("vmovsd", VEX_SD, OP2_MOVSD_VsdWsd, 0, X86Registers::invalid_xmm, dst);
+    }
+    JmpSrc vmovss_ripr(XMMRegisterID dst)
+    {
+        return twoByteRipOpSimd("vmovss", VEX_SS, OP2_MOVSD_VsdWsd, 0, X86Registers::invalid_xmm, dst);
+    }
+    JmpSrc vmovsd_rrip(XMMRegisterID src)
+    {
+        return twoByteRipOpSimd("vmovsd", VEX_SD, OP2_MOVSD_WsdVsd, 0, X86Registers::invalid_xmm, src);
+    }
+    JmpSrc vmovss_rrip(XMMRegisterID src)
+    {
+        return twoByteRipOpSimd("vmovss", VEX_SS, OP2_MOVSD_WsdVsd, 0, X86Registers::invalid_xmm, src);
+    }
+    JmpSrc vmovdqa_rrip(XMMRegisterID src)
+    {
+        return twoByteRipOpSimd("vmovdqa", VEX_PD, OP2_MOVDQ_WdqVdq, 0, X86Registers::invalid_xmm, src);
+    }
+    JmpSrc vmovaps_rrip(XMMRegisterID src)
+    {
+        return twoByteRipOpSimd("vmovaps", VEX_PS, OP2_MOVAPS_WsdVsd, 0, X86Registers::invalid_xmm, src);
     }
 #endif
 
     void vmovaps_rr(XMMRegisterID src, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovaps", VEX_PS, OP2_MOVAPS_VsdWsd, src, X86Registers::invalid_xmm, dst);
     }
-    void movaps_rm(XMMRegisterID src, int32_t offset, RegisterID base)
-    {
-        spew("movaps     %s, " MEM_ob, nameFPReg(src), ADDR_ob(offset, base));
-        m_formatter.twoByteOp(OP2_MOVAPS_WsdVsd, offset, base, (RegisterID)src);
-    }
-    void movaps_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
-    {
-        spew("movaps     %s, " MEM_obs, nameFPReg(src), ADDR_obs(offset, base, index, scale));
-        m_formatter.twoByteOp(OP2_MOVAPS_WsdVsd, offset, base, index, scale, (RegisterID)src);
+    void vmovaps_rm(XMMRegisterID src, int32_t offset, RegisterID base)
+    {
+        twoByteOpSimd("vmovaps", VEX_PS, OP2_MOVAPS_WsdVsd, offset, base, X86Registers::invalid_xmm, src);
+    }
+    void vmovaps_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
+    {
+        twoByteOpSimd("vmovaps", VEX_PS, OP2_MOVAPS_WsdVsd, offset, base, index, scale, X86Registers::invalid_xmm, src);
     }
     void vmovaps_mr(int32_t offset, RegisterID base, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovaps", VEX_PS, OP2_MOVAPS_VsdWsd, offset, base, X86Registers::invalid_xmm, dst);
     }
     void vmovaps_mr(int32_t offset, RegisterID base, RegisterID index, int scale, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovaps", VEX_PS, OP2_MOVAPS_VsdWsd, offset, base, index, scale, X86Registers::invalid_xmm, dst);
     }
 
-    void movups_rm(XMMRegisterID src, int32_t offset, RegisterID base)
-    {
-        spew("movups     %s, " MEM_ob, nameFPReg(src), ADDR_ob(offset, base));
-        m_formatter.twoByteOp(OP2_MOVPS_WpsVps, offset, base, (RegisterID)src);
-    }
-    void movups_rm_disp32(XMMRegisterID src, int32_t offset, RegisterID base)
-    {
-        spew("movups     %s, " MEM_o32b, nameFPReg(src), ADDR_o32b(offset, base));
-        m_formatter.twoByteOp_disp32(OP2_MOVPS_WpsVps, offset, base, (RegisterID)src);
-    }
-    void movups_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
-    {
-        spew("movups     %s, " MEM_obs, nameFPReg(src), ADDR_obs(offset, base, index, scale));
-        m_formatter.twoByteOp(OP2_MOVPS_WpsVps, offset, base, index, scale, (RegisterID)src);
+    void vmovups_rm(XMMRegisterID src, int32_t offset, RegisterID base)
+    {
+        twoByteOpSimd("vmovups", VEX_PS, OP2_MOVPS_WpsVps, offset, base, X86Registers::invalid_xmm, src);
+    }
+    void vmovups_rm_disp32(XMMRegisterID src, int32_t offset, RegisterID base)
+    {
+        twoByteOpSimd_disp32("vmovups", VEX_PS, OP2_MOVPS_WpsVps, offset, base, X86Registers::invalid_xmm, src);
+    }
+    void vmovups_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
+    {
+        twoByteOpSimd("vmovups", VEX_PS, OP2_MOVPS_WpsVps, offset, base, index, scale, X86Registers::invalid_xmm, src);
     }
     void vmovups_mr(int32_t offset, RegisterID base, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovups", VEX_PS, OP2_MOVPS_VpsWps, offset, base, X86Registers::invalid_xmm, dst);
     }
     void vmovups_mr_disp32(int32_t offset, RegisterID base, XMMRegisterID dst)
     {
         twoByteOpSimd_disp32("vmovups", VEX_PS, OP2_MOVPS_VpsWps, offset, base, X86Registers::invalid_xmm, dst);
@@ -3361,61 +3330,50 @@ public:
     }
 
     void vmovapd_rr(XMMRegisterID src, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovapd", VEX_PD, OP2_MOVAPD_VsdWsd, src, X86Registers::invalid_xmm, dst);
     }
 
 #ifdef JS_CODEGEN_X64
-    JmpSrc movaps_ripr(XMMRegisterID dst)
-    {
-        spew("movaps     ?(%%rip), %s", nameFPReg(dst));
-        m_formatter.twoByteRipOp(OP2_MOVAPS_VsdWsd, 0, (RegisterID)dst);
-        return JmpSrc(m_formatter.size());
-    }
-
-    JmpSrc movdqa_ripr(XMMRegisterID dst)
-    {
-        spew("movdqa     ?(%%rip), %s", nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteRipOp(OP2_MOVDQ_VdqWdq, 0, (RegisterID)dst);
-        return JmpSrc(m_formatter.size());
+    JmpSrc vmovaps_ripr(XMMRegisterID dst)
+    {
+        return twoByteRipOpSimd("vmovaps", VEX_PS, OP2_MOVAPS_VsdWsd, 0, X86Registers::invalid_xmm, dst);
+    }
+
+    JmpSrc vmovdqa_ripr(XMMRegisterID dst)
+    {
+        return twoByteRipOpSimd("vmovdqa", VEX_PD, OP2_MOVDQ_VdqWdq, 0, X86Registers::invalid_xmm, dst);
     }
 #else
     void vmovaps_mr(const void* address, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovaps", VEX_PS, OP2_MOVAPS_VsdWsd, address, X86Registers::invalid_xmm, dst);
     }
 
     void vmovdqa_mr(const void* address, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovdqa", VEX_PD, OP2_MOVDQ_VdqWdq, address, X86Registers::invalid_xmm, dst);
     }
 #endif // JS_CODEGEN_X64
 
-    void movdqu_rm(XMMRegisterID src, int32_t offset, RegisterID base)
-    {
-        spew("movdqu     %s, " MEM_ob, nameFPReg(src), ADDR_ob(offset, base));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_MOVDQ_WdqVdq, offset, base, (RegisterID)src);
-    }
-
-    void movdqu_rm_disp32(XMMRegisterID src, int32_t offset, RegisterID base)
-    {
-        spew("movdqu     %s, " MEM_o32b, nameFPReg(src), ADDR_o32b(offset, base));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp_disp32(OP2_MOVDQ_WdqVdq, offset, base, (RegisterID)src);
-    }
-
-    void movdqu_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
-    {
-        spew("movdqu     %s, " MEM_obs, nameFPReg(src), ADDR_obs(offset, base, index, scale));
-        m_formatter.prefix(PRE_SSE_F3);
-        m_formatter.twoByteOp(OP2_MOVDQ_WdqVdq, offset, base, index, scale, (RegisterID)src);
+    void vmovdqu_rm(XMMRegisterID src, int32_t offset, RegisterID base)
+    {
+        twoByteOpSimd("vmovdqu", VEX_SS, OP2_MOVDQ_WdqVdq, offset, base, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovdqu_rm_disp32(XMMRegisterID src, int32_t offset, RegisterID base)
+    {
+        twoByteOpSimd_disp32("vmovdqu", VEX_SS, OP2_MOVDQ_WdqVdq, offset, base, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovdqu_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
+    {
+        twoByteOpSimd("vmovdqu", VEX_SS, OP2_MOVDQ_WdqVdq, offset, base, index, scale, X86Registers::invalid_xmm, src);
     }
 
     void vmovdqu_mr(int32_t offset, RegisterID base, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovdqu", VEX_SS, OP2_MOVDQ_VdqWdq, offset, base, X86Registers::invalid_xmm, dst);
     }
 
     void vmovdqu_mr_disp32(int32_t offset, RegisterID base, XMMRegisterID dst)
@@ -3428,28 +3386,24 @@ public:
         twoByteOpSimd("vmovdqu", VEX_SS, OP2_MOVDQ_VdqWdq, offset, base, index, scale, X86Registers::invalid_xmm, dst);
     }
 
     void vmovdqa_rr(XMMRegisterID src, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovdqa", VEX_PD, OP2_MOVDQ_VdqWdq, src, X86Registers::invalid_xmm, dst);
     }
 
-    void movdqa_rm(XMMRegisterID src, int32_t offset, RegisterID base)
-    {
-        spew("movdqa     %s, " MEM_ob, nameFPReg(src), ADDR_ob(offset, base));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_MOVDQ_WdqVdq, offset, base, (RegisterID)src);
-    }
-
-    void movdqa_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
-    {
-        spew("movdqa     %s, " MEM_obs, nameFPReg(src), ADDR_obs(offset, base, index, scale));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_MOVDQ_WdqVdq, offset, base, index, scale, (RegisterID)src);
+    void vmovdqa_rm(XMMRegisterID src, int32_t offset, RegisterID base)
+    {
+        twoByteOpSimd("vmovdqa", VEX_PD, OP2_MOVDQ_WdqVdq, offset, base, X86Registers::invalid_xmm, src);
+    }
+
+    void vmovdqa_rm(XMMRegisterID src, int32_t offset, RegisterID base, RegisterID index, int scale)
+    {
+        twoByteOpSimd("vmovdqa", VEX_PD, OP2_MOVDQ_WdqVdq, offset, base, index, scale, X86Registers::invalid_xmm, src);
     }
 
     void vmovdqa_mr(int32_t offset, RegisterID base, XMMRegisterID dst)
    {
        twoByteOpSimd("vmovdqa", VEX_PD, OP2_MOVDQ_VdqWdq, offset, base, X86Registers::invalid_xmm, dst);
     }
 
@@ -4148,189 +4102,273 @@ private:
     }
 
     const char *legacySSEOpName(const char *name)
     {
         MOZ_ASSERT(name[0] == 'v');
         return name + 1;
     }
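
[Editorial note: a trivial usage sketch of this helper:]

    const char *n = legacySSEOpName("vmovaps");  // n == "movaps"

The legacy-SSE spew simply drops the leading 'v' from the VEX mnemonic.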
 
-#ifdef JS_CODEGEN_X64
-    void twoByteRipOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
-                          int ripOffset, XMMRegisterID src0, XMMRegisterID dst)
+#ifdef JS_CODEGEN_X64
+    JmpSrc twoByteRipOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
+                            int ripOffset, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
-            spew("%-11s?%+d(%%rip), %s", legacySSEOpName(name), ripOffset, nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode)) {
+                spew("%-11s%s, ?%+d(%%rip)", legacySSEOpName(name), nameFPReg(dst), ripOffset);
+            } else {
+                spew("%-11s?%+d(%%rip), %s", legacySSEOpName(name), ripOffset, nameFPReg(dst));
+            }
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteRipOp(opcode, ripOffset, dst);
-            return;
+            return JmpSrc(m_formatter.size());
         }
 
-        spew("%-11s?%+d(%%rip), %s, %s", name, ripOffset, nameFPReg(src0), nameFPReg(dst));
+        if (src0 == X86Registers::invalid_xmm) {
+            if (IsXMMReversedOperands(opcode)) {
+                spew("%-11s%s, ?%+d(%%rip)", name, nameFPReg(dst), ripOffset);
+            } else {
+                spew("%-11s?%+d(%%rip), %s", name, ripOffset, nameFPReg(dst));
+            }
+        } else {
+            spew("%-11s?%+d(%%rip), %s, %s", name, ripOffset, nameFPReg(src0), nameFPReg(dst));
+        }
         m_formatter.twoByteRipOpVex(ty, opcode, ripOffset, src0, dst);
+        return JmpSrc(m_formatter.size());
     }
 #endif
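
[Editorial note: the JmpSrc returned above records the offset just past the emitted instruction, i.e. just past its 32-bit rip-relative displacement, so callers such as loadConstantDouble() can patch the displacement later. The two branches emit the same operation under different encodings; for the vmovsd store case, sketched byte-wise (standard x86 encodings, not taken from this patch):]

    // Legacy SSE:  movsd %xmm1, ?(%rip)   ->  F2 0F 11 /r
    // VEX:         vmovsd %xmm1, ?(%rip)  ->  C5 FB 11 /r (vvvv = 1111b)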
 
     void twoByteOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                        XMMRegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
-            spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(rm), nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(dst), nameFPReg(rm));
+            else
+                spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(rm), nameFPReg(dst));
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteOp(opcode, (RegisterID)rm, dst);
             return;
         }
 
-        spew("%-11s%s, %s, %s", name, nameFPReg(rm), nameFPReg(src0), nameFPReg(dst));
+        if (src0 == X86Registers::invalid_xmm) {
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, %s", name, nameFPReg(dst), nameFPReg(rm));
+            else
+                spew("%-11s%s, %s", name, nameFPReg(rm), nameFPReg(dst));
+        } else {
+            spew("%-11s%s, %s, %s", name, nameFPReg(rm), nameFPReg(src0), nameFPReg(dst));
+        }
         m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, src0, dst);
     }
 
     void twoByteOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                        int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
-            spew("%-11s" MEM_ob ", %s", legacySSEOpName(name),
-                 ADDR_ob(offset, base), nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode)) {
+                spew("%-11s%s, " MEM_ob, legacySSEOpName(name),
+                     nameFPReg(dst), ADDR_ob(offset, base));
+            } else {
+                spew("%-11s" MEM_ob ", %s", legacySSEOpName(name),
+                     ADDR_ob(offset, base), nameFPReg(dst));
+            }
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteOp(opcode, offset, base, dst);
             return;
         }
 
         if (src0 == X86Registers::invalid_xmm) {
-            spew("%-11s" MEM_ob ", %s", name, ADDR_ob(offset, base), nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, " MEM_ob, name, nameFPReg(dst), ADDR_ob(offset, base));
+            else
+                spew("%-11s" MEM_ob ", %s", name, ADDR_ob(offset, base), nameFPReg(dst));
         } else {
             spew("%-11s" MEM_ob ", %s, %s", name,
                  ADDR_ob(offset, base), nameFPReg(src0), nameFPReg(dst));
         }
         m_formatter.twoByteOpVex(ty, opcode, offset, base, src0, dst);
     }
 
     void twoByteOpSimd_disp32(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                               int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
-            spew("%-11s" MEM_o32b ", %s", legacySSEOpName(name), ADDR_o32b(offset, base), nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode)) {
+                spew("%-11s%s, " MEM_o32b, legacySSEOpName(name), nameFPReg(dst), ADDR_o32b(offset, base));
+            } else {
+                spew("%-11s" MEM_o32b ", %s", legacySSEOpName(name), ADDR_o32b(offset, base), nameFPReg(dst));
+            }
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteOp_disp32(opcode, offset, base, dst);
             return;
         }
 
         if (src0 == X86Registers::invalid_xmm) {
-            spew("%-11s" MEM_o32b ", %s", name, ADDR_o32b(offset, base), nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode)) {
+                spew("%-11s%s, " MEM_o32b, name, nameFPReg(dst), ADDR_o32b(offset, base));
+            } else {
+                spew("%-11s" MEM_o32b ", %s", name, ADDR_o32b(offset, base), nameFPReg(dst));
+            }
         } else {
             spew("%-11s" MEM_o32b ", %s, %s", name,
                  ADDR_o32b(offset, base), nameFPReg(src0), nameFPReg(dst));
         }
         m_formatter.twoByteOpVex_disp32(ty, opcode, offset, base, src0, dst);
     }
 
     void twoByteOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                        int32_t offset, RegisterID base, RegisterID index, int scale,
                        XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
-            spew("%-11s" MEM_obs ", %s", legacySSEOpName(name),
-                 ADDR_obs(offset, base, index, scale), nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode)) {
+                spew("%-11s%s, " MEM_obs, legacySSEOpName(name),
+                     nameFPReg(dst), ADDR_obs(offset, base, index, scale));
+            } else {
+                spew("%-11s" MEM_obs ", %s", legacySSEOpName(name),
+                     ADDR_obs(offset, base, index, scale), nameFPReg(dst));
+            }
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteOp(opcode, offset, base, index, scale, dst);
             return;
         }
 
         if (src0 == X86Registers::invalid_xmm) {
-            spew("%-11s" MEM_obs ", %s", name, ADDR_obs(offset, base, index, scale),
-                 nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode)) {
+                spew("%-11s%s, " MEM_obs, name, nameFPReg(dst),
+                     ADDR_obs(offset, base, index, scale));
+            } else {
+                spew("%-11s" MEM_obs ", %s", name, ADDR_obs(offset, base, index, scale),
+                     nameFPReg(dst));
+            }
         } else {
             spew("%-11s" MEM_obs ", %s, %s", name, ADDR_obs(offset, base, index, scale),
                  nameFPReg(src0), nameFPReg(dst));
         }
         m_formatter.twoByteOpVex(ty, opcode, offset, base, index, scale, src0, dst);
     }
 
     void twoByteOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                        const void* address, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
-            spew("%-11s%p, %s", legacySSEOpName(name), address, nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, %p", legacySSEOpName(name), nameFPReg(dst), address);
+            else
+                spew("%-11s%p, %s", legacySSEOpName(name), address, nameFPReg(dst));
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteOp(opcode, address, dst);
             return;
         }
 
-        if (src0 == X86Registers::invalid_xmm)
-            spew("%-11s%p, %s", name, address, nameFPReg(dst));
-        else
+        if (src0 == X86Registers::invalid_xmm) {
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, %p", name, nameFPReg(dst), address);
+            else
+                spew("%-11s%p, %s", name, address, nameFPReg(dst));
+        } else {
             spew("%-11s%p, %s, %s", name, address, nameFPReg(src0), nameFPReg(dst));
+        }
         m_formatter.twoByteOpVex(ty, opcode, address, src0, dst);
     }
 
     void twoByteOpInt32Simd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                             RegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
-            spew("%-11s%s, %s", legacySSEOpName(name), nameIReg(4, rm), nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(dst), nameIReg(4, rm));
+            else
+                spew("%-11s%s, %s", legacySSEOpName(name), nameIReg(4, rm), nameFPReg(dst));
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteOp(opcode, rm, dst);
             return;
         }
 
-        spew("%-11s%s, %s, %s", name, nameIReg(4, rm), nameFPReg(src0), nameFPReg(dst));
+        if (src0 == X86Registers::invalid_xmm) {
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, %s", name, nameFPReg(dst), nameIReg(4, rm));
+            else
+                spew("%-11s%s, %s", name, nameIReg(4, rm), nameFPReg(dst));
+        } else {
+            spew("%-11s%s, %s, %s", name, nameIReg(4, rm), nameFPReg(src0), nameFPReg(dst));
+        }
         m_formatter.twoByteOpVex(ty, opcode, rm, src0, dst);
     }
 
 #ifdef JS_CODEGEN_X64
     void twoByteOpInt64Simd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                             RegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
-            spew("%-11s%s, %s", legacySSEOpName(name), nameIReg(rm), nameFPReg(dst));
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(dst), nameIReg(rm));
+            else
+                spew("%-11s%s, %s", legacySSEOpName(name), nameIReg(rm), nameFPReg(dst));
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteOp64(opcode, rm, dst);
             return;
         }
 
-        spew("%-11s%s, %s, %s", name, nameIReg(rm), nameFPReg(src0), nameFPReg(dst));
+        if (src0 == X86Registers::invalid_xmm) {
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, %s", name, nameFPReg(dst), nameIReg(rm));
+            else
+                spew("%-11s%s, %s", name, nameIReg(rm), nameFPReg(dst));
+        } else {
+            spew("%-11s%s, %s, %s", name, nameIReg(rm), nameFPReg(src0), nameFPReg(dst));
+        }
         m_formatter.twoByteOpVex64(ty, opcode, rm, src0, dst);
     }
 #endif
 
     void twoByteOpSimdInt32(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                             XMMRegisterID rm, RegisterID dst)
     {
         if (useLegacySSEEncodingForOtherOutput()) {
-            if (opcode == OP2_MOVD_EdVd)
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, %s", legacySSEOpName(name), nameIReg(4, dst), nameFPReg(rm));
+            else if (opcode == OP2_MOVD_EdVd)
                 spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg((XMMRegisterID)dst), nameIReg(4, (RegisterID)rm));
             else
                 spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(rm), nameIReg(4, dst));
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteOp(opcode, (RegisterID)rm, dst);
             return;
         }
 
-        if (opcode == OP2_MOVD_EdVd)
+        if (IsXMMReversedOperands(opcode))
+            spew("%-11s%s, %s", name, nameIReg(4, dst), nameFPReg(rm));
+        else if (opcode == OP2_MOVD_EdVd)
             spew("%-11s%s, %s", name, nameFPReg((XMMRegisterID)dst), nameIReg(4, (RegisterID)rm));
         else
             spew("%-11s%s, %s", name, nameFPReg(rm), nameIReg(4, dst));
         m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, X86Registers::invalid_xmm, dst);
     }
 
 #ifdef JS_CODEGEN_X64
     void twoByteOpSimdInt64(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                             XMMRegisterID rm, RegisterID dst)
     {
         if (useLegacySSEEncodingForOtherOutput()) {
-            if (opcode == OP2_MOVD_EdVd)
+            if (IsXMMReversedOperands(opcode))
+                spew("%-11s%s, %s", legacySSEOpName(name), nameIReg(dst), nameFPReg(rm));
+            else if (opcode == OP2_MOVD_EdVd)
                 spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg((XMMRegisterID)dst), nameIReg((RegisterID)rm));
             else
                 spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(rm), nameIReg(dst));
             m_formatter.legacySSEPrefix(ty);
             m_formatter.twoByteOp64(opcode, (RegisterID)rm, dst);
             return;
         }
 
-        if (opcode == OP2_MOVD_EdVd)
+        if (IsXMMReversedOperands(opcode))
+            spew("%-11s%s, %s", name, nameIReg(dst), nameFPReg(rm));
+        else if (opcode == OP2_MOVD_EdVd)
             spew("%-11s%s, %s", name, nameFPReg((XMMRegisterID)dst), nameIReg((RegisterID)rm));
         else
             spew("%-11s%s, %s", name, nameFPReg(rm), nameIReg(dst));
         m_formatter.twoByteOpVex64(ty, opcode, (RegisterID)rm, X86Registers::invalid_xmm, (XMMRegisterID)dst);
     }
 #endif
 
     void twoByteOpSimdFlags(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -2223,17 +2223,17 @@ void
 CodeGeneratorX86Shared::visitSimdInsertElementF(LSimdInsertElementF *ins)
 {
     FloatRegister vector = ToFloatRegister(ins->vector());
     FloatRegister value = ToFloatRegister(ins->value());
     FloatRegister output = ToFloatRegister(ins->output());
     MOZ_ASSERT(vector == output); // defineReuseInput(0)
 
     if (ins->lane() == SimdLane::LaneX) {
-        // As both operands are registers, movss doesn't modify the upper bits
+        // As both operands are registers, vmovss doesn't modify the upper bits
         // of the destination operand.
         if (value != output)
             masm.vmovss(value, vector, output);
         return;
     }
 
     if (AssemblerX86Shared::HasSSE41()) {
         // The input value is in the low float32 of the 'value' FloatRegister.
@@ -2365,17 +2365,17 @@ CodeGeneratorX86Shared::visitSimdShuffle
             return;
         }
     }
 
     // One element of the second, all other elements of the first
     if (numLanesFromLHS == 3) {
         unsigned firstMask = -1, secondMask = -1;
 
-        // register-register movss preserves the high lanes.
+        // register-register vmovss preserves the high lanes.
         if (ins->lanesMatch(4, 1, 2, 3)) {
             masm.vmovss(rhs, lhs, out);
             return;
         }
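
[Editorial note spelling out the lane arithmetic, assuming the convention that shuffle lanes 0-3 select from lhs and 4-7 from rhs: lanesMatch(4, 1, 2, 3) asks for]

    // out = {rhs.x, lhs.y, lhs.z, lhs.w}
    masm.vmovss(rhs, lhs, out);  // lane X from rhs, lanes Y/Z/W from lhs

[which is exactly the merge the two-register vmovss form performs, so no shuffle is needed.]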
 
         // SSE4.1 insertps can handle any single element.
         unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
         if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@@ -754,35 +754,35 @@ class MacroAssemblerX86Shared : public A
           case Operand::MEM_SCALE:
             loadDouble(src.toBaseIndex(), dest);
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void storeDouble(FloatRegister src, const Address &dest) {
-        movsd(src, dest);
+        vmovsd(src, dest);
     }
     void storeDouble(FloatRegister src, const BaseIndex &dest) {
-        movsd(src, dest);
+        vmovsd(src, dest);
     }
     void storeDouble(FloatRegister src, const Operand &dest) {
         switch (dest.kind()) {
           case Operand::MEM_REG_DISP:
             storeDouble(src, dest.toAddress());
             break;
           case Operand::MEM_SCALE:
             storeDouble(src, dest.toBaseIndex());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void moveDouble(FloatRegister src, FloatRegister dest) {
-        // Use movapd instead of movsd to avoid dependencies.
+        // Use vmovapd instead of vmovsd to avoid dependencies.
         vmovapd(src, dest);
     }
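
[Editorial note: a sketch of the dependency issue the comment alludes to; register names are illustrative:]

    // movsd  %xmm1, %xmm0   ; merges into xmm0, so it must read the old xmm0
    //                       ; to preserve its high 64 bits - a false dependency
    //                       ; on whatever instruction last wrote xmm0
    // movapd %xmm1, %xmm0   ; writes all 128 bits of xmm0; no such dependency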
     void zeroDouble(FloatRegister reg) {
         vxorpd(reg, reg, reg);
     }
     void zeroFloat32(FloatRegister reg) {
         vxorps(reg, reg, reg);
     }
@@ -858,17 +858,17 @@ class MacroAssemblerX86Shared : public A
 
     void loadAlignedInt32x4(const Address &src, FloatRegister dest) {
         vmovdqa(Operand(src), dest);
     }
     void loadAlignedInt32x4(const Operand &src, FloatRegister dest) {
         vmovdqa(src, dest);
     }
     void storeAlignedInt32x4(FloatRegister src, const Address &dest) {
-        movdqa(src, Operand(dest));
+        vmovdqa(src, Operand(dest));
     }
     void moveInt32x4(FloatRegister src, FloatRegister dest) {
         vmovdqa(src, dest);
     }
     FloatRegister reusedInputInt32x4(FloatRegister src, FloatRegister dest) {
         if (HasAVX())
             return src;
         moveInt32x4(src, dest);
@@ -882,20 +882,20 @@ class MacroAssemblerX86Shared : public A
     }
     void loadUnalignedInt32x4(const Address &src, FloatRegister dest) {
         vmovdqu(Operand(src), dest);
     }
     void loadUnalignedInt32x4(const Operand &src, FloatRegister dest) {
         vmovdqu(src, dest);
     }
     void storeUnalignedInt32x4(FloatRegister src, const Address &dest) {
-        movdqu(src, Operand(dest));
+        vmovdqu(src, Operand(dest));
     }
     void storeUnalignedInt32x4(FloatRegister src, const Operand &dest) {
-        movdqu(src, dest);
+        vmovdqu(src, dest);
     }
     void packedEqualInt32x4(const Operand &src, FloatRegister dest) {
         vpcmpeqd(src, dest, dest);
     }
     void packedGreaterThanInt32x4(const Operand &src, FloatRegister dest) {
         vpcmpgtd(src, dest, dest);
     }
     void packedAddInt32(const Operand &src, FloatRegister dest) {
@@ -939,17 +939,17 @@ class MacroAssemblerX86Shared : public A
 
     void loadAlignedFloat32x4(const Address &src, FloatRegister dest) {
         vmovaps(Operand(src), dest);
     }
     void loadAlignedFloat32x4(const Operand &src, FloatRegister dest) {
         vmovaps(src, dest);
     }
     void storeAlignedFloat32x4(FloatRegister src, const Address &dest) {
-        movaps(src, Operand(dest));
+        vmovaps(src, Operand(dest));
     }
     void moveFloat32x4(FloatRegister src, FloatRegister dest) {
         vmovaps(src, dest);
     }
     FloatRegister reusedInputFloat32x4(FloatRegister src, FloatRegister dest) {
         if (HasAVX())
             return src;
         moveFloat32x4(src, dest);
@@ -963,20 +963,20 @@ class MacroAssemblerX86Shared : public A
     }
     void loadUnalignedFloat32x4(const Address &src, FloatRegister dest) {
         vmovups(Operand(src), dest);
     }
     void loadUnalignedFloat32x4(const Operand &src, FloatRegister dest) {
         vmovups(src, dest);
     }
     void storeUnalignedFloat32x4(FloatRegister src, const Address &dest) {
-        movups(src, Operand(dest));
+        vmovups(src, Operand(dest));
     }
     void storeUnalignedFloat32x4(FloatRegister src, const Operand &dest) {
-        movups(src, dest);
+        vmovups(src, dest);
     }
     void packedAddFloat32(const Operand &src, FloatRegister dest) {
         vaddps(src, dest, dest);
     }
     void packedSubFloat32(const Operand &src, FloatRegister dest) {
         vsubps(src, dest, dest);
     }
     void packedMulFloat32(const Operand &src, FloatRegister dest) {
@@ -1051,35 +1051,35 @@ class MacroAssemblerX86Shared : public A
           case Operand::MEM_SCALE:
             loadFloat32(src.toBaseIndex(), dest);
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void storeFloat32(FloatRegister src, const Address &dest) {
-        movss(src, dest);
+        vmovss(src, dest);
     }
     void storeFloat32(FloatRegister src, const BaseIndex &dest) {
-        movss(src, dest);
+        vmovss(src, dest);
     }
     void storeFloat32(FloatRegister src, const Operand &dest) {
         switch (dest.kind()) {
           case Operand::MEM_REG_DISP:
             storeFloat32(src, dest.toAddress());
             break;
           case Operand::MEM_SCALE:
             storeFloat32(src, dest.toBaseIndex());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void moveFloat32(FloatRegister src, FloatRegister dest) {
-        // Use movaps instead of movss to avoid dependencies.
+        // Use vmovaps instead of vmovss to avoid dependencies.
         vmovaps(src, dest);
     }
 
     // Checks whether a double is representable as a 32-bit integer. If so, the
     // integer is written to the output register. Otherwise, a bailout is taken to
     // the given snapshot. This function overwrites the scratch float register.
     void convertDoubleToInt32(FloatRegister src, Register dest, Label *fail,
                               bool negativeZeroCheck = true)
--- a/js/src/jit/x64/Assembler-x64.h
+++ b/js/src/jit/x64/Assembler-x64.h
@@ -289,17 +289,17 @@ class Assembler : public AssemblerX86Sha
             push(ScratchReg);
         }
     }
     void push(ImmPtr imm) {
         push(ImmWord(uintptr_t(imm.value)));
     }
     void push(FloatRegister src) {
         subq(Imm32(sizeof(double)), StackPointer);
-        movsd(src, Address(StackPointer, 0));
+        vmovsd(src, Address(StackPointer, 0));
     }
     CodeOffsetLabel pushWithPatch(ImmWord word) {
         CodeOffsetLabel label = movWithPatch(word, ScratchReg);
         push(ScratchReg);
         return label;
     }
 
     void pop(FloatRegister src) {
@@ -595,41 +595,41 @@ class Assembler : public AssemblerX86Sha
 
     CodeOffsetLabel loadRipRelativeInt32(Register dest) {
         return CodeOffsetLabel(masm.movl_ripr(dest.code()).offset());
     }
     CodeOffsetLabel loadRipRelativeInt64(Register dest) {
         return CodeOffsetLabel(masm.movq_ripr(dest.code()).offset());
     }
     CodeOffsetLabel loadRipRelativeDouble(FloatRegister dest) {
-        return CodeOffsetLabel(masm.movsd_ripr(dest.code()).offset());
+        return CodeOffsetLabel(masm.vmovsd_ripr(dest.code()).offset());
     }
     CodeOffsetLabel loadRipRelativeFloat32(FloatRegister dest) {
-        return CodeOffsetLabel(masm.movss_ripr(dest.code()).offset());
+        return CodeOffsetLabel(masm.vmovss_ripr(dest.code()).offset());
     }
     CodeOffsetLabel loadRipRelativeInt32x4(FloatRegister dest) {
-        return CodeOffsetLabel(masm.movdqa_ripr(dest.code()).offset());
+        return CodeOffsetLabel(masm.vmovdqa_ripr(dest.code()).offset());
     }
     CodeOffsetLabel loadRipRelativeFloat32x4(FloatRegister dest) {
-        return CodeOffsetLabel(masm.movaps_ripr(dest.code()).offset());
+        return CodeOffsetLabel(masm.vmovaps_ripr(dest.code()).offset());
     }
     CodeOffsetLabel storeRipRelativeInt32(Register dest) {
         return CodeOffsetLabel(masm.movl_rrip(dest.code()).offset());
     }
     CodeOffsetLabel storeRipRelativeDouble(FloatRegister dest) {
-        return CodeOffsetLabel(masm.movsd_rrip(dest.code()).offset());
+        return CodeOffsetLabel(masm.vmovsd_rrip(dest.code()).offset());
     }
     CodeOffsetLabel storeRipRelativeFloat32(FloatRegister dest) {
-        return CodeOffsetLabel(masm.movss_rrip(dest.code()).offset());
+        return CodeOffsetLabel(masm.vmovss_rrip(dest.code()).offset());
     }
     CodeOffsetLabel storeRipRelativeInt32x4(FloatRegister dest) {
-        return CodeOffsetLabel(masm.movdqa_rrip(dest.code()).offset());
+        return CodeOffsetLabel(masm.vmovdqa_rrip(dest.code()).offset());
     }
     CodeOffsetLabel storeRipRelativeFloat32x4(FloatRegister dest) {
-        return CodeOffsetLabel(masm.movaps_rrip(dest.code()).offset());
+        return CodeOffsetLabel(masm.vmovaps_rrip(dest.code()).offset());
     }
     CodeOffsetLabel leaRipRelative(Register dest) {
         return CodeOffsetLabel(masm.leaq_rip(dest.code()).offset());
     }
 
     void loadAsmJSActivation(Register dest) {
         CodeOffsetLabel label = loadRipRelativeInt64(dest);
         append(AsmJSGlobalAccess(label, AsmJSActivationGlobalDataOffset));
--- a/js/src/jit/x64/MacroAssembler-x64.cpp
+++ b/js/src/jit/x64/MacroAssembler-x64.cpp
@@ -39,17 +39,17 @@ MacroAssemblerX64::loadConstantDouble(do
     Double &dbl = doubles_[doubleIndex];
     MOZ_ASSERT(!dbl.uses.bound());
 
     // The constants will be stored in a pool appended to the text (see
     // finish()), so they will always be a fixed distance from the
     // instructions which reference them. This allows the instructions to use
     // PC-relative addressing. Use "jump" label support code, because we need
     // the same PC-relative address patching that jumps use.
-    JmpSrc j = masm.movsd_ripr(dest.code());
+    JmpSrc j = masm.vmovsd_ripr(dest.code());
     JmpSrc prev = JmpSrc(dbl.uses.use(j.offset()));
     masm.setNextJump(j, prev);
 }
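
[Editorial note sketching the patching scheme the comment describes, using names from the surrounding code: vmovsd_ripr emits the instruction with a zero rip-relative displacement and returns a JmpSrc marking the patch point; each constant's uses are chained through setNextJump(), and finish() appends the constant pool and walks the chain, rewriting every displacement to the constant's final address.]

    // Emitted now:    vmovsd ?(%rip), %xmm0     ; displacement = 0, recorded in j
    // Patched later:  vmovsd disp(%rip), %xmm0  ; disp -> pool entry for 'd'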
 
 void
 MacroAssemblerX64::loadConstantFloat32(float f, FloatRegister dest)
 {
     if (maybeInlineFloat(f, dest))
@@ -69,17 +69,17 @@ MacroAssemblerX64::loadConstantFloat32(f
         enoughMemory_ &= floatMap_.add(p, f, floatIndex);
         if (!enoughMemory_)
             return;
     }
     Float &flt = floats_[floatIndex];
     MOZ_ASSERT(!flt.uses.bound());
 
     // See comment in loadConstantDouble
-    JmpSrc j = masm.movss_ripr(dest.code());
+    JmpSrc j = masm.vmovss_ripr(dest.code());
     JmpSrc prev = JmpSrc(flt.uses.use(j.offset()));
     masm.setNextJump(j, prev);
 }
 
 MacroAssemblerX64::SimdData *
 MacroAssemblerX64::getSimdData(const SimdConstant &v)
 {
     if (!simdMap_.initialized()) {
@@ -110,17 +110,17 @@ MacroAssemblerX64::loadConstantInt32x4(c
 
     SimdData *val = getSimdData(v);
     if (!val)
         return;
 
     MOZ_ASSERT(!val->uses.bound());
     MOZ_ASSERT(val->type() == SimdConstant::Int32x4);
 
-    JmpSrc j = masm.movdqa_ripr(dest.code());
+    JmpSrc j = masm.vmovdqa_ripr(dest.code());
     JmpSrc prev = JmpSrc(val->uses.use(j.offset()));
     masm.setNextJump(j, prev);
 }
 
 void
MacroAssemblerX64::loadConstantFloat32x4(const SimdConstant &v, FloatRegister dest)
 {
     MOZ_ASSERT(v.type() == SimdConstant::Float32x4);
@@ -129,17 +129,17 @@ MacroAssemblerX64::loadConstantFloat32x4
 
     SimdData *val = getSimdData(v);
     if (!val)
         return;
 
     MOZ_ASSERT(!val->uses.bound());
     MOZ_ASSERT(val->type() == SimdConstant::Float32x4);
 
-    JmpSrc j = masm.movaps_ripr(dest.code());
+    JmpSrc j = masm.vmovaps_ripr(dest.code());
     JmpSrc prev = JmpSrc(val->uses.use(j.offset()));
     masm.setNextJump(j, prev);
 }
 
 void
 MacroAssemblerX64::finish()
 {
     if (!doubles_.empty())
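
(Aside: the constant-loading hunks above rely on a pool-and-patch scheme: emit the
load with a zero rip-relative displacement, remember where the displacement lives,
and fix it up once finish() has appended the pool. A minimal self-contained sketch
of that idea follows; the names Pool, emitLoadDouble, and the fixup vector are
hypothetical illustrations -- the real code threads the fixups through the "jump"
label machinery instead.)

    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct Pool {
        std::vector<uint8_t> text;     // emitted machine code
        std::vector<double> doubles;   // constants, appended after the text
        std::vector<size_t> fixups;    // end offset of each disp32 placeholder

        void emitLoadDouble(double d) {
            // VEX.LIG.F2.0F 10 /r with ModRM 0x05 = vmovsd xmm0, [rip+disp32]
            static const uint8_t op[] = { 0xc5, 0xfb, 0x10, 0x05 };
            text.insert(text.end(), op, op + sizeof(op));
            uint32_t zero = 0;  // placeholder displacement, patched in finish()
            text.insert(text.end(), (uint8_t *)&zero, (uint8_t *)&zero + 4);
            fixups.push_back(text.size());  // disp32 (and the instruction) end here
            doubles.push_back(d);
        }

        void finish() {
            size_t poolStart = text.size();
            for (size_t i = 0; i < fixups.size(); i++) {
                // rip-relative: distance from the end of the instruction
                // (== end of the disp32) to the constant's slot in the pool.
                int32_t disp = int32_t(poolStart + i * sizeof(double) - fixups[i]);
                memcpy(&text[fixups[i] - 4], &disp, 4);
            }
            for (double d : doubles)
                text.insert(text.end(), (uint8_t *)&d, (uint8_t *)&d + 8);
        }
    };
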
--- a/js/src/jit/x64/Trampoline-x64.cpp
+++ b/js/src/jit/x64/Trampoline-x64.cpp
@@ -62,29 +62,29 @@ JitRuntime::generateEnterJIT(JSContext *
     masm.push(r12);
     masm.push(r13);
     masm.push(r14);
     masm.push(r15);
 #if defined(_WIN64)
     masm.push(rdi);
     masm.push(rsi);
 
-    // 16-byte aligment for movdqa
+    // 16-byte alignment for vmovdqa
     masm.subq(Imm32(16 * 10 + 8), rsp);
 
-    masm.movdqa(xmm6, Operand(rsp, 16 * 0));
-    masm.movdqa(xmm7, Operand(rsp, 16 * 1));
-    masm.movdqa(xmm8, Operand(rsp, 16 * 2));
-    masm.movdqa(xmm9, Operand(rsp, 16 * 3));
-    masm.movdqa(xmm10, Operand(rsp, 16 * 4));
-    masm.movdqa(xmm11, Operand(rsp, 16 * 5));
-    masm.movdqa(xmm12, Operand(rsp, 16 * 6));
-    masm.movdqa(xmm13, Operand(rsp, 16 * 7));
-    masm.movdqa(xmm14, Operand(rsp, 16 * 8));
-    masm.movdqa(xmm15, Operand(rsp, 16 * 9));
+    masm.vmovdqa(xmm6, Operand(rsp, 16 * 0));
+    masm.vmovdqa(xmm7, Operand(rsp, 16 * 1));
+    masm.vmovdqa(xmm8, Operand(rsp, 16 * 2));
+    masm.vmovdqa(xmm9, Operand(rsp, 16 * 3));
+    masm.vmovdqa(xmm10, Operand(rsp, 16 * 4));
+    masm.vmovdqa(xmm11, Operand(rsp, 16 * 5));
+    masm.vmovdqa(xmm12, Operand(rsp, 16 * 6));
+    masm.vmovdqa(xmm13, Operand(rsp, 16 * 7));
+    masm.vmovdqa(xmm14, Operand(rsp, 16 * 8));
+    masm.vmovdqa(xmm15, Operand(rsp, 16 * 9));
 #endif
 
     // Push the EnterJIT sps mark.
     masm.spsMarkJit(&cx->runtime()->spsProfiler, rbp, rbx);
 
     // Save arguments passed in registers needed after function call.
     masm.push(result);
 
@@ -267,26 +267,26 @@ JitRuntime::generateEnterJIT(JSContext *
     masm.pop(r12); // vp
     masm.storeValue(JSReturnOperand, Operand(r12, 0));
 
     // Unwind the sps mark.
     masm.spsUnmarkJit(&cx->runtime()->spsProfiler, rbx);
 
     // Restore non-volatile registers.
 #if defined(_WIN64)
-    masm.movdqa(Operand(rsp, 16 * 0), xmm6);
-    masm.movdqa(Operand(rsp, 16 * 1), xmm7);
-    masm.movdqa(Operand(rsp, 16 * 2), xmm8);
-    masm.movdqa(Operand(rsp, 16 * 3), xmm9);
-    masm.movdqa(Operand(rsp, 16 * 4), xmm10);
-    masm.movdqa(Operand(rsp, 16 * 5), xmm11);
-    masm.movdqa(Operand(rsp, 16 * 6), xmm12);
-    masm.movdqa(Operand(rsp, 16 * 7), xmm13);
-    masm.movdqa(Operand(rsp, 16 * 8), xmm14);
-    masm.movdqa(Operand(rsp, 16 * 9), xmm15);
+    masm.vmovdqa(Operand(rsp, 16 * 0), xmm6);
+    masm.vmovdqa(Operand(rsp, 16 * 1), xmm7);
+    masm.vmovdqa(Operand(rsp, 16 * 2), xmm8);
+    masm.vmovdqa(Operand(rsp, 16 * 3), xmm9);
+    masm.vmovdqa(Operand(rsp, 16 * 4), xmm10);
+    masm.vmovdqa(Operand(rsp, 16 * 5), xmm11);
+    masm.vmovdqa(Operand(rsp, 16 * 6), xmm12);
+    masm.vmovdqa(Operand(rsp, 16 * 7), xmm13);
+    masm.vmovdqa(Operand(rsp, 16 * 8), xmm14);
+    masm.vmovdqa(Operand(rsp, 16 * 9), xmm15);
 
     masm.addq(Imm32(16 * 10 + 8), rsp);
 
     masm.pop(rsi);
     masm.pop(rdi);
 #endif
     masm.pop(r15);
     masm.pop(r14);
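
(Aside: the Win64 ABI treats xmm6-xmm15 as nonvolatile, and vmovdqa faults on a
misaligned address, so the frame reserved above must leave rsp 16-byte aligned.
A back-of-envelope check of the 16 * 10 + 8 figure -- assuming rsp is 8 mod 16
when the subq executes, which is precisely what the extra 8 bytes corrects:)

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint64_t kNumXmmSaves = 10;               // xmm6..xmm15
        const uint64_t kFrame = 16 * kNumXmmSaves + 8;  // 168 bytes

        uint64_t rsp = 0x7fff0008;  // hypothetical value, 8 mod 16
        assert(rsp % 16 == 8);
        rsp -= kFrame;
        assert(rsp % 16 == 0);      // vmovdqa to [rsp + 16*i] is now legal
        for (uint64_t i = 0; i < kNumXmmSaves; i++)
            assert((rsp + 16 * i) % 16 == 0);
        return 0;
    }
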
--- a/js/src/jit/x86/Assembler-x86.h
+++ b/js/src/jit/x86/Assembler-x86.h
@@ -182,18 +182,18 @@ class Assembler : public AssemblerX86Sha
         if (kind == Relocation::JITCODE)
             writeRelocation(src);
     }
 
   public:
     using AssemblerX86Shared::movl;
     using AssemblerX86Shared::j;
     using AssemblerX86Shared::jmp;
-    using AssemblerX86Shared::movsd;
-    using AssemblerX86Shared::movss;
+    using AssemblerX86Shared::vmovsd;
+    using AssemblerX86Shared::vmovss;
     using AssemblerX86Shared::retarget;
     using AssemblerX86Shared::cmpl;
     using AssemblerX86Shared::call;
     using AssemblerX86Shared::push;
     using AssemblerX86Shared::pop;
 
     static void TraceJumpRelocations(JSTracer *trc, JitCode *code, CompactBufferReader &reader);
 
@@ -213,17 +213,17 @@ class Assembler : public AssemblerX86Sha
     void push(const ImmWord imm) {
         push(Imm32(imm.value));
     }
     void push(const ImmPtr imm) {
         push(ImmWord(uintptr_t(imm.value)));
     }
     void push(FloatRegister src) {
         subl(Imm32(sizeof(double)), StackPointer);
-        movsd(src, Address(StackPointer, 0));
+        vmovsd(src, Address(StackPointer, 0));
     }
 
     CodeOffsetLabel pushWithPatch(ImmWord word) {
         masm.push_i32(int32_t(word.value));
         return CodeOffsetLabel(masm.currentOffset());
     }
 
     void pop(FloatRegister src) {
@@ -488,34 +488,34 @@ class Assembler : public AssemblerX86Sha
     CodeOffsetLabel movwWithPatch(Register src, Address dest) {
         masm.movw_rm_disp32(src.code(), dest.offset, dest.base.code());
         return CodeOffsetLabel(masm.currentOffset());
     }
     CodeOffsetLabel movlWithPatch(Register src, Address dest) {
         masm.movl_rm_disp32(src.code(), dest.offset, dest.base.code());
         return CodeOffsetLabel(masm.currentOffset());
     }
-    CodeOffsetLabel movssWithPatch(FloatRegister src, Address dest) {
+    CodeOffsetLabel vmovssWithPatch(FloatRegister src, Address dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movss_rm_disp32(src.code(), dest.offset, dest.base.code());
+        masm.vmovss_rm_disp32(src.code(), dest.offset, dest.base.code());
         return CodeOffsetLabel(masm.currentOffset());
     }
-    CodeOffsetLabel movsdWithPatch(FloatRegister src, Address dest) {
+    CodeOffsetLabel vmovsdWithPatch(FloatRegister src, Address dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movsd_rm_disp32(src.code(), dest.offset, dest.base.code());
+        masm.vmovsd_rm_disp32(src.code(), dest.offset, dest.base.code());
         return CodeOffsetLabel(masm.currentOffset());
     }
-    CodeOffsetLabel movupsWithPatch(FloatRegister src, Address dest) {
+    CodeOffsetLabel vmovupsWithPatch(FloatRegister src, Address dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movups_rm_disp32(src.code(), dest.offset, dest.base.code());
+        masm.vmovups_rm_disp32(src.code(), dest.offset, dest.base.code());
         return CodeOffsetLabel(masm.currentOffset());
     }
-    CodeOffsetLabel movdquWithPatch(FloatRegister src, Address dest) {
+    CodeOffsetLabel vmovdquWithPatch(FloatRegister src, Address dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movdqu_rm_disp32(src.code(), dest.offset, dest.base.code());
+        masm.vmovdqu_rm_disp32(src.code(), dest.offset, dest.base.code());
         return CodeOffsetLabel(masm.currentOffset());
     }
 
     // Load from *(addr + index*scale) where addr can be patched.
     CodeOffsetLabel movlWithPatch(PatchedAbsoluteAddress addr, Register index, Scale scale,
                                   Register dest)
     {
         masm.movl_mr(addr.addr, index.code(), scale, dest.code());
@@ -582,44 +582,44 @@ class Assembler : public AssemblerX86Sha
     CodeOffsetLabel movwWithPatch(Register src, PatchedAbsoluteAddress dest) {
         masm.movw_rm(src.code(), dest.addr);
         return CodeOffsetLabel(masm.currentOffset());
     }
     CodeOffsetLabel movlWithPatch(Register src, PatchedAbsoluteAddress dest) {
         masm.movl_rm(src.code(), dest.addr);
         return CodeOffsetLabel(masm.currentOffset());
     }
-    CodeOffsetLabel movssWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
+    CodeOffsetLabel vmovssWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movss_rm(src.code(), dest.addr);
+        masm.vmovss_rm(src.code(), dest.addr);
         return CodeOffsetLabel(masm.currentOffset());
     }
-    CodeOffsetLabel movsdWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
+    CodeOffsetLabel vmovsdWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movsd_rm(src.code(), dest.addr);
+        masm.vmovsd_rm(src.code(), dest.addr);
         return CodeOffsetLabel(masm.currentOffset());
     }
-    CodeOffsetLabel movdqaWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
+    CodeOffsetLabel vmovdqaWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movdqa_rm(src.code(), dest.addr);
+        masm.vmovdqa_rm(src.code(), dest.addr);
         return CodeOffsetLabel(masm.currentOffset());
     }
-    CodeOffsetLabel movapsWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
+    CodeOffsetLabel vmovapsWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movaps_rm(src.code(), dest.addr);
+        masm.vmovaps_rm(src.code(), dest.addr);
         return CodeOffsetLabel(masm.currentOffset());
     }
-    CodeOffsetLabel movdquWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
+    CodeOffsetLabel vmovdquWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movdqu_rm(src.code(), dest.addr);
+        masm.vmovdqu_rm(src.code(), dest.addr);
         return CodeOffsetLabel(masm.currentOffset());
     }
-    CodeOffsetLabel movupsWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
+    CodeOffsetLabel vmovupsWithPatch(FloatRegister src, PatchedAbsoluteAddress dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.movups_rm(src.code(), dest.addr);
+        masm.vmovups_rm(src.code(), dest.addr);
         return CodeOffsetLabel(masm.currentOffset());
     }
 
     void loadAsmJSActivation(Register dest) {
         CodeOffsetLabel label = movlWithPatch(PatchedAbsoluteAddress(), dest);
         append(AsmJSGlobalAccess(label, AsmJSActivationGlobalDataOffset));
     }
     void loadAsmJSHeapRegisterFromGlobalData() {
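
(Aside: each *WithPatch helper above emits a store whose address field is a 32-bit
placeholder and returns a CodeOffsetLabel at masm.currentOffset(), i.e. just past
the instruction, so the displacement can be rewritten once the final address is
known. In 32-bit mode, ModRM mod=00/rm=101 encodes an absolute disp32 -- unlike
x64, where the same encoding is rip-relative -- which is what PatchedAbsoluteAddress
relies on. A minimal sketch with hypothetical names CodeBuffer and patchAddress:)

    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct CodeBuffer {
        std::vector<uint8_t> code;

        // Emit "vmovss [disp32], xmm0" (VEX.LIG.F3.0F 11 /r, ModRM 0x05) with a
        // zero placeholder address and return the offset just past the
        // displacement, like CodeOffsetLabel(masm.currentOffset()) does.
        size_t vmovssWithPatch() {
            static const uint8_t op[] = { 0xc5, 0xfa, 0x11, 0x05 };
            code.insert(code.end(), op, op + sizeof(op));
            uint32_t placeholder = 0;
            code.insert(code.end(), (uint8_t *)&placeholder,
                        (uint8_t *)&placeholder + 4);
            return code.size();  // the disp32 ends here
        }

        // Later, once the global/heap base address is known:
        void patchAddress(size_t label, uint32_t addr) {
            memcpy(&code[label - 4], &addr, 4);
        }
    };
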
--- a/js/src/jit/x86/CodeGenerator-x86.cpp
+++ b/js/src/jit/x86/CodeGenerator-x86.cpp
@@ -403,20 +403,20 @@ CodeGeneratorX86::store(Scalar::Type vt,
     switch (vt) {
       case Scalar::Int8:
       case Scalar::Uint8Clamped:
       case Scalar::Uint8:        masm.movbWithPatch(ToRegister(value), dstAddr); break;
       case Scalar::Int16:
       case Scalar::Uint16:       masm.movwWithPatch(ToRegister(value), dstAddr); break;
       case Scalar::Int32:
       case Scalar::Uint32:       masm.movlWithPatch(ToRegister(value), dstAddr); break;
-      case Scalar::Float32:      masm.movssWithPatch(ToFloatRegister(value), dstAddr); break;
-      case Scalar::Float64:      masm.movsdWithPatch(ToFloatRegister(value), dstAddr); break;
-      case Scalar::Float32x4:    masm.movupsWithPatch(ToFloatRegister(value), dstAddr); break;
-      case Scalar::Int32x4:      masm.movdquWithPatch(ToFloatRegister(value), dstAddr); break;
+      case Scalar::Float32:      masm.vmovssWithPatch(ToFloatRegister(value), dstAddr); break;
+      case Scalar::Float64:      masm.vmovsdWithPatch(ToFloatRegister(value), dstAddr); break;
+      case Scalar::Float32x4:    masm.vmovupsWithPatch(ToFloatRegister(value), dstAddr); break;
+      case Scalar::Int32x4:      masm.vmovdquWithPatch(ToFloatRegister(value), dstAddr); break;
       case Scalar::MaxTypedArrayViewType: MOZ_CRASH("unexpected type");
     }
 }
 
 template<typename T>
 void
 CodeGeneratorX86::storeAndNoteViewTypeElement(Scalar::Type vt, const LAllocation *value,
                                               const T &dstAddr)
@@ -635,28 +635,28 @@ CodeGeneratorX86::visitAsmJSStoreGlobalV
     MOZ_ASSERT(IsNumberType(type) || IsSimdType(type));
 
     CodeOffsetLabel label;
     switch (type) {
       case MIRType_Int32:
         label = masm.movlWithPatch(ToRegister(ins->value()), PatchedAbsoluteAddress());
         break;
       case MIRType_Float32:
-        label = masm.movssWithPatch(ToFloatRegister(ins->value()), PatchedAbsoluteAddress());
+        label = masm.vmovssWithPatch(ToFloatRegister(ins->value()), PatchedAbsoluteAddress());
         break;
       case MIRType_Double:
-        label = masm.movsdWithPatch(ToFloatRegister(ins->value()), PatchedAbsoluteAddress());
+        label = masm.vmovsdWithPatch(ToFloatRegister(ins->value()), PatchedAbsoluteAddress());
         break;
       // Aligned access: the code is aligned on PageSize and there is
       // padding before the global data section.
       case MIRType_Int32x4:
-        label = masm.movdqaWithPatch(ToFloatRegister(ins->value()), PatchedAbsoluteAddress());
+        label = masm.vmovdqaWithPatch(ToFloatRegister(ins->value()), PatchedAbsoluteAddress());
         break;
       case MIRType_Float32x4:
-        label = masm.movapsWithPatch(ToFloatRegister(ins->value()), PatchedAbsoluteAddress());
+        label = masm.vmovapsWithPatch(ToFloatRegister(ins->value()), PatchedAbsoluteAddress());
         break;
       default:
         MOZ_CRASH("unexpected type in visitAsmJSStoreGlobalVar");
     }
     masm.append(AsmJSGlobalAccess(label, mir->globalDataOffset()));
 }
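
(Aside: the two hunks above make the aligned/unaligned split visible: heap stores
in store() use vmovups/vmovdqu because asm.js heap accesses carry no alignment
guarantee, while global-var stores may use vmovaps/vmovdqa because the global data
section is padded to 16 bytes. A minimal sketch of that dispatch, with hypothetical
emit* stand-ins for the assembler entry points:)

    #include <cstdio>

    enum class SimdKind { Int32x4, Float32x4 };

    // Hypothetical stand-ins for the real assembler entry points.
    static void emitVmovdqa() { std::puts("vmovdqa"); }  // aligned; faults if misaligned
    static void emitVmovdqu() { std::puts("vmovdqu"); }  // unaligned; always safe
    static void emitVmovaps() { std::puts("vmovaps"); }
    static void emitVmovups() { std::puts("vmovups"); }

    static void storeSimd(SimdKind kind, bool known16ByteAligned) {
        switch (kind) {
          case SimdKind::Int32x4:
            (known16ByteAligned ? emitVmovdqa : emitVmovdqu)();
            break;
          case SimdKind::Float32x4:
            (known16ByteAligned ? emitVmovaps : emitVmovups)();
            break;
        }
    }

    int main() {
        storeSimd(SimdKind::Int32x4, /*known16ByteAligned=*/false);   // heap store
        storeSimd(SimdKind::Float32x4, /*known16ByteAligned=*/true);  // global store
        return 0;
    }
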
 
 void