Bug 1115752 - SpiderMonkey: VEX encodings for instructions with leading immediates r=jandem
author Dan Gohman <sunfish@mozilla.com>
Sun, 28 Dec 2014 07:04:13 -0800
changeset 238179 736d53322a1d91210f65e66e8d5254ddd791a370
parent 238178 8623d3a5edbeb63de43c472bc50e930501729807
child 238180 d710a4836ac908e3d76fcb68626d080b64314abe
push id 7472
push user raliiev@mozilla.com
push date Mon, 12 Jan 2015 20:36:27 +0000
treeherder mozilla-aurora@300ca104f8fb
reviewers jandem
bugs 1115752
milestone 37.0a1
Bug 1115752 - SpiderMonkey: VEX encodings for instructions with leading immediates r=jandem
js/src/jit/MIR.h
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/BaseAssembler-x86-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
js/src/jit/shared/MacroAssembler-x86-shared.h
js/src/jit/x86/MacroAssembler-x86.h
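
The bulk of the change is mechanical: the SSE instructions that take a leading 8-bit immediate (shufps, pshufd, the immediate-count packed shifts, roundsd/roundss, insertps, pinsrd/pextrd, pextrw) are renamed to their v-prefixed mnemonics and gain an explicit src0 operand, so the assembler can use the VEX three-operand form when AVX is available and fall back to the destructive legacy encoding otherwise. As an illustrative before/after for two call sites (both taken from the CodeGenerator-x86-shared.cpp changes below; passing the destination register again as src0 keeps the old destructive two-operand behaviour, which the non-AVX fallback requires):

    // before: destructive two-operand wrappers
    masm.shufps(mask, rhs, out);
    masm.roundsd(X86Assembler::RoundDown, input, scratch);

    // after: immediate first, then src1, src0, dest
    masm.vshufps(mask, rhs, out, out);
    masm.vroundsd(X86Assembler::RoundDown, input, scratch, scratch);
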
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@@ -1701,17 +1701,17 @@ class MSimdShuffle : public MBinaryInstr
     INSTRUCTION_HEADER(SimdShuffle);
 
     static MInstruction *NewAsmJS(TempAllocator &alloc, MDefinition *lhs, MDefinition *rhs,
                                   MIRType type, uint32_t laneX, uint32_t laneY, uint32_t laneZ,
                                   uint32_t laneW)
     {
         // Swap operands so that new lanes come from LHS in majority.
         // In the balanced case, swap operands if needs be, in order to be able
-        // to do only one shufps on x86.
+        // to do only one vshufps on x86.
         unsigned lanesFromLHS = (laneX < 4) + (laneY < 4) + (laneZ < 4) + (laneW < 4);
         if (lanesFromLHS < 2 || (lanesFromLHS == 2 && laneX >= 4 && laneY >=4)) {
             laneX = (laneX + 4) % 8;
             laneY = (laneY + 4) % 8;
             laneZ = (laneZ + 4) % 8;
             laneW = (laneW + 4) % 8;
             mozilla::Swap(lhs, rhs);
         }
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -1491,85 +1491,85 @@ class AssemblerX86Shared : public Assemb
     }
     void idiv(Register divisor) {
         masm.idivl_r(divisor.code());
     }
     void udiv(Register divisor) {
         masm.divl_r(divisor.code());
     }
 
-    void pinsrd(unsigned lane, Register src, FloatRegister dest) {
+    void vpinsrd(unsigned lane, Register src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE41());
-        masm.pinsrd_irr(lane, src.code(), dest.code());
+        masm.vpinsrd_irr(lane, src1.code(), src0.code(), dest.code());
     }
-    void pinsrd(unsigned lane, const Operand &src, FloatRegister dest) {
+    void vpinsrd(unsigned lane, const Operand &src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE41());
-        switch (src.kind()) {
+        switch (src1.kind()) {
           case Operand::REG:
-            masm.pinsrd_irr(lane, src.reg(), dest.code());
+            masm.vpinsrd_irr(lane, src1.reg(), src0.code(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
-            masm.pinsrd_imr(lane, src.disp(), src.base(), dest.code());
+            masm.vpinsrd_imr(lane, src1.disp(), src1.base(), src0.code(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void pextrd(unsigned lane, FloatRegister src, Register dest) {
+    void vpextrd(unsigned lane, FloatRegister src, Register dest) {
         MOZ_ASSERT(HasSSE41());
-        masm.pextrd_irr(lane, src.code(), dest.code());
+        masm.vpextrd_irr(lane, src.code(), dest.code());
     }
-    void pextrd(unsigned lane, FloatRegister src, const Operand &dest) {
+    void vpextrd(unsigned lane, FloatRegister src, const Operand &dest) {
         MOZ_ASSERT(HasSSE41());
         switch (dest.kind()) {
           case Operand::REG:
-            masm.pextrd_irr(lane, src.code(), dest.reg());
+            masm.vpextrd_irr(lane, src.code(), dest.reg());
             break;
           case Operand::MEM_REG_DISP:
-            masm.pextrd_imr(lane, src.code(), dest.disp(), dest.base());
+            masm.vpextrd_irm(lane, src.code(), dest.disp(), dest.base());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
-    void psrldq(Imm32 shift, FloatRegister dest) {
+    void vpsrldq(Imm32 shift, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.psrldq_ir(shift.value, dest.code());
+        masm.vpsrldq_ir(shift.value, src0.code(), dest.code());
     }
-    void psllq(Imm32 shift, FloatRegister dest) {
+    void vpsllq(Imm32 shift, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.psllq_ir(shift.value, dest.code());
+        masm.vpsllq_ir(shift.value, src0.code(), dest.code());
     }
-    void psrlq(Imm32 shift, FloatRegister dest) {
+    void vpsrlq(Imm32 shift, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.psrlq_ir(shift.value, dest.code());
+        masm.vpsrlq_ir(shift.value, src0.code(), dest.code());
     }
     void vpslld(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vpslld_rr(src1.code(), src0.code(), dest.code());
     }
-    void pslld(Imm32 count, FloatRegister dest) {
+    void vpslld(Imm32 count, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.pslld_ir(count.value, dest.code());
+        masm.vpslld_ir(count.value, src0.code(), dest.code());
     }
     void vpsrad(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vpsrad_rr(src1.code(), src0.code(), dest.code());
     }
-    void psrad(Imm32 count, FloatRegister dest) {
+    void vpsrad(Imm32 count, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.psrad_ir(count.value, dest.code());
+        masm.vpsrad_ir(count.value, src0.code(), dest.code());
     }
     void vpsrld(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vpsrld_rr(src1.code(), src0.code(), dest.code());
     }
-    void psrld(Imm32 count, FloatRegister dest) {
+    void vpsrld(Imm32 count, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.psrld_ir(count.value, dest.code());
+        masm.vpsrld_ir(count.value, src0.code(), dest.code());
     }
 
     void vcvtsi2sd(const Operand &src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         switch (src1.kind()) {
           case Operand::REG:
             masm.vcvtsi2sd_rr(src1.reg(), src0.code(), dest.code());
             break;
@@ -2068,31 +2068,31 @@ class AssemblerX86Shared : public Assemb
           case Operand::MEM_ADDRESS32:
             masm.vpandn_mr(src1.address(), src0.code(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
 
-    void pshufd(uint32_t mask, FloatRegister src, FloatRegister dest) {
+    void vpshufd(uint32_t mask, FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.pshufd_irr(mask, src.code(), dest.code());
+        masm.vpshufd_irr(mask, src.code(), dest.code());
     }
-    void pshufd(uint32_t mask, const Operand &src, FloatRegister dest) {
+    void vpshufd(uint32_t mask, const Operand &src1, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        switch (src.kind()) {
+        switch (src1.kind()) {
           case Operand::FPREG:
-            masm.pshufd_irr(mask, src.fpu(), dest.code());
+            masm.vpshufd_irr(mask, src1.fpu(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
-            masm.pshufd_imr(mask, src.disp(), src.base(), dest.code());
+            masm.vpshufd_imr(mask, src1.disp(), src1.base(), dest.code());
             break;
           case Operand::MEM_ADDRESS32:
-            masm.pshufd_imr(mask, src.address(), dest.code());
+            masm.vpshufd_imr(mask, src1.address(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void vmovhlps(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vmovhlps_rr(src1.code(), src0.code(), dest.code());
@@ -2104,31 +2104,31 @@ class AssemblerX86Shared : public Assemb
     void vunpcklps(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vunpcklps_rr(src1.code(), src0.code(), dest.code());
     }
     void vunpckhps(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vunpckhps_rr(src1.code(), src0.code(), dest.code());
     }
-    void shufps(uint32_t mask, FloatRegister src, FloatRegister dest) {
+    void vshufps(uint32_t mask, FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        masm.shufps_irr(mask, src.code(), dest.code());
+        masm.vshufps_irr(mask, src1.code(), src0.code(), dest.code());
     }
-    void shufps(uint32_t mask, const Operand &src, FloatRegister dest) {
+    void vshufps(uint32_t mask, const Operand &src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
-        switch (src.kind()) {
+        switch (src1.kind()) {
           case Operand::FPREG:
-            masm.shufps_irr(mask, src.fpu(), dest.code());
+            masm.vshufps_irr(mask, src1.fpu(), src0.code(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
-            masm.shufps_imr(mask, src.disp(), src.base(), dest.code());
+            masm.vshufps_imr(mask, src1.disp(), src1.base(), src0.code(), dest.code());
             break;
           case Operand::MEM_ADDRESS32:
-            masm.shufps_imr(mask, src.address(), dest.code());
+            masm.vshufps_imr(mask, src1.address(), src0.code(), dest.code());
             break;
           default:
             MOZ_CRASH("unexpected operand kind");
         }
     }
     void vaddsd(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vaddsd_rr(src1.code(), src0.code(), dest.code());
@@ -2298,38 +2298,38 @@ class AssemblerX86Shared : public Assemb
     void vsqrtsd(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vsqrtsd_rr(src1.code(), src0.code(), dest.code());
     }
     void vsqrtss(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vsqrtss_rr(src1.code(), src0.code(), dest.code());
     }
-    void roundsd(X86Assembler::RoundingMode mode, FloatRegister src, FloatRegister dest) {
+    void vroundsd(X86Assembler::RoundingMode mode, FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE41());
-        masm.roundsd_rr(mode, src.code(), dest.code());
+        masm.vroundsd_irr(mode, src1.code(), src0.code(), dest.code());
     }
-    void roundss(X86Assembler::RoundingMode mode, FloatRegister src, FloatRegister dest) {
+    void vroundss(X86Assembler::RoundingMode mode, FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE41());
-        masm.roundss_rr(mode, src.code(), dest.code());
+        masm.vroundss_irr(mode, src1.code(), src0.code(), dest.code());
     }
-    unsigned insertpsMask(SimdLane sourceLane, SimdLane destLane, unsigned zeroMask = 0)
+    unsigned vinsertpsMask(SimdLane sourceLane, SimdLane destLane, unsigned zeroMask = 0)
     {
         // Note that the sourceLane bits are ignored in the case of a source
         // memory operand, and the source is the given 32-bits memory location.
         MOZ_ASSERT(zeroMask < 16);
         unsigned ret = zeroMask ;
         ret |= unsigned(destLane) << 4;
         ret |= unsigned(sourceLane) << 6;
         MOZ_ASSERT(ret < 256);
         return ret;
     }
-    void insertps(FloatRegister src, FloatRegister dest, unsigned mask) {
+    void vinsertps(uint32_t mask, FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE41());
-        masm.insertps_irr(mask, src.code(), dest.code());
+        masm.vinsertps_irr(mask, src1.code(), src0.code(), dest.code());
     }
     unsigned blendpsMask(bool x, bool y, bool z, bool w) {
         return x | (y << 1) | (z << 2) | (w << 3);
     }
     void vblendps(unsigned mask, FloatRegister src1, FloatRegister src0, FloatRegister dest) {
         MOZ_ASSERT(HasSSE41());
         masm.vblendps_irr(mask, src1.code(), src0.code(), dest.code());
     }
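
For reference when reading the vinsertpsMask callers further down, the helper (unchanged here apart from the rename) packs the insertps immediate as zero-mask in bits 0-3, destination lane in bits 4-5 and source lane in bits 6-7. A small worked example, assuming the usual lane numbering LaneX=0 .. LaneW=3:

    // vinsertpsMask(SimdLane::LaneY, SimdLane::LaneZ) == (2 << 4) | (1 << 6) == 0x60,
    // i.e. "take lane Y of the source, write it into lane Z of dest, zero nothing".
    masm.vinsertps(masm.vinsertpsMask(SimdLane::LaneY, SimdLane::LaneZ), rhs, out, out);
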
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@@ -327,16 +327,25 @@ private:
         PRE_SSE_F3                      = 0xF3,
         OP_HLT                          = 0xF4,
         OP_GROUP3_EbIb                  = 0xF6,
         OP_GROUP3_Ev                    = 0xF7,
         OP_GROUP3_EvIz                  = 0xF7, // OP_GROUP3_Ev has an immediate, when instruction is a test.
         OP_GROUP5_Ev                    = 0xFF
     };
 
+    enum ShiftID {
+        Shift_vpsrld = 2,
+        Shift_vpsrlq = 2,
+        Shift_vpsrldq = 3,
+        Shift_vpsrad = 4,
+        Shift_vpslld = 6,
+        Shift_vpsllq = 6
+    };
+
     enum TwoByteOpcodeID {
         OP2_UD2             = 0x0B,
         OP2_MOVSD_VsdWsd    = 0x10,
         OP2_MOVPS_VpsWps    = 0x10,
         OP2_MOVSD_WsdVsd    = 0x11,
         OP2_MOVPS_WpsVps    = 0x11,
         OP2_MOVHLPS_VqUq    = 0x12,
         OP2_MOVSLDUP_VpsWps = 0x12,
@@ -423,16 +432,17 @@ private:
     };
 
     // Test whether the given opcode should be printed with its operands reversed.
     static inline bool IsXMMReversedOperands(TwoByteOpcodeID opcode) {
         switch (opcode) {
           case OP2_MOVSD_WsdVsd: // also OP2_MOVPS_WpsVps
           case OP2_MOVAPS_WsdVsd:
           case OP2_MOVDQ_WdqVdq:
+          case OP3_PEXTRD_EdVdqIb:
             return true;
           default:
             break;
         }
         return false;
     }
 
     enum ThreeByteOpcodeID {
@@ -2750,28 +2760,25 @@ public:
     }
     void vpcmpgtd_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vpcmpgtd", VEX_PD, OP2_PCMPGTD_VdqWdq, address, src0, dst);
     }
 
     void vcmpps_rr(uint8_t order, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
-        twoByteOpSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, src1, src0, dst);
-        m_formatter.immediate8s(order);
+        twoByteOpImmSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, order, src1, src0, dst);
     }
     void vcmpps_mr(uint8_t order, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
-        twoByteOpSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, offset, base, src0, dst);
-        m_formatter.immediate8s(order);
+        twoByteOpImmSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, order, offset, base, src0, dst);
     }
     void vcmpps_mr(uint8_t order, const void* address, XMMRegisterID src0, XMMRegisterID dst)
     {
-        twoByteOpSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, address, src0, dst);
-        m_formatter.immediate8s(order);
+        twoByteOpImmSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, order, address, src0, dst);
     }
 
     void vrcpps_rr(XMMRegisterID src, XMMRegisterID dst) {
         twoByteOpSimd("vrcpps", VEX_PS, OP2_RCPPS_VpsWps, src, X86Registers::invalid_xmm, dst);
     }
     void vrcpps_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
         twoByteOpSimd("vrcpps", VEX_PS, OP2_RCPPS_VpsWps, offset, base, X86Registers::invalid_xmm, dst);
     }
@@ -2972,133 +2979,101 @@ public:
     {
         twoByteOpSimd("vpandn", VEX_PD, OP2_PANDNDQ_VdqWdq, offset, base, src0, dst);
     }
     void vpandn_mr(const void *address, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vpandn", VEX_PD, OP2_PANDNDQ_VdqWdq, address, src0, dst);
     }
 
-    void pshufd_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("pshufd     $0x%x, %s, %s", mask, nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, (RegisterID)src, (RegisterID)dst);
-        m_formatter.immediate8u(mask);
-    }
-
-    void pshufd_imr(uint32_t mask, int32_t offset, RegisterID base, XMMRegisterID dst)
-    {
-        MOZ_ASSERT(mask < 256);
-        spew("pshufd     $0x%x, " MEM_ob ", %s", mask, ADDR_ob(offset, base), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, offset, base, (RegisterID)dst);
-        m_formatter.immediate8u(mask);
-    }
-
-    void pshufd_imr(uint32_t mask, const void* address, XMMRegisterID dst)
-    {
-        spew("pshufd     $0x%x, %p, %s", mask, address, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, address, (RegisterID)dst);
-        m_formatter.immediate8u(mask);
-    }
-
-    void shufps_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("shufps     $0x%x, %s, %s", mask, nameFPReg(src), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, (RegisterID)src, (RegisterID)dst);
-        m_formatter.immediate8u(mask);
-    }
-
-    void shufps_imr(uint32_t mask, int32_t offset, RegisterID base, XMMRegisterID dst)
-    {
-        spew("shufps     $0x%x, " MEM_ob ", %s", mask, ADDR_ob(offset, base), nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, offset, base, (RegisterID)dst);
-        m_formatter.immediate8u(mask);
-    }
-
-    void shufps_imr(uint32_t mask, const void* address, XMMRegisterID dst)
-    {
-        spew("shufps     $0x%x, %p, %s", mask, address, nameFPReg(dst));
-        m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, address, (RegisterID)dst);
-        m_formatter.immediate8u(mask);
+    void vpshufd_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, src, X86Registers::invalid_xmm, dst);
+    }
+    void vpshufd_imr(uint32_t mask, int32_t offset, RegisterID base, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, offset, base, X86Registers::invalid_xmm, dst);
+    }
+    void vpshufd_imr(uint32_t mask, const void* address, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, address, X86Registers::invalid_xmm, dst);
+    }
+
+    void vshufps_irr(uint32_t mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, src1, src0, dst);
+    }
+    void vshufps_imr(uint32_t mask, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, offset, base, src0, dst);
+    }
+    void vshufps_imr(uint32_t mask, const void* address, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, address, src0, dst);
     }
 
     void vmovhlps_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovhlps", VEX_PS, OP2_MOVHLPS_VqUq, src1, src0, dst);
     }
 
     void vmovlhps_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vmovlhps", VEX_PS, OP2_MOVLHPS_VqUq, src1, src0, dst);
     }
 
-    void psrldq_ir(int shift, XMMRegisterID dest)
-    {
-        spew("psrldq     $%d, %s", shift, nameFPReg(dest));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PSRLDQ_Vd, (RegisterID)dest, (RegisterID)3);
-        m_formatter.immediate8s(shift);
-    }
-
-    void psllq_ir(int shift, XMMRegisterID dest)
-    {
-        spew("psllq      $%d, %s", shift, nameFPReg(dest));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PSRLDQ_Vd, (RegisterID)dest, (RegisterID)6);
-        m_formatter.immediate8s(shift);
-    }
-
-    void psrlq_ir(int shift, XMMRegisterID dest)
-    {
-        spew("psrlq      $%d, %s", shift, nameFPReg(dest));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PSRLDQ_Vd, (RegisterID)dest, (RegisterID)2);
-        m_formatter.immediate8s(shift);
+    void vpsrldq_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(count < 16);
+        shiftOpImmSimd("vpsrldq", OP2_PSRLDQ_Vd, Shift_vpsrldq, count, src, dst);
+    }
+
+    void vpsllq_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(count < 64);
+        shiftOpImmSimd("vpsllq", OP2_PSRLDQ_Vd, Shift_vpsllq, count, src, dst);
+    }
+
+    void vpsrlq_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(count < 64);
+        shiftOpImmSimd("vpsrlq", OP2_PSRLDQ_Vd, Shift_vpsrlq, count, src, dst);
     }
 
     void vpslld_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vpslld", VEX_PD, OP2_PSLLD_VdqWdq, src1, src0, dst);
     }
 
-    void pslld_ir(int32_t count, XMMRegisterID dst)
-    {
-        spew("pslld      $%d, %s", count, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PSLLD_UdqIb, (RegisterID)dst, (RegisterID)6);
-        m_formatter.immediate8s(int8_t(count));
+    void vpslld_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(count < 32);
+        shiftOpImmSimd("vpslld", OP2_PSLLD_UdqIb, Shift_vpslld, count, src, dst);
     }
 
     void vpsrad_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vpsrad", VEX_PD, OP2_PSRAD_VdqWdq, src1, src0, dst);
     }
 
-    void psrad_ir(int32_t count, XMMRegisterID dst)
-    {
-        spew("psrad      $%d, %s", count, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PSRAD_UdqIb, (RegisterID)dst, (RegisterID)4);
-        m_formatter.immediate8s(int8_t(count));
+    void vpsrad_ir(int32_t count, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(count < 32);
+        shiftOpImmSimd("vpsrad", OP2_PSRAD_UdqIb, Shift_vpsrad, count, src, dst);
     }
 
     void vpsrld_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vpsrld", VEX_PD, OP2_PSRLD_VdqWdq, src1, src0, dst);
     }
 
-    void psrld_ir(int32_t count, XMMRegisterID dst)
-    {
-        spew("psrld      $%d, %s", count, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PSRLD_UdqIb, (RegisterID)dst, (RegisterID)2);
-        m_formatter.immediate8s(int8_t(count));
+    void vpsrld_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(count < 32);
+        shiftOpImmSimd("vpsrld", OP2_PSRLD_UdqIb, Shift_vpsrld, count, src, dst);
     }
 
     void vmovmskpd_rr(XMMRegisterID src, RegisterID dst)
     {
         twoByteOpSimdInt32("vmovmskpd", VEX_PD, OP2_MOVMSKPD_EdVd, src, dst);
     }
 
     void vmovmskps_rr(XMMRegisterID src, RegisterID dst)
@@ -3427,22 +3402,20 @@ public:
         twoByteOpSimd("vmulsd", VEX_SD, OP2_MULSD_VsdWsd, offset, base, src0, dst);
     }
 
     void vmulss_mr(int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vmulss", VEX_SS, OP2_MULSD_VsdWsd, offset, base, src0, dst);
     }
 
-    void pextrw_irr(int whichWord, XMMRegisterID src, RegisterID dst)
-    {
-        FIXME_INSN_PRINTING;
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.twoByteOp(OP2_PEXTRW_GdUdIb, (RegisterID)src, (RegisterID)dst);
-        m_formatter.immediate8(whichWord);
+    void vpextrw_irr(uint32_t whichWord, XMMRegisterID src, RegisterID dst)
+    {
+        MOZ_ASSERT(whichWord < 8);
+        twoByteOpImmSimdInt32("vpextrw", VEX_PD, OP2_PEXTRW_GdUdIb, whichWord, src, dst);
     }
 
     void vsubsd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vsubsd", VEX_SD, OP2_SUBSD_VsdWsd, src1, src0, dst);
     }
 
     void vsubss_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
@@ -3575,90 +3548,70 @@ public:
         twoByteOpSimd("vsqrtsd", VEX_SD, OP2_SQRTSD_VsdWsd, src1, src0, dst);
     }
 
     void vsqrtss_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpSimd("vsqrtss", VEX_SS, OP2_SQRTSS_VssWss, src1, src0, dst);
     }
 
-    void roundsd_rr(RoundingMode mode, XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("roundsd    $%d, %s, %s", (int)mode, nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.threeByteOp(OP3_ROUNDSD_VsdWsd, ESCAPE_ROUNDSD, (RegisterID)src, (RegisterID)dst);
-        m_formatter.immediate8u(mode);
-    }
-
-    void roundss_rr(RoundingMode mode, XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("roundss    $%d, %s, %s", (int)mode, nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.threeByteOp(OP3_ROUNDSS_VsdWsd, ESCAPE_ROUNDSD, (RegisterID)src, (RegisterID)dst);
-        m_formatter.immediate8(mode); // modes are the same for roundsd and roundss
-    }
-
-    void insertps_irr(unsigned mask, XMMRegisterID src, XMMRegisterID dst)
-    {
-        spew("insertps   $0x%x, %s, %s", mask, nameFPReg(src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.threeByteOp(OP3_INSERTPS_VpsUps, ESCAPE_INSERTPS, (RegisterID)src, (RegisterID)dst);
-        m_formatter.immediate8u(mask);
-    }
-
-    void pinsrd_irr(unsigned lane, RegisterID src, XMMRegisterID dst)
+    void vroundsd_irr(RoundingMode mode, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        threeByteOpImmSimd("vroundsd", VEX_PD, OP3_ROUNDSD_VsdWsd, ESCAPE_ROUNDSD, mode, src1, src0, dst);
+    }
+
+    void vroundss_irr(RoundingMode mode, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        threeByteOpImmSimd("vroundss", VEX_PD, OP3_ROUNDSS_VsdWsd, ESCAPE_ROUNDSD, mode, src1, src0, dst);
+    }
+
+    void vinsertps_irr(uint32_t mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        threeByteOpImmSimd("vinsertps", VEX_PD, OP3_INSERTPS_VpsUps, ESCAPE_INSERTPS, mask, src1, src0, dst);
+    }
+
+    void vpinsrd_irr(unsigned lane, RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         MOZ_ASSERT(lane < 4);
-        spew("pinsrd     $0x%x, %s, %s", lane, nameIReg(4, src), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.threeByteOp(OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, (RegisterID)src, (RegisterID)dst);
-        m_formatter.immediate8u(lane);
-    }
-
-    void pinsrd_imr(unsigned lane, int32_t offset, RegisterID base, XMMRegisterID dst)
+        threeByteOpImmInt32Simd("vpinsrd", VEX_PD, OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, lane, src1, src0, dst);
+    }
+
+    void vpinsrd_imr(unsigned lane, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         MOZ_ASSERT(lane < 4);
-        spew("pinsrd     $0x%x, " MEM_ob ", %s", lane, ADDR_ob(offset, base), nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.threeByteOp(OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, offset, base, (RegisterID)dst);
-        m_formatter.immediate8u(lane);
-    }
-
-    void pextrd_irr(unsigned lane, XMMRegisterID src, RegisterID dst)
+        threeByteOpImmInt32Simd("vpinsrd", VEX_PD, OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, lane, offset, base, src0, dst);
+    }
+
+    void vpextrd_irr(unsigned lane, XMMRegisterID src, RegisterID dst)
     {
         MOZ_ASSERT(lane < 4);
-        spew("pextrd     $0x%x, %s, %s", lane, nameFPReg(src), nameIReg(4, dst));
-        m_formatter.prefix(PRE_SSE_66);
-        m_formatter.threeByteOp(OP3_PEXTRD_EdVdqIb, ESCAPE_PEXTRD, (RegisterID)dst, (RegisterID)src);
-        m_formatter.immediate8u(lane);
-    }
-
-    void pextrd_imr(unsigned lane, XMMRegisterID src, int32_t offset, RegisterID base)
+        threeByteOpImmSimdInt32("vpextrd", VEX_PD, OP3_PEXTRD_EdVdqIb, ESCAPE_PEXTRD, lane, (XMMRegisterID)dst, (RegisterID)src);
+    }
+
+    void vpextrd_irm(unsigned lane, XMMRegisterID src, int32_t offset, RegisterID base)
     {
         MOZ_ASSERT(lane < 4);
         spew("pextrd     $0x%x, %s, " MEM_ob, lane, nameFPReg(src), ADDR_ob(offset, base));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.threeByteOp(OP3_PEXTRD_EdVdqIb, ESCAPE_PEXTRD, offset, base, (RegisterID)src);
         m_formatter.immediate8u(lane);
     }
 
     void vblendps_irr(unsigned imm, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         MOZ_ASSERT(imm < 16);
         // Despite being a "ps" instruction, vblendps is encoded with the "pd" prefix.
-        threeByteOpSimd("vblendps", VEX_PD, OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, src1, src0, dst);
-        m_formatter.immediate8u(imm);
+        threeByteOpImmSimd("vblendps", VEX_PD, OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, imm, src1, src0, dst);
     }
 
     void vblendps_imr(unsigned imm, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         MOZ_ASSERT(imm < 16);
         // Despite being a "ps" instruction, vblendps is encoded with the "pd" prefix.
-        threeByteOpSimd("vblendps", VEX_PD, OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, offset, base, src0, dst);
-        m_formatter.immediate8u(imm);
+        threeByteOpImmSimd("vblendps", VEX_PD, OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, imm, offset, base, src0, dst);
     }
 
     void vblendvps_rr(XMMRegisterID mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) {
         vblendvOpSimd(mask, src1, src0, dst);
     }
     void vblendvps_mr(XMMRegisterID mask, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst) {
         vblendvOpSimd(mask, offset, base, src0, dst);
     }
@@ -4155,16 +4108,35 @@ private:
             else
                 spew("%-11s%s, %s", name, nameFPReg(rm), nameFPReg(dst));
         } else {
             spew("%-11s%s, %s, %s", name, nameFPReg(rm), nameFPReg(src0), nameFPReg(dst));
         }
         m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, src0, dst);
     }
 
+    void twoByteOpImmSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
+                          uint32_t imm, XMMRegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncoding(src0, dst)) {
+            spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, nameFPReg(rm), nameFPReg(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.twoByteOp(opcode, (RegisterID)rm, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        if (src0 == X86Registers::invalid_xmm)
+            spew("%-11s$0x%x, %s, %s", name, imm, nameFPReg(rm), nameFPReg(dst));
+        else
+            spew("%-11s$0x%x, %s, %s, %s", name, imm, nameFPReg(rm), nameFPReg(src0), nameFPReg(dst));
+        m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, src0, dst);
+        m_formatter.immediate8u(imm);
+    }
+
     void twoByteOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                        int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
             if (IsXMMReversedOperands(opcode)) {
                 spew("%-11s%s, " MEM_ob, legacySSEOpName(name),
                      nameFPReg(dst), ADDR_ob(offset, base));
             } else {
@@ -4210,16 +4182,34 @@ private:
             }
         } else {
             spew("%-11s" MEM_o32b ", %s, %s", name,
                  ADDR_o32b(offset, base), nameFPReg(src0), nameFPReg(dst));
         }
         m_formatter.twoByteOpVex_disp32(ty, opcode, offset, base, src0, dst);
     }
 
+    void twoByteOpImmSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
+                          uint32_t imm, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncoding(src0, dst)) {
+            spew("%-11s$0x%x, " MEM_ob ", %s", legacySSEOpName(name), imm,
+                 ADDR_ob(offset, base), nameFPReg(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.twoByteOp(opcode, offset, base, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        spew("%-11s$0x%x, " MEM_ob ", %s, %s", name, imm, ADDR_ob(offset, base),
+             nameFPReg(src0), nameFPReg(dst));
+        m_formatter.twoByteOpVex(ty, opcode, offset, base, src0, dst);
+        m_formatter.immediate8u(imm);
+    }
+
     void twoByteOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                        int32_t offset, RegisterID base, RegisterID index, int scale,
                        XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
             if (IsXMMReversedOperands(opcode)) {
                 spew("%-11s%s, " MEM_obs, legacySSEOpName(name),
                      nameFPReg(dst), ADDR_obs(offset, base, index, scale));
@@ -4266,16 +4256,32 @@ private:
             else
                 spew("%-11s%p, %s", name, address, nameFPReg(dst));
         } else {
             spew("%-11s%p, %s, %s", name, address, nameFPReg(src0), nameFPReg(dst));
         }
         m_formatter.twoByteOpVex(ty, opcode, address, src0, dst);
     }
 
+    void twoByteOpImmSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
+                          uint32_t imm, const void *address, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncoding(src0, dst)) {
+            spew("%-11s$0x%x, %p, %s", legacySSEOpName(name), imm, address, nameFPReg(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.twoByteOp(opcode, address, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        spew("%-11s$0x%x, %p, %s, %s", name, imm, address, nameFPReg(src0), nameFPReg(dst));
+        m_formatter.twoByteOpVex(ty, opcode, address, src0, dst);
+        m_formatter.immediate8u(imm);
+    }
+
     void twoByteOpInt32Simd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                             RegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
             if (IsXMMReversedOperands(opcode))
                 spew("%-11s%s, %s", legacySSEOpName(name), nameFPReg(dst), nameIReg(4, rm));
             else
                 spew("%-11s%s, %s", legacySSEOpName(name), nameIReg(4, rm), nameFPReg(dst));
@@ -4340,16 +4346,32 @@ private:
             spew("%-11s%s, %s", name, nameIReg(4, dst), nameFPReg(rm));
         else if (opcode == OP2_MOVD_EdVd)
             spew("%-11s%s, %s", name, nameFPReg((XMMRegisterID)dst), nameIReg(4, (RegisterID)rm));
         else
             spew("%-11s%s, %s", name, nameFPReg(rm), nameIReg(4, dst));
         m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, X86Registers::invalid_xmm, dst);
     }
 
+    void twoByteOpImmSimdInt32(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
+                               uint32_t imm, XMMRegisterID rm, RegisterID dst)
+    {
+        if (useLegacySSEEncodingForOtherOutput()) {
+            spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, nameFPReg(rm), nameIReg(4, dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.twoByteOp(opcode, (RegisterID)rm, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        spew("%-11s$0x%x, %s, %s", name, imm, nameFPReg(rm), nameIReg(4, dst));
+        m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, X86Registers::invalid_xmm, dst);
+        m_formatter.immediate8u(imm);
+    }
+
 #ifdef JS_CODEGEN_X64
     void twoByteOpSimdInt64(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                             XMMRegisterID rm, RegisterID dst)
     {
         if (useLegacySSEEncodingForOtherOutput()) {
             if (IsXMMReversedOperands(opcode))
                 spew("%-11s%s, %s", legacySSEOpName(name), nameIReg(dst), nameFPReg(rm));
             else if (opcode == OP2_MOVD_EdVd)
@@ -4411,16 +4433,33 @@ private:
             m_formatter.threeByteOp(opcode, escape, (RegisterID)rm, dst);
             return;
         }
 
         spew("%-11s%s, %s, %s", name, nameFPReg(rm), nameFPReg(src0), nameFPReg(dst));
         m_formatter.threeByteOpVex(ty, opcode, escape, (RegisterID)rm, src0, dst);
     }
 
+    void threeByteOpImmSimd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
+                            ThreeByteEscape escape,
+                            uint32_t imm, XMMRegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncoding(src0, dst)) {
+            spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, nameFPReg(rm), nameFPReg(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.threeByteOp(opcode, escape, (RegisterID)rm, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        spew("%-11s$0x%x, %s, %s, %s", name, imm, nameFPReg(rm), nameFPReg(src0), nameFPReg(dst));
+        m_formatter.threeByteOpVex(ty, opcode, escape, (RegisterID)rm, src0, dst);
+        m_formatter.immediate8u(imm);
+    }
+
     void threeByteOpSimd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
                          ThreeByteEscape escape,
                          int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
             spew("%-11s" MEM_ob ", %s", legacySSEOpName(name),
                  ADDR_ob(offset, base), nameFPReg(dst));
             m_formatter.legacySSEPrefix(ty);
@@ -4428,31 +4467,121 @@ private:
             return;
         }
 
         spew("%-11s" MEM_ob ", %s, %s", name,
              ADDR_ob(offset, base), nameFPReg(src0), nameFPReg(dst));
         m_formatter.threeByteOpVex(ty, opcode, escape, offset, base, src0, dst);
     }
 
+    void threeByteOpImmSimd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
+                            ThreeByteEscape escape,
+                            uint32_t imm, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncoding(src0, dst)) {
+            spew("%-11s$0x%x, " MEM_ob ", %s", legacySSEOpName(name), imm,
+                 ADDR_ob(offset, base), nameFPReg(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.threeByteOp(opcode, escape, offset, base, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        spew("%-11s$0x%x, " MEM_ob ", %s, %s", name, imm, ADDR_ob(offset, base),
+             nameFPReg(src0), nameFPReg(dst));
+        m_formatter.threeByteOpVex(ty, opcode, escape, offset, base, src0, dst);
+        m_formatter.immediate8u(imm);
+    }
+
     void threeByteOpSimd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
                          ThreeByteEscape escape,
                          const void *address, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncoding(src0, dst)) {
             spew("%-11s%p, %s", legacySSEOpName(name), address, nameFPReg(dst));
             m_formatter.legacySSEPrefix(ty);
             m_formatter.threeByteOp(opcode, escape, address, dst);
             return;
         }
 
         spew("%-11s%p, %s, %s", name, address, nameFPReg(src0), nameFPReg(dst));
         m_formatter.threeByteOpVex(ty, opcode, escape, address, src0, dst);
     }
 
+    void threeByteOpImmInt32Simd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
+                                 ThreeByteEscape escape, uint32_t imm,
+                                 RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncoding(src0, dst)) {
+            spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, nameIReg(4, src1), nameFPReg(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.threeByteOp(opcode, escape, src1, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        spew("%-11s$0x%x, %s, %s, %s", name, imm, nameIReg(4, src1), nameFPReg(src0), nameFPReg(dst));
+        m_formatter.threeByteOpVex(ty, opcode, escape, src1, src0, dst);
+        m_formatter.immediate8u(imm);
+    }
+
+    void threeByteOpImmInt32Simd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
+                                 ThreeByteEscape escape, uint32_t imm,
+                                 int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncoding(src0, dst)) {
+            spew("%-11s$0x%x, " MEM_ob ", %s", legacySSEOpName(name), imm, ADDR_ob(offset, base), nameFPReg(dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.threeByteOp(opcode, escape, offset, base, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        spew("%-11s$0x%x, " MEM_ob ", %s, %s", name, imm, ADDR_ob(offset, base), nameFPReg(src0), nameFPReg(dst));
+        m_formatter.threeByteOpVex(ty, opcode, escape, offset, base, src0, dst);
+        m_formatter.immediate8u(imm);
+    }
+
+    void threeByteOpImmSimdInt32(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
+                                 ThreeByteEscape escape, uint32_t imm,
+                                 XMMRegisterID src, RegisterID dst)
+    {
+        if (useLegacySSEEncodingForOtherOutput()) {
+            spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, nameFPReg(src), nameIReg(4, dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.threeByteOp(opcode, escape, (RegisterID)src, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        if (opcode == OP3_PEXTRD_EdVdqIb)
+            spew("%-11s$0x%x, %s, %s", name, imm, nameFPReg((XMMRegisterID)dst), nameIReg(4, (RegisterID)src));
+        else
+            spew("%-11s$0x%x, %s, %s", name, imm, nameFPReg(src), nameIReg(4, dst));
+        m_formatter.threeByteOpVex(ty, opcode, escape, (RegisterID)src, X86Registers::invalid_xmm, dst);
+        m_formatter.immediate8u(imm);
+    }
+
+    void threeByteOpImmSimdInt32(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
+                                 ThreeByteEscape escape, uint32_t imm,
+                                 int32_t offset, RegisterID base, RegisterID dst)
+    {
+        if (useLegacySSEEncodingForOtherOutput()) {
+            spew("%-11s$0x%x, " MEM_ob ", %s", legacySSEOpName(name), imm, ADDR_ob(offset, base), nameIReg(4, dst));
+            m_formatter.legacySSEPrefix(ty);
+            m_formatter.threeByteOp(opcode, escape, offset, base, dst);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        spew("%-11s$0x%x, " MEM_ob ", %s", name, imm, ADDR_ob(offset, base), nameIReg(4, dst));
+        m_formatter.threeByteOpVex(ty, opcode, escape, offset, base, X86Registers::invalid_xmm, dst);
+        m_formatter.immediate8u(imm);
+    }
+
     // Blendv is a three-byte op, but the VEX encoding has a different opcode
     // than the SSE encoding, so we handle it specially.
     void vblendvOpSimd(XMMRegisterID mask, XMMRegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
     {
         if (useLegacySSEEncodingForVblendv(mask, src0, dst)) {
             spew("blendvps   %s, %s", nameFPReg(rm), nameFPReg(dst));
             // Even though a "ps" instruction, vblendv is encoded with the "pd" prefix.
             m_formatter.legacySSEPrefix(VEX_PD);
@@ -4479,16 +4608,32 @@ private:
 
         spew("vblendvps  %s, " MEM_ob ", %s, %s",
              nameFPReg(mask), ADDR_ob(offset, base), nameFPReg(src0), nameFPReg(dst));
         // Even though a "ps" instruction, vblendv is encoded with the "pd" prefix.
         m_formatter.vblendvOpVex(VEX_PD, OP3_VBLENDVPS_VdqWdq, ESCAPE_VBLENDVPS,
                                  mask, offset, base, src0, dst);
     }
 
+    void shiftOpImmSimd(const char *name, TwoByteOpcodeID opcode, ShiftID shiftKind,
+                        uint32_t imm, XMMRegisterID src, XMMRegisterID dst)
+    {
+        if (useLegacySSEEncoding(src, dst)) {
+            spew("%-11s$%d, %s", legacySSEOpName(name), imm, nameFPReg(dst));
+            m_formatter.legacySSEPrefix(VEX_PD);
+            m_formatter.twoByteOp(opcode, (RegisterID)dst, (int)shiftKind);
+            m_formatter.immediate8u(imm);
+            return;
+        }
+
+        spew("%-11s$%d, %s, %s", name, imm, nameFPReg(src), nameFPReg(dst));
+        m_formatter.twoByteOpVex(VEX_PD, opcode, (RegisterID)dst, src, (int)shiftKind);
+        m_formatter.immediate8u(imm);
+    }
+
     static int32_t getInt32(void* where)
     {
         return reinterpret_cast<int32_t*>(where)[-1];
     }
 
     class X86InstructionFormatter {
 
         static const int maxInstructionSize = 16;
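
One encoding note on the new shiftOpImmSimd helper: the immediate-count packed shifts have no register operand in the ModRM reg field, so that field carries an opcode extension instead, and the ShiftID values added above are exactly those extensions (/2 for psrld and psrlq, /3 for psrldq, /4 for psrad, /6 for pslld and psllq). A sketch of the legacy-SSE bytes this produces for one call, with xmm3 chosen arbitrarily for illustration:

    // masm.vpsrld(Imm32(7), xmm3, xmm3) without AVX encodes PSRLD xmm3, 7 as
    //   66 0F 72 D3 07
    // where ModRM 0xD3 packs mod=11, reg=2 (Shift_vpsrld), rm=3 (xmm3).
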
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -1591,17 +1591,17 @@ CodeGeneratorX86Shared::visitFloor(LFloo
     Label bailout;
 
     if (AssemblerX86Shared::HasSSE41()) {
         // Bail on negative-zero.
         masm.branchNegativeZero(input, output, &bailout);
         bailoutFrom(&bailout, lir->snapshot());
 
         // Round toward -Infinity.
-        masm.roundsd(X86Assembler::RoundDown, input, scratch);
+        masm.vroundsd(X86Assembler::RoundDown, input, scratch, scratch);
 
         bailoutCvttsd2si(scratch, output, lir->snapshot());
     } else {
         Label negative, end;
 
         // Branch to a slow path for negative inputs. Doesn't catch NaN or -0.
         masm.zeroDouble(scratch);
         masm.branchDouble(Assembler::DoubleLessThan, input, scratch, &negative);
@@ -1648,17 +1648,17 @@ CodeGeneratorX86Shared::visitFloorF(LFlo
     Label bailout;
 
     if (AssemblerX86Shared::HasSSE41()) {
         // Bail on negative-zero.
         masm.branchNegativeZeroFloat32(input, output, &bailout);
         bailoutFrom(&bailout, lir->snapshot());
 
         // Round toward -Infinity.
-        masm.roundss(X86Assembler::RoundDown, input, scratch);
+        masm.vroundss(X86Assembler::RoundDown, input, scratch, scratch);
 
         bailoutCvttss2si(scratch, output, lir->snapshot());
     } else {
         Label negative, end;
 
         // Branch to a slow path for negative inputs. Doesn't catch NaN or -0.
         masm.zeroFloat32(scratch);
         masm.branchFloat(Assembler::DoubleLessThan, input, scratch, &negative);
@@ -1713,17 +1713,17 @@ CodeGeneratorX86Shared::visitCeil(LCeil 
     masm.vmovmskpd(input, output);
     masm.branchTest32(Assembler::NonZero, output, Imm32(1), &bailout);
     bailoutFrom(&bailout, lir->snapshot());
 
     if (AssemblerX86Shared::HasSSE41()) {
         // x <= -1 or x > -0
         masm.bind(&lessThanMinusOne);
         // Round toward +Infinity.
-        masm.roundsd(X86Assembler::RoundUp, input, scratch);
+        masm.vroundsd(X86Assembler::RoundUp, input, scratch, scratch);
         bailoutCvttsd2si(scratch, output, lir->snapshot());
         return;
     }
 
     // No SSE4.1
     Label end;
 
     // x >= 0 and x is not -0.0, we can truncate (resp. truncate and add 1) for
@@ -1765,17 +1765,17 @@ CodeGeneratorX86Shared::visitCeilF(LCeil
     masm.vmovmskps(input, output);
     masm.branchTest32(Assembler::NonZero, output, Imm32(1), &bailout);
     bailoutFrom(&bailout, lir->snapshot());
 
     if (AssemblerX86Shared::HasSSE41()) {
         // x <= -1 or x > -0
         masm.bind(&lessThanMinusOne);
         // Round toward +Infinity.
-        masm.roundss(X86Assembler::RoundUp, input, scratch);
+        masm.vroundss(X86Assembler::RoundUp, input, scratch, scratch);
         bailoutCvttss2si(scratch, output, lir->snapshot());
         return;
     }
 
     // No SSE4.1
     Label end;
 
     // x >= 0 and x is not -0.0, we can truncate (resp. truncate and add 1) for
@@ -1840,17 +1840,17 @@ CodeGeneratorX86Shared::visitRound(LRoun
     // Input is negative.
     masm.bind(&negative);
     masm.loadConstantDouble(0.5, temp);
 
     if (AssemblerX86Shared::HasSSE41()) {
         // Add 0.5 and round toward -Infinity. The result is stored in the temp
         // register (currently contains 0.5).
         masm.addDouble(input, temp);
-        masm.roundsd(X86Assembler::RoundDown, temp, scratch);
+        masm.vroundsd(X86Assembler::RoundDown, temp, scratch, scratch);
 
         // Truncate.
         bailoutCvttsd2si(scratch, output, lir->snapshot());
 
         // If the result is positive zero, then the actual result is -0. Bail.
         // Otherwise, the truncation will have produced the correct negative integer.
         masm.test32(output, output);
         bailoutIf(Assembler::Zero, lir->snapshot());
@@ -1923,17 +1923,17 @@ CodeGeneratorX86Shared::visitRoundF(LRou
     // Input is negative.
     masm.bind(&negative);
     masm.loadConstantFloat32(0.5f, temp);
 
     if (AssemblerX86Shared::HasSSE41()) {
         // Add 0.5 and round toward -Infinity. The result is stored in the temp
         // register (currently contains 0.5).
         masm.addFloat32(input, temp);
-        masm.roundss(X86Assembler::RoundDown, temp, scratch);
+        masm.vroundss(X86Assembler::RoundDown, temp, scratch, scratch);
 
         // Truncate.
         bailoutCvttss2si(scratch, output, lir->snapshot());
 
         // If the result is positive zero, then the actual result is -0. Bail.
         // Otherwise, the truncation will have produced the correct negative integer.
         masm.test32(output, output);
         bailoutIf(Assembler::Zero, lir->snapshot());
@@ -2088,17 +2088,17 @@ CodeGeneratorX86Shared::visitSimdValueIn
 {
     MOZ_ASSERT(ins->mir()->type() == MIRType_Int32x4);
 
     FloatRegister output = ToFloatRegister(ins->output());
     if (AssemblerX86Shared::HasSSE41()) {
         masm.vmovd(ToRegister(ins->getOperand(0)), output);
         for (size_t i = 1; i < 4; ++i) {
             Register r = ToRegister(ins->getOperand(i));
-            masm.pinsrd(i, r, output);
+            masm.vpinsrd(i, r, output, output);
         }
         return;
     }
 
     masm.reserveStack(Simd128DataSize);
     for (size_t i = 0; i < 4; ++i) {
         Register r = ToRegister(ins->getOperand(i));
         masm.store32(r, Address(StackPointer, i * sizeof(int32_t)));
@@ -2135,24 +2135,24 @@ CodeGeneratorX86Shared::visitSimdSplatX4
     MSimdSplatX4 *mir = ins->mir();
     MOZ_ASSERT(IsSimdType(mir->type()));
     JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
 
     switch (mir->type()) {
       case MIRType_Int32x4: {
         Register r = ToRegister(ins->getOperand(0));
         masm.vmovd(r, output);
-        masm.pshufd(0, output, output);
+        masm.vpshufd(0, output, output);
         break;
       }
       case MIRType_Float32x4: {
         FloatRegister r = ToFloatRegister(ins->getOperand(0));
         if (r != output)
             masm.moveFloat32x4(r, output);
-        masm.shufps(0, output, output);
+        masm.vshufps(0, output, output, output);
         break;
       }
       default:
         MOZ_CRASH("Unknown SIMD kind");
     }
 }
 
 void
@@ -2161,17 +2161,17 @@ CodeGeneratorX86Shared::visitSimdExtract
     FloatRegister input = ToFloatRegister(ins->input());
     Register output = ToRegister(ins->output());
 
     SimdLane lane = ins->lane();
     if (lane == LaneX) {
         // The value we want to extract is in the low double-word
         masm.moveLowInt32(input, output);
     } else if (AssemblerX86Shared::HasSSE41()) {
-        masm.pextrd(lane, input, output);
+        masm.vpextrd(lane, input, output);
     } else {
         uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
         masm.shuffleInt32(mask, input, ScratchSimdReg);
         masm.moveLowInt32(ScratchSimdReg, output);
     }
 }
 
 void
@@ -2203,17 +2203,18 @@ CodeGeneratorX86Shared::visitSimdInsertE
     MOZ_ASSERT(vector == output); // defineReuseInput(0)
 
     unsigned component = unsigned(ins->lane());
 
     // Note that, contrarily to float32x4, we cannot use vmovd if the inserted
     // value goes into the first component, as vmovd clears out the higher lanes
     // of the output.
     if (AssemblerX86Shared::HasSSE41()) {
-        masm.pinsrd(component, value, output);
+        // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
+        masm.vpinsrd(component, value, vector, output);
         return;
     }
 
     masm.reserveStack(Simd128DataSize);
     masm.storeAlignedInt32x4(vector, Address(StackPointer, 0));
     masm.store32(value, Address(StackPointer, component * sizeof(int32_t)));
     masm.loadAlignedInt32x4(Address(StackPointer, 0), output);
     masm.freeStack(Simd128DataSize);
@@ -2232,17 +2233,17 @@ CodeGeneratorX86Shared::visitSimdInsertE
         // of the destination operand.
         if (value != output)
             masm.vmovss(value, vector, output);
         return;
     }
 
     if (AssemblerX86Shared::HasSSE41()) {
         // The input value is in the low float32 of the 'value' FloatRegister.
-        masm.insertps(value, output, masm.insertpsMask(SimdLane::LaneX, ins->lane()));
+        masm.vinsertps(masm.vinsertpsMask(SimdLane::LaneX, ins->lane()), value, output, output);
         return;
     }
 
     unsigned component = unsigned(ins->lane());
     masm.reserveStack(Simd128DataSize);
     masm.storeAlignedFloat32x4(vector, Address(StackPointer, 0));
     masm.storeFloat32(value, Address(StackPointer, component * sizeof(int32_t)));
     masm.loadAlignedFloat32x4(Address(StackPointer, 0), output);
@@ -2338,17 +2339,17 @@ CodeGeneratorX86Shared::visitSimdShuffle
     uint32_t y = ins->laneY();
     uint32_t z = ins->laneZ();
     uint32_t w = ins->laneW();
 
     // Check that lanes come from LHS in majority:
     unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
     MOZ_ASSERT(numLanesFromLHS >= 2);
 
-    // When reading this method, remember that shufps takes the two first
+    // When reading this method, remember that vshufps takes the two first
     // inputs of the destination operand (right operand) and the two last
     // inputs of the source operand (left operand).
     //
     // Legend for explanations:
     // - L: LHS
     // - R: RHS
     // - T: temporary
 
@@ -2371,17 +2372,17 @@ CodeGeneratorX86Shared::visitSimdShuffle
         unsigned firstMask = -1, secondMask = -1;
 
         // register-register vmovss preserves the high lanes.
         if (ins->lanesMatch(4, 1, 2, 3)) {
             masm.vmovss(rhs, lhs, out);
             return;
         }
 
-        // SSE4.1 insertps can handle any single element.
+        // SSE4.1 vinsertps can handle any single element.
         unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
         if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
             SimdLane srcLane;
             SimdLane dstLane;
             if (x >= 4) {
                 srcLane = SimdLane(x - 4);
                 dstLane = LaneX;
             } else if (y >= 4) {
@@ -2390,62 +2391,62 @@ CodeGeneratorX86Shared::visitSimdShuffle
             } else if (z >= 4) {
                 srcLane = SimdLane(z - 4);
                 dstLane = LaneZ;
             } else {
                 MOZ_ASSERT(w >= 4);
                 srcLane = SimdLane(w - 4);
                 dstLane = LaneW;
             }
-            masm.insertps(rhs, out, masm.insertpsMask(srcLane, dstLane));
+            masm.vinsertps(masm.vinsertpsMask(srcLane, dstLane), rhs, out, out);
             return;
         }
 
         FloatRegister rhsCopy = ToFloatRegister(ins->temp());
 
         if (x < 4 && y < 4) {
             if (w >= 4) {
                 w %= 4;
-                // T = (Rw Rw Lz Lz) = shufps(firstMask, lhs, rhs)
+                // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhs)
                 firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
-                // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = shufps(secondMask, T, lhs)
+                // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, lhs)
                 secondMask = MacroAssembler::ComputeShuffleMask(x, y, LaneZ, LaneX);
             } else {
                 MOZ_ASSERT(z >= 4);
                 z %= 4;
-                // T = (Rz Rz Lw Lw) = shufps(firstMask, lhs, rhs)
+                // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhs)
                 firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
-                // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = shufps(secondMask, T, lhs)
+                // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, lhs)
                 secondMask = MacroAssembler::ComputeShuffleMask(x, y, LaneX, LaneZ);
             }
 
-            masm.shufps(firstMask, lhs, rhsCopy);
-            masm.shufps(secondMask, rhsCopy, lhs);
+            masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy);
+            masm.vshufps(secondMask, rhsCopy, lhs, lhs);
             return;
         }
 
         MOZ_ASSERT(z < 4 && w < 4);
 
         if (y >= 4) {
             y %= 4;
-            // T = (Ry Ry Lx Lx) = shufps(firstMask, lhs, rhs)
+            // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhs)
             firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
-            // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = shufps(secondMask, lhs, T)
+            // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, T)
             secondMask = MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, z, w);
         } else {
             MOZ_ASSERT(x >= 4);
             x %= 4;
-            // T = (Rx Rx Ly Ly) = shufps(firstMask, lhs, rhs)
+            // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhs)
             firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
-            // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = shufps(secondMask, lhs, T)
+            // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, T)
             secondMask = MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, z, w);
         }
 
-        masm.shufps(firstMask, lhs, rhsCopy);
-        masm.shufps(secondMask, lhs, rhsCopy);
+        masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy);
+        masm.vshufps(secondMask, lhs, rhsCopy, rhsCopy);
         masm.moveFloat32x4(rhsCopy, out);
         return;
     }
 
     // Two elements from one vector, two other elements from the other
     MOZ_ASSERT(numLanesFromLHS == 2);
 
     // TODO Here and below, symmetric case would be more handy to avoid a move,
@@ -2495,27 +2496,27 @@ CodeGeneratorX86Shared::visitSimdShuffle
         } else {
             masm.moveFloat32x4(rhs, ScratchSimdReg);
             masm.vunpckhps(lhs, ScratchSimdReg, ScratchSimdReg);
             masm.moveFloat32x4(ScratchSimdReg, out);
         }
         return;
     }
 
-    // In one shufps
+    // In one vshufps
     if (x < 4 && y < 4) {
         mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
-        masm.shufps(mask, rhs, out);
+        masm.vshufps(mask, rhs, out, out);
         return;
     }
 
     // At creation, we should have explicitly swapped in this case.
     MOZ_ASSERT(!(z >= 4 && w >= 4));
 
-    // In two shufps, for the most generic case:
+    // In two vshufps, for the most generic case:
     uint32_t firstMask[4], secondMask[4];
     unsigned i = 0, j = 2, k = 0;
 
 #define COMPUTE_MASK(lane)       \
     if (lane >= 4) {             \
         firstMask[j] = lane % 4; \
         secondMask[k++] = j++;   \
     } else {                     \
@@ -2528,21 +2529,21 @@ CodeGeneratorX86Shared::visitSimdShuffle
     COMPUTE_MASK(z)
     COMPUTE_MASK(w)
 #undef COMPUTE_MASK
 
     MOZ_ASSERT(i == 2 && j == 4 && k == 4);
 
     mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
                                               firstMask[2], firstMask[3]);
-    masm.shufps(mask, rhs, lhs);
+    masm.vshufps(mask, rhs, lhs, lhs);
 
     mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
                                               secondMask[2], secondMask[3]);
-    masm.shufps(mask, lhs, lhs);
+    masm.vshufps(mask, lhs, lhs, lhs);
 }
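
As a worked instance of this generic path (lane numbers 0-3 from LHS, 4-7 from RHS, and assuming the elided else branch of COMPUTE_MASK records the LHS lane in firstMask and its position in secondMask): for the request (x, y, z, w) = (0, 5, 2, 6), COMPUTE_MASK yields firstMask = (0, 2, 1, 2) and secondMask = (0, 2, 1, 3); the first vshufps leaves lhs holding lanes (0, 2, 5, 6), and the second permutes that in place to (0, 5, 2, 6), the requested shuffle.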
 
 void
 CodeGeneratorX86Shared::visitSimdBinaryCompIx4(LSimdBinaryCompIx4 *ins)
 {
     static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
 
     FloatRegister lhs = ToFloatRegister(ins->lhs());
@@ -2648,24 +2649,24 @@ CodeGeneratorX86Shared::visitSimdBinaryA
             return;
         }
 
         masm.loadAlignedInt32x4(rhs, ScratchSimdReg);
         masm.vpmuludq(lhs, ScratchSimdReg, ScratchSimdReg);
         // ScratchSimdReg contains (Rx, _, Rz, _) where R is the resulting vector.
 
         FloatRegister temp = ToFloatRegister(ins->temp());
-        masm.pshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), lhs, lhs);
-        masm.pshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), rhs, temp);
+        masm.vpshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), lhs, lhs);
+        masm.vpshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), rhs, temp);
         masm.vpmuludq(temp, lhs, lhs);
         // lhs contains (Ry, _, Rw, _) where R is the resulting vector.
 
-        masm.shufps(MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, LaneX, LaneZ), ScratchSimdReg, lhs);
+        masm.vshufps(MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, LaneX, LaneZ), ScratchSimdReg, lhs, lhs);
         // lhs contains (Ry, Rw, Rx, Rz)
-        masm.shufps(MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, LaneW, LaneY), lhs, lhs);
+        masm.vshufps(MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, LaneW, LaneY), lhs, lhs, lhs);
         return;
       }
       case MSimdBinaryArith::Div:
         // x86 doesn't have SIMD i32 div.
         break;
       case MSimdBinaryArith::Max:
         // we can do max with a single instruction only if we have SSE4.1
         // using the PMAXSD instruction.
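
A scalar sketch of the vpmuludq-based int32x4 multiply in the Mul case above (assuming standard vpmuludq semantics, i.e. it multiplies lanes 0 and 2 into 64-bit products): the odd lanes are first brought into even positions with vpshufd and multiplied the same way, then the low 32 bits of each product are re-interleaved by the two trailing vshufps.

    #include <array>
    #include <cstdint>

    using Int32x4 = std::array<uint32_t, 4>;

    // Model of the SSE2 lowering: two vpmuludq rounds plus a recombine.
    static Int32x4 mulInt32x4Model(const Int32x4 &a, const Int32x4 &b)
    {
        // vpmuludq(a, b): 64-bit products of lanes 0 and 2.
        uint64_t e0 = uint64_t(a[0]) * b[0];
        uint64_t e2 = uint64_t(a[2]) * b[2];
        // vpshufd(Y, Y, W, W) on both inputs, then vpmuludq: lanes 1 and 3.
        uint64_t o1 = uint64_t(a[1]) * b[1];
        uint64_t o3 = uint64_t(a[3]) * b[3];
        // The two vshufps keep only the low 32 bits and restore lane order.
        return { uint32_t(e0), uint32_t(o1), uint32_t(e2), uint32_t(o3) };
    }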
@@ -2899,17 +2900,29 @@ CodeGeneratorX86Shared::visitSimdShift(L
     // TODO: If the shift count is greater than 31, this will just zero all
     // lanes by default for lsh and ursh, and set the count to 32 for rsh
     // (which will just extend the sign bit to all bits). Plain JS doesn't do
     // this: instead it only keeps the five low bits of the mask. Spec isn't
     // clear about that topic so this might need to be fixed. See also bug
     // 1068028.
     const LAllocation *val = ins->value();
     if (val->isConstant()) {
-        Imm32 count(ToInt32(val));
+        int32_t c = ToInt32(val);
+        if (c > 31) {
+            switch (ins->operation()) {
+              case MSimdShift::lsh:
+              case MSimdShift::ursh:
+                masm.zeroInt32x4(out);
+                return;
+              default:
+                c = 31;
+                break;
+            }
+        }
+        Imm32 count(c);
         switch (ins->operation()) {
           case MSimdShift::lsh:
             masm.packedLeftShiftByScalar(count, out);
             return;
           case MSimdShift::rsh:
             masm.packedRightShiftByScalar(count, out);
             return;
           case MSimdShift::ursh:
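
A scalar sketch of the per-lane semantics implemented by the constant-count handling added above (assuming a non-negative count, int32x4 lanes, and an arithmetic right shift for rsh): logical shifts by more than 31 produce zero, while the arithmetic right shift behaves as a shift by 31, leaving only the sign.

    #include <cstdint>

    // Per-lane model of the constant shift paths (names are illustrative only).
    static int32_t laneLsh(int32_t lane, uint32_t count)  { return count > 31 ? 0 : int32_t(uint32_t(lane) << count); }
    static int32_t laneUrsh(int32_t lane, uint32_t count) { return count > 31 ? 0 : int32_t(uint32_t(lane) >> count); }
    static int32_t laneRsh(int32_t lane, uint32_t count)  { return lane >> (count > 31 ? 31 : count); }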
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@@ -784,24 +784,24 @@ class MacroAssemblerX86Shared : public A
         vxorpd(reg, reg, reg);
     }
     void zeroFloat32(FloatRegister reg) {
         vxorps(reg, reg, reg);
     }
     void negateDouble(FloatRegister reg) {
         // From MacroAssemblerX86Shared::maybeInlineDouble
         vpcmpeqw(ScratchDoubleReg, ScratchDoubleReg, ScratchDoubleReg);
-        psllq(Imm32(63), ScratchDoubleReg);
+        vpsllq(Imm32(63), ScratchDoubleReg, ScratchDoubleReg);
 
         // XOR the float in a float register with -0.0.
         vxorpd(ScratchDoubleReg, reg, reg); // s ^ 0x8000000000000000
     }
     void negateFloat(FloatRegister reg) {
         vpcmpeqw(ScratchFloat32Reg, ScratchFloat32Reg, ScratchFloat32Reg);
-        psllq(Imm32(31), ScratchFloat32Reg);
+        vpsllq(Imm32(31), ScratchFloat32Reg, ScratchFloat32Reg);
 
         // XOR the float in a float register with -0.0.
         vxorps(ScratchFloat32Reg, reg, reg); // s ^ 0x80000000
     }
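
Both negate helpers above build a sign-bit mask from an all-ones register; a scalar sketch of the same computation for the double case (assuming IEEE-754 layout, helper name illustrative):

    #include <cstdint>
    #include <cstring>

    // vpcmpeqw gives all ones; vpsllq(63) leaves only bit 63; vxorpd flips it.
    static double negateViaSignBit(double d)
    {
        uint64_t bits;
        std::memcpy(&bits, &d, sizeof(bits));
        bits ^= ~uint64_t(0) << 63;    // 0x8000000000000000
        std::memcpy(&d, &bits, sizeof(d));
        return d;
    }

negateFloat uses the same idea: after the shift by 31, bit 31 of the low lane is set, which is the float sign bit (the higher bits are don't-care for a scalar float).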
     void addDouble(FloatRegister src, FloatRegister dest) {
         vaddsd(src, dest, dest);
     }
     void subDouble(FloatRegister src, FloatRegister dest) {
@@ -917,29 +917,29 @@ class MacroAssemblerX86Shared : public A
     void packedSqrtFloat32x4(const Operand &src, FloatRegister dest) {
         vsqrtps(src, dest);
     }
 
     void packedLeftShiftByScalar(FloatRegister src, FloatRegister dest) {
         vpslld(src, dest, dest);
     }
     void packedLeftShiftByScalar(Imm32 count, FloatRegister dest) {
-        pslld(count, dest);
+        vpslld(count, dest, dest);
     }
     void packedRightShiftByScalar(FloatRegister src, FloatRegister dest) {
         vpsrad(src, dest, dest);
     }
     void packedRightShiftByScalar(Imm32 count, FloatRegister dest) {
-        psrad(count, dest);
+        vpsrad(count, dest, dest);
     }
     void packedUnsignedRightShiftByScalar(FloatRegister src, FloatRegister dest) {
         vpsrld(src, dest, dest);
     }
     void packedUnsignedRightShiftByScalar(Imm32 count, FloatRegister dest) {
-        psrld(count, dest);
+        vpsrld(count, dest, dest);
     }
 
     void loadAlignedFloat32x4(const Address &src, FloatRegister dest) {
         vmovaps(Operand(src), dest);
     }
     void loadAlignedFloat32x4(const Operand &src, FloatRegister dest) {
         vmovaps(src, dest);
     }
@@ -991,39 +991,39 @@ class MacroAssemblerX86Shared : public A
     {
         MOZ_ASSERT(x < 4 && y < 4 && z < 4 && w < 4);
         uint32_t r = (w << 6) | (z << 4) | (y << 2) | (x << 0);
         MOZ_ASSERT(r < 256);
         return r;
     }
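
For example, the (LaneX, LaneZ, LaneX, LaneZ) mask used in visitSimdBinaryArithIx4 above works out to (2 << 6) | (0 << 4) | (2 << 2) | 0 = 0x88, and (LaneZ, LaneX, LaneW, LaneY) to (1 << 6) | (3 << 4) | (0 << 2) | 2 = 0x72.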
 
     void shuffleInt32(uint32_t mask, FloatRegister src, FloatRegister dest) {
-        pshufd(mask, src, dest);
+        vpshufd(mask, src, dest);
     }
     void moveLowInt32(FloatRegister src, Register dest) {
         vmovd(src, dest);
     }
 
     void moveHighPairToLowPairFloat32(FloatRegister src, FloatRegister dest) {
         vmovhlps(src, dest, dest);
     }
     void shuffleFloat32(uint32_t mask, FloatRegister src, FloatRegister dest) {
         // The shuffle instruction on x86 is such that it moves 2 words from
         // the dest and 2 words from the src operands. To simplify things, just
         // clobber the output with the input and apply the instruction
         // afterwards.
         // Note: this is useAtStart-safe because src isn't read afterwards.
         if (src != dest)
             moveFloat32x4(src, dest);
-        shufps(mask, dest, dest);
+        vshufps(mask, dest, dest, dest);
     }
     void shuffleMix(uint32_t mask, const Operand &src, FloatRegister dest) {
-        // Note this uses shufps, which is a cross-domain penaly on CPU where it
+        // Note this uses vshufps, which is a cross-domain penalty on CPUs where it
         // applies, but that's the way clang and gcc do it.
-        shufps(mask, src, dest);
+        vshufps(mask, src, dest, dest);
     }
 
     void moveFloatAsDouble(Register src, FloatRegister dest) {
         vmovd(src, dest);
         vcvtss2sd(dest, dest, dest);
     }
     void loadFloatAsDouble(const Address &src, FloatRegister dest) {
         vmovss(src, dest);
@@ -1129,17 +1129,17 @@ class MacroAssemblerX86Shared : public A
 
         // Loading zero with xor is specially optimized in hardware.
         if (u == 0) {
             zeroDouble(dest);
             return true;
         }
 
         // It is also possible to load several common constants using vpcmpeqw
-        // to get all ones and then psllq and psrlq to get zeros at the ends,
+        // to get all ones and then vpsllq and vpsrlq to get zeros at the ends,
         // as described in "13.4 Generating constants" of
         // "2. Optimizing subroutines in assembly language" by Agner Fog, and as
         // previously implemented here. However, with x86 and x64 both using
         // constant pool loads for double constants, this is probably only
         // worthwhile in cases where a load is likely to be delayed.
 
         return false;
     }
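
As a sketch of the constant-generation approach that comment refers to (illustrative only; the current code prefers constant-pool loads), shifting an all-ones register from both ends can produce, for instance, the bit pattern of the double 1.0:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
        uint64_t ones = ~uint64_t(0);          // vpcmpeqw: all ones
        uint64_t bits = (ones >> 54) << 52;    // vpsrlq(54) then vpsllq(52)
        double d;
        std::memcpy(&d, &bits, sizeof(d));
        assert(d == 1.0);                      // 0x3FF0000000000000
        return 0;
    }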
--- a/js/src/jit/x86/MacroAssembler-x86.h
+++ b/js/src/jit/x86/MacroAssembler-x86.h
@@ -863,20 +863,20 @@ class MacroAssemblerX86 : public MacroAs
         MOZ_ASSERT(cond == Equal || cond == NotEqual);
         branchTestValue(cond, val, MagicValue(why), label);
     }
 
     // Note: this function clobbers the source register.
     void boxDouble(FloatRegister src, const ValueOperand &dest) {
         if (Assembler::HasSSE41()) {
             vmovd(src, dest.payloadReg());
-            pextrd(1, src, dest.typeReg());
+            vpextrd(1, src, dest.typeReg());
         } else {
             vmovd(src, dest.payloadReg());
-            psrldq(Imm32(4), src);
+            vpsrldq(Imm32(4), src, src);
             vmovd(src, dest.typeReg());
         }
     }
     void boxNonDouble(JSValueType type, Register src, const ValueOperand &dest) {
         if (src != dest.payloadReg())
             movl(src, dest.payloadReg());
         movl(ImmType(type), dest.typeReg());
     }
@@ -900,31 +900,31 @@ class MacroAssemblerX86 : public MacroAs
     void unboxObject(const Address &src, Register dest) { unboxNonDouble(src, dest); }
     void unboxDouble(const Address &src, FloatRegister dest) {
         loadDouble(Operand(src), dest);
     }
     void unboxDouble(const ValueOperand &src, FloatRegister dest) {
         MOZ_ASSERT(dest != ScratchDoubleReg);
         if (Assembler::HasSSE41()) {
             vmovd(src.payloadReg(), dest);
-            pinsrd(1, src.typeReg(), dest);
+            vpinsrd(1, src.typeReg(), dest, dest);
         } else {
             vmovd(src.payloadReg(), dest);
             vmovd(src.typeReg(), ScratchDoubleReg);
             vunpcklps(ScratchDoubleReg, dest, dest);
         }
     }
     void unboxDouble(const Operand &payload, const Operand &type,
                      Register scratch, FloatRegister dest) {
         MOZ_ASSERT(dest != ScratchDoubleReg);
         if (Assembler::HasSSE41()) {
             movl(payload, scratch);
             vmovd(scratch, dest);
             movl(type, scratch);
-            pinsrd(1, scratch, dest);
+            vpinsrd(1, scratch, dest, dest);
         } else {
             movl(payload, scratch);
             vmovd(scratch, dest);
             movl(type, scratch);
             vmovd(scratch, ScratchDoubleReg);
             vunpcklps(ScratchDoubleReg, dest, dest);
         }
     }
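
Both paths above rely on the 32-bit value layout in which a boxed double is simply its raw 64 bits split across the payload and type words; a scalar sketch of that round trip (names illustrative, not SpiderMonkey code):

    #include <cstdint>
    #include <cstring>

    struct BoxedDouble { uint32_t payload; uint32_t type; };

    // boxDouble: vmovd writes the low word, vpextrd(1) (or vpsrldq + vmovd) the high word.
    static BoxedDouble boxDoubleModel(double d)
    {
        uint64_t bits;
        std::memcpy(&bits, &d, sizeof(bits));
        return { uint32_t(bits), uint32_t(bits >> 32) };
    }

    // unboxDouble: vmovd loads the low word, vpinsrd(1) (or vunpcklps) restores the high word.
    static double unboxDoubleModel(const BoxedDouble &v)
    {
        uint64_t bits = (uint64_t(v.type) << 32) | v.payload;
        double d;
        std::memcpy(&d, &bits, sizeof(d));
        return d;
    }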