Bug 1639517 - wasm ion simd: optimize shifts. r=jseward
author: Lars T Hansen <lhansen@mozilla.com>
date: Wed, 27 May 2020 08:37:41 +0000
changeset: 532337:7a6cac6f591da0cff8d40116d0e3181e231c40f4
parent: 532336:2d1bf65618adb77372f6b14506a5bfbcce2c8e4e
child: 532338:e230153536a602f4341ce5100c8d6a71763732a9
push id: 37454
push user: ccoroiu@mozilla.com
push date: Wed, 27 May 2020 16:14:31 +0000
treeherder: mozilla-central@a1dd9afbfdf5
reviewers: jseward
bugs: 1639517
milestone: 78.0a1
Bug 1639517 - wasm ion simd: optimize shifts. r=jseward

Recognize and optimize constant-width SIMD shifts. This removes shift count masking in the cases where the chip has instructions for the shift, and generally improves code generation in the cases where the shift must be emulated (sometimes radically).

Differential Revision: https://phabricator.services.mozilla.com/D76130
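As an illustration of what the new lowering decision looks like from the outside (this sketch is not part of the patch; it assumes a DEBUG SpiderMonkey shell where wasmTextToBinary() and the wasmSimdAnalysis() testing hook are available, and it mirrors the checks added to ion-analysis.js below):

    // Helper matching the wasmCompile() library function in ion-analysis.js.
    function wasmCompile(text) {
      return new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(text)));
    }

    // A constant shift count takes the new optimized path: no runtime count
    // masking, and an immediate-form shift where the chip has one.
    wasmCompile(`(module (func (param v128) (result v128)
      (i32x4.shl (local.get 0) (i32.const 5))))`);
    assertEq(wasmSimdAnalysis(), "shift -> constant shift");

    // A variable count still goes through the general path, which must mask
    // the count at runtime.
    wasmCompile(`(module (func (param v128) (param i32) (result v128)
      (i32x4.shl (local.get 0) (local.get 1))))`);
    assertEq(wasmSimdAnalysis(), "shift -> variable shift");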
js/src/builtin/TestingFunctions.cpp
js/src/jit-test/tests/wasm/simd/ad-hack.js
js/src/jit-test/tests/wasm/simd/ion-analysis.js
js/src/jit/MacroAssembler.h
js/src/jit/x64/BaseAssembler-x64.h
js/src/jit/x64/CodeGenerator-x64.cpp
js/src/jit/x64/LIR-x64.h
js/src/jit/x64/Lowering-x64.cpp
js/src/jit/x64/MacroAssembler-x64-inl.h
js/src/jit/x64/MacroAssembler-x64.cpp
js/src/jit/x64/MacroAssembler-x64.h
js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
js/src/jit/x86-shared/MacroAssembler-x86-shared.h
js/src/wasm/WasmIonCompile.cpp
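Background for the masking mentioned above: wasm SIMD shifts take their count modulo the lane width, so the lowering in Lowering-x64.cpp below can fold the mask into the constant (shiftCount &= 7/15/31/63) and emit no masking code at all. A minimal sketch of that semantics, assuming the shell's wasmEvalText in the style of ad-hack.js below:

    // Shifting i16x8 lanes by 18 must behave exactly like shifting by 2,
    // because the count is taken mod 16 (18 & 15 == 2).
    var ins = wasmEvalText(`
      (module
        (memory (export "mem") 1 1)
        (func (export "shl") (param $count i32)
          (v128.store (i32.const 0)
            (i16x8.shl (v128.load (i32.const 16)) (local.get $count)))))`);
    ins.exports.shl(2);   // stores the lanes shifted left by 2
    ins.exports.shl(18);  // count masked to 2; stores the same result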
--- a/js/src/builtin/TestingFunctions.cpp
+++ b/js/src/builtin/TestingFunctions.cpp
@@ -927,16 +927,17 @@ void ReportSimdAnalysis(const char* v) {
 static bool WasmSimdAnalysis(JSContext* cx, unsigned argc, Value* vp) {
   CallArgs args = CallArgsFromVp(argc, vp);
   JSString* result =
       JS_NewStringCopyZ(cx, *lastAnalysisResult ? lastAnalysisResult : "none");
   if (!result) {
     return false;
   }
   args.rval().setString(result);
+  *lastAnalysisResult = (char)0;
   return true;
 }
 #  endif
 #endif
 
 static bool ConvertToTier(JSContext* cx, HandleValue value,
                           const wasm::Code& code, wasm::Tier* tier) {
   RootedString option(cx, JS::ToString(cx, value));
--- a/js/src/jit-test/tests/wasm/simd/ad-hack.js
+++ b/js/src/jit-test/tests/wasm/simd/ad-hack.js
@@ -155,55 +155,65 @@ function unsigned_saturate(z, bits) {
     if (z > max) {
         return max;
     }
     return z;
 }
 
 function shl(count, width) {
     if (width == 64) {
+        count = BigInt(count);
         return (v) => {
-            // This is only right for small values
-            return BigInt(v << count);
+            v = BigInt(v);
+            if (v < 0)
+                v = (1n << 64n) + v;
+            let r = (v << count) & ((1n << 64n) - 1n);
+            if (r & (1n << 63n))
+                r = -((1n << 64n) - r);
+            return r;
         }
     } else {
         return (v) => {
             let mask = (width == 32) ? -1 : ((1 << width) - 1);
             return (v << count) & mask;
         }
     }
 }
 
 function shr(count, width) {
     return (v) => {
+        if (count == 0)
+            return v;
         if (width == 64) {
             if (v < 0) {
                 // This basically mirrors what the SIMD code does, so if there's
                 // a bug there then there's a bug here too.  Seems OK though.
                 let s = 0x1_0000_0000_0000_0000n + BigInt(v);
-                let t = s / BigInt(1 << count);
-                let u = BigInt((1 << count) - 1) * (2n ** BigInt(64-count));
+                let t = s / (1n << BigInt(count));
+                let u = ((1n << BigInt(count)) - 1n) * (2n ** BigInt(64-count));
                 let w = t + u;
                 return w - 0x1_0000_0000_0000_0000n;
             }
-            return BigInt(v) / BigInt(1 << count);
+            return BigInt(v) / (1n << BigInt(count));
         } else {
             let mask = (width == 32) ? -1 : ((1 << width) - 1);
-            return (sign_extend(v,8) >> count) & mask;
+            return (sign_extend(v, width) >> count) & mask;
         }
     }
 }
 
 function shru(count, width) {
     if (width == 64) {
         return (v) => {
+            if (count == 0)
+                return v;
             if (v < 0) {
                 v = 0x1_0000_0000_0000_0000n + BigInt(v);
             }
-            return BigInt(v) / BigInt(1 << count);
+            return BigInt(v) / (1n << BigInt(count));
         }
     } else {
         return (v) => {
             let mask = (width == 32) ? -1 : ((1 << width) - 1);
             return (v >>> count) & mask;
         }
     }
 }
@@ -811,106 +821,258 @@ for ( let dope of [1, 7, 32, 195 ] ) {
 }
 
 // Shifts
 //
 // lhs is v128 in memory
 // rhs is i32 (passed directly)
 // result is v128 in memory
 
+var constantI8Shifts = "";
+for ( let i=0 ; i < 10; i++ ) {
+    constantI8Shifts += `
+    (func (export "shl_i8x16_${i}")
+      (v128.store (i32.const 0) (i8x16.shl (v128.load (i32.const 16)) (i32.const ${i}))))
+    (func (export "shr_i8x16_${i}")
+      (v128.store (i32.const 0) (i8x16.shr_s (v128.load (i32.const 16)) (i32.const ${i}))))
+    (func (export "shr_u8x16_${i}")
+      (v128.store (i32.const 0) (i8x16.shr_u (v128.load (i32.const 16)) (i32.const ${i}))))`;
+}
+
 var ins = wasmEvalText(`
   (module
     (memory (export "mem") 1 1)
     (func (export "shl_i8x16") (param $count i32)
       (v128.store (i32.const 0) (i8x16.shl (v128.load (i32.const 16)) (local.get $count))))
     (func (export "shr_i8x16") (param $count i32)
       (v128.store (i32.const 0) (i8x16.shr_s (v128.load (i32.const 16)) (local.get $count))))
     (func (export "shr_u8x16") (param $count i32)
       (v128.store (i32.const 0) (i8x16.shr_u (v128.load (i32.const 16)) (local.get $count))))
+    ${constantI8Shifts}
     (func (export "shl_i16x8") (param $count i32)
       (v128.store (i32.const 0) (i16x8.shl (v128.load (i32.const 16)) (local.get $count))))
+    (func (export "shl_i16x8_3")
+      (v128.store (i32.const 0) (i16x8.shl (v128.load (i32.const 16)) (i32.const 3))))
+    (func (export "shl_i16x8_15")
+      (v128.store (i32.const 0) (i16x8.shl (v128.load (i32.const 16)) (i32.const 15))))
+    (func (export "shl_i16x8_16")
+      (v128.store (i32.const 0) (i16x8.shl (v128.load (i32.const 16)) (i32.const 16))))
     (func (export "shr_i16x8") (param $count i32)
       (v128.store (i32.const 0) (i16x8.shr_s (v128.load (i32.const 16)) (local.get $count))))
+    (func (export "shr_i16x8_3")
+      (v128.store (i32.const 0) (i16x8.shr_s (v128.load (i32.const 16)) (i32.const 3))))
+    (func (export "shr_i16x8_15")
+      (v128.store (i32.const 0) (i16x8.shr_s (v128.load (i32.const 16)) (i32.const 15))))
+    (func (export "shr_i16x8_16")
+      (v128.store (i32.const 0) (i16x8.shr_s (v128.load (i32.const 16)) (i32.const 16))))
     (func (export "shr_u16x8") (param $count i32)
       (v128.store (i32.const 0) (i16x8.shr_u (v128.load (i32.const 16)) (local.get $count))))
+    (func (export "shr_u16x8_3")
+      (v128.store (i32.const 0) (i16x8.shr_u (v128.load (i32.const 16)) (i32.const 3))))
+    (func (export "shr_u16x8_15")
+      (v128.store (i32.const 0) (i16x8.shr_u (v128.load (i32.const 16)) (i32.const 15))))
+    (func (export "shr_u16x8_16")
+      (v128.store (i32.const 0) (i16x8.shr_u (v128.load (i32.const 16)) (i32.const 16))))
     (func (export "shl_i32x4") (param $count i32)
       (v128.store (i32.const 0) (i32x4.shl (v128.load (i32.const 16)) (local.get $count))))
-    (func (export "shl_i64x2") (param $count i32)
-      (v128.store (i32.const 0) (i64x2.shl (v128.load (i32.const 16)) (local.get $count))))
+    (func (export "shl_i32x4_12")
+      (v128.store (i32.const 0) (i32x4.shl (v128.load (i32.const 16)) (i32.const 12))))
+    (func (export "shl_i32x4_31")
+      (v128.store (i32.const 0) (i32x4.shl (v128.load (i32.const 16)) (i32.const 31))))
+    (func (export "shl_i32x4_32")
+      (v128.store (i32.const 0) (i32x4.shl (v128.load (i32.const 16)) (i32.const 32))))
     (func (export "shr_i32x4") (param $count i32)
       (v128.store (i32.const 0) (i32x4.shr_s (v128.load (i32.const 16)) (local.get $count))))
+    (func (export "shr_i32x4_12")
+      (v128.store (i32.const 0) (i32x4.shr_s (v128.load (i32.const 16)) (i32.const 12))))
+    (func (export "shr_i32x4_31")
+      (v128.store (i32.const 0) (i32x4.shr_s (v128.load (i32.const 16)) (i32.const 31))))
+    (func (export "shr_i32x4_32")
+      (v128.store (i32.const 0) (i32x4.shr_s (v128.load (i32.const 16)) (i32.const 32))))
     (func (export "shr_u32x4") (param $count i32)
       (v128.store (i32.const 0) (i32x4.shr_u (v128.load (i32.const 16)) (local.get $count))))
+    (func (export "shr_u32x4_12")
+      (v128.store (i32.const 0) (i32x4.shr_u (v128.load (i32.const 16)) (i32.const 12))))
+    (func (export "shr_u32x4_31")
+      (v128.store (i32.const 0) (i32x4.shr_u (v128.load (i32.const 16)) (i32.const 31))))
+    (func (export "shr_u32x4_32")
+      (v128.store (i32.const 0) (i32x4.shr_u (v128.load (i32.const 16)) (i32.const 32))))
+    (func (export "shl_i64x2") (param $count i32)
+      (v128.store (i32.const 0) (i64x2.shl (v128.load (i32.const 16)) (local.get $count))))
+    (func (export "shl_i64x2_27")
+      (v128.store (i32.const 0) (i64x2.shl (v128.load (i32.const 16)) (i32.const 27))))
+    (func (export "shl_i64x2_63")
+      (v128.store (i32.const 0) (i64x2.shl (v128.load (i32.const 16)) (i32.const 63))))
+    (func (export "shl_i64x2_64")
+      (v128.store (i32.const 0) (i64x2.shl (v128.load (i32.const 16)) (i32.const 64))))
     (func (export "shr_i64x2") (param $count i32)
       (v128.store (i32.const 0) (i64x2.shr_s (v128.load (i32.const 16)) (local.get $count))))
+    (func (export "shr_i64x2_27")
+      (v128.store (i32.const 0) (i64x2.shr_s (v128.load (i32.const 16)) (i32.const 27))))
+    (func (export "shr_i64x2_45")
+      (v128.store (i32.const 0) (i64x2.shr_s (v128.load (i32.const 16)) (i32.const 45))))
+    (func (export "shr_i64x2_63")
+      (v128.store (i32.const 0) (i64x2.shr_s (v128.load (i32.const 16)) (i32.const 63))))
+    (func (export "shr_i64x2_64")
+      (v128.store (i32.const 0) (i64x2.shr_s (v128.load (i32.const 16)) (i32.const 64))))
     (func (export "shr_u64x2") (param $count i32)
-      (v128.store (i32.const 0) (i64x2.shr_u (v128.load (i32.const 16)) (local.get $count)))))`);
+      (v128.store (i32.const 0) (i64x2.shr_u (v128.load (i32.const 16)) (local.get $count))))
+    (func (export "shr_u64x2_27")
+      (v128.store (i32.const 0) (i64x2.shr_u (v128.load (i32.const 16)) (i32.const 27))))
+    (func (export "shr_u64x2_63")
+      (v128.store (i32.const 0) (i64x2.shr_u (v128.load (i32.const 16)) (i32.const 63))))
+    (func (export "shr_u64x2_64")
+      (v128.store (i32.const 0) (i64x2.shr_u (v128.load (i32.const 16)) (i32.const 64)))))`);
 
 var mem8 = new Uint8Array(ins.exports.mem.buffer);
 var as = [1, 2, 4, 8, 16, 32, 64, 128, 129, 130, 132, 136, 144, 160, 192, 255];
 
 set(mem8, 16, as);
 
 for (let [meth,op] of [["shl_i8x16",shl], ["shr_i8x16",shr], ["shr_u8x16",shru]]) {
     for ( let i=0 ; i < 8 ; i++ ) {
         ins.exports[meth](i);
         assertSame(get(mem8, 0, 16), as.map(op(i, 8)))
+        ins.exports[meth + "_" + i]();
+        assertSame(get(mem8, 0, 16), as.map(op(i, 8)))
     }
 
     ins.exports[meth](1);
-    var a = get(mem8, 0, 16);
+    let a = get(mem8, 0, 16);
     ins.exports[meth](9);
-    var b = get(mem8, 0, 16);
+    let b = get(mem8, 0, 16);
     assertSame(a, b);
+
+    ins.exports[meth + "_1"]();
+    let c = get(mem8, 0, 16);
+    ins.exports[meth + "_9"]();
+    let d = get(mem8, 0, 16);
+    assertSame(c, d);
 }
 
 var mem16 = new Uint16Array(ins.exports.mem.buffer);
-var as = [1, 2, 3, 4, 5, 6, 7, 8];
-
-set(mem16, 8, as)
-ins.exports.shl_i16x8(2);
-assertSame(get(mem16, 0, 8), as.map(shl(2, 16)))
-
+var as = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000];
 set(mem16, 8, as)
+
+ins.exports.shl_i16x8(2);
+var res = get(mem16, 0, 8);
+assertSame(res, as.map(shl(2, 16)))
+
+ins.exports.shl_i16x8(18);      // Masked count
+assertSame(get(mem16, 0, 8), res);
+
+for ( let shift of [3, 15, 16] ) {
+    ins.exports["shl_i16x8_" + shift]();
+    assertSame(get(mem16, 0, 8), as.map(shl(shift & 15, 16)))
+}
+
 ins.exports.shr_i16x8(1);
-assertSame(get(mem16, 0, 8), as.map(shr(1, 16)))
+var res = get(mem16, 0, 8);
+assertSame(res, as.map(shr(1, 16)))
 
-set(mem16, 8, as)
+ins.exports.shr_i16x8(17);      // Masked count
+assertSame(get(mem16, 0, 8), res);
+
+for ( let shift of [3, 15, 16] ) {
+    ins.exports["shr_i16x8_" + shift]();
+    assertSame(get(mem16, 0, 8), as.map(shr(shift & 15, 16)))
+}
+
 ins.exports.shr_u16x8(1);
-assertSame(get(mem16, 0, 8), as.map(shru(1, 16)))
+var res = get(mem16, 0, 8);
+assertSame(res, as.map(shru(1, 16)))
+
+ins.exports.shr_u16x8(17);      // Masked count
+assertSame(get(mem16, 0, 8), res);
+
+for ( let shift of [3, 15, 16] ) {
+    ins.exports["shr_u16x8_" + shift]();
+    assertSame(get(mem16, 0, 8), as.map(shru(shift & 15, 16)))
+}
 
 var mem32 = new Uint32Array(ins.exports.mem.buffer);
-var as = [5, 6, 7, 8];
+var as = [5152, 6768, 7074, 800811];
 
 set(mem32, 4, as)
 ins.exports.shl_i32x4(2);
-assertSame(get(mem32, 0, 4), as.map(shl(2, 32)))
+var res = get(mem32, 0, 4);
+assertSame(res, as.map(shl(2, 32)))
+
+ins.exports.shl_i32x4(34);      // Masked count
+assertSame(get(mem32, 0, 4), res);
 
-set(mem32, 4, as)
+for ( let shift of [12, 31, 32] ) {
+    ins.exports["shl_i32x4_" + shift]();
+    assertSame(get(mem32, 0, 4), as.map(shl(shift & 31, 32)).map(x => x>>>0))
+}
+
 ins.exports.shr_i32x4(1);
-assertSame(get(mem32, 0, 4), as.map(shr(1, 32)))
+var res = get(mem32, 0, 4);
+assertSame(res, as.map(shr(1, 32)))
+
+ins.exports.shr_i32x4(33);      // Masked count
+assertSame(get(mem32, 0, 4), res);
+
+for ( let shift of [12, 31, 32] ) {
+    ins.exports["shr_i32x4_" + shift]();
+    assertSame(get(mem32, 0, 4), as.map(shr(shift & 31, 32)))
+}
 
-set(mem32, 4, as)
 ins.exports.shr_u32x4(1);
-assertSame(get(mem32, 0, 4), as.map(shru(1, 32)))
+var res = get(mem32, 0, 4);
+assertSame(res, as.map(shru(1, 32)))
+
+ins.exports.shr_u32x4(33);      // Masked count
+assertSame(get(mem32, 0, 4), res);
+
+for ( let shift of [12, 31, 32] ) {
+    ins.exports["shr_u32x4_" + shift]();
+    assertSame(get(mem32, 0, 4), as.map(shru(shift & 31, 32)))
+}
 
 var mem64 = new BigInt64Array(ins.exports.mem.buffer);
-var as = [5, -6];
+var as = [50515253, -616263];
 
 set(mem64, 2, as)
 ins.exports.shl_i64x2(2);
-assertSame(get(mem64, 0, 2), as.map(shl(2, 64)))
+var res = get(mem64, 0, 2);
+assertSame(res, as.map(shl(2, 64)))
+
+ins.exports.shl_i64x2(66);      // Masked count
+assertSame(get(mem64, 0, 2), res);
 
-set(mem64, 2, as)
+for ( let shift of [27, 63, 64] ) {
+    ins.exports["shl_i64x2_" + shift]();
+    assertSame(get(mem64, 0, 2), as.map(shl(shift & 63, 64)))
+}
+
 ins.exports.shr_u64x2(1);
-assertSame(get(mem64, 0, 2), as.map(shru(1, 64)))
+var res = get(mem64, 0, 2);
+assertSame(res, as.map(shru(1, 64)))
+
+ins.exports.shr_u64x2(65);      // Masked count
+assertSame(get(mem64, 0, 2), res);
+
+for ( let shift of [27, 63, 64] ) {
+    ins.exports["shr_u64x2_" + shift]();
+    assertSame(get(mem64, 0, 2), as.map(shru(shift & 63, 64)))
+}
 
-set(mem64, 2, as)
 ins.exports.shr_i64x2(2);
-assertSame(get(mem64, 0, 2), as.map(shr(2, 64)))
+var res = get(mem64, 0, 2);
+assertSame(res, as.map(shr(2, 64)))
+
+ins.exports.shr_i64x2(66);      // Masked count
+assertSame(get(mem64, 0, 2), res);
+
+// The ion code generator has multiple paths here, for < 32 and >= 32
+for ( let shift of [27, 45, 63, 64] ) {
+    ins.exports["shr_i64x2_" + shift]();
+    assertSame(get(mem64, 0, 2), as.map(shr(shift & 63, 64)))
+}
 
 // Narrow
 
 var ins = wasmEvalText(`
   (module
     (memory (export "mem") 1 1)
     (func (export "narrow_i16x8_s")
       (v128.store (i32.const 0) (i8x16.narrow_i16x8_s (v128.load (i32.const 16)) (v128.load (i32.const 32)))))
--- a/js/src/jit-test/tests/wasm/simd/ion-analysis.js
+++ b/js/src/jit-test/tests/wasm/simd/ion-analysis.js
@@ -1,16 +1,17 @@
 // |jit-test| skip-if: wasmCompileMode() != "ion" || !this.wasmSimdAnalysis
 
 // White-box tests for SIMD optimizations.  These are sensitive to internal
 // details of the lowering logic, which is platform-dependent.
 //
 // In DEBUG builds, the testing function wasmSimdAnalysis() returns a string
 // describing the last decision made by the SIMD lowering code: to perform an
-// optimized lowering or the default byte shuffle+blend.
+// optimized lowering or the default byte shuffle+blend for v8x16.shuffle; to
+// shift by a constant or a variable for the various shifts; and so on.
 //
 // We test that the expected transformation applies, and that the machine code
 // generates the expected result.
 
 // 32-bit permutation that is not a rotation.
 let perm32x4_pattern = [4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3];
 
 // Operands the same, dword permutation
@@ -688,16 +689,29 @@ for ( let byte of [3, 11, 8, 2] ) {
     assertEq(wasmSimdAnalysis(), "shuffle -> permute 32x4");
 
     let mem = new Int8Array(ins.exports.mem.buffer);
     set(mem, 16, iota(16));
     ins.exports.run();
     assertSame(get(mem, 0, 16), rev64x2_pattern);
 }
 
+// In the case of shifts, we have separate tests that constant shifts work
+// correctly, so no such testing is done here.
+
+for ( let lanes of ['i8x16', 'i16x8', 'i32x4', 'i64x2'] ) {
+    for ( let shift of ['shl', 'shr_s', 'shr_u'] ) {
+        for ( let [count, result] of [['(i32.const 5)', 'shift -> constant shift'],
+                                      ['(local.get 1)', 'shift -> variable shift']] ) {
+            wasmCompile(`(module (func (param v128) (param i32) (result v128) (${lanes}.${shift} (local.get 0) ${count})))`);
+            assertEq(wasmSimdAnalysis(), result);
+        }
+    }
+}
+
 // Library
 
 function wasmCompile(text) {
     return new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(text)))
 }
 
 function get(arr, loc, len) {
     let res = [];
--- a/js/src/jit/MacroAssembler.h
+++ b/js/src/jit/MacroAssembler.h
@@ -2127,61 +2127,104 @@ class MacroAssembler : public MacroAssem
       DEFINED_ON(x86_shared);
 
   inline void absInt16x8(FloatRegister src, FloatRegister dest)
       DEFINED_ON(x86_shared);
 
   inline void absInt32x4(FloatRegister src, FloatRegister dest)
       DEFINED_ON(x86_shared);
 
-  // Left shift by scalar
+  // Left shift by scalar.  Immediates must have been masked; shifts of zero
+  // will work but may or may not generate code.
 
   inline void leftShiftInt8x16(Register rhs, FloatRegister lhsDest,
                                Register temp1, FloatRegister temp2)
       DEFINED_ON(x86_shared);
 
+  inline void leftShiftInt8x16(Imm32 count, FloatRegister src,
+                               FloatRegister dest) DEFINED_ON(x86_shared);
+
   inline void leftShiftInt16x8(Register rhs, FloatRegister lhsDest,
                                Register temp) DEFINED_ON(x86_shared);
 
+  inline void leftShiftInt16x8(Imm32 count, FloatRegister src,
+                               FloatRegister dest) DEFINED_ON(x86_shared);
+
   inline void leftShiftInt32x4(Register rhs, FloatRegister lhsDest,
                                Register temp) DEFINED_ON(x86_shared);
 
+  inline void leftShiftInt32x4(Imm32 count, FloatRegister src,
+                               FloatRegister dest) DEFINED_ON(x86_shared);
+
   inline void leftShiftInt64x2(Register rhs, FloatRegister lhsDest,
                                Register temp) DEFINED_ON(x86_shared);
 
-  // Right shift by scalar
+  inline void leftShiftInt64x2(Imm32 count, FloatRegister src,
+                               FloatRegister dest) DEFINED_ON(x86_shared);
+
+  // Right shift by scalar.  Immediates must have been masked; shifts of zero
+  // will work but may or may not generate code.
 
   inline void rightShiftInt8x16(Register rhs, FloatRegister lhsDest,
                                 Register temp1, FloatRegister temp2)
       DEFINED_ON(x86_shared);
 
+  inline void rightShiftInt8x16(Imm32 count, FloatRegister src,
+                                FloatRegister dest, FloatRegister temp)
+      DEFINED_ON(x86_shared);
+
   inline void unsignedRightShiftInt8x16(Register rhs, FloatRegister lhsDest,
                                         Register temp1, FloatRegister temp2)
       DEFINED_ON(x86_shared);
 
+  inline void unsignedRightShiftInt8x16(Imm32 count, FloatRegister src,
+                                        FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
   inline void rightShiftInt16x8(Register rhs, FloatRegister lhsDest,
                                 Register temp) DEFINED_ON(x86_shared);
 
+  inline void rightShiftInt16x8(Imm32 count, FloatRegister src,
+                                FloatRegister dest) DEFINED_ON(x86_shared);
+
   inline void unsignedRightShiftInt16x8(Register rhs, FloatRegister lhsDest,
                                         Register temp) DEFINED_ON(x86_shared);
 
+  inline void unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
+                                        FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
   inline void rightShiftInt32x4(Register rhs, FloatRegister lhsDest,
                                 Register temp) DEFINED_ON(x86_shared);
 
+  inline void rightShiftInt32x4(Imm32 count, FloatRegister src,
+                                FloatRegister dest) DEFINED_ON(x86_shared);
+
   inline void unsignedRightShiftInt32x4(Register rhs, FloatRegister lhsDest,
                                         Register temp) DEFINED_ON(x86_shared);
 
+  inline void unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
+                                        FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
   // `rhs` must be the CL register and it must have been masked so that its
   // value is <= 63.
   inline void rightShiftInt64x2(Register rhs, FloatRegister lhsDest)
       DEFINED_ON(x64);
 
+  inline void rightShiftInt64x2(Imm32 count, FloatRegister src,
+                                FloatRegister dest) DEFINED_ON(x64);
+
   inline void unsignedRightShiftInt64x2(Register rhs, FloatRegister lhsDest,
                                         Register temp) DEFINED_ON(x86_shared);
 
+  inline void unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
+                                        FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
   // Bitwise and, or, xor, not
 
   inline void bitwiseAndSimd128(FloatRegister rhs, FloatRegister lhsDest)
       DEFINED_ON(x86_shared);
 
   inline void bitwiseOrSimd128(FloatRegister rhs, FloatRegister lhsDest)
       DEFINED_ON(x86_shared);
 
--- a/js/src/jit/x64/BaseAssembler-x64.h
+++ b/js/src/jit/x64/BaseAssembler-x64.h
@@ -901,16 +901,21 @@ class BaseAssemblerX64 : public BaseAsse
                             dst);
   }
 
   MOZ_MUST_USE JmpSrc vmovdqa_ripr(XMMRegisterID dst) {
     return twoByteRipOpSimd("vmovdqa", VEX_PD, OP2_MOVDQ_VdqWdq, invalid_xmm,
                             dst);
   }
 
+  MOZ_MUST_USE JmpSrc vpand_ripr(XMMRegisterID dst) {
+    return twoByteRipOpSimd("vpand", VEX_PD, OP2_PANDDQ_VdqWdq, invalid_xmm,
+                            dst);
+  }
+
  private:
   MOZ_MUST_USE JmpSrc twoByteRipOpSimd(const char* name, VexOperandType ty,
                                        TwoByteOpcodeID opcode,
                                        XMMRegisterID src0, XMMRegisterID dst) {
     if (useLegacySSEEncoding(src0, dst)) {
       m_formatter.legacySSEPrefix(ty);
       m_formatter.twoByteRipOp(opcode, 0, dst);
       JmpSrc label(m_formatter.size());
--- a/js/src/jit/x64/CodeGenerator-x64.cpp
+++ b/js/src/jit/x64/CodeGenerator-x64.cpp
@@ -1094,16 +1094,85 @@ void CodeGenerator::visitWasmVariableShi
     default:
       MOZ_CRASH("Shift SimdOp not implemented");
   }
 #else
   MOZ_CRASH("No SIMD");
 #endif
 }
 
+void CodeGenerator::visitWasmConstantShiftSimd128(
+    LWasmConstantShiftSimd128* ins) {
+#ifdef ENABLE_WASM_SIMD
+  FloatRegister src = ToFloatRegister(ins->src());
+  FloatRegister dest = ToFloatRegister(ins->output());
+  int32_t shift = ins->shift();
+
+  if (shift == 0) {
+    if (src != dest) {
+      masm.moveSimd128(src, dest);
+    }
+    return;
+  }
+
+  FloatRegister temp;
+  switch (ins->simdOp()) {
+    case wasm::SimdOp::I8x16ShrS:
+      temp = ToFloatRegister(ins->temp());
+      break;
+    default:
+      MOZ_ASSERT(ins->temp()->isBogusTemp());
+      break;
+  }
+
+  switch (ins->simdOp()) {
+    case wasm::SimdOp::I8x16Shl:
+      masm.leftShiftInt8x16(Imm32(shift), src, dest);
+      break;
+    case wasm::SimdOp::I8x16ShrS:
+      masm.rightShiftInt8x16(Imm32(shift), src, dest, temp);
+      break;
+    case wasm::SimdOp::I8x16ShrU:
+      masm.unsignedRightShiftInt8x16(Imm32(shift), src, dest);
+      break;
+    case wasm::SimdOp::I16x8Shl:
+      masm.leftShiftInt16x8(Imm32(shift), src, dest);
+      break;
+    case wasm::SimdOp::I16x8ShrS:
+      masm.rightShiftInt16x8(Imm32(shift), src, dest);
+      break;
+    case wasm::SimdOp::I16x8ShrU:
+      masm.unsignedRightShiftInt16x8(Imm32(shift), src, dest);
+      break;
+    case wasm::SimdOp::I32x4Shl:
+      masm.leftShiftInt32x4(Imm32(shift), src, dest);
+      break;
+    case wasm::SimdOp::I32x4ShrS:
+      masm.rightShiftInt32x4(Imm32(shift), src, dest);
+      break;
+    case wasm::SimdOp::I32x4ShrU:
+      masm.unsignedRightShiftInt32x4(Imm32(shift), src, dest);
+      break;
+    case wasm::SimdOp::I64x2Shl:
+      masm.leftShiftInt64x2(Imm32(shift), src, dest);
+      break;
+    case wasm::SimdOp::I64x2ShrS:
+      masm.rightShiftInt64x2(Imm32(shift), src, dest);
+      break;
+    case wasm::SimdOp::I64x2ShrU:
+      masm.unsignedRightShiftInt64x2(Imm32(shift), src, dest);
+      break;
+    default:
+      MOZ_CRASH("Shift SimdOp not implemented");
+  }
+#else
+  MOZ_CRASH("No SIMD");
+#endif
+}
+
 void CodeGenerator::visitWasmShuffleSimd128(LWasmShuffleSimd128* ins) {
 #ifdef ENABLE_WASM_SIMD
   FloatRegister lhsDest = ToFloatRegister(ins->lhsDest());
   FloatRegister rhs = ToFloatRegister(ins->rhs());
   SimdConstant control = ins->control();
   switch (ins->op()) {
     case LWasmShuffleSimd128::BLEND_8x16: {
       masm.blendInt8x16(reinterpret_cast<const uint8_t*>(control.asInt8x16()),
--- a/js/src/jit/x64/LIR-x64.h
+++ b/js/src/jit/x64/LIR-x64.h
@@ -272,16 +272,38 @@ class LWasmVariableShiftSimd128 : public
     setTemp(1, temp1);
   }
 
   const LAllocation* lhsDest() { return getOperand(LhsDest); }
   const LAllocation* rhs() { return getOperand(Rhs); }
   wasm::SimdOp simdOp() const { return mir_->toWasmShiftSimd128()->simdOp(); }
 };
 
+// (v128, i32) -> v128 effect-free constant-width shift operations
+class LWasmConstantShiftSimd128 : public LInstructionHelper<1, 1, 1> {
+  int32_t shift_;
+
+ public:
+  LIR_HEADER(WasmConstantShiftSimd128)
+
+  static constexpr uint32_t Src = 0;
+
+  LWasmConstantShiftSimd128(const LAllocation& src, const LDefinition& temp,
+                            int32_t shift)
+      : LInstructionHelper(classOpcode), shift_(shift) {
+    setOperand(Src, src);
+    setTemp(0, temp);
+  }
+
+  const LAllocation* src() { return getOperand(Src); }
+  const LDefinition* temp() { return getTemp(0); }
+  int32_t shift() { return shift_; }
+  wasm::SimdOp simdOp() const { return mir_->toWasmShiftSimd128()->simdOp(); }
+};
+
 // (v128, v128, imm_simd) -> v128 effect-free operation.
 // temp is FPR (and always in use).
 class LWasmShuffleSimd128 : public LInstructionHelper<1, 2, 1> {
  public:
   // Shuffle operations.
   enum Op {
     // Blend bytes.  control_ has the blend mask as an I8x16: 0 to select from
     // the lhs, -1 to select from the rhs.
--- a/js/src/jit/x64/Lowering-x64.cpp
+++ b/js/src/jit/x64/Lowering-x64.cpp
@@ -498,16 +498,59 @@ void LIRGenerator::visitWasmBinarySimd12
 void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
   MDefinition* lhs = ins->lhs();
   MDefinition* rhs = ins->rhs();
 
   MOZ_ASSERT(lhs->type() == MIRType::Simd128);
   MOZ_ASSERT(rhs->type() == MIRType::Int32);
   MOZ_ASSERT(ins->type() == MIRType::Simd128);
 
+  if (rhs->isConstant()) {
+    LDefinition temp = LDefinition::BogusTemp();
+    int32_t shiftCount = rhs->toConstant()->toInt32();
+    switch (ins->simdOp()) {
+      case wasm::SimdOp::I8x16Shl:
+      case wasm::SimdOp::I8x16ShrU:
+        shiftCount &= 7;
+        break;
+      case wasm::SimdOp::I8x16ShrS:
+        shiftCount &= 7;
+        temp = tempSimd128();
+        break;
+      case wasm::SimdOp::I16x8Shl:
+      case wasm::SimdOp::I16x8ShrU:
+      case wasm::SimdOp::I16x8ShrS:
+        shiftCount &= 15;
+        break;
+      case wasm::SimdOp::I32x4Shl:
+      case wasm::SimdOp::I32x4ShrU:
+      case wasm::SimdOp::I32x4ShrS:
+        shiftCount &= 31;
+        break;
+      case wasm::SimdOp::I64x2Shl:
+      case wasm::SimdOp::I64x2ShrU:
+      case wasm::SimdOp::I64x2ShrS:
+        shiftCount &= 63;
+        break;
+      default:
+        MOZ_CRASH("Unexpected shift operation");
+    }
+#  ifdef DEBUG
+    js::wasm::ReportSimdAnalysis("shift -> constant shift");
+#  endif
+    auto* lir = new (alloc())
+        LWasmConstantShiftSimd128(useRegisterAtStart(lhs), temp, shiftCount);
+    defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
+    return;
+  }
+
+#  ifdef DEBUG
+  js::wasm::ReportSimdAnalysis("shift -> variable shift");
+#  endif
+
   LDefinition tempReg0 = LDefinition::BogusTemp();
   LDefinition tempReg1 = LDefinition::BogusTemp();
   switch (ins->simdOp()) {
     case wasm::SimdOp::I64x2ShrS:
       break;
     case wasm::SimdOp::I8x16Shl:
     case wasm::SimdOp::I8x16ShrS:
     case wasm::SimdOp::I8x16ShrU:
--- a/js/src/jit/x64/MacroAssembler-x64-inl.h
+++ b/js/src/jit/x64/MacroAssembler-x64-inl.h
@@ -793,16 +793,21 @@ void MacroAssembler::rightShiftInt64x2(R
   vpextrq(0, lhsDest, scratch);
   sarq_cl(scratch);
   vpinsrq(0, scratch, lhsDest, lhsDest);
   vpextrq(1, lhsDest, scratch);
   sarq_cl(scratch);
   vpinsrq(1, scratch, lhsDest, lhsDest);
 }
 
+void MacroAssembler::rightShiftInt64x2(Imm32 count, FloatRegister src,
+                                       FloatRegister dest) {
+  MacroAssemblerX64::rightShiftInt64x2(count, src, dest);
+}
+
 // Extract lane as scalar
 
 void MacroAssembler::extractLaneInt64x2(uint32_t lane, FloatRegister src,
                                         Register64 dest) {
   vpextrq(lane, src, dest.reg);
 }
 
 // Replace lane value
--- a/js/src/jit/x64/MacroAssembler-x64.cpp
+++ b/js/src/jit/x64/MacroAssembler-x64.cpp
@@ -71,16 +71,26 @@ void MacroAssemblerX64::loadConstantSimd
   SimdData* val = getSimdData(v);
   if (!val) {
     return;
   }
   JmpSrc j = masm.vmovaps_ripr(dest.encoding());
   propagateOOM(val->uses.append(CodeOffset(j.offset())));
 }
 
+void MacroAssemblerX64::vpandSimd128(const SimdConstant& v,
+                                     FloatRegister dest) {
+  SimdData* val = getSimdData(v);
+  if (!val) {
+    return;
+  }
+  JmpSrc j = masm.vpand_ripr(dest.encoding());
+  propagateOOM(val->uses.append(CodeOffset(j.offset())));
+}
+
 void MacroAssemblerX64::bindOffsets(
     const MacroAssemblerX86Shared::UsesVector& uses) {
   for (CodeOffset use : uses) {
     JmpDst dst(currentOffset());
     JmpSrc src(use.offset());
     // Using linkJump here is safe, as explaind in the comment in
     // loadConstantDouble.
     masm.linkJump(src, dst);
@@ -294,16 +304,45 @@ void MacroAssembler::subFromStackPtr(Imm
       amountLeft -= fullPages * 4096;
       if (amountLeft) {
         subq(Imm32(amountLeft), StackPointer);
       }
     }
   }
 }
 
+void MacroAssemblerX64::rightShiftInt64x2(Imm32 count, FloatRegister src,
+                                          FloatRegister dest) {
+  MOZ_ASSERT(count.value <= 63);
+
+  if (count.value < 32) {
+    ScratchSimd128Scope scratch(asMasm());
+    // Compute high dwords and mask low dwords
+    asMasm().moveSimd128(src, scratch);
+    vpsrad(count, scratch, scratch);
+    vpandSimd128(SimdConstant::SplatX2(int64_t(0xFFFFFFFF00000000LL)), scratch);
+    // Compute low dwords (high dwords at most have clear high bits where the
+    // result will have set low high bits)
+    if (src != dest) {
+      asMasm().moveSimd128(src, dest);
+    }
+    vpsrlq(count, dest, dest);
+    // Merge the parts
+    vpor(scratch, dest, dest);
+  } else {
+    ScratchRegisterScope scratch(asMasm());
+    vpextrq(0, src, scratch);
+    sarq(count, scratch);
+    vpinsrq(0, scratch, dest, dest);
+    vpextrq(1, src, scratch);
+    sarq(count, scratch);
+    vpinsrq(1, scratch, dest, dest);
+  }
+}
+
 //{{{ check_macroassembler_style
 // ===============================================================
 // ABI function calls.
 
 void MacroAssembler::setupUnalignedABICall(Register scratch) {
   setupABICall();
   dynamicAlignment_ = true;
 
--- a/js/src/jit/x64/MacroAssembler-x64.h
+++ b/js/src/jit/x64/MacroAssembler-x64.h
@@ -929,16 +929,18 @@ class MacroAssemblerX64 : public MacroAs
     convertInt32ToFloat32(operand.valueReg(), dest);
   }
 
   void loadConstantDouble(double d, FloatRegister dest);
   void loadConstantFloat32(float f, FloatRegister dest);
 
   void loadConstantSimd128Int(const SimdConstant& v, FloatRegister dest);
   void loadConstantSimd128Float(const SimdConstant& v, FloatRegister dest);
+  void vpandSimd128(const SimdConstant& v, FloatRegister dest);
+  void rightShiftInt64x2(Imm32 count, FloatRegister src, FloatRegister dest);
 
   void loadWasmGlobalPtr(uint32_t globalDataOffset, Register dest) {
     loadPtr(Address(WasmTlsReg,
                     offsetof(wasm::TlsData, globalArea) + globalDataOffset),
             dest);
   }
   void loadWasmPinnedRegsFromTls() {
     loadPtr(Address(WasmTlsReg, offsetof(wasm::TlsData, memoryBase)), HeapReg);
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
@@ -1624,32 +1624,84 @@ void MacroAssemblerX86Shared::packedShif
 void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
     FloatRegister in, Register count, Register temp, FloatRegister xtmp,
     FloatRegister dest) {
   packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                              &MacroAssemblerX86Shared::vpsllw,
                              &MacroAssemblerX86Shared::vpmovzxbw);
 }
 
+void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
+    Imm32 count, FloatRegister src, FloatRegister dest) {
+  MOZ_ASSERT(count.value <= 7);
+  if (src != dest) {
+    asMasm().moveSimd128(src, dest);
+  }
+  // Use the doubling trick for low shift counts, otherwise mask off the bits
+  // that are shifted out of the low byte of each word and use word shifts.  The
+  // optimal cutoff remains to be explored.
+  if (count.value <= 3) {
+    for (int32_t shift = count.value; shift > 0; --shift) {
+      asMasm().addInt8x16(dest, dest);
+    }
+  } else {
+    ScratchSimd128Scope scratch(asMasm());
+    // Whether SplatX8 or SplatX16 is best depends on the constant probably?
+    asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0xFF >> count.value),
+                                    scratch);
+    vpand(Operand(scratch), dest, dest);
+    vpsllw(count, dest, dest);
+  }
+}
+
 void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
     FloatRegister in, Register count, Register temp, FloatRegister xtmp,
     FloatRegister dest) {
   packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                              &MacroAssemblerX86Shared::vpsraw,
                              &MacroAssemblerX86Shared::vpmovsxbw);
 }
 
+void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
+    Imm32 count, FloatRegister src, FloatRegister temp, FloatRegister dest) {
+  MOZ_ASSERT(count.value <= 7);
+  ScratchSimd128Scope scratch(asMasm());
+
+  asMasm().moveSimd128(src, scratch);
+  vpslldq(Imm32(1), scratch, scratch);               // Low bytes -> high bytes
+  vpsraw(Imm32(count.value + 8), scratch, scratch);  // Shift low bytes
+  vpsraw(count, dest, dest);                         // Shift high bytes
+  asMasm().loadConstantSimd128Int(SimdConstant::SplatX8(0xFF00), temp);
+  bitwiseAndSimdInt(dest, Operand(temp), dest);        // Keep high bytes
+  bitwiseAndNotSimdInt(temp, Operand(scratch), temp);  // Keep low bytes
+  bitwiseOrSimdInt(dest, Operand(temp), dest);         // Combine
+}
+
 void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
     FloatRegister in, Register count, Register temp, FloatRegister xtmp,
     FloatRegister dest) {
   packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                              &MacroAssemblerX86Shared::vpsrlw,
                              &MacroAssemblerX86Shared::vpmovzxbw);
 }
 
+void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
+    Imm32 count, FloatRegister src, FloatRegister dest) {
+  MOZ_ASSERT(count.value <= 7);
+  if (src != dest) {
+    asMasm().moveSimd128(src, dest);
+  }
+  ScratchSimd128Scope scratch(asMasm());
+  // Whether SplatX8 or SplatX16 is best depends on the constant probably?
+  asMasm().loadConstantSimd128Int(
+      SimdConstant::SplatX16((0xFF << count.value) & 0xFF), scratch);
+  vpand(Operand(scratch), dest, dest);
+  vpsrlw(count, dest, dest);
+}
+
 void MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(
     FloatRegister in, Register count, Register temp, FloatRegister dest) {
   ScratchSimd128Scope scratch(asMasm());
   MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
   vpsllw(scratch, in, dest);
 }
 
 void MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
@@ -1594,83 +1594,164 @@ void MacroAssembler::absInt32x4(FloatReg
 // Left shift by scalar
 
 void MacroAssembler::leftShiftInt8x16(Register rhs, FloatRegister lhsDest,
                                       Register temp1, FloatRegister temp2) {
   MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(lhsDest, rhs, temp1,
                                                           temp2, lhsDest);
 }
 
+void MacroAssembler::leftShiftInt8x16(Imm32 count, FloatRegister src,
+                                      FloatRegister dest) {
+  MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(count, src, dest);
+}
+
 void MacroAssembler::leftShiftInt16x8(Register rhs, FloatRegister lhsDest,
                                       Register temp) {
   MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(lhsDest, rhs, temp,
                                                           lhsDest);
 }
 
+void MacroAssembler::leftShiftInt16x8(Imm32 count, FloatRegister src,
+                                      FloatRegister dest) {
+  if (src != dest) {
+    moveSimd128(src, dest);
+  }
+  vpsllw(count, src, dest);
+}
+
 void MacroAssembler::leftShiftInt32x4(Register rhs, FloatRegister lhsDest,
                                       Register temp) {
   MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(lhsDest, rhs, temp,
                                                           lhsDest);
 }
 
+void MacroAssembler::leftShiftInt32x4(Imm32 count, FloatRegister src,
+                                      FloatRegister dest) {
+  if (src != dest) {
+    moveSimd128(src, dest);
+  }
+  vpslld(count, src, dest);
+}
+
 void MacroAssembler::leftShiftInt64x2(Register rhs, FloatRegister lhsDest,
                                       Register temp) {
   MacroAssemblerX86Shared::packedLeftShiftByScalarInt64x2(lhsDest, rhs, temp,
                                                           lhsDest);
 }
 
+void MacroAssembler::leftShiftInt64x2(Imm32 count, FloatRegister src,
+                                      FloatRegister dest) {
+  if (src != dest) {
+    moveSimd128(src, dest);
+  }
+  vpsllq(count, src, dest);
+}
+
 // Right shift by scalar
 
 void MacroAssembler::rightShiftInt8x16(Register rhs, FloatRegister lhsDest,
                                        Register temp1, FloatRegister temp2) {
   MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(lhsDest, rhs, temp1,
                                                            temp2, lhsDest);
 }
 
+void MacroAssembler::rightShiftInt8x16(Imm32 count, FloatRegister src,
+                                       FloatRegister dest, FloatRegister temp) {
+  MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(count, src, temp,
+                                                           dest);
+}
+
 void MacroAssembler::unsignedRightShiftInt8x16(Register rhs,
                                                FloatRegister lhsDest,
                                                Register temp1,
                                                FloatRegister temp2) {
   MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
       lhsDest, rhs, temp1, temp2, lhsDest);
 }
 
+void MacroAssembler::unsignedRightShiftInt8x16(Imm32 count, FloatRegister src,
+                                               FloatRegister dest) {
+  MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(count, src,
+                                                                   dest);
+}
+
 void MacroAssembler::rightShiftInt16x8(Register rhs, FloatRegister lhsDest,
                                        Register temp) {
   MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(lhsDest, rhs, temp,
                                                            lhsDest);
 }
 
+void MacroAssembler::rightShiftInt16x8(Imm32 count, FloatRegister src,
+                                       FloatRegister dest) {
+  if (src != dest) {
+    moveSimd128(src, dest);
+  }
+  vpsraw(count, src, dest);
+}
+
 void MacroAssembler::unsignedRightShiftInt16x8(Register rhs,
                                                FloatRegister lhsDest,
                                                Register temp) {
   MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(
       lhsDest, rhs, temp, lhsDest);
 }
 
+void MacroAssembler::unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
+                                               FloatRegister dest) {
+  if (src != dest) {
+    moveSimd128(src, dest);
+  }
+  vpsrlw(count, src, dest);
+}
+
 void MacroAssembler::rightShiftInt32x4(Register rhs, FloatRegister lhsDest,
                                        Register temp) {
   MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(lhsDest, rhs, temp,
                                                            lhsDest);
 }
 
+void MacroAssembler::rightShiftInt32x4(Imm32 count, FloatRegister src,
+                                       FloatRegister dest) {
+  if (src != dest) {
+    moveSimd128(src, dest);
+  }
+  vpsrad(count, src, dest);
+}
+
 void MacroAssembler::unsignedRightShiftInt32x4(Register rhs,
                                                FloatRegister lhsDest,
                                                Register temp) {
   MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(
       lhsDest, rhs, temp, lhsDest);
 }
 
+void MacroAssembler::unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
+                                               FloatRegister dest) {
+  if (src != dest) {
+    moveSimd128(src, dest);
+  }
+  vpsrld(count, src, dest);
+}
+
 void MacroAssembler::unsignedRightShiftInt64x2(Register rhs,
                                                FloatRegister lhsDest,
                                                Register temp) {
   MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2(
       lhsDest, rhs, temp, lhsDest);
 }
 
+void MacroAssembler::unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
+                                               FloatRegister dest) {
+  if (src != dest) {
+    moveSimd128(src, dest);
+  }
+  vpsrlq(count, src, dest);
+}
+
 // Bitwise and, or, xor, not
 
 void MacroAssembler::bitwiseAndSimd128(FloatRegister rhs,
                                        FloatRegister lhsDest) {
   MacroAssemblerX86Shared::bitwiseAndSimdInt(lhsDest, Operand(rhs), lhsDest);
 }
 
 void MacroAssembler::bitwiseOrSimd128(FloatRegister rhs,
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
@@ -654,23 +654,29 @@ class MacroAssemblerX86Shared : public A
       void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister,
                                              FloatRegister),
       void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister));
 
  public:
   void packedLeftShiftByScalarInt8x16(FloatRegister in, Register count,
                                       Register temp, FloatRegister xtmp,
                                       FloatRegister dest);
+  void packedLeftShiftByScalarInt8x16(Imm32 count, FloatRegister src,
+                                      FloatRegister dest);
   void packedRightShiftByScalarInt8x16(FloatRegister in, Register count,
                                        Register temp, FloatRegister xtmp,
                                        FloatRegister dest);
+  void packedRightShiftByScalarInt8x16(Imm32 count, FloatRegister src,
+                                       FloatRegister temp, FloatRegister dest);
   void packedUnsignedRightShiftByScalarInt8x16(FloatRegister in, Register count,
                                                Register temp,
                                                FloatRegister xtmp,
                                                FloatRegister dest);
+  void packedUnsignedRightShiftByScalarInt8x16(Imm32 count, FloatRegister src,
+                                               FloatRegister dest);
 
   void packedLeftShiftByScalarInt16x8(FloatRegister in, Register count,
                                       Register temp, FloatRegister dest);
   void packedRightShiftByScalarInt16x8(FloatRegister in, Register count,
                                        Register temp, FloatRegister dest);
   void packedUnsignedRightShiftByScalarInt16x8(FloatRegister in, Register count,
                                                Register temp,
                                                FloatRegister dest);
--- a/js/src/wasm/WasmIonCompile.cpp
+++ b/js/src/wasm/WasmIonCompile.cpp
@@ -652,23 +652,28 @@ class FunctionCompiler {
     if (inDeadCode()) {
       return nullptr;
     }
 
     MOZ_ASSERT(lhs->type() == MIRType::Simd128 &&
                rhs->type() == MIRType::Int32);
 
     if (op == wasm::SimdOp::I64x2ShrS) {
-      // x86/x64 specific: The masm interface for this shift requires the client
-      // to mask the shift count.
-      MConstant* mask = MConstant::New(alloc(), Int32Value(63));
-      curBlock_->add(mask);
-      MBitAnd* maskedShift = MBitAnd::New(alloc(), rhs, mask, MIRType::Int32);
-      curBlock_->add(maskedShift);
-      rhs = maskedShift;
+      if (!rhs->isConstant()) {
+        // x86/x64 specific? The masm interface for this shift requires the
+        // client to mask variable shift counts.  It's OK if later optimizations
+        // transform a variable count to a constant count here. (And then
+        // optimizations should also be able to fold the mask, though this is
+        // not crucial.)
+        MConstant* mask = MConstant::New(alloc(), Int32Value(63));
+        curBlock_->add(mask);
+        MBitAnd* maskedShift = MBitAnd::New(alloc(), rhs, mask, MIRType::Int32);
+        curBlock_->add(maskedShift);
+        rhs = maskedShift;
+      }
     }
 
     auto* ins = MWasmShiftSimd128::New(alloc(), lhs, rhs, op);
     curBlock_->add(ins);
     return ins;
   }
 
   // (v128,scalar,imm) -> v128