Bug 1640669 - Part 2: Optimize bitmask for control. r=jseward
author: Lars T Hansen <lhansen@mozilla.com>
Fri, 31 Jul 2020 15:07:25 +0000
changeset: 542886 ce7b1451c1009774c89f46b808eec9e3590ac6d4
parent: 542885 9d343deee76d59ab5ddb3c367db7876fbf22b45b
child: 542887 684e8b6a41091c9b221bba91696182e558e56182
push id: 123095
push user: lhansen@mozilla.com
push date: Fri, 31 Jul 2020 15:12:33 +0000
treeherder: autoland@684e8b6a4109 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: jseward
bugs: 1640669
milestone: 81.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1640669 - Part 2: Optimize bitmask for control. r=jseward

Code generation for i16x8.bitmask is particularly poor, so optimize it when its result is used for control flow. In the case of i8x16.bitmask and i32x4.bitmask, the generated code is a single SIMD instruction followed by a test, and it's unlikely that something depending on a constant load will do better, so don't optimize these.

A subsequent patch will move the code from Lowering.cpp into platform code, now that it is clearly platform-dependent.

Differential Revision: https://phabricator.services.mozilla.com/D85260
js/src/jit-test/tests/wasm/simd/ion-analysis.js
js/src/jit/Lowering.cpp
js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
js/src/jit/x86-shared/Lowering-x86-shared.cpp
--- a/js/src/jit-test/tests/wasm/simd/ion-analysis.js
+++ b/js/src/jit-test/tests/wasm/simd/ion-analysis.js
@@ -726,49 +726,51 @@ for ( let [ty128, suffix] of [['i8x16', 
     }
 }
 
 for ( let ty128 of ['f32x4','f64x2','i64x2'] ) {
     wasmCompile(`(module (func (result ${ty128.match(/(...)x.*/)[1]}) (${ty128}.extract_lane 0 (v128.const i64x2 0 0))))`);
     assertEq(wasmSimdAnalysis(), "simd128-to-scalar -> constant folded");
 }
 
-// Optimizing all_true and any_true that are used for control flow, also when negated.
+// Optimizing all_true, any_true, and bitmask that are used for control flow, also when negated.
 
 for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
     let all = iota(16/size).map(n => n*n);
     let some = iota(16/size).map(n => n*(n % 3));
     let none = iota(16/size).map(n => 0);
     let inputs = [all, some, none];
-    let ops = { all_true: allTrue, any_true: anyTrue };
+    let ops = { all_true: allTrue, any_true: anyTrue, bitmask };
 
-    for ( let op of ['any_true', 'all_true'] ) {
+    for ( let op of ['any_true', 'all_true', 'bitmask'] ) {
+        let folded = op != 'bitmask' || size == 2;
+
         let positive =
             wasmCompile(
                 `(module
                    (memory (export "mem") 1 1)
                    (func $f (param v128) (result i32)
                        (if (result i32) (${ty128}.${op} (local.get 0))
                            (i32.const 42)
                            (i32.const 37)))
                    (func (export "run") (result i32)
                      (call $f (v128.load (i32.const 16)))))`);
-        assertEq(wasmSimdAnalysis(), "simd128-to-scalar-and-branch -> folded");
+        assertEq(wasmSimdAnalysis(), folded ? "simd128-to-scalar-and-branch -> folded" : "none");
 
         let negative =
             wasmCompile(
                 `(module
                    (memory (export "mem") 1 1)
                    (func $f (param v128) (result i32)
                        (if (result i32) (i32.eqz (${ty128}.${op} (local.get 0)))
                            (i32.const 42)
                            (i32.const 37)))
                    (func (export "run") (result i32)
                      (call $f (v128.load (i32.const 16)))))`);
-        assertEq(wasmSimdAnalysis(), "simd128-to-scalar-and-branch -> folded");
+        assertEq(wasmSimdAnalysis(), folded ? "simd128-to-scalar-and-branch -> folded" : "none");
 
         for ( let inp of inputs ) {
             let mem = new this[`Int${8*size}Array`](positive.exports.mem.buffer);
             set(mem, 16/size, inp);
             assertEq(positive.exports.run(), ops[op](inp) ? 42 : 37);
 
             mem = new this[`Int${8*size}Array`](negative.exports.mem.buffer);
             set(mem, 16/size, inp);
@@ -829,8 +831,16 @@ function i16ToI8(xs) {
 
 function allTrue(xs) {
     return xs.every(v => v != 0);
 }
 
 function anyTrue(xs) {
     return xs.some(v => v != 0);
 }
+
+function bitmask(xs) {
+    let shift = 128/xs.length - 1;
+    let res = 0;
+    let k = 0;
+    xs.forEach(v => { res |= ((v >>> shift) & 1) << k; k++; });
+    return res;
+}
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@@ -864,17 +864,18 @@ void LIRGenerator::visitTest(MTest* test
   if (opd->isWasmReduceSimd128() && opd->isEmittedAtUses()) {
     MWasmReduceSimd128* node = opd->toWasmReduceSimd128();
     switch (node->simdOp()) {
       case wasm::SimdOp::I8x16AnyTrue:
       case wasm::SimdOp::I16x8AnyTrue:
       case wasm::SimdOp::I32x4AnyTrue:
       case wasm::SimdOp::I8x16AllTrue:
       case wasm::SimdOp::I16x8AllTrue:
-      case wasm::SimdOp::I32x4AllTrue: {
+      case wasm::SimdOp::I32x4AllTrue:
+      case wasm::SimdOp::I16x8Bitmask: {
 #  ifdef DEBUG
         js::wasm::ReportSimdAnalysis("simd128-to-scalar-and-branch -> folded");
 #  endif
         auto* lir = new (alloc()) LWasmReduceAndBranchSimd128(
             useRegister(node->input()), node->simdOp(), ifTrue, ifFalse);
         add(lir, test);
         return;
       }
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -3216,16 +3216,23 @@ void CodeGenerator::visitWasmReduceAndBr
           break;
         default:
           MOZ_CRASH();
       }
       masm.vptest(tmp, tmp);
       emitBranch(Assembler::Equal, ins->ifTrue(), ins->ifFalse());
       break;
     }
+    case wasm::SimdOp::I16x8Bitmask: {
+      ScratchSimd128Scope tmp(masm);
+      masm.loadConstantSimd128Int(SimdConstant::SplatX8(0x8000), tmp);
+      masm.vptest(tmp, src);
+      emitBranch(Assembler::NotEqual, ins->ifTrue(), ins->ifFalse());
+      break;
+    }
     default:
       MOZ_CRASH("Reduce-and-branch SimdOp not implemented");
   }
 #else
   MOZ_CRASH("No SIMD");
 #endif
 }
 
--- a/js/src/jit/x86-shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@@ -1690,16 +1690,17 @@ static bool CanEmitWasmReduceSimd128AtUs
   }
   switch (ins->simdOp()) {
     case wasm::SimdOp::I8x16AnyTrue:
     case wasm::SimdOp::I16x8AnyTrue:
     case wasm::SimdOp::I32x4AnyTrue:
     case wasm::SimdOp::I8x16AllTrue:
     case wasm::SimdOp::I16x8AllTrue:
     case wasm::SimdOp::I32x4AllTrue:
+    case wasm::SimdOp::I16x8Bitmask:
       break;
     default:
       return false;
   }
   // If never used then defer (it will be removed).
   MUseIterator iter(ins->usesBegin());
   if (iter == ins->usesEnd()) {
     return true;