Bug 1640669 - Part 1: Optimize all_true and any_true for control. r=jseward
author Lars T Hansen <lhansen@mozilla.com>
Fri, 31 Jul 2020 15:07:12 +0000
changeset 542885 9d343deee76d59ab5ddb3c367db7876fbf22b45b
parent 542884 5b4a3edffdfd27c68e11af2748ac10b15c95c9e2
child 542886 ce7b1451c1009774c89f46b808eec9e3590ac6d4
push id 123095
push user lhansen@mozilla.com
push date Fri, 31 Jul 2020 15:12:33 +0000
treeherder autoland@684e8b6a4109 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jseward
bugs 1640669
milestone 81.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1640669 - Part 1: Optimize all_true and any_true for control. r=jseward

This removes the generation of a boolean result for all_true and any_true operations that will subsequently just be tested to perform control flow. It uses the standard Ion framework for this. The test case tests that the optimization is triggered (in this simple setting), and that it produces correct code. (The optimization also triggers on a couple of test cases in the imported spec test suite.)

Differential Revision: https://phabricator.services.mozilla.com/D85180
js/src/jit-test/tests/wasm/simd/ion-analysis.js
js/src/jit/Lowering.cpp
js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
js/src/jit/x86-shared/LIR-x86-shared.h
js/src/jit/x86-shared/Lowering-x86-shared.cpp
js/src/wasm/WasmJS.h
--- a/js/src/jit-test/tests/wasm/simd/ion-analysis.js
+++ b/js/src/jit-test/tests/wasm/simd/ion-analysis.js
@@ -726,16 +726,62 @@ for ( let [ty128, suffix] of [['i8x16', 
     }
 }
 
 for ( let ty128 of ['f32x4','f64x2','i64x2'] ) {
     wasmCompile(`(module (func (result ${ty128.match(/(...)x.*/)[1]}) (${ty128}.extract_lane 0 (v128.const i64x2 0 0))))`);
     assertEq(wasmSimdAnalysis(), "simd128-to-scalar -> constant folded");
 }
 
+// Optimizing all_true and any_true that are used for control flow, also when negated.
+// `all` uses n*n + 1 so no lane is zero (n*n is 0 for n == 0; 226 wraps to -30 in Int8).
+for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
+    let all = iota(16/size).map(n => n*n + 1);
+    let some = iota(16/size).map(n => n*(n % 3));
+    let none = iota(16/size).map(n => 0);
+    let inputs = [all, some, none];
+    let ops = { all_true: allTrue, any_true: anyTrue };
+
+    for ( let op of ['any_true', 'all_true'] ) {
+        let positive =
+            wasmCompile(
+                `(module
+                   (memory (export "mem") 1 1)
+                   (func $f (param v128) (result i32)
+                       (if (result i32) (${ty128}.${op} (local.get 0))
+                           (i32.const 42)
+                           (i32.const 37)))
+                   (func (export "run") (result i32)
+                     (call $f (v128.load (i32.const 16)))))`);
+        assertEq(wasmSimdAnalysis(), "simd128-to-scalar-and-branch -> folded");
+
+        let negative =
+            wasmCompile(
+                `(module
+                   (memory (export "mem") 1 1)
+                   (func $f (param v128) (result i32)
+                       (if (result i32) (i32.eqz (${ty128}.${op} (local.get 0)))
+                           (i32.const 42)
+                           (i32.const 37)))
+                   (func (export "run") (result i32)
+                     (call $f (v128.load (i32.const 16)))))`);
+        assertEq(wasmSimdAnalysis(), "simd128-to-scalar-and-branch -> folded");
+
+        for ( let inp of inputs ) {
+            let mem = new this[`Int${8*size}Array`](positive.exports.mem.buffer);
+            set(mem, 16/size, inp);
+            assertEq(positive.exports.run(), ops[op](inp) ? 42 : 37);
+
+            mem = new this[`Int${8*size}Array`](negative.exports.mem.buffer);
+            set(mem, 16/size, inp);
+            assertEq(negative.exports.run(), ops[op](inp) ? 37 : 42);
+        }
+    }
+}
+
 // Library
 
 function wasmCompile(text) {
     return new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(text)))
 }
 
 function get(arr, loc, len) {
     let res = [];
@@ -775,8 +821,16 @@ function interleave(xs, ys) {
 
 function i32ToI8(xs) {
     return xs.map(x => [x*4, x*4+1, x*4+2, x*4+3]).flat();
 }
 
 function i16ToI8(xs) {
     return xs.map(x => [x*2, x*2+1]).flat();
 }
+
+function allTrue(xs) {
+    return xs.every(v => v != 0);
+}
+
+function anyTrue(xs) {
+    return xs.some(v => v != 0);
+}
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@@ -852,16 +852,43 @@ void LIRGenerator::visitTest(MTest* test
     if (lhs->type() == MIRType::Int32 && rhs->type() == MIRType::Int32) {
       ReorderCommutative(&lhs, &rhs, test);
       lowerForBitAndAndBranch(new (alloc()) LBitAndAndBranch(ifTrue, ifFalse),
                               test, lhs, rhs);
       return;
     }
   }
 
+#ifdef ENABLE_WASM_SIMD
+  // Check if the operand for this test is an any_true/all_true SIMD operation.
+  // If it is, we want to emit an LWasmReduceAndBranchSimd128 node to avoid
+  // generating an intermediate boolean result.
+  if (opd->isWasmReduceSimd128() && opd->isEmittedAtUses()) {
+    MWasmReduceSimd128* node = opd->toWasmReduceSimd128();
+    switch (node->simdOp()) {
+      case wasm::SimdOp::I8x16AnyTrue:
+      case wasm::SimdOp::I16x8AnyTrue:
+      case wasm::SimdOp::I32x4AnyTrue:
+      case wasm::SimdOp::I8x16AllTrue:
+      case wasm::SimdOp::I16x8AllTrue:
+      case wasm::SimdOp::I32x4AllTrue: {
+#  ifdef DEBUG
+        js::wasm::ReportSimdAnalysis("simd128-to-scalar-and-branch -> folded");
+#  endif
+        auto* lir = new (alloc()) LWasmReduceAndBranchSimd128(
+            useRegister(node->input()), node->simdOp(), ifTrue, ifFalse);
+        add(lir, test);
+        return;
+      }
+      default:
+        break;
+    }
+  }
+#endif
+
   if (opd->isIsObject() && opd->isEmittedAtUses()) {
     MDefinition* input = opd->toIsObject()->input();
     MOZ_ASSERT(input->type() == MIRType::Value);
 
     LIsObjectAndBranch* lir =
         new (alloc()) LIsObjectAndBranch(ifTrue, ifFalse, useBoxAtStart(input));
     add(lir, test);
     return;
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -3179,16 +3179,61 @@ void CodeGenerator::visitWasmReduceSimd1
     default:
       MOZ_CRASH("Reduce SimdOp not implemented");
   }
 #else
   MOZ_CRASH("No SIMD");
 #endif
 }
 
+void CodeGenerator::visitWasmReduceAndBranchSimd128(
+    LWasmReduceAndBranchSimd128* ins) {
+#ifdef ENABLE_WASM_SIMD
+  FloatRegister src = ToFloatRegister(ins->src());
+
+  switch (ins->simdOp()) {
+    case wasm::SimdOp::I8x16AnyTrue:
+    case wasm::SimdOp::I16x8AnyTrue:
+    case wasm::SimdOp::I32x4AnyTrue:
+      // Set the zero flag if all of the lanes are zero, and branch on that.
+      masm.vptest(src, src);
+      emitBranch(Assembler::NotEqual, ins->ifTrue(), ins->ifFalse());
+      break;
+    case wasm::SimdOp::I8x16AllTrue:
+    case wasm::SimdOp::I16x8AllTrue:
+    case wasm::SimdOp::I32x4AllTrue: {
+      // Compare all lanes to zero, set the zero flag if none of the lanes are
+      // zero, and branch on that.
+      ScratchSimd128Scope tmp(masm);
+      masm.vpxor(tmp, tmp, tmp);
+      switch (ins->simdOp()) {
+        case wasm::SimdOp::I8x16AllTrue:
+          masm.vpcmpeqb(Operand(src), tmp, tmp);
+          break;
+        case wasm::SimdOp::I16x8AllTrue:
+          masm.vpcmpeqw(Operand(src), tmp, tmp);
+          break;
+        case wasm::SimdOp::I32x4AllTrue:
+          masm.vpcmpeqd(Operand(src), tmp, tmp);
+          break;
+        default:
+          MOZ_CRASH();
+      }
+      masm.vptest(tmp, tmp);
+      emitBranch(Assembler::Equal, ins->ifTrue(), ins->ifFalse());
+      break;
+    }
+    default:
+      MOZ_CRASH("Reduce-and-branch SimdOp not implemented");
+  }
+#else
+  MOZ_CRASH("No SIMD");
+#endif
+}
+
 void CodeGenerator::visitWasmReduceSimd128ToInt64(
     LWasmReduceSimd128ToInt64* ins) {
 #ifdef ENABLE_WASM_SIMD
   FloatRegister src = ToFloatRegister(ins->src());
   Register64 dest = ToOutRegister64(ins);
   uint32_t imm = ins->imm();
 
   switch (ins->simdOp()) {
--- a/js/src/jit/x86-shared/LIR-x86-shared.h
+++ b/js/src/jit/x86-shared/LIR-x86-shared.h
@@ -650,16 +650,41 @@ class LWasmReduceSimd128 : public LInstr
     setOperand(Src, src);
   }
 
   const LAllocation* src() { return getOperand(Src); }
   uint32_t imm() const { return mir_->toWasmReduceSimd128()->imm(); }
   wasm::SimdOp simdOp() const { return mir_->toWasmReduceSimd128()->simdOp(); }
 };
 
+// (v128, onTrue, onFalse) test-and-branch operations.
+class LWasmReduceAndBranchSimd128 : public LControlInstructionHelper<2, 1, 0> {
+  wasm::SimdOp op_;
+
+ public:
+  LIR_HEADER(WasmReduceAndBranchSimd128)
+
+  static constexpr uint32_t Src = 0;
+  static constexpr uint32_t IfTrue = 0;
+  static constexpr uint32_t IfFalse = 1;
+
+  LWasmReduceAndBranchSimd128(const LAllocation& src, wasm::SimdOp op,
+                              MBasicBlock* ifTrue, MBasicBlock* ifFalse)
+      : LControlInstructionHelper(classOpcode), op_(op) {
+    setOperand(Src, src);
+    setSuccessor(IfTrue, ifTrue);
+    setSuccessor(IfFalse, ifFalse);
+  }
+
+  const LAllocation* src() { return getOperand(Src); }
+  wasm::SimdOp simdOp() const { return op_; }
+  MBasicBlock* ifTrue() const { return getSuccessor(IfTrue); }
+  MBasicBlock* ifFalse() const { return getSuccessor(IfFalse); }
+};
+
 // (v128, imm) -> i64 effect-free operations
 class LWasmReduceSimd128ToInt64
     : public LInstructionHelper<INT64_PIECES, 1, 0> {
  public:
   LIR_HEADER(WasmReduceSimd128ToInt64)
 
   static constexpr uint32_t Src = 0;
 
--- a/js/src/jit/x86-shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@@ -1675,17 +1675,55 @@ void LIRGenerator::visitWasmUnarySimd128
       break;
   }
 
   LWasmUnarySimd128* lir =
       new (alloc()) LWasmUnarySimd128(useRegister(ins->input()), tempReg);
   define(lir, ins);
 }
 
+static bool CanEmitWasmReduceSimd128AtUses(MWasmReduceSimd128* ins) {
+  if (!ins->canEmitAtUses()) {
+    return false;
+  }
+  // Only specific ops generating int32.
+  if (ins->type() != MIRType::Int32) {
+    return false;
+  }
+  switch (ins->simdOp()) {
+    case wasm::SimdOp::I8x16AnyTrue:
+    case wasm::SimdOp::I16x8AnyTrue:
+    case wasm::SimdOp::I32x4AnyTrue:
+    case wasm::SimdOp::I8x16AllTrue:
+    case wasm::SimdOp::I16x8AllTrue:
+    case wasm::SimdOp::I32x4AllTrue:
+      break;
+    default:
+      return false;
+  }
+  // If never used then defer (it will be removed).
+  MUseIterator iter(ins->usesBegin());
+  if (iter == ins->usesEnd()) {
+    return true;
+  }
+  // We require an MTest consumer.
+  MNode* node = iter->consumer();
+  if (!node->isDefinition() || !node->toDefinition()->isTest()) {
+    return false;
+  }
+  // Defer only if there's only one use.
+  iter++;
+  return iter == ins->usesEnd();
+}
+
 void LIRGenerator::visitWasmReduceSimd128(MWasmReduceSimd128* ins) {
+  if (CanEmitWasmReduceSimd128AtUses(ins)) {
+    emitAtUses(ins);
+    return;
+  }
   if (ins->type() == MIRType::Int64) {
     auto* lir =
         new (alloc()) LWasmReduceSimd128ToInt64(useRegister(ins->input()));
     defineInt64(lir, ins);
   } else {
     auto* lir = new (alloc()) LWasmReduceSimd128(useRegister(ins->input()));
     define(lir, ins);
   }
--- a/js/src/wasm/WasmJS.h
+++ b/js/src/wasm/WasmJS.h
@@ -113,17 +113,17 @@ bool GcTypesAvailable(JSContext* cx);
 bool MultiValuesAvailable(JSContext* cx);
 
 // Shared memory and atomics.
 bool ThreadsAvailable(JSContext* cx);
 
 // SIMD data and operations.
 bool SimdAvailable(JSContext* cx);
 
-#if defined(ENABLE_WASM_SIMD)
+#if defined(ENABLE_WASM_SIMD) && defined(DEBUG)
 // Report the result of a Simd simplification to the testing infrastructure.
 void ReportSimdAnalysis(const char* data);
 #endif
 
 // Compiles the given binary wasm module given the ArrayBufferObject
 // and links the module's imports with the given import object.
 
 MOZ_MUST_USE bool Eval(JSContext* cx, Handle<TypedArrayObject*> code,