Bug 1597790 - Use SIMD in inlined memory.{fill,copy} ops. r=rhunt
author: Yury Delendik <ydelendik@mozilla.com>
Wed, 11 Aug 2021 16:44:17 +0000
changeset 588590 e8b9b45734faec691994a891d482eea710e2c02b
parent 588589 7e892c6e4241bc8eac6ede7638da0fdf16601a2a
child 588591 8363b7c72148a18927edbb24d9e7e147694b4659
push id: 147895
push user: ydelendik@mozilla.com
push date: Wed, 11 Aug 2021 16:46:41 +0000
treeherder: autoland@e8b9b45734fa [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: rhunt
bugs: 1597790
milestone: 93.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1597790 - Use SIMD in inlined memory.{fill,copy} ops. r=rhunt

Enable memory.fill and memory.copy to use masm SIMD operations when ENABLE_WASM_SIMD is on.

Differential Revision: https://phabricator.services.mozilla.com/D121789
js/src/wasm/WasmBaselineCompile.cpp
js/src/wasm/WasmIonCompile.cpp
js/src/wasm/WasmValue.h
--- a/js/src/wasm/WasmBaselineCompile.cpp
+++ b/js/src/wasm/WasmBaselineCompile.cpp
@@ -13008,32 +13008,54 @@ bool BaseCompiler::emitMemCopyInline() {
   uint32_t length = signedLength;
   MOZ_ASSERT(length != 0 && length <= MaxInlineMemoryCopyLength);
 
   RegI32 src = popI32();
   RegI32 dest = popI32();
 
   // Compute the number of copies of each width we will need to do
   size_t remainder = length;
+#ifdef ENABLE_WASM_SIMD
+  size_t numCopies16 = remainder / sizeof(V128);
+  remainder %= sizeof(V128);
+#endif
 #ifdef JS_64BIT
   size_t numCopies8 = remainder / sizeof(uint64_t);
   remainder %= sizeof(uint64_t);
 #endif
   size_t numCopies4 = remainder / sizeof(uint32_t);
   remainder %= sizeof(uint32_t);
   size_t numCopies2 = remainder / sizeof(uint16_t);
   remainder %= sizeof(uint16_t);
   size_t numCopies1 = remainder;
 
   // Load all source bytes onto the value stack from low to high using the
   // widest transfer width we can for the system. We will trap without writing
   // anything if any source byte is out-of-bounds.
   bool omitBoundsCheck = false;
   size_t offset = 0;
 
+#ifdef ENABLE_WASM_SIMD
+  for (uint32_t i = 0; i < numCopies16; i++) {
+    RegI32 temp = needI32();
+    moveI32(src, temp);
+    pushI32(temp);
+
+    MemoryAccessDesc access(Scalar::Simd128, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!loadCommon(&access, check, ValType::V128)) {
+      return false;
+    }
+
+    offset += sizeof(V128);
+    omitBoundsCheck = true;
+  }
+#endif
+
 #ifdef JS_64BIT
   for (uint32_t i = 0; i < numCopies8; i++) {
     RegI32 temp = needI32();
     moveI32(src, temp);
     pushI32(temp);
 
     MemoryAccessDesc access(Scalar::Int64, 1, offset, bytecodeOffset());
     AccessCheck check;
@@ -13170,16 +13192,37 @@ bool BaseCompiler::emitMemCopyInline() {
     if (!storeCommon(&access, check, ValType::I64)) {
       return false;
     }
 
     omitBoundsCheck = true;
   }
 #endif
 
+#ifdef ENABLE_WASM_SIMD
+  for (uint32_t i = 0; i < numCopies16; i++) {
+    offset -= sizeof(V128);
+
+    RegV128 value = popV128();
+    RegI32 temp = needI32();
+    moveI32(dest, temp);
+    pushI32(temp);
+    pushV128(value);
+
+    MemoryAccessDesc access(Scalar::Simd128, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!storeCommon(&access, check, ValType::V128)) {
+      return false;
+    }
+
+    omitBoundsCheck = true;
+  }
+#endif
+
   freeI32(dest);
   freeI32(src);
   return true;
 }
 
 bool BaseCompiler::emitTableCopy() {
   uint32_t lineOrBytecode = readCallSiteLineOrBytecode();
 
@@ -13246,29 +13289,36 @@ bool BaseCompiler::emitMemFillInline() {
   uint32_t length = uint32_t(signedLength);
   uint32_t value = uint32_t(signedValue);
   MOZ_ASSERT(length != 0 && length <= MaxInlineMemoryFillLength);
 
   RegI32 dest = popI32();
 
   // Compute the number of copies of each width we will need to do
   size_t remainder = length;
+#ifdef ENABLE_WASM_SIMD
+  size_t numCopies16 = remainder / sizeof(V128);
+  remainder %= sizeof(V128);
+#endif
 #ifdef JS_64BIT
   size_t numCopies8 = remainder / sizeof(uint64_t);
   remainder %= sizeof(uint64_t);
 #endif
   size_t numCopies4 = remainder / sizeof(uint32_t);
   remainder %= sizeof(uint32_t);
   size_t numCopies2 = remainder / sizeof(uint16_t);
   remainder %= sizeof(uint16_t);
   size_t numCopies1 = remainder;
 
   MOZ_ASSERT(numCopies2 <= 1 && numCopies1 <= 1);
 
   // Generate splatted definitions for wider fills as needed
+#ifdef ENABLE_WASM_SIMD
+  V128 val16(value);
+#endif
 #ifdef JS_64BIT
   uint64_t val8 = SplatByteToUInt<uint64_t>(value, 8);
 #endif
   uint32_t val4 = SplatByteToUInt<uint32_t>(value, 4);
   uint32_t val2 = SplatByteToUInt<uint32_t>(value, 2);
   uint32_t val1 = value;
 
   // Store the fill value to the destination from high to low. We will trap
@@ -13345,16 +13395,36 @@ bool BaseCompiler::emitMemFillInline() {
     if (!storeCommon(&access, check, ValType::I64)) {
       return false;
     }
 
     omitBoundsCheck = true;
   }
 #endif
 
+#ifdef ENABLE_WASM_SIMD
+  for (uint32_t i = 0; i < numCopies16; i++) {
+    offset -= sizeof(V128);
+
+    RegI32 temp = needI32();
+    moveI32(dest, temp);
+    pushI32(temp);
+    pushV128(val16);
+
+    MemoryAccessDesc access(Scalar::Simd128, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!storeCommon(&access, check, ValType::V128)) {
+      return false;
+    }
+
+    omitBoundsCheck = true;
+  }
+#endif
+
   freeI32(dest);
   return true;
 }
 
 bool BaseCompiler::emitMemInit() {
   return emitInstanceCallOp<uint32_t>(
       SASigMemInit32, [this](uint32_t* segIndex) -> bool {
         uint32_t dstTableIndex;
--- a/js/src/wasm/WasmIonCompile.cpp
+++ b/js/src/wasm/WasmIonCompile.cpp
@@ -3631,32 +3631,48 @@ static bool EmitMemCopyInline(FunctionCo
   MOZ_ASSERT(MaxInlineMemoryCopyLength != 0);
 
   MOZ_ASSERT(len->isConstant() && len->type() == MIRType::Int32);
   uint32_t length = len->toConstant()->toInt32();
   MOZ_ASSERT(length != 0 && length <= MaxInlineMemoryCopyLength);
 
   // Compute the number of copies of each width we will need to do
   size_t remainder = length;
+#ifdef ENABLE_WASM_SIMD
+  size_t numCopies16 = remainder / sizeof(V128);
+  remainder %= sizeof(V128);
+#endif
 #ifdef JS_64BIT
   size_t numCopies8 = remainder / sizeof(uint64_t);
   remainder %= sizeof(uint64_t);
 #endif
   size_t numCopies4 = remainder / sizeof(uint32_t);
   remainder %= sizeof(uint32_t);
   size_t numCopies2 = remainder / sizeof(uint16_t);
   remainder %= sizeof(uint16_t);
   size_t numCopies1 = remainder;
 
   // Load all source bytes from low to high using the widest transfer width we
   // can for the system. We will trap without writing anything if any source
   // byte is out-of-bounds.
   size_t offset = 0;
   DefVector loadedValues;
 
+#ifdef ENABLE_WASM_SIMD
+  for (uint32_t i = 0; i < numCopies16; i++) {
+    MemoryAccessDesc access(Scalar::Simd128, 1, offset, f.bytecodeOffset());
+    auto* load = f.load(src, &access, ValType::V128);
+    if (!load || !loadedValues.append(load)) {
+      return false;
+    }
+
+    offset += sizeof(V128);
+  }
+#endif
+
 #ifdef JS_64BIT
   for (uint32_t i = 0; i < numCopies8; i++) {
     MemoryAccessDesc access(Scalar::Int64, 1, offset, f.bytecodeOffset());
     auto* load = f.load(src, &access, ValType::I64);
     if (!load || !loadedValues.append(load)) {
       return false;
     }
 
@@ -3726,16 +3742,26 @@ static bool EmitMemCopyInline(FunctionCo
     offset -= sizeof(uint64_t);
 
     MemoryAccessDesc access(Scalar::Int64, 1, offset, f.bytecodeOffset());
     auto* value = loadedValues.popCopy();
     f.store(dst, &access, value);
   }
 #endif
 
+#ifdef ENABLE_WASM_SIMD
+  for (uint32_t i = 0; i < numCopies16; i++) {
+    offset -= sizeof(V128);
+
+    MemoryAccessDesc access(Scalar::Simd128, 1, offset, f.bytecodeOffset());
+    auto* value = loadedValues.popCopy();
+    f.store(dst, &access, value);
+  }
+#endif
+
   return true;
 }
 
 static bool EmitMemCopy(FunctionCompiler& f) {
   MDefinition *dst, *src, *len;
   uint32_t dstMemIndex;
   uint32_t srcMemIndex;
   if (!f.iter().readMemOrTableCopy(true, &dstMemIndex, &dst, &srcMemIndex, &src,
@@ -3878,27 +3904,34 @@ static bool EmitMemFillInline(FunctionCo
              val->isConstant() && val->type() == MIRType::Int32);
 
   uint32_t length = len->toConstant()->toInt32();
   uint32_t value = val->toConstant()->toInt32();
   MOZ_ASSERT(length != 0 && length <= MaxInlineMemoryFillLength);
 
   // Compute the number of copies of each width we will need to do
   size_t remainder = length;
+#ifdef ENABLE_WASM_SIMD
+  size_t numCopies16 = remainder / sizeof(V128);
+  remainder %= sizeof(V128);
+#endif
 #ifdef JS_64BIT
   size_t numCopies8 = remainder / sizeof(uint64_t);
   remainder %= sizeof(uint64_t);
 #endif
   size_t numCopies4 = remainder / sizeof(uint32_t);
   remainder %= sizeof(uint32_t);
   size_t numCopies2 = remainder / sizeof(uint16_t);
   remainder %= sizeof(uint16_t);
   size_t numCopies1 = remainder;
 
   // Generate splatted definitions for wider fills as needed
+#ifdef ENABLE_WASM_SIMD
+  MDefinition* val16 = numCopies16 ? f.constant(V128(value)) : nullptr;
+#endif
 #ifdef JS_64BIT
   MDefinition* val8 =
       numCopies8 ? f.constant(int64_t(SplatByteToUInt<uint64_t>(value, 8)))
                  : nullptr;
 #endif
   MDefinition* val4 =
       numCopies4 ? f.constant(Int32Value(SplatByteToUInt<uint32_t>(value, 4)),
                               MIRType::Int32)
@@ -3938,16 +3971,25 @@ static bool EmitMemFillInline(FunctionCo
   for (uint32_t i = 0; i < numCopies8; i++) {
     offset -= sizeof(uint64_t);
 
     MemoryAccessDesc access(Scalar::Int64, 1, offset, f.bytecodeOffset());
     f.store(start, &access, val8);
   }
 #endif
 
+#ifdef ENABLE_WASM_SIMD
+  for (uint32_t i = 0; i < numCopies16; i++) {
+    offset -= sizeof(V128);
+
+    MemoryAccessDesc access(Scalar::Simd128, 1, offset, f.bytecodeOffset());
+    f.store(start, &access, val16);
+  }
+#endif
+
   return true;
 }
 
 static bool EmitMemFill(FunctionCompiler& f) {
   MDefinition *start, *val, *len;
   if (!f.iter().readMemFill(&start, &val, &len)) {
     return false;
   }
--- a/js/src/wasm/WasmValue.h
+++ b/js/src/wasm/WasmValue.h
@@ -29,16 +29,20 @@ namespace wasm {
 
 // A V128 value.
 
 struct V128 {
   uint8_t bytes[16];  // Little-endian
 
   V128() { memset(bytes, 0, sizeof(bytes)); }
 
+  explicit V128(uint8_t splatValue) {
+    memset(bytes, int(splatValue), sizeof(bytes));
+  }
+
   template <typename T>
   T extractLane(unsigned lane) const {
     T result;
     MOZ_ASSERT(lane < 16 / sizeof(T));
     memcpy(&result, bytes + sizeof(T) * lane, sizeof(T));
     return result;
   }