Bug 1594204 - Generate inline code for memory.copy and memory.fill. r=lth
☠ backed out by f6aa348ba29a ☠
author: Ryan Hunt <rhunt@eqrion.net>
date: Thu, 14 Nov 2019 18:56:56 +0000
changeset 502042 74cc3a413cb0f2da50eb95dff6b6656ce4edabfc
parent 502041 6295568b6ea47d638c4346effd4731d37366547b
child 502043 0ee40940310074484845f8d2849896f0fe17cf83
push id: 114172
push user: dluca@mozilla.com
push date: Tue, 19 Nov 2019 11:31:10 +0000
treeherder: mozilla-inbound@b5c5ba07d3db
reviewers: lth
bugs: 1594204
milestone: 72.0a1
Bug 1594204 - Generate inline code for memory.copy and memory.fill. r=lth

This commit adds an inline code path for memory.copy/fill for Ion and Baseline on all platforms. To keep things simple, I reused the plain wasm load/store codegen with integer types up to 64 bits. A future commit can add SIMD support as needed.

A copy with a constant length is reduced to a series of loads (from low to high) onto the value stack (for Baseline) or onto a stack of definitions (for Ion). Then a series of stores is emitted (from high to low) from the value stack or the temporary definition stack.

A fill with a constant length and value is reduced to a series of stores (from high to low). The stores use the widest transfer width possible, and the value is splatted as appropriate to fill the whole integer.

This optimization is limited to lengths smaller than the guard page so that only a single bounds check is needed for src/dest. The threshold is per platform and derived from the wasm-bulk-bench microbenchmark. I attempted to pick the length just before the inline path began to slow exponentially; this was roughly constant at 8 loads/stores for both 64-bit and 32-bit transfers.

Differential Revision: https://phabricator.services.mozilla.com/D52129
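To make the description above concrete, here is a minimal standalone C++ sketch (illustrative only, not part of the patch; the length 13 and fill byte 0xAB are arbitrary examples) of the width decomposition that both compilers use below (the numCopies8/4/2/1 split) and the byte splatting done by SplatByteToUInt:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t length = 13;  // hypothetical constant memory.copy/fill length

      // Decompose the length into the widest transfers available (64-bit on
      // 64-bit targets); at most one 16-bit and one 8-bit transfer remain.
      size_t remainder = length;
      size_t numCopies8 = remainder / sizeof(uint64_t);
      remainder %= sizeof(uint64_t);
      size_t numCopies4 = remainder / sizeof(uint32_t);
      remainder %= sizeof(uint32_t);
      size_t numCopies2 = remainder / sizeof(uint16_t);
      remainder %= sizeof(uint16_t);
      size_t numCopies1 = remainder;

      // Splat a fill byte across a 64-bit value, as SplatByteToUInt<uint64_t>
      // does: each OR-of-shift iteration extends the splat by one more byte.
      uint64_t splat = 0xAB;
      for (int x = 8; x > 1; x--) {
        splat |= splat << 8;
      }

      printf("len=%u -> %zu x8, %zu x4, %zu x2, %zu x1; splat=%016llx\n",
             length, numCopies8, numCopies4, numCopies2, numCopies1,
             (unsigned long long)splat);
      // Prints: len=13 -> 1 x8, 1 x4, 0 x2, 1 x1; splat=abababababababab
      return 0;
    }

The real patch then emits one load/store (or one store for fill) per counted transfer, low to high for loads and high to low for stores, so that an out-of-bounds source or destination traps before any byte is written.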
js/src/jit/IonTypes.h
js/src/jit/arm/Assembler-arm.h
js/src/jit/arm/MacroAssembler-arm.cpp
js/src/jit/arm64/Assembler-arm64.h
js/src/jit/mips-shared/Assembler-mips-shared.h
js/src/jit/none/MacroAssembler-none.h
js/src/jit/x86-shared/Assembler-x86-shared.h
js/src/wasm/WasmBaselineCompile.cpp
js/src/wasm/WasmIonCompile.cpp
js/src/wasm/WasmTypes.h
--- a/js/src/jit/IonTypes.h
+++ b/js/src/jit/IonTypes.h
@@ -924,12 +924,21 @@ static const uint32_t MAX_UNCHECKED_LEAF
 
 // Truncating conversion modifiers.
 typedef uint32_t TruncFlags;
 static const TruncFlags TRUNC_UNSIGNED = TruncFlags(1) << 0;
 static const TruncFlags TRUNC_SATURATING = TruncFlags(1) << 1;
 
 enum BranchDirection { FALSE_BRANCH, TRUE_BRANCH };
 
+template <typename T>
+constexpr T SplatByteToUInt(uint8_t val, uint8_t x) {
+  T splatted = val;
+  for (; x > 1; x--) {
+    splatted |= splatted << 8;
+  }
+  return splatted;
+}
+
 }  // namespace jit
 }  // namespace js
 
 #endif /* jit_IonTypes_h */
--- a/js/src/jit/arm/Assembler-arm.h
+++ b/js/src/jit/arm/Assembler-arm.h
@@ -1665,16 +1665,17 @@ class Assembler : public AssemblerShared
     for (auto& j : jumps_) {
       MOZ_ASSERT(j.kind() == RelocationKind::HARDCODED);
     }
 #endif
   }
 
   static bool SupportsFloatingPoint() { return HasVFP(); }
   static bool SupportsUnalignedAccesses() { return HasARMv7(); }
+  static bool SupportsFastUnalignedAccesses() { return false; }
   static bool SupportsSimd() { return js::jit::SupportsSimd; }
 
   static bool HasRoundInstruction(RoundingMode mode) { return false; }
 
  protected:
   void addPendingJump(BufferOffset src, ImmPtr target, RelocationKind kind) {
     enoughMemory_ &= jumps_.append(RelativePatch(target.value, kind));
     if (kind == RelocationKind::JITCODE) {
--- a/js/src/jit/arm/MacroAssembler-arm.cpp
+++ b/js/src/jit/arm/MacroAssembler-arm.cpp
@@ -6044,16 +6044,17 @@ void MacroAssemblerARM::wasmStoreImpl(co
 }
 
 void MacroAssemblerARM::wasmUnalignedLoadImpl(
     const wasm::MemoryAccessDesc& access, Register memoryBase, Register ptr,
     Register ptrScratch, AnyRegister outAny, Register64 out64, Register tmp,
     Register tmp2, Register tmp3) {
   MOZ_ASSERT(ptr == ptrScratch);
   MOZ_ASSERT(tmp != ptr);
+  MOZ_ASSERT(!Assembler::SupportsFastUnalignedAccesses());
 
   uint32_t offset = access.offset();
   MOZ_ASSERT(offset < wasm::MaxOffsetGuardLimit);
 
   if (offset) {
     ScratchRegisterScope scratch(asMasm());
     ma_add(Imm32(offset), ptr, scratch);
   }
--- a/js/src/jit/arm64/Assembler-arm64.h
+++ b/js/src/jit/arm64/Assembler-arm64.h
@@ -284,16 +284,17 @@ class Assembler : public vixl::Assembler
   void setPrinter(Sprinter* sp) {
 #ifdef JS_DISASM_ARM64
     spew_.setPrinter(sp);
 #endif
   }
 
   static bool SupportsFloatingPoint() { return true; }
   static bool SupportsUnalignedAccesses() { return true; }
+  static bool SupportsFastUnalignedAccesses() { return true; }
   static bool SupportsSimd() { return js::jit::SupportsSimd; }
 
   static bool HasRoundInstruction(RoundingMode mode) { return false; }
 
   // Tracks a jump that is patchable after finalization.
   void addJumpRelocation(BufferOffset src, RelocationKind reloc);
 
  protected:
--- a/js/src/jit/mips-shared/Assembler-mips-shared.h
+++ b/js/src/jit/mips-shared/Assembler-mips-shared.h
@@ -1231,16 +1231,17 @@ class AssemblerMIPSShared : public Assem
 #if (defined(__mips_hard_float) && !defined(__mips_single_float)) || \
     defined(JS_SIMULATOR_MIPS32) || defined(JS_SIMULATOR_MIPS64)
     return true;
 #else
     return false;
 #endif
   }
   static bool SupportsUnalignedAccesses() { return true; }
+  static bool SupportsFastUnalignedAccesses() { return false; }
   static bool SupportsSimd() { return js::jit::SupportsSimd; }
 
   static bool HasRoundInstruction(RoundingMode mode) { return false; }
 
  protected:
   InstImm invertBranch(InstImm branch, BOffImm16 skipOffset);
   void addPendingJump(BufferOffset src, ImmPtr target, RelocationKind kind) {
     enoughMemory_ &= jumps_.append(RelativePatch(src, target.value, kind));
--- a/js/src/jit/none/MacroAssembler-none.h
+++ b/js/src/jit/none/MacroAssembler-none.h
@@ -226,16 +226,17 @@ class MacroAssemblerNone : public Assemb
   }
   static void TraceDataRelocations(JSTracer*, JitCode*, CompactBufferReader&) {
     MOZ_CRASH();
   }
 
   static bool SupportsFloatingPoint() { return false; }
   static bool SupportsSimd() { return false; }
   static bool SupportsUnalignedAccesses() { return false; }
+  static bool SupportsFastUnalignedAccesses() { return false; }
 
   void executableCopy(void*, bool = true) { MOZ_CRASH(); }
   void copyJumpRelocationTable(uint8_t*) { MOZ_CRASH(); }
   void copyDataRelocationTable(uint8_t*) { MOZ_CRASH(); }
   void copyPreBarrierTable(uint8_t*) { MOZ_CRASH(); }
   void processCodeLabels(uint8_t*) { MOZ_CRASH(); }
 
   void flushBuffer() { MOZ_CRASH(); }
--- a/js/src/jit/x86-shared/Assembler-x86-shared.h
+++ b/js/src/jit/x86-shared/Assembler-x86-shared.h
@@ -1076,16 +1076,17 @@ class AssemblerX86Shared : public Assemb
   static bool HasSSE41() { return CPUInfo::IsSSE41Present(); }
   static bool HasSSE42() { return CPUInfo::IsSSE42Present(); }
   static bool HasPOPCNT() { return CPUInfo::IsPOPCNTPresent(); }
   static bool HasBMI1() { return CPUInfo::IsBMI1Present(); }
   static bool HasBMI2() { return CPUInfo::IsBMI2Present(); }
   static bool HasLZCNT() { return CPUInfo::IsLZCNTPresent(); }
   static bool SupportsFloatingPoint() { return CPUInfo::IsSSE2Present(); }
   static bool SupportsUnalignedAccesses() { return true; }
+  static bool SupportsFastUnalignedAccesses() { return true; }
   static bool SupportsSimd() { return CPUInfo::IsSSE2Present(); }
   static bool HasAVX() { return CPUInfo::IsAVXPresent(); }
 
   static bool HasRoundInstruction(RoundingMode mode) {
     switch (mode) {
       case RoundingMode::Up:
       case RoundingMode::Down:
       case RoundingMode::NearestTiesToEven:
--- a/js/src/wasm/WasmBaselineCompile.cpp
+++ b/js/src/wasm/WasmBaselineCompile.cpp
@@ -3923,16 +3923,28 @@ class BaseCompiler final : public BaseCo
     Stk& v = stk_.back();
     if (v.kind() != Stk::ConstI64) {
       return false;
     }
     *c = v.i64val();
     return true;
   }
 
+  MOZ_MUST_USE bool peek2xI32(int32_t* c0, int32_t* c1) {
+    MOZ_ASSERT(stk_.length() >= 2);
+    const Stk& v0 = *(stk_.end() - 1);
+    const Stk& v1 = *(stk_.end() - 2);
+    if (v0.kind() != Stk::ConstI32 || v1.kind() != Stk::ConstI32) {
+      return false;
+    }
+    *c0 = v0.i32val();
+    *c1 = v1.i32val();
+    return true;
+  }
+
   MOZ_MUST_USE bool popConstPositivePowerOfTwoI32(int32_t* c,
                                                   uint_fast8_t* power,
                                                   int32_t cutoff) {
     Stk& v = stk_.back();
     if (v.kind() != Stk::ConstI32) {
       return false;
     }
     *c = v.i32val();
@@ -7111,19 +7123,21 @@ class BaseCompiler final : public BaseCo
   MOZ_MUST_USE bool emitSetLocal();
   MOZ_MUST_USE bool emitTeeLocal();
   MOZ_MUST_USE bool emitGetGlobal();
   MOZ_MUST_USE bool emitSetGlobal();
   MOZ_MUST_USE RegI32 maybeLoadTlsForAccess(const AccessCheck& check);
   MOZ_MUST_USE RegI32 maybeLoadTlsForAccess(const AccessCheck& check,
                                             RegI32 specific);
   MOZ_MUST_USE bool emitLoad(ValType type, Scalar::Type viewType);
-  MOZ_MUST_USE bool loadCommon(MemoryAccessDesc* access, ValType type);
+  MOZ_MUST_USE bool loadCommon(MemoryAccessDesc* access, AccessCheck check,
+                               ValType type);
   MOZ_MUST_USE bool emitStore(ValType resultType, Scalar::Type viewType);
-  MOZ_MUST_USE bool storeCommon(MemoryAccessDesc* access, ValType resultType);
+  MOZ_MUST_USE bool storeCommon(MemoryAccessDesc* access, AccessCheck check,
+                                ValType resultType);
   MOZ_MUST_USE bool emitSelect(bool typed);
 
   template <bool isSetLocal>
   MOZ_MUST_USE bool emitSetOrTeeLocal(uint32_t slot);
 
   void endBlock(ResultType type);
   void endIfThen(ResultType type);
   void endIfThenElse(ResultType type);
@@ -7263,19 +7277,23 @@ class BaseCompiler final : public BaseCo
   MOZ_MUST_USE bool emitAtomicStore(ValType type, Scalar::Type viewType);
   MOZ_MUST_USE bool emitWait(ValType type, uint32_t byteSize);
   MOZ_MUST_USE bool emitWake();
   MOZ_MUST_USE bool emitFence();
   MOZ_MUST_USE bool emitAtomicXchg(ValType type, Scalar::Type viewType);
   void emitAtomicXchg64(MemoryAccessDesc* access, WantResult wantResult);
   MOZ_MUST_USE bool bulkmemOpsEnabled();
   MOZ_MUST_USE bool emitMemCopy();
+  MOZ_MUST_USE bool emitMemCopyCall(uint32_t lineOrBytecode);
+  MOZ_MUST_USE bool emitMemCopyInline();
   MOZ_MUST_USE bool emitTableCopy();
   MOZ_MUST_USE bool emitDataOrElemDrop(bool isData);
   MOZ_MUST_USE bool emitMemFill();
+  MOZ_MUST_USE bool emitMemFillCall(uint32_t lineOrBytecode);
+  MOZ_MUST_USE bool emitMemFillInline();
   MOZ_MUST_USE bool emitMemOrTableInit(bool isMem);
 #ifdef ENABLE_WASM_REFTYPES
   MOZ_MUST_USE bool emitTableFill();
   MOZ_MUST_USE bool emitTableGet();
   MOZ_MUST_USE bool emitTableGrow();
   MOZ_MUST_USE bool emitTableSet();
   MOZ_MUST_USE bool emitTableSize();
 #endif
@@ -9814,19 +9832,18 @@ RegI32 BaseCompiler::maybeLoadTlsForAcce
                                            RegI32 specific) {
   if (needTlsForAccess(check)) {
     masm.loadWasmTlsRegFromFrame(specific);
     return specific;
   }
   return RegI32::Invalid();
 }
 
-bool BaseCompiler::loadCommon(MemoryAccessDesc* access, ValType type) {
-  AccessCheck check;
-
+bool BaseCompiler::loadCommon(MemoryAccessDesc* access, AccessCheck check,
+                              ValType type) {
   RegI32 tls, temp1, temp2, temp3;
   needLoadTemps(*access, &temp1, &temp2, &temp3);
 
   switch (type.code()) {
     case ValType::I32: {
       RegI32 rp = popMemoryAccess(access, &check);
 #ifdef JS_CODEGEN_ARM
       RegI32 rv = IsUnaligned(*access) ? needI32() : rp;
@@ -9903,22 +9920,21 @@ bool BaseCompiler::emitLoad(ValType type
     return false;
   }
 
   if (deadCode_) {
     return true;
   }
 
   MemoryAccessDesc access(viewType, addr.align, addr.offset, bytecodeOffset());
-  return loadCommon(&access, type);
-}
-
-bool BaseCompiler::storeCommon(MemoryAccessDesc* access, ValType resultType) {
-  AccessCheck check;
-
+  return loadCommon(&access, AccessCheck(), type);
+}
+
+bool BaseCompiler::storeCommon(MemoryAccessDesc* access, AccessCheck check,
+                               ValType resultType) {
   RegI32 tls;
   RegI32 temp = needStoreTemp(*access, resultType);
 
   switch (resultType.code()) {
     case ValType::I32: {
       RegI32 rv = popI32();
       RegI32 rp = popMemoryAccess(access, &check);
       tls = maybeLoadTlsForAccess(check);
@@ -9981,17 +9997,17 @@ bool BaseCompiler::emitStore(ValType res
     return false;
   }
 
   if (deadCode_) {
     return true;
   }
 
   MemoryAccessDesc access(viewType, addr.align, addr.offset, bytecodeOffset());
-  return storeCommon(&access, resultType);
+  return storeCommon(&access, AccessCheck(), resultType);
 }
 
 bool BaseCompiler::emitSelect(bool typed) {
   StackType type;
   Nothing unused_trueValue;
   Nothing unused_falseValue;
   Nothing unused_condition;
   if (!iter_.readSelect(typed, &type, &unused_trueValue, &unused_falseValue,
@@ -10395,17 +10411,17 @@ bool BaseCompiler::emitAtomicLoad(ValTyp
   if (deadCode_) {
     return true;
   }
 
   MemoryAccessDesc access(viewType, addr.align, addr.offset, bytecodeOffset(),
                           Synchronization::Load());
 
   if (Scalar::byteSize(viewType) <= sizeof(void*)) {
-    return loadCommon(&access, type);
+    return loadCommon(&access, AccessCheck(), type);
   }
 
   MOZ_ASSERT(type == ValType::I64 && Scalar::byteSize(viewType) == 8);
 
 #if defined(JS_64BIT)
   MOZ_CRASH("Should not happen");
 #else
   PopAtomicLoad64Regs regs(this);
@@ -10512,17 +10528,17 @@ bool BaseCompiler::emitAtomicStore(ValTy
   if (deadCode_) {
     return true;
   }
 
   MemoryAccessDesc access(viewType, addr.align, addr.offset, bytecodeOffset(),
                           Synchronization::Store());
 
   if (Scalar::byteSize(viewType) <= sizeof(void*)) {
-    return storeCommon(&access, type);
+    return storeCommon(&access, AccessCheck(), type);
   }
 
   MOZ_ASSERT(type == ValType::I64 && Scalar::byteSize(viewType) == 8);
 
 #ifdef JS_64BIT
   MOZ_CRASH("Should not happen");
 #else
   emitAtomicXchg64(&access, WantResult(false));
@@ -10680,26 +10696,223 @@ bool BaseCompiler::emitMemCopy() {
                                 &srcMemOrTableIndex, &nothing, &nothing)) {
     return false;
   }
 
   if (deadCode_) {
     return true;
   }
 
+  int32_t signedLength;
+  if (Assembler::SupportsFastUnalignedAccesses() &&
+      peekConstI32(&signedLength) &&
+      uint32_t(signedLength) <= MaxInlineMemoryCopyLength) {
+    return emitMemCopyInline();
+  }
+
+  return emitMemCopyCall(lineOrBytecode);
+}
+
+bool BaseCompiler::emitMemCopyCall(uint32_t lineOrBytecode) {
   pushHeapBase();
   if (!emitInstanceCall(lineOrBytecode,
                         usesSharedMemory() ? SASigMemCopyShared : SASigMemCopy,
                         /*pushReturnedValue=*/false)) {
     return false;
   }
 
   return true;
 }
 
+bool BaseCompiler::emitMemCopyInline() {
+  MOZ_ASSERT(MaxInlineMemoryCopyLength != 0);
+
+  int32_t signedLength;
+  MOZ_ALWAYS_TRUE(popConstI32(&signedLength));
+  uint32_t length = signedLength;
+
+  RegI32 src = popI32();
+  RegI32 dest = popI32();
+
+  // A zero length copy is a no-op and cannot trap
+  if (length == 0) {
+    freeI32(src);
+    freeI32(dest);
+    return true;
+  }
+
+  // Compute the number of copies of each width we will need to do
+  size_t remainder = length;
+#ifdef JS_64BIT
+  size_t numCopies8 = remainder / sizeof(uint64_t);
+  remainder %= sizeof(uint64_t);
+#endif
+  size_t numCopies4 = remainder / sizeof(uint32_t);
+  remainder %= sizeof(uint32_t);
+  size_t numCopies2 = remainder / sizeof(uint16_t);
+  remainder %= sizeof(uint16_t);
+  size_t numCopies1 = remainder;
+
+  // Load all source bytes onto the value stack from low to high using the
+  // widest transfer width we can for the system. We will trap without writing
+  // anything if any source byte is out-of-bounds.
+  bool omitBoundsCheck = false;
+  size_t offset = 0;
+
+#ifdef JS_64BIT
+  for (uint32_t i = 0; i < numCopies8; i++) {
+    RegI32 temp = needI32();
+    moveI32(src, temp);
+    pushI32(temp);
+
+    MemoryAccessDesc access(Scalar::Int64, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!loadCommon(&access, check, ValType::I64)) {
+      return false;
+    }
+
+    offset += sizeof(uint64_t);
+    omitBoundsCheck = true;
+  }
+#endif
+
+  for (uint32_t i = 0; i < numCopies4; i++) {
+    RegI32 temp = needI32();
+    moveI32(src, temp);
+    pushI32(temp);
+
+    MemoryAccessDesc access(Scalar::Uint32, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!loadCommon(&access, check, ValType::I32)) {
+      return false;
+    }
+
+    offset += sizeof(uint32_t);
+    omitBoundsCheck = true;
+  }
+
+  if (numCopies2) {
+    RegI32 temp = needI32();
+    moveI32(src, temp);
+    pushI32(temp);
+
+    MemoryAccessDesc access(Scalar::Uint16, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!loadCommon(&access, check, ValType::I32)) {
+      return false;
+    }
+
+    offset += sizeof(uint16_t);
+    omitBoundsCheck = true;
+  }
+
+  if (numCopies1) {
+    RegI32 temp = needI32();
+    moveI32(src, temp);
+    pushI32(temp);
+
+    MemoryAccessDesc access(Scalar::Uint8, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!loadCommon(&access, check, ValType::I32)) {
+      return false;
+    }
+  }
+
+  // Store all source bytes from the value stack to the destination from
+  // high to low. We will trap without writing anything on the first store
+  // if any dest byte is out-of-bounds.
+  offset = length;
+  omitBoundsCheck = false;
+
+  if (numCopies1) {
+    offset -= sizeof(uint8_t);
+
+    RegI32 value = popI32();
+    RegI32 temp = needI32();
+    moveI32(dest, temp);
+    pushI32(temp);
+    pushI32(value);
+
+    MemoryAccessDesc access(Scalar::Uint8, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    if (!storeCommon(&access, check, ValType::I32)) {
+      return false;
+    }
+
+    omitBoundsCheck = true;
+  }
+
+  if (numCopies2) {
+    offset -= sizeof(uint16_t);
+
+    RegI32 value = popI32();
+    RegI32 temp = needI32();
+    moveI32(dest, temp);
+    pushI32(temp);
+    pushI32(value);
+
+    MemoryAccessDesc access(Scalar::Uint16, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!storeCommon(&access, check, ValType::I32)) {
+      return false;
+    }
+
+    omitBoundsCheck = true;
+  }
+
+  for (uint32_t i = 0; i < numCopies4; i++) {
+    offset -= sizeof(uint32_t);
+
+    RegI32 value = popI32();
+    RegI32 temp = needI32();
+    moveI32(dest, temp);
+    pushI32(temp);
+    pushI32(value);
+
+    MemoryAccessDesc access(Scalar::Uint32, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!storeCommon(&access, check, ValType::I32)) {
+      return false;
+    }
+
+    omitBoundsCheck = true;
+  }
+
+#ifdef JS_64BIT
+  for (uint32_t i = 0; i < numCopies8; i++) {
+    offset -= sizeof(uint64_t);
+
+    RegI64 value = popI64();
+    RegI32 temp = needI32();
+    moveI32(dest, temp);
+    pushI32(temp);
+    pushI64(value);
+
+    MemoryAccessDesc access(Scalar::Int64, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!storeCommon(&access, check, ValType::I64)) {
+      return false;
+    }
+
+    omitBoundsCheck = true;
+  }
+#endif
+
+  freeI32(dest);
+  freeI32(src);
+  return true;
+}
+
 bool BaseCompiler::emitTableCopy() {
   if (!bulkmemOpsEnabled()) {
     return false;
   }
 
   uint32_t lineOrBytecode = readCallSiteLineOrBytecode();
 
   uint32_t dstMemOrTableIndex = 0;
@@ -10759,22 +10972,156 @@ bool BaseCompiler::emitMemFill() {
   if (!iter_.readMemFill(&nothing, &nothing, &nothing)) {
     return false;
   }
 
   if (deadCode_) {
     return true;
   }
 
+  int32_t signedLength;
+  int32_t signedValue;
+  if (Assembler::SupportsFastUnalignedAccesses() &&
+      peek2xI32(&signedLength, &signedValue) &&
+      uint32_t(signedLength) <= MaxInlineMemoryFillLength) {
+    return emitMemFillInline();
+  }
+  return emitMemFillCall(lineOrBytecode);
+}
+
+bool BaseCompiler::emitMemFillCall(uint32_t lineOrBytecode) {
   pushHeapBase();
   return emitInstanceCall(
       lineOrBytecode, usesSharedMemory() ? SASigMemFillShared : SASigMemFill,
       /*pushReturnedValue=*/false);
 }
 
+bool BaseCompiler::emitMemFillInline() {
+  MOZ_ASSERT(MaxInlineMemoryFillLength != 0);
+
+  int32_t signedLength;
+  int32_t signedValue;
+  MOZ_ALWAYS_TRUE(popConstI32(&signedLength));
+  MOZ_ALWAYS_TRUE(popConstI32(&signedValue));
+  uint32_t length = uint32_t(signedLength);
+  uint32_t value = uint32_t(signedValue);
+
+  RegI32 dest = popI32();
+
+  // A zero length copy is a no-op and cannot trap
+  if (length == 0) {
+    freeI32(dest);
+    return true;
+  }
+
+  // Compute the number of copies of each width we will need to do
+  size_t remainder = length;
+#ifdef JS_64BIT
+  size_t numCopies8 = remainder / sizeof(uint64_t);
+  remainder %= sizeof(uint64_t);
+#endif
+  size_t numCopies4 = remainder / sizeof(uint32_t);
+  remainder %= sizeof(uint32_t);
+  size_t numCopies2 = remainder / sizeof(uint16_t);
+  remainder %= sizeof(uint16_t);
+  size_t numCopies1 = remainder;
+
+  MOZ_ASSERT(numCopies2 <= 1 && numCopies1 <= 1);
+
+  // Generate splatted definitions for wider fills as needed
+#ifdef JS_64BIT
+  uint64_t val8 = SplatByteToUInt<uint64_t>(value, 8);
+#endif
+  uint32_t val4 = SplatByteToUInt<uint32_t>(value, 4);
+  uint32_t val2 = SplatByteToUInt<uint32_t>(value, 2);
+  uint32_t val1 = value;
+
+  // Store the fill value to the destination from high to low. We will trap
+  // without writing anything on the first store if any dest byte is
+  // out-of-bounds.
+  size_t offset = length;
+  bool omitBoundsCheck = false;
+
+  if (numCopies1) {
+    offset -= sizeof(uint8_t);
+
+    RegI32 temp = needI32();
+    moveI32(dest, temp);
+    pushI32(temp);
+    pushI32(val1);
+
+    MemoryAccessDesc access(Scalar::Uint8, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    if (!storeCommon(&access, check, ValType::I32)) {
+      return false;
+    }
+
+    omitBoundsCheck = true;
+  }
+
+  if (numCopies2) {
+    offset -= sizeof(uint16_t);
+
+    RegI32 temp = needI32();
+    moveI32(dest, temp);
+    pushI32(temp);
+    pushI32(val2);
+
+    MemoryAccessDesc access(Scalar::Uint16, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!storeCommon(&access, check, ValType::I32)) {
+      return false;
+    }
+
+    omitBoundsCheck = true;
+  }
+
+  for (uint32_t i = 0; i < numCopies4; i++) {
+    offset -= sizeof(uint32_t);
+
+    RegI32 temp = needI32();
+    moveI32(dest, temp);
+    pushI32(temp);
+    pushI32(val4);
+
+    MemoryAccessDesc access(Scalar::Uint32, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!storeCommon(&access, check, ValType::I32)) {
+      return false;
+    }
+
+    omitBoundsCheck = true;
+  }
+
+#ifdef JS_64BIT
+  for (uint32_t i = 0; i < numCopies8; i++) {
+    offset -= sizeof(uint64_t);
+
+    RegI32 temp = needI32();
+    moveI32(dest, temp);
+    pushI32(temp);
+    pushI64(val8);
+
+    MemoryAccessDesc access(Scalar::Int64, 1, offset, bytecodeOffset());
+    AccessCheck check;
+    check.omitBoundsCheck = omitBoundsCheck;
+    if (!storeCommon(&access, check, ValType::I64)) {
+      return false;
+    }
+
+    omitBoundsCheck = true;
+  }
+#endif
+
+  freeI32(dest);
+  return true;
+}
+
 bool BaseCompiler::emitMemOrTableInit(bool isMem) {
   if (!bulkmemOpsEnabled()) {
     return false;
   }
 
   uint32_t lineOrBytecode = readCallSiteLineOrBytecode();
 
   uint32_t segIndex = 0;
--- a/js/src/wasm/WasmIonCompile.cpp
+++ b/js/src/wasm/WasmIonCompile.cpp
@@ -2895,36 +2895,18 @@ static bool EmitAtomicXchg(FunctionCompi
   if (!f.inDeadCode() && !ins) {
     return false;
   }
 
   f.iter().setResult(ins);
   return true;
 }
 
-static bool EmitMemCopy(FunctionCompiler& f) {
-  // Bulk memory must be available if shared memory is enabled.
-#ifndef ENABLE_WASM_BULKMEM_OPS
-  if (f.env().sharedMemoryEnabled == Shareable::False) {
-    return f.iter().fail("bulk memory ops disabled");
-  }
-#endif
-
-  MDefinition *dst, *src, *len;
-  uint32_t dstTableIndex;
-  uint32_t srcTableIndex;
-  if (!f.iter().readMemOrTableCopy(true, &dstTableIndex, &dst, &srcTableIndex,
-                                   &src, &len)) {
-    return false;
-  }
-
-  if (f.inDeadCode()) {
-    return true;
-  }
-
+static bool EmitMemCopyCall(FunctionCompiler& f, MDefinition* dst,
+                            MDefinition* src, MDefinition* len) {
   uint32_t lineOrBytecode = f.readCallSiteLineOrBytecode();
 
   const SymbolicAddressSignature& callee =
       (f.env().usesSharedMemory() ? SASigMemCopyShared : SASigMemCopy);
   CallCompileState args;
   if (!f.passInstance(callee.argTypes[0], &args)) {
     return false;
   }
@@ -2944,16 +2926,156 @@ static bool EmitMemCopy(FunctionCompiler
   }
   if (!f.finishCall(&args)) {
     return false;
   }
 
   return f.builtinInstanceMethodCall(callee, lineOrBytecode, args);
 }
 
+static bool EmitMemCopyInline(FunctionCompiler& f, MDefinition* dst,
+                              MDefinition* src, MDefinition* len) {
+  MOZ_ASSERT(MaxInlineMemoryCopyLength != 0);
+
+  MOZ_ASSERT(len->isConstant() && len->type() == MIRType::Int32);
+  uint32_t length = len->toConstant()->toInt32();
+
+  // A zero length copy is a no-op and cannot trap
+  if (length == 0) {
+    return true;
+  }
+
+  // Compute the number of copies of each width we will need to do
+  size_t remainder = length;
+#ifdef JS_64BIT
+  size_t numCopies8 = remainder / sizeof(uint64_t);
+  remainder %= sizeof(uint64_t);
+#endif
+  size_t numCopies4 = remainder / sizeof(uint32_t);
+  remainder %= sizeof(uint32_t);
+  size_t numCopies2 = remainder / sizeof(uint16_t);
+  remainder %= sizeof(uint16_t);
+  size_t numCopies1 = remainder;
+
+  // Load all source bytes from low to high using the widest transfer width we
+  // can for the system. We will trap without writing anything if any source
+  // byte is out-of-bounds.
+  size_t offset = 0;
+  DefVector loadedValues;
+
+#ifdef JS_64BIT
+  for (uint32_t i = 0; i < numCopies8; i++) {
+    MemoryAccessDesc access(Scalar::Int64, 1, offset, f.bytecodeOffset());
+    auto* load = f.load(src, &access, ValType::I64);
+    if (!load || !loadedValues.append(load)) {
+      return false;
+    }
+
+    offset += sizeof(uint64_t);
+  }
+#endif
+
+  for (uint32_t i = 0; i < numCopies4; i++) {
+    MemoryAccessDesc access(Scalar::Uint32, 1, offset, f.bytecodeOffset());
+    auto* load = f.load(src, &access, ValType::I32);
+    if (!load || !loadedValues.append(load)) {
+      return false;
+    }
+
+    offset += sizeof(uint32_t);
+  }
+
+  if (numCopies2) {
+    MemoryAccessDesc access(Scalar::Uint16, 1, offset, f.bytecodeOffset());
+    auto* load = f.load(src, &access, ValType::I32);
+    if (!load || !loadedValues.append(load)) {
+      return false;
+    }
+
+    offset += sizeof(uint16_t);
+  }
+
+  if (numCopies1) {
+    MemoryAccessDesc access(Scalar::Uint8, 1, offset, f.bytecodeOffset());
+    auto* load = f.load(src, &access, ValType::I32);
+    if (!load || !loadedValues.append(load)) {
+      return false;
+    }
+  }
+
+  // Store all source bytes to the destination from high to low. We will trap
+  // without writing anything on the first store if any dest byte is
+  // out-of-bounds.
+  offset = length;
+
+  if (numCopies1) {
+    offset -= sizeof(uint8_t);
+
+    MemoryAccessDesc access(Scalar::Uint8, 1, offset, f.bytecodeOffset());
+    auto* value = loadedValues.popCopy();
+    f.store(dst, &access, value);
+  }
+
+  if (numCopies2) {
+    offset -= sizeof(uint16_t);
+
+    MemoryAccessDesc access(Scalar::Uint16, 1, offset, f.bytecodeOffset());
+    auto* value = loadedValues.popCopy();
+    f.store(dst, &access, value);
+  }
+
+  for (uint32_t i = 0; i < numCopies4; i++) {
+    offset -= sizeof(uint32_t);
+
+    MemoryAccessDesc access(Scalar::Uint32, 1, offset, f.bytecodeOffset());
+    auto* value = loadedValues.popCopy();
+    f.store(dst, &access, value);
+  }
+
+#ifdef JS_64BIT
+  for (uint32_t i = 0; i < numCopies8; i++) {
+    offset -= sizeof(uint64_t);
+
+    MemoryAccessDesc access(Scalar::Int64, 1, offset, f.bytecodeOffset());
+    auto* value = loadedValues.popCopy();
+    f.store(dst, &access, value);
+  }
+#endif
+
+  return true;
+}
+
+static bool EmitMemCopy(FunctionCompiler& f) {
+  // Bulk memory must be available if shared memory is enabled.
+#ifndef ENABLE_WASM_BULKMEM_OPS
+  if (f.env().sharedMemoryEnabled == Shareable::False) {
+    return f.iter().fail("bulk memory ops disabled");
+  }
+#endif
+
+  MDefinition *dst, *src, *len;
+  uint32_t dstMemIndex;
+  uint32_t srcMemIndex;
+  if (!f.iter().readMemOrTableCopy(true, &dstMemIndex, &dst, &srcMemIndex, &src,
+                                   &len)) {
+    return false;
+  }
+
+  if (f.inDeadCode()) {
+    return true;
+  }
+
+  if (Assembler::SupportsFastUnalignedAccesses() && len->isConstant() &&
+      len->type() == MIRType::Int32 &&
+      uint32_t(len->toConstant()->toInt32()) <= MaxInlineMemoryCopyLength) {
+    return EmitMemCopyInline(f, dst, src, len);
+  }
+  return EmitMemCopyCall(f, dst, src, len);
+}
+
 static bool EmitTableCopy(FunctionCompiler& f) {
   // Bulk memory must be available if shared memory is enabled.
 #ifndef ENABLE_WASM_BULKMEM_OPS
   if (f.env().sharedMemoryEnabled == Shareable::False) {
     return f.iter().fail("bulk memory ops disabled");
   }
 #endif
 
@@ -3041,33 +3163,18 @@ static bool EmitDataOrElemDrop(FunctionC
 
   if (!f.finishCall(&args)) {
     return false;
   }
 
   return f.builtinInstanceMethodCall(callee, lineOrBytecode, args);
 }
 
-static bool EmitMemFill(FunctionCompiler& f) {
-  // Bulk memory must be available if shared memory is enabled.
-#ifndef ENABLE_WASM_BULKMEM_OPS
-  if (f.env().sharedMemoryEnabled == Shareable::False) {
-    return f.iter().fail("bulk memory ops disabled");
-  }
-#endif
-
-  MDefinition *start, *val, *len;
-  if (!f.iter().readMemFill(&start, &val, &len)) {
-    return false;
-  }
-
-  if (f.inDeadCode()) {
-    return true;
-  }
-
+static bool EmitMemFillCall(FunctionCompiler& f, MDefinition* start,
+                            MDefinition* val, MDefinition* len) {
   uint32_t lineOrBytecode = f.readCallSiteLineOrBytecode();
 
   const SymbolicAddressSignature& callee =
       f.env().usesSharedMemory() ? SASigMemFillShared : SASigMemFill;
   CallCompileState args;
   if (!f.passInstance(callee.argTypes[0], &args)) {
     return false;
   }
@@ -3088,16 +3195,122 @@ static bool EmitMemFill(FunctionCompiler
 
   if (!f.finishCall(&args)) {
     return false;
   }
 
   return f.builtinInstanceMethodCall(callee, lineOrBytecode, args);
 }
 
+static bool EmitMemFillInline(FunctionCompiler& f, MDefinition* start,
+                              MDefinition* val, MDefinition* len) {
+  MOZ_ASSERT(MaxInlineMemoryFillLength != 0);
+
+  MOZ_ASSERT(len->isConstant() && len->type() == MIRType::Int32 &&
+             val->isConstant() && val->type() == MIRType::Int32);
+
+  uint32_t length = len->toConstant()->toInt32();
+  uint32_t value = val->toConstant()->toInt32();
+
+  // A zero length copy is a no-op and cannot trap
+  if (length == 0) {
+    return true;
+  }
+
+  // Compute the number of copies of each width we will need to do
+  size_t remainder = length;
+#ifdef JS_64BIT
+  size_t numCopies8 = remainder / sizeof(uint64_t);
+  remainder %= sizeof(uint64_t);
+#endif
+  size_t numCopies4 = remainder / sizeof(uint32_t);
+  remainder %= sizeof(uint32_t);
+  size_t numCopies2 = remainder / sizeof(uint16_t);
+  remainder %= sizeof(uint16_t);
+  size_t numCopies1 = remainder;
+
+  // Generate splatted definitions for wider fills as needed
+#ifdef JS_64BIT
+  MDefinition* val8 =
+      numCopies8 ? f.constant(int64_t(SplatByteToUInt<uint64_t>(value, 8)))
+                 : nullptr;
+#endif
+  MDefinition* val4 =
+      numCopies4 ? f.constant(Int32Value(SplatByteToUInt<uint32_t>(value, 4)),
+                              MIRType::Int32)
+                 : nullptr;
+  MDefinition* val2 =
+      numCopies2 ? f.constant(Int32Value(SplatByteToUInt<uint32_t>(value, 2)),
+                              MIRType::Int32)
+                 : nullptr;
+
+  // Store the fill value to the destination from high to low. We will trap
+  // without writing anything on the first store if any dest byte is
+  // out-of-bounds.
+  size_t offset = length;
+
+  if (numCopies1) {
+    offset -= sizeof(uint8_t);
+
+    MemoryAccessDesc access(Scalar::Uint8, 1, offset, f.bytecodeOffset());
+    f.store(start, &access, val);
+  }
+
+  if (numCopies2) {
+    offset -= sizeof(uint16_t);
+
+    MemoryAccessDesc access(Scalar::Uint16, 1, offset, f.bytecodeOffset());
+    f.store(start, &access, val2);
+  }
+
+  for (uint32_t i = 0; i < numCopies4; i++) {
+    offset -= sizeof(uint32_t);
+
+    MemoryAccessDesc access(Scalar::Uint32, 1, offset, f.bytecodeOffset());
+    f.store(start, &access, val4);
+  }
+
+#ifdef JS_64BIT
+  for (uint32_t i = 0; i < numCopies8; i++) {
+    offset -= sizeof(uint64_t);
+
+    MemoryAccessDesc access(Scalar::Int64, 1, offset, f.bytecodeOffset());
+    f.store(start, &access, val8);
+  }
+#endif
+
+  return true;
+}
+
+static bool EmitMemFill(FunctionCompiler& f) {
+  // Bulk memory must be available if shared memory is enabled.
+#ifndef ENABLE_WASM_BULKMEM_OPS
+  if (f.env().sharedMemoryEnabled == Shareable::False) {
+    return f.iter().fail("bulk memory ops disabled");
+  }
+#endif
+
+  MDefinition *start, *val, *len;
+  if (!f.iter().readMemFill(&start, &val, &len)) {
+    return false;
+  }
+
+  if (f.inDeadCode()) {
+    return true;
+  }
+
+  if (Assembler::SupportsFastUnalignedAccesses() && len->isConstant() &&
+      len->type() == MIRType::Int32 &&
+      uint32_t(len->toConstant()->toInt32()) <= MaxInlineMemoryFillLength &&
+      val->isConstant() && val->type() == MIRType::Int32) {
+    return EmitMemFillInline(f, start, val, len);
+  }
+  return EmitMemFillCall(f, start, val, len);
+}
+
 static bool EmitMemOrTableInit(FunctionCompiler& f, bool isMem) {
   // Bulk memory must be available if shared memory is enabled.
 #ifndef ENABLE_WASM_BULKMEM_OPS
   if (f.env().sharedMemoryEnabled == Shareable::False) {
     return f.iter().fail("bulk memory ops disabled");
   }
 #endif
 
--- a/js/src/wasm/WasmTypes.h
+++ b/js/src/wasm/WasmTypes.h
@@ -2372,35 +2372,54 @@ static constexpr size_t GetOffsetGuardLi
   return hugeMemory ? HugeOffsetGuardLimit : OffsetGuardLimit;
 #else
   return OffsetGuardLimit;
 #endif
 }
 
 #ifdef WASM_SUPPORTS_HUGE_MEMORY
 static const size_t MaxOffsetGuardLimit = HugeOffsetGuardLimit;
+static const size_t MinOffsetGuardLimit = OffsetGuardLimit;
 #else
 static const size_t MaxOffsetGuardLimit = OffsetGuardLimit;
+static const size_t MinOffsetGuardLimit = OffsetGuardLimit;
 #endif
 
 // Return whether the given immediate satisfies the constraints of the platform
 // (viz. that, on ARM, IsValidARMImmediate).
 
 extern bool IsValidBoundsCheckImmediate(uint32_t i);
 
 // For a given WebAssembly/asm.js max size, return the number of bytes to
 // map which will necessarily be a multiple of the system page size and greater
 // than maxSize. For a returned mappedSize:
 //   boundsCheckLimit = mappedSize - GuardSize
 //   IsValidBoundsCheckImmediate(boundsCheckLimit)
 
 extern size_t ComputeMappedSize(uint32_t maxSize);
 
-// wasm::Frame represents the bytes pushed by the call instruction and the fixed
-// prologue generated by wasm::GenerateCallablePrologue.
+// The following thresholds were derived from a microbenchmark. If we begin to
+// ship this optimization for more platforms, we will need to extend this list.
+
+#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64)
+static const uint32_t MaxInlineMemoryCopyLength = 64;
+static const uint32_t MaxInlineMemoryFillLength = 64;
+#elif defined(JS_CODEGEN_X86)
+static const uint32_t MaxInlineMemoryCopyLength = 32;
+static const uint32_t MaxInlineMemoryFillLength = 32;
+#else
+static const uint32_t MaxInlineMemoryCopyLength = 0;
+static const uint32_t MaxInlineMemoryFillLength = 0;
+#endif
+
+static_assert(MaxInlineMemoryCopyLength < MinOffsetGuardLimit, "precondition");
+static_assert(MaxInlineMemoryFillLength < MinOffsetGuardLimit, "precondition");
+
+// wasm::Frame represents the bytes pushed by the call instruction and the
+// fixed prologue generated by wasm::GenerateCallablePrologue.
 //
 // Across all architectures it is assumed that, before the call instruction, the
 // stack pointer is WasmStackAlignment-aligned. Thus after the prologue, and
 // before the function has made its stack reservation, the stack alignment is
 // sizeof(Frame) % WasmStackAlignment.
 //
 // During MacroAssembler code generation, the bytes pushed after the wasm::Frame
 // are counted by masm.framePushed. Thus, the stack alignment at any point in