Bug 1526383: Add essential arm64 support to nsWindowsDllInterceptor; r=handyman
author: Aaron Klotz <aklotz@mozilla.com>
Thu, 21 Feb 2019 18:41:17 +0000
changeset 460322 fae2ad1c3dc6823fd9684d7f46968446adac66fa
parent 460321 af3314b6bcb1f9bb31700ad9a456661018e37686
child 460323 28c61342bdad7edf20d2ca099112673cca4e8360
push id: 78680
push user: aklotz@mozilla.com
push date: Thu, 21 Feb 2019 21:58:31 +0000
treeherder: autoland@28c61342bdad [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: handyman
bugs: 1526383, 1526016
milestone: 67.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1526383: Add essential arm64 support to nsWindowsDllInterceptor; r=handyman This patch doesn't cover all possible functions for which we currently instantiate interceptors inside Firefox/Gecko. Rather than asserting, we just fail in those cases (at least until we have full coverage of existing uses). This is okay, as for the upcoming milestone 2 of aarch64 builds, we are most concerned with successfully being able to hook the following functions: ntdll!LdrLoadDll, ntdll!LdrUnloadDll, ntdll!LdrResolveDelayLoadedAPI, and user32!GetWindowInfo. So, within that context, the aarch64 implementation is fairly simple: Each instruction is 4 bytes wide. We iterate down each instruction, and if the current instruction is *not* PC-relative, we just copy it verbatim. If we encounter an instruction that *is* PC-relative, we either decode it and rewrite it inside the trampoline, or we fail. For the purposes of milestone 2, the only instruction that is essential to decode is ADRP. In bug 1526016 I modify TestDllInterceptor to exclude functions that are not yet supported by this patch. Differential Revision: https://phabricator.services.mozilla.com/D19446
mozglue/build/moz.build
mozglue/misc/interceptor/Arm64.cpp
mozglue/misc/interceptor/Arm64.h
mozglue/misc/interceptor/PatcherBase.h
mozglue/misc/interceptor/PatcherDetour.h
mozglue/misc/interceptor/TargetFunction.h
mozglue/misc/interceptor/Trampoline.h
mozglue/misc/interceptor/moz.build
mozglue/misc/moz.build
mozglue/misc/nsWindowsDllInterceptor.h
--- a/mozglue/build/moz.build
+++ b/mozglue/build/moz.build
@@ -80,16 +80,20 @@ if CONFIG['MOZ_WIDGET_TOOLKIT']:
             'WindowsDllBlocklistDefs.h',
         ]
         EXPORTS.mozilla.glue += [
             'WindowsDllServices.h',
         ]
         USE_LIBS += [
             'mscom-mozglue',
         ]
+        if CONFIG['CPU_ARCH'] == 'aarch64':
+            USE_LIBS += [
+                'interceptor',
+            ]
 
     EXPORTS.mozilla += [
         'arm.h',
         'mips.h',
         'SSE.h',
         'WindowsDllBlocklist.h',
     ]
 
new file mode 100644
--- /dev/null
+++ b/mozglue/misc/interceptor/Arm64.cpp
@@ -0,0 +1,78 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
+
+#include "Arm64.h"
+
+namespace mozilla {
+namespace interceptor {
+namespace arm64 {
+
+struct PCRelativeLoadTest {
+  // Bitmask to be ANDed with the instruction to isolate the bits that this
+  // entry is interested in.
+  uint32_t mTestMask;
+  // The value that the masked instruction must equal for this entry to match.
+  uint32_t mMatchBits;
+  // Decoder to call on a match; nullptr marks the instruction unsupported.
+  LoadInfo (*mDecodeFn)(const uintptr_t aPC, const uint32_t aInst);
+};
+
+static LoadInfo ADRPDecode(const uintptr_t aPC, const uint32_t aInst) {
+  // ADRP encodes a signed 21-bit page offset as immhi (bits 23:5) and immlo
+  // (bits 30:29); Rd is bits 4:0. Result = (aPC & ~0xFFF) + (imm << 12).
+  const uint32_t kMaskDataProcImmPcRelativeImmLo = 0x60000000;
+  const uint32_t kMaskDataProcImmPcRelativeImmHi = 0x00FFFFE0;
+  const uint32_t imm21 = ((aInst & kMaskDataProcImmPcRelativeImmHi) >> 3) |
+                         ((aInst & kMaskDataProcImmPcRelativeImmLo) >> 29);
+
+  // Sign-extend from bit 20 so that pages *below* the current PC decode
+  // correctly, then scale by the page size using wrap-safe unsigned math.
+  const uintptr_t kSignBit = uintptr_t(1) << 20;
+  const uintptr_t offset = ((uintptr_t(imm21) ^ kSignBit) - kSignBit) << 12;
+  const uint8_t reg = aInst & 0x1F;
+  return LoadInfo((aPC & ~0xFFFULL) + offset, reg);
+}
+
+// Order matters: more specific encodings must precede less specific ones.
+// Masks cover only fixed opcode bits (never immediate or register fields).
+static const PCRelativeLoadTest gPCRelTests[] = {
+    {0x9F000000, 0x10000000, nullptr},      // ADR
+    {0x9F000000, 0x90000000, &ADRPDecode},  // ADRP
+    {0xFF000000, 0x58000000, nullptr},      // LDR (literal) 64-bit GPR
+    {0x3B000000, 0x18000000, nullptr},      // LDR (literal) (remaining forms)
+    {0x7C000000, 0x14000000, nullptr},      // B (unconditional immediate)
+    {0xFE000000, 0x54000000, nullptr},      // B.Cond
+    {0x7E000000, 0x34000000, nullptr},      // Compare and branch (imm)
+    {0x7E000000, 0x36000000, nullptr},      // Test and branch (imm)
+    {0xFE000000, 0xD6000000, nullptr}       // Unconditional branch (reg)
+};
+
+/**
+ * Classifies |aInst| by walking |gPCRelTests| in order. Each entry's
+ * |mTestMask| is ANDed with |aInst| and the result is compared against
+ * |mMatchBits|; the first entry that matches decides the outcome. If that
+ * entry has an |mDecodeFn|, we return whatever it decodes; otherwise the
+ * instruction is PC-relative but unsupported. If no entry matches at all,
+ * the instruction is reported as not being PC-relative.
+ */
+MFBT_API Result<LoadInfo, PCRelCheckError> CheckForPCRel(const uintptr_t aPC,
+                                                         const uint32_t aInst) {
+  for (const PCRelativeLoadTest& entry : gPCRelTests) {
+    if ((aInst & entry.mTestMask) != entry.mMatchBits) {
+      continue;
+    }
+    if (entry.mDecodeFn) {
+      return entry.mDecodeFn(aPC, aInst);
+    }
+    return Err(PCRelCheckError::NoDecoderAvailable);
+  }
+
+  return Err(PCRelCheckError::InstructionNotPCRel);
+}
+
+}  // namespace arm64
+}  // namespace interceptor
+}  // namespace mozilla
new file mode 100644
--- /dev/null
+++ b/mozglue/misc/interceptor/Arm64.h
@@ -0,0 +1,49 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_interceptor_Arm64_h
+#define mozilla_interceptor_Arm64_h
+
+#include "mozilla/Assertions.h"
+#include "mozilla/Result.h"
+#include "mozilla/Types.h"
+
+namespace mozilla {
+namespace interceptor {
+namespace arm64 {
+
+// Decoded form of a PC-relative load. Only loads are handled, not branches.
+struct LoadInfo {
+  LoadInfo(const uintptr_t aAbsAddress, const uint8_t aDestReg)
+      : mAbsAddress(aAbsAddress), mDestReg(aDestReg) {
+    MOZ_ASSERT(aDestReg < 32);
+  }
+
+  // The absolute address to be loaded into a register
+  uintptr_t mAbsAddress;
+  // The destination register number for the load; asserted to be below 32
+  uint8_t mDestReg;
+};
+
+enum class PCRelCheckError {
+  InstructionNotPCRel,  // No known PC-relative encoding matched
+  NoDecoderAvailable,   // Matched a PC-relative encoding with no decoder
+};
+
+MFBT_API Result<LoadInfo, PCRelCheckError> CheckForPCRel(const uintptr_t aPC,
+                                                         const uint32_t aInst);
+
+inline static uint32_t BuildUnconditionalBranchToRegister(const uint32_t aReg) {
+  MOZ_ASSERT(aReg < 32);
+  const uint32_t kBrX0 = 0xD61F0000;  // BR x0; register field starts at bit 5
+  return kBrX0 | (aReg << 5);
+}
+
+}  // namespace arm64
+}  // namespace interceptor
+}  // namespace mozilla
+
+#endif  // mozilla_interceptor_Arm64_h
--- a/mozglue/misc/interceptor/PatcherBase.h
+++ b/mozglue/misc/interceptor/PatcherBase.h
@@ -19,16 +19,17 @@ class WindowsDllPatcherBase {
 
   template <typename... Args>
   explicit WindowsDllPatcherBase(Args... aArgs)
       : mVMPolicy(std::forward<Args>(aArgs)...) {}
 
   ReadOnlyTargetFunction<MMPolicyT> ResolveRedirectedAddress(
       FARPROC aOriginalFunction) {
     ReadOnlyTargetFunction<MMPolicyT> origFn(mVMPolicy, aOriginalFunction);
+#if defined(_M_IX86) || defined(_M_X64)
     // If function entry is jmp rel8 stub to the internal implementation, we
     // resolve redirected address from the jump target.
     if (origFn[0] == 0xeb) {
       int8_t offset = (int8_t)(origFn[1]);
       uintptr_t abstarget = origFn.GetAddress() + 2 + offset;
 
 #if defined(_M_X64)
       // We redirect to the target of a short jump backwards if the target
@@ -40,23 +41,23 @@ class WindowsDllPatcherBase {
           return redirectFn;
         }
       }
 #endif
 
       if (offset <= 0) {
         // Bail out for negative offset: probably already patched by some
         // third-party code.
-        return std::move(origFn);
+        return origFn;
       }
 
       for (int8_t i = 0; i < offset; i++) {
         if (origFn[2 + i] != 0x90) {
           // Bail out on insufficient nop space.
-          return std::move(origFn);
+          return origFn;
         }
       }
 
       return EnsureTargetIsAccessible(std::move(origFn), abstarget);
     }
 
 #if defined(_M_IX86)
     // If function entry is jmp [disp32] such as used by kernel32,
@@ -74,25 +75,26 @@ class WindowsDllPatcherBase {
     }
 
     if (origFn[0] == 0xe9) {
       // require for TestDllInterceptor with --disable-optimize
       uintptr_t abstarget = (origFn + 1).ReadDisp32AsAbsolute();
       return EnsureTargetIsAccessible(std::move(origFn), abstarget);
     }
 #endif
+#endif  // defined(_M_IX86) || defined(_M_X64)
 
-    return std::move(origFn);
+    return origFn;
   }
 
  private:
   ReadOnlyTargetFunction<MMPolicyT> EnsureTargetIsAccessible(
       ReadOnlyTargetFunction<MMPolicyT> aOrigFn, uintptr_t aRedirAddress) {
     if (!mVMPolicy.IsPageAccessible(reinterpret_cast<void*>(aRedirAddress))) {
-      return std::move(aOrigFn);
+      return aOrigFn;
     }
 
     return ReadOnlyTargetFunction<MMPolicyT>(mVMPolicy, aRedirAddress);
   }
 
  protected:
   VMPolicy mVMPolicy;
 };
--- a/mozglue/misc/interceptor/PatcherDetour.h
+++ b/mozglue/misc/interceptor/PatcherDetour.h
@@ -2,21 +2,25 @@
 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
 
 #ifndef mozilla_interceptor_PatcherDetour_h
 #define mozilla_interceptor_PatcherDetour_h
 
+#if defined(_M_ARM64)
+#  include "mozilla/interceptor/Arm64.h"
+#endif  // defined(_M_ARM64)
 #include "mozilla/interceptor/PatcherBase.h"
 #include "mozilla/interceptor/Trampoline.h"
 
 #include "mozilla/ScopeExit.h"
 #include "mozilla/TypedEnumBits.h"
+#include "mozilla/Types.h"
 #include "mozilla/Unused.h"
 
 #define COPY_CODES(NBYTES)                          \
   do {                                              \
     tramp.CopyFrom(origBytes.GetAddress(), NBYTES); \
     origBytes += NBYTES;                            \
   } while (0)
 
@@ -62,20 +66,16 @@ class WindowsDllDetourPatcher final : pu
 #elif defined(_M_ARM64)
     size_t nBytes = 4;
 #else
 #  error "Unknown processor type"
 #endif
 
     const auto& tramps = this->mVMPolicy.Items();
     for (auto&& tramp : tramps) {
-#if defined(_M_ARM64)
-      MOZ_RELEASE_ASSERT(false, "Shouldn't get here");
-#endif
-
       // First we read the pointer to the interceptor instance.
       Maybe<uintptr_t> instance = tramp.ReadEncodedPointer();
       if (!instance) {
         continue;
       }
 
       if (instance.value() != reinterpret_cast<uintptr_t>(this)) {
         // tramp does not belong to this interceptor instance.
@@ -96,63 +96,93 @@ class WindowsDllDetourPatcher final : pu
       }
 
       WritableTargetFunction<MMPolicyT> origBytes(
           this->mVMPolicy, interceptedFn.value(), nBytes);
       if (!origBytes) {
         continue;
       }
 
+#if defined(_M_IX86) || defined(_M_X64)
+
       Maybe<uint8_t> maybeOpcode1 = origBytes.ReadByte();
       if (!maybeOpcode1) {
         continue;
       }
 
       uint8_t opcode1 = maybeOpcode1.value();
 
-#if defined(_M_IX86)
+#  if defined(_M_IX86)
       // Ensure the JMP from CreateTrampoline is where we expect it to be.
       MOZ_ASSERT(opcode1 == 0xE9);
       if (opcode1 != 0xE9) {
         continue;
       }
 
       intptr_t startOfTrampInstructions =
           static_cast<intptr_t>(tramp.GetCurrentRemoteAddress());
 
       origBytes.WriteDisp32(startOfTrampInstructions);
       if (!origBytes) {
         continue;
       }
 
       origBytes.Commit();
-#elif defined(_M_X64)
+#  elif defined(_M_X64)
       if (opcode1 == 0x49) {
         if (!Clear13BytePatch(origBytes, tramp.GetCurrentRemoteAddress())) {
           continue;
         }
       } else if (opcode1 == 0xB8) {
         if (!Clear10BytePatch(origBytes)) {
           continue;
         }
       } else {
         MOZ_ASSERT_UNREACHABLE("Unrecognized patch!");
         continue;
       }
+#  endif
+
 #elif defined(_M_ARM64)
-      Unused << opcode1;
-      MOZ_RELEASE_ASSERT(false, "Shouldn't get here");
+
+      // Ensure that we see the instructions that we expect
+      Maybe<uint32_t> inst1 = origBytes.ReadLong();
+      if (!inst1) {
+        continue;
+      }
+
+      if (inst1.value() != 0x58000050) {
+        MOZ_ASSERT_UNREACHABLE("Unrecognized patch!");
+        continue;
+      }
+
+      Maybe<uint32_t> inst2 = origBytes.ReadLong();
+      if (!inst2) {
+        continue;
+      }
+
+      if (inst2.value() != arm64::BuildUnconditionalBranchToRegister(16)) {
+        MOZ_ASSERT_UNREACHABLE("Unrecognized patch!");
+        continue;
+      }
+
+      // Clobber the pointer to our hook function with a pointer to the
+      // start of the trampoline.
+      origBytes.WritePointer(tramp.GetCurrentRemoteAddress());
+      origBytes.Commit();
+
 #else
 #  error "Unknown processor type"
 #endif
     }
 
     this->mVMPolicy.Clear();
   }
 
+#if defined(_M_X64)
   bool Clear13BytePatch(WritableTargetFunction<MMPolicyT>& aOrigBytes,
                         const uintptr_t aResetToAddress) {
     Maybe<uint8_t> maybeOpcode2 = aOrigBytes.ReadByte();
     if (!maybeOpcode2) {
       return false;
     }
 
     uint8_t opcode2 = maybeOpcode2.value();
@@ -206,35 +236,38 @@ class WindowsDllDetourPatcher final : pu
     uint8_t opcode1 = maybeOpcode1.value();
     if (opcode1 != 0x49) {
       return false;
     }
 
     // Now we can just delegate the rest to our normal 13-byte patch clearing.
     return Clear13BytePatch(writableIntermediate, stubTramp.value());
   }
+#endif  // defined(_M_X64)
 
   void Init(DetourFlags aFlags = DetourFlags::eDefault, int aNumHooks = 0) {
     if (Initialized()) {
       return;
     }
 
     mFlags = aFlags;
 
     if (aNumHooks == 0) {
       // Win32 allocates VM addresses at a 64KiB granularity, so by default we
       // might as well utilize that entire 64KiB reservation instead of
       // artifically constraining ourselves to the page size.
       aNumHooks = this->mVMPolicy.GetAllocGranularity() / kHookSize;
     }
 
     ReservationFlags resFlags = ReservationFlags::eDefault;
+#if defined(_M_X64)
     if (aFlags & DetourFlags::eEnable10BytePatch) {
       resFlags |= ReservationFlags::eForceFirst2GB;
     }
+#endif  // defined(_M_X64)
 
     this->mVMPolicy.Reserve(aNumHooks, resFlags);
   }
 
   bool Initialized() const { return !!this->mVMPolicy; }
 
   bool AddHook(FARPROC aTargetFn, intptr_t aHookDest, void** aOrigFunc) {
     ReadOnlyTargetFunction<MMPolicyT> target(
@@ -244,18 +277,21 @@ class WindowsDllDetourPatcher final : pu
     if (!*aOrigFunc) {
       return false;
     }
 
     return true;
   }
 
  protected:
+  const static int kHookSize = 128;
+
+#if !defined(_M_ARM64)
+
   const static int kPageSize = 4096;
-  const static int kHookSize = 128;
 
   // rex bits
   static const BYTE kMaskHighNibble = 0xF0;
   static const BYTE kRexOpcode = 0x40;
   static const BYTE kMaskRexW = 0x08;
   static const BYTE kMaskRexR = 0x04;
   static const BYTE kMaskRexX = 0x02;
   static const BYTE kMaskRexB = 0x01;
@@ -312,25 +348,25 @@ class WindowsDllDetourPatcher final : pu
       case kModDisp8:
         numBytes += 1;
         break;
       case kModDisp32:
         numBytes += 4;
         break;
       case kModNoRegDisp:
         if ((*aModRm & kMaskRm) == kRmNoRegDispDisp32) {
-#if defined(_M_X64)
+#  if defined(_M_X64)
           if (aSubOpcode) {
             *aSubOpcode = (*aModRm & kMaskReg) >> kRegFieldShift;
           }
           return kModOperand64;
-#else
+#  else
           // On IA-32, all ModR/M instruction modes address memory relative to 0
           numBytes += 4;
-#endif
+#  endif
         } else if (((*aModRm & kMaskRm) == kRmNeedSib &&
                     (*(aModRm + 1) & kMaskSibBase) == kSibBaseEbp)) {
           numBytes += 4;
         }
         break;
       default:
         // This should not be reachable
         MOZ_ASSERT_UNREACHABLE("Impossible value for modr/m byte mod bits");
@@ -341,17 +377,17 @@ class WindowsDllDetourPatcher final : pu
       numBytes += 1;
     }
     if (aSubOpcode) {
       *aSubOpcode = (*aModRm & kMaskReg) >> kRegFieldShift;
     }
     return numBytes;
   }
 
-#if defined(_M_X64)
+#  if defined(_M_X64)
   enum class JumpType{Je, Jne, Jmp, Call};
 
   static bool GenerateJump(Trampoline<MMPolicyT>& aTramp,
                            uintptr_t aAbsTargetAddress, const JumpType aType) {
     // Near call, absolute indirect, address given in r/m32
     if (aType == JumpType::Call) {
       // CALL [RIP+0]
       aTramp.WriteByte(0xff);
@@ -379,25 +415,21 @@ class WindowsDllDetourPatcher final : pu
     aTramp.WriteByte(0xff);
     aTramp.WriteByte(0x25);
     // The offset to jump destination is 0
     aTramp.WriteInteger(0);
     aTramp.WritePointer(aAbsTargetAddress);
 
     return !!aTramp;
   }
-#endif
+#  endif
 
-  enum ePrefixGroupBits {
-    eNoPrefixes = 0,
-    ePrefixGroup1 = (1 << 0),
-    ePrefixGroup2 = (1 << 1),
-    ePrefixGroup3 = (1 << 2),
-    ePrefixGroup4 = (1 << 3)
-  };
+  enum ePrefixGroupBits{eNoPrefixes = 0, ePrefixGroup1 = (1 << 0),
+                        ePrefixGroup2 = (1 << 1), ePrefixGroup3 = (1 << 2),
+                        ePrefixGroup4 = (1 << 3)};
 
   int CountPrefixBytes(const ReadOnlyTargetFunction<MMPolicyT>& aBytes,
                        const int aBytesIndex, unsigned char* aOutGroupBits) {
     unsigned char& groupBits = *aOutGroupBits;
     groupBits = eNoPrefixes;
     int index = aBytesIndex;
     while (true) {
       switch (aBytes[index]) {
@@ -454,16 +486,18 @@ class WindowsDllDetourPatcher final : pu
   BYTE BuildModRmByte(BYTE aModBits, BYTE aReg, BYTE aRm) {
     MOZ_ASSERT((aRm & kMaskRm) == aRm);
     MOZ_ASSERT((aModBits & kMaskMod) == aModBits);
     MOZ_ASSERT(((aReg << kRegFieldShift) & kMaskReg) ==
                (aReg << kRegFieldShift));
     return aModBits | (aReg << kRegFieldShift) | aRm;
   }
 
+#endif  // !defined(_M_ARM64)
+
   void CreateTrampoline(ReadOnlyTargetFunction<MMPolicyT>& origBytes,
                         intptr_t aDest, void** aOutTramp) {
     *aOutTramp = nullptr;
 
     Trampoline<MMPolicyT> tramp(this->mVMPolicy.GetNextTrampoline());
     if (!tramp) {
       return;
     }
@@ -1037,17 +1071,46 @@ class WindowsDllDetourPatcher final : pu
         }
         COPY_CODES(len + 1);
       } else {
         MOZ_ASSERT_UNREACHABLE("Unrecognized opcode sequence");
         return;
       }
     }
 #elif defined(_M_ARM64)
-    MOZ_RELEASE_ASSERT(false, "Shouldn't get here");
+
+    // We could probably decrease this value under certain scenarios, depending
+    // on the proximity of the hook function to the target function, but this
+    // would add additional complexity that we don't have time to deal with at
+    // the moment.
+    const uint32_t kWorstCaseBytesRequired = 16;
+
+    while (origBytes.GetOffset() < kWorstCaseBytesRequired) {
+      uintptr_t curPC = origBytes.GetCurrentAbsolute();
+      uint32_t curInst = origBytes.ReadNextInstruction();
+
+      Result<arm64::LoadInfo, arm64::PCRelCheckError> pcRelInfo =
+          arm64::CheckForPCRel(curPC, curInst);
+      if (pcRelInfo.isErr()) {
+        if (pcRelInfo.unwrapErr() ==
+            arm64::PCRelCheckError::InstructionNotPCRel) {
+          // Instruction is not PC-relative, we can just copy it verbatim
+          tramp.WriteInstruction(curInst);
+          continue;
+        }
+
+        // No decoder available for PC-relative instruction; fail.
+        return;
+      }
+
+      // We need to load an absolute address into a particular register
+      tramp.WriteLoadLiteral(pcRelInfo.unwrap().mAbsAddress,
+                             pcRelInfo.unwrap().mDestReg);
+    }
+
 #else
 #  error "Unknown processor type"
 #endif
 
     if (origBytes.GetOffset() > 100) {
       // printf ("Too big!");
       return;
     }
@@ -1066,16 +1129,24 @@ class WindowsDllDetourPatcher final : pu
     // If the we found a Jmp, we don't need to add another instruction. However,
     // if we found a _conditional_ jump or a CALL (or no control operations
     // at all) then we still need to run the rest of aOriginalFunction.
     if (!foundJmp) {
       if (!GenerateJump(tramp, origBytes.GetAddress(), JumpType::Jmp)) {
         return;
       }
     }
+#elif defined(_M_ARM64)
+    // Write the branch from the trampoline back to the original code
+
+    tramp.WriteLoadLiteral(origBytes.GetAddress(), 16);
+    tramp.WriteInstruction(arm64::BuildUnconditionalBranchToRegister(16));
+
+#else
+#  error "Unsupported processor architecture"
 #endif
 
     // The trampoline is now complete.
     void* trampPtr = tramp.EndExecutableCode();
     if (!trampPtr) {
       return;
     }
 
@@ -1141,16 +1212,27 @@ class WindowsDllDetourPatcher final : pu
       target.WriteByte(0xbb);
       target.WritePointer(aDest);
 
       // jmp r11
       target.WriteByte(0x41);
       target.WriteByte(0xff);
       target.WriteByte(0xe3);
     }
+#elif defined(_M_ARM64)
+
+    // Now patch the original function
+    // LDR x16, .+8
+    target.WriteLong(0x58000050);
+    // BR x16
+    target.WriteLong(arm64::BuildUnconditionalBranchToRegister(16));
+    target.WritePointer(aDest);
+
+#else
+#  error "Unsupported processor architecture"
 #endif
 
     if (!target.Commit()) {
       return;
     }
 
     // Output the trampoline, thus signalling that this call was a success
     *aOutTramp = trampPtr;
--- a/mozglue/misc/interceptor/TargetFunction.h
+++ b/mozglue/misc/interceptor/TargetFunction.h
@@ -312,17 +312,17 @@ class MOZ_STACK_CLASS WritableTargetFunc
                             sizeof(int32_t))) {
       mAccumulatedStatus = false;
       return;
     }
 
     mOffset += sizeof(int32_t);
   }
 
-#if defined(_M_X64)
+#if defined(_M_X64) || defined(_M_ARM64)
   void WriteLong(const uint32_t aValue) {
     if (!mLocalBytes.append(reinterpret_cast<const uint8_t*>(&aValue),
                             sizeof(uint32_t))) {
       mAccumulatedStatus = false;
       return;
     }
 
     mOffset += sizeof(uint32_t);
@@ -681,16 +681,28 @@ class MOZ_STACK_CLASS ReadOnlyTargetFunc
     return reinterpret_cast<uintptr_t>(
         ::DecodePointer(reinterpret_cast<PVOID>(aEncodedPtr)));
   }
 
   bool IsValidAtOffset(const int8_t aOffset) const {
     return mTargetBytes->IsValidAtOffset(aOffset);
   }
 
+#if defined(_M_ARM64)
+
+  uint32_t ReadNextInstruction() {
+    const size_t kInstSize = sizeof(uint32_t);
+    mTargetBytes->EnsureLimit(mOffset + kInstSize);
+    const uint8_t* cur = mTargetBytes->GetLocalBytes() + mOffset;
+    mOffset += kInstSize;  // Step past the word we are about to return
+    return *reinterpret_cast<const uint32_t*>(cur);
+  }
+
+#else
+
   uint8_t const& operator*() const {
     mTargetBytes->EnsureLimit(mOffset);
     return *(mTargetBytes->GetLocalBytes() + mOffset);
   }
 
   uint8_t const& operator[](uint32_t aIndex) const {
     mTargetBytes->EnsureLimit(mOffset + aIndex);
     return *(mTargetBytes->GetLocalBytes() + mOffset + aIndex);
@@ -701,32 +713,36 @@ class MOZ_STACK_CLASS ReadOnlyTargetFunc
     return *this;
   }
 
   ReadOnlyTargetFunction& operator+=(uint32_t aDelta) {
     mOffset += aDelta;
     return *this;
   }
 
-  uint32_t GetOffset() const { return mOffset; }
-
   uintptr_t ReadDisp32AsAbsolute() {
     mTargetBytes->EnsureLimit(mOffset + sizeof(int32_t));
     int32_t disp = *reinterpret_cast<const int32_t*>(
         mTargetBytes->GetLocalBytes() + mOffset);
     uintptr_t result =
         mTargetBytes->GetBase() + mOffset + sizeof(int32_t) + disp;
     mOffset += sizeof(int32_t);
     return result;
   }
 
+#endif
+
+  uint32_t GetOffset() const { return mOffset; }
+
   uintptr_t OffsetToAbsolute(const uint8_t aOffset) const {
     return mTargetBytes->GetBase() + mOffset + aOffset;
   }
 
+  uintptr_t GetCurrentAbsolute() const { return OffsetToAbsolute(0); }
+
   /**
    * This method promotes the code referenced by this object to be writable.
    *
    * @param aLen    The length of the function's code to make writable. If set
    *                to zero, this object's current offset is used as the length.
    * @param aOffset The result's base address will be offset from this
    *                object's base address by |aOffset| bytes. This value may be
    *                negative.
--- a/mozglue/misc/interceptor/Trampoline.h
+++ b/mozglue/misc/interceptor/Trampoline.h
@@ -4,16 +4,17 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
 
 #ifndef mozilla_interceptor_Trampoline_h
 #define mozilla_interceptor_Trampoline_h
 
 #include "mozilla/Assertions.h"
 #include "mozilla/Attributes.h"
+#include "mozilla/CheckedInt.h"
 #include "mozilla/Maybe.h"
 #include "mozilla/Types.h"
 
 namespace mozilla {
 namespace interceptor {
 
 template <typename MMPolicy>
 class MOZ_STACK_CLASS Trampoline final {
@@ -66,16 +67,85 @@ class MOZ_STACK_CLASS Trampoline final {
 
     ::VirtualProtect(mLocalBase, mMaxOffset, mPrevLocalProt, &mPrevLocalProt);
   }
 
   explicit operator bool() const {
     return mLocalBase && mRemoteBase && mPrevLocalProt && mAccumulatedStatus;
   }
 
+#if defined(_M_ARM64)
+
+  void WriteInstruction(uint32_t aInstruction) {
+    const size_t kWordSize = sizeof(aInstruction);
+    if (mOffset + kWordSize <= mMaxOffset) {
+      *reinterpret_cast<uint32_t*>(mLocalBase + mOffset) = aInstruction;
+      mOffset += kWordSize;
+      return;
+    }
+    mAccumulatedStatus = false;  // No room left; operator bool() now fails
+  }
+
+  void WriteLoadLiteral(const uintptr_t aAddress, const uint8_t aReg) {
+    // Emits |LDR aReg, <literal>| and stores aAddress as that literal. The
+    // literal pool grows down from the *end* of the tramp, so we need room
+    // for both an instruction and a pointer.
+    if (mOffset + sizeof(uint32_t) + sizeof(uintptr_t) > mMaxOffset) {
+      mAccumulatedStatus = false;
+      return;
+    }
+    // Carve the literal slot off the end; later writes stop below it.
+    mMaxOffset -= sizeof(uintptr_t);
+    *reinterpret_cast<uintptr_t*>(mLocalBase + mMaxOffset) = aAddress;
+
+    CheckedInt<intptr_t> pc(GetCurrentRemoteAddress());
+    if (!pc.isValid()) {
+      mAccumulatedStatus = false;
+      return;
+    }
+    // NOTE(review): pc is remote-based, literal is mLocalBase-based; confirm.
+    CheckedInt<intptr_t> literal(reinterpret_cast<uintptr_t>(mLocalBase) +
+                                 mMaxOffset);
+    if (!literal.isValid()) {
+      mAccumulatedStatus = false;
+      return;
+    }
+
+    CheckedInt<intptr_t> ptrOffset = (literal - pc);
+    if (!ptrOffset.isValid()) {
+      mAccumulatedStatus = false;
+      return;
+    }
+    // ptrOffset is a byte distance; the encoding below takes 4-byte units.
+    // ptrOffset must be properly aligned
+    MOZ_ASSERT((ptrOffset.value() % 4) == 0);
+    ptrOffset /= 4;
+
+    CheckedInt<int32_t> offset(ptrOffset.value());
+    if (!offset.isValid()) {
+      mAccumulatedStatus = false;
+      return;
+    }
+
+    // Ensure that offset falls within the range of a signed 19-bit value
+    if (offset.value() < -0x40000 || offset.value() > 0x3FFFF) {
+      mAccumulatedStatus = false;
+      return;
+    }
+    // Assemble the load: the 19-bit offset sits at bit 5, the register at bit 0.
+    const int32_t kimm19Mask = 0x7FFFF;
+    int32_t masked = offset.value() & kimm19Mask;
+
+    MOZ_ASSERT(aReg < 32);
+    uint32_t loadInstr = 0x58000000 | (masked << 5) | aReg;
+    WriteInstruction(loadInstr);
+  }
+
+#else
+
   void WriteByte(uint8_t aValue) {
     if (mOffset >= mMaxOffset) {
       mAccumulatedStatus = false;
       return;
     }
 
     *(mLocalBase + mOffset) = aValue;
     ++mOffset;
@@ -86,16 +156,42 @@ class MOZ_STACK_CLASS Trampoline final {
       mAccumulatedStatus = false;
       return;
     }
 
     *reinterpret_cast<int32_t*>(mLocalBase + mOffset) = aValue;
     mOffset += sizeof(int32_t);
   }
 
+  void WriteDisp32(uintptr_t aAbsTarget) {
+    if (mOffset + sizeof(int32_t) > mMaxOffset) {
+      mAccumulatedStatus = false;
+      return;
+    }
+
+    // This needs to be computed from the remote location
+    intptr_t remoteTrampPosition = static_cast<intptr_t>(mRemoteBase + mOffset);
+
+    intptr_t diff = static_cast<intptr_t>(aAbsTarget) -
+                    (remoteTrampPosition + sizeof(int32_t));
+
+    CheckedInt<int32_t> checkedDisp(diff);
+    MOZ_ASSERT(checkedDisp.isValid());
+    if (!checkedDisp.isValid()) {
+      mAccumulatedStatus = false;
+      return;
+    }
+
+    int32_t disp = checkedDisp.value();
+    *reinterpret_cast<int32_t*>(mLocalBase + mOffset) = disp;
+    mOffset += sizeof(int32_t);
+  }
+
+#endif
+
   void WritePointer(uintptr_t aValue) {
     if (mOffset + sizeof(uintptr_t) > mMaxOffset) {
       mAccumulatedStatus = false;
       return;
     }
 
     *reinterpret_cast<uintptr_t*>(mLocalBase + mOffset) = aValue;
     mOffset += sizeof(uintptr_t);
@@ -121,40 +217,16 @@ class MOZ_STACK_CLASS Trampoline final {
     Maybe<uintptr_t> encoded(ReadPointer());
     if (!encoded) {
       return encoded;
     }
 
     return Some(ReadOnlyTargetFunction<MMPolicy>::DecodePtr(encoded.value()));
   }
 
-  void WriteDisp32(uintptr_t aAbsTarget) {
-    if (mOffset + sizeof(int32_t) > mMaxOffset) {
-      mAccumulatedStatus = false;
-      return;
-    }
-
-    // This needs to be computed from the remote location
-    intptr_t remoteTrampPosition = static_cast<intptr_t>(mRemoteBase + mOffset);
-
-    intptr_t diff = static_cast<intptr_t>(aAbsTarget) -
-                    (remoteTrampPosition + sizeof(int32_t));
-
-    CheckedInt<int32_t> checkedDisp(diff);
-    MOZ_ASSERT(checkedDisp.isValid());
-    if (!checkedDisp.isValid()) {
-      mAccumulatedStatus = false;
-      return;
-    }
-
-    int32_t disp = checkedDisp.value();
-    *reinterpret_cast<int32_t*>(mLocalBase + mOffset) = disp;
-    mOffset += sizeof(int32_t);
-  }
-
 #if defined(_M_IX86)
   // 32-bit only
   void AdjustDisp32AtOffset(uint32_t aOffset, uintptr_t aAbsTarget) {
     uint32_t effectiveOffset = mExeOffset + aOffset;
 
     if (effectiveOffset + sizeof(int32_t) > mMaxOffset) {
       mAccumulatedStatus = false;
       return;
@@ -208,17 +280,17 @@ class MOZ_STACK_CLASS Trampoline final {
 
  private:
   const MMPolicy* mMMPolicy;
   DWORD mPrevLocalProt;
   uint8_t* const mLocalBase;
   const uintptr_t mRemoteBase;
   uint32_t mOffset;
   uint32_t mExeOffset;
-  const uint32_t mMaxOffset;
+  uint32_t mMaxOffset;
   bool mAccumulatedStatus;
 };
 
 template <typename MMPolicy>
 class MOZ_STACK_CLASS TrampolineCollection final {
  public:
   class MOZ_STACK_CLASS TrampolineIterator final {
    public:
new file mode 100644
--- /dev/null
+++ b/mozglue/misc/interceptor/moz.build
@@ -0,0 +1,26 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+EXPORTS.mozilla.interceptor += [
+    'Arm64.h',
+    'MMPolicies.h',
+    'PatcherBase.h',
+    'PatcherDetour.h',
+    'PatcherNopSpace.h',
+    'TargetFunction.h',
+    'Trampoline.h',
+    'VMSharingPolicies.h',
+]
+
+if CONFIG['CPU_ARCH'] == 'aarch64':
+    Library('interceptor')
+
+    DEFINES['IMPL_MFBT'] = True
+
+    UNIFIED_SOURCES += [
+        'Arm64.cpp',
+    ]
+
--- a/mozglue/misc/moz.build
+++ b/mozglue/misc/moz.build
@@ -28,33 +28,27 @@ SOURCES += [
     'TimeStamp.cpp',
 ]
 
 OS_LIBS += CONFIG['REALTIME_LIBS']
 
 DEFINES['IMPL_MFBT'] = True
 
 if CONFIG['OS_ARCH'] == 'WINNT':
+    DIRS += [
+        'interceptor',
+    ]
     EXPORTS += [
         'nsWindowsDllInterceptor.h',
     ]
     EXPORTS.mozilla += [
         'DynamicallyLinkedFunctionPtr.h',
         'NativeNt.h',
         'WindowsMapRemoteView.h',
     ]
-    EXPORTS.mozilla.interceptor += [
-        'interceptor/MMPolicies.h',
-        'interceptor/PatcherBase.h',
-        'interceptor/PatcherDetour.h',
-        'interceptor/PatcherNopSpace.h',
-        'interceptor/TargetFunction.h',
-        'interceptor/Trampoline.h',
-        'interceptor/VMSharingPolicies.h',
-    ]
     SOURCES += [
         'TimeStamp_windows.cpp',
         'WindowsMapRemoteView.cpp',
     ]
     OS_LIBS += ['dbghelp']
 elif CONFIG['HAVE_CLOCK_MONOTONIC']:
     SOURCES += [
         'TimeStamp_posix.cpp',
--- a/mozglue/misc/nsWindowsDllInterceptor.h
+++ b/mozglue/misc/nsWindowsDllInterceptor.h
@@ -414,22 +414,16 @@ class WindowsDllInterceptor final
     }
 
     return AddDetour(proc, aHookDest, aOrigFunc);
   }
 
   bool AddDetour(FARPROC aProc, intptr_t aHookDest, void** aOrigFunc) {
     MOZ_ASSERT(mModule && aProc);
 
-#if defined(_M_ARM64)
-    // XXX: this is just to get things compiling; we'll have to add real
-    // support at some future point.
-    return false;
-#endif
-
     if (!mDetourPatcher.Initialized()) {
       DetourFlags flags = DetourFlags::eDefault;
 #if defined(_M_X64)
       if (mModule == ::GetModuleHandleW(L"ntdll.dll")) {
         // NTDLL hooks should attempt to use a 10-byte patch because some
         // injected DLLs do the same and interfere with our stuff.
         flags |= DetourFlags::eEnable10BytePatch;
       }