Bug 1284897 - Add opcodes to nsWindowsDllInterceptor for GetSaveFileNameW, GetOpenFileNameW and ImmReleaseContext. r=aklotz
☠☠ backed out by 04b72382940f ☠ ☠
authorDavid Parks <dparks@mozilla.com>
Fri, 20 Jan 2017 16:03:44 -0800
changeset 373199 a176abd99d2b2a12288694f0a8378266372d1f84
parent 373198 125a0e765379cebd702ecbcda6b60b56076e95c4
child 373200 4e81ec8850dcaf402ce8b23f146859cfea377f41
push id10863
push userjlorenzo@mozilla.com
push dateMon, 06 Mar 2017 23:02:23 +0000
treeherdermozilla-aurora@0931190cd725 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersaklotz
bugs1284897
milestone54.0a1
Bug 1284897 - Add opcodes to nsWindowsDllInterceptor for GetSaveFileNameW, GetOpenFileNameW and ImmReleaseContext. r=aklotz This includes a near-jump CALL instruction in x64, which expands to a far-jump CALL with a 64-bit address as inline data. This requires us to abandon the method where we memcpy the code block into the trampoline and, instead, build the trampoline function as we go.
xpcom/build/nsWindowsDllInterceptor.h
--- a/xpcom/build/nsWindowsDllInterceptor.h
+++ b/xpcom/build/nsWindowsDllInterceptor.h
@@ -65,16 +65,22 @@
  * jump instructions are not supported.
  *
  * Note that this is not thread-safe.  Sad day.
  *
  */
 
 #include <stdint.h>
 
+#define COPY_CODES(NBYTES)  do {    \
+  memcpy(&tramp[nTrampBytes], &origBytes[nOrigBytes], NBYTES);    \
+  nOrigBytes += NBYTES;             \
+  nTrampBytes += NBYTES;            \
+} while (0)
+
 namespace mozilla {
 namespace internal {
 
 class AutoVirtualProtect
 {
 public:
   AutoVirtualProtect(void* aFunc, size_t aSize, DWORD aProtect)
     : mFunc(aFunc), mSize(aSize), mNewProtect(aProtect), mOldProtect(0),
@@ -538,62 +544,61 @@ protected:
     return numBytes;
   }
 
 #if defined(_M_X64)
   // To patch for JMP and JE
 
   enum JumpType {
    Je,
-   Jmp
+   Jmp,
+   Call
   };
 
   struct JumpPatch {
     JumpPatch()
       : mHookOffset(0), mJumpAddress(0), mType(JumpType::Jmp)
     {
     }
 
     JumpPatch(size_t aOffset, intptr_t aAddress, JumpType aType = JumpType::Jmp)
       : mHookOffset(aOffset), mJumpAddress(aAddress), mType(aType)
     {
     }
 
-    void AddJumpPatch(size_t aHookOffset, intptr_t aAbsJumpAddress,
-                     JumpType aType = JumpType::Jmp)
-    {
-      mHookOffset = aHookOffset;
-      mJumpAddress = aAbsJumpAddress;
-      mType = aType;
-    }
-
     size_t GenerateJump(uint8_t* aCode)
     {
       size_t offset = mHookOffset;
       if (mType == JumpType::Je) {
         // JNE RIP+14
         aCode[offset]     = 0x75;
         aCode[offset + 1] = 14;
         offset += 2;
       }
 
-      // JMP [RIP+0]
-      aCode[offset] = 0xff;
-      aCode[offset + 1] = 0x25;
-      *reinterpret_cast<int32_t*>(aCode + offset + 2) = 0;
-
-      // Jump table
-      *reinterpret_cast<int64_t*>(aCode + offset + 2 + 4) = mJumpAddress;
-
-      return offset + 2 + 4 + 8;
-    }
-
-    bool HasJumpPatch() const
-    {
-      return !!mJumpAddress;
+      // Near call/jmp, absolute indirect, address given in r/m32
+      if (mType == JumpType::Call) {
+        // CALL [RIP+0]
+        aCode[offset] = 0xff;
+        aCode[offset + 1] = 0x15;
+        // The offset to jump destination -- ie it is placed 2 bytes after the offset.
+        *reinterpret_cast<int32_t*>(aCode + offset + 2) = 2;
+        aCode[offset + 2 + 4] = 0xeb;    // JMP +8 (jump over mJumpAddress)
+        aCode[offset + 2 + 4 + 1] = 8;
+        *reinterpret_cast<int64_t*>(aCode + offset + 2 + 4 + 2) = mJumpAddress;
+        return offset + 2 + 4 + 2 + 8;
+      } else {
+        // JMP [RIP+0]
+        aCode[offset] = 0xff;
+        aCode[offset + 1] = 0x25;
+        // The offset to jump destination is 0
+        *reinterpret_cast<int32_t*>(aCode + offset + 2) = 0;
+        *reinterpret_cast<int64_t*>(aCode + offset + 2 + 4) = mJumpAddress;
+        return offset + 2 + 4 + 8;
+      }
     }
 
     size_t mHookOffset;
     intptr_t mJumpAddress;
     JumpType mType;
   };
 
 #endif
@@ -667,345 +672,390 @@ protected:
   {
     *aOutTramp = nullptr;
 
     byteptr_t tramp = FindTrampolineSpace();
     if (!tramp) {
       return;
     }
 
+    // We keep the address of the original function in the first bytes of
+    // the trampoline buffer
+    *((void**)tramp) = aOrigFunction;
+    tramp += sizeof(void*);
+
     byteptr_t origBytes = (byteptr_t)aOrigFunction;
 
-    int nBytes = 0;
+    // # of bytes of the original function that we can overwrite.
+    int nOrigBytes = 0;
 
 #if defined(_M_IX86)
     int pJmp32 = -1;
-    while (nBytes < 5) {
+    while (nOrigBytes < 5) {
       // Understand some simple instructions that might be found in a
       // prologue; we might need to extend this as necessary.
       //
       // Note!  If we ever need to understand jump instructions, we'll
       // need to rewrite the displacement argument.
       unsigned char prefixGroups;
-      int numPrefixBytes = CountPrefixBytes(origBytes, nBytes, &prefixGroups);
+      int numPrefixBytes = CountPrefixBytes(origBytes, nOrigBytes, &prefixGroups);
       if (numPrefixBytes < 0 || (prefixGroups & (ePrefixGroup3 | ePrefixGroup4))) {
         // Either the prefix sequence was bad, or there are prefixes that
         // we don't currently support (groups 3 and 4)
         return;
       }
-      nBytes += numPrefixBytes;
-      if (origBytes[nBytes] >= 0x88 && origBytes[nBytes] <= 0x8B) {
+      nOrigBytes += numPrefixBytes;
+      if (origBytes[nOrigBytes] >= 0x88 &&
+          origBytes[nOrigBytes] <= 0x8B) {
         // various MOVs
-        ++nBytes;
-        int len = CountModRmSib(origBytes + nBytes);
+        ++nOrigBytes;
+        int len = CountModRmSib(origBytes + nOrigBytes);
         if (len < 0) {
           return;
         }
-        nBytes += len;
-      } else if (origBytes[nBytes] == 0xA1) {
+        nOrigBytes += len;
+      } else if (origBytes[nOrigBytes] == 0xA1) {
         // MOV eax, [seg:offset]
-        nBytes += 5;
-      } else if (origBytes[nBytes] == 0xB8) {
+        nOrigBytes += 5;
+      } else if (origBytes[nOrigBytes] == 0xB8) {
         // MOV 0xB8: http://ref.x86asm.net/coder32.html#xB8
-        nBytes += 5;
-      } else if (origBytes[nBytes] == 0x83) {
+        nOrigBytes += 5;
+      } else if (origBytes[nOrigBytes] == 0x33 &&
+                 (origBytes[nOrigBytes+1] & kMaskMod) == kModReg) {
+        // XOR r32, r32
+        nOrigBytes += 2;
+      } else if ((origBytes[nOrigBytes] & 0xf8) == 0x40) {
+        // INC r32
+        nOrigBytes += 1;
+      } else if (origBytes[nOrigBytes] == 0x83) {
         // ADD|ODR|ADC|SBB|AND|SUB|XOR|CMP r/m, imm8
-        unsigned char b = origBytes[nBytes + 1];
+        unsigned char b = origBytes[nOrigBytes + 1];
         if ((b & 0xc0) == 0xc0) {
           // ADD|ODR|ADC|SBB|AND|SUB|XOR|CMP r, imm8
-          nBytes += 3;
+          nOrigBytes += 3;
         } else {
           // bail
           return;
         }
-      } else if (origBytes[nBytes] == 0x68) {
+      } else if (origBytes[nOrigBytes] == 0x68) {
         // PUSH with 4-byte operand
-        nBytes += 5;
-      } else if ((origBytes[nBytes] & 0xf0) == 0x50) {
+        nOrigBytes += 5;
+      } else if ((origBytes[nOrigBytes] & 0xf0) == 0x50) {
         // 1-byte PUSH/POP
-        nBytes++;
-      } else if (origBytes[nBytes] == 0x6A) {
+        nOrigBytes++;
+      } else if (origBytes[nOrigBytes] == 0x6A) {
         // PUSH imm8
-        nBytes += 2;
-      } else if (origBytes[nBytes] == 0xe9) {
-        pJmp32 = nBytes;
+        nOrigBytes += 2;
+      } else if (origBytes[nOrigBytes] == 0xe9) {
+        pJmp32 = nOrigBytes;
         // jmp 32bit offset
-        nBytes += 5;
-      } else if (origBytes[nBytes] == 0xff && origBytes[nBytes + 1] == 0x25) {
+        nOrigBytes += 5;
+      } else if (origBytes[nOrigBytes] == 0xff &&
+                 origBytes[nOrigBytes + 1] == 0x25) {
         // jmp [disp32]
-        nBytes += 6;
+        nOrigBytes += 6;
+      } else if (origBytes[nOrigBytes] == 0xc2) {
+        // ret imm16.  We can't handle this but it happens.  We don't ASSERT but we do fail to hook.
+#if defined(MOZILLA_INTERNAL_API)
+        NS_WARNING("Cannot hook method -- RET opcode found");
+#endif
+        return;
       } else {
         //printf ("Unknown x86 instruction byte 0x%02x, aborting trampoline\n", origBytes[nBytes]);
         return;
       }
     }
+
+    // The trampoline is a copy of the instructions that we just traced,
+    // followed by a jump that we add below.
+    memcpy(tramp, aOrigFunction, nOrigBytes);
 #elif defined(_M_X64)
-    JumpPatch jump;
-
-    while (nBytes < 13) {
+    // The number of bytes used by the trampoline.
+    int nTrampBytes = 0;
+    bool foundJmp = false;
 
-      // if found JMP 32bit offset, next bytes must be NOP or INT3
-      if (jump.HasJumpPatch()) {
-        if (origBytes[nBytes] == 0x90 || origBytes[nBytes] == 0xcc) {
-          nBytes++;
+    while (nOrigBytes < 13) {
+      // If we found JMP 32bit offset, we require that the next bytes must
+      // be NOP or INT3.  There is no reason to copy them.
+      // TODO: This used to trigger for Je as well.  Now that I allow
+      // instructions after CALL and JE, I don't think I need that.
+      // The only real value of this condition is that if code follows a JMP
+      // then its _probably_ the target of a JMP somewhere else and we
+      // will be overwriting it, which would be tragic.  This seems
+      // highly unlikely.
+      if (foundJmp) {
+        if (origBytes[nOrigBytes] == 0x90 || origBytes[nOrigBytes] == 0xcc) {
+          nOrigBytes++;
           continue;
         }
         return;
       }
-      if (origBytes[nBytes] == 0x0f) {
-        nBytes++;
-        if (origBytes[nBytes] == 0x1f) {
+      if (origBytes[nOrigBytes] == 0x0f) {
+        COPY_CODES(1);
+        if (origBytes[nOrigBytes] == 0x1f) {
           // nop (multibyte)
-          nBytes++;
-          if ((origBytes[nBytes] & 0xc0) == 0x40 &&
-              (origBytes[nBytes] & 0x7) == 0x04) {
-            nBytes += 3;
+          COPY_CODES(1);
+          if ((origBytes[nOrigBytes] & 0xc0) == 0x40 &&
+              (origBytes[nOrigBytes] & 0x7) == 0x04) {
+            COPY_CODES(3);
           } else {
             return;
           }
-        } else if (origBytes[nBytes] == 0x05) {
+        } else if (origBytes[nOrigBytes] == 0x05) {
           // syscall
-          nBytes++;
-        } else if (origBytes[nBytes] == 0x84) {
+          COPY_CODES(1);
+        } else if (origBytes[nOrigBytes] == 0x84) {
           // je rel32
-          jump.AddJumpPatch(nBytes - 1,
-                            (intptr_t)
-                              origBytes + nBytes + 5 +
-                            *(reinterpret_cast<int32_t*>(origBytes +
-                                                         nBytes + 1)),
-                            JumpType::Je);
-          nBytes += 5;
+          JumpPatch jump(nTrampBytes - 1,  // overwrite the 0x0f we copied above
+                          (intptr_t)(origBytes + nOrigBytes + 5 +
+                                     *(reinterpret_cast<int32_t*>(origBytes + nOrigBytes + 1))),
+                          JumpType::Je);
+          nTrampBytes = jump.GenerateJump(tramp);
+          nOrigBytes += 5;
         } else {
           return;
         }
-      } else if (origBytes[nBytes] == 0x40 ||
-                 origBytes[nBytes] == 0x41) {
+      } else if (origBytes[nOrigBytes] == 0x40 ||
+                 origBytes[nOrigBytes] == 0x41) {
         // Plain REX or REX.B
-        nBytes++;
-
-        if ((origBytes[nBytes] & 0xf0) == 0x50) {
+        COPY_CODES(1);
+        if ((origBytes[nOrigBytes] & 0xf0) == 0x50) {
           // push/pop with Rx register
-          nBytes++;
-        } else if (origBytes[nBytes] >= 0xb8 && origBytes[nBytes] <= 0xbf) {
+          COPY_CODES(1);
+        } else if (origBytes[nOrigBytes] >= 0xb8 && origBytes[nOrigBytes] <= 0xbf) {
           // mov r32, imm32
-          nBytes += 5;
+          COPY_CODES(5);
         } else {
           return;
         }
-      } else if (origBytes[nBytes] == 0x45) {
+      } else if (origBytes[nOrigBytes] == 0x45) {
         // REX.R & REX.B
-        nBytes++;
+        COPY_CODES(1);
 
-        if (origBytes[nBytes] == 0x33) {
+        if (origBytes[nOrigBytes] == 0x33) {
           // xor r32, r32
-          nBytes += 2;
+          COPY_CODES(2);
         } else {
           return;
         }
-      } else if ((origBytes[nBytes] & 0xfb) == 0x48) {
+      } else if ((origBytes[nOrigBytes] & 0xfb) == 0x48) {
         // REX.W | REX.WR
-        nBytes++;
+        COPY_CODES(1);
 
-        if (origBytes[nBytes] == 0x81 &&
-            (origBytes[nBytes + 1] & 0xf8) == 0xe8) {
+        if (origBytes[nOrigBytes] == 0x81 &&
+            (origBytes[nOrigBytes + 1] & 0xf8) == 0xe8) {
           // sub r, dword
-          nBytes += 6;
-        } else if (origBytes[nBytes] == 0x83 &&
-                   (origBytes[nBytes + 1] & 0xf8) == 0xe8) {
+          COPY_CODES(6);
+        } else if (origBytes[nOrigBytes] == 0x83 &&
+                   (origBytes[nOrigBytes + 1] & 0xf8) == 0xe8) {
           // sub r, byte
-          nBytes += 3;
-        } else if (origBytes[nBytes] == 0x83 &&
-                   (origBytes[nBytes + 1] & 0xf8) == 0x60) {
+          COPY_CODES(3);
+        } else if (origBytes[nOrigBytes] == 0x83 &&
+                   (origBytes[nOrigBytes + 1] & (kMaskMod|kMaskReg)) == kModReg) {
+          // add r, byte
+          COPY_CODES(3);
+        } else if (origBytes[nOrigBytes] == 0x83 &&
+                   (origBytes[nOrigBytes + 1] & 0xf8) == 0x60) {
           // and [r+d], imm8
-          nBytes += 5;
-        } else if (origBytes[nBytes] == 0x85) {
+          COPY_CODES(5);
+        } else if (origBytes[nOrigBytes] == 0x2b &&
+                   (origBytes[nOrigBytes + 1] & kMaskMod) == kModReg) {
+          // sub r64, r64
+          COPY_CODES(2);
+        } else if (origBytes[nOrigBytes] == 0x85) {
           // 85 /r => TEST r/m32, r32
-          if ((origBytes[nBytes + 1] & 0xc0) == 0xc0) {
-            nBytes += 2;
+          if ((origBytes[nOrigBytes + 1] & 0xc0) == 0xc0) {
+            COPY_CODES(2);
           } else {
             return;
           }
-        } else if ((origBytes[nBytes] & 0xfd) == 0x89) {
-          ++nBytes;
+        } else if ((origBytes[nOrigBytes] & 0xfd) == 0x89) {
+          COPY_CODES(1);
           // MOV r/m64, r64 | MOV r64, r/m64
-          int len = CountModRmSib(origBytes + nBytes);
+          int len = CountModRmSib(origBytes + nOrigBytes);
           if (len < 0) {
             return;
           }
-          nBytes += len;
-        } else if (origBytes[nBytes] == 0xc7) {
+          COPY_CODES(len);
+        } else if (origBytes[nOrigBytes] == 0xc7) {
           // MOV r/m64, imm32
-          if (origBytes[nBytes + 1] == 0x44) {
+          if (origBytes[nOrigBytes + 1] == 0x44) {
             // MOV [r64+disp8], imm32
             // ModR/W + SIB + disp8 + imm32
-            nBytes += 8;
+            COPY_CODES(8);
           } else {
             return;
           }
-        } else if (origBytes[nBytes] == 0xff) {
+        } else if (origBytes[nOrigBytes] == 0xff) {
           // JMP /4
-          if ((origBytes[nBytes + 1] & 0xc0) == 0x0 &&
-              (origBytes[nBytes + 1] & 0x07) == 0x5) {
+          if ((origBytes[nOrigBytes + 1] & 0xc0) == 0x0 &&
+              (origBytes[nOrigBytes + 1] & 0x07) == 0x5) {
             // [rip+disp32]
             // convert JMP 32bit offset to JMP 64bit direct
-            jump.AddJumpPatch(nBytes - 1,
-                              *reinterpret_cast<intptr_t*>(
-                                origBytes + nBytes + 6 +
-                              *reinterpret_cast<int32_t*>(origBytes + nBytes +
-                                                          2)));
-            nBytes += 6;
+            JumpPatch jump(nTrampBytes - 1,  // overwrite the REX.W/REX.WR we copied above
+                           *reinterpret_cast<intptr_t*>(origBytes + nOrigBytes + 6 +
+                                                        *reinterpret_cast<int32_t*>(origBytes + nOrigBytes + 2)),
+                           JumpType::Jmp);
+            nTrampBytes = jump.GenerateJump(tramp);
+            nOrigBytes += 6;
+            foundJmp = true;
           } else {
             // not support yet!
             return;
           }
         } else {
           // not support yet!
           return;
         }
-      } else if (origBytes[nBytes] == 0x66) {
+      } else if (origBytes[nOrigBytes] == 0x66) {
         // operand override prefix
-        nBytes += 1;
+        COPY_CODES(1);
         // This is the same as the x86 version
-        if (origBytes[nBytes] >= 0x88 && origBytes[nBytes] <= 0x8B) {
+        if (origBytes[nOrigBytes] >= 0x88 && origBytes[nOrigBytes] <= 0x8B) {
           // various MOVs
-          unsigned char b = origBytes[nBytes + 1];
+          unsigned char b = origBytes[nOrigBytes + 1];
           if (((b & 0xc0) == 0xc0) ||
               (((b & 0xc0) == 0x00) &&
                ((b & 0x07) != 0x04) && ((b & 0x07) != 0x05))) {
             // REG=r, R/M=r or REG=r, R/M=[r]
-            nBytes += 2;
+            COPY_CODES(2);
           } else if ((b & 0xc0) == 0x40) {
             if ((b & 0x07) == 0x04) {
               // REG=r, R/M=[SIB + disp8]
-              nBytes += 4;
+              COPY_CODES(4);
             } else {
               // REG=r, R/M=[r + disp8]
-              nBytes += 3;
+              COPY_CODES(3);
             }
           } else {
             // complex MOV, bail
             return;
           }
         }
-      } else if ((origBytes[nBytes] & 0xf0) == 0x50) {
+      } else if ((origBytes[nOrigBytes] & 0xf0) == 0x50) {
         // 1-byte push/pop
-        nBytes++;
-      } else if (origBytes[nBytes] == 0x65) {
+        COPY_CODES(1);
+      } else if (origBytes[nOrigBytes] == 0x65) {
         // GS prefix
         //
         // The entry of GetKeyState on Windows 10 has the following code.
         // 65 48 8b 04 25 30 00 00 00    mov   rax,qword ptr gs:[30h]
         // (GS prefix + REX + MOV (0x8b) ...)
-        if (origBytes[nBytes + 1] == 0x48 &&
-            (origBytes[nBytes + 2] >= 0x88 && origBytes[nBytes + 2] <= 0x8b)) {
-          nBytes += 3;
-          int len = CountModRmSib(origBytes + nBytes);
+        if (origBytes[nOrigBytes + 1] == 0x48 &&
+            (origBytes[nOrigBytes + 2] >= 0x88 && origBytes[nOrigBytes + 2] <= 0x8b)) {
+          COPY_CODES(3);
+          int len = CountModRmSib(origBytes + nOrigBytes);
           if (len < 0) {
             // no way to support this yet.
             return;
           }
-          nBytes += len;
+          COPY_CODES(len);
         } else {
           return;
         }
-      } else if (origBytes[nBytes] == 0x90) {
+      } else if (origBytes[nOrigBytes] == 0x90) {
         // nop
-        nBytes++;
-      } else if (origBytes[nBytes] == 0xb8) {
+        COPY_CODES(1);
+      } else if (origBytes[nOrigBytes] == 0xb8) {
         // MOV 0xB8: http://ref.x86asm.net/coder32.html#xB8
-        nBytes += 5;
-      } else if (origBytes[nBytes] == 0x33) {
+        COPY_CODES(5);
+      } else if (origBytes[nOrigBytes] == 0x33) {
         // xor r32, r/m32
-        nBytes += 2;
-      } else if (origBytes[nBytes] == 0xf6) {
+        COPY_CODES(2);
+      } else if (origBytes[nOrigBytes] == 0xf6) {
         // test r/m8, imm8 (used by ntdll on Windows 10 x64)
         // (no flags are affected by near jmp since there is no task switch,
         // so it is ok for a jmp to be written immediately after a test)
         BYTE subOpcode = 0;
-        int nModRmSibBytes = CountModRmSib(&origBytes[nBytes + 1], &subOpcode);
+        int nModRmSibBytes = CountModRmSib(&origBytes[nOrigBytes + 1], &subOpcode);
         if (nModRmSibBytes < 0 || subOpcode != 0) {
           // Unsupported
           return;
         }
-        nBytes += 2 + nModRmSibBytes;
-      } else if (origBytes[nBytes] == 0xc3) {
+        COPY_CODES(2 + nModRmSibBytes);
+      } else if (origBytes[nOrigBytes] == 0xd1 &&
+                  (origBytes[nOrigBytes+1] & kMaskMod) == kModReg) {
+        // bit shifts/rotates : (SA|SH|RO|RC)(R|L) r32
+        // (e.g. 0xd1 0xe0 is SAL, 0xd1 0xc8 is ROR)
+        COPY_CODES(2);
+      } else if (origBytes[nOrigBytes] == 0xc3) {
         // ret
-        nBytes++;
-      } else if (origBytes[nBytes] == 0xcc) {
+        COPY_CODES(1);
+      } else if (origBytes[nOrigBytes] == 0xcc) {
         // int 3
-        nBytes++;
-      } else if (origBytes[nBytes] == 0xe9) {
-        // jmp 32bit offset
-        jump.AddJumpPatch(nBytes,
-                          // convert JMP 32bit offset to JMP 64bit direct
-                          (intptr_t)
-                            origBytes + nBytes + 5 +
-                          *(reinterpret_cast<int32_t*>(origBytes + nBytes + 1)));
-        nBytes += 5;
-      } else if (origBytes[nBytes] == 0xff) {
-        nBytes++;
-        if ((origBytes[nBytes] & 0xf8) == 0xf0) {
+        COPY_CODES(1);
+      } else if (origBytes[nOrigBytes] == 0xe8 ||
+                 origBytes[nOrigBytes] == 0xe9) {
+        // CALL (0xe8) or JMP (0xe9) 32bit offset
+        foundJmp = origBytes[nOrigBytes] == 0xe9;
+        JumpPatch jump(nTrampBytes,
+                       (intptr_t)(origBytes + nOrigBytes + 5 +
+                                  *(reinterpret_cast<int32_t*>(origBytes + nOrigBytes + 1))),
+                       origBytes[nOrigBytes] == 0xe8 ? JumpType::Call : JumpType::Jmp);
+        nTrampBytes = jump.GenerateJump(tramp);
+        nOrigBytes += 5;
+      } else if (origBytes[nOrigBytes] == 0xff) {
+        COPY_CODES(1);
+        if ((origBytes[nOrigBytes] & (kMaskMod|kMaskReg)) == 0xf0) {
           // push r64
-          nBytes++;
+          COPY_CODES(1);
+        } else if (origBytes[nOrigBytes] == 0x25) {
+          // jmp absolute indirect m32
+          foundJmp = true;
+          int32_t offset = *(reinterpret_cast<int32_t*>(origBytes + nOrigBytes + 1));
+          int64_t* ptrToJmpDest = reinterpret_cast<int64_t*>(origBytes + nOrigBytes + 5 + offset);
+          intptr_t jmpDest = static_cast<intptr_t>(*ptrToJmpDest);
+          JumpPatch jump(nTrampBytes, jmpDest, JumpType::Jmp);
+          nTrampBytes = jump.GenerateJump(tramp);
+          nOrigBytes += 5;
         } else {
           return;
         }
       } else {
         return;
       }
     }
 #else
 #error "Unknown processor type"
 #endif
 
-    if (nBytes > 100) {
+    if (nOrigBytes > 100) {
       //printf ("Too big!");
       return;
     }
 
-    // We keep the address of the original function in the first bytes of
-    // the trampoline buffer
-    *((void**)tramp) = aOrigFunction;
-    tramp += sizeof(void*);
-
-    memcpy(tramp, aOrigFunction, nBytes);
-
-    // OrigFunction+N, the target of the trampoline
-    byteptr_t trampDest = origBytes + nBytes;
+    // target address of the final jmp instruction in the trampoline
+    byteptr_t trampDest = origBytes + nOrigBytes;
 
 #if defined(_M_IX86)
     if (pJmp32 >= 0) {
       // Jump directly to the original target of the jump instead of jumping to the
       // original function.
       // Adjust jump target displacement to jump location in the trampoline.
       *((intptr_t*)(tramp + pJmp32 + 1)) += origBytes - tramp;
     } else {
-      tramp[nBytes] = 0xE9; // jmp
-      *((intptr_t*)(tramp + nBytes + 1)) =
-        (intptr_t)trampDest - (intptr_t)(tramp + nBytes + 5); // target displacement
+      tramp[nOrigBytes] = 0xE9; // jmp
+      *((intptr_t*)(tramp + nOrigBytes + 1)) =
+        (intptr_t)trampDest - (intptr_t)(tramp + nOrigBytes + 5); // target displacement
     }
 #elif defined(_M_X64)
-    // If JMP/JE opcode found, we don't insert to trampoline jump
-    if (jump.HasJumpPatch()) {
-      size_t offset = jump.GenerateJump(tramp);
-      if (jump.mType != JumpType::Jmp) {
-        JumpPatch patch(offset, reinterpret_cast<intptr_t>(trampDest));
-        patch.GenerateJump(tramp);
-      }
-    } else {
-      JumpPatch patch(nBytes, reinterpret_cast<intptr_t>(trampDest));
+    // If the we found a Jmp, we don't need to add another instruction. However,
+    // if we found a _conditional_ jump or a CALL (or no control operations
+    // at all) then we still need to run the rest of aOriginalFunction.
+    if (!foundJmp) {
+      JumpPatch patch(nTrampBytes, reinterpret_cast<intptr_t>(trampDest));
       patch.GenerateJump(tramp);
     }
 #endif
 
     // The trampoline is now valid.
     *aOutTramp = tramp;
 
     // ensure we can modify the original code
-    AutoVirtualProtect protect(aOrigFunction, nBytes, PAGE_EXECUTE_READWRITE);
+    AutoVirtualProtect protect(aOrigFunction, nOrigBytes, PAGE_EXECUTE_READWRITE);
     if (!protect.Protect()) {
       //printf ("VirtualProtectEx failed! %d\n", GetLastError());
       return;
     }
 
 #if defined(_M_IX86)
     // now modify the original bytes
     origBytes[0] = 0xE9; // jmp