bug 1272498 - rewrite NS_InvokeByIndex in assembly r=froydnj
☠☠ backed out by 86816e25dea7 ☠ ☠
authorTrevor Saunders <tbsaunde@tbsaunde.org>
Tue, 28 Jun 2016 10:28:33 -0400
changeset 331441 2e8cfd225f0419a552edc4d9f5942927344eef7c
parent 331440 15d996da21b247c646f5dc0b05663568bdc135c6
child 331442 bafc5c504506a1795adb94bfe54700fe254b61d6
push id9858
push userjlund@mozilla.com
push dateMon, 01 Aug 2016 14:37:10 +0000
treeherdermozilla-aurora@203106ef6cb6 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersfroydnj
bugs1272498
milestone50.0a1
bug 1272498 - rewrite NS_InvokeByIndex in assembly r=froydnj
xpcom/reflect/xptcall/md/unix/moz.build
xpcom/reflect/xptcall/md/unix/xptcinvoke_asm_x86_64_unix.s
xpcom/reflect/xptcall/md/unix/xptcinvoke_x86_64_unix.cpp
--- a/xpcom/reflect/xptcall/md/unix/moz.build
+++ b/xpcom/reflect/xptcall/md/unix/moz.build
@@ -23,16 +23,17 @@ if CONFIG['OS_ARCH'] == 'GNU':
             'xptcinvoke_gcc_x86_unix.cpp',
             'xptcstubs_gcc_x86_unix.cpp'
         ]
 
 if CONFIG['OS_ARCH'] in ('Linux', 'Bitrig', 'DragonFly', 'FreeBSD', 'NetBSD', 'OpenBSD') or \
    CONFIG['OS_ARCH'].startswith('GNU_'):
     if CONFIG['OS_TEST'] == 'x86_64':
         SOURCES += [
+            'xptcinvoke_asm_x86_64_unix.s',
             'xptcinvoke_x86_64_unix.cpp',
             'xptcstubs_x86_64_linux.cpp',
         ]
     elif '86' in CONFIG['OS_TEST']:
         SOURCES += [
             'xptcinvoke_gcc_x86_unix.cpp',
             'xptcstubs_gcc_x86_unix.cpp'
         ]
@@ -48,16 +49,17 @@ if CONFIG['OS_ARCH'] in ('Linux', 'FreeB
 
 if CONFIG['OS_ARCH'] == 'SunOS' and '86' in CONFIG['OS_TEST']:
     GENERATED_FILES = [
         'xptcstubsdef_asm.solx86',
     ]
     if CONFIG['OS_TEST'] == 'x86_64':
         if CONFIG['GNU_CC']:
             SOURCES += [
+            'xptcinvoke_asm_x86_64_unix.s',
                 'xptcinvoke_x86_64_unix.cpp',
                 'xptcstubs_x86_64_linux.cpp'
             ]
         else:
             ASFLAGS += ['-xarch=amd64']
             SOURCES += [
                 'xptcinvoke_x86_64_solaris.cpp',
                 'xptcstubs_asm_x86_64_solaris_SUNW.s',
new file mode 100644
--- /dev/null
+++ b/xpcom/reflect/xptcall/md/unix/xptcinvoke_asm_x86_64_unix.s
@@ -0,0 +1,92 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+.intel_syntax noprefix
+
+# nsresult NS_InvokeByIndex(nsISupports* this, uint32_t aVtableIndex,
+#                           uint32_t argc, nsXPTCVariant* argv);
+.text
+.global NS_InvokeByIndex
+.type NS_InvokeByIndex, @function
+.align 4
+NS_InvokeByIndex:
+  push rbp
+  mov rbp, rsp
+
+# save r12 and r13 because we use them and they are callee saved.
+  push r12
+  push r13
+
+# save this and the vtable index because we need them after setting up the
+# stack.
+  mov r12, rdi
+  mov r13, rsi
+
+# allocate space for stack arguments, in theory we only need 8 * (argc - 5)
+# bytes because at least 5 arguments will go in registers, but for now it is
+# just simpler to allocate 8 * (argc - 1) bytes.  Note that we treat the this
+# pointer specially.
+  lea eax, [edx * 8]
+  sub rsp, rax
+
+# if there is an odd number of args the stack can be misaligned so realign it.
+  and rsp, 0xffffffffffffff00
+
+# pass the stack slot area to InvokeCopyToStack.
+  mov r8, rsp
+
+# setup space for the register slots: there are 5 integer ones and 8 floating
+# point ones.  So we need 104 bytes of space, but we allocate 112 to keep rsp
+# aligned to 16 bytes.
+  sub rsp, 112
+
+# the first argument to InvokeCopyToStack is the integer register area, and the
+# second is the floating point area.
+  mov rdi, rsp
+  lea rsi, [rsp + 40]
+
+# The 3rd and 4th arguments to InvokeCopyToStack are already in the right
+# registers because they are our 3rd and 4th arguments.  So now we can just
+# call InvokeCopyToStack.
+  call InvokeCopyToStack
+
+# setup this
+  mov rdi, r12
+
+# copy the integer arguments into place.
+  mov rsi, [rsp]
+  mov rdx, [rsp + 8]
+  mov rcx, [rsp + 16]
+  mov r8, [rsp + 24]
+  mov r9, [rsp + 32]
+
+# copy the float arguments into place
+  movsd xmm0, [rsp + 40]
+  movsd xmm1, [rsp + 48]
+  movsd xmm2, [rsp + 56]
+  movsd xmm3, [rsp + 64]
+  movsd xmm4, [rsp + 72]
+  movsd xmm5, [rsp + 80]
+  movsd xmm6, [rsp + 88]
+  movsd xmm7, [rsp + 96]
+
+# get rid of the scratch space for registers
+  add rsp, 112
+
+# load the function pointer and call
+  lea eax, [r13d * 8]
+  add rax, [rdi]
+  call [rax]
+
+# r12 and r13 were pushed relative to the old stack pointer which is now the
+# frame pointer.
+  mov r12, [rbp - 0x8]
+  mov r13, [rbp - 0x10]
+
+  mov rsp, rbp
+  pop rbp
+  ret
+
+// Magic indicating no need for an executable stack
+.section .note.GNU-stack, "", @progbits ; .previous
--- a/xpcom/reflect/xptcall/md/unix/xptcinvoke_x86_64_unix.cpp
+++ b/xpcom/reflect/xptcall/md/unix/xptcinvoke_x86_64_unix.cpp
@@ -3,56 +3,29 @@
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 // Platform specific code to invoke XPCOM methods on native objects
 
 #include "xptcprivate.h"
 
-// 6 integral parameters are passed in registers
-const uint32_t GPR_COUNT = 6;
+// 6 integral parameters are passed in registers, but 1 is |this| which isn't
+// considered here.
+const uint32_t GPR_COUNT = 5;
 
 // 8 floating point parameters are passed in SSE registers
 const uint32_t FPR_COUNT = 8;
 
-// Remember that these 'words' are 64-bit long
-static inline void
-invoke_count_words(uint32_t paramCount, nsXPTCVariant * s,
-                   uint32_t & nr_stack)
+extern "C" void
+InvokeCopyToStack(uint64_t * gpregs, double * fpregs,
+                     uint32_t paramCount, nsXPTCVariant * s,
+                     uint64_t* d)
 {
-    uint32_t nr_gpr;
-    uint32_t nr_fpr;
-    nr_gpr = 1; // skip one GP register for 'that'
-    nr_fpr = 0;
-    nr_stack = 0;
-
-    /* Compute number of eightbytes of class MEMORY.  */
-    for (uint32_t i = 0; i < paramCount; i++, s++) {
-        if (!s->IsPtrData()
-            && (s->type == nsXPTType::T_FLOAT || s->type == nsXPTType::T_DOUBLE)) {
-            if (nr_fpr < FPR_COUNT)
-                nr_fpr++;
-            else
-                nr_stack++;
-        }
-        else {
-            if (nr_gpr < GPR_COUNT)
-                nr_gpr++;
-            else
-                nr_stack++;
-        }
-    }
-}
-
-static void
-invoke_copy_to_stack(uint64_t * d, uint32_t paramCount, nsXPTCVariant * s,
-                     uint64_t * gpregs, double * fpregs)
-{
-    uint32_t nr_gpr = 1u; // skip one GP register for 'that'
+    uint32_t nr_gpr = 0u; // skip one GP register for 'that'
     uint32_t nr_fpr = 0u;
     uint64_t value = 0u;
 
     for (uint32_t i = 0; i < paramCount; i++, s++) {
         if (s->IsPtrData())
             value = (uint64_t) s->ptr;
         else {
             switch (s->type) {
@@ -95,94 +68,8 @@ invoke_copy_to_stack(uint64_t * d, uint3
         else {
             if (nr_gpr < GPR_COUNT)
                 gpregs[nr_gpr++] = value;
             else
                 *d++ = value;
         }
     }
 }
-
-// Disable avx for the next function to allow compilation with
-// -march=native on new machines, or similar hardcoded -march options.
-// Having avx enabled appears to change the alignment behavior of alloca
-// (apparently adding an extra 16 bytes) of padding/alignment (and using
-// 32-byte alignment instead of 16-byte).  This seems to be the best
-// available workaround, given that this code, which should perhaps
-// better be written in assembly, is written in C++.
-#ifndef __clang__
-#pragma GCC push_options
-#pragma GCC target ("no-avx")
-#endif
-
-// Avoid AddressSanitizer instrumentation for the next function because it
-// depends on __builtin_alloca behavior and alignment that cannot be relied on
-// once the function is compiled with a version of ASan that has dynamic-alloca
-// instrumentation enabled.
-
-MOZ_ASAN_BLACKLIST
-EXPORT_XPCOM_API(nsresult)
-NS_InvokeByIndex(nsISupports * that, uint32_t methodIndex,
-                 uint32_t paramCount, nsXPTCVariant * params)
-{
-    uint32_t nr_stack;
-    invoke_count_words(paramCount, params, nr_stack);
-    
-    // Stack, if used, must be 16-bytes aligned
-    if (nr_stack)
-        nr_stack = (nr_stack + 1) & ~1;
-
-    // Load parameters to stack, if necessary
-    uint64_t *stack = (uint64_t *) __builtin_alloca(nr_stack * 8);
-    uint64_t gpregs[GPR_COUNT];
-    double fpregs[FPR_COUNT];
-    invoke_copy_to_stack(stack, paramCount, params, gpregs, fpregs);
-
-    // We used to have switches to make sure we would only load the registers
-    // that are needed for this call. That produced larger code that was
-    // not faster in practice. It also caused compiler warnings about the
-    // variables being used uninitialized.
-    // We now just load every every register. There could still be a warning
-    // from a memory analysis tools that we are loading uninitialized stack
-    // positions.
-
-    // FIXME: this function depends on the above __builtin_alloca placing
-    // the array in the correct spot for the ABI.
-
-    // Load FPR registers from fpregs[]
-    double d0, d1, d2, d3, d4, d5, d6, d7;
-
-    d7 = fpregs[7];
-    d6 = fpregs[6];
-    d5 = fpregs[5];
-    d4 = fpregs[4];
-    d3 = fpregs[3];
-    d2 = fpregs[2];
-    d1 = fpregs[1];
-    d0 = fpregs[0];
-
-    // Load GPR registers from gpregs[]
-    uint64_t a0, a1, a2, a3, a4, a5;
-
-    a5 = gpregs[5];
-    a4 = gpregs[4];
-    a3 = gpregs[3];
-    a2 = gpregs[2];
-    a1 = gpregs[1];
-    a0 = (uint64_t) that;
-
-    // Get pointer to method
-    uint64_t methodAddress = *((uint64_t *)that);
-    methodAddress += 8 * methodIndex;
-    methodAddress = *((uint64_t *)methodAddress);
-    
-    typedef nsresult (*Method)(uint64_t, uint64_t, uint64_t, uint64_t,
-                               uint64_t, uint64_t, double, double, double,
-                               double, double, double, double, double);
-    nsresult result = ((Method)methodAddress)(a0, a1, a2, a3, a4, a5,
-                                              d0, d1, d2, d3, d4, d5,
-                                              d6, d7);
-    return result;
-}
-
-#ifndef __clang__
-#pragma GCC pop_options
-#endif