bug 1272498 - rewrite NS_InvokeByIndex in assembly r=froydnj
☠☠ backed out by 86816e25dea7 ☠ ☠
authorTrevor Saunders <tbsaunde@tbsaunde.org>
Tue, 28 Jun 2016 10:28:33 -0400
changeset 348477 2e8cfd225f0419a552edc4d9f5942927344eef7c
parent 348476 15d996da21b247c646f5dc0b05663568bdc135c6
child 348478 bafc5c504506a1795adb94bfe54700fe254b61d6
push id1230
push userjlund@mozilla.com
push dateMon, 31 Oct 2016 18:13:35 +0000
treeherdermozilla-release@5e06e3766db2 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersfroydnj
bugs1272498
milestone50.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
bug 1272498 - rewrite NS_InvokeByIndex in assembly r=froydnj
xpcom/reflect/xptcall/md/unix/moz.build
xpcom/reflect/xptcall/md/unix/xptcinvoke_asm_x86_64_unix.s
xpcom/reflect/xptcall/md/unix/xptcinvoke_x86_64_unix.cpp
--- a/xpcom/reflect/xptcall/md/unix/moz.build
+++ b/xpcom/reflect/xptcall/md/unix/moz.build
@@ -23,16 +23,17 @@ if CONFIG['OS_ARCH'] == 'GNU':
             'xptcinvoke_gcc_x86_unix.cpp',
             'xptcstubs_gcc_x86_unix.cpp'
         ]
 
 if CONFIG['OS_ARCH'] in ('Linux', 'Bitrig', 'DragonFly', 'FreeBSD', 'NetBSD', 'OpenBSD') or \
    CONFIG['OS_ARCH'].startswith('GNU_'):
     if CONFIG['OS_TEST'] == 'x86_64':
         SOURCES += [
+            'xptcinvoke_asm_x86_64_unix.s',
             'xptcinvoke_x86_64_unix.cpp',
             'xptcstubs_x86_64_linux.cpp',
         ]
     elif '86' in CONFIG['OS_TEST']:
         SOURCES += [
             'xptcinvoke_gcc_x86_unix.cpp',
             'xptcstubs_gcc_x86_unix.cpp'
         ]
@@ -48,16 +49,17 @@ if CONFIG['OS_ARCH'] in ('Linux', 'FreeB
 
 if CONFIG['OS_ARCH'] == 'SunOS' and '86' in CONFIG['OS_TEST']:
     GENERATED_FILES = [
         'xptcstubsdef_asm.solx86',
     ]
     if CONFIG['OS_TEST'] == 'x86_64':
         if CONFIG['GNU_CC']:
             SOURCES += [
+            'xptcinvoke_asm_x86_64_unix.s',
                 'xptcinvoke_x86_64_unix.cpp',
                 'xptcstubs_x86_64_linux.cpp'
             ]
         else:
             ASFLAGS += ['-xarch=amd64']
             SOURCES += [
                 'xptcinvoke_x86_64_solaris.cpp',
                 'xptcstubs_asm_x86_64_solaris_SUNW.s',
new file mode 100644
--- /dev/null
+++ b/xpcom/reflect/xptcall/md/unix/xptcinvoke_asm_x86_64_unix.s
@@ -0,0 +1,92 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+.intel_syntax noprefix
+
+# nsresult NS_InvokeByIndex(nsISupports* this, uint32_t aVtableIndex,
+#                           uint32_t argc, nsXPTCVariant* argv);
+.text
+.global NS_InvokeByIndex
+.type NS_InvokeByIndex, @function
+.align 4
+NS_InvokeByIndex:
+  push rbp
+  mov rbp, rsp
+
+# save r12 and r13 because we use them and they are callee saved.
+  push r12
+  push r13
+
+# save this and the vtable index because we need them after setting up the
+# stack.
+  mov r12, rdi
+  mov r13, rsi
+
+# allocate space for stack arguments, in theory we only need 8 * (argc - 5)
+# bytes because at least 5 arguments will go in registers, but for now it is
+# just simpler to allocate 8 * (argc - 1) bytes.  Note that we treat the this
+# pointer specially.
+  lea eax, [edx * 8]
+  sub rsp, rax
+
+# if there is an odd number of args the stack can be misaligned so realign it.
+  and rsp, 0xffffffffffffff00
+
+# pass the stack slot area to InvokeCopyToStack.
+  mov r8, rsp
+
+# setup space for the register slots: there are 5 integer ones and 8 floating
+# point ones.  So we need 104 bytes of space, but we allocate 112 to keep rsp
+# aligned to 16 bytes.
+  sub rsp, 112
+
+# the first argument to InvokeCopyToStack is the integer register area, and the
+# second is the floating point area.
+  mov rdi, rsp
+  lea rsi, [rsp + 40]
+
+# The 3rd and 4th arguments to InvokeCopyToStack are already in the right
+# registers because they are our 3rd and 4th arguments.  So now we can just
+# call InvokeCopyToStack.
+  call InvokeCopyToStack
+
+# setup this
+  mov rdi, r12
+
+# copy the integer arguments into place.
+  mov rsi, [rsp]
+  mov rdx, [rsp + 8]
+  mov rcx, [rsp + 16]
+  mov r8, [rsp + 24]
+  mov r9, [rsp + 32]
+
+# copy the float arguments into place
+  movsd xmm0, [rsp + 40]
+  movsd xmm1, [rsp + 48]
+  movsd xmm2, [rsp + 56]
+  movsd xmm3, [rsp + 64]
+  movsd xmm4, [rsp + 72]
+  movsd xmm5, [rsp + 80]
+  movsd xmm6, [rsp + 88]
+  movsd xmm7, [rsp + 96]
+
+# get rid of the scratch space for registers
+  add rsp, 112
+
+# load the function pointer and call
+  lea eax, [r13d * 8]
+  add rax, [rdi]
+  call [rax]
+
+# r12 and r13 were pushed relative to the old stack pointer which is now the
+# frame pointer.
+  mov r12, [rbp - 0x8]
+  mov r13, [rbp - 0x10]
+
+  mov rsp, rbp
+  pop rbp
+  ret
+
+// Magic indicating no need for an executable stack
+.section .note.GNU-stack, "", @progbits ; .previous
--- a/xpcom/reflect/xptcall/md/unix/xptcinvoke_x86_64_unix.cpp
+++ b/xpcom/reflect/xptcall/md/unix/xptcinvoke_x86_64_unix.cpp
@@ -3,56 +3,29 @@
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 // Platform specific code to invoke XPCOM methods on native objects
 
 #include "xptcprivate.h"
 
-// 6 integral parameters are passed in registers
-const uint32_t GPR_COUNT = 6;
+// 6 integral parameters are passed in registers, but 1 is |this| which isn't
+// considered here.
+const uint32_t GPR_COUNT = 5;
 
 // 8 floating point parameters are passed in SSE registers
 const uint32_t FPR_COUNT = 8;
 
-// Remember that these 'words' are 64-bit long
-static inline void
-invoke_count_words(uint32_t paramCount, nsXPTCVariant * s,
-                   uint32_t & nr_stack)
+extern "C" void
+InvokeCopyToStack(uint64_t * gpregs, double * fpregs,
+                     uint32_t paramCount, nsXPTCVariant * s,
+                     uint64_t* d)
 {
-    uint32_t nr_gpr;
-    uint32_t nr_fpr;
-    nr_gpr = 1; // skip one GP register for 'that'
-    nr_fpr = 0;
-    nr_stack = 0;
-
-    /* Compute number of eightbytes of class MEMORY.  */
-    for (uint32_t i = 0; i < paramCount; i++, s++) {
-        if (!s->IsPtrData()
-            && (s->type == nsXPTType::T_FLOAT || s->type == nsXPTType::T_DOUBLE)) {
-            if (nr_fpr < FPR_COUNT)
-                nr_fpr++;
-            else
-                nr_stack++;
-        }
-        else {
-            if (nr_gpr < GPR_COUNT)
-                nr_gpr++;
-            else
-                nr_stack++;
-        }
-    }
-}
-
-static void
-invoke_copy_to_stack(uint64_t * d, uint32_t paramCount, nsXPTCVariant * s,
-                     uint64_t * gpregs, double * fpregs)
-{
-    uint32_t nr_gpr = 1u; // skip one GP register for 'that'
+    uint32_t nr_gpr = 0u; // skip one GP register for 'that'
     uint32_t nr_fpr = 0u;
     uint64_t value = 0u;
 
     for (uint32_t i = 0; i < paramCount; i++, s++) {
         if (s->IsPtrData())
             value = (uint64_t) s->ptr;
         else {
             switch (s->type) {
@@ -95,94 +68,8 @@ invoke_copy_to_stack(uint64_t * d, uint3
         else {
             if (nr_gpr < GPR_COUNT)
                 gpregs[nr_gpr++] = value;
             else
                 *d++ = value;
         }
     }
 }
-
-// Disable avx for the next function to allow compilation with
-// -march=native on new machines, or similar hardcoded -march options.
-// Having avx enabled appears to change the alignment behavior of alloca
-// (apparently adding an extra 16 bytes) of padding/alignment (and using
-// 32-byte alignment instead of 16-byte).  This seems to be the best
-// available workaround, given that this code, which should perhaps
-// better be written in assembly, is written in C++.
-#ifndef __clang__
-#pragma GCC push_options
-#pragma GCC target ("no-avx")
-#endif
-
-// Avoid AddressSanitizer instrumentation for the next function because it
-// depends on __builtin_alloca behavior and alignment that cannot be relied on
-// once the function is compiled with a version of ASan that has dynamic-alloca
-// instrumentation enabled.
-
-MOZ_ASAN_BLACKLIST
-EXPORT_XPCOM_API(nsresult)
-NS_InvokeByIndex(nsISupports * that, uint32_t methodIndex,
-                 uint32_t paramCount, nsXPTCVariant * params)
-{
-    uint32_t nr_stack;
-    invoke_count_words(paramCount, params, nr_stack);
-    
-    // Stack, if used, must be 16-bytes aligned
-    if (nr_stack)
-        nr_stack = (nr_stack + 1) & ~1;
-
-    // Load parameters to stack, if necessary
-    uint64_t *stack = (uint64_t *) __builtin_alloca(nr_stack * 8);
-    uint64_t gpregs[GPR_COUNT];
-    double fpregs[FPR_COUNT];
-    invoke_copy_to_stack(stack, paramCount, params, gpregs, fpregs);
-
-    // We used to have switches to make sure we would only load the registers
-    // that are needed for this call. That produced larger code that was
-    // not faster in practice. It also caused compiler warnings about the
-    // variables being used uninitialized.
-    // We now just load every every register. There could still be a warning
-    // from a memory analysis tools that we are loading uninitialized stack
-    // positions.
-
-    // FIXME: this function depends on the above __builtin_alloca placing
-    // the array in the correct spot for the ABI.
-
-    // Load FPR registers from fpregs[]
-    double d0, d1, d2, d3, d4, d5, d6, d7;
-
-    d7 = fpregs[7];
-    d6 = fpregs[6];
-    d5 = fpregs[5];
-    d4 = fpregs[4];
-    d3 = fpregs[3];
-    d2 = fpregs[2];
-    d1 = fpregs[1];
-    d0 = fpregs[0];
-
-    // Load GPR registers from gpregs[]
-    uint64_t a0, a1, a2, a3, a4, a5;
-
-    a5 = gpregs[5];
-    a4 = gpregs[4];
-    a3 = gpregs[3];
-    a2 = gpregs[2];
-    a1 = gpregs[1];
-    a0 = (uint64_t) that;
-
-    // Get pointer to method
-    uint64_t methodAddress = *((uint64_t *)that);
-    methodAddress += 8 * methodIndex;
-    methodAddress = *((uint64_t *)methodAddress);
-    
-    typedef nsresult (*Method)(uint64_t, uint64_t, uint64_t, uint64_t,
-                               uint64_t, uint64_t, double, double, double,
-                               double, double, double, double, double);
-    nsresult result = ((Method)methodAddress)(a0, a1, a2, a3, a4, a5,
-                                              d0, d1, d2, d3, d4, d5,
-                                              d6, d7);
-    return result;
-}
-
-#ifndef __clang__
-#pragma GCC pop_options
-#endif