Bug 1499507 - Allow the compiler to generate a non-atomic increment instruction for the stack pointer increment. r=njn
☠☠ backed out by 848152c22f8b ☠ ☠
authorMarkus Stange <mstange@themasta.com>
Mon, 05 Nov 2018 19:12:38 +0000
changeset 444455 541186291b888899e895b8aa92d3823cf3475905
parent 444454 8a3f4acbad3b6b573330a785e5a0e8a1c95f17bb
child 444456 09df5390e2fb01be435bf72a8c8e1c1948e45f62
push id34996
push userrgurzau@mozilla.com
push dateTue, 06 Nov 2018 09:53:23 +0000
treeherdermozilla-central@e160f0a60e4f [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersnjn
bugs1499507
milestone65.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1499507 - Allow the compiler to generate a non-atomic increment instruction for the stack pointer increment. r=njn This change reduces the binary size on macOS x64 by around 50KB. Here's a diff of the impact on the code generated for Attr_Binding::get_specified in the Mac build. It's a bit hard to read because %r12 and %rbx swap their function, but what happens in this method is that "movq %r12, %rcx" goes away, and the two instructions "leal 0x1(%r12) %eax" and "movl %eax, 0x10(%rbx)" turn into an "incl 0x10(%r12)". So the old code was preserving the original value of profilingStack->stackPointer in a register, and then using it later to compute the incremented stackPointer. The new code uses an "incl" instruction for the stackPointer increment and doesn't worry that the stackPointer value might have changed since the stack size check at the start of the function. (It can't have changed.) before: %rbx has the ProfilingStack*, %r12 has profilingStack->stackPointer after: %r12 has the ProfilingStack*, %rbx has profilingStack->stackPointer @@ -3,37 +3,35 @@ movq %rsp, %rbp pushq %r15 pushq %r14 pushq %r12 pushq %rbx subq $0x10, %rsp movq %rcx, %r14 movq %rdx, %r15 - movq 0x80(%rdi), %rbx - movq %rbx, -40(%rbp) - testq %rbx, %rbx + movq 0x80(%rdi), %r12 + movq %r12, -40(%rbp) + testq %r12, %r12 je loc_xxxxx - movl 0x10(%rbx), %r12d - cmpl (%rbx), %r12d + movl 0x10(%r12), %ebx + cmpl (%r12), %ebx jae loc_xxxxx - movq 0x8(%rbx), %rax - movq %r12, %rcx - shlq $0x5, %rcx - leaq aAttr, %rdx ; "Attr" - movq %rdx, (%rax,%rcx) - leaq aSpecified, %rdx ; "specified" - movq %rdx, 0x8(%rax,%rcx) - leaq -40(%rbp), %rdx - movq %rdx, 0x10(%rax,%rcx) - movl $0x3a1, 0x1c(%rax,%rcx) - leal 0x1(%r12), %eax - movl %eax, 0x10(%rbx) + movq 0x8(%r12), %rax + shlq $0x5, %rbx + leaq aAttr, %rcx ; "Attr" + movq %rcx, (%rax,%rbx) + leaq aSpecified, %rcx ; "specified" + movq %rcx, 0x8(%rax,%rbx) + leaq -40(%rbp), %rcx + movq %rcx, 0x10(%rax,%rbx) + movl $0x3a1, 0x1c(%rax,%rbx) + incl 0x10(%r12) movq %r15, %rdi call __ZNK7mozilla3dom4Attr9SpecifiedEv ; mozilla::dom::Attr::Specified() const movzxl %al, %eax movabsq $0xfff9000000000000, %rcx orq %rax, %rcx movq %rcx, (%r14) movq -40(%rbp), %rax @@ -47,11 +45,11 @@ popq %rbx popq %r12 popq %r14 popq %r15 popq %rbp ret ; endp - movq %rbx, %rdi + movq %r12, %rdi call __ZN14ProfilingStack18ensureCapacitySlowEv ; ProfilingStack::ensureCapacitySlow() jmp loc_xxxxx Depends on D9205 Differential Revision: https://phabricator.services.mozilla.com/D9206
js/public/ProfilingStack.h
--- a/js/public/ProfilingStack.h
+++ b/js/public/ProfilingStack.h
@@ -389,58 +389,66 @@ class ProfilingStack final
       : stackPointer(0)
     {}
 
     ~ProfilingStack();
 
     void pushLabelFrame(const char* label, const char* dynamicString, void* sp,
                         js::ProfilingStackFrame::Category category,
                         uint32_t flags = 0) {
-        uint32_t oldStackPointer = stackPointer;
+        // This thread is the only one that ever changes the value of
+        // stackPointer.
+        // Store the value of the atomic in a non-atomic local variable so that
+        // the compiler won't generate two separate loads from the atomic for
+        // the size check and the frames[] array indexing operation.
+        uint32_t stackPointerVal = stackPointer;
 
-        if (MOZ_UNLIKELY(oldStackPointer >= capacity)) {
+        if (MOZ_UNLIKELY(stackPointerVal >= capacity)) {
             ensureCapacitySlow();
         }
-        frames[oldStackPointer].initLabelFrame(label, dynamicString, sp,
+        frames[stackPointerVal].initLabelFrame(label, dynamicString, sp,
                                                category, flags);
 
         // This must happen at the end! The compiler will not reorder this
         // update because stackPointer is Atomic<..., ReleaseAcquire>, so any
         // the writes above will not be reordered below the stackPointer store.
         // Do the read and the write as two separate statements, in order to
         // make it clear that we don't need an atomic increment, which would be
         // more expensive on x86 than the separate operations done here.
-        // This thread is the only one that ever changes the value of
-        // stackPointer.
-        stackPointer = oldStackPointer + 1;
+        // However, don't use stackPointerVal here; instead, allow the compiler
+        // to turn this store into a non-atomic increment instruction which
+        // takes up less code size.
+        stackPointer = stackPointer + 1;
     }
 
     void pushSpMarkerFrame(void* sp) {
         uint32_t oldStackPointer = stackPointer;
 
         if (MOZ_UNLIKELY(oldStackPointer >= capacity)) {
             ensureCapacitySlow();
         }
         frames[oldStackPointer].initSpMarkerFrame(sp);
 
         // This must happen at the end, see the comment in pushLabelFrame.
         stackPointer = oldStackPointer + 1;
     }
 
     void pushJsFrame(const char* label, const char* dynamicString, JSScript* script,
                      jsbytecode* pc) {
+        // This thread is the only one that ever changes the value of
+        // stackPointer. Only load the atomic once.
         uint32_t oldStackPointer = stackPointer;
 
         if (MOZ_UNLIKELY(oldStackPointer >= capacity)) {
             ensureCapacitySlow();
         }
         frames[oldStackPointer].initJsFrame(label, dynamicString, script, pc);
 
         // This must happen at the end, see the comment in pushLabelFrame.
-        stackPointer = oldStackPointer + 1;
+        stackPointer = stackPointer + 1;
     }
 
     void pop() {
         MOZ_ASSERT(stackPointer > 0);
         // Do the read and the write as two separate statements, in order to
         // make it clear that we don't need an atomic decrement, which would be
         // more expensive on x86 than the separate operations done here.
         // This thread is the only one that ever changes the value of