Bug 1526579 - Rabaldr arm64 stack alignment fix. r=bbouvier
authorLars T Hansen <lhansen@mozilla.com>
Tue, 12 Feb 2019 10:39:23 +0100
changeset 458720 6cb3a502e891
parent 458719 5f58b05d9d29
child 458721 de5c7c2699c4
push id35546
push userrmaries@mozilla.com
push dateWed, 13 Feb 2019 04:27:59 +0000
treeherdermozilla-central@636d2c00234d [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersbbouvier
bugs1526579
milestone67.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1526579 - Rabaldr arm64 stack alignment fix. r=bbouvier Problem: When a stack chunk had to be popped as part of a control flow instruction, the amount to pop was not always computed as a multiple of ChunkSize. The reason is that the fixed amount of stack that should not be popped isn't necessarily a multiple of ChunkSize, yet this was assumed. A small adjustment to the calculation fixes that. Also added an assertion that would have caught this problem more easily. Also did some desirable drive-by fixes to clarify documentation and to factor the resulting code. The TC sets up a situation where we require a chunk to be created and then destroyed in the 'else' arm of the 'if', at the same time as the fixed amount of stack is not a multiple of ChunkSize. Differential Revision: https://phabricator.services.mozilla.com/D19470
js/src/jit-test/tests/wasm/regress/baseline-arm64-chunk-pop.js
js/src/wasm/WasmBaselineCompile.cpp
new file mode 100644
--- /dev/null
+++ b/js/src/jit-test/tests/wasm/regress/baseline-arm64-chunk-pop.js
@@ -0,0 +1,31 @@
+var bin = wasmTextToBinary(
+    `(module
+       (func (export "f4786") (result i32)
+         (local i32 i64 i64 i64 f32)
+         i32.const 1
+         tee_local 0
+         get_local 0
+         get_local 0
+         get_local 0
+         get_local 0
+         get_local 0
+         get_local 0
+         get_local 0
+         if i32
+           get_local 0
+         else
+           get_local 0
+           tee_local 0
+           get_local 0
+           br_if 1
+         end
+         drop
+         drop
+         drop
+         drop
+         drop
+         drop
+         drop))`);
+var ins = new WebAssembly.Instance(new WebAssembly.Module(bin));
+ins.exports.f4786();
+
--- a/js/src/wasm/WasmBaselineCompile.cpp
+++ b/js/src/wasm/WasmBaselineCompile.cpp
@@ -944,56 +944,56 @@ using ScratchEBX = ScratchI32;
 
 // ScratchI8 is a mnemonic device: For some ops we need a register with a
 // byte subregister.
 using ScratchI8 = ScratchI32;
 #endif
 
 // The stack frame.
 //
-// The frame has four parts ("below" means at lower addresses):
+// The stack frame has four parts ("below" means at lower addresses):
 //
-//  - the Header, comprising the Frame and DebugFrame elements;
-//  - the Local area, allocated below the header with various forms of
-//    alignment;
+//  - the Frame element;
+//  - the Local area, including the DebugFrame element; allocated below the
+//    header with various forms of alignment;
 //  - the Dynamic area, comprising the temporary storage the compiler uses for
 //    register spilling, allocated below the Local area;
 //  - the Arguments area, comprising memory allocated for outgoing calls,
 //    allocated below the Dynamic area.
 //
 //                 +============================+
 //                 |    Incoming arg            |
 //                 |    ...                     |
 //                 +----------------------------+
 //                 |    unspecified             |
 // --------------  +============================+
-//  ^              |    Frame (fixed size)      |
-// fixedSize       +----------------------------+ <------------------ FP
-//  |              |    DebugFrame (optional)   |               ^^
-//  |    --------  +============================+ ---------     ||
-//  |       ^      |    Local (static size)     |    ^          ||
-//  | localSize    |    ...                     |    |        framePushed
-//  v       v      |    (padding)               |    |          ||
-// --------------  +============================+ stackHeight   ||
-//          ^      |    Dynamic (variable)      |    |          ||
-//   dynamicSize   |    ...                     |    |          ||
-//          v      |    ...                     |    v          ||
-// --------------  |    (free space, sometimes) | ---------     v|
+//                 |    Frame (fixed size)      |
+// --------------  +============================+ <-------------------- FP
+//          ^      |    DebugFrame (optional)   |    ^                ^^
+//          |      +----------------------------+    |                ||
+//    localSize    |    Local (static size)     |    |                ||
+//          |      |    ...                     |    |        framePushed
+//          v      |    (padding)               |    |                ||
+// --------------  +============================+ currentStackHeight  ||
+//          ^      |    Dynamic (variable size) |    |                ||
+//   dynamicSize   |    ...                     |    |                ||
+//          v      |    ...                     |    v                ||
+// --------------  |    (free space, sometimes) | ---------           v|
 //                 +============================+ <----- SP not-during calls
-//                 |    Arguments (sometimes)   |                |
-//                 |    ...                     |                v
+//                 |    Arguments (sometimes)   |                      |
+//                 |    ...                     |                      v
 //                 +============================+ <----- SP during calls
 //
-// The Header is addressed off the stack pointer.  masm.framePushed() is always
+// The Frame is addressed off the stack pointer.  masm.framePushed() is always
 // correct, and masm.getStackPointer() + masm.framePushed() always addresses the
 // Frame, with the DebugFrame optionally below it.
 //
-// The Local area is laid out by BaseLocalIter and is allocated and deallocated
-// by standard prologue and epilogue functions that manipulate the stack
-// pointer, but it is accessed via BaseStackFrame.
+// The Local area (including the DebugFrame) is laid out by BaseLocalIter and is
+// allocated and deallocated by standard prologue and epilogue functions that
+// manipulate the stack pointer, but it is accessed via BaseStackFrame.
 //
 // The Dynamic area is maintained by and accessed via BaseStackFrame.  On some
 // systems (such as ARM64), the Dynamic memory may be allocated in chunks
 // because the SP needs a specific alignment, and in this case there will
 // normally be some free space directly above the SP.  The stack height does not
 // include the free space, it reflects the logically used space only.
 //
 // The Arguments area is allocated and deallocated via BaseStackFrame (see
@@ -1142,21 +1142,24 @@ class BaseStackFrameAllocator {
   // pointer.
   //
   // Good values for ChunkSize are the subject of future empirical analysis;
   // eight words is just an educated guess.
 
   static constexpr uint32_t ChunkSize = 8 * sizeof(void*);
   static constexpr uint32_t InitialChunk = ChunkSize;
 
-  // The current logical height of the frame, ie the sum of space for the
-  // Local and Dynamic areas.  The allocated size of the frame -- provided by
-  // masm.framePushed() -- is usually larger than currentStackHeight_, notably
-  // at the beginning of execution when we've allocated InitialChunk extra
-  // space.
+  // The current logical height of the frame is
+  //   currentStackHeight_ = localSize_ + dynamicSize
+  // where dynamicSize is not accounted for explicitly and localSize_ also
+  // includes size for the DebugFrame.
+  //
+  // The allocated size of the frame, provided by masm.framePushed(), is usually
+  // larger than currentStackHeight_, notably at the beginning of execution when
+  // we've allocated InitialChunk extra space.
 
   uint32_t currentStackHeight_;
 #endif
 
   // Size of the Local area in bytes (stable after BaseCompiler::init() has
   // called BaseStackFrame::setupLocals(), which in turn calls
   // BaseStackFrameAllocator::setLocalSize()), always rounded to the proper
   // stack alignment.  The Local area is then allocated in beginFunction(),
@@ -1200,27 +1203,41 @@ class BaseStackFrameAllocator {
 #ifdef RABALDR_CHUNKY_STACK
     currentStackHeight_ = localSize_;
 #endif
   }
 
  public:
   // The fixed amount of memory, in bytes, allocated on the stack below the
   // Header for purposes such as locals and other fixed values.  Includes all
-  // necessary alignment.
-
-  uint32_t fixedSize() const {
+  // necessary alignment, and on ARM64 also the initial chunk for the working
+  // stack memory.
+
+  uint32_t fixedAllocSize() const {
     MOZ_ASSERT(localSize_ != UINT32_MAX);
 #ifdef RABALDR_CHUNKY_STACK
     return localSize_ + InitialChunk;
 #else
     return localSize_;
 #endif
   }
 
+#ifdef RABALDR_CHUNKY_STACK
+  // The allocated frame size is frequently larger than the logical stack
+  // height; we round up to a chunk boundary, and special case the initial
+  // chunk.
+  uint32_t framePushedForHeight(uint32_t logicalHeight) {
+    if (logicalHeight <= fixedAllocSize()) {
+      return fixedAllocSize();
+    }
+    return fixedAllocSize() + AlignBytes(logicalHeight - fixedAllocSize(),
+                                         ChunkSize);
+  }
+#endif
+
  protected:
   //////////////////////////////////////////////////////////////////////
   //
   // The Dynamic area - the dynamic part of the frame, for spilling and saving
   // intermediate values.
 
   // Offset off of sp_ for the slot at stack area location `offset`.
 
@@ -1235,60 +1252,59 @@ class BaseStackFrameAllocator {
     }
     currentStackHeight_ += bytes;
     checkChunkyInvariants();
   }
 
   void popChunkyBytes(uint32_t bytes) {
     checkChunkyInvariants();
     currentStackHeight_ -= bytes;
-    // Sometimes, popChunkyBytes() is used to pop a larger area, as when we
-    // drop values consumed by a call, and we may need to drop several
-    // chunks.  But never drop the initial chunk.
-    if (masm.framePushed() - currentStackHeight_ >= ChunkSize) {
-      uint32_t target =
-          Max(fixedSize(), AlignBytes(currentStackHeight_, ChunkSize));
-      uint32_t amount = masm.framePushed() - target;
-      if (amount) {
-        masm.freeStack(amount);
-      }
-      MOZ_ASSERT(masm.framePushed() >= fixedSize());
+    // Sometimes, popChunkyBytes() is used to pop a larger area, as when we drop
+    // values consumed by a call, and we may need to drop several chunks.  But
+    // never drop the initial chunk.  Crucially, the amount we drop is always an
+    // integral number of chunks.
+    uint32_t freeSpace = masm.framePushed() - currentStackHeight_;
+    if (freeSpace >= ChunkSize) {
+      uint32_t targetAllocSize = framePushedForHeight(currentStackHeight_);
+      uint32_t amountToFree = masm.framePushed() - targetAllocSize;
+      MOZ_ASSERT(amountToFree % ChunkSize == 0);
+      if (amountToFree) {
+        masm.freeStack(amountToFree);
+      }
     }
     checkChunkyInvariants();
   }
 #endif
 
   uint32_t currentStackHeight() const {
 #ifdef RABALDR_CHUNKY_STACK
     return currentStackHeight_;
 #else
     return masm.framePushed();
 #endif
   }
 
  private:
 #ifdef RABALDR_CHUNKY_STACK
   void checkChunkyInvariants() {
+    MOZ_ASSERT(masm.framePushed() >= fixedAllocSize());
     MOZ_ASSERT(masm.framePushed() >= currentStackHeight_);
-    MOZ_ASSERT(masm.framePushed() == fixedSize() ||
+    MOZ_ASSERT(masm.framePushed() == fixedAllocSize() ||
                masm.framePushed() - currentStackHeight_ < ChunkSize);
+    MOZ_ASSERT((masm.framePushed() - localSize_) % ChunkSize == 0);
   }
 #endif
 
   // For a given stack height, return the appropriate size of the allocated
   // frame.
 
   uint32_t framePushedForHeight(StackHeight stackHeight) {
 #ifdef RABALDR_CHUNKY_STACK
-    // The allocated frame size is frequently larger than the stack height;
-    // we round up to a chunk boundary, and special case the initial chunk.
-    return stackHeight.height <= fixedSize()
-               ? fixedSize()
-               : fixedSize() +
-                     AlignBytes(stackHeight.height - fixedSize(), ChunkSize);
+    // A more complicated adjustment is needed.
+    return framePushedForHeight(stackHeight.height);
 #else
     // The allocated frame size equals the stack height.
     return stackHeight.height;
 #endif
   }
 
  public:
   // The current height of the stack area, not necessarily zero-based, in a
@@ -4116,17 +4132,17 @@ class BaseCompiler final : public BaseCo
     if (!smgen_.generateStackmapEntriesForTrapExit(args, extras)) {
       return false;
     }
     if (!createStackMap("stack check", extras, masm.currentOffset(),
                         HasRefTypedDebugFrame::No)) {
       return false;
     }
 
-    size_t reservedBytes = fr.fixedSize() - masm.framePushed();
+    size_t reservedBytes = fr.fixedAllocSize() - masm.framePushed();
     MOZ_ASSERT(0 == (reservedBytes % sizeof(void*)));
 
     masm.reserveStack(reservedBytes);
     fr.onFixedStackAllocated();
     if (!smgen_.mst_.pushNonGCPointers(reservedBytes / sizeof(void*))) {
       return false;
     }
 
@@ -4277,17 +4293,17 @@ class BaseCompiler final : public BaseCo
       }
       insertBreakablePoint(CallSiteDesc::LeaveFrame);
       if (!createStackMap("debug: leave frame", refDebugFrame)) {
         return false;
       }
       restoreResult();
     }
 
-    GenerateFunctionEpilogue(masm, fr.fixedSize(), &offsets_);
+    GenerateFunctionEpilogue(masm, fr.fixedAllocSize(), &offsets_);
 
 #if defined(JS_ION_PERF)
     // FIXME - profiling code missing.  No bug for this.
 
     // Note the end of the inline code and start of the OOL code.
     // gen->perfSpewer().noteEndInlineCode(masm);
 #endif