Bug 1526579 - Rabaldr arm64 stack alignment fix. r=bbouvier, a=lizzard
authorLars T Hansen <lhansen@mozilla.com>
Tue, 12 Feb 2019 10:39:23 +0100
changeset 516044 26383c993190b7ba1b9360771efe404025ae01ad
parent 516043 2d73b02c64e35c65823e26e7eb620241ff83b5af
child 516045 c5bcd3213da631ccba03d2ed8a16bd1519c424e4
push id1953
push userffxbld-merge
push dateMon, 11 Mar 2019 12:10:20 +0000
treeherdermozilla-release@9c35dcbaa899 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersbbouvier, lizzard
bugs1526579
milestone66.0
Bug 1526579 - Rabaldr arm64 stack alignment fix. r=bbouvier, a=lizzard Problem: When a stack chunk had to be popped as part of a control flow instruction, the amount to pop was not always computed as a multiple of ChunkSize. The reason is that the fixed amount of stack that should not be popped isn't necessarily a multiple of ChunkSize, yet this was assumed. A small adjustment to the calculation fixes that. Also added an assertion that would have caught this problem more easily. Also did some desirable drive-by fixes to clarify documentation and to factor the resulting code. The TC sets up a situation where we require a chunk to be created and then destroyed in the 'else' arm of the 'if', at the same time as the fixed amount of stack is not a multiple of ChunkSize. Differential Revision: https://phabricator.services.mozilla.com/D19470
js/src/jit-test/tests/wasm/regress/baseline-arm64-chunk-pop.js
js/src/wasm/WasmBaselineCompile.cpp
new file mode 100644
--- /dev/null
+++ b/js/src/jit-test/tests/wasm/regress/baseline-arm64-chunk-pop.js
@@ -0,0 +1,31 @@
+var bin = wasmTextToBinary(
+    `(module
+       (func (export "f4786") (result i32)
+         (local i32 i64 i64 i64 f32)
+         i32.const 1
+         tee_local 0
+         get_local 0
+         get_local 0
+         get_local 0
+         get_local 0
+         get_local 0
+         get_local 0
+         get_local 0
+         if i32
+           get_local 0
+         else
+           get_local 0
+           tee_local 0
+           get_local 0
+           br_if 1
+         end
+         drop
+         drop
+         drop
+         drop
+         drop
+         drop
+         drop))`);
+var ins = new WebAssembly.Instance(new WebAssembly.Module(bin));
+ins.exports.f4786();
+
--- a/js/src/wasm/WasmBaselineCompile.cpp
+++ b/js/src/wasm/WasmBaselineCompile.cpp
@@ -944,56 +944,56 @@ using ScratchEBX = ScratchI32;
 
 // ScratchI8 is a mnemonic device: For some ops we need a register with a
 // byte subregister.
 using ScratchI8 = ScratchI32;
 #endif
 
 // The stack frame.
 //
-// The frame has four parts ("below" means at lower addresses):
+// The stack frame has four parts ("below" means at lower addresses):
 //
-//  - the Header, comprising the Frame and DebugFrame elements;
-//  - the Local area, allocated below the header with various forms of
-//    alignment;
+//  - the Frame element;
+//  - the Local area, including the DebugFrame element; allocated below the
+//    header with various forms of alignment;
 //  - the Dynamic area, comprising the temporary storage the compiler uses for
 //    register spilling, allocated below the Local area;
 //  - the Arguments area, comprising memory allocated for outgoing calls,
 //    allocated below the Dynamic area.
 //
 //                 +============================+
 //                 |    Incoming arg            |
 //                 |    ...                     |
 //                 +----------------------------+
 //                 |    unspecified             |
 // --------------  +============================+
-//  ^              |    Frame (fixed size)      |
-// fixedSize       +----------------------------+ <------------------ FP
-//  |              |    DebugFrame (optional)   |               ^^
-//  |    --------  +============================+ ---------     ||
-//  |       ^      |    Local (static size)     |    ^          ||
-//  | localSize    |    ...                     |    |        framePushed
-//  v       v      |    (padding)               |    |          ||
-// --------------  +============================+ stackHeight   ||
-//          ^      |    Dynamic (variable)      |    |          ||
-//   dynamicSize   |    ...                     |    |          ||
-//          v      |    ...                     |    v          ||
-// --------------  |    (free space, sometimes) | ---------     v|
+//                 |    Frame (fixed size)      |
+// --------------  +============================+ <-------------------- FP
+//          ^      |    DebugFrame (optional)   |    ^                ^^
+//          |      +----------------------------+    |                ||
+//    localSize    |    Local (static size)     |    |                ||
+//          |      |    ...                     |    |        framePushed
+//          v      |    (padding)               |    |                ||
+// --------------  +============================+ currentStackHeight  ||
+//          ^      |    Dynamic (variable size) |    |                ||
+//   dynamicSize   |    ...                     |    |                ||
+//          v      |    ...                     |    v                ||
+// --------------  |    (free space, sometimes) | ---------           v|
 //                 +============================+ <----- SP not-during calls
-//                 |    Arguments (sometimes)   |                |
-//                 |    ...                     |                v
+//                 |    Arguments (sometimes)   |                      |
+//                 |    ...                     |                      v
 //                 +============================+ <----- SP during calls
 //
-// The Header is addressed off the stack pointer.  masm.framePushed() is always
+// The Frame is addressed off the stack pointer.  masm.framePushed() is always
 // correct, and masm.getStackPointer() + masm.framePushed() always addresses the
 // Frame, with the DebugFrame optionally below it.
 //
-// The Local area is laid out by BaseLocalIter and is allocated and deallocated
-// by standard prologue and epilogue functions that manipulate the stack
-// pointer, but it is accessed via BaseStackFrame.
+// The Local area (including the DebugFrame) is laid out by BaseLocalIter and is
+// allocated and deallocated by standard prologue and epilogue functions that
+// manipulate the stack pointer, but it is accessed via BaseStackFrame.
 //
 // The Dynamic area is maintained by and accessed via BaseStackFrame.  On some
 // systems (such as ARM64), the Dynamic memory may be allocated in chunks
 // because the SP needs a specific alignment, and in this case there will
 // normally be some free space directly above the SP.  The stack height does not
 // include the free space, it reflects the logically used space only.
 //
 // The Arguments area is allocated and deallocated via BaseStackFrame (see
@@ -1142,21 +1142,24 @@ class BaseStackFrameAllocator {
   // pointer.
   //
   // Good values for ChunkSize are the subject of future empirical analysis;
   // eight words is just an educated guess.
 
   static constexpr uint32_t ChunkSize = 8 * sizeof(void*);
   static constexpr uint32_t InitialChunk = ChunkSize;
 
-  // The current logical height of the frame, ie the sum of space for the
-  // Local and Dynamic areas.  The allocated size of the frame -- provided by
-  // masm.framePushed() -- is usually larger than currentStackHeight_, notably
-  // at the beginning of execution when we've allocated InitialChunk extra
-  // space.
+  // The current logical height of the frame is
+  //   currentStackHeight_ = localSize_ + dynamicSize
+  // where dynamicSize is not accounted for explicitly and localSize_ also
+  // includes size for the DebugFrame.
+  //
+  // The allocated size of the frame, provided by masm.framePushed(), is usually
+  // larger than currentStackHeight_, notably at the beginning of execution when
+  // we've allocated InitialChunk extra space.
 
   uint32_t currentStackHeight_;
 #endif
 
   // Size of the Local area in bytes (stable after BaseCompiler::init() has
   // called BaseStackFrame::setupLocals(), which in turn calls
   // BaseStackFrameAllocator::setLocalSize()), always rounded to the proper
   // stack alignment.  The Local area is then allocated in beginFunction(),
@@ -1200,27 +1203,41 @@ class BaseStackFrameAllocator {
 #ifdef RABALDR_CHUNKY_STACK
     currentStackHeight_ = localSize_;
 #endif
   }
 
  public:
   // The fixed amount of memory, in bytes, allocated on the stack below the
   // Header for purposes such as locals and other fixed values.  Includes all
-  // necessary alignment.
-
-  uint32_t fixedSize() const {
+  // necessary alignment, and on ARM64 also the initial chunk for the working
+  // stack memory.
+
+  uint32_t fixedAllocSize() const {
     MOZ_ASSERT(localSize_ != UINT32_MAX);
 #ifdef RABALDR_CHUNKY_STACK
     return localSize_ + InitialChunk;
 #else
     return localSize_;
 #endif
   }
 
+#ifdef RABALDR_CHUNKY_STACK
+  // The allocated frame size is frequently larger than the logical stack
+  // height; we round up to a chunk boundary, and special case the initial
+  // chunk.
+  uint32_t framePushedForHeight(uint32_t logicalHeight) {
+    if (logicalHeight <= fixedAllocSize()) {
+      return fixedAllocSize();
+    }
+    return fixedAllocSize() + AlignBytes(logicalHeight - fixedAllocSize(),
+                                         ChunkSize);
+  }
+#endif
+
  protected:
   //////////////////////////////////////////////////////////////////////
   //
   // The Dynamic area - the dynamic part of the frame, for spilling and saving
   // intermediate values.
 
   // Offset off of sp_ for the slot at stack area location `offset`.
 
@@ -1235,60 +1252,59 @@ class BaseStackFrameAllocator {
     }
     currentStackHeight_ += bytes;
     checkChunkyInvariants();
   }
 
   void popChunkyBytes(uint32_t bytes) {
     checkChunkyInvariants();
     currentStackHeight_ -= bytes;
-    // Sometimes, popChunkyBytes() is used to pop a larger area, as when we
-    // drop values consumed by a call, and we may need to drop several
-    // chunks.  But never drop the initial chunk.
-    if (masm.framePushed() - currentStackHeight_ >= ChunkSize) {
-      uint32_t target =
-          Max(fixedSize(), AlignBytes(currentStackHeight_, ChunkSize));
-      uint32_t amount = masm.framePushed() - target;
-      if (amount) {
-        masm.freeStack(amount);
-      }
-      MOZ_ASSERT(masm.framePushed() >= fixedSize());
+    // Sometimes, popChunkyBytes() is used to pop a larger area, as when we drop
+    // values consumed by a call, and we may need to drop several chunks.  But
+    // never drop the initial chunk.  Crucially, the amount we drop is always an
+    // integral number of chunks.
+    uint32_t freeSpace = masm.framePushed() - currentStackHeight_;
+    if (freeSpace >= ChunkSize) {
+      uint32_t targetAllocSize = framePushedForHeight(currentStackHeight_);
+      uint32_t amountToFree = masm.framePushed() - targetAllocSize;
+      MOZ_ASSERT(amountToFree % ChunkSize == 0);
+      if (amountToFree) {
+        masm.freeStack(amountToFree);
+      }
     }
     checkChunkyInvariants();
   }
 #endif
 
   uint32_t currentStackHeight() const {
 #ifdef RABALDR_CHUNKY_STACK
     return currentStackHeight_;
 #else
     return masm.framePushed();
 #endif
   }
 
  private:
 #ifdef RABALDR_CHUNKY_STACK
   void checkChunkyInvariants() {
+    MOZ_ASSERT(masm.framePushed() >= fixedAllocSize());
     MOZ_ASSERT(masm.framePushed() >= currentStackHeight_);
-    MOZ_ASSERT(masm.framePushed() == fixedSize() ||
+    MOZ_ASSERT(masm.framePushed() == fixedAllocSize() ||
                masm.framePushed() - currentStackHeight_ < ChunkSize);
+    MOZ_ASSERT((masm.framePushed() - localSize_) % ChunkSize == 0);
   }
 #endif
 
   // For a given stack height, return the appropriate size of the allocated
   // frame.
 
   uint32_t framePushedForHeight(StackHeight stackHeight) {
 #ifdef RABALDR_CHUNKY_STACK
-    // The allocated frame size is frequently larger than the stack height;
-    // we round up to a chunk boundary, and special case the initial chunk.
-    return stackHeight.height <= fixedSize()
-               ? fixedSize()
-               : fixedSize() +
-                     AlignBytes(stackHeight.height - fixedSize(), ChunkSize);
+    // A more complicated adjustment is needed.
+    return framePushedForHeight(stackHeight.height);
 #else
     // The allocated frame size equals the stack height.
     return stackHeight.height;
 #endif
   }
 
  public:
   // The current height of the stack area, not necessarily zero-based, in a
@@ -4118,17 +4134,17 @@ class BaseCompiler final : public BaseCo
     if (!smgen_.generateStackmapEntriesForTrapExit(args, extras)) {
       return false;
     }
     if (!createStackMap("stack check", extras, masm.currentOffset(),
                         HasRefTypedDebugFrame::No)) {
       return false;
     }
 
-    size_t reservedBytes = fr.fixedSize() - masm.framePushed();
+    size_t reservedBytes = fr.fixedAllocSize() - masm.framePushed();
     MOZ_ASSERT(0 == (reservedBytes % sizeof(void*)));
 
     masm.reserveStack(reservedBytes);
     fr.onFixedStackAllocated();
     if (!smgen_.mst_.pushNonGCPointers(reservedBytes / sizeof(void*))) {
       return false;
     }
 
@@ -4279,17 +4295,17 @@ class BaseCompiler final : public BaseCo
       }
       insertBreakablePoint(CallSiteDesc::LeaveFrame);
       if (!createStackMap("debug: leave frame", refDebugFrame)) {
         return false;
       }
       restoreResult();
     }
 
-    GenerateFunctionEpilogue(masm, fr.fixedSize(), &offsets_);
+    GenerateFunctionEpilogue(masm, fr.fixedAllocSize(), &offsets_);
 
 #if defined(JS_ION_PERF)
     // FIXME - profiling code missing.  No bug for this.
 
     // Note the end of the inline code and start of the OOL code.
     // gen->perfSpewer().noteEndInlineCode(masm);
 #endif