Bug 1578418 - Use WasmABIResults iterator to place block and function results r=luke,lth
author: Andy Wingo <wingo@igalia.com>
Tue, 22 Oct 2019 15:30:10 +0000
changeset 498684 a9d2b57a99be08cf6942e5dd8c300e75ba0b7501
parent 498683 8e233a18ab9ae1511789985097376b6ea1903508
child 498685 e4aa69b40591d7b91661b01371da50cb059608a1
push id: 98604
push user: jdemooij@mozilla.com
push date: Wed, 23 Oct 2019 08:26:28 +0000
treeherder: autoland@a9d2b57a99be
reviewers: luke, lth
bugs: 1578418
milestone: 72.0a1
Bug 1578418 - Use WasmABIResults iterator to place block and function results r=luke,lth

Differential Revision: https://phabricator.services.mozilla.com/D44477
js/src/wasm/WasmBaselineCompile.cpp
js/src/wasm/WasmIonCompile.cpp
js/src/wasm/WasmOpIter.h
js/src/wasm/WasmStubs.cpp
js/src/wasm/WasmStubs.h
js/src/wasm/WasmValidate.cpp
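
The core idea, visible throughout the hunks below, is to replace the single designated join register with a walk over ABI result locations: register results are visited first, and any remaining results are given stack locations at increasing offsets above the block's base stack height. The standalone sketch below illustrates only that traversal order; Loc, Result, and placeResults are simplified stand-ins invented for illustration, not the real ABIResult/ABIResultIter API from WasmStubs.h.

#include <cstdint>
#include <cstdio>
#include <vector>

enum class Loc { Register, Stack };

struct Result {
  Loc loc;
  uint32_t stackOffset;  // Only meaningful when loc == Loc::Stack.
};

// Walk results in the order an ABIResultIter-style traversal uses: register
// results first, then stack results packed at increasing offsets from the
// block's base stack height.
static std::vector<Result> placeResults(size_t count, size_t numRegResults,
                                        uint32_t slotSize) {
  std::vector<Result> out;
  uint32_t stackBytes = 0;
  for (size_t i = 0; i < count; i++) {
    if (i < numRegResults) {
      out.push_back({Loc::Register, 0});
    } else {
      out.push_back({Loc::Stack, stackBytes});
      stackBytes += slotSize;
    }
  }
  return out;
}

int main() {
  // Three i32 results with a single result register available, matching the
  // "top N values (currently N=1)" scheme described in the patch.
  for (const Result& r : placeResults(3, 1, sizeof(uint32_t))) {
    if (r.loc == Loc::Register) {
      std::printf("register result\n");
    } else {
      std::printf("stack result at offset %u bytes\n", r.stackOffset);
    }
  }
  return 0;
}
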
--- a/js/src/wasm/WasmBaselineCompile.cpp
+++ b/js/src/wasm/WasmBaselineCompile.cpp
@@ -158,17 +158,16 @@ namespace wasm {
 
 using namespace js::jit;
 
 using HandleNaNSpecially = bool;
 using InvertBranch = bool;
 using IsKnownNotZero = bool;
 using IsUnsigned = bool;
 using NeedsBoundsCheck = bool;
-using PopStack = bool;
 using WantResult = bool;
 using ZeroOnOverflow = bool;
 
 class BaseStackFrame;
 
 // Two flags, useABI and interModule, control how calls are made.
 //
 // UseABI::Wasm implies that the Tls/Heap/Global registers are nonvolatile,
@@ -223,19 +222,20 @@ static constexpr FloatRegister RabaldrSc
 
 static_assert(RabaldrScratchF32 != ScratchFloat32Reg, "Too busy");
 static_assert(RabaldrScratchF64 != ScratchDoubleReg, "Too busy");
 #endif
 
 #ifdef JS_CODEGEN_X86
 // The selection of EBX here steps gingerly around: the need for EDX
 // to be allocatable for multiply/divide; ECX to be allocatable for
-// shift/rotate; EAX (= ReturnReg) to be allocatable as the joinreg;
-// EBX not being one of the WasmTableCall registers; and needing a
-// temp register for load/store that has a single-byte persona.
+// shift/rotate; EAX (= ReturnReg) to be allocatable as the result
+// register; EBX not being one of the WasmTableCall registers; and
+// needing a temp register for load/store that has a single-byte
+// persona.
 //
 // The compiler assumes that RabaldrScratchI32 has a single-byte
 // persona.  Code for 8-byte atomic operations assumes that
 // RabaldrScratchI32 is in fact ebx.
 
 #  define RABALDR_SCRATCH_I32
 static const Register RabaldrScratchI32 = ebx;
 
@@ -479,16 +479,18 @@ struct SpecificRegs {
 
 class BaseCompilerInterface {
  public:
   // Spill all spillable registers.
   //
   // TODO / OPTIMIZE (Bug 1316802): It's possible to do better here by
   // spilling only enough registers to satisfy current needs.
   virtual void sync() = 0;
+  virtual void saveTempPtr(RegPtr r) = 0;
+  virtual void restoreTempPtr(RegPtr r) = 0;
 };
 
 // Register allocator.
 
 class BaseRegAlloc {
   // Notes on float register allocation.
   //
   // The general rule in SpiderMonkey is that float registers can alias double
@@ -762,16 +764,30 @@ class BaseRegAlloc {
 
   void needPtr(RegPtr specific) {
     if (!isAvailablePtr(specific)) {
       bc.sync();
     }
     allocGPR(specific);
   }
 
+  // Use when you need a register for a short time but explicitly want to avoid
+  // a full sync().
+  MOZ_MUST_USE RegPtr needTempPtr(RegPtr fallback, bool* saved) {
+    if (hasGPR()) {
+      *saved = false;
+      return RegPtr(allocGPR());
+    }
+    *saved = true;
+    bc.saveTempPtr(fallback);
+    MOZ_ASSERT(isAvailablePtr(fallback));
+    allocGPR(fallback);
+    return RegPtr(fallback);
+  }
+
   MOZ_MUST_USE RegF32 needF32() {
     if (!hasFPU<MIRType::Float32>()) {
       bc.sync();
     }
     return RegF32(allocFPU<MIRType::Float32>());
   }
 
   void needF32(RegF32 specific) {
@@ -800,16 +816,24 @@ class BaseRegAlloc {
   void freeI64(RegI64 r) { freeInt64(r); }
 
   void freePtr(RegPtr r) { freeGPR(r); }
 
   void freeF64(RegF64 r) { freeFPU(r); }
 
   void freeF32(RegF32 r) { freeFPU(r); }
 
+  void freeTempPtr(RegPtr r, bool saved) {
+    freePtr(r);
+    if (saved) {
+      bc.restoreTempPtr(r);
+      MOZ_ASSERT(!isAvailablePtr(r));
+    }
+  }
+
 #ifdef JS_CODEGEN_ARM
   MOZ_MUST_USE RegI64 needI64Pair() {
     if (!hasGPRPair()) {
       bc.sync();
     }
     Register low, high;
     allocGPRPair(&low, &high);
     return RegI64(Register64(high, low));
@@ -1089,16 +1113,21 @@ class StackHeight {
   friend class BaseStackFrameAllocator;
 
   uint32_t height;
 
  public:
   explicit StackHeight(uint32_t h) : height(h) {}
   static StackHeight Invalid() { return StackHeight(UINT32_MAX); }
   bool isValid() const { return height != UINT32_MAX; }
+  bool operator==(StackHeight rhs) const {
+    MOZ_ASSERT(isValid() && rhs.isValid());
+    return height == rhs.height;
+  }
+  bool operator!=(StackHeight rhs) const { return !(*this == rhs); }
 };
 
 // Abstraction of the baseline compiler's stack frame (except for the Frame /
 // DebugFrame parts).  See comments above for more.  Remember, "below" on the
 // stack means at lower addresses.
 //
 // The abstraction is split into two parts: BaseStackFrameAllocator is
 // responsible for allocating and deallocating space on the stack and for
@@ -1238,16 +1267,23 @@ class BaseStackFrameAllocator {
   //
   // The Dynamic area - the dynamic part of the frame, for spilling and saving
   // intermediate values.
 
   // Offset off of sp_ for the slot at stack area location `offset`.
 
   int32_t stackOffset(int32_t offset) { return masm.framePushed() - offset; }
 
+  uint32_t computeHeightWithStackResults(StackHeight stackBase,
+                                         uint32_t stackResultBytes) {
+    MOZ_ASSERT(stackResultBytes);
+    MOZ_ASSERT(currentStackHeight() >= stackBase.height);
+    return stackBase.height + stackResultBytes;
+  }
+
 #ifdef RABALDR_CHUNKY_STACK
   void pushChunkyBytes(uint32_t bytes) {
     MOZ_ASSERT(bytes <= ChunkSize);
     checkChunkyInvariants();
     if (masm.framePushed() - currentStackHeight_ < bytes) {
       masm.reserveStack(ChunkSize);
     }
     currentStackHeight_ += bytes;
@@ -1330,56 +1366,61 @@ class BaseStackFrameAllocator {
   uint32_t dynamicHeight() const { return currentStackHeight() - localSize_; }
 
   // Before branching to an outer control label, pop the execution stack to
   // the level expected by that region, but do not update masm.framePushed()
   // as that will happen as compilation leaves the block.
   //
   // Note these operate directly on the stack pointer register.
 
-  void popStackBeforeBranch(StackHeight destStackHeight) {
+  void popStackBeforeBranch(StackHeight destStackHeight,
+                            uint32_t stackResultBytes) {
     uint32_t framePushedHere = masm.framePushed();
-    uint32_t framePushedThere = framePushedForHeight(destStackHeight);
+    StackHeight heightThere =
+        StackHeight(destStackHeight.height + stackResultBytes);
+    uint32_t framePushedThere = framePushedForHeight(heightThere);
     if (framePushedHere > framePushedThere) {
       masm.addToStackPtr(Imm32(framePushedHere - framePushedThere));
     }
   }
 
-  bool willPopStackBeforeBranch(StackHeight destStackHeight) {
-    uint32_t framePushedHere = masm.framePushed();
-    uint32_t framePushedThere = framePushedForHeight(destStackHeight);
-    return framePushedHere > framePushedThere;
-  }
-
-  // Before exiting a nested control region, pop the execution stack
-  // to the level expected by the nesting region, and free the
-  // stack.
-  //
-  // Note this operates on the stack height, which is not the same as the
-  // stack pointer on chunky-stack systems; the stack pointer may or may not
-  // change on such systems.
-
-  void popStackOnBlockExit(StackHeight destStackHeight, bool deadCode) {
-    uint32_t stackHeightHere = currentStackHeight();
-    uint32_t stackHeightThere = destStackHeight.height;
-    if (stackHeightHere > stackHeightThere) {
-#ifdef RABALDR_CHUNKY_STACK
-      if (deadCode) {
-        setStackHeight(destStackHeight);
-      } else {
-        popChunkyBytes(stackHeightHere - stackHeightThere);
-      }
-#else
-      if (deadCode) {
-        masm.setFramePushed(stackHeightThere);
-      } else {
-        masm.freeStack(stackHeightHere - stackHeightThere);
-      }
-#endif
-    }
+  void popStackBeforeBranch(StackHeight destStackHeight, ResultType type) {
+    popStackBeforeBranch(destStackHeight,
+                         ABIResultIter::MeasureStackBytes(type));
+  }
+
+  // Given that there are |stackParamSize| bytes on the dynamic stack
+  // corresponding to the stack results, return the stack height once these
+  // parameters are popped.
+
+  StackHeight stackResultsBase(uint32_t stackParamSize) {
+    return StackHeight(currentStackHeight() - stackParamSize);
+  }
+
+  // For most of WebAssembly, adjacent instructions have fallthrough control
+  // flow between them, which allows us to simply thread the current stack
+  // height through the compiler.  There are two exceptions to this rule: when
+  // leaving a block via dead code, and when entering the "else" arm of an "if".
+  // In these cases, the stack height is the block entry height, plus any stack
+  // values (results in the block exit case, parameters in the else entry case).
+
+  void resetStackHeight(StackHeight destStackHeight, ResultType type) {
+    uint32_t height = destStackHeight.height;
+    height += ABIResultIter::MeasureStackBytes(type);
+    setStackHeight(StackHeight(height));
+  }
+
+  // Return offset of stack result.
+
+  uint32_t locateStackResult(const ABIResult& result, StackHeight stackBase,
+                             uint32_t stackResultBytes) {
+    MOZ_ASSERT(result.onStack());
+    MOZ_ASSERT(result.stackOffset() + result.size() <= stackResultBytes);
+    uint32_t end = computeHeightWithStackResults(stackBase, stackResultBytes);
+    return end - result.stackOffset();
   }
 
  public:
   //////////////////////////////////////////////////////////////////////
   //
   // The Argument area - for outgoing calls.
   //
   // We abstract these operations as an optimization: we can merge the freeing
@@ -1611,32 +1652,20 @@ class BaseStackFrame final : public Base
   // Offset off of sp_ for a local with offset `offset` from Frame.
   int32_t localOffset(int32_t offset) { return masm.framePushed() - offset; }
 
  public:
   ///////////////////////////////////////////////////////////////////////////
   //
   // Dynamic area
 
-  // Sizes of items in the stack area.
-  //
-  // The size values come from the implementations of Push() in
-  // MacroAssembler-x86-shared.cpp and MacroAssembler-arm-shared.cpp, and from
-  // VFPRegister::size() in Architecture-arm.h.
-  //
-  // On ARM unlike on x86 we push a single for float.
-
-  static const size_t StackSizeOfPtr = sizeof(intptr_t);
-  static const size_t StackSizeOfInt64 = sizeof(int64_t);
-#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS32)
-  static const size_t StackSizeOfFloat = sizeof(float);
-#else
-  static const size_t StackSizeOfFloat = sizeof(double);
-#endif
-  static const size_t StackSizeOfDouble = sizeof(double);
+  static const size_t StackSizeOfPtr = ABIResult::StackSizeOfPtr;
+  static const size_t StackSizeOfInt64 = ABIResult::StackSizeOfInt64;
+  static const size_t StackSizeOfFloat = ABIResult::StackSizeOfFloat;
+  static const size_t StackSizeOfDouble = ABIResult::StackSizeOfDouble;
 
   uint32_t pushPtr(Register r) {
     DebugOnly<uint32_t> stackBefore = currentStackHeight();
 #ifdef RABALDR_CHUNKY_STACK
     pushChunkyBytes(StackSizeOfPtr);
     masm.storePtr(r, Address(sp_, stackOffset(currentStackHeight())));
 #else
     masm.Push(r);
@@ -1742,16 +1771,112 @@ class BaseStackFrame final : public Base
 
   void loadStackF64(int32_t offset, RegF64 dest) {
     masm.loadDouble(Address(sp_, stackOffset(offset)), dest);
   }
 
   void loadStackF32(int32_t offset, RegF32 dest) {
     masm.loadFloat32(Address(sp_, stackOffset(offset)), dest);
   }
+
+  uint32_t prepareStackResultArea(StackHeight stackBase,
+                                  uint32_t stackResultBytes) {
+    uint32_t end = computeHeightWithStackResults(stackBase, stackResultBytes);
+    if (currentStackHeight() < end) {
+      uint32_t bytes = end - currentStackHeight();
+#ifdef RABALDR_CHUNKY_STACK
+      pushChunkyBytes(bytes);
+#else
+      masm.reserveStack(bytes);
+#endif
+      maxFramePushed_ = Max(maxFramePushed_, masm.framePushed());
+    }
+    return end;
+  }
+
+  void finishStackResultArea(StackHeight stackBase, uint32_t stackResultBytes) {
+    uint32_t end = computeHeightWithStackResults(stackBase, stackResultBytes);
+    MOZ_ASSERT(currentStackHeight() >= end);
+    popBytes(currentStackHeight() - end);
+  }
+
+  void shuffleStackResultsTowardFP(uint32_t srcHeight, uint32_t destHeight,
+                                   uint32_t bytes, Register temp) {
+    MOZ_ASSERT(destHeight < srcHeight);
+    MOZ_ASSERT(bytes % sizeof(uint32_t) == 0);
+    uint32_t destOffset = stackOffset(destHeight);
+    uint32_t srcOffset = stackOffset(srcHeight);
+    MOZ_ASSERT(destOffset >= bytes);
+    MOZ_ASSERT(srcOffset >= bytes);
+    while (bytes >= sizeof(intptr_t)) {
+      destOffset -= sizeof(intptr_t);
+      srcOffset -= sizeof(intptr_t);
+      bytes -= sizeof(intptr_t);
+      masm.loadPtr(Address(sp_, srcOffset), temp);
+      masm.storePtr(temp, Address(sp_, destOffset));
+    }
+    if (bytes) {
+      MOZ_ASSERT(bytes == sizeof(uint32_t));
+      destOffset -= sizeof(uint32_t);
+      srcOffset -= sizeof(uint32_t);
+      masm.load32(Address(sp_, srcOffset), temp);
+      masm.store32(temp, Address(sp_, destOffset));
+    }
+  }
+
+  void shuffleStackResultsTowardFP(StackHeight srcHeight,
+                                   StackHeight destHeight, uint32_t bytes,
+                                   Register temp) {
+    MOZ_ASSERT(srcHeight.isValid());
+    MOZ_ASSERT(destHeight.isValid());
+    uint32_t src = computeHeightWithStackResults(srcHeight, bytes);
+    uint32_t dest = computeHeightWithStackResults(destHeight, bytes);
+    MOZ_ASSERT(src <= currentStackHeight());
+    MOZ_ASSERT(dest <= currentStackHeight());
+    shuffleStackResultsTowardFP(src - bytes, dest - bytes, bytes, temp);
+  }
+
+  void shuffleStackResultsTowardSP(uint32_t srcHeight, uint32_t destHeight,
+                                   uint32_t bytes, Register temp) {
+    MOZ_ASSERT(destHeight > srcHeight);
+    MOZ_ASSERT(bytes % sizeof(uint32_t) == 0);
+    uint32_t destOffset = stackOffset(destHeight);
+    uint32_t srcOffset = stackOffset(srcHeight);
+    MOZ_ASSERT(destOffset >= bytes);
+    MOZ_ASSERT(srcOffset >= bytes);
+    while (bytes >= sizeof(intptr_t)) {
+      masm.loadPtr(Address(sp_, srcOffset - bytes), temp);
+      masm.storePtr(temp, Address(sp_, destOffset - bytes));
+      bytes -= sizeof(intptr_t);
+    }
+    if (bytes) {
+      MOZ_ASSERT(bytes == sizeof(uint32_t));
+      masm.load32(Address(sp_, srcOffset - bytes), temp);
+      masm.store32(temp, Address(sp_, destOffset - bytes));
+    }
+  }
+
+  void storeImmediateToStack(int32_t imm, uint32_t destHeight, Register temp) {
+    masm.move32(Imm32(imm), temp);
+    masm.store32(temp, Address(sp_, stackOffset(destHeight)));
+  }
+
+  void storeImmediateToStack(int64_t imm, uint32_t destHeight, Register temp) {
+#ifdef JS_PUNBOX64
+    masm.move64(Imm64(imm), Register64(temp));
+    masm.store64(Register64(temp), Address(sp_, stackOffset(destHeight)));
+#else
+    union {
+      int64_t i64;
+      int32_t i32[2];
+    } bits = {.i64 = imm};
+    storeImmediateToStack(bits.i32[0], destHeight, temp);
+    storeImmediateToStack(bits.i32[1], destHeight - sizeof(int32_t), temp);
+#endif
+  }
 };
 
 void BaseStackFrame::zeroLocals(BaseRegAlloc* ra) {
   MOZ_ASSERT(varLow_ != UINT32_MAX);
 
   if (varLow_ == varHigh_) {
     return;
   }
@@ -1921,16 +2046,43 @@ struct Stk {
     MOZ_ASSERT(k > MemLast && k <= LocalLast);
   }
   static Stk StkRef(intptr_t v) {
     Stk s;
     s.kind_ = ConstRef;
     s.refval_ = v;
     return s;
   }
+  static Stk StackResult(ValType type, uint32_t offs) {
+    Kind k;
+    switch (type.code()) {
+      case ValType::I32:
+        k = Stk::MemI32;
+        break;
+      case ValType::I64:
+        k = Stk::MemI64;
+        break;
+      case ValType::F32:
+        k = Stk::MemF32;
+        break;
+      case ValType::F64:
+        k = Stk::MemF64;
+        break;
+      case ValType::FuncRef:
+      case ValType::AnyRef:
+      case ValType::Ref:
+        k = Stk::MemRef;
+        break;
+      case ValType::NullRef:
+        MOZ_CRASH("unexpected nullref stack result");
+    }
+    Stk s;
+    s.setOffs(k, offs);
+    return s;
+  }
 
   void setOffs(Kind k, uint32_t v) {
     MOZ_ASSERT(k <= MemLast);
     kind_ = k;
     offs_ = v;
   }
 
   Kind kind() const { return kind_; }
@@ -2560,26 +2712,16 @@ class BaseCompiler final : public BaseCo
 
   BaseStackFrame::LocalVector localInfo_;
   Vector<OutOfLineCode*, 8, SystemAllocPolicy> outOfLine_;
 
   // On specific platforms we sometimes need to use specific registers.
 
   SpecificRegs specific_;
 
-  // The join registers are used to carry values out of blocks.
-  // JoinRegI32 and joinRegI64_ must overlap: emitBrIf and
-  // emitBrTable assume that.
-
-  RegI32 joinRegI32_;
-  RegI64 joinRegI64_;
-  RegPtr joinRegPtr_;
-  RegF32 joinRegF32_;
-  RegF64 joinRegF64_;
-
   // There are more members scattered throughout.
 
  public:
   BaseCompiler(const ModuleEnvironment& env, const FuncCompileInput& input,
                const ValTypeVector& locals, const MachineState& trapExitLayout,
                size_t trapExitLayoutNumWords, Decoder& decoder,
                StkVector& stkSource, TempAllocator* alloc, MacroAssembler* masm,
                StackMaps* stackMaps);
@@ -2782,116 +2924,165 @@ class BaseCompiler final : public BaseCo
   }
 
   void moveF32(RegF32 src, RegF32 dest) {
     if (src != dest) {
       masm.moveFloat32(src, dest);
     }
   }
 
-  void maybeReserveJoinRegI(ResultType type) {
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  // Block parameters and results.
+  //
+  // Blocks may have multiple parameters and multiple results.  Blocks can also
+  // be the target of branches: the entry for loops, and the exit for
+  // non-loops.
+  //
+  // Passing multiple values to a non-branch target (i.e., the entry of a
+  // "block") falls out naturally: any items on the value stack can flow
+  // directly from one block to another.
+  //
+  // However, for branch targets, we need to allocate well-known locations for
+  // the branch values.  The approach taken in the baseline compiler is to
+  // allocate registers to the top N values (currently N=1), and then stack
+  // locations for the rest.
+  //
+
+  enum class RegKind { All, OnlyGPRs };
+
+  inline void needResultRegisters(ResultType type, RegKind which) {
     if (type.empty()) {
       return;
     }
-    MOZ_ASSERT(type.length() == 1, "multi-value joins unimplemented");
-    switch (type[0].code()) {
-      case ValType::I32:
-        needI32(joinRegI32_);
-        break;
-      case ValType::I64:
-        needI64(joinRegI64_);
-        break;
-      case ValType::F32:
-      case ValType::F64:
-        break;
-      case ValType::FuncRef:
-      case ValType::AnyRef:
-      case ValType::NullRef:
-      case ValType::Ref:
-        needRef(joinRegPtr_);
-        break;
-    }
-  }
-
-  void maybeUnreserveJoinRegI(ResultType type) {
+
+    for (ABIResultIter iter(type); !iter.done(); iter.next()) {
+      ABIResult result = iter.cur();
+      // Register results are visited first; when we see a stack result we're
+      // done.
+      if (!result.inRegister()) {
+        return;
+      }
+      switch (result.type().code()) {
+        case ValType::I32:
+          needI32(RegI32(result.gpr()));
+          break;
+        case ValType::I64:
+          needI64(RegI64(result.gpr64()));
+          break;
+        case ValType::F32:
+          if (which == RegKind::All) {
+            needF32(RegF32(result.fpr()));
+          }
+          break;
+        case ValType::F64:
+          if (which == RegKind::All) {
+            needF64(RegF64(result.fpr()));
+          }
+          break;
+        case ValType::FuncRef:
+        case ValType::AnyRef:
+        case ValType::Ref:
+          needRef(RegPtr(result.gpr()));
+          break;
+        case ValType::NullRef:
+          MOZ_CRASH("unexpected nullref result");
+      }
+    }
+  }
+
+  inline void freeResultRegisters(ResultType type, RegKind which) {
     if (type.empty()) {
       return;
     }
-    MOZ_ASSERT(type.length() == 1, "multi-value joins unimplemented");
-    switch (type[0].code()) {
-      case ValType::I32:
-        freeI32(joinRegI32_);
-        break;
-      case ValType::I64:
-        freeI64(joinRegI64_);
-        break;
-      case ValType::F32:
-      case ValType::F64:
-        break;
-      case ValType::FuncRef:
-      case ValType::AnyRef:
-      case ValType::NullRef:
-      case ValType::Ref:
-        freeRef(joinRegPtr_);
-        break;
-    }
-  }
-
-  void maybeReserveJoinReg(ResultType type) {
-    if (type.empty()) {
-      return;
-    }
-    MOZ_ASSERT(type.length() == 1, "multi-value joins unimplemented");
-    switch (type[0].code()) {
-      case ValType::I32:
-        needI32(joinRegI32_);
-        break;
-      case ValType::I64:
-        needI64(joinRegI64_);
-        break;
-      case ValType::F32:
-        needF32(joinRegF32_);
-        break;
-      case ValType::F64:
-        needF64(joinRegF64_);
-        break;
-      case ValType::Ref:
-      case ValType::NullRef:
-      case ValType::FuncRef:
-      case ValType::AnyRef:
-        needRef(joinRegPtr_);
-        break;
-    }
-  }
-
-  void maybeUnreserveJoinReg(ResultType type) {
-    if (type.empty()) {
-      return;
-    }
-    MOZ_ASSERT(type.length() == 1, "multi-value joins unimplemented");
-    switch (type[0].code()) {
-      case ValType::I32:
-        freeI32(joinRegI32_);
-        break;
-      case ValType::I64:
-        freeI64(joinRegI64_);
-        break;
-      case ValType::F32:
-        freeF32(joinRegF32_);
-        break;
-      case ValType::F64:
-        freeF64(joinRegF64_);
-        break;
-      case ValType::Ref:
-      case ValType::NullRef:
-      case ValType::FuncRef:
-      case ValType::AnyRef:
-        freeRef(joinRegPtr_);
-        break;
-    }
+
+    for (ABIResultIter iter(type); !iter.done(); iter.next()) {
+      ABIResult result = iter.cur();
+      // Register results are visited first; when we see a stack result we're
+      // done.
+      if (!result.inRegister()) {
+        return;
+      }
+      switch (result.type().code()) {
+        case ValType::I32:
+          freeI32(RegI32(result.gpr()));
+          break;
+        case ValType::I64:
+          freeI64(RegI64(result.gpr64()));
+          break;
+        case ValType::F32:
+          if (which == RegKind::All) {
+            freeF32(RegF32(result.fpr()));
+          }
+          break;
+        case ValType::F64:
+          if (which == RegKind::All) {
+            freeF64(RegF64(result.fpr()));
+          }
+          break;
+        case ValType::FuncRef:
+        case ValType::AnyRef:
+        case ValType::Ref:
+          freeRef(RegPtr(result.gpr()));
+          break;
+        case ValType::NullRef:
+          MOZ_CRASH("unexpected nullref result");
+      }
+    }
+  }
+
+  void needIntegerResultRegisters(ResultType type) {
+    needResultRegisters(type, RegKind::OnlyGPRs);
+  }
+  void freeIntegerResultRegisters(ResultType type) {
+    freeResultRegisters(type, RegKind::OnlyGPRs);
+  }
+
+  void needResultRegisters(ResultType type) {
+    needResultRegisters(type, RegKind::All);
+  }
+  void freeResultRegisters(ResultType type) {
+    freeResultRegisters(type, RegKind::All);
+  }
+
+  void assertResultRegistersAvailable(ResultType type) {
+#ifdef DEBUG
+    for (ABIResultIter iter(type); !iter.done(); iter.next()) {
+      ABIResult result = iter.cur();
+      if (!result.inRegister()) {
+        return;
+      }
+      switch (result.type().code()) {
+        case ValType::I32:
+          MOZ_ASSERT(isAvailableI32(RegI32(result.gpr())));
+          break;
+        case ValType::I64:
+          MOZ_ASSERT(isAvailableI64(RegI64(result.gpr64())));
+          break;
+        case ValType::F32:
+          MOZ_ASSERT(isAvailableF32(RegF32(result.fpr())));
+          break;
+        case ValType::F64:
+          MOZ_ASSERT(isAvailableF64(RegF64(result.fpr())));
+          break;
+        case ValType::FuncRef:
+        case ValType::AnyRef:
+        case ValType::Ref:
+          MOZ_ASSERT(isAvailableRef(RegPtr(result.gpr())));
+          break;
+        case ValType::NullRef:
+          MOZ_CRASH("unexpected nullref result");
+      }
+    }
+#endif
+  }
+
+  void captureResultRegisters(ResultType type) {
+    assertResultRegistersAvailable(type);
+    needResultRegisters(type);
   }
 
   ////////////////////////////////////////////////////////////
   //
   // Value stack and spilling.
   //
   // The value stack facilitates some on-the-fly register allocation
   // and immediate-constant use.  It tracks constants, latent
@@ -3278,16 +3469,30 @@ class BaseCompiler final : public BaseCo
         }
         default: {
           break;
         }
       }
     }
   }
 
+  void saveTempPtr(RegPtr r) final {
+    MOZ_ASSERT(!ra.isAvailablePtr(r));
+    fr.pushPtr(r);
+    ra.freePtr(r);
+    MOZ_ASSERT(ra.isAvailablePtr(r));
+  }
+
+  void restoreTempPtr(RegPtr r) final {
+    MOZ_ASSERT(ra.isAvailablePtr(r));
+    ra.needPtr(r);
+    fr.popPtr(r);
+    MOZ_ASSERT(!ra.isAvailablePtr(r));
+  }
+
   // Various methods for creating a stack map.  Stack maps are indexed by the
   // lowest address of the instruction immediately *after* the instruction of
   // interest.  In practice that means either: the return point of a call, the
   // instruction immediately after a trap instruction (the "resume"
   // instruction), or the instruction immediately following a no-op (when
   // debugging is enabled).
 
   // Create a vanilla stack map.
@@ -3755,161 +3960,318 @@ class BaseCompiler final : public BaseCo
     Stk& v = stk_.back();
     if (v.kind() != Stk::LocalI32) {
       return false;
     }
     *local = v.slot();
     return true;
   }
 
-  // TODO / OPTIMIZE (Bug 1316818): At the moment we use ReturnReg
-  // for JoinReg.  It is possible other choices would lead to better
-  // register allocation, as ReturnReg is often first in the
-  // register set and will be heavily wanted by the register
-  // allocator that uses takeFirst().
+  // TODO / OPTIMIZE (Bug 1316818): At the moment we use the Wasm
+  // inter-procedure ABI for block returns, which allocates ReturnReg as the
+  // single block result register.  It is possible other choices would lead to
+  // better register allocation, as ReturnReg is often first in the register set
+  // and will be heavily wanted by the register allocator that uses takeFirst().
   //
   // Obvious options:
   //  - pick a register at the back of the register set
   //  - pick a random register per block (different blocks have
   //    different join regs)
-  //
-  // On the other hand, we sync() before every block and only the
-  // JoinReg is live out of the block.  But on the way out, we
-  // currently pop the JoinReg before freeing regs to be discarded,
-  // so there is a real risk of some pointless shuffling there.  If
-  // we instead integrate the popping of the join reg into the
-  // popping of the stack we can just use the JoinReg as it will
-  // become available in that process.
-
-  MOZ_MUST_USE Maybe<AnyReg> popJoinRegUnlessVoid(ResultType type) {
-    if (type.empty()) {
-      return Nothing();
-    }
-    MOZ_ASSERT(type.length() == 1, "multi-value return unimplemented");
-    switch (type[0].code()) {
-      case ValType::I32: {
-        DebugOnly<Stk::Kind> k(stk_.back().kind());
-        MOZ_ASSERT(k == Stk::RegisterI32 || k == Stk::ConstI32 ||
-                   k == Stk::MemI32 || k == Stk::LocalI32);
-        return Some(AnyReg(popI32(joinRegI32_)));
-      }
-      case ValType::I64: {
-        DebugOnly<Stk::Kind> k(stk_.back().kind());
-        MOZ_ASSERT(k == Stk::RegisterI64 || k == Stk::ConstI64 ||
-                   k == Stk::MemI64 || k == Stk::LocalI64);
-        return Some(AnyReg(popI64(joinRegI64_)));
-      }
-      case ValType::F64: {
-        DebugOnly<Stk::Kind> k(stk_.back().kind());
-        MOZ_ASSERT(k == Stk::RegisterF64 || k == Stk::ConstF64 ||
-                   k == Stk::MemF64 || k == Stk::LocalF64);
-        return Some(AnyReg(popF64(joinRegF64_)));
-      }
-      case ValType::F32: {
-        DebugOnly<Stk::Kind> k(stk_.back().kind());
-        MOZ_ASSERT(k == Stk::RegisterF32 || k == Stk::ConstF32 ||
-                   k == Stk::MemF32 || k == Stk::LocalF32);
-        return Some(AnyReg(popF32(joinRegF32_)));
-      }
-      case ValType::Ref:
-      case ValType::NullRef:
-      case ValType::FuncRef:
-      case ValType::AnyRef: {
-        DebugOnly<Stk::Kind> k(stk_.back().kind());
-        MOZ_ASSERT(k == Stk::RegisterRef || k == Stk::ConstRef ||
-                   k == Stk::MemRef || k == Stk::LocalRef);
-        return Some(AnyReg(popRef(joinRegPtr_)));
-      }
-    }
-    MOZ_CRASH("Compiler bug: unexpected expression type");
-  }
-
-  // If we ever start not sync-ing on entry to Block (but instead try to sync
-  // lazily) then this may start asserting because it does not spill the
-  // joinreg if the joinreg is already allocated.  Note, it *can't* spill the
-  // joinreg in the contexts it's being used, so some other solution will need
-  // to be found.
-
-  MOZ_MUST_USE Maybe<AnyReg> captureJoinRegUnlessVoid(ResultType type) {
+
+  void popRegisterResults(ABIResultIter& iter) {
+    // Pop register results.  Note that in the single-value case, popping to a
+    // register may cause a sync(); for multi-value we sync'd already.
+    for (; !iter.done(); iter.next()) {
+      const ABIResult& result = iter.cur();
+      if (!result.inRegister()) {
+        // TODO / OPTIMIZE: We sync here to avoid solving the general parallel
+        // move problem in popStackResults.  However we could avoid syncing the
+        // values that are going to registers anyway, if they are already in
+        // registers.
+        sync();
+        break;
+      }
+      switch (result.type().code()) {
+        case ValType::I32:
+          popI32(RegI32(result.gpr()));
+          break;
+        case ValType::I64:
+          popI64(RegI64(result.gpr64()));
+          break;
+        case ValType::F32:
+          popF32(RegF32(result.fpr()));
+          break;
+        case ValType::F64:
+          popF64(RegF64(result.fpr()));
+          break;
+        case ValType::FuncRef:
+        case ValType::AnyRef:
+        case ValType::Ref:
+          popRef(RegPtr(result.gpr()));
+          break;
+        default:
+          MOZ_CRASH("bad result type");
+      }
+    }
+  }
+
+  void popStackResults(ABIResultIter& iter, StackHeight stackBase) {
+    MOZ_ASSERT(!iter.done());
+
+    // The iterator should be advanced beyond register results, and register
+    // results should be popped already from the value stack.
+    uint32_t alreadyPopped = iter.index();
+
+    // At this point, only stack arguments are remaining.  Iterate through them
+    // to measure how much stack space they will take up.
+    for (; !iter.done(); iter.next()) {
+      MOZ_ASSERT(iter.cur().onStack());
+    }
+
+    // Calculate the space needed to store stack results, in bytes.
+    uint32_t stackResultBytes = iter.stackBytesConsumedSoFar();
+    MOZ_ASSERT(stackResultBytes);
+
+    // Compute the stack height including the stack results.  Note that it's
+    // possible that this call expands the stack, for example if some of the
+    // results are supplied by constants and so are not already on the machine
+    // stack.
+    uint32_t endHeight = fr.prepareStackResultArea(stackBase, stackResultBytes);
+
+    // Find a free GPR to use when shuffling stack values.  If none is
+    // available, push ReturnReg and restore it after we're done.
+    bool saved = false;
+    RegPtr temp = ra.needTempPtr(RegPtr(ReturnReg), &saved);
+
+    // The sequence of Stk values is in the same order on the machine stack as
+    // the result locations, but there is a complication: constant values are
+    // not actually pushed on the machine stack.  (At this point registers and
+    // locals have been spilled already.)  So, moving the Stk values into place
+    // isn't simply a shuffle-down or shuffle-up operation.  There is a part of
+    // the Stk sequence that shuffles toward the FP, a part that's already in
+    // place, and a part that shuffles toward the SP.  After shuffling, we have
+    // to materialize the constants.
+
+    // Shuffle mem values toward the frame pointer, copying deepest values
+    // first.  Stop when we run out of results, get to a register result, or
+    // find a Stk value that is closer to the FP than the result.
+    for (iter.switchToPrev(); !iter.done(); iter.prev()) {
+      const ABIResult& result = iter.cur();
+      if (!result.onStack()) {
+        break;
+      }
+      MOZ_ASSERT(result.stackOffset() < stackResultBytes);
+      uint32_t destHeight = endHeight - result.stackOffset();
+      uint32_t stkBase = stk_.length() - (iter.count() - alreadyPopped);
+      Stk& v = stk_[stkBase + iter.index()];
+      if (v.isMem()) {
+        uint32_t srcHeight = v.offs();
+        if (srcHeight <= destHeight) {
+          break;
+        }
+        fr.shuffleStackResultsTowardFP(srcHeight, destHeight, result.size(),
+                                       temp);
+      }
+    }
+
+    // Reset iterator and skip register results.
+    for (iter.reset(); !iter.done(); iter.next()) {
+      if (iter.cur().onStack()) {
+        break;
+      }
+    }
+
+    // Revisit top stack values, shuffling mem values toward the stack pointer,
+    // copying shallowest values first.
+    for (; !iter.done(); iter.next()) {
+      const ABIResult& result = iter.cur();
+      MOZ_ASSERT(result.onStack());
+      MOZ_ASSERT(result.stackOffset() < stackResultBytes);
+      uint32_t destHeight = endHeight - result.stackOffset();
+      Stk& v = stk_[stk_.length() - (iter.index() - alreadyPopped) - 1];
+      if (v.isMem()) {
+        uint32_t srcHeight = v.offs();
+        if (srcHeight >= destHeight) {
+          break;
+        }
+        fr.shuffleStackResultsTowardSP(srcHeight, destHeight, result.size(),
+                                       temp);
+      }
+    }
+
+    // Reset iterator and skip register results, which are already popped off
+    // the value stack.
+    for (iter.reset(); !iter.done(); iter.next()) {
+      if (iter.cur().onStack()) {
+        break;
+      }
+    }
+
+    // Materialize constants and pop the remaining items from the value stack.
+    for (; !iter.done(); iter.next()) {
+      const ABIResult& result = iter.cur();
+      uint32_t resultHeight = endHeight - result.stackOffset();
+      Stk& v = stk_.back();
+      switch (v.kind()) {
+        case Stk::ConstI32:
+        case Stk::ConstF32:
+          // Rely on the fact that Stk stores its immediate values in a union,
+          // and that the bits of an f32 will be in the i32.
+          fr.storeImmediateToStack(v.i32val_, resultHeight, temp);
+          break;
+        case Stk::ConstI64:
+        case Stk::ConstF64:
+          // Likewise, rely on f64 bits being punned to i64.
+          fr.storeImmediateToStack(v.i64val_, resultHeight, temp);
+          break;
+        case Stk::ConstRef:
+          if (sizeof(intptr_t) == sizeof(int32_t)) {
+            fr.storeImmediateToStack(int32_t(v.refval_), resultHeight, temp);
+          } else {
+            fr.storeImmediateToStack(int64_t(v.refval_), resultHeight, temp);
+          }
+          break;
+        case Stk::MemRef:
+          // Update bookkeeping as we pop the Stk entry.
+          stackMapGenerator_.memRefsOnStk--;
+          break;
+        default:
+          MOZ_ASSERT(v.isMem());
+          break;
+      }
+      stk_.popBack();
+    }
+
+    ra.freeTempPtr(temp, saved);
+
+    // This will pop the stack if needed.
+    fr.finishStackResultArea(stackBase, stackResultBytes);
+  }
+
+  enum class ContinuationKind { Fallthrough, Jump };
+
+  void popBlockResults(ResultType type, StackHeight stackBase,
+                       ContinuationKind kind) {
     if (type.empty()) {
-      return Nothing();
-    }
-    MOZ_ASSERT(type.length() == 1, "multi-value return unimplemented");
-    switch (type[0].code()) {
-      case ValType::I32:
-        MOZ_ASSERT(isAvailableI32(joinRegI32_));
-        needI32(joinRegI32_);
-        return Some(AnyReg(joinRegI32_));
-      case ValType::I64:
-        MOZ_ASSERT(isAvailableI64(joinRegI64_));
-        needI64(joinRegI64_);
-        return Some(AnyReg(joinRegI64_));
-      case ValType::F32:
-        MOZ_ASSERT(isAvailableF32(joinRegF32_));
-        needF32(joinRegF32_);
-        return Some(AnyReg(joinRegF32_));
-      case ValType::F64:
-        MOZ_ASSERT(isAvailableF64(joinRegF64_));
-        needF64(joinRegF64_);
-        return Some(AnyReg(joinRegF64_));
-      case ValType::Ref:
-      case ValType::NullRef:
-      case ValType::FuncRef:
-      case ValType::AnyRef:
-        MOZ_ASSERT(isAvailableRef(joinRegPtr_));
-        needRef(joinRegPtr_);
-        return Some(AnyReg(joinRegPtr_));
-    }
-    MOZ_CRASH("Compiler bug: unexpected type");
-  }
-
-  void pushJoinRegUnlessVoid(const Maybe<AnyReg>& r) {
-    if (!r) {
+      return;
+    }
+
+    ABIResultIter iter(type);
+    popRegisterResults(iter);
+    if (!iter.done()) {
+      popStackResults(iter, stackBase);
+    } else if (kind == ContinuationKind::Jump) {
+      fr.popStackBeforeBranch(stackBase, type);
+    }
+  }
+
+  Stk captureStackResult(const ABIResult& result, uint32_t stackResultBytes) {
+    MOZ_ASSERT(result.onStack());
+    uint32_t offs = fr.locateStackResult(result, controlItem().stackHeight,
+                                         stackResultBytes);
+    return Stk::StackResult(result.type(), offs);
+  }
+
+  void pushBlockResults(ResultType type) {
+    if (type.empty()) {
       return;
     }
-    switch (r->tag) {
-      case AnyReg::I32:
-        pushI32(r->i32());
-        break;
-      case AnyReg::I64:
-        pushI64(r->i64());
-        break;
-      case AnyReg::F64:
-        pushF64(r->f64());
-        break;
-      case AnyReg::F32:
-        pushF32(r->f32());
-        break;
-      case AnyReg::REF:
-        pushRef(r->ref());
-        break;
-    }
-  }
-
-  void freeJoinRegUnlessVoid(const Maybe<AnyReg>& r) {
-    if (!r) {
-      return;
-    }
-    switch (r->tag) {
-      case AnyReg::I32:
-        freeI32(r->i32());
-        break;
-      case AnyReg::I64:
-        freeI64(r->i64());
-        break;
-      case AnyReg::F64:
-        freeF64(r->f64());
-        break;
-      case AnyReg::F32:
-        freeF32(r->f32());
-        break;
-      case AnyReg::REF:
-        freeRef(r->ref());
-        break;
-    }
+
+    // We need to push the results in reverse order, so first iterate through
+    // all results to determine the locations of stack result types.
+    ABIResultIter iter(type);
+    while (!iter.done()) {
+      iter.next();
+    }
+    uint32_t stackResultBytes = iter.stackBytesConsumedSoFar();
+
+    for (iter.switchToPrev(); !iter.done(); iter.prev()) {
+      const ABIResult& result = iter.cur();
+      if (!result.onStack()) {
+        break;
+      }
+      Stk v = captureStackResult(result, stackResultBytes);
+      push(v);
+      if (v.kind() == Stk::MemRef) {
+        stackMapGenerator_.memRefsOnStk++;
+      }
+    }
+
+    for (; !iter.done(); iter.prev()) {
+      const ABIResult& result = iter.cur();
+      MOZ_ASSERT(result.inRegister());
+      switch (result.type().code()) {
+        case ValType::I32:
+          pushI32(RegI32(result.gpr()));
+          break;
+        case ValType::I64:
+          pushI64(RegI64(result.gpr64()));
+          break;
+        case ValType::F32:
+          pushF32(RegF32(result.fpr()));
+          break;
+        case ValType::F64:
+          pushF64(RegF64(result.fpr()));
+          break;
+        case ValType::FuncRef:
+        case ValType::AnyRef:
+        case ValType::Ref:
+          pushRef(RegPtr(result.gpr()));
+          break;
+        case ValType::NullRef:
+          MOZ_CRASH("unexpected nullref result");
+      }
+    }
+  }
+
+  // A combination of popBlockResults + pushBlockResults, to shuffle the top
+  // stack values into the expected block result locations for the given type.
+  StackHeight topBlockResults(ResultType type) {
+    if (type.empty()) {
+      return fr.stackHeight();
+    }
+    StackHeight base = fr.stackResultsBase(stackConsumed(type.length()));
+    popBlockResults(type, base, ContinuationKind::Fallthrough);
+    pushBlockResults(type);
+    return base;
+  }
+
+  // Conditional branches with fallthrough are preceded by a topBlockResults, so
+  // we know that there are no stack results that need to be materialized.  In
+  // that case, we can just shuffle the whole block down before popping the
+  // stack.
+  void shuffleStackResultsBeforeBranch(StackHeight srcHeight,
+                                       StackHeight destHeight,
+                                       ResultType type) {
+    uint32_t stackResultBytes = 0;
+
+    if (ABIResultIter::HasStackResults(type)) {
+      MOZ_ASSERT(stk_.length() >= type.length());
+      ABIResultIter iter(type);
+      for (; !iter.done(); iter.next()) {
+#ifdef DEBUG
+        const ABIResult& result = iter.cur();
+        const Stk& v = stk_[stk_.length() - iter.index() - 1];
+        MOZ_ASSERT(v.isMem() == result.onStack());
+#endif
+      }
+      stackResultBytes = iter.stackBytesConsumedSoFar();
+
+      if (stackResultBytes) {
+        // Find a free GPR to use when shuffling stack values.  If none is
+        // available, push ReturnReg and restore it after we're done.
+        bool saved = false;
+        RegPtr temp = ra.needTempPtr(RegPtr(ReturnReg), &saved);
+        fr.shuffleStackResultsTowardFP(srcHeight, destHeight, stackResultBytes,
+                                       temp);
+        ra.freeTempPtr(temp, saved);
+      }
+    }
+
+    fr.popStackBeforeBranch(destHeight, stackResultBytes);
   }
 
   // Return the amount of execution stack consumed by the top numval
   // values on the value stack.
 
   size_t stackConsumed(size_t numval) {
     size_t size = 0;
     MOZ_ASSERT(numval <= stk_.length());
@@ -4044,22 +4406,24 @@ class BaseCompiler final : public BaseCo
   }
 
 #endif
 
   ////////////////////////////////////////////////////////////
   //
   // Control stack
 
-  void initControl(Control& item) {
+  void initControl(Control& item, ResultType params) {
     // Make sure the constructor was run properly
     MOZ_ASSERT(!item.stackHeight.isValid() && item.stackSize == UINT32_MAX);
 
-    item.stackHeight = fr.stackHeight();
-    item.stackSize = stk_.length();
+    uint32_t paramCount = deadCode_ ? 0 : params.length();
+    uint32_t stackParamSize = stackConsumed(paramCount);
+    item.stackHeight = fr.stackResultsBase(stackParamSize);
+    item.stackSize = stk_.length() - paramCount;
     item.deadOnArrival = deadCode_;
     item.bceSafeOnEntry = bceSafe_;
   }
 
   Control& controlItem() { return iter_.controlItem(); }
 
   Control& controlItem(uint32_t relativeDepth) {
     return iter_.controlItem(relativeDepth);
@@ -4844,23 +5208,16 @@ class BaseCompiler final : public BaseCo
 
   RegPtr captureReturnedRef() {
     RegPtr r = RegPtr(ReturnReg);
     MOZ_ASSERT(isAvailableRef(r));
     needRef(r);
     return r;
   }
 
-  void returnCleanup(bool popStack) {
-    if (popStack) {
-      fr.popStackBeforeBranch(controlOutermost().stackHeight);
-    }
-    masm.jump(&returnLabel_);
-  }
-
   void checkDivideByZeroI32(RegI32 rhs) {
     Label nonZero;
     masm.branchTest32(Assembler::NonZero, rhs, rhs, &nonZero);
     trap(Trap::IntegerDivideByZero);
     masm.bind(&nonZero);
   }
 
   void checkDivideByZeroI64(RegI64 r) {
@@ -6589,18 +6946,19 @@ class BaseCompiler final : public BaseCo
       } f32;
       struct {
         RegF64 lhs;
         RegF64 rhs;
       } f64;
     };
 
     Label* const label;             // The target of the branch, never NULL
-    const StackHeight stackHeight;  // The value to pop to along the taken edge,
-                                    // unless !hasPop()
+    const StackHeight stackHeight;  // The stack base above which to place
+                                    // stack-spilled block results, if
+                                    // hasBlockResults().
     const bool invertBranch;        // If true, invert the sense of the branch
     const ResultType resultType;    // The result propagated along the edges
 
     explicit BranchState(Label* label)
         : label(label),
           stackHeight(StackHeight::Invalid()),
           invertBranch(false),
           resultType(ResultType::Empty()) {}
@@ -6613,17 +6971,17 @@ class BaseCompiler final : public BaseCo
 
     BranchState(Label* label, StackHeight stackHeight, bool invertBranch,
                 ResultType resultType)
         : label(label),
           stackHeight(stackHeight),
           invertBranch(invertBranch),
           resultType(resultType) {}
 
-    bool hasPop() const { return stackHeight.isValid(); }
+    bool hasBlockResults() const { return stackHeight.isValid(); }
   };
 
   void setLatentCompare(Assembler::Condition compareOp, ValType operandType) {
     latentOp_ = LatentOp::Compare;
     latentType_ = operandType;
     latentIntCmp_ = compareOp;
   }
 
@@ -6673,32 +7031,35 @@ class BaseCompiler final : public BaseCo
   // Cond is either Assembler::Condition or Assembler::DoubleCondition.
   //
   // Lhs is RegI32, RegI64, or RegF32, or RegF64.
   //
   // Rhs is either the same as Lhs, or an immediate expression compatible with
   // Lhs "when applicable".
 
   template <typename Cond, typename Lhs, typename Rhs>
-  void jumpConditionalWithJoinReg(BranchState* b, Cond cond, Lhs lhs, Rhs rhs) {
-    Maybe<AnyReg> r = popJoinRegUnlessVoid(b->resultType);
-
-    if (b->hasPop() && fr.willPopStackBeforeBranch(b->stackHeight)) {
-      Label notTaken;
-      branchTo(b->invertBranch ? cond : Assembler::InvertCondition(cond), lhs,
-               rhs, &notTaken);
-      fr.popStackBeforeBranch(b->stackHeight);
-      masm.jump(b->label);
-      masm.bind(&notTaken);
-    } else {
-      branchTo(b->invertBranch ? Assembler::InvertCondition(cond) : cond, lhs,
-               rhs, b->label);
-    }
-
-    pushJoinRegUnlessVoid(r);
+  void jumpConditionalWithResults(BranchState* b, Cond cond, Lhs lhs, Rhs rhs) {
+    if (b->hasBlockResults()) {
+      StackHeight resultsBase = topBlockResults(b->resultType);
+      if (b->stackHeight != resultsBase) {
+        Label notTaken;
+        branchTo(b->invertBranch ? cond : Assembler::InvertCondition(cond), lhs,
+                 rhs, &notTaken);
+
+        // Shuffle stack args.
+        shuffleStackResultsBeforeBranch(resultsBase, b->stackHeight,
+                                        b->resultType);
+        masm.jump(b->label);
+        masm.bind(&notTaken);
+        return;
+      }
+    }
+
+    branchTo(b->invertBranch ? Assembler::InvertCondition(cond) : cond, lhs,
+             rhs, b->label);
   }
 
   // sniffConditionalControl{Cmp,Eqz} may modify the latentWhatever_ state in
   // the BaseCompiler so that a subsequent conditional branch can be compiled
   // optimally.  emitBranchSetup() and emitBranchPerform() will consume that
   // state.  If the latter methods are not called because deadCode_ is true
   // then the compiler MUST instead call resetLatentOp() to reset the state.
 
@@ -6740,21 +7101,20 @@ class BaseCompiler final : public BaseCo
   MOZ_MUST_USE bool emitStore(ValType resultType, Scalar::Type viewType);
   MOZ_MUST_USE bool storeCommon(MemoryAccessDesc* access, ValType resultType);
   MOZ_MUST_USE bool emitSelect(bool typed);
 
   template <bool isSetLocal>
   MOZ_MUST_USE bool emitSetOrTeeLocal(uint32_t slot);
 
   void endBlock(ResultType type);
-  void endLoop(ResultType type);
-  void endIfThen();
+  void endIfThen(ResultType type);
   void endIfThenElse(ResultType type);
 
-  void doReturn(bool popStack);
+  void doReturn(ContinuationKind kind);
   void pushReturnValueOfCall(const FunctionCall& call, ValType type);
   void pushReturnValueOfCall(const FunctionCall& call, MIRType type);
 
   void emitCompareI32(Assembler::Condition compareOp, ValType compareType);
   void emitCompareI64(Assembler::Condition compareOp, ValType compareType);
   void emitCompareF32(Assembler::DoubleCondition compareOp,
                       ValType compareType);
   void emitCompareF64(Assembler::DoubleCondition compareOp,
@@ -8015,17 +8375,20 @@ bool BaseCompiler::sniffConditionalContr
       setLatentEqz(operandType);
       return true;
     default:
       return false;
   }
 }
 
 void BaseCompiler::emitBranchSetup(BranchState* b) {
-  maybeReserveJoinReg(b->resultType);
+  // Avoid allocating operands to latentOp_ to result registers.
+  if (b->hasBlockResults()) {
+    needResultRegisters(b->resultType);
+  }
 
   // Set up fields so that emitBranchPerform() need not switch on latentOp_.
   switch (latentOp_) {
     case LatentOp::None: {
       latentIntCmp_ = Assembler::NotEqual;
       latentType_ = ValType::I32;
       b->i32.lhs = popI32();
       b->i32.rhsImm = true;
@@ -8082,51 +8445,53 @@ void BaseCompiler::emitBranchSetup(Branc
         default: {
           MOZ_CRASH("Unexpected type for LatentOp::Eqz");
         }
       }
       break;
     }
   }
 
-  maybeUnreserveJoinReg(b->resultType);
+  if (b->hasBlockResults()) {
+    freeResultRegisters(b->resultType);
+  }
 }
 
 void BaseCompiler::emitBranchPerform(BranchState* b) {
   switch (latentType_.code()) {
     case ValType::I32: {
       if (b->i32.rhsImm) {
-        jumpConditionalWithJoinReg(b, latentIntCmp_, b->i32.lhs,
+        jumpConditionalWithResults(b, latentIntCmp_, b->i32.lhs,
                                    Imm32(b->i32.imm));
       } else {
-        jumpConditionalWithJoinReg(b, latentIntCmp_, b->i32.lhs, b->i32.rhs);
+        jumpConditionalWithResults(b, latentIntCmp_, b->i32.lhs, b->i32.rhs);
         freeI32(b->i32.rhs);
       }
       freeI32(b->i32.lhs);
       break;
     }
     case ValType::I64: {
       if (b->i64.rhsImm) {
-        jumpConditionalWithJoinReg(b, latentIntCmp_, b->i64.lhs,
+        jumpConditionalWithResults(b, latentIntCmp_, b->i64.lhs,
                                    Imm64(b->i64.imm));
       } else {
-        jumpConditionalWithJoinReg(b, latentIntCmp_, b->i64.lhs, b->i64.rhs);
+        jumpConditionalWithResults(b, latentIntCmp_, b->i64.lhs, b->i64.rhs);
         freeI64(b->i64.rhs);
       }
       freeI64(b->i64.lhs);
       break;
     }
     case ValType::F32: {
-      jumpConditionalWithJoinReg(b, latentDoubleCmp_, b->f32.lhs, b->f32.rhs);
+      jumpConditionalWithResults(b, latentDoubleCmp_, b->f32.lhs, b->f32.rhs);
       freeF32(b->f32.lhs);
       freeF32(b->f32.rhs);
       break;
     }
     case ValType::F64: {
-      jumpConditionalWithJoinReg(b, latentDoubleCmp_, b->f64.lhs, b->f64.rhs);
+      jumpConditionalWithResults(b, latentDoubleCmp_, b->f64.lhs, b->f64.rhs);
       freeF64(b->f64.lhs);
       freeF64(b->f64.rhs);
       break;
     }
     default: {
       MOZ_CRASH("Unexpected type for LatentOp::Compare");
     }
   }
@@ -8142,75 +8507,78 @@ void BaseCompiler::emitBranchPerform(Bra
 //    branching out of the block or falling out at the end be sure to
 //    pop the appropriate stacks back to where they were on entry, while
 //    preserving the exit value.
 //  - A continue branch in a loop is much like an exit branch, but the branch
 //    value must not be preserved.
 //  - The exit value is always in a designated join register (type dependent).
 
 bool BaseCompiler::emitBlock() {
-  if (!iter_.readBlock()) {
+  ResultType params;
+  if (!iter_.readBlock(&params)) {
     return false;
   }
 
   if (!deadCode_) {
     sync();  // Simplifies branching out from block
   }
 
-  initControl(controlItem());
+  initControl(controlItem(), params);
 
   return true;
 }
 
 void BaseCompiler::endBlock(ResultType type) {
   Control& block = controlItem();
 
-  // Save the value.
-  Maybe<AnyReg> r;
-  if (!deadCode_) {
-    r = popJoinRegUnlessVoid(type);
+  if (deadCode_) {
+    // Block does not fall through; reset stack.
+    fr.resetStackHeight(block.stackHeight, type);
+    popValueStackTo(block.stackSize);
+  } else {
+    // If the block label is used, we have a control join, so we need to shuffle
+    // fallthrough values into place.  Otherwise if it's not a control join, we
+    // can leave the value stack alone.
+    MOZ_ASSERT(stk_.length() == block.stackSize + type.length());
+    if (block.label.used()) {
+      popBlockResults(type, block.stackHeight, ContinuationKind::Fallthrough);
+    }
     block.bceSafeOnExit &= bceSafe_;
   }
 
-  // Leave the block.
-  fr.popStackOnBlockExit(block.stackHeight, deadCode_);
-  popValueStackTo(block.stackSize);
-
   // Bind after cleanup: branches out will have popped the stack.
   if (block.label.used()) {
     masm.bind(&block.label);
-    // No value was provided by the fallthrough but the branch out will
-    // have stored one in joinReg, so capture that.
     if (deadCode_) {
-      r = captureJoinRegUnlessVoid(type);
-    }
-    deadCode_ = false;
+      captureResultRegisters(type);
+      deadCode_ = false;
+    }
+    pushBlockResults(type);
   }
 
   bceSafe_ = block.bceSafeOnExit;
-
-  // Retain the value stored in joinReg by all paths, if there are any.
-  if (!deadCode_) {
-    pushJoinRegUnlessVoid(r);
-  }
 }
 
 bool BaseCompiler::emitLoop() {
-  if (!iter_.readLoop()) {
+  ResultType params;
+  if (!iter_.readLoop(&params)) {
     return false;
   }
 
   if (!deadCode_) {
     sync();  // Simplifies branching out from block
   }
 
-  initControl(controlItem());
+  initControl(controlItem(), params);
   bceSafe_ = 0;
 
   if (!deadCode_) {
+    // Loop entry is a control join, so shuffle the entry parameters into the
+    // well-known locations.
+    topBlockResults(params);
     masm.nopAlign(CodeAlignment);
     masm.bind(&controlItem(0).label);
     if (!addInterruptCheck()) {
       return false;
     }
   }
 
   return true;
@@ -8226,172 +8594,209 @@ bool BaseCompiler::emitLoop() {
 //           (begin (br 1) (unreachable))
 //           (begin (unreachable)))
 //       (i32.const 1))
 //
 // The branch causes neither of the unreachable expressions to be
 // evaluated.
 
 bool BaseCompiler::emitIf() {
+  ResultType params;
   Nothing unused_cond;
-  if (!iter_.readIf(&unused_cond)) {
+  if (!iter_.readIf(&params, &unused_cond)) {
     return false;
   }
 
   BranchState b(&controlItem().otherLabel, InvertBranch(true));
   if (!deadCode_) {
     emitBranchSetup(&b);
     sync();
+    // Because params can flow immediately to results in the case of an empty
+    // "then" or "else" block, and the result of an if/then is a join in
+    // general, we shuffle params eagerly to the result allocations.
+    topBlockResults(params);
   } else {
     resetLatentOp();
   }
 
-  initControl(controlItem());
+  initControl(controlItem(), params);
 
   if (!deadCode_) {
     emitBranchPerform(&b);
   }
 
   return true;
 }
 
-void BaseCompiler::endIfThen() {
+void BaseCompiler::endIfThen(ResultType type) {
   Control& ifThen = controlItem();
 
-  fr.popStackOnBlockExit(ifThen.stackHeight, deadCode_);
-  popValueStackTo(ifThen.stackSize);
+  // The parameters to the "if" logically flow to both the "then" and "else"
+  // blocks, but the "else" block is empty.  Since we know that the "if"
+  // type-checks, that means that the "else" parameters are the "else" results,
+  // and that the "if"'s result type is the same as its parameter type.
+
+  if (deadCode_) {
+    // "then" arm does not fall through; reset stack.
+    fr.resetStackHeight(ifThen.stackHeight, type);
+    popValueStackTo(ifThen.stackSize);
+    if (!ifThen.deadOnArrival) {
+      captureResultRegisters(type);
+    }
+  } else {
+    MOZ_ASSERT(stk_.length() == ifThen.stackSize + type.length());
+    // Assume we have a control join, so place results in block result
+    // allocations.
+    popBlockResults(type, ifThen.stackHeight, ContinuationKind::Fallthrough);
+    MOZ_ASSERT(!ifThen.deadOnArrival);
+  }
 
   if (ifThen.otherLabel.used()) {
     masm.bind(&ifThen.otherLabel);
   }
 
   if (ifThen.label.used()) {
     masm.bind(&ifThen.label);
   }
 
   if (!deadCode_) {
     ifThen.bceSafeOnExit &= bceSafe_;
   }
 
   deadCode_ = ifThen.deadOnArrival;
+  if (!deadCode_) {
+    pushBlockResults(type);
+  }
 
   bceSafe_ = ifThen.bceSafeOnExit & ifThen.bceSafeOnEntry;
 }
 
 bool BaseCompiler::emitElse() {
-  ResultType thenType;
+  ResultType params, results;
   NothingVector unused_thenValues;
 
-  if (!iter_.readElse(&thenType, &unused_thenValues)) {
+  if (!iter_.readElse(&params, &results, &unused_thenValues)) {
     return false;
   }
 
   Control& ifThenElse = controlItem(0);
 
   // See comment in endIfThenElse, below.
 
   // Exit the "then" branch.
 
   ifThenElse.deadThenBranch = deadCode_;
 
-  Maybe<AnyReg> r;
-  if (!deadCode_) {
-    r = popJoinRegUnlessVoid(thenType);
-  }
-
-  fr.popStackOnBlockExit(ifThenElse.stackHeight, deadCode_);
-  popValueStackTo(ifThenElse.stackSize);
+  if (deadCode_) {
+    fr.resetStackHeight(ifThenElse.stackHeight, results);
+    popValueStackTo(ifThenElse.stackSize);
+  } else {
+    MOZ_ASSERT(stk_.length() == ifThenElse.stackSize + results.length());
+    popBlockResults(results, ifThenElse.stackHeight, ContinuationKind::Jump);
+    freeResultRegisters(results);
+    MOZ_ASSERT(!ifThenElse.deadOnArrival);
+  }
 
   if (!deadCode_) {
     masm.jump(&ifThenElse.label);
   }
 
   if (ifThenElse.otherLabel.used()) {
     masm.bind(&ifThenElse.otherLabel);
   }
 
   // Reset to the "else" branch.
 
   if (!deadCode_) {
-    freeJoinRegUnlessVoid(r);
     ifThenElse.bceSafeOnExit &= bceSafe_;
   }
 
   deadCode_ = ifThenElse.deadOnArrival;
   bceSafe_ = ifThenElse.bceSafeOnEntry;
 
+  fr.resetStackHeight(ifThenElse.stackHeight, params);
+
+  if (!deadCode_) {
+    pushBlockResults(params);
+  }
+
   return true;
 }
 
 void BaseCompiler::endIfThenElse(ResultType type) {
   Control& ifThenElse = controlItem();
 
   // The expression type is not a reliable guide to what we'll find
   // on the stack, we could have (if E (i32.const 1) (unreachable))
   // in which case the "else" arm is AnyType but the type of the
   // full expression is I32.  So restore whatever's there, not what
   // we want to find there.  The "then" arm has the same constraint.
 
-  Maybe<AnyReg> r;
-  if (!deadCode_) {
-    r = popJoinRegUnlessVoid(type);
+  if (deadCode_) {
+    // "then" arm does not fall through; reset stack.
+    fr.resetStackHeight(ifThenElse.stackHeight, type);
+    popValueStackTo(ifThenElse.stackSize);
+  } else {
+    MOZ_ASSERT(stk_.length() == ifThenElse.stackSize + type.length());
+    // Assume we have a control join, so place results in block result
+    // allocations.
+    popBlockResults(type, ifThenElse.stackHeight,
+                    ContinuationKind::Fallthrough);
     ifThenElse.bceSafeOnExit &= bceSafe_;
-  }
-
-  fr.popStackOnBlockExit(ifThenElse.stackHeight, deadCode_);
-  popValueStackTo(ifThenElse.stackSize);
+    MOZ_ASSERT(!ifThenElse.deadOnArrival);
+  }
 
   if (ifThenElse.label.used()) {
     masm.bind(&ifThenElse.label);
   }
 
   bool joinLive =
       !ifThenElse.deadOnArrival &&
       (!ifThenElse.deadThenBranch || !deadCode_ || ifThenElse.label.bound());
 
   if (joinLive) {
-    // No value was provided by the "then" path but capture the one
+    // No values were provided by the "then" path, but capture the values
     // provided by the "else" path.
     if (deadCode_) {
-      r = captureJoinRegUnlessVoid(type);
+      captureResultRegisters(type);
     }
     deadCode_ = false;
   }
 
   bceSafe_ = ifThenElse.bceSafeOnExit;
 
   if (!deadCode_) {
-    pushJoinRegUnlessVoid(r);
+    pushBlockResults(type);
   }
 }
 
 bool BaseCompiler::emitEnd() {
   LabelKind kind;
   ResultType type;
   NothingVector unused_values;
   if (!iter_.readEnd(&kind, &type, &unused_values)) {
     return false;
   }
 
   switch (kind) {
     case LabelKind::Body:
       endBlock(type);
+      doReturn(ContinuationKind::Fallthrough);
       iter_.popEnd();
       MOZ_ASSERT(iter_.controlStackEmpty());
-      doReturn(PopStack(false));
       return iter_.readFunctionEnd(iter_.end());
     case LabelKind::Block:
       endBlock(type);
       break;
     case LabelKind::Loop:
       // The end of a loop isn't a branch target, so we can just leave its
-      // results on the stack to be consumed by the outer block.
+      // results on the expression stack to be consumed by the outer block.
       break;
     case LabelKind::Then:
-      endIfThen();
+      endIfThen(type);
       break;
     case LabelKind::Else:
       endIfThenElse(type);
       break;
   }
 
   iter_.popEnd();
 
@@ -8408,28 +8813,26 @@ bool BaseCompiler::emitBr() {
 
   if (deadCode_) {
     return true;
   }
 
   Control& target = controlItem(relativeDepth);
   target.bceSafeOnExit &= bceSafe_;
 
-  // Save any value in the designated join register, where the
-  // normal block exit code will also leave it.
-
-  Maybe<AnyReg> r = popJoinRegUnlessVoid(type);
-
-  fr.popStackBeforeBranch(target.stackHeight);
+  // Save any values in the designated join registers, as if the target block
+  // returned normally.
+
+  popBlockResults(type, target.stackHeight, ContinuationKind::Jump);
   masm.jump(&target.label);
 
-  // The register holding the join value is free for the remainder
-  // of this block.
-
-  freeJoinRegUnlessVoid(r);
+  // The registers holding the join values are free for the remainder of this
+  // block.
+
+  freeResultRegisters(type);
 
   deadCode_ = true;
 
   return true;
 }
 
 bool BaseCompiler::emitBrIf() {
   uint32_t relativeDepth;
@@ -8454,50 +8857,51 @@ bool BaseCompiler::emitBrIf() {
   emitBranchPerform(&b);
 
   return true;
 }
 
 bool BaseCompiler::emitBrTable() {
   Uint32Vector depths;
   uint32_t defaultDepth;
-  ResultType type;
+  ResultType branchParams;
   NothingVector unused_values;
   Nothing unused_index;
-  // N.B., `type' gets set to the type of the default branch target.  In the
-  // presence of subtyping, it could be that the different branch targets have
-  // different types.  Here we rely on the assumption that the value
+  // N.B., `branchParams' gets set to the type of the default branch target.  In
+  // the presence of subtyping, it could be that the different branch targets
+  // have different types.  Here we rely on the assumption that the value
   // representations (e.g. Stk value types) of all branch target types are the
   // same, in the baseline compiler.  Notably, this means that all Ref types
   // should be represented the same.
-  if (!iter_.readBrTable(&depths, &defaultDepth, &type, &unused_values,
+  if (!iter_.readBrTable(&depths, &defaultDepth, &branchParams, &unused_values,
                          &unused_index)) {
     return false;
   }
 
   if (deadCode_) {
     return true;
   }
 
-  // Don't use joinReg for rc
-  maybeReserveJoinRegI(type);
+  // Don't use param registers for rc
+  needIntegerResultRegisters(branchParams);
 
   // Table switch value always on top.
   RegI32 rc = popI32();
 
-  maybeUnreserveJoinRegI(type);
-
-  Maybe<AnyReg> r = popJoinRegUnlessVoid(type);
+  freeIntegerResultRegisters(branchParams);
+
+  StackHeight resultsBase = topBlockResults(branchParams);
 
   Label dispatchCode;
   masm.branch32(Assembler::Below, rc, Imm32(depths.length()), &dispatchCode);
 
   // This is the out-of-range stub.  rc is dead here but we don't need it.
 
-  fr.popStackBeforeBranch(controlItem(defaultDepth).stackHeight);
+  shuffleStackResultsBeforeBranch(
+      resultsBase, controlItem(defaultDepth).stackHeight, branchParams);
   controlItem(defaultDepth).bceSafeOnExit &= bceSafe_;
   masm.jump(&controlItem(defaultDepth).label);
 
   // Emit stubs.  rc is dead in all of these but we don't need it.
   //
   // The labels in the vector are in the TempAllocator and will
   // be freed by and by.
   //
@@ -8507,17 +8911,18 @@ bool BaseCompiler::emitBrTable() {
   LabelVector stubs;
   if (!stubs.reserve(depths.length())) {
     return false;
   }
 
   for (uint32_t depth : depths) {
     stubs.infallibleEmplaceBack(NonAssertingLabel());
     masm.bind(&stubs.back());
-    fr.popStackBeforeBranch(controlItem(depth).stackHeight);
+    shuffleStackResultsBeforeBranch(resultsBase, controlItem(depth).stackHeight,
+                                    branchParams);
     controlItem(depth).bceSafeOnExit &= bceSafe_;
     masm.jump(&controlItem(depth).label);
   }
 
   // Emit table.
 
   Label theTable;
   jumpTable(stubs, &theTable);
@@ -8526,17 +8931,17 @@ bool BaseCompiler::emitBrTable() {
 
   tableSwitch(&theTable, rc, &dispatchCode);
 
   deadCode_ = true;
 
   // Clean up.
 
   freeI32(rc);
-  freeJoinRegUnlessVoid(r);
+  popValueStackBy(branchParams.length());
 
   return true;
 }
 
 bool BaseCompiler::emitDrop() {
   if (!iter_.readDrop()) {
     return false;
   }
@@ -8544,76 +8949,39 @@ bool BaseCompiler::emitDrop() {
   if (deadCode_) {
     return true;
   }
 
   dropValue();
   return true;
 }
 
-void BaseCompiler::doReturn(bool popStack) {
+void BaseCompiler::doReturn(ContinuationKind kind) {
   if (deadCode_) {
     return;
   }
-  Maybe<ValType> type = funcType().ret();
-  if (!type) {
-    returnCleanup(popStack);
-    return;
-  }
-  switch (type.ref().code()) {
-    case ValType::I32: {
-      RegI32 rv = popI32(RegI32(ReturnReg));
-      returnCleanup(popStack);
-      freeI32(rv);
-      break;
-    }
-    case ValType::I64: {
-      RegI64 rv = popI64(RegI64(ReturnReg64));
-      returnCleanup(popStack);
-      freeI64(rv);
-      break;
-    }
-    case ValType::F64: {
-      RegF64 rv = popF64(RegF64(ReturnDoubleReg));
-      returnCleanup(popStack);
-      freeF64(rv);
-      break;
-    }
-    case ValType::F32: {
-      RegF32 rv = popF32(RegF32(ReturnFloat32Reg));
-      returnCleanup(popStack);
-      freeF32(rv);
-      break;
-    }
-    case ValType::Ref:
-    case ValType::NullRef:
-    case ValType::FuncRef:
-    case ValType::AnyRef: {
-      RegPtr rv = popRef(RegPtr(ReturnReg));
-      returnCleanup(popStack);
-      freeRef(rv);
-      break;
-    }
-    default: {
-      MOZ_CRASH("Function return type");
-    }
-  }
+
+  StackHeight height = controlOutermost().stackHeight;
+  ResultType type = ResultType::Vector(funcType().results());
+  popBlockResults(type, height, kind);
+  masm.jump(&returnLabel_);
+  freeResultRegisters(type);
 }
 
 bool BaseCompiler::emitReturn() {
   NothingVector unused_values;
   if (!iter_.readReturn(&unused_values)) {
     return false;
   }
 
   if (deadCode_) {
     return true;
   }
 
-  doReturn(PopStack(true));
+  doReturn(ContinuationKind::Jump);
   deadCode_ = true;
 
   return true;
 }
 
 bool BaseCompiler::emitCallArgs(const ValTypeVector& argTypes,
                                 FunctionCall* baselineCall) {
   MOZ_ASSERT(!deadCode_);
@@ -10830,17 +11198,17 @@ bool BaseCompiler::emitStructNarrow() {
 
 bool BaseCompiler::emitBody() {
   MOZ_ASSERT(stackMapGenerator_.framePushedAtEntryToBody.isSome());
 
   if (!iter_.readFunctionStart(func_.index)) {
     return false;
   }
 
-  initControl(controlItem());
+  initControl(controlItem(), ResultType::Empty());
 
   uint32_t overhead = 0;
 
   for (;;) {
     Nothing unused_a, unused_b;
 
 #ifdef DEBUG
     performRegisterLeakCheck();
@@ -11889,21 +12257,16 @@ BaseCompiler::BaseCompiler(const ModuleE
       latentType_(ValType::I32),
       latentIntCmp_(Assembler::Equal),
       latentDoubleCmp_(Assembler::DoubleEqual),
       masm(*masm),
       ra(*this),
       fr(*masm),
       stackMapGenerator_(stackMaps, trapExitLayout, trapExitLayoutNumWords,
                          *masm),
-      joinRegI32_(RegI32(ReturnReg)),
-      joinRegI64_(RegI64(ReturnReg64)),
-      joinRegPtr_(RegPtr(ReturnReg)),
-      joinRegF32_(RegF32(ReturnFloat32Reg)),
-      joinRegF64_(RegF64(ReturnDoubleReg)),
       stkSource_(stkSource) {
   // Our caller, BaselineCompileFunctions, will lend us the vector contents to
   // use for the eval stack.  To get hold of those contents, we'll temporarily
   // install an empty one in its place.
   MOZ_ASSERT(stk_.empty());
   stk_.swap(stkSource_);
 
   // Assuming that previously processed wasm functions are well formed, the
--- a/js/src/wasm/WasmIonCompile.cpp
+++ b/js/src/wasm/WasmIonCompile.cpp
@@ -1759,54 +1759,58 @@ static bool EmitF64Const(FunctionCompile
     return false;
   }
 
   f.iter().setResult(f.constant(f64));
   return true;
 }
 
 static bool EmitBlock(FunctionCompiler& f) {
-  return f.iter().readBlock() && f.startBlock();
+  ResultType params;
+  return f.iter().readBlock(&params) && f.startBlock();
 }
 
 static bool EmitLoop(FunctionCompiler& f) {
-  if (!f.iter().readLoop()) {
+  ResultType params;
+  if (!f.iter().readLoop(&params)) {
     return false;
   }
 
   MBasicBlock* loopHeader;
   if (!f.startLoop(&loopHeader)) {
     return false;
   }
 
   f.addInterruptCheck();
 
   f.iter().controlItem() = loopHeader;
   return true;
 }
 
 static bool EmitIf(FunctionCompiler& f) {
+  ResultType params;
   MDefinition* condition = nullptr;
-  if (!f.iter().readIf(&condition)) {
+  if (!f.iter().readIf(&params, &condition)) {
     return false;
   }
 
   MBasicBlock* elseBlock;
   if (!f.branchAndStartThen(condition, &elseBlock)) {
     return false;
   }
 
   f.iter().controlItem() = elseBlock;
   return true;
 }
 
 static bool EmitElse(FunctionCompiler& f) {
-  ResultType thenType;
+  ResultType paramType;
+  ResultType resultType;
   DefVector thenValues;
-  if (!f.iter().readElse(&thenType, &thenValues)) {
+  if (!f.iter().readElse(&paramType, &resultType, &thenValues)) {
     return false;
   }
 
   if (!f.pushDefs(thenValues)) {
     return false;
   }
 
   if (!f.switchToElse(f.iter().controlItem(), &f.iter().controlItem())) {
--- a/js/src/wasm/WasmOpIter.h
+++ b/js/src/wasm/WasmOpIter.h
@@ -715,20 +715,21 @@ class MOZ_STACK_CLASS OpIter : private P
 
   // ------------------------------------------------------------------------
   // Decoding and validation interface.
 
   MOZ_MUST_USE bool readOp(OpBytes* op);
   MOZ_MUST_USE bool readFunctionStart(uint32_t funcIndex);
   MOZ_MUST_USE bool readFunctionEnd(const uint8_t* bodyEnd);
   MOZ_MUST_USE bool readReturn(ValueVector* values);
-  MOZ_MUST_USE bool readBlock();
-  MOZ_MUST_USE bool readLoop();
-  MOZ_MUST_USE bool readIf(Value* condition);
-  MOZ_MUST_USE bool readElse(ResultType* thenType, ValueVector* thenValues);
+  MOZ_MUST_USE bool readBlock(ResultType* paramType);
+  MOZ_MUST_USE bool readLoop(ResultType* paramType);
+  MOZ_MUST_USE bool readIf(ResultType* paramType, Value* condition);
+  MOZ_MUST_USE bool readElse(ResultType* paramType, ResultType* resultType,
+                             ValueVector* thenValues);
   MOZ_MUST_USE bool readEnd(LabelKind* kind, ResultType* type,
                             ValueVector* values);
   void popEnd();
   MOZ_MUST_USE bool readBr(uint32_t* relativeDepth, ResultType* type,
                            ValueVector* values);
   MOZ_MUST_USE bool readBrIf(uint32_t* relativeDepth, ResultType* type,
                              ValueVector* values, Value* condition);
   MOZ_MUST_USE bool readBrTable(Uint32Vector* depths, uint32_t* defaultDepth,
@@ -1257,71 +1258,76 @@ inline bool OpIter<Policy>::readReturn(V
     return false;
   }
 
   afterUnconditionalBranch();
   return true;
 }
 
 template <typename Policy>
-inline bool OpIter<Policy>::readBlock() {
+inline bool OpIter<Policy>::readBlock(ResultType* paramType) {
   MOZ_ASSERT(Classify(op_) == OpKind::Block);
 
   BlockType type;
   if (!readBlockType(&type)) {
     return false;
   }
 
+  *paramType = type.params();
   return pushControl(LabelKind::Block, type);
 }
 
 template <typename Policy>
-inline bool OpIter<Policy>::readLoop() {
+inline bool OpIter<Policy>::readLoop(ResultType* paramType) {
   MOZ_ASSERT(Classify(op_) == OpKind::Loop);
 
   BlockType type;
   if (!readBlockType(&type)) {
     return false;
   }
 
+  *paramType = type.params();
   return pushControl(LabelKind::Loop, type);
 }
 
 template <typename Policy>
-inline bool OpIter<Policy>::readIf(Value* condition) {
+inline bool OpIter<Policy>::readIf(ResultType* paramType, Value* condition) {
   MOZ_ASSERT(Classify(op_) == OpKind::If);
 
   BlockType type;
   if (!readBlockType(&type)) {
     return false;
   }
 
   if (!popWithType(ValType::I32, condition)) {
     return false;
   }
 
   if (!pushControl(LabelKind::Then, type)) {
     return false;
   }
 
+  *paramType = type.params();
   size_t paramsLength = type.params().length();
   return thenParamStack_.append(valueStack_.end() - paramsLength, paramsLength);
 }
 
 template <typename Policy>
-inline bool OpIter<Policy>::readElse(ResultType* thenType,
+inline bool OpIter<Policy>::readElse(ResultType* paramType,
+                                     ResultType* resultType,
                                      ValueVector* values) {
   MOZ_ASSERT(Classify(op_) == OpKind::Else);
 
   Control& block = controlStack_.back();
   if (block.kind() != LabelKind::Then) {
     return fail("else can only be used within an if");
   }
 
-  if (!checkStackAtEndOfBlock(thenType, values)) {
+  *paramType = block.type().params();
+  if (!checkStackAtEndOfBlock(resultType, values)) {
     return false;
   }
 
   // Restore to the entry state of the then block. Since the then block may
   // have clobbered any value in the block's params, we must restore from a
   // snapshot.
   valueStack_.shrinkTo(block.valueStackBase());
   size_t thenParamsLength = block.type().params().length();
@@ -1342,17 +1348,18 @@ inline bool OpIter<Policy>::readEnd(Labe
   if (!checkStackAtEndOfBlock(type, values)) {
     return false;
   }
 
   Control& block = controlStack_.back();
 
   // If an `if` block ends with `end` instead of `else`, then the missing
   // `else` arm implicitly forwards the block's params as its results, so we
   // must additionally validate that the params match the results.
-  if (block.kind() == LabelKind::Then && !block.resultType().empty()) {
+  if (block.kind() == LabelKind::Then &&
+      block.type().params() != block.type().results()) {
     return fail("if without else with a result value");
   }
 
   *kind = block.kind();
   return true;
 }
 
 template <typename Policy>
--- a/js/src/wasm/WasmStubs.cpp
+++ b/js/src/wasm/WasmStubs.cpp
@@ -34,16 +34,103 @@ using namespace js::jit;
 using namespace js::wasm;
 
 using mozilla::ArrayLength;
 
 typedef Vector<jit::MIRType, 8, SystemAllocPolicy> MIRTypeVector;
 typedef jit::ABIArgIter<MIRTypeVector> ABIArgMIRTypeIter;
 typedef jit::ABIArgIter<ValTypeVector> ABIArgValTypeIter;
 
+/*****************************************************************************/
+// ABIResultIter implementation
+
+static uint32_t ResultStackSize(ValType type) {
+  switch (type.code()) {
+    case ValType::I32:
+      return ABIResult::StackSizeOfInt32;
+    case ValType::I64:
+      return ABIResult::StackSizeOfInt64;
+    case ValType::F32:
+      return ABIResult::StackSizeOfFloat;
+    case ValType::F64:
+      return ABIResult::StackSizeOfDouble;
+    case ValType::Ref:
+    case ValType::FuncRef:
+    case ValType::AnyRef:
+      return ABIResult::StackSizeOfPtr;
+    case ValType::NullRef:
+    default:
+      MOZ_CRASH("Unexpected result type");
+  }
+}
+
+uint32_t ABIResult::size() const { return ResultStackSize(type()); }
+
+void ABIResultIter::settleRegister(ValType type) {
+  MOZ_ASSERT(!done());
+  MOZ_ASSERT(index() < RegisterResultCount);
+  static_assert(RegisterResultCount == 1, "expected a single register result");
+
+  switch (type.code()) {
+    case ValType::I32:
+      cur_ = ABIResult(type, ReturnReg);
+      break;
+    case ValType::I64:
+      cur_ = ABIResult(type, ReturnReg64);
+      break;
+    case ValType::F32:
+      cur_ = ABIResult(type, ReturnFloat32Reg);
+      break;
+    case ValType::F64:
+      cur_ = ABIResult(type, ReturnDoubleReg);
+      break;
+    case ValType::Ref:
+    case ValType::FuncRef:
+    case ValType::AnyRef:
+      cur_ = ABIResult(type, ReturnReg);
+      break;
+    case ValType::NullRef:
+    default:
+      MOZ_CRASH("Unexpected result type");
+  }
+}
+
+void ABIResultIter::settleNext() {
+  MOZ_ASSERT(direction_ == Next);
+  MOZ_ASSERT(!done());
+
+  uint32_t typeIndex = count_ - index_ - 1;
+  ValType type = type_[typeIndex];
+
+  if (index_ < RegisterResultCount) {
+    settleRegister(type);
+    return;
+  }
+
+  cur_ = ABIResult(type, nextStackOffset_);
+  nextStackOffset_ += ResultStackSize(type);
+}
+
+void ABIResultIter::settlePrev() {
+  MOZ_ASSERT(direction_ == Prev);
+  MOZ_ASSERT(!done());
+  uint32_t typeIndex = index_;
+  ValType type = type_[typeIndex];
+
+  if (count_ - index_ - 1 < RegisterResultCount) {
+    settleRegister(type);
+    return;
+  }
+
+  uint32_t size = ResultStackSize(type);
+  MOZ_ASSERT(nextStackOffset_ >= size);
+  nextStackOffset_ -= size;
+  cur_ = ABIResult(type, nextStackOffset_);
+}
+
 #ifdef WASM_CODEGEN_DEBUG
 template <class Closure>
 static void GenPrint(DebugChannel channel, MacroAssembler& masm,
                      const Maybe<Register>& taken, Closure passArgAndCall) {
   if (!IsCodegenDebugEnabled(channel)) {
     return;
   }
 
--- a/js/src/wasm/WasmStubs.h
+++ b/js/src/wasm/WasmStubs.h
@@ -15,20 +15,237 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 #ifndef wasm_stubs_h
 #define wasm_stubs_h
 
 #include "wasm/WasmGenerator.h"
+#include "wasm/WasmOpIter.h"
 
 namespace js {
 namespace wasm {
 
+// ValType and location for a single result: either in a register or on the
+// stack.
+
+class ABIResult {
+  ValType type_;
+  enum class Location { Gpr, Gpr64, Fpr, Stack } loc_;
+  union {
+    Register gpr_;
+    Register64 gpr64_;
+    FloatRegister fpr_;
+    uint32_t stackOffset_;
+  };
+
+  void validate() {
+#ifdef DEBUG
+    if (onStack()) {
+      return;
+    }
+    MOZ_ASSERT(inRegister());
+    switch (type_.code()) {
+      case ValType::I32:
+        MOZ_ASSERT(loc_ == Location::Gpr);
+        break;
+      case ValType::I64:
+        MOZ_ASSERT(loc_ == Location::Gpr64);
+        break;
+      case ValType::F32:
+      case ValType::F64:
+        MOZ_ASSERT(loc_ == Location::Fpr);
+        break;
+      case ValType::AnyRef:
+      case ValType::FuncRef:
+      case ValType::Ref:
+        MOZ_ASSERT(loc_ == Location::Gpr);
+        break;
+      default:
+        MOZ_CRASH("bad value type");
+    }
+#endif
+  }
+
+  friend class ABIResultIter;
+  ABIResult() {}
+
+ public:
+  // Sizes of items in the stack area.
+  //
+  // The size values come from the implementations of Push() in
+  // MacroAssembler-x86-shared.cpp and MacroAssembler-arm-shared.cpp, and from
+  // VFPRegister::size() in Architecture-arm.h.
+  //
+// On ARM, unlike on x86, we push a single (32 bits) for a float.
+
+  static constexpr size_t StackSizeOfPtr = sizeof(intptr_t);
+  static constexpr size_t StackSizeOfInt32 = StackSizeOfPtr;
+  static constexpr size_t StackSizeOfInt64 = sizeof(int64_t);
+#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS32)
+  static constexpr size_t StackSizeOfFloat = sizeof(float);
+#else
+  static constexpr size_t StackSizeOfFloat = sizeof(double);
+#endif
+  static constexpr size_t StackSizeOfDouble = sizeof(double);
+
+  ABIResult(ValType type, Register gpr)
+      : type_(type), loc_(Location::Gpr), gpr_(gpr) {
+    validate();
+  }
+  ABIResult(ValType type, Register64 gpr64)
+      : type_(type), loc_(Location::Gpr64), gpr64_(gpr64) {
+    validate();
+  }
+  ABIResult(ValType type, FloatRegister fpr)
+      : type_(type), loc_(Location::Fpr), fpr_(fpr) {
+    validate();
+  }
+  ABIResult(ValType type, uint32_t stackOffset)
+      : type_(type), loc_(Location::Stack), stackOffset_(stackOffset) {
+    validate();
+  }
+
+  ValType type() const { return type_; }
+  bool onStack() const { return loc_ == Location::Stack; }
+  bool inRegister() const { return !onStack(); }
+  Register gpr() const {
+    MOZ_ASSERT(loc_ == Location::Gpr);
+    return gpr_;
+  }
+  Register64 gpr64() const {
+    MOZ_ASSERT(loc_ == Location::Gpr64);
+    return gpr64_;
+  }
+  FloatRegister fpr() const {
+    MOZ_ASSERT(loc_ == Location::Fpr);
+    return fpr_;
+  }
+  // Offset from SP.
+  uint32_t stackOffset() const {
+    MOZ_ASSERT(loc_ == Location::Stack);
+    return stackOffset_;
+  }
+  uint32_t size() const;
+};
+
+// Just as WebAssembly functions can take multiple arguments, they can also
+// return multiple results.  As with a call, a limited number of results will be
+// located in registers, and the rest will be stored in a stack area.  The
+// |ABIResultIter| computes result locations, given a |ResultType|.
+//
+// Recall that a |ResultType| represents a sequence of value types t1..tN,
+// indexed from 1 to N.  In principle it doesn't matter how we decide which
+// results get to be in registers and which go to the stack.  To better
+// harmonize with WebAssembly's abstract stack machine, whose properties are
+// taken advantage of by the baseline compiler, our strategy is to start
+// allocating result locations in "reverse" order: from result N down to 1.
+//
+// If a result with index I is in a register, then all results with index J > I
+// are also in registers.  If a result I is on the stack, then all results with
+// index K < I are also on the stack, farther away from the stack pointer than
+// result I.
+//
+// Currently only a single result is ever stored in a register, though this may
+// change in the future on register-rich platforms.
+//
+// NB: The baseline compiler also uses this ABI for locations of block
+// parameters and return values within individual WebAssembly functions.
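+//
+// For example (an illustrative sketch, assuming the current single register
+// result), a ResultType of [i64, f32, i32] would be placed as follows:
+//
+//   result 3 (i32): ReturnReg
+//   result 2 (f32): stack offset 0
+//   result 1 (i64): stack offset StackSizeOfFloat
+//
+// That is, the last result is allocated first (to the register), and earlier
+// results land at successively higher offsets from the stack pointer.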
+
+class ABIResultIter {
+  ResultType type_;
+  uint32_t count_;
+  uint32_t index_;
+  uint32_t nextStackOffset_;
+  enum { Next, Prev } direction_;
+  ABIResult cur_;
+
+  void settleRegister(ValType type);
+  void settleNext();
+  void settlePrev();
+
+  static constexpr size_t RegisterResultCount = 1;
+
+ public:
+  explicit ABIResultIter(const ResultType& type)
+      : type_(type), count_(type.length()) {
+    reset();
+  }
+
+  void reset() {
+    index_ = nextStackOffset_ = 0;
+    direction_ = Next;
+    if (!done()) {
+      settleNext();
+    }
+  }
+  bool done() const { return index_ == count_; }
+  uint32_t index() const { return index_; }
+  uint32_t count() const { return count_; }
+  uint32_t remaining() const { return count_ - index_; }
+  void switchToNext() {
+    MOZ_ASSERT(direction_ == Prev);
+    if (!done() && cur().onStack()) {
+      nextStackOffset_ += cur().size();
+    }
+    index_ = count_ - index_;
+    direction_ = Next;
+    if (!done()) {
+      settleNext();
+    }
+  }
+  void switchToPrev() {
+    MOZ_ASSERT(direction_ == Next);
+    if (!done() && cur().onStack()) {
+      nextStackOffset_ -= cur().size();
+    }
+    index_ = count_ - index_;
+    direction_ = Prev;
+    if (!done()) {
+      settlePrev();
+    }
+  }
+  void next() {
+    MOZ_ASSERT(direction_ == Next);
+    MOZ_ASSERT(!done());
+    index_++;
+    if (!done()) {
+      settleNext();
+    }
+  }
+  void prev() {
+    MOZ_ASSERT(direction_ == Prev);
+    MOZ_ASSERT(!done());
+    index_++;
+    if (!done()) {
+      settlePrev();
+    }
+  }
+  const ABIResult& cur() const {
+    MOZ_ASSERT(!done());
+    return cur_;
+  }
+
+  uint32_t stackBytesConsumedSoFar() const { return nextStackOffset_; }
+
+  static inline bool HasStackResults(const ResultType& type) {
+    return type.length() > RegisterResultCount;
+  }
+
+  static uint32_t MeasureStackBytes(const ResultType& type) {
+    if (!HasStackResults(type)) {
+      return 0;
+    }
+    ABIResultIter iter(type);
+    while (!iter.done()) {
+      iter.next();
+    }
+    return iter.stackBytesConsumedSoFar();
+  }
+};
+
 extern bool GenerateBuiltinThunk(jit::MacroAssembler& masm,
                                  jit::ABIFunctionType abiType,
                                  ExitReason exitReason, void* funcPtr,
                                  CallableOffsets* offsets);
 
 extern bool GenerateImportFunctions(const ModuleEnvironment& env,
                                     const FuncImportVector& imports,
                                     CompiledCode* code);
--- a/js/src/wasm/WasmValidate.cpp
+++ b/js/src/wasm/WasmValidate.cpp
@@ -573,23 +573,23 @@ static bool DecodeFunctionBodyExprs(cons
         if (!env.refTypesEnabled()) {
           return iter.unrecognizedOpcode(&op);
         }
         StackType unused;
         CHECK(iter.readSelect(/*typed*/ true, &unused, &nothing, &nothing,
                               &nothing));
       }
       case uint16_t(Op::Block):
-        CHECK(iter.readBlock());
+        CHECK(iter.readBlock(&unusedType));
       case uint16_t(Op::Loop):
-        CHECK(iter.readLoop());
+        CHECK(iter.readLoop(&unusedType));
       case uint16_t(Op::If):
-        CHECK(iter.readIf(&nothing));
+        CHECK(iter.readIf(&unusedType, &nothing));
       case uint16_t(Op::Else):
-        CHECK(iter.readElse(&unusedType, &nothings));
+        CHECK(iter.readElse(&unusedType, &unusedType, &nothings));
       case uint16_t(Op::I32Clz):
       case uint16_t(Op::I32Ctz):
       case uint16_t(Op::I32Popcnt):
         CHECK(iter.readUnary(ValType::I32, &nothing));
       case uint16_t(Op::I64Clz):
       case uint16_t(Op::I64Ctz):
       case uint16_t(Op::I64Popcnt):
         CHECK(iter.readUnary(ValType::I64, &nothing));