Bug 969722 - Part 2: Inline ForkJoinGetSlice. (r=nmatsakis)
author Shu-yu Guo <shu@rfrn.org>
Fri, 14 Feb 2014 19:18:07 -0800
changeset 171240 b39e4dce0e0988507b78f90b8a5a5ca97c678f36
parent 171239 c1218ef1628ebb0f7174e0f9b7086e37f701c61f
child 171241 24dcfacab028d6425ff61ea6f81820d91d4182b4
push id 270
push user pvanderbeken@mozilla.com
push date Thu, 06 Mar 2014 09:24:21 +0000
reviewers nmatsakis
bugs 969722
milestone 30.0a1
js/src/assembler/assembler/X86Assembler.h
js/src/jit/CompileWrappers.cpp
js/src/jit/CompileWrappers.h
js/src/jit/Ion.cpp
js/src/jit/IonBuilder.h
js/src/jit/IonMacroAssembler.h
js/src/jit/JitCompartment.h
js/src/jit/LIR-Common.h
js/src/jit/LOpcodes.h
js/src/jit/MCallOptimize.cpp
js/src/jit/MIR.h
js/src/jit/MOpcodes.h
js/src/jit/ParallelSafetyAnalysis.cpp
js/src/jit/arm/CodeGenerator-arm.cpp
js/src/jit/arm/CodeGenerator-arm.h
js/src/jit/arm/Lowering-arm.cpp
js/src/jit/arm/Lowering-arm.h
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
js/src/jit/shared/CodeGenerator-x86-shared.h
js/src/jit/shared/Lowering-shared.h
js/src/jit/shared/Lowering-x86-shared.cpp
js/src/jit/shared/Lowering-x86-shared.h
js/src/jit/shared/MacroAssembler-x86-shared.h
js/src/jit/x64/Assembler-x64.h
js/src/jit/x64/Lowering-x64.h
js/src/jit/x86/Assembler-x86.h
js/src/jit/x86/Lowering-x86.h
js/src/vm/ForkJoin.cpp
js/src/vm/ForkJoin.h
js/src/vm/ThreadPool.h
--- a/js/src/assembler/assembler/X86Assembler.h
+++ b/js/src/assembler/assembler/X86Assembler.h
@@ -1196,17 +1196,16 @@ public:
 
     void cmpxchg32(RegisterID src, int offset, RegisterID base)
     {
         // Note that 32-bit CMPXCHG performs comparison against %eax.
         // If %eax == [%base+offset], then %src -> [%base+offset].
         // Otherwise, [%base+offset] -> %eax.
         spew("cmpxchg    %s, %s0x%x(%s)",
              nameIReg(src), PRETTY_PRINT_OFFSET(offset), nameIReg(base));
-        m_formatter.oneByteOp(PRE_LOCK);
         m_formatter.twoByteOp(OP2_CMPXCHG_GvEw, src, base, offset);
     }
 
 
     // Comparisons:
 
     void cmpl_rr(RegisterID src, RegisterID dst)
     {
@@ -1401,16 +1400,24 @@ public:
             m_formatter.oneByteOp(OP_GROUP1_EvIb, GROUP1_OP_CMP, addr);
             m_formatter.immediate8(imm);
         } else {
             m_formatter.oneByteOp(OP_GROUP1_EvIz, GROUP1_OP_CMP, addr);
             m_formatter.immediate32(imm);
         }
     }
 
+    void cmpw_rr(RegisterID src, RegisterID dst)
+    {
+        spew("cmpw       %s, %s",
+             nameIReg(2, src), nameIReg(2, dst));
+        m_formatter.prefix(PRE_OPERAND_SIZE);
+        m_formatter.oneByteOp(OP_CMP_EvGv, src, dst);
+    }
+
     void cmpw_rm(RegisterID src, int offset, RegisterID base, RegisterID index, int scale)
     {
         FIXME_INSN_PRINTING;
         m_formatter.prefix(PRE_OPERAND_SIZE);
         m_formatter.oneByteOp(OP_CMP_EvGv, src, base, index, scale, offset);
     }
 
     void cmpw_im(int imm, int offset, RegisterID base, RegisterID index, int scale)
@@ -2059,16 +2066,23 @@ public:
     void movsbl_mr(const void* addr, RegisterID dst)
     {
         spew("movsbl     %p, %s",
              addr, nameIReg(4, dst));
         m_formatter.twoByteOp(OP2_MOVSX_GvEb, dst, addr);
     }
 #endif
 
+    void movzwl_rr(RegisterID src, RegisterID dst)
+    {
+        spew("movzwl     %s, %s",
+             nameIReg(2, src), nameIReg(4, dst));
+        m_formatter.twoByteOp(OP2_MOVZX_GvEw, dst, src);
+    }
+
     void movzwl_mr(int offset, RegisterID base, RegisterID dst)
     {
         spew("movzwl     %s0x%x(%s), %s",
              PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameIReg(4, dst));
         m_formatter.twoByteOp(OP2_MOVZX_GvEw, dst, base, offset);
     }
 
     void movzwl_mr_disp32(int offset, RegisterID base, RegisterID dst)
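The X86Assembler changes add register-register forms of the 16-bit compare and zero-extend: cmpw_rr reuses OP_CMP_EvGv behind the PRE_OPERAND_SIZE prefix, and movzwl_rr reuses the OP2_MOVZX_GvEw encoding that the existing memory forms already use. The PRE_LOCK byte is dropped from the plain cmpxchg32, presumably leaving LOCK emission to the lock_cmpxchg32 path that atomic_cmpxchg32 goes through. As an illustrative C++ sketch (not part of the patch) of what the new word-sized operations compute:

    // movzwl zero-extends the low 16 bits of src into dst.
    uint32_t movzwl(uint32_t src) { return src & 0xffff; }
    // cmpw sets flags from a compare of the low 16 bits only.
    bool cmpw_equal(uint32_t lhs, uint32_t rhs) {
        return uint16_t(lhs) == uint16_t(rhs);
    }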
--- a/js/src/jit/CompileWrappers.cpp
+++ b/js/src/jit/CompileWrappers.cpp
@@ -74,16 +74,22 @@ CompileRuntime::addressOfInterrupt()
 #ifdef JS_THREADSAFE
 const void *
 CompileRuntime::addressOfInterruptPar()
 {
     return &runtime()->interruptPar;
 }
 #endif
 
+const void *
+CompileRuntime::addressOfThreadPool()
+{
+    return &runtime()->threadPool;
+}
+
 const JitRuntime *
 CompileRuntime::jitRuntime()
 {
     return runtime()->jitRuntime();
 }
 
 SPSProfiler &
 CompileRuntime::spsProfiler()
--- a/js/src/jit/CompileWrappers.h
+++ b/js/src/jit/CompileWrappers.h
@@ -51,16 +51,18 @@ class CompileRuntime
 #endif
 
     const void *addressOfInterrupt();
 
 #ifdef JS_THREADSAFE
     const void *addressOfInterruptPar();
 #endif
 
+    const void *addressOfThreadPool();
+
     const JitRuntime *jitRuntime();
 
     // Compilation does not occur off thread when the SPS profiler is enabled.
     SPSProfiler &spsProfiler();
 
     bool signalHandlersInstalled();
     bool jitSupportsFloatingPoint();
     bool hadOutOfMemory();
--- a/js/src/jit/Ion.cpp
+++ b/js/src/jit/Ion.cpp
@@ -152,16 +152,17 @@ JitRuntime::JitRuntime()
     bailoutTail_(nullptr),
     enterJIT_(nullptr),
     bailoutHandler_(nullptr),
     argumentsRectifier_(nullptr),
     argumentsRectifierReturnAddr_(nullptr),
     parallelArgumentsRectifier_(nullptr),
     invalidator_(nullptr),
     debugTrapHandler_(nullptr),
+    forkJoinGetSliceStub_(nullptr),
     functionWrappers_(nullptr),
     osrTempData_(nullptr),
     flusher_(nullptr),
     ionCodeProtected_(false)
 {
 }
 
 JitRuntime::~JitRuntime()
@@ -283,16 +284,28 @@ JitRuntime::debugTrapHandler(JSContext *
         // be allocated in the atoms compartment.
         AutoLockForExclusiveAccess lock(cx);
         AutoCompartment ac(cx, cx->runtime()->atomsCompartment());
         debugTrapHandler_ = generateDebugTrapHandler(cx);
     }
     return debugTrapHandler_;
 }
 
+bool
+JitRuntime::ensureForkJoinGetSliceStubExists(JSContext *cx)
+{
+    if (!forkJoinGetSliceStub_) {
+        IonSpew(IonSpew_Codegen, "# Emitting ForkJoinGetSlice stub");
+        AutoLockForExclusiveAccess lock(cx);
+        AutoCompartment ac(cx, cx->runtime()->atomsCompartment());
+        forkJoinGetSliceStub_ = generateForkJoinGetSliceStub(cx);
+    }
+    return !!forkJoinGetSliceStub_;
+}
+
 uint8_t *
 JitRuntime::allocateOsrTempData(size_t size)
 {
     osrTempData_ = (uint8_t *)js_realloc(osrTempData_, size);
     return osrTempData_;
 }
 
 void
@@ -1662,16 +1675,23 @@ IonCompile(JSContext *cx, JSScript *scri
     types::AutoEnterAnalysis enter(cx);
 
     if (!cx->compartment()->ensureJitCompartmentExists(cx))
         return AbortReason_Alloc;
 
     if (!cx->compartment()->jitCompartment()->ensureIonStubsExist(cx))
         return AbortReason_Alloc;
 
+    if (executionMode == ParallelExecution &&
+        LIRGenerator::allowInlineForkJoinGetSlice() &&
+        !cx->runtime()->jitRuntime()->ensureForkJoinGetSliceStubExists(cx))
+    {
+        return AbortReason_Alloc;
+    }
+
     MIRGraph *graph = alloc->new_<MIRGraph>(temp);
     if (!graph)
         return AbortReason_Alloc;
 
     CompileInfo *info = alloc->new_<CompileInfo>(script, script->functionNonDelazifying(), osrPc,
                                                  constructing, executionMode,
                                                  script->needsArgsObj());
     if (!info)
--- a/js/src/jit/IonBuilder.h
+++ b/js/src/jit/IonBuilder.h
@@ -669,16 +669,19 @@ class IonBuilder : public MIRGenerator
     InliningStatus inlineNewDenseArray(CallInfo &callInfo);
     InliningStatus inlineNewDenseArrayForSequentialExecution(CallInfo &callInfo);
     InliningStatus inlineNewDenseArrayForParallelExecution(CallInfo &callInfo);
 
     // Slot intrinsics.
     InliningStatus inlineUnsafeSetReservedSlot(CallInfo &callInfo);
     InliningStatus inlineUnsafeGetReservedSlot(CallInfo &callInfo);
 
+    // ForkJoin intrinsics
+    InliningStatus inlineForkJoinGetSlice(CallInfo &callInfo);
+
     // Utility intrinsics.
     InliningStatus inlineIsCallable(CallInfo &callInfo);
     InliningStatus inlineHaveSameClass(CallInfo &callInfo);
     InliningStatus inlineToObject(CallInfo &callInfo);
     InliningStatus inlineDump(CallInfo &callInfo);
     InliningStatus inlineHasClass(CallInfo &callInfo, const Class *clasp);
 
     // Testing functions.
--- a/js/src/jit/IonMacroAssembler.h
+++ b/js/src/jit/IonMacroAssembler.h
@@ -372,16 +372,20 @@ class MacroAssembler : public MacroAssem
         loadPtr(Address(dest, types::TypeObject::offsetOfProto()), dest);
     }
 
     void loadStringLength(Register str, Register dest) {
         loadPtr(Address(str, JSString::offsetOfLengthAndFlags()), dest);
         rshiftPtr(Imm32(JSString::LENGTH_SHIFT), dest);
     }
 
+    void loadSliceBounds(Register worker, Register dest) {
+        loadPtr(Address(worker, ThreadPoolWorker::offsetOfSliceBounds()), dest);
+    }
+
     void loadJSContext(const Register &dest) {
         loadPtr(AbsoluteAddress(GetIonContext()->runtime->addressOfJSContext()), dest);
     }
     void loadJitActivation(const Register &dest) {
         loadPtr(AbsoluteAddress(GetIonContext()->runtime->addressOfActivation()), dest);
     }
 
     template<typename T>
@@ -828,16 +832,22 @@ class MacroAssembler : public MacroAssem
         Push(ImmPtr(f));
     }
     void enterFakeExitFrame(JitCode *codeVal = nullptr) {
         linkExitFrame();
         Push(ImmPtr(codeVal));
         Push(ImmPtr(nullptr));
     }
 
+    void loadThreadPool(Register pool) {
+        // JitRuntimes are tied to JSRuntimes and there is one ThreadPool per
+        // JSRuntime, so we can hardcode the ThreadPool address here.
+        movePtr(ImmPtr(GetIonContext()->runtime->addressOfThreadPool()), pool);
+    }
+
     void loadForkJoinContext(Register cx, Register scratch);
     void loadContext(Register cxReg, Register scratch, ExecutionMode executionMode);
 
     void enterParallelExitFrameAndLoadContext(const VMFunction *f, Register cx,
                                               Register scratch);
 
     void enterExitFrameAndLoadContext(const VMFunction *f, Register cxReg, Register scratch,
                                       ExecutionMode executionMode);
--- a/js/src/jit/JitCompartment.h
+++ b/js/src/jit/JitCompartment.h
@@ -184,16 +184,19 @@ class JitRuntime
 
     // Thunk that calls the GC pre barrier.
     JitCode *valuePreBarrier_;
     JitCode *shapePreBarrier_;
 
     // Thunk used by the debugger for breakpoint and step mode.
     JitCode *debugTrapHandler_;
 
+    // Stub used to inline the ForkJoinGetSlice intrinsic.
+    JitCode *forkJoinGetSliceStub_;
+
     // Map VMFunction addresses to the JitCode of the wrapper.
     typedef WeakCache<const VMFunction *, JitCode *> VMWrapperMap;
     VMWrapperMap *functionWrappers_;
 
     // Buffer for OSR from baseline to Ion. To avoid holding on to this for
     // too long, it's also freed in JitCompartment::mark and in EnterBaseline
     // (after returning from JIT code).
     uint8_t *osrTempData_;
@@ -214,16 +217,17 @@ class JitRuntime
     JitCode *generateBailoutTailStub(JSContext *cx);
     JitCode *generateEnterJIT(JSContext *cx, EnterJitType type);
     JitCode *generateArgumentsRectifier(JSContext *cx, ExecutionMode mode, void **returnAddrOut);
     JitCode *generateBailoutTable(JSContext *cx, uint32_t frameClass);
     JitCode *generateBailoutHandler(JSContext *cx);
     JitCode *generateInvalidator(JSContext *cx);
     JitCode *generatePreBarrier(JSContext *cx, MIRType type);
     JitCode *generateDebugTrapHandler(JSContext *cx);
+    JitCode *generateForkJoinGetSliceStub(JSContext *cx);
     JitCode *generateVMWrapper(JSContext *cx, const VMFunction &f);
 
     JSC::ExecutableAllocator *createIonAlloc(JSContext *cx);
 
   public:
     JitRuntime();
     ~JitRuntime();
     bool initialize(JSContext *cx);
@@ -316,16 +320,21 @@ class JitRuntime
 
     JitCode *valuePreBarrier() const {
         return valuePreBarrier_;
     }
 
     JitCode *shapePreBarrier() const {
         return shapePreBarrier_;
     }
+
+    bool ensureForkJoinGetSliceStubExists(JSContext *cx);
+    JitCode *forkJoinGetSliceStub() const {
+        return forkJoinGetSliceStub_;
+    }
 };
 
 class JitCompartment
 {
     friend class JitActivation;
 
     // Ion state for the compartment's runtime.
     JitRuntime *rt;
--- a/js/src/jit/LIR-Common.h
+++ b/js/src/jit/LIR-Common.h
@@ -4780,16 +4780,48 @@ class LForkJoinContext : public LCallIns
         setTemp(0, temp1);
     }
 
     const LDefinition *getTempReg() {
         return getTemp(0);
     }
 };
 
+class LForkJoinGetSlice : public LInstructionHelper<1, 1, 4>
+{
+  public:
+    LIR_HEADER(ForkJoinGetSlice);
+
+    LForkJoinGetSlice(const LAllocation &cx,
+                      const LDefinition &temp1, const LDefinition &temp2,
+                      const LDefinition &temp3, const LDefinition &temp4) {
+        setOperand(0, cx);
+        setTemp(0, temp1);
+        setTemp(1, temp2);
+        setTemp(2, temp3);
+        setTemp(3, temp4);
+    }
+
+    const LAllocation *forkJoinContext() {
+        return getOperand(0);
+    }
+    const LDefinition *temp1() {
+        return getTemp(0);
+    }
+    const LDefinition *temp2() {
+        return getTemp(1);
+    }
+    const LDefinition *temp3() {
+        return getTemp(2);
+    }
+    const LDefinition *temp4() {
+        return getTemp(3);
+    }
+};
+
 class LCallGetProperty : public LCallInstructionHelper<BOX_PIECES, BOX_PIECES, 0>
 {
   public:
     LIR_HEADER(CallGetProperty)
 
     static const size_t Value = 0;
 
     MCallGetProperty *mir() const {
--- a/js/src/jit/LOpcodes.h
+++ b/js/src/jit/LOpcodes.h
@@ -209,16 +209,17 @@
     _(ClampDToUint8)                \
     _(ClampVToUint8)                \
     _(LoadFixedSlotV)               \
     _(LoadFixedSlotT)               \
     _(StoreFixedSlotV)              \
     _(StoreFixedSlotT)              \
     _(FunctionEnvironment)          \
     _(ForkJoinContext)              \
+    _(ForkJoinGetSlice)             \
     _(GetPropertyCacheV)            \
     _(GetPropertyCacheT)            \
     _(GetPropertyPolymorphicV)      \
     _(GetPropertyPolymorphicT)      \
     _(GetElementCacheV)             \
     _(GetElementCacheT)             \
     _(BindNameCache)                \
     _(CallGetProperty)              \
--- a/js/src/jit/MCallOptimize.cpp
+++ b/js/src/jit/MCallOptimize.cpp
@@ -141,16 +141,18 @@ IonBuilder::inlineNativeCall(CallInfo &c
         return inlineUnsafeSetReservedSlot(callInfo);
     if (native == intrinsic_UnsafeGetReservedSlot)
         return inlineUnsafeGetReservedSlot(callInfo);
 
     // Parallel intrinsics.
     if (native == intrinsic_ShouldForceSequential ||
         native == intrinsic_InParallelSection)
         return inlineForceSequentialOrInParallelSection(callInfo);
+    if (native == intrinsic_ForkJoinGetSlice)
+        return inlineForkJoinGetSlice(callInfo);
 
     // Utility intrinsics.
     if (native == intrinsic_IsCallable)
         return inlineIsCallable(callInfo);
     if (native == intrinsic_HaveSameClass)
         return inlineHaveSameClass(callInfo);
     if (native == intrinsic_ToObject)
         return inlineToObject(callInfo);
@@ -1385,16 +1387,50 @@ IonBuilder::inlineForceSequentialOrInPar
         return InliningStatus_Inlined;
       }
     }
 
     MOZ_ASSUME_UNREACHABLE("Invalid execution mode");
 }
 
 IonBuilder::InliningStatus
+IonBuilder::inlineForkJoinGetSlice(CallInfo &callInfo)
+{
+    if (info().executionMode() != ParallelExecution)
+        return InliningStatus_NotInlined;
+
+    // Assert the usage invariants instead of testing for them: this is a
+    // self-hosted intrinsic which must be used in a particular fashion.
+    MOZ_ASSERT(callInfo.argc() == 1 && !callInfo.constructing());
+    MOZ_ASSERT(callInfo.getArg(0)->type() == MIRType_Int32);
+    MOZ_ASSERT(getInlineReturnType() == MIRType_Int32);
+
+    callInfo.setImplicitlyUsedUnchecked();
+
+    switch (info().executionMode()) {
+      case SequentialExecution:
+      case DefinitePropertiesAnalysis:
+        // ForkJoinGetSlice acts as identity for sequential execution.
+        current->push(callInfo.getArg(0));
+        return InliningStatus_Inlined;
+      case ParallelExecution:
+        if (LIRGenerator::allowInlineForkJoinGetSlice()) {
+            MForkJoinGetSlice *getSlice = MForkJoinGetSlice::New(alloc(),
+                                                                 graph().forkJoinContext());
+            current->add(getSlice);
+            current->push(getSlice);
+            return InliningStatus_Inlined;
+        }
+        return InliningStatus_NotInlined;
+    }
+
+    MOZ_ASSUME_UNREACHABLE("Invalid execution mode");
+}
+
+IonBuilder::InliningStatus
 IonBuilder::inlineNewDenseArray(CallInfo &callInfo)
 {
     if (callInfo.constructing() || callInfo.argc() != 1)
         return InliningStatus_NotInlined;
 
     // For now, in seq. mode we just call the C function.  In
     // par. mode we use inlined MIR.
     ExecutionMode executionMode = info().executionMode();
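Note the two-level structure of inlineForkJoinGetSlice above: outside ParallelExecution the intrinsic is the identity on its argument, so inlining amounts to pushing arg 0; under ParallelExecution it becomes an MForkJoinGetSlice node when the backend reports allowInlineForkJoinGetSlice(). A one-line sketch of the sequential semantics (the function name is hypothetical, for illustration only):

    int32_t SequentialForkJoinGetSlice(int32_t sliceId) {
        return sliceId;  // identity; matches current->push(callInfo.getArg(0))
    }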
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@@ -7484,16 +7484,43 @@ class MForkJoinContext
         return AliasSet::None();
     }
 
     bool possiblyCalls() const {
         return true;
     }
 };
 
+// Calls the ForkJoinGetSlice stub, used for inlining the eponymous intrinsic.
+// Only applicable in ParallelExecution.
+class MForkJoinGetSlice
+  : public MUnaryInstruction
+{
+    MForkJoinGetSlice(MDefinition *cx)
+      : MUnaryInstruction(cx)
+    {
+        setResultType(MIRType_Int32);
+    }
+
+  public:
+    INSTRUCTION_HEADER(ForkJoinGetSlice);
+
+    static MForkJoinGetSlice *New(TempAllocator &alloc, MDefinition *cx) {
+        return new(alloc) MForkJoinGetSlice(cx);
+    }
+
+    MDefinition *forkJoinContext() {
+        return getOperand(0);
+    }
+
+    bool possiblyCalls() const {
+        return true;
+    }
+};
+
 // Store to vp[slot] (slots that are not inline in an object).
 class MStoreSlot
   : public MBinaryInstruction,
     public MixPolicy<ObjectPolicy<0>, NoFloatPolicy<1> >
 {
     uint32_t slot_;
     MIRType slotType_;
     bool needsBarrier_;
--- a/js/src/jit/MOpcodes.h
+++ b/js/src/jit/MOpcodes.h
@@ -209,16 +209,17 @@ namespace jit {
     _(NewCallObjectPar)                                                     \
     _(NewPar)                                                               \
     _(NewDenseArrayPar)                                                     \
     _(NewDerivedTypedObject)                                                \
     _(AbortPar)                                                             \
     _(LambdaPar)                                                            \
     _(RestPar)                                                              \
     _(ForkJoinContext)                                                      \
+    _(ForkJoinGetSlice)                                                     \
     _(GuardThreadExclusive)                                                 \
     _(InterruptCheckPar)                                                    \
     _(RecompileCheck)
 
 // Forward declarations of MIR types.
 #define FORWARD_DECLARE(op) class M##op;
  MIR_OPCODE_LIST(FORWARD_DECLARE)
 #undef FORWARD_DECLARE
--- a/js/src/jit/ParallelSafetyAnalysis.cpp
+++ b/js/src/jit/ParallelSafetyAnalysis.cpp
@@ -263,16 +263,17 @@ class ParallelSafetyVisitor : public MIn
     UNSAFE_OP(RunOncePrologue)
     CUSTOM_OP(Rest)
     SAFE_OP(RestPar)
     SAFE_OP(Floor)
     SAFE_OP(Round)
     UNSAFE_OP(InstanceOf)
     CUSTOM_OP(InterruptCheck)
     SAFE_OP(ForkJoinContext)
+    SAFE_OP(ForkJoinGetSlice)
     SAFE_OP(NewPar)
     SAFE_OP(NewDenseArrayPar)
     SAFE_OP(NewCallObjectPar)
     SAFE_OP(LambdaPar)
     SAFE_OP(AbortPar)
     UNSAFE_OP(ArrayConcat)
     UNSAFE_OP(GetDOMProperty)
     UNSAFE_OP(GetDOMMember)
--- a/js/src/jit/arm/CodeGenerator-arm.cpp
+++ b/js/src/jit/arm/CodeGenerator-arm.cpp
@@ -2305,8 +2305,20 @@ CodeGeneratorARM::visitNegD(LNegD *ins)
 
 bool
 CodeGeneratorARM::visitNegF(LNegF *ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     masm.ma_vneg_f32(input, ToFloatRegister(ins->output()));
     return true;
 }
+
+bool
+CodeGeneratorARM::visitForkJoinGetSlice(LForkJoinGetSlice *ins)
+{
+    MOZ_ASSUME_UNREACHABLE("NYI");
+}
+
+JitCode *
+JitRuntime::generateForkJoinGetSliceStub(JSContext *cx)
+{
+    MOZ_ASSUME_UNREACHABLE("NYI");
+}
--- a/js/src/jit/arm/CodeGenerator-arm.h
+++ b/js/src/jit/arm/CodeGenerator-arm.h
@@ -165,18 +165,19 @@ class CodeGeneratorARM : public CodeGene
     bool visitLoadTypedArrayElementStatic(LLoadTypedArrayElementStatic *ins);
     bool visitStoreTypedArrayElementStatic(LStoreTypedArrayElementStatic *ins);
     bool visitAsmJSLoadHeap(LAsmJSLoadHeap *ins);
     bool visitAsmJSStoreHeap(LAsmJSStoreHeap *ins);
     bool visitAsmJSLoadGlobalVar(LAsmJSLoadGlobalVar *ins);
     bool visitAsmJSStoreGlobalVar(LAsmJSStoreGlobalVar *ins);
     bool visitAsmJSLoadFuncPtr(LAsmJSLoadFuncPtr *ins);
     bool visitAsmJSLoadFFIFunc(LAsmJSLoadFFIFunc *ins);
+    bool visitAsmJSPassStackArg(LAsmJSPassStackArg *ins);
 
-    bool visitAsmJSPassStackArg(LAsmJSPassStackArg *ins);
+    bool visitForkJoinGetSlice(LForkJoinGetSlice *ins);
 
     bool generateInvalidateEpilogue();
   protected:
     void postAsmJSCall(LAsmJSCall *lir) {
         if (!useHardFpABI() && lir->mir()->callee().which() == MAsmJSCall::Callee::Builtin) {
             switch (lir->mir()->type()) {
               case MIRType_Double:
                 masm.ma_vxfer(r0, r1, d0);
--- a/js/src/jit/arm/Lowering-arm.cpp
+++ b/js/src/jit/arm/Lowering-arm.cpp
@@ -539,9 +539,15 @@ LIRGeneratorARM::lowerTruncateFToInt32(M
 }
 
 bool
 LIRGeneratorARM::visitStoreTypedArrayElementStatic(MStoreTypedArrayElementStatic *ins)
 {
     MOZ_ASSUME_UNREACHABLE("NYI");
 }
 
+bool
+LIRGeneratorARM::visitForkJoinGetSlice(MForkJoinGetSlice *ins)
+{
+    MOZ_ASSUME_UNREACHABLE("NYI");
+}
+
 //__aeabi_uidiv
--- a/js/src/jit/arm/Lowering-arm.h
+++ b/js/src/jit/arm/Lowering-arm.h
@@ -83,16 +83,17 @@ class LIRGeneratorARM : public LIRGenera
     bool visitGuardShape(MGuardShape *ins);
     bool visitGuardObjectType(MGuardObjectType *ins);
     bool visitAsmJSUnsignedToDouble(MAsmJSUnsignedToDouble *ins);
     bool visitAsmJSUnsignedToFloat32(MAsmJSUnsignedToFloat32 *ins);
     bool visitAsmJSLoadHeap(MAsmJSLoadHeap *ins);
     bool visitAsmJSStoreHeap(MAsmJSStoreHeap *ins);
     bool visitAsmJSLoadFuncPtr(MAsmJSLoadFuncPtr *ins);
     bool visitStoreTypedArrayElementStatic(MStoreTypedArrayElementStatic *ins);
+    bool visitForkJoinGetSlice(MForkJoinGetSlice *ins);
 
     static bool allowFloat32Optimizations() {
         return true;
     }
 };
 
 typedef LIRGeneratorARM LIRGeneratorSpecific;
 
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -520,27 +520,32 @@ class AssemblerX86Shared
             masm.movb_i8m(src.value, dest.disp(), dest.base(), dest.index(), dest.scale());
             break;
           default:
             MOZ_ASSUME_UNREACHABLE("unexpected operand kind");
         }
     }
     void movzwl(const Operand &src, const Register &dest) {
         switch (src.kind()) {
+          case Operand::REG:
+            masm.movzwl_rr(src.reg(), dest.code());
+            break;
           case Operand::MEM_REG_DISP:
             masm.movzwl_mr(src.disp(), src.base(), dest.code());
             break;
           case Operand::MEM_SCALE:
             masm.movzwl_mr(src.disp(), src.base(), src.index(), src.scale(), dest.code());
             break;
           default:
             MOZ_ASSUME_UNREACHABLE("unexpected operand kind");
         }
     }
-
+    void movzwl(const Register &src, const Register &dest) {
+        masm.movzwl_rr(src.code(), dest.code());
+    }
     void movw(const Register &src, const Operand &dest) {
         switch (dest.kind()) {
           case Operand::MEM_REG_DISP:
             masm.movw_rm(src.code(), dest.disp(), dest.base());
             break;
           case Operand::MEM_SCALE:
             masm.movw_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
             break;
@@ -853,24 +858,30 @@ class AssemblerX86Shared
             break;
           default:
             MOZ_ASSUME_UNREACHABLE("unexpected operand kind");
         }
     }
     void cmpl(const Operand &op, ImmPtr imm) {
         cmpl(op, ImmWord(uintptr_t(imm.value)));
     }
+    void cmpw(const Register &lhs, const Register &rhs) {
+        masm.cmpw_rr(lhs.code(), rhs.code());
+    }
     void setCC(Condition cond, const Register &r) {
         masm.setCC_r(static_cast<JSC::X86Assembler::Condition>(cond), r.code());
     }
     void testb(const Register &lhs, const Register &rhs) {
         JS_ASSERT(GeneralRegisterSet(Registers::SingleByteRegs).has(lhs));
         JS_ASSERT(GeneralRegisterSet(Registers::SingleByteRegs).has(rhs));
         masm.testb_rr(rhs.code(), lhs.code());
     }
+    void testw(const Register &lhs, const Register &rhs) {
+        masm.testw_rr(rhs.code(), lhs.code());
+    }
     void testl(const Register &lhs, const Register &rhs) {
         masm.testl_rr(rhs.code(), lhs.code());
     }
     void testl(const Register &lhs, Imm32 rhs) {
         masm.testl_i32r(rhs.value, lhs.code());
     }
     void testl(const Operand &lhs, Imm32 rhs) {
         switch (lhs.kind()) {
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -4,17 +4,20 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "jit/shared/CodeGenerator-x86-shared.h"
 
 #include "mozilla/DebugOnly.h"
 #include "mozilla/MathAlgorithms.h"
 
+#include "jsmath.h"
+
 #include "jit/IonFrames.h"
+#include "jit/IonLinker.h"
 #include "jit/JitCompartment.h"
 #include "jit/RangeAnalysis.h"
 
 #include "jit/shared/CodeGenerator-shared-inl.h"
 
 using namespace js;
 using namespace js::jit;
 
@@ -1763,10 +1766,182 @@ CodeGeneratorX86Shared::visitNegF(LNegF 
 {
     FloatRegister input = ToFloatRegister(ins->input());
     JS_ASSERT(input == ToFloatRegister(ins->output()));
 
     masm.negateFloat(input);
     return true;
 }
 
+bool
+CodeGeneratorX86Shared::visitForkJoinGetSlice(LForkJoinGetSlice *ins)
+{
+    MOZ_ASSERT(gen->info().executionMode() == ParallelExecution);
+    MOZ_ASSERT(ToRegister(ins->forkJoinContext()) == ForkJoinGetSliceReg_cx);
+    MOZ_ASSERT(ToRegister(ins->temp1()) == eax);
+    MOZ_ASSERT(ToRegister(ins->temp2()) == edx);
+    MOZ_ASSERT(ToRegister(ins->temp3()) == ForkJoinGetSliceReg_temp0);
+    MOZ_ASSERT(ToRegister(ins->temp4()) == ForkJoinGetSliceReg_temp1);
+    MOZ_ASSERT(ToRegister(ins->output()) == ForkJoinGetSliceReg_output);
+
+    masm.call(gen->jitRuntime()->forkJoinGetSliceStub());
+    return true;
+}
+
+JitCode *
+JitRuntime::generateForkJoinGetSliceStub(JSContext *cx)
+{
+#ifdef JS_THREADSAFE
+    MacroAssembler masm(cx);
+
+    // We need two fixed temps: eax for cmpxchg and edx for div, since
+    // those instructions implicitly use those registers.
+    Register cxReg = ForkJoinGetSliceReg_cx, worker = cxReg;
+    Register pool = ForkJoinGetSliceReg_temp0;
+    Register bounds = ForkJoinGetSliceReg_temp1;
+    Register output = ForkJoinGetSliceReg_output;
+
+    MOZ_ASSERT(worker != eax && worker != edx);
+    MOZ_ASSERT(pool != eax && pool != edx);
+    MOZ_ASSERT(bounds != eax && bounds != edx);
+    MOZ_ASSERT(output != eax && output != edx);
+
+    Label stealWork, noMoreWork, gotSlice;
+    Operand workerSliceBounds(Address(worker, ThreadPoolWorker::offsetOfSliceBounds()));
+
+    // Clobber cx to load the worker.
+    masm.push(cxReg);
+    masm.loadPtr(Address(cxReg, ForkJoinContext::offsetOfWorker()), worker);
+
+    // Load the thread pool, which is used in all cases below.
+    masm.loadThreadPool(pool);
+
+    {
+        // Try to get a slice from the current thread.
+        Label getOwnSliceLoopHead;
+        masm.bind(&getOwnSliceLoopHead);
+
+        // Load the slice bounds for the current thread.
+        masm.loadSliceBounds(worker, bounds);
+
+        // The slice bounds field is a uint32 composed of two uint16s:
+        // [ from          , to           ]
+        //   ^~~~            ^~
+        //   upper 16 bits | lower 16 bits
+        masm.move32(bounds, output);
+        masm.shrl(Imm32(16), output);
+
+        // If we don't have any slices left ourselves, move on to stealing.
+        masm.branch16(Assembler::Equal, output, bounds, &stealWork);
+
+        // If we still have work, try to CAS [ from+1, to ].
+        masm.move32(bounds, edx);
+        masm.add32(Imm32(0x10000), edx);
+        masm.move32(bounds, eax);
+        masm.atomic_cmpxchg32(edx, workerSliceBounds, eax);
+        masm.j(Assembler::NonZero, &getOwnSliceLoopHead);
+
+        // If the CAS succeeded, return |from| in output.
+        masm.jump(&gotSlice);
+    }
+
+    // Try to steal work.
+    masm.bind(&stealWork);
+
+    // Strictly speaking, it's not correct to test whether work stealing is
+    // enabled only at stub-generation time, but it's a DEBUG-only thing.
+    if (cx->runtime()->threadPool.workStealing()) {
+        Label stealWorkLoopHead;
+        masm.bind(&stealWorkLoopHead);
+
+        // Check if we have work.
+        masm.branch32(Assembler::Equal,
+                      Address(pool, ThreadPool::offsetOfPendingSlices()),
+                      Imm32(0), &noMoreWork);
+
+        // Pick a victim worker id at random. The following inlines the
+        // 32-bit xorshift from ThreadPoolWorker::randomWorker().
+        {
+            // Reload the current worker.
+            masm.loadPtr(Address(StackPointer, 0), cxReg);
+            masm.loadPtr(Address(cxReg, ForkJoinContext::offsetOfWorker()), worker);
+
+            // Perform the xorshift to get a random number in eax, using edx
+            // as a temp.
+            Address rngState(worker, ThreadPoolWorker::offsetOfSchedulerRNGState());
+            masm.load32(rngState, eax);
+            masm.move32(eax, edx);
+            masm.shll(Imm32(ThreadPoolWorker::XORSHIFT_A), eax);
+            masm.xor32(edx, eax);
+            masm.move32(eax, edx);
+            masm.shrl(Imm32(ThreadPoolWorker::XORSHIFT_B), eax);
+            masm.xor32(edx, eax);
+            masm.move32(eax, edx);
+            masm.shll(Imm32(ThreadPoolWorker::XORSHIFT_C), eax);
+            masm.xor32(edx, eax);
+            masm.store32(eax, rngState);
+
+            // Compute the random worker id as eax % numWorkers. Reuse
+            // output as a temp.
+            masm.move32(Imm32(0), edx);
+            masm.move32(Imm32(cx->runtime()->threadPool.numWorkers()), output);
+            masm.udiv(output);
+        }
+
+        // Load the worker from the workers array.
+        masm.loadPtr(Address(pool, ThreadPool::offsetOfWorkers()), worker);
+        masm.loadPtr(BaseIndex(worker, edx, ScalePointer), worker);
+
+        // Try to get a slice from the designated victim worker.
+        Label stealSliceFromWorkerLoopHead;
+        masm.bind(&stealSliceFromWorkerLoopHead);
+
+        // Load and decompose the slice bounds for the victim worker.
+        masm.loadSliceBounds(worker, bounds);
+        masm.move32(bounds, eax);
+        masm.shrl(Imm32(16), eax);
+
+        // If the victim worker has no more slices left, find another worker.
+        masm.branch16(Assembler::Equal, eax, bounds, &stealWorkLoopHead);
+
+        // If the victim worker still has work, try to CAS [ from, to-1 ].
+        masm.move32(bounds, output);
+        masm.sub32(Imm32(1), output);
+        masm.move32(bounds, eax);
+        masm.atomic_cmpxchg32(output, workerSliceBounds, eax);
+        masm.j(Assembler::NonZero, &stealSliceFromWorkerLoopHead);
+
+        // If the CAS succeeded, return |to-1| in output.
+#ifdef DEBUG
+        masm.atomic_inc32(Operand(Address(pool, ThreadPool::offsetOfStolenSlices())));
+#endif
+        // movzwl keeps only the lower 16 bits, leaving |to-1| in output.
+        masm.movzwl(output, output);
+    }
+
+    // If we successfully got a slice, decrement pool->pendingSlices_ and
+    // return the slice.
+    masm.bind(&gotSlice);
+    masm.atomic_dec32(Operand(Address(pool, ThreadPool::offsetOfPendingSlices())));
+    masm.pop(cxReg);
+    masm.ret();
+
+    // There are no more slices to give out; return -1.
+    masm.bind(&noMoreWork);
+    masm.move32(Imm32(-1), output);
+    masm.pop(cxReg);
+    masm.ret();
+
+    Linker linker(masm);
+    JitCode *code = linker.newCode<NoGC>(cx, JSC::OTHER_CODE);
+
+#ifdef JS_ION_PERF
+    writePerfSpewerJitCodeProfile(code, "ForkJoinGetSliceStub");
+#endif
+
+    return code;
+#else
+    return nullptr;
+#endif // JS_THREADSAFE
+}
+
 } // namespace jit
 } // namespace js
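The stub's two CAS loops implement the following protocol over the packed sliceBounds_ word. A minimal C++ model (illustrative: the std::atomic wrapper and function names are assumptions; the bit layout and the +0x10000 / -1 updates mirror the generated code above):

    #include <stdint.h>
    #include <atomic>

    // sliceBounds_ packs two uint16s: [ from (upper 16) | to (lower 16) ].
    int32_t TryGetOwnSlice(std::atomic<uint32_t> &sliceBounds) {
        uint32_t bounds = sliceBounds.load();
        while (uint16_t(bounds >> 16) != uint16_t(bounds)) {  // from != to
            // CAS in [ from+1, to ]; on success we own slice |from|.
            if (sliceBounds.compare_exchange_weak(bounds, bounds + 0x10000))
                return int32_t(bounds >> 16);
            // On failure |bounds| was reloaded; retry, as in
            // getOwnSliceLoopHead.
        }
        return -1;  // no slices left; move on to stealing
    }

    int32_t TryStealSlice(std::atomic<uint32_t> &victimBounds) {
        uint32_t bounds = victimBounds.load();
        while (uint16_t(bounds >> 16) != uint16_t(bounds)) {
            // Steal from the high end: CAS in [ from, to-1 ], returning |to-1|.
            if (victimBounds.compare_exchange_weak(bounds, bounds - 1))
                return int32_t(uint16_t(bounds)) - 1;
        }
        return -1;  // victim exhausted; pick another victim
    }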
--- a/js/src/jit/shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.h
@@ -117,16 +117,18 @@ class CodeGeneratorX86Shared : public Co
     virtual bool visitRound(LRound *lir);
     virtual bool visitGuardShape(LGuardShape *guard);
     virtual bool visitGuardObjectType(LGuardObjectType *guard);
     virtual bool visitGuardClass(LGuardClass *guard);
     virtual bool visitEffectiveAddress(LEffectiveAddress *ins);
     virtual bool visitUDivOrMod(LUDivOrMod *ins);
     virtual bool visitAsmJSPassStackArg(LAsmJSPassStackArg *ins);
 
+    bool visitForkJoinGetSlice(LForkJoinGetSlice *ins);
+
     bool visitNegI(LNegI *lir);
     bool visitNegD(LNegD *lir);
     bool visitNegF(LNegF *lir);
 
     // Out of line visitors.
     bool visitOutOfLineBailout(OutOfLineBailout *ool);
     bool visitOutOfLineUndoALUOperation(OutOfLineUndoALUOperation *ool);
     bool visitMulNegativeZeroCheck(MulNegativeZeroCheck *ool);
--- a/js/src/jit/shared/Lowering-shared.h
+++ b/js/src/jit/shared/Lowering-shared.h
@@ -186,14 +186,19 @@ class LIRGeneratorShared : public MInstr
     static bool allowStaticTypedArrayAccesses() {
         return false;
     }
 
      // Whether we can emit Float32 specific optimizations.
     static bool allowFloat32Optimizations() {
        return false;
     }
+
+    // Whether we can inline ForkJoinGetSlice.
+    static bool allowInlineForkJoinGetSlice() {
+        return false;
+    }
 };
 
 } // namespace jit
 } // namespace js
 
 #endif /* jit_shared_Lowering_shared_h */
--- a/js/src/jit/shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/shared/Lowering-x86-shared.cpp
@@ -290,8 +290,21 @@ bool
 LIRGeneratorX86Shared::lowerTruncateFToInt32(MTruncateToInt32 *ins)
 {
     MDefinition *opd = ins->input();
     JS_ASSERT(opd->type() == MIRType_Float32);
 
     LDefinition maybeTemp = Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempFloat32();
     return define(new(alloc()) LTruncateFToInt32(useRegister(opd), maybeTemp), ins);
 }
+
+bool
+LIRGeneratorX86Shared::visitForkJoinGetSlice(MForkJoinGetSlice *ins)
+{
+    // We fix eax and edx for cmpxchg and div.
+    LForkJoinGetSlice *lir = new(alloc())
+        LForkJoinGetSlice(useFixed(ins->forkJoinContext(), ForkJoinGetSliceReg_cx),
+                          tempFixed(eax),
+                          tempFixed(edx),
+                          tempFixed(ForkJoinGetSliceReg_temp0),
+                          tempFixed(ForkJoinGetSliceReg_temp1));
+    return defineFixed(lir, ins, LAllocation(AnyRegister(ForkJoinGetSliceReg_output)));
+}
--- a/js/src/jit/shared/Lowering-x86-shared.h
+++ b/js/src/jit/shared/Lowering-x86-shared.h
@@ -42,14 +42,15 @@ class LIRGeneratorX86Shared : public LIR
     bool lowerModI(MMod *mod);
     bool lowerUDiv(MDiv *div);
     bool lowerUMod(MMod *mod);
     bool lowerUrshD(MUrsh *mir);
     bool lowerConstantDouble(double d, MInstruction *ins);
     bool lowerConstantFloat32(float d, MInstruction *ins);
     bool lowerTruncateDToInt32(MTruncateToInt32 *ins);
     bool lowerTruncateFToInt32(MTruncateToInt32 *ins);
+    bool visitForkJoinGetSlice(MForkJoinGetSlice *ins);
 };
 
 } // namespace jit
 } // namespace js
 
 #endif /* jit_shared_Lowering_x86_shared_h */
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@@ -106,16 +106,19 @@ class MacroAssemblerX86Shared : public A
         movl(src, dest);
     }
     void and32(const Imm32 &imm, const Register &dest) {
         andl(imm, dest);
     }
     void and32(const Imm32 &imm, const Address &dest) {
         andl(imm, Operand(dest));
     }
+    void or32(const Register &src, const Register &dest) {
+        orl(src, dest);
+    }
     void or32(const Imm32 &imm, const Register &dest) {
         orl(imm, dest);
     }
     void or32(const Imm32 &imm, const Address &dest) {
         orl(imm, Operand(dest));
     }
     void neg32(const Register &reg) {
         negl(reg);
@@ -151,16 +154,19 @@ class MacroAssemblerX86Shared : public A
         subl(imm, dest);
     }
     void sub32(Register src, Register dest) {
         subl(src, dest);
     }
     void xor32(Imm32 imm, Register dest) {
         xorl(imm, dest);
     }
+    void xor32(Register src, Register dest) {
+        xorl(src, dest);
+    }
     void not32(Register reg) {
         notl(reg);
     }
     void inc32(const Operand &addr) {
         incl(addr);
     }
     void atomic_inc32(const Operand &addr) {
         lock_incl(addr);
@@ -172,16 +178,20 @@ class MacroAssemblerX86Shared : public A
         lock_decl(addr);
     }
     void atomic_cmpxchg32(const Register &src, const Operand &addr, const Register &dest) {
         // %eax must be explicitly provided for calling clarity.
         MOZ_ASSERT(dest.code() == JSC::X86Registers::eax);
         lock_cmpxchg32(src, addr);
     }
 
+    void branch16(Condition cond, const Register &lhs, const Register &rhs, Label *label) {
+        cmpw(lhs, rhs);
+        j(cond, label);
+    }
     void branch32(Condition cond, const Operand &lhs, const Register &rhs, Label *label) {
         cmpl(lhs, rhs);
         j(cond, label);
     }
     void branch32(Condition cond, const Operand &lhs, Imm32 rhs, Label *label) {
         cmpl(lhs, rhs);
         j(cond, label);
     }
@@ -196,16 +206,20 @@ class MacroAssemblerX86Shared : public A
     void branch32(Condition cond, const Register &lhs, Imm32 imm, Label *label) {
         cmpl(lhs, imm);
         j(cond, label);
     }
     void branch32(Condition cond, const Register &lhs, const Register &rhs, Label *label) {
         cmpl(lhs, rhs);
         j(cond, label);
     }
+    void branchTest16(Condition cond, const Register &lhs, const Register &rhs, Label *label) {
+        testw(lhs, rhs);
+        j(cond, label);
+    }
     void branchTest32(Condition cond, const Register &lhs, const Register &rhs, Label *label) {
         testl(lhs, rhs);
         j(cond, label);
     }
     void branchTest32(Condition cond, const Register &lhs, Imm32 imm, Label *label) {
         testl(lhs, imm);
         j(cond, label);
     }
--- a/js/src/jit/x64/Assembler-x64.h
+++ b/js/src/jit/x64/Assembler-x64.h
@@ -124,16 +124,23 @@ static MOZ_CONSTEXPR_VAR FloatRegister F
 static MOZ_CONSTEXPR_VAR FloatRegister FloatArgReg4 = xmm4;
 static MOZ_CONSTEXPR_VAR FloatRegister FloatArgReg5 = xmm5;
 static MOZ_CONSTEXPR_VAR FloatRegister FloatArgReg6 = xmm6;
 static MOZ_CONSTEXPR_VAR FloatRegister FloatArgReg7 = xmm7;
 static MOZ_CONSTEXPR_VAR uint32_t NumFloatArgRegs = 8;
 static MOZ_CONSTEXPR_VAR FloatRegister FloatArgRegs[NumFloatArgRegs] = { xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 };
 #endif
 
+// The convention used by the ForkJoinGetSlice stub. None of these can be rax
+// or rdx, which the stub also needs for cmpxchg and div, respectively.
+static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_cx = rdi;
+static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_temp0 = rbx;
+static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_temp1 = rcx;
+static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_output = rsi;
+
 class ABIArgGenerator
 {
 #if defined(XP_WIN)
     unsigned regIndex_;
 #else
     unsigned intRegIndex_;
     unsigned floatRegIndex_;
 #endif
--- a/js/src/jit/x64/Lowering-x64.h
+++ b/js/src/jit/x64/Lowering-x64.h
@@ -50,16 +50,20 @@ class LIRGeneratorX64 : public LIRGenera
     bool visitAsmJSLoadHeap(MAsmJSLoadHeap *ins);
     bool visitAsmJSStoreHeap(MAsmJSStoreHeap *ins);
     bool visitAsmJSLoadFuncPtr(MAsmJSLoadFuncPtr *ins);
     bool visitStoreTypedArrayElementStatic(MStoreTypedArrayElementStatic *ins);
 
     static bool allowFloat32Optimizations() {
         return true;
     }
+
+    static bool allowInlineForkJoinGetSlice() {
+        return true;
+    }
 };
 
 typedef LIRGeneratorX64 LIRGeneratorSpecific;
 
 } // namespace jit
 } // namespace js
 
 #endif /* jit_x64_Lowering_x64_h */
--- a/js/src/jit/x86/Assembler-x86.h
+++ b/js/src/jit/x86/Assembler-x86.h
@@ -50,16 +50,23 @@ static MOZ_CONSTEXPR_VAR FloatRegister S
 static MOZ_CONSTEXPR_VAR Register ArgumentsRectifierReg = esi;
 static MOZ_CONSTEXPR_VAR Register CallTempReg0 = edi;
 static MOZ_CONSTEXPR_VAR Register CallTempReg1 = eax;
 static MOZ_CONSTEXPR_VAR Register CallTempReg2 = ebx;
 static MOZ_CONSTEXPR_VAR Register CallTempReg3 = ecx;
 static MOZ_CONSTEXPR_VAR Register CallTempReg4 = esi;
 static MOZ_CONSTEXPR_VAR Register CallTempReg5 = edx;
 
+// The convention used by the ForkJoinGetSlice stub. None of these can be eax
+// or edx, which the stub also needs for cmpxchg and div, respectively.
+static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_cx = edi;
+static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_temp0 = ebx;
+static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_temp1 = ecx;
+static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_output = esi;
+
 // We have no arg regs, so our NonArgRegs are just our CallTempReg*
 static MOZ_CONSTEXPR_VAR Register CallTempNonArgRegs[] = { edi, eax, ebx, ecx, esi, edx };
 static const uint32_t NumCallTempNonArgRegs =
     mozilla::ArrayLength(CallTempNonArgRegs);
 
 class ABIArgGenerator
 {
     uint32_t stackOffset_;
--- a/js/src/jit/x86/Lowering-x86.h
+++ b/js/src/jit/x86/Lowering-x86.h
@@ -61,16 +61,20 @@ class LIRGeneratorX86 : public LIRGenera
 
     static bool allowStaticTypedArrayAccesses() {
         return true;
     }
 
     static bool allowFloat32Optimizations() {
         return true;
     }
+
+    static bool allowInlineForkJoinGetSlice() {
+        return true;
+    }
 };
 
 typedef LIRGeneratorX86 LIRGeneratorSpecific;
 
 } // namespace jit
 } // namespace js
 
 #endif /* jit_x86_Lowering_x86_h */
--- a/js/src/vm/ForkJoin.cpp
+++ b/js/src/vm/ForkJoin.cpp
@@ -1417,18 +1417,18 @@ ForkJoinShared::execute()
     // Check if any of the workers failed.
     if (abort_) {
         if (fatal_)
             return TP_FATAL;
         return TP_RETRY_SEQUENTIALLY;
     }
 
 #ifdef DEBUG
-    Spew(SpewOps, "Completed parallel job [slices %d, threads: %d (+1), stolen: %d (work stealing:%s)]",
-         sliceTo_ - sliceFrom_,
+    Spew(SpewOps, "Completed parallel job [slices: %d, threads: %d, stolen: %d (work stealing:%s)]",
+         sliceTo_ - sliceFrom_ + 1,
          threadPool_->numWorkers(),
          threadPool_->stolenSlices(),
          threadPool_->workStealing() ? "ON" : "OFF");
 #endif
 
     // Everything went swimmingly. Give yourself a pat on the back.
     return jobResult;
 }
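The spew fix suggests sliceFrom_ and sliceTo_ are inclusive bounds, so the slice count needs the +1 (the stale "(+1)" annotation on the thread count is dropped at the same time):

    // e.g. sliceFrom_ = 0, sliceTo_ = 7 describes 8 slices
    uint32_t sliceCount = sliceTo_ - sliceFrom_ + 1;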
--- a/js/src/vm/ForkJoin.h
+++ b/js/src/vm/ForkJoin.h
@@ -401,16 +401,21 @@ class ForkJoinContext : public ThreadSaf
     bool hasAcquiredJSContext() const;
 
     // Check the current state of parallel execution.
     static inline ForkJoinContext *current();
 
     // Initializes the thread-local state.
     static bool initialize();
 
+    // Used in inlining ForkJoinGetSlice.
+    static size_t offsetOfWorker() {
+        return offsetof(ForkJoinContext, worker_);
+    }
+
   private:
     friend class AutoSetForkJoinContext;
 
     // Initialized by initialize()
     static mozilla::ThreadLocal<ForkJoinContext*> tlsForkJoinContext;
 
     ForkJoinShared *const shared_;
 
--- a/js/src/vm/ThreadPool.h
+++ b/js/src/vm/ThreadPool.h
@@ -94,16 +94,20 @@ class ThreadPoolWorker
     bool start();
 
     // Invoked from the main thread; signals the worker loop to return.
     void terminate(AutoLockMonitor &lock);
 
     static size_t offsetOfSliceBounds() {
         return offsetof(ThreadPoolWorker, sliceBounds_);
     }
+
+    static size_t offsetOfSchedulerRNGState() {
+        return offsetof(ThreadPoolWorker, schedulerRNGState_);
+    }
 };
 
 /////////////////////////////////////////////////////////////////////////////
 // A ParallelJob is the main runnable abstraction in the ThreadPool.
 //
 // The unit of work here is in terms of threads, *not* slices. The
 // user-provided function has the responsibility of getting slices of work via
 // the |ForkJoinGetSlice| intrinsic.
@@ -187,16 +191,28 @@ class ThreadPool : public Monitor
     bool lazyStartWorkers(JSContext *cx);
     void terminateWorkers();
     void terminateWorkersAndReportOOM(JSContext *cx);
     void join(AutoLockMonitor &lock);
     void waitForWorkers(AutoLockMonitor &lock);
     ThreadPoolWorker *mainThreadWorker() { return workers_[0]; }
 
   public:
+#ifdef DEBUG
+    static size_t offsetOfStolenSlices() {
+        return offsetof(ThreadPool, stolenSlices_);
+    }
+#endif
+    static size_t offsetOfPendingSlices() {
+        return offsetof(ThreadPool, pendingSlices_);
+    }
+    static size_t offsetOfWorkers() {
+        return offsetof(ThreadPool, workers_);
+    }
+
     ThreadPool(JSRuntime *rt);
     ~ThreadPool();
 
     bool init();
 
     // Return number of worker threads in the pool, counting the main thread.
     uint32_t numWorkers() const;
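The new offsetof accessors are what let the generated stub read pool and worker state directly. For reference, a hedged C++ sketch of the scheduler RNG the stub inlines (it mirrors the masm xorshift sequence in generateForkJoinGetSliceStub; ThreadPoolWorker::randomWorker() is the C++ original, and XORSHIFT_A/B/C are the shift constants referenced there):

    uint32_t x = schedulerRNGState_;
    x ^= x << ThreadPoolWorker::XORSHIFT_A;
    x ^= x >> ThreadPoolWorker::XORSHIFT_B;
    x ^= x << ThreadPoolWorker::XORSHIFT_C;
    schedulerRNGState_ = x;
    uint32_t victim = x % numWorkers();  // udiv's remainder, left in edx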