Bug 976110 - Part 1: Optimize signed integer division by constants; r=sunfish
authorMauricio Collares Neto <mau@ric.io>
Sat, 19 Apr 2014 10:37:51 -0700
changeset 197854 c7925215ca327f482c3b439c1148b8c63cf410e6
parent 197853 cc10a90487bb8ed0948db260b7b6c3da796d0210
child 197855 65aa67280bfb399219ab971ac1bd6856ff19e8f3
push id3624
push userasasaki@mozilla.com
push dateMon, 09 Jun 2014 21:49:01 +0000
treeherdermozilla-beta@b1a5da15899a [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewerssunfish
bugs976110
milestone31.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 976110 - Part 1: Optimize signed integer division by constants; r=sunfish
js/src/assembler/assembler/X86Assembler.h
js/src/jit/shared/Assembler-x86-shared.h
js/src/jit/shared/CodeGenerator-shared.cpp
js/src/jit/shared/CodeGenerator-shared.h
js/src/jit/shared/CodeGenerator-x86-shared.cpp
js/src/jit/shared/CodeGenerator-x86-shared.h
js/src/jit/shared/LIR-x86-shared.h
js/src/jit/shared/Lowering-x86-shared.cpp
js/src/jit/x64/LOpcodes-x64.h
js/src/jit/x86/LOpcodes-x86.h
--- a/js/src/assembler/assembler/X86Assembler.h
+++ b/js/src/assembler/assembler/X86Assembler.h
@@ -361,16 +361,17 @@ private:
 
         GROUP2_OP_SHL = 4,
         GROUP2_OP_SHR = 5,
         GROUP2_OP_SAR = 7,
 
         GROUP3_OP_TEST = 0,
         GROUP3_OP_NOT  = 2,
         GROUP3_OP_NEG  = 3,
+        GROUP3_OP_IMUL = 5,
         GROUP3_OP_DIV  = 6,
         GROUP3_OP_IDIV = 7,
 
         GROUP5_OP_INC   = 0,
         GROUP5_OP_DEC   = 1,
         GROUP5_OP_CALLN = 2,
         GROUP5_OP_JMPN  = 4,
         GROUP5_OP_PUSH  = 6,
@@ -1150,16 +1151,23 @@ public:
 #endif
 
     void imull_rr(RegisterID src, RegisterID dst)
     {
         spew("imull      %s, %s", nameIReg(4,src), nameIReg(4, dst));
         m_formatter.twoByteOp(OP2_IMUL_GvEv, dst, src);
     }
 
+    void imull_r(RegisterID multiplier)
+    {
+        spew("imull      %s",
+             nameIReg(4, multiplier));
+        m_formatter.oneByteOp(OP_GROUP3_Ev, GROUP3_OP_IMUL, multiplier);
+    }
+
     void imull_mr(int offset, RegisterID base, RegisterID dst)
     {
         spew("imull      %s0x%x(%s), %s",
              PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameIReg(4,dst));
         m_formatter.twoByteOp(OP2_IMUL_GvEv, dst, base, offset);
     }
 
     void imull_i32r(RegisterID src, int32_t value, RegisterID dst)
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -1057,22 +1057,28 @@ class AssemblerX86Shared
             break;
           case Operand::MEM_REG_DISP:
             masm.andl_mr(src.disp(), src.base(), dest.code());
             break;
           default:
             MOZ_ASSUME_UNREACHABLE("unexpected operand kind");
         }
     }
+    void imull(const Register &multiplier) {
+        masm.imull_r(multiplier.code());
+    }
     void imull(Imm32 imm, const Register &dest) {
         masm.imull_i32r(dest.code(), imm.value, dest.code());
     }
     void imull(const Register &src, const Register &dest) {
         masm.imull_rr(src.code(), dest.code());
     }
+    void imull(Imm32 imm, const Register &src, const Register &dest) {
+        masm.imull_i32r(src.code(), imm.value, dest.code());
+    }
     void imull(const Operand &src, const Register &dest) {
         switch (src.kind()) {
           case Operand::REG:
             masm.imull_rr(src.reg(), dest.code());
             break;
           case Operand::MEM_REG_DISP:
             masm.imull_mr(src.disp(), src.base(), dest.code());
             break;
--- a/js/src/jit/shared/CodeGenerator-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-shared.cpp
@@ -1002,16 +1002,84 @@ CodeGeneratorShared::addCacheLocations(c
         new (&runtimeData_[curIndex]) CacheLocation(iter->pc, iter->script);
         numLocations++;
     }
     JS_ASSERT(numLocations != 0);
     *numLocs = numLocations;
     return firstIndex;
 }
 
+ReciprocalMulConstants
+CodeGeneratorShared::computeDivisionConstants(int d) {
+    // In what follows, d is positive and is not a power of 2.
+    JS_ASSERT(d > 0 && (d & (d - 1)) != 0);
+
+    // Speeding up division by non power-of-2 constants is possible by
+    // calculating, during compilation, a value M such that high-order
+    // bits of M*n correspond to the result of the division. Formally,
+    // we compute values 0 <= M < 2^32 and 0 <= s < 31 such that
+    //         (M * n) >> (32 + s) = floor(n/d)    if n >= 0
+    //         (M * n) >> (32 + s) = ceil(n/d) - 1 if n < 0.
+    // The original presentation of this technique appears in Hacker's
+    // Delight, a book by Henry S. Warren, Jr.. A proof of correctness
+    // for our version follows.
+
+    // Define p = 32 + s, M = ceil(2^p/d), and assume that s satisfies
+    //                     M - 2^p/d <= 2^(s+1)/d.                 (1)
+    // (Observe that s = FloorLog32(d) satisfies this, because in this
+    // case d <= 2^(s+1) and so the RHS of (1) is at least one). Then,
+    //
+    // a) If s <= FloorLog32(d), then M <= 2^32 - 1.
+    // Proof: Indeed, M is monotone in s and, for s = FloorLog32(d),
+    // the inequalities 2^31 > d >= 2^s + 1 readily imply
+    //    2^p / d  = 2^p/(d - 1) * (d - 1)/d
+    //            <= 2^32 * (1 - 1/d) < 2 * (2^31 - 1) = 2^32 - 2.
+    // The claim follows by applying the ceiling function.
+    //
+    // b) For any 0 <= n < 2^31, floor(Mn/2^p) = floor(n/d).
+    // Proof: Put x = floor(Mn/2^p); it's the unique integer for which
+    //                    Mn/2^p - 1 < x <= Mn/2^p.                (2)
+    // Using M >= 2^p/d on the LHS and (1) on the RHS, we get
+    //           n/d - 1 < x <= n/d + n/(2^31 d) < n/d + 1/d.
+    // Since x is an integer, it's not in the interval (n/d, (n+1)/d),
+    // and so n/d - 1 < x <= n/d, which implies x = floor(n/d).
+    //
+    // c) For any -2^31 <= n < 0, floor(Mn/2^p) + 1 = ceil(n/d).
+    // Proof: The proof is similar. Equation (2) holds as above. Using
+    // M > 2^p/d (d isn't a power of 2) on the RHS and (1) on the LHS,
+    //                 n/d + n/(2^31 d) - 1 < x < n/d.
+    // Using n >= -2^31 and summing 1,
+    //                  n/d - 1/d < x + 1 < n/d + 1.
+    // Since x + 1 is an integer, this implies n/d <= x + 1 < n/d + 1.
+    // In other words, x + 1 = ceil(n/d).
+    //
+    // Condition (1) isn't necessary for the existence of M and s with
+    // the properties above. Hacker's Delight provides a slightly less
+    // restrictive condition when d >= 196611, at the cost of a 3-page
+    // proof of correctness.
+
+    // Note that, since d*M - 2^p = d - (2^p)%d, (1) can be written as
+    //                   2^(s+1) >= d - (2^p)%d.
+    // We now compute the least s with this property...
+
+    int32_t shift = 0;
+    while ((int64_t(1) << (shift+1)) + (int64_t(1) << (shift+32)) % d < d)
+        shift++;
+
+    // ...and the corresponding M. This may not fit in a signed 32-bit
+    // integer; we will compute (M - 2^32) * n + (2^32 * n) instead of
+    // M * n if this is the case (cf. item (a) above).
+    ReciprocalMulConstants rmc;
+    rmc.multiplier = int32_t((int64_t(1) << (shift+32))/d + 1);
+    rmc.shift_amount = shift;
+
+    return rmc;
+}
+
+
 #ifdef JS_TRACE_LOGGING
 
 bool
 CodeGeneratorShared::emitTracelogScript(bool isStart)
 {
     RegisterSet regs = RegisterSet::Volatile();
     Register logger = regs.takeGeneral();
     Register script = regs.takeGeneral();
--- a/js/src/jit/shared/CodeGenerator-shared.h
+++ b/js/src/jit/shared/CodeGenerator-shared.h
@@ -40,16 +40,21 @@ struct PatchableBackedgeInfo
     Label *loopHeader;
     Label *interruptCheck;
 
     PatchableBackedgeInfo(CodeOffsetJump backedge, Label *loopHeader, Label *interruptCheck)
       : backedge(backedge), loopHeader(loopHeader), interruptCheck(interruptCheck)
     {}
 };
 
+struct ReciprocalMulConstants {
+    int32_t multiplier;
+    int32_t shift_amount;
+};
+
 class CodeGeneratorShared : public LInstructionVisitor
 {
     js::Vector<OutOfLineCode *, 0, SystemAllocPolicy> outOfLineCode_;
     OutOfLineCode *oolIns;
 
     MacroAssembler &ensureMasm(MacroAssembler *masm);
     mozilla::Maybe<MacroAssembler> maybeMasm_;
 
@@ -397,16 +402,17 @@ class CodeGeneratorShared : public LInst
     inline OutOfLineCode *oolCallVM(const VMFunctionsModal &f, LInstruction *ins,
                                     const ArgSeq &args, const StoreOutputTo &out)
     {
         return oolCallVM(f[gen->info().executionMode()], ins, args, out);
     }
 
     bool addCache(LInstruction *lir, size_t cacheIndex);
     size_t addCacheLocations(const CacheLocationList &locs, size_t *numLocs);
+    ReciprocalMulConstants computeDivisionConstants(int d);
 
   protected:
     bool addOutOfLineCode(OutOfLineCode *code);
     bool hasOutOfLineCode() { return !outOfLineCode_.empty(); }
     bool generateOutOfLineCode();
 
     Label *labelForBackedgeWithImplicitCheck(MBasicBlock *mir);
 
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -17,16 +17,17 @@
 #include "jit/RangeAnalysis.h"
 #include "vm/TraceLogging.h"
 
 #include "jit/shared/CodeGenerator-shared-inl.h"
 
 using namespace js;
 using namespace js::jit;
 
+using mozilla::Abs;
 using mozilla::FloatingPoint;
 using mozilla::FloorLog2;
 using mozilla::NegativeInfinity;
 using mozilla::SpecificNaN;
 
 namespace js {
 namespace jit {
 
@@ -914,16 +915,98 @@ CodeGeneratorX86Shared::visitDivPowTwoI(
         if (!mir->isTruncated() && !bailoutIf(Assembler::Overflow, ins->snapshot()))
             return false;
     }
 
     return true;
 }
 
 bool
+CodeGeneratorX86Shared::visitDivOrModConstantI(LDivOrModConstantI *ins) {
+    Register lhs = ToRegister(ins->numerator());
+    Register output = ToRegister(ins->output());
+    int32_t d = ins->denominator();
+
+    // This emits the division answer into edx or the modulus answer into eax.
+    JS_ASSERT(output == eax || output == edx);
+    JS_ASSERT(lhs != eax && lhs != edx);
+
+    // The absolute value of the denominator isn't a power of 2 (see LDivPowTwoI
+    // and LModPowTwoI).
+    JS_ASSERT((Abs(d) & (Abs(d) - 1)) != 0);
+
+    // We will first divide by Abs(d), and negate the answer if d is negative.
+    // If desired, this can be avoided by generalizing computeDivisionConstants.
+    ReciprocalMulConstants rmc = computeDivisionConstants(Abs(d));
+
+    // As explained in the comments of computeDivisionConstants, we first compute
+    // X >> (32 + shift), where X is either (rmc.multiplier * n) if the multiplier
+    // is non-negative or (rmc.multiplier * n) + (2^32 * n) otherwise. This is the
+    // desired division result if n is non-negative, and is one less than the result
+    // otherwise.
+    masm.movl(lhs, eax);
+    masm.movl(Imm32(rmc.multiplier), edx);
+    masm.imull(edx);
+    if (rmc.multiplier < 0)
+        masm.addl(lhs, edx);
+    masm.sarl(Imm32(rmc.shift_amount), edx);
+
+    // We'll subtract -1 instead of adding 1, because (n < 0 ? -1 : 0) can be
+    // computed with just a sign-extending shift of 31 bits.
+    if (ins->canBeNegativeDividend()) {
+        masm.movl(lhs, eax);
+        masm.sarl(Imm32(31), eax);
+        masm.subl(eax, edx);
+    }
+
+    // After this, edx contains the correct division result.
+    if (d < 0)
+        masm.negl(edx);
+
+    if (output == eax) {
+        masm.imull(Imm32(-d), edx, eax);
+        masm.addl(lhs, eax);
+    }
+
+    if (!ins->mir()->isTruncated()) {
+        if (output == edx) {
+            // This is a division op. Multiply the obtained value by d to check if
+            // the correct answer is an integer. This cannot overflow, since |d| > 1.
+            masm.imull(Imm32(d), edx, eax);
+            masm.cmpl(lhs, eax);
+            if (!bailoutIf(Assembler::NotEqual, ins->snapshot()))
+                return false;
+
+            // If lhs is zero and the divisor is negative, the answer should have
+            // been -0.
+            if (d < 0) {
+                masm.testl(lhs, lhs);
+                if (!bailoutIf(Assembler::Zero, ins->snapshot()))
+                    return false;
+            }
+        } else if (ins->canBeNegativeDividend()) {
+            // This is a mod op. If the computed value is zero and lhs
+            // is negative, the answer should have been -0.
+            Label done;
+
+            masm.cmpl(lhs, Imm32(0));
+            masm.j(Assembler::GreaterThanOrEqual, &done);
+
+            masm.testl(eax, eax);
+            if (!bailoutIf(Assembler::Zero, ins->snapshot()))
+                return false;
+
+            masm.bind(&done);
+        }
+    }
+
+    return true;
+}
+
+bool
 CodeGeneratorX86Shared::visitDivI(LDivI *ins)
 {
     Register remainder = ToRegister(ins->remainder());
     Register lhs = ToRegister(ins->lhs());
     Register rhs = ToRegister(ins->rhs());
     Register output = ToRegister(ins->output());
 
     MDiv *mir = ins->mir();
--- a/js/src/jit/shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.h
@@ -119,16 +119,17 @@ class CodeGeneratorX86Shared : public Co
     virtual bool visitSqrtD(LSqrtD *ins);
     virtual bool visitSqrtF(LSqrtF *ins);
     virtual bool visitPowHalfD(LPowHalfD *ins);
     virtual bool visitAddI(LAddI *ins);
     virtual bool visitSubI(LSubI *ins);
     virtual bool visitMulI(LMulI *ins);
     virtual bool visitDivI(LDivI *ins);
     virtual bool visitDivPowTwoI(LDivPowTwoI *ins);
+    virtual bool visitDivOrModConstantI(LDivOrModConstantI *ins);
     virtual bool visitModI(LModI *ins);
     virtual bool visitModPowTwoI(LModPowTwoI *ins);
     virtual bool visitBitNotI(LBitNotI *ins);
     virtual bool visitBitOpI(LBitOpI *ins);
     virtual bool visitShiftI(LShiftI *ins);
     virtual bool visitUrshD(LUrshD *ins);
     virtual bool visitTestIAndBranch(LTestIAndBranch *test);
     virtual bool visitTestDAndBranch(LTestDAndBranch *test);
--- a/js/src/jit/shared/LIR-x86-shared.h
+++ b/js/src/jit/shared/LIR-x86-shared.h
@@ -71,16 +71,47 @@ class LDivPowTwoI : public LBinaryMath<0
     bool negativeDivisor() const {
         return negativeDivisor_;
     }
     MDiv *mir() const {
         return mir_->toDiv();
     }
 };
 
+class LDivOrModConstantI : public LInstructionHelper<1, 1, 1>
+{
+    const int32_t denominator_;
+
+  public:
+    LIR_HEADER(DivOrModConstantI)
+
+    LDivOrModConstantI(const LAllocation &lhs, int32_t denominator, const LDefinition& temp)
+    : denominator_(denominator)
+    {
+        setOperand(0, lhs);
+        setTemp(0, temp);
+    }
+
+    const LAllocation *numerator() {
+        return getOperand(0);
+    }
+    int32_t denominator() const {
+        return denominator_;
+    }
+    MBinaryArithInstruction *mir() const {
+        JS_ASSERT(mir_->isDiv() || mir_->isMod());
+        return static_cast<MBinaryArithInstruction *>(mir_);
+    }
+    bool canBeNegativeDividend() const {
+        if (mir_->isMod())
+            return mir_->toMod()->canBeNegativeDividend();
+        return mir_->toDiv()->canBeNegativeDividend();
+    }
+};
+
 class LModI : public LBinaryMath<1>
 {
   public:
     LIR_HEADER(ModI)
 
     LModI(const LAllocation &lhs, const LAllocation &rhs, const LDefinition &temp) {
         setOperand(0, lhs);
         setOperand(1, rhs);
--- a/js/src/jit/shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/shared/Lowering-x86-shared.cpp
@@ -133,35 +133,39 @@ LIRGeneratorX86Shared::lowerDivI(MDiv *d
     if (div->isUnsigned())
         return lowerUDiv(div);
 
     // Division instructions are slow. Division by constant denominators can be
     // rewritten to use other instructions.
     if (div->rhs()->isConstant()) {
         int32_t rhs = div->rhs()->toConstant()->value().toInt32();
 
-        // Check for division by a power of two, which is an easy and
-        // important case to optimize. Note that other optimizations
-        // are also possible: division by other constants can be
-        // optimized by a reciprocal multiplication technique.
+        // Division by powers of two can be done by shifting, and division by
+        // other numbers can be done by a reciprocal multiplication technique.
         int32_t shift = FloorLog2(Abs(rhs));
         if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) {
             LAllocation lhs = useRegisterAtStart(div->lhs());
             LDivPowTwoI *lir;
             if (!div->canBeNegativeDividend()) {
                 // Numerator is unsigned, so does not need adjusting.
                 lir = new(alloc()) LDivPowTwoI(lhs, lhs, shift, rhs < 0);
             } else {
                 // Numerator is signed, and needs adjusting, and an extra
                 // lhs copy register is needed.
                 lir = new(alloc()) LDivPowTwoI(lhs, useRegister(div->lhs()), shift, rhs < 0);
             }
             if (div->fallible() && !assignSnapshot(lir, Bailout_BaselineInfo))
                 return false;
             return defineReuseInput(lir, div, 0);
+        } else if (rhs != 0) {
+            LDivOrModConstantI *lir;
+            lir = new(alloc()) LDivOrModConstantI(useRegister(div->lhs()), rhs, tempFixed(eax));
+            if (div->fallible() && !assignSnapshot(lir, Bailout_BaselineInfo))
+                return false;
+            return defineFixed(lir, div, LAllocation(AnyRegister(edx)));
         }
     }
 
     LDivI *lir = new(alloc()) LDivI(useRegister(div->lhs()), useRegister(div->rhs()),
                                     tempFixed(edx));
     if (div->fallible() && !assignSnapshot(lir, Bailout_BaselineInfo))
         return false;
     return defineFixed(lir, div, LAllocation(AnyRegister(eax)));
@@ -176,16 +180,22 @@ LIRGeneratorX86Shared::lowerModI(MMod *m
     if (mod->rhs()->isConstant()) {
         int32_t rhs = mod->rhs()->toConstant()->value().toInt32();
         int32_t shift = FloorLog2(Abs(rhs));
         if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) {
             LModPowTwoI *lir = new(alloc()) LModPowTwoI(useRegisterAtStart(mod->lhs()), shift);
             if (mod->fallible() && !assignSnapshot(lir, Bailout_BaselineInfo))
                 return false;
             return defineReuseInput(lir, mod, 0);
+        } else if (rhs != 0) {
+            LDivOrModConstantI *lir;
+            lir = new(alloc()) LDivOrModConstantI(useRegister(mod->lhs()), rhs, tempFixed(edx));
+            if (mod->fallible() && !assignSnapshot(lir, Bailout_BaselineInfo))
+                return false;
+            return defineFixed(lir, mod, LAllocation(AnyRegister(eax)));
         }
     }
 
     LModI *lir = new(alloc()) LModI(useRegister(mod->lhs()),
                                     useRegister(mod->rhs()),
                                     tempFixed(eax));
     if (mod->fallible() && !assignSnapshot(lir, Bailout_BaselineInfo))
         return false;
--- a/js/src/jit/x64/LOpcodes-x64.h
+++ b/js/src/jit/x64/LOpcodes-x64.h
@@ -8,16 +8,17 @@
 #define jit_x64_LOpcodes_x64_h
 
 #define LIR_CPU_OPCODE_LIST(_)      \
     _(Box)                          \
     _(Unbox)                        \
     _(UnboxFloatingPoint)           \
     _(DivI)                         \
     _(DivPowTwoI)                   \
+    _(DivOrModConstantI)            \
     _(ModI)                         \
     _(ModPowTwoI)                   \
     _(PowHalfD)                     \
     _(AsmJSUInt32ToDouble)          \
     _(AsmJSUInt32ToFloat32)         \
     _(AsmJSLoadFuncPtr)             \
     _(UDivOrMod)
 
--- a/js/src/jit/x86/LOpcodes-x86.h
+++ b/js/src/jit/x86/LOpcodes-x86.h
@@ -9,16 +9,17 @@
 
 #define LIR_CPU_OPCODE_LIST(_)  \
     _(Unbox)                    \
     _(UnboxFloatingPoint)       \
     _(Box)                      \
     _(BoxFloatingPoint)         \
     _(DivI)                     \
     _(DivPowTwoI)               \
+    _(DivOrModConstantI)        \
     _(ModI)                     \
     _(ModPowTwoI)               \
     _(PowHalfD)                 \
     _(AsmJSUInt32ToDouble)      \
     _(AsmJSUInt32ToFloat32)     \
     _(AsmJSLoadFuncPtr)         \
     _(UDivOrMod)