Bug 822077; specialise PushRegs for ARM using STM; r=mjrosenb
authorNicholas Cameron <ncameron@mozilla.com>
Sun, 03 Feb 2013 13:40:05 +1300
changeset 131761 2716cc23146718d7134cbf4ea1cc6e71a7de3f37
parent 131760 f48618e815d1e2aace372f03c1ddb604c6215e64
child 131762 b2747f349019ef24c5aa0fcc5842254099dfe5ed
push id2323
push userbbajaj@mozilla.com
push dateMon, 01 Apr 2013 19:47:02 +0000
treeherdermozilla-beta@7712be144d91 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersmjrosenb
bugs822077
milestone21.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 822077; specialise PushRegs for ARM using STM; r=mjrosenb
js/src/ion/IonMacroAssembler.cpp
js/src/ion/RegisterSets.h
js/src/ion/arm/MacroAssembler-arm.cpp
js/src/ion/arm/MacroAssembler-arm.h
--- a/js/src/ion/IonMacroAssembler.cpp
+++ b/js/src/ion/IonMacroAssembler.cpp
@@ -68,50 +68,94 @@ MacroAssembler::guardTypeSet(const T &ad
 template void MacroAssembler::guardTypeSet(const Address &address, const types::TypeSet *types,
                                            Register scratch, Label *mismatched);
 template void MacroAssembler::guardTypeSet(const ValueOperand &value, const types::TypeSet *types,
                                            Register scratch, Label *mismatched);
 
 void
 MacroAssembler::PushRegsInMask(RegisterSet set)
 {
-    size_t diff = set.gprs().size() * STACK_SLOT_SIZE +
-                  set.fpus().size() * sizeof(double);
-
-    reserveStack(diff);
+    int32_t diffF = set.fpus().size() * sizeof(double); // bytes of float registers still to be stored
+    int32_t diffG = set.gprs().size() * STACK_SLOT_SIZE; // bytes of general registers still to be stored
 
-    for (GeneralRegisterIterator iter(set.gprs()); iter.more(); iter++) {
-        diff -= STACK_SLOT_SIZE;
-        storePtr(*iter, Address(StackPointer, diff));
+    reserveStack(diffG); // the general registers get their own area, reserved first
+#ifdef JS_CPU_ARM
+    if (set.gprs().size() > 1) { // two or more registers: build a single multi-register store
+        startDataTransferM(IsStore, StackPointer, IA, NoWriteBack);
+        for (GeneralRegisterIterator iter(set.gprs()); iter.more(); iter++) {
+            diffG -= STACK_SLOT_SIZE; // bookkeeping only, so the assertion below holds on this path too
+            transferReg(*iter);
+        }
+        finishDataTransfer();
+    } else
+#endif
+    { // generic path: one storePtr per register, from the top offset of the area downwards
+        for (GeneralRegisterIterator iter(set.gprs()); iter.more(); iter++) {
+            diffG -= STACK_SLOT_SIZE;
+            storePtr(*iter, Address(StackPointer, diffG));
+        }
     }
+    JS_ASSERT(diffG == 0); // every reserved general-register byte was accounted for
+
+    reserveStack(diffF); // the float registers go in a second area, reserved after the general one
+#ifdef JS_CPU_ARM
+    diffF -= transferMultipleByRuns(set.fpus(), IsStore, StackPointer, IA); // returns the bytes it stored
+#else
     for (FloatRegisterIterator iter(set.fpus()); iter.more(); iter++) {
-        diff -= sizeof(double);
-        storeDouble(*iter, Address(StackPointer, diff));
+        diffF -= sizeof(double);
+        storeDouble(*iter, Address(StackPointer, diffF));
     }
+#endif
+    JS_ASSERT(diffF == 0); // every reserved float-register byte was accounted for
 }
 
 void
 MacroAssembler::PopRegsInMaskIgnore(RegisterSet set, RegisterSet ignore)
 {
-    size_t diff = set.gprs().size() * STACK_SLOT_SIZE +
-                  set.fpus().size() * sizeof(double);
-    size_t reserved = diff;
+    int32_t diffG = set.gprs().size() * STACK_SLOT_SIZE; // bytes of general registers still to be reloaded
+    int32_t diffF = set.fpus().size() * sizeof(double); // bytes of float registers still to be reloaded
+    const int32_t reservedG = diffG; // size of the general-register area, freed last
+    const int32_t reservedF = diffF; // size of the float-register area, freed first
 
-    for (GeneralRegisterIterator iter(set.gprs()); iter.more(); iter++) {
-        diff -= STACK_SLOT_SIZE;
-        if (!ignore.has(*iter))
-            loadPtr(Address(StackPointer, diff), *iter);
+#ifdef JS_CPU_ARM
+    // ARM can load multiple registers at once, but only if we want back all
+    // the registers we previously saved to the stack.
+    if (ignore.empty(true)) {
+        diffF -= transferMultipleByRuns(set.fpus(), IsLoad, StackPointer, IA); // returns the bytes it loaded
+    } else
+#endif
+    { // generic path: one loadDouble per register, skipping those in ignore
+        for (FloatRegisterIterator iter(set.fpus()); iter.more(); iter++) {
+            diffF -= sizeof(double); // the slot is consumed even when its register is ignored
+            if (!ignore.has(*iter))
+                loadDouble(Address(StackPointer, diffF), *iter);
+        }
     }
-    for (FloatRegisterIterator iter(set.fpus()); iter.more(); iter++) {
-        diff -= sizeof(double);
-        if (!ignore.has(*iter))
-            loadDouble(Address(StackPointer, diff), *iter);
+    freeStack(reservedF); // release the float area before touching the general one
+    JS_ASSERT(diffF == 0);
+
+#ifdef JS_CPU_ARM
+    if (set.gprs().size() > 1 && ignore.empty(false)) { // several registers and none ignored: one multi-register load
+        startDataTransferM(IsLoad, StackPointer, IA, NoWriteBack);
+        for (GeneralRegisterIterator iter(set.gprs()); iter.more(); iter++) {
+            diffG -= STACK_SLOT_SIZE; // bookkeeping only, so the assertion below holds on this path too
+            transferReg(*iter);
+        }
+        finishDataTransfer();
+    } else
+#endif
+    { // generic path: one loadPtr per register, skipping those in ignore
+        for (GeneralRegisterIterator iter(set.gprs()); iter.more(); iter++) {
+            diffG -= STACK_SLOT_SIZE; // the slot is consumed even when its register is ignored
+            if (!ignore.has(*iter))
+                loadPtr(Address(StackPointer, diffG), *iter);
+        }
     }
-
-    freeStack(reserved);
+    freeStack(reservedG); // release the general-register area
+    JS_ASSERT(diffG == 0);
 }
 
 template<typename T>
 void
 MacroAssembler::loadFromTypedArray(int arrayType, const T &src, AnyRegister dest, Register temp,
                                    Label *fail)
 {
     switch (arrayType) {
--- a/js/src/ion/RegisterSets.h
+++ b/js/src/ion/RegisterSets.h
@@ -313,16 +313,21 @@ class TypedRegisterSet
         return TypedRegisterSet(T::Codes::AllocatableMask & T::Codes::VolatileMask);
     }
     void intersect(TypedRegisterSet other) {
         bits_ &= ~other.bits_;
     }
     bool has(T reg) const {
         return !!(bits_ & (1 << reg.code()));
     }
+    bool hasNextRegister(T reg) const { // true if the set holds the register numbered reg.code()+1
+        if (reg.code() >= sizeof(bits_)*8 - 1) // top bit has no successor; also keeps the shift below in range
+            return false;
+        return !!(bits_ & (1 << (reg.code()+1)));
+    }
     void addUnchecked(T reg) {
         bits_ |= (1 << reg.code());
     }
     void add(T reg) {
         JS_ASSERT(!has(reg));
         addUnchecked(reg);
     }
     // Determemine if some register are still allocated.  This function should
@@ -339,22 +344,33 @@ class TypedRegisterSet
         bits_ &= ~(1 << reg.code());
     }
     T getAny() const {
         JS_ASSERT(!empty());
         int ireg;
         JS_FLOOR_LOG2(ireg, bits_);
         return T::FromCode(ireg);
     }
+    T getFirst() const { // register at the lowest set bit; the set must not be empty
+        JS_ASSERT(!empty());
+        const int lowestBit = js_bitscan_ctz32(bits_);
+        return T::FromCode(lowestBit);
+    }
     T takeAny() {
         JS_ASSERT(!empty());
         T reg = getAny();
         take(reg);
         return reg;
     }
+    T takeFirst() { // removes and returns the register getFirst() reports
+        JS_ASSERT(!empty());
+        const T lowest = getFirst();
+        take(lowest);
+        return lowest;
+    }
     void clear() {
         bits_ = 0;
     }
     uint32_t bits() const {
         return bits_;
     }
     uint32_t size() const {
         uint32_t sum2  = (bits_ & 0x55555555) + ((bits_ & 0xaaaaaaaa) >> 1);
@@ -528,16 +544,17 @@ class RegisterSet {
     void maybeTake(TypedOrValueRegister reg) {
         if (reg.hasValue())
             maybeTake(reg.valueReg());
         else if (reg.hasTyped())
             maybeTake(reg.typedReg());
     }
 };
 
+// iterates backwards, that is, rn to r0
 template <typename T>
 class TypedRegisterIterator
 {
     TypedRegisterSet<T> regset_;
 
   public:
     TypedRegisterIterator(TypedRegisterSet<T> regset) : regset_(regset)
     { }
@@ -547,23 +564,58 @@ class TypedRegisterIterator
     bool more() const {
         return !regset_.empty();
     }
     TypedRegisterIterator<T> operator ++(int) {
         TypedRegisterIterator<T> old(*this);
         regset_.takeAny();
         return old;
     }
+    TypedRegisterIterator<T>& operator ++() { // pre-increment: advances in place, no snapshot copy
+        regset_.takeAny(); // drops the register operator* currently yields
+        return *this;
+    }
     T operator *() const {
         return regset_.getAny();
     }
 };
 
+// Iterates forwards, that is r0 to rn: each step visits and then drops the set's getFirst() register.
+template <typename T>
+class TypedRegisterForwardIterator
+{
+    TypedRegisterSet<T> regset_; // private copy of the set; registers are removed as they are visited
+
+  public:
+    TypedRegisterForwardIterator(TypedRegisterSet<T> regset) : regset_(regset)
+    { }
+    TypedRegisterForwardIterator(const TypedRegisterForwardIterator &other) : regset_(other.regset_)
+    { }
+
+    bool more() const { // true while unvisited registers remain
+        return !regset_.empty();
+    }
+    TypedRegisterForwardIterator<T> operator ++(int) { // post-increment: returns the pre-advance state
+        TypedRegisterForwardIterator<T> old(*this); // snapshot must be this class, not the backward iterator
+        regset_.takeFirst();
+        return old;
+    }
+    TypedRegisterForwardIterator<T>& operator ++() { // pre-increment: advances in place
+        regset_.takeFirst();
+        return *this;
+    }
+    T operator *() const { // current register; requires more()
+        return regset_.getFirst();
+    }
+};
+
 typedef TypedRegisterIterator<Register> GeneralRegisterIterator;
 typedef TypedRegisterIterator<FloatRegister> FloatRegisterIterator;
+typedef TypedRegisterForwardIterator<Register> GeneralRegisterForwardIterator;
+typedef TypedRegisterForwardIterator<FloatRegister> FloatRegisterForwardIterator;
 
 class AnyRegisterIterator
 {
     GeneralRegisterIterator geniter_;
     FloatRegisterIterator floatiter_;
 
   public:
     AnyRegisterIterator()
--- a/js/src/ion/arm/MacroAssembler-arm.cpp
+++ b/js/src/ion/arm/MacroAssembler-arm.cpp
@@ -1378,16 +1378,47 @@ MacroAssemblerARM::ma_vstr(VFPRegister s
 }
 void
 MacroAssemblerARM::ma_vstr(VFPRegister src, Register base, Register index, int32_t shift, Condition cc)
 {
     as_add(ScratchRegister, base, lsl(index, shift), NoSetCond, cc);
     ma_vstr(src, Operand(ScratchRegister, 0), cc);
 }
 
+
+int32_t
+MacroAssemblerARM::transferMultipleByRuns(FloatRegisterSet set, LoadStore ls,
+                                          Register rm, DTMMode mode)
+{
+    int32_t delta = 0; // never read uninitialised, even if JS_NOT_REACHED below does not abort
+    if (mode == IA) {
+        delta = int32_t(sizeof(double));
+    } else if (mode == DB) {
+        delta = -int32_t(sizeof(double)); // negate as a signed value, not as size_t
+    } else {
+        JS_NOT_REACHED("Invalid data transfer addressing mode");
+    }
+
+    int32_t offset = 0; // signed byte total of all transfers; used below to restore rm
+    FloatRegisterForwardIterator iter(set);
+    while (iter.more()) { // one multi-register transfer per run of consecutively numbered registers
+        startFloatTransferM(ls, rm, mode, WriteBack);
+        int32_t reg = (*iter).code_;
+        do {
+            offset += delta;
+            transferFloatReg(*iter);
+        } while ((++iter).more() && (*iter).code_ == ++reg); // stop at the first gap in numbering
+        finishFloatTransfer();
+    }
+
+    JS_ASSERT(offset == int32_t(set.size() * sizeof(double)) * (mode == DB ? -1 : 1));
+    ma_sub(Imm32(offset), rm); // undo the write-back so rm holds its initial value again
+    return offset;
+}
+
 bool
 MacroAssemblerARMCompat::buildFakeExitFrame(const Register &scratch, uint32_t *offset)
 {
     DebugOnly<uint32_t> initialDepth = framePushed();
     uint32_t descriptor = MakeFrameDescriptor(framePushed(), IonFrame_OptimizedJS);
 
     Push(Imm32(descriptor)); // descriptor_
 
--- a/js/src/ion/arm/MacroAssembler-arm.h
+++ b/js/src/ion/arm/MacroAssembler-arm.h
@@ -325,16 +325,24 @@ class MacroAssemblerARM : public Assembl
     // calls an Ion function, assumes that the stack is untouched (8 byte alinged)
     void ma_callIon(const Register reg);
     // callso an Ion function, assuming that sp has already been decremented
     void ma_callIonNoPush(const Register reg);
     // calls an ion function, assuming that the stack is currently not 8 byte aligned
     void ma_callIonHalfPush(const Register reg);
 
     void ma_call(void *dest);
+
+    // Float registers can only be loaded/stored in contiguous runs
+    // when using vstm/vldm.
+    // This function breaks set into contiguous runs and loads/stores
+    // them at [rm]. rm will be modified, but returned to its initial value.
+    // Returns the offset from [rm] for the logical next load/store.
+    int32_t transferMultipleByRuns(FloatRegisterSet set, LoadStore ls,
+                                   Register rm, DTMMode mode);
 };
 
 class MacroAssemblerARMCompat : public MacroAssemblerARM
 {
     // Number of bytes the stack is adjusted inside a call to C. Calls to C may
     // not be nested.
     bool inCall_;
     uint32_t args_;