Bug 520905 - collapse callee saved register spills/reloads into LDMIA/STMIA instructions, r=vlad.
author Julian Seward <jseward@acm.org>
Fri, 20 Nov 2009 10:11:33 -0800
changeset 35352 92ea7f43feb6c2317cd889883137e2d577c2d3a9
parent 35351 a71067721eb03df15dacd667f45fc955f78293aa
child 35353 021eebf49df54b23819eb880f448ffadfe6a9750
push id 10560
push user rsayre@mozilla.com
push date Tue, 01 Dec 2009 18:15:12 +0000
treeherder mozilla-central@e2860a4dcf0c [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers vlad
bugs 520905
milestone 1.9.3a1pre
Bug 520905 - collapse callee saved register spills/reloads into LDMIA/STMIA instructions, r=vlad.
js/src/nanojit/Assembler.cpp
js/src/nanojit/Native.h
js/src/nanojit/NativeARM.cpp
js/src/nanojit/NativeARM.h
js/src/nanojit/NativePPC.h
js/src/nanojit/NativeSparc.h
js/src/nanojit/NativeX64.h
js/src/nanojit/Nativei386.h
--- a/js/src/nanojit/Assembler.cpp
+++ b/js/src/nanojit/Assembler.cpp
@@ -1764,17 +1764,25 @@ namespace nanojit
      * current & !saved    evict current  (unionRegisterState would keep)
      * !current & saved    keep saved
      */
     void Assembler::intersectRegisterState(RegAlloc& saved)
     {
         // evictions and pops first
         RegisterMask skip = 0;
         verbose_only(bool shouldMention=false; )
-        for (Register r=FirstReg; r <= LastReg; r = nextreg(r))
+        // The obvious thing to do here is to iterate from FirstReg to LastReg.
+        // viz: for (Register r=FirstReg; r <= LastReg; r = nextreg(r)) ...
+        // However, on ARM that causes lower-numbered integer registers
+        // to be saved at higher addresses, which inhibits the formation
+        // of load/store multiple instructions.  Hence iterate the loop the
+        // other way.  The "r <= LastReg" guards against wraparound in
+        // the case where Register is treated as unsigned and FirstReg is zero.
+        for (Register r=LastReg; r >= FirstReg && r <= LastReg;
+                                 r = prevreg(r))
         {
             LIns * curins = _allocator.getActive(r);
             LIns * savedins = saved.getActive(r);
             if (curins == savedins)
             {
                 //verbose_only( if (curins) verbose_outputf("                                              skip %s", regNames[r]); )
                 skip |= rmask(r);
             }
--- a/js/src/nanojit/Native.h
+++ b/js/src/nanojit/Native.h
@@ -122,16 +122,25 @@ namespace nanojit {
 #endif
 
 #ifndef NJ_JTBL_SUPPORTED
 #  define NJ_JTBL_SUPPORTED 0
 #endif
 
 namespace nanojit {
 
+    inline Register nextreg(Register r) {
+        return Register(r+1);
+    }
+
+    inline Register prevreg(Register r) {
+        return Register(r-1);
+    }
+
+
     class Fragment;
     struct SideExit;
     struct SwitchInfo;
 
     struct GuardRecord
     {
         void* jmp;
         GuardRecord* next;
--- a/js/src/nanojit/NativeARM.cpp
+++ b/js/src/nanojit/NativeARM.cpp
@@ -959,16 +959,113 @@ branch_is_B(NIns* branch)
 }
 
 static inline bool
 branch_is_LDR_PC(NIns* branch)
 {
     return (*branch & 0x0F7FF000) == 0x051FF000;
 }
 
+// Is this an instruction of the form  ldr/str reg, [fp, #-imm] ?
+static inline bool
+is_ldstr_reg_fp_minus_imm(/*OUT*/uint32_t* isLoad, /*OUT*/uint32_t* rX,
+                          /*OUT*/uint32_t* immX, NIns i1)
+{
+    if ((i1 & 0xFFEF0000) != 0xE50B0000)
+        return false;
+    *isLoad = (i1 >> 20) & 1;
+    *rX     = (i1 >> 12) & 0xF;
+    *immX   = i1 & 0xFFF;
+    return true;
+}
+
+// Is this an instruction of the form  ldmdb/stmdb fp, regset ?
+static inline bool
+is_ldstmdb_fp(/*OUT*/uint32_t* isLoad, /*OUT*/uint32_t* regSet, NIns i1)
+{
+    if ((i1 & 0xFFEF0000) != 0xE90B0000)
+        return false;
+    *isLoad = (i1 >> 20) & 1;
+    *regSet = i1 & 0xFFFF;
+    return true;
+}
+
+// Make an instruction of the form ldmdb/stmdb fp, regset
+static inline NIns
+mk_ldstmdb_fp(uint32_t isLoad, uint32_t regSet)
+{
+    return 0xE90B0000 | (regSet & 0xFFFF) | ((isLoad & 1) << 20);
+}
+
+// Compute the number of 1 bits in the lowest 16 bits of regSet
+static inline uint32_t
+size_of_regSet(uint32_t regSet)
+{
+   uint32_t x = regSet;
+   x = (x & 0x5555) + ((x >> 1) & 0x5555);
+   x = (x & 0x3333) + ((x >> 2) & 0x3333);
+   x = (x & 0x0F0F) + ((x >> 4) & 0x0F0F);
+   x = (x & 0x00FF) + ((x >> 8) & 0x00FF);
+   return x;
+}
+
+// See if two ARM instructions, i1 and i2, can be combined into one
+static bool
+do_peep_2_1(/*OUT*/NIns* merged, NIns i1, NIns i2)
+{
+    uint32_t rX, rY, immX, immY, isLoadX, isLoadY, regSet;
+    /*   ld/str rX, [fp, #-8]
+         ld/str rY, [fp, #-4]
+         ==>
+         ld/stmdb fp, {rX, rY}
+         when 
+         X < Y and X != fp and Y != fp and X != 15 and Y != 15
+    */
+    if (is_ldstr_reg_fp_minus_imm(&isLoadX, &rX, &immX, i1) &&
+        is_ldstr_reg_fp_minus_imm(&isLoadY, &rY, &immY, i2) &&
+        immX == 8 && immY == 4 && rX < rY &&
+        isLoadX == isLoadY &&
+        rX != FP && rY != FP &&
+         rX != 15 && rY != 15) {
+        *merged = mk_ldstmdb_fp(isLoadX, (1 << rX) | (1<<rY));
+        return true;
+    }
+    /*   ld/str   rX, [fp, #-N]
+         ld/stmdb fp, regset
+         ==>
+         ld/stmdb fp, union(regset,{rX})
+         when
+         regset is nonempty
+         X < all elements of regset
+         N == 4 * (1 + card(regset))
+         X != fp and X != 15
+    */
+    if (is_ldstr_reg_fp_minus_imm(&isLoadX, &rX, &immX, i1) &&
+        is_ldstmdb_fp(&isLoadY, &regSet, i2) &&
+        regSet != 0 &&
+        (regSet & ((1 << (rX + 1)) - 1)) == 0 &&
+        immX == 4 * (1 + size_of_regSet(regSet)) &&
+        isLoadX == isLoadY &&
+        rX != FP && rX != 15) {
+        *merged = mk_ldstmdb_fp(isLoadX, regSet | (1 << rX));
+        return true;
+    }
+    return false;
+}
+
+// Determine whether or not it's safe to look at _nIns[1].
+// Necessary condition for safe peepholing with do_peep_2_1.
+static inline bool
+does_next_instruction_exist(NIns* _nIns, NIns* codeStart, NIns* codeEnd,
+                            NIns* exitStart, NIns* exitEnd)
+{
+    return (exitStart <= _nIns && _nIns+1 < exitEnd) ||
+           (codeStart <= _nIns && _nIns+1 < codeEnd);
+}
+
 void
 Assembler::nPatchBranch(NIns* branch, NIns* target)
 {
     // Patch the jump in a loop
 
     //
     // There are two feasible cases here, the first of which has 2 sub-cases:
     //
@@ -1113,17 +1210,29 @@ Assembler::asm_restore(LInsp i, Reservat
         if (ARM_VFP && IsFpReg(r)) {
             if (isS8(d >> 2)) {
                 FLDD(r, FP, d);
             } else {
                 FLDD(r, IP, 0);
                 asm_add_imm(IP, FP, d);
             }
         } else {
+            NIns merged;
             LDR(r, FP, d);
+            // See if we can merge this load into an immediately following
+            // one, by creating or extending an LDM instruction.
+            if (/* is it safe to poke _nIns[1] ? */
+                does_next_instruction_exist(_nIns, codeStart, codeEnd, 
+                                                   exitStart, exitEnd)
+                && /* can we merge _nIns[0] into _nIns[1] ? */
+                   do_peep_2_1(&merged, _nIns[0], _nIns[1])) {
+                _nIns[1] = merged;
+                _nIns++;
+                verbose_only( asm_output("merge next into LDMDB"); )
+            }
         }
     }
     verbose_only(
         asm_output("        restore %s",_thisfrag->lirbuf->names->formatRef(i));
         )
 }
 
 void
@@ -1135,17 +1244,29 @@ Assembler::asm_spill(Register rr, int d,
         if (ARM_VFP && IsFpReg(rr)) {
             if (isS8(d >> 2)) {
                 FSTD(rr, FP, d);
             } else {
                 FSTD(rr, IP, 0);
                 asm_add_imm(IP, FP, d);
             }
         } else {
+            NIns merged;
             STR(rr, FP, d);
+            // See if we can merge this store into an immediately following
+            // one, by creating or extending an STM instruction.
+            if (/* is it safe to poke _nIns[1] ? */
+                does_next_instruction_exist(_nIns, codeStart, codeEnd, 
+                                                   exitStart, exitEnd)
+                && /* can we merge _nIns[0] into _nIns[1] ? */
+                   do_peep_2_1(&merged, _nIns[0], _nIns[1])) {
+                _nIns[1] = merged;
+                _nIns++;
+                verbose_only( asm_output("merge next into STMDB"); )
+            }
         }
     }
 }
 
 void
 Assembler::asm_load64(LInsp ins)
 {
     //asm_output("<<< load64");
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -190,17 +190,16 @@ static inline bool isValidDisplacement(L
     return isS12(d);
 }
 
 #define IsFpReg(_r)     ((rmask((Register)_r) & (FpRegs)) != 0)
 #define IsGpReg(_r)     ((rmask((Register)_r) & (GpRegs)) != 0)
 #define FpRegNum(_fpr)  ((_fpr) - FirstFloatReg)
 
 #define firstreg()      R0
-#define nextreg(r)      ((Register)((int)(r)+1))
 // only good for normal regs
 #define imm2register(c) (Register)(c-1)
 
 verbose_only( extern const char* regNames[]; )
 verbose_only( extern const char* condNames[]; )
 verbose_only( extern const char* shiftNames[]; )
 
 // abstract to platform specific calls
--- a/js/src/nanojit/NativePPC.h
+++ b/js/src/nanojit/NativePPC.h
@@ -296,20 +296,16 @@ namespace nanojit
 
     const int LARGEST_UNDERRUN_PROT = 9*4;  // largest value passed to underrunProtect
 
     typedef uint32_t NIns;
 
     // Bytes of icache to flush after Assembler::patch
     const size_t LARGEST_BRANCH_PATCH = 4 * sizeof(NIns);
 
-    inline Register nextreg(Register r) {
-        return Register(r+1);
-    }
-
     #define EMIT1(ins, fmt, ...) do {\
         underrunProtect(4);\
         *(--_nIns) = (NIns) (ins);\
         asm_output(fmt, ##__VA_ARGS__);\
         } while (0) /* no semi */
 
     #define GPR(r) (r)
     #define FPR(r) ((r)&31)
--- a/js/src/nanojit/NativeSparc.h
+++ b/js/src/nanojit/NativeSparc.h
@@ -183,18 +183,16 @@ namespace nanojit
     1<<F14 | 1<<F16 | 1<<F18 | 1<<F20 |
     1<<F22;
     static const RegisterMask AllowableFlagRegs = GpRegs;
 
     static inline bool isValidDisplacement(LOpcode, int32_t) {
         return true;
     }
 
-#define nextreg(r)        Register(r+1)
-
     verbose_only( extern const char* regNames[]; )
 
 #define DECLARE_PLATFORM_STATS()
 
 #define DECLARE_PLATFORM_REGALLOC()
 
 #define DECLARE_PLATFORM_ASSEMBLER()    \
     const static Register argRegs[6], retRegs[1]; \
--- a/js/src/nanojit/NativeX64.h
+++ b/js/src/nanojit/NativeX64.h
@@ -558,15 +558,11 @@ namespace nanojit
 
     const int LARGEST_UNDERRUN_PROT = 32;  // largest value passed to underrunProtect
 
     typedef uint8_t NIns;
 
     // Bytes of icache to flush after Assembler::patch
     const size_t LARGEST_BRANCH_PATCH = 16 * sizeof(NIns);
 
-    inline Register nextreg(Register r) {
-        return Register(r+1);
-    }
-
 } // namespace nanojit
 
 #endif // __nanojit_NativeX64__
--- a/js/src/nanojit/Nativei386.h
+++ b/js/src/nanojit/Nativei386.h
@@ -161,18 +161,16 @@ namespace nanojit
     }
 
     #define _rmask_(r)      (1<<(r))
     #define _is_xmm_reg_(r) ((_rmask_(r)&XmmRegs)!=0)
     #define _is_x87_reg_(r) ((_rmask_(r)&x87Regs)!=0)
     #define _is_fp_reg_(r)  ((_rmask_(r)&FpRegs)!=0)
     #define _is_gp_reg_(r)  ((_rmask_(r)&GpRegs)!=0)
 
-    #define nextreg(r)      Register(r+1)
-
     verbose_only( extern const char* regNames[]; )
 
     #define DECLARE_PLATFORM_STATS()
 
     #define DECLARE_PLATFORM_REGALLOC()
 
     #define DECLARE_PLATFORM_ASSEMBLER()    \
         const static Register argRegs[2], retRegs[2]; \