Bug 587916 - Cleanup of X87 FP stack code (r=nnethercote+)
author William Maddox <wmaddox@adobe.com>
date Tue, 24 Aug 2010 11:44:17 -0700
changeset 5124 f2fa20a391ca0b856804a09bc90a86a9119ee3e3
parent 5123 f9451aec3c27e476b4c40be7bf68bdcc7f308c66
child 5125 31b1d6767e657dc9bb29535a422f6b237825be27
push id 2760
push user stejohns@adobe.com
push date Thu, 26 Aug 2010 21:16:08 +0000
reviewers nnethercote
bugs 587916
Bug 587916 - Cleanup of X87 FP stack code (r=nnethercote+)

1) The "register" FST0 is the sole member of the x87regs register class. In many places, however, the code is written so as to strongly suggest that there might be multiple such registers. This patch removes such conceits, replacing expressions such as (rmask(r) & x87regs) with (r == FST0), etc.
2) prepareResultReg() has been slightly refactored to make the x87 stack fiddling a bit easier to follow and to remove a fragile assumption.
3) Do not pass the "pop" argument to asm_spill() on non-IA32 platforms.
4) Remove redundant normalization of boolean values.
5) Comment the FPU stack depth consistency check.
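To make change (1) concrete, here is a minimal standalone sketch. Register, rmask(), and the FST0 encoding below are simplified stand-ins, not nanojit's real definitions (those live in Nativei386.h). Given that x87Regs contains only FST0, the mask test and the direct comparison are equivalent:

    // Minimal sketch of change (1); types and encodings are stand-ins.
    #include <cassert>
    #include <cstdint>

    typedef int Register;
    typedef uint32_t RegisterMask;

    const Register FST0 = 8;                      // hypothetical encoding
    inline RegisterMask rmask(Register r) { return 1u << r; }
    const RegisterMask x87Regs = rmask(FST0);     // FST0 is the sole member

    // Before: written as if the class could grow.
    bool isX87_old(Register r) { return (rmask(r) & x87Regs) != 0; }

    // After: state the invariant directly.
    bool isX87_new(Register r) { return r == FST0; }

    int main() {
        for (Register r = 0; r < 16; ++r)
            assert(isX87_old(r) == isX87_new(r)); // equivalent given the invariant
        return 0;
    }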
nanojit/Assembler.cpp
nanojit/Assembler.h
nanojit/NativeARM.cpp
nanojit/NativeMIPS.cpp
nanojit/NativePPC.cpp
nanojit/NativeSparc.cpp
nanojit/NativeX64.cpp
nanojit/Nativei386.cpp
--- a/nanojit/Assembler.cpp
+++ b/nanojit/Assembler.cpp
@@ -346,24 +346,31 @@ namespace nanojit
             NanoAssertMsg(!ins->isInReg() || regs.isConsistent(ins->getReg(), ins),
                           "Register record mismatch");
         }
     }
 
     void Assembler::resourceConsistencyCheck()
     {
         NanoAssert(!error());
-
 #ifdef NANOJIT_IA32
+        // Within the expansion of a single LIR instruction, we may use the x87
+        // stack for unmanaged temporaries.  Otherwise, we do not use the x87 stack
+        // as such, but use the top element alone as a single allocatable FP register.
+        // Compensation code must be inserted to keep the stack balanced and avoid
+        // overflow, and the mechanisms for this are rather fragile and IA32-specific.
+        // The predicate below should hold between any pair of instructions within
+        // a basic block, at labels, and just after a conditional branch.  Currently,
+        // we enforce this condition between all pairs of instructions, but this is
+        // overly restrictive, and would fail if we did not generate unreachable x87
+        // stack pops following unconditional branches.
         NanoAssert((_allocator.active[FST0] && _fpuStkDepth == -1) ||
-            (!_allocator.active[FST0] && _fpuStkDepth == 0));
+                   (!_allocator.active[FST0] && _fpuStkDepth == 0));
 #endif
-
         _activation.checkForResourceConsistency(_allocator);
-
         registerConsistencyCheck();
     }
 
     void Assembler::registerConsistencyCheck()
     {
         RegisterMask managed = _allocator.managed;
         for (Register r = lsReg(managed); managed; r = nextLsReg(managed, r)) {
             // A register managed by register allocation must be either
@@ -628,56 +635,62 @@ namespace nanojit
     //   regstate:  R
     //              ...
     //   asm:       restore res into r2
     //   regstate:  R + r2(res) + other changes from "..."
     //   asm:       use res in r2
     //
     Register Assembler::prepareResultReg(LIns *ins, RegisterMask allow)
     {
-        // At this point, we know the result of 'ins' result has a use later
-        // in the code.  (Exception: if 'ins' is a call to an impure function
-        // the return value may not be used, but 'ins' will still be present
-        // because it has side-effects.)  It may have had to be evicted, in
-        // which case the restore will have already been generated, so we now
-        // generate the spill (unless the restore was actually a
-        // rematerialize, in which case it's not necessary).
+        // At this point, we know the result of 'ins' is used later in the
+        // code, unless it is a call to an impure function that must be
+        // included for effect even though its result is ignored.  It may have
+        // had to be evicted, in which case the restore will have already been
+        // generated, so we now generate the spill.  QUERY: Is there any attempt
+        // to elide the spill if we know that all restores can be rematerialized?
 #ifdef NANOJIT_IA32
-        // If 'allow' includes FST0 we have to pop if 'ins' isn't in FST0 in
-        // the post-regstate.  This could be because 'ins' is unused, 'ins' is
-        // in a spill slot, or 'ins' is in an XMM register.
-        const bool pop = (allow & rmask(FST0)) &&
-                         (!ins->isInReg() || ins->getReg() != FST0);
-#else
-        const bool pop = false;
-#endif
+        const bool notInFST0 = (!ins->isInReg() || ins->getReg() != FST0);
         Register r = findRegFor(ins, allow);
-        asm_maybe_spill(ins, pop);
-#ifdef NANOJIT_IA32
-        if (!ins->isInAr() && pop && r == FST0) {
-            // This can only happen with a LIR_calld to an impure function
-            // whose return value was ignored (ie. if ins->isInReg() was false
-            // prior to the findRegFor() call).
-            FSTP(FST0);     // pop the fpu result since it isn't used
+        // If the result register is FST0, but FST0 is not in the post-regstate,
+        // then we must pop the x87 stack.  This may occur because the result is
+        // unused, or because it has been stored to a spill slot or an XMM register.
+        const bool needPop = notInFST0 && (r == FST0);
+        const bool didSpill = asm_maybe_spill(ins, needPop);
+        if (!didSpill && needPop) {
+            // If the instruction is spilled, then the pop will have already
+            // been performed by the store to the stack slot.  Otherwise, we
+            // must pop now.  This may occur when the result of a LIR_calld
+            // to an impure (side-effecting) function is not used.
+            FSTP(FST0);
         }
+#else
+        Register r = findRegFor(ins, allow);
+        asm_maybe_spill(ins, false);
 #endif
         return r;
     }
 
-    void Assembler::asm_maybe_spill(LIns* ins, bool pop)
+    bool Assembler::asm_maybe_spill(LIns* ins, bool pop)
     {
-        int d = ins->isInAr() ? arDisp(ins) : 0;
-        Register r = ins->getReg();
         if (ins->isInAr()) {
+            int d = arDisp(ins);
+            Register r = ins->getReg();
             verbose_only( RefBuf b;
                           if (_logc->lcbits & LC_Native) {
                              setOutputForEOL("  <= spill %s",
                              _thisfrag->lirbuf->printer->formatRef(&b, ins)); } )
-            asm_spill(r, d, pop, ins->isQorD());
+#ifdef NANOJIT_IA32
+            asm_spill(r, d, pop);
+#else
+            (void)pop;
+            asm_spill(r, d, ins->isQorD());
+#endif
+            return true;
         }
+        return false;
     }
 
     // XXX: This function is error-prone and should be phased out; see bug 513615.
     void Assembler::deprecated_freeRsrcOf(LIns *ins)
     {
         if (ins->isInReg()) {
             asm_maybe_spill(ins, /*pop*/false);
             _allocator.retire(ins->getReg());   // free any register associated with entry
@@ -2353,19 +2366,19 @@ namespace nanojit
                 if (curins) {
                     //_nvprof("intersect-evict",1);
                     verbose_only( shouldMention=true; )
                     NanoAssert(curins->getReg() == r);
                     evict(curins);
                 }
 
                 #ifdef NANOJIT_IA32
-                if (savedins && (rmask(r) & x87Regs)) {
+                if (savedins && r == FST0) {
                     verbose_only( shouldMention=true; )
-                    FSTP(r);
+                    FSTP(FST0);
                 }
                 #endif
             }
         }
         // Now reassign mainline registers.
         for (int i = 0; i < nTodo; i++) {
             findSpecificRegFor(insTodo[i], regsTodo[i]);
         }
@@ -2409,22 +2422,23 @@ namespace nanojit
                 if (curins && savedins) {
                     //_nvprof("union-evict",1);
                     verbose_only( shouldMention=true; )
                     NanoAssert(curins->getReg() == r);
                     evict(curins);
                 }
 
                 #ifdef NANOJIT_IA32
-                if (rmask(r) & x87Regs) {
+                if (r == FST0) {
                     if (savedins) {
-                        FSTP(r);
+                        // Discard top of x87 stack.
+                        FSTP(FST0);
                     }
                     else if (curins) {
-                        // saved state did not have fpu reg allocated,
+                        // Saved state did not have fpu reg allocated,
                         // so we must evict here to keep x87 stack balanced.
                         evict(curins);
                     }
                     verbose_only( shouldMention=true; )
                 }
                 #endif
             }
         }
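For readers following change (2), the toy below condenses the new contract between prepareResultReg() and asm_maybe_spill(); the LIns fields and printf bodies are illustrative stand-ins, and register selection (findRegFor) is elided. The key point from the patch: a spill to a stack slot performs the pop as part of the store, so a bare FSTP is emitted only when a pop is needed and no spill occurred:

    // Toy model of the pop decision in the refactored prepareResultReg().
    #include <cstdio>

    struct LIns { bool inReg; bool inAr; int reg; };
    const int FST0 = 0;

    bool asm_maybe_spill(LIns* ins, bool pop) {
        if (ins->inAr) {                 // has a spill slot: the store itself
            std::printf("FST%s to spill slot\n", pop ? "P" : "");
            return true;                 // pops when 'pop' is requested
        }
        return false;                    // nothing spilled
    }

    void prepareResultReg(LIns* ins, int r) {
        const bool notInFST0 = !ins->inReg || ins->reg != FST0;
        const bool needPop   = notInFST0 && (r == FST0);
        if (!asm_maybe_spill(ins, needPop) && needPop)
            std::printf("FSTP st(0)\n"); // e.g. ignored result of an impure LIR_calld
    }

    int main() {
        LIns ignored = { false, false, 0 };   // result unused, no spill slot
        prepareResultReg(&ignored, FST0);     // emits the bare FSTP
        return 0;
    }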
--- a/nanojit/Assembler.h
+++ b/nanojit/Assembler.h
@@ -424,18 +424,22 @@ namespace nanojit
             void        asm_store32(LOpcode op, LIns *val, int d, LIns *base);
             void        asm_store64(LOpcode op, LIns *val, int d, LIns *base);
 
             // WARNING: the implementation of asm_restore() should emit fast code
             // to rematerialize instructions where canRemat() returns true.
             // Otherwise, register allocation decisions will be suboptimal.
             void        asm_restore(LIns*, Register);
 
-            void        asm_maybe_spill(LIns* ins, bool pop);
-            void        asm_spill(Register rr, int d, bool pop, bool quad);
+            bool        asm_maybe_spill(LIns* ins, bool pop);
+#ifdef NANOJIT_IA32
+            void        asm_spill(Register rr, int d, bool pop);
+#else
+            void        asm_spill(Register rr, int d, bool quad);
+#endif
             void        asm_load64(LIns* ins);
             void        asm_ret(LIns* ins);
 #ifdef NANOJIT_64BIT
             void        asm_immq(LIns* ins);
 #endif
             void        asm_immd(LIns* ins);
             void        asm_condd(LIns* ins);
             void        asm_cond(LIns* ins);
@@ -497,20 +501,20 @@ namespace nanojit
 
         private:
 #ifdef NANOJIT_IA32
             debug_only( int32_t _fpuStkDepth; )
             debug_only( int32_t _sv_fpuStkDepth; )
 
             // since we generate backwards the depth is negative
             inline void fpu_push() {
-                debug_only( ++_fpuStkDepth; NanoAssert(_fpuStkDepth<=0); )
+                debug_only( ++_fpuStkDepth; NanoAssert(_fpuStkDepth <= 0); )
             }
             inline void fpu_pop() {
-                debug_only( --_fpuStkDepth; NanoAssert(_fpuStkDepth<=0); )
+                debug_only( --_fpuStkDepth; NanoAssert(_fpuStkDepth >= -7); )
             }
 #endif
             const Config& _config;
     };
 
     inline int32_t arDisp(LIns* ins)
     {
         // even on 64bit cpu's, we allocate stack area in 4byte chunks
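The fpu_push()/fpu_pop() counters above are easiest to see with a tiny trace. Because nanojit emits code backwards, the FSTP (pop) of a balanced FLD ... FSTP pair is emitted before its matching FLD (push), so _fpuStkDepth dips negative in between and returns to zero once the pair is complete; this is also why resourceConsistencyCheck() accepts only depths 0 and -1 between instructions. The snippet below is a simplified re-creation of the debug-only counters, not the real assembler state:

    // Simplified re-creation of the debug-only x87 depth counters.
    #include <cassert>

    static int fpuStkDepth = 0;
    void fpu_push() { ++fpuStkDepth; assert(fpuStkDepth <= 0); }
    void fpu_pop()  { --fpuStkDepth; assert(fpuStkDepth >= -7); }

    int main() {
        // Forward program: FLD; FADD; FSTP -- emitted in reverse order.
        fpu_pop();                  // emit FSTP -> depth == -1
        /* emit FADD: no stack effect */
        fpu_push();                 // emit FLD  -> depth == 0
        assert(fpuStkDepth == 0);   // balanced between LIR instructions
        return 0;
    }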
--- a/nanojit/NativeARM.cpp
+++ b/nanojit/NativeARM.cpp
@@ -1294,19 +1294,18 @@ Assembler::asm_restore(LIns* i, Register
                 _nIns++;
                 verbose_only( asm_output("merge next into LDMDB"); )
             }
         }
     }
 }
 
 void
-Assembler::asm_spill(Register rr, int d, bool pop, bool quad)
+Assembler::asm_spill(Register rr, int d, bool quad)
 {
-    (void) pop;
     (void) quad;
     NanoAssert(d);
     // The following registers should never be spilled:
     NanoAssert(rr != PC);
     NanoAssert(rr != IP);
     NanoAssert(rr != SP);
     if (_config.arm_vfp && IsFpReg(rr)) {
         if (isS8(d >> 2)) {
@@ -1565,17 +1564,17 @@ Assembler::asm_immd(LIns* ins)
 {
     int d = deprecated_disp(ins);
     Register rr = ins->deprecated_getReg();
 
     deprecated_freeRsrcOf(ins);
 
     if (_config.arm_vfp && deprecated_isKnownReg(rr)) {
         if (d)
-            asm_spill(rr, d, false, true);
+            asm_spill(rr, d, true);
 
         underrunProtect(4*4);
         asm_immd_nochk(rr, ins->immDlo(), ins->immDhi());
     } else {
         NanoAssert(d);
 
         asm_str(IP, FP, d+4);
         asm_ld_imm(IP, ins->immDhi());
--- a/nanojit/NativeMIPS.cpp
+++ b/nanojit/NativeMIPS.cpp
@@ -625,17 +625,17 @@ namespace nanojit
     {
         int d = deprecated_disp(ins);
         Register rr = ins->deprecated_getReg();
 
         deprecated_freeRsrcOf(ins);
 
         if (cpu_has_fpu && deprecated_isKnownReg(rr)) {
             if (d)
-                asm_spill(rr, d, false, true);
+                asm_spill(rr, d, true);
             asm_li_d(rr, ins->immDhi(), ins->immDlo());
         }
         else {
             NanoAssert(d);
             asm_store_imm64(ins, d, FP);
         }
         TAG("asm_immd(ins=%p{%s})", ins, lirNames[ins->opcode()]);
     }
@@ -1644,30 +1644,29 @@ namespace nanojit
                 NOP();
             }
             J(targ);
         }
         TAG("asm_j(targ=%p) bdelay=%d", targ);
     }
 
     void
-    Assembler::asm_spill(Register rr, int d, bool pop, bool quad)
+    Assembler::asm_spill(Register rr, int d, bool quad)
     {
-        USE(pop);
         USE(quad);
         NanoAssert(d);
         if (IsFpReg(rr)) {
             NanoAssert(quad);
             asm_ldst64(true, rr, d, FP);
         }
         else {
             NanoAssert(!quad);
             asm_ldst(OP_SW, rr, d, FP);
         }
-        TAG("asm_spill(rr=%d, d=%d, pop=%d, quad=%d)", rr, d, pop, quad);
+        TAG("asm_spill(rr=%d, d=%d, quad=%d)", rr, d, quad);
     }
 
     void
     Assembler::asm_nongp_copy(Register dst, Register src)
     {
         NanoAssert ((rmask(dst) & FpRegs) && (rmask(src) & FpRegs));
         MOV_D(dst, src);
         TAG("asm_nongp_copy(dst=%d src=%d)", dst, src);
--- a/nanojit/NativePPC.cpp
+++ b/nanojit/NativePPC.cpp
@@ -822,17 +822,17 @@ namespace nanojit
             else {
                 // this is the last use, so fine to assign it
                 // to the scratch reg, it's dead after this point.
                 findSpecificRegFor(p, r);
             }
         }
     }
 
-    void Assembler::asm_spill(Register rr, int d, bool /* pop */, bool quad) {
+    void Assembler::asm_spill(Register rr, int d, bool quad) {
         (void)quad;
         NanoAssert(d);
         if (IsFpReg(rr)) {
             NanoAssert(quad);
             STFD(rr, d, FP);
         }
     #ifdef NANOJIT_64BIT
         else if (quad) {
--- a/nanojit/NativeSparc.cpp
+++ b/nanojit/NativeSparc.cpp
@@ -325,17 +325,17 @@ namespace nanojit
                     break;
                 case LIR_sti2c:
                     STB32(ra, dr, rb);
                     break;
                 }
             }
     }
 
-    void Assembler::asm_spill(Register rr, int d, bool pop, bool quad)
+    void Assembler::asm_spill(Register rr, int d, bool quad)
     {
         underrunProtect(24);
         (void)quad;
         NanoAssert(d);
         if (rmask(rr) & FpRegs) {
             STDF32(rr, d, FP);
         } else {
             STW32(rr, d, FP);
--- a/nanojit/NativeX64.cpp
+++ b/nanojit/NativeX64.cpp
@@ -1839,17 +1839,17 @@ namespace nanojit
             MOVQXR(rt, gt);
             asm_immq(gt, negateMask[0], /*canClobberCCs*/true);
         }
         if (ra != rr)
             asm_nongp_copy(rr,ra);
         endOpRegs(ins, rr, ra);
     }
 
-    void Assembler::asm_spill(Register rr, int d, bool /*pop*/, bool quad) {
+    void Assembler::asm_spill(Register rr, int d, bool quad) {
         NanoAssert(d);
         if (!IsFpReg(rr)) {
             if (quad)
                 MOVQMR(rr, d, FP);
             else
                 MOVLMR(rr, d, FP);
         } else {
             // store 64bits from XMM to memory
--- a/nanojit/Nativei386.cpp
+++ b/nanojit/Nativei386.cpp
@@ -1210,17 +1210,17 @@ namespace nanojit
             if (ins->isI()) {
                 NanoAssert(rmask(r) & GpRegs);
                 LD(r, d, FP);
             } else {
                 NanoAssert(ins->isD());
                 if (rmask(r) & XmmRegs) {
                     SSE_LDQ(r, d, FP);
                 } else {
-                    NanoAssert(rmask(r) & x87Regs);
+                    NanoAssert(r == FST0);
                     FLDQ(d, FP);
                 }
             }
         }
     }
 
     void Assembler::asm_store32(LOpcode op, LIns* value, int dr, LIns* base)
     {
@@ -1270,27 +1270,26 @@ namespace nanojit
                     break;
                 default:
                     NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
                     break;
             }
         }
     }
 
-    void Assembler::asm_spill(Register rr, int d, bool pop, bool quad)
+    void Assembler::asm_spill(Register rr, int d, bool pop)
     {
-        (void)quad;
         NanoAssert(d);
         if (rmask(rr) & GpRegs) {
             ST(FP, d, rr);
         } else if (rmask(rr) & XmmRegs) {
             SSE_STQ(d, FP, rr);
         } else {
-            NanoAssert(rmask(rr) & x87Regs);
-            FSTQ((pop?1:0), d, FP);
+            NanoAssert(rr == FST0);
+            FSTQ(pop, d, FP);
         }
     }
 
     void Assembler::asm_load64(LIns* ins)
     {
         LIns* base = ins->oprnd1();
         int db = ins->disp();
 
@@ -1308,28 +1307,28 @@ namespace nanojit
         if (ins->isInReg()) {
             Register rr = ins->getReg();
             asm_maybe_spill(ins, false);    // if also in memory in post-state, spill it now
             switch (ins->opcode()) {
             case LIR_ldd:
                 if (rmask(rr) & XmmRegs) {
                     SSE_LDQ(rr, db, rb);
                 } else {
-                    NanoAssert(rmask(rr) & x87Regs);
+                    NanoAssert(rr == FST0);
                     FLDQ(db, rb);
                 }
                 break;
 
             case LIR_ldf2d:
                 if (rmask(rr) & XmmRegs) {
                     SSE_CVTSS2SD(rr, rr);
                     SSE_LDSS(rr, db, rb);
                     SSE_XORPDr(rr,rr);
                 } else {
-                    NanoAssert(rmask(rr) & x87Regs);
+                    NanoAssert(rr == FST0);
                     FLD32(db, rb);
                 }
                 break;
 
             default:
                 NanoAssert(0);
                 break;
             }
@@ -1374,17 +1373,17 @@ namespace nanojit
                 Register rt = registerAllocTmp(XmmRegs);
 
                 // cvt to single-precision and store
                 SSE_STSS(dr, rb, rt);
                 SSE_CVTSD2SS(rt, rv);
                 SSE_XORPDr(rt, rt);     // zero dest to ensure no dependency stalls
 
             } else {
-                FST32(pop?1:0, dr, rb);
+                FST32(pop, dr, rb);
             }
 
         } else if (value->isImmD()) {
             STi(rb, dr+4, value->immDhi());
             STi(rb, dr,   value->immDlo());
 
         } else if (value->isop(LIR_ldd)) {
             // value is 64bit struct or int64_t, or maybe a double.
@@ -1408,17 +1407,17 @@ namespace nanojit
             bool pop = !value->isInReg();
             Register rv = ( pop
                           ? findRegFor(value, _config.i386_sse2 ? XmmRegs : FpRegs)
                           : value->getReg() );
 
             if (rmask(rv) & XmmRegs) {
                 SSE_STQ(dr, rb, rv);
             } else {
-                FSTQ(pop?1:0, dr, rb);
+                FSTQ(pop, dr, rb);
             }
         }
     }
 
     // Copy 64 bits: (rd+dd) <- (rs+ds).
     //
     void Assembler::asm_mmq(Register rd, int dd, Register rs, int ds)
     {
@@ -2553,22 +2552,22 @@ namespace nanojit
     {
         LIns *lhs = ins->oprnd1();
 
         if (_config.i386_sse2) {
             Register rr = prepareResultReg(ins, GpRegs);
             Register ra = findRegFor(lhs, XmmRegs);
             SSE_CVTSD2SI(rr, ra);
         } else {
-            int pop = !lhs->isInReg();
+            bool pop = !lhs->isInReg();
             findSpecificRegFor(lhs, FST0);
             if (ins->isInReg())
                 evict(ins);
             int d = findMemFor(ins);
-            FIST((pop?1:0), d, FP);
+            FIST(pop, d, FP);
         }
 
         freeResourcesOf(ins);
     }
 
     void Assembler::asm_nongp_copy(Register rd, Register rs)
     {
         if ((rmask(rd) & XmmRegs) && (rmask(rs) & XmmRegs)) {
@@ -2755,17 +2754,17 @@ namespace nanojit
             switch (condop) {
             case LIR_eqd:   mask = 0x44;    break;
             case LIR_ltd:   mask = 0x05;    break;
             case LIR_led:   mask = 0x41;    break;
             default:        NanoAssert(0);  break;
             }
 
             evictIfActive(EAX);
-            int pop = !lhs->isInReg();
+            bool pop = !lhs->isInReg();
             findSpecificRegFor(lhs, FST0);
 
             if (lhs == rhs) {
                 // NaN test.
                 TEST_AH(mask);
                 FNSTSW_AX();        // requires EAX to be free
                 if (pop)
                     FCOMPP();
@@ -2773,22 +2772,22 @@ namespace nanojit
                     FCOMP();
                 FLDr(FST0); // DUP
             } else {
                 TEST_AH(mask);
                 FNSTSW_AX();        // requires EAX to be free
                 if (rhs->isImmD())
                 {
                     const uint64_t* p = findImmDFromPool(rhs->immDasQ());
-                    FCOMdm((pop?1:0), (const double*)p);
+                    FCOMdm(pop, (const double*)p);
                 }
                 else
                 {
                     int d = findMemFor(rhs);
-                    FCOM((pop?1:0), d, FP);
+                    FCOM(pop, d, FP);
                 }
             }
         }
     }
 
     // Increment the 32-bit profiling counter at pCtr, without
     // changing any registers.
     verbose_only(