Redo X64 asm_fneg to only allocate XMM regs, fix indirect calls, and revert asm_restore to old logic (bug 535706 r=nnethercote+)
author Edwin Smith <edwsmith@adobe.com>
Mon, 15 Mar 2010 21:52:41 -0400
changeset 40284 975958755e79494abc3f930230e584f685be0a1a
parent 40283 6c2d0b28dd6e00acc99df0ee4eb3c6c9074a8eec
child 40285 6eedb7f2c6d1d215a74762e0d8de1f79509a2c1c
push id 12610
push user rsayre@mozilla.com
push date Mon, 05 Apr 2010 17:26:41 +0000
treeherder mozilla-central@1942c0b4e101
reviewers nnethercote
bugs 535706, 534310
milestone 1.9.3a3pre
Redo X64 asm_fneg to only allocate XMM regs, fix indirect calls, and revert asm_restore to old logic (bug 535706 r=nnethercote+)

The code for indirect calls needed shuffling; we must freeResourcesOf() before assigning the call address to a register. The old code was just getting lucky, and the regstate fixes tickled the latent bug.

asm_restore() can be stricter once we eliminate all cases where an F64 instruction can be assigned to a GPR. The only known remaining case is asm_quad, which is used for both LIR_float and LIR_quad; that should be fixed by bug 534310.
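In rough outline, the corrected indirect-call path now emits the following (a sketch paraphrasing the NativeX64.cpp hunk below; nanojit assembles backwards, so the last line emitted is the first instruction executed):

    CALLRAX();                                      // call through RAX
    freeResourcesOf(ins);                           // release 'ins', including RAX if it holds the call result
    asm_regarg(ARGSIZE_P, ins->arg(--argc), RAX);   // RAX is now free to receive the call address

Previously asm_regarg() targeted RAX while it was still allocated as the call's return value, which only worked by luck.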
js/src/nanojit/NativeX64.cpp
js/src/nanojit/NativeX64.h
--- a/js/src/nanojit/NativeX64.cpp
+++ b/js/src/nanojit/NativeX64.cpp
@@ -446,17 +446,18 @@ namespace nanojit
     void Assembler::MOVSXDR(R l, R r)   { emitrr(X64_movsxdr,l,r); asm_output("movsxd %s, %s",RQ(l),RL(r)); }
 
     void Assembler::MOVZX8(R l, R r)    { emitrr8(X64_movzx8,l,r); asm_output("movzx %s, %s",RQ(l),RB(r)); }
 
 // XORPS is a 4x32f vector operation, we use it instead of the more obvious
 // XORPD because it's one byte shorter.  This is ok because it's only used for
 // zeroing an XMM register;  hence the single argument.
 // Also note that (unlike most SSE2 instructions) XORPS does not have a prefix, thus emitrr() should be used.
-    void Assembler::XORPS(        R r)  { emitrr(X64_xorps,   r,r); asm_output("xorps %s, %s",   RQ(r),RQ(r)); }
+    void Assembler::XORPS(        R r)  { emitrr(X64_xorps,    r,r); asm_output("xorps %s, %s",   RQ(r),RQ(r)); }
+    void Assembler::XORPS(   R l, R r)  { emitrr(X64_xorps,    l,r); asm_output("xorps %s, %s",   RQ(l),RQ(r)); }
     void Assembler::DIVSD(   R l, R r)  { emitprr(X64_divsd,   l,r); asm_output("divsd %s, %s",   RQ(l),RQ(r)); }
     void Assembler::MULSD(   R l, R r)  { emitprr(X64_mulsd,   l,r); asm_output("mulsd %s, %s",   RQ(l),RQ(r)); }
     void Assembler::ADDSD(   R l, R r)  { emitprr(X64_addsd,   l,r); asm_output("addsd %s, %s",   RQ(l),RQ(r)); }
     void Assembler::SUBSD(   R l, R r)  { emitprr(X64_subsd,   l,r); asm_output("subsd %s, %s",   RQ(l),RQ(r)); }
     void Assembler::CVTSQ2SD(R l, R r)  { emitprr(X64_cvtsq2sd,l,r); asm_output("cvtsq2sd %s, %s",RQ(l),RQ(r)); }
     void Assembler::CVTSI2SD(R l, R r)  { emitprr(X64_cvtsi2sd,l,r); asm_output("cvtsi2sd %s, %s",RQ(l),RL(r)); }
     void Assembler::CVTSS2SD(R l, R r)  { emitprr(X64_cvtss2sd,l,r); asm_output("cvtss2sd %s, %s",RQ(l),RL(r)); }
     void Assembler::CVTSD2SS(R l, R r)  { emitprr(X64_cvtsd2ss,l,r); asm_output("cvtsd2ss %s, %s",RL(l),RQ(r)); }
@@ -799,17 +800,17 @@ namespace nanojit
         if (!divL->isInReg()) {
             NanoAssert(rDivL == RAX);
             findSpecificRegForUnallocated(divL, RAX);
         }
     }
 
     // binary op with integer registers
     void Assembler::asm_arith(LIns *ins) {
-        Register rr, ra, rb;
+        Register rr, ra, rb = UnspecifiedReg;   // init to shut GCC up
 
         switch (ins->opcode()) {
         case LIR_lsh: case LIR_qilsh:
         case LIR_rsh: case LIR_qirsh:
         case LIR_ush: case LIR_qursh:
             asm_shift(ins);
             return;
         case LIR_mod:
@@ -890,40 +891,44 @@ namespace nanojit
         prepareResultReg(ins, rmask(rr));
 
         evictScratchRegsExcept(rmask(rr));
 
         const CallInfo *call = ins->callInfo();
         ArgSize sizes[MAXARGS];
         int argc = call->get_sizes(sizes);
 
-        bool indirect = call->isIndirect();
-        if (!indirect) {
+        if (!call->isIndirect()) {
             verbose_only(if (_logc->lcbits & LC_Assembly)
                 outputf("        %p:", _nIns);
             )
             NIns *target = (NIns*)call->_address;
             if (isTargetWithinS32(target)) {
                 CALL(8, target);
             } else {
                 // can't reach target from here, load imm64 and do an indirect jump
                 CALLRAX();
                 asm_quad(RAX, (uint64_t)target, /*canClobberCCs*/true);
             }
+            // Call this now so that the arg setup can involve 'rr'.
+            freeResourcesOf(ins);
         } else {
             // Indirect call: we assign the address arg to RAX since it's not
             // used for regular arguments, and is otherwise scratch since it's
             // clobberred by the call.
+            CALLRAX();
+
+            // Call this now so that the arg setup can involve 'rr'.
+            freeResourcesOf(ins);
+
+            // Assign the call address to RAX.  Must happen after freeResourcesOf()
+            // since RAX is usually the return value and will be allocated until that point.
             asm_regarg(ARGSIZE_P, ins->arg(--argc), RAX);
-            CALLRAX();
         }
 
-        // Call this now so that the arg setup can involve 'rr'. 
-        freeResourcesOf(ins);
-
     #ifdef _WIN64
         int stk_used = 32; // always reserve 32byte shadow area
     #else
         int stk_used = 0;
         Register fr = XMM0;
     #endif
         int arg_index = 0;
         for (int i = 0; i < argc; i++) {
@@ -1375,25 +1380,23 @@ namespace nanojit
             asm_int(r, ins->imm32(), /*canClobberCCs*/false);
         }
         else if (ins->isconstq() && IsGpReg(r)) {
             ins->clearReg();
             asm_quad(r, ins->imm64(), /*canClobberCCs*/false);
         }
         else {
             int d = findMemFor(ins);
-            if (ins->isF64()) {
-                NanoAssert(IsFpReg(r));
+            if (IsFpReg(r)) {
+                NanoAssert(ins->isF64());
                 MOVSDRM(r, d, FP);
-            } else if (ins->isI64()) {
-                NanoAssert(IsGpReg(r));
+            } else if (ins->isN64()) {
                 MOVQRM(r, d, FP);
             } else {
                 NanoAssert(ins->isI32());
-                NanoAssert(IsGpReg(r));
                 MOVLRM(r, d, FP);
             }
         }
     }
 
     void Assembler::asm_cond(LIns *ins) {
         LOpcode op = ins->opcode();
 
@@ -1684,16 +1687,18 @@ namespace nanojit
         if (a == b) {
             rb = ra;
         }
     }
 
     // Register clean-up for 2-address style unary ops of the form R = (op) R.
     // Pairs with beginOp1Regs() and beginOp2Regs().
     void Assembler::endOpRegs(LIns* ins, Register rr, Register ra) {
+        (void) rr; // quell warnings when NanoAssert is compiled out.
+
         LIns* a = ins->oprnd1();
 
         // We're finished with 'ins'.
         NanoAssert(ins->getReg() == rr);
         freeResourcesOf(ins);
 
         // If 'a' isn't in a register yet, that means it's clobbered by 'ins'.
         if (!a->isInReg()) {
@@ -1701,39 +1706,45 @@ namespace nanojit
             findSpecificRegForUnallocated(a, ra);
          }
     }
 
     static const AVMPLUS_ALIGN16(int64_t) negateMask[] = {0x8000000000000000LL,0};
 
     void Assembler::asm_fneg(LIns *ins) {
         Register rr, ra;
-        if (isS32((uintptr_t)negateMask) || isTargetWithinS32((NIns*)negateMask)) {
-            beginOp1Regs(ins, FpRegs, rr, ra);
-            if (isS32((uintptr_t)negateMask)) {
-                // builtin code is in bottom or top 2GB addr space, use absolute addressing
-                XORPSA(rr, (int32_t)(uintptr_t)negateMask);
-            } else {
-                // jit code is within +/-2GB of builtin code, use rip-relative
-                XORPSM(rr, (NIns*)negateMask);
-            }
-            if (ra != rr)
-                asm_nongp_copy(rr,ra);
-            endOpRegs(ins, rr, ra);
-
+        beginOp1Regs(ins, FpRegs, rr, ra);
+        if (isS32((uintptr_t)negateMask)) {
+            // builtin code is in bottom or top 2GB addr space, use absolute addressing
+            XORPSA(rr, (int32_t)(uintptr_t)negateMask);
+        } else if (isTargetWithinS32((NIns*)negateMask)) {
+            // jit code is within +/-2GB of builtin code, use rip-relative
+            XORPSM(rr, (NIns*)negateMask);
         } else {
             // This is just hideous - can't use RIP-relative load, can't use
             // absolute-address load, and cant move imm64 const to XMM.
-            // so do it all in a GPR.  hrmph.
-            rr = prepareResultReg(ins, GpRegs);
-            ra = findRegFor(ins->oprnd1(), GpRegs & ~rmask(rr));
-            XORQRR(rr, ra);                                     // xor rr, ra
-            asm_quad(rr, negateMask[0], /*canClobberCCs*/true); // mov rr, 0x8000000000000000
-            freeResourcesOf(ins);
+            // Solution: move negateMask into a temp GP register, then copy to
+            // a temp XMM register.
+            // Nb: we don't want any F64 values to end up in a GpReg, nor any
+            // I64 values to end up in an FpReg.
+            // 
+            //   # 'rt' is a temporary FpReg, 'gt' is a temporary GpReg.
+            //   # ins->oprnd1() is in 'rr' (FpRegs)
+            //   mov   gt, 0x8000000000000000
+            //   mov   rt, gt
+            //   xorps rr, rt
+            Register rt = registerAllocTmp(FpRegs & ~(rmask(ra)|rmask(rr)));
+            Register gt = registerAllocTmp(GpRegs);
+            XORPS(rr, rt);
+            MOVQXR(rt, gt);
+            asm_quad(gt, negateMask[0], /*canClobberCCs*/true);
         }
+        if (ra != rr)
+            asm_nongp_copy(rr,ra);
+        endOpRegs(ins, rr, ra);
     }
 
     void Assembler::asm_spill(Register rr, int d, bool /*pop*/, bool quad) {
         if (d) {
             if (!IsFpReg(rr)) {
                 if (quad)
                     MOVQMR(rr, d, FP);
                 else
--- a/js/src/nanojit/NativeX64.h
+++ b/js/src/nanojit/NativeX64.h
@@ -473,16 +473,17 @@ namespace nanojit
         void CMOVQNGE(Register l, Register r);\
         void CMOVQNB(Register l, Register r);\
         void CMOVQNBE(Register l, Register r);\
         void CMOVQNA(Register l, Register r);\
         void CMOVQNAE(Register l, Register r);\
         void MOVSXDR(Register l, Register r);\
         void MOVZX8(Register l, Register r);\
         void XORPS(Register r);\
+        void XORPS(Register l, Register r);\
         void DIVSD(Register l, Register r);\
         void MULSD(Register l, Register r);\
         void ADDSD(Register l, Register r);\
         void SUBSD(Register l, Register r);\
         void CVTSQ2SD(Register l, Register r);\
         void CVTSI2SD(Register l, Register r);\
         void CVTSS2SD(Register l, Register r);\
         void CVTSD2SS(Register l, Register r);\