Bug 580515 - TM: LIR_cmovd mishandled with X86_FORCE_SSE2=no. r=edwsmith.
author Nicholas Nethercote <nnethercote@mozilla.com>
Wed, 01 Dec 2010 14:23:44 -0800
changeset 58695 a687492cff3dad319c00fc45f429d251e9972a50
parent 58694 e5a107d913777c2fbd34a3a8c1ed980e18272cf6
child 58696 f20d22c4df4737af2ab0c141f65f7a23acf478fc
push id 1
push user shaver@mozilla.com
push date Tue, 04 Jan 2011 17:58:04 +0000
reviewers edwsmith
bugs 580515
milestone 2.0b8pre
Bug 580515 - TM: LIR_cmovd mishandled with X86_FORCE_SSE2=no. r=edwsmith.
js/src/nanojit/NativeX64.cpp
js/src/nanojit/NativeX64.h
js/src/nanojit/Nativei386.cpp
js/src/nanojit/Nativei386.h
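
Background on the restructuring: nanojit generates machine code backwards, so asm_branch() is split into asm_branch_helper(), which emits only the conditional jump, and asm_cmp(), which emits the flag-setting cmp/test/ucomisd. asm_cmov() now calls the two halves separately, so its flag-safe register copies and its register bookkeeping (which, per the new comment, can evict registers) land between the compare and the jump in program order. Below is a minimal stand-alone sketch of how the backwards emission order maps onto program order; BackwardsEmitter and the concrete registers are made up for illustration, this is not nanojit code.

    #include <iostream>
    #include <string>
    #include <vector>

    // Instructions are pushed back-to-front, mimicking nanojit's backwards
    // code generation (_nIns moves downwards through the code buffer).
    struct BackwardsEmitter {
        std::vector<std::string> code;
        void emit(const std::string& insn) { code.insert(code.begin(), insn); }
        void dump() const {
            for (const std::string& insn : code)
                std::cout << insn << '\n';
        }
    };

    int main() {
        BackwardsEmitter a;
        // Emission order (top to bottom here) is the reverse of program order.
        a.emit("target:");                // executed 5th: branch target
        a.emit("  movsd xmm0, xmm2");     // executed 4th: rr <- rf ('iffalse' copy)
        a.emit("  je    target");         // executed 3rd: the jcc from asm_branch_helper()
        a.emit("  movsd xmm0, xmm1");     // executed 2nd: rr <- rt, does not touch the flags
        a.emit("  ucomisd xmm3, xmm4");   // executed 1st: the compare from asm_cmp()
        a.dump();                         // prints the instructions in program order
        return 0;
    }

Dumped forwards, the instructions match the sequence in the new Nativei386.cpp:asm_cmov() comment: compare, flag-safe move, conditional jump over the 'iffalse' copy.
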
--- a/js/src/nanojit/NativeX64.cpp
+++ b/js/src/nanojit/NativeX64.cpp
@@ -1168,39 +1168,43 @@ namespace nanojit
 
         RegisterMask allow = ins->isD() ? FpRegs : GpRegs;
 
         Register rr = prepareResultReg(ins, allow);
 
         Register rf = findRegFor(iffalse, allow & ~rmask(rr));
 
         if (ins->isop(LIR_cmovd)) {
+            // See Nativei386.cpp:asm_cmov() for an explanation of the subtleties here.
             NIns* target = _nIns;
             asm_nongp_copy(rr, rf);
-            asm_branch(false, cond, target);
+            asm_branch_helper(false, cond, target);
 
             // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
             Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
 
             if (rr != rt)
                 asm_nongp_copy(rr, rt);
+
             freeResourcesOf(ins);
             if (!iftrue->isInReg()) {
                 NanoAssert(rt == rr);
                 findSpecificRegForUnallocated(iftrue, rr);
             }
+
+            asm_cmp(cond);
             return;
         }
 
         // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
         Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
 
         // WARNING: We cannot generate any code that affects the condition
-        // codes between the MRcc generation here and the asm_cmp() call
-        // below.  See asm_cmp() for more details.
+        // codes between the MRcc generation here and the asm_cmpi() call
+        // below.  See asm_cmpi() for more details.
         LOpcode condop = cond->opcode();
         if (ins->isop(LIR_cmovi)) {
             switch (condop) {
             case LIR_eqi:  case LIR_eqq:    CMOVNE( rr, rf);  break;
             case LIR_lti:  case LIR_ltq:    CMOVNL( rr, rf);  break;
             case LIR_gti:  case LIR_gtq:    CMOVNG( rr, rf);  break;
             case LIR_lei:  case LIR_leq:    CMOVNLE(rr, rf);  break;
             case LIR_gei:  case LIR_geq:    CMOVNGE(rr, rf);  break;
@@ -1229,40 +1233,46 @@ namespace nanojit
             MR(rr, rt);
 
         freeResourcesOf(ins);
         if (!iftrue->isInReg()) {
             NanoAssert(rt == rr);
             findSpecificRegForUnallocated(iftrue, rr);
         }
 
-        asm_cmp(cond);
+        asm_cmpi(cond);
     }
 
-    NIns* Assembler::asm_branch(bool onFalse, LIns *cond, NIns *target) {
-        NanoAssert(cond->isCmp());
-        LOpcode condop = cond->opcode();
+    NIns* Assembler::asm_branch(bool onFalse, LIns* cond, NIns* target) {
+        NIns* patch = asm_branch_helper(onFalse, cond, target);
+        asm_cmp(cond);
+        return patch;
+    }
 
+    NIns* Assembler::asm_branch_helper(bool onFalse, LIns *cond, NIns *target) {
         if (target && !isTargetWithinS32(target)) {
-            // conditional jumps beyond 32bit range, so invert the branch/compare
-            // and emit an unconditional jump to the target
+            // A conditional jump beyond 32-bit range, so invert the
+            // branch/compare and emit an unconditional jump to the target:
             //         j(inverted) B1
             //         jmp target
             //     B1:
             NIns* shortTarget = _nIns;
             JMP(target);
             target = shortTarget;
-
             onFalse = !onFalse;
         }
-        if (isCmpDOpcode(condop))
-            return asm_branchd(onFalse, cond, target);
+        return isCmpDOpcode(cond->opcode())
+             ? asm_branchd_helper(onFalse, cond, target)
+             : asm_branchi_helper(onFalse, cond, target);
+    }
 
+    NIns* Assembler::asm_branchi_helper(bool onFalse, LIns *cond, NIns *target) {
         // We must ensure there's room for the instruction before calculating
         // the offset.  And the offset determines the opcode (8bit or 32bit).
+        LOpcode condop = cond->opcode();
         if (target && isTargetWithinS8(target)) {
             if (onFalse) {
                 switch (condop) {
                 case LIR_eqi:  case LIR_eqq:    JNE8( 8, target); break;
                 case LIR_lti:  case LIR_ltq:    JNL8( 8, target); break;
                 case LIR_gti:  case LIR_gtq:    JNG8( 8, target); break;
                 case LIR_lei:  case LIR_leq:    JNLE8(8, target); break;
                 case LIR_gei:  case LIR_geq:    JNGE8(8, target); break;
@@ -1310,42 +1320,44 @@ namespace nanojit
                 case LIR_ltui: case LIR_ltuq:   JB( 8, target);   break;
                 case LIR_gtui: case LIR_gtuq:   JA( 8, target);   break;
                 case LIR_leui: case LIR_leuq:   JBE(8, target);   break;
                 case LIR_geui: case LIR_geuq:   JAE(8, target);   break;
                 default:                        NanoAssert(0);    break;
                 }
             }
         }
-        NIns *patch = _nIns;    // address of instruction to patch
-        asm_cmp(cond);
-        return patch;
+        return _nIns;   // address of instruction to patch
     }
 
     NIns* Assembler::asm_branch_ov(LOpcode, NIns* target) {
         if (target && !isTargetWithinS32(target)) {
             setError(ConditionalBranchTooFar);
             NanoAssert(0);
         }
         // We must ensure there's room for the instr before calculating
         // the offset.  And the offset determines the opcode (8bit or 32bit).
         if (target && isTargetWithinS8(target))
             JO8(8, target);
         else
             JO( 8, target);
         return _nIns;
     }
 
+    void Assembler::asm_cmp(LIns *cond) {
+        isCmpDOpcode(cond->opcode()) ? asm_cmpd(cond) : asm_cmpi(cond);
+    }
+
     // WARNING: this function cannot generate code that will affect the
     // condition codes prior to the generation of the test/cmp.  See
-    // Nativei386.cpp:asm_cmp() for details.
-    void Assembler::asm_cmp(LIns *cond) {
+    // Nativei386.cpp:asm_cmpi() for details.
+    void Assembler::asm_cmpi(LIns *cond) {
         LIns *b = cond->oprnd2();
         if (isImm32(b)) {
-            asm_cmp_imm(cond);
+            asm_cmpi_imm(cond);
             return;
         }
         LIns *a = cond->oprnd1();
         Register ra, rb;
         if (a != b) {
             findRegFor2(GpRegs, a, ra, GpRegs, b, rb);
         } else {
             // optimize-me: this will produce a const result!
@@ -1356,17 +1368,17 @@ namespace nanojit
         if (isCmpQOpcode(condop)) {
             CMPQR(ra, rb);
         } else {
             NanoAssert(isCmpIOpcode(condop));
             CMPLR(ra, rb);
         }
     }
 
-    void Assembler::asm_cmp_imm(LIns *cond) {
+    void Assembler::asm_cmpi_imm(LIns *cond) {
         LOpcode condop = cond->opcode();
         LIns *a = cond->oprnd1();
         LIns *b = cond->oprnd2();
         Register ra = findRegFor(a, GpRegs);
         int32_t imm = getImm32(b);
         if (isCmpQOpcode(condop)) {
             if (isS8(imm))
                 CMPQR8(ra, imm);
@@ -1394,21 +1406,19 @@ namespace nanojit
     //
     //  Here are the cases, using conditionals:
     //
     //  branch  >=  >   <=       <        =
     //  ------  --- --- ---      ---      ---
     //  LIR_jt  jae ja  swap+jae swap+ja  jp over je
     //  LIR_jf  jb  jbe swap+jb  swap+jbe jne+jp
 
-    NIns* Assembler::asm_branchd(bool onFalse, LIns *cond, NIns *target) {
+    NIns* Assembler::asm_branchd_helper(bool onFalse, LIns *cond, NIns *target) {
         LOpcode condop = cond->opcode();
         NIns *patch;
-        LIns *a = cond->oprnd1();
-        LIns *b = cond->oprnd2();
         if (condop == LIR_eqd) {
             if (onFalse) {
                 // branch if unordered or !=
                 JP(16, target);     // underrun of 12 needed, round up for overhang --> 16
                 JNE(0, target);     // no underrun needed, previous was enough
                 patch = _nIns;
             } else {
                 // jp skip (2byte)
@@ -1417,78 +1427,77 @@ namespace nanojit
                 underrunProtect(16); // underrun of 7 needed but we write 2 instr --> 16
                 NIns *skip = _nIns;
                 JE(0, target);      // no underrun needed, previous was enough
                 patch = _nIns;
                 JP8(0, skip);       // ditto
             }
         }
         else {
-            if (condop == LIR_ltd) {
-                condop = LIR_gtd;
-                LIns *t = a; a = b; b = t;
-            } else if (condop == LIR_led) {
-                condop = LIR_ged;
-                LIns *t = a; a = b; b = t;
-            }
-            if (condop == LIR_gtd) {
-                if (onFalse)
-                    JBE(8, target);
-                else
-                    JA(8, target);
-            } else { // LIR_ged
-                if (onFalse)
-                    JB(8, target);
-                else
-                    JAE(8, target);
+            // LIR_ltd and LIR_gtd are handled by the same case because
+            // asm_cmpd() converts LIR_ltd(a,b) to LIR_gtd(b,a).  Likewise for
+            // LIR_led/LIR_ged.
+            switch (condop) {
+            case LIR_ltd:
+            case LIR_gtd: if (onFalse) JBE(8, target); else JA(8, target);  break;
+            case LIR_led:
+            case LIR_ged: if (onFalse) JB(8, target);  else JAE(8, target); break;
+            default:      NanoAssert(0);                                    break;
             }
             patch = _nIns;
         }
-        asm_cmpd(a, b);
         return patch;
     }
 
     void Assembler::asm_condd(LIns *ins) {
         LOpcode op = ins->opcode();
-        LIns *a = ins->oprnd1();
-        LIns *b = ins->oprnd2();
         if (op == LIR_eqd) {
             // result = ZF & !PF, must do logic on flags
             // r = al|bl|cl|dl, can only use rh without rex prefix
             Register r = prepareResultReg(ins, 1<<REGNUM(RAX) | 1<<REGNUM(RCX) |
                                                1<<REGNUM(RDX) | 1<<REGNUM(RBX));
             MOVZX8(r, r);       // movzx8   r,rl     r[8:63] = 0
             X86_AND8R(r);       // and      rl,rh    rl &= rh
             X86_SETNP(r);       // setnp    rh       rh = !PF
             X86_SETE(r);        // sete     rl       rl = ZF
         } else {
-            if (op == LIR_ltd) {
-                op = LIR_gtd;
-                LIns *t = a; a = b; b = t;
-            } else if (op == LIR_led) {
-                op = LIR_ged;
-                LIns *t = a; a = b; b = t;
-            }
+            // LIR_ltd and LIR_gtd are handled by the same case because
+            // asm_cmpd() converts LIR_ltd(a,b) to LIR_gtd(b,a).  Likewise for
+            // LIR_led/LIR_ged.
             Register r = prepareResultReg(ins, GpRegs); // x64 can use any GPR as setcc target
             MOVZX8(r, r);
-            if (op == LIR_gtd)
-                SETA(r);
-            else
-                SETAE(r);
+            switch (op) {
+            case LIR_ltd:
+            case LIR_gtd: SETA(r);       break;
+            case LIR_led:
+            case LIR_ged: SETAE(r);      break;
+            default:      NanoAssert(0); break;
+            }
         }
 
         freeResourcesOf(ins);
 
-        asm_cmpd(a, b);
+        asm_cmpd(ins);
     }
 
     // WARNING: This function cannot generate any code that will affect the
-    // condition codes prior to the generation of the ucomisd.  See asm_cmp()
+    // condition codes prior to the generation of the ucomisd.  See asm_cmpi()
     // for more details.
-    void Assembler::asm_cmpd(LIns *a, LIns *b) {
+    void Assembler::asm_cmpd(LIns *cond) {
+        LOpcode opcode = cond->opcode();
+        LIns* a = cond->oprnd1();
+        LIns* b = cond->oprnd2();
+        // First, we convert (a < b) into (b > a), and (a <= b) into (b >= a).
+        if (opcode == LIR_ltd) {
+            opcode = LIR_gtd;
+            LIns* t = a; a = b; b = t;
+        } else if (opcode == LIR_led) {
+            opcode = LIR_ged;
+            LIns* t = a; a = b; b = t;
+        }
         Register ra, rb;
         findRegFor2(FpRegs, a, ra, FpRegs, b, rb);
         UCOMISD(ra, rb);
     }
 
     // Return true if we can generate code for this instruction that neither
     // sets CCs nor clobbers any input register.
     // LEA is the only native instruction that fits those requirements.
@@ -1513,17 +1522,17 @@ namespace nanojit
         return false;
     }
 
     bool Assembler::canRemat(LIns* ins) {
         return ins->isImmAny() || ins->isop(LIR_allocp) || canRematLEA(ins);
     }
 
     // WARNING: the code generated by this function must not affect the
-    // condition codes.  See asm_cmp() for details.
+    // condition codes.  See asm_cmpi() for details.
     void Assembler::asm_restore(LIns *ins, Register r) {
         if (ins->isop(LIR_allocp)) {
             int d = arDisp(ins);
             LEAQRM(r, d, FP);
         }
         else if (ins->isImmI()) {
             asm_immi(r, ins->immI(), /*canClobberCCs*/false);
         }
@@ -1582,17 +1591,17 @@ namespace nanojit
         case LIR_leui:   SETBE(r);   break;
         case LIR_gtuq:
         case LIR_gtui:   SETA(r);    break;
         case LIR_geuq:
         case LIR_geui:   SETAE(r);   break;
         }
         freeResourcesOf(ins);
 
-        asm_cmp(ins);
+        asm_cmpi(ins);
     }
 
     void Assembler::asm_ret(LIns *ins) {
         genEpilogue();
 
         // Restore RSP from RBP, undoing SUB(RSP,amt) in the prologue
         MR(RSP,FP);
 
--- a/js/src/nanojit/NativeX64.h
+++ b/js/src/nanojit/NativeX64.h
@@ -418,19 +418,22 @@ namespace nanojit
         void asm_arith_imm(LIns*);\
         void beginOp1Regs(LIns *ins, RegisterMask allow, Register &rr, Register &ra);\
         void beginOp2Regs(LIns *ins, RegisterMask allow, Register &rr, Register &ra, Register &rb);\
         void endOpRegs(LIns *ins, Register rr, Register ra);\
         void beginLoadRegs(LIns *ins, RegisterMask allow, Register &rr, int32_t &d, Register &rb);\
         void endLoadRegs(LIns *ins);\
         void dis(NIns *p, int bytes);\
         void asm_cmp(LIns*);\
-        void asm_cmp_imm(LIns*);\
-        void asm_cmpd(LIns*, LIns*);\
-        NIns* asm_branchd(bool, LIns*, NIns*);\
+        void asm_cmpi(LIns*);\
+        void asm_cmpi_imm(LIns*);\
+        void asm_cmpd(LIns*);\
+        NIns* asm_branch_helper(bool, LIns*, NIns*);\
+        NIns* asm_branchi_helper(bool, LIns*, NIns*);\
+        NIns* asm_branchd_helper(bool, LIns*, NIns*);\
         void asm_div(LIns *ins);\
         void asm_div_mod(LIns *ins);\
         int max_stk_used;\
         void PUSHR(Register r);\
         void POPR(Register r);\
         void NOT(Register r);\
         void NEG(Register r);\
         void IDIV(Register r);\
--- a/js/src/nanojit/Nativei386.cpp
+++ b/js/src/nanojit/Nativei386.cpp
@@ -849,18 +849,16 @@ namespace nanojit
         asm_output("test ax, %d", i);
     }
 
     inline void Assembler::FNSTSW_AX() { count_fpu(); FPUc(0xdfe0);    asm_output("fnstsw_ax"); }
     inline void Assembler::FCHS()      { count_fpu(); FPUc(0xd9e0);    asm_output("fchs"); }
     inline void Assembler::FLD1()      { count_fpu(); FPUc(0xd9e8);    asm_output("fld1"); fpu_push(); }
     inline void Assembler::FLDZ()      { count_fpu(); FPUc(0xd9ee);    asm_output("fldz"); fpu_push(); }
 
-    inline void Assembler::FFREE(R r)  { count_fpu(); FPU(0xddc0, r);  asm_output("ffree %s",gpn(r)); }
-
     inline void Assembler::FST32(bool p, I32 d, R b){ count_stq(); FPUm(0xd902|(p?1:0), d, b);   asm_output("fst%s32 %d(%s)", (p?"p":""), d, gpn(b)); if (p) fpu_pop(); }
     inline void Assembler::FSTQ(bool p, I32 d, R b) { count_stq(); FPUm(0xdd02|(p?1:0), d, b);   asm_output("fst%sq %d(%s)", (p?"p":""), d, gpn(b)); if (p) fpu_pop(); }
 
     inline void Assembler::FSTPQ(I32 d, R b) { FSTQ(1, d, b); }
 
     inline void Assembler::FCOM(bool p, I32 d, R b) { count_fpuld(); FPUm(0xdc02|(p?1:0), d, b); asm_output("fcom%s %d(%s)", (p?"p":""), d, gpn(b)); if (p) fpu_pop(); }
     inline void Assembler::FCOMdm(bool p, const double* dm) {
         count_fpuld();
@@ -889,18 +887,16 @@ namespace nanojit
     inline void Assembler::FDIV( I32 d, R b) { count_fpu(); FPUm(0xdc06, d, b); asm_output("fdiv %d(%s)", d, gpn(b)); }
     inline void Assembler::FDIVR(I32 d, R b) { count_fpu(); FPUm(0xdc07, d, b); asm_output("fdivr %d(%s)", d, gpn(b)); }
 
     inline void Assembler::FADDdm( const double *dm) { count_ldq(); FPUdm(0xdc00, dm); asm_output("fadd (%p)", (void*)dm); }
     inline void Assembler::FSUBRdm(const double* dm) { count_ldq(); FPUdm(0xdc05, dm); asm_output("fsubr (%p)", (void*)dm); }
     inline void Assembler::FMULdm( const double* dm) { count_ldq(); FPUdm(0xdc01, dm); asm_output("fmul (%p)", (void*)dm); }
     inline void Assembler::FDIVRdm(const double* dm) { count_ldq(); FPUdm(0xdc07, dm); asm_output("fdivr (%p)", (void*)dm); }
 
-    inline void Assembler::FINCSTP()   { count_fpu(); FPUc(0xd9f7); asm_output("fincstp"); fpu_pop(); }
-
     inline void Assembler::FCOMP()     { count_fpu(); FPUc(0xD8D9);    asm_output("fcomp"); fpu_pop();}
     inline void Assembler::FCOMPP()    { count_fpu(); FPUc(0xDED9);    asm_output("fcompp"); fpu_pop();fpu_pop();}
     inline void Assembler::FLDr(R r)   { count_ldq(); FPU(0xd9c0, r);  asm_output("fld %s", gpn(r)); fpu_push(); }
     inline void Assembler::EMMS()      { count_fpu(); FPUc(0x0f77);    asm_output("emms"); }
 
     // standard direct call
     inline void Assembler::CALL(const CallInfo* ci) {
         count_call();
@@ -1203,17 +1199,17 @@ namespace nanojit
     }
 
     bool Assembler::canRemat(LIns* ins)
     {
         return ins->isImmAny() || ins->isop(LIR_allocp) || canRematLEA(ins);
     }
 
     // WARNING: the code generated by this function must not affect the
-    // condition codes.  See asm_cmp().
+    // condition codes.  See asm_cmpi().
     void Assembler::asm_restore(LIns* ins, Register r)
     {
         NanoAssert(ins->getReg() == r);
 
         uint32_t arg;
         uint32_t abi_regcount;
         if (ins->isop(LIR_allocp)) {
             // The value of a LIR_allocp instruction is the address of the
@@ -1516,56 +1512,60 @@ namespace nanojit
             Register t = registerAllocTmp(GpRegs & ~(rmask(rd)|rmask(rs)));
             ST(rd, dd+4, t);
             LD(t, ds+4, rs);
             ST(rd, dd, t);
             LD(t, ds, rs);
         }
     }
 
-    NIns* Assembler::asm_branch(bool branchOnFalse, LIns* cond, NIns* targ)
+    NIns* Assembler::asm_branch_helper(bool branchOnFalse, LIns* cond, NIns* targ)
     {
-        LOpcode condop = cond->opcode();
-        NanoAssert(cond->isCmp());
-
-        // Handle float conditions separately.
-        if (isCmpDOpcode(condop)) {
-            return asm_branchd(branchOnFalse, cond, targ);
-        }
-
+        return isCmpDOpcode(cond->opcode())
+             ? asm_branchd_helper(branchOnFalse, cond, targ)
+             : asm_branchi_helper(branchOnFalse, cond, targ);
+    }
+
+    NIns* Assembler::asm_branchi_helper(bool branchOnFalse, LIns* cond, NIns* targ)
+    {
         if (branchOnFalse) {
             // op == LIR_xf/LIR_jf
-            switch (condop) {
+            switch (cond->opcode()) {
             case LIR_eqi:   JNE(targ);      break;
             case LIR_lti:   JNL(targ);      break;
             case LIR_lei:   JNLE(targ);     break;
             case LIR_gti:   JNG(targ);      break;
             case LIR_gei:   JNGE(targ);     break;
             case LIR_ltui:  JNB(targ);      break;
             case LIR_leui:  JNBE(targ);     break;
             case LIR_gtui:  JNA(targ);      break;
             case LIR_geui:  JNAE(targ);     break;
             default:        NanoAssert(0);  break;
             }
         } else {
             // op == LIR_xt/LIR_jt
-            switch (condop) {
+            switch (cond->opcode()) {
             case LIR_eqi:   JE(targ);       break;
             case LIR_lti:   JL(targ);       break;
             case LIR_lei:   JLE(targ);      break;
             case LIR_gti:   JG(targ);       break;
             case LIR_gei:   JGE(targ);      break;
             case LIR_ltui:  JB(targ);       break;
             case LIR_leui:  JBE(targ);      break;
             case LIR_gtui:  JA(targ);       break;
             case LIR_geui:  JAE(targ);      break;
             default:        NanoAssert(0);  break;
             }
         }
-        NIns* at = _nIns;
+        return _nIns;
+    }
+
+    NIns* Assembler::asm_branch(bool branchOnFalse, LIns* cond, NIns* targ)
+    {
+        NIns* at = asm_branch_helper(branchOnFalse, cond, targ);
         asm_cmp(cond);
         return at;
     }
 
     NIns* Assembler::asm_branch_ov(LOpcode, NIns* target)
     {
         JO(target);
         return _nIns;
@@ -1579,16 +1579,21 @@ namespace nanojit
     }
 
     void Assembler::asm_jtbl(LIns* ins, NIns** table)
     {
         Register indexreg = findRegFor(ins->oprnd1(), GpRegs);
         JMP_indexed(indexreg, 2, table);
     }
 
+    void Assembler::asm_cmp(LIns *cond)
+    {
+        isCmpDOpcode(cond->opcode()) ? asm_cmpd(cond) : asm_cmpi(cond);
+    }
+
     // This generates a 'test' or 'cmp' instruction for a condition, which
     // causes the condition codes to be set appropriately.  It's used with
     // conditional branches, conditional moves, and when generating
     // conditional values.  For example:
     //
     //   LIR:   eq1 = eq a, 0
     //   LIR:   xf1: xf eq1 -> ...
     //   asm:       test edx, edx       # generated by this function
@@ -1618,17 +1623,17 @@ namespace nanojit
     // regstate, this function cannot generate any code that will affect the
     // condition codes prior to the generation of the test/cmp, because any
     // such code will be run after the test/cmp but before the instruction
     // that consumes the condition code.  And because this function calls
     // findRegFor() before the test/cmp is generated, and findRegFor() calls
     // asm_restore(), that means that asm_restore() cannot generate code which
     // affects the condition codes.
     //
-    void Assembler::asm_cmp(LIns *cond)
+    void Assembler::asm_cmpi(LIns *cond)
     {
         LIns* lhs = cond->oprnd1();
         LIns* rhs = cond->oprnd2();
 
         NanoAssert(lhs->isI() && rhs->isI());
 
         // Ready to issue the compare.
         if (rhs->isImmI()) {
@@ -1729,17 +1734,17 @@ namespace nanojit
         case LIR_leui:  SETBE(r);       break;
         case LIR_gtui:  SETA(r);        break;
         case LIR_geui:  SETAE(r);       break;
         default:        NanoAssert(0);  break;
         }
 
         freeResourcesOf(ins);
 
-        asm_cmp(ins);
+        asm_cmpi(ins);
     }
 
     // Two example cases for "ins = add lhs, rhs".  '*' lines are those
     // generated in this function.
     //
     //   asm:   define lhs into rr
     //   asm:   define rhs into rb
     //          ...
@@ -2046,76 +2051,96 @@ namespace nanojit
         LIns* iftrue  = ins->oprnd2();
         LIns* iffalse = ins->oprnd3();
 
         NanoAssert(condval->isCmp());
         NanoAssert((ins->isop(LIR_cmovi) && iftrue->isI() && iffalse->isI()) ||
                    (ins->isop(LIR_cmovd) && iftrue->isD() && iffalse->isD()));
 
         if (!_config.i386_sse2 && ins->isop(LIR_cmovd)) {
+            // See the SSE2 case below for an explanation of the subtleties here.
             debug_only( Register rr = ) prepareResultReg(ins, x87Regs);
             NanoAssert(FST0 == rr);
-            NanoAssert(!iftrue->isInReg() || iftrue->getReg() == FST0);
-
-            NanoAssert(!iffalse->isInReg());
+            NanoAssert(!iftrue->isInReg() && !iffalse->isInReg());
 
             NIns* target = _nIns;
 
             if (iffalse->isImmD()) {
                 asm_immd(FST0, iffalse->immDasQ(), iffalse->immD(), /*canClobberCCs*/false);
             } else {
                 int df = findMemFor(iffalse);
                 FLDQ(df, FP);
             }
-
-            FINCSTP();
-            // Its not sufficient to merely decrement the FP stack pointer, we have to
-            // also free FST0, otherwise the load above fails.
-            FFREE(FST0);
-            asm_branch(false, condval, target);
-
+            FSTP(FST0);     // pop the stack
+            asm_branch_helper(false, condval, target);
+
+            NanoAssert(ins->getReg() == rr);
             freeResourcesOf(ins);
             if (!iftrue->isInReg())
                 findSpecificRegForUnallocated(iftrue, FST0);
 
+            asm_cmp(condval);
+
             return;
         }
 
         RegisterMask allow = ins->isD() ? XmmRegs : GpRegs;
-
         Register rr = prepareResultReg(ins, allow);
-
         Register rf = findRegFor(iffalse, allow & ~rmask(rr));
 
         if (ins->isop(LIR_cmovd)) {
+            // The obvious way to handle this is as follows:
+            //
+            //     mov rr, rt       # only needed if rt is live afterwards
+            //     do comparison
+            //     jt end
+            //     mov rr, rf
+            //   end:
+            //
+            // The problem with this is that doing the comparison can cause
+            // registers to be evicted, possibly including 'rr', which holds
+            // 'ins'.  And that screws things up.  So instead we do this:
+            //
+            //     do comparison
+            //     mov rr, rt       # only needed if rt is live afterwards
+            //     jt end
+            //     mov rr, rf
+            //   end:
+            //
+            // Putting the 'mov' between the comparison and the jump is ok
+            // because move instructions don't modify the condition codes.
+            //
             NIns* target = _nIns;
             asm_nongp_copy(rr, rf);
-            asm_branch(false, condval, target);
+            asm_branch_helper(false, condval, target);
 
             // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
             Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
 
             if (rr != rt)
                 asm_nongp_copy(rr, rt);
+
+            NanoAssert(ins->getReg() == rr);
             freeResourcesOf(ins);
             if (!iftrue->isInReg()) {
                 NanoAssert(rt == rr);
                 findSpecificRegForUnallocated(iftrue, rr);
             }
+
+            asm_cmp(condval);
             return;
         }
 
         // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
         Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
         NanoAssert(ins->isop(LIR_cmovi));
 
         // WARNING: We cannot generate any code that affects the condition
-        // codes between the MRcc generation here and the asm_cmp() call
-        // below.  See asm_cmp() for more details.
+        // codes between the MRcc generation here and the asm_cmpi() call
+        // below.  See asm_cmpi() for more details.
         switch (condval->opcode()) {
             // Note that these are all opposites...
             case LIR_eqi:    MRNE(rr, rf);   break;
             case LIR_lti:    MRGE(rr, rf);   break;
             case LIR_lei:    MRG( rr, rf);   break;
             case LIR_gti:    MRLE(rr, rf);   break;
             case LIR_gei:    MRL( rr, rf);   break;
             case LIR_ltui:   MRAE(rr, rf);   break;
@@ -2123,16 +2148,17 @@ namespace nanojit
             case LIR_gtui:   MRBE(rr, rf);   break;
             case LIR_geui:   MRB( rr, rf);   break;
             default: NanoAssert(0); break;
         }
 
         if (rr != rt)
             MR(rr, rt);
 
+        NanoAssert(ins->getReg() == rr);
         freeResourcesOf(ins);
         if (!iftrue->isInReg()) {
             NanoAssert(rt == rr);
             findSpecificRegForUnallocated(iftrue, rr);
         }
 
         asm_cmp(condval);
     }
@@ -2609,17 +2635,17 @@ namespace nanojit
         } else if ((rmask(rd) & GpRegs) && (rmask(rs) & XmmRegs)) {
             // xmm -> gp
             SSE_MOVD(rd, rs);
         } else {
             NanoAssertMsgf(false, "bad asm_nongp_copy(%s, %s)", gpn(rd), gpn(rs));
         }
     }
 
-    NIns* Assembler::asm_branchd(bool branchOnFalse, LIns *cond, NIns *targ)
+    NIns* Assembler::asm_branchd_helper(bool branchOnFalse, LIns* cond, NIns *targ)
     {
         NIns* at = 0;
         LOpcode opcode = cond->opcode();
 
         if (_config.i386_sse2) {
             // LIR_ltd and LIR_gtd are handled by the same case because
             // asm_cmpd() converts LIR_ltd(a,b) to LIR_gtd(b,a).  Likewise
             // for LIR_led/LIR_ged.
@@ -2668,24 +2694,23 @@ namespace nanojit
             if (branchOnFalse)
                 JP(targ);
             else
                 JNP(targ);
         }
 
         if (!at)
             at = _nIns;
-        asm_cmpd(cond);
 
         return at;
     }
 
     // WARNING: This function cannot generate any code that will affect the
     // condition codes prior to the generation of the
-    // ucomisd/fcompp/fcmop/fcom.  See asm_cmp() for more details.
+    // ucomisd/fcompp/fcomp/fcom.  See asm_cmpi() for more details.
     void Assembler::asm_cmpd(LIns *cond)
     {
         LOpcode condop = cond->opcode();
         NanoAssert(isCmpDOpcode(condop));
         LIns* lhs = cond->oprnd1();
         LIns* rhs = cond->oprnd2();
         NanoAssert(lhs->isD() && rhs->isD());
 
@@ -2694,24 +2719,23 @@ namespace nanojit
             if (condop == LIR_ltd) {
                 condop = LIR_gtd;
                 LIns* t = lhs; lhs = rhs; rhs = t;
             } else if (condop == LIR_led) {
                 condop = LIR_ged;
                 LIns* t = lhs; lhs = rhs; rhs = t;
             }
 
-
             // LIR_eqd, if lhs == rhs:
             //   ucomisd       ZPC   outcome (SETNP/JNP succeeds if P==0)
             //   -------       ---   -------
             //   UNORDERED     111   SETNP/JNP fails
             //   EQUAL         100   SETNP/JNP succeeds
             //
-            // LIR_eqd, if lsh != rhs;
+            // LIR_eqd, if lhs != rhs;
             //   ucomisd       ZPC   outcome (SETP/JP succeeds if P==0,
             //                                SETE/JE succeeds if Z==0)
             //   -------       ---   -------
             //   UNORDERED     111   SETP/JP succeeds (and skips to fail target)
             //   EQUAL         100   SETP/JP fails, SETE/JE succeeds
             //   GREATER_THAN  000   SETP/JP fails, SETE/JE fails
             //   LESS_THAN     001   SETP/JP fails, SETE/JE fails
             //
@@ -2805,23 +2829,20 @@ namespace nanojit
                 if (pop)
                     FCOMPP();
                 else
                     FCOMP();
                 FLDr(FST0); // DUP
             } else {
                 TEST_AH(mask);
                 FNSTSW_AX();        // requires rEAX to be free
-                if (rhs->isImmD())
-                {
+                if (rhs->isImmD()) {
                     const uint64_t* p = findImmDFromPool(rhs->immDasQ());
                     FCOMdm(pop, (const double*)p);
-                }
-                else
-                {
+                } else {
                     int d = findMemFor(rhs);
                     FCOM(pop, d, FP);
                 }
             }
         }
     }
 
     // Increment the 32-bit profiling counter at pCtr, without
--- a/js/src/nanojit/Nativei386.h
+++ b/js/src/nanojit/Nativei386.h
@@ -194,19 +194,22 @@ namespace nanojit
         void nativePageSetup();\
         void underrunProtect(int);\
         bool hardenNopInsertion(const Config& c) { return c.harden_nop_insertion; } \
         void asm_immi(Register r, int32_t val, bool canClobberCCs);\
         void asm_stkarg(LIns* p, int32_t& stkd);\
         void asm_farg(LIns*, int32_t& stkd);\
         void asm_arg(ArgType ty, LIns* p, Register r, int32_t& stkd);\
         void asm_pusharg(LIns*);\
+        void asm_cmp(LIns *cond); \
+        void asm_cmpi(LIns *cond); \
         void asm_cmpd(LIns *cond);\
-        NIns* asm_branchd(bool, LIns*, NIns*);\
-        void asm_cmp(LIns *cond); \
+        NIns* asm_branch_helper(bool, LIns* cond, NIns*);\
+        NIns* asm_branchi_helper(bool, LIns* cond, NIns*);\
+        NIns* asm_branchd_helper(bool, LIns* cond, NIns*);\
         void asm_div_mod(LIns *cond); \
         void asm_load(int d, Register r); \
         void asm_immd(Register r, uint64_t q, double d, bool canClobberCCs); \
         \
         /* These function generate fragments of instructions. */ \
         void IMM8(int32_t i) { /* Length: 1 byte. */ \
             _nIns -= 1; \
             *((int8_t*)_nIns) = int8_t(i); \
@@ -424,17 +427,16 @@ namespace nanojit
         void FPUm(int32_t o, int32_t d, Register b); \
         void FPUdm(int32_t o, const double* const m); \
         void TEST_AH(int32_t i); \
         void TEST_AX(int32_t i); \
         void FNSTSW_AX(); \
         void FCHS(); \
         void FLD1(); \
         void FLDZ(); \
-        void FFREE(Register r); \
         void FST32(bool p, int32_t d, Register b); \
         void FSTQ(bool p, int32_t d, Register b); \
         void FSTPQ(int32_t d, Register b); \
         void FCOM(bool p, int32_t d, Register b); \
         void FCOMdm(bool p, const double* dm); \
         void FLD32(int32_t d, Register b); \
         void FLDQ(int32_t d, Register b); \
         void FLDQdm(const double* dm); \
@@ -446,17 +448,16 @@ namespace nanojit
         void FSUBR(int32_t d, Register b); \
         void FMUL( int32_t d, Register b); \
         void FDIV( int32_t d, Register b); \
         void FDIVR(int32_t d, Register b); \
         void FADDdm( const double *dm); \
         void FSUBRdm(const double* dm); \
         void FMULdm( const double* dm); \
         void FDIVRdm(const double* dm); \
-        void FINCSTP(); \
         void FSTP(Register r) { \
             count_fpu(); \
             FPU(0xddd8, r); \
             asm_output("fstp %s", gpn(r)); \
             fpu_pop(); \
         }; \
         void FCOMP(); \
         void FCOMPP(); \
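
A closing note on the LIR_ltd/LIR_led handling touched above: both backends rely on asm_cmpd() to rewrite a < b as b > a (and a <= b as b >= a) before emitting the ucomisd, which is why asm_branchd_helper() and asm_condd() handle LIR_ltd/LIR_gtd (and LIR_led/LIR_ged) with the same JA/JAE and SETA/SETAE cases. A stand-alone sketch of that swap, using made-up names rather than the real LIR opcodes:

    #include <cassert>
    #include <utility>

    // Simplified stand-ins for the LIR double-compare opcodes.
    enum CmpDOp { LTD, GTD, LED, GED };

    // Rewrites a < b as b > a and a <= b as b >= a, mirroring what asm_cmpd()
    // does before emitting the ucomisd, so only the "above" branch/setcc forms
    // are needed afterwards.
    template <typename Operand>
    CmpDOp normalizeCmpD(CmpDOp op, Operand& a, Operand& b) {
        if (op == LTD) { std::swap(a, b); return GTD; }
        if (op == LED) { std::swap(a, b); return GED; }
        return op;
    }

    int main() {
        double a = 1.0, b = 2.0;
        CmpDOp op = normalizeCmpD(LTD, a, b);   // a < b is compiled as b > a
        assert(op == GTD && a == 2.0 && b == 1.0);
        (void)op;
        return 0;
    }

On X64 the swap now lives only in asm_cmpd(), so asm_branchd_helper() and asm_condd() no longer duplicate it.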