Make LIR_ov work for LIR_mul on ARM. (bug 521161, r=gal)
author Jacob Bramley <Jacob.Bramley@arm.com>
Mon, 02 Nov 2009 09:35:01 +0000
changeset 34561 35ec7012e9326473542d4a4fd1b4847c2f8e6e8c
parent 34560 8a8573ae1227452f1d6989b3b838b7156378f3bb
child 34562 69642368d38f2bd9cccad9cb6d330f9d94f691ac
push id unknown
push user unknown
push date unknown
reviewers gal
bugs 521161
milestone 1.9.3a1pre
js/src/jstracer.cpp
js/src/nanojit/NativeARM.cpp
js/src/nanojit/NativeARM.h
--- a/js/src/jstracer.cpp
+++ b/js/src/jstracer.cpp
@@ -8088,23 +8088,21 @@ TraceRecorder::alu(LOpcode v, jsdouble v
     jsdouble r;
     switch (v) {
     case LIR_fadd:
         r = v0 + v1;
         break;
     case LIR_fsub:
         r = v0 - v1;
         break;
-#if !defined NANOJIT_ARM
     case LIR_fmul:
         r = v0 * v1;
         if (r == 0.0)
             goto out;
         break;
-#endif
 #if defined NANOJIT_IA32 || defined NANOJIT_X64
     case LIR_fdiv:
         if (v1 == 0)
             goto out;
         r = v0 / v1;
         break;
     case LIR_fmod:
         if (v0 < 0 || v1 == 0 || (s1->isconstf() && v1 < 0))
--- a/js/src/nanojit/NativeARM.cpp
+++ b/js/src/nanojit/NativeARM.cpp
@@ -1852,16 +1852,24 @@ Assembler::asm_branch(bool branchOnFalse
 
     // The old "never" condition code has special meaning on newer ARM cores,
     // so use "always" as a sensible default code.
     ConditionCode cc = AL;
 
     // Detect whether or not this is a floating-point comparison.
     bool    fp_cond;
 
+    // Because MUL can't set the V flag, we use SMULL and CMP to set the Z flag
+    // to detect overflow on multiply. Thus, if cond points to a LIR_ov which
+    // in turn points to a LIR_mul, we must be conditional on !Z, not V.
+    if ((condop == LIR_ov) && (cond->oprnd1()->isop(LIR_mul))) {
+        condop = LIR_eq;
+        branchOnFalse = !branchOnFalse;
+    }
+
     // Select the appropriate ARM condition code to match the LIR instruction.
     switch (condop)
     {
         // Floating-point conditions. Note that the VFP LT/LE conditions
         // require use of the unsigned condition codes, even though
         // float-point comparisons are always signed.
         case LIR_feq:   cc = EQ;    fp_cond = true;     break;
         case LIR_flt:   cc = LO;    fp_cond = true;     break;
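The Z-flag trick referred to in the comment above can be modelled in a few lines of host-side C++. This is purely illustrative (mul_overflows is not a nanojit function); it only shows what the emitted SMULL/CMP pair computes:

    #include <stdint.h>

    // Model of "SMULL lo, hi, a, b; CMP hi, lo, ASR #31": the CMP leaves Z set
    // (no overflow) exactly when the high word of the 64-bit product is the
    // sign-extension of the low word.
    static bool mul_overflows(int32_t a, int32_t b)
    {
        int64_t product = (int64_t)a * (int64_t)b;   // what SMULL computes
        int32_t lo = (int32_t)product;               // low word: the LIR_mul result
        int32_t hi = (int32_t)(product >> 32);       // high word: kept in IP
        int32_t lo_sign = (lo < 0) ? -1 : 0;         // lo, ASR #31
        return hi != lo_sign;                        // Z clear  <=>  overflow
    }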
@@ -1986,28 +1994,40 @@ Assembler::asm_fcond(LInsp ins)
 
     asm_fcmp(ins);
 }
 
 void
 Assembler::asm_cond(LInsp ins)
 {
     Register r = prepResultReg(ins, AllowableFlagRegs);
-    switch(ins->opcode())
+    LOpcode op = ins->opcode();
+    
+    switch(op)
     {
         case LIR_eq:  SETEQ(r); break;
-        case LIR_ov:  SETVS(r); break;
         case LIR_lt:  SETLT(r); break;
         case LIR_le:  SETLE(r); break;
         case LIR_gt:  SETGT(r); break;
         case LIR_ge:  SETGE(r); break;
         case LIR_ult: SETLO(r); break;
         case LIR_ule: SETLS(r); break;
         case LIR_ugt: SETHI(r); break;
         case LIR_uge: SETHS(r); break;
+        case LIR_ov:
+            // Because MUL can't set the V flag, we use SMULL and CMP to set
+            // the Z flag to detect overflow on multiply. Thus, if ins points
+            // to a LIR_ov which in turn points to a LIR_mul, we must be
+            // conditional on !Z, not V.
+            if (!ins->oprnd1()->isop(LIR_mul)) {
+                SETVS(r);
+            } else {
+                SETNE(r);
+            }
+            break;
         default:      NanoAssert(0);  break;
     }
     asm_cmp(ins);
 }
 
 void
 Assembler::asm_arith(LInsp ins)
 {
@@ -2101,43 +2121,71 @@ Assembler::asm_arith(LInsp ins)
         case LIR_xor:   EORs(rr, ra, rb, 0);    break;
 
         case LIR_mul:
             // ARMv5 and earlier cores cannot do a MUL where the first operand
             // is also the result, so we need a special case to handle that.
             //
             // We try to use rb as the first operand by default because it is
             // common for (rr == ra) and is thus likely to be the most
-            // efficient case; if ra is no longer used after this LIR
-            // instruction, it is re-used for the result register (rr).
+            // efficient method.
+
             if ((ARM_ARCH > 5) || (rr != rb)) {
-                // Newer cores place no restrictions on the registers used in a
-                // MUL instruction (compared to other arithmetic instructions).
-                MUL(rr, rb, ra);
+                // IP is used to temporarily store the high word of the result from
+                // SMULL, so we make use of this to perform an overflow check, as
+                // ARM's MUL instruction can't set the overflow flag by itself.
+                // We can check for overflow using the following:
+                //   SMULL  rr, ip, ra, rb
+                //   CMP    ip, rr, ASR #31
+                // An explanation can be found in bug 521161. This sets Z if we did
+                // _not_ overflow, and clears it if we did.
+                ALUr_shi(AL, cmp, 1, IP, IP, rr, ASR_imm, 31);
+                SMULL(rr, IP, rb, ra);
             } else {
                 // ARM_ARCH is ARMv5 (or below) and rr == rb, so we must
                 // find a different way to encode the instruction.
 
                 // If possible, swap the arguments to avoid the restriction.
                 if (rr != ra) {
                     // We know that rr == rb, so this will be something like
                     // rX = rY * rX.
-                    MUL(rr, ra, rb);
+                    // Other than swapping ra and rb, this works in the same
+                    // way as the ARMv6+ case, above.
+                    ALUr_shi(AL, cmp, 1, IP, IP, rr, ASR_imm, 31);
+                    SMULL(rr, IP, ra, rb);
                 } else {
-                    // We're trying to do rX = rX * rX, so we must use a
-                    // temporary register to achieve this correctly on ARMv5.
+                    // We're trying to do rX = rX * rX, but we also need to
+                    // check for overflow, which would normally require two
+                    // extra registers on ARMv5 and below. We avoid that by
+                    // observing the following:
+                    //   - abs(rX)*abs(rX) = rX*rX, so we force the input to be
+                    //     positive to simplify the detection logic.
+                    //   - Any argument greater than 0xffff will _always_
+                    //     overflow, and we can easily check that the top 16
+                    //     bits are zero.
+                    //   - Any argument lower than (or equal to) 0xffff that
+                    //     also overflows is guaranteed to set output bit 31.
+                    // 
+                    // Thus, we know we have _not_ overflowed if:
+                    //   abs(rX)&0xffff0000 == 0 AND result[31] == 0
+                    //
+                    // The following instruction sequence will be emitted:
+                    // MOVS     IP, rX      // Put abs(rX) into IP.
+                    // RSBMI    IP, IP, #0  // ...
+                    // MUL      rX, IP, IP  // Do the actual multiplication.
+                    // MOVS     IP, IP, LSR #16 // Check that abs(arg)<=0xffff
+                    // CMPEQ    IP, rX, ASR #31 // Check that result[31] == 0
 
-                    // The register allocator will never allocate IP so it will
-                    // be safe to use here.
-                    NanoAssert(ra != IP);
                     NanoAssert(rr != IP);
 
-                    // In this case, rr == ra == rb.
-                    MUL(rr, IP, rb);
-                    MOV(IP, ra);
+                    ALUr_shi(EQ, cmp, 1, IP, IP, rr, ASR_imm, 31);
+                    ALUr_shi(AL, mov, 1, IP, IP, IP, LSR_imm, 16);
+                    MUL(rr, IP, IP);
+                    ALUi(MI, rsb, 0, IP, IP, 0);
+                    ALUr(AL, mov, 1, IP, ra, ra);
                 }
             }
             break;
         
         // The shift operations need a mask to match the JavaScript
         // specification because the ARM architecture allows a greater shift
         // range than JavaScript.
         case LIR_lsh:
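The rX = rX * rX case above relies on a slightly subtle argument, so a host-side sketch may help. It is illustrative only (square_overflows is not nanojit code) and mirrors the MOVS/RSBMI/MUL/MOVS/CMPEQ sequence described in the comment:

    #include <stdint.h>

    // No overflow iff abs(x) fits in 16 bits AND bit 31 of the 32-bit product
    // is clear. Once abs(x) <= 0xffff the 32-bit MUL result is exact, so
    // signed overflow reduces to testing bit 31.
    static bool square_overflows(int32_t x)
    {
        uint32_t absx    = (x < 0) ? 0u - (uint32_t)x : (uint32_t)x; // MOVS / RSBMI
        uint32_t result  = absx * absx;                              // MUL (modulo 2^32)
        bool small_arg   = (absx >> 16) == 0;                        // MOVS IP, IP, LSR #16
        bool bit31_clear = (result >> 31) == 0;                      // CMPEQ IP, rX, ASR #31
        return !(small_arg && bit31_clear);
    }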
@@ -2224,25 +2272,35 @@ Assembler::asm_cmov(LInsp ins)
     const Register rr = prepResultReg(ins, GpRegs);
 
     // this code assumes that neither LD nor MR nor MRcc set any of the condition flags.
     // (This is true on Intel, is it true on all architectures?)
     const Register iffalsereg = findRegFor(iffalse, GpRegs & ~rmask(rr));
     switch (condval->opcode()) {
         // note that these are all opposites...
         case LIR_eq:    MOVNE(rr, iffalsereg);  break;
-        case LIR_ov:    MOVVC(rr, iffalsereg);  break;
         case LIR_lt:    MOVGE(rr, iffalsereg);  break;
         case LIR_le:    MOVGT(rr, iffalsereg);  break;
         case LIR_gt:    MOVLE(rr, iffalsereg);  break;
         case LIR_ge:    MOVLT(rr, iffalsereg);  break;
         case LIR_ult:   MOVHS(rr, iffalsereg);  break;
         case LIR_ule:   MOVHI(rr, iffalsereg);  break;
         case LIR_ugt:   MOVLS(rr, iffalsereg);  break;
         case LIR_uge:   MOVLO(rr, iffalsereg);  break;
+        case LIR_ov:
+            // Because MUL can't set the V flag, we use SMULL and CMP to set
+            // the Z flag to detect overflow on multiply. Thus, if ins points
+            // to a LIR_ov which in turn points to a LIR_mul, we must be
+            // conditional on !Z, not V.
+            if (!condval->oprnd1()->isop(LIR_mul)) {
+                MOVVC(rr, iffalsereg);
+            } else {
+                MOVEQ(rr, iffalsereg);
+            }
+            break;
         default: debug_only( NanoAssert(0) );   break;
     }
     /*const Register iftruereg =*/ findSpecificRegFor(iftrue, rr);
     asm_cmp(condval);
 }
 
 void
 Assembler::asm_qhi(LInsp ins)
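One thing that is easy to misread in asm_arith is the ordering: nanojit emits instructions backwards, so the CMP is written to the code buffer before the SMULL but executes after it. Restoring execution order, the ARMv6 path comes out roughly as follows (a sketch; the register choices are whatever the allocator picked):

    // Emission order in asm_arith (backwards):
    //   ALUr_shi(AL, cmp, 1, IP, IP, rr, ASR_imm, 31);  // written first, runs second
    //   SMULL(rr, IP, rb, ra);                          // written second, runs first
    //
    // Execution order of the generated code:
    //   SMULL  rr, ip, rb, ra     ; ip:rr = rb * ra (64-bit signed product)
    //   CMP    ip, rr, ASR #31    ; Z set iff the product fits in 32 bits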
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -472,16 +472,36 @@ enum {
 // _d = _l - _r
 #define SUBs(_d,_l,_r,_s)   ALUr(AL, sub, _s, _d, _l, _r)
 #define SUB(_d,_l,_r)       ALUr(AL, sub,  0, _d, _l, _r)
 
 // --------
 // Other operations.
 // --------
 
+// [_d_hi,_d] = _l * _r
+#define SMULL_dont_check_op1(_d, _d_hi, _l, _r)  do {                               \
+        underrunProtect(4);                                                         \
+        NanoAssert((ARM_ARCH >= 6) || ((_d) != (_l)));                              \
+        NanoAssert(IsGpReg(_d) && IsGpReg(_d_hi) && IsGpReg(_l) && IsGpReg(_r));    \
+        NanoAssert(((_d) != PC) && ((_d_hi) != PC) && ((_l) != PC) && ((_r) != PC));\
+        *(--_nIns) = (NIns)( COND_AL | 0xc00090 | (_d_hi)<<16 | (_d)<<12 | (_r)<<8 | (_l) );\
+        asm_output("smull %s, %s, %s, %s",gpn(_d),gpn(_d_hi),gpn(_l),gpn(_r));      \
+} while(0)
+
+#if NJ_ARM_ARCH >= NJ_ARM_V6
+#define SMULL(_d, _d_hi, _l, _r) SMULL_dont_check_op1(_d, _d_hi, _l, _r)
+#else
+#define SMULL(_d, _d_hi, _l, _r) do {               \
+        NanoAssert(   (_d)!=(_l));                  \
+        NanoAssert((_d_hi)!=(_l));                  \
+        SMULL_dont_check_op1(_d, _d_hi, _l, _r);    \
+    } while(0)
+#endif
+
 // _d = _l * _r
 #define MUL_dont_check_op1(_d, _l, _r)  do {                                \
         underrunProtect(4);                                                 \
         NanoAssert((ARM_ARCH >= 6) || ((_d) != (_l)));                      \
         NanoAssert(IsGpReg(_d) && IsGpReg(_l) && IsGpReg(_r));              \
         NanoAssert(((_d) != PC) && ((_l) != PC) && ((_r) != PC));           \
         *(--_nIns) = (NIns)( COND_AL | (_d)<<16 | (_r)<<8 | 0x90 | (_l) );  \
         asm_output("mul %s, %s, %s",gpn(_d),gpn(_l),gpn(_r)); } while(0)
@@ -722,16 +742,17 @@ enum {
     underrunProtect(8);                                                 \
     *(--_nIns) = (NIns)( ( _opp<<28) | (0x3A<<20) | ((_r)<<12) | (0) ); \
     *(--_nIns) = (NIns)( (_cond<<28) | (0x3A<<20) | ((_r)<<12) | (1) ); \
     asm_output("mov%s %s, #1", condNames[_cond], gpn(_r));              \
     asm_output("mov%s %s, #0", condNames[_opp], gpn(_r));               \
     } while (0)
 
 #define SETEQ(r)    SET(r,EQ)
+#define SETNE(r)    SET(r,NE)
 #define SETLT(r)    SET(r,LT)
 #define SETLE(r)    SET(r,LE)
 #define SETGT(r)    SET(r,GT)
 #define SETGE(r)    SET(r,GE)
 #define SETLO(r)    SET(r,LO)
 #define SETLS(r)    SET(r,LS)
 #define SETHI(r)    SET(r,HI)
 #define SETHS(r)    SET(r,HS)