[arm] b=481761; Finish up ALU op conversions; r=graydon
author     Vladimir Vukicevic <vladimir@pobox.com>
date       Fri, 20 Mar 2009 15:53:14 -0700
changeset  26538 77f4c1affaa22182690760f258403b5ecdceee76
parent     26537 4d5c8ae3362f1611d57bd8d78a4ab8aef6613c70
child      26539 cda79cc9399ed9a9ce2adb0b4c2915fed4ca34e9
push id    6115
push user  rsayre@mozilla.com
push date  Tue, 24 Mar 2009 17:50:03 +0000
reviewers  graydon
bugs       481761
milestone  1.9.2a1pre
js/src/nanojit/NativeARM.cpp
js/src/nanojit/NativeARM.h
--- a/js/src/nanojit/NativeARM.cpp
+++ b/js/src/nanojit/NativeARM.cpp
@@ -95,17 +95,17 @@ Assembler::genPrologue()
 
     // so for alignment purposes we've pushed return addr and fp
     uint32_t stackPushed = STACK_GRANULARITY * savingCount;
     uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
     int32_t amt = aligned - stackPushed;
 
     // Make room on stack for what we are doing
     if (amt)
-        SUBi(SP, amt);
+        SUBi(SP, SP, amt);
 
     verbose_only( verbose_outputf("         %p:",_nIns); )
     verbose_only( verbose_output("         patch entry"); )
     NIns *patchEntry = _nIns;
 
     MOV(FP, SP);
     PUSH_mask(savingMask);
     return patchEntry;
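
For reference, a standalone sketch of the alignment arithmetic in this prologue; the savingCount of 2 and NJ_ALIGN_STACK of 8 are illustrative assumptions, not values taken from this changeset:

    // Sketch only: recompute genPrologue's stack adjustment with assumed
    // values (STACK_GRANULARITY == 4, NJ_ALIGN_STACK == 8, savingCount == 2).
    #include <cstdint>
    #include <cstdio>

    static uint32_t alignUp(uint32_t n, uint32_t align) {
        return (n + align - 1) & ~(align - 1);  // round up to a multiple of align
    }

    int main() {
        uint32_t stackNeeded = 20;                    // hypothetical frame size
        uint32_t stackPushed = 4 * 2;                 // return addr and fp
        uint32_t aligned = alignUp(stackNeeded + stackPushed, 8);  // 28 -> 32
        int32_t amt = aligned - stackPushed;          // 24: emits SUB sp, sp, #24
        printf("aligned=%u amt=%d\n", aligned, amt);
        return 0;
    }
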
@@ -470,17 +470,17 @@ Assembler::asm_restore(LInsp i, Reservat
     (void)resv;
     int d = findMemFor(i);
 
     if (IsFpReg(r)) {
         if (isS8(d >> 2)) {
             FLDD(r, FP, d);
         } else {
             FLDD(r, IP, 0);
-            arm_ADDi(IP, FP, d);
+            ADDi(IP, FP, d);
         }
     } else {
         LDR(r, FP, d);
     }
 
     verbose_only(
         if (_verbose)
             outputf("        restore %s",_thisfrag->lirbuf->names->formatRef(i));
@@ -493,17 +493,17 @@ Assembler::asm_spill(Register rr, int d,
     (void) pop;
     (void) quad;
     if (d) {
         if (IsFpReg(rr)) {
             if (isS8(d >> 2)) {
                 FSTD(rr, FP, d);
             } else {
                 FSTD(rr, IP, 0);
-                arm_ADDi(IP, FP, d);
+                ADDi(IP, FP, d);
             }
         } else {
             STR(rr, FP, d);
         }
     }
 }
 
 void
@@ -524,17 +524,17 @@ Assembler::asm_load64(LInsp ins)
     Register rb = findRegFor(base, GpRegs);
 
     NanoAssert(rb != UnknownReg);
     NanoAssert(rr == UnknownReg || IsFpReg(rr));
 
     if (rr != UnknownReg) {
         if (!isS8(offset >> 2) || (offset&3) != 0) {
             FLDD(rr,IP,0);
-            arm_ADDi(IP, rb, offset);
+            ADDi(IP, rb, offset);
         } else {
             FLDD(rr,rb,offset);
         }
     } else {
         asm_mmq(FP, d, rb, offset);
     }
 
     // *(FP+dr) <- *(rb+db)
@@ -578,17 +578,17 @@ Assembler::asm_store64(LInsp value, int 
     if (!isS8(dr)) {
         baseReg = IP;
         baseOffset = 0;
     }
 
     FSTD(rv, baseReg, baseOffset);
 
     if (!isS8(dr)) {
-        arm_ADDi(IP, rb, dr);
+        ADDi(IP, rb, dr);
     }
 
     // if it's a constant, make sure our baseReg/baseOffset location
     // has the right value
     if (value->isconstq()) {
         const int32_t* p = (const int32_t*) (value-2);
 
         underrunProtect(12);
@@ -734,17 +734,17 @@ Assembler::asm_pusharg(LInsp arg)
     Reservation* argRes = getresv(arg);
     bool quad = arg->isQuad();
 
     if (argRes && argRes->reg != UnknownReg) {
         if (!quad) {
             STR_preindex(argRes->reg, SP, -4);
         } else {
             FSTD(argRes->reg, SP, 0);
-            SUBi(SP, 8);
+            SUBi(SP, SP, 8);
         }
     } else {
         int d = findMemFor(arg);
 
         if (!quad) {
             STR_preindex(IP, SP, -4);
             LDR(IP, FP, d);
         } else {
@@ -940,19 +940,18 @@ Assembler::B_cond_chk(ConditionCode _c, 
         *(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((-4)>>2) & 0xFFFFFF );
         *(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 );
     }
 
     asm_output("%s %p", _c == AL ? "jmp" : "b(cnd)", (void*)(_t));
 }
 
 void
-Assembler::asm_add_imm(Register rd, Register rn, int32_t imm)
+Assembler::asm_add_imm(Register rd, Register rn, int32_t imm, int stat)
 {
-
     int rot = 16;
     uint32_t immval;
     bool pos;
 
     if (imm >= 0) {
         immval = (uint32_t) imm;
         pos = true;
     } else {
@@ -963,32 +962,63 @@ Assembler::asm_add_imm(Register rd, Regi
     while (immval && ((immval & 0x3) == 0)) {
         immval >>= 2;
         rot--;
     }
 
     rot &= 0xf;
 
     if (immval < 256) {
-        underrunProtect(4);
-        if (pos)
-            *(--_nIns) = (NIns)( COND_AL | OP_IMM | OP_STAT | (1<<23) | (rn<<16) | (rd<<12) | (rot << 8) | immval );
-        else
-            *(--_nIns) = (NIns)( COND_AL | OP_IMM | OP_STAT | (1<<22) | (rn<<16) | (rd<<12) | (rot << 8) | immval );
-        asm_output("add %s,%s,%d",gpn(rd),gpn(rn),imm);
-    } else {
+        if (pos) {
+            ALUi_rot(AL, add, stat, rd, rn, immval, rot);
+        } else {
+            ALUi_rot(AL, sub, stat, rd, rn, immval, rot);
+        }
+    } else {
         // add scratch to rn, after loading the value into scratch.
-
         // make sure someone isn't trying to use IP as an operand
+        NanoAssert(rn != IP);
+        underrunProtect(4 + LD32_size);
+        ALUr(AL, add, stat, rd, rn, IP);
+        LD32_nochk(IP, imm);
+    }
+}
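
The rot/immval loop above is searching for ARM's operand-2 form: an 8-bit value rotated right by an even amount. A minimal standalone sketch of the same search, with a hypothetical input:

    // Sketch only: the operand-2 search from asm_add_imm, driven standalone.
    // Finds imm8 and a rotate field such that (imm8 ROR 2*rot) == imm.
    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t immval = 0x400;   // hypothetical immediate (1024)
        int rot = 16;
        while (immval && ((immval & 0x3) == 0)) {
            immval >>= 2;          // strip a pair of low zero bits...
            rot--;                 // ...and fold it into the rotation
        }
        rot &= 0xf;
        if (immval < 256)          // 0x400 -> imm8=1, rot=11 (1 ROR 22)
            printf("imm8=%u rot=%d\n", immval, rot);
        else
            printf("not encodable; go through IP\n");
        return 0;
    }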
 
-        *(--_nIns) = (NIns)( COND_AL | OP_STAT | (1<<23) | (rn<<16) | (rd<<12) | (IP));
-        asm_output("add %s,%s,%s",gpn(rd),gpn(rn),gpn(IP));
-
-        LD32_nochk(IP, imm);
+void
+Assembler::asm_sub_imm(Register rd, Register rn, int32_t imm, int stat)
+{
+    if (imm > -256 && imm < 256) {
+        if (imm >= 0)
+            ALUi(AL, sub, stat, rd, rn, imm);
+        else
+            ALUi(AL, add, stat, rd, rn, -imm);
+    } else if (imm >= 0) {
+        if (imm <= 510) {
+            /* between 0 and 510, inclusive: split into two subtracts.
+             * Instructions are emitted bottom-up, so the 0xff subtract
+             * below executes first and the remainder comes off rd. */
+            int rem = imm - 255;
+            NanoAssert(rem < 256);
+            ALUi(AL, sub, stat, rd, rd, rem & 0xff);
+            ALUi(AL, sub, stat, rd, rn, 0xff);
+        } else {
+            /* more than 510 */
+            NanoAssert(rn != IP);
+            underrunProtect(4 + LD32_size);
+            ALUr(AL, sub, stat, rd, rn, IP);
+            LD32_nochk(IP, imm);
+        }
+    } else {
+        if (imm >= -510) {
+            /* between -510 and -1, inclusive: same bottom-up split */
+            int rem = -imm - 255;
+            NanoAssert(rem < 256);
+            ALUi(AL, add, stat, rd, rd, rem & 0xff);
+            ALUi(AL, add, stat, rd, rn, 0xff);
+        } else {
+            /* less than -510 */
+            NanoAssert(rn != IP);
+            underrunProtect(4 + LD32_size);
+            ALUr(AL, add, stat, rd, rn, IP);
+            LD32_nochk(IP, -imm);
+        }
     }
 }
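
A worked example of the 256..510 split in asm_sub_imm, simulated in plain C++; the register values and the immediate 300 are hypothetical. Recall instructions are emitted bottom-up, so the #0xff subtract executes first:

    // Sketch only: simulate a hypothetical asm_sub_imm(rd, rn, 300).
    // Executed order (bottom-up emission):
    //     sub rd, rn, #0xff      ; rd = rn - 255
    //     sub rd, rd, #0x2d      ; rd = rn - 300
    #include <cstdint>
    #include <cstdio>

    int main() {
        int32_t imm = 300, rn = 1000;       // hypothetical values
        int rem = imm - 255;                // 45 (0x2d)
        int32_t rd = rn - 0xff;             // first subtract
        rd -= rem & 0xff;                   // second subtract
        printf("%d == %d\n", rd, rn - imm); // both 700
        return 0;
    }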
 
 /*
  * VFP
  */
 
 void
@@ -1200,30 +1230,53 @@ Assembler::asm_cmp(LIns *cond)
 
     // ready to issue the compare
     if (rhs->isconst()) {
         int c = rhs->constval();
         if (c == 0 && cond->isop(LIR_eq)) {
             Register r = findRegFor(lhs, GpRegs);
             TEST(r,r);
             // No 64-bit immediates so fall-back to below
-        }
-        else if (!rhs->isQuad()) {
+        } else if (!rhs->isQuad()) {
             Register r = getBaseReg(lhs, c, GpRegs);
-            CMPi(r, c);
+            asm_cmpi(r, c);
+        } else {
+            NanoAssert(0);
         }
     } else {
         findRegFor2(GpRegs, lhs, rA, rhs, rB);
         Register ra = rA->reg;
         Register rb = rB->reg;
         CMP(ra, rb);
     }
 }
 
 void
+Assembler::asm_cmpi(Register r, int32_t imm)
+{
+    if (imm < 0) {
+        if (imm > -256) {
+            ALUi(AL, cmn, 1, 0, r, -imm);
+        } else {
+            underrunProtect(4 + LD32_size);
+            CMP(r, IP);
+            LD32_nochk(IP, imm);
+        }
+    } else {
+        if (imm < 256) {
+            ALUi(AL, cmp, 1, 0, r, imm);
+        } else {
+            underrunProtect(4 + LD32_size);
+            CMP(r, IP);
+            LD32_nochk(IP, imm);
+        }
+    }
+}
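
asm_cmpi falls back to CMN for small negative immediates because cmp r, #-n has no 8-bit encoding, while cmn r, #n sets identical flags. A sketch of the selection as a plain function; the strategy strings are illustrative, not emitted text:

    // Sketch only: which instruction asm_cmpi would pick for an immediate.
    #include <cstdint>
    #include <cstdio>

    static const char* cmp_strategy(int32_t imm) {
        if (imm < 0)
            return imm > -256 ? "cmn r, #-imm"          // same flags as cmp
                              : "ld32 ip, imm; cmp r, ip";
        return imm < 256 ? "cmp r, #imm"
                         : "ld32 ip, imm; cmp r, ip";
    }

    int main() {
        printf("%s\n", cmp_strategy(-7));    // cmn r, #-imm
        printf("%s\n", cmp_strategy(4096));  // ld32 ip, imm; cmp r, ip
        return 0;
    }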
+
+void
 Assembler::asm_loop(LInsp ins, NInsList& loopJumps)
 {
     // XXX asm_loop should be in Assembler.cpp!
 
     JMP_far(0);
     loopJumps.add(_nIns);
 
     // If the target we are looping to is in a different fragment, we have to restore
@@ -1316,88 +1369,84 @@ Assembler::asm_arith(LInsp ins)
 
     Register rr = prepResultReg(ins, allow);
     Reservation* rA = getresv(lhs);
     Register ra;
     // if this is last use of lhs in reg, we can re-use result reg
     if (rA == 0 || (ra = rA->reg) == UnknownReg)
         ra = findSpecificRegFor(lhs, rr);
     // else, rA already has a register assigned.
+    NanoAssert(ra != UnknownReg);
 
     if (forceReg) {
         if (lhs == rhs)
             rb = ra;
 
         if (op == LIR_add || op == LIR_addp)
-            ADD(rr, rb);
+            ADDs(rr, ra, rb, 1);
         else if (op == LIR_sub)
-            SUB(rr, rb);
+            SUB(rr, ra, rb);
-        else if (op == LIR_mul)
-            MUL(rr, rb);
+        else if (op == LIR_mul) {
+            // mul has no three-operand form here; if lhs didn't land in rr,
+            // copy it in first (emitted bottom-up, so the MOV executes first)
+            MUL(rr, rb);
+            if (rr != ra)
+                MOV(rr, ra);
+        }
         else if (op == LIR_and)
-            AND(rr, rr, rb);
+            AND(rr, ra, rb);
         else if (op == LIR_or)
-            ORR(rr, rr, rb);
+            ORR(rr, ra, rb);
         else if (op == LIR_xor)
-            EOR(rr, rr, rb);
+            EOR(rr, ra, rb);
         else if (op == LIR_lsh)
-            SHL(rr, rb);
+            SHL(rr, ra, rb);
         else if (op == LIR_rsh)
-            SAR(rr, rb);
+            SAR(rr, ra, rb);
         else if (op == LIR_ush)
-            SHR(rr, rb);
+            SHR(rr, ra, rb);
         else
             NanoAssertMsg(0, "Unsupported");
     } else {
         int c = rhs->constval();
         if (op == LIR_add || op == LIR_addp)
-            ADDi(rr, c);
+            ADDi(rr, ra, c);
         else if (op == LIR_sub)
-                    SUBi(rr, c);
+            SUBi(rr, ra, c);
         else if (op == LIR_and)
-            ANDi(rr, rr, c);
+            ANDi(rr, ra, c);
         else if (op == LIR_or)
-            ORRi(rr, rr, c);
+            ORRi(rr, ra, c);
         else if (op == LIR_xor)
-            EORi(rr, rr, c);
+            EORi(rr, ra, c);
         else if (op == LIR_lsh)
-            SHLi(rr, c);
+            SHLi(rr, ra, c);
         else if (op == LIR_rsh)
-            SARi(rr, c);
+            SARi(rr, ra, c);
         else if (op == LIR_ush)
-            SHRi(rr, c);
+            SHRi(rr, ra, c);
         else
             NanoAssertMsg(0, "Unsupported");
     }
-
-    if (rr != ra)
-        MOV(rr,ra);
 }
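
The trailing MOV(rr, ra) could be deleted because the new three-operand ALUr forms take ra directly; the old two-operand forms operated in place, so ra first had to be copied into rr. A small simulation of the two sequences, with hypothetical values:

    // Sketch only: why dropping the fix-up MOV is safe.  Old two-operand
    // sequence (MOV emitted last, executed first) vs. new three-operand op.
    #include <cstdio>

    int main() {
        int ra = 0xF0, rb = 0x0F;
        int rr = ra;            // old:  mov rr, ra
        rr &= rb;               //       and rr, rr, rb
        int old_rr = rr;
        rr = ra & rb;           // new:  and rr, ra, rb
        printf("%d == %d\n", old_rr, rr);   // same result, one fewer insn
        return 0;
    }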
 
 void
 Assembler::asm_neg_not(LInsp ins)
 {
     LOpcode op = ins->opcode();
     Register rr = prepResultReg(ins, GpRegs);
 
     LIns* lhs = ins->oprnd1();
     Reservation *rA = getresv(lhs);
     // if this is last use of lhs in reg, we can re-use result reg
     Register ra;
     if (rA == 0 || (ra=rA->reg) == UnknownReg)
         ra = findSpecificRegFor(lhs, rr);
     // else, rA already has a register assigned.
+    NanoAssert(ra != UnknownReg);
 
     if (op == LIR_not)
-        NOT(rr);
+        MVN(rr, ra);
     else
-        NEG(rr);
-
-    if ( rr != ra )
-        MOV(rr,ra);
+        RSBS(rr, ra);
 }
 
 void
 Assembler::asm_ld(LInsp ins)
 {
     LOpcode op = ins->opcode();
     LIns* base = ins->oprnd1();
     LIns* disp = ins->oprnd2();
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -203,17 +203,19 @@ verbose_only( extern const char* shiftNa
     void LD32_nochk(Register r, int32_t imm);                           \
     void BL(NIns*);                                                     \
     void JMP_far(NIns*);                                                \
     void B_cond_chk(ConditionCode, NIns*, bool);                        \
     void underrunProtect(int bytes);                                    \
     void nativePageReset();                                             \
     void nativePageSetup();                                             \
     void asm_quad_nochk(Register, const int32_t*);                      \
-    void asm_add_imm(Register, Register, int32_t);                      \
+    void asm_add_imm(Register, Register, int32_t, int stat = 0);        \
+    void asm_sub_imm(Register, Register, int32_t, int stat = 0);        \
+    void asm_cmpi(Register, int32_t imm);                               \
     int* _nSlot;                                                        \
     int* _nExitSlot;
 
 
 #define asm_farg(i) NanoAssert(false)
 
 //printf("jmp_l_n count=%d, nins=%X, %X = %X\n", (_c), nins, _nIns, ((intptr_t)(nins+(_c))-(intptr_t)_nIns - 4) );
 
@@ -294,16 +296,36 @@ enum {
             asm_output("%s%s%s %s, #0x%X", #op, condNames[cond], (S)?"s":"", gpn(rd), (imm));\
         else if (ARM_##op >= ARM_tst && ARM_##op <= ARM_cmn) {\
             NanoAssert(S==1);\
             asm_output("%s%s %s, #0x%X", #op, condNames[cond], gpn(rl), (imm));\
         } else\
             asm_output("%s%s%s %s, %s, #0x%X", #op, condNames[cond], (S)?"s":"", gpn(rd), gpn(rl), (imm));\
     } while (0)
 
+// ALU operation with register and rotated 8-bit immediate arguments
+//  S   - bit, 0 or 1, whether the CPSR register is updated
+//  rd  - destination register
+//  rl  - first (left) operand register
+//  imm - immediate (max 8 bits)
+//  rot - rotation to apply to imm
+#define ALUi_rot(cond, op, S, rd, rl, imm, rot) do {\
+        underrunProtect(4);\
+        NanoAssert(isU8(imm));\
+        *(--_nIns) = (NIns) ((cond)<<28 | OP_IMM | (ARM_##op)<<21 | (S)<<20 | (rl)<<16 | (rd)<<12 | (rot)<<8 | (imm));\
+        if (ARM_##op == ARM_mov || ARM_##op == ARM_mvn)\
+            asm_output("%s%s%s %s, #0x%X, %d", #op, condNames[cond], (S)?"s":"", gpn(rd), (imm), (rot)*2);\
+        else if (ARM_##op >= ARM_tst && ARM_##op <= ARM_cmn) {\
+            NanoAssert(S==1);\
+            asm_output("%s%s %s, #0x%X, %d", #op, condNames[cond], gpn(rl), (imm), (rot)*2);\
+        } else\
+            asm_output("%s%s%s %s, %s, #0x%X, %d", #op, condNames[cond], (S)?"s":"", gpn(rd), gpn(rl), (imm), (rot)*2);\
+    } while (0)
+
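
A sketch of the word ALUi_rot would emit for adds r0, r1, #0x400, built by hand; ARM_add == 4 and OP_IMM == 1<<25 are assumptions matching the standard ARM data-processing encoding this header follows:

    // Sketch only: hand-build the word ALUi_rot(AL, add, 1, R0, R1, 1, 11)
    // would emit.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t COND_AL = 0xEu << 28, OP_IMM = 1u << 25;  // assumed
        const uint32_t ARM_add = 4, S = 1, rl = 1 /*r1*/, rd = 0 /*r0*/;
        const uint32_t rot = 11, imm = 1;   // 1 ROR 22 == 0x400
        uint32_t ins = COND_AL | OP_IMM | ARM_add << 21 | S << 20
                     | rl << 16 | rd << 12 | rot << 8 | imm;
        printf("0x%08X\n", ins);            // 0xE2910B01: adds r0, r1, #0x400
        return 0;
    }
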
 // ALU operation with two register arguments
 //  S   - bit, 0 or 1, whether the CPSR register is updated
 //  rd  - destination register
 //  rl  - first (left) operand register
 //  rr  - second (right) operand register
 #define ALUr(cond, op, S, rd, rl, rr) do {\
         underrunProtect(4);\
         *(--_nIns) = (NIns) ((cond)<<28 |(ARM_##op)<<21 | (S)<<20 | (rl)<<16 | (rd)<<12 | (rr));\
@@ -311,17 +333,17 @@ enum {
             asm_output("%s%s%s %s, %s", #op, condNames[cond], (S)?"s":"", gpn(rd), gpn(rr));\
         else if (ARM_##op >= ARM_tst && ARM_##op <= ARM_cmn) {\
             NanoAssert(S==1);\
             asm_output("%s%s  %s, %s", #op, condNames[cond], gpn(rl), gpn(rr));\
         } else\
             asm_output("%s%s%s %s, %s, %s", #op, condNames[cond], (S)?"s":"", gpn(rd), gpn(rl), gpn(rr));\
     } while (0)
 
-// ALU operator with two register arguments, with rr operated on by a shift and shift immediate
+// ALU operation with two register arguments, with rr operated on by a shift and shift immediate
 //  S   - bit, 0 or 1, whether the CPSR register is updated
 //  rd  - destination register
 //  rl  - first (left) operand register
 //  rr  - second (right) operand register
 //  sh  - a ShiftOperator
 //  imm - immediate argument to shift operator, 5 bits (0..31)
 #define ALUr_shi(cond, op, S, rd, rl, rr, sh, imm) do {\
         underrunProtect(4);\
@@ -331,22 +353,34 @@ enum {
             asm_output("%s%s%s %s, %s, %s #%d", #op, condNames[cond], (S)?"s":"", gpn(rd), gpn(rr), shiftNames[sh], (imm));\
         else if (ARM_##op >= ARM_tst && ARM_##op <= ARM_cmn) {\
             NanoAssert(S==1);\
             asm_output("%s%s  %s, %s, %s #%d", #op, condNames[cond], gpn(rl), gpn(rr), shiftNames[sh], (imm));\
         } else\
             asm_output("%s%s%s %s, %s, %s, %s #%d", #op, condNames[cond], (S)?"s":"", gpn(rd), gpn(rl), gpn(rr), shiftNames[sh], (imm));\
     } while (0)
 
-
-
-
-
-
-
+// ALU operation with two register arguments, with rr operated on by a shift and shift register
+//  S   - bit, 0 or 1, whether the CPSR register is updated
+//  rd  - destination register
+//  rl  - first (left) operand register
+//  rr  - second (right) operand register
+//  sh  - a ShiftOperator
+//  rs  - shift operand register
+#define ALUr_shr(cond, op, S, rd, rl, rr, sh, rs) do {\
+        underrunProtect(4);\
+        *(--_nIns) = (NIns) ((cond)<<28 |(ARM_##op)<<21 | (S)<<20 | (rl)<<16 | (rd)<<12 | (rs)<<8 | (sh)<<4 | (rr));\
+        if (ARM_##op == ARM_mov || ARM_##op == ARM_mvn)\
+            asm_output("%s%s%s %s, %s, %s %s", #op, condNames[cond], (S)?"s":"", gpn(rd), gpn(rr), shiftNames[sh], gpn(rs));\
+        else if (ARM_##op >= ARM_tst && ARM_##op <= ARM_cmn) {\
+            NanoAssert(S==1);\
+            asm_output("%s%s  %s, %s, %s %s", #op, condNames[cond], gpn(rl), gpn(rr), shiftNames[sh], gpn(rs));\
+        } else\
+            asm_output("%s%s%s %s, %s, %s, %s %s", #op, condNames[cond], (S)?"s":"", gpn(rd), gpn(rl), gpn(rr), shiftNames[sh], gpn(rs));\
+    } while (0)
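
Likewise, a hand-built sketch of the ALUr_shr encoding behind SHR(R0, R1, R2), i.e. movs r0, r1, lsr r2; ARM_mov == 0xD and LSR_reg == 3 are assumptions matching the standard ARM shift-by-register encoding:

    // Sketch only: hand-build the ALUr_shr word behind SHR(R0, R1, R2).
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t COND_AL = 0xEu << 28;
        const uint32_t ARM_mov = 0xD, S = 1;
        const uint32_t rd = 0, rl = 0, rr = 1, rs = 2, LSR_reg = 3;
        uint32_t ins = COND_AL | ARM_mov << 21 | S << 20 | rl << 16
                     | rd << 12 | rs << 8 | LSR_reg << 4 | rr;
        printf("0x%08X\n", ins);            // 0xE1B00231: movs r0, r1, lsr r2
        return 0;
    }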
 
 // _d = _l OR _r
 #define ORR(_d,_l,_r) ALUr(AL, orr, 0, _d, _l, _r)
 
 // _d = _l OR _imm
 #define ORRi(_d,_l,_imm) ALUi(AL, orr, 0, _d, _l, _imm)
 
 // _d = _l AND _r
@@ -356,177 +390,76 @@ enum {
 #define ANDi(_d,_l,_imm) ALUi(AL, and, 0, _d, _l, _imm)
 
 // _d = _l ^ _r
 #define EOR(_d,_l,_r) ALUr(AL, eor, 0, _d, _l, _r)
 
 // _d = _l ^ _imm
 #define EORi(_d,_l,_imm) ALUi(AL, eor, 0, _d, _l, _imm)
 
-// _d = _n + _m
-#define arm_ADD(_d,_n,_m) do {                                          \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | OP_STAT | (1<<23) | ((_n)<<16) | ((_d)<<12) | (_m)); \
-        asm_output("add %s,%s+%s",gpn(_d),gpn(_n),gpn(_m)); } while(0)
-
-// _l = _l + _r
-#define ADD(_l,_r)   arm_ADD(_l,_l,_r)
+// _d = _l + _r; update flags
+#define ADD(_d,_l,_r) ALUr(AL, add, 1, _d, _l, _r)
 
-// Note that this sometimes converts negative immediate values to a to a sub.
-// _d = _r + _imm
-#define arm_ADDi(_d,_n,_imm)   asm_add_imm(_d,_n,_imm)
-#define ADDi(_r,_imm)  arm_ADDi(_r,_r,_imm)
+// _d = _l + _r; update flags if _stat == 1
+#define ADDs(_d,_l,_r,_stat) ALUr(AL, add, _stat, _d, _l, _r)
 
-// _l = _l - _r
-#define SUB(_l,_r)  do {                                                \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_l)<<16) | ((_l)<<12) | (_r)); \
-        asm_output("sub %s,%s",gpn(_l),gpn(_r)); } while(0)
+// _d = _l + _imm; update flags
+#define ADDi(_d,_l,_imm) asm_add_imm(_d, _l, _imm, 1)
 
-// _r = _r - _imm
-#define SUBi(_r,_imm)  do {                                             \
-        if ((_imm)>-256 && (_imm)<256) {                                \
-            underrunProtect(4);                                         \
-            if ((_imm)>=0)  *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) ); \
-            else            *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | ((-(_imm))&0xFF) ); \
-        } else {                                                        \
-            if ((_imm)>=0) {                                            \
-                if ((_imm)<=510) {                                      \
-                    underrunProtect(8);                                 \
-                    int rem = (_imm) - 255;                             \
-                    NanoAssert(rem<256);                                \
-                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | (rem&0xFF) ); \
-                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | (0xFF) ); \
-                } else {                                                \
-                    underrunProtect(4+LD32_size);                       \
-                    *(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_r)<<16) | ((_r)<<12) | (IP)); \
-                    LD32_nochk(IP, _imm);                          \
-                }                                                       \
-            } else {                                                    \
-                if ((_imm)>=-510) {                                     \
-                    underrunProtect(8);                                 \
-                    int rem = -(_imm) - 255;                            \
-                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | ((rem)&0xFF) ); \
-                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | (0xFF) ); \
-                } else {                                                \
-                    underrunProtect(4+LD32_size);                       \
-                    *(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_r)<<12) | (IP)); \
-                    LD32_nochk(IP, -(_imm)); \
-                }                                                       \
-            }                                                           \
-        }                                                               \
-        asm_output("sub %s,%d",gpn(_r),(_imm));                        \
-    } while (0)
+// _d = _l + _imm; update flags if _stat == 1
+#define ADDis(_d,_l,_imm,_stat) asm_add_imm(_d, _l, _imm, _stat)
+
+// _d = _l - _r; update flags
+#define SUB(_d,_l,_r) ALUr(AL, sub, 1, _d, _l, _r)
+
+// _d = _l - _imm; update flags
+#define SUBi(_d,_l,_imm)  asm_sub_imm(_d, _l, _imm, 1)
 
 // _l = _l * _r
 #define MUL(_l,_r)  do {                                                \
         underrunProtect(4);                                             \
         *(--_nIns) = (NIns)( COND_AL | (_l)<<16 | (_l)<<8 | 0x90 | (_r) ); \
         asm_output("mul %s,%s",gpn(_l),gpn(_r)); } while(0)
 
+// _d = 0 - _r; update flags
+#define RSBS(_d,_r) ALUi(AL, rsb, 1, _d, _r, 0)
 
-// RSBS
-// _r = -_r
-#define NEG(_r) do {                                                    \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL |  (0x27<<20) | ((_r)<<16) | ((_r)<<12) ); \
-        asm_output("neg %s",gpn(_r)); } while(0)
-
-// MVNS
-// _r = !_r
-#define NOT(_r) do {                                                    \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL |  (0x1F<<20) | ((_r)<<12) |  (_r) ); \
-        asm_output("mvn %s",gpn(_r)); } while(0)
+// _d = ~_r (one's complement)
+#define MVN(_d,_r) ALUr(AL, mvn, 0, _d, 0, _r)
 
-// MOVS _r, _r, LSR <_s>
-// _r = _r >> _s
-#define SHR(_r,_s) do {                                                 \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_s)<<8) | (LSR_reg<<4) | (_r) ); \
-        asm_output("shr %s,%s",gpn(_r),gpn(_s)); } while(0)
+// MOVS _d, _r, LSR <_s>
+// _d = _r >> _s
+#define SHR(_d,_r,_s) ALUr_shr(AL, mov, 1, _d, 0, _r, LSR_reg, _s)
 
-// MOVS _r, _r, LSR #_imm
-// _r = _r >> _imm
-#define SHRi(_r,_imm) do {                                              \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_imm)<<7) | (LSR_imm<<4) | (_r) ); \
-        asm_output("shr %s,%d",gpn(_r),_imm); } while(0)
+// MOVS _d, _r, LSR #_imm
+// _d = _r >> _imm
+#define SHRi(_d,_r,_imm)  ALUr_shi(AL, mov, 1, _d, 0, _r, LSR_imm, _imm)
 
-// MOVS _r, _r, ASR <_s>
-// _r = _r >> _s
-#define SAR(_r,_s) do {                                                 \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_s)<<8) | (ASR_reg<<4) | (_r) ); \
-        asm_output("asr %s,%s",gpn(_r),gpn(_s)); } while(0)
-
+// MOVS _d, _r, ASR <_s>
+// _d = _r >> _s
+#define SAR(_d,_r,_s) ALUr_shr(AL, mov, 1, _d, 0, _r, ASR_reg, _s)
 
-// MOVS _r, _r, ASR #_imm
-// _r = _r >> _imm
-#define SARi(_r,_imm) do {                                              \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_imm)<<7) | (ASR_imm<<4) | (_r) ); \
-        asm_output("asr %s,%d",gpn(_r),_imm); } while(0)
+// MOVS _d, _r, ASR #_imm
+// _d = _r >> _imm
+#define SARi(_d,_r,_imm) ALUr_shi(AL, mov, 1, _d, 0, _r, ASR_imm, _imm)
 
-// MOVS _r, _r, LSL <_s>
-// _r = _r << _s
-#define SHL(_r,_s) do {                                                 \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_s)<<8) | (LSL_reg<<4) | (_r) ); \
-        asm_output("lsl %s,%s",gpn(_r),gpn(_s)); } while(0)
+// MOVS _d, _r, LSL <_s>
+// _d = _r << _s
+#define SHL(_d, _r, _s) ALUr_shr(AL, mov, 1, _d, 0, _r, LSL_reg, _s)
 
-// MOVS _r, _r, LSL #_imm
-// _r = _r << _imm
-#define SHLi(_r,_imm) do {                                              \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_imm)<<7) | (LSL_imm<<4) | (_r) ); \
-        asm_output("lsl %s,%d",gpn(_r),(_imm)); } while(0)
+// MOVS _d, _r, LSL #_imm
+// _d = _r << _imm
+#define SHLi(_d, _r, _imm) ALUr_shi(AL, mov, 1, _d, 0, _r, LSL_imm, _imm)
                     
 // TST
-#define TEST(_d,_s) do {                                                \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | (0x11<<20) | ((_d)<<16) | (_s) ); \
-        asm_output("test %s,%s",gpn(_d),gpn(_s)); } while(0)
-
-#define TSTi(_d,_imm) do {                                              \
-        underrunProtect(4);                                             \
-        NanoAssert(((_imm) & 0xff) == (_imm));                          \
-        *(--_nIns) = (NIns)( COND_AL | OP_IMM | (0x11<<20) | ((_d) << 16) | (0xF<<12) | ((_imm) & 0xff) ); \
-        asm_output("tst %s,#0x%x", gpn(_d), _imm);                     \
-    } while (0);
+#define TEST(_l,_r)     ALUr(AL, tst, 1, 0, _l, _r)
+#define TSTi(_d,_imm)   ALUi(AL, tst, 1, 0, _d, _imm)
 
 // CMP
-#define CMP(_l,_r)  do {                                                \
-        underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | (0x015<<20) | ((_l)<<16) | (_r) ); \
-        asm_output("cmp %s,%s",gpn(_l),gpn(_r)); } while(0)
-
-// CMP (or CMN)
-#define CMPi(_r,_imm)  do {                                             \
-        if (_imm<0) {                                                   \
-            if ((_imm)>-256) {                                          \
-                underrunProtect(4);                                     \
-                *(--_nIns) = (NIns)( COND_AL | (0x37<<20) | ((_r)<<16) | (-(_imm)) ); \
-            } else {                                                      \
-                underrunProtect(4+LD32_size);                           \
-                *(--_nIns) = (NIns)( COND_AL | (0x17<<20) | ((_r)<<16) | (IP) ); \
-                LD32_nochk(IP, (_imm));                            \
-            }                                                           \
-        } else {                                                        \
-            if ((_imm)<256) {                                           \
-                underrunProtect(4);                                     \
-                *(--_nIns) = (NIns)( COND_AL | (0x035<<20) | ((_r)<<16) | ((_imm)&0xFF) ); \
-            } else {                                                    \
-                underrunProtect(4+LD32_size);                           \
-                *(--_nIns) = (NIns)( COND_AL | (0x015<<20) | ((_r)<<16) | (IP) ); \
-                LD32_nochk(IP, (_imm));                            \
-            }                                                           \
-        }                                                               \
-        asm_output("cmp %s,0x%x",gpn(_r),(_imm));                      \
-    } while(0)
+#define CMP(_l,_r)  ALUr(AL, cmp, 1, 0, _l, _r)
 
 // MOV
 #define MOV(_d,_s)  do {                                                 \
         underrunProtect(4);                                             \
         *(--_nIns) = (NIns)( COND_AL | (0xD<<21) | ((_d)<<12) | (_s) ); \
         asm_output("mov %s,%s",gpn(_d),gpn(_s)); } while (0)