Indentation sanity fixes; no code changes.
author Vladimir Vukicevic <vladimir@pobox.com>
date Tue, 02 Sep 2008 22:29:23 -0700
changeset 18775 a94e3a1d900916db774e7e16c86a65e10468b0cf
parent 18774 659da061e40a9781901171a4635b5cf5f5eae9b3
child 18776 b6d60356a49cc571ebfab99c55aca0006b7c939d
push id 1711
push user brendan@mozilla.com
push date Thu, 04 Sep 2008 08:26:45 +0000
treeherder autoland@1431bbddb5de
milestone 1.9.1b1pre
js/src/nanojit/NativeARM.cpp
js/src/nanojit/NativeARM.h
--- a/js/src/nanojit/NativeARM.cpp
+++ b/js/src/nanojit/NativeARM.cpp
@@ -1,9 +1,9 @@
-/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 4 -*- */
+/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 40 -*- */
 /* ***** BEGIN LICENSE BLOCK *****
  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  *
  * The contents of this file are subject to the Mozilla Public License Version
  * 1.1 (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  * http://www.mozilla.org/MPL/
  *
@@ -16,16 +16,17 @@
  *
  * The Initial Developer of the Original Code is
  * Adobe System Incorporated.
  * Portions created by the Initial Developer are Copyright (C) 2004-2007
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  *   Adobe AS3 Team
+ *   Vladimir Vukicevic <vladimir@pobox.com>
  *
  * Alternatively, the contents of this file may be used under the terms of
  * either the GNU General Public License Version 2 or later (the "GPL"), or
  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  * in which case the provisions of the GPL or the LGPL are applicable instead
  * of those above. If you wish to allow use of your version of this file only
  * under the terms of either the GPL or the LGPL, and not to allow others to
  * use your version of this file under the terms of the MPL, indicate your
@@ -47,527 +48,559 @@
 #endif
 
 #if defined(AVMPLUS_LINUX)
 #include <asm/unistd.h>
 #endif
 
 namespace nanojit
 {
-	#ifdef FEATURE_NANOJIT
+#ifdef FEATURE_NANOJIT
 
 #ifdef NJ_VERBOSE
-	const char* regNames[] = {"r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","r11","IP","SP","LR","PC"};
-
-#endif
-    const Register Assembler::argRegs[] = { R0, R1, R2, R3 };
-    const Register Assembler::retRegs[] = { R0, R1 };
-
-	void Assembler::nInit(AvmCore*)
-	{
-		// all ARMs have conditional move
-		has_cmov = true;
-	}
-
-	NIns* Assembler::genPrologue(RegisterMask needSaving)
-	{
-		/**
-		 * Prologue
-		 */
-
-		// NJ_RESV_OFFSET is space at the top of the stack for us
-		// to use for parameter passing (8 bytes at the moment)
-		uint32_t stackNeeded = 4 * _activation.highwatermark + NJ_STACK_OFFSET;
-		uint32_t savingCount = 0;
-
-		uint32_t savingMask = 0;
-		savingCount = 9; //R4-R10,R11,LR
-		savingMask = SavedRegs | rmask(FRAME_PTR);
-		(void)needSaving;
-
-		// so for alignment purposes we've pushed return addr, fp, and savingCount registers
-		uint32_t stackPushed = 4 * (2+savingCount);
-		uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
-		int32_t amt = aligned - stackPushed;
-
-		// Make room on stack for what we are doing
-		if (amt)
-		{ 
-			SUBi(SP, amt); 
-		}
-
-		verbose_only( verbose_outputf("         %p:",_nIns); )
-        verbose_only( verbose_output("         patch entry"); )
-        NIns *patchEntry = _nIns;
-
-		MR(FRAME_PTR, SP);
-		PUSH_mask(savingMask|rmask(LR));
-		return patchEntry;
-	}
-
-	void Assembler::nFragExit(LInsp guard)
-	{
-		SideExit* exit = guard->exit();
-		Fragment *frag = exit->target;
-		GuardRecord *lr;
-		if (frag && frag->fragEntry)
-		{
-			JMP(frag->fragEntry);
-			lr = 0;
-		}
-		else
-		{
-			// target doesn't exist yet.  emit jump to epilog, and set up to patch later.
-			lr = placeGuardRecord(guard);
-
-			// we need to know that there's an extra immediate value available
-			// for us; always force a far jump here.
-			BL_far(_epilogue);
-
-			lr->jmp = _nIns;
-		}
-
-		// pop the stack frame first
-		MR(SP, FRAME_PTR);
-
-        #ifdef NJ_VERBOSE
-        if (_frago->core()->config.show_stats) {
-			// load R1 with Fragment *fromFrag, target fragment
-			// will make use of this when calling fragenter().
-            int fromfrag = int((Fragment*)_thisfrag);
-            LDi(argRegs[1], fromfrag);
-        }
-        #endif
-
-		// return value is GuardRecord*
-        LDi(R2, int(lr));
-	}
-
-	NIns* Assembler::genEpilogue(RegisterMask restore)
-	{
-		BX(LR); // return
-		MR(R0,R2); // return LinkRecord*
-		RegisterMask savingMask = restore | rmask(FRAME_PTR) | rmask(LR);
-		POP_mask(savingMask); // regs
-		return _nIns;
-	}
-	
-	void Assembler::asm_call(LInsp ins)
-	{
-        const CallInfo* call = callInfoFor(ins->fid());
-		uint32_t atypes = call->_argtypes;
-		uint32_t roffset = 0;
-
-		// we need to detect if we have arg0 as LO followed by arg1 as F;
-		// in that case, we need to skip using r1 -- the F needs to be
-		// loaded in r2/r3, at least according to the ARM EABI and gcc 4.2's
-		// generated code.
-		bool arg0IsInt32FollowedByFloat = false;
-		while ((atypes & 3) != ARGSIZE_NONE) {
-			if (((atypes >> 4) & 3) == ARGSIZE_LO &&
-				((atypes >> 2) & 3) == ARGSIZE_F &&
-				((atypes >> 6) & 3) == ARGSIZE_NONE)
-			{
-				arg0IsInt32FollowedByFloat = true;
-				break;
-			}
-			atypes >>= 2;
-		}
-
-		CALL(call);
-        ArgSize sizes[10];
-        uint32_t argc = call->get_sizes(sizes);
-		for(uint32_t i=0; i < argc; i++)
-		{
-            uint32_t j = argc - i - 1;
-            ArgSize sz = sizes[j];
-            NanoAssert(sz == ARGSIZE_LO || sz == ARGSIZE_Q);
-    		// pre-assign registers R0-R3 for arguments (if they fit)
-            Register r = (i+roffset) < 4 ? argRegs[i+roffset] : UnknownReg;
-            asm_arg(sz, ins->arg(j), r);
-
-			if (i == 0 && arg0IsInt32FollowedByFloat)
-				roffset = 1;
-		}
-	}
-	
-	void Assembler::nMarkExecute(Page* page, int32_t count, bool enable)
-	{
-	#ifdef UNDER_CE
-		DWORD dwOld;
-		VirtualProtect(page, NJ_PAGE_SIZE, PAGE_EXECUTE_READWRITE, &dwOld);
-	#endif
-	#ifdef AVMPLUS_PORTING_API
-		NanoJIT_PortAPI_MarkExecutable(page, (void*)((int32_t)page+count));
-	#endif
-		(void)page;
-		(void)count;
-		(void)enable;
-	}
-			
-	Register Assembler::nRegisterAllocFromSet(int set)
-	{
-		// Note: The clz instruction only works on armv5 and up.
-#ifndef UNDER_CE
-#ifdef __ARMCC__
-		register int i;
-		__asm { clz i,set }
-		Register r = Register(31-i);
-		_allocator.free &= ~rmask(r);
-		return r;
-#else
-		// need to implement faster way
-		int i=0;
-		while (!(set & rmask((Register)i)))
-			i ++;
-		_allocator.free &= ~rmask((Register)i);
-		return (Register) i;
-#endif
-#else
-		Register r;
-		r = (Register)_CountLeadingZeros(set);
-		r = (Register)(31-r);
-		_allocator.free &= ~rmask(r);
-		return r;
+const char* regNames[] = {"r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","r11","IP","SP","LR","PC"};
 #endif
 
-	}
+const Register Assembler::argRegs[] = { R0, R1, R2, R3 };
+const Register Assembler::retRegs[] = { R0, R1 };
 
-	void Assembler::nRegisterResetAll(RegAlloc& a)
-	{
-		// add scratch registers to our free list for the allocator
-		a.clear();
-		a.used = 0;
-		a.free = rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) | rmask(R5);
-		debug_only(a.managed = a.free);
-	}
+void
+Assembler::nInit(AvmCore*)
+{
+    // all ARMs have conditional move
+    has_cmov = true;
+}
 
-	void Assembler::nPatchBranch(NIns* branch, NIns* target)
-	{
-		// Patch the jump in a loop
+NIns*
+Assembler::genPrologue(RegisterMask needSaving)
+{
+    /**
+     * Prologue
+     */
 
-		// This is ALWAYS going to be a long branch (using the BL instruction)
-		// Which is really 2 instructions, so we need to modify both
-		// XXX -- this is B, not BL, at least on non-Thumb..
+    // NJ_RESV_OFFSET is space at the top of the stack for us
+    // to use for parameter passing (8 bytes at the moment)
+    uint32_t stackNeeded = 4 * _activation.highwatermark + NJ_STACK_OFFSET;
+    uint32_t savingCount = 0;
 
-		// branch+2 because PC is always 2 instructions ahead on ARM/Thumb
-		int32_t offset = int(target) - int(branch+2);
+    uint32_t savingMask = 0;
+    savingCount = 9; //R4-R10,R11,LR
+    savingMask = SavedRegs | rmask(FRAME_PTR);
+    (void)needSaving;
 
-		//printf("---patching branch at 0x%08x to location 0x%08x (%d-0x%08x)\n", branch, target, offset, offset);
+    // so for alignment purposes we've pushed return addr, fp, and savingCount registers
+    uint32_t stackPushed = 4 * (2+savingCount);
+    uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
+    int32_t amt = aligned - stackPushed;
 
-		// We have 2 words to work with here -- if offset is in range of a 24-bit
-		// relative jump, emit that; otherwise, we do a pc-relative load into pc.
-		if (-(1<<24) <= offset & offset < (1<<24)) {
-			// ARM goodness, using unconditional B
-			*branch = (NIns)( COND_AL | (0xA<<24) | ((offset >>2) & 0xFFFFFF) );
-		} else {
-			// LDR pc,[pc]
-			*branch++ = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | ( 0x004 ) );
-			*branch = (NIns)target;
-		}
-	}
+    // Make room on stack for what we are doing
+    if (amt)
+        SUBi(SP, amt); 
 
-	RegisterMask Assembler::hint(LIns* i, RegisterMask allow /* = ~0 */)
-	{
-		uint32_t op = i->opcode();
-		int prefer = ~0;
+    verbose_only( verbose_outputf("         %p:",_nIns); )
+    verbose_only( verbose_output("         patch entry"); )
+    NIns *patchEntry = _nIns;
+
+    MR(FRAME_PTR, SP);
+    PUSH_mask(savingMask|rmask(LR));
+    return patchEntry;
+}
 
-		if (op==LIR_call || op==LIR_fcall)
-			prefer = rmask(R0);
-		else if (op == LIR_callh)
-			prefer = rmask(R1);
-		else if (op == LIR_param)
-			prefer = rmask(imm2register(i->imm8()));
-
-		if (_allocator.free & allow & prefer)
-			allow &= prefer;
-		return allow;
-	}
+void
+Assembler::nFragExit(LInsp guard)
+{
+    SideExit* exit = guard->exit();
+    Fragment *frag = exit->target;
+    GuardRecord *lr;
 
-    void Assembler::asm_qjoin(LIns *ins)
-    {
-		int d = findMemFor(ins);
-		AvmAssert(d);
-		LIns* lo = ins->oprnd1();
-		LIns* hi = ins->oprnd2();
-							
-		Register r = findRegFor(hi, GpRegs);
-		ST(FP, d+4, r);
+    if (frag && frag->fragEntry) {
+        JMP(frag->fragEntry);
+        lr = 0;
+    } else {
+        // target doesn't exist yet.  emit jump to epilog, and set up to patch later.
+        lr = placeGuardRecord(guard);
 
-        // okay if r gets recycled.
-		r = findRegFor(lo, GpRegs);
-		ST(FP, d, r);
-        freeRsrcOf(ins, false);	// if we had a reg in use, emit a ST to flush it to mem
+        // we need to know that there's an extra immediate value available
+        // for us; always force a far jump here.
+        BL_far(_epilogue);
+
+        lr->jmp = _nIns;
     }
 
-    void Assembler::asm_store32(LIns *value, int dr, LIns *base)
-    {
-	    // make sure value and base are in registers
-	    Reservation *rA, *rB;
-	    findRegFor2(GpRegs, value, rA, base, rB);
-	    Register ra = rA->reg;
-	    Register rb = rB->reg;
-	    ST(rb, dr, ra);
+    // pop the stack frame first
+    MR(SP, FRAME_PTR);
+
+#ifdef NJ_VERBOSE
+    if (_frago->core()->config.show_stats) {
+        // load R1 with Fragment *fromFrag, target fragment
+        // will make use of this when calling fragenter().
+        int fromfrag = int((Fragment*)_thisfrag);
+        LDi(argRegs[1], fromfrag);
+    }
+#endif
+
+    // return value is GuardRecord*
+    LDi(R2, int(lr));
+}
+
+NIns*
+Assembler::genEpilogue(RegisterMask restore)
+{
+    BX(LR); // return
+    MR(R0,R2); // return LinkRecord*
+    RegisterMask savingMask = restore | rmask(FRAME_PTR) | rmask(LR);
+    POP_mask(savingMask); // regs
+    return _nIns;
+}
+    
+void
+Assembler::asm_call(LInsp ins)
+{
+    const CallInfo* call = callInfoFor(ins->fid());
+    uint32_t atypes = call->_argtypes;
+    uint32_t roffset = 0;
+
+    // we need to detect if we have arg0 as LO followed by arg1 as F;
+    // in that case, we need to skip using r1 -- the F needs to be
+    // loaded in r2/r3, at least according to the ARM EABI and gcc 4.2's
+    // generated code.
+    bool arg0IsInt32FollowedByFloat = false;
+    while ((atypes & 3) != ARGSIZE_NONE) {
+        if (((atypes >> 4) & 3) == ARGSIZE_LO &&
+            ((atypes >> 2) & 3) == ARGSIZE_F &&
+            ((atypes >> 6) & 3) == ARGSIZE_NONE)
+        {
+            arg0IsInt32FollowedByFloat = true;
+            break;
+        }
+        atypes >>= 2;
     }
 
-	void Assembler::asm_restore(LInsp i, Reservation *resv, Register r)
-	{
-		(void)resv;
-        int d = findMemFor(i);
-	    LD(r, d, FP);
-		verbose_only(if (_verbose) {
-			outputf("        restore %s",_thisfrag->lirbuf->names->formatRef(i));
-		})
-	}
+    CALL(call);
+
+    ArgSize sizes[10];
+    uint32_t argc = call->get_sizes(sizes);
+    for(uint32_t i=0; i < argc; i++) {
+        uint32_t j = argc - i - 1;
+        ArgSize sz = sizes[j];
+        NanoAssert(sz == ARGSIZE_LO || sz == ARGSIZE_Q);
+        // pre-assign registers R0-R3 for arguments (if they fit)
+        Register r = (i+roffset) < 4 ? argRegs[i+roffset] : UnknownReg;
+        asm_arg(sz, ins->arg(j), r);
+
+        if (i == 0 && arg0IsInt32FollowedByFloat)
+            roffset = 1;
+    }
+}
+    
+void
+Assembler::nMarkExecute(Page* page, int32_t count, bool enable)
+{
+#ifdef UNDER_CE
+    DWORD dwOld;
+    VirtualProtect(page, NJ_PAGE_SIZE, PAGE_EXECUTE_READWRITE, &dwOld);
+#endif
+#ifdef AVMPLUS_PORTING_API
+    NanoJIT_PortAPI_MarkExecutable(page, (void*)((int32_t)page+count));
+#endif
+    (void)page;
+    (void)count;
+    (void)enable;
+}
+            
+Register
+Assembler::nRegisterAllocFromSet(int set)
+{
+    // Note: The clz instruction only works on armv5 and up.
+#if defined(UNDER_CE)
+    Register r;
+    r = (Register)_CountLeadingZeros(set);
+    r = (Register)(31-r);
+    _allocator.free &= ~rmask(r);
+    return r;
+#elif defined(__ARMCC__)
+    register int i;
+    __asm { clz i,set }
+    Register r = Register(31-i);
+    _allocator.free &= ~rmask(r);
+    return r;
+#else
+    // need to implement faster way
+    int i=0;
+    while (!(set & rmask((Register)i)))
+        i ++;
+    _allocator.free &= ~rmask((Register)i);
+    return (Register) i;
+#endif
+}
 
-	void Assembler::asm_spill(LInsp i, Reservation *resv, bool pop)
-	{
-    (void)i;
-		(void)pop;
-		if (resv->arIndex)
-		{
-			int d = disp(resv);
-			// save to spill location
-			Register rr = resv->reg;
-			ST(FP, d, rr);
-			verbose_only(if (_verbose){
-				outputf("        spill %s",_thisfrag->lirbuf->names->formatRef(i));
-			})
-		}
-	}
+void
+Assembler::nRegisterResetAll(RegAlloc& a)
+{
+    // add scratch registers to our free list for the allocator
+    a.clear();
+    a.used = 0;
+    a.free = rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) | rmask(R5);
+    debug_only(a.managed = a.free);
+}
+
+void
+Assembler::nPatchBranch(NIns* branch, NIns* target)
+{
+    // Patch the jump in a loop
+
+    // This is ALWAYS going to be a long branch (using the BL instruction)
+    // Which is really 2 instructions, so we need to modify both
+    // XXX -- this is B, not BL, at least on non-Thumb..
+
+    // branch+2 because PC is always 2 instructions ahead on ARM/Thumb
+    int32_t offset = int(target) - int(branch+2);
+
+    //printf("---patching branch at 0x%08x to location 0x%08x (%d-0x%08x)\n", branch, target, offset, offset);
 
-	void Assembler::asm_load64(LInsp ins)
-	{
-		LIns* base = ins->oprnd1();
-		int db = ins->oprnd2()->constval();
-		Reservation *resv = getresv(ins);
-		int dr = disp(resv);
-		NanoAssert(resv->reg == UnknownReg && dr != 0);
+    // We have 2 words to work with here -- if offset is in range of a 24-bit
+    // relative jump, emit that; otherwise, we do a pc-relative load into pc.
+    if (-(1<<24) <= offset & offset < (1<<24)) {
+        // ARM goodness, using unconditional B
+        *branch = (NIns)( COND_AL | (0xA<<24) | ((offset >>2) & 0xFFFFFF) );
+    } else {
+        // LDR pc,[pc]
+        *branch++ = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | ( 0x004 ) );
+        *branch = (NIns)target;
+    }
+}
+
+RegisterMask
+Assembler::hint(LIns* i, RegisterMask allow /* = ~0 */)
+{
+    uint32_t op = i->opcode();
+    int prefer = ~0;
 
-		Register rb = findRegFor(base, GpRegs);
-		resv->reg = UnknownReg;
-		asm_mmq(FP, dr, rb, db);
-		freeRsrcOf(ins, false);
-	}
+    if (op==LIR_call || op==LIR_fcall)
+        prefer = rmask(R0);
+    else if (op == LIR_callh)
+        prefer = rmask(R1);
+    else if (op == LIR_param)
+        prefer = rmask(imm2register(i->imm8()));
+
+    if (_allocator.free & allow & prefer)
+        allow &= prefer;
+    return allow;
+}
+
+void
+Assembler::asm_qjoin(LIns *ins)
+{
+    int d = findMemFor(ins);
+    AvmAssert(d);
+    LIns* lo = ins->oprnd1();
+    LIns* hi = ins->oprnd2();
+                            
+    Register r = findRegFor(hi, GpRegs);
+    ST(FP, d+4, r);
 
-	void Assembler::asm_store64(LInsp value, int dr, LInsp base)
-	{
-		int da = findMemFor(value);
-	    Register rb = findRegFor(base, GpRegs);
-	    asm_mmq(rb, dr, FP, da);
-	}
+    // okay if r gets recycled.
+    r = findRegFor(lo, GpRegs);
+    ST(FP, d, r);
+    freeRsrcOf(ins, false); // if we had a reg in use, emit a ST to flush it to mem
+}
+
+void
+Assembler::asm_store32(LIns *value, int dr, LIns *base)
+{
+    // make sure value and base are in registers
+    Reservation *rA, *rB;
+    findRegFor2(GpRegs, value, rA, base, rB);
+    Register ra = rA->reg;
+    Register rb = rB->reg;
+    ST(rb, dr, ra);
+}
+
+void
+Assembler::asm_restore(LInsp i, Reservation *resv, Register r)
+{
+    (void)resv;
+    int d = findMemFor(i);
+    LD(r, d, FP);
+
+    verbose_only(
+        if (_verbose)
+            outputf("        restore %s",_thisfrag->lirbuf->names->formatRef(i));
+    )
+}
 
-	void Assembler::asm_quad(LInsp ins)
-	{
-		Reservation *rR = getresv(ins);
-		int d = disp(rR);
-		freeRsrcOf(ins, false);
-		if (d)
-		{
-			const int32_t* p = (const int32_t*) (ins-2);
-			STi(FP,d+4,p[1]);
-			STi(FP,d,p[0]);
-		}
-	}
+void
+Assembler::asm_spill(LInsp i, Reservation *resv, bool pop)
+{
+    (void)i;
+    (void)pop;
+
+    if (resv->arIndex) {
+        int d = disp(resv);
+        // save to spill location
+        Register rr = resv->reg;
+        ST(FP, d, rr);
+
+        verbose_only(if (_verbose){
+                outputf("        spill %s",_thisfrag->lirbuf->names->formatRef(i));
+            }
+        )
+    }
+}
+
+void
+Assembler::asm_load64(LInsp ins)
+{
+    LIns* base = ins->oprnd1();
+    int db = ins->oprnd2()->constval();
+    Reservation *resv = getresv(ins);
+    int dr = disp(resv);
+    NanoAssert(resv->reg == UnknownReg && dr != 0);
+
+    Register rb = findRegFor(base, GpRegs);
+    resv->reg = UnknownReg;
+    asm_mmq(FP, dr, rb, db);
+    freeRsrcOf(ins, false);
+}
 
-	bool Assembler::asm_qlo(LInsp ins, LInsp q)
-	{
-		(void)ins; (void)q;
-		return false;
-	}
+void
+Assembler::asm_store64(LInsp value, int dr, LInsp base)
+{
+    int da = findMemFor(value);
+    Register rb = findRegFor(base, GpRegs);
+    asm_mmq(rb, dr, FP, da);
+}
+
+void
+Assembler::asm_quad(LInsp ins)
+{
+    Reservation *rR = getresv(ins);
+    int d = disp(rR);
+    freeRsrcOf(ins, false);
+
+    if (d) {
+        const int32_t* p = (const int32_t*) (ins-2);
+        STi(FP,d+4,p[1]);
+        STi(FP,d,p[0]);
+    }
+}
+
+bool
+Assembler::asm_qlo(LInsp ins, LInsp q)
+{
+    (void)ins; (void)q;
+    return false;
+}
 
-	void Assembler::asm_nongp_copy(Register r, Register s)
-	{
-		// we will need this for VFP support
-		(void)r; (void)s;
-		NanoAssert(false);
-	}
+void
+Assembler::asm_nongp_copy(Register r, Register s)
+{
+    // we will need this for VFP support
+    (void)r; (void)s;
+    NanoAssert(false);
+}
+
+Register
+Assembler::asm_binop_rhs_reg(LInsp ins)
+{
+    return UnknownReg;
+}
 
-	Register Assembler::asm_binop_rhs_reg(LInsp ins)
-	{
-		return UnknownReg;
-	}
+/**
+ * copy 64 bits: (rd+dd) <- (rs+ds)
+ */
+void
+Assembler::asm_mmq(Register rd, int dd, Register rs, int ds)
+{
+    // value is either a 64bit struct or maybe a float
+    // that isn't live in an FPU reg.  Either way, don't
+    // put it in an FPU reg just to load & store it.
+    // get a scratch reg
+    Register t = registerAlloc(GpRegs & ~(rmask(rd)|rmask(rs)));
+    _allocator.addFree(t);
+    ST(rd, dd+4, t);
+    LD(t, ds+4, rs);
+    ST(rd, dd, t);
+    LD(t, ds, rs);
+}
 
-    /**
-     * copy 64 bits: (rd+dd) <- (rs+ds)
-     */
-    void Assembler::asm_mmq(Register rd, int dd, Register rs, int ds)
+void
+Assembler::asm_pusharg(LInsp p)
+{
+    // arg goes on stack
+    Reservation* rA = getresv(p);
+    if (rA == 0)
     {
-        // value is either a 64bit struct or maybe a float
-        // that isn't live in an FPU reg.  Either way, don't
-        // put it in an FPU reg just to load & store it.
-		// get a scratch reg
-		Register t = registerAlloc(GpRegs & ~(rmask(rd)|rmask(rs)));
-		_allocator.addFree(t);
-		ST(rd, dd+4, t);
-		LD(t, ds+4, rs);
-		ST(rd, dd, t);
-		LD(t, ds, rs);
+        Register ra = findRegFor(p, GpRegs);
+        ST(SP,0,ra);
     }
+    else if (rA->reg == UnknownReg)
+    {
+        ST(SP,0,Scratch);
+        LD(Scratch,disp(rA),FP);
+    }
+    else
+    {
+        ST(SP,0,rA->reg);
+    }
+}
 
-	void Assembler::asm_pusharg(LInsp p)
-	{
-		// arg goes on stack
-		Reservation* rA = getresv(p);
-		if (rA == 0)
-		{
-			Register ra = findRegFor(p, GpRegs);
-			ST(SP,0,ra);
-		}
-		else if (rA->reg == UnknownReg)
-		{
-			ST(SP,0,Scratch);
-			LD(Scratch,disp(rA),FP);
-		}
-		else
-		{
-			ST(SP,0,rA->reg);
-		}
-	}
+void
+Assembler::nativePageReset()
+{
+    _nSlot = 0;
+    _nExitSlot = 0;
+}
 
-	void Assembler::nativePageReset()
-	{
-			_nSlot = 0;
-			_nExitSlot = 0;
-	}
+void
+Assembler::nativePageSetup()
+{
+    if (!_nIns)      _nIns     = pageAlloc();
+    if (!_nExitIns)  _nExitIns = pageAlloc(true);
+    //fprintf(stderr, "assemble onto %x exits into %x\n", (int)_nIns, (int)_nExitIns);
+    
+    if (!_nSlot)
+    {
+        // This needs to be done or the samepage macro gets confused; pageAlloc
+        // gives us a pointer to just past the end of the page.
+        _nIns--;
+        _nExitIns--;
 
-	void Assembler::nativePageSetup()
-	{
-		if (!_nIns)		 _nIns	   = pageAlloc();
-		if (!_nExitIns)  _nExitIns = pageAlloc(true);
-		//fprintf(stderr, "assemble onto %x exits into %x\n", (int)_nIns, (int)_nExitIns);
-	
-		if (!_nSlot)
-		{
-			// This needs to be done or the samepage macro gets confused; pageAlloc
-			// gives us a pointer to just past the end of the page.
-			_nIns--;
-			_nExitIns--;
+        // constpool starts at top of page and goes down,
+        // code starts at bottom of page and moves up
+        _nSlot = pageDataStart(_nIns); //(int*)(&((Page*)pageTop(_nIns))->lir[0]);
+    }
+}
 
-			// constpool starts at top of page and goes down,
-			// code starts at bottom of page and moves up
-			_nSlot = pageDataStart(_nIns); //(int*)(&((Page*)pageTop(_nIns))->lir[0]);
-		}
-	}
-
-	void Assembler::flushCache(NIns* n1, NIns* n2) {
+void
+Assembler::flushCache(NIns* n1, NIns* n2) {
 #if defined(UNDER_CE)
- 		// we changed the code, so we need to do this (sadly)
-		FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
+    // we changed the code, so we need to do this (sadly)
+    FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
 #elif defined(AVMPLUS_LINUX)
-		// Just need to clear this one page (not even the whole page really)
-		//Page *page = (Page*)pageTop(_nIns);
-		register unsigned long _beg __asm("a1") = (unsigned long)(n1);
-		register unsigned long _end __asm("a2") = (unsigned long)(n2);
-		register unsigned long _flg __asm("a3") = 0;
-		register unsigned long _swi __asm("r7") = 0xF0002;
-		__asm __volatile ("swi 0 	@ sys_cacheflush" : "=r" (_beg) : "0" (_beg), "r" (_end), "r" (_flg), "r" (_swi));
+    // Just need to clear this one page (not even the whole page really)
+    //Page *page = (Page*)pageTop(_nIns);
+    register unsigned long _beg __asm("a1") = (unsigned long)(n1);
+    register unsigned long _end __asm("a2") = (unsigned long)(n2);
+    register unsigned long _flg __asm("a3") = 0;
+    register unsigned long _swi __asm("r7") = 0xF0002;
+    __asm __volatile ("swi 0    @ sys_cacheflush" : "=r" (_beg) : "0" (_beg), "r" (_end), "r" (_flg), "r" (_swi));
 #endif
-	}
+}
 
-	NIns* Assembler::asm_adjustBranch(NIns* at, NIns* target)
-	{
-		// This always got emitted as a BL_far sequence; at points
-		// to the first of 4 instructions.  Ensure that we're where
-		// we think we were..
-		NanoAssert(at[1] == (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) ));
-		NanoAssert(at[2] == (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) ));
+NIns*
+Assembler::asm_adjustBranch(NIns* at, NIns* target)
+{
+    // This always got emitted as a BL_far sequence; at points
+    // to the first of 4 instructions.  Ensure that we're where
+    // we think we were..
+    NanoAssert(at[1] == (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) ));
+    NanoAssert(at[2] == (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) ));
 
-		NIns* was = (NIns*) at[3];
+    NIns* was = (NIns*) at[3];
 
-		at[3] = (NIns)target;
+    at[3] = (NIns)target;
 
-		flushCache(at, at+4);
+    flushCache(at, at+4);
 
 #ifdef AVMPLUS_PORTING_API
-		NanoJIT_PortAPI_FlushInstructionCache(at, at+4);
+    NanoJIT_PortAPI_FlushInstructionCache(at, at+4);
 #endif
 
-		return was;
-	}
+    return was;
+}
 
-	void Assembler::underrunProtect(int bytes)
-	{
-		intptr_t u = bytes + sizeof(PageHeader)/sizeof(NIns) + 8;
-		if ( (samepage(_nIns,_nSlot) && (((intptr_t)_nIns-u) <= intptr_t(_nSlot+1))) ||
-			 (!samepage((intptr_t)_nIns-u,_nIns)) )
-		{
-			NIns* target = _nIns;
+void
+Assembler::underrunProtect(int bytes)
+{
+    intptr_t u = bytes + sizeof(PageHeader)/sizeof(NIns) + 8;
+    if ( (samepage(_nIns,_nSlot) && (((intptr_t)_nIns-u) <= intptr_t(_nSlot+1))) ||
+         (!samepage((intptr_t)_nIns-u,_nIns)) )
+    {
+        NIns* target = _nIns;
 
-			_nIns = pageAlloc(_inExit);
+        _nIns = pageAlloc(_inExit);
 
-			// XXX _nIns at this point points to one past the end of
-			// the page, intended to be written into using *(--_nIns).
-			// However, (guess) something seems to be storing the value
-			// of _nIns as is, and then later generating a jump to a bogus
-			// address.  So pre-decrement to ensure that it's always
-			// valid; we end up skipping using the last instruction this
-			// way.
-			_nIns--;
+        // XXX _nIns at this point points to one past the end of
+        // the page, intended to be written into using *(--_nIns).
+        // However, (guess) something seems to be storing the value
+        // of _nIns as is, and then later generating a jump to a bogus
+        // address.  So pre-decrement to ensure that it's always
+        // valid; we end up skipping using the last instruction this
+        // way.
+        _nIns--;
 
-			// Update slot, either to _nIns (if decremented above), or
-			// _nIns-1 once the above bug is fixed/found.
-			_nSlot = pageDataStart(_nIns);
+        // Update slot, either to _nIns (if decremented above), or
+        // _nIns-1 once the above bug is fixed/found.
+        _nSlot = pageDataStart(_nIns);
 
-			// If samepage() is used on _nIns and _nSlot, it'll fail, since _nIns
-			// points to one past the end of the page right now.  Assume that 
-			// JMP_nochk won't ever try to write to _nSlot, and so won't ever
-			// check samepage().  See B_cond_chk macro.
-			JMP_nochk(target);
-		} else if (!_nSlot) {
-			// make sure that there's always a slot pointer
-			_nSlot = pageDataStart(_nIns);
-		}
-	}
+        // If samepage() is used on _nIns and _nSlot, it'll fail, since _nIns
+        // points to one past the end of the page right now.  Assume that 
+        // JMP_nochk won't ever try to write to _nSlot, and so won't ever
+        // check samepage().  See B_cond_chk macro.
+        JMP_nochk(target);
+    } else if (!_nSlot) {
+        // make sure that there's always a slot pointer
+        _nSlot = pageDataStart(_nIns);
+    }
+}
 
-	void Assembler::BL_far(NIns* addr) {
-		// we have to stick an immediate into the stream and make lr
-		// point to the right spot before branching
-		underrunProtect(16);
+void
+Assembler::BL_far(NIns* addr)
+{
+    // we have to stick an immediate into the stream and make lr
+    // point to the right spot before branching
+    underrunProtect(16);
 
-		// the address
-		*(--_nIns) = (NIns)((addr));
-		// bx ip             // branch to the address we loaded earlier
-		*(--_nIns) = (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) );
-		// add lr, [pc + #4] // set lr to be past the address that we wrote
-		*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) );
-		// ldr ip, [pc + #4] // load the address into ip, reading it from [pc+4]
-		*(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (PC<<16) | (IP<<12) | (4));
-		asm_output1("bl %p (32-bit)", addr);
-	}
+    // the address
+    *(--_nIns) = (NIns)((addr));
+    // bx ip             // branch to the address we loaded earlier
+    *(--_nIns) = (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) );
+    // add lr, [pc + #4] // set lr to be past the address that we wrote
+    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) );
+    // ldr ip, [pc + #4] // load the address into ip, reading it from [pc+4]
+    *(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (PC<<16) | (IP<<12) | (4));
+    asm_output1("bl %p (32-bit)", addr);
+}
 
-	void Assembler::BL(NIns* addr) {
-		intptr_t offs = PC_OFFSET_FROM(addr,(intptr_t)_nIns-4);
-		if (JMP_S24_OFFSET_OK(offs)) {
-			// we can do this with a single BL call
-			underrunProtect(4);
-			*(--_nIns) = (NIns)( COND_AL | (0xB<<24) | (((offs)>>2) & 0xFFFFFF) );
-			asm_output1("bl %p", addr);
-		} else {
-			BL_far(addr);
-		}
-	}
+void
+Assembler::BL(NIns* addr)
+{
+    intptr_t offs = PC_OFFSET_FROM(addr,(intptr_t)_nIns-4);
+    if (JMP_S24_OFFSET_OK(offs)) {
+        // we can do this with a single BL call
+        underrunProtect(4);
+        *(--_nIns) = (NIns)( COND_AL | (0xB<<24) | (((offs)>>2) & 0xFFFFFF) );
+        asm_output1("bl %p", addr);
+    } else {
+        BL_far(addr);
+    }
+}
 
-	void Assembler::CALL(const CallInfo *ci)
-	{
-        intptr_t addr = ci->_address;
-		BL((NIns*)addr);
-		asm_output1("   (call %s)", ci->_name);
-	}
+void
+Assembler::CALL(const CallInfo *ci)
+{
+    intptr_t addr = ci->_address;
+    BL((NIns*)addr);
+    asm_output1("   (call %s)", ci->_name);
+}
 
-	void Assembler::LD32_nochk(Register r, int32_t imm)
-	{
-		// We can always reach the const pool, since it's on the same page (<4096)
-		underrunProtect(8);
+void
+Assembler::LD32_nochk(Register r, int32_t imm)
+{
+    // We can always reach the const pool, since it's on the same page (<4096)
+    underrunProtect(8);
 
-		*(++_nSlot) = (int)imm;
+    *(++_nSlot) = (int)imm;
+
+    //fprintf (stderr, "wrote slot(2) %p with %08x, jmp @ %p\n", _nSlot, (intptr_t)imm, _nIns-1);
 
-		//fprintf (stderr, "wrote slot(2) %p with %08x, jmp @ %p\n", _nSlot, (intptr_t)imm, _nIns-1);
+    int offset = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4);
 
-		int offset = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4);
+    NanoAssert(JMP_S24_OFFSET_OK(offset) && (offset < 0));
 
-		NanoAssert(JMP_S24_OFFSET_OK(offset) && (offset < 0));
+    *(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | ((r)<<12) | ((-offset) & 0xFFFFFF) );
+    asm_output2("ld %s,%d",gpn(r),imm);
+}
 
-		*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | ((r)<<12) | ((-offset) & 0xFFFFFF) );
-		asm_output2("ld %s,%d",gpn(r),imm);
-	}
-    #endif /* FEATURE_NANOJIT */
+#endif /* FEATURE_NANOJIT */
+
 }
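
A note on the EABI subtlety handled in asm_call above: under the ARM EABI, a
64-bit double is passed in an even/odd core-register pair, so a call such as
f(int32, double) takes the integer in r0 and the double in r2/r3, leaving r1
unused. That is exactly the arg0IsInt32FollowedByFloat case. A minimal
standalone sketch of the register-assignment rule follows; the ArgSize enum
and assignRegs helper here are hypothetical stand-ins for illustration, not
nanojit's types:

    #include <cstdio>

    // Hypothetical stand-ins for nanojit's argument-size flags.
    enum ArgSize { SZ_LO, SZ_F };   // SZ_LO: 32-bit integer, SZ_F: 64-bit double

    // Assign core registers r0-r3 per the ARM EABI: a 64-bit value must start
    // in an even-numbered register, so an odd register may be left unused.
    // Simplification: once an argument spills, the rest go to the stack too.
    static void assignRegs(const ArgSize* args, int n)
    {
        int r = 0;                                  // next candidate register
        for (int i = 0; i < n; i++) {
            if (args[i] == SZ_F)
                r = (r + 1) & ~1;                   // round up to an even register
            int need = (args[i] == SZ_F) ? 2 : 1;
            if (r + need > 4) {
                printf("arg%d and later -> stack\n", i);
                return;
            }
            if (need == 2)
                printf("arg%d -> r%d/r%d\n", i, r, r + 1);
            else
                printf("arg%d -> r%d\n", i, r);
            r += need;
        }
    }

    int main()
    {
        const ArgSize args[] = { SZ_LO, SZ_F };     // f(int32, double)
        assignRegs(args, 2);                        // arg0 -> r0, arg1 -> r2/r3
        return 0;
    }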
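
For reference on the patching logic in nPatchBranch above: the in-range case
rewrites the word at branch with an unconditional B whose signed 24-bit field
holds a word offset measured from PC, and on ARM the PC reads as the branch
address plus two instructions (8 bytes). A standalone sketch of that encoding;
encodeB is a hypothetical helper written for illustration, not part of this
patch:

    #include <cstdint>
    #include <cstdio>

    typedef uint32_t NIns;
    static const NIns COND_AL = 0xEu << 28;         // "always" condition field

    // Mirrors the in-range case of nPatchBranch: B <target>, with a signed
    // 24-bit word offset relative to PC = branch address + 8 bytes.
    static NIns encodeB(const NIns* branch, const NIns* target)
    {
        int32_t offset = (int32_t)((const char*)target - (const char*)(branch + 2));
        return COND_AL | (0xAu << 24) | (((uint32_t)offset >> 2) & 0xFFFFFF);
    }

    int main()
    {
        NIns code[4] = { 0 };
        code[0] = encodeB(&code[0], &code[3]);      // branch forward over two words
        printf("0x%08X\n", (unsigned)code[0]);      // prints 0xEA000001
        return 0;
    }
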
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -1,9 +1,9 @@
-/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 4 -*- */
+/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 40 -*- */
 /* ***** BEGIN LICENSE BLOCK *****
  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  *
  * The contents of this file are subject to the Mozilla Public License Version
  * 1.1 (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  * http://www.mozilla.org/MPL/
  *
@@ -16,16 +16,17 @@
  *
  * The Initial Developer of the Original Code is
  * Adobe System Incorporated.
  * Portions created by the Initial Developer are Copyright (C) 2004-2007
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  *   Adobe AS3 Team
+ *   Vladimir Vukicevic <vladimir@pobox.com>
  *
  * Alternatively, the contents of this file may be used under the terms of
  * either the GNU General Public License Version 2 or later (the "GPL"), or
  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  * in which case the provisions of the GPL or the LGPL are applicable instead
  * of those above. If you wish to allow use of your version of this file only
  * under the terms of either the GPL or the LGPL, and not to allow others to
  * use your version of this file under the terms of the MPL, indicate your
@@ -38,541 +39,552 @@
 
 
 #ifndef __nanojit_NativeArm__
 #define __nanojit_NativeArm__
 
 
 namespace nanojit
 {
-	const int NJ_LOG2_PAGE_SIZE	= 12;		// 4K
-	#define NJ_MAX_REGISTERS				11
-	#define NJ_MAX_STACK_ENTRY				256
-	#define NJ_MAX_PARAMETERS				16
-	#define NJ_ALIGN_STACK					8
-	#define NJ_STACK_OFFSET					8
+
+const int NJ_LOG2_PAGE_SIZE = 12;       // 4K
 
-	#define NJ_SOFTFLOAT
-	#define NJ_STACK_GROWTH_UP
-
-	#define NJ_CONSTANT_POOLS
-	const int NJ_MAX_CPOOL_OFFSET = 4096;
-	const int NJ_CPOOL_SIZE = 16;
-
-	typedef int NIns;
+#define NJ_MAX_REGISTERS                11
+#define NJ_MAX_STACK_ENTRY              256
+#define NJ_MAX_PARAMETERS               16
+#define NJ_ALIGN_STACK                  8
+#define NJ_STACK_OFFSET                 8
 
-	/* ARM registers */
-	typedef enum 
-	{
-		R0  = 0,
-		R1  = 1,
-		R2  = 2,
-		R3  = 3,
-		R4  = 4,
-		R5  = 5,
-		R6  = 6,
-		R7  = 7,
-		R8  = 8,
-		R9  = 9,
-		R10 = 10,
-		//FP  =11,
-		IP  = 12,
-		SP  = 13,
-		LR  = 14,
-		PC  = 15,
+#define NJ_SOFTFLOAT
+#define NJ_STACK_GROWTH_UP
+
+#define NJ_CONSTANT_POOLS
+const int NJ_MAX_CPOOL_OFFSET = 4096;
+const int NJ_CPOOL_SIZE = 16;
+
+typedef int NIns;
 
-		FP = 13,
-		
-		// Pseudo-register for floating point
-		F0  = 0,
+/* ARM registers */
+typedef enum {
+    R0  = 0,
+    R1  = 1,
+    R2  = 2,
+    R3  = 3,
+    R4  = 4,
+    R5  = 5,
+    R6  = 6,
+    R7  = 7,
+    R8  = 8,
+    R9  = 9,
+    R10 = 10,
+    //FP  =11,
+    IP  = 12,
+    SP  = 13,
+    LR  = 14,
+    PC  = 15,
 
-		// helpers
-		FRAME_PTR = 11,
-		ESP	= 13,
-		
-		FirstReg = 0,
-		LastReg = 10,
-		Scratch	= 12,
-		UnknownReg = 11
-	}
-	Register;
+    FP = 13,
+        
+    // Pseudo-register for floating point
+    F0  = 0,
 
-	/* ARM condition codes */
-	typedef enum
-	{
-		EQ = 0x0, // Equal
-		NE = 0x1, // Not Equal
-		CS = 0x2, // Carry Set (or HS)
-		CC = 0x3, // Carry Clear (or LO)
-		MI = 0x4, // MInus
-		PL = 0x5, // PLus
-		VS = 0x6, // oVerflow Set
-		VC = 0x7, // oVerflow Clear
-		HI = 0x8, // HIgher
-		LS = 0x9, // Lower or Same
-		GE = 0xA, // Greater or Equal
-		LT = 0xB, // Less Than
-		GT = 0xC, // Greater Than
-		LE = 0xD, // Less or Equal
-		AL = 0xE, // ALways
-		NV = 0xF  // NeVer
-	}
-	ConditionCode;
+    // helpers
+    FRAME_PTR = 11,
+    ESP = 13,
+        
+    FirstReg = 0,
+    LastReg = 10,
+    Scratch = 12,
+    UnknownReg = 11
+} Register;
+
+/* ARM condition codes */
+typedef enum {
+    EQ = 0x0, // Equal
+    NE = 0x1, // Not Equal
+    CS = 0x2, // Carry Set (or HS)
+    CC = 0x3, // Carry Clear (or LO)
+    MI = 0x4, // MInus
+    PL = 0x5, // PLus
+    VS = 0x6, // oVerflow Set
+    VC = 0x7, // oVerflow Clear
+    HI = 0x8, // HIgher
+    LS = 0x9, // Lower or Same
+    GE = 0xA, // Greater or Equal
+    LT = 0xB, // Less Than
+    GT = 0xC, // Greater Than
+    LE = 0xD, // Less or Equal
+    AL = 0xE, // ALways
+    NV = 0xF  // NeVer
+} ConditionCode;
 
 
-	typedef int RegisterMask;
-	typedef struct _FragInfo
-	{
-		RegisterMask	needRestoring;
-		NIns*			epilogue;
-	} 
-	FragInfo;
+typedef int RegisterMask;
+typedef struct _FragInfo {
+    RegisterMask    needRestoring;
+    NIns*           epilogue;
+} FragInfo;
 
-	static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10;
-	static const RegisterMask FpRegs = 0x0000; // FST0-FST7
-	static const RegisterMask GpRegs = 0x07FF;
-	static const RegisterMask AllowableFlagRegs = 1<<R0 | 1<<R1 | 1<<R2 | 1<<R3 | 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10;
+static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10;
+static const RegisterMask FpRegs = 0x0000; // FST0-FST7
+static const RegisterMask GpRegs = 0x07FF;
+static const RegisterMask AllowableFlagRegs = 1<<R0 | 1<<R1 | 1<<R2 | 1<<R3 | 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10;
+
+#define firstreg()      R0
+#define nextreg(r)      (Register)((int)r+1)
+#define imm2register(c) (Register)(c-1)
+
+verbose_only( extern const char* regNames[]; )
 
-	#define firstreg()		R0
-	#define nextreg(r)		(Register)((int)r+1)
-	#define imm2register(c) (Register)(c-1)
+// abstract to platform specific calls
+#define nExtractPlatformFlags(x)    0
 
-	verbose_only( extern const char* regNames[]; )
+#define DECLARE_PLATFORM_STATS()                \
+    counter_define(x87Top);
+
+#define DECLARE_PLATFORM_REGALLOC()
 
-	// abstract to platform specific calls
-	#define nExtractPlatformFlags(x)	0
-
-	#define DECLARE_PLATFORM_STATS() \
-		counter_define(x87Top);
-
-	#define DECLARE_PLATFORM_REGALLOC()
+#define DECLARE_PLATFORM_ASSEMBLER()                                    \
+    const static Register argRegs[4], retRegs[2];                       \
+    void LD32_nochk(Register r, int32_t imm);                           \
+    void BL(NIns*);                                                     \
+    void BL_far(NIns*);                                                 \
+    void CALL(const CallInfo*);                                         \
+    void underrunProtect(int bytes);                                    \
+    bool has_cmov;                                                      \
+    void nativePageReset();                                             \
+    void nativePageSetup();                                             \
+    void flushCache(NIns*,NIns*);                                       \
+    int* _nSlot;                                                        \
+    int* _nExitSlot;
 
 
-	#define DECLARE_PLATFORM_ASSEMBLER()\
-		const static Register argRegs[4], retRegs[2];\
-		void LD32_nochk(Register r, int32_t imm);\
-		void BL(NIns*);\
-		void BL_far(NIns*);\
-		void CALL(const CallInfo*);\
-		void underrunProtect(int bytes);\
-		bool has_cmov;\
-		void nativePageReset();\
-		void nativePageSetup();\
-		void flushCache(NIns*,NIns*);\
-		int* _nSlot;\
-		int* _nExitSlot;
+#define asm_farg(i) NanoAssert(false)
+
+//printf("jmp_l_n count=%d, nins=%X, %X = %X\n", (_c), nins, _nIns, ((intptr_t)(nins+(_c))-(intptr_t)_nIns - 4) );
+
+#define swapptrs()  {                                                   \
+        NIns* _tins = _nIns; _nIns=_nExitIns; _nExitIns=_tins;          \
+        int* _nslot = _nSlot;                                           \
+        _nSlot = _nExitSlot;                                            \
+        _nExitSlot = _nslot;                                            \
+    }
 
 
-    #define asm_farg(i) NanoAssert(false)
+#define IMM32(imm)  *(--_nIns) = (NIns)((imm));
 
-	//printf("jmp_l_n count=%d, nins=%X, %X = %X\n", (_c), nins, _nIns, ((intptr_t)(nins+(_c))-(intptr_t)_nIns - 4) );
+#define FUNCADDR(addr) ( ((int)addr) )  
 
-	#define swapptrs()  { NIns* _tins = _nIns; _nIns=_nExitIns; _nExitIns=_tins; \
-								int* _nslot = _nSlot;\
-								_nSlot = _nExitSlot;\
-								_nExitSlot = _nslot;}
+#define OP_IMM  (1<<25)
 
-
-#define IMM32(imm)	*(--_nIns) = (NIns)((imm));
-
-#define FUNCADDR(addr) ( ((int)addr) )	
-
+#define COND_AL (0xE<<28)
 
-#define OP_IMM	(1<<25)
-
-#define COND_AL	(0xE<<28)
-
-typedef enum
-{
-	LSL_imm = 0, // LSL #c - Logical Shift Left
-	LSL_reg = 1, // LSL Rc - Logical Shift Left
-	LSR_imm = 2, // LSR #c - Logical Shift Right
-	LSR_reg = 3, // LSR Rc - Logical Shift Right
-	ASR_imm = 4, // ASR #c - Arithmetic Shift Right
-	ASR_reg = 5, // ASR Rc - Arithmetic Shift Right
-	ROR_imm = 6, // Rotate Right (c != 0)
-	RRX     = 6, // Rotate Right one bit with extend (c == 0)
-	ROR_reg = 7  // Rotate Right
-}
-ShiftOperator;
+typedef enum {
+    LSL_imm = 0, // LSL #c - Logical Shift Left
+    LSL_reg = 1, // LSL Rc - Logical Shift Left
+    LSR_imm = 2, // LSR #c - Logical Shift Right
+    LSR_reg = 3, // LSR Rc - Logical Shift Right
+    ASR_imm = 4, // ASR #c - Arithmetic Shift Right
+    ASR_reg = 5, // ASR Rc - Arithmetic Shift Right
+    ROR_imm = 6, // Rotate Right (c != 0)
+    RRX     = 6, // Rotate Right one bit with extend (c == 0)
+    ROR_reg = 7  // Rotate Right
+} ShiftOperator;
 
 #define LD32_size 4
 
+#define BEGIN_NATIVE_CODE(x)                    \
+    { DWORD* _nIns = (uint8_t*)x
 
-#define BEGIN_NATIVE_CODE(x) \
-	{ DWORD* _nIns = (uint8_t*)x
-
-#define END_NATIVE_CODE(x) \
-	(x) = (dictwordp*)_nIns; }
+#define END_NATIVE_CODE(x)                      \
+    (x) = (dictwordp*)_nIns; }
 
 // BX 
-#define BX(_r)	do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( COND_AL | (0x12<<20) | (0xFFF<<8) | (1<<4) | (_r));\
-	asm_output("bx LR"); } while(0)
+#define BX(_r)  do {                                                    \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x12<<20) | (0xFFF<<8) | (1<<4) | (_r)); \
+        asm_output("bx LR"); } while(0)
 
 // _l = _r OR _l
-#define OR(_l,_r)		do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( COND_AL | (0xC<<21) | (_r<<16) | (_l<<12) | (_l) );\
-	asm_output2("or %s,%s",gpn(_l),gpn(_r)); } while(0)
+#define OR(_l,_r)       do {                                            \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0xC<<21) | (_r<<16) | (_l<<12) | (_l) ); \
+        asm_output2("or %s,%s",gpn(_l),gpn(_r)); } while(0)
 
 // _r = _r OR _imm
-#define ORi(_r,_imm)	do {\
-	NanoAssert(isU8((_imm)));\
-	underrunProtect(4); \
-	*(--_nIns) = (NIns)( COND_AL | OP_IMM | (0xC<<21) | (_r<<16) | (_r<<12) | ((_imm)&0xFF) );\
-	asm_output2("or %s,%d",gpn(_r), (_imm)); } while(0)
+#define ORi(_r,_imm)    do {                                            \
+        NanoAssert(isU8((_imm)));                                       \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | OP_IMM | (0xC<<21) | (_r<<16) | (_r<<12) | ((_imm)&0xFF) ); \
+        asm_output2("or %s,%d",gpn(_r), (_imm)); } while(0)
 
 // _l = _r AND _l
-#define AND(_l,_r) do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( COND_AL | ((_r)<<16) | ((_l)<<12) | (_l));\
-	asm_output2("and %s,%s",gpn(_l),gpn(_r)); } while(0)
+#define AND(_l,_r) do {                                                 \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | ((_r)<<16) | ((_l)<<12) | (_l)); \
+        asm_output2("and %s,%s",gpn(_l),gpn(_r)); } while(0)
 
 // _r = _r AND _imm
-#define ANDi(_r,_imm) do {\
-	if (isU8((_imm))) {\
-		underrunProtect(4);\
-		*(--_nIns) = (NIns)( COND_AL | OP_IMM | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) );\
-		asm_output2("and %s,%d",gpn(_r),(_imm));}\
-	else if ((_imm)<0 && (_imm)>-256) {\
-		underrunProtect(8);\
-		*(--_nIns) = (NIns)( COND_AL | ((_r)<<16) | ((_r)<<12) | (Scratch) );\
-		asm_output2("and %s,%s",gpn(_r),gpn(Scratch));\
-		*(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | ((Scratch)<<12) | (((_imm)^0xFFFFFFFF)&0xFF) );\
-		asm_output2("mvn %s,%d",gpn(Scratch),(_imm));}\
-	else NanoAssert(0);\
-	} while (0)
+#define ANDi(_r,_imm) do {                                              \
+        if (isU8((_imm))) {                                             \
+            underrunProtect(4);                                         \
+            *(--_nIns) = (NIns)( COND_AL | OP_IMM | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) ); \
+            asm_output2("and %s,%d",gpn(_r),(_imm));}                   \
+        else if ((_imm)<0 && (_imm)>-256) {                             \
+            underrunProtect(8);                                         \
+            *(--_nIns) = (NIns)( COND_AL | ((_r)<<16) | ((_r)<<12) | (Scratch) ); \
+            asm_output2("and %s,%s",gpn(_r),gpn(Scratch));              \
+            *(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | ((Scratch)<<12) | (((_imm)^0xFFFFFFFF)&0xFF) ); \
+            asm_output2("mvn %s,%d",gpn(Scratch),(_imm));}              \
+        else NanoAssert(0);                                             \
+    } while (0)
 
 
 // _l = _l XOR _r
-#define XOR(_l,_r)	do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( COND_AL | (1<<21) | ((_r)<<16) | ((_l)<<12) | (_l));\
-	asm_output2("eor %s,%s",gpn(_l),gpn(_r)); } while(0)
+#define XOR(_l,_r)  do {                                                \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (1<<21) | ((_r)<<16) | ((_l)<<12) | (_l)); \
+        asm_output2("eor %s,%s",gpn(_l),gpn(_r)); } while(0)
 
 // _r = _r XOR _imm
-#define XORi(_r,_imm)	do {	\
-	NanoAssert(isU8((_imm)));\
-	underrunProtect(4);		\
-	*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<21) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) );\
-	asm_output2("eor %s,%d",gpn(_r),(_imm)); } while(0)
+#define XORi(_r,_imm)   do {                                            \
+        NanoAssert(isU8((_imm)));                                       \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<21) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) ); \
+        asm_output2("eor %s,%d",gpn(_r),(_imm)); } while(0)
 
 // _l = _l + _r
-#define ADD(_l,_r) do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_l)<<12) | (_l));\
-	asm_output2("add %s,%s",gpn(_l),gpn(_r)); } while(0)
+#define ADD(_l,_r) do {                                                 \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_l)<<12) | (_l)); \
+        asm_output2("add %s,%s",gpn(_l),gpn(_r)); } while(0)
 
 // _r = _r + _imm
-#define ADDi(_r,_imm)	do {\
-	if ((_imm)>-256 && (_imm)<256) {\
-		underrunProtect(4);\
-		if	((_imm)>=0) *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) );\
-		else			*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((-(_imm))&0xFF) );}\
-	else {\
-		if ((_imm)>=0){\
-			if ((_imm)<=1020 && (((_imm)&3)==0) ){\
-				underrunProtect(4);\
-				*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | (15<<8)| ((_imm)>>2) );}\
-			else {\
-				underrunProtect(4+LD32_size);\
-				*(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_r)<<12) | (Scratch));\
-				LD32_nochk(Scratch, _imm);}}\
-		else{\
-      if ((_imm)>=-510){\
-			  underrunProtect(8);\
-			  int rem = -(_imm) - 255;\
-			  *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((rem)&0xFF) );\
-        *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | (0xFF) );}\
-      else {\
-				underrunProtect(4+LD32_size);\
-				*(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_r)<<16) | ((_r)<<12) | (Scratch));\
-        LD32_nochk(Scratch, -(_imm));}\
-    }\
-  }\
-	asm_output2("addi %s,%d",gpn(_r),(_imm));} while(0)
+#define ADDi(_r,_imm)   do {                                            \
+        if ((_imm)>-256 && (_imm)<256) {                                \
+            underrunProtect(4);                                         \
+            if ((_imm)>=0)                                              \
+                *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) ); \
+            else                                                        \
+                *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((-(_imm))&0xFF) ); \
+        } else {                                                        \
+            if ((_imm)>=0) {                                            \
+                if ((_imm)<=1020 && (((_imm)&3)==0) ) {                 \
+                    underrunProtect(4);                                 \
+                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | (15<<8)| ((_imm)>>2) ); \
+                } else {                                                \
+                    underrunProtect(4+LD32_size);                       \
+                    *(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_r)<<12) | (Scratch)); \
+                    LD32_nochk(Scratch, _imm);                          \
+                }                                                       \
+            } else {                                                    \
+                if ((_imm)>=-510) {                                     \
+                    underrunProtect(8);                                 \
+                    int rem = -(_imm) - 255;                            \
+                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((rem)&0xFF) ); \
+                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | (0xFF) ); \
+                } else {                                                \
+                    underrunProtect(4+LD32_size);                       \
+                    *(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_r)<<16) | ((_r)<<12) | (Scratch)); \
+                    LD32_nochk(Scratch, -(_imm));                       \
+                }                                                       \
+            }                                                           \
+        }                                                               \
+        asm_output2("addi %s,%d",gpn(_r),(_imm));                       \
+    } while(0)
 
 // _l = _l - _r
-#define SUB(_l,_r)	do {\
-	underrunProtect(4); \
-	*(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_l)<<16) | ((_l)<<12) | (_r));\
-	asm_output2("sub %s,%s",gpn(_l),gpn(_r)); } while(0)
+#define SUB(_l,_r)  do {                                                \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_l)<<16) | ((_l)<<12) | (_r)); \
+        asm_output2("sub %s,%s",gpn(_l),gpn(_r)); } while(0)
 
 // _r = _r - _imm
-#define SUBi(_r,_imm)	do{\
-	if ((_imm)>-256 && (_imm)<256){\
-		underrunProtect(4);\
-		if ((_imm)>=0)	*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) );\
-		else			*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | ((-(_imm))&0xFF) );}\
-	else {\
-		if ((_imm)>=0){\
-			if ((_imm)<=510){\
-				underrunProtect(8);\
-				int rem = (_imm) - 255;\
-				NanoAssert(rem<256);\
-				*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | (rem&0xFF) );\
-				*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | (0xFF) );}\
-			else {\
-				underrunProtect(4+LD32_size);\
-				*(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_r)<<16) | ((_r)<<12) | (Scratch));\
-				LD32_nochk(Scratch, _imm);}}\
-		else{\
-      if ((_imm)>=-510) {\
-			  underrunProtect(8);\
-			  int rem = -(_imm) - 255;\
-			  *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | ((rem)&0xFF) );\
-			  *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | (0xFF) );}\
-      else {\
-				underrunProtect(4+LD32_size);\
-				*(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_r)<<12) | (Scratch));\
-				LD32_nochk(Scratch, -(_imm));}\
-    }\
-  }\
-	asm_output2("sub %s,%d",gpn(_r),(_imm));} while (0)
+#define SUBi(_r,_imm)  do {                                             \
+        if ((_imm)>-256 && (_imm)<256) {                                \
+            underrunProtect(4);                                         \
+            if ((_imm)>=0)  *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) ); \
+            else            *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | ((-(_imm))&0xFF) ); \
+        } else {                                                        \
+            if ((_imm)>=0) {                                            \
+                if ((_imm)<=510) {                                      \
+                    underrunProtect(8);                                 \
+                    int rem = (_imm) - 255;                             \
+                    NanoAssert(rem<256);                                \
+                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | (rem&0xFF) ); \
+                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | (0xFF) ); \
+                } else {                                                \
+                    underrunProtect(4+LD32_size);                       \
+                    *(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_r)<<16) | ((_r)<<12) | (Scratch)); \
+                    LD32_nochk(Scratch, _imm);                          \
+                }                                                       \
+            } else {                                                    \
+                if ((_imm)>=-510) {                                     \
+                    underrunProtect(8);                                 \
+                    int rem = -(_imm) - 255;                            \
+                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | ((rem)&0xFF) ); \
+                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | (0xFF) ); \
+                } else {                                                \
+                    underrunProtect(4+LD32_size);                       \
+                    *(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_r)<<12) | (Scratch)); \
+                    LD32_nochk(Scratch, -(_imm)); \
+                }                                                       \
+            }                                                           \
+        }                                                               \
+        asm_output2("sub %s,%d",gpn(_r),(_imm));                        \
+    } while (0)
 
 // _l = _l * _r
-#define MUL(_l,_r)	do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( COND_AL | (_l)<<16 | (_l)<<8 | 0x90 | (_r) );\
-	asm_output2("mul %s,%s",gpn(_l),gpn(_r)); } while(0)
+#define MUL(_l,_r)  do {                                                \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (_l)<<16 | (_l)<<8 | 0x90 | (_r) ); \
+        asm_output2("mul %s,%s",gpn(_l),gpn(_r)); } while(0)
 
 
 // RSBS
 // _r = -_r
-#define NEG(_r)	do {\
-	underrunProtect(4);	\
-	*(--_nIns) = (NIns)( COND_AL |  (0x27<<20) | ((_r)<<16) | ((_r)<<12) ); \
-	asm_output1("neg %s",gpn(_r)); } while(0)
+#define NEG(_r) do {                                                    \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL |  (0x27<<20) | ((_r)<<16) | ((_r)<<12) ); \
+        asm_output1("neg %s",gpn(_r)); } while(0)
 
 // MVNS
 // _r = !_r
-#define NOT(_r)	do {\
-	underrunProtect(4);	\
-	*(--_nIns) = (NIns)( COND_AL |  (0x1F<<20) | ((_r)<<12) |  (_r) ); \
-	asm_output1("mvn %s",gpn(_r)); } while(0)
+#define NOT(_r) do {                                                    \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL |  (0x1F<<20) | ((_r)<<12) |  (_r) ); \
+        asm_output1("mvn %s",gpn(_r)); } while(0)
 
 // MOVS _r, _r, LSR <_s>
 // _r = _r >> _s
-#define SHR(_r,_s) do {\
-	underrunProtect(4); \
-	*(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_s)<<8) | (LSR_reg<<4) | (_r) ); \
-	asm_output2("shr %s,%s",gpn(_r),gpn(_s)); } while(0)
+#define SHR(_r,_s) do {                                                 \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_s)<<8) | (LSR_reg<<4) | (_r) ); \
+        asm_output2("shr %s,%s",gpn(_r),gpn(_s)); } while(0)
 
 // MOVS _r, _r, LSR #_imm
 // _r = _r >> _imm
-#define SHRi(_r,_imm) do {\
-	underrunProtect(4); \
-	*(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_imm)<<7) | (LSR_imm<<4) | (_r) ); \
-	asm_output2("shr %s,%d",gpn(_r),_imm); } while(0)
+#define SHRi(_r,_imm) do {                                              \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_imm)<<7) | (LSR_imm<<4) | (_r) ); \
+        asm_output2("shr %s,%d",gpn(_r),_imm); } while(0)
 
 // MOVS _r, _r, ASR <_s>
 // _r = _r >> _s
-#define SAR(_r,_s) do {\
-	underrunProtect(4); \
-	*(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_s)<<8) | (ASR_reg<<4) | (_r) ); \
-	asm_output2("asr %s,%s",gpn(_r),gpn(_s)); } while(0)
+#define SAR(_r,_s) do {                                                 \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_s)<<8) | (ASR_reg<<4) | (_r) ); \
+        asm_output2("asr %s,%s",gpn(_r),gpn(_s)); } while(0)
 
 
 // MOVS _r, _r, ASR #_imm
 // _r = _r >> _imm
-#define SARi(_r,_imm) do {\
-	underrunProtect(4); \
-	*(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_imm)<<7) | (ASR_imm<<4) | (_r) ); \
-	asm_output2("asr %s,%d",gpn(_r),_imm); } while(0)
+#define SARi(_r,_imm) do {                                              \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_imm)<<7) | (ASR_imm<<4) | (_r) ); \
+        asm_output2("asr %s,%d",gpn(_r),_imm); } while(0)
 
 // MOVS _r, _r, LSL <_s>
 // _r = _r << _s
-#define SHL(_r,_s) do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_s)<<8) | (LSL_reg<<4) | (_r) ); \
-	asm_output2("lsl %s,%s",gpn(_r),gpn(_s)); } while(0)
+#define SHL(_r,_s) do {                                                 \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_s)<<8) | (LSL_reg<<4) | (_r) ); \
+        asm_output2("lsl %s,%s",gpn(_r),gpn(_s)); } while(0)
 
 // MOVS _r, _r, LSL #_imm
 // _r = _r << _imm
-#define SHLi(_r,_imm) do {\
-	underrunProtect(4); \
-	*(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_imm)<<7) | (LSL_imm<<4) | (_r) ); \
-	asm_output2("lsl %s,%d",gpn(_r),(_imm)); } while(0)
-					
+#define SHLi(_r,_imm) do {                                              \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x1B<<20) | ((_r)<<12) | ((_imm)<<7) | (LSL_imm<<4) | (_r) ); \
+        asm_output2("lsl %s,%d",gpn(_r),(_imm)); } while(0)
+
 // TST
-#define TEST(_d,_s) do{\
-	underrunProtect(4); \
-	*(--_nIns) = (NIns)( COND_AL | (0x11<<20) | ((_d)<<16) | (_s) ); \
-	asm_output2("test %s,%s",gpn(_d),gpn(_s));} while(0)
+#define TEST(_d,_s) do {                                                \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x11<<20) | ((_d)<<16) | (_s) ); \
+        asm_output2("test %s,%s",gpn(_d),gpn(_s)); } while(0)
 
 // CMP
-#define CMP(_l,_r)	do{\
-	underrunProtect(4); \
-	*(--_nIns) = (NIns)( COND_AL | (0x015<<20) | ((_l)<<16) | (_r) ); \
-	asm_output2("cmp %s,%s",gpn(_l),gpn(_r));} while(0)
+#define CMP(_l,_r)  do {                                                \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x015<<20) | ((_l)<<16) | (_r) ); \
+        asm_output2("cmp %s,%s",gpn(_l),gpn(_r)); } while(0)
 
 // CMP (or CMN)
-#define CMPi(_r,_imm)	do{\
-	if (_imm<0) {	\
-		if ((_imm)>-256) {\
-			underrunProtect(4);\
-			*(--_nIns) = (NIns)( COND_AL | (0x37<<20) | ((_r)<<16) | (-(_imm)) );}\
-		else {\
-			underrunProtect(4+LD32_size);\
-			*(--_nIns) = (NIns)( COND_AL | (0x17<<20) | ((_r)<<16) | (Scratch) ); \
-			LD32_nochk(Scratch, (_imm));}\
-	} else {\
-		if ((_imm)<256){\
-			underrunProtect(4);\
-			*(--_nIns) = (NIns)( COND_AL | (0x035<<20) | ((_r)<<16) | ((_imm)&0xFF) ); \
-		} else {\
-			underrunProtect(4+LD32_size);\
-			*(--_nIns) = (NIns)( COND_AL | (0x015<<20) | ((_r)<<16) | (Scratch) ); \
-			LD32_nochk(Scratch, (_imm));\
-		}\
-	}\
-	asm_output2("cmp %s,%X",gpn(_r),(_imm)); } while(0)
+#define CMPi(_r,_imm)  do {                                             \
+        if (_imm<0) {                                                   \
+            if ((_imm)>-256) {                                          \
+                underrunProtect(4);                                     \
+                *(--_nIns) = (NIns)( COND_AL | (0x37<<20) | ((_r)<<16) | (-(_imm)) ); \
+            } else {                                                    \
+                underrunProtect(4+LD32_size);                           \
+                *(--_nIns) = (NIns)( COND_AL | (0x17<<20) | ((_r)<<16) | (Scratch) ); \
+                LD32_nochk(Scratch, (_imm));                            \
+            }                                                           \
+        } else {                                                        \
+            if ((_imm)<256) {                                           \
+                underrunProtect(4);                                     \
+                *(--_nIns) = (NIns)( COND_AL | (0x035<<20) | ((_r)<<16) | ((_imm)&0xFF) ); \
+            } else {                                                    \
+                underrunProtect(4+LD32_size);                           \
+                *(--_nIns) = (NIns)( COND_AL | (0x015<<20) | ((_r)<<16) | (Scratch) ); \
+                LD32_nochk(Scratch, (_imm));                            \
+            }                                                           \
+        }                                                               \
+        asm_output2("cmp %s,%X",gpn(_r),(_imm));                        \
+    } while(0)
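+// Usage sketch: small negative immediates fold into CMN, so CMPi(R0, -1)
+// assembles to "cmn r0, #1"; immediates outside +/-255 are first loaded into
+// Scratch with LD32_nochk and compared register-to-register.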
 
 // MOV
-#define MR(_d,_s)	do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( COND_AL | (0xD<<21) | ((_d)<<12) | (_s) );\
-	asm_output2("mov %s,%s",gpn(_d),gpn(_s)); } while (0)
+#define MR(_d,_s)  do {                                                 \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0xD<<21) | ((_d)<<12) | (_s) ); \
+        asm_output2("mov %s,%s",gpn(_d),gpn(_s)); } while (0)
 
 
-#define MR_cond(_d,_s,_cond,_nm)	do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( ((_cond)<<28) | (0xD<<21) | ((_d)<<12) | (_s) );\
-	asm_output2(_nm " %s,%s",gpn(_d),gpn(_s)); } while (0)
+#define MR_cond(_d,_s,_cond,_nm)  do {                                  \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( ((_cond)<<28) | (0xD<<21) | ((_d)<<12) | (_s) ); \
+        asm_output2(_nm " %s,%s",gpn(_d),gpn(_s)); } while (0)
 
-#define MREQ(dr,sr)	MR_cond(dr, sr, EQ, "moveq")
-#define MRNE(dr,sr)	MR_cond(dr, sr, NE, "movne")
-#define MRL(dr,sr)	MR_cond(dr, sr, LT, "movlt")
-#define MRLE(dr,sr)	MR_cond(dr, sr, LE, "movle")
-#define MRG(dr,sr)	MR_cond(dr, sr, GT, "movgt")
-#define MRGE(dr,sr)	MR_cond(dr, sr, GE, "movge")
-#define MRB(dr,sr)	MR_cond(dr, sr, CC, "movcc")
-#define MRBE(dr,sr)	MR_cond(dr, sr, LS, "movls")
-#define MRA(dr,sr)	MR_cond(dr, sr, HI, "movcs")
-#define MRAE(dr,sr)	MR_cond(dr, sr, CS, "movhi")
-#define MRNO(dr,sr)	MR_cond(dr, sr, VC, "movvc") // overflow clear
+#define MREQ(dr,sr) MR_cond(dr, sr, EQ, "moveq")
+#define MRNE(dr,sr) MR_cond(dr, sr, NE, "movne")
+#define MRL(dr,sr)  MR_cond(dr, sr, LT, "movlt")
+#define MRLE(dr,sr) MR_cond(dr, sr, LE, "movle")
+#define MRG(dr,sr)  MR_cond(dr, sr, GT, "movgt")
+#define MRGE(dr,sr) MR_cond(dr, sr, GE, "movge")
+#define MRB(dr,sr)  MR_cond(dr, sr, CC, "movcc")
+#define MRBE(dr,sr) MR_cond(dr, sr, LS, "movls")
+#define MRA(dr,sr)  MR_cond(dr, sr, HI, "movhi")
+#define MRAE(dr,sr) MR_cond(dr, sr, CS, "movcs")
+#define MRNO(dr,sr) MR_cond(dr, sr, VC, "movvc") // overflow clear
 #define MRNC(dr,sr) MR_cond(dr, sr, CC, "movcc") // carry clear
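+// Usage sketch: these expand to conditional register moves, e.g. MREQ(R0, R1)
+// assembles to "moveq r0, r1" and is typically paired with a preceding CMP.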
 
-#define LD(_d,_off,_b) do{\
-	if ((_off)<0){\
-	  underrunProtect(4);\
-    NanoAssert((_off)>-4096);\
-		*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | ((_b)<<16) | ((_d)<<12) | ((-(_off))&0xFFF) );\
-	} else {\
-    if (isS16(_off) || isU16(_off)) {\
-	    underrunProtect(4);\
-      NanoAssert((_off)<4096);\
-      *(--_nIns) = (NIns)( COND_AL | (0x59<<20) | ((_b)<<16) | ((_d)<<12) | ((_off)&0xFFF) );}\
-    else {\
-  	  underrunProtect(4+LD32_size);\
-      *(--_nIns) = (NIns)( COND_AL | (0x79<<20) | ((_b)<<16) | ((_d)<<12) | Scratch );\
-      LD32_nochk(Scratch, _off);}\
-	}  asm_output3("ld %s,%d(%s)",gpn((_d)),(_off),gpn((_b))); }while(0)
+#define LD(_d,_off,_b) do {                                             \
+        if ((_off)<0) {                                                 \
+            underrunProtect(4);                                         \
+            NanoAssert((_off)>-4096);                                   \
+            *(--_nIns) = (NIns)( COND_AL | (0x51<<20) | ((_b)<<16) | ((_d)<<12) | ((-(_off))&0xFFF) ); \
+        } else {                                                        \
+            if (isS16(_off) || isU16(_off)) {                           \
+                underrunProtect(4);                                     \
+                NanoAssert((_off)<4096);                                \
+                *(--_nIns) = (NIns)( COND_AL | (0x59<<20) | ((_b)<<16) | ((_d)<<12) | ((_off)&0xFFF) ); \
+            } else {                                                    \
+                underrunProtect(4+LD32_size);                           \
+                *(--_nIns) = (NIns)( COND_AL | (0x79<<20) | ((_b)<<16) | ((_d)<<12) | Scratch ); \
+                LD32_nochk(Scratch, _off);                              \
+            }                                                           \
+        }                                                               \
+        asm_output3("ld %s,%d(%s)",gpn((_d)),(_off),gpn((_b)));         \
+    } while(0)
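+// Usage sketch: LD(R0, 8, SP) assembles to "ldr r0, [sp, #8]"; offsets that
+// don't fit the 12-bit encoding are loaded into Scratch first and a
+// register-offset LDR is emitted instead.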
 
 
-#define LDi(_d,_imm) do {\
-	if (isS8((_imm)) || isU8((_imm))) {	\
-		underrunProtect(4);	\
-		if ((_imm)<0)	*(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | ((_d)<<12) | (((_imm)^0xFFFFFFFF)&0xFF) );\
-		else			*(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | ((_d)<<12) | ((_imm)&0xFF) );\
-	} else {\
-		underrunProtect(LD32_size);\
-		LD32_nochk(_d, (_imm));\
-	} asm_output2("ld %s,%d",gpn((_d)),(_imm)); } while(0)
+#define LDi(_d,_imm) do {                                               \
+        if (isS8((_imm)) || isU8((_imm))) {                             \
+            underrunProtect(4);                                         \
+            if ((_imm)<0)   *(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | ((_d)<<12) | (((_imm)^0xFFFFFFFF)&0xFF) ); \
+            else            *(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | ((_d)<<12) | ((_imm)&0xFF) ); \
+        } else {                                                        \
+            underrunProtect(LD32_size);                                 \
+            LD32_nochk(_d, (_imm));                                     \
+        }                                                               \
+        asm_output2("ld %s,%d",gpn((_d)),(_imm));                       \
+    } while(0)
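+// Usage sketch: LDi(R0, -1) assembles to the single instruction "mvn r0, #0",
+// since small negative constants are encoded via MVN of the complement; larger
+// constants fall back to LD32_nochk.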
 
 
 // load 8-bit, zero extend (aka LDRB)
 // note, only 5-bit offsets (!) are supported for this, but that's all we need at the moment
 // (LDRB actually allows a 12-bit offset in ARM mode, but constraining to 5 bits gives us an advantage for Thumb)
 // @todo, untested!
-#define LD8Z(_d,_off,_b) do{    \
-    NanoAssert((d)>=0&&(d)<=31);\
-    underrunProtect(4);\
-    *(--_nIns) = (NIns)( COND_AL | (0x5D<<20) | ((_b)<<16) | ((_d)<<12) |  ((_off)&0xfff)  );\
-    asm_output3("ldrb %s,%d(%s)", gpn(_d),(_off),gpn(_b));\
+#define LD8Z(_d,_off,_b) do {                                           \
+        NanoAssert((_off)>=0&&(_off)<=31);                              \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x5D<<20) | ((_b)<<16) | ((_d)<<12) |  ((_off)&0xfff)  ); \
+        asm_output3("ldrb %s,%d(%s)", gpn(_d),(_off),gpn(_b));          \
     } while(0)
 
-#define ST(_b,_off,_r) do{\
-	underrunProtect(4);	\
-	if ((_off)<0)	*(--_nIns) = (NIns)( COND_AL | (0x50<<20) | ((_b)<<16) | ((_r)<<12) | ((-(_off))&0xFFF) );\
-	else			*(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((_r)<<12) | ((_off)&0xFFF) );\
-	asm_output3("str %s, %d(%s)",gpn(_r), (_off),gpn(_b)); } while(0)
+#define ST(_b,_off,_r) do {                                             \
+        underrunProtect(4);                                             \
+        if ((_off)<0)   *(--_nIns) = (NIns)( COND_AL | (0x50<<20) | ((_b)<<16) | ((_r)<<12) | ((-(_off))&0xFFF) ); \
+        else            *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((_r)<<12) | ((_off)&0xFFF) ); \
+        asm_output3("str %s, %d(%s)",gpn(_r), (_off),gpn(_b)); } while(0)
 
 
-#define STi(_b,_off,_imm) do{\
-	NanoAssert((_off)>0);\
-	if (isS8((_imm)) || isU8((_imm))) {	\
-		underrunProtect(8);	\
-	  *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((Scratch)<<12) | ((_off)&0xFFF) );\
-	  asm_output3("str %s, %d(%s)",gpn(Scratch), (_off),gpn(_b));			\
-		if ((_imm)<0)	*(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | (Scratch<<12) | (((_imm)^0xFFFFFFFF)&0xFF) );\
-		else			*(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | (Scratch<<12) | ((_imm)&0xFF) );\
-    asm_output2("ld %s,%d",gpn((Scratch)),(_imm));	}\
-  else {\
-		underrunProtect(4+LD32_size);\
-	  *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((Scratch)<<12) | ((_off)&0xFFF) );\
-	  asm_output3("str %s, %d(%s)",gpn(Scratch), (_off),gpn(_b));			\
-    LD32_nochk(Scratch, (_imm));}\
- } while(0);
+#define STi(_b,_off,_imm) do {                                          \
+        NanoAssert((_off)>0);                                           \
+        if (isS8((_imm)) || isU8((_imm))) {                             \
+            underrunProtect(8);                                         \
+            *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((Scratch)<<12) | ((_off)&0xFFF) ); \
+            asm_output3("str %s, %d(%s)",gpn(Scratch), (_off),gpn(_b)); \
+            if ((_imm)<0)   *(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | (Scratch<<12) | (((_imm)^0xFFFFFFFF)&0xFF) ); \
+            else            *(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | (Scratch<<12) | ((_imm)&0xFF) ); \
+            asm_output2("ld %s,%d",gpn((Scratch)),(_imm));              \
+        } else {                                                        \
+            underrunProtect(4+LD32_size);                               \
+            *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((Scratch)<<12) | ((_off)&0xFFF) ); \
+            asm_output3("str %s, %d(%s)",gpn(Scratch), (_off),gpn(_b)); \
+            LD32_nochk(Scratch, (_imm));                                \
+        }                                                               \
+    } while(0)
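+// Usage sketch (assuming Scratch maps to IP on this target): STi(SP, 8, 5)
+// materializes the constant first, emitting "movs ip, #5" then "str ip, [sp, #8]".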
 
 
-#define LEA(_r,_d,_b) do{						\
-	NanoAssert((_d)<=1020);						\
-	NanoAssert(((_d)&3)==0);						\
-	if (_b!=SP) NanoAssert(0);					\
-	if ((_d)<256) {								\
-		underrunProtect(4);							\
-		*(--_nIns) = (NIns)( COND_AL | (0x28<<20) | ((_b)<<16) | ((_r)<<12) | ((_d)&0xFF) );}\
-	else{										\
-		underrunProtect(8);							\
-		*(--_nIns) = (NIns)( COND_AL | (0x4<<21) | ((_b)<<16) | ((_r)<<12) | (2<<7)| (_r) );\
-		*(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | ((_r)<<12) | (((_d)>>2)&0xFF) );}\
-	asm_output2("lea %s, %d(SP)", gpn(_r), _d);	\
-	} while(0)
+#define LEA(_r,_d,_b) do {                                              \
+        NanoAssert((_d)<=1020);                                         \
+        NanoAssert(((_d)&3)==0);                                        \
+        if (_b!=SP) NanoAssert(0);                                      \
+        if ((_d)<256) {                                                 \
+            underrunProtect(4);                                         \
+            *(--_nIns) = (NIns)( COND_AL | (0x28<<20) | ((_b)<<16) | ((_r)<<12) | ((_d)&0xFF) ); \
+        } else {                                                        \
+            underrunProtect(8);                                         \
+            *(--_nIns) = (NIns)( COND_AL | (0x4<<21) | ((_b)<<16) | ((_r)<<12) | (2<<7)| (_r) ); \
+            *(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | ((_r)<<12) | (((_d)>>2)&0xFF) ); \
+        }                                                               \
+        asm_output2("lea %s, %d(SP)", gpn(_r), _d);                     \
+    } while(0)
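+// Usage sketch: LEA(R0, 16, SP) assembles to "add r0, sp, #16"; displacements
+// of 256..1020 are built by loading _d>>2 and adding it shifted left by two,
+// which is why _d must be word-aligned and at most 1020.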
 
 
-//#define RET()   underrunProtect(1); *(--_nIns) = 0xc3;	asm_output("ret")
-//#define NOP() 	underrunProtect(1); *(--_nIns) = 0x90;	asm_output("nop")
+//#define RET()   underrunProtect(1); *(--_nIns) = 0xc3;    asm_output("ret")
+//#define NOP()     underrunProtect(1); *(--_nIns) = 0x90;  asm_output("nop")
 //#define INT3()  underrunProtect(1); *(--_nIns) = 0xcc;  asm_output("int3")
 //#define RET() INT3()
 
-#define BKPT_nochk() do { *(--_nIns) = (NIns)( (0xE<<24) | (0x12<<20) | (0x7<<4) ); } while (0);
+#define BKPT_nochk() do { \
+        *(--_nIns) = (NIns)( (0xE<<24) | (0x12<<20) | (0x7<<4) ); } while (0)
 
 // push a single register
-#define PUSHr(_r)  do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( COND_AL | (0x92<<20) | (SP<<16) | (1<<(_r)) );	\
-	asm_output1("push %s",gpn(_r)); } while (0)
+#define PUSHr(_r)  do {                                                 \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x92<<20) | (SP<<16) | (1<<(_r)) ); \
+        asm_output1("push %s",gpn(_r)); } while (0)
 
 // STMDB
-#define PUSH_mask(_mask)  do {\
-	underrunProtect(4);			\
-	*(--_nIns) = (NIns)( COND_AL | (0x92<<20) | (SP<<16) | (_mask) );	\
-	asm_output1("push %x", (_mask));} while (0)
+#define PUSH_mask(_mask)  do {                                          \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x92<<20) | (SP<<16) | (_mask) ); \
+        asm_output1("push %x", (_mask));} while (0)
 
 // this form of PUSH takes a base + offset
 // we need to load into scratch reg, then push onto stack
-#define PUSHm(_off,_b)	do {\
-	NanoAssert( (int)(_off)>0 );\
-	underrunProtect(8);\
-	*(--_nIns) = (NIns)( COND_AL | (0x92<<20) | (SP<<16) | (1<<(Scratch)) );	\
-	*(--_nIns) = (NIns)( COND_AL | (0x59<<20) | ((_b)<<16) | ((Scratch)<<12) | ((_off)&0xFFF) );\
-	asm_output2("push %d(%s)",(_off),gpn(_b)); } while (0)
+#define PUSHm(_off,_b)  do {                                            \
+        NanoAssert( (int)(_off)>0 );                                    \
+        underrunProtect(8);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x92<<20) | (SP<<16) | (1<<(Scratch)) ); \
+        *(--_nIns) = (NIns)( COND_AL | (0x59<<20) | ((_b)<<16) | ((Scratch)<<12) | ((_off)&0xFFF) ); \
+        asm_output2("push %d(%s)",(_off),gpn(_b)); } while (0)
 
-#define POPr(_r) do {\
-	underrunProtect(4);			\
-	*(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (1<<(_r)) );\
-	asm_output1("pop %s",gpn(_r));} while (0)
+#define POPr(_r) do {                                                   \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (1<<(_r)) ); \
+        asm_output1("pop %s",gpn(_r));} while (0)
 
-#define POP_mask(_mask) do {\
-	underrunProtect(4);			\
-	*(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (_mask) );\
-	asm_output1("pop %x", (_mask));} while (0)
+#define POP_mask(_mask) do {                                            \
+        underrunProtect(4);                                             \
+        *(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (_mask) ); \
+        asm_output1("pop %x", (_mask));} while (0)
 
 #define PC_OFFSET_FROM(target,frompc) ((intptr_t)(target) - ((intptr_t)(frompc) + 8))
 #define JMP_S24_OFFSET_OK(offs) ((-(1<<24)) <= (offs) && (offs) < (1<<24))
 
 // (XXX This ought to be a function instead of a macro)
 //
 // Branch to target address _t with condition _c, doing underrun
 // checks (_chk == 1) or skipping them (_chk == 0).
@@ -582,222 +594,181 @@ ShiftOperator;
 // the instruction stream and load it into pc.
 // If the jump has a condition, but no one's mucked with _nIns and our _nSlot
 // pointer is valid, stick the constant in the slot and emit a conditional
 // load into pc.
 // Otherwise, emit the conditional load into pc from a nearby constant,
 // and emit a jump to jump over it in case the condition fails.
 //
 // NB: JMP_nochk depends on this not calling samepage() when _c == AL
-#define B_cond_chk(_c,_t,_chk) do {										\
-		int32 offs = PC_OFFSET_FROM(_t,(intptr_t)(_nIns)-4);			\
-		if (JMP_S24_OFFSET_OK(offs)) {									\
-			if(_chk) underrunProtect(4);								\
-			*(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | (((offs)>>2) & 0xFFFFFF) ); \
-		} else if (_c == AL) {											\
-			if(_chk) underrunProtect(8);								\
-			*(--_nIns) = (NIns)(_t);									\
-			*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | 0x4 ); \
-		} else if (samepage(_nIns,_nSlot)) {							\
-			if(_chk) underrunProtect(8);								\
-			*(++_nSlot) = (NIns)(_t);									\
-			offs = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4);			\
-			NanoAssert(offs < 0);										\
-			*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | ((-offs) & 0xFFFFFF) ); \
-		} else {														\
-			if(_chk) underrunProtect(24);								\
-			*(--_nIns) = (NIns)(_t);									\
-			*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((-4)>>2) & 0xFFFFFF ); \
-			*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 ); \
-		}																\
-		asm_output2("%s %p\n", _c == AL ? "jmp" : "b(cnd)", (void*)(_t)); \
-	} while(0)
+#define B_cond_chk(_c,_t,_chk) do {                                     \
+        int32 offs = PC_OFFSET_FROM(_t,(intptr_t)(_nIns)-4);            \
+        if (JMP_S24_OFFSET_OK(offs)) {                                  \
+            if(_chk) underrunProtect(4);                                \
+            *(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | (((offs)>>2) & 0xFFFFFF) ); \
+        } else if (_c == AL) {                                          \
+            if(_chk) underrunProtect(8);                                \
+            *(--_nIns) = (NIns)(_t);                                    \
+            *(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | 0x4 ); \
+        } else if (samepage(_nIns,_nSlot)) {                            \
+            if(_chk) underrunProtect(8);                                \
+            *(++_nSlot) = (NIns)(_t);                                   \
+            offs = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4);          \
+            NanoAssert(offs < 0);                                       \
+            *(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | ((-offs) & 0xFFFFFF) ); \
+        } else {                                                        \
+            if(_chk) underrunProtect(24);                               \
+            *(--_nIns) = (NIns)(_t);                                    \
+            *(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((-4)>>2) & 0xFFFFFF ); \
+            *(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 ); \
+        }                                                               \
+        asm_output2("%s %p\n", _c == AL ? "jmp" : "b(cnd)", (void*)(_t)); \
+    } while(0)
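+// Usage sketch: JMP(target) is B_cond_chk(AL, target, 1); when the 24-bit
+// branch-offset check fails, the target word is embedded in the instruction
+// stream (or in the _nSlot constant pool) and loaded directly into pc.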
 
-#define B_cond(_c,_t) \
-	B_cond_chk(_c,_t,1)
+#define B_cond(_c,_t)                           \
+    B_cond_chk(_c,_t,1)
 
 // NB: don't use COND_AL here, we shift the condition into place!
-#define JMP(_t) \
-	B_cond_chk(AL,_t,1)
+#define JMP(_t)                                 \
+    B_cond_chk(AL,_t,1)
 
-#define JMP_nochk(_t) \
-	B_cond_chk(AL,_t,0)
+#define JMP_nochk(_t)                           \
+    B_cond_chk(AL,_t,0)
 
 // emit a placeholder that will be filled in later by nPatchBranch;
 // emit two breakpoint instructions in case something goes wrong with
 // the patching.
-#define JMP_long_placeholder()	do {							\
-		underrunProtect(8);										\
-		BKPT_nochk();											\
-		BKPT_nochk();											\
-	} while(0)
+#define JMP_long_placeholder()  do {            \
+        underrunProtect(8);                     \
+        BKPT_nochk();                           \
+        BKPT_nochk();                           \
+    } while(0)
 
-#define JA(t)	do {B_cond(HI,t); asm_output1("ja 0x%08x",(unsigned int)t); } while(0)
-#define JNA(t)	do {B_cond(LS,t); asm_output1("jna 0x%08x",(unsigned int)t); } while(0)
-#define JB(t)	do {B_cond(CC,t); asm_output1("jb 0x%08x",(unsigned int)t); } while(0)
-#define JNB(t)	do {B_cond(CS,t); asm_output1("jnb 0x%08x",(unsigned int)t); } while(0)
-#define JE(t)	do {B_cond(EQ,t); asm_output1("je 0x%08x",(unsigned int)t); } while(0)
-#define JNE(t)	do {B_cond(NE,t); asm_output1("jne 0x%08x",(unsigned int)t); } while(0)						
-#define JBE(t)	do {B_cond(LS,t); asm_output1("jbe 0x%08x",(unsigned int)t); } while(0)
+#define JA(t)   do {B_cond(HI,t); asm_output1("ja 0x%08x",(unsigned int)t); } while(0)
+#define JNA(t)  do {B_cond(LS,t); asm_output1("jna 0x%08x",(unsigned int)t); } while(0)
+#define JB(t)   do {B_cond(CC,t); asm_output1("jb 0x%08x",(unsigned int)t); } while(0)
+#define JNB(t)  do {B_cond(CS,t); asm_output1("jnb 0x%08x",(unsigned int)t); } while(0)
+#define JE(t)   do {B_cond(EQ,t); asm_output1("je 0x%08x",(unsigned int)t); } while(0)
+#define JNE(t)  do {B_cond(NE,t); asm_output1("jne 0x%08x",(unsigned int)t); } while(0)
+#define JBE(t)  do {B_cond(LS,t); asm_output1("jbe 0x%08x",(unsigned int)t); } while(0)
 #define JNBE(t) do {B_cond(HI,t); asm_output1("jnbe 0x%08x",(unsigned int)t); } while(0)
-#define JAE(t)	do {B_cond(CS,t); asm_output1("jae 0x%08x",(unsigned int)t); } while(0)
+#define JAE(t)  do {B_cond(CS,t); asm_output1("jae 0x%08x",(unsigned int)t); } while(0)
 #define JNAE(t) do {B_cond(CC,t); asm_output1("jnae 0x%08x",(unsigned int)t); } while(0)
-#define JL(t)	do {B_cond(LT,t); asm_output1("jl 0x%08x",(unsigned int)t); } while(0)	
-#define JNL(t)	do {B_cond(GE,t); asm_output1("jnl 0x%08x",(unsigned int)t); } while(0)
-#define JLE(t)	do {B_cond(LE,t); asm_output1("jle 0x%08x",(unsigned int)t); } while(0)
-#define JNLE(t)	do {B_cond(GT,t); asm_output1("jnle 0x%08x",(unsigned int)t); } while(0)
-#define JGE(t)	do {B_cond(GE,t); asm_output1("jge 0x%08x",(unsigned int)t); } while(0)
-#define JNGE(t)	do {B_cond(LT,t); asm_output1("jnge 0x%08x",(unsigned int)t); } while(0)
-#define JG(t)	do {B_cond(GT,t); asm_output1("jg 0x%08x",(unsigned int)t); } while(0)	
-#define JNG(t)	do {B_cond(LE,t); asm_output1("jng 0x%08x",(unsigned int)t); } while(0)
-#define JC(t)	do {B_cond(CS,t); asm_output1("bcs 0x%08x",(unsigned int)t); } while(0)
-#define JNC(t)	do {B_cond(CC,t); asm_output1("bcc 0x%08x",(unsigned int)t); } while(0)
-#define JO(t)	do {B_cond(VS,t); asm_output1("bvs 0x%08x",(unsigned int)t); } while(0)
-#define JNO(t)	do {B_cond(VC,t); asm_output1("bvc 0x%08x",(unsigned int)t); } while(0)
+#define JL(t)   do {B_cond(LT,t); asm_output1("jl 0x%08x",(unsigned int)t); } while(0)
+#define JNL(t)  do {B_cond(GE,t); asm_output1("jnl 0x%08x",(unsigned int)t); } while(0)
+#define JLE(t)  do {B_cond(LE,t); asm_output1("jle 0x%08x",(unsigned int)t); } while(0)
+#define JNLE(t) do {B_cond(GT,t); asm_output1("jnle 0x%08x",(unsigned int)t); } while(0)
+#define JGE(t)  do {B_cond(GE,t); asm_output1("jge 0x%08x",(unsigned int)t); } while(0)
+#define JNGE(t) do {B_cond(LT,t); asm_output1("jnge 0x%08x",(unsigned int)t); } while(0)
+#define JG(t)   do {B_cond(GT,t); asm_output1("jg 0x%08x",(unsigned int)t); } while(0)
+#define JNG(t)  do {B_cond(LE,t); asm_output1("jng 0x%08x",(unsigned int)t); } while(0)
+#define JC(t)   do {B_cond(CS,t); asm_output1("bcs 0x%08x",(unsigned int)t); } while(0)
+#define JNC(t)  do {B_cond(CC,t); asm_output1("bcc 0x%08x",(unsigned int)t); } while(0)
+#define JO(t)   do {B_cond(VS,t); asm_output1("bvs 0x%08x",(unsigned int)t); } while(0)
+#define JNO(t)  do {B_cond(VC,t); asm_output1("bvc 0x%08x",(unsigned int)t); } while(0)
 
 // used for testing result of an FP compare
 // JP = comparison false
-#define JP(t)	do {B_cond(EQ,NE,t); asm_output1("jp 0x%08x",t); } while(0)	
+#define JP(t)   do {B_cond(EQ,NE,t); asm_output1("jp 0x%08x",t); } while(0)
 
 // JNP = comparison true
-#define JNP(t)	do {B_cond(NE,EQ,t); asm_output1("jnp 0x%08x",t); } while(0)
+#define JNP(t)  do {B_cond(NE,EQ,t); asm_output1("jnp 0x%08x",t); } while(0)
 
 
 // floating point
-#define FNSTSW_AX()	do {NanoAssert(0);		asm_output("fnstsw_ax"); } while(0)
-#define FFREE(r)	do {NanoAssert(0);		asm_output1("ffree %s",gpn(b)); } while(0)
-#define FSTQ(p,d,b)	do {NanoAssert(0);		asm_output2("fstq %d(%s)",d,gpn(b)); } while(0)
+#define FNSTSW_AX() do {NanoAssert(0);      asm_output("fnstsw_ax"); } while(0)
+#define FFREE(r)    do {NanoAssert(0);      asm_output1("ffree %s",gpn(b)); } while(0)
+#define FSTQ(p,d,b) do {NanoAssert(0);      asm_output2("fstq %d(%s)",d,gpn(b)); } while(0)
 #define FSTPQ(d,b)  FSTQ(1,d,b)
-//#define FSTPQ(d,b)	do {NanoAssert(0);		asm_output2("fstpq %d(%s)",d,gpn(b)); } while(0)
-#define FCOM(p,d,b)	do {NanoAssert(0);		asm_output2("fcom %d(%s)",d,gpn(b)); } while(0)
-#define FCOMP(d,b)	do {NanoAssert(0);		asm_output2("fcomp %d(%s)",d,gpn(b)); } while(0)
-#define FLDQ(d,b)	do {NanoAssert(0);		asm_output2("fldq %d(%s)",d,gpn(b)); } while(0)
-#define FILDQ(d,b)	do {NanoAssert(0);		asm_output2("fildq %d(%s)",d,gpn(b)); } while(0)
-#define FILD(d,b)	do {NanoAssert(0);		asm_output2("fild %d(%s)",d,gpn(b)); } while(0)
-#define FADD(d,b)	do {NanoAssert(0);		asm_output2("faddq %d(%s)",d,gpn(b)); } while(0)
-#define FSUB(d,b)	do {NanoAssert(0);		asm_output2("fsubq %d(%s)",d,gpn(b)); } while(0)
-#define FSUBR(d,b)	do {NanoAssert(0);		asm_output2("fsubr %d(%s)",d,gpn(b)); } while(0)
-#define FMUL(d,b)	do {NanoAssert(0);		asm_output2("fmulq %d(%s)",d,gpn(b)); } while(0)
-#define FDIV(d,b)	do {NanoAssert(0);		asm_output2("fdivq %d(%s)",d,gpn(b)); } while(0)
-#define FDIVR(d,b)	do {NanoAssert(0);		asm_output2("fdivr %d(%s)",d,gpn(b)); } while(0)
-#define FSTP(r)		do {NanoAssert(0);		asm_output1("fst st(%d)",r); } while(0)
-#define FLD1()		do {NanoAssert(0);		asm_output("fld1"); } while(0)
-#define FLDZ()		do {NanoAssert(0);		asm_output("fldz"); } while(0)
+//#define FSTPQ(d,b)    do {NanoAssert(0);      asm_output2("fstpq %d(%s)",d,gpn(b)); } while(0)
+#define FCOM(p,d,b) do {NanoAssert(0);      asm_output2("fcom %d(%s)",d,gpn(b)); } while(0)
+#define FCOMP(d,b)  do {NanoAssert(0);      asm_output2("fcomp %d(%s)",d,gpn(b)); } while(0)
+#define FLDQ(d,b)   do {NanoAssert(0);      asm_output2("fldq %d(%s)",d,gpn(b)); } while(0)
+#define FILDQ(d,b)  do {NanoAssert(0);      asm_output2("fildq %d(%s)",d,gpn(b)); } while(0)
+#define FILD(d,b)   do {NanoAssert(0);      asm_output2("fild %d(%s)",d,gpn(b)); } while(0)
+#define FADD(d,b)   do {NanoAssert(0);      asm_output2("faddq %d(%s)",d,gpn(b)); } while(0)
+#define FSUB(d,b)   do {NanoAssert(0);      asm_output2("fsubq %d(%s)",d,gpn(b)); } while(0)
+#define FSUBR(d,b)  do {NanoAssert(0);      asm_output2("fsubr %d(%s)",d,gpn(b)); } while(0)
+#define FMUL(d,b)   do {NanoAssert(0);      asm_output2("fmulq %d(%s)",d,gpn(b)); } while(0)
+#define FDIV(d,b)   do {NanoAssert(0);      asm_output2("fdivq %d(%s)",d,gpn(b)); } while(0)
+#define FDIVR(d,b)  do {NanoAssert(0);      asm_output2("fdivr %d(%s)",d,gpn(b)); } while(0)
+#define FSTP(r)     do {NanoAssert(0);      asm_output1("fst st(%d)",r); } while(0)
+#define FLD1()      do {NanoAssert(0);      asm_output("fld1"); } while(0)
+#define FLDZ()      do {NanoAssert(0);      asm_output("fldz"); } while(0)
 
 
 
 // MOV(EQ) _r, #1 
 // EOR(NE) _r, _r
-#define SET(_r,_cond,_opp)\
-	underrunProtect(8);								\
-	*(--_nIns) = (NIns)( (_opp<<28) | (1<<21) | ((_r)<<16) | ((_r)<<12) | (_r) );\
-	*(--_nIns) = (NIns)( (_cond<<28) | (0x3A<<20) | ((_r)<<12) | (1) );
+#define SET(_r,_cond,_opp)                                              \
+    underrunProtect(8);                                                 \
+    *(--_nIns) = (NIns)( (_opp<<28) | (1<<21) | ((_r)<<16) | ((_r)<<12) | (_r) ); \
+    *(--_nIns) = (NIns)( (_cond<<28) | (0x3A<<20) | ((_r)<<12) | (1) );
 
 
-#define SETE(r)		do {SET(r,EQ,NE); asm_output1("sete %s",gpn(r)); } while(0)
-#define SETL(r)		do {SET(r,LT,GE); asm_output1("setl %s",gpn(r)); } while(0)
-#define SETLE(r)	do {SET(r,LE,GT); asm_output1("setle %s",gpn(r)); } while(0)
-#define SETG(r)		do {SET(r,GT,LE); asm_output1("setg %s",gpn(r)); } while(0)
-#define SETGE(r)	do {SET(r,GE,LT); asm_output1("setge %s",gpn(r)); } while(0)
-#define SETB(r)		do {SET(r,CC,CS); asm_output1("setb %s",gpn(r)); } while(0)
-#define SETBE(r)	do {SET(r,LS,HI); asm_output1("setb %s",gpn(r)); } while(0)
-#define SETAE(r)	do {SET(r,CS,CC); asm_output1("setae %s",gpn(r)); } while(0)
-#define SETA(r)		do {SET(r,HI,LS); asm_output1("seta %s",gpn(r)); } while(0)
-#define SETO(r)		do {SET(r,VS,LS); asm_output1("seto %s",gpn(r)); } while(0)
-#define SETC(r)		do {SET(r,CS,LS); asm_output1("setc %s",gpn(r)); } while(0)
+#define SETE(r)     do {SET(r,EQ,NE); asm_output1("sete %s",gpn(r)); } while(0)
+#define SETL(r)     do {SET(r,LT,GE); asm_output1("setl %s",gpn(r)); } while(0)
+#define SETLE(r)    do {SET(r,LE,GT); asm_output1("setle %s",gpn(r)); } while(0)
+#define SETG(r)     do {SET(r,GT,LE); asm_output1("setg %s",gpn(r)); } while(0)
+#define SETGE(r)    do {SET(r,GE,LT); asm_output1("setge %s",gpn(r)); } while(0)
+#define SETB(r)     do {SET(r,CC,CS); asm_output1("setb %s",gpn(r)); } while(0)
+#define SETBE(r)    do {SET(r,LS,HI); asm_output1("setbe %s",gpn(r)); } while(0)
+#define SETAE(r)    do {SET(r,CS,CC); asm_output1("setae %s",gpn(r)); } while(0)
+#define SETA(r)     do {SET(r,HI,LS); asm_output1("seta %s",gpn(r)); } while(0)
+#define SETO(r)     do {SET(r,VS,VC); asm_output1("seto %s",gpn(r)); } while(0)
+#define SETC(r)     do {SET(r,CS,CC); asm_output1("setc %s",gpn(r)); } while(0)
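+// Usage sketch: SETE(R0) emits "moveq r0, #1" followed by "eorne r0, r0, r0",
+// so R0 ends up 1 if the EQ condition held and 0 otherwise.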
 
 // This zero-extends a reg that has been set using one of the SET macros,
 // but is a NOOP on ARM/Thumb
 #define MOVZX8(r,r2)
 
 // Load and sign extend a 16-bit value into a reg
-#define MOVSX(_d,_off,_b) do{\
-	if ((_off)>=0){\
-		if ((_off)<256){\
-			underrunProtect(4);\
-			*(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_b)<<16) | ((_d)<<12) |  ((((_off)>>4)&0xF)<<8) | (0xF<<4) | ((_off)&0xF)  );}\
-		else if ((_off)<=510) {\
-			underrunProtect(8);\
-			int rem = (_off) - 255;\
-			NanoAssert(rem<256);\
-			*(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_d)<<16) | ((_d)<<12) |  ((((rem)>>4)&0xF)<<8) | (0xF<<4) | ((rem)&0xF)  );\
-			*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_b)<<16) | ((_d)<<12) | (0xFF) );}\
-		else {\
-			underrunProtect(16);\
-			int rem = (_off) & 3;\
-			*(--_nIns) = (NIns)( COND_AL | (0x19<<20) | ((_b)<<16) | ((_d)<<12) | (0xF<<4) | (_d) );\
-			asm_output3("ldrsh %s,[%s, #%d]",gpn(_d), gpn(_b), (_off));\
-			*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_d)<<16) | ((_d)<<12) | rem );\
-			*(--_nIns) = (NIns)( COND_AL | (0x1A<<20) | ((_d)<<12) | (2<<7)| (_d) );\
-			*(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | ((_d)<<12) | (((_off)>>2)&0xFF) );\
-			asm_output2("mov %s,%d",gpn(_d),(_off));}}\
-	else {\
-		if ((_off)>-256) {\
-			underrunProtect(4);\
-			*(--_nIns) = (NIns)( COND_AL | (0x15<<20) | ((_b)<<16) | ((_d)<<12) |  ((((-(_off))>>4)&0xF)<<8) | (0xF<<4) | ((-(_off))&0xF)  );\
-			asm_output3("ldrsh %s,[%s, #%d]",gpn(_d), gpn(_b), (_off));}\
-		else if ((_off)>=-510){\
-			underrunProtect(8);\
-			int rem = -(_off) - 255;\
-			NanoAssert(rem<256);\
-			*(--_nIns) = (NIns)( COND_AL | (0x15<<20) | ((_d)<<16) | ((_d)<<12) |  ((((rem)>>4)&0xF)<<8) | (0xF<<4) | ((rem)&0xF)  );\
-			*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_b)<<16) | ((_d)<<12) | (0xFF) );}\
-		else NanoAssert(0);\
-	}\
-} while(0)
-
-#define STMIA(_b, _mask) do {\
-		  underrunProtect(2);\
-		  NanoAssert(((_mask)&rmask(_b))==0 && isU8(_mask));\
-      *(--_nIns) = (NIns)(COND_AL | (0x8A<<20) | ((_b)<<16) | (_mask)&0xFF);\
-      asm_output2("stmia %s!,{%x}", gpn(_b), _mask);} while (0)
+#define MOVSX(_d,_off,_b) do {                                          \
+        if ((_off)>=0) {                                                \
+            if ((_off)<256) {                                           \
+                underrunProtect(4);                                     \
+                *(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_b)<<16) | ((_d)<<12) |  ((((_off)>>4)&0xF)<<8) | (0xF<<4) | ((_off)&0xF)  ); \
+            } else if ((_off)<=510) {                                   \
+                underrunProtect(8);                                     \
+                int rem = (_off) - 255;                                 \
+                NanoAssert(rem<256);                                    \
+                *(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_d)<<16) | ((_d)<<12) |  ((((rem)>>4)&0xF)<<8) | (0xF<<4) | ((rem)&0xF)  ); \
+                *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_b)<<16) | ((_d)<<12) | (0xFF) ); \
+            } else {                                                    \
+                underrunProtect(16);                                    \
+                int rem = (_off) & 3;                                   \
+                *(--_nIns) = (NIns)( COND_AL | (0x19<<20) | ((_b)<<16) | ((_d)<<12) | (0xF<<4) | (_d) ); \
+                asm_output3("ldrsh %s,[%s, #%d]",gpn(_d), gpn(_b), (_off)); \
+                *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_d)<<16) | ((_d)<<12) | rem ); \
+                *(--_nIns) = (NIns)( COND_AL | (0x1A<<20) | ((_d)<<12) | (2<<7)| (_d) ); \
+                *(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | ((_d)<<12) | (((_off)>>2)&0xFF) ); \
+                asm_output2("mov %s,%d",gpn(_d),(_off));                \
+            }                                                           \
+        } else {                                                        \
+            if ((_off)>-256) {                                          \
+                underrunProtect(4);                                     \
+                *(--_nIns) = (NIns)( COND_AL | (0x15<<20) | ((_b)<<16) | ((_d)<<12) |  ((((-(_off))>>4)&0xF)<<8) | (0xF<<4) | ((-(_off))&0xF)  ); \
+                asm_output3("ldrsh %s,[%s, #%d]",gpn(_d), gpn(_b), (_off)); \
+            } else if ((_off)>=-510) {                                  \
+                underrunProtect(8);                                     \
+                int rem = -(_off) - 255;                                \
+                NanoAssert(rem<256);                                    \
+                *(--_nIns) = (NIns)( COND_AL | (0x15<<20) | ((_d)<<16) | ((_d)<<12) |  ((((rem)>>4)&0xF)<<8) | (0xF<<4) | ((rem)&0xF)  ); \
+                *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_b)<<16) | ((_d)<<12) | (0xFF) ); \
+            } else NanoAssert(0);                                       \
+        }                                                               \
+    } while(0)
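+// Usage sketch: MOVSX(R0, 4, SP) assembles to "ldrsh r0, [sp, #4]"; offsets
+// beyond the 8-bit halfword range are split into an ADD/SUB of the base plus
+// a second ldrsh on the remainder.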
 
-#define LDMIA(_b, _mask) do {\
-      underrunProtect(2);\
- 		  NanoAssert(((_mask)&rmask(_b))==0 && isU8(_mask));\
-     *(--_nIns) = (NIns)(COND_AL | (0x8B<<20) | ((_b)<<16) | (_mask)&0xFF);\
-      asm_output2("ldmia %s!,{%x}", gpn(_b), (_mask));} while (0)
+#define STMIA(_b, _mask) do {                                           \
+        underrunProtect(2);                                             \
+        NanoAssert(((_mask)&rmask(_b))==0 && isU8(_mask));              \
+        *(--_nIns) = (NIns)(COND_AL | (0x8A<<20) | ((_b)<<16) | (_mask)&0xFF); \
+        asm_output2("stmia %s!,{%x}", gpn(_b), _mask); \
+    } while (0)
 
-/*
-#define MOVSX(_d,_off,_b) do{\
-	if ((_b)==SP){\
-		NanoAssert( (_off)>=0 );\
-		if ((_off)<256){\
-			underrunProtect(4);\
-			*(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_b)<<16) | ((_d)<<12) |  ((((_off)>>4)&0xF)<<8) | (0xF<<4) | ((_off)&0xF)  );}\
-		else if ((_off)<=510) {\
-			underrunProtect(8);\
-			int rem = (_off) - 255;\
-			NanoAssert(rem<256);\
-			*(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_d)<<16) | ((_d)<<12) |  ((((rem)>>4)&0xF)<<8) | (0xF<<4) | ((rem)&0xF)  );\
-			*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_b)<<16) | ((_d)<<12) | (0xFF) );}\
-		else {\
-			underrunProtect(16);\
-			int rem = (_off) & 3;\
-			*(--_nIns) = (NIns)( COND_AL | (0x19<<20) | ((_b)<<16) | ((_d)<<12) | (0xF<<4) | (_d) );\
-			asm_output3("ldrsh %s,[%s, #%d]",gpn(_d), gpn(_b), (_off));\
-			*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_d)<<16) | ((_d)<<12) | rem );\
-			*(--_nIns) = (NIns)( COND_AL | (0x1A<<20) | ((_d)<<12) | (2<<7)| (_d) );\
-			*(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | ((_d)<<12) | (((_off)>>2)&0xFF) );\
-			asm_output2("mov %s,%d",gpn(_d),(_off));}}\
-	else {\
-		if ((_off)>=0){\
-			if ((_off)<256) {\
-				underrunProtect(4);							\
-				*(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_b)<<16) | ((_d)<<12) |  ((((_off)>>4)&0xF)<<8) | (0xF<<4) | ((_off)&0xF)  );\
-				asm_output3("ldrsh %s,[%s, #%d]",gpn(_d), gpn(_b), (_off));}\
-			else if ((_off)<=510) {\
-				underrunProtect(8);\
-				int rem = (_off) - 255;\
-				NanoAssert(rem<256);\
-				*(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_d)<<16) | ((_d)<<12) |  ((((rem)>>4)&0xF)<<8) | (0xF<<4) | ((rem)&0xF)  );\
-				*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_b)<<16) | ((_d)<<12) | (0xFF) );}\
-			else NanoAssert(0);}\
-		else {\
-			if ((_off)>-256) {\
-				*(--_nIns) = (NIns)( COND_AL | (0x15<<20) | ((_b)<<16) | ((_d)<<12) |  ((((-(_off))>>4)&0xF)<<8) | (0xF<<4) | ((-(_off))&0xF)  );\
-				asm_output3("ldrsh %s,[%s, #%d]",gpn(_d), gpn(_b), (_off));}\
-			else {}}\
-	} while(0)
-*/
-
+#define LDMIA(_b, _mask) do {                                           \
+        underrunProtect(2);                                             \
+        NanoAssert(((_mask)&rmask(_b))==0 && isU8(_mask));              \
+        *(--_nIns) = (NIns)(COND_AL | (0x8B<<20) | ((_b)<<16) | (_mask)&0xFF); \
+        asm_output2("ldmia %s!,{%x}", gpn(_b), (_mask)); \
+    } while (0)
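+// Usage sketch: STMIA(SP, 0x03) stores {r0,r1} ascending with writeback
+// ("stmia sp!,{r0,r1}"); the assert keeps the base register out of the
+// transfer mask, and LDMIA is the matching load form.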
 }
 #endif // __nanojit_NativeThumb__