Add VFP for floating point ops to nanojit ARM backend.
author Vladimir Vukicevic <vladimir@pobox.com>
Tue, 02 Sep 2008 22:29:23 -0700
changeset 18776 b6d60356a49cc571ebfab99c55aca0006b7c939d
parent 18775 a94e3a1d900916db774e7e16c86a65e10468b0cf
child 18777 aa13d8f4c3a7158b5039dc4fed5edafd9df53028
push id 1711
push user brendan@mozilla.com
push date Thu, 04 Sep 2008 08:26:45 +0000
milestone 1.9.1b1pre
js/src/jstracer.cpp
js/src/jstracer.h
js/src/nanojit/Assembler.cpp
js/src/nanojit/LIR.cpp
js/src/nanojit/NativeARM.cpp
js/src/nanojit/NativeARM.h
js/src/nanojit/RegAlloc.h
js/src/nanojit/nanojit.h
--- a/js/src/jstracer.cpp
+++ b/js/src/jstracer.cpp
@@ -114,17 +114,17 @@ static GC gc = GC();
 static avmplus::AvmCore s_core = avmplus::AvmCore();
 static avmplus::AvmCore* core = &s_core;
 
 /* We really need a better way to configure the JIT. Shaver, where is my fancy JIT object? */
 static bool nesting_enabled = true;
 static bool oracle_enabled = true;
 static bool did_we_check_sse2 = false;
 
-#ifdef DEBUG
+#if defined(DEBUG) || defined(INCLUDE_VERBOSE_OUTPUT)
 static bool verbose_debug = getenv("TRACEMONKEY") && strstr(getenv("TRACEMONKEY"), "verbose");
 #define debug_only_v(x) if (verbose_debug) { x; }
 #else
 #define debug_only_v(x)
 #endif
 
 /* The entire VM shares one oracle. Collisions and concurrent updates are tolerated and worst
    case cause performance regressions. */
@@ -277,17 +277,17 @@ Oracle::clear()
     _dontDemote.reset();
 }
 
 static bool isi2f(LInsp i)
 {
     if (i->isop(LIR_i2f))
         return true;
 
-#ifdef NANOJIT_ARM
+#if defined(NANOJIT_ARM) && defined(NJ_SOFTFLOAT)
     if (i->isop(LIR_qjoin) &&
         i->oprnd1()->isop(LIR_call) &&
         i->oprnd2()->isop(LIR_callh))
     {
         if (i->oprnd1()->imm8() == F_i2f)
             return true;
     }
 #endif
@@ -295,32 +295,32 @@ static bool isi2f(LInsp i)
     return false;
 }
 
 static bool isu2f(LInsp i)
 {
     if (i->isop(LIR_u2f))
         return true;
 
-#ifdef NANOJIT_ARM
+#if defined(NANOJIT_ARM) && defined(NJ_SOFTFLOAT)
     if (i->isop(LIR_qjoin) &&
         i->oprnd1()->isop(LIR_call) &&
         i->oprnd2()->isop(LIR_callh))
     {
         if (i->oprnd1()->imm8() == F_u2f)
             return true;
     }
 #endif
 
     return false;
 }
 
 static LInsp iu2fArg(LInsp i)
 {
-#ifdef NANOJIT_ARM
+#if defined(NANOJIT_ARM) && defined(NJ_SOFTFLOAT)
     if (i->isop(LIR_qjoin))
         return i->oprnd1()->arg(0);
 #endif
 
     return i->oprnd1();
 }
 
 
@@ -366,17 +366,17 @@ static bool overflowSafe(LIns* i)
 {
     LIns* c;
     return (i->isop(LIR_and) && ((c = i->oprnd2())->isconst()) &&
             ((c->constval() & 0xc0000000) == 0)) ||
            (i->isop(LIR_rsh) && ((c = i->oprnd2())->isconst()) &&
             ((c->constval() > 0)));
 }
 
-#ifdef NANOJIT_ARM
+#if defined(NJ_SOFTFLOAT)
 
 class SoftFloatFilter: public LirWriter
 {
 public:
     SoftFloatFilter(LirWriter* out):
         LirWriter(out)
     {
     }
@@ -423,44 +423,31 @@ public:
 
             args[0] = s1;
             args[1] = s0;
 
             bv = out->insCall(fmap[v - LIR_feq], args);
             return out->ins2(LIR_eq, bv, out->insImm(1));
         }
 
-        // not really a softfloat filter, but needed on ARM --
-        // arm doesn't mask shifts to 31 like x86 does
-        if (v == LIR_lsh ||
-            v == LIR_rsh ||
-            v == LIR_ush)
-        {
-            if (s1->isconst())
-                s1->setimm16(s1->constval() & 31);
-            else
-                s1 = out->ins2(LIR_and, s1, out->insImm(31));
-            return out->ins2(v, s0, s1);
-        }
-
         return out->ins2(v, s0, s1);
     }
 
     LInsp insCall(uint32_t fid, LInsp args[])
     {
         // if the return type is ARGSIZE_F, we have
         // to do a quadCall ( qjoin(call,callh) )
         if ((builtins[fid]._argtypes & 3) == ARGSIZE_F)
             return quadCall(fid, args);
 
         return out->insCall(fid, args);
     }
 };
 
-#endif
+#endif // NJ_SOFTFLOAT
 
 class FuncFilter: public LirWriter
 {
     TraceRecorder& recorder;
 public:
     FuncFilter(LirWriter* out, TraceRecorder& _recorder):
         LirWriter(out), recorder(_recorder)
     {
@@ -545,16 +532,30 @@ public:
                 isconst(msw->oprnd1()->oprnd1()->oprnd2(), 16) &&
                 isconst(msw->oprnd1()->oprnd2()->oprnd2(), 16) &&
                 (x = lsw->oprnd1()->oprnd1()) == msw->oprnd1()->oprnd1()->oprnd1() &&
                 (y = lsw->oprnd2()->oprnd1()) == msw->oprnd1()->oprnd2()->oprnd1() &&
                 lsw == msw->oprnd2()->oprnd1()) {
                 return out->ins2(LIR_add, x, y);
             }
         }
+#ifdef NANOJIT_ARM
+        else if (v == LIR_lsh ||
+                 v == LIR_rsh ||
+                 v == LIR_ush)
+        {
+            // needed on ARM -- arm doesn't mask shifts to 31 like x86 does
+            if (s1->isconst())
+                s1->setimm16(s1->constval() & 31);
+            else
+                s1 = out->ins2(LIR_and, s1, out->insImm(31));
+            return out->ins2(v, s0, s1);
+        }
+#endif
+
         return out->ins2(v, s0, s1);
     }
 
     LInsp insCall(uint32_t fid, LInsp args[])
     {
         LInsp s0 = args[0];
         switch (fid) {
           case F_DoubleToUint32:
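
The masking above exists because JavaScript's shift semantics happen to match x86 hardware, which truncates the shift count to 5 bits, while ARM's register-specified shifts consume the low 8 bits of the count, so a count of 33 shifts everything out. A small illustration of the semantics the filter enforces (plain C++; oversized shifts are undefined in C++, so the mask is written out explicitly, which is exactly what the emitted LIR_and does):

#include <stdint.h>

// Shift with the count masked to 0..31, matching x86 hardware and JS semantics.
static uint32_t shl_js(uint32_t x, uint32_t count) {
    return x << (count & 31);   // constant counts are folded via setimm16 instead
}
// On ARM, emitting the shift without the mask would give shl_js(1, 33) == 0
// rather than the expected 2.
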
@@ -599,17 +600,17 @@ public:
             break;
         }
         return out->insCall(fid, args);
     }
 };
 
 /* In debug mode vpname contains a textual description of the type of the
    slot during the forall iteration over all slots. */
-#ifdef DEBUG
+#if defined(DEBUG) || defined(INCLUDE_VERBOSE_OUTPUT)
 #define DEF_VPNAME          const char* vpname; unsigned vpnum
 #define SET_VPNAME(name)    do { vpname = name; vpnum = 0; } while(0)
 #define INC_VPNUM()         do { ++vpnum; } while(0)
 #else
 #define DEF_VPNAME          do {} while (0)
 #define vpname ""
 #define vpnum 0
 #define SET_VPNAME(name)    ((void)0)
@@ -816,17 +817,17 @@ TraceRecorder::TraceRecorder(JSContext* 
                         js_PCToLineNumber(cx, cx->fp->script, cx->fp->regs->pc),
                         cx->fp->regs->pc - cx->fp->script->code););
 
     lir = lir_buf_writer = new (&gc) LirBufWriter(lirbuf);
 #ifdef DEBUG
     if (verbose_debug)
         lir = verbose_filter = new (&gc) VerboseWriter(&gc, lir, lirbuf->names);
 #endif
-#ifdef NANOJIT_ARM
+#ifdef NJ_SOFTFLOAT
     lir = float_filter = new (&gc) SoftFloatFilter(lir);
 #endif
     lir = cse_filter = new (&gc) CseFilter(lir, &gc);
     lir = expr_filter = new (&gc) ExprFilter(lir);
     lir = func_filter = new (&gc) FuncFilter(lir, *this);
     lir->ins0(LIR_trace);
 
     if (!nanojit::AvmCore::config.tree_opt || fragment->root == fragment) {
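
The constructor above assembles the LirWriter pipeline as a chain of decorators: each filter wraps the current lir and forwards possibly-rewritten instructions downstream, so an instruction emitted by the recorder flows FuncFilter -> ExprFilter -> CseFilter -> SoftFloatFilter (soft-float builds only) -> VerboseWriter (debug only) -> LirBufWriter. A stripped-down sketch of the pattern (hypothetical simplified interface, not the real nanojit classes):

// Decorator-style writer chain (sketch).
struct Writer {
    Writer* out;                                  // next writer downstream
    explicit Writer(Writer* w) : out(w) {}
    virtual ~Writer() {}
    virtual int ins2(int op, int a, int b) {      // default: pass through
        return out ? out->ins2(op, a, b) : op;    // out == 0 only for the sink
    }
};

struct ShiftMaskWriter : Writer {                 // stands in for FuncFilter
    explicit ShiftMaskWriter(Writer* w) : Writer(w) {}
    int ins2(int op, int a, int b) override {
        // rewrite (op, a, b) here if needed, then forward it
        return Writer::ins2(op, a, b);
    }
};
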
@@ -862,17 +863,17 @@ TraceRecorder::~TraceRecorder()
     if (trashTree)
         js_TrashTree(cx, whichTreeToTrash);
 #ifdef DEBUG
     delete verbose_filter;
 #endif
     delete cse_filter;
     delete expr_filter;
     delete func_filter;
-#ifdef NANOJIT_ARM
+#ifdef NJ_SOFTFLOAT
     delete float_filter;
 #endif
     delete lir_buf_writer;
 }
 
 /* Add debug information to a LIR instruction as we emit it. */
 inline LIns*
 TraceRecorder::addName(LIns* ins, const char* name)
@@ -2272,19 +2273,21 @@ js_ExecuteTree(JSContext* cx, Fragment**
     state.gp = global;
     state.cx = cx;
 #ifdef DEBUG
     state.nestedExit = NULL;
 #endif    
     union { NIns *code; GuardRecord* (FASTCALL *func)(InterpState*, Fragment*); } u;
     u.code = f->code();
 
-#if defined(DEBUG) && defined(NANOJIT_IA32)
+#ifdef DEBUG
+#if defined(NANOJIT_IA32)
     uint64 start = rdtsc();
 #endif
+#endif
 
     /*
      * We may be called from js_MonitorLoopEdge while not recording, or while
      * recording. Rather than over-generalize by using a counter instead of a
      * flag, we simply sample and update cx->executingTrace if necessary.
      */
     bool executingTrace = cx->executingTrace;
     if (!executingTrace)
@@ -2357,29 +2360,28 @@ js_ExecuteTree(JSContext* cx, Fragment**
     /* If we are not exiting from an inlined frame the state->sp is spbase, otherwise spbase
        is whatever slots frames around us consume. */
     fp->regs->pc = (jsbytecode*)lr->from->root->ip + e->ip_adj;
     fp->regs->sp = StackBase(fp) + (e->sp_adj / sizeof(double)) - calldepth_slots;
     JS_ASSERT(fp->slots + fp->script->nfixed +
               js_ReconstructStackDepth(cx, fp->script, fp->regs->pc) == fp->regs->sp);
 
 #if defined(DEBUG) && defined(NANOJIT_IA32)
-    if (verbose_debug) {
-        printf("leaving trace at %s:%u@%u, op=%s, lr=%p, exitType=%d, sp=%d, ip=%p, "
-               "cycles=%llu\n",
-               fp->script->filename, js_PCToLineNumber(cx, fp->script, fp->regs->pc),
-               fp->regs->pc - fp->script->code,
-               js_CodeName[*fp->regs->pc],
-               lr,
-               lr->exit->exitType,
-               fp->regs->sp - StackBase(fp), lr->jmp,
-               (rdtsc() - start));
-    }
+    uint64 cycles = rdtsc() - start;
+#else
+    uint64 cycles = 0;
 #endif
 
+    debug_only_v(printf("leaving trace at %s:%u@%u, exitType=%d, sp=%d, ip=%p, cycles=%llu\n",
+                        fp->script->filename, js_PCToLineNumber(cx, fp->script, fp->regs->pc),
+                        fp->regs->pc - fp->script->code,
+                        lr->exit->exitType,
+                        fp->regs->sp - StackBase(fp), lr->jmp,
+                        cycles));
+
     /* If this trace is part of a tree, later branches might have added additional globals for
        which we don't have any type information available in the side exit. We merge in this
        information from the entry type-map. See also comment in the constructor of TraceRecorder
        why this is always safe to do. */
     unsigned exit_gslots = e->numGlobalSlots;
     JS_ASSERT(ngslots == tm->globalTypeMap->length());
     JS_ASSERT(ngslots >= exit_gslots);
     uint8* globalTypeMap = e->typeMap;
--- a/js/src/jstracer.h
+++ b/js/src/jstracer.h
@@ -216,17 +216,17 @@ class TraceRecorder {
     TreeInfo*               treeInfo;
     nanojit::LirBuffer*     lirbuf;
     nanojit::LirWriter*     lir;
     nanojit::LirBufWriter*  lir_buf_writer;
     nanojit::LirWriter*     verbose_filter;
     nanojit::LirWriter*     cse_filter;
     nanojit::LirWriter*     expr_filter;
     nanojit::LirWriter*     func_filter;
-#ifdef NANOJIT_ARM
+#ifdef NJ_SOFTFLOAT
     nanojit::LirWriter*     float_filter;
 #endif
     nanojit::LIns*          cx_ins;
     nanojit::LIns*          gp_ins;
     nanojit::LIns*          eos_ins;
     nanojit::LIns*          eor_ins;
     nanojit::LIns*          rval_ins;
     nanojit::LIns*          inner_sp_ins;
--- a/js/src/nanojit/Assembler.cpp
+++ b/js/src/nanojit/Assembler.cpp
@@ -39,16 +39,17 @@
 #include "nanojit.h"
 
 #ifdef AVMPLUS_PORTING_API
 #include "portapi_nanojit.h"
 #endif
 
 #if defined(AVMPLUS_LINUX) && defined(AVMPLUS_ARM)
 #include <asm/unistd.h>
+extern "C" void __clear_cache(char *BEG, char *END);
 #endif
 
 namespace nanojit
 {
 	#ifdef FEATURE_NANOJIT
 
 
 	class DeadCodeFilter: public LirFilter
@@ -173,16 +174,18 @@ namespace nanojit
 		    regs.used |= rmask(r);
 		    return r;
         }
 		counter_increment(steals);
 
 		// nothing free, steal one 
 		// LSRA says pick the one with the furthest use
 		LIns* vic = findVictim(regs,allow,prefer);
+		NanoAssert(vic != NULL);
+
 	    Reservation* resv = getresv(vic);
 
 		// restore vic
 	    Register r = resv->reg;
         regs.removeActive(r);
         resv->reg = UnknownReg;
 
 		asm_restore(vic, resv, r);
@@ -441,35 +444,47 @@ namespace nanojit
 		return findRegFor(i, rmask(w));
 	}
 			
 	Register Assembler::findRegFor(LIns* i, RegisterMask allow)
 	{
 		Reservation* resv = getresv(i);
 		Register r;
 
+		// if we have an existing reservation and it has a non-unknown
+		// register allocated, and that register is in our allowed mask,
+		// return it.
         if (resv && (r=resv->reg) != UnknownReg && (rmask(r) & allow)) {
 			return r;
         }
 
+		// figure out what registers are preferred for this instruction
 		RegisterMask prefer = hint(i, allow);
+
+		// if we didn't have a reservation, allocate one now
 		if (!resv) 	
 			resv = reserveAlloc(i);
 
+		// if the reservation doesn't have a register assigned to it...
         if ((r=resv->reg) == UnknownReg)
 		{
+			// .. if the cost is 2 and the allowed mask includes
+			// the saved regs, then prefer just those.
             if (resv->cost == 2 && (allow&SavedRegs))
                 prefer = allow&SavedRegs;
+			// grab one.
 			r = resv->reg = registerAlloc(prefer);
 			_allocator.addActive(r, i);
 			return r;
 		}
 		else 
 		{
-			// r not allowed
+			// the already-allocated register isn't in the allowed mask;
+			// we need to grab a new one and then copy over the old
+			// contents to the new.
 			resv->reg = UnknownReg;
 			_allocator.retire(r);
             if (resv->cost == 2 && (allow&SavedRegs))
                 prefer = allow&SavedRegs;
 			Register s = resv->reg = registerAlloc(prefer);
 			_allocator.addActive(s, i);
             if ((rmask(r) & GpRegs) && (rmask(s) & GpRegs)) {
     			MR(r, s);
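
Condensing the commented logic above into a standalone toy (a sketch with bitmask registers, a simplified Reservation, and hypothetical alloc callbacks; not nanojit's actual API):

#include <stdint.h>

struct Resv { int reg; int cost; };              // reg == -1 models UnknownReg
static const int kUnknown = -1;

// Toy model of findRegFor's register selection (sketch).
static int pick(Resv& resv, uint32_t allow, uint32_t saved,
                uint32_t prefer, int (*alloc)(uint32_t))
{
    if (resv.reg != kUnknown && (allow & (1u << resv.reg)))
        return resv.reg;                         // reuse the existing register
    if (resv.cost == 2 && (allow & saved))
        prefer = allow & saved;                  // bias toward callee-saved regs
    return resv.reg = alloc(prefer);             // allocate (or re-allocate)
}
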
@@ -790,22 +805,25 @@ namespace nanojit
 		_branchStateMap = 0;
 
 #ifdef AVMPLUS_ARM
 		// If we've modified the code, we need to flush so we don't end up trying 
 		// to execute junk
 # if defined(UNDER_CE)
 		FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
 # elif defined(AVMPLUS_LINUX)
-		// XXX fixme flush adjacent pages together
 		for (int i = 0; i < 2; i++) {
 			Page *p = (i == 0) ? _nativePages : _nativeExitPages;
 
+			Page *first = p;
 			while (p) {
-				flushCache((NIns*)p, (NIns*)((intptr_t)(p) + NJ_PAGE_SIZE));
+				if (!p->next || p->next != p+1) {
+					__clear_cache((char*)first, (char*)(p+1));
+					first = p->next;
+				}
 				p = p->next;
 			}
 		}
 # endif
 #endif
 
 # ifdef AVMPLUS_PORTING_API
 		NanoJIT_PortAPI_FlushInstructionCache(_nIns, _endJit1Addr);
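
The rewritten loop replaces the old page-at-a-time flush: it keeps first at the start of the current run of physically-contiguous pages and issues one __clear_cache call per run, relying on p->next == p+1 holding exactly when two pages are adjacent in memory. The same traversal as a standalone sketch (toy Page type; in the real code a Page is page-sized, which is what makes the p+1 test meaningful):

extern "C" void __clear_cache(char* beg, char* end);

struct Page { Page* next; /* code bytes follow */ };

// Flush each maximal run of contiguous pages with a single call (sketch).
static void flush_page_list(Page* p) {
    Page* first = p;
    while (p) {
        if (!p->next || p->next != p + 1) {   // the contiguous run ends here
            __clear_cache((char*)first, (char*)(p + 1));
            first = p->next;
        }
        p = p->next;
    }
}
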
@@ -847,17 +865,17 @@ namespace nanojit
 		 
 		for (LInsp ins = reader->read(); ins != 0 && !error(); ins = reader->read())
 		{
     		Reservation *rR = getresv(ins);
 			LOpcode op = ins->opcode();			
 			switch(op)
 			{
 				default:
-					NanoAssertMsgf(false, ("unsupported LIR instruction: %d (~0x40: %d)\n",op, op&~LIR64));
+					NanoAssertMsgf(false, "unsupported LIR instruction: %d (~0x40: %d)\n", op, op&~LIR64);
 					break;
 					
 				case LIR_short:
 				case LIR_int:
 				{
 					Register rr = prepResultReg(ins, GpRegs);
 					int32_t val;
 					if (op == LIR_int)
@@ -1203,23 +1221,30 @@ namespace nanojit
 				case LIR_xf:
 				{
                     NIns* exit = asm_exit(ins);
 	
 					// we only support cmp with guard right now, also assume it is 'close' and only emit the branch
 					LIns* cond = ins->oprnd1();
 					LOpcode condop = cond->opcode();
 					NanoAssert(cond->isCond());
-#ifndef NJ_SOFTFLOAT
+#if !defined(NJ_SOFTFLOAT)
                     if (condop >= LIR_feq && condop <= LIR_fge)
 					{
+#if defined(NJ_ARM_VFP)
+						if (op == LIR_xf)
+							JNE(exit);
+						else
+							JE(exit);
+#else
 						if (op == LIR_xf)
 							JP(exit);
 						else
 							JNP(exit);
+#endif
 						asm_fcmp(cond);
                         break;
 					}
 #endif
 					// produce the branch
 					if (op == LIR_xf)
 					{
 						if (condop == LIR_eq)
@@ -1308,19 +1333,23 @@ namespace nanojit
 				case LIR_feq:
 				case LIR_fle:
 				case LIR_flt:
 				case LIR_fgt:
 				case LIR_fge:
 				{
 					// only want certain regs 
 					Register r = prepResultReg(ins, AllowableFlagRegs);
+#ifdef NJ_ARM_VFP
+					SETE(r);
+#else
 					// SETcc only sets low 8 bits, so extend 
 					MOVZX8(r,r);
 					SETNP(r);
+#endif
 					asm_fcmp(ins);
 					break;
 				}
 #endif
 				case LIR_eq:
                 case LIR_ov:
                 case LIR_cs:
 				case LIR_le:
@@ -1432,18 +1461,23 @@ namespace nanojit
         else
 		{
 			asm_farg(p);
 		}
     }
 
 	uint32_t Assembler::arFree(uint32_t idx)
 	{
+		// nothing to free
+		if (idx == 0)
+			return 0;
+
 		if (idx > 0 && _activation.entry[idx] == _activation.entry[idx+stack_direction(1)])
 			_activation.entry[idx+stack_direction(1)] = 0;  // clear 2 slots for doubles 
+
 		_activation.entry[idx] = 0;
 		return 0;
 	}
 
 #ifdef NJ_VERBOSE
 	void Assembler::printActivationState()
 	{
 		bool verbose_activation = false;
--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@@ -371,18 +371,16 @@ namespace nanojit
 		l->initOpcode(LIR_param);
 		l->c.imm8a = Assembler::argRegs[arg];
 
 		_buf->commit(1);
 		_buf->_stats.lir++;
 		return l;
     }
 	
-#define isS24(x) (((int32_t(x)<<8)>>8) == (x))
-
 	LInsp LirBufWriter::insFar(LOpcode op, LInsp target)
 	{
         NanoAssert(op == LIR_skip || op == LIR_tramp);
         LInsp l = _buf->next();
         int d = target-l;
         if (isS24(d)) {
     		ensureRoom(1);
             l->initOpcode(LOpcode(op-1)); // nearskip or neartramp
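
The local isS24 macro is removed here; the ARM backend below uses the same predicate. The test itself is the usual sign-extension trick: shifting left 8 and arithmetic-shifting back right 8 smears bit 23 across the top byte, so the round trip preserves the value exactly when it fits in signed 24 bits. A sketch (mirroring the deleted macro, including its reliance on arithmetic right shift):

#include <stdint.h>

// 24-bit signed-range test (sketch).
static bool is_s24(int32_t x) {
    return ((x << 8) >> 8) == x;        // sign-extend bit 23, then compare
}
// Equivalent to: -(1 << 23) <= x && x <= (1 << 23) - 1.
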
--- a/js/src/nanojit/NativeARM.cpp
+++ b/js/src/nanojit/NativeARM.cpp
@@ -44,24 +44,27 @@
 #endif
 
 #ifdef UNDER_CE
 #include <cmnintrin.h>
 #endif
 
 #if defined(AVMPLUS_LINUX)
 #include <asm/unistd.h>
+extern "C" void __clear_cache(char *BEG, char *END);
 #endif
 
+#ifdef FEATURE_NANOJIT
+
 namespace nanojit
 {
-#ifdef FEATURE_NANOJIT
 
 #ifdef NJ_VERBOSE
-const char* regNames[] = {"r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","r11","IP","SP","LR","PC"};
+const char* regNames[] = {"r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","FP","IP","SP","LR","PC",
+                          "d0","d1","d2","d3","d4","d5","d6","d7","s14"};
 #endif
 
 const Register Assembler::argRegs[] = { R0, R1, R2, R3 };
 const Register Assembler::retRegs[] = { R0, R1 };
 
 void
 Assembler::nInit(AvmCore*)
 {
@@ -117,16 +120,17 @@ Assembler::nFragExit(LInsp guard)
     } else {
         // target doesn't exit yet.  emit jump to epilog, and set up to patch later.
         lr = placeGuardRecord(guard);
 
         // we need to know that there's an extra immediate value available
         // for us; always force a far jump here.
         BL_far(_epilogue);
 
+        // point the guard record's jmp at the start of the sequence we just emitted
         lr->jmp = _nIns;
     }
 
     // pop the stack frame first
     MR(SP, FRAME_PTR);
 
 #ifdef NJ_VERBOSE
     if (_frago->core()->config.show_stats) {
@@ -150,46 +154,105 @@ Assembler::genEpilogue(RegisterMask rest
     POP_mask(savingMask); // regs
     return _nIns;
 }
     
 void
 Assembler::asm_call(LInsp ins)
 {
     const CallInfo* call = callInfoFor(ins->fid());
+    Reservation *callRes = getresv(ins);
+
     uint32_t atypes = call->_argtypes;
     uint32_t roffset = 0;
 
+    // skip return type
+#ifdef NJ_ARM_VFP
+    ArgSize rsize = (ArgSize)(atypes & 3);
+#endif
+    atypes >>= 2;
+
     // we need to detect if we have arg0 as LO followed by arg1 as F;
     // in that case, we need to skip using r1 -- the F needs to be
     // loaded in r2/r3, at least according to the ARM EABI and gcc 4.2's
     // generated code.
     bool arg0IsInt32FollowedByFloat = false;
     while ((atypes & 3) != ARGSIZE_NONE) {
-        if (((atypes >> 4) & 3) == ARGSIZE_LO &&
-            ((atypes >> 2) & 3) == ARGSIZE_F &&
-            ((atypes >> 6) & 3) == ARGSIZE_NONE)
+        if (((atypes >> 2) & 3) == ARGSIZE_LO &&
+            ((atypes >> 0) & 3) == ARGSIZE_F &&
+            ((atypes >> 4) & 3) == ARGSIZE_NONE)
         {
             arg0IsInt32FollowedByFloat = true;
             break;
         }
         atypes >>= 2;
     }
 
+#ifdef NJ_ARM_VFP
+    if (rsize == ARGSIZE_F) {
+        NanoAssert(ins->opcode() == LIR_fcall);
+        NanoAssert(callRes);
+
+        //fprintf (stderr, "call ins: %p callRes: %p reg: %d ar: %d\n", ins, callRes, callRes->reg, callRes->arIndex);
+
+        Register rr = callRes->reg;
+        int d = disp(callRes);
+        freeRsrcOf(ins, rr != UnknownReg);
+
+        if (rr != UnknownReg) {
+            NanoAssert(IsFpReg(rr));
+            FMDRR(rr,R0,R1);
+        } else {
+            NanoAssert(d);
+            //fprintf (stderr, "call ins d: %d\n", d);
+            STMIA(Scratch, 1<<R0 | 1<<R1);
+            arm_ADDi(Scratch, FP, d);
+        }
+    }
+#endif
+
     CALL(call);
 
     ArgSize sizes[10];
     uint32_t argc = call->get_sizes(sizes);
-    for(uint32_t i=0; i < argc; i++) {
+    for(uint32_t i = 0; i < argc; i++) {
         uint32_t j = argc - i - 1;
         ArgSize sz = sizes[j];
-        NanoAssert(sz == ARGSIZE_LO || sz == ARGSIZE_Q);
+        LInsp arg = ins->arg(j);
         // pre-assign registers R0-R3 for arguments (if they fit)
-        Register r = (i+roffset) < 4 ? argRegs[i+roffset] : UnknownReg;
-        asm_arg(sz, ins->arg(j), r);
+
+        Register r = (i + roffset) < 4 ? argRegs[i+roffset] : UnknownReg;
+#ifdef NJ_ARM_VFP
+        if (sz == ARGSIZE_F) {
+            if (r == R0 || r == R2) {
+                roffset++;
+            } else if (r == R1) {
+                r = R2;
+                roffset++;
+            } else {
+                r = UnknownReg;
+            }
+
+            // XXX move this into asm_farg
+            Register sr = findRegFor(arg, FpRegs);
+
+            if (r != UnknownReg) {
+                // copy the double from the VFP register into the core register pair
+                //fprintf (stderr, "FMRRD: %d %d <- %d\n", r, nextreg(r), sr);
+                FMRRD(r, nextreg(r), sr);
+            } else {
+                asm_pusharg(arg);
+            }
+        } else {
+            asm_arg(sz, arg, r);
+        }
+#else
+        NanoAssert(sz == ARGSIZE_LO || sz == ARGSIZE_Q);
+        asm_arg(sz, arg, r);
+#endif
 
         if (i == 0 && arg0IsInt32FollowedByFloat)
             roffset = 1;
     }
 }
     
 void
 Assembler::nMarkExecute(Page* page, int32_t count, bool enable)
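
The arg0IsInt32FollowedByFloat scan and the roffset bumps in the VFP path implement the ARM EABI rule that a 64-bit argument occupies an even/odd core-register pair: for f(int, double) the int goes in r0, the double in r2:r3, and r1 is skipped. A toy walk of that assignment rule (sketch of AAPCS core-register argument passing, limited to r0-r3 like the code above):

#include <stdio.h>

// Assign core registers r0-r3 to arguments of 4 or 8 bytes, skipping an
// odd register before each 8-byte argument (sketch of the AAPCS rule).
static void assign_regs(const int* sizes, int n) {
    int r = 0;
    for (int i = 0; i < n; i++) {
        if (sizes[i] == 8) {
            if (r & 1) r++;                           // align to an even pair
            if (r + 1 < 4) printf("arg%d -> r%d:r%d\n", i, r, r + 1);
            r += 2;
        } else {
            if (r < 4) printf("arg%d -> r%d\n", i, r);
            r++;
        }
    }
}
// For sizes {4, 8}: arg0 -> r0, arg1 -> r2:r3, with r1 skipped.
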
@@ -233,39 +296,38 @@ Assembler::nRegisterAllocFromSet(int set
 }
 
 void
 Assembler::nRegisterResetAll(RegAlloc& a)
 {
     // add scratch registers to our free list for the allocator
     a.clear();
     a.used = 0;
-    a.free = rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) | rmask(R5);
+    a.free = rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) | rmask(R5) | FpRegs;
     debug_only(a.managed = a.free);
 }
 
 void
 Assembler::nPatchBranch(NIns* branch, NIns* target)
 {
     // Patch the jump in a loop
 
     // This is ALWAYS going to be a long branch (using the BL instruction)
     // Which is really 2 instructions, so we need to modify both
     // XXX -- this is B, not BL, at least on non-Thumb..
 
-    // branch+2 because PC is always 2 instructions ahead on ARM/Thumb
-    int32_t offset = int(target) - int(branch+2);
+    int32_t offset = PC_OFFSET_FROM(target, branch);
 
     //printf("---patching branch at 0x%08x to location 0x%08x (%d-0x%08x)\n", branch, target, offset, offset);
 
     // We have 2 words to work with here -- if offset is in range of a 24-bit
     // relative jump, emit that; otherwise, we do a pc-relative load into pc.
-    if (-(1<<24) <= offset & offset < (1<<24)) {
+    if (isS24(offset)) {
         // ARM goodness, using unconditional B
-        *branch = (NIns)( COND_AL | (0xA<<24) | ((offset >>2) & 0xFFFFFF) );
+        *branch = (NIns)( COND_AL | (0xA<<24) | ((offset>>2) & 0xFFFFFF) );
     } else {
         // LDR pc,[pc]
         *branch++ = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | ( 0x004 ) );
         *branch = (NIns)target;
     }
 }
 
 RegisterMask
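
nPatchBranch rewrites the first word as an unconditional B when the displacement fits the 24-bit field (instructions are word-aligned, so the byte offset is stored shifted right by 2) and otherwise falls back to ldr pc, [pc] with the absolute target in the following word. A sketch of the B encoding used above (ARM, cond = AL; PC_OFFSET_FROM already accounts for the pipeline reading PC 8 bytes ahead):

#include <stdint.h>

// Encode "b <target>" from a pipeline-adjusted byte offset that has
// already passed the isS24 check (sketch).
static uint32_t encode_b_al(int32_t byte_offset) {
    const uint32_t kCondAL = 0xEu << 28;           // condition: always
    return kCondAL | (0xAu << 24)                  // B opcode
                   | ((uint32_t)(byte_offset >> 2) & 0xFFFFFF);
}
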
@@ -290,117 +352,270 @@ void
 Assembler::asm_qjoin(LIns *ins)
 {
     int d = findMemFor(ins);
     AvmAssert(d);
     LIns* lo = ins->oprnd1();
     LIns* hi = ins->oprnd2();
                             
     Register r = findRegFor(hi, GpRegs);
-    ST(FP, d+4, r);
+    STR(r, FP, d+4);
 
     // okay if r gets recycled.
     r = findRegFor(lo, GpRegs);
-    ST(FP, d, r);
+    STR(r, FP, d);
     freeRsrcOf(ins, false); // if we had a reg in use, emit a ST to flush it to mem
 }
 
 void
 Assembler::asm_store32(LIns *value, int dr, LIns *base)
 {
    // make sure value is in a register
     Reservation *rA, *rB;
     findRegFor2(GpRegs, value, rA, base, rB);
     Register ra = rA->reg;
     Register rb = rB->reg;
-    ST(rb, dr, ra);
+    STR(ra, rb, dr);
 }
 
 void
 Assembler::asm_restore(LInsp i, Reservation *resv, Register r)
 {
     (void)resv;
     int d = findMemFor(i);
-    LD(r, d, FP);
+
+    if (IsFpReg(r)) {
+        if (isS8(d >> 2)) {
+            FLDD(r, FP, d);
+        } else {
+            FLDD(r, Scratch, 0);
+            arm_ADDi(Scratch, FP, d);
+        }
+    } else {
+        LDR(r, FP, d);
+    }
 
     verbose_only(
         if (_verbose)
             outputf("        restore %s",_thisfrag->lirbuf->names->formatRef(i));
     )
 }
 
 void
 Assembler::asm_spill(LInsp i, Reservation *resv, bool pop)
 {
     (void)i;
     (void)pop;
-
+    //fprintf (stderr, "resv->arIndex: %d\n", resv->arIndex);
     if (resv->arIndex) {
         int d = disp(resv);
         // save to spill location
         Register rr = resv->reg;
-        ST(FP, d, rr);
+        if (IsFpReg(rr)) {
+            if (isS8(d >> 2)) {
+                FSTD(rr, FP, d);
+            } else {
+                FSTD(rr, Scratch, 0);
+                arm_ADDi(Scratch, FP, d);
+            }
+        } else {
+            STR(rr, FP, d);
+        }
 
         verbose_only(if (_verbose){
                 outputf("        spill %s",_thisfrag->lirbuf->names->formatRef(i));
             }
         )
     }
 }
 
 void
 Assembler::asm_load64(LInsp ins)
 {
+    ///asm_output("<<< load64");
+
     LIns* base = ins->oprnd1();
-    int db = ins->oprnd2()->constval();
+    int offset = ins->oprnd2()->constval();
+
     Reservation *resv = getresv(ins);
-    int dr = disp(resv);
-    NanoAssert(resv->reg == UnknownReg && dr != 0);
+    Register rr = resv->reg;
+    int d = disp(resv);
 
+    freeRsrcOf(ins, false);
+
+#ifdef NJ_ARM_VFP
     Register rb = findRegFor(base, GpRegs);
-    resv->reg = UnknownReg;
-    asm_mmq(FP, dr, rb, db);
-    freeRsrcOf(ins, false);
+
+    NanoAssert(rb != UnknownReg);
+    NanoAssert(rr == UnknownReg || IsFpReg(rr));
+
+    if (rr != UnknownReg) {
+        if (!isS8(offset >> 2) || (offset&3) != 0) {
+            underrunProtect(LD32_size + 8);
+            FLDD(rr,Scratch,0);
+            ADD(Scratch, rb);
+            LD32_nochk(Scratch, offset);
+        } else {
+            FLDD(rr,rb,offset);
+        }
+    } else {
+        asm_mmq(FP, d, rb, offset);
+    }
+
+    // *(FP+d) <- *(rb+offset)
+#else
+    NanoAssert(resv->reg == UnknownReg && d != 0);
+    Register rb = findRegFor(base, GpRegs);
+    asm_mmq(FP, d, rb, offset);
+#endif
+
+    //asm_output(">>> load64");
 }
 
 void
 Assembler::asm_store64(LInsp value, int dr, LInsp base)
 {
+    //asm_output1("<<< store64 (dr: %d)", dr);
+
+#ifdef NJ_ARM_VFP
+    Reservation *valResv = getresv(value);
+
+    Register rb = findRegFor(base, GpRegs);
+    Register rv = findRegFor(value, FpRegs);
+
+    NanoAssert(rb != UnknownReg);
+    NanoAssert(rv != UnknownReg);
+
+    Register baseReg = rb;
+    intptr_t baseOffset = dr;
+
+    if (!isS8(dr)) {
+        baseReg = Scratch;
+        baseOffset = 0;
+    }
+
+    FSTD(rv, baseReg, baseOffset);
+
+    if (!isS8(dr)) {
+        underrunProtect(4 + LD32_size);
+        ADD(Scratch, rb);
+        LD32_nochk(Scratch, dr);
+    }
+
+    // if it's a constant, make sure our baseReg/baseOffset location
+    // has the right value
+    if (value->isconstq()) {
+        const int32_t* p = (const int32_t*) (value-2);
+
+        underrunProtect(12 + LD32_size);
+
+        asm_quad_nochk(rv, p);
+    }
+#else
     int da = findMemFor(value);
     Register rb = findRegFor(base, GpRegs);
     asm_mmq(rb, dr, FP, da);
+#endif
+    //asm_output(">>> store64");
+}
+
+// stick a quad into register rr, where p points to the two
+// 32-bit parts of the quad, optionally also storing at FP+d
+void
+Assembler::asm_quad_nochk(Register rr, const int32_t* p)
+{
+    *(++_nSlot) = p[0];
+    *(++_nSlot) = p[1];
+
+    intptr_t constAddr = (intptr_t) (_nSlot-1);
+    intptr_t realOffset = PC_OFFSET_FROM(constAddr, _nIns-1);
+    intptr_t offset = realOffset;
+    Register baseReg = PC;
+
+    //int32_t *q = (int32_t*) constAddr;
+    //fprintf (stderr, "asm_quad_nochk: rr = %d cAddr: 0x%x quad: %08x:%08x q: %f @0x%08x\n", rr, constAddr, p[0], p[1], *(double*)q, _nIns);
+
+    // for FLDD, we only get a left-shifted 8-bit offset
+    if (!isS8(realOffset >> 2)) {
+        offset = 0;
+        baseReg = Scratch;
+    }
+
+    FLDD(rr, baseReg, offset);
+
+    if (!isS8(realOffset >> 2))
+        LD32_nochk(Scratch, constAddr);
 }
 
 void
 Assembler::asm_quad(LInsp ins)
 {
-    Reservation *rR = getresv(ins);
-    int d = disp(rR);
+    //asm_output(">>> asm_quad");
+
+    Reservation *res = getresv(ins);
+    int d = disp(res);
+    Register rr = res->reg;
+
+    NanoAssert(d || rr != UnknownReg);
+
+    const int32_t* p = (const int32_t*) (ins-2);
+
+#ifdef NJ_ARM_VFP
     freeRsrcOf(ins, false);
 
+    // XXX We probably want nochk versions of FLDD/FSTD
+    underrunProtect(16 + LD32_size);
+
+    // grab a register to do the load into if we don't have one already;
+    // XXX -- maybe do a mmq in this case?  We're going to use our
+    // D7 register that's never allocated (since it's the one we use
+    // for int-to-double conversions), so we don't have to worry about
+    // spilling something in a fp reg.
+    if (rr == UnknownReg)
+        rr = D7;
+
+    if (d)
+        FSTD(rr, FP, d);
+
+    asm_quad_nochk(rr, p);
+#else
+    freeRsrcOf(ins, false);
     if (d) {
-        const int32_t* p = (const int32_t*) (ins-2);
-        STi(FP,d+4,p[1]);
-        STi(FP,d,p[0]);
+        underrunProtect(LD32_size * 2 + 8);
+        STR(Scratch, FP, d+4);
+        LD32_nochk(Scratch, p[1]);
+        STR(Scratch, FP, d);
+        LD32_nochk(Scratch, p[0]);
     }
+#endif
+
+    //asm_output("<<< asm_quad");
 }
 
 bool
 Assembler::asm_qlo(LInsp ins, LInsp q)
 {
     (void)ins; (void)q;
     return false;
 }
 
 void
 Assembler::asm_nongp_copy(Register r, Register s)
 {
-    // we will need this for VFP support
-    (void)r; (void)s;
-    NanoAssert(false);
+    if ((rmask(r) & FpRegs) && (rmask(s) & FpRegs)) {
+        // fp->fp
+        FCPYD(r, s);
+    } else if ((rmask(r) & GpRegs) && (rmask(s) & FpRegs)) {
+        // fp->gp
+        // who's doing this and why?
+        NanoAssert(0);
+        // FMRS(r, loSingleVfp(s));
+    } else {
+        NanoAssert(0);
+    }
 }
 
 Register
 Assembler::asm_binop_rhs_reg(LInsp ins)
 {
     return UnknownReg;
 }
 
@@ -411,41 +626,51 @@ void
 Assembler::asm_mmq(Register rd, int dd, Register rs, int ds)
 {
     // value is either a 64bit struct or maybe a float
     // that isn't live in an FPU reg.  Either way, don't
     // put it in an FPU reg just to load & store it.
     // get a scratch reg
     Register t = registerAlloc(GpRegs & ~(rmask(rd)|rmask(rs)));
     _allocator.addFree(t);
-    ST(rd, dd+4, t);
-    LD(t, ds+4, rs);
-    ST(rd, dd, t);
-    LD(t, ds, rs);
+    // XXX use LDM,STM
+    STR(t, rd, dd+4);
+    LDR(t, rs, ds+4);
+    STR(t, rd, dd);
+    LDR(t, rs, ds);
 }
 
 void
-Assembler::asm_pusharg(LInsp p)
+Assembler::asm_pusharg(LInsp arg)
 {
-    // arg goes on stack
-    Reservation* rA = getresv(p);
-    if (rA == 0)
-    {
-        Register ra = findRegFor(p, GpRegs);
-        ST(SP,0,ra);
+    Reservation* argRes = getresv(arg);
+    bool quad = arg->isQuad();
+    intptr_t stack_growth = quad ? 8 : 4;
+
+    Register ra;
+
+    if (argRes)
+        ra = argRes->reg;
+    else
+        ra = findRegFor(arg, quad ? FpRegs : GpRegs);
+
+    if (ra == UnknownReg) {
+        STR(Scratch, SP, 0);
+        LDR(Scratch, FP, disp(argRes));
+    } else {
+        if (!quad) {
+            Register ra = findRegFor(arg, GpRegs);
+            STR(ra, SP, 0);
+        } else {
+            Register ra = findRegFor(arg, FpRegs);
+            FSTD(ra, SP, 0);
+        }
     }
-    else if (rA->reg == UnknownReg)
-    {
-        ST(SP,0,Scratch);
-        LD(Scratch,disp(rA),FP);
-    }
-    else
-    {
-        ST(SP,0,rA->reg);
-    }
+
+    SUBi(SP, stack_growth);
 }
 
 void
 Assembler::nativePageReset()
 {
     _nSlot = 0;
     _nExitSlot = 0;
 }
@@ -465,46 +690,37 @@ Assembler::nativePageSetup()
         _nExitIns--;
 
         // constpool starts at top of page and goes down,
         // code starts at bottom of page and moves up
         _nSlot = pageDataStart(_nIns); //(int*)(&((Page*)pageTop(_nIns))->lir[0]);
     }
 }
 
-void
-Assembler::flushCache(NIns* n1, NIns* n2) {
-#if defined(UNDER_CE)
-    // we changed the code, so we need to do this (sadly)
-    FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
-#elif defined(AVMPLUS_LINUX)
-    // Just need to clear this one page (not even the whole page really)
-    //Page *page = (Page*)pageTop(_nIns);
-    register unsigned long _beg __asm("a1") = (unsigned long)(n1);
-    register unsigned long _end __asm("a2") = (unsigned long)(n2);
-    register unsigned long _flg __asm("a3") = 0;
-    register unsigned long _swi __asm("r7") = 0xF0002;
-    __asm __volatile ("swi 0    @ sys_cacheflush" : "=r" (_beg) : "0" (_beg), "r" (_end), "r" (_flg), "r" (_swi));
-#endif
-}
-
 NIns*
 Assembler::asm_adjustBranch(NIns* at, NIns* target)
 {
     // This always got emitted as a BL_far sequence; at points
     // to the first of 4 instructions.  Ensure that we're where
     // we think we were..
     NanoAssert(at[1] == (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) ));
     NanoAssert(at[2] == (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) ));
 
     NIns* was = (NIns*) at[3];
 
+    //fprintf (stderr, "Adjusting branch @ 0x%8x: 0x%x -> 0x%x\n", at+3, at[3], target);
+
     at[3] = (NIns)target;
 
-    flushCache(at, at+4);
+#if defined(UNDER_CE)
+    // we changed the code, so we need to do this (sadly)
+    FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
+#elif defined(AVMPLUS_LINUX)
+    __clear_cache((char*)at, (char*)(at+4));
+#endif
 
 #ifdef AVMPLUS_PORTING_API
     NanoJIT_PortAPI_FlushInstructionCache(at, at+4);
 #endif
 
     return was;
 }
 
@@ -545,62 +761,283 @@ Assembler::underrunProtect(int bytes)
 
 void
 Assembler::BL_far(NIns* addr)
 {
     // we have to stick an immediate into the stream and make lr
     // point to the right spot before branching
     underrunProtect(16);
 
+    // TODO use a slot in const pool for address, but emit single insn
+    // for branch if offset fits
+
     // the address
     *(--_nIns) = (NIns)((addr));
     // bx ip             // branch to the address we loaded earlier
     *(--_nIns) = (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) );
     // add lr, [pc + #4] // set lr to be past the address that we wrote
     *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) );
     // ldr ip, [pc + #4] // load the address into ip, reading it from [pc+4]
     *(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (PC<<16) | (IP<<12) | (4));
+
+    //fprintf (stderr, "BL_far sequence @ 0x%08x\n", _nIns);
+
     asm_output1("bl %p (32-bit)", addr);
 }
 
 void
 Assembler::BL(NIns* addr)
 {
-    intptr_t offs = PC_OFFSET_FROM(addr,(intptr_t)_nIns-4);
-    if (JMP_S24_OFFSET_OK(offs)) {
-        // we can do this with a single BL call
+    intptr_t offs = PC_OFFSET_FROM(addr,_nIns-1);
+
+    //fprintf (stderr, "BL: 0x%x (offs: %d [%x]) @ 0x%08x\n", addr, offs, offs, (intptr_t)(_nIns-1));
+
+    if (isS24(offs)) {
+        // try to do this with a single S24 call;
+        // recompute offset in case underrunProtect had to allocate a new page
         underrunProtect(4);
-        *(--_nIns) = (NIns)( COND_AL | (0xB<<24) | (((offs)>>2) & 0xFFFFFF) ); \
+        offs = PC_OFFSET_FROM(addr,_nIns-1);
+    }
+
+    if (isS24(offs)) {
+        // already did underrunProtect above
+        *(--_nIns) = (NIns)( COND_AL | (0xB<<24) | (((offs)>>2) & 0xFFFFFF) );
         asm_output1("bl %p", addr);
     } else {
         BL_far(addr);
     }
 }
 
 void
 Assembler::CALL(const CallInfo *ci)
 {
     intptr_t addr = ci->_address;
+
     BL((NIns*)addr);
     asm_output1("   (call %s)", ci->_name);
 }
 
 void
 Assembler::LD32_nochk(Register r, int32_t imm)
 {
-    // We can always reach the const pool, since it's on the same page (<4096)
-    underrunProtect(8);
+    // We should always reach the const pool, since it's on the same page (<4096);
+    // if we can't, someone didn't underrunProtect enough.
 
     *(++_nSlot) = (int)imm;
 
     //fprintf (stderr, "wrote slot(2) %p with %08x, jmp @ %p\n", _nSlot, (intptr_t)imm, _nIns-1);
 
-    int offset = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4);
+    int offset = PC_OFFSET_FROM(_nSlot,_nIns-1);
+
+    NanoAssert(isS12(offset) && (offset < 0));
+
+    asm_output2("  (%d(PC) = 0x%x)", offset, imm);
+
+    LDR_nochk(r,PC,offset);
+}
+
+
+// Branch to target address _t with condition _c, doing underrun
+// checks (_chk == 1) or skipping them (_chk == 0).
+//
+// If the jump fits in a relative jump (+/-32MB), emit that.
+// If the jump is unconditional, emit the dest address inline in
+// the instruction stream and load it into pc.
+// If the jump has a condition, but no one has mucked with _nIns and our _nSlot
+// pointer is valid, stick the constant in the slot and emit a conditional
+// load into pc.
+// Otherwise, emit the conditional load into pc from a nearby constant,
+// and emit a jump to jump over it in case the condition fails.
+//
+// NB: JMP_nochk depends on this not calling samepage() when _c == AL
+void
+Assembler::B_cond_chk(ConditionCode _c, NIns* _t, bool _chk)
+{
+    int32 offs = PC_OFFSET_FROM(_t,_nIns-1);
+    //fprintf(stderr, "B_cond_chk target: 0x%08x offset: %d @0x%08x\n", _t, offs, _nIns-1);
+    if (isS24(offs)) {
+        if (_chk) underrunProtect(4);
+        offs = PC_OFFSET_FROM(_t,_nIns-1);
+    }
 
-    NanoAssert(JMP_S24_OFFSET_OK(offset) && (offset < 0));
+    if (isS24(offs)) {
+        *(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | (((offs)>>2) & 0xFFFFFF) );
+    } else if (_c == AL) {
+        if(_chk) underrunProtect(8);
+        *(--_nIns) = (NIns)(_t);
+        *(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | 0x4 );
+    } else if (samepage(_nIns,_nSlot)) {
+        if(_chk) underrunProtect(8);
+        *(++_nSlot) = (NIns)(_t);
+        offs = PC_OFFSET_FROM(_nSlot,_nIns-1);
+        NanoAssert(offs < 0);
+        *(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | ((-offs) & 0xFFFFFF) );
+    } else {
+        if(_chk) underrunProtect(12);
+        *(--_nIns) = (NIns)(_t);
+        *(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((-4)>>2) & 0xFFFFFF );
+        *(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 );
+    }
+
+    asm_output2("%s %p", _c == AL ? "jmp" : "b(cnd)", (void*)(_t));
+}
+
+/*
+ * VFP
+ */
+
+#ifdef NJ_ARM_VFP
+
+void
+Assembler::asm_i2f(LInsp ins)
+{
+    Register rr = prepResultReg(ins, FpRegs);
+    Register srcr = findRegFor(ins->oprnd1(), GpRegs);
 
-    *(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | ((r)<<12) | ((-offset) & 0xFFFFFF) );
-    asm_output2("ld %s,%d",gpn(r),imm);
+    // todo: support int value in memory, as per x86
+    NanoAssert(srcr != UnknownReg);
+
+    FSITOD(rr, FpSingleScratch);
+    FMSR(FpSingleScratch, srcr);
+}
+
+void
+Assembler::asm_u2f(LInsp ins)
+{
+    Register rr = prepResultReg(ins, FpRegs);
+    Register sr = findRegFor(ins->oprnd1(), GpRegs);
+
+    // todo: support int value in memory, as per x86
+    NanoAssert(sr != UnknownReg);
+
+    FUITOD(rr, FpSingleScratch);
+    FMSR(FpSingleScratch, sr);
+}
+
+void
+Assembler::asm_fneg(LInsp ins)
+{
+    LInsp lhs = ins->oprnd1();
+    Register rr = prepResultReg(ins, FpRegs);
+
+    Reservation* rA = getresv(lhs);
+    Register sr;
+
+    if (!rA || rA->reg == UnknownReg)
+        sr = findRegFor(lhs, FpRegs);
+    else
+        sr = rA->reg;
+
+    FNEGD(rr, sr);
 }
 
-#endif /* FEATURE_NANOJIT */
+void
+Assembler::asm_fop(LInsp ins)
+{
+    LInsp lhs = ins->oprnd1();
+    LInsp rhs = ins->oprnd2();
+    LOpcode op = ins->opcode();
+
+    NanoAssert(op >= LIR_fadd && op <= LIR_fdiv);
+
+    // rr = ra OP rb
+
+    Register rr = prepResultReg(ins, FpRegs);
+
+    Register ra = findRegFor(lhs, FpRegs);
+    Register rb = (rhs == lhs) ? ra : findRegFor(rhs, FpRegs);
+
+    // XXX special-case 1.0 and 0.0
+
+    if (op == LIR_fadd)
+        FADDD(rr,ra,rb);
+    else if (op == LIR_fsub)
+        FSUBD(rr,ra,rb);
+    else if (op == LIR_fmul)
+        FMULD(rr,ra,rb);
+    else //if (op == LIR_fdiv)
+        FDIVD(rr,ra,rb);
+}
+
+void
+Assembler::asm_fcmp(LInsp ins)
+{
+    LInsp lhs = ins->oprnd1();
+    LInsp rhs = ins->oprnd2();
+    LOpcode op = ins->opcode();
+
+    NanoAssert(op >= LIR_feq && op <= LIR_fge);
+
+    Register ra = findRegFor(lhs, FpRegs);
+    Register rb = findRegFor(rhs, FpRegs);
+
+    // We can't uniquely identify fge/fle via a single bit
+    // pattern (since equality and lt/gt are separate bits);
+    // so convert to the single-bit variant.
+    if (op == LIR_fge) {
+        Register temp = ra;
+        ra = rb;
+        rb = temp;
+        op = LIR_flt;
+    } else if (op == LIR_fle) {
+        Register temp = ra;
+        ra = rb;
+        rb = temp;
+        op = LIR_fgt;
+    }
+
+    // There is no way to test for an unordered result using
+    // the conditional form of an instruction; the encoding (C=1 V=1)
+    // ends up having overlaps with a few other tests.  So, test for
+    // the explicit mask.
+    uint8_t mask = 0x0;
+    
+    // NZCV
+    // for a valid ordered result, V is always 0 from VFP
+    if (op == LIR_feq)
+        // ZC // cond EQ (both equal and "not less than")
+        mask = 0x6;
+    else if (op == LIR_flt)
+        // N  // cond MI
+        mask = 0x8;
+    else if (op == LIR_fgt)
+        // C  // cond CS
+        mask = 0x2;
+    else
+        NanoAssert(0);
+/*
+    // these were converted into gt and lt above.
+    if (op == LIR_fle)
+        // NZ // cond LE
+        mask = 0xC;
+    else if (op == LIR_fge)
+        // ZC // cond fail?
+        mask = 0x6;
+*/
+
+    // TODO XXX could do this as fcmpd; fmstat; tstvs rX, #0.  The tstvs
+    // would reset the status bits if V (the NaN flag) is set, but that
+    // doesn't work for NE.  For NE we could use teqvs rX, #1; rX needs to
+    // be any register that has lsb == 0, such as sp/fp/pc.
+    
+    // Test explicitly with the full mask; if V is set, the test will fail.
+    // The assumption is that this will be followed up by a BEQ/BNE.
+    CMPi(Scratch, mask);
+    // grab just the condition fields
+    SHRi(Scratch, 28);
+    MRS(Scratch);
+
+    // do the comparison and get results loaded in ARM status register
+    FMSTAT();
+    FCMPD(ra, rb);
+}
+
+Register
+Assembler::asm_prep_fcall(Reservation* rR, LInsp ins)
+{
+    // We have nothing to do here; we do it all in asm_call.
+    return UnknownReg;
+}
+
+#endif /* NJ_ARM_VFP */
 
 }
+#endif /* FEATURE_NANOJIT */
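
asm_fcmp's mask values come from how FCMPD sets NZCV: N for less-than, Z and C together for equal, C alone for greater-than, and V only for unordered (NaN) results. Reading CPSR with MRS, shifting right by 28 to isolate NZCV, and comparing for exact equality against the mask therefore rejects unordered results for free; SETE, or the BEQ/BNE in the guard path, then tests the outcome. The mapping as a small table (sketch):

#include <stdint.h>

// NZCV (bits 3..0 = N,Z,C,V) after FCMPD/FMSTAT for ordered results (sketch).
// V is set only for unordered compares, so an exact match against these
// masks also rejects NaN operands.
enum {
    FCMP_EQ = 0x6,   // Z|C: equal
    FCMP_LT = 0x8,   // N: less than
    FCMP_GT = 0x2    // C (with Z clear): greater than
};
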
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -42,24 +42,38 @@
 #define __nanojit_NativeArm__
 
 
 namespace nanojit
 {
 
 const int NJ_LOG2_PAGE_SIZE = 12;       // 4K
 
-#define NJ_MAX_REGISTERS                11
+// If NJ_ARM_VFP is defined, then VFP is assumed to
+// be present.  If it's not defined, then softfloat
+// is used, and NJ_SOFTFLOAT is defined.
+#define NJ_ARM_VFP
+
+#ifdef NJ_ARM_VFP
+
+// only d0-d7; we'll use d7 as s14-s15 for i2f/u2f/etc.
+#define NJ_VFP_MAX_REGISTERS            8
+
+#else
+
+#define NJ_VFP_MAX_REGISTERS            0
+#define NJ_SOFTFLOAT
+
+#endif
+
+#define NJ_MAX_REGISTERS                (11 + NJ_VFP_MAX_REGISTERS)
 #define NJ_MAX_STACK_ENTRY              256
 #define NJ_MAX_PARAMETERS               16
 #define NJ_ALIGN_STACK                  8
-#define NJ_STACK_OFFSET                 8
-
-#define NJ_SOFTFLOAT
-#define NJ_STACK_GROWTH_UP
+#define NJ_STACK_OFFSET                 0
 
 #define NJ_CONSTANT_POOLS
 const int NJ_MAX_CPOOL_OFFSET = 4096;
 const int NJ_CPOOL_SIZE = 16;
 
 typedef int NIns;
 
 /* ARM registers */
@@ -70,35 +84,50 @@ typedef enum {
     R3  = 3,
     R4  = 4,
     R5  = 5,
     R6  = 6,
     R7  = 7,
     R8  = 8,
     R9  = 9,
     R10 = 10,
-    //FP  =11,
+    FP  = 11,
     IP  = 12,
     SP  = 13,
     LR  = 14,
     PC  = 15,
 
-    FP = 13,
-        
-    // Pseudo-register for floating point
-    F0  = 0,
+    // FP regs
+    D0 = 16,
+    D1 = 17,
+    D2 = 18,
+    D3 = 19,
+    D4 = 20,
+    D5 = 21,
+    D6 = 22,
+    D7 = 23,
+
+    FirstFloatReg = 16,
+    LastFloatReg = 22,
 
     // helpers
     FRAME_PTR = 11,
-    ESP = 13,
+    ESP = SP,
         
     FirstReg = 0,
+#ifdef NJ_ARM_VFP
+    LastReg = 23,
+#else
     LastReg = 10,
-    Scratch = 12,
-    UnknownReg = 11
+#endif
+    Scratch = IP,
+    UnknownReg = 31,
+
+    // special value referring to S14
+    FpSingleScratch = 24
 } Register;
 
 /* ARM condition codes */
 typedef enum {
     EQ = 0x0, // Equal
     NE = 0x1, // Not Equal
     CS = 0x2, // Carry Set (or HS)
     CC = 0x3, // Carry Clear (or LO)
@@ -118,23 +147,40 @@ typedef enum {
 
 
 typedef int RegisterMask;
 typedef struct _FragInfo {
     RegisterMask    needRestoring;
     NIns*           epilogue;
 } FragInfo;
 
-static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10;
-static const RegisterMask FpRegs = 0x0000; // FST0-FST7
+#ifdef NJ_ARM_VFP
+static const RegisterMask SavedFpRegs = 1<<D0 | 1<<D1 | 1<<D2 | 1<<D3 | 1<<D4 | 1<<D5 | 1<<D6 | 1<<D7;
+#else
+static const RegisterMask SavedFpRegs = 0;
+#endif
+static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10 | SavedFpRegs;
+static const RegisterMask FpRegs = 1<<D0 | 1<<D1 | 1<<D2 | 1<<D3 | 1<<D4 | 1<<D5 | 1<<D6; // no D7; S14-S15 are used for i2f/u2f.
 static const RegisterMask GpRegs = 0x07FF;
 static const RegisterMask AllowableFlagRegs = 1<<R0 | 1<<R1 | 1<<R2 | 1<<R3 | 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10;
 
+#define IsFpReg(_r)     ((rmask(_r) & (FpRegs | (1<<D7))) != 0)
+#define IsGpReg(_r)     ((rmask(_r) & (GpRegs | (1<<Scratch))) != 0)
+#define FpRegNum(_fpr)  ((_fpr) - FirstFloatReg)
+
 #define firstreg()      R0
-#define nextreg(r)      (Register)((int)r+1)
+#define nextreg(r)      ((Register)((int)(r)+1))
+#if 0
+static Register nextreg(Register r) {
+    if (r == R10)
+        return D0;
+    return (Register)(r+1);
+}
+#endif
+// only good for normal regs
 #define imm2register(c) (Register)(c-1)
 
 verbose_only( extern const char* regNames[]; )
 
 // abstract to platform specific calls
 #define nExtractPlatformFlags(x)    0
 
 #define DECLARE_PLATFORM_STATS()                \
@@ -143,21 +189,22 @@ verbose_only( extern const char* regName
 #define DECLARE_PLATFORM_REGALLOC()
 
 #define DECLARE_PLATFORM_ASSEMBLER()                                    \
     const static Register argRegs[4], retRegs[2];                       \
     void LD32_nochk(Register r, int32_t imm);                           \
     void BL(NIns*);                                                     \
     void BL_far(NIns*);                                                 \
     void CALL(const CallInfo*);                                         \
+    void B_cond_chk(ConditionCode, NIns*, bool);                        \
     void underrunProtect(int bytes);                                    \
     bool has_cmov;                                                      \
     void nativePageReset();                                             \
     void nativePageSetup();                                             \
-    void flushCache(NIns*,NIns*);                                       \
+    void asm_quad_nochk(Register, const int32_t*);                      \
     int* _nSlot;                                                        \
     int* _nExitSlot;
 
 
 #define asm_farg(i) NanoAssert(false)
 
 //printf("jmp_l_n count=%d, nins=%X, %X = %X\n", (_c), nins, _nIns, ((intptr_t)(nins+(_c))-(intptr_t)_nIns - 4) );
 
@@ -169,32 +216,33 @@ verbose_only( extern const char* regName
     }
 
 
 #define IMM32(imm)  *(--_nIns) = (NIns)((imm));
 
 #define FUNCADDR(addr) ( ((int)addr) )  
 
 #define OP_IMM  (1<<25)
+#define OP_STAT (1<<20)
 
 #define COND_AL (0xE<<28)
 
 typedef enum {
     LSL_imm = 0, // LSL #c - Logical Shift Left
     LSL_reg = 1, // LSL Rc - Logical Shift Left
     LSR_imm = 2, // LSR #c - Logical Shift Right
     LSR_reg = 3, // LSR Rc - Logical Shift Right
     ASR_imm = 4, // ASR #c - Arithmetic Shift Right
     ASR_reg = 5, // ASR Rc - Arithmetic Shift Right
     ROR_imm = 6, // Rotate Right (c != 0)
     RRX     = 6, // Rotate Right one bit with extend (c == 0)
     ROR_reg = 7  // Rotate Right
 } ShiftOperator;
 
-#define LD32_size 4
+#define LD32_size 8
 
 #define BEGIN_NATIVE_CODE(x)                    \
     { DWORD* _nIns = (uint8_t*)x
 
 #define END_NATIVE_CODE(x)                      \
     (x) = (dictwordp*)_nIns; }
 
 // BX 
@@ -246,55 +294,68 @@ typedef enum {
 
 // _r = _r XOR _imm
 #define XORi(_r,_imm)   do {                                            \
         NanoAssert(isU8((_imm)));                                       \
         underrunProtect(4);                                             \
         *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<21) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) ); \
         asm_output2("eor %s,%d",gpn(_r),(_imm)); } while(0)
 
-// _l = _l + _r
-#define ADD(_l,_r) do {                                                 \
+// _d = _n + _m
+#define arm_ADD(_d,_n,_m) do {                                          \
         underrunProtect(4);                                             \
-        *(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_l)<<12) | (_l)); \
-        asm_output2("add %s,%s",gpn(_l),gpn(_r)); } while(0)
+        *(--_nIns) = (NIns)( COND_AL | OP_STAT | (1<<23) | ((_n)<<16) | ((_d)<<12) | (_m)); \
+        asm_output3("add %s,%s+%s",gpn(_d),gpn(_n),gpn(_m)); } while(0)
+
+// _l = _l + _r
+#define ADD(_l,_r)   arm_ADD(_l,_l,_r)
 
-// _r = _r + _imm
-#define ADDi(_r,_imm)   do {                                            \
-        if ((_imm)>-256 && (_imm)<256) {                                \
+// TODO: we can do better here, since we can rotate the 8-bit immediate left by
+// an even number of bits; should count zeros at the end.
+
+// Note that this sometimes converts negative immediate values to a sub.
+// _d = _n + _imm
+#define arm_ADDi(_d,_n,_imm)   do {                                     \
+        if ((_imm) > -256 && (_imm) < 256) {                            \
             underrunProtect(4);                                         \
             if ((_imm)>=0)                                              \
-                *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) ); \
+                *(--_nIns) = (NIns)( COND_AL | OP_IMM | OP_STAT | (1<<23) | ((_n)<<16) | ((_d)<<12) | ((_imm)&0xFF) ); \
             else                                                        \
-                *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((-(_imm))&0xFF) ); \
+                *(--_nIns) = (NIns)( COND_AL | OP_IMM | OP_STAT | (1<<22) | ((_n)<<16) | ((_d)<<12) | ((-(_imm))&0xFF) ); \
         } else {                                                        \
             if ((_imm)>=0) {                                            \
                 if ((_imm)<=1020 && (((_imm)&3)==0) ) {                 \
                     underrunProtect(4);                                 \
-                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | (15<<8)| ((_imm)>>2) ); \
+                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | OP_STAT | (1<<23) | ((_n)<<16) | ((_d)<<12) | (15<<8)| ((_imm)>>2) ); \
                 } else {                                                \
                     underrunProtect(4+LD32_size);                       \
-                    *(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_r)<<12) | (Scratch)); \
+                    *(--_nIns) = (NIns)( COND_AL | OP_STAT | (1<<23) | ((_n)<<16) | ((_d)<<12) | (Scratch)); \
                     LD32_nochk(Scratch, _imm);                          \
                 }                                                       \
             } else {                                                    \
+                underrunProtect(4+LD32_size);                           \
+                *(--_nIns) = (NIns)( COND_AL | OP_STAT | (1<<22) | ((_n)<<16) | ((_d)<<12) | (Scratch)); \
+                LD32_nochk(Scratch, -(_imm));                           \
+            }                                                           \
+        }                                                               \
+        asm_output3("add %s,%s,%d",gpn(_d),gpn(_n),(_imm));             \
+    } while(0)
+
+/*
+ * There used to be a special case here:
                 if ((_imm)>=-510) {                                     \
                     underrunProtect(8);                                 \
                     int rem = -(_imm) - 255;                            \
-                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((rem)&0xFF) ); \
-                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | (0xFF) ); \
-                } else {                                                \
-                    underrunProtect(4+LD32_size);                       \
-                    *(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_r)<<16) | ((_r)<<12) | (Scratch)); \
-                    LD32_nochk(Scratch, -(_imm));                       \
-                }                                                       \
-            }                                                           \
-        }                                                               \
-        asm_output2("addi %s,%d",gpn(_r),(_imm));                       \
-    } while(0)
+                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_n)<<16) | ((_d)<<12) | ((rem)&0xFF) ); \
+                    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_n)<<16) | ((_d)<<12) | (0xFF) ); \
+                } else {                                               
+ * above, but if we do that we can't really update the status registers.  So don't do that.
+ */
+
+#define ADDi(_r,_imm)  arm_ADDi(_r,_r,_imm)
 
 // _l = _l - _r
 #define SUB(_l,_r)  do {                                                \
         underrunProtect(4);                                             \
         *(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_l)<<16) | ((_l)<<12) | (_r)); \
         asm_output2("sub %s,%s",gpn(_l),gpn(_r)); } while(0)
 
 // _r = _r - _imm
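
arm_ADDi special-cases values up to 1020 that are multiples of 4 because ARM data-processing immediates are an 8-bit value rotated right by twice the 4-bit rotate field: a rotate field of 15 means rotate-right by 30, i.e. rotate-left by 2, so (15<<8)|(imm>>2) reconstructs imm exactly. The general encodability test, as hinted at by the TODO above (sketch; a standalone helper, not part of nanojit):

#include <stdint.h>

// Is v encodable as an ARM operand2 immediate, i.e. an 8-bit value
// rotated right by an even amount? (sketch)
static bool arm_imm_ok(uint32_t v) {
    for (int rot = 0; rot < 32; rot += 2) {
        uint32_t rotated = (v << rot) | (v >> ((32 - rot) & 31));  // rotl(v, rot)
        if (rotated <= 0xFF)
            return true;
    }
    return false;
}
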
@@ -397,16 +458,23 @@ typedef enum {
         asm_output2("lsl %s,%d",gpn(_r),(_imm)); } while(0)
                     
 // TST
 #define TEST(_d,_s) do {                                                \
         underrunProtect(4);                                             \
         *(--_nIns) = (NIns)( COND_AL | (0x11<<20) | ((_d)<<16) | (_s) ); \
         asm_output2("test %s,%s",gpn(_d),gpn(_s)); } while(0)
 
+#define TSTi(_d,_imm) do {                                              \
+        underrunProtect(4);                                             \
+        NanoAssert(((_imm) & 0xff) == (_imm));                          \
+        *(--_nIns) = (NIns)( COND_AL | OP_IMM | (0x11<<20) | ((_d) << 16) | (0xF<<12) | ((_imm) & 0xff) ); \
+        asm_output2("tst %s,#0x%x", gpn(_d), _imm);                     \
+    } while (0)
+
 // CMP
 #define CMP(_l,_r)  do {                                                \
         underrunProtect(4);                                             \
         *(--_nIns) = (NIns)( COND_AL | (0x015<<20) | ((_l)<<16) | (_r) ); \
         asm_output2("cmp %s,%s",gpn(_l),gpn(_r)); } while(0)
 
 // CMP (or CMN)
 #define CMPi(_r,_imm)  do {                                             \
@@ -424,17 +492,17 @@ typedef enum {
                 underrunProtect(4);                                     \
                 *(--_nIns) = (NIns)( COND_AL | (0x035<<20) | ((_r)<<16) | ((_imm)&0xFF) ); \
             } else {                                                    \
                 underrunProtect(4+LD32_size);                           \
                 *(--_nIns) = (NIns)( COND_AL | (0x015<<20) | ((_r)<<16) | (Scratch) ); \
                 LD32_nochk(Scratch, (_imm));                            \
             }                                                           \
         }                                                               \
-        asm_output2("cmp %s,%X",gpn(_r),(_imm));                        \
+        asm_output2("cmp %s,0x%x",gpn(_r),(_imm));                      \
     } while(0)
 
 // MOV
 #define MR(_d,_s)  do {                                                 \
         underrunProtect(4);                                             \
         *(--_nIns) = (NIns)( COND_AL | (0xD<<21) | ((_d)<<12) | (_s) ); \
         asm_output2("mov %s,%s",gpn(_d),gpn(_s)); } while (0)
 
@@ -452,83 +520,75 @@ typedef enum {
 #define MRGE(dr,sr) MR_cond(dr, sr, GE, "movge")
 #define MRB(dr,sr)  MR_cond(dr, sr, CC, "movcc")
 #define MRBE(dr,sr) MR_cond(dr, sr, LS, "movls")
 #define MRA(dr,sr)  MR_cond(dr, sr, HI, "movhi")
 #define MRAE(dr,sr) MR_cond(dr, sr, CS, "movcs")
 #define MRNO(dr,sr) MR_cond(dr, sr, VC, "movvc") // overflow clear
 #define MRNC(dr,sr) MR_cond(dr, sr, CC, "movcc") // carry clear
 
-#define LD(_d,_off,_b) do {                                             \
-        if ((_off)<0) {                                                 \
-            underrunProtect(4);                                         \
+#define LDR_chk(_d,_b,_off,_chk) do {                                   \
+        if (IsFpReg(_d)) {                                              \
+            FLDD_chk(_d,_b,_off,_chk);                                  \
+        } else if ((_off)<0) {                                          \
+            if (_chk) underrunProtect(4);                               \
             NanoAssert((_off)>-4096);                                   \
             *(--_nIns) = (NIns)( COND_AL | (0x51<<20) | ((_b)<<16) | ((_d)<<12) | ((-(_off))&0xFFF) ); \
         } else {                                                        \
             if (isS16(_off) || isU16(_off)) {                           \
-                underrunProtect(4);                                     \
+                if (_chk) underrunProtect(4);                           \
                 NanoAssert((_off)<4096);                                \
                 *(--_nIns) = (NIns)( COND_AL | (0x59<<20) | ((_b)<<16) | ((_d)<<12) | ((_off)&0xFFF) ); \
             } else {                                                    \
-                underrunProtect(4+LD32_size);                           \
+                if (_chk) underrunProtect(4+LD32_size);                 \
                 *(--_nIns) = (NIns)( COND_AL | (0x79<<20) | ((_b)<<16) | ((_d)<<12) | Scratch ); \
                 LD32_nochk(Scratch, _off);                              \
             }                                                           \
         }                                                               \
-        asm_output3("ld %s,%d(%s)",gpn((_d)),(_off),gpn((_b)));         \
+        asm_output3("ldr %s,%d(%s)",gpn((_d)),(_off),gpn((_b)));        \
     } while(0)
 
+#define LDR(_d,_b,_off)        LDR_chk(_d,_b,_off,1)
+#define LDR_nochk(_d,_b,_off)  LDR_chk(_d,_b,_off,0)
+
+// i386 compat, for Assembler.cpp
+#define LD(reg,offset,base)    LDR_chk(reg,base,offset,1)
+#define ST(base,offset,reg)    STR(reg,base,offset)
 
 #define LDi(_d,_imm) do {                                               \
         if (isS8((_imm)) || isU8((_imm))) {                             \
             underrunProtect(4);                                         \
             if ((_imm)<0)   *(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | ((_d)<<12) | (((_imm)^0xFFFFFFFF)&0xFF) ); \
             else            *(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | ((_d)<<12) | ((_imm)&0xFF) ); \
         } else {                                                        \
             underrunProtect(LD32_size);                                 \
             LD32_nochk(_d, (_imm));                                     \
         }                                                               \
-        asm_output2("ld %s,%d",gpn((_d)),(_imm));                       \
+        asm_output2("ld  %s,0x%x",gpn((_d)),(_imm));                   \
     } while(0)
 
 
 // load 8-bit, zero extend (aka LDRB)
 // note, only 5-bit offsets (!) are supported for this, but that's all we need at the moment
 // (LDRB actually allows 12-bit offset in ARM mode but constraining to 5-bit gives us advantage for Thumb)
 // @todo, untested!
 #define LD8Z(_d,_off,_b) do {                                           \
        NanoAssert((_off)>=0&&(_off)<=31);                              \
         underrunProtect(4);                                             \
         *(--_nIns) = (NIns)( COND_AL | (0x5D<<20) | ((_b)<<16) | ((_d)<<12) |  ((_off)&0xfff)  ); \
         asm_output3("ldrb %s,%d(%s)", gpn(_d),(_off),gpn(_b));          \
     } while(0)
 
-#define ST(_b,_off,_r) do {                                             \
+#define STR(_d,_n,_off) do {                                            \
+        NanoAssert(!IsFpReg(_d) && isS12(_off));                        \
         underrunProtect(4);                                             \
-        if ((_off)<0)   *(--_nIns) = (NIns)( COND_AL | (0x50<<20) | ((_b)<<16) | ((_r)<<12) | ((-(_off))&0xFFF) ); \
-        else            *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((_r)<<12) | ((_off)&0xFFF) ); \
-        asm_output3("str %s, %d(%s)",gpn(_r), (_off),gpn(_b)); } while(0)
-
-
-#define STi(_b,_off,_imm) do {                                          \
-        NanoAssert((_off)>0);                                           \
-        if (isS8((_imm)) || isU8((_imm))) {                             \
-            underrunProtect(8);                                         \
-            *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((Scratch)<<12) | ((_off)&0xFFF) ); \
-            asm_output3("str %s, %d(%s)",gpn(Scratch), (_off),gpn(_b)); \
-            if ((_imm)<0)   *(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | (Scratch<<12) | (((_imm)^0xFFFFFFFF)&0xFF) ); \
-            else            *(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | (Scratch<<12) | ((_imm)&0xFF) ); \
-            asm_output2("ld %s,%d",gpn((Scratch)),(_imm));              \
-        } else {                                                        \
-            underrunProtect(4+LD32_size);                               \
-            *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((Scratch)<<12) | ((_off)&0xFFF) ); \
-            asm_output3("str %s, %d(%s)",gpn(Scratch), (_off),gpn(_b)); \
-            LD32_nochk(Scratch, (_imm));                                \
-        }                                \
-    } while(0);
+        if ((_off)<0)   *(--_nIns) = (NIns)( COND_AL | (0x50<<20) | ((_n)<<16) | ((_d)<<12) | ((-(_off))&0xFFF) ); \
+        else            *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_n)<<16) | ((_d)<<12) | ((_off)&0xFFF) ); \
+        asm_output3("str %s, %d(%s)",gpn(_d), (_off), gpn(_n));         \
+    } while(0)
 
 
 #define LEA(_r,_d,_b) do {                                              \
         NanoAssert((_d)<=1020);                                         \
         NanoAssert(((_d)&3)==0);                                        \
         if (_b!=SP) NanoAssert(0);                                      \
         if ((_d)<256) {                                                 \
             underrunProtect(4);                                         \
@@ -543,17 +603,17 @@ typedef enum {
 
 
 //#define RET()   underrunProtect(1); *(--_nIns) = 0xc3;    asm_output("ret")
 //#define NOP()     underrunProtect(1); *(--_nIns) = 0x90;  asm_output("nop")
 //#define INT3()  underrunProtect(1); *(--_nIns) = 0xcc;  asm_output("int3")
 //#define RET() INT3()
 
 #define BKPT_nochk() do { \
-        *(--_nIns) = (NIns)( (0xE<<24) | (0x12<<20) | (0x7<<4) ); } while (0);
+        *(--_nIns) = (NIns)( (0xE<<24) | (0x12<<20) | (0x7<<4) ); } while (0)
 
 // this is pushing a reg
 #define PUSHr(_r)  do {                                                 \
         underrunProtect(4);                                             \
         *(--_nIns) = (NIns)( COND_AL | (0x92<<20) | (SP<<16) | (1<<(_r)) ); \
         asm_output1("push %s",gpn(_r)); } while (0)
 
 // STMDB
@@ -576,57 +636,20 @@ typedef enum {
         *(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (1<<(_r)) ); \
         asm_output1("pop %s",gpn(_r));} while (0)
 
 #define POP_mask(_mask) do {                                            \
         underrunProtect(4);                                             \
         *(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (_mask) ); \
         asm_output1("pop %x", (_mask));} while (0)
 
+// PC always points to current instruction + 8, so when calculating pc-relative
+// offsets, use PC+8.
 #define PC_OFFSET_FROM(target,frompc) ((intptr_t)(target) - ((intptr_t)(frompc) + 8))
-#define JMP_S24_OFFSET_OK(offs) ((-(1<<24)) <= (offs) && (offs) < (1<<24))
-
-// (XXX This ought to be a function instead of a macro)
-//
-// Branch to target address _t with condition _c, doing underrun
-// checks (_chk == 1) or skipping them (_chk == 0).
-//
-// If the jump fits in a relative jump (+/-32MB), emit that.
-// If the jump is unconditional, emit the dest address inline in
-// the instruction stream and load it into pc.
-// If the jump has a condition, but no one's mucked with _nIns and our _nSlot
-// pointer is valid, stick the constant in the slot and emit a conditional
-// load into pc.
-// Otherwise, emit the conditional load into pc from a nearby constant,
-// and emit a jump to jump over it in case the condition fails.
-//
-// NB: JMP_nochk depends on this not calling samepage() when _c == AL
-#define B_cond_chk(_c,_t,_chk) do {                                     \
-        int32 offs = PC_OFFSET_FROM(_t,(intptr_t)(_nIns)-4);            \
-        if (JMP_S24_OFFSET_OK(offs)) {                                  \
-            if(_chk) underrunProtect(4);                                \
-            *(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | (((offs)>>2) & 0xFFFFFF) ); \
-        } else if (_c == AL) {                                          \
-            if(_chk) underrunProtect(8);                                \
-            *(--_nIns) = (NIns)(_t);                                    \
-            *(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | 0x4 ); \
-        } else if (samepage(_nIns,_nSlot)) {                            \
-            if(_chk) underrunProtect(8);                                \
-            *(++_nSlot) = (NIns)(_t);                                   \
-            offs = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4);          \
-            NanoAssert(offs < 0);                                       \
-            *(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | ((-offs) & 0xFFFFFF) ); \
-        } else {                                                        \
-            if(_chk) underrunProtect(24);                               \
-            *(--_nIns) = (NIns)(_t);                                    \
-            *(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((-4)>>2) & 0xFFFFFF ); \
-            *(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 ); \
-        }                                                               \
-        asm_output2("%s %p\n", _c == AL ? "jmp" : "b(cnd)", (void*)(_t)); \
-    } while(0)
+#define isS12(offs) ((-(1<<12)) <= (offs) && (offs) < (1<<12))
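As a worked example of the PC+8 rule (addresses illustrative, not from the patch): a branch instruction at 0x8000 targeting 0x9000 has offset 0x9000 - (0x8000 + 8) = 0xFF8, and the B encoding stores that offset shifted right by two.

    #include <assert.h>
    #include <stdint.h>

    static intptr_t pc_offset_from(intptr_t target, intptr_t frompc)
    {
        return target - (frompc + 8);   /* ARM reads PC as instruction + 8 */
    }

    static void example()
    {
        assert(pc_offset_from(0x9000, 0x8000) == 0xFF8);
        assert((pc_offset_from(0x9000, 0x8000) >> 2) == 0x3FE);  /* stored field */
    }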
 
 #define B_cond(_c,_t)                           \
     B_cond_chk(_c,_t,1)
 
 // NB: don't use COND_AL here, we shift the condition into place!
 #define JMP(_t)                                 \
     B_cond_chk(AL,_t,1)
 
@@ -660,45 +683,22 @@ typedef enum {
 #define JNGE(t) do {B_cond(LT,t); asm_output1("jnge 0x%08x",(unsigned int)t); } while(0)
 #define JG(t)   do {B_cond(GT,t); asm_output1("jg 0x%08x",(unsigned int)t); } while(0)  
 #define JNG(t)  do {B_cond(LE,t); asm_output1("jng 0x%08x",(unsigned int)t); } while(0)
 #define JC(t)   do {B_cond(CS,t); asm_output1("bcs 0x%08x",(unsigned int)t); } while(0)
 #define JNC(t)  do {B_cond(CC,t); asm_output1("bcc 0x%08x",(unsigned int)t); } while(0)
 #define JO(t)   do {B_cond(VS,t); asm_output1("bvs 0x%08x",(unsigned int)t); } while(0)
 #define JNO(t)  do {B_cond(VC,t); asm_output1("bvc 0x%08x",(unsigned int)t); } while(0)
 
-// used for testing result of an FP compare
+// used for testing result of an FP compare on x86; not used on arm.
 // JP = comparison  false
-#define JP(t)   do {B_cond(EQ,NE,t); asm_output1("jp 0x%08x",t); } while(0) 
+#define JP(t)   do {NanoAssert(0); B_cond(NE,t); asm_output1("jp 0x%08x",t); } while(0) 
 
 // JNP = comparison true
-#define JNP(t)  do {B_cond(NE,EQ,t); asm_output1("jnp 0x%08x",t); } while(0)
-
-
-// floating point
-#define FNSTSW_AX() do {NanoAssert(0);      asm_output("fnstsw_ax"); } while(0)
-#define FFREE(r)    do {NanoAssert(0);      asm_output1("ffree %s",gpn(b)); } while(0)
-#define FSTQ(p,d,b) do {NanoAssert(0);      asm_output2("fstq %d(%s)",d,gpn(b)); } while(0)
-#define FSTPQ(d,b)  FSTQ(1,d,b)
-//#define FSTPQ(d,b)    do {NanoAssert(0);      asm_output2("fstpq %d(%s)",d,gpn(b)); } while(0)
-#define FCOM(p,d,b) do {NanoAssert(0);      asm_output2("fcom %d(%s)",d,gpn(b)); } while(0)
-#define FCOMP(d,b)  do {NanoAssert(0);      asm_output2("fcomp %d(%s)",d,gpn(b)); } while(0)
-#define FLDQ(d,b)   do {NanoAssert(0);      asm_output2("fldq %d(%s)",d,gpn(b)); } while(0)
-#define FILDQ(d,b)  do {NanoAssert(0);      asm_output2("fildq %d(%s)",d,gpn(b)); } while(0)
-#define FILD(d,b)   do {NanoAssert(0);      asm_output2("fild %d(%s)",d,gpn(b)); } while(0)
-#define FADD(d,b)   do {NanoAssert(0);      asm_output2("faddq %d(%s)",d,gpn(b)); } while(0)
-#define FSUB(d,b)   do {NanoAssert(0);      asm_output2("fsubq %d(%s)",d,gpn(b)); } while(0)
-#define FSUBR(d,b)  do {NanoAssert(0);      asm_output2("fsubr %d(%s)",d,gpn(b)); } while(0)
-#define FMUL(d,b)   do {NanoAssert(0);      asm_output2("fmulq %d(%s)",d,gpn(b)); } while(0)
-#define FDIV(d,b)   do {NanoAssert(0);      asm_output2("fdivq %d(%s)",d,gpn(b)); } while(0)
-#define FDIVR(d,b)  do {NanoAssert(0);      asm_output2("fdivr %d(%s)",d,gpn(b)); } while(0)
-#define FSTP(r)     do {NanoAssert(0);      asm_output1("fst st(%d)",r); } while(0)
-#define FLD1()      do {NanoAssert(0);      asm_output("fld1"); } while(0)
-#define FLDZ()      do {NanoAssert(0);      asm_output("fldz"); } while(0)
-
+#define JNP(t)  do {NanoAssert(0); B_cond(EQ,t); asm_output1("jnp 0x%08x",t); } while(0)
 
 
 // MOV(EQ) _r, #1 
 // EOR(NE) _r, _r
 #define SET(_r,_cond,_opp)                                              \
     underrunProtect(8);                                                 \
     *(--_nIns) = (NIns)( (_opp<<28) | (1<<21) | ((_r)<<16) | ((_r)<<12) | (_r) ); \
     *(--_nIns) = (NIns)( (_cond<<28) | (0x3A<<20) | ((_r)<<12) | (1) );
@@ -753,22 +753,152 @@ typedef enum {
                 NanoAssert(rem<256);                                    \
                 *(--_nIns) = (NIns)( COND_AL | (0x15<<20) | ((_d)<<16) | ((_d)<<12) |  ((((rem)>>4)&0xF)<<8) | (0xF<<4) | ((rem)&0xF)  ); \
                 *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_b)<<16) | ((_d)<<12) | (0xFF) ); \
             } else NanoAssert(0);                                        \
         }                                                               \
     } while(0)
 
 #define STMIA(_b, _mask) do {                                           \
-        underrunProtect(2);                                             \
+        underrunProtect(4);                                             \
         NanoAssert(((_mask)&rmask(_b))==0 && isU8(_mask));              \
         *(--_nIns) = (NIns)(COND_AL | (0x8A<<20) | ((_b)<<16) | (_mask)&0xFF); \
-        asm_output2("stmia %s!,{%x}", gpn(_b), _mask); \
+        asm_output2("stmia %s!,{0x%x}", gpn(_b), _mask); \
     } while (0)
 
 #define LDMIA(_b, _mask) do {                                           \
-        underrunProtect(2);                                             \
+        underrunProtect(4);                                             \
         NanoAssert(((_mask)&rmask(_b))==0 && isU8(_mask));              \
         *(--_nIns) = (NIns)(COND_AL | (0x8B<<20) | ((_b)<<16) | (_mask)&0xFF); \
-        asm_output2("ldmia %s!,{%x}", gpn(_b), (_mask)); \
+        asm_output2("ldmia %s!,{0x%x}", gpn(_b), (_mask)); \
+    } while (0)
+
+#define MRS(_d) do {                            \
+        underrunProtect(4);                     \
+        *(--_nIns) = (NIns)(COND_AL | (0x10<<20) | (0xF<<16) | ((_d)<<12)); \
+        asm_output1("msr %s", gpn(_d));                                 \
+    } while (0)
+
+/*
+ * VFP
+ */
+
+#define FMDRR(_Dm,_Rd,_Rn) do {                                         \
+        underrunProtect(4);                                             \
+        NanoAssert(IsFpReg(_Dm) && IsGpReg(_Rd) && IsGpReg(_Rn));       \
+        *(--_nIns) = (NIns)( COND_AL | (0xC4<<20) | ((_Rn)<<16) | ((_Rd)<<12) | (0xB1<<4) | (FpRegNum(_Dm)) ); \
+        asm_output3("fmdrr %s,%s,%s", gpn(_Dm), gpn(_Rd), gpn(_Rn));    \
+    } while (0)
+
+#define FMRRD(_Rd,_Rn,_Dm) do {                                         \
+        underrunProtect(4);                                             \
+        NanoAssert(IsGpReg(_Rd) && IsGpReg(_Rn) && IsFpReg(_Dm));       \
+        *(--_nIns) = (NIns)( COND_AL | (0xC5<<20) | ((_Rn)<<16) | ((_Rd)<<12) | (0xB1<<4) | (FpRegNum(_Dm)) ); \
+        asm_output3("fmrrd %s,%s,%s", gpn(_Rd), gpn(_Rn), gpn(_Dm));    \
+    } while (0)
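FMDRR and FMRRD transfer a 64-bit double between a core-register pair and a VFP double register, which is what the soft-float call boundary needs when a double travels in r0/r1. Two independent hedged examples (R0/R1/D0 enumerator names assumed):

    FMDRR(D0, R0, R1);   // D0 <- r1:r0 (r0 is the low word), e.g. a call result
    FMRRD(R0, R1, D0);   // r1:r0 <- D0, handing a double back to core regs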
+
+#define FSTD(_Dd,_Rn,_offs) do {                                        \
+        underrunProtect(4);                                             \
+        NanoAssert((((_offs) & 3) == 0) && isS8((_offs) >> 2));         \
+        NanoAssert(IsFpReg(_Dd) && !IsFpReg(_Rn));                      \
+        int negflag = 1<<23;                                            \
+        intptr_t offs = (_offs);                                        \
+        if (_offs < 0) {                                                \
+            negflag = 0<<23;                                            \
+            offs = -(offs);                                             \
+        }                                                               \
+        *(--_nIns) = (NIns)( COND_AL | (0xD0<<20) | ((_Rn)<<16) | (FpRegNum(_Dd)<<12) | (0xB<<8) | negflag | ((offs>>2)&0xff) ); \
+        asm_output3("fstd %s,%s(%d)", gpn(_Dd), gpn(_Rn), _offs);    \
+    } while (0)
+
+#define FLDD_chk(_Dd,_Rn,_offs,_chk) do {                               \
+        if(_chk) underrunProtect(4);                                    \
+        NanoAssert((((_offs) & 3) == 0) && isS8((_offs) >> 2));         \
+        NanoAssert(IsFpReg(_Dd) && !IsFpReg(_Rn));                      \
+        int negflag = 1<<23;                                            \
+        intptr_t offs = (_offs);                                        \
+        if (_offs < 0) {                                                \
+            negflag = 0<<23;                                            \
+            offs = -(offs);                                             \
+        }                                                               \
+        *(--_nIns) = (NIns)( COND_AL | (0xD1<<20) | ((_Rn)<<16) | (FpRegNum(_Dd)<<12) | (0xB<<8) | negflag | ((offs>>2)&0xff) ); \
+        asm_output3("fldd %s,%s(%d)", gpn(_Dd), gpn(_Rn), _offs);       \
+    } while (0)
+#define FLDD(_Dd,_Rn,_offs) FLDD_chk(_Dd,_Rn,_offs,1)
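The FSTD/FLDD offset field is an 8-bit word count with a direction bit (1<<23 selects add), so both macros insist on a word-aligned offset whose word count passes isS8. A hedged restatement of that check as a function (not part of the patch; isS8 is the macro from nanojit.h):

    static bool vfp_offset_ok(int off)
    {
        /* word aligned, and the scaled offset fits the 8-bit field */
        return ((off & 3) == 0) && isS8(off >> 2);
    }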
+
+#define FSITOD(_Dd,_Sm) do {                                            \
+        underrunProtect(4);                                             \
+        NanoAssert(IsFpReg(_Dd) && ((_Sm) == FpSingleScratch));         \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2F<<6) | (0<<5) | (0x7) ); \
+        asm_output2("fsitod %s,%s", gpn(_Dd), gpn(_Sm));                \
+    } while (0)
+
+
+#define FUITOD(_Dd,_Sm) do {                                            \
+        underrunProtect(4);                                             \
+        NanoAssert(IsFpReg(_Dd) && ((_Sm) == FpSingleScratch));         \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2D<<6) | (0<<5) | (0x7) ); \
+        asm_output2("fuitod %s,%s", gpn(_Dd), gpn(_Sm));                \
+    } while (0)
+
+#define FMSR(_Sn,_Rd) do {                                              \
+        underrunProtect(4);                                             \
+        NanoAssert(((_Sn) == FpSingleScratch) && IsGpReg(_Rd));         \
+        *(--_nIns) = (NIns)( COND_AL | (0xE0<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
+        asm_output2("fmsr %s,%s", gpn(_Sn), gpn(_Rd));                  \
+    } while (0)
+
+#define FNEGD(_Dd,_Dm) do {                                             \
+        underrunProtect(4);                                             \
+        NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dm));                       \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB1<<16) | (FpRegNum(_Dd)<<12) | (0xB4<<4) | (FpRegNum(_Dm)) ); \
+        asm_output2("fnegd %s,%s", gpn(_Dd), gpn(_Dm));                 \
+    } while (0)
+
+#define FADDD(_Dd,_Dn,_Dm) do {                                         \
+        underrunProtect(4);                                             \
+        NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dn) && IsFpReg(_Dm));       \
+        *(--_nIns) = (NIns)( COND_AL | (0xE3<<20) | (FpRegNum(_Dn)<<16) | (FpRegNum(_Dd)<<12) | (0xB0<<4) | (FpRegNum(_Dm)) ); \
+        asm_output3("faddd %s,%s,%s", gpn(_Dd), gpn(_Dn), gpn(_Dm));    \
+    } while (0)
+
+#define FSUBD(_Dd,_Dn,_Dm) do {                                         \
+        underrunProtect(4);                                             \
+        NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dn) && IsFpReg(_Dm));       \
+        *(--_nIns) = (NIns)( COND_AL | (0xE3<<20) | (FpRegNum(_Dn)<<16) | (FpRegNum(_Dd)<<12) | (0xB4<<4) | (FpRegNum(_Dm)) ); \
+        asm_output3("fsubd %s,%s,%s", gpn(_Dd), gpn(_Dn), gpn(_Dm));    \
+    } while (0)
+
+#define FMULD(_Dd,_Dn,_Dm) do {                                         \
+        underrunProtect(4);                                             \
+        NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dn) && IsFpReg(_Dm));       \
+        *(--_nIns) = (NIns)( COND_AL | (0xE2<<20) | (FpRegNum(_Dn)<<16) | (FpRegNum(_Dd)<<12) | (0xB0<<4) | (FpRegNum(_Dm)) ); \
+        asm_output3("fmuld %s,%s,%s", gpn(_Dd), gpn(_Dn), gpn(_Dm));    \
+    } while (0)
+
+#define FDIVD(_Dd,_Dn,_Dm) do {                                         \
+        underrunProtect(4);                                             \
+        NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dn) && IsFpReg(_Dm));       \
+        *(--_nIns) = (NIns)( COND_AL | (0xE8<<20) | (FpRegNum(_Dn)<<16) | (FpRegNum(_Dd)<<12) | (0xB0<<4) | (FpRegNum(_Dm)) ); \
+        asm_output3("fmuld %s,%s,%s", gpn(_Dd), gpn(_Dn), gpn(_Dm));    \
+    } while (0)
+
+#define FMSTAT() do {                               \
+        underrunProtect(4);                         \
+        *(--_nIns) = (NIns)( COND_AL | 0x0EF1FA10); \
+        asm_output("fmstat");                       \
+    } while (0)
+
+#define FCMPD(_Dd,_Dm) do {                                             \
+        underrunProtect(4);                                             \
+        NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dm));                       \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB4<<16) | (FpRegNum(_Dd)<<12) | (0xB4<<4) | (FpRegNum(_Dm)) ); \
+        asm_output2("fcmpd %s,%s", gpn(_Dd), gpn(_Dm));                 \
+    } while (0)
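A VFP compare is a two-step sequence: FCMPD sets the FPSCR flags and FMSTAT copies them into the APSR, after which the ordinary conditional branches above work unchanged. Because emission runs backwards, the transfer is emitted before the compare. A hedged sketch (D0/D1 and `target` are illustrative):

    // Executes as: fcmpd d0,d1 ; fmstat ; bge target
    B_cond(GE, target);
    FMSTAT();
    FCMPD(D0, D1);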
+
+#define FCPYD(_Dd,_Dm) do {                                             \
+        underrunProtect(4);                                             \
+        NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dm));                       \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB0<<16) | (FpRegNum(_Dd)<<12) | (0xB4<<4) | (FpRegNum(_Dm)) ); \
+        asm_output2("fcpyd %s,%s", gpn(_Dd), gpn(_Dm));                 \
     } while (0)
 }
 #endif // __nanojit_NativeThumb__
--- a/js/src/nanojit/RegAlloc.h
+++ b/js/src/nanojit/RegAlloc.h
@@ -63,17 +63,19 @@ namespace nanojit
 
 			debug_only( uint32_t	countFree(); )
 			debug_only( uint32_t	countActive(); )
 			debug_only( void		checkCount(); )
 			debug_only( bool		isConsistent(Register r, LIns* v); )
 			debug_only( uint32_t	count; )
 			debug_only( RegisterMask managed; )    // bitfield of 0..NJ_MAX_REGISTERS denoting which are under our management                     
 
-			LIns*	active[NJ_MAX_REGISTERS];  // active[r] = OP that defines r
+			// RegisterMask is a 32-bit value, so we can never have more than 32 active.
+			// hardcode 32 here in case we have non-contiguous register numbers
+			LIns*	active[32];  // active[r] = OP that defines r
 			RegisterMask	free;
 			RegisterMask	used;
 
 			verbose_only( static void formatRegisters(RegAlloc& regs, char* s, Fragment*); )
 
 			DECLARE_PLATFORM_REGALLOC()
 	};
 }
--- a/js/src/nanojit/nanojit.h
+++ b/js/src/nanojit/nanojit.h
@@ -146,16 +146,17 @@ namespace nanojit
 	#define counter_decrement(x)	
 	#define profile_only(x)	
 #endif /* NJ_PROFILE */
 
 #define isS8(i)  ( int32_t(i) == int8_t(i) )
 #define isU8(i)  ( int32_t(i) == uint8_t(i) )
 #define isS16(i) ( int32_t(i) == int16_t(i) )
 #define isU16(i) ( int32_t(i) == uint16_t(i) )
+#define isS24(i) ( ((int32_t(i)<<8)>>8) == (i) )
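isS24 tests whether a value survives truncation to 24 bits by shifting the low 24 bits up and sign-extending them back down. Hedged examples using the macro above (values illustrative):

    #include <assert.h>
    static void isS24_examples()
    {
        assert(isS24(0x7FFFFF));     /* largest signed 24-bit value */
        assert(isS24(-0x800000));    /* smallest */
        assert(!isS24(0x800000));    /* one past the top: truncates */
    }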
 
 #define alignTo(x,s)		((((uintptr_t)(x)))&~(((uintptr_t)s)-1))
 #define alignUp(x,s)		((((uintptr_t)(x))+(((uintptr_t)s)-1))&~(((uintptr_t)s)-1))
 
 #define pageTop(x)			( (int*)alignTo(x,NJ_PAGE_SIZE) )
 #define pageDataStart(x)    ( (int*)(alignTo(x,NJ_PAGE_SIZE) + sizeof(PageHeader)) )
 #define pageBottom(x)		( (int*)(alignTo(x,NJ_PAGE_SIZE)+NJ_PAGE_SIZE)-1 )
 #define samepage(x,y)		(pageTop(x) == pageTop(y))