b=449526, TM: fix up ARM code generation / softfloat
authorVladimir Vukicevic <vladimir@pobox.com>
Tue, 02 Sep 2008 22:29:23 -0700
changeset 18773 a38e9aa2307b700c44f41ac056c8f2791dd2a1eb
parent 18772 d02034acc88cd9cd7f279355e99e8f26998ac726
child 18774 659da061e40a9781901171a4635b5cf5f5eae9b3
push id: 1711
push user: brendan@mozilla.com
push date: Thu, 04 Sep 2008 08:26:45 +0000
treeherder: autoland@1431bbddb5de [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
bugs: 449526
milestone: 1.9.1b1pre
b=449526, TM: fix up ARM code generation / softfloat
js/src/builtins.tbl
js/src/jsbuiltins.cpp
js/src/jstracer.cpp
js/src/jstracer.h
js/src/nanojit/Assembler.cpp
js/src/nanojit/LIR.cpp
js/src/nanojit/NativeARM.h
js/src/nanojit/NativeThumb.cpp
js/src/nanojit/nanojit.h
--- a/js/src/builtins.tbl
+++ b/js/src/builtins.tbl
@@ -93,11 +93,25 @@ BUILTIN2(TypeOfBoolean,         LO,     
 BUILTIN2(NumberToString,        LO,     F,  P,  JSString*, JSContext*, jsdouble, 1, 1)
 BUILTIN3(Object_p_hasOwnProperty,
                                 LO, LO, LO, LO, jsint,     JSContext*, JSObject*, JSString*, 0, 0)
 BUILTIN3(Object_p_propertyIsEnumerable,
                                 LO, LO, LO, LO, jsint,     JSContext*, JSObject*, JSString*, 0, 0)
 BUILTIN2(BooleanToNumber,       LO, LO, F,      jsdouble,  JSContext*, jsint, 1, 1)
 BUILTIN2(ObjectToString,        LO,     LO, P,  JSString*, JSContext*, JSObject*, 0, 0)
 BUILTIN3(Array_1int,            LO, LO, LO, P,  JSObject*, JSContext*, JSObject*, jsint, 0, 0)
+
+// soft float
+BUILTIN1(fneg,   F,       F,  jsdouble, jsdouble, 1, 1)
+BUILTIN1(i2f,    LO,      F,  jsdouble, jsint,    1, 1)
+BUILTIN1(u2f,    LO,      F,  jsdouble, jsuint,   1, 1)
+BUILTIN2(fcmpeq, F,   F,  LO, jsint,    jsdouble, jsdouble, 1, 1)
+BUILTIN2(fcmplt, F,   F,  LO, jsint,    jsdouble, jsdouble, 1, 1)
+BUILTIN2(fcmple, F,   F,  LO, jsint,    jsdouble, jsdouble, 1, 1)
+BUILTIN2(fcmpgt, F,   F,  LO, jsint,    jsdouble, jsdouble, 1, 1)
+BUILTIN2(fcmpge, F,   F,  LO, jsint,    jsdouble, jsdouble, 1, 1)
+BUILTIN2(fmul,   F,   F,  F,  jsdouble, jsdouble, jsdouble, 1, 1)
+BUILTIN2(fadd,   F,   F,  F,  jsdouble, jsdouble, jsdouble, 1, 1)
+BUILTIN2(fdiv,   F,   F,  F,  jsdouble, jsdouble, jsdouble, 1, 1)
+BUILTIN2(fsub,   F,   F,  F,  jsdouble, jsdouble, jsdouble, 1, 1)
 BUILTIN3(Array_1str,            LO, LO, LO, P,  JSObject*, JSContext*, JSObject*, JSString*, 0, 0)
 BUILTIN4(Array_2obj,            LO, LO, LO, LO, P, JSObject*, JSContext*, JSObject*, JSObject*, JSObject**, 0, 0)
 BUILTIN5(Array_3num,            LO, LO, F, F, F, P, JSObject*, JSContext*, JSObject*, jsdouble, jsdouble, jsdouble, 0, 0)
--- a/js/src/jsbuiltins.cpp
+++ b/js/src/jsbuiltins.cpp
@@ -714,16 +714,89 @@ js_Array_3num(JSContext* cx, JSObject* p
         if (!js_NewDoubleInRootedValue(cx, n1, ++newslots))
             return NULL;
         if (!js_NewDoubleInRootedValue(cx, n2, ++newslots))
             return NULL;
         if (!js_NewDoubleInRootedValue(cx, n3, ++newslots))
             return NULL;)
 }
 
+/* soft float */
+
+jsdouble FASTCALL
+js_fneg(jsdouble x)
+{
+    return -x;
+}
+
+jsdouble FASTCALL
+js_i2f(jsint i)
+{
+    return i;
+}
+
+jsdouble FASTCALL
+js_u2f(jsuint u)
+{
+    return u;
+}
+
+jsint FASTCALL
+js_fcmpeq(jsdouble x, jsdouble y)
+{
+    return x==y;
+}
+
+jsint FASTCALL
+js_fcmplt(jsdouble x, jsdouble y)
+{
+    return x < y;
+}
+
+jsint FASTCALL
+js_fcmple(jsdouble x, jsdouble y)
+{
+    return x <= y;
+}
+
+jsint FASTCALL
+js_fcmpgt(jsdouble x, jsdouble y)
+{
+    return x > y;
+}
+
+jsint FASTCALL
+js_fcmpge(jsdouble x, jsdouble y)
+{
+    return x >= y;
+}
+
+jsdouble FASTCALL
+js_fmul(jsdouble x, jsdouble y)
+{
+    return x * y;
+}
+jsdouble FASTCALL
+js_fadd(jsdouble x, jsdouble y)
+{
+    return x + y;
+}
+
+jsdouble FASTCALL
+js_fdiv(jsdouble x, jsdouble y)
+{
+    return x / y;
+}
+
+jsdouble FASTCALL
+js_fsub(jsdouble x, jsdouble y)
+{
+    return x - y;
+}
+
 #define LO ARGSIZE_LO
 #define F  ARGSIZE_F
 #define Q  ARGSIZE_Q
 
 #if defined AVMPLUS_64BIT
 #define P	ARGSIZE_Q
 #else
 #define P	ARGSIZE_LO
--- a/js/src/jstracer.cpp
+++ b/js/src/jstracer.cpp
@@ -1,9 +1,9 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
  * vim: set ts=4 sw=4 et tw=99:
  *
  * ***** BEGIN LICENSE BLOCK *****
  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  *
  * The contents of this file are subject to the Mozilla Public License Version
  * 1.1 (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
@@ -272,41 +272,88 @@ Oracle::isStackSlotUndemotable(JSScript*
 
 /* Clear the oracle. */
 void
 Oracle::clear()
 {
     _dontDemote.reset();
 }
 
+static bool isi2f(LInsp i)
+{
+    if (i->isop(LIR_i2f))
+        return true;
+
+#ifdef NANOJIT_ARM
+    if (i->isop(LIR_qjoin) &&
+        i->oprnd1()->isop(LIR_call) &&
+        i->oprnd2()->isop(LIR_callh))
+    {
+        if (i->oprnd1()->imm8() == F_i2f)
+            return true;
+    }
+#endif
+
+    return false;
+}
+
+static bool isu2f(LInsp i)
+{
+    if (i->isop(LIR_u2f))
+        return true;
+
+#ifdef NANOJIT_ARM
+    if (i->isop(LIR_qjoin) &&
+        i->oprnd1()->isop(LIR_call) &&
+        i->oprnd2()->isop(LIR_callh))
+    {
+        if (i->oprnd1()->imm8() == F_u2f)
+            return true;
+    }
+#endif
+
+    return false;
+}
+
+static LInsp iu2fArg(LInsp i)
+{
+#ifdef NANOJIT_ARM
+    if (i->isop(LIR_qjoin))
+        return i->oprnd1()->arg(0);
+#endif
+
+    return i->oprnd1();
+}
+
+
 static LIns* demote(LirWriter *out, LInsp i)
 {
     if (i->isCall())
         return callArgN(i, 0);
-    if (i->isop(LIR_i2f) || i->isop(LIR_u2f))
-        return i->oprnd1();
+    if (isi2f(i) || isu2f(i))
+        return iu2fArg(i);
     if (i->isconst())
         return i;
     AvmAssert(i->isconstq());
     double cf = i->constvalf();
     int32_t ci = cf > 0x7fffffff ? uint32_t(cf) : int32_t(cf);
     return out->insImm(ci);
 }
 
 static bool isPromoteInt(LIns* i)
 {
     jsdouble d;
-    return i->isop(LIR_i2f) || i->isconst() ||
+    return isi2f(i) || i->isconst() ||
         (i->isconstq() && ((d = i->constvalf()) == (jsdouble)(jsint)d) && !JSDOUBLE_IS_NEGZERO(d));
 }
 
 static bool isPromoteUint(LIns* i)
 {
     jsdouble d;
-    return i->isop(LIR_u2f) || i->isconst() ||
+    return isu2f(i) || i->isconst() ||
         (i->isconstq() && ((d = i->constvalf()) == (jsdouble)(jsuint)d));
 }
 
 static bool isPromote(LIns* i)
 {
     return isPromoteInt(i) || isPromoteUint(i);
 }
 
@@ -319,16 +366,102 @@ static bool overflowSafe(LIns* i)
 {
     LIns* c;
     return (i->isop(LIR_and) && ((c = i->oprnd2())->isconst()) &&
             ((c->constval() & 0xc0000000) == 0)) ||
            (i->isop(LIR_rsh) && ((c = i->oprnd2())->isconst()) &&
             ((c->constval() > 0)));
 }
 
+#ifdef NANOJIT_ARM
+
+class SoftFloatFilter: public LirWriter
+{
+public:
+    SoftFloatFilter(LirWriter* out):
+        LirWriter(out)
+    {
+    }
+
+    LInsp quadCall(uint32_t fid, LInsp args[]) {
+        LInsp qlo, qhi;
+
+        qlo = out->insCall(fid, args);
+        qhi = out->ins1(LIR_callh, qlo);
+        return out->qjoin(qlo, qhi);
+    }
+
+    LInsp ins1(LOpcode v, LInsp s0)
+    {
+        if (v == LIR_fneg)
+            return quadCall(F_fneg, &s0);
+
+        if (v == LIR_i2f)
+            return quadCall(F_i2f, &s0);
+
+        if (v == LIR_u2f)
+            return quadCall(F_u2f, &s0);
+
+        return out->ins1(v, s0);
+    }
+
+    LInsp ins2(LOpcode v, LInsp s0, LInsp s1)
+    {
+        LInsp args[2];
+        LInsp bv;
+
+        // change the numeric value and order of these LIR opcodes and die
+        if (LIR_fadd <= v && v <= LIR_fdiv) {
+            static uint32_t fmap[] = { F_fadd, F_fsub, F_fmul, F_fdiv };
+
+            args[0] = s1;
+            args[1] = s0;
+
+            return quadCall(fmap[v - LIR_fadd], args);
+        }
+
+        if (LIR_feq <= v && v <= LIR_fge) {
+            static uint32_t fmap[] = { F_fcmpeq, F_fcmplt, F_fcmpgt, F_fcmple, F_fcmpge };
+
+            args[0] = s1;
+            args[1] = s0;
+
+            bv = out->insCall(fmap[v - LIR_feq], args);
+            return out->ins2(LIR_eq, bv, out->insImm(1));
+        }
+
+        // not really a softfloat filter, but needed on ARM --
+        // arm doesn't mask shifts to 31 like x86 does
+        if (v == LIR_lsh ||
+            v == LIR_rsh ||
+            v == LIR_ush)
+        {
+            if (s1->isconst())
+                s1->setimm16(s1->constval() & 31);
+            else
+                s1 = out->ins2(LIR_and, s1, out->insImm(31));
+            return out->ins2(v, s0, s1);
+        }
+
+        return out->ins2(v, s0, s1);
+    }
+
+    LInsp insCall(uint32_t fid, LInsp args[])
+    {
+        // if the return type is ARGSIZE_F, we have
+        // to do a quadCall ( qjoin(call,callh) )
+        if ((builtins[fid]._argtypes & 3) == ARGSIZE_F)
+            return quadCall(fid, args);
+
+        return out->insCall(fid, args);
+    }
+};
+
+#endif
+
 class FuncFilter: public LirWriter
 {
     TraceRecorder& recorder;
 public:
     FuncFilter(LirWriter* out, TraceRecorder& _recorder):
         LirWriter(out), recorder(_recorder)
     {
     }
@@ -422,34 +555,33 @@ public:
 
     LInsp insCall(uint32_t fid, LInsp args[])
     {
         LInsp s0 = args[0];
         switch (fid) {
           case F_DoubleToUint32:
             if (s0->isconstq())
                 return out->insImm(js_DoubleToECMAUint32(s0->constvalf()));
-            if (s0->isop(LIR_i2f) || s0->isop(LIR_u2f)) {
-                return s0->oprnd1();
-            }
+            if (isi2f(s0) || isu2f(s0))
+                return iu2fArg(s0);
             break;
           case F_DoubleToInt32:
             if (s0->isconstq())
                 return out->insImm(js_DoubleToECMAInt32(s0->constvalf()));
             if (s0->isop(LIR_fadd) || s0->isop(LIR_fsub) || s0->isop(LIR_fmul)) {
                 LInsp lhs = s0->oprnd1();
                 LInsp rhs = s0->oprnd2();
                 if (isPromote(lhs) && isPromote(rhs)) {
                     LOpcode op = LOpcode(s0->opcode() & ~LIR64);
                     return out->ins2(op, demote(out, lhs), demote(out, rhs));
                 }
             }
-            if (s0->isop(LIR_i2f) || s0->isop(LIR_u2f)) {
-                return s0->oprnd1();
-            }
+            if (isi2f(s0) || isu2f(s0))
+                return iu2fArg(s0);
+            // XXX ARM -- check for qjoin(call(F_UnboxDouble),call(F_UnboxDouble))
             if (s0->isCall() && s0->fid() == F_UnboxDouble) {
                 LIns* args2[] = { callArgN(s0, 0) };
                 return out->insCall(F_UnboxInt32, args2);
             }
             if (s0->isCall() && s0->fid() == F_StringToNumber) {
                 // callArgN's ordering is that as seen by the builtin, not as stored in args here.
                 // True story!
                 LIns* args2[] = { callArgN(s0, 1), callArgN(s0, 0) };
@@ -684,16 +816,19 @@ TraceRecorder::TraceRecorder(JSContext* 
                         js_PCToLineNumber(cx, cx->fp->script, cx->fp->regs->pc),
                         cx->fp->regs->pc - cx->fp->script->code););
 
     lir = lir_buf_writer = new (&gc) LirBufWriter(lirbuf);
 #ifdef DEBUG
     if (verbose_debug)
         lir = verbose_filter = new (&gc) VerboseWriter(&gc, lir, lirbuf->names);
 #endif
+#ifdef NANOJIT_ARM
+    lir = float_filter = new (&gc) SoftFloatFilter(lir);
+#endif
     lir = cse_filter = new (&gc) CseFilter(lir, &gc);
     lir = expr_filter = new (&gc) ExprFilter(lir);
     lir = func_filter = new (&gc) FuncFilter(lir, *this);
     lir->ins0(LIR_trace);
 
     if (!nanojit::AvmCore::config.tree_opt || fragment->root == fragment) {
         lirbuf->state = addName(lir->insParam(0), "state");
         lirbuf->param1 = addName(lir->insParam(1), "param1");
@@ -727,16 +862,19 @@ TraceRecorder::~TraceRecorder()
     if (trashTree)
         js_TrashTree(cx, whichTreeToTrash);
 #ifdef DEBUG
     delete verbose_filter;
 #endif
     delete cse_filter;
     delete expr_filter;
     delete func_filter;
+#ifdef NANOJIT_ARM
+    delete float_filter;
+#endif
     delete lir_buf_writer;
 }
 
 /* Add debug information to a LIR instruction as we emit it. */
 inline LIns*
 TraceRecorder::addName(LIns* ins, const char* name)
 {
 #ifdef DEBUG
@@ -1430,31 +1568,31 @@ TraceRecorder::guard(bool expected, LIns
    values flowing into the loop edge is compatible with the type we expect in the loop header. */
 bool
 TraceRecorder::checkType(jsval& v, uint8 t, bool& unstable)
 {
     if (t == JSVAL_INT) { /* initially all whole numbers cause the slot to be demoted */
         if (!isNumber(v))
             return false; /* not a number? type mismatch */
         LIns* i = get(&v);
-        if (!i->isop(LIR_i2f)) {
+        if (!isi2f(i)) {
             debug_only_v(printf("int slot is !isInt32, slot #%d, triggering re-compilation\n",
                                 !isGlobal(&v)
                                 ? nativeStackOffset(&v)
                                 : nativeGlobalOffset(&v)););
             AUDIT(slotPromoted);
             unstable = true;
             return true; /* keep checking types, but request re-compilation */
         }
         /* Looks good, slot is an int32, the last instruction should be i2f. */
-        JS_ASSERT(isInt32(v) && i->isop(LIR_i2f));
+        JS_ASSERT(isInt32(v) && (i->isop(LIR_i2f) || i->isop(LIR_qjoin)));
         /* We got the final LIR_i2f as we expected. Overwrite the value in that
            slot with the argument of i2f since we want the integer store to flow along
            the loop edge, not the casted value. */
-        set(&v, i->oprnd1());
+        set(&v, iu2fArg(i));
         return true;
     }
     if (t == JSVAL_DOUBLE) {
         if (!isNumber(v))
             return false; /* not a number? type mismatch */
         LIns* i = get(&v);
         /* We sink i2f conversions into the side exit, but at the loop edge we have to make
            sure we promote back to double if at loop entry we want a double. */
@@ -2075,20 +2213,21 @@ js_ExecuteTree(JSContext* cx, Fragment**
     }
     JS_ASSERT(f->vmprivate);
 
     AUDIT(traceTriggered);
 
     /* execute previously recorded trace */
     TreeInfo* ti = (TreeInfo*)f->vmprivate;
 
-    debug_only_v(printf("entering trace at %s:%u@%u, native stack slots: %u\n",
+    debug_only_v(printf("entering trace at %s:%u@%u, native stack slots: %u code: %p\n",
                         cx->fp->script->filename,
                         js_PCToLineNumber(cx, cx->fp->script, cx->fp->regs->pc),
-                        cx->fp->regs->pc - cx->fp->script->code, ti->maxNativeStackSlots););
+                        cx->fp->regs->pc - cx->fp->script->code, ti->maxNativeStackSlots,
+                        f->code()););
 
     JSTraceMonitor* tm = &JS_TRACE_MONITOR(cx);
     unsigned ngslots = tm->globalSlots->length();
     uint16* gslots = tm->globalSlots->data();
     JSObject* globalObj = JS_GetGlobalForObject(cx, cx->fp->scopeChain);
     unsigned globalFrameSize = STOBJ_NSLOTS(globalObj);
     double* global = (double*)alloca((globalFrameSize+1) * sizeof(double));
     debug_only(*(uint64*)&global[globalFrameSize] = 0xdeadbeefdeadbeefLL;)
--- a/js/src/jstracer.h
+++ b/js/src/jstracer.h
@@ -216,16 +216,19 @@ class TraceRecorder {
     TreeInfo*               treeInfo;
     nanojit::LirBuffer*     lirbuf;
     nanojit::LirWriter*     lir;
     nanojit::LirBufWriter*  lir_buf_writer;
     nanojit::LirWriter*     verbose_filter;
     nanojit::LirWriter*     cse_filter;
     nanojit::LirWriter*     expr_filter;
     nanojit::LirWriter*     func_filter;
+#ifdef NANOJIT_ARM
+    nanojit::LirWriter*     float_filter;
+#endif
     nanojit::LIns*          cx_ins;
     nanojit::LIns*          gp_ins;
     nanojit::LIns*          eos_ins;
     nanojit::LIns*          eor_ins;
     nanojit::LIns*          rval_ins;
     nanojit::LIns*          inner_sp_ins;
     nanojit::SideExit       exit;
     bool                    trashTree;
--- a/js/src/nanojit/Assembler.cpp
+++ b/js/src/nanojit/Assembler.cpp
@@ -783,48 +783,39 @@ namespace nanojit
 			//fprintf(stderr, "endAssembly frag %X entry %X\n", (int)frag, (int)frag->fragEntry);
 		}
 		
 		AvmAssertMsg(error() || _fpuStkDepth == 0, ("_fpuStkDepth %d\n",_fpuStkDepth));
 
 		internalReset();  // clear the reservation tables and regalloc
 		NanoAssert(_branchStateMap->isEmpty());
 		_branchStateMap = 0;
-		
-		#if defined(UNDER_CE)
+
+#ifdef AVMPLUS_ARM
 		// If we've modified the code, we need to flush so we don't end up trying 
 		// to execute junk
+# if defined(UNDER_CE)
 		FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
-		#elif defined(AVMPLUS_LINUX) && defined(AVMPLUS_ARM)
-			// N A S T Y - obviously have to fix this
-		// determine our page range
+# elif defined(AVMPLUS_LINUX)
+		// XXX fixme flush adjacent pages together
+		for (int i = 0; i < 2; i++) {
+			Page *p = (i == 0) ? _nativePages : _nativeExitPages;
 
-		Page *page=0, *first=0, *last=0;
-		for (int i=2;i!=0;i--) {
-			page = first = last = (i==2 ? _nativePages : _nativeExitPages);
-			while (page)
-			{
-				if (page<first)
-					first = page;
-				if (page>last)
-					last = page;
-				page = page->next;
+			while (p) {
+				flushCache((NIns*)p, (NIns*)((intptr_t)(p) + NJ_PAGE_SIZE));
+				p = p->next;
 			}
-	
-			register unsigned long _beg __asm("a1") = (unsigned long)(first);
-			register unsigned long _end __asm("a2") = (unsigned long)(last+NJ_PAGE_SIZE);
-			register unsigned long _flg __asm("a3") = 0;
-			register unsigned long _swi __asm("r7") = 0xF0002;
-			__asm __volatile ("swi 0 	@ sys_cacheflush" : "=r" (_beg) : "0" (_beg), "r" (_end), "r" (_flg), "r" (_swi));
 		}
-		#endif
-	#ifdef AVMPLUS_PORTING_API
+# endif
+#endif
+
+# ifdef AVMPLUS_PORTING_API
 		NanoJIT_PortAPI_FlushInstructionCache(_nIns, _endJit1Addr);
 		NanoJIT_PortAPI_FlushInstructionCache(_nExitIns, _endJit2Addr);
-	#endif
+# endif
 	}
 	
 	void Assembler::copyRegisters(RegAlloc* copyTo)
 	{
 		*copyTo = _allocator;
 	}
 
 	void Assembler::releaseRegisters()
@@ -856,17 +847,17 @@ namespace nanojit
 		 
 		for (LInsp ins = reader->read(); ins != 0 && !error(); ins = reader->read())
 		{
     		Reservation *rR = getresv(ins);
 			LOpcode op = ins->opcode();			
 			switch(op)
 			{
 				default:
-					NanoAssertMsg(false, "unsupported LIR instruction");
+					NanoAssertMsgf(false, ("unsupported LIR instruction: %d (~0x40: %d)\n",op, op&~LIR64));
 					break;
 					
 				case LIR_short:
 				case LIR_int:
 				{
 					Register rr = prepResultReg(ins, GpRegs);
 					int32_t val;
 					if (op == LIR_int)
@@ -1058,33 +1049,46 @@ namespace nanojit
 				case LIR_rsh:
 				case LIR_ush:
 				{
                     LInsp lhs = ins->oprnd1();
                     LInsp rhs = ins->oprnd2();
 
 					Register rb = UnknownReg;
 					RegisterMask allow = GpRegs;
-					if (lhs != rhs && (op == LIR_mul || !rhs->isconst()))
+					bool forceReg = (op == LIR_mul || !rhs->isconst());
+
+#ifdef NANOJIT_ARM
+					// Arm can't do an immediate op with immediates
+					// outside of +/-255 (for AND) or outside of
+					// 0..255 for others.
+					if (!forceReg)
+					{
+						if (rhs->isconst() && !isU8(rhs->constval()))
+							forceReg = true;
+					}
+#endif
+
+					if (lhs != rhs && forceReg)
 					{
 						if ((rb = asm_binop_rhs_reg(ins)) == UnknownReg) {
 							rb = findRegFor(rhs, allow);
 						}
 						allow &= ~rmask(rb);
 					}
 
 					Register rr = prepResultReg(ins, allow);
 					Reservation* rA = getresv(lhs);
 					Register ra;
 					// if this is last use of lhs in reg, we can re-use result reg
 					if (rA == 0 || (ra = rA->reg) == UnknownReg)
 						ra = findSpecificRegFor(lhs, rr);
 					// else, rA already has a register assigned.
 
-					if (!rhs->isconst() || op == LIR_mul)
+					if (forceReg)
 					{
 						if (lhs == rhs)
 							rb = ra;
 
 						if (op == LIR_add)
 							ADD(rr, rb);
 						else if (op == LIR_sub)
 							SUB(rr, rb);
--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@@ -1541,17 +1541,22 @@ namespace nanojit
 			formatImm(uint32_t(ref->constvalq()), buf);
 #endif
 		}
 		else if (ref->isconst()) {
 			formatImm(ref->constval(), buf);
 		}
 		else {
 			if (ref->isCall()) {
-				copyName(ref, _functions[ref->fid()]._name, funccounts.add(ref->fid()));
+				if (ref->isop(LIR_callh)) {
+					// we've presumably seen the other half already
+					ref = ref->oprnd1();
+				} else {
+					copyName(ref, _functions[ref->fid()]._name, funccounts.add(ref->fid()));
+				}
 			} else {
                 NanoAssert(ref->opcode() < sizeof(lirNames) / sizeof(lirNames[0]));
 				copyName(ref, lirNames[ref->opcode()], lircounts.add(ref->opcode()));
 			}
 			StringNullTerminatedUTF8 cname(gc, names.get(ref)->name);
 			strcat(buf, cname.c_str());
 		}
 		return labels->dup(buffer);
@@ -1647,26 +1652,31 @@ namespace nanojit
 			case LIR_ule:
 			case LIR_ugt:
 			case LIR_uge:
 			case LIR_feq:
 			case LIR_flt:
 			case LIR_fle:
 			case LIR_fgt:
 			case LIR_fge:
-			case LIR_qjoin:
             case LIR_qiadd:
             case LIR_qiand:
             case LIR_qilsh:
             case LIR_qior:
 				sprintf(s, "%s %s, %s", lirNames[op],
 					formatRef(i->oprnd1()), 
 					formatRef(i->oprnd2()));
 				break;
 
+			case LIR_qjoin:
+				sprintf(s, "%s (%s), %s", lirNames[op],
+					formatIns(i->oprnd1()), 
+ 					formatRef(i->oprnd2()));
+ 				break;
+
 			case LIR_qcmov:
 			case LIR_cmov:
                 sprintf(s, "%s ? %s : %s", 
 					formatRef(i->oprnd1()), 
 					formatRef(i->oprnd2()->oprnd1()), 
 					formatRef(i->oprnd2()->oprnd2()));
 				break;
 
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -144,21 +144,24 @@ namespace nanojit
 		counter_define(x87Top);
 
 	#define DECLARE_PLATFORM_REGALLOC()
 
 
 	#define DECLARE_PLATFORM_ASSEMBLER()\
 		const static Register argRegs[4], retRegs[2];\
 		void LD32_nochk(Register r, int32_t imm);\
+		void BL(NIns*);\
+		void BL_far(NIns*);\
 		void CALL(const CallInfo*);\
 		void underrunProtect(int bytes);\
 		bool has_cmov;\
 		void nativePageReset();\
 		void nativePageSetup();\
+		void flushCache(NIns*,NIns*);\
 		int* _nSlot;\
 		int* _nExitSlot;
 
 
     #define asm_farg(i) NanoAssert(false)
 
 	//printf("jmp_l_n count=%d, nins=%X, %X = %X\n", (_c), nins, _nIns, ((intptr_t)(nins+(_c))-(intptr_t)_nIns - 4) );
 
@@ -227,16 +230,17 @@ ShiftOperator;
 
 // _r = _r AND _imm
 #define ANDi(_r,_imm) do {\
 	if (isU8((_imm))) {\
 		underrunProtect(4);\
 		*(--_nIns) = (NIns)( COND_AL | OP_IMM | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) );\
 		asm_output2("and %s,%d",gpn(_r),(_imm));}\
 	else if ((_imm)<0 && (_imm)>-256) {\
+		underrunProtect(8);\
 		*(--_nIns) = (NIns)( COND_AL | ((_r)<<16) | ((_r)<<12) | (Scratch) );\
 		asm_output2("and %s,%s",gpn(_r),gpn(Scratch));\
 		*(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | ((Scratch)<<12) | (((_imm)^0xFFFFFFFF)&0xFF) );\
 		asm_output2("mvn %s,%d",gpn(Scratch),(_imm));}\
 	else NanoAssert(0);\
 	} while (0)
 
 
@@ -527,16 +531,17 @@ ShiftOperator;
 	} while(0)
 
 
 //#define RET()   underrunProtect(1); *(--_nIns) = 0xc3;	asm_output("ret")
 //#define NOP() 	underrunProtect(1); *(--_nIns) = 0x90;	asm_output("nop")
 //#define INT3()  underrunProtect(1); *(--_nIns) = 0xcc;  asm_output("int3")
 //#define RET() INT3()
 
+#define BKPT_nochk() do { *(--_nIns) = (NIns)( (0xE<<24) | (0x12<<20) | (0x7<<4) ); } while (0)
 
 // this is pushing a reg
 #define PUSHr(_r)  do {\
 	underrunProtect(4);\
 	*(--_nIns) = (NIns)( COND_AL | (0x92<<20) | (SP<<16) | (1<<(_r)) );	\
 	asm_output1("push %s",gpn(_r)); } while (0)
 
 // STMDB
@@ -559,59 +564,76 @@ ShiftOperator;
 	*(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (1<<(_r)) );\
 	asm_output1("pop %s",gpn(_r));} while (0)
 
 #define POP_mask(_mask) do {\
 	underrunProtect(4);			\
 	*(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (_mask) );\
 	asm_output1("pop %x", (_mask));} while (0)
 
-// takes an offset (right?)
-#define JMP_long_nochk_offset(_off) do {\
-	*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | (((_off)>>2) & 0xFFFFFF) );	\
-	asm_output1("jmp_l_n 0x%08x",(_off));} while (0)
-
-// take an address, not an offset
-#define JMP(t)	do {\
-	underrunProtect(4);\
-	intptr_t tt = (intptr_t)(t) - ((intptr_t)_nIns + 4);\
-	*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | (((tt)>>2) & 0xFFFFFF) );	\
-	asm_output1("JMP 0x%08x\n",(unsigned int)(t)); } while (0)
-
-#define JMP_nochk(t)	do {\
-	intptr_t tt = (intptr_t)(t) - ((intptr_t)_nIns + 4);\
-	*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | (((tt)>>2) & 0xFFFFFF) );	\
-	asm_output1("JMP 0x%08x\n",(unsigned int)(t)); } while (0)
-
-#define JMP_long_placeholder()	do {JMP_long(0xffffffff); } while(0)
+#define PC_OFFSET_FROM(target,frompc) ((intptr_t)(target) - ((intptr_t)(frompc) + 8))
+#define JMP_S24_OFFSET_OK(offs) ((-(1<<24)) <= (offs) && (offs) < (1<<24))
 
-#define JMP_long(_t)	do {\
-	underrunProtect(4);\
-	*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | (((_t)>>2) & 0xFFFFFF) );	\
-	asm_output1("JMP_long 0x%08x\n", (unsigned int)(_t) ); } while (0)
-
-#define BL(_t)	do {\
-	underrunProtect(4);\
-	intptr_t _tt = (intptr_t)(_t) - ((intptr_t)_nIns + 4);\
-	*(--_nIns) = (NIns)( COND_AL | (0xB<<24) | (((_tt)>>2) & 0xFFFFFF) );	\
-	asm_output2("BL 0x%08x offset=%d",(intptr_t)(_nIns) + (_tt),(_tt)) } while (0)
-
+// (XXX This ought to be a function instead of a macro)
+//
+// Branch to target address _t with condition _c, doing underrun
+// checks (_chk == 1) or skipping them (_chk == 0).
+//
+// If the jump fits in a relative jump (+/-32MB), emit that.
+// If the jump is unconditional, emit the dest address inline in
+// the instruction stream and load it into pc.
+// If the jump has a condition, but noone's mucked with _nIns and our _nSlot
+// pointer is valid, stick the constant in the slot and emit a conditional
+// load into pc.
+// Otherwise, emit the conditional load into pc from a nearby constant,
+// and emit a jump to jump over it in case the condition fails.
+//
+// NB: JMP_nochk depends on this not calling samepage() when _c == AL
+#define B_cond_chk(_c,_t,_chk) do {										\
+		int32 offs = PC_OFFSET_FROM(_t,(intptr_t)(_nIns)-4);			\
+		if (JMP_S24_OFFSET_OK(offs)) {									\
+			if(_chk) underrunProtect(4);								\
+			*(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | (((offs)>>2) & 0xFFFFFF) ); \
+		} else if (_c == AL) {											\
+			if(_chk) underrunProtect(8);								\
+			*(--_nIns) = (NIns)(_t);									\
+			*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | 0x4 ); \
+		} else if (samepage(_nIns,_nSlot)) {							\
+			if(_chk) underrunProtect(8);								\
+			*(++_nSlot) = (NIns)(_t);									\
+			offs = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4);			\
+			NanoAssert(offs < 0);										\
+			*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | ((-offs) & 0xFFFFFF) ); \
+		} else {														\
+			if(_chk) underrunProtect(24);								\
+			*(--_nIns) = (NIns)(_t);									\
+			*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((-4)>>2) & 0xFFFFFF ); \
+			*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 ); \
+		}																\
+		asm_output2("%s %p\n", _c == AL ? "jmp" : "b(cnd)", (void*)(_t)); \
+	} while(0)
 
-#define JMP_long_nochk(_t)	do {\
-	intptr_t tt = (intptr_t)(_t) - ((intptr_t)_nIns + 4);\
-	*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | (((tt)>>2) & 0xFFFFFF) );	\
-	asm_output1("JMP_l_n 0x%08x\n", (unsigned int)(_t)) } while (0)
+#define B_cond(_c,_t) \
+	B_cond_chk(_c,_t,1)
 
+// NB: don't use COND_AL here, we shift the condition into place!
+#define JMP(_t) \
+	B_cond_chk(AL,_t,1)
 
-#define B_cond(_c,_t)\
-	underrunProtect(4);\
-	intptr_t tt = (intptr_t)(_t) - ((intptr_t)_nIns + 4);\
-	*(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | ((tt >>2)& 0xFFFFFF) );	\
-	asm_output2("b(cond) 0x%08x (%tX)",(unsigned int)(_t), tt);
+#define JMP_nochk(_t) \
+	B_cond_chk(AL,_t,0)
 
+// emit a placeholder that will be filled in later by nPatchBranch;
+// emit two breakpoint instructions in case something goes wrong with
+// the patching.
+#define JMP_long_placeholder()	do {							\
+		underrunProtect(8);										\
+		BKPT_nochk();											\
+		BKPT_nochk();											\
+	} while(0)
 
 #define JA(t)	do {B_cond(HI,t); asm_output1("ja 0x%08x",(unsigned int)t); } while(0)
 #define JNA(t)	do {B_cond(LS,t); asm_output1("jna 0x%08x",(unsigned int)t); } while(0)
 #define JB(t)	do {B_cond(CC,t); asm_output1("jb 0x%08x",(unsigned int)t); } while(0)
 #define JNB(t)	do {B_cond(CS,t); asm_output1("jnb 0x%08x",(unsigned int)t); } while(0)
 #define JE(t)	do {B_cond(EQ,t); asm_output1("je 0x%08x",(unsigned int)t); } while(0)
 #define JNE(t)	do {B_cond(NE,t); asm_output1("jne 0x%08x",(unsigned int)t); } while(0)						
 #define JBE(t)	do {B_cond(LS,t); asm_output1("jbe 0x%08x",(unsigned int)t); } while(0)
--- a/js/src/nanojit/NativeThumb.cpp
+++ b/js/src/nanojit/NativeThumb.cpp
@@ -143,17 +143,24 @@ namespace nanojit
 		{
 			JMP(frag->fragEntry);
 			lr = 0;
 		}
 		else
 		{
 			// target doesn't exit yet.  emit jump to epilog, and set up to patch later.
 			lr = placeGuardRecord(guard);
-            BL(_epilogue);
+
+#ifdef NJ_THUMB_JIT
+			BL(_epilogue);
+#else
+			// we need to know that there's an extra immediate value available
+			// for us; always force a far jump here.
+			BL_far(_epilogue);
+#endif
 
 			lr->jmp = _nIns;
 		}
 
 		// pop the stack frame first
 		MR(SP, FRAME_PTR);
 
         #ifdef NJ_VERBOSE
@@ -191,27 +198,49 @@ namespace nanojit
 		POP_mask(savingMask); // regs
 		return _nIns;
 #endif
 	}
 	
 	void Assembler::asm_call(LInsp ins)
 	{
         const CallInfo* call = callInfoFor(ins->fid());
+		uint32_t atypes = call->_argtypes;
+		uint32_t roffset = 0;
+
+		// we need to detect if we have arg0 as LO followed by arg1 as F;
+		// in that case, we need to skip using r1 -- the F needs to be
+		// loaded in r2/r3, at least according to the ARM EABI and gcc 4.2's
+		// generated code.
+		bool arg0IsInt32FollowedByFloat = false;
+		while ((atypes & 3) != ARGSIZE_NONE) {
+			if (((atypes >> 4) & 3) == ARGSIZE_LO &&
+				((atypes >> 2) & 3) == ARGSIZE_F &&
+				((atypes >> 6) & 3) == ARGSIZE_NONE)
+			{
+				arg0IsInt32FollowedByFloat = true;
+				break;
+			}
+			atypes >>= 2;
+		}
+
 		CALL(call);
         ArgSize sizes[10];
         uint32_t argc = call->get_sizes(sizes);
 		for(uint32_t i=0; i < argc; i++)
 		{
             uint32_t j = argc - i - 1;
             ArgSize sz = sizes[j];
             NanoAssert(sz == ARGSIZE_LO || sz == ARGSIZE_Q);
     		// pre-assign registers R0-R3 for arguments (if they fit)
-            Register r = i < 4 ? argRegs[i] : UnknownReg;
+            Register r = (i+roffset) < 4 ? argRegs[i+roffset] : UnknownReg;
             asm_arg(sz, ins->arg(j), r);
+
+			if (i == 0 && arg0IsInt32FollowedByFloat)
+				roffset = 1;
 		}
 	}
 	
 	void Assembler::nMarkExecute(Page* page, int32_t count, bool enable)
 	{
 	#ifdef UNDER_CE
 		DWORD dwOld;
 		VirtualProtect(page, NJ_PAGE_SIZE, PAGE_EXECUTE_READWRITE, &dwOld);
@@ -272,29 +301,38 @@ namespace nanojit
 	}
 
 	void Assembler::nPatchBranch(NIns* branch, NIns* target)
 	{
 		// Patch the jump in a loop
 
 		// This is ALWAYS going to be a long branch (using the BL instruction)
 		// Which is really 2 instructions, so we need to modify both
+		// XXX -- this is B, not BL, at least on non-Thumb..
 
 		// branch+2 because PC is always 2 instructions ahead on ARM/Thumb
 		int32_t offset = int(target) - int(branch+2);
 
-//printf("---patching branch at %X to location %X (%d)\n", branch, target, offset);
+		//printf("---patching branch at 0x%08x to location 0x%08x (%d-0x%08x)\n", branch, target, offset, offset);
 
 #ifdef NJ_THUMB_JIT
 		NanoAssert(-(1<<21) <= offset && offset < (1<<21)); 
 		*branch++ = (NIns)(0xF000 | (offset>>12)&0x7FF);
 		*branch =   (NIns)(0xF800 | (offset>>1)&0x7FF);
 #else
-		// ARM goodness, using unconditional B
-		*branch = (NIns)( COND_AL | (0xA<<24) | ((offset >>2)& 0xFFFFFF) );
+		// We have 2 words to work with here -- if offset is in range of a 24-bit
+		// relative jump, emit that; otherwise, we do a pc-relative load into pc.
+		if (-(1<<24) <= offset && offset < (1<<24)) {
+			// ARM goodness, using unconditional B
+			*branch = (NIns)( COND_AL | (0xA<<24) | ((offset >>2) & 0xFFFFFF) );
+		} else {
+			// LDR pc,[pc]
+			*branch++ = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | ( 0x004 ) );
+			*branch = (NIns)target;
+		}
 #endif
 	}
 
 	RegisterMask Assembler::hint(LIns* i, RegisterMask allow /* = ~0 */)
 	{
 		uint32_t op = i->opcode();
 		int prefer = ~0;
 
@@ -446,47 +484,16 @@ namespace nanojit
 			LD(Scratch,disp(rA),FP);
 		}
 		else
 		{
 			ST(SP,0,rA->reg);
 		}
 	}
 
-	NIns* Assembler::asm_adjustBranch(NIns* at, NIns* target)
-	{
-		NIns* save = _nIns;
-#ifdef NJ_THUMB_JIT
-		NIns* was =  (NIns*) (((((*(at+2))&0x7ff)<<12) | (((*(at+1))&0x7ff)<<1)) + (at-2+2));
-		_nIns = at + 2;
-#else
-		NIns* was = (NIns*) (((*at&0xFFFFFF)<<2));
-	    _nIns = at + 1;
-#endif
-		BL(target);
-	#ifdef AVMPLUS_PORTING_API
-		NanoJIT_PortAPI_FlushInstructionCache(save, _nIns);
-	#endif
-                  
-		#if defined(UNDER_CE)
- 		// we changed the code, so we need to do this (sadly)
- 			FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
-		#elif defined(AVMPLUS_LINUX)
-			// Just need to clear this one page (not even the whole page really)
-			//Page *page = (Page*)pageTop(_nIns);
-			register unsigned long _beg __asm("a1") = (unsigned long)(_nIns);
-			register unsigned long _end __asm("a2") = (unsigned long)(_nIns+2);
-			register unsigned long _flg __asm("a3") = 0;
-			register unsigned long _swi __asm("r7") = 0xF0002;
-			__asm __volatile ("swi 0 	@ sys_cacheflush" : "=r" (_beg) : "0" (_beg), "r" (_end), "r" (_flg), "r" (_swi));
-		#endif
-		_nIns = save;
-		return was;
-	}
-
 	void Assembler::nativePageReset()
 	{
 		#ifdef NJ_THUMB_JIT
 			_nPool = 0;
 			_nSlot = 0;
 			_nExitPool = 0;
 			_nExitSlot = 0;
 		#else
@@ -516,46 +523,81 @@ namespace nanojit
 			// Move _nIns to the top of the pool
 			_nIns = (NIns*)_nPool;
 
 			// no branch needed since this follows the epilogue
         }
 		#else
 		if (!_nSlot)
 		{
-			// This needs to be done or the samepage macro gets confused
+			// This needs to be done or the samepage macro gets confused; pageAlloc
+			// gives us a pointer to just past the end of the page.
 			_nIns--;
 			_nExitIns--;
 
 			// constpool starts at top of page and goes down,
 			// code starts at bottom of page and moves up
-			_nSlot = (int*)(pageTop(_nIns)+1);
-
+			_nSlot = pageDataStart(_nIns); //(int*)(&((Page*)pageTop(_nIns))->lir[0]);
 		}
 		#endif
 	}
 
+	void Assembler::flushCache(NIns* n1, NIns* n2) {
+#if defined(UNDER_CE)
+ 		// we changed the code, so we need to do this (sadly)
+		FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
+#elif defined(AVMPLUS_LINUX)
+		// Just need to clear this one page (not even the whole page really)
+		//Page *page = (Page*)pageTop(_nIns);
+		register unsigned long _beg __asm("a1") = (unsigned long)(n1);
+		register unsigned long _end __asm("a2") = (unsigned long)(n2);
+		register unsigned long _flg __asm("a3") = 0;
+		register unsigned long _swi __asm("r7") = 0xF0002;
+		__asm __volatile ("swi 0 	@ sys_cacheflush" : "=r" (_beg) : "0" (_beg), "r" (_end), "r" (_flg), "r" (_swi));
+#endif
+	}
 
 #ifdef NJ_THUMB_JIT
 
+	NIns* Assembler::asm_adjustBranch(NIns* at, NIns* target)
+	{
+		NIns* save = _nIns;
+		NIns* was =  (NIns*) (((((*(at+2))&0x7ff)<<12) | (((*(at+1))&0x7ff)<<1)) + (at-2+2));
+
+		_nIns = at + 2;
+		BL(target);
+
+		flushCache(_nIns, _nIns+2);
+
+#ifdef AVMPLUS_PORTING_API
+		// XXX save.._nIns+2? really?
+		NanoJIT_PortAPI_FlushInstructionCache(save, _nIns+2);
+#endif
+		
+		_nIns = save;
+
+		return was;
+	}
+
 	void Assembler::STi(Register b, int32_t d, int32_t v)
 	{
 		ST(b, d, Scratch);
 		LDi(Scratch, v);
 	}
 
 	bool isB11(NIns *target, NIns *cur)
 	{
 		NIns *br_base = (cur-1)+2;
 		int br_off = int(target) - int(br_base);
 		return (-(1<<11) <= br_off && br_off < (1<<11));
 	}
 
 	void Assembler::underrunProtect(int bytes)
 	{
+		// perhaps bytes + sizeof(PageHeader)/sizeof(NIns) + 4 ?
 		intptr_t u = bytes + 4;
 		if (!samepage(_nIns-u, _nIns-1)) {
 			NIns* target = _nIns;
 			_nIns = pageAlloc(_inExit);
 			// might be able to do a B instead of BL (save an instruction)
 			if (isB11(target, _nIns))
 			{
 				NIns *br_base = (_nIns-1)+2;
@@ -850,55 +892,104 @@ namespace nanojit
 			*(--_nIns) = (NIns)(0x4700 | (IP<<3));
 			*(--_nIns) = (NIns)(0xE000 | (4>>1));
 			*(--_nIns) = (NIns)(0x4800 | (Scratch<<8) | (1));
 			asm_output2("call %08X:%s", addr, ci->_name);
 		}
 	}
 
 #else // ARM_JIT
-		void Assembler::underrunProtect(int bytes)
+	NIns* Assembler::asm_adjustBranch(NIns* at, NIns* target)
+	{
+		// This always got emitted as a BL_far sequence; at points
+		// to the first of 4 instructions.  Ensure that we're where
+		// we think we were..
+		NanoAssert(at[1] == (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) ));
+		NanoAssert(at[2] == (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) ));
+
+		NIns* was = (NIns*) at[3];
+
+		at[3] = (NIns)target;
+
+		flushCache(at, at+4);
+
+#ifdef AVMPLUS_PORTING_API
+		NanoJIT_PortAPI_FlushInstructionCache(at, at+4);
+#endif
+
+		return was;
+	}
+
+	void Assembler::underrunProtect(int bytes)
+	{
+		intptr_t u = bytes + sizeof(PageHeader)/sizeof(NIns) + 8;
+		if ( (samepage(_nIns,_nSlot) && (((intptr_t)_nIns-u) <= intptr_t(_nSlot+1))) ||
+			 (!samepage((intptr_t)_nIns-u,_nIns)) )
 		{
-			intptr_t u = (bytes) + 4;
-			if ( (samepage(_nIns,_nSlot) && (((intptr_t)_nIns-u) <= intptr_t(_nSlot+1))) ||
-				 (!samepage((intptr_t)_nIns-u,_nIns)) )
-			{
-				NIns* target = _nIns;
-				_nIns = pageAlloc(_inExit);
-				JMP_nochk(target);
-				_nSlot = pageTop(_nIns);
-			}
-		}		
+			NIns* target = _nIns;
+
+			_nIns = pageAlloc(_inExit);
+
+			// XXX _nIns at this point points to one past the end of
+			// the page, intended to be written into using *(--_nIns).
+			// However, (guess) something seems to be storing the value
+			// of _nIns as is, and then later generating a jump to a bogus
+			// address.  So pre-decrement to ensure that it's always
+			// valid; we end up skipping using the last instruction this
+			// way.
+			_nIns--;
+
+			// Update slot, either to _nIns (if decremented above), or
+			// _nIns-1 once the above bug is fixed/found.
+			_nSlot = pageDataStart(_nIns);
 
-	bool isB24(NIns *target, NIns *cur)
-	{
-		int offset = int(target)-int(cur-2+2);
-		return (-(1<<24) <= offset && offset < (1<<24));
+			// If samepage() is used on _nIns and _nSlot, it'll fail, since _nIns
+			// points to one past the end of the page right now.  Assume that 
+			// JMP_nochk won't ever try to write to _nSlot, and so won't ever
+			// check samepage().  See B_cond_chk macro.
+			JMP_nochk(target);
+		} else if (!_nSlot) {
+			// make sure that there's always a slot pointer
+			_nSlot = pageDataStart(_nIns);
+		}
+	}
+
+	void Assembler::BL_far(NIns* addr) {
+		// we have to stick an immediate into the stream and make lr
+		// point to the right spot before branching
+		underrunProtect(16);
+
+		// the address
+		*(--_nIns) = (NIns)((addr));
+		// bx ip             // branch to the address we loaded earlier
+		*(--_nIns) = (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) );
+		// add lr, pc, #4  // set lr to be past the address that we wrote
+		*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) );
+		// ldr ip, [pc + #4] // load the address into ip, reading it from [pc+4]
+		*(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (PC<<16) | (IP<<12) | (4));
+		asm_output1("bl %p (32-bit)", addr);
+	}
+
+	void Assembler::BL(NIns* addr) {
+		// reserve space first: underrunProtect can move _nIns to a new page,
+		// which would invalidate an offset computed before the call
+		underrunProtect(4);
+		intptr_t offs = PC_OFFSET_FROM(addr,(intptr_t)_nIns-4);
+		if (JMP_S24_OFFSET_OK(offs)) {
+			// we can do this with a single BL call
+			*(--_nIns) = (NIns)( COND_AL | (0xB<<24) | (((offs)>>2) & 0xFFFFFF) );
+			asm_output1("bl %p", addr);
+		} else {
+			BL_far(addr);
+		}
+	}
 
 	void Assembler::CALL(const CallInfo *ci)
 	{
         intptr_t addr = ci->_address;
-		if (isB24((NIns*)addr, _nIns))
-		{
-			// we can do this with a single BL call
-			underrunProtect(4);
-
-			BL(addr);
-			asm_output2("call %08X:%s", addr, ci->_name);
-		}
-		else
-		{
-			underrunProtect(16);
-			*(--_nIns) = (NIns)((addr));
-			*(--_nIns) = (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) );
-			*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) );
-			*(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (PC<<16) | (IP<<12) | (4));
-			asm_output2("call %08X:%s", addr, ci->_name);
-		}
+		BL((NIns*)addr);
+		asm_output1("   (call %s)", ci->_name);
 	}
 
 #endif // NJ_THUMB_JIT
 
 	
 	void Assembler::LD32_nochk(Register r, int32_t imm)
 	{
 	#ifdef NJ_THUMB_JIT
@@ -932,38 +1023,25 @@ namespace nanojit
 		int data_off = int(data) - (int(_nIns+1)&~3);
 		*(--_nIns) = (NIns)(0x4800 | r<<8 | data_off>>2);
 		asm_output3("ldr %s,%d(PC) [%X]",gpn(r),data_off,(int)data);
 
 
 	#else
 
 		// We can always reach the const pool, since it's on the same page (<4096)
-
-		if (!_nSlot)
-			_nSlot = pageTop(_nIns);
-
-		if ( (_nSlot+1) >= (_nIns-1) )
-		{
-			// This would overrun the code, so we need a new page
-			// and a jump to that page
-					
-			NIns* target = _nIns;
-			_nIns = pageAlloc(_inExit);
-			JMP_nochk(target);
-
-			// reset the slot
-			_nSlot = pageTop(_nIns);
-		}
+		underrunProtect(8);
 
 		*(++_nSlot) = (int)imm;
 
-		int offset = (int)(_nSlot) - (int)(_nIns+1);
+		//fprintf (stderr, "wrote slot(2) %p with %08x, jmp @ %p\n", _nSlot, (intptr_t)imm, _nIns-1);
+
+		int offset = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4);
 
-		*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | ((r)<<12) | -(offset));
+		NanoAssert((offset < 0) && (-offset < 4096));
+
+		*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | ((r)<<12) | ((-offset) & 0xFFF) );
 		asm_output2("ld %s,%d",gpn(r),imm);
-
-
 	#endif
 
 	}
     #endif /* FEATURE_NANOJIT */
 }
--- a/js/src/nanojit/nanojit.h
+++ b/js/src/nanojit/nanojit.h
@@ -151,16 +151,17 @@ namespace nanojit
 #define isU8(i)  ( int32_t(i) == uint8_t(i) )
 #define isS16(i) ( int32_t(i) == int16_t(i) )
 #define isU16(i) ( int32_t(i) == uint16_t(i) )
 
 #define alignTo(x,s)		((((uintptr_t)(x)))&~(((uintptr_t)s)-1))
 #define alignUp(x,s)		((((uintptr_t)(x))+(((uintptr_t)s)-1))&~(((uintptr_t)s)-1))
 
 #define pageTop(x)			( (int*)alignTo(x,NJ_PAGE_SIZE) )
+#define pageDataStart(x)    ( (int*)(alignTo(x,NJ_PAGE_SIZE) + sizeof(PageHeader)) )
 #define pageBottom(x)		( (int*)(alignTo(x,NJ_PAGE_SIZE)+NJ_PAGE_SIZE)-1 )
 #define samepage(x,y)		(pageTop(x) == pageTop(y))
 
 #include "Native.h"
 #include "LIR.h"
 #include "RegAlloc.h"
 #include "Fragmento.h"
 #include "Assembler.h"