Merge tamarin-redux (nanojit2) into tracemonkey (457786, r=edwsmith,gal,danderson).
author Graydon Hoare
Mon, 13 Oct 2008 13:29:18 -0700
changeset 20893 53072c29a4fef5475854716f7af60fcb99b28a65
parent 20892 21d1696cbc6464c66a1520b9eed949cb974427da
child 20894 84960b72d2a4d53a9cb65c020f317b8299452ae4
push id 1
push user root
push date Tue, 26 Apr 2011 22:38:44 +0000
treeherder mozilla-beta@bfdb6e623a36
reviewers edwsmith, gal, danderson
bugs 457786
milestone 1.9.1b2pre
js/src/jsbuiltins.h
js/src/jsmath.cpp
js/src/jstracer.cpp
js/src/jstracer.h
js/src/nanojit/Assembler.cpp
js/src/nanojit/Assembler.h
js/src/nanojit/Fragmento.cpp
js/src/nanojit/Fragmento.h
js/src/nanojit/LIR.cpp
js/src/nanojit/LIR.h
js/src/nanojit/NativeAMD64.h
js/src/nanojit/NativeARM.h
js/src/nanojit/NativeThumb.cpp
js/src/nanojit/NativeThumb.h
js/src/nanojit/Nativei386.cpp
js/src/nanojit/Nativei386.h
js/src/nanojit/RegAlloc.cpp
js/src/nanojit/RegAlloc.h
js/src/nanojit/avmplus.h
js/src/nanojit/nanojit.h
js/src/t/crypto-sha1.js
--- a/js/src/jsbuiltins.h
+++ b/js/src/jsbuiltins.h
@@ -140,17 +140,17 @@ struct JSTraceableNative {
 #define _JS_RETSIZE2(ctype, size)  size##_ARGSIZE
 #define _JS_RETSIZE(tyname)        _JS_EXPAND(_JS_RETSIZE2 _JS_TYPEINFO_##tyname)
 #define _JS_ARGSIZE2(ctype, size)  size##_RETSIZE
 #define _JS_ARGSIZE(tyname)        _JS_EXPAND(_JS_ARGSIZE2 _JS_TYPEINFO_##tyname)
 
 #define _JS_DEFINE_CALLINFO(name, crtype, cargtypes, argtypes, cse, fold)      \
     crtype FASTCALL js_##name cargtypes;                                       \
     const nanojit::CallInfo ci_##name =                                        \
-        { (intptr_t) &js_##name, argtypes, cse, fold _JS_CI_NAME(name) };
+        { (intptr_t) &js_##name, argtypes, cse, fold, nanojit::ABI_FASTCALL _JS_CI_NAME(name) };
 
 /*
  * Declare a C function named js_<op> and a CallInfo struct named ci_<op> so
  * the tracer can call it.
  */
 #define JS_DEFINE_CALLINFO_1(rt, op, at0, cse, fold)                                              \
     _JS_DEFINE_CALLINFO(op, _JS_CTYPE(rt), (_JS_CTYPE(at0)),                                      \
                         (_JS_ARGSIZE(at0) << 2) | _JS_RETSIZE(rt), cse, fold)
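
With nanojit2, CallInfo records carry an explicit AbiKind rather than nanojit assuming fastcall, which is what the extra initializer above supplies. A sketch of what the macro now expands to for a hypothetical one-argument builtin (the js_dummy name and the packed size constants are illustrative, not part of this patch):

    jsdouble FASTCALL js_dummy(jsdouble d);        // declared by the macro
    const nanojit::CallInfo ci_dummy = {
        (intptr_t) &js_dummy,                      // entry point
        (F_ARGSIZE << 2) | F_RETSIZE,              // packed argument/return sizes
        0 /* cse */, 0 /* fold */,
        nanojit::ABI_FASTCALL                      // new: explicit calling convention
        _JS_CI_NAME(dummy)                         // debug builds append the name field
    };
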
--- a/js/src/jsmath.cpp
+++ b/js/src/jsmath.cpp
@@ -339,26 +339,22 @@ math_max(JSContext *cx, uintN argc, jsva
     for (i = 0; i < argc; i++) {
         x = js_ValueToNumber(cx, &argv[i]);
         if (JSVAL_IS_NULL(argv[i]))
             return JS_FALSE;
         if (JSDOUBLE_IS_NaN(x)) {
             *vp = DOUBLE_TO_JSVAL(cx->runtime->jsNaN);
             return JS_TRUE;
         }
-        if (x == 0 && x == z && fd_copysign(1.0, z) == -1)
-            z = x;
-        else
-            /* 
-             * Note: it is essential that you write the ternary expression
-             * here such that the false branch produces z not x, as the case
-             * of x=-0, z=0, for which we wind up in this expression but
-             * evaluate either > order as false, whether we do x>z *or* z>x.
-             */
+        if (x == 0 && x == z) {
+            if (fd_copysign(1.0, z) == -1)
+                z = x;
+        } else {
             z = (x > z) ? x : z;
+        }
     }
     return js_NewNumberInRootedValue(cx, z, vp);
 }
 
 static JSBool
 math_min(JSContext *cx, uintN argc, jsval *vp)
 {
     jsdouble x, z = *cx->runtime->jsPositiveInfinity;
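
The restructured branches preserve a subtlety that is easy to lose: == cannot tell -0 from +0, so Math.max(-0, 0) == +0 can only be implemented by inspecting the sign bit, which is what fd_copysign (a wrapper over the standard copysign) provides. A self-contained illustration:

    #include <cassert>
    #include <cmath>

    int main() {
        double pz = 0.0, nz = -0.0;
        assert(pz == nz);                        // IEEE-754: -0 compares equal to +0
        assert(std::copysign(1.0, nz) == -1.0);  // but the sign bit is observable
        // Hence the branch above: when x and z are both zero, replace z
        // with x only if z is the negative zero.
        return 0;
    }
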
@@ -373,19 +369,20 @@ math_min(JSContext *cx, uintN argc, jsva
     for (i = 0; i < argc; i++) {
         x = js_ValueToNumber(cx, &argv[i]);
         if (JSVAL_IS_NULL(argv[i]))
             return JS_FALSE;
         if (JSDOUBLE_IS_NaN(x)) {
             *vp = DOUBLE_TO_JSVAL(cx->runtime->jsNaN);
             return JS_TRUE;
         }
-        if (x == 0 && x == z && fd_copysign(1.0,x) == -1)
-            z = x;
-        else
+        if (x == 0 && x == z) {
+            if (fd_copysign(1.0, x) == -1)
+                z = x;
+        } else
             z = (x < z) ? x : z;
     }
     return js_NewNumberInRootedValue(cx, z, vp);
 }
 
 static JSBool
 math_pow(JSContext *cx, uintN argc, jsval *vp)
 {
@@ -618,19 +615,23 @@ js_Math_log(jsdouble d)
 }
 
 jsdouble FASTCALL
 js_Math_max(jsdouble d, jsdouble p)
 {
     if (JSDOUBLE_IS_NaN(d) || JSDOUBLE_IS_NaN(p))
         return js_NaN;
 
-    if (p == 0 && p == d && fd_copysign(1.0, d) == -1)
-        return p;
-    return (d > p) ? d : p;
+    if (p == 0 && p == d) {
+        if (fd_copysign(1.0, d) == -1)
+            return p;
+        else
+            return d;
+    }
+    return (p > d) ? p : d;
 }
 
 jsdouble FASTCALL
 js_Math_pow(jsdouble d, jsdouble p)
 {
     if (!JSDOUBLE_IS_FINITE(p) && (d == 1.0 || d == -1.0))
         return js_NaN;
     if (p == 0)
--- a/js/src/jstracer.cpp
+++ b/js/src/jstracer.cpp
@@ -995,22 +995,24 @@ TraceRecorder::TraceRecorder(JSContext* 
         lir = verbose_filter = new (&gc) VerboseWriter(&gc, lir, lirbuf->names);
 #endif
 #ifdef NJ_SOFTFLOAT
     lir = float_filter = new (&gc) SoftFloatFilter(lir);
 #endif
     lir = cse_filter = new (&gc) CseFilter(lir, &gc);
     lir = expr_filter = new (&gc) ExprFilter(lir);
     lir = func_filter = new (&gc) FuncFilter(lir, *this);
-    lir->ins0(LIR_trace);
+    lir->ins0(LIR_start);
 
     if (!nanojit::AvmCore::config.tree_opt || fragment->root == fragment) {
-        lirbuf->state = addName(lir->insParam(0), "state");
-        lirbuf->param1 = addName(lir->insParam(1), "param1");
-    }
+        lirbuf->state = addName(lir->insParam(0, 0), "state");
+        lirbuf->param1 = addName(lir->insParam(1, 0), "param1");
+    }
+    loop_header_ins = addName(lir->ins0(LIR_label), "loop_header");
+
     lirbuf->sp = addName(lir->insLoad(LIR_ldp, lirbuf->state, (int)offsetof(InterpState, sp)), "sp");
     lirbuf->rp = addName(lir->insLoad(LIR_ldp, lirbuf->state, offsetof(InterpState, rp)), "rp");
     cx_ins = addName(lir->insLoad(LIR_ldp, lirbuf->state, offsetof(InterpState, cx)), "cx");
     gp_ins = addName(lir->insLoad(LIR_ldp, lirbuf->state, offsetof(InterpState, gp)), "gp");
     eos_ins = addName(lir->insLoad(LIR_ldp, lirbuf->state, offsetof(InterpState, eos)), "eos");
     eor_ins = addName(lir->insLoad(LIR_ldp, lirbuf->state, offsetof(InterpState, eor)), "eor");
 
     /* read into registers all values on the stack and all globals we know so far */
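
Two nanojit2 API changes appear in this hunk: the trace now opens with LIR_start (the new name for LIR_trace), and insParam takes a second "kind" argument, 0 for an ordinary incoming argument and nonzero for a callee-saved register (see the Assembler's LIR_param case later in this changeset). The recorder also plants a LIR_label here so closeLoop can branch back to it. The prologue, reduced to a sketch:

    lir->ins0(LIR_start);                                // was LIR_trace
    lirbuf->state  = lir->insParam(0, 0);                // arg 0, kind 0: ordinary param
    lirbuf->param1 = lir->insParam(1, 0);
    LIns* loop_header = lir->ins0(LIR_label);            // target of the backward LIR_j
    LIns* sp = lir->insLoad(LIR_ldp, lirbuf->state,
                            offsetof(InterpState, sp));  // then load the interpreter state
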
@@ -1950,20 +1952,19 @@ TraceRecorder::closeLoop(Fragmento* frag
         debug_only_v(printf("Trace rejected: unstable loop variables.\n");)
         if (!trashTree)
             fragment->blacklist();
         return;
     }
     SideExit *exit = snapshot(LOOP_EXIT);
     exit->target = fragment->root;
     if (fragment == fragment->root) {
-        fragment->lastIns = lir->insGuard(LIR_loop, lir->insImm(1), exit);
-    } else {
-        fragment->lastIns = lir->insGuard(LIR_x, lir->insImm(1), exit);
-    }
+        fragment->lastIns = lir->insBranch(LIR_j, NULL, loop_header_ins);
+    }
+    fragment->lastIns = lir->insGuard(LIR_x, lir->insImm(1), exit);
     compile(fragmento);
 
     debug_only_v(printf("recording completed at %s:%u@%u via closeLoop\n", cx->fp->script->filename,
                         js_PCToLineNumber(cx, cx->fp->script, cx->fp->regs->pc),
                         cx->fp->regs->pc - cx->fp->script->code););
 }
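
The shape of a closed root loop changes accordingly: instead of a LIR_loop guard that the assembler turned into a jump to the start of the trace, the recorder emits an explicit backward branch to the loop-header label, then an always-exit guard on the unreachable fall-through so that lastIns is still a guard. Schematically (mirroring the code above):

    // start
    // label loop_header          <- emitted in TraceRecorder's constructor
    //   ... loop body, side-exit guards ...
    // j -> loop_header           <- replaces LIR_loop for root fragments
    // x (always exits)           <- unreachable; keeps lastIns a guard
    if (fragment == fragment->root)
        lir->insBranch(LIR_j, NULL, loop_header_ins);
    fragment->lastIns = lir->insGuard(LIR_x, lir->insImm(1), exit);
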
 
 /* Emit an always-exit guard and compile the tree (used for break statements). */
@@ -2102,19 +2103,19 @@ TraceRecorder::fuseIf(jsbytecode* pc, bo
         flipIf(pc, cond);
         guard(cond, x, BRANCH_EXIT);
     }
 }
 
 int
 nanojit::StackFilter::getTop(LInsp guard)
 {
-    if (sp == frag->lirbuf->sp)
+    if (sp == lirbuf->sp)
         return guard->exit()->sp_adj;
-    JS_ASSERT(sp == frag->lirbuf->rp);
+    JS_ASSERT(sp == lirbuf->rp);
     return guard->exit()->rp_adj;
 }
 
 #if defined NJ_VERBOSE
 void
 nanojit::LirNameMap::formatGuard(LIns *i, char *out)
 {
     uint32_t ip;
@@ -2382,17 +2383,17 @@ js_RecordTree(JSContext* cx, JSTraceMoni
     }
     
     AUDIT(recorderStarted);
 
     /* Try to find an unused peer fragment, or allocate a new one. */
     while (f->code() && f->peer)
         f = f->peer;
     if (f->code())
-        f = JS_TRACE_MONITOR(cx).fragmento->newLoop(f->ip);
+        f = JS_TRACE_MONITOR(cx).fragmento->getAnchor(f->ip);
 
     f->calldepth = 0;
     f->root = f;
     /* allocate space to store the LIR for this tree */
     if (!f->lirbuf) {
         f->lirbuf = new (&gc) LirBuffer(tm->fragmento, NULL);
 #ifdef DEBUG
         f->lirbuf->names = new (&gc) LirNameMap(&gc, NULL, tm->fragmento->labels);
@@ -2599,17 +2600,17 @@ js_ExecuteTree(JSContext* cx, Fragment**
     if (ngslots &&
         (OBJ_SHAPE(globalObj) != tm->globalShape || 
          !BuildNativeGlobalFrame(cx, ngslots, gslots, tm->globalTypeMap->data(), global))) {
         AUDIT(globalShapeMismatchAtEntry);
         debug_only_v(printf("Global shape mismatch (%u vs. %u), flushing cache.\n",
                             OBJ_SHAPE(globalObj), tm->globalShape);)
         const void* ip = f->ip;
         js_FlushJITCache(cx);
-        *treep = tm->fragmento->newLoop(ip);
+        *treep = tm->fragmento->getAnchor(ip);
         return NULL;
     }
 
     if (!BuildNativeStackFrame(cx, 0/*callDepth*/, ti->stackTypeMap.data(), stack)) {
         AUDIT(typeMapMismatchAtEntry);
         debug_only_v(printf("type-map mismatch.\n");)
         if (++ti->mismatchCount > MAX_MISMATCH) {
             debug_only_v(printf("excessive mismatches, flushing tree.\n"));
@@ -2619,17 +2620,17 @@ js_ExecuteTree(JSContext* cx, Fragment**
         return NULL;
     }
 
     /* replenish the reserve pool (this might trigger a GC) */
     if (tm->recoveryDoublePoolPtr < tm->recoveryDoublePool + MAX_NATIVE_STACK_SLOTS) {
         bool didGC;
         const void* ip = f->ip;
         if (!ReplenishReservePool(cx, tm, didGC) || didGC) {
-            *treep = tm->fragmento->newLoop(ip);
+            *treep = tm->fragmento->getAnchor(ip);
             return NULL;
         }
     }
     
     ti->mismatchCount = 0;
 
     double* entry_sp = &stack[ti->nativeStackBase/sizeof(double)];
     FrameInfo* callstack = (FrameInfo*) alloca(MAX_CALL_STACK_ENTRIES * sizeof(FrameInfo));
@@ -2658,16 +2659,17 @@ js_ExecuteTree(JSContext* cx, Fragment**
      * recording. Rather than over-generalize by using a counter instead of a
      * flag, we simply sample and update tm->onTrace if necessary.
      */
     bool onTrace = tm->onTrace;
     if (!onTrace)
         tm->onTrace = true;
     GuardRecord* lr;
     
+    debug_only(fflush(NULL);)
 #if defined(JS_NO_FASTCALL) && defined(NANOJIT_IA32)
     SIMULATE_FASTCALL(lr, &state, NULL, u.func);
 #else
     lr = u.func(&state, NULL);
 #endif
 
     JS_ASSERT(lr->exit->exitType != LOOP_EXIT || !lr->calldepth);
 
@@ -2849,17 +2851,17 @@ js_MonitorLoopEdge(JSContext* cx, uintN&
     jsbytecode* pc = cx->fp->regs->pc;
     Fragment* f;
     JSFragmentCacheEntry* cacheEntry = &tm->fcache[jsuword(pc) & JS_FRAGMENT_CACHE_MASK];
     if (cacheEntry->pc == pc) {
         f = cacheEntry->fragment;
     } else {
         f = tm->fragmento->getLoop(pc);
         if (!f)
-            f = tm->fragmento->newLoop(pc);
+            f = tm->fragmento->getAnchor(pc);
         cacheEntry->pc = pc;
         cacheEntry->fragment = f;
     }
 
     /* If there is a chance that js_ExecuteTree will actually succeed, invoke it (either the
        first fragment must contain some code, or at least it must have a peer fragment). */
     GuardRecord* lr = NULL;
     GuardRecord* innermostNestedGuard = NULL;
--- a/js/src/jstracer.h
+++ b/js/src/jstracer.h
@@ -215,16 +215,17 @@ class TraceRecorder : public GCObject {
     nanojit::LirBufWriter*  lir_buf_writer;
     nanojit::LirWriter*     verbose_filter;
     nanojit::LirWriter*     cse_filter;
     nanojit::LirWriter*     expr_filter;
     nanojit::LirWriter*     func_filter;
 #ifdef NJ_SOFTFLOAT
     nanojit::LirWriter*     float_filter;
 #endif
+    nanojit::LIns*          loop_header_ins;
     nanojit::LIns*          cx_ins;
     nanojit::LIns*          gp_ins;
     nanojit::LIns*          eos_ins;
     nanojit::LIns*          eor_ins;
     nanojit::LIns*          rval_ins;
     nanojit::LIns*          inner_sp_ins;
     nanojit::SideExit       exit;
     bool                    deepAborted;
--- a/js/src/nanojit/Assembler.cpp
+++ b/js/src/nanojit/Assembler.cpp
@@ -33,93 +33,131 @@
  * and other provisions required by the GPL or the LGPL. If you do not delete
  * the provisions above, a recipient may use your version of this file under
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
 
 #include "nanojit.h"
 
+#ifdef FEATURE_NANOJIT
+
 #ifdef AVMPLUS_PORTING_API
 #include "portapi_nanojit.h"
 #endif
 
-#if defined(AVMPLUS_LINUX) && defined(AVMPLUS_ARM)
+#if defined(AVMPLUS_UNIX) && defined(AVMPLUS_ARM)
 #include <asm/unistd.h>
 extern "C" void __clear_cache(char *BEG, char *END);
 #endif
 
 namespace nanojit
 {
-	#ifdef FEATURE_NANOJIT
 
 
 	class DeadCodeFilter: public LirFilter
 	{
-		Assembler *assm;
-	public:
-		DeadCodeFilter(LirFilter *in, Assembler *a) : LirFilter(in), assm(a) {}
+		const CallInfo *functions;
+
+	    bool ignoreInstruction(LInsp ins)
+	    {
+            LOpcode op = ins->opcode();
+            if (ins->isStore() ||
+                op == LIR_loop ||
+                op == LIR_label ||
+                op == LIR_live ||
+                isRet(op)) {
+                return false;
+            }
+	        return ins->resv() == 0;
+	    }
+
+	public:		
+		DeadCodeFilter(LirFilter *in, const CallInfo *f) : LirFilter(in), functions(f) {}
 		LInsp read() {
 			for (;;) {
 				LInsp i = in->read();
-				if (!i || i->isGuard() 
-					|| i->isCall() && !i->callInfo()->_cse
-					|| !assm->ignoreInstruction(i))
+				if (!i || i->isGuard() || i->isBranch()
+					|| i->isCall() && !i->isCse(functions)
+					|| !ignoreInstruction(i))
 					return i;
 			}
 		}
 	};
 
 #ifdef NJ_VERBOSE
 	class VerboseBlockReader: public LirFilter
 	{
 		Assembler *assm;
 		LirNameMap *names;
 		avmplus::List<LInsp, avmplus::LIST_NonGCObjects> block;
+        bool flushnext;
 	public:
 		VerboseBlockReader(LirFilter *in, Assembler *a, LirNameMap *n) 
-			: LirFilter(in), assm(a), names(n), block(a->_gc) {}
+			: LirFilter(in), assm(a), names(n), block(a->_gc), flushnext(false)
+        {}
 
 		void flush() {
-			assm->outputf("        %p:", assm->_nIns);
-			assm->output("");
-			for (int j=0,n=block.size(); j < n; j++)
-				assm->outputf("    %s", names->formatIns(block[j]));
-			assm->output("");
-			block.clear();
+            flushnext = false;
+            if (!block.isEmpty()) {
+			    for (int j=0,n=block.size(); j < n; j++) {
+					LIns *i = block[j];
+				    assm->outputf("    %s", names->formatIns(block[j]));
+					if (i->isop(LIR_label)) {
+						assm->outputf("        %p:", assm->_nIns);
+						assm->output("");
+					}
+				}
+			    block.clear();
+            }
 		}
 
+        void flush_add(LInsp i) {
+            flush();
+            block.add(i);
+        }
+
 		LInsp read() {
 			LInsp i = in->read();
 			if (!i) {
 				flush();
 				return i;
 			}
-			if (i->isGuard()) {
-				flush();
-				block.add(i);
+            if (i->isGuard()) {
+				flush_add(i);
 				if (i->oprnd1())
 					block.add(i->oprnd1());
-			}
+            }
+            else if (isRet(i->opcode()) || i->isBranch()) {
+                flush_add(i);
+            }
 			else {
-				block.add(i);
+                if (flushnext)
+                    flush();
+				flush_add(i);
+                if (i->isop(LIR_label))
+                    flushnext = true;
 			}
 			return i;
 		}
 	};
 #endif
 	
 	/**
 	 * Need the following:
 	 *
 	 *	- merging paths ( build a graph? ), possibly use external rep to drive codegen
 	 */
     Assembler::Assembler(Fragmento* frago)
-        : _frago(frago)
+        : hasLoop(0)
+		, _frago(frago)
         , _gc(frago->core()->gc)
+        , _labels(_gc)
+        , _patches(_gc)
+        , pending_lives(_gc)
 	{
         AvmCore *core = frago->core();
 		nInit(core);
 		verbose_only( _verbose = !core->quiet_opt() && core->verbose() );
 		verbose_only( _outputCache = 0);
 		
 		internalReset();
 		pageReset();
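
DeadCodeFilter and VerboseBlockReader both follow nanojit's pull-based LirFilter protocol: the assembler walks the LIR buffer backwards, each filter's read() pulls from the reader beneath it, and 0 marks the start of the buffer. A minimal pass-through filter, as a sketch of the protocol only:

    // Counts the instructions flowing past it; purely illustrative.
    class CountFilter : public LirFilter {
    public:
        int seen;
        CountFilter(LirFilter* in) : LirFilter(in), seen(0) {}
        LInsp read() {
            LInsp i = in->read();   // pull from the next reader in the chain
            if (i)                  // 0 signals the start of the buffer
                seen++;
            return i;
        }
    };
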
@@ -128,18 +166,16 @@ namespace nanojit
     void Assembler::arReset()
 	{
 		_activation.highwatermark = 0;
 		_activation.lowwatermark = 0;
 		_activation.tos = 0;
 
 		for(uint32_t i=0; i<NJ_MAX_STACK_ENTRY; i++)
 			_activation.entry[i] = 0;
-		for(uint32_t i=0; i<NJ_MAX_PARAMETERS; i++)
-			_activation.parameter[i] = 0;
 	}
 
  	void Assembler::registerResetAll()
 	{
 		nRegisterResetAll(_allocator);
 
 		// keep a tally of the registers to check that our allocator works correctly
 		debug_only(_allocator.count = _allocator.countFree(); )
@@ -173,17 +209,17 @@ namespace nanojit
 		    Register r = nRegisterAllocFromSet(set);
 		    regs.used |= rmask(r);
 		    return r;
         }
 		counter_increment(steals);
 
 		// nothing free, steal one 
 		// LSRA says pick the one with the furthest use
-		LIns* vic = findVictim(regs,allow,prefer);
+		LIns* vic = findVictim(regs,allow);
 		NanoAssert(vic != NULL);
 
 	    Reservation* resv = getresv(vic);
 
 		// restore vic
 	    Register r = resv->reg;
         regs.removeActive(r);
         resv->reg = UnknownReg;
@@ -191,68 +227,74 @@ namespace nanojit
 		asm_restore(vic, resv, r);
 		return r;
 	}
 
 	void Assembler::reserveReset()
 	{
 		_resvTable[0].arIndex = 0;
 		int i;
-		for(i=1; i<NJ_MAX_STACK_ENTRY; i++)
+        for(i=1; i<NJ_MAX_STACK_ENTRY; i++) {
 			_resvTable[i].arIndex = i-1;
+            _resvTable[i].used = 0;
+        }
 		_resvFree= i-1;
 	}
 
+    /**
+     * these instructions don't have to be saved & reloaded to spill,
+     * they can just be recalculated w/out any inputs.
+     */
+    bool Assembler::canRemat(LIns *i) {
+        return i->isconst() || i->isconstq() || i->isop(LIR_alloc);
+    }
+
 	Reservation* Assembler::reserveAlloc(LInsp i)
 	{
 		uint32_t item = _resvFree;
         Reservation *r = &_resvTable[item];
 		_resvFree = r->arIndex;
 		r->reg = UnknownReg;
 		r->arIndex = 0;
+        r->used = 1;
 		if (!item) 
 			setError(ResvFull); 
-
-        if (i->isconst() || i->isconstq())
-            r->cost = 0;
-        else if (i == _thisfrag->lirbuf->sp || i == _thisfrag->lirbuf->rp)
-            r->cost = 2;
-        else
-            r->cost = 1;
-
         i->setresv(item);
 		return r;
 	}
 
 	void Assembler::reserveFree(LInsp i)
 	{
         Reservation *rs = getresv(i);
         NanoAssert(rs == &_resvTable[i->resv()]);
 		rs->arIndex = _resvFree;
+        rs->used = 0;
 		_resvFree = i->resv();
         i->setresv(0);
 	}
 
 	void Assembler::internalReset()
 	{
 		// readies for a brand spanking new code generation pass.
 		registerResetAll();
 		reserveReset();
 		arReset();
+        pending_lives.clear();
 	}
 
 	NIns* Assembler::pageAlloc(bool exitPage)
 	{
 		Page*& list = (exitPage) ? _nativeExitPages : _nativePages;
 		Page* page = _frago->pageAlloc();
 		if (page)
 		{
 			page->next = list;
 			list = page;
 			nMarkExecute(page);
+			_stats.pages++;
 		}
 		else
 		{
 			// return prior page (to allow overwrites) and mark out of mem 
 			page = list;
 			setError(OutOMem);
 		}
 		return &page->code[sizeof(page->code)/sizeof(NIns)]; // just past the end
@@ -260,40 +302,80 @@ namespace nanojit
 	
 	void Assembler::pageReset()
 	{
 		pagesFree(_nativePages);		
 		pagesFree(_nativeExitPages);
 		
 		_nIns = 0;
 		_nExitIns = 0;
+		_stats.pages = 0;
 
 		nativePageReset();
 	}
 	
 	void Assembler::pagesFree(Page*& page)
 	{
 		while(page)
 		{
 			Page *next = page->next;  // pull next ptr prior to free
 			_frago->pageFree(page);
 			page = next;
 		}
 	}
 
+	#define bytesFromTop(x)		( (size_t)(x) - (size_t)pageTop(x) )
+	#define bytesToBottom(x)	( (size_t)pageBottom(x) - (size_t)(x) )
+	#define bytesBetween(x,y)	( (size_t)(x) - (size_t)(y) )
+	
+	int32_t Assembler::codeBytes()
+	{
+		// start and end on same page?
+		size_t exit = 0;
+		int32_t pages = _stats.pages;
+		if (_nExitIns-1 == _stats.codeExitStart)
+			;
+		else if (samepage(_nExitIns,_stats.codeExitStart))
+			exit = bytesBetween(_stats.codeExitStart, _nExitIns);
+		else
+		{
+			pages--;
+			exit = ((intptr_t)_stats.codeExitStart & (NJ_PAGE_SIZE-1)) ? bytesFromTop(_stats.codeExitStart)+1 : 0;
+			exit += bytesToBottom(_nExitIns)+1;
+		}
+
+		size_t main = 0;
+		if (_nIns-1 == _stats.codeStart)
+			;
+		else if (samepage(_nIns,_stats.codeStart))
+			main = bytesBetween(_stats.codeStart, _nIns);
+		else
+		{
+			pages--;
+			main = ((intptr_t)_stats.codeStart & (NJ_PAGE_SIZE-1)) ? bytesFromTop(_stats.codeStart)+1 : 0;
+			main += bytesToBottom(_nIns)+1;
+		}
+		//fprintf(stderr,"size %d, exit is %d, main is %d, page count %d, sizeof %d\n", (int)((pages) * NJ_PAGE_SIZE + main + exit),(int)exit, (int)main, (int)_stats.pages, (int)sizeof(Page));
+		return (pages) * NJ_PAGE_SIZE + main + exit;		
+	}
+
+	#undef bytesFromTop
+	#undef bytesToBottom
+	#undef bytesBetween
+	
 	Page* Assembler::handoverPages(bool exitPages)
 	{
 		Page*& list = (exitPages) ? _nativeExitPages : _nativePages;
 		NIns*& ins =  (exitPages) ? _nExitIns : _nIns;
 		Page* start = list;
 		list = 0;
 		ins = 0;
 		return start;
 	}
-	
+
 	#ifdef _DEBUG
 	bool Assembler::onPage(NIns* where, bool exitPages)
 	{
 		Page* page = (exitPages) ? _nativeExitPages : _nativePages;
 		bool on = false;
 		while(page)
 		{
 			if (samepage(where-1,page))
@@ -318,66 +400,69 @@ namespace nanojit
 	{
 		if (error()) return;
 
 #ifdef NANOJIT_IA32
         NanoAssert(_allocator.active[FST0] && _fpuStkDepth == -1 ||
             !_allocator.active[FST0] && _fpuStkDepth == 0);
 #endif
 		
-		// for tracking resv usage
-		LIns* resv[NJ_MAX_STACK_ENTRY];
-		for(int i=0; i<NJ_MAX_STACK_ENTRY; i++)
-			resv[i]=0;
-			
+        AR &ar = _activation;
 		// check AR entries
-		NanoAssert(_activation.highwatermark < NJ_MAX_STACK_ENTRY);
+		NanoAssert(ar.highwatermark < NJ_MAX_STACK_ENTRY);
 		LIns* ins = 0;
 		RegAlloc* regs = &_allocator;
-		for(uint32_t i=_activation.lowwatermark; i<_activation.tos; i++)
+		for(uint32_t i = ar.lowwatermark; i < ar.tos; i++)
 		{
-			ins = _activation.entry[i];
+			ins = ar.entry[i];
 			if ( !ins )
 				continue;
 			Reservation *r = getresv(ins);
+            NanoAssert(r != 0);
 			int32_t idx = r - _resvTable;
-			resv[idx]=ins;
 			NanoAssertMsg(idx, "MUST have a resource for the instruction for it to have a stack location assigned to it");
-			NanoAssertMsg( r->arIndex==0 || r->arIndex==i || (ins->isQuad()&&r->arIndex==i-(stack_direction(1))), "Stack record index mismatch");
+            if (r->arIndex) {
+                if (ins->isop(LIR_alloc)) {
+                    int j=i+1;
+                    for (int n = i + (ins->size()>>2); j < n; j++) {
+                        NanoAssert(ar.entry[j]==ins);
+                    }
+		        	NanoAssert(r->arIndex == (uint32_t)j-1);
+                    i = j-1;
+                }
+                else if (ins->isQuad()) {
+                    NanoAssert(ar.entry[i - stack_direction(1)]==ins);
+                    i += 1; // skip high word
+                }
+                else {
+        			NanoAssertMsg(r->arIndex == i, "Stack record index mismatch");
+                }
+            }
 			NanoAssertMsg( r->reg==UnknownReg || regs->isConsistent(r->reg,ins), "Register record mismatch");
 		}
 	
-		registerConsistencyCheck(resv);
+		registerConsistencyCheck();
 				
 		// check resv table
 		int32_t inuseCount = 0;
 		int32_t notInuseCount = 0;
-		for(uint32_t i=1; i<NJ_MAX_STACK_ENTRY; i++)
-		{
-			if (resv[i]==0)
-			{
-				notInuseCount++;
-			}
-			else
-			{
-				inuseCount++;
-			}
-		}
+        for(uint32_t i=1; i < sizeof(_resvTable)/sizeof(_resvTable[0]); i++) {
+            _resvTable[i].used ? inuseCount++ : notInuseCount++;
+        }
 
 		int32_t freeCount = 0;
 		uint32_t free = _resvFree;
-		while(free)
-		{
+        while(free) {
 			free = _resvTable[free].arIndex;
 			freeCount++;
 		}
 		NanoAssert( ( freeCount==notInuseCount && inuseCount+notInuseCount==(NJ_MAX_STACK_ENTRY-1) ) );
 	}
 
-	void Assembler::registerConsistencyCheck(LIns** resv)
+	void Assembler::registerConsistencyCheck()
 	{	
 		// check registers
 		RegAlloc *regs = &_allocator;
 		uint32_t managed = regs->managed;
 		Register r = FirstReg;
 		while(managed)
 		{
 			if (managed&1)
@@ -386,22 +471,20 @@ namespace nanojit
 				{
 					NanoAssert(regs->getActive(r)==0);
 				}
 				else
 				{
 					LIns* ins = regs->getActive(r);
 					// @todo we should be able to check across RegAlloc's somehow (to include savedGP...)
 					Reservation *v = getresv(ins);
-					NanoAssert(v);
+					NanoAssert(v != 0);
 					int32_t idx = v - _resvTable;
 					NanoAssert(idx >= 0 && idx < NJ_MAX_STACK_ENTRY);
-					resv[idx]=ins;
 					NanoAssertMsg(idx, "MUST have a resource for the instruction for it to have a register assigned to it");
-					NanoAssertMsg( v->arIndex==0 || ins==_activation.entry[v->arIndex], "Stack record index mismatch");
 					NanoAssertMsg( regs->getActive(v->reg)==ins, "Register record mismatch");
 				}			
 			}
 			
 			// next register in bitfield
 			r = nextreg(r);
 			managed >>= 1;
 		}
@@ -413,96 +496,113 @@ namespace nanojit
 		if (ia == ib) 
 		{
 			findRegFor(ia, allow);
 			resva = resvb = getresv(ia);
 		}
 		else
 		{
 			Register rb = UnknownReg;
-			resvb = getresv(ib);
-			if (resvb && (rb = resvb->reg) != UnknownReg)
-				allow &= ~rmask(rb);
-			Register ra = findRegFor(ia, allow);
-			resva = getresv(ia);
-			NanoAssert(error() || (resva != 0 && ra != UnknownReg));
-			if (rb == UnknownReg)
-			{
-				allow &= ~rmask(ra);
-				findRegFor(ib, allow);
-				resvb = getresv(ib);
-			}
+            resvb = getresv(ib);
+            if (resvb && (rb = resvb->reg) != UnknownReg) {
+                if (allow & rmask(rb)) {
+                    // ib already assigned to an allowable reg, keep that one
+                    allow &= ~rmask(rb);
+                } else {
+                    // ib assigned to unusable reg, pick a different one below.
+                    rb = UnknownReg;
+                }
+            }
+            Register ra = findRegFor(ia, allow);
+            resva = getresv(ia);
+            NanoAssert(error() || (resva != 0 && ra != UnknownReg));
+            if (rb == UnknownReg)
+            {
+                allow &= ~rmask(ra);
+                findRegFor(ib, allow);
+                resvb = getresv(ib);
+            }
 		}
 	}
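
The rewritten findRegFor2 keeps ib's current register only if it lies in the allowed mask; otherwise it falls through and allocates a fresh one after ia has been placed, so the two operands can never collide. Its typical call-site shape (asm_cmp below uses exactly this pattern):

    Reservation *rA, *rB;
    findRegFor2(GpRegs, lhs, rA, rhs, rB);   // fills both reservations
    Register ra = rA->reg;                   // distinct registers unless lhs == rhs
    Register rb = rB->reg;
    CMP(ra, rb);
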
 
 	Register Assembler::findSpecificRegFor(LIns* i, Register w)
 	{
 		return findRegFor(i, rmask(w));
 	}
 			
 	Register Assembler::findRegFor(LIns* i, RegisterMask allow)
 	{
-		Reservation* resv = getresv(i);
+        if (i->isop(LIR_alloc)) {
+            // never allocate a reg for this w/out stack space too
+            findMemFor(i);
+        }
+
+        Reservation* resv = getresv(i);
 		Register r;
 
 		// if we have an existing reservation and it has a non-unknown
 		// register allocated, and that register is in our allowed mask,
 		// return it.
         if (resv && (r=resv->reg) != UnknownReg && (rmask(r) & allow)) {
+            _allocator.useActive(r);
 			return r;
         }
 
 		// figure out what registers are preferred for this instruction
 		RegisterMask prefer = hint(i, allow);
 
 		// if we didn't have a reservation, allocate one now
-		if (!resv) 	
+        if (!resv)
 			resv = reserveAlloc(i);
 
-		// if the reservation doesn't have a register assigned to it...
-        if ((r=resv->reg) == UnknownReg)
+        r = resv->reg;
+        if (r != UnknownReg && 
+            ((rmask(r)&XmmRegs) && !(allow&XmmRegs) ||
+                 (rmask(r)&x87Regs) && !(allow&x87Regs)))
+        {
+            // x87 <-> xmm copy required
+            //_nvprof("fpu-evict",1);
+            evict(r);
+            r = UnknownReg;
+        }
+
+        if (r == UnknownReg)
 		{
-			// .. if the cost is 2 and the allowed mask includes
-			// the saved regs, then prefer just those.
-            if (resv->cost == 2 && (allow&SavedRegs))
-                prefer = allow&SavedRegs;
-			// grab one.
 			r = resv->reg = registerAlloc(prefer);
 			_allocator.addActive(r, i);
 			return r;
 		}
-		else 
+		else
 		{
 			// the already-allocated register isn't in the allowed mask;
 			// we need to grab a new one and then copy over the old
 			// contents to the new.
 			resv->reg = UnknownReg;
 			_allocator.retire(r);
-            if (resv->cost == 2 && (allow&SavedRegs))
-                prefer = allow&SavedRegs;
 			Register s = resv->reg = registerAlloc(prefer);
 			_allocator.addActive(s, i);
             if ((rmask(r) & GpRegs) && (rmask(s) & GpRegs)) {
     			MR(r, s);
             } 
-			else
-			{
+            else {
 				asm_nongp_copy(r, s);
 			}
 			return s;
 		}
 	}
 
 	int Assembler::findMemFor(LIns *i)
 	{
 		Reservation* resv = getresv(i);
 		if (!resv)
 			resv = reserveAlloc(i);
-		if (!resv->arIndex)
+        if (!resv->arIndex) {
 			resv->arIndex = arReserve(i);
+            NanoAssert(resv->arIndex <= _activation.highwatermark);
+        }
 		return disp(resv);
 	}
 
 	Register Assembler::prepResultReg(LIns *i, RegisterMask allow)
 	{
 		Reservation* resv = getresv(i);
 		const bool pop = !resv || resv->reg == UnknownReg;
 		Register rr = findRegFor(i, allow);
@@ -513,20 +613,21 @@ namespace nanojit
 	void Assembler::freeRsrcOf(LIns *i, bool pop)
 	{
 		Reservation* resv = getresv(i);
 		int index = resv->arIndex;
 		Register rr = resv->reg;
 
 		if (rr != UnknownReg)
 		{
-			asm_spill(i, resv, pop);
+			asm_spilli(i, resv, pop);
 			_allocator.retire(rr);	// free any register associated with entry
 		}
-		arFree(index);			// free any stack stack space associated with entry
+		if (index)
+            arFree(index);			// free any stack space associated with entry
 		reserveFree(i);		// clear fields of entry and add it to free list
 	}
 
 	void Assembler::evict(Register r)
 	{
 		registerAlloc(rmask(r));
 		_allocator.addFree(r);
 	}
@@ -538,64 +639,68 @@ namespace nanojit
         // LIR_ov and LIR_cs recycle the flags set by arithmetic ops
         if ((condop == LIR_ov) || (condop == LIR_cs))
             return;
         
         LInsp lhs = cond->oprnd1();
 		LInsp rhs = cond->oprnd2();
 		Reservation *rA, *rB;
 
+		NanoAssert((!lhs->isQuad() && !rhs->isQuad()) || (lhs->isQuad() && rhs->isQuad()));
+
 		// Not supported yet.
 #if !defined NANOJIT_64BIT
 		NanoAssert(!lhs->isQuad() && !rhs->isQuad());
 #endif
 
 		// ready to issue the compare
 		if (rhs->isconst())
 		{
 			int c = rhs->constval();
-			Register r = findRegFor(lhs, GpRegs);
 			if (c == 0 && cond->isop(LIR_eq)) {
-				if (rhs->isQuad() || lhs->isQuad()) {
+				Register r = findRegFor(lhs, GpRegs);
+				if (rhs->isQuad()) {
 #if defined NANOJIT_64BIT
 					TESTQ(r, r);
 #endif
 				} else {
 					TEST(r,r);
 				}
-#if defined NANOJIT_64BIT
-			} else if (rhs->isQuad() || lhs->isQuad()) {
-                findRegFor2(GpRegs, lhs, rA, rhs, rB);
-                Register ra = rA->reg;
-                Register rb = rB->reg;
-                CMPQ(ra,rb);
-#endif
-            } else {
+			// No 64-bit immediates, so fall back to the case below
+			}
+			else if (!rhs->isQuad()) {
+				Register r;
+				if (lhs->isop(LIR_alloc)) {
+					r = FP;
+					c += findMemFor(lhs);
+				} else {
+					r = findRegFor(lhs, GpRegs);
+				}
 				CMPi(r, c);
-            }
+			}
 		}
 		else
 		{
 			findRegFor2(GpRegs, lhs, rA, rhs, rB);
 			Register ra = rA->reg;
 			Register rb = rB->reg;
-			if (rhs->isQuad() || lhs->isQuad()) {
+			if (rhs->isQuad()) {
 #if defined NANOJIT_64BIT
 				CMPQ(ra, rb);
 #endif
 			} else {
 				CMP(ra, rb);
 			}
 		}
 	}
 
     void Assembler::patch(GuardRecord *lr)
     {
         Fragment *frag = lr->target;
-		NanoAssert(frag->fragEntry);
+		NanoAssert(frag->fragEntry != 0);
 		NIns* was = asm_adjustBranch((NIns*)lr->jmp, frag->fragEntry);
 		if (!lr->origTarget) lr->origTarget = was;
 		verbose_only(verbose_outputf("patching jump at %p to target %p (was %p)\n",
 			lr->jmp, frag->fragEntry, was);)
     }
 
     void Assembler::unpatch(GuardRecord *lr)
     {
@@ -611,58 +716,61 @@ namespace nanojit
 		NIns* at = 0;
 		if (!_branchStateMap->get(exit))
 		{
 			at = asm_leave_trace(guard);
 		}
 		else
 		{
 			RegAlloc* captured = _branchStateMap->get(exit);
-			mergeRegisterState(*captured);
+			intersectRegisterState(*captured);
 			verbose_only(
 				verbose_outputf("        merging trunk with %s",
 					_frago->labels->format(exit->target));
 				verbose_outputf("        %p:",_nIns);
 			)			
 			at = exit->target->fragEntry;
-			NanoAssert(at);
+			NanoAssert(at != 0);
 			_branchStateMap->remove(exit);
 		}
 		return at;
 	}
 	
 	NIns* Assembler::asm_leave_trace(LInsp guard)
 	{
         verbose_only(bool priorVerbose = _verbose; )
 		verbose_only( _verbose = verbose_enabled() && _frago->core()->config.verbose_exits; )
         verbose_only( int32_t nativeSave = _stats.native );
 		verbose_only(verbose_outputf("--------------------------------------- end exit block SID %d", guard->exit()->sid);)
 
 		RegAlloc capture = _allocator;
 
         // this point is unreachable.  so free all the registers.
 		// if an instruction has a stack entry we will leave it alone,
-		// otherwise we free it entirely.  mergeRegisterState will restore.
+		// otherwise we free it entirely.  intersectRegisterState will restore.
 		releaseRegisters();
 		
 		swapptrs();
 		_inExit = true;
 		
 		//verbose_only( verbose_outputf("         LIR_xend swapptrs, _nIns is now %08X(%08X), _nExitIns is now %08X(%08X)",_nIns, *_nIns,_nExitIns,*_nExitIns) );
 		debug_only( _sv_fpuStkDepth = _fpuStkDepth; _fpuStkDepth = 0; )
 
 		nFragExit(guard);
 
+		// restore the callee-saved registers (aka saved params)
+		assignSavedParams();
+
 		// if/when we patch this exit to jump over to another fragment,
 		// that fragment will need its parameters set up just like ours.
         LInsp stateins = _thisfrag->lirbuf->state;
-		Register state = findSpecificRegFor(stateins, Register(stateins->imm8()));
+		Register state = findSpecificRegFor(stateins, argRegs[stateins->imm8()]);
 		asm_bailout(guard, state);
 
-		mergeRegisterState(capture);
+		intersectRegisterState(capture);
 
 		// this can be useful for breaking whenever an exit is taken
 		//INT3();
 		//NOP();
 
 		// we are done producing the exit logic for the guard so demark where our exit block code begins
 		NIns* jmpTarget = _nIns;	 // target in exit path for our mainline conditional jump 
 
@@ -680,30 +788,22 @@ namespace nanojit
 #endif
 
         verbose_only( _verbose = priorVerbose; )
         verbose_only(_stats.exitnative += (_stats.native-nativeSave));
 
         return jmpTarget;
     }
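
The rename from mergeRegisterState to intersectRegisterState is not cosmetic: nanojit2 distinguishes two merges. At a join reachable from several paths, only the register assignments that all paths agree on can be kept (intersect, used at labels and exits), while a forward jump into already-generated code adopts that code's state wholesale (union, used by LIR_j/LIR_jt/LIR_jf below). A toy model of the distinction, with std::map standing in for RegAlloc:

    #include <map>
    struct LIns;                               // stand-in, not nanojit's type
    typedef std::map<int, LIns*> RegState;     // register -> value it holds

    // Join of two predecessors: keep only what both sides agree on.
    RegState intersect(const RegState& a, const RegState& b) {
        RegState out;
        for (RegState::const_iterator it = a.begin(); it != a.end(); ++it) {
            RegState::const_iterator jt = b.find(it->first);
            if (jt != b.end() && jt->second == it->second)
                out.insert(*it);
        }
        return out;
    }
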
 	
-	bool Assembler::ignoreInstruction(LInsp ins)
+	void Assembler::beginAssembly(Fragment *frag, RegAllocMap* branchStateMap)
 	{
-        LOpcode op = ins->opcode();
-        if (ins->isStore() || op == LIR_loop)
-            return false;
-	    return getresv(ins) == 0;
-	}
-
-	void Assembler::beginAssembly(Fragment* frag, RegAllocMap* branchStateMap)
-	{
+        _thisfrag = frag;
 		_activation.lowwatermark = 1;
 		_activation.tos = _activation.lowwatermark;
 		_activation.highwatermark = _activation.tos;
-        _thisfrag = frag;
 		
 		counter_reset(native);
 		counter_reset(exitnative);
 		counter_reset(steals);
 		counter_reset(spills);
 		counter_reset(remats);
 
 		setError(None);
@@ -714,101 +814,129 @@ namespace nanojit
 	#ifdef AVMPLUS_PORTING_API
 		_endJit1Addr = _nIns;
 		_endJit2Addr = _nExitIns;
 	#endif
 
 		// make sure we got memory at least one page
 		if (error()) return;
 			
-        _epilogue = genEpilogue(SavedRegs);
+#ifdef PERFM
+		_stats.pages = 0;
+		_stats.codeStart = _nIns-1;
+		_stats.codeExitStart = _nExitIns-1;		
+		//fprintf(stderr,"pageReset %d start %x exit start %x\n", _stats.pages, (int)_stats.codeStart, (int)_stats.codeExitStart);
+#endif /* PERFM */
+
+        _epilogue = genEpilogue();
 		_branchStateMap = branchStateMap;
-		
+        _labels.clear();
+        _patches.clear();
+
 		verbose_only( verbose_outputf("        %p:",_nIns) );
 		verbose_only( verbose_output("        epilogue:") );
 	}
 	
 	void Assembler::assemble(Fragment* frag,  NInsList& loopJumps)
 	{
 		if (error()) return;	
 		AvmCore *core = _frago->core();
-		GC *gc = core->gc;
         _thisfrag = frag;
 
 		// set up backwards pipeline: assembler -> StackFilter -> LirReader
 		LirReader bufreader(frag->lastIns);
-		StackFilter storefilter1(&bufreader, gc, frag, frag->lirbuf->sp);
-		StackFilter storefilter2(&storefilter1, gc, frag, frag->lirbuf->rp);
-		DeadCodeFilter deadfilter(&storefilter2, this);
+		GC *gc = core->gc;
+		StackFilter storefilter1(&bufreader, gc, frag->lirbuf, frag->lirbuf->sp);
+		StackFilter storefilter2(&storefilter1, gc, frag->lirbuf, frag->lirbuf->rp);
+		DeadCodeFilter deadfilter(&storefilter2, frag->lirbuf->_functions);
 		LirFilter* rdr = &deadfilter;
 		verbose_only(
 			VerboseBlockReader vbr(rdr, this, frag->lirbuf->names);
 			if (verbose_enabled())
 				rdr = &vbr;
 		)
 
 		verbose_only(_thisfrag->compileNbr++; )
 		verbose_only(_frago->_stats.compiles++; )
 		verbose_only(_frago->_stats.totalCompiles++; )
 		_latestGuard = 0;
-		_inExit = false;		
-		gen(rdr, loopJumps);
+		_inExit = false;	
+        gen(rdr, loopJumps);
 		frag->fragEntry = _nIns;
 		frag->outbound = core->config.tree_opt? _latestGuard : 0;
 		//fprintf(stderr, "assemble frag %X entry %X\n", (int)frag, (int)frag->fragEntry);
+
+        if (!error()) {
+		    // patch all branches
+		    while(!_patches.isEmpty())
+		    {
+			    NIns* where = _patches.lastKey();
+			    LInsp targ = _patches.removeLast();
+                LabelState *label = _labels.get(targ);
+			    NIns* ntarg = label->addr;
+                if (ntarg) {
+				    nPatchBranch(where,ntarg);
+			    }
+                else {
+				    _err = UnknownBranch;
+				    break;
+			    }
+		    }
+        }
 	}
 
 	void Assembler::endAssembly(Fragment* frag, NInsList& loopJumps)
 	{
 		while(!loopJumps.isEmpty())
 		{
 			NIns* loopJump = (NIns*)loopJumps.removeLast();
 			nPatchBranch(loopJump, _nIns);
 		}
 
 		NIns* patchEntry = 0;
 		if (!error())
 		{
-			patchEntry = genPrologue(SavedRegs);
+			patchEntry = genPrologue();
 			verbose_only( verbose_outputf("        %p:",_nIns); )
 			verbose_only( verbose_output("        prologue"); )
 		}
 		
 		// something bad happened?
 		if (!error())
 		{
 			// check for resource leaks 
 			debug_only( 
 				for(uint32_t i=_activation.lowwatermark;i<_activation.highwatermark; i++) {
 					NanoAssertMsgf(_activation.entry[i] == 0, "frame entry %d wasn't freed\n",-4*i);
 				}
 			)
 
             frag->fragEntry = patchEntry;
 			NIns* code = _nIns;
-			
+#ifdef PERFM
+			_nvprof("code", codeBytes());  // requires that all pages are released between begin/endAssembly()otherwise we double count
+#endif
 			// let the fragment manage the pages if we're using trees and there are branches
-			Page* manage = (_frago->core()->config.tree_opt) ? handoverPages() : 0;
+			Page* manage = (_frago->core()->config.tree_opt) ? handoverPages() : 0;			
 			frag->setCode(code, manage); // root of tree should manage all pages
-			NanoAssert(!_frago->core()->config.tree_opt || frag == frag->anchor || frag->kind == MergeTrace);			
 			//fprintf(stderr, "endAssembly frag %X entry %X\n", (int)frag, (int)frag->fragEntry);
 		}
 		
-		AvmAssertMsg(error() || _fpuStkDepth == 0, ("_fpuStkDepth %d\n",_fpuStkDepth));
+		NanoAssertMsgf(error() || _fpuStkDepth == 0,"_fpuStkDepth %d\n",_fpuStkDepth);
 
 		internalReset();  // clear the reservation tables and regalloc
 		NanoAssert(_branchStateMap->isEmpty());
 		_branchStateMap = 0;
 
 #ifdef AVMPLUS_ARM
 		// If we've modified the code, we need to flush so we don't end up trying 
 		// to execute junk
 # if defined(UNDER_CE)
 		FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
-# elif defined(AVMPLUS_LINUX)
+# elif defined(AVMPLUS_UNIX)
 		for (int i = 0; i < 2; i++) {
 			Page *p = (i == 0) ? _nativePages : _nativeExitPages;
 
 			Page *first = p;
 			while (p) {
 				if (!p->next || p->next != p+1) {
 					__clear_cache((char*)first, (char*)(p+1));
 					first = p->next;
@@ -846,94 +974,212 @@ namespace nanojit
 
 				if (!resv->arIndex && resv->reg == UnknownReg)
 				{
 					reserveFree(i);
 				}
 			}
 		}
 	}
-	
+
+#ifdef PERFM
+#define countlir_live() _nvprof("lir-live",1)
+#define countlir_ret() _nvprof("lir-ret",1)
+#define countlir_alloc() _nvprof("lir-alloc",1)
+#define countlir_var() _nvprof("lir-var",1)
+#define countlir_use() _nvprof("lir-use",1)
+#define countlir_def() _nvprof("lir-def",1)
+#define countlir_imm() _nvprof("lir-imm",1)
+#define countlir_param() _nvprof("lir-param",1)
+#define countlir_cmov() _nvprof("lir-cmov",1)
+#define countlir_ld() _nvprof("lir-ld",1)
+#define countlir_ldq() _nvprof("lir-ldq",1)
+#define countlir_alu() _nvprof("lir-alu",1)
+#define countlir_qjoin() _nvprof("lir-qjoin",1)
+#define countlir_qlo() _nvprof("lir-qlo",1)
+#define countlir_qhi() _nvprof("lir-qhi",1)
+#define countlir_fpu() _nvprof("lir-fpu",1)
+#define countlir_st() _nvprof("lir-st",1)
+#define countlir_stq() _nvprof("lir-stq",1)
+#define countlir_jmp() _nvprof("lir-jmp",1)
+#define countlir_jcc() _nvprof("lir-jcc",1)
+#define countlir_label() _nvprof("lir-label",1)
+#define countlir_xcc() _nvprof("lir-xcc",1)
+#define countlir_x() _nvprof("lir-x",1)
+#define countlir_loop() _nvprof("lir-loop",1)
+#define countlir_call() _nvprof("lir-call",1)
+#else
+#define countlir_live()
+#define countlir_ret()
+#define countlir_alloc()
+#define countlir_var()
+#define countlir_use()
+#define countlir_def()
+#define countlir_imm()
+#define countlir_param()
+#define countlir_cmov()
+#define countlir_ld()
+#define countlir_ldq()
+#define countlir_alu()
+#define countlir_qjoin()
+#define countlir_qlo()
+#define countlir_qhi()
+#define countlir_fpu()
+#define countlir_st()
+#define countlir_stq()
+#define countlir_jmp()
+#define countlir_jcc()
+#define countlir_label()
+#define countlir_xcc()
+#define countlir_x()
+#define countlir_loop()
+#define countlir_call()
+#endif
+
 	void Assembler::gen(LirFilter* reader,  NInsList& loopJumps)
 	{
 		// trace must start with LIR_x or LIR_loop
 		NanoAssert(reader->pos()->isop(LIR_x) || reader->pos()->isop(LIR_loop));
 		 
 		for (LInsp ins = reader->read(); ins != 0 && !error(); ins = reader->read())
 		{
-    		Reservation *rR = getresv(ins);
 			LOpcode op = ins->opcode();			
 			switch(op)
 			{
 				default:
 					NanoAssertMsgf(false, "unsupported LIR instruction: %d (~0x40: %d)\n", op, op&~LIR64);
 					break;
 					
+                case LIR_live: {
+                    countlir_live();
+                    pending_lives.add(ins->oprnd1());
+                    break;
+                }
+
+                case LIR_ret:  {
+                    countlir_ret();
+                    if (_nIns != _epilogue) {
+                        JMP(_epilogue);
+                    }
+                    assignSavedParams();
+                    findSpecificRegFor(ins->oprnd1(), retRegs[0]);
+                    break;
+                }
+
+                case LIR_fret: {
+                    countlir_ret();
+                    if (_nIns != _epilogue) {
+                        JMP(_epilogue);
+                    }
+                    assignSavedParams();
+                    findSpecificRegFor(ins->oprnd1(), FST0);
+                    fpu_pop();
+                    break;
+                }
+
+                // allocate some stack space.  the value of this instruction
+                // is the address of the stack space.
+                case LIR_alloc: {
+                    countlir_alloc();
+                    Reservation *resv = getresv(ins);
+                    NanoAssert(resv->arIndex != 0);
+                    Register r = resv->reg;
+                    if (r != UnknownReg) {
+    			        _allocator.retire(r);
+                        resv->reg = UnknownReg;
+                        asm_restore(ins, resv, r);
+                    }
+                    freeRsrcOf(ins, 0);
+                    break;
+                }
 				case LIR_short:
 				case LIR_int:
 				{
+                    countlir_imm();
 					Register rr = prepResultReg(ins, GpRegs);
 					int32_t val;
 					if (op == LIR_int)
 						val = ins->imm32();
 					else
 						val = ins->imm16();
 					if (val == 0)
 						XOR(rr,rr);
 					else
 						LDi(rr, val);
 					break;
 				}
 				case LIR_quad:
 				{
+                    countlir_imm();
 					asm_quad(ins);
 					break;
 				}
 #if !defined NANOJIT_64BIT
 				case LIR_callh:
 				{
 					// return result of quad-call in register
 					prepResultReg(ins, rmask(retRegs[1]));
                     // if hi half was used, we must use the call to ensure it happens
                     findRegFor(ins->oprnd1(), rmask(retRegs[0]));
 					break;
 				}
 #endif
 				case LIR_param:
 				{
-					Register w = Register(ins->imm8());
-                    NanoAssert(w != UnknownReg);
-					// incoming arg in register
-					prepResultReg(ins, rmask(w));
+                    countlir_param();
+                    uint32_t a = ins->imm8();
+                    uint32_t kind = ins->imm8b();
+                    if (kind == 0) {
+                        // ordinary param
+                        AbiKind abi = _thisfrag->lirbuf->abi;
+                        uint32_t abi_regcount = abi == ABI_FASTCALL ? 2 : abi == ABI_THISCALL ? 1 : 0;
+                        if (a < abi_regcount) {
+					        // incoming arg in register
+					        prepResultReg(ins, rmask(argRegs[a]));
+                        } else {
+                            // incoming arg is on stack, and EAX points nearby (see genPrologue)
+                            //_nvprof("param-evict-eax",1);
+                            Register r = prepResultReg(ins, GpRegs & ~rmask(EAX));
+                            int d = (a - abi_regcount) * sizeof(intptr_t) + 8;
+                            LD(r, d, FP); 
+                        }
+                    } 
+                    else {
+                        // saved param
+                        prepResultReg(ins, rmask(savedRegs[a]));
+                    }
 					break;
 				}
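
The kind == 0 path encodes the IA-32 argument-register table: FASTCALL passes the first two arguments in ECX/EDX, THISCALL passes only `this` in ECX, and cdecl-style ABIs pass everything on the stack, which is exactly what abi_regcount computes. The same table as a standalone helper (a sketch; conventions beyond the two ABI names used above are assumptions):

    static uint32_t abi_register_args(AbiKind abi) {
        switch (abi) {
            case ABI_FASTCALL: return 2;   // ECX, EDX
            case ABI_THISCALL: return 1;   // ECX holds 'this'
            default:           return 0;   // cdecl/stdcall: all args on the stack
        }
    }
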
 				case LIR_qlo:
                 {
+                    countlir_qlo();
 					LIns *q = ins->oprnd1();
 
 					if (!asm_qlo(ins, q))
 					{
     					Register rr = prepResultReg(ins, GpRegs);
 				        int d = findMemFor(q);
 				        LD(rr, d, FP);
                     }
 					break;
                 }
 				case LIR_qhi:
 				{
+                    countlir_qhi();
 					Register rr = prepResultReg(ins, GpRegs);
 					LIns *q = ins->oprnd1();
 					int d = findMemFor(q);
 				    LD(rr, d+4, FP);
 					break;
 				}
 
 				case LIR_qcmov:
 				case LIR_cmov:
 				{
+                    countlir_cmov();
 					LIns* condval = ins->oprnd1();
 					NanoAssert(condval->isCmp());
 
 					LIns* values = ins->oprnd2();
 
 					NanoAssert(values->opcode() == LIR_2);
 					LIns* iftrue = values->oprnd1();
 					LIns* iffalse = values->oprnd2();
@@ -988,37 +1234,47 @@ namespace nanojit
 					asm_cmp(condval);
 					break;
 				}
 
 				case LIR_ld:
 				case LIR_ldc:
 				case LIR_ldcb:
 				{
+                    countlir_ld();
 					LIns* base = ins->oprnd1();
 					LIns* disp = ins->oprnd2();
 					Register rr = prepResultReg(ins, GpRegs);
-					Register ra = findRegFor(base, GpRegs);
+					Register ra;
 					int d = disp->constval();
+                    if (base->isop(LIR_alloc)) {
+                        ra = FP;
+                        d += findMemFor(base);
+                    } else {
+                        ra = findRegFor(base, GpRegs);
+                    }
 					if (op == LIR_ldcb)
 						LD8Z(rr, d, ra);
 					else
 						LD(rr, d, ra); 
 					break;
 				}
 
 				case LIR_ldq:
+				case LIR_ldqc:
 				{
+                    countlir_ldq();
 					asm_load64(ins);
 					break;
 				}
 
 				case LIR_neg:
 				case LIR_not:
 				{
+                    countlir_alu();
 					Register rr = prepResultReg(ins, GpRegs);
 
 					LIns* lhs = ins->oprnd1();
 					Reservation *rA = getresv(lhs);
 					// if this is last use of lhs in reg, we can re-use result reg
 					Register ra;
 					if (rA == 0 || (ra=rA->reg) == UnknownReg)
 						ra = findSpecificRegFor(lhs, rr);
@@ -1031,41 +1287,44 @@ namespace nanojit
 
 					if ( rr != ra ) 
 						MR(rr,ra); 
 					break;
 				}
 				
 				case LIR_qjoin:
 				{
+                    countlir_qjoin();
                     asm_qjoin(ins);
 					break;
 				}
 
 #if defined NANOJIT_64BIT
                 case LIR_qiadd:
                 case LIR_qiand:
                 case LIR_qilsh:
                 case LIR_qior:
                 {
                     asm_qbinop(ins);
                     break;
                 }
 #endif
 
 				case LIR_add:
+				case LIR_addp:
 				case LIR_sub:
 				case LIR_mul:
 				case LIR_and:
 				case LIR_or:
 				case LIR_xor:
 				case LIR_lsh:
 				case LIR_rsh:
 				case LIR_ush:
 				{
+                    countlir_alu();
                     LInsp lhs = ins->oprnd1();
                     LInsp rhs = ins->oprnd2();
 
 					Register rb = UnknownReg;
 					RegisterMask allow = GpRegs;
 					bool forceReg = (op == LIR_mul || !rhs->isconst());
 
 #ifdef NANOJIT_ARM
@@ -1081,31 +1340,38 @@ namespace nanojit
 
 					if (lhs != rhs && forceReg)
 					{
 						if ((rb = asm_binop_rhs_reg(ins)) == UnknownReg) {
 							rb = findRegFor(rhs, allow);
 						}
 						allow &= ~rmask(rb);
 					}
+                    else if ((op == LIR_add||op == LIR_addp) && lhs->isop(LIR_alloc) && rhs->isconst()) {
+                        // add alloc+const, use lea
+                        Register rr = prepResultReg(ins, allow);
+                        int d = findMemFor(lhs) + rhs->constval();
+                        LEA(rr, d, FP);
+                        break;
+                    }
 
 					Register rr = prepResultReg(ins, allow);
 					Reservation* rA = getresv(lhs);
 					Register ra;
 					// if this is last use of lhs in reg, we can re-use result reg
 					if (rA == 0 || (ra = rA->reg) == UnknownReg)
 						ra = findSpecificRegFor(lhs, rr);
 					// else, rA already has a register assigned.
 
 					if (forceReg)
 					{
 						if (lhs == rhs)
 							rb = ra;
 
-						if (op == LIR_add)
+						if (op == LIR_add || op == LIR_addp)
 							ADD(rr, rb);
 						else if (op == LIR_sub)
 							SUB(rr, rb);
 						else if (op == LIR_mul)
 							MUL(rr, rb);
 						else if (op == LIR_and)
 							AND(rr, rb);
 						else if (op == LIR_or)
@@ -1119,19 +1385,20 @@ namespace nanojit
 						else if (op == LIR_ush)
 							SHR(rr, rb);
 						else
 							NanoAssertMsg(0, "Unsupported");
 					}
 					else
 					{
 						int c = rhs->constval();
-						if (op == LIR_add) {
-#ifdef NANOJIT_IA32
+						if (op == LIR_add || op == LIR_addp) {
+#ifdef NANOJIT_IA32_TODO
 							if (ra != rr) {
+                                // this doesn't set cc's, only use it when cc's not required.
 								LEA(rr, c, ra);
 								ra = rr; // suppress mov
 							} else
 #endif
 							{
 								ADDi(rr, c); 
 							}
 						} else if (op == LIR_sub) {
@@ -1162,181 +1429,219 @@ namespace nanojit
 
 					if ( rr != ra ) 
 						MR(rr,ra);
 					break;
 				}
 #ifndef NJ_SOFTFLOAT
 				case LIR_fneg:
 				{
+                    countlir_fpu();
 					asm_fneg(ins);
 					break;
 				}
 				case LIR_fadd:
 				case LIR_fsub:
 				case LIR_fmul:
 				case LIR_fdiv:
 				{
+                    countlir_fpu();
 					asm_fop(ins);
                     break;
 				}
 				case LIR_i2f:
 				{
+                    countlir_fpu();
 					asm_i2f(ins);
 					break;
 				}
 				case LIR_u2f:
 				{
+                    countlir_fpu();
 					asm_u2f(ins);
 					break;
 				}
 #endif // NJ_SOFTFLOAT
 				case LIR_st:
 				case LIR_sti:
 				{
+                    countlir_st();
                     asm_store32(ins->oprnd1(), ins->immdisp(), ins->oprnd2());
                     break;
 				}
 				case LIR_stq:
 				case LIR_stqi:
 				{
+                    countlir_stq();
 					LIns* value = ins->oprnd1();
 					LIns* base = ins->oprnd2();
 					int dr = ins->immdisp();
-					if (value->isop(LIR_qjoin)) {
+					if (value->isop(LIR_qjoin)) 
+					{
 						// this is correct for little-endian only
 						asm_store32(value->oprnd1(), dr, base);
 						asm_store32(value->oprnd2(), dr+4, base);
 					}
-					else {
+					else 
+					{
 						asm_store64(value, dr, base);
 					}
                     break;
 				}
-				case LIR_xt:
+
+				case LIR_j:
+				{
+                    countlir_jmp();
+					LInsp to = ins->getTarget();
+                    LabelState *label = _labels.get(to);
+                    // The jump is always taken, so whatever register state we
+                    // have from downstream code is irrelevant to the code before
+                    // this jump; clear it out.  We will pick up register state
+                    // from the jump target, if we have seen that label.
+                    releaseRegisters();
+                    if (label && label->addr) {
+                        // forward jump - pick up register state from target.
+                        unionRegisterState(label->regs);
+                        JMP(label->addr);
+                    }
+                    else {
+                        // backwards jump
+                        hasLoop = true;
+                        handleLoopCarriedExprs();
+                        if (!label) {
+                            // save empty register state at loop header
+                            _labels.add(to, 0, _allocator);
+                        }
+                        else {
+                            intersectRegisterState(label->regs);
+                        }
+                        JMP(0);
+    					_patches.put(_nIns, to);
+                        verbose_only(
+                            verbose_outputf("        Loop %s -> %s", 
+                                lirNames[ins->opcode()], 
+                                _thisfrag->lirbuf->names->formatRef(to));
+                        )
+                    }
+					break;
+				}
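
Because native code is generated bottom-up, a backward LIR_j names a label whose native address does not exist yet; the case above therefore emits JMP(0) as a placeholder and records it in _patches, and the patch loop added to assemble() earlier in this file rewrites every placeholder once its label has been assembled. The two halves of the protocol, side by side:

    // Emission (in gen), target not generated yet:
    JMP(0);                     // placeholder branch
    _patches.put(_nIns, to);    // native ins -> LIR label it must reach

    // Resolution (end of assemble), labels now have addresses:
    while (!_patches.isEmpty()) {
        NIns* where = _patches.lastKey();
        LInsp targ  = _patches.removeLast();
        NIns* ntarg = _labels.get(targ)->addr;   // set when LIR_label was assembled
        if (ntarg)
            nPatchBranch(where, ntarg);
        else
            _err = UnknownBranch;                // label was never emitted
    }
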
+
+				case LIR_jt:
+				case LIR_jf:
+				{
+                    countlir_jcc();
+					LInsp to = ins->getTarget();
+					LIns* cond = ins->oprnd1();
+                    LabelState *label = _labels.get(to);
+                    if (label && label->addr) {
+                        // forward jump to known label.  need to merge with label's register state.
+                        unionRegisterState(label->regs);
+    					asm_branch(op == LIR_jf, cond, label->addr);
+                    }
+                    else {
+                        // back edge.
+                        hasLoop = true;
+                        handleLoopCarriedExprs();
+                        if (!label) {
+                            // evict all registers, most conservative approach.
+                            evictRegs(~_allocator.free);
+                            _labels.add(to, 0, _allocator);
+                        } 
+                        else {
+                            // evict all registers, most conservative approach.
+                            intersectRegisterState(label->regs);
+                        }
+                        NIns *branch = asm_branch(op == LIR_jf, cond, 0);
+			            _patches.put(branch,to);
+                        verbose_only(
+                            verbose_outputf("Loop %s -> %s", 
+                                lirNames[ins->opcode()], 
+                                _thisfrag->lirbuf->names->formatRef(to));
+                        )
+                    }
+					break;
+				}					
+				case LIR_label:
+				{
+                    countlir_label();
+                    LabelState *label = _labels.get(ins);
+                    if (!label) {
+                        // label seen first, normal target of forward jump, save addr & allocator
+    					_labels.add(ins, _nIns, _allocator);
+                    }
+                    else {
+                        // we're at the top of a loop
+                        hasLoop = true;
+                        NanoAssert(label->addr == 0 && label->regs.isValid());
+                        //evictRegs(~_allocator.free);
+                        intersectRegisterState(label->regs);
+                        //asm_align_code();
+                        label->addr = _nIns;
+                        verbose_only(
+                            verbose_outputf("Loop %s", _thisfrag->lirbuf->names->formatRef(ins));
+                        )
+                    }
+					break;
+				}
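// The three cases above implement a label-state protocol.  Code is generated
// bottom-up, so a label whose address is already known was emitted below the
// jump, i.e. the jump is forward.  A sketch of the protocol; emitJump and
// recordPatch are hypothetical stand-ins for JMP/asm_branch and _patches.put:
//
//   on a jump to L:
//       LabelState *s = _labels.get(L);
//       if (s && s->addr) {                       // forward jump
//           unionRegisterState(s->regs);          // adopt target's reg state
//           emitJump(s->addr);
//       } else {                                  // back edge, L comes later
//           if (!s) _labels.add(L, 0, _allocator);    // record current state
//           else    intersectRegisterState(s->regs);  // conform to prior edge
//           recordPatch(emitJump(0), L);          // patched once L assembles
//       }
//
//   on reaching label L:
//       LabelState *s = _labels.get(L);
//       if (!s) _labels.add(L, _nIns, _allocator);    // plain forward target
//       else  { intersectRegisterState(s->regs); s->addr = _nIns; }  // loop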
+
+                case LIR_xt:
 				case LIR_xf:
 				{
-                    NIns* exit = asm_exit(ins);
-	
+                    countlir_xcc();
 					// we only support cmp with guard right now, also assume it is 'close' and only emit the branch
+                    NIns* exit = asm_exit(ins); // does intersectRegisterState()
 					LIns* cond = ins->oprnd1();
-					LOpcode condop = cond->opcode();
-					NanoAssert(cond->isCond());
-#if !defined(NJ_SOFTFLOAT)
-                    if (condop >= LIR_feq && condop <= LIR_fge)
-					{
-#if defined(NJ_ARM_VFP)
-						if (op == LIR_xf)
-							JNE(exit);
-						else
-							JE(exit);
-#else
-						if (op == LIR_xf)
-							JP(exit);
-						else
-							JNP(exit);
-#endif
-						asm_fcmp(cond);
-                        break;
-					}
-#endif
-					// produce the branch
-					if (op == LIR_xf)
-					{
-						if (condop == LIR_eq)
-							JNE(exit);
-                        else if (condop == LIR_ov)
-                            JNO(exit);
-                        else if (condop == LIR_cs)
-                            JNC(exit);
-						else if (condop == LIR_lt)
-							JNL(exit);
-						else if (condop == LIR_le)
-							JNLE(exit);
-						else if (condop == LIR_gt)
-							JNG(exit);
-						else if (condop == LIR_ge)
-							JNGE(exit);
-						else if (condop == LIR_ult)
-							JNB(exit);
-						else if (condop == LIR_ule)
-							JNBE(exit);
-						else if (condop == LIR_ugt)
-							JNA(exit);
-						else //if (condop == LIR_uge)
-							JNAE(exit);
-					}
-					else // op == LIR_xt
-					{
-						if (condop == LIR_eq)
-							JE(exit);
-                        else if (condop == LIR_ov)
-                            JO(exit);
-                        else if (condop == LIR_cs)
-                            JC(exit);
-						else if (condop == LIR_lt)
-							JL(exit);
-						else if (condop == LIR_le)
-							JLE(exit);
-						else if (condop == LIR_gt)
-							JG(exit);
-						else if (condop == LIR_ge)
-							JGE(exit);
-						else if (condop == LIR_ult)
-							JB(exit);
-						else if (condop == LIR_ule)
-							JBE(exit);
-						else if (condop == LIR_ugt)
-							JA(exit);
-						else //if (condop == LIR_uge)
-							JAE(exit);
-					}
-					asm_cmp(cond);
+					asm_branch(op == LIR_xf, cond, exit);
 					break;
 				}
 				case LIR_x:
 				{
+                    countlir_x();
 		            verbose_only(verbose_output(""));
 					// generate the side exit branch on the main trace.
                     NIns *exit = asm_exit(ins);
 					JMP( exit ); 
 					break;
 				}
 				case LIR_loop:
 				{
+                    countlir_loop();
 					JMP_long_placeholder(); // jump to SOT	
 					verbose_only( if (_verbose && _outputCache) { _outputCache->removeLast(); outputf("         jmp   SOT"); } );
 					
 					loopJumps.add(_nIns);
 
                     #ifdef NJ_VERBOSE
                     // branching from this frag to ourself.
                     if (_frago->core()->config.show_stats)
-					#if defined NANOJIT_64BIT
+					#if defined NANOJIT_AMD64
                         LDQi(argRegs[1], intptr_t((Fragment*)_thisfrag));
 					#else
                         LDi(argRegs[1], int((Fragment*)_thisfrag));
                     #endif
                     #endif
 
+					assignSavedParams();
+
 					// restore first parameter, the only one we use
                     LInsp state = _thisfrag->lirbuf->state;
-                    Register a0 = Register(state->imm8());
-					findSpecificRegFor(state, a0); 
+					findSpecificRegFor(state, argRegs[state->imm8()]); 
 					break;
 				}
 #ifndef NJ_SOFTFLOAT
 				case LIR_feq:
 				case LIR_fle:
 				case LIR_flt:
 				case LIR_fgt:
 				case LIR_fge:
 				{
+                    countlir_fpu();
 					// only want certain regs 
 					Register r = prepResultReg(ins, AllowableFlagRegs);
+					asm_setcc(r, ins);
 #ifdef NJ_ARM_VFP
 					SETE(r);
 #else
 					// SETcc only sets low 8 bits, so extend 
 					MOVZX8(r,r);
 					SETNP(r);
 #endif
 					asm_fcmp(ins);
@@ -1350,16 +1655,17 @@ namespace nanojit
 				case LIR_lt:
 				case LIR_gt:
 				case LIR_ge:
 				case LIR_ult:
 				case LIR_ule:
 				case LIR_ugt:
 				case LIR_uge:
 				{
+                    countlir_alu();
 					// only want certain regs 
 					Register r = prepResultReg(ins, AllowableFlagRegs);
 					// SETcc only sets low 8 bits, so extend 
 					MOVZX8(r,r);
 					if (op == LIR_eq)
 						SETE(r);
                     else if (op == LIR_ov)
                         SETO(r);
@@ -1382,98 +1688,162 @@ namespace nanojit
 					else // if (op == LIR_uge)
 						SETAE(r);
 					asm_cmp(ins);
 					break;
 				}
 
 #ifndef NJ_SOFTFLOAT
 				case LIR_fcall:
+				case LIR_fcalli:
 #endif
 #if defined NANOJIT_64BIT
 				case LIR_callh:
 #endif
 				case LIR_call:
+				case LIR_calli:
 				{
+                    countlir_call();
                     Register rr = UnknownReg;
 #ifndef NJ_SOFTFLOAT
-                    if (op == LIR_fcall)
+                    if (op & LIR64)
                     {
+                        // fcall or fcalli
+						Reservation* rR = getresv(ins);
 						rr = asm_prep_fcall(rR, ins);
                     }
                     else
 #endif
                     {
-						(void)rR;
                         rr = retRegs[0];
 						prepResultReg(ins, rmask(rr));
                     }
 
 					// do this after we've handled the call result, so we dont
 					// force the call result to be spilled unnecessarily.
-					restoreCallerSaved();
+
+					evictScratchRegs();
 
 					asm_call(ins);
 				}
 			}
 
 			// check that all is well (don't check in exit paths since its more complicated)
 			debug_only( pageValidate(); )
 			debug_only( resourceConsistencyCheck();  )
 		}
 	}
 
-    void Assembler::asm_arg(ArgSize sz, LInsp p, Register r)
-    {
-        if (sz == ARGSIZE_Q) 
-        {
-			// ref arg - use lea
-			if (r != UnknownReg)
-			{
-				// arg in specific reg
-				int da = findMemFor(p);
-				LEA(r, da, FP);
-			}
-			else
-			{
-				NanoAssert(0); // not supported
-			}
+	NIns* Assembler::asm_branch(bool branchOnFalse, LInsp cond, NIns* targ)
+	{
+		NIns* at = 0;
+		LOpcode condop = cond->opcode();
+		NanoAssert(cond->isCond());
+#ifndef NJ_SOFTFLOAT
+		if (condop >= LIR_feq && condop <= LIR_fge)
+		{
+			return asm_jmpcc(branchOnFalse, cond, targ);
 		}
-        else if (sz == ARGSIZE_LO)
+#endif
+		// produce the branch
+		if (branchOnFalse)
+		{
+			if (condop == LIR_eq)
+				JNE(targ);
+			else if (condop == LIR_ov)
+				JNO(targ);
+			else if (condop == LIR_cs)
+				JNC(targ);
+			else if (condop == LIR_lt)
+				JNL(targ);
+			else if (condop == LIR_le)
+				JNLE(targ);
+			else if (condop == LIR_gt)
+				JNG(targ);
+			else if (condop == LIR_ge)
+				JNGE(targ);
+			else if (condop == LIR_ult)
+				JNB(targ);
+			else if (condop == LIR_ule)
+				JNBE(targ);
+			else if (condop == LIR_ugt)
+				JNA(targ);
+			else //if (condop == LIR_uge)
+				JNAE(targ);
+		}
+		else // branch on true
 		{
-			if (r != UnknownReg)
-			{
-				// arg goes in specific register
-				if (p->isconst())
-					LDi(r, p->constval());
-				else
-					findSpecificRegFor(p, r);
-			}
-			else
-			{
-				asm_pusharg(p);
-			}
+			if (condop == LIR_eq)
+				JE(targ);
+			else if (condop == LIR_ov)
+				JO(targ);
+			else if (condop == LIR_cs)
+				JC(targ);
+			else if (condop == LIR_lt)
+				JL(targ);
+			else if (condop == LIR_le)
+				JLE(targ);
+			else if (condop == LIR_gt)
+				JG(targ);
+			else if (condop == LIR_ge)
+				JGE(targ);
+			else if (condop == LIR_ult)
+				JB(targ);
+			else if (condop == LIR_ule)
+				JBE(targ);
+			else if (condop == LIR_ugt)
+				JA(targ);
+			else //if (condop == LIR_uge)
+				JAE(targ);
 		}
-        else
-		{
-			asm_farg(p);
-		}
+		at = _nIns;
+		asm_cmp(cond);
+		return at;
+	}
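// The ladder above is a dense map from comparison opcode to x86 condition
// code.  A table-driven sketch of the same mapping (assumes LIR_ov..LIR_uge
// are contiguous, as the lirNames table in LIR.cpp shows; the CC_* names are
// hypothetical; x86 negates a condition by flipping the low bit of its
// encoding, e.g. JE=0x74 / JNE=0x75):
//
//   static const uint8_t ccTrue[] = {
//       CC_O,  /* LIR_ov  */  CC_C,  /* LIR_cs  */  CC_E,  /* LIR_eq  */
//       CC_L,  /* LIR_lt  */  CC_G,  /* LIR_gt  */  CC_LE, /* LIR_le  */
//       CC_GE, /* LIR_ge  */  CC_B,  /* LIR_ult */  CC_A,  /* LIR_ugt */
//       CC_BE, /* LIR_ule */  CC_AE, /* LIR_uge */
//   };
//   uint8_t cc = ccTrue[condop - LIR_ov];
//   if (branchOnFalse) cc ^= 1;               // JE<->JNE, JL<->JNL, ...
//   emit_jcc(cc, targ);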
+
+    void Assembler::assignSavedParams()
+    {
+        // restore saved regs
+		releaseRegisters();
+        LirBuffer *b = _thisfrag->lirbuf;
+        for (int i=0, n = NumSavedRegs; i < n; i++) {
+            LIns *p = b->savedParams[i];
+            if (p)
+                findSpecificRegFor(p, savedRegs[p->imm8()]);
+        }
     }
 
-	uint32_t Assembler::arFree(uint32_t idx)
-	{
-		// nothing to free
-		if (idx == 0)
-			return 0;
+    void Assembler::reserveSavedParams()
+    {
+        LirBuffer *b = _thisfrag->lirbuf;
+        for (int i=0, n = NumSavedRegs; i < n; i++) {
+            LIns *p = b->savedParams[i];
+            if (p)
+                findMemFor(p);
+        }
+    }
 
-		if (idx > 0 && _activation.entry[idx] == _activation.entry[idx+stack_direction(1)])
-			_activation.entry[idx+stack_direction(1)] = 0;  // clear 2 slots for doubles 
+    void Assembler::handleLoopCarriedExprs()
+    {
+        // ensure that exprs spanning the loop are marked live at the end of the loop
+        reserveSavedParams();
+        for (int i=0, n=pending_lives.size(); i < n; i++) {
+            findMemFor(pending_lives[i]);
+        }
+    }
 
-		_activation.entry[idx] = 0;
-		return 0;
+	void Assembler::arFree(uint32_t idx)
+	{
+        AR &ar = _activation;
+        LIns *i = ar.entry[idx];
+        NanoAssert(i != 0);
+        do {
+            ar.entry[idx] = 0;
+            idx--;
+        } while (ar.entry[idx] == i);
 	}
 
 #ifdef NJ_VERBOSE
 	void Assembler::printActivationState()
 	{
 		bool verbose_activation = false;
 		if (!verbose_activation)
 			return;
@@ -1513,104 +1883,245 @@ namespace nanojit
 					}
 				}
 				output(&outline[0]);
 			}
 		)
 #endif
 	}
 #endif
+
+    bool canfit(int32_t size, int32_t loc, AR &ar) {
+        for (int i=0; i < size; i++) {
+            if (ar.entry[loc+stack_direction(i)])
+                return false;
+        }
+        return true;
+    }
 	
 	uint32_t Assembler::arReserve(LIns* l)
 	{
 		NanoAssert(!l->isTramp());
 
 		//verbose_only(printActivationState());
-		const bool quad = l->isQuad();
-		const int32_t n = _activation.tos;
-		int32_t start = _activation.lowwatermark;
+        int32_t size = l->isop(LIR_alloc) ? (l->size()>>2) : l->isQuad() ? 2 : sizeof(intptr_t)>>2;
+        AR &ar = _activation;
+		const int32_t tos = ar.tos;
+		int32_t start = ar.lowwatermark;
 		int32_t i = 0;
 		NanoAssert(start>0);
-		if (n >= NJ_MAX_STACK_ENTRY-2)
-		{	
-			setError(StackFull);
-			return start;
-		}
-		else if (quad)
-		{
-			if ( (start&1)==1 ) start++;  // even 
-			for(i=start; i <= n; i+=2)
-			{
-				if ( (_activation.entry[i+stack_direction(1)] == 0) && (i==n || (_activation.entry[i] == 0)) )
-					break;   //  for fp we need 2 adjacent aligned slots
+
+        if (size == 1) {
+            // easy most common case -- find a hole, or make the frame bigger
+            for (i=start; i < NJ_MAX_STACK_ENTRY; i++) {
+                if (ar.entry[i] == 0) {
+                    // found a hole
+                    ar.entry[i] = l;
+                    break;
+                }
+            }
+        }
+        else if (size == 2) {
+			if ( (start&1)==1 ) start++;  // round up to an even (8-byte aligned) slot
+			for (i=start; i < NJ_MAX_STACK_ENTRY; i+=2) {
+                if ( (ar.entry[i+stack_direction(1)] == 0) && (i==tos || (ar.entry[i] == 0)) ) {
+                    // found 2 adjacent aligned slots
+                    NanoAssert(_activation.entry[i] == 0);
+                    NanoAssert(_activation.entry[i+stack_direction(1)] == 0);
+                    ar.entry[i] = l;
+                    ar.entry[i+stack_direction(1)] = l;
+                    break;   
+                }
 			}
 		}
-		else
-		{
-			for(i=start; i < n; i++)
-			{
-				if (_activation.entry[i] == 0)
-					break;   // not being used
-			}
+        else {
+            // alloc larger block on an 8-byte boundary.
+            if (start < size) start = size;
+            if ((start&1)==1) start++;
+            for (i=start; i < NJ_MAX_STACK_ENTRY; i+=2) {
+                if (canfit(size, i, ar)) {
+                    // place the entry in the table and mark the instruction with it
+                    for (int32_t j=0; j < size; j++) {
+                        NanoAssert(_activation.entry[i+stack_direction(j)] == 0);
+                        _activation.entry[i+stack_direction(j)] = l;
+                    }
+                    break;
+                }
+            }
 		}
-
-		int32_t inc = ((i-n+1) < 0) ? 0 : (i-n+1);
-		if (quad && stack_direction(1)>0) inc++;
-		_activation.tos += inc;
-		_activation.highwatermark += inc;
-
-		// place the entry in the table and mark the instruction with it
-		_activation.entry[i] = l;
-		if (quad) _activation.entry[i+stack_direction(1)] = l;
-		return i;
+        if (i >= (int32_t)ar.tos) {
+            ar.tos = ar.highwatermark = i+1;
+        }
+		if (tos+size >= NJ_MAX_STACK_ENTRY) {	
+			setError(StackFull);
+		}
+        return i;
 	}
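// In summary (one slot = 4 bytes): 32-bit values take the first free slot;
// 64-bit values take the first even-aligned free pair; LIR_alloc blocks take
// the first even-aligned run of 'size' free slots (canfit).  A sketch of the
// size computation, equivalent to the expression at the top of arReserve:
//
//   int32_t slots = ins->isop(LIR_alloc) ? (ins->size() >> 2)  // bytes->slots
//                 : ins->isQuad()        ? 2
//                 : (sizeof(intptr_t) >> 2);                   // 1 on 32-bit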
 
-	void Assembler::restoreCallerSaved()
+    /**
+     * move regs around so the SavedRegs contains the highest priority regs.
+     */
+    void Assembler::evictScratchRegs()
+    {
+		// find the top GpRegs that are candidates to put in SavedRegs
+
+        // tosave is a binary heap stored in an array: the root is tosave[0],
+        // the left child of node i is at i+1, the right child at i+2.
+
+        Register tosave[LastReg-FirstReg+1];
+        int len=0;
+        RegAlloc *regs = &_allocator;
+        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
+			if (rmask(r) & GpRegs) {
+				LIns *i = regs->getActive(r);
+				if (i) {
+					if (canRemat(i)) {
+						evict(r);
+					}
+					else {
+						int32_t pri = regs->getPriority(r);
+                        // add to heap by adding to end and bubbling up
+                        int j = len++;
+                        while (j > 0 && pri > regs->getPriority(tosave[j/2])) {
+                            tosave[j] = tosave[j/2];
+                            j /= 2;
+                        }
+                        NanoAssert(size_t(j) < sizeof(tosave)/sizeof(tosave[0]));
+                        tosave[j] = r;
+					}
+				}
+            }
+        }
+
+        // now tosave holds the live exprs in priority order.
+        // allocate each of the top-priority exprs to a SavedReg.
+
+        RegisterMask allow = SavedRegs;
+        while (allow && len > 0) {
+            // get the highest priority var
+            Register hi = tosave[0];
+            LIns *i = regs->getActive(hi);
+            Register r = findRegFor(i, allow);
+			allow &= ~rmask(r);
+
+            // remove from heap by replacing root with end element and bubbling down.
+            if (allow && --len > 0) {
+                Register last = tosave[len];
+                int j = 0;
+                while (j+1 < len) {
+                    int child = j+1;
+                    if (j+2 < len && regs->getPriority(tosave[j+2]) > regs->getPriority(tosave[j+1]))
+                        child++;
+                    if (regs->getPriority(last) > regs->getPriority(tosave[child]))
+                        break;
+                    tosave[j] = tosave[child];
+                    j = child;
+                }
+                tosave[j] = last;
+            }
+        }
+
+		// now evict everything else.
+		evictRegs(~SavedRegs);
+    }
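// The same selection expressed with a library heap, as a sketch only (the
// code above hand-rolls its heap to avoid allocating inside the assembler):
//
//   #include <queue>
//   std::priority_queue<std::pair<int32_t, Register> > byPri;
//   for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
//       LIns *v = (rmask(r) & GpRegs) ? _allocator.getActive(r) : 0;
//       if (!v) continue;
//       if (canRemat(v)) evict(r);            // cheap to rebuild: just evict
//       else byPri.push(std::make_pair(_allocator.getPriority(r), r));
//   }
//   RegisterMask allow = SavedRegs;
//   while (allow && !byPri.empty()) {
//       Register hi = byPri.top().second; byPri.pop();
//       allow &= ~rmask(findRegFor(_allocator.getActive(hi), allow));
//   }
//   evictRegs(~SavedRegs);                    // spill whatever is left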
+
+	void Assembler::evictRegs(RegisterMask regs)
 	{
-		// generate code to restore callee saved registers 
-		// @todo speed this up
+		// evict any active registers in the given set
+		// @todo speed this up
-		RegisterMask scratch = ~SavedRegs;
-		for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
-		{
-			if ((rmask(r) & scratch) && _allocator.getActive(r))
-            {
+        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
+            if ((rmask(r) & regs) && _allocator.getActive(r)) {
 				evict(r);
             }
 		}
 	}
 	
 	/**
 	 * Merge the current state of the registers with a previously stored version
+     * current == saved    skip
+     * current & saved     evict current, keep saved
+     * current & !saved    evict current  (unionRegisterState would keep)
+     * !current & saved    keep saved
 	 */
-	void Assembler::mergeRegisterState(RegAlloc& saved)
+	void Assembler::intersectRegisterState(RegAlloc& saved)
 	{
 		// evictions and pops first
 		RegisterMask skip = 0;
 		for (Register r=FirstReg; r <= LastReg; r = nextreg(r))
 		{
 			LIns * curins = _allocator.getActive(r);
 			LIns * savedins = saved.getActive(r);
 			if (curins == savedins)
 			{
-				verbose_only( if (curins) 
-					verbose_outputf("        skip %s", regNames[r]); )
+				verbose_only( if (curins) verbose_outputf("        skip %s", regNames[r]); )
 				skip |= rmask(r);
 			}
 			else 
 			{
-				if (curins)
+                if (curins) {
+                    //_nvprof("intersect-evict",1);
 					evict(r);
+                }
 				
     			#ifdef NANOJIT_IA32
 				if (savedins && (rmask(r) & x87Regs))
 					FSTP(r);
 				#endif
 			}
 		}
+        assignSaved(saved, skip);
+	}
 
+	/**
+	 * Merge the current state of the registers with a previously stored version.
+     * 
+     * current == saved    skip
+     * current & saved     evict current, keep saved
+     * current & !saved    keep current (intersectRegisterState would evict)
+     * !current & saved    keep saved
+	 */
+	void Assembler::unionRegisterState(RegAlloc& saved)
+	{
+		// evictions and pops first
+		RegisterMask skip = 0;
+		for (Register r=FirstReg; r <= LastReg; r = nextreg(r))
+		{
+			LIns * curins = _allocator.getActive(r);
+			LIns * savedins = saved.getActive(r);
+			if (curins == savedins)
+			{
+				verbose_only( if (curins) verbose_outputf("        skip %s", regNames[r]); )
+				skip |= rmask(r);
+			}
+			else 
+			{
+                if (curins && savedins) {
+                    //_nvprof("union-evict",1);
+					evict(r);
+                }
+				
+    			#ifdef NANOJIT_IA32
+				if (rmask(r) & x87Regs) {
+					if (savedins) {
+						FSTP(r);
+					}
+					else {
+						// saved state did not have fpu reg allocated,
+						// so we must evict here to keep x87 stack balanced.
+						evict(r);
+					}
+				}
+				#endif
+			}
+		}
+        assignSaved(saved, skip);
+    }
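// Worked example of the two merge policies (illustrative contents): with
// current = {eax:A, ebx:B} and saved = {eax:A, ecx:C},
//   intersectRegisterState -> {eax:A, ecx:C}         (ebx:B evicted)
//   unionRegisterState     -> {eax:A, ebx:B, ecx:C}  (ebx:B kept)
// Both evict a register that holds different values on the two sides; they
// differ only on registers live in current but not in saved.  Intersect is
// used where the saved state must hold exactly (back edges and labels);
// union is used for forward jumps, where extra live registers are harmless.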
+
+    void Assembler::assignSaved(RegAlloc &saved, RegisterMask skip)
+    {
 		// now reassign mainline registers
 		for (Register r=FirstReg; r <= LastReg; r = nextreg(r))
 		{
 			LIns *i = saved.getActive(r);
 			if (i && !(skip&rmask(r)))
 				findSpecificRegFor(i, r);
 		}
 		debug_only(saved.used = 0);  // marker that we are no longer in exit path
@@ -1710,38 +2221,50 @@ namespace nanojit
 
 	#endif /* FEATURE_NANOJIT */
 
 #if defined(FEATURE_NANOJIT) || defined(NJ_VERBOSE)
 	uint32_t CallInfo::_count_args(uint32_t mask) const
 	{
 		uint32_t argc = 0;
 		uint32_t argt = _argtypes;
-		for (int i = 0; i < 5; ++i)
-		{
+		for (uint32_t i = 0; i < MAXARGS; ++i) {
 			argt >>= 2;
 			argc += (argt & mask) != 0;
 		}
 		return argc;
 	}
 
     uint32_t CallInfo::get_sizes(ArgSize* sizes) const
     {
 		uint32_t argt = _argtypes;
 		uint32_t argc = 0;
-		for (int32_t i = 0; i < 5; i++) {
+		for (uint32_t i = 0; i < MAXARGS; i++) {
 			argt >>= 2;
 			ArgSize a = ArgSize(argt&3);
 #ifdef NJ_SOFTFLOAT
 			if (a == ARGSIZE_F) {
                 sizes[argc++] = ARGSIZE_LO;
                 sizes[argc++] = ARGSIZE_LO;
                 continue;
             }
 #endif
             if (a != ARGSIZE_NONE) {
                 sizes[argc++] = a;
             }
 		}
+        if (isIndirect()) {
+            // add one more arg for indirect call address
+            argc++;
+        }
         return argc;
     }
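// A sketch of the _argtypes packing these two methods decode (per the 2-bit
// ARGSIZE fields described in the old Assembler.h comment): the low 2 bits
// hold the return type and each following 2-bit field is one argument, with
// ARGSIZE_NONE marking unused fields.
//
//   uint32_t argt = _argtypes;
//   ArgSize ret = ArgSize(argt & 3);     // return type, bits 0..1
//   for (uint32_t i = 0; i < MAXARGS; i++) {
//       argt >>= 2;
//       ArgSize a = ArgSize(argt & 3);   // i-th argument, or ARGSIZE_NONE
//   }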
-#endif
+
+    void LabelStateMap::add(LIns *label, NIns *addr, RegAlloc &regs) {
+        LabelState *st = new (gc) LabelState(addr, regs);
+        labels.put(label, st);
+    }
+
+    LabelState* LabelStateMap::get(LIns *label) {
+        return labels.get(label);
+    }
 }
+#endif // FEATURE_NANOJIT
--- a/js/src/nanojit/Assembler.h
+++ b/js/src/nanojit/Assembler.h
@@ -68,67 +68,45 @@ namespace nanojit
 	#define STACK_GRANULARITY		sizeof(void *)
 
 	/**
 	 * The Assembler is only concerned with transforming LIR to native instructions
 	 */
     struct Reservation
 	{
 		uint32_t arIndex:16;	/* index into stack frame.  displ is -4*arIndex */
-		Register reg:8;			/* register UnkownReg implies not in register */
-        int cost:8;
+		Register reg:15;		/* register; UnknownReg implies not in register */
+        uint32_t used:1;
 	};
 
 	struct AR
 	{
 		LIns*			entry[ NJ_MAX_STACK_ENTRY ];	/* maps to 4B contiguous locations relative to the frame pointer */
 		uint32_t		tos;							/* current top of stack entry */
 		uint32_t		highwatermark;					/* max tos hit */
 		uint32_t		lowwatermark;					/* we pre-allocate entries from 0 upto this index-1; so dynamic entries are added above this index */
-		LIns*			parameter[ NJ_MAX_PARAMETERS ]; /* incoming parameters */
-	};
-
-    enum ArgSize {
-	    ARGSIZE_NONE = 0,
-	    ARGSIZE_F = 1,
-	    ARGSIZE_LO = 2,
-	    ARGSIZE_Q = 3,
-	    _ARGSIZE_MASK_INT = 2, 
-        _ARGSIZE_MASK_ANY = 3
-    };
-
-	struct CallInfo
-	{
-		intptr_t	_address;
-		uint16_t	_argtypes;		// 6 2-bit fields indicating arg type, by ARGSIZE above (including ret type): a1 a2 a3 a4 a5 ret
-		uint8_t		_cse;			// true if no side effects
-		uint8_t		_fold;			// true if no side effects
-		verbose_only ( const char* _name; )
-		
-		uint32_t FASTCALL _count_args(uint32_t mask) const;
-        uint32_t get_sizes(ArgSize*) const;
-
-		inline uint32_t FASTCALL count_args() const { return _count_args(_ARGSIZE_MASK_ANY); }
-		inline uint32_t FASTCALL count_iargs() const { return _count_args(_ARGSIZE_MASK_INT); }
-		// fargs = args - iargs
 	};
 
 	#ifdef AVMPLUS_WIN32
 		#define AVMPLUS_ALIGN16(type) __declspec(align(16)) type
 	#else
 		#define AVMPLUS_ALIGN16(type) type __attribute__ ((aligned (16)))
 	#endif
 
 	struct Stats
 	{
 		counter_define(steals;)
 		counter_define(remats;)
 		counter_define(spills;)
 		counter_define(native;)
         counter_define(exitnative;)
+		
+		int32_t pages;
+		NIns* codeStart;
+		NIns* codeExitStart;
 
 		DECLARE_PLATFORM_STATS()
 #ifdef __GNUC__
 		// inexplicably, gnuc gives padding/alignment warnings without this. pacify it.
 		bool pad[4];
 #endif
 	};
 
@@ -141,20 +119,44 @@ namespace nanojit
 		,OutOMem
 		,StackFull
 		,ResvFull
 		,RegionFull
         ,MaxLength
         ,MaxExit
         ,MaxXJump
         ,UnknownPrim
+        ,UnknownBranch
 	};
 
 	typedef avmplus::List<NIns*, avmplus::LIST_NonGCObjects> NInsList;
+	typedef avmplus::SortedMap<LIns*,NIns*,avmplus::LIST_NonGCObjects> InsMap;
+	typedef avmplus::SortedMap<NIns*,LIns*,avmplus::LIST_NonGCObjects> NInsMap;
 
+    class LabelState MMGC_SUBCLASS_DECL
+    {
+    public:
+        RegAlloc regs;
+        NIns *addr;
+        LabelState(NIns *a, RegAlloc &r) : regs(r), addr(a)
+        {}
+    };
+
+    class LabelStateMap
+    {
+        GC *gc;
+        avmplus::SortedMap<LIns*, LabelState*, avmplus::LIST_GCObjects> labels;
+    public:
+        LabelStateMap(GC *gc) : gc(gc), labels(gc)
+        {}
+
+        void clear() { labels.clear(); }
+        void add(LIns *label, NIns *addr, RegAlloc &regs);
+        LabelState *get(LIns *);
+    };
     /**
  	 * Information about the activation record for the method is built up 
  	 * as we generate machine code.  As part of the prologue, we issue
 	 * a stack adjustment instruction and then later patch the adjustment
 	 * value.  Temporary values can be placed into the AR as method calls
 	 * are issued.   Also MIR_alloc instructions will consume space.
 	 */
 	class Assembler MMGC_SUBCLASS_DECL
@@ -185,66 +187,73 @@ namespace nanojit
 			void		copyRegisters(RegAlloc* copyTo);
 			void		releaseRegisters();
             void        patch(GuardRecord *lr);
 			void		unpatch(GuardRecord *lr);
 			AssmError   error()	{ return _err; }
 			void		setError(AssmError e) { _err = e; }
 			void		setCallTable(const CallInfo *functions);
 			void		pageReset();
+			int32_t		codeBytes();
 			Page*		handoverPages(bool exitPages=false);
 
 			debug_only ( void		pageValidate(); )
 			debug_only ( bool		onPage(NIns* where, bool exitPages=false); )
 			
 			// support calling out from a fragment ; used to debug the jit
 			debug_only( void		resourceConsistencyCheck(); )
-			debug_only( void		registerConsistencyCheck(LIns** resv); )
+			debug_only( void		registerConsistencyCheck(); )
 			
 			Stats		_stats;		
+            int hasLoop;
 
 		private:
 			
 			void		gen(LirFilter* toCompile, NInsList& loopJumps);
-			NIns*		genPrologue(RegisterMask);
-			NIns*		genEpilogue(RegisterMask);
-
-			bool		ignoreInstruction(LInsp ins);
+			NIns*		genPrologue();
+			NIns*		genEpilogue();
 
 			GuardRecord* placeGuardRecord(LInsp guard);
 			void		initGuardRecord(LInsp guard, GuardRecord*);
 
 			uint32_t	arReserve(LIns* l);
-			uint32_t	arFree(uint32_t idx);
+			void    	arFree(uint32_t idx);
 			void		arReset();
 
 			Register	registerAlloc(RegisterMask allow);
 			void		registerResetAll();
-			void		restoreCallerSaved();
-			void		mergeRegisterState(RegAlloc& saved);
-	        LInsp       findVictim(RegAlloc& regs, RegisterMask allow, RegisterMask prefer);
+			void		evictRegs(RegisterMask regs);
+            void        evictScratchRegs();
+			void		intersectRegisterState(RegAlloc& saved);
+			void		unionRegisterState(RegAlloc& saved);
+            void        assignSaved(RegAlloc &saved, RegisterMask skip);
+	        LInsp       findVictim(RegAlloc& regs, RegisterMask allow);
 		
 			int			findMemFor(LIns* i);
 			Register	findRegFor(LIns* i, RegisterMask allow);
 			void		findRegFor2(RegisterMask allow, LIns* ia, Reservation* &ra, LIns *ib, Reservation* &rb);
 			Register	findSpecificRegFor(LIns* i, Register w);
 			Register	prepResultReg(LIns *i, RegisterMask allow);
 			void		freeRsrcOf(LIns *i, bool pop);
 			void		evict(Register r);
 			RegisterMask hint(LIns*i, RegisterMask allow);
 
 			NIns*		pageAlloc(bool exitPage=false);
 			void		pagesFree(Page*& list);
 			void		internalReset();
+            bool        canRemat(LIns*);
 
 			Reservation* reserveAlloc(LInsp i);
 			void		reserveFree(LInsp i);
 			void		reserveReset();
 
-			Reservation* getresv(LIns *x) { return x->resv() ? &_resvTable[x->resv()] : 0; }
+			Reservation* getresv(LIns *x) {
+                uint32_t resv_index = x->resv();
+                return resv_index ? &_resvTable[resv_index] : 0;
+            }
 
 			DWB(Fragmento*)		_frago;
             GC*					_gc;
             DWB(Fragment*)		_thisfrag;
 			RegAllocMap*		_branchStateMap;
 			GuardRecord*		_latestGuard;
 		
 			const CallInfo	*_functions;
@@ -254,60 +263,72 @@ namespace nanojit
 			NIns*       _epilogue;
 			Page*		_nativePages;	// list of NJ_PAGE_SIZE pages that have been alloc'd
 			Page*		_nativeExitPages; // list of pages that have been allocated for exit code
 			AssmError	_err;			// 0 = means assemble() appears ok, otherwise it failed
 
 			AR			_activation;
 			RegAlloc	_allocator;
 
+			LabelStateMap	_labels; 
+			NInsMap		_patches;
 			Reservation _resvTable[ NJ_MAX_STACK_ENTRY ]; // table where we house stack and register information
 			uint32_t	_resvFree;
-			bool		_inExit,vpad2[3];
+			bool		_inExit, vpad2[3];
+            avmplus::List<LIns*, avmplus::LIST_GCObjects> pending_lives;
 
 			void		asm_cmp(LIns *cond);
 #ifndef NJ_SOFTFLOAT
 			void		asm_fcmp(LIns *cond);
+            void        asm_setcc(Register res, LIns *cond);
+            NIns *      asm_jmpcc(bool brOnFalse, LIns *cond, NIns *target);
 #endif
 			void		asm_mmq(Register rd, int dd, Register rs, int ds);
             NIns*       asm_exit(LInsp guard);
 			NIns*		asm_leave_trace(LInsp guard);
             void        asm_qjoin(LIns *ins);
             void        asm_store32(LIns *val, int d, LIns *base);
             void        asm_store64(LIns *val, int d, LIns *base);
 			void		asm_restore(LInsp, Reservation*, Register);
-			void		asm_spill(LInsp i, Reservation *resv, bool pop);
+			void		asm_load(int d, Register r);
+			void		asm_spilli(LInsp i, Reservation *resv, bool pop);
+			void		asm_spill(Register rr, int d, bool pop=false, bool quad=false);
 			void		asm_load64(LInsp i);
 			void		asm_pusharg(LInsp p);
 			NIns*		asm_adjustBranch(NIns* at, NIns* target);
 			void		asm_quad(LInsp i);
 			bool		asm_qlo(LInsp ins, LInsp q);
 			void		asm_fneg(LInsp ins);
 			void		asm_fop(LInsp ins);
 			void		asm_i2f(LInsp ins);
 			void		asm_u2f(LInsp ins);
 			Register	asm_prep_fcall(Reservation *rR, LInsp ins);
 			void		asm_nongp_copy(Register r, Register s);
 			void		asm_bailout(LInsp guard, Register state);
 			void		asm_call(LInsp);
             void        asm_arg(ArgSize, LInsp, Register);
 			Register	asm_binop_rhs_reg(LInsp ins);
+			NIns*		asm_branch(bool branchOnFalse, LInsp cond, NIns* targ);
+            void        assignSavedParams();
+            void        reserveSavedParams();
+            void        handleLoopCarriedExprs();
 
 			// platform specific implementation (see NativeXXX.cpp file)
 			void		nInit(uint32_t flags);
 			void		nInit(AvmCore *);
 			Register	nRegisterAllocFromSet(int32_t set);
 			void		nRegisterResetAll(RegAlloc& a);
 			void		nMarkExecute(Page* page, int32_t count=1, bool enable=true);
 			void		nFrameRestore(RegisterMask rmask);
 			static void	nPatchBranch(NIns* branch, NIns* location);
 			void		nFragExit(LIns* guard);
 
 			// platform specific methods
         public:
+			const static Register savedRegs[NumSavedRegs];
 			DECLARE_PLATFORM_ASSEMBLER()
 
 		private:
 			debug_only( int32_t	_fpuStkDepth; )
 			debug_only( int32_t	_sv_fpuStkDepth; )
 
 			// since we generate backwards the depth is negative
 			inline void fpu_push() {
--- a/js/src/nanojit/Fragmento.cpp
+++ b/js/src/nanojit/Fragmento.cpp
@@ -34,16 +34,17 @@
  * decision by deleting the provisions above and replace them with the notice
  * and other provisions required by the GPL or the LGPL. If you do not delete
  * the provisions above, a recipient may use your version of this file under
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
 
 #include "nanojit.h"
+#undef MEMORY_INFO
 
 namespace nanojit
 {	
 	#ifdef FEATURE_NANOJIT
 
 	using namespace avmplus;
 
 	static uint32_t calcSaneCacheSize(uint32_t in)
@@ -53,26 +54,27 @@ namespace nanojit
 		return in;
 	}
 
 	/**
 	 * This is the main control center for creating and managing fragments.
 	 */
 	Fragmento::Fragmento(AvmCore* core, uint32_t cacheSizeLog2) 
 		: _allocList(core->GetGC()),
-			_max_pages(1 << (calcSaneCacheSize(cacheSizeLog2) - NJ_LOG2_PAGE_SIZE))
+			_max_pages(1 << (calcSaneCacheSize(cacheSizeLog2) - NJ_LOG2_PAGE_SIZE)),
+			_pagesGrowth(1)
 	{
 #ifdef MEMORY_INFO
 		_allocList.set_meminfo_name("Fragmento._allocList");
 #endif
+		NanoAssert(_max_pages > _pagesGrowth); // shrink growth if needed 
 		_core = core;
 		GC *gc = core->GetGC();
 		_frags = new (gc) FragmentMap(gc, 128);
 		_assm = new (gc) nanojit::Assembler(this);
-        _pageGrowth = 1;
 		verbose_only( enterCounts = new (gc) BlockHist(gc); )
 		verbose_only( mergeCounts = new (gc) BlockHist(gc); )
 	}
 
 	Fragmento::~Fragmento()
 	{
         AllocEntry *entry;
 
@@ -104,20 +106,20 @@ namespace nanojit
 		if (_stats.maxPageUse < pageUse)
 			_stats.maxPageUse = pageUse;
 	}
 
 	Page* Fragmento::pageAlloc()
 	{
         NanoAssert(sizeof(Page) == NJ_PAGE_SIZE);
 		if (!_pageList) {
-			pagesGrow(_pageGrowth);	// try to get more mem
-            if ((_pageGrowth << 1) < _max_pages)
-                _pageGrowth <<= 1;
-        }
+			pagesGrow(_pagesGrowth);	// try to get more mem
+			if ((_pagesGrowth << 1) < _max_pages)
+				_pagesGrowth <<= 1;
+		}
 		Page *page = _pageList;
 		if (page)
 		{
 			_pageList = page->next;
 			trackFree(-1);
 		}
 		//fprintf(stderr, "Fragmento::pageAlloc %X,  %d free pages of %d\n", (int)page, _stats.freePages, _stats.pages);
 		NanoAssert(pageCount()==_stats.freePages);
@@ -216,17 +218,17 @@ namespace nanojit
 		return _assm;
 	}
 
 	AvmCore* Fragmento::core()
 	{
 		return _core;
 	}
 
-	Fragment* Fragmento::newLoop(const void* ip)
+    Fragment* Fragmento::getAnchor(const void* ip)
 	{
         Fragment *f = newFrag(ip);
         Fragment *p = _frags->get(ip);
         if (p) {
             f->first = p;
             /* append at the end of the peer list */
             Fragment* next;
             while ((next = p->peer) != NULL)
@@ -475,17 +477,17 @@ namespace nanojit
 		_assm->_verbose = vsave;
 
 	}
 
 	void Fragmento::countBlock(BlockHist *hist, const void* ip)
 	{
 		int c = hist->count(ip);
 		if (_assm->_verbose)
-			_assm->outputf("++ %s %d", core()->interp.labels->format(ip), c);
+			_assm->outputf("++ %s %d", labels->format(ip), c);
 	}
 
 	void Fragmento::countIL(uint32_t il, uint32_t abc)
 	{
 		_stats.ilsize += il;
 		_stats.abcsize += abc;
 	}
 	
--- a/js/src/nanojit/Fragmento.h
+++ b/js/src/nanojit/Fragmento.h
@@ -49,17 +49,16 @@ extern void drawTraceTrees(Fragmento *fr
 namespace nanojit
 {
 	struct GuardRecord;
 	class Assembler;
 	
     struct PageHeader
     {
         struct Page *next;
-        verbose_only (int seq;) // sequence # of page
     };
     struct Page: public PageHeader
     {
         union {
             LIns lir[(NJ_PAGE_SIZE-sizeof(PageHeader))/sizeof(LIns)];
             NIns code[(NJ_PAGE_SIZE-sizeof(PageHeader))/sizeof(NIns)];
         };
     };
@@ -96,18 +95,18 @@ namespace nanojit
 			~Fragmento();
 
 			void		addMemory(void* firstPage, uint32_t pageCount);  // gives memory to the Assembler
 			Assembler*	assm();
 			AvmCore*	core();
 			Page*		pageAlloc();
 			void		pageFree(Page* page);
 			
-			Fragment*   newLoop(const void* ip);
             Fragment*   getLoop(const void* ip);
+            Fragment*   getAnchor(const void* ip);
 			void        clearFrags();	// clear all fragments from the cache
             Fragment*   getMerge(GuardRecord *lr, const void* ip);
             Fragment*   createBranch(GuardRecord *lr, const void* ip);
             Fragment*   newFrag(const void* ip);
             Fragment*   newBranch(Fragment *from, const void* ip);
 
             verbose_only ( uint32_t pageCount(); )
 			verbose_only ( void dumpStats(); )
@@ -140,23 +139,23 @@ namespace nanojit
 		private:
 			void		pagesGrow(int32_t count);
 			void		trackFree(int32_t delta);
 
 			AvmCore*			_core;
 			DWB(Assembler*)		_assm;
 			DWB(FragmentMap*)	_frags;		/* map from ip -> Fragment ptr  */
 			Page*			_pageList;
-            uint32_t        _pageGrowth;
 
 			/* unmanaged mem */
 			AllocList	_allocList;
 			GCHeap*		_gcHeap;
 
 			const uint32_t _max_pages;
+			uint32_t _pagesGrowth;
 	};
 
 	enum TraceKind {
 		LoopTrace,
 		BranchTrace,
 		MergeTrace
 	};
 	
@@ -231,23 +230,10 @@ namespace nanojit
 			void* vmprivate;
 			
 		private:
 			NIns*			_code;		// ptr to start of code
 			GuardRecord*	_links;		// code which is linked (or pending to be) to this fragment
 			int32_t			_hits;
 			Page*			_pages;		// native code pages 
 	};
-	
-#ifdef NJ_VERBOSE
-	inline int nbr(LInsp x) 
-	{
-        Page *p = x->page();
-        return (p->seq * NJ_PAGE_SIZE + (intptr_t(x)-intptr_t(p))) / sizeof(LIns);
-	}
-#else
-    inline int nbr(LInsp x)
-    {
-        return (int)(intptr_t(x) & intptr_t(NJ_PAGE_SIZE-1));
-    }
-#endif
 }
 #endif // __nanojit_Fragmento__
--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@@ -33,57 +33,62 @@
  * and other provisions required by the GPL or the LGPL. If you do not delete
  * the provisions above, a recipient may use your version of this file under
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
 
 #include "nanojit.h"
 #include <stdio.h>
+#include <ctype.h>
+
+#ifdef PERFM
+#include "../vprof/vprof.h"
+#endif /* PERFM */
 
 namespace nanojit
 {
     using namespace avmplus;
 	#ifdef FEATURE_NANOJIT
 
 	const uint8_t operandCount[] = {
-	/* 0 */		2, 2, /*trace*/0, /*nearskip*/0, /*skip*/0, /*neartramp*/0, /*tramp*/0, 2, 2, 2,
-	/* 10 */	/*param*/0, 2, 2, 2, 2, 2, 2, 2, /*call*/0, /*loop*/0,
-	/* 20 */	/*x*/0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	/* 0 */		/*trace*/0, /*nearskip*/0, /*skip*/0, /*neartramp*/0, /*tramp*/0, 2, 2, 2, 2, /*addp*/2, 
+	/* 10 */	/*param*/0, 2, 2, /*alloc*/0, 2, /*ret*/1, /*live*/1, /*calli*/0, /*call*/0, /*loop*/0,
+	/* 20 */	/*x*/0, 0, 1, 1, /*label*/0, 2, 2, 2, 2, 2,
 	/* 30 */	2, 2, /*short*/0, /*int*/0, 2, 2, /*neg*/1, 2, 2, 2,
 #if defined NANOJIT_64BIT
 	/* 40 */	/*callh*/0, 2, 2, 2, /*not*/1, 2, 2, 2, /*xt*/1, /*xf*/1,
 #else
 	/* 40 */	/*callh*/1, 2, 2, 2, /*not*/1, 2, 2, 2, /*xt*/1, /*xf*/1,
 #endif
 	/* 50 */	/*qlo*/1, /*qhi*/1, 2, /*ov*/1, /*cs*/1, 2, 2, 2, 2, 2,
-	/* 60 */	2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-	/* 70 */	2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-	/* 80 */	2, 2, /*fcall*/0, 2, 2, 2, 2, 2, 2, 2,
+	/* 60 */	2, 2, 2, 2, 2, /*file*/1, /*line*/1, 2, 2, 2,
+	/* 70 */	2, 2, 2, 2, 2, 2, 2, 2, 2, /*fret*/1,
+	/* 80 */	2, /*fcalli*/0, /*fcall*/0, 2, 2, 2, 2, 2, 2, 2,
 	/* 90 */	2, 2, 2, 2, 2, 2, 2, /*quad*/0, 2, 2,
 	/* 100 */	/*fneg*/1, 2, 2, 2, 2, 2, /*i2f*/1, /*u2f*/1, 2, 2,
 	/* 110 */	2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 	/* 120 */	2, 2, 2, 2, 2, 2, 2, 2, 
 	};
 
 	// LIR verbose specific
 	#ifdef NJ_VERBOSE
 
 	const char* lirNames[] = {
-	/* 0-9 */	"0","1","trace","nearskip","skip","neartramp","tramp","7","8","9",
-	/* 10-19 */	"param","st","ld","13","sti","15","16","17","call","loop",
-	/* 20-29 */ "x","21","22","23","24","25","feq","flt","fgt","fle",
+	/* 0-9 */	"start","nearskip","skip","neartramp","tramp","5","6","7","8","addp",
+	/* 10-19 */	"param","st","ld","alloc","sti","ret","live","calli","call","loop",
+	/* 20-29 */ "x","j","jt","jf","label","25","feq","flt","fgt","fle",
 	/* 30-39 */ "fge","cmov","short","int","ldc","","neg","add","sub","mul",
 	/* 40-49 */ "callh","and","or","xor","not","lsh","rsh","ush","xt","xf",
 	/* 50-59 */ "qlo","qhi","ldcb","ov","cs","eq","lt","gt","le","ge",
 	/* 60-63 */ "ult","ugt","ule","uge",
-	/* 64-69 */ "LIR64","65","66","67","68","69",
-	/* 70-79 */ "70","71","72","73","74","stq","ldq","77","stqi","79",
-	/* 80-89 */ "80","81","fcall","83","84","85","86","87","qiand","qiadd",
-	/* 90-99 */ "90","91","92","93","qcmov","95","96","quad","98","99",
+	/* 64-69 */ "LIR64","file","line","67","68","69",
+	/* 70-79 */ "70","71","72","73","74","stq","ldq","77","stqi","fret",
+	/* 80-89 */ "80","fcalli","fcall","83","84","85","86","87","88","89",
+	/* 90-99 */ "90","91","92","93","94","95","96","quad","ldqc","99",
 	/* 100-109 */ "fneg","fadd","fsub","fmul","fdiv","qjoin","i2f","u2f","qior","qilsh",
 	/* 110-119 */ "110","111","112","113","114","115","116","117","118","119",
 	/* 120-127 */ "120","121","122","123","124","125","126","127"
 	};
 
 	#endif /* NJ_VERBOSE */
 	
 	// implementation
@@ -93,26 +98,22 @@ namespace nanojit
 	#undef counter_value
 	#define counter_value(x)		x
 #endif /* NJ_PROFILE */
 
 	//static int32_t buffer_count = 0;
 	
 	// LCompressedBuffer
 	LirBuffer::LirBuffer(Fragmento* frago, const CallInfo* functions)
-		: _frago(frago), _functions(functions)
+		: _frago(frago), _functions(functions), abi(ABI_FASTCALL), _start(0)
 	{
-		_start = 0;
 		clear();
 		_start = pageAlloc();
 		if (_start)
-		{
-			verbose_only(_start->seq = 0;)
 			_unused = &_start->lir[0];
-		}
 		//buffer_count++;
 		//fprintf(stderr, "LirBuffer %x start %x\n", (int)this, (int)_start);
 	}
 
 	LirBuffer::~LirBuffer()
 	{
 		//buffer_count--;
 		//fprintf(stderr, "~LirBuffer %x start %x\n", (int)this, (int)_start);
@@ -153,25 +154,26 @@ namespace nanojit
 			count++;
 		}
 		NanoAssert(count == _stats.pages);
 		NanoAssert(_noMem || _unused->page()->next == 0);
 		NanoAssert(_noMem || samepage(last,_unused));
 	}
 	#endif 
 
-#ifdef NJ_VERBOSE
-	int LirBuffer::insCount() {
+	int32_t LirBuffer::insCount() 
+	{
+		// doesn't include embedded constants or the LIR_skip payload
 		return _stats.lir;
 	}
-	int LirBuffer::byteCount() {
-		return (_stats.pages-1) * (sizeof(Page)-sizeof(PageHeader)) +
-			(_unused - &_unused->page()->lir[0]) * sizeof(LIns);
+	int32_t LirBuffer::byteCount() 
+	{
+		return ((_stats.pages-1) * sizeof(Page)) +
+			((int32_t)_unused - (int32_t)pageTop(_unused));
 	}
-#endif
 
 	Page* LirBuffer::pageAlloc()
 	{
 		Page* page = _frago->pageAlloc();
 		if (page)
 		{
 			page->next = 0;	// end of list marker for new page
 			_stats.pages++;
@@ -194,17 +196,16 @@ namespace nanojit
 		LInsp last = _unused;
 		// we need to pull in a new page and stamp the old one with a link to it
         Page *lastPage = last->page();
 		Page *page = pageAlloc();
 		if (page)
 		{
 			lastPage->next = page;  // forward link to next page 
 			_unused = &page->lir[0];
-            verbose_only(page->seq = lastPage->seq+1;)
 			//fprintf(stderr, "Fragmento::ensureRoom stamping %x with %x; start %x unused %x\n", (int)pageBottom(last), (int)page, (int)_start, (int)_unused);
 			debug_only( validate(); )
 			return true;
 		} 
 		else {
 			// mem failure, rewind pointer to top of page so that subsequent instruction works
 			verbose_only(if (_frago->assm()->_verbose) _frago->assm()->outputf("page alloc failed");)
 			_unused = &lastPage->lir[0];
@@ -213,18 +214,18 @@ namespace nanojit
 	}
 	
 	bool LirBufWriter::ensureRoom(uint32_t count)
 	{
 		LInsp last = _buf->next();
 		if (!samepage(last,last+2*count)
 			&& _buf->addPage()) 
 		{
-			// link LIR stream back to prior instruction (careful insFar relies on _unused...)
-			insFar(LIR_skip, last-1);
+			// link LIR stream back to prior instruction (careful insLink relies on _unused...)
+			insLink(LIR_skip, last-1);
 		}
 		return !_buf->outOmem();
 	}
 
 	LInsp LirBuffer::commit(uint32_t count)
 	{
 		debug_only(validate();)
 		NanoAssertMsg( samepage(_unused, _unused+count), "You need to call ensureRoom first!" );
@@ -236,35 +237,36 @@ namespace nanojit
 		int delta = this-r-1;
 		NanoAssert(isU8(delta));
 		return delta;
 	}
 
     LIns* LIns::deref(int32_t off) const
     {
 		LInsp i = (LInsp) this-1 - off;
-        while (i->isTramp())
+        while (i && i->isTramp()) {
             i = i->ref();
+        }
 		return i;
     }
 
 	LInsp LirBufWriter::ensureReferenceable(LInsp i, int32_t addedDistance)
 	{
-		NanoAssert(!i->isTramp());
+		NanoAssert(i != 0 && !i->isTramp());
 		LInsp next = _buf->next();
 		LInsp from = next + 2*addedDistance;
 		if (canReference(from,i))
 			return i;
         if (i == _buf->sp && spref && canReference(from, spref))
             return spref;
         if (i == _buf->rp && rpref && canReference(from, rpref))
             return rpref;
 
 		// need a trampoline to get to i
-		LInsp tramp = insFar(LIR_tramp, i);
+		LInsp tramp = insLink(LIR_tramp, i);
 		NanoAssert( tramp->ref() == i );
 
         if (i == _buf->sp)
             spref = tramp;
         else if (i == _buf->rp)
             rpref = tramp;
 		return tramp;
 	}
@@ -306,50 +308,54 @@ namespace nanojit
 		_buf->commit(1);
 		_buf->_stats.lir++;
 		return l;
 	}
 
 	LInsp LirBufWriter::ins0(LOpcode op)
 	{
 		ensureRoom(1);
-		LInsp l = _buf->next();
+        LirBuffer *b = this->_buf;
+		LInsp l = b->next();
 		l->initOpcode(op);
-		_buf->commit(1);
-		_buf->_stats.lir++;
+		b->commit(1);
+		b->_stats.lir++;
+		if (op == LIR_start) {
+			// create params for saved regs -- processor specific
+			for (int i=0; i < NumSavedRegs; i++) {
+				insParam(i, 1);
+			}
+		}
 		return l;
 	}
 	
 	LInsp LirBufWriter::ins1(LOpcode op, LInsp o1)
 	{
 		ensureRoom(2);
 		LInsp r1 = ensureReferenceable(o1,1);
 
 		LInsp l = _buf->next();
 		l->initOpcode(op);
-		if (r1)
-			l->setOprnd1(r1);
+		l->setOprnd1(r1);
 
 		_buf->commit(1);
 		_buf->_stats.lir++;
 		return l;
 	}
 	
 	LInsp LirBufWriter::ins2(LOpcode op, LInsp o1, LInsp o2)
 	{
 		ensureRoom(3);
 		LInsp r1 = ensureReferenceable(o1,2);
-		LInsp r2 = ensureReferenceable(o2,1);
+        LInsp r2 = o2==o1 ? r1 : ensureReferenceable(o2,1);
 
 		LInsp l = _buf->next();
 		l->initOpcode(op);
-		if (r1)
-			l->setOprnd1(r1);
-		if (r2)
-			l->setOprnd2(r2);
+		l->setOprnd1(r1);
+		l->setOprnd2(r2);
 
 		_buf->commit(1);
 		_buf->_stats.lir++;
 		return l;
 	}
 
 	LInsp LirBufWriter::insLoad(LOpcode op, LInsp base, LInsp d)
 	{
@@ -358,55 +364,89 @@ namespace nanojit
 
 	LInsp LirBufWriter::insGuard(LOpcode op, LInsp c, SideExit *x)
 	{
 		LInsp data = skip(SideExitSize(x));
 		*((SideExit*)data->payload()) = *x;
 		return ins2(op, c, data);
 	}
 
-    LInsp LirBufWriter::insParam(int32_t arg)
+	LInsp LirBufWriter::insBranch(LOpcode op, LInsp condition, LInsp toLabel)
+	{
+		if (!toLabel)
+			toLabel = insFar(LIR_tramp,0); //empty tramp
+        if (!condition) {
+            // unconditional, just point to something
+            condition = toLabel;
+        }
+	    return ins2(op,condition,toLabel);
+	}
+
+    LInsp LirBufWriter::insAlloc(int32_t size)
     {
+        size = (size+3)>>2; // # of required 32bit words
+        NanoAssert(isU16(size));
 		ensureRoom(1);
 		LInsp l = _buf->next();
-		l->initOpcode(LIR_param);
-		l->c.imm8a = Assembler::argRegs[arg];
-
+		l->initOpcode(LIR_alloc);
+		l->i.imm16 = uint16_t(size);
 		_buf->commit(1);
 		_buf->_stats.lir++;
 		return l;
     }
+
+    LInsp LirBufWriter::insParam(int32_t arg, int32_t kind)
+    {
+		ensureRoom(1);
+        LirBuffer *b = this->_buf;
+		LInsp l = b->next();
+		l->initOpcode(LIR_param);
+        NanoAssert(isU8(arg) && isU8(kind));
+		l->c.imm8a = arg;
+        l->c.imm8b = kind;
+        if (kind) {
+            NanoAssert(arg < NumSavedRegs);
+            b->savedParams[arg] = l;
+        }
+		b->commit(1);
+		b->_stats.lir++;
+		return l;
+    }
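// The two param kinds: kind 0 is an incoming argument (arg indexes argRegs);
// kind 1 is a callee-saved register, recorded in savedParams[] so the
// assembler can re-pin it at loop edges (see assignSavedParams).  Usage
// sketch, with 'lirout' a hypothetical LirBufWriter*:
//
//   LIns *state  = lirout->insParam(0, 0);   // first incoming argument
//   LIns *saved0 = lirout->insParam(0, 1);   // first callee-saved register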
 	
 	LInsp LirBufWriter::insFar(LOpcode op, LInsp target)
 	{
-        NanoAssert(op == LIR_skip || op == LIR_tramp);
+		ensureRoom(2);
         LInsp l = _buf->next();
-        int d = target-l;
-        if (isS24(d)) {
-    		ensureRoom(1);
+
+		// write the pointer and operation
+		l = _buf->next()+1;
+		*((LInsp*)(l-1)) = target;
+		l->initOpcode(op);
+		_buf->commit(2);
+		_buf->_stats.lir++;
+		return l;
+	}
+	
+	LInsp LirBufWriter::insLink(LOpcode op, LInsp target)
+	{
+        NanoAssert(op == LIR_skip || op == LIR_tramp);
+		ensureRoom(2);  // must be before _buf->next() 		
+        LInsp l = _buf->next();
+        if (can24bReach(l,target))
+		{
             l->initOpcode(LOpcode(op-1)); // nearskip or neartramp
-            l->t.imm24 = d;
+            l->t.imm24 = target-l;
             _buf->commit(1);
-            return l;
+			_buf->_stats.lir++;
         }
-        else {
-            #if defined NANOJIT_64BIT
-            const unsigned int extra = 1;
-            #else
-            const unsigned int extra = 0;
-            #endif
-
-            ensureRoom(2 + extra);
-            // write the pointer and instruction
-            l = _buf->next()+1+extra;
-            *((LInsp*)(l-1-extra)) = target;
-            l->initOpcode(op);
-            _buf->commit(2+extra);
-		    return l;
-        }
+        else
+		{
+			l = insFar(op,target);
+		}
+		return l;
 	}
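// can24bReach is defined elsewhere; a plausible form, shown only to make the
// near/far split concrete (the near forms store a signed 24-bit delta in
// imm24, measured in LIns units):
//
//   static bool can24bReach(LIns *from, LIns *to) {
//       return isS24(to - from);
//   }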
 	
 	LInsp LirBufWriter::insImm(int32_t imm)
 	{
 		if (isS16(imm)) {
 			ensureRoom(1);
 			LInsp l = _buf->next();
 			l->initOpcode(LIR_short);
@@ -434,17 +474,17 @@ namespace nanojit
 	}
 
 	LInsp LirBufWriter::skip(size_t size)
 	{
         const uint32_t n = (size+sizeof(LIns)-1)/sizeof(LIns);
 		ensureRoom(n+2);
 		LInsp last = _buf->next()-1;
 		_buf->commit(n);
-		return insFar(LIR_skip, last);
+		return insLink(LIR_skip, last);
 	}
 
 	LInsp LirReader::read()	
 	{
 		LInsp cur = _i;
 		if (!cur)
 			return 0;
 		LIns* i = cur;
@@ -457,16 +497,18 @@ namespace nanojit
 					i--;
 					break;
 
 #if defined NANOJIT_64BIT
             	case LIR_callh:
 #endif
 				case LIR_call:
 				case LIR_fcall:
+                case LIR_calli:
+                case LIR_fcalli:
 					i -= i->callInsWords();
 					break;
 
 				case LIR_skip:
 				case LIR_nearskip:
 					NanoAssert(i->ref() != i);
 					i = i->ref();
 					break;
@@ -486,17 +528,17 @@ namespace nanojit
 					i -= 2;
 					break;
 
 				case LIR_quad:
 					NanoAssert(samepage(i, i-3));
 					i -= 3;
 					break;
 
-				case LIR_trace:
+				case LIR_start:
 					_i = 0;  // start of trace
 					return cur;
 			}
 			iop = i->opcode();
 		}
 		while (is_trace_skip_tramp(iop)||iop==LIR_2);
 		_i = i;
 		return cur;
@@ -504,56 +546,46 @@ namespace nanojit
 
 	bool FASTCALL isCmp(LOpcode c) {
 		return c >= LIR_eq && c <= LIR_uge || c >= LIR_feq && c <= LIR_fge;
 	}
     
 	bool FASTCALL isCond(LOpcode c) {
 		return (c == LIR_ov) || (c == LIR_cs) || isCmp(c);
 	}
+
+    bool FASTCALL isFloat(LOpcode c) {
+        switch (c) {
+            default:
+                return false;
+            case LIR_fadd:
+            case LIR_fsub:
+            case LIR_fmul:
+            case LIR_fdiv:
+            case LIR_fneg:
+            case LIR_fcall:
+            case LIR_fcalli:
+            case LIR_i2f:
+            case LIR_u2f:
+                return true;
+        }
+    }
     
 	bool LIns::isCmp() const {
 		return nanojit::isCmp(u.code);
 	}
 
     bool LIns::isCond() const {
         return nanojit::isCond(u.code);
     }
 	
 	bool LIns::isQuad() const {
 		return ((u.code & LIR64) != 0 || u.code == LIR_callh);
 	}
     
-	bool LIns::isCall() const
-	{
-		return ((u.code&~LIR64) == LIR_call
-				|| (u.code == LIR_callh));
-	}
-
-	bool LIns::isGuard() const
-	{
-		return u.code==LIR_x || u.code==LIR_xf || u.code==LIR_xt || u.code==LIR_loop;
-	}
-
-    bool LIns::isStore() const
-    {
-		int c = u.code & ~LIR64;
-        return c == LIR_st || c == LIR_sti;
-    }
-
-    bool LIns::isLoad() const
-    {
-        return u.code == LIR_ldq || u.code == LIR_ld || u.code == LIR_ldc;
-    }
-
-	bool LIns::isconst() const
-	{
-		return (opcode()&~1) == LIR_short;
-	}
-
 	bool LIns::isconstval(int32_t val) const
 	{
 		return isconst() && constval()==val;
 	}
 
 	bool LIns::isconstq() const
 	{	
 		return isop(LIR_quad);
@@ -579,16 +611,22 @@ namespace nanojit
     }
 
 	void LIns::setimm16(int32_t x)
 	{
 		NanoAssert(isS16(x));
 		i.imm16 = int16_t(x);
 	}
 
+	void LIns::setimm24(int32_t x)
+	{
+		NanoAssert(isS24(x));
+		t.imm24 = x;
+	}
+
 	void LIns::setresv(uint32_t resv)
 	{
 		NanoAssert(isU8(resv));
 		g.resv = resv;
 	}
 
 	void LIns::initOpcode(LOpcode op)
 	{
@@ -612,16 +650,38 @@ namespace nanojit
 		u.oprnd_3 = reference(r);
 	}
 
     void LIns::setDisp(int8_t d)
     {
         sti.disp = d;
     }
 
+    LIns **LIns::targetAddr() {
+		NanoAssert(isBranch());
+		LInsp i = (LInsp) this-1 - u.oprnd_2;
+        NanoAssert(i->isTramp());
+        LInsp ref;
+        while ((ref=i->ref()) != 0 && ref->isTramp())
+            i = ref;
+		NanoAssert(i->isop(LIR_tramp));
+		return (LIns**)(i-1);
+    }
+
+    void LIns::target(LInsp label) {
+        NanoAssert(label && label->isop(LIR_label));
+        *(targetAddr()) = label;
+	}
+
+	LInsp LIns::getTarget()
+	{
+        NanoAssert(isBranch());
+        return oprnd2();
+	}
+
 	LInsp	LIns::oprnd1() const	
 	{
         return deref(u.oprnd_1);
 	}
 	
 	LInsp	LIns::oprnd2() const
 	{ 
         return deref(u.oprnd_2);
@@ -668,19 +728,36 @@ namespace nanojit
 				return i->oprnd1();
 		}
 		else if (v == LIR_qhi) {
 			if (i->isconstq())
 				return insImm(int32_t(i->constvalq()>>32));
 			if (i->isop(LIR_qjoin))
 				return i->oprnd2();
 		}
+		else if (i->isconst()) {
+			int32_t c = i->constval();
+			if (v == LIR_neg)
+				return insImm(-c);
+			if (v == LIR_not)
+				return insImm(~c);
+		}
 		else if (v == i->opcode() && (v == LIR_not || v == LIR_neg || v == LIR_fneg)) {
+            // not(not(x)) = x;  neg(neg(x)) = x;  fneg(fneg(x)) = x;
 			return i->oprnd1();
 		}
+        /* [ed 8.27.08] this causes a big slowdown in gameoflife.as.  why?
+        else if (i->isconst()) {
+            if (v == LIR_i2f) {
+                return insImmf(i->constval());
+            }
+            else if (v == LIR_u2f) {
+                return insImmf((uint32_t)i->constval());
+            }
+        }*/
 
 		// todo
 		// -(a-b) = b-a
 
 		return out->ins1(v, i);
 	}
 
 	LIns* ExprFilter::ins2(LOpcode v, LIns* oprnd1, LIns* oprnd2)
@@ -749,31 +826,31 @@ namespace nanojit
             if (v == LIR_and)
                 return insImm(uint32_t(c1) & int32_t(c2));
             if (v == LIR_xor)
                 return insImm(uint32_t(c1) ^ int32_t(c2));
 		}
 		else if (oprnd1->isconstq() && oprnd2->isconstq())
 		{
 			double c1 = oprnd1->constvalf();
-			double c2 = oprnd1->constvalf();
+			double c2 = oprnd2->constvalf();
 			if (v == LIR_feq)
 				return insImm(c1 == c2);
 			if (v == LIR_flt)
 				return insImm(c1 < c2);
 			if (v == LIR_fgt)
 				return insImm(c1 > c2);
 			if (v == LIR_fle)
 				return insImm(c1 <= c2);
 			if (v == LIR_fge)
 				return insImm(c1 >= c2);
 		}
 		else if (oprnd1->isconst() && !oprnd2->isconst())
 		{
-			if (v == LIR_add || v == LIR_mul ||
+			if (v == LIR_add || v == LIR_addp || v == LIR_mul ||
 				v == LIR_fadd || v == LIR_fmul ||
 				v == LIR_xor || v == LIR_or || v == LIR_and ||
 				v == LIR_eq) {
 				// move const to rhs
 				LIns* t = oprnd2;
 				oprnd2 = oprnd1;
 				oprnd1 = t;
 			}
@@ -819,17 +896,17 @@ namespace nanojit
 						if (a_lt == b_lt)
 							return insImm(a_lt);
 					}
 				}
 			}
 
 			if (c == 0)
 			{
-				if (v == LIR_add || v == LIR_or || v == LIR_xor ||
+				if (v == LIR_add || v == LIR_addp || v == LIR_or || v == LIR_xor ||
 					v == LIR_sub || v == LIR_lsh || v == LIR_rsh || v == LIR_ush)
 					return oprnd1;
 				else if (v == LIR_and || v == LIR_mul)
 					return oprnd2;
 				else if (v == LIR_eq && oprnd1->isop(LIR_or) && 
 					oprnd1->oprnd2()->isconst() &&
 					oprnd1->oprnd2()->constval() != 0) {
 					// (x or c) != 0 if c != 0
@@ -880,31 +957,59 @@ namespace nanojit
 				    v = LOpcode(v^1);
 				    c = c->oprnd1();
 				}
 			}
 		}
 		return out->insGuard(v, c, x);
 	}
 
+    LIns* ExprFilter::insBranch(LOpcode v, LIns *c, LIns *t)
+    {
+        if (v == LIR_jt || v == LIR_jf) {
+            while (c->isop(LIR_eq) && c->oprnd1()->isCmp() && c->oprnd2()->isconstval(0)) {
+                // jt(eq(cmp,0)) => jf(cmp)   or   jf(eq(cmp,0)) => jt(cmp)
+                v = LOpcode(v ^ 1);
+                c = c->oprnd1();
+            }
+        }
+        return out->insBranch(v, c, t);
+    }
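// Example of the rewrite: a front end that negates a test by comparing it to
// zero emits jt(eq(lt(a,b), 0), L); the filter folds the eq-with-0 into the
// branch sense and emits jf(lt(a,b), L), so the backend can branch directly
// off the compare flags.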
+
     LIns* LirWriter::insLoadi(LIns *base, int disp) 
     { 
         return insLoad(LIR_ld,base,disp);
     }
 
 	LIns* LirWriter::insLoad(LOpcode op, LIns *base, int disp)
 	{
 		return insLoad(op, base, insImm(disp));
 	}
 
+    LIns* LirWriter::store(LInsp value, LInsp base, int32_t d)
+    {
+		return isS8(d) ? insStorei(value, base, d)
+			: insStore(value, base, insImm(d));
+    }
+
 	LIns* LirWriter::ins_eq0(LIns* oprnd1)
 	{
 		return ins2i(LIR_eq, oprnd1, 0);
 	}
 
+    LIns* LirWriter::insImmf(double f)
+    {
+        union {
+            double f;
+            uint64_t q;
+        } u;
+        u.f = f;
+        return insImmq(u.q);
+    }
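// The union converts the double to its IEEE-754 bit pattern without the
// undefined behaviour of pointer casting; e.g. insImmf(1.0) emits a LIR_quad
// holding 0x3ff0000000000000.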
+
 	LIns* LirWriter::qjoin(LInsp lo, LInsp hi)
 	{
 		return ins2(LIR_qjoin, lo, hi);
 	}
 
 	LIns* LirWriter::insImmPtr(const void *ptr)
 	{
 		return sizeof(ptr) == 8 ? insImmq((uintptr_t)ptr) : insImm((intptr_t)ptr);
@@ -932,44 +1037,46 @@ namespace nanojit
 		return ins2(LIR_or, 
 					ins2(LIR_and, iftrue, ncond), 
 					ins2(LIR_and, iffalse, ins1(LIR_not, ncond)));
 	}
 
     LIns* LirBufWriter::insCall(const CallInfo *ci, LInsp args[])
 	{
 		static const LOpcode k_callmap[] = { LIR_call, LIR_fcall, LIR_call, LIR_callh };
+		static const LOpcode k_callimap[] = { LIR_calli, LIR_fcalli, LIR_calli, LIR_skip };
 
 		uint32_t argt = ci->_argtypes;
-		LOpcode op = k_callmap[argt & 3];
+        LOpcode op = (ci->isIndirect() ? k_callimap : k_callmap)[argt & 3];
+        NanoAssert(op != LIR_skip); // LIR_skip here is just an error condition
 
-        ArgSize sizes[10];
+        ArgSize sizes[2*MAXARGS];
         uint32_t argc = ci->get_sizes(sizes);
 
 #ifdef NJ_SOFTFLOAT
 		if (op == LIR_fcall)
 			op = LIR_callh;
-		LInsp args2[5*2]; // arm could require 2 args per double
+		LInsp args2[MAXARGS*2]; // arm could require 2 args per double
 		int32_t j = 0;
-		for (int32_t i = 0; i < 5; i++) {
+		for (int32_t i = 0; i < MAXARGS; i++) {
 			argt >>= 2;
 			ArgSize a = ArgSize(argt&3);
 			if (a == ARGSIZE_F) {
 				LInsp q = args[i];
 				args2[j++] = ins1(LIR_qhi, q);
 				args2[j++] = ins1(LIR_qlo, q);
 			} else if (a != ARGSIZE_NONE) {
 				args2[j++] = args[i];
 			}
 		}
 		args = args2;
         NanoAssert(j == argc);
 #endif
 
-		NanoAssert(argc < 8);
+		NanoAssert(argc <= MAXARGS);
 		uint32_t words = argwords(argc);
 		ensureRoom(words+LIns::callInfoWords+1+argc);  // ins size + possible tramps
 		for (uint32_t i=0; i < argc; i++)
 			args[i] = ensureReferenceable(args[i], argc-i);
 		uint8_t* offs = (uint8_t*)_buf->next();
 		LIns *l = _buf->next() + words;
 		*(const CallInfo **)l = ci;
 		l += LIns::callInfoWords;
@@ -984,18 +1091,18 @@ namespace nanojit
         l->c.imm8b = argc;
 		_buf->commit(words+LIns::callInfoWords+1);
 		_buf->_stats.lir++;
 		return l;
 	}
 
     using namespace avmplus;
 
-	StackFilter::StackFilter(LirFilter *in, GC *gc, Fragment *frag, LInsp sp) 
-		: LirFilter(in), gc(gc), frag(frag), sp(sp), top(0)
+	StackFilter::StackFilter(LirFilter *in, GC *gc, LirBuffer *lirbuf, LInsp sp) 
+		: LirFilter(in), gc(gc), lirbuf(lirbuf), sp(sp), top(0)
 	{}
 
 	LInsp StackFilter::read() 
 	{
 		for (;;) 
 		{
 			LInsp i = in->read();
 			if (!i)
@@ -1025,16 +1132,21 @@ namespace nanojit
 							if (stk.get(d))
 								continue;
 							else
 								stk.set(gc, d);
 						}
 					}
 				}
 			}
+			/* 
+			 * NB: If there is a backward branch other than the loop-restart branch, this is
+			 * going to be wrong. Unfortunately there doesn't seem to be an easy way to detect
+			 * such branches. Just do not create any.
+			 */
 			else if (i->isGuard())
 			{
 				stk.reset();
 				top = getTop(i) >> 2;
 			}
 			return i;
 		}
 	}
@@ -1081,23 +1193,29 @@ namespace nanojit
 		hash ^= hash << 4;
 		hash += hash >> 17;
 		hash ^= hash << 25;
 		hash += hash >> 6;
 		return hash;
 	}
 
 	LInsHashSet::LInsHashSet(GC* gc) : 
-			m_list(gc, kInitialCap), m_used(0), m_gc(gc)
+			m_used(0), m_cap(kInitialCap), m_gc(gc)
 	{
 #ifdef MEMORY_INFO
-		m_list.set_meminfo_name("LInsHashSet.list");
+//		m_list.set_meminfo_name("LInsHashSet.list");
 #endif
-		m_list.set(kInitialCap-1, 0);
+        LInsp *list = (LInsp*) gc->Alloc(sizeof(LInsp)*m_cap);
+        WB(gc, this, &m_list, list);
 	}
+
+    void LInsHashSet::clear() {
+        memset(m_list, 0, sizeof(LInsp)*m_cap);
+        m_used = 0;
+    }
 	
 	/*static*/ uint32_t FASTCALL LInsHashSet::hashcode(LInsp i)
 	{
 		const LOpcode op = i->opcode();
 		switch (op)
 		{
 			case LIR_short:
 				return hashimm(i->imm16());
@@ -1168,68 +1286,68 @@ namespace nanojit
 					return false;
 				return true;
 			}
 		}
 	}
 
 	void FASTCALL LInsHashSet::grow()
 	{
-		const uint32_t newcap = m_list.size() << 1;
-		InsList newlist(m_gc, newcap);
+		const uint32_t newcap = m_cap << 1;
+        LInsp *newlist = (LInsp*) m_gc->Alloc(newcap * sizeof(LInsp));
+        LInsp *list = m_list;
 #ifdef MEMORY_INFO
-		newlist.set_meminfo_name("LInsHashSet.list");
+//		newlist.set_meminfo_name("LInsHashSet.list");
 #endif
-		newlist.set(newcap-1, 0);
-		for (uint32_t i=0, n=m_list.size(); i < n; i++)
-		{
-			LInsp name = m_list.get(i);
+		for (uint32_t i=0, n=m_cap; i < n; i++) {
+			LInsp name = list[i];
 			if (!name) continue;
 			uint32_t j = find(name, hashcode(name), newlist, newcap);
-			newlist.set(j, name);
+            newlist[j] = name;
 		}
-		m_list.become(newlist);
+        m_cap = newcap;
+        WB(m_gc, this, &m_list, newlist);
 	}
 
-	uint32_t FASTCALL LInsHashSet::find(LInsp name, uint32_t hash, const InsList& list, uint32_t cap)
+	uint32_t FASTCALL LInsHashSet::find(LInsp name, uint32_t hash, const LInsp *list, uint32_t cap)
 	{
 		const uint32_t bitmask = (cap - 1) & ~0x1;
 
 		uint32_t n = 7 << 1;
 		hash &= bitmask;  
 		LInsp k;
-		while ((k = list.get(hash)) != NULL &&
+		while ((k = list[hash]) != NULL &&
 			(!LIns::sameop(k,name) || !equals(k, name)))
 		{
 			hash = (hash + (n += 2)) & bitmask;		// quadratic probe
 		}
 		return hash;
 	}
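The same probe discipline reduced to a toy table of ints, as an experimentation sketch (capacity must be a power of two, as kInitialCap and grow() guarantee; 0 marks an empty slot):

    #include <stdint.h>
    static uint32_t find_slot(const int* table, uint32_t cap, uint32_t hash, int key) {
        const uint32_t bitmask = (cap - 1) & ~0x1;
        uint32_t n = 7 << 1;
        hash &= bitmask;
        while (table[hash] != 0 && table[hash] != key)
            hash = (hash + (n += 2)) & bitmask;   // offsets 16, 18, 20, ...
        return hash;   // slot holding key, or the empty slot to claim
    }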
 
 	LInsp LInsHashSet::add(LInsp name, uint32_t k)
 	{
 		// this is relatively short-lived so let's try a more aggressive load factor
 		// in the interest of improving performance
-		if (((m_used+1)<<1) >= m_list.size()) // 0.50
+		if (((m_used+1)<<1) >= m_cap) // 0.50
 		{
 			grow();
-			k = find(name, hashcode(name), m_list, m_list.size());
+			k = find(name, hashcode(name), m_list, m_cap);
 		}
-		NanoAssert(!m_list.get(k));
+		NanoAssert(!m_list[k]);
 		m_used++;
-		m_list.set(k, name);
-		return name;
+        return m_list[k] = name;
 	}
 
 	void LInsHashSet::replace(LInsp i)
 	{
-		uint32_t k = find(i, hashcode(i), m_list, m_list.size());
-		if (m_list.get(k)) {
+        LInsp *list = m_list;
+		uint32_t k = find(i, hashcode(i), list, m_cap);
+		if (list[k]) {
 			// already there, so replace it
-			m_list.set(k, i);
+			list[k] = i;
 		} else {
 			add(i, k);
 		}
 	}
 
 	uint32_t LInsHashSet::hashimm(int32_t a) {
 		return _hashfinish(_hash32(0,a));
 	}
@@ -1254,74 +1372,74 @@ namespace nanojit
 		uint32_t hash = _hashptr(0, ci);
 		for (int32_t j=argc-1; j >= 0; j--)
 			hash = _hashptr(hash,args[j]);
 		return _hashfinish(hash);
 	}
 
 	LInsp LInsHashSet::find32(int32_t a, uint32_t &i)
 	{
-		uint32_t cap = m_list.size();
-		const InsList& list = m_list;
+		uint32_t cap = m_cap;
+		const LInsp *list = m_list;
 		const uint32_t bitmask = (cap - 1) & ~0x1;
 		uint32_t hash = hashimm(a) & bitmask;
 		uint32_t n = 7 << 1;
 		LInsp k;
-		while ((k = list.get(hash)) != NULL && 
+		while ((k = list[hash]) != NULL && 
 			(!k->isconst() || k->constval() != a))
 		{
 			hash = (hash + (n += 2)) & bitmask;		// quadratic probe
 		}
 		i = hash;
 		return k;
 	}
 
 	LInsp LInsHashSet::find64(uint64_t a, uint32_t &i)
 	{
-		uint32_t cap = m_list.size();
-		const InsList& list = m_list;
+		uint32_t cap = m_cap;
+		const LInsp *list = m_list;
 		const uint32_t bitmask = (cap - 1) & ~0x1;
 		uint32_t hash = hashimmq(a) & bitmask;  
 		uint32_t n = 7 << 1;
 		LInsp k;
-		while ((k = list.get(hash)) != NULL && 
+		while ((k = list[hash]) != NULL && 
 			(!k->isconstq() || k->constvalq() != a))
 		{
 			hash = (hash + (n += 2)) & bitmask;		// quadratic probe
 		}
 		i = hash;
 		return k;
 	}
 
 	LInsp LInsHashSet::find1(LOpcode op, LInsp a, uint32_t &i)
 	{
-		uint32_t cap = m_list.size();
-		const InsList& list = m_list;
+		uint32_t cap = m_cap;
+		const LInsp *list = m_list;
 		const uint32_t bitmask = (cap - 1) & ~0x1;
 		uint32_t hash = hash1(op,a) & bitmask;  
 		uint32_t n = 7 << 1;
 		LInsp k;
-		while ((k = list.get(hash)) != NULL && 
+		while ((k = list[hash]) != NULL && 
 			(k->opcode() != op || k->oprnd1() != a))
 		{
 			hash = (hash + (n += 2)) & bitmask;		// quadratic probe
 		}
 		i = hash;
 		return k;
 	}
 
 	LInsp LInsHashSet::find2(LOpcode op, LInsp a, LInsp b, uint32_t &i)
 	{
-		uint32_t cap = m_list.size();
-		const InsList& list = m_list;
+		uint32_t cap = m_cap;
+		const LInsp *list = m_list;
 		const uint32_t bitmask = (cap - 1) & ~0x1;
 		uint32_t hash = hash2(op,a,b) & bitmask;  
 		uint32_t n = 7 << 1;
 		LInsp k;
-		while ((k = list.get(hash)) != NULL && 
+		while ((k = list[hash]) != NULL && 
 			(k->opcode() != op || k->oprnd1() != a || k->oprnd2() != b))
 		{
 			hash = (hash + (n += 2)) & bitmask;		// quadratic probe
 		}
 		i = hash;
 		return k;
 	}
 
@@ -1330,23 +1448,23 @@ namespace nanojit
 		for (uint32_t j=0; j < argc; j++)
 			if (i->arg(j) != args[j])
 				return false;
 		return true;
 	}
 
 	LInsp LInsHashSet::findcall(const CallInfo *ci, uint32_t argc, LInsp args[], uint32_t &i)
 	{
-		uint32_t cap = m_list.size();
-		const InsList& list = m_list;
+		uint32_t cap = m_cap;
+		const LInsp *list = m_list;
 		const uint32_t bitmask = (cap - 1) & ~0x1;
 		uint32_t hash = hashcall(ci, argc, args) & bitmask;  
 		uint32_t n = 7 << 1;
 		LInsp k;
-		while ((k = list.get(hash)) != NULL &&
+		while ((k = list[hash]) != NULL &&
 			(!k->isCall() || k->callInfo() != ci || !argsmatch(k, argc, args)))
 		{
 			hash = (hash + (n += 2)) & bitmask;		// quadratic probe
 		}
 		i = hash;
 		return k;
 	}
 
@@ -1399,36 +1517,34 @@ namespace nanojit
             live.remove(i);
             retired.add(e);
 		}
 		bool contains(LInsp i) {
 			return live.containsKey(i);
 		}
 	};
 
-    void live(GC *gc, Assembler *assm, Fragment *frag)
+    void live(GC *gc, LirBuffer *lirbuf)
 	{
 		// traverse backwards to find live exprs and a few other stats.
 
-		LInsp sp = frag->lirbuf->sp;
-		LInsp rp = frag->lirbuf->rp;
 		LiveTable live(gc);
 		uint32_t exits = 0;
-		LirBuffer *lirbuf = frag->lirbuf;
         LirReader br(lirbuf);
-		StackFilter sf(&br, gc, frag, sp);
-		StackFilter r(&sf, gc, frag, rp);
-        int total = 0;
-        live.add(frag->lirbuf->state, r.pos());
+		StackFilter sf(&br, gc, lirbuf, lirbuf->sp);
+		StackFilter r(&sf, gc, lirbuf, lirbuf->rp);
+		int total = 0;
+        if (lirbuf->state)
+            live.add(lirbuf->state, r.pos());
 		for (LInsp i = r.read(); i != 0; i = r.read())
 		{
             total++;
 
             // first handle side-effect instructions
-			if (i->isStore() || i->isGuard() || i->isCall() && !i->callInfo()->_cse)
+			if (!i->isCse(lirbuf->_functions))
 			{
 				live.add(i,0);
                 if (i->isGuard())
                     exits++;
 			}
 
 			// now propagate liveness
 			if (live.contains(i))
@@ -1453,36 +1569,43 @@ namespace nanojit
 				}
 				else if (i->isCall()) {
 					for (int j=0, c=i->argc(); j < c; j++)
 						live.add(i->arg(j),i);
 				}
 			}
 		}
  
-		assm->outputf("live instruction count %ld, total %ld, max pressure %d",
+		printf("live instruction count %d, total %u, max pressure %d\n",
 			live.retired.size(), total, live.maxlive);
-        assm->outputf("side exits %ld", exits);
+        printf("side exits %u\n", exits);
 
 		// print live exprs, going forwards
-		LirNameMap *names = frag->lirbuf->names;
+		LirNameMap *names = lirbuf->names;
+        bool newblock = true;
 		for (int j=live.retired.size()-1; j >= 0; j--) 
         {
             RetiredEntry *e = live.retired[j];
-            char livebuf[1000], *s=livebuf;
+            char livebuf[4000], *s=livebuf;
             *s = 0;
+            if (!newblock && e->i->isop(LIR_label)) {
+                printf("\n");
+            }
+            newblock = false;
             for (int k=0,n=e->live.size(); k < n; k++) {
 				strcpy(s, names->formatRef(e->live[k]));
 				s += strlen(s);
 				*s++ = ' '; *s = 0;
 				NanoAssert(s < livebuf+sizeof(livebuf));
             }
 			printf("%-60s %s\n", livebuf, names->formatIns(e->i));
-			if (e->i->isGuard())
+            if (e->i->isGuard() || e->i->isBranch() || isRet(e->i->opcode())) {
 				printf("\n");
+                newblock = true;
+            }
 		}
 	}
 
     LabelMap::Entry::~Entry()
     {
     }
 
     LirNameMap::Entry::~Entry()
@@ -1511,17 +1634,22 @@ namespace nanojit
         Stringp new_name = labels->core->newString(name);
         if (!addName(i, new_name)) {
             labels->core->freeString(new_name);
         }
 	}
 
 	void LirNameMap::copyName(LInsp i, const char *s, int suffix) {
 		char s2[200];
-		sprintf(s2,"%s%d", s,suffix);
+		if (isdigit(s[strlen(s)-1])) {
+			// if s ends with a digit, add '_' to clarify the suffix
+			sprintf(s2,"%s_%d", s, suffix);
+		} else {
+			sprintf(s2,"%s%d", s, suffix);
+		}
 		addName(i, labels->core->newString(s2));
 	}
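A quick instance of the disambiguation above: copying a name that ends in a digit, say "s4" with suffix 2, now formats as "s4_2" rather than the ambiguous "s42".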
 
 	void LirNameMap::formatImm(int32_t c, char *buf) {
 		if (c >= 10000 || c <= -10000)
 			sprintf(buf,"#%s",labels->format((void*)c));
         else
             sprintf(buf,"%d", c);
@@ -1570,81 +1698,130 @@ namespace nanojit
 		}
 		return labels->dup(buffer);
 	}
 
 	const char* LirNameMap::formatIns(LIns* i)
 	{
 		char sbuf[200];
 		char *s = sbuf;
-		if (!i->isStore() && !i->isGuard() && !i->isop(LIR_trace)) {
-			sprintf(s, "%s = ", formatRef(i));
-			s += strlen(s);
-		}
-
 		LOpcode op = i->opcode();
 		switch(op)
 		{
 			case LIR_short:
 			case LIR_int:
 			{
                 sprintf(s, "%s", formatRef(i));
 				break;
 			}
 
+            case LIR_alloc: {
+                sprintf(s, "%s = %s %d", formatRef(i), lirNames[op], i->size());
+                break;
+            }
+
 			case LIR_quad:
 			{
 				int32_t *p = (int32_t*) (i-2);
-				sprintf(s, "#%X:%X", p[1], p[0]);
+				sprintf(s, "#%X:%X /* %g */", p[1], p[0], i->constvalf());
 				break;
 			}
 
 			case LIR_loop:
-			case LIR_trace:
+			case LIR_start:
 				sprintf(s, "%s", lirNames[op]);
 				break;
 
-#if defined NANOJIT_64BIT
-			case LIR_callh:
-#endif
 			case LIR_fcall:
 			case LIR_call: {
-				sprintf(s, "%s ( ", i->callInfo()->_name);
+				sprintf(s, "%s = %s ( ", formatRef(i), i->callInfo()->_name);
 				for (int32_t j=i->argc()-1; j >= 0; j--) {
 					s += strlen(s);
 					sprintf(s, "%s ",formatRef(i->arg(j)));
 				}
 				s += strlen(s);
 				sprintf(s, ")");
 				break;
 			}
+			case LIR_fcalli:
+			case LIR_calli: {
+                int32_t argc = i->argc();
+				sprintf(s, "%s = [%s] ( ", formatRef(i), formatRef(i->arg(argc-1)));
+                s += strlen(s);
+                argc--;
+				for (int32_t j=argc-1; j >= 0; j--) {
+					s += strlen(s);
+					sprintf(s, "%s ",formatRef(i->arg(j)));
+				}
+				s += strlen(s);
+				sprintf(s, ")");
+				break;
+			}
 
-			case LIR_param:
-                sprintf(s, "%s %s", lirNames[op], gpn(i->imm8()));
+			case LIR_param: { 
+				uint32_t arg = i->imm8();
+				if (!i->imm8b()) {
+					if (arg < sizeof(Assembler::argRegs)/sizeof(Assembler::argRegs[0])) {
+						sprintf(s, "%s = %s %d %s", formatRef(i), lirNames[op],
+							arg, gpn(Assembler::argRegs[arg]));
+					} else {
+						sprintf(s, "%s = %s %d", formatRef(i), lirNames[op], arg);
+					}
+				} else {
+					sprintf(s, "%s = %s %d %s", formatRef(i), lirNames[op],
+						arg, gpn(Assembler::savedRegs[arg]));
+				}
+				break;
+			}
+
+			case LIR_label:
+                sprintf(s, "%s:", formatRef(i));
 				break;
 
+			case LIR_jt:
+			case LIR_jf:
+                sprintf(s, "%s %s -> %s", lirNames[op], formatRef(i->oprnd1()), 
+                    i->oprnd2() ? formatRef(i->oprnd2()) : "unpatched");
+				break;
+
+			case LIR_j:
+                sprintf(s, "%s -> %s", lirNames[op], 
+                    i->oprnd2() ? formatRef(i->oprnd2()) : "unpatched");
+				break;
+
+            case LIR_live:
+			case LIR_ret:
+            case LIR_fret:
+                sprintf(s, "%s %s", lirNames[op], formatRef(i->oprnd1()));
+				break;
+				
+            case LIR_callh:
 			case LIR_neg:
 			case LIR_fneg:
 			case LIR_i2f:
 			case LIR_u2f:
 			case LIR_qlo:
 			case LIR_qhi:
             case LIR_ov:
             case LIR_cs:
 			case LIR_not: 
-				sprintf(s, "%s %s", lirNames[op], formatRef(i->oprnd1()));
+				sprintf(s, "%s = %s %s", formatRef(i), lirNames[op], formatRef(i->oprnd1()));
 				break;
 
 			case LIR_x:
 			case LIR_xt:
 			case LIR_xf:
 				formatGuard(i, s);
 				break;
 
 			case LIR_add:
+			case LIR_addp:
 			case LIR_sub: 
 		 	case LIR_mul: 
 			case LIR_fadd:
 			case LIR_fsub: 
 		 	case LIR_fmul: 
 			case LIR_fdiv: 
 			case LIR_and: 
 			case LIR_or: 
@@ -1665,49 +1842,50 @@ namespace nanojit
 			case LIR_flt:
 			case LIR_fle:
 			case LIR_fgt:
 			case LIR_fge:
             case LIR_qiadd:
             case LIR_qiand:
             case LIR_qilsh:
             case LIR_qior:
-				sprintf(s, "%s %s, %s", lirNames[op],
+				sprintf(s, "%s = %s %s, %s", formatRef(i), lirNames[op],
 					formatRef(i->oprnd1()), 
 					formatRef(i->oprnd2()));
 				break;
 
 			case LIR_qjoin:
 				sprintf(s, "%s (%s), %s", lirNames[op],
 					formatIns(i->oprnd1()), 
  					formatRef(i->oprnd2()));
  				break;
 
 			case LIR_qcmov:
 			case LIR_cmov:
-                sprintf(s, "%s ? %s : %s", 
+                sprintf(s, "%s = %s %s ? %s : %s", formatRef(i), lirNames[op],
 					formatRef(i->oprnd1()), 
 					formatRef(i->oprnd2()->oprnd1()), 
 					formatRef(i->oprnd2()->oprnd2()));
 				break;
 
 			case LIR_ld: 
 			case LIR_ldc: 
 			case LIR_ldq: 
+			case LIR_ldqc: 
 			case LIR_ldcb: 
-				sprintf(s, "%s %s[%s]", lirNames[op],
+				sprintf(s, "%s = %s %s[%s]", formatRef(i), lirNames[op],
 					formatRef(i->oprnd1()), 
 					formatRef(i->oprnd2()));
 				break;
 
 			case LIR_st: 
             case LIR_sti:
 			case LIR_stq: 
             case LIR_stqi:
-				sprintf(s, "%s[%d] = %s", 
+				sprintf(s, "%s %s[%d] = %s", lirNames[op],
 					formatRef(i->oprnd2()), 
 					i->immdisp(), 
 					formatRef(i->oprnd1()));
 				break;
 
 			default:
 				sprintf(s, "?");
 				break;
@@ -1828,23 +2006,23 @@ namespace nanojit
         Fragmento *frago = triggerFrag->lirbuf->_frago;
         AvmCore *core = frago->core();
         GC *gc = core->gc;
 
 		verbose_only( StringList asmOutput(gc); )
 		verbose_only( assm->_outputCache = &asmOutput; )
 
 		verbose_only(if (assm->_verbose && core->config.verbose_live)
-			live(gc, assm, triggerFrag);)
+			live(gc, triggerFrag->lirbuf);)
 
 		bool treeCompile = core->config.tree_opt && (triggerFrag->kind == BranchTrace);
 		RegAllocMap regMap(gc);
 		NInsList loopJumps(gc);
 #ifdef MEMORY_INFO
-		loopJumps.set_meminfo_name("LIR loopjumps");
+//		loopJumps.set_meminfo_name("LIR loopjumps");
 #endif
 		assm->beginAssembly(triggerFrag, &regMap);
 
 		//fprintf(stderr, "recompile trigger %X kind %d\n", (int)triggerFrag, triggerFrag->kind);
 		Fragment* root = triggerFrag;
 		if (treeCompile)
 		{
 			// recompile the entire tree
@@ -1879,37 +2057,76 @@ namespace nanojit
 			}
 		}
 		
 		// now the main trunk
 		assm->assemble(root, loopJumps);
 		verbose_only(if (assm->_verbose) 
 			assm->outputf("compiling trunk %s",
 				frago->labels->format(root));)
+		NanoAssert(!frago->core()->config.tree_opt || root == root->anchor || root->kind == MergeTrace);			
 		assm->endAssembly(root, loopJumps);
 			
 		// reverse output so that assembly is displayed low-to-high
 		verbose_only( assm->_outputCache = 0; )
 		verbose_only(for(int i=asmOutput.size()-1; i>=0; --i) { assm->outputf("%s",asmOutput.get(i)); } );
 
 		if (assm->error())
 		{
 			root->fragEntry = 0;
 		}
 		else
 		{
 			root->link(assm);
 			if (treeCompile) root->linkBranches(assm);
 		}
+    }
 
-#if defined(NJ_VERBOSE)
-        for (size_t i = 0; i < asmOutput.size(); i++) {
-            gc->Free(asmOutput.get(i));
+    LInsp LoadFilter::insLoad(LOpcode v, LInsp base, LInsp disp)
+    {
+        if (base != sp && base != rp && (v == LIR_ld || v == LIR_ldq)) {
+            uint32_t k;
+            LInsp found = exprs.find2(v, base, disp, k);
+            if (found)
+                return found;
+            return exprs.add(out->insLoad(v,base,disp), k);
         }
-#endif
+        return out->insLoad(v, base, disp);
+    }
+
+    void LoadFilter::clear(LInsp p)
+    {
+        if (p != sp && p != rp)
+            exprs.clear();
+    }
+
+    LInsp LoadFilter::insStore(LInsp v, LInsp b, LInsp d)
+    {
+        clear(b);
+        return out->insStore(v, b, d);
+    }
+
+    LInsp LoadFilter::insStorei(LInsp v, LInsp b, int32_t d)
+    {
+        clear(b);
+        return out->insStorei(v, b, d);
+    }
+
+    LInsp LoadFilter::insCall(const CallInfo *call, LInsp args[])
+    {
+        if (!call->_cse)
+            exprs.clear();
+        return out->insCall(call, args);
+    }
+
+    LInsp LoadFilter::ins0(LOpcode op)
+    {
+        if (op == LIR_label)
+            exprs.clear();
+        return out->ins0(op);
     }
 
 	#endif /* FEATURE_NANOJIT */
 
 #if defined(NJ_VERBOSE)
     LabelMap::LabelMap(AvmCore *core, LabelMap* parent)
         : parent(parent), names(core->gc), addrs(core->config.verbose_addrs), end(buf), core(core)
 	{}
@@ -1952,17 +2169,17 @@ namespace nanojit
 			if (p == start) {
 				if (addrs)
 					sprintf(b,"%p %s",p,name);
 				else
 					strcpy(b, name);
 				return dup(b);
 			}
 			else if (p > start && p < end) {
-				int d = (intptr_t(p)-intptr_t(start)) >> e->align;
+				int32_t d = int32_t(intptr_t(p)-intptr_t(start)) >> e->align;
 				if (addrs)
 					sprintf(b, "%p %s+%d", p, name, d);
 				else
 					sprintf(b,"%s+%d", name, d);
 				return dup(b);
 			}
 			else {
 				if (parent)
@@ -1976,17 +2193,17 @@ namespace nanojit
 			return parent->format(p);
 
 		sprintf(b, "%p", p);
 		return dup(b);
     }
 
 	const char *LabelMap::dup(const char *b)
 	{
-		int need = strlen(b)+1;
+		size_t need = strlen(b)+1;
 		char *s = end;
 		end += need;
 		if (end > buf+sizeof(buf)) {
 			s = buf;
 			end = s+need;
 		}
 		strcpy(s, b);
 		return s;
--- a/js/src/nanojit/LIR.h
+++ b/js/src/nanojit/LIR.h
@@ -58,33 +58,44 @@ namespace nanojit
 #if defined(_MSC_VER) && _MSC_VER >= 1400
           : unsigned
 #endif
 	{
 		// flags; upper bits reserved
 		LIR64	= 0x40,			// result is double or quad
 		
 		// special operations (must be 0..N)
-		LIR_trace = 2,	
-		LIR_nearskip = 3, // must be LIR_skip-1 and lsb=1
-		LIR_skip = 4,
-        LIR_neartramp = 5, // must be LIR_tramp-1 and lsb=1
-        LIR_tramp = 6,
+		LIR_start = 0,	
+		LIR_nearskip = 1, // must be LIR_skip-1 and lsb=1
+		LIR_skip = 2,
+        LIR_neartramp = 3, // must be LIR_tramp-1 and lsb=1
+        LIR_tramp = 4,
 
 		// non-pure operations
+		LIR_addp    = 9,
 		LIR_param	= 10,
 		LIR_st		= 11, // 32-bit store
 		LIR_ld		= 12, // 32-bit load
+		LIR_alloc   = 13, // alloca some stack space
         LIR_sti     = 14,
-		LIR_call	= 18, // subrouting call returning a 32-bit value
+		LIR_ret     = 15,
+		LIR_live    = 16, // extend live range of reference
+		LIR_calli   = 17, // indirect call	
+		LIR_call	= 18, // subroutine call returning a 32-bit value
 			
 		// guards
 		LIR_loop    = 19, // loop fragment
 		LIR_x		= 20, // exit always
 
+		// branches
+		LIR_j		= 21, // jump always
+		LIR_jt		= 22, // jump true
+		LIR_jf		= 23, // jump false
+		LIR_label	= 24, // a jump target
+		LIR_ji      = 25, // jump indirect
+
 		// operators
 
 		// LIR_feq though LIR_fge must only be used on float arguments.  They
 		// return integers.
 		LIR_feq		= 26, // floating-point equality [2 float inputs]
 		LIR_flt		= 27, // floating-point less than: arg1 < arg2
 		LIR_fgt		= 28, // floating-point greater than: arg1 > arg2
 		LIR_fle		= 29, // arg1 <= arg2, both floating-point
@@ -132,64 +143,137 @@ namespace nanojit
 		LIR_le		= 58, // 0x3A 0011 1010
 		LIR_ge		= 59, // 0x3B 0011 1011
 		// and the unsigned integer versions
 		LIR_ult		= 60, // 0x3C 0011 1100
 		LIR_ugt		= 61, // 0x3D 0011 1101
 		LIR_ule		= 62, // 0x3E 0011 1110
 		LIR_uge		= 63, // 0x3F 0011 1111
 
+		// non-64bit ops, but we're out of code space below 64
+		LIR_file    = 1 | LIR64,
+		LIR_line    = 2 | LIR64,
+
 		/**
 		 * 64bit operations
 		 */
 		LIR_stq		= LIR_st | LIR64, // quad store
 		LIR_stqi	= LIR_sti | LIR64,
+		LIR_fret    = LIR_ret | LIR64,
 		LIR_quad    = LIR_int | LIR64, // quad constant value
 		LIR_ldq		= LIR_ld    | LIR64, // quad load
+		LIR_ldqc    = LIR_ldc   | LIR64,
         LIR_qiand   = 24 | LIR64,
         LIR_qiadd   = 25 | LIR64,
         LIR_qilsh   = LIR_lsh | LIR64,
 
 		LIR_fcall   = LIR_call  | LIR64, // subroutine call returning quad
+		LIR_fcalli  = LIR_calli | LIR64,
 		LIR_fneg	= LIR_neg  | LIR64, // floating-point numeric negation
 		LIR_fadd	= LIR_add  | LIR64, // floating-point addition
 		LIR_fsub	= LIR_sub  | LIR64, // floating-point subtraction
 		LIR_fmul	= LIR_mul  | LIR64, // floating-point multiplication
 		LIR_fdiv	= 40        | LIR64, // floating-point division
 		LIR_qcmov	= LIR_cmov | LIR64, 
 
 		LIR_qjoin	= 41 | LIR64,
 		LIR_i2f		= 42 | LIR64, // convert an integer to a float
 		LIR_u2f		= 43 | LIR64, // convert an unsigned integer to a float
         LIR_qior    = 44 | LIR64
 	};
 
 	#if defined NANOJIT_64BIT
 	#define LIR_ldp     LIR_ldq
+	#define LIR_stp     LIR_stq
     #define LIR_piadd   LIR_qiadd
     #define LIR_piand   LIR_qiand
     #define LIR_pilsh   LIR_qilsh
 	#define LIR_pcmov	LIR_qcmov
     #define LIR_pior    LIR_qior
 	#else
 	#define LIR_ldp     LIR_ld
+	#define LIR_stp     LIR_st
     #define LIR_piadd   LIR_add
     #define LIR_piand   LIR_and
     #define LIR_pilsh   LIR_lsh
 	#define LIR_pcmov	LIR_cmov
     #define LIR_pior    LIR_or
 	#endif
 
 	inline uint32_t argwords(uint32_t argc) {
 		return (argc+3)>>2;
 	}
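A worked instance of the packing (illustration only): argwords(1) == 1, argwords(4) == 1, argwords(5) == (5+3)>>2 == 2, since four 1-byte argument offsets fit per 32-bit LIns word; this is exactly the offs[] area LirBufWriter::insCall reserves ahead of the CallInfo pointer.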
 
     struct SideExit;
     struct Page;
-    struct CallInfo;
+
+    enum AbiKind {
+        ABI_FASTCALL,
+        ABI_THISCALL,
+		ABI_STDCALL,
+        ABI_CDECL
+    };
+
+    enum ArgSize {
+	    ARGSIZE_NONE = 0,
+	    ARGSIZE_F = 1,
+	    ARGSIZE_LO = 2,
+	    ARGSIZE_Q = 3,
+	    _ARGSIZE_MASK_INT = 2, 
+        _ARGSIZE_MASK_ANY = 3
+    };
+
+    struct CallInfo
+	{
+		uintptr_t	_address;
+        uint32_t	_argtypes:18;	// 9 2-bit ARGSIZE fields: return type in the low 2 bits, then up to 8 args
+        uint8_t		_cse:1;			// true if no side effects
+        uint8_t		_fold:1;		// true if the result depends only on the args (call is foldable)
+        AbiKind     _abi:3;
+		verbose_only ( const char* _name; )
+		
+		uint32_t FASTCALL _count_args(uint32_t mask) const;
+        uint32_t get_sizes(ArgSize*) const;
+
+        inline bool isInterface() const {
+            return _address == 2 || _address == 3; /* hack! */
+        }
+        inline bool isIndirect() const {
+            return _address < 256;
+        }
+		inline uint32_t FASTCALL count_args() const {
+            return _count_args(_ARGSIZE_MASK_ANY) + isIndirect();
+        }
+		inline uint32_t FASTCALL count_iargs() const {
+            return _count_args(_ARGSIZE_MASK_INT);
+        }
+		// fargs = args - iargs
+	};
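A hedged sketch of how the 18-bit _argtypes field decodes, mirroring the argt >>= 2 loop in LirBufWriter::insCall earlier in this patch (the low 2 bits carry the return size, then one 2-bit field per argument; the names below are illustrative, not patch API):

    static uint32_t decodeArgs(uint32_t argtypes, ArgSize out[], uint32_t maxargs) {
        uint32_t argc = 0;
        argtypes >>= 2;                            // skip the return type
        for (uint32_t i = 0; i < maxargs; i++, argtypes >>= 2) {
            ArgSize a = ArgSize(argtypes & 3);
            if (a != ARGSIZE_NONE)
                out[argc++] = a;                   // gaps (ARGSIZE_NONE) are skipped
        }
        return argc;
    }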
+
+    inline bool isGuard(LOpcode op) {
+        return op==LIR_x || op==LIR_xf || op==LIR_xt || op==LIR_loop;
+    }
+
+    inline bool isCall(LOpcode op) {
+        op = LOpcode(op & ~LIR64);
+        return op == LIR_call || op == LIR_calli;
+    }
+
+    inline bool isStore(LOpcode op) {
+        op = LOpcode(op & ~LIR64);
+        return op == LIR_st || op == LIR_sti;
+    }
+
+    inline bool isConst(LOpcode op) {
+        return (op & ~1) == LIR_short;
+    }
+
+    inline bool isLoad(LOpcode op) {
+        return op == LIR_ldq || op == LIR_ld || op == LIR_ldc || op == LIR_ldqc;
+    }
 
 	// Low-level Instruction 4B
 	// had to lay it out as a union with duplicate code fields since msvc couldn't figure out how to compact it otherwise.
 	class LIns
 	{
         friend class LirBufWriter;
 		// 3-operand form (backwards reach only)
 		struct u_type
@@ -285,28 +369,38 @@ namespace nanojit
 
 	public:
 		LIns*		FASTCALL oprnd1() const;
 		LIns*		FASTCALL oprnd2() const;
 		LIns*		FASTCALL oprnd3() const;
 
 		inline LOpcode	opcode() const	{ return u.code; }
 		inline uint8_t	imm8()	 const	{ return c.imm8a; }
+		inline uint8_t	imm8b()	 const	{ return c.imm8b; }
 		inline int16_t	imm16()	 const	{ return i.imm16; }
+		inline int32_t	imm24()	 const	{ return t.imm24; }
 		inline LIns*	ref()	 const	{ 
 #if defined NANOJIT_64BIT
             return (t.code & 1) ? (LIns*)this+t.imm24 : *(LIns**)(this-2);
 #else
             return (t.code & 1) ? (LIns*)this+t.imm24 : *(LIns**)(this-1);
 #endif
         }
 		inline int32_t	imm32()	 const	{ return *(int32_t*)(this-1); }
 		inline uint8_t	resv()	 const  { return g.resv; }
         void*	payload() const;
         inline Page*	page()			{ return (Page*) alignTo(this,NJ_PAGE_SIZE); }
+        inline int32_t  size() const {
+            NanoAssert(isop(LIR_alloc));
+            return i.imm16<<2;
+        }
+        inline void setSize(int32_t bytes) {
+            NanoAssert(isop(LIR_alloc) && (bytes&3)==0 && isU16(bytes>>2));
+            i.imm16 = bytes>>2;
+        }
 
 		// index args in r-l order.  arg(0) is rightmost arg
 		inline LIns* arg(uint32_t i) {
 			uint32_t c = argc();
 			NanoAssert(i < c);
 			uint8_t* offs = (uint8_t*) (this-callInfoWords-argwords(c));
 			return deref(offs[i]);
 		}
@@ -370,46 +464,52 @@ namespace nanojit
 		#endif
 		}
 
 		bool isCse(const CallInfo *functions) const;
 		bool isop(LOpcode o) const { return u.code == o; }
 		bool isQuad() const;
 		bool isCond() const;
 		bool isCmp() const;
-		bool isCall() const;
-        bool isStore() const;
-        bool isLoad() const;
-		bool isGuard() const;
+		bool isCall() const { return nanojit::isCall(u.code); }
+        bool isStore() const { return nanojit::isStore(u.code); }
+        bool isLoad() const { return nanojit::isLoad(u.code); }
+		bool isGuard() const { return nanojit::isGuard(u.code); }
 		// True if the instruction is a 32-bit or smaller constant integer.
-		bool isconst() const;
+		bool isconst() const { return nanojit::isConst(u.code); }
 		// True if the instruction is a 32-bit or smaller constant integer and
 		// has the value val when treated as a 32-bit signed integer.
 		bool isconstval(int32_t val) const;
 		// True if the instruction is a constant quad value.
 		bool isconstq() const;
 		// True if the instruction is a constant pointer value.
 		bool isconstp() const;
         bool isTramp() {
             return isop(LIR_neartramp) || isop(LIR_tramp);
         }
-
+		bool isBranch() const {
+			return isop(LIR_jt) || isop(LIR_jf) || isop(LIR_j);
+		}
 		// Set the imm16 member.  Should only be used on instructions that use
 		// that.  If you're not sure, you shouldn't be calling it.
 		void setimm16(int32_t i);
+		void setimm24(int32_t x);
 		// Set the resv member.  Should only be used on instructions that use
 		// that.  If you're not sure, you shouldn't be calling it.
 		void setresv(uint32_t resv);
 		// Set the opcode
 		void initOpcode(LOpcode);
 		// operand-setting methods
 		void setOprnd1(LIns*);
 		void setOprnd2(LIns*);
 		void setOprnd3(LIns*);
         void setDisp(int8_t d);
+		void target(LIns* t);
+        LIns **targetAddr();
+		LIns* getTarget();
 
         SideExit *exit();
 
 		inline uint32_t argc() const {
 			NanoAssert(isCall());
 			return c.imm8b;
 		}
 		inline size_t callInsWords() const {
@@ -419,29 +519,31 @@ namespace nanojit
 			return *(const CallInfo **) (this - callInfoWords);
 		}
 	};
 	typedef LIns*		LInsp;
 
 	bool FASTCALL isCse(LOpcode v);
 	bool FASTCALL isCmp(LOpcode v);
 	bool FASTCALL isCond(LOpcode v);
+    inline bool isRet(LOpcode c) {
+        return (c & ~LIR64) == LIR_ret;
+    }
+    bool FASTCALL isFloat(LOpcode v);
 	LIns* FASTCALL callArgN(LInsp i, uint32_t n);
 	extern const uint8_t operandCount[];
 
 	class Fragmento;	// @todo remove this; needed for minbuild for some reason. Should not be compiling this code at all.
 	class LirFilter;
-	struct CallInfo;
 
 	// make it a GCObject so we can explicitly delete it early
 	class LirWriter : public GCObject
 	{
 	public:
 		LirWriter *out;
-	public:
         const CallInfo *_functions;
 
 		virtual ~LirWriter() {}
 		LirWriter(LirWriter* out) 
 			: out(out), _functions(out?out->_functions : 0) {}
 
 		virtual LInsp ins0(LOpcode v) {
 			return out->ins0(v);
@@ -450,18 +552,23 @@ namespace nanojit
 			return out->ins1(v, a);
 		}
 		virtual LInsp ins2(LOpcode v, LIns* a, LIns* b) {
 			return out->ins2(v, a, b);
 		}
 		virtual LInsp insGuard(LOpcode v, LIns *c, SideExit *x) {
 			return out->insGuard(v, c, x);
 		}
-		virtual LInsp insParam(int32_t i) {
-			return out->insParam(i);
+		virtual LInsp insBranch(LOpcode v, LInsp condition, LInsp to) {
+			return out->insBranch(v, condition, to);
+		}
+        // arg: 0=first, 1=second, ...
+        // kind: 0=arg 1=saved-reg
+		virtual LInsp insParam(int32_t arg, int32_t kind) {
+			return out->insParam(arg, kind);
 		}
 		virtual LInsp insImm(int32_t imm) {
 			return out->insImm(imm);
 		}
 		virtual LInsp insImmq(uint64_t imm) {
 			return out->insImmq(imm);
 		}
 		virtual LInsp insLoad(LOpcode op, LIns* base, LIns* d) {
@@ -472,30 +579,35 @@ namespace nanojit
 		}
 		virtual LInsp insStorei(LIns* value, LIns* base, int32_t d) {
 			return isS8(d) ? out->insStorei(value, base, d)
 				: out->insStore(value, base, insImm(d));
 		}
 		virtual LInsp insCall(const CallInfo *call, LInsp args[]) {
 			return out->insCall(call, args);
 		}
+		virtual LInsp insAlloc(int32_t size) {
+			return out->insAlloc(size);
+		}
 
 		// convenience
 	    LIns*		insLoadi(LIns *base, int disp);
 	    LIns*		insLoad(LOpcode op, LIns *base, int disp);
+	    LIns*		store(LIns* value, LIns* base, int32_t d);
 		// Inserts a branch-free conditional: the result is iftrue when
 		// cond is true and iffalse otherwise.
 	    LIns*		ins_choose(LIns* cond, LIns* iftrue, LIns* iffalse);
 	    // Inserts an integer comparison to 0
 	    LIns*		ins_eq0(LIns* oprnd1);
 		// Inserts a binary operation where the second operand is an
 		// integer immediate.
         LIns*       ins2i(LOpcode op, LIns *oprnd1, int32_t);
 		LIns*		qjoin(LInsp lo, LInsp hi);
 		LIns*		insImmPtr(const void *ptr);
+		LIns*		insImmf(double f);
 	};
 
 #ifdef NJ_VERBOSE
 	extern const char* lirNames[];
 
 	/**
 	 * map address ranges to meaningful names.
 	 */
@@ -511,18 +623,18 @@ namespace nanojit
 			DRCWB(avmplus::String*) name;
 			size_t size:29, align:3;
 		};
         avmplus::SortedMap<const void*, Entry*, avmplus::LIST_GCObjects> names;
 		bool addrs, pad[3];
 		char buf[1000], *end;
         void formatAddr(const void *p, char *buf);
     public:
-		AvmCore *core;
-        LabelMap(AvmCore *, LabelMap* parent);
+        avmplus::AvmCore *core;
+        LabelMap(avmplus::AvmCore *, LabelMap* parent);
         ~LabelMap();
         void add(const void *p, size_t size, size_t align, const char *name);
 		void add(const void *p, size_t size, size_t align, avmplus::String*);
 		const char *dup(const char *);
 		const char *format(const void *p);
 		void promoteAll(const void *newbase);
     };
 
@@ -574,110 +686,128 @@ namespace nanojit
 		const char *formatIns(LInsp i);
 		void formatGuard(LInsp i, char *buf);
 	};
 
 
 	class VerboseWriter : public LirWriter
 	{
 		avmplus::List<LInsp, avmplus::LIST_NonGCObjects> code;
-		LirNameMap *names;
+		DWB(LirNameMap*) names;
     public:
 		VerboseWriter(GC *gc, LirWriter *out, LirNameMap* names) 
 			: LirWriter(out), code(gc), names(names) 
 		{}
 
 		LInsp add(LInsp i) {
-			code.add(i);
+			if (i)
+				code.add(i);
 			return i;
 		}
 
+        LInsp add_flush(LInsp i) {
+            if ((i = add(i)) != 0) 
+                flush();
+            return i;
+        }
+
 		void flush()
 		{
-			for (int j=0, n=code.size(); j < n; j++)
-				printf("    %s\n",names->formatIns(code[j]));
-			code.clear();
-			printf("\n");
+            int n = code.size();
+            if (n) {
+			    for (int i=0; i < n; i++)
+				    printf("    %s\n",names->formatIns(code[i]));
+			    code.clear();
+                if (n > 1)
+        			printf("\n");
+            }
 		}
 
 		LIns* insGuard(LOpcode op, LInsp cond, SideExit *x) {
-			LInsp i = add(out->insGuard(op,cond,x));
-			if (i)
-				flush();
-			return i;
+			return add_flush(out->insGuard(op,cond,x));
+		}
+
+		LIns* insBranch(LOpcode v, LInsp condition, LInsp to) {
+			return add_flush(out->insBranch(v, condition, to));
 		}
 
 		LIns* ins0(LOpcode v) {
-			LInsp i = add(out->ins0(v));
-			if (i)
-				flush();
-			return i;
+            if (v == LIR_label || v == LIR_start) {
+                flush();
+            }
+			return add(out->ins0(v));
 		}
 
 		LIns* ins1(LOpcode v, LInsp a) {
-			return add(out->ins1(v, a));
+			return isRet(v) ? add_flush(out->ins1(v, a)) : add(out->ins1(v, a));
 		}
 		LIns* ins2(LOpcode v, LInsp a, LInsp b) {
 			return v == LIR_2 ? out->ins2(v,a,b) : add(out->ins2(v, a, b));
 		}
 		LIns* insCall(const CallInfo *call, LInsp args[]) {
-			return add(out->insCall(call, args));
+			return add_flush(out->insCall(call, args));
 		}
-		LIns* insParam(int32_t i) {
-			return add(out->insParam(i));
+		LIns* insParam(int32_t i, int32_t kind) {
+			return add(out->insParam(i, kind));
 		}
 		LIns* insLoad(LOpcode v, LInsp base, LInsp disp) {
 			return add(out->insLoad(v, base, disp));
 		}
 		LIns* insStore(LInsp v, LInsp b, LInsp d) {
 			return add(out->insStore(v, b, d));
 		}
 		LIns* insStorei(LInsp v, LInsp b, int32_t d) {
 			return add(out->insStorei(v, b, d));
 		}
+        LIns* insAlloc(int32_t size) {
+            return add(out->insAlloc(size));
+        }
     };
 
 #endif
 
 	class ExprFilter: public LirWriter
 	{
 	public:
 		ExprFilter(LirWriter *out) : LirWriter(out) {}
 		LIns* ins1(LOpcode v, LIns* a);
 	    LIns* ins2(LOpcode v, LIns* a, LIns* b);
-		LIns* insGuard(LOpcode v, LIns *c, SideExit *x);
+		LIns* insGuard(LOpcode, LIns *cond, SideExit *);
+        LIns* insBranch(LOpcode, LIns *cond, LIns *target);
 	};
 
 	// @todo, this could be replaced by a generic HashMap or HashSet, if we had one
 	class LInsHashSet
 	{
 		// must be a power of 2. 
 		// don't start too small, or we'll waste time growing and rehashing.
 		// don't start too large, will waste memory. 
-		static const uint32_t kInitialCap = 2048;	
+		static const uint32_t kInitialCap = 64;	
 
-		InsList m_list;
-		uint32_t m_used;
+		LInsp *m_list; // explicit WB's are used, no DWB needed.
+		uint32_t m_used, m_cap;
 		GC* m_gc;
 
 		static uint32_t FASTCALL hashcode(LInsp i);
-		uint32_t FASTCALL find(LInsp name, uint32_t hash, const InsList& list, uint32_t cap);
+		uint32_t FASTCALL find(LInsp name, uint32_t hash, const LInsp *list, uint32_t cap);
 		static bool FASTCALL equals(LInsp a, LInsp b);
 		void FASTCALL grow();
 
 	public:
 
 		LInsHashSet(GC* gc);
 		LInsp find32(int32_t a, uint32_t &i);
 		LInsp find64(uint64_t a, uint32_t &i);
 		LInsp find1(LOpcode v, LInsp a, uint32_t &i);
 		LInsp find2(LOpcode v, LInsp a, LInsp b, uint32_t &i);
 		LInsp findcall(const CallInfo *call, uint32_t argc, LInsp args[], uint32_t &i);
 		LInsp add(LInsp i, uint32_t k);
 		void replace(LInsp i);
+        void clear();
 
 		static uint32_t FASTCALL hashimm(int32_t);
 		static uint32_t FASTCALL hashimmq(uint64_t);
 		static uint32_t FASTCALL hash1(LOpcode v, LInsp);
 		static uint32_t FASTCALL hash2(LOpcode v, LInsp, LInsp);
 		static uint32_t FASTCALL hashcall(const CallInfo *call, uint32_t argc, LInsp args[]);
 	};
 
@@ -690,50 +820,55 @@ namespace nanojit
 	    LIns* insImmq(uint64_t q);
 		LIns* ins1(LOpcode v, LInsp);
 		LIns* ins2(LOpcode v, LInsp, LInsp);
 		LIns* insLoad(LOpcode v, LInsp b, LInsp d);
 		LIns* insCall(const CallInfo *call, LInsp args[]);
 		LIns* insGuard(LOpcode op, LInsp cond, SideExit *x);
 	};
 
-	struct Page;
 	class LirBuffer : public GCFinalizedObject
 	{
 		public:
 			DWB(Fragmento*)		_frago;
 			LirBuffer(Fragmento* frago, const CallInfo* functions);
 			virtual ~LirBuffer();
 			void        clear();
 			LInsp		next();
-			LInsp		commit(uint32_t count);
-			bool		addPage();
 			bool		outOmem() { return _noMem != 0; }
-			debug_only (void		validate() const;)
+			
+			debug_only (void validate() const;)
 			verbose_only(DWB(LirNameMap*) names;)
-			verbose_only(int insCount();)
-			verbose_only(int byteCount();)
+			
+			int32_t insCount();
+			int32_t byteCount();
 
 			// stats
 			struct 
 			{
 				uint32_t lir;	// # instructions
 				uint32_t pages;	// pages consumed
 			}
 			_stats;
 
 			const CallInfo* _functions;
+            AbiKind abi;
             LInsp state,param1,sp,rp;
+            LInsp savedParams[NumSavedRegs];
 			
-		private:
+		protected:
+			friend class LirBufWriter;
+
+			LInsp		commit(uint32_t count);
+			bool		addPage();
 			Page*		pageAlloc();
 
-			Page*				_start;		// first page
-			LInsp				_unused;	// next unused instruction slot
-			int					_noMem;		// set if ran out of memory when writing to buffer
+			Page*		_start;		// first page
+			LInsp		_unused;	// next unused instruction slot
+			int			_noMem;		// set if ran out of memory when writing to buffer
 	};	
 
 	class LirBufWriter : public LirWriter
 	{
 		DWB(LirBuffer*)	_buf;		// underlying buffer housing the instructions
         LInsp spref, rpref;
 
         public:
@@ -744,27 +879,34 @@ namespace nanojit
 
 			// LirWriter interface
 			LInsp   insLoad(LOpcode op, LInsp base, LInsp off);
 			LInsp	insStore(LInsp o1, LInsp o2, LInsp o3);
 			LInsp	insStorei(LInsp o1, LInsp o2, int32_t imm);
 			LInsp	ins0(LOpcode op);
 			LInsp	ins1(LOpcode op, LInsp o1);
 			LInsp	ins2(LOpcode op, LInsp o1, LInsp o2);
-			LInsp	insParam(int32_t i);
+			LInsp	insParam(int32_t i, int32_t kind);
 			LInsp	insImm(int32_t imm);
 			LInsp	insImmq(uint64_t imm);
 		    LInsp	insCall(const CallInfo *call, LInsp args[]);
 			LInsp	insGuard(LOpcode op, LInsp cond, SideExit *x);
+			LInsp	insBranch(LOpcode v, LInsp condition, LInsp to);
+			LInsp   insAlloc(int32_t size);
 
 			// buffer mgmt
 			LInsp	skip(size_t);
+
+		protected:
 			LInsp	insFar(LOpcode op, LInsp target);
+			LInsp	insLink(LOpcode op, LInsp target);
 			LInsp	ensureReferenceable(LInsp i, int32_t addedDistance);
 			bool	ensureRoom(uint32_t count);
+			bool	can8bReach(LInsp from, LInsp to) { return isU8(from-to-1); }
+			bool	can24bReach(LInsp from, LInsp to) { return isS24(from-to); }
 			bool	canReference(LInsp from, LInsp to) {
 				return isU8(from-to-1);
 			}
 	};
 
 	class LirFilter
 	{
 	public:
@@ -790,40 +932,61 @@ namespace nanojit
 		LirReader(LInsp i) : LirFilter(0), _i(i) { }
 		virtual ~LirReader() {}
 
 		// LirReader i/f
 		LInsp read(); // advance to the prior instruction
 		LInsp pos() {
 			return _i;
 		}
+        void setpos(LIns *i) {
+            _i = i;
+        }
 	};
 
     class Assembler;
 
     void compile(Assembler *assm, Fragment *frag);
     verbose_only( void printTracker(const char* s, avmplus::RegionTracker& trk, Assembler* assm); )
-	verbose_only(void live(GC *gc, Assembler *assm, Fragment *frag);)
+	verbose_only(void live(GC *gc, LirBuffer *lirbuf);)
 
 	class StackFilter: public LirFilter
 	{
 		GC *gc;
-		Fragment *frag;
+		LirBuffer *lirbuf;
 		LInsp sp;
 		avmplus::BitSet stk;
         int top;
-		int getTop(LInsp guard);
+		int getTop(LInsp br);
 	public:
-		StackFilter(LirFilter *in, GC *gc, Fragment *frag, LInsp sp); 
+		StackFilter(LirFilter *in, GC *gc, LirBuffer *lirbuf, LInsp sp); 
 		virtual ~StackFilter() {}
 		LInsp read();
 	};
 
 	class CseReader: public LirFilter
 	{
 		LInsHashSet *exprs;
 		const CallInfo *functions;
 	public:
 		CseReader(LirFilter *in, LInsHashSet *exprs, const CallInfo*);
 		LInsp read();
 	};
+
+    // eliminate redundant loads by watching for stores & mutator calls
+    class LoadFilter: public LirWriter
+    {
+    public:
+        LInsp sp, rp;
+        LInsHashSet exprs;
+        void clear(LInsp p);
+    public:
+        LoadFilter(LirWriter *out, GC *gc)
+            : LirWriter(out), exprs(gc) { }
+
+        LInsp ins0(LOpcode);
+        LInsp insLoad(LOpcode, LInsp base, LInsp disp);
+        LInsp insStore(LInsp v, LInsp b, LInsp d);
+        LInsp insStorei(LInsp v, LInsp b, int32_t d);
+        LInsp insCall(const CallInfo *call, LInsp args[]);
+    };
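A hedged wiring sketch for LoadFilter; the class shapes are those declared above, while everything else (lirbuf, gc, base, disp, val) is assumed context:

    // LirBufWriter writer(lirbuf);                  // sink: emits LIR into the buffer
    // LoadFilter   loads(&writer, gc);              // dedupes LIR_ld/LIR_ldq by (op, base, disp)
    // LIns* a = loads.insLoad(LIR_ld, base, disp);
    // LIns* b = loads.insLoad(LIR_ld, base, disp);  // b == a, served from exprs
    // loads.insStorei(val, base, 0);                // store clears the table (base != sp/rp)
    // LIns* c = loads.insLoad(LIR_ld, base, disp);  // re-emitted, c != a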
 }
 #endif // __nanojit_LIR__
--- a/js/src/nanojit/NativeAMD64.h
+++ b/js/src/nanojit/NativeAMD64.h
@@ -104,16 +104,17 @@ namespace nanojit
 		LastReg = 31,
 		UnknownReg = 32
 	} 
 	Register;
 
 	typedef int RegisterMask;
 
 	/* R13-R15 (RBX and R12 are commented out of the mask below) */
+	static const int NumSavedRegs = 3;
 	static const RegisterMask SavedRegs = /*(1<<RBX) |*/ /*(1<<R12) |*/ (1<<R13) | (1<<R14) | (1<<R15);
 	/* RAX, RCX, RDX, RDI, RSI, R8-R11 */
 	static const RegisterMask TempRegs = (1<<RAX) | (1<<RCX) | (1<<RDX) | (1<<R8) | (1<<R9) | (1<<R10) | (1<<R11) | (1<<RDI) | (1<<RSI);
 	static const RegisterMask GpRegs = SavedRegs | TempRegs;
 	/* XMM0-XMM7 */
 	static const RegisterMask XmmRegs = (1<<XMM0) | (1<<XMM1) | (1<<XMM2) | (1<<XMM3) | (1<<XMM4) | (1<<XMM5) | (1<<XMM6) | (1<<XMM7) | (1<<XMM8) | (1<<XMM9) | (1<<XMM10) | (1<<XMM11) | (1<<XMM13) | (1<<XMM14) | (1<<XMM15);
 	static const RegisterMask FpRegs = XmmRegs;
 	static const RegisterMask ScratchRegs = TempRegs | XmmRegs;
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -152,16 +152,17 @@ typedef struct _FragInfo {
     NIns*           epilogue;
 } FragInfo;
 
 #ifdef ARM_VFP
 static const RegisterMask SavedFpRegs = 1<<D0 | 1<<D1 | 1<<D2 | 1<<D3 | 1<<D4 | 1<<D5 | 1<<D6 | 1<<D7;
 #else
 static const RegisterMask SavedFpRegs = 0;
 #endif
+static const int NumSavedRegs = 7;
 static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10 | SavedFpRegs;
 static const RegisterMask FpRegs = 1<<D0 | 1<<D1 | 1<<D2 | 1<<D3 | 1<<D4 | 1<<D5 | 1<<D6; // no D7; S14-S15 are used for i2f/u2f.
 static const RegisterMask GpRegs = 0x07FF;
 static const RegisterMask AllowableFlagRegs = 1<<R0 | 1<<R1 | 1<<R2 | 1<<R3 | 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10;
 
 #define IsFpReg(_r)     ((rmask(_r) & (FpRegs | (1<<D7))) != 0)
 #define IsGpReg(_r)     ((rmask(_r) & (GpRegs | (1<<Scratch))) != 0)
 #define FpRegNum(_fpr)  ((_fpr) - FirstFloatReg)
--- a/js/src/nanojit/NativeThumb.cpp
+++ b/js/src/nanojit/NativeThumb.cpp
@@ -56,16 +56,22 @@ namespace nanojit
 
 #ifdef NJ_VERBOSE
 	const char* regNames[] = {"r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","r11","IP","SP","LR","PC"};
 
 #endif
     const Register Assembler::argRegs[] = { R0, R1, R2, R3 };
     const Register Assembler::retRegs[] = { R0, R1 };
 
+#ifdef NJ_THUMB_JIT
+	const Register Assembler::savedRegs[] = { R4, R5, R6, R7 };
+#else
+	const Register Assembler::savedRegs[] = { R4, R5, R6, R7, R8, R9, R10 };
+#endif
+
 	void Assembler::nInit(AvmCore*)
 	{
 		// Thumb mode does not have conditional move, alas
 		has_cmov = false;
 	}
 
 	NIns* Assembler::genPrologue(RegisterMask needSaving)
 	{
@@ -264,17 +270,17 @@ namespace nanojit
 		uint32_t op = i->opcode();
 		int prefer = ~0;
 
 		if (op==LIR_call || op==LIR_fcall)
 			prefer = rmask(R0);
 		else if (op == LIR_callh)
 			prefer = rmask(R1);
 		else if (op == LIR_param)
-			prefer = rmask(imm2register(i->imm8()));
+			prefer = rmask(imm2register(argRegs[i->imm8()]));
 
 		if (_allocator.free & allow & prefer)
 			allow &= prefer;
 		return allow;
 	}
 
     void Assembler::asm_qjoin(LIns *ins)
     {
--- a/js/src/nanojit/NativeThumb.h
+++ b/js/src/nanojit/NativeThumb.h
@@ -96,16 +96,17 @@ namespace nanojit
 	typedef int RegisterMask;
 	typedef struct _FragInfo
 	{
 		RegisterMask	needRestoring;
 		NIns*			epilogue;
 	} 
 	FragInfo;
 
+	static const int NumSavedRegs = 4;
 	static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7;
 	static const RegisterMask FpRegs = 0x0000; // FST0-FST7
 	static const RegisterMask GpRegs = 0x003F;
 	static const RegisterMask AllowableFlagRegs = 1<<R0 | 1<<R1 | 1<<R2 | 1<<R3 | 1<<R4 | 1<<R5;
 
 	#define firstreg()		R0
 	#define nextreg(r)		(Register)((int)r+1)
 	#define imm2register(c) (Register)(c-1)
--- a/js/src/nanojit/Nativei386.cpp
+++ b/js/src/nanojit/Nativei386.cpp
@@ -66,49 +66,54 @@ namespace nanojit
             "xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"
 #endif
 		};
 	#endif
 
 #if defined NANOJIT_IA32
     const Register Assembler::argRegs[] = { ECX, EDX };
     const Register Assembler::retRegs[] = { EAX, EDX };
+    const Register Assembler::savedRegs[] = { EBX, ESI, EDI };
 #elif defined NANOJIT_AMD64
 #if defined WIN64
 	const Register Assembler::argRegs[] = { R8, R9, RCX, RDX };
 #else
 	const Register Assembler::argRegs[] = { RDI, RSI, RDX, RCX, R8, R9 };
 #endif
 	const Register Assembler::retRegs[] = { RAX, RDX };
+	const Register Assembler::savedRegs[] = { R13, R14, R15 };
 #endif
 
+    const static uint8_t max_abi_regs[] = {
+        2, /* ABI_FASTCALL */
+        1, /* ABI_THISCALL */
+        0, /* ABI_STDCALL */
+        0  /* ABI_CDECL */
+    };
+
 	void Assembler::nInit(AvmCore* core)
 	{
         OSDep::getDate();
 #ifdef NANOJIT_AMD64
         avmplus::AvmCore::cmov_available =
         avmplus::AvmCore::sse2_available = true;
 #endif
 	}
 
-	NIns* Assembler::genPrologue(RegisterMask needSaving)
+	NIns* Assembler::genPrologue()
 	{
 		/**
 		 * Prologue
 		 */
 		uint32_t stackNeeded = STACK_GRANULARITY * _activation.highwatermark;
-		uint32_t savingCount = 0;
 
-		for(Register i=FirstReg; i <= LastReg; i = nextreg(i))
-			if (needSaving&rmask(i)) 
-				savingCount++;
-
-		// After forcing alignment, we've pushed the pre-alignment SP
-		// and savingCount registers.
-		uint32_t stackPushed = STACK_GRANULARITY * (1+savingCount);
+		uint32_t stackPushed =
+            STACK_GRANULARITY + // returnaddr
+            STACK_GRANULARITY; // ebp
 		uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
 		uint32_t amt = aligned - stackPushed;
 
 		// Reserve stackNeeded bytes, padded
 		// to preserve NJ_ALIGN_STACK-byte alignment.
 		if (amt) 
 		{
 #if defined NANOJIT_IA32
@@ -117,47 +122,47 @@ namespace nanojit
 			SUBQi(SP, amt);
 #endif
 		}
 
 		verbose_only( verbose_outputf("        %p:",_nIns); )
 		verbose_only( verbose_output("        patch entry:"); )
         NIns *patchEntry = _nIns;
 		MR(FP, SP); // Establish our own FP.
-
-		// Save pre-alignment SP value here, where the FP will point,
-		// to preserve the illusion of a valid frame chain for
-		// functions like MMgc::GetStackTrace.  The 'return address'
-		// of this 'frame' will be the last-saved register, but that's
-		// fine, because the next-older frame will be legit.
-		PUSHr(FP);
-
-		for(Register i=FirstReg; i <= LastReg; i = nextreg(i))
-			if (needSaving&rmask(i))
-				PUSHr(i);
+        PUSHr(FP); // Save caller's FP.
 
-		// We'd like to be able to use SSE instructions like MOVDQA on
-		// stack slots; it requires 16B alignment.  Darwin requires a
-		// 16B stack alignment, and Linux GCC seems to intend to
-		// establish and preserve the same, but we're told that GCC
-		// has not always done this right.  To avoid doubt, do it on
-		// all platforms.  The prologue runs only when we enter
-		// fragments from the interpreter, so forcing 16B alignment
-		// here is cheap.
-#if defined NANOJIT_IA32
-		ANDi(SP, -NJ_ALIGN_STACK);
-#elif defined NANOJIT_AMD64
-		ANDQi(SP, -NJ_ALIGN_STACK);
-#endif
-		MR(FP,SP);
-		PUSHr(FP); // Save caller's FP.
+        // align the entry point
+        asm_align_code();
 
 		return patchEntry;
 	}
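A worked instance of the prologue arithmetic, assuming STACK_GRANULARITY == 4 and NJ_ALIGN_STACK == 16: with highwatermark == 10, stackNeeded is 40 and stackPushed is 8 (return address plus saved EBP), so aligned == alignUp(48, 16) == 48 and amt == 40, leaving SP 16-byte aligned after the SUB. A sketch of the helper's usual shape (alignUp itself is defined elsewhere in nanojit):

    #include <stdint.h>
    static uint32_t alignUpTo(uint32_t n, uint32_t a) {  // a must be a power of two
        return (n + a - 1) & ~(a - 1);
    }
    // alignUpTo(40 + 8, 16) - 8 == 40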
 
+    void Assembler::asm_align_code() {
+        static char nop[][9] = {
+                {0x90},
+                {0x66,0x90},
+                {0x0f,0x1f,0x00},
+                {0x0f,0x1f,0x40,0x00},
+                {0x0f,0x1f,0x44,0x00,0x00},
+                {0x66,0x0f,0x1f,0x44,0x00,0x00},
+                {0x0f,0x1f,0x80,0x00,0x00,0x00,0x00},
+                {0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00},
+                {0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00},
+        };
+        unsigned n;
+        while((n = uintptr_t(_nIns) & 15) != 0) {
+            if (n > 9)
+                n = 9;
+            underrunProtect(n);
+            _nIns -= n;
+            memcpy(_nIns, nop[n-1], n);
+            asm_output1("nop%d", n);
+        }
+    }
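An annotation on the table above: the rows of nop[] appear to be the standard single-instruction x86 no-op encodings for 1 through 9 bytes (0x90, 0x66 0x90, and the 0x0F 0x1F long-NOP forms), so each loop pass can retire up to 9 padding bytes while emitting backwards:

    // e.g. _nIns & 15 == 7:  one 7-byte NOP (nop[6]) reaches the boundary;
    //      _nIns & 15 == 11: a 9-byte NOP (leaving 2), then a 2-byte NOP.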
+
 	void Assembler::nFragExit(LInsp guard)
 	{
 		SideExit *exit = guard->exit();
 		bool trees = _frago->core()->config.tree_opt;
         Fragment *frag = exit->target;
         GuardRecord *lr = 0;
 		bool destKnown = (frag && frag->fragEntry);
 		if (destKnown && !trees)
@@ -186,17 +191,16 @@ namespace nanojit
 			// for trees we need the patch entry on the incoming fragment so we can unhook it later if needed
 			if (trees && destKnown)
 				patch(lr);
 #endif
 		}
 		// first restore ESP from EBP, undoing SUBi(SP,amt) from genPrologue
         MR(SP,FP);
 
-
         #ifdef NJ_VERBOSE
         if (_frago->core()->config.show_stats) {
 			// load EDX (arg1) with Fragment *fromFrag, target fragment
 			// will make use of this when calling fragenter().
 		#if defined NANOJIT_IA32
             int fromfrag = int((Fragment*)_thisfrag);
             LDi(argRegs[1], fromfrag);
 		#elif defined NANOJIT_AMD64
@@ -208,114 +212,125 @@ namespace nanojit
 		// return value is GuardRecord*
 	#if defined NANOJIT_IA32
         LDi(EAX, int(lr));
 	#elif defined NANOJIT_AMD64
 		LDQi(RAX, intptr_t(lr));
 	#endif
 	}
 
-    NIns *Assembler::genEpilogue(RegisterMask restore)
+    NIns *Assembler::genEpilogue()
     {
         RET();
         POPr(FP); // Restore caller's FP.
-        MR(SP,FP); // Undo forced alignment.
-
-		// Restore saved registers.
-		for (Register i=UnknownReg; i >= FirstReg; i = prevreg(i))
-			if (restore&rmask(i)) { POPr(i); } 
-		
-		POPr(FP); // Pop the pre-alignment SP.
+        MR(SP,FP); // pop the stack frame
         return  _nIns;
     }
 	
 #if defined NANOJIT_IA32
 	void Assembler::asm_call(LInsp ins)
 	{
         const CallInfo* call = ins->callInfo();
 		// must be signed, not unsigned
-		const uint32_t iargs = call->count_iargs();
-		int32_t fstack = call->count_args() - iargs;
+		uint32_t iargs = call->count_iargs();
+		int32_t fargs = call->count_args() - iargs - call->isIndirect();
 
-        int32_t extra = 0;
+        bool imt = call->isInterface();
+        if (imt)
+            iargs --;
+
+        uint32_t max_regs = max_abi_regs[call->_abi];
+        if (max_regs > iargs)
+            max_regs = iargs;
 
-#if defined NJ_NO_FASTCALL
-        int32_t istack = iargs;
+        int32_t istack = iargs-max_regs;  // first max_regs 4B args are in registers
+        int32_t extra = 0;
+		const int32_t pushsize = 4*istack + 8*fargs; // actual stack space used
+
+#if _MSC_VER
+        // msc is slack, and MIR doesn't do anything extra, so let's use this
+        // call-site alignment to at least have code size parity with MIR.
+        uint32_t align = 4; // NJ_ALIGN_STACK;
 #else
-		int32_t istack = iargs-2;  // first 2 4B args are in registers
-		if (istack <= 0)
-		{
-			istack = 0;
-		}
+        uint32_t align = NJ_ALIGN_STACK;
 #endif
 
-		const int32_t size = 4*istack + 8*fstack; // actual stack space used
-        if (size) {
+        if (pushsize) {
 		    // stack re-alignment 
 		    // only pop our adjustment amount since callee pops args in FASTCALL mode
-		    extra = alignUp(size, NJ_ALIGN_STACK) - (size); 
-#ifndef NJ_NO_FASTCALL
-		    if (extra > 0)
-			{
+		    extra = alignUp(pushsize, align) - pushsize;
+            if (call->_abi == ABI_CDECL) {
+				// with CDECL only, caller pops args
+                ADDi(SP, extra+pushsize);
+            } else if (extra > 0) {
 				ADDi(SP, extra);
-			}
-#endif
+            }
         }
 
-#ifdef NJ_NO_FASTCALL
-        // In C calling conventions, callee doesn't pop args.
-        ADDi(SP, 4*iargs + 8*fstack + extra);
-#endif
-
-		CALL(call);
-
-#ifdef NJ_NO_FASTCALL
-        if (iargs >= 1) {
-            PUSHr(ECX);
-            if (iargs >= 2) {
-                PUSHr(EDX);
-            }
+        bool indirect = false;
+        if (ins->isop(LIR_call) || ins->isop(LIR_fcall)) {
+            verbose_only(if (_verbose)
+                outputf("        %p:", _nIns);
+            )
+    		CALL(call);
         }
-#endif
+        else {
+            // indirect call.  x86 calling conventions don't use EAX as an
+            // argument register, but do use it for the return value.  We need
+            // a register for the call address, so we use EAX since it will
+            // always be available.
+            NanoAssert(ins->isop(LIR_calli) || ins->isop(LIR_fcalli));
+            CALLr(call, EAX);
+            indirect = true;
+        }
 
 		// make sure fpu stack is empty before call (restoreCallerSaved)
 		NanoAssert(_allocator.isFree(FST0));
 		// note: this code requires that ref arguments (ARGSIZE_Q)
         // be one of the first two arguments
-		// pre-assign registers to the first 2 4B args
-		const int max_regs = (iargs < 2) ? iargs : 2;
-		int n = 0;
+		// pre-assign registers to the first N 4B args based on the calling convention
+		uint32_t n = 0;
 
-        ArgSize sizes[10];
+        ArgSize sizes[2*MAXARGS];
         uint32_t argc = call->get_sizes(sizes);
+        if (indirect) {
+            argc--;
+            asm_arg(ARGSIZE_LO, ins->arg(argc), EAX);
+        }
+
+        if (imt) {
+            // interface thunk calling convention: put iid in EDX
+            NanoAssert(call->_abi == ABI_CDECL);
+            argc--;
+            asm_arg(ARGSIZE_LO, ins->arg(argc), EDX);
+        }
 
 		for(uint32_t i=0; i < argc; i++)
 		{
 			uint32_t j = argc-i-1;
             ArgSize sz = sizes[j];
             Register r = UnknownReg;
-            if (n < max_regs && sz != ARGSIZE_F) 
-			    r = argRegs[n++]; // tell asm_arg what reg to use
+            if (n < max_regs && sz != ARGSIZE_F) { 
+		        r = argRegs[n++]; // tell asm_arg what reg to use
+            }
             asm_arg(sz, ins->arg(j), r);
 		}
 
 		if (extra > 0)
-		{
 			SUBi(SP, extra);
-		}
 	}
 
 #elif defined NANOJIT_AMD64
 
 	void Assembler::asm_call(LInsp ins)
 	{
 		Register fpu_reg = XMM0;
         const CallInfo* call = ins->callInfo();
 		int n = 0;
-        
+
 		CALL(call);
 
         ArgSize sizes[10];
         uint32_t argc = call->get_sizes(sizes);
 
 		for(uint32_t i=0; i < argc; i++)
 		{
 			uint32_t j = argc-i-1;
@@ -412,35 +427,36 @@ namespace nanojit
         }
 #endif
 	}
 
 	RegisterMask Assembler::hint(LIns* i, RegisterMask allow)
 	{
 		uint32_t op = i->opcode();
 		int prefer = allow;
-		if (op == LIR_call)
-#if defined NANOJIT_IA32
-			prefer &= rmask(EAX);
-#elif defined NANOJIT_AMD64
-			prefer &= rmask(RAX);
-#endif
-		else if (op == LIR_param)
-			prefer &= rmask(Register(i->imm8()));
-#if defined NANOJIT_IA32
-        else if (op == LIR_callh || op == LIR_rsh && i->oprnd1()->opcode()==LIR_callh)
-            prefer &= rmask(EDX);
-#else
-		else if (op == LIR_callh)
-			prefer &= rmask(RAX);
-#endif
-		else if (i->isCmp())
+        if (op == LIR_call || op == LIR_calli) {
+			prefer &= rmask(retRegs[0]);
+        }
+        else if (op == LIR_fcall || op == LIR_fcalli) {
+            prefer &= rmask(FST0);
+        }
+        else if (op == LIR_param) {
+            uint32_t max_regs = max_abi_regs[_thisfrag->lirbuf->abi];
+            if (i->imm8() < max_regs)
+    			prefer &= rmask(Register(i->imm8()));
+        }
+        else if (op == LIR_callh || (op == LIR_rsh && i->oprnd1()->opcode()==LIR_callh)) {
+            prefer &= rmask(retRegs[1]);
+        }
+        else if (i->isCmp()) {
 			prefer &= AllowableFlagRegs;
-        else if (i->isconst())
+        }
+        else if (i->isconst()) {
             prefer &= ScratchRegs;
+        }
 		return (_allocator.free & prefer) ? prefer : allow;
 	}
 
     void Assembler::asm_qjoin(LIns *ins)
     {
 		int d = findMemFor(ins);
 		AvmAssert(d);
 		LIns* lo = ins->oprnd1();
@@ -471,125 +487,171 @@ namespace nanojit
 			// okay if r gets recycled.
 			Register r = findRegFor(lo, GpRegs);
 			ST(FP, d, r);
 		}
 
         freeRsrcOf(ins, false);	// if we had a reg in use, emit a ST to flush it to mem
     }
 
+	void Assembler::asm_load(int d, Register r)
+	{
+		if (rmask(r) & FpRegs)
+		{
+#if defined NANOJIT_IA32
+			if (rmask(r) & XmmRegs) {
+#endif
+				SSE_LDQ(r, d, FP);
+#if defined NANOJIT_IA32
+			} else {
+				FLDQ(d, FP); 
+			}
+#endif
+		}
+#if defined NANOJIT_AMD64
+		else
+		{
+			// no LIns is in scope here; before this refactoring the AMD64
+			// path restored quads with LDQ unconditionally, so keep doing that.
+			LDQ(r, d, FP);
+		}
+#else
+		else
+		{
+			LD(r, d, FP);
+		}
+#endif
+	}
+	
 	void Assembler::asm_restore(LInsp i, Reservation *resv, Register r)
 	{
-        if (i->isconst())
-        {
+        if (i->isop(LIR_alloc)) {
+            LEA(r, disp(resv), FP);
+            verbose_only(if (_verbose) {
+                outputf("        remat %s size %d", _thisfrag->lirbuf->names->formatRef(i), i->size());
+            })
+        }
+        else if (i->isconst()) {
             if (!resv->arIndex) {
                 reserveFree(i);
             }
             LDi(r, i->constval());
         }
-        else
-        {
+        else {
             int d = findMemFor(i);
-            if (rmask(r) & FpRegs)
-		    {
-#if defined NANOJIT_IA32
-                if (rmask(r) & XmmRegs) {
-#endif
-                    SSE_LDQ(r, d, FP);
-#if defined NANOJIT_IA32
-                } else {
-			        FLDQ(d, FP); 
-                }
-#endif
-            }
-            else
-		    {
-#if defined NANOJIT_AMD64
-                LDQ(r, d, FP);
-#else
-			    LD(r, d, FP);
-#endif
-		    }
+			asm_load(d,r);
 			verbose_only(if (_verbose) {
 				outputf("        restore %s", _thisfrag->lirbuf->names->formatRef(i));
 			})
         }
 	}
 
     void Assembler::asm_store32(LIns *value, int dr, LIns *base)
     {
         if (value->isconst())
         {
-			Register rb = findRegFor(base, GpRegs);
+			Register rb;
+            if (base->isop(LIR_alloc)) {
+                rb = FP;
+                dr += findMemFor(base);
+            } else {
+                rb = findRegFor(base, GpRegs);
+            }
             int c = value->constval();
 			STi(rb, dr, c);
         }
         else
         {
 		    // make sure the value is in a register
 		    Reservation *rA, *rB;
-		    findRegFor2(GpRegs, value, rA, base, rB);
-		    Register ra = rA->reg;
-		    Register rb = rB->reg;
+            Register ra, rb;
+            if (base->isop(LIR_alloc)) {
+                rb = FP;
+                dr += findMemFor(base);
+                ra = findRegFor(value, GpRegs);
+            } else if (base->isconst()) {
+                // absolute address
+                dr += base->constval();
+                ra = findRegFor(value, GpRegs);
+                rb = UnknownReg;
+            } else {
+    		    findRegFor2(GpRegs, value, rA, base, rB);
+		        ra = rA->reg;
+		        rb = rB->reg;
+            }
 		    ST(rb, dr, ra);
         }
     }
 
-	void Assembler::asm_spill(LInsp i, Reservation *resv, bool pop)
+	void Assembler::asm_spill(Register rr, int d, bool pop, bool quad)
 	{
-		(void)i;
-		int d = disp(resv);
-		Register rr = resv->reg;
+		(void)quad;
 		if (d)
 		{
 			// save to spill location
             if (rmask(rr) & FpRegs)
 			{
 #if defined NANOJIT_IA32
                 if (rmask(rr) & XmmRegs) {
 #endif
                     SSE_STQ(d, FP, rr);
 #if defined NANOJIT_IA32
                 } else {
 					FSTQ((pop?1:0), d, FP);
                 }
 #endif
 			}
+#if defined NANOJIT_AMD64
+			else if (quad)
+			{
+				STQ(FP, d, rr);
+			}
+#endif
 			else
 			{
-#if defined NANOJIT_AMD64
-				STQ(FP, d, rr);
-#else
 				ST(FP, d, rr);
-#endif
 			}
-			verbose_only(if (_verbose) {
-				outputf("        spill %s",_thisfrag->lirbuf->names->formatRef(i));
-			})
 		}
 #if defined NANOJIT_IA32
 		else if (pop && (rmask(rr) & x87Regs))
 		{
 			// pop the fpu result since it isn't used
 			FSTP(FST0);
 		}
-#endif
+#endif	
+	}
+
+	void Assembler::asm_spilli(LInsp i, Reservation *resv, bool pop)
+	{
+		int d = disp(resv);
+		Register rr = resv->reg;
+		bool quad = i->opcode() == LIR_param || i->isQuad();
+		asm_spill(rr, d, pop, quad);
+		if (d) 
+		{
+			verbose_only(if (_verbose) {
+				outputf("        spill %s",_thisfrag->lirbuf->names->formatRef(i));
+			})
+		}
 	}
 
 	void Assembler::asm_load64(LInsp ins)
 	{
 		LIns* base = ins->oprnd1();
 		int db = ins->oprnd2()->constval();
 		Reservation *resv = getresv(ins);
 		Register rr = resv->reg;
 
 		if (rr != UnknownReg && rmask(rr) & XmmRegs)
 		{
 			freeRsrcOf(ins, false);
-			Register rb = findRegFor(base, GpRegs);
+			Register rb;
+            if (base->isop(LIR_alloc)) {
+                rb = FP;
+                db += findMemFor(base);
+            } else {
+                rb = findRegFor(base, GpRegs);
+            }
 			SSE_LDQ(rr, db, rb);
 		}
 #if defined NANOJIT_AMD64
 		else if (rr != UnknownReg && rmask(rr) & GpRegs)
 		{
 			freeRsrcOf(ins, false);
 			Register rb = findRegFor(base, GpRegs);
 			LDQ(rr, db, rb);
@@ -609,17 +671,23 @@ namespace nanojit
             _allocator.addFree(rr);
 
 			freeRsrcOf(ins, false);
 		}
 #elif defined NANOJIT_IA32
 		else
 		{
 			int dr = disp(resv);
-			Register rb = findRegFor(base, GpRegs);
+			Register rb;
+            if (base->isop(LIR_alloc)) {
+                rb = FP;
+                db += findMemFor(base);
+            } else {
+                rb = findRegFor(base, GpRegs);
+            }
 			resv->reg = UnknownReg;
 
 			// don't use an fpu reg to simply load & store the value.
 			if (dr)
 				asm_mmq(FP, dr, rb, db);
 
 			freeRsrcOf(ins, false);
 
@@ -634,52 +702,84 @@ namespace nanojit
 	}
 
 	void Assembler::asm_store64(LInsp value, int dr, LInsp base)
 	{
 		if (value->isconstq())
 		{
 			// if a constant 64-bit value just store it now rather than
 			// generating a pointless store/load/store sequence
-			Register rb = findRegFor(base, GpRegs);
+			Register rb;
+            if (base->isop(LIR_alloc)) {
+                rb = FP;
+                dr += findMemFor(base);
+            } else {
+                rb = findRegFor(base, GpRegs);
+            }
 			const int32_t* p = (const int32_t*) (value-2);
 			STi(rb, dr+4, p[1]);
 			STi(rb, dr, p[0]);
             return;
 		}
 
 #if defined NANOJIT_IA32
-        if (value->isop(LIR_ldq) || value->isop(LIR_qjoin))
+        if (value->isop(LIR_ldq) || value->isop(LIR_ldqc) || value->isop(LIR_qjoin))
 		{
 			// value is 64bit struct or int64_t, or maybe a double.
 			// it may be live in an FPU reg.  Either way, don't
 			// put it in an FPU reg just to load & store it.
 
 			// a) if we know it's not a double, this is right.
 			// b) if we guarded that it's a double, this store could be on
 			// the side exit, copying a non-double.
 			// c) maybe it's a double just being stored.  oh well.
 
 			if (avmplus::AvmCore::use_sse2()) {
                 Register rv = findRegFor(value, XmmRegs);
-                Register rb = findRegFor(base, GpRegs);
+                Register rb;
+                if (base->isop(LIR_alloc)) {
+                    rb = FP;
+                    dr += findMemFor(base);
+                } else {
+                    rb = findRegFor(base, GpRegs);
+                }
                 SSE_STQ(dr, rb, rv);
 				return;
             }
 
 			int da = findMemFor(value);
-		    Register rb = findRegFor(base, GpRegs);
+		    Register rb;
+		    if (base->isop(LIR_alloc)) {
+		        rb = FP;
+		        dr += findMemFor(base);
+		    } else {
+		        rb = findRegFor(base, GpRegs);
+		    }
 		    asm_mmq(rb, dr, FP, da);
             return;
 		}
 
+		Register rb;
+		if (base->isop(LIR_alloc)) {
+		    rb = FP;
+		    dr += findMemFor(base);
+		} else {
+		    rb = findRegFor(base, GpRegs);
+		}
+
+		// if value already in a reg, use that, otherwise
+		// try to get it into XMM regs before FPU regs.
 		Reservation* rA = getresv(value);
+		Register rv;
 		int pop = !rA || rA->reg==UnknownReg;
- 		Register rv = findRegFor(value, avmplus::AvmCore::use_sse2() ? XmmRegs : FpRegs);
-		Register rb = findRegFor(base, GpRegs);
+		if (pop) {
+		    rv = findRegFor(value, avmplus::AvmCore::use_sse2() ? XmmRegs : FpRegs);
+		} else {
+		    rv = rA->reg;
+		}
 
 		if (rmask(rv) & XmmRegs) {
             SSE_STQ(dr, rb, rv);
 		} else {
 			FSTQ(pop, dr, rb);
 		}
 #elif defined NANOJIT_AMD64
 		/* If this is not a float operation, we can use GpRegs instead.
@@ -758,30 +858,33 @@ namespace nanojit
 		if (rr != UnknownReg)
 		{
 			// @todo -- add special-cases for 0 and 1
 			_allocator.retire(rr);
 			rR->reg = UnknownReg;
 			NanoAssert((rmask(rr) & FpRegs) != 0);
 
 			const double d = ins->constvalf();
+            const uint64_t q = ins->constvalq();
 			if (rmask(rr) & XmmRegs) {
-				if (d == 0.0) {
+				if (q == 0) {
+                    // compare the raw bits, since -0.0 == 0.0 as doubles
 					SSE_XORPDr(rr, rr);
 				} else if (d == 1.0) {
 					// 1.0 is extremely frequent and worth special-casing!
 					static const double k_ONE = 1.0;
 					LDSDm(rr, &k_ONE);
 				} else {
 					findMemFor(ins);
 					const int d = disp(rR);
 					SSE_LDQ(rr, d, FP);
 				}
 			} else {
-				if (d == 0.0) {
+				if (q == 0) {
+                    // compare the raw bits, since -0.0 == 0.0 as doubles
 					FLDZ();
 				} else if (d == 1.0) {
 					FLD1();
 				} else {
 					findMemFor(ins);
 					int d = disp(rR);
 					FLDQ(d,FP);
 				}
@@ -798,48 +901,48 @@ namespace nanojit
 			STi(FP,d,p[0]);
 		}
 #elif defined NANOJIT_AMD64
 		Reservation *rR = getresv(ins);
 		int64_t val = *(int64_t *)(ins - 2);
 
 		if (rR->reg != UnknownReg)
 		{
-            Register rr = rR->reg;
-		    freeRsrcOf(ins, false);
-			if (rmask(rr) & GpRegs)
+			if (rmask(rR->reg) & GpRegs)
 			{
-				LDQi(rr, val);
+				LDQi(rR->reg, val);
 			}
-			else if (rmask(rr) & XmmRegs)
+			else if (rmask(rR->reg) & XmmRegs)
 			{
 				if (ins->constvalf() == 0.0)
 				{
-					SSE_XORPDr(rr, rr);
+					SSE_XORPDr(rR->reg, rR->reg);
 				}
 				else
 				{
 					/* Get a short-lived register, not associated with instruction */
+					Register rd = rR->reg;
 					Register rs = registerAlloc(GpRegs);
-
-					SSE_MOVD(rr, rs);
+	
+					SSE_MOVD(rd, rs);
 					LDQi(rs, val);
 
 					_allocator.addFree(rs);
 				}
 			}
 		}
 		else
 		{
 			const int32_t* p = (const int32_t*) (ins-2);
 			int dr = disp(rR);
-		    freeRsrcOf(ins, false);
 			STi(FP, dr+4, p[1]);
 			STi(FP, dr, p[0]);
 		}
+
+		freeRsrcOf(ins, false);
 #endif
 	}
 	
 	bool Assembler::asm_qlo(LInsp ins, LInsp q)
 	{
 #if defined NANOJIT_IA32
 		if (!avmplus::AvmCore::use_sse2())
 		{
@@ -920,54 +1023,109 @@ namespace nanojit
 			FCHS();
 
 			// if we had more than one fpu reg, this is where
 			// we would move ra into rr if rr != ra.
 		}
 #endif
 	}
 
+    void Assembler::asm_arg(ArgSize sz, LInsp p, Register r)
+    {
+        if (sz == ARGSIZE_Q) 
+        {
+			// ref arg - use lea
+			if (r != UnknownReg)
+			{
+				// arg in specific reg
+				int da = findMemFor(p);
+				LEA(r, da, FP);
+			}
+			else
+			{
+				NanoAssert(0); // not supported
+			}
+		}
+        else if (sz == ARGSIZE_LO)
+		{
+			if (r != UnknownReg) {
+				// arg goes in specific register
+                if (p->isconst()) {
+					LDi(r, p->constval());
+                } else {
+            		Reservation* rA = getresv(p);
+                    if (rA) {
+                        if (rA->reg == UnknownReg) {
+                            // load it into the arg reg
+                            int d = findMemFor(p);
+                            if (p->isop(LIR_alloc)) {
+                                LEA(r, d, FP);
+                            } else {
+                                LD(r, d, FP);
+                            }
+                        } else {
+                            // it must be in a saved reg
+                            MR(r, rA->reg);
+                        }
+                    } 
+                    else {
+                        // this is the last use, so fine to assign it
+                        // to the scratch reg, it's dead after this point.
+    					findSpecificRegFor(p, r);
+                    }
+                }
+			}
+            else {
+				asm_pusharg(p);
+			}
+		}
+        else
+		{
+            NanoAssert(sz == ARGSIZE_F);
+			asm_farg(p);
+		}
+    }
+
 	void Assembler::asm_pusharg(LInsp p)
 	{
 		// arg goes on stack
 		Reservation* rA = getresv(p);
-		if (rA == 0)
+		if (rA == 0 && p->isconst())
 		{
-			if (p->isconst())
-			{
-				// small const we push directly
-				PUSHi(p->constval());
-			}
-			else
-			{
-				Register ra = findRegFor(p, GpRegs);
-				PUSHr(ra);
-			}
+			// small const we push directly
+			PUSHi(p->constval());
+		}
+		else if (rA == 0 || p->isop(LIR_alloc))
+		{
+			Register ra = findRegFor(p, GpRegs);
+			PUSHr(ra);
 		}
 		else if (rA->reg == UnknownReg)
 		{
 			PUSHm(disp(rA), FP);
 		}
 		else
 		{
 			PUSHr(rA->reg);
 		}
 	}
 
 	void Assembler::asm_farg(LInsp p)
 	{
 #if defined NANOJIT_IA32
+        NanoAssert(p->isQuad());
 		Register r = findRegFor(p, FpRegs);
 		if (rmask(r) & XmmRegs) {
 			SSE_STQ(0, SP, r); 
 		} else {
 			FSTPQ(0, SP);
 		}
-		PUSHr(ECX); // 2*pushr is smaller than sub
-		PUSHr(ECX);
+        SUBi(ESP,8);    // reserve 8 bytes of stack for the double argument
 #endif
 	}
 
 	void Assembler::asm_fop(LInsp ins)
 	{
 		LOpcode op = ins->opcode();
 #if defined NANOJIT_IA32
 		if (avmplus::AvmCore::use_sse2()) 
@@ -992,17 +1150,20 @@ namespace nanojit
 				ra = findSpecificRegFor(lhs, rr);
 			} else if ((rmask(ra) & XmmRegs) == 0) {
 				/* We need this case on AMD64, because it's possible that 
 				 * an earlier instruction has done a quadword load and reserved a 
 				 * GPR.  If so, ask for a new register.
 				 */
 				ra = findRegFor(lhs, XmmRegs);
 			}
-			// else, rA already has a register assigned.
+            else {
+    			// rA already has a register assigned but maybe not from the allow set
+                ra = findRegFor(lhs, allow);
+            }
 
 			if (lhs == rhs)
 				rb = ra;
 
 			if (op == LIR_fadd)
 				SSE_ADDSD(rr, rb);
 			else if (op == LIR_fsub)
 				SSE_SUBSD(rr, rb);
@@ -1185,36 +1346,107 @@ namespace nanojit
 				NanoAssertMsg(false, "Should not move data from GPR to XMM");
 			} else {
 				// xmm -> x87
 				NanoAssertMsg(false, "Should not move data from GPR/XMM to x87 FPU");
 			}
 		}
 	}
 
+    NIns * Assembler::asm_jmpcc(bool branchOnFalse, LIns *cond, NIns *targ)
+    {
+        LOpcode c = cond->opcode();
+        if (avmplus::AvmCore::use_sse2() && c != LIR_feq) {
+            LIns *lhs = cond->oprnd1();
+            LIns *rhs = cond->oprnd2();
+            if (c == LIR_flt) {
+                LIns *t = lhs; lhs = rhs; rhs = t;
+                c = LIR_fgt;
+            }
+            else if (c == LIR_fle) {
+                LIns *t = lhs; lhs = rhs; rhs = t;
+                c = LIR_fge;
+            }
+
+            if (c == LIR_fgt) {
+                if (branchOnFalse) { JNA(targ); } else { JA(targ); }
+            }
+            else { // if (c == LIR_fge)
+                if (branchOnFalse) { JNAE(targ); } else { JAE(targ); }
+            }
+            NIns *at = _nIns;
+            Reservation *rA, *rB;
+            findRegFor2(XmmRegs, lhs, rA, rhs, rB);
+            SSE_UCOMISD(rA->reg, rB->reg);
+            return at;
+        }
+
+    	if (branchOnFalse)
+			JP(targ);
+		else
+			JNP(targ);
+		NIns *at = _nIns;
+		asm_fcmp(cond);
+        return at;
+    }
+
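asm_jmpcc above (and asm_setcc below) rewrite flt/fle into fgt/fge with swapped operands before picking a condition. The reason is NaN behavior: UCOMISD reports unordered operands as ZF,PF,CF = 111, so the "below" family (JB/JBE, which tests CF) would come out true for NaN inputs, whereas the "above" family (JA: CF==0 and ZF==0; JAE: CF==0) is false for NaN, which is what the LIR comparisons require. A small model of the flag encodings (taken from the architecture manual, not this patch):

    #include <cmath>
    #include <cstdio>

    struct Flags { bool zf, pf, cf; };

    // UCOMISD's EFLAGS results: equal=100, less=001, greater=000, unordered=111.
    static Flags ucomisd(double l, double r) {
        if (std::isnan(l) || std::isnan(r)) return {true, true, true};
        if (l == r) return {true, false, false};
        if (l < r)  return {false, false, true};
        return {false, false, false};
    }

    int main() {
        Flags f = ucomisd(NAN, 1.0);
        printf("JA=%d JB=%d\n",
               !f.cf && !f.zf,   // false for NaN: safe for LIR_fgt after the swap
               (int)f.cf);       // true for NaN: why a "below" branch can't be used
    }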
+    void Assembler::asm_setcc(Register r, LIns *cond)
+    {
+        LOpcode c = cond->opcode();
+        if (avmplus::AvmCore::use_sse2() && c != LIR_feq) {
+    		MOVZX8(r,r);
+            LIns *lhs = cond->oprnd1();
+            LIns *rhs = cond->oprnd2();
+            if (c == LIR_flt) {
+                LIns *t = lhs; lhs = rhs; rhs = t;
+                SETA(r);
+            }
+            else if (c == LIR_fle) {
+                LIns *t = lhs; lhs = rhs; rhs = t;
+                SETAE(r);
+            }
+            else if (c == LIR_fgt) {
+                SETA(r);
+            }
+            else { // if (c == LIR_fge)
+                SETAE(r);
+            }
+            Reservation *rA, *rB;
+            findRegFor2(XmmRegs, lhs, rA, rhs, rB);
+            SSE_UCOMISD(rA->reg, rB->reg);
+            return;
+        }
+		// SETcc only sets low 8 bits, so extend 
+		MOVZX8(r,r);
+		SETNP(r);
+        asm_fcmp(cond);
+    }
+
 	void Assembler::asm_fcmp(LIns *cond)
 	{
 		LOpcode condop = cond->opcode();
 		NanoAssert(condop >= LIR_feq && condop <= LIR_fge);
 	    LIns* lhs = cond->oprnd1();
 	    LIns* rhs = cond->oprnd2();
 
         int mask;
 	    if (condop == LIR_feq)
 		    mask = 0x44;
 	    else if (condop == LIR_fle)
 		    mask = 0x41;
 	    else if (condop == LIR_flt)
 		    mask = 0x05;
         else if (condop == LIR_fge) {
             // swap, use le
+            condop = LIR_fle;
             LIns* t = lhs; lhs = rhs; rhs = t;
             mask = 0x41;
         } else { // if (condop == LIR_fgt)
             // swap, use lt
+            condop = LIR_flt;
             LIns* t = lhs; lhs = rhs; rhs = t;
 		    mask = 0x05;
         }
 
 #if defined NANOJIT_IA32
         if (avmplus::AvmCore::use_sse2())
         {
 #endif
@@ -1222,17 +1454,18 @@ namespace nanojit
             // GREATER_THAN: ZF,PF,CF <- 000;
             // LESS_THAN:    ZF,PF,CF <- 001;
             // EQUAL:        ZF,PF,CF <- 100;
 
             if (condop == LIR_feq && lhs == rhs) {
                 // nan check
                 Register r = findRegFor(lhs, XmmRegs);
                 SSE_UCOMISD(r, r);
-            } else {
+            } 
+            else {
 #if defined NANOJIT_IA32
                 evict(EAX);
                 TEST_AH(mask);
                 LAHF();
 #elif defined NANOJIT_AMD64
                 evict(RAX);
                 TEST_AL(mask);
                 POPr(RAX);
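The masks above pair with the JP/JNP and SETNP emitted by asm_jmpcc and asm_setcc: TEST AH,mask sets the parity flag from the masked byte, and 0x44/0x41/0x05 are chosen so that the masked flags have an odd number of bits set, leaving PF clear, exactly when the condition holds, while NaN's all-ones flags always mask to even parity. A standalone check of that property (flag values as LAHF lays them out in AH):

    #include <bitset>
    #include <cassert>

    // ZF is bit 6, PF bit 2, CF bit 0 of AH.
    enum { kEQ = 0x40, kLT = 0x01, kGT = 0x00, kNAN = 0x45 };

    // TEST AH,mask clears PF iff (ah & mask) has an odd population count.
    static bool holds(unsigned ah, unsigned mask) {
        return std::bitset<8>(ah & mask).count() & 1;
    }

    int main() {
        // feq, mask 0x44: true only for ordered-equal
        assert(holds(kEQ, 0x44) && !holds(kNAN, 0x44) && !holds(kLT, 0x44));
        // flt, mask 0x05: true only for ordered less-than
        assert(holds(kLT, 0x05) && !holds(kEQ, 0x05) && !holds(kGT, 0x05) && !holds(kNAN, 0x05));
        // fle, mask 0x41: true for less or equal, false for greater and NaN
        assert(holds(kLT, 0x41) && holds(kEQ, 0x41) && !holds(kGT, 0x41) && !holds(kNAN, 0x41));
    }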
@@ -1379,10 +1612,24 @@ namespace nanojit
     }
 #endif
 
 	void Assembler::nativePageSetup()
 	{
 		if (!_nIns)		 _nIns	   = pageAlloc();
 		if (!_nExitIns)  _nExitIns = pageAlloc(true);
 	}
+	
+	// enough room for n bytes
+    void Assembler::underrunProtect(int n)
+    {
+        NIns *eip = this->_nIns;
+        Page *p = (Page*)pageTop(eip-1);
+        NIns *top = (NIns*) &p->code[0];
+        if (eip - n < top) {
+			_nIns = pageAlloc(_inExit);
+            JMP(eip);
+        }
+    }
+
 	#endif /* FEATURE_NANOJIT */
 }
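underrunProtect is now a real function rather than the macro deleted from Nativei386.h below. Since code is emitted backwards, _nIns walks downward through the page; when fewer than n bytes remain above the page's code area, a fresh page is allocated and a JMP to the already-emitted stream at eip is planted there. A minimal model of the guard, assuming the Page layout from the surrounding code:

    #include <cstdint>

    // With downward emission, "room for n bytes" means the write cursor minus n
    // must not cross the start of the current page's code area (pageTop's role).
    static bool needs_new_page(uintptr_t nIns, uintptr_t page_code_start, int n) {
        return nIns - n < page_code_start;
    }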
--- a/js/src/nanojit/Nativei386.h
+++ b/js/src/nanojit/Nativei386.h
@@ -96,16 +96,17 @@ namespace nanojit
 		FirstReg = 0,
 		LastReg = 23,
 		UnknownReg = 24
 	} 
 	Register;
 
 	typedef int RegisterMask;
 
+	static const int NumSavedRegs = 3;
 	static const RegisterMask SavedRegs = 1<<EBX | 1<<EDI | 1<<ESI;
 	static const RegisterMask GpRegs = SavedRegs | 1<<EAX | 1<<ECX | 1<<EDX;
     static const RegisterMask XmmRegs = 1<<XMM0|1<<XMM1|1<<XMM2|1<<XMM3|1<<XMM4|1<<XMM5|1<<XMM6|1<<XMM7;
     static const RegisterMask x87Regs = 1<<FST0;
 	static const RegisterMask FpRegs = x87Regs | XmmRegs;
 	static const RegisterMask ScratchRegs = 1<<EAX | 1<<ECX | 1<<EDX | FpRegs;
 
 	static const RegisterMask AllowableFlagRegs = 1<<EAX |1<<ECX | 1<<EDX | 1<<EBX;
@@ -127,33 +128,22 @@ namespace nanojit
 	#define DECLARE_PLATFORM_REGALLOC()
 
 	#define DECLARE_PLATFORM_ASSEMBLER()	\
         const static Register argRegs[2], retRegs[2]; \
 		bool x87Dirty;						\
 		bool pad[3];\
 		void nativePageReset();\
 		void nativePageSetup();\
-        void asm_farg(LInsp);
+        void underrunProtect(int);\
+        void asm_farg(LInsp);\
+        void asm_align_code();
 		
 	#define swapptrs()  { NIns* _tins = _nIns; _nIns=_nExitIns; _nExitIns=_tins; }
 		
-	// enough room for n bytes
-	#define underrunProtect(n)									\
-		{														\
-			intptr_t u = n + sizeof(PageHeader)/sizeof(NIns) + 5; \
-			if ( !samepage(_nIns-u,_nIns-1) )					\
-			{													\
-				NIns *tt = _nIns; \
-				_nIns = pageAlloc(_inExit);						\
-				int d = tt-_nIns; \
-				JMP_long_nochk_offset(d);			\
-			}													\
-		}														\
-
 #define IMM32(i)	\
 	_nIns -= 4;		\
 	*((int32_t*)_nIns) = (int32_t)(i)
 
 #define MODRMs(r,d,b,l,i) \
 		NanoAssert(unsigned(r)<8 && unsigned(b)<8 && unsigned(i)<8); \
  		if ((d) == 0 && (b) != EBP) { \
 			_nIns -= 2; \
@@ -166,18 +156,21 @@ namespace nanojit
 			_nIns[2] = (uint8_t) (d); \
  		} else { \
  			IMM32(d); \
  			*(--_nIns) = (uint8_t) ( (l)<<6 | (i)<<3 | (b) ); \
  			*(--_nIns) = (uint8_t)    ( 2<<6 |   (r)<<3 | 4 ); \
  		}
 
 #define MODRMm(r,d,b) \
-		NanoAssert(unsigned(r)<8 && unsigned(b)<8); \
- 		if ((b) == ESP) { \
+		NanoAssert(unsigned(r)<8 && ((b)==UnknownReg || unsigned(b)<8)); \
+        if ((b) == UnknownReg) {\
+            IMM32(d);\
+            *(--_nIns) = (uint8_t) (0<<6 | (r)<<3 | 5);\
+        } else if ((b) == ESP) { \
  			MODRMs(r, d, b, 0, (Register)4); \
  		} \
 		else if ( (d) == 0 && (b) != EBP) { \
  			*(--_nIns) = (uint8_t) ( 0<<6 | (r)<<3 | (b) ); \
  		} else if (isS8(d)) { \
  			*(--_nIns) = (uint8_t) (d); \
  			*(--_nIns) = (uint8_t) ( 1<<6 | (r)<<3 | (b) ); \
  		} else { \
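The new UnknownReg branch in MODRMm emits mod=00 with r/m=101, the IA-32 encoding for [disp32] absolute addressing; this is what lets asm_store32 above hand in rb = UnknownReg for a constant base and store straight to an absolute address. A sketch of the ModRM byte it builds:

    #include <cassert>
    #include <cstdint>

    // mod=00, reg=r, r/m=101: displacement-only (absolute) addressing on IA-32.
    static uint8_t modrm_disp32(unsigned r) {
        return uint8_t((0 << 6) | (r << 3) | 5);
    }

    int main() {
        assert(modrm_disp32(0) == 0x05);  // e.g. "mov [disp32], eax" is 89 05 imm32
    }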
@@ -339,17 +332,17 @@ namespace nanojit
 	underrunProtect(5);			\
 	IMM32(i);					\
 	NanoAssert(((unsigned)r)<8); \
 	*(--_nIns) = (uint8_t) (0xb8 | (r) );		\
 	asm_output2("mov %s,%d",gpn(r),i); } while(0)
 
 #define ST(base,disp,reg) do {  \
 	ALUm(0x89,reg,disp,base);	\
-	asm_output3("mov %d(%s),%s",disp,gpn(base),gpn(reg)); } while(0)
+    asm_output3("mov %d(%s),%s",disp,base==UnknownReg?"0":gpn(base),gpn(reg)); } while(0)
 
 #define STi(base,disp,imm)	do { \
 	underrunProtect(12);	\
 	IMM32(imm);				\
 	MODRMm(0, disp, base);	\
 	*(--_nIns) = 0xc7;		\
 	asm_output3("mov %d(%s),%d",disp,gpn(base),imm); } while(0)
 
@@ -492,17 +485,17 @@ namespace nanojit
 #define LDSDm(r,addr)do {     \
     underrunProtect(8); \
 	const double* daddr = addr; \
     IMM32(int32_t(daddr));\
     *(--_nIns) = uint8_t(((r)&7)<<3|5); \
     *(--_nIns) = 0x10;\
     *(--_nIns) = 0x0f;\
     *(--_nIns) = 0xf2;\
-    asm_output3("movsd %s,%p // =%f",gpn(r),daddr,*daddr); \
+    asm_output3("movsd %s,(#%p) // =%f",gpn(r),(void*)daddr,*daddr); \
     } while(0)
 
 #define STSD(d,b,r)do {     \
     SSEm(0xf20f11, (r)&7, (d), (b)); \
     asm_output3("movsd %d(%s),%s",(d),gpn(b),gpn(r)); \
     } while(0)
 
 #define SSE_LDQ(r,d,b)do {  \
@@ -534,71 +527,80 @@ namespace nanojit
 		NanoAssert(_is_gp_reg_(s)); \
 		NanoAssert(_is_xmm_reg_(d)); \
 		SSE(0x660f6e, (d)&7, (s)&7); \
 	} \
     asm_output2("movd %s,%s",gpn(d),gpn(s)); \
     } while(0)
 
 #define SSE_MOVSD(rd,rs) do{ \
+    NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\
     SSE(0xf20f10, (rd)&7, (rs)&7); \
     asm_output2("movsd %s,%s",gpn(rd),gpn(rs)); \
     } while(0)
 
 #define SSE_MOVDm(d,b,xrs) do {\
+    NanoAssert(_is_xmm_reg_(xrs) && _is_gp_reg_(b));\
     SSEm(0x660f7e, (xrs)&7, d, b);\
     asm_output3("movd %d(%s),%s", d, gpn(b), gpn(xrs));\
     } while(0)
 
 #define SSE_ADDSD(rd,rs) do{ \
+    NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\
     SSE(0xf20f58, (rd)&7, (rs)&7); \
     asm_output2("addsd %s,%s",gpn(rd),gpn(rs)); \
     } while(0)
 
 #define SSE_ADDSDm(r,addr)do {     \
     underrunProtect(8); \
+    NanoAssert(_is_xmm_reg_(r));\
 	const double* daddr = addr; \
     IMM32(int32_t(daddr));\
     *(--_nIns) = uint8_t(((r)&7)<<3|5); \
     *(--_nIns) = 0x58;\
     *(--_nIns) = 0x0f;\
     *(--_nIns) = 0xf2;\
-    asm_output3("addsd %s,%p // =%f",gpn(r),daddr,*daddr); \
+    asm_output3("addsd %s,%p // =%f",gpn(r),(void*)daddr,*daddr); \
     } while(0)
 
 #define SSE_SUBSD(rd,rs) do{ \
+    NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\
     SSE(0xf20f5c, (rd)&7, (rs)&7); \
     asm_output2("subsd %s,%s",gpn(rd),gpn(rs)); \
     } while(0)
 #define SSE_MULSD(rd,rs) do{ \
+    NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\
     SSE(0xf20f59, (rd)&7, (rs)&7); \
     asm_output2("mulsd %s,%s",gpn(rd),gpn(rs)); \
     } while(0)
 #define SSE_DIVSD(rd,rs) do{ \
+    NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\
     SSE(0xf20f5e, (rd)&7, (rs)&7); \
     asm_output2("divsd %s,%s",gpn(rd),gpn(rs)); \
     } while(0)
 #define SSE_UCOMISD(rl,rr) do{ \
+    NanoAssert(_is_xmm_reg_(rl) && _is_xmm_reg_(rr));\
     SSE(0x660f2e, (rl)&7, (rr)&7); \
     asm_output2("ucomisd %s,%s",gpn(rl),gpn(rr)); \
     } while(0)
 
 #define CVTSI2SDm(xr,d,b) do{ \
+    NanoAssert(_is_xmm_reg_(xr) && _is_gp_reg_(b));\
     SSEm(0xf20f2a, (xr)&7, (d), (b)); \
     asm_output3("cvtsi2sd %s,%d(%s)",gpn(xr),(d),gpn(b)); \
     } while(0)
 
 #define SSE_XORPD(r, maskaddr) do {\
-    underrunProtect(8); \
+	underrunProtect(8); \
     IMM32(maskaddr);\
     *(--_nIns) = uint8_t(((r)&7)<<3|5); \
     *(--_nIns) = 0x57;\
     *(--_nIns) = 0x0f;\
     *(--_nIns) = 0x66;\
-    asm_output2("xorpd %s,[0x%p]",gpn(r),(maskaddr));\
+    asm_output2("xorpd %s,[0x%p]",gpn(r),(void*)(maskaddr));\
     } while(0)
 
 #define SSE_XORPDr(rd,rs) do{ \
     SSE(0x660f57, (rd)&7, (rs)&7); \
     asm_output2("xorpd %s,%s",gpn(rd),gpn(rs)); \
     } while(0)
 
 // floating point unit
@@ -652,19 +654,29 @@ namespace nanojit
 #define FDIVR(d,b)	do { FPUm(0xdc07, d, b);		asm_output2("fdivr %d(%s)",d,gpn(b)); } while(0)
 #define FINCSTP()	do { FPUc(0xd9f7);				asm_output2("fincstp"); } while(0)
 #define FSTP(r)		do { FPU(0xddd8, r&7);			asm_output1("fstp %s",fpn(r)); fpu_pop();} while(0)
 #define FCOMP()		do { FPUc(0xD8D9);				asm_output("fcomp"); fpu_pop();} while(0)
 #define FCOMPP()	do { FPUc(0xDED9);				asm_output("fcompp"); fpu_pop();fpu_pop();} while(0)
 #define FLDr(r)		do { FPU(0xd9c0,r);				asm_output1("fld %s",fpn(r)); fpu_push(); } while(0)
 #define EMMS()		do { FPUc(0x0f77);				asm_output("emms"); } while (0)
 
+// standard direct call
 #define CALL(c)	do { \
   underrunProtect(5);					\
   int offset = (c->_address) - ((int)_nIns); \
   IMM32( (uint32_t)offset );	\
   *(--_nIns) = 0xE8;		\
   verbose_only(asm_output1("call %s",(c->_name));) \
   debug_only(if ((c->_argtypes&3)==ARGSIZE_F) fpu_push();)\
 } while (0)
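The displacement math in CALL relies on backwards emission: a near call's rel32 is measured from the byte after the 5-byte instruction, and since instructions are written back to front, _nIns already points at that following byte when the offset is computed, before the E8 opcode and imm32 are laid down beneath it. A sketch under those assumptions (the helper name is illustrative):

    #include <cstdint>

    // rel32 for an E8 near call: target minus the address of the *next*
    // instruction, which is what _nIns holds during backwards emission.
    static int32_t call_rel32(intptr_t target, intptr_t next_ins) {
        return int32_t(target - next_ins);
    }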
 
+// indirect call thru register
+#define CALLr(c,r)	do { \
+  underrunProtect(2);\
+  ALU(0xff, 2, (r));\
+  verbose_only(asm_output1("call %s",gpn(r));) \
+  debug_only(if ((c->_argtypes&3)==ARGSIZE_F) fpu_push();)\
+} while (0)
+
+
 }
 #endif // __nanojit_Nativei386__
--- a/js/src/nanojit/RegAlloc.cpp
+++ b/js/src/nanojit/RegAlloc.cpp
@@ -67,69 +67,66 @@ namespace nanojit
 	void RegAlloc::removeFree(Register r)
 	{
 		NanoAssert(isFree(r));
 		free &= ~rmask(r);
 	}
 
 	void RegAlloc::addActive(Register r, LIns* v)
 	{
-		//addActiveCount++;
 		NanoAssert(v && r != UnknownReg && active[r] == NULL );
 		active[r] = v;
+        useActive(r);
 	}
 
+    void RegAlloc::useActive(Register r)
+    {
+        NanoAssert(r != UnknownReg && active[r] != NULL);
+        usepri[r] = priority++;
+    }
+
 	void RegAlloc::removeActive(Register r)
 	{
 		//registerReleaseCount++;
 		NanoAssert(r != UnknownReg);
 		NanoAssert(active[r] != NULL);
 
 		// remove the given register from the active list
 		active[r] = NULL;
 	}
 
-	LIns* RegAlloc::getActive(Register r)
-	{
-		NanoAssert(r != UnknownReg);
-		return active[r];
-	}
-
 	void RegAlloc::retire(Register r)
 	{
 		NanoAssert(r != UnknownReg);
 		NanoAssert(active[r] != NULL);
 		active[r] = NULL;
 		free |= rmask(r);
 	}
 
-	// scan table for instruction with longest span
-	LIns* Assembler::findVictim(RegAlloc &regs, RegisterMask allow, RegisterMask prefer)
+	// scan table for instruction with the lowest priority, meaning it is used
+    // furthest in the future.
+	LIns* Assembler::findVictim(RegAlloc &regs, RegisterMask allow)
 	{
-		NanoAssert(allow != 0 && (allow&prefer)==prefer);
-		LIns *i, *a=0, *p = 0;
-        int acost=10, pcost=10;
+		NanoAssert(allow != 0);
+		LIns *i, *a=0;
+        int allow_pri = 0x7fffffff;
 		for (Register r=FirstReg; r <= LastReg; r = nextreg(r))
 		{
             if ((allow & rmask(r)) && (i = regs.getActive(r)) != 0)
             {
-                int cost = getresv(i)->cost;
-                if (!a || cost < acost || cost == acost && nbr(i) < nbr(a)) {
+                int pri = canRemat(i) ? 0 : regs.getPriority(r);
+                if (!a || pri < allow_pri) {
                     a = i;
-                    acost = cost;
-                }
-                if (prefer & rmask(r)) {
-                    if (!p || cost < pcost || cost == pcost && nbr(i) < nbr(p)) {
-                        p = i;
-                        pcost = cost;
-                    }
+                    allow_pri = pri;
                 }
 			}
 		}
-        return acost < pcost ? a : p;
+        NanoAssert(a != 0);
+        return a;
 	}
 
 	#ifdef  NJ_VERBOSE
 	/* static */ void RegAlloc::formatRegisters(RegAlloc& regs, char* s, Fragment *frag)
 	{
 		if (!frag || !frag->lirbuf)
 			return;
 		LirNameMap *names = frag->lirbuf->names;
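The priority machinery replacing the old cost field amounts to least-recently-used eviction: addActive and useActive stamp a register with a monotonically increasing counter, and findVictim evicts the allowed register with the smallest stamp, with rematerializable values (canRemat) treated as priority 0 since they can be rebuilt without a spill. A toy model of the policy (stamp[] stands in for usepri[]):

    #include <cstdint>
    #include <cstdio>

    int main() {
        int32_t stamp[4] = {7, 2, 9, 5};  // higher = touched more recently
        uint32_t allow = 0xD;             // registers 0, 2, 3 allowed; 1 is not
        int victim = -1;
        int32_t best = INT32_MAX;
        for (int r = 0; r < 4; r++) {
            if ((allow & (1u << r)) && stamp[r] < best) {
                victim = r;
                best = stamp[r];
            }
        }
        printf("evict r%d\n", victim);    // r3 (stamp 5): oldest allowed use
    }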
--- a/js/src/nanojit/RegAlloc.h
+++ b/js/src/nanojit/RegAlloc.h
@@ -46,37 +46,50 @@ namespace nanojit
 	inline RegisterMask rmask(Register r)
 	{
 		return 1 << r;
 	}
 
 	class RegAlloc MMGC_SUBCLASS_DECL
 	{
 		public:
-			RegAlloc() {}
+            RegAlloc() : free(0), used(0), priority(0) {}
 			void	clear();
 			bool	isFree(Register r); 
 			void	addFree(Register r);
 			void	removeFree(Register r);
 			void	addActive(Register r, LIns* ins);
+            void    useActive(Register r);
 			void	removeActive(Register r);
-			LIns*	getActive(Register r); 
 			void	retire(Register r);
+            bool    isValid() {
+                return (free|used) != 0;
+            }
+
+            int32_t getPriority(Register r) {
+                NanoAssert(r != UnknownReg && active[r]);
+                return usepri[r];
+            }
+
+	        LIns* getActive(Register r) {
+		        NanoAssert(r != UnknownReg);
+		        return active[r];
+	        }
 
 			debug_only( uint32_t	countFree(); )
 			debug_only( uint32_t	countActive(); )
 			debug_only( void		checkCount(); )
 			debug_only( bool		isConsistent(Register r, LIns* v); )
 			debug_only( uint32_t	count; )
 			debug_only( RegisterMask managed; )    // bitfield of 0..NJ_MAX_REGISTERS denoting which are under our management                     
 
-			// RegisterMask is a 32-bit value, so we can never have more than 32 active.
-			// hardcode 32 here in case we have non-contiguous register numbers
-			LIns*	active[32];  // active[r] = OP that defines r
+			LIns*	active[LastReg + 1];  // active[r] = OP that defines r
+			int32_t usepri[LastReg + 1]; // used priority. lower = more likely to spill.
 			RegisterMask	free;
 			RegisterMask	used;
+            int32_t         priority;
 
 			verbose_only( static void formatRegisters(RegAlloc& regs, char* s, Fragment*); )
 
 			DECLARE_PLATFORM_REGALLOC()
 	};
 }
 #endif // __nanojit_RegAlloc__
--- a/js/src/nanojit/avmplus.h
+++ b/js/src/nanojit/avmplus.h
@@ -321,16 +321,18 @@ public:
     GetGCHeap()
     {
         return &heap;
     }
 };
 
 #define DWB(x) x
 #define DRCWB(x) x
+#define WB(gc, container, addr, value) do { *(addr) = (value); } while(0)
+#define WBRC(gc, container, addr, value) do { *(addr) = (value); } while(0)
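In this standalone shim the MMgc write barriers compile away to plain stores, so nanojit code can keep its barrier annotations without linking MMgc. For example (the Node type is illustrative):

    // Same expansion as the shim above: the barrier is just the assignment.
    #define WB(gc, container, addr, value) do { *(addr) = (value); } while(0)

    struct Node { Node* next; };
    void link(Node* a, Node* b) { WB(0, a, &a->next, b); }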
 
 #define MMGC_MEM_TYPE(x)
 
 typedef int FunctionID;
 
 namespace avmplus
 {
     struct InterpState
--- a/js/src/nanojit/nanojit.h
+++ b/js/src/nanojit/nanojit.h
@@ -37,16 +37,18 @@
  * ***** END LICENSE BLOCK ***** */
 
 #ifndef __nanojit_h__
 #define __nanojit_h__
 
 #include <stddef.h>
 #include "avmplus.h"
 
+#ifdef FEATURE_NANOJIT
+
 #ifdef AVMPLUS_IA32
 #define NANOJIT_IA32
 #elif AVMPLUS_ARM
 #define NANOJIT_ARM
 #elif AVMPLUS_PPC
 #define NANOJIT_PPC
 #elif AVMPLUS_AMD64
 #define NANOJIT_AMD64
@@ -68,16 +70,18 @@ namespace nanojit
 	class RegAlloc;
 	typedef avmplus::AvmCore AvmCore;
 	typedef avmplus::OSDep OSDep;
 	typedef avmplus::GCSortedMap<const void*,Fragment*,avmplus::LIST_GCObjects> FragmentMap;
 	typedef avmplus::SortedMap<SideExit*,RegAlloc*,avmplus::LIST_GCObjects> RegAllocMap;
 	typedef avmplus::List<LIns*,avmplus::LIST_NonGCObjects>	InsList;
 	typedef avmplus::List<char*, avmplus::LIST_GCObjects> StringList;
 
+    const uint32_t MAXARGS = 8;
+
 	#if defined(_MSC_VER) && _MSC_VER < 1400
 		static void NanoAssertMsgf(bool a,const char *f,...) {}
 		static void NanoAssertMsg(bool a,const char *m) {}
 		static void NanoAssert(bool a) {}
 	#elif defined(_DEBUG)
 		
 		#define __NanoAssertMsgf(a, file_, line_, f, ...)  \
 			if (!(a)) { \
@@ -108,22 +112,22 @@ namespace nanojit
 #define NJ_PROFILE 1
 #endif
 
 #ifdef NJ_VERBOSE
 	#include <stdio.h>
 	#define verbose_output						if (verbose_enabled()) Assembler::output
 	#define verbose_outputf						if (verbose_enabled()) Assembler::outputf
 	#define verbose_enabled()					(_verbose)
-	#define verbose_only(x)						x
+	#define verbose_only(...)					__VA_ARGS__
 #else
 	#define verbose_output
 	#define verbose_outputf
 	#define verbose_enabled()
-	#define verbose_only(x)
+	#define verbose_only(...)
 #endif /*NJ_VERBOSE*/
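Making verbose_only variadic matters because several call sites contain top-level commas, e.g. the asm_call hunk above passes outputf("        %p:", _nIns) inside the macro argument; the old single-parameter form would split that at the comma, while __VA_ARGS__ stitches the pieces back together. A minimal illustration:

    #include <cstdio>

    #define verbose_only(...) __VA_ARGS__

    int main() {
        // Fine with the variadic form; with "#define verbose_only(x) x" the
        // preprocessor rejects this as a macro call with too many arguments.
        verbose_only(printf("%d:%d\n", 1, 2);)
    }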
 
 #ifdef _DEBUG
 	#define debug_only(x)			x
 #else
 	#define debug_only(x)
 #endif /* DEBUG */
 
@@ -167,9 +171,10 @@ namespace nanojit
 
 #include "Native.h"
 #include "LIR.h"
 #include "RegAlloc.h"
 #include "Fragmento.h"
 #include "Assembler.h"
 #include "TraceTreeDrawer.h"
 
+#endif // FEATURE_NANOJIT
 #endif // __nanojit_h__
deleted file mode 100644
--- a/js/src/t/crypto-sha1.js
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * A JavaScript implementation of the Secure Hash Algorithm, SHA-1, as defined
- * in FIPS PUB 180-1
- * Version 2.1a Copyright Paul Johnston 2000 - 2002.
- * Other contributors: Greg Holt, Andrew Kepert, Ydnar, Lostinet
- * Distributed under the BSD License
- * See http://pajhome.org.uk/crypt/md5 for details.
- */
-
-/*
- * Configurable variables. You may need to tweak these to be compatible with
- * the server-side, but the defaults work in most cases.
- */
-var hexcase = 0;  /* hex output format. 0 - lowercase; 1 - uppercase        */
-var b64pad  = ""; /* base-64 pad character. "=" for strict RFC compliance   */
-var chrsz   = 8;  /* bits per input character. 8 - ASCII; 16 - Unicode      */
-
-/*
- * These are the functions you'll usually want to call
- * They take string arguments and return either hex or base-64 encoded strings
- */
-function hex_sha1(s){return binb2hex(core_sha1(str2binb(s),s.length * chrsz));}
-function b64_sha1(s){return binb2b64(core_sha1(str2binb(s),s.length * chrsz));}
-function str_sha1(s){return binb2str(core_sha1(str2binb(s),s.length * chrsz));}
-function hex_hmac_sha1(key, data){ return binb2hex(core_hmac_sha1(key, data));}
-function b64_hmac_sha1(key, data){ return binb2b64(core_hmac_sha1(key, data));}
-function str_hmac_sha1(key, data){ return binb2str(core_hmac_sha1(key, data));}
-
-/*
- * Perform a simple self-test to see if the VM is working
- */
-function sha1_vm_test()
-{
-  return hex_sha1("abc") == "a9993e364706816aba3e25717850c26c9cd0d89d";
-}
-
-/*
- * Calculate the SHA-1 of an array of big-endian words, and a bit length
- */
-function core_sha1(x, len)
-{
-  /* append padding */
-  x[len >> 5] |= 0x80 << (24 - len % 32);
-  x[((len + 64 >> 9) << 4) + 15] = len;
-
-  var w = Array(80);
-  var a =  1732584193;
-  var b = -271733879;
-  var c = -1732584194;
-  var d =  271733878;
-  var e = -1009589776;
-
-  for(var i = 0; i < x.length; i += 16)
-  {
-    var olda = a;
-    var oldb = b;
-    var oldc = c;
-    var oldd = d;
-    var olde = e;
-
-    for(var j = 0; j < 80; j++)
-    {
-      if(j < 16) w[j] = x[i + j];
-      else w[j] = rol(w[j-3] ^ w[j-8] ^ w[j-14] ^ w[j-16], 1);
-      var t = safe_add(safe_add(rol(a, 5), sha1_ft(j, b, c, d)),
-                       safe_add(safe_add(e, w[j]), sha1_kt(j)));
-      e = d;
-      d = c;
-      c = rol(b, 30);
-      b = a;
-      a = t;
-    }
-
-    a = safe_add(a, olda);
-    b = safe_add(b, oldb);
-    c = safe_add(c, oldc);
-    d = safe_add(d, oldd);
-    e = safe_add(e, olde);
-  }
-  return Array(a, b, c, d, e);
-
-}
-
-/*
- * Perform the appropriate triplet combination function for the current
- * iteration
- */
-function sha1_ft(t, b, c, d)
-{
-  if(t < 20) return (b & c) | ((~b) & d);
-  if(t < 40) return b ^ c ^ d;
-  if(t < 60) return (b & c) | (b & d) | (c & d);
-  return b ^ c ^ d;
-}
-
-/*
- * Determine the appropriate additive constant for the current iteration
- */
-function sha1_kt(t)
-{
-  return (t < 20) ?  1518500249 : (t < 40) ?  1859775393 :
-         (t < 60) ? -1894007588 : -899497514;
-}
-
-/*
- * Calculate the HMAC-SHA1 of a key and some data
- */
-function core_hmac_sha1(key, data)
-{
-  var bkey = str2binb(key);
-  if(bkey.length > 16) bkey = core_sha1(bkey, key.length * chrsz);
-
-  var ipad = Array(16), opad = Array(16);
-  for(var i = 0; i < 16; i++)
-  {
-    ipad[i] = bkey[i] ^ 0x36363636;
-    opad[i] = bkey[i] ^ 0x5C5C5C5C;
-  }
-
-  var hash = core_sha1(ipad.concat(str2binb(data)), 512 + data.length * chrsz);
-  return core_sha1(opad.concat(hash), 512 + 160);
-}
-
-/*
- * Add integers, wrapping at 2^32. This uses 16-bit operations internally
- * to work around bugs in some JS interpreters.
- */
-function safe_add(x, y)
-{
-  var lsw = (x & 0xFFFF) + (y & 0xFFFF);
-  var msw = (x >> 16) + (y >> 16) + (lsw >> 16);
-  return (msw << 16) | (lsw & 0xFFFF);
-}
-
-/*
- * Bitwise rotate a 32-bit number to the left.
- */
-function rol(num, cnt)
-{
-  return (num << cnt) | (num >>> (32 - cnt));
-}
-
-/*
- * Convert an 8-bit or 16-bit string to an array of big-endian words
- * In 8-bit function, characters >255 have their hi-byte silently ignored.
- */
-function str2binb(str)
-{
-  var bin = Array();
-  var mask = (1 << chrsz) - 1;
-  for(var i = 0; i < str.length * chrsz; i += chrsz)
-    bin[i>>5] |= (str.charCodeAt(i / chrsz) & mask) << (32 - chrsz - i%32);
-  return bin;
-}
-
-/*
- * Convert an array of big-endian words to a string
- */
-function binb2str(bin)
-{
-  var str = "";
-  var mask = (1 << chrsz) - 1;
-  for(var i = 0; i < bin.length * 32; i += chrsz)
-    str += String.fromCharCode((bin[i>>5] >>> (32 - chrsz - i%32)) & mask);
-  return str;
-}
-
-/*
- * Convert an array of big-endian words to a hex string.
- */
-function binb2hex(binarray)
-{
-  var hex_tab = hexcase ? "0123456789ABCDEF" : "0123456789abcdef";
-  var str = "";
-  for(var i = 0; i < binarray.length * 4; i++)
-  {
-    str += hex_tab.charAt((binarray[i>>2] >> ((3 - i%4)*8+4)) & 0xF) +
-           hex_tab.charAt((binarray[i>>2] >> ((3 - i%4)*8  )) & 0xF);
-  }
-  return str;
-}
-
-/*
- * Convert an array of big-endian words to a base-64 string
- */
-function binb2b64(binarray)
-{
-  var tab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-  var str = "";
-  for(var i = 0; i < binarray.length * 4; i += 3)
-  {
-    var triplet = (((binarray[i   >> 2] >> 8 * (3 -  i   %4)) & 0xFF) << 16)
-                | (((binarray[i+1 >> 2] >> 8 * (3 - (i+1)%4)) & 0xFF) << 8 )
-                |  ((binarray[i+2 >> 2] >> 8 * (3 - (i+2)%4)) & 0xFF);
-    for(var j = 0; j < 4; j++)
-    {
-      if(i * 8 + j * 6 > binarray.length * 32) str += b64pad;
-      else str += tab.charAt((triplet >> 6*(3-j)) & 0x3F);
-    }
-  }
-  return str;
-}
-
-
-var plainText = "Two households, both alike in dignity,\n\
-In fair Verona, where we lay our scene,\n\
-From ancient grudge break to new mutiny,\n\
-Where civil blood makes civil hands unclean.\n\
-From forth the fatal loins of these two foes\n\
-A pair of star-cross'd lovers take their life;\n\
-Whole misadventured piteous overthrows\n\
-Do with their death bury their parents' strife.\n\
-The fearful passage of their death-mark'd love,\n\
-And the continuance of their parents' rage,\n\
-Which, but their children's end, nought could remove,\n\
-Is now the two hours' traffic of our stage;\n\
-The which if you with patient ears attend,\n\
-What here shall miss, our toil shall strive to mend.";
-
-for (var i = 0; i <4; i++) {
-    plainText += plainText;
-}
-
-var sha1Output = hex_sha1(plainText);