Synced nanojit with TT tip.
author     David Anderson <danderson@mozilla.com>
date       Wed, 16 Jul 2008 14:21:31 -0700
changeset  17687 f2f4b2123e13059ccd074fe7c9e1d8a76fe2dec5
parent     17686 d0a717cd415e61daf864175e755ced73c2f791b2
child      17688 2c56d87c195a6e64dcbbf16bdb82e92079233119
push id    1452
push user  shaver@mozilla.com
push date  Fri, 22 Aug 2008 00:08:22 +0000
milestone  1.9.1a1pre
js/src/jstracer.cpp
js/src/nanojit/Assembler.cpp
js/src/nanojit/Assembler.h
js/src/nanojit/Fragmento.cpp
js/src/nanojit/Fragmento.h
js/src/nanojit/LIR.cpp
js/src/nanojit/LIR.h
js/src/nanojit/Native.h
js/src/nanojit/NativeARM.h
js/src/nanojit/NativeThumb.cpp
js/src/nanojit/NativeThumb.h
js/src/nanojit/Nativei386.cpp
js/src/nanojit/Nativei386.h
--- a/js/src/jstracer.cpp
+++ b/js/src/jstracer.cpp
@@ -403,23 +403,23 @@ public:
             *m++ = getStoreType(*vp));
         return out->insGuard(v, c, x);
     }
 
     /* Sink all type casts of stack stores into the side exit by simply storing the original
        (uncast) value. Each guard generates the side exit map based on the types of the
        last stores to every stack location, so it's safe not to perform them on-trace. */
     virtual LInsp insStore(LIns* value, LIns* base, LIns* disp) {
-        if (base == _fragment->sp && isPromoteInt(value))
+        if (base == _fragment->lirbuf->sp && isPromoteInt(value))
             value = demote(out, value);
         return out->insStore(value, base, disp);
     }
 
     virtual LInsp insStorei(LIns* value, LIns* base, int32_t d) {
-        if (base == _fragment->sp && isPromoteInt(value))
+        if (base == _fragment->lirbuf->sp && isPromoteInt(value))
             value = demote(out, value);
         return out->insStorei(value, base, d);
     }
 };
 
 TraceRecorder::TraceRecorder(JSContext* cx, Fragmento* fragmento, Fragment* _fragment)
 {
     this->cx = cx;
@@ -486,25 +486,25 @@ TraceRecorder::TraceRecorder(JSContext* 
             *m++ = getCoercedType(*vp)
         );
     } else {
         /* recompiling the trace, we already have a fragment info structure */
         fragmentInfo = (VMFragmentInfo*)fragment->vmprivate;
     }
     fragment->vmprivate = fragmentInfo;
 
-    fragment->state = lir->insImm8(LIR_param, Assembler::argRegs[0], 0);
-    fragment->param1 = lir->insImm8(LIR_param, Assembler::argRegs[1], 0);
-    fragment->sp = lir->insLoadi(fragment->state, offsetof(InterpState, sp));
-    fragment->rp = lir->insLoadi(fragment->state, offsetof(InterpState, rp));
-    cx_ins = lir->insLoadi(fragment->state, offsetof(InterpState, cx));
+    fragment->lirbuf->state = lir->insParam(0);
+    fragment->lirbuf->param1 = lir->insParam(1);
+    fragment->lirbuf->sp = lir->insLoadi(fragment->lirbuf->state, offsetof(InterpState, sp));
+    fragment->lirbuf->rp = lir->insLoadi(fragment->lirbuf->state, offsetof(InterpState, rp));
+    cx_ins = lir->insLoadi(fragment->lirbuf->state, offsetof(InterpState, cx));
 #ifdef DEBUG
-    lirbuf->names->addName(fragment->state, "state");
-    lirbuf->names->addName(fragment->sp, "sp");
-    lirbuf->names->addName(fragment->rp, "rp");
+    lirbuf->names->addName(fragment->lirbuf->state, "state");
+    lirbuf->names->addName(fragment->lirbuf->sp, "sp");
+    lirbuf->names->addName(fragment->lirbuf->rp, "rp");
     lirbuf->names->addName(cx_ins, "cx");
 #endif
 
     uint8* m = fragmentInfo->typeMap;
     FORALL_SLOTS_IN_PENDING_FRAMES(cx, fragmentInfo->ngslots, fragmentInfo->gslots,
                                    entryFrame, entryFrame,
         import(vp, *m, vpname, vpnum);
         m++
@@ -849,20 +849,20 @@ TraceRecorder::import(jsval* p, uint8& t
        not me, so don't blame the messenger. */
     ptrdiff_t offset = -fragmentInfo->nativeStackBase + nativeFrameOffset(p) + 8;
     if (TYPEMAP_GET_TYPE(t) == JSVAL_INT) { /* demoted */
         JS_ASSERT(isInt32(*p));
         /* Ok, we have a valid demotion attempt pending, so insert an integer
            read and promote it to double since all arithmetic operations expect
            to see doubles on entry. The first op to use this slot will emit a
            f2i cast which will cancel out the i2f we insert here. */
-        ins = lir->ins1(LIR_i2f, lir->insLoadi(fragment->sp, offset));
+        ins = lir->ins1(LIR_i2f, lir->insLoadi(fragment->lirbuf->sp, offset));
     } else {
         JS_ASSERT(isNumber(*p) == (TYPEMAP_GET_TYPE(t) == JSVAL_DOUBLE));
-        ins = lir->insLoad(t == JSVAL_DOUBLE ? LIR_ldq : LIR_ld, fragment->sp, offset);
+        ins = lir->insLoad(t == JSVAL_DOUBLE ? LIR_ldq : LIR_ld, fragment->lirbuf->sp, offset);
     }
     tracker.set(p, ins);
 #ifdef DEBUG
     char name[16];
     JS_ASSERT(strlen(prefix) < 10);
     JS_snprintf(name, sizeof name, "$%s%d", prefix, index);
     lirbuf->names->addName(ins, name);
     static const char* typestr[] = {
@@ -874,17 +874,17 @@ TraceRecorder::import(jsval* p, uint8& t
 
 /* Update the tracker. If the value is part of any argv/vars/stack of any
    currently active frame (onFrame), then issue a write back store. */
 void
 TraceRecorder::set(jsval* p, LIns* i)
 {
     tracker.set(p, i);
     if (onFrame(p))
-        lir->insStorei(i, fragment->sp, -fragmentInfo->nativeStackBase + nativeFrameOffset(p) + 8);
+        lir->insStorei(i, fragment->lirbuf->sp, -fragmentInfo->nativeStackBase + nativeFrameOffset(p) + 8);
 }
 
 LIns*
 TraceRecorder::get(jsval* p)
 {
     return tracker.get(p);
 }
 
@@ -1027,19 +1027,19 @@ void
 TraceRecorder::stop()
 {
     fragment->blacklist();
 }
 
 int
 nanojit::StackFilter::getTop(LInsp guard)
 {
-    if (sp == frag->sp)
+    if (sp == frag->lirbuf->sp)
         return guard->exit()->sp_adj + 8;
-    JS_ASSERT(sp == frag->rp);
+    JS_ASSERT(sp == frag->lirbuf->rp);
     return guard->exit()->rp_adj + 4;
 }
 
 #if defined NJ_VERBOSE
 void
 nanojit::LirNameMap::formatGuard(LIns *i, char *out)
 {
     uint32_t ip;
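
The prologue change above is the visible end of a wider interface change: parameters are
now emitted with insParam(n), which binds Assembler::argRegs[n] internally rather than the
caller passing a register through insImm8(LIR_param, ...), and the well-known state/sp/rp
instructions move from Fragment onto the shared LirBuffer. A sketch of the LIR the new
prologue produces, rendered in the style of the verbose output (the names come from the
addName() calls; the offsets are symbolic):

    state = param 0
    sp    = ld state[offsetof(InterpState, sp)]
    rp    = ld state[offsetof(InterpState, rp)]
    cx    = ld state[offsetof(InterpState, cx)]
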
--- a/js/src/nanojit/Assembler.cpp
+++ b/js/src/nanojit/Assembler.cpp
@@ -55,24 +55,19 @@ namespace nanojit
 	{
 		Assembler *assm;
 	public:
 		DeadCodeFilter(LirFilter *in, Assembler *a) : LirFilter(in), assm(a) {}
 		LInsp read() {
 			for (;;) {
 				LInsp i = in->read();
 				if (!i || i->isGuard() 
-					|| i->isCall() && !assm->_functions[i->imm8()]._cse
+					|| i->isCall() && !assm->_functions[i->fid()]._cse
 					|| !assm->ignoreInstruction(i))
 					return i;
-				if (i->isCall()) {
-					// skip args
-					while (in->pos()->isArg())
-						in->read();
-				}
 			}
 		}
 	};
 
 #ifdef NJ_VERBOSE
 	class VerboseBlockReader: public LirFilter
 	{
 		Assembler *assm;
@@ -98,17 +93,17 @@ namespace nanojit
 				return i;
 			}
 			if (i->isGuard()) {
 				flush();
 				block.add(i);
 				if (i->oprnd1())
 					block.add(i->oprnd1());
 			}
-			else if (!i->isArg()) {
+			else {
 				block.add(i);
 			}
 			return i;
 		}
 	};
 #endif
 	
 	/**
@@ -210,17 +205,17 @@ namespace nanojit
 		_resvFree = r->arIndex;
 		r->reg = UnknownReg;
 		r->arIndex = 0;
 		if (!item) 
 			setError(ResvFull); 
 
         if (i->isconst() || i->isconstq())
             r->cost = 0;
-        else if (i == _thisfrag->sp || i == _thisfrag->rp)
+        else if (i == _thisfrag->lirbuf->sp || i == _thisfrag->lirbuf->rp)
             r->cost = 2;
         else
             r->cost = 1;
 
         i->setresv(item);
 		return r;
 	}
 
@@ -309,17 +304,17 @@ namespace nanojit
 	{
 		if (error()) return;
 		// _nIns and _nExitIns need to be at least on
 		// one of these pages
 		NanoAssertMsg( onPage(_nIns)&& onPage(_nExitIns,true), "Native instruction pointer overstep paging bounds; check overrideProtect for last instruction");
 	}
 	#endif
 
-	const CallInfo* Assembler::callInfoFor(int32_t fid)
+	const CallInfo* Assembler::callInfoFor(uint32_t fid)
 	{	
 		NanoAssert(fid < CI_Max);
 		return &_functions[fid];
 	}
 
 	#ifdef _DEBUG
 	
 	void Assembler::resourceConsistencyCheck()
@@ -620,17 +615,17 @@ namespace nanojit
 		
 		//verbose_only( verbose_outputf("         LIR_xend swapptrs, _nIns is now %08X(%08X), _nExitIns is now %08X(%08X)",_nIns, *_nIns,_nExitIns,*_nExitIns) );
 		debug_only( _sv_fpuStkDepth = _fpuStkDepth; _fpuStkDepth = 0; )
 
 		nFragExit(guard);
 
 		// if/when we patch this exit to jump over to another fragment,
 		// that fragment will need its parameters set up just like ours.
-        LInsp stateins = _thisfrag->state;
+        LInsp stateins = _thisfrag->lirbuf->state;
 		Register state = findSpecificRegFor(stateins, Register(stateins->imm8()));
 		asm_bailout(guard, state);
 
 		mergeRegisterState(capture);
 
 		// this can be useful for breaking whenever an exit is taken
 		//INT3();
 		//NOP();
@@ -655,17 +650,17 @@ namespace nanojit
         verbose_only(_stats.exitnative += (_stats.native-nativeSave));
 
         return jmpTarget;
     }
 	
 	bool Assembler::ignoreInstruction(LInsp ins)
 	{
         LOpcode op = ins->opcode();
-        if (ins->isStore() || op == LIR_loop || ins->isArg())
+        if (ins->isStore() || op == LIR_loop)
             return false;
 	    return getresv(ins) == 0;
 	}
 
 	void Assembler::beginAssembly(RegAllocMap* branchStateMap)
 	{
 		_activation.lowwatermark = 1;
 		_activation.tos = _activation.lowwatermark;
@@ -701,18 +696,18 @@ namespace nanojit
 	{
 		if (error()) return;	
 		AvmCore *core = _frago->core();
 		GC *gc = core->gc;
         _thisfrag = frag;
 
 		// set up backwards pipeline: assembler -> StackFilter -> LirReader
 		LirReader bufreader(frag->lastIns);
-		StackFilter storefilter1(&bufreader, gc, frag, frag->sp);
-		StackFilter storefilter2(&storefilter1, gc, frag, frag->rp);
+		StackFilter storefilter1(&bufreader, gc, frag, frag->lirbuf->sp);
+		StackFilter storefilter2(&storefilter1, gc, frag, frag->lirbuf->rp);
 		DeadCodeFilter deadfilter(&storefilter2, this);
 		LirFilter* rdr = &deadfilter;
 		verbose_only(
 			VerboseBlockReader vbr(rdr, this, frag->lirbuf->names);
 			if (verbose_enabled())
 				rdr = &vbr;
 		)
 
@@ -826,21 +821,16 @@ namespace nanojit
 					reserveFree(i);
 				}
 			}
 		}
 	}
 	
 	void Assembler::gen(LirFilter* reader,  NInsList& loopJumps)
 	{
-		_call = NULL;
-		_iargs = 0;
-		_fargs = 0;
-		_stackUsed = 0;
-
 		// trace must start with LIR_x or LIR_loop
 		NanoAssert(reader->pos()->isop(LIR_x) || reader->pos()->isop(LIR_loop));
 		 
 		for (LInsp ins = reader->read(); ins != 0 && !error(); ins = reader->read())
 		{
     		Reservation *rR = getresv(ins);
 			LOpcode op = ins->opcode();			
 			switch(op)
@@ -1238,17 +1228,17 @@ namespace nanojit
 
                     #ifdef NJ_VERBOSE
                     // branching from this frag to ourself.
                     if (_frago->core()->config.show_stats)
                         LDi(argRegs[1], int((Fragment*)_thisfrag));
                     #endif
 
 					// restore first parameter, the only one we use
-                    LInsp state = _thisfrag->state;
+                    LInsp state = _thisfrag->lirbuf->state;
                     Register a0 = Register(state->imm8());
 					findSpecificRegFor(state, a0); 
 					break;
 				}
 #ifndef NJ_SOFTFLOAT
 				case LIR_feq:
 				case LIR_fle:
 				case LIR_flt:
@@ -1300,76 +1290,22 @@ namespace nanojit
 						SETBE(r);
 					else if (op == LIR_ugt)
 						SETA(r);
 					else // if (op == LIR_uge)
 						SETAE(r);
 					asm_cmp(ins);
 					break;
 				}
-				case LIR_ref:
-				{
-					// ref arg - use lea
-					LIns *p = ins->oprnd1();
-					if (ins->resv())
-					{
-						// arg in specific reg
-						Register r = imm2register(ins->resv());
-						int da = findMemFor(p);
-						LEA(r, da, FP);
-					}
-					else
-					{
-						NanoAssert(0); // not supported
-					}
-					++_iargs;
-					nArgEmitted(_call, 0, _iargs, _fargs);
-					break;
-				}
-				case LIR_arg:
-				{
-					LIns* p = ins->oprnd1();
-					if (ins->resv())
-					{
-						// arg goes in specific register
-						Register r = imm2register(ins->resv());
-						if (p->isconst())
-							LDi(r, p->constval());
-						else
-							findSpecificRegFor(p, r);
-					}
-					else
-					{
-						asm_pusharg(p);
-						_stackUsed += 1;
-					}
-					++_iargs;
-					nArgEmitted(_call, _stackUsed, _iargs, _fargs);
-					break;
-				}
-#if defined NANOJIT_IA32 || defined NANOJIT_AMD64
-				case LIR_farg:
-				{
-					asm_farg(ins);
-					break;
-				}
-#endif
 
 #ifndef NJ_SOFTFLOAT
 				case LIR_fcall:
 #endif
 				case LIR_call:
 				{
-					const FunctionID fid = (FunctionID) ins->imm8();
-				// bogus assertion: zero is a legal value right now, with fmod() in that slot
-				//	NanoAssertMsg(fid!=0, "Function does not exist in the call table");
-					_call = &_functions[ fid ];
-					_iargs = 0;
-					_fargs = 0;
-
                     Register rr = UnknownReg;
 #ifndef NJ_SOFTFLOAT
                     if (op == LIR_fcall)
                     {
 						rr = asm_prep_fcall(rR, ins);
                     }
                     else
 #endif
@@ -1378,66 +1314,63 @@ namespace nanojit
                         rr = retRegs[0];
 						prepResultReg(ins, rmask(rr));
                     }
 
 					// do this after we've handled the call result, so we don't
 					// force the call result to be spilled unnecessarily.
 					restoreCallerSaved();
 
-					nPostCallCleanup(_call);
-			#ifdef NJ_VERBOSE
-					CALL(_call->_address, _call->_name);
-			#else
-					CALL(_call->_address, "");
-			#endif
-
-					_stackUsed = 0;
-					LirReader argReader(reader->pos());
-
-#ifdef NANOJIT_ARM
-					// pre-assign registers R0-R3 for arguments (if they fit)
-					int regsUsed = 0;
-					for (LInsp a = argReader.read(); a->isArg(); a = argReader.read())
-					{
-						if (a->isop(LIR_arg) || a->isop(LIR_ref))
-						{
-							a->setresv((int)R0 + 1 + regsUsed);
-							regsUsed++;
-						}
-						if (regsUsed>=4)
-							break;
-					}
-#endif
-#ifdef NANOJIT_IA32
-					debug_only( if (rr == FST0) fpu_push(); )
-					// make sure fpu stack is empty before call (restoreCallerSaved)
-					NanoAssert(_allocator.isFree(FST0));
-					// note: this code requires that LIR_ref arguments be one of the first two arguments
-					// pre-assign registers to the first 2 4B args
-					const uint32_t iargs = _call->count_iargs();
-					const int max_regs = (iargs < 2) ? iargs : 2;
-					int n = 0;
-					for(LIns* a = argReader.read(); a->isArg() && n<max_regs; a = argReader.read())
-					{
-						if (a->isop(LIR_arg)||a->isop(LIR_ref))
-						{
-							a->setresv(argRegs[n++]); // tell LIR_arg what reg to use
-						}
-					}
-#endif
+					asm_call(ins);
 				}
 			}
 
 			// check that all is well (don't check in exit paths since its more complicated)
 			debug_only( pageValidate(); )
 			debug_only( resourceConsistencyCheck();  )
 		}
 	}
 
+    void Assembler::asm_arg(ArgSize sz, LInsp p, Register r)
+    {
+        if (sz == ARGSIZE_Q) 
+        {
+			// ref arg - use lea
+			if (r != UnknownReg)
+			{
+				// arg in specific reg
+				int da = findMemFor(p);
+				LEA(r, da, FP);
+			}
+			else
+			{
+				NanoAssert(0); // not supported
+			}
+		}
+        else if (sz == ARGSIZE_LO)
+		{
+			if (r != UnknownReg)
+			{
+				// arg goes in specific register
+				if (p->isconst())
+					LDi(r, p->constval());
+				else
+					findSpecificRegFor(p, r);
+			}
+			else
+			{
+				asm_pusharg(p);
+			}
+		}
+        else
+		{
+			asm_farg(p);
+		}
+    }
+
 	uint32_t Assembler::arFree(uint32_t idx)
 	{
 		if (idx > 0 && _activation.entry[idx] == _activation.entry[idx+stack_direction(1)])
 			_activation.entry[idx+stack_direction(1)] = 0;  // clear 2 slots for doubles 
 		_activation.entry[idx] = 0;
 		return 0;
 	}
 
@@ -1486,17 +1419,17 @@ namespace nanojit
 			}
 		)
 #endif
 	}
 #endif
 	
 	uint32_t Assembler::arReserve(LIns* l)
 	{
-		NanoAssert(!l->isop(LIR_tramp));
+		NanoAssert(!l->isTramp());
 
 		//verbose_only(printActivationState());
 		const bool quad = l->isQuad();
 		const int32_t n = _activation.tos;
 		int32_t start = _activation.lowwatermark;
 		int32_t i = 0;
 		NanoAssert(start>0);
 		if (n >= NJ_MAX_STACK_ENTRY-2)
@@ -1687,11 +1620,31 @@ namespace nanojit
 		uint32_t argt = _argtypes;
 		for (int i = 0; i < 5; ++i)
 		{
 			argt >>= 2;
 			argc += (argt & mask) != 0;
 		}
 		return argc;
 	}
+
+    uint32_t CallInfo::get_sizes(ArgSize* sizes) const
+    {
+		uint32_t argt = _argtypes;
+		uint32_t argc = 0;
+		for (int32_t i = 0; i < 5; i++) {
+			argt >>= 2;
+			ArgSize a = ArgSize(argt&3);
+#ifdef NJ_SOFTFLOAT
+			if (a == ARGSIZE_F) {
+                sizes[argc++] = ARGSIZE_LO;
+                sizes[argc++] = ARGSIZE_LO;
+                continue;
+            }
 #endif
-
+            if (a != ARGSIZE_NONE) {
+                sizes[argc++] = a;
+            }
+		}
+        return argc;
+    }
+#endif
 }
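
CallInfo::get_sizes() above decodes the packed _argtypes word: per the comment in
Assembler.h, the low two bits hold the return type and each 2-bit field above it holds one
argument's ArgSize, so arguments come out rightmost-first, matching LIns::arg(0). A minimal
standalone sketch of that decoding (the enum values are the real ones; the free-standing
driver is an illustration, not the library's API):

    #include <cstdint>

    enum ArgSize { ARGSIZE_NONE = 0, ARGSIZE_F = 1, ARGSIZE_LO = 2, ARGSIZE_Q = 3 };

    // Decode up to five 2-bit argument fields sitting above the 2-bit return type.
    static uint32_t decode_sizes(uint16_t argtypes, ArgSize sizes[5])
    {
        uint32_t argt = argtypes;
        uint32_t argc = 0;
        for (int i = 0; i < 5; i++) {
            argt >>= 2;                    // step past ret, then past each arg field
            ArgSize a = ArgSize(argt & 3);
            if (a != ARGSIZE_NONE)
                sizes[argc++] = a;         // rightmost argument lands in sizes[0]
        }
        return argc;
    }
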
--- a/js/src/nanojit/Assembler.h
+++ b/js/src/nanojit/Assembler.h
@@ -79,32 +79,35 @@ namespace nanojit
 	{
 		LIns*			entry[ NJ_MAX_STACK_ENTRY ];	/* maps to 4B contiguous locations relative to the frame pointer */
 		uint32_t		tos;							/* current top of stack entry */
 		uint32_t		highwatermark;					/* max tos hit */
 		uint32_t		lowwatermark;					/* we pre-allocate entries from 0 upto this index-1; so dynamic entries are added above this index */
 		LIns*			parameter[ NJ_MAX_PARAMETERS ]; /* incoming parameters */
 	};
 
-	const uint32_t ARGSIZE_NONE = 0;
-	const uint32_t ARGSIZE_F = 1;
-	const uint32_t ARGSIZE_LO = 2;
-	const uint32_t ARGSIZE_Q = 3;
-	const uint32_t _ARGSIZE_MASK_INT = 2;
-	const uint32_t _ARGSIZE_MASK_ANY = 3;
+    enum ArgSize {
+	    ARGSIZE_NONE = 0,
+	    ARGSIZE_F = 1,
+	    ARGSIZE_LO = 2,
+	    ARGSIZE_Q = 3,
+	    _ARGSIZE_MASK_INT = 2, 
+        _ARGSIZE_MASK_ANY = 3
+    };
 
 	struct CallInfo
 	{
 		intptr_t	_address;
 		uint16_t	_argtypes;		// 6 2-bit fields indicating arg type, by ARGSIZE above (including ret type): a1 a2 a3 a4 a5 ret
 		uint8_t		_cse;			// true if no side effects
 		uint8_t		_fold;			// true if no side effects
 		verbose_only ( const char* _name; )
 		
 		uint32_t FASTCALL _count_args(uint32_t mask) const;
+        uint32_t get_sizes(ArgSize*) const;
 
 		inline uint32_t FASTCALL count_args() const { return _count_args(_ARGSIZE_MASK_ANY); }
 		inline uint32_t FASTCALL count_iargs() const { return _count_args(_ARGSIZE_MASK_INT); }
 		// fargs = args - iargs
 	};
 
 	#define FUNCTIONID(name) CI_avmplus_##name
 
@@ -209,20 +212,20 @@ namespace nanojit
 			debug_only ( bool		onPage(NIns* where, bool exitPages=false); )
 			
 			// support calling out from a fragment ; used to debug the jit
 			debug_only( void		resourceConsistencyCheck(); )
 			debug_only( void		registerConsistencyCheck(LIns** resv); )
 			
 			Stats		_stats;		
 
-			const CallInfo* callInfoFor(int32_t fid);
+			const CallInfo* callInfoFor(uint32_t fid);
 			const CallInfo* callInfoFor(LInsp call)
 			{
-				return callInfoFor(call->imm8());
+				return callInfoFor(call->fid());
 			}
 
 		private:
 			
 			void		gen(LirFilter* toCompile, NInsList& loopJumps);
 			NIns*		genPrologue(RegisterMask);
 			NIns*		genEpilogue(RegisterMask);
 
@@ -260,21 +263,16 @@ namespace nanojit
 
 			Reservation* getresv(LIns *x) { return x->resv() ? &_resvTable[x->resv()] : 0; }
 
 			DWB(Fragmento*)		_frago;
             GC*					_gc;
             DWB(Fragment*)		_thisfrag;
 			RegAllocMap*		_branchStateMap;
 			GuardRecord*		_latestGuard;
-
-			const CallInfo		*_call;
-			uint32_t			_iargs;
-			uint32_t			_fargs;
-			int32_t 			_stackUsed;
 		
 			const CallInfo	*_functions;
 			
 			NIns*		_nIns;			// current native instruction
 			NIns*		_nExitIns;		// current instruction in exit fragment page
 			NIns*       _epilogue;
 			Page*		_nativePages;	// list of NJ_PAGE_SIZE pages that have been alloc'd
 			Page*		_nativeExitPages; // list of pages that have been allocated for exit code
@@ -300,32 +298,31 @@ namespace nanojit
 			void		asm_restore(LInsp, Reservation*, Register);
 			void		asm_spill(LInsp i, Reservation *resv, bool pop);
 			void		asm_load64(LInsp i);
 			void		asm_pusharg(LInsp p);
 			NIns*		asm_adjustBranch(NIns* at, NIns* target);
 			void		asm_quad(LInsp i);
 			bool		asm_qlo(LInsp ins, LInsp q);
 			void		asm_fneg(LInsp ins);
-			void		asm_farg(LInsp ins);
 			void		asm_fop(LInsp ins);
 			void		asm_i2f(LInsp ins);
 			void		asm_u2f(LInsp ins);
 			Register	asm_prep_fcall(Reservation *rR, LInsp ins);
 			void		asm_nongp_copy(Register r, Register s);
 			void		asm_bailout(LInsp guard, Register state);
+			void		asm_call(LInsp);
+            void        asm_arg(ArgSize, LInsp, Register);
 
 			// platform specific implementation (see NativeXXX.cpp file)
 			void		nInit(uint32_t flags);
 			void		nInit(AvmCore *);
 			Register	nRegisterAllocFromSet(int32_t set);
 			void		nRegisterResetAll(RegAlloc& a);
 			void		nMarkExecute(Page* page, int32_t count=1, bool enable=true);
-			void		nPostCallCleanup(const CallInfo* call);
-			void		nArgEmitted(const CallInfo* call, uint32_t stackSlotCount, uint32_t iargs, uint32_t fargs);
 			void		nFrameRestore(RegisterMask rmask);
 			static void	nPatchBranch(NIns* branch, NIns* location);
 			void		nFragExit(LIns* guard);
 
 			// platform specific methods
         public:
 			DECLARE_PLATFORM_ASSEMBLER()
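
The declarations above turn argument passing into a two-level protocol: the portable gen()
loop hands LIR_call/LIR_fcall to asm_call(), which is expected to walk
CallInfo::get_sizes() and emit each argument through asm_arg(). The platform bodies live in
the Native*.cpp files, not all of which appear in this excerpt, so the outline below is
hypothetical (register selection and the CALL macro signature vary per backend):

    // Hypothetical outline of a platform's asm_call(); names follow this header.
    void Assembler::asm_call(LInsp ins)
    {
        const CallInfo* call = callInfoFor(ins);   // indexes _functions by ins->fid()

        // nanojit assembles backwards, so the call emitted here lands *after*
        // the argument setup in machine-code order
        CALL(call);                                // ARM-style signature from NativeARM.h

        ArgSize sizes[10];
        uint32_t argc = call->get_sizes(sizes);
        for (uint32_t i = 0; i < argc; i++) {
            Register r = UnknownReg;               // a real port picks argRegs[...] here
            asm_arg(sizes[i], ins->arg(i), r);     // arg(0) is the rightmost argument
        }
    }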
 
--- a/js/src/nanojit/Fragmento.cpp
+++ b/js/src/nanojit/Fragmento.cpp
@@ -134,33 +134,18 @@ namespace nanojit
 			int32_t gcpages = (count*NJ_PAGE_SIZE) / _gcHeap->kNativePageSize;
 			MMGC_MEM_TYPE("NanoJitMem"); 
 			memory = (Page*)_gcHeap->Alloc(gcpages);
 #ifdef MEMORY_INFO
 			ChangeSizeExplicit("NanoJitMem", 1, _gcHeap->Size(memory));
 #endif
 			NanoAssert((int*)memory == pageTop(memory));
 			//fprintf(stderr,"head alloc of %d at %x of %d pages using nj page size of %d\n", gcpages, (intptr_t)memory, (intptr_t)_gcHeap->kNativePageSize, NJ_PAGE_SIZE);
-			
-			// can't add memory if its not addressable from all locations
-			for(uint32_t i=0; i<_allocList.size(); i++)
-			{
-				Page* a = _allocList.get(i);
-				int32_t delta = (a < memory) ? (intptr_t)memory+(NJ_PAGE_SIZE*(count+1))-(intptr_t)a : (intptr_t)a+(NJ_PAGE_SIZE*(count+1))-(intptr_t)memory;
-				if ( delta > 16777215 )
-				{
-					// can't use this memory
-#ifdef MEMORY_INFO
-					ChangeSizeExplicit("NanoJitMem", -1, _gcHeap->Size(memory));
-#endif
-					_gcHeap->Free(memory);
-					return;
-				}
-			}
-			_allocList.add(memory);
+
+            _allocList.add(memory);
 
 			Page* page = memory;
 			_pageList = page;
 			_stats.pages += count;
 			_stats.freePages += count;
 			trackFree(0);
 			while(--count > 0)
 			{
@@ -390,34 +375,37 @@ namespace nanojit
 		avmplus::SortedMap<uint64_t, DurData, avmplus::LIST_NonGCObjects> durs(_core->gc);
 		uint64_t totaldur=0;
 		fragstats totalstat = { 0,0,0,0,0 };
         for (int32_t i=0; i<count; i++)
         {
             Fragment *f = _frags->at(i);
 			fragstats stat = { 0,0,0,0,0 };
             dumpFragStats(f, 0, stat);
+            if (stat.lir) {
+				totalstat.lir += stat.lir;
+				totalstat.lirbytes += stat.lirbytes;
+            }
 			uint64_t bothDur = stat.traceDur + stat.interpDur;
 			if (bothDur) {
 				totalstat.interpDur += stat.interpDur;
 				totalstat.traceDur += stat.traceDur;
 				totalstat.size += stat.size;
-				totalstat.lir += stat.lir;
-				totalstat.lirbytes += stat.lirbytes;
 				totaldur += bothDur;
 				while (durs.containsKey(bothDur)) bothDur++;
 				DurData d(f, stat.traceDur, stat.interpDur, stat.size);
 				durs.put(bothDur, d);
 			}
         }
 		uint64_t totaltrace = totalstat.traceDur;
 		int totalsize = totalstat.size;
 
 		_assm->outputf("");
-		_assm->outputf("avg %.1f bytes/lir", double(totalstat.lirbytes)/totalstat.lir);
+		_assm->outputf("lirbytes %d / lir %d = %.1f bytes/lir", totalstat.lirbytes,
+            totalstat.lir, double(totalstat.lirbytes)/totalstat.lir);
 		_assm->outputf("       trace         interp");
 		_assm->outputf("%9lld (%2d%%)  %9lld (%2d%%)",
 			totaltrace/1000, int(100.0*totaltrace/totaldur),
 			(totaldur-totaltrace)/1000, int(100.0*(totaldur-totaltrace)/totaldur));
 		_assm->outputf("");
 		_assm->outputf("trace      ticks            trace           interp           size");
 		for (int32_t i=durs.size()-1; i >= 0; i--) {
 			uint64_t bothDur = durs.keyAt(i);
--- a/js/src/nanojit/Fragmento.h
+++ b/js/src/nanojit/Fragmento.h
@@ -210,17 +210,16 @@ namespace nanojit
 			GuardRecord*	outbound;
 			
 			TraceKind kind;
 			const void* ip;
 			uint32_t guardCount;
             uint32_t xjumpCount;
             int32_t blacklistLevel;
             NIns* fragEntry;
-            LInsp state,param1,sp,rp;
 			int32_t calldepth;
 			void* vmprivate;
 			
 		private:
 			NIns*			_code;		// ptr to start of code
 			GuardRecord*	_links;		// code which is linked (or pending to be) to this fragment
 			int32_t			_hits;
 			Page*			_pages;		// native code pages 
--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@@ -41,44 +41,44 @@
 #include <stdio.h>
 
 namespace nanojit
 {
     using namespace avmplus;
 	#ifdef FEATURE_NANOJIT
 
 	const uint8_t operandCount[] = {
-	/* 0 */		2, 2, /*trace*/0, /*skip*/0, /*tramp*/0, 2, 2, 2, 2, /*arg*/1,
-	/* 10 */	/*param*/0, 2, 2, /*ref*/1, 2, 2, 2, 2, /*call*/0, /*loop*/0,
+	/* 0 */		2, 2, /*trace*/0, /*nearskip*/0, /*skip*/0, /*neartramp*/0, /*tramp*/0, 2, 2, 2,
+	/* 10 */	/*param*/0, 2, 2, 2, 2, 2, 2, 2, /*call*/0, /*loop*/0,
 	/* 20 */	/*x*/0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 	/* 30 */	2, 2, /*short*/0, /*int*/0, 2, 2, /*neg*/1, 2, 2, 2,
 	/* 40 */	/*callh*/1, 2, 2, 2, /*not*/1, 2, 2, 2, /*xt*/1, /*xf*/1,
 	/* 50 */	/*qlo*/1, /*qhi*/1, 2, /*ov*/1, /*cs*/1, 2, 2, 2, 2, 2,
 	/* 60 */	2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-	/* 70 */	2, 2, 2, /*farg*/1, 2, 2, 2, 2, 2, 2,
+	/* 70 */	2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 	/* 80 */	2, 2, /*fcall*/0, 2, 2, 2, 2, 2, 2, 2,
 	/* 90 */	2, 2, 2, 2, 2, 2, 2, /*quad*/0, 2, 2,
 	/* 100 */	/*fneg*/1, 2, 2, 2, 2, 2, /*i2f*/1, /*u2f*/1, 2, 2,
 	/* 110 */	2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 	/* 120 */	2, 2, 2, 2, 2, 2, 2, 2, 
 	};
 
 	// LIR verbose specific
 	#ifdef NJ_VERBOSE
 
 	const char* lirNames[] = {
-	/* 0-9 */	"0","1","trace","skip","tramp","5","6","7","8","arg",
-	/* 10-19 */	"param","st","ld","ref","sti","15","16","17","call","loop",
+	/* 0-9 */	"0","1","trace","nearskip","skip","neartramp","tramp","7","8","9",
+	/* 10-19 */	"param","st","ld","13","sti","15","16","17","call","loop",
 	/* 20-29 */ "x","21","22","23","24","25","feq","flt","fgt","fle",
 	/* 30-39 */ "fge","cmov","short","int","ldc","","neg","add","sub","mul",
 	/* 40-49 */ "callh","and","or","xor","not","lsh","rsh","ush","xt","xf",
 	/* 50-59 */ "qlo","qhi","ldcb","ov","cs","eq","lt","gt","le","ge",
 	/* 60-63 */ "ult","ugt","ule","uge",
 	/* 64-69 */ "LIR64","65","66","67","68","69",
-	/* 70-79 */ "70","71","72","farg","74","stq","ldq","77","stqi","79",
+	/* 70-79 */ "70","71","72","73","74","stq","ldq","77","stqi","79",
 	/* 80-89 */ "80","81","fcall","83","84","85","86","87","88","89",
 	/* 90-99 */ "90","91","92","93","94","95","96","quad","98","99",
 	/* 100-109 */ "fneg","fadd","fsub","fmul","fdiv","qjoin","i2f","u2f","108","109",
 	/* 110-119 */ "110","111","112","113","114","115","116","117","118","119",
 	/* 120-127 */ "120","121","122","123","124","125","126","127"
 	};
 
 	#endif /* NANOJIT_VERBOSE */
@@ -204,51 +204,67 @@ namespace nanojit
 			_unused = &lastPage->lir[0];
 		}
 		return false;
 	}
 	
 	bool LirBufWriter::ensureRoom(uint32_t count)
 	{
 		LInsp last = _buf->next();
-		if (!samepage(last,last+count)
+		if (!samepage(last,last+2*count)
 			&& _buf->addPage()) 
 		{
 			// link LIR stream back to prior instruction (careful insFar relies on _unused...)
-			LInsp next = _buf->next();
-			insFar(LIR_skip, last-1-next);
+			insFar(LIR_skip, last-1);
 		}
 		return !_buf->outOmem();
 	}
 
 	LInsp LirBuffer::commit(uint32_t count)
 	{
 		debug_only(validate();)
 		NanoAssertMsg( samepage(_unused, _unused+count), "You need to call ensureRoom first!" );
 		return _unused += count;
 	}
 	
-	uint32_t LIns::reference(LIns *r)
+	uint32_t LIns::reference(LIns *r) const
 	{
 		int delta = this-r-1;
 		NanoAssert(isU8(delta));
 		return delta;
 	}
 
+    LIns* LIns::deref(int32_t off) const
+    {
+		LInsp i = (LInsp) this-1 - off;
+        while (i->isTramp())
+            i = i->ref();
+		return i;
+    }
+
 	LInsp LirBufWriter::ensureReferenceable(LInsp i, int32_t addedDistance)
 	{
-		NanoAssert(!i->isop(LIR_tramp));
+		NanoAssert(!i->isTramp());
 		LInsp next = _buf->next();
-		LInsp from = next + addedDistance;
-		if ( canReference(from,i) )
+		LInsp from = next + 2*addedDistance;
+		if (canReference(from,i))
 			return i;
+        if (i == _buf->sp && spref && canReference(from, spref))
+            return spref;
+        if (i == _buf->rp && rpref && canReference(from, rpref))
+            return rpref;
 
 		// need a trampoline to get to i
-		LInsp tramp = insFar(LIR_tramp, i-next);
-		NanoAssert( tramp+tramp->imm24() == i );
+		LInsp tramp = insFar(LIR_tramp, i);
+		NanoAssert( tramp->ref() == i );
+
+        if (i == _buf->sp)
+            spref = tramp;
+        else if (i == _buf->rp)
+            rpref = tramp;
 		return tramp;
 	}
 	
 	LInsp LirBufWriter::insStore(LInsp val, LInsp base, LInsp off)
 	{
 		LOpcode op = val->isQuad() ? LIR_stq : LIR_st;
 		NanoAssert(val && base && off);
 		ensureRoom(4);
@@ -336,38 +352,51 @@ namespace nanojit
 
 	LInsp LirBufWriter::insGuard(LOpcode op, LInsp c, SideExit *x)
 	{
 		LInsp data = skip(SideExitSize(x));
 		*((SideExit*)data->payload()) = *x;
 		return ins2(op, c, data);
 	}
 
-	LInsp LirBufWriter::insImm8(LOpcode op, int32_t a, int32_t b)
-	{
+    LInsp LirBufWriter::insParam(int32_t arg)
+    {
 		ensureRoom(1);
 		LInsp l = _buf->next();
-		l->initOpcode(op);
-		l->setimm8(a,b);
+		l->initOpcode(LIR_param);
+		l->c.imm8a = Assembler::argRegs[arg];
 
 		_buf->commit(1);
 		_buf->_stats.lir++;
 		return l;
-	}
+    }
 	
-	LInsp LirBufWriter::insFar(LOpcode op, int32_t imm)
-	{
-		ensureRoom(1);
+#define isS24(x) (((int32_t(x)<<8)>>8) == (x))
 
-		LInsp l = _buf->next();
-		l->initOpcode(op);
-		l->setimm24(imm);
-
-		_buf->commit(1);
-		return l;
+	LInsp LirBufWriter::insFar(LOpcode op, LInsp target)
+	{
+        NanoAssert(op == LIR_skip || op == LIR_tramp);
+        LInsp l = _buf->next();
+        int d = target-l;
+        if (isS24(d)) {
+    		ensureRoom(1);
+            l->initOpcode(LOpcode(op-1)); // nearskip or neartramp
+            l->t.imm24 = d;
+            _buf->commit(1);
+            return l;
+        }
+        else {
+            ensureRoom(2);
+            // write the pointer and instruction
+            l = _buf->next()+1;
+            *((LInsp*)(l-1)) = target;
+            l->initOpcode(op);
+            _buf->commit(2);
+		    return l;
+        }
 	}
 	
 	LInsp LirBufWriter::insImm(int32_t imm)
 	{
 		if (isS16(imm)) {
 			ensureRoom(1);
 			LInsp l = _buf->next();
 			l->initOpcode(LIR_short);
@@ -392,20 +421,20 @@ namespace nanojit
 		l[1] = int32_t(imm>>32);
 		_buf->commit(2);	
 		return ins0(LIR_quad);
 	}
 
 	LInsp LirBufWriter::skip(size_t size)
 	{
         const uint32_t n = (size+sizeof(LIns)-1)/sizeof(LIns);
-		ensureRoom(n+1);
-		LInsp i = _buf->next();
+		ensureRoom(n+2);
+		LInsp last = _buf->next()-1;
 		_buf->commit(n);
-		return insFar(LIR_skip, i-1-_buf->next());
+		return insFar(LIR_skip, last);
 	}
 
 	LInsp LirReader::read()	
 	{
 		LInsp cur = _i;
 		if (!cur)
 			return 0;
 		LIns* i = cur;
@@ -413,28 +442,35 @@ namespace nanojit
 		do
 		{
 			switch (iop)
 			{					
 				default:
 					i--;
 					break;
 
+				case LIR_call:
+				case LIR_fcall:
+					i -= argwords(i->argc())+1;
+					break;
+
 				case LIR_skip:
-					NanoAssert(i->imm24() != 0);
-					i += i->imm24();
+				case LIR_nearskip:
+					NanoAssert(i->ref() != i);
+					i = i->ref();
 					break;
-		
+
+                case LIR_tramp:
 				case LIR_int:
 					NanoAssert(samepage(i, i-2));
 					i -= 2;
 					break;
 
 				case LIR_quad:
-					NanoAssert(samepage(i,i-3));
+					NanoAssert(samepage(i, i-3));
 					i -= 3;
 					break;
 
 				case LIR_trace:
 					_i = 0;  // start of trace
 					return cur;
 			}
 			iop = i->opcode();
@@ -498,47 +534,36 @@ namespace nanojit
 
 	bool FASTCALL isCse(LOpcode op) {
 		op = LOpcode(op & ~LIR64);
 		return op >= LIR_feq && op <= LIR_uge;
 	}
 
     bool LIns::isCse(const CallInfo *functions) const
     { 
-		return nanojit::isCse(u.code) || isCall() && functions[imm8()]._cse;
+		return nanojit::isCse(u.code) || isCall() && functions[fid()]._cse;
     }
 
-	void LIns::setimm8(int32_t a, int32_t b)
-	{
-		NanoAssert(isS8(a) && isS8(b));
-		c.imm8a = int8_t(a);
-		c.imm8b = int8_t(b);
-	}
-
 	void LIns::setimm16(int32_t x)
 	{
 		NanoAssert(isS16(x));
 		i.imm16 = int16_t(x);
 	}
 
-	void LIns::setimm24(int32_t x)
-	{
-		t.imm24 = x;
-	}
-
 	void LIns::setresv(uint32_t resv)
 	{
 		NanoAssert(isU8(resv));
 		g.resv = resv;
 	}
 
 	void LIns::initOpcode(LOpcode op)
 	{
-		t.code = op;
-		t.imm24 = 0;
+		i.code = op;
+		i.imm16 = 0;
+        i.resv = 0;
 	}
 
 	void LIns::setOprnd1(LInsp r)
 	{
 		u.oprnd_1 = reference(r);
 	}
 
 	void LIns::setOprnd2(LInsp r)
@@ -552,43 +577,34 @@ namespace nanojit
 	}
 
     void LIns::setDisp(int8_t d)
     {
         sti.disp = d;
     }
 
 	LInsp	LIns::oprnd1() const	
-	{ 
-		LInsp i = (LInsp) this - u.oprnd_1 - 1;
-		while (i->isop(LIR_tramp)) 
-			i += i->imm24();
-		return i;
+	{
+        return deref(u.oprnd_1);
 	}
 	
 	LInsp	LIns::oprnd2() const
 	{ 
-		LInsp i = (LInsp) this - u.oprnd_2 - 1;
-		while (i->isop(LIR_tramp)) 
-			i += i->imm24();
-		return i;
+        return deref(u.oprnd_2);
 	}
 
 	LInsp	LIns::oprnd3() const
 	{ 
-		LInsp i = (LInsp) this - u.oprnd_3 - 1;
-		while (i->isop(LIR_tramp)) 
-			i += i->imm24();
-		return i;
+        return deref(u.oprnd_3);
 	}
 
     void *LIns::payload() const
     {
-        NanoAssert(opcode() == LIR_skip);
-        return (void*) (this+imm24()+1);
+        NanoAssert(opcode()==LIR_skip || opcode()==LIR_nearskip);
+        return (void*) (ref()+1);
     }
 
     LIns* LirWriter::ins2i(LOpcode v, LIns* oprnd1, int32_t imm)
     {
         return ins2(v, oprnd1, insImm(imm));
     }
 
     bool insIsS16(LInsp i)
@@ -868,60 +884,62 @@ namespace nanojit
 		// @todo -- it might be better to use a short conditional branch rather than
 		// the bit-twiddling on systems that don't provide a conditional move instruction.
 		LInsp ncond = ins1(LIR_neg, cond); // cond ? -1 : 0
 		return ins2(LIR_or, 
 					ins2(LIR_and, iftrue, ncond), 
 					ins2(LIR_and, iffalse, ins1(LIR_not, ncond)));
 	}
 
-    LIns* LirBufWriter::insCall(int32_t fid, LInsp args[])
+    LIns* LirBufWriter::insCall(uint32_t fid, LInsp args[])
 	{
-		static const LOpcode k_argmap[] = { LIR_farg, LIR_arg, LIR_ref };
 		static const LOpcode k_callmap[] = { LIR_call, LIR_fcall, LIR_call, LIR_callh };
 
 		const CallInfo& ci = _functions[fid];
 		uint32_t argt = ci._argtypes;
-		int32_t argc = ci.count_args();
-		const uint32_t ret = argt & 3;
-		LOpcode op = k_callmap[ret];
-		//printf("   ret is type %d %s\n", ret, lirNames[op]);
+		LOpcode op = k_callmap[argt & 3];
+
+        ArgSize sizes[10];
+        uint32_t argc = ci.get_sizes(sizes);
 
 #ifdef NJ_SOFTFLOAT
 		if (op == LIR_fcall)
 			op = LIR_callh;
 		LInsp args2[5*2]; // arm could require 2 args per double
 		int32_t j = 0;
-		uint32_t argt2 = argt&3; // copy of return type
-		for (int32_t i = 0; i < argc; i++) {
+		for (int32_t i = 0; i < 5; i++) {
 			argt >>= 2;
-			uint32_t a = argt&3;
+			ArgSize a = ArgSize(argt&3);
 			if (a == ARGSIZE_F) {
 				LInsp q = args[i];
 				args2[j++] = ins1(LIR_qhi, q);
-				argt2 |= ARGSIZE_LO << (j*2);
 				args2[j++] = ins1(LIR_qlo, q);
-				argt2 |= ARGSIZE_LO << (j*2);
-			} else {
+			} else if (a != ARGSIZE_NONE) {
 				args2[j++] = args[i];
-				argt2 |= a << (j*2);
 			}
 		}
 		args = args2;
-		argt = argt2;
-		argc = j;
+        NanoAssert(j == argc);
 #endif
 
-		for (int32_t i = 0; i < argc; i++) {
-			argt >>= 2;
-			AvmAssert((argt&3)!=0);
-			ins1(k_argmap[(argt&3)-1], args[i]);
-		}
-
-		return insImm8(op==LIR_callh ? LIR_call : op, fid, argc);
+		NanoAssert(argc < 8);
+		uint32_t words = argwords(argc);
+		ensureRoom(words+argc+1);  // ins size + possible tramps
+		for (uint32_t i=0; i < argc; i++)
+			args[i] = ensureReferenceable(args[i], argc-i);
+		uint8_t* offs = (uint8_t*)_buf->next();
+		LIns *l = _buf->next() + words;
+		for (uint32_t i=0; i < argc; i++)
+			offs[i] = (uint8_t) l->reference(args[i]);
+		l->initOpcode(op==LIR_callh ? LIR_call : op);
+        l->c.imm8a = fid;
+        l->c.imm8b = argc;
+		_buf->commit(words+1);	
+		_buf->_stats.lir++;
+		return l;
 	}
 
     using namespace avmplus;
 
 	StackFilter::StackFilter(LirFilter *in, GC *gc, Fragment *frag, LInsp sp) 
 		: LirFilter(in), gc(gc), frag(frag), sp(sp), top(0)
 	{}
 
@@ -1036,22 +1054,21 @@ namespace nanojit
 			case LIR_int:
 				return hashimm(i->imm32());
 			case LIR_quad:
 				return hashimmq(i->constvalq());
 			case LIR_call:
 			case LIR_fcall:
 			{
 				LInsp args[10];
-				int32_t argc = i->imm8b();
+				int32_t argc = i->argc();
 				NanoAssert(argc < 10);
-				LirReader ri(i);
-				for (int32_t j=argc; j > 0; )
-					args[--j] = ri.previous()->oprnd1();
-				return hashcall(i->imm8(), argc, args);
+				for (int32_t j=0; j < argc; j++)
+					args[j] = i->arg(j);
+				return hashcall(i->fid(), argc, args);
 			} 
 			default:
 				if (operandCount[op] == 2)
 					return hash2(op, i->oprnd1(), i->oprnd2());
 				else
 					return hash1(op, i->oprnd1());
 		}
 	}
@@ -1074,22 +1091,21 @@ namespace nanojit
 			} 
 			case LIR_quad:
 			{
 				return a->constvalq() == b->constvalq();
 			}
 			case LIR_call:
 			case LIR_fcall:
 			{
-				uint32_t argc;
-				if (a->imm8() != b->imm8()) return false;
-				if ((argc=a->imm8b()) != b->imm8b()) return false;
-				LirReader ra(a), rb(b);
-				while (argc-- > 0)
-					if (ra.previous()->oprnd1() != rb.previous()->oprnd1())
+				if (a->fid() != b->fid()) return false;
+				uint32_t argc=a->argc();
+                NanoAssert(argc == b->argc());
+				for (uint32_t i=0; i < argc; i++)
+					if (a->arg(i) != b->arg(i))
 						return false;
 				return true;
 			} 
 			default:
 			{
 				const uint32_t count = operandCount[op];
 				if ((count >= 1 && a->oprnd1() != b->oprnd1()) ||
 					(count >= 2 && a->oprnd2() != b->oprnd2()))
@@ -1173,17 +1189,17 @@ namespace nanojit
 	}
 
 	uint32_t LInsHashSet::hash2(LOpcode op, LInsp a, LInsp b) {
 		uint32_t hash = _hash8(0,uint8_t(op));
 		hash = _hashptr(hash, a);
 		return _hashfinish(_hashptr(hash, b));
 	}
 
-	uint32_t LInsHashSet::hashcall(int32_t fid, uint32_t argc, LInsp args[]) {
+	uint32_t LInsHashSet::hashcall(uint32_t fid, uint32_t argc, LInsp args[]) {
 		uint32_t hash = _hash32(0,fid);
 		for (int32_t j=argc-1; j >= 0; j--)
 			hash = _hashptr(hash,args[j]);
 		return _hashfinish(hash);
 	}
 
 	LInsp LInsHashSet::find32(int32_t a, uint32_t &i)
 	{
@@ -1250,34 +1266,32 @@ namespace nanojit
 			hash = (hash + (n += 2)) & bitmask;		// quadratic probe
 		}
 		i = hash;
 		return k;
 	}
 
 	bool argsmatch(LInsp i, uint32_t argc, LInsp args[])
 	{
-		// we don't have callinfo here so we cannot use argiterator
-		LirReader r(i);
-		for (LInsp a = r.previous(); a->isArg(); a=r.previous())
-			if (a->oprnd1() != args[--argc])
+		for (uint32_t j=0; j < argc; j++)
+			if (i->arg(j) != args[j])
 				return false;
 		return true;
 	}
 
-	LInsp LInsHashSet::findcall(int32_t fid, uint32_t argc, LInsp args[], uint32_t &i)
+	LInsp LInsHashSet::findcall(uint32_t fid, uint32_t argc, LInsp args[], uint32_t &i)
 	{
 		uint32_t cap = m_list.size();
 		const InsList& list = m_list;
 		const uint32_t bitmask = (cap - 1) & ~0x1;
 		uint32_t hash = hashcall(fid, argc, args) & bitmask;  
 		uint32_t n = 7 << 1;
 		LInsp k;
 		while ((k = list.get(hash)) != NULL &&
-			(!k->isCall() || k->imm8() != fid || !argsmatch(k, argc, args)))
+			(!k->isCall() || k->fid() != fid || !argsmatch(k, argc, args)))
 		{
 			hash = (hash + (n += 2)) & bitmask;		// quadratic probe
 		}
 		i = hash;
 		return k;
 	}
 
     SideExit *LIns::exit()
@@ -1306,17 +1320,17 @@ namespace nanojit
                 live.put(i,use);
             }
 		}
         void retire(LInsp i, GC *gc) {
             RetiredEntry *e = new (gc) RetiredEntry(gc);
             e->i = i;
             for (int j=0, n=live.size(); j < n; j++) {
                 LInsp l = live.keyAt(j);
-                if (!l->isStore() && !l->isGuard() && !l->isArg())
+                if (!l->isStore() && !l->isGuard())
                     e->live.add(l);
             }
             int size=0;
 		    if ((size = e->live.size()) > maxlive)
 			    maxlive = size;
 
             live.remove(i);
             retired.add(e);
@@ -1325,41 +1339,33 @@ namespace nanojit
 			return live.containsKey(i);
 		}
 	};
 
     void live(GC *gc, Assembler *assm, Fragment *frag)
 	{
 		// traverse backwards to find live exprs and a few other stats.
 
-		LInsp sp = frag->sp;
-		LInsp rp = frag->rp;
+		LInsp sp = frag->lirbuf->sp;
+		LInsp rp = frag->lirbuf->rp;
 		LiveTable live(gc);
 		uint32_t exits = 0;
 		LirBuffer *lirbuf = frag->lirbuf;
         LirReader br(lirbuf);
 		StackFilter sf(&br, gc, frag, sp);
 		StackFilter r(&sf, gc, frag, rp);
-        bool skipargs = false;
         int total = 0;
-        live.add(frag->state, r.pos());
+        live.add(frag->lirbuf->state, r.pos());
 		for (LInsp i = r.read(); i != 0; i = r.read())
 		{
             total++;
 
-            if (i->isArg()) {
-                if (!skipargs)
-                    live.add(i->oprnd1(),0);
-            } else {
-                skipargs = false;
-            }
-
             // first handle side-effect instructions
 			if (i->isStore() || i->isGuard() ||
-				i->isCall() && !assm->callInfoFor(i->imm8())->_cse)
+				i->isCall() && !assm->callInfoFor(i->fid())->_cse)
 			{
 				live.add(i,0);
                 if (i->isGuard())
                     exits++;
 			}
 
 			// now propagate liveness
 			if (live.contains(i))
@@ -1376,20 +1382,20 @@ namespace nanojit
                 }
 				else if (operandCount[i->opcode()] == 1) {
 				    live.add(i->oprnd1(),i);
 				}
 				else if (operandCount[i->opcode()] == 2) {
 					live.add(i->oprnd1(),i);
 					live.add(i->oprnd2(),i);
 				}
-			}
-			else
-			{
-                skipargs = i->isCall();
+				else if (i->isCall()) {
+					for (int j=0, c=i->argc(); j < c; j++)
+						live.add(i->arg(j),i);
+				}
 			}
 		}
  
 		assm->outputf("live instruction count %ld, total %ld, max pressure %d",
 			live.retired.size(), total, live.maxlive);
         assm->outputf("side exits %ld", exits);
 
 		// print live exprs, going forwards
@@ -1449,17 +1455,17 @@ namespace nanojit
 			*buf++ = ':';
 			formatImm(uint32_t(ref->constvalq()), buf);
 		}
 		else if (ref->isconst()) {
 			formatImm(ref->constval(), buf);
 		}
 		else {
 			if (ref->isCall()) {
-				copyName(ref, _functions[ref->imm8()]._name, funccounts.add(ref->imm8()));
+				copyName(ref, _functions[ref->fid()]._name, funccounts.add(ref->fid()));
 			} else {
 				copyName(ref, lirNames[ref->opcode()], lircounts.add(ref->opcode()));
 			}
 			StringNullTerminatedUTF8 cname(gc, names.get(ref)->name);
 			strcat(buf, cname.c_str());
 		}
 		return labels->dup(buffer);
 	}
@@ -1492,43 +1498,40 @@ namespace nanojit
 
 			case LIR_loop:
 			case LIR_trace:
 				sprintf(s, "%s", lirNames[op]);
 				break;
 
 			case LIR_fcall:
 			case LIR_call: {
-				sprintf(s, "%s ( ", _functions[i->imm8()]._name);
-				LirReader r(i);
-				for (LInsp a = r.previous(); a->isArg(); a = r.previous()) {
+				sprintf(s, "%s ( ", _functions[i->fid()]._name);
+				for (int32_t j=i->argc()-1; j >= 0; j--) {
 					s += strlen(s);
-					sprintf(s, "%s ",formatRef(a->oprnd1()));
+					sprintf(s, "%s ",formatRef(i->arg(j)));
 				}
 				s += strlen(s);
 				sprintf(s, ")");
 				break;
 			}
 
 			case LIR_param:
                 sprintf(s, "%s %s", lirNames[op], gpn(i->imm8()));
 				break;
 
             case LIR_callh:
 			case LIR_neg:
 			case LIR_fneg:
-			case LIR_arg:
-			case LIR_farg:
 			case LIR_i2f:
 			case LIR_u2f:
 			case LIR_qlo:
 			case LIR_qhi:
-			case LIR_ref:
             case LIR_ov:
             case LIR_cs:
+			case LIR_not: 
 				sprintf(s, "%s %s", lirNames[op], formatRef(i->oprnd1()));
 				break;
 
 			case LIR_x:
 			case LIR_xt:
 			case LIR_xf:
 				formatGuard(i, s);
 				break;
@@ -1537,17 +1540,16 @@ namespace nanojit
 			case LIR_sub: 
 		 	case LIR_mul: 
 			case LIR_fadd:
 			case LIR_fsub: 
 		 	case LIR_fmul: 
 			case LIR_fdiv: 
 			case LIR_and: 
 			case LIR_or: 
-			case LIR_not: 
 			case LIR_xor: 
 			case LIR_lsh: 
 			case LIR_rsh:
 			case LIR_ush:
 			case LIR_eq:
 			case LIR_lt:
 			case LIR_le:
 			case LIR_gt:
@@ -1671,22 +1673,23 @@ namespace nanojit
 			LInsp found = exprs.find1(v, c, k);
 			if (found)
 				return 0;
 			return exprs.add(out->insGuard(v,c,x), k);
 		}
 		return out->insGuard(v, c, x);
 	}
 
-	LInsp CseFilter::insCall(int32_t fid, LInsp args[])
+	LInsp CseFilter::insCall(uint32_t fid, LInsp args[])
 	{
 		const CallInfo *c = &_functions[fid];
 		if (c->_cse) {
 			uint32_t k;
-			LInsp found = exprs.findcall(fid, c->count_args(), args, k);
+            uint32_t argc = c->count_args();
+			LInsp found = exprs.findcall(fid, argc, args, k);
 			if (found)
 				return found;
 			return exprs.add(out->insCall(fid, args), k);
 		}
 		return out->insCall(fid, args);
 	}
 
 	CseReader::CseReader(LirFilter *in, LInsHashSet *exprs, const CallInfo *functions)
@@ -1700,22 +1703,17 @@ namespace nanojit
 			if (i->isCse(functions))
 				exprs->replace(i);
 		}
 		return i;
 	}
 
     LIns* FASTCALL callArgN(LIns* i, uint32_t n)
 	{
-		// @todo clean up; shouldn't have to create a reader                                               
-		LirReader rdr(i);
-		do
-			i = rdr.read();
-		while (n-- > 0);
-		return i;
+		return i->arg(i->argc()-n-1);
 	}
 
     void compile(Assembler* assm, Fragment* triggerFrag)
     {
         Fragmento *frago = triggerFrag->lirbuf->_frago;
         AvmCore *core = frago->core();
         GC *gc = core->gc;
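
The insCall() rewrite above replaces the old one-pseudo-instruction-per-argument scheme
(LIR_arg/LIR_ref/LIR_farg) with a packed layout: argwords(argc) words of 8-bit backward
references sit immediately before the call instruction, and LIns::arg(i) reads them back.
A sketch of the buffer layout for a hypothetical three-argument call (one 4-byte LIns slot
per bracketed line, later slots at higher addresses):

    [ ... a0, a1, a2 ... ]               earlier LIR; ensureReferenceable() inserts
                                         tramps so every offset fits in a uint8_t
    [ offs[0] offs[1] offs[2] pad ]      argwords(3) == 1 word of offset bytes
    [ LIR_call  c.imm8a=fid  c.imm8b=argc ]

    // arg(i):  uint8_t* offs = (uint8_t*)(this - argwords(argc));
    //          return deref(offs[i]);   // arg(0) is the rightmost argument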
 
--- a/js/src/nanojit/LIR.h
+++ b/js/src/nanojit/LIR.h
@@ -47,25 +47,25 @@ namespace nanojit
 	
 	enum LOpcode __msvc_only(: unsigned)	
 	{
 		// flags; upper bits reserved
 		LIR64	= 0x40,			// result is double or quad
 		
 		// special operations (must be 0..N)
 		LIR_trace = 2,	
-		LIR_skip = 3,
-		LIR_tramp	= 4,
+		LIR_nearskip = 3, // must be LIR_skip-1 and lsb=1
+		LIR_skip = 4,
+        LIR_neartramp = 5, // must be LIR_tramp-1 and lsb=1
+        LIR_tramp = 6,
 
 		// non-pure operations
-		LIR_arg		= 9,
 		LIR_param	= 10,
 		LIR_st		= 11,
 		LIR_ld		= 12,
-		LIR_ref  	= 13, // ref arg
         LIR_sti     = 14,
 		LIR_call	= 18,
 			
 		// guards
 		LIR_loop    = 19, // loop fragment
 		LIR_x		= 20, // exit always
 
 		// operators
@@ -115,122 +115,136 @@ namespace nanojit
 		/**
 		 * 64bit operations
 		 */
 		LIR_stq		= LIR_st | LIR64,
 		LIR_stqi	= LIR_sti | LIR64,
 		LIR_quad    = LIR_int | LIR64,
 		LIR_ldq		= LIR_ld    | LIR64,
 
-		LIR_farg	= LIR_arg   | LIR64,
         LIR_fcall   = LIR_call  | LIR64,
 		LIR_fneg	= LIR_neg  | LIR64,
 		LIR_fadd	= LIR_add  | LIR64,
 		LIR_fsub	= LIR_sub  | LIR64,
 		LIR_fmul	= LIR_mul  | LIR64,
 		LIR_fdiv	= 40        | LIR64,
 
 		LIR_qjoin	= 41 | LIR64,
 		LIR_i2f		= 42 | LIR64,
 		LIR_u2f		= 43 | LIR64
 	};
 
+	inline uint32_t argwords(uint32_t argc) {
+		return (argc+3)>>2;
+	}
+
     struct SideExit;
     struct Page;
     struct CallInfo;
 
 	// Low-level Instruction 4B
 	// had to lay it out as a union with duplicate code fields since msvc couldn't figure out how to compact it otherwise.
 	class LIns
 	{
+        friend class LirBufWriter;
 		// 3-operand form (backwards reach only)
 		struct u_type
 		{
 			LOpcode			code:8;
+			uint32_t		oprnd_3:8;	// only used for store, since this location gets clobbered during generation
 			uint32_t		oprnd_1:8;  // 256 ins window and since they only point backwards this is sufficient.
 			uint32_t		oprnd_2:8;  
-			uint32_t		oprnd_3:8;	// only used for store, since this location gets clobbered during generation
 		};
 
         struct sti_type
         {
 			LOpcode			code:8;
+			int32_t	    	disp:8;
 			uint32_t		oprnd_1:8;  // 256 ins window and since they only point backwards this is sufficient.
 			uint32_t		oprnd_2:8;  
-			int32_t	    	disp:8;
         };
 
 		// imm8 form 
 		struct c_type
 		{
 			LOpcode			code:8;
+			uint32_t		resv:8;  // clobbered during assembly
 			uint32_t		imm8a:8;
 			uint32_t		imm8b:8;  
-			uint32_t		resv:8;  // cobberred during assembly
 		};
 
+        // imm24 form for short tramp & skip
+        struct t_type
+        {
+            LOpcode         code:8;
+            int32_t         imm24:24;
+        };
+
 		// imm16 form
 		struct i_type
 		{
 			LOpcode			code:8;
+			uint32_t		resv:8;  // clobbered during assembly
 			int32_t			imm16:16;
-			uint32_t		resv:8;  // cobberred during assembly
-		};
-
-		// tramp form (imm24)
-		struct t_type
-		{
-			LOpcode			code:8;
-			int32_t			imm24:24;	// +/- 8MB
 		};
 
 		// overlay used during code generation ( note that last byte is reserved for allocation )
 		struct g_type
 		{
 			LOpcode			code:8;
+			uint32_t		resv:8;   // clobbered during assembly
 			uint32_t		unused:16;
-			uint32_t		resv:8;   // cobberred during assembly
 		};
 
 		/**
 		 * Various forms of the instruction.
 		 * 
 		 *    In general the oprnd_x entries contain an uint value 0-255 that identifies a previous 
 		 *    instruction, where 0 means the previous instruction and 255 means the instruction two
 		 *    hundred and fifty-six prior to this one. 
 		 *      
 		 *    For pointing to instructions further than this range LIR_tramp is used.
 		 */
 		union 
 		{
 			u_type u;
 			c_type c;
 			i_type i;
-			t_type t;
+            t_type t;
 			g_type g;
             sti_type sti;
 		};
 
-		uint32_t reference(LIns*);
+		uint32_t reference(LIns*) const;
+		LIns* deref(int32_t off) const;
 
 	public:
 		LIns*		FASTCALL oprnd1() const;
 		LIns*		FASTCALL oprnd2() const;
 		LIns*		FASTCALL oprnd3() const;
 
 		inline LOpcode	opcode() const	{ return u.code; }
 		inline uint8_t	imm8()	 const	{ return c.imm8a; }
-		inline uint8_t	imm8b()	 const	{ return c.imm8b; }
 		inline int16_t	imm16()	 const	{ return i.imm16; }
-		inline int32_t	imm24()	 const	{ return t.imm24; }
+		inline LIns*	ref()	 const	{ 
+            return (t.code & 1) ? (LIns*)this+t.imm24 : *(LIns**)(this-1);
+        }
 		inline int32_t	imm32()	 const	{ return *(int32_t*)(this-1); }
 		inline uint8_t	resv()	 const  { return g.resv; }
         void*	payload() const;
         inline Page*	page()			{ return (Page*) alignTo(this,NJ_PAGE_SIZE); }
 
+		// index args in r-l order.  arg(0) is rightmost arg
+		inline LIns* arg(uint32_t i) {
+			uint32_t c = argc();
+			NanoAssert(i < c);
+			uint8_t* offs = (uint8_t*) (this-argwords(c));
+			return deref(offs[i]);
+		}
+
         inline int32_t  immdisp()const 
 		{
             return (u.code&~LIR64) == LIR_sti ? sti.disp : oprnd3()->constval();
         }
     
 		inline static bool sameop(LIns* a, LIns* b)
 		{
 			// hacky but more efficient than opcode() == opcode() due to bit masking of 7-bit field
@@ -250,58 +264,68 @@ namespace nanojit
 
 		inline uint64_t constvalq() const
 		{
 			NanoAssert(isconstq());
 		#ifdef AVMPLUS_UNALIGNED_ACCESS
 			return *(const uint64_t*)(this-2);
 		#else
 			uint64_t tmp;
-			memcpy(&tmp, this-2, sizeof(tmp));
+            memcpy(&tmp, this-2, sizeof(tmp));
 			return tmp;
 		#endif
 		}
 		
 		inline double constvalf() const
 		{
 			NanoAssert(isconstq());
 		#ifdef AVMPLUS_UNALIGNED_ACCESS
 			return *(const double*)(this-2);
 		#else
-			double tmpf;
-			memcpy(&tmpf, this-2, sizeof(tmpf));
-			return tmpf;
+			union { uint64_t tmp; double tmpf; } u;
+            memcpy(&u.tmpf, this-2, sizeof(u.tmpf));
+			return u.tmpf;
 		#endif
 		}
 
 		bool isCse(const CallInfo *functions) const;
 		bool isop(LOpcode o) const { return u.code == o; }
 		bool isQuad() const { return (u.code & LIR64) != 0; }
-		bool isArg() const { return (u.code & ~LIR64)==LIR_arg || u.code == LIR_ref; }
 		bool isCond() const;
 		bool isCmp() const;
 		bool isCall() const;
         bool isStore() const;
         bool isLoad() const;
 		bool isGuard() const;
 		bool isconst() const;
 		bool isconstval(int32_t val) const;
 		bool isconstq() const;
+        bool isTramp() {
+            return isop(LIR_neartramp) || isop(LIR_tramp);
+        }
 
-		void setimm8(int32_t a, int32_t b);
 		void setimm16(int32_t i);
 		void setimm24(int32_t i);
 		void setresv(uint32_t resv);
 		void initOpcode(LOpcode);
 		void setOprnd1(LIns*);
 		void setOprnd2(LIns*);
 		void setOprnd3(LIns*);
         void setDisp(int8_t d);
 
         SideExit *exit();
+
+		inline uint32_t argc() {
+			NanoAssert(isCall());
+			return c.imm8b;
+		}
+        inline uint8_t  fid() const {
+			NanoAssert(isCall());
+			return c.imm8a;
+        }
 	};
 	typedef LIns*		LInsp;
 
 	bool FASTCALL isCse(LOpcode v);
 	bool FASTCALL isCmp(LOpcode v);
 	bool FASTCALL isCond(LOpcode v);
 	LIns* FASTCALL callArgN(LInsp i, uint32_t n);
 	extern const uint8_t operandCount[];
@@ -329,18 +353,18 @@ namespace nanojit
 			return out->ins1(v, a);
 		}
 		virtual LInsp ins2(LOpcode v, LIns* a, LIns* b) {
 			return out->ins2(v, a, b);
 		}
 		virtual LInsp insGuard(LOpcode v, LIns *c, SideExit *x) {
 			return out->insGuard(v, c, x);
 		}
-		virtual LInsp insImm8(LOpcode v, int32_t a, int32_t b) {
-			return out->insImm8(v, a, b);
+		virtual LInsp insParam(int32_t i) {
+			return out->insParam(i);
 		}
 		virtual LInsp insImm(int32_t imm) {
 			return out->insImm(imm);
 		}
 		virtual LInsp insImmq(uint64_t imm) {
 			return out->insImmq(imm);
 		}
 		virtual LInsp insLoad(LOpcode op, LIns* base, LIns* d) {
@@ -348,17 +372,17 @@ namespace nanojit
 		}
 		virtual LInsp insStore(LIns* value, LIns* base, LIns* disp) {
 			return out->insStore(value, base, disp);
 		}
 		virtual LInsp insStorei(LIns* value, LIns* base, int32_t d) {
 			return isS8(d) ? out->insStorei(value, base, d)
 				: out->insStore(value, base, insImm(d));
 		}
-		virtual LInsp insCall(int32_t fid, LInsp args[]) {
+		virtual LInsp insCall(uint32_t fid, LInsp args[]) {
 			return out->insCall(fid, args);
 		}
 
 		// convenience
 	    LIns*		insLoadi(LIns *base, int disp);
 	    LIns*		insLoad(LOpcode op, LIns *base, int disp);
 	    LIns*		ins_choose(LIns* cond, LIns* iftrue, LIns* iffalse, bool);
 	    LIns*		ins_eq0(LIns* oprnd1);
@@ -479,21 +503,21 @@ namespace nanojit
 		}
 
 		LIns* ins1(LOpcode v, LInsp a) {
 			return add(out->ins1(v, a));
 		}
 		LIns* ins2(LOpcode v, LInsp a, LInsp b) {
 			return v == LIR_2 ? out->ins2(v,a,b) : add(out->ins2(v, a, b));
 		}
-		LIns* insCall(int32_t fid, LInsp args[]) {
+		LIns* insCall(uint32_t fid, LInsp args[]) {
 			return add(out->insCall(fid, args));
 		}
-		LIns* insImm8(LOpcode v, int32_t a, int32_t b) {
-			return add(out->insImm8(v, a, b));
+		LIns* insParam(int32_t i) {
+			return add(out->insParam(i));
 		}
 		LIns* insLoad(LOpcode v, LInsp base, LInsp disp) {
 			return add(out->insLoad(v, base, disp));
 		}
 		LIns* insStore(LInsp v, LInsp b, LInsp d) {
 			return add(out->insStore(v, b, d));
 		}
 		LIns* insStorei(LInsp v, LInsp b, int32_t d) {
@@ -531,47 +555,46 @@ namespace nanojit
 
 	public:
 
 		LInsHashSet(GC* gc);
 		LInsp find32(int32_t a, uint32_t &i);
 		LInsp find64(uint64_t a, uint32_t &i);
 		LInsp find1(LOpcode v, LInsp a, uint32_t &i);
 		LInsp find2(LOpcode v, LInsp a, LInsp b, uint32_t &i);
-		LInsp findcall(int32_t fid, uint32_t argc, LInsp args[], uint32_t &i);
+		LInsp findcall(uint32_t fid, uint32_t argc, LInsp args[], uint32_t &i);
 		LInsp add(LInsp i, uint32_t k);
 		void replace(LInsp i);
 
 		static uint32_t FASTCALL hashimm(int32_t);
 		static uint32_t FASTCALL hashimmq(uint64_t);
 		static uint32_t FASTCALL hash1(LOpcode v, LInsp);
 		static uint32_t FASTCALL hash2(LOpcode v, LInsp, LInsp);
-		static uint32_t FASTCALL hashcall(int32_t fid, uint32_t argc, LInsp args[]);
+		static uint32_t FASTCALL hashcall(uint32_t fid, uint32_t argc, LInsp args[]);
 	};
 
 	class CseFilter: public LirWriter
 	{
 	public:
 		LInsHashSet exprs;
 		CseFilter(LirWriter *out, GC *gc);
 	    LIns* insImm(int32_t imm);
 	    LIns* insImmq(uint64_t q);
 		LIns* ins1(LOpcode v, LInsp);
 		LIns* ins2(LOpcode v, LInsp, LInsp);
 		LIns* insLoad(LOpcode v, LInsp b, LInsp d);
-		LIns* insCall(int32_t fid, LInsp args[]);
+		LIns* insCall(uint32_t fid, LInsp args[]);
 		LIns* insGuard(LOpcode op, LInsp cond, SideExit *x);
 	};
 
 	struct Page;
 	class LirBuffer : public GCFinalizedObject
 	{
 		public:
 			DWB(Fragmento*)		_frago;
-		public:
 			LirBuffer(Fragmento* frago, const CallInfo* functions);
 			virtual ~LirBuffer();
 			void        clear();
 			LInsp		next();
 			LInsp		commit(uint32_t count);
 			bool		addPage();
 			bool		outOmem() { return _noMem != 0; }
 			debug_only (void		validate() const;)
@@ -583,51 +606,53 @@ namespace nanojit
 			struct 
 			{
 				uint32_t lir;	// # instructions
 				uint32_t pages;	// pages consumed
 			}
 			_stats;
 
 			const CallInfo* _functions;
+            LInsp state, param1, sp, rp;
 			
 		private:
 			Page*		pageAlloc();
 
 			Page*				_start;		// first page
 			LInsp				_unused;	// next unused instruction slot
			int					_noMem;		// set if we ran out of memory while writing to the buffer
 	};	
 
 	class LirBufWriter : public LirWriter
 	{
 		DWB(LirBuffer*)	_buf;		// underlying buffer housing the instructions
+        LInsp spref, rpref;
 
         public:
 			LirBufWriter(LirBuffer* buf)
 				: LirWriter(0), _buf(buf) {
 				_functions = buf->_functions;
 			}
 
 			// LirWriter interface
 			LInsp   insLoad(LOpcode op, LInsp base, LInsp off);
 			LInsp	insStore(LInsp o1, LInsp o2, LInsp o3);
 			LInsp	insStorei(LInsp o1, LInsp o2, int32_t imm);
 			LInsp	ins0(LOpcode op);
 			LInsp	ins1(LOpcode op, LInsp o1);
 			LInsp	ins2(LOpcode op, LInsp o1, LInsp o2);
-			LInsp	insImm8(LOpcode op, int32_t a, int32_t b);
+			LInsp	insParam(int32_t i);
 			LInsp	insImm(int32_t imm);
 			LInsp	insImmq(uint64_t imm);
-		    LInsp	insCall(int32_t fid, LInsp args[]);
+		    LInsp	insCall(uint32_t fid, LInsp args[]);
 			LInsp	insGuard(LOpcode op, LInsp cond, SideExit *x);
 
 			// buffer mgmt
 			LInsp	skip(size_t);
-			LInsp	insFar(LOpcode op, int32_t imm);
+			LInsp	insFar(LOpcode op, LInsp target);
 			LInsp	ensureReferenceable(LInsp i, int32_t addedDistance);
 			bool	ensureRoom(uint32_t count);
 			bool	canReference(LInsp from, LInsp to) {
 				return isU8(from-to-1);
 			}
 	};
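
A note on the distance check above: LIR operands are encoded as small backwards offsets, so an instruction may only reference another within 256 slots; anything farther needs the trampoline machinery that insFar/ensureReferenceable provide. A minimal standalone sketch of the same test, using slot indices instead of LIns pointers and a local stand-in for isU8:

	#include <stdint.h>
	static inline bool isU8(intptr_t v) { return uintptr_t(v) <= 0xFF; }
	// 'from' is the referencing slot, 'to' the earlier operand slot.
	static bool canReference(intptr_t from, intptr_t to) { return isU8(from - to - 1); }
	// from == to + 1 (the immediately preceding slot) gives 0 and passes;
	// a reference more than 256 slots back fails the check.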
 
 	class LirFilter
@@ -638,21 +663,16 @@ namespace nanojit
 		virtual ~LirFilter() {}
 
 		virtual LInsp read() {
 			return in->read();
 		}
 		virtual LInsp pos() {
 			return in->pos();
 		}
-
-		LInsp previous() {
-			read();
-			return pos();
-		}
 	};
 
 	// concrete
 	class LirReader : public LirFilter
 	{
 		LInsp _i; // current instruction that this decoder is operating on.
 
 	public:
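
The insImm8(LIR_param, ...) idiom is gone; parameters now come in through the dedicated insParam hook shown above. A minimal sketch of a pass-through filter implementing the new virtual, assuming only the LirWriter interface visible in this header (the ParamCounter name is invented for illustration):

	class ParamCounter : public LirWriter
	{
	public:
		int32_t nparams;
		ParamCounter(LirWriter* out) : LirWriter(out), nparams(0) {}
		virtual LInsp insParam(int32_t i) {
			nparams++;                 // observe the parameter index...
			return out->insParam(i);   // ...then forward down the pipeline
		}
	};
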
--- a/js/src/nanojit/Native.h
+++ b/js/src/nanojit/Native.h
@@ -50,17 +50,18 @@
 #include "NativeARM.h"
 #endif
 #elif defined(NANOJIT_PPC)
 #include "NativePpc.h"
 #else
 #error "unknown nanojit architecture"
 #endif
 
-namespace nanojit {
+namespace nanojit 
+{
 	const uint32_t NJ_PAGE_SIZE = 1 << NJ_LOG2_PAGE_SIZE;
 }
 
 	#ifdef NJ_STACK_GROWTH_UP
 		#define stack_direction(n)   n
 	#else
 		#define stack_direction(n)  -n
 	#endif
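
stack_direction(n) lets offset arithmetic be written once for both stack-growth directions: it is the identity on upward-growing stacks and negation otherwise. A one-liner illustrating the intended use (slotDisp is a hypothetical helper):

	// byte displacement of 4-byte spill slot #slot, whichever way the stack grows
	static inline int32_t slotDisp(int32_t slot) { return stack_direction(4 * slot); }
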
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -144,24 +144,27 @@ namespace nanojit
 		counter_define(x87Top);
 
 	#define DECLARE_PLATFORM_REGALLOC()
 
 
 	#define DECLARE_PLATFORM_ASSEMBLER()\
 		const static Register argRegs[4], retRegs[2];\
 		void LD32_nochk(Register r, int32_t imm);\
-		void CALL(intptr_t addr, const char* nm);\
+		void CALL(const CallInfo*);\
 		void underrunProtect(int bytes);\
 		bool has_cmov;\
 		void nativePageReset();\
 		void nativePageSetup();\
 		int* _nSlot;\
 		int* _nExitSlot;
 
+
+    #define asm_farg(i) NanoAssert(false)
+
 	//printf("jmp_l_n count=%d, nins=%X, %X = %X\n", (_c), nins, _nIns, ((intptr_t)(nins+(_c))-(intptr_t)_nIns - 4) );
 
 	#define swapptrs()  { NIns* _tins = _nIns; _nIns=_nExitIns; _nExitIns=_tins; \
 								int* _nslot = _nSlot;\
 								_nSlot = _nExitSlot;\
 								_nExitSlot = _nslot;}
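
CALL now receives the whole CallInfo record rather than a raw (address, name) pair, so each backend can pull out the fields it needs. The declaration is not part of this diff; a rough sketch of the fields the new code reads, offered as an assumption about the layout:

	struct CallInfo
	{
		intptr_t    _address;    // entry point, consumed by CALL()
		uint32_t    _argtypes;   // packed ArgSize codes; low bits describe the return value
		const char* _name;       // used only for verbose/debug disassembly output
		// uint32_t get_sizes(ArgSize*) const;  // unpacks _argtypes, returns argc
	};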
 
 
--- a/js/src/nanojit/NativeThumb.cpp
+++ b/js/src/nanojit/NativeThumb.cpp
@@ -188,70 +188,31 @@ namespace nanojit
 		BX(LR); // return
 		MR(R0,R2); // return LinkRecord*
 		RegisterMask savingMask = restore | rmask(FRAME_PTR) | rmask(LR);
 		POP_mask(savingMask); // regs
 		return _nIns;
 #endif
 	}
 	
-	void Assembler::nArgEmitted(const CallInfo* call, uint32_t stackSlotCount, uint32_t iargs, uint32_t fargs)
-	{
-#if 1
-		(void)call;
-		(void)stackSlotCount;
-		(void)iargs;
-		(void)fargs;
-#else
-		// see if we have finished emitting all args.  If so then make sure the 
-		// new stack pointer is NJ_ALIGN_STACK aligned
-		if (iargs == call->iargs && fargs == call->fargs)
-		{
-			int32_t istack = iargs;
-			istack -= 4;
-			if (istack<=0)
-				return; // nothing on stack
-
-			const int32_t size = 4*stackSlotCount;
-			const int32_t extra = alignUp(size, NJ_ALIGN_STACK) - size; 
-			if (extra > 0)
-				SUBi(SP, extra);
-		}
-#endif
-	}
-	
-	void Assembler::nPostCallCleanup(const CallInfo* call)
+	void Assembler::asm_call(LInsp ins)
 	{
-#if 1
-		(void)call;
-#else
-		int32_t istack = call->iargs;
-		int32_t fstack = call->fargs;
-
-		istack -= 4;  // first 4 4B args are in registers
-		if (istack <= 0)
+        const CallInfo* call = callInfoFor(ins->fid());
+		CALL(call);
+        ArgSize sizes[10];
+        uint32_t argc = call->get_sizes(sizes);
+		for(uint32_t i=0; i < argc; i++)
 		{
-			return; // nothing on stack
-
-			//istack = 0;
-			//if (fstack == 0)
-				//return;  // only using ECX/EDX nothing passed on the stack so no cleanup needed
+            uint32_t j = argc - i - 1;
+            ArgSize sz = sizes[j];
+            NanoAssert(sz == ARGSIZE_LO || sz == ARGSIZE_Q);
+            // pre-assign registers R0-R3 for arguments (if they fit)
+            Register r = i < 4 ? argRegs[i] : UnknownReg;
+            asm_arg(sz, ins->arg(j), r);
 		}
-
-		const int32_t size = 4*istack + 8*fstack; // actual stack space used
-		NanoAssert( size > 0 );
-		
-		const int32_t extra = alignUp(size, NJ_ALIGN_STACK); 
-
-		// stack re-alignment 
-		// only pop our adjustment amount since callee pops args in FASTCALL mode
-		if (extra > 0)
-			{ ADDi(SP, extra); }
-#endif
-		return;
 	}
 	
 	void Assembler::nMarkExecute(Page* page, int32_t count, bool enable)
 	{
 	#ifdef UNDER_CE
 		DWORD dwOld;
 		VirtualProtect(page, NJ_PAGE_SIZE, PAGE_EXECUTE_READWRITE, &dwOld);
 	#endif
@@ -853,24 +814,24 @@ namespace nanojit
 			SUBi8(r,255);
 		}
 		else {
 			SUB(r, Scratch);
 			LDi(Scratch, i);
 		}
 	}
 
-	void Assembler::CALL(intptr_t addr, const char* nm)
+	void Assembler::CALL(const CallInfo *ci)
 	{
-		(void)nm;
+        intptr_t addr = ci->_address;
 		if (isB22((NIns*)addr, _nIns)) {
 			int offset = int(addr)-int(_nIns-2+2);
 			*(--_nIns) = (NIns)(0xF800 | ((offset>>1)&0x7FF) );
 			*(--_nIns) = (NIns)(0xF000 | ((offset>>12)&0x7FF) );
-			asm_output2("call %08X:%s",(addr),(nm));
+			asm_output2("call %08X:%s", addr, ci->_name);
 		}
 		else
 		{
 			underrunProtect(2*(10));
 		
 			if ( (((int(_nIns))&0xFFFF)%4) != 0)
 				 *(--_nIns) = (NIns)0;
 
@@ -879,17 +840,17 @@ namespace nanojit
 
 			*(--_nIns) = (NIns)(0x4600 | (1<<7) | (Scratch<<3) | (IP&7));
 			*(--_nIns) = (NIns)0;
 			*(--_nIns) = (short)((addr) >> 16);
 			*(--_nIns) = (short)((addr) & 0xFFFF);
 			*(--_nIns) = (NIns)(0x4700 | (IP<<3));
 			*(--_nIns) = (NIns)(0xE000 | (4>>1));
 			*(--_nIns) = (NIns)(0x4800 | (Scratch<<8) | (1));
-			asm_output2("call %08X:%s",(addr),(nm));
+			asm_output2("call %08X:%s", addr, ci->_name);
 		}
 	}
 
 #else // ARM_JIT
 		void Assembler::underrunProtect(int bytes)
 		{
 			intptr_t u = (bytes) + 4;
 			if ( (samepage(_nIns,_nSlot) && (((intptr_t)_nIns-u) <= intptr_t(_nSlot+1))) ||
@@ -903,35 +864,35 @@ namespace nanojit
 		}		
 
 	bool isB24(NIns *target, NIns *cur)
 	{
 		int offset = int(target)-int(cur-2+2);
 		return (-(1<<24) <= offset && offset < (1<<24));
 	}
 
-	void Assembler::CALL(intptr_t addr, const char* nm)
+	void Assembler::CALL(const CallInfo *ci)
 	{
-		(void)nm;
-		if (isB24((NIns*)addr,_nIns))
+        intptr_t addr = ci->_address;
+		if (isB24((NIns*)addr, _nIns))
 		{
 			// we can do this with a single BL call
 			underrunProtect(4);
 
 			BL(addr);
-			asm_output2("call %08X:%s",(addr),(nm));
+			asm_output2("call %08X:%s", addr, ci->_name);
 		}
 		else
 		{
 			underrunProtect(16);
 			*(--_nIns) = (NIns)((addr));
 			*(--_nIns) = (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) );
 			*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) );
 			*(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (PC<<16) | (IP<<12) | (4));
-			asm_output2("call %08X:%s",(addr),(nm));
+			asm_output2("call %08X:%s", addr, ci->_name);
 		}
 	}
 
 #endif // NJ_THUMB_JIT
 
 	
 	void Assembler::LD32_nochk(Register r, int32_t imm)
 	{
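
The near-call path in Assembler::CALL above uses the classic two-halfword Thumb BL encoding: 0xF000|hi11 followed by 0xF800|lo11, covering a signed, halfword-aligned offset measured from the BL's address + 4. A standalone sketch of just the encoding math:

	#include <stdint.h>
	void encodeThumbBL(uint16_t out[2], intptr_t target, intptr_t bl_addr)
	{
		intptr_t offset = target - (bl_addr + 4);
		out[0] = (uint16_t)(0xF000 | ((offset >> 12) & 0x7FF)); // high 11 bits, first in memory
		out[1] = (uint16_t)(0xF800 | ((offset >> 1)  & 0x7FF)); // low 11 bits, second
	}

Because nanojit emits backwards with *(--_nIns), the 0xF800 halfword is written before the 0xF000 one in the source even though it comes second in memory.
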
--- a/js/src/nanojit/NativeThumb.h
+++ b/js/src/nanojit/NativeThumb.h
@@ -139,24 +139,25 @@ namespace nanojit
 		void STMIA(Register base, RegisterMask regs);\
 		void LDMIA(Register base, RegisterMask regs);\
 		void ADDi(Register r, int32_t imm);\
 		void ADDi8(Register r, int32_t imm);\
 		void SUBi(Register r, int32_t imm);\
 		void SUBi8(Register r, int32_t imm);\
 		void JMP(NIns *target);\
         void LD32_nochk(Register r, int32_t imm);\
-		void CALL(intptr_t addr, const char* nm);\
+		void CALL(const CallInfo*);\
 		void nativePageReset();\
 		void nativePageSetup();\
 		int* _nPool;\
 		int* _nSlot;\
 		int* _nExitPool;\
 		int* _nExitSlot;
 
+    #define asm_farg(i) NanoAssert(false)
 
 	#define swapptrs()  { NIns* _tins = _nIns; _nIns=_nExitIns; _nExitIns=_tins; \
 								int* _npool = _nPool;\
 								int* _nslot = _nSlot;\
 								_nPool = _nExitPool; _nExitPool = _npool;\
 								_nSlot = _nExitSlot; _nExitSlot = _nslot;}
 
 #define BX(r)		do {\
--- a/js/src/nanojit/Nativei386.cpp
+++ b/js/src/nanojit/Nativei386.cpp
@@ -177,57 +177,65 @@ namespace nanojit
 		// Restore saved registers.
 		for (Register i=UnknownReg; i >= FirstReg; i = prevreg(i))
 			if (restore&rmask(i)) { POPr(i); } 
 		
 		POPr(FP); // Pop the pre-alignment SP.
         return  _nIns;
     }
 	
-	void Assembler::nArgEmitted(const CallInfo* call, uint32_t stackSlotCount, uint32_t iargs, uint32_t fargs)
+	void Assembler::asm_call(LInsp ins)
 	{
-		// see if we have finished emitting all args.  If so then make sure the 
-		// new stack pointer is NJ_ALIGN_STACK aligned
-		const uint32_t istack = call->count_iargs();
-		const uint32_t fstack = call->count_args() - istack;
-		//printf("call %s iargs %d fargs %d istack %d fstack %d\n",call->_name,iargs,fargs,istack,fstack);
-		AvmAssert(iargs <= istack);
-		AvmAssert(fargs <= fstack);
-		if (iargs == istack && fargs == fstack)
-		{
-			const int32_t size = 4*stackSlotCount;
-			const int32_t extra = alignUp(size, NJ_ALIGN_STACK) - size; 
-			if (extra > 0)
-				SUBi(SP, extra);
-		}
-	}
-	
-	void Assembler::nPostCallCleanup(const CallInfo* call)
-	{
+        uint32_t fid = ins->fid();
+        const CallInfo* call = callInfoFor(fid);
-		// must be signed, not unsigned
+		// istack/fstack below must be signed: iargs-2 can go negative before it is clamped
-		int32_t istack = call->count_iargs();
-		int32_t fstack = call->count_args() - istack;
+		const uint32_t iargs = call->count_iargs();
+		int32_t fstack = call->count_args() - iargs;
 
-		istack -= 2;  // first 2 4B args are in registers
+        int32_t extra = 0;
+		int32_t istack = iargs-2;  // first 2 4B args are in registers
 		if (istack <= 0)
 		{
 			istack = 0;
-			if (fstack == 0)
-				return;  // only using ECX/EDX nothing passed on the stack so no cleanup needed
 		}
 
 		const int32_t size = 4*istack + 8*fstack; // actual stack space used
-		NanoAssert( size > 0 );
-		
-		const int32_t extra = alignUp(size, NJ_ALIGN_STACK) - (size); 
+        if (size) {
+		    // stack re-alignment 
+		    // only pop our adjustment amount since callee pops args in FASTCALL mode
+		    extra = alignUp(size, NJ_ALIGN_STACK) - (size); 
+		    if (extra > 0)
+			    ADDi(SP, extra);
+        }
+
+		CALL(call);
 
-		// stack re-alignment 
-		// only pop our adjustment amount since callee pops args in FASTCALL mode
+		// make sure fpu stack is empty before call (restoreCallerSaved)
+		NanoAssert(_allocator.isFree(FST0));
+		// note: this code requires that ref arguments (ARGSIZE_Q)
+        // be one of the first two arguments
+		// pre-assign registers to the first 2 4B args
+		const int max_regs = (iargs < 2) ? iargs : 2;
+		int n = 0;
+
+        ArgSize sizes[10];
+        uint32_t argc = call->get_sizes(sizes);
+
+		for(uint32_t i=0; i < argc; i++)
+		{
+			uint32_t j = argc-i-1;
+            ArgSize sz = sizes[j];
+            Register r = UnknownReg;
+            if (n < max_regs && sz != ARGSIZE_F) 
+			    r = argRegs[n++]; // tell asm_arg what reg to use
+            asm_arg(sz, ins->arg(j), r);
+		}
+
 		if (extra > 0)
-			{ ADDi(SP, extra); }
+			SUBi(SP, extra);
 	}
 	
 	void Assembler::nMarkExecute(Page* page, int32_t count, bool enable)
 	{
 		#ifdef _MAC
 			MakeDataExecutable(page, count*NJ_PAGE_SIZE);
 		#elif defined WIN32
 			DWORD dwIgnore;
@@ -664,30 +672,26 @@ namespace nanojit
 			PUSHm(disp(rA), FP);
 		}
 		else
 		{
 			PUSHr(rA->reg);
 		}
 	}
 
-	void Assembler::asm_farg(LInsp ins)
+	void Assembler::asm_farg(LInsp p)
 	{
-		LIns* p = ins->oprnd1();
 		Register r = findRegFor(p, FpRegs);
 		if (rmask(r) & XmmRegs) {
 			STQ(0, SP, r); 
 		} else {
 			FSTPQ(0, SP);
 		}
 		PUSHr(ECX); // 2*pushr is smaller than sub
 		PUSHr(ECX);
-		_stackUsed += 2;
-		++_fargs;
-		nArgEmitted(_call, _stackUsed, _iargs, _fargs);
 	}
 
 	void Assembler::asm_fop(LInsp ins)
 	{
 		LOpcode op = ins->opcode();
 		if (sse2) 
 		{
 			LIns *lhs = ins->oprnd1();
@@ -769,23 +773,22 @@ namespace nanojit
 		{
 			int d = findMemFor(ins->oprnd1());
 			FILD(d, FP);
 		}
 	}
 
 	Register Assembler::asm_prep_fcall(Reservation *rR, LInsp ins)
 	{
-		Register rr;
 		if (rR) {
+    		Register rr;
 			if ((rr=rR->reg) != UnknownReg && (rmask(rr) & XmmRegs))
 				evict(rr);
 		}
-		prepResultReg(ins, rmask(FST0));
-		return FST0;
+		return prepResultReg(ins, rmask(FST0));
 	}
 
 	void Assembler::asm_u2f(LInsp ins)
 	{
 		// where our result goes
 		Register rr = prepResultReg(ins, FpRegs);
 		const int disp = -8;
 		const Register base = ESP;
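
Worked numbers for the frame sizing in asm_call above, assuming NJ_ALIGN_STACK is 16 (alignUp here is a local stand-in for nanojit's helper): with three int args (two ride in ECX/EDX) and one double, 12 bytes are actually pushed and 4 bytes of padding keep SP aligned.

	#include <stdint.h>
	static inline int32_t alignUp(int32_t v, int32_t a) { return (v + a - 1) & ~(a - 1); }
	int32_t callPadding()
	{
		int32_t istack = 3 - 2;                   // stacked int args
		int32_t fstack = 1;                       // stacked doubles
		int32_t size   = 4 * istack + 8 * fstack; // 12 bytes pushed
		return alignUp(size, 16) - size;          // 4 bytes of padding
	}

Since emission runs backwards, the run-time order is SUB SP,4; pushes; CALL; ADD SP,4. The callee pops its own arguments under FASTCALL, so only the padding is popped here.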
--- a/js/src/nanojit/Nativei386.h
+++ b/js/src/nanojit/Nativei386.h
@@ -128,17 +128,18 @@ namespace nanojit
 
 	#define DECLARE_PLATFORM_ASSEMBLER()	\
         const static Register argRegs[2], retRegs[2]; \
 		bool x87Dirty;						\
         bool sse2;							\
 		bool has_cmov; \
 		bool pad[1];\
 		void nativePageReset();\
-		void nativePageSetup();
+		void nativePageSetup();\
+        void asm_farg(LInsp);
 		
 	#define swapptrs()  { NIns* _tins = _nIns; _nIns=_nExitIns; _nExitIns=_tins; }
 		
 	// enough room for n bytes
 	#define underrunProtect(n)									\
 		{														\
 			intptr_t u = n + sizeof(PageHeader)/sizeof(NIns);	\
 			if ( !samepage(_nIns-u,_nIns-1) )					\
@@ -653,17 +654,19 @@ namespace nanojit
 #define FDIVR(d,b)	do { FPUm(0xdc07, d, b);		asm_output2("fdivr %d(%s)",d,gpn(b)); } while(0)
 #define FINCSTP()	do { FPUc(0xd9f7);				asm_output("fincstp"); } while(0)
 #define FSTP(r)		do { FPU(0xddd8, r&7);			asm_output1("fstp %s",fpn(r)); fpu_pop();} while(0)
 #define FCOMP()		do { FPUc(0xD8D9);				asm_output("fcomp"); fpu_pop();} while(0)
 #define FCOMPP()	do { FPUc(0xDED9);				asm_output("fcompp"); fpu_pop();fpu_pop();} while(0)
 #define FLDr(r)		do { FPU(0xd9c0,r);				asm_output1("fld %s",fpn(r)); fpu_push(); } while(0)
 #define EMMS()		do { FPUc(0x0f77);				asm_output("emms"); } while (0)
 
-#define CALL(a,nm)	do { \
-	  underrunProtect(5);					\
-	  int offset = (a) - ((int)_nIns); \
-	  IMM32( (uint32_t)offset );	\
-	  *(--_nIns) = 0xE8;		\
-	  asm_output1("call %s",(nm)); \
-	} while (0)
+#define CALL(c)	do { \
+  underrunProtect(5);					\
+  int offset = (c->_address) - ((int)_nIns); \
+  IMM32( (uint32_t)offset );	\
+  *(--_nIns) = 0xE8;		\
+  verbose_only(asm_output1("call %s",(c->_name));) \
+  debug_only(if ((c->_argtypes&3)==ARGSIZE_F) fpu_push();)\
+} while (0)
+
 }
 #endif // __nanojit_Nativei386__
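
For reference, the rewritten CALL(c) macro still produces the 5-byte E8 rel32 near call. Because code is generated backwards, _nIns already points at the instruction after the call, which is exactly the base the rel32 displacement is measured from. A freestanding sketch of the same emission, with buffer handling simplified:

	#include <stdint.h>
	#include <string.h>
	uint8_t* emitCallRel32(uint8_t* next_ins, uintptr_t target)
	{
		int32_t rel = (int32_t)(target - (uintptr_t)next_ins); // disp from end of the call
		uint8_t* p = next_ins - 4;
		memcpy(p, &rel, 4);   // little-endian rel32, as IMM32 does
		*--p = 0xE8;          // CALL rel32 opcode
		return p;             // the new _nIns
	}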