[JAEGER] Merge from tracemonkey.
author:    David Mandelin <dmandelin@mozilla.com>
date:      Thu, 19 Aug 2010 17:30:22 -0700
changeset: 53467 8a0513a5c024cdcaa92742f25861a8ce1fa7ac6f
parent:    53466 a6f55b452f916635e8cc51be5a4418f41d08c410
parent:    51133 45a893397e3004302680c7837d677425577ad447
child:     53468 c2f1e5150e18688ebede3eadc968d3bf082f9a5b
milestone: 2.0b5pre
--- a/js/src/lirasm/tests/call1.in
+++ b/js/src/lirasm/tests/call1.in
@@ -1,12 +1,12 @@
 ptr = allocp 8
 a = immi 65
-sti a ptr 0
+sti2c a ptr 0
 b = immi 66
-sti b ptr 1
+sti2c b ptr 1
 c = immi 67
-sti c ptr 2
+sti2c c ptr 2
 zero = immi 0
-sti zero ptr 3
+sti2c zero ptr 3
 ss = calli puts cdecl ptr
 nn = gei ss zero
 reti nn
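
These lirasm tests track an opcode split in LIR: sti remains the full-word store, while single-byte stores are now spelled sti2c ("store int, truncated to char"). Each character of the test string is therefore written with an explicit one-byte store, as in the updated test above:

    a = immi 65
    sti2c a ptr 0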
--- a/js/src/lirasm/tests/multfrag1.in
+++ b/js/src/lirasm/tests/multfrag1.in
@@ -1,18 +1,18 @@
 .begin a
 ptr = allocp 8
 a = immi 65
-sti a ptr 0
+sti2c a ptr 0
 b = immi 66
-sti b ptr 1
+sti2c b ptr 1
 c = immi 67
-sti c ptr 2
+sti2c c ptr 2
 zero = immi 0
-sti zero ptr 3
+sti2c zero ptr 3
 ss = calli puts cdecl ptr
 nn = gei ss zero
 reti nn
 .end
 
 .begin b
 rr = calli a fastcall
 reti rr
--- a/js/src/nanojit-import-rev
+++ b/js/src/nanojit-import-rev
@@ -1,1 +1,1 @@
-982cd218ddb049bdbbcdda4fa3a9d7e40e45e0be
+c7009f5cd83ea028b98f59e1f8830a76ba27c1dd
--- a/js/src/nanojit/Assembler.cpp
+++ b/js/src/nanojit/Assembler.cpp
@@ -36,25 +36,37 @@
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
 
 #include "nanojit.h"
 
 #ifdef FEATURE_NANOJIT
 
-#ifdef VTUNE
+#ifdef VMCFG_VTUNE
 #include "../core/CodegenLIR.h"
 #endif
 
 #ifdef _MSC_VER
     // disable some specific warnings which are normally useful, but pervasive in the code-gen macros
     #pragma warning(disable:4310) // cast truncates constant value
 #endif
 
+#ifdef VMCFG_VTUNE
+namespace vtune {
+    using namespace nanojit;
+    void vtuneStart(void*, NIns*);
+    void vtuneEnd(void*, NIns*);
+    void vtuneLine(void*, int, NIns*);
+    void vtuneFile(void*, void*);
+}
+using namespace vtune;
+#endif // VMCFG_VTUNE
+
+
 namespace nanojit
 {
     /**
      * Need the following:
      *
      *    - merging paths ( build a graph? ), possibly use external rep to drive codegen
      */
     Assembler::Assembler(CodeAlloc& codeAlloc, Allocator& dataAlloc, Allocator& alloc, AvmCore* core, LogControl* logc, const Config& config)
@@ -69,18 +81,18 @@ namespace nanojit
     #if NJ_USES_IMMD_POOL
         , _immDPool(alloc)
     #endif
         , _epilogue(NULL)
         , _err(None)
     #if PEDANTIC
         , pedanticTop(NULL)
     #endif
-    #ifdef VTUNE
-        , cgen(NULL)
+    #ifdef VMCFG_VTUNE
+        , vtuneHandle(NULL)
     #endif
         , _config(config)
     {
         nInit(core);
         (void)logc;
         verbose_only( _logc = logc; )
         verbose_only( _outputCache = 0; )
         verbose_only( outline[0] = '\0'; )
@@ -181,20 +193,21 @@ namespace nanojit
     #if NJ_USES_IMMD_POOL
         _immDPool.clear();
     #endif
     }
 
     void Assembler::registerResetAll()
     {
         nRegisterResetAll(_allocator);
+        _allocator.managed = _allocator.free;
 
         // At start, should have some registers free and none active.
         NanoAssert(0 != _allocator.free);
-        NanoAssert(0 == _allocator.countActive());
+        NanoAssert(0 == _allocator.activeMask());
 #ifdef NANOJIT_IA32
         debug_only(_fpuStkDepth = 0; )
 #endif
     }
 
     // Legend for register sets: A = allowed, P = preferred, F = free, S = SavedReg.
     //
     // Finds a register in 'setA___' to store the result of 'ins' (one from
@@ -268,24 +281,16 @@ namespace nanojit
         if (start)
             CodeAlloc::add(codeList, start, end);
 
         // CodeAlloc contract: allocations never fail
         _codeAlloc.alloc(start, end);
         verbose_only( nBytes += (end - start) * sizeof(NIns); )
         NanoAssert(uintptr_t(end) - uintptr_t(start) >= (size_t)LARGEST_UNDERRUN_PROT);
         eip = end;
-
-        #ifdef VTUNE
-        if (_nIns && _nExitIns) {
-            //cgen->jitAddRecord((uintptr_t)list->code, 0, 0, true); // add placeholder record for top of page
-            cgen->jitCodePosUpdate((uintptr_t)list->code);
-            cgen->jitPushInfo(); // new page requires new entry
-        }
-        #endif
     }
 
     void Assembler::reset()
     {
         _nIns = 0;
         _nExitIns = 0;
         codeStart = codeEnd = 0;
         exitStart = exitEnd = 0;
@@ -355,33 +360,36 @@ namespace nanojit
         _activation.checkForResourceConsistency(_allocator);
 
         registerConsistencyCheck();
     }
 
     void Assembler::registerConsistencyCheck()
     {
         RegisterMask managed = _allocator.managed;
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            if (rmask(r) & managed) {
-                // A register managed by register allocation must be either
-                // free or active, but not both.
-                if (_allocator.isFree(r)) {
-                    NanoAssertMsgf(_allocator.getActive(r)==0,
-                        "register %s is free but assigned to ins", gpn(r));
-                } else {
-                    // An LIns defining a register must have that register in
-                    // its reservation.
-                    LIns* ins = _allocator.getActive(r);
-                    NanoAssert(ins);
-                    NanoAssertMsg(r == ins->getReg(), "Register record mismatch");
-                }
+        for (Register r = lsReg(managed); managed; r = nextLsReg(managed, r)) {
+            // A register managed by register allocation must be either
+            // free or active, but not both.
+            if (_allocator.isFree(r)) {
+                NanoAssertMsgf(_allocator.getActive(r)==0,
+                    "register %s is free but assigned to ins", gpn(r));
             } else {
-                // A register not managed by register allocation must be
-                // neither free nor active.
+                // An LIns defining a register must have that register in
+                // its reservation.
+                LIns* ins = _allocator.getActive(r);
+                NanoAssert(ins);
+                NanoAssertMsg(r == ins->getReg(), "Register record mismatch");
+            }
+        }
+
+        RegisterMask not_managed = ~_allocator.managed;
+        for (Register r = lsReg(not_managed); not_managed; r = nextLsReg(not_managed, r)) {
+            // A register not managed by register allocation must be
+            // neither free nor active.
+            if (r <= LastReg) {
                 NanoAssert(!_allocator.isFree(r));
                 NanoAssert(!_allocator.getActive(r));
             }
         }
     }
     #endif /* _DEBUG */
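
The rewritten loops in this file iterate only over the set bits of a RegisterMask instead of scanning every register from FirstReg to LastReg. A minimal sketch of the shape the new helpers presumably have (the names lsReg/nextLsReg come from the diff; the ctz-based bodies and the typedefs are assumptions, not the literal nanojit definitions):

    // Sketch only: assumes RegisterMask is a 32-bit unsigned integer and
    // rmask(r) == (1u << r). The real helpers live in the RegAlloc headers.
    typedef unsigned RegisterMask;
    typedef int      Register;

    static inline Register lsReg(RegisterMask mask) {
        return Register(__builtin_ctz(mask));     // lowest set bit; mask != 0
    }

    static inline Register nextLsReg(RegisterMask& mask, Register r) {
        mask &= ~(1u << r);                       // retire the register just visited
        return mask ? lsReg(mask) : Register(0);  // caller's loop re-tests 'mask'
    }

Because the loop condition re-tests the mutated mask, the body never runs on an empty set; inside an activeMask()-driven loop, getActive(r) is therefore always non-NULL, which is why releaseRegisters() below can drop its old "if (ins)" test.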
 
     void Assembler::findRegFor2(RegisterMask allowa, LIns* ia, Register& ra,
@@ -1103,37 +1111,45 @@ namespace nanojit
 
         // at this point all our new code is in the d-cache and not the i-cache,
         // so flush the i-cache on cpu's that need it.
         CodeAlloc::flushICache(codeList);
 
         // save entry point pointers
         frag->fragEntry = fragEntry;
         frag->setCode(_nIns);
+
+#ifdef VMCFG_VTUNE
+        if (vtuneHandle)
+        {
+            vtuneEnd(vtuneHandle, codeEnd);
+            vtuneStart(vtuneHandle, _nIns);
+        }
+#endif
+
         PERFM_NVPROF("code", CodeAlloc::size(codeList));
 
 #ifdef NANOJIT_IA32
         NanoAssertMsgf(_fpuStkDepth == 0,"_fpuStkDepth %d\n",_fpuStkDepth);
 #endif
 
         debug_only( pageValidate(); )
         NanoAssert(_branchStateMap.isEmpty());
     }
 
     void Assembler::releaseRegisters()
     {
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
+        RegisterMask active = _allocator.activeMask();
+        for (Register r = lsReg(active); active; r = nextLsReg(active, r))
         {
             LIns *ins = _allocator.getActive(r);
-            if (ins) {
-                // Clear reg allocation, preserve stack allocation.
-                _allocator.retire(r);
-                NanoAssert(r == ins->getReg());
-                ins->clearReg();
-            }
+            // Clear reg allocation, preserve stack allocation.
+            _allocator.retire(r);
+            NanoAssert(r == ins->getReg());
+            ins->clearReg();
         }
     }
 
 #ifdef PERFM
 #define countlir_live() _nvprof("lir-live",1)
 #define countlir_ret() _nvprof("lir-ret",1)
 #define countlir_alloc() _nvprof("lir-alloc",1)
 #define countlir_var() _nvprof("lir-var",1)
@@ -1726,17 +1742,17 @@ namespace nanojit
                 #if NJ_JTBL_SUPPORTED
                 case LIR_jtbl: {
                     countlir_jtbl();
                     ins->oprnd1()->setResultLive();
                     // Multiway jump can contain both forward and backward jumps.
                     // Out of range indices aren't allowed or checked.
                     // Code after this jtbl instruction is unreachable.
                     releaseRegisters();
-                    NanoAssert(_allocator.countActive() == 0);
+                    NanoAssert(_allocator.activeMask() == 0);
 
                     uint32_t count = ins->getTableSize();
                     bool has_back_edges = false;
 
                     // Merge the regstates of labels we have already seen.
                     for (uint32_t i = count; i-- > 0;) {
                         LIns* to = ins->getTarget(i);
                         LabelState *lstate = _labels.get(to);
@@ -1751,17 +1767,17 @@ namespace nanojit
                     asm_output("forward edges");
 
                     // In a multi-way jump, the register allocator has no ability to deal
                     // with two existing edges that have conflicting register assignments, unlike
                     // a conditional branch where code can be inserted on the fall-through path
                     // to reconcile registers.  So, frontends *must* insert LIR_regfence at labels of
                     // forward jtbl jumps.  Check here to make sure no registers were picked up from
                     // any forward edges.
-                    NanoAssert(_allocator.countActive() == 0);
+                    NanoAssert(_allocator.activeMask() == 0);
 
                     if (has_back_edges) {
                         handleLoopCarriedExprs(pending_lives);
                         // save merged (empty) register state at target labels we haven't seen yet
                         for (uint32_t i = count; i-- > 0;) {
                             LIns* to = ins->getTarget(i);
                             LabelState *lstate = _labels.get(to);
                             if (!lstate) {
@@ -1923,37 +1939,38 @@ namespace nanojit
                     // It must be impure or pure-and-extant -- it couldn't be
                     // pure-and-not-extant, because there's no way the codegen
                     // for a call can be folded into the codegen of another
                     // LIR instruction.
                     NanoAssert(!ins->callInfo()->_isPure || ins->isExtant());
                     asm_call(ins);
                     break;
 
-                #ifdef VTUNE
+                #ifdef VMCFG_VTUNE
                 case LIR_file: {
-                    // we traverse backwards so we are now hitting the file
-                    // that is associated with a bunch of LIR_lines we already have seen
-                    ins->oprnd1()->setResultLive();
-                    uintptr_t currentFile = ins->oprnd1()->immI();
-                    cgen->jitFilenameUpdate(currentFile);
+                    // We traverse backwards, so this LIR_file applies to the
+                    // LIR_lines we have already seen.
+                    if (vtuneHandle) {
+                        void * currentFile = (void *) ins->oprnd1()->immI();
+                        vtuneFile(vtuneHandle, currentFile);
+                    }
                     break;
                 }
-
                 case LIR_line: {
-                    // add a new table entry, we don't yet knwo which file it belongs
-                    // to so we need to add it to the update table too
-                    // note the alloc, actual act is delayed; see above
-                    ins->oprnd1()->setResultLive();
-                    uint32_t currentLine = (uint32_t) ins->oprnd1()->immI();
-                    cgen->jitLineNumUpdate(currentLine);
-                    cgen->jitAddRecord((uintptr_t)_nIns, 0, currentLine, true);
+                    // Add a new table entry; we don't yet know which file it
+                    // belongs to, so add it to the update table as well (the
+                    // actual record emission is delayed; see above).
+                    if (vtuneHandle) {
+                        uint32_t currentLine = (uint32_t) ins->oprnd1()->immI();
+                        vtuneLine(vtuneHandle, currentLine, _nIns);
+                    }
                     break;
                 }
-                #endif // VTUNE
+                #endif // VMCFG_VTUNE
+
             }
 
 #ifdef NJ_VERBOSE
             // We do final LIR printing inside this loop to avoid printing
             // dead LIR instructions.  We print the LIns after generating the
             // code.  This ensures that the LIns will appear in debug output
             // *before* the native code, because Assembler::outputf()
             // prints everything in reverse.
@@ -1963,20 +1980,16 @@ namespace nanojit
                 LInsPrinter* printer = _thisfrag->lirbuf->printer;
                 outputf("    %s", printer->formatIns(&b, ins));
             }
 #endif
 
             if (error())
                 return;
 
-        #ifdef VTUNE
-            cgen->jitCodePosUpdate((uintptr_t)_nIns);
-        #endif
-
             // check that all is well (don't check in exit paths since its more complicated)
             debug_only( pageValidate(); )
             debug_only( resourceConsistencyCheck();  )
         }
     }
 
     /*
      * Write a jump table for the given SwitchInfo and store the table
@@ -2068,34 +2081,33 @@ namespace nanojit
     void Assembler::printRegState()
     {
         char* s = &outline[0];
         VMPI_memset(s, ' ', 26);  s[26] = '\0';
         s += VMPI_strlen(s);
         VMPI_sprintf(s, "RR");
         s += VMPI_strlen(s);
 
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
+        RegisterMask active = _allocator.activeMask();
+        for (Register r = lsReg(active); active != 0; r = nextLsReg(active, r)) {
             LIns *ins = _allocator.getActive(r);
-            if (ins) {
-                NanoAssertMsg(!_allocator.isFree(r),
-                              "Coding error; register is both free and active! " );
-                RefBuf b;
-                const char* n = _thisfrag->lirbuf->printer->formatRef(&b, ins);
+            NanoAssertMsg(!_allocator.isFree(r),
+                          "Coding error; register is both free and active! " );
+            RefBuf b;
+            const char* n = _thisfrag->lirbuf->printer->formatRef(&b, ins);
 
-                if (ins->isop(LIR_paramp) && ins->paramKind()==1 &&
-                    r == Assembler::savedRegs[ins->paramArg()])
-                {
-                    // dont print callee-saved regs that arent used
-                    continue;
-                }
+            if (ins->isop(LIR_paramp) && ins->paramKind()==1 &&
+                r == Assembler::savedRegs[ins->paramArg()])
+            {
+                // don't print callee-saved regs that aren't used
+                continue;
+            }
 
-                VMPI_sprintf(s, " %s(%s)", gpn(r), n);
-                s += VMPI_strlen(s);
-            }
+            VMPI_sprintf(s, " %s(%s)", gpn(r), n);
+            s += VMPI_strlen(s);
         }
         output();
     }
 
     void Assembler::printActivationState()
     {
         char* s = &outline[0];
         VMPI_memset(s, ' ', 26);  s[26] = '\0';
@@ -2231,36 +2243,33 @@ namespace nanojit
         // Find the top GpRegs that are candidates to put in SavedRegs.
 
         // 'tosave' is a binary heap stored in an array.  The root is tosave[0];
         // the parent of tosave[j] is tosave[j/2] (see the bubble-up loop below).
 
         Register tosave[LastReg-FirstReg+1];
         int len=0;
         RegAlloc *regs = &_allocator;
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            if (rmask(r) & GpRegs & ~ignore) {
-                LIns *ins = regs->getActive(r);
-                if (ins) {
-                    if (canRemat(ins)) {
-                        NanoAssert(ins->getReg() == r);
-                        evict(ins);
-                    }
-                    else {
-                        int32_t pri = regs->getPriority(r);
-                        // add to heap by adding to end and bubbling up
-                        int j = len++;
-                        while (j > 0 && pri > regs->getPriority(tosave[j/2])) {
-                            tosave[j] = tosave[j/2];
-                            j /= 2;
-                        }
-                        NanoAssert(size_t(j) < sizeof(tosave)/sizeof(tosave[0]));
-                        tosave[j] = r;
-                    }
+        RegisterMask evict_set = regs->activeMask() & GpRegs & ~ignore;
+        for (Register r = lsReg(evict_set); evict_set; r = nextLsReg(evict_set, r)) {
+            LIns *ins = regs->getActive(r);
+            if (canRemat(ins)) {
+                NanoAssert(ins->getReg() == r);
+                evict(ins);
+            }
+            else {
+                int32_t pri = regs->getPriority(r);
+                // add to heap by adding to end and bubbling up
+                int j = len++;
+                while (j > 0 && pri > regs->getPriority(tosave[j/2])) {
+                    tosave[j] = tosave[j/2];
+                    j /= 2;
                 }
+                NanoAssert(size_t(j) < sizeof(tosave)/sizeof(tosave[0]));
+                tosave[j] = r;
             }
         }
 
         // Now primap has the live exprs in priority order.
         // Allocate each of the top priority exprs to a SavedReg.
 
         RegisterMask allow = SavedRegs;
         while (allow && len > 0) {
@@ -2292,34 +2301,22 @@ namespace nanojit
                 tosave[j] = last;
             }
         }
 
         // now evict everything else.
         evictSomeActiveRegs(~(SavedRegs | ignore));
     }
 
-    void Assembler::evictAllActiveRegs()
-    {
-        // generate code to restore callee saved registers
-        // @todo speed this up
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            evictIfActive(r);
-        }
-    }
-
+    // Generate code to restore any registers in 'regs' that are currently active.
     void Assembler::evictSomeActiveRegs(RegisterMask regs)
     {
-        // generate code to restore callee saved registers
-        // @todo speed this up
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            if ((rmask(r) & regs)) {
-                evictIfActive(r);
-            }
-        }
+        RegisterMask evict_set = regs & _allocator.activeMask();
+        for (Register r = lsReg(evict_set); evict_set; r = nextLsReg(evict_set, r))
+            evict(_allocator.getActive(r));
     }
 
     /**
      * Merge the current regstate with a previously stored version.
      *
      * Situation                            Change to _allocator
      * ---------                            --------------------
      * !current & !saved
@@ -2332,29 +2329,23 @@ namespace nanojit
     {
         Register regsTodo[LastReg + 1];
         LIns* insTodo[LastReg + 1];
         int nTodo = 0;
 
         // Do evictions and pops first.
         verbose_only(bool shouldMention=false; )
         // The obvious thing to do here is to iterate from FirstReg to LastReg.
-        // viz: for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) ...
         // However, on ARM that causes lower-numbered integer registers
         // to be saved at higher addresses, which inhibits the formation
         // of load/store multiple instructions.  Hence iterate the loop the
-        // other way.  The "r <= LastReg" guards against wraparound in
-        // the case where Register is treated as unsigned and FirstReg is zero.
-        //
-        // Note, the loop var is deliberately typed as int (*not* Register)
-        // to outsmart compilers that will otherwise report
-        // "error: comparison is always true due to limited range of data type".
-        for (int ri = LastReg; ri >= FirstReg && ri <= LastReg; ri = int(prevreg(Register(ri))))
+        // other way.
+        RegisterMask reg_set = _allocator.activeMask() | saved.activeMask();
+        for (Register r = msReg(reg_set); reg_set; r = nextMsReg(reg_set, r))
         {
-            Register const r = Register(ri);
             LIns* curins = _allocator.getActive(r);
             LIns* savedins = saved.getActive(r);
             if (curins != savedins)
             {
                 if (savedins) {
                     regsTodo[nTodo] = r;
                     insTodo[nTodo] = savedins;
                     nTodo++;
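
Iterating most-significant-bit first preserves the established backward order (otherwise lower-numbered ARM registers would land at higher addresses, inhibiting load/store-multiple formation) without the signed/unsigned wraparound dance the removed comment described. The reverse-direction helpers presumably mirror the forward pair, under the same assumptions as the lsReg() sketch earlier:

    static inline Register msReg(RegisterMask mask) {
        return Register(31 - __builtin_clz(mask)); // highest set bit; mask != 0
    }

    static inline Register nextMsReg(RegisterMask& mask, Register r) {
        mask &= ~(1u << r);                        // retire the register just visited
        return mask ? msReg(mask) : Register(0);
    }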
@@ -2398,17 +2389,18 @@ namespace nanojit
     void Assembler::unionRegisterState(RegAlloc& saved)
     {
         Register regsTodo[LastReg + 1];
         LIns* insTodo[LastReg + 1];
         int nTodo = 0;
 
         // Do evictions and pops first.
         verbose_only(bool shouldMention=false; )
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
+        RegisterMask reg_set = _allocator.activeMask() | saved.activeMask();
+        for (Register r = lsReg(reg_set); reg_set; r = nextLsReg(reg_set, r))
         {
             LIns* curins = _allocator.getActive(r);
             LIns* savedins = saved.getActive(r);
             if (curins != savedins)
             {
                 if (savedins) {
                     regsTodo[nTodo] = r;
                     insTodo[nTodo] = savedins;
@@ -2448,25 +2440,24 @@ namespace nanojit
 
     // Scan table for instruction with the lowest priority, meaning it is used
     // furthest in the future.
     LIns* Assembler::findVictim(RegisterMask allow)
     {
         NanoAssert(allow);
         LIns *ins, *vic = 0;
         int allow_pri = 0x7fffffff;
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
+        RegisterMask vic_set = allow & _allocator.activeMask();
+        for (Register r = lsReg(vic_set); vic_set; r = nextLsReg(vic_set, r))
         {
-            if ((allow & rmask(r)) && (ins = _allocator.getActive(r)) != 0)
-            {
-                int pri = canRemat(ins) ? 0 : _allocator.getPriority(r);
-                if (!vic || pri < allow_pri) {
-                    vic = ins;
-                    allow_pri = pri;
-                }
+            ins = _allocator.getActive(r);
+            int pri = canRemat(ins) ? 0 : _allocator.getPriority(r);
+            if (!vic || pri < allow_pri) {
+                vic = ins;
+                allow_pri = pri;
             }
         }
         NanoAssert(vic != 0);
         return vic;
     }
 
 #ifdef NJ_VERBOSE
     char Assembler::outline[8192];
--- a/js/src/nanojit/Assembler.h
+++ b/js/src/nanojit/Assembler.h
@@ -191,17 +191,17 @@ namespace nanojit
     };
 
     typedef SeqBuilder<NIns*> NInsList;
     typedef HashMap<NIns*, LIns*> NInsMap;
 #if NJ_USES_IMMD_POOL
     typedef HashMap<uint64_t, uint64_t*> ImmDPoolMap;
 #endif
 
-#ifdef VTUNE
+#ifdef VMCFG_VTUNE
     class avmplus::CodegenLIR;
 #endif
 
     class LabelState
     {
     public:
         RegAlloc regs;
         NIns *addr;
@@ -266,18 +266,18 @@ namespace nanojit
             // Sets 'outlineEOL'.
             void setOutputForEOL(const char* format, ...);
 
             void printRegState();
             void printActivationState();
             #endif // NJ_VERBOSE
 
         public:
-            #ifdef VTUNE
-            avmplus::CodegenLIR *cgen;
+            #ifdef VMCFG_VTUNE
+            void* vtuneHandle;
             #endif
 
             Assembler(CodeAlloc& codeAlloc, Allocator& dataAlloc, Allocator& alloc, AvmCore* core, LogControl* logc, const Config& config);
 
             void        compile(Fragment *frag, Allocator& alloc, bool optimize
                                 verbose_only(, LInsPrinter*));
 
             void        endAssembly(Fragment* frag);
@@ -310,17 +310,21 @@ namespace nanojit
 
             uint32_t    arReserve(LIns* ins);
             void        arFree(LIns* ins);
             void        arReset();
 
             Register    registerAlloc(LIns* ins, RegisterMask allow, RegisterMask prefer);
             Register    registerAllocTmp(RegisterMask allow);
             void        registerResetAll();
-            void        evictAllActiveRegs();
+            void        evictAllActiveRegs() {
+                // The evicted set will be intersected with activeMask(),
+                // so use an all-1s mask to avoid an extra load or call.
+                evictSomeActiveRegs(~RegisterMask(0));
+            }
             void        evictSomeActiveRegs(RegisterMask regs);
             void        evictScratchRegsExcept(RegisterMask ignore);
             void        intersectRegisterState(RegAlloc& saved);
             void        unionRegisterState(RegAlloc& saved);
             void        assignSaved(RegAlloc &saved, RegisterMask skip);
             LIns*       findVictim(RegisterMask allow);
 
             Register    getBaseReg(LIns *ins, int &d, RegisterMask allow);
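
Net effect of the header change: under VMCFG_VTUNE the assembler no longer depends on avmplus::CodegenLIR at all; it stores an opaque handle and forwards it to the four free hooks declared in Assembler.cpp. A usage sketch (the hook wiring is the embedder's job and is an assumption here, not part of this patch):

    // The embedder owns whatever vtuneHandle points to; nanojit never
    // dereferences it, it only passes it back through vtuneStart/vtuneEnd/
    // vtuneLine/vtuneFile.
    assm->vtuneHandle = myJitProfilingState;   // hypothetical embedder state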
--- a/js/src/nanojit/CodeAlloc.cpp
+++ b/js/src/nanojit/CodeAlloc.cpp
@@ -42,17 +42,21 @@
 //#define DOPROF
 #include "../vprof/vprof.h"
 
 #ifdef FEATURE_NANOJIT
 
 namespace nanojit
 {
     static const bool verbose = false;
-#if defined(NANOJIT_ARM)
+#ifdef VMCFG_VTUNE
+    // The VTune JIT profiling API can't handle non-contiguous methods,
+    // so use a huge allocation size to keep each method contiguous.
+    static const int pagesPerAlloc = 128; // 1MB
+#elif defined(NANOJIT_ARM)
     // ARM requires single-page allocations, due to the constant pool that
     // lives on each page that must be reachable by a 4kb pcrel load.
     static const int pagesPerAlloc = 1;
 #else
     static const int pagesPerAlloc = 16;
 #endif
 
     CodeAlloc::CodeAlloc()
--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@@ -1971,23 +1971,26 @@ namespace nanojit
         m_capNL[LInsImmI]  = 128;
         m_capNL[LInsImmQ]  = PTR_SIZE(0, 16);
         m_capNL[LInsImmD]  = 16;
         m_capNL[LIns1]     = 256;
         m_capNL[LIns2]     = 512;
         m_capNL[LIns3]     = 16;
         m_capNL[LInsCall]  = 64;
 
-        for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind))
+        for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind)) {
             m_listNL[nlkind] = new (alloc) LIns*[m_capNL[nlkind]];
+            m_usedNL[nlkind] = 1; // Force memset in clearAll().
+        }
 
         // Note that this allocates the CONST and MULTIPLE tables as well.
         for (CseAcc a = 0; a < CSE_NUM_USED_ACCS; a++) {
             m_capL[a] = 16;
             m_listL[a] = new (alloc) LIns*[m_capL[a]];
+            m_usedL[a] = 1; // Force memset(0) in first clearAll().
         }
 
         clearAll();
     }
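
The "= 1" initializations defeat a skip-if-clean optimization: clearAll() presumably wipes a table only when its used-count is non-zero, so a freshly allocated (and uninitialized) table must claim one use to force its first memset. A plausible shape of the per-table clear, given the m_usedNL/m_capNL bookkeeping above:

    void CseFilter::clearNL(NLKind nlkind) {
        if (m_usedNL[nlkind] > 0) {   // skipped when already clean; hence the '= 1'
            VMPI_memset(m_listNL[nlkind], 0, m_capNL[nlkind] * sizeof(LIns*));
            m_usedNL[nlkind] = 0;
        }
    }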
 
     // Inlined/separated version of SuperFastHash.
     // This content is copyrighted by Paul Hsieh.
     // For reference see: http://www.azillionmonkeys.com/qed/hash.html
@@ -2479,17 +2482,17 @@ namespace nanojit
         LIns* ins;
         if (isS16(disp)) {
             if (storesSinceLastLoad != ACCSET_NONE) {
                 // Clear all normal (excludes CONST and MULTIPLE) loads
                 // aliased by stores and calls since the last time we were in
                 // this function.  
                 AccSet a = storesSinceLastLoad & ((1 << EMB_NUM_USED_ACCS) - 1);
                 while (a) {
-                    int acc = msbSet(a);
+                    int acc = msbSet32(a);
                     clearL((CseAcc)acc);
                     a &= ~(1 << acc);
                 }
 
                 // No need to clear CONST loads (those in the CSE_ACC_CONST table).
 
                 // Multi-region loads must be treated conservatively -- we
                 // always clear all of them.
@@ -3033,17 +3036,17 @@ namespace nanojit
         case LIR_lived:
         case LIR_d2i:
         CASE64(LIR_dasq:)
             formals[0] = LTy_D;
             break;
 
         case LIR_file:
         case LIR_line:
-            // XXX: not sure about these ones.  Ignore for the moment.
+            // These will never get hit since VTUNE implies !DEBUG.  Ignore for the moment.
             nArgs = 0;
             break;
 
         default:
             NanoAssertMsgf(0, "%s\n", lirNames[op]);
         }
 
         typeCheckArgs(op, nArgs, formals, args);
--- a/js/src/nanojit/LIR.h
+++ b/js/src/nanojit/LIR.h
@@ -284,42 +284,19 @@ namespace nanojit
     // However, the struct gets padded inside LInsLd in an inconsistent way on
     // Windows, so we actually store a MiniAccSetVal inside LInsLd.  Sigh.
     // But we use MiniAccSet everywhere else.
     //
     typedef uint8_t MiniAccSetVal;
     struct MiniAccSet { MiniAccSetVal val; };
     static const MiniAccSet MINI_ACCSET_MULTIPLE = { 99 };
 
-#if defined(_WIN32) && (_MSC_VER >= 1300) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
-    extern "C" unsigned char _BitScanReverse(unsigned long * Index, unsigned long Mask);
-    # pragma intrinsic(_BitScanReverse)
-
-    // Returns the index of the most significant bit that is set.
-    static int msbSet(uint32_t x) {
-        unsigned long idx;
-        _BitScanReverse(&idx, (unsigned long)(x | 1)); // the '| 1' ensures a 0 result when x==0
-        return idx;
-    }
-#elif (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
-    static int msbSet(uint32_t x) {
-        return 31 - __builtin_clz(x | 1);
-    }
-#else
-    static int msbSet(uint32_t x) {     // slow fallback version
-        for (int i = 31; i >= 0; i--)
-            if ((1 << i) & x) 
-                return i;
-        return 0;
-    }
-#endif
-
     static MiniAccSet compressAccSet(AccSet accSet) {
         if (isSingletonAccSet(accSet)) {
-            MiniAccSet ret = { uint8_t(msbSet(accSet)) };
+            MiniAccSet ret = { uint8_t(msbSet32(accSet)) };
             return ret;
         }
 
         // If we got here, it must be a multi-region AccSet.
         return MINI_ACCSET_MULTIPLE;
     }
 
     static AccSet decompressMiniAccSet(MiniAccSet miniAccSet) {
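
The per-platform msbSet() definitions removed above do not simply vanish: compressAccSet() and the CSE code in LIR.cpp now call msbSet32, so the same bit-scan presumably moves to a shared header under that name. For the GCC case that would be, unchanged in substance:

    // Assumed relocation/rename of the GCC variant removed above.
    static inline int msbSet32(uint32_t x) {
        return 31 - __builtin_clz(x | 1);   // the '| 1' yields 0 when x == 0
    }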
@@ -1138,18 +1115,22 @@ namespace nanojit
         // Nb: the LIR writer pipeline handles things if a displacement
         // exceeds 16 bits.  This is rare, but does happen occasionally.  We
         // could go to 24 bits but then it would happen so rarely that the
         // handler code would be difficult to test and thus untrustworthy.
         //
         // Nb: the types of these bitfields are all 32-bit integers to ensure
         // they are fully packed on Windows, sigh.  Also, 'loadQual' is
         // unsigned to ensure the values 0, 1, and 2 all fit in 2 bits.
-        int32_t     disp:16;
-        int32_t     miniAccSetVal:8;
+        //
+        // Nb: an explicit 'signed' keyword is required for these bitfields;
+        // some compilers treat plain-int bitfields as unsigned without it.
+        // See Bugzilla 584219 comment #18
+        signed int  disp:16;
+        signed int  miniAccSetVal:8;
         uint32_t    loadQual:2;
 
         LIns*       oprnd_1;
 
         LIns        ins;
 
     public:
         LIns* getLIns() { return &ins; };
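
The signedness note is worth making concrete: before C++20, the signedness of a plain "int" bitfield is implementation-defined, so a compiler may legally treat disp:16 as unsigned and break negative displacements. A minimal illustration (hypothetical struct names, not from the patch):

    #include <cstdio>

    struct Plain  { int        f : 3; };   // signedness implementation-defined
    struct Signed { signed int f : 3; };   // guaranteed signed: range -4..3

    int main() {
        Plain p; Signed s;
        p.f = -1; s.f = -1;
        // On a compiler with unsigned plain bitfields, p.f reads back as 7.
        std::printf("%d %d\n", int(p.f), int(s.f));
    }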
--- a/js/src/nanojit/Native.h
+++ b/js/src/nanojit/Native.h
@@ -94,25 +94,16 @@
 #if NJ_SOFTFLOAT_SUPPORTED
     #define CASESF(x)   case x
 #else
     #define CASESF(x)
 #endif
 
 namespace nanojit {
 
-    inline Register nextreg(Register r) {
-        return Register(r+1);
-    }
-
-    inline Register prevreg(Register r) {
-        return Register(r-1);
-    }
-
-
     class Fragment;
     struct SideExit;
     struct SwitchInfo;
 
     struct GuardRecord
     {
         void* jmp;
         GuardRecord* next;
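
nextreg()/prevreg() go away because generic ±1 register arithmetic is meaningless once register sets are walked via masks; the few places that genuinely want "the next ABI argument register" (argument marshalling in the ARM, MIPS, PPC and X64 back ends below) now spell the arithmetic out explicitly:

    r = Register(r + 1);   // was: r = nextreg(r); valid only for ABI-contiguous regs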
@@ -147,19 +138,19 @@ namespace nanojit {
         #define gpn(r)                    regNames[(r)]
     #elif defined(NJ_VERBOSE)
         // Used for printing native instructions.  Like Assembler::outputf(),
         // but only outputs if LC_Native is set.  Also prepends the output
         // with the address of the current native instruction.
         #define asm_output(...) do { \
             if (_logc->lcbits & LC_Native) { \
                 outline[0]='\0'; \
-               VMPI_sprintf(outline, "%p   ", _nIns); \
-                sprintf(&outline[13], ##__VA_ARGS__); \
-                output(); \
+                VMPI_sprintf(outline, "%p   ", _nIns);  \
+                VMPI_sprintf(outline+VMPI_strlen(outline), ##__VA_ARGS__);   \
+                output();                               \
             } \
         } while (0) /* no semi */
         #define gpn(r)                  regNames[(r)]
     #else
         #define asm_output(...)
         #define gpn(r)
     #endif /* NJ_VERBOSE */
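
A note on the asm_output fix above:

    // "%p" has no fixed width: 8 hex digits on 32-bit builds, more on 64-bit,
    // and the exact format varies by libc. Appending at the hard-coded offset
    // &outline[13] therefore only worked by accident; appending at
    // outline + VMPI_strlen(outline) is correct for any pointer width. The
    // change also repairs the old expansion's mismatched VMPI_sprintf/sprintf
    // pair and its ragged backslash continuations.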
 
--- a/js/src/nanojit/NativeARM.cpp
+++ b/js/src/nanojit/NativeARM.cpp
@@ -38,17 +38,16 @@
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
 
 #include "nanojit.h"
 
 #ifdef UNDER_CE
 #include <cmnintrin.h>
-extern "C" bool blx_lr_broken();
 #endif
 
 #if defined(FEATURE_NANOJIT) && defined(NANOJIT_ARM)
 
 namespace nanojit
 {
 
 #ifdef NJ_VERBOSE
@@ -109,23 +108,24 @@ Assembler::CountLeadingZeroes(uint32_t d
     // now we can avoid the cost of the check as we don't intend to support
     // ARMv4 anyway.
     NanoAssert(_config.arm_arch >= 5);
 
 #if defined(__ARMCC__)
     // ARMCC can do this with an intrinsic.
     leading_zeroes = __clz(data);
 
-// current Android GCC compiler incorrectly refuses to compile 'clz' for armv5
-// (even though this is a legal instruction there). Since we currently only compile for ARMv5
-// for emulation, we don't care too much (but we DO care for ARMv6+ since those are "real"
-// devices).
-#elif defined(__GNUC__) && !(defined(ANDROID) && __ARM_ARCH__ <= 5)
+#elif defined(__GNUC__) && (NJ_COMPILER_ARM_ARCH >= 5)
     // GCC can use inline assembler to insert a CLZ instruction.
     __asm (
+#if defined(ANDROID) && (NJ_COMPILER_ARM_ARCH < 7)
+    // The Android gcc toolchain refuses to assemble clz when targeting
+    // anything below armv7, even though the instruction is legal on armv5+.
+        "   .arch armv7-a\n"
+#endif
         "   clz     %0, %1  \n"
         :   "=r"    (leading_zeroes)
         :   "r"     (data)
     );
 #elif defined(UNDER_CE)
     // WinCE can do this with an intrinsic.
     leading_zeroes = _CountLeadingZeros(data);
 #else
@@ -458,21 +458,16 @@ Assembler::asm_eor_imm(Register rd, Regi
 
 // --------------------------------
 // Assembler functions.
 // --------------------------------
 
 void
 Assembler::nInit(AvmCore*)
 {
-#ifdef UNDER_CE
-    blx_lr_bug = blx_lr_broken();
-#else
-    blx_lr_bug = 0;
-#endif
     nHints[LIR_calli]  = rmask(retRegs[0]);
     nHints[LIR_hcalli] = rmask(retRegs[1]);
     nHints[LIR_paramp] = PREFER_SPECIAL;
 }
 
 void Assembler::nBeginAssembly()
 {
     max_out_args = 0;
@@ -623,17 +618,17 @@ Assembler::asm_arg(ArgType ty, LIns* arg
     if (ty == ARGTYPE_D) {
         // This task is fairly complex and so is delegated to asm_arg_64.
         asm_arg_64(arg, r, stkd);
     } else {
         NanoAssert(ty == ARGTYPE_I || ty == ARGTYPE_UI);
         // pre-assign registers R0-R3 for arguments (if they fit)
         if (r < R4) {
             asm_regarg(ty, arg, r);
-            r = nextreg(r);
+            r = Register(r + 1);
         } else {
             asm_stkarg(arg, stkd);
             stkd += 4;
         }
     }
 }
 
 // Encode a 64-bit floating-point argument using the appropriate ABI.
@@ -657,24 +652,24 @@ Assembler::asm_arg_64(LIns* arg, Registe
 
 #ifdef NJ_ARM_EABI
     // EABI requires that 64-bit arguments are aligned on even-numbered
     // registers, as R0:R1 or R2:R3. If the register base is at an
     // odd-numbered register, advance it. Note that this will push r past
     // R3 if r is R3 to start with, and will force the argument to go on
     // the stack.
     if ((r == R1) || (r == R3)) {
-        r = nextreg(r);
+        r = Register(r + 1);
     }
 #endif
 
     if (r < R3) {
         Register    ra = r;
-        Register    rb = nextreg(r);
-        r = nextreg(rb);
+        Register    rb = Register(r + 1);
+        r = Register(rb + 1);
 
 #ifdef NJ_ARM_EABI
         // EABI requires that 64-bit arguments are aligned on even-numbered
         // registers, as R0:R1 or R2:R3.
         NanoAssert( ((ra == R0) && (rb == R1)) || ((ra == R2) && (rb == R3)) );
 #endif
 
         // Put the argument in ra and rb. If the argument is in a VFP register,
@@ -687,22 +682,18 @@ Assembler::asm_arg_64(LIns* arg, Registe
             asm_regarg(ARGTYPE_I, arg->oprnd2(), rb);
         }
 
 #ifndef NJ_ARM_EABI
     } else if (r == R3) {
         // We only have one register left, but the legacy ABI requires that we
         // put 32 bits of the argument in the register (R3) and the remaining
         // 32 bits on the stack.
-        Register    ra = r;
-        r = nextreg(r);
-
-        // This really just checks that nextreg() works properly, as we know
-        // that r was previously R3.
-        NanoAssert(r == R4);
+        Register    ra = r; // R3
+        r = R4;
 
         // We're splitting the argument between registers and the stack.  This
         // must be the first time that the stack is used, so stkd must be at 0.
         NanoAssert(stkd == 0);
 
         if (_config.arm_vfp) {
             // TODO: We could optimize the this to store directly from
             // the VFP register to memory using "FMRRD ra, fp_reg[31:0]" and
@@ -907,36 +898,27 @@ Assembler::asm_call(LIns* ins)
     }
 
     // Emit the branch.
     if (!indirect) {
         verbose_only(if (_logc->lcbits & LC_Native)
             outputf("        %p:", _nIns);
         )
 
-        // Direct call: on v5 and above (where the calling sequence doesn't
-        // corrupt LR until the actual branch instruction), we can avoid an
-        // interlock in the "long" branch sequence by manually loading the
-        // target address into LR ourselves before setting up the parameters
-        // in other registers.
         BranchWithLink((NIns*)ci->_address);
     } else {
-        // Indirect call: we assign the address arg to LR since it's not
-        // used for regular arguments, and is otherwise scratch since it's
-        // clobberred by the call. On v4/v4T, where we have to manually do
-        // the equivalent of a BLX, move LR into IP before corrupting LR
-        // with the return address.
-        if (blx_lr_bug) {
-            // workaround for msft device emulator bug (blx lr emulated as no-op)
-            underrunProtect(8);
-            BLX(IP);
-            MOV(IP,LR);
-        } else {
-            BLX(LR);
-        }
+        // Indirect call: we assign the address arg to LR
+#ifdef UNDER_CE
+        // workaround for msft device emulator bug (blx lr emulated as no-op)
+        underrunProtect(8);
+        BLX(IP);
+        MOV(IP, LR);
+#else
+        BLX(LR);
+#endif
         asm_regarg(ARGTYPE_I, ins->arg(--argc), LR);
     }
 
     // Encode the arguments, starting at R0 and with an empty argument stack.
     Register    r = R0;
     int         stkd = 0;
 
     // Iterate through the argument list and encode each argument according to
@@ -976,18 +958,16 @@ Assembler::nRegisterResetAll(RegAlloc& a
     // add scratch registers to our free list for the allocator
     a.clear();
     a.free =
         rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) |
         rmask(R5) | rmask(R6) | rmask(R7) | rmask(R8) | rmask(R9) |
         rmask(R10) | rmask(LR);
     if (_config.arm_vfp)
         a.free |= FpRegs;
-
-    debug_only(a.managed = a.free);
 }
 
 static inline ConditionCode
 get_cc(NIns *ins)
 {
     return ConditionCode((*ins >> 28) & 0xF);
 }
 
@@ -1920,27 +1900,29 @@ inline void
 Assembler::BLX(Register addr, bool chk /* = true */)
 {
     // We need to emit an ARMv5+ instruction, so assert that we have a suitable
     // processor. Note that we don't support ARMv4(T), but this serves as a
     // useful sanity check.
     NanoAssert(_config.arm_arch >= 5);
 
     NanoAssert(IsGpReg(addr));
+#ifdef UNDER_CE
     // There is a bug in the WinCE device emulator which stops "BLX LR" from
     // working as expected. Assert that we never do that!
-    if (blx_lr_bug) { NanoAssert(addr != LR); }
+    NanoAssert(addr != LR);
+#endif
 
     if (chk) {
         underrunProtect(4);
     }
 
-    // BLX IP
+    // BLX reg
     *(--_nIns) = (NIns)( (COND_AL) | (0x12<<20) | (0xFFF<<8) | (0x3<<4) | (addr) );
-    asm_output("blx ip");
+    asm_output("blx %s", gpn(addr));
 }
 
 // Emit the code required to load a memory address into a register as follows:
 // d = *(b+off)
 // underrunProtect calls from this function can be disabled by setting chk to
 // false. However, this function can use more than LD32_size bytes of space if
 // the offset is out of the range of a LDR instruction; the maximum space this
 // function requires for underrunProtect is 4+LD32_size.
@@ -2772,37 +2754,33 @@ Assembler::asm_cmov(LIns* ins)
                (ins->isop(LIR_cmovd) && iftrue->isD() && iffalse->isD()));
 
     RegisterMask allow = ins->isD() ? FpRegs : GpRegs;
 
     Register rr = prepareResultReg(ins, allow);
 
     Register rf = findRegFor(iffalse, allow & ~rmask(rr));
 
+    // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
+    Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
+
     if (ins->isop(LIR_cmovd)) {
         NIns* target = _nIns;
         asm_nongp_copy(rr, rf);
         asm_branch(false, condval, target);
-
-        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
         if (rr != rt)
             asm_nongp_copy(rr, rt);
         freeResourcesOf(ins);
         if (!iftrue->isInReg()) {
             NanoAssert(rt == rr);
             findSpecificRegForUnallocated(iftrue, rr);
         }
         return;
     }
 
-    // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-    Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
     // WARNING: We cannot generate any code that affects the condition
     // codes between the MRcc generation here and the asm_cmp() call
     // below.  See asm_cmp() for more details.
     if (ins->isop(LIR_cmovi)) {
         switch (condval->opcode()) {
             // note that these are all opposites...
             case LIR_eqi:    MOVNE(rr, rf);  break;
             case LIR_lti:    MOVGE(rr, rf);  break;
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -235,17 +235,16 @@ verbose_only( extern const char* shiftNa
     void        asm_sub_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     void        asm_and_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     void        asm_orr_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     void        asm_eor_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     inline bool     encOp2Imm(uint32_t literal, uint32_t * enc);                \
     inline uint32_t CountLeadingZeroes(uint32_t data);                          \
     int *       _nSlot;                                                         \
     int *       _nExitSlot;                                                     \
-    bool        blx_lr_bug;                                                     \
     int         max_out_args; /* bytes */
 
 #define IMM32(imm)  *(--_nIns) = (NIns)((imm));
 
 #define OP_IMM  (1<<25)
 #define OP_STAT (1<<20)
 
 #define COND_AL ((uint32_t)AL<<28)
--- a/js/src/nanojit/NativeMIPS.cpp
+++ b/js/src/nanojit/NativeMIPS.cpp
@@ -476,36 +476,36 @@ namespace nanojit
         NanoAssert(cpu_has_fpu);
 #endif
 
         // O32 ABI requires that 64-bit arguments are aligned on even-numbered
         // registers, as A0:A1/FA0 or A2:A3/FA1. Use the stack offset to keep track
         // where we are
         if (stkd & 4) {
             if (stkd < 16) {
-                r = nextreg(r);
-                fr = nextreg(fr);
+                r = Register(r + 1);
+                fr = Register(fr + 1);
             }
             stkd += 4;
         }
 
         if (stkd < 16) {
             NanoAssert(fr == FA0 || fr == FA1 || fr == A2);
             if (fr == FA0 || fr == FA1)
                 findSpecificRegFor(arg, fr);
             else {
                 findSpecificRegFor(arg, FA1);
                 // Move it to the integer pair
                 Register fpupair = arg->getReg();
                 Register intpair = fr;
-                MFC1(mswregpair(intpair), nextreg(fpupair));       // Odd fpu register contains sign,expt,manthi
+                MFC1(mswregpair(intpair), Register(fpupair + 1));  // Odd fpu register contains sign,expt,manthi
                 MFC1(lswregpair(intpair), fpupair);                // Even fpu register contains mantlo
             }
-            r = nextreg(nextreg(r));
-            fr = nextreg(nextreg(fr));
+            r = Register(r + 2);
+            fr = Register(fr + 2);
         }
         else
             asm_stkarg(arg, stkd);
 
         stkd += 8;
     }
 
     /* Required functions */
@@ -1573,18 +1573,18 @@ namespace nanojit
 
         if (ty == ARGTYPE_D) {
             // This task is fairly complex and so is delegated to asm_arg_64.
             asm_arg_64(arg, r, fr, stkd);
         } else {
             NanoAssert(ty == ARGTYPE_I || ty == ARGTYPE_UI);
             if (stkd < 16) {
                 asm_regarg(ty, arg, r);
-                fr = nextreg(fr);
-                r = nextreg(r);
+                fr = Register(fr + 1);
+                r = Register(r + 1);
             }
             else
                 asm_stkarg(arg, stkd);
             // The o32 ABI calling convention is that if the first arguments
             // is not a double, subsequent double values are passed in integer registers
             fr = r;
             stkd += 4;
         }
@@ -1679,17 +1679,16 @@ namespace nanojit
 
     void
     Assembler::nRegisterResetAll(RegAlloc& regs)
     {
         regs.clear();
         regs.free = GpRegs;
         if (cpu_has_fpu)
             regs.free |= FpRegs;
-        debug_only(regs.managed = regs.free;)
     }
 
 #define signextend16(s) ((int32_t(s)<<16)>>16)
 
     void
     Assembler::nPatchBranch(NIns* branch, NIns* target)
     {
         uint32_t op = (branch[0] >> 26) & 0x3f;
--- a/js/src/nanojit/NativePPC.cpp
+++ b/js/src/nanojit/NativePPC.cpp
@@ -731,31 +731,31 @@ namespace nanojit
             uint32_t j = argc - i - 1;
             ArgType ty = argTypes[j];
             LIns* arg = ins->arg(j);
             NanoAssert(ty != ARGTYPE_V);
             if (ty != ARGTYPE_D) {
                 // GP arg
                 if (r <= R10) {
                     asm_regarg(ty, arg, r);
-                    r = nextreg(r);
+                    r = Register(r + 1);
                     param_size += sizeof(void*);
                 } else {
                     // put arg on stack
                     TODO(stack_int32);
                 }
             } else {
                 // double
                 if (fr <= F13) {
                     asm_regarg(ty, arg, fr);
-                    fr = nextreg(fr);
+                    fr = Register(fr + 1);
                 #ifdef NANOJIT_64BIT
-                    r = nextreg(r);
+                    r = Register(r + 1);
                 #else
-                    r = nextreg(nextreg(r)); // skip 2 gpr's
+                    r = Register(r + 2); // skip 2 gpr's
                 #endif
                     param_size += sizeof(double);
                 } else {
                     // put arg on stack
                     TODO(stack_double);
                 }
             }
         }
@@ -1035,21 +1035,21 @@ namespace nanojit
             CLRLDI(r, v, 32); // clears the top 32 bits
             break;
         case LIR_i2q:
             EXTSW(r, v);
             break;
         }
     }
 
-    void Assembler::asm_dasq(LIns *ins) {
+    void Assembler::asm_dasq(LIns*) {
         TODO(asm_dasq);
     }
 
-    void Assembler::asm_qasd(LIns *ins) {
+    void Assembler::asm_qasd(LIns*) {
         TODO(asm_qasd);
     }
 
     #endif
 
 #ifdef NANOJIT_64BIT
     void Assembler::asm_immq(LIns *ins) {
         Register r = ins->deprecated_getReg();
@@ -1385,17 +1385,16 @@ namespace nanojit
         }
         _allocator.free &= ~rmask(i);
         return i;
     }
 
     void Assembler::nRegisterResetAll(RegAlloc &regs) {
         regs.clear();
         regs.free = SavedRegs | 0x1ff8 /* R3-12 */ | 0x3ffe00000000LL /* F1-13 */;
-        debug_only(regs.managed = regs.free);
     }
 
 #ifdef NANOJIT_64BIT
     void Assembler::asm_qbinop(LIns *ins) {
         LOpcode op = ins->opcode();
         switch (op) {
         case LIR_orq:
         case LIR_andq:
--- a/js/src/nanojit/NativeSparc.cpp
+++ b/js/src/nanojit/NativeSparc.cpp
@@ -229,17 +229,16 @@ namespace nanojit
         _allocator.free &= ~rmask((Register)i);
         return (Register) i;
     }
 
     void Assembler::nRegisterResetAll(RegAlloc& a)
     {
         a.clear();
         a.free = GpRegs | FpRegs;
-        debug_only( a.managed = a.free; )
     }
 
     void Assembler::nPatchBranch(NIns* branch, NIns* location)
     {
         *(uint32_t*)&branch[0] &= 0xFFC00000;
         *(uint32_t*)&branch[0] |= ((intptr_t)location >> 10) & 0x3FFFFF;
         *(uint32_t*)&branch[1] &= 0xFFFFFC00;
         *(uint32_t*)&branch[1] |= (intptr_t)location & 0x3FF;
@@ -532,32 +531,35 @@ namespace nanojit
                     BGU(0, tt);
                 else //if (condop == LIR_geui)
                     BCC(0, tt);
             }
         asm_cmp(cond);
         return at;
     }
 
-    NIns* Assembler::asm_branch_ov(LOpcode, NIns* targ)
+    NIns* Assembler::asm_branch_ov(LOpcode op, NIns* targ)
     {
         NIns* at = 0;
         underrunProtect(32);
         intptr_t tt = ((intptr_t)targ - (intptr_t)_nIns + 8) >> 2;
         // !targ means that it needs patch.
         if( !(isIMM22((int32_t)tt)) || !targ ) {
             JMP_long_nocheck((intptr_t)targ);
             at = _nIns;
             NOP();
             BA(0, 5);
             tt = 4;
         }
         NOP();
 
-        BVS(0, tt);
+        if( op == LIR_mulxovi || op == LIR_muljovi )
+            BNE(0, tt);
+        else
+            BVS(0, tt);
         return at;
     }
 
     void Assembler::asm_cmp(LIns *cond)
     {
         underrunProtect(12);
 
         LIns* lhs = cond->oprnd1();
@@ -640,17 +642,17 @@ namespace nanojit
     {
         underrunProtect(28);
         LOpcode op = ins->opcode();
         LIns* lhs = ins->oprnd1();
         LIns* rhs = ins->oprnd2();
 
         Register rb = deprecated_UnknownReg;
         RegisterMask allow = GpRegs;
-        bool forceReg = (op == LIR_muli || op == LIR_mulxovi || !rhs->isImmI());
+        bool forceReg = (op == LIR_muli || op == LIR_mulxovi || op == LIR_muljovi || !rhs->isImmI());
 
         if (lhs != rhs && forceReg)
             {
                 if ((rb = asm_binop_rhs_reg(ins)) == deprecated_UnknownReg) {
                     rb = findRegFor(rhs, allow);
                 }
                 allow &= ~rmask(rb);
             }
@@ -674,18 +676,24 @@ namespace nanojit
             {
                 if (lhs == rhs)
                     rb = ra;
 
                 if (op == LIR_addi || op == LIR_addxovi)
                     ADDCC(rr, rb, rr);
                 else if (op == LIR_subi || op == LIR_subxovi)
                     SUBCC(rr, rb, rr);
-                else if (op == LIR_muli || op == LIR_mulxovi)
-                    MULX(rr, rb, rr);
+                else if (op == LIR_muli)
+                    SMULCC(rr, rb, rr);
+                else if (op == LIR_mulxovi || op == LIR_muljovi) {
+                    SUBCC(L4, L6, L4);
+                    SRAI(rr, 31, L6);
+                    RDY(L4);
+                    SMULCC(rr, rb, rr);
+                }
                 else if (op == LIR_andi)
                     AND(rr, rb, rr);
                 else if (op == LIR_ori)
                     OR(rr, rb, rr);
                 else if (op == LIR_xori)
                     XOR(rr, rb, rr);
                 else if (op == LIR_lshi)
                     SLL(rr, rb, rr);
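
The multiply-overflow rework deserves a gloss. nanojit emits machine code backwards, so the four calls above run in the reverse of their source order; read in execution order, the sequence compares the high 32 bits of the full 64-bit product (SMULCC leaves them in the Y register) against the sign-extension of the low 32 bits, which differ exactly when a signed 32-bit multiply overflows:

    // Execution order (reverse of the emission order in asm_arith above):
    //   smulcc rr, rb, rr    ! low 32 bits -> rr, high 32 bits -> %y
    //   rdy    L4            ! L4 = high word of the 64-bit product
    //   sra    rr, 31, L6    ! L6 = sign-extension of the low word
    //   subcc  L4, L6, L4    ! sets Z iff high == sign bits (no overflow)

This is also why asm_branch_ov() now emits BNE rather than BVS for LIR_mulxovi/LIR_muljovi: the condition to test is "high word != sign bits", not the integer overflow flag, which SMULCC does not set for 32-bit multiply overflow.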
--- a/js/src/nanojit/NativeSparc.h
+++ b/js/src/nanojit/NativeSparc.h
@@ -732,20 +732,20 @@ namespace nanojit
     } while (0)
 
 #define MOVVSI(simm11, cc2, cc1, cc0, rd) \
     do { \
     Format_4_2I(rd, 0x2c, cc2, 7, cc1, cc0, simm11); \
     asm_output("movvs %d, %s", simm11, gpn(rd)); \
     } while (0)
 
-#define MULX(rs1, rs2, rd) \
+#define SMULCC(rs1, rs2, rd) \
     do { \
-    Format_3_1(2, rd, 0x9, rs1, 0, rs2); \
-    asm_output("mul %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
+    Format_3_1(2, rd, 0x1b, rs1, 0, rs2); \
+    asm_output("smulcc %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
     } while (0)
 
 #define NOP() \
     do { \
     Format_2_1(0, 0x4, 0); \
     asm_output("nop"); \
     } while (0)
 
@@ -768,16 +768,22 @@ namespace nanojit
     } while (0)
 
 #define ANDCC(rs1, rs2, rd) \
     do { \
     Format_3_1(2, rd, 0x11, rs1, 0, rs2); \
     asm_output("andcc %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
     } while (0)
 
+#define RDY(rd) \
+    do { \
+    Format_3_1(2, rd, 0x28, 0, 0, 0); \
+    asm_output("rdy %s", gpn(rd)); \
+    } while (0)
+
 #define RESTORE(rs1, rs2, rd) \
     do { \
     Format_3_1(2, rd, 0x3D, rs1, 0, rs2); \
     asm_output("restore"); \
     } while (0)
 
 #define SAVEI(rs1, simm13, rd) \
     do { \
@@ -804,16 +810,22 @@ namespace nanojit
     } while (0)
 
 #define SRA(rs1, rs2, rd) \
     do { \
     Format_3_5(2, rd, 0x27, rs1, 0, rs2); \
     asm_output("sra %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
     } while (0)
 
+#define SRAI(rs1, shcnt32, rd) \
+    do { \
+    Format_3_6(2, rd, 0x27, rs1, shcnt32); \
+    asm_output("sra %s, %d, %s", gpn(rs1), shcnt32, gpn(rd)); \
+    } while (0)
+
 #define SRL(rs1, rs2, rd) \
     do { \
     Format_3_5(2, rd, 0x26, rs1, 0, rs2); \
     asm_output("srl %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
     } while (0)
 
 #define STF(rd, rs1, rs2) \
     do { \
--- a/js/src/nanojit/NativeX64.cpp
+++ b/js/src/nanojit/NativeX64.cpp
@@ -961,17 +961,17 @@ namespace nanojit
                 // double goes in XMM reg # based on overall arg_index
                 asm_regarg(ty, arg, Register(XMM0+arg_index));
                 arg_index++;
             }
         #else
             else if (ty == ARGTYPE_D && fr < XMM8) {
                 // double goes in next available XMM register
                 asm_regarg(ty, arg, fr);
-                fr = nextreg(fr);
+                fr = Register(fr + 1);
             }
         #endif
             else {
                 asm_stkarg(ty, arg, stk_used);
                 stk_used += sizeof(void*);
             }
         }
 
@@ -1114,37 +1114,33 @@ namespace nanojit
                    (ins->isop(LIR_cmovd) && iftrue->isD() && iffalse->isD()));
 
         RegisterMask allow = ins->isD() ? FpRegs : GpRegs;
 
         Register rr = prepareResultReg(ins, allow);
 
         Register rf = findRegFor(iffalse, allow & ~rmask(rr));
 
+        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
+        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
+
         if (ins->isop(LIR_cmovd)) {
             NIns* target = _nIns;
             asm_nongp_copy(rr, rf);
             asm_branch(false, cond, target);
-
-            // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-            Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
             if (rr != rt)
                 asm_nongp_copy(rr, rt);
             freeResourcesOf(ins);
             if (!iftrue->isInReg()) {
                 NanoAssert(rt == rr);
                 findSpecificRegForUnallocated(iftrue, rr);
             }
             return;
         }
 
-        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
         // WARNING: We cannot generate any code that affects the condition
         // codes between the MRcc generation here and the asm_cmp() call
         // below.  See asm_cmp() for more details.
         LOpcode condop = cond->opcode();
         if (ins->isop(LIR_cmovi)) {
             switch (condop) {
             case LIR_eqi:  case LIR_eqq:    CMOVNE( rr, rf);  break;
             case LIR_lti:  case LIR_ltq:    CMOVNL( rr, rf);  break;
@@ -1900,17 +1896,16 @@ namespace nanojit
     void Assembler::nRegisterResetAll(RegAlloc &a) {
         // add scratch registers to our free list for the allocator
         a.clear();
 #ifdef _WIN64
         a.free = 0x001fffcf; // rax-rbx, rsi, rdi, r8-r15, xmm0-xmm5
 #else
         a.free = 0xffffffff & ~(1<<RSP | 1<<RBP);
 #endif
-        debug_only( a.managed = a.free; )
     }
 
     void Assembler::nPatchBranch(NIns *patch, NIns *target) {
         NIns *next = 0;
         if (patch[0] == 0xE9) {
             // jmp disp32
             next = patch+5;
         } else if (patch[0] == 0x0F && (patch[1] & 0xF0) == 0x80) {
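
The truncated hunk above shows the two branch shapes nPatchBranch
recognizes; the displacement written afterwards is relative to the end
of the instruction.  A minimal sketch of the patching arithmetic under
that encoding (mirroring the i386 version below, where the jmp case
subtracts the 5-byte instruction length):

    // Sketch: patch a rel32 branch at 'patch' to reach 'target'.
    NIns* next = 0;
    if (patch[0] == 0xE9)
        next = patch + 5;           // jmp rel32: 1 opcode byte + disp32
    else
        next = patch + 6;           // jcc rel32: 2 opcode bytes + disp32
    ((int32_t*)next)[-1] = int32_t(target - next);
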
--- a/js/src/nanojit/Nativei386.cpp
+++ b/js/src/nanojit/Nativei386.cpp
@@ -1107,17 +1107,16 @@ namespace nanojit
 
     void Assembler::nRegisterResetAll(RegAlloc& a)
     {
         // add scratch registers to our free list for the allocator
         a.clear();
         a.free = SavedRegs | ScratchRegs;
         if (!_config.i386_sse2)
             a.free &= ~XmmRegs;
-        debug_only( a.managed = a.free; )
     }
 
     void Assembler::nPatchBranch(NIns* branch, NIns* targ)
     {
         intptr_t offset = intptr_t(targ) - intptr_t(branch);
         if (branch[0] == JMP32) {
             *(int32_t*)&branch[1] = offset - 5;
         } else if (branch[0] == JCC32) {
@@ -2054,37 +2053,33 @@ namespace nanojit
         }
 
         RegisterMask allow = ins->isD() ? XmmRegs : GpRegs;
 
         Register rr = prepareResultReg(ins, allow);
 
         Register rf = findRegFor(iffalse, allow & ~rmask(rr));
 
+        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
+        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
+
         if (ins->isop(LIR_cmovd)) {
             NIns* target = _nIns;
             asm_nongp_copy(rr, rf);
             asm_branch(false, condval, target);
-
-            // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-            Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
             if (rr != rt)
                 asm_nongp_copy(rr, rt);
             freeResourcesOf(ins);
             if (!iftrue->isInReg()) {
                 NanoAssert(rt == rr);
                 findSpecificRegForUnallocated(iftrue, rr);
             }
             return;
         }
 
-        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
         NanoAssert(ins->isop(LIR_cmovi));
 
         // WARNING: We cannot generate any code that affects the condition
         // codes between the MRcc generation here and the asm_cmp() call
         // below.  See asm_cmp() for more details.
         switch (condval->opcode()) {
             // Note that these are all opposites...
             case LIR_eqi:    MRNE(rr, rf);   break;
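
Both backends hoist the same rt computation above the LIR_cmovd special
case so the two paths share it.  The "opposites" in the switch fall out
of the emission order: the assembler works backwards, so in program
order rr is first seeded with the 'iftrue' value and the conditional
move then installs 'iffalse' exactly when the condition fails.  A
sketch of the forward-order meaning (illustrative only):

    // Forward order of the emitted sequence for rr = cond ? rt : rf:
    //   mov    rr, rt      // seed the result with the 'iftrue' value
    //   cmp    ...         // asm_cmp(): sets the condition codes
    //   cmovne rr, rf      // e.g. for LIR_eqi: take 'iffalse' when not equal
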
--- a/js/src/nanojit/RegAlloc.cpp
+++ b/js/src/nanojit/RegAlloc.cpp
@@ -40,24 +40,16 @@
 #include "nanojit.h"
 
 namespace nanojit
 {
     #ifdef FEATURE_NANOJIT
 
     #ifdef _DEBUG
 
-    uint32_t RegAlloc::countActive()
-    {
-        int cnt = 0;
-        for(Register i=FirstReg; i <= LastReg; i = nextreg(i))
-            cnt += active[i] ? 1 : 0;
-        return cnt;
-    }
-
     bool RegAlloc::isConsistent(Register r, LIns* i) const
     {
         NanoAssert(r != deprecated_UnknownReg);
         return (isFree(r)  && !getActive(r)     && !i) ||
                (!isFree(r) &&  getActive(r)== i && i );
     }
 
     #endif /*DEBUG*/
--- a/js/src/nanojit/RegAlloc.h
+++ b/js/src/nanojit/RegAlloc.h
@@ -115,19 +115,23 @@ namespace nanojit
             return usepri[r];
         }
 
         LIns* getActive(Register r) const {
             NanoAssert(r != deprecated_UnknownReg);
             return active[r];
         }
 
-        debug_only( uint32_t    countActive(); )
+        // Return a mask containing the active registers.  For each register
+        // in this set, getActive(register) will be a nonzero LIns pointer.
+        RegisterMask activeMask() const {
+            return ~free & managed;
+        }
+
         debug_only( bool        isConsistent(Register r, LIns* v) const; )
-        debug_only( RegisterMask managed; )     // the registers managed by the register allocator
 
         // Some basics:
         //
         // - 'active' indicates which registers are active at a particular
         //   point, and for each active register, which instruction
         //   defines the value it holds.  At the start of register
         //   allocation no registers are active.
         //
@@ -166,15 +170,46 @@ namespace nanojit
         //   * And vice versa:  an LIns with an in-use reservation that
         //     names R must be named by 'active[R]'.
         //
         //   * If an LIns's reservation names 'deprecated_UnknownReg' then LIns
         //     should not be in 'active'.
         //
         LIns*           active[LastReg + 1];    // active[r] = LIns that defines r
         int32_t         usepri[LastReg + 1];    // used priority. lower = more likely to spill.
-        RegisterMask    free;
+        RegisterMask    free;       // Registers currently free.
+        RegisterMask    managed;    // Registers under management (invariant).
         int32_t         priority;
 
         DECLARE_PLATFORM_REGALLOC()
     };
+
+    // Return the lowest numbered Register in mask.
+    inline Register lsReg(RegisterMask mask) {
+        // This is faster than it looks; we rely on the C++ optimizer
+        // to strip the dead branch and inline just one alternative.
+        if (sizeof(RegisterMask) == 4)
+            return (Register) lsbSet32(mask);
+        else
+            return (Register) lsbSet64(mask);
+    }
+
+    // Return the highest numbered Register in mask.
+    inline Register msReg(RegisterMask mask) {
+        // This is faster than it looks; we rely on the C++ optimizer
+        // to strip the dead branch and inline just one alternative.
+        if (sizeof(RegisterMask) == 4)
+            return (Register) msbSet32(mask);
+        else
+            return (Register) msbSet64(mask);
+    }
+
+    // Clear bit r in mask, then return lsReg(mask).
+    inline Register nextLsReg(RegisterMask& mask, Register r) {
+        return lsReg(mask &= ~rmask(r));
+    }
+
+    // Clear bit r in mask, then return msReg(mask).
+    inline Register nextMsReg(RegisterMask& mask, Register r) {
+        return msReg(mask &= ~rmask(r));
+    }
 }
 #endif // __nanojit_RegAlloc__
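
activeMask() together with the lsReg()/nextLsReg() helpers replaces the
removed countActive() loop: instead of probing every register, callers
walk only the set bits of the mask.  A small sketch of the idiom (a
reimplementation of countActive for illustration; not part of the
patch):

    // Count the active registers by iterating the mask bits.
    uint32_t countActive(const RegAlloc& regs) {
        RegisterMask work = regs.activeMask();
        uint32_t cnt = 0;
        while (work) {
            Register r = lsReg(work);          // lowest active register
            NanoAssert(regs.getActive(r));     // invariant of activeMask()
            cnt++;
            work &= ~rmask(r);                 // clear the bit and continue
        }
        return cnt;
    }
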
--- a/js/src/nanojit/avmplus.cpp
+++ b/js/src/nanojit/avmplus.cpp
@@ -36,23 +36,16 @@
 #include "nanojit.h"
 
 #ifdef SOLARIS
     typedef caddr_t maddr_ptr;
 #else
     typedef void *maddr_ptr;
 #endif
 
-#if defined(AVMPLUS_ARM) && defined(UNDER_CE)
-extern "C" bool
-blx_lr_broken() {
-    return false;
-}
-#endif
-
 using namespace avmplus;
 
 nanojit::Config AvmCore::config;
 
 void
 avmplus::AvmLog(char const *msg, ...) {
     va_list ap;
     va_start(ap, msg);
--- a/js/src/nanojit/nanojit.h
+++ b/js/src/nanojit/nanojit.h
@@ -184,16 +184,131 @@ static inline bool isS32(intptr_t i) {
 
 static inline bool isU32(uintptr_t i) {
     return uint32_t(i) == i;
 }
 
 #define alignTo(x,s)        ((((uintptr_t)(x)))&~(((uintptr_t)s)-1))
 #define alignUp(x,s)        ((((uintptr_t)(x))+(((uintptr_t)s)-1))&~(((uintptr_t)s)-1))
 
+namespace nanojit
+{
+// Define msbSet32(), lsbSet32(), msbSet64(), and lsbSet64() functions using
+// fast find-first-bit intrinsics when available.
+// The fall-back implementations use iteration.
+#if defined(_WIN32) && (_MSC_VER >= 1300) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
+
+    extern "C" unsigned char _BitScanForward(unsigned long * Index, unsigned long Mask);
+    extern "C" unsigned char _BitScanReverse(unsigned long * Index, unsigned long Mask);
+    # pragma intrinsic(_BitScanForward)
+    # pragma intrinsic(_BitScanReverse)
+
+    // Returns the index of the most significant bit that is set.
+    static inline int msbSet32(uint32_t x) {
+        unsigned long idx;
+        _BitScanReverse(&idx, (unsigned long)(x | 1)); // the '| 1' ensures a 0 result when x==0
+        return idx;
+    }
+
+    // Returns the index of the least significant bit that is set.
+    static inline int lsbSet32(uint32_t x) {
+        unsigned long idx;
+        _BitScanForward(&idx, (unsigned long)(x | 0x80000000)); // the '| 0x80000000' gives a well-defined result (31) when x==0
+        return idx;
+    }
+
+#if defined(_M_AMD64) || defined(_M_X64)
+    extern "C" unsigned char _BitScanForward64(unsigned long * Index, unsigned __int64 Mask);
+    extern "C" unsigned char _BitScanReverse64(unsigned long * Index, unsigned __int64 Mask);
+    # pragma intrinsic(_BitScanForward64)
+    # pragma intrinsic(_BitScanReverse64)
+
+    // Returns the index of the most significant bit that is set.
+    static inline int msbSet64(uint64_t x) {
+        unsigned long idx;
+        _BitScanReverse64(&idx, (unsigned __int64)(x | 1)); // the '| 1' ensures a 0 result when x==0
+        return idx;
+    }
+
+    // Returns the index of the least significant bit that is set.
+    static inline int lsbSet64(uint64_t x) {
+        unsigned long idx;
+        _BitScanForward64(&idx, (unsigned __int64)(x | 0x8000000000000000LL)); // the '| 0x8000000000000000LL' gives a well-defined result (63) when x==0
+        return idx;
+    }
+#else
+    // Returns the index of the most significant bit that is set.
+    static int msbSet64(uint64_t x) {
+        return (x & 0xffffffff00000000LL) ? msbSet32(uint32_t(x >> 32)) + 32 : msbSet32(uint32_t(x));
+    }
+    // Returns the index of the least significant bit that is set.
+    static int lsbSet64(uint64_t x) {
+        return (x & 0x00000000ffffffffLL) ? lsbSet32(uint32_t(x)) : lsbSet32(uint32_t(x >> 32)) + 32;
+    }
+#endif
+
+#elif (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
+
+    // Returns the index of the most significant bit that is set.
+    static inline int msbSet32(uint32_t x) {
+        return 31 - __builtin_clz(x | 1);
+    }
+
+    // Returns the index of the least significant bit that is set.
+    static inline int lsbSet32(uint32_t x) {
+        return __builtin_ctz(x | 0x80000000);
+    }
+
+    // Returns the index of the most significant bit that is set.
+    static inline int msbSet64(uint64_t x) {
+        return 63 - __builtin_clzll(x | 1);
+    }
+
+    // Returns the index of the least significant bit that is set.
+    static inline int lsbSet64(uint64_t x) {
+        return __builtin_ctzll(x | 0x8000000000000000LL);
+    }
+
+#else
+
+    // Slow fall-back: return most significant bit set by searching iteratively.
+    static int msbSet32(uint32_t x) {
+        for (int i = 31; i >= 0; i--)
+            if ((1 << i) & x)
+                return i;
+        return 0;
+    }
+
+    // Slow fall-back: return least significant bit set by searching iteratively.
+    static int lsbSet32(uint32_t x) {
+        for (int i = 0; i < 32; i++)
+            if ((1 << i) & x)
+                return i;
+        return 31;
+    }
+
+    // Slow fall-back: return most significant bit set by searching iteratively.
+    static int msbSet64(uint64_t x) {
+        for (int i = 63; i >= 0; i--)
+            if ((1LL << i) & x)
+                return i;
+        return 0;
+    }
+
+    // Slow fall-back: return least significant bit set by searching iteratively.
+    static int lsbSet64(uint64_t x) {
+        for (int i = 0; i < 64; i++)
+            if ((1LL << i) & x)
+                return i;
+        return 63;
+    }
+
+#endif // select compiler
+} // namespace nanojit
+
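
Whichever of the three implementations is selected, the helpers agree
on the same contracts, including the deliberately defined x==0 cases.
A self-contained sanity sketch (not part of the patch; NanoAssert is
the debug-build assertion used throughout nanojit):

    static void checkBitScanContracts() {
        NanoAssert(msbSet32(0x00000001) == 0);
        NanoAssert(msbSet32(0x80000000) == 31);
        NanoAssert(lsbSet32(0x80000000) == 31);
        NanoAssert(msbSet32(0) == 0);            // via the '| 1'
        NanoAssert(lsbSet32(0) == 31);           // via the '| 0x80000000'
        NanoAssert(msbSet64(uint64_t(1) << 40) == 40);
        NanoAssert(lsbSet64(0) == 63);           // matches the fall-back's 63
    }
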
 // -------------------------------------------------------------------
 // START debug-logging definitions
 // -------------------------------------------------------------------
 
 /* Debug printing stuff.  All Nanojit and jstracer debug printing
    should be routed through LogControl::printf.  Don't use
    ad-hoc calls to printf, fprintf(stderr, ...) etc.