[JAEGER] Merge from tracemonkey.
author David Mandelin <dmandelin@mozilla.com>
Thu, 19 Aug 2010 17:30:22 -0700
changeset 53467 8a0513a5c024cdcaa92742f25861a8ce1fa7ac6f
parent 53466 a6f55b452f916635e8cc51be5a4418f41d08c410 (current diff)
parent 51133 45a893397e3004302680c7837d677425577ad447 (diff)
child 53468 c2f1e5150e18688ebede3eadc968d3bf082f9a5b
push id unknown
push user unknown
push date unknown
milestone 2.0b5pre
--- a/js/src/lirasm/tests/call1.in
+++ b/js/src/lirasm/tests/call1.in
@@ -1,12 +1,12 @@
 ptr = allocp 8
 a = immi 65
-sti a ptr 0
+sti2c a ptr 0
 b = immi 66
-sti b ptr 1
+sti2c b ptr 1
 c = immi 67
-sti c ptr 2
+sti2c c ptr 2
 zero = immi 0
-sti zero ptr 3
+sti2c zero ptr 3
 ss = calli puts cdecl ptr
 nn = gei ss zero
 reti nn
--- a/js/src/lirasm/tests/multfrag1.in
+++ b/js/src/lirasm/tests/multfrag1.in
@@ -1,18 +1,18 @@
 .begin a
 ptr = allocp 8
 a = immi 65
-sti a ptr 0
+sti2c a ptr 0
 b = immi 66
-sti b ptr 1
+sti2c b ptr 1
 c = immi 67
-sti c ptr 2
+sti2c c ptr 2
 zero = immi 0
-sti zero ptr 3
+sti2c zero ptr 3
 ss = calli puts cdecl ptr
 nn = gei ss zero
 reti nn
 .end
 
 .begin b
 rr = calli a fastcall
 reti rr
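
Both lirasm tests above switch their stores from sti to sti2c: in the imported nanojit, sti is a full 32-bit int store, while sti2c truncates the int to its low byte before storing, which is what a NUL-terminated C string buffer needs. A hedged C++ sketch of what the patched LIR sequence computes:

    // Sketch only: C++ equivalent of the patched call1.in sequence.
    #include <cstdio>
    int call1_equivalent() {
        char buf[8];            // ptr = allocp 8
        buf[0] = (char)65;      // sti2c a ptr 0    ('A', low byte only)
        buf[1] = (char)66;      // sti2c b ptr 1    ('B')
        buf[2] = (char)67;      // sti2c c ptr 2    ('C')
        buf[3] = (char)0;       // sti2c zero ptr 3 (NUL terminator)
        int ss = puts(buf);     // ss = calli puts cdecl ptr
        return ss >= 0;         // nn = gei ss zero; reti nn
    }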
--- a/js/src/nanojit-import-rev
+++ b/js/src/nanojit-import-rev
@@ -1,1 +1,1 @@
-982cd218ddb049bdbbcdda4fa3a9d7e40e45e0be
+c7009f5cd83ea028b98f59e1f8830a76ba27c1dd
--- a/js/src/nanojit/Assembler.cpp
+++ b/js/src/nanojit/Assembler.cpp
@@ -36,25 +36,37 @@
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
 
 #include "nanojit.h"
 
 #ifdef FEATURE_NANOJIT
 
-#ifdef VTUNE
+#ifdef VMCFG_VTUNE
 #include "../core/CodegenLIR.h"
 #endif
 
 #ifdef _MSC_VER
     // disable some specific warnings which are normally useful, but pervasive in the code-gen macros
     #pragma warning(disable:4310) // cast truncates constant value
 #endif
 
+#ifdef VMCFG_VTUNE
+namespace vtune {
+    using namespace nanojit;
+    void vtuneStart(void*, NIns*);
+    void vtuneEnd(void*, NIns*);
+    void vtuneLine(void*, int, NIns*);
+    void vtuneFile(void*, void*);
+}
+using namespace vtune;
+#endif // VMCFG_VTUNE
+
+
 namespace nanojit
 {
     /**
      * Need the following:
      *
      *    - merging paths ( build a graph? ), possibly use external rep to drive codegen
      */
     Assembler::Assembler(CodeAlloc& codeAlloc, Allocator& dataAlloc, Allocator& alloc, AvmCore* core, LogControl* logc, const Config& config)
@@ -69,18 +81,18 @@ namespace nanojit
     #if NJ_USES_IMMD_POOL
         , _immDPool(alloc)
     #endif
         , _epilogue(NULL)
         , _err(None)
     #if PEDANTIC
         , pedanticTop(NULL)
     #endif
-    #ifdef VTUNE
-        , cgen(NULL)
+    #ifdef VMCFG_VTUNE
+        , vtuneHandle(NULL)
     #endif
         , _config(config)
     {
         nInit(core);
         (void)logc;
         verbose_only( _logc = logc; )
         verbose_only( _outputCache = 0; )
         verbose_only( outline[0] = '\0'; )
@@ -181,20 +193,21 @@ namespace nanojit
     #if NJ_USES_IMMD_POOL
         _immDPool.clear();
     #endif
     }
 
     void Assembler::registerResetAll()
     {
         nRegisterResetAll(_allocator);
+        _allocator.managed = _allocator.free;
 
         // At start, should have some registers free and none active.
         NanoAssert(0 != _allocator.free);
-        NanoAssert(0 == _allocator.countActive());
+        NanoAssert(0 == _allocator.activeMask());
 #ifdef NANOJIT_IA32
         debug_only(_fpuStkDepth = 0; )
 #endif
     }
 
     // Legend for register sets: A = allowed, P = preferred, F = free, S = SavedReg.
     //
     // Finds a register in 'setA___' to store the result of 'ins' (one from
@@ -268,24 +281,16 @@ namespace nanojit
         if (start)
             CodeAlloc::add(codeList, start, end);
 
         // CodeAlloc contract: allocations never fail
         _codeAlloc.alloc(start, end);
         verbose_only( nBytes += (end - start) * sizeof(NIns); )
         NanoAssert(uintptr_t(end) - uintptr_t(start) >= (size_t)LARGEST_UNDERRUN_PROT);
         eip = end;
-
-        #ifdef VTUNE
-        if (_nIns && _nExitIns) {
-            //cgen->jitAddRecord((uintptr_t)list->code, 0, 0, true); // add placeholder record for top of page
-            cgen->jitCodePosUpdate((uintptr_t)list->code);
-            cgen->jitPushInfo(); // new page requires new entry
-        }
-        #endif
     }
 
     void Assembler::reset()
     {
         _nIns = 0;
         _nExitIns = 0;
         codeStart = codeEnd = 0;
         exitStart = exitEnd = 0;
@@ -355,33 +360,36 @@ namespace nanojit
         _activation.checkForResourceConsistency(_allocator);
 
         registerConsistencyCheck();
     }
 
     void Assembler::registerConsistencyCheck()
     {
         RegisterMask managed = _allocator.managed;
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            if (rmask(r) & managed) {
-                // A register managed by register allocation must be either
-                // free or active, but not both.
-                if (_allocator.isFree(r)) {
-                    NanoAssertMsgf(_allocator.getActive(r)==0,
-                        "register %s is free but assigned to ins", gpn(r));
-                } else {
-                    // An LIns defining a register must have that register in
-                    // its reservation.
-                    LIns* ins = _allocator.getActive(r);
-                    NanoAssert(ins);
-                    NanoAssertMsg(r == ins->getReg(), "Register record mismatch");
-                }
+        for (Register r = lsReg(managed); managed; r = nextLsReg(managed, r)) {
+            // A register managed by register allocation must be either
+            // free or active, but not both.
+            if (_allocator.isFree(r)) {
+                NanoAssertMsgf(_allocator.getActive(r)==0,
+                    "register %s is free but assigned to ins", gpn(r));
             } else {
-                // A register not managed by register allocation must be
-                // neither free nor active.
+                // An LIns defining a register must have that register in
+                // its reservation.
+                LIns* ins = _allocator.getActive(r);
+                NanoAssert(ins);
+                NanoAssertMsg(r == ins->getReg(), "Register record mismatch");
+            }
+        }
+
+        RegisterMask not_managed = ~_allocator.managed;
+        for (Register r = lsReg(not_managed); not_managed; r = nextLsReg(not_managed, r)) {
+            // A register not managed by register allocation must be
+            // neither free nor active.
+            if (r <= LastReg) {
                 NanoAssert(!_allocator.isFree(r));
                 NanoAssert(!_allocator.getActive(r));
             }
         }
     }
     #endif /* _DEBUG */
 
     void Assembler::findRegFor2(RegisterMask allowa, LIns* ia, Register& ra,
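
This merge systematically replaces full-range register scans (for (Register r = FirstReg; r <= LastReg; r = nextreg(r))) with iteration over a RegisterMask, so loops visit only the registers whose bits are set. The lsReg/nextLsReg helpers themselves are not part of this diff; a minimal sketch of their assumed semantics, using a gcc/clang builtin:

    // Assumed semantics of the mask-iteration helpers used above (not
    // shown in this patch).  lsReg returns the lowest set bit's index;
    // nextLsReg retires the bit just visited and returns the next one.
    // Assumption: a 32-bit RegisterMask (PPC uses a wider mask).
    static inline Register lsReg(RegisterMask set) {
        NanoAssert(set != 0);
        return Register(__builtin_ctz(set));
    }
    static inline Register nextLsReg(RegisterMask& set, Register r) {
        set &= ~(RegisterMask(1) << r);        // clear the visited bit
        return set ? lsReg(set) : Register(0); // result unused once set == 0
    }
    // Usage, as in registerConsistencyCheck() above:
    //   RegisterMask m = _allocator.managed;
    //   for (Register r = lsReg(m); m; r = nextLsReg(m, r)) { ... }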
@@ -1103,37 +1111,45 @@ namespace nanojit
 
         // at this point all our new code is in the d-cache and not the i-cache,
         // so flush the i-cache on cpu's that need it.
         CodeAlloc::flushICache(codeList);
 
         // save entry point pointers
         frag->fragEntry = fragEntry;
         frag->setCode(_nIns);
+
+#ifdef VMCFG_VTUNE
+        if (vtuneHandle)
+        {
+            vtuneEnd(vtuneHandle, codeEnd);
+            vtuneStart(vtuneHandle, _nIns);
+        }
+#endif
+
         PERFM_NVPROF("code", CodeAlloc::size(codeList));
 
 #ifdef NANOJIT_IA32
         NanoAssertMsgf(_fpuStkDepth == 0,"_fpuStkDepth %d\n",_fpuStkDepth);
 #endif
 
         debug_only( pageValidate(); )
         NanoAssert(_branchStateMap.isEmpty());
     }
 
     void Assembler::releaseRegisters()
     {
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
+        RegisterMask active = _allocator.activeMask();
+        for (Register r = lsReg(active); active; r = nextLsReg(active, r))
         {
             LIns *ins = _allocator.getActive(r);
-            if (ins) {
-                // Clear reg allocation, preserve stack allocation.
-                _allocator.retire(r);
-                NanoAssert(r == ins->getReg());
-                ins->clearReg();
-            }
+            // Clear reg allocation, preserve stack allocation.
+            _allocator.retire(r);
+            NanoAssert(r == ins->getReg());
+            ins->clearReg();
         }
     }
 
 #ifdef PERFM
 #define countlir_live() _nvprof("lir-live",1)
 #define countlir_ret() _nvprof("lir-ret",1)
 #define countlir_alloc() _nvprof("lir-alloc",1)
 #define countlir_var() _nvprof("lir-var",1)
@@ -1726,17 +1742,17 @@ namespace nanojit
                 #if NJ_JTBL_SUPPORTED
                 case LIR_jtbl: {
                     countlir_jtbl();
                     ins->oprnd1()->setResultLive();
                     // Multiway jump can contain both forward and backward jumps.
                     // Out of range indices aren't allowed or checked.
                     // Code after this jtbl instruction is unreachable.
                     releaseRegisters();
-                    NanoAssert(_allocator.countActive() == 0);
+                    NanoAssert(_allocator.activeMask() == 0);
 
                     uint32_t count = ins->getTableSize();
                     bool has_back_edges = false;
 
                     // Merge the regstates of labels we have already seen.
                     for (uint32_t i = count; i-- > 0;) {
                         LIns* to = ins->getTarget(i);
                         LabelState *lstate = _labels.get(to);
@@ -1751,17 +1767,17 @@ namespace nanojit
                     asm_output("forward edges");
 
                     // In a multi-way jump, the register allocator has no ability to deal
                     // with two existing edges that have conflicting register assignments, unlike
                     // a conditional branch where code can be inserted on the fall-through path
                     // to reconcile registers.  So, frontends *must* insert LIR_regfence at labels of
                     // forward jtbl jumps.  Check here to make sure no registers were picked up from
                     // any forward edges.
-                    NanoAssert(_allocator.countActive() == 0);
+                    NanoAssert(_allocator.activeMask() == 0);
 
                     if (has_back_edges) {
                         handleLoopCarriedExprs(pending_lives);
                         // save merged (empty) register state at target labels we haven't seen yet
                         for (uint32_t i = count; i-- > 0;) {
                             LIns* to = ins->getTarget(i);
                             LabelState *lstate = _labels.get(to);
                             if (!lstate) {
@@ -1923,37 +1939,38 @@ namespace nanojit
                     // It must be impure or pure-and-extant -- it couldn't be
                     // pure-and-not-extant, because there's no way the codegen
                     // for a call can be folded into the codegen of another
                     // LIR instruction.
                     NanoAssert(!ins->callInfo()->_isPure || ins->isExtant());
                     asm_call(ins);
                     break;
 
-                #ifdef VTUNE
+                #ifdef VMCFG_VTUNE
                 case LIR_file: {
-                    // we traverse backwards so we are now hitting the file
-                    // that is associated with a bunch of LIR_lines we already have seen
-                    ins->oprnd1()->setResultLive();
-                    uintptr_t currentFile = ins->oprnd1()->immI();
-                    cgen->jitFilenameUpdate(currentFile);
+                    // we traverse backwards so we are now hitting the file
+                    // that is associated with a bunch of LIR_lines we already have seen
+                    if (vtuneHandle) {
+                        void * currentFile = (void *) ins->oprnd1()->immI();
+                        vtuneFile(vtuneHandle, currentFile);
+                    }
                     break;
                 }
-
                 case LIR_line: {
-                    // add a new table entry, we don't yet knwo which file it belongs
-                    // to so we need to add it to the update table too
-                    // note the alloc, actual act is delayed; see above
-                    ins->oprnd1()->setResultLive();
-                    uint32_t currentLine = (uint32_t) ins->oprnd1()->immI();
-                    cgen->jitLineNumUpdate(currentLine);
-                    cgen->jitAddRecord((uintptr_t)_nIns, 0, currentLine, true);
+                    // add a new table entry, we don't yet know which file it belongs
+                    // to so we need to add it to the update table too
+                    // note the alloc, actual act is delayed; see above
+                    if (vtuneHandle) {
+                        uint32_t currentLine = (uint32_t) ins->oprnd1()->immI();
+                        vtuneLine(vtuneHandle, currentLine, _nIns);
+                    }
                     break;
                 }
-                #endif // VTUNE
+                #endif // VMCFG_VTUNE
+
             }
 
 #ifdef NJ_VERBOSE
             // We do final LIR printing inside this loop to avoid printing
             // dead LIR instructions.  We print the LIns after generating the
             // code.  This ensures that the LIns will appear in debug output
             // *before* the native code, because Assembler::outputf()
             // prints everything in reverse.
@@ -1963,20 +1980,16 @@ namespace nanojit
                 LInsPrinter* printer = _thisfrag->lirbuf->printer;
                 outputf("    %s", printer->formatIns(&b, ins));
             }
 #endif
 
             if (error())
                 return;
 
-        #ifdef VTUNE
-            cgen->jitCodePosUpdate((uintptr_t)_nIns);
-        #endif
-
             // check that all is well (don't check in exit paths since its more complicated)
             debug_only( pageValidate(); )
             debug_only( resourceConsistencyCheck();  )
         }
     }
 
     /*
      * Write a jump table for the given SwitchInfo and store the table
@@ -2068,34 +2081,33 @@ namespace nanojit
     void Assembler::printRegState()
     {
         char* s = &outline[0];
         VMPI_memset(s, ' ', 26);  s[26] = '\0';
         s += VMPI_strlen(s);
         VMPI_sprintf(s, "RR");
         s += VMPI_strlen(s);
 
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
+        RegisterMask active = _allocator.activeMask();
+        for (Register r = lsReg(active); active != 0; r = nextLsReg(active, r)) {
             LIns *ins = _allocator.getActive(r);
-            if (ins) {
-                NanoAssertMsg(!_allocator.isFree(r),
-                              "Coding error; register is both free and active! " );
-                RefBuf b;
-                const char* n = _thisfrag->lirbuf->printer->formatRef(&b, ins);
+            NanoAssertMsg(!_allocator.isFree(r),
+                          "Coding error; register is both free and active! " );
+            RefBuf b;
+            const char* n = _thisfrag->lirbuf->printer->formatRef(&b, ins);
 
-                if (ins->isop(LIR_paramp) && ins->paramKind()==1 &&
-                    r == Assembler::savedRegs[ins->paramArg()])
-                {
-                    // dont print callee-saved regs that arent used
-                    continue;
-                }
+            if (ins->isop(LIR_paramp) && ins->paramKind()==1 &&
+                r == Assembler::savedRegs[ins->paramArg()])
+            {
+                // don't print callee-saved regs that aren't used
+                continue;
+            }
 
-                VMPI_sprintf(s, " %s(%s)", gpn(r), n);
-                s += VMPI_strlen(s);
-            }
+            VMPI_sprintf(s, " %s(%s)", gpn(r), n);
+            s += VMPI_strlen(s);
         }
         output();
     }
 
     void Assembler::printActivationState()
     {
         char* s = &outline[0];
         VMPI_memset(s, ' ', 26);  s[26] = '\0';
@@ -2231,36 +2243,33 @@ namespace nanojit
         // Find the top GpRegs that are candidates to put in SavedRegs.
 
         // 'tosave' is a binary heap stored in an array.  The root is tosave[0];
         // the parent of tosave[j] is tosave[j/2], matching the bubble-up loop below.
 
         Register tosave[LastReg-FirstReg+1];
         int len=0;
         RegAlloc *regs = &_allocator;
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            if (rmask(r) & GpRegs & ~ignore) {
-                LIns *ins = regs->getActive(r);
-                if (ins) {
-                    if (canRemat(ins)) {
-                        NanoAssert(ins->getReg() == r);
-                        evict(ins);
-                    }
-                    else {
-                        int32_t pri = regs->getPriority(r);
-                        // add to heap by adding to end and bubbling up
-                        int j = len++;
-                        while (j > 0 && pri > regs->getPriority(tosave[j/2])) {
-                            tosave[j] = tosave[j/2];
-                            j /= 2;
-                        }
-                        NanoAssert(size_t(j) < sizeof(tosave)/sizeof(tosave[0]));
-                        tosave[j] = r;
-                    }
+        RegisterMask evict_set = regs->activeMask() & GpRegs & ~ignore;
+        for (Register r = lsReg(evict_set); evict_set; r = nextLsReg(evict_set, r)) {
+            LIns *ins = regs->getActive(r);
+            if (canRemat(ins)) {
+                NanoAssert(ins->getReg() == r);
+                evict(ins);
+            }
+            else {
+                int32_t pri = regs->getPriority(r);
+                // add to heap by adding to end and bubbling up
+                int j = len++;
+                while (j > 0 && pri > regs->getPriority(tosave[j/2])) {
+                    tosave[j] = tosave[j/2];
+                    j /= 2;
                 }
+                NanoAssert(size_t(j) < sizeof(tosave)/sizeof(tosave[0]));
+                tosave[j] = r;
             }
         }
 
         // Now primap has the live exprs in priority order.
         // Allocate each of the top priority exprs to a SavedReg.
 
         RegisterMask allow = SavedRegs;
         while (allow && len > 0) {
@@ -2292,34 +2301,22 @@ namespace nanojit
                 tosave[j] = last;
             }
         }
 
         // now evict everything else.
         evictSomeActiveRegs(~(SavedRegs | ignore));
     }
 
-    void Assembler::evictAllActiveRegs()
-    {
-        // generate code to restore callee saved registers
-        // @todo speed this up
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            evictIfActive(r);
-        }
-    }
-
+    // Generate code to restore any registers in 'regs' that are currently active.
     void Assembler::evictSomeActiveRegs(RegisterMask regs)
     {
-        // generate code to restore callee saved registers
-        // @todo speed this up
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            if ((rmask(r) & regs)) {
-                evictIfActive(r);
-            }
-        }
+        RegisterMask evict_set = regs & _allocator.activeMask();
+        for (Register r = lsReg(evict_set); evict_set; r = nextLsReg(evict_set, r))
+            evict(_allocator.getActive(r));
     }
 
     /**
      * Merge the current regstate with a previously stored version.
      *
      * Situation                            Change to _allocator
      * ---------                            --------------------
      * !current & !saved
@@ -2332,29 +2329,23 @@ namespace nanojit
     {
         Register regsTodo[LastReg + 1];
         LIns* insTodo[LastReg + 1];
         int nTodo = 0;
 
         // Do evictions and pops first.
         verbose_only(bool shouldMention=false; )
         // The obvious thing to do here is to iterate from FirstReg to LastReg.
-        // viz: for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) ...
         // However, on ARM that causes lower-numbered integer registers
        // to be saved at higher addresses, which inhibits the formation
         // of load/store multiple instructions.  Hence iterate the loop the
-        // other way.  The "r <= LastReg" guards against wraparound in
-        // the case where Register is treated as unsigned and FirstReg is zero.
-        //
-        // Note, the loop var is deliberately typed as int (*not* Register)
-        // to outsmart compilers that will otherwise report
-        // "error: comparison is always true due to limited range of data type".
-        for (int ri = LastReg; ri >= FirstReg && ri <= LastReg; ri = int(prevreg(Register(ri))))
+        // other way.
+        RegisterMask reg_set = _allocator.activeMask() | saved.activeMask();
+        for (Register r = msReg(reg_set); reg_set; r = nextMsReg(reg_set, r))
         {
-            Register const r = Register(ri);
             LIns* curins = _allocator.getActive(r);
             LIns* savedins = saved.getActive(r);
             if (curins != savedins)
             {
                 if (savedins) {
                     regsTodo[nTodo] = r;
                     insTodo[nTodo] = savedins;
                     nTodo++;
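
intersectRegisterState() keeps the downward iteration order for the reason the comment gives (ascending order would save lower-numbered ARM registers at higher addresses and defeat load/store-multiple formation), but now walks only the active bits from the top. A sketch of the assumed most-significant-bit counterparts of lsReg/nextLsReg:

    // Assumed top-down counterparts of lsReg/nextLsReg (not in this patch).
    // Assumption: a 32-bit RegisterMask and a gcc/clang builtin.
    static inline Register msReg(RegisterMask set) {
        NanoAssert(set != 0);
        return Register(31 - __builtin_clz(set));
    }
    static inline Register nextMsReg(RegisterMask& set, Register r) {
        set &= ~(RegisterMask(1) << r);        // clear the visited bit
        return set ? msReg(set) : Register(0);
    }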
@@ -2398,17 +2389,18 @@ namespace nanojit
     void Assembler::unionRegisterState(RegAlloc& saved)
     {
         Register regsTodo[LastReg + 1];
         LIns* insTodo[LastReg + 1];
         int nTodo = 0;
 
         // Do evictions and pops first.
         verbose_only(bool shouldMention=false; )
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
+        RegisterMask reg_set = _allocator.activeMask() | saved.activeMask();
+        for (Register r = lsReg(reg_set); reg_set; r = nextLsReg(reg_set, r))
         {
             LIns* curins = _allocator.getActive(r);
             LIns* savedins = saved.getActive(r);
             if (curins != savedins)
             {
                 if (savedins) {
                     regsTodo[nTodo] = r;
                     insTodo[nTodo] = savedins;
@@ -2448,25 +2440,24 @@ namespace nanojit
 
     // Scan table for instruction with the lowest priority, meaning it is used
     // furthest in the future.
     LIns* Assembler::findVictim(RegisterMask allow)
     {
         NanoAssert(allow);
         LIns *ins, *vic = 0;
         int allow_pri = 0x7fffffff;
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
+        RegisterMask vic_set = allow & _allocator.activeMask();
+        for (Register r = lsReg(vic_set); vic_set; r = nextLsReg(vic_set, r))
         {
-            if ((allow & rmask(r)) && (ins = _allocator.getActive(r)) != 0)
-            {
-                int pri = canRemat(ins) ? 0 : _allocator.getPriority(r);
-                if (!vic || pri < allow_pri) {
-                    vic = ins;
-                    allow_pri = pri;
-                }
+            ins = _allocator.getActive(r);
+            int pri = canRemat(ins) ? 0 : _allocator.getPriority(r);
+            if (!vic || pri < allow_pri) {
+                vic = ins;
+                allow_pri = pri;
             }
         }
         NanoAssert(vic != 0);
         return vic;
     }
 
 #ifdef NJ_VERBOSE
     char Assembler::outline[8192];
--- a/js/src/nanojit/Assembler.h
+++ b/js/src/nanojit/Assembler.h
@@ -191,17 +191,17 @@ namespace nanojit
     };
 
     typedef SeqBuilder<NIns*> NInsList;
     typedef HashMap<NIns*, LIns*> NInsMap;
 #if NJ_USES_IMMD_POOL
     typedef HashMap<uint64_t, uint64_t*> ImmDPoolMap;
 #endif
 
-#ifdef VTUNE
+#ifdef VMCFG_VTUNE
     class avmplus::CodegenLIR;
 #endif
 
     class LabelState
     {
     public:
         RegAlloc regs;
         NIns *addr;
@@ -266,18 +266,18 @@ namespace nanojit
             // Sets 'outlineEOL'.
             void setOutputForEOL(const char* format, ...);
 
             void printRegState();
             void printActivationState();
             #endif // NJ_VERBOSE
 
         public:
-            #ifdef VTUNE
-            avmplus::CodegenLIR *cgen;
+            #ifdef VMCFG_VTUNE
+            void* vtuneHandle;
             #endif
 
             Assembler(CodeAlloc& codeAlloc, Allocator& dataAlloc, Allocator& alloc, AvmCore* core, LogControl* logc, const Config& config);
 
             void        compile(Fragment *frag, Allocator& alloc, bool optimize
                                 verbose_only(, LInsPrinter*));
 
             void        endAssembly(Fragment* frag);
@@ -310,17 +310,21 @@ namespace nanojit
 
             uint32_t    arReserve(LIns* ins);
             void        arFree(LIns* ins);
             void        arReset();
 
             Register    registerAlloc(LIns* ins, RegisterMask allow, RegisterMask prefer);
             Register    registerAllocTmp(RegisterMask allow);
             void        registerResetAll();
-            void        evictAllActiveRegs();
+            void        evictAllActiveRegs() {
+                // The evicted set will be intersected with activeMask(),
+                // so use an all-1s mask to avoid an extra load or call.
+                evictSomeActiveRegs(~RegisterMask(0));
+            }
             void        evictSomeActiveRegs(RegisterMask regs);
             void        evictScratchRegsExcept(RegisterMask ignore);
             void        intersectRegisterState(RegAlloc& saved);
             void        unionRegisterState(RegAlloc& saved);
             void        assignSaved(RegAlloc &saved, RegisterMask skip);
             LIns*       findVictim(RegisterMask allow);
 
             Register    getBaseReg(LIns *ins, int &d, RegisterMask allow);
--- a/js/src/nanojit/CodeAlloc.cpp
+++ b/js/src/nanojit/CodeAlloc.cpp
@@ -42,17 +42,21 @@
 //#define DOPROF
 #include "../vprof/vprof.h"
 
 #ifdef FEATURE_NANOJIT
 
 namespace nanojit
 {
     static const bool verbose = false;
-#if defined(NANOJIT_ARM)
+#ifdef VMCFG_VTUNE
+    // The VTune JIT profiling API can't handle non-contiguous methods,
+    // so use a large allocation size to keep each method contiguous.
+    static const int pagesPerAlloc = 128; // 1MB
+#elif defined(NANOJIT_ARM)
     // ARM requires single-page allocations, due to the constant pool that
     // lives on each page that must be reachable by a 4kb pcrel load.
     static const int pagesPerAlloc = 1;
 #else
     static const int pagesPerAlloc = 16;
 #endif
 
     CodeAlloc::CodeAlloc()
--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@@ -1971,23 +1971,26 @@ namespace nanojit
         m_capNL[LInsImmI]  = 128;
         m_capNL[LInsImmQ]  = PTR_SIZE(0, 16);
         m_capNL[LInsImmD]  = 16;
         m_capNL[LIns1]     = 256;
         m_capNL[LIns2]     = 512;
         m_capNL[LIns3]     = 16;
         m_capNL[LInsCall]  = 64;
 
-        for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind))
+        for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind)) {
             m_listNL[nlkind] = new (alloc) LIns*[m_capNL[nlkind]];
+            m_usedNL[nlkind] = 1; // Force memset(0) in first clearAll().
+        }
 
         // Note that this allocates the CONST and MULTIPLE tables as well.
         for (CseAcc a = 0; a < CSE_NUM_USED_ACCS; a++) {
             m_capL[a] = 16;
             m_listL[a] = new (alloc) LIns*[m_capL[a]];
+            m_usedL[a] = 1; // Force memset(0) in first clearAll().
         }
 
         clearAll();
     }
 
     // Inlined/separated version of SuperFastHash.
     // This content is copyrighted by Paul Hsieh.
     // For reference see: http://www.azillionmonkeys.com/qed/hash.html
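
The m_usedNL/m_usedL initializations above look odd at first glance: why mark an empty table as used? Presumably clearAll() skips tables whose used-count is already zero, so a nonzero count is needed to force the first clearAll() to memset the freshly allocated, otherwise uninitialized arrays. A sketch of that assumed fast path:

    // Assumed shape of the per-table clear (not shown in this patch):
    // a zero used-count skips the memset, so a fresh table must start
    // with used != 0 to get zeroed exactly once.
    void clearNL(NLKind nlkind) {
        if (m_usedNL[nlkind] != 0) {
            VMPI_memset(m_listNL[nlkind], 0,
                        m_capNL[nlkind] * sizeof(LIns*));
            m_usedNL[nlkind] = 0;
        }
    }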
@@ -2479,17 +2482,17 @@ namespace nanojit
         LIns* ins;
         if (isS16(disp)) {
             if (storesSinceLastLoad != ACCSET_NONE) {
                 // Clear all normal (excludes CONST and MULTIPLE) loads
                 // aliased by stores and calls since the last time we were in
                 // this function.  
                 AccSet a = storesSinceLastLoad & ((1 << EMB_NUM_USED_ACCS) - 1);
                 while (a) {
-                    int acc = msbSet(a);
+                    int acc = msbSet32(a);
                     clearL((CseAcc)acc);
                     a &= ~(1 << acc);
                 }
 
                 // No need to clear CONST loads (those in the CSE_ACC_CONST table).
 
                 // Multi-region loads must be treated conservatively -- we
                 // always clear all of them.
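
A short worked pass of the clearing loop above, assuming msbSet32 returns the index of the most significant set bit and that stores hit regions 0 and 2 (so a == 0b0101 after masking):

    // a == 0b0101: regions 0 and 2 were stored to since the last load.
    //   pass 1: msbSet32(0b0101) == 2 -> clearL(2); a &= ~(1<<2) -> 0b0001
    //   pass 2: msbSet32(0b0001) == 0 -> clearL(0); a -> 0b0000, loop exits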
@@ -3033,17 +3036,17 @@ namespace nanojit
         case LIR_lived:
         case LIR_d2i:
         CASE64(LIR_dasq:)
             formals[0] = LTy_D;
             break;
 
         case LIR_file:
         case LIR_line:
-            // XXX: not sure about these ones.  Ignore for the moment.
+            // These will never get hit since VTUNE implies !DEBUG.  Ignore for the moment.
             nArgs = 0;
             break;
 
         default:
             NanoAssertMsgf(0, "%s\n", lirNames[op]);
         }
 
         typeCheckArgs(op, nArgs, formals, args);
--- a/js/src/nanojit/LIR.h
+++ b/js/src/nanojit/LIR.h
@@ -284,42 +284,19 @@ namespace nanojit
     // However, the struct gets padded inside LInsLd in an inconsistent way on
     // Windows, so we actually store a MiniAccSetVal inside LInsLd.  Sigh.
     // But we use MiniAccSet everywhere else.
     //
     typedef uint8_t MiniAccSetVal;
     struct MiniAccSet { MiniAccSetVal val; };
     static const MiniAccSet MINI_ACCSET_MULTIPLE = { 99 };
 
-#if defined(_WIN32) && (_MSC_VER >= 1300) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
-    extern "C" unsigned char _BitScanReverse(unsigned long * Index, unsigned long Mask);
-    # pragma intrinsic(_BitScanReverse)
-
-    // Returns the index of the most significant bit that is set.
-    static int msbSet(uint32_t x) {
-        unsigned long idx;
-        _BitScanReverse(&idx, (unsigned long)(x | 1)); // the '| 1' ensures a 0 result when x==0
-        return idx;
-    }
-#elif (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
-    static int msbSet(uint32_t x) {
-        return 31 - __builtin_clz(x | 1);
-    }
-#else
-    static int msbSet(uint32_t x) {     // slow fallback version
-        for (int i = 31; i >= 0; i--)
-            if ((1 << i) & x) 
-                return i;
-        return 0;
-    }
-#endif
-
     static MiniAccSet compressAccSet(AccSet accSet) {
         if (isSingletonAccSet(accSet)) {
-            MiniAccSet ret = { uint8_t(msbSet(accSet)) };
+            MiniAccSet ret = { uint8_t(msbSet32(accSet)) };
             return ret;
         }
 
         // If we got here, it must be a multi-region AccSet.
         return MINI_ACCSET_MULTIPLE;
     }
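
The removed msbSet() variants have evidently been renamed msbSet32 and hoisted elsewhere in the import; compressAccSet() above relies on it to turn a singleton AccSet into its bit index. A sketch matching the removed fallback's behavior:

    // Assumed behavior of msbSet32, matching the removed msbSet fallback:
    // index of the most significant set bit, with 0 returned for x == 0.
    static int msbSet32(uint32_t x) {
    #if defined(__GNUC__)
        return 31 - __builtin_clz(x | 1);   // '| 1' yields 0 for x == 0
    #else
        for (int i = 31; i >= 0; i--)       // slow portable fallback
            if ((uint32_t(1) << i) & x)
                return i;
        return 0;
    #endif
    }
    // e.g. the singleton AccSet (1 << 3) compresses to MiniAccSet{ 3 }.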
 
     static AccSet decompressMiniAccSet(MiniAccSet miniAccSet) {
@@ -1138,18 +1115,22 @@ namespace nanojit
         // Nb: the LIR writer pipeline handles things if a displacement
         // exceeds 16 bits.  This is rare, but does happen occasionally.  We
         // could go to 24 bits but then it would happen so rarely that the
         // handler code would be difficult to test and thus untrustworthy.
         //
         // Nb: the types of these bitfields are all 32-bit integers to ensure
         // they are fully packed on Windows, sigh.  Also, 'loadQual' is
         // unsigned to ensure the values 0, 1, and 2 all fit in 2 bits.
-        int32_t     disp:16;
-        int32_t     miniAccSetVal:8;
+        //
+        // Nb: an explicit 'signed' keyword is required for these bitfields;
+        // some compilers treat plain 'int' bitfields as unsigned without it.
+        // See Bugzilla 584219 comment #18
+        signed int  disp:16;
+        signed int  miniAccSetVal:8;
         uint32_t    loadQual:2;
 
         LIns*       oprnd_1;
 
         LIns        ins;
 
     public:
         LIns* getLIns() { return &ins; };
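
The Bugzilla reference above concerns bitfield signedness: whether a plain 'int' bitfield is signed is implementation-defined, so a negative displacement stored into disp could read back as a large positive value. A minimal illustration (hypothetical struct, not from the patch):

    #include <cassert>
    struct Demo { signed int disp:16; };   // portable: always signed
    // struct Risky { int disp:16; };      // plain int: sign impl-defined
    int demo() {
        Demo d;
        d.disp = -8;            // e.g. a negative stack displacement
        assert(d.disp == -8);   // guaranteed only with explicit 'signed';
                                // an unsigned 16-bit field reads back 65528
        return d.disp;
    }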
--- a/js/src/nanojit/Native.h
+++ b/js/src/nanojit/Native.h
@@ -94,25 +94,16 @@
 #if NJ_SOFTFLOAT_SUPPORTED
     #define CASESF(x)   case x
 #else
     #define CASESF(x)
 #endif
 
 namespace nanojit {
 
-    inline Register nextreg(Register r) {
-        return Register(r+1);
-    }
-
-    inline Register prevreg(Register r) {
-        return Register(r-1);
-    }
-
-
     class Fragment;
     struct SideExit;
     struct SwitchInfo;
 
     struct GuardRecord
     {
         void* jmp;
         GuardRecord* next;
@@ -147,19 +138,19 @@ namespace nanojit {
         #define gpn(r)                    regNames[(r)]
     #elif defined(NJ_VERBOSE)
         // Used for printing native instructions.  Like Assembler::outputf(),
         // but only outputs if LC_Native is set.  Also prepends the output
         // with the address of the current native instruction.
         #define asm_output(...) do { \
             if (_logc->lcbits & LC_Native) { \
                 outline[0]='\0'; \
-               VMPI_sprintf(outline, "%p   ", _nIns); \
-                sprintf(&outline[13], ##__VA_ARGS__); \
-                output(); \
+                VMPI_sprintf(outline, "%p   ", _nIns);  \
+                VMPI_sprintf(outline+VMPI_strlen(outline), ##__VA_ARGS__);   \
+                output();                               \
             } \
         } while (0) /* no semi */
         #define gpn(r)                  regNames[(r)]
     #else
         #define asm_output(...)
         #define gpn(r)
     #endif /* NJ_VERBOSE */
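
The asm_output() change above fixes a pointer-width assumption: the old code appended the mnemonic at the hard-coded offset &outline[13], which matches the length of "%p   " only for 32-bit pointers; on 64-bit builds %p prints more characters and the mnemonic would splice into the address text. An illustration (not from the patch):

    void asm_output_demo(NIns* _nIns) {
        char outline[8192];
        VMPI_sprintf(outline, "%p   ", (void*)_nIns);
        // 32-bit: "0x12345678   " is 13 chars, so &outline[13] happened
        // to work.  64-bit: %p is longer, so append at the measured end.
        VMPI_sprintf(outline + VMPI_strlen(outline), "nop");
    }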
 
--- a/js/src/nanojit/NativeARM.cpp
+++ b/js/src/nanojit/NativeARM.cpp
@@ -38,17 +38,16 @@
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
 
 #include "nanojit.h"
 
 #ifdef UNDER_CE
 #include <cmnintrin.h>
-extern "C" bool blx_lr_broken();
 #endif
 
 #if defined(FEATURE_NANOJIT) && defined(NANOJIT_ARM)
 
 namespace nanojit
 {
 
 #ifdef NJ_VERBOSE
@@ -109,23 +108,24 @@ Assembler::CountLeadingZeroes(uint32_t d
     // now we can avoid the cost of the check as we don't intend to support
     // ARMv4 anyway.
     NanoAssert(_config.arm_arch >= 5);
 
 #if defined(__ARMCC__)
     // ARMCC can do this with an intrinsic.
     leading_zeroes = __clz(data);
 
-// current Android GCC compiler incorrectly refuses to compile 'clz' for armv5
-// (even though this is a legal instruction there). Since we currently only compile for ARMv5
-// for emulation, we don't care too much (but we DO care for ARMv6+ since those are "real"
-// devices).
-#elif defined(__GNUC__) && !(defined(ANDROID) && __ARM_ARCH__ <= 5)
+#elif defined(__GNUC__) && (NJ_COMPILER_ARM_ARCH >= 5)
     // GCC can use inline assembler to insert a CLZ instruction.
     __asm (
+#if defined(ANDROID) && (NJ_COMPILER_ARM_ARCH < 7)
+    // The Android gcc toolchain refuses to assemble the clz instruction for
+    // targets below armv7, despite clz being legal on armv5+.
+        "   .arch armv7-a\n"
+#endif
         "   clz     %0, %1  \n"
         :   "=r"    (leading_zeroes)
         :   "r"     (data)
     );
 #elif defined(UNDER_CE)
     // WinCE can do this with an intrinsic.
     leading_zeroes = _CountLeadingZeros(data);
 #else
@@ -458,21 +458,16 @@ Assembler::asm_eor_imm(Register rd, Regi
 
 // --------------------------------
 // Assembler functions.
 // --------------------------------
 
 void
 Assembler::nInit(AvmCore*)
 {
-#ifdef UNDER_CE
-    blx_lr_bug = blx_lr_broken();
-#else
-    blx_lr_bug = 0;
-#endif
     nHints[LIR_calli]  = rmask(retRegs[0]);
     nHints[LIR_hcalli] = rmask(retRegs[1]);
     nHints[LIR_paramp] = PREFER_SPECIAL;
 }
 
 void Assembler::nBeginAssembly()
 {
     max_out_args = 0;
@@ -623,17 +618,17 @@ Assembler::asm_arg(ArgType ty, LIns* arg
     if (ty == ARGTYPE_D) {
         // This task is fairly complex and so is delegated to asm_arg_64.
         asm_arg_64(arg, r, stkd);
     } else {
         NanoAssert(ty == ARGTYPE_I || ty == ARGTYPE_UI);
         // pre-assign registers R0-R3 for arguments (if they fit)
         if (r < R4) {
             asm_regarg(ty, arg, r);
-            r = nextreg(r);
+            r = Register(r + 1);
         } else {
             asm_stkarg(arg, stkd);
             stkd += 4;
         }
     }
 }
 
 // Encode a 64-bit floating-point argument using the appropriate ABI.
@@ -657,24 +652,24 @@ Assembler::asm_arg_64(LIns* arg, Registe
 
 #ifdef NJ_ARM_EABI
     // EABI requires that 64-bit arguments are aligned on even-numbered
     // registers, as R0:R1 or R2:R3. If the register base is at an
     // odd-numbered register, advance it. Note that this will push r past
     // R3 if r is R3 to start with, and will force the argument to go on
     // the stack.
     if ((r == R1) || (r == R3)) {
-        r = nextreg(r);
+        r = Register(r + 1);
     }
 #endif
 
     if (r < R3) {
         Register    ra = r;
-        Register    rb = nextreg(r);
-        r = nextreg(rb);
+        Register    rb = Register(r + 1);
+        r = Register(rb + 1);
 
 #ifdef NJ_ARM_EABI
         // EABI requires that 64-bit arguments are aligned on even-numbered
         // registers, as R0:R1 or R2:R3.
         NanoAssert( ((ra == R0) && (rb == R1)) || ((ra == R2) && (rb == R3)) );
 #endif
 
         // Put the argument in ra and rb. If the argument is in a VFP register,
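
As a worked example of the EABI alignment rule above (hypothetical call signatures, tracking the register cursor r):

    // f(int a, double b):            a -> R0; r == R1 is odd -> advance;
    //                                b -> R2:R3.
    // f(int a, int b, double c):     a -> R0, b -> R1; r == R2 is even;
    //                                c -> R2:R3.
    // f(int a, int b, int c, double d): a,b,c -> R0..R2; r == R3 is odd
    //                                -> advance past R3; d goes entirely
    //                                on the stack.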
@@ -687,22 +682,18 @@ Assembler::asm_arg_64(LIns* arg, Registe
             asm_regarg(ARGTYPE_I, arg->oprnd2(), rb);
         }
 
 #ifndef NJ_ARM_EABI
     } else if (r == R3) {
         // We only have one register left, but the legacy ABI requires that we
         // put 32 bits of the argument in the register (R3) and the remaining
         // 32 bits on the stack.
-        Register    ra = r;
-        r = nextreg(r);
-
-        // This really just checks that nextreg() works properly, as we know
-        // that r was previously R3.
-        NanoAssert(r == R4);
+        Register    ra = r; // R3
+        r = R4;
 
         // We're splitting the argument between registers and the stack.  This
         // must be the first time that the stack is used, so stkd must be at 0.
         NanoAssert(stkd == 0);
 
         if (_config.arm_vfp) {
             // TODO: We could optimize the this to store directly from
             // the VFP register to memory using "FMRRD ra, fp_reg[31:0]" and
@@ -907,36 +898,27 @@ Assembler::asm_call(LIns* ins)
     }
 
     // Emit the branch.
     if (!indirect) {
         verbose_only(if (_logc->lcbits & LC_Native)
             outputf("        %p:", _nIns);
         )
 
-        // Direct call: on v5 and above (where the calling sequence doesn't
-        // corrupt LR until the actual branch instruction), we can avoid an
-        // interlock in the "long" branch sequence by manually loading the
-        // target address into LR ourselves before setting up the parameters
-        // in other registers.
         BranchWithLink((NIns*)ci->_address);
     } else {
-        // Indirect call: we assign the address arg to LR since it's not
-        // used for regular arguments, and is otherwise scratch since it's
-        // clobberred by the call. On v4/v4T, where we have to manually do
-        // the equivalent of a BLX, move LR into IP before corrupting LR
-        // with the return address.
-        if (blx_lr_bug) {
-            // workaround for msft device emulator bug (blx lr emulated as no-op)
-            underrunProtect(8);
-            BLX(IP);
-            MOV(IP,LR);
-        } else {
-            BLX(LR);
-        }
+        // Indirect call: we assign the address arg to LR.
+#ifdef UNDER_CE
+        // workaround for msft device emulator bug (blx lr emulated as no-op)
+        underrunProtect(8);
+        BLX(IP);
+        MOV(IP, LR);
+#else
+        BLX(LR);
+#endif
         asm_regarg(ARGTYPE_I, ins->arg(--argc), LR);
     }
 
     // Encode the arguments, starting at R0 and with an empty argument stack.
     Register    r = R0;
     int         stkd = 0;
 
     // Iterate through the argument list and encode each argument according to
@@ -976,18 +958,16 @@ Assembler::nRegisterResetAll(RegAlloc& a
     // add scratch registers to our free list for the allocator
     a.clear();
     a.free =
         rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) |
         rmask(R5) | rmask(R6) | rmask(R7) | rmask(R8) | rmask(R9) |
         rmask(R10) | rmask(LR);
     if (_config.arm_vfp)
         a.free |= FpRegs;
-
-    debug_only(a.managed = a.free);
 }
 
 static inline ConditionCode
 get_cc(NIns *ins)
 {
     return ConditionCode((*ins >> 28) & 0xF);
 }
 
@@ -1920,27 +1900,29 @@ inline void
 Assembler::BLX(Register addr, bool chk /* = true */)
 {
     // We need to emit an ARMv5+ instruction, so assert that we have a suitable
     // processor. Note that we don't support ARMv4(T), but this serves as a
     // useful sanity check.
     NanoAssert(_config.arm_arch >= 5);
 
     NanoAssert(IsGpReg(addr));
+#ifdef UNDER_CE
     // There is a bug in the WinCE device emulator which stops "BLX LR" from
     // working as expected. Assert that we never do that!
-    if (blx_lr_bug) { NanoAssert(addr != LR); }
+    NanoAssert(addr != LR);
+#endif
 
     if (chk) {
         underrunProtect(4);
     }
 
-    // BLX IP
+    // BLX reg
     *(--_nIns) = (NIns)( (COND_AL) | (0x12<<20) | (0xFFF<<8) | (0x3<<4) | (addr) );
-    asm_output("blx ip");
+    asm_output("blx %s", gpn(addr));
 }
 
 // Emit the code required to load a memory address into a register as follows:
 // d = *(b+off)
 // underrunProtect calls from this function can be disabled by setting chk to
 // false. However, this function can use more than LD32_size bytes of space if
 // the offset is out of the range of a LDR instruction; the maximum space this
 // function requires for underrunProtect is 4+LD32_size.
@@ -2772,37 +2754,33 @@ Assembler::asm_cmov(LIns* ins)
                (ins->isop(LIR_cmovd) && iftrue->isD() && iffalse->isD()));
 
     RegisterMask allow = ins->isD() ? FpRegs : GpRegs;
 
     Register rr = prepareResultReg(ins, allow);
 
     Register rf = findRegFor(iffalse, allow & ~rmask(rr));
 
+    // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
+    Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
+
     if (ins->isop(LIR_cmovd)) {
         NIns* target = _nIns;
         asm_nongp_copy(rr, rf);
         asm_branch(false, condval, target);
-
-        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
         if (rr != rt)
             asm_nongp_copy(rr, rt);
         freeResourcesOf(ins);
         if (!iftrue->isInReg()) {
             NanoAssert(rt == rr);
             findSpecificRegForUnallocated(iftrue, rr);
         }
         return;
     }
 
-    // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-    Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
     // WARNING: We cannot generate any code that affects the condition
     // codes between the MRcc generation here and the asm_cmp() call
     // below.  See asm_cmp() for more details.
     if (ins->isop(LIR_cmovi)) {
         switch (condval->opcode()) {
             // note that these are all opposites...
             case LIR_eqi:    MOVNE(rr, rf);  break;
             case LIR_lti:    MOVGE(rr, rf);  break;
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -235,17 +235,16 @@ verbose_only( extern const char* shiftNa
     void        asm_sub_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     void        asm_and_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     void        asm_orr_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     void        asm_eor_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     inline bool     encOp2Imm(uint32_t literal, uint32_t * enc);                \
     inline uint32_t CountLeadingZeroes(uint32_t data);                          \
     int *       _nSlot;                                                         \
     int *       _nExitSlot;                                                     \
-    bool        blx_lr_bug;                                                     \
     int         max_out_args; /* bytes */
 
 #define IMM32(imm)  *(--_nIns) = (NIns)((imm));
 
 #define OP_IMM  (1<<25)
 #define OP_STAT (1<<20)
 
 #define COND_AL ((uint32_t)AL<<28)
--- a/js/src/nanojit/NativeMIPS.cpp
+++ b/js/src/nanojit/NativeMIPS.cpp
@@ -476,36 +476,36 @@ namespace nanojit
         NanoAssert(cpu_has_fpu);
 #endif
 
         // O32 ABI requires that 64-bit arguments are aligned on even-numbered
        // registers, as A0:A1/FA0 or A2:A3/FA1. Use the stack offset to keep
        // track of where we are.
         if (stkd & 4) {
             if (stkd < 16) {
-                r = nextreg(r);
-                fr = nextreg(fr);
+                r = Register(r + 1);
+                fr = Register(fr + 1);
             }
             stkd += 4;
         }
 
         if (stkd < 16) {
             NanoAssert(fr == FA0 || fr == FA1 || fr == A2);
             if (fr == FA0 || fr == FA1)
                 findSpecificRegFor(arg, fr);
             else {
                 findSpecificRegFor(arg, FA1);
                 // Move it to the integer pair
                 Register fpupair = arg->getReg();
                 Register intpair = fr;
-                MFC1(mswregpair(intpair), nextreg(fpupair));       // Odd fpu register contains sign,expt,manthi
+                MFC1(mswregpair(intpair), Register(fpupair + 1));  // Odd fpu register contains sign,expt,manthi
                 MFC1(lswregpair(intpair), fpupair);                // Even fpu register contains mantlo
             }
-            r = nextreg(nextreg(r));
-            fr = nextreg(nextreg(fr));
+            r = Register(r + 2);
+            fr = Register(fr + 2);
         }
         else
             asm_stkarg(arg, stkd);
 
         stkd += 8;
     }
 
     /* Required functions */
@@ -1573,18 +1573,18 @@ namespace nanojit
 
         if (ty == ARGTYPE_D) {
             // This task is fairly complex and so is delegated to asm_arg_64.
             asm_arg_64(arg, r, fr, stkd);
         } else {
             NanoAssert(ty == ARGTYPE_I || ty == ARGTYPE_UI);
             if (stkd < 16) {
                 asm_regarg(ty, arg, r);
-                fr = nextreg(fr);
-                r = nextreg(r);
+                fr = Register(fr + 1);
+                r = Register(r + 1);
             }
             else
                 asm_stkarg(arg, stkd);
            // The o32 ABI calling convention is that if the first argument
             // is not a double, subsequent double values are passed in integer registers
             fr = r;
             stkd += 4;
         }
@@ -1679,17 +1679,16 @@ namespace nanojit
 
     void
     Assembler::nRegisterResetAll(RegAlloc& regs)
     {
         regs.clear();
         regs.free = GpRegs;
         if (cpu_has_fpu)
             regs.free |= FpRegs;
-        debug_only(regs.managed = regs.free;)
     }
 
 #define signextend16(s) ((int32_t(s)<<16)>>16)
 
     void
     Assembler::nPatchBranch(NIns* branch, NIns* target)
     {
         uint32_t op = (branch[0] >> 26) & 0x3f;
--- a/js/src/nanojit/NativePPC.cpp
+++ b/js/src/nanojit/NativePPC.cpp
@@ -731,31 +731,31 @@ namespace nanojit
             uint32_t j = argc - i - 1;
             ArgType ty = argTypes[j];
             LIns* arg = ins->arg(j);
             NanoAssert(ty != ARGTYPE_V);
             if (ty != ARGTYPE_D) {
                 // GP arg
                 if (r <= R10) {
                     asm_regarg(ty, arg, r);
-                    r = nextreg(r);
+                    r = Register(r + 1);
                     param_size += sizeof(void*);
                 } else {
                     // put arg on stack
                     TODO(stack_int32);
                 }
             } else {
                 // double
                 if (fr <= F13) {
                     asm_regarg(ty, arg, fr);
-                    fr = nextreg(fr);
+                    fr = Register(fr + 1);
                 #ifdef NANOJIT_64BIT
-                    r = nextreg(r);
+                    r = Register(r + 1);
                 #else
-                    r = nextreg(nextreg(r)); // skip 2 gpr's
+                    r = Register(r + 2); // skip 2 gpr's
                 #endif
                     param_size += sizeof(double);
                 } else {
                     // put arg on stack
                     TODO(stack_double);
                 }
             }
         }
@@ -1035,21 +1035,21 @@ namespace nanojit
             CLRLDI(r, v, 32); // clears the top 32 bits
             break;
         case LIR_i2q:
             EXTSW(r, v);
             break;
         }
     }
 
-    void Assembler::asm_dasq(LIns *ins) {
+    void Assembler::asm_dasq(LIns*) {
         TODO(asm_dasq);
     }
 
-    void Assembler::asm_qasd(LIns *ins) {
+    void Assembler::asm_qasd(LIns*) {
         TODO(asm_qasd);
     }
 
     #endif
 
 #ifdef NANOJIT_64BIT
     void Assembler::asm_immq(LIns *ins) {
         Register r = ins->deprecated_getReg();
@@ -1385,17 +1385,16 @@ namespace nanojit
         }
         _allocator.free &= ~rmask(i);
         return i;
     }
 
     void Assembler::nRegisterResetAll(RegAlloc &regs) {
         regs.clear();
         regs.free = SavedRegs | 0x1ff8 /* R3-12 */ | 0x3ffe00000000LL /* F1-13 */;
-        debug_only(regs.managed = regs.free);
     }
 
 #ifdef NANOJIT_64BIT
     void Assembler::asm_qbinop(LIns *ins) {
         LOpcode op = ins->opcode();
         switch (op) {
         case LIR_orq:
         case LIR_andq:
--- a/js/src/nanojit/NativeSparc.cpp
+++ b/js/src/nanojit/NativeSparc.cpp
@@ -229,17 +229,16 @@ namespace nanojit
         _allocator.free &= ~rmask((Register)i);
         return (Register) i;
     }
 
     void Assembler::nRegisterResetAll(RegAlloc& a)
     {
         a.clear();
         a.free = GpRegs | FpRegs;
-        debug_only( a.managed = a.free; )
     }
 
     void Assembler::nPatchBranch(NIns* branch, NIns* location)
     {
         *(uint32_t*)&branch[0] &= 0xFFC00000;
         *(uint32_t*)&branch[0] |= ((intptr_t)location >> 10) & 0x3FFFFF;
         *(uint32_t*)&branch[1] &= 0xFFFFFC00;
         *(uint32_t*)&branch[1] |= (intptr_t)location & 0x3FF;
@@ -532,32 +531,35 @@ namespace nanojit
                     BGU(0, tt);
                 else //if (condop == LIR_geui)
                     BCC(0, tt);
             }
         asm_cmp(cond);
         return at;
     }
 
-    NIns* Assembler::asm_branch_ov(LOpcode, NIns* targ)
+    NIns* Assembler::asm_branch_ov(LOpcode op, NIns* targ)
     {
         NIns* at = 0;
         underrunProtect(32);
         intptr_t tt = ((intptr_t)targ - (intptr_t)_nIns + 8) >> 2;
         // !targ means that it needs patch.
         if( !(isIMM22((int32_t)tt)) || !targ ) {
             JMP_long_nocheck((intptr_t)targ);
             at = _nIns;
             NOP();
             BA(0, 5);
             tt = 4;
         }
         NOP();
 
-        BVS(0, tt);
+        if( op == LIR_mulxovi || op == LIR_muljovi )
+            BNE(0, tt);
+        else
+            BVS(0, tt);
         return at;
     }
 
     void Assembler::asm_cmp(LIns *cond)
     {
         underrunProtect(12);
 
         LIns* lhs = cond->oprnd1();
@@ -640,17 +642,17 @@ namespace nanojit
     {
         underrunProtect(28);
         LOpcode op = ins->opcode();
         LIns* lhs = ins->oprnd1();
         LIns* rhs = ins->oprnd2();
 
         Register rb = deprecated_UnknownReg;
         RegisterMask allow = GpRegs;
-        bool forceReg = (op == LIR_muli || op == LIR_mulxovi || !rhs->isImmI());
+        bool forceReg = (op == LIR_muli || op == LIR_mulxovi || op == LIR_muljovi || !rhs->isImmI());
 
         if (lhs != rhs && forceReg)
             {
                 if ((rb = asm_binop_rhs_reg(ins)) == deprecated_UnknownReg) {
                     rb = findRegFor(rhs, allow);
                 }
                 allow &= ~rmask(rb);
             }
@@ -674,18 +676,24 @@ namespace nanojit
             {
                 if (lhs == rhs)
                     rb = ra;
 
                 if (op == LIR_addi || op == LIR_addxovi)
                     ADDCC(rr, rb, rr);
                 else if (op == LIR_subi || op == LIR_subxovi)
                     SUBCC(rr, rb, rr);
-                else if (op == LIR_muli || op == LIR_mulxovi)
-                    MULX(rr, rb, rr);
+                else if (op == LIR_muli)
+                    SMULCC(rr, rb, rr);
+                else if (op == LIR_mulxovi || op == LIR_muljovi) {
+                    SUBCC(L4, L6, L4);
+                    SRAI(rr, 31, L6);
+                    RDY(L4);
+                    SMULCC(rr, rb, rr);
+                }
                 else if (op == LIR_andi)
                     AND(rr, rb, rr);
                 else if (op == LIR_ori)
                     OR(rr, rb, rr);
                 else if (op == LIR_xori)
                     XOR(rr, rb, rr);
                 else if (op == LIR_lshi)
                     SLL(rr, rb, rr);
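
The SPARC change replaces the bogus 64-bit MULX with SMULCC plus an explicit overflow test, and asm_branch_ov() now branches with BNE for the multiply opcodes. SMULCC leaves the low 32 bits of the signed product in rd and the high 32 bits in the Y register; the product overflows the 32-bit signed range exactly when those high bits differ from the sign-extension of the low word. A hedged C++ restatement of the test the emitted sequence performs:

    // What the SMULCC / RDY / SRAI / SUBCC sequence computes (sketch).
    bool smul_overflowed(int32_t a, int32_t b) {
        int64_t full = (int64_t)a * (int64_t)b;
        int32_t lo = (int32_t)full;           // rr after SMULCC
        int32_t hi = (int32_t)(full >> 32);   // RDY(L4): high word from Y
        int32_t sign = lo >> 31;              // SRAI(rr, 31, L6)
        return hi != sign;                    // SUBCC(L4, L6, L4) sets the
                                              // condition codes; BNE fires
    }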
--- a/js/src/nanojit/NativeSparc.h
+++ b/js/src/nanojit/NativeSparc.h
@@ -732,20 +732,20 @@ namespace nanojit
     } while (0)
 
 #define MOVVSI(simm11, cc2, cc1, cc0, rd) \
     do { \
     Format_4_2I(rd, 0x2c, cc2, 7, cc1, cc0, simm11); \
     asm_output("movvs %d, %s", simm11, gpn(rd)); \
     } while (0)
 
-#define MULX(rs1, rs2, rd) \
+#define SMULCC(rs1, rs2, rd) \
     do { \
-    Format_3_1(2, rd, 0x9, rs1, 0, rs2); \
-    asm_output("mul %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
+    Format_3_1(2, rd, 0x1b, rs1, 0, rs2); \
+    asm_output("smulcc %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
     } while (0)
 
 #define NOP() \
     do { \
     Format_2_1(0, 0x4, 0); \
     asm_output("nop"); \
     } while (0)
 
@@ -768,16 +768,22 @@ namespace nanojit
     } while (0)
 
 #define ANDCC(rs1, rs2, rd) \
     do { \
     Format_3_1(2, rd, 0x11, rs1, 0, rs2); \
     asm_output("andcc %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
     } while (0)
 
+#define RDY(rd) \
+    do { \
+    Format_3_1(2, rd, 0x28, 0, 0, 0); \
+    asm_output("rdy %s", gpn(rd)); \
+    } while (0)
+
 #define RESTORE(rs1, rs2, rd) \
     do { \
     Format_3_1(2, rd, 0x3D, rs1, 0, rs2); \
     asm_output("restore"); \
     } while (0)
 
 #define SAVEI(rs1, simm13, rd) \
     do { \
@@ -804,16 +810,22 @@ namespace nanojit
     } while (0)
 
 #define SRA(rs1, rs2, rd) \
     do { \
     Format_3_5(2, rd, 0x27, rs1, 0, rs2); \
     asm_output("sra %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
     } while (0)
 
+#define SRAI(rs1, shcnt32, rd) \
+    do { \
+    Format_3_6(2, rd, 0x27, rs1, shcnt32); \
+    asm_output("sra %s, %d, %s", gpn(rs1), shcnt32, gpn(rd)); \
+    } while (0)
+
 #define SRL(rs1, rs2, rd) \
     do { \
     Format_3_5(2, rd, 0x26, rs1, 0, rs2); \
     asm_output("srl %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
     } while (0)
 
 #define STF(rd, rs1, rs2) \
     do { \
--- a/js/src/nanojit/NativeX64.cpp
+++ b/js/src/nanojit/NativeX64.cpp
@@ -961,17 +961,17 @@ namespace nanojit
                 // double goes in XMM reg # based on overall arg_index
                 asm_regarg(ty, arg, Register(XMM0+arg_index));
                 arg_index++;
             }
         #else
             else if (ty == ARGTYPE_D && fr < XMM8) {
                 // double goes in next available XMM register
                 asm_regarg(ty, arg, fr);
-                fr = nextreg(fr);
+                fr = Register(fr + 1);
             }
         #endif
             else {
                 asm_stkarg(ty, arg, stk_used);
                 stk_used += sizeof(void*);
             }
         }
 
@@ -1114,37 +1114,33 @@ namespace nanojit
                    (ins->isop(LIR_cmovd) && iftrue->isD() && iffalse->isD()));
 
         RegisterMask allow = ins->isD() ? FpRegs : GpRegs;
 
         Register rr = prepareResultReg(ins, allow);
 
         Register rf = findRegFor(iffalse, allow & ~rmask(rr));
 
+        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
+        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
+
         if (ins->isop(LIR_cmovd)) {
             NIns* target = _nIns;
             asm_nongp_copy(rr, rf);
             asm_branch(false, cond, target);
-
-            // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-            Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
             if (rr != rt)
                 asm_nongp_copy(rr, rt);
             freeResourcesOf(ins);
             if (!iftrue->isInReg()) {
                 NanoAssert(rt == rr);
                 findSpecificRegForUnallocated(iftrue, rr);
             }
             return;
         }
 
-        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
         // WARNING: We cannot generate any code that affects the condition
         // codes between the MRcc generation here and the asm_cmp() call
         // below.  See asm_cmp() for more details.
         LOpcode condop = cond->opcode();
         if (ins->isop(LIR_cmovi)) {
             switch (condop) {
             case LIR_eqi:  case LIR_eqq:    CMOVNE( rr, rf);  break;
             case LIR_lti:  case LIR_ltq:    CMOVNL( rr, rf);  break;
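(Hoisting the computation of rt above the LIR_cmovd special case lets both
paths share the same rule: the result register rr is preloaded with the
'iftrue' value, and a conditional move with the inverted condition overwrites
it with 'iffalse', hence the opposite conditions in the switch above. A C
model of the emitted sequence, with assumed names and illustrative only:

    #include <cstdint>

    static int32_t cmovModel(bool cond, int32_t rt /* iftrue */,
                             int32_t rf /* iffalse */) {
        int32_t rr = rt;   // rr already holds the iftrue value
        if (!cond)         // CMOVcc / MRcc with the opposite condition
            rr = rf;
        return rr;
    }
)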
@@ -1900,17 +1896,16 @@ namespace nanojit
     void Assembler::nRegisterResetAll(RegAlloc &a) {
         // add scratch registers to our free list for the allocator
         a.clear();
 #ifdef _WIN64
         a.free = 0x001fffcf; // rax-rbx, rsi, rdi, r8-r15, xmm0-xmm5
 #else
         a.free = 0xffffffff & ~(1<<RSP | 1<<RBP);
 #endif
-        debug_only( a.managed = a.free; )
     }
 
     void Assembler::nPatchBranch(NIns *patch, NIns *target) {
         NIns *next = 0;
         if (patch[0] == 0xE9) {
             // jmp disp32
             next = patch+5;
         } else if (patch[0] == 0x0F && (patch[1] & 0xF0) == 0x80) {
--- a/js/src/nanojit/Nativei386.cpp
+++ b/js/src/nanojit/Nativei386.cpp
@@ -1107,17 +1107,16 @@ namespace nanojit
 
     void Assembler::nRegisterResetAll(RegAlloc& a)
     {
         // add scratch registers to our free list for the allocator
         a.clear();
         a.free = SavedRegs | ScratchRegs;
         if (!_config.i386_sse2)
             a.free &= ~XmmRegs;
-        debug_only( a.managed = a.free; )
     }
 
     void Assembler::nPatchBranch(NIns* branch, NIns* targ)
     {
         intptr_t offset = intptr_t(targ) - intptr_t(branch);
         if (branch[0] == JMP32) {
             *(int32_t*)&branch[1] = offset - 5;
         } else if (branch[0] == JCC32) {
@@ -2054,37 +2053,33 @@ namespace nanojit
         }
 
         RegisterMask allow = ins->isD() ? XmmRegs : GpRegs;
 
         Register rr = prepareResultReg(ins, allow);
 
         Register rf = findRegFor(iffalse, allow & ~rmask(rr));
 
+        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
+        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
+
         if (ins->isop(LIR_cmovd)) {
             NIns* target = _nIns;
             asm_nongp_copy(rr, rf);
             asm_branch(false, condval, target);
-
-            // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-            Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
             if (rr != rt)
                 asm_nongp_copy(rr, rt);
             freeResourcesOf(ins);
             if (!iftrue->isInReg()) {
                 NanoAssert(rt == rr);
                 findSpecificRegForUnallocated(iftrue, rr);
             }
             return;
         }
 
-        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
         NanoAssert(ins->isop(LIR_cmovi));
 
         // WARNING: We cannot generate any code that affects the condition
         // codes between the MRcc generation here and the asm_cmp() call
         // below.  See asm_cmp() for more details.
         switch (condval->opcode()) {
             // Note that these are all opposites...
             case LIR_eqi:    MRNE(rr, rf);   break;
--- a/js/src/nanojit/RegAlloc.cpp
+++ b/js/src/nanojit/RegAlloc.cpp
@@ -40,24 +40,16 @@
 #include "nanojit.h"
 
 namespace nanojit
 {
     #ifdef FEATURE_NANOJIT
 
     #ifdef _DEBUG
 
-    uint32_t RegAlloc::countActive()
-    {
-        int cnt = 0;
-        for(Register i=FirstReg; i <= LastReg; i = nextreg(i))
-            cnt += active[i] ? 1 : 0;
-        return cnt;
-    }
-
     bool RegAlloc::isConsistent(Register r, LIns* i) const
     {
         NanoAssert(r != deprecated_UnknownReg);
         return (isFree(r)  && !getActive(r)     && !i) ||
                (!isFree(r) &&  getActive(r)== i && i );
     }
 
     #endif /*DEBUG*/
--- a/js/src/nanojit/RegAlloc.h
+++ b/js/src/nanojit/RegAlloc.h
@@ -115,19 +115,23 @@ namespace nanojit
             return usepri[r];
         }
 
         LIns* getActive(Register r) const {
             NanoAssert(r != deprecated_UnknownReg);
             return active[r];
         }
 
-        debug_only( uint32_t    countActive(); )
+        // Return a mask containing the active registers.  For each register
+        // in this set, getActive(register) will be a nonzero LIns pointer.
+        RegisterMask activeMask() const {
+            return ~free & managed;
+        }
+
         debug_only( bool        isConsistent(Register r, LIns* v) const; )
-        debug_only( RegisterMask managed; )     // the registers managed by the register allocator
 
         // Some basics:
         //
         // - 'active' indicates which registers are active at a particular
         //   point, and for each active register, which instruction
         //   defines the value it holds.  At the start of register
         //   allocation no registers are active.
         //
@@ -166,15 +170,46 @@ namespace nanojit
         //   * And vice versa:  an LIns with an in-use reservation that
         //     names R must be named by 'active[R]'.
         //
         //   * If an LIns's reservation names 'deprecated_UnknownReg' then LIns
         //     should not be in 'active'.
         //
         LIns*           active[LastReg + 1];    // active[r] = LIns that defines r
         int32_t         usepri[LastReg + 1];    // used priority. lower = more likely to spill.
-        RegisterMask    free;
+        RegisterMask    free;       // Registers currently free.
+        RegisterMask    managed;    // Registers under management (invariant).
         int32_t         priority;
 
         DECLARE_PLATFORM_REGALLOC()
     };
+
+    // Return the lowest numbered Register in mask.
+    inline Register lsReg(RegisterMask mask) {
+        // This is faster than it looks; we rely on the C++ optimizer
+        // to strip the dead branch and inline just one alternative.
+        if (sizeof(RegisterMask) == 4)
+            return (Register) lsbSet32(mask);
+        else
+            return (Register) lsbSet64(mask);
+    }
+
+    // Return the highest numbered Register in mask.
+    inline Register msReg(RegisterMask mask) {
+        // This is faster than it looks; we rely on the C++ optimizer
+        // to strip the dead branch and inline just one alternative.
+        if (sizeof(RegisterMask) == 4)
+            return (Register) msbSet32(mask);
+        else
+            return (Register) msbSet64(mask);
+    }
+
+    // Clear bit r in mask, then return lsReg(mask).
+    inline Register nextLsReg(RegisterMask& mask, Register r) {
+        return lsReg(mask &= ~rmask(r));
+    }
+
+    // Clear bit r in mask, then return msReg(mask).
+    inline Register nextMsReg(RegisterMask& mask, Register r) {
+        return msReg(mask &= ~rmask(r));
+    }
 }
 #endif // __nanojit_RegAlloc__
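(With 'managed' promoted to a regular field, activeMask() can report the
active set with pure mask arithmetic (~free & managed), and the new
lsReg()/msReg() helpers let clients walk that set one bit at a time instead of
scanning every register. A self-contained sketch of the idiom, assuming a
32-bit RegisterMask and GCC/Clang builtins, illustrative only:

    #include <cstdint>
    #include <cstdio>

    typedef uint32_t RegisterMask;

    static inline int lsReg(RegisterMask mask) {
        return __builtin_ctz(mask);       // stands in for lsbSet32()
    }

    int main() {
        RegisterMask managed = 0x000000ff;      // allocator manages r0..r7
        RegisterMask free    = 0x000000f0;      // r4..r7 currently free
        RegisterMask active  = ~free & managed; // activeMask(): r0..r3

        // Clearing the lowest set bit each round mirrors nextLsReg().
        for (RegisterMask m = active; m != 0; m &= m - 1)
            printf("r%d is active\n", lsReg(m));
        return 0;
    }
)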
--- a/js/src/nanojit/avmplus.cpp
+++ b/js/src/nanojit/avmplus.cpp
@@ -36,23 +36,16 @@
 #include "nanojit.h"
 
 #ifdef SOLARIS
     typedef caddr_t maddr_ptr;
 #else
     typedef void *maddr_ptr;
 #endif
 
-#if defined(AVMPLUS_ARM) && defined(UNDER_CE)
-extern "C" bool
-blx_lr_broken() {
-    return false;
-}
-#endif
-
 using namespace avmplus;
 
 nanojit::Config AvmCore::config;
 
 void
 avmplus::AvmLog(char const *msg, ...) {
     va_list ap;
     va_start(ap, msg);
--- a/js/src/nanojit/nanojit.h
+++ b/js/src/nanojit/nanojit.h
@@ -184,16 +184,131 @@ static inline bool isS32(intptr_t i) {
 
 static inline bool isU32(uintptr_t i) {
     return uint32_t(i) == i;
 }
 
 #define alignTo(x,s)        ((((uintptr_t)(x)))&~(((uintptr_t)s)-1))
 #define alignUp(x,s)        ((((uintptr_t)(x))+(((uintptr_t)s)-1))&~(((uintptr_t)s)-1))
 
+namespace nanojit
+{
+// Define msbSet32(), lsbSet32(), msbSet64(), and lsbSet64() functions using
+// fast find-first-bit intrinsics when available.
+// The fall-back implementations use iteration.
+#if defined(_WIN32) && (_MSC_VER >= 1300) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
+
+    extern "C" unsigned char _BitScanForward(unsigned long * Index, unsigned long Mask);
+    extern "C" unsigned char _BitScanReverse(unsigned long * Index, unsigned long Mask);
+    # pragma intrinsic(_BitScanForward)
+    # pragma intrinsic(_BitScanReverse)
+
+    // Returns the index of the most significant bit that is set.
+    static inline int msbSet32(uint32_t x) {
+        unsigned long idx;
+        _BitScanReverse(&idx, (unsigned long)(x | 1)); // the '| 1' ensures a 0 result when x==0
+        return idx;
+    }
+
+    // Returns the index of the least significant bit that is set.
+    static inline int lsbSet32(uint32_t x) {
+        unsigned long idx;
+        _BitScanForward(&idx, (unsigned long)(x | 0x80000000)); // the '| 0x80000000' ensures a 31 result when x==0
+        return idx;
+    }
+
+#if defined(_M_AMD64) || defined(_M_X64)
+    extern "C" unsigned char _BitScanForward64(unsigned long * Index, unsigned __int64 Mask);
+    extern "C" unsigned char _BitScanReverse64(unsigned long * Index, unsigned __int64 Mask);
+    # pragma intrinsic(_BitScanForward64)
+    # pragma intrinsic(_BitScanReverse64)
+
+    // Returns the index of the most significant bit that is set.
+    static inline int msbSet64(uint64_t x) {
+        unsigned long idx;
+        _BitScanReverse64(&idx, (unsigned __int64)(x | 1)); // the '| 1' ensures a 0 result when x==0
+        return idx;
+    }
+
+    // Returns the index of the least significant bit that is set.
+    static inline int lsbSet64(uint64_t x) {
+        unsigned long idx;
+        _BitScanForward64(&idx, (unsigned __int64)(x | 0x8000000000000000LL)); // the '| 0x8000000000000000LL' ensures a 63 result when x==0
+        return idx;
+    }
+#else
+    // Returns the index of the most significant bit that is set.
+    static int msbSet64(uint64_t x) {
+        return (x & 0xffffffff00000000LL) ? msbSet32(uint32_t(x >> 32)) + 32 : msbSet32(uint32_t(x));
+    }
+    // Returns the index of the least significant bit that is set.
+    static int lsbSet64(uint64_t x) {
+        return (x & 0x00000000ffffffffLL) ? lsbSet32(uint32_t(x)) : lsbSet32(uint32_t(x >> 32)) + 32;
+    }
+#endif
+
+#elif (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
+
+    // Returns the index of the most significant bit that is set.
+    static inline int msbSet32(uint32_t x) {
+        return 31 - __builtin_clz(x | 1);
+    }
+
+    // Returns the index of the least significant bit that is set.
+    static inline int lsbSet32(uint32_t x) {
+        return __builtin_ctz(x | 0x80000000);
+    }
+
+    // Returns the index of the most significant bit that is set.
+    static inline int msbSet64(uint64_t x) {
+        return 63 - __builtin_clzll(x | 1);
+    }
+
+    // Returns the index of the least significant bit that is set.
+    static inline int lsbSet64(uint64_t x) {
+        return __builtin_ctzll(x | 0x8000000000000000LL);
+    }
+
+#else
+
+    // Slow fall-back: return most significant bit set by searching iteratively.
+    static int msbSet32(uint32_t x) {
+        for (int i = 31; i >= 0; i--)
+            if ((1u << i) & x)
+                return i;
+        return 0;
+    }
+
+    // Slow fall-back: return least significant bit set by searching iteratively.
+    static int lsbSet32(uint32_t x) {
+        for (int i = 0; i < 32; i++)
+            if ((1u << i) & x)
+                return i;
+        return 31;
+    }
+
+    // Slow fall-back: return most significant bit set by searching iteratively.
+    static int msbSet64(uint64_t x) {
+        for (int i = 63; i >= 0; i--)
+            if ((1ULL << i) & x)
+                return i;
+        return 0;
+    }
+
+    // Slow fall-back: return least significant bit set by searching iteratively.
+    static int lsbSet64(uint64_t x) {
+        for (int i = 0; i < 64; i++)
+            if ((1ULL << i) & x)
+                return i;
+        return 63;
+    }
+
+#endif // select compiler
+} // namespace nanojit
+
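(Note the deliberate saturation when x == 0 in the functions above:
msbSet32(0) returns 0 and lsbSet32(0) returns 31, and likewise 0/63 for the
64-bit variants, so the raw builtins and BitScan intrinsics are never handed
an all-zero mask. A quick standalone check of the GCC/Clang path,
illustrative only:

    #include <cassert>
    #include <cstdint>

    static inline int msbSet32(uint32_t x) { return 31 - __builtin_clz(x | 1); }
    static inline int lsbSet32(uint32_t x) { return __builtin_ctz(x | 0x80000000); }

    int main() {
        assert(msbSet32(0) == 0);
        assert(lsbSet32(0) == 31);
        assert(msbSet32(0x80000001u) == 31);
        assert(lsbSet32(0x80000001u) == 0);
        return 0;
    }
)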
 // -------------------------------------------------------------------
 // START debug-logging definitions
 // -------------------------------------------------------------------
 
 /* Debug printing stuff.  All Nanojit and jstracer debug printing
    should be routed through LogControl::printf.  Don't use
    ad-hoc calls to printf, fprintf(stderr, ...) etc.