Bug 517910 - NJ: add more alias-set annotations to LIR so as to improve CSEing of loads. r=edwsmith.
author Nicholas Nethercote <nnethercote@mozilla.com>
Tue, 23 Mar 2010 15:05:47 -0700
changeset 40349 c4c2174afcca586d8278c3e585a5b577b4247312
parent 40348 656054962a9ebaac6236bd4daefbbfef34fbaef4
child 40350 31596ada8bfd959580399eaa1a6e741dea37a19b
push id 12610
push user rsayre@mozilla.com
push date Mon, 05 Apr 2010 17:26:41 +0000
treeherder mozilla-central@1942c0b4e101 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers edwsmith
bugs 517910
milestone 1.9.3a3pre
Bug 517910 - NJ: add more alias-set annotations to LIR so as to improve CSEing of loads. r=edwsmith.
js/src/lirasm/lirasm.cpp
js/src/nanojit/LIR.cpp
js/src/nanojit/LIR.h
--- a/js/src/lirasm/lirasm.cpp
+++ b/js/src/lirasm/lirasm.cpp
@@ -513,17 +513,18 @@ FragmentAssembler::FragmentAssembler(Lir
                                                   nanojit::LC_FragProfile) ?
                                                   sProfId++ : 0));
     mFragment->lirbuf = mParent.mLirbuf;
     mParent.mFragments[mFragName].fragptr = mFragment;
 
     mLir = mBufWriter  = new LirBufWriter(mParent.mLirbuf, nanojit::AvmCore::config);
 #ifdef DEBUG
     if (optimize) {     // don't re-validate if no optimization has taken place
-        mLir = mValidateWriter2 = new ValidateWriter(mLir, "end of writer pipeline");
+        mLir = mValidateWriter2 =
+            new ValidateWriter(mLir, mFragment->lirbuf->printer, "end of writer pipeline");
     }
 #endif
 #ifdef DEBUG
     if (mParent.mVerbose) {
         mLir = mVerboseWriter = new VerboseWriter(mParent.mAlloc, mLir,
                                                   mParent.mLirbuf->printer,
                                                   &mParent.mLogc);
     }
@@ -535,17 +536,18 @@ FragmentAssembler::FragmentAssembler(Lir
     if (avmplus::AvmCore::config.soft_float) {
         mLir = new SoftFloatFilter(mLir);
     }
 #endif
     if (optimize) {
         mLir = mExprFilter = new ExprFilter(mLir);
     }
 #ifdef DEBUG
-    mLir = mValidateWriter1 = new ValidateWriter(mLir, "start of writer pipeline");
+    mLir = mValidateWriter1 =
+            new ValidateWriter(mLir, mFragment->lirbuf->printer, "start of writer pipeline");
 #endif
 
     mReturnTypeBits = 0;
     mLir->ins0(LIR_start);
     for (int i = 0; i < nanojit::NumSavedRegs; ++i)
         mLir->insParam(i, 1);
 
     mLineno = 0;
@@ -629,17 +631,17 @@ FragmentAssembler::assemble_load()
     // since, unlike sti/stqi, no immediate-displacement
     // load opcodes were defined in LIR.
     need(2);
     if (mTokens[1].find("0x") == 0 ||
         mTokens[1].find("0x") == 0 ||
         mTokens[1].find_first_of("0123456789") == 0) {
         return mLir->insLoad(mOpcode,
                              ref(mTokens[0]),
-                             imm(mTokens[1]));
+                             imm(mTokens[1]), ACC_LOAD_ANY);
     }
     bad("immediate offset required for load");
     return NULL;  // not reached
 }
 
 LIns *
 FragmentAssembler::assemble_call(const string &op)
 {
@@ -1054,17 +1056,17 @@ FragmentAssembler::assembleFragment(LirT
           case LIR_st32f:
 #endif
           case LIR_sti:
           CASE64(LIR_stqi:)
           case LIR_stfi:
             need(3);
             ins = mLir->insStore(mOpcode, ref(mTokens[0]),
                                   ref(mTokens[1]),
-                                  imm(mTokens[2]));
+                                  imm(mTokens[2]), ACC_STORE_ANY);
             break;
 
 #if NJ_EXPANDED_LOADSTORE_SUPPORTED 
           case LIR_ldsb:
           case LIR_ldss:
           case LIR_ld32f:
 #endif
           case LIR_ldzb:
@@ -1803,67 +1805,67 @@ FragmentAssembler::assembleRandomFragmen
                 n++;
             }
             break;
 
         case LLD_I: {
             vector<LIns*> Ms = rnd(2) ? M4s : M8ps;
             if (!Ms.empty()) {
                 LIns* base = rndPick(Ms);
-                ins = mLir->insLoad(rndPick(I_loads), base, rndOffset32(base->size()));
+                ins = mLir->insLoad(rndPick(I_loads), base, rndOffset32(base->size()), ACC_LOAD_ANY);
                 addOrReplace(Is, ins);
                 n++;
             }
             break;
         }
 
 #ifdef NANOJIT_64BIT
         case LLD_Q:
             if (!M8ps.empty()) {
                 LIns* base = rndPick(M8ps);
-                ins = mLir->insLoad(rndPick(Q_loads), base, rndOffset64(base->size()));
+                ins = mLir->insLoad(rndPick(Q_loads), base, rndOffset64(base->size()), ACC_LOAD_ANY);
                 addOrReplace(Qs, ins);
                 n++;
             }
             break;
 #endif
 
         case LLD_F:
             if (!M8ps.empty()) {
                 LIns* base = rndPick(M8ps);
-                ins = mLir->insLoad(rndPick(F_loads), base, rndOffset64(base->size()));
+                ins = mLir->insLoad(rndPick(F_loads), base, rndOffset64(base->size()), ACC_LOAD_ANY);
                 addOrReplace(Fs, ins);
                 n++;
             }
             break;
 
         case LST_I: {
             vector<LIns*> Ms = rnd(2) ? M4s : M8ps;
             if (!Ms.empty() && !Is.empty()) {
                 LIns* base = rndPick(Ms);
-                mLir->insStorei(rndPick(Is), base, rndOffset32(base->size()));
+                mLir->insStorei(rndPick(Is), base, rndOffset32(base->size()), ACC_STORE_ANY);
                 n++;
             }
             break;
         }
 
 #ifdef NANOJIT_64BIT
         case LST_Q:
             if (!M8ps.empty() && !Qs.empty()) {
                 LIns* base = rndPick(M8ps);
-                mLir->insStorei(rndPick(Qs), base, rndOffset64(base->size()));
+                mLir->insStorei(rndPick(Qs), base, rndOffset64(base->size()), ACC_STORE_ANY);
                 n++;
             }
             break;
 #endif
 
         case LST_F:
             if (!M8ps.empty() && !Fs.empty()) {
                 LIns* base = rndPick(M8ps);
-                mLir->insStorei(rndPick(Fs), base, rndOffset64(base->size()));
+                mLir->insStorei(rndPick(Fs), base, rndOffset64(base->size()), ACC_STORE_ANY);
                 n++;
             }
             break;
 
         case LCALL_I_I1:
             if (!Is.empty()) {
                 LIns* args[1] = { rndPick(Is) };
                 ins = mLir->insCall(&ci_I_I1, args);
--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@@ -1144,34 +1144,41 @@ namespace nanojit
 
     LInsHashSet::LInsHashSet(Allocator& alloc, uint32_t kInitialCaps[]) : alloc(alloc)
     {
         for (LInsHashKind kind = LInsFirst; kind <= LInsLast; kind = nextKind(kind)) {
             m_cap[kind] = kInitialCaps[kind];
             m_list[kind] = new (alloc) LInsp[m_cap[kind]];
         }
         clear();
-        m_find[LInsImm]   = &LInsHashSet::findImm;
-        m_find[LInsImmq]  = PTR_SIZE(NULL, &LInsHashSet::findImmq);
-        m_find[LInsImmf]  = &LInsHashSet::findImmf;
-        m_find[LIns1]     = &LInsHashSet::find1;
-        m_find[LIns2]     = &LInsHashSet::find2;
-        m_find[LIns3]     = &LInsHashSet::find3;
-        m_find[LInsLoad]  = &LInsHashSet::findLoad;
-        m_find[LInsCall]  = &LInsHashSet::findCall;
+        m_find[LInsImm]          = &LInsHashSet::findImm;
+        m_find[LInsImmq]         = PTR_SIZE(NULL, &LInsHashSet::findImmq);
+        m_find[LInsImmf]         = &LInsHashSet::findImmf;
+        m_find[LIns1]            = &LInsHashSet::find1;
+        m_find[LIns2]            = &LInsHashSet::find2;
+        m_find[LIns3]            = &LInsHashSet::find3;
+        m_find[LInsCall]         = &LInsHashSet::findCall;
+        m_find[LInsLoadReadOnly] = &LInsHashSet::findLoadReadOnly;
+        m_find[LInsLoadStack]    = &LInsHashSet::findLoadStack;
+        m_find[LInsLoadRStack]   = &LInsHashSet::findLoadRStack;
+        m_find[LInsLoadOther]    = &LInsHashSet::findLoadOther;
+        m_find[LInsLoadMultiple] = &LInsHashSet::findLoadMultiple;
+    }
+
+    void LInsHashSet::clear(LInsHashKind kind) {
+        VMPI_memset(m_list[kind], 0, sizeof(LInsp)*m_cap[kind]);
+        m_used[kind] = 0;
     }
 
     void LInsHashSet::clear() {
         for (LInsHashKind kind = LInsFirst; kind <= LInsLast; kind = nextKind(kind)) {
-            VMPI_memset(m_list[kind], 0, sizeof(LInsp)*m_cap[kind]);
-            m_used[kind] = 0;
+            clear(kind);
         }
     }
 
-
     inline uint32_t LInsHashSet::hashImm(int32_t a) {
         return _hashfinish(_hash32(0,a));
     }
 
     inline uint32_t LInsHashSet::hashImmq(uint64_t a) {
         uint32_t hash = _hash32(0, uint32_t(a >> 32));
         return _hashfinish(_hash32(hash, uint32_t(a)));
     }
@@ -1189,20 +1196,25 @@ namespace nanojit
 
     inline uint32_t LInsHashSet::hash3(LOpcode op, LInsp a, LInsp b, LInsp c) {
         uint32_t hash = _hash8(0,uint8_t(op));
         hash = _hashptr(hash, a);
         hash = _hashptr(hash, b);
         return _hashfinish(_hashptr(hash, c));
     }
 
-    inline uint32_t LInsHashSet::hashLoad(LOpcode op, LInsp a, int32_t d) {
+    NanoStaticAssert(sizeof(AccSet) == 1);  // required for hashLoad to work properly
+
+    // Nb: no need to hash the load's AccSet because each region's loads go in
+    // a different hash table.
+    inline uint32_t LInsHashSet::hashLoad(LOpcode op, LInsp a, int32_t d, AccSet accSet) {
         uint32_t hash = _hash8(0,uint8_t(op));
         hash = _hashptr(hash, a);
-        return _hashfinish(_hash32(hash, d));
+        hash = _hash32(hash, d);
+        return _hashfinish(_hash8(hash, accSet));
     }
 
     inline uint32_t LInsHashSet::hashCall(const CallInfo *ci, uint32_t argc, LInsp args[]) {
         uint32_t hash = _hashptr(0, ci);
         for (int32_t j=argc-1; j >= 0; j--)
             hash = _hashptr(hash,args[j]);
         return _hashfinish(hash);
     }
@@ -1214,234 +1226,264 @@ namespace nanojit
         LInsp *oldlist = m_list[kind];
         m_list[kind] = new (alloc) LInsp[m_cap[kind]];
         VMPI_memset(m_list[kind], 0, m_cap[kind] * sizeof(LInsp));
         find_t find = m_find[kind];
         for (uint32_t i = 0; i < oldcap; i++) {
             LInsp ins = oldlist[i];
             if (!ins) continue;
             uint32_t j = (this->*find)(ins);
+            NanoAssert(!m_list[kind][j]);
             m_list[kind][j] = ins;
         }
     }
 
-    LInsp LInsHashSet::add(LInsHashKind kind, LInsp ins, uint32_t k)
+    void LInsHashSet::add(LInsHashKind kind, LInsp ins, uint32_t k)
     {
         NanoAssert(!m_list[kind][k]);
         m_used[kind]++;
         m_list[kind][k] = ins;
         if ((m_used[kind] * 4) >= (m_cap[kind] * 3)) {  // load factor of 0.75
             grow(kind);
         }
-        return ins;
     }
 
     LInsp LInsHashSet::findImm(int32_t a, uint32_t &k)
     {
         LInsHashKind kind = LInsImm;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hashImm(a) & bitmask;
+        k = hashImm(a) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->imm32() != a))
-        {
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
             NanoAssert(ins->isconst());
+            if (ins->imm32() == a)
+                return ins;
             // Quadratic probe:  h(k,i) = h(k) + 0.5i + 0.5i^2, which gives the
             // sequence h(k), h(k)+1, h(k)+3, h(k)+6, h+10, ...  This is a
             // good sequence for 2^n-sized tables as the values h(k,i) for i
             // in [0,m − 1] are all distinct so termination is guaranteed.
             // See http://portal.acm.org/citation.cfm?id=360737 and
             // http://en.wikipedia.org/wiki/Quadratic_probing (fetched
             // 06-Nov-2009) for more details.
-            hash = (hash + n) & bitmask;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::findImm(LInsp ins)
     {
         uint32_t k;
         findImm(ins->imm32(), k);
         return k;
     }
 
 #ifdef NANOJIT_64BIT
     LInsp LInsHashSet::findImmq(uint64_t a, uint32_t &k)
     {
         LInsHashKind kind = LInsImmq;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hashImmq(a) & bitmask;
+        k = hashImmq(a) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->imm64() != a))
-        {
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
             NanoAssert(ins->isconstq());
-            hash = (hash + n) & bitmask;
+            if (ins->imm64() == a)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::findImmq(LInsp ins)
     {
         uint32_t k;
         findImmq(ins->imm64(), k);
         return k;
     }
 #endif
 
     LInsp LInsHashSet::findImmf(uint64_t a, uint32_t &k)
     {
         LInsHashKind kind = LInsImmf;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hashImmq(a) & bitmask;
+        k = hashImmq(a) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->imm64() != a))
-        {
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
             NanoAssert(ins->isconstf());
-            hash = (hash + n) & bitmask;
+            if (ins->imm64() == a)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::findImmf(LInsp ins)
     {
         uint32_t k;
         findImmf(ins->imm64(), k);
         return k;
     }
 
     LInsp LInsHashSet::find1(LOpcode op, LInsp a, uint32_t &k)
     {
         LInsHashKind kind = LIns1;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hash1(op,a) & bitmask;
+        k = hash1(op, a) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->opcode() != op || ins->oprnd1() != a))
-        {
-            hash = (hash + n) & bitmask;
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
+            if (ins->isop(op) && ins->oprnd1() == a)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::find1(LInsp ins)
     {
         uint32_t k;
         find1(ins->opcode(), ins->oprnd1(), k);
         return k;
     }
 
     LInsp LInsHashSet::find2(LOpcode op, LInsp a, LInsp b, uint32_t &k)
     {
         LInsHashKind kind = LIns2;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hash2(op,a,b) & bitmask;
+        k = hash2(op, a, b) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->opcode() != op || ins->oprnd1() != a || ins->oprnd2() != b))
-        {
-            hash = (hash + n) & bitmask;
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
+            if (ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::find2(LInsp ins)
     {
         uint32_t k;
         find2(ins->opcode(), ins->oprnd1(), ins->oprnd2(), k);
         return k;
     }
 
     LInsp LInsHashSet::find3(LOpcode op, LInsp a, LInsp b, LInsp c, uint32_t &k)
     {
         LInsHashKind kind = LIns3;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hash3(op,a,b,c) & bitmask;
+        k = hash3(op, a, b, c) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->opcode() != op || ins->oprnd1() != a || ins->oprnd2() != b || ins->oprnd3() != c))
-        {
-            hash = (hash + n) & bitmask;
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
+            if (ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::find3(LInsp ins)
     {
         uint32_t k;
         find3(ins->opcode(), ins->oprnd1(), ins->oprnd2(), ins->oprnd3(), k);
         return k;
     }
 
-    LInsp LInsHashSet::findLoad(LOpcode op, LInsp a, int32_t d, uint32_t &k)
+    LInsp LInsHashSet::findLoad(LOpcode op, LInsp a, int32_t d, AccSet accSet, LInsHashKind kind,
+                                uint32_t &k)
     {
-        LInsHashKind kind = LInsLoad;
+        (void)accSet;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hashLoad(op,a,d) & bitmask;
+        k = hashLoad(op, a, d, accSet) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->opcode() != op || ins->oprnd1() != a || ins->disp() != d))
-        {
-            hash = (hash + n) & bitmask;
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
+            NanoAssert(ins->accSet() == accSet);
+            if (ins->isop(op) && ins->oprnd1() == a && ins->disp() == d)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
-    uint32_t LInsHashSet::findLoad(LInsp ins)
+    uint32_t LInsHashSet::findLoadReadOnly(LInsp ins)
+    {
+        uint32_t k;
+        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadReadOnly, k);
+        return k;
+    }
+
+    uint32_t LInsHashSet::findLoadStack(LInsp ins)
     {
         uint32_t k;
-        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), k);
+        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadStack, k);
+        return k;
+    }
+
+    uint32_t LInsHashSet::findLoadRStack(LInsp ins)
+    {
+        uint32_t k;
+        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadRStack, k);
+        return k;
+    }
+
+    uint32_t LInsHashSet::findLoadOther(LInsp ins)
+    {
+        uint32_t k;
+        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadOther, k);
+        return k;
+    }
+
+    uint32_t LInsHashSet::findLoadMultiple(LInsp ins)
+    {
+        uint32_t k;
+        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadMultiple, k);
         return k;
     }
 
     bool argsmatch(LInsp ins, uint32_t argc, LInsp args[])
     {
         for (uint32_t j=0; j < argc; j++)
             if (ins->arg(j) != args[j])
                 return false;
         return true;
     }
 
     LInsp LInsHashSet::findCall(const CallInfo *ci, uint32_t argc, LInsp args[], uint32_t &k)
     {
         LInsHashKind kind = LInsCall;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hashCall(ci, argc, args) & bitmask;
+        k = hashCall(ci, argc, args) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (!ins->isCall() || ins->callInfo() != ci || !argsmatch(ins, argc, args)))
-        {
-            hash = (hash + n) & bitmask;
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
+            if (ins->isCall() && ins->callInfo() == ci && argsmatch(ins, argc, args))
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::findCall(LInsp ins)
     {
         LInsp args[MAXARGS];
         uint32_t argc = ins->argc();
         NanoAssert(argc < MAXARGS);
         for (uint32_t j=0; j < argc; j++)
@@ -1723,17 +1765,16 @@ namespace nanojit
 
             if (e->i->isGuard() || e->i->isBranch() || e->i->isRet()) {
                 logc->printf("\n");
                 newblock = true;
             }
         }
     }
 
-
     void LirNameMap::addNameWithSuffix(LInsp ins, const char *name, int suffix,
                                        bool ignoreOneSuffix) {
         // The lookup may succeed, ie. we may already have a name for this
         // instruction.  This can happen because of CSE.  Eg. if we have this:
         //
         //   ins = addName("foo", insImm(0))
         //
         // that assigns the name "foo1" to 'ins'.  If we later do this:
@@ -1787,33 +1828,28 @@ namespace nanojit
 
     const char* LirNameMap::lookupName(LInsp ins)
     {
         Entry* e = names.get(ins);
         return e ? e->name : NULL;
     }
 
 
-    char* LInsPrinter::formatAccSet(RefBuf* buf, LInsp ins, bool isLoad) {
-        AccSet accSet = ins->accSet();
+    char* LInsPrinter::formatAccSet(RefBuf* buf, AccSet accSet) {
         int i = 0;
-        if ((isLoad && accSet == ACC_LOAD_ANY) ||
-            (!isLoad && accSet == ACC_STORE_ANY))
-        {
-            // boring, don't bother with a suffix
-        } else {
-            buf->buf[i++] = '.';
-            if (accSet & ACC_READONLY) { buf->buf[i++] = 'r'; accSet &= ~ACC_READONLY; }
-            if (accSet & ACC_STACK)    { buf->buf[i++] = 's'; accSet &= ~ACC_STACK; }
-            if (accSet & ACC_OTHER)    { buf->buf[i++] = 'o'; accSet &= ~ACC_OTHER; }
-            // This assertion will fail if we add a new accSet value but
-            // forget to handle it here.
-            NanoAssert(accSet == 0);
-        }
+        // 'c' is short for "const", because 'r' is used for RSTACK.
+        if (accSet & ACC_READONLY) { buf->buf[i++] = 'c'; accSet &= ~ACC_READONLY; }
+        if (accSet & ACC_STACK)    { buf->buf[i++] = 's'; accSet &= ~ACC_STACK; }
+        if (accSet & ACC_RSTACK)   { buf->buf[i++] = 'r'; accSet &= ~ACC_RSTACK; }
+        if (accSet & ACC_OTHER)    { buf->buf[i++] = 'o'; accSet &= ~ACC_OTHER; }
+        // This assertion will fail if we add a new accSet value but
+        // forget to handle it here.
+        NanoAssert(accSet == 0);
         buf->buf[i] = 0;
+        NanoAssert(size_t(i) < buf->len);
         return buf->buf;
     }
 
     void LInsPrinter::formatImm(RefBuf* buf, int32_t c) {
         if (-10000 < c || c < 10000) {
             VMPI_snprintf(buf->buf, buf->len, "%d", c);
         } else {
 #if !defined NANOJIT_64BIT
@@ -1914,21 +1950,22 @@ namespace nanojit
 
             case LIR_icall:
             case LIR_fcall:
             CASE64(LIR_qcall:) {
                 const CallInfo* call = i->callInfo();
                 int32_t argc = i->argc();
                 int32_t m = int32_t(n);     // Windows doesn't have 'ssize_t'
                 if (call->isIndirect())
-                    m -= VMPI_snprintf(s, m, "%s = %s [%s] ( ", formatRef(&b1, i), lirNames[op],
-                                       formatRef(&b2, i->arg(--argc)));
+                    m -= VMPI_snprintf(s, m, "%s = %s.%s [%s] ( ", formatRef(&b1, i), lirNames[op],
+                                       formatAccSet(&b2, call->_storeAccSet),
+                                       formatRef(&b3, i->arg(--argc)));
                 else
-                    m -= VMPI_snprintf(s, m, "%s = %s #%s ( ", formatRef(&b1, i), lirNames[op],
-                                       call->_name);
+                    m -= VMPI_snprintf(s, m, "%s = %s.%s #%s ( ", formatRef(&b1, i), lirNames[op],
+                                       formatAccSet(&b2, call->_storeAccSet), call->_name);
                 if (m < 0) break;
                 for (int32_t j = argc - 1; j >= 0; j--) {
                     s += VMPI_strlen(s);
                     m -= VMPI_snprintf(s, m, "%s ",formatRef(&b2, i->arg(j)));
                     if (m < 0) break;
                 }
                 s += VMPI_strlen(s);
                 m -= VMPI_snprintf(s, m, ")");
@@ -2069,175 +2106,217 @@ namespace nanojit
             case LIR_ld:
             CASE64(LIR_ldq:)
             case LIR_ldf:
             case LIR_ldzb:
             case LIR_ldzs:
             case LIR_ldsb:
             case LIR_ldss:
             case LIR_ld32f:
-                VMPI_snprintf(s, n, "%s = %s%s %s[%d]", formatRef(&b1, i), lirNames[op],
-                    formatAccSet(&b2, i, /*isLoad*/true),
+                VMPI_snprintf(s, n, "%s = %s.%s %s[%d]", formatRef(&b1, i), lirNames[op],
+                    formatAccSet(&b2, i->accSet()),
                     formatRef(&b3, i->oprnd1()),
                     i->disp());
                 break;
 
             case LIR_sti:
             CASE64(LIR_stqi:)
             case LIR_stfi:
             case LIR_stb:
             case LIR_sts:
             case LIR_st32f:
-                VMPI_snprintf(s, n, "%s%s %s[%d] = %s", lirNames[op],
-                    formatAccSet(&b1, i, /*isLoad*/false),
+                VMPI_snprintf(s, n, "%s.%s %s[%d] = %s", lirNames[op],
+                    formatAccSet(&b1, i->accSet()),
                     formatRef(&b2, i->oprnd2()),
                     i->disp(),
                     formatRef(&b3, i->oprnd1()));
                 break;
 
             default:
                 NanoAssertMsgf(0, "Can't handle opcode %s\n", lirNames[op]);
                 break;
         }
         return buf->buf;
     }
 #endif
 
 
     CseFilter::CseFilter(LirWriter *out, Allocator& alloc)
-        : LirWriter(out)
+        : LirWriter(out), storesSinceLastLoad(ACC_NONE)
     {
         uint32_t kInitialCaps[LInsLast + 1];
-        kInitialCaps[LInsImm]   = 128;
-        kInitialCaps[LInsImmq]  = PTR_SIZE(0, 16);
-        kInitialCaps[LInsImmf]  = 16;
-        kInitialCaps[LIns1]     = 256;
-        kInitialCaps[LIns2]     = 512;
-        kInitialCaps[LIns3]     = 16;
-        kInitialCaps[LInsLoad]  = 16;
-        kInitialCaps[LInsCall]  = 64;
+        kInitialCaps[LInsImm]          = 128;
+        kInitialCaps[LInsImmq]         = PTR_SIZE(0, 16);
+        kInitialCaps[LInsImmf]         = 16;
+        kInitialCaps[LIns1]            = 256;
+        kInitialCaps[LIns2]            = 512;
+        kInitialCaps[LIns3]            = 16;
+        kInitialCaps[LInsCall]         = 64;
+        kInitialCaps[LInsLoadReadOnly] = 16;
+        kInitialCaps[LInsLoadStack]    = 16;
+        kInitialCaps[LInsLoadRStack]   = 16;
+        kInitialCaps[LInsLoadOther]    = 16;
+        kInitialCaps[LInsLoadMultiple] = 16;
         exprs = new (alloc) LInsHashSet(alloc, kInitialCaps);
     }
 
     LIns* CseFilter::insImm(int32_t imm)
     {
         uint32_t k;
         LInsp ins = exprs->findImm(imm, k);
-        if (ins)
-            return ins;
-        ins = out->insImm(imm);
+        if (!ins) {
+            ins = out->insImm(imm);
+            exprs->add(LInsImm, ins, k);
+        }
         // We assume that downstream stages do not modify the instruction, so
         // that we can insert 'ins' into slot 'k'.  Check this.
-        NanoAssert(ins->opcode() == LIR_int && ins->imm32() == imm);
-        return exprs->add(LInsImm, ins, k);
+        NanoAssert(ins->isop(LIR_int) && ins->imm32() == imm);
+        return ins;
     }
 
 #ifdef NANOJIT_64BIT
     LIns* CseFilter::insImmq(uint64_t q)
     {
         uint32_t k;
         LInsp ins = exprs->findImmq(q, k);
-        if (ins)
-            return ins;
-        ins = out->insImmq(q);
-        NanoAssert(ins->opcode() == LIR_quad && ins->imm64() == q);
-        return exprs->add(LInsImmq, ins, k);
+        if (!ins) {
+            ins = out->insImmq(q);
+            exprs->add(LInsImmq, ins, k);
+        }
+        NanoAssert(ins->isop(LIR_quad) && ins->imm64() == q);
+        return ins;
     }
 #endif
 
     LIns* CseFilter::insImmf(double d)
     {
         uint32_t k;
         // We must pun 'd' as a uint64_t otherwise 0 and -0 will be treated as
         // equal, which breaks things (see bug 527288).
         union {
             double d;
             uint64_t u64;
         } u;
         u.d = d;
         LInsp ins = exprs->findImmf(u.u64, k);
-        if (ins)
-            return ins;
-        ins = out->insImmf(d);
-        NanoAssert(ins->opcode() == LIR_float && ins->imm64() == u.u64);
-        return exprs->add(LInsImmf, ins, k);
+        if (!ins) {
+            ins = out->insImmf(d);
+            exprs->add(LInsImmf, ins, k);
+        }
+        NanoAssert(ins->isop(LIR_float) && ins->imm64() == u.u64);
+        return ins;
     }
 
-    LIns* CseFilter::ins0(LOpcode v)
+    LIns* CseFilter::ins0(LOpcode op)
     {
-        if (v == LIR_label)
+        if (op == LIR_label)
             exprs->clear();
-        return out->ins0(v);
+        return out->ins0(op);
     }
 
-    LIns* CseFilter::ins1(LOpcode v, LInsp a)
+    LIns* CseFilter::ins1(LOpcode op, LInsp a)
     {
-        if (isCseOpcode(v)) {
+        LInsp ins;
+        if (isCseOpcode(op)) {
             uint32_t k;
-            LInsp ins = exprs->find1(v, a, k);
-            if (ins)
-                return ins;
-            ins = out->ins1(v, a);
-            NanoAssert(ins->opcode() == v && ins->oprnd1() == a);
-            return exprs->add(LIns1, ins, k);
+            ins = exprs->find1(op, a, k);
+            if (!ins) {
+                ins = out->ins1(op, a);
+                exprs->add(LIns1, ins, k);
+            }
+        } else {
+            ins = out->ins1(op, a);
         }
-        return out->ins1(v,a);
+        NanoAssert(ins->isop(op) && ins->oprnd1() == a);
+        return ins;
     }
 
-    LIns* CseFilter::ins2(LOpcode v, LInsp a, LInsp b)
+    LIns* CseFilter::ins2(LOpcode op, LInsp a, LInsp b)
     {
-        if (isCseOpcode(v)) {
+        LInsp ins;
+        if (isCseOpcode(op)) {
             uint32_t k;
-            LInsp ins = exprs->find2(v, a, b, k);
-            if (ins)
-                return ins;
-            ins = out->ins2(v, a, b);
-            NanoAssert(ins->opcode() == v && ins->oprnd1() == a && ins->oprnd2() == b);
-            return exprs->add(LIns2, ins, k);
+            ins = exprs->find2(op, a, b, k);
+            if (!ins) {
+                ins = out->ins2(op, a, b);
+                exprs->add(LIns2, ins, k);
+            }
+        } else {
+            ins = out->ins2(op, a, b);
         }
-        return out->ins2(v,a,b);
+        NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b);
+        return ins;
     }
 
-    LIns* CseFilter::ins3(LOpcode v, LInsp a, LInsp b, LInsp c)
+    LIns* CseFilter::ins3(LOpcode op, LInsp a, LInsp b, LInsp c)
     {
-        NanoAssert(isCseOpcode(v));
+        NanoAssert(isCseOpcode(op));
         uint32_t k;
-        LInsp ins = exprs->find3(v, a, b, c, k);
-        if (ins)
-            return ins;
-        ins = out->ins3(v, a, b, c);
-        NanoAssert(ins->opcode() == v && ins->oprnd1() == a && ins->oprnd2() == b &&
-                                                               ins->oprnd3() == c);
-        return exprs->add(LIns3, ins, k);
+        LInsp ins = exprs->find3(op, a, b, c, k);
+        if (!ins) {
+            ins = out->ins3(op, a, b, c);
+            exprs->add(LIns3, ins, k);
+        }
+        NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c);
+        return ins;
     }
 
-    LIns* CseFilter::insLoad(LOpcode v, LInsp base, int32_t disp, AccSet accSet)
+    LIns* CseFilter::insLoad(LOpcode op, LInsp base, int32_t disp, AccSet loadAccSet)
     {
+        LInsp ins;
         if (isS16(disp)) {
-            // XXX: This condition is overly strict.  Bug 517910 will make it better.
-            if (accSet == ACC_READONLY) {
-                uint32_t k;
-                LInsp ins = exprs->findLoad(v, base, disp, k);
-                if (ins)
-                    return ins;
-                ins = out->insLoad(v, base, disp, accSet);
-                NanoAssert(ins->opcode() == v && ins->oprnd1() == base && ins->disp() == disp);
-                return exprs->add(LInsLoad, ins, k);
+            // Clear all loads aliased by stores and calls since the last time
+            // we were in this function.
+            if (storesSinceLastLoad != ACC_NONE) {
+                NanoAssert(!(storesSinceLastLoad & ACC_READONLY));  // can't store to READONLY
+                if (storesSinceLastLoad & ACC_STACK)  { exprs->clear(LInsLoadStack); }
+                if (storesSinceLastLoad & ACC_RSTACK) { exprs->clear(LInsLoadRStack); }
+                if (storesSinceLastLoad & ACC_OTHER)  { exprs->clear(LInsLoadOther); }
+                // Loads marked with multiple access regions must be treated
+                // conservatively -- we always clear all of them.
+                exprs->clear(LInsLoadMultiple);
+                storesSinceLastLoad = ACC_NONE;
             }
-            return out->insLoad(v, base, disp, accSet);
+
+            LInsHashKind kind;
+            switch (loadAccSet) {
+            case ACC_READONLY:  kind = LInsLoadReadOnly;    break;
+            case ACC_STACK:     kind = LInsLoadStack;       break;
+            case ACC_RSTACK:    kind = LInsLoadRStack;      break;
+            case ACC_OTHER:     kind = LInsLoadOther;       break;
+            default:            kind = LInsLoadMultiple;    break;
+            }
+
+            uint32_t k;
+            ins = exprs->findLoad(op, base, disp, loadAccSet, kind, k);
+            if (!ins) {
+                ins = out->insLoad(op, base, disp, loadAccSet);
+                exprs->add(kind, ins, k);
+            }
+            NanoAssert(ins->isop(op) && ins->oprnd1() == base && ins->disp() == disp);
+
         } else {
             // If the displacement is more than 16 bits, put it in a separate
-            // instruction.  LirBufWriter also does this, we do it here as
-            // well because CseFilter relies on LirBufWriter not changing
-            // code.
-            return insLoad(v, ins2(LIR_addp, base, insImmWord(disp)), 0, accSet);
+            // instruction.  Nb: LirBufWriter also does this, we do it here
+            // too because CseFilter relies on LirBufWriter not changing code.
+            ins = insLoad(op, ins2(LIR_addp, base, insImmWord(disp)), 0, loadAccSet);
         }
+        return ins;
     }
 
-    LInsp CseFilter::insGuard(LOpcode v, LInsp c, GuardRecord *gr)
+    LIns* CseFilter::insStore(LOpcode op, LInsp value, LInsp base, int32_t disp, AccSet accSet)
+    {
+        storesSinceLastLoad |= accSet;
+        LIns* ins = out->insStore(op, value, base, disp, accSet);
+        NanoAssert(ins->isop(op) && ins->oprnd1() == value && ins->oprnd2() == base &&
+                   ins->disp() == disp && ins->accSet() == accSet);
+        return ins;
+    }
+
+    LInsp CseFilter::insGuard(LOpcode op, LInsp c, GuardRecord *gr)
     {
         // LIR_xt and LIR_xf guards are CSEable.  Note that we compare the
         // opcode and condition when determining if two guards are equivalent
         // -- in find1() and hash1() -- but we do *not* compare the
         // GuardRecord.  This works because:
         // - If guard 1 is taken (exits) then guard 2 is never reached, so
         //   guard 2 can be removed.
         // - If guard 1 is not taken then neither is guard 2, so guard 2 can
@@ -2246,114 +2325,68 @@ namespace nanojit
         // The underlying assumptions that are required for this to be safe:
         // - There's never a path from the side exit of guard 1 back to guard
         //   2;  for tree-shaped fragments this should be true.
         // - GuardRecords do not contain information other than what is needed
         //   to execute a successful exit.  That is currently true.
         // - The CSE algorithm will always keep guard 1 and remove guard 2
         //   (not vice versa).  The current algorithm does this.
         //
-        if (isCseOpcode(v)) {
+        LInsp ins;
+        if (isCseOpcode(op)) {
             // conditional guard
             uint32_t k;
-            LInsp ins = exprs->find1(v, c, k);
-            if (ins)
-                return 0;
-            ins = out->insGuard(v, c, gr);
-            NanoAssert(ins->opcode() == v && ins->oprnd1() == c);
-            return exprs->add(LIns1, ins, k);
+            ins = exprs->find1(op, c, k);
+            if (!ins) {
+                ins = out->insGuard(op, c, gr);
+                exprs->add(LIns1, ins, k);
+            }
+        } else {
+            ins = out->insGuard(op, c, gr);
         }
-        return out->insGuard(v, c, gr);
+        NanoAssert(ins->isop(op) && ins->oprnd1() == c);
+        return ins;
     }
 
-    LInsp CseFilter::insGuardXov(LOpcode v, LInsp a, LInsp b, GuardRecord *gr)
+    LInsp CseFilter::insGuardXov(LOpcode op, LInsp a, LInsp b, GuardRecord *gr)
     {
         // LIR_*xov are CSEable.  See CseFilter::insGuard() for details.
-        NanoAssert(isCseOpcode(v));
+        NanoAssert(isCseOpcode(op));
         // conditional guard
         uint32_t k;
-        LInsp ins = exprs->find2(v, a, b, k);
-        if (ins)
-            return ins;
-        ins = out->insGuardXov(v, a, b, gr);
-        NanoAssert(ins->opcode() == v && ins->oprnd1() == a && ins->oprnd2() == b);
-        return exprs->add(LIns2, ins, k);
+        LInsp ins = exprs->find2(op, a, b, k);
+        if (!ins) {
+            ins = out->insGuardXov(op, a, b, gr);
+            exprs->add(LIns2, ins, k);
+        }
+        NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b);
+        return ins;
     }
 
     LInsp CseFilter::insCall(const CallInfo *ci, LInsp args[])
     {
+        LInsp ins;
+        uint32_t argc = ci->count_args();
         if (ci->_isPure) {
             NanoAssert(ci->_storeAccSet == ACC_NONE);
             uint32_t k;
-            uint32_t argc = ci->count_args();
-            LInsp ins = exprs->findCall(ci, argc, args, k);
-            if (ins)
-                return ins;
+            ins = exprs->findCall(ci, argc, args, k);
+            if (!ins) {
+                ins = out->insCall(ci, args);
+                exprs->add(LInsCall, ins, k);
+            }
+        } else {
+            // We only need to worry about aliasing if !ci->_isPure.
+            storesSinceLastLoad |= ci->_storeAccSet;
             ins = out->insCall(ci, args);
-            NanoAssert(ins->isCall() && ins->callInfo() == ci && argsmatch(ins, argc, args));
-            return exprs->add(LInsCall, ins, k);
         }
-        return out->insCall(ci, args);
+        NanoAssert(ins->isCall() && ins->callInfo() == ci && argsmatch(ins, argc, args));
+        return ins;
     }
 
-    LInsp LoadFilter::insLoad(LOpcode v, LInsp base, int32_t disp, AccSet accSet)
-    {
-        if (base != sp && base != rp)
-        {
-            switch (v)
-            {
-                case LIR_ld:
-                CASE64(LIR_ldq:)
-                case LIR_ldf:
-                case LIR_ld32f:
-                case LIR_ldsb:
-                case LIR_ldss:
-                case LIR_ldzb:
-                case LIR_ldzs:
-                {
-                    uint32_t k;
-                    LInsp ins = exprs->findLoad(v, base, disp, k);
-                    if (ins)
-                        return ins;
-                    ins = out->insLoad(v, base, disp, accSet);
-                    return exprs->add(LInsLoad, ins, k);
-                }
-                default:
-                    // fall thru
-                    break;
-            }
-        }
-        return out->insLoad(v, base, disp, accSet);
-    }
-
-    void LoadFilter::clear(LInsp p)
-    {
-        if (p != sp && p != rp)
-            exprs->clear();
-    }
-
-    LInsp LoadFilter::insStore(LOpcode op, LInsp v, LInsp b, int32_t d, AccSet accSet)
-    {
-        clear(b);
-        return out->insStore(op, v, b, d, accSet);
-    }
-
-    LInsp LoadFilter::insCall(const CallInfo *ci, LInsp args[])
-    {
-        if (!ci->_isPure)
-            exprs->clear();
-        return out->insCall(ci, args);
-    }
-
-    LInsp LoadFilter::ins0(LOpcode op)
-    {
-        if (op == LIR_label)
-            exprs->clear();
-        return out->ins0(op);
-    }
 
 #if NJ_SOFTFLOAT_SUPPORTED
     static double FASTCALL i2f(int32_t i)           { return i; }
     static double FASTCALL u2f(uint32_t u)          { return u; }
     static double FASTCALL fneg(double a)           { return -a; }
     static double FASTCALL fadd(double a, double b) { return a + b; }
     static double FASTCALL fsub(double a, double b) { return a - b; }
     static double FASTCALL fmul(double a, double b) { return a * b; }
@@ -2559,37 +2592,38 @@ namespace nanojit
                 // opposed to printing a message and continuing) is that at
                 // most one type error will be detected per run.  But type
                 // errors should be rare, and assertion failures are certain
                 // to be caught by test suites whereas error messages may not
                 // be.
                 NanoAssertMsgf(0,
                     "LIR type error (%s): arg %d of '%s' is '%s' "
                     "which has type %s (expected %s)",
-                    _whereInPipeline, i+1, lirNames[op],
+                    whereInPipeline, i+1, lirNames[op],
                     lirNames[args[i]->opcode()],
                     type2string(actual), type2string(formal));
             }
         }
     }
 
     void ValidateWriter::errorStructureShouldBe(LOpcode op, const char* argDesc, int argN,
                                                 LIns* arg, const char* shouldBeDesc)
     {
         NanoAssertMsgf(0,
             "LIR structure error (%s): %s %d of '%s' is '%s' (expected %s)",
-            _whereInPipeline, argDesc, argN, 
+            whereInPipeline, argDesc, argN,
             lirNames[op], lirNames[arg->opcode()], shouldBeDesc);
     }
 
-    void ValidateWriter::errorAccSetShould(const char* what, AccSet accSet, const char* shouldDesc)
+    void ValidateWriter::errorAccSet(const char* what, AccSet accSet, const char* shouldDesc)
     {
+        RefBuf b;
         NanoAssertMsgf(0,
-            "LIR AccSet error (%s): '%s' AccSet is %d; it should %s",
-            _whereInPipeline, what, accSet, shouldDesc);
+            "LIR AccSet error (%s): '%s' AccSet is '%s'; %s",
+            whereInPipeline, what, printer->formatAccSet(&b, accSet), shouldDesc);
     }
 
     void ValidateWriter::checkLInsIsACondOrConst(LOpcode op, int argN, LIns* ins)
     {
         // We could introduce a LTy_B32 type in the type system but that's a
         // bit weird because its representation is identical to LTy_I32.  It's
         // easier to just do this check structurally.  Also, optimization can
         // cause the condition to become a LIR_int.
@@ -2604,27 +2638,70 @@ namespace nanojit
     }
 
     void ValidateWriter::checkLInsHasOpcode(LOpcode op, int argN, LIns* ins, LOpcode op2)
     {
         if (!ins->isop(op2))
             errorStructureShouldBe(op, "argument", argN, ins, lirNames[op2]);
     }
 
-    ValidateWriter::ValidateWriter(LirWriter *out, const char* stageName)
-        : LirWriter(out), _whereInPipeline(stageName)
+    void ValidateWriter::checkAccSet(LOpcode op, LInsp base, AccSet accSet, AccSet maxAccSet)
+    {
+        if (accSet == ACC_NONE)
+            errorAccSet(lirNames[op], accSet, "it should not equal ACC_NONE");
+
+        if (accSet & ~maxAccSet)
+            errorAccSet(lirNames[op], accSet,
+                "it should not contain bits that aren't in ACC_LOAD_ANY/ACC_STORE_ANY");
+
+        // Some sanity checking, which is based on the following assumptions:
+        // - STACK ones should use 'sp' or 'sp+k' as the base.  (We could look
+        //   for more complex patterns, but that feels dangerous.  Better to
+        //   keep it really simple.)
+        // - RSTACK ones should use 'rp' as the base.
+        // - READONLY/OTHER ones should not use 'sp'/'sp+k' or 'rp' as the base.
+        //
+        // Things that aren't checked:
+        // - There's no easy way to check if READONLY ones really are read-only.
+
+        bool isStack = base == sp ||
+                       (base->isop(LIR_piadd) && base->oprnd1() == sp && base->oprnd2()->isconstp());
+        bool isRStack = base == rp;
+
+        switch (accSet) {
+        case ACC_STACK:
+            if (!isStack)
+                errorAccSet(lirNames[op], accSet, "but it's not a stack access");
+            break;
+
+        case ACC_RSTACK:
+            if (!isRStack)
+                errorAccSet(lirNames[op], accSet, "but it's not an rstack access");
+            break;
+
+        case ACC_READONLY:
+        case ACC_OTHER:
+            if (isStack)
+                errorAccSet(lirNames[op], accSet, "but it's a stack access");
+            if (isRStack)
+                errorAccSet(lirNames[op], accSet, "but it's an rstack access");
+            break;
+
+        default:
+            break;
+        }
+    }
+
+    ValidateWriter::ValidateWriter(LirWriter *out, LInsPrinter* printer, const char* where)
+        : LirWriter(out), printer(printer), whereInPipeline(where), sp(0), rp(0)
     {}
 
     LIns* ValidateWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet)
     {
-        if (accSet == ACC_NONE)
-            errorAccSetShould(lirNames[op], accSet, "not equal ACC_NONE");
-
-        if (accSet & ~ACC_LOAD_ANY)
-            errorAccSetShould(lirNames[op], accSet, "not contain bits that aren't in ACC_LOAD_ANY");
+        checkAccSet(op, base, accSet, ACC_LOAD_ANY);
 
         int nArgs = 1;
         LTy formals[1] = { LTy_Ptr };
         LIns* args[1] = { base };
 
         switch (op) {
         case LIR_ld:
         case LIR_ldf:
@@ -2641,21 +2718,17 @@ namespace nanojit
 
         typeCheckArgs(op, nArgs, formals, args);
 
         return out->insLoad(op, base, d, accSet);
     }
 
     LIns* ValidateWriter::insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet)
     {
-        if (accSet == ACC_NONE)
-            errorAccSetShould(lirNames[op], accSet, "not equal ACC_NONE");
-
-        if (accSet & ~ACC_STORE_ANY)
-            errorAccSetShould(lirNames[op], accSet, "not contain bits that aren't in ACC_STORE_ANY");
+        checkAccSet(op, base, accSet, ACC_STORE_ANY);
 
         int nArgs = 2;
         LTy formals[2] = { LTy_Void, LTy_Ptr };     // LTy_Void is overwritten shortly
         LIns* args[2] = { value, base };
 
         switch (op) {
         case LIR_stb:
         case LIR_sts:
@@ -2675,17 +2748,17 @@ namespace nanojit
             break;
 
         default:
             NanoAssert(0);
         }
 
         typeCheckArgs(op, nArgs, formals, args);
 
-        return out->insStore(op, value, base, d);
+        return out->insStore(op, value, base, d, accSet);
     }
 
     LIns* ValidateWriter::ins0(LOpcode op)
     {
         switch (op) {
         case LIR_start:
         case LIR_regfence:
         case LIR_label:
@@ -2918,21 +2991,21 @@ namespace nanojit
         ArgType argTypes[MAXARGS];
         uint32_t nArgs = ci->getArgTypes(argTypes);
         LTy formals[MAXARGS];
         LIns* args[MAXARGS];    // in left-to-right order, unlike args0[]
 
         LOpcode op = getCallOpcode(ci);
 
         if (ci->_isPure && ci->_storeAccSet != ACC_NONE)
-            errorAccSetShould(ci->_name, ci->_storeAccSet, "equal ACC_NONE for pure functions");
+            errorAccSet(ci->_name, ci->_storeAccSet, "it should be ACC_NONE for pure functions");
 
         if (ci->_storeAccSet & ~ACC_STORE_ANY)
-            errorAccSetShould(lirNames[op], ci->_storeAccSet,
-                "not contain bits that aren't in ACC_STORE_ANY");
+            errorAccSet(lirNames[op], ci->_storeAccSet,
+                "it should not contain bits that aren't in ACC_STORE_ANY");
 
         // This loop iterates over the args from right-to-left (because arg()
         // and getArgTypes() use right-to-left order), but puts the results
         // into formals[] and args[] in left-to-right order so that arg
         // numbers in error messages make sense to the user.
         for (uint32_t i = 0; i < nArgs; i++) {
             uint32_t i2 = nArgs - i - 1;    // converts right-to-left to left-to-right
             switch (argTypes[i]) {
--- a/js/src/nanojit/LIR.h
+++ b/js/src/nanojit/LIR.h
@@ -205,17 +205,20 @@ namespace nanojit
     // then they do not alias.
     //
     // The access regions used are as follows:
     //
     // - READONLY: all memory that is read-only, ie. never stored to.
     //   A load from a READONLY region will never alias with any stores.
     //
     // - STACK: the stack.  Stack loads/stores can usually be easily
-    //   identified because they use SP as the stack pointer.
+    //   identified because they use SP as the base pointer.
+    //
+    // - RSTACK: the return stack.  Return stack loads/stores can usually be
+    //   easily identified because they use RP as the base pointer.
     //
     // - OTHER: all other regions of memory.
     //
     // It makes sense to add new access regions when doing so will help with
     // one or more optimisations.
     //
     // One subtlety is that the meanings of the access region markings only
     // apply to the LIR fragment that they are in.  For example, if a memory
@@ -254,45 +257,54 @@ namespace nanojit
     //
     //   In other words, a load/store can be marked with an access region set
     //   that is a superset of its actual access region set.  Taking this to
     //   its logical conclusion, any load can be safely marked with LOAD_ANY and
     //   any store can be safely marked with with STORE_ANY (and the latter is
     //   true for the store set of a function.)
     //
     // Such imprecision is safe but may reduce optimisation opportunities.
+    //
+    // Optimisations that use access region info
+    // -----------------------------------------
+    // Currently only CseFilter uses this, and only for determining whether
+    // loads can be CSE'd.  Note that CseFilter treats loads that are marked
+    // with a single access region precisely, but all loads marked with
+    // multiple access regions get lumped together.  So if you can't mark a
+    // load with a single access region, you might as well use ACC_LOAD_ANY.
     //-----------------------------------------------------------------------
 
     // An access region set is represented as a bitset.  Nb: this restricts us
     // to at most eight alias regions for the moment.
     typedef uint8_t AccSet;
 
     // The access regions.  Note that because of the bitset representation
     // these constants are also valid (singleton) AccSet values.  If you add
-    // new ones please update ACC_ALL_WRITABLE and LirNameMap::formatAccSet().
+    // new ones please update ACC_ALL_STORABLE and formatAccSet() and
+    // CseFilter.
     //
     static const AccSet ACC_READONLY = 1 << 0;      // 0000_0001b
     static const AccSet ACC_STACK    = 1 << 1;      // 0000_0010b
-    static const AccSet ACC_OTHER    = 1 << 2;      // 0000_0100b
+    static const AccSet ACC_RSTACK   = 1 << 2;      // 0000_0100b
+    static const AccSet ACC_OTHER    = 1 << 3;      // 0000_1000b
 
     // Some common (non-singleton) access region sets.  ACC_NONE does not make
     // sense for loads or stores (which must access at least one region), it
     // only makes sense for calls.
     //
     // A convention that's worth using:  use ACC_LOAD_ANY/ACC_STORE_ANY for
     // cases that you're unsure about or haven't considered carefully.  Use
-    // ACC_ALL/ACC_ALL_WRITABLE for cases that you have considered carefully.
+    // ACC_ALL/ACC_ALL_STORABLE for cases that you have considered carefully.
     // That way it's easy to tell which ones have been considered and which
     // haven't.
     static const AccSet ACC_NONE         = 0x0;
-    static const AccSet ACC_ALL_WRITABLE = ACC_STACK | ACC_OTHER;
-    static const AccSet ACC_ALL          = ACC_READONLY | ACC_ALL_WRITABLE;
+    static const AccSet ACC_ALL_STORABLE = ACC_STACK | ACC_RSTACK | ACC_OTHER;
+    static const AccSet ACC_ALL          = ACC_READONLY | ACC_ALL_STORABLE;
     static const AccSet ACC_LOAD_ANY     = ACC_ALL;            // synonym
-    static const AccSet ACC_STORE_ANY    = ACC_ALL_WRITABLE;   // synonym
-
+    static const AccSet ACC_STORE_ANY    = ACC_ALL_STORABLE;   // synonym
 
     struct CallInfo
     {
     private:
 
     public:
         uintptr_t   _address;
         uint32_t    _typesig:27;     // 9 3-bit fields indicating arg type, by ARGTYPE above (including ret type): a1 a2 a3 a4 a5 ret
@@ -1483,27 +1495,16 @@ namespace nanojit
             return ins1(LIR_u2q, uintIns);
     #else
             return uintIns;
     #endif
         }
 
         // Chooses LIR_sti or LIR_stqi based on size of value.
         LIns* insStorei(LIns* value, LIns* base, int32_t d, AccSet accSet);
-
-        // Insert a load/store with the most pessimistic region access info, which is always safe.
-        LIns* insLoad(LOpcode op, LIns* base, int32_t d) {
-            return insLoad(op, base, d, ACC_LOAD_ANY);
-        }
-        LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d) {
-            return insStore(op, value, base, d, ACC_STORE_ANY);
-        }
-        LIns* insStorei(LIns* value, LIns* base, int32_t d) {
-            return insStorei(value, base, d, ACC_STORE_ANY);
-        }
     };
 
 
 #ifdef NJ_VERBOSE
     extern const char* lirNames[];
 
     // Maps address ranges to meaningful names.
     class AddrNameMap
@@ -1593,29 +1594,29 @@ namespace nanojit
     {
     private:
         Allocator& alloc;
 
         void formatImm(RefBuf* buf, int32_t c);
         void formatImmq(RefBuf* buf, uint64_t c);
         void formatGuard(InsBuf* buf, LInsp ins);
         void formatGuardXov(InsBuf* buf, LInsp ins);
-        char* formatAccSet(RefBuf* buf, LInsp ins, bool isLoad);
 
     public:
         LInsPrinter(Allocator& alloc)
             : alloc(alloc)
         {
             addrNameMap = new (alloc) AddrNameMap(alloc);
             lirNameMap = new (alloc) LirNameMap(alloc);
         }
 
         char *formatAddr(RefBuf* buf, void* p);
         char *formatRef(RefBuf* buf, LInsp ref);
         char *formatIns(InsBuf* buf, LInsp ins);
+        char *formatAccSet(RefBuf* buf, AccSet accSet);
 
         AddrNameMap* addrNameMap;
         LirNameMap* lirNameMap;
     };
 
 
     class VerboseWriter : public LirWriter
     {
@@ -1734,114 +1735,135 @@ namespace nanojit
         LIns* insBranch(LOpcode, LIns *cond, LIns *target);
         LIns* insLoad(LOpcode op, LInsp base, int32_t off, AccSet accSet);
     };
 
     enum LInsHashKind {
         // We divide instruction kinds into groups for the use of LInsHashSet.
         // LIns0 isn't present because we don't need to record any 0-ary
         // instructions.
-        LInsImm   = 0,
-        LInsImmq  = 1,  // only occurs on 64-bit platforms
-        LInsImmf  = 2,
-        LIns1     = 3,
-        LIns2     = 4,
-        LIns3     = 5,
-        LInsLoad  = 6,
-        LInsCall  = 7,
+        LInsImm  = 0,
+        LInsImmq = 1,   // only occurs on 64-bit platforms
+        LInsImmf = 2,
+        LIns1    = 3,
+        LIns2    = 4,
+        LIns3    = 5,
+        LInsCall = 6,
+
+        // Loads are special.  We group them by access region:  one table for
+        // each region, and then a catch-all table for any loads marked with
+        // multiple regions.  This arrangement makes the removal of
+        // invalidated loads fast -- eg. we can invalidate all STACK loads by
+        // just clearing the LInsLoadStack table.  The disadvantage is that
+        // loads marked with multiple regions must be invalidated
+        // conservatively, eg. if any intervening stores occur.  But loads
+        // marked with multiple regions should be rare.
+        LInsLoadReadOnly = 7,
+        LInsLoadStack    = 8,
+        LInsLoadRStack   = 9,
+        LInsLoadOther    = 10,
+        LInsLoadMultiple = 11,
 
         LInsFirst = 0,
-        LInsLast = 7,
+        LInsLast = 11,
         // need a value after "last" to outsmart compilers that will insist last+1 is impossible
-        LInsInvalid = 8
+        LInsInvalid = 12
     };
     #define nextKind(kind)  LInsHashKind(kind+1)
 
-    // @todo, this could be replaced by a generic HashMap or HashSet, if we had one
     class LInsHashSet
     {
         // Must be a power of 2.
         // Don't start too small, or we'll waste time growing and rehashing.
         // Don't start too large, will waste memory.
         static const uint32_t kInitialCap[LInsLast + 1];
 
         // There is one list for each instruction kind.  This lets us size the
         // lists appropriately (some instructions are more common than others).
         // It also lets us have kind-specific find/add/grow functions, which
         // are faster than generic versions.
         LInsp *m_list[LInsLast + 1];
         uint32_t m_cap[LInsLast + 1];
         uint32_t m_used[LInsLast + 1];
         typedef uint32_t (LInsHashSet::*find_t)(LInsp);
         find_t m_find[LInsLast + 1];
+
         Allocator& alloc;
 
         static uint32_t hashImm(int32_t);
-        static uint32_t hashImmq(uint64_t);     // not NANOJIT_64BIT only used by findImmf()
-        static uint32_t hash1(LOpcode v, LInsp);
-        static uint32_t hash2(LOpcode v, LInsp, LInsp);
-        static uint32_t hash3(LOpcode v, LInsp, LInsp, LInsp);
-        static uint32_t hashLoad(LOpcode v, LInsp, int32_t);
+        static uint32_t hashImmq(uint64_t);     // not NANOJIT_64BIT-only -- used by findImmf()
+        static uint32_t hash1(LOpcode op, LInsp);
+        static uint32_t hash2(LOpcode op, LInsp, LInsp);
+        static uint32_t hash3(LOpcode op, LInsp, LInsp, LInsp);
+        static uint32_t hashLoad(LOpcode op, LInsp, int32_t, AccSet);
         static uint32_t hashCall(const CallInfo *call, uint32_t argc, LInsp args[]);
 
         // These private versions are used after an LIns has been created;
         // they are used for rehashing after growing.
         uint32_t findImm(LInsp ins);
 #ifdef NANOJIT_64BIT
         uint32_t findImmq(LInsp ins);
 #endif
         uint32_t findImmf(LInsp ins);
         uint32_t find1(LInsp ins);
         uint32_t find2(LInsp ins);
         uint32_t find3(LInsp ins);
-        uint32_t findLoad(LInsp ins);
         uint32_t findCall(LInsp ins);
+        uint32_t findLoadReadOnly(LInsp ins);
+        uint32_t findLoadStack(LInsp ins);
+        uint32_t findLoadRStack(LInsp ins);
+        uint32_t findLoadOther(LInsp ins);
+        uint32_t findLoadMultiple(LInsp ins);
 
         void grow(LInsHashKind kind);
 
     public:
         // kInitialCaps[i] holds the initial size for m_list[i].
         LInsHashSet(Allocator&, uint32_t kInitialCaps[]);
 
         // These public versions are used before an LIns has been created.
         LInsp findImm(int32_t a, uint32_t &k);
 #ifdef NANOJIT_64BIT
         LInsp findImmq(uint64_t a, uint32_t &k);
 #endif
         LInsp findImmf(uint64_t d, uint32_t &k);
         LInsp find1(LOpcode v, LInsp a, uint32_t &k);
         LInsp find2(LOpcode v, LInsp a, LInsp b, uint32_t &k);
         LInsp find3(LOpcode v, LInsp a, LInsp b, LInsp c, uint32_t &k);
-        LInsp findLoad(LOpcode v, LInsp a, int32_t b, uint32_t &k);
+        LInsp findLoad(LOpcode v, LInsp a, int32_t b, AccSet accSet, LInsHashKind kind,
+                       uint32_t &k);
         LInsp findCall(const CallInfo *call, uint32_t argc, LInsp args[], uint32_t &k);
 
         // 'k' is the index found by findXYZ().
-        LInsp add(LInsHashKind kind, LInsp ins, uint32_t k);
+        void add(LInsHashKind kind, LInsp ins, uint32_t k);
 
-        void clear();
+        void clear();               // clears all tables
+        void clear(LInsHashKind);   // clears one table
     };
 
     class CseFilter: public LirWriter
     {
     private:
         LInsHashSet* exprs;
+        AccSet       storesSinceLastLoad;   // regions stored to since the last load
 
     public:
         CseFilter(LirWriter *out, Allocator&);
 
         LIns* insImm(int32_t imm);
 #ifdef NANOJIT_64BIT
         LIns* insImmq(uint64_t q);
 #endif
         LIns* insImmf(double d);
         LIns* ins0(LOpcode v);
         LIns* ins1(LOpcode v, LInsp);
         LIns* ins2(LOpcode v, LInsp, LInsp);
         LIns* ins3(LOpcode v, LInsp, LInsp, LInsp);
-        LIns* insLoad(LOpcode op, LInsp cond, int32_t d, AccSet accSet);
+        LIns* insLoad(LOpcode op, LInsp base, int32_t d, AccSet accSet);
+        LIns* insStore(LOpcode op, LInsp value, LInsp base, int32_t d, AccSet accSet);
         LIns* insCall(const CallInfo *call, LInsp args[]);
         LIns* insGuard(LOpcode op, LInsp cond, GuardRecord *gr);
         LIns* insGuardXov(LOpcode op, LInsp a, LInsp b, GuardRecord *gr);
     };
 
     class LirBuffer
     {
         public:
@@ -1970,47 +1992,16 @@ namespace nanojit
         int top;
         int getTop(LInsp br);
 
     public:
         StackFilter(LirFilter *in, Allocator& alloc, LInsp sp);
         LInsp read();
     };
 
-    // eliminate redundant loads by watching for stores & mutator calls
-    class LoadFilter: public LirWriter
-    {
-    public:
-        LInsp sp, rp;
-        LInsHashSet* exprs;
-
-        void clear(LInsp p);
-
-    public:
-        LoadFilter(LirWriter *out, Allocator& alloc)
-            : LirWriter(out), sp(NULL), rp(NULL)
-        {
-            uint32_t kInitialCaps[LInsLast + 1];
-            kInitialCaps[LInsImm]   = 1;
-            kInitialCaps[LInsImmq]  = 1;
-            kInitialCaps[LInsImmf]  = 1;
-            kInitialCaps[LIns1]     = 1;
-            kInitialCaps[LIns2]     = 1;
-            kInitialCaps[LIns3]     = 1;
-            kInitialCaps[LInsLoad]  = 64;
-            kInitialCaps[LInsCall]  = 1;
-            exprs = new (alloc) LInsHashSet(alloc, kInitialCaps);
-        }
-
-        LInsp ins0(LOpcode);
-        LInsp insLoad(LOpcode op, LInsp base, int32_t disp, AccSet accSet);
-        LInsp insStore(LOpcode op, LInsp value, LInsp base, int32_t disp, AccSet accSet);
-        LInsp insCall(const CallInfo *call, LInsp args[]);
-    };
-
     struct SoftFloatOps
     {
         const CallInfo* opmap[LIR_sentinel];
         SoftFloatOps();
     };
 
     extern const SoftFloatOps softFloatOps;
 
@@ -2044,29 +2035,36 @@ namespace nanojit
     // writer pipeline, exactly as it is generated by the compiler front-end.
     //
     // A general note about the errors produced by this class:  for
     // TraceMonkey, they won't include special names for instructions that
     // have them unless TMFLAGS is specified.
     class ValidateWriter : public LirWriter
     {
     private:
-        const char* _whereInPipeline;
+        LInsPrinter* printer;
+        const char* whereInPipeline;
 
         const char* type2string(LTy type);
         void typeCheckArgs(LOpcode op, int nArgs, LTy formals[], LIns* args[]);
         void errorStructureShouldBe(LOpcode op, const char* argDesc, int argN, LIns* arg,
                                     const char* shouldBeDesc);
-        void errorAccSetShould(const char* what, AccSet accSet, const char* shouldDesc);
+        void errorAccSet(const char* what, AccSet accSet, const char* shouldDesc);
         void checkLInsHasOpcode(LOpcode op, int argN, LIns* ins, LOpcode op2);
         void checkLInsIsACondOrConst(LOpcode op, int argN, LIns* ins);
         void checkLInsIsNull(LOpcode op, int argN, LIns* ins);
+        void checkAccSet(LOpcode op, LInsp base, AccSet accSet, AccSet maxAccSet);
+
+        LInsp sp, rp;
 
     public:
-        ValidateWriter(LirWriter* out, const char* stageName);
+        ValidateWriter(LirWriter* out, LInsPrinter* printer, const char* where);
+        void setSp(LInsp ins) { sp = ins; }
+        void setRp(LInsp ins) { rp = ins; }
+
         LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet);
         LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet);
         LIns* ins0(LOpcode v);
         LIns* ins1(LOpcode v, LIns* a);
         LIns* ins2(LOpcode v, LIns* a, LIns* b);
         LIns* ins3(LOpcode v, LIns* a, LIns* b, LIns* c);
         LIns* insParam(int32_t arg, int32_t kind);
         LIns* insImm(int32_t imm);