Bug 979594 - ARM Ion and assembler support. r=sstangl r=dtc-moz
author: Lars T Hansen <lhansen@mozilla.com>
Thu, 23 Oct 2014 14:23:27 +0200
changeset 211920 983259897284c61f208733ac520ac3f9ba646f09
parent 211919 ab936277cf4ba207714f13d14ddbbfa9996c86fd
child 211921 6b733d690a38570b1af99d4996da87ace77de4e4
push id: 27693
push user: ryanvm@gmail.com
push date: Thu, 23 Oct 2014 18:06:22 +0000
treeherder: mozilla-central@d8de0d7e52e0 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: sstangl, dtc-moz
bugs: 979594
milestone: 36.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 979594 - ARM Ion and assembler support. r=sstangl r=dtc-moz
js/src/jit/arm/CodeGenerator-arm.cpp
js/src/jit/arm/CodeGenerator-arm.h
js/src/jit/arm/Lowering-arm.cpp
js/src/jit/arm/Lowering-arm.h
js/src/jit/arm/MacroAssembler-arm.cpp
js/src/jit/arm/MacroAssembler-arm.h
--- a/js/src/jit/arm/CodeGenerator-arm.cpp
+++ b/js/src/jit/arm/CodeGenerator-arm.cpp
@@ -2242,8 +2242,29 @@ CodeGeneratorARM::visitForkJoinGetSlice(
     MOZ_CRASH("NYI");
 }
 
 JitCode *
 JitRuntime::generateForkJoinGetSliceStub(JSContext *cx)
 {
     MOZ_CRASH("NYI");
 }
+
+void
+CodeGeneratorARM::memoryBarrier(int barrier)
+{
+    // On ARMv6 the optional argument (BarrierST, etc) is ignored.
+    if (barrier == (MembarStoreStore|MembarSynchronizing))
+        masm.ma_dsb(masm.BarrierST);    // synchronizing, store-store only
+    else if (barrier & MembarSynchronizing)
+        masm.ma_dsb();                  // synchronizing, any other combination
+    else if (barrier == MembarStoreStore)
+        masm.ma_dmb(masm.BarrierST);    // store-store only
+    else if (barrier)
+        masm.ma_dmb();                  // any other non-empty request; 0 emits nothing
+}
+
+bool
+CodeGeneratorARM::visitMemoryBarrier(LMemoryBarrier *ins)
+{
+    memoryBarrier(ins->type());  // ins->type() is passed through as the barrier request
+    return true;                 // unconditionally reports success
+}
--- a/js/src/jit/arm/CodeGenerator-arm.h
+++ b/js/src/jit/arm/CodeGenerator-arm.h
@@ -170,16 +170,18 @@ class CodeGeneratorARM : public CodeGene
     // Functions for LTestVAndBranch.
     Register splitTagForTest(const ValueOperand &value);
 
     bool divICommon(MDiv *mir, Register lhs, Register rhs, Register output, LSnapshot *snapshot,
                     Label &done);
     bool modICommon(MMod *mir, Register lhs, Register rhs, Register output, LSnapshot *snapshot,
                     Label &done);
 
+    void memoryBarrier(int barrier);
+
   public:
     CodeGeneratorARM(MIRGenerator *gen, LIRGraph *graph, MacroAssembler *masm);
 
   public:
     bool visitBox(LBox *box);
     bool visitBoxFloatingPoint(LBoxFloatingPoint *box);
     bool visitUnbox(LUnbox *unbox);
     bool visitValue(LValue *value);
@@ -201,16 +203,18 @@ class CodeGeneratorARM : public CodeGene
     bool visitAsmJSLoadGlobalVar(LAsmJSLoadGlobalVar *ins);
     bool visitAsmJSStoreGlobalVar(LAsmJSStoreGlobalVar *ins);
     bool visitAsmJSLoadFuncPtr(LAsmJSLoadFuncPtr *ins);
     bool visitAsmJSLoadFFIFunc(LAsmJSLoadFFIFunc *ins);
     bool visitAsmJSPassStackArg(LAsmJSPassStackArg *ins);
 
     bool visitForkJoinGetSlice(LForkJoinGetSlice *ins);
 
+    bool visitMemoryBarrier(LMemoryBarrier *ins);
+
     bool generateInvalidateEpilogue();
 
   protected:
     bool visitEffectiveAddress(LEffectiveAddress *ins);
     bool visitUDiv(LUDiv *ins);
     bool visitUMod(LUMod *ins);
     bool visitSoftUDivOrMod(LSoftUDivOrMod *ins);
 
--- a/js/src/jit/arm/Lowering-arm.cpp
+++ b/js/src/jit/arm/Lowering-arm.cpp
@@ -565,9 +565,74 @@ LIRGeneratorARM::visitSimdSplatX4(MSimdS
 }
 
 bool
 LIRGeneratorARM::visitSimdValueX4(MSimdValueX4 *ins)
 {
     MOZ_CRASH("NYI");
 }
 
-//__aeabi_uidiv
+bool
+LIRGeneratorARM::visitAtomicTypedArrayElementBinop(MAtomicTypedArrayElementBinop *ins)
+{
+    // Clamped-byte and floating-point arrays are not expected here.
+    MOZ_ASSERT(ins->arrayType() != Scalar::Uint8Clamped);
+    MOZ_ASSERT(ins->arrayType() != Scalar::Float32);
+    MOZ_ASSERT(ins->arrayType() != Scalar::Float64);
+    MOZ_ASSERT(ins->elements()->type() == MIRType_Elements);
+    MOZ_ASSERT(ins->index()->type() == MIRType_Int32);
+
+    const LUse elemsUse = useRegister(ins->elements());
+    const LAllocation indexAlloc = useRegisterOrConstant(ins->index());
+
+    // Temps: ARM normally has enough scratch registers, so both temp
+    // slots default to bogus.  The second slot is never used on ARM.
+    // The first slot becomes a real temp only when a Uint32Array
+    // operation is known to produce a double, in which case it holds
+    // the intermediate integer output.
+    //
+    // Optimization opportunity (bug 1077317): 'value' could stay an
+    // imm32 when it is small enough to fit in an instruction; for
+    // now it is always forced into a register.
+    LDefinition resultTemp = LDefinition::BogusTemp();
+    LDefinition unusedTemp = LDefinition::BogusTemp();
+
+    const LAllocation valueAlloc = useRegister(ins->value());
+    const bool doubleFromUint32 =
+        ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type());
+    if (doubleFromUint32)
+        resultTemp = temp();
+
+    LAtomicTypedArrayElementBinop *binop = new(alloc())
+        LAtomicTypedArrayElementBinop(elemsUse, indexAlloc, valueAlloc, resultTemp, unusedTemp);
+    return define(binop, ins);
+}
+
+bool
+LIRGeneratorARM::visitCompareExchangeTypedArrayElement(MCompareExchangeTypedArrayElement *ins)
+{
+    // Floating-point arrays are not expected here.
+    MOZ_ASSERT(ins->arrayType() != Scalar::Float32);
+    MOZ_ASSERT(ins->arrayType() != Scalar::Float64);
+    MOZ_ASSERT(ins->elements()->type() == MIRType_Elements);
+    MOZ_ASSERT(ins->index()->type() == MIRType_Int32);
+
+    const LUse elemsUse = useRegister(ins->elements());
+    const LAllocation indexAlloc = useRegisterOrConstant(ins->index());
+
+    // Both oldval and newval are forced into registers.  Optimization
+    // opportunity (bug 1077317): oldval could remain an immediate if
+    // it is small enough to fit in an instruction.
+    const LAllocation newAlloc = useRegister(ins->newval());
+    const LAllocation oldAlloc = useRegister(ins->oldval());
+
+    // A Uint32Array result that is delivered as a floating-point
+    // value needs a temp at the CodeGenerator level for creating it.
+    LDefinition resultTemp = LDefinition::BogusTemp();
+    const bool floatFromUint32 =
+        ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type());
+    if (floatFromUint32)
+        resultTemp = temp();
+
+    LCompareExchangeTypedArrayElement *cmpxchg = new(alloc())
+        LCompareExchangeTypedArrayElement(elemsUse, indexAlloc, oldAlloc, newAlloc, resultTemp);
+    return define(cmpxchg, ins);
+}
--- a/js/src/jit/arm/Lowering-arm.h
+++ b/js/src/jit/arm/Lowering-arm.h
@@ -101,16 +101,18 @@ class LIRGeneratorARM : public LIRGenera
     bool visitAsmJSLoadHeap(MAsmJSLoadHeap *ins);
     bool visitAsmJSStoreHeap(MAsmJSStoreHeap *ins);
     bool visitAsmJSLoadFuncPtr(MAsmJSLoadFuncPtr *ins);
     bool visitStoreTypedArrayElementStatic(MStoreTypedArrayElementStatic *ins);
     bool visitForkJoinGetSlice(MForkJoinGetSlice *ins);
     bool visitSimdTernaryBitwise(MSimdTernaryBitwise *ins);
     bool visitSimdSplatX4(MSimdSplatX4 *ins);
     bool visitSimdValueX4(MSimdValueX4 *ins);
+    bool visitCompareExchangeTypedArrayElement(MCompareExchangeTypedArrayElement *ins);
+    bool visitAtomicTypedArrayElementBinop(MAtomicTypedArrayElementBinop *ins);
 };
 
 typedef LIRGeneratorARM LIRGeneratorSpecific;
 
 } // namespace jit
 } // namespace js
 
 #endif /* jit_arm_Lowering_arm_h */
--- a/js/src/jit/arm/MacroAssembler-arm.cpp
+++ b/js/src/jit/arm/MacroAssembler-arm.cpp
@@ -4685,9 +4685,285 @@ MacroAssemblerARMCompat::branchValueIsNu
     Label done;
 
     branchTestObject(Assembler::NotEqual, value, cond == Assembler::Equal ? &done : label);
     branchPtrInNurseryRange(cond, value.payloadReg(), temp, label);
 
     bind(&done);
 }
 
+namespace js {
+namespace jit {
+
+template<>
+Register
+MacroAssemblerARMCompat::computePointer<BaseIndex>(const BaseIndex &src, Register r)
+{
+    // Materialize base + (index << shift) [+ offset] in r and hand r
+    // back; the shift amount is derived from src.scale.
+    const uint32_t shift = Imm32::ShiftOf(src.scale).value;
+    as_add(r, src.base, lsl(src.index, shift));
+    const int32_t disp = src.offset;
+    if (disp != 0)
+        ma_add(r, Imm32(disp), r);
+    return r;
+}
+
+template<>
+Register
+MacroAssemblerARMCompat::computePointer<Address>(const Address &src, Register r)
+{
+    // With no displacement the base register already is the pointer and r is left untouched.
+    if (src.offset != 0)
+        ma_add(src.base, Imm32(src.offset), r);
+    return src.offset != 0 ? r : src.base;
+}
+
+} // namespace jit
+} // namespace js
+
+template<typename T>
+void
+MacroAssemblerARMCompat::compareExchange(int nbytes, bool signExtend, const T &mem,
+                                         Register oldval, Register newval, Register output)
+{
+    // Word-sized accesses, and narrower ones when LDREXB/H and
+    // STREXB/H are available, take the ARMv7 path.  Otherwise the
+    // narrow access has to be built from word-width operations with
+    // read-modify-write, which does not abstract well, so fork.
+    // Bug 1077321: We may further optimize for ARMv8 here.
+    if (nbytes >= 4 || HasLDSTREXBHD())
+        compareExchangeARMv7(nbytes, signExtend, mem, oldval, newval, output);
+    else
+        compareExchangeARMv6(nbytes, signExtend, mem, oldval, newval, output);
+}
+
+// General algorithm:
+//
+//     ...    ptr, <addr>         ; compute address of item
+//     dmb
+// L0  ldrex* output, [ptr]
+//     sxt*   output, output, 0   ; sign-extend if applicable
+//     *xt*   tmp, oldval, 0      ; sign-extend or zero-extend if applicable
+//     cmp    output, tmp
+//     bne    L1                  ; failed - values are different
+//     strex* tmp, newval, [ptr]
+//     cmp    tmp, 1
+//     beq    L0                  ; failed - location is dirty, retry
+// L1  dmb
+//
+// Discussion here:  http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html.
+// However note that that discussion uses 'isb' as the trailing fence.
+// I've not quite figured out why, and I've gone with dmb here which
+// is safe.  Also see the LLVM source, which uses 'dmb ish' generally.
+// (Apple's Swift CPU apparently handles ish in a non-default, faster
+// way.)
+
+template<typename T>
+void
+MacroAssemblerARMCompat::compareExchangeARMv7(int nbytes, bool signExtend, const T &mem,
+                                              Register oldval, Register newval, Register output)
+{
+    Label Lagain;                   // retry point: the exclusive store did not succeed
+    Label Ldone;                    // exit point: values differed, or the store succeeded
+    Register ptr = computePointer(mem, secondScratchReg_);  // address first, then the fence
+    ma_dmb();  // full barrier; BarrierST alone would leave earlier loads unordered
+    bind(&Lagain);
+    switch (nbytes) {
+      case 1:
+        as_ldrexb(output, ptr);
+        if (signExtend) {
+            as_sxtb(output, output, 0);
+            as_sxtb(ScratchRegister, oldval, 0);    // extend oldval like the loaded value
+        } else {
+            as_uxtb(ScratchRegister, oldval, 0);    // compare against the low byte of oldval
+        }
+        break;
+      case 2:
+        as_ldrexh(output, ptr);
+        if (signExtend) {
+            as_sxth(output, output, 0);
+            as_sxth(ScratchRegister, oldval, 0);    // extend oldval like the loaded value
+        } else {
+            as_uxth(ScratchRegister, oldval, 0);    // compare against the low halfword of oldval
+        }
+        break;
+      case 4:
+        MOZ_ASSERT(!signExtend);
+        as_ldrex(output, ptr);
+        break;
+    }
+    if (nbytes < 4)
+        as_cmp(output, O2Reg(ScratchRegister));     // sub-word: compare with the extended copy
+    else
+        as_cmp(output, O2Reg(oldval));              // word: oldval is compared as-is
+    as_b(&Ldone, NotEqual);                         // mismatch: leave memory unchanged
+    switch (nbytes) {
+      case 1:
+        as_strexb(ScratchRegister, newval, ptr);    // ScratchRegister receives the store status
+        break;
+      case 2:
+        as_strexh(ScratchRegister, newval, ptr);
+        break;
+      case 4:
+        as_strex(ScratchRegister, newval, ptr);
+        break;
+    }
+    as_cmp(ScratchRegister, Imm8(1));               // status 1: the exclusive store failed
+    as_b(&Lagain, Equal);                           // so reload and try again
+    bind(&Ldone);
+    ma_dmb();                                       // trailing barrier, on both exit paths
+}
+
+template<typename T>
+void
+MacroAssemblerARMCompat::compareExchangeARMv6(int nbytes, bool signExtend, const T &mem,
+                                              Register oldval, Register newval, Register output)
+{
+    // Bug 1077318: Must use read-modify-write with LDREX / STREX.
+    MOZ_ASSERT(nbytes == 1 || nbytes == 2);  // compareExchange only forks here for sub-word sizes
+    MOZ_CRASH("NYI");                        // not implemented yet: crashes if reached
+}
+
+template void
+js::jit::MacroAssemblerARMCompat::compareExchange(int nbytes, bool signExtend,
+                                                  const Address &address, Register oldval,
+                                                  Register newval, Register output);
+template void
+js::jit::MacroAssemblerARMCompat::compareExchange(int nbytes, bool signExtend,
+                                                  const BaseIndex &address, Register oldval,
+                                                  Register newval, Register output);
+
+template<typename T>
+void
+MacroAssemblerARMCompat::atomicFetchOp(int nbytes, bool signExtend, AtomicOp op, const Imm32 &value,
+                                       const T &mem, Register temp, Register output)
+{
+    // The Imm32 value case is not needed yet because lowering always
+    // forces the value into a register at present (bug 1077317).  But
+    // the method must be present for the platform-independent code to
+    // link.
+    MOZ_CRASH("Feature NYI");  // placeholder body: crashes if this overload is ever reached
+}
+
+// General algorithm:
+//
+//     ...    ptr, <addr>         ; compute address of item
+//     dmb
+// L0  ldrex* output, [ptr]
+//     sxt*   output, output, 0   ; sign-extend if applicable
+//     OP     tmp, output, value  ; compute value to store
+//     strex* tmp, tmp, [ptr]
+//     cmp    tmp, 1
+//     beq    L0                  ; failed - location is dirty, retry
+//     dmb                        ; ordering barrier required
+//
+// Also see notes above at compareExchange re the barrier strategy.
+//
+// Observe that the value being operated into the memory element need
+// not be sign-extended because no OP will make use of bits to the
+// left of the bits indicated by the width of the element, and neither
+// output nor the bits stored are affected by OP.
+
+template<typename T>
+void
+MacroAssemblerARMCompat::atomicFetchOp(int nbytes, bool signExtend, AtomicOp op,
+                                       const Register &value, const T &mem, Register temp,
+                                       Register output)
+{
+    // Sub-word operations without LDREXB/H and STREXB/H go to the
+    // ARMv6 variant, the only one that is handed 'temp'.
+    // Bug 1077321: We may further optimize for ARMv8 here.
+    if (nbytes < 4 && !HasLDSTREXBHD()) {
+        atomicFetchOpARMv6(nbytes, signExtend, op, value, mem, temp, output);
+        return;
+    }
+    MOZ_ASSERT(temp == InvalidReg);
+    atomicFetchOpARMv7(nbytes, signExtend, op, value, mem, output);
+}
+
+template<typename T>
+void
+MacroAssemblerARMCompat::atomicFetchOpARMv7(int nbytes, bool signExtend, AtomicOp op,
+                                            const Register &value, const T &mem, Register output)
+{
+    Label Lagain;                                           // retry point for a failed store
+    Register ptr = computePointer(mem, secondScratchReg_);  // register holding the address
+    ma_dmb();                                               // leading barrier
+    bind(&Lagain);
+    switch (nbytes) {                                       // exclusive load of the old value
+      case 1:
+        as_ldrexb(output, ptr);
+        if (signExtend)
+            as_sxtb(output, output, 0);
+        break;
+      case 2:
+        as_ldrexh(output, ptr);
+        if (signExtend)
+            as_sxth(output, output, 0);
+        break;
+      case 4:
+        MOZ_ASSERT(!signExtend);
+        as_ldrex(output, ptr);
+        break;
+    }
+    switch (op) {                                // ScratchRegister = output OP value
+      case AtomicFetchAddOp:
+        as_add(ScratchRegister, output, O2Reg(value));
+        break;
+      case AtomicFetchSubOp:
+        as_sub(ScratchRegister, output, O2Reg(value));
+        break;
+      case AtomicFetchAndOp:
+        as_and(ScratchRegister, output, O2Reg(value));
+        break;
+      case AtomicFetchOrOp:
+        as_orr(ScratchRegister, output, O2Reg(value));
+        break;
+      case AtomicFetchXorOp:
+        as_eor(ScratchRegister, output, O2Reg(value));
+        break;
+    }
+    switch (nbytes) {  // NOTE(review): status and stored value share ScratchRegister below;
+      case 1:          // the ARM ARM lists STREX* with Rd == Rt as UNPREDICTABLE - confirm,
+        as_strexb(ScratchRegister, ScratchRegister, ptr);  // then use a distinct status reg.
+        break;
+      case 2:
+        as_strexh(ScratchRegister, ScratchRegister, ptr);
+        break;
+      case 4:
+        as_strex(ScratchRegister, ScratchRegister, ptr);
+        break;
+    }
+    as_cmp(ScratchRegister, Imm8(1));            // status 1: the exclusive store failed
+    as_b(&Lagain, Equal);                        // so reload and try again
+    ma_dmb();                                    // trailing barrier; output keeps the old value
+}
+
+template<typename T>
+void
+MacroAssemblerARMCompat::atomicFetchOpARMv6(int nbytes, bool signExtend, AtomicOp op,
+                                            const Register &value, const T &mem, Register temp,
+                                            Register output)
+{
+    // Bug 1077318: Must use read-modify-write with LDREX / STREX.
+    MOZ_ASSERT(nbytes == 1 || nbytes == 2);  // atomicFetchOp only forks here for sub-word sizes
+    MOZ_CRASH("NYI");                        // not implemented yet: crashes if reached
+}
+
+template void
+js::jit::MacroAssemblerARMCompat::atomicFetchOp(int nbytes, bool signExtend, AtomicOp op,
+                                                const Imm32 &value, const Address &mem,
+                                                Register temp, Register output);
+template void
+js::jit::MacroAssemblerARMCompat::atomicFetchOp(int nbytes, bool signExtend, AtomicOp op,
+                                                const Imm32 &value, const BaseIndex &mem,
+                                                Register temp, Register output);
+template void
+js::jit::MacroAssemblerARMCompat::atomicFetchOp(int nbytes, bool signExtend, AtomicOp op,
+                                                const Register &value, const Address &mem,
+                                                Register temp, Register output);
+template void
+js::jit::MacroAssemblerARMCompat::atomicFetchOp(int nbytes, bool signExtend, AtomicOp op,
+                                                const Register &value, const BaseIndex &mem,
+                                                Register temp, Register output);
+
 #endif
--- a/js/src/jit/arm/MacroAssembler-arm.h
+++ b/js/src/jit/arm/MacroAssembler-arm.h
@@ -7,16 +7,17 @@
 #ifndef jit_arm_MacroAssembler_arm_h
 #define jit_arm_MacroAssembler_arm_h
 
 #include "mozilla/DebugOnly.h"
 
 #include "jsopcode.h"
 
 #include "jit/arm/Assembler-arm.h"
+#include "jit/AtomicOp.h"
 #include "jit/IonCaches.h"
 #include "jit/IonFrames.h"
 #include "jit/MoveResolver.h"
 
 using mozilla::DebugOnly;
 
 namespace js {
 namespace jit {
@@ -1415,16 +1416,182 @@ class MacroAssemblerARMCompat : public M
     }
     void storeFloat32(FloatRegister src, BaseIndex addr) {
         // Harder cases not handled yet.
         MOZ_ASSERT(addr.offset == 0);
         uint32_t scale = Imm32::ShiftOf(addr.scale).value;
         ma_vstr(VFPRegister(src).singleOverlay(), addr.base, addr.index, scale);
     }
 
+  private:
+    template<typename T>
+    Register computePointer(const T &src, Register r);  // result register holds the address of src
+
+    template<typename T>
+    void compareExchangeARMv6(int nbytes, bool signExtend, const T &mem, Register oldval,
+                              Register newval, Register output);  // sub-word fallback, currently NYI
+
+    template<typename T>
+    void compareExchangeARMv7(int nbytes, bool signExtend, const T &mem, Register oldval,
+                              Register newval, Register output);  // LDREX*/STREX* retry loop
+
+    template<typename T>
+    void compareExchange(int nbytes, bool signExtend, const T &address, Register oldval,
+                         Register newval, Register output);  // picks the ARMv6 or ARMv7 variant
+
+    template<typename T>
+    void atomicFetchOpARMv6(int nbytes, bool signExtend, AtomicOp op, const Register &value,
+                            const T &mem, Register temp, Register output);  // sub-word fallback, NYI
+
+    template<typename T>
+    void atomicFetchOpARMv7(int nbytes, bool signExtend, AtomicOp op, const Register &value,
+                            const T &mem, Register output);  // LDREX*/STREX* retry loop
+
+    template<typename T>
+    void atomicFetchOp(int nbytes, bool signExtend, AtomicOp op, const Imm32 &value,
+                       const T &address, Register temp, Register output);  // Imm32 form, NYI
+
+    template<typename T>
+    void atomicFetchOp(int nbytes, bool signExtend, AtomicOp op, const Register &value,
+                       const T &address, Register temp, Register output);  // picks ARMv6 or ARMv7
+
+  public:
+    // T (the memory operand type) is one of {Address,BaseIndex}.
+    // S (the value operand type) is one of {Imm32,Register}.
+
+    template<typename T>
+    void compareExchange8SignExtend(const T &mem, Register oldval, Register newval, Register output)
+    {
+        compareExchange(1, true, mem, oldval, newval, output);   // nbytes = 1, signExtend = true
+    }
+    template<typename T>
+    void compareExchange8ZeroExtend(const T &mem, Register oldval, Register newval, Register output)
+    {
+        compareExchange(1, false, mem, oldval, newval, output);  // nbytes = 1, signExtend = false
+    }
+    template<typename T>
+    void compareExchange16SignExtend(const T &mem, Register oldval, Register newval, Register output)
+    {
+        compareExchange(2, true, mem, oldval, newval, output);   // nbytes = 2, signExtend = true
+    }
+    template<typename T>
+    void compareExchange16ZeroExtend(const T &mem, Register oldval, Register newval, Register output)
+    {
+        compareExchange(2, false, mem, oldval, newval, output);  // nbytes = 2, signExtend = false
+    }
+    template<typename T>
+    void compareExchange32(const T &mem, Register oldval, Register newval, Register output)  {
+        compareExchange(4, false, mem, oldval, newval, output);  // nbytes = 4, signExtend = false
+    }
+
+    template<typename T, typename S>
+    void atomicFetchAdd8SignExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(1, true, AtomicFetchAddOp, value, mem, temp, output);   // 1 byte, signed
+    }
+    template<typename T, typename S>
+    void atomicFetchAdd8ZeroExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(1, false, AtomicFetchAddOp, value, mem, temp, output);  // 1 byte, unsigned
+    }
+    template<typename T, typename S>
+    void atomicFetchAdd16SignExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(2, true, AtomicFetchAddOp, value, mem, temp, output);   // 2 bytes, signed
+    }
+    template<typename T, typename S>
+    void atomicFetchAdd16ZeroExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(2, false, AtomicFetchAddOp, value, mem, temp, output);  // 2 bytes, unsigned
+    }
+    template<typename T, typename S>
+    void atomicFetchAdd32(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(4, false, AtomicFetchAddOp, value, mem, temp, output);  // 4 bytes
+    }
+
+    template<typename T, typename S>
+    void atomicFetchSub8SignExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(1, true, AtomicFetchSubOp, value, mem, temp, output);   // 1 byte, signed
+    }
+    template<typename T, typename S>
+    void atomicFetchSub8ZeroExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(1, false, AtomicFetchSubOp, value, mem, temp, output);  // 1 byte, unsigned
+    }
+    template<typename T, typename S>
+    void atomicFetchSub16SignExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(2, true, AtomicFetchSubOp, value, mem, temp, output);   // 2 bytes, signed
+    }
+    template<typename T, typename S>
+    void atomicFetchSub16ZeroExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(2, false, AtomicFetchSubOp, value, mem, temp, output);  // 2 bytes, unsigned
+    }
+    template<typename T, typename S>
+    void atomicFetchSub32(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(4, false, AtomicFetchSubOp, value, mem, temp, output);  // 4 bytes
+    }
+
+    template<typename T, typename S>
+    void atomicFetchAnd8SignExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(1, true, AtomicFetchAndOp, value, mem, temp, output);   // 1 byte, signed
+    }
+    template<typename T, typename S>
+    void atomicFetchAnd8ZeroExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(1, false, AtomicFetchAndOp, value, mem, temp, output);  // 1 byte, unsigned
+    }
+    template<typename T, typename S>
+    void atomicFetchAnd16SignExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(2, true, AtomicFetchAndOp, value, mem, temp, output);   // 2 bytes, signed
+    }
+    template<typename T, typename S>
+    void atomicFetchAnd16ZeroExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(2, false, AtomicFetchAndOp, value, mem, temp, output);  // 2 bytes, unsigned
+    }
+    template<typename T, typename S>
+    void atomicFetchAnd32(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(4, false, AtomicFetchAndOp, value, mem, temp, output);  // 4 bytes
+    }
+
+    template<typename T, typename S>
+    void atomicFetchOr8SignExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(1, true, AtomicFetchOrOp, value, mem, temp, output);   // 1 byte, signed
+    }
+    template<typename T, typename S>
+    void atomicFetchOr8ZeroExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(1, false, AtomicFetchOrOp, value, mem, temp, output);  // 1 byte, unsigned
+    }
+    template<typename T, typename S>
+    void atomicFetchOr16SignExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(2, true, AtomicFetchOrOp, value, mem, temp, output);   // 2 bytes, signed
+    }
+    template<typename T, typename S>
+    void atomicFetchOr16ZeroExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(2, false, AtomicFetchOrOp, value, mem, temp, output);  // 2 bytes, unsigned
+    }
+    template<typename T, typename S>
+    void atomicFetchOr32(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(4, false, AtomicFetchOrOp, value, mem, temp, output);  // 4 bytes
+    }
+
+    template<typename T, typename S>
+    void atomicFetchXor8SignExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(1, true, AtomicFetchXorOp, value, mem, temp, output);   // 1 byte, signed
+    }
+    template<typename T, typename S>
+    void atomicFetchXor8ZeroExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(1, false, AtomicFetchXorOp, value, mem, temp, output);  // 1 byte, unsigned
+    }
+    template<typename T, typename S>
+    void atomicFetchXor16SignExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(2, true, AtomicFetchXorOp, value, mem, temp, output);   // 2 bytes, signed
+    }
+    template<typename T, typename S>
+    void atomicFetchXor16ZeroExtend(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(2, false, AtomicFetchXorOp, value, mem, temp, output);  // 2 bytes, unsigned
+    }
+    template<typename T, typename S>
+    void atomicFetchXor32(const S &value, const T &mem, Register temp, Register output) {
+        atomicFetchOp(4, false, AtomicFetchXorOp, value, mem, temp, output);  // 4 bytes
+    }
+
     void clampIntToUint8(Register reg) {
         // Look at (reg >> 8) if it is 0, then reg shouldn't be clamped if it is
         // <0, then we want to clamp to 0, otherwise, we wish to clamp to 255
         as_mov(ScratchRegister, asr(reg, 8), SetCond);
         ma_mov(Imm32(0xff), reg, NoSetCond, NotEqual);
         ma_mov(Imm32(0), reg, NoSetCond, Signed);
     }