Bug 1141121 - Immediate operands to atomics, x86 and x64. r=h4writer
authorLars T Hansen <lhansen@mozilla.com>
Wed, 25 Mar 2015 10:51:12 +0100
changeset 265802 7cf3406ac1cd9977cb0da4c96f00c1db69230c04
parent 265801 0c7cb48aed8616cdd47bbab66cf649e01353de1b
child 265803 1550b7322e45cc67ca53af2281608bede1cf3f2c
push id: 830
push user: raliiev@mozilla.com
push date: Fri, 19 Jun 2015 19:24:37 +0000
treeherder: mozilla-release@932614382a68 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: h4writer
bugs: 1141121
milestone: 39.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1141121 - Immediate operands to atomics, x86 and x64. r=h4writer
js/src/jit-test/tests/atomics/optimization-tests.js
js/src/jit/LIR-Common.h
js/src/jit/shared/Lowering-x86-shared.cpp
js/src/jit/x64/Lowering-x64.cpp
js/src/jit/x86/Lowering-x86.cpp
--- a/js/src/jit-test/tests/atomics/optimization-tests.js
+++ b/js/src/jit-test/tests/atomics/optimization-tests.js
@@ -33,16 +33,28 @@ function f2(ia, k) {
     // code on x86/x64 should be one LOCK SUBB.
     Atomics.sub(ia, 2, k);
 
     // Ditto constant value.  Here the LOCK SUBB should have an
     // immediate operand.
     Atomics.sub(ia, 2, 1);
 }
 
+function f4(ia, k) {
+    // For effect, variable value.  The generated code on x86/x64
+    // should be one LOCK ORB.  (On ARM, there should be no
+    // sign-extend of the current value in the cell, otherwise this is
+    // still a LDREX/STREX loop.)
+    Atomics.or(ia, 6, k);
+
+    // Ditto constant value.  Here the LOCK ORB should have an
+    // immediate operand.
+    Atomics.or(ia, 6, 1);
+}
+
 function g(ia, k) {
     // For its value, variable value.  The generated code on x86/x64
     // should be one LOCK XADDB.
     sum += Atomics.add(ia, 1, k);
 
     // Ditto constant value.  XADD does not admit an immediate
     // operand, so in the second case there should be a preliminary
     // MOV of the immediate to the output register.
@@ -56,16 +68,26 @@ function g2(ia, k) {
     sum += Atomics.sub(ia, 3, k);
 
     // Ditto constant value.  XADD does not admit an immediate
     // operand, so in the second case there should be a preliminary
     // MOV of the negated immediate to the output register.
     sum += Atomics.sub(ia, 3, 1);
 }
 
+function g4(ia, k) {
+    // For its value, variable value.  The generated code on x86/x64
+    // should be a loop around ORB ; CMPXCHGB
+    sum += Atomics.or(ia, 7, k);
+
+    // Ditto constant value.  Here the ORB in the loop should have
+    // an immediate operand.
+    sum += Atomics.or(ia, 7, 1);
+}
+
 function mod(stdlib, ffi, heap) {
     "use asm";
 
     var i8a = new stdlib.SharedInt8Array(heap);
     var add = stdlib.Atomics.add;
     var sum = 0;
 
     function f3(k) {
@@ -87,16 +109,20 @@ var i8a = new SharedInt8Array(65536);
 var { f3, g3 } = mod(this, {}, i8a.buffer);
 for ( var i=0 ; i < 10000 ; i++ ) {
     f(i8a, i % 10);
     g(i8a, i % 10);
     f2(i8a, i % 10);
     g2(i8a, i % 10);
     f3(i % 10);
     g3(i % 10);
+    f4(i8a, i % 10);
+    g4(i8a, i % 10);
 }
 
 assertEq(i8a[0], ((10000 + 10000*4.5) << 24) >> 24);
 assertEq(i8a[1], ((10000 + 10000*4.5) << 24) >> 24);
 assertEq(i8a[2], ((-10000 + -10000*4.5) << 24) >> 24);
 assertEq(i8a[3], ((-10000 + -10000*4.5) << 24) >> 24);
 assertEq(i8a[4], ((10000 + 10000*4.5) << 24) >> 24);
 assertEq(i8a[5], ((10000 + 10000*4.5) << 24) >> 24);
+assertEq(i8a[6], 15);
+assertEq(i8a[7], 15);
--- a/js/src/jit/LIR-Common.h
+++ b/js/src/jit/LIR-Common.h
@@ -5054,16 +5054,18 @@ class LCompareExchangeTypedArrayElement 
     }
 };
 
 class LAtomicTypedArrayElementBinop : public LInstructionHelper<1, 3, 2>
 {
   public:
     LIR_HEADER(AtomicTypedArrayElementBinop)
 
+    static const int32_t valueOp = 2;
+
     LAtomicTypedArrayElementBinop(const LAllocation &elements, const LAllocation &index,
                                   const LAllocation &value, const LDefinition &temp1,
                                   const LDefinition &temp2)
     {
         setOperand(0, elements);
         setOperand(1, index);
         setOperand(2, value);
         setTemp(0, temp1);
@@ -5072,16 +5074,17 @@ class LAtomicTypedArrayElementBinop : pu
 
     const LAllocation *elements() {
         return getOperand(0);
     }
     const LAllocation *index() {
         return getOperand(1);
     }
     const LAllocation *value() {
+        MOZ_ASSERT(valueOp == 2);
         return getOperand(2);
     }
     const LDefinition *temp1() {
         return getTemp(0);
     }
     const LDefinition *temp2() {
         return getTemp(1);
     }
@@ -6486,28 +6489,32 @@ class LAsmJSCompareExchangeHeap : public
         return mir_->toAsmJSCompareExchangeHeap();
     }
 };
 
 class LAsmJSAtomicBinopHeap : public LInstructionHelper<1, 2, 2>
 {
   public:
     LIR_HEADER(AsmJSAtomicBinopHeap);
+
+    static const int32_t valueOp = 1;
+
     LAsmJSAtomicBinopHeap(const LAllocation &ptr, const LAllocation &value,
                           const LDefinition &temp)
     {
         setOperand(0, ptr);
         setOperand(1, value);
         setTemp(0, temp);
         setTemp(1, LDefinition::BogusTemp());
     }
     const LAllocation *ptr() {
         return getOperand(0);
     }
     const LAllocation *value() {
+        MOZ_ASSERT(valueOp == 1);
         return getOperand(1);
     }
     const LDefinition *temp() {
         return getTemp(0);
     }
     const LDefinition *addrTemp() {
         return getTemp(1);
     }
--- a/js/src/jit/shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/shared/Lowering-x86-shared.cpp
@@ -429,43 +429,39 @@ LIRGeneratorX86Shared::lowerAtomicTypedA
 
     const LUse elements = useRegister(ins->elements());
     const LAllocation index = useRegisterOrConstant(ins->index());
 
     // Case 1: the result of the operation is not used.
     //
     // We'll emit a single instruction: LOCK ADD, LOCK SUB, LOCK AND,
     // LOCK OR, or LOCK XOR.  We can do this even for the Uint32 case.
-    //
-    // If the operand is 8-bit we shall need to use an 8-bit register
-    // for it on x86 systems.
 
     if (!ins->hasUses()) {
         LAllocation value;
-        if (useI386ByteRegisters && ins->isByteArray())
+        if (useI386ByteRegisters && ins->isByteArray() && !ins->value()->isConstant())
             value = useFixed(ins->value(), ebx);
         else
-            value = useRegister(ins->value());
+            value = useRegisterOrConstant(ins->value());
 
         LAtomicTypedArrayElementBinopForEffect *lir =
             new(alloc()) LAtomicTypedArrayElementBinopForEffect(elements, index, value);
 
         add(lir, ins);
         return;
     }
 
     // Case 2: the result of the operation is used.
     //
     // For ADD and SUB we'll use XADD:
     //
     //    movl       src, output
     //    lock xaddl output, mem
     //
-    // For the 8-bit variants XADD needs a byte register for the
-    // output only.
+    // For the 8-bit variants XADD needs a byte register for the output.
     //
     // For AND/OR/XOR we need to use a CMPXCHG loop:
     //
     //    movl          *mem, eax
     // L: mov           eax, temp
     //    andl          src, temp
     //    lock cmpxchg  temp, mem  ; reads eax also
     //    jnz           L
@@ -479,55 +475,61 @@ LIRGeneratorX86Shared::lowerAtomicTypedA
     //  - eax should be the output (one result of the cmpxchg)
     //  - there is a temp, which must have a byte register if
     //    the array has 1-byte elements elements
     //
     // If the array is a uint32 array then:
     //  - eax is the first temp
     //  - we also need a second temp
     //
-    // For simplicity we force the 'value' into a byte register if the
-    // array has 1-byte elements, though that could be worked around.
-    //
-    // For simplicity we also choose fixed byte registers even when
-    // any available byte register would have been OK.
-    //
     // There are optimization opportunities:
-    //  - better register allocation and instruction selection, Bug #1077036.
+    //  - better register allocation in the x86 8-bit case, Bug #1077036.
 
     bool bitOp = !(ins->operation() == AtomicFetchAddOp || ins->operation() == AtomicFetchSubOp);
     bool fixedOutput = true;
+    bool reuseInput = false;
     LDefinition tempDef1 = LDefinition::BogusTemp();
     LDefinition tempDef2 = LDefinition::BogusTemp();
     LAllocation value;
 
     if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
-        value = useRegister(ins->value());
+        value = useRegisterOrConstant(ins->value());
         fixedOutput = false;
         if (bitOp) {
             tempDef1 = tempFixed(eax);
             tempDef2 = temp();
         } else {
             tempDef1 = temp();
         }
     } else if (useI386ByteRegisters && ins->isByteArray()) {
-        value = useFixed(ins->value(), ebx);
+        if (ins->value()->isConstant())
+            value = useRegisterOrConstant(ins->value());
+        else
+            value = useFixed(ins->value(), ebx);
         if (bitOp)
             tempDef1 = tempFixed(ecx);
+    } else if (bitOp) {
+        value = useRegisterOrConstant(ins->value());
+        tempDef1 = temp();
+    } else if (ins->value()->isConstant()) {
+        fixedOutput = false;
+        value = useRegisterOrConstant(ins->value());
     } else {
-        value = useRegister(ins->value());
-        if (bitOp)
-            tempDef1 = temp();
+        fixedOutput = false;
+        reuseInput = true;
+        value = useRegisterAtStart(ins->value());
     }
 
     LAtomicTypedArrayElementBinop *lir =
         new(alloc()) LAtomicTypedArrayElementBinop(elements, index, value, tempDef1, tempDef2);
 
     if (fixedOutput)
         defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
+    else if (reuseInput)
+        defineReuseInput(lir, ins, LAtomicTypedArrayElementBinop::valueOp);
     else
         define(lir, ins);
 }
 
 void
 LIRGeneratorX86Shared::visitSimdBinaryArith(MSimdBinaryArith *ins)
 {
     MOZ_ASSERT(IsSimdType(ins->lhs()->type()));
--- a/js/src/jit/x64/Lowering-x64.cpp
+++ b/js/src/jit/x64/Lowering-x64.cpp
@@ -228,52 +228,66 @@ LIRGeneratorX64::visitAsmJSAtomicBinopHe
     // Case 1: the result of the operation is not used.
     //
     // We'll emit a single instruction: LOCK ADD, LOCK SUB, LOCK AND,
     // LOCK OR, or LOCK XOR.
 
     if (!ins->hasUses()) {
         LAsmJSAtomicBinopHeapForEffect *lir =
             new(alloc()) LAsmJSAtomicBinopHeapForEffect(useRegister(ptr),
-                                                        useRegister(ins->value()));
+                                                        useRegisterOrConstant(ins->value()));
         add(lir, ins);
         return;
     }
 
     // Case 2: the result of the operation is used.
     //
-    // For ADD and SUB we'll use XADD (with word and byte ops as appropriate):
+    // For ADD and SUB we'll use XADD with word and byte ops as
+    // appropriate.  Any output register can be used and if value is a
+    // register it's best if it's the same as output:
     //
-    //    movl       value, output
+    //    movl       value, output  ; if value != output
     //    lock xaddl output, mem
     //
-    // For AND/OR/XOR we need to use a CMPXCHG loop:
+    // For AND/OR/XOR we need to use a CMPXCHG loop, and the output is
+    // always in rax:
     //
-    //    movl          *mem, eax
-    // L: mov           eax, temp
+    //    movl          *mem, rax
+    // L: mov           rax, temp
     //    andl          value, temp
-    //    lock cmpxchg  temp, mem  ; reads eax also
+    //    lock cmpxchg  temp, mem  ; reads rax also
     //    jnz           L
-    //    ; result in eax
+    //    ; result in rax
     //
-    // Note the placement of L, cmpxchg will update eax with *mem if
+    // Note the placement of L, cmpxchg will update rax with *mem if
     // *mem does not have the expected value, so reloading it at the
     // top of the loop would be redundant.
-    //
-    // We want to fix eax as the output.  We also need a temp for
-    // the intermediate value.
 
     bool bitOp = !(ins->operation() == AtomicFetchAddOp || ins->operation() == AtomicFetchSubOp);
-    LAllocation value = useRegister(ins->value());
-    LDefinition tempDef = bitOp ? temp() : LDefinition::BogusTemp();
+    bool reuseInput = false;
+    LAllocation value;
+
+    if (bitOp || ins->value()->isConstant()) {
+        value = useRegisterOrConstant(ins->value());
+    } else {
+        reuseInput = true;
+        value = useRegisterAtStart(ins->value());
+    }
 
     LAsmJSAtomicBinopHeap *lir =
-        new(alloc()) LAsmJSAtomicBinopHeap(useRegister(ptr), value, tempDef);
+        new(alloc()) LAsmJSAtomicBinopHeap(useRegister(ptr),
+                                           value,
+                                           bitOp ? temp() : LDefinition::BogusTemp());
 
-    defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
+    if (reuseInput)
+        defineReuseInput(lir, ins, LAsmJSAtomicBinopHeap::valueOp);
+    else if (bitOp)
+        defineFixed(lir, ins, LAllocation(AnyRegister(rax)));
+    else
+        define(lir, ins);
 }
 
 void
 LIRGeneratorX64::visitAsmJSLoadFuncPtr(MAsmJSLoadFuncPtr *ins)
 {
     define(new(alloc()) LAsmJSLoadFuncPtr(useRegister(ins->index()), temp()), ins);
 }
 
--- a/js/src/jit/x86/Lowering-x86.cpp
+++ b/js/src/jit/x86/Lowering-x86.cpp
@@ -312,23 +312,24 @@ LIRGeneratorX86::visitAsmJSAtomicBinopHe
     MDefinition *ptr = ins->ptr();
     MOZ_ASSERT(ptr->type() == MIRType_Int32);
 
     bool byteArray = byteSize(ins->accessType()) == 1;
 
     // Case 1: the result of the operation is not used.
     //
     // We'll emit a single instruction: LOCK ADD, LOCK SUB, LOCK AND,
-    // LOCK OR, or LOCK XOR.
-    //
-    // For the 8-bit variant the ops need a byte register for the
-    // value; just pin the value to ebx.
+    // LOCK OR, or LOCK XOR.  These can all take an immediate.
 
     if (!ins->hasUses()) {
-        LAllocation value = byteArray ? useFixed(ins->value(), ebx) : useRegister(ins->value());
+        LAllocation value;
+        if (byteArray && !ins->value()->isConstant())
+            value = useFixed(ins->value(), ebx);
+        else
+            value = useRegisterOrConstant(ins->value());
         LAsmJSAtomicBinopHeapForEffect *lir =
             new(alloc()) LAsmJSAtomicBinopHeapForEffect(useRegister(ptr), value);
         lir->setAddrTemp(temp());
         add(lir, ins);
         return;
     }
 
     // Case 2: the result of the operation is used.
@@ -356,41 +357,44 @@ LIRGeneratorX86::visitAsmJSAtomicBinopHe
     // top of the loop would be redundant.
     //
     // We want to fix eax as the output.  We also need a temp for
     // the intermediate value.
     //
     // For the 8-bit variants the temp must have a byte register.
     //
     // There are optimization opportunities:
-    //  - better register allocation and instruction selection, Bug #1077036.
+    //  - better 8-bit register allocation and instruction selection, Bug #1077036.
 
     bool bitOp = !(ins->operation() == AtomicFetchAddOp || ins->operation() == AtomicFetchSubOp);
     LDefinition tempDef = LDefinition::BogusTemp();
     LAllocation value;
 
-    // Optimization opportunity: "value" need not be pinned to something that
-    // has a byte register unless the back-end insists on using a byte move
-    // for the setup or the payload computation, which really it need not do.
-
     if (byteArray) {
         value = useFixed(ins->value(), ebx);
         if (bitOp)
             tempDef = tempFixed(ecx);
-    } else {
-        value = useRegister(ins->value());
+    } else if (bitOp || ins->value()->isConstant()) {
+        value = useRegisterOrConstant(ins->value());
         if (bitOp)
             tempDef = temp();
+    } else {
+        value = useRegisterAtStart(ins->value());
     }
 
     LAsmJSAtomicBinopHeap *lir =
         new(alloc()) LAsmJSAtomicBinopHeap(useRegister(ptr), value, tempDef);
 
     lir->setAddrTemp(temp());
-    defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
+    if (byteArray || bitOp)
+        defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
+    else if (ins->value()->isConstant())
+        define(lir, ins);
+    else
+        defineReuseInput(lir, ins, LAsmJSAtomicBinopHeap::valueOp);
 }
 
 void
 LIRGeneratorX86::visitAsmJSLoadFuncPtr(MAsmJSLoadFuncPtr *ins)
 {
     define(new(alloc()) LAsmJSLoadFuncPtr(useRegisterAtStart(ins->index())), ins);
 }