Bug 669132: Optimize VFP memory accesses on ARM. [r=cdleary,Marty]
author Jacob Bramley <Jacob.Bramley@arm.com>
Tue, 09 Aug 2011 16:58:11 +0100
changeset 74088 ac20c581f617db48a067591c3ef457399b2a99d8
parent 74087 aee7dcfde22319196ad1cab7e0ffe6a40b221756
child 74089 e1aae4baeaa43a69cd1e1ad330d652aeb4505689
push id 1082
push user Jacob.Bramley@arm.com
push date Tue, 09 Aug 2011 16:01:55 +0000
treeherder mozilla-inbound@77c924963f36
reviewers cdleary, Marty
bugs 669132
milestone 8.0a1
js/src/assembler/assembler/ARMAssembler.cpp
js/src/assembler/assembler/ARMAssembler.h
js/src/assembler/assembler/MacroAssemblerARM.h
--- a/js/src/assembler/assembler/ARMAssembler.cpp
+++ b/js/src/assembler/assembler/ARMAssembler.cpp
@@ -270,73 +270,73 @@ ARMWord ARMAssembler::encodeComplexImm(A
 }
 
 // Memory load/store helpers
 // TODO: this does not take advantage of all of ARMv7's instruction encodings, it should.
 void ARMAssembler::dataTransferN(bool isLoad, bool isSigned, int size, RegisterID rt, RegisterID base, int32_t offset)
 {
     bool posOffset = true;
 
-    // there may be more elegant ways of handling this, but this one works.
+    // There may be more elegant ways of handling this, but this one works.
     if (offset == 0x80000000) {
-        // for even bigger offsets, load the entire offset into a register, then do an
-        // indexed load using the base register and the index register
+        // For even bigger offsets, load the entire offset into a register, then do an
+        // indexed load using the base register and the index register.
         moveImm(offset, ARMRegisters::S0);
         mem_reg_off(isLoad, isSigned, size, posOffset, rt, base, ARMRegisters::S0);
         return;
     }
     if (offset < 0) {
         offset = - offset;
         posOffset = false;
     }
     if (offset <= 0xfff) {
         // LDR rd, [rb, #+offset]
         mem_imm_off(isLoad, isSigned, size, posOffset, rt, base, offset);
     } else if (offset <= 0xfffff) {
-        // add upper bits of offset to the base, and store the result into the temp registe
+        // Add upper bits of offset to the base, and store the result into the temp register.
         if (posOffset) {
-            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
+            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
         } else {
-            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
+            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
         }
-        // load using the lower bits of the offset
+        // Load using the lower bits of the offset.
         mem_imm_off(isLoad, isSigned, size, posOffset, rt,
                     ARMRegisters::S0, (offset & 0xfff));
     } else {
-        // for even bigger offsets, load the entire offset into a register, then do an
-        // indexed load using the base register and the index register
+        // For even bigger offsets, load the entire offset into a register, then do an
+        // indexed load using the base register and the index register.
         moveImm(offset, ARMRegisters::S0);
         mem_reg_off(isLoad, isSigned, size, posOffset, rt, base, ARMRegisters::S0);
     }
 }
 
 void ARMAssembler::dataTransfer32(bool isLoad, RegisterID srcDst, RegisterID base, int32_t offset)
 {
     if (offset >= 0) {
         if (offset <= 0xfff)
             // LDR rd, [rb, +offset]
             dtr_u(isLoad, srcDst, base, offset);
         else if (offset <= 0xfffff) {
-            // add upper bits of offset to the base, and store the result into the temp registe
-            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
-            // load using the lower bits of the register
+            // Add upper bits of offset to the base, and store the result into the temp register.
+            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
+            // Load using the lower bits of the register.
             dtr_u(isLoad, srcDst, ARMRegisters::S0, (offset & 0xfff));
         } else {
-            // for even bigger offsets, load the entire offset into a registe, then do an
-            // indexed load using the base register and the index register
+            // For even bigger offsets, load the entire offset into a register, then do an
+            // indexed load using the base register and the index register.
             moveImm(offset, ARMRegisters::S0);
             dtr_ur(isLoad, srcDst, base, ARMRegisters::S0);
         }
     } else {
-        // negative offsets 
+        // Negative offsets.
         offset = -offset;
         if (offset <= 0xfff)
             dtr_d(isLoad, srcDst, base, offset);
         else if (offset <= 0xfffff) {
-            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
+            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
             dtr_d(isLoad, srcDst, ARMRegisters::S0, (offset & 0xfff));
         } else {
             moveImm(offset, ARMRegisters::S0);
             dtr_dr(isLoad, srcDst, base, ARMRegisters::S0);
         }
     }
 }
 /* this is large, ugly and obsolete.  dataTransferN is superior.*/
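
Worked example (not part of the patch; the offset value is made up): for a medium-range access, dataTransfer32 above folds the upper bits of the offset into the base register with a rotated op2 immediate and encodes the remaining twelve bits directly in the LDR/STR. A minimal standalone C++ sketch of that split:

#include <cassert>
#include <cstdint>

int main() {
    int32_t offset = 0x12345;        // hypothetical offset, > 0xfff but <= 0xfffff
    int32_t upper  = offset >> 12;   // added to the base (op2 immediate, LSL #12)
    int32_t lower  = offset & 0xfff; // encoded directly in the LDR/STR
    assert(upper <= 0xff);           // must fit the 8-bit op2 immediate field
    assert((upper << 12) + lower == offset);
    return 0;
}
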
@@ -344,17 +344,17 @@ void ARMAssembler::dataTransfer8(bool is
 {
     if (offset >= 0) {
         if (offset <= 0xfff) {
             if (isSigned)
                 mem_imm_off(isLoad, true, 8, true, srcDst, base, offset);
             else
                 dtrb_u(isLoad, srcDst, base, offset);
         } else if (offset <= 0xfffff) {
-            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
+            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
             if (isSigned)
                 mem_imm_off(isLoad, true, 8, true, srcDst, ARMRegisters::S0, (offset & 0xfff));
             else
                 dtrb_u(isLoad, srcDst, ARMRegisters::S0, (offset & 0xfff));
         } else {
             moveImm(offset, ARMRegisters::S0);
             if (isSigned)
                 mem_reg_off(isLoad, true, 8, true, srcDst, base, ARMRegisters::S0);
@@ -365,17 +365,17 @@ void ARMAssembler::dataTransfer8(bool is
         offset = -offset;
         if (offset <= 0xfff) {
             if (isSigned)
                 mem_imm_off(isLoad, true, 8, false, srcDst, base, offset);
             else
                 dtrb_d(isLoad, srcDst, base, offset);
         }
         else if (offset <= 0xfffff) {
-            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
+            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
             if (isSigned)
                 mem_imm_off(isLoad, true, 8, false, srcDst, ARMRegisters::S0, (offset & 0xfff));
             else
                 dtrb_d(isLoad, srcDst, ARMRegisters::S0, (offset & 0xfff));
 
         } else {
             moveImm(offset, ARMRegisters::S0);
             if (isSigned)
@@ -430,85 +430,85 @@ void ARMAssembler::baseIndexTransferN(bo
     }
     ldr_un_imm(ARMRegisters::S0, offset);
     add_r(ARMRegisters::S0, ARMRegisters::S0, op2);
     mem_reg_off(isLoad, isSigned, size, true, srcDst, base, ARMRegisters::S0);
 }
 
 void ARMAssembler::doubleTransfer(bool isLoad, FPRegisterID srcDst, RegisterID base, int32_t offset)
 {
-    if (offset & 0x3) {
-        if (offset <= 0x3ff && offset >= 0) {
-            fdtr_u(isLoad, srcDst, base, offset >> 2);
-            return;
-        }
-        if (offset <= 0x3ffff && offset >= 0) {
-            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | (11 << 8));
-            fdtr_u(isLoad, srcDst, ARMRegisters::S0, (offset >> 2) & 0xff);
-            return;
-        }
-        offset = -offset;
+    // VFP cannot directly access memory that is not four-byte-aligned, so
+    // special-case support will be required for such cases. However, we don't
+    // currently use any unaligned floating-point memory accesses and probably
+    // never will, so for now just assert that the offset is aligned.
+    //
+    // Note that we cannot assert that the base register is aligned, but in
+    // that case, an alignment fault will be raised at run-time.
+    ASSERT((offset & 0x3) == 0);
 
-        if (offset <= 0x3ff && offset >= 0) {
-            fdtr_d(isLoad, srcDst, base, offset >> 2);
+    // Try to use a single load/store instruction, or at least a simple address
+    // calculation.
+    if (offset >= 0) {
+        if (offset <= 0x3ff) {
+            fmem_imm_off(isLoad, true, true, srcDst, base, offset >> 2);
             return;
         }
-        if (offset <= 0x3ffff && offset >= 0) {
-            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | (11 << 8));
-            fdtr_d(isLoad, srcDst, ARMRegisters::S0, (offset >> 2) & 0xff);
+        if (offset <= 0x3ffff) {
+            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | getOp2RotLSL(10));
+            fmem_imm_off(isLoad, true, true, srcDst, ARMRegisters::S0, (offset >> 2) & 0xff);
             return;
         }
-        offset = -offset;
+    } else {
+        if (offset >= -0x3ff) {
+            fmem_imm_off(isLoad, true, false, srcDst, base, -offset >> 2);
+            return;
+        }
+        if (offset >= -0x3ffff) {
+            sub_r(ARMRegisters::S0, base, OP2_IMM | (-offset >> 10) | getOp2RotLSL(10));
+            fmem_imm_off(isLoad, true, false, srcDst, ARMRegisters::S0, (-offset >> 2) & 0xff);
+            return;
+        }
     }
 
-    // TODO: This is broken in the case that offset is unaligned. VFP can never
-    // perform unaligned accesses, even from an unaligned register base. (NEON
-    // can, but VFP isn't NEON. It is not advisable to interleave a NEON load
-    // with VFP code, so the best solution here is probably to perform an
-    // unaligned integer load, then move the result into VFP using VMOV.)
-    ASSERT((offset & 0x3) == 0);
-
+    // Slow case for long-range accesses.
     ldr_un_imm(ARMRegisters::S0, offset);
     add_r(ARMRegisters::S0, ARMRegisters::S0, base);
-    fdtr_u(isLoad, srcDst, ARMRegisters::S0, 0);
+    fmem_imm_off(isLoad, true, true, srcDst, ARMRegisters::S0, 0);
 }
 
 void ARMAssembler::floatTransfer(bool isLoad, FPRegisterID srcDst, RegisterID base, int32_t offset)
 {
-    if (offset & 0x3) {
-        if (offset <= 0x3ff && offset >= 0) {
+    // Assert that the access is aligned, as in doubleTransfer.
+    ASSERT((offset & 0x3) == 0);
+
+    // Try to use a single load/store instruction, or at least a simple address
+    // calculation.
+    if (offset >= 0) {
+        if (offset <= 0x3ff) {
             fmem_imm_off(isLoad, false, true, srcDst, base, offset >> 2);
             return;
         }
-        if (offset <= 0x3ffff && offset >= 0) {
-            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | (11 << 8));
+        if (offset <= 0x3ffff) {
+            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | getOp2RotLSL(10));
             fmem_imm_off(isLoad, false, true, srcDst, ARMRegisters::S0, (offset >> 2) & 0xff);
             return;
         }
-        offset = -offset;
-
-        if (offset <= 0x3ff && offset >= 0) {
-            fmem_imm_off(isLoad, false, false, srcDst, base, offset >> 2);
-            return;
-        }
-        if (offset <= 0x3ffff && offset >= 0) {
-            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | (11 << 8));
-            fmem_imm_off(isLoad, false, true, srcDst, ARMRegisters::S0, (offset >> 2) & 0xff);
+    } else {
+        if (offset >= -0x3ff) {
+            fmem_imm_off(isLoad, false, false, srcDst, base, -offset >> 2);
             return;
         }
-        offset = -offset;
+        if (offset >= -0x3ffff) {
+            sub_r(ARMRegisters::S0, base, OP2_IMM | (-offset >> 10) | getOp2RotLSL(10));
+            fmem_imm_off(isLoad, false, false, srcDst, ARMRegisters::S0, (-offset >> 2) & 0xff);
+            return;
+        }
     }
 
-    // TODO: This is broken in the case that offset is unaligned. VFP can never
-    // perform unaligned accesses, even from an unaligned register base. (NEON
-    // can, but VFP isn't NEON. It is not advisable to interleave a NEON load
-    // with VFP code, so the best solution here is probably to perform an
-    // unaligned integer load, then move the result into VFP using VMOV.)
-    ASSERT((offset & 0x3) == 0);
-
+    // Slow case for long-range accesses.
     ldr_un_imm(ARMRegisters::S0, offset);
     add_r(ARMRegisters::S0, ARMRegisters::S0, base);
     fmem_imm_off(isLoad, false, true, srcDst, ARMRegisters::S0, 0);
 }
 
 void ARMAssembler::baseIndexFloatTransfer(bool isLoad, bool isDouble, FPRegisterID srcDst, RegisterID base, RegisterID index, int scale, int32_t offset)
 {
     ARMWord op2;
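
Worked example (not part of the patch; the offset is invented but word-aligned, as the new assertions require): VLDR/VSTR encode an unsigned 8-bit offset counted in words, so only 0..0x3ff bytes are reachable directly, and doubleTransfer/floatTransfer above fold anything beyond that into the base register first. A minimal C++ sketch of the split:

#include <cassert>
#include <cstdint>

int main() {
    int32_t offset = 0x2468;                  // hypothetical aligned offset, > 0x3ff but <= 0x3ffff
    assert((offset & 0x3) == 0);              // VFP cannot access unaligned memory
    int32_t folded   = (offset >> 10) << 10;  // bits added to the base (op2 immediate, LSL #10)
    int32_t vfpWords = (offset >> 2) & 0xff;  // 8-bit word offset encoded in VLDR/VSTR
    assert((offset >> 10) <= 0xff);           // fits the 8-bit op2 immediate field
    assert(folded + (vfpWords << 2) == offset);
    return 0;
}
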
--- a/js/src/assembler/assembler/ARMAssembler.h
+++ b/js/src/assembler/assembler/ARMAssembler.h
@@ -314,16 +314,29 @@ namespace JSC {
         // Instruction formating
 
         void emitInst(ARMWord op, int rd, int rn, ARMWord op2)
         {
             ASSERT ( ((op2 & ~OP2_IMM) <= 0xfff) || (((op2 & ~OP2_IMMh) <= 0xfff)) );
             m_buffer.putInt(op | RN(rn) | RD(rd) | op2);
         }
 
+        // Work out the pre-shifted constant necessary to encode the specified
+        // logical shift left for op2 immediates. Only even shifts can be
+        // applied.
+        //
+        // Input validity is asserted in debug builds.
+        ARMWord getOp2RotLSL(int lsl)
+        {
+            ASSERT((lsl >= 0) && (lsl <= 24));
+            ASSERT(!(lsl % 2));
+
+            return (-(lsl/2) & 0xf) << 8;
+        }
+
         void and_r(int rd, int rn, ARMWord op2, Condition cc = AL)
         {
             spewInsWithOp2("and", cc, rd, rn, op2);
             emitInst(static_cast<ARMWord>(cc) | AND, rd, rn, op2);
         }
 
         void ands_r(int rd, int rn, ARMWord op2, Condition cc = AL)
         {
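
Background for the new helper (illustrative sketch, not part of the patch): an ARM data-processing op2 immediate is an 8-bit value rotated right by twice the 4-bit rotate field, and a logical shift left by n is the same as a rotate right by (32 - n), so the rotate field is (32 - n) / 2, which is what -(lsl / 2) & 0xf computes. A standalone check that the helper reproduces the magic constants it replaces:

#include <cassert>
#include <cstdint>

typedef uint32_t ARMWord;

static ARMWord getOp2RotLSL(int lsl) {
    assert(lsl >= 0 && lsl <= 24 && (lsl % 2) == 0);
    return (-(lsl / 2) & 0xf) << 8;
}

int main() {
    assert(getOp2RotLSL(12) == (10u << 8)); // replaces the literal (10 << 8), i.e. LSL #12
    assert(getOp2RotLSL(10) == (11u << 8)); // replaces the literal (11 << 8), i.e. LSL #10
    return 0;
}
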
@@ -1649,34 +1662,16 @@ namespace JSC {
         {
             js::JaegerSpew(js::JSpew_Insns,
                     IPFX   "%-15s %s, %s\n", MAYBE_PAD, "vsqrt.f64", nameFpRegD(dd), nameFpRegD(dm));
             // TODO: emitInst doesn't work for VFP instructions, though it
             // seems to work for current usage.
             emitInst(static_cast<ARMWord>(cc) | FSQRTD, dd, 0, dm);
         }
 
-        void fdtr_u(bool isLoad, int dd, int rn, ARMWord offset, Condition cc = AL)
-        {
-            char const * ins = isLoad ? "vldr.f64" : "vstr.f64";
-            js::JaegerSpew(js::JSpew_Insns,
-                    IPFX   "%-15s %s, [%s, #+%u]\n", MAYBE_PAD, ins, nameFpRegD(dd), nameGpReg(rn), offset);
-            ASSERT(offset <= 0xff);
-            emitInst(static_cast<ARMWord>(cc) | FDTR | DT_UP | (isLoad ? DT_LOAD : 0), dd, rn, offset);
-        }
-
-        void fdtr_d(bool isLoad, int dd, int rn, ARMWord offset, Condition cc = AL)
-        {
-            char const * ins = isLoad ? "vldr.f64" : "vstr.f64";
-            js::JaegerSpew(js::JSpew_Insns,
-                    IPFX   "%-15s %s, [%s, #-%u]\n", MAYBE_PAD, ins, nameFpRegD(dd), nameGpReg(rn), offset);
-            ASSERT(offset <= 0xff);
-            emitInst(static_cast<ARMWord>(cc) | FDTR | (isLoad ? DT_LOAD : 0), dd, rn, offset);
-        }
-
         void fmsr_r(int dd, int rn, Condition cc = AL)
         {
             // TODO: emitInst doesn't work for VFP instructions, though it
             // seems to work for current usage.
             emitInst(static_cast<ARMWord>(cc) | FMSR, rn, dd, 0);
         }
 
         void fmrs_r(int rd, int dn, Condition cc = AL)
--- a/js/src/assembler/assembler/MacroAssemblerARM.h
+++ b/js/src/assembler/assembler/MacroAssemblerARM.h
@@ -1132,17 +1132,17 @@ public:
         m_assembler.baseIndexFloatTransfer(true, true, dest,
                                            address.base, address.index,
                                            address.scale, address.offset);
     }
 
     DataLabelPtr loadDouble(const void* address, FPRegisterID dest)
     {
         DataLabelPtr label = moveWithPatch(ImmPtr(address), ARMRegisters::S0);
-        m_assembler.fdtr_u(true, dest, ARMRegisters::S0, 0);
+        m_assembler.doubleTransfer(true, dest, ARMRegisters::S0, 0);
         return label;
     }
 
     void fastLoadDouble(RegisterID lo, RegisterID hi, FPRegisterID fpReg) {
         m_assembler.vmov64(false, true, lo, hi, fpReg);
     }
 
     void loadFloat(ImplicitAddress address, FPRegisterID dest)