Bug 669132: Optimize VFP memory accesses on ARM. [r=cdleary,Marty]
author       Jacob Bramley <Jacob.Bramley@arm.com>
date         Tue, 09 Aug 2011 16:58:11 +0100
changeset    75397 25fad7b95686ae05fc20c796c41451f91a073b4b
parent       75396 f9379ad86c5d1afdadd75bf4a468c4a2dab968f9
child        75398 604271b95a33764a36add2b957deabb55c0e45ca
push id      2
push user    bsmedberg@mozilla.com
push date    Fri, 19 Aug 2011 14:38:13 +0000
reviewers    cdleary, Marty
bugs         669132
milestone    8.0a1
js/src/assembler/assembler/ARMAssembler.cpp
js/src/assembler/assembler/ARMAssembler.h
js/src/assembler/assembler/MacroAssemblerARM.h
--- a/js/src/assembler/assembler/ARMAssembler.cpp
+++ b/js/src/assembler/assembler/ARMAssembler.cpp
@@ -270,73 +270,73 @@ ARMWord ARMAssembler::encodeComplexImm(A
 }
 
 // Memory load/store helpers
 // TODO: this does not take advantage of all of ARMv7's instruction encodings, it should.
 void ARMAssembler::dataTransferN(bool isLoad, bool isSigned, int size, RegisterID rt, RegisterID base, int32_t offset)
 {
     bool posOffset = true;
 
-    // there may be more elegant ways of handling this, but this one works.
+    // There may be more elegant ways of handling this, but this one works.
     if (offset == 0x80000000) {
-        // for even bigger offsets, load the entire offset into a register, then do an
-        // indexed load using the base register and the index register
+        // For even bigger offsets, load the entire offset into a register, then do an
+        // indexed load using the base register and the index register.
         moveImm(offset, ARMRegisters::S0);
         mem_reg_off(isLoad, isSigned, size, posOffset, rt, base, ARMRegisters::S0);
         return;
     }
     if (offset < 0) {
         offset = - offset;
         posOffset = false;
     }
     if (offset <= 0xfff) {
         // LDR rd, [rb, #+offset]
         mem_imm_off(isLoad, isSigned, size, posOffset, rt, base, offset);
     } else if (offset <= 0xfffff) {
-        // add upper bits of offset to the base, and store the result into the temp registe
+        // Add upper bits of offset to the base, and store the result into the temp register.
         if (posOffset) {
-            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
+            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
         } else {
-            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
+            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
         }
-        // load using the lower bits of the offset
+        // Load using the lower bits of the offset.
         mem_imm_off(isLoad, isSigned, size, posOffset, rt,
                     ARMRegisters::S0, (offset & 0xfff));
     } else {
-        // for even bigger offsets, load the entire offset into a register, then do an
-        // indexed load using the base register and the index register
+        // For even bigger offsets, load the entire offset into a register, then do an
+        // indexed load using the base register and the index register.
         moveImm(offset, ARMRegisters::S0);
         mem_reg_off(isLoad, isSigned, size, posOffset, rt, base, ARMRegisters::S0);
     }
 }
 
 void ARMAssembler::dataTransfer32(bool isLoad, RegisterID srcDst, RegisterID base, int32_t offset)
 {
     if (offset >= 0) {
         if (offset <= 0xfff)
             // LDR rd, [rb, +offset]
             dtr_u(isLoad, srcDst, base, offset);
         else if (offset <= 0xfffff) {
-            // add upper bits of offset to the base, and store the result into the temp registe
-            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
-            // load using the lower bits of the register
+            // Add upper bits of offset to the base, and store the result into the temp register.
+            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
+            // Load using the lower bits of the register.
             dtr_u(isLoad, srcDst, ARMRegisters::S0, (offset & 0xfff));
         } else {
-            // for even bigger offsets, load the entire offset into a registe, then do an
-            // indexed load using the base register and the index register
+            // For even bigger offsets, load the entire offset into a register, then do an
+            // indexed load using the base register and the index register.
             moveImm(offset, ARMRegisters::S0);
             dtr_ur(isLoad, srcDst, base, ARMRegisters::S0);
         }
     } else {
-        // negative offsets 
+        // Negative offsets.
         offset = -offset;
         if (offset <= 0xfff)
             dtr_d(isLoad, srcDst, base, offset);
         else if (offset <= 0xfffff) {
-            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
+            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
             dtr_d(isLoad, srcDst, ARMRegisters::S0, (offset & 0xfff));
         } else {
             moveImm(offset, ARMRegisters::S0);
             dtr_dr(isLoad, srcDst, base, ARMRegisters::S0);
         }
     }
 }
 /* this is large, ugly and obsolete.  dataTransferN is superior.*/
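(Editorial aside, not part of the patch: the standalone sketch below shows the three-way split that dataTransferN and dataTransfer32 perform for positive offsets. A 12-bit immediate covers offsets up to 0xfff, one extra ADD of the upper eight bits extends that to 0xfffff, and anything larger falls back to materializing the whole offset in the scratch register S0. The printf lines are hypothetical stand-ins for the real encoder calls dtr_u, add_r, moveImm and dtr_ur.)

// Illustrative sketch only: mirrors the offset-splitting logic of
// ARMAssembler::dataTransfer32 for a non-negative offset.
#include <cassert>
#include <cstdint>
#include <cstdio>

void sketchLoad32(int32_t offset)
{
    assert(offset >= 0);    // Negative offsets mirror this with SUB and LDR [rb, #-imm].

    if (offset <= 0xfff) {
        // LDR's addressing mode holds a 12-bit unsigned byte offset directly.
        printf("ldr  rd, [rb, #+%d]\n", offset);
    } else if (offset <= 0xfffff) {
        // Add the upper 8 bits (offset >> 12, rotated back into place) to the
        // base in one instruction, then load with the remaining 12 bits.
        printf("add  s0, rb, #%d\n", offset & ~0xfff);
        printf("ldr  rd, [s0, #+%d]\n", offset & 0xfff);
    } else {
        // Too big for the immediate forms: build the offset in S0 (moveImm may
        // take several instructions) and use register-offset addressing.
        printf("; moveImm %d -> s0\n", offset);
        printf("ldr  rd, [rb, +s0]\n");
    }
}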
@@ -344,17 +344,17 @@ void ARMAssembler::dataTransfer8(bool is
 {
     if (offset >= 0) {
         if (offset <= 0xfff) {
             if (isSigned)
                 mem_imm_off(isLoad, true, 8, true, srcDst, base, offset);
             else
                 dtrb_u(isLoad, srcDst, base, offset);
         } else if (offset <= 0xfffff) {
-            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
+            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
             if (isSigned)
                 mem_imm_off(isLoad, true, 8, true, srcDst, ARMRegisters::S0, (offset & 0xfff));
             else
                 dtrb_u(isLoad, srcDst, ARMRegisters::S0, (offset & 0xfff));
         } else {
             moveImm(offset, ARMRegisters::S0);
             if (isSigned)
                 mem_reg_off(isLoad, true, 8, true, srcDst, base, ARMRegisters::S0);
@@ -365,17 +365,17 @@ void ARMAssembler::dataTransfer8(bool is
         offset = -offset;
         if (offset <= 0xfff) {
             if (isSigned)
                 mem_imm_off(isLoad, true, 8, false, srcDst, base, offset);
             else
                 dtrb_d(isLoad, srcDst, base, offset);
         }
         else if (offset <= 0xfffff) {
-            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | (10 << 8));
+            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 12) | getOp2RotLSL(12));
             if (isSigned)
                 mem_imm_off(isLoad, true, 8, false, srcDst, ARMRegisters::S0, (offset & 0xfff));
             else
                 dtrb_d(isLoad, srcDst, ARMRegisters::S0, (offset & 0xfff));
 
         } else {
             moveImm(offset, ARMRegisters::S0);
             if (isSigned)
@@ -430,85 +430,85 @@ void ARMAssembler::baseIndexTransferN(bo
     }
     ldr_un_imm(ARMRegisters::S0, offset);
     add_r(ARMRegisters::S0, ARMRegisters::S0, op2);
     mem_reg_off(isLoad, isSigned, size, true, srcDst, base, ARMRegisters::S0);
 }
 
 void ARMAssembler::doubleTransfer(bool isLoad, FPRegisterID srcDst, RegisterID base, int32_t offset)
 {
-    if (offset & 0x3) {
-        if (offset <= 0x3ff && offset >= 0) {
-            fdtr_u(isLoad, srcDst, base, offset >> 2);
-            return;
-        }
-        if (offset <= 0x3ffff && offset >= 0) {
-            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | (11 << 8));
-            fdtr_u(isLoad, srcDst, ARMRegisters::S0, (offset >> 2) & 0xff);
-            return;
-        }
-        offset = -offset;
+    // VFP cannot directly access memory that is not four-byte-aligned, so
+    // special-case support will be required for such cases. However, we don't
+    // currently use any unaligned floating-point memory accesses and probably
+    // never will, so for now just assert that the offset is aligned.
+    //
+    // Note that we cannot assert that the base register is aligned, but in
+    // that case, an alignment fault will be raised at run-time.
+    ASSERT((offset & 0x3) == 0);
 
-        if (offset <= 0x3ff && offset >= 0) {
-            fdtr_d(isLoad, srcDst, base, offset >> 2);
+    // Try to use a single load/store instruction, or at least a simple address
+    // calculation.
+    if (offset >= 0) {
+        if (offset <= 0x3ff) {
+            fmem_imm_off(isLoad, true, true, srcDst, base, offset >> 2);
             return;
         }
-        if (offset <= 0x3ffff && offset >= 0) {
-            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | (11 << 8));
-            fdtr_d(isLoad, srcDst, ARMRegisters::S0, (offset >> 2) & 0xff);
+        if (offset <= 0x3ffff) {
+            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | getOp2RotLSL(10));
+            fmem_imm_off(isLoad, true, true, srcDst, ARMRegisters::S0, (offset >> 2) & 0xff);
             return;
         }
-        offset = -offset;
+    } else {
+        if (offset >= -0x3ff) {
+            fmem_imm_off(isLoad, true, false, srcDst, base, -offset >> 2);
+            return;
+        }
+        if (offset >= -0x3ffff) {
+            sub_r(ARMRegisters::S0, base, OP2_IMM | (-offset >> 10) | getOp2RotLSL(10));
+            fmem_imm_off(isLoad, true, false, srcDst, ARMRegisters::S0, (-offset >> 2) & 0xff);
+            return;
+        }
     }
 
-    // TODO: This is broken in the case that offset is unaligned. VFP can never
-    // perform unaligned accesses, even from an unaligned register base. (NEON
-    // can, but VFP isn't NEON. It is not advisable to interleave a NEON load
-    // with VFP code, so the best solution here is probably to perform an
-    // unaligned integer load, then move the result into VFP using VMOV.)
-    ASSERT((offset & 0x3) == 0);
-
+    // Slow case for long-range accesses.
     ldr_un_imm(ARMRegisters::S0, offset);
     add_r(ARMRegisters::S0, ARMRegisters::S0, base);
-    fdtr_u(isLoad, srcDst, ARMRegisters::S0, 0);
+    fmem_imm_off(isLoad, true, true, srcDst, ARMRegisters::S0, 0);
 }
 
 void ARMAssembler::floatTransfer(bool isLoad, FPRegisterID srcDst, RegisterID base, int32_t offset)
 {
-    if (offset & 0x3) {
-        if (offset <= 0x3ff && offset >= 0) {
+    // Assert that the access is aligned, as in doubleTransfer.
+    ASSERT((offset & 0x3) == 0);
+
+    // Try to use a single load/store instruction, or at least a simple address
+    // calculation.
+    if (offset >= 0) {
+        if (offset <= 0x3ff) {
             fmem_imm_off(isLoad, false, true, srcDst, base, offset >> 2);
             return;
         }
-        if (offset <= 0x3ffff && offset >= 0) {
-            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | (11 << 8));
+        if (offset <= 0x3ffff) {
+            add_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | getOp2RotLSL(10));
             fmem_imm_off(isLoad, false, true, srcDst, ARMRegisters::S0, (offset >> 2) & 0xff);
             return;
         }
-        offset = -offset;
-
-        if (offset <= 0x3ff && offset >= 0) {
-            fmem_imm_off(isLoad, false, false, srcDst, base, offset >> 2);
-            return;
-        }
-        if (offset <= 0x3ffff && offset >= 0) {
-            sub_r(ARMRegisters::S0, base, OP2_IMM | (offset >> 10) | (11 << 8));
-            fmem_imm_off(isLoad, false, true, srcDst, ARMRegisters::S0, (offset >> 2) & 0xff);
+    } else {
+        if (offset >= -0x3ff) {
+            fmem_imm_off(isLoad, false, false, srcDst, base, -offset >> 2);
             return;
         }
-        offset = -offset;
+        if (offset >= -0x3ffff) {
+            sub_r(ARMRegisters::S0, base, OP2_IMM | (-offset >> 10) | getOp2RotLSL(10));
+            fmem_imm_off(isLoad, false, false, srcDst, ARMRegisters::S0, (-offset >> 2) & 0xff);
+            return;
+        }
     }
 
-    // TODO: This is broken in the case that offset is unaligned. VFP can never
-    // perform unaligned accesses, even from an unaligned register base. (NEON
-    // can, but VFP isn't NEON. It is not advisable to interleave a NEON load
-    // with VFP code, so the best solution here is probably to perform an
-    // unaligned integer load, then move the result into VFP using VMOV.)
-    ASSERT((offset & 0x3) == 0);
-
+    // Slow case for long-range accesses.
     ldr_un_imm(ARMRegisters::S0, offset);
     add_r(ARMRegisters::S0, ARMRegisters::S0, base);
     fmem_imm_off(isLoad, false, true, srcDst, ARMRegisters::S0, 0);
 }
 
 void ARMAssembler::baseIndexFloatTransfer(bool isLoad, bool isDouble, FPRegisterID srcDst, RegisterID base, RegisterID index, int scale, int32_t offset)
 {
     ARMWord op2;
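(Editorial aside, not part of the patch: the VFP paths above follow the same pattern as the integer ones, but VLDR/VSTR only carry an 8-bit word offset, so the split points drop from 0xfff/0xfffff to 0x3ff/0x3ffff. Below is a minimal sketch of the positive-offset decision in doubleTransfer; the printf lines are hypothetical stand-ins for fmem_imm_off, add_r and ldr_un_imm.)

// Illustrative sketch only: how doubleTransfer picks an addressing strategy
// for a word-aligned, non-negative offset. Negative offsets mirror this with
// SUB and the VLDR/VSTR "down" (subtract) bit.
#include <cassert>
#include <cstdint>
#include <cstdio>

void sketchDoubleLoad(int32_t offset)
{
    assert((offset & 0x3) == 0 && offset >= 0); // VFP cannot access unaligned memory.

    if (offset <= 0x3ff) {
        // VLDR encodes an 8-bit *word* offset (0..255 words = 0..1020 bytes);
        // the 0x3ff bound is safe because the offset is word-aligned.
        printf("vldr d0, [rb, #+%d]\n", offset);
    } else if (offset <= 0x3ffff) {
        // Fold bits [17:10] into the scratch register with one ADD (rotated
        // immediate), then use the low 10 bits in the VLDR itself.
        printf("add  s0, rb, #%d\n", offset & ~0x3ff);
        printf("vldr d0, [s0, #+%d]\n", offset & 0x3ff);
    } else {
        // Long-range slow case: materialize the whole offset, add the base,
        // and load with a zero immediate offset.
        printf("; ldr_un_imm %d -> s0\n", offset);
        printf("add  s0, s0, rb\n");
        printf("vldr d0, [s0, #+0]\n");
    }
}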
--- a/js/src/assembler/assembler/ARMAssembler.h
+++ b/js/src/assembler/assembler/ARMAssembler.h
@@ -314,16 +314,29 @@ namespace JSC {
         // Instruction formating
 
         void emitInst(ARMWord op, int rd, int rn, ARMWord op2)
         {
             ASSERT ( ((op2 & ~OP2_IMM) <= 0xfff) || (((op2 & ~OP2_IMMh) <= 0xfff)) );
             m_buffer.putInt(op | RN(rn) | RD(rd) | op2);
         }
 
+        // Work out the pre-shifted constant necessary to encode the specified
+        // logical shift left for op2 immediates. Only even shifts can be
+        // applied.
+        //
+        // Input validity is asserted in debug builds.
+        ARMWord getOp2RotLSL(int lsl)
+        {
+            ASSERT((lsl >= 0) && (lsl <= 24));
+            ASSERT(!(lsl % 2));
+
+            return (-(lsl/2) & 0xf) << 8;
+        }
+
         void and_r(int rd, int rn, ARMWord op2, Condition cc = AL)
         {
             spewInsWithOp2("and", cc, rd, rn, op2);
             emitInst(static_cast<ARMWord>(cc) | AND, rd, rn, op2);
         }
 
         void ands_r(int rd, int rn, ARMWord op2, Condition cc = AL)
         {
@@ -1649,34 +1662,16 @@ namespace JSC {
         {
             js::JaegerSpew(js::JSpew_Insns,
                     IPFX   "%-15s %s, %s\n", MAYBE_PAD, "vsqrt.f64", nameFpRegD(dd), nameFpRegD(dm));
             // TODO: emitInst doesn't work for VFP instructions, though it
             // seems to work for current usage.
             emitInst(static_cast<ARMWord>(cc) | FSQRTD, dd, 0, dm);
         }
 
-        void fdtr_u(bool isLoad, int dd, int rn, ARMWord offset, Condition cc = AL)
-        {
-            char const * ins = isLoad ? "vldr.f64" : "vstr.f64";
-            js::JaegerSpew(js::JSpew_Insns,
-                    IPFX   "%-15s %s, [%s, #+%u]\n", MAYBE_PAD, ins, nameFpRegD(dd), nameGpReg(rn), offset);
-            ASSERT(offset <= 0xff);
-            emitInst(static_cast<ARMWord>(cc) | FDTR | DT_UP | (isLoad ? DT_LOAD : 0), dd, rn, offset);
-        }
-
-        void fdtr_d(bool isLoad, int dd, int rn, ARMWord offset, Condition cc = AL)
-        {
-            char const * ins = isLoad ? "vldr.f64" : "vstr.f64";
-            js::JaegerSpew(js::JSpew_Insns,
-                    IPFX   "%-15s %s, [%s, #-%u]\n", MAYBE_PAD, ins, nameFpRegD(dd), nameGpReg(rn), offset);
-            ASSERT(offset <= 0xff);
-            emitInst(static_cast<ARMWord>(cc) | FDTR | (isLoad ? DT_LOAD : 0), dd, rn, offset);
-        }
-
         void fmsr_r(int dd, int rn, Condition cc = AL)
         {
             // TODO: emitInst doesn't work for VFP instructions, though it
             // seems to work for current usage.
             emitInst(static_cast<ARMWord>(cc) | FMSR, rn, dd, 0);
         }
 
         void fmrs_r(int rd, int dn, Condition cc = AL)
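(Editorial aside, not part of the patch: the getOp2RotLSL helper added above replaces the magic (10 << 8) and (11 << 8) constants. ARM data-processing immediates are an 8-bit value rotated right by twice the 4-bit field in bits 8-11 of op2; a logical shift left by lsl is the same rotation as a rotate right by (32 - lsl), so the field is -(lsl / 2) modulo 16, which is what the helper computes. The self-contained check below verifies this; decodeOp2Imm is a hypothetical decoder written only for the check.)

// Illustrative sketch only: verify the op2 rotate-field arithmetic.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t getOp2RotLSL(int lsl)           // mirrors the new helper
{
    assert(lsl >= 0 && lsl <= 24 && (lsl % 2) == 0);
    return (uint32_t)((-(lsl / 2)) & 0xf) << 8;
}

static uint32_t decodeOp2Imm(uint32_t op2)      // hypothetical decoder, for checking only
{
    uint32_t imm8 = op2 & 0xff;
    uint32_t rot  = ((op2 >> 8) & 0xf) * 2;     // rotate-right amount
    return (imm8 >> rot) | (imm8 << ((32 - rot) & 31));
}

int main()
{
    // (offset >> 12) | getOp2RotLSL(12) encodes (offset >> 12) << 12:
    // LSL #12 is ROR #20, so the rotate field is 10 -- the old magic (10 << 8).
    assert(getOp2RotLSL(12) == (10u << 8));
    // Likewise LSL #10 is ROR #22, field 11 -- the old (11 << 8) in the VFP paths.
    assert(getOp2RotLSL(10) == (11u << 8));
    assert(decodeOp2Imm(0xab | getOp2RotLSL(12)) == (0xabu << 12));
    printf("op2 rotate encodings check out\n");
    return 0;
}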
--- a/js/src/assembler/assembler/MacroAssemblerARM.h
+++ b/js/src/assembler/assembler/MacroAssemblerARM.h
@@ -1132,17 +1132,17 @@ public:
         m_assembler.baseIndexFloatTransfer(true, true, dest,
                                            address.base, address.index,
                                            address.scale, address.offset);
     }
 
     DataLabelPtr loadDouble(const void* address, FPRegisterID dest)
     {
         DataLabelPtr label = moveWithPatch(ImmPtr(address), ARMRegisters::S0);
-        m_assembler.fdtr_u(true, dest, ARMRegisters::S0, 0);
+        m_assembler.doubleTransfer(true, dest, ARMRegisters::S0, 0);
         return label;
     }
 
     void fastLoadDouble(RegisterID lo, RegisterID hi, FPRegisterID fpReg) {
         m_assembler.vmov64(false, true, lo, hi, fpReg);
     }
 
     void loadFloat(ImplicitAddress address, FPRegisterID dest)