Add support for ARM's 'hard' EABI variant. (FP arguments go in VFP registers.) [Bug 602834] [r=jbramley,rreitmai]
author: Tero Koskinen <tero.koskinen@digia.com>
Mon, 25 Oct 2010 09:51:59 +0100
changeset 56714 ffd02f65ffb3f8f9d4c96bcce64a960f44f4c370
parent 56713 35d097cc89a1810ba5cbca143d4e74d6e42b47c9
child 56715 7eadec8c91c60735b2285a1d99c03bb409327303
push id: 16665
push user: rsayre@mozilla.com
push date: Sun, 31 Oct 2010 10:52:31 +0000
treeherder: mozilla-central@504a46e82712 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: jbramley, rreitmai
bugs: 602834
milestone: 2.0b8pre
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Add support for ARM's 'hard' EABI variant. (FP arguments go in VFP registers.) [Bug 602834] [r=jbramley,rreitmai]
js/src/nanojit/NativeARM.cpp
js/src/nanojit/NativeARM.h
--- a/js/src/nanojit/NativeARM.cpp
+++ b/js/src/nanojit/NativeARM.cpp
@@ -19,16 +19,17 @@
  * Adobe System Incorporated.
  * Portions created by the Initial Developer are Copyright (C) 2004-2007
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  *   Adobe AS3 Team
  *   Vladimir Vukicevic <vladimir@pobox.com>
  *   Jacob Bramley <Jacob.Bramley@arm.com>
+ *   Tero Koskinen <tero.koskinen@digia.com>
  *
  * Alternatively, the contents of this file may be used under the terms of
  * either the GNU General Public License Version 2 or later (the "GPL"), or
  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  * in which case the provisions of the GPL or the LGPL are applicable instead
  * of those above. If you wish to allow use of your version of this file only
  * under the terms of either the GPL or the LGPL, and not to allow others to
  * use your version of this file under the terms of the MPL, indicate your
@@ -47,17 +48,17 @@
 
 #if defined(FEATURE_NANOJIT) && defined(NANOJIT_ARM)
 
 namespace nanojit
 {
 
 #ifdef NJ_VERBOSE
 const char* regNames[] = {"r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","fp","ip","sp","lr","pc",
-                          "d0","d1","d2","d3","d4","d5","d6","d7","s14"};
+                          "d0","d1","d2","d3","d4","d5","d6","d7","s0"};
 const char* condNames[] = {"eq","ne","cs","cc","mi","pl","vs","vc","hi","ls","ge","lt","gt","le",""/*al*/,"nv"};
 const char* shiftNames[] = { "lsl", "lsl", "lsr", "lsr", "asr", "asr", "ror", "ror" };
 #endif
 
 const Register Assembler::argRegs[] = { R0, R1, R2, R3 };
 const Register Assembler::retRegs[] = { R0, R1 };
 const Register Assembler::savedRegs[] = { R4, R5, R6, R7, R8, R9, R10 };
 
@@ -608,73 +609,91 @@ Assembler::genEpilogue()
  * Under EABI:
  * - doubles are 64-bit aligned both in registers and on the stack.
  *   If the next available argument register is R1, it is skipped
  *   and the double is placed in R2:R3.  If R0:R1 or R2:R3 are not
  *   available, the double is placed on the stack, 64-bit aligned.
  * - 32-bit arguments are placed in registers and 32-bit aligned
  *   on the stack.
  *
+ * Under EABI with hardware floating-point procedure-call variant:
+ * - Same as EABI, but doubles are passed in D0..D7 registers.
+ *
  * Under legacy ABI:
  * - doubles are placed in subsequent arg registers; if the next
  *   available register is r3, the low order word goes into r3
  *   and the high order goes on the stack.
  * - 32-bit arguments are placed in the next available arg register,
  * - both doubles and 32-bit arguments are placed on stack with 32-bit
  *   alignment.
  */
 void
-Assembler::asm_arg(ArgType ty, LIns* arg, Register& r, int& stkd)
+Assembler::asm_arg(ArgType ty, LIns* arg, ParameterRegisters& params)
 {
     // The stack pointer must always be at least aligned to 4 bytes.
-    NanoAssert((stkd & 3) == 0);
+    NanoAssert((params.stkd & 3) == 0);
 
     if (ty == ARGTYPE_D) {
         // This task is fairly complex and so is delegated to asm_arg_64.
-        asm_arg_64(arg, r, stkd);
+        asm_arg_64(arg, params);
     } else {
         NanoAssert(ty == ARGTYPE_I || ty == ARGTYPE_UI);
         // pre-assign registers R0-R3 for arguments (if they fit)
-        if (r < R4) {
-            asm_regarg(ty, arg, r);
-            r = Register(r + 1);
+        if (params.r < R4) {
+            asm_regarg(ty, arg, params.r);
+            params.r = Register(params.r + 1);
         } else {
-            asm_stkarg(arg, stkd);
-            stkd += 4;
+            asm_stkarg(arg, params.stkd);
+            params.stkd += 4;
         }
     }
 }
 
 // Encode a 64-bit floating-point argument using the appropriate ABI.
 // This function operates in the same way as asm_arg, except that it will only
 // handle arguments where (ArgType)ty == ARGTYPE_D.
+
+#ifdef NJ_ARM_EABI_HARD_FLOAT
 void
-Assembler::asm_arg_64(LIns* arg, Register& r, int& stkd)
+Assembler::asm_arg_64(LIns* arg, ParameterRegisters& params)
+{
+    NanoAssert(IsFpReg(params.float_r));
+    if (params.float_r <= D7) {
+        findSpecificRegFor(arg, params.float_r);
+        params.float_r = Register(params.float_r + 1);
+    } else {
+        NanoAssertMsg(0, "Only 8 floating point arguments supported");
+    }
+}
+
+#else
+void
+Assembler::asm_arg_64(LIns* arg, ParameterRegisters& params)
 {
     // The stack pointer must always be at least aligned to 4 bytes.
-    NanoAssert((stkd & 3) == 0);
+    NanoAssert((params.stkd & 3) == 0);
     // The only use for this function when we are using soft floating-point
     // is for LIR_ii2d.
     NanoAssert(ARM_VFP || arg->isop(LIR_ii2d));
 
 #ifdef NJ_ARM_EABI
     // EABI requires that 64-bit arguments are aligned on even-numbered
     // registers, as R0:R1 or R2:R3. If the register base is at an
     // odd-numbered register, advance it. Note that this will push r past
     // R3 if r is R3 to start with, and will force the argument to go on
     // the stack.
-    if ((r == R1) || (r == R3)) {
-        r = Register(r + 1);
+    if ((params.r == R1) || (params.r == R3)) {
+        params.r = Register(params.r + 1);
     }
 #endif
 
-    if (r < R3) {
-        Register    ra = r;
-        Register    rb = Register(r + 1);
-        r = Register(rb + 1);
+    if (params.r < R3) {
+        Register    ra = params.r;
+        Register    rb = Register(params.r + 1);
+        params.r = Register(rb + 1);
 
 #ifdef NJ_ARM_EABI
         // EABI requires that 64-bit arguments are aligned on even-numbered
         // registers, as R0:R1 or R2:R3.
         NanoAssert( ((ra == R0) && (rb == R1)) || ((ra == R2) && (rb == R3)) );
 #endif
 
         // Put the argument in ra and rb. If the argument is in a VFP register,
@@ -688,22 +707,22 @@ Assembler::asm_arg_64(LIns* arg, Registe
             asm_regarg(ARGTYPE_I, arg->oprnd2(), rb);
         }
 
 #ifndef NJ_ARM_EABI
     } else if (r == R3) {
         // We only have one register left, but the legacy ABI requires that we
         // put 32 bits of the argument in the register (R3) and the remaining
         // 32 bits on the stack.
-        Register    ra = r; // R3
-        r = R4;
+        Register    ra = params.r; // R3
+        params.r = R4;
 
         // We're splitting the argument between registers and the stack.  This
         // must be the first time that the stack is used, so stkd must be at 0.
-        NanoAssert(stkd == 0);
+        NanoAssert(params.stkd == 0);
 
         if (ARM_VFP) {
             Register dm = findRegFor(arg, FpRegs);
             // TODO: We could optimize the this to store directly from
             // the VFP register to memory using "FMRRD ra, fp_reg[31:0]" and
             // "STR fp_reg[63:32], [SP, #stkd]".
 
             // Load from the floating-point register as usual, but use IP
@@ -712,37 +731,38 @@ Assembler::asm_arg_64(LIns* arg, Registe
             FMRRD(ra, IP, dm);
         } else {
             // Without VFP, we can simply use asm_regarg and asm_stkarg to
             // encode the two 32-bit words as we don't need to load from a VFP
             // register.
             asm_regarg(ARGTYPE_I, arg->oprnd1(), ra);
             asm_stkarg(arg->oprnd2(), 0);
         }
-        stkd += 4;
+        params.stkd += 4;
 #endif
     } else {
         // The argument won't fit in registers, so pass on to asm_stkarg.
 #ifdef NJ_ARM_EABI
         // EABI requires that 64-bit arguments are 64-bit aligned.
-        if ((stkd & 7) != 0) {
+        if ((params.stkd & 7) != 0) {
             // stkd will always be aligned to at least 4 bytes; this was
             // asserted on entry to this function.
-            stkd += 4;
+            params.stkd += 4;
         }
 #endif
         if (ARM_VFP) {
-            asm_stkarg(arg, stkd);
+            asm_stkarg(arg, params.stkd);
         } else {
-            asm_stkarg(arg->oprnd1(), stkd);
-            asm_stkarg(arg->oprnd2(), stkd+4);
+            asm_stkarg(arg->oprnd1(), params.stkd);
+            asm_stkarg(arg->oprnd2(), params.stkd+4);
         }
-        stkd += 8;
+        params.stkd += 8;
     }
 }
+#endif // NJ_ARM_EABI_HARD_FLOAT
 
 void
 Assembler::asm_regarg(ArgType ty, LIns* p, Register rd)
 {
     // Note that we don't have to prepareResultReg here because it is already
     // done by the caller, and the target register is passed as 'rd'.
     // Similarly, we don't have to freeResourcesOf(p).
 
@@ -813,16 +833,24 @@ Assembler::asm_call(LIns* ins)
          * restoring of spilled data into R0 is done via a call to
          * prepareResultReg(R0) in the other branch of this if-then-else,
          * meaning that evictScratchRegsExcept() will not modify R0. However,
          * prepareResultReg is not aware of the concept of using a register
          * pair (R0,R1) for the result of a single operation, so it can only be
          * used here with the ultimate VFP register, and not R0/R1, which
          * potentially allows for R0/R1 to get corrupted as described.
          */
+#ifdef NJ_ARM_EABI_HARD_FLOAT
+        /* With ARM hardware floating point ABI, D0 is used to return the double
+         * from the function. We need to prepare it like we do for R0 in the else
+         * branch.
+         */
+        prepareResultReg(ins, rmask(D0));
+        freeResourcesOf(ins);
+#endif
     } else if (!ins->isop(LIR_callv)) {
         prepareResultReg(ins, rmask(retRegs[0]));
         // Immediately free the resources as we need to re-use the register for
         // the arguments.
         freeResourcesOf(ins);
     }
 
     // Do this after we've handled the call result, so we don't
@@ -834,21 +862,22 @@ Assembler::asm_call(LIns* ins)
     ArgType argTypes[MAXARGS];
     uint32_t argc = ci->getArgTypes(argTypes);
     bool indirect = ci->isIndirect();
 
     // If we aren't using VFP, assert that the LIR operation is an integer
     // function call.
     NanoAssert(ARM_VFP || ins->isop(LIR_callv) || ins->isop(LIR_calli));
 
-    // If we're using VFP, and the return type is a double, it'll come back in
-    // R0/R1. We need to either place it in the result fp reg, or store it.
+    // If we're using VFP, but not hardware floating point ABI, and
+    // the return type is a double, it'll come back in R0/R1.
+    // We need to either place it in the result fp reg, or store it.
     // See comments above for more details as to why this is necessary here
     // for floating point calls, but not for integer calls.
-    if (ARM_VFP && ins->isExtant()) {
+    if (!ARM_EABI_HARD && ARM_VFP && ins->isExtant()) {
         // If the result size is a floating-point value, treat the result
         // specially, as described previously.
         if (ci->returnType() == ARGTYPE_D) {
             NanoAssert(ins->isop(LIR_calld));
 
             if (ins->isInReg()) {
                 Register dd = ins->getReg();
                 // Copy the result to the (VFP) result register.
@@ -889,31 +918,31 @@ Assembler::asm_call(LIns* ins)
             underrunProtect(12);
             BX(IP);
             MOV(LR, PC);
             MOV(IP, LR);
         }
         asm_regarg(ARGTYPE_I, ins->arg(--argc), LR);
     }
 
-    // Encode the arguments, starting at R0 and with an empty argument stack.
-    Register    r = R0;
-    int         stkd = 0;
+    // Encode the arguments, starting at R0 and with an empty argument stack (0).
+    // With hardware fp ABI, floating point arguments start from D0.
+    ParameterRegisters params = init_params(0, R0, D0);
 
     // Iterate through the argument list and encode each argument according to
     // the ABI.
     // Note that we loop through the arguments backwards as LIR specifies them
     // in reverse order.
     uint32_t    i = argc;
     while(i--) {
-        asm_arg(argTypes[i], ins->arg(i), r, stkd);
+        asm_arg(argTypes[i], ins->arg(i), params);
     }
 
-    if (stkd > max_out_args) {
-        max_out_args = stkd;
+    if (params.stkd > max_out_args) {
+        max_out_args = params.stkd;
     }
 }
 
 Register
 Assembler::nRegisterAllocFromSet(RegisterMask set)
 {
     NanoAssert(set != 0);
 
@@ -936,17 +965,17 @@ Assembler::nRegisterResetAll(RegAlloc& a
     a.clear();
     a.free =
         rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) |
         rmask(R5) | rmask(R6) | rmask(R7) | rmask(R8) | rmask(R9) |
         rmask(R10) | rmask(LR);
     if (ARM_VFP) {
         a.free |=
             rmask(D0) | rmask(D1) | rmask(D2) | rmask(D3) |
-            rmask(D4) | rmask(D5) | rmask(D6);
+            rmask(D4) | rmask(D5) | rmask(D6) | rmask(D7);
     }
 }
 
 static inline ConditionCode
 get_cc(NIns *ins)
 {
     return ConditionCode((*ins >> 28) & 0xF);
 }
@@ -1324,23 +1353,24 @@ Assembler::asm_load64(LIns* ins)
 
     if (ARM_VFP) {
         Register    dd;
         LIns*       base = ins->oprnd1();
         Register    rn = findRegFor(base, GpRegs);
         int         offset = ins->disp();
 
         if (ins->isInReg()) {
-            dd = prepareResultReg(ins, FpRegs);
+            dd = prepareResultReg(ins, FpRegs & ~rmask(D0));
         } else {
             // If the result isn't already in a register, use the VFP scratch
             // register for the result and store it directly into memory.
             NanoAssert(ins->isInAr());
             int d = arDisp(ins);
-            dd = D7;
+            evictIfActive(D0);
+            dd = D0;
             // VFP can only do loads and stores with a range of ±1020, so we
             // might need to do some arithmetic to extend its range.
             if (isU8(d/4) || isU8(-d/4)) {
                 FSTD(dd, FP, d);
             } else {
                 FSTD(dd, IP, d%1024);
                 asm_add_imm(IP, FP, d-(d%1024));
             }
@@ -1351,21 +1381,22 @@ Assembler::asm_load64(LIns* ins)
                 if (isU8(offset/4) || isU8(-offset/4)) {
                     FLDD(dd, rn, offset);
                 } else {
                     FLDD(dd, IP, offset%1024);
                     asm_add_imm(IP, rn, offset-(offset%1024));
                 }
                 break;
             case LIR_ldf2d:
-                FCVTDS(dd, S14);
+                evictIfActive(D0);
+                FCVTDS(dd, S0);
                 if (isU8(offset/4) || isU8(-offset/4)) {
-                    FLDS(S14, rn, offset);
+                    FLDS(S0, rn, offset);
                 } else {
-                    FLDS(S14, IP, offset%1024);
+                    FLDS(S0, IP, offset%1024);
                     asm_add_imm(IP, rn, offset-(offset%1024));
                 }
                 break;
             default:
                 NanoAssertMsg(0, "LIR opcode unsupported by asm_load64.");
                 break;
         }
     } else {
@@ -1393,17 +1424,17 @@ Assembler::asm_load64(LIns* ins)
 }
 
 void
 Assembler::asm_store64(LOpcode op, LIns* value, int dr, LIns* base)
 {
     NanoAssert(value->isD());
 
     if (ARM_VFP) {
-        Register dd = findRegFor(value, FpRegs);
+        Register dd = findRegFor(value, FpRegs & ~rmask(D0));
         Register rn = findRegFor(base, GpRegs);
 
         switch (op) {
             case LIR_std:
                 // VFP can only do stores with a range of ±1020, so we might
                 // need to do some arithmetic to extend its range.
                 if (isU8(dr/4) || isU8(-dr/4)) {
                     FSTD(dd, rn, dr);
@@ -1411,24 +1442,25 @@ Assembler::asm_store64(LOpcode op, LIns*
                     FSTD(dd, IP, dr%1024);
                     asm_add_imm(IP, rn, dr-(dr%1024));
                 }
 
                 break;
             case LIR_std2f:
                 // VFP can only do stores with a range of ±1020, so we might
                 // need to do some arithmetic to extend its range.
+                evictIfActive(D0);
                 if (isU8(dr/4) || isU8(-dr/4)) {
-                    FSTS(S14, rn, dr);
+                    FSTS(S0, rn, dr);
                 } else {
-                    FSTS(S14, IP, dr%1024);
+                    FSTS(S0, IP, dr%1024);
                     asm_add_imm(IP, rn, dr-(dr%1024));
                 }
 
-                FCVTSD(S14, dd);
+                FCVTSD(S0, dd);
 
                 break;
             default:
                 NanoAssertMsg(0, "LIR opcode unsupported by asm_store64.");
                 break;
         }
     } else {
         int         d = findMemFor(value);
@@ -2118,59 +2150,62 @@ Assembler::B_cond_chk(ConditionCode _c, 
 
 /*
  * VFP
  */
 
 void
 Assembler::asm_i2d(LIns* ins)
 {
-    Register dd = prepareResultReg(ins, FpRegs);
+    Register dd = prepareResultReg(ins, FpRegs & ~rmask(D0));
     Register rt = findRegFor(ins->oprnd1(), GpRegs);
 
-    FSITOD(dd, S14);
-    FMSR(S14, rt);
+    evictIfActive(D0);
+    FSITOD(dd, S0);
+    FMSR(S0, rt);
 
     freeResourcesOf(ins);
 }
 
 void
 Assembler::asm_ui2d(LIns* ins)
 {
-    Register dd = prepareResultReg(ins, FpRegs);
+    Register dd = prepareResultReg(ins, FpRegs & ~rmask(D0));
     Register rt = findRegFor(ins->oprnd1(), GpRegs);
 
-    FUITOD(dd, S14);
-    FMSR(S14, rt);
+    evictIfActive(D0);
+    FUITOD(dd, S0);
+    FMSR(S0, rt);
 
     freeResourcesOf(ins);
 }
 
 void Assembler::asm_d2i(LIns* ins)
 {
+    evictIfActive(D0);
     if (ins->isInReg()) {
         Register rt = ins->getReg();
-        FMRS(rt, S14);
+        FMRS(rt, S0);
     } else {
         // There's no active result register, so store the result directly into
         // memory to avoid the FP->GP transfer cost on Cortex-A8.
         int32_t d = arDisp(ins);
         // VFP can only do stores with a range of ±1020, so we might need to do
         // some arithmetic to extend its range.
         if (isU8(d/4) || isU8(-d/4)) {
-            FSTS(S14, FP, d);
+            FSTS(S0, FP, d);
         } else {
-            FSTS(S14, IP, d%1024);
+            FSTS(S0, IP, d%1024);
             asm_add_imm(IP, FP, d-(d%1024));
         }
     }
 
-    Register dm = findRegFor(ins->oprnd1(), FpRegs);
+    Register dm = findRegFor(ins->oprnd1(), FpRegs & ~rmask(D0));
 
-    FTOSID(S14, dm);
+    FTOSID(S0, dm);
 
     freeResourcesOf(ins);
 }
 
 void
 Assembler::asm_fneg(LIns* ins)
 {
     LIns* lhs = ins->oprnd1();
@@ -2827,33 +2862,40 @@ void
 Assembler::asm_ret(LIns *ins)
 {
     genEpilogue();
 
     // NB: our contract with genEpilogue is actually that the return value
     // we are intending for R0 is currently IP, not R0. This has to do with
     // the strange dual-nature of the patchable jump in a side-exit. See
     // nPatchBranch.
-
-    MOV(IP, R0);
+    //
+    // With hardware floating point ABI we can skip this for retd.
+    if (!(ARM_EABI_HARD && ins->isop(LIR_retd))) {
+        MOV(IP, R0);
+    }
 
     // Pop the stack frame.
     MOV(SP,FP);
 
     releaseRegisters();
     assignSavedRegs();
     LIns *value = ins->oprnd1();
     if (ins->isop(LIR_reti)) {
         findSpecificRegFor(value, R0);
     }
     else {
         NanoAssert(ins->isop(LIR_retd));
         if (ARM_VFP) {
+#ifdef NJ_ARM_EABI_HARD_FLOAT
+            findSpecificRegFor(value, D0);
+#else
             Register reg = findRegFor(value, FpRegs);
             FMRRD(R0, R1, reg);
+#endif
         } else {
             NanoAssert(value->isop(LIR_ii2d));
             findSpecificRegFor(value->oprnd1(), R0); // lo
             findSpecificRegFor(value->oprnd2(), R1); // hi
         }
     }
 }
 
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -70,17 +70,29 @@
 namespace nanojit
 {
 #if defined VMCFG_DOUBLE_MSW_FIRST || defined _MSC_VER
 #  undef  NJ_ARM_EABI
 #else
 #  define NJ_ARM_EABI  1
 #endif
 
-// only d0-d6 are actually used; we'll use d7 as s14-s15 for i2d/u2f/etc.
+// GCC defines __ARM_PCS_VFP if it uses hardware floating point ABI
+// See http://gcc.gnu.org/viewcvs?view=revision&revision=162637
+#ifdef __ARM_PCS_VFP
+#  define NJ_ARM_EABI_HARD_FLOAT 1
+#endif
+
+#ifdef NJ_ARM_EABI_HARD_FLOAT
+#  define ARM_EABI_HARD true
+#else
+#  define ARM_EABI_HARD false
+#endif
+
+// only d0-d7 are used; in addition, we'll use d0 as s0-s1 for i2d/u2f/etc.
 #define NJ_VFP_MAX_REGISTERS            8
 #define NJ_MAX_REGISTERS                (11 + NJ_VFP_MAX_REGISTERS)
 #define NJ_MAX_STACK_ENTRY              4096
 #define NJ_MAX_PARAMETERS               16
 #define NJ_ALIGN_STACK                  8
 
 #define NJ_JTBL_SUPPORTED               1
 #define NJ_EXPANDED_LOADSTORE_SUPPORTED 1
@@ -113,41 +125,40 @@ static const Register
     R9  = { 9 },
     R10 = { 10 },
     FP  = { 11 },
     IP  = { 12 },
     SP  = { 13 },
     LR  = { 14 },
     PC  = { 15 },
 
-    // VFP regs (we currently only use D0-D6 and S14)
+    // VFP regs (we currently only use D0-D7 and S0)
     D0 = { 16 },
     D1 = { 17 },
     D2 = { 18 },
     D3 = { 19 },
     D4 = { 20 },
     D5 = { 21 },
     D6 = { 22 },
-    // S14 overlaps with D7 and is hard-coded into i2d and u2f operations, but
-    // D7 is still listed here for completeness and to facilitate assertions.
     D7 = { 23 },
     // D8-D15 are caller-saved registers that we don't currently handle.
 
     FirstFloatReg = D0,
-    LastFloatReg = D6,
+    LastFloatReg = D7,
 
     deprecated_UnknownReg = { 32 },     // XXX: remove eventually, see bug 538924
 
-    S14 = { 24 },
+    // S0 overlaps with D0 and is hard-coded into i2d and u2f operations
+    S0 = { 24 },
 
     SBZ = { 0 } ;   // Used for 'should-be-zero' fields in instructions with
                     // unused register fields.
 
 static const uint32_t FirstRegNum = R0;
-static const uint32_t LastRegNum = D6;
+static const uint32_t LastRegNum = D7;
 }
 
 #define NJ_USE_UINT32_REGISTER 1
 #include "NativeCommon.h"
 
 namespace nanojit
 {
 
@@ -184,16 +195,30 @@ typedef enum {
 #define OppositeCond(cc)  ((ConditionCode)((unsigned int)(cc)^0x1))
 
 typedef int RegisterMask;
 typedef struct _FragInfo {
     RegisterMask    needRestoring;
     NIns*           epilogue;
 } FragInfo;
 
+typedef struct _ParameterRegisters {
+    int stkd;
+    Register r;
+#ifdef NJ_ARM_EABI_HARD_FLOAT
+    Register float_r;
+#endif
+} ParameterRegisters;
+
+#ifdef NJ_ARM_EABI_HARD_FLOAT
+#define init_params(a,b,c) { (a), (b), (c) }
+#else
+#define init_params(a,b,c) { (a), (b) }
+#endif
+
 // D0-D7 are not saved; D8-D15 are, but we don't use those,
 // so we don't have to worry about saving/restoring them
 static const RegisterMask SavedFpRegs = 0;
 static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10;
 static const int NumSavedRegs = 7;
 
 static const RegisterMask FpRegs = 1<<D0 | 1<<D1 | 1<<D2 | 1<<D3 | 1<<D4 | 1<<D5 | 1<<D6 | 1<<D7;
 static const RegisterMask GpRegs = 0xFFFF;
@@ -248,18 +273,18 @@ verbose_only( extern const char* shiftNa
     void        asm_regarg(ArgType, LIns*, Register);                           \
     void        asm_stkarg(LIns* p, int stkd);                                  \
     void        asm_cmpi(Register, int32_t imm);                                \
     void        asm_ldr_chk(Register d, Register b, int32_t off, bool chk);     \
     int32_t     asm_str(Register rt, Register rr, int32_t off);                 \
     void        asm_cmp(LIns *cond);                                            \
     void        asm_cmpd(LIns *cond);                                           \
     void        asm_ld_imm(Register d, int32_t imm, bool chk = true);           \
-    void        asm_arg(ArgType ty, LIns* arg, Register& r, int& stkd);         \
-    void        asm_arg_64(LIns* arg, Register& r, int& stkd);                  \
+    void        asm_arg(ArgType ty, LIns* arg, ParameterRegisters& params);     \
+    void        asm_arg_64(LIns* arg, ParameterRegisters& params);              \
     void        asm_add_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     void        asm_sub_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     void        asm_and_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     void        asm_orr_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     void        asm_eor_imm(Register rd, Register rn, int32_t imm, int stat = 0);   \
     inline bool     encOp2Imm(uint32_t literal, uint32_t * enc);                \
     inline uint32_t CountLeadingZeroes(uint32_t data);                          \
     int *       _nSlot;                                                         \
@@ -905,18 +930,18 @@ enum {
         *(--_nIns) = (NIns)( COND_AL | (0xD1<<20) | ((_Rn)<<16) | (FpRegNum(_Dd)<<12) | (0xB<<8) | negflag | ((offs>>2)&0xff) ); \
         asm_output("fldd %s,%s(%d)", gpn(_Dd), gpn(_Rn), _offs);       \
     } while (0)
 #define FLDD(_Dd,_Rn,_offs) FLDD_chk(_Dd,_Rn,_offs,1)
 
 #define FUITOD(_Dd,_Sm) do {                                            \
         underrunProtect(4);                                             \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(IsFpReg(_Dd) && ((_Sm) == S14));                     \
-        *(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2D<<6) | (0<<5) | (0x7) ); \
+        NanoAssert(IsFpReg(_Dd) && ((_Sm) == S0));                     \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2D<<6) | (0<<5) | (0x0) ); \
         asm_output("fuitod %s,%s", gpn(_Dd), gpn(_Sm));                \
     } while (0)
 
 #define FNEGD(_Dd,_Dm) do {                                             \
         underrunProtect(4);                                             \
         NanoAssert(ARM_VFP);                                    \
         NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dm));                       \
         *(--_nIns) = (NIns)( COND_AL | (0xEB1<<16) | (FpRegNum(_Dd)<<12) | (0xB4<<4) | (FpRegNum(_Dm)) ); \
@@ -979,111 +1004,111 @@ enum {
         *(--_nIns) = (NIns)( ((_cond)<<28) | (0xEB0<<16) | (FpRegNum(_Dd)<<12) | (0xB4<<4) | (FpRegNum(_Dm)) ); \
         asm_output("fcpyd%s %s,%s", condNames[_cond], gpn(_Dd), gpn(_Dm));  \
     } while (0)
 #define FCPYD(_Dd,_Dm)      FCPYD_cond(AL,_Dd,_Dm)
 
 #define FMRS(_Rd,_Sn) do {                                              \
         underrunProtect(4);                                             \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(((_Sn) == S14) && IsGpReg(_Rd));                     \
-        *(--_nIns) = (NIns)( COND_AL | (0xE1<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
+        NanoAssert(((_Sn) == S0) && IsGpReg(_Rd));                     \
+        *(--_nIns) = (NIns)( COND_AL | (0xE1<<20) | (0x0<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
         asm_output("fmrs %s,%s", gpn(_Rd), gpn(_Sn));                  \
     } while (0)
 
 /*
- * The following instructions can only be used with S14 as the
+ * The following instructions can only be used with S0 as the
  * single-precision register; that limitation can be removed if
  * needed, but we'd have to teach NJ about all the single precision
  * regs, and their encoding is strange (top 4 bits usually in a block,
  * low bit elsewhere).
  */
 
 #define FSITOD(_Dd,_Sm) do {                                            \
         underrunProtect(4);                                             \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(IsFpReg(_Dd) && ((_Sm) == S14));                     \
-        *(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2F<<6) | (0<<5) | (0x7) ); \
+        NanoAssert(IsFpReg(_Dd) && ((_Sm) == S0));                     \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2F<<6) | (0<<5) | (0x0) ); \
         asm_output("fsitod %s,%s", gpn(_Dd), gpn(_Sm));                \
     } while (0)
 
 #define FMSR(_Sn,_Rd) do {                                              \
         underrunProtect(4);                                             \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(((_Sn) == S14) && IsGpReg(_Rd));                     \
-        *(--_nIns) = (NIns)( COND_AL | (0xE0<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
+        NanoAssert(((_Sn) == S0) && IsGpReg(_Rd));                     \
+        *(--_nIns) = (NIns)( COND_AL | (0xE0<<20) | (0x0<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
         asm_output("fmsr %s,%s", gpn(_Sn), gpn(_Rd));                  \
     } while (0)
 
 #define FMRS(_Rd,_Sn) do {                                              \
         underrunProtect(4);                                             \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(((_Sn) == S14) && IsGpReg(_Rd));                     \
-        *(--_nIns) = (NIns)( COND_AL | (0xE1<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
+        NanoAssert(((_Sn) == S0) && IsGpReg(_Rd));                     \
+        *(--_nIns) = (NIns)( COND_AL | (0xE1<<20) | (0x0<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
         asm_output("fmrs %s,%s", gpn(_Rd), gpn(_Sn));                  \
     } while (0)
 
 #define FMSR(_Sn,_Rd) do {                                              \
         underrunProtect(4);                                             \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(((_Sn) == S14) && IsGpReg(_Rd));                     \
-        *(--_nIns) = (NIns)( COND_AL | (0xE0<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
+        NanoAssert(((_Sn) == S0) && IsGpReg(_Rd));                     \
+        *(--_nIns) = (NIns)( COND_AL | (0xE0<<20) | (0x0<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
         asm_output("fmsr %s,%s", gpn(_Sn), gpn(_Rd));                  \
     } while (0)
 
 #define FCVTSD(_Sd,_Dm) do {                        \
         underrunProtect(4);                         \
         NanoAssert(ARM_VFP);                \
-        NanoAssert(((_Sd) == S14) && IsFpReg(_Dm)); \
-        *(--_nIns) = (NIns)( COND_AL | (0xEB7<<16) | (0x7<<12) | (0xBC<<4) | (FpRegNum(_Dm)) ); \
-        asm_output("[0x%08x] fcvtsd s14,%s", *_nIns, gpn(_Dm));                          \
+        NanoAssert(((_Sd) == S0) && IsFpReg(_Dm)); \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB7<<16) | (0x0<<12) | (0xBC<<4) | (FpRegNum(_Dm)) ); \
+        asm_output("[0x%08x] fcvtsd s0,%s", *_nIns, gpn(_Dm));                          \
     } while (0)
 
 #define FCVTDS(_Dd,_Sm) do {                                    \
         underrunProtect(4);                                     \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(((_Sm) == S14) && IsFpReg(_Dd));             \
-        *(--_nIns) = (NIns)( COND_AL | (0xEB7<<16) | (FpRegNum(_Dd)<<12) | (0xAC<<4) | (0x7) ); \
-        asm_output("fcvtds %s,s14", gpn(_Dd));                  \
+        NanoAssert(((_Sm) == S0) && IsFpReg(_Dd));             \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB7<<16) | (FpRegNum(_Dd)<<12) | (0xAC<<4) | (0x0) ); \
+        asm_output("fcvtds %s,s0", gpn(_Dd));                  \
     } while(0)
 
 #define FLDS(_Sd,_Rn,_offs) do {                                \
         underrunProtect(4);                                     \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(((_Sd) == S14) && !IsFpReg(_Rn));            \
+        NanoAssert(((_Sd) == S0) && !IsFpReg(_Rn));            \
         NanoAssert(((_offs)%4) == 0);                           \
         NanoAssert((isU8((_offs)/4)) || isU8(-(_offs)/4));      \
         int addflag = 1<<23;                                    \
         intptr_t offs = (_offs);                                \
         if (offs < 0) {                                         \
             addflag = 0;                                        \
             offs = -offs;                                       \
         }                                                       \
-        *(--_nIns) = (NIns)( COND_AL | (0xD1<<20) | ((_Rn)<<16) | (0x7<<12) | (0xA << 8) | addflag | ((offs>>2)&0xff) ); \
-        asm_output("flds s14, [%s, #%d]", gpn(_Rn), (_offs));   \
+        *(--_nIns) = (NIns)( COND_AL | (0xD1<<20) | ((_Rn)<<16) | (0x0<<12) | (0xA << 8) | addflag | ((offs>>2)&0xff) ); \
+        asm_output("flds s0, [%s, #%d]", gpn(_Rn), (_offs));   \
     } while (0)
 
 #define FSTS(_Sd,_Rn,_offs) do {                                \
         underrunProtect(4);                                     \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(((_Sd) == S14) && !IsFpReg(_Rn));            \
+        NanoAssert(((_Sd) == S0) && !IsFpReg(_Rn));            \
         NanoAssert(((_offs)%4) == 0);                           \
         NanoAssert((isU8((_offs)/4)) || isU8(-(_offs)/4));      \
         int addflag = 1<<23;                                    \
         intptr_t offs = (_offs);                                \
         if (offs < 0) {                                         \
             addflag = 0;                                        \
             offs = -offs;                                       \
         }                                                       \
-        *(--_nIns) = (NIns)( COND_AL | (0xD0<<20) | ((_Rn)<<16) | (0x7<<12) | (0xA << 8) | addflag | ((offs>>2)&0xff) ); \
-        asm_output("fsts s14, [%s, #%d]", gpn(_Rn), (_offs));   \
+        *(--_nIns) = (NIns)( COND_AL | (0xD0<<20) | ((_Rn)<<16) | (0x0<<12) | (0xA << 8) | addflag | ((offs>>2)&0xff) ); \
+        asm_output("fsts s0, [%s, #%d]", gpn(_Rn), (_offs));   \
     } while (0)
 
 #define FTOSID(_Sd,_Dm) do {                                   \
         underrunProtect(4);                                    \
         NanoAssert(ARM_VFP);                           \
-        NanoAssert(((_Sd) == S14) && IsFpReg(_Dm));            \
-        *(--_nIns) = (NIns)( COND_AL | (0xEBD<<16) | (0x7<<12) | (0xB4<<4) | FpRegNum(_Dm) ); \
-        asm_output("ftosid s14, %s", gpn(_Dm));                \
+        NanoAssert(((_Sd) == S0) && IsFpReg(_Dm));            \
+        *(--_nIns) = (NIns)( COND_AL | (0xEBD<<16) | (0x0<<12) | (0xB4<<4) | FpRegNum(_Dm) ); \
+        asm_output("ftosid s0, %s", gpn(_Dm));                \
     } while (0)
 
 } // namespace nanojit
 #endif // __nanojit_NativeARM__