Bug 789753 - Save non-volatile XMM registers on Win64 ABI. r=dvander
author Makoto Kato <m_kato@ga2.so-net.ne.jp>
Thu, 04 Oct 2012 18:15:31 +0900
changeset 109342 3ce909bed24a824050f69bc0649c71309fae1767
parent 109341 1415aa5411b1355700b80f3728ee1503abf455eb
child 109343 38b9f2f85de0a615e784a99612323e478a25d708
push id 82
push user shu@rfrn.org
push date Fri, 05 Oct 2012 13:20:22 +0000
reviewers dvander
bugs 789753
milestone 18.0a1
Bug 789753 - Save non-volatile XMM registers on Win64 ABI. r=dvander
js/src/assembler/assembler/X86Assembler.h
js/src/ion/shared/Assembler-x86-shared.h
js/src/ion/x64/Trampoline-x64.cpp
js/src/methodjit/TrampolineMasmX64.asm
--- a/js/src/assembler/assembler/X86Assembler.h
+++ b/js/src/assembler/assembler/X86Assembler.h
@@ -282,19 +282,21 @@ private:
         OP2_CVTSD2SS_VsdEd  = 0x5A,
         OP2_SUBSD_VsdWsd    = 0x5C,
         OP2_DIVSD_VsdWsd    = 0x5E,
         OP2_SQRTSD_VsdWsd   = 0x51,
         OP2_ANDPD_VpdWpd    = 0x54,
         OP2_ORPD_VpdWpd     = 0x56,
         OP2_XORPD_VpdWpd    = 0x57,
         OP2_MOVD_VdEd       = 0x6E,
+        OP2_MOVDQA_VsdWsd   = 0x6F,
         OP2_PSRLDQ_Vd       = 0x73,
         OP2_PCMPEQW         = 0x75,
         OP2_MOVD_EdVd       = 0x7E,
+        OP2_MOVDQA_WsdVsd   = 0x7F,
         OP2_JCC_rel32       = 0x80,
         OP_SETCC            = 0x90,
         OP2_IMUL_GvEv       = 0xAF,
         OP2_MOVSX_GvEb      = 0xBE,
         OP2_MOVSX_GvEw      = 0xBF,
         OP2_MOVZX_GvEb      = 0xB6,
         OP2_MOVZX_GvEw      = 0xB7,
         OP2_PEXTRW_GdUdIb   = 0xC5
@@ -2360,16 +2362,52 @@ public:
         js::JaegerSpew(js::JSpew_Insns,
                        IPFX "movsd      %p, %s\n", MAYBE_PAD,
                        address, nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_F2);
         m_formatter.twoByteOp(OP2_MOVSD_VsdWsd, (RegisterID)dst, address);
     }
 #endif
 
+    void movdqa_rm(XMMRegisterID src, int offset, RegisterID base)
+    {   // Emit the store form of MOVDQA: XMM register |src| -> memory operand offset(base).
+        js::JaegerSpew(js::JSpew_Insns,   // log a textual form of the instruction being emitted
+                       IPFX "movdqa     %s, %s0x%x(%s)\n", MAYBE_PAD,
+                       nameFPReg(src), PRETTY_PRINT_OFFSET(offset), nameIReg(base));
+        m_formatter.prefix(PRE_SSE_66);   // prefix is emitted ahead of the two-byte opcode
+        m_formatter.twoByteOp(OP2_MOVDQA_WsdVsd, (RegisterID)src, base, offset);   // 0x7F: store-direction opcode
+    }
+
+    void movdqa_rm(XMMRegisterID src, int offset, RegisterID base, RegisterID index, int scale)
+    {   // Emit the store form of MOVDQA: XMM register |src| -> memory operand offset(base,index,scale).
+        js::JaegerSpew(js::JSpew_Insns,   // log a textual form of the instruction being emitted
+                       IPFX "movdqa     %s, %d(%s,%s,%d)\n", MAYBE_PAD,
+                       nameFPReg(src), offset, nameIReg(base), nameIReg(index), scale);
+        m_formatter.prefix(PRE_SSE_66);   // prefix is emitted ahead of the two-byte opcode
+        m_formatter.twoByteOp(OP2_MOVDQA_WsdVsd, (RegisterID)src, base, index, scale, offset);   // NOTE(review): |scale| is forwarded as-is; its encoding is not visible here
+    }
+
+    void movdqa_mr(int offset, RegisterID base, XMMRegisterID dst)
+    {   // Emit the load form of MOVDQA: memory operand offset(base) -> XMM register |dst|.
+        js::JaegerSpew(js::JSpew_Insns,   // log a textual form of the instruction being emitted
+                       IPFX "movdqa     %s0x%x(%s), %s\n", MAYBE_PAD,
+                       PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);   // prefix is emitted ahead of the two-byte opcode
+        m_formatter.twoByteOp(OP2_MOVDQA_VsdWsd, (RegisterID)dst, base, offset);   // 0x6F: load-direction opcode
+    }
+
+    void movdqa_mr(int offset, RegisterID base, RegisterID index, int scale, XMMRegisterID dst)
+    {   // Emit the load form of MOVDQA: memory operand offset(base,index,scale) -> XMM register |dst|.
+        js::JaegerSpew(js::JSpew_Insns,   // log a textual form of the instruction being emitted
+                       IPFX "movdqa     %d(%s,%s,%d), %s\n", MAYBE_PAD,
+                       offset, nameIReg(base), nameIReg(index), scale, nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);   // prefix is emitted ahead of the two-byte opcode
+        m_formatter.twoByteOp(OP2_MOVDQA_VsdWsd, (RegisterID)dst, base, index, scale, offset);   // NOTE(review): |scale| is forwarded as-is; its encoding is not visible here
+    }
+
     void mulsd_rr(XMMRegisterID src, XMMRegisterID dst)
     {
         js::JaegerSpew(js::JSpew_Insns,
                        IPFX "mulsd      %s, %s\n", MAYBE_PAD,
                        nameFPReg(src), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_F2);
         m_formatter.twoByteOp(OP2_MULSD_VsdWsd, (RegisterID)dst, (RegisterID)src);
     }
--- a/js/src/ion/shared/Assembler-x86-shared.h
+++ b/js/src/ion/shared/Assembler-x86-shared.h
@@ -286,16 +286,40 @@ class AssemblerX86Shared
             break;
           case Operand::SCALE:
             masm.movss_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
             break;
           default:
             JS_NOT_REACHED("unexpected operand kind");
         }
     }
+    void movdqa(const Operand &src, const FloatRegister &dest) {   // MOVDQA load: memory operand |src| -> |dest|
+        switch (src.kind()) {   // dispatch on the addressing form of the memory operand
+          case Operand::REG_DISP:   // base register + displacement
+            masm.movdqa_mr(src.disp(), src.base(), dest.code());
+            break;
+          case Operand::SCALE:   // base + index (with scale) + displacement
+            masm.movdqa_mr(src.disp(), src.base(), src.index(), src.scale(), dest.code());
+            break;
+          default:   // every other operand kind is unsupported by this overload
+            JS_NOT_REACHED("unexpected operand kind");
+        }
+    }
+    void movdqa(const FloatRegister &src, const Operand &dest) {   // MOVDQA store: |src| -> memory operand |dest|
+        switch (dest.kind()) {   // dispatch on the addressing form of the memory operand
+          case Operand::REG_DISP:   // base register + displacement
+            masm.movdqa_rm(src.code(), dest.disp(), dest.base());
+            break;
+          case Operand::SCALE:   // base + index (with scale) + displacement
+            masm.movdqa_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
+            break;
+          default:   // every other operand kind is unsupported by this overload
+            JS_NOT_REACHED("unexpected operand kind");
+        }
+    }
     void cvtss2sd(const FloatRegister &src, const FloatRegister &dest) {
         masm.cvtss2sd_rr(src.code(), dest.code());
     }
     void cvtsd2ss(const FloatRegister &src, const FloatRegister &dest) {
         masm.cvtsd2ss_rr(src.code(), dest.code());
     }
     void movzbl(const Operand &src, const Register &dest) {
         switch (src.kind()) {
--- a/js/src/ion/x64/Trampoline-x64.cpp
+++ b/js/src/ion/x64/Trampoline-x64.cpp
@@ -51,16 +51,30 @@ IonCompartment::generateEnterJIT(JSConte
     masm.push(rbx);
     masm.push(r12);
     masm.push(r13);
     masm.push(r14);
     masm.push(r15);
 #if defined(_WIN64)
     masm.push(rdi);
     masm.push(rsi);
+
+    // Reserve room for xmm6-xmm15; the extra 8 bytes provide the 16-byte alignment movdqa needs.
+    masm.subq(Imm32(16 * 10 + 8), rsp);
+
+    masm.movdqa(xmm6, Operand(rsp, 16 * 0));
+    masm.movdqa(xmm7, Operand(rsp, 16 * 1));
+    masm.movdqa(xmm8, Operand(rsp, 16 * 2));
+    masm.movdqa(xmm9, Operand(rsp, 16 * 3));
+    masm.movdqa(xmm10, Operand(rsp, 16 * 4));
+    masm.movdqa(xmm11, Operand(rsp, 16 * 5));
+    masm.movdqa(xmm12, Operand(rsp, 16 * 6));
+    masm.movdqa(xmm13, Operand(rsp, 16 * 7));
+    masm.movdqa(xmm14, Operand(rsp, 16 * 8));
+    masm.movdqa(xmm15, Operand(rsp, 16 * 9));
 #endif
 
     // Save arguments passed in registers needed after function call.
     masm.push(result);
 
     // Remember stack depth without padding and arguments.
     masm.mov(rsp, r14);
 
@@ -130,16 +144,29 @@ IonCompartment::generateEnterJIT(JSConte
     /*****************************************************************
     Place return value where it belongs, pop all saved registers
     *****************************************************************/
     masm.pop(r12); // vp
     masm.storeValue(JSReturnOperand, Operand(r12, 0));
 
     // Restore non-volatile registers.
 #if defined(_WIN64)
+    masm.movdqa(Operand(rsp, 16 * 0), xmm6);
+    masm.movdqa(Operand(rsp, 16 * 1), xmm7);
+    masm.movdqa(Operand(rsp, 16 * 2), xmm8);
+    masm.movdqa(Operand(rsp, 16 * 3), xmm9);
+    masm.movdqa(Operand(rsp, 16 * 4), xmm10);
+    masm.movdqa(Operand(rsp, 16 * 5), xmm11);
+    masm.movdqa(Operand(rsp, 16 * 6), xmm12);
+    masm.movdqa(Operand(rsp, 16 * 7), xmm13);
+    masm.movdqa(Operand(rsp, 16 * 8), xmm14);
+    masm.movdqa(Operand(rsp, 16 * 9), xmm15);
+
+    masm.addq(Imm32(16 * 10 + 8), rsp);
+
     masm.pop(rsi);
     masm.pop(rdi);
 #endif
     masm.pop(r15);
     masm.pop(r14);
     masm.pop(r13);
     masm.pop(r12);
     masm.pop(rbx);
--- a/js/src/methodjit/TrampolineMasmX64.asm
+++ b/js/src/methodjit/TrampolineMasmX64.asm
@@ -27,16 +27,42 @@ JaegerTrampoline PROC FRAME
     push    r15
     .PUSHREG r15
     push    rdi
     .PUSHREG rdi
     push    rsi
     .PUSHREG rsi
     push    rbx
     .PUSHREG rbx
+    sub     rsp, 16*10+8
+    .ALLOCSTACK 168
+    ; .SAVEXMM128 only accepts offsets that are multiples of 16
+    movdqa  xmmword ptr [rsp], xmm6
+    .SAVEXMM128 xmm6, 0
+    movdqa  xmmword ptr [rsp+16], xmm7
+    .SAVEXMM128 xmm7, 16
+    movdqa  xmmword ptr [rsp+16*2], xmm8
+    .SAVEXMM128 xmm8, 32
+    movdqa  xmmword ptr [rsp+16*3], xmm9
+    .SAVEXMM128 xmm9, 48
+    movdqa  xmmword ptr [rsp+16*4], xmm10
+    .SAVEXMM128 xmm10, 64
+    movdqa  xmmword ptr [rsp+16*5], xmm11
+    .SAVEXMM128 xmm11, 80
+    movdqa  xmmword ptr [rsp+16*6], xmm12
+    .SAVEXMM128 xmm12, 96
+    movdqa  xmmword ptr [rsp+16*7], xmm13
+    .SAVEXMM128 xmm13, 112
+    movdqa  xmmword ptr [rsp+16*8], xmm14
+    .SAVEXMM128 xmm14, 128
+    movdqa  xmmword ptr [rsp+16*9], xmm15
+    .SAVEXMM128 xmm15, 144
+    ; stack alignment for Win64 ABI
+    sub     rsp, 8
+    .ALLOCSTACK 8
     .ENDPROLOG
 
     ; Load mask registers
     mov     r13, 0ffff800000000000h
     mov     r14, 7fffffffffffh
 
     ; Build the JIT frame.
     ; rcx = cx
@@ -73,17 +99,27 @@ JaegerTrampoline ENDP
 JaegerTrampolineReturn PROC FRAME
     .ENDPROLOG
     or      rsi, rdi
     mov     qword ptr [rbx+30h], rsi
     sub     rsp, 20h
     lea     rcx, [rsp+20h]
     call    PopActiveVMFrame
 
-    add     rsp, 68h+20h
+    movdqa  xmm6, xmmword ptr [rsp+68h+20h+8+16*0]   ; reload xmm6-xmm15 before rsp moves:
+    movdqa  xmm7, xmmword ptr [rsp+68h+20h+8+16*1]   ; Win64 has no red zone, so the save
+    movdqa  xmm8, xmmword ptr [rsp+68h+20h+8+16*2]   ; area must not be read once it lies
+    movdqa  xmm9, xmmword ptr [rsp+68h+20h+8+16*3]   ; below the stack pointer.
+    movdqa  xmm10, xmmword ptr [rsp+68h+20h+8+16*4]
+    movdqa  xmm11, xmmword ptr [rsp+68h+20h+8+16*5]
+    movdqa  xmm12, xmmword ptr [rsp+68h+20h+8+16*6]
+    movdqa  xmm13, xmmword ptr [rsp+68h+20h+8+16*7]
+    movdqa  xmm14, xmmword ptr [rsp+68h+20h+8+16*8]
+    movdqa  xmm15, xmmword ptr [rsp+68h+20h+8+16*9]
+    add     rsp, 68h+20h+8+16*10+8                    ; same total adjustment; rsp now points at the saved rbx
     pop     rbx
     pop     rsi
     pop     rdi
     pop     r15
     pop     r14
     pop     r13
     pop     r12
     pop     rbp
@@ -102,17 +138,27 @@ JaegerThrowpoline PROC FRAME
     test    rax, rax
     je      throwpoline_exit
     add     rsp, 20h
     jmp     rax
 
 throwpoline_exit:
     lea     rcx, [rsp+20h]
     call    PopActiveVMFrame
-    add     rsp, 68h+20h
+    movdqa  xmm6, xmmword ptr [rsp+68h+20h+8+16*0]   ; reload xmm6-xmm15 before rsp moves:
+    movdqa  xmm7, xmmword ptr [rsp+68h+20h+8+16*1]   ; Win64 has no red zone, so the save
+    movdqa  xmm8, xmmword ptr [rsp+68h+20h+8+16*2]   ; area must not be read once it lies
+    movdqa  xmm9, xmmword ptr [rsp+68h+20h+8+16*3]   ; below the stack pointer.
+    movdqa  xmm10, xmmword ptr [rsp+68h+20h+8+16*4]
+    movdqa  xmm11, xmmword ptr [rsp+68h+20h+8+16*5]
+    movdqa  xmm12, xmmword ptr [rsp+68h+20h+8+16*6]
+    movdqa  xmm13, xmmword ptr [rsp+68h+20h+8+16*7]
+    movdqa  xmm14, xmmword ptr [rsp+68h+20h+8+16*8]
+    movdqa  xmm15, xmmword ptr [rsp+68h+20h+8+16*9]
+    add     rsp, 68h+20h+8+16*10+8                    ; same total adjustment; rsp now points at the saved rbx
     pop     rbx
     pop     rsi
     pop     rdi
     pop     r15
     pop     r14
     pop     r13
     pop     r12
     pop     rbp
@@ -136,17 +182,27 @@ JaegerInterpoline PROC FRAME
     test    rax, rax
     je      interpoline_exit
     add     rsp, 20h
     jmp     rax
 
 interpoline_exit:
     lea     rcx, [rsp+20h]
     call    PopActiveVMFrame
-    add     rsp, 68h+20h
+    movdqa  xmm6, xmmword ptr [rsp+68h+20h+8+16*0]   ; reload xmm6-xmm15 before rsp moves:
+    movdqa  xmm7, xmmword ptr [rsp+68h+20h+8+16*1]   ; Win64 has no red zone, so the save
+    movdqa  xmm8, xmmword ptr [rsp+68h+20h+8+16*2]   ; area must not be read once it lies
+    movdqa  xmm9, xmmword ptr [rsp+68h+20h+8+16*3]   ; below the stack pointer.
+    movdqa  xmm10, xmmword ptr [rsp+68h+20h+8+16*4]
+    movdqa  xmm11, xmmword ptr [rsp+68h+20h+8+16*5]
+    movdqa  xmm12, xmmword ptr [rsp+68h+20h+8+16*6]
+    movdqa  xmm13, xmmword ptr [rsp+68h+20h+8+16*7]
+    movdqa  xmm14, xmmword ptr [rsp+68h+20h+8+16*8]
+    movdqa  xmm15, xmmword ptr [rsp+68h+20h+8+16*9]
+    add     rsp, 68h+20h+8+16*10+8                    ; same total adjustment; rsp now points at the saved rbx
     pop     rbx
     pop     rsi
     pop     rdi
     pop     r15
     pop     r14
     pop     r13
     pop     r12
     pop     rbp