Bug 789753 - Save non-volatile XMM registers on Win64 ABI. r=dvander
author Makoto Kato <m_kato@ga2.so-net.ne.jp>
Thu, 04 Oct 2012 18:15:31 +0900
changeset 115531 3ce909bed24a824050f69bc0649c71309fae1767
parent 115530 1415aa5411b1355700b80f3728ee1503abf455eb
child 115532 38b9f2f85de0a615e784a99612323e478a25d708
push id 1708
push user akeybl@mozilla.com
push date Mon, 19 Nov 2012 21:10:21 +0000
treeherder mozilla-beta@27b14fe50103 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers dvander
bugs 789753
milestone 18.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 789753 - Save non-volatile XMM registers on Win64 ABI. r=dvander
js/src/assembler/assembler/X86Assembler.h
js/src/ion/shared/Assembler-x86-shared.h
js/src/ion/x64/Trampoline-x64.cpp
js/src/methodjit/TrampolineMasmX64.asm
--- a/js/src/assembler/assembler/X86Assembler.h
+++ b/js/src/assembler/assembler/X86Assembler.h
@@ -282,19 +282,21 @@ private:
         OP2_CVTSD2SS_VsdEd  = 0x5A,
         OP2_SUBSD_VsdWsd    = 0x5C,
         OP2_DIVSD_VsdWsd    = 0x5E,
         OP2_SQRTSD_VsdWsd   = 0x51,
         OP2_ANDPD_VpdWpd    = 0x54,
         OP2_ORPD_VpdWpd     = 0x56,
         OP2_XORPD_VpdWpd    = 0x57,
         OP2_MOVD_VdEd       = 0x6E,
+        OP2_MOVDQA_VsdWsd   = 0x6F,
         OP2_PSRLDQ_Vd       = 0x73,
         OP2_PCMPEQW         = 0x75,
         OP2_MOVD_EdVd       = 0x7E,
+        OP2_MOVDQA_WsdVsd   = 0x7F,
         OP2_JCC_rel32       = 0x80,
         OP_SETCC            = 0x90,
         OP2_IMUL_GvEv       = 0xAF,
         OP2_MOVSX_GvEb      = 0xBE,
         OP2_MOVSX_GvEw      = 0xBF,
         OP2_MOVZX_GvEb      = 0xB6,
         OP2_MOVZX_GvEw      = 0xB7,
         OP2_PEXTRW_GdUdIb   = 0xC5
@@ -2360,16 +2362,52 @@ public:
         js::JaegerSpew(js::JSpew_Insns,
                        IPFX "movsd      %p, %s\n", MAYBE_PAD,
                        address, nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_F2);
         m_formatter.twoByteOp(OP2_MOVSD_VsdWsd, (RegisterID)dst, address);
     }
 #endif
 
+    void movdqa_rm(XMMRegisterID src, int offset, RegisterID base)
+    { // MOVDQA store form: logs the instruction, then emits the 0x66 prefix and two-byte opcode OP2_MOVDQA_WsdVsd with src as the register operand and offset(base) as the memory operand.
+        js::JaegerSpew(js::JSpew_Insns,
+                       IPFX "movdqa     %s, %s0x%x(%s)\n", MAYBE_PAD,
+                       nameFPReg(src), PRETTY_PRINT_OFFSET(offset), nameIReg(base));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.twoByteOp(OP2_MOVDQA_WsdVsd, (RegisterID)src, base, offset);
+    }
+
+    void movdqa_rm(XMMRegisterID src, int offset, RegisterID base, RegisterID index, int scale)
+    { // MOVDQA store form with an indexed address: logs the instruction, then emits the 0x66 prefix and two-byte opcode OP2_MOVDQA_WsdVsd with src as the register operand and offset(base,index,scale) as the memory operand.
+        js::JaegerSpew(js::JSpew_Insns,
+                       IPFX "movdqa     %s, %d(%s,%s,%d)\n", MAYBE_PAD,
+                       nameFPReg(src), offset, nameIReg(base), nameIReg(index), scale);
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.twoByteOp(OP2_MOVDQA_WsdVsd, (RegisterID)src, base, index, scale, offset);
+    }
+
+    void movdqa_mr(int offset, RegisterID base, XMMRegisterID dst)
+    { // MOVDQA load form: logs the instruction, then emits the 0x66 prefix and two-byte opcode OP2_MOVDQA_VsdWsd with dst as the register operand and offset(base) as the memory operand.
+        js::JaegerSpew(js::JSpew_Insns,
+                       IPFX "movdqa     %s0x%x(%s), %s\n", MAYBE_PAD,
+                       PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.twoByteOp(OP2_MOVDQA_VsdWsd, (RegisterID)dst, base, offset);
+    }
+
+    void movdqa_mr(int offset, RegisterID base, RegisterID index, int scale, XMMRegisterID dst)
+    { // MOVDQA load form with an indexed address: logs the instruction, then emits the 0x66 prefix and two-byte opcode OP2_MOVDQA_VsdWsd with dst as the register operand and offset(base,index,scale) as the memory operand.
+        js::JaegerSpew(js::JSpew_Insns,
+                       IPFX "movdqa     %d(%s,%s,%d), %s\n", MAYBE_PAD,
+                       offset, nameIReg(base), nameIReg(index), scale, nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.twoByteOp(OP2_MOVDQA_VsdWsd, (RegisterID)dst, base, index, scale, offset);
+    }
+
     void mulsd_rr(XMMRegisterID src, XMMRegisterID dst)
     {
         js::JaegerSpew(js::JSpew_Insns,
                        IPFX "mulsd      %s, %s\n", MAYBE_PAD,
                        nameFPReg(src), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_F2);
         m_formatter.twoByteOp(OP2_MULSD_VsdWsd, (RegisterID)dst, (RegisterID)src);
     }
--- a/js/src/ion/shared/Assembler-x86-shared.h
+++ b/js/src/ion/shared/Assembler-x86-shared.h
@@ -286,16 +286,40 @@ class AssemblerX86Shared
             break;
           case Operand::SCALE:
             masm.movss_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
             break;
           default:
             JS_NOT_REACHED("unexpected operand kind");
         }
     }
+    void movdqa(const Operand &src, const FloatRegister &dest) { // movdqa load: memory operand src -> XMM register dest
+        switch (src.kind()) {
+          case Operand::REG_DISP: // base + displacement address
+            masm.movdqa_mr(src.disp(), src.base(), dest.code());
+            break;
+          case Operand::SCALE: // base + index/scale + displacement address
+            masm.movdqa_mr(src.disp(), src.base(), src.index(), src.scale(), dest.code());
+            break;
+          default: // only the two memory operand kinds above are handled
+            JS_NOT_REACHED("unexpected operand kind");
+        }
+    }
+    void movdqa(const FloatRegister &src, const Operand &dest) { // movdqa store: XMM register src -> memory operand dest
+        switch (dest.kind()) {
+          case Operand::REG_DISP: // base + displacement address
+            masm.movdqa_rm(src.code(), dest.disp(), dest.base());
+            break;
+          case Operand::SCALE: // base + index/scale + displacement address
+            masm.movdqa_rm(src.code(), dest.disp(), dest.base(), dest.index(), dest.scale());
+            break;
+          default: // only the two memory operand kinds above are handled
+            JS_NOT_REACHED("unexpected operand kind");
+        }
+    }
     void cvtss2sd(const FloatRegister &src, const FloatRegister &dest) {
         masm.cvtss2sd_rr(src.code(), dest.code());
     }
     void cvtsd2ss(const FloatRegister &src, const FloatRegister &dest) {
         masm.cvtsd2ss_rr(src.code(), dest.code());
     }
     void movzbl(const Operand &src, const Register &dest) {
         switch (src.kind()) {
--- a/js/src/ion/x64/Trampoline-x64.cpp
+++ b/js/src/ion/x64/Trampoline-x64.cpp
@@ -51,16 +51,30 @@ IonCompartment::generateEnterJIT(JSConte
     masm.push(rbx);
     masm.push(r12);
     masm.push(r13);
     masm.push(r14);
     masm.push(r15);
 #if defined(_WIN64)
     masm.push(rdi);
     masm.push(rsi);
+
+    // 16-byte alignment for movdqa
+    masm.subq(Imm32(16 * 10 + 8), rsp);
+
+    masm.movdqa(xmm6, Operand(rsp, 16 * 0));
+    masm.movdqa(xmm7, Operand(rsp, 16 * 1));
+    masm.movdqa(xmm8, Operand(rsp, 16 * 2));
+    masm.movdqa(xmm9, Operand(rsp, 16 * 3));
+    masm.movdqa(xmm10, Operand(rsp, 16 * 4));
+    masm.movdqa(xmm11, Operand(rsp, 16 * 5));
+    masm.movdqa(xmm12, Operand(rsp, 16 * 6));
+    masm.movdqa(xmm13, Operand(rsp, 16 * 7));
+    masm.movdqa(xmm14, Operand(rsp, 16 * 8));
+    masm.movdqa(xmm15, Operand(rsp, 16 * 9));
 #endif
 
     // Save arguments passed in registers needed after function call.
     masm.push(result);
 
     // Remember stack depth without padding and arguments.
     masm.mov(rsp, r14);
 
@@ -130,16 +144,29 @@ IonCompartment::generateEnterJIT(JSConte
     /*****************************************************************
     Place return value where it belongs, pop all saved registers
     *****************************************************************/
     masm.pop(r12); // vp
     masm.storeValue(JSReturnOperand, Operand(r12, 0));
 
     // Restore non-volatile registers.
 #if defined(_WIN64)
+    masm.movdqa(Operand(rsp, 16 * 0), xmm6);
+    masm.movdqa(Operand(rsp, 16 * 1), xmm7);
+    masm.movdqa(Operand(rsp, 16 * 2), xmm8);
+    masm.movdqa(Operand(rsp, 16 * 3), xmm9);
+    masm.movdqa(Operand(rsp, 16 * 4), xmm10);
+    masm.movdqa(Operand(rsp, 16 * 5), xmm11);
+    masm.movdqa(Operand(rsp, 16 * 6), xmm12);
+    masm.movdqa(Operand(rsp, 16 * 7), xmm13);
+    masm.movdqa(Operand(rsp, 16 * 8), xmm14);
+    masm.movdqa(Operand(rsp, 16 * 9), xmm15);
+
+    masm.addq(Imm32(16 * 10 + 8), rsp);
+
     masm.pop(rsi);
     masm.pop(rdi);
 #endif
     masm.pop(r15);
     masm.pop(r14);
     masm.pop(r13);
     masm.pop(r12);
     masm.pop(rbx);
--- a/js/src/methodjit/TrampolineMasmX64.asm
+++ b/js/src/methodjit/TrampolineMasmX64.asm
@@ -27,16 +27,42 @@ JaegerTrampoline PROC FRAME
     push    r15
     .PUSHREG r15
     push    rdi
     .PUSHREG rdi
     push    rsi
     .PUSHREG rsi
     push    rbx
     .PUSHREG rbx
+    sub     rsp, 16*10+8
+    .ALLOCSTACK 168
+    ; .SAVEXMM128 requires the offset to be a multiple of 16
+    movdqa  xmmword ptr [rsp], xmm6
+    .SAVEXMM128 xmm6, 0
+    movdqa  xmmword ptr [rsp+16], xmm7
+    .SAVEXMM128 xmm7, 16
+    movdqa  xmmword ptr [rsp+16*2], xmm8
+    .SAVEXMM128 xmm8, 32
+    movdqa  xmmword ptr [rsp+16*3], xmm9
+    .SAVEXMM128 xmm9, 48
+    movdqa  xmmword ptr [rsp+16*4], xmm10
+    .SAVEXMM128 xmm10, 64
+    movdqa  xmmword ptr [rsp+16*5], xmm11
+    .SAVEXMM128 xmm11, 80
+    movdqa  xmmword ptr [rsp+16*6], xmm12
+    .SAVEXMM128 xmm12, 96
+    movdqa  xmmword ptr [rsp+16*7], xmm13
+    .SAVEXMM128 xmm13, 112
+    movdqa  xmmword ptr [rsp+16*8], xmm14
+    .SAVEXMM128 xmm14, 128
+    movdqa  xmmword ptr [rsp+16*9], xmm15
+    .SAVEXMM128 xmm15, 144
+    ; stack alignment for Win64 ABI
+    sub     rsp, 8
+    .ALLOCSTACK 8
     .ENDPROLOG
 
     ; Load mask registers
     mov     r13, 0ffff800000000000h
     mov     r14, 7fffffffffffh
 
     ; Build the JIT frame.
     ; rcx = cx
@@ -73,17 +99,27 @@ JaegerTrampoline ENDP
 JaegerTrampolineReturn PROC FRAME
     .ENDPROLOG
     or      rsi, rdi
     mov     qword ptr [rbx+30h], rsi
     sub     rsp, 20h
     lea     rcx, [rsp+20h]
     call    PopActiveVMFrame
 
-    add     rsp, 68h+20h
+    movdqa  xmm6, xmmword ptr [rsp+68h+20h+8]        ; reload xmm6-xmm15 while the save area is still above rsp: Win64 has no red zone
+    movdqa  xmm7, xmmword ptr [rsp+68h+20h+8+16*1]
+    movdqa  xmm8, xmmword ptr [rsp+68h+20h+8+16*2]
+    movdqa  xmm9, xmmword ptr [rsp+68h+20h+8+16*3]
+    movdqa  xmm10, xmmword ptr [rsp+68h+20h+8+16*4]
+    movdqa  xmm11, xmmword ptr [rsp+68h+20h+8+16*5]
+    movdqa  xmm12, xmmword ptr [rsp+68h+20h+8+16*6]
+    movdqa  xmm13, xmmword ptr [rsp+68h+20h+8+16*7]
+    movdqa  xmm14, xmmword ptr [rsp+68h+20h+8+16*8]
+    movdqa  xmm15, xmmword ptr [rsp+68h+20h+8+16*9]
+    add     rsp, 68h+20h+8+16*10+8                   ; release the whole frame at once; rsp now points at the saved rbx
     pop     rbx
     pop     rsi
     pop     rdi
     pop     r15
     pop     r14
     pop     r13
     pop     r12
     pop     rbp
@@ -102,17 +138,27 @@ JaegerThrowpoline PROC FRAME
     test    rax, rax
     je      throwpoline_exit
     add     rsp, 20h
     jmp     rax
 
 throwpoline_exit:
     lea     rcx, [rsp+20h]
     call    PopActiveVMFrame
-    add     rsp, 68h+20h
+    movdqa  xmm6, xmmword ptr [rsp+68h+20h+8]        ; reload xmm6-xmm15 while the save area is still above rsp: Win64 has no red zone
+    movdqa  xmm7, xmmword ptr [rsp+68h+20h+8+16*1]
+    movdqa  xmm8, xmmword ptr [rsp+68h+20h+8+16*2]
+    movdqa  xmm9, xmmword ptr [rsp+68h+20h+8+16*3]
+    movdqa  xmm10, xmmword ptr [rsp+68h+20h+8+16*4]
+    movdqa  xmm11, xmmword ptr [rsp+68h+20h+8+16*5]
+    movdqa  xmm12, xmmword ptr [rsp+68h+20h+8+16*6]
+    movdqa  xmm13, xmmword ptr [rsp+68h+20h+8+16*7]
+    movdqa  xmm14, xmmword ptr [rsp+68h+20h+8+16*8]
+    movdqa  xmm15, xmmword ptr [rsp+68h+20h+8+16*9]
+    add     rsp, 68h+20h+8+16*10+8                   ; release the whole frame at once; rsp now points at the saved rbx
     pop     rbx
     pop     rsi
     pop     rdi
     pop     r15
     pop     r14
     pop     r13
     pop     r12
     pop     rbp
@@ -136,17 +182,27 @@ JaegerInterpoline PROC FRAME
     test    rax, rax
     je      interpoline_exit
     add     rsp, 20h
     jmp     rax
 
 interpoline_exit:
     lea     rcx, [rsp+20h]
     call    PopActiveVMFrame
-    add     rsp, 68h+20h
+    movdqa  xmm6, xmmword ptr [rsp+68h+20h+8]        ; reload xmm6-xmm15 while the save area is still above rsp: Win64 has no red zone
+    movdqa  xmm7, xmmword ptr [rsp+68h+20h+8+16*1]
+    movdqa  xmm8, xmmword ptr [rsp+68h+20h+8+16*2]
+    movdqa  xmm9, xmmword ptr [rsp+68h+20h+8+16*3]
+    movdqa  xmm10, xmmword ptr [rsp+68h+20h+8+16*4]
+    movdqa  xmm11, xmmword ptr [rsp+68h+20h+8+16*5]
+    movdqa  xmm12, xmmword ptr [rsp+68h+20h+8+16*6]
+    movdqa  xmm13, xmmword ptr [rsp+68h+20h+8+16*7]
+    movdqa  xmm14, xmmword ptr [rsp+68h+20h+8+16*8]
+    movdqa  xmm15, xmmword ptr [rsp+68h+20h+8+16*9]
+    add     rsp, 68h+20h+8+16*10+8                   ; release the whole frame at once; rsp now points at the saved rbx
     pop     rbx
     pop     rsi
     pop     rdi
     pop     r15
     pop     r14
     pop     r13
     pop     r12
     pop     rbp