Bug 644900 - Insert stack probes to properly grow Windows stack for large frames (r=edwsmith,rreitmai)
author:     William Maddox <wmaddox@adobe.com>
date:       Fri, 01 Apr 2011 13:48:58 -0700
changeset:  5439:25d1d0ab335ef92f56251437a73112787b40ef46
parent:     5438:55ac92ea3533b3d18b3877c9fcc5d6b3fed69a3d
push id:    24
push user:  wmaddox@adobe.com
push date:  Fri, 01 Apr 2011 20:51:17 +0000
reviewers:  edwsmith, rreitmai
bugs:       644900
Bug 644900 - Insert stack probes to properly grow Windows stack for large frames (r=edwsmith,rreitmai) Includes backport of MOVLMI on X64.
nanojit/NativeX64.cpp
nanojit/NativeX64.h
nanojit/Nativei386.cpp
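
The substance of the change is the probe loop added to each prologue below. A minimal standalone sketch of the arithmetic it uses (the helper and constants here are illustrative only, not part of the patch) shows which frame offsets get touched for a given frame size and page size:

    // Sketch of the patch's probe-offset computation; probeOffsets() is a
    // hypothetical helper, not nanojit code.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    std::vector<int32_t> probeOffsets(uint32_t amt, uint32_t pageSize) {
        // pageSize must be a power of two, as the patch asserts.
        uint32_t pageRounded = amt & ~(pageSize - 1);
        std::vector<int32_t> offsets;
        // Same loop shape as the patch; each offset is a store target [FP - d].
        for (int32_t d = (int32_t)pageRounded; d > 0; d -= (int32_t)pageSize)
            offsets.push_back(-d);
        return offsets;
    }

    int main() {
        // Example: a 10000-byte frame with 4 KiB pages is probed at
        // FP-8192 and FP-4096 (FP is RBP on X64, EBP on i386).
        for (int32_t off : probeOffsets(10000, 4096))
            std::printf("probe at FP%+d\n", off);
        return 0;
    }

Because nanojit emits code backwards, the loop generates the deepest probe first so that at run time the probes execute from FP-pageSize outward to FP-pageRounded, i.e. in stack-growth order, which is what the Windows guard-page mechanism requires.
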
--- a/nanojit/NativeX64.cpp
+++ b/nanojit/NativeX64.cpp
@@ -289,16 +289,26 @@ namespace nanojit
 
     // disp32 modrm form when the disp must be written separately (opcode is 4+ bytes)
     // p = prefix -- opcode must have a 66, F2, or F3 prefix
     void Assembler::emitprm(uint64_t op, Register r, int32_t d, Register b) {
         op = emit_disp32(op, d);
         emitprr(op, r, b);
     }
 
+    // disp32 modrm form with 32-bit immediate value
+    void Assembler::emitrm_imm32(uint64_t op, Register b, int32_t d, int32_t imm) {
+        NanoAssert(IsGpReg(b));
+        NanoAssert((b & 7) != 4); // using RSP or R12 as base requires SIB
+        underrunProtect(4+4+8); // room for imm plus disp plus fullsize op
+        *((int32_t*)(_nIns -= 4)) = imm;
+        _nvprof("x86-bytes", 4);
+        emitrm_wide(op, (Register)0, d, b);
+    }
+
     void Assembler::emitrr_imm(uint64_t op, Register r, Register b, int32_t imm) {
         NanoAssert(IsGpReg(r) && IsGpReg(b));
         underrunProtect(4+8); // room for imm plus fullsize op
         *((int32_t*)(_nIns -= 4)) = imm;
         _nvprof("x86-bytes", 4);
         emitrr(op, r, b);
     }
 
@@ -580,16 +590,18 @@ namespace nanojit
     void Assembler::JNA8( S n, NIns* t)    { emit_target8(n,X64_ja8 ^X64_jneg8, t); asm_output("jna %p", t); }
     void Assembler::JNAE8(S n, NIns* t)    { emit_target8(n,X64_jae8^X64_jneg8, t); asm_output("jnae %p",t); }
 
     void Assembler::CALL( S n, NIns* t)    { emit_target32(n,X64_call,t); asm_output("call %p",t); }
 
     void Assembler::CALLRAX()       { emit(X64_callrax); asm_output("call (rax)"); }
     void Assembler::RET()           { emit(X64_ret);     asm_output("ret");        }
 
+    void Assembler::MOVLMI(R r, I d, I32 imm) { emitrm_imm32(X64_movlmi,r,d,imm); asm_output("movl %d(%s), %d",d,RQ(r),imm); }
+
     void Assembler::MOVQSPR(I d, R r)   { emit(X64_movqspr | U64(d) << 56 | U64((r&7)<<3) << 40 | U64((r&8)>>1) << 24); asm_output("movq %d(rsp), %s", d, RQ(r)); }    // insert r into mod/rm and rex bytes
 
     void Assembler::XORPSA(R r, I32 i32)    { emitxm_abs(X64_xorpsa, r, i32); asm_output("xorps %s, (0x%x)",RQ(r), i32); }
     void Assembler::XORPSM(R r, NIns* a64)  { emitxm_rel(X64_xorpsm, r, a64); asm_output("xorps %s, (%p)",  RQ(r), a64); }
 
     void Assembler::X86_AND8R(R r)  { emit(X86_and8r | U64(r<<3|(r|4))<<56); asm_output("andb %s, %s", RB(r), RBhi(r)); }
     void Assembler::X86_SETNP(R r)  { emit(X86_setnp | U64(r|4)<<56); asm_output("setnp %s", RBhi(r)); }
     void Assembler::X86_SETE(R r)   { emit(X86_sete  | U64(r)<<56);   asm_output("sete %s", RB(r)); }
@@ -1879,16 +1891,34 @@ namespace nanojit
         uint32_t stackNeeded = max_stk_used + _activation.stackSlotsNeeded() * 4;
 
         uint32_t stackPushed =
             sizeof(void*) + // returnaddr
             sizeof(void*); // ebp
         uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
         uint32_t amt = aligned - stackPushed;
 
+#ifdef _WIN64
+        // Windows uses a single guard page for extending the stack, so
+        // new stack pages must be first touched in stack-growth order.
+        // We touch each whole page that will be allocated to the frame
+        // (following the saved FP) to cause the OS to commit the page if
+        // necessary.  Since we don't calculate page boundaries, but just
+        // probe at intervals of the pagesize, it is possible that the
+        // last page of the frame will be touched unnecessarily.  Note that
+        // we must generate the probes in the reverse order of their execution.
+        // We require that the page size be a power of 2.
+        uint32_t pageSize = uint32_t(VMPI_getVMPageSize());
+        NanoAssert((pageSize & (pageSize-1)) == 0);
+        uint32_t pageRounded = amt & ~(pageSize-1);
+        for (int32_t d = pageRounded; d > 0; d -= pageSize) {
+            MOVLMI(RBP, -d, 0);
+        }
+#endif
+
         // Reserve stackNeeded bytes, padded
         // to preserve NJ_ALIGN_STACK-byte alignment.
         if (amt) {
             if (isS8(amt))
                 SUBQR8(RSP, amt);
             else
                 SUBQRI(RSP, amt);
         }
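
The MOVLMI probe relies on the new emitrm_imm32 helper above, which follows the assembler's backward-emission discipline: the 32-bit immediate is written first (so it ends up last in the finished instruction), then the disp32 ModRM form. A rough sketch of that byte layout for the no-REX, no-SIB case handled here (the buffer type and helper below are hypothetical, not nanojit's Assembler):

    // Hypothetical sketch of backward emission for "movl $imm32, disp32(base)"
    // (plain C++, not nanojit's Assembler; REX-prefix and SIB handling omitted,
    // matching the patch's assert that the base register is not RSP/R12).
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct BackwardBuf {
        uint8_t* nIns;                               // emission point, moves downward
        void put32(int32_t v) { nIns -= 4; std::memcpy(nIns, &v, 4); }
        void put8(uint8_t b)  { *--nIns = b; }
    };

    // Fields are written in reverse, as emitrm_imm32 does: the immediate first
    // (it lands at the highest address), then disp32, then ModRM and the C7
    // opcode, so the finished bytes read opcode, ModRM, disp32, imm32.
    void movlmi(BackwardBuf& buf, uint8_t baseReg, int32_t disp, int32_t imm) {
        buf.put32(imm);
        buf.put32(disp);
        buf.put8(0x80 | (baseReg & 7));              // ModRM: mod=10 (disp32), reg=/0, rm=base
        buf.put8(0xC7);                              // opcode: mov r/m32, imm32
    }

    int main() {
        uint8_t code[32];
        BackwardBuf buf{ code + sizeof(code) };
        movlmi(buf, 5 /* ebp/rbp */, -4096, 0);      // one stack probe
        for (uint8_t* p = buf.nIns; p < code + sizeof(code); ++p)
            std::printf("%02x ", *p);                // c7 85 00 f0 ff ff 00 00 00 00
        std::printf("\n");
        return 0;
    }

The printed bytes are the encoding of movl $0, -4096(%ebp); the X64 patch emits the same form with the empty REX prefix carried in the X64_movlmi constant.
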
--- a/nanojit/NativeX64.h
+++ b/nanojit/NativeX64.h
@@ -314,16 +314,18 @@ namespace nanojit
         X64_xorpd   = 0xC0570F4066000005LL, // 128bit xor xmm (two packed doubles)
         X64_xorps   = 0xC0570F4000000004LL, // 128bit xor xmm (four packed singles), one byte shorter
         X64_xorpsm  = 0x05570F4000000004LL, // 128bit xor xmm, [rip+disp32]
         X64_xorpsa  = 0x2504570F40000005LL, // 128bit xor xmm, [disp32]
         X64_inclmRAX= 0x00FF000000000002LL, // incl (%rax)
         X64_jmpx    = 0xC524ff4000000004LL, // jmp [d32+x*8]
         X64_jmpxb   = 0xC024ff4000000004LL, // jmp [b+x*8]
 
+        X64_movlmi  = 0x80C7400000000003LL, // 32bit store imm -> dword ptr[b+disp32]
+
         X86_and8r   = 0xC022000000000002LL, // and rl,rh
         X86_sete    = 0xC0940F0000000003LL, // no-rex version of X64_sete
         X86_setnp   = 0xC09B0F0000000003LL  // no-rex set byte if odd parity (ordered fcmp result) (PF == 0)
     };
 
     typedef uint32_t RegisterMask;
 
     static const RegisterMask GpRegs = 0xffff;
@@ -380,16 +382,17 @@ namespace nanojit
         void emitprr(uint64_t op, Register r, Register b);\
         void emitrm8(uint64_t op, Register r, int32_t d, Register b);\
         void emitrm(uint64_t op, Register r, int32_t d, Register b);\
         void emitrm_wide(uint64_t op, Register r, int32_t d, Register b);\
         uint64_t emit_disp32(uint64_t op, int32_t d);\
         void emitprm(uint64_t op, Register r, int32_t d, Register b);\
         void emitrr_imm(uint64_t op, Register r, Register b, int32_t imm);\
         void emitr_imm64(uint64_t op, Register r, uint64_t imm);\
+        void emitrm_imm32(uint64_t op, Register r, int32_t d, int32_t imm);\
         void emitrxb_imm(uint64_t op, Register r, Register x, Register b, int32_t imm);\
         void emitr_imm(uint64_t op, Register r, int32_t imm) { emitrr_imm(op, (Register)0, r, imm); }\
         void emitr_imm8(uint64_t op, Register b, int32_t imm8);\
         void emitxm_abs(uint64_t op, Register r, int32_t addr32);\
         void emitxm_rel(uint64_t op, Register r, NIns* addr64);\
         bool isTargetWithinS8(NIns* target);\
         bool isTargetWithinS32(NIns* target);\
         void asm_immi(Register r, int32_t v, bool canClobberCCs);\
@@ -588,16 +591,17 @@ namespace nanojit
         void CALLRAX();\
         void RET();\
         void MOVQSPR(int d, Register r);\
         void XORPSA(Register r, int32_t i32);\
         void XORPSM(Register r, NIns* a64);\
         void X86_AND8R(Register r);\
         void X86_SETNP(Register r);\
         void X86_SETE(Register r);\
+        void MOVLMI(Register base, int disp, int32_t imm32); \
 
     const int LARGEST_UNDERRUN_PROT = 32;  // largest value passed to underrunProtect
 
     typedef uint8_t NIns;
 
     // Bytes of icache to flush after Assembler::patch
     const size_t LARGEST_BRANCH_PATCH = 16 * sizeof(NIns);
 
--- a/nanojit/Nativei386.cpp
+++ b/nanojit/Nativei386.cpp
@@ -879,20 +879,37 @@ namespace nanojit
 
         uint32_t stackPushed =
             STACK_GRANULARITY + // returnaddr
             STACK_GRANULARITY; // ebp
 
         uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
         uint32_t amt = aligned - stackPushed;
 
+#ifdef _WIN32
+        // Windows uses a single guard page for extending the stack, so
+        // new stack pages must be first touched in stack-growth order.
+        // We touch each whole page that will be allocated to the frame
+        // (following the saved FP) to cause the OS to commit the page if
+        // necessary.  Since we don't calculate page boundaries, but just
+        // probe at intervals of the pagesize, it is possible that the
+        // last page of the frame will be touched unnecessarily.  Note that
+        // we must generate the probes in the reverse order of their execution.
+        // We require that the page size be a power of 2.
+        size_t pageSize = VMPI_getVMPageSize();
+        NanoAssert((pageSize & (pageSize-1)) == 0);
+        size_t pageRounded = amt & ~(pageSize-1);
+        for (int32_t d = pageRounded; d > 0; d -= pageSize) {
+            STi(EBP, -d, 0);
+        }
+#endif
+
         // Reserve stackNeeded bytes, padded
         // to preserve NJ_ALIGN_STACK-byte alignment.
-        if (amt)
-        {
+        if (amt) {
             SUBi(SP, amt);
         }
 
         verbose_only( asm_output("[frag entry]"); )
         NIns *fragEntry = _nIns;
         MR(FP, SP); // Establish our own FP.
         PUSHr(FP); // Save caller's FP.