Bug 1480550 - add ctypes support for aarch64 windows; r=dmajor
author      Nathan Froyd <froydnj@mozilla.com>
date        Fri, 26 Oct 2018 13:00:41 -0400
changeset   499614 94c9a52a4f1da95c6a260cffe391cb024ede58d2
parent      499613 43d1a6288fb9b9104dd477344f128ed69e6c6ec0
child       499615 4f3ef378832408b1274d7dac9ea1e137d0bacc3c
push id     10290
push user   ffxbld-merge
push date   Mon, 03 Dec 2018 16:23:23 +0000
reviewers   dmajor
bugs        1480550
milestone   65.0a1
Bug 1480550 - add ctypes support for aarch64 windows; r=dmajor

The bulk of this patch is the new win64.asm, which is a more-or-less direct copy of aarch64's sysv.S file, with modifications for armasm64's peculiarities. The changes to ffi.c were minimal, mostly so that arithmetic on `void*` conforms to the C standard.
config/external/ffi/moz.build
js/ffi.configure
js/src/ctypes/CTypes.cpp
js/src/ctypes/libffi/src/aarch64/ffi.c
js/src/ctypes/libffi/src/aarch64/win64.asm
--- a/config/external/ffi/moz.build
+++ b/config/external/ffi/moz.build
@@ -72,16 +72,18 @@ else:
     # Per-platform sources and flags.
     ffi_srcs = ()
     if CONFIG['FFI_TARGET'] == 'ARM':
         ffi_srcs = ('sysv.S', 'ffi.c')
         if CONFIG['CC_TYPE'] == 'clang':
             ASFLAGS += ['-no-integrated-as']
     elif CONFIG['FFI_TARGET'] == 'AARCH64':
         ffi_srcs = ('sysv.S', 'ffi.c')
+    elif CONFIG['FFI_TARGET'] == 'ARM64_WIN64':
+        ffi_srcs = ('win64.asm', 'ffi.c')
     elif CONFIG['FFI_TARGET'] == 'X86':
         ffi_srcs = ('ffi.c', 'sysv.S', 'win32.S')
     elif CONFIG['FFI_TARGET'] == 'X86_64':
         ffi_srcs = ('ffi64.c', 'unix64.S', 'ffi.c', 'sysv.S')
     elif CONFIG['FFI_TARGET'] == 'X86_WIN32':
         ffi_srcs = ['ffi.c']
         # MinGW Build for 32 bit
         if CONFIG['CC_TYPE'] in ('gcc', 'clang'):
--- a/js/ffi.configure
+++ b/js/ffi.configure
@@ -28,21 +28,21 @@ add_old_configure_assignment('MOZ_SYSTEM
 # Target selection, based on ffi/configure.ac.
 @depends(target, when=building_ffi)
 def ffi_target(target):
     if target.cpu not in ('x86', 'x86_64', 'arm', 'aarch64'):
         die('Building libffi from the tree is not supported on this platform. '
             'Use --with-system-ffi instead.')
 
     if target.os == 'WINNT':
-        target_dir = 'x86'
-        if target.cpu == 'x86_64':
-            target_name = 'X86_WIN64'
-        else:
-            target_name = 'X86_WIN32'
+        target_dir, target_name = {
+            'x86_64': ('x86', 'X86_WIN64'),
+            'x86': ('x86', 'X86_WIN32'),
+            'aarch64': ('aarch64', 'ARM64_WIN64'),
+        }[target.cpu]
     elif target.os == 'OSX':
         target_dir = 'x86'
         target_name = 'X86_DARWIN'
     elif target.cpu == 'arm':
         target_dir = 'arm'
         target_name = 'ARM'
     elif target.cpu == 'aarch64':
         target_dir = 'aarch64'
--- a/js/src/ctypes/CTypes.cpp
+++ b/js/src/ctypes/CTypes.cpp
@@ -6803,33 +6803,45 @@ GetABI(JSContext* cx, HandleValue abiTyp
   // given platform. ABI_DEFAULT specifies the default
   // C calling convention (cdecl) on each platform.
   switch (abi) {
   case ABI_DEFAULT:
     *result = FFI_DEFAULT_ABI;
     return true;
   case ABI_THISCALL:
 #if defined(_WIN64)
+#if defined(_M_X64)
     *result = FFI_WIN64;
+#elif defined(_M_ARM64)
+    *result = FFI_SYSV;
+#else
+#error unknown 64-bit Windows platform
+#endif
     return true;
 #elif defined(_WIN32)
     *result = FFI_THISCALL;
     return true;
 #else
     break;
 #endif
   case ABI_STDCALL:
   case ABI_WINAPI:
 #if (defined(_WIN32) && !defined(_WIN64)) || defined(_OS2)
     *result = FFI_STDCALL;
     return true;
 #elif (defined(_WIN64))
     // We'd like the same code to work across Win32 and Win64, so stdcall_api
     // and winapi_abi become aliases to the lone Win64 ABI.
+#if defined(_M_X64)
     *result = FFI_WIN64;
+#elif defined(_M_ARM64)
+    *result = FFI_SYSV;
+#else
+#error unknown 64-bit Windows platform
+#endif
     return true;
 #endif
   case INVALID_ABI:
     break;
   }
   return false;
 }
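
A note on the mapping above: on aarch64 Windows the thiscall/stdcall/winapi ABIs all collapse to FFI_SYSV, since the in-tree aarch64 port only provides the SYSV entry points. A minimal sketch of the libffi call sequence this selects (hedged: `add` and `call_add` are hypothetical names; `ffi_prep_cif`, `ffi_call`, and `FFI_FN` are libffi's public API):

    #include <ffi.h>

    static int add(int a, int b) { return a + b; }

    static int call_add(void) {
      ffi_cif cif;
      ffi_type *args[2] = { &ffi_type_sint, &ffi_type_sint };
      int a = 2, b = 3;
      void *values[2] = { &a, &b };
      ffi_arg result;

      /* FFI_SYSV is the ABI that GetABI now selects on aarch64 Windows. */
      if (ffi_prep_cif(&cif, FFI_SYSV, 2, &ffi_type_sint, args) != FFI_OK)
        return -1;
      ffi_call(&cif, FFI_FN(add), &result, values);
      return (int)result; /* 5 */
    }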
 
--- a/js/src/ctypes/libffi/src/aarch64/ffi.c
+++ b/js/src/ctypes/libffi/src/aarch64/ffi.c
@@ -21,16 +21,23 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN
 
 #include <stdio.h>
 
 #include <ffi.h>
 #include <ffi_common.h>
 
 #include <stdlib.h>
 
+#if defined(_WIN32)
+#if !defined(WIN32_LEAN_AND_MEAN)
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#endif
+
 /* Stack alignment requirement in bytes */
 #if defined (__APPLE__)
 #define AARCH64_STACK_ALIGN 1
 #else
 #define AARCH64_STACK_ALIGN 16
 #endif
 
 #define N_X_ARG_REG 8
@@ -60,16 +67,19 @@ sys_icache_invalidate (void *start, size
 
 static inline void
 ffi_clear_cache (void *start, void *end)
 {
 #if defined (__clang__) && defined (__APPLE__)
 	sys_icache_invalidate (start, (char *)end - (char *)start);
 #elif defined (__GNUC__)
 	__builtin___clear_cache (start, end);
+#elif defined (_WIN32)
+	FlushInstructionCache (GetCurrentProcess (), start,
+			       (char*)end - (char*)start);
 #else
 #error "Missing builtin to flush instruction cache"
 #endif
 }
 
 static void *
 get_x_addr (struct call_context *context, unsigned n)
 {
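
The new `_WIN32` branch in ffi_clear_cache matters on ARM because the instruction and data caches are not coherent: trampoline bytes written through the data cache must be flushed before they are executed. A self-contained sketch of the same pattern (`emit_code` is a hypothetical helper; VirtualAlloc, FlushInstructionCache, and GetCurrentProcess are the real Win32 APIs):

    #include <windows.h>
    #include <string.h>

    /* Copy machine code into executable memory, then flush the icache,
       as ffi_clear_cache does for libffi trampolines on Windows. */
    static void *emit_code(const void *bytes, size_t len) {
      void *p = VirtualAlloc(NULL, len, MEM_COMMIT | MEM_RESERVE,
                             PAGE_EXECUTE_READWRITE);
      if (!p)
        return NULL;
      memcpy(p, bytes, len);
      FlushInstructionCache(GetCurrentProcess(), p, len);
      return p;
    }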
@@ -214,16 +224,20 @@ get_basic_type_size (unsigned short type
       return sizeof (SINT64);
 
     default:
       FFI_ASSERT (0);
       return 0;
     }
 }
 
+// XXX The Win64 and SYSV AArch64 ABIs are very close, differing only in
+// how they handle variadic functions.  Since we don't care about calling
+// variadic functions in our use of libffi, we just hack our way through
+// and use the SYSV-designated functions everywhere.
 extern void
 ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
 			    extended_cif *),
                struct call_context *context,
                extended_cif *,
                size_t,
                void (*fn)(void));
 
@@ -486,17 +500,17 @@ allocate_to_stack (struct arg_state *sta
   state->nsaa = ALIGN (state->nsaa, alignment);
 #if defined (__APPLE__)
   if (state->allocating_variadic)
     state->nsaa = ALIGN (state->nsaa, 8);
 #else
   state->nsaa = ALIGN (state->nsaa, 8);
 #endif
 
-  allocation = stack + state->nsaa;
+  allocation = (char*)stack + state->nsaa;
 
   state->nsaa += size;
   return allocation;
 }
 
 static void
 copy_basic_type (void *dest, void *source, unsigned short type)
 {
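
The `(char*)` casts in this and the following hunks all fix the same portability issue: ISO C defines no arithmetic on `void*` (void is an incomplete type), so MSVC rejects it, while GCC and Clang accept it as an extension that treats `void*` like `char*`. A one-function sketch (`advance` is a hypothetical name):

    #include <stddef.h>

    static void *advance(void *p, size_t nbytes) {
      /* return p + nbytes;        -- GNU extension; MSVC rejects this */
      return (char *)p + nbytes;   /* ISO C: well-defined byte arithmetic */
    }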
@@ -570,17 +584,17 @@ copy_hfa_to_reg_or_stack (void *memory,
   else
     {
       int i;
       unsigned short type = get_homogeneous_type (ty);
       for (i = 0; i < elems; i++)
 	{
 	  void *reg = allocate_to_v (context, state);
 	  copy_basic_type (reg, memory, type);
-	  memory += get_basic_type_size (type);
+	  memory = (char*)memory + get_basic_type_size (type);
 	}
     }
 }
 
 /* Either allocate an appropriate register for the argument type, or if
    none are available, allocate a stack slot and return a pointer
    to the allocated space.  */
 
@@ -854,17 +868,17 @@ ffi_call (ffi_cif *cif, void (*fn)(void)
 		  {
 		    int j;
 		    unsigned short type = get_homogeneous_type (cif->rtype);
 		    unsigned elems = element_count (cif->rtype);
 		    for (j = 0; j < elems; j++)
 		      {
 			void *reg = get_basic_type_addr (type, &context, j);
 			copy_basic_type (rvalue, reg, type);
-			rvalue += get_basic_type_size (type);
+			rvalue = (char*)rvalue + get_basic_type_size (type);
 		      }
 		  }
                 else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
                   {
                     size_t size = ALIGN (cif->rtype->size, sizeof (UINT64));
                     memcpy (rvalue, get_x_addr (&context, 0), size);
                   }
                 else
@@ -897,26 +911,27 @@ static unsigned char trampoline [] =
 { 0x70, 0x00, 0x00, 0x58,	/* ldr	x16, 1f	*/
   0x91, 0x00, 0x00, 0x10,	/* adr	x17, 2f	*/
   0x00, 0x02, 0x1f, 0xd6	/* br	x16	*/
 };
 
 /* Build a trampoline.  */
 
 #define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX,FLAGS)			\
-  ({unsigned char *__tramp = (unsigned char*)(TRAMP);			\
+  do {									\
+    unsigned char *__tramp = (unsigned char*)(TRAMP);			\
     UINT64  __fun = (UINT64)(FUN);					\
     UINT64  __ctx = (UINT64)(CTX);					\
     UINT64  __flags = (UINT64)(FLAGS);					\
     memcpy (__tramp, trampoline, sizeof (trampoline));			\
     memcpy (__tramp + 12, &__fun, sizeof (__fun));			\
     memcpy (__tramp + 20, &__ctx, sizeof (__ctx));			\
     memcpy (__tramp + 28, &__flags, sizeof (__flags));			\
     ffi_clear_cache(__tramp, __tramp + FFI_TRAMPOLINE_SIZE);		\
-  })
+  } while(0)
 
 ffi_status
 ffi_prep_closure_loc (ffi_closure* closure,
                       ffi_cif* cif,
                       void (*fun)(ffi_cif*,void*,void**,void*),
                       void *user_data,
                       void *codeloc)
 {
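
The macro rewrite above replaces a GCC statement expression, `({ ... })`, with the portable `do { ... } while (0)` idiom, since MSVC does not support statement expressions. The idiom keeps a multi-statement macro behaving as a single statement even in an unbraced `if`. A generic sketch (`SWAP_INT` is a hypothetical example, not part of the patch):

    #define SWAP_INT(a, b) \
      do {                 \
        int tmp_ = (a);    \
        (a) = (b);         \
        (b) = tmp_;        \
      } while (0)

    /* Safe without braces:
         if (x > y)
           SWAP_INT(x, y);
         else
           ...
       A bare { ... } block would break here, because the trailing
       semicolon after the macro call would orphan the else. */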
@@ -1136,17 +1151,17 @@ ffi_closure_SYSV_inner (ffi_closure *clo
 	    {
 	      int j;
 	      unsigned short type = get_homogeneous_type (cif->rtype);
 	      unsigned elems = element_count (cif->rtype);
 	      for (j = 0; j < elems; j++)
 		{
 		  void *reg = get_basic_type_addr (type, context, j);
 		  copy_basic_type (reg, rvalue, type);
-		  rvalue += get_basic_type_size (type);
+		  rvalue = (char*)rvalue + get_basic_type_size (type);
 		}
 	    }
           else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
             {
               size_t size = ALIGN (cif->rtype->size, sizeof (UINT64)) ;
               memcpy (get_x_addr (context, 0), rvalue, size);
             }
           else
new file mode 100644
--- /dev/null
+++ b/js/src/ctypes/libffi/src/aarch64/win64.asm
@@ -0,0 +1,272 @@
+;; Copyright (c) 2009, 2010, 2011, 2012 ARM Ltd.
+
+;; Permission is hereby granted, free of charge, to any person obtaining
+;; a copy of this software and associated documentation files (the
+;; ``Software''), to deal in the Software without restriction, including
+;; without limitation the rights to use, copy, modify, merge, publish,
+;; distribute, sublicense, and/or sell copies of the Software, and to
+;; permit persons to whom the Software is furnished to do so, subject to
+;; the following conditions:
+
+;; The above copyright notice and this permission notice shall be
+;; included in all copies or substantial portions of the Software.
+
+;; THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+;; Hand-converted from the sysv.S file in this directory.
+
+        AREA |.text|, CODE, ARM64
+
+   ;; ffi_call_SYSV()
+
+   ;; Create a stack frame, setup an argument context, call the callee
+   ;; and extract the result.
+
+   ;; The maximum required argument stack size is provided;
+   ;; ffi_call_SYSV() allocates that stack space, then calls
+   ;; prepare_fn to populate the register context and stack.  The
+   ;; argument passing registers are loaded from the register
+   ;; context and the callee is called; on return, the argument
+   ;; passing registers are saved back to the context.  Our caller
+   ;; will extract the return value from the final state of the
+   ;; saved register context.
+
+   ;; Prototype:
+
+   ;; extern unsigned
+   ;; ffi_call_SYSV (void (*)(struct call_context *context, unsigned char *,
+   ;; 			   extended_cif *),
+   ;;                struct call_context *context,
+   ;;                extended_cif *,
+   ;;                size_t required_stack_size,
+   ;;                void (*fn)(void));
+
+   ;; Therefore on entry we have:
+
+   ;; x0 prepare_fn
+   ;; x1 &context
+   ;; x2 &ecif
+   ;; x3 bytes
+   ;; x4 fn
+
+   ;; This function uses the following stack frame layout:
+
+   ;; ==
+   ;;              saved x30(lr)
+   ;; x29(fp)->    saved x29(fp)
+   ;;              saved x24
+   ;;              saved x23
+   ;;              saved x22
+   ;; sp'    ->    saved x21
+   ;;              ...
+   ;; sp     ->    (constructed callee stack arguments)
+   ;; ==
+
+   ;; Voila!
+
+	EXPORT |ffi_call_SYSV|
+
+|ffi_call_SYSV| PROC
+; #define ffi_call_SYSV_FS (8 * 4)
+
+        stp     x29, x30, [sp, #-16]!
+
+        mov     x29, sp
+        sub     sp, sp, #32 	; ffi_call_SYSV_FS
+
+        stp     x21, x22, [sp, #0]
+        stp     x23, x24, [sp, #16]
+
+        mov     x21, x1
+        mov     x22, x2
+        mov     x24, x4
+
+        ; Allocate the stack space for the actual arguments; many
+        ; arguments will be passed in registers, but we assume the
+        ; worst case and allocate sufficient stack for ALL of the
+        ; arguments.
+        sub     sp, sp, x3
+
+        ; unsigned (*prepare_fn) (struct call_context *context,
+        ;                         unsigned char *stack, extended_cif *ecif);
+
+        mov     x23, x0
+        mov     x0, x1
+        mov     x1, sp
+        ; x2 already in place
+        blr     x23
+
+        ; Preserve the flags returned.
+        mov     x23, x0
+
+        ; Figure out if we should touch the vector registers.
+        tbz     x23, #0, noload_call
+
+        ; Load the vector argument passing registers.
+        ldp     q0, q1, [x21, #8*32 +  0]
+        ldp     q2, q3, [x21, #8*32 + 32]
+        ldp     q4, q5, [x21, #8*32 + 64]
+        ldp     q6, q7, [x21, #8*32 + 96]
+
+noload_call
+        ; Load the core argument passing registers.
+        ldp     x0, x1, [x21,  #0]
+        ldp     x2, x3, [x21, #16]
+        ldp     x4, x5, [x21, #32]
+        ldp     x6, x7, [x21, #48]
+
+        ; Don't forget x8 which may be holding the address of a return buffer.
+        ldr     x8,     [x21, #8*8]
+
+        blr     x24
+
+        ; Save the core argument passing registers.
+        stp     x0, x1, [x21,  #0]
+        stp     x2, x3, [x21, #16]
+        stp     x4, x5, [x21, #32]
+        stp     x6, x7, [x21, #48]
+
+        ; Note nothing useful ever comes back in x8!
+
+        ; Figure out if we should touch the vector registers.
+        tbz     x23, #0, nosave_call ; AARCH64_FFI_WITH_V_BIT
+
+        ; Save the vector argument passing registers.
+        stp     q0, q1, [x21, #8*32 + 0]
+        stp     q2, q3, [x21, #8*32 + 32]
+        stp     q4, q5, [x21, #8*32 + 64]
+        stp     q6, q7, [x21, #8*32 + 96]
+
+nosave_call
+        ; All done, unwind our stack frame.
+        ldp     x21, x22, [x29, #-32]    ; ffi_call_SYSV_FS
+
+        ldp     x23, x24, [x29, #-32+16] ; ffi_call_SYSV_FS
+
+        mov     sp, x29
+
+        ldp     x29, x30, [sp], #16
+
+        ret
+
+	ENDP
+
+; #define ffi_closure_SYSV_FS (8 * 2 + AARCH64_CALL_CONTEXT_SIZE)
+
+   ;; ffi_closure_SYSV
+
+   ;; Closure invocation glue. This is the low level code invoked directly by
+   ;; the closure trampoline to setup and call a closure.
+
+   ;; On entry x17 points to a struct trampoline_data; x16 has been
+   ;; clobbered.  All other registers are preserved.
+
+   ;; We allocate a call context and save the argument passing registers,
+   ;; then invoke the generic C ffi_closure_SYSV_inner() function to do
+   ;; all the real work; on return we load the result passing registers
+   ;; back from the call context.
+
+   ;; On entry
+
+   ;; extern void
+   ;; ffi_closure_SYSV (struct trampoline_data *);
+
+   ;; struct trampoline_data
+   ;; {
+   ;;      UINT64 *ffi_closure;
+   ;;      UINT64 flags;
+   ;; };
+
+   ;; This function uses the following stack frame layout:
+
+   ;; ==
+   ;;              saved x30(lr)
+   ;; x29(fp)->    saved x29(fp)
+   ;;              saved x22
+   ;;              saved x21
+   ;;              ...
+   ;; sp     ->    call_context
+   ;; ==
+
+   ;; Voila!
+
+	IMPORT |ffi_closure_SYSV_inner|
+	EXPORT |ffi_closure_SYSV|
+
+|ffi_closure_SYSV| PROC
+        stp     x29, x30, [sp, #-16]!
+
+        mov     x29, sp
+
+        sub     sp, sp, #256+512+16
+
+        stp     x21, x22, [x29, #-16]
+
+        ; Load x21 with &call_context.
+        mov     x21, sp
+        ; Preserve our struct trampoline_data
+        mov     x22, x17
+
+        ; Save the rest of the argument passing registers.
+        stp     x0, x1, [x21, #0]
+        stp     x2, x3, [x21, #16]
+        stp     x4, x5, [x21, #32]
+        stp     x6, x7, [x21, #48]
+        ; Don't forget we may have been given a result scratch pad address.
+        str     x8,     [x21, #64]
+
+        ; Figure out if we should touch the vector registers.
+        ldr     x0, [x22, #8]
+        tbz     x0, #0, nosave_closure ; AARCH64_FFI_WITH_V_BIT
+
+        ; Save the argument passing vector registers.
+        stp     q0, q1, [x21, #8*32 + 0]
+        stp     q2, q3, [x21, #8*32 + 32]
+        stp     q4, q5, [x21, #8*32 + 64]
+        stp     q6, q7, [x21, #8*32 + 96]
+
+nosave_closure
+        ; Load &ffi_closure.
+        ldr     x0, [x22, #0]
+        mov     x1, x21
+        ; Compute the location of the stack at the point that the
+        ; trampoline was called.
+        add     x2, x29, #16
+
+        bl      ffi_closure_SYSV_inner
+
+        ; Figure out if we should touch the vector registers.
+        ldr     x0, [x22, #8]
+        tbz     x0, #0, noload_closure ; AARCH64_FFI_WITH_V_BIT
+
+        ; Load the result passing vector registers.
+        ldp     q0, q1, [x21, #8*32 + 0]
+        ldp     q2, q3, [x21, #8*32 + 32]
+        ldp     q4, q5, [x21, #8*32 + 64]
+        ldp     q6, q7, [x21, #8*32 + 96]
+
+noload_closure
+        ; Load the result passing core registers.
+        ldp     x0, x1, [x21,  #0]
+        ldp     x2, x3, [x21, #16]
+        ldp     x4, x5, [x21, #32]
+        ldp     x6, x7, [x21, #48]
+        ; Note nothing useful is returned in x8.
+
+        ; We are done, unwind our frame.
+        ldp     x21, x22, [x29,  #-16]
+
+        mov     sp, x29
+
+        ldp     x29, x30, [sp], #16
+
+        ret
+
+	ENDP
+	END
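
For context, `ffi_closure_SYSV` above is the low-level glue that a closure trampoline lands in before handing off to ffi_closure_SYSV_inner. A hedged sketch of that whole path through libffi's public closure API (assumes closures are enabled in this build; `handler` is a hypothetical callback):

    #include <ffi.h>
    #include <stdio.h>

    static void handler(ffi_cif *cif, void *ret, void **args, void *user_data) {
      (void)cif; (void)user_data;
      *(int *)ret = *(int *)args[0] + *(int *)args[1];
    }

    int main(void) {
      void *code;
      ffi_closure *closure = ffi_closure_alloc(sizeof(ffi_closure), &code);
      ffi_cif cif;
      ffi_type *args[2] = { &ffi_type_sint, &ffi_type_sint };
      int (*fn)(int, int);

      if (!closure)
        return 1;
      if (ffi_prep_cif(&cif, FFI_SYSV, 2, &ffi_type_sint, args) != FFI_OK ||
          ffi_prep_closure_loc(closure, &cif, handler, NULL, code) != FFI_OK) {
        ffi_closure_free(closure);
        return 1;
      }
      fn = (int (*)(int, int))code;
      printf("%d\n", fn(2, 3)); /* the trampoline routes this to handler: 5 */
      ffi_closure_free(closure);
      return 0;
    }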