Bug 965870 - Try to use __builtin_ffsl if ffsl is unavailable. r=mshal
authorMike Hommey <mh+mozilla@glandium.org>
Fri, 06 Jun 2014 08:15:58 +0900
changeset 186984 b3b490f2ddc6a9d31b4d5d209016519d61f440af
parent 186983 a3b45c160d4658fbb8b6bdfe50cdd8fe7b8a8206
child 186985 7ab785fa6f362a98852f4bc28e7657ce230fc457
push idunknown
push userunknown
push dateunknown
reviewersmshal
bugs965870
milestone32.0a1
Bug 965870 - Try to use __builtin_ffsl if ffsl is unavailable. r=mshal Cherry-picked from https://github.com/jemalloc/jemalloc/commit/9c3a10fdf6baa5ddb042b6adbef1ff1b3c613ce3
memory/jemalloc/0004-Try-to-use-__builtin_ffsl-if-ffsl-is-unavailable.patch
memory/jemalloc/src/configure
memory/jemalloc/src/configure.ac
memory/jemalloc/src/include/jemalloc/internal/arena.h
memory/jemalloc/src/include/jemalloc/internal/bitmap.h
memory/jemalloc/src/include/jemalloc/internal/jemalloc_internal.h.in
memory/jemalloc/src/include/jemalloc/internal/jemalloc_internal_defs.h.in
memory/jemalloc/src/include/jemalloc/internal/util.h
memory/jemalloc/src/src/arena.c
memory/jemalloc/src/src/rtree.c
memory/jemalloc/update.sh
new file mode 100644
--- /dev/null
+++ b/memory/jemalloc/0004-Try-to-use-__builtin_ffsl-if-ffsl-is-unavailable.patch
@@ -0,0 +1,354 @@
+diff --git a/configure b/configure
+--- a/configure
++++ b/configure
+@@ -6940,18 +6940,67 @@ else
+   je_cv_function_ffsl=no
+ fi
+ rm -f core conftest.err conftest.$ac_objext \
+     conftest$ac_exeext conftest.$ac_ext
+ fi
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $je_cv_function_ffsl" >&5
+ $as_echo "$je_cv_function_ffsl" >&6; }
+ 
+-if test "x${je_cv_function_ffsl}" != "xyes" ; then
+-   as_fn_error $? "Cannot build without ffsl(3)" "$LINENO" 5
++if test "x${je_cv_function_ffsl}" == "xyes" ; then
++  $as_echo "#define JEMALLOC_INTERNAL_FFSL ffsl" >>confdefs.h
++
++  $as_echo "#define JEMALLOC_INTERNAL_FFS ffs" >>confdefs.h
++
++else
++
++{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether a program using __builtin_ffsl is compilable" >&5
++$as_echo_n "checking whether a program using __builtin_ffsl is compilable... " >&6; }
++if ${je_cv_gcc_builtin_ffsl+:} false; then :
++  $as_echo_n "(cached) " >&6
++else
++  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
++/* end confdefs.h.  */
++
++  #include <stdio.h>
++  #include <strings.h>
++  #include <string.h>
++
++int
++main ()
++{
++
++	{
++		int rv = __builtin_ffsl(0x08);
++		printf("%d\n", rv);
++	}
++
++  ;
++  return 0;
++}
++_ACEOF
++if ac_fn_c_try_link "$LINENO"; then :
++  je_cv_gcc_builtin_ffsl=yes
++else
++  je_cv_gcc_builtin_ffsl=no
++fi
++rm -f core conftest.err conftest.$ac_objext \
++    conftest$ac_exeext conftest.$ac_ext
++fi
++{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $je_cv_gcc_builtin_ffsl" >&5
++$as_echo "$je_cv_gcc_builtin_ffsl" >&6; }
++
++  if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
++    $as_echo "#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl" >>confdefs.h
++
++    $as_echo "#define JEMALLOC_INTERNAL_FFS __builtin_ffs" >>confdefs.h
++
++  else
++    as_fn_error $? "Cannot build without ffsl(3) or __builtin_ffsl()" "$LINENO" 5
++  fi
+ fi
+ 
+ 
+ 
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether atomic(9) is compilable" >&5
+ $as_echo_n "checking whether atomic(9) is compilable... " >&6; }
+ if ${je_cv_atomic9+:} false; then :
+   $as_echo_n "(cached) " >&6
+diff --git a/configure.ac b/configure.ac
+--- a/configure.ac
++++ b/configure.ac
+@@ -1160,31 +1160,51 @@ fi
+ AC_SUBST([enable_tls])
+ if test "x${enable_tls}" = "x1" ; then
+   AC_DEFINE_UNQUOTED([JEMALLOC_TLS], [ ])
+ elif test "x${force_tls}" = "x1" ; then
+   AC_MSG_ERROR([Failed to configure TLS, which is mandatory for correct function])
+ fi
+ 
+ dnl ============================================================================
+-dnl Check for ffsl(3), and fail if not found.  This function exists on all
+-dnl platforms that jemalloc currently has a chance of functioning on without
+-dnl modification.
++dnl Check for ffsl(3), then __builtin_ffsl(), and fail if neither are found.
++dnl One of those two functions should (theoretically) exist on all platforms
++dnl that jemalloc currently has a chance of functioning on without modification.
++dnl We additionally assume ffs() or __builtin_ffs() are defined if
++dnl ffsl() or __builtin_ffsl() are defined, respectively.
+ JE_COMPILABLE([a program using ffsl], [
+ #include <stdio.h>
+ #include <strings.h>
+ #include <string.h>
+ ], [
+ 	{
+ 		int rv = ffsl(0x08);
+ 		printf("%d\n", rv);
+ 	}
+ ], [je_cv_function_ffsl])
+-if test "x${je_cv_function_ffsl}" != "xyes" ; then
+-   AC_MSG_ERROR([Cannot build without ffsl(3)])
++if test "x${je_cv_function_ffsl}" == "xyes" ; then
++  AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl])
++  AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs])
++else
++  JE_COMPILABLE([a program using __builtin_ffsl], [
++  #include <stdio.h>
++  #include <strings.h>
++  #include <string.h>
++  ], [
++	{
++		int rv = __builtin_ffsl(0x08);
++		printf("%d\n", rv);
++	}
++  ], [je_cv_gcc_builtin_ffsl])
++  if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
++    AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl])
++    AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs])
++  else
++    AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()])
++  fi
+ fi
+ 
+ dnl ============================================================================
+ dnl Check for atomic(9) operations as provided on FreeBSD.
+ 
+ JE_COMPILABLE([atomic(9)], [
+ #include <sys/types.h>
+ #include <machine/atomic.h>
+diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h
+--- a/include/jemalloc/internal/arena.h
++++ b/include/jemalloc/internal/arena.h
+@@ -810,17 +810,17 @@ arena_run_regind(arena_run_t *run, arena
+ 	 * Avoid doing division with a variable divisor if possible.  Using
+ 	 * actual division here can reduce allocator throughput by over 20%!
+ 	 */
+ 	diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run -
+ 	    bin_info->reg0_offset);
+ 
+ 	/* Rescale (factor powers of 2 out of the numerator and denominator). */
+ 	interval = bin_info->reg_interval;
+-	shift = ffs(interval) - 1;
++	shift = jemalloc_ffs(interval) - 1;
+ 	diff >>= shift;
+ 	interval >>= shift;
+ 
+ 	if (interval == 1) {
+ 		/* The divisor was a power of 2. */
+ 		regind = diff;
+ 	} else {
+ 		/*
+diff --git a/include/jemalloc/internal/bitmap.h b/include/jemalloc/internal/bitmap.h
+--- a/include/jemalloc/internal/bitmap.h
++++ b/include/jemalloc/internal/bitmap.h
+@@ -125,21 +125,21 @@ bitmap_sfu(bitmap_t *bitmap, const bitma
+ 	size_t bit;
+ 	bitmap_t g;
+ 	unsigned i;
+ 
+ 	assert(bitmap_full(bitmap, binfo) == false);
+ 
+ 	i = binfo->nlevels - 1;
+ 	g = bitmap[binfo->levels[i].group_offset];
+-	bit = ffsl(g) - 1;
++	bit = jemalloc_ffsl(g) - 1;
+ 	while (i > 0) {
+ 		i--;
+ 		g = bitmap[binfo->levels[i].group_offset + bit];
+-		bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1);
++		bit = (bit << LG_BITMAP_GROUP_NBITS) + (jemalloc_ffsl(g) - 1);
+ 	}
+ 
+ 	bitmap_set(bitmap, binfo, bit);
+ 	return (bit);
+ }
+ 
+ JEMALLOC_INLINE void
+ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
+diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
+--- a/include/jemalloc/internal/jemalloc_internal.h.in
++++ b/include/jemalloc/internal/jemalloc_internal.h.in
+@@ -9,21 +9,23 @@
+ #  define EPERM  ERROR_WRITE_FAULT
+ #  define EFAULT ERROR_INVALID_ADDRESS
+ #  define ENOMEM ERROR_NOT_ENOUGH_MEMORY
+ #  undef ERANGE
+ #  define ERANGE ERROR_INVALID_DATA
+ #else
+ #  include <sys/param.h>
+ #  include <sys/mman.h>
+-#  include <sys/syscall.h>
+-#  if !defined(SYS_write) && defined(__NR_write)
+-#    define SYS_write __NR_write
++#  if !defined(__pnacl__) && !defined(__native_client__)
++#    include <sys/syscall.h>
++#    if !defined(SYS_write) && defined(__NR_write)
++#      define SYS_write __NR_write
++#    endif
++#    include <sys/uio.h>
+ #  endif
+-#  include <sys/uio.h>
+ #  include <pthread.h>
+ #  include <errno.h>
+ #endif
+ #include <sys/types.h>
+ 
+ #include <limits.h>
+ #ifndef SIZE_T_MAX
+ #  define SIZE_T_MAX	SIZE_MAX
+@@ -275,16 +277,19 @@ static const bool config_ivsalloc =
+ #    define LG_QUANTUM		4
+ #  endif
+ #  ifdef __SH4__
+ #    define LG_QUANTUM		4
+ #  endif
+ #  ifdef __tile__
+ #    define LG_QUANTUM		4
+ #  endif
++#  ifdef __le32__
++#    define LG_QUANTUM		4
++#  endif
+ #  ifndef LG_QUANTUM
+ #    error "No LG_QUANTUM definition for architecture; specify via CPPFLAGS"
+ #  endif
+ #endif
+ 
+ #define	QUANTUM			((size_t)(1U << LG_QUANTUM))
+ #define	QUANTUM_MASK		(QUANTUM - 1)
+ 
+diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in
+--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
++++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
+@@ -153,16 +153,23 @@
+  * memory map holes, much like munmap(2) does.
+  */
+ #undef JEMALLOC_MREMAP
+ 
+ /* TLS is used to map arenas and magazine caches to threads. */
+ #undef JEMALLOC_TLS
+ 
+ /*
++ * ffs()/ffsl() functions to use for bitmapping.  Don't use these directly;
++ * instead, use jemalloc_ffs() or jemalloc_ffsl() from util.h.
++ */
++#undef JEMALLOC_INTERNAL_FFSL
++#undef JEMALLOC_INTERNAL_FFS
++
++/*
+  * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside
+  * within jemalloc-owned chunks before dereferencing them.
+  */
+ #undef JEMALLOC_IVSALLOC
+ 
+ /*
+  * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
+  */
+diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h
+--- a/include/jemalloc/internal/util.h
++++ b/include/jemalloc/internal/util.h
+@@ -104,22 +104,44 @@ void malloc_cprintf(void (*write)(void *
+ void	malloc_printf(const char *format, ...)
+     JEMALLOC_ATTR(format(printf, 1, 2));
+ 
+ #endif /* JEMALLOC_H_EXTERNS */
+ /******************************************************************************/
+ #ifdef JEMALLOC_H_INLINES
+ 
+ #ifndef JEMALLOC_ENABLE_INLINE
++int	jemalloc_ffsl(long bitmap);
++int	jemalloc_ffs(int bitmap);
+ size_t	pow2_ceil(size_t x);
+ void	set_errno(int errnum);
+ int	get_errno(void);
+ #endif
+ 
+ #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_UTIL_C_))
++
++/* Sanity check: */
++#if !defined(JEMALLOC_INTERNAL_FFSL) || !defined(JEMALLOC_INTERNAL_FFS)
++#  error Both JEMALLOC_INTERNAL_FFSL && JEMALLOC_INTERNAL_FFS should have been defined by configure
++#endif
++
++JEMALLOC_ALWAYS_INLINE int
++jemalloc_ffsl(long bitmap)
++{
++
++        return (JEMALLOC_INTERNAL_FFSL(bitmap));
++}
++
++JEMALLOC_ALWAYS_INLINE int
++jemalloc_ffs(int bitmap)
++{
++
++        return (JEMALLOC_INTERNAL_FFS(bitmap));
++}
++
+ /* Compute the smallest power of 2 that is >= x. */
+ JEMALLOC_INLINE size_t
+ pow2_ceil(size_t x)
+ {
+ 
+ 	x--;
+ 	x |= x >> 1;
+ 	x |= x >> 2;
+diff --git a/src/arena.c b/src/arena.c
+--- a/src/arena.c
++++ b/src/arena.c
+@@ -2378,17 +2378,17 @@ bin_info_run_size_calc(arena_bin_info_t 
+ 	/*
+ 	 * Determine redzone size based on minimum alignment and minimum
+ 	 * redzone size.  Add padding to the end of the run if it is needed to
+ 	 * align the regions.  The padding allows each redzone to be half the
+ 	 * minimum alignment; without the padding, each redzone would have to
+ 	 * be twice as large in order to maintain alignment.
+ 	 */
+ 	if (config_fill && opt_redzone) {
+-		size_t align_min = ZU(1) << (ffs(bin_info->reg_size) - 1);
++		size_t align_min = ZU(1) << (jemalloc_ffs(bin_info->reg_size) - 1);
+ 		if (align_min <= REDZONE_MINSIZE) {
+ 			bin_info->redzone_size = REDZONE_MINSIZE;
+ 			pad_size = 0;
+ 		} else {
+ 			bin_info->redzone_size = align_min >> 1;
+ 			pad_size = bin_info->redzone_size;
+ 		}
+ 	} else {
+diff --git a/src/rtree.c b/src/rtree.c
+--- a/src/rtree.c
++++ b/src/rtree.c
+@@ -4,18 +4,18 @@
+ rtree_t *
+ rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc)
+ {
+ 	rtree_t *ret;
+ 	unsigned bits_per_level, bits_in_leaf, height, i;
+ 
+ 	assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3));
+ 
+-	bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
+-	bits_in_leaf = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1;
++	bits_per_level = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
++	bits_in_leaf = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1;
+ 	if (bits > bits_in_leaf) {
+ 		height = 1 + (bits - bits_in_leaf) / bits_per_level;
+ 		if ((height-1) * bits_per_level + bits_in_leaf != bits)
+ 			height++;
+ 	} else {
+ 		height = 1;
+ 	}
+ 	assert((height-1) * bits_per_level + bits_in_leaf >= bits);
--- a/memory/jemalloc/src/configure
+++ b/memory/jemalloc/src/configure
@@ -6940,18 +6940,67 @@ else
   je_cv_function_ffsl=no
 fi
 rm -f core conftest.err conftest.$ac_objext \
     conftest$ac_exeext conftest.$ac_ext
 fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $je_cv_function_ffsl" >&5
 $as_echo "$je_cv_function_ffsl" >&6; }
 
-if test "x${je_cv_function_ffsl}" != "xyes" ; then
-   as_fn_error $? "Cannot build without ffsl(3)" "$LINENO" 5
+if test "x${je_cv_function_ffsl}" == "xyes" ; then
+  $as_echo "#define JEMALLOC_INTERNAL_FFSL ffsl" >>confdefs.h
+
+  $as_echo "#define JEMALLOC_INTERNAL_FFS ffs" >>confdefs.h
+
+else
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether a program using __builtin_ffsl is compilable" >&5
+$as_echo_n "checking whether a program using __builtin_ffsl is compilable... " >&6; }
+if ${je_cv_gcc_builtin_ffsl+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+  #include <stdio.h>
+  #include <strings.h>
+  #include <string.h>
+
+int
+main ()
+{
+
+	{
+		int rv = __builtin_ffsl(0x08);
+		printf("%d\n", rv);
+	}
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  je_cv_gcc_builtin_ffsl=yes
+else
+  je_cv_gcc_builtin_ffsl=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $je_cv_gcc_builtin_ffsl" >&5
+$as_echo "$je_cv_gcc_builtin_ffsl" >&6; }
+
+  if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
+    $as_echo "#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl" >>confdefs.h
+
+    $as_echo "#define JEMALLOC_INTERNAL_FFS __builtin_ffs" >>confdefs.h
+
+  else
+    as_fn_error $? "Cannot build without ffsl(3) or __builtin_ffsl()" "$LINENO" 5
+  fi
 fi
 
 
 
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether atomic(9) is compilable" >&5
 $as_echo_n "checking whether atomic(9) is compilable... " >&6; }
 if ${je_cv_atomic9+:} false; then :
   $as_echo_n "(cached) " >&6
--- a/memory/jemalloc/src/configure.ac
+++ b/memory/jemalloc/src/configure.ac
@@ -1160,31 +1160,51 @@ fi
 AC_SUBST([enable_tls])
 if test "x${enable_tls}" = "x1" ; then
   AC_DEFINE_UNQUOTED([JEMALLOC_TLS], [ ])
 elif test "x${force_tls}" = "x1" ; then
   AC_MSG_ERROR([Failed to configure TLS, which is mandatory for correct function])
 fi
 
 dnl ============================================================================
-dnl Check for ffsl(3), and fail if not found.  This function exists on all
-dnl platforms that jemalloc currently has a chance of functioning on without
-dnl modification.
+dnl Check for ffsl(3), then __builtin_ffsl(), and fail if neither are found.
+dnl One of those two functions should (theoretically) exist on all platforms
+dnl that jemalloc currently has a chance of functioning on without modification.
+dnl We additionally assume ffs() or __builtin_ffs() are defined if
+dnl ffsl() or __builtin_ffsl() are defined, respectively.
 JE_COMPILABLE([a program using ffsl], [
 #include <stdio.h>
 #include <strings.h>
 #include <string.h>
 ], [
 	{
 		int rv = ffsl(0x08);
 		printf("%d\n", rv);
 	}
 ], [je_cv_function_ffsl])
-if test "x${je_cv_function_ffsl}" != "xyes" ; then
-   AC_MSG_ERROR([Cannot build without ffsl(3)])
+if test "x${je_cv_function_ffsl}" == "xyes" ; then
+  AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl])
+  AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs])
+else
+  JE_COMPILABLE([a program using __builtin_ffsl], [
+  #include <stdio.h>
+  #include <strings.h>
+  #include <string.h>
+  ], [
+	{
+		int rv = __builtin_ffsl(0x08);
+		printf("%d\n", rv);
+	}
+  ], [je_cv_gcc_builtin_ffsl])
+  if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
+    AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl])
+    AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs])
+  else
+    AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()])
+  fi
 fi
 
 dnl ============================================================================
 dnl Check for atomic(9) operations as provided on FreeBSD.
 
 JE_COMPILABLE([atomic(9)], [
 #include <sys/types.h>
 #include <machine/atomic.h>
--- a/memory/jemalloc/src/include/jemalloc/internal/arena.h
+++ b/memory/jemalloc/src/include/jemalloc/internal/arena.h
@@ -810,17 +810,17 @@ arena_run_regind(arena_run_t *run, arena
 	 * Avoid doing division with a variable divisor if possible.  Using
 	 * actual division here can reduce allocator throughput by over 20%!
 	 */
 	diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run -
 	    bin_info->reg0_offset);
 
 	/* Rescale (factor powers of 2 out of the numerator and denominator). */
 	interval = bin_info->reg_interval;
-	shift = ffs(interval) - 1;
+	shift = jemalloc_ffs(interval) - 1;
 	diff >>= shift;
 	interval >>= shift;
 
 	if (interval == 1) {
 		/* The divisor was a power of 2. */
 		regind = diff;
 	} else {
 		/*
--- a/memory/jemalloc/src/include/jemalloc/internal/bitmap.h
+++ b/memory/jemalloc/src/include/jemalloc/internal/bitmap.h
@@ -125,21 +125,21 @@ bitmap_sfu(bitmap_t *bitmap, const bitma
 	size_t bit;
 	bitmap_t g;
 	unsigned i;
 
 	assert(bitmap_full(bitmap, binfo) == false);
 
 	i = binfo->nlevels - 1;
 	g = bitmap[binfo->levels[i].group_offset];
-	bit = ffsl(g) - 1;
+	bit = jemalloc_ffsl(g) - 1;
 	while (i > 0) {
 		i--;
 		g = bitmap[binfo->levels[i].group_offset + bit];
-		bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1);
+		bit = (bit << LG_BITMAP_GROUP_NBITS) + (jemalloc_ffsl(g) - 1);
 	}
 
 	bitmap_set(bitmap, binfo, bit);
 	return (bit);
 }
 
 JEMALLOC_INLINE void
 bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
--- a/memory/jemalloc/src/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/memory/jemalloc/src/include/jemalloc/internal/jemalloc_internal.h.in
@@ -9,21 +9,23 @@
 #  define EPERM  ERROR_WRITE_FAULT
 #  define EFAULT ERROR_INVALID_ADDRESS
 #  define ENOMEM ERROR_NOT_ENOUGH_MEMORY
 #  undef ERANGE
 #  define ERANGE ERROR_INVALID_DATA
 #else
 #  include <sys/param.h>
 #  include <sys/mman.h>
-#  include <sys/syscall.h>
-#  if !defined(SYS_write) && defined(__NR_write)
-#    define SYS_write __NR_write
+#  if !defined(__pnacl__) && !defined(__native_client__)
+#    include <sys/syscall.h>
+#    if !defined(SYS_write) && defined(__NR_write)
+#      define SYS_write __NR_write
+#    endif
+#    include <sys/uio.h>
 #  endif
-#  include <sys/uio.h>
 #  include <pthread.h>
 #  include <errno.h>
 #endif
 #include <sys/types.h>
 
 #include <limits.h>
 #ifndef SIZE_T_MAX
 #  define SIZE_T_MAX	SIZE_MAX
@@ -275,16 +277,19 @@ static const bool config_ivsalloc =
 #    define LG_QUANTUM		4
 #  endif
 #  ifdef __SH4__
 #    define LG_QUANTUM		4
 #  endif
 #  ifdef __tile__
 #    define LG_QUANTUM		4
 #  endif
+#  ifdef __le32__
+#    define LG_QUANTUM		4
+#  endif
 #  ifndef LG_QUANTUM
 #    error "No LG_QUANTUM definition for architecture; specify via CPPFLAGS"
 #  endif
 #endif
 
 #define	QUANTUM			((size_t)(1U << LG_QUANTUM))
 #define	QUANTUM_MASK		(QUANTUM - 1)
 
--- a/memory/jemalloc/src/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/memory/jemalloc/src/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -153,16 +153,23 @@
  * memory map holes, much like munmap(2) does.
  */
 #undef JEMALLOC_MREMAP
 
 /* TLS is used to map arenas and magazine caches to threads. */
 #undef JEMALLOC_TLS
 
 /*
+ * ffs()/ffsl() functions to use for bitmapping.  Don't use these directly;
+ * instead, use jemalloc_ffs() or jemalloc_ffsl() from util.h.
+ */
+#undef JEMALLOC_INTERNAL_FFSL
+#undef JEMALLOC_INTERNAL_FFS
+
+/*
  * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside
  * within jemalloc-owned chunks before dereferencing them.
  */
 #undef JEMALLOC_IVSALLOC
 
 /*
  * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
  */
--- a/memory/jemalloc/src/include/jemalloc/internal/util.h
+++ b/memory/jemalloc/src/include/jemalloc/internal/util.h
@@ -104,22 +104,44 @@ void malloc_cprintf(void (*write)(void *
 void	malloc_printf(const char *format, ...)
     JEMALLOC_ATTR(format(printf, 1, 2));
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES
 
 #ifndef JEMALLOC_ENABLE_INLINE
+int	jemalloc_ffsl(long bitmap);
+int	jemalloc_ffs(int bitmap);
 size_t	pow2_ceil(size_t x);
 void	set_errno(int errnum);
 int	get_errno(void);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_UTIL_C_))
+
+/* Sanity check: */
+#if !defined(JEMALLOC_INTERNAL_FFSL) || !defined(JEMALLOC_INTERNAL_FFS)
+#  error Both JEMALLOC_INTERNAL_FFSL && JEMALLOC_INTERNAL_FFS should have been defined by configure
+#endif
+
+JEMALLOC_ALWAYS_INLINE int
+jemalloc_ffsl(long bitmap)
+{
+
+        return (JEMALLOC_INTERNAL_FFSL(bitmap));
+}
+
+JEMALLOC_ALWAYS_INLINE int
+jemalloc_ffs(int bitmap)
+{
+
+        return (JEMALLOC_INTERNAL_FFS(bitmap));
+}
+
 /* Compute the smallest power of 2 that is >= x. */
 JEMALLOC_INLINE size_t
 pow2_ceil(size_t x)
 {
 
 	x--;
 	x |= x >> 1;
 	x |= x >> 2;
--- a/memory/jemalloc/src/src/arena.c
+++ b/memory/jemalloc/src/src/arena.c
@@ -2378,17 +2378,17 @@ bin_info_run_size_calc(arena_bin_info_t 
 	/*
 	 * Determine redzone size based on minimum alignment and minimum
 	 * redzone size.  Add padding to the end of the run if it is needed to
 	 * align the regions.  The padding allows each redzone to be half the
 	 * minimum alignment; without the padding, each redzone would have to
 	 * be twice as large in order to maintain alignment.
 	 */
 	if (config_fill && opt_redzone) {
-		size_t align_min = ZU(1) << (ffs(bin_info->reg_size) - 1);
+		size_t align_min = ZU(1) << (jemalloc_ffs(bin_info->reg_size) - 1);
 		if (align_min <= REDZONE_MINSIZE) {
 			bin_info->redzone_size = REDZONE_MINSIZE;
 			pad_size = 0;
 		} else {
 			bin_info->redzone_size = align_min >> 1;
 			pad_size = bin_info->redzone_size;
 		}
 	} else {
--- a/memory/jemalloc/src/src/rtree.c
+++ b/memory/jemalloc/src/src/rtree.c
@@ -4,18 +4,18 @@
 rtree_t *
 rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc)
 {
 	rtree_t *ret;
 	unsigned bits_per_level, bits_in_leaf, height, i;
 
 	assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3));
 
-	bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
-	bits_in_leaf = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1;
+	bits_per_level = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
+	bits_in_leaf = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1;
 	if (bits > bits_in_leaf) {
 		height = 1 + (bits - bits_in_leaf) / bits_per_level;
 		if ((height-1) * bits_per_level + bits_in_leaf != bits)
 			height++;
 	} else {
 		height = 1;
 	}
 	assert((height-1) * bits_per_level + bits_in_leaf >= bits);
--- a/memory/jemalloc/update.sh
+++ b/memory/jemalloc/update.sh
@@ -12,13 +12,14 @@ cd src
 git checkout "$UPSTREAM_COMMIT"
 autoconf
 git describe --long --abbrev=40 > VERSION
 rm -rf .git .gitignore autom4te.cache
 
 patch -p1 < ../0001-Use-a-configure-test-to-detect-whether-to-use-a-cons.patch
 patch -p1 < ../0002-Use-ULL-prefix-instead-of-LLU-for-unsigned-long-long.patch
 patch -p1 < ../0003-Don-t-use-msvc_compat-s-C99-headers-with-MSVC-versio.patch
+patch -p1 < ../0004-Try-to-use-__builtin_ffsl-if-ffsl-is-unavailable.patch
 
 cd ..
 hg addremove -q src
 
 echo "jemalloc has now been updated.  Don't forget to run hg commit!"