Bug 815071 - Update jemalloc3 to commit 6eb84fb. r=jlebar
author Mike Hommey <mh+mozilla@glandium.org>
Fri, 07 Dec 2012 09:32:23 +0100
changeset 115293 e6f7e0e1257b998a30fb0f7f4354df82ea75521b
parent 115292 143acb93b300318cd1e44a6f9f24ed2aa1811334
child 115294 af6ca1b6b58a4f61fe0e9436d56a52d007f71013
push id 19262
push user mh@glandium.org
push date Fri, 07 Dec 2012 08:33:22 +0000
treeherder mozilla-inbound@7a37a4baac4f
reviewers jlebar
bugs 815071
milestone 20.0a1
Bug 815071 - Update jemalloc3 to commit 6eb84fb. r=jlebar
memory/jemalloc/src/ChangeLog
memory/jemalloc/src/INSTALL
memory/jemalloc/src/Makefile.in
memory/jemalloc/src/VERSION
memory/jemalloc/src/configure
memory/jemalloc/src/configure.ac
memory/jemalloc/src/doc/jemalloc.xml.in
memory/jemalloc/src/include/jemalloc/internal/arena.h
memory/jemalloc/src/include/jemalloc/internal/chunk.h
memory/jemalloc/src/include/jemalloc/internal/chunk_dss.h
memory/jemalloc/src/include/jemalloc/internal/ctl.h
memory/jemalloc/src/include/jemalloc/internal/huge.h
memory/jemalloc/src/include/jemalloc/internal/jemalloc_internal.h.in
memory/jemalloc/src/include/jemalloc/internal/private_namespace.h
memory/jemalloc/src/include/jemalloc/jemalloc.h.in
memory/jemalloc/src/src/arena.c
memory/jemalloc/src/src/base.c
memory/jemalloc/src/src/chunk.c
memory/jemalloc/src/src/chunk_dss.c
memory/jemalloc/src/src/ctl.c
memory/jemalloc/src/src/huge.c
memory/jemalloc/src/src/jemalloc.c
memory/jemalloc/src/src/prof.c
memory/jemalloc/src/src/stats.c
memory/jemalloc/src/src/tcache.c
memory/jemalloc/src/src/zone.c
memory/jemalloc/src/test/ALLOCM_ARENA.c
memory/jemalloc/src/test/ALLOCM_ARENA.exp
memory/jemalloc/src/test/thread_arena.c
memory/jemalloc/upstream.info
--- a/memory/jemalloc/src/ChangeLog
+++ b/memory/jemalloc/src/ChangeLog
@@ -1,30 +1,61 @@
 Following are change highlights associated with official releases.  Important
 bug fixes are all mentioned, but internal enhancements are omitted here for
 brevity (even though they are more fun to write about).  Much more detail can be
 found in the git revision history:
 
     http://www.canonware.com/cgi-bin/gitweb.cgi?p=jemalloc.git
     git://canonware.com/jemalloc.git
 
-* 3.x.x (XXX not yet released)
+* 3.x.x (Not yet released)
+
+  Bug fixes:
+  - Fix "arenas.extend" mallctl to output the number of arenas.
+
+* 3.2.0 (November 9, 2012)
+
+  In addition to a couple of bug fixes, this version modifies page run
+  allocation and dirty page purging algorithms in order to better control
+  page-level virtual memory fragmentation.
+
+  Incompatible changes:
+  - Change the "opt.lg_dirty_mult" default from 5 to 3 (32:1 to 8:1).
+
+  Bug fixes:
+  - Fix dss/mmap allocation precedence code to use recyclable mmap memory only
+    after primary dss allocation fails.
+  - Fix deadlock in the "arenas.purge" mallctl.  This regression was introduced
+    in 3.1.0 by the addition of the "arena.<i>.purge" mallctl.
+
+* 3.1.0 (October 16, 2012)
 
   New features:
   - Auto-detect whether running inside Valgrind, thus removing the need to
     manually specify MALLOC_CONF=valgrind:true.
+  - Add the "arenas.extend" mallctl, which allows applications to create
+    manually managed arenas.
+  - Add the ALLOCM_ARENA() flag for {,r,d}allocm().
+  - Add the "opt.dss", "arena.<i>.dss", and "stats.arenas.<i>.dss" mallctls,
+    which provide control over dss/mmap precedence.
+  - Add the "arena.<i>.purge" mallctl, which obsoletes "arenas.purge".
+  - Define LG_QUANTUM for hppa.
 
   Incompatible changes:
   - Disable tcache by default if running inside Valgrind, in order to avoid
     making unallocated objects appear reachable to Valgrind.
+  - Drop const from malloc_usable_size() argument on Linux.
 
   Bug fixes:
   - Fix heap profiling crash if sampled object is freed via realloc(p, 0).
   - Remove const from __*_hook variable declarations, so that glibc can modify
     them during process forking.
+  - Fix mlockall(2)/madvise(2) interaction.
+  - Fix fork(2)-related deadlocks.
+  - Fix error return value for "thread.tcache.enabled" mallctl.
 
 * 3.0.0 (May 11, 2012)
 
   Although this version adds some major new features, the primary focus is on
   internal code cleanup that facilitates maintainability and portability, most
   of which is not reflected in the ChangeLog.  This is the first release to
   incorporate substantial contributions from numerous other developers, and the
   result is a more broadly useful allocator (see the git revision history for
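
For illustration, a minimal sketch of how an application might combine the 3.1.0 additions noted above -- the "arenas.extend" mallctl and the ALLOCM_ARENA() flag -- assuming a jemalloc build with the experimental API enabled (the configure default) and unprefixed public symbols; error handling is abbreviated:

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
    unsigned arena_ind;
    size_t sz = sizeof(arena_ind);
    void *p;

    /* Create a manually managed arena; the new arena index is returned. */
    if (mallctl("arenas.extend", &arena_ind, &sz, NULL, 0) != 0)
        return 1;

    /* Serve this allocation from that arena rather than the automatic ones. */
    if (allocm(&p, NULL, 4096, ALLOCM_ARENA(arena_ind)) != ALLOCM_SUCCESS)
        return 1;
    printf("allocated 4 KiB from arena %u\n", arena_ind);

    dallocm(p, 0);
    return 0;
}
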
--- a/memory/jemalloc/src/INSTALL
+++ b/memory/jemalloc/src/INSTALL
@@ -50,16 +50,21 @@ any of the following arguments (not a de
 
     This makes it possible to use jemalloc at the same time as the system
     allocator, or even to use multiple copies of jemalloc simultaneously.
 
     By default, the prefix is "", except on OS X, where it is "je_".  On OS X,
     jemalloc overlays the default malloc zone, but makes no attempt to actually
     replace the "malloc", "calloc", etc. symbols.
 
+--without-export
+    Don't export public APIs. This can be useful when building jemalloc as a
+    static library, or to avoid exporting public APIs when using the zone
+    allocator on OSX.
+
 --with-private-namespace=<prefix>
     Prefix all library-private APIs with <prefix>.  For shared libraries,
     symbol visibility mechanisms prevent these symbols from being exported, but
     for static libraries, naming collisions are a real possibility.  By
     default, the prefix is "" (empty string).
 
 --with-install-suffix=<suffix>
     Append <suffix> to the base name of all installed files, such that multiple
--- a/memory/jemalloc/src/Makefile.in
+++ b/memory/jemalloc/src/Makefile.in
@@ -96,19 +96,19 @@ ifneq ($(SOREV),$(SO))
 DSOS += $(objroot)lib/$(LIBJEMALLOC).$(SO)
 endif
 MAN3 := $(objroot)doc/jemalloc$(install_suffix).3
 DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml
 DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.html)
 DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.3)
 DOCS := $(DOCS_HTML) $(DOCS_MAN3)
 CTESTS := $(srcroot)test/aligned_alloc.c $(srcroot)test/allocated.c \
-	$(srcroot)test/bitmap.c $(srcroot)test/mremap.c \
-	$(srcroot)test/posix_memalign.c $(srcroot)test/thread_arena.c \
-	$(srcroot)test/thread_tcache_enabled.c
+	$(srcroot)test/ALLOCM_ARENA.c $(srcroot)test/bitmap.c \
+	$(srcroot)test/mremap.c $(srcroot)test/posix_memalign.c \
+	$(srcroot)test/thread_arena.c $(srcroot)test/thread_tcache_enabled.c
 ifeq ($(enable_experimental), 1)
 CTESTS += $(srcroot)test/allocm.c $(srcroot)test/rallocm.c
 endif
 
 COBJS := $(CSRCS:$(srcroot)%.c=$(objroot)%.$(O))
 CPICOBJS := $(CSRCS:$(srcroot)%.c=$(objroot)%.pic.$(O))
 CTESTOBJS := $(CTESTS:$(srcroot)%.c=$(objroot)%.$(O))
 
--- a/memory/jemalloc/src/VERSION
+++ b/memory/jemalloc/src/VERSION
@@ -1,1 +1,1 @@
-1.0.0-357-gd0ffd8ed4f6aa4cf7248028eddfcb35f93247fe4
+1.0.0-370-g6eb84fbe315add1e1d4f8deedc25d260fff3ae97
--- a/memory/jemalloc/src/configure
+++ b/memory/jemalloc/src/configure
@@ -745,16 +745,17 @@ ac_subst_files=''
 ac_user_opts='
 enable_option_checking
 with_xslroot
 with_rpath
 enable_autogen
 enable_experimental
 with_mangling
 with_jemalloc_prefix
+with_export
 with_private_namespace
 with_install_suffix
 enable_cc_silence
 enable_debug
 enable_stats
 enable_prof
 enable_prof_libunwind
 with_static_libunwind
@@ -1415,16 +1416,17 @@ Optional Features:
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
   --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
   --with-xslroot=<path>   XSL stylesheet root path
   --with-rpath=<rpath>    Colon-separated rpath (ELF systems only)
   --with-mangling=<map>   Mangle symbols in <map>
   --with-jemalloc-prefix=<prefix>
                           Prefix to prepend to all public APIs
+  --without-export        disable exporting jemalloc public APIs
   --with-private-namespace=<prefix>
                           Prefix to prepend to all library-private APIs
   --with-install-suffix=<suffix>
                           Suffix to append to all installed files
   --with-static-libunwind=<libunwind.a>
                           Path to static libunwind library; use rather than
                           dynamically linking
 
@@ -5143,16 +5145,27 @@ for stem in ${public_syms}; do
   m="${JEMALLOC_PREFIX}${stem}"
   cat >>confdefs.h <<_ACEOF
 #define ${n} ${m}
 _ACEOF
 
 done
 
 
+# Check whether --with-export was given.
+if test "${with_export+set}" = set; then :
+  withval=$with_export; if test "x$with_export" = "xno"; then
+  $as_echo "#define JEMALLOC_EXPORT /**/" >>confdefs.h
+
+fi
+]
+fi
+
+
+
 # Check whether --with-private_namespace was given.
 if test "${with_private_namespace+set}" = set; then :
   withval=$with_private_namespace; JEMALLOC_PRIVATE_NAMESPACE="$with_private_namespace"
 else
   JEMALLOC_PRIVATE_NAMESPACE=""
 
 fi
 
--- a/memory/jemalloc/src/configure.ac
+++ b/memory/jemalloc/src/configure.ac
@@ -466,16 +466,23 @@ dnl Generate macros to rename public sym
 dnl with je_ in the source code, so these macro definitions are needed even if
 dnl --with-jemalloc-prefix wasn't specified.
 for stem in ${public_syms}; do
   n="je_${stem}"
   m="${JEMALLOC_PREFIX}${stem}"
   AC_DEFINE_UNQUOTED([${n}], [${m}])
 done
 
+AC_ARG_WITH([export],
+  [AS_HELP_STRING([--without-export], [disable exporting jemalloc public APIs])],
+  [if test "x$with_export" = "xno"; then
+  AC_DEFINE([JEMALLOC_EXPORT],[])]
+fi]
+)
+
 dnl Do not mangle library-private APIs by default.
 AC_ARG_WITH([private_namespace],
   [AS_HELP_STRING([--with-private-namespace=<prefix>], [Prefix to prepend to all library-private APIs])],
   [JEMALLOC_PRIVATE_NAMESPACE="$with_private_namespace"],
   [JEMALLOC_PRIVATE_NAMESPACE=""]
 )
 AC_DEFINE_UNQUOTED([JEMALLOC_PRIVATE_NAMESPACE], ["$JEMALLOC_PRIVATE_NAMESPACE"])
 if test "x$JEMALLOC_PRIVATE_NAMESPACE" != "x" ; then
--- a/memory/jemalloc/src/doc/jemalloc.xml.in
+++ b/memory/jemalloc/src/doc/jemalloc.xml.in
@@ -363,16 +363,25 @@ for (i = 0; i < nbins; i++) {
           </varlistentry>
           <varlistentry>
             <term><constant>ALLOCM_NO_MOVE</constant></term>
 
             <listitem><para>For reallocation, fail rather than moving the
             object.  This constraint can apply to both growth and
             shrinkage.</para></listitem>
           </varlistentry>
+          <varlistentry>
+            <term><constant>ALLOCM_ARENA(<parameter>a</parameter>)
+            </constant></term>
+
+            <listitem><para>Use the arena specified by the index
+            <parameter>a</parameter>.  This macro does not validate that
+            <parameter>a</parameter> specifies an arena in the valid
+            range.</para></listitem>
+          </varlistentry>
         </variablelist>
       </para>
 
       <para>The <function>allocm<parameter/></function> function allocates at
       least <parameter>size</parameter> bytes of memory, sets
       <parameter>*ptr</parameter> to the base address of the allocation, and
       sets <parameter>*rsize</parameter> to the real size of the allocation if
       <parameter>rsize</parameter> is not <constant>NULL</constant>.  Behavior
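
As an aside, the new ALLOCM_ARENA() flag is OR-ed into the same flags word as the existing ALLOCM_* macros; a brief sketch, where arena_ind is assumed to come from a prior "arenas.extend" mallctl call and the helper name is invented for illustration:

#include <stddef.h>
#include <jemalloc/jemalloc.h>

static void *
alloc_from_arena(unsigned arena_ind, size_t *rsize)
{
    void *p;

    /* 64-byte aligned (2^6), zero-filled, served from arena arena_ind. */
    if (allocm(&p, rsize, 1024,
        ALLOCM_LG_ALIGN(6) | ALLOCM_ZERO | ALLOCM_ARENA(arena_ind)) !=
        ALLOCM_SUCCESS)
        return NULL;
    return p;   /* *rsize holds the real (usable) size of the allocation */
}
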
@@ -780,42 +789,56 @@ for (i = 0; i < nbins; i++) {
           <mallctl>opt.lg_chunk</mallctl>
           (<type>size_t</type>)
           <literal>r-</literal>
         </term>
         <listitem><para>Virtual memory chunk size (log base 2).  The default
         chunk size is 4 MiB (2^22).</para></listitem>
       </varlistentry>
 
+      <varlistentry id="opt.dss">
+        <term>
+          <mallctl>opt.dss</mallctl>
+          (<type>const char *</type>)
+          <literal>r-</literal>
+        </term>
+        <listitem><para>dss (<citerefentry><refentrytitle>sbrk</refentrytitle>
+        <manvolnum>2</manvolnum></citerefentry>) allocation precedence as
+        related to <citerefentry><refentrytitle>mmap</refentrytitle>
+        <manvolnum>2</manvolnum></citerefentry> allocation.  The following
+        settings are supported: &ldquo;disabled&rdquo;, &ldquo;primary&rdquo;,
+        and &ldquo;secondary&rdquo; (default).</para></listitem>
+      </varlistentry>
+
       <varlistentry id="opt.narenas">
         <term>
           <mallctl>opt.narenas</mallctl>
           (<type>size_t</type>)
           <literal>r-</literal>
         </term>
-        <listitem><para>Maximum number of arenas to use.  The default maximum
-        number of arenas is four times the number of CPUs, or one if there is a
-        single CPU.</para></listitem>
+        <listitem><para>Maximum number of arenas to use for automatic
+        multiplexing of threads and arenas.  The default is four times the
+        number of CPUs, or one if there is a single CPU.</para></listitem>
       </varlistentry>
 
       <varlistentry id="opt.lg_dirty_mult">
         <term>
           <mallctl>opt.lg_dirty_mult</mallctl>
           (<type>ssize_t</type>)
           <literal>r-</literal>
         </term>
         <listitem><para>Per-arena minimum ratio (log base 2) of active to dirty
         pages.  Some dirty unused pages may be allowed to accumulate, within
         the limit set by the ratio (or one chunk worth of dirty pages,
         whichever is greater), before informing the kernel about some of those
         pages via <citerefentry><refentrytitle>madvise</refentrytitle>
         <manvolnum>2</manvolnum></citerefentry> or a similar system call.  This
         provides the kernel with sufficient information to recycle dirty pages
         if physical memory becomes scarce and the pages remain unused.  The
-        default minimum ratio is 32:1 (2^5:1); an option value of -1 will
+        default minimum ratio is 8:1 (2^3:1); an option value of -1 will
         disable dirty page purging.</para></listitem>
       </varlistentry>
 
       <varlistentry id="opt.stats_print">
         <term>
           <mallctl>opt.stats_print</mallctl>
           (<type>bool</type>)
           <literal>r-</literal>
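
For reference, the opt.* values above are fixed at startup (e.g. via MALLOC_CONF="dss:primary,lg_dirty_mult:5") and can be read back at run time; a small sketch, with the helper name invented and error checking omitted:

#include <stdio.h>
#include <jemalloc/jemalloc.h>

static void
print_opt_defaults(void)
{
    const char *dss;
    ssize_t lg_dirty_mult;
    size_t sz;

    sz = sizeof(dss);
    mallctl("opt.dss", &dss, &sz, NULL, 0);
    sz = sizeof(lg_dirty_mult);
    mallctl("opt.lg_dirty_mult", &lg_dirty_mult, &sz, NULL, 0);
    printf("dss=%s lg_dirty_mult=%zd\n", dss, lg_dirty_mult); /* "secondary", 3 */
}
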
@@ -1144,21 +1167,18 @@ malloc_conf = "xmalloc:true";]]></progra
 
       <varlistentry>
         <term>
           <mallctl>thread.arena</mallctl>
           (<type>unsigned</type>)
           <literal>rw</literal>
         </term>
         <listitem><para>Get or set the arena associated with the calling
-        thread.  The arena index must be less than the maximum number of arenas
-        (see the <link
-        linkend="arenas.narenas"><mallctl>arenas.narenas</mallctl></link>
-        mallctl).  If the specified arena was not initialized beforehand (see
-        the <link
+        thread.  If the specified arena was not initialized beforehand (see the
+        <link
         linkend="arenas.initialized"><mallctl>arenas.initialized</mallctl></link>
         mallctl), it will be automatically initialized as a side effect of
         calling this interface.</para></listitem>
       </varlistentry>
 
       <varlistentry id="thread.allocated">
         <term>
           <mallctl>thread.allocated</mallctl>
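
Illustratively, with the range restriction above removed, a thread can also be bound to an arena created through "arenas.extend"; a hedged sketch, with an invented helper name and error handling omitted:

#include <stddef.h>
#include <jemalloc/jemalloc.h>

/* Create a fresh arena and bind the calling thread to it. */
static unsigned
bind_thread_to_new_arena(void)
{
    unsigned arena_ind;
    size_t sz = sizeof(arena_ind);

    mallctl("arenas.extend", &arena_ind, &sz, NULL, 0);
    mallctl("thread.arena", NULL, NULL, &arena_ind, sizeof(arena_ind));
    /* Subsequent allocations on this thread are served from that arena. */
    return arena_ind;
}
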
@@ -1240,23 +1260,50 @@ malloc_conf = "xmalloc:true";]]></progra
         need not be called, since automatic periodic incremental garbage
         collection occurs, and the thread cache is automatically discarded when
         a thread exits.  However, garbage collection is triggered by allocation
         activity, so it is possible for a thread that stops
         allocating/deallocating to retain its cache indefinitely, in which case
         the developer may find manual flushing useful.</para></listitem>
       </varlistentry>
 
+      <varlistentry id="arena.i.purge">
+        <term>
+          <mallctl>arena.&lt;i&gt;.purge</mallctl>
+          (<type>unsigned</type>)
+          <literal>--</literal>
+        </term>
+        <listitem><para>Purge unused dirty pages for arena &lt;i&gt;, or for
+        all arenas if &lt;i&gt; equals <link
+        linkend="arenas.narenas"><mallctl>arenas.narenas</mallctl></link>.
+        </para></listitem>
+      </varlistentry>
+
+      <varlistentry id="arena.i.dss">
+        <term>
+          <mallctl>arena.&lt;i&gt;.dss</mallctl>
+          (<type>const char *</type>)
+          <literal>rw</literal>
+        </term>
+        <listitem><para>Set the precedence of dss allocation as related to mmap
+        allocation for arena &lt;i&gt;, or for all arenas if &lt;i&gt; equals
+        <link
+        linkend="arenas.narenas"><mallctl>arenas.narenas</mallctl></link>.  See
+        <link linkend="opt.dss"><mallctl>opt.dss</mallctl></link> for supported
+        settings.
+        </para></listitem>
+      </varlistentry>
+
       <varlistentry id="arenas.narenas">
         <term>
           <mallctl>arenas.narenas</mallctl>
           (<type>unsigned</type>)
           <literal>r-</literal>
         </term>
-        <listitem><para>Maximum number of arenas.</para></listitem>
+        <listitem><para>Current limit on number of arenas.</para></listitem>
       </varlistentry>
 
       <varlistentry id="arenas.initialized">
         <term>
           <mallctl>arenas.initialized</mallctl>
           (<type>bool *</type>)
           <literal>r-</literal>
         </term>
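
For illustration, the per-arena mallctls can be driven either by formatting the index into the name or through the MIB interface; a sketch for arenas 0 and 1, with an invented helper name and error handling omitted:

#include <stddef.h>
#include <jemalloc/jemalloc.h>

static void
purge_and_tune_arenas(void)
{
    const char *dss = "primary";
    size_t mib[3];
    size_t miblen = sizeof(mib) / sizeof(mib[0]);

    /* Prefer dss (sbrk) over mmap for arena 0's future chunk allocations. */
    mallctl("arena.0.dss", NULL, NULL, &dss, sizeof(dss));

    /* Purge arena 0's unused dirty pages by name... */
    mallctl("arena.0.purge", NULL, NULL, NULL, 0);

    /* ...and arena 1's via a pre-translated MIB, handy when looping. */
    mallctlnametomib("arena.0.purge", mib, &miblen);
    mib[1] = 1;                 /* retarget the index component at arena 1 */
    mallctlbymib(mib, miblen, NULL, NULL, NULL, 0);
}
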
@@ -1365,16 +1412,26 @@ malloc_conf = "xmalloc:true";]]></progra
           <mallctl>arenas.purge</mallctl>
           (<type>unsigned</type>)
           <literal>-w</literal>
         </term>
         <listitem><para>Purge unused dirty pages for the specified arena, or
         for all arenas if none is specified.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term>
+          <mallctl>arenas.extend</mallctl>
+          (<type>unsigned</type>)
+          <literal>r-</literal>
+        </term>
+        <listitem><para>Extend the array of arenas by appending a new arena,
+        and returning the new arena index.</para></listitem>
+      </varlistentry>
+
       <varlistentry id="prof.active">
         <term>
           <mallctl>prof.active</mallctl>
           (<type>bool</type>)
           <literal>rw</literal>
           [<option>--enable-prof</option>]
         </term>
         <listitem><para>Control whether sampling is currently active.  See the
@@ -1450,17 +1507,19 @@ malloc_conf = "xmalloc:true";]]></progra
           (<type>size_t</type>)
           <literal>r-</literal>
           [<option>--enable-stats</option>]
         </term>
         <listitem><para>Total number of bytes in active pages allocated by the
         application.  This is a multiple of the page size, and greater than or
         equal to <link
         linkend="stats.allocated"><mallctl>stats.allocated</mallctl></link>.
-        </para></listitem>
+        This does not include <link linkend="stats.arenas.i.pdirty">
+        <mallctl>stats.arenas.&lt;i&gt;.pdirty</mallctl></link> and pages
+        entirely devoted to allocator metadata.</para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term>
           <mallctl>stats.mapped</mallctl>
           (<type>size_t</type>)
           <literal>r-</literal>
           [<option>--enable-stats</option>]
@@ -1535,16 +1594,30 @@ malloc_conf = "xmalloc:true";]]></progra
           [<option>--enable-stats</option>]
         </term>
         <listitem><para>Cumulative number of huge deallocation requests.
         </para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term>
+          <mallctl>stats.arenas.&lt;i&gt;.dss</mallctl>
+          (<type>const char *</type>)
+          <literal>r-</literal>
+        </term>
+        <listitem><para>dss (<citerefentry><refentrytitle>sbrk</refentrytitle>
+        <manvolnum>2</manvolnum></citerefentry>) allocation precedence as
+        related to <citerefentry><refentrytitle>mmap</refentrytitle>
+        <manvolnum>2</manvolnum></citerefentry> allocation.  See <link
+        linkend="opt.dss"><mallctl>opt.dss</mallctl></link> for details.
+        </para></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term>
           <mallctl>stats.arenas.&lt;i&gt;.nthreads</mallctl>
           (<type>unsigned</type>)
           <literal>r-</literal>
         </term>
         <listitem><para>Number of threads currently assigned to
         arena.</para></listitem>
       </varlistentry>
 
@@ -1552,17 +1625,17 @@ malloc_conf = "xmalloc:true";]]></progra
         <term>
           <mallctl>stats.arenas.&lt;i&gt;.pactive</mallctl>
           (<type>size_t</type>)
           <literal>r-</literal>
         </term>
         <listitem><para>Number of pages in active runs.</para></listitem>
       </varlistentry>
 
-      <varlistentry>
+      <varlistentry id="stats.arenas.i.pdirty">
         <term>
           <mallctl>stats.arenas.&lt;i&gt;.pdirty</mallctl>
           (<type>size_t</type>)
           <literal>r-</literal>
         </term>
         <listitem><para>Number of pages within unused runs that are potentially
         dirty, and for which <function>madvise<parameter>...</parameter>
         <parameter><constant>MADV_DONTNEED</constant></parameter></function> or
--- a/memory/jemalloc/src/include/jemalloc/internal/arena.h
+++ b/memory/jemalloc/src/include/jemalloc/internal/arena.h
@@ -33,20 +33,20 @@
  */
 #define	REDZONE_MINSIZE		16
 
 /*
  * The minimum ratio of active:dirty pages per arena is computed as:
  *
  *   (nactive >> opt_lg_dirty_mult) >= ndirty
  *
- * So, supposing that opt_lg_dirty_mult is 5, there can be no less than 32
- * times as many active pages as dirty pages.
+ * So, supposing that opt_lg_dirty_mult is 3, there can be no less than 8 times
+ * as many active pages as dirty pages.
  */
-#define	LG_DIRTY_MULT_DEFAULT	5
+#define	LG_DIRTY_MULT_DEFAULT	3
 
 typedef struct arena_chunk_map_s arena_chunk_map_t;
 typedef struct arena_chunk_s arena_chunk_t;
 typedef struct arena_run_s arena_run_t;
 typedef struct arena_bin_info_s arena_bin_info_t;
 typedef struct arena_bin_s arena_bin_t;
 typedef struct arena_s arena_t;
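
To make the new ratio concrete, a small stand-alone sketch of the threshold test the comment above describes (the helper is invented for illustration and is not jemalloc's internal code; the real allocator additionally tolerates up to one chunk's worth of dirty pages):

#include <stdbool.h>
#include <stddef.h>
#include <sys/types.h>

/*
 * With lg_dirty_mult == 3 (the new default), purging is warranted once
 * dirty pages exceed 1/8 of active pages; -1 disables purging entirely.
 */
static bool
purge_warranted(size_t nactive, size_t ndirty, ssize_t lg_dirty_mult)
{
    if (lg_dirty_mult < 0)
        return false;
    return (nactive >> lg_dirty_mult) < ndirty;
}
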
 
@@ -64,17 +64,17 @@ struct arena_chunk_map_s {
 	 * being just a fixed cost.
 	 */
 	union {
 #endif
 	union {
 		/*
 		 * Linkage for run trees.  There are two disjoint uses:
 		 *
-		 * 1) arena_t's runs_avail_{clean,dirty} trees.
+		 * 1) arena_t's runs_avail tree.
 		 * 2) arena_run_t conceptually uses this linkage for in-use
 		 *    non-full runs, rather than directly embedding linkage.
 		 */
 		rb_node(arena_chunk_map_t)	rb_link;
 		/*
 		 * List of runs currently in purgatory.  arena_chunk_purge()
 		 * temporarily allocates runs that contain dirty pages while
 		 * purging, so that other threads cannot use the runs while the
@@ -157,38 +157,42 @@ struct arena_chunk_map_s {
 #define	CHUNK_MAP_KEY		CHUNK_MAP_ALLOCATED
 };
 typedef rb_tree(arena_chunk_map_t) arena_avail_tree_t;
 typedef rb_tree(arena_chunk_map_t) arena_run_tree_t;
 
 /* Arena chunk header. */
 struct arena_chunk_s {
 	/* Arena that owns the chunk. */
-	arena_t		*arena;
+	arena_t			*arena;
+
+	/* Linkage for tree of arena chunks that contain dirty runs. */
+	rb_node(arena_chunk_t)	dirty_link;
 
-	/* Linkage for the arena's chunks_dirty list. */
-	ql_elm(arena_chunk_t) link_dirty;
+	/* Number of dirty pages. */
+	size_t			ndirty;
+
+	/* Number of available runs. */
+	size_t			nruns_avail;
 
 	/*
-	 * True if the chunk is currently in the chunks_dirty list, due to
-	 * having at some point contained one or more dirty pages.  Removal
-	 * from chunks_dirty is lazy, so (dirtied && ndirty == 0) is possible.
-	 */
-	bool		dirtied;
-
-	/* Number of dirty pages. */
-	size_t		ndirty;
+	 * Number of available run adjacencies.  Clean and dirty available runs
+	 * are not coalesced, which causes virtual memory fragmentation.  The
+	 * ratio of (nruns_avail-nruns_adjac):nruns_adjac is used for tracking
+	 * this fragmentation.
+	 */
+	size_t			nruns_adjac;
 
 	/*
 	 * Map of pages within chunk that keeps track of free/large/small.  The
 	 * first map_bias entries are omitted, since the chunk header does not
 	 * need to be tracked in the map.  This omission saves a header page
 	 * for common chunk sizes (e.g. 4 MiB).
 	 */
-	arena_chunk_map_t map[1]; /* Dynamically sized. */
+	arena_chunk_map_t	map[1]; /* Dynamically sized. */
 };
 typedef rb_tree(arena_chunk_t) arena_chunk_tree_t;
 
 struct arena_run_s {
 	/* Bin this run is associated with. */
 	arena_bin_t	*bin;
 
 	/* Index of next region that has never been allocated, or nregs. */
@@ -326,18 +330,20 @@ struct arena_s {
 	/*
 	 * List of tcaches for extant threads associated with this arena.
 	 * Stats from these are merged incrementally, and at exit.
 	 */
 	ql_head(tcache_t)	tcache_ql;
 
 	uint64_t		prof_accumbytes;
 
-	/* List of dirty-page-containing chunks this arena manages. */
-	ql_head(arena_chunk_t)	chunks_dirty;
+	dss_prec_t		dss_prec;
+
+	/* Tree of dirty-page-containing chunks this arena manages. */
+	arena_chunk_tree_t	chunks_dirty;
 
 	/*
 	 * In order to avoid rapid chunk allocation/deallocation when an arena
 	 * oscillates right on the cusp of needing a new chunk, cache the most
 	 * recently freed chunk.  The spare is left in the arena's chunk trees
 	 * until it is deleted.
 	 *
 	 * There is one spare chunk per arena, rather than one spare total, in
@@ -362,28 +368,19 @@ struct arena_s {
 	 * multiple threads to purge dirty pages concurrently, and they use
 	 * npurgatory to indicate the total number of pages all threads are
 	 * attempting to purge.
 	 */
 	size_t			npurgatory;
 
 	/*
 	 * Size/address-ordered trees of this arena's available runs.  The trees
-	 * are used for first-best-fit run allocation.  The dirty tree contains
-	 * runs with dirty pages (i.e. very likely to have been touched and
-	 * therefore have associated physical pages), whereas the clean tree
-	 * contains runs with pages that either have no associated physical
-	 * pages, or have pages that the kernel may recycle at any time due to
-	 * previous madvise(2) calls.  The dirty tree is used in preference to
-	 * the clean tree for allocations, because using dirty pages reduces
-	 * the amount of dirty purging necessary to keep the active:dirty page
-	 * ratio below the purge threshold.
+	 * are used for first-best-fit run allocation.
 	 */
-	arena_avail_tree_t	runs_avail_clean;
-	arena_avail_tree_t	runs_avail_dirty;
+	arena_avail_tree_t	runs_avail;
 
 	/* bins is used to store trees of free regions. */
 	arena_bin_t		bins[NBINS];
 };
 
 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS
@@ -398,17 +395,16 @@ extern uint8_t const	small_size2bin[];
 #define	SMALL_SIZE2BIN(s)	(small_size2bin[(s-1) >> LG_TINY_MIN])
 
 extern arena_bin_info_t	arena_bin_info[NBINS];
 
 /* Number of large size classes. */
 #define			nlclasses (chunk_npages - map_bias)
 
 void	arena_purge_all(arena_t *arena);
-void	arena_prof_accum(arena_t *arena, uint64_t accumbytes);
 void	arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin,
     size_t binind, uint64_t prof_accumbytes);
 void	arena_alloc_junk_small(void *ptr, arena_bin_info_t *bin_info,
     bool zero);
 void	arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info);
 void	*arena_malloc_small(arena_t *arena, size_t size, bool zero);
 void	*arena_malloc_large(arena_t *arena, size_t size, bool zero);
 void	*arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero);
@@ -417,23 +413,26 @@ void	arena_dalloc_bin_locked(arena_t *ar
     arena_chunk_map_t *mapelm);
 void	arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr,
     size_t pageind, arena_chunk_map_t *mapelm);
 void	arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
     size_t pageind);
 void	arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk,
     void *ptr);
 void	arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr);
-void	arena_stats_merge(arena_t *arena, size_t *nactive, size_t *ndirty,
-    arena_stats_t *astats, malloc_bin_stats_t *bstats,
-    malloc_large_stats_t *lstats);
 void	*arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
     size_t extra, bool zero);
-void	*arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
-    size_t alignment, bool zero, bool try_tcache);
+void	*arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size,
+    size_t extra, size_t alignment, bool zero, bool try_tcache_alloc,
+    bool try_tcache_dalloc);
+dss_prec_t	arena_dss_prec_get(arena_t *arena);
+void	arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
+void	arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive,
+    size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats,
+    malloc_large_stats_t *lstats);
 bool	arena_new(arena_t *arena, unsigned ind);
 void	arena_boot(void);
 void	arena_prefork(arena_t *arena);
 void	arena_postfork_parent(arena_t *arena);
 void	arena_postfork_child(arena_t *arena);
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
@@ -459,16 +458,19 @@ void	arena_mapbits_unallocated_size_set(
 void	arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind,
     size_t size, size_t flags);
 void	arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind,
     size_t binind);
 void	arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind,
     size_t runind, size_t binind, size_t flags);
 void	arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind,
     size_t unzeroed);
+void	arena_prof_accum_impl(arena_t *arena, uint64_t accumbytes);
+void	arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes);
+void	arena_prof_accum(arena_t *arena, uint64_t accumbytes);
 size_t	arena_ptr_small_binind_get(const void *ptr, size_t mapbits);
 size_t	arena_bin_index(arena_t *arena, arena_bin_t *bin);
 unsigned	arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info,
     const void *ptr);
 prof_ctx_t	*arena_prof_ctx_get(const void *ptr);
 void	arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
 void	*arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache);
 size_t	arena_salloc(const void *ptr, bool demote);
@@ -656,16 +658,54 @@ arena_mapbits_unzeroed_set(arena_chunk_t
     size_t unzeroed)
 {
 	size_t *mapbitsp;
 
 	mapbitsp = arena_mapbitsp_get(chunk, pageind);
 	*mapbitsp = (*mapbitsp & ~CHUNK_MAP_UNZEROED) | unzeroed;
 }
 
+JEMALLOC_INLINE void
+arena_prof_accum_impl(arena_t *arena, uint64_t accumbytes)
+{
+
+	cassert(config_prof);
+	assert(prof_interval != 0);
+
+	arena->prof_accumbytes += accumbytes;
+	if (arena->prof_accumbytes >= prof_interval) {
+		prof_idump();
+		arena->prof_accumbytes -= prof_interval;
+	}
+}
+
+JEMALLOC_INLINE void
+arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes)
+{
+
+	cassert(config_prof);
+
+	if (prof_interval == 0)
+		return;
+	arena_prof_accum_impl(arena, accumbytes);
+}
+
+JEMALLOC_INLINE void
+arena_prof_accum(arena_t *arena, uint64_t accumbytes)
+{
+
+	cassert(config_prof);
+
+	if (prof_interval == 0)
+		return;
+	malloc_mutex_lock(&arena->lock);
+	arena_prof_accum_impl(arena, accumbytes);
+	malloc_mutex_unlock(&arena->lock);
+}
+
 JEMALLOC_INLINE size_t
 arena_ptr_small_binind_get(const void *ptr, size_t mapbits)
 {
 	size_t binind;
 
 	binind = (mapbits & CHUNK_MAP_BININD_MASK) >> CHUNK_MAP_BININD_SHIFT;
 
 	if (config_debug) {
--- a/memory/jemalloc/src/include/jemalloc/internal/chunk.h
+++ b/memory/jemalloc/src/include/jemalloc/internal/chunk.h
@@ -23,31 +23,34 @@
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS
 
 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS
 
 extern size_t		opt_lg_chunk;
+extern const char	*opt_dss;
 
 /* Protects stats_chunks; currently not used for any other purpose. */
 extern malloc_mutex_t	chunks_mtx;
 /* Chunk statistics. */
 extern chunk_stats_t	stats_chunks;
 
 extern rtree_t		*chunks_rtree;
 
 extern size_t		chunksize;
 extern size_t		chunksize_mask; /* (chunksize - 1). */
 extern size_t		chunk_npages;
 extern size_t		map_bias; /* Number of arena chunk header pages. */
 extern size_t		arena_maxclass; /* Max size class for arenas. */
 
-void	*chunk_alloc(size_t size, size_t alignment, bool base, bool *zero);
+void	*chunk_alloc(size_t size, size_t alignment, bool base, bool *zero,
+    dss_prec_t dss_prec);
+void	chunk_unmap(void *chunk, size_t size);
 void	chunk_dealloc(void *chunk, size_t size, bool unmap);
 bool	chunk_boot(void);
 void	chunk_prefork(void);
 void	chunk_postfork_parent(void);
 void	chunk_postfork_child(void);
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
--- a/memory/jemalloc/src/include/jemalloc/internal/chunk_dss.h
+++ b/memory/jemalloc/src/include/jemalloc/internal/chunk_dss.h
@@ -1,19 +1,33 @@
 /******************************************************************************/
 #ifdef JEMALLOC_H_TYPES
 
+typedef enum {
+	dss_prec_disabled  = 0,
+	dss_prec_primary   = 1,
+	dss_prec_secondary = 2,
+
+	dss_prec_limit     = 3
+} dss_prec_t;
+#define	DSS_PREC_DEFAULT	dss_prec_secondary
+#define	DSS_DEFAULT		"secondary"
+
 #endif /* JEMALLOC_H_TYPES */
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS
 
+extern const char *dss_prec_names[];
+
 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS
 
+dss_prec_t	chunk_dss_prec_get(void);
+bool	chunk_dss_prec_set(dss_prec_t dss_prec);
 void	*chunk_alloc_dss(size_t size, size_t alignment, bool *zero);
 bool	chunk_in_dss(void *chunk);
 bool	chunk_dss_boot(void);
 void	chunk_dss_prefork(void);
 void	chunk_dss_postfork_parent(void);
 void	chunk_dss_postfork_child(void);
 
 #endif /* JEMALLOC_H_EXTERNS */
--- a/memory/jemalloc/src/include/jemalloc/internal/ctl.h
+++ b/memory/jemalloc/src/include/jemalloc/internal/ctl.h
@@ -28,16 +28,17 @@ struct ctl_named_node_s {
 struct ctl_indexed_node_s {
 	struct ctl_node_s	node;
 	const ctl_named_node_t	*(*index)(const size_t *, size_t, size_t);
 };
 
 struct ctl_arena_stats_s {
 	bool			initialized;
 	unsigned		nthreads;
+	const char		*dss;
 	size_t			pactive;
 	size_t			pdirty;
 	arena_stats_t		astats;
 
 	/* Aggregate stats for small size classes, based on bin stats. */
 	size_t			allocated_small;
 	uint64_t		nmalloc_small;
 	uint64_t		ndalloc_small;
@@ -56,16 +57,17 @@ struct ctl_stats_s {
 		uint64_t	total;		/* stats_chunks.nchunks */
 		size_t		high;		/* stats_chunks.highchunks */
 	} chunks;
 	struct {
 		size_t		allocated;	/* huge_allocated */
 		uint64_t	nmalloc;	/* huge_nmalloc */
 		uint64_t	ndalloc;	/* huge_ndalloc */
 	} huge;
+	unsigned		narenas;
 	ctl_arena_stats_t	*arenas;	/* (narenas + 1) elements. */
 };
 
 #endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS
 
 int	ctl_byname(const char *name, void *oldp, size_t *oldlenp, void *newp,
--- a/memory/jemalloc/src/include/jemalloc/internal/huge.h
+++ b/memory/jemalloc/src/include/jemalloc/internal/huge.h
@@ -17,17 +17,17 @@ extern size_t		huge_allocated;
 /* Protects chunk-related data structures. */
 extern malloc_mutex_t	huge_mtx;
 
 void	*huge_malloc(size_t size, bool zero);
 void	*huge_palloc(size_t size, size_t alignment, bool zero);
 void	*huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
     size_t extra);
 void	*huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
-    size_t alignment, bool zero);
+    size_t alignment, bool zero, bool try_tcache_dalloc);
 void	huge_dalloc(void *ptr, bool unmap);
 size_t	huge_salloc(const void *ptr);
 prof_ctx_t	*huge_prof_ctx_get(const void *ptr);
 void	huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
 bool	huge_boot(void);
 void	huge_prefork(void);
 void	huge_postfork_parent(void);
 void	huge_postfork_child(void);
--- a/memory/jemalloc/src/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/memory/jemalloc/src/include/jemalloc/internal/jemalloc_internal.h.in
@@ -509,23 +509,29 @@ extern bool	opt_utrace;
 extern bool	opt_valgrind;
 extern bool	opt_xmalloc;
 extern bool	opt_zero;
 extern size_t	opt_narenas;
 
 /* Number of CPUs. */
 extern unsigned		ncpus;
 
-extern malloc_mutex_t	arenas_lock; /* Protects arenas initialization. */
+/* Protects arenas initialization (arenas, arenas_total). */
+extern malloc_mutex_t	arenas_lock;
 /*
  * Arenas that are used to service external requests.  Not all elements of the
  * arenas array are necessarily used; arenas are created lazily as needed.
+ *
+ * arenas[0..narenas_auto) are used for automatic multiplexing of threads and
+ * arenas.  arenas[narenas_auto..narenas_total) are only used if the application
+ * takes some action to create them and allocate from them.
  */
 extern arena_t		**arenas;
-extern unsigned		narenas;
+extern unsigned		narenas_total;
+extern unsigned		narenas_auto; /* Read-only after initialization. */
 
 arena_t	*arenas_extend(unsigned ind);
 void	arenas_cleanup(void *arg);
 arena_t	*choose_arena_hard(void);
 void	jemalloc_prefork(void);
 void	jemalloc_postfork_parent(void);
 void	jemalloc_postfork_child(void);
 
@@ -570,16 +576,17 @@ void	jemalloc_postfork_child(void);
 #include "jemalloc/internal/chunk.h"
 #include "jemalloc/internal/huge.h"
 
 #ifndef JEMALLOC_ENABLE_INLINE
 malloc_tsd_protos(JEMALLOC_ATTR(unused), arenas, arena_t *)
 
 size_t	s2u(size_t size);
 size_t	sa2u(size_t size, size_t alignment);
+unsigned	narenas_total_get(void);
 arena_t	*choose_arena(arena_t *arena);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
 /*
  * Map of pthread_self() --> arenas[???], used for selecting an arena to use
  * for allocations.
  */
@@ -674,16 +681,28 @@ sa2u(size_t size, size_t alignment)
 		 */
 		run_size = usize + alignment - PAGE;
 		if (run_size <= arena_maxclass)
 			return (PAGE_CEILING(usize));
 		return (CHUNK_CEILING(usize));
 	}
 }
 
+JEMALLOC_INLINE unsigned
+narenas_total_get(void)
+{
+	unsigned narenas;
+
+	malloc_mutex_lock(&arenas_lock);
+	narenas = narenas_total;
+	malloc_mutex_unlock(&arenas_lock);
+
+	return (narenas);
+}
+
 /* Choose an arena based on a per-thread value. */
 JEMALLOC_INLINE arena_t *
 choose_arena(arena_t *arena)
 {
 	arena_t *ret;
 
 	if (arena != NULL)
 		return (arena);
@@ -709,77 +728,108 @@ choose_arena(arena_t *arena)
 #include "jemalloc/internal/tcache.h"
 #define	JEMALLOC_ARENA_INLINE_B
 #include "jemalloc/internal/arena.h"
 #undef JEMALLOC_ARENA_INLINE_B
 #include "jemalloc/internal/hash.h"
 #include "jemalloc/internal/quarantine.h"
 
 #ifndef JEMALLOC_ENABLE_INLINE
+void	*imallocx(size_t size, bool try_tcache, arena_t *arena);
 void	*imalloc(size_t size);
+void	*icallocx(size_t size, bool try_tcache, arena_t *arena);
 void	*icalloc(size_t size);
+void	*ipallocx(size_t usize, size_t alignment, bool zero, bool try_tcache,
+    arena_t *arena);
 void	*ipalloc(size_t usize, size_t alignment, bool zero);
 size_t	isalloc(const void *ptr, bool demote);
 size_t	ivsalloc(const void *ptr, bool demote);
 size_t	u2rz(size_t usize);
 size_t	p2rz(const void *ptr);
+void	idallocx(void *ptr, bool try_tcache);
 void	idalloc(void *ptr);
+void	iqallocx(void *ptr, bool try_tcache);
 void	iqalloc(void *ptr);
+void	*irallocx(void *ptr, size_t size, size_t extra, size_t alignment,
+    bool zero, bool no_move, bool try_tcache_alloc, bool try_tcache_dalloc,
+    arena_t *arena);
 void	*iralloc(void *ptr, size_t size, size_t extra, size_t alignment,
     bool zero, bool no_move);
 malloc_tsd_protos(JEMALLOC_ATTR(unused), thread_allocated, thread_allocated_t)
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
 JEMALLOC_INLINE void *
-imalloc(size_t size)
+imallocx(size_t size, bool try_tcache, arena_t *arena)
 {
 
 	assert(size != 0);
 
 	if (size <= arena_maxclass)
-		return (arena_malloc(NULL, size, false, true));
+		return (arena_malloc(arena, size, false, try_tcache));
 	else
 		return (huge_malloc(size, false));
 }
 
 JEMALLOC_INLINE void *
+imalloc(size_t size)
+{
+
+	return (imallocx(size, true, NULL));
+}
+
+JEMALLOC_INLINE void *
+icallocx(size_t size, bool try_tcache, arena_t *arena)
+{
+
+	if (size <= arena_maxclass)
+		return (arena_malloc(arena, size, true, try_tcache));
+	else
+		return (huge_malloc(size, true));
+}
+
+JEMALLOC_INLINE void *
 icalloc(size_t size)
 {
 
-	if (size <= arena_maxclass)
-		return (arena_malloc(NULL, size, true, true));
-	else
-		return (huge_malloc(size, true));
+	return (icallocx(size, true, NULL));
 }
 
 JEMALLOC_INLINE void *
-ipalloc(size_t usize, size_t alignment, bool zero)
+ipallocx(size_t usize, size_t alignment, bool zero, bool try_tcache,
+    arena_t *arena)
 {
 	void *ret;
 
 	assert(usize != 0);
 	assert(usize == sa2u(usize, alignment));
 
 	if (usize <= arena_maxclass && alignment <= PAGE)
-		ret = arena_malloc(NULL, usize, zero, true);
+		ret = arena_malloc(arena, usize, zero, try_tcache);
 	else {
 		if (usize <= arena_maxclass) {
-			ret = arena_palloc(choose_arena(NULL), usize, alignment,
-			    zero);
+			ret = arena_palloc(choose_arena(arena), usize,
+			    alignment, zero);
 		} else if (alignment <= chunksize)
 			ret = huge_malloc(usize, zero);
 		else
 			ret = huge_palloc(usize, alignment, zero);
 	}
 
 	assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret);
 	return (ret);
 }
 
+JEMALLOC_INLINE void *
+ipalloc(size_t usize, size_t alignment, bool zero)
+{
+
+	return (ipallocx(usize, alignment, zero, true, NULL));
+}
+
 /*
  * Typical usage:
  *   void *ptr = [...]
  *   size_t sz = isalloc(ptr, config_prof);
  */
 JEMALLOC_INLINE size_t
 isalloc(const void *ptr, bool demote)
 {
@@ -828,42 +878,56 @@ JEMALLOC_INLINE size_t
 p2rz(const void *ptr)
 {
 	size_t usize = isalloc(ptr, false);
 
 	return (u2rz(usize));
 }
 
 JEMALLOC_INLINE void
-idalloc(void *ptr)
+idallocx(void *ptr, bool try_tcache)
 {
 	arena_chunk_t *chunk;
 
 	assert(ptr != NULL);
 
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	if (chunk != ptr)
-		arena_dalloc(chunk->arena, chunk, ptr, true);
+		arena_dalloc(chunk->arena, chunk, ptr, try_tcache);
 	else
 		huge_dalloc(ptr, true);
 }
 
 JEMALLOC_INLINE void
+idalloc(void *ptr)
+{
+
+	idallocx(ptr, true);
+}
+
+JEMALLOC_INLINE void
+iqallocx(void *ptr, bool try_tcache)
+{
+
+	if (config_fill && opt_quarantine)
+		quarantine(ptr);
+	else
+		idallocx(ptr, try_tcache);
+}
+
+JEMALLOC_INLINE void
 iqalloc(void *ptr)
 {
 
-	if (config_fill && opt_quarantine)
-		quarantine(ptr);
-	else
-		idalloc(ptr);
+	iqallocx(ptr, true);
 }
 
 JEMALLOC_INLINE void *
-iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
-    bool no_move)
+irallocx(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
+    bool no_move, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena)
 {
 	void *ret;
 	size_t oldsize;
 
 	assert(ptr != NULL);
 	assert(size != 0);
 
 	oldsize = isalloc(ptr, config_prof);
@@ -876,58 +940,69 @@ iralloc(void *ptr, size_t size, size_t e
 		 * Existing object alignment is inadequate; allocate new space
 		 * and copy.
 		 */
 		if (no_move)
 			return (NULL);
 		usize = sa2u(size + extra, alignment);
 		if (usize == 0)
 			return (NULL);
-		ret = ipalloc(usize, alignment, zero);
+		ret = ipallocx(usize, alignment, zero, try_tcache_alloc, arena);
 		if (ret == NULL) {
 			if (extra == 0)
 				return (NULL);
 			/* Try again, without extra this time. */
 			usize = sa2u(size, alignment);
 			if (usize == 0)
 				return (NULL);
-			ret = ipalloc(usize, alignment, zero);
+			ret = ipallocx(usize, alignment, zero, try_tcache_alloc,
+			    arena);
 			if (ret == NULL)
 				return (NULL);
 		}
 		/*
 		 * Copy at most size bytes (not size+extra), since the caller
 		 * has no expectation that the extra bytes will be reliably
 		 * preserved.
 		 */
 		copysize = (size < oldsize) ? size : oldsize;
 		memcpy(ret, ptr, copysize);
-		iqalloc(ptr);
+		iqallocx(ptr, try_tcache_dalloc);
 		return (ret);
 	}
 
 	if (no_move) {
 		if (size <= arena_maxclass) {
 			return (arena_ralloc_no_move(ptr, oldsize, size,
 			    extra, zero));
 		} else {
 			return (huge_ralloc_no_move(ptr, oldsize, size,
 			    extra));
 		}
 	} else {
 		if (size + extra <= arena_maxclass) {
-			return (arena_ralloc(ptr, oldsize, size, extra,
-			    alignment, zero, true));
+			return (arena_ralloc(arena, ptr, oldsize, size, extra,
+			    alignment, zero, try_tcache_alloc,
+			    try_tcache_dalloc));
 		} else {
 			return (huge_ralloc(ptr, oldsize, size, extra,
-			    alignment, zero));
+			    alignment, zero, try_tcache_dalloc));
 		}
 	}
 }
 
+JEMALLOC_INLINE void *
+iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
+    bool no_move)
+{
+
+	return (irallocx(ptr, size, extra, alignment, zero, no_move, true, true,
+	    NULL));
+}
+
 malloc_tsd_externs(thread_allocated, thread_allocated_t)
 malloc_tsd_funcs(JEMALLOC_INLINE, thread_allocated, thread_allocated_t,
     THREAD_ALLOCATED_INITIALIZER, malloc_tsd_no_cleanup)
 #endif
 
 #include "jemalloc/internal/prof.h"
 
 #undef JEMALLOC_H_INLINES
--- a/memory/jemalloc/src/include/jemalloc/internal/private_namespace.h
+++ b/memory/jemalloc/src/include/jemalloc/internal/private_namespace.h
@@ -7,16 +7,18 @@
 #define	arena_boot JEMALLOC_N(arena_boot)
 #define	arena_dalloc JEMALLOC_N(arena_dalloc)
 #define	arena_dalloc_bin JEMALLOC_N(arena_dalloc_bin)
 #define	arena_dalloc_bin_locked JEMALLOC_N(arena_dalloc_bin_locked)
 #define	arena_dalloc_junk_small JEMALLOC_N(arena_dalloc_junk_small)
 #define	arena_dalloc_large JEMALLOC_N(arena_dalloc_large)
 #define	arena_dalloc_large_locked JEMALLOC_N(arena_dalloc_large_locked)
 #define	arena_dalloc_small JEMALLOC_N(arena_dalloc_small)
+#define	arena_dss_prec_get JEMALLOC_N(arena_dss_prec_get)
+#define	arena_dss_prec_set JEMALLOC_N(arena_dss_prec_set)
 #define	arena_malloc JEMALLOC_N(arena_malloc)
 #define	arena_malloc_large JEMALLOC_N(arena_malloc_large)
 #define	arena_malloc_small JEMALLOC_N(arena_malloc_small)
 #define	arena_mapbits_allocated_get JEMALLOC_N(arena_mapbits_allocated_get)
 #define	arena_mapbits_binind_get JEMALLOC_N(arena_mapbits_binind_get)
 #define	arena_mapbits_dirty_get JEMALLOC_N(arena_mapbits_dirty_get)
 #define	arena_mapbits_get JEMALLOC_N(arena_mapbits_get)
 #define	arena_mapbits_large_binind_set JEMALLOC_N(arena_mapbits_large_binind_set)
@@ -34,35 +36,35 @@
 #define	arena_mapp_get JEMALLOC_N(arena_mapp_get)
 #define	arena_maxclass JEMALLOC_N(arena_maxclass)
 #define	arena_new JEMALLOC_N(arena_new)
 #define	arena_palloc JEMALLOC_N(arena_palloc)
 #define	arena_postfork_child JEMALLOC_N(arena_postfork_child)
 #define	arena_postfork_parent JEMALLOC_N(arena_postfork_parent)
 #define	arena_prefork JEMALLOC_N(arena_prefork)
 #define	arena_prof_accum JEMALLOC_N(arena_prof_accum)
+#define	arena_prof_accum_impl JEMALLOC_N(arena_prof_accum_impl)
+#define	arena_prof_accum_locked JEMALLOC_N(arena_prof_accum_locked)
 #define	arena_prof_ctx_get JEMALLOC_N(arena_prof_ctx_get)
 #define	arena_prof_ctx_set JEMALLOC_N(arena_prof_ctx_set)
 #define	arena_prof_promoted JEMALLOC_N(arena_prof_promoted)
 #define	arena_ptr_small_binind_get JEMALLOC_N(arena_ptr_small_binind_get)
 #define	arena_purge_all JEMALLOC_N(arena_purge_all)
 #define	arena_ralloc JEMALLOC_N(arena_ralloc)
 #define	arena_ralloc_no_move JEMALLOC_N(arena_ralloc_no_move)
 #define	arena_run_regind JEMALLOC_N(arena_run_regind)
 #define	arena_salloc JEMALLOC_N(arena_salloc)
 #define	arena_stats_merge JEMALLOC_N(arena_stats_merge)
 #define	arena_tcache_fill_small JEMALLOC_N(arena_tcache_fill_small)
 #define	arenas JEMALLOC_N(arenas)
-#define	arenas_bin_i_index JEMALLOC_N(arenas_bin_i_index)
 #define	arenas_booted JEMALLOC_N(arenas_booted)
 #define	arenas_cleanup JEMALLOC_N(arenas_cleanup)
 #define	arenas_extend JEMALLOC_N(arenas_extend)
 #define	arenas_initialized JEMALLOC_N(arenas_initialized)
 #define	arenas_lock JEMALLOC_N(arenas_lock)
-#define	arenas_lrun_i_index JEMALLOC_N(arenas_lrun_i_index)
 #define	arenas_tls JEMALLOC_N(arenas_tls)
 #define	arenas_tsd JEMALLOC_N(arenas_tsd)
 #define	arenas_tsd_boot JEMALLOC_N(arenas_tsd_boot)
 #define	arenas_tsd_cleanup_wrapper JEMALLOC_N(arenas_tsd_cleanup_wrapper)
 #define	arenas_tsd_get JEMALLOC_N(arenas_tsd_get)
 #define	arenas_tsd_set JEMALLOC_N(arenas_tsd_set)
 #define	atomic_add_u JEMALLOC_N(atomic_add_u)
 #define	atomic_add_uint32 JEMALLOC_N(atomic_add_uint32)
@@ -97,22 +99,25 @@
 #define	chunk_alloc_dss JEMALLOC_N(chunk_alloc_dss)
 #define	chunk_alloc_mmap JEMALLOC_N(chunk_alloc_mmap)
 #define	chunk_boot JEMALLOC_N(chunk_boot)
 #define	chunk_dealloc JEMALLOC_N(chunk_dealloc)
 #define	chunk_dealloc_mmap JEMALLOC_N(chunk_dealloc_mmap)
 #define	chunk_dss_boot JEMALLOC_N(chunk_dss_boot)
 #define	chunk_dss_postfork_child JEMALLOC_N(chunk_dss_postfork_child)
 #define	chunk_dss_postfork_parent JEMALLOC_N(chunk_dss_postfork_parent)
+#define	chunk_dss_prec_get JEMALLOC_N(chunk_dss_prec_get)
+#define	chunk_dss_prec_set JEMALLOC_N(chunk_dss_prec_set)
 #define	chunk_dss_prefork JEMALLOC_N(chunk_dss_prefork)
 #define	chunk_in_dss JEMALLOC_N(chunk_in_dss)
 #define	chunk_npages JEMALLOC_N(chunk_npages)
 #define	chunk_postfork_child JEMALLOC_N(chunk_postfork_child)
 #define	chunk_postfork_parent JEMALLOC_N(chunk_postfork_parent)
 #define	chunk_prefork JEMALLOC_N(chunk_prefork)
+#define	chunk_unmap JEMALLOC_N(chunk_unmap)
 #define	chunks_mtx JEMALLOC_N(chunks_mtx)
 #define	chunks_rtree JEMALLOC_N(chunks_rtree)
 #define	chunksize JEMALLOC_N(chunksize)
 #define	chunksize_mask JEMALLOC_N(chunksize_mask)
 #define	ckh_bucket_search JEMALLOC_N(ckh_bucket_search)
 #define	ckh_count JEMALLOC_N(ckh_count)
 #define	ckh_delete JEMALLOC_N(ckh_delete)
 #define	ckh_evict_reloc_insert JEMALLOC_N(ckh_evict_reloc_insert)
@@ -131,16 +136,17 @@
 #define	ckh_try_insert JEMALLOC_N(ckh_try_insert)
 #define	ctl_boot JEMALLOC_N(ctl_boot)
 #define	ctl_bymib JEMALLOC_N(ctl_bymib)
 #define	ctl_byname JEMALLOC_N(ctl_byname)
 #define	ctl_nametomib JEMALLOC_N(ctl_nametomib)
 #define	ctl_postfork_child JEMALLOC_N(ctl_postfork_child)
 #define	ctl_postfork_parent JEMALLOC_N(ctl_postfork_parent)
 #define	ctl_prefork JEMALLOC_N(ctl_prefork)
+#define	dss_prec_names JEMALLOC_N(dss_prec_names)
 #define	extent_tree_ad_first JEMALLOC_N(extent_tree_ad_first)
 #define	extent_tree_ad_insert JEMALLOC_N(extent_tree_ad_insert)
 #define	extent_tree_ad_iter JEMALLOC_N(extent_tree_ad_iter)
 #define	extent_tree_ad_iter_recurse JEMALLOC_N(extent_tree_ad_iter_recurse)
 #define	extent_tree_ad_iter_start JEMALLOC_N(extent_tree_ad_iter_start)
 #define	extent_tree_ad_last JEMALLOC_N(extent_tree_ad_last)
 #define	extent_tree_ad_new JEMALLOC_N(extent_tree_ad_new)
 #define	extent_tree_ad_next JEMALLOC_N(extent_tree_ad_next)
@@ -183,21 +189,27 @@
 #define	huge_prefork JEMALLOC_N(huge_prefork)
 #define	huge_prof_ctx_get JEMALLOC_N(huge_prof_ctx_get)
 #define	huge_prof_ctx_set JEMALLOC_N(huge_prof_ctx_set)
 #define	huge_ralloc JEMALLOC_N(huge_ralloc)
 #define	huge_ralloc_no_move JEMALLOC_N(huge_ralloc_no_move)
 #define	huge_salloc JEMALLOC_N(huge_salloc)
 #define	iallocm JEMALLOC_N(iallocm)
 #define	icalloc JEMALLOC_N(icalloc)
+#define	icallocx JEMALLOC_N(icallocx)
 #define	idalloc JEMALLOC_N(idalloc)
+#define	idallocx JEMALLOC_N(idallocx)
 #define	imalloc JEMALLOC_N(imalloc)
+#define	imallocx JEMALLOC_N(imallocx)
 #define	ipalloc JEMALLOC_N(ipalloc)
+#define	ipallocx JEMALLOC_N(ipallocx)
 #define	iqalloc JEMALLOC_N(iqalloc)
+#define	iqallocx JEMALLOC_N(iqallocx)
 #define	iralloc JEMALLOC_N(iralloc)
+#define	irallocx JEMALLOC_N(irallocx)
 #define	isalloc JEMALLOC_N(isalloc)
 #define	isthreaded JEMALLOC_N(isthreaded)
 #define	ivsalloc JEMALLOC_N(ivsalloc)
 #define	jemalloc_postfork_child JEMALLOC_N(jemalloc_postfork_child)
 #define	jemalloc_postfork_parent JEMALLOC_N(jemalloc_postfork_parent)
 #define	jemalloc_prefork JEMALLOC_N(jemalloc_prefork)
 #define	malloc_cprintf JEMALLOC_N(malloc_cprintf)
 #define	malloc_mutex_init JEMALLOC_N(malloc_mutex_init)
@@ -215,17 +227,19 @@
 #define	malloc_tsd_malloc JEMALLOC_N(malloc_tsd_malloc)
 #define	malloc_tsd_no_cleanup JEMALLOC_N(malloc_tsd_no_cleanup)
 #define	malloc_vcprintf JEMALLOC_N(malloc_vcprintf)
 #define	malloc_vsnprintf JEMALLOC_N(malloc_vsnprintf)
 #define	malloc_write JEMALLOC_N(malloc_write)
 #define	map_bias JEMALLOC_N(map_bias)
 #define	mb_write JEMALLOC_N(mb_write)
 #define	mutex_boot JEMALLOC_N(mutex_boot)
-#define	narenas JEMALLOC_N(narenas)
+#define	narenas_auto JEMALLOC_N(narenas_auto)
+#define	narenas_total JEMALLOC_N(narenas_total)
+#define	narenas_total_get JEMALLOC_N(narenas_total_get)
 #define	ncpus JEMALLOC_N(ncpus)
 #define	nhbins JEMALLOC_N(nhbins)
 #define	opt_abort JEMALLOC_N(opt_abort)
 #define	opt_junk JEMALLOC_N(opt_junk)
 #define	opt_lg_chunk JEMALLOC_N(opt_lg_chunk)
 #define	opt_lg_dirty_mult JEMALLOC_N(opt_lg_dirty_mult)
 #define	opt_lg_prof_interval JEMALLOC_N(opt_lg_prof_interval)
 #define	opt_lg_prof_sample JEMALLOC_N(opt_lg_prof_sample)
@@ -292,19 +306,16 @@
 #define	rtree_new JEMALLOC_N(rtree_new)
 #define	rtree_postfork_child JEMALLOC_N(rtree_postfork_child)
 #define	rtree_postfork_parent JEMALLOC_N(rtree_postfork_parent)
 #define	rtree_prefork JEMALLOC_N(rtree_prefork)
 #define	rtree_set JEMALLOC_N(rtree_set)
 #define	s2u JEMALLOC_N(s2u)
 #define	sa2u JEMALLOC_N(sa2u)
 #define	set_errno JEMALLOC_N(set_errno)
-#define	stats_arenas_i_bins_j_index JEMALLOC_N(stats_arenas_i_bins_j_index)
-#define	stats_arenas_i_index JEMALLOC_N(stats_arenas_i_index)
-#define	stats_arenas_i_lruns_j_index JEMALLOC_N(stats_arenas_i_lruns_j_index)
 #define	stats_cactive JEMALLOC_N(stats_cactive)
 #define	stats_cactive_add JEMALLOC_N(stats_cactive_add)
 #define	stats_cactive_get JEMALLOC_N(stats_cactive_get)
 #define	stats_cactive_sub JEMALLOC_N(stats_cactive_sub)
 #define	stats_chunks JEMALLOC_N(stats_chunks)
 #define	stats_print JEMALLOC_N(stats_print)
 #define	tcache_alloc_easy JEMALLOC_N(tcache_alloc_easy)
 #define	tcache_alloc_large JEMALLOC_N(tcache_alloc_large)
--- a/memory/jemalloc/src/include/jemalloc/jemalloc.h.in
+++ b/memory/jemalloc/src/include/jemalloc/jemalloc.h.in
@@ -20,16 +20,18 @@ extern "C" {
 #define	ALLOCM_LG_ALIGN(la)	(la)
 #if LG_SIZEOF_PTR == 2
 #define	ALLOCM_ALIGN(a)	(ffs(a)-1)
 #else
 #define	ALLOCM_ALIGN(a)	((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31)
 #endif
 #define	ALLOCM_ZERO	((int)0x40)
 #define	ALLOCM_NO_MOVE	((int)0x80)
+/* Bias arena index bits so that 0 encodes "ALLOCM_ARENA() unspecified". */
+#define	ALLOCM_ARENA(a)	((int)(((a)+1) << 8))
 
 #define	ALLOCM_SUCCESS		0
 #define	ALLOCM_ERR_OOM		1
 #define	ALLOCM_ERR_NOT_MOVED	2
 #endif
 
 /*
  * The je_ prefix on the following public symbol declarations is an artifact of
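
To illustrate the bias noted above: a zero flags word still means "no arena specified", and the arena index can be recovered by shifting away the low flag bits and undoing the +1 bias. A hedged, stand-alone sketch (the decode helper is invented for illustration and is not part of jemalloc's API):

#include <assert.h>

/* Mirrors the macro added above, so the example is self-contained. */
#define ALLOCM_ARENA(a)	((int)(((a)+1) << 8))

/* Invented helper: undo the +1 bias; only valid when an arena was encoded. */
static unsigned
allocm_arena_decode(int flags)
{
    return ((unsigned)(flags >> 8)) - 1;
}

int
main(void)
{
    unsigned arena_ind = 3;
    int flags = ALLOCM_ARENA(arena_ind);    /* == (3 + 1) << 8 == 0x400 */

    assert(flags != 0);                     /* 0 still means "unspecified" */
    assert(allocm_arena_decode(flags) == arena_ind);
    return 0;
}
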
--- a/memory/jemalloc/src/src/arena.c
+++ b/memory/jemalloc/src/src/arena.c
@@ -35,26 +35,35 @@ const uint8_t	small_size2bin[] = {
 #undef S2B_4096
 #undef S2B_8192
 #undef SIZE_CLASS
 };
 
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */
 
+static void	arena_avail_insert(arena_t *arena, arena_chunk_t *chunk,
+    size_t pageind, size_t npages, bool maybe_adjac_pred,
+    bool maybe_adjac_succ);
+static void	arena_avail_remove(arena_t *arena, arena_chunk_t *chunk,
+    size_t pageind, size_t npages, bool maybe_adjac_pred,
+    bool maybe_adjac_succ);
 static void	arena_run_split(arena_t *arena, arena_run_t *run, size_t size,
     bool large, size_t binind, bool zero);
 static arena_chunk_t *arena_chunk_alloc(arena_t *arena);
 static void	arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk);
 static arena_run_t	*arena_run_alloc_helper(arena_t *arena, size_t size,
     bool large, size_t binind, bool zero);
 static arena_run_t *arena_run_alloc(arena_t *arena, size_t size, bool large,
     size_t binind, bool zero);
+static arena_chunk_t	*chunks_dirty_iter_cb(arena_chunk_tree_t *tree,
+    arena_chunk_t *chunk, void *arg);
 static void	arena_purge(arena_t *arena, bool all);
-static void	arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty);
+static void	arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty,
+    bool cleaned);
 static void	arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk,
     arena_run_t *run, size_t oldsize, size_t newsize);
 static void	arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk,
     arena_run_t *run, size_t oldsize, size_t newsize, bool dirty);
 static arena_run_t	*arena_bin_runs_first(arena_bin_t *bin);
 static void	arena_bin_runs_insert(arena_bin_t *bin, arena_run_t *run);
 static void	arena_bin_runs_remove(arena_bin_t *bin, arena_run_t *run);
 static arena_run_t *arena_bin_nonfull_run_tryget(arena_bin_t *bin);
@@ -96,19 +105,16 @@ rb_gen(static UNUSED, arena_run_tree_, a
 
 static inline int
 arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b)
 {
 	int ret;
 	size_t a_size = a->bits & ~PAGE_MASK;
 	size_t b_size = b->bits & ~PAGE_MASK;
 
-	assert((a->bits & CHUNK_MAP_KEY) == CHUNK_MAP_KEY || (a->bits &
-	    CHUNK_MAP_DIRTY) == (b->bits & CHUNK_MAP_DIRTY));
-
 	ret = (a_size > b_size) - (a_size < b_size);
 	if (ret == 0) {
 		uintptr_t a_mapelm, b_mapelm;
 
 		if ((a->bits & CHUNK_MAP_KEY) != CHUNK_MAP_KEY)
 			a_mapelm = (uintptr_t)a;
 		else {
 			/*
@@ -124,16 +130,192 @@ arena_avail_comp(arena_chunk_map_t *a, a
 
 	return (ret);
 }
 
 /* Generate red-black tree functions. */
 rb_gen(static UNUSED, arena_avail_tree_, arena_avail_tree_t, arena_chunk_map_t,
     u.rb_link, arena_avail_comp)
 
+static inline int
+arena_chunk_dirty_comp(arena_chunk_t *a, arena_chunk_t *b)
+{
+
+	assert(a != NULL);
+	assert(b != NULL);
+
+	/*
+	 * Short-circuit for self comparison.  The following comparison code
+	 * would come to the same result, but at the cost of executing the slow
+	 * path.
+	 */
+	if (a == b)
+		return (0);
+
+	/*
+	 * Order such that chunks with higher fragmentation are "less than"
+	 * those with lower fragmentation -- purging order is from "least" to
+	 * "greatest".  Fragmentation is measured as:
+	 *
+	 *     mean current avail run size
+	 *   --------------------------------
+	 *   mean defragmented avail run size
+	 *
+	 *            navail
+	 *         -----------
+	 *         nruns_avail           nruns_avail-nruns_adjac
+	 * = ========================= = -----------------------
+	 *            navail                  nruns_avail
+	 *    -----------------------
+	 *    nruns_avail-nruns_adjac
+	 *
+	 * The following code multiplies away the denominator prior to
+	 * comparison, in order to avoid division.
+	 */
+	{
+		size_t a_val = (a->nruns_avail - a->nruns_adjac) *
+		    b->nruns_avail;
+		size_t b_val = (b->nruns_avail - b->nruns_adjac) *
+		    a->nruns_avail;
+
+		if (a_val < b_val)
+			return (1);
+		if (a_val > b_val)
+			return (-1);
+	}
+	/*
+	 * Break ties by chunk address.  For fragmented chunks, report lower
+	 * addresses as "lower", so that fragmentation reduction happens first
+	 * at lower addresses.  However, use the opposite ordering for
+	 * unfragmented chunks, in order to increase the chances of
+	 * re-allocating dirty runs.
+	 */
+	{
+		uintptr_t a_chunk = (uintptr_t)a;
+		uintptr_t b_chunk = (uintptr_t)b;
+		int ret = ((a_chunk > b_chunk) - (a_chunk < b_chunk));
+		if (a->nruns_adjac == 0) {
+			assert(b->nruns_adjac == 0);
+			ret = -ret;
+		}
+		return (ret);
+	}
+}
+
+/* Generate red-black tree functions. */
+rb_gen(static UNUSED, arena_chunk_dirty_, arena_chunk_tree_t, arena_chunk_t,
+    dirty_link, arena_chunk_dirty_comp)
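
The cross-multiplication above is just a division-free way of comparing the two chunks' (nruns_avail - nruns_adjac) / nruns_avail ratios. A standalone check with made-up counter values (not part of the patch) that the two formulations order chunks the same way:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

int
main(void)
{
	/* Hypothetical per-chunk counters. */
	size_t a_avail = 8, a_adjac = 6;	/* ratio (8-6)/8 = 0.25 */
	size_t b_avail = 8, b_adjac = 2;	/* ratio (8-2)/8 = 0.75 */

	/* Division-free comparison, as in arena_chunk_dirty_comp() above. */
	size_t a_val = (a_avail - a_adjac) * b_avail;
	size_t b_val = (b_avail - b_adjac) * a_avail;
	int no_div = (a_val > b_val) - (a_val < b_val);

	/* The same comparison done on the ratios themselves. */
	double a_frac = (double)(a_avail - a_adjac) / (double)a_avail;
	double b_frac = (double)(b_avail - b_adjac) / (double)b_avail;
	int with_div = (a_frac > b_frac) - (a_frac < b_frac);

	assert(no_div == with_div);
	printf("%.2f vs %.2f -> %d\n", a_frac, b_frac, no_div);
	return (0);
}
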
+
+static inline bool
+arena_avail_adjac_pred(arena_chunk_t *chunk, size_t pageind)
+{
+	bool ret;
+
+	if (pageind-1 < map_bias)
+		ret = false;
+	else {
+		ret = (arena_mapbits_allocated_get(chunk, pageind-1) == 0);
+		assert(ret == false || arena_mapbits_dirty_get(chunk,
+		    pageind-1) != arena_mapbits_dirty_get(chunk, pageind));
+	}
+	return (ret);
+}
+
+static inline bool
+arena_avail_adjac_succ(arena_chunk_t *chunk, size_t pageind, size_t npages)
+{
+	bool ret;
+
+	if (pageind+npages == chunk_npages)
+		ret = false;
+	else {
+		assert(pageind+npages < chunk_npages);
+		ret = (arena_mapbits_allocated_get(chunk, pageind+npages) == 0);
+		assert(ret == false || arena_mapbits_dirty_get(chunk, pageind)
+		    != arena_mapbits_dirty_get(chunk, pageind+npages));
+	}
+	return (ret);
+}
+
+static inline bool
+arena_avail_adjac(arena_chunk_t *chunk, size_t pageind, size_t npages)
+{
+
+	return (arena_avail_adjac_pred(chunk, pageind) ||
+	    arena_avail_adjac_succ(chunk, pageind, npages));
+}
+
+static void
+arena_avail_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind,
+    size_t npages, bool maybe_adjac_pred, bool maybe_adjac_succ)
+{
+
+	assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >>
+	    LG_PAGE));
+
+	/*
+	 * chunks_dirty is keyed by nruns_{avail,adjac}, so the chunk must be
+	 * removed and reinserted even if the run to be inserted is clean.
+	 */
+	if (chunk->ndirty != 0)
+		arena_chunk_dirty_remove(&arena->chunks_dirty, chunk);
+
+	if (maybe_adjac_pred && arena_avail_adjac_pred(chunk, pageind))
+		chunk->nruns_adjac++;
+	if (maybe_adjac_succ && arena_avail_adjac_succ(chunk, pageind, npages))
+		chunk->nruns_adjac++;
+	chunk->nruns_avail++;
+	assert(chunk->nruns_avail > chunk->nruns_adjac);
+
+	if (arena_mapbits_dirty_get(chunk, pageind) != 0) {
+		arena->ndirty += npages;
+		chunk->ndirty += npages;
+	}
+	if (chunk->ndirty != 0)
+		arena_chunk_dirty_insert(&arena->chunks_dirty, chunk);
+
+	arena_avail_tree_insert(&arena->runs_avail, arena_mapp_get(chunk,
+	    pageind));
+}
+
+static void
+arena_avail_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind,
+    size_t npages, bool maybe_adjac_pred, bool maybe_adjac_succ)
+{
+
+	assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >>
+	    LG_PAGE));
+
+	/*
+	 * chunks_dirty is keyed by nruns_{avail,adjac}, so the chunk must be
+	 * removed and reinserted even if the run to be removed is clean.
+	 */
+	if (chunk->ndirty != 0)
+		arena_chunk_dirty_remove(&arena->chunks_dirty, chunk);
+
+	if (maybe_adjac_pred && arena_avail_adjac_pred(chunk, pageind))
+		chunk->nruns_adjac--;
+	if (maybe_adjac_succ && arena_avail_adjac_succ(chunk, pageind, npages))
+		chunk->nruns_adjac--;
+	chunk->nruns_avail--;
+	assert(chunk->nruns_avail > chunk->nruns_adjac || (chunk->nruns_avail
+	    == 0 && chunk->nruns_adjac == 0));
+
+	if (arena_mapbits_dirty_get(chunk, pageind) != 0) {
+		arena->ndirty -= npages;
+		chunk->ndirty -= npages;
+	}
+	if (chunk->ndirty != 0)
+		arena_chunk_dirty_insert(&arena->chunks_dirty, chunk);
+
+	arena_avail_tree_remove(&arena->runs_avail, arena_mapp_get(chunk,
+	    pageind));
+}
+
 static inline void *
 arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info)
 {
 	void *ret;
 	unsigned regind;
 	bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run +
 	    (uintptr_t)bin_info->bitmap_offset);
 
@@ -188,36 +370,33 @@ arena_chunk_validate_zeroed(arena_chunk_
 
 static void
 arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large,
     size_t binind, bool zero)
 {
 	arena_chunk_t *chunk;
 	size_t run_ind, total_pages, need_pages, rem_pages, i;
 	size_t flag_dirty;
-	arena_avail_tree_t *runs_avail;
 
 	assert((large && binind == BININD_INVALID) || (large == false && binind
 	    != BININD_INVALID));
 
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
 	run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE);
 	flag_dirty = arena_mapbits_dirty_get(chunk, run_ind);
-	runs_avail = (flag_dirty != 0) ? &arena->runs_avail_dirty :
-	    &arena->runs_avail_clean;
 	total_pages = arena_mapbits_unallocated_size_get(chunk, run_ind) >>
 	    LG_PAGE;
 	assert(arena_mapbits_dirty_get(chunk, run_ind+total_pages-1) ==
 	    flag_dirty);
 	need_pages = (size >> LG_PAGE);
 	assert(need_pages > 0);
 	assert(need_pages <= total_pages);
 	rem_pages = total_pages - need_pages;
 
-	arena_avail_tree_remove(runs_avail, arena_mapp_get(chunk, run_ind));
+	arena_avail_remove(arena, chunk, run_ind, total_pages, true, true);
 	if (config_stats) {
 		/*
 		 * Update stats_cactive if nactive is crossing a chunk
 		 * multiple.
 		 */
 		size_t cactive_diff = CHUNK_CEILING((arena->nactive +
 		    need_pages) << LG_PAGE) - CHUNK_CEILING(arena->nactive <<
 		    LG_PAGE);
@@ -239,24 +418,18 @@ arena_run_split(arena_t *arena, arena_ru
 			    (rem_pages << LG_PAGE),
 			    arena_mapbits_unzeroed_get(chunk,
 			    run_ind+need_pages));
 			arena_mapbits_unallocated_set(chunk,
 			    run_ind+total_pages-1, (rem_pages << LG_PAGE),
 			    arena_mapbits_unzeroed_get(chunk,
 			    run_ind+total_pages-1));
 		}
-		arena_avail_tree_insert(runs_avail, arena_mapp_get(chunk,
-		    run_ind+need_pages));
-	}
-
-	/* Update dirty page accounting. */
-	if (flag_dirty != 0) {
-		chunk->ndirty -= need_pages;
-		arena->ndirty -= need_pages;
+		arena_avail_insert(arena, chunk, run_ind+need_pages, rem_pages,
+		    false, true);
 	}
 
 	/*
 	 * Update the page map separately for large vs. small runs, since it is
 	 * possible to avoid iteration for large mallocs.
 	 */
 	if (large) {
 		if (zero) {
@@ -339,61 +512,52 @@ arena_run_split(arena_t *arena, arena_ru
 
 static arena_chunk_t *
 arena_chunk_alloc(arena_t *arena)
 {
 	arena_chunk_t *chunk;
 	size_t i;
 
 	if (arena->spare != NULL) {
-		arena_avail_tree_t *runs_avail;
-
 		chunk = arena->spare;
 		arena->spare = NULL;
 
 		assert(arena_mapbits_allocated_get(chunk, map_bias) == 0);
 		assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0);
 		assert(arena_mapbits_unallocated_size_get(chunk, map_bias) ==
 		    arena_maxclass);
 		assert(arena_mapbits_unallocated_size_get(chunk,
 		    chunk_npages-1) == arena_maxclass);
 		assert(arena_mapbits_dirty_get(chunk, map_bias) ==
 		    arena_mapbits_dirty_get(chunk, chunk_npages-1));
-
-		/* Insert the run into the appropriate runs_avail_* tree. */
-		if (arena_mapbits_dirty_get(chunk, map_bias) == 0)
-			runs_avail = &arena->runs_avail_clean;
-		else
-			runs_avail = &arena->runs_avail_dirty;
-		arena_avail_tree_insert(runs_avail, arena_mapp_get(chunk,
-		    map_bias));
 	} else {
 		bool zero;
 		size_t unzeroed;
 
 		zero = false;
 		malloc_mutex_unlock(&arena->lock);
 		chunk = (arena_chunk_t *)chunk_alloc(chunksize, chunksize,
-		    false, &zero);
+		    false, &zero, arena->dss_prec);
 		malloc_mutex_lock(&arena->lock);
 		if (chunk == NULL)
 			return (NULL);
 		if (config_stats)
 			arena->stats.mapped += chunksize;
 
 		chunk->arena = arena;
-		ql_elm_new(chunk, link_dirty);
-		chunk->dirtied = false;
 
 		/*
 		 * Claim that no pages are in use, since the header is merely
 		 * overhead.
 		 */
 		chunk->ndirty = 0;
 
+		chunk->nruns_avail = 0;
+		chunk->nruns_adjac = 0;
+
 		/*
 		 * Initialize the map to contain one maximal free untouched run.
 		 * Mark the pages as zeroed iff chunk_alloc() returned a zeroed
 		 * chunk.
 		 */
 		unzeroed = zero ? 0 : CHUNK_MAP_UNZEROED;
 		arena_mapbits_unallocated_set(chunk, map_bias, arena_maxclass,
 		    unzeroed);
@@ -407,58 +571,48 @@ arena_chunk_alloc(arena_t *arena)
 		} else if (config_debug) {
 			for (i = map_bias+1; i < chunk_npages-1; i++) {
 				assert(arena_mapbits_unzeroed_get(chunk, i) ==
 				    unzeroed);
 			}
 		}
 		arena_mapbits_unallocated_set(chunk, chunk_npages-1,
 		    arena_maxclass, unzeroed);
+	}
 
-		/* Insert the run into the runs_avail_clean tree. */
-		arena_avail_tree_insert(&arena->runs_avail_clean,
-		    arena_mapp_get(chunk, map_bias));
-	}
+	/* Insert the run into the runs_avail tree. */
+	arena_avail_insert(arena, chunk, map_bias, chunk_npages-map_bias,
+	    false, false);
 
 	return (chunk);
 }
 
 static void
 arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk)
 {
-	arena_avail_tree_t *runs_avail;
-
 	assert(arena_mapbits_allocated_get(chunk, map_bias) == 0);
 	assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0);
 	assert(arena_mapbits_unallocated_size_get(chunk, map_bias) ==
 	    arena_maxclass);
 	assert(arena_mapbits_unallocated_size_get(chunk, chunk_npages-1) ==
 	    arena_maxclass);
 	assert(arena_mapbits_dirty_get(chunk, map_bias) ==
 	    arena_mapbits_dirty_get(chunk, chunk_npages-1));
 
 	/*
-	 * Remove run from the appropriate runs_avail_* tree, so that the arena
-	 * does not use it.
+	 * Remove run from the runs_avail tree, so that the arena does not use
+	 * it.
 	 */
-	if (arena_mapbits_dirty_get(chunk, map_bias) == 0)
-		runs_avail = &arena->runs_avail_clean;
-	else
-		runs_avail = &arena->runs_avail_dirty;
-	arena_avail_tree_remove(runs_avail, arena_mapp_get(chunk, map_bias));
+	arena_avail_remove(arena, chunk, map_bias, chunk_npages-map_bias,
+	    false, false);
 
 	if (arena->spare != NULL) {
 		arena_chunk_t *spare = arena->spare;
 
 		arena->spare = chunk;
-		if (spare->dirtied) {
-			ql_remove(&chunk->arena->chunks_dirty, spare,
-			    link_dirty);
-			arena->ndirty -= spare->ndirty;
-		}
 		malloc_mutex_unlock(&arena->lock);
 		chunk_dealloc((void *)spare, chunksize, true);
 		malloc_mutex_lock(&arena->lock);
 		if (config_stats)
 			arena->stats.mapped -= chunksize;
 	} else
 		arena->spare = chunk;
 }
@@ -466,29 +620,17 @@ arena_chunk_dealloc(arena_t *arena, aren
 static arena_run_t *
 arena_run_alloc_helper(arena_t *arena, size_t size, bool large, size_t binind,
     bool zero)
 {
 	arena_run_t *run;
 	arena_chunk_map_t *mapelm, key;
 
 	key.bits = size | CHUNK_MAP_KEY;
-	mapelm = arena_avail_tree_nsearch(&arena->runs_avail_dirty, &key);
-	if (mapelm != NULL) {
-		arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm);
-		size_t pageind = (((uintptr_t)mapelm -
-		    (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t))
-		    + map_bias;
-
-		run = (arena_run_t *)((uintptr_t)run_chunk + (pageind <<
-		    LG_PAGE));
-		arena_run_split(arena, run, size, large, binind, zero);
-		return (run);
-	}
-	mapelm = arena_avail_tree_nsearch(&arena->runs_avail_clean, &key);
+	mapelm = arena_avail_tree_nsearch(&arena->runs_avail, &key);
 	if (mapelm != NULL) {
 		arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm);
 		size_t pageind = (((uintptr_t)mapelm -
 		    (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t))
 		    + map_bias;
 
 		run = (arena_run_t *)((uintptr_t)run_chunk + (pageind <<
 		    LG_PAGE));
@@ -532,39 +674,50 @@ arena_run_alloc(arena_t *arena, size_t s
 	 * arena_chunk_alloc(), so search one more time.
 	 */
 	return (arena_run_alloc_helper(arena, size, large, binind, zero));
 }
 
 static inline void
 arena_maybe_purge(arena_t *arena)
 {
+	size_t npurgeable, threshold;
 
-	/* Enforce opt_lg_dirty_mult. */
-	if (opt_lg_dirty_mult >= 0 && arena->ndirty > arena->npurgatory &&
-	    (arena->ndirty - arena->npurgatory) > chunk_npages &&
-	    (arena->nactive >> opt_lg_dirty_mult) < (arena->ndirty -
-	    arena->npurgatory))
-		arena_purge(arena, false);
+	/* Don't purge if the option is disabled. */
+	if (opt_lg_dirty_mult < 0)
+		return;
+	/* Don't purge if all dirty pages are already being purged. */
+	if (arena->ndirty <= arena->npurgatory)
+		return;
+	npurgeable = arena->ndirty - arena->npurgatory;
+	threshold = (arena->nactive >> opt_lg_dirty_mult);
+	/*
+	 * Don't purge unless the number of purgeable pages exceeds the
+	 * threshold.
+	 */
+	if (npurgeable <= threshold)
+		return;
+
+	arena_purge(arena, false);
 }
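
Restated, the rewritten arena_maybe_purge() triggers a purge only when the purgeable dirty pages (ndirty minus those another thread is already purging) exceed nactive >> opt_lg_dirty_mult. A standalone sketch of that predicate with hypothetical page counts, not part of the patch:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>

static bool
should_purge(ssize_t lg_dirty_mult, size_t nactive, size_t ndirty,
    size_t npurgatory)
{
	size_t npurgeable, threshold;

	if (lg_dirty_mult < 0)		/* A negative setting disables purging. */
		return (false);
	if (ndirty <= npurgatory)	/* Everything dirty is already in flight. */
		return (false);
	npurgeable = ndirty - npurgatory;
	threshold = nactive >> lg_dirty_mult;
	return (npurgeable > threshold);
}

int
main(void)
{
	/* 4096 active pages with lg_dirty_mult == 3: the threshold is 512. */
	printf("%d\n", should_purge(3, 4096, 600, 0));	/* 600 > 512 -> 1 */
	printf("%d\n", should_purge(3, 4096, 400, 0));	/* 400 <= 512 -> 0 */
	return (0);
}
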
 
-static inline void
-arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk)
+static inline size_t
+arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all)
 {
+	size_t npurged;
 	ql_head(arena_chunk_map_t) mapelms;
 	arena_chunk_map_t *mapelm;
-	size_t pageind;
-	size_t ndirty;
+	size_t pageind, npages;
 	size_t nmadvise;
 
 	ql_new(&mapelms);
 
 	/*
 	 * If chunk is the spare, temporarily re-allocate it, 1) so that its
-	 * run is reinserted into runs_avail_dirty, and 2) so that it cannot be
+	 * run is reinserted into runs_avail, and 2) so that it cannot be
 	 * completely discarded by another thread while arena->lock is dropped
 	 * by this thread.  Note that the arena_run_dalloc() call will
 	 * implicitly deallocate the chunk, so no explicit action is required
 	 * in this function to deallocate the chunk.
 	 *
 	 * Note that once a chunk contains dirty pages, it cannot again contain
 	 * a single run unless 1) it is a dirty run, or 2) this function purges
 	 * dirty pages and causes the transition to a single clean run.  Thus
@@ -574,103 +727,90 @@ arena_chunk_purge(arena_t *arena, arena_
 	 */
 	if (chunk == arena->spare) {
 		assert(arena_mapbits_dirty_get(chunk, map_bias) != 0);
 		assert(arena_mapbits_dirty_get(chunk, chunk_npages-1) != 0);
 
 		arena_chunk_alloc(arena);
 	}
 
-	/* Temporarily allocate all free dirty runs within chunk. */
-	for (pageind = map_bias; pageind < chunk_npages;) {
+	if (config_stats)
+		arena->stats.purged += chunk->ndirty;
+
+	/*
+	 * Operate on all dirty runs if there is no clean/dirty run
+	 * fragmentation.
+	 */
+	if (chunk->nruns_adjac == 0)
+		all = true;
+
+	/*
+	 * Temporarily allocate free dirty runs within chunk.  If all is false,
+	 * only operate on dirty runs that are fragments; otherwise operate on
+	 * all dirty runs.
+	 */
+	for (pageind = map_bias; pageind < chunk_npages; pageind += npages) {
 		mapelm = arena_mapp_get(chunk, pageind);
 		if (arena_mapbits_allocated_get(chunk, pageind) == 0) {
-			size_t npages;
+			size_t run_size =
+			    arena_mapbits_unallocated_size_get(chunk, pageind);
 
-			npages = arena_mapbits_unallocated_size_get(chunk,
-			    pageind) >> LG_PAGE;
+			npages = run_size >> LG_PAGE;
 			assert(pageind + npages <= chunk_npages);
 			assert(arena_mapbits_dirty_get(chunk, pageind) ==
 			    arena_mapbits_dirty_get(chunk, pageind+npages-1));
-			if (arena_mapbits_dirty_get(chunk, pageind) != 0) {
-				arena_avail_tree_remove(
-				    &arena->runs_avail_dirty, mapelm);
-
-				arena_mapbits_large_set(chunk, pageind,
-				    (npages << LG_PAGE), 0);
-				if (npages > 1) {
-					arena_mapbits_large_set(chunk,
-					    pageind+npages-1, 0, 0);
-				}
 
-				if (config_stats) {
-					/*
-					 * Update stats_cactive if nactive is
-					 * crossing a chunk multiple.
-					 */
-					size_t cactive_diff =
-					    CHUNK_CEILING((arena->nactive +
-					    npages) << LG_PAGE) -
-					    CHUNK_CEILING(arena->nactive <<
-					    LG_PAGE);
-					if (cactive_diff != 0)
-						stats_cactive_add(cactive_diff);
-				}
-				arena->nactive += npages;
+			if (arena_mapbits_dirty_get(chunk, pageind) != 0 &&
+			    (all || arena_avail_adjac(chunk, pageind,
+			    npages))) {
+				arena_run_t *run = (arena_run_t *)((uintptr_t)
+				    chunk + (uintptr_t)(pageind << LG_PAGE));
+
+				arena_run_split(arena, run, run_size, true,
+				    BININD_INVALID, false);
 				/* Append to list for later processing. */
 				ql_elm_new(mapelm, u.ql_link);
 				ql_tail_insert(&mapelms, mapelm, u.ql_link);
 			}
-
-			pageind += npages;
 		} else {
-			/* Skip allocated run. */
-			if (arena_mapbits_large_get(chunk, pageind))
-				pageind += arena_mapbits_large_size_get(chunk,
+			/* Skip run. */
+			if (arena_mapbits_large_get(chunk, pageind) != 0) {
+				npages = arena_mapbits_large_size_get(chunk,
 				    pageind) >> LG_PAGE;
-			else {
+			} else {
 				size_t binind;
 				arena_bin_info_t *bin_info;
 				arena_run_t *run = (arena_run_t *)((uintptr_t)
 				    chunk + (uintptr_t)(pageind << LG_PAGE));
 
 				assert(arena_mapbits_small_runind_get(chunk,
 				    pageind) == 0);
 				binind = arena_bin_index(arena, run->bin);
 				bin_info = &arena_bin_info[binind];
-				pageind += bin_info->run_size >> LG_PAGE;
+				npages = bin_info->run_size >> LG_PAGE;
 			}
 		}
 	}
 	assert(pageind == chunk_npages);
-
-	if (config_debug)
-		ndirty = chunk->ndirty;
-	if (config_stats)
-		arena->stats.purged += chunk->ndirty;
-	arena->ndirty -= chunk->ndirty;
-	chunk->ndirty = 0;
-	ql_remove(&arena->chunks_dirty, chunk, link_dirty);
-	chunk->dirtied = false;
+	assert(chunk->ndirty == 0 || all == false);
+	assert(chunk->nruns_adjac == 0);
 
 	malloc_mutex_unlock(&arena->lock);
 	if (config_stats)
 		nmadvise = 0;
+	npurged = 0;
 	ql_foreach(mapelm, &mapelms, u.ql_link) {
-		size_t pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) /
-		    sizeof(arena_chunk_map_t)) + map_bias;
-		size_t npages = arena_mapbits_large_size_get(chunk, pageind) >>
-		    LG_PAGE;
 		bool unzeroed;
 		size_t flag_unzeroed, i;
 
+		pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) /
+		    sizeof(arena_chunk_map_t)) + map_bias;
+		npages = arena_mapbits_large_size_get(chunk, pageind) >>
+		    LG_PAGE;
 		assert(pageind + npages <= chunk_npages);
-		assert(ndirty >= npages);
-		if (config_debug)
-			ndirty -= npages;
 		unzeroed = pages_purge((void *)((uintptr_t)chunk + (pageind <<
 		    LG_PAGE)), (npages << LG_PAGE));
 		flag_unzeroed = unzeroed ? CHUNK_MAP_UNZEROED : 0;
 		/*
 		 * Set the unzeroed flag for all pages, now that pages_purge()
 		 * has returned whether the pages were zeroed as a side effect
 		 * of purging.  This chunk map modification is safe even though
 		 * the arena mutex isn't currently owned by this thread,
@@ -678,139 +818,155 @@ arena_chunk_purge(arena_t *arena, arena_
 		 * from being modified by any other thread.  As long as these
 		 * writes don't perturb the first and last elements'
 		 * CHUNK_MAP_ALLOCATED bits, behavior is well defined.
 		 */
 		for (i = 0; i < npages; i++) {
 			arena_mapbits_unzeroed_set(chunk, pageind+i,
 			    flag_unzeroed);
 		}
+		npurged += npages;
 		if (config_stats)
 			nmadvise++;
 	}
-	assert(ndirty == 0);
 	malloc_mutex_lock(&arena->lock);
 	if (config_stats)
 		arena->stats.nmadvise += nmadvise;
 
 	/* Deallocate runs. */
 	for (mapelm = ql_first(&mapelms); mapelm != NULL;
 	    mapelm = ql_first(&mapelms)) {
-		size_t pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) /
+		arena_run_t *run;
+
+		pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) /
 		    sizeof(arena_chunk_map_t)) + map_bias;
-		arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
-		    (uintptr_t)(pageind << LG_PAGE));
+		run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind <<
+		    LG_PAGE));
+		ql_remove(&mapelms, mapelm, u.ql_link);
+		arena_run_dalloc(arena, run, false, true);
+	}
+
+	return (npurged);
+}
 
-		ql_remove(&mapelms, mapelm, u.ql_link);
-		arena_run_dalloc(arena, run, false);
-	}
+static arena_chunk_t *
+chunks_dirty_iter_cb(arena_chunk_tree_t *tree, arena_chunk_t *chunk, void *arg)
+{
+	size_t *ndirty = (size_t *)arg;
+
+	assert(chunk->ndirty != 0);
+	*ndirty += chunk->ndirty;
+	return (NULL);
 }
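
chunks_dirty_iter_cb() follows the tree-iterator convention used by arena_chunk_dirty_iter() in the debug check below: the callback runs once per chunk and returning NULL keeps the iteration going. A standalone sketch of that convention, with a plain array standing in for the red-black tree (not part of the patch):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct fake_chunk {
	size_t ndirty;
};

typedef struct fake_chunk *(iter_cb_t)(struct fake_chunk *chunk, void *arg);

/* Visit every element; stop early if the callback returns non-NULL. */
static struct fake_chunk *
fake_iter(struct fake_chunk *chunks, size_t nchunks, iter_cb_t *cb, void *arg)
{
	size_t i;

	for (i = 0; i < nchunks; i++) {
		struct fake_chunk *ret = cb(&chunks[i], arg);
		if (ret != NULL)
			return (ret);
	}
	return (NULL);
}

static struct fake_chunk *
sum_ndirty_cb(struct fake_chunk *chunk, void *arg)
{
	size_t *ndirty = (size_t *)arg;

	*ndirty += chunk->ndirty;
	return (NULL);	/* NULL means "keep iterating". */
}

int
main(void)
{
	struct fake_chunk chunks[] = {{3}, {5}, {7}};
	size_t ndirty = 0;

	fake_iter(chunks, 3, sum_ndirty_cb, &ndirty);
	assert(ndirty == 15);
	printf("total dirty pages: %zu\n", ndirty);
	return (0);
}
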
 
 static void
 arena_purge(arena_t *arena, bool all)
 {
 	arena_chunk_t *chunk;
 	size_t npurgatory;
 	if (config_debug) {
 		size_t ndirty = 0;
 
-		ql_foreach(chunk, &arena->chunks_dirty, link_dirty) {
-		    assert(chunk->dirtied);
-		    ndirty += chunk->ndirty;
-		}
+		arena_chunk_dirty_iter(&arena->chunks_dirty, NULL,
+		    chunks_dirty_iter_cb, (void *)&ndirty);
 		assert(ndirty == arena->ndirty);
 	}
 	assert(arena->ndirty > arena->npurgatory || all);
-	assert(arena->ndirty - arena->npurgatory > chunk_npages || all);
 	assert((arena->nactive >> opt_lg_dirty_mult) < (arena->ndirty -
 	    arena->npurgatory) || all);
 
 	if (config_stats)
 		arena->stats.npurge++;
 
 	/*
 	 * Compute the minimum number of pages that this thread should try to
 	 * purge, and add the result to arena->npurgatory.  This will keep
 	 * multiple threads from racing to reduce ndirty below the threshold.
 	 */
-	npurgatory = arena->ndirty - arena->npurgatory;
-	if (all == false) {
-		assert(npurgatory >= arena->nactive >> opt_lg_dirty_mult);
-		npurgatory -= arena->nactive >> opt_lg_dirty_mult;
+	{
+		size_t npurgeable = arena->ndirty - arena->npurgatory;
+
+		if (all == false) {
+			size_t threshold = (arena->nactive >>
+			    opt_lg_dirty_mult);
+
+			npurgatory = npurgeable - threshold;
+		} else
+			npurgatory = npurgeable;
 	}
 	arena->npurgatory += npurgatory;
 
 	while (npurgatory > 0) {
+		size_t npurgeable, npurged, nunpurged;
+
 		/* Get next chunk with dirty pages. */
-		chunk = ql_first(&arena->chunks_dirty);
+		chunk = arena_chunk_dirty_first(&arena->chunks_dirty);
 		if (chunk == NULL) {
 			/*
 			 * This thread was unable to purge as many pages as
 			 * originally intended, due to races with other threads
 			 * that either did some of the purging work, or re-used
 			 * dirty pages.
 			 */
 			arena->npurgatory -= npurgatory;
 			return;
 		}
-		while (chunk->ndirty == 0) {
-			ql_remove(&arena->chunks_dirty, chunk, link_dirty);
-			chunk->dirtied = false;
-			chunk = ql_first(&arena->chunks_dirty);
-			if (chunk == NULL) {
-				/* Same logic as for above. */
-				arena->npurgatory -= npurgatory;
-				return;
-			}
-		}
+		npurgeable = chunk->ndirty;
+		assert(npurgeable != 0);
 
-		if (chunk->ndirty > npurgatory) {
+		if (npurgeable > npurgatory && chunk->nruns_adjac == 0) {
 			/*
-			 * This thread will, at a minimum, purge all the dirty
-			 * pages in chunk, so set npurgatory to reflect this
-			 * thread's commitment to purge the pages.  This tends
-			 * to reduce the chances of the following scenario:
+			 * This thread will purge all the dirty pages in chunk,
+			 * so set npurgatory to reflect this thread's intent to
+			 * purge the pages.  This tends to reduce the chances
+			 * of the following scenario:
 			 *
 			 * 1) This thread sets arena->npurgatory such that
 			 *    (arena->ndirty - arena->npurgatory) is at the
 			 *    threshold.
 			 * 2) This thread drops arena->lock.
 			 * 3) Another thread causes one or more pages to be
 			 *    dirtied, and immediately determines that it must
 			 *    purge dirty pages.
 			 *
 			 * If this scenario *does* play out, that's okay,
 			 * because all of the purging work being done really
 			 * needs to happen.
 			 */
-			arena->npurgatory += chunk->ndirty - npurgatory;
-			npurgatory = chunk->ndirty;
+			arena->npurgatory += npurgeable - npurgatory;
+			npurgatory = npurgeable;
 		}
 
-		arena->npurgatory -= chunk->ndirty;
-		npurgatory -= chunk->ndirty;
-		arena_chunk_purge(arena, chunk);
+		/*
+		 * Keep track of how many pages are purgeable, versus how many
+		 * actually get purged, and adjust counters accordingly.
+		 */
+		arena->npurgatory -= npurgeable;
+		npurgatory -= npurgeable;
+		npurged = arena_chunk_purge(arena, chunk, all);
+		nunpurged = npurgeable - npurged;
+		arena->npurgatory += nunpurged;
+		npurgatory += nunpurged;
 	}
 }
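
The counter updates at the bottom of the purge loop first assume the whole chunk's dirty pages will be purged and then credit back whatever arena_chunk_purge() skipped, so the net change to both npurgatory counters is exactly npurged. A standalone sketch with hypothetical numbers, not part of the patch:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

int
main(void)
{
	size_t npurgatory = 300;	/* pages this thread committed to purge */
	size_t npurgeable = 200;	/* dirty pages in the chosen chunk */
	size_t npurged = 150;		/* pages the per-chunk pass actually purged */
	size_t nunpurged = npurgeable - npurged;

	npurgatory -= npurgeable;	/* assume the whole chunk gets purged... */
	npurgatory += nunpurged;	/* ...then credit back the skipped pages */

	assert(npurgatory == 300 - npurged);
	printf("remaining commitment: %zu pages\n", npurgatory);
	return (0);
}
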
 
 void
 arena_purge_all(arena_t *arena)
 {
 
 	malloc_mutex_lock(&arena->lock);
 	arena_purge(arena, true);
 	malloc_mutex_unlock(&arena->lock);
 }
 
 static void
-arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty)
+arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned)
 {
 	arena_chunk_t *chunk;
 	size_t size, run_ind, run_pages, flag_dirty;
-	arena_avail_tree_t *runs_avail;
 
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
 	run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE);
 	assert(run_ind >= map_bias);
 	assert(run_ind < chunk_npages);
 	if (arena_mapbits_large_get(chunk, run_ind) != 0) {
 		size = arena_mapbits_large_size_get(chunk, run_ind);
 		assert(size == PAGE ||
@@ -831,35 +987,31 @@ arena_run_dalloc(arena_t *arena, arena_r
 		    CHUNK_CEILING((arena->nactive - run_pages) << LG_PAGE);
 		if (cactive_diff != 0)
 			stats_cactive_sub(cactive_diff);
 	}
 	arena->nactive -= run_pages;
 
 	/*
 	 * The run is dirty if the caller claims to have dirtied it, as well as
-	 * if it was already dirty before being allocated.
+	 * if it was already dirty before being allocated and the caller
+	 * doesn't claim to have cleaned it.
 	 */
 	assert(arena_mapbits_dirty_get(chunk, run_ind) ==
 	    arena_mapbits_dirty_get(chunk, run_ind+run_pages-1));
-	if (arena_mapbits_dirty_get(chunk, run_ind) != 0)
+	if (cleaned == false && arena_mapbits_dirty_get(chunk, run_ind) != 0)
 		dirty = true;
 	flag_dirty = dirty ? CHUNK_MAP_DIRTY : 0;
-	runs_avail = dirty ? &arena->runs_avail_dirty :
-	    &arena->runs_avail_clean;
 
 	/* Mark pages as unallocated in the chunk map. */
 	if (dirty) {
 		arena_mapbits_unallocated_set(chunk, run_ind, size,
 		    CHUNK_MAP_DIRTY);
 		arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size,
 		    CHUNK_MAP_DIRTY);
-
-		chunk->ndirty += run_pages;
-		arena->ndirty += run_pages;
 	} else {
 		arena_mapbits_unallocated_set(chunk, run_ind, size,
 		    arena_mapbits_unzeroed_get(chunk, run_ind));
 		arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size,
 		    arena_mapbits_unzeroed_get(chunk, run_ind+run_pages-1));
 	}
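
With the new cleaned argument, the dirtiness decision above reduces to one expression: a freed run is dirty if the caller dirtied it, or if it was carved out of dirty pages and the caller does not claim to have purged them. A standalone restatement, not part of the patch:

#include <assert.h>
#include <stdbool.h>

/* Mirror of the decision at the top of arena_run_dalloc(). */
static bool
run_is_dirty(bool caller_dirtied, bool was_dirty, bool cleaned)
{
	return (caller_dirtied || (cleaned == false && was_dirty));
}

int
main(void)
{
	/* arena_chunk_purge() frees runs with dirty=false, cleaned=true. */
	assert(run_is_dirty(false, true, true) == false);
	/* An ordinary dalloc of a run carved from dirty pages stays dirty. */
	assert(run_is_dirty(false, true, false) == true);
	/* A run the caller dirtied is dirty regardless of history. */
	assert(run_is_dirty(true, false, false) == true);
	return (0);
}
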
 
 	/* Try to coalesce forward. */
@@ -873,18 +1025,18 @@ arena_run_dalloc(arena_t *arena, arena_r
 		/*
 		 * Remove successor from runs_avail; the coalesced run is
 		 * inserted later.
 		 */
 		assert(arena_mapbits_unallocated_size_get(chunk,
 		    run_ind+run_pages+nrun_pages-1) == nrun_size);
 		assert(arena_mapbits_dirty_get(chunk,
 		    run_ind+run_pages+nrun_pages-1) == flag_dirty);
-		arena_avail_tree_remove(runs_avail,
-		    arena_mapp_get(chunk, run_ind+run_pages));
+		arena_avail_remove(arena, chunk, run_ind+run_pages, nrun_pages,
+		    false, true);
 
 		size += nrun_size;
 		run_pages += nrun_pages;
 
 		arena_mapbits_unallocated_size_set(chunk, run_ind, size);
 		arena_mapbits_unallocated_size_set(chunk, run_ind+run_pages-1,
 		    size);
 	}
@@ -900,45 +1052,33 @@ arena_run_dalloc(arena_t *arena, arena_r
 
 		/*
 		 * Remove predecessor from runs_avail; the coalesced run is
 		 * inserted later.
 		 */
 		assert(arena_mapbits_unallocated_size_get(chunk, run_ind) ==
 		    prun_size);
 		assert(arena_mapbits_dirty_get(chunk, run_ind) == flag_dirty);
-		arena_avail_tree_remove(runs_avail, arena_mapp_get(chunk,
-		    run_ind));
+		arena_avail_remove(arena, chunk, run_ind, prun_pages, true,
+		    false);
 
 		size += prun_size;
 		run_pages += prun_pages;
 
 		arena_mapbits_unallocated_size_set(chunk, run_ind, size);
 		arena_mapbits_unallocated_size_set(chunk, run_ind+run_pages-1,
 		    size);
 	}
 
 	/* Insert into runs_avail, now that coalescing is complete. */
 	assert(arena_mapbits_unallocated_size_get(chunk, run_ind) ==
 	    arena_mapbits_unallocated_size_get(chunk, run_ind+run_pages-1));
 	assert(arena_mapbits_dirty_get(chunk, run_ind) ==
 	    arena_mapbits_dirty_get(chunk, run_ind+run_pages-1));
-	arena_avail_tree_insert(runs_avail, arena_mapp_get(chunk, run_ind));
-
-	if (dirty) {
-		/*
-		 * Insert into chunks_dirty before potentially calling
-		 * arena_chunk_dealloc(), so that chunks_dirty and
-		 * arena->ndirty are consistent.
-		 */
-		if (chunk->dirtied == false) {
-			ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty);
-			chunk->dirtied = true;
-		}
-	}
+	arena_avail_insert(arena, chunk, run_ind, run_pages, true, true);
 
 	/* Deallocate chunk if it is now completely unused. */
 	if (size == arena_maxclass) {
 		assert(run_ind == map_bias);
 		assert(run_pages == (arena_maxclass >> LG_PAGE));
 		arena_chunk_dealloc(arena, chunk);
 	}
 
@@ -977,17 +1117,17 @@ arena_run_trim_head(arena_t *arena, aren
 		assert(arena_mapbits_large_size_get(chunk,
 		    pageind+head_npages+tail_npages-1) == 0);
 		assert(arena_mapbits_dirty_get(chunk,
 		    pageind+head_npages+tail_npages-1) == flag_dirty);
 	}
 	arena_mapbits_large_set(chunk, pageind+head_npages, newsize,
 	    flag_dirty);
 
-	arena_run_dalloc(arena, run, false);
+	arena_run_dalloc(arena, run, false, false);
 }
 
 static void
 arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
     size_t oldsize, size_t newsize, bool dirty)
 {
 	size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE;
 	size_t head_npages = newsize >> LG_PAGE;
@@ -1010,17 +1150,17 @@ arena_run_trim_tail(arena_t *arena, aren
 		    pageind+head_npages+tail_npages-1) == 0);
 		assert(arena_mapbits_dirty_get(chunk,
 		    pageind+head_npages+tail_npages-1) == flag_dirty);
 	}
 	arena_mapbits_large_set(chunk, pageind+head_npages, oldsize-newsize,
 	    flag_dirty);
 
 	arena_run_dalloc(arena, (arena_run_t *)((uintptr_t)run + newsize),
-	    dirty);
+	    dirty, false);
 }
 
 static arena_run_t *
 arena_bin_runs_first(arena_bin_t *bin)
 {
 	arena_chunk_map_t *mapelm = arena_run_tree_first(&bin->runs);
 	if (mapelm != NULL) {
 		arena_chunk_t *chunk;
@@ -1177,46 +1317,28 @@ arena_bin_malloc_hard(arena_t *arena, ar
 	bin->runcur = run;
 
 	assert(bin->runcur->nfree > 0);
 
 	return (arena_run_reg_alloc(bin->runcur, bin_info));
 }
 
 void
-arena_prof_accum(arena_t *arena, uint64_t accumbytes)
-{
-
-	cassert(config_prof);
-
-	if (config_prof && prof_interval != 0) {
-		arena->prof_accumbytes += accumbytes;
-		if (arena->prof_accumbytes >= prof_interval) {
-			prof_idump();
-			arena->prof_accumbytes -= prof_interval;
-		}
-	}
-}
-
-void
 arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind,
     uint64_t prof_accumbytes)
 {
 	unsigned i, nfill;
 	arena_bin_t *bin;
 	arena_run_t *run;
 	void *ptr;
 
 	assert(tbin->ncached == 0);
 
-	if (config_prof) {
-		malloc_mutex_lock(&arena->lock);
+	if (config_prof)
 		arena_prof_accum(arena, prof_accumbytes);
-		malloc_mutex_unlock(&arena->lock);
-	}
 	bin = &arena->bins[binind];
 	malloc_mutex_lock(&bin->lock);
 	for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >>
 	    tbin->lg_fill_div); i < nfill; i++) {
 		if ((run = bin->runcur) != NULL && run->nfree > 0)
 			ptr = arena_run_reg_alloc(run, &arena_bin_info[binind]);
 		else
 			ptr = arena_bin_malloc_hard(arena, bin);
@@ -1314,21 +1436,18 @@ arena_malloc_small(arena_t *arena, size_
 	}
 
 	if (config_stats) {
 		bin->stats.allocated += size;
 		bin->stats.nmalloc++;
 		bin->stats.nrequests++;
 	}
 	malloc_mutex_unlock(&bin->lock);
-	if (config_prof && isthreaded == false) {
-		malloc_mutex_lock(&arena->lock);
+	if (config_prof && isthreaded == false)
 		arena_prof_accum(arena, size);
-		malloc_mutex_unlock(&arena->lock);
-	}
 
 	if (zero == false) {
 		if (config_fill) {
 			if (opt_junk) {
 				arena_alloc_junk_small(ret,
 				    &arena_bin_info[binind], false);
 			} else if (opt_zero)
 				memset(ret, 0, size);
@@ -1362,17 +1481,17 @@ arena_malloc_large(arena_t *arena, size_
 		arena->stats.nmalloc_large++;
 		arena->stats.nrequests_large++;
 		arena->stats.allocated_large += size;
 		arena->stats.lstats[(size >> LG_PAGE) - 1].nmalloc++;
 		arena->stats.lstats[(size >> LG_PAGE) - 1].nrequests++;
 		arena->stats.lstats[(size >> LG_PAGE) - 1].curruns++;
 	}
 	if (config_prof)
-		arena_prof_accum(arena, size);
+		arena_prof_accum_locked(arena, size);
 	malloc_mutex_unlock(&arena->lock);
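
Two call-site patterns appear in this file after the change: arena_tcache_fill_small() and arena_malloc_small() call arena_prof_accum() without touching arena->lock, while arena_malloc_large() above calls arena_prof_accum_locked() with the lock already held. The new definitions are not shown in this excerpt; the standalone sketch below models one plausible shape for that split, using pthreads and a fake arena struct as stand-ins (all names here are hypothetical, and prof_interval is an arbitrary value):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct fake_arena {
	pthread_mutex_t lock;
	uint64_t prof_accumbytes;
};

static const uint64_t prof_interval = 1 << 20;	/* hypothetical: dump per MiB */

static void
prof_idump_stub(void)
{
	printf("profile dump triggered\n");
}

/* Caller must already hold arena->lock. */
static void
prof_accum_locked(struct fake_arena *arena, uint64_t accumbytes)
{
	arena->prof_accumbytes += accumbytes;
	if (arena->prof_accumbytes >= prof_interval) {
		prof_idump_stub();
		arena->prof_accumbytes -= prof_interval;
	}
}

/* Unlocked call sites pay for the lock only when they accumulate. */
static void
prof_accum(struct fake_arena *arena, uint64_t accumbytes)
{
	pthread_mutex_lock(&arena->lock);
	prof_accum_locked(arena, accumbytes);
	pthread_mutex_unlock(&arena->lock);
}

int
main(void)
{
	struct fake_arena arena = {PTHREAD_MUTEX_INITIALIZER, 0};

	prof_accum(&arena, 1 << 19);		/* no dump yet */
	pthread_mutex_lock(&arena.lock);
	prof_accum_locked(&arena, 1 << 19);	/* crosses the interval: dumps */
	pthread_mutex_unlock(&arena.lock);
	return (0);
}
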
 
 	if (zero == false) {
 		if (config_fill) {
 			if (opt_junk)
 				memset(ret, 0xa5, size);
 			else if (opt_zero)
 				memset(ret, 0, size);
@@ -1521,17 +1640,17 @@ arena_dalloc_bin_run(arena_t *arena, are
 		/* Trim clean pages.  Convert to large run beforehand. */
 		assert(npages > 0);
 		arena_mapbits_large_set(chunk, run_ind, bin_info->run_size, 0);
 		arena_mapbits_large_set(chunk, run_ind+npages-1, 0, 0);
 		arena_run_trim_tail(arena, chunk, run, (npages << LG_PAGE),
 		    ((past - run_ind) << LG_PAGE), false);
 		/* npages = past - run_ind; */
 	}
-	arena_run_dalloc(arena, run, true);
+	arena_run_dalloc(arena, run, true, false);
 	malloc_mutex_unlock(&arena->lock);
 	/****************************/
 	malloc_mutex_lock(&bin->lock);
 	if (config_stats)
 		bin->stats.curruns--;
 }
 
 static void
@@ -1614,62 +1733,16 @@ arena_dalloc_small(arena_t *arena, arena
 	if (config_debug) {
 		/* arena_ptr_small_binind_get() does extra sanity checking. */
 		assert(arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk,
 		    pageind)) != BININD_INVALID);
 	}
 	mapelm = arena_mapp_get(chunk, pageind);
 	arena_dalloc_bin(arena, chunk, ptr, pageind, mapelm);
 }
-void
-arena_stats_merge(arena_t *arena, size_t *nactive, size_t *ndirty,
-    arena_stats_t *astats, malloc_bin_stats_t *bstats,
-    malloc_large_stats_t *lstats)
-{
-	unsigned i;
-
-	malloc_mutex_lock(&arena->lock);
-	*nactive += arena->nactive;
-	*ndirty += arena->ndirty;
-
-	astats->mapped += arena->stats.mapped;
-	astats->npurge += arena->stats.npurge;
-	astats->nmadvise += arena->stats.nmadvise;
-	astats->purged += arena->stats.purged;
-	astats->allocated_large += arena->stats.allocated_large;
-	astats->nmalloc_large += arena->stats.nmalloc_large;
-	astats->ndalloc_large += arena->stats.ndalloc_large;
-	astats->nrequests_large += arena->stats.nrequests_large;
-
-	for (i = 0; i < nlclasses; i++) {
-		lstats[i].nmalloc += arena->stats.lstats[i].nmalloc;
-		lstats[i].ndalloc += arena->stats.lstats[i].ndalloc;
-		lstats[i].nrequests += arena->stats.lstats[i].nrequests;
-		lstats[i].curruns += arena->stats.lstats[i].curruns;
-	}
-	malloc_mutex_unlock(&arena->lock);
-
-	for (i = 0; i < NBINS; i++) {
-		arena_bin_t *bin = &arena->bins[i];
-
-		malloc_mutex_lock(&bin->lock);
-		bstats[i].allocated += bin->stats.allocated;
-		bstats[i].nmalloc += bin->stats.nmalloc;
-		bstats[i].ndalloc += bin->stats.ndalloc;
-		bstats[i].nrequests += bin->stats.nrequests;
-		if (config_tcache) {
-			bstats[i].nfills += bin->stats.nfills;
-			bstats[i].nflushes += bin->stats.nflushes;
-		}
-		bstats[i].nruns += bin->stats.nruns;
-		bstats[i].reruns += bin->stats.reruns;
-		bstats[i].curruns += bin->stats.curruns;
-		malloc_mutex_unlock(&bin->lock);
-	}
-}
 
 void
 arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr)
 {
 
 	if (config_fill || config_stats) {
 		size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE;
 		size_t size = arena_mapbits_large_size_get(chunk, pageind);
@@ -1679,17 +1752,17 @@ arena_dalloc_large_locked(arena_t *arena
 		if (config_stats) {
 			arena->stats.ndalloc_large++;
 			arena->stats.allocated_large -= size;
 			arena->stats.lstats[(size >> LG_PAGE) - 1].ndalloc++;
 			arena->stats.lstats[(size >> LG_PAGE) - 1].curruns--;
 		}
 	}
 
-	arena_run_dalloc(arena, (arena_run_t *)ptr, true);
+	arena_run_dalloc(arena, (arena_run_t *)ptr, true, false);
 }
 
 void
 arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr)
 {
 
 	malloc_mutex_lock(&arena->lock);
 	arena_dalloc_large_locked(arena, chunk, ptr);
@@ -1872,18 +1945,19 @@ arena_ralloc_no_move(void *ptr, size_t o
 		}
 	}
 
 	/* Reallocation would require a move. */
 	return (NULL);
 }
 
 void *
-arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
-    size_t alignment, bool zero, bool try_tcache)
+arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size,
+    size_t extra, size_t alignment, bool zero, bool try_tcache_alloc,
+    bool try_tcache_dalloc)
 {
 	void *ret;
 	size_t copysize;
 
 	/* Try to avoid moving the allocation. */
 	ret = arena_ralloc_no_move(ptr, oldsize, size, extra, zero);
 	if (ret != NULL)
 		return (ret);
@@ -1892,49 +1966,118 @@ arena_ralloc(void *ptr, size_t oldsize, 
 	 * size and oldsize are different enough that we need to move the
 	 * object.  In that case, fall back to allocating new space and
 	 * copying.
 	 */
 	if (alignment != 0) {
 		size_t usize = sa2u(size + extra, alignment);
 		if (usize == 0)
 			return (NULL);
-		ret = ipalloc(usize, alignment, zero);
+		ret = ipallocx(usize, alignment, zero, try_tcache_alloc, arena);
 	} else
-		ret = arena_malloc(NULL, size + extra, zero, try_tcache);
+		ret = arena_malloc(arena, size + extra, zero, try_tcache_alloc);
 
 	if (ret == NULL) {
 		if (extra == 0)
 			return (NULL);
 		/* Try again, this time without extra. */
 		if (alignment != 0) {
 			size_t usize = sa2u(size, alignment);
 			if (usize == 0)
 				return (NULL);
-			ret = ipalloc(usize, alignment, zero);
+			ret = ipallocx(usize, alignment, zero, try_tcache_alloc,
+			    arena);
 		} else
-			ret = arena_malloc(NULL, size, zero, try_tcache);
+			ret = arena_malloc(arena, size, zero, try_tcache_alloc);
 
 		if (ret == NULL)
 			return (NULL);
 	}
 
 	/* Junk/zero-filling were already done by ipalloc()/arena_malloc(). */
 
 	/*
 	 * Copy at most size bytes (not size+extra), since the caller has no
 	 * expectation that the extra bytes will be reliably preserved.
 	 */
 	copysize = (size < oldsize) ? size : oldsize;
 	VALGRIND_MAKE_MEM_UNDEFINED(ret, copysize);
 	memcpy(ret, ptr, copysize);
-	iqalloc(ptr);
+	iqallocx(ptr, try_tcache_dalloc);
+	return (ret);
+}
+
+dss_prec_t
+arena_dss_prec_get(arena_t *arena)
+{
+	dss_prec_t ret;
+
+	malloc_mutex_lock(&arena->lock);
+	ret = arena->dss_prec;
+	malloc_mutex_unlock(&arena->lock);
 	return (ret);
 }
 
+void
+arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec)
+{
+
+	malloc_mutex_lock(&arena->lock);
+	arena->dss_prec = dss_prec;
+	malloc_mutex_unlock(&arena->lock);
+}
+
+void
+arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive,
+    size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats,
+    malloc_large_stats_t *lstats)
+{
+	unsigned i;
+
+	malloc_mutex_lock(&arena->lock);
+	*dss = dss_prec_names[arena->dss_prec];
+	*nactive += arena->nactive;
+	*ndirty += arena->ndirty;
+
+	astats->mapped += arena->stats.mapped;
+	astats->npurge += arena->stats.npurge;
+	astats->nmadvise += arena->stats.nmadvise;
+	astats->purged += arena->stats.purged;
+	astats->allocated_large += arena->stats.allocated_large;
+	astats->nmalloc_large += arena->stats.nmalloc_large;
+	astats->ndalloc_large += arena->stats.ndalloc_large;
+	astats->nrequests_large += arena->stats.nrequests_large;
+
+	for (i = 0; i < nlclasses; i++) {
+		lstats[i].nmalloc += arena->stats.lstats[i].nmalloc;
+		lstats[i].ndalloc += arena->stats.lstats[i].ndalloc;
+		lstats[i].nrequests += arena->stats.lstats[i].nrequests;
+		lstats[i].curruns += arena->stats.lstats[i].curruns;
+	}
+	malloc_mutex_unlock(&arena->lock);
+
+	for (i = 0; i < NBINS; i++) {
+		arena_bin_t *bin = &arena->bins[i];
+
+		malloc_mutex_lock(&bin->lock);
+		bstats[i].allocated += bin->stats.allocated;
+		bstats[i].nmalloc += bin->stats.nmalloc;
+		bstats[i].ndalloc += bin->stats.ndalloc;
+		bstats[i].nrequests += bin->stats.nrequests;
+		if (config_tcache) {
+			bstats[i].nfills += bin->stats.nfills;
+			bstats[i].nflushes += bin->stats.nflushes;
+		}
+		bstats[i].nruns += bin->stats.nruns;
+		bstats[i].reruns += bin->stats.reruns;
+		bstats[i].curruns += bin->stats.curruns;
+		malloc_mutex_unlock(&bin->lock);
+	}
+}
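
The dss string merged here surfaces through the stats mallctl namespace ("stats.arenas.<i>.dss" is declared in the ctl.c changes below). A minimal sketch of reading it back, assuming unmangled symbol names and the usual epoch-refresh idiom; not part of the patch:

#include <jemalloc/jemalloc.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	uint64_t epoch = 1;
	size_t esz = sizeof(epoch);
	const char *dss;
	size_t ssz = sizeof(dss);

	/* Refresh the cached statistics snapshot. */
	mallctl("epoch", &epoch, &esz, &epoch, sizeof(epoch));

	if (mallctl("stats.arenas.0.dss", &dss, &ssz, NULL, 0) == 0)
		printf("arena 0 dss precedence: %s\n", dss);
	return (0);
}
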
+
 bool
 arena_new(arena_t *arena, unsigned ind)
 {
 	unsigned i;
 	arena_bin_t *bin;
 
 	arena->ind = ind;
 	arena->nthreads = 0;
@@ -1953,26 +2096,27 @@ arena_new(arena_t *arena, unsigned ind)
 		    sizeof(malloc_large_stats_t));
 		if (config_tcache)
 			ql_new(&arena->tcache_ql);
 	}
 
 	if (config_prof)
 		arena->prof_accumbytes = 0;
 
+	arena->dss_prec = chunk_dss_prec_get();
+
 	/* Initialize chunks. */
-	ql_new(&arena->chunks_dirty);
+	arena_chunk_dirty_new(&arena->chunks_dirty);
 	arena->spare = NULL;
 
 	arena->nactive = 0;
 	arena->ndirty = 0;
 	arena->npurgatory = 0;
 
-	arena_avail_tree_new(&arena->runs_avail_clean);
-	arena_avail_tree_new(&arena->runs_avail_dirty);
+	arena_avail_tree_new(&arena->runs_avail);
 
 	/* Initialize bins. */
 	for (i = 0; i < NBINS; i++) {
 		bin = &arena->bins[i];
 		if (malloc_mutex_init(&bin->lock))
 			return (true);
 		bin->runcur = NULL;
 		arena_run_tree_new(&bin->runs);
--- a/memory/jemalloc/src/src/base.c
+++ b/memory/jemalloc/src/src/base.c
@@ -27,17 +27,18 @@ static bool
 base_pages_alloc(size_t minsize)
 {
 	size_t csize;
 	bool zero;
 
 	assert(minsize != 0);
 	csize = CHUNK_CEILING(minsize);
 	zero = false;
-	base_pages = chunk_alloc(csize, chunksize, true, &zero);
+	base_pages = chunk_alloc(csize, chunksize, true, &zero,
+	    chunk_dss_prec_get());
 	if (base_pages == NULL)
 		return (true);
 	base_next_addr = base_pages;
 	base_past_addr = (void *)((uintptr_t)base_pages + csize);
 
 	return (false);
 }
 
--- a/memory/jemalloc/src/src/chunk.c
+++ b/memory/jemalloc/src/src/chunk.c
@@ -1,48 +1,54 @@
 #define	JEMALLOC_CHUNK_C_
 #include "jemalloc/internal/jemalloc_internal.h"
 
 /******************************************************************************/
 /* Data. */
 
-size_t	opt_lg_chunk = LG_CHUNK_DEFAULT;
+const char	*opt_dss = DSS_DEFAULT;
+size_t		opt_lg_chunk = LG_CHUNK_DEFAULT;
 
 malloc_mutex_t	chunks_mtx;
 chunk_stats_t	stats_chunks;
 
 /*
  * Trees of chunks that were previously allocated (trees differ only in node
  * ordering).  These are used when allocating chunks, in an attempt to re-use
  * address space.  Depending on function, different tree orderings are needed,
  * which is why there are two trees with the same contents.
  */
-static extent_tree_t	chunks_szad;
-static extent_tree_t	chunks_ad;
+static extent_tree_t	chunks_szad_mmap;
+static extent_tree_t	chunks_ad_mmap;
+static extent_tree_t	chunks_szad_dss;
+static extent_tree_t	chunks_ad_dss;
 
 rtree_t		*chunks_rtree;
 
 /* Various chunk-related settings. */
 size_t		chunksize;
 size_t		chunksize_mask; /* (chunksize - 1). */
 size_t		chunk_npages;
 size_t		map_bias;
 size_t		arena_maxclass; /* Max size class for arenas. */
 
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */
 
-static void	*chunk_recycle(size_t size, size_t alignment, bool base,
+static void	*chunk_recycle(extent_tree_t *chunks_szad,
+    extent_tree_t *chunks_ad, size_t size, size_t alignment, bool base,
     bool *zero);
-static void	chunk_record(void *chunk, size_t size);
+static void	chunk_record(extent_tree_t *chunks_szad,
+    extent_tree_t *chunks_ad, void *chunk, size_t size);
 
 /******************************************************************************/
 
 static void *
-chunk_recycle(size_t size, size_t alignment, bool base, bool *zero)
+chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size,
+    size_t alignment, bool base, bool *zero)
 {
 	void *ret;
 	extent_node_t *node;
 	extent_node_t key;
 	size_t alloc_size, leadsize, trailsize;
 	bool zeroed;
 
 	if (base) {
@@ -57,34 +63,34 @@ chunk_recycle(size_t size, size_t alignm
 
 	alloc_size = size + alignment - chunksize;
 	/* Beware size_t wrap-around. */
 	if (alloc_size < size)
 		return (NULL);
 	key.addr = NULL;
 	key.size = alloc_size;
 	malloc_mutex_lock(&chunks_mtx);
-	node = extent_tree_szad_nsearch(&chunks_szad, &key);
+	node = extent_tree_szad_nsearch(chunks_szad, &key);
 	if (node == NULL) {
 		malloc_mutex_unlock(&chunks_mtx);
 		return (NULL);
 	}
 	leadsize = ALIGNMENT_CEILING((uintptr_t)node->addr, alignment) -
 	    (uintptr_t)node->addr;
 	assert(node->size >= leadsize + size);
 	trailsize = node->size - leadsize - size;
 	ret = (void *)((uintptr_t)node->addr + leadsize);
 	/* Remove node from the tree. */
-	extent_tree_szad_remove(&chunks_szad, node);
-	extent_tree_ad_remove(&chunks_ad, node);
+	extent_tree_szad_remove(chunks_szad, node);
+	extent_tree_ad_remove(chunks_ad, node);
 	if (leadsize != 0) {
 		/* Insert the leading space as a smaller chunk. */
 		node->size = leadsize;
-		extent_tree_szad_insert(&chunks_szad, node);
-		extent_tree_ad_insert(&chunks_ad, node);
+		extent_tree_szad_insert(chunks_szad, node);
+		extent_tree_ad_insert(chunks_ad, node);
 		node = NULL;
 	}
 	if (trailsize != 0) {
 		/* Insert the trailing space as a smaller chunk. */
 		if (node == NULL) {
 			/*
 			 * An additional node is required, but
 			 * base_node_alloc() can cause a new base chunk to be
@@ -97,18 +103,18 @@ chunk_recycle(size_t size, size_t alignm
 			if (node == NULL) {
 				chunk_dealloc(ret, size, true);
 				return (NULL);
 			}
 			malloc_mutex_lock(&chunks_mtx);
 		}
 		node->addr = (void *)((uintptr_t)(ret) + size);
 		node->size = trailsize;
-		extent_tree_szad_insert(&chunks_szad, node);
-		extent_tree_ad_insert(&chunks_ad, node);
+		extent_tree_szad_insert(chunks_szad, node);
+		extent_tree_ad_insert(chunks_ad, node);
 		node = NULL;
 	}
 	malloc_mutex_unlock(&chunks_mtx);
 
 	zeroed = false;
 	if (node != NULL) {
 		if (node->zeroed) {
 			zeroed = true;
@@ -125,36 +131,46 @@ chunk_recycle(size_t size, size_t alignm
 
 /*
  * If the caller specifies (*zero == false), it is still possible to receive
  * zeroed memory, in which case *zero is toggled to true.  arena_chunk_alloc()
  * takes advantage of this to avoid demanding zeroed chunks, but taking
  * advantage of them if they are returned.
  */
 void *
-chunk_alloc(size_t size, size_t alignment, bool base, bool *zero)
+chunk_alloc(size_t size, size_t alignment, bool base, bool *zero,
+    dss_prec_t dss_prec)
 {
 	void *ret;
 
 	assert(size != 0);
 	assert((size & chunksize_mask) == 0);
 	assert(alignment != 0);
 	assert((alignment & chunksize_mask) == 0);
 
-	ret = chunk_recycle(size, alignment, base, zero);
-	if (ret != NULL)
+	/* "primary" dss. */
+	if (config_dss && dss_prec == dss_prec_primary) {
+		if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size,
+		    alignment, base, zero)) != NULL)
+			goto label_return;
+		if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL)
+			goto label_return;
+	}
+	/* mmap. */
+	if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, size,
+	    alignment, base, zero)) != NULL)
 		goto label_return;
-
-	ret = chunk_alloc_mmap(size, alignment, zero);
-	if (ret != NULL)
+	if ((ret = chunk_alloc_mmap(size, alignment, zero)) != NULL)
 		goto label_return;
-
-	if (config_dss) {
-		ret = chunk_alloc_dss(size, alignment, zero);
-		if (ret != NULL)
+	/* "secondary" dss. */
+	if (config_dss && dss_prec == dss_prec_secondary) {
+		if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size,
+		    alignment, base, zero)) != NULL)
+			goto label_return;
+		if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL)
 			goto label_return;
 	}
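
Which branch of the ordering above is taken depends on the dss precedence passed in, which callers can change per arena through the "arena.<i>.dss" mallctl added in the ctl.c changes below. A minimal sketch, assuming unmangled symbol names and a platform where dss (sbrk) is supported; not part of the patch:

#include <jemalloc/jemalloc.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	const char *dss = "primary";
	const char *cur;
	size_t sz = sizeof(cur);

	/* Prefer sbrk()-backed chunks for arena 0 from now on. */
	if (mallctl("arena.0.dss", NULL, NULL, (void *)&dss, sizeof(dss)) != 0)
		return (1);

	/* Read the setting back. */
	if (mallctl("arena.0.dss", &cur, &sz, NULL, 0) != 0)
		return (1);
	printf("arena.0.dss = %s\n", cur);
	return (0);
}
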
 
 	/* All strategies for allocation failed. */
 	ret = NULL;
 label_return:
 	if (config_ivsalloc && base == false && ret != NULL) {
 		if (rtree_set(chunks_rtree, (uintptr_t)ret, ret)) {
@@ -186,46 +202,47 @@ label_return:
 		for (i = 0; i < size / sizeof(size_t); i++)
 			assert(p[i] == 0);
 	}
 	assert(CHUNK_ADDR2BASE(ret) == ret);
 	return (ret);
 }
 
 static void
-chunk_record(void *chunk, size_t size)
+chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
+    size_t size)
 {
 	bool unzeroed;
 	extent_node_t *xnode, *node, *prev, key;
 
 	unzeroed = pages_purge(chunk, size);
 
 	/*
 	 * Allocate a node before acquiring chunks_mtx even though it might not
 	 * be needed, because base_node_alloc() may cause a new base chunk to
 	 * be allocated, which could cause deadlock if chunks_mtx were already
 	 * held.
 	 */
 	xnode = base_node_alloc();
 
 	malloc_mutex_lock(&chunks_mtx);
 	key.addr = (void *)((uintptr_t)chunk + size);
-	node = extent_tree_ad_nsearch(&chunks_ad, &key);
+	node = extent_tree_ad_nsearch(chunks_ad, &key);
 	/* Try to coalesce forward. */
 	if (node != NULL && node->addr == key.addr) {
 		/*
 		 * Coalesce chunk with the following address range.  This does
 		 * not change the position within chunks_ad, so only
 		 * remove/insert from/into chunks_szad.
 		 */
-		extent_tree_szad_remove(&chunks_szad, node);
+		extent_tree_szad_remove(chunks_szad, node);
 		node->addr = chunk;
 		node->size += size;
 		node->zeroed = (node->zeroed && (unzeroed == false));
-		extent_tree_szad_insert(&chunks_szad, node);
+		extent_tree_szad_insert(chunks_szad, node);
 		if (xnode != NULL)
 			base_node_dealloc(xnode);
 	} else {
 		/* Coalescing forward failed, so insert a new node. */
 		if (xnode == NULL) {
 			/*
 			 * base_node_alloc() failed, which is an exceedingly
 			 * unlikely failure.  Leak chunk; its pages have
@@ -234,65 +251,77 @@ chunk_record(void *chunk, size_t size)
 			 */
 			malloc_mutex_unlock(&chunks_mtx);
 			return;
 		}
 		node = xnode;
 		node->addr = chunk;
 		node->size = size;
 		node->zeroed = (unzeroed == false);
-		extent_tree_ad_insert(&chunks_ad, node);
-		extent_tree_szad_insert(&chunks_szad, node);
+		extent_tree_ad_insert(chunks_ad, node);
+		extent_tree_szad_insert(chunks_szad, node);
 	}
 
 	/* Try to coalesce backward. */
-	prev = extent_tree_ad_prev(&chunks_ad, node);
+	prev = extent_tree_ad_prev(chunks_ad, node);
 	if (prev != NULL && (void *)((uintptr_t)prev->addr + prev->size) ==
 	    chunk) {
 		/*
 		 * Coalesce chunk with the previous address range.  This does
 		 * not change the position within chunks_ad, so only
 		 * remove/insert node from/into chunks_szad.
 		 */
-		extent_tree_szad_remove(&chunks_szad, prev);
-		extent_tree_ad_remove(&chunks_ad, prev);
+		extent_tree_szad_remove(chunks_szad, prev);
+		extent_tree_ad_remove(chunks_ad, prev);
 
-		extent_tree_szad_remove(&chunks_szad, node);
+		extent_tree_szad_remove(chunks_szad, node);
 		node->addr = prev->addr;
 		node->size += prev->size;
 		node->zeroed = (node->zeroed && prev->zeroed);
-		extent_tree_szad_insert(&chunks_szad, node);
+		extent_tree_szad_insert(chunks_szad, node);
 
 		base_node_dealloc(prev);
 	}
 	malloc_mutex_unlock(&chunks_mtx);
 }
 
 void
+chunk_unmap(void *chunk, size_t size)
+{
+	assert(chunk != NULL);
+	assert(CHUNK_ADDR2BASE(chunk) == chunk);
+	assert(size != 0);
+	assert((size & chunksize_mask) == 0);
+
+	if (config_dss && chunk_in_dss(chunk))
+		chunk_record(&chunks_szad_dss, &chunks_ad_dss, chunk, size);
+	else if (chunk_dealloc_mmap(chunk, size))
+		chunk_record(&chunks_szad_mmap, &chunks_ad_mmap, chunk, size);
+}
+
+void
 chunk_dealloc(void *chunk, size_t size, bool unmap)
 {
 
 	assert(chunk != NULL);
 	assert(CHUNK_ADDR2BASE(chunk) == chunk);
 	assert(size != 0);
 	assert((size & chunksize_mask) == 0);
 
 	if (config_ivsalloc)
 		rtree_set(chunks_rtree, (uintptr_t)chunk, NULL);
 	if (config_stats || config_prof) {
 		malloc_mutex_lock(&chunks_mtx);
+		assert(stats_chunks.curchunks >= (size / chunksize));
 		stats_chunks.curchunks -= (size / chunksize);
 		malloc_mutex_unlock(&chunks_mtx);
 	}
 
-	if (unmap) {
-		if ((config_dss && chunk_in_dss(chunk)) ||
-		    chunk_dealloc_mmap(chunk, size))
-			chunk_record(chunk, size);
-	}
+	if (unmap)
+		chunk_unmap(chunk, size);
 }
 
 bool
 chunk_boot(void)
 {
 
 	/* Set variables according to the value of opt_lg_chunk. */
 	chunksize = (ZU(1) << opt_lg_chunk);
@@ -302,18 +331,20 @@ chunk_boot(void)
 
 	if (config_stats || config_prof) {
 		if (malloc_mutex_init(&chunks_mtx))
 			return (true);
 		memset(&stats_chunks, 0, sizeof(chunk_stats_t));
 	}
 	if (config_dss && chunk_dss_boot())
 		return (true);
-	extent_tree_szad_new(&chunks_szad);
-	extent_tree_ad_new(&chunks_ad);
+	extent_tree_szad_new(&chunks_szad_mmap);
+	extent_tree_ad_new(&chunks_ad_mmap);
+	extent_tree_szad_new(&chunks_szad_dss);
+	extent_tree_ad_new(&chunks_ad_dss);
 	if (config_ivsalloc) {
 		chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) -
 		    opt_lg_chunk);
 		if (chunks_rtree == NULL)
 			return (true);
 	}
 
 	return (false);
--- a/memory/jemalloc/src/src/chunk_dss.c
+++ b/memory/jemalloc/src/src/chunk_dss.c
@@ -1,13 +1,23 @@
 #define	JEMALLOC_CHUNK_DSS_C_
 #include "jemalloc/internal/jemalloc_internal.h"
 /******************************************************************************/
 /* Data. */
 
+const char	*dss_prec_names[] = {
+	"disabled",
+	"primary",
+	"secondary",
+	"N/A"
+};
+
+/* Current dss precedence default, used when creating new arenas. */
+static dss_prec_t	dss_prec_default = DSS_PREC_DEFAULT;
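
The array is indexed by dss_prec_t (see arena_stats_merge() in the arena.c changes above), so the string and enum orderings have to stay in step. A standalone sketch of mapping a precedence string back to the enum, the way a mallctl write has to; the enum numbering here is an assumption copied to match the dss header, which is not part of this excerpt:

#include <assert.h>
#include <stdbool.h>
#include <string.h>

typedef enum {
	dss_prec_disabled = 0,
	dss_prec_primary = 1,
	dss_prec_secondary = 2,
	dss_prec_limit = 3
} dss_prec_t;

static const char *dss_prec_names[] = {
	"disabled",
	"primary",
	"secondary",
	"N/A"			/* reported for stats, never accepted as input */
};

/* Returns true on error (unknown name), matching jemalloc's bool convention. */
static bool
dss_prec_from_name(const char *name, dss_prec_t *prec)
{
	int i;

	for (i = 0; i < (int)dss_prec_limit; i++) {
		if (strcmp(name, dss_prec_names[i]) == 0) {
			*prec = (dss_prec_t)i;
			return (false);
		}
	}
	return (true);
}

int
main(void)
{
	dss_prec_t prec;

	assert(dss_prec_from_name("secondary", &prec) == false);
	assert(prec == dss_prec_secondary);
	assert(dss_prec_from_name("bogus", &prec) == true);
	return (0);
}
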
+
 /*
  * Protects sbrk() calls.  This avoids malloc races among threads, though it
  * does not protect against races with threads that call sbrk() directly.
  */
 static malloc_mutex_t	dss_mtx;
 
 /* Base address of the DSS. */
 static void		*dss_base;
@@ -24,16 +34,41 @@ sbrk(intptr_t increment)
 {
 
 	not_implemented();
 
 	return (NULL);
 }
 #endif
 
+dss_prec_t
+chunk_dss_prec_get(void)
+{
+	dss_prec_t ret;
+
+	if (config_dss == false)
+		return (dss_prec_disabled);
+	malloc_mutex_lock(&dss_mtx);
+	ret = dss_prec_default;
+	malloc_mutex_unlock(&dss_mtx);
+	return (ret);
+}
+
+bool
+chunk_dss_prec_set(dss_prec_t dss_prec)
+{
+
+	if (config_dss == false)
+		return (true);
+	malloc_mutex_lock(&dss_mtx);
+	dss_prec_default = dss_prec;
+	malloc_mutex_unlock(&dss_mtx);
+	return (false);
+}
+
 void *
 chunk_alloc_dss(size_t size, size_t alignment, bool *zero)
 {
 	void *ret;
 
 	cassert(config_dss);
 	assert(size > 0 && (size & chunksize_mask) == 0);
 	assert(alignment > 0 && (alignment & chunksize_mask) == 0);
@@ -83,17 +118,17 @@ chunk_alloc_dss(size_t size, size_t alig
 			}
 			incr = gap_size + cpad_size + size;
 			dss_prev = sbrk(incr);
 			if (dss_prev == dss_max) {
 				/* Success. */
 				dss_max = dss_next;
 				malloc_mutex_unlock(&dss_mtx);
 				if (cpad_size != 0)
-					chunk_dealloc(cpad, cpad_size, true);
+					chunk_unmap(cpad, cpad_size);
 				if (*zero) {
 					VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
 					memset(ret, 0, size);
 				}
 				return (ret);
 			}
 		} while (dss_prev != (void *)-1);
 	}
--- a/memory/jemalloc/src/src/ctl.c
+++ b/memory/jemalloc/src/src/ctl.c
@@ -43,26 +43,27 @@ ctl_indexed_node(const ctl_node_t *node)
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */
 
 #define	CTL_PROTO(n)							\
 static int	n##_ctl(const size_t *mib, size_t miblen, void *oldp,	\
     size_t *oldlenp, void *newp, size_t newlen);
 
 #define	INDEX_PROTO(n)							\
-const ctl_named_node_t	*n##_index(const size_t *mib, size_t miblen,	\
-    size_t i);
+static const ctl_named_node_t	*n##_index(const size_t *mib,		\
+    size_t miblen, size_t i);
 
 static bool	ctl_arena_init(ctl_arena_stats_t *astats);
 static void	ctl_arena_clear(ctl_arena_stats_t *astats);
 static void	ctl_arena_stats_amerge(ctl_arena_stats_t *cstats,
     arena_t *arena);
 static void	ctl_arena_stats_smerge(ctl_arena_stats_t *sstats,
     ctl_arena_stats_t *astats);
 static void	ctl_arena_refresh(arena_t *arena, unsigned i);
+static bool	ctl_grow(void);
 static void	ctl_refresh(void);
 static bool	ctl_init(void);
 static int	ctl_lookup(const char *name, ctl_node_t const **nodesp,
     size_t *mibp, size_t *depthp);
 
 CTL_PROTO(version)
 CTL_PROTO(epoch)
 CTL_PROTO(thread_tcache_enabled)
@@ -83,16 +84,17 @@ CTL_PROTO(config_prof_libgcc)
 CTL_PROTO(config_prof_libunwind)
 CTL_PROTO(config_stats)
 CTL_PROTO(config_tcache)
 CTL_PROTO(config_tls)
 CTL_PROTO(config_utrace)
 CTL_PROTO(config_valgrind)
 CTL_PROTO(config_xmalloc)
 CTL_PROTO(opt_abort)
+CTL_PROTO(opt_dss)
 CTL_PROTO(opt_lg_chunk)
 CTL_PROTO(opt_narenas)
 CTL_PROTO(opt_lg_dirty_mult)
 CTL_PROTO(opt_stats_print)
 CTL_PROTO(opt_junk)
 CTL_PROTO(opt_zero)
 CTL_PROTO(opt_quarantine)
 CTL_PROTO(opt_redzone)
@@ -105,31 +107,36 @@ CTL_PROTO(opt_prof)
 CTL_PROTO(opt_prof_prefix)
 CTL_PROTO(opt_prof_active)
 CTL_PROTO(opt_lg_prof_sample)
 CTL_PROTO(opt_lg_prof_interval)
 CTL_PROTO(opt_prof_gdump)
 CTL_PROTO(opt_prof_final)
 CTL_PROTO(opt_prof_leak)
 CTL_PROTO(opt_prof_accum)
+CTL_PROTO(arena_i_purge)
+static void	arena_purge(unsigned arena_ind);
+CTL_PROTO(arena_i_dss)
+INDEX_PROTO(arena_i)
 CTL_PROTO(arenas_bin_i_size)
 CTL_PROTO(arenas_bin_i_nregs)
 CTL_PROTO(arenas_bin_i_run_size)
 INDEX_PROTO(arenas_bin_i)
 CTL_PROTO(arenas_lrun_i_size)
 INDEX_PROTO(arenas_lrun_i)
 CTL_PROTO(arenas_narenas)
 CTL_PROTO(arenas_initialized)
 CTL_PROTO(arenas_quantum)
 CTL_PROTO(arenas_page)
 CTL_PROTO(arenas_tcache_max)
 CTL_PROTO(arenas_nbins)
 CTL_PROTO(arenas_nhbins)
 CTL_PROTO(arenas_nlruns)
 CTL_PROTO(arenas_purge)
+CTL_PROTO(arenas_extend)
 CTL_PROTO(prof_active)
 CTL_PROTO(prof_dump)
 CTL_PROTO(prof_interval)
 CTL_PROTO(stats_chunks_current)
 CTL_PROTO(stats_chunks_total)
 CTL_PROTO(stats_chunks_high)
 CTL_PROTO(stats_huge_allocated)
 CTL_PROTO(stats_huge_nmalloc)
@@ -153,16 +160,17 @@ CTL_PROTO(stats_arenas_i_bins_j_nreruns)
 CTL_PROTO(stats_arenas_i_bins_j_curruns)
 INDEX_PROTO(stats_arenas_i_bins_j)
 CTL_PROTO(stats_arenas_i_lruns_j_nmalloc)
 CTL_PROTO(stats_arenas_i_lruns_j_ndalloc)
 CTL_PROTO(stats_arenas_i_lruns_j_nrequests)
 CTL_PROTO(stats_arenas_i_lruns_j_curruns)
 INDEX_PROTO(stats_arenas_i_lruns_j)
 CTL_PROTO(stats_arenas_i_nthreads)
+CTL_PROTO(stats_arenas_i_dss)
 CTL_PROTO(stats_arenas_i_pactive)
 CTL_PROTO(stats_arenas_i_pdirty)
 CTL_PROTO(stats_arenas_i_mapped)
 CTL_PROTO(stats_arenas_i_npurge)
 CTL_PROTO(stats_arenas_i_nmadvise)
 CTL_PROTO(stats_arenas_i_purged)
 INDEX_PROTO(stats_arenas_i)
 CTL_PROTO(stats_cactive)
@@ -218,16 +226,17 @@ static const ctl_named_node_t	config_nod
 	{NAME("tls"),			CTL(config_tls)},
 	{NAME("utrace"),		CTL(config_utrace)},
 	{NAME("valgrind"),		CTL(config_valgrind)},
 	{NAME("xmalloc"),		CTL(config_xmalloc)}
 };
 
 static const ctl_named_node_t opt_node[] = {
 	{NAME("abort"),			CTL(opt_abort)},
+	{NAME("dss"),			CTL(opt_dss)},
 	{NAME("lg_chunk"),		CTL(opt_lg_chunk)},
 	{NAME("narenas"),		CTL(opt_narenas)},
 	{NAME("lg_dirty_mult"),		CTL(opt_lg_dirty_mult)},
 	{NAME("stats_print"),		CTL(opt_stats_print)},
 	{NAME("junk"),			CTL(opt_junk)},
 	{NAME("zero"),			CTL(opt_zero)},
 	{NAME("quarantine"),		CTL(opt_quarantine)},
 	{NAME("redzone"),		CTL(opt_redzone)},
@@ -242,16 +251,28 @@ static const ctl_named_node_t opt_node[]
 	{NAME("lg_prof_sample"),	CTL(opt_lg_prof_sample)},
 	{NAME("lg_prof_interval"),	CTL(opt_lg_prof_interval)},
 	{NAME("prof_gdump"),		CTL(opt_prof_gdump)},
 	{NAME("prof_final"),		CTL(opt_prof_final)},
 	{NAME("prof_leak"),		CTL(opt_prof_leak)},
 	{NAME("prof_accum"),		CTL(opt_prof_accum)}
 };
 
+static const ctl_named_node_t arena_i_node[] = {
+	{NAME("purge"),			CTL(arena_i_purge)},
+	{NAME("dss"),			CTL(arena_i_dss)}
+};
+static const ctl_named_node_t super_arena_i_node[] = {
+	{NAME(""),			CHILD(named, arena_i)}
+};
+
+static const ctl_indexed_node_t arena_node[] = {
+	{INDEX(arena_i)}
+};
+
 static const ctl_named_node_t arenas_bin_i_node[] = {
 	{NAME("size"),			CTL(arenas_bin_i_size)},
 	{NAME("nregs"),			CTL(arenas_bin_i_nregs)},
 	{NAME("run_size"),		CTL(arenas_bin_i_run_size)}
 };
 static const ctl_named_node_t super_arenas_bin_i_node[] = {
 	{NAME(""),			CHILD(named, arenas_bin_i)}
 };
@@ -277,17 +298,18 @@ static const ctl_named_node_t arenas_nod
 	{NAME("quantum"),		CTL(arenas_quantum)},
 	{NAME("page"),			CTL(arenas_page)},
 	{NAME("tcache_max"),		CTL(arenas_tcache_max)},
 	{NAME("nbins"),			CTL(arenas_nbins)},
 	{NAME("nhbins"),		CTL(arenas_nhbins)},
 	{NAME("bin"),			CHILD(indexed, arenas_bin)},
 	{NAME("nlruns"),		CTL(arenas_nlruns)},
 	{NAME("lrun"),			CHILD(indexed, arenas_lrun)},
-	{NAME("purge"),			CTL(arenas_purge)}
+	{NAME("purge"),			CTL(arenas_purge)},
+	{NAME("extend"),		CTL(arenas_extend)}
 };
 
 static const ctl_named_node_t	prof_node[] = {
 	{NAME("active"),	CTL(prof_active)},
 	{NAME("dump"),		CTL(prof_dump)},
 	{NAME("interval"),	CTL(prof_interval)}
 };
 
@@ -347,16 +369,17 @@ static const ctl_named_node_t super_stat
 };
 
 static const ctl_indexed_node_t stats_arenas_i_lruns_node[] = {
 	{INDEX(stats_arenas_i_lruns_j)}
 };
 
 static const ctl_named_node_t stats_arenas_i_node[] = {
 	{NAME("nthreads"),		CTL(stats_arenas_i_nthreads)},
+	{NAME("dss"),			CTL(stats_arenas_i_dss)},
 	{NAME("pactive"),		CTL(stats_arenas_i_pactive)},
 	{NAME("pdirty"),		CTL(stats_arenas_i_pdirty)},
 	{NAME("mapped"),		CTL(stats_arenas_i_mapped)},
 	{NAME("npurge"),		CTL(stats_arenas_i_npurge)},
 	{NAME("nmadvise"),		CTL(stats_arenas_i_nmadvise)},
 	{NAME("purged"),		CTL(stats_arenas_i_purged)},
 	{NAME("small"),			CHILD(named, stats_arenas_i_small)},
 	{NAME("large"),			CHILD(named, stats_arenas_i_large)},
@@ -382,16 +405,17 @@ static const ctl_named_node_t stats_node
 };
 
 static const ctl_named_node_t	root_node[] = {
 	{NAME("version"),	CTL(version)},
 	{NAME("epoch"),		CTL(epoch)},
 	{NAME("thread"),	CHILD(named, thread)},
 	{NAME("config"),	CHILD(named, config)},
 	{NAME("opt"),		CHILD(named, opt)},
+	{NAME("arena"),		CHILD(indexed, arena)},
 	{NAME("arenas"),	CHILD(named, arenas)},
 	{NAME("prof"),		CHILD(named, prof)},
 	{NAME("stats"),		CHILD(named, stats)}
 };
 static const ctl_named_node_t super_root_node[] = {
 	{NAME(""),		CHILD(named, root)}
 };
 
@@ -415,16 +439,17 @@ ctl_arena_init(ctl_arena_stats_t *astats
 
 	return (false);
 }
 
 static void
 ctl_arena_clear(ctl_arena_stats_t *astats)
 {
 
+	astats->dss = dss_prec_names[dss_prec_limit];
 	astats->pactive = 0;
 	astats->pdirty = 0;
 	if (config_stats) {
 		memset(&astats->astats, 0, sizeof(arena_stats_t));
 		astats->allocated_small = 0;
 		astats->nmalloc_small = 0;
 		astats->ndalloc_small = 0;
 		astats->nrequests_small = 0;
@@ -434,18 +459,18 @@ ctl_arena_clear(ctl_arena_stats_t *astat
 	}
 }
 
 static void
 ctl_arena_stats_amerge(ctl_arena_stats_t *cstats, arena_t *arena)
 {
 	unsigned i;
 
-	arena_stats_merge(arena, &cstats->pactive, &cstats->pdirty,
-	    &cstats->astats, cstats->bstats, cstats->lstats);
+	arena_stats_merge(arena, &cstats->dss, &cstats->pactive,
+	    &cstats->pdirty, &cstats->astats, cstats->bstats, cstats->lstats);
 
 	for (i = 0; i < NBINS; i++) {
 		cstats->allocated_small += cstats->bstats[i].allocated;
 		cstats->nmalloc_small += cstats->bstats[i].nmalloc;
 		cstats->ndalloc_small += cstats->bstats[i].ndalloc;
 		cstats->nrequests_small += cstats->bstats[i].nrequests;
 	}
 }
@@ -495,17 +520,17 @@ ctl_arena_stats_smerge(ctl_arena_stats_t
 		sstats->bstats[i].curruns += astats->bstats[i].curruns;
 	}
 }
 
 static void
 ctl_arena_refresh(arena_t *arena, unsigned i)
 {
 	ctl_arena_stats_t *astats = &ctl_stats.arenas[i];
-	ctl_arena_stats_t *sstats = &ctl_stats.arenas[narenas];
+	ctl_arena_stats_t *sstats = &ctl_stats.arenas[ctl_stats.narenas];
 
 	ctl_arena_clear(astats);
 
 	sstats->nthreads += astats->nthreads;
 	if (config_stats) {
 		ctl_arena_stats_amerge(astats, arena);
 		/* Merge into sum stats as well. */
 		ctl_arena_stats_smerge(sstats, astats);
@@ -513,21 +538,82 @@ ctl_arena_refresh(arena_t *arena, unsign
 		astats->pactive += arena->nactive;
 		astats->pdirty += arena->ndirty;
 		/* Merge into sum stats as well. */
 		sstats->pactive += arena->nactive;
 		sstats->pdirty += arena->ndirty;
 	}
 }
 
+static bool
+ctl_grow(void)
+{
+	size_t astats_size;
+	ctl_arena_stats_t *astats;
+	arena_t **tarenas;
+
+	/* Extend arena stats and arenas arrays. */
+	astats_size = (ctl_stats.narenas + 2) * sizeof(ctl_arena_stats_t);
+	if (ctl_stats.narenas == narenas_auto) {
+		/* ctl_stats.arenas and arenas came from base_alloc(). */
+		astats = (ctl_arena_stats_t *)imalloc(astats_size);
+		if (astats == NULL)
+			return (true);
+		memcpy(astats, ctl_stats.arenas, (ctl_stats.narenas + 1) *
+		    sizeof(ctl_arena_stats_t));
+
+		tarenas = (arena_t **)imalloc((ctl_stats.narenas + 1) *
+		    sizeof(arena_t *));
+		if (tarenas == NULL) {
+			idalloc(astats);
+			return (true);
+		}
+		memcpy(tarenas, arenas, ctl_stats.narenas * sizeof(arena_t *));
+	} else {
+		astats = (ctl_arena_stats_t *)iralloc(ctl_stats.arenas,
+		    astats_size, 0, 0, false, false);
+		if (astats == NULL)
+			return (true);
+
+		tarenas = (arena_t **)iralloc(arenas, (ctl_stats.narenas + 1) *
+		    sizeof(arena_t *), 0, 0, false, false);
+		if (tarenas == NULL)
+			return (true);
+	}
+	/* Initialize the new astats and arenas elements. */
+	memset(&astats[ctl_stats.narenas + 1], 0, sizeof(ctl_arena_stats_t));
+	if (ctl_arena_init(&astats[ctl_stats.narenas + 1]))
+		return (true);
+	tarenas[ctl_stats.narenas] = NULL;
+	/* Swap merged stats to their new location. */
+	{
+		ctl_arena_stats_t tstats;
+		memcpy(&tstats, &astats[ctl_stats.narenas],
+		    sizeof(ctl_arena_stats_t));
+		memcpy(&astats[ctl_stats.narenas],
+		    &astats[ctl_stats.narenas + 1], sizeof(ctl_arena_stats_t));
+		memcpy(&astats[ctl_stats.narenas + 1], &tstats,
+		    sizeof(ctl_arena_stats_t));
+	}
+	ctl_stats.arenas = astats;
+	ctl_stats.narenas++;
+	malloc_mutex_lock(&arenas_lock);
+	arenas = tarenas;
+	narenas_total++;
+	arenas_extend(narenas_total - 1);
+	malloc_mutex_unlock(&arenas_lock);
+
+	return (false);
+}
+
 static void
 ctl_refresh(void)
 {
 	unsigned i;
-	VARIABLE_ARRAY(arena_t *, tarenas, narenas);
+	VARIABLE_ARRAY(arena_t *, tarenas, ctl_stats.narenas);
 
 	if (config_stats) {
 		malloc_mutex_lock(&chunks_mtx);
 		ctl_stats.chunks.current = stats_chunks.curchunks;
 		ctl_stats.chunks.total = stats_chunks.nchunks;
 		ctl_stats.chunks.high = stats_chunks.highchunks;
 		malloc_mutex_unlock(&chunks_mtx);
 
@@ -537,42 +623,44 @@ ctl_refresh(void)
 		ctl_stats.huge.ndalloc = huge_ndalloc;
 		malloc_mutex_unlock(&huge_mtx);
 	}
 
 	/*
 	 * Clear sum stats, since they will be merged into by
 	 * ctl_arena_refresh().
 	 */
-	ctl_stats.arenas[narenas].nthreads = 0;
-	ctl_arena_clear(&ctl_stats.arenas[narenas]);
+	ctl_stats.arenas[ctl_stats.narenas].nthreads = 0;
+	ctl_arena_clear(&ctl_stats.arenas[ctl_stats.narenas]);
 
 	malloc_mutex_lock(&arenas_lock);
-	memcpy(tarenas, arenas, sizeof(arena_t *) * narenas);
-	for (i = 0; i < narenas; i++) {
+	memcpy(tarenas, arenas, sizeof(arena_t *) * ctl_stats.narenas);
+	for (i = 0; i < ctl_stats.narenas; i++) {
 		if (arenas[i] != NULL)
 			ctl_stats.arenas[i].nthreads = arenas[i]->nthreads;
 		else
 			ctl_stats.arenas[i].nthreads = 0;
 	}
 	malloc_mutex_unlock(&arenas_lock);
-	for (i = 0; i < narenas; i++) {
+	for (i = 0; i < ctl_stats.narenas; i++) {
 		bool initialized = (tarenas[i] != NULL);
 
 		ctl_stats.arenas[i].initialized = initialized;
 		if (initialized)
 			ctl_arena_refresh(tarenas[i], i);
 	}
 
 	if (config_stats) {
-		ctl_stats.allocated = ctl_stats.arenas[narenas].allocated_small
-		    + ctl_stats.arenas[narenas].astats.allocated_large
+		ctl_stats.allocated =
+		    ctl_stats.arenas[ctl_stats.narenas].allocated_small
+		    + ctl_stats.arenas[ctl_stats.narenas].astats.allocated_large
 		    + ctl_stats.huge.allocated;
-		ctl_stats.active = (ctl_stats.arenas[narenas].pactive <<
-		    LG_PAGE) + ctl_stats.huge.allocated;
+		ctl_stats.active =
+		    (ctl_stats.arenas[ctl_stats.narenas].pactive << LG_PAGE)
+		    + ctl_stats.huge.allocated;
 		ctl_stats.mapped = (ctl_stats.chunks.current << opt_lg_chunk);
 	}
 
 	ctl_epoch++;
 }
 
 static bool
 ctl_init(void)
@@ -580,40 +668,42 @@ ctl_init(void)
 	bool ret;
 
 	malloc_mutex_lock(&ctl_mtx);
 	if (ctl_initialized == false) {
 		/*
 		 * Allocate space for one extra arena stats element, which
 		 * contains summed stats across all arenas.
 		 */
+		assert(narenas_auto == narenas_total_get());
+		ctl_stats.narenas = narenas_auto;
 		ctl_stats.arenas = (ctl_arena_stats_t *)base_alloc(
-		    (narenas + 1) * sizeof(ctl_arena_stats_t));
+		    (ctl_stats.narenas + 1) * sizeof(ctl_arena_stats_t));
 		if (ctl_stats.arenas == NULL) {
 			ret = true;
 			goto label_return;
 		}
-		memset(ctl_stats.arenas, 0, (narenas + 1) *
+		memset(ctl_stats.arenas, 0, (ctl_stats.narenas + 1) *
 		    sizeof(ctl_arena_stats_t));
 
 		/*
 		 * Initialize all stats structures, regardless of whether they
 		 * ever get used.  Lazy initialization would allow errors to
 		 * cause inconsistent state to be viewable by the application.
 		 */
 		if (config_stats) {
 			unsigned i;
-			for (i = 0; i <= narenas; i++) {
+			for (i = 0; i <= ctl_stats.narenas; i++) {
 				if (ctl_arena_init(&ctl_stats.arenas[i])) {
 					ret = true;
 					goto label_return;
 				}
 			}
 		}
-		ctl_stats.arenas[narenas].initialized = true;
+		ctl_stats.arenas[ctl_stats.narenas].initialized = true;
 
 		ctl_epoch = 0;
 		ctl_refresh();
 		ctl_initialized = true;
 	}
 
 	ret = false;
 label_return:
@@ -865,31 +955,31 @@ ctl_postfork_child(void)
 	}								\
 } while (0)
 
 #define	READ(v, t)	do {						\
 	if (oldp != NULL && oldlenp != NULL) {				\
 		if (*oldlenp != sizeof(t)) {				\
 			size_t	copylen = (sizeof(t) <= *oldlenp)	\
 			    ? sizeof(t) : *oldlenp;			\
-			memcpy(oldp, (void *)&v, copylen);		\
+			memcpy(oldp, (void *)&(v), copylen);		\
 			ret = EINVAL;					\
 			goto label_return;				\
 		} else							\
-			*(t *)oldp = v;					\
+			*(t *)oldp = (v);				\
 	}								\
 } while (0)
 
 #define	WRITE(v, t)	do {						\
 	if (newp != NULL) {						\
 		if (newlen != sizeof(t)) {				\
 			ret = EINVAL;					\
 			goto label_return;				\
 		}							\
-		v = *(t *)newp;						\
+		(v) = *(t *)newp;					\
 	}								\
 } while (0)
 
 /*
  * There's a lot of code duplication in the following macros due to limitations
  * in how nested cpp macros are expanded.
  */
 #define	CTL_RO_CLGEN(c, l, n, v, t)					\
@@ -900,17 +990,17 @@ n##_ctl(const size_t *mib, size_t miblen
 	int ret;							\
 	t oldval;							\
 									\
 	if ((c) == false)						\
 		return (ENOENT);					\
 	if (l)								\
 		malloc_mutex_lock(&ctl_mtx);				\
 	READONLY();							\
-	oldval = v;							\
+	oldval = (v);							\
 	READ(oldval, t);						\
 									\
 	ret = 0;							\
 label_return:								\
 	if (l)								\
 		malloc_mutex_unlock(&ctl_mtx);				\
 	return (ret);							\
 }
@@ -922,17 +1012,17 @@ n##_ctl(const size_t *mib, size_t miblen
 {									\
 	int ret;							\
 	t oldval;							\
 									\
 	if ((c) == false)						\
 		return (ENOENT);					\
 	malloc_mutex_lock(&ctl_mtx);					\
 	READONLY();							\
-	oldval = v;							\
+	oldval = (v);							\
 	READ(oldval, t);						\
 									\
 	ret = 0;							\
 label_return:								\
 	malloc_mutex_unlock(&ctl_mtx);					\
 	return (ret);							\
 }
 
@@ -941,17 +1031,17 @@ static int								\
 n##_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,	\
     void *newp, size_t newlen)						\
 {									\
 	int ret;							\
 	t oldval;							\
 									\
 	malloc_mutex_lock(&ctl_mtx);					\
 	READONLY();							\
-	oldval = v;							\
+	oldval = (v);							\
 	READ(oldval, t);						\
 									\
 	ret = 0;							\
 label_return:								\
 	malloc_mutex_unlock(&ctl_mtx);					\
 	return (ret);							\
 }
 
@@ -965,34 +1055,34 @@ n##_ctl(const size_t *mib, size_t miblen
     void *newp, size_t newlen)						\
 {									\
 	int ret;							\
 	t oldval;							\
 									\
 	if ((c) == false)						\
 		return (ENOENT);					\
 	READONLY();							\
-	oldval = v;							\
+	oldval = (v);							\
 	READ(oldval, t);						\
 									\
 	ret = 0;							\
 label_return:								\
 	return (ret);							\
 }
 
 #define	CTL_RO_NL_GEN(n, v, t)						\
 static int								\
 n##_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,	\
     void *newp, size_t newlen)						\
 {									\
 	int ret;							\
 	t oldval;							\
 									\
 	READONLY();							\
-	oldval = v;							\
+	oldval = (v);							\
 	READ(oldval, t);						\
 									\
 	ret = 0;							\
 label_return:								\
 	return (ret);							\
 }
 
 #define	CTL_RO_BOOL_CONFIG_GEN(n)					\
@@ -1079,23 +1169,24 @@ label_return:
 
 static int
 thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
     void *newp, size_t newlen)
 {
 	int ret;
 	unsigned newind, oldind;
 
+	malloc_mutex_lock(&ctl_mtx);
 	newind = oldind = choose_arena(NULL)->ind;
 	WRITE(newind, unsigned);
 	READ(oldind, unsigned);
 	if (newind != oldind) {
 		arena_t *arena;
 
-		if (newind >= narenas) {
+		if (newind >= ctl_stats.narenas) {
 			/* New arena index is out of range. */
 			ret = EFAULT;
 			goto label_return;
 		}
 
 		/* Initialize arena if necessary. */
 		malloc_mutex_lock(&arenas_lock);
 		if ((arena = arenas[newind]) == NULL && (arena =
@@ -1118,16 +1209,17 @@ thread_arena_ctl(const size_t *mib, size
 				tcache_arena_associate(tcache, arena);
 			}
 		}
 		arenas_tsd_set(&arena);
 	}
 
 	ret = 0;
 label_return:
+	malloc_mutex_unlock(&ctl_mtx);
 	return (ret);
 }
 
 CTL_RO_NL_CGEN(config_stats, thread_allocated,
     thread_allocated_tsd_get()->allocated, uint64_t)
 CTL_RO_NL_CGEN(config_stats, thread_allocatedp,
     &thread_allocated_tsd_get()->allocated, uint64_t *)
 CTL_RO_NL_CGEN(config_stats, thread_deallocated,
@@ -1151,16 +1243,17 @@ CTL_RO_BOOL_CONFIG_GEN(config_tcache)
 CTL_RO_BOOL_CONFIG_GEN(config_tls)
 CTL_RO_BOOL_CONFIG_GEN(config_utrace)
 CTL_RO_BOOL_CONFIG_GEN(config_valgrind)
 CTL_RO_BOOL_CONFIG_GEN(config_xmalloc)
 
 /******************************************************************************/
 
 CTL_RO_NL_GEN(opt_abort, opt_abort, bool)
+CTL_RO_NL_GEN(opt_dss, opt_dss, const char *)
 CTL_RO_NL_GEN(opt_lg_chunk, opt_lg_chunk, size_t)
 CTL_RO_NL_GEN(opt_narenas, opt_narenas, size_t)
 CTL_RO_NL_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t)
 CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool)
 CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, bool)
 CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool)
 CTL_RO_NL_CGEN(config_fill, opt_quarantine, opt_quarantine, size_t)
 CTL_RO_NL_CGEN(config_fill, opt_redzone, opt_redzone, bool)
@@ -1176,56 +1269,187 @@ CTL_RO_NL_CGEN(config_prof, opt_lg_prof_
 CTL_RO_NL_CGEN(config_prof, opt_lg_prof_interval, opt_lg_prof_interval, ssize_t)
 CTL_RO_NL_CGEN(config_prof, opt_prof_gdump, opt_prof_gdump, bool)
 CTL_RO_NL_CGEN(config_prof, opt_prof_final, opt_prof_final, bool)
 CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool)
 CTL_RO_NL_CGEN(config_prof, opt_prof_accum, opt_prof_accum, bool)
 
 /******************************************************************************/
 
+/* ctl_mutex must be held during execution of this function. */
+static void
+arena_purge(unsigned arena_ind)
+{
+	VARIABLE_ARRAY(arena_t *, tarenas, ctl_stats.narenas);
+
+	malloc_mutex_lock(&arenas_lock);
+	memcpy(tarenas, arenas, sizeof(arena_t *) * ctl_stats.narenas);
+	malloc_mutex_unlock(&arenas_lock);
+
+	if (arena_ind == ctl_stats.narenas) {
+		unsigned i;
+		for (i = 0; i < ctl_stats.narenas; i++) {
+			if (tarenas[i] != NULL)
+				arena_purge_all(tarenas[i]);
+		}
+	} else {
+		assert(arena_ind < ctl_stats.narenas);
+		if (tarenas[arena_ind] != NULL)
+			arena_purge_all(tarenas[arena_ind]);
+	}
+}
+
+static int
+arena_i_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen)
+{
+	int ret;
+
+	READONLY();
+	WRITEONLY();
+	malloc_mutex_lock(&ctl_mtx);
+	arena_purge(mib[1]);
+	malloc_mutex_unlock(&ctl_mtx);
+
+	ret = 0;
+label_return:
+	return (ret);
+}
+
+static int
+arena_i_dss_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen)
+{
+	int ret, i;
+	bool match, err;
+	const char *dss;
+	unsigned arena_ind = mib[1];
+	dss_prec_t dss_prec_old = dss_prec_limit;
+	dss_prec_t dss_prec = dss_prec_limit;
+
+	malloc_mutex_lock(&ctl_mtx);
+	WRITE(dss, const char *);
+	match = false;
+	for (i = 0; i < dss_prec_limit; i++) {
+		if (strcmp(dss_prec_names[i], dss) == 0) {
+			dss_prec = i;
+			match = true;
+			break;
+		}
+	}
+	if (match == false) {
+		ret = EINVAL;
+		goto label_return;
+	}
+
+	if (arena_ind < ctl_stats.narenas) {
+		arena_t *arena = arenas[arena_ind];
+		if (arena != NULL) {
+			dss_prec_old = arena_dss_prec_get(arena);
+			arena_dss_prec_set(arena, dss_prec);
+			err = false;
+		} else
+			err = true;
+	} else {
+		dss_prec_old = chunk_dss_prec_get();
+		err = chunk_dss_prec_set(dss_prec);
+	}
+	dss = dss_prec_names[dss_prec_old];
+	READ(dss, const char *);
+	if (err) {
+		ret = EFAULT;
+		goto label_return;
+	}
+
+	ret = 0;
+label_return:
+	malloc_mutex_unlock(&ctl_mtx);
+	return (ret);
+}
+
+static const ctl_named_node_t *
+arena_i_index(const size_t *mib, size_t miblen, size_t i)
+{
+	const ctl_named_node_t * ret;
+
+	malloc_mutex_lock(&ctl_mtx);
+	if (i > ctl_stats.narenas) {
+		ret = NULL;
+		goto label_return;
+	}
+
+	ret = super_arena_i_node;
+label_return:
+	malloc_mutex_unlock(&ctl_mtx);
+	return (ret);
+}
+
+
+/******************************************************************************/
+
 CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t)
 CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t)
 CTL_RO_NL_GEN(arenas_bin_i_run_size, arena_bin_info[mib[2]].run_size, size_t)
-const ctl_named_node_t *
+static const ctl_named_node_t *
 arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i)
 {
 
 	if (i > NBINS)
 		return (NULL);
 	return (super_arenas_bin_i_node);
 }
 
 CTL_RO_NL_GEN(arenas_lrun_i_size, ((mib[2]+1) << LG_PAGE), size_t)
-const ctl_named_node_t *
+static const ctl_named_node_t *
 arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i)
 {
 
 	if (i > nlclasses)
 		return (NULL);
 	return (super_arenas_lrun_i_node);
 }
 
-CTL_RO_NL_GEN(arenas_narenas, narenas, unsigned)
+static int
+arenas_narenas_ctl(const size_t *mib, size_t miblen, void *oldp,
+    size_t *oldlenp, void *newp, size_t newlen)
+{
+	int ret;
+	unsigned narenas;
+
+	malloc_mutex_lock(&ctl_mtx);
+	READONLY();
+	if (*oldlenp != sizeof(unsigned)) {
+		ret = EINVAL;
+		goto label_return;
+	}
+	narenas = ctl_stats.narenas;
+	READ(narenas, unsigned);
+
+	ret = 0;
+label_return:
+	malloc_mutex_unlock(&ctl_mtx);
+	return (ret);
+}
 
 static int
 arenas_initialized_ctl(const size_t *mib, size_t miblen, void *oldp,
     size_t *oldlenp, void *newp, size_t newlen)
 {
 	int ret;
 	unsigned nread, i;
 
 	malloc_mutex_lock(&ctl_mtx);
 	READONLY();
-	if (*oldlenp != narenas * sizeof(bool)) {
+	if (*oldlenp != ctl_stats.narenas * sizeof(bool)) {
 		ret = EINVAL;
-		nread = (*oldlenp < narenas * sizeof(bool))
-		    ? (*oldlenp / sizeof(bool)) : narenas;
+		nread = (*oldlenp < ctl_stats.narenas * sizeof(bool))
+		    ? (*oldlenp / sizeof(bool)) : ctl_stats.narenas;
 	} else {
 		ret = 0;
-		nread = narenas;
+		nread = ctl_stats.narenas;
 	}
 
 	for (i = 0; i < nread; i++)
 		((bool *)oldp)[i] = ctl_stats.arenas[i].initialized;
 
 label_return:
 	malloc_mutex_unlock(&ctl_mtx);
 	return (ret);
@@ -1238,46 +1462,55 @@ CTL_RO_NL_GEN(arenas_nbins, NBINS, unsig
 CTL_RO_NL_CGEN(config_tcache, arenas_nhbins, nhbins, unsigned)
 CTL_RO_NL_GEN(arenas_nlruns, nlclasses, size_t)
 
 static int
 arenas_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
     void *newp, size_t newlen)
 {
 	int ret;
-	unsigned arena;
+	unsigned arena_ind;
 
+	malloc_mutex_lock(&ctl_mtx);
 	WRITEONLY();
-	arena = UINT_MAX;
-	WRITE(arena, unsigned);
-	if (newp != NULL && arena >= narenas) {
+	arena_ind = UINT_MAX;
+	WRITE(arena_ind, unsigned);
+	if (newp != NULL && arena_ind >= ctl_stats.narenas)
 		ret = EFAULT;
-		goto label_return;
-	} else {
-		VARIABLE_ARRAY(arena_t *, tarenas, narenas);
+	else {
+		if (arena_ind == UINT_MAX)
+			arena_ind = ctl_stats.narenas;
+		arena_purge(arena_ind);
+		ret = 0;
+	}
 
-		malloc_mutex_lock(&arenas_lock);
-		memcpy(tarenas, arenas, sizeof(arena_t *) * narenas);
-		malloc_mutex_unlock(&arenas_lock);
+label_return:
+	malloc_mutex_unlock(&ctl_mtx);
+	return (ret);
+}
 
-		if (arena == UINT_MAX) {
-			unsigned i;
-			for (i = 0; i < narenas; i++) {
-				if (tarenas[i] != NULL)
-					arena_purge_all(tarenas[i]);
-			}
-		} else {
-			assert(arena < narenas);
-			if (tarenas[arena] != NULL)
-				arena_purge_all(tarenas[arena]);
-		}
+static int
+arenas_extend_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen)
+{
+	int ret;
+	unsigned narenas;
+
+	malloc_mutex_lock(&ctl_mtx);
+	READONLY();
+	if (ctl_grow()) {
+		ret = EAGAIN;
+		goto label_return;
 	}
+	narenas = ctl_stats.narenas - 1;
+	READ(narenas, unsigned);
 
 	ret = 0;
 label_return:
+	malloc_mutex_unlock(&ctl_mtx);
 	return (ret);
 }
 
 /******************************************************************************/
 
 static int
 prof_active_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
     void *newp, size_t newlen)
@@ -1372,17 +1605,17 @@ CTL_RO_CGEN(config_stats && config_tcach
     ctl_stats.arenas[mib[2]].bstats[mib[4]].nflushes, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nruns,
     ctl_stats.arenas[mib[2]].bstats[mib[4]].nruns, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nreruns,
     ctl_stats.arenas[mib[2]].bstats[mib[4]].reruns, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curruns,
     ctl_stats.arenas[mib[2]].bstats[mib[4]].curruns, size_t)
 
-const ctl_named_node_t *
+static const ctl_named_node_t *
 stats_arenas_i_bins_j_index(const size_t *mib, size_t miblen, size_t j)
 {
 
 	if (j > NBINS)
 		return (NULL);
 	return (super_stats_arenas_i_bins_j_node);
 }
 
@@ -1390,44 +1623,45 @@ CTL_RO_CGEN(config_stats, stats_arenas_i
     ctl_stats.arenas[mib[2]].lstats[mib[4]].nmalloc, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_lruns_j_ndalloc,
     ctl_stats.arenas[mib[2]].lstats[mib[4]].ndalloc, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_lruns_j_nrequests,
     ctl_stats.arenas[mib[2]].lstats[mib[4]].nrequests, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_lruns_j_curruns,
     ctl_stats.arenas[mib[2]].lstats[mib[4]].curruns, size_t)
 
-const ctl_named_node_t *
+static const ctl_named_node_t *
 stats_arenas_i_lruns_j_index(const size_t *mib, size_t miblen, size_t j)
 {
 
 	if (j > nlclasses)
 		return (NULL);
 	return (super_stats_arenas_i_lruns_j_node);
 }
 
 CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned)
+CTL_RO_GEN(stats_arenas_i_dss, ctl_stats.arenas[mib[2]].dss, const char *)
 CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t)
 CTL_RO_GEN(stats_arenas_i_pdirty, ctl_stats.arenas[mib[2]].pdirty, size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_mapped,
     ctl_stats.arenas[mib[2]].astats.mapped, size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_npurge,
     ctl_stats.arenas[mib[2]].astats.npurge, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_nmadvise,
     ctl_stats.arenas[mib[2]].astats.nmadvise, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_purged,
     ctl_stats.arenas[mib[2]].astats.purged, uint64_t)
 
-const ctl_named_node_t *
+static const ctl_named_node_t *
 stats_arenas_i_index(const size_t *mib, size_t miblen, size_t i)
 {
 	const ctl_named_node_t * ret;
 
 	malloc_mutex_lock(&ctl_mtx);
-	if (ctl_stats.arenas[i].initialized == false) {
+	if (i > ctl_stats.narenas || ctl_stats.arenas[i].initialized == false) {
 		ret = NULL;
 		goto label_return;
 	}
 
 	ret = super_stats_arenas_i_node;
 label_return:
 	malloc_mutex_unlock(&ctl_mtx);
 	return (ret);
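
Taken together, the ctl additions above give applications three knobs: "arenas.extend" creates a manually managed arena and returns its index, "arena.<i>.purge" purges one arena (or every arena when <i> equals the value reported by "arenas.narenas"), and "arena.<i>.dss" reads or writes one arena's dss precedence (or the global default at that same sentinel index). A hedged usage sketch, modeled on the ALLOCM_ARENA test added at the end of this patch; the helper name and the installed-header include are illustrative only:

#include <limits.h>
#include <jemalloc/jemalloc.h>

/* Create an arena and make it prefer sbrk()-backed chunks. */
static unsigned
make_dss_arena(void)
{
	unsigned arena_ind;
	size_t sz = sizeof(arena_ind);
	size_t mib[3];
	size_t miblen = sizeof(mib) / sizeof(size_t);
	const char *dss = "primary";

	if (mallctl("arenas.extend", &arena_ind, &sz, NULL, 0) != 0)
		return (UINT_MAX);
	if (mallctlnametomib("arena.0.dss", mib, &miblen) != 0)
		return (UINT_MAX);
	mib[1] = arena_ind;	/* Point the mib at the new arena. */
	if (mallctlbymib(mib, miblen, NULL, NULL, (void *)&dss,
	    sizeof(const char *)) != 0)
		return (UINT_MAX);
	return (arena_ind);
}
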
--- a/memory/jemalloc/src/src/huge.c
+++ b/memory/jemalloc/src/src/huge.c
@@ -43,17 +43,18 @@ huge_palloc(size_t size, size_t alignmen
 	if (node == NULL)
 		return (NULL);
 
 	/*
 	 * Copy zero into is_zeroed and pass the copy to chunk_alloc(), so that
 	 * it is possible to make correct junk/zero fill decisions below.
 	 */
 	is_zeroed = zero;
-	ret = chunk_alloc(csize, alignment, false, &is_zeroed);
+	ret = chunk_alloc(csize, alignment, false, &is_zeroed,
+	    chunk_dss_prec_get());
 	if (ret == NULL) {
 		base_node_dealloc(node);
 		return (NULL);
 	}
 
 	/* Insert node into huge. */
 	node->addr = ret;
 	node->size = csize;
@@ -96,17 +97,17 @@ huge_ralloc_no_move(void *ptr, size_t ol
 	}
 
 	/* Reallocation would require a move. */
 	return (NULL);
 }
 
 void *
 huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
-    size_t alignment, bool zero)
+    size_t alignment, bool zero, bool try_tcache_dalloc)
 {
 	void *ret;
 	size_t copysize;
 
 	/* Try to avoid moving the allocation. */
 	ret = huge_ralloc_no_move(ptr, oldsize, size, extra);
 	if (ret != NULL)
 		return (ret);
@@ -175,17 +176,17 @@ huge_ralloc(void *ptr, size_t oldsize, s
 				abort();
 			memcpy(ret, ptr, copysize);
 			chunk_dealloc_mmap(ptr, oldsize);
 		}
 	} else
 #endif
 	{
 		memcpy(ret, ptr, copysize);
-		iqalloc(ptr);
+		iqallocx(ptr, try_tcache_dalloc);
 	}
 	return (ret);
 }
 
 void
 huge_dalloc(void *ptr, bool unmap)
 {
 	extent_node_t *node, key;
--- a/memory/jemalloc/src/src/jemalloc.c
+++ b/memory/jemalloc/src/src/jemalloc.c
@@ -28,17 +28,18 @@ bool	opt_valgrind = false;
 bool	opt_xmalloc = false;
 bool	opt_zero = false;
 size_t	opt_narenas = 0;
 
 unsigned	ncpus;
 
 malloc_mutex_t		arenas_lock;
 arena_t			**arenas;
-unsigned		narenas;
+unsigned		narenas_total;
+unsigned		narenas_auto;
 
 /* Set to true once the allocator has been initialized. */
 static bool		malloc_initialized = false;
 
 #ifdef JEMALLOC_THREADED_INIT
 /* Used to let the initializing thread recursively allocate. */
 #  define NO_INITIALIZER	((unsigned long)0)
 #  define INITIALIZER		pthread_self()
@@ -139,47 +140,48 @@ arenas_extend(unsigned ind)
 }
 
 /* Slow path, called only by choose_arena(). */
 arena_t *
 choose_arena_hard(void)
 {
 	arena_t *ret;
 
-	if (narenas > 1) {
+	if (narenas_auto > 1) {
 		unsigned i, choose, first_null;
 
 		choose = 0;
-		first_null = narenas;
+		first_null = narenas_auto;
 		malloc_mutex_lock(&arenas_lock);
 		assert(arenas[0] != NULL);
-		for (i = 1; i < narenas; i++) {
+		for (i = 1; i < narenas_auto; i++) {
 			if (arenas[i] != NULL) {
 				/*
 				 * Choose the first arena that has the lowest
 				 * number of threads assigned to it.
 				 */
 				if (arenas[i]->nthreads <
 				    arenas[choose]->nthreads)
 					choose = i;
-			} else if (first_null == narenas) {
+			} else if (first_null == narenas_auto) {
 				/*
 				 * Record the index of the first uninitialized
 				 * arena, in case all extant arenas are in use.
 				 *
 				 * NB: It is possible for there to be
 				 * discontinuities in terms of initialized
 				 * versus uninitialized arenas, due to the
 				 * "thread.arena" mallctl.
 				 */
 				first_null = i;
 			}
 		}
 
-		if (arenas[choose]->nthreads == 0 || first_null == narenas) {
+		if (arenas[choose]->nthreads == 0
+		    || first_null == narenas_auto) {
 			/*
 			 * Use an unloaded arena, or the least loaded arena if
 			 * all arenas are already initialized.
 			 */
 			ret = arenas[choose];
 		} else {
 			/* Initialize a new arena. */
 			ret = arenas_extend(first_null);
@@ -198,26 +200,26 @@ choose_arena_hard(void)
 	return (ret);
 }
 
 static void
 stats_print_atexit(void)
 {
 
 	if (config_tcache && config_stats) {
-		unsigned i;
+		unsigned narenas, i;
 
 		/*
 		 * Merge stats from extant threads.  This is racy, since
 		 * individual threads do not lock when recording tcache stats
 		 * events.  As a consequence, the final stats may be slightly
 		 * out of date by the time they are reported, if other threads
 		 * continue to allocate.
 		 */
-		for (i = 0; i < narenas; i++) {
+		for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
 			arena_t *arena = arenas[i];
 			if (arena != NULL) {
 				tcache_t *tcache;
 
 				/*
 				 * tcache_stats_merge() locks bins, so if any
 				 * code is introduced that acquires both arena
 				 * and bin locks in the opposite order,
@@ -549,16 +551,40 @@ malloc_conf_init(void)
 			 * Chunks always require at least one header page, plus
 			 * one data page in the absence of redzones, or three
 			 * pages in the presence of redzones.  In order to
 			 * simplify options processing, fix the limit based on
 			 * config_fill.
 			 */
 			CONF_HANDLE_SIZE_T(opt_lg_chunk, "lg_chunk", LG_PAGE +
 			    (config_fill ? 2 : 1), (sizeof(size_t) << 3) - 1)
+			if (strncmp("dss", k, klen) == 0) {
+				int i;
+				bool match = false;
+				for (i = 0; i < dss_prec_limit; i++) {
+					if (strncmp(dss_prec_names[i], v, vlen)
+					    == 0) {
+						if (chunk_dss_prec_set(i)) {
+							malloc_conf_error(
+							    "Error setting dss",
+							    k, klen, v, vlen);
+						} else {
+							opt_dss =
+							    dss_prec_names[i];
+							match = true;
+							break;
+						}
+					}
+				}
+				if (match == false) {
+					malloc_conf_error("Invalid conf value",
+					    k, klen, v, vlen);
+				}
+				continue;
+			}
 			CONF_HANDLE_SIZE_T(opt_narenas, "narenas", 1,
 			    SIZE_T_MAX)
 			CONF_HANDLE_SSIZE_T(opt_lg_dirty_mult, "lg_dirty_mult",
 			    -1, (sizeof(size_t) << 3) - 1)
 			CONF_HANDLE_BOOL(opt_stats_print, "stats_print")
 			if (config_fill) {
 				CONF_HANDLE_BOOL(opt_junk, "junk")
 				CONF_HANDLE_SIZE_T(opt_quarantine, "quarantine",
@@ -694,19 +720,19 @@ malloc_init_hard(void)
 
 	if (malloc_mutex_init(&arenas_lock))
 		return (true);
 
 	/*
 	 * Create enough scaffolding to allow recursive allocation in
 	 * malloc_ncpus().
 	 */
-	narenas = 1;
+	narenas_total = narenas_auto = 1;
 	arenas = init_arenas;
-	memset(arenas, 0, sizeof(arena_t *) * narenas);
+	memset(arenas, 0, sizeof(arena_t *) * narenas_auto);
 
 	/*
 	 * Initialize one arena here.  The rest are lazily created in
 	 * choose_arena_hard().
 	 */
 	arenas_extend(0);
 	if (arenas[0] == NULL) {
 		malloc_mutex_unlock(&init_lock);
@@ -754,39 +780,40 @@ malloc_init_hard(void)
 		 * For SMP systems, create more than one arena per CPU by
 		 * default.
 		 */
 		if (ncpus > 1)
 			opt_narenas = ncpus << 2;
 		else
 			opt_narenas = 1;
 	}
-	narenas = opt_narenas;
+	narenas_auto = opt_narenas;
 	/*
 	 * Make sure that the arenas array can be allocated.  In practice, this
 	 * limit is enough to allow the allocator to function, but the ctl
 	 * machinery will fail to allocate memory at far lower limits.
 	 */
-	if (narenas > chunksize / sizeof(arena_t *)) {
-		narenas = chunksize / sizeof(arena_t *);
+	if (narenas_auto > chunksize / sizeof(arena_t *)) {
+		narenas_auto = chunksize / sizeof(arena_t *);
 		malloc_printf("<jemalloc>: Reducing narenas to limit (%d)\n",
-		    narenas);
+		    narenas_auto);
 	}
+	narenas_total = narenas_auto;
 
 	/* Allocate and initialize arenas. */
-	arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas);
+	arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas_total);
 	if (arenas == NULL) {
 		malloc_mutex_unlock(&init_lock);
 		return (true);
 	}
 	/*
 	 * Zero the array.  In practice, this should always be pre-zeroed,
 	 * since it was just mmap()ed, but let's be sure.
 	 */
-	memset(arenas, 0, sizeof(arena_t *) * narenas);
+	memset(arenas, 0, sizeof(arena_t *) * narenas_total);
 	/* Copy the pointer to the one arena that was already initialized. */
 	arenas[0] = init_arenas[0];
 
 	malloc_initialized = true;
 	malloc_mutex_unlock(&init_lock);
 	return (false);
 }
 
@@ -1341,73 +1368,86 @@ je_mallctlbymib(const size_t *mib, size_
  */
 /******************************************************************************/
 /*
  * Begin experimental functions.
  */
 #ifdef JEMALLOC_EXPERIMENTAL
 
 JEMALLOC_INLINE void *
-iallocm(size_t usize, size_t alignment, bool zero)
+iallocm(size_t usize, size_t alignment, bool zero, bool try_tcache,
+    arena_t *arena)
 {
 
 	assert(usize == ((alignment == 0) ? s2u(usize) : sa2u(usize,
 	    alignment)));
 
 	if (alignment != 0)
-		return (ipalloc(usize, alignment, zero));
+		return (ipallocx(usize, alignment, zero, try_tcache, arena));
 	else if (zero)
-		return (icalloc(usize));
+		return (icallocx(usize, try_tcache, arena));
 	else
-		return (imalloc(usize));
+		return (imallocx(usize, try_tcache, arena));
 }
 
 int
 je_allocm(void **ptr, size_t *rsize, size_t size, int flags)
 {
 	void *p;
 	size_t usize;
 	size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK)
 	    & (SIZE_T_MAX-1));
 	bool zero = flags & ALLOCM_ZERO;
+	unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
+	arena_t *arena;
+	bool try_tcache;
 
 	assert(ptr != NULL);
 	assert(size != 0);
 
 	if (malloc_init())
 		goto label_oom;
 
+	if (arena_ind != UINT_MAX) {
+		arena = arenas[arena_ind];
+		try_tcache = false;
+	} else {
+		arena = NULL;
+		try_tcache = true;
+	}
+
 	usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment);
 	if (usize == 0)
 		goto label_oom;
 
 	if (config_prof && opt_prof) {
 		prof_thr_cnt_t *cnt;
 
 		PROF_ALLOC_PREP(1, usize, cnt);
 		if (cnt == NULL)
 			goto label_oom;
 		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <=
 		    SMALL_MAXCLASS) {
 			size_t usize_promoted = (alignment == 0) ?
 			    s2u(SMALL_MAXCLASS+1) : sa2u(SMALL_MAXCLASS+1,
 			    alignment);
 			assert(usize_promoted != 0);
-			p = iallocm(usize_promoted, alignment, zero);
+			p = iallocm(usize_promoted, alignment, zero,
+			    try_tcache, arena);
 			if (p == NULL)
 				goto label_oom;
 			arena_prof_promoted(p, usize);
 		} else {
-			p = iallocm(usize, alignment, zero);
+			p = iallocm(usize, alignment, zero, try_tcache, arena);
 			if (p == NULL)
 				goto label_oom;
 		}
 		prof_malloc(p, usize, cnt);
 	} else {
-		p = iallocm(usize, alignment, zero);
+		p = iallocm(usize, alignment, zero, try_tcache, arena);
 		if (p == NULL)
 			goto label_oom;
 	}
 	if (rsize != NULL)
 		*rsize = usize;
 
 	*ptr = p;
 	if (config_stats) {
@@ -1434,23 +1474,39 @@ je_rallocm(void **ptr, size_t *rsize, si
 	void *p, *q;
 	size_t usize;
 	size_t old_size;
 	size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0);
 	size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK)
 	    & (SIZE_T_MAX-1));
 	bool zero = flags & ALLOCM_ZERO;
 	bool no_move = flags & ALLOCM_NO_MOVE;
+	unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
+	bool try_tcache_alloc, try_tcache_dalloc;
+	arena_t *arena;
 
 	assert(ptr != NULL);
 	assert(*ptr != NULL);
 	assert(size != 0);
 	assert(SIZE_T_MAX - size >= extra);
 	assert(malloc_initialized || IS_INITIALIZER);
 
+	if (arena_ind != UINT_MAX) {
+		arena_chunk_t *chunk;
+		try_tcache_alloc = true;
+		chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(*ptr);
+		try_tcache_dalloc = (chunk == *ptr || chunk->arena !=
+		    arenas[arena_ind]);
+		arena = arenas[arena_ind];
+	} else {
+		try_tcache_alloc = true;
+		try_tcache_dalloc = true;
+		arena = NULL;
+	}
+
 	p = *ptr;
 	if (config_prof && opt_prof) {
 		prof_thr_cnt_t *cnt;
 
 		/*
 		 * usize isn't knowable before iralloc() returns when extra is
 		 * non-zero.  Therefore, compute its maximum possible value and
 		 * use that in PROF_ALLOC_PREP() to decide whether to capture a
@@ -1467,45 +1523,48 @@ je_rallocm(void **ptr, size_t *rsize, si
 		if (cnt == NULL)
 			goto label_oom;
 		/*
 		 * Use minimum usize to determine whether promotion may happen.
 		 */
 		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U
 		    && ((alignment == 0) ? s2u(size) : sa2u(size, alignment))
 		    <= SMALL_MAXCLASS) {
-			q = iralloc(p, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >=
+			q = irallocx(p, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >=
 			    size+extra) ? 0 : size+extra - (SMALL_MAXCLASS+1),
-			    alignment, zero, no_move);
+			    alignment, zero, no_move, try_tcache_alloc,
+			    try_tcache_dalloc, arena);
 			if (q == NULL)
 				goto label_err;
 			if (max_usize < PAGE) {
 				usize = max_usize;
 				arena_prof_promoted(q, usize);
 			} else
 				usize = isalloc(q, config_prof);
 		} else {
-			q = iralloc(p, size, extra, alignment, zero, no_move);
+			q = irallocx(p, size, extra, alignment, zero, no_move,
+			    try_tcache_alloc, try_tcache_dalloc, arena);
 			if (q == NULL)
 				goto label_err;
 			usize = isalloc(q, config_prof);
 		}
 		prof_realloc(q, usize, cnt, old_size, old_ctx);
 		if (rsize != NULL)
 			*rsize = usize;
 	} else {
 		if (config_stats) {
 			old_size = isalloc(p, false);
 			if (config_valgrind && opt_valgrind)
 				old_rzsize = u2rz(old_size);
 		} else if (config_valgrind && opt_valgrind) {
 			old_size = isalloc(p, false);
 			old_rzsize = u2rz(old_size);
 		}
-		q = iralloc(p, size, extra, alignment, zero, no_move);
+		q = irallocx(p, size, extra, alignment, zero, no_move,
+		    try_tcache_alloc, try_tcache_dalloc, arena);
 		if (q == NULL)
 			goto label_err;
 		if (config_stats)
 			usize = isalloc(q, config_prof);
 		if (rsize != NULL) {
 			if (config_stats == false)
 				usize = isalloc(q, config_prof);
 			*rsize = usize;
@@ -1556,33 +1615,42 @@ je_sallocm(const void *ptr, size_t *rsiz
 	return (ALLOCM_SUCCESS);
 }
 
 int
 je_dallocm(void *ptr, int flags)
 {
 	size_t usize;
 	size_t rzsize JEMALLOC_CC_SILENCE_INIT(0);
+	unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
+	bool try_tcache;
 
 	assert(ptr != NULL);
 	assert(malloc_initialized || IS_INITIALIZER);
 
+	if (arena_ind != UINT_MAX) {
+		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+		try_tcache = (chunk == ptr || chunk->arena !=
+		    arenas[arena_ind]);
+	} else
+		try_tcache = true;
+
 	UTRACE(ptr, 0, 0);
 	if (config_stats || config_valgrind)
 		usize = isalloc(ptr, config_prof);
 	if (config_prof && opt_prof) {
 		if (config_stats == false && config_valgrind == false)
 			usize = isalloc(ptr, config_prof);
 		prof_free(ptr, usize);
 	}
 	if (config_stats)
 		thread_allocated_tsd_get()->deallocated += usize;
 	if (config_valgrind && opt_valgrind)
 		rzsize = p2rz(ptr);
-	iqalloc(ptr);
+	iqallocx(ptr, try_tcache);
 	JEMALLOC_VALGRIND_FREE(ptr, rzsize);
 
 	return (ALLOCM_SUCCESS);
 }
 
 int
 je_nallocm(size_t *rsize, size_t size, int flags)
 {
@@ -1649,17 +1717,17 @@ JEMALLOC_EXPORT void
 	if (malloc_initialized == false)
 		return;
 #endif
 	assert(malloc_initialized);
 
 	/* Acquire all mutexes in a safe order. */
 	ctl_prefork();
 	malloc_mutex_prefork(&arenas_lock);
-	for (i = 0; i < narenas; i++) {
+	for (i = 0; i < narenas_total; i++) {
 		if (arenas[i] != NULL)
 			arena_prefork(arenas[i]);
 	}
 	prof_prefork();
 	chunk_prefork();
 	base_prefork();
 	huge_prefork();
 }
@@ -1680,17 +1748,17 @@ JEMALLOC_EXPORT void
 #endif
 	assert(malloc_initialized);
 
 	/* Release all mutexes, now that fork() has completed. */
 	huge_postfork_parent();
 	base_postfork_parent();
 	chunk_postfork_parent();
 	prof_postfork_parent();
-	for (i = 0; i < narenas; i++) {
+	for (i = 0; i < narenas_total; i++) {
 		if (arenas[i] != NULL)
 			arena_postfork_parent(arenas[i]);
 	}
 	malloc_mutex_postfork_parent(&arenas_lock);
 	ctl_postfork_parent();
 }
 
 void
@@ -1700,17 +1768,17 @@ jemalloc_postfork_child(void)
 
 	assert(malloc_initialized);
 
 	/* Release all mutexes, now that fork() has completed. */
 	huge_postfork_child();
 	base_postfork_child();
 	chunk_postfork_child();
 	prof_postfork_child();
-	for (i = 0; i < narenas; i++) {
+	for (i = 0; i < narenas_total; i++) {
 		if (arenas[i] != NULL)
 			arena_postfork_child(arenas[i]);
 	}
 	malloc_mutex_postfork_child(&arenas_lock);
 	ctl_postfork_child();
 }
 
 /******************************************************************************/
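
In the experimental-API hunks above, je_allocm()/je_rallocm()/je_dallocm() recover the arena index as ((unsigned)(flags >> 8)) - 1, so an empty field keeps the thread cache in play, while naming an arena bypasses the tcache (or consults it on the free path only when the chunk being freed does not belong to that arena). The "dss" handler added to malloc_conf_init() accepts the same strings as dss_prec_names[], e.g. MALLOC_CONF="dss:secondary" at startup. A hedged caller-side sketch, assuming the experimental API is built in and that ALLOCM_ARENA() packs arena_ind + 1 into bits 8 and up, matching the decode above:

#include <stddef.h>
#include <jemalloc/jemalloc.h>

/* Allocate 4 KiB from a specific arena; arena_ind is assumed to come from
 * "arenas.extend" (see the ctl sketch earlier in this patch). */
static void *
alloc_from_arena(unsigned arena_ind)
{
	void *p;
	size_t rsz;

	if (allocm(&p, &rsz, 4096, ALLOCM_ARENA(arena_ind)) != ALLOCM_SUCCESS)
		return (NULL);
	/*
	 * Pass the same flag to dallocm() later so the matching tcache
	 * policy is applied on the free path.
	 */
	return (p);
}
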
--- a/memory/jemalloc/src/src/prof.c
+++ b/memory/jemalloc/src/src/prof.c
@@ -21,17 +21,17 @@ bool		opt_prof_active = true;
 size_t		opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT;
 ssize_t		opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT;
 bool		opt_prof_gdump = false;
 bool		opt_prof_final = true;
 bool		opt_prof_leak = false;
 bool		opt_prof_accum = false;
 char		opt_prof_prefix[PATH_MAX + 1];
 
-uint64_t	prof_interval;
+uint64_t	prof_interval = 0;
 bool		prof_promote;
 
 /*
  * Table of mutexes that are shared among ctx's.  These are leaf locks, so
  * there is no problem with using them for more than one ctx at the same time.
  * The primary motivation for this sharing though is that ctx's are ephemeral,
  * and destroying mutexes causes complications for systems that allocate when
  * creating/destroying mutexes.
@@ -1201,23 +1201,21 @@ prof_boot1(void)
 
 	if (opt_prof_leak && opt_prof == false) {
 		/*
 		 * Enable opt_prof, but in such a way that profiles are never
 		 * automatically dumped.
 		 */
 		opt_prof = true;
 		opt_prof_gdump = false;
-		prof_interval = 0;
 	} else if (opt_prof) {
 		if (opt_lg_prof_interval >= 0) {
 			prof_interval = (((uint64_t)1U) <<
 			    opt_lg_prof_interval);
-		} else
-			prof_interval = 0;
+		}
 	}
 
 	prof_promote = (opt_prof && opt_lg_prof_sample > LG_PAGE);
 }
 
 bool
 prof_boot2(void)
 {
--- a/memory/jemalloc/src/src/stats.c
+++ b/memory/jemalloc/src/src/stats.c
@@ -201,28 +201,32 @@ stats_arena_lruns_print(void (*write_cb)
 		malloc_cprintf(write_cb, cbopaque, "[%zu]\n", j - gap_start);
 }
 
 static void
 stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
     unsigned i, bool bins, bool large)
 {
 	unsigned nthreads;
+	const char *dss;
 	size_t page, pactive, pdirty, mapped;
 	uint64_t npurge, nmadvise, purged;
 	size_t small_allocated;
 	uint64_t small_nmalloc, small_ndalloc, small_nrequests;
 	size_t large_allocated;
 	uint64_t large_nmalloc, large_ndalloc, large_nrequests;
 
 	CTL_GET("arenas.page", &page, size_t);
 
 	CTL_I_GET("stats.arenas.0.nthreads", &nthreads, unsigned);
 	malloc_cprintf(write_cb, cbopaque,
 	    "assigned threads: %u\n", nthreads);
+	CTL_I_GET("stats.arenas.0.dss", &dss, const char *);
+	malloc_cprintf(write_cb, cbopaque, "dss allocation precedence: %s\n",
+	    dss);
 	CTL_I_GET("stats.arenas.0.pactive", &pactive, size_t);
 	CTL_I_GET("stats.arenas.0.pdirty", &pdirty, size_t);
 	CTL_I_GET("stats.arenas.0.npurge", &npurge, uint64_t);
 	CTL_I_GET("stats.arenas.0.nmadvise", &nmadvise, uint64_t);
 	CTL_I_GET("stats.arenas.0.purged", &purged, uint64_t);
 	malloc_cprintf(write_cb, cbopaque,
 	    "dirty pages: %zu:%zu active:dirty, %"PRIu64" sweep%s,"
 	    " %"PRIu64" madvise%s, %"PRIu64" purged\n",
@@ -365,16 +369,17 @@ stats_print(void (*write_cb)(void *, con
 			malloc_cprintf(write_cb, cbopaque,		\
 			    "  opt."#n": \"%s\"\n", cpv);		\
 		}
 
 		malloc_cprintf(write_cb, cbopaque,
 		    "Run-time option settings:\n");
 		OPT_WRITE_BOOL(abort)
 		OPT_WRITE_SIZE_T(lg_chunk)
+		OPT_WRITE_CHAR_P(dss)
 		OPT_WRITE_SIZE_T(narenas)
 		OPT_WRITE_SSIZE_T(lg_dirty_mult)
 		OPT_WRITE_BOOL(stats_print)
 		OPT_WRITE_BOOL(junk)
 		OPT_WRITE_SIZE_T(quarantine)
 		OPT_WRITE_BOOL(redzone)
 		OPT_WRITE_BOOL(zero)
 		OPT_WRITE_BOOL(utrace)
@@ -395,17 +400,17 @@ stats_print(void (*write_cb)(void *, con
 #undef OPT_WRITE_BOOL
 #undef OPT_WRITE_SIZE_T
 #undef OPT_WRITE_SSIZE_T
 #undef OPT_WRITE_CHAR_P
 
 		malloc_cprintf(write_cb, cbopaque, "CPUs: %u\n", ncpus);
 
 		CTL_GET("arenas.narenas", &uv, unsigned);
-		malloc_cprintf(write_cb, cbopaque, "Max arenas: %u\n", uv);
+		malloc_cprintf(write_cb, cbopaque, "Arenas: %u\n", uv);
 
 		malloc_cprintf(write_cb, cbopaque, "Pointer size: %zu\n",
 		    sizeof(void *));
 
 		CTL_GET("arenas.quantum", &sv, size_t);
 		malloc_cprintf(write_cb, cbopaque, "Quantum size: %zu\n", sv);
 
 		CTL_GET("arenas.page", &sv, size_t);
@@ -467,17 +472,18 @@ stats_print(void (*write_cb)(void *, con
 		    "Current active ceiling: %zu\n", atomic_read_z(cactive));
 
 		/* Print chunk stats. */
 		CTL_GET("stats.chunks.total", &chunks_total, uint64_t);
 		CTL_GET("stats.chunks.high", &chunks_high, size_t);
 		CTL_GET("stats.chunks.current", &chunks_current, size_t);
 		malloc_cprintf(write_cb, cbopaque, "chunks: nchunks   "
 		    "highchunks    curchunks\n");
-		malloc_cprintf(write_cb, cbopaque, "  %13"PRIu64"%13zu%13zu\n",
+		malloc_cprintf(write_cb, cbopaque,
+		    "  %13"PRIu64" %12zu %12zu\n",
 		    chunks_total, chunks_high, chunks_current);
 
 		/* Print huge stats. */
 		CTL_GET("stats.huge.nmalloc", &huge_nmalloc, uint64_t);
 		CTL_GET("stats.huge.ndalloc", &huge_ndalloc, uint64_t);
 		CTL_GET("stats.huge.allocated", &huge_allocated, size_t);
 		malloc_cprintf(write_cb, cbopaque,
 		    "huge: nmalloc      ndalloc    allocated\n");
--- a/memory/jemalloc/src/src/tcache.c
+++ b/memory/jemalloc/src/src/tcache.c
@@ -92,19 +92,17 @@ tcache_bin_flush_small(tcache_bin_t *tbi
 	for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
 		/* Lock the arena bin associated with the first object. */
 		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
 		    tbin->avail[0]);
 		arena_t *arena = chunk->arena;
 		arena_bin_t *bin = &arena->bins[binind];
 
 		if (config_prof && arena == tcache->arena) {
-			malloc_mutex_lock(&arena->lock);
 			arena_prof_accum(arena, tcache->prof_accumbytes);
-			malloc_mutex_unlock(&arena->lock);
 			tcache->prof_accumbytes = 0;
 		}
 
 		malloc_mutex_lock(&bin->lock);
 		if (config_stats && arena == tcache->arena) {
 			assert(merged_stats == false);
 			merged_stats = true;
 			bin->stats.nflushes++;
@@ -175,17 +173,17 @@ tcache_bin_flush_large(tcache_bin_t *tbi
 		/* Lock the arena associated with the first object. */
 		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
 		    tbin->avail[0]);
 		arena_t *arena = chunk->arena;
 
 		malloc_mutex_lock(&arena->lock);
 		if ((config_prof || config_stats) && arena == tcache->arena) {
 			if (config_prof) {
-				arena_prof_accum(arena,
+				arena_prof_accum_locked(arena,
 				    tcache->prof_accumbytes);
 				tcache->prof_accumbytes = 0;
 			}
 			if (config_stats) {
 				merged_stats = true;
 				arena->stats.nrequests_large +=
 				    tbin->tstats.nrequests;
 				arena->stats.lstats[binind - NBINS].nrequests +=
@@ -283,17 +281,17 @@ tcache_create(arena_t *arena)
 	 */
 	size = (size + CACHELINE_MASK) & (-CACHELINE);
 
 	if (size <= SMALL_MAXCLASS)
 		tcache = (tcache_t *)arena_malloc_small(arena, size, true);
 	else if (size <= tcache_maxclass)
 		tcache = (tcache_t *)arena_malloc_large(arena, size, true);
 	else
-		tcache = (tcache_t *)icalloc(size);
+		tcache = (tcache_t *)icallocx(size, false, arena);
 
 	if (tcache == NULL)
 		return (NULL);
 
 	tcache_arena_associate(tcache, arena);
 
 	assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
 	for (i = 0; i < nhbins; i++) {
@@ -338,38 +336,35 @@ tcache_destroy(tcache_t *tcache)
 			malloc_mutex_lock(&arena->lock);
 			arena->stats.nrequests_large += tbin->tstats.nrequests;
 			arena->stats.lstats[i - NBINS].nrequests +=
 			    tbin->tstats.nrequests;
 			malloc_mutex_unlock(&arena->lock);
 		}
 	}
 
-	if (config_prof && tcache->prof_accumbytes > 0) {
-		malloc_mutex_lock(&tcache->arena->lock);
+	if (config_prof && tcache->prof_accumbytes > 0)
 		arena_prof_accum(tcache->arena, tcache->prof_accumbytes);
-		malloc_mutex_unlock(&tcache->arena->lock);
-	}
 
 	tcache_size = arena_salloc(tcache, false);
 	if (tcache_size <= SMALL_MAXCLASS) {
 		arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
 		arena_t *arena = chunk->arena;
 		size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >>
 		    LG_PAGE;
 		arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind);
 
 		arena_dalloc_bin(arena, chunk, tcache, pageind, mapelm);
 	} else if (tcache_size <= tcache_maxclass) {
 		arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
 		arena_t *arena = chunk->arena;
 
 		arena_dalloc_large(arena, chunk, tcache);
 	} else
-		idalloc(tcache);
+		idallocx(tcache, false);
 }
 
 void
 tcache_thread_cleanup(void *arg)
 {
 	tcache_t *tcache = *(tcache_t **)arg;
 
 	if (tcache == TCACHE_STATE_DISABLED) {
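
The tcache hunks above drop the explicit arena->lock round-trips around arena_prof_accum() on the small-bin flush and tcache_destroy() paths, and switch the large-bin flush, which already holds arena->lock, to arena_prof_accum_locked(). Those arena functions are defined elsewhere; the pattern itself is the usual locked/unlocked pairing, sketched generically below with invented names:

#include <pthread.h>
#include <stdint.h>

/*
 * Illustration only (not jemalloc code): the "_locked" variant assumes the
 * caller already owns the mutex; the plain variant takes it itself.
 */
typedef struct {
	pthread_mutex_t	lock;
	uint64_t	accum;
} prof_counter_t;

static void
prof_counter_add_locked(prof_counter_t *c, uint64_t n)
{
	/* Caller must hold c->lock. */
	c->accum += n;
}

static void
prof_counter_add(prof_counter_t *c, uint64_t n)
{
	pthread_mutex_lock(&c->lock);
	prof_counter_add_locked(c, n);
	pthread_mutex_unlock(&c->lock);
}
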
--- a/memory/jemalloc/src/src/zone.c
+++ b/memory/jemalloc/src/src/zone.c
@@ -166,16 +166,26 @@ zone_force_unlock(malloc_zone_t *zone)
 		jemalloc_postfork_parent();
 }
 
 JEMALLOC_ATTR(constructor)
 void
 register_zone(void)
 {
 
+	/*
+	 * If something else replaced the system default zone allocator, don't
+	 * register jemalloc's.
+	 */
+	malloc_zone_t *default_zone = malloc_default_zone();
+	if (!default_zone->zone_name ||
+	    strcmp(default_zone->zone_name, "DefaultMallocZone") != 0) {
+		return;
+	}
+
 	zone.size = (void *)zone_size;
 	zone.malloc = (void *)zone_malloc;
 	zone.calloc = (void *)zone_calloc;
 	zone.valloc = (void *)zone_valloc;
 	zone.free = (void *)zone_free;
 	zone.realloc = (void *)zone_realloc;
 	zone.destroy = (void *)zone_destroy;
 	zone.zone_name = "jemalloc_zone";
@@ -236,13 +246,13 @@ register_zone(void)
 	 * Unregister and reregister the default zone.  On OSX >= 10.6,
 	 * unregistering takes the last registered zone and places it at the
 	 * location of the specified zone.  Unregistering the default zone thus
 	 * makes the last registered one the default.  On OSX < 10.6,
 	 * unregistering shifts all registered zones.  The first registered zone
 	 * then becomes the default.
 	 */
 	do {
-		malloc_zone_t *default_zone = malloc_default_zone();
+		default_zone = malloc_default_zone();
 		malloc_zone_unregister(default_zone);
 		malloc_zone_register(default_zone);
 	} while (malloc_default_zone() != &zone);
 }
new file mode 100644
--- /dev/null
+++ b/memory/jemalloc/src/test/ALLOCM_ARENA.c
@@ -0,0 +1,66 @@
+#define	JEMALLOC_MANGLE
+#include "jemalloc_test.h"
+
+#define	NTHREADS 10
+
+void *
+je_thread_start(void *arg)
+{
+	unsigned thread_ind = (unsigned)(uintptr_t)arg;
+	unsigned arena_ind;
+	int r;
+	void *p;
+	size_t rsz, sz;
+
+	sz = sizeof(arena_ind);
+	if (mallctl("arenas.extend", &arena_ind, &sz, NULL, 0)
+	    != 0) {
+		malloc_printf("Error in arenas.extend\n");
+		abort();
+	}
+
+	if (thread_ind % 4 != 3) {
+		size_t mib[3];
+		size_t miblen = sizeof(mib) / sizeof(size_t);
+		const char *dss_precs[] = {"disabled", "primary", "secondary"};
+		const char *dss = dss_precs[thread_ind % 4];
+		if (mallctlnametomib("arena.0.dss", mib, &miblen) != 0) {
+			malloc_printf("Error in mallctlnametomib()\n");
+			abort();
+		}
+		mib[1] = arena_ind;
+		if (mallctlbymib(mib, miblen, NULL, NULL, (void *)&dss,
+		    sizeof(const char *))) {
+			malloc_printf("Error in mallctlbymib()\n");
+			abort();
+		}
+	}
+
+	r = allocm(&p, &rsz, 1, ALLOCM_ARENA(arena_ind));
+	if (r != ALLOCM_SUCCESS) {
+		malloc_printf("Unexpected allocm() error\n");
+		abort();
+	}
+
+	return (NULL);
+}
+
+int
+main(void)
+{
+	je_thread_t threads[NTHREADS];
+	unsigned i;
+
+	malloc_printf("Test begin\n");
+
+	for (i = 0; i < NTHREADS; i++) {
+		je_thread_create(&threads[i], je_thread_start,
+		    (void *)(uintptr_t)i);
+	}
+
+	for (i = 0; i < NTHREADS; i++)
+		je_thread_join(threads[i], NULL);
+
+	malloc_printf("Test end\n");
+	return (0);
+}
new file mode 100644
--- /dev/null
+++ b/memory/jemalloc/src/test/ALLOCM_ARENA.exp
@@ -0,0 +1,2 @@
+Test begin
+Test end
--- a/memory/jemalloc/src/test/thread_arena.c
+++ b/memory/jemalloc/src/test/thread_arena.c
@@ -1,12 +1,12 @@
 #define	JEMALLOC_MANGLE
 #include "jemalloc_test.h"
 
-#define NTHREADS 10
+#define	NTHREADS 10
 
 void *
 je_thread_start(void *arg)
 {
 	unsigned main_arena_ind = *(unsigned *)arg;
 	void *p;
 	unsigned arena_ind;
 	size_t size;
@@ -61,18 +61,20 @@ main(void)
 	size = sizeof(arena_ind);
 	if ((err = mallctl("thread.arena", &arena_ind, &size, NULL, 0))) {
 		malloc_printf("%s(): Error in mallctl(): %s\n", __func__,
 		    strerror(err));
 		ret = 1;
 		goto label_return;
 	}
 
-	for (i = 0; i < NTHREADS; i++)
-		je_thread_create(&threads[i], je_thread_start, (void *)&arena_ind);
+	for (i = 0; i < NTHREADS; i++) {
+		je_thread_create(&threads[i], je_thread_start,
+		    (void *)&arena_ind);
+	}
 
 	for (i = 0; i < NTHREADS; i++)
 		je_thread_join(threads[i], (void *)&ret);
 
 label_return:
 	malloc_printf("Test end\n");
 	return (ret);
 }
--- a/memory/jemalloc/upstream.info
+++ b/memory/jemalloc/upstream.info
@@ -1,2 +1,2 @@
 UPSTREAM_REPO=git://canonware.com/jemalloc.git
-UPSTREAM_COMMIT=d0ffd8ed4f6aa4cf7248028eddfcb35f93247fe4
+UPSTREAM_COMMIT=6eb84fbe315add1e1d4f8deedc25d260fff3ae97