Bug 446096: Integrate upstream jemalloc optimizations, r=pavlov
author Jason Evans <jasone@canonware.com>
Fri, 25 Jul 2008 14:53:20 -0700
changeset 16218 d6bbea319540609e528d103a8826f027c07ef66b
parent 16217 dfaa3fa1c9b109576bbff59b6736f9a12d7e726f
child 16219 589f091246af30f83530b0cc2828cd834419341a
push id unknown
push user unknown
push date unknown
reviewers pavlov
bugs 446096
milestone 1.9.1a2pre
Bug 446096: Integrate upstream jemalloc optimizations, r=pavlov Enhance arena_chunk_map_t to directly support run coalescing, and use the chunk map instead of red-black trees where possible. Remove the red-black trees and node objects that are obsoleted by this change. The net result is a ~1-2% memory savings, and a substantial allocation speed improvement. Add a radix tree to optimize isalloc_validate().
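For readers skimming the patch, here is a minimal standalone sketch (not part of this changeset) of the encoding the new arena_chunk_map_t uses: each page's map entry packs the run size or address into the bits above pagesize_mask and the c/k/d/z/l/a flags into the low bits, which is what lets arena_run_dalloc() coalesce neighboring free runs by masking instead of consulting the old runs_avail_szad/runs_avail_ad extent trees. The CHUNK_MAP_* flag values mirror the patch; PAGESIZE_2POW, main(), and the example sizes are illustrative assumptions only.

/*
 * Standalone illustration only -- not part of the patch.  Assumes 4 kB
 * pages; flag values copied from the arena_chunk_map_s definition below.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define PAGESIZE_2POW        12                     /* assumed: 4 kB pages */
#define PAGESIZE_MASK        ((size_t)(1U << PAGESIZE_2POW) - 1)

#define CHUNK_MAP_KEY        ((size_t)0x10U)
#define CHUNK_MAP_DIRTY      ((size_t)0x08U)
#define CHUNK_MAP_ZEROED     ((size_t)0x04U)
#define CHUNK_MAP_LARGE      ((size_t)0x02U)
#define CHUNK_MAP_ALLOCATED  ((size_t)0x01U)

int
main(void)
{
    /* First page of a 32 kB large run: size in the high bits, flags low. */
    size_t bits = ((size_t)8 << PAGESIZE_2POW) | CHUNK_MAP_LARGE |
        CHUNK_MAP_ALLOCATED;

    /* Size and flags are recovered by masking, with no tree lookup. */
    assert((bits & ~PAGESIZE_MASK) == 32768);
    assert(bits & CHUNK_MAP_ALLOCATED);
    assert(bits & CHUNK_MAP_LARGE);
    assert((bits & CHUNK_MAP_DIRTY) == 0);

    /*
     * An unallocated run records its size in the first and last page's map
     * entries, so coalescing only needs to inspect map[run_ind - 1] and
     * map[run_ind + run_pages].
     */
    size_t free_bits = ((size_t)4 << PAGESIZE_2POW) | CHUNK_MAP_ZEROED;
    assert((free_bits & CHUNK_MAP_ALLOCATED) == 0);
    printf("free run spans %zu pages\n",
        (free_bits & ~PAGESIZE_MASK) >> PAGESIZE_2POW);

    return 0;
}

The same masking idiom recurs throughout the hunks below, for example chunk->map[pageind].bits & ~pagesize_mask in arena_salloc() and the first/last-page size updates in arena_run_dalloc().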
memory/jemalloc/jemalloc.c
--- a/memory/jemalloc/jemalloc.c
+++ b/memory/jemalloc/jemalloc.c
@@ -67,19 +67,19 @@
  *   |          |----------------+---------|
  *   |          | Sub-page       |    1 kB |
  *   |          |                |    2 kB |
  *   |=====================================|
  *   | Large                     |    4 kB |
  *   |                           |    8 kB |
  *   |                           |   12 kB |
  *   |                           |     ... |
- *   |                           | 1008 kB |
  *   |                           | 1012 kB |
  *   |                           | 1016 kB |
+ *   |                           | 1020 kB |
  *   |=====================================|
  *   | Huge                      |    1 MB |
  *   |                           |    2 MB |
  *   |                           |    3 MB |
  *   |                           |     ... |
  *   |=====================================|
  *
  * A different mechanism is used for each category:
@@ -261,17 +261,17 @@ typedef long ssize_t;
 #ifndef MOZ_MEMORY_WINDOWS
 #ifndef MOZ_MEMORY_SOLARIS
 #include <sys/cdefs.h>
 #endif
 #ifndef __DECONST
 #  define __DECONST(type, var)	((type)(uintptr_t)(const void *)(var))
 #endif
 #ifndef MOZ_MEMORY
-__FBSDID("$FreeBSD: head/lib/libc/stdlib/malloc.c 179704 2008-06-10 15:46:18Z jasone $");
+__FBSDID("$FreeBSD: head/lib/libc/stdlib/malloc.c 180599 2008-07-18 19:35:44Z jasone $");
 #include "libc_private.h"
 #ifdef MALLOC_DEBUG
 #  define _LOCK_DEBUG
 #endif
 #include "spinlock.h"
 #include "namespace.h"
 #endif
 #include <sys/mman.h>
@@ -404,16 +404,21 @@ static const bool __isthreaded = true;
 #  define SIZEOF_PTR_2POW	3
 #  define CPU_SPINWAIT		__asm__ volatile("pause")
 #endif
 #ifdef __arm__
 #  define QUANTUM_2POW_MIN	3
 #  define SIZEOF_PTR_2POW	2
 #  define NO_TLS
 #endif
+#ifdef __mips__
+#  define QUANTUM_2POW_MIN	3
+#  define SIZEOF_PTR_2POW	2
+#  define NO_TLS
+#endif
 #ifdef __powerpc__
 #  define QUANTUM_2POW_MIN	4
 #  define SIZEOF_PTR_2POW	2
 #endif
 #endif
 
 #define	SIZEOF_PTR		(1U << SIZEOF_PTR_2POW)
 
@@ -490,21 +495,17 @@ static const bool __isthreaded = true;
  *
  *   (RUN_MAX_OVRHD / (reg_size << (3+RUN_BFP))
  */
 #define	RUN_BFP			12
 /*                                    \/   Implicit binary fixed point. */
 #define	RUN_MAX_OVRHD		0x0000003dU
 #define	RUN_MAX_OVRHD_RELAX	0x00001800U
 
-/*
- * Put a cap on small object run size.  This overrides RUN_MAX_OVRHD.  Note
- * that small runs must be small enough that page offsets can fit within the
- * CHUNK_MAP_POS_MASK bits.
- */
+/* Put a cap on small object run size.  This overrides RUN_MAX_OVRHD. */
 #define	RUN_MAX_SMALL_2POW	15
 #define	RUN_MAX_SMALL		(1U << RUN_MAX_SMALL_2POW)
 
 /*
  * Hyper-threaded CPUs may need a special instruction inside spin loops in
  * order to yield to another virtual CPU.  If no such instruction is defined
  * above, make CPU_SPINWAIT a no-op.
  */
@@ -693,16 +694,41 @@ struct extent_node_s {
 
 	/* Total region size. */
 	size_t	size;
 };
 typedef rb_tree(extent_node_t) extent_tree_t;
 
 /******************************************************************************/
 /*
+ * Radix tree data structures.
+ */
+
+#ifdef MALLOC_VALIDATE
+   /*
+    * Size of each radix tree node (must be a power of 2).  This impacts tree
+    * depth.
+    */
+#  if (SIZEOF_PTR == 4)
+#    define MALLOC_RTREE_NODESIZE (1U << 14)
+#  else
+#    define MALLOC_RTREE_NODESIZE CACHELINE
+#  endif
+
+typedef struct malloc_rtree_s malloc_rtree_t;
+struct malloc_rtree_s {
+	malloc_spinlock_t	lock;
+	void			**root;
+	unsigned		height;
+	unsigned		level2bits[1]; /* Dynamically sized. */
+};
+#endif
+
+/******************************************************************************/
+/*
  * Reserve data structures.
  */
 
 /* Callback registration. */
 typedef struct reserve_reg_s reserve_reg_t;
 struct reserve_reg_s {
 	/* Linkage for list of all registered callbacks. */
 	ql_elm(reserve_reg_t)	link;
@@ -723,73 +749,101 @@ struct reserve_reg_s {
 /******************************************************************************/
 /*
  * Arena data structures.
  */
 
 typedef struct arena_s arena_t;
 typedef struct arena_bin_s arena_bin_t;
 
-/*
- * Each map element contains several flags, plus page position for runs that
- * service small allocations.
- */
-typedef uint8_t arena_chunk_map_t;
-#define	CHUNK_MAP_UNTOUCHED	0x80U
-#define	CHUNK_MAP_DIRTY		0x40U
-#define	CHUNK_MAP_LARGE		0x20U
+/* Each element of the chunk map corresponds to one page within the chunk. */
+typedef struct arena_chunk_map_s arena_chunk_map_t;
+struct arena_chunk_map_s {
+	/*
+	 * Linkage for run trees.  There are two disjoint uses:
+	 *
+	 * 1) arena_t's runs_avail tree.
+	 * 2) arena_run_t conceptually uses this linkage for in-use non-full
+	 *    runs, rather than directly embedding linkage.
+	 */
+	rb_node(arena_chunk_map_t)	link;
+
+	/*
+	 * Run address (or size) and various flags are stored together.  The bit
+	 * layout looks like (assuming 32-bit system):
+	 *
+	 *   ???????? ???????? ????---- --ckdzla
+	 *
+	 * ? : Unallocated: Run address for first/last pages, unset for internal
+	 *                  pages.
+	 *     Small: Run address.
+	 *     Large: Run size for first page, unset for trailing pages.
+	 * - : Unused.
+	 * c : decommitted?
+	 * k : key?
+	 * d : dirty?
+	 * z : zeroed?
+	 * l : large?
+	 * a : allocated?
+	 *
+	 * Following are example bit patterns for the three types of runs.
+	 *
+	 * r : run address
+	 * s : run size
+	 * x : don't care
+	 * - : 0
+	 * [cdzla] : bit set
+	 *
+	 *   Unallocated:
+	 *     ssssssss ssssssss ssss---- --c-----
+	 *     xxxxxxxx xxxxxxxx xxxx---- ----d---
+	 *     ssssssss ssssssss ssss---- -----z--
+	 *
+	 *   Small:
+	 *     rrrrrrrr rrrrrrrr rrrr---- -------a
+	 *     rrrrrrrr rrrrrrrr rrrr---- -------a
+	 *     rrrrrrrr rrrrrrrr rrrr---- -------a
+	 *
+	 *   Large:
+	 *     ssssssss ssssssss ssss---- ------la
+	 *     -------- -------- -------- ------la
+	 *     -------- -------- -------- ------la
+	 */
+	size_t				bits;
 #ifdef MALLOC_DECOMMIT
-#define	CHUNK_MAP_DECOMMITTED	0x10U
-#define	CHUNK_MAP_POS_MASK	0x0fU
-#else
-#define	CHUNK_MAP_POS_MASK	0x1fU
-#endif
+#define	CHUNK_MAP_DECOMMITTED	((size_t)0x20U)
+#endif
+#define	CHUNK_MAP_KEY		((size_t)0x10U)
+#define	CHUNK_MAP_DIRTY		((size_t)0x08U)
+#define	CHUNK_MAP_ZEROED	((size_t)0x04U)
+#define	CHUNK_MAP_LARGE		((size_t)0x02U)
+#define	CHUNK_MAP_ALLOCATED	((size_t)0x01U)
+};
+typedef rb_tree(arena_chunk_map_t) arena_avail_tree_t;
+typedef rb_tree(arena_chunk_map_t) arena_run_tree_t;
 
 /* Arena chunk header. */
 typedef struct arena_chunk_s arena_chunk_t;
 struct arena_chunk_s {
 	/* Arena that owns the chunk. */
 	arena_t		*arena;
 
-	/* Linkage for the arena's chunks_all tree. */
-	rb_node(arena_chunk_t) link_all;
-
 	/* Linkage for the arena's chunks_dirty tree. */
 	rb_node(arena_chunk_t) link_dirty;
 
-	/*
-	 * Number of pages in use.  This is maintained in order to make
-	 * detection of empty chunks fast.
-	 */
-	size_t		pages_used;
-
 	/* Number of dirty pages. */
 	size_t		ndirty;
 
-	/*
-	 * Tree of extent nodes that are embedded in the arena chunk header
-	 * page(s).  These nodes are used by arena_chunk_node_alloc().
-	 */
-	extent_tree_t	nodes;
-	extent_node_t	*nodes_past;
-
-	/*
-	 * Map of pages within chunk that keeps track of free/large/small.  For
-	 * free runs, only the map entries for the first and last pages are
-	 * kept up to date, so that free runs can be quickly coalesced.
-	 */
+	/* Map of pages within chunk that keeps track of free/large/small. */
 	arena_chunk_map_t map[1]; /* Dynamically sized. */
 };
 typedef rb_tree(arena_chunk_t) arena_chunk_tree_t;
 
 typedef struct arena_run_s arena_run_t;
 struct arena_run_s {
-	/* Linkage for run trees. */
-	rb_node(arena_run_t) link;
-
 #ifdef MALLOC_DEBUG
 	uint32_t	magic;
 #  define ARENA_RUN_MAGIC 0x384adf93
 #endif
 
 	/* Bin this run is associated with. */
 	arena_bin_t	*bin;
 
@@ -797,17 +851,16 @@ struct arena_run_s {
 	unsigned	regs_minelm;
 
 	/* Number of free regions in run. */
 	unsigned	nfree;
 
 	/* Bitmask of in-use regions (0: in use, 1: free). */
 	unsigned	regs_mask[1]; /* Dynamically sized. */
 };
-typedef rb_tree(arena_run_t) arena_run_tree_t;
 
 struct arena_bin_s {
 	/*
 	 * Current run being used to service allocations of this bin's size
 	 * class.
 	 */
 	arena_run_t	*runcur;
 
@@ -859,49 +912,44 @@ struct arena_s {
 #endif
 
 	/*
 	 * Chunk allocation sequence number, used to detect races with other
 	 * threads during chunk allocation, and then discard unnecessary chunks.
 	 */
 	uint64_t		chunk_seq;
 
-	/* Tree of all chunks this arena manages. */
-	arena_chunk_tree_t	chunks_all;
+	/* Tree of dirty-page-containing chunks this arena manages. */
+	arena_chunk_tree_t	chunks_dirty;
 
 	/*
-	 * Tree of dirty-page-containing chunks this arena manages.  This tree
-	 * is maintained in addition to chunks_all in order to make
-	 * deallocation O(lg d), where 'd' is the size of chunks_dirty.
+	 * In order to avoid rapid chunk allocation/deallocation when an arena
+	 * oscillates right on the cusp of needing a new chunk, cache the most
+	 * recently freed chunk.  The spare is left in the arena's chunk trees
+	 * until it is deleted.
 	 *
-	 * Without this tree, deallocation would be O(a), where 'a' is the size
-	 * of chunks_all.  Since dirty pages are purged in descending memory
-	 * order, it would not be difficult to trigger something approaching
-	 * worst case behavior with a series of large deallocations.
+	 * There is one spare chunk per arena, rather than one spare total, in
+	 * order to avoid interactions between multiple threads that could make
+	 * a single spare inadequate.
 	 */
-	arena_chunk_tree_t	chunks_dirty;
+	arena_chunk_t		*spare;
 
 	/*
 	 * Current count of pages within unused runs that are potentially
 	 * dirty, and for which madvise(... MADV_FREE) has not been called.  By
 	 * tracking this, we can institute a limit on how much dirty unused
 	 * memory is mapped for each arena.
 	 */
 	size_t			ndirty;
 
 	/*
-	 * Trees of this arena's available runs.  Two trees are maintained
-	 * using one set of nodes, since one is needed for first-best-fit run
-	 * allocation, and the other is needed for coalescing.
+	 * Size/address-ordered tree of this arena's available runs.  This tree
+	 * is used for first-best-fit run allocation.
 	 */
-	extent_tree_t		runs_avail_szad;
-	extent_tree_t		runs_avail_ad;
-
-	/* Tree of this arena's allocated (in-use) runs. */
-	extent_tree_t		runs_alloced_ad;
+	arena_avail_tree_t	runs_avail;
 
 #ifdef MALLOC_BALANCE
 	/*
 	 * The arena load balancing machinery needs to keep track of how much
 	 * lock contention there is.  This value is exponentially averaged.
 	 */
 	uint32_t		contention;
 #endif
@@ -964,16 +1012,20 @@ static size_t		chunk_npages;
 static size_t		arena_chunk_header_npages;
 static size_t		arena_maxclass; /* Max size class for arenas. */
 
 /********/
 /*
  * Chunks.
  */
 
+#ifdef MALLOC_VALIDATE
+static malloc_rtree_t *chunk_rtree;
+#endif
+
 /* Protects chunk-related data structures. */
 static malloc_mutex_t	huge_mtx;
 
 /* Tree of chunks that are stand-alone huge allocations. */
 static extent_tree_t	huge;
 
 #ifdef MALLOC_STATS
 /* Huge allocation statistics. */
@@ -1183,32 +1235,28 @@ static void	pagefile_close(int pfd);
 static void	*chunk_recycle_reserve(size_t size, bool zero);
 static void	*chunk_alloc(size_t size, bool zero, bool pagefile);
 static extent_node_t *chunk_dealloc_reserve(void *chunk, size_t size);
 static void	chunk_dealloc_mmap(void *chunk, size_t size);
 static void	chunk_dealloc(void *chunk, size_t size);
 #ifndef NO_TLS
 static arena_t	*choose_arena_hard(void);
 #endif
-static extent_node_t *arena_chunk_node_alloc(arena_chunk_t *chunk);
-static void	arena_chunk_node_dealloc(arena_chunk_t *chunk,
-    extent_node_t *node);
 static void	arena_run_split(arena_t *arena, arena_run_t *run, size_t size,
-    bool small, bool zero);
+    bool large, bool zero);
 static void arena_chunk_init(arena_t *arena, arena_chunk_t *chunk);
 static void	arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk);
 static arena_run_t *arena_run_alloc(arena_t *arena, arena_bin_t *bin,
-    size_t size, bool small, bool zero);
+    size_t size, bool large, bool zero);
 static void	arena_purge(arena_t *arena);
 static void	arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty);
 static void	arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk,
-    extent_node_t *nodeB, arena_run_t *run, size_t oldsize, size_t newsize);
+    arena_run_t *run, size_t oldsize, size_t newsize);
 static void	arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk,
-    extent_node_t *nodeA, arena_run_t *run, size_t oldsize, size_t newsize,
-    bool dirty);
+    arena_run_t *run, size_t oldsize, size_t newsize, bool dirty);
 static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin);
 static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
 static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
 #ifdef MALLOC_BALANCE
 static void	arena_lock_balance_hard(arena_t *arena);
 #endif
 static void	*arena_malloc_large(arena_t *arena, size_t size, bool zero);
 static void	*arena_palloc(arena_t *arena, size_t alignment, size_t size,
@@ -1510,17 +1558,16 @@ malloc_spin_unlock(pthread_mutex_t *lock
 	if (__isthreaded)
 		_pthread_mutex_unlock(lock);
 }
 #endif
 
 /*
  * End spin lock.
  */
-
 /******************************************************************************/
 /*
  * Begin Utility functions/macros.
  */
 
 /* Return the chunk address for allocation address a. */
 #define	CHUNK_ADDR2BASE(a)						\
 	((void *)((uintptr_t)(a) & ~chunksize_mask))
@@ -1602,21 +1649,16 @@ prn_##suffix(uint32_t lg_range)						\
 	ret = x >> (32 - lg_range);					\
 									\
 	return (ret);							\
 }
 #  define SPRN(suffix, seed)	sprn_##suffix(seed)
 #  define PRN(suffix, lg_range)	prn_##suffix(lg_range)
 #endif
 
-/*
- * Define PRNGs, one for each purpose, in order to avoid auto-correlation
- * problems.
- */
-
 #ifdef MALLOC_BALANCE
 /* Define the PRNG used for arena assignment. */
 static __thread uint32_t balance_x;
 PRN_DEFINE(balance, balance_x, 1297, 1301)
 #endif
 
 #ifdef MALLOC_UTRACE
 static int
@@ -2170,17 +2212,123 @@ pages_unmap(void *addr, size_t size)
 		_malloc_message(_getprogname(),
 		    ": (malloc) Error in munmap(): ", buf, "\n");
 		if (opt_abort)
 			abort();
 	}
 }
 #endif
 
+#ifdef MALLOC_VALIDATE
+static inline malloc_rtree_t *
+malloc_rtree_new(unsigned bits)
+{
+	malloc_rtree_t *ret;
+	unsigned bits_per_level, height, i;
+
+	bits_per_level = ffs(pow2_ceil((MALLOC_RTREE_NODESIZE /
+	    sizeof(void *)))) - 1;
+	height = bits / bits_per_level;
+	if (height * bits_per_level != bits)
+		height++;
+	assert(height * bits_per_level >= bits);
+
+	ret = base_calloc(1, sizeof(malloc_rtree_t) + (sizeof(unsigned) *
+	    (height - 1)));
+	if (ret == NULL)
+		return (NULL);
+
+	malloc_spin_init(&ret->lock);
+	ret->height = height;
+	if (bits_per_level * height > bits)
+		ret->level2bits[0] = bits % bits_per_level;
+	else
+		ret->level2bits[0] = bits_per_level;
+	for (i = 1; i < height; i++)
+		ret->level2bits[i] = bits_per_level;
+
+	ret->root = base_calloc(1, sizeof(void *) << ret->level2bits[0]);
+	if (ret->root == NULL) {
+		/*
+		 * We leak the rtree here, since there's no generic base
+		 * deallocation.
+		 */
+		return (NULL);
+	}
+
+	return (ret);
+}
+
+/* The least significant bits of the key are ignored. */
 static inline void *
+malloc_rtree_get(malloc_rtree_t *rtree, uintptr_t key)
+{
+	void *ret;
+	uintptr_t subkey;
+	unsigned i, lshift, height, bits;
+	void **node, **child;
+
+	malloc_spin_lock(&rtree->lock);
+	for (i = lshift = 0, height = rtree->height, node = rtree->root;
+	    i < height - 1;
+	    i++, lshift += bits, node = child) {
+		bits = rtree->level2bits[i];
+		subkey = (key << lshift) >> ((SIZEOF_PTR << 3) - bits);
+		child = node[subkey];
+		if (child == NULL) {
+			malloc_spin_unlock(&rtree->lock);
+			return (NULL);
+		}
+	}
+
+	/* node is a leaf, so it contains values rather than node pointers. */
+	bits = rtree->level2bits[i];
+	subkey = (key << lshift) >> ((SIZEOF_PTR << 3) - bits);
+	ret = node[subkey];
+	malloc_spin_unlock(&rtree->lock);
+
+	return (ret);
+}
+
+static inline bool
+malloc_rtree_set(malloc_rtree_t *rtree, uintptr_t key, void *val)
+{
+	uintptr_t subkey;
+	unsigned i, lshift, height, bits;
+	void **node, **child;
+
+	malloc_spin_lock(&rtree->lock);
+	for (i = lshift = 0, height = rtree->height, node = rtree->root;
+	    i < height - 1;
+	    i++, lshift += bits, node = child) {
+		bits = rtree->level2bits[i];
+		subkey = (key << lshift) >> ((SIZEOF_PTR << 3) - bits);
+		child = node[subkey];
+		if (child == NULL) {
+			child = base_calloc(1, sizeof(void *) <<
+			    rtree->level2bits[i+1]);
+			if (child == NULL) {
+				malloc_spin_unlock(&rtree->lock);
+				return (true);
+			}
+			node[subkey] = child;
+		}
+	}
+
+	/* node is a leaf, so it contains values rather than node pointers. */
+	bits = rtree->level2bits[i];
+	subkey = (key << lshift) >> ((SIZEOF_PTR << 3) - bits);
+	node[subkey] = val;
+	malloc_spin_unlock(&rtree->lock);
+
+	return (false);
+}
+#endif
+
+static void *
 chunk_alloc_mmap(size_t size, bool pagefile)
 {
 	void *ret;
 	size_t offset;
 	int pfd;
 
 #ifdef MALLOC_PAGEFILE
 	if (opt_pagefile && pagefile) {
@@ -2238,16 +2386,20 @@ chunk_alloc_mmap(size_t size, bool pagef
 		}
 	}
 
 RETURN:
 #ifdef MALLOC_PAGEFILE
 	if (pfd != -1)
 		pagefile_close(pfd);
 #endif
+#ifdef MALLOC_STATS
+	if (ret != NULL)
+		stats_chunks.nchunks += (size / chunksize);
+#endif
 	return (ret);
 }
 
 #ifdef MALLOC_PAGEFILE
 static int
 pagefile_init(size_t size)
 {
 	int ret;
@@ -2419,32 +2571,38 @@ chunk_alloc(size_t size, bool zero, bool
 	assert((size & chunksize_mask) == 0);
 
 	ret = chunk_recycle_reserve(size, zero);
 	if (ret != NULL)
 		goto RETURN;
 
 	ret = chunk_alloc_mmap(size, pagefile);
 	if (ret != NULL) {
-#ifdef MALLOC_STATS
-		stats_chunks.nchunks += (size / chunksize);
-#endif
 		goto RETURN;
 	}
 
 	/* All strategies for allocation failed. */
 	ret = NULL;
 RETURN:
 #ifdef MALLOC_STATS
 	if (ret != NULL)
 		stats_chunks.curchunks += (size / chunksize);
 	if (stats_chunks.curchunks > stats_chunks.highchunks)
 		stats_chunks.highchunks = stats_chunks.curchunks;
 #endif
 
+#ifdef MALLOC_VALIDATE
+	if (ret != NULL) {
+		if (malloc_rtree_set(chunk_rtree, (uintptr_t)ret, ret)) {
+			chunk_dealloc(ret, size);
+			return (NULL);
+		}
+	}
+#endif
+
 	assert(CHUNK_ADDR2BASE(ret) == ret);
 	return (ret);
 }
 
 static extent_node_t *
 chunk_dealloc_reserve(void *chunk, size_t size)
 {
 	extent_node_t *node;
@@ -2530,16 +2688,19 @@ chunk_dealloc(void *chunk, size_t size)
 	assert(chunk != NULL);
 	assert(CHUNK_ADDR2BASE(chunk) == chunk);
 	assert(size != 0);
 	assert((size & chunksize_mask) == 0);
 
 #ifdef MALLOC_STATS
 	stats_chunks.curchunks -= (size / chunksize);
 #endif
+#ifdef MALLOC_VALIDATE
+	malloc_rtree_set(chunk_rtree, (uintptr_t)chunk, NULL);
+#endif
 
 	/* Try to merge chunk into the reserve. */
 	malloc_mutex_lock(&reserve_mtx);
 	node = chunk_dealloc_reserve(chunk, size);
 	malloc_mutex_unlock(&reserve_mtx);
 	if (node == NULL)
 		chunk_dealloc_mmap(chunk, size);
 }
@@ -2686,64 +2847,66 @@ arena_chunk_comp(arena_chunk_t *a, arena
 
 	assert(a != NULL);
 	assert(b != NULL);
 
 	return ((a_chunk > b_chunk) - (a_chunk < b_chunk));
 }
 
 /* Wrap red-black tree macros in functions. */
-rb_wrap(static, arena_chunk_tree_all_, arena_chunk_tree_t,
-    arena_chunk_t, link_all, arena_chunk_comp)
 rb_wrap(static, arena_chunk_tree_dirty_, arena_chunk_tree_t,
     arena_chunk_t, link_dirty, arena_chunk_comp)
 
 static inline int
-arena_run_comp(arena_run_t *a, arena_run_t *b)
-{
-	uintptr_t a_run = (uintptr_t)a;
-	uintptr_t b_run = (uintptr_t)b;
+arena_run_comp(arena_chunk_map_t *a, arena_chunk_map_t *b)
+{
+	uintptr_t a_mapelm = (uintptr_t)a;
+	uintptr_t b_mapelm = (uintptr_t)b;
 
 	assert(a != NULL);
 	assert(b != NULL);
 
-	return ((a_run > b_run) - (a_run < b_run));
+	return ((a_mapelm > b_mapelm) - (a_mapelm < b_mapelm));
 }
 
 /* Wrap red-black tree macros in functions. */
-rb_wrap(static, arena_run_tree_, arena_run_tree_t, arena_run_t, link,
+rb_wrap(static, arena_run_tree_, arena_run_tree_t, arena_chunk_map_t, link,
     arena_run_comp)
 
-static extent_node_t *
-arena_chunk_node_alloc(arena_chunk_t *chunk)
-{
-	extent_node_t *ret;
-
-	ret = extent_tree_ad_first(&chunk->nodes);
-	if (ret != NULL)
-		extent_tree_ad_remove(&chunk->nodes, ret);
-	else {
-		ret = chunk->nodes_past;
-		chunk->nodes_past = (extent_node_t *)
-		    ((uintptr_t)chunk->nodes_past + sizeof(extent_node_t));
-		assert((uintptr_t)ret + sizeof(extent_node_t) <=
-		    (uintptr_t)chunk + (arena_chunk_header_npages <<
-		    pagesize_2pow));
+static inline int
+arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b)
+{
+	int ret;
+	size_t a_size = a->bits & ~pagesize_mask;
+	size_t b_size = b->bits & ~pagesize_mask;
+
+	ret = (a_size > b_size) - (a_size < b_size);
+	if (ret == 0) {
+		uintptr_t a_mapelm, b_mapelm;
+
+		if ((a->bits & CHUNK_MAP_KEY) == 0)
+			a_mapelm = (uintptr_t)a;
+		else {
+			/*
+			 * Treat keys as though they are lower than anything
+			 * else.
+			 */
+			a_mapelm = 0;
+		}
+		b_mapelm = (uintptr_t)b;
+
+		ret = (a_mapelm > b_mapelm) - (a_mapelm < b_mapelm);
 	}
 
 	return (ret);
 }
 
-static void
-arena_chunk_node_dealloc(arena_chunk_t *chunk, extent_node_t *node)
-{
-
-	node->addr = (void *)node;
-	extent_tree_ad_insert(&chunk->nodes, node);
-}
+/* Wrap red-black tree macros in functions. */
+rb_wrap(static, arena_avail_tree_, arena_avail_tree_t, arena_chunk_map_t, link,
+    arena_avail_comp)
 
 static inline void *
 arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin)
 {
 	void *ret;
 	unsigned i, mask, bit, regind;
 
 	assert(run->magic == ARENA_RUN_MAGIC);
@@ -2899,240 +3062,251 @@ arena_run_reg_dalloc(arena_run_t *run, a
 	bit = regind - (elm << (SIZEOF_INT_2POW + 3));
 	assert((run->regs_mask[elm] & (1U << bit)) == 0);
 	run->regs_mask[elm] |= (1U << bit);
 #undef SIZE_INV
 #undef SIZE_INV_SHIFT
 }
 
 static void
-arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool small,
+arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large,
     bool zero)
 {
 	arena_chunk_t *chunk;
 	size_t old_ndirty, run_ind, total_pages, need_pages, rem_pages, i;
-	extent_node_t *nodeA, *nodeB, key;
-
-	/* Insert a node into runs_alloced_ad for the first part of the run. */
+
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
 	old_ndirty = chunk->ndirty;
-	nodeA = arena_chunk_node_alloc(chunk);
-	nodeA->addr = run;
-	nodeA->size = size;
-	extent_tree_ad_insert(&arena->runs_alloced_ad, nodeA);
-
-	key.addr = run;
-	nodeB = extent_tree_ad_search(&arena->runs_avail_ad, &key);
-	assert(nodeB != NULL);
-
 	run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk)
 	    >> pagesize_2pow);
-	total_pages = nodeB->size >> pagesize_2pow;
+	total_pages = (chunk->map[run_ind].bits & ~pagesize_mask) >>
+	    pagesize_2pow;
 	need_pages = (size >> pagesize_2pow);
 	assert(need_pages > 0);
 	assert(need_pages <= total_pages);
-	assert(need_pages <= CHUNK_MAP_POS_MASK || small == false);
 	rem_pages = total_pages - need_pages;
 
+	arena_avail_tree_remove(&arena->runs_avail, &chunk->map[run_ind]);
+
+	/* Keep track of trailing unused pages for later use. */
+	if (rem_pages > 0) {
+		chunk->map[run_ind+need_pages].bits = (rem_pages <<
+		    pagesize_2pow) | (chunk->map[run_ind+need_pages].bits &
+		    pagesize_mask);
+		chunk->map[run_ind+total_pages-1].bits = (rem_pages <<
+		    pagesize_2pow) | (chunk->map[run_ind+total_pages-1].bits &
+		    pagesize_mask);
+		arena_avail_tree_insert(&arena->runs_avail,
+		    &chunk->map[run_ind+need_pages]);
+	}
+
 	for (i = 0; i < need_pages; i++) {
 #ifdef MALLOC_DECOMMIT
 		/*
 		 * Commit decommitted pages if necessary.  If a decommitted
 		 * page is encountered, commit all needed adjacent decommitted
 		 * pages in one operation, in order to reduce system call
 		 * overhead.
 		 */
-		if (chunk->map[run_ind + i] & CHUNK_MAP_DECOMMITTED) {
+		if (chunk->map[run_ind + i].bits & CHUNK_MAP_DECOMMITTED) {
 			size_t j;
 
 			/*
 			 * Advance i+j to just past the index of the last page
 			 * to commit.  Clear CHUNK_MAP_DECOMMITTED along the
 			 * way.
 			 */
 			for (j = 0; i + j < need_pages && (chunk->map[run_ind +
-			    i + j] & CHUNK_MAP_DECOMMITTED); j++) {
-				chunk->map[run_ind + i + j] ^=
+			    i + j].bits & CHUNK_MAP_DECOMMITTED); j++) {
+				chunk->map[run_ind + i + j].bits ^=
 				    CHUNK_MAP_DECOMMITTED;
 			}
 
 			pages_commit((void *)((uintptr_t)chunk + ((run_ind + i)
 			    << pagesize_2pow)), (j << pagesize_2pow));
 #  ifdef MALLOC_STATS
 			arena->stats.ncommit++;
 #  endif
 		} else /* No need to zero since commit zeros. */
 #endif
 
 		/* Zero if necessary. */
 		if (zero) {
-			if ((chunk->map[run_ind + i] & CHUNK_MAP_UNTOUCHED)
+			if ((chunk->map[run_ind + i].bits & CHUNK_MAP_ZEROED)
 			    == 0) {
 				VALGRIND_MALLOCLIKE_BLOCK((void *)((uintptr_t)
 				    chunk + ((run_ind + i) << pagesize_2pow)),
 				    pagesize, 0, false);
 				memset((void *)((uintptr_t)chunk + ((run_ind
 				    + i) << pagesize_2pow)), 0, pagesize);
 				VALGRIND_FREELIKE_BLOCK((void *)((uintptr_t)
 				    chunk + ((run_ind + i) << pagesize_2pow)),
 				    0);
-				/* CHUNK_MAP_UNTOUCHED is cleared below. */
+				/* CHUNK_MAP_ZEROED is cleared below. */
 			}
 		}
 
 		/* Update dirty page accounting. */
-		if (chunk->map[run_ind + i] & CHUNK_MAP_DIRTY) {
+		if (chunk->map[run_ind + i].bits & CHUNK_MAP_DIRTY) {
 			chunk->ndirty--;
 			arena->ndirty--;
+			/* CHUNK_MAP_DIRTY is cleared below. */
 		}
 
 		/* Initialize the chunk map. */
-		if (small)
-			chunk->map[run_ind + i] = (uint8_t)i;
-		else
-			chunk->map[run_ind + i] = CHUNK_MAP_LARGE;
+		if (large) {
+			chunk->map[run_ind + i].bits = CHUNK_MAP_LARGE
+			    | CHUNK_MAP_ALLOCATED;
+		} else {
+			chunk->map[run_ind + i].bits = (size_t)run
+			    | CHUNK_MAP_ALLOCATED;
+		}
 	}
 
-	/* Keep track of trailing unused pages for later use. */
-	extent_tree_szad_remove(&arena->runs_avail_szad, nodeB);
-	if (rem_pages > 0) {
-		/*
-		 * Update nodeB in runs_avail_*.  Its position within
-		 * runs_avail_ad does not change.
-		 */
-		nodeB->addr = (void *)((uintptr_t)nodeB->addr + size);
-		nodeB->size -= size;
-		extent_tree_szad_insert(&arena->runs_avail_szad, nodeB);
-	} else {
-		/* Remove nodeB from runs_avail_*. */
-		extent_tree_ad_remove(&arena->runs_avail_ad, nodeB);
-		arena_chunk_node_dealloc(chunk, nodeB);
-	}
-
-	chunk->pages_used += need_pages;
+	/*
+	 * Set the run size only in the first element for large runs.  This is
+	 * primarily a debugging aid, since the lack of size info for trailing
+	 * pages only matters if the application tries to operate on an
+	 * interior pointer.
+	 */
+	if (large)
+		chunk->map[run_ind].bits |= size;
+
 	if (chunk->ndirty == 0 && old_ndirty > 0)
 		arena_chunk_tree_dirty_remove(&arena->chunks_dirty, chunk);
 }
 
 static void
 arena_chunk_init(arena_t *arena, arena_chunk_t *chunk)
 {
-	extent_node_t *node;
+	arena_run_t *run;
+	size_t i;
 
 	VALGRIND_MALLOCLIKE_BLOCK(chunk, (arena_chunk_header_npages <<
 	    pagesize_2pow), 0, false);
 #ifdef MALLOC_STATS
 	arena->stats.mapped += chunksize;
 #endif
 
 	chunk->arena = arena;
 
-	arena_chunk_tree_all_insert(&arena->chunks_all, chunk);
-
 	/*
 	 * Claim that no pages are in use, since the header is merely overhead.
 	 */
-	chunk->pages_used = 0;
 	chunk->ndirty = 0;
 
 	/* Initialize the map to contain one maximal free untouched run. */
-	memset(chunk->map, (CHUNK_MAP_LARGE | CHUNK_MAP_POS_MASK),
-	    arena_chunk_header_npages);
-	memset(&chunk->map[arena_chunk_header_npages], (CHUNK_MAP_UNTOUCHED
+	run = (arena_run_t *)((uintptr_t)chunk + (arena_chunk_header_npages <<
+	    pagesize_2pow));
+	for (i = 0; i < arena_chunk_header_npages; i++)
+		chunk->map[i].bits = 0;
+	chunk->map[i].bits = arena_maxclass
 #ifdef MALLOC_DECOMMIT
 	    | CHUNK_MAP_DECOMMITTED
 #endif
-	    ), (chunk_npages -
-	    arena_chunk_header_npages));
-
-	/* Initialize the tree of unused extent nodes. */
-	extent_tree_ad_new(&chunk->nodes);
-	chunk->nodes_past = (extent_node_t *)QUANTUM_CEILING(
-	    (uintptr_t)&chunk->map[chunk_npages]);
+	    | CHUNK_MAP_ZEROED;
+	for (i++; i < chunk_npages-1; i++) {
+		chunk->map[i].bits =
+#ifdef MALLOC_DECOMMIT
+		    CHUNK_MAP_DECOMMITTED |
+#endif
+		    CHUNK_MAP_ZEROED;
+	}
+	chunk->map[chunk_npages-1].bits = arena_maxclass
+#ifdef MALLOC_DECOMMIT
+	    | CHUNK_MAP_DECOMMITTED
+#endif
+	    | CHUNK_MAP_ZEROED;
 
 #ifdef MALLOC_DECOMMIT
 	/*
 	 * Start out decommitted, in order to force a closer correspondence
 	 * between dirty pages and committed untouched pages.
 	 */
-	pages_decommit((void *)((uintptr_t)chunk +
-	    (arena_chunk_header_npages << pagesize_2pow)),
-	    ((chunk_npages - arena_chunk_header_npages) <<
-	    pagesize_2pow));
+	pages_decommit(run, arena_maxclass);
 #  ifdef MALLOC_STATS
 	arena->stats.ndecommit++;
 	arena->stats.decommitted += (chunk_npages - arena_chunk_header_npages);
 #  endif
 #endif
 
-	/* Insert the run into the runs_avail_* red-black trees. */
-	node = arena_chunk_node_alloc(chunk);
-	node->addr = (void *)((uintptr_t)chunk + (arena_chunk_header_npages <<
-	    pagesize_2pow));
-	node->size = chunksize - (arena_chunk_header_npages << pagesize_2pow);
-	extent_tree_szad_insert(&arena->runs_avail_szad, node);
-	extent_tree_ad_insert(&arena->runs_avail_ad, node);
+	/* Insert the run into the runs_avail tree. */
+	arena_avail_tree_insert(&arena->runs_avail,
+	    &chunk->map[arena_chunk_header_npages]);
 }
 
 static void
 arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk)
 {
-	extent_node_t *node, key;
+
+	if (arena->spare != NULL) {
+		if (arena->spare->ndirty > 0) {
+			arena_chunk_tree_dirty_remove(
+			    &chunk->arena->chunks_dirty, arena->spare);
+			arena->ndirty -= arena->spare->ndirty;
+		}
+		VALGRIND_FREELIKE_BLOCK(arena->spare, 0);
+		chunk_dealloc((void *)arena->spare, chunksize);
+#ifdef MALLOC_STATS
+		arena->stats.mapped -= chunksize;
+#endif
+	}
 
 	/*
-	 * Remove run from the runs trees, regardless of whether this chunk
+	 * Remove run from runs_avail, regardless of whether this chunk
 	 * will be cached, so that the arena does not use it.  Dirty page
 	 * flushing only uses the chunks_dirty tree, so leaving this chunk in
 	 * the chunks_* trees is sufficient for that purpose.
 	 */
-	key.addr = (void *)((uintptr_t)chunk + (arena_chunk_header_npages <<
-	    pagesize_2pow));
-	node = extent_tree_ad_search(&arena->runs_avail_ad, &key);
-	assert(node != NULL);
-	extent_tree_szad_remove(&arena->runs_avail_szad, node);
-	extent_tree_ad_remove(&arena->runs_avail_ad, node);
-	arena_chunk_node_dealloc(chunk, node);
-
-	arena_chunk_tree_all_remove(&chunk->arena->chunks_all,
-	    chunk);
-	if (chunk->ndirty > 0) {
-		arena_chunk_tree_dirty_remove(
-		    &chunk->arena->chunks_dirty, chunk);
-		arena->ndirty -= chunk->ndirty;
-	}
-	VALGRIND_FREELIKE_BLOCK(chunk, 0);
-	chunk_dealloc((void *)chunk, chunksize);
-#ifdef MALLOC_STATS
-	arena->stats.mapped -= chunksize;
-#endif
+	arena_avail_tree_remove(&arena->runs_avail,
+	    &chunk->map[arena_chunk_header_npages]);
+
+	arena->spare = chunk;
 }
 
 static arena_run_t *
-arena_run_alloc(arena_t *arena, arena_bin_t *bin, size_t size, bool small,
+arena_run_alloc(arena_t *arena, arena_bin_t *bin, size_t size, bool large,
     bool zero)
 {
 	arena_chunk_t *chunk;
 	arena_run_t *run;
-	extent_node_t *node, key;
-
-	assert(size <= (chunksize - (arena_chunk_header_npages <<
-	    pagesize_2pow)));
+	arena_chunk_map_t *mapelm, key;
+
+	assert(size <= arena_maxclass);
 	assert((size & pagesize_mask) == 0);
 
 	chunk = NULL;
 	while (true) {
 		/* Search the arena's chunks for the lowest best fit. */
-		key.addr = NULL;
-		key.size = size;
-		node = extent_tree_szad_nsearch(&arena->runs_avail_szad, &key);
-		if (node != NULL) {
+		key.bits = size | CHUNK_MAP_KEY;
+		mapelm = arena_avail_tree_nsearch(&arena->runs_avail, &key);
+		if (mapelm != NULL) {
+			arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm);
+			size_t pageind = ((uintptr_t)mapelm -
+			    (uintptr_t)run_chunk->map) /
+			    sizeof(arena_chunk_map_t);
+
 			if (chunk != NULL)
 				chunk_dealloc(chunk, chunksize);
-			run = (arena_run_t *)node->addr;
-			arena_run_split(arena, run, size, small, zero);
+			run = (arena_run_t *)((uintptr_t)run_chunk + (pageind
+			    << pagesize_2pow));
+			arena_run_split(arena, run, size, large, zero);
+			return (run);
+		}
+
+		if (arena->spare != NULL) {
+			/* Use the spare. */
+			chunk = arena->spare;
+			arena->spare = NULL;
+			run = (arena_run_t *)((uintptr_t)chunk +
+			    (arena_chunk_header_npages << pagesize_2pow));
+			/* Insert the run into the runs_avail tree. */
+			arena_avail_tree_insert(&arena->runs_avail,
+			    &chunk->map[arena_chunk_header_npages]);
+			arena_run_split(arena, run, size, large, zero);
 			return (run);
 		}
 
 		/*
 		 * No usable runs.  Create a new chunk from which to allocate
 		 * the run.
 		 */
 		if (chunk == NULL) {
@@ -3182,36 +3356,28 @@ arena_run_alloc(arena_t *arena, arena_bi
 			if (chunk == NULL)
 				return (NULL);
 		}
 
 		arena_chunk_init(arena, chunk);
 		run = (arena_run_t *)((uintptr_t)chunk +
 		    (arena_chunk_header_npages << pagesize_2pow));
 		/* Update page map. */
-		arena_run_split(arena, run, size, small, zero);
+		arena_run_split(arena, run, size, large, zero);
 		return (run);
 	}
 }
 
 static void
 arena_purge(arena_t *arena)
 {
 	arena_chunk_t *chunk;
 	size_t i, npages;
 #ifdef MALLOC_DEBUG
-	size_t ndirty;
-
-	ndirty = 0;
-	rb_foreach_begin(arena_chunk_t, link_all, &arena->chunks_all, chunk) {
-		ndirty += chunk->ndirty;
-	} rb_foreach_end(arena_chunk_t, link_all, &arena->chunks_all, chunk)
-	assert(ndirty == arena->ndirty);
-
-	ndirty = 0;
+	size_t ndirty = 0;
 	rb_foreach_begin(arena_chunk_t, link_dirty, &arena->chunks_dirty,
 	    chunk) {
 		ndirty += chunk->ndirty;
 	} rb_foreach_end(arena_chunk_t, link_dirty, &arena->chunks_dirty, chunk)
 	assert(ndirty == arena->ndirty);
 #endif
 	assert(arena->ndirty > opt_dirty_max);
 
@@ -3227,47 +3393,55 @@ arena_purge(arena_t *arena)
 	 */
 	while (arena->ndirty > (opt_dirty_max >> 1)) {
 		chunk = arena_chunk_tree_dirty_last(&arena->chunks_dirty);
 		assert(chunk != NULL);
 
 		for (i = chunk_npages - 1; chunk->ndirty > 0; i--) {
 			assert(i >= arena_chunk_header_npages);
 
-			if (chunk->map[i] & CHUNK_MAP_DIRTY) {
-				chunk->map[i] = (CHUNK_MAP_LARGE |
+			if (chunk->map[i].bits & CHUNK_MAP_DIRTY) {
+#ifdef MALLOC_DECOMMIT
+				assert((chunk->map[i].bits &
+				    CHUNK_MAP_DECOMMITTED) == 0);
+#endif
+				chunk->map[i].bits ^=
 #ifdef MALLOC_DECOMMIT
 				    CHUNK_MAP_DECOMMITTED |
 #endif
-				    CHUNK_MAP_POS_MASK);
+				    CHUNK_MAP_DIRTY;
 				/* Find adjacent dirty run(s). */
 				for (npages = 1; i > arena_chunk_header_npages
-				    && (chunk->map[i - 1] & CHUNK_MAP_DIRTY);
-				    npages++) {
+				    && (chunk->map[i - 1].bits &
+				    CHUNK_MAP_DIRTY); npages++) {
 					i--;
-					chunk->map[i] = (CHUNK_MAP_LARGE
 #ifdef MALLOC_DECOMMIT
-					    | CHUNK_MAP_DECOMMITTED
-#endif
-					    | CHUNK_MAP_POS_MASK);
+					assert((chunk->map[i].bits &
+					    CHUNK_MAP_DECOMMITTED) == 0);
+#endif
+					chunk->map[i].bits ^=
+#ifdef MALLOC_DECOMMIT
+					    CHUNK_MAP_DECOMMITTED |
+#endif
+					    CHUNK_MAP_DIRTY;
 				}
 				chunk->ndirty -= npages;
 				arena->ndirty -= npages;
 
 #ifdef MALLOC_DECOMMIT
 				pages_decommit((void *)((uintptr_t)
 				    chunk + (i << pagesize_2pow)),
 				    (npages << pagesize_2pow));
 #  ifdef MALLOC_STATS
 				arena->stats.ndecommit++;
 				arena->stats.decommitted += npages;
 #  endif
 #else
 				madvise((void *)((uintptr_t)chunk + (i <<
-				    pagesize_2pow)), pagesize * npages,
+				    pagesize_2pow)), (npages << pagesize_2pow),
 				    MADV_FREE);
 #endif
 #ifdef MALLOC_STATS
 				arena->stats.nmadvise++;
 				arena->stats.purged += npages;
 #endif
 				if (arena->ndirty <= (opt_dirty_max >> 1))
 					break;
@@ -3280,194 +3454,185 @@ arena_purge(arena_t *arena)
 		}
 	}
 }
 
 static void
 arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty)
 {
 	arena_chunk_t *chunk;
-	extent_node_t *nodeA, *nodeB, *nodeC, key;
 	size_t size, run_ind, run_pages;
 
-	/* Remove run from runs_alloced_ad. */
-	key.addr = run;
-	nodeB = extent_tree_ad_search(&arena->runs_alloced_ad, &key);
-	assert(nodeB != NULL);
-	extent_tree_ad_remove(&arena->runs_alloced_ad, nodeB);
-	size = nodeB->size;
-
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
-	run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk)
+	run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk)
 	    >> pagesize_2pow);
 	assert(run_ind >= arena_chunk_header_npages);
-	assert(run_ind < (chunksize >> pagesize_2pow));
+	assert(run_ind < chunk_npages);
+	if ((chunk->map[run_ind].bits & CHUNK_MAP_LARGE) != 0)
+		size = chunk->map[run_ind].bits & ~pagesize_mask;
+	else
+		size = run->bin->run_size;
 	run_pages = (size >> pagesize_2pow);
 
-	/* Subtract pages from count of pages used in chunk. */
-	chunk->pages_used -= run_pages;
-
+	/* Mark pages as unallocated in the chunk map. */
 	if (dirty) {
 		size_t i;
 
 		for (i = 0; i < run_pages; i++) {
-			assert((chunk->map[run_ind + i] & CHUNK_MAP_DIRTY) ==
-			    0);
-			chunk->map[run_ind + i] |= CHUNK_MAP_DIRTY;
+			assert((chunk->map[run_ind + i].bits & CHUNK_MAP_DIRTY)
+			    == 0);
+			chunk->map[run_ind + i].bits = CHUNK_MAP_DIRTY;
 		}
 
 		if (chunk->ndirty == 0) {
 			arena_chunk_tree_dirty_insert(&arena->chunks_dirty,
 			    chunk);
 		}
 		chunk->ndirty += run_pages;
 		arena->ndirty += run_pages;
-	}
-#ifdef MALLOC_DEBUG
-	/* Set map elements to a bogus value in order to aid error detection. */
-	{
+	} else {
 		size_t i;
 
 		for (i = 0; i < run_pages; i++) {
-			chunk->map[run_ind + i] |= (CHUNK_MAP_LARGE |
-			    CHUNK_MAP_POS_MASK);
+			chunk->map[run_ind + i].bits &= ~(CHUNK_MAP_LARGE |
+			    CHUNK_MAP_ALLOCATED);
 		}
 	}
-#endif
+	chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits &
+	    pagesize_mask);
+	chunk->map[run_ind+run_pages-1].bits = size |
+	    (chunk->map[run_ind+run_pages-1].bits & pagesize_mask);
 
 	/* Try to coalesce forward. */
-	key.addr = (void *)((uintptr_t)run + size);
-	nodeC = extent_tree_ad_nsearch(&arena->runs_avail_ad, &key);
-	if (nodeC != NULL && nodeC->addr == key.addr) {
+	if (run_ind + run_pages < chunk_npages &&
+	    (chunk->map[run_ind+run_pages].bits & CHUNK_MAP_ALLOCATED) == 0) {
+		size_t nrun_size = chunk->map[run_ind+run_pages].bits &
+		    ~pagesize_mask;
+
 		/*
-		 * Coalesce forward.  This does not change the position within
-		 * runs_avail_ad, so only remove/insert from/into
-		 * runs_avail_szad.
+		 * Remove successor from runs_avail; the coalesced run is
+		 * inserted later.
 		 */
-		extent_tree_szad_remove(&arena->runs_avail_szad, nodeC);
-		nodeC->addr = (void *)run;
-		nodeC->size += size;
-		extent_tree_szad_insert(&arena->runs_avail_szad, nodeC);
-		arena_chunk_node_dealloc(chunk, nodeB);
-		nodeB = nodeC;
-	} else {
-		/*
-		 * Coalescing forward failed, so insert nodeB into runs_avail_*.
-		 */
-		extent_tree_szad_insert(&arena->runs_avail_szad, nodeB);
-		extent_tree_ad_insert(&arena->runs_avail_ad, nodeB);
+		arena_avail_tree_remove(&arena->runs_avail,
+		    &chunk->map[run_ind+run_pages]);
+
+		size += nrun_size;
+		run_pages = size >> pagesize_2pow;
+
+		assert((chunk->map[run_ind+run_pages-1].bits & ~pagesize_mask)
+		    == nrun_size);
+		chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits &
+		    pagesize_mask);
+		chunk->map[run_ind+run_pages-1].bits = size |
+		    (chunk->map[run_ind+run_pages-1].bits & pagesize_mask);
 	}
 
 	/* Try to coalesce backward. */
-	nodeA = extent_tree_ad_prev(&arena->runs_avail_ad, nodeB);
-	if (nodeA != NULL && (void *)((uintptr_t)nodeA->addr + nodeA->size) ==
-	    (void *)run) {
+	if (run_ind > arena_chunk_header_npages && (chunk->map[run_ind-1].bits &
+	    CHUNK_MAP_ALLOCATED) == 0) {
+		size_t prun_size = chunk->map[run_ind-1].bits & ~pagesize_mask;
+
+		run_ind -= prun_size >> pagesize_2pow;
+
 		/*
-		 * Coalesce with previous run.  This does not change nodeB's
-		 * position within runs_avail_ad, so only remove/insert
-		 * from/into runs_avail_szad.
+		 * Remove predecessor from runs_avail; the coalesced run is
+		 * inserted later.
 		 */
-		extent_tree_szad_remove(&arena->runs_avail_szad, nodeA);
-		extent_tree_ad_remove(&arena->runs_avail_ad, nodeA);
-
-		extent_tree_szad_remove(&arena->runs_avail_szad, nodeB);
-		nodeB->addr = nodeA->addr;
-		nodeB->size += nodeA->size;
-		extent_tree_szad_insert(&arena->runs_avail_szad, nodeB);
-
-		arena_chunk_node_dealloc(chunk, nodeA);
+		arena_avail_tree_remove(&arena->runs_avail,
+		    &chunk->map[run_ind]);
+
+		size += prun_size;
+		run_pages = size >> pagesize_2pow;
+
+		assert((chunk->map[run_ind].bits & ~pagesize_mask) ==
+		    prun_size);
+		chunk->map[run_ind].bits = size | (chunk->map[run_ind].bits &
+		    pagesize_mask);
+		chunk->map[run_ind+run_pages-1].bits = size |
+		    (chunk->map[run_ind+run_pages-1].bits & pagesize_mask);
 	}
 
+	/* Insert into runs_avail, now that coalescing is complete. */
+	arena_avail_tree_insert(&arena->runs_avail, &chunk->map[run_ind]);
+
 	/* Deallocate chunk if it is now completely unused. */
-	if (chunk->pages_used == 0)
+	if ((chunk->map[arena_chunk_header_npages].bits & (~pagesize_mask |
+	    CHUNK_MAP_ALLOCATED)) == arena_maxclass)
 		arena_chunk_dealloc(arena, chunk);
 
 	/* Enforce opt_dirty_max. */
 	if (arena->ndirty > opt_dirty_max)
 		arena_purge(arena);
 }
 
 static void
-arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, extent_node_t *nodeB,
-    arena_run_t *run, size_t oldsize, size_t newsize)
-{
-	extent_node_t *nodeA;
-
-	assert(nodeB->addr == run);
-	assert(nodeB->size == oldsize);
+arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
+    size_t oldsize, size_t newsize)
+{
+	size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> pagesize_2pow;
+	size_t head_npages = (oldsize - newsize) >> pagesize_2pow;
+
 	assert(oldsize > newsize);
 
 	/*
-	 * Update the run's node in runs_alloced_ad.  Its position does not
-	 * change.
+	 * Update the chunk map so that arena_run_dalloc() can treat the
+	 * leading run as separately allocated.
 	 */
-	nodeB->addr = (void *)((uintptr_t)run + (oldsize - newsize));
-	nodeB->size = newsize;
-
-	/*
-	 * Insert a node into runs_alloced_ad so that arena_run_dalloc() can
-	 * treat the leading run as separately allocated.
-	 */
-	nodeA = arena_chunk_node_alloc(chunk);
-	nodeA->addr = (void *)run;
-	nodeA->size = oldsize - newsize;
-	extent_tree_ad_insert(&arena->runs_alloced_ad, nodeA);
-
-	arena_run_dalloc(arena, (arena_run_t *)run, false);
+	chunk->map[pageind].bits = (oldsize - newsize) | CHUNK_MAP_LARGE |
+	    CHUNK_MAP_ALLOCATED;
+	chunk->map[pageind+head_npages].bits = newsize | CHUNK_MAP_LARGE |
+	    CHUNK_MAP_ALLOCATED;
+
+	arena_run_dalloc(arena, run, false);
 }
 
 static void
-arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, extent_node_t *nodeA,
-    arena_run_t *run, size_t oldsize, size_t newsize, bool dirty)
-{
-	extent_node_t *nodeB;
-
-	assert(nodeA->addr == run);
-	assert(nodeA->size == oldsize);
+arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run,
+    size_t oldsize, size_t newsize, bool dirty)
+{
+	size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> pagesize_2pow;
+	size_t npages = newsize >> pagesize_2pow;
+
 	assert(oldsize > newsize);
 
 	/*
-	 * Update the run's node in runs_alloced_ad.  Its position does not
-	 * change.
+	 * Update the chunk map so that arena_run_dalloc() can treat the
+	 * trailing run as separately allocated.
 	 */
-	nodeA->size = newsize;
-
-	/*
-	 * Insert a node into runs_alloced_ad so that arena_run_dalloc() can
-	 * treat the trailing run as separately allocated.
-	 */
-	nodeB = arena_chunk_node_alloc(chunk);
-	nodeB->addr = (void *)((uintptr_t)run + newsize);
-	nodeB->size = oldsize - newsize;
-	extent_tree_ad_insert(&arena->runs_alloced_ad, nodeB);
+	chunk->map[pageind].bits = newsize | CHUNK_MAP_LARGE |
+	    CHUNK_MAP_ALLOCATED;
+	chunk->map[pageind+npages].bits = (oldsize - newsize) | CHUNK_MAP_LARGE
+	    | CHUNK_MAP_ALLOCATED;
 
 	arena_run_dalloc(arena, (arena_run_t *)((uintptr_t)run + newsize),
 	    dirty);
 }
 
 static arena_run_t *
 arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin)
 {
+	arena_chunk_map_t *mapelm;
 	arena_run_t *run;
 	unsigned i, remainder;
 
 	/* Look for a usable run. */
-	run = arena_run_tree_first(&bin->runs);
-	if (run != NULL) {
+	mapelm = arena_run_tree_first(&bin->runs);
+	if (mapelm != NULL) {
 		/* run is guaranteed to have available space. */
-		arena_run_tree_remove(&bin->runs, run);
+		arena_run_tree_remove(&bin->runs, mapelm);
+		run = (arena_run_t *)(mapelm->bits & ~pagesize_mask);
 #ifdef MALLOC_STATS
 		bin->stats.reruns++;
 #endif
 		return (run);
 	}
 	/* No existing runs have any space available. */
 
 	/* Allocate a new run. */
-	run = arena_run_alloc(arena, bin, bin->run_size, true, false);
+	run = arena_run_alloc(arena, bin, bin->run_size, false, false);
 	if (run == NULL)
 		return (NULL);
 	/*
 	 * Don't initialize if a race in arena_run_alloc() allowed an existing
 	 * run to become usable.
 	 */
 	if (run == bin->runcur)
 		return (run);
@@ -3752,17 +3917,17 @@ arena_malloc_large(arena_t *arena, size_
 
 	/* Large allocation. */
 	size = PAGE_CEILING(size);
 #ifdef MALLOC_BALANCE
 	arena_lock_balance(arena);
 #else
 	malloc_spin_lock(&arena->lock);
 #endif
-	ret = (void *)arena_run_alloc(arena, NULL, size, false, zero);
+	ret = (void *)arena_run_alloc(arena, NULL, size, true, zero);
 	if (ret == NULL) {
 		malloc_spin_unlock(&arena->lock);
 		return (NULL);
 	}
 #ifdef MALLOC_STATS
 	arena->stats.nmalloc_large++;
 	arena->stats.allocated_large += size;
 #endif
@@ -3820,72 +3985,54 @@ icalloc(size_t size)
 
 /* Only handles large allocations that require more than page alignment. */
 static void *
 arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size)
 {
 	void *ret;
 	size_t offset;
 	arena_chunk_t *chunk;
-	extent_node_t *node, key;
 
 	assert((size & pagesize_mask) == 0);
 	assert((alignment & pagesize_mask) == 0);
 
 #ifdef MALLOC_BALANCE
 	arena_lock_balance(arena);
 #else
 	malloc_spin_lock(&arena->lock);
 #endif
-	ret = (void *)arena_run_alloc(arena, NULL, alloc_size, false, false);
+	ret = (void *)arena_run_alloc(arena, NULL, alloc_size, true, false);
 	if (ret == NULL) {
 		malloc_spin_unlock(&arena->lock);
 		return (NULL);
 	}
 
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret);
 
 	offset = (uintptr_t)ret & (alignment - 1);
 	assert((offset & pagesize_mask) == 0);
 	assert(offset < alloc_size);
-	if (offset == 0) {
-		/*
-		 * Update the run's node in runs_alloced_ad.  Its position
-		 * does not change.
-		 */
-		key.addr = ret;
-		node = extent_tree_ad_search(&arena->runs_alloced_ad, &key);
-		assert(node != NULL);
-
-		arena_run_trim_tail(arena, chunk, node, ret, alloc_size, size,
-		    false);
-	} else {
+	if (offset == 0)
+		arena_run_trim_tail(arena, chunk, ret, alloc_size, size, false);
+	else {
 		size_t leadsize, trailsize;
 
-		/*
-		 * Update the run's node in runs_alloced_ad.  Its position
-		 * does not change.
-		 */
-		key.addr = ret;
-		node = extent_tree_ad_search(&arena->runs_alloced_ad, &key);
-		assert(node != NULL);
-
 		leadsize = alignment - offset;
 		if (leadsize > 0) {
-			arena_run_trim_head(arena, chunk, node, ret, alloc_size,
+			arena_run_trim_head(arena, chunk, ret, alloc_size,
 			    alloc_size - leadsize);
 			ret = (void *)((uintptr_t)ret + leadsize);
 		}
 
 		trailsize = alloc_size - leadsize - size;
 		if (trailsize != 0) {
 			/* Trim trailing space. */
 			assert(trailsize < alloc_size);
-			arena_run_trim_tail(arena, chunk, node, ret, size +
-			    trailsize, size, false);
+			arena_run_trim_tail(arena, chunk, ret, size + trailsize,
+			    size, false);
 		}
 	}
 
 #ifdef MALLOC_STATS
 	arena->stats.nmalloc_large++;
 	arena->stats.allocated_large += size;
 #endif
 	malloc_spin_unlock(&arena->lock);
@@ -3996,47 +4143,32 @@ ipalloc(size_t alignment, size_t size)
 }
 
 /* Return the size of the allocation pointed to by ptr. */
 static size_t
 arena_salloc(const void *ptr)
 {
 	size_t ret;
 	arena_chunk_t *chunk;
-	arena_chunk_map_t mapelm;
-	size_t pageind;
+	size_t pageind, mapbits;
 
 	assert(ptr != NULL);
 	assert(CHUNK_ADDR2BASE(ptr) != ptr);
 
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> pagesize_2pow);
-	mapelm = chunk->map[pageind];
-	if ((mapelm & CHUNK_MAP_LARGE) == 0) {
-		arena_run_t *run;
-
-		/* Small allocation size is in the run header. */
-		pageind -= (mapelm & CHUNK_MAP_POS_MASK);
-		run = (arena_run_t *)((uintptr_t)chunk + (pageind <<
-		    pagesize_2pow));
+	mapbits = chunk->map[pageind].bits;
+	assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
+	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
+		arena_run_t *run = (arena_run_t *)(mapbits & ~pagesize_mask);
 		assert(run->magic == ARENA_RUN_MAGIC);
 		ret = run->bin->reg_size;
 	} else {
-		arena_t *arena = chunk->arena;
-		extent_node_t *node, key;
-
-		/* Large allocation size is in the extent tree. */
-		assert((mapelm & CHUNK_MAP_POS_MASK) == 0);
-		arena = chunk->arena;
-		malloc_spin_lock(&arena->lock);
-		key.addr = (void *)ptr;
-		node = extent_tree_ad_search(&arena->runs_alloced_ad, &key);
-		assert(node != NULL);
-		ret = node->size;
-		malloc_spin_unlock(&arena->lock);
+		ret = mapbits & ~pagesize_mask;
+		assert(ret != 0);
 	}
 
 	return (ret);
 }
 
 #if (defined(MALLOC_VALIDATE) || defined(MOZ_MEMORY_DARWIN))
 /*
  * Validate ptr before assuming that it points to an allocation.  Currently,
@@ -4046,64 +4178,26 @@ arena_salloc(const void *ptr)
  *
  * + Check that ptr lies within a mapped chunk.
  */
 static inline size_t
 isalloc_validate(const void *ptr)
 {
 	arena_chunk_t *chunk;
 
-	if (ptr == NULL)
+	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+	if (chunk == NULL)
 		return (0);
 
-	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+	if (malloc_rtree_get(chunk_rtree, (uintptr_t)chunk) == NULL)
+		return (0);
+
 	if (chunk != ptr) {
-		arena_t *arena;
-		unsigned i;
-
-		if (narenas > 1) {
-			/*
-			 * Use arenas_lock as a memory barrier in order to
-			 * force an update of this processor's cache, so that
-			 * the arenas vector is sufficiently current for us to
-			 * be sure of searching all the arenas that existed
-			 * when ptr was allocated.
-			 *
-			 * Only do this when using multiple arenas, since when
-			 * there is only one arena, there are no race
-			 * conditions that allow arenas[0] to be stale on this
-			 * processor under any conditions that even remotely
-			 * resemble normal program behavior.
-			 */
-			malloc_spin_lock(&arenas_lock);
-			malloc_spin_unlock(&arenas_lock);
-		}
-
-		for (i = 0; i < narenas; i++) {
-			arena = arenas[i];
-			if (arena != NULL) {
-				/* Make sure ptr is within a chunk. */
-				malloc_spin_lock(&arena->lock);
-				if (arena_chunk_tree_all_search(
-				    &arena->chunks_all, chunk) == chunk) {
-					malloc_spin_unlock(&arena->lock);
-					/*
-					 * We only lock in arena_salloc() for
-					 * large objects, so don't worry about
-					 * the overhead of possibly locking
-					 * twice.
-					 */
-					assert(chunk->arena->magic ==
-					    ARENA_MAGIC);
-					return (arena_salloc(ptr));
-				}
-				malloc_spin_unlock(&arena->lock);
-			}
-		}
-		return (0);
+		assert(chunk->arena->magic == ARENA_MAGIC);
+		return (arena_salloc(ptr));
 	} else {
 		size_t ret;
 		extent_node_t *node;
 		extent_node_t key;
 
 		/* Chunk. */
 		key.addr = (void *)chunk;
 		malloc_mutex_lock(&huge_mtx);
@@ -4149,25 +4243,23 @@ isalloc(const void *ptr)
 		malloc_mutex_unlock(&huge_mtx);
 	}
 
 	return (ret);
 }
 
 static inline void
 arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
-    size_t pageind, arena_chunk_map_t mapelm)
+    arena_chunk_map_t *mapelm)
 {
 	arena_run_t *run;
 	arena_bin_t *bin;
 	size_t size;
 
-	pageind -= (mapelm & CHUNK_MAP_POS_MASK);
-
-	run = (arena_run_t *)((uintptr_t)chunk + (pageind << pagesize_2pow));
+	run = (arena_run_t *)(mapelm->bits & ~pagesize_mask);
 	assert(run->magic == ARENA_RUN_MAGIC);
 	bin = run->bin;
 	size = bin->reg_size;
 
 #ifdef MALLOC_FILL
 	if (opt_junk)
 		memset(ptr, 0x5a, size);
 #endif
@@ -4175,23 +4267,28 @@ arena_dalloc_small(arena_t *arena, arena
 	arena_run_reg_dalloc(run, bin, ptr, size);
 	run->nfree++;
 
 	if (run->nfree == bin->nregs) {
 		/* Deallocate run. */
 		if (run == bin->runcur)
 			bin->runcur = NULL;
 		else if (bin->nregs != 1) {
+			size_t run_pageind = (((uintptr_t)run -
+			    (uintptr_t)chunk)) >> pagesize_2pow;
+			arena_chunk_map_t *run_mapelm =
+			    &chunk->map[run_pageind];
 			/*
 			 * This block's conditional is necessary because if the
 			 * run only contains one region, then it never gets
 			 * inserted into the non-full runs tree.
 			 */
-			assert(arena_run_tree_search(&bin->runs, run) == run);
-			arena_run_tree_remove(&bin->runs, run);
+			assert(arena_run_tree_search(&bin->runs, run_mapelm) ==
+			    run_mapelm);
+			arena_run_tree_remove(&bin->runs, run_mapelm);
 		}
 #ifdef MALLOC_DEBUG
 		run->magic = 0;
 #endif
 		VALGRIND_FREELIKE_BLOCK(run, 0);
 		arena_run_dalloc(arena, run, true);
 #ifdef MALLOC_STATS
 		bin->stats.curruns--;
@@ -4201,25 +4298,40 @@ arena_dalloc_small(arena_t *arena, arena
 		 * Make sure that bin->runcur always refers to the lowest
 		 * non-full run, if one exists.
 		 */
 		if (bin->runcur == NULL)
 			bin->runcur = run;
 		else if ((uintptr_t)run < (uintptr_t)bin->runcur) {
 			/* Switch runcur. */
 			if (bin->runcur->nfree > 0) {
+				arena_chunk_t *runcur_chunk =
+				    CHUNK_ADDR2BASE(bin->runcur);
+				size_t runcur_pageind =
+				    (((uintptr_t)bin->runcur -
+				    (uintptr_t)runcur_chunk)) >> pagesize_2pow;
+				arena_chunk_map_t *runcur_mapelm =
+				    &runcur_chunk->map[runcur_pageind];
+
 				/* Insert runcur. */
 				assert(arena_run_tree_search(&bin->runs,
-				    bin->runcur) == NULL);
-				arena_run_tree_insert(&bin->runs, bin->runcur);
+				    runcur_mapelm) == NULL);
+				arena_run_tree_insert(&bin->runs,
+				    runcur_mapelm);
 			}
 			bin->runcur = run;
 		} else {
-			assert(arena_run_tree_search(&bin->runs, run) == NULL);
-			arena_run_tree_insert(&bin->runs, run);
+			size_t run_pageind = (((uintptr_t)run -
+			    (uintptr_t)chunk)) >> pagesize_2pow;
+			arena_chunk_map_t *run_mapelm =
+			    &chunk->map[run_pageind];
+
+			assert(arena_run_tree_search(&bin->runs, run_mapelm) ==
+			    NULL);
+			arena_run_tree_insert(&bin->runs, run_mapelm);
 		}
 	}
 #ifdef MALLOC_STATS
 	arena->stats.allocated_small -= size;
 	arena->stats.ndalloc_small++;
 #endif
 }
 
@@ -4230,23 +4342,20 @@ arena_dalloc_large(arena_t *arena, arena
 	malloc_spin_lock(&arena->lock);
 
 #ifdef MALLOC_FILL
 #ifndef MALLOC_STATS
 	if (opt_junk)
 #endif
 #endif
 	{
-		extent_node_t *node, key;
-		size_t size;
-
-		key.addr = ptr;
-		node = extent_tree_ad_search(&arena->runs_alloced_ad, &key);
-		assert(node != NULL);
-		size = node->size;
+		size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >>
+		    pagesize_2pow;
+		size_t size = chunk->map[pageind].bits & ~pagesize_mask;
+
 #ifdef MALLOC_FILL
 #ifdef MALLOC_STATS
 		if (opt_junk)
 #endif
 			memset(ptr, 0x5a, size);
 #endif
 #ifdef MALLOC_STATS
 		arena->stats.allocated_large -= size;
@@ -4269,25 +4378,24 @@ arena_dalloc(arena_t *arena, arena_chunk
 	assert(arena != NULL);
 	assert(arena->magic == ARENA_MAGIC);
 	assert(chunk->arena == arena);
 	assert(ptr != NULL);
 	assert(CHUNK_ADDR2BASE(ptr) != ptr);
 
 	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> pagesize_2pow);
 	mapelm = &chunk->map[pageind];
-	if ((*mapelm & CHUNK_MAP_LARGE) == 0) {
+	assert((mapelm->bits & CHUNK_MAP_ALLOCATED) != 0);
+	if ((mapelm->bits & CHUNK_MAP_LARGE) == 0) {
 		/* Small allocation. */
 		malloc_spin_lock(&arena->lock);
-		arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm);
+		arena_dalloc_small(arena, chunk, ptr, mapelm);
 		malloc_spin_unlock(&arena->lock);
-	} else {
-		assert((*mapelm & CHUNK_MAP_POS_MASK) == 0);
+	} else
 		arena_dalloc_large(arena, chunk, ptr);
-	}
 	VALGRIND_FREELIKE_BLOCK(ptr, 0);
 }
 
 static inline void
 idalloc(void *ptr)
 {
 	arena_chunk_t *chunk;
 
@@ -4299,80 +4407,68 @@ idalloc(void *ptr)
 	else
 		huge_dalloc(ptr);
 }
 
 static void
 arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr,
     size_t size, size_t oldsize)
 {
-	extent_node_t *node, key;
 
 	assert(size < oldsize);
 
 	/*
 	 * Shrink the run, and make trailing pages available for other
 	 * allocations.
 	 */
-	key.addr = (void *)((uintptr_t)ptr);
 #ifdef MALLOC_BALANCE
 	arena_lock_balance(arena);
 #else
 	malloc_spin_lock(&arena->lock);
 #endif
-	node = extent_tree_ad_search(&arena->runs_alloced_ad, &key);
-	assert(node != NULL);
-	arena_run_trim_tail(arena, chunk, node, (arena_run_t *)ptr, oldsize,
-	    size, true);
+	arena_run_trim_tail(arena, chunk, (arena_run_t *)ptr, oldsize, size,
+	    true);
 #ifdef MALLOC_STATS
 	arena->stats.allocated_large -= oldsize - size;
 #endif
 	malloc_spin_unlock(&arena->lock);
 }
 
 static bool
 arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr,
     size_t size, size_t oldsize)
 {
-	extent_node_t *nodeC, key;
+	size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> pagesize_2pow;
+	size_t npages = oldsize >> pagesize_2pow;
+
+	assert(oldsize == (chunk->map[pageind].bits & ~pagesize_mask));
 
 	/* Try to extend the run. */
 	assert(size > oldsize);
-	key.addr = (void *)((uintptr_t)ptr + oldsize);
 #ifdef MALLOC_BALANCE
 	arena_lock_balance(arena);
 #else
 	malloc_spin_lock(&arena->lock);
 #endif
-	nodeC = extent_tree_ad_search(&arena->runs_avail_ad, &key);
-	if (nodeC != NULL && oldsize + nodeC->size >= size) {
-		extent_node_t *nodeA, *nodeB;
-
+	if (pageind + npages < chunk_npages && (chunk->map[pageind+npages].bits
+	    & CHUNK_MAP_ALLOCATED) == 0 && (chunk->map[pageind+npages].bits &
+	    ~pagesize_mask) >= size - oldsize) {
 		/*
 		 * The next run is available and sufficiently large.  Split the
 		 * following run, then merge the first part with the existing
-		 * allocation.  This results in a bit more tree manipulation
-		 * than absolutely necessary, but it substantially simplifies
-		 * the code.
+		 * allocation.
 		 */
-		arena_run_split(arena, (arena_run_t *)nodeC->addr, size -
-		    oldsize, false, false);
-
-		key.addr = ptr;
-		nodeA = extent_tree_ad_search(&arena->runs_alloced_ad, &key);
-		assert(nodeA != NULL);
-
-		key.addr = (void *)((uintptr_t)ptr + oldsize);
-		nodeB = extent_tree_ad_search(&arena->runs_alloced_ad, &key);
-		assert(nodeB != NULL);
-
-		nodeA->size += nodeB->size;
-
-		extent_tree_ad_remove(&arena->runs_alloced_ad, nodeB);
-		arena_chunk_node_dealloc(chunk, nodeB);
+		arena_run_split(arena, (arena_run_t *)((uintptr_t)chunk +
+		    ((pageind+npages) << pagesize_2pow)), size - oldsize, true,
+		    false);
+
+		chunk->map[pageind].bits = size | CHUNK_MAP_LARGE |
+		    CHUNK_MAP_ALLOCATED;
+		chunk->map[pageind+npages].bits = CHUNK_MAP_LARGE |
+		    CHUNK_MAP_ALLOCATED;
 
 #ifdef MALLOC_STATS
 		arena->stats.allocated_large += size - oldsize;
 #endif
 		malloc_spin_unlock(&arena->lock);
 		return (false);
 	}
 	malloc_spin_unlock(&arena->lock);
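
The in-place grow test above needs only two map reads: the page just past the run must have CHUNK_MAP_ALLOCATED clear, and the free-run size stored above pagesize_mask must cover the requested growth. A worked instance (page indices and sizes assumed, 4 kB pages): growing an 8 kB run at pages [5,6] to 16 kB while pages [7..10] form a 16 kB free run passes the test (16 kB >= 16 kB - 8 kB), arena_run_split() takes 8 kB off the front of the free run, and the two assignments restamp the map so the enlarged run reads as a single allocation:

	chunk->map[5].bits = 0x4000 | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
	chunk->map[7].bits = CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;	/* now interior */
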
@@ -4534,24 +4630,22 @@ arena_new(arena_t *arena)
 
 #ifdef MALLOC_STATS
 	memset(&arena->stats, 0, sizeof(arena_stats_t));
 #endif
 
 	arena->chunk_seq = 0;
 
 	/* Initialize chunks. */
-	arena_chunk_tree_all_new(&arena->chunks_all);
 	arena_chunk_tree_dirty_new(&arena->chunks_dirty);
+	arena->spare = NULL;
 
 	arena->ndirty = 0;
 
-	extent_tree_szad_new(&arena->runs_avail_szad);
-	extent_tree_ad_new(&arena->runs_avail_ad);
-	extent_tree_ad_new(&arena->runs_alloced_ad);
+	arena_avail_tree_new(&arena->runs_avail);
 
 #ifdef MALLOC_BALANCE
 	arena->contention = 0;
 #endif
 
 	/* Initialize bins. */
 	prev_run_size = pagesize;
 
@@ -5629,18 +5723,14 @@ MALLOC_OUT:
-		/*
-		 * Compute the header size such that it is large
-		 * enough to contain the page map and enough nodes for the
-		 * worst case: one node per non-header page plus one extra for
-		 * situations where we briefly have one more node allocated
-		 * than we will need.
-		 */
+		/*
+		 * Compute the header size such that it is large enough to
+		 * contain the page map.
+		 */
 		header_size = sizeof(arena_chunk_t) +
-		    (sizeof(arena_chunk_map_t) * (chunk_npages - 1)) +
-		    (sizeof(extent_node_t) * chunk_npages);
+		    (sizeof(arena_chunk_map_t) * (chunk_npages - 1));
 		arena_chunk_header_npages = (header_size >> pagesize_2pow) +
 		    ((header_size & pagesize_mask) != 0);
 	}
 	arena_maxclass = chunksize - (arena_chunk_header_npages <<
 	    pagesize_2pow);
 
 	UTRACE(0, 0, 0);
 
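With the per-chunk extent_node_t array gone, the chunk header is just the arena_chunk_t struct plus its page map, and the rounding above decides how many leading pages that costs. A worked instance under assumed parameters (1 MB chunks, 4 kB pages, so chunk_npages == 256 and pagesize_2pow == 12; structure sizes are platform-dependent):

	header_size = sizeof(arena_chunk_t) +
	    (sizeof(arena_chunk_map_t) * (256 - 1));
	arena_chunk_header_npages = (header_size >> 12) +	/* whole pages */
	    ((header_size & 0xfff) != 0);			/* plus a partial page */
	arena_maxclass = (1 << 20) - (arena_chunk_header_npages << 12);

If the header rounds up to a single page, arena_maxclass comes out to 1 MB - 4 kB = 1020 kB; a larger map (for example on a 64-bit build) can push the header into a second page and shave a further 4 kB off arena_maxclass.
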
@@ -5784,24 +5877,30 @@ MALLOC_OUT:
 	 * called for other threads.  The seed value doesn't really matter.
 	 */
 #ifdef MALLOC_BALANCE
 	SPRN(balance, 42);
 #endif
 
 	malloc_spin_init(&arenas_lock);
 
+#ifdef MALLOC_VALIDATE
+	chunk_rtree = malloc_rtree_new((SIZEOF_PTR << 3) - opt_chunk_2pow);
+	if (chunk_rtree == NULL)
+		return (true);
+#endif
+
 	/*
 	 * Configure and initialize the memory reserve.  This needs to happen
 	 * late during initialization, since chunks are allocated.
 	 */
 	malloc_mutex_init(&reserve_mtx);
 	reserve_min = 0;
 	reserve_cur = 0;
-	reserve_max = chunksize * narenas;
+	reserve_max = 0;
 	if (RESERVE_RANGE_2POW_DEFAULT + opt_reserve_range_lshift >= 0) {
 		reserve_max += chunksize << (RESERVE_RANGE_2POW_DEFAULT +
 		    opt_reserve_range_lshift);
 	}
 	ql_new(&reserve_regs);
 	reserve_seq = 0;
 	extent_tree_szad_new(&reserve_chunks_szad);
 	extent_tree_ad_new(&reserve_chunks_ad);
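
Under MALLOC_VALIDATE, the new chunk_rtree is sized for one key bit per address bit above the chunk offset, which is what (SIZEOF_PTR << 3) - opt_chunk_2pow computes. A worked instance (values assumed): on a 32-bit build with 1 MB chunks,

	unsigned key_bits = (SIZEOF_PTR << 3) - opt_chunk_2pow;	/* 32 - 20 == 12 */

so the radix tree needs to cover only 12 bits of key, while a 64-bit build with the same chunk size needs 64 - 20 == 44 bits. A failed malloc_rtree_new() is treated like any other initialization failure and makes the surrounding routine return true, as the hunk above shows.
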
@@ -6216,27 +6315,27 @@ jemalloc_stats(jemalloc_stats_t *stats)
 		arena_t *arena = arenas[i];
 		if (arena != NULL) {
 			arena_chunk_t *chunk;
 
 			malloc_spin_lock(&arena->lock);
 			stats->allocated += arena->stats.allocated_small;
 			stats->allocated += arena->stats.allocated_large;
 #ifdef MALLOC_DECOMMIT
-			rb_foreach_begin(arena_chunk_t, link_all,
-			    &arena->chunks_all, chunk) {
+			rb_foreach_begin(arena_chunk_t, link_dirty,
+			    &arena->chunks_dirty, chunk) {
 				size_t j;
 
 				for (j = 0; j < chunk_npages; j++) {
-					if ((chunk->map[j] &
+					if ((chunk->map[j].bits &
 					    CHUNK_MAP_DECOMMITTED) == 0)
 						stats->committed += pagesize;
 				}
-			} rb_foreach_end(arena_chunk_t, link_all,
-			    &arena->chunks_all, chunk)
+			} rb_foreach_end(arena_chunk_t, link_dirty,
+			    &arena->chunks_dirty, chunk)
 #endif
 			stats->dirty += (arena->ndirty << pagesize_2pow);
 			malloc_spin_unlock(&arena->lock);
 		}
 	}
 
 #ifndef MALLOC_DECOMMIT
 	stats->committed = stats->mapped;
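
With the chunks_all tree removed, the MALLOC_DECOMMIT accounting now walks the dirty-chunk tree and adds a page to the committed total whenever its map element does not have CHUNK_MAP_DECOMMITTED set. A worked instance (counts assumed): a chunk with chunk_npages == 256 of which 10 pages are decommitted contributes

	stats->committed += 246 * pagesize;	/* 246 * 4 kB == 984 kB */
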
@@ -6451,30 +6550,37 @@ reserve_shrink(void)
 			assert(node->size <= reserve_cur - reserve_max);
 #endif
 
 			/* Discard the entire [multi-]chunk. */
 			extent_tree_szad_remove(&reserve_chunks_szad, node);
 			extent_tree_ad_remove(&reserve_chunks_ad, node);
 			reserve_cur -= node->size;
 			pages_unmap(node->addr, node->size);
+#ifdef MALLOC_STATS
+			stats_chunks.curchunks -= (node->size / chunksize);
+#endif
 			base_node_dealloc(node);
 			if (reserve_cur == reserve_max)
 				break;
 
 			rb_foreach_reverse_prev(extent_node_t, link_ad,
 			    extent_ad_comp, &reserve_chunks_ad, tnode);
 #ifndef MALLOC_DECOMMIT
 		} else {
 			/* Discard the end of the multi-chunk. */
 			extent_tree_szad_remove(&reserve_chunks_szad, node);
 			node->size -= reserve_cur - reserve_max;
 			extent_tree_szad_insert(&reserve_chunks_szad, node);
 			pages_unmap((void *)((uintptr_t)node->addr +
 			    node->size), reserve_cur - reserve_max);
+#ifdef MALLOC_STATS
+			stats_chunks.curchunks -= ((reserve_cur - reserve_max) /
+			    chunksize);
+#endif
 			reserve_cur = reserve_max;
 			break;
 		}
 #endif
 		assert(reserve_cur > reserve_max);
 	} rb_foreach_reverse_end(extent_node_t, link_ad, &reserve_chunks_ad,
 	    node)
 }
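
The added MALLOC_STATS lines keep stats_chunks.curchunks in step with the memory actually unmapped while shrinking the reserve. A worked instance (sizes assumed, 1 MB chunks): discarding a whole 3 MB node decrements the counter by node->size / chunksize == 3, while trimming 2 MB off the end of a larger node decrements it by (reserve_cur - reserve_max) / chunksize == 2.
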
@@ -6778,17 +6884,16 @@ void
 	malloc_spin_unlock(&arenas_lock);
 }
 
 /*
  * End library-private functions.
  */
 /******************************************************************************/
 
-
 #ifdef MOZ_MEMORY_DARWIN
 static malloc_zone_t zone;
 static struct malloc_introspection_t zone_introspect;
 
 static size_t
 zone_size(malloc_zone_t *zone, void *ptr)
 {