bug 424040. add valgrind hooks to jemalloc. patch from Jason Evans <jasone@canonware.com> r=me
author       pavlov@pavlov.net
date         Tue, 08 Apr 2008 00:19:40 -0700
changeset    14033 f7d58808e9a9fd91263c585db286d195ffbd821e
parent       14032 eef38d5c92f71f33045a1960c0b6f7486c3e04a8
child        14034 84d19cc901a4b386398d085673f442796cfe762f
push id      11
push user    bsmedberg@mozilla.com
push date    Tue, 15 Apr 2008 18:11:53 +0000
treeherder   mozilla-central@40e4b99f0dea
reviewers    me
bugs         424040
milestone    1.9pre
configure.in
memory/jemalloc/jemalloc.c
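
The patch routes jemalloc's internal allocation events through Valgrind's
client-request macros: blocks that Valgrind cannot observe as ordinary
malloc/free pairs are announced with VALGRIND_MALLOCLIKE_BLOCK and retired
with VALGRIND_FREELIKE_BLOCK, and both macros collapse to no-ops when
MOZ_VALGRIND is not defined.  As a rough illustration of the pattern (not
part of the patch; the toy pool allocator and the HAVE_VALGRIND_H guard are
invented for the example), a custom allocator can be made Valgrind-aware
like this:

/*
 * Minimal sketch, not taken from the patch: a toy bump allocator that
 * reports its blocks to Valgrind the same way this patch teaches jemalloc
 * to.  HAVE_VALGRIND_H is an illustrative stand-in for MOZ_VALGRIND.
 */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#ifdef HAVE_VALGRIND_H
#  include <valgrind/valgrind.h>
#else
/* Same no-op fallback the patch adds to jemalloc.c. */
#  define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)
#  define VALGRIND_FREELIKE_BLOCK(addr, rzB)
#endif

static char pool[4096];
static size_t pool_used;

static void *
pool_alloc(size_t size)
{
	void *ret;

	if (pool_used + size > sizeof(pool))
		return (NULL);
	ret = pool + pool_used;
	pool_used += size;
	/* Tell memcheck this range is now a live, unzeroed allocation. */
	VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, 0);
	return (ret);
}

static void
pool_free(void *ptr)
{
	/* Mark the block as freed; later accesses become errors. */
	VALGRIND_FREELIKE_BLOCK(ptr, 0);
}

int
main(void)
{
	char *p = pool_alloc(16);

	if (p != NULL) {
		memcpy(p, "tracked block", 14);
		puts(p);
		pool_free(p);
	}
	return (0);
}

Under memcheck, any access to such a block after pool_free() is then
reported the same way a use-after-free of an ordinary heap allocation
would be.
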
--- a/configure.in
+++ b/configure.in
@@ -6139,17 +6139,17 @@ if test "$NS_TRACE_MALLOC"; then
   AC_DEFINE(NS_TRACE_MALLOC)
 fi
 AC_SUBST(NS_TRACE_MALLOC)
 
 dnl ========================================================
 dnl = Enable jemalloc
 dnl ========================================================
 MOZ_ARG_ENABLE_BOOL(jemalloc,
-[  --enable-jemalloc         Replace memory allocator with jemalloc],
+[  --enable-jemalloc       Replace memory allocator with jemalloc],
     MOZ_MEMORY=1,
     MOZ_MEMORY=)
 
 if test "$NS_TRACE_MALLOC"; then
     MOZ_MEMORY=
 fi
 
 if test "$MOZ_MEMORY"; then
@@ -6237,16 +6237,24 @@ if test "$MOZ_MEMORY"; then
     dnl Also pass this to NSPR/NSS
     DLLFLAGS="$DLLFLAGS -MANIFEST:NO"
     export DLLFLAGS
     ;;
   *)
     AC_MSG_ERROR([--enable-jemalloc not supported on ${target}])
     ;;
   esac
+
+  AC_ARG_WITH([valgrind],
+    [  --with-valgrind         Enable valgrind integration hooks],
+    [enable_valgrind="yes"], [enable_valgrind="no"])
+  AC_CHECK_HEADER([valgrind/valgrind.h], [], [enable_valgrind="no"])
+  if test "x$enable_valgrind" = "xyes" ; then
+    AC_DEFINE(MOZ_VALGRIND)
+  fi
 fi
 AC_SUBST(MOZ_MEMORY)
 AC_SUBST(WIN32_CRT_SRC_DIR)
 AC_SUBST(WIN32_CUSTOM_CRT_DIR)
 AC_SUBST(MOZ_LIB)
 AC_SUBST(MOZ_PATH)
 dnl Need to set this for make because NSS doesn't have configure
 AC_SUBST(DLLFLAGS)
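
The --with-valgrind switch added above defines MOZ_VALGRIND, which
jemalloc.c (below) maps onto MALLOC_VALGRIND.  One consequence appears in
iralloc(): since, as the patch's comment notes, Valgrind provides no public
interface for modifying an existing allocation, reallocation falls back to
allocate/copy/free whenever MALLOC_VALGRIND is enabled.  A minimal sketch
of that fallback, continuing the toy pool allocator above (pool_realloc is
an invented name, not a jemalloc function):

/*
 * Sketch only: resize a block whose lifetime is individually registered
 * with Valgrind.  Mirrors the MALLOC_VALGRIND branch added to iralloc():
 * allocate a new block, copy the smaller of the old and new sizes, then
 * release the old block so its registration is retired.
 */
static void *
pool_realloc(void *ptr, size_t oldsize, size_t size)
{
	void *ret = pool_alloc(size);

	if (ret != NULL) {
		memcpy(ret, ptr, oldsize < size ? oldsize : size);
		pool_free(ptr);
	}
	return (ret);
}

The trade-off is an extra allocation and copy on requests jemalloc could
otherwise resize in place, which is acceptable for a debugging
configuration.
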
--- a/memory/jemalloc/jemalloc.c
+++ b/memory/jemalloc/jemalloc.c
@@ -121,22 +121,26 @@
 
    /* Support optional abort() on OOM. */
 #  define MALLOC_XMALLOC
 
    /* Support SYSV semantics. */
 #  define MALLOC_SYSV
 #endif
 
-/*
- * MALLOC_LAZY_FREE enables the use of a per-thread vector of slots that free()
- * can atomically stuff object pointers into.  This can reduce arena lock
- * contention.
- */
-/* #define	MALLOC_LAZY_FREE */
+/* Embed no-op macros that support memory allocation tracking via valgrind. */
+#ifdef MOZ_VALGRIND
+#  define MALLOC_VALGRIND
+#endif
+#ifdef MALLOC_VALGRIND
+#  include <valgrind/valgrind.h>
+#else
+#  define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)
+#  define VALGRIND_FREELIKE_BLOCK(addr, rzB)
+#endif
 
 /*
  * MALLOC_BALANCE enables monitoring of arena lock contention and dynamically
  * re-balances arena load if exponentially averaged contention exceeds a
  * certain threshold.
  */
 /* #define	MALLOC_BALANCE */
 
@@ -405,20 +409,16 @@ static const bool __isthreaded = true;
 #  define NO_TLS
 #endif
 
 #ifdef NO_TLS
    /* MALLOC_BALANCE requires TLS. */
 #  ifdef MALLOC_BALANCE
 #    undef MALLOC_BALANCE
 #  endif
-   /* MALLOC_LAZY_FREE requires TLS. */
-#  ifdef MALLOC_LAZY_FREE
-#    undef MALLOC_LAZY_FREE
-#  endif
 #endif
 
 /*
  * Size and alignment of memory chunks that are allocated by the OS's virtual
  * memory system.
  */
 #define	CHUNK_2POW_DEFAULT	20
 
@@ -468,29 +468,16 @@ static const bool __isthreaded = true;
 /*
  * Put a cap on small object run size.  This overrides RUN_MAX_OVRHD.  Note
  * that small runs must be small enough that page offsets can fit within the
  * CHUNK_MAP_POS_MASK bits.
  */
 #define	RUN_MAX_SMALL_2POW	15
 #define	RUN_MAX_SMALL		(1U << RUN_MAX_SMALL_2POW)
 
-#ifdef MALLOC_LAZY_FREE
-   /* Default size of each arena's lazy free cache. */
-#  define LAZY_FREE_2POW_DEFAULT 8
-   /*
-    * Number of pseudo-random probes to conduct before considering the cache to
-    * be overly full.  It takes on average n probes to detect fullness of
-    * (n-1)/n.  However, we are effectively doing multiple non-independent
-    * trials (each deallocation is a trial), so the actual average threshold
-    * for clearing the cache is somewhat lower.
-    */
-#  define LAZY_FREE_NPROBES	5
-#endif
-
 /*
  * Hyper-threaded CPUs may need a special instruction inside spin loops in
  * order to yield to another virtual CPU.  If no such instruction is defined
  * above, make CPU_SPINWAIT a no-op.
  */
 #ifndef CPU_SPINWAIT
 #  define CPU_SPINWAIT
 #endif
@@ -858,26 +845,16 @@ struct arena_s {
 #ifdef MALLOC_BALANCE
 	/*
 	 * The arena load balancing machinery needs to keep track of how much
 	 * lock contention there is.  This value is exponentially averaged.
 	 */
 	uint32_t		contention;
 #endif
 
-#ifdef MALLOC_LAZY_FREE
-	/*
-	 * Deallocation of small objects can be lazy, in which case free_cache
-	 * stores pointers to those objects that have not yet been deallocated.
-	 * In order to avoid lock contention, slots are chosen randomly.  Empty
-	 * slots contain NULL.
-	 */
-	void			**free_cache;
-#endif
-
 	/*
 	 * bins is used to store rings of free regions of the following sizes,
 	 * assuming a 16-byte quantum, 4kB pagesize, and default MALLOC_OPTIONS.
 	 *
 	 *   bins[i] | size |
 	 *   --------+------+
 	 *        0  |    2 |
 	 *        1  |    4 |
@@ -1058,19 +1035,16 @@ static bool	opt_abort = false;
 static bool	opt_junk = false;
 #endif
 #endif
 #ifdef MALLOC_DSS
 static bool	opt_dss = true;
 static bool	opt_mmap = true;
 #endif
 static size_t	opt_dirty_max = DIRTY_MAX_DEFAULT;
-#ifdef MALLOC_LAZY_FREE
-static int	opt_lazy_free_2pow = LAZY_FREE_2POW_DEFAULT;
-#endif
 #ifdef MALLOC_BALANCE
 static uint64_t	opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT;
 #endif
 static bool	opt_print_stats = false;
 static size_t	opt_quantum_2pow = QUANTUM_2POW_MIN;
 static size_t	opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT;
 static size_t	opt_chunk_2pow = CHUNK_2POW_DEFAULT;
 #ifdef MALLOC_UTRACE
@@ -1173,20 +1147,16 @@ static void *arena_bin_malloc_hard(arena
 static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
 #ifdef MALLOC_BALANCE
 static void	arena_lock_balance_hard(arena_t *arena);
 #endif
 static void	*arena_malloc_large(arena_t *arena, size_t size, bool zero);
 static void	*arena_palloc(arena_t *arena, size_t alignment, size_t size,
     size_t alloc_size);
 static size_t	arena_salloc(const void *ptr);
-#ifdef MALLOC_LAZY_FREE
-static void	arena_dalloc_lazy_hard(arena_t *arena, arena_chunk_t *chunk,
-    void *ptr, size_t pageind, arena_chunk_map_t *mapelm);
-#endif
 static void	arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk,
     void *ptr);
 static void	arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk,
     void *ptr, size_t size, size_t oldsize);
 static bool	arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk,
     void *ptr, size_t size, size_t oldsize);
 static bool	arena_ralloc_large(void *ptr, size_t size, size_t oldsize);
 static void	*arena_ralloc(void *ptr, size_t size, size_t oldsize);
@@ -1467,17 +1437,17 @@ pow2_ceil(size_t x)
 	x |= x >> 16;
 #if (SIZEOF_PTR == 8)
 	x |= x >> 32;
 #endif
 	x++;
 	return (x);
 }
 
-#if (defined(MALLOC_LAZY_FREE) || defined(MALLOC_BALANCE))
+#ifdef MALLOC_BALANCE
 /*
  * Use a simple linear congruential pseudo-random number generator:
  *
  *   prn(y) = (a*x + c) % m
  *
  * where the following constants ensure maximal period:
  *
  *   a == Odd number (relatively prime to 2^n), and (a-1) is a multiple of 4.
@@ -1516,22 +1486,16 @@ prn_##suffix(uint32_t lg_range)						\
 #  define PRN(suffix, lg_range)	prn_##suffix(lg_range)
 #endif
 
 /*
  * Define PRNGs, one for each purpose, in order to avoid auto-correlation
  * problems.
  */
 
-#ifdef MALLOC_LAZY_FREE
-/* Define the per-thread PRNG used for lazy deallocation. */
-static __thread uint32_t lazy_free_x;
-PRN_DEFINE(lazy_free, lazy_free_x, 12345, 12347)
-#endif
-
 #ifdef MALLOC_BALANCE
 /* Define the PRNG used for arena assignment. */
 static __thread uint32_t balance_x;
 PRN_DEFINE(balance, balance_x, 1297, 1301)
 #endif
 
 #ifdef MALLOC_UTRACE
 static int
@@ -1780,54 +1744,65 @@ base_alloc(size_t size)
 		    (void *)(PAGE_CEILING((uintptr_t)base_next_addr));
 
 		pages_commit(base_next_decommitted, (uintptr_t)pbase_next_addr -
 		    (uintptr_t)base_next_decommitted);
 		base_next_decommitted = pbase_next_addr;
 	}
 #endif
 	malloc_mutex_unlock(&base_mtx);
+	VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, false);
 
 	return (ret);
 }
 
 static void *
 base_calloc(size_t number, size_t size)
 {
 	void *ret;
 
 	ret = base_alloc(number * size);
+#ifdef MALLOC_VALGRIND
+	if (ret != NULL) {
+		VALGRIND_FREELIKE_BLOCK(ret, 0);
+		VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, true);
+	}
+#endif
 	memset(ret, 0, number * size);
 
 	return (ret);
 }
 
 static extent_node_t *
 base_node_alloc(void)
 {
 	extent_node_t *ret;
 
 	malloc_mutex_lock(&base_mtx);
 	if (base_nodes != NULL) {
 		ret = base_nodes;
 		base_nodes = *(extent_node_t **)ret;
+		VALGRIND_FREELIKE_BLOCK(ret, 0);
+		VALGRIND_MALLOCLIKE_BLOCK(ret, sizeof(extent_node_t), 0, false);
 		malloc_mutex_unlock(&base_mtx);
 	} else {
 		malloc_mutex_unlock(&base_mtx);
 		ret = (extent_node_t *)base_alloc(sizeof(extent_node_t));
 	}
 
 	return (ret);
 }
 
 static void
 base_node_dealloc(extent_node_t *node)
 {
 
 	malloc_mutex_lock(&base_mtx);
+	VALGRIND_FREELIKE_BLOCK(node, 0);
+	VALGRIND_MALLOCLIKE_BLOCK(node, sizeof(extent_node_t *), 0, false);
 	*(extent_node_t **)node = base_nodes;
 	base_nodes = node;
 	malloc_mutex_unlock(&base_mtx);
 }
 
 /******************************************************************************/
 
 #ifdef MALLOC_STATS
@@ -2623,30 +2598,16 @@ choose_arena(void)
  */
 static arena_t *
 choose_arena_hard(void)
 {
 	arena_t *ret;
 
 	assert(__isthreaded);
 
-#ifdef MALLOC_LAZY_FREE
-	/*
-	 * Seed the PRNG used for lazy deallocation.  Since seeding only occurs
-	 * on the first allocation by a thread, it is possible for a thread to
-	 * deallocate before seeding.  This is not a critical issue though,
-	 * since it is extremely unusual for an application to to use threads
-	 * that deallocate but *never* allocate, and because even if seeding
-	 * never occurs for multiple threads, they will tend to drift apart
-	 * unless some aspect of the application forces deallocation
-	 * synchronization.
-	 */
-	SPRN(lazy_free, (uint32_t)(uintptr_t)(_pthread_self()));
-#endif
-
 #ifdef MALLOC_BALANCE
 	/*
 	 * Seed the PRNG used for arena load balancing.  We can get away with
 	 * using the same seed here as for the lazy_free PRNG without
 	 * introducing autocorrelation because the PRNG parameters are
 	 * distinct.
 	 */
 	SPRN(balance, (uint32_t)(uintptr_t)(_pthread_self()));
@@ -3012,16 +2973,18 @@ arena_chunk_alloc(arena_t *arena)
 
 	if (arena->spare != NULL) {
 		chunk = arena->spare;
 		arena->spare = NULL;
 	} else {
 		chunk = (arena_chunk_t *)chunk_alloc(chunksize, true);
 		if (chunk == NULL)
 			return (NULL);
+		VALGRIND_MALLOCLIKE_BLOCK(chunk, (arena_chunk_header_npages <<
+		    pagesize_2pow), 0, false);
 #ifdef MALLOC_STATS
 		arena->stats.mapped += chunksize;
 #endif
 
 		chunk->arena = arena;
 
 		RB_INSERT(arena_chunk_tree_s, &arena->chunks, chunk);
 
@@ -3084,16 +3047,17 @@ static void
 arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk)
 {
 	extent_node_t *node, key;
 
 	if (arena->spare != NULL) {
 		RB_REMOVE(arena_chunk_tree_s, &chunk->arena->chunks,
 		    arena->spare);
 		arena->ndirty -= arena->spare->ndirty;
+		VALGRIND_FREELIKE_BLOCK(arena->spare, 0);
 		chunk_dealloc((void *)arena->spare, chunksize);
 #ifdef MALLOC_STATS
 		arena->stats.mapped -= chunksize;
 #endif
 	}
 
 	/*
 	 * Remove run from the runs trees, regardless of whether this chunk
@@ -3397,16 +3361,19 @@ arena_bin_nonfull_run_get(arena_t *arena
 	}
 	/* No existing runs have any space available. */
 
 	/* Allocate a new run. */
 	run = arena_run_alloc(arena, bin->run_size, true, false);
 	if (run == NULL)
 		return (NULL);
 
+	VALGRIND_MALLOCLIKE_BLOCK(run, sizeof(arena_run_t) + (sizeof(unsigned) *
+	    bin->regs_mask_nelms - 1), 0, false);
+
 	/* Initialize run internals. */
 	run->bin = bin;
 
 	for (i = 0; i < bin->regs_mask_nelms; i++)
 		run->regs_mask[i] = UINT_MAX;
 	remainder = bin->nregs & ((1U << (SIZEOF_INT_2POW + 3)) - 1);
 	if (remainder != 0) {
 		/* The last element has spare bits that need to be unset. */
@@ -3651,16 +3618,17 @@ arena_malloc_small(arena_t *arena, size_
 
 #ifdef MALLOC_STATS
 	bin->stats.nrequests++;
 	arena->stats.nmalloc_small++;
 	arena->stats.allocated_small += size;
 #endif
 	malloc_spin_unlock(&arena->lock);
 
+	VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, zero);
 	if (zero == false) {
 #ifdef MALLOC_FILL
 		if (opt_junk)
 			memset(ret, 0xa5, size);
 		else if (opt_zero)
 			memset(ret, 0, size);
 #endif
 	} else
@@ -3687,16 +3655,17 @@ arena_malloc_large(arena_t *arena, size_
 		return (NULL);
 	}
 #ifdef MALLOC_STATS
 	arena->stats.nmalloc_large++;
 	arena->stats.allocated_large += size;
 #endif
 	malloc_spin_unlock(&arena->lock);
 
+	VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, zero);
 	if (zero == false) {
 #ifdef MALLOC_FILL
 		if (opt_junk)
 			memset(ret, 0xa5, size);
 		else if (opt_zero)
 			memset(ret, 0, size);
 #endif
 	}
@@ -3808,16 +3777,17 @@ arena_palloc(arena_t *arena, size_t alig
 	}
 
 #ifdef MALLOC_STATS
 	arena->stats.nmalloc_large++;
 	arena->stats.allocated_large += size;
 #endif
 	malloc_spin_unlock(&arena->lock);
 
+	VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, false);
 #ifdef MALLOC_FILL
 	if (opt_junk)
 		memset(ret, 0xa5, size);
 	else if (opt_zero)
 		memset(ret, 0, size);
 #endif
 	return (ret);
 }
@@ -4026,16 +3996,17 @@ arena_dalloc_small(arena_t *arena, arena
 			 * run only contains one region, then it never gets
 			 * inserted into the non-full runs tree.
 			 */
 			RB_REMOVE(arena_run_tree_s, &bin->runs, run);
 		}
 #ifdef MALLOC_DEBUG
 		run->magic = 0;
 #endif
+		VALGRIND_FREELIKE_BLOCK(run, 0);
 		arena_run_dalloc(arena, run, true);
 #ifdef MALLOC_STATS
 		bin->stats.curruns--;
 #endif
 	} else if (run->nfree == 1 && run != bin->runcur) {
 		/*
 		 * Make sure that bin->runcur always refers to the lowest
 		 * non-full run, if one exists.
@@ -4054,100 +4025,16 @@ arena_dalloc_small(arena_t *arena, arena
 			RB_INSERT(arena_run_tree_s, &bin->runs, run);
 	}
 #ifdef MALLOC_STATS
 	arena->stats.allocated_small -= size;
 	arena->stats.ndalloc_small++;
 #endif
 }
 
-#ifdef MALLOC_LAZY_FREE
-static inline void
-arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr,
-    size_t pageind, arena_chunk_map_t *mapelm)
-{
-	void **free_cache = arena->free_cache;
-	unsigned i, slot;
-
-	if (__isthreaded == false || opt_lazy_free_2pow < 0) {
-		malloc_spin_lock(&arena->lock);
-		arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm);
-		malloc_spin_unlock(&arena->lock);
-		return;
-	}
-
-	for (i = 0; i < LAZY_FREE_NPROBES; i++) {
-		slot = PRN(lazy_free, opt_lazy_free_2pow);
-		if (atomic_cmpset_ptr((uintptr_t *)&free_cache[slot],
-		    (uintptr_t)NULL, (uintptr_t)ptr)) {
-			return;
-		}
-	}
-
-	arena_dalloc_lazy_hard(arena, chunk, ptr, pageind, mapelm);
-}
-
-static void
-arena_dalloc_lazy_hard(arena_t *arena, arena_chunk_t *chunk, void *ptr,
-    size_t pageind, arena_chunk_map_t *mapelm)
-{
-	void **free_cache = arena->free_cache;
-	unsigned i, slot;
-
-	malloc_spin_lock(&arena->lock);
-	arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm);
-
-	/*
-	 * Check whether another thread already cleared the cache.  It is
-	 * possible that another thread cleared the cache *and* this slot was
-	 * already refilled, which could result in a mostly fruitless cache
-	 * sweep, but such a sequence of events causes no correctness issues.
-	 */
-	if ((ptr = (void *)atomic_readandclear_ptr(
-	    (uintptr_t *)&free_cache[slot]))
-	    != NULL) {
-		unsigned lazy_free_mask;
-		
-		/*
-		 * Clear the cache, since we failed to find a slot.  It is
-		 * possible that other threads will continue to insert objects
-		 * into the cache while this one sweeps, but that is okay,
-		 * since on average the cache is still swept with the same
-		 * frequency.
-		 */
-
-		/* Handle pointer at current slot. */
-		chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-		pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >>
-		    pagesize_2pow);
-		mapelm = &chunk->map[pageind];
-		arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm);
-
-		/* Sweep remainder of slots. */
-		lazy_free_mask = (1U << opt_lazy_free_2pow) - 1;
-		for (i = (slot + 1) & lazy_free_mask;
-		     i != slot;
-		     i = (i + 1) & lazy_free_mask) {
-			ptr = (void *)atomic_readandclear_ptr(
-			    (uintptr_t *)&free_cache[i]);
-			if (ptr != NULL) {
-				chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
-				pageind = (((uintptr_t)ptr - (uintptr_t)chunk)
-				    >> pagesize_2pow);
-				mapelm = &chunk->map[pageind];
-				arena_dalloc_small(arena, chunk, ptr, pageind,
-				    *mapelm);
-			}
-		}
-	}
-
-	malloc_spin_unlock(&arena->lock);
-}
-#endif
-
 static void
 arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr)
 {
 	/* Large allocation. */
 	malloc_spin_lock(&arena->lock);
 
 #ifdef MALLOC_FILL
 #ifndef MALLOC_STATS
@@ -4192,27 +4079,24 @@ arena_dalloc(arena_t *arena, arena_chunk
 	assert(chunk->arena == arena);
 	assert(ptr != NULL);
 	assert(CHUNK_ADDR2BASE(ptr) != ptr);
 
 	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> pagesize_2pow);
 	mapelm = &chunk->map[pageind];
 	if ((*mapelm & CHUNK_MAP_LARGE) == 0) {
 		/* Small allocation. */
-#ifdef MALLOC_LAZY_FREE
-		arena_dalloc_lazy(arena, chunk, ptr, pageind, mapelm);
-#else
 		malloc_spin_lock(&arena->lock);
 		arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm);
 		malloc_spin_unlock(&arena->lock);
-#endif
 	} else {
 		assert((*mapelm & CHUNK_MAP_POS_MASK) == 0);
 		arena_dalloc_large(arena, chunk, ptr);
 	}
+	VALGRIND_FREELIKE_BLOCK(ptr, 0);
 }
 
 static inline void
 idalloc(void *ptr)
 {
 	arena_chunk_t *chunk;
 
 	assert(ptr != NULL);
@@ -4419,20 +4303,38 @@ iralloc(void *ptr, size_t size)
 {
 	size_t oldsize;
 
 	assert(ptr != NULL);
 	assert(size != 0);
 
 	oldsize = isalloc(ptr);
 
+#ifndef MALLOC_VALGRIND
 	if (size <= arena_maxclass)
 		return (arena_ralloc(ptr, size, oldsize));
 	else
 		return (huge_ralloc(ptr, size, oldsize));
+#else
+	/*
+	 * Valgrind does not provide a public interface for modifying an
+	 * existing allocation, so use malloc/memcpy/free instead.
+	 */
+	{
+		void *ret = imalloc(size);
+		if (ret != NULL) {
+			if (oldsize < size)
+			    memcpy(ret, ptr, oldsize);
+			else
+			    memcpy(ret, ptr, size);
+			idalloc(ptr);
+		}
+		return (ret);
+	}
+#endif
 }
 
 static bool
 arena_new(arena_t *arena)
 {
 	unsigned i;
 	arena_bin_t *bin;
 	size_t pow2_size, prev_run_size;
@@ -4452,25 +4354,16 @@ arena_new(arena_t *arena)
 
 	RB_INIT(&arena->runs_avail_szad);
 	RB_INIT(&arena->runs_avail_ad);
 	RB_INIT(&arena->runs_alloced_ad);
 
 #ifdef MALLOC_BALANCE
 	arena->contention = 0;
 #endif
-#ifdef MALLOC_LAZY_FREE
-	if (opt_lazy_free_2pow >= 0) {
-		arena->free_cache = (void **) base_calloc(1, sizeof(void *)
-		    * (1U << opt_lazy_free_2pow));
-		if (arena->free_cache == NULL)
-			return (true);
-	} else
-		arena->free_cache = NULL;
-#endif
 
 	/* Initialize bins. */
 	prev_run_size = pagesize;
 
 	/* (2^n)-spaced tiny bins. */
 	for (i = 0; i < ntbins; i++) {
 		bin = &arena->bins[i];
 		bin->runcur = NULL;
@@ -4610,16 +4503,22 @@ huge_malloc(size_t size, bool zero)
 #endif
 	malloc_mutex_unlock(&huge_mtx);
 
 #ifdef MALLOC_DECOMMIT
 	if (csize - psize > 0)
 		pages_decommit((void *)((uintptr_t)ret + psize), csize - psize);
 #endif
 
+#ifdef MALLOC_DECOMMIT
+	VALGRIND_MALLOCLIKE_BLOCK(ret, psize, 0, zero);
+#else
+	VALGRIND_MALLOCLIKE_BLOCK(ret, csize, 0, zero);
+#endif
+
 #ifdef MALLOC_FILL
 	if (zero == false) {
 		if (opt_junk)
 #  ifdef MALLOC_DECOMMIT
 			memset(ret, 0xa5, psize);
 #  else
 			memset(ret, 0xa5, csize);
 #  endif
@@ -4749,16 +4648,22 @@ huge_palloc(size_t alignment, size_t siz
 
 #ifdef MALLOC_DECOMMIT
 	if (chunk_size - psize > 0) {
 		pages_decommit((void *)((uintptr_t)ret + psize),
 		    chunk_size - psize);
 	}
 #endif
 
+#ifdef MALLOC_DECOMMIT
+	VALGRIND_MALLOCLIKE_BLOCK(ret, psize, 0, false);
+#else
+	VALGRIND_MALLOCLIKE_BLOCK(ret, chunk_size, 0, false);
+#endif
+
 #ifdef MALLOC_FILL
 	if (opt_junk)
 #  ifdef MALLOC_DECOMMIT
 		memset(ret, 0xa5, psize);
 #  else
 		memset(ret, 0xa5, chunk_size);
 #  endif
 	else if (opt_zero)
@@ -4885,16 +4790,17 @@ huge_dalloc(void *ptr)
 		memset(node->addr, 0x5a, node->size);
 #endif
 #endif
 #ifdef MALLOC_DECOMMIT
 	chunk_dealloc(node->addr, CHUNK_CEILING(node->size));
 #else
 	chunk_dealloc(node->addr, node->size);
 #endif
+	VALGRIND_FREELIKE_BLOCK(node->addr, 0);
 
 	base_node_dealloc(node);
 }
 
 #ifdef MOZ_MEMORY_BSD
 static inline unsigned
 malloc_ncpus(void)
 {
@@ -5080,23 +4986,16 @@ malloc_print_stats(void)
 #endif
 #ifdef MALLOC_FILL
 		_malloc_message(opt_zero ? "Z" : "z", "", "", "");
 #endif
 		_malloc_message("\n", "", "", "");
 
 		_malloc_message("CPUs: ", umax2s(ncpus, s), "\n", "");
 		_malloc_message("Max arenas: ", umax2s(narenas, s), "\n", "");
-#ifdef MALLOC_LAZY_FREE
-		if (opt_lazy_free_2pow >= 0) {
-			_malloc_message("Lazy free slots: ",
-			    umax2s(1U << opt_lazy_free_2pow, s), "\n", "");
-		} else
-			_malloc_message("Lazy free slots: 0\n", "", "", "");
-#endif
 #ifdef MALLOC_BALANCE
 		_malloc_message("Arena balance threshold: ",
 		    umax2s(opt_balance_threshold, s), "\n", "");
 #endif
 		_malloc_message("Pointer size: ", umax2s(sizeof(void *), s),
 		    "\n", "");
 		_malloc_message("Quantum size: ", umax2s(quantum, s), "\n", "");
 		_malloc_message("Max small size: ", umax2s(small_max, s), "\n",
@@ -5276,21 +5175,16 @@ malloc_init_hard(void)
 	/*
 	 * We assume that pagesize is a power of 2 when calculating
 	 * pagesize_mask and pagesize_2pow.
 	 */
 	assert(((result - 1) & result) == 0);
 	pagesize_mask = result - 1;
 	pagesize_2pow = ffs((int)result) - 1;
 
-#ifdef MALLOC_LAZY_FREE
-		if (ncpus == 1)
-			opt_lazy_free_2pow = -1;
-#endif
-
 	for (i = 0; i < 3; i++) {
 		unsigned j;
 
 		/* Get runtime configuration. */
 		switch (i) {
 		case 0:
 #ifndef MOZ_MEMORY_WINDOWS
 			if ((linklen = readlink("/etc/malloc.conf", buf,
@@ -5422,28 +5316,16 @@ MALLOC_OUT:
 					if (opt_chunk_2pow > pagesize_2pow + 1)
 						opt_chunk_2pow--;
 					break;
 				case 'K':
 					if (opt_chunk_2pow + 1 <
 					    (sizeof(size_t) << 3))
 						opt_chunk_2pow++;
 					break;
-				case 'l':
-#ifdef MALLOC_LAZY_FREE
-					if (opt_lazy_free_2pow >= 0)
-						opt_lazy_free_2pow--;
-#endif
-					break;
-				case 'L':
-#ifdef MALLOC_LAZY_FREE
-					if (ncpus > 1)
-						opt_lazy_free_2pow++;
-#endif
-					break;
 				case 'm':
 #ifdef MALLOC_DSS
 					opt_mmap = false;
 #endif
 					break;
 				case 'M':
 #ifdef MALLOC_DSS
 					opt_mmap = true;
@@ -5580,24 +5462,16 @@ MALLOC_OUT:
 		header_size = sizeof(arena_chunk_t) +
 		    (sizeof(arena_chunk_map_t) * (chunk_npages - 1)) +
 		    (sizeof(extent_node_t) * chunk_npages);
 		arena_chunk_header_npages = (header_size >> pagesize_2pow) +
 		    ((header_size & pagesize_mask) != 0);
 	}
 	arena_maxclass = chunksize - (arena_chunk_header_npages <<
 	    pagesize_2pow);
-#ifdef MALLOC_LAZY_FREE
-	/*
-	 * Make sure that allocating the free_cache does not exceed the limits
-	 * of what base_alloc() can handle.
-	 */
-	while ((sizeof(void *) << opt_lazy_free_2pow) > chunksize)
-		opt_lazy_free_2pow--;
-#endif
 
 	UTRACE(0, 0, 0);
 
 #ifdef MALLOC_STATS
 	memset(&stats_chunks, 0, sizeof(chunk_stats_t));
 #endif
 
 	/* Various sanity checks that regard configuration. */
@@ -5742,19 +5616,16 @@ MALLOC_OUT:
 	arenas_map = arenas[0];
 #endif
 #endif
 
 	/*
 	 * Seed here for the initial thread, since choose_arena_hard() is only
 	 * called for other threads.  The seed values don't really matter.
 	 */
-#ifdef MALLOC_LAZY_FREE
-	SPRN(lazy_free, 42);
-#endif
 #ifdef MALLOC_BALANCE
 	SPRN(balance, 42);
 #endif
 
 	malloc_spin_init(&arenas_lock);
 
 	malloc_initialized = true;
 #ifndef MOZ_MEMORY_WINDOWS