Bug 1519454 - Hook up MallocSizeOf and use it to replace some manual reporting. r=emilio
☠☠ backed out by 5d2e7b3ecb63 ☠ ☠
author Bobby Holley <bobbyholley@gmail.com>
Sat, 12 Jan 2019 19:21:58 +0000
changeset 453641 e9b1d04247debbf9506351b855091546ce59c4d1
parent 453640 f3ef205b4f632c18d20eb83e7b5f6a0e3ca77b21
child 453642 054028a8d4a75083599eda3791913d0d66aa1b8d
push id 35364
push user dvarga@mozilla.com
push date Sun, 13 Jan 2019 10:04:23 +0000
treeherder mozilla-central@173e847312e0 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers emilio
bugs 1519454
milestone 66.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1519454 - Hook up MallocSizeOf and use it to replace some manual reporting. r=emilio

Depends on D16353

Differential Revision: https://phabricator.services.mozilla.com/D16354
gfx/wr/webrender/src/clip.rs
gfx/wr/webrender/src/clip_scroll_tree.rs
gfx/wr/webrender/src/gpu_cache.rs
gfx/wr/webrender/src/hit_test.rs
gfx/wr/webrender/src/intern.rs
gfx/wr/webrender/src/lib.rs
gfx/wr/webrender/src/prim_store/mod.rs
gfx/wr/webrender/src/render_backend.rs
gfx/wr/webrender/src/renderer.rs
gfx/wr/webrender/src/resource_cache.rs
gfx/wr/webrender/src/scene_builder.rs
gfx/wr/webrender/src/util.rs
gfx/wr/webrender_api/src/api.rs
gfx/wr/webrender_api/src/display_item.rs
gfx/wr/webrender_api/src/lib.rs
gfx/wr/webrender_api/src/units.rs
gfx/wr/wr_malloc_size_of/lib.rs
--- a/gfx/wr/webrender/src/clip.rs
+++ b/gfx/wr/webrender/src/clip.rs
@@ -1,33 +1,32 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use api::{BorderRadius, ClipMode, ComplexClipRegion, DeviceIntRect, DevicePixelScale, ImageMask};
 use api::{ImageRendering, LayoutRect, LayoutSize, LayoutPoint, LayoutVector2D};
 use api::{BoxShadowClipMode, LayoutToWorldScale, PicturePixel, WorldPixel};
 use api::{PictureRect, LayoutPixel, WorldPoint, WorldSize, WorldRect, LayoutToWorldTransform};
-use api::{VoidPtrToSizeFn, ImageKey};
+use api::{ImageKey};
 use app_units::Au;
 use border::{ensure_no_corner_overlap, BorderRadiusAu};
 use box_shadow::{BLUR_SAMPLE_SCALE, BoxShadowClipSource, BoxShadowCacheKey};
 use clip_scroll_tree::{ClipScrollTree, ROOT_SPATIAL_NODE_INDEX, SpatialNodeIndex};
 use ellipse::Ellipse;
 use gpu_cache::{GpuCache, GpuCacheHandle, ToGpuBlocks};
 use gpu_types::{BoxShadowStretchMode};
 use image::{self, Repetition};
 use intern;
 use internal_types::FastHashSet;
 use prim_store::{ClipData, ImageMaskData, SpaceMapper, VisibleMaskImageTile};
 use prim_store::{PointKey, PrimitiveInstance, SizeKey, RectangleKey};
 use render_task::to_cache_size;
 use resource_cache::{ImageRequest, ResourceCache};
 use std::{cmp, u32};
-use std::os::raw::c_void;
 use util::{extract_inner_rect_safe, project_rect, ScaleOffset};
 
 /*
 
  Module Overview
 
  There are a number of data structures involved in the clip module:
 
@@ -102,17 +101,17 @@ use util::{extract_inner_rect_safe, proj
     | ...              | ...              | ...              | ...              | ...              |
     +------------------+------------------+------------------+------------------+------------------+
 
  */
 
 // Type definitions for interning clip nodes.
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
-#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+#[derive(Clone, Copy, Debug, Eq, Hash, MallocSizeOf, PartialEq)]
 pub struct ClipDataMarker;
 
 pub type ClipDataStore = intern::DataStore<ClipItemKey, ClipNode, ClipDataMarker>;
 pub type ClipDataHandle = intern::Handle<ClipDataMarker>;
 pub type ClipDataUpdateList = intern::UpdateList<ClipItemKey>;
 pub type ClipDataInterner = intern::Interner<ClipItemKey, (), ClipDataMarker>;
 
 // Result of comparing a clip node instance against a local rect.
@@ -181,54 +180,55 @@ impl From<ClipItemKey> for ClipNode {
         }
     }
 }
 
 // Flags that are attached to instances of clip nodes.
 bitflags! {
     #[cfg_attr(feature = "capture", derive(Serialize))]
     #[cfg_attr(feature = "replay", derive(Deserialize))]
+    #[derive(MallocSizeOf)]
     pub struct ClipNodeFlags: u8 {
         const SAME_SPATIAL_NODE = 0x1;
         const SAME_COORD_SYSTEM = 0x2;
     }
 }
 
 // Identifier for a clip chain. Clip chains are stored
 // in a contiguous array in the clip store. They are
 // identified by a simple index into that array.
-#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+#[derive(Clone, Copy, Debug, Eq, MallocSizeOf, PartialEq, Hash)]
 pub struct ClipChainId(pub u32);
 
 // The root of each clip chain is the NONE id. The
 // value is specifically set to u32::MAX so that if
 // any code accidentally tries to access the root
 // node, a bounds error will occur.
 impl ClipChainId {
     pub const NONE: Self = ClipChainId(u32::MAX);
     pub const INVALID: Self = ClipChainId(0xDEADBEEF);
 }
 
 // A clip chain node is an id for a range of clip sources,
 // and a link to a parent clip chain node, or ClipChainId::NONE.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, MallocSizeOf)]
 pub struct ClipChainNode {
     pub handle: ClipDataHandle,
     pub local_pos: LayoutPoint,
     pub spatial_node_index: SpatialNodeIndex,
     pub parent_clip_chain_id: ClipChainId,
 }
 
 // When a clip node is found to be valid for a
 // clip chain instance, it's stored in an index
 // buffer style structure. This struct contains
 // an index to the node data itself, as well as
 // some flags describing how this clip node instance
 // is positioned.
-#[derive(Debug)]
+#[derive(Debug, MallocSizeOf)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct ClipNodeInstance {
     pub handle: ClipDataHandle,
     pub flags: ClipNodeFlags,
     pub spatial_node_index: SpatialNodeIndex,
     pub local_pos: LayoutPoint,
 
@@ -245,25 +245,26 @@ pub struct ClipNodeRange {
     pub count: u32,
 }
 
 // A helper struct for converting between coordinate systems
 // of clip sources and primitives.
 // todo(gw): optimize:
 //  separate arrays for matrices
 //  cache and only build as needed.
-#[derive(Debug)]
+#[derive(Debug, MallocSizeOf)]
 enum ClipSpaceConversion {
     Local,
     ScaleOffset(ScaleOffset),
     Transform(LayoutToWorldTransform),
 }
 
 // Temporary information that is cached and reused
 // during building of a clip chain instance.
+#[derive(MallocSizeOf)]
 struct ClipNodeInfo {
     conversion: ClipSpaceConversion,
     handle: ClipDataHandle,
     local_pos: LayoutPoint,
     spatial_node_index: SpatialNodeIndex,
 }
 
 impl ClipNodeInfo {
@@ -429,17 +430,18 @@ impl ClipNode {
                     let data = ClipData::rounded_rect(size, radius, mode);
                     data.write(&mut request);
                 }
             }
         }
     }
 }
 
-// The main clipping public interface that other modules access.
+/// The main clipping public interface that other modules access.
+#[derive(MallocSizeOf)]
 pub struct ClipStore {
     pub clip_chain_nodes: Vec<ClipChainNode>,
     clip_node_instances: Vec<ClipNodeInstance>,
     clip_node_info: Vec<ClipNodeInfo>,
     clip_node_collectors: Vec<ClipNodeCollector>,
 }
 
 // A clip chain instance is what gets built for a given clip
@@ -703,27 +705,16 @@ impl ClipStore {
             pic_clip_rect,
             needs_mask,
         })
     }
 
     pub fn clear_old_instances(&mut self) {
         self.clip_node_instances.clear();
     }
-
-    /// Reports the heap usage of this clip store.
-    pub fn malloc_size_of(&self, op: VoidPtrToSizeFn) -> usize {
-        let mut size = 0;
-        unsafe {
-            size += op(self.clip_chain_nodes.as_ptr() as *const c_void);
-            size += op(self.clip_node_instances.as_ptr() as *const c_void);
-            size += op(self.clip_node_info.as_ptr() as *const c_void);
-        }
-        size
-    }
 }
 
 pub struct ComplexTranslateIter<I> {
     source: I,
     offset: LayoutVector2D,
 }
 
 impl<I: Iterator<Item = ComplexClipRegion>> Iterator for ComplexTranslateIter<I> {
@@ -1282,17 +1273,17 @@ pub fn project_inner_rect(
     Some(WorldRect::new(
         WorldPoint::new(xs[1], ys[1]),
         WorldSize::new(xs[2] - xs[1], ys[2] - ys[1]),
     ))
 }
 
 // Collects a list of unique clips to be applied to a rasterization
 // root at the end of primitive preparation.
-#[derive(Debug)]
+#[derive(Debug, MallocSizeOf)]
 pub struct ClipNodeCollector {
     spatial_node_index: SpatialNodeIndex,
     clips: FastHashSet<ClipChainId>,
 }
 
 impl ClipNodeCollector {
     pub fn new(
         spatial_node_index: SpatialNodeIndex,
--- a/gfx/wr/webrender/src/clip_scroll_tree.rs
+++ b/gfx/wr/webrender/src/clip_scroll_tree.rs
@@ -36,17 +36,17 @@ impl CoordinateSystem {
     fn root() -> Self {
         CoordinateSystem {
             transform: LayoutTransform::identity(),
             parent: None,
         }
     }
 }
 
-#[derive(Debug, Copy, Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
+#[derive(Debug, Copy, Clone, Eq, Hash, MallocSizeOf, PartialEq, PartialOrd, Ord)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct SpatialNodeIndex(pub u32);
 
 //Note: these have to match ROOT_REFERENCE_FRAME_SPATIAL_ID and ROOT_SCROLL_NODE_SPATIAL_ID
 pub const ROOT_SPATIAL_NODE_INDEX: SpatialNodeIndex = SpatialNodeIndex(0);
 const TOPMOST_SCROLL_NODE_INDEX: SpatialNodeIndex = SpatialNodeIndex(1);
 
--- a/gfx/wr/webrender/src/gpu_cache.rs
+++ b/gfx/wr/webrender/src/gpu_cache.rs
@@ -20,26 +20,24 @@
 //! will be invoked to build the data.
 //!
 //! After ```end_frame``` has occurred, callers can
 //! use the ```get_address``` API to get the allocated
 //! address in the GPU cache of a given resource slot
 //! for this frame.
 
 use api::{DebugFlags, DocumentId, PremultipliedColorF, IdNamespace, TexelRect};
-use api::{VoidPtrToSizeFn};
 use euclid::TypedRect;
 use internal_types::{FastHashMap};
 use profiler::GpuCacheProfileCounters;
 use render_backend::{FrameStamp, FrameId};
 use renderer::MAX_VERTEX_TEXTURE_WIDTH;
 use std::{mem, u16, u32};
 use std::num::NonZeroU32;
 use std::ops::Add;
-use std::os::raw::c_void;
 use std::time::{Duration, Instant};
 
 
 /// At the time of this writing, Firefox uses about 15 GPU cache rows on
 /// startup, and then gradually works its way up to the mid-30s with normal
 /// browsing.
 pub const GPU_CACHE_INITIAL_HEIGHT: i32 = 20;
 const NEW_ROWS_PER_RESIZE: i32 = 10;
@@ -50,17 +48,17 @@ const FRAMES_BEFORE_EVICTION: usize = 10
 /// The ratio of utilized blocks to total blocks for which we start the clock
 /// on reclaiming memory.
 const RECLAIM_THRESHOLD: f32 = 0.2;
 
 /// The amount of time utilization must be below the above threshold before we
 /// blow away the cache and rebuild it.
 const RECLAIM_DELAY_S: u64 = 5;
 
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+#[derive(Debug, Copy, Clone, Eq, MallocSizeOf, PartialEq)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 struct Epoch(u32);
 
 impl Epoch {
     fn next(&mut self) {
         *self = Epoch(self.0.wrapping_add(1));
     }
@@ -70,17 +68,17 @@ impl Epoch {
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 struct CacheLocation {
     block_index: BlockIndex,
     epoch: Epoch,
 }
 
 /// A single texel in RGBAF32 texture - 16 bytes.
-#[derive(Copy, Clone, Debug)]
+#[derive(Copy, Clone, Debug, MallocSizeOf)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct GpuBlockData {
     data: [f32; 4],
 }
 
 impl GpuBlockData {
     pub const EMPTY: Self = GpuBlockData { data: [0.0; 4] };
@@ -142,17 +140,17 @@ impl GpuCacheHandle {
     pub fn new() -> Self {
         GpuCacheHandle { location: None }
     }
 }
 
 // A unique address in the GPU cache. These are uploaded
 // as part of the primitive instances, to allow the vertex
 // shader to fetch the specific data.
-#[derive(Copy, Debug, Clone, Eq, PartialEq)]
+#[derive(Copy, Debug, Clone, MallocSizeOf, Eq, PartialEq)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct GpuCacheAddress {
     pub u: u16,
     pub v: u16,
 }
 
 impl GpuCacheAddress {
@@ -178,17 +176,17 @@ impl Add<usize> for GpuCacheAddress {
         GpuCacheAddress {
             u: self.u + other as u16,
             v: self.v,
         }
     }
 }
 
 // An entry in a free-list of blocks in the GPU cache.
-#[derive(Debug)]
+#[derive(Debug, MallocSizeOf)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 struct Block {
     // The location in the cache of this block.
     address: GpuCacheAddress,
     // The current epoch (generation) of this block.
     epoch: Epoch,
     // Index of the next free block in the list it
@@ -230,17 +228,17 @@ impl Block {
     };
 }
 
 /// Represents the index of a Block in the block array. We only create such
 /// structs for blocks that represent the start of a chunk.
 ///
 /// Because we use Option<BlockIndex> in a lot of places, we use a NonZeroU32
 /// here and avoid ever using the index zero.
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Copy, Clone, MallocSizeOf)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 struct BlockIndex(NonZeroU32);
 
 impl BlockIndex {
     fn new(idx: usize) -> Self {
         debug_assert!(idx <= u32::MAX as usize);
         BlockIndex(NonZeroU32::new(idx as u32).expect("Index zero forbidden"))
@@ -249,16 +247,17 @@ impl BlockIndex {
     fn get(&self) -> usize {
         self.0.get() as usize
     }
 }
 
 // A row in the cache texture.
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
 struct Row {
     // The fixed size of blocks that this row supports.
     // Each row becomes a slab allocator for a fixed block size.
     // This means no dealing with fragmentation within a cache
     // row as items are allocated and freed.
     block_count_per_item: usize,
 }
 
@@ -271,42 +270,45 @@ impl Row {
 }
 
 // A list of update operations that can be applied on the cache
 // this frame. The list of updates is created by the render backend
 // during frame construction. It's passed to the render thread
 // where GL commands can be applied.
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
 pub enum GpuCacheUpdate {
     Copy {
         block_index: usize,
         block_count: usize,
         address: GpuCacheAddress,
     },
 }
 
 /// Command to inform the debug display in the renderer when chunks are allocated
 /// or freed.
+#[derive(MallocSizeOf)]
 pub enum GpuCacheDebugCmd {
     /// Describes an allocated chunk.
     Alloc(GpuCacheDebugChunk),
     /// Describes a freed chunk.
     Free(GpuCacheAddress),
 }
 
-#[derive(Clone)]
+#[derive(Clone, MallocSizeOf)]
 pub struct GpuCacheDebugChunk {
     pub address: GpuCacheAddress,
     pub size: usize,
 }
 
 #[must_use]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
 pub struct GpuCacheUpdateList {
     /// The frame current update list was generated from.
     pub frame_id: FrameId,
     /// Whether the texture should be cleared before updates
     /// are applied.
     pub clear: bool,
     /// The current height of the texture. The render thread
     /// should resize the texture if required.
@@ -320,16 +322,17 @@ pub struct GpuCacheUpdateList {
     #[cfg_attr(feature = "serde", serde(skip))]
     pub debug_commands: Vec<GpuCacheDebugCmd>,
 }
 
 // Holds the free lists of fixed size blocks. Mostly
 // just serves to work around the borrow checker.
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
 struct FreeBlockLists {
     free_list_1: Option<BlockIndex>,
     free_list_2: Option<BlockIndex>,
     free_list_4: Option<BlockIndex>,
     free_list_8: Option<BlockIndex>,
     free_list_16: Option<BlockIndex>,
     free_list_32: Option<BlockIndex>,
     free_list_64: Option<BlockIndex>,
@@ -387,16 +390,17 @@ impl FreeBlockLists {
             _ => panic!("Can't allocate > MAX_VERTEX_TEXTURE_WIDTH per resource!"),
         }
     }
 }
 
 // CPU-side representation of the GPU resource cache texture.
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
 struct Texture {
     // Current texture height
     height: i32,
     // All blocks that have been created for this texture
     blocks: Vec<Block>,
     // Metadata about each allocated row.
     rows: Vec<Row>,
     // The base Epoch for this texture.
@@ -451,28 +455,16 @@ impl Texture {
             occupied_list_heads: FastHashMap::default(),
             allocated_block_count: 0,
             reached_reclaim_threshold: None,
             debug_commands: Vec::new(),
             debug_flags,
         }
     }
 
-    // Reports the CPU heap usage of this Texture struct.
-    fn malloc_size_of(&self, op: VoidPtrToSizeFn) -> usize {
-        let mut size = 0;
-        unsafe {
-            size += op(self.blocks.as_ptr() as *const c_void);
-            size += op(self.rows.as_ptr() as *const c_void);
-            size += op(self.pending_blocks.as_ptr() as *const c_void);
-            size += op(self.updates.as_ptr() as *const c_void);
-        }
-        size
-    }
-
     // Push new data into the cache. The ```pending_block_index``` field represents
     // where the data was pushed into the texture ```pending_blocks``` array.
     // Return the allocated address for this data.
     fn push_data(
         &mut self,
         pending_block_index: Option<usize>,
         block_count: usize,
         frame_stamp: FrameStamp
@@ -667,16 +659,17 @@ impl<'a> Drop for GpuDataRequest<'a> {
         self.handle.location = Some(location);
     }
 }
 
 
 /// The main LRU cache interface.
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
 pub struct GpuCache {
     /// Current FrameId.
     now: FrameStamp,
     /// CPU-side texture allocator.
     texture: Texture,
     /// Number of blocks requested this frame that don't
     /// need to be re-uploaded.
     saved_block_count: usize,
@@ -860,21 +853,16 @@ impl GpuCache {
     /// freed or pending slot will panic!
     pub fn get_address(&self, id: &GpuCacheHandle) -> GpuCacheAddress {
         let location = id.location.expect("handle not requested or allocated!");
         let block = &self.texture.blocks[location.block_index.get()];
         debug_assert_eq!(block.epoch, location.epoch);
         debug_assert_eq!(block.last_access_time, self.now.frame_id());
         block.address
     }
-
-    /// Reports the CPU heap usage of this GpuCache struct.
-    pub fn malloc_size_of(&self, op: VoidPtrToSizeFn) -> usize {
-        self.texture.malloc_size_of(op)
-    }
 }
 
 #[test]
 #[cfg(target_pointer_width = "64")]
 fn test_struct_sizes() {
     use std::mem;
     // We can end up with a lot of blocks stored in the global vec, and keeping
     // them small helps reduce memory overhead.
--- a/gfx/wr/webrender/src/hit_test.rs
+++ b/gfx/wr/webrender/src/hit_test.rs
@@ -1,37 +1,38 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use api::{BorderRadius, ClipMode, HitTestFlags, HitTestItem, HitTestResult, ItemTag, LayoutPoint};
-use api::{LayoutPrimitiveInfo, LayoutRect, PipelineId, VoidPtrToSizeFn, WorldPoint};
+use api::{LayoutPrimitiveInfo, LayoutRect, PipelineId, WorldPoint};
 use clip::{ClipDataStore, ClipNode, ClipItem, ClipStore};
 use clip::{rounded_rectangle_contains_point};
 use clip_scroll_tree::{SpatialNodeIndex, ClipScrollTree};
 use internal_types::FastHashMap;
 use prim_store::ScrollNodeAndClipChain;
-use std::os::raw::c_void;
 use std::u32;
 use util::LayoutToWorldFastTransform;
 
 /// A copy of important clip scroll node data to use during hit testing. This a copy of
 /// data from the ClipScrollTree that will persist as a new frame is under construction,
 /// allowing hit tests consistent with the currently rendered frame.
+#[derive(MallocSizeOf)]
 pub struct HitTestSpatialNode {
     /// The pipeline id of this node.
     pipeline_id: PipelineId,
 
     /// World transform for content transformed by this node.
     world_content_transform: LayoutToWorldFastTransform,
 
     /// World viewport transform for content transformed by this node.
     world_viewport_transform: LayoutToWorldFastTransform,
 }
 
+#[derive(MallocSizeOf)]
 pub struct HitTestClipNode {
     /// A particular point must be inside all of these regions to be considered clipped in
     /// for the purposes of a hit test.
     region: HitTestRegion,
 }
 
 impl HitTestClipNode {
     fn new(local_pos: LayoutPoint, node: &ClipNode) -> Self {
@@ -59,30 +60,31 @@ impl HitTestClipNode {
 
 // A hit testing clip chain node is the same as a
 // normal clip chain node, except that the clip
 // node is embedded inside the clip chain, rather
 // than referenced. This means we don't need to
 // copy the complete interned clip data store for
 // hit testing.
 
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+#[derive(Debug, Copy, Clone, MallocSizeOf, PartialEq, Eq, Hash)]
 pub struct HitTestClipChainId(u32);
 
 impl HitTestClipChainId {
     pub const NONE: Self = HitTestClipChainId(u32::MAX);
 }
 
+#[derive(MallocSizeOf)]
 pub struct HitTestClipChainNode {
     pub region: HitTestClipNode,
     pub spatial_node_index: SpatialNodeIndex,
     pub parent_clip_chain_id: HitTestClipChainId,
 }
 
-#[derive(Clone)]
+#[derive(Clone, MallocSizeOf)]
 pub struct HitTestingItem {
     rect: LayoutRect,
     clip_rect: LayoutRect,
     tag: ItemTag,
     is_backface_visible: bool,
 }
 
 impl HitTestingItem {
@@ -91,19 +93,20 @@ impl HitTestingItem {
             rect: info.rect,
             clip_rect: info.clip_rect,
             tag,
             is_backface_visible: info.is_backface_visible,
         }
     }
 }
 
-#[derive(Clone)]
+#[derive(Clone, MallocSizeOf)]
 pub struct HitTestingRun(pub Vec<HitTestingItem>, pub ScrollNodeAndClipChain);
 
+#[derive(MallocSizeOf)]
 enum HitTestRegion {
     Invalid,
     Rectangle(LayoutRect, ClipMode),
     RoundedRectangle(LayoutRect, BorderRadius, ClipMode),
 }
 
 impl HitTestRegion {
     pub fn contains(&self, point: &LayoutPoint) -> bool {
@@ -116,16 +119,17 @@ impl HitTestRegion {
                 rounded_rectangle_contains_point(point, &rect, &radii),
             HitTestRegion::RoundedRectangle(rect, radii, ClipMode::ClipOut) =>
                 !rounded_rectangle_contains_point(point, &rect, &radii),
             HitTestRegion::Invalid => true,
         }
     }
 }
 
+#[derive(MallocSizeOf)]
 pub struct HitTester {
     runs: Vec<HitTestingRun>,
     spatial_nodes: Vec<HitTestSpatialNode>,
     clip_chains: Vec<HitTestClipChainNode>,
     pipeline_root_nodes: FastHashMap<PipelineId, SpatialNodeIndex>,
 }
 
 impl HitTester {
@@ -366,38 +370,25 @@ impl HitTester {
 
         result.items.dedup();
         result
     }
 
     pub fn get_pipeline_root(&self, pipeline_id: PipelineId) -> &HitTestSpatialNode {
         &self.spatial_nodes[self.pipeline_root_nodes[&pipeline_id].0 as usize]
     }
-
-    // Reports the CPU heap usage of this HitTester struct.
-    pub fn malloc_size_of(&self, op: VoidPtrToSizeFn) -> usize {
-        let mut size = 0;
-        unsafe {
-            size += op(self.runs.as_ptr() as *const c_void);
-            size += op(self.spatial_nodes.as_ptr() as *const c_void);
-            size += op(self.clip_chains.as_ptr() as *const c_void);
-            // We can't measure pipeline_root_nodes because we don't have the
-            // real machinery from the malloc_size_of crate. We could estimate
-            // it but it should generally be very small so we don't bother.
-        }
-        size
-    }
 }
 
-#[derive(Clone, Copy, PartialEq)]
+#[derive(Clone, Copy, MallocSizeOf, PartialEq)]
 enum ClippedIn {
     ClippedIn,
     NotClippedIn,
 }
 
+#[derive(MallocSizeOf)]
 pub struct HitTest {
     pipeline_id: Option<PipelineId>,
     point: WorldPoint,
     flags: HitTestFlags,
     node_cache: FastHashMap<HitTestClipChainId, ClippedIn>,
     clip_chain_cache: Vec<Option<ClippedIn>>,
 }
 
--- a/gfx/wr/webrender/src/intern.rs
+++ b/gfx/wr/webrender/src/intern.rs
@@ -46,17 +46,17 @@ use util::VecHelper;
 
 /// The epoch is incremented each time a scene is
 /// built. The most recently used scene epoch is
 /// stored inside each item and handle. This is
 /// then used for cache invalidation (item) and
 /// correctness validation (handle).
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
-#[derive(Debug, Copy, Clone, PartialEq)]
+#[derive(Debug, Copy, Clone, MallocSizeOf, PartialEq)]
 struct Epoch(u64);
 
 impl Epoch {
     pub const INVALID: Self = Epoch(u64::MAX);
 }
 
 /// A list of updates to be applied to the data store,
 /// provided by the interning structure.
@@ -71,31 +71,31 @@ pub struct UpdateList<S> {
 
 lazy_static! {
     static ref NEXT_UID: AtomicUsize = AtomicUsize::new(0);
 }
 
 /// A globally, unique identifier
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
-#[derive(Debug, Copy, Clone, Eq, Hash, PartialEq)]
+#[derive(Debug, Copy, Clone, Eq, Hash, MallocSizeOf, PartialEq)]
 pub struct ItemUid {
     uid: usize,
 }
 
 impl ItemUid {
     pub fn next_uid() -> ItemUid {
         let uid = NEXT_UID.fetch_add(1, Ordering::Relaxed);
         ItemUid { uid }
     }
 }
 
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Copy, Clone, MallocSizeOf)]
 pub struct Handle<M: Copy> {
     index: u32,
     epoch: Epoch,
     uid: ItemUid,
     _marker: PhantomData<M>,
 }
 
 impl <M> Handle<M> where M: Copy {
--- a/gfx/wr/webrender/src/lib.rs
+++ b/gfx/wr/webrender/src/lib.rs
@@ -53,22 +53,27 @@ macro_rules! matches {
 #[macro_use]
 extern crate bitflags;
 #[macro_use]
 extern crate cfg_if;
 #[macro_use]
 extern crate lazy_static;
 #[macro_use]
 extern crate log;
+#[macro_use]
+extern crate malloc_size_of_derive;
 #[cfg(any(feature = "serde"))]
 #[macro_use]
 extern crate serde;
 #[macro_use]
 extern crate thread_profiler;
 
+extern crate wr_malloc_size_of;
+use wr_malloc_size_of as malloc_size_of;
+
 mod batch;
 mod border;
 mod box_shadow;
 #[cfg(any(feature = "capture", feature = "replay"))]
 mod capture;
 mod clip;
 mod clip_scroll_tree;
 mod debug_colors;
--- a/gfx/wr/webrender/src/prim_store/mod.rs
+++ b/gfx/wr/webrender/src/prim_store/mod.rs
@@ -74,17 +74,17 @@ pub fn register_prim_chase_id(id: Primit
 
 #[cfg(not(debug_assertions))]
 pub fn register_prim_chase_id(_: PrimitiveDebugId) {
 }
 
 const MIN_BRUSH_SPLIT_AREA: f32 = 256.0 * 256.0;
 pub const VECS_PER_SEGMENT: usize = 2;
 
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[derive(Clone, Copy, Debug, Eq, MallocSizeOf, PartialEq)]
 pub struct ScrollNodeAndClipChain {
     pub spatial_node_index: SpatialNodeIndex,
     pub clip_chain_id: ClipChainId,
 }
 
 impl ScrollNodeAndClipChain {
     pub fn new(
         spatial_node_index: SpatialNodeIndex,
@@ -841,17 +841,17 @@ impl OpacityBinding {
             let opacity = scene_properties.resolve_float(binding);
             new_opacity = new_opacity * opacity;
         }
 
         self.current = new_opacity;
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, MallocSizeOf)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct VisibleMaskImageTile {
     pub tile_offset: TileOffset,
     pub tile_rect: LayoutRect,
 }
 
 #[derive(Debug)]
--- a/gfx/wr/webrender/src/render_backend.rs
+++ b/gfx/wr/webrender/src/render_backend.rs
@@ -25,16 +25,17 @@ use api::CapturedDocument;
 use clip::ClipDataStore;
 use clip_scroll_tree::{SpatialNodeIndex, ClipScrollTree};
 #[cfg(feature = "debugger")]
 use debug_server;
 use frame_builder::{FrameBuilder, FrameBuilderConfig};
 use gpu_cache::GpuCache;
 use hit_test::{HitTest, HitTester};
 use internal_types::{DebugOutput, FastHashMap, FastHashSet, RenderedDocument, ResultMsg};
+use malloc_size_of::{MallocSizeOf, MallocSizeOfOps};
 use picture::RetainedTiles;
 use prim_store::{PrimitiveDataStore, PrimitiveScratchBuffer, PrimitiveInstance};
 use prim_store::{PrimitiveInstanceKind, PrimTemplateCommonData};
 use prim_store::borders::{ImageBorderDataStore, NormalBorderDataStore};
 use prim_store::gradient::{LinearGradientDataStore, RadialGradientDataStore};
 use prim_store::image::{ImageDataStore, YuvImageDataStore};
 use prim_store::line_dec::LineDecorationDataStore;
 use prim_store::picture::PictureDataStore;
@@ -83,17 +84,17 @@ impl DocumentView {
         DevicePixelScale::new(
             self.device_pixel_ratio *
             self.page_zoom_factor *
             self.pinch_zoom_factor
         )
     }
 }
 
-#[derive(Copy, Clone, Hash, PartialEq, PartialOrd, Debug, Eq, Ord)]
+#[derive(Copy, Clone, Hash, MallocSizeOf, PartialEq, PartialOrd, Debug, Eq, Ord)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct FrameId(usize);
 
 impl FrameId {
     /// Returns a FrameId corresponding to the first frame.
     ///
     /// Note that we use 0 as the internal id here because the current code
@@ -137,17 +138,17 @@ impl ::std::ops::Sub<usize> for FrameId 
 
 /// Identifier to track a sequence of frames.
 ///
 /// This is effectively a `FrameId` with a ridealong timestamp corresponding
 /// to when advance() was called, which allows for more nuanced cache eviction
 /// decisions. As such, we use the `FrameId` for equality and comparison, since
 /// we should never have two `FrameStamps` with the same id but different
 /// timestamps.
-#[derive(Copy, Clone, Debug)]
+#[derive(Copy, Clone, Debug, MallocSizeOf)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct FrameStamp {
     id: FrameId,
     time: SystemTime,
     document_id: DocumentId,
 }
 
@@ -674,17 +675,17 @@ pub struct RenderBackend {
     resource_cache: ResourceCache,
 
     frame_config: FrameBuilderConfig,
     documents: FastHashMap<DocumentId, Document>,
 
     notifier: Box<RenderNotifier>,
     recorder: Option<Box<ApiRecordingReceiver>>,
     sampler: Option<Box<AsyncPropertySampler + Send>>,
-    size_of_op: Option<VoidPtrToSizeFn>,
+    size_of_ops: Option<MallocSizeOfOps>,
     debug_flags: DebugFlags,
     namespace_alloc_by_client: bool,
 }
 
 impl RenderBackend {
     pub fn new(
         api_rx: MsgReceiver<ApiMsg>,
         payload_rx: Receiver<Payload>,
@@ -693,17 +694,17 @@ impl RenderBackend {
         low_priority_scene_tx: Sender<SceneBuilderRequest>,
         scene_rx: Receiver<SceneBuilderResult>,
         default_device_pixel_ratio: f32,
         resource_cache: ResourceCache,
         notifier: Box<RenderNotifier>,
         frame_config: FrameBuilderConfig,
         recorder: Option<Box<ApiRecordingReceiver>>,
         sampler: Option<Box<AsyncPropertySampler + Send>>,
-        size_of_op: Option<VoidPtrToSizeFn>,
+        size_of_ops: Option<MallocSizeOfOps>,
         debug_flags: DebugFlags,
         namespace_alloc_by_client: bool,
     ) -> RenderBackend {
         RenderBackend {
             api_rx,
             payload_rx,
             result_tx,
             scene_tx,
@@ -713,17 +714,17 @@ impl RenderBackend {
             default_device_pixel_ratio,
             resource_cache,
             gpu_cache: GpuCache::new(),
             frame_config,
             documents: FastHashMap::default(),
             notifier,
             recorder,
             sampler,
-            size_of_op,
+            size_of_ops,
             debug_flags,
             namespace_alloc_by_client,
         }
     }
 
     fn process_scene_msg(
         &mut self,
         document_id: DocumentId,
@@ -1547,26 +1548,26 @@ impl RenderBackend {
             doc.clip_scroll_tree.print_with(&mut builder);
 
             debug_root.add(builder.build());
         }
 
         serde_json::to_string(&debug_root).unwrap()
     }
 
-    fn report_memory(&self, tx: MsgSender<MemoryReport>) {
+    fn report_memory(&mut self, tx: MsgSender<MemoryReport>) {
         let mut report = MemoryReport::default();
-        let op = self.size_of_op.unwrap();
-        report.gpu_cache_metadata = self.gpu_cache.malloc_size_of(op);
+        let ops = self.size_of_ops.as_mut().unwrap();
+        let op = ops.size_of_op;
+        report.gpu_cache_metadata = self.gpu_cache.size_of(ops);
         for (_id, doc) in &self.documents {
             if let Some(ref fb) = doc.frame_builder {
-                report.clip_stores += fb.clip_store.malloc_size_of(op);
+                report.clip_stores += fb.clip_store.size_of(ops);
             }
-            report.hit_testers +=
-                doc.hit_tester.as_ref().map_or(0, |ht| ht.malloc_size_of(op));
+            report.hit_testers += doc.hit_tester.size_of(ops);
 
             doc.resources.report_memory(op, &mut report)
         }
 
         report += self.resource_cache.report_memory(op);
 
         // Send a message to report memory on the scene-builder thread, which
         // will add its report to this one and send the result back to the original
--- a/gfx/wr/webrender/src/renderer.rs
+++ b/gfx/wr/webrender/src/renderer.rs
@@ -54,16 +54,17 @@ use gpu_cache::{GpuBlockData, GpuCacheUp
 use gpu_cache::{GpuCacheDebugChunk, GpuCacheDebugCmd};
 #[cfg(feature = "pathfinder")]
 use gpu_glyph_renderer::GpuGlyphRenderer;
 use gpu_types::ScalingInstance;
 use internal_types::{TextureSource, ORTHO_FAR_PLANE, ORTHO_NEAR_PLANE, ResourceCacheError};
 use internal_types::{CacheTextureId, DebugOutput, FastHashMap, LayerIndex, RenderedDocument, ResultMsg};
 use internal_types::{TextureCacheAllocationKind, TextureCacheUpdate, TextureUpdateList, TextureUpdateSource};
 use internal_types::{RenderTargetInfo, SavedTargetIndex};
+use malloc_size_of::MallocSizeOfOps;
 use prim_store::DeferredResolve;
 use profiler::{BackendProfileCounters, FrameProfileCounters, TimeProfileCounter,
                GpuProfileTag, RendererProfileCounters, RendererProfileTimers};
 use device::query::GpuProfiler;
 use rayon::{ThreadPool, ThreadPoolBuilder};
 use record::ApiRecordingReceiver;
 use render_backend::{FrameId, RenderBackend};
 use render_task::ClearMode;
@@ -1551,27 +1552,19 @@ pub struct Renderer {
     /// application to provide external buffers for image data.
     external_image_handler: Option<Box<ExternalImageHandler>>,
 
     /// Optional trait object that allows the client
     /// application to provide a texture handle to
     /// copy the WR output to.
     output_image_handler: Option<Box<OutputImageHandler>>,
 
-    /// Optional function pointer for measuring memory used by a given
+    /// Optional function pointers for measuring memory used by a given
     /// heap-allocated pointer.
-    size_of_op: Option<VoidPtrToSizeFn>,
-
-    /// Optional function pointer for measuring memory used by a given
-    /// heap-allocated region of memory. Unlike the above, pointers passed
-    /// to this function do not need to point to the start of the allocation,
-    /// and can be anywhere in the allocated region. This is useful for measuring
-    /// structures like hashmaps that don't expose pointers to the start of the
-    /// allocation, but do expose pointers to elements within the allocation.
-    _enclosing_size_of_op: Option<VoidPtrToSizeFn>,
+    size_of_ops: Option<MallocSizeOfOps>,
 
     // Currently allocated FBOs for output frames.
     output_targets: FastHashMap<u32, FrameOutput>,
 
     pub renderer_errors: Vec<RendererError>,
 
     /// List of profile results from previous frames. Can be retrieved
     /// via get_frame_profiles().
@@ -1827,16 +1820,20 @@ impl Renderer {
             dual_source_blending_is_supported: ext_dual_source_blending,
             chase_primitive: options.chase_primitive,
             enable_picture_caching: options.enable_picture_caching,
         };
 
         let device_pixel_ratio = options.device_pixel_ratio;
         let debug_flags = options.debug_flags;
         let payload_rx_for_backend = payload_rx.to_mpsc_receiver();
+        let size_of_op = options.size_of_op;
+        let enclosing_size_of_op = options.enclosing_size_of_op;
+        let make_size_of_ops =
+            move || size_of_op.map(|o| MallocSizeOfOps::new(o, enclosing_size_of_op));
         let recorder = options.recorder;
         let thread_listener = Arc::new(options.thread_listener);
         let thread_listener_for_rayon_start = thread_listener.clone();
         let thread_listener_for_rayon_end = thread_listener.clone();
         let workers = options
             .workers
             .take()
             .unwrap_or_else(|| {
@@ -1852,36 +1849,33 @@ impl Renderer {
                         if let Some(ref thread_listener) = *thread_listener_for_rayon_end {
                             thread_listener.thread_stopped(&format!("WRWorker#{}", idx));
                         }
                     })
                     .build();
                 Arc::new(worker.unwrap())
             });
         let sampler = options.sampler;
-        let size_of_op = options.size_of_op;
-        let enclosing_size_of_op = options.enclosing_size_of_op;
         let namespace_alloc_by_client = options.namespace_alloc_by_client;
 
         let blob_image_handler = options.blob_image_handler.take();
         let thread_listener_for_render_backend = thread_listener.clone();
         let thread_listener_for_scene_builder = thread_listener.clone();
         let thread_listener_for_lp_scene_builder = thread_listener.clone();
         let scene_builder_hooks = options.scene_builder_hooks;
         let rb_thread_name = format!("WRRenderBackend#{}", options.renderer_id.unwrap_or(0));
         let scene_thread_name = format!("WRSceneBuilder#{}", options.renderer_id.unwrap_or(0));
         let lp_scene_thread_name = format!("WRSceneBuilderLP#{}", options.renderer_id.unwrap_or(0));
         let glyph_rasterizer = GlyphRasterizer::new(workers)?;
 
         let (scene_builder, scene_tx, scene_rx) = SceneBuilder::new(
             config,
             api_tx.clone(),
             scene_builder_hooks,
-            size_of_op,
-            enclosing_size_of_op,
+            make_size_of_ops(),
         );
         thread::Builder::new().name(scene_thread_name.clone()).spawn(move || {
             register_thread_with_profiler(scene_thread_name.clone());
             if let Some(ref thread_listener) = *thread_listener_for_scene_builder {
                 thread_listener.thread_started(&scene_thread_name);
             }
 
             let mut scene_builder = scene_builder;
@@ -1944,17 +1938,17 @@ impl Renderer {
                 low_priority_scene_tx,
                 scene_rx,
                 device_pixel_ratio,
                 resource_cache,
                 backend_notifier,
                 config,
                 recorder,
                 sampler,
-                size_of_op,
+                make_size_of_ops(),
                 debug_flags,
                 namespace_alloc_by_client,
             );
             backend.run(backend_profile_counters);
             if let Some(ref thread_listener) = *thread_listener_for_render_backend {
                 thread_listener.thread_stopped(&rb_thread_name);
             }
         })?;
@@ -2006,18 +2000,17 @@ impl Renderer {
             transforms_texture,
             prim_header_i_texture,
             prim_header_f_texture,
             render_task_texture,
             pipeline_info: PipelineInfo::default(),
             dither_matrix_texture,
             external_image_handler: None,
             output_image_handler: None,
-            size_of_op: options.size_of_op,
-            _enclosing_size_of_op: options.enclosing_size_of_op,
+            size_of_ops: make_size_of_ops(),
             output_targets: FastHashMap::default(),
             cpu_profiles: VecDeque::new(),
             gpu_profiles: VecDeque::new(),
             gpu_cache_texture,
             #[cfg(feature = "debug_renderer")]
             gpu_cache_debug_chunks: Vec::new(),
             gpu_cache_frame_id: FrameId::INVALID,
             gpu_cache_overflow: false,
@@ -4567,17 +4560,17 @@ impl Renderer {
         #[cfg(feature = "replay")]
         for (_, ext) in self.owned_external_images {
             self.device.delete_external_texture(ext);
         }
         self.device.end_frame();
     }
 
     fn size_of<T>(&self, ptr: *const T) -> usize {
-        let op = self.size_of_op.as_ref().unwrap();
+        let op = self.size_of_ops.as_ref().unwrap().size_of_op;
         unsafe { op(ptr as *const c_void) }
     }
 
     /// Collects a memory report.
     pub fn report_memory(&self) -> MemoryReport {
         let mut report = MemoryReport::default();
 
         // GPU cache CPU memory.
--- a/gfx/wr/webrender/src/resource_cache.rs
+++ b/gfx/wr/webrender/src/resource_cache.rs
@@ -1816,16 +1816,23 @@ impl ResourceCache {
             .clear_fonts(|font| font.font_key.0 == namespace);
 
         if let Some(ref mut r) = self.blob_image_handler {
             r.clear_namespace(namespace);
         }
     }
 
     /// Reports the CPU heap usage of this ResourceCache.
+    ///
+    /// NB: It would be much better to use the derive(MallocSizeOf) machinery
+    /// here, but the Arcs complicate things. The two ways to handle that would
+    /// be to either (a) Implement MallocSizeOf manually for the things that own
+    /// them and manually avoid double-counting, or (b) Use the "seen this pointer
+    /// yet" machinery from the proper malloc_size_of crate. We can do this if/when
+    /// more accurate memory reporting on these resources becomes a priority.
     pub fn report_memory(&self, op: VoidPtrToSizeFn) -> MemoryReport {
         let mut report = MemoryReport::default();
 
         // Measure fonts. We only need the templates here, because the instances
         // don't have big buffers.
         for (_, font) in self.resources.font_templates.iter() {
             if let FontTemplate::Raw(ref raw, _) = font {
                 report.fonts += unsafe { op(raw.as_ptr() as *const c_void) };
--- a/gfx/wr/webrender/src/scene_builder.rs
+++ b/gfx/wr/webrender/src/scene_builder.rs
@@ -1,25 +1,26 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use api::{AsyncBlobImageRasterizer, BlobImageRequest, BlobImageParams, BlobImageResult};
 use api::{DocumentId, PipelineId, ApiMsg, FrameMsg, ResourceUpdate, ExternalEvent, Epoch};
 use api::{BuiltDisplayList, ColorF, LayoutSize, NotificationRequest, Checkpoint, IdNamespace};
-use api::{MemoryReport, VoidPtrToSizeFn};
+use api::{MemoryReport};
 use api::channel::MsgSender;
 #[cfg(feature = "capture")]
 use capture::CaptureConfig;
 use frame_builder::{FrameBuilderConfig, FrameBuilder};
 use clip::{ClipDataInterner, ClipDataUpdateList};
 use clip_scroll_tree::ClipScrollTree;
 use display_list_flattener::DisplayListFlattener;
 use intern::{Internable, Interner};
 use internal_types::{FastHashMap, FastHashSet};
+use malloc_size_of::MallocSizeOfOps;
 use prim_store::{PrimitiveDataInterner, PrimitiveDataUpdateList, PrimitiveKeyKind};
 use prim_store::PrimitiveStoreStats;
 use prim_store::borders::{
     ImageBorder, ImageBorderDataInterner, ImageBorderDataUpdateList,
     NormalBorderPrim, NormalBorderDataInterner, NormalBorderDataUpdateList
 };
 use prim_store::gradient::{
     LinearGradient, LinearGradientDataInterner, LinearGradientDataUpdateList,
@@ -215,20 +216,21 @@ pub struct DocumentResources {
     pub text_run_interner: TextRunDataInterner,
     pub yuv_image_interner: YuvImageDataInterner,
 }
 
 impl DocumentResources {
     /// Reports CPU heap memory used by the interners.
     fn report_memory(
         &self,
-        op: VoidPtrToSizeFn,
-        eop: VoidPtrToSizeFn,
+        ops: &mut MallocSizeOfOps,
         r: &mut MemoryReport,
     ) {
+        let op = ops.size_of_op;
+        let eop = ops.enclosing_size_of_op.unwrap();
         r.interners += self.clip_interner.malloc_size_of(op, eop);
         r.interners += self.prim_interner.malloc_size_of(op, eop);
         r.interners += self.linear_grad_interner.malloc_size_of(op, eop);
         r.interners += self.radial_grad_interner.malloc_size_of(op, eop);
         r.interners += self.text_run_interner.malloc_size_of(op, eop);
     }
 }
 
@@ -288,40 +290,37 @@ impl Document {
 pub struct SceneBuilder {
     documents: FastHashMap<DocumentId, Document>,
     rx: Receiver<SceneBuilderRequest>,
     tx: Sender<SceneBuilderResult>,
     api_tx: MsgSender<ApiMsg>,
     config: FrameBuilderConfig,
     hooks: Option<Box<SceneBuilderHooks + Send>>,
     simulate_slow_ms: u32,
-    size_of_op: Option<VoidPtrToSizeFn>,
-    enclosing_size_of_op: Option<VoidPtrToSizeFn>,
+    size_of_ops: Option<MallocSizeOfOps>,
 }
 
 impl SceneBuilder {
     pub fn new(
         config: FrameBuilderConfig,
         api_tx: MsgSender<ApiMsg>,
         hooks: Option<Box<SceneBuilderHooks + Send>>,
-        size_of_op: Option<VoidPtrToSizeFn>,
-        enclosing_size_of_op: Option<VoidPtrToSizeFn>,
+        size_of_ops: Option<MallocSizeOfOps>,
     ) -> (Self, Sender<SceneBuilderRequest>, Receiver<SceneBuilderResult>) {
         let (in_tx, in_rx) = channel();
         let (out_tx, out_rx) = channel();
         (
             SceneBuilder {
                 documents: FastHashMap::default(),
                 rx: in_rx,
                 tx: out_tx,
                 api_tx,
                 config,
                 hooks,
-                size_of_op,
-                enclosing_size_of_op,
+                size_of_ops,
                 simulate_slow_ms: 0,
             },
             in_tx,
             out_rx,
         )
     }
 
     /// Send a message to the render backend thread.
@@ -758,22 +757,21 @@ impl SceneBuilder {
         } else {
             if let &Some(ref hooks) = &self.hooks {
                 hooks.post_empty_scene_build();
             }
         }
     }
 
     /// Reports CPU heap memory used by the SceneBuilder.
-    fn report_memory(&self) -> MemoryReport {
-        let op = self.size_of_op.unwrap();
-        let eop = self.enclosing_size_of_op.unwrap();
+    fn report_memory(&mut self) -> MemoryReport {
+        let ops = self.size_of_ops.as_mut().unwrap();
         let mut report = MemoryReport::default();
         for doc in self.documents.values() {
-            doc.resources.report_memory(op, eop, &mut report);
+            doc.resources.report_memory(ops, &mut report);
         }
 
         report
     }
 }
 
 /// A scene builder thread which executes expensive operations such as blob rasterization
 /// with a lower priority than the normal scene builder thread.
--- a/gfx/wr/webrender/src/util.rs
+++ b/gfx/wr/webrender/src/util.rs
@@ -87,17 +87,17 @@ impl<T> VecHelper<T> for Vec<T> {
 
 // Represents an optimized transform where there is only
 // a scale and translation (which are guaranteed to maintain
 // an axis align rectangle under transformation). The
 // scaling is applied first, followed by the translation.
 // TODO(gw): We should try and incorporate F <-> T units here,
 //           but it's a bit tricky to do that now with the
 //           way the current clip-scroll tree works.
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, MallocSizeOf)]
 pub struct ScaleOffset {
     pub scale: Vector2D<f32>,
     pub offset: Vector2D<f32>,
 }
 
 impl ScaleOffset {
     pub fn identity() -> Self {
         ScaleOffset {
@@ -551,17 +551,17 @@ impl<U> MaxRect for TypedRect<f32, U> {
             TypedPoint2D::new(-MAX_COORD, -MAX_COORD),
             TypedSize2D::new(2.0 * MAX_COORD, 2.0 * MAX_COORD),
         )
     }
 }
 
 /// An enum that tries to avoid expensive transformation matrix calculations
 /// when possible when dealing with non-perspective axis-aligned transformations.
-#[derive(Debug)]
+#[derive(Debug, MallocSizeOf)]
 pub enum FastTransform<Src, Dst> {
     /// A simple offset, which can be used without doing any matrix math.
     Offset(TypedVector2D<f32, Src>),
 
     /// A 2D transformation with an inverse.
     Transform {
         transform: TypedTransform3D<f32, Src, Dst>,
         inverse: Option<TypedTransform3D<f32, Dst, Src>>,
--- a/gfx/wr/webrender_api/src/api.rs
+++ b/gfx/wr/webrender_api/src/api.rs
@@ -556,17 +556,17 @@ pub struct HitTestItem {
 }
 
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct HitTestResult {
     pub items: Vec<HitTestItem>,
 }
 
 bitflags! {
-    #[derive(Deserialize, Serialize)]
+    #[derive(Deserialize, MallocSizeOf, Serialize)]
     pub struct HitTestFlags: u8 {
         const FIND_ALL = 0b00000001;
         const POINT_RELATIVE_TO_PIPELINE_VIEWPORT = 0b00000010;
     }
 }
 
 #[derive(Clone, Deserialize, Serialize)]
 pub struct AddFontInstance {
@@ -782,37 +782,37 @@ pub struct Epoch(pub u32);
 
 impl Epoch {
     pub fn invalid() -> Epoch {
         Epoch(u32::MAX)
     }
 }
 
 #[repr(C)]
-#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Deserialize, Serialize)]
+#[derive(Clone, Copy, Debug, Eq, MallocSizeOf, PartialEq, Hash, Ord, PartialOrd, Deserialize, Serialize)]
 pub struct IdNamespace(pub u32);
 
 #[repr(C)]
-#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
+#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, MallocSizeOf, PartialEq, Serialize)]
 pub struct DocumentId(pub IdNamespace, pub u32);
 
 impl DocumentId {
     pub const INVALID: DocumentId = DocumentId(IdNamespace(0), 0);
 }
 
 /// This type carries no valuable semantics for WR. However, it reflects the fact that
 /// clients (Servo) may generate pipelines by different semi-independent sources.
 /// These pipelines still belong to the same `IdNamespace` and the same `DocumentId`.
 /// Having this extra Id field enables them to generate `PipelineId` without collision.
 pub type PipelineSourceId = u32;
 
 /// From the point of view of WR, `PipelineId` is completely opaque and generic as long as
 /// it's clonable, serializable, comparable, and hashable.
 #[repr(C)]
-#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
+#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, MallocSizeOf, PartialEq, Serialize)]
 pub struct PipelineId(pub PipelineSourceId, pub u32);
 
 impl PipelineId {
     pub fn dummy() -> Self {
         PipelineId(0, 0)
     }
 }
 
@@ -958,17 +958,17 @@ impl RenderApiSender {
             payload_sender: self.payload_sender.clone(),
             namespace_id,
             next_id: Cell::new(ResourceId(0)),
         }
     }
 }
 
 bitflags! {
-    #[derive(Default, Deserialize, Serialize)]
+    #[derive(Default, Deserialize, MallocSizeOf, Serialize)]
     pub struct DebugFlags: u32 {
         /// Display the frame profiler on screen.
         const PROFILER_DBG          = 1 << 0;
         /// Display intermediate render targets on screen.
         const RENDER_TARGET_DBG     = 1 << 1;
         /// Display all texture cache pages on screen.
         const TEXTURE_CACHE_DBG     = 1 << 2;
         /// Display GPU timing results.
--- a/gfx/wr/webrender_api/src/display_item.rs
+++ b/gfx/wr/webrender_api/src/display_item.rs
@@ -396,17 +396,17 @@ pub struct BorderDisplayItem {
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Deserialize, PartialEq, Serialize)]
 pub enum BorderRadiusKind {
     Uniform,
     NonUniform,
 }
 
 #[repr(C)]
-#[derive(Clone, Copy, Debug, Deserialize, PartialEq, Serialize)]
+#[derive(Clone, Copy, Debug, Deserialize, MallocSizeOf, PartialEq, Serialize)]
 pub struct BorderRadius {
     pub top_left: LayoutSize,
     pub top_right: LayoutSize,
     pub bottom_left: LayoutSize,
     pub bottom_right: LayoutSize,
 }
 
 #[repr(C)]
@@ -736,17 +736,17 @@ impl ImageMask {
             None
         } else {
             Some(self.rect)
         }
     }
 }
 
 #[repr(C)]
-#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize, Eq, Hash)]
+#[derive(Copy, Clone, Debug, MallocSizeOf, PartialEq, Serialize, Deserialize, Eq, Hash)]
 pub enum ClipMode {
     Clip,    // Pixels inside the region are visible.
     ClipOut, // Pixels outside the region are visible.
 }
 
 impl Not for ClipMode {
     type Output = ClipMode;
 
--- a/gfx/wr/webrender_api/src/lib.rs
+++ b/gfx/wr/webrender_api/src/lib.rs
@@ -25,21 +25,25 @@ extern crate core;
 extern crate core_foundation;
 #[cfg(target_os = "macos")]
 extern crate core_graphics;
 #[cfg(target_os = "windows")]
 extern crate dwrote;
 pub extern crate euclid;
 #[cfg(feature = "ipc")]
 extern crate ipc_channel;
+#[macro_use]
+extern crate malloc_size_of_derive;
 extern crate serde;
 #[macro_use]
 extern crate serde_derive;
 extern crate time;
 
+extern crate wr_malloc_size_of;
+use wr_malloc_size_of as malloc_size_of;
 
 mod api;
 pub mod channel;
 mod color;
 mod display_item;
 mod display_list;
 mod font;
 mod gradient_builder;
--- a/gfx/wr/webrender_api/src/units.rs
+++ b/gfx/wr/webrender_api/src/units.rs
@@ -60,33 +60,33 @@ pub type RasterIntSize = TypedSize2D<i32
 pub type RasterRect = TypedRect<f32, RasterPixel>;
 pub type RasterPoint = TypedPoint2D<f32, RasterPixel>;
 pub type RasterSize = TypedSize2D<f32, RasterPixel>;
 pub type RasterPoint3D = TypedPoint3D<f32, RasterPixel>;
 pub type RasterVector2D = TypedVector2D<f32, RasterPixel>;
 pub type RasterVector3D = TypedVector3D<f32, RasterPixel>;
 
 /// Geometry in a stacking context's local coordinate space (logical pixels).
-#[derive(Hash, Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
+#[derive(Hash, Clone, Copy, Debug, Eq, MallocSizeOf, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
 pub struct LayoutPixel;
 
 pub type LayoutRect = TypedRect<f32, LayoutPixel>;
 pub type LayoutPoint = TypedPoint2D<f32, LayoutPixel>;
 pub type LayoutPoint3D = TypedPoint3D<f32, LayoutPixel>;
 pub type LayoutVector2D = TypedVector2D<f32, LayoutPixel>;
 pub type LayoutVector3D = TypedVector3D<f32, LayoutPixel>;
 pub type LayoutSize = TypedSize2D<f32, LayoutPixel>;
 pub type LayoutSideOffsets = TypedSideOffsets2D<f32, LayoutPixel>;
 
 pub type LayoutIntRect = TypedRect<i32, LayoutPixel>;
 pub type LayoutIntPoint = TypedPoint2D<i32, LayoutPixel>;
 pub type LayoutIntSize = TypedSize2D<i32, LayoutPixel>;
 
 /// Geometry in the document's coordinate space (logical pixels).
-#[derive(Hash, Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Hash, Clone, Copy, Debug, Eq, MallocSizeOf, PartialEq, Ord, PartialOrd)]
 pub struct WorldPixel;
 
 pub type WorldRect = TypedRect<f32, WorldPixel>;
 pub type WorldPoint = TypedPoint2D<f32, WorldPixel>;
 pub type WorldSize = TypedSize2D<f32, WorldPixel>;
 pub type WorldPoint3D = TypedPoint3D<f32, WorldPixel>;
 pub type WorldVector2D = TypedVector2D<f32, WorldPixel>;
 pub type WorldVector3D = TypedVector3D<f32, WorldPixel>;
--- a/gfx/wr/wr_malloc_size_of/lib.rs
+++ b/gfx/wr/wr_malloc_size_of/lib.rs
@@ -98,23 +98,16 @@ pub trait MallocShallowSizeOf {
 }
 
 impl MallocSizeOf for String {
     fn size_of(&self, ops: &mut MallocSizeOfOps) -> usize {
         unsafe { ops.malloc_size_of(self.as_ptr()) }
     }
 }
 
-impl<'a, T: ?Sized> MallocSizeOf for &'a T {
-    fn size_of(&self, _ops: &mut MallocSizeOfOps) -> usize {
-        // Zero makes sense for a non-owning reference.
-        0
-    }
-}
-
 impl<T: ?Sized> MallocShallowSizeOf for Box<T> {
     fn shallow_size_of(&self, ops: &mut MallocSizeOfOps) -> usize {
         unsafe { ops.malloc_size_of(&**self) }
     }
 }
 
 impl<T: MallocSizeOf + ?Sized> MallocSizeOf for Box<T> {
     fn size_of(&self, ops: &mut MallocSizeOfOps) -> usize {