Bug 1369152 - Update webrender to cset b2614e4eb58f9dee08b8c38f96bc3bac834c837b. r=jrmuizel
author Kartikaya Gupta <kgupta@mozilla.com>
Mon, 05 Jun 2017 16:42:02 -0400
changeset 362457 b6fa08f52247f7a11c319001d39f3b87f52fe918
parent 362456 b6f04897fdda51e42612617a89a93f696edbdf92
child 362458 4f5e82dbe1b6beb010fbcd939176f43726cb3a42
push id 91086
push user archaeopteryx@coole-files.de
push date Tue, 06 Jun 2017 09:34:58 +0000
treeherder mozilla-inbound@278a95776dd6 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jrmuizel
bugs 1369152
milestone 55.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1369152 - Update webrender to cset b2614e4eb58f9dee08b8c38f96bc3bac834c837b. r=jrmuizel

MozReview-Commit-ID: 5PRr2dZLTZ4
gfx/doc/README.webrender
gfx/webrender/res/prim_shared.glsl
gfx/webrender/res/ps_angle_gradient.vs.glsl
gfx/webrender/res/ps_border_corner.vs.glsl
gfx/webrender/res/ps_border_edge.vs.glsl
gfx/webrender/res/ps_box_shadow.vs.glsl
gfx/webrender/res/ps_cache_image.vs.glsl
gfx/webrender/res/ps_gradient.vs.glsl
gfx/webrender/res/ps_image.vs.glsl
gfx/webrender/res/ps_radial_gradient.vs.glsl
gfx/webrender/res/ps_rectangle.vs.glsl
gfx/webrender/res/ps_text_run.vs.glsl
gfx/webrender/res/ps_yuv_image.vs.glsl
gfx/webrender/src/border.rs
gfx/webrender/src/device.rs
gfx/webrender/src/frame.rs
gfx/webrender/src/frame_builder.rs
gfx/webrender/src/glyph_rasterizer.rs
gfx/webrender/src/gpu_cache.rs
gfx/webrender/src/internal_types.rs
gfx/webrender/src/lib.rs
gfx/webrender/src/prim_store.rs
gfx/webrender/src/profiler.rs
gfx/webrender/src/render_backend.rs
gfx/webrender/src/renderer.rs
gfx/webrender/src/resource_cache.rs
gfx/webrender/src/tiling.rs
--- a/gfx/doc/README.webrender
+++ b/gfx/doc/README.webrender
@@ -74,9 +74,9 @@ there is another crate in m-c called moz
 the same folder to store its rust dependencies. If one of the libraries that is
 required by both mozjs_sys and webrender is updated without updating the other
 project's Cargo.lock file, that results in build bustage.
 This means that any time you do this sort of manual update of packages, you need
 to make sure that mozjs_sys also has its Cargo.lock file updated if needed, hence
 the need to run the cargo update command in js/src as well. Hopefully this will
 be resolved soon.
 
-Latest Commit: a54cc729259588dd1ff52c86d0c62cb2a1767137
+Latest Commit: b2614e4eb58f9dee08b8c38f96bc3bac834c837b
--- a/gfx/webrender/res/prim_shared.glsl
+++ b/gfx/webrender/res/prim_shared.glsl
@@ -79,26 +79,24 @@ RectWithSize to_rect_with_size(RectWithE
 vec2 clamp_rect(vec2 point, RectWithSize rect) {
     return clamp(point, rect.p0, rect.p0 + rect.size);
 }
 
 vec2 clamp_rect(vec2 point, RectWithEndpoint rect) {
     return clamp(point, rect.p0, rect.p1);
 }
 
-RectWithEndpoint intersect_rect(RectWithEndpoint a, RectWithEndpoint b) {
-    vec2 p0 = clamp_rect(a.p0, b);
-    vec2 p1 = clamp_rect(a.p1, b);
-    return RectWithEndpoint(p0, max(p0, p1));
+// Clamp 2 points at once.
+vec4 clamp_rect(vec4 points, RectWithSize rect) {
+    return clamp(points, rect.p0.xyxy, rect.p0.xyxy + rect.size.xyxy);
 }
 
 RectWithSize intersect_rect(RectWithSize a, RectWithSize b) {
-    RectWithEndpoint r = intersect_rect(to_rect_with_endpoint(a),
-                                        to_rect_with_endpoint(b));
-    return to_rect_with_size(r);
+    vec4 p = clamp_rect(vec4(a.p0, a.p0 + a.size), b);
+    return RectWithSize(p.xy, max(vec2(0.0), p.zw - p.xy));
 }
 
 float distance_to_line(vec2 p0, vec2 perp_dir, vec2 p) {
     vec2 dir_to_p0 = p0 - p;
     return dot(normalize(perp_dir), dir_to_p0);
 }
 
 // TODO: convert back to RectWithEndPoint if driver issues are resolved, if ever.
@@ -116,19 +114,18 @@ varying vec3 vClipMaskUv;
 #define VECS_PER_SPLIT_GEOM         3
 
 uniform sampler2D sLayers;
 uniform sampler2D sRenderTasks;
 uniform sampler2D sPrimGeometry;
 
 uniform sampler2D sData16;
 uniform sampler2D sData32;
-uniform sampler2D sData64;
-uniform sampler2D sData128;
 uniform sampler2D sResourceRects;
+uniform sampler2D sResourceCache;
 
 // Instanced attributes
 in ivec4 aData0;
 in ivec4 aData1;
 
 // get_fetch_uv is a macro to work around a macOS Intel driver parsing bug.
 // TODO: convert back to a function once the driver issues are resolved, if ever.
 // https://github.com/servo/webrender/pull/623
@@ -143,40 +140,65 @@ vec4 fetch_data_1(int index) {
 vec4[2] fetch_data_2(int index) {
     ivec2 uv = get_fetch_uv(index, 2);
     return vec4[2](
         texelFetchOffset(sData32, uv, 0, ivec2(0, 0)),
         texelFetchOffset(sData32, uv, 0, ivec2(1, 0))
     );
 }
 
-vec4[4] fetch_data_4(int index) {
-    ivec2 uv = get_fetch_uv(index, 4);
-    return vec4[4](
-        texelFetchOffset(sData64, uv, 0, ivec2(0, 0)),
-        texelFetchOffset(sData64, uv, 0, ivec2(1, 0)),
-        texelFetchOffset(sData64, uv, 0, ivec2(2, 0)),
-        texelFetchOffset(sData64, uv, 0, ivec2(3, 0))
+// TODO(gw): This is here temporarily while we have
+//           both GPU store and cache. When the GPU
+//           store code is removed, we can change the
+//           PrimitiveInstance instance structure to
+//           use 2x unsigned shorts as vertex attributes
+//           instead of an int, and encode the UV directly
+//           in the vertices.
+ivec2 get_resource_cache_uv(int address) {
+    return ivec2(address % WR_MAX_VERTEX_TEXTURE_WIDTH,
+                 address / WR_MAX_VERTEX_TEXTURE_WIDTH);
+}
+
+vec4[8] fetch_from_resource_cache_8(int address) {
+    ivec2 uv = get_resource_cache_uv(address);
+    return vec4[8](
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(0, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(1, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(2, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(3, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(4, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(5, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(6, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(7, 0))
     );
 }
 
-vec4[8] fetch_data_8(int index) {
-    ivec2 uv = get_fetch_uv(index, 8);
-    return vec4[8](
-        texelFetchOffset(sData128, uv, 0, ivec2(0, 0)),
-        texelFetchOffset(sData128, uv, 0, ivec2(1, 0)),
-        texelFetchOffset(sData128, uv, 0, ivec2(2, 0)),
-        texelFetchOffset(sData128, uv, 0, ivec2(3, 0)),
-        texelFetchOffset(sData128, uv, 0, ivec2(4, 0)),
-        texelFetchOffset(sData128, uv, 0, ivec2(5, 0)),
-        texelFetchOffset(sData128, uv, 0, ivec2(6, 0)),
-        texelFetchOffset(sData128, uv, 0, ivec2(7, 0))
+vec4[3] fetch_from_resource_cache_3(int address) {
+    ivec2 uv = get_resource_cache_uv(address);
+    return vec4[3](
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(0, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(1, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(2, 0))
     );
 }
 
+vec4[4] fetch_from_resource_cache_4(int address) {
+    ivec2 uv = get_resource_cache_uv(address);
+    return vec4[4](
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(0, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(1, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(2, 0)),
+        texelFetchOffset(sResourceCache, uv, 0, ivec2(3, 0))
+    );
+}
+
+vec4 fetch_from_resource_cache_1(int address) {
+    ivec2 uv = get_resource_cache_uv(address);
+    return texelFetch(sResourceCache, uv, 0);
+}
 
 struct Layer {
     mat4 transform;
     mat4 inv_transform;
     RectWithSize local_clip_rect;
 };
 
 Layer fetch_layer(int index) {
@@ -284,18 +306,18 @@ ClipArea fetch_clip_area(int index) {
 }
 
 struct Gradient {
     vec4 start_end_point;
     vec4 tile_size_repeat;
     vec4 extend_mode;
 };
 
-Gradient fetch_gradient(int index) {
-    vec4 data[4] = fetch_data_4(index);
+Gradient fetch_gradient(int address) {
+    vec4 data[3] = fetch_from_resource_cache_3(address);
     return Gradient(data[0], data[1], data[2]);
 }
 
 struct GradientStop {
     vec4 color;
     vec4 offset;
 };
 
@@ -305,18 +327,18 @@ GradientStop fetch_gradient_stop(int ind
 }
 
 struct RadialGradient {
     vec4 start_end_center;
     vec4 start_end_radius_ratio_xy_extend_mode;
     vec4 tile_size_repeat;
 };
 
-RadialGradient fetch_radial_gradient(int index) {
-    vec4 data[4] = fetch_data_4(index);
+RadialGradient fetch_radial_gradient(int address) {
+    vec4 data[3] = fetch_from_resource_cache_3(address);
     return RadialGradient(data[0], data[1], data[2]);
 }
 
 struct Border {
     vec4 style;
     vec4 widths;
     vec4 colors[4];
     vec4 radii[2];
@@ -339,18 +361,18 @@ vec4 get_effective_border_widths(Border 
         case BORDER_STYLE_GROOVE:
         case BORDER_STYLE_RIDGE:
             return floor(0.5 + border.widths * 0.5);
         default:
             return border.widths;
     }
 }
 
-Border fetch_border(int index) {
-    vec4 data[8] = fetch_data_8(index);
+Border fetch_border(int address) {
+    vec4 data[8] = fetch_from_resource_cache_8(address);
     return Border(data[0], data[1],
                   vec4[4](data[2], data[3], data[4], data[5]),
                   vec4[2](data[6], data[7]));
 }
 
 struct BorderCorners {
     vec2 tl_outer;
     vec2 tl_inner;
@@ -398,18 +420,18 @@ struct Glyph {
     vec4 offset;
 };
 
 Glyph fetch_glyph(int index) {
     vec4 data = fetch_data_1(index);
     return Glyph(data);
 }
 
-RectWithSize fetch_instance_geometry(int index) {
-    vec4 data = fetch_data_1(index);
+RectWithSize fetch_instance_geometry(int address) {
+    vec4 data = fetch_from_resource_cache_1(address);
     return RectWithSize(data.xy, data.zw);
 }
 
 struct PrimitiveGeometry {
     RectWithSize local_rect;
     RectWithSize local_clip_rect;
 };
 
@@ -551,76 +573,58 @@ vec4 get_layer_pos(vec2 pos, Layer layer
     // get a point on the layer plane
     vec4 ah = layer.transform * vec4(0.0, 0.0, 0.0, 1.0);
     vec3 a = ah.xyz / ah.w;
     // get the normal to the layer plane
     vec3 n = transpose(mat3(layer.inv_transform)) * vec3(0.0, 0.0, 1.0);
     return untransform(pos, n, a, layer.inv_transform);
 }
 
-// Compute a snapping offset in world space (adjusted to pixel ratio),
-// given local position on the layer and a snap rectangle.
-vec2 compute_snap_offset(vec2 local_pos,
-                         RectWithSize local_clip_rect,
-                         Layer layer,
-                         RectWithSize raw_snap_rect) {
-    // Clamp the snap rectangle.
-    RectWithSize snap_rect = intersect_rect(intersect_rect(raw_snap_rect, local_clip_rect),
-                                            layer.local_clip_rect);
-    // Transform the snap corners to the world space.
-    vec4 world_snap_p0 = layer.transform * vec4(snap_rect.p0, 0.0, 1.0);
-    vec4 world_snap_p1 = layer.transform * vec4(snap_rect.p0 + snap_rect.size, 0.0, 1.0);
-    // Snap bounds in world coordinates, adjusted for pixel ratio. XY = top left, ZW = bottom right
-    vec4 world_snap = uDevicePixelRatio * vec4(world_snap_p0.xy, world_snap_p1.xy) /
-                                          vec4(world_snap_p0.ww, world_snap_p1.ww);
-    /// World offsets applied to the corners of the snap rectangle.
-    vec4 snap_offsets = floor(world_snap + 0.5) - world_snap;
-
-    /// Compute the position of this vertex inside the snap rectangle.
-    vec2 normalized_snap_pos = (local_pos - snap_rect.p0) / snap_rect.size;
-    /// Compute the actual world offset for this vertex needed to make it snap.
-    return mix(snap_offsets.xy, snap_offsets.zw, normalized_snap_pos);
-}
-
 struct VertexInfo {
     vec2 local_pos;
     vec2 screen_pos;
 };
 
 VertexInfo write_vertex(RectWithSize instance_rect,
                         RectWithSize local_clip_rect,
                         float z,
                         Layer layer,
                         AlphaBatchTask task,
-                        RectWithSize snap_rect) {
-
+                        vec2 snap_ref) {
     // Select the corner of the local rect that we are processing.
     vec2 local_pos = instance_rect.p0 + instance_rect.size * aPosition.xy;
 
+    // xy = top left corner of the local rect, zw = position of current vertex.
+    vec4 local_p0_pos = vec4(snap_ref, local_pos);
+
     // Clamp to the two local clip rects.
-    vec2 clamped_local_pos = clamp_rect(clamp_rect(local_pos, local_clip_rect),
-                                        layer.local_clip_rect);
+    local_p0_pos = clamp_rect(local_p0_pos, local_clip_rect);
+    local_p0_pos = clamp_rect(local_p0_pos, layer.local_clip_rect);
 
-    /// Compute the snapping offset.
-    vec2 snap_offset = compute_snap_offset(clamped_local_pos, local_clip_rect, layer, snap_rect);
+    // Transform the top corner and current vertex to world space.
+    vec4 world_p0 = layer.transform * vec4(local_p0_pos.xy, 0.0, 1.0);
+    world_p0.xyz /= world_p0.w;
+    vec4 world_pos = layer.transform * vec4(local_p0_pos.zw, 0.0, 1.0);
+    world_pos.xyz /= world_pos.w;
 
-    // Transform the current vertex to the world cpace.
-    vec4 world_pos = layer.transform * vec4(clamped_local_pos, 0.0, 1.0);
+    // Convert the world positions to device pixel space. xy=top left corner. zw=current vertex.
+    vec4 device_p0_pos = vec4(world_p0.xy, world_pos.xy) * uDevicePixelRatio;
 
-    // Convert the world positions to device pixel space.
-    vec2 device_pos = world_pos.xy / world_pos.w * uDevicePixelRatio;
+    // Calculate the distance to snap the vertex by (snap top left corner).
+    vec2 snap_delta = device_p0_pos.xy - floor(device_p0_pos.xy + 0.5);
 
     // Apply offsets for the render task to get correct screen location.
-    vec2 final_pos = device_pos + snap_offset -
+    vec2 final_pos = device_p0_pos.zw -
+                     snap_delta -
                      task.screen_space_origin +
                      task.render_target_origin;
 
     gl_Position = uTransform * vec4(final_pos, z, 1.0);
 
-    VertexInfo vi = VertexInfo(clamped_local_pos, device_pos);
+    VertexInfo vi = VertexInfo(local_p0_pos.zw, device_p0_pos.zw);
     return vi;
 }
 
 #ifdef WR_FEATURE_TRANSFORM
 
 struct TransformVertexInfo {
     vec3 local_pos;
     vec2 screen_pos;
@@ -645,17 +649,17 @@ vec2 intersect_lines(vec2 p0, vec2 p1, v
     return vec2(nx / d, ny / d);
 }
 
 TransformVertexInfo write_transform_vertex(RectWithSize instance_rect,
                                            RectWithSize local_clip_rect,
                                            float z,
                                            Layer layer,
                                            AlphaBatchTask task,
-                                           RectWithSize snap_rect) {
+                                           vec2 snap_ref) {
     RectWithEndpoint local_rect = to_rect_with_endpoint(instance_rect);
 
     vec2 current_local_pos, prev_local_pos, next_local_pos;
 
     // Select the current vertex and the previous/next vertices,
     // based on the vertex ID that is known based on the instance rect.
     switch (gl_VertexID) {
         case 0:
@@ -704,31 +708,33 @@ TransformVertexInfo write_transform_vert
     vec2 adjusted_next_p1 = next_device_pos + norm_next * amount;
 
     // Intersect those adjusted lines to find the actual vertex position.
     vec2 device_pos = intersect_lines(adjusted_prev_p0,
                                       adjusted_prev_p1,
                                       adjusted_next_p0,
                                       adjusted_next_p1);
 
-    vec4 layer_pos = get_layer_pos(device_pos / uDevicePixelRatio, layer);
-
-    /// Compute the snapping offset.
-    vec2 snap_offset = compute_snap_offset(layer_pos.xy / layer_pos.w,
-                                           local_clip_rect, layer, snap_rect);
+    // Calculate the snap amount based on the first vertex as a reference point.
+    vec4 world_p0 = layer.transform * vec4(snap_ref, 0.0, 1.0);
+    vec2 device_p0 = uDevicePixelRatio * world_p0.xy / world_p0.w;
+    vec2 snap_delta = device_p0 - floor(device_p0 + 0.5);
 
     // Apply offsets for the render task to get correct screen location.
-    vec2 final_pos = device_pos + snap_offset -
+    vec2 final_pos = device_pos -
+                     snap_delta -
                      task.screen_space_origin +
                      task.render_target_origin;
 
     gl_Position = uTransform * vec4(final_pos, z, 1.0);
 
     vLocalBounds = vec4(local_rect.p0, local_rect.p1);
 
+    vec4 layer_pos = get_layer_pos(device_pos / uDevicePixelRatio, layer);
+
     return TransformVertexInfo(layer_pos.xyw, device_pos);
 }
 
 #endif //WR_FEATURE_TRANSFORM
 
 struct ResourceRect {
     vec4 uv_rect;
 };
@@ -742,58 +748,58 @@ ResourceRect fetch_resource_rect(int ind
 
     return rect;
 }
 
 struct Rectangle {
     vec4 color;
 };
 
-Rectangle fetch_rectangle(int index) {
-    vec4 data = fetch_data_1(index);
+Rectangle fetch_rectangle(int address) {
+    vec4 data = fetch_from_resource_cache_1(address);
     return Rectangle(data);
 }
 
 struct TextRun {
     vec4 color;
 };
 
-TextRun fetch_text_run(int index) {
-    vec4 data = fetch_data_1(index);
+TextRun fetch_text_run(int address) {
+    vec4 data = fetch_from_resource_cache_1(address);
     return TextRun(data);
 }
 
 struct Image {
     vec4 stretch_size_and_tile_spacing;  // Size of the actual image and amount of space between
                                          //     tiled instances of this image.
 };
 
-Image fetch_image(int index) {
-    vec4 data = fetch_data_1(index);
+Image fetch_image(int address) {
+    vec4 data = fetch_from_resource_cache_1(address);
     return Image(data);
 }
 
 struct YuvImage {
     vec2 size;
 };
 
-YuvImage fetch_yuv_image(int index) {
-    vec4 data = fetch_data_1(index);
+YuvImage fetch_yuv_image(int address) {
+    vec4 data = fetch_from_resource_cache_1(address);
     return YuvImage(data.xy);
 }
 
 struct BoxShadow {
     vec4 src_rect;
     vec4 bs_rect;
     vec4 color;
     vec4 border_radius_edge_size_blur_radius_inverted;
 };
 
-BoxShadow fetch_boxshadow(int index) {
-    vec4 data[4] = fetch_data_4(index);
+BoxShadow fetch_boxshadow(int address) {
+    vec4 data[4] = fetch_from_resource_cache_4(address);
     return BoxShadow(data[0], data[1], data[2], data[3]);
 }
 
 void write_clip(vec2 global_pos, ClipArea area) {
     vec2 texture_size = vec2(textureSize(sCacheA8, 0).xy);
     vec2 uv = global_pos + area.task_bounds.xy - area.screen_origin_target_index.xy;
     vClipMaskUvBounds = area.task_bounds / texture_size.xyxy;
     vClipMaskUv = vec3(uv / texture_size, area.screen_origin_target_index.z);
--- a/gfx/webrender/res/ps_angle_gradient.vs.glsl
+++ b/gfx/webrender/res/ps_angle_gradient.vs.glsl
@@ -7,17 +7,17 @@ void main(void) {
     Primitive prim = load_primitive();
     Gradient gradient = fetch_gradient(prim.prim_index);
 
     VertexInfo vi = write_vertex(prim.local_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 prim.local_rect);
+                                 prim.local_rect.p0);
 
     vPos = vi.local_pos - prim.local_rect.p0;
 
     vec2 start_point = gradient.start_end_point.xy;
     vec2 end_point = gradient.start_end_point.zw;
     vec2 dir = end_point - start_point;
 
     vStartPoint = start_point;
--- a/gfx/webrender/res/ps_border_corner.vs.glsl
+++ b/gfx/webrender/res/ps_border_corner.vs.glsl
@@ -266,21 +266,21 @@ void main(void) {
     segment_rect.size = p1 - p0;
 
 #ifdef WR_FEATURE_TRANSFORM
     TransformVertexInfo vi = write_transform_vertex(segment_rect,
                                                     prim.local_clip_rect,
                                                     prim.z,
                                                     prim.layer,
                                                     prim.task,
-                                                    prim.local_rect);
+                                                    prim.local_rect.p0);
 #else
     VertexInfo vi = write_vertex(segment_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 prim.local_rect);
+                                 prim.local_rect.p0);
 #endif
 
     vLocalPos = vi.local_pos;
     write_clip(vi.screen_pos, prim.clip_area);
 }
--- a/gfx/webrender/res/ps_border_edge.vs.glsl
+++ b/gfx/webrender/res/ps_border_edge.vs.glsl
@@ -179,21 +179,21 @@ void main(void) {
     write_color(color, style, color_flip);
 
 #ifdef WR_FEATURE_TRANSFORM
     TransformVertexInfo vi = write_transform_vertex(segment_rect,
                                                     prim.local_clip_rect,
                                                     prim.z,
                                                     prim.layer,
                                                     prim.task,
-                                                    prim.local_rect);
+                                                    prim.local_rect.p0);
 #else
     VertexInfo vi = write_vertex(segment_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 prim.local_rect);
+                                 prim.local_rect.p0);
 #endif
 
     vLocalPos = vi.local_pos;
     write_clip(vi.screen_pos, prim.clip_area);
 }
--- a/gfx/webrender/res/ps_box_shadow.vs.glsl
+++ b/gfx/webrender/res/ps_box_shadow.vs.glsl
@@ -1,24 +1,26 @@
 #line 1
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
+#define BS_HEADER_VECS 4
+
 void main(void) {
     Primitive prim = load_primitive();
     BoxShadow bs = fetch_boxshadow(prim.prim_index);
-    RectWithSize segment_rect = fetch_instance_geometry(prim.user_data0);
+    RectWithSize segment_rect = fetch_instance_geometry(prim.prim_index + BS_HEADER_VECS + prim.user_data0);
 
     VertexInfo vi = write_vertex(segment_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 prim.local_rect);
+                                 prim.local_rect.p0);
 
     RenderTaskData child_task = fetch_render_task(prim.user_data1);
     vUv.z = child_task.data1.x;
 
     // Constant offsets to inset from bilinear filtering border.
     vec2 patch_origin = child_task.data0.xy + vec2(1.0);
     vec2 patch_size_device_pixels = child_task.data0.zw - vec2(2.0);
     vec2 patch_size = patch_size_device_pixels / uDevicePixelRatio;
--- a/gfx/webrender/res/ps_cache_image.vs.glsl
+++ b/gfx/webrender/res/ps_cache_image.vs.glsl
@@ -9,17 +9,17 @@
 void main(void) {
     Primitive prim = load_primitive();
 
     VertexInfo vi = write_vertex(prim.local_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 prim.local_rect);
+                                 prim.local_rect.p0);
 
     RenderTaskData child_task = fetch_render_task(prim.user_data1);
     vUv.z = child_task.data1.x;
 
     vec2 texture_size = vec2(textureSize(sCacheRGBA8, 0));
     vec2 uv0 = child_task.data0.xy / texture_size;
     vec2 uv1 = (child_task.data0.xy + child_task.data0.zw) / texture_size;
 
--- a/gfx/webrender/res/ps_gradient.vs.glsl
+++ b/gfx/webrender/res/ps_gradient.vs.glsl
@@ -57,26 +57,26 @@ void main(void) {
     }
 
 #ifdef WR_FEATURE_TRANSFORM
     TransformVertexInfo vi = write_transform_vertex(segment_rect,
                                                     prim.local_clip_rect,
                                                     prim.z,
                                                     prim.layer,
                                                     prim.task,
-                                                    prim.local_rect);
+                                                    prim.local_rect.p0);
     vLocalPos = vi.local_pos;
     vec2 f = (vi.local_pos.xy - prim.local_rect.p0) / prim.local_rect.size;
 #else
     VertexInfo vi = write_vertex(segment_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 prim.local_rect);
+                                 prim.local_rect.p0);
 
     vec2 f = (vi.local_pos - segment_rect.p0) / segment_rect.size;
     vPos = vi.local_pos;
 #endif
 
     write_clip(vi.screen_pos, prim.clip_area);
 
     vColor = mix(adjusted_color_g0, adjusted_color_g1, dot(f, axis));
--- a/gfx/webrender/res/ps_image.vs.glsl
+++ b/gfx/webrender/res/ps_image.vs.glsl
@@ -9,25 +9,25 @@ void main(void) {
     ResourceRect res = fetch_resource_rect(prim.user_data0);
 
 #ifdef WR_FEATURE_TRANSFORM
     TransformVertexInfo vi = write_transform_vertex(prim.local_rect,
                                                     prim.local_clip_rect,
                                                     prim.z,
                                                     prim.layer,
                                                     prim.task,
-                                                    prim.local_rect);
+                                                    prim.local_rect.p0);
     vLocalPos = vi.local_pos;
 #else
     VertexInfo vi = write_vertex(prim.local_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 prim.local_rect);
+                                 prim.local_rect.p0);
     vLocalPos = vi.local_pos - prim.local_rect.p0;
 #endif
 
     write_clip(vi.screen_pos, prim.clip_area);
 
     // If this is in WR_FEATURE_TEXTURE_RECT mode, the rect and size use
     // non-normalized texture coordinates.
 #ifdef WR_FEATURE_TEXTURE_RECT
--- a/gfx/webrender/res/ps_radial_gradient.vs.glsl
+++ b/gfx/webrender/res/ps_radial_gradient.vs.glsl
@@ -7,17 +7,17 @@ void main(void) {
     Primitive prim = load_primitive();
     RadialGradient gradient = fetch_radial_gradient(prim.prim_index);
 
     VertexInfo vi = write_vertex(prim.local_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 prim.local_rect);
+                                 prim.local_rect.p0);
 
     vPos = vi.local_pos - prim.local_rect.p0;
 
     vStartCenter = gradient.start_end_center.xy;
     vEndCenter = gradient.start_end_center.zw;
 
     vStartRadius = gradient.start_end_radius_ratio_xy_extend_mode.x;
     vEndRadius = gradient.start_end_radius_ratio_xy_extend_mode.y;
--- a/gfx/webrender/res/ps_rectangle.vs.glsl
+++ b/gfx/webrender/res/ps_rectangle.vs.glsl
@@ -8,23 +8,23 @@ void main(void) {
     Rectangle rect = fetch_rectangle(prim.prim_index);
     vColor = rect.color;
 #ifdef WR_FEATURE_TRANSFORM
     TransformVertexInfo vi = write_transform_vertex(prim.local_rect,
                                                     prim.local_clip_rect,
                                                     prim.z,
                                                     prim.layer,
                                                     prim.task,
-                                                    prim.local_rect);
+                                                    prim.local_rect.p0);
     vLocalPos = vi.local_pos;
 #else
     VertexInfo vi = write_vertex(prim.local_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 prim.local_rect);
+                                 prim.local_rect.p0);
 #endif
 
 #ifdef WR_FEATURE_CLIP
     write_clip(vi.screen_pos, prim.clip_area);
 #endif
 }
--- a/gfx/webrender/res/ps_text_run.vs.glsl
+++ b/gfx/webrender/res/ps_text_run.vs.glsl
@@ -13,26 +13,26 @@ void main(void) {
                                            (res.uv_rect.zw - res.uv_rect.xy) / uDevicePixelRatio);
 
 #ifdef WR_FEATURE_TRANSFORM
     TransformVertexInfo vi = write_transform_vertex(local_rect,
                                                     prim.local_clip_rect,
                                                     prim.z,
                                                     prim.layer,
                                                     prim.task,
-                                                    local_rect);
+                                                    local_rect.p0);
     vLocalPos = vi.local_pos;
     vec2 f = (vi.local_pos.xy / vi.local_pos.z - local_rect.p0) / local_rect.size;
 #else
     VertexInfo vi = write_vertex(local_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 local_rect);
+                                 local_rect.p0);
     vec2 f = (vi.local_pos - local_rect.p0) / local_rect.size;
 #endif
 
     write_clip(vi.screen_pos, prim.clip_area);
 
     vec2 texture_size = vec2(textureSize(sColor0, 0));
     vec2 st0 = res.uv_rect.xy / texture_size;
     vec2 st1 = res.uv_rect.zw / texture_size;
--- a/gfx/webrender/res/ps_yuv_image.vs.glsl
+++ b/gfx/webrender/res/ps_yuv_image.vs.glsl
@@ -6,25 +6,25 @@
 void main(void) {
     Primitive prim = load_primitive();
 #ifdef WR_FEATURE_TRANSFORM
     TransformVertexInfo vi = write_transform_vertex(prim.local_rect,
                                                     prim.local_clip_rect,
                                                     prim.z,
                                                     prim.layer,
                                                     prim.task,
-                                                    prim.local_rect);
+                                                    prim.local_rect.p0);
     vLocalPos = vi.local_pos;
 #else
     VertexInfo vi = write_vertex(prim.local_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
-                                 prim.local_rect);
+                                 prim.local_rect.p0);
     vLocalPos = vi.local_pos - prim.local_rect.p0;
 #endif
 
     write_clip(vi.screen_pos, prim.clip_area);
 
     ResourceRect y_rect = fetch_resource_rect(prim.user_data0);
 #ifndef WR_FEATURE_INTERLEAVED_Y_CB_CR  // only 1 channel
     ResourceRect u_rect = fetch_resource_rect(prim.user_data0 + 1);
--- a/gfx/webrender/src/border.rs
+++ b/gfx/webrender/src/border.rs
@@ -1,16 +1,16 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use ellipse::Ellipse;
 use frame_builder::FrameBuilder;
 use mask_cache::{ClipSource};
-use prim_store::{BorderPrimitiveCpu, BorderPrimitiveGpu, GpuBlock32, PrimitiveContainer};
+use prim_store::{BorderPrimitiveCpu, GpuBlock32, PrimitiveContainer};
 use tiling::PrimitiveFlags;
 use util::{lerp, pack_as_float};
 use webrender_traits::{BorderSide, BorderStyle, BorderWidths, ClipAndScrollInfo, ClipRegion};
 use webrender_traits::{ColorF, LayerPoint, LayerRect, LayerSize, NormalBorder};
 
 #[repr(u8)]
 #[derive(Debug, Copy, Clone, PartialEq)]
 pub enum BorderCornerInstance {
@@ -237,43 +237,49 @@ impl FrameBuilder {
         // These colors are used during inset/outset scaling.
         let left_color      = left.border_color(1.0, 2.0/3.0, 0.3, 0.7);
         let top_color       = top.border_color(1.0, 2.0/3.0, 0.3, 0.7);
         let right_color     = right.border_color(2.0/3.0, 1.0, 0.7, 0.3);
         let bottom_color    = bottom.border_color(2.0/3.0, 1.0, 0.7, 0.3);
 
         let prim_cpu = BorderPrimitiveCpu {
             corner_instances: corner_instances,
-        };
 
-        let prim_gpu = BorderPrimitiveGpu {
-            colors: [ left_color, top_color, right_color, bottom_color ],
-            widths: [ widths.left,
-                      widths.top,
-                      widths.right,
-                      widths.bottom ],
-            style: [
-                pack_as_float(left.style as u32),
-                pack_as_float(top.style as u32),
-                pack_as_float(right.style as u32),
-                pack_as_float(bottom.style as u32),
-            ],
-            radii: [
-                radius.top_left,
-                radius.top_right,
-                radius.bottom_right,
-                radius.bottom_left,
+            // TODO(gw): In the future, we will build these on demand
+            //           from the deserialized display list, rather
+            //           than creating them immediately.
+            gpu_blocks: [
+                [ pack_as_float(left.style as u32),
+                  pack_as_float(top.style as u32),
+                  pack_as_float(right.style as u32),
+                  pack_as_float(bottom.style as u32) ].into(),
+                [ widths.left,
+                  widths.top,
+                  widths.right,
+                  widths.bottom ].into(),
+                left_color.into(),
+                top_color.into(),
+                right_color.into(),
+                bottom_color.into(),
+                [ radius.top_left.width,
+                  radius.top_left.height,
+                  radius.top_right.width,
+                  radius.top_right.height ].into(),
+                [ radius.bottom_right.width,
+                  radius.bottom_right.height,
+                  radius.bottom_left.width,
+                  radius.bottom_left.height ].into(),
             ],
         };
 
         self.add_primitive(clip_and_scroll,
                            &rect,
                            clip_region,
                            extra_clips,
-                           PrimitiveContainer::Border(prim_cpu, prim_gpu));
+                           PrimitiveContainer::Border(prim_cpu));
     }
 
     // TODO(gw): This allows us to move border types over to the
     // simplified shader model one at a time. Once all borders
     // are converted, this can be removed, along with the complex
     // border code path.
     pub fn add_normal_border(&mut self,
                              rect: &LayerRect,
--- a/gfx/webrender/src/device.rs
+++ b/gfx/webrender/src/device.rs
@@ -11,26 +11,41 @@ use internal_types::{DebugFontVertex, De
 //use notify::{self, Watcher};
 use super::shader_source;
 use std::collections::HashMap;
 use std::fs::File;
 use std::hash::BuildHasherDefault;
 use std::io::Read;
 use std::iter::repeat;
 use std::mem;
+use std::ops::Add;
 use std::path::PathBuf;
 use std::rc::Rc;
 //use std::sync::mpsc::{channel, Sender};
 //use std::thread;
 use webrender_traits::{ColorF, ImageFormat};
 use webrender_traits::{DeviceIntPoint, DeviceIntRect, DeviceIntSize, DeviceUintSize};
 
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Copy, Clone, PartialEq, Ord, Eq, PartialOrd)]
 pub struct FrameId(usize);
 
+impl FrameId {
+    pub fn new(value: usize) -> FrameId {
+        FrameId(value)
+    }
+}
+
+impl Add<usize> for FrameId {
+    type Output = FrameId;
+
+    fn add(self, other: usize) -> FrameId {
+        FrameId(self.0 + other)
+    }
+}
+
 #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))]
 const GL_FORMAT_A: gl::GLuint = gl::RED;
 
 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
 const GL_FORMAT_A: gl::GLuint = gl::ALPHA;
 
 const GL_FORMAT_BGRA_GL: gl::GLuint = gl::BGRA;
 
@@ -486,65 +501,71 @@ pub struct GpuSample<T> {
 
 pub struct GpuFrameProfile<T> {
     gl: Rc<gl::Gl>,
     queries: Vec<gl::GLuint>,
     samples: Vec<GpuSample<T>>,
     next_query: usize,
     pending_query: gl::GLuint,
     frame_id: FrameId,
+    inside_frame: bool,
 }
 
 impl<T> GpuFrameProfile<T> {
-    fn new(gl: Rc<gl::Gl>) -> GpuFrameProfile<T> {
+    fn new(gl: Rc<gl::Gl>) -> Self {
         match gl.get_type() {
             gl::GlType::Gl => {
                 let queries = gl.gen_queries(MAX_EVENTS_PER_FRAME as gl::GLint);
                 GpuFrameProfile {
                     gl: gl,
                     queries: queries,
                     samples: Vec::new(),
                     next_query: 0,
                     pending_query: 0,
                     frame_id: FrameId(0),
+                    inside_frame: false,
                 }
             }
             gl::GlType::Gles => {
                 GpuFrameProfile {
                     gl: gl,
                     queries: Vec::new(),
                     samples: Vec::new(),
                     next_query: 0,
                     pending_query: 0,
                     frame_id: FrameId(0),
+                    inside_frame: false,
                 }
             }
         }
     }
 
     fn begin_frame(&mut self, frame_id: FrameId) {
         self.frame_id = frame_id;
         self.next_query = 0;
         self.pending_query = 0;
         self.samples.clear();
+        self.inside_frame = true;
     }
 
     fn end_frame(&mut self) {
+        self.inside_frame = false;
         match self.gl.get_type() {
             gl::GlType::Gl => {
                 if self.pending_query != 0 {
                     self.gl.end_query(gl::TIME_ELAPSED);
                 }
             }
             gl::GlType::Gles => {},
         }
     }
 
     fn add_marker(&mut self, tag: T) -> GpuMarker
     where T: NamedTag {
+        debug_assert!(self.inside_frame);
         match self.gl.get_type() {
             gl::GlType::Gl => {
                 self.add_marker_gl(tag)
             }
             gl::GlType::Gles => {
                 self.add_marker_gles(tag)
             }
         }
@@ -583,16 +604,17 @@ impl<T> GpuFrameProfile<T> {
         marker
     }
 
     fn is_valid(&self) -> bool {
         self.next_query > 0 && self.next_query <= MAX_EVENTS_PER_FRAME
     }
 
     fn build_samples(&mut self) -> Vec<GpuSample<T>> {
+        debug_assert!(!self.inside_frame);
         match self.gl.get_type() {
             gl::GlType::Gl => {
                 self.build_samples_gl()
             }
             gl::GlType::Gles => {
                 self.build_samples_gles()
             }
         }
@@ -1542,24 +1564,19 @@ impl Device {
             self.gl.uniform_1i(u_data16, TextureSampler::Data16 as i32);
         }
 
         let u_data32 = self.gl.get_uniform_location(program.id, "sData32");
         if u_data32 != -1 {
             self.gl.uniform_1i(u_data32, TextureSampler::Data32 as i32);
         }
 
-        let u_data64 = self.gl.get_uniform_location(program.id, "sData64");
-        if u_data64 != -1 {
-            self.gl.uniform_1i(u_data64, TextureSampler::Data64 as i32);
-        }
-
-        let u_data128 = self.gl.get_uniform_location(program.id, "sData128");
-        if u_data128 != -1 {
-            self.gl.uniform_1i(u_data128, TextureSampler::Data128    as i32);
+        let u_resource_cache = self.gl.get_uniform_location(program.id, "sResourceCache");
+        if u_resource_cache != -1 {
+            self.gl.uniform_1i(u_resource_cache, TextureSampler::ResourceCache as i32);
         }
 
         let u_resource_rects = self.gl.get_uniform_location(program.id, "sResourceRects");
         if u_resource_rects != -1 {
             self.gl.uniform_1i(u_resource_rects, TextureSampler::ResourceRects as i32);
         }
 
         let u_gradients = self.gl.get_uniform_location(program.id, "sGradients");
@@ -1629,58 +1646,42 @@ impl Device {
                     device_pixel_ratio: f32) {
         debug_assert!(self.inside_frame);
         self.gl.uniform_matrix_4fv(program.u_transform,
                                false,
                                &transform.to_row_major_array());
         self.gl.uniform_1f(program.u_device_pixel_ratio, device_pixel_ratio);
     }
 
-    fn update_image_for_2d_texture(&mut self,
-                                   target: gl::GLuint,
-                                   x0: gl::GLint,
-                                   y0: gl::GLint,
-                                   width: gl::GLint,
-                                   height: gl::GLint,
-                                   format: gl::GLuint,
-                                   data: &[u8]) {
-        self.gl.tex_sub_image_2d(target,
-                                  0,
-                                  x0, y0,
-                                  width, height,
-                                  format,
-                                  gl::UNSIGNED_BYTE,
-                                  data);
-    }
-
     pub fn update_texture(&mut self,
                           texture_id: TextureId,
                           x0: u32,
                           y0: u32,
                           width: u32,
                           height: u32,
                           stride: Option<u32>,
                           data: &[u8]) {
         debug_assert!(self.inside_frame);
 
         let mut expanded_data = Vec::new();
 
-        let (gl_format, bpp, data) = match self.textures.get(&texture_id).unwrap().format {
+        let (gl_format, bpp, data, data_type) = match self.textures.get(&texture_id).unwrap().format {
             ImageFormat::A8 => {
                 if cfg!(any(target_arch="arm", target_arch="aarch64")) {
                     expanded_data.extend(data.iter().flat_map(|byte| repeat(*byte).take(4)));
-                    (get_gl_format_bgra(self.gl()), 4, expanded_data.as_slice())
+                    (get_gl_format_bgra(self.gl()), 4, expanded_data.as_slice(), gl::UNSIGNED_BYTE)
                 } else {
-                    (GL_FORMAT_A, 1, data)
+                    (GL_FORMAT_A, 1, data, gl::UNSIGNED_BYTE)
                 }
             }
-            ImageFormat::RGB8 => (gl::RGB, 3, data),
-            ImageFormat::RGBA8 => (get_gl_format_bgra(self.gl()), 4, data),
-            ImageFormat::RG8 => (gl::RG, 2, data),
-            ImageFormat::Invalid | ImageFormat::RGBAF32 => unreachable!(),
+            ImageFormat::RGB8 => (gl::RGB, 3, data, gl::UNSIGNED_BYTE),
+            ImageFormat::RGBA8 => (get_gl_format_bgra(self.gl()), 4, data, gl::UNSIGNED_BYTE),
+            ImageFormat::RG8 => (gl::RG, 2, data, gl::UNSIGNED_BYTE),
+            ImageFormat::RGBAF32 => (gl::RGBA, 16, data, gl::FLOAT),
+            ImageFormat::Invalid => unreachable!(),
         };
 
         let row_length = match stride {
             Some(value) => value / bpp,
             None => width,
         };
 
         // Take the stride into account for all rows, except the last one.
@@ -1688,23 +1689,26 @@ impl Device {
                 + width * bpp;
         let data = &data[0..len as usize];
 
         if let Some(..) = stride {
             self.gl.pixel_store_i(gl::UNPACK_ROW_LENGTH, row_length as gl::GLint);
         }
 
         self.bind_texture(DEFAULT_TEXTURE, texture_id);
-        self.update_image_for_2d_texture(texture_id.target,
-                                         x0 as gl::GLint,
-                                         y0 as gl::GLint,
-                                         width as gl::GLint,
-                                         height as gl::GLint,
-                                         gl_format,
-                                         data);
+
+        self.gl.tex_sub_image_2d(texture_id.target,
+                                 0,
+                                 x0 as gl::GLint,
+                                 y0 as gl::GLint,
+                                 width as gl::GLint,
+                                 height as gl::GLint,
+                                 gl_format,
+                                 data_type,
+                                 data);
 
         // Reset row length to 0, otherwise the stride would apply to all texture uploads.
         if let Some(..) = stride {
             self.gl.pixel_store_i(gl::UNPACK_ROW_LENGTH, 0 as gl::GLint);
         }
     }
 
     fn clear_vertex_array(&mut self) {
--- a/gfx/webrender/src/frame.rs
+++ b/gfx/webrender/src/frame.rs
@@ -5,17 +5,17 @@
 use app_units::Au;
 use euclid::rect::rect;
 use fnv::FnvHasher;
 use internal_types::{ANGLE_FLOAT_TO_FIXED, AxisDirection};
 use internal_types::{LowLevelFilterOp};
 use internal_types::{RendererFrame};
 use frame_builder::{FrameBuilder, FrameBuilderConfig};
 use clip_scroll_tree::{ClipScrollTree, ScrollStates};
-use profiler::TextureCacheProfileCounters;
+use profiler::{GpuCacheProfileCounters, TextureCacheProfileCounters};
 use resource_cache::ResourceCache;
 use scene::{Scene, SceneProperties};
 use std::cmp;
 use std::collections::HashMap;
 use std::hash::BuildHasherDefault;
 use tiling::{CompositeOps, DisplayListMap, PrimitiveFlags};
 use util::{ComplexClipRegionHelpers, subtract_rect};
 use webrender_traits::{BuiltDisplayList, BuiltDisplayListIter, ClipAndScrollInfo, ClipDisplayItem};
@@ -966,43 +966,47 @@ impl Frame {
         }
     }
 
     pub fn build(&mut self,
                  resource_cache: &mut ResourceCache,
                  display_lists: &DisplayListMap,
                  device_pixel_ratio: f32,
                  pan: LayerPoint,
-                 texture_cache_profile: &mut TextureCacheProfileCounters)
+                 texture_cache_profile: &mut TextureCacheProfileCounters,
+                 gpu_cache_profile: &mut GpuCacheProfileCounters)
                  -> RendererFrame {
         self.clip_scroll_tree.update_all_node_transforms(pan);
         let frame = self.build_frame(resource_cache,
                                      display_lists,
                                      device_pixel_ratio,
-                                     texture_cache_profile);
+                                     texture_cache_profile,
+                                     gpu_cache_profile);
         // Expire any resources that haven't been used for `cache_expiry_frames`.
         let num_frames_back = self.frame_builder_config.cache_expiry_frames;
         let expiry_frame = FrameId(cmp::max(num_frames_back, self.id.0) - num_frames_back);
         resource_cache.expire_old_resources(expiry_frame);
         frame
     }
 
     fn build_frame(&mut self,
                    resource_cache: &mut ResourceCache,
                    display_lists: &DisplayListMap,
                    device_pixel_ratio: f32,
-                   texture_cache_profile: &mut TextureCacheProfileCounters)
+                   texture_cache_profile: &mut TextureCacheProfileCounters,
+                   gpu_cache_profile: &mut GpuCacheProfileCounters)
                    -> RendererFrame {
         let mut frame_builder = self.frame_builder.take();
         let frame = frame_builder.as_mut().map(|builder|
             builder.build(resource_cache,
                           self.id,
                           &mut self.clip_scroll_tree,
                           display_lists,
                           device_pixel_ratio,
-                          texture_cache_profile)
+                          texture_cache_profile,
+                          gpu_cache_profile)
         );
         self.frame_builder = frame_builder;
 
         let nodes_bouncing_back = self.clip_scroll_tree.collect_nodes_bouncing_back();
         RendererFrame::new(self.pipeline_epoch_map.clone(), nodes_bouncing_back, frame)
     }
 }
--- a/gfx/webrender/src/frame_builder.rs
+++ b/gfx/webrender/src/frame_builder.rs
@@ -3,22 +3,22 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use app_units::Au;
 use frame::FrameId;
 use gpu_store::GpuStoreAddress;
 use internal_types::{HardwareCompositeOp, SourceTexture};
 use mask_cache::{ClipMode, ClipSource, MaskCacheInfo, RegionMode};
 use plane_split::{BspSplitter, Polygon, Splitter};
-use prim_store::{GradientPrimitiveCpu, GradientPrimitiveGpu, ImagePrimitiveCpu, ImagePrimitiveGpu};
+use prim_store::{GradientPrimitiveCpu, ImagePrimitiveCpu};
 use prim_store::{ImagePrimitiveKind, PrimitiveContainer, PrimitiveGeometry, PrimitiveIndex};
-use prim_store::{PrimitiveStore, RadialGradientPrimitiveCpu, RadialGradientPrimitiveGpu};
-use prim_store::{RectanglePrimitive, SplitGeometry, TextRunPrimitiveCpu, TextRunPrimitiveGpu};
-use prim_store::{BoxShadowPrimitiveGpu, TexelRect, YuvImagePrimitiveCpu, YuvImagePrimitiveGpu};
-use profiler::{FrameProfileCounters, TextureCacheProfileCounters};
+use prim_store::{PrimitiveStore, RadialGradientPrimitiveCpu};
+use prim_store::{RectanglePrimitive, SplitGeometry, TextRunPrimitiveCpu};
+use prim_store::{BoxShadowPrimitiveCpu, TexelRect, YuvImagePrimitiveCpu};
+use profiler::{FrameProfileCounters, GpuCacheProfileCounters, TextureCacheProfileCounters};
 use render_task::{AlphaRenderItem, MaskCacheKey, MaskResult, RenderTask, RenderTaskIndex};
 use render_task::{RenderTaskId, RenderTaskLocation};
 use resource_cache::ResourceCache;
 use clip_scroll_node::{ClipInfo, ClipScrollNode, NodeType};
 use clip_scroll_tree::ClipScrollTree;
 use std::{cmp, f32, i32, mem, usize};
 use std::collections::HashMap;
 use euclid::{SideOffsets2D, TypedPoint3D};
@@ -645,46 +645,44 @@ impl FrameBuilder {
         // just designate the reference orientation as start < end. Aligned gradient rendering
         // manages to produce the same result regardless of orientation, so don't worry about
         // reversing in that case.
         let reverse_stops = !aligned &&
                             (start_point.x > end_point.x ||
                              (start_point.x == end_point.x &&
                               start_point.y > end_point.y));
 
-        let gradient_cpu = GradientPrimitiveCpu {
-            stops_range: stops,
-            stops_count: stops_count,
-            extend_mode: extend_mode,
-            reverse_stops: reverse_stops,
-            cache_dirty: true,
-        };
-
         // To get reftests exactly matching with reverse start/end
         // points, it's necessary to reverse the gradient
         // line in some cases.
         let (sp, ep) = if reverse_stops {
             (end_point, start_point)
         } else {
             (start_point, end_point)
         };
 
-        let gradient_gpu = GradientPrimitiveGpu {
-            start_point: sp,
-            end_point: ep,
-            extend_mode: pack_as_float(extend_mode as u32),
-            tile_size: tile_size,
-            tile_repeat: tile_repeat,
-            padding: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        let gradient_cpu = GradientPrimitiveCpu {
+            stops_range: stops,
+            stops_count: stops_count,
+            extend_mode: extend_mode,
+            reverse_stops: reverse_stops,
+            cache_dirty: true,
+            gpu_data_address: GpuStoreAddress(0),
+            gpu_data_count: 0,
+            gpu_blocks: [
+                [sp.x, sp.y, ep.x, ep.y].into(),
+                [tile_size.width, tile_size.height, tile_repeat.width, tile_repeat.height].into(),
+                [pack_as_float(extend_mode as u32), 0.0, 0.0, 0.0].into(),
+            ],
         };
 
         let prim = if aligned {
-            PrimitiveContainer::AlignedGradient(gradient_cpu, gradient_gpu)
+            PrimitiveContainer::AlignedGradient(gradient_cpu)
         } else {
-            PrimitiveContainer::AngleGradient(gradient_cpu, gradient_gpu)
+            PrimitiveContainer::AngleGradient(gradient_cpu)
         };
 
         self.add_primitive(clip_and_scroll, &rect, clip_region, &[], prim);
     }
 
     pub fn add_radial_gradient(&mut self,
                                clip_and_scroll: ClipAndScrollInfo,
                                rect: LayerRect,
@@ -693,39 +691,36 @@ impl FrameBuilder {
                                start_radius: f32,
                                end_center: LayerPoint,
                                end_radius: f32,
                                ratio_xy: f32,
                                stops: ItemRange<GradientStop>,
                                extend_mode: ExtendMode,
                                tile_size: LayerSize,
                                tile_spacing: LayerSize) {
+        let tile_repeat = tile_size + tile_spacing;
+
         let radial_gradient_cpu = RadialGradientPrimitiveCpu {
             stops_range: stops,
             extend_mode: extend_mode,
             cache_dirty: true,
-        };
-
-        let radial_gradient_gpu = RadialGradientPrimitiveGpu {
-            start_center: start_center,
-            end_center: end_center,
-            start_radius: start_radius,
-            end_radius: end_radius,
-            ratio_xy: ratio_xy,
-            extend_mode: pack_as_float(extend_mode as u32),
-            tile_size: tile_size,
-            tile_repeat: tile_size + tile_spacing,
-            padding: [0.0, 0.0, 0.0, 0.0],
+            gpu_data_address: GpuStoreAddress(0),
+            gpu_data_count: 0,
+            gpu_blocks: [
+                [start_center.x, start_center.y, end_center.x, end_center.y].into(),
+                [start_radius, end_radius, ratio_xy, pack_as_float(extend_mode as u32)].into(),
+                [tile_size.width, tile_size.height, tile_repeat.width, tile_repeat.height].into(),
+            ],
         };
 
         self.add_primitive(clip_and_scroll,
                            &rect,
                            clip_region,
                            &[],
-                           PrimitiveContainer::RadialGradient(radial_gradient_cpu, radial_gradient_gpu));
+                           PrimitiveContainer::RadialGradient(radial_gradient_cpu));
     }
 
     pub fn add_text(&mut self,
                     clip_and_scroll: ClipAndScrollInfo,
                     rect: LayerRect,
                     clip_region: &ClipRegion,
                     font_key: FontKey,
                     size: Au,
@@ -737,16 +732,19 @@ impl FrameBuilder {
         if color.a == 0.0 {
             return
         }
 
         if size.0 <= 0 {
             return
         }
 
+        // Expand the rectangle of the text run by the blur radius.
+        let rect = rect.inflate(blur_radius, blur_radius);
+
         // TODO(gw): Use a proper algorithm to select
         // whether this item should be rendered with
         // subpixel AA!
         let mut render_mode = self.config.default_font_render_mode;
 
         // There are some conditions under which we can't use
         // subpixel text rendering, even if enabled.
         if render_mode == FontRenderMode::Subpixel {
@@ -776,27 +774,25 @@ impl FrameBuilder {
             glyph_count: glyph_count,
             cache_dirty: true,
             glyph_instances: Vec::new(),
             color_texture_id: SourceTexture::Invalid,
             color: *color,
             render_mode: render_mode,
             glyph_options: glyph_options,
             resource_address: GpuStoreAddress(0),
-        };
-
-        let prim_gpu = TextRunPrimitiveGpu {
-            color: *color,
+            gpu_data_address: GpuStoreAddress(0),
+            gpu_data_count: 0,
         };
 
         self.add_primitive(clip_and_scroll,
                            &rect,
                            clip_region,
                            &[],
-                           PrimitiveContainer::TextRun(prim_cpu, prim_gpu));
+                           PrimitiveContainer::TextRun(prim_cpu));
     }
 
     pub fn fill_box_shadow_rect(&mut self,
                                 clip_and_scroll: ClipAndScrollInfo,
                                 box_bounds: &LayerRect,
                                 bs_rect: LayerRect,
                                 clip_region: &ClipRegion,
                                 color: &ColorF,
@@ -978,57 +974,54 @@ impl FrameBuilder {
 
                 let mut extra_clips = Vec::new();
                 if border_radius >= 0.0 {
                     extra_clips.push(ClipSource::Complex(*box_bounds,
                                                 border_radius,
                                                 extra_clip_mode));
                 }
 
-                let prim_gpu = BoxShadowPrimitiveGpu {
+                let prim_cpu = BoxShadowPrimitiveCpu {
                     src_rect: *box_bounds,
                     bs_rect: bs_rect,
                     color: *color,
                     blur_radius: blur_radius,
                     border_radius: border_radius,
                     edge_size: edge_size,
                     inverted: inverted,
+                    rects: rects,
                 };
 
                 self.add_primitive(clip_and_scroll,
                                    &outer_rect,
                                    clip_region,
                                    extra_clips.as_slice(),
-                                   PrimitiveContainer::BoxShadow(prim_gpu, rects));
+                                   PrimitiveContainer::BoxShadow(prim_cpu));
             }
         }
     }
 
     pub fn add_webgl_rectangle(&mut self,
                                clip_and_scroll: ClipAndScrollInfo,
                                rect: LayerRect,
                                clip_region: &ClipRegion,
                                context_id: WebGLContextId) {
         let prim_cpu = ImagePrimitiveCpu {
             kind: ImagePrimitiveKind::WebGL(context_id),
             color_texture_id: SourceTexture::Invalid,
             resource_address: GpuStoreAddress(0),
             sub_rect: None,
-        };
-
-        let prim_gpu = ImagePrimitiveGpu {
-            stretch_size: rect.size,
-            tile_spacing: LayerSize::zero(),
+            gpu_block: [rect.size.width, rect.size.height, 0.0, 0.0].into(),
         };
 
         self.add_primitive(clip_and_scroll,
                            &rect,
                            clip_region,
                            &[],
-                           PrimitiveContainer::Image(prim_cpu, prim_gpu));
+                           PrimitiveContainer::Image(prim_cpu));
     }
 
     pub fn add_image(&mut self,
                      clip_and_scroll: ClipAndScrollInfo,
                      rect: LayerRect,
                      clip_region: &ClipRegion,
                      stretch_size: &LayerSize,
                      tile_spacing: &LayerSize,
@@ -1039,28 +1032,27 @@ impl FrameBuilder {
         let prim_cpu = ImagePrimitiveCpu {
             kind: ImagePrimitiveKind::Image(image_key,
                                             image_rendering,
                                             tile,
                                             *tile_spacing),
             color_texture_id: SourceTexture::Invalid,
             resource_address: GpuStoreAddress(0),
             sub_rect: sub_rect,
-        };
-
-        let prim_gpu = ImagePrimitiveGpu {
-            stretch_size: *stretch_size,
-            tile_spacing: *tile_spacing,
+            gpu_block: [ stretch_size.width,
+                         stretch_size.height,
+                         tile_spacing.width,
+                         tile_spacing.height ].into(),
         };
 
         self.add_primitive(clip_and_scroll,
                            &rect,
                            clip_region,
                            &[],
-                           PrimitiveContainer::Image(prim_cpu, prim_gpu));
+                           PrimitiveContainer::Image(prim_cpu));
     }
 
     pub fn add_yuv_image(&mut self,
                          clip_and_scroll: ClipAndScrollInfo,
                          rect: LayerRect,
                          clip_region: &ClipRegion,
                          yuv_data: YuvData,
                          color_space: YuvColorSpace,
@@ -1076,25 +1068,24 @@ impl FrameBuilder {
 
         let prim_cpu = YuvImagePrimitiveCpu {
             yuv_key: yuv_key,
             yuv_texture_id: [SourceTexture::Invalid, SourceTexture::Invalid, SourceTexture::Invalid],
             yuv_resource_address: GpuStoreAddress(0),
             format: format,
             color_space: color_space,
             image_rendering: image_rendering,
+            gpu_block: [rect.size.width, rect.size.height, 0.0, 0.0].into(),
         };
 
-        let prim_gpu = YuvImagePrimitiveGpu::new(rect.size);
-
         self.add_primitive(clip_and_scroll,
                            &rect,
                            clip_region,
                            &[],
-                           PrimitiveContainer::YuvImage(prim_cpu, prim_gpu));
+                           PrimitiveContainer::YuvImage(prim_cpu));
     }
 
     /// Compute the contribution (bounding rectangles, and resources) of layers and their
     /// primitives in screen space.
     fn build_layer_screen_rects_and_cull_layers(&mut self,
                                                 screen_rect: &DeviceIntRect,
                                                 clip_scroll_tree: &mut ClipScrollTree,
                                                 display_lists: &DisplayListMap,
@@ -1365,17 +1356,18 @@ impl FrameBuilder {
     }
 
     pub fn build(&mut self,
                  resource_cache: &mut ResourceCache,
                  frame_id: FrameId,
                  clip_scroll_tree: &mut ClipScrollTree,
                  display_lists: &DisplayListMap,
                  device_pixel_ratio: f32,
-                 texture_cache_profile: &mut TextureCacheProfileCounters)
+                 texture_cache_profile: &mut TextureCacheProfileCounters,
+                 gpu_cache_profile: &mut GpuCacheProfileCounters)
                  -> Frame {
         profile_scope!("build");
 
         let mut profile_counters = FrameProfileCounters::new();
         profile_counters.total_primitives.set(self.prim_store.prim_count());
 
         resource_cache.begin_frame(frame_id);
 
@@ -1414,16 +1406,19 @@ impl FrameBuilder {
                     self.prim_store.resolve_clip_cache(mask_info, resource_cache);
                 }
             }
         }
 
         let deferred_resolves = self.prim_store.resolve_primitives(resource_cache,
                                                                    device_pixel_ratio);
 
+        let gpu_cache_updates = resource_cache.gpu_cache
+                                              .end_frame(gpu_cache_profile);
+
         let mut passes = Vec::new();
 
         // Do the allocations now, assigning each tile's tasks to a render
         // pass and target as required.
         for index in 0..required_pass_count {
             passes.push(RenderPass::new(index as isize,
                                         index == required_pass_count-1,
                                         cache_size));
@@ -1454,23 +1449,22 @@ impl FrameBuilder {
             window_size: self.screen_size,
             profile_counters: profile_counters,
             passes: passes,
             cache_size: cache_size,
             layer_texture_data: self.packed_layers.clone(),
             render_task_data: render_tasks.render_task_data,
             gpu_data16: self.prim_store.gpu_data16.build(),
             gpu_data32: self.prim_store.gpu_data32.build(),
-            gpu_data64: self.prim_store.gpu_data64.build(),
-            gpu_data128: self.prim_store.gpu_data128.build(),
             gpu_geometry: self.prim_store.gpu_geometry.build(),
             gpu_gradient_data: self.prim_store.gpu_gradient_data.build(),
             gpu_split_geometry: self.prim_store.gpu_split_geometry.build(),
             gpu_resource_rects: self.prim_store.gpu_resource_rects.build(),
             deferred_resolves: deferred_resolves,
+            gpu_cache_updates: Some(gpu_cache_updates),
         }
     }
 
 }
 
 struct LayerRectCalculationAndCullingPass<'a> {
     frame_builder: &'a mut FrameBuilder,
     screen_rect: &'a DeviceIntRect,
@@ -1739,27 +1733,21 @@ impl<'a> LayerRectCalculationAndCullingP
                                              .expect("No display list?");
         for i in 0..prim_count {
             let prim_index = PrimitiveIndex(prim_index.0 + i);
             if self.frame_builder.prim_store.build_bounding_rect(prim_index,
                                                                  self.screen_rect,
                                                                  &packed_layer.transform,
                                                                  &packed_layer.local_clip_rect,
                                                                  self.device_pixel_ratio) {
-                if self.frame_builder.prim_store.prepare_prim_for_render(prim_index,
-                                                                         self.resource_cache,
-                                                                         &packed_layer.transform,
-                                                                         self.device_pixel_ratio,
-                                                                         display_list) {
-                    self.frame_builder.prim_store.build_bounding_rect(prim_index,
-                                                                      self.screen_rect,
+                self.frame_builder.prim_store.prepare_prim_for_render(prim_index,
+                                                                      self.resource_cache,
                                                                       &packed_layer.transform,
-                                                                      &packed_layer.local_clip_rect,
-                                                                      self.device_pixel_ratio);
-                }
+                                                                      self.device_pixel_ratio,
+                                                                      display_list);
 
                 // If the primitive is visible, consider culling it via clip rect(s).
                 // If it is visible but has clips, create the clip task for it.
                 let prim_bounding_rect =
                     match self.frame_builder.prim_store.cpu_bounding_rects[prim_index.0] {
                     Some(rect) => rect,
                     _ => continue,
                 };
--- a/gfx/webrender/src/glyph_rasterizer.rs
+++ b/gfx/webrender/src/glyph_rasterizer.rs
@@ -150,38 +150,32 @@ impl GlyphRasterizer {
         glyph_instances: &[GlyphInstance],
         render_mode: FontRenderMode,
         glyph_options: Option<GlyphOptions>,
     ) {
         assert!(self.font_contexts.lock_shared_context().has_font(&font_key));
 
         let mut glyphs = Vec::with_capacity(glyph_instances.len());
 
-        {
-            // TODO: If this takes too long we can resurect a dedicated glyph
-            // dispatch thread, hopefully not.
-            profile_scope!("glyph-requests");
+        // select glyphs that have not been requested yet.
+        for glyph in glyph_instances {
+            let glyph_request = GlyphRequest::new(
+                font_key,
+                size,
+                color,
+                glyph.index,
+                glyph.point,
+                render_mode,
+                glyph_options,
+            );
 
-            // select glyphs that have not been requested yet.
-            for glyph in glyph_instances {
-                let glyph_request = GlyphRequest::new(
-                    font_key,
-                    size,
-                    color,
-                    glyph.index,
-                    glyph.point,
-                    render_mode,
-                    glyph_options,
-                );
-
-                glyph_cache.mark_as_needed(&glyph_request, current_frame_id);
-                if !glyph_cache.contains_key(&glyph_request) && !self.pending_glyphs.contains(&glyph_request) {
-                    self.pending_glyphs.insert(glyph_request.clone());
-                    glyphs.push(glyph_request);
-                }
+            glyph_cache.mark_as_needed(&glyph_request, current_frame_id);
+            if !glyph_cache.contains_key(&glyph_request) && !self.pending_glyphs.contains(&glyph_request) {
+                self.pending_glyphs.insert(glyph_request.clone());
+                glyphs.push(glyph_request);
             }
         }
 
         if glyphs.is_empty() {
             return;
         }
 
         let font_contexts = Arc::clone(&self.font_contexts);
new file mode 100644
--- /dev/null
+++ b/gfx/webrender/src/gpu_cache.rs
@@ -0,0 +1,493 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+//! Overview of the GPU cache.
+//!
+//! The main goal of the GPU cache is to allow on-demand
+//! allocation and construction of GPU resources for the
+//! vertex shaders to consume.
+//!
+//! Every item that wants to be stored in the GPU cache
+//! should create a GpuCacheHandle that is used to refer
+//! to a cached GPU resource. Creating a handle is a
+//! cheap operation, that does *not* allocate room in the
+//! cache.
+//!
+//! On any frame when that data is required, the caller
+//! must request that handle, via ```request```. If the
+//! data is not in the cache, the user provided closure
+//! will be invoked to build the data.
+//!
+//! After ```end_frame``` has occurred, callers can
+//! use the ```get_address``` API to get the allocated
+//! address in the GPU cache of a given resource slot
+//! for this frame.
+
+use device::FrameId;
+use profiler::GpuCacheProfileCounters;
+use renderer::MAX_VERTEX_TEXTURE_WIDTH;
+use std::mem;
+use webrender_traits::{ColorF, LayerRect};
+
+pub const GPU_CACHE_INITIAL_HEIGHT: u32 = 512;
+const FRAMES_BEFORE_EVICTION: usize = 10;
+
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+struct Epoch(u32);
+
+#[derive(Debug, Copy, Clone)]
+struct CacheLocation {
+    block_index: BlockIndex,
+    epoch: Epoch,
+}
+
+/// A single texel in RGBAF32 texture - 16 bytes.
+#[derive(Copy, Clone, Debug)]
+pub struct GpuBlockData {
+    pub data: [f32; 4],
+}
+
+/// Conversion helpers for GpuBlockData
+impl Into<GpuBlockData> for ColorF {
+    fn into(self) -> GpuBlockData {
+        GpuBlockData {
+            data: [self.r, self.g, self.b, self.a],
+        }
+    }
+}
+
+impl Into<GpuBlockData> for [f32; 4] {
+    fn into(self) -> GpuBlockData {
+        GpuBlockData {
+            data: self,
+        }
+    }
+}
+
+impl Into<GpuBlockData> for LayerRect {
+    fn into(self) -> GpuBlockData {
+        GpuBlockData {
+            data: [ self.origin.x,
+                    self.origin.y,
+                    self.size.width,
+                    self.size.height ],
+        }
+    }
+}
+
+// Any data type that can be stored in the GPU cache should
+// implement this trait.
+pub trait ToGpuBlocks {
+    // Request an arbitrary number of GPU data blocks.
+    fn write_gpu_blocks(&self, GpuDataRequest);
+}
+
+// A handle to a GPU resource.
+#[derive(Debug, Copy, Clone)]
+pub struct GpuCacheHandle {
+    location: Option<CacheLocation>,
+}
+
+impl GpuCacheHandle {
+    pub fn new() -> GpuCacheHandle {
+        GpuCacheHandle {
+            location: None,
+        }
+    }
+}
+
+// A unique address in the GPU cache. These are uploaded
+// as part of the primitive instances, to allow the vertex
+// shader to fetch the specific data.
+#[derive(Copy, Debug, Clone)]
+pub struct GpuCacheAddress {
+    pub u: u16,
+    pub v: u16,
+}
+
+impl GpuCacheAddress {
+    fn new(u: usize, v: usize) -> GpuCacheAddress {
+        GpuCacheAddress {
+            u: u as u16,
+            v: v as u16,
+        }
+    }
+}
+
+// A block in the GPU cache, linked into either a free-list or the occupied list.
+#[derive(Debug)]
+struct Block {
+    // The location in the cache of this block.
+    address: GpuCacheAddress,
+    // Index of the next block in the list this block
+    // belongs to (either a free-list or the
+    // occupied list).
+    next: Option<BlockIndex>,
+    // The current epoch (generation) of this block.
+    epoch: Epoch,
+    // The last frame this block was referenced.
+    last_access_time: FrameId,
+}
+
+impl Block {
+    fn new(address: GpuCacheAddress,
+           next: Option<BlockIndex>,
+           frame_id: FrameId) -> Block {
+        Block {
+            address: address,
+            next: next,
+            last_access_time: frame_id,
+            epoch: Epoch(0),
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone)]
+struct BlockIndex(usize);
+
+// A row in the cache texture.
+struct Row {
+    // The fixed size of blocks that this row supports.
+    // Each row becomes a slab allocator for a fixed block size.
+    // This means no dealing with fragmentation within a cache
+    // row as items are allocated and freed.
+    block_count_per_item: usize,
+}
+
+impl Row {
+    fn new(block_count_per_item: usize) -> Row {
+        Row {
+            block_count_per_item: block_count_per_item,
+        }
+    }
+}
+
+// A list of update operations that can be applied on the cache
+// this frame. The list of updates is created by the render backend
+// during frame construction. It's passed to the render thread
+// where GL commands can be applied.
+pub enum GpuCacheUpdate {
+    Copy {
+        block_index: usize,
+        block_count: usize,
+        address: GpuCacheAddress,
+    }
+}
+
+pub struct GpuCacheUpdateList {
+    // The current height of the texture. The render thread
+    // should resize the texture if required.
+    pub height: u32,
+    // List of updates to apply.
+    pub updates: Vec<GpuCacheUpdate>,
+    // A flat list of GPU blocks that are pending upload
+    // to GPU memory.
+    pub blocks: Vec<GpuBlockData>,
+}
+
+// Holds the free lists of fixed size blocks. Mostly
+// just serves to work around the borrow checker.
+struct FreeBlockLists {
+    free_list_1: Option<BlockIndex>,
+    free_list_2: Option<BlockIndex>,
+    free_list_4: Option<BlockIndex>,
+    free_list_8: Option<BlockIndex>,
+    free_list_large: Option<BlockIndex>,
+}
+
+impl FreeBlockLists {
+    fn new() -> FreeBlockLists {
+        FreeBlockLists {
+            free_list_1: None,
+            free_list_2: None,
+            free_list_4: None,
+            free_list_8: None,
+            free_list_large: None,
+        }
+    }
+
+    fn get_actual_block_count_and_free_list(&mut self,
+                                            block_count: usize) -> (usize, &mut Option<BlockIndex>) {
+        // Find the appropriate free list to use
+        // based on the block size.
+        match block_count {
+            0 => panic!("Can't allocate zero sized blocks!"),
+            1 => (1, &mut self.free_list_1),
+            2 => (2, &mut self.free_list_2),
+            3...4 => (4, &mut self.free_list_4),
+            5...8 => (8, &mut self.free_list_8),
+            9...MAX_VERTEX_TEXTURE_WIDTH => (MAX_VERTEX_TEXTURE_WIDTH, &mut self.free_list_large),
+            _ => panic!("Can't allocate > MAX_VERTEX_TEXTURE_WIDTH per resource!"),
+        }
+    }
+}
+
+// CPU-side representation of the GPU resource cache texture.
+struct Texture {
+    // Current texture height
+    height: u32,
+    // All blocks that have been created for this texture
+    blocks: Vec<Block>,
+    // Metadata about each allocated row.
+    rows: Vec<Row>,
+    // Free lists of available blocks for each supported
+    // block size in the texture. These are intrusive
+    // linked lists.
+    free_lists: FreeBlockLists,
+    // Linked list of currently occupied blocks. This
+    // makes it faster to iterate blocks looking for
+    // candidates to be evicted from the cache.
+    occupied_list_head: Option<BlockIndex>,
+    // Pending blocks that have been written this frame
+    // and will need to be sent to the GPU.
+    pending_blocks: Vec<GpuBlockData>,
+    // Pending update commands.
+    updates: Vec<GpuCacheUpdate>,
+    // Profile stats
+    allocated_block_count: usize,
+}
+
+impl Texture {
+    fn new() -> Texture {
+        Texture {
+            height: GPU_CACHE_INITIAL_HEIGHT,
+            blocks: Vec::new(),
+            rows: Vec::new(),
+            free_lists: FreeBlockLists::new(),
+            pending_blocks: Vec::new(),
+            updates: Vec::new(),
+            occupied_list_head: None,
+            allocated_block_count: 0,
+        }
+    }
+
+    // Push new data into the cache. The `pending_block_index` parameter is the
+    // index at which the data starts in the texture's `pending_blocks` array.
+    // Returns the cache location (block index and epoch) allocated for the data.
+    fn push_data(&mut self,
+                 pending_block_index: usize,
+                 block_count: usize,
+                 frame_id: FrameId) -> CacheLocation {
+        // Find the appropriate free list to use based on the block size.
+        let (alloc_size, free_list) = self.free_lists
+                                          .get_actual_block_count_and_free_list(block_count);
+
+        // See if we need a new row (if free-list has nothing available)
+        if free_list.is_none() {
+            // TODO(gw): Handle the case where we need to resize
+            //           the cache texture itself!
+            if self.rows.len() as u32 == self.height {
+                panic!("need to re-alloc texture!!");
+            }
+
+            // Create a new row.
+            let items_per_row = MAX_VERTEX_TEXTURE_WIDTH / alloc_size;
+            let row_index = self.rows.len();
+            self.rows.push(Row::new(alloc_size));
+
+            // Create a ```Block``` for each possible allocation address
+            // in this row, and link it in to the free-list for this
+            // block size.
+            let mut prev_block_index = None;
+            for i in 0..items_per_row {
+                let address = GpuCacheAddress::new(i * alloc_size, row_index);
+                let block_index = BlockIndex(self.blocks.len());
+                let block = Block::new(address, prev_block_index, frame_id);
+                self.blocks.push(block);
+                prev_block_index = Some(block_index);
+            }
+
+            *free_list = prev_block_index;
+        }
+
+        // Given the code above, it's now guaranteed that there is a block
+        // available in the appropriate free-list. Pull a block from the
+        // head of the list.
+        let free_block_index = free_list.take().unwrap();
+        let block = &mut self.blocks[free_block_index.0 as usize];
+        *free_list = block.next;
+
+        // Add the block to the occupied linked list.
+        block.next = self.occupied_list_head;
+        block.last_access_time = frame_id;
+        self.occupied_list_head = Some(free_block_index);
+        self.allocated_block_count += alloc_size;
+
+        // Add this update to the pending list of blocks that need
+        // to be updated on the GPU.
+        self.updates.push(GpuCacheUpdate::Copy {
+            block_index: pending_block_index,
+            block_count: block_count,
+            address: block.address,
+        });
+
+        CacheLocation {
+            block_index: free_block_index,
+            epoch: block.epoch,
+        }
+    }
+
+    // Run through the list of occupied cache blocks and evict
+    // any old blocks that haven't been referenced for a while.
+    fn evict_old_blocks(&mut self, frame_id: FrameId) {
+        // Prune any old items from the list to make room.
+        // Traverse the occupied linked list and see
+        // which items have not been used for a long time.
+        let mut current_block = self.occupied_list_head;
+        let mut prev_block: Option<BlockIndex> = None;
+
+        while let Some(index) = current_block {
+            let (next_block, should_unlink) = {
+                let block = &mut self.blocks[index.0 as usize];
+
+                let next_block = block.next;
+                let mut should_unlink = false;
+
+                // If this resource has not been used in the last
+                // few frames, free it from the texture and mark
+                // as empty.
+                if block.last_access_time + FRAMES_BEFORE_EVICTION < frame_id {
+                    should_unlink = true;
+
+                    // Get the row metadata from the address.
+                    let row = &mut self.rows[block.address.v as usize];
+
+                    // Use the row metadata to determine which free-list
+                    // this block belongs to.
+                    let (_, free_list) = self.free_lists
+                                             .get_actual_block_count_and_free_list(row.block_count_per_item);
+
+                    block.epoch = Epoch(block.epoch.0 + 1);
+                    block.next = *free_list;
+                    *free_list = Some(index);
+
+                    self.allocated_block_count -= row.block_count_per_item;
+                };
+
+                (next_block, should_unlink)
+            };
+
+            // If the block was released, we will need to remove it
+            // from the occupied linked list.
+            if should_unlink {
+                match prev_block {
+                    Some(prev_block) => {
+                        self.blocks[prev_block.0 as usize].next = next_block;
+                    }
+                    None => {
+                        self.occupied_list_head = next_block;
+                    }
+                }
+            } else {
+                prev_block = current_block;
+            }
+
+            current_block = next_block;
+        }
+    }
+}
+
+
+/// A wrapper object for GPU data requests,
+/// works as a container that can only grow.
+#[must_use]
+pub struct GpuDataRequest<'a> {
+    handle: &'a mut GpuCacheHandle,
+    frame_id: FrameId,
+    start_index: usize,
+    texture: &'a mut Texture,
+}
+
+impl<'a> GpuDataRequest<'a> {
+    pub fn push(&mut self, block: GpuBlockData) {
+        self.texture.pending_blocks.push(block);
+    }
+
+    pub fn extend_from_slice(&mut self, blocks: &[GpuBlockData]) {
+        self.texture.pending_blocks.extend_from_slice(blocks);
+    }
+}
+
+impl<'a> Drop for GpuDataRequest<'a> {
+    fn drop(&mut self) {
+        // Push the data to the texture pending updates list.
+        let block_count = self.texture.pending_blocks.len() - self.start_index;
+        let location = self.texture.push_data(self.start_index,
+                                              block_count,
+                                              self.frame_id);
+        self.handle.location = Some(location);
+    }
+}
+
+
+/// The main LRU cache interface.
+pub struct GpuCache {
+    /// Current frame ID.
+    frame_id: FrameId,
+    /// CPU-side texture allocator.
+    texture: Texture,
+}
+
+impl GpuCache {
+    pub fn new() -> GpuCache {
+        GpuCache {
+            frame_id: FrameId::new(0),
+            texture: Texture::new(),
+        }
+    }
+
+    /// Begin a new frame.
+    pub fn begin_frame(&mut self) {
+        debug_assert!(self.texture.pending_blocks.is_empty());
+        self.frame_id = self.frame_id + 1;
+        self.texture.evict_old_blocks(self.frame_id);
+    }
+
+    /// Request a resource be added to the cache. If the resource
+    /// is already in the cache, `None` will be returned.
+    pub fn request<'a>(&'a mut self, handle: &'a mut GpuCacheHandle) -> Option<GpuDataRequest<'a>> {
+        // Check if the allocation for this handle is still valid.
+        if let Some(ref location) = handle.location {
+            let block = &mut self.texture.blocks[location.block_index.0];
+            if block.epoch == location.epoch {
+                // Mark last access time to avoid evicting this block.
+                block.last_access_time = self.frame_id;
+                return None
+            }
+        }
+        Some(GpuDataRequest {
+            handle: handle,
+            frame_id: self.frame_id,
+            start_index: self.texture.pending_blocks.len(),
+            texture: &mut self.texture,
+        })
+    }
+
+    /// End the frame. Return the list of updates to apply to the
+    /// device specific cache texture.
+    pub fn end_frame(&mut self,
+                     profile_counters: &mut GpuCacheProfileCounters) -> GpuCacheUpdateList {
+        profile_counters.allocated_rows.set(self.texture.rows.len());
+        profile_counters.allocated_blocks.set(self.texture.allocated_block_count);
+
+        GpuCacheUpdateList {
+            height: self.texture.height,
+            updates: mem::replace(&mut self.texture.updates, Vec::new()),
+            blocks: mem::replace(&mut self.texture.pending_blocks, Vec::new()),
+        }
+    }
+
+    /// Get the actual GPU address in the texture for a given handle.
+    /// It's assumed at this point that the given handle has been requested
+    /// and built for this frame. A handle that was never allocated panics;
+    /// a stale (evicted or not requested this frame) one only debug-asserts.
+    pub fn get_address(&self, id: &GpuCacheHandle) -> GpuCacheAddress {
+        let location = id.location
+                         .expect("handle not requested or allocated!");
+        let block = &self.texture.blocks[location.block_index.0];
+        debug_assert_eq!(block.epoch, location.epoch);
+        debug_assert_eq!(block.last_access_time, self.frame_id);
+        block.address
+    }
+}
--- a/gfx/webrender/src/internal_types.rs
+++ b/gfx/webrender/src/internal_types.rs
@@ -64,18 +64,17 @@ pub enum FontTemplate {
 pub enum TextureSampler {
     Color0,
     Color1,
     Color2,
     CacheA8,
     CacheRGBA8,
     Data16,
     Data32,
-    Data64,
-    Data128,
+    ResourceCache,
     Layers,
     RenderTasks,
     Geometry,
     ResourceRects,
     Gradients,
     SplitGeometry,
     Dither,
 }
--- a/gfx/webrender/src/lib.rs
+++ b/gfx/webrender/src/lib.rs
@@ -53,16 +53,17 @@ mod debug_font_data;
 mod debug_render;
 mod device;
 mod ellipse;
 mod frame;
 mod frame_builder;
 mod freelist;
 mod geometry;
 mod glyph_rasterizer;
+mod gpu_cache;
 mod gpu_store;
 mod internal_types;
 mod mask_cache;
 mod prim_store;
 mod print_tree;
 mod profiler;
 mod record;
 mod render_backend;
--- a/gfx/webrender/src/prim_store.rs
+++ b/gfx/webrender/src/prim_store.rs
@@ -1,20 +1,21 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use app_units::Au;
 use border::{BorderCornerClipData, BorderCornerDashClipData, BorderCornerDotClipData};
 use border::BorderCornerInstance;
 use euclid::{Size2D};
+use gpu_cache::{GpuBlockData, GpuCache, GpuCacheHandle, GpuDataRequest, ToGpuBlocks};
 use gpu_store::GpuStoreAddress;
 use internal_types::{SourceTexture, PackedTexel};
 use mask_cache::{ClipMode, ClipSource, MaskCacheInfo};
-use renderer::{VertexDataStore, GradientDataStore, SplitGeometryStore};
+use renderer::{VertexDataStore, GradientDataStore, SplitGeometryStore, MAX_VERTEX_TEXTURE_WIDTH};
 use render_task::{RenderTask, RenderTaskLocation};
 use resource_cache::{CacheItem, ImageProperties, ResourceCache};
 use std::mem;
 use std::usize;
 use util::{TransformedRect, recycle_vec};
 use webrender_traits::{BuiltDisplayList, ColorF, ImageKey, ImageRendering, YuvColorSpace};
 use webrender_traits::{YuvFormat, ClipRegion, ComplexClipRegion, ItemRange, GlyphKey};
 use webrender_traits::{FontKey, FontRenderMode, WebGLContextId};
@@ -67,22 +68,16 @@ impl TexelRect {
 pub struct DeferredResolve {
     pub resource_address: GpuStoreAddress,
     pub image_properties: ImageProperties,
 }
 
 #[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Ord, PartialOrd)]
 pub struct SpecificPrimitiveIndex(pub usize);
 
-impl SpecificPrimitiveIndex {
-    pub fn invalid() -> SpecificPrimitiveIndex {
-        SpecificPrimitiveIndex(usize::MAX)
-    }
-}
-
 #[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Ord, PartialOrd)]
 pub struct PrimitiveIndex(pub usize);
 
 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
 pub enum PrimitiveKind {
     Rectangle,
     TextRun,
     Image,
@@ -111,27 +106,36 @@ impl Default for PrimitiveGeometry {
 }
 
 #[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
 pub enum PrimitiveCacheKey {
     BoxShadow(BoxShadowPrimitiveCacheKey),
     TextShadow(PrimitiveIndex),
 }
 
+impl GpuCacheHandle {
+    pub fn as_int(&self, gpu_cache: &GpuCache) -> i32 {
+        let address = gpu_cache.get_address(self);
+
+        // TODO(gw): Temporarily encode GPU Cache addresses as a single int.
+        //           In the future, we can change the PrimitiveInstance struct
+        //           to use 2x u16 for the vertex attribute instead of an i32.
+        address.v as i32 * MAX_VERTEX_TEXTURE_WIDTH as i32 + address.u as i32
+    }
+}
+
 // TODO(gw): Pack the fields here better!
 #[derive(Debug)]
 pub struct PrimitiveMetadata {
     pub is_opaque: bool,
     pub clips: Vec<ClipSource>,
     pub clip_cache_info: Option<MaskCacheInfo>,
     pub prim_kind: PrimitiveKind,
     pub cpu_prim_index: SpecificPrimitiveIndex,
-    pub gpu_prim_index: GpuStoreAddress,
-    pub gpu_data_address: GpuStoreAddress,
-    pub gpu_data_count: i32,
+    pub gpu_location: GpuCacheHandle,
     // An optional render task that is a dependency of
     // drawing this primitive. For instance, box shadows
     // use this to draw a portion of the box shadow to
     // a render target to reduce the number of pixels
     // that the box-shadow shader needs to run on. For
     // text-shadow, this creates a render task chain
     // that implements a 2-pass separable blur on a
     // text run.
@@ -160,149 +164,157 @@ impl Default for SplitGeometry {
 
 
 #[derive(Debug, Clone)]
 #[repr(C)]
 pub struct RectanglePrimitive {
     pub color: ColorF,
 }
 
+impl ToGpuBlocks for RectanglePrimitive {
+    fn write_gpu_blocks(&self, mut request: GpuDataRequest) {
+        request.push(self.color.into());
+    }
+}
+
 #[derive(Debug)]
 pub enum ImagePrimitiveKind {
     Image(ImageKey, ImageRendering, Option<TileOffset>, LayerSize),
     WebGL(WebGLContextId),
 }
 
 #[derive(Debug)]
 pub struct ImagePrimitiveCpu {
     pub kind: ImagePrimitiveKind,
     pub color_texture_id: SourceTexture,
     pub resource_address: GpuStoreAddress,
     pub sub_rect: Option<TexelRect>,
+    // TODO(gw): Build on demand
+    pub gpu_block: GpuBlockData,
 }
 
-#[derive(Debug, Clone)]
-#[repr(C)]
-pub struct ImagePrimitiveGpu {
-    pub stretch_size: LayerSize,
-    pub tile_spacing: LayerSize,
+impl ToGpuBlocks for ImagePrimitiveCpu {
+    fn write_gpu_blocks(&self, mut request: GpuDataRequest) {
+        request.push(self.gpu_block);
+    }
 }
 
 #[derive(Debug)]
 pub struct YuvImagePrimitiveCpu {
     pub yuv_key: [ImageKey; 3],
     // All textures should be the same type here.
     pub yuv_texture_id: [SourceTexture; 3],
     pub format: YuvFormat,
     pub color_space: YuvColorSpace,
 
     // The first address of yuv resource_address. Use "yuv_resource_address + N-th" to get the N-th channel data.
     // e.g. yuv_resource_address + 0 => y channel resource_address
     pub yuv_resource_address: GpuStoreAddress,
 
     pub image_rendering: ImageRendering,
+
+    // TODO(gw): Generate on demand
+    pub gpu_block: GpuBlockData,
 }
 
-#[derive(Debug, Clone)]
-#[repr(C)]
-pub struct YuvImagePrimitiveGpu {
-    pub size: LayerSize,
-    pub padding: [f32; 2],
-}
-
-impl YuvImagePrimitiveGpu {
-    pub fn new(size: LayerSize) -> Self {
-        YuvImagePrimitiveGpu {
-            size: size,
-            padding: [0.0; 2],
-        }
+impl ToGpuBlocks for YuvImagePrimitiveCpu {
+    fn write_gpu_blocks(&self, mut request: GpuDataRequest) {
+        request.push(self.gpu_block);
     }
 }
 
 #[derive(Debug, Clone)]
 pub struct BorderPrimitiveCpu {
     pub corner_instances: [BorderCornerInstance; 4],
+    pub gpu_blocks: [GpuBlockData; 8],
 }
 
-#[derive(Debug, Clone)]
-#[repr(C)]
-pub struct BorderPrimitiveGpu {
-    pub style: [f32; 4],
-    pub widths: [f32; 4],
-    pub colors: [ColorF; 4],
-    pub radii: [LayerSize; 4],
+impl ToGpuBlocks for BorderPrimitiveCpu {
+    fn write_gpu_blocks(&self, mut request: GpuDataRequest) {
+        request.extend_from_slice(&self.gpu_blocks);
+    }
 }
 
 #[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
 pub struct BoxShadowPrimitiveCacheKey {
     pub shadow_rect_size: Size2D<Au>,
     pub border_radius: Au,
     pub blur_radius: Au,
     pub inverted: bool,
 }
 
 #[derive(Debug, Clone)]
-#[repr(C)]
-pub struct BoxShadowPrimitiveGpu {
+pub struct BoxShadowPrimitiveCpu {
+    // TODO(gw): Generate on demand
+    // GPU data
     pub src_rect: LayerRect,
     pub bs_rect: LayerRect,
     pub color: ColorF,
     pub border_radius: f32,
     pub edge_size: f32,
     pub blur_radius: f32,
     pub inverted: f32,
+    pub rects: Vec<LayerRect>,
+}
+
+impl ToGpuBlocks for BoxShadowPrimitiveCpu {
+    fn write_gpu_blocks(&self, mut request: GpuDataRequest) {
+        request.push(self.src_rect.into());
+        request.push(self.bs_rect.into());
+        request.push(self.color.into());
+        request.push([self.border_radius,
+                      self.edge_size,
+                      self.blur_radius,
+                      self.inverted].into());
+        for &rect in &self.rects {
+            request.push(rect.into());
+        }
+    }
 }
 
 #[derive(Debug, Clone)]
 #[repr(C)]
 pub struct GradientStopGpu {
     color: ColorF,
     offset: f32,
     padding: [f32; 3],
 }
 
-#[derive(Debug, Clone)]
-#[repr(C)]
-pub struct GradientPrimitiveGpu {
-    pub start_point: LayerPoint,
-    pub end_point: LayerPoint,
-    pub tile_size: LayerSize,
-    pub tile_repeat: LayerSize,
-    pub extend_mode: f32,
-    pub padding: [f32; 7],
-}
-
 #[derive(Debug)]
 pub struct GradientPrimitiveCpu {
     pub stops_range: ItemRange<GradientStop>,
     pub stops_count: usize,
     pub extend_mode: ExtendMode,
     pub reverse_stops: bool,
     pub cache_dirty: bool,
+    pub gpu_data_address: GpuStoreAddress,
+    pub gpu_data_count: i32,
+    pub gpu_blocks: [GpuBlockData; 3],
 }
 
-#[derive(Debug, Clone)]
-#[repr(C)]
-pub struct RadialGradientPrimitiveGpu {
-    pub start_center: LayerPoint,
-    pub end_center: LayerPoint,
-    pub start_radius: f32,
-    pub end_radius: f32,
-    pub ratio_xy: f32,
-    pub extend_mode: f32,
-    pub tile_size: LayerSize,
-    pub tile_repeat: LayerSize,
-    pub padding: [f32; 4],
+impl ToGpuBlocks for GradientPrimitiveCpu {
+    fn write_gpu_blocks(&self, mut request: GpuDataRequest) {
+        request.extend_from_slice(&self.gpu_blocks);
+    }
 }
 
 #[derive(Debug)]
 pub struct RadialGradientPrimitiveCpu {
     pub stops_range: ItemRange<GradientStop>,
     pub extend_mode: ExtendMode,
     pub cache_dirty: bool,
+    pub gpu_data_address: GpuStoreAddress,
+    pub gpu_data_count: i32,
+    pub gpu_blocks: [GpuBlockData; 3],
+}
+
+impl ToGpuBlocks for RadialGradientPrimitiveCpu {
+    fn write_gpu_blocks(&self, mut request: GpuDataRequest) {
+        request.extend_from_slice(&self.gpu_blocks);
+    }
 }
 
 // The gradient entry index for the first color stop
 pub const GRADIENT_DATA_FIRST_STOP: usize = 0;
 // The gradient entry index for the last color stop
 pub const GRADIENT_DATA_LAST_STOP: usize = GRADIENT_DATA_SIZE - 1;
 
 // The start of the gradient data table
@@ -465,36 +477,38 @@ impl GradientData {
 
 #[derive(Debug, Clone)]
 #[repr(C)]
 struct InstanceRect {
     rect: LayerRect,
 }
 
 #[derive(Debug, Clone)]
-#[repr(C)]
-pub struct TextRunPrimitiveGpu {
-    pub color: ColorF,
-}
-
-#[derive(Debug, Clone)]
 pub struct TextRunPrimitiveCpu {
     pub font_key: FontKey,
     pub logical_font_size: Au,
     pub blur_radius: f32,
     pub glyph_range: ItemRange<GlyphInstance>,
     pub glyph_count: usize,
     pub cache_dirty: bool,
     // TODO(gw): Maybe make this an Arc for sharing with resource cache
     pub glyph_instances: Vec<GlyphInstance>,
     pub color_texture_id: SourceTexture,
     pub color: ColorF,
     pub render_mode: FontRenderMode,
     pub resource_address: GpuStoreAddress,
     pub glyph_options: Option<GlyphOptions>,
+    pub gpu_data_address: GpuStoreAddress,
+    pub gpu_data_count: i32,
+}
+
+impl ToGpuBlocks for TextRunPrimitiveCpu {
+    fn write_gpu_blocks(&self, mut request: GpuDataRequest) {
+        request.push(self.color.into());
+    }
 }
 
 #[derive(Debug, Clone)]
 #[repr(C)]
 struct GlyphPrimitive {
     offset: LayerPoint,
     padding: LayerPoint,
 }
@@ -623,94 +637,94 @@ impl ClipData {
                 radius, 0.0),
         }
     }
 }
 
 #[derive(Debug)]
 pub enum PrimitiveContainer {
     Rectangle(RectanglePrimitive),
-    TextRun(TextRunPrimitiveCpu, TextRunPrimitiveGpu),
-    Image(ImagePrimitiveCpu, ImagePrimitiveGpu),
-    YuvImage(YuvImagePrimitiveCpu, YuvImagePrimitiveGpu),
-    Border(BorderPrimitiveCpu, BorderPrimitiveGpu),
-    AlignedGradient(GradientPrimitiveCpu, GradientPrimitiveGpu),
-    AngleGradient(GradientPrimitiveCpu, GradientPrimitiveGpu),
-    RadialGradient(RadialGradientPrimitiveCpu, RadialGradientPrimitiveGpu),
-    BoxShadow(BoxShadowPrimitiveGpu, Vec<LayerRect>),
+    TextRun(TextRunPrimitiveCpu),
+    Image(ImagePrimitiveCpu),
+    YuvImage(YuvImagePrimitiveCpu),
+    Border(BorderPrimitiveCpu),
+    AlignedGradient(GradientPrimitiveCpu),
+    AngleGradient(GradientPrimitiveCpu),
+    RadialGradient(RadialGradientPrimitiveCpu),
+    BoxShadow(BoxShadowPrimitiveCpu),
 }
 
 pub struct PrimitiveStore {
     /// CPU side information only.
     pub cpu_bounding_rects: Vec<Option<DeviceIntRect>>,
+    pub cpu_rectangles: Vec<RectanglePrimitive>,
     pub cpu_text_runs: Vec<TextRunPrimitiveCpu>,
     pub cpu_images: Vec<ImagePrimitiveCpu>,
     pub cpu_yuv_images: Vec<YuvImagePrimitiveCpu>,
     pub cpu_gradients: Vec<GradientPrimitiveCpu>,
     pub cpu_radial_gradients: Vec<RadialGradientPrimitiveCpu>,
     pub cpu_metadata: Vec<PrimitiveMetadata>,
     pub cpu_borders: Vec<BorderPrimitiveCpu>,
+    pub cpu_box_shadows: Vec<BoxShadowPrimitiveCpu>,
 
     /// Gets uploaded directly to GPU via vertex texture.
     pub gpu_geometry: VertexDataStore<PrimitiveGeometry>,
     pub gpu_data16: VertexDataStore<GpuBlock16>,
     pub gpu_data32: VertexDataStore<GpuBlock32>,
-    pub gpu_data64: VertexDataStore<GpuBlock64>,
-    pub gpu_data128: VertexDataStore<GpuBlock128>,
     pub gpu_gradient_data: GradientDataStore,
 
     /// Geometry generated by plane splitting.
     pub gpu_split_geometry: SplitGeometryStore,
 
     /// Resolved resource rects.
     pub gpu_resource_rects: VertexDataStore<TexelRect>,
 
     /// General
     prims_to_resolve: Vec<PrimitiveIndex>,
 }
 
 impl PrimitiveStore {
     pub fn new() -> PrimitiveStore {
         PrimitiveStore {
             cpu_metadata: Vec::new(),
+            cpu_rectangles: Vec::new(),
             cpu_bounding_rects: Vec::new(),
             cpu_text_runs: Vec::new(),
             cpu_images: Vec::new(),
             cpu_yuv_images: Vec::new(),
             cpu_gradients: Vec::new(),
             cpu_radial_gradients: Vec::new(),
             cpu_borders: Vec::new(),
+            cpu_box_shadows: Vec::new(),
             prims_to_resolve: Vec::new(),
             gpu_geometry: VertexDataStore::new(),
             gpu_data16: VertexDataStore::new(),
             gpu_data32: VertexDataStore::new(),
-            gpu_data64: VertexDataStore::new(),
-            gpu_data128: VertexDataStore::new(),
             gpu_gradient_data: GradientDataStore::new(),
             gpu_split_geometry: SplitGeometryStore::new(),
             gpu_resource_rects: VertexDataStore::new(),
         }
     }
 
     pub fn recycle(self) -> Self {
         PrimitiveStore {
             cpu_metadata: recycle_vec(self.cpu_metadata),
+            cpu_rectangles: recycle_vec(self.cpu_rectangles),
             cpu_bounding_rects: recycle_vec(self.cpu_bounding_rects),
             cpu_text_runs: recycle_vec(self.cpu_text_runs),
             cpu_images: recycle_vec(self.cpu_images),
             cpu_yuv_images: recycle_vec(self.cpu_yuv_images),
             cpu_gradients: recycle_vec(self.cpu_gradients),
             cpu_radial_gradients: recycle_vec(self.cpu_radial_gradients),
             cpu_borders: recycle_vec(self.cpu_borders),
+            cpu_box_shadows: recycle_vec(self.cpu_box_shadows),
             prims_to_resolve: recycle_vec(self.prims_to_resolve),
             gpu_geometry: self.gpu_geometry.recycle(),
             gpu_data16: self.gpu_data16.recycle(),
             gpu_data32: self.gpu_data32.recycle(),
-            gpu_data64: self.gpu_data64.recycle(),
-            gpu_data128: self.gpu_data128.recycle(),
             gpu_gradient_data: self.gpu_gradient_data.recycle(),
             gpu_split_geometry: self.gpu_split_geometry.recycle(),
             gpu_resource_rects: self.gpu_resource_rects.recycle(),
         }
     }
 
     pub fn populate_clip_data(data: &mut [GpuBlock32], clip: ClipData) {
         data[0] = GpuBlock32::from(clip.rect);
@@ -727,185 +741,171 @@ impl PrimitiveStore {
                          container: PrimitiveContainer) -> PrimitiveIndex {
         let prim_index = self.cpu_metadata.len();
         self.cpu_bounding_rects.push(None);
         self.gpu_geometry.push(geometry);
 
         let metadata = match container {
             PrimitiveContainer::Rectangle(rect) => {
                 let is_opaque = rect.color.a == 1.0;
-                let gpu_address = self.gpu_data16.push(rect);
 
                 let metadata = PrimitiveMetadata {
                     is_opaque: is_opaque,
                     clips: clips,
                     clip_cache_info: clip_info,
                     prim_kind: PrimitiveKind::Rectangle,
-                    cpu_prim_index: SpecificPrimitiveIndex::invalid(),
-                    gpu_prim_index: gpu_address,
-                    gpu_data_address: GpuStoreAddress(0),
-                    gpu_data_count: 0,
+                    cpu_prim_index: SpecificPrimitiveIndex(self.cpu_rectangles.len()),
+                    gpu_location: GpuCacheHandle::new(),
                     render_task: None,
                     clip_task: None,
                 };
 
+                self.cpu_rectangles.push(rect);
+
                 metadata
             }
-            PrimitiveContainer::TextRun(mut text_cpu, text_gpu) => {
-                let gpu_address = self.gpu_data16.push(text_gpu);
+            PrimitiveContainer::TextRun(mut text_cpu) => {
                 let gpu_glyphs_address = self.gpu_data16.alloc(text_cpu.glyph_count);
                 text_cpu.resource_address = self.gpu_resource_rects.alloc(text_cpu.glyph_count);
+                text_cpu.gpu_data_address = gpu_glyphs_address;
+                text_cpu.gpu_data_count = text_cpu.glyph_count as i32;
 
                 let metadata = PrimitiveMetadata {
                     is_opaque: false,
                     clips: clips,
                     clip_cache_info: clip_info,
                     prim_kind: PrimitiveKind::TextRun,
                     cpu_prim_index: SpecificPrimitiveIndex(self.cpu_text_runs.len()),
-                    gpu_prim_index: gpu_address,
-                    gpu_data_address: gpu_glyphs_address,
-                    gpu_data_count: text_cpu.glyph_count as i32,
+                    gpu_location: GpuCacheHandle::new(),
                     render_task: None,
                     clip_task: None,
                 };
 
                 self.cpu_text_runs.push(text_cpu);
                 metadata
             }
-            PrimitiveContainer::Image(mut image_cpu, image_gpu) => {
+            PrimitiveContainer::Image(mut image_cpu) => {
                 image_cpu.resource_address = self.gpu_resource_rects.alloc(1);
 
-                let gpu_address = self.gpu_data16.push(image_gpu);
-
                 let metadata = PrimitiveMetadata {
                     is_opaque: false,
                     clips: clips,
                     clip_cache_info: clip_info,
                     prim_kind: PrimitiveKind::Image,
                     cpu_prim_index: SpecificPrimitiveIndex(self.cpu_images.len()),
-                    gpu_prim_index: gpu_address,
-                    gpu_data_address: GpuStoreAddress(0),
-                    gpu_data_count: 0,
+                    gpu_location: GpuCacheHandle::new(),
                     render_task: None,
                     clip_task: None,
                 };
 
                 self.cpu_images.push(image_cpu);
                 metadata
             }
-            PrimitiveContainer::YuvImage(mut image_cpu, image_gpu) => {
+            PrimitiveContainer::YuvImage(mut image_cpu) => {
                 image_cpu.yuv_resource_address = self.gpu_resource_rects.alloc(3);
 
-                let gpu_address = self.gpu_data16.push(image_gpu);
-
                 let metadata = PrimitiveMetadata {
                     is_opaque: true,
                     clips: clips,
                     clip_cache_info: clip_info,
                     prim_kind: PrimitiveKind::YuvImage,
                     cpu_prim_index: SpecificPrimitiveIndex(self.cpu_yuv_images.len()),
-                    gpu_prim_index: gpu_address,
-                    gpu_data_address: GpuStoreAddress(0),
-                    gpu_data_count: 0,
+                    gpu_location: GpuCacheHandle::new(),
                     render_task: None,
                     clip_task: None,
                 };
 
                 self.cpu_yuv_images.push(image_cpu);
                 metadata
             }
-            PrimitiveContainer::Border(border_cpu, border_gpu) => {
-                let gpu_address = self.gpu_data128.push(border_gpu);
-
+            PrimitiveContainer::Border(border_cpu) => {
                 let metadata = PrimitiveMetadata {
                     is_opaque: false,
                     clips: clips,
                     clip_cache_info: clip_info,
                     prim_kind: PrimitiveKind::Border,
                     cpu_prim_index: SpecificPrimitiveIndex(self.cpu_borders.len()),
-                    gpu_prim_index: gpu_address,
-                    gpu_data_address: GpuStoreAddress(0),
-                    gpu_data_count: 0,
+                    gpu_location: GpuCacheHandle::new(),
                     render_task: None,
                     clip_task: None,
                 };
 
                 self.cpu_borders.push(border_cpu);
                 metadata
             }
-            PrimitiveContainer::AlignedGradient(gradient_cpu, gradient_gpu) => {
-                let gpu_address = self.gpu_data64.push(gradient_gpu);
+            PrimitiveContainer::AlignedGradient(mut gradient_cpu) => {
                 let gpu_stops_address = self.gpu_data32.alloc(gradient_cpu.stops_count);
 
+                gradient_cpu.gpu_data_address = gpu_stops_address;
+                gradient_cpu.gpu_data_count = gradient_cpu.stops_count as i32;
+
                 let metadata = PrimitiveMetadata {
                     // TODO: calculate if the gradient is actually opaque
                     is_opaque: false,
                     clips: clips,
                     clip_cache_info: clip_info,
                     prim_kind: PrimitiveKind::AlignedGradient,
                     cpu_prim_index: SpecificPrimitiveIndex(self.cpu_gradients.len()),
-                    gpu_prim_index: gpu_address,
-                    gpu_data_address: gpu_stops_address,
-                    gpu_data_count: gradient_cpu.stops_count as i32,
+                    gpu_location: GpuCacheHandle::new(),
                     render_task: None,
                     clip_task: None,
                 };
 
                 self.cpu_gradients.push(gradient_cpu);
                 metadata
             }
-            PrimitiveContainer::AngleGradient(gradient_cpu, gradient_gpu) => {
-                let gpu_address = self.gpu_data64.push(gradient_gpu);
+            PrimitiveContainer::AngleGradient(mut gradient_cpu) => {
                 let gpu_gradient_address = self.gpu_gradient_data.alloc(1);
 
+                gradient_cpu.gpu_data_address = gpu_gradient_address;
+                gradient_cpu.gpu_data_count = 1;
+
                 let metadata = PrimitiveMetadata {
                     // TODO: calculate if the gradient is actually opaque
                     is_opaque: false,
                     clips: clips,
                     clip_cache_info: clip_info,
                     prim_kind: PrimitiveKind::AngleGradient,
                     cpu_prim_index: SpecificPrimitiveIndex(self.cpu_gradients.len()),
-                    gpu_prim_index: gpu_address,
-                    gpu_data_address: gpu_gradient_address,
-                    gpu_data_count: 1,
+                    gpu_location: GpuCacheHandle::new(),
                     render_task: None,
                     clip_task: None,
                 };
 
                 self.cpu_gradients.push(gradient_cpu);
                 metadata
             }
-            PrimitiveContainer::RadialGradient(radial_gradient_cpu, radial_gradient_gpu) => {
-                let gpu_address = self.gpu_data64.push(radial_gradient_gpu);
+            PrimitiveContainer::RadialGradient(mut radial_gradient_cpu) => {
                 let gpu_gradient_address = self.gpu_gradient_data.alloc(1);
 
+                radial_gradient_cpu.gpu_data_address = gpu_gradient_address;
+                radial_gradient_cpu.gpu_data_count = 1;
+
                 let metadata = PrimitiveMetadata {
                     // TODO: calculate if the gradient is actually opaque
                     is_opaque: false,
                     clips: clips,
                     clip_cache_info: clip_info,
                     prim_kind: PrimitiveKind::RadialGradient,
                     cpu_prim_index: SpecificPrimitiveIndex(self.cpu_radial_gradients.len()),
-                    gpu_prim_index: gpu_address,
-                    gpu_data_address: gpu_gradient_address,
-                    gpu_data_count: 1,
+                    gpu_location: GpuCacheHandle::new(),
                     render_task: None,
                     clip_task: None,
                 };
 
                 self.cpu_radial_gradients.push(radial_gradient_cpu);
                 metadata
             }
-            PrimitiveContainer::BoxShadow(box_shadow_gpu, instance_rects) => {
+            PrimitiveContainer::BoxShadow(box_shadow) => {
                 let cache_key = PrimitiveCacheKey::BoxShadow(BoxShadowPrimitiveCacheKey {
-                    blur_radius: Au::from_f32_px(box_shadow_gpu.blur_radius),
-                    border_radius: Au::from_f32_px(box_shadow_gpu.border_radius),
-                    inverted: box_shadow_gpu.inverted != 0.0,
-                    shadow_rect_size: Size2D::new(Au::from_f32_px(box_shadow_gpu.bs_rect.size.width),
-                                                  Au::from_f32_px(box_shadow_gpu.bs_rect.size.height)),
+                    blur_radius: Au::from_f32_px(box_shadow.blur_radius),
+                    border_radius: Au::from_f32_px(box_shadow.border_radius),
+                    inverted: box_shadow.inverted != 0.0,
+                    shadow_rect_size: Size2D::new(Au::from_f32_px(box_shadow.bs_rect.size.width),
+                                                  Au::from_f32_px(box_shadow.bs_rect.size.height)),
                 });
 
                 // The actual cache size is calculated during prepare_prim_for_render().
                 // This is necessary since the size may change depending on the device
                 // pixel ratio (for example, during zoom or moving the window to a
                 // monitor with a different device pixel ratio).
                 let cache_size = DeviceIntSize::zero();
 
@@ -915,38 +915,28 @@ impl PrimitiveStore {
                 // shader, to reduce the number of pixels that the expensive box
                 // shadow shader needs to run on.
                 // TODO(gw): In the future, we can probably merge the box shadow
                 // primitive (stretch) shader with the generic cached primitive shader.
                 let render_task = RenderTask::new_prim_cache(cache_key,
                                                              cache_size,
                                                              PrimitiveIndex(prim_index));
 
-                let gpu_prim_address = self.gpu_data64.push(box_shadow_gpu);
-                let gpu_data_address = self.gpu_data16.get_next_address();
-
                 let metadata = PrimitiveMetadata {
                     is_opaque: false,
                     clips: clips,
                     clip_cache_info: clip_info,
                     prim_kind: PrimitiveKind::BoxShadow,
-                    cpu_prim_index: SpecificPrimitiveIndex::invalid(),
-                    gpu_prim_index: gpu_prim_address,
-                    gpu_data_address: gpu_data_address,
-                    gpu_data_count: instance_rects.len() as i32,
+                    cpu_prim_index: SpecificPrimitiveIndex(self.cpu_box_shadows.len()),
+                    gpu_location: GpuCacheHandle::new(),
                     render_task: Some(render_task),
                     clip_task: None,
                 };
 
-                for rect in instance_rects {
-                    self.gpu_data16.push(InstanceRect {
-                        rect: rect,
-                    });
-                }
-
+                self.cpu_box_shadows.push(box_shadow);
                 metadata
             }
         };
 
         self.cpu_metadata.push(metadata);
 
         PrimitiveIndex(prim_index)
     }
@@ -1165,21 +1155,59 @@ impl PrimitiveStore {
     }
 
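-    /// Returns true if the bounding box needs to be updated.
+    /// Prepares a primitive for rendering: requests its GPU cache blocks and refreshes any dirty per-kind cached data.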
     pub fn prepare_prim_for_render(&mut self,
                                    prim_index: PrimitiveIndex,
                                    resource_cache: &mut ResourceCache,
                                    layer_transform: &LayerToWorldTransform,
                                    device_pixel_ratio: f32,
-                                   display_list: &BuiltDisplayList) -> bool {
+                                   display_list: &BuiltDisplayList) {
 
         let metadata = &mut self.cpu_metadata[prim_index.0];
         let mut prim_needs_resolve = false;
-        let mut rebuild_bounding_rect = false;
+
+        // Mark this GPU resource as required for this frame.
+        if let Some(request) = resource_cache.gpu_cache.request(&mut metadata.gpu_location) {
+            match metadata.prim_kind {
+                PrimitiveKind::Rectangle => {
+                    let rect = &self.cpu_rectangles[metadata.cpu_prim_index.0];
+                    rect.write_gpu_blocks(request);
+                }
+                PrimitiveKind::Border => {
+                    let border = &self.cpu_borders[metadata.cpu_prim_index.0];
+                    border.write_gpu_blocks(request);
+                }
+                PrimitiveKind::BoxShadow => {
+                    let box_shadow = &self.cpu_box_shadows[metadata.cpu_prim_index.0];
+                    box_shadow.write_gpu_blocks(request);
+                }
+                PrimitiveKind::Image => {
+                    let image = &self.cpu_images[metadata.cpu_prim_index.0];
+                    image.write_gpu_blocks(request);
+                }
+                PrimitiveKind::YuvImage => {
+                    let yuv_image = &self.cpu_yuv_images[metadata.cpu_prim_index.0];
+                    yuv_image.write_gpu_blocks(request);
+                }
+                PrimitiveKind::AlignedGradient |
+                PrimitiveKind::AngleGradient => {
+                    let gradient = &self.cpu_gradients[metadata.cpu_prim_index.0];
+                    gradient.write_gpu_blocks(request);
+                }
+                PrimitiveKind::RadialGradient => {
+                    let gradient = &self.cpu_radial_gradients[metadata.cpu_prim_index.0];
+                    gradient.write_gpu_blocks(request);
+                }
+                PrimitiveKind::TextRun => {
+                    let text = &self.cpu_text_runs[metadata.cpu_prim_index.0];
+                    text.write_gpu_blocks(request);
+                }
+            }
+        }
 
         if let Some(ref mut clip_info) = metadata.clip_cache_info {
             clip_info.update(&metadata.clips,
                              layer_transform,
                              &mut self.gpu_data32,
                              device_pixel_ratio,
                              display_list);
             for clip in &metadata.clips {
@@ -1196,49 +1224,45 @@ impl PrimitiveStore {
             PrimitiveKind::BoxShadow => {
                 // TODO(gw): Account for zoom factor!
                 // Here, we calculate the size of the patch required in order
                 // to create the box shadow corner. First, scale it by the
                 // device pixel ratio since the cache shader expects vertices
                 // in device space. The shader adds a 1-pixel border around
                 // the patch, in order to prevent bilinear filter artifacts as
                 // the patch is clamped / mirrored across the box shadow rect.
-                let box_shadow_gpu: &BoxShadowPrimitiveGpu = unsafe {
-                    mem::transmute(self.gpu_data64.get(metadata.gpu_prim_index))
-                };
-                let edge_size = box_shadow_gpu.edge_size.ceil() * device_pixel_ratio;
+                let box_shadow_cpu = &self.cpu_box_shadows[metadata.cpu_prim_index.0];
+                let edge_size = box_shadow_cpu.edge_size.ceil() * device_pixel_ratio;
                 let edge_size = edge_size as i32 + 2;   // Account for bilinear filtering
                 let cache_size = DeviceIntSize::new(edge_size, edge_size);
                 let location = RenderTaskLocation::Dynamic(None, cache_size);
                 metadata.render_task.as_mut().unwrap().location = location;
             }
             PrimitiveKind::TextRun => {
                 let text = &mut self.cpu_text_runs[metadata.cpu_prim_index.0];
 
                 let font_size_dp = text.logical_font_size.scale_by(device_pixel_ratio);
                 let src_glyphs = display_list.get(text.glyph_range);
                 prim_needs_resolve = true;
 
                 if text.cache_dirty {
-                    rebuild_bounding_rect = true;
                     text.cache_dirty = false;
 
-                    debug_assert!(metadata.gpu_data_count == src_glyphs.len() as i32);
+                    debug_assert!(text.gpu_data_count == src_glyphs.len() as i32);
                     debug_assert!(text.glyph_instances.is_empty());
 
-                    let dest_glyphs = self.gpu_data16.get_slice_mut(metadata.gpu_data_address,
+                    let dest_glyphs = self.gpu_data16.get_slice_mut(text.gpu_data_address,
                                                                     src_glyphs.len());
 
                     let mut glyph_key = GlyphKey::new(text.font_key,
                                                       font_size_dp,
                                                       text.color,
                                                       0,
                                                       LayoutPoint::new(0.0, 0.0),
                                                       text.render_mode);
-                    let mut local_rect = LayerRect::zero();
                     let mut actual_glyph_count = 0;
 
                     for src in src_glyphs {
                         glyph_key.index = src.index;
                         glyph_key.subpixel_point.set_offset(src.point, text.render_mode);
 
                         let dimensions = match resource_cache.get_glyph_dimensions(&glyph_key) {
                             None => continue,
@@ -1246,61 +1270,53 @@ impl PrimitiveStore {
                         };
 
                         // TODO(gw): Check for this and ensure platforms return None in this case!!!
                         debug_assert!(dimensions.width > 0 && dimensions.height > 0);
 
                         let x = src.point.x + dimensions.left as f32 / device_pixel_ratio;
                         let y = src.point.y - dimensions.top as f32 / device_pixel_ratio;
 
-                        let width = dimensions.width as f32 / device_pixel_ratio;
-                        let height = dimensions.height as f32 / device_pixel_ratio;
-
-                        let local_glyph_rect = LayerRect::new(LayerPoint::new(x, y),
-                                                              LayerSize::new(width, height));
-                        local_rect = local_rect.union(&local_glyph_rect);
+                        let glyph_pos = LayerPoint::new(x, y);
 
                         dest_glyphs[actual_glyph_count] = GpuBlock16::from(GlyphPrimitive {
                             padding: LayerPoint::zero(),
-                            offset: local_glyph_rect.origin,
+                            offset: glyph_pos,
                         });
 
                         text.glyph_instances.push(GlyphInstance {
                             index: src.index,
-                            point: LayoutPoint::new(src.point.x, src.point.y),
+                            point: glyph_pos,
                         });
 
                         actual_glyph_count += 1;
                     }
 
-                    // Expand the rectangle of the text run by the blur radius.
-                    let local_rect = local_rect.inflate(text.blur_radius, text.blur_radius);
-
                     let render_task = if text.blur_radius == 0.0 {
                         None
                     } else {
                         // This is a text-shadow element. Create a render task that will
                         // render the text run to a target, and then apply a gaussian
                         // blur to that text run in order to build the actual primitive
                         // which will be blitted to the framebuffer.
-                        let cache_width = (local_rect.size.width * device_pixel_ratio).ceil() as i32;
-                        let cache_height = (local_rect.size.height * device_pixel_ratio).ceil() as i32;
+                        let geom = &self.gpu_geometry.get(GpuStoreAddress(prim_index.0 as i32));
+                        let cache_width = (geom.local_rect.size.width * device_pixel_ratio).ceil() as i32;
+                        let cache_height = (geom.local_rect.size.height * device_pixel_ratio).ceil() as i32;
                         let cache_size = DeviceIntSize::new(cache_width, cache_height);
                         let cache_key = PrimitiveCacheKey::TextShadow(prim_index);
                         let blur_radius = device_length(text.blur_radius,
                                                         device_pixel_ratio);
                         Some(RenderTask::new_blur(cache_key,
                                                   cache_size,
                                                   blur_radius,
                                                   prim_index))
                     };
 
-                    metadata.gpu_data_count = actual_glyph_count as i32;
+                    text.gpu_data_count = actual_glyph_count as i32;
                     metadata.render_task = render_task;
-                    self.gpu_geometry.get_mut(GpuStoreAddress(prim_index.0 as i32)).local_rect = local_rect;
                 }
 
                 resource_cache.request_glyphs(text.font_key,
                                               font_size_dp,
                                               text.color,
                                               &text.glyph_instances,
                                               text.render_mode,
                                               text.glyph_options);
@@ -1338,18 +1354,18 @@ impl PrimitiveStore {
                 // TODO(nical): Currently assuming no tile_spacing for yuv images.
                 metadata.is_opaque = true;
             }
             PrimitiveKind::AlignedGradient => {
                 let gradient = &mut self.cpu_gradients[metadata.cpu_prim_index.0];
                 if gradient.cache_dirty {
                     let src_stops = display_list.get(gradient.stops_range);
 
-                    debug_assert!(metadata.gpu_data_count == src_stops.len() as i32);
-                    let dest_stops = self.gpu_data32.get_slice_mut(metadata.gpu_data_address,
+                    debug_assert!(gradient.gpu_data_count == src_stops.len() as i32);
+                    let dest_stops = self.gpu_data32.get_slice_mut(gradient.gpu_data_address,
                                                                    src_stops.len());
 
                     for (src, dest) in src_stops.zip(dest_stops.iter_mut()) {
                         *dest = GpuBlock32::from(GradientStopGpu {
                             offset: src.offset,
                             color: src.color.premultiplied(),
                             padding: [0.0; 3],
                         });
@@ -1358,38 +1374,36 @@ impl PrimitiveStore {
                     gradient.cache_dirty = false;
                 }
             }
             PrimitiveKind::AngleGradient => {
                 let gradient = &mut self.cpu_gradients[metadata.cpu_prim_index.0];
                 if gradient.cache_dirty {
                     let src_stops = display_list.get(gradient.stops_range);
 
-                    let dest_gradient = self.gpu_gradient_data.get_mut(metadata.gpu_data_address);
+                    let dest_gradient = self.gpu_gradient_data.get_mut(gradient.gpu_data_address);
                     dest_gradient.build(src_stops, gradient.reverse_stops);
                     gradient.cache_dirty = false;
                 }
             }
             PrimitiveKind::RadialGradient => {
                 let gradient = &mut self.cpu_radial_gradients[metadata.cpu_prim_index.0];
                 if gradient.cache_dirty {
                     let src_stops = display_list.get(gradient.stops_range);
 
-                    let dest_gradient = self.gpu_gradient_data.get_mut(metadata.gpu_data_address);
+                    let dest_gradient = self.gpu_gradient_data.get_mut(gradient.gpu_data_address);
                     dest_gradient.build(src_stops, false);
                     gradient.cache_dirty = false;
                 }
             }
         }
 
         if prim_needs_resolve {
             self.prims_to_resolve.push(prim_index);
         }
-
-        rebuild_bounding_rect
     }
 }
 
 
 macro_rules! define_gpu_block {
     ($name:ident: $ty:ty = $($derive:ident),* ) => (
         #[derive(Clone)]
         #[repr(C)]
@@ -1411,30 +1425,22 @@ macro_rules! define_gpu_block {
                     unsafe { mem::transmute(data) }
                 }
             }
         )*
     )
 }
 
 define_gpu_block!(GpuBlock16: [f32; 4] =
-    RectanglePrimitive, InstanceRect, GlyphPrimitive,
-    TextRunPrimitiveGpu, ImagePrimitiveGpu, YuvImagePrimitiveGpu
+    InstanceRect, GlyphPrimitive
 );
 define_gpu_block!(GpuBlock32: [f32; 8] =
     GradientStopGpu, ClipCorner, ClipRect, ImageMaskData,
     BorderCornerClipData, BorderCornerDashClipData, BorderCornerDotClipData
 );
-define_gpu_block!(GpuBlock64: [f32; 16] =
-    GradientPrimitiveGpu, RadialGradientPrimitiveGpu, BoxShadowPrimitiveGpu
-);
-define_gpu_block!(GpuBlock128: [f32; 32] =
-    BorderPrimitiveGpu
-);
-
 
 //Test for one clip region contains another
 trait InsideTest<T> {
     fn might_contain(&self, clip: &T) -> bool;
 }
 
 impl InsideTest<ComplexClipRegion> for ComplexClipRegion {
     // Returns true if clip is inside self, can return false negative
--- a/gfx/webrender/src/profiler.rs
+++ b/gfx/webrender/src/profiler.rs
@@ -285,27 +285,43 @@ impl TextureCacheProfileCounters {
             pages_rgb8: ResourceProfileCounter::new("Texture RGB8 cached pages"),
             pages_rgba8: ResourceProfileCounter::new("Texture RGBA8 cached pages"),
             pages_rg8: ResourceProfileCounter::new("Texture RG8 cached pages"),
         }
     }
 }
 
 #[derive(Clone)]
+pub struct GpuCacheProfileCounters {
+    pub allocated_rows: IntProfileCounter,
+    pub allocated_blocks: IntProfileCounter,
+}
+
+impl GpuCacheProfileCounters {
+    pub fn new() -> GpuCacheProfileCounters {
+        GpuCacheProfileCounters {
+            allocated_rows: IntProfileCounter::new("GPU cache rows"),
+            allocated_blocks: IntProfileCounter::new("GPU cache blocks"),
+        }
+    }
+}
+
+#[derive(Clone)]
 pub struct BackendProfileCounters {
     pub total_time: TimeProfileCounter,
     pub resources: ResourceProfileCounters,
     pub ipc: IpcProfileCounters,
 }
 
 #[derive(Clone)]
 pub struct ResourceProfileCounters {
     pub font_templates: ResourceProfileCounter,
     pub image_templates: ResourceProfileCounter,
     pub texture_cache: TextureCacheProfileCounters,
+    pub gpu_cache: GpuCacheProfileCounters,
 }
 
 #[derive(Clone)]
 pub struct IpcProfileCounters {
     pub build_time: TimeProfileCounter,
     pub consume_time: TimeProfileCounter,
     pub send_time: TimeProfileCounter,
     pub total_time: TimeProfileCounter,
@@ -327,24 +343,25 @@ impl IpcProfileCounters {
 impl BackendProfileCounters {
     pub fn new() -> BackendProfileCounters {
         BackendProfileCounters {
             total_time: TimeProfileCounter::new("Backend CPU Time", false),
             resources: ResourceProfileCounters {
                 font_templates: ResourceProfileCounter::new("Font Templates"),
                 image_templates: ResourceProfileCounter::new("Image Templates"),
                 texture_cache: TextureCacheProfileCounters::new(),
+                gpu_cache: GpuCacheProfileCounters::new(),
             },
             ipc: IpcProfileCounters {
                 build_time: TimeProfileCounter::new("Display List Build Time", false),
                 consume_time: TimeProfileCounter::new("Display List Consume Time", false),
                 send_time: TimeProfileCounter::new("Display List Send Time", false),
                 total_time: TimeProfileCounter::new("Total Display List Time", false),
                 display_lists: ResourceProfileCounter::new("Display Lists Sent"),
-            }
+            },
         }
     }
 
     pub fn reset(&mut self) {
         self.total_time.reset();
         self.ipc.total_time.reset();
         self.ipc.build_time.reset();
         self.ipc.consume_time.reset();
@@ -716,16 +733,18 @@ impl Profiler {
         ], debug_renderer, true);
 
         self.draw_counters(&[
             &frame_profile.total_primitives,
             &frame_profile.visible_primitives,
             &frame_profile.passes,
             &frame_profile.color_targets,
             &frame_profile.alpha_targets,
+            &backend_profile.resources.gpu_cache.allocated_rows,
+            &backend_profile.resources.gpu_cache.allocated_blocks,
         ], debug_renderer, true);
 
         self.draw_counters(&[
             &backend_profile.resources.font_templates,
             &backend_profile.resources.image_templates,
         ], debug_renderer, true);
 
         self.draw_counters(&[
@@ -748,19 +767,16 @@ impl Profiler {
             &renderer_profile.vertices,
         ], debug_renderer, true);
 
         self.draw_counters(&[
             &backend_profile.total_time,
             &renderer_timers.cpu_time,
             &renderer_timers.gpu_time,
         ], debug_renderer, false);
-        
-
-
 
         self.backend_time.push(backend_profile.total_time.nanoseconds);
         self.compositor_time.push(renderer_timers.cpu_time.nanoseconds);
         self.ipc_time.push(backend_profile.ipc.total_time.nanoseconds);
         self.gpu_time.push(gpu_time);
         self.gpu_frames.push(gpu_time, gpu_samples);
 
 
--- a/gfx/webrender/src/render_backend.rs
+++ b/gfx/webrender/src/render_backend.rs
@@ -1,16 +1,16 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use frame::Frame;
 use frame_builder::FrameBuilderConfig;
 use internal_types::{FontTemplate, SourceTexture, ResultMsg, RendererFrame};
-use profiler::{BackendProfileCounters, TextureCacheProfileCounters};
+use profiler::{BackendProfileCounters, GpuCacheProfileCounters, TextureCacheProfileCounters};
 use record::ApiRecordingReceiver;
 use resource_cache::ResourceCache;
 use scene::Scene;
 use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use std::sync::mpsc::Sender;
 use texture_cache::TextureCache;
 use time::precise_time_ns;
@@ -248,19 +248,20 @@ impl RenderBackend {
                             profile_counters.total_time.profile(|| {
                                 self.build_scene();
                             })
                         }
                         ApiMsg::Scroll(delta, cursor, move_phase) => {
                             profile_scope!("Scroll");
                             let frame = {
                                 let counters = &mut profile_counters.resources.texture_cache;
+                                let gpu_cache_counters = &mut profile_counters.resources.gpu_cache;
                                 profile_counters.total_time.profile(|| {
                                     if self.frame.scroll(delta, cursor, move_phase) {
-                                        Some(self.render(counters))
+                                        Some(self.render(counters, gpu_cache_counters))
                                     } else {
                                         None
                                     }
                                 })
                             };
 
                             match frame {
                                 Some(frame) => {
@@ -269,19 +270,20 @@ impl RenderBackend {
                                 }
                                 None => self.notify_compositor_of_new_scroll_frame(false),
                             }
                         }
                         ApiMsg::ScrollNodeWithId(origin, id, clamp) => {
                             profile_scope!("ScrollNodeWithScrollId");
                             let frame = {
                                 let counters = &mut profile_counters.resources.texture_cache;
+                                let gpu_cache_counters = &mut profile_counters.resources.gpu_cache;
                                 profile_counters.total_time.profile(|| {
                                     if self.frame.scroll_node(origin, id, clamp) {
-                                        Some(self.render(counters))
+                                        Some(self.render(counters, gpu_cache_counters))
                                     } else {
                                         None
                                     }
                                 })
                             };
 
                             match frame {
                                 Some(frame) => {
@@ -291,19 +293,20 @@ impl RenderBackend {
                                 None => self.notify_compositor_of_new_scroll_frame(false),
                             }
 
                         }
                         ApiMsg::TickScrollingBounce => {
                             profile_scope!("TickScrollingBounce");
                             let frame = {
                                 let counters = &mut profile_counters.resources.texture_cache;
+                                let gpu_cache_counters = &mut profile_counters.resources.gpu_cache;
                                 profile_counters.total_time.profile(|| {
                                     self.frame.tick_scrolling_bounce_animations();
-                                    self.render(counters)
+                                    self.render(counters, gpu_cache_counters)
                                 })
                             };
 
                             self.publish_frame_and_notify_compositor(frame, &mut profile_counters);
                         }
                         ApiMsg::TranslatePointToLayerSpace(..) => {
                             panic!("unused api - remove from webrender_traits");
                         }
@@ -403,18 +406,19 @@ impl RenderBackend {
                                 self.scene.properties.set_properties(property_bindings);
                                 profile_counters.total_time.profile(|| {
                                     self.build_scene();
                                 });
                             }
 
                             let frame = {
                                 let counters = &mut profile_counters.resources.texture_cache;
+                                let gpu_cache_counters = &mut profile_counters.resources.gpu_cache;
                                 profile_counters.total_time.profile(|| {
-                                    self.render(counters)
+                                    self.render(counters, gpu_cache_counters)
                                 })
                             };
                             if self.scene.root_pipeline_id.is_some() {
                                 self.publish_frame_and_notify_compositor(frame, &mut profile_counters);
                                 frame_counter += 1;
                             }
                         }
                         ApiMsg::ExternalEvent(evt) => {
@@ -478,26 +482,28 @@ impl RenderBackend {
         self.frame.create(&self.scene,
                           &mut self.resource_cache,
                           self.window_size,
                           self.inner_rect,
                           accumulated_scale_factor);
     }
 
     fn render(&mut self,
-              texture_cache_profile: &mut TextureCacheProfileCounters)
+              texture_cache_profile: &mut TextureCacheProfileCounters,
+              gpu_cache_profile: &mut GpuCacheProfileCounters)
               -> RendererFrame {
         let accumulated_scale_factor = self.accumulated_scale_factor();
         let pan = LayerPoint::new(self.pan.x as f32 / accumulated_scale_factor,
                                   self.pan.y as f32 / accumulated_scale_factor);
         let frame = self.frame.build(&mut self.resource_cache,
                                      &self.scene.display_lists,
                                      accumulated_scale_factor,
                                      pan,
-                                     texture_cache_profile);
+                                     texture_cache_profile,
+                                     gpu_cache_profile);
         frame
     }
 
     fn publish_frame(&mut self,
                      frame: RendererFrame,
                      profile_counters: &mut BackendProfileCounters) {
         let pending_update = self.resource_cache.pending_updates();
         let msg = ResultMsg::NewFrame(frame, pending_update, profile_counters.clone());
--- a/gfx/webrender/src/renderer.rs
+++ b/gfx/webrender/src/renderer.rs
@@ -13,16 +13,17 @@ use debug_colors;
 use debug_render::DebugRenderer;
 use device::{DepthFunction, Device, FrameId, ProgramId, TextureId, VertexFormat, GpuMarker, GpuProfiler};
 use device::{GpuSample, TextureFilter, VAOId, VertexUsageHint, FileWatcherHandler, TextureTarget, ShaderError};
 use device::get_gl_format_bgra;
 use euclid::Matrix4D;
 use fnv::FnvHasher;
 use frame_builder::FrameBuilderConfig;
 use gleam::gl;
+use gpu_cache::{GpuCacheUpdate, GpuCacheUpdateList};
 use gpu_store::{GpuStore, GpuStoreLayout};
 use internal_types::{CacheTextureId, RendererFrame, ResultMsg, TextureUpdateOp};
 use internal_types::{TextureUpdateList, PackedVertex, RenderTargetMode};
 use internal_types::{ORTHO_NEAR_PLANE, ORTHO_FAR_PLANE, SourceTexture};
 use internal_types::{BatchTextures, TextureSampler};
 use prim_store::{GradientData, SplitGeometry};
 use profiler::{Profiler, BackendProfileCounters};
 use profiler::{GpuProfileTag, RendererProfileTimers, RendererProfileCounters};
@@ -33,16 +34,17 @@ use std;
 use std::cmp;
 use std::collections::{HashMap, VecDeque};
 use std::f32;
 use std::hash::BuildHasherDefault;
 use std::marker::PhantomData;
 use std::mem;
 use std::path::PathBuf;
 use std::rc::Rc;
+use std::slice;
 use std::sync::{Arc, Mutex};
 use std::sync::mpsc::{channel, Receiver, Sender};
 use std::thread;
 use texture_cache::TextureCache;
 use rayon::ThreadPool;
 use rayon::Configuration as ThreadPoolConfig;
 use tiling::{AlphaBatchKind, BlurCommand, CompositePrimitiveInstance, Frame, PrimitiveBatch, RenderTarget};
 use tiling::{AlphaRenderTarget, CacheClipInstance, PrimitiveInstance, ColorRenderTarget, RenderTargetKind};
@@ -186,16 +188,80 @@ pub enum BlendMode {
     None,
     Alpha,
     PremultipliedAlpha,
 
     // Use the color of the text itself as a constant color blend factor.
     Subpixel(ColorF),
 }
 
+/// The device-specific representation of the cache texture in gpu_cache.rs
+struct CacheTexture {
+    id: TextureId,
+}
+
+impl CacheTexture {
+    fn new(device: &mut Device) -> CacheTexture {
+        let id = device.create_texture_ids(1, TextureTarget::Default)[0];
+
+        CacheTexture {
+            id: id,
+        }
+    }
+
+    fn update(&mut self, device: &mut Device, updates: &GpuCacheUpdateList) {
+        // See if we need to create or resize the texture.
+        let current_dimensions = device.get_texture_dimensions(self.id);
+
+        if updates.height > current_dimensions.height {
+            // TODO(gw): Handle resizing an existing cache texture.
+            if current_dimensions.height > 0 {
+                panic!("TODO: Implement texture copy!!!");
+            }
+
+            // Create a f32 texture that can be used for the vertex shader
+            // to fetch data from.
+            device.init_texture(self.id,
+                                MAX_VERTEX_TEXTURE_WIDTH as u32,
+                                updates.height as u32,
+                                ImageFormat::RGBAF32,
+                                TextureFilter::Nearest,
+                                RenderTargetMode::None,
+                                None);
+        }
+
+        for update in &updates.updates {
+            match update {
+                &GpuCacheUpdate::Copy { block_index, block_count, address } => {
+                    // Apply an incremental update to the cache texture.
+                    // TODO(gw): For the initial implementation, we will just
+                    //           use update_texture() since it's simple. If / when
+                    //           we profile this and find it to be slow on some / all
+                    //           devices - we can look into other options, such as
+                    //           using glMapBuffer() with the unsynchronized bit,
+                    //           and managing the synchronization ourselves with fences.
+                    let data: &[u8] = unsafe {
+                        let ptr = updates.blocks
+                                         .as_ptr()
+                                         .offset(block_index as isize);
+                        slice::from_raw_parts(ptr as *const _, block_count * 16)
+                    };
+                    device.update_texture(self.id,
+                                          address.u as u32,
+                                          address.v as u32,
+                                          block_count as u32,
+                                          1,
+                                          None,
+                                          data);
+                }
+            }
+        }
+    }
+}
+
 struct GpuDataTexture<L> {
     id: TextureId,
     layout: PhantomData<L>,
 }
 
 impl<L: GpuStoreLayout> GpuDataTexture<L> {
     fn new(device: &mut Device) -> GpuDataTexture<L> {
         let id = device.create_texture_ids(1, TextureTarget::Default)[0];
@@ -462,58 +528,50 @@ fn create_clip_shader(name: &'static str
 }
 
 struct GpuDataTextures {
     layer_texture: VertexDataTexture,
     render_task_texture: VertexDataTexture,
     prim_geom_texture: VertexDataTexture,
     data16_texture: VertexDataTexture,
     data32_texture: VertexDataTexture,
-    data64_texture: VertexDataTexture,
-    data128_texture: VertexDataTexture,
     resource_rects_texture: VertexDataTexture,
     gradient_data_texture: GradientDataTexture,
     split_geometry_texture: SplitGeometryTexture,
 }
 
 impl GpuDataTextures {
     fn new(device: &mut Device) -> GpuDataTextures {
         GpuDataTextures {
             layer_texture: VertexDataTexture::new(device),
             render_task_texture: VertexDataTexture::new(device),
             prim_geom_texture: VertexDataTexture::new(device),
             data16_texture: VertexDataTexture::new(device),
             data32_texture: VertexDataTexture::new(device),
-            data64_texture: VertexDataTexture::new(device),
-            data128_texture: VertexDataTexture::new(device),
             resource_rects_texture: VertexDataTexture::new(device),
             gradient_data_texture: GradientDataTexture::new(device),
             split_geometry_texture: SplitGeometryTexture::new(device),
         }
     }
 
     fn init_frame(&mut self, device: &mut Device, frame: &mut Frame) {
         self.data16_texture.init(device, &mut frame.gpu_data16);
         self.data32_texture.init(device, &mut frame.gpu_data32);
-        self.data64_texture.init(device, &mut frame.gpu_data64);
-        self.data128_texture.init(device, &mut frame.gpu_data128);
         self.prim_geom_texture.init(device, &mut frame.gpu_geometry);
         self.resource_rects_texture.init(device, &mut frame.gpu_resource_rects);
         self.layer_texture.init(device, &mut frame.layer_texture_data);
         self.render_task_texture.init(device, &mut frame.render_task_data);
         self.gradient_data_texture.init(device, &mut frame.gpu_gradient_data);
         self.split_geometry_texture.init(device, &mut frame.gpu_split_geometry);
 
         device.bind_texture(TextureSampler::Layers, self.layer_texture.id);
         device.bind_texture(TextureSampler::RenderTasks, self.render_task_texture.id);
         device.bind_texture(TextureSampler::Geometry, self.prim_geom_texture.id);
         device.bind_texture(TextureSampler::Data16, self.data16_texture.id);
         device.bind_texture(TextureSampler::Data32, self.data32_texture.id);
-        device.bind_texture(TextureSampler::Data64, self.data64_texture.id);
-        device.bind_texture(TextureSampler::Data128, self.data128_texture.id);
         device.bind_texture(TextureSampler::ResourceRects, self.resource_rects_texture.id);
         device.bind_texture(TextureSampler::Gradients, self.gradient_data_texture.id);
         device.bind_texture(TextureSampler::SplitGeometry, self.split_geometry_texture.id);
     }
 }
 
 #[derive(Clone, Debug, PartialEq)]
 pub enum ReadPixelsFormat {
@@ -522,16 +580,17 @@ pub enum ReadPixelsFormat {
 }
 
 /// The renderer is responsible for submitting to the GPU the work prepared by the
 /// RenderBackend.
 pub struct Renderer {
     result_rx: Receiver<ResultMsg>,
     device: Device,
     pending_texture_updates: Vec<TextureUpdateList>,
+    pending_gpu_cache_updates: Vec<GpuCacheUpdateList>,
     pending_shader_updates: Vec<PathBuf>,
     current_frame: Option<RendererFrame>,
 
     // These are "cache shaders". These shaders are used to
     // draw intermediate results to cache targets. The results
     // of these shaders are then used by the primitive shaders.
     cs_box_shadow: LazilyCompiledShader,
     cs_text_run: LazilyCompiledShader,
@@ -590,16 +649,18 @@ pub struct Renderer {
     gpu_profile: GpuProfiler<GpuProfileTag>,
     prim_vao_id: VAOId,
     blur_vao_id: VAOId,
     clip_vao_id: VAOId,
 
     gdt_index: usize,
     gpu_data_textures: [GpuDataTextures; GPU_DATA_TEXTURE_POOL],
 
+    gpu_cache_texture: CacheTexture,
+
     pipeline_epoch_map: HashMap<PipelineId, Epoch, BuildHasherDefault<FnvHasher>>,
     /// Used to dispatch functions to the main thread's event loop.
     /// Required to allow GLContext sharing in some implementations like WGL.
     main_thread_dispatcher: Arc<Mutex<Option<Box<RenderDispatcher>>>>,
 
     /// A vector for fast resolves of texture cache IDs to
     /// native texture IDs. This maps to a free-list managed
     /// by the backend thread / texture cache. We free the
@@ -1062,17 +1123,19 @@ impl Renderer {
             debug: options.debug,
             cache_expiry_frames: options.cache_expiry_frames,
         };
 
         let device_pixel_ratio = options.device_pixel_ratio;
         let render_target_debug = options.render_target_debug;
         let payload_tx_for_backend = payload_tx.clone();
         let recorder = options.recorder;
-        let worker_config = ThreadPoolConfig::new().thread_name(|idx|{ format!("WebRender:Worker#{}", idx) });
+        let worker_config = ThreadPoolConfig::new()
+            .thread_name(|idx|{ format!("WebRender:Worker#{}", idx) })
+            .start_handler(|idx| { register_thread_with_profiler(format!("WebRender:Worker#{}", idx)); });
         let workers = options.workers.take().unwrap_or_else(||{
             Arc::new(ThreadPool::new(worker_config).unwrap())
         });
 
         let blob_image_renderer = options.blob_image_renderer.take();
         try!{ thread::Builder::new().name("RenderBackend".to_string()).spawn(move || {
             let mut backend = RenderBackend::new(api_rx,
                                                  payload_rx,
@@ -1087,23 +1150,26 @@ impl Renderer {
                                                  recorder,
                                                  backend_main_thread_dispatcher,
                                                  blob_image_renderer,
                                                  backend_vr_compositor,
                                                  initial_window_size);
             backend.run(backend_profile_counters);
         })};
 
+        let gpu_cache_texture = CacheTexture::new(&mut device);
+
         let gpu_profile = GpuProfiler::new(device.rc_gl());
 
         let renderer = Renderer {
             result_rx: result_rx,
             device: device,
             current_frame: None,
             pending_texture_updates: Vec::new(),
+            pending_gpu_cache_updates: Vec::new(),
             pending_shader_updates: Vec::new(),
             cs_box_shadow: cs_box_shadow,
             cs_text_run: cs_text_run,
             cs_blur: cs_blur,
             cs_clip_rectangle: cs_clip_rectangle,
             cs_clip_border: cs_clip_border,
             cs_clip_image: cs_clip_image,
             ps_rectangle: ps_rectangle,
@@ -1149,16 +1215,17 @@ impl Renderer {
             cache_texture_id_map: Vec::new(),
             dummy_cache_texture_id: dummy_cache_texture_id,
             dither_matrix_texture_id: dither_matrix_texture_id,
             external_image_handler: None,
             external_images: HashMap::default(),
             vr_compositor_handler: vr_compositor,
             cpu_profiles: VecDeque::new(),
             gpu_profiles: VecDeque::new(),
+            gpu_cache_texture: gpu_cache_texture,
         };
 
         let sender = RenderApiSender::new(api_tx, payload_tx);
         Ok((renderer, sender))
     }
 
     pub fn get_graphics_api_info(&self) -> GraphicsApiInfo {
         GraphicsApiInfo {
@@ -1213,18 +1280,25 @@ impl Renderer {
     ///
     /// Should be called before `render()`, as texture cache updates are done here.
     pub fn update(&mut self) {
         profile_scope!("update");
 
         // Pull any pending results and return the most recent.
         while let Ok(msg) = self.result_rx.try_recv() {
             match msg {
-                ResultMsg::NewFrame(frame, texture_update_list, profile_counters) => {
+                ResultMsg::NewFrame(mut frame, texture_update_list, profile_counters) => {
                     self.pending_texture_updates.push(texture_update_list);
+                    if let Some(ref mut frame) = frame.frame {
+                        // TODO(gw): This whole message / Frame / RendererFrame stuff
+                        //           is really messy and needs to be refactored!!
+                        if let Some(update_list) = frame.gpu_cache_updates.take() {
+                            self.pending_gpu_cache_updates.push(update_list);
+                        }
+                    }
                     self.backend_profile_counters = profile_counters;
 
                     // Update the list of available epochs for use during reftests.
                     // This is a workaround for https://github.com/servo/servo/issues/13149.
                     for (pipeline_id, epoch) in &frame.pipeline_epoch_map {
                         self.pipeline_epoch_map.insert(*pipeline_id, *epoch);
                     }
 
@@ -1304,16 +1378,19 @@ impl Renderer {
                         self.gpu_profile.begin_frame(frame_id);
 
                         self.device.disable_scissor();
                         self.device.disable_depth();
                         self.device.set_blend(false);
                         //self.update_shaders();
                         self.update_texture_cache();
 
+                        self.update_gpu_cache();
+                        self.device.bind_texture(TextureSampler::ResourceCache, self.gpu_cache_texture.id);
+
                         frame_id
                     };
 
                     self.draw_tile_frame(frame, &framebuffer_size);
 
                     self.gpu_profile.end_frame();
                     cpu_frame_id
                 });
@@ -1376,16 +1453,23 @@ impl Renderer {
         }
 
         if update_uniforms {
             self.update_uniform_locations();
         }
     }
 */
 
+    fn update_gpu_cache(&mut self) {
+        let _gm = GpuMarker::new(self.device.rc_gl(), "gpu cache update");
+        for update_list in self.pending_gpu_cache_updates.drain(..) {
+            self.gpu_cache_texture.update(&mut self.device, &update_list);
+        }
+    }
+
     fn update_texture_cache(&mut self) {
         let _gm = GpuMarker::new(self.device.rc_gl(), "texture cache update");
         let mut pending_texture_updates = mem::replace(&mut self.pending_texture_updates, vec![]);
         for update_list in pending_texture_updates.drain(..) {
             for update in update_list.updates {
                 match update.op {
                     TextureUpdateOp::Create { width, height, format, filter, mode, data } => {
                         let CacheTextureId(cache_texture_index) = update.id;
--- a/gfx/webrender/src/resource_cache.rs
+++ b/gfx/webrender/src/resource_cache.rs
@@ -1,16 +1,17 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use app_units::Au;
 use device::TextureFilter;
 use fnv::FnvHasher;
 use frame::FrameId;
+use gpu_cache::GpuCache;
 use internal_types::{FontTemplate, SourceTexture, TextureUpdateList};
 use profiler::TextureCacheProfileCounters;
 use std::collections::{HashMap, HashSet};
 use std::collections::hash_map::Entry::{self, Occupied, Vacant};
 use std::fmt::Debug;
 use std::hash::BuildHasherDefault;
 use std::hash::Hash;
 use std::mem;
@@ -193,16 +194,17 @@ pub struct ResourceCache {
     webgl_textures: HashMap<WebGLContextId, WebGLTexture, BuildHasherDefault<FnvHasher>>,
 
     font_templates: HashMap<FontKey, FontTemplate, BuildHasherDefault<FnvHasher>>,
     image_templates: ImageTemplates,
     state: State,
     current_frame_id: FrameId,
 
     texture_cache: TextureCache,
+    pub gpu_cache: GpuCache,
 
     // TODO(gw): We should expire (parts of) this cache semi-regularly!
     cached_glyph_dimensions: HashMap<GlyphKey, Option<GlyphDimensions>, BuildHasherDefault<FnvHasher>>,
     pending_image_requests: Vec<ImageRequest>,
     glyph_rasterizer: GlyphRasterizer,
 
     blob_image_renderer: Option<Box<BlobImageRenderer>>,
     blob_image_requests: HashSet<ImageRequest>,
@@ -215,16 +217,17 @@ impl ResourceCache {
         ResourceCache {
             cached_glyphs: ResourceClassCache::new(),
             cached_images: ResourceClassCache::new(),
             webgl_textures: HashMap::default(),
             font_templates: HashMap::default(),
             image_templates: ImageTemplates::new(),
             cached_glyph_dimensions: HashMap::default(),
             texture_cache: texture_cache,
+            gpu_cache: GpuCache::new(),
             state: State::Idle,
             current_frame_id: FrameId(0),
             pending_image_requests: Vec::new(),
             glyph_rasterizer: GlyphRasterizer::new(workers),
 
             blob_image_renderer: blob_image_renderer,
             blob_image_requests: HashSet::new(),
         }
@@ -559,16 +562,17 @@ impl ResourceCache {
         self.cached_images.expire_old_resources(&mut self.texture_cache, frame_id);
         self.cached_glyphs.expire_old_resources(&mut self.texture_cache, frame_id);
     }
 
     pub fn begin_frame(&mut self, frame_id: FrameId) {
         debug_assert_eq!(self.state, State::Idle);
         self.state = State::AddResources;
         self.current_frame_id = frame_id;
+        self.gpu_cache.begin_frame();
     }
 
     pub fn block_until_all_resources_added(&mut self,
                                            texture_cache_profile: &mut TextureCacheProfileCounters) {
         profile_scope!("block_until_all_resources_added");
 
         debug_assert_eq!(self.state, State::AddResources);
         self.state = State::QueryResources;
--- a/gfx/webrender/src/tiling.rs
+++ b/gfx/webrender/src/tiling.rs
@@ -1,22 +1,23 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use app_units::Au;
 use border::{BorderCornerInstance, BorderCornerSide};
 use device::TextureId;
 use fnv::FnvHasher;
+use gpu_cache::GpuCacheUpdateList;
 use gpu_store::GpuStoreAddress;
 use internal_types::{ANGLE_FLOAT_TO_FIXED, BatchTextures, CacheTextureId, LowLevelFilterOp};
 use internal_types::SourceTexture;
 use mask_cache::MaskCacheInfo;
-use prim_store::{CLIP_DATA_GPU_SIZE, DeferredResolve, GpuBlock128, GpuBlock16, GpuBlock32};
-use prim_store::{GpuBlock64, GradientData, SplitGeometry, PrimitiveCacheKey, PrimitiveGeometry};
+use prim_store::{CLIP_DATA_GPU_SIZE, DeferredResolve, GpuBlock16, GpuBlock32};
+use prim_store::{GradientData, SplitGeometry, PrimitiveCacheKey, PrimitiveGeometry};
 use prim_store::{PrimitiveIndex, PrimitiveKind, PrimitiveMetadata, PrimitiveStore, TexelRect};
 use profiler::FrameProfileCounters;
 use render_task::{AlphaRenderItem, MaskGeometryKind, MaskSegment, RenderTask, RenderTaskData};
 use render_task::{RenderTaskId, RenderTaskIndex, RenderTaskKey, RenderTaskKind};
 use render_task::RenderTaskLocation;
 use renderer::BlendMode;
 use renderer::ImageBufferKind;
 use resource_cache::ResourceCache;
@@ -40,16 +41,17 @@ pub type DisplayListMap = HashMap<Pipeli
                                   BuiltDisplayList,
                                   BuildHasherDefault<FnvHasher>>;
 
 trait AlphaBatchHelpers {
     fn get_color_textures(&self, metadata: &PrimitiveMetadata) -> [SourceTexture; 3];
     fn get_blend_mode(&self,
                       needs_blending: bool,
                       metadata: &PrimitiveMetadata) -> BlendMode;
+    fn can_draw(&self, metadata: &PrimitiveMetadata) -> bool;
 }
 
 impl AlphaBatchHelpers for PrimitiveStore {
     fn get_color_textures(&self, metadata: &PrimitiveMetadata) -> [SourceTexture; 3] {
         let invalid = SourceTexture::Invalid;
         match metadata.prim_kind {
             PrimitiveKind::Border |
             PrimitiveKind::BoxShadow |
@@ -100,16 +102,36 @@ impl AlphaBatchHelpers for PrimitiveStor
                 if needs_blending {
                     BlendMode::Alpha
                 } else {
                     BlendMode::None
                 }
             }
         }
     }
+
+    fn can_draw(&self, metadata: &PrimitiveMetadata) -> bool {
+        match metadata.prim_kind {
+            PrimitiveKind::Border |
+            PrimitiveKind::BoxShadow |
+            PrimitiveKind::Rectangle |
+            PrimitiveKind::AlignedGradient |
+            PrimitiveKind::AngleGradient |
+            PrimitiveKind::RadialGradient |
+            PrimitiveKind::Image |
+            PrimitiveKind::YuvImage => true,
+            PrimitiveKind::TextRun => {
+                // If the glyph failed to rasterize, we may have a text run
+                // without a valid texture. In this case, we need to prevent
+                // drawing the primitive this frame.
+                let text_run_cpu = &self.cpu_text_runs[metadata.cpu_prim_index.0];
+                text_run_cpu.color_texture_id != SourceTexture::Invalid
+            }
+        }
+    }
 }
 
 #[derive(Debug)]
 pub struct ScrollbarPrimitive {
     pub clip_id: ClipId,
     pub prim_index: PrimitiveIndex,
     pub border_radius: f32,
 }
@@ -360,16 +382,20 @@ impl AlphaRenderItem {
                                                                mode as u32 as i32,
                                                                0,
                                                                z);
 
                 batch.add_instance(PrimitiveInstance::from(instance));
             }
             AlphaRenderItem::Primitive(clip_scroll_group_index_opt, prim_index, z) => {
                 let prim_metadata = ctx.prim_store.get_metadata(prim_index);
+                // Bail out if this primitive can't be drawn this frame for some reason.
+                if !ctx.prim_store.can_draw(prim_metadata) {
+                    return;
+                }
                 let (transform_kind, packed_layer_index) = match clip_scroll_group_index_opt {
                     Some(group_index) => {
                         let group = &ctx.clip_scroll_group_store[group_index.0];
                         let bounding_rect = group.screen_bounding_rect.as_ref().unwrap();
                         (bounding_rect.0, group.packed_layer_index)
                     },
                     None => (TransformedRectKind::AxisAligned, PackedLayerIndex(0)),
                 };
@@ -393,18 +419,21 @@ impl AlphaRenderItem {
                         OPAQUE_TASK_INDEX
                     }
                 };
                 let needs_blending = !prim_metadata.is_opaque ||
                                      needs_clipping ||
                                      transform_kind == TransformedRectKind::Complex;
                 let blend_mode = ctx.prim_store.get_blend_mode(needs_blending, prim_metadata);
 
+                let prim_cache_address = prim_metadata.gpu_location
+                                                      .as_int(&ctx.resource_cache.gpu_cache);
+
                 let base_instance = SimplePrimitiveInstance::new(prim_index,
-                                                                 prim_metadata.gpu_prim_index,
+                                                                 prim_cache_address,
                                                                  task_index,
                                                                  clip_task_index,
                                                                  packed_layer_index,
                                                                  z);
 
                 match prim_metadata.prim_kind {
                     PrimitiveKind::Border => {
                         let border_cpu = &ctx.prim_store.cpu_borders[prim_metadata.cpu_prim_index.0];
@@ -483,44 +512,47 @@ impl AlphaRenderItem {
                             Some(ref task) => {
                                 let cache_task_id = task.id;
                                 render_tasks.get_task_index(&cache_task_id,
                                                             child_pass_index).0 as i32
                             }
                             None => 0,
                         };
 
-                        for glyph_index in 0..prim_metadata.gpu_data_count {
+                        for glyph_index in 0..text_cpu.gpu_data_count {
                             let user_data1 = match batch_kind {
                                 AlphaBatchKind::TextRun => text_cpu.resource_address.0 + glyph_index,
                                 AlphaBatchKind::CacheImage => cache_task_index,
                                 _ => unreachable!(),
                             };
 
-                            batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0 + glyph_index,
+                            batch.add_instance(base_instance.build(text_cpu.gpu_data_address.0 + glyph_index,
                                                                    user_data1));
                         }
                     }
                     PrimitiveKind::AlignedGradient => {
+                        let gradient_cpu = &ctx.prim_store.cpu_gradients[prim_metadata.cpu_prim_index.0];
                         let key = AlphaBatchKey::new(AlphaBatchKind::AlignedGradient, flags, blend_mode, textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
-                        for part_index in 0..(prim_metadata.gpu_data_count - 1) {
-                            batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0 + part_index, 0));
+                        for part_index in 0..(gradient_cpu.gpu_data_count - 1) {
+                            batch.add_instance(base_instance.build(gradient_cpu.gpu_data_address.0 + part_index, 0));
                         }
                     }
                     PrimitiveKind::AngleGradient => {
+                        let gradient_cpu = &ctx.prim_store.cpu_gradients[prim_metadata.cpu_prim_index.0];
                         let key = AlphaBatchKey::new(AlphaBatchKind::AngleGradient, flags, blend_mode, textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
-                        batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0,
+                        batch.add_instance(base_instance.build(gradient_cpu.gpu_data_address.0,
                                                                0));
                     }
                     PrimitiveKind::RadialGradient => {
+                        let gradient_cpu = &ctx.prim_store.cpu_radial_gradients[prim_metadata.cpu_prim_index.0];
                         let key = AlphaBatchKey::new(AlphaBatchKind::RadialGradient, flags, blend_mode, textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
-                        batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0,
+                        batch.add_instance(base_instance.build(gradient_cpu.gpu_data_address.0,
                                                                0));
                     }
                     PrimitiveKind::YuvImage => {
                         let image_yuv_cpu = &ctx.prim_store.cpu_yuv_images[prim_metadata.cpu_prim_index.0];
 
                         let get_buffer_kind = |texture: SourceTexture| {
                             match texture {
                                 SourceTexture::External(ext_image) => {
@@ -552,25 +584,26 @@ impl AlphaRenderItem {
                                                      blend_mode,
                                                      textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
 
                         batch.add_instance(base_instance.build(image_yuv_cpu.yuv_resource_address.0,
                                                                0));
                     }
                     PrimitiveKind::BoxShadow => {
+                        let box_shadow = &ctx.prim_store.cpu_box_shadows[prim_metadata.cpu_prim_index.0];
                         let cache_task_id = &prim_metadata.render_task.as_ref().unwrap().id;
                         let cache_task_index = render_tasks.get_task_index(cache_task_id,
                                                                            child_pass_index);
 
                         let key = AlphaBatchKey::new(AlphaBatchKind::BoxShadow, flags, blend_mode, textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
 
-                        for rect_index in 0..prim_metadata.gpu_data_count {
-                            batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0 + rect_index,
+                        for rect_index in 0..box_shadow.rects.len() {
+                            batch.add_instance(base_instance.build(rect_index as i32,
                                                                    cache_task_index.0 as i32));
                         }
                     }
                 }
             }
             AlphaRenderItem::SplitComposite(sc_index, task_id, gpu_address, z) => {
                 let key = AlphaBatchKey::new(AlphaBatchKind::SplitComposite,
                                              AlphaBatchKeyFlags::empty(),
@@ -942,20 +975,23 @@ impl RenderTarget for ColorRenderTarget 
                     src_task_id: render_tasks.get_task_index(&src_id, child_pass_index).0 as i32,
                     blur_direction: BlurDirection::Horizontal as i32,
                     padding: 0,
                 });
             }
             RenderTaskKind::CachePrimitive(prim_index) => {
                 let prim_metadata = ctx.prim_store.get_metadata(prim_index);
 
+                let prim_address = prim_metadata.gpu_location
+                                                .as_int(&ctx.resource_cache.gpu_cache);
+
                 match prim_metadata.prim_kind {
                     PrimitiveKind::BoxShadow => {
                         let instance = SimplePrimitiveInstance::new(prim_index,
-                                                                    prim_metadata.gpu_prim_index,
+                                                                    prim_address,
                                                                     render_tasks.get_task_index(&task.id, pass_index),
                                                                     RenderTaskIndex(0),
                                                                     PackedLayerIndex(0),
                                                                     0);     // z is disabled for rendering cache primitives
                         self.box_shadow_cache_prims.push(instance.build(0, 0));
                     }
                     PrimitiveKind::TextRun => {
                         let text = &ctx.prim_store.cpu_text_runs[prim_metadata.cpu_prim_index.0];
@@ -970,24 +1006,24 @@ impl RenderTarget for ColorRenderTarget 
                         };
 
                         debug_assert!(textures.colors[0] != SourceTexture::Invalid);
                         debug_assert!(self.text_run_textures.colors[0] == SourceTexture::Invalid ||
                                       self.text_run_textures.colors[0] == textures.colors[0]);
                         self.text_run_textures = textures;
 
                         let instance = SimplePrimitiveInstance::new(prim_index,
-                                                                    prim_metadata.gpu_prim_index,
+                                                                    prim_address,
                                                                     render_tasks.get_task_index(&task.id, pass_index),
                                                                     RenderTaskIndex(0),
                                                                     PackedLayerIndex(0),
                                                                     0);     // z is disabled for rendering cache primitives
 
-                        for glyph_index in 0..prim_metadata.gpu_data_count {
-                            self.text_run_cache_prims.push(instance.build(prim_metadata.gpu_data_address.0 + glyph_index,
+                        for glyph_index in 0..text.gpu_data_count {
+                            self.text_run_cache_prims.push(instance.build(text.gpu_data_address.0 + glyph_index,
                                                                           text.resource_address.0 + glyph_index));
                         }
                     }
                     _ => {
                         // No other primitives make use of primitive caching yet!
                         unreachable!()
                     }
                 }
@@ -1267,33 +1303,39 @@ pub struct CacheClipInstance {
 // 32 bytes per instance should be enough for anyone!
 #[derive(Debug, Clone)]
 pub struct PrimitiveInstance {
     data: [i32; 8],
 }
 
 struct SimplePrimitiveInstance {
     pub global_prim_index: i32,
+    // TODO(gw): specific_prim_address is encoded as an i32, since
+    //           some primitives use GPU Cache and some still use
+    //           GPU Store. Once everything is converted to use the
+    //           on-demand GPU cache, then we can change this to
+    //           be an ivec2 of u16 - and encode the UV directly
+    //           so that the vertex shader can fetch directly.
     pub specific_prim_address: i32,
     pub task_index: i32,
     pub clip_task_index: i32,
     pub layer_index: i32,
     pub z_sort_index: i32,
 }
 
 impl SimplePrimitiveInstance {
     fn new(prim_index: PrimitiveIndex,
-           specific_prim_address: GpuStoreAddress,
+           specific_prim_address: i32,
            task_index: RenderTaskIndex,
            clip_task_index: RenderTaskIndex,
            layer_index: PackedLayerIndex,
            z_sort_index: i32) -> SimplePrimitiveInstance {
         SimplePrimitiveInstance {
             global_prim_index: prim_index.0 as i32,
-            specific_prim_address: specific_prim_address.0 as i32,
+            specific_prim_address: specific_prim_address,
             task_index: task_index.0 as i32,
             clip_task_index: clip_task_index.0 as i32,
             layer_index: layer_index.0 as i32,
             z_sort_index: z_sort_index,
         }
     }
 
     fn build(&self, data0: i32, data1: i32) -> PrimitiveInstance {
@@ -1597,21 +1639,23 @@ pub struct Frame {
     pub cache_size: DeviceUintSize,
     pub passes: Vec<RenderPass>,
     pub profile_counters: FrameProfileCounters,
 
     pub layer_texture_data: Vec<PackedLayer>,
     pub render_task_data: Vec<RenderTaskData>,
     pub gpu_data16: Vec<GpuBlock16>,
     pub gpu_data32: Vec<GpuBlock32>,
-    pub gpu_data64: Vec<GpuBlock64>,
-    pub gpu_data128: Vec<GpuBlock128>,
     pub gpu_geometry: Vec<PrimitiveGeometry>,
     pub gpu_gradient_data: Vec<GradientData>,
     pub gpu_split_geometry: Vec<SplitGeometry>,
     pub gpu_resource_rects: Vec<TexelRect>,
 
+    // List of updates that need to be pushed to the
+    // gpu resource cache.
+    pub gpu_cache_updates: Option<GpuCacheUpdateList>,
+
     // List of textures that we don't know about yet
     // from the backend thread. The render thread
     // will use a callback to resolve these and
     // patch the data structures.
     pub deferred_resolves: Vec<DeferredResolve>,
 }