Bug 1365934 - Update webrender to 76a3213080ca5c2e2a612c3023c50c81a111fd55. r=jrmuizel,kvark
author Kartikaya Gupta <kgupta@mozilla.com>
Thu, 25 May 2017 13:27:55 -0400
changeset 409349 3d821b8e5cbf1c250c476648b43b146d65087082
parent 409348 1a0568ebe975cafcd2f8e785730a2a13d5ee9a9e
child 409350 8f762279e32cc257acbc851ae2cfeb0020ce8e3b
push id 7391
push user mtabara@mozilla.com
push date Mon, 12 Jun 2017 13:08:53 +0000
treeherder mozilla-beta@2191d7f87e2e [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jrmuizel, kvark
bugs 1365934
milestone 55.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1365934 - Update webrender to 76a3213080ca5c2e2a612c3023c50c81a111fd55. r=jrmuizel,kvark

This includes a change to bindings.rs for an API change to the readback function in webrender cset 052b0a7.

MozReview-Commit-ID: K9eMrF3O6OX
gfx/doc/README.webrender
gfx/webrender/Cargo.toml
gfx/webrender/benches/coalesce.rs
gfx/webrender/res/cs_box_shadow.vs.glsl
gfx/webrender/res/cs_text_run.vs.glsl
gfx/webrender/res/prim_shared.glsl
gfx/webrender/res/ps_angle_gradient.vs.glsl
gfx/webrender/res/ps_blend.vs.glsl
gfx/webrender/res/ps_border_corner.vs.glsl
gfx/webrender/res/ps_border_edge.vs.glsl
gfx/webrender/res/ps_box_shadow.vs.glsl
gfx/webrender/res/ps_cache_image.vs.glsl
gfx/webrender/res/ps_composite.vs.glsl
gfx/webrender/res/ps_gradient.vs.glsl
gfx/webrender/res/ps_hardware_composite.vs.glsl
gfx/webrender/res/ps_image.vs.glsl
gfx/webrender/res/ps_radial_gradient.vs.glsl
gfx/webrender/res/ps_split_composite.vs.glsl
gfx/webrender/res/ps_text_run.vs.glsl
gfx/webrender/res/ps_yuv_image.vs.glsl
gfx/webrender/src/device.rs
gfx/webrender/src/internal_types.rs
gfx/webrender/src/lib.rs
gfx/webrender/src/renderer.rs
gfx/webrender/src/texture_cache.rs
gfx/webrender/src/tiling.rs
gfx/webrender_bindings/src/bindings.rs
--- a/gfx/doc/README.webrender
+++ b/gfx/doc/README.webrender
@@ -74,9 +74,9 @@ there is another crate in m-c called moz
 the same folder to store its rust dependencies. If one of the libraries that is
 required by both mozjs_sys and webrender is updated without updating the other
 project's Cargo.lock file, that results in build bustage.
 This means that any time you do this sort of manual update of packages, you need
 to make sure that mozjs_sys also has its Cargo.lock file updated if needed, hence
 the need to run the cargo update command in js/src as well. Hopefully this will
 be resolved soon.
 
-Latest Commit: 102603520d52f335f152ab74b6bcfdae061b6bc8
+Latest Commit: 76a3213080ca5c2e2a612c3023c50c81a111fd55
--- a/gfx/webrender/Cargo.toml
+++ b/gfx/webrender/Cargo.toml
@@ -29,16 +29,17 @@ rayon = {version = "0.7", features = ["u
 webrender_traits = {path = "../webrender_traits"}
 bitflags = "0.7"
 gamma-lut = "0.2"
 thread_profiler = "0.1.1"
 plane-split = "0.3"
 
 [dev-dependencies]
 angle = {git = "https://github.com/servo/angle", branch = "servo"}
+rand = "0.3"                # for the benchmarks
 servo-glutin = "0.10.1"     # for the example apps
 
 [target.'cfg(any(target_os = "android", all(unix, not(target_os = "macos"))))'.dependencies]
 freetype = { version = "0.2", default-features = false }
 
 [target.'cfg(target_os = "windows")'.dependencies]
 dwrote = "0.3"
 
new file mode 100644
--- /dev/null
+++ b/gfx/webrender/benches/coalesce.rs
@@ -0,0 +1,23 @@
+#![feature(test)]
+
+extern crate rand;
+extern crate test;
+extern crate webrender;
+extern crate webrender_traits;
+
+use rand::Rng;
+use test::Bencher;
+use webrender::TexturePage;
+use webrender_traits::{DeviceUintSize as Size};
+
+#[bench]
+fn bench_coalesce(b: &mut Bencher) {
+    let mut rng = rand::thread_rng();
+    let mut page = TexturePage::new_dummy(Size::new(10000, 10000));
+    let mut test_page = TexturePage::new_dummy(Size::new(10000, 10000));
+    while page.allocate(&Size::new(rng.gen_range(1, 100), rng.gen_range(1, 100))).is_some() {}
+    b.iter(|| {
+        test_page.fill_from(&page);
+        test_page.coalesce();
+    });
+}
--- a/gfx/webrender/res/cs_box_shadow.vs.glsl
+++ b/gfx/webrender/res/cs_box_shadow.vs.glsl
@@ -1,17 +1,17 @@
 #line 1
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 void main(void) {
-    CachePrimitiveInstance cpi = fetch_cache_instance();
-    RenderTaskData task = fetch_render_task(cpi.render_task_index);
-    BoxShadow bs = fetch_boxshadow(cpi.specific_prim_index);
+    PrimitiveInstance pi = fetch_prim_instance();
+    RenderTaskData task = fetch_render_task(pi.render_task_index);
+    BoxShadow bs = fetch_boxshadow(pi.specific_prim_address);
 
     vec2 p0 = task.data0.xy;
     vec2 p1 = p0 + task.data0.zw;
 
     vec2 pos = mix(p0, p1, aPosition.xy);
 
     vBorderRadii = bs.border_radius_edge_size_blur_radius_inverted.xx;
     vBlurRadius = bs.border_radius_edge_size_blur_radius_inverted.z;
--- a/gfx/webrender/res/cs_text_run.vs.glsl
+++ b/gfx/webrender/res/cs_text_run.vs.glsl
@@ -3,22 +3,22 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 // Draw a text run to a cache target. These are always
 // drawn un-transformed. These are used for effects such
 // as text-shadow.
 
 void main(void) {
-    CachePrimitiveInstance cpi = fetch_cache_instance();
-    RenderTaskData task = fetch_render_task(cpi.render_task_index);
-    TextRun text = fetch_text_run(cpi.specific_prim_index);
-    Glyph glyph = fetch_glyph(cpi.sub_index);
-    PrimitiveGeometry pg = fetch_prim_geometry(cpi.global_prim_index);
-    ResourceRect res = fetch_resource_rect(cpi.user_data.x);
+    PrimitiveInstance pi = fetch_prim_instance();
+    RenderTaskData task = fetch_render_task(pi.render_task_index);
+    TextRun text = fetch_text_run(pi.specific_prim_address);
+    Glyph glyph = fetch_glyph(pi.user_data0);
+    PrimitiveGeometry pg = fetch_prim_geometry(pi.global_prim_index);
+    ResourceRect res = fetch_resource_rect(pi.user_data1);
 
     // Glyphs size is already in device-pixels.
     // The render task origin is in device-pixels. Offset that by
     // the glyph offset, relative to its primitive bounding rect.
     vec2 size = res.uv_rect.zw - res.uv_rect.xy;
     vec2 origin = task.data0.xy + uDevicePixelRatio * (glyph.offset.xy - pg.local_rect.p0);
     vec4 local_rect = vec4(origin, size);
 
--- a/gfx/webrender/res/prim_shared.glsl
+++ b/gfx/webrender/res/prim_shared.glsl
@@ -112,40 +112,34 @@ float distance_to_line(vec2 p0, vec2 per
 flat varying vec4 vClipMaskUvBounds;
 varying vec3 vClipMaskUv;
 #ifdef WR_FEATURE_TRANSFORM
     flat varying vec4 vLocalBounds;
 #endif
 
 #ifdef WR_VERTEX_SHADER
 
-#define VECS_PER_LAYER             13
+#define VECS_PER_LAYER              9
 #define VECS_PER_RENDER_TASK        3
 #define VECS_PER_PRIM_GEOM          2
 #define VECS_PER_SPLIT_GEOM         3
 
 uniform sampler2D sLayers;
 uniform sampler2D sRenderTasks;
 uniform sampler2D sPrimGeometry;
 
 uniform sampler2D sData16;
 uniform sampler2D sData32;
 uniform sampler2D sData64;
 uniform sampler2D sData128;
 uniform sampler2D sResourceRects;
 
 // Instanced attributes
-in int aGlobalPrimId;
-in int aPrimitiveAddress;
-in int aTaskIndex;
-in int aClipTaskIndex;
-in int aLayerIndex;
-in int aElementIndex;
-in ivec2 aUserData;
-in int aZIndex;
+in ivec4 aData0;
+in ivec4 aData1;
 
 // get_fetch_uv is a macro to work around a macOS Intel driver parsing bug.
 // TODO: convert back to a function once the driver issues are resolved, if ever.
 // https://github.com/servo/webrender/pull/623
 // https://github.com/servo/servo/issues/13953
 #define get_fetch_uv(i, vpi)  ivec2(vpi * (i % (WR_MAX_VERTEX_TEXTURE_WIDTH/vpi)), i / (WR_MAX_VERTEX_TEXTURE_WIDTH/vpi))
 
 vec4 fetch_data_1(int index) {
@@ -185,17 +179,16 @@ vec4[8] fetch_data_8(int index) {
     );
 }
 
 
 struct Layer {
     mat4 transform;
     mat4 inv_transform;
     RectWithSize local_clip_rect;
-    vec4 screen_vertices[4];
 };
 
 Layer fetch_layer(int index) {
     Layer layer;
 
     // Create a UV base coord for each 8 texels.
     // This is required because trying to use an offset
     // of more than 8 texels doesn't work on some versions
@@ -212,21 +205,16 @@ Layer fetch_layer(int index) {
     layer.inv_transform[0] = texelFetchOffset(sLayers, uv0, 0, ivec2(4, 0));
     layer.inv_transform[1] = texelFetchOffset(sLayers, uv0, 0, ivec2(5, 0));
     layer.inv_transform[2] = texelFetchOffset(sLayers, uv0, 0, ivec2(6, 0));
     layer.inv_transform[3] = texelFetchOffset(sLayers, uv0, 0, ivec2(7, 0));
 
     vec4 clip_rect = texelFetchOffset(sLayers, uv1, 0, ivec2(0, 0));
     layer.local_clip_rect = RectWithSize(clip_rect.xy, clip_rect.zw);
 
-    layer.screen_vertices[0] = texelFetchOffset(sLayers, uv1, 0, ivec2(1, 0));
-    layer.screen_vertices[1] = texelFetchOffset(sLayers, uv1, 0, ivec2(2, 0));
-    layer.screen_vertices[2] = texelFetchOffset(sLayers, uv1, 0, ivec2(3, 0));
-    layer.screen_vertices[3] = texelFetchOffset(sLayers, uv1, 0, ivec2(4, 0));
-
     return layer;
 }
 
 struct RenderTaskData {
     vec4 data0;
     vec4 data1;
     vec4 data2;
 };
@@ -442,90 +430,89 @@ PrimitiveGeometry fetch_prim_geometry(in
     vec4 local_clip_rect = texelFetchOffset(sPrimGeometry, uv, 0, ivec2(1, 0));
     pg.local_clip_rect = RectWithSize(local_clip_rect.xy, local_clip_rect.zw);
 
     return pg;
 }
 
 struct PrimitiveInstance {
     int global_prim_index;
-    int specific_prim_index;
+    int specific_prim_address;
     int render_task_index;
     int clip_task_index;
     int layer_index;
-    int sub_index;
     int z;
-    ivec2 user_data;
+    int user_data0;
+    int user_data1;
 };
 
 PrimitiveInstance fetch_prim_instance() {
     PrimitiveInstance pi;
 
-    pi.global_prim_index = aGlobalPrimId;
-    pi.specific_prim_index = aPrimitiveAddress;
-    pi.render_task_index = aTaskIndex;
-    pi.clip_task_index = aClipTaskIndex;
-    pi.layer_index = aLayerIndex;
-    pi.sub_index = aElementIndex;
-    pi.user_data = aUserData;
-    pi.z = aZIndex;
+    pi.global_prim_index = aData0.x;
+    pi.specific_prim_address = aData0.y;
+    pi.render_task_index = aData0.z;
+    pi.clip_task_index = aData0.w;
+    pi.layer_index = aData1.x;
+    pi.z = aData1.y;
+    pi.user_data0 = aData1.z;
+    pi.user_data1 = aData1.w;
 
     return pi;
 }
 
-struct CachePrimitiveInstance {
-    int global_prim_index;
-    int specific_prim_index;
+struct CompositeInstance {
     int render_task_index;
-    int sub_index;
-    ivec2 user_data;
+    int src_task_index;
+    int backdrop_task_index;
+    int user_data0;
+    int user_data1;
+    float z;
 };
 
-CachePrimitiveInstance fetch_cache_instance() {
-    CachePrimitiveInstance cpi;
-
-    PrimitiveInstance pi = fetch_prim_instance();
+CompositeInstance fetch_composite_instance() {
+    CompositeInstance ci;
 
-    cpi.global_prim_index = pi.global_prim_index;
-    cpi.specific_prim_index = pi.specific_prim_index;
-    cpi.render_task_index = pi.render_task_index;
-    cpi.sub_index = pi.sub_index;
-    cpi.user_data = pi.user_data;
+    ci.render_task_index = aData0.x;
+    ci.src_task_index = aData0.y;
+    ci.backdrop_task_index = aData0.z;
+    ci.z = float(aData0.w);
 
-    return cpi;
+    ci.user_data0 = aData1.x;
+    ci.user_data1 = aData1.y;
+
+    return ci;
 }
 
 struct Primitive {
     Layer layer;
     ClipArea clip_area;
     AlphaBatchTask task;
     RectWithSize local_rect;
     RectWithSize local_clip_rect;
     int prim_index;
-    // when sending multiple primitives of the same type (e.g. border segments)
-    // this index allows the vertex shader to recognize the difference
-    int sub_index;
-    ivec2 user_data;
+    int user_data0;
+    int user_data1;
     float z;
 };
 
 Primitive load_primitive_custom(PrimitiveInstance pi) {
     Primitive prim;
 
     prim.layer = fetch_layer(pi.layer_index);
     prim.clip_area = fetch_clip_area(pi.clip_task_index);
     prim.task = fetch_alpha_batch_task(pi.render_task_index);
 
     PrimitiveGeometry pg = fetch_prim_geometry(pi.global_prim_index);
     prim.local_rect = pg.local_rect;
     prim.local_clip_rect = pg.local_clip_rect;
 
-    prim.prim_index = pi.specific_prim_index;
-    prim.sub_index = pi.sub_index;
-    prim.user_data = pi.user_data;
+    prim.prim_index = pi.specific_prim_address;
+    prim.user_data0 = pi.user_data0;
+    prim.user_data1 = pi.user_data1;
     prim.z = float(pi.z);
 
     return prim;
 }
 
 Primitive load_primitive() {
     PrimitiveInstance pi = fetch_prim_instance();
 
@@ -534,17 +521,17 @@ Primitive load_primitive() {
 
 
 // Return the intersection of the plane (set up by "normal" and "point")
 // with the ray (set up by "ray_origin" and "ray_dir"),
 // writing the resulting scaler into "t".
 bool ray_plane(vec3 normal, vec3 point, vec3 ray_origin, vec3 ray_dir, out float t)
 {
     float denom = dot(normal, ray_dir);
-    if (denom > 1e-6) {
+    if (abs(denom) > 1e-6) {
         vec3 d = point - ray_origin;
         t = dot(d, normal) / denom;
         return t >= 0.0;
     }
 
     return false;
 }
 
@@ -563,22 +550,21 @@ vec4 untransform(vec2 ref, vec3 n, vec3 
     float z = p.z + d.z * t; // Z of the visible point on the layer
 
     vec4 r = inv_transform * vec4(ref, z, 1.0);
     return r;
 }
 
 // Given a CSS space position, transform it back into the layer space.
 vec4 get_layer_pos(vec2 pos, Layer layer) {
-    // get 3 of the layer corners in CSS space
-    vec3 a = layer.screen_vertices[0].xyz / layer.screen_vertices[0].w;
-    vec3 b = layer.screen_vertices[3].xyz / layer.screen_vertices[3].w;
-    vec3 c = layer.screen_vertices[2].xyz / layer.screen_vertices[2].w;
+    // get a point on the layer plane
+    vec4 ah = layer.transform * vec4(0.0, 0.0, 0.0, 1.0);
+    vec3 a = ah.xyz / ah.w;
     // get the normal to the layer plane
-    vec3 n = normalize(cross(b-a, c-a));
+    vec3 n = transpose(mat3(layer.inv_transform)) * vec3(0.0, 0.0, 1.0);
     return untransform(pos, n, a, layer.inv_transform);
 }
 
 struct VertexInfo {
     vec2 local_pos;
     vec2 screen_pos;
 };
 
--- a/gfx/webrender/res/ps_angle_gradient.vs.glsl
+++ b/gfx/webrender/res/ps_angle_gradient.vs.glsl
@@ -22,16 +22,16 @@ void main(void) {
 
     vStartPoint = start_point;
     vScaledDir = dir / dot(dir, dir);
 
     vTileSize = gradient.tile_size_repeat.xy;
     vTileRepeat = gradient.tile_size_repeat.zw;
 
     // V coordinate of gradient row in lookup texture.
-    vGradientIndex = float(prim.sub_index);
+    vGradientIndex = float(prim.user_data0);
 
     // The texture size of the lookup texture
     vGradientTextureSize = vec2(textureSize(sGradients, 0));
 
     // Whether to repeat the gradient instead of clamping.
     vGradientRepeat = float(int(gradient.extend_mode.x) == EXTEND_MODE_REPEAT);
 }
--- a/gfx/webrender/res/ps_blend.vs.glsl
+++ b/gfx/webrender/res/ps_blend.vs.glsl
@@ -1,17 +1,17 @@
 #line 1
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 void main(void) {
-    PrimitiveInstance pi = fetch_prim_instance();
-    AlphaBatchTask dest_task = fetch_alpha_batch_task(pi.render_task_index);
-    AlphaBatchTask src_task = fetch_alpha_batch_task(pi.user_data.x);
+    CompositeInstance ci = fetch_composite_instance();
+    AlphaBatchTask dest_task = fetch_alpha_batch_task(ci.render_task_index);
+    AlphaBatchTask src_task = fetch_alpha_batch_task(ci.src_task_index);
 
     vec2 dest_origin = dest_task.render_target_origin -
                        dest_task.screen_space_origin +
                        src_task.screen_space_origin;
 
     vec2 local_pos = mix(dest_origin,
                          dest_origin + src_task.size,
                          aPosition.xy);
@@ -19,13 +19,13 @@ void main(void) {
     vec2 texture_size = vec2(textureSize(sCacheRGBA8, 0));
     vec2 st0 = src_task.render_target_origin;
     vec2 st1 = src_task.render_target_origin + src_task.size;
 
     vec2 uv = src_task.render_target_origin + aPosition.xy * src_task.size;
     vUv = vec3(uv / texture_size, src_task.render_target_layer_index);
     vUvBounds = vec4(st0 + 0.5, st1 - 0.5) / texture_size.xyxy;
 
-    vOp = pi.sub_index;
-    vAmount = float(pi.user_data.y) / 65535.0;
+    vOp = ci.user_data0;
+    vAmount = float(ci.user_data1) / 65535.0;
 
-    gl_Position = uTransform * vec4(local_pos, pi.z, 1.0);
+    gl_Position = uTransform * vec4(local_pos, ci.z, 1.0);
 }
--- a/gfx/webrender/res/ps_border_corner.vs.glsl
+++ b/gfx/webrender/res/ps_border_corner.vs.glsl
@@ -86,38 +86,40 @@ void write_color(vec4 color0, vec4 color
     vColor11 = vec4(color1.rgb * modulate.w, color1.a);
 }
 
 int select_style(int color_select, vec2 fstyle) {
     ivec2 style = ivec2(fstyle);
 
     switch (color_select) {
         case SIDE_BOTH:
+        {
             // TODO(gw): A temporary hack! While we don't support
             //           border corners that have dots or dashes
             //           with another style, pretend they are solid
             //           border corners.
             bool has_dots = style.x == BORDER_STYLE_DOTTED ||
                             style.y == BORDER_STYLE_DOTTED;
             bool has_dashes = style.x == BORDER_STYLE_DASHED ||
                               style.y == BORDER_STYLE_DASHED;
             if (style.x != style.y && (has_dots || has_dashes))
                 return BORDER_STYLE_SOLID;
             return style.x;
+        }
         case SIDE_FIRST:
             return style.x;
         case SIDE_SECOND:
             return style.y;
     }
 }
 
 void main(void) {
     Primitive prim = load_primitive();
     Border border = fetch_border(prim.prim_index);
-    int sub_part = prim.sub_index;
+    int sub_part = prim.user_data0;
     BorderCorners corners = get_border_corners(border, prim.local_rect);
 
     vec2 p0, p1;
 
     // TODO(gw): We'll need to pass through multiple styles
     //           once we support style transitions per corner.
     int style;
     vec4 edge_distances;
@@ -130,17 +132,17 @@ void main(void) {
     switch (sub_part) {
         case 0: {
             p0 = corners.tl_outer;
             p1 = corners.tl_inner;
             color0 = border.colors[0];
             color1 = border.colors[1];
             vClipCenter = corners.tl_outer + border.radii[0].xy;
             vClipSign = vec2(1.0);
-            style = select_style(prim.user_data.x, border.style.yx);
+            style = select_style(prim.user_data1, border.style.yx);
             vec4 adjusted_widths = get_effective_border_widths(border, style);
             vec4 inv_adjusted_widths = border.widths - adjusted_widths;
             set_radii(style,
                       border.radii[0].xy,
                       border.widths.xy,
                       adjusted_widths.xy);
             set_edge_line(border.widths.xy,
                           corners.tl_outer,
@@ -152,17 +154,17 @@ void main(void) {
         }
         case 1: {
             p0 = vec2(corners.tr_inner.x, corners.tr_outer.y);
             p1 = vec2(corners.tr_outer.x, corners.tr_inner.y);
             color0 = border.colors[1];
             color1 = border.colors[2];
             vClipCenter = corners.tr_outer + vec2(-border.radii[0].z, border.radii[0].w);
             vClipSign = vec2(-1.0, 1.0);
-            style = select_style(prim.user_data.x, border.style.zy);
+            style = select_style(prim.user_data1, border.style.zy);
             vec4 adjusted_widths = get_effective_border_widths(border, style);
             vec4 inv_adjusted_widths = border.widths - adjusted_widths;
             set_radii(style,
                       border.radii[0].zw,
                       border.widths.zy,
                       adjusted_widths.zy);
             set_edge_line(border.widths.zy,
                           corners.tr_outer,
@@ -176,17 +178,17 @@ void main(void) {
         }
         case 2: {
             p0 = corners.br_inner;
             p1 = corners.br_outer;
             color0 = border.colors[2];
             color1 = border.colors[3];
             vClipCenter = corners.br_outer - border.radii[1].xy;
             vClipSign = vec2(-1.0, -1.0);
-            style = select_style(prim.user_data.x, border.style.wz);
+            style = select_style(prim.user_data1, border.style.wz);
             vec4 adjusted_widths = get_effective_border_widths(border, style);
             vec4 inv_adjusted_widths = border.widths - adjusted_widths;
             set_radii(style,
                       border.radii[1].xy,
                       border.widths.zw,
                       adjusted_widths.zw);
             set_edge_line(border.widths.zw,
                           corners.br_outer,
@@ -200,17 +202,17 @@ void main(void) {
         }
         case 3: {
             p0 = vec2(corners.bl_outer.x, corners.bl_inner.y);
             p1 = vec2(corners.bl_inner.x, corners.bl_outer.y);
             color0 = border.colors[3];
             color1 = border.colors[0];
             vClipCenter = corners.bl_outer + vec2(border.radii[1].z, -border.radii[1].w);
             vClipSign = vec2(1.0, -1.0);
-            style = select_style(prim.user_data.x, border.style.xw);
+            style = select_style(prim.user_data1, border.style.xw);
             vec4 adjusted_widths = get_effective_border_widths(border, style);
             vec4 inv_adjusted_widths = border.widths - adjusted_widths;
             set_radii(style,
                       border.radii[1].zw,
                       border.widths.xw,
                       adjusted_widths.xw);
             set_edge_line(border.widths.xw,
                           corners.bl_outer,
@@ -248,17 +250,17 @@ void main(void) {
         default: {
             vEdgeDistance = vec4(0.0);
             vAlphaSelect = 1.0;
             vSDFSelect = 0.0;
             break;
         }
     }
 
-    write_color(color0, color1, style, color_delta, prim.user_data.x);
+    write_color(color0, color1, style, color_delta, prim.user_data1);
 
     RectWithSize segment_rect;
     segment_rect.p0 = p0;
     segment_rect.size = p1 - p0;
 
 #ifdef WR_FEATURE_TRANSFORM
     TransformVertexInfo vi = write_transform_vertex(segment_rect,
                                                     prim.local_clip_rect,
--- a/gfx/webrender/res/ps_border_edge.vs.glsl
+++ b/gfx/webrender/res/ps_border_edge.vs.glsl
@@ -37,21 +37,25 @@ void write_alpha_select(float style) {
     }
 }
 
 void write_color(vec4 color, float style, bool flip) {
     vec2 modulate;
 
     switch (int(style)) {
         case BORDER_STYLE_GROOVE:
+        {
             modulate = flip ? vec2(1.3, 0.7) : vec2(0.7, 1.3);
             break;
+        }
         case BORDER_STYLE_RIDGE:
+        {
             modulate = flip ? vec2(0.7, 1.3) : vec2(1.3, 0.7);
             break;
+        }
         default:
             modulate = vec2(1.0);
             break;
     }
 
     vColor0 = vec4(color.rgb * modulate.x, color.a);
     vColor1 = vec4(color.rgb * modulate.y, color.a);
 }
@@ -96,17 +100,17 @@ void write_clip_params(float style,
             vClipSelect = 0.0;
             break;
     }
 }
 
 void main(void) {
     Primitive prim = load_primitive();
     Border border = fetch_border(prim.prim_index);
-    int sub_part = prim.sub_index;
+    int sub_part = prim.user_data0;
     BorderCorners corners = get_border_corners(border, prim.local_rect);
     vec4 color = border.colors[sub_part];
 
     // TODO(gw): Now that all border styles are supported, the switch
     //           statement below can be tidied up quite a bit.
 
     float style;
     bool color_flip;
--- a/gfx/webrender/res/ps_box_shadow.vs.glsl
+++ b/gfx/webrender/res/ps_box_shadow.vs.glsl
@@ -1,26 +1,26 @@
 #line 1
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 void main(void) {
     Primitive prim = load_primitive();
     BoxShadow bs = fetch_boxshadow(prim.prim_index);
-    RectWithSize segment_rect = fetch_instance_geometry(prim.sub_index);
+    RectWithSize segment_rect = fetch_instance_geometry(prim.user_data0);
 
     VertexInfo vi = write_vertex(segment_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
                                  prim.local_rect.p0);
 
-    RenderTaskData child_task = fetch_render_task(prim.user_data.x);
+    RenderTaskData child_task = fetch_render_task(prim.user_data1);
     vUv.z = child_task.data1.x;
 
     // Constant offsets to inset from bilinear filtering border.
     vec2 patch_origin = child_task.data0.xy + vec2(1.0);
     vec2 patch_size_device_pixels = child_task.data0.zw - vec2(2.0);
     vec2 patch_size = patch_size_device_pixels / uDevicePixelRatio;
 
     vUv.xy = (vi.local_pos - prim.local_rect.p0) / patch_size;
--- a/gfx/webrender/res/ps_cache_image.vs.glsl
+++ b/gfx/webrender/res/ps_cache_image.vs.glsl
@@ -11,17 +11,17 @@ void main(void) {
 
     VertexInfo vi = write_vertex(prim.local_rect,
                                  prim.local_clip_rect,
                                  prim.z,
                                  prim.layer,
                                  prim.task,
                                  prim.local_rect.p0);
 
-    RenderTaskData child_task = fetch_render_task(prim.user_data.x);
+    RenderTaskData child_task = fetch_render_task(prim.user_data1);
     vUv.z = child_task.data1.x;
 
     vec2 texture_size = vec2(textureSize(sCacheRGBA8, 0));
     vec2 uv0 = child_task.data0.xy / texture_size;
     vec2 uv1 = (child_task.data0.xy + child_task.data0.zw) / texture_size;
 
     vec2 f = (vi.local_pos - prim.local_rect.p0) / prim.local_rect.size;
 
--- a/gfx/webrender/res/ps_composite.vs.glsl
+++ b/gfx/webrender/res/ps_composite.vs.glsl
@@ -1,18 +1,18 @@
 #line 1
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 void main(void) {
-    PrimitiveInstance pi = fetch_prim_instance();
-    AlphaBatchTask dest_task = fetch_alpha_batch_task(pi.render_task_index);
-    ReadbackTask backdrop_task = fetch_readback_task(pi.user_data.x);
-    AlphaBatchTask src_task = fetch_alpha_batch_task(pi.user_data.y);
+    CompositeInstance ci = fetch_composite_instance();
+    AlphaBatchTask dest_task = fetch_alpha_batch_task(ci.render_task_index);
+    ReadbackTask backdrop_task = fetch_readback_task(ci.backdrop_task_index);
+    AlphaBatchTask src_task = fetch_alpha_batch_task(ci.src_task_index);
 
     vec2 dest_origin = dest_task.render_target_origin -
                        dest_task.screen_space_origin +
                        src_task.screen_space_origin;
 
     vec2 local_pos = mix(dest_origin,
                          dest_origin + src_task.size,
                          aPosition.xy);
@@ -22,13 +22,12 @@ void main(void) {
     vec2 st0 = backdrop_task.render_target_origin / texture_size;
     vec2 st1 = (backdrop_task.render_target_origin + backdrop_task.size) / texture_size;
     vUv0 = vec3(mix(st0, st1, aPosition.xy), backdrop_task.render_target_layer_index);
 
     st0 = src_task.render_target_origin / texture_size;
     st1 = (src_task.render_target_origin + src_task.size) / texture_size;
     vUv1 = vec3(mix(st0, st1, aPosition.xy), src_task.render_target_layer_index);
 
-    vOp = pi.sub_index;
+    vOp = ci.user_data0;
 
-    gl_Position = uTransform * vec4(local_pos, pi.z, 1.0);
-
+    gl_Position = uTransform * vec4(local_pos, ci.z, 1.0);
 }
--- a/gfx/webrender/res/ps_gradient.vs.glsl
+++ b/gfx/webrender/res/ps_gradient.vs.glsl
@@ -4,18 +4,18 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 void main(void) {
     Primitive prim = load_primitive();
     Gradient gradient = fetch_gradient(prim.prim_index);
 
     vec4 abs_start_end_point = gradient.start_end_point + prim.local_rect.p0.xyxy;
 
-    GradientStop g0 = fetch_gradient_stop(prim.sub_index + 0);
-    GradientStop g1 = fetch_gradient_stop(prim.sub_index + 1);
+    GradientStop g0 = fetch_gradient_stop(prim.user_data0 + 0);
+    GradientStop g1 = fetch_gradient_stop(prim.user_data0 + 1);
 
     RectWithSize segment_rect;
     vec2 axis;
     vec4 adjusted_color_g0 = g0.color;
     vec4 adjusted_color_g1 = g1.color;
     if (abs_start_end_point.y == abs_start_end_point.w) {
         // Calculate the x coord of the gradient stops
         vec2 g01_x = mix(abs_start_end_point.xx, abs_start_end_point.zz,
--- a/gfx/webrender/res/ps_hardware_composite.vs.glsl
+++ b/gfx/webrender/res/ps_hardware_composite.vs.glsl
@@ -1,25 +1,25 @@
 #line 1
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 void main(void) {
-    PrimitiveInstance pi = fetch_prim_instance();
-    AlphaBatchTask dest_task = fetch_alpha_batch_task(pi.render_task_index);
-    AlphaBatchTask src_task = fetch_alpha_batch_task(pi.user_data.x);
+    CompositeInstance ci = fetch_composite_instance();
+    AlphaBatchTask dest_task = fetch_alpha_batch_task(ci.render_task_index);
+    AlphaBatchTask src_task = fetch_alpha_batch_task(ci.src_task_index);
 
     vec2 dest_origin = dest_task.render_target_origin -
                        dest_task.screen_space_origin +
                        src_task.screen_space_origin;
 
     vec2 local_pos = mix(dest_origin,
                          dest_origin + src_task.size,
                          aPosition.xy);
 
     vec2 texture_size = vec2(textureSize(sCacheRGBA8, 0));
     vec2 st0 = src_task.render_target_origin / texture_size;
     vec2 st1 = (src_task.render_target_origin + src_task.size) / texture_size;
     vUv = vec3(mix(st0, st1, aPosition.xy), src_task.render_target_layer_index);
 
-    gl_Position = uTransform * vec4(local_pos, pi.z, 1.0);
+    gl_Position = uTransform * vec4(local_pos, ci.z, 1.0);
 }
--- a/gfx/webrender/res/ps_image.vs.glsl
+++ b/gfx/webrender/res/ps_image.vs.glsl
@@ -1,17 +1,17 @@
 #line 1
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 void main(void) {
     Primitive prim = load_primitive();
     Image image = fetch_image(prim.prim_index);
-    ResourceRect res = fetch_resource_rect(prim.user_data.x);
+    ResourceRect res = fetch_resource_rect(prim.user_data0);
 
 #ifdef WR_FEATURE_TRANSFORM
     TransformVertexInfo vi = write_transform_vertex(prim.local_rect,
                                                     prim.local_clip_rect,
                                                     prim.z,
                                                     prim.layer,
                                                     prim.task,
                                                     prim.local_rect.p0);
--- a/gfx/webrender/res/ps_radial_gradient.vs.glsl
+++ b/gfx/webrender/res/ps_radial_gradient.vs.glsl
@@ -30,16 +30,16 @@ void main(void) {
     float ratio_xy = gradient.start_end_radius_ratio_xy_extend_mode.z;
     vPos.y *= ratio_xy;
     vStartCenter.y *= ratio_xy;
     vEndCenter.y *= ratio_xy;
     vTileSize.y *= ratio_xy;
     vTileRepeat.y *= ratio_xy;
 
     // V coordinate of gradient row in lookup texture.
-    vGradientIndex = float(prim.sub_index);
+    vGradientIndex = float(prim.user_data0);
 
     // The texture size of the lookup texture
     vGradientTextureSize = vec2(textureSize(sGradients, 0));
 
     // Whether to repeat the gradient instead of clamping.
     vGradientRepeat = float(int(gradient.start_end_radius_ratio_xy_extend_mode.w) == EXTEND_MODE_REPEAT);
 }
--- a/gfx/webrender/res/ps_split_composite.vs.glsl
+++ b/gfx/webrender/res/ps_split_composite.vs.glsl
@@ -26,24 +26,24 @@ SplitGeometry fetch_split_geometry(int i
 
 vec3 bilerp(vec3 a, vec3 b, vec3 c, vec3 d, float s, float t) {
     vec3 x = mix(a, b, t);
     vec3 y = mix(c, d, t);
     return mix(x, y, s);
 }
 
 void main(void) {
-    PrimitiveInstance pi = fetch_prim_instance();
-    SplitGeometry geometry = fetch_split_geometry(pi.specific_prim_index);
-    AlphaBatchTask src_task = fetch_alpha_batch_task(pi.user_data.x);
+    CompositeInstance ci = fetch_composite_instance();
+    SplitGeometry geometry = fetch_split_geometry(ci.user_data0);
+    AlphaBatchTask src_task = fetch_alpha_batch_task(ci.src_task_index);
 
     vec3 world_pos = bilerp(geometry.points[0], geometry.points[1],
                             geometry.points[3], geometry.points[2],
                             aPosition.y, aPosition.x);
-    vec4 final_pos = vec4(world_pos.xy * uDevicePixelRatio, pi.z, 1.0);
+    vec4 final_pos = vec4(world_pos.xy * uDevicePixelRatio, ci.z, 1.0);
 
     gl_Position = uTransform * final_pos;
 
     vec2 uv_origin = src_task.render_target_origin;
     vec2 uv_pos = uv_origin + world_pos.xy - src_task.screen_space_origin;
     vec2 texture_size = vec2(textureSize(sCacheRGBA8, 0));
     vUv = vec3(uv_pos / texture_size, src_task.render_target_layer_index);
     vUvTaskBounds = vec4(uv_origin, uv_origin + src_task.size) / texture_size.xyxy;
--- a/gfx/webrender/res/ps_text_run.vs.glsl
+++ b/gfx/webrender/res/ps_text_run.vs.glsl
@@ -1,18 +1,18 @@
 #line 1
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 void main(void) {
     Primitive prim = load_primitive();
     TextRun text = fetch_text_run(prim.prim_index);
-    Glyph glyph = fetch_glyph(prim.sub_index);
-    ResourceRect res = fetch_resource_rect(prim.user_data.x);
+    Glyph glyph = fetch_glyph(prim.user_data0);
+    ResourceRect res = fetch_resource_rect(prim.user_data1);
 
     RectWithSize local_rect = RectWithSize(glyph.offset.xy,
                                            (res.uv_rect.zw - res.uv_rect.xy) / uDevicePixelRatio);
 
 #ifdef WR_FEATURE_TRANSFORM
     TransformVertexInfo vi = write_transform_vertex(local_rect,
                                                     prim.local_clip_rect,
                                                     prim.z,
--- a/gfx/webrender/res/ps_yuv_image.vs.glsl
+++ b/gfx/webrender/res/ps_yuv_image.vs.glsl
@@ -20,21 +20,21 @@ void main(void) {
                                  prim.layer,
                                  prim.task,
                                  prim.local_rect.p0);
     vLocalPos = vi.local_pos - prim.local_rect.p0;
 #endif
 
     write_clip(vi.screen_pos, prim.clip_area);
 
-    ResourceRect y_rect = fetch_resource_rect(prim.user_data.x);
+    ResourceRect y_rect = fetch_resource_rect(prim.user_data0);
 #ifndef WR_FEATURE_INTERLEAVED_Y_CB_CR  // only 1 channel
-    ResourceRect u_rect = fetch_resource_rect(prim.user_data.x + 1);
+    ResourceRect u_rect = fetch_resource_rect(prim.user_data0 + 1);
 #ifndef WR_FEATURE_NV12 // 2 channel
-    ResourceRect v_rect = fetch_resource_rect(prim.user_data.x + 2);
+    ResourceRect v_rect = fetch_resource_rect(prim.user_data0 + 2);
 #endif
 #endif
 
     // If this is in WR_FEATURE_TEXTURE_RECT mode, the rect and size use
     // non-normalized texture coordinates.
 #ifdef WR_FEATURE_TEXTURE_RECT
     vec2 y_texture_size_normalization_factor = vec2(1, 1);
 #else
--- a/gfx/webrender/src/device.rs
+++ b/gfx/webrender/src/device.rs
@@ -1,17 +1,17 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use euclid::Matrix4D;
 use fnv::FnvHasher;
 use gleam::gl;
 use internal_types::{PackedVertex, RenderTargetMode, TextureSampler, DEFAULT_TEXTURE};
-use internal_types::{BlurAttribute, ClearAttribute, ClipAttribute, VertexAttribute};
+use internal_types::{BlurAttribute, ClipAttribute, VertexAttribute};
 use internal_types::{DebugFontVertex, DebugColorVertex};
 //use notify::{self, Watcher};
 use super::shader_source;
 use std::collections::HashMap;
 use std::fs::File;
 use std::hash::BuildHasherDefault;
 use std::io::Read;
 use std::iter::repeat;
@@ -71,30 +71,28 @@ impl TextureTarget {
 pub enum TextureFilter {
     Nearest,
     Linear,
 }
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum VertexFormat {
     Triangles,
-    Rectangles,
     DebugFont,
     DebugColor,
-    Clear,
     Blur,
     Clip,
 }
 
 enum FBOTarget {
     Read,
     Draw,
 }
 
-fn get_gl_format_bgra(gl: &gl::Gl) -> gl::GLuint {
+pub fn get_gl_format_bgra(gl: &gl::Gl) -> gl::GLuint {
     match gl.get_type() {
         gl::GlType::Gl => {
             GL_FORMAT_BGRA_GL
         }
         gl::GlType::Gles => {
             GL_FORMAT_BGRA_GLES
         }
     }
@@ -185,79 +183,43 @@ impl VertexFormat {
                                           0 + vertex_stride * offset);
                 gl.vertex_attrib_pointer(VertexAttribute::Color as gl::GLuint,
                                           4,
                                           gl::UNSIGNED_BYTE,
                                           true,
                                           vertex_stride as gl::GLint,
                                           8 + vertex_stride * offset);
             }
-            VertexFormat::Rectangles |
             VertexFormat::Triangles => {
                 let vertex_stride = mem::size_of::<PackedVertex>() as gl::GLuint;
                 gl.enable_vertex_attrib_array(VertexAttribute::Position as gl::GLuint);
                 gl.vertex_attrib_divisor(VertexAttribute::Position as gl::GLuint, 0);
 
                 gl.vertex_attrib_pointer(VertexAttribute::Position as gl::GLuint,
                                           2,
                                           gl::FLOAT,
                                           false,
                                           vertex_stride as gl::GLint,
                                           0);
 
                 instance.bind(gl);
                 let mut offset = 0;
 
-                for &attrib in [VertexAttribute::GlobalPrimId,
-                                VertexAttribute::PrimitiveAddress,
-                                VertexAttribute::TaskIndex,
-                                VertexAttribute::ClipTaskIndex,
-                                VertexAttribute::LayerIndex,
-                                VertexAttribute::ElementIndex,
-                                VertexAttribute::ZIndex,
+                for &attrib in [VertexAttribute::Data0,
+                                VertexAttribute::Data1,
                                ].into_iter() {
                     gl.enable_vertex_attrib_array(attrib as gl::GLuint);
                     gl.vertex_attrib_divisor(attrib as gl::GLuint, 1);
                     gl.vertex_attrib_i_pointer(attrib as gl::GLuint,
-                                                1,
+                                                4,
                                                 gl::INT,
                                                 instance_stride,
                                                 offset);
-                    offset += 4;
+                    offset += 16;
                 }
-
-                gl.enable_vertex_attrib_array(VertexAttribute::UserData as gl::GLuint);
-                gl.vertex_attrib_divisor(VertexAttribute::UserData as gl::GLuint, 1);
-                gl.vertex_attrib_i_pointer(VertexAttribute::UserData as gl::GLuint,
-                                            2,
-                                            gl::INT,
-                                            instance_stride,
-                                            offset);
-            }
-            VertexFormat::Clear => {
-                let vertex_stride = mem::size_of::<PackedVertex>() as gl::GLuint;
-                gl.enable_vertex_attrib_array(ClearAttribute::Position as gl::GLuint);
-                gl.vertex_attrib_divisor(ClearAttribute::Position as gl::GLuint, 0);
-
-                gl.vertex_attrib_pointer(ClearAttribute::Position as gl::GLuint,
-                                          2,
-                                          gl::FLOAT,
-                                          false,
-                                          vertex_stride as gl::GLint,
-                                          0);
-
-                instance.bind(gl);
-
-                gl.enable_vertex_attrib_array(ClearAttribute::Rectangle as gl::GLuint);
-                gl.vertex_attrib_divisor(ClearAttribute::Rectangle as gl::GLuint, 1);
-                gl.vertex_attrib_i_pointer(ClearAttribute::Rectangle as gl::GLuint,
-                                            4,
-                                            gl::INT,
-                                            instance_stride,
-                                            0);
             }
             VertexFormat::Blur => {
                 let vertex_stride = mem::size_of::<PackedVertex>() as gl::GLuint;
                 gl.enable_vertex_attrib_array(BlurAttribute::Position as gl::GLuint);
                 gl.vertex_attrib_divisor(BlurAttribute::Position as gl::GLuint, 0);
 
                 gl.vertex_attrib_pointer(BlurAttribute::Position as gl::GLuint,
                                           2,
@@ -407,34 +369,25 @@ impl Program {
     fn attach_and_bind_shaders(&mut self,
                                vs_id: gl::GLuint,
                                fs_id: gl::GLuint,
                                vertex_format: VertexFormat) -> Result<(), ShaderError> {
         self.gl.attach_shader(self.id, vs_id);
         self.gl.attach_shader(self.id, fs_id);
 
         match vertex_format {
-            VertexFormat::Triangles | VertexFormat::Rectangles |
-            VertexFormat::DebugFont |  VertexFormat::DebugColor => {
+            VertexFormat::Triangles |
+            VertexFormat::DebugFont |
+            VertexFormat::DebugColor => {
                 self.gl.bind_attrib_location(self.id, VertexAttribute::Position as gl::GLuint, "aPosition");
                 self.gl.bind_attrib_location(self.id, VertexAttribute::Color as gl::GLuint, "aColor");
                 self.gl.bind_attrib_location(self.id, VertexAttribute::ColorTexCoord as gl::GLuint, "aColorTexCoord");
 
-                self.gl.bind_attrib_location(self.id, VertexAttribute::GlobalPrimId as gl::GLuint, "aGlobalPrimId");
-                self.gl.bind_attrib_location(self.id, VertexAttribute::PrimitiveAddress as gl::GLuint, "aPrimitiveAddress");
-                self.gl.bind_attrib_location(self.id, VertexAttribute::TaskIndex as gl::GLuint, "aTaskIndex");
-                self.gl.bind_attrib_location(self.id, VertexAttribute::ClipTaskIndex as gl::GLuint, "aClipTaskIndex");
-                self.gl.bind_attrib_location(self.id, VertexAttribute::LayerIndex as gl::GLuint, "aLayerIndex");
-                self.gl.bind_attrib_location(self.id, VertexAttribute::ElementIndex as gl::GLuint, "aElementIndex");
-                self.gl.bind_attrib_location(self.id, VertexAttribute::UserData as gl::GLuint, "aUserData");
-                self.gl.bind_attrib_location(self.id, VertexAttribute::ZIndex as gl::GLuint, "aZIndex");
-            }
-            VertexFormat::Clear => {
-                self.gl.bind_attrib_location(self.id, ClearAttribute::Position as gl::GLuint, "aPosition");
-                self.gl.bind_attrib_location(self.id, ClearAttribute::Rectangle as gl::GLuint, "aClearRectangle");
+                self.gl.bind_attrib_location(self.id, VertexAttribute::Data0 as gl::GLuint, "aData0");
+                self.gl.bind_attrib_location(self.id, VertexAttribute::Data1 as gl::GLuint, "aData1");
             }
             VertexFormat::Blur => {
                 self.gl.bind_attrib_location(self.id, BlurAttribute::Position as gl::GLuint, "aPosition");
                 self.gl.bind_attrib_location(self.id, BlurAttribute::RenderTaskIndex as gl::GLuint, "aBlurRenderTaskIndex");
                 self.gl.bind_attrib_location(self.id, BlurAttribute::SourceTaskIndex as gl::GLuint, "aBlurSourceTaskIndex");
                 self.gl.bind_attrib_location(self.id, BlurAttribute::Direction as gl::GLuint, "aBlurDirection");
             }
             VertexFormat::Clip => {
--- a/gfx/webrender/src/internal_types.rs
+++ b/gfx/webrender/src/internal_types.rs
@@ -113,32 +113,18 @@ pub const DEFAULT_TEXTURE: TextureSample
 
 #[derive(Clone, Copy, Debug)]
 pub enum VertexAttribute {
     // vertex-frequency basic attributes
     Position,
     Color,
     ColorTexCoord,
     // instance-frequency primitive attributes
-    GlobalPrimId,
-    PrimitiveAddress,
-    TaskIndex,
-    ClipTaskIndex,
-    LayerIndex,
-    ElementIndex,
-    UserData,
-    ZIndex,
-}
-
-#[derive(Clone, Copy, Debug)]
-pub enum ClearAttribute {
-    // vertex frequency
-    Position,
-    // instance frequency
-    Rectangle,
+    Data0, // first per-instance block of 4 ints; bound to the shader input "aData0" in device.rs
+    Data1, // second per-instance block of 4 ints; bound to the shader input "aData1" in device.rs
 }
 
 #[derive(Clone, Copy, Debug)]
 pub enum BlurAttribute {
     // vertex frequency
     Position,
     // instance frequency
     RenderTaskIndex,
--- a/gfx/webrender/src/lib.rs
+++ b/gfx/webrender/src/lib.rs
@@ -68,16 +68,19 @@ mod render_backend;
 mod render_task;
 mod resource_cache;
 mod scene;
 mod spring;
 mod texture_cache;
 mod tiling;
 mod util;
 
+#[doc(hidden)] // for benchmarks
+pub use texture_cache::TexturePage;
+
 #[cfg(feature = "webgl")]
 mod webgl_types;
 
 #[cfg(not(feature = "webgl"))]
 #[path = "webgl_stubs.rs"]
 mod webgl_types;
 
 mod shader_source {
@@ -135,9 +138,9 @@ extern crate offscreen_gl_context;
 extern crate byteorder;
 extern crate rayon;
 extern crate plane_split;
 
 #[cfg(any(target_os="macos", target_os="windows"))]
 extern crate gamma_lut;
 
 pub use renderer::{ExternalImage, ExternalImageSource, ExternalImageHandler};
-pub use renderer::{Renderer, RendererOptions};
+pub use renderer::{GraphicsApi, GraphicsApiInfo, ReadPixelsFormat, Renderer, RendererOptions};
--- a/gfx/webrender/src/renderer.rs
+++ b/gfx/webrender/src/renderer.rs
@@ -8,16 +8,17 @@
 //! is accessible through [`Renderer`][renderer]
 //!
 //! [renderer]: struct.Renderer.html
 
 use debug_colors;
 use debug_render::DebugRenderer;
 use device::{DepthFunction, Device, FrameId, ProgramId, TextureId, VertexFormat, GpuMarker, GpuProfiler};
 use device::{GpuSample, TextureFilter, VAOId, VertexUsageHint, FileWatcherHandler, TextureTarget, ShaderError};
+use device::get_gl_format_bgra;
 use euclid::Matrix4D;
 use fnv::FnvHasher;
 use frame_builder::FrameBuilderConfig;
 use gleam::gl;
 use gpu_store::{GpuStore, GpuStoreLayout};
 use internal_types::{CacheTextureId, RendererFrame, ResultMsg, TextureUpdateOp};
 use internal_types::{TextureUpdateList, PackedVertex, RenderTargetMode};
 use internal_types::{ORTHO_NEAR_PLANE, ORTHO_FAR_PLANE, SourceTexture};
@@ -38,25 +39,25 @@ use std::mem;
 use std::path::PathBuf;
 use std::rc::Rc;
 use std::sync::{Arc, Mutex};
 use std::sync::mpsc::{channel, Receiver, Sender};
 use std::thread;
 use texture_cache::TextureCache;
 use rayon::ThreadPool;
 use rayon::Configuration as ThreadPoolConfig;
-use tiling::{AlphaBatchKind, BlurCommand, Frame, PrimitiveBatch, RenderTarget};
+use tiling::{AlphaBatchKind, BlurCommand, CompositePrimitiveInstance, Frame, PrimitiveBatch, RenderTarget};
 use tiling::{AlphaRenderTarget, CacheClipInstance, PrimitiveInstance, ColorRenderTarget, RenderTargetKind};
 use time::precise_time_ns;
 use thread_profiler::{register_thread_with_profiler, write_profile};
 use util::TransformedRectKind;
 use webgl_types::GLContextHandleWrapper;
 use webrender_traits::{ColorF, Epoch, PipelineId, RenderNotifier, RenderDispatcher};
 use webrender_traits::{ExternalImageId, ExternalImageType, ImageData, ImageFormat, RenderApiSender};
-use webrender_traits::{DeviceIntRect, DevicePoint, DeviceIntPoint, DeviceIntSize, DeviceUintSize};
+use webrender_traits::{DeviceIntRect, DeviceUintRect, DevicePoint, DeviceIntPoint, DeviceIntSize, DeviceUintSize};
 use webrender_traits::{ImageDescriptor, BlobImageRenderer};
 use webrender_traits::{channel, FontRenderMode};
 use webrender_traits::VRCompositorHandler;
 use webrender_traits::{YuvColorSpace, YuvFormat};
 use webrender_traits::{YUV_COLOR_SPACES, YUV_FORMATS};
 
 pub const GPU_DATA_TEXTURE_POOL: usize = 5;
 pub const MAX_VERTEX_TEXTURE_WIDTH: usize = 1024;
@@ -78,16 +79,28 @@ const GPU_TAG_PRIM_GRADIENT: GpuProfileT
 const GPU_TAG_PRIM_ANGLE_GRADIENT: GpuProfileTag = GpuProfileTag { label: "AngleGradient", color: debug_colors::POWDERBLUE };
 const GPU_TAG_PRIM_RADIAL_GRADIENT: GpuProfileTag = GpuProfileTag { label: "RadialGradient", color: debug_colors::LIGHTPINK };
 const GPU_TAG_PRIM_BOX_SHADOW: GpuProfileTag = GpuProfileTag { label: "BoxShadow", color: debug_colors::CYAN };
 const GPU_TAG_PRIM_BORDER_CORNER: GpuProfileTag = GpuProfileTag { label: "BorderCorner", color: debug_colors::DARKSLATEGREY };
 const GPU_TAG_PRIM_BORDER_EDGE: GpuProfileTag = GpuProfileTag { label: "BorderEdge", color: debug_colors::LAVENDER };
 const GPU_TAG_PRIM_CACHE_IMAGE: GpuProfileTag = GpuProfileTag { label: "CacheImage", color: debug_colors::SILVER };
 const GPU_TAG_BLUR: GpuProfileTag = GpuProfileTag { label: "Blur", color: debug_colors::VIOLET };
 
+#[derive(Clone, Debug, PartialEq)]
+pub enum GraphicsApi { // identifies the graphics API the renderer runs on
+    OpenGL, // the value Renderer::get_graphics_api_info reports
+}
+
+#[derive(Clone, Debug)]
+pub struct GraphicsApiInfo { // description of the graphics API in use, as returned by Renderer::get_graphics_api_info
+    pub kind: GraphicsApi, // which API family is in use
+    pub renderer: String, // filled from the GL context's gl::RENDERER string
+    pub version: String, // filled from the GL context's gl::VERSION string
+}
+
 #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
 pub enum ImageBufferKind {
     Texture2D = 0,
     TextureRect = 1,
     TextureExternal = 2,
 }
 pub const IMAGE_BUFFER_KINDS: [ImageBufferKind; 3] = [
     ImageBufferKind::Texture2D,
@@ -497,16 +510,22 @@ impl GpuDataTextures {
         device.bind_texture(TextureSampler::Data64, self.data64_texture.id);
         device.bind_texture(TextureSampler::Data128, self.data128_texture.id);
         device.bind_texture(TextureSampler::ResourceRects, self.resource_rects_texture.id);
         device.bind_texture(TextureSampler::Gradients, self.gradient_data_texture.id);
         device.bind_texture(TextureSampler::SplitGeometry, self.split_geometry_texture.id);
     }
 }
 
+#[derive(Clone, Debug, PartialEq)]
+pub enum ReadPixelsFormat { // pixel layout requested from Renderer::read_pixels_into
+    Rgba8, // read back as gl::RGBA with gl::UNSIGNED_BYTE components, 4 bytes per pixel
+    Bgra8, // read back in the format chosen by get_gl_format_bgra with gl::UNSIGNED_BYTE components, 4 bytes per pixel
+}
+
 /// The renderer is responsible for submitting to the GPU the work prepared by the
 /// RenderBackend.
 pub struct Renderer {
     result_rx: Receiver<ResultMsg>,
     device: Device,
     pending_texture_updates: Vec<TextureUpdateList>,
     pending_shader_updates: Vec<PathBuf>,
     current_frame: Option<RendererFrame>,
@@ -1131,24 +1150,28 @@ impl Renderer {
             cpu_profiles: VecDeque::new(),
             gpu_profiles: VecDeque::new(),
         };
 
         let sender = RenderApiSender::new(api_tx, payload_tx);
         Ok((renderer, sender))
     }
 
+    pub fn get_graphics_api_info(&self) -> GraphicsApiInfo { // reports the API kind plus the GL version and renderer strings
+        GraphicsApiInfo {
+            kind: GraphicsApi::OpenGL, // the only variant GraphicsApi defines
+            version: self.device.gl().get_string(gl::VERSION), // queried from the GL context on every call, not cached
+            renderer: self.device.gl().get_string(gl::RENDERER),
+        }
+    }
+
     fn get_yuv_shader_index(buffer_kind: ImageBufferKind, format: YuvFormat, color_space: YuvColorSpace) -> usize {
         ((buffer_kind as usize) * YUV_FORMATS.len() + (format as usize)) * YUV_COLOR_SPACES.len() + (color_space as usize)
     }
 
-    pub fn gl(&self) -> &gl::Gl {
-        self.device.gl()
-    }
-
     /// Sets the new RenderNotifier.
     ///
     /// The RenderNotifier will be called when processing e.g. of a (scrolling) frame is done,
     /// and therefore the screen should be updated.
     pub fn set_render_notifier(&self, notifier: Box<RenderNotifier>) {
         let mut notifier_arc = self.notifier.lock().unwrap();
         *notifier_arc = Some(notifier);
     }
@@ -1500,18 +1523,22 @@ impl Renderer {
                     projection: &Matrix4D<f32>,
                     render_task_data: &[RenderTaskData],
                     cache_texture: TextureId,
                     render_target: Option<(TextureId, i32)>,
                     target_dimensions: DeviceUintSize) {
         let transform_kind = batch.key.flags.transform_kind();
         let needs_clipping = batch.key.flags.needs_clipping();
         debug_assert!(!needs_clipping ||
-                      batch.key.blend_mode == BlendMode::Alpha ||
-                      batch.key.blend_mode == BlendMode::PremultipliedAlpha);
+                      match batch.key.blend_mode {
+                          BlendMode::Alpha |
+                          BlendMode::PremultipliedAlpha |
+                          BlendMode::Subpixel(..) => true,
+                          BlendMode::None => false,
+                      });
 
         let (marker, shader) = match batch.key.kind {
             AlphaBatchKind::Composite => {
                 let shader = self.ps_composite.get(&mut self.device);
                 (GPU_TAG_PRIM_COMPOSITE, shader)
             }
             AlphaBatchKind::HardwareComposite => {
                 let shader = self.ps_hw_composite.get(&mut self.device);
@@ -1581,32 +1608,32 @@ impl Renderer {
             }
         };
 
         // Handle special case readback for composites.
         if batch.key.kind == AlphaBatchKind::Composite {
             // composites can't be grouped together because
             // they may overlap and affect each other.
             debug_assert!(batch.instances.len() == 1);
-            let instance = &batch.instances[0];
+            let instance = CompositePrimitiveInstance::from(&batch.instances[0]);
 
             // TODO(gw): This code branch is all a bit hacky. We rely
             // on pulling specific values from the render target data
             // and also cloning the single primitive instance to be
             // able to pass to draw_instanced_batch(). We should
             // think about a cleaner way to achieve this!
 
             // Before submitting the composite batch, do the
             // framebuffer readbacks that are needed for each
             // composite operation in this batch.
             let cache_texture_dimensions = self.device.get_texture_dimensions(cache_texture);
 
-            let backdrop = &render_task_data[instance.task_index as usize];
-            let readback = &render_task_data[instance.user_data[0] as usize];
-            let source = &render_task_data[instance.user_data[1] as usize];
+            let backdrop = &render_task_data[instance.task_index.0 as usize];
+            let readback = &render_task_data[instance.backdrop_task_index.0 as usize];
+            let source = &render_task_data[instance.src_task_index.0 as usize];
 
             // Bind the FBO to blit the backdrop to.
             // Called per-instance in case the layer (and therefore FBO)
             // changes. The device will skip the GL call if the requested
             // target is already bound.
             let cache_draw_target = (cache_texture, readback.data[4] as i32);
             self.device.bind_draw_target(Some(cache_draw_target), Some(cache_texture_dimensions));
 
@@ -2139,16 +2166,41 @@ impl Renderer {
                                                    dest_rect);
 
                     current_target += 1;
                 }
             }
         }
     }
 
+    pub fn read_pixels_rgba8(&self, rect: DeviceUintRect) -> Vec<u8> { // returns `rect` read back as RGBA8, 4 bytes per pixel
+        let mut pixels = vec![0u8; 4 * rect.size.width as usize * rect.size.height as usize]; // widen to usize before multiplying: the u32 product can overflow
+        self.read_pixels_into(rect, ReadPixelsFormat::Rgba8, &mut pixels);
+        pixels
+    }
+
+    pub fn read_pixels_into(&self, // reads `rect` back through GL into `output`, laid out as `format`
+                            rect: DeviceUintRect,
+                            format: ReadPixelsFormat,
+                            output: &mut [u8]) { // panics unless output.len() == bytes-per-pixel * width * height
+        let (gl_format, gl_type, size) = match format { // `size` is the number of bytes per pixel
+            ReadPixelsFormat::Rgba8 => (gl::RGBA, gl::UNSIGNED_BYTE, 4),
+            ReadPixelsFormat::Bgra8 => (get_gl_format_bgra(self.device.gl()), gl::UNSIGNED_BYTE, 4),
+        };
+        assert_eq!(output.len(), size * rect.size.width as usize * rect.size.height as usize); // usize math: a wrapped u32 product could accept a too-small buffer
+        self.device.gl().flush();
+        self.device.gl().read_pixels_into_buffer(rect.origin.x as gl::GLint,
+                                                 rect.origin.y as gl::GLint,
+                                                 rect.size.width as gl::GLsizei,
+                                                 rect.size.height as gl::GLsizei,
+                                                 gl_format,
+                                                 gl_type,
+                                                 output);
+    }
+
     // De-initialize the Renderer safely, assuming the GL is still alive and active.
     pub fn deinit(mut self) {
         //Note: this is a fake frame, only needed because texture deletion is require to happen inside a frame
         self.device.begin_frame(1.0);
         self.device.deinit_texture(self.dummy_cache_texture_id);
         self.device.end_frame();
     }
 }
--- a/gfx/webrender/src/texture_cache.rs
+++ b/gfx/webrender/src/texture_cache.rs
@@ -3,17 +3,17 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use device::TextureFilter;
 use fnv::FnvHasher;
 use freelist::{FreeList, FreeListItem, FreeListItemId};
 use internal_types::{TextureUpdate, TextureUpdateOp};
 use internal_types::{CacheTextureId, RenderTargetMode, TextureUpdateList, RectUv};
 use profiler::TextureCacheProfileCounters;
-use std::cmp::{self, Ordering};
+use std::cmp;
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
 use std::hash::BuildHasherDefault;
 use std::mem;
 use std::slice::Iter;
 use time;
 use util;
 use webrender_traits::{ExternalImageType, ImageData, ImageFormat, DevicePixel, DeviceIntPoint};
@@ -45,38 +45,46 @@ const MINIMUM_LARGE_RECT_SIZE: u32 = 32;
 const COALESCING_TIMEOUT: u64 = 100;
 
 /// The number of items that we process in the coalescing work list before checking whether we hit
 /// the timeout.
 const COALESCING_TIMEOUT_CHECKING_INTERVAL: usize = 256;
 
 pub type TextureCacheItemId = FreeListItemId;
 
+enum CoalescingStatus { // outcome of one coalescing pass (returned by TexturePage::coalesce_impl)
+    Changed, // the union callback reported at least one merge
+    Unchanged, // the pass ran to completion without any merge
+    Timeout, // the deadline was reached before the pass finished
+}
+
 /// A texture allocator using the guillotine algorithm with the rectangle merge improvement. See
 /// sections 2.2 and 2.2.5 in "A Thousand Ways to Pack the Bin - A Practical Approach to Two-
 /// Dimensional Rectangle Bin Packing":
 ///
 ///    http://clb.demon.fi/files/RectangleBinPack.pdf
 ///
 /// This approach was chosen because of its simplicity, good performance, and easy support for
 /// dynamic texture deallocation.
 pub struct TexturePage {
     texture_id: CacheTextureId,
     texture_size: DeviceUintSize,
     free_list: FreeRectList,
+    coalesce_vec: Vec<DeviceUintRect>,
     allocations: u32,
     dirty: bool,
 }
 
 impl TexturePage {
     pub fn new(texture_id: CacheTextureId, texture_size: DeviceUintSize) -> TexturePage {
         let mut page = TexturePage {
             texture_id: texture_id,
             texture_size: texture_size,
             free_list: FreeRectList::new(),
+            coalesce_vec: Vec::new(),
             allocations: 0,
             dirty: false,
         };
         page.clear();
         page
     }
 
     fn find_index_of_best_rect_in_bin(&self, bin: FreeListBin, requested_dimensions: &DeviceUintSize)
@@ -112,16 +120,19 @@ impl TexturePage {
     }
 
     pub fn can_allocate(&self, requested_dimensions: &DeviceUintSize) -> bool {
         self.find_index_of_best_rect(requested_dimensions).is_some()
     }
 
     pub fn allocate(&mut self,
                     requested_dimensions: &DeviceUintSize) -> Option<DeviceUintPoint> {
+        if requested_dimensions.width == 0 || requested_dimensions.height == 0 {
+            return Some(DeviceUintPoint::new(0, 0))
+        }
         let index = match self.find_index_of_best_rect(requested_dimensions) {
             None => return None,
             Some(index) => index,
         };
 
         // Remove the rect from the free list and decide how to guillotine it. We choose the split
         // that results in the single largest area (Min Area Split Rule, MINAS).
         let chosen_rect = self.free_list.remove(index);
@@ -168,116 +179,127 @@ impl TexturePage {
 
         // Bump the allocation counter.
         self.allocations += 1;
 
         // Return the result.
         Some(chosen_rect.origin)
     }
 
-    #[inline(never)]
+    fn coalesce_impl<F, U>(rects: &mut [DeviceUintRect], deadline: u64, fun_key: F, fun_union: U)
+                    -> CoalescingStatus where
+        F: Fn(&DeviceUintRect) -> (u32, u32),
+        U: Fn(&mut DeviceUintRect, &mut DeviceUintRect) -> usize,
+    {
+        let mut num_changed = 0;
+        rects.sort_by_key(&fun_key);
+
+        for work_index in 0..rects.len() {
+            if work_index % COALESCING_TIMEOUT_CHECKING_INTERVAL == 0 &&
+                    time::precise_time_ns() >= deadline {
+                return CoalescingStatus::Timeout
+            }
+
+            let (left, candidates) = rects.split_at_mut(work_index + 1);
+            let mut item = left.last_mut().unwrap();
+            if util::rect_is_empty(item) {
+                continue
+            }
+
+            let key = fun_key(item);
+            for candidate in candidates.iter_mut()
+                                       .take_while(|r| key == fun_key(r)) {
+                num_changed += fun_union(item, candidate);
+            }
+        }
+
+        if num_changed > 0 {
+            CoalescingStatus::Changed
+        } else {
+            CoalescingStatus::Unchanged
+        }
+    }
+
+    /// Combine rects that have the same width and are adjacent.
+    fn coalesce_horisontal(rects: &mut [DeviceUintRect], deadline: u64) -> CoalescingStatus {
+        Self::coalesce_impl(rects, deadline,
+                            |item| (item.size.width, item.origin.x),
+                            |item, candidate| {
+            if item.origin.y == candidate.max_y() || item.max_y() == candidate.origin.y {
+                *item = item.union(candidate);
+                candidate.size.width = 0;
+                1
+            } else { 0 }
+        })
+    }
+
+    /// Combine rects that have the same height and are adjacent.
+    fn coalesce_vertical(rects: &mut [DeviceUintRect], deadline: u64) -> CoalescingStatus {
+        Self::coalesce_impl(rects, deadline,
+                            |item| (item.size.height, item.origin.y),
+                            |item, candidate| {
+            if item.origin.x == candidate.max_x() || item.max_x() == candidate.origin.x {
+                *item = item.union(candidate);
+                candidate.size.height = 0;
+                1
+            } else { 0 }
+        })
+    }
+
     pub fn coalesce(&mut self) -> bool {
         if !self.dirty {
             return false
         }
 
         // Iterate to a fixed point or until a timeout is reached.
         let deadline = time::precise_time_ns() + COALESCING_TIMEOUT;
-        let mut free_list = mem::replace(&mut self.free_list, FreeRectList::new()).into_vec();
+        self.free_list.copy_to_vec(&mut self.coalesce_vec);
         let mut changed = false;
 
-        // Combine rects that have the same width and are adjacent.
-        let mut new_free_list = Vec::new();
-        free_list.sort_by(|a, b| {
-            match a.size.width.cmp(&b.size.width) {
-                Ordering::Equal => a.origin.x.cmp(&b.origin.x),
-                ordering => ordering,
-            }
-        });
-        for work_index in 0..free_list.len() {
-            if work_index % COALESCING_TIMEOUT_CHECKING_INTERVAL == 0 &&
-                    time::precise_time_ns() >= deadline {
-                self.free_list = FreeRectList::from_slice(&free_list[..]);
-                self.dirty = true;
+        // Note: we might want to consider trying to reuse the last sorted order first,
+        // but the elements get shuffled around a bit anyway during the bin placement.
+
+        match Self::coalesce_horisontal(&mut self.coalesce_vec, deadline) {
+            CoalescingStatus::Changed => changed = true,
+            CoalescingStatus::Unchanged => (),
+            CoalescingStatus::Timeout => {
+                self.free_list.init_from_slice(&self.coalesce_vec);
                 return true
             }
+        }
 
-            if free_list[work_index].size.width == 0 {
-                continue
-            }
-            for candidate_index in (work_index + 1)..free_list.len() {
-                if free_list[work_index].size.width != free_list[candidate_index].size.width ||
-                        free_list[work_index].origin.x != free_list[candidate_index].origin.x {
-                    break
-                }
-                if free_list[work_index].origin.y == free_list[candidate_index].max_y() ||
-                        free_list[work_index].max_y() == free_list[candidate_index].origin.y {
-                    changed = true;
-                    free_list[work_index] =
-                        free_list[work_index].union(&free_list[candidate_index]);
-                    free_list[candidate_index].size.width = 0
-                }
-                new_free_list.push(free_list[work_index])
-            }
-            new_free_list.push(free_list[work_index])
-        }
-        free_list = new_free_list;
-
-        // Combine rects that have the same height and are adjacent.
-        let mut new_free_list = Vec::new();
-        free_list.sort_by(|a, b| {
-            match a.size.height.cmp(&b.size.height) {
-                Ordering::Equal => a.origin.y.cmp(&b.origin.y),
-                ordering => ordering,
-            }
-        });
-        for work_index in 0..free_list.len() {
-            if work_index % COALESCING_TIMEOUT_CHECKING_INTERVAL == 0 &&
-                    time::precise_time_ns() >= deadline {
-                self.free_list = FreeRectList::from_slice(&free_list[..]);
-                self.dirty = true;
+        match Self::coalesce_vertical(&mut self.coalesce_vec, deadline) {
+            CoalescingStatus::Changed => changed = true,
+            CoalescingStatus::Unchanged => (),
+            CoalescingStatus::Timeout => {
+                self.free_list.init_from_slice(&self.coalesce_vec);
                 return true
             }
+        }
 
-            if free_list[work_index].size.height == 0 {
-                continue
-            }
-            for candidate_index in (work_index + 1)..free_list.len() {
-                if free_list[work_index].size.height !=
-                        free_list[candidate_index].size.height ||
-                        free_list[work_index].origin.y != free_list[candidate_index].origin.y {
-                    break
-                }
-                if free_list[work_index].origin.x == free_list[candidate_index].max_x() ||
-                        free_list[work_index].max_x() == free_list[candidate_index].origin.x {
-                    changed = true;
-                    free_list[work_index] =
-                        free_list[work_index].union(&free_list[candidate_index]);
-                    free_list[candidate_index].size.height = 0
-                }
-            }
-            new_free_list.push(free_list[work_index])
+        if changed {
+            self.free_list.init_from_slice(&self.coalesce_vec);
         }
-        free_list = new_free_list;
-
-        self.free_list = FreeRectList::from_slice(&free_list[..]);
         self.dirty = changed;
         changed
     }
 
     pub fn clear(&mut self) {
         self.free_list = FreeRectList::new();
         self.free_list.push(&DeviceUintRect::new(
             DeviceUintPoint::zero(),
             self.texture_size));
         self.allocations = 0;
         self.dirty = false;
     }
 
     fn free(&mut self, rect: &DeviceUintRect) {
+        if util::rect_is_empty(rect) {
+            return
+        }
         debug_assert!(self.allocations > 0);
         self.allocations -= 1;
         if self.allocations == 0 {
             self.clear();
             return
         }
 
         self.free_list.push(rect);
@@ -307,16 +329,35 @@ impl TexturePage {
         self.texture_size = new_texture_size
     }
 
     fn can_grow(&self, max_size: u32) -> bool {
         self.texture_size.width < max_size || self.texture_size.height < max_size
     }
 }
 
+// testing functionality
+impl TexturePage {
+    #[doc(hidden)]
+    pub fn new_dummy(size: DeviceUintSize) -> TexturePage {
+        Self::new(CacheTextureId(0), size)
+    }
+
+    #[doc(hidden)]
+    pub fn fill_from(&mut self, other: &TexturePage) {
+        self.dirty = true;
+        self.free_list.small.clear();
+        self.free_list.small.extend_from_slice(&other.free_list.small);
+        self.free_list.medium.clear();
+        self.free_list.medium.extend_from_slice(&other.free_list.medium);
+        self.free_list.large.clear();
+        self.free_list.large.extend_from_slice(&other.free_list.large);
+    }
+}
+
 /// A binning free list. Binning is important to avoid sifting through lots of small strips when
 /// allocating many texture items.
 struct FreeRectList {
     small: Vec<DeviceUintRect>,
     medium: Vec<DeviceUintRect>,
     large: Vec<DeviceUintRect>,
 }
 
@@ -324,22 +365,25 @@ impl FreeRectList {
     fn new() -> FreeRectList {
         FreeRectList {
             small: vec![],
             medium: vec![],
             large: vec![],
         }
     }
 
-    fn from_slice(vector: &[DeviceUintRect]) -> FreeRectList {
-        let mut free_list = FreeRectList::new();
-        for rect in vector {
-            free_list.push(rect)
+    fn init_from_slice(&mut self, rects: &[DeviceUintRect]) {
+        self.small.clear();
+        self.medium.clear();
+        self.large.clear();
+        for rect in rects {
+            if !util::rect_is_empty(rect) {
+                self.push(rect)
+            }
         }
-        free_list
     }
 
     fn push(&mut self, rect: &DeviceUintRect) {
         match FreeListBin::for_size(&rect.size) {
             FreeListBin::Small => self.small.push(*rect),
             FreeListBin::Medium => self.medium.push(*rect),
             FreeListBin::Large => self.large.push(*rect),
         }
@@ -356,41 +400,43 @@ impl FreeRectList {
     fn iter(&self, bin: FreeListBin) -> Iter<DeviceUintRect> {
         match bin {
             FreeListBin::Small => self.small.iter(),
             FreeListBin::Medium => self.medium.iter(),
             FreeListBin::Large => self.large.iter(),
         }
     }
 
-    fn into_vec(mut self) -> Vec<DeviceUintRect> {
-        self.small.extend(self.medium.drain(..));
-        self.small.extend(self.large.drain(..));
-        self.small
+    fn copy_to_vec(&self, rects: &mut Vec<DeviceUintRect>) {
+        rects.clear();
+        rects.extend_from_slice(&self.small);
+        rects.extend_from_slice(&self.medium);
+        rects.extend_from_slice(&self.large);
     }
 }
 
 #[derive(Debug, Clone, Copy)]
 struct FreeListIndex(FreeListBin, usize);
 
 #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
 enum FreeListBin {
     Small,
     Medium,
     Large,
 }
 
 impl FreeListBin {
-    pub fn for_size(size: &DeviceUintSize) -> FreeListBin {
+    fn for_size(size: &DeviceUintSize) -> FreeListBin {
         if size.width >= MINIMUM_LARGE_RECT_SIZE && size.height >= MINIMUM_LARGE_RECT_SIZE {
             FreeListBin::Large
         } else if size.width >= MINIMUM_MEDIUM_RECT_SIZE &&
                 size.height >= MINIMUM_MEDIUM_RECT_SIZE {
             FreeListBin::Medium
         } else {
+            debug_assert!(size.width > 0 && size.height > 0);
             FreeListBin::Small
         }
     }
 }
 
 #[derive(Debug, Clone)]
 pub struct TextureCacheItem {
     // Identifies the texture and array slice
--- a/gfx/webrender/src/tiling.rs
+++ b/gfx/webrender/src/tiling.rs
@@ -309,80 +309,75 @@ impl AlphaRenderItem {
                     LowLevelFilterOp::Sepia(amount) => (6, amount.to_f32_px()),
                     LowLevelFilterOp::Brightness(amount) => (7, amount.to_f32_px()),
                     LowLevelFilterOp::Opacity(amount) => (8, amount.to_f32_px()),
                 };
 
                 let amount = (amount * 65535.0).round() as i32;
                 let batch = batch_list.get_suitable_batch(&key, &stacking_context.screen_bounds);
 
-                batch.add_instance(PrimitiveInstance {
-                    global_prim_id: -1,
-                    prim_address: GpuStoreAddress(0),
-                    task_index: task_index.0 as i32,
-                    clip_task_index: -1,
-                    layer_index: -1,
-                    sub_index: filter_mode,
-                    user_data: [src_task_index.0 as i32, amount],
-                    z_sort_index: z,
-                });
+                let instance = CompositePrimitiveInstance::new(task_index,
+                                                               src_task_index,
+                                                               RenderTaskIndex(0),
+                                                               filter_mode,
+                                                               amount,
+                                                               z);
+
+                batch.add_instance(PrimitiveInstance::from(instance));
             }
             AlphaRenderItem::HardwareComposite(stacking_context_index, src_id, composite_op, z) => {
                 let stacking_context = &ctx.stacking_context_store[stacking_context_index.0];
                 let src_task_index = render_tasks.get_static_task_index(&src_id);
                 let key = AlphaBatchKey::new(AlphaBatchKind::HardwareComposite,
                                              AlphaBatchKeyFlags::empty(),
                                              composite_op.to_blend_mode(),
                                              BatchTextures::no_texture());
                 let batch = batch_list.get_suitable_batch(&key, &stacking_context.screen_bounds);
-                batch.add_instance(PrimitiveInstance {
-                    global_prim_id: -1,
-                    prim_address: GpuStoreAddress(0),
-                    task_index: task_index.0 as i32,
-                    clip_task_index: -1,
-                    layer_index: -1,
-                    sub_index: -1,
-                    user_data: [src_task_index.0 as i32, 0],
-                    z_sort_index: z,
-                });
+
+                let instance = CompositePrimitiveInstance::new(task_index,
+                                                               src_task_index,
+                                                               RenderTaskIndex(0),
+                                                               0,
+                                                               0,
+                                                               z);
+
+                batch.add_instance(PrimitiveInstance::from(instance));
             }
             AlphaRenderItem::Composite(stacking_context_index,
                                        backdrop_id,
                                        src_id,
                                        mode,
                                        z) => {
                 let stacking_context = &ctx.stacking_context_store[stacking_context_index.0];
                 let key = AlphaBatchKey::new(AlphaBatchKind::Composite,
                                              AlphaBatchKeyFlags::empty(),
                                              BlendMode::Alpha,
                                              BatchTextures::no_texture());
                 let batch = batch_list.get_suitable_batch(&key, &stacking_context.screen_bounds);
                 let backdrop_task = render_tasks.get_task_index(&backdrop_id, child_pass_index);
                 let src_task_index = render_tasks.get_static_task_index(&src_id);
-                batch.add_instance(PrimitiveInstance {
-                    global_prim_id: -1,
-                    prim_address: GpuStoreAddress(0),
-                    task_index: task_index.0 as i32,
-                    clip_task_index: -1,
-                    layer_index: -1,
-                    sub_index: mode as u32 as i32,
-                    user_data: [ backdrop_task.0 as i32,
-                                 src_task_index.0 as i32 ],
-                    z_sort_index: z,
-                });
+
+                let instance = CompositePrimitiveInstance::new(task_index,
+                                                               src_task_index,
+                                                               backdrop_task,
+                                                               mode as u32 as i32,
+                                                               0,
+                                                               z);
+
+                batch.add_instance(PrimitiveInstance::from(instance));
             }
             AlphaRenderItem::Primitive(clip_scroll_group_index_opt, prim_index, z) => {
                 let prim_metadata = ctx.prim_store.get_metadata(prim_index);
                 let (transform_kind, packed_layer_index) = match clip_scroll_group_index_opt {
                     Some(group_index) => {
                         let group = &ctx.clip_scroll_group_store[group_index.0];
                         let bounding_rect = group.screen_bounding_rect.as_ref().unwrap();
-                        (bounding_rect.0, group.packed_layer_index.0 as i32)
+                        (bounding_rect.0, group.packed_layer_index)
                     },
-                    None => (TransformedRectKind::AxisAligned, 0),
+                    None => (TransformedRectKind::AxisAligned, PackedLayerIndex(0)),
                 };
                 let needs_clipping = prim_metadata.needs_clipping();
                 let mut flags = AlphaBatchKeyFlags::empty();
                 if needs_clipping {
                     flags |= NEEDS_CLIPPING;
                 }
                 if transform_kind == TransformedRectKind::AxisAligned {
                     flags |= AXIS_ALIGNED;
@@ -393,73 +388,64 @@ impl AlphaRenderItem {
                 let item_bounding_rect = ctx.prim_store.cpu_bounding_rects[prim_index.0].as_ref().unwrap();
                 let clip_task_index = match prim_metadata.clip_task {
                     Some(ref clip_task) => {
                         render_tasks.get_task_index(&clip_task.id, child_pass_index)
                     }
                     None => {
                         OPAQUE_TASK_INDEX
                     }
-                }.0 as i32;
-                let global_prim_id = prim_index.0 as i32;
-                let prim_address = prim_metadata.gpu_prim_index;
-                let task_index = task_index.0 as i32;
+                };
                 let needs_blending = !prim_metadata.is_opaque ||
                                      needs_clipping ||
                                      transform_kind == TransformedRectKind::Complex;
                 let blend_mode = ctx.prim_store.get_blend_mode(needs_blending, prim_metadata);
-                let base_instance = PrimitiveInstance {
-                    task_index: task_index,
-                    clip_task_index: clip_task_index,
-                    layer_index: packed_layer_index,
-                    global_prim_id: global_prim_id,
-                    prim_address: prim_address,
-                    sub_index: 0,
-                    user_data: [0, 0],
-                    z_sort_index: z,
-                };
+
+                let base_instance = SimplePrimitiveInstance::new(prim_index,
+                                                                 prim_metadata.gpu_prim_index,
+                                                                 task_index,
+                                                                 clip_task_index,
+                                                                 packed_layer_index,
+                                                                 z);
 
                 match prim_metadata.prim_kind {
                     PrimitiveKind::Border => {
                         let border_cpu = &ctx.prim_store.cpu_borders[prim_metadata.cpu_prim_index.0];
                         // TODO(gw): Select correct blend mode for edges and corners!!
                         let corner_key = AlphaBatchKey::new(AlphaBatchKind::BorderCorner, flags, blend_mode, textures);
                         let edge_key = AlphaBatchKey::new(AlphaBatchKind::BorderEdge, flags, blend_mode, textures);
 
                         batch_list.with_suitable_batch(&corner_key, item_bounding_rect, |batch| {
                             for (i, instance_kind) in border_cpu.corner_instances.iter().enumerate() {
                                 let sub_index = i as i32;
                                 match *instance_kind {
                                     BorderCornerInstance::Single => {
                                         batch.add_instance(base_instance.build(sub_index,
-                                                                               BorderCornerSide::Both as i32,
-                                                                               0));
+                                                                               BorderCornerSide::Both as i32,));
                                     }
                                     BorderCornerInstance::Double => {
                                         batch.add_instance(base_instance.build(sub_index,
-                                                                               BorderCornerSide::First as i32,
-                                                                               0));
+                                                                               BorderCornerSide::First as i32));
                                         batch.add_instance(base_instance.build(sub_index,
-                                                                               BorderCornerSide::Second as i32,
-                                                                               0));
+                                                                               BorderCornerSide::Second as i32));
                                     }
                                 }
                             }
                         });
 
                         batch_list.with_suitable_batch(&edge_key, item_bounding_rect, |batch| {
                             for border_segment in 0..4 {
-                                batch.add_instance(base_instance.build(border_segment, 0, 0));
+                                batch.add_instance(base_instance.build(border_segment, 0));
                             }
                         });
                     }
                     PrimitiveKind::Rectangle => {
                         let key = AlphaBatchKey::new(AlphaBatchKind::Rectangle, flags, blend_mode, textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
-                        batch.add_instance(base_instance);
+                        batch.add_instance(base_instance.build(0, 0));
                     }
                     PrimitiveKind::Image => {
                         let image_cpu = &ctx.prim_store.cpu_images[prim_metadata.cpu_prim_index.0];
 
                         let batch_kind = match image_cpu.color_texture_id {
                             SourceTexture::External(ext_image) => {
                                 match ext_image.image_type {
                                     ExternalImageType::Texture2DHandle => AlphaBatchKind::Image(ImageBufferKind::Texture2D),
@@ -474,17 +460,17 @@ impl AlphaRenderItem {
                             }
                             _ => {
                                 AlphaBatchKind::Image(ImageBufferKind::Texture2D)
                             }
                         };
 
                         let key = AlphaBatchKey::new(batch_kind, flags, blend_mode, textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
-                        batch.add_instance(base_instance.build(0, image_cpu.resource_address.0, 0));
+                        batch.add_instance(base_instance.build(image_cpu.resource_address.0, 0));
                     }
                     PrimitiveKind::TextRun => {
                         let text_cpu = &ctx.prim_store.cpu_text_runs[prim_metadata.cpu_prim_index.0];
                         let batch_kind = if text_cpu.blur_radius == 0.0 {
                             AlphaBatchKind::TextRun
                         } else {
                             // Select a generic primitive shader that can blit the
                             // results of the cached text blur to the framebuffer,
@@ -499,46 +485,43 @@ impl AlphaRenderItem {
                                 let cache_task_id = task.id;
                                 render_tasks.get_task_index(&cache_task_id,
                                                             child_pass_index).0 as i32
                             }
                             None => 0,
                         };
 
                         for glyph_index in 0..prim_metadata.gpu_data_count {
-                            let user_data0 = match batch_kind {
+                            let user_data1 = match batch_kind {
                                 AlphaBatchKind::TextRun => text_cpu.resource_address.0 + glyph_index,
                                 AlphaBatchKind::CacheImage => cache_task_index,
                                 _ => unreachable!(),
                             };
 
                             batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0 + glyph_index,
-                                                                   user_data0,
-                                                                   0));
+                                                                   user_data1));
                         }
                     }
                     PrimitiveKind::AlignedGradient => {
                         let key = AlphaBatchKey::new(AlphaBatchKind::AlignedGradient, flags, blend_mode, textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
                         for part_index in 0..(prim_metadata.gpu_data_count - 1) {
-                            batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0 + part_index, 0, 0));
+                            batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0 + part_index, 0));
                         }
                     }
                     PrimitiveKind::AngleGradient => {
                         let key = AlphaBatchKey::new(AlphaBatchKind::AngleGradient, flags, blend_mode, textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
                         batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0,
-                                                               prim_metadata.gpu_data_count,
                                                                0));
                     }
                     PrimitiveKind::RadialGradient => {
                         let key = AlphaBatchKey::new(AlphaBatchKind::RadialGradient, flags, blend_mode, textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
                         batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0,
-                                                               prim_metadata.gpu_data_count,
                                                                0));
                     }
                     PrimitiveKind::YuvImage => {
                         let image_yuv_cpu = &ctx.prim_store.cpu_yuv_images[prim_metadata.cpu_prim_index.0];
 
                         let get_buffer_kind = |texture: SourceTexture| {
                             match texture {
                                 SourceTexture::External(ext_image) => {
@@ -566,54 +549,51 @@ impl AlphaRenderItem {
                         ));
 
                         let key = AlphaBatchKey::new(AlphaBatchKind::YuvImage(buffer_kind, image_yuv_cpu.format, image_yuv_cpu.color_space),
                                                      flags,
                                                      blend_mode,
                                                      textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
 
-                        batch.add_instance(base_instance.build(0,
-                                                               image_yuv_cpu.yuv_resource_address.0,
+                        batch.add_instance(base_instance.build(image_yuv_cpu.yuv_resource_address.0,
                                                                0));
                     }
                     PrimitiveKind::BoxShadow => {
                         let cache_task_id = &prim_metadata.render_task.as_ref().unwrap().id;
                         let cache_task_index = render_tasks.get_task_index(cache_task_id,
                                                                            child_pass_index);
 
                         let key = AlphaBatchKey::new(AlphaBatchKind::BoxShadow, flags, blend_mode, textures);
                         let batch = batch_list.get_suitable_batch(&key, item_bounding_rect);
 
                         for rect_index in 0..prim_metadata.gpu_data_count {
                             batch.add_instance(base_instance.build(prim_metadata.gpu_data_address.0 + rect_index,
-                                                                   cache_task_index.0 as i32,
-                                                                   0));
+                                                                   cache_task_index.0 as i32));
                         }
                     }
                 }
             }
             AlphaRenderItem::SplitComposite(sc_index, task_id, gpu_address, z) => {
                 let key = AlphaBatchKey::new(AlphaBatchKind::SplitComposite,
                                              AlphaBatchKeyFlags::empty(),
                                              BlendMode::PremultipliedAlpha,
                                              BatchTextures::no_texture());
                 let stacking_context = &ctx.stacking_context_store[sc_index.0];
                 let batch = batch_list.get_suitable_batch(&key, &stacking_context.screen_bounds);
                 let source_task = render_tasks.get_task_index(&task_id, child_pass_index);
-                batch.add_instance(PrimitiveInstance {
-                    global_prim_id: -1,
-                    prim_address: gpu_address,
-                    task_index: task_index.0 as i32,
-                    clip_task_index: -1,
-                    layer_index: -1, // not be used
-                    sub_index: 0,
-                    user_data: [ source_task.0 as i32, 0 ],
-                    z_sort_index: z,
-                });
+
+                let instance = CompositePrimitiveInstance::new(task_index,
+                                                               source_task,
+                                                               RenderTaskIndex(0),
+                                                               gpu_address.0,
+                                                               0,
+                                                               z);
+
+                batch.add_instance(PrimitiveInstance::from(instance));
             }
         }
     }
 }
 
 impl AlphaBatcher {
     fn new() -> AlphaBatcher {
         AlphaBatcher {
@@ -965,26 +945,23 @@ impl RenderTarget for ColorRenderTarget 
                     padding: 0,
                 });
             }
             RenderTaskKind::CachePrimitive(prim_index) => {
                 let prim_metadata = ctx.prim_store.get_metadata(prim_index);
 
                 match prim_metadata.prim_kind {
                     PrimitiveKind::BoxShadow => {
-                        self.box_shadow_cache_prims.push(PrimitiveInstance {
-                            global_prim_id: prim_index.0 as i32,
-                            prim_address: prim_metadata.gpu_prim_index,
-                            task_index: render_tasks.get_task_index(&task.id, pass_index).0 as i32,
-                            clip_task_index: 0,
-                            layer_index: 0,
-                            sub_index: 0,
-                            user_data: [0; 2],
-                            z_sort_index: 0,        // z is disabled for rendering cache primitives
-                        });
+                        let instance = SimplePrimitiveInstance::new(prim_index,
+                                                                    prim_metadata.gpu_prim_index,
+                                                                    render_tasks.get_task_index(&task.id, pass_index),
+                                                                    RenderTaskIndex(0),
+                                                                    PackedLayerIndex(0),
+                                                                    0);     // z is disabled for rendering cache primitives
+                        self.box_shadow_cache_prims.push(instance.build(0, 0));
                     }
                     PrimitiveKind::TextRun => {
                         let text = &ctx.prim_store.cpu_text_runs[prim_metadata.cpu_prim_index.0];
                         // We only cache text runs with a text-shadow (for now).
                         debug_assert!(text.blur_radius != 0.0);
 
                         // TODO(gw): This should always be fine for now, since the texture
                         // atlas grows to 4k. However, it won't be a problem soon, once
@@ -993,27 +970,26 @@ impl RenderTarget for ColorRenderTarget 
                             colors: ctx.prim_store.get_color_textures(prim_metadata),
                         };
 
                         debug_assert!(textures.colors[0] != SourceTexture::Invalid);
                         debug_assert!(self.text_run_textures.colors[0] == SourceTexture::Invalid ||
                                       self.text_run_textures.colors[0] == textures.colors[0]);
                         self.text_run_textures = textures;
 
+                        let instance = SimplePrimitiveInstance::new(prim_index,
+                                                                    prim_metadata.gpu_prim_index,
+                                                                    render_tasks.get_task_index(&task.id, pass_index),
+                                                                    RenderTaskIndex(0),
+                                                                    PackedLayerIndex(0),
+                                                                    0);     // z is disabled for rendering cache primitives
+
                         for glyph_index in 0..prim_metadata.gpu_data_count {
-                            self.text_run_cache_prims.push(PrimitiveInstance {
-                                global_prim_id: prim_index.0 as i32,
-                                prim_address: prim_metadata.gpu_prim_index,
-                                task_index: render_tasks.get_task_index(&task.id, pass_index).0 as i32,
-                                clip_task_index: 0,
-                                layer_index: 0,
-                                sub_index: prim_metadata.gpu_data_address.0 + glyph_index,
-                                user_data: [ text.resource_address.0 + glyph_index, 0],
-                                z_sort_index: 0,        // z is disabled for rendering cache primitives
-                            });
+                            self.text_run_cache_prims.push(instance.build(prim_metadata.gpu_data_address.0 + glyph_index,
+                                                                          text.resource_address.0 + glyph_index));
                         }
                     }
                     _ => {
                         // No other primitives make use of primitive caching yet!
                         unreachable!()
                     }
                 }
             }
@@ -1284,37 +1260,117 @@ pub struct BlurCommand {
 #[derive(Clone, Copy, Debug)]
 pub struct CacheClipInstance {
     task_id: i32,
     layer_index: i32,
     address: GpuStoreAddress,
     segment: i32,
 }
 
+// 32 bytes per instance should be enough for anyone!
 #[derive(Debug, Clone)]
 pub struct PrimitiveInstance {
-    global_prim_id: i32,
-    prim_address: GpuStoreAddress,
+    data: [i32; 8],
+}
+
+struct SimplePrimitiveInstance {
+    pub global_prim_index: i32,
+    pub specific_prim_address: i32,
     pub task_index: i32,
-    clip_task_index: i32,
-    layer_index: i32,
-    sub_index: i32,
-    z_sort_index: i32,
-    pub user_data: [i32; 2],
+    pub clip_task_index: i32,
+    pub layer_index: i32,
+    pub z_sort_index: i32,
+}
+
+impl SimplePrimitiveInstance {
+    fn new(prim_index: PrimitiveIndex,
+           specific_prim_address: GpuStoreAddress,
+           task_index: RenderTaskIndex,
+           clip_task_index: RenderTaskIndex,
+           layer_index: PackedLayerIndex,
+           z_sort_index: i32) -> SimplePrimitiveInstance {
+        SimplePrimitiveInstance {
+            global_prim_index: prim_index.0 as i32,
+            specific_prim_address: specific_prim_address.0 as i32,
+            task_index: task_index.0 as i32,
+            clip_task_index: clip_task_index.0 as i32,
+            layer_index: layer_index.0 as i32,
+            z_sort_index: z_sort_index,
+        }
+    }
+
+    fn build(&self, data0: i32, data1: i32) -> PrimitiveInstance {
+        PrimitiveInstance {
+            data: [
+                self.global_prim_index,
+                self.specific_prim_address,
+                self.task_index,
+                self.clip_task_index,
+                self.layer_index,
+                self.z_sort_index,
+                data0,
+                data1,
+            ]
+        }
+    }
 }
 
-impl PrimitiveInstance {
-    pub fn build(&self,
-                 sub_index: i32,
-                 user_data0: i32,
-                 user_data1: i32) -> PrimitiveInstance {
+pub struct CompositePrimitiveInstance {
+    pub task_index: RenderTaskIndex,
+    pub src_task_index: RenderTaskIndex,
+    pub backdrop_task_index: RenderTaskIndex,
+    pub data0: i32,
+    pub data1: i32,
+    pub z: i32,
+}
+
+impl CompositePrimitiveInstance {
+    fn new(task_index: RenderTaskIndex,
+           src_task_index: RenderTaskIndex,
+           backdrop_task_index: RenderTaskIndex,
+           data0: i32,
+           data1: i32,
+           z: i32) -> CompositePrimitiveInstance {
+        CompositePrimitiveInstance {
+            task_index: task_index,
+            src_task_index: src_task_index,
+            backdrop_task_index: backdrop_task_index,
+            data0: data0,
+            data1: data1,
+            z: z,
+        }
+    }
+}
+
+impl From<CompositePrimitiveInstance> for PrimitiveInstance {
+    fn from(instance: CompositePrimitiveInstance) -> PrimitiveInstance {
         PrimitiveInstance {
-            sub_index: sub_index,
-            user_data: [user_data0, user_data1],
-            ..*self
+            data: [
+                instance.task_index.0 as i32,
+                instance.src_task_index.0 as i32,
+                instance.backdrop_task_index.0 as i32,
+                instance.z,
+                instance.data0,
+                instance.data1,
+                0,
+                0,
+            ]
+        }
+    }
+}
+
+impl<'a> From<&'a PrimitiveInstance> for CompositePrimitiveInstance {
+    fn from(instance: &'a PrimitiveInstance) -> CompositePrimitiveInstance {
+        CompositePrimitiveInstance {
+            task_index: RenderTaskIndex(instance.data[0] as usize),
+            src_task_index: RenderTaskIndex(instance.data[1] as usize),
+            backdrop_task_index: RenderTaskIndex(instance.data[2] as usize),
+            z: instance.data[3],
+            data0: instance.data[4],
+            data1: instance.data[5],
         }
     }
 }
 
 #[derive(Debug)]
 pub struct PrimitiveBatch {
     pub key: AlphaBatchKey,
     pub instances: Vec<PrimitiveInstance>,
@@ -1452,26 +1508,24 @@ impl ClipScrollGroup {
 }
 
 #[derive(Debug, Clone)]
 #[repr(C)]
 pub struct PackedLayer {
     pub transform: LayerToWorldTransform,
     pub inv_transform: WorldToLayerTransform,
     pub local_clip_rect: LayerRect,
-    pub screen_vertices: [WorldPoint4D; 4],
 }
 
 impl Default for PackedLayer {
     fn default() -> PackedLayer {
         PackedLayer {
             transform: LayerToWorldTransform::identity(),
             inv_transform: WorldToLayerTransform::identity(),
             local_clip_rect: LayerRect::zero(),
-            screen_vertices: [WorldPoint4D::zero(); 4],
         }
     }
 }
 
 impl PackedLayer {
     pub fn empty() -> PackedLayer {
         Default::default()
     }
@@ -1483,17 +1537,16 @@ impl PackedLayer {
 
     pub fn set_rect(&mut self,
                     local_rect: &LayerRect,
                     screen_rect: &DeviceIntRect,
                     device_pixel_ratio: f32)
                     -> Option<(TransformedRectKind, DeviceIntRect)> {
         let xf_rect = TransformedRect::new(&local_rect, &self.transform, device_pixel_ratio);
         xf_rect.bounding_rect.intersection(screen_rect).map(|rect| {
-            self.screen_vertices = xf_rect.vertices.clone();
             self.local_clip_rect = *local_rect;
             (xf_rect.kind, rect)
         })
     }
 }
 
 #[derive(Debug, Clone)]
 pub struct CompositeOps {
--- a/gfx/webrender_bindings/src/bindings.rs
+++ b/gfx/webrender_bindings/src/bindings.rs
@@ -2,17 +2,17 @@ use std::collections::HashSet;
 use std::ffi::CString;
 use std::{mem, slice};
 use std::path::PathBuf;
 use std::os::raw::{c_void, c_char};
 use std::collections::HashMap;
 use gleam::gl;
 
 use webrender_traits::*;
-use webrender::renderer::{Renderer, RendererOptions};
+use webrender::renderer::{ReadPixelsFormat, Renderer, RendererOptions};
 use webrender::renderer::{ExternalImage, ExternalImageHandler, ExternalImageSource};
 use webrender::{ApiRecordingReceiver, BinaryRecorder};
 use app_units::Au;
 use euclid::{TypedPoint2D, TypedSize2D, TypedRect, TypedMatrix4D, SideOffsets2D};
 
 extern crate webrender_traits;
 
 // Enables binary recording that can be used with `wrench replay`
@@ -65,26 +65,16 @@ impl Into<ExternalImageId> for WrExterna
     }
 }
 impl Into<WrExternalImageId> for ExternalImageId {
     fn into(self) -> WrExternalImageId {
         WrExternalImageId(self.0)
     }
 }
 
-const GL_FORMAT_BGRA_GL: gl::GLuint = gl::BGRA;
-const GL_FORMAT_BGRA_GLES: gl::GLuint = gl::BGRA_EXT;
-
-fn get_gl_format_bgra(gl: &gl::Gl) -> gl::GLuint {
-    match gl.get_type() {
-        gl::GlType::Gl => GL_FORMAT_BGRA_GL,
-        gl::GlType::Gles => GL_FORMAT_BGRA_GLES,
-    }
-}
-
 fn make_slice<'a, T>(ptr: *const T, len: usize) -> &'a [T] {
     if ptr.is_null() {
         &[]
     } else {
         unsafe { slice::from_raw_parts(ptr, len) }
     }
 }
 
@@ -785,27 +775,22 @@ pub extern "C" fn wr_renderer_render(ren
 #[no_mangle]
 pub unsafe extern "C" fn wr_renderer_readback(renderer: &mut WrRenderer,
                                               width: u32,
                                               height: u32,
                                               dst_buffer: *mut u8,
                                               buffer_size: usize) {
     assert!(is_in_render_thread());
 
-    renderer.gl().flush();
-
     let mut slice = make_slice_mut(dst_buffer, buffer_size);
-    renderer.gl()
-            .read_pixels_into_buffer(0,
-                                     0,
-                                     width as gl::GLsizei,
-                                     height as gl::GLsizei,
-                                     get_gl_format_bgra(renderer.gl()),
-                                     gl::UNSIGNED_BYTE,
-                                     slice);
+    renderer.read_pixels_into(DeviceUintRect::new(
+                                DeviceUintPoint::new(0, 0),
+                                DeviceUintSize::new(width, height)),
+                              ReadPixelsFormat::Bgra8,
+                              &mut slice);
 }
 
 #[no_mangle]
 pub extern "C" fn wr_renderer_set_profiler_enabled(renderer: &mut WrRenderer,
                                                    enabled: bool) {
     renderer.set_profiler_enabled(enabled);
 }