Bug 1536732 - Add experimental pixel-local-storage render path to WR r=kvark
author: Glenn Watson <github@intuitionlibrary.com>
Thu, 21 Mar 2019 02:59:04 +0000
changeset: 465322 47f7db4203c6c3184a8d802ad388d0d0af4ede6b
parent: 465321 a423f146621147ee1b6d3b18629e950148e529c0
child: 465323 104d3e54d7ab9a54c6685b1b84b6ccafa238c1ff
push id: 35736
push user: ncsoregi@mozilla.com
push date: Thu, 21 Mar 2019 10:40:08 +0000
treeherder: mozilla-central@5cac2c92926e [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: kvark
bugs: 1536732
milestone: 68.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1536732 - Add experimental pixel-local-storage render path to WR r=kvark Add an experimental code path that makes use of the pixel local storage extension available on many mobile GPUs. This code path is currently disabled by default, as the support is not complete for all primitives and blend modes. The initial aim is to get feature parity with the existing renderer. Once that's complete, we can take advantage of the (minimum) 12 bytes per pixel of high speed on-tile memory to store custom data. Clip masks are a good use case for this, since they map 1:1 with the position of the fragment they are clipping. Using this for clip masks allows us to handle clipping on mobile GPUs in a much more efficient way - we can skip (a) separate render targets, (b) the target resolve, and (c) sampling the mask texture during rendering. Depends on D24123 Differential Revision: https://phabricator.services.mozilla.com/D24124
gfx/webrender_bindings/src/bindings.rs
gfx/wr/webrender/res/brush.glsl
gfx/wr/webrender/res/pls_init.glsl
gfx/wr/webrender/res/pls_resolve.glsl
gfx/wr/webrender/res/ps_split_composite.glsl
gfx/wr/webrender/res/ps_text_run.glsl
gfx/wr/webrender/res/shared.glsl
gfx/wr/webrender/src/batch.rs
gfx/wr/webrender/src/device/gl.rs
gfx/wr/webrender/src/gpu_types.rs
gfx/wr/webrender/src/renderer.rs
gfx/wr/webrender/src/shade.rs
gfx/wr/webrender/src/tiling.rs
gfx/wr/wrench/src/main.rs
--- a/gfx/webrender_bindings/src/bindings.rs
+++ b/gfx/webrender_bindings/src/bindings.rs
@@ -1036,17 +1036,17 @@ fn wr_device_new(gl_context: *mut c_void
         }
     };
 
     let cached_programs = match pc {
       Some(cached_programs) => Some(Rc::clone(cached_programs.rc_get())),
       None => None,
     };
 
-    Device::new(gl, resource_override_path, upload_method, cached_programs)
+    Device::new(gl, resource_override_path, upload_method, cached_programs, false)
 }
 
 // Call MakeCurrent before this.
 #[no_mangle]
 pub extern "C" fn wr_window_new(window_id: WrWindowId,
                                 window_width: i32,
                                 window_height: i32,
                                 support_low_priority_transactions: bool,
@@ -1129,16 +1129,17 @@ pub extern "C" fn wr_window_new(window_i
         upload_method,
         scene_builder_hooks: Some(Box::new(APZCallbacks::new(window_id))),
         sampler: Some(Box::new(SamplerCallback::new(window_id))),
         max_texture_size: Some(8192), // Moz2D doesn't like textures bigger than this
         clear_color: Some(ColorF::new(0.0, 0.0, 0.0, 0.0)),
         precache_flags,
         namespace_alloc_by_client: true,
         enable_picture_caching,
+        allow_pixel_local_storage_support: false,
         ..Default::default()
     };
 
     // Ensure the WR profiler callbacks are hooked up to the Gecko profiler.
     set_profiler_hooks(Some(&PROFILER_HOOKS));
 
     let notifier = Box::new(CppNotifier {
         window_id: window_id,
@@ -2931,16 +2932,17 @@ pub extern "C" fn wr_shaders_new(gl_cont
     let precache_flags = if env_var_to_bool("MOZ_WR_PRECACHE_SHADERS") {
         ShaderPrecacheFlags::FULL_COMPILE
     } else {
         ShaderPrecacheFlags::ASYNC_COMPILE
     };
 
     let opts = RendererOptions {
         precache_flags,
+        allow_pixel_local_storage_support: false,
         ..Default::default()
     };
 
     let gl_type = device.gl().get_type();
     device.begin_frame();
 
     let shaders = Rc::new(RefCell::new(match Shaders::new(&mut device, gl_type, &opts) {
         Ok(shaders) => shaders,
--- a/gfx/wr/webrender/res/brush.glsl
+++ b/gfx/wr/webrender/res/brush.glsl
@@ -151,13 +151,12 @@ void main(void) {
 
     frag.color *= clip_alpha;
 
     #ifdef WR_FEATURE_DUAL_SOURCE_BLENDING
         oFragBlend = frag.blend * clip_alpha;
     #endif
 #endif
 
-    // TODO(gw): Handle pre-multiply common code here as required.
-    oFragColor = frag.color;
+    write_output(frag.color);
 #endif
 }
 #endif
new file mode 100644
--- /dev/null
+++ b/gfx/wr/webrender/res/pls_init.glsl
@@ -0,0 +1,27 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// Initialize the pixel local storage area by reading the current
+// framebuffer color. We might be able to skip this in future by
+// making the opaque pass also write to pixel local storage.
+
+#define PLS_WRITEONLY
+
+#include shared
+
+#ifdef WR_VERTEX_SHADER
+in vec4 aRect;
+
+void main(void) {
+    vec2 pos = aRect.xy + aPosition.xy * aRect.zw;
+    gl_Position = uTransform * vec4(pos, 0.0, 1.0);
+}
+#endif
+
+#ifdef WR_FRAGMENT_SHADER
+void main(void) {
+    // Store current framebuffer color in our custom PLS struct.
+    PLS.color = gl_LastFragColorARM;
+}
+#endif
new file mode 100644
--- /dev/null
+++ b/gfx/wr/webrender/res/pls_resolve.glsl
@@ -0,0 +1,29 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// Write the final value stored in pixel local store out to normal
+// fragment outputs. This will be the color that gets resolved out
+// to main memory.
+
+#define PLS_READONLY
+
+#include shared
+
+#ifdef WR_VERTEX_SHADER
+in vec4 aRect;
+
+void main(void) {
+    vec2 pos = aRect.xy + aPosition.xy * aRect.zw;
+    gl_Position = uTransform * vec4(pos, 0.0, 1.0);
+}
+#endif
+
+#ifdef WR_FRAGMENT_SHADER
+out vec4 oFragColor;
+
+void main(void) {
+    // Write the final color value in pixel local storage out as a fragment color.
+    oFragColor = PLS.color;
+}
+#endif
--- a/gfx/wr/webrender/res/ps_split_composite.glsl
+++ b/gfx/wr/webrender/res/ps_split_composite.glsl
@@ -107,11 +107,11 @@ void main(void) {
 }
 #endif
 
 #ifdef WR_FRAGMENT_SHADER
 void main(void) {
     float alpha = do_clip();
     float perspective_divisor = mix(gl_FragCoord.w, 1.0, vLayerAndPerspective.y);
     vec2 uv = clamp(vUv * perspective_divisor, vUvSampleBounds.xy, vUvSampleBounds.zw);
-    oFragColor = alpha * textureLod(sPrevPassColor, vec3(uv, vLayerAndPerspective.x), 0.0);
+    write_output(alpha * textureLod(sPrevPassColor, vec3(uv, vLayerAndPerspective.x), 0.0));
 }
 #endif
--- a/gfx/wr/webrender/res/ps_text_run.glsl
+++ b/gfx/wr/webrender/res/ps_text_run.glsl
@@ -295,12 +295,12 @@ void main(void) {
 
 #if defined(WR_FEATURE_DEBUG_OVERDRAW)
     oFragColor = WR_DEBUG_OVERDRAW_COLOR;
 #elif defined(WR_FEATURE_DUAL_SOURCE_BLENDING)
     vec4 alpha_mask = mask * alpha;
     oFragColor = vColor * alpha_mask;
     oFragBlend = alpha_mask * vColor.a;
 #else
-    oFragColor = vColor * mask * alpha;
+    write_output(vColor * mask * alpha);
 #endif
 }
 #endif
--- a/gfx/wr/webrender/res/shared.glsl
+++ b/gfx/wr/webrender/res/shared.glsl
@@ -1,12 +1,22 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
+#ifdef WR_FEATURE_PIXEL_LOCAL_STORAGE
+// For now, we need both extensions here, in order to initialize
+// the PLS to the current framebuffer color. In future, we can
+// possibly remove that requirement, or at least support the
+// other framebuffer fetch extensions that provide the same
+// functionality.
+#extension GL_EXT_shader_pixel_local_storage : require
+#extension GL_ARM_shader_framebuffer_fetch : require
+#endif
+
 #ifdef WR_FEATURE_TEXTURE_EXTERNAL
 // Please check https://www.khronos.org/registry/OpenGL/extensions/OES/OES_EGL_image_external_essl3.txt
 // for this extension.
 #extension GL_OES_EGL_image_external_essl3 : require
 #endif
 
 #ifdef WR_FEATURE_DUAL_SOURCE_BLENDING
 #extension GL_ARB_explicit_attrib_location : require
@@ -43,22 +53,72 @@
 #endif
 
 //======================================================================================
 // Fragment shader attributes and uniforms
 //======================================================================================
 #ifdef WR_FRAGMENT_SHADER
     // Uniform inputs
 
-    // Fragment shader outputs
-    #ifdef WR_FEATURE_DUAL_SOURCE_BLENDING
-        layout(location = 0, index = 0) out vec4 oFragColor;
-        layout(location = 0, index = 1) out vec4 oFragBlend;
+    #ifdef WR_FEATURE_PIXEL_LOCAL_STORAGE
+        // Define the storage class of the pixel local storage.
+        // If defined as writable, it's a compile time error to
+        // have a normal fragment output variable declared.
+        #if defined(PLS_READONLY)
+            #define PLS_BLOCK __pixel_local_inEXT
+        #elif defined(PLS_WRITEONLY)
+            #define PLS_BLOCK __pixel_local_outEXT
+        #else
+            #define PLS_BLOCK __pixel_localEXT
+        #endif
+
+        // The structure of pixel local storage. Right now, it's
+        // just the current framebuffer color. In future, we have
+        // (at least) 12 bytes of space we can store extra info
+        // here (such as clip mask values).
+        PLS_BLOCK FrameBuffer {
+            layout(rgba8) highp vec4 color;
+        } PLS;
+
+        #ifndef PLS_READONLY
+        // Write the output of a fragment shader to PLS. Applies
+        // premultiplied alpha blending by default, since the blender
+        // is disabled when PLS is active.
+        // TODO(gw): Properly support alpha blend mode for webgl / canvas.
+        void write_output(vec4 color) {
+            PLS.color = color + PLS.color * (1.0 - color.a);
+        }
+
+        // Write a raw value straight to PLS, if the fragment shader has
+        // already applied blending.
+        void write_output_raw(vec4 color) {
+            PLS.color = color;
+        }
+        #endif
+
+        #ifndef PLS_WRITEONLY
+        // Retrieve the current framebuffer color. Useful in conjunction with
+        // the write_output_raw function.
+        vec4 get_current_framebuffer_color() {
+            return PLS.color;
+        }
+        #endif
     #else
-        out vec4 oFragColor;
+        // Fragment shader outputs
+        #ifdef WR_FEATURE_DUAL_SOURCE_BLENDING
+            layout(location = 0, index = 0) out vec4 oFragColor;
+            layout(location = 0, index = 1) out vec4 oFragBlend;
+        #else
+            out vec4 oFragColor;
+        #endif
+
+        // Write an output color in normal (non-PLS) shaders.
+        void write_output(vec4 color) {
+            oFragColor = color;
+        }
     #endif
 
     #define EPSILON                     0.0001
 
     // "Show Overdraw" color. Premultiplied.
     #define WR_DEBUG_OVERDRAW_COLOR     vec4(0.110, 0.077, 0.027, 0.125)
 
     float distance_to_line(vec2 p0, vec2 perp_dir, vec2 p) {
--- a/gfx/wr/webrender/src/batch.rs
+++ b/gfx/wr/webrender/src/batch.rs
@@ -391,38 +391,44 @@ pub struct AlphaBatchContainer {
     pub alpha_batches: Vec<PrimitiveBatch>,
     /// The overall scissor rect for this render task, if one
     /// is required.
     pub task_scissor_rect: Option<DeviceIntRect>,
     /// A list of rectangle regions this batch should be drawn
     /// in. Each region will have scissor rect set before drawing.
     pub regions: Vec<DeviceIntRect>,
     pub tile_blits: Vec<TileBlit>,
+    /// The rectangle of the owning render target that this
+    /// set of batches affects.
+    pub task_rect: DeviceIntRect,
 }
 
 impl AlphaBatchContainer {
     pub fn new(
         task_scissor_rect: Option<DeviceIntRect>,
         regions: Vec<DeviceIntRect>,
     ) -> AlphaBatchContainer {
         AlphaBatchContainer {
             opaque_batches: Vec::new(),
             alpha_batches: Vec::new(),
             task_scissor_rect,
             regions,
             tile_blits: Vec::new(),
+            task_rect: DeviceIntRect::zero(),
         }
     }
 
     pub fn is_empty(&self) -> bool {
         self.opaque_batches.is_empty() &&
         self.alpha_batches.is_empty()
     }
 
-    fn merge(&mut self, batch_list: BatchList) {
+    fn merge(&mut self, batch_list: BatchList, task_rect: &DeviceIntRect) {
+        self.task_rect = self.task_rect.union(task_rect);
+
         for other_batch in batch_list.opaque_batch_list.batches {
             let batch_index = self.opaque_batches.iter().position(|batch| {
                 batch.key.is_compatible_with(&other_batch.key)
             });
 
             match batch_index {
                 Some(batch_index) => {
                     self.opaque_batches[batch_index].instances.extend(other_batch.instances);
@@ -512,33 +518,35 @@ impl AlphaBatchBuilder {
         self.task_scissor_rect.is_none() &&
         self.batch_lists.len() == 1
     }
 
     pub fn build(
         mut self,
         batch_containers: &mut Vec<AlphaBatchContainer>,
         merged_batches: &mut AlphaBatchContainer,
+        task_rect: DeviceIntRect,
     ) {
         for batch_list in &mut self.batch_lists {
             batch_list.finalize();
         }
 
         if self.can_merge() {
             let batch_list = self.batch_lists.pop().unwrap();
             debug_assert!(batch_list.tile_blits.is_empty());
-            merged_batches.merge(batch_list);
+            merged_batches.merge(batch_list, &task_rect);
         } else {
             for batch_list in self.batch_lists {
                 batch_containers.push(AlphaBatchContainer {
                     alpha_batches: batch_list.alpha_batch_list.batches,
                     opaque_batches: batch_list.opaque_batch_list.batches,
                     task_scissor_rect: self.task_scissor_rect,
                     regions: batch_list.regions,
                     tile_blits: batch_list.tile_blits,
+                    task_rect,
                 });
             }
         }
     }
 
     pub fn add_pic_to_batch(
         &mut self,
         pic: &PicturePrimitive,
--- a/gfx/wr/webrender/src/device/gl.rs
+++ b/gfx/wr/webrender/src/device/gl.rs
@@ -880,17 +880,20 @@ impl UniformLocation {
 pub struct Capabilities {
     pub supports_multisampling: bool,
     /// Whether the function glCopyImageSubData is available.
     pub supports_copy_image_sub_data: bool,
     /// Whether we are able to use glBlitFramebuffers with the draw fbo
     /// bound to a non-0th layer of a texture array. This is buggy on
     /// Adreno devices.
     pub supports_blit_to_texture_array: bool,
-
+    /// Whether we can use the pixel local storage functionality that
+    /// is available on some mobile GPUs. This allows fast access to
+    /// the per-pixel tile memory.
+    pub supports_pixel_local_storage: bool,
 }
 
 #[derive(Clone, Debug)]
 pub enum ShaderError {
     Compilation(String, String), // name, error message
     Link(String, String),        // name, error message
 }
 
@@ -1109,16 +1112,17 @@ impl<'a> From<DrawTarget<'a>> for ReadTa
 }
 
 impl Device {
     pub fn new(
         mut gl: Rc<gl::Gl>,
         resource_override_path: Option<PathBuf>,
         upload_method: UploadMethod,
         cached_programs: Option<Rc<ProgramCache>>,
+        allow_pixel_local_storage_support: bool,
     ) -> Device {
         // On debug builds, assert that each GL call is error-free. We don't do
         // this on release builds because the synchronous call can stall the
         // pipeline.
         if cfg!(debug_assertions) {
             gl = gl::ErrorReactingGl::wrap(gl, |gl, name, code| {
                 Self::echo_driver_messages(gl);
                 panic!("Caught GL error {:x} at {}", code, name);
@@ -1224,16 +1228,27 @@ impl Device {
 
         let supports_copy_image_sub_data = supports_extension(&extensions, "GL_EXT_copy_image") ||
             supports_extension(&extensions, "GL_ARB_copy_image");
 
         // Due to a bug on Adreno devices, blitting to an fbo bound to
         // a non-0th layer of a texture array is not supported.
         let supports_blit_to_texture_array = !renderer_name.starts_with("Adreno");
 
+        // Check if the device supports the two extensions needed in order to use
+        // pixel local storage.
+        // TODO(gw): Consider if we can remove fb fetch / init, by using PLS for opaque pass too.
+        // TODO(gw): Support EXT_shader_framebuffer_fetch as well.
+        let ext_pixel_local_storage = supports_extension(&extensions, "GL_EXT_shader_pixel_local_storage");
+        let ext_framebuffer_fetch = supports_extension(&extensions, "GL_ARM_shader_framebuffer_fetch");
+        let supports_pixel_local_storage =
+            allow_pixel_local_storage_support &&
+            ext_framebuffer_fetch &&
+            ext_pixel_local_storage;
+
         // On Adreno GPUs PBO texture upload is only performed asynchronously
         // if the stride of the data in the PBO is a multiple of 256 bytes.
         // Other platforms may have similar requirements and should be added
         // here.
         // The default value should be 4.
         let optimal_pbo_stride = if renderer_name.contains("Adreno") {
             NonZeroUsize::new(256).unwrap()
         } else {
@@ -1246,16 +1261,17 @@ impl Device {
             resource_override_path,
             upload_method,
             inside_frame: false,
 
             capabilities: Capabilities {
                 supports_multisampling: false, //TODO
                 supports_copy_image_sub_data,
                 supports_blit_to_texture_array,
+                supports_pixel_local_storage,
             },
 
             bgra_format_internal,
             bgra_format_external,
 
             depth_targets: FastHashMap::default(),
 
             bound_textures: [0; 16],
@@ -2945,16 +2961,28 @@ impl Device {
         self.gl.blend_func(gl::ONE, gl::ONE_MINUS_SRC_ALPHA);
         self.gl.blend_equation(gl::FUNC_ADD);
     }
 
     pub fn supports_extension(&self, extension: &str) -> bool {
         supports_extension(&self.extensions, extension)
     }
 
+    /// Enable the pixel local storage functionality. Caller must
+    /// have already confirmed the device supports this.
+    pub fn enable_pixel_local_storage(&mut self, enable: bool) {
+        debug_assert!(self.capabilities.supports_pixel_local_storage);
+
+        if enable {
+            self.gl.enable(gl::SHADER_PIXEL_LOCAL_STORAGE_EXT);
+        } else {
+            self.gl.disable(gl::SHADER_PIXEL_LOCAL_STORAGE_EXT);
+        }
+    }
+
     pub fn echo_driver_messages(gl: &gl::Gl) {
         for msg in gl.get_debug_messages() {
             let level = match msg.severity {
                 gl::DEBUG_SEVERITY_HIGH => Level::Error,
                 gl::DEBUG_SEVERITY_MEDIUM => Level::Warn,
                 gl::DEBUG_SEVERITY_LOW => Level::Info,
                 gl::DEBUG_SEVERITY_NOTIFICATION => Level::Debug,
                 _ => Level::Trace,
--- a/gfx/wr/webrender/src/gpu_types.rs
+++ b/gfx/wr/webrender/src/gpu_types.rs
@@ -159,16 +159,36 @@ pub struct ClipMaskBorderCornerDotDash {
 // 16 bytes per instance should be enough for anyone!
 #[derive(Debug, Clone)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct PrimitiveInstanceData {
     data: [i32; 4],
 }
 
+/// Vertex format for resolve style operations with pixel local storage.
+#[derive(Debug, Clone)]
+#[repr(C)]
+pub struct ResolveInstanceData {
+    rect: [f32; 4],
+}
+
+impl ResolveInstanceData {
+    pub fn new(rect: DeviceIntRect) -> Self {
+        ResolveInstanceData {
+            rect: [
+                rect.origin.x as f32,
+                rect.origin.y as f32,
+                rect.size.width as f32,
+                rect.size.height as f32,
+            ],
+        }
+    }
+}
+
 #[derive(Debug, Copy, Clone)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct PrimitiveHeaderIndex(pub i32);
 
 #[derive(Debug)]
 #[repr(C)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
--- a/gfx/wr/webrender/src/renderer.rs
+++ b/gfx/wr/webrender/src/renderer.rs
@@ -59,17 +59,17 @@ use euclid::rect;
 use euclid::{Transform3D, TypedScale};
 use frame_builder::{ChasePrimitive, FrameBuilderConfig};
 use gleam::gl;
 use glyph_rasterizer::{GlyphFormat, GlyphRasterizer};
 use gpu_cache::{GpuBlockData, GpuCacheUpdate, GpuCacheUpdateList};
 use gpu_cache::{GpuCacheDebugChunk, GpuCacheDebugCmd};
 #[cfg(feature = "pathfinder")]
 use gpu_glyph_renderer::GpuGlyphRenderer;
-use gpu_types::{PrimitiveHeaderI, PrimitiveHeaderF, ScalingInstance, TransformData};
+use gpu_types::{PrimitiveHeaderI, PrimitiveHeaderF, ScalingInstance, TransformData, ResolveInstanceData};
 use internal_types::{TextureSource, ORTHO_FAR_PLANE, ORTHO_NEAR_PLANE, ResourceCacheError};
 use internal_types::{CacheTextureId, DebugOutput, FastHashMap, LayerIndex, RenderedDocument, ResultMsg};
 use internal_types::{TextureCacheAllocationKind, TextureCacheUpdate, TextureUpdateList, TextureUpdateSource};
 use internal_types::{RenderTargetInfo, SavedTargetIndex};
 use malloc_size_of::MallocSizeOfOps;
 use picture::{RecordedDirtyRegion, TileCache};
 use prim_store::DeferredResolve;
 use profiler::{BackendProfileCounters, FrameProfileCounters, TimeProfileCounter,
@@ -642,16 +642,33 @@ pub(crate) mod desc {
                 name: "aValue",
                 count: 4,
                 kind: VertexAttributeKind::F32,
             },
         ],
         instance_attributes: &[],
     };
 
+    pub const RESOLVE: VertexDescriptor = VertexDescriptor {
+        vertex_attributes: &[
+            VertexAttribute {
+                name: "aPosition",
+                count: 2,
+                kind: VertexAttributeKind::F32,
+            },
+        ],
+        instance_attributes: &[
+            VertexAttribute {
+                name: "aRect",
+                count: 4,
+                kind: VertexAttributeKind::F32,
+            },
+        ],
+    };
+
     pub const VECTOR_STENCIL: VertexDescriptor = VertexDescriptor {
         vertex_attributes: &[
             VertexAttribute {
                 name: "aPosition",
                 count: 2,
                 kind: VertexAttributeKind::F32,
             },
         ],
@@ -738,16 +755,17 @@ pub(crate) enum VertexArrayKind {
     Blur,
     Clip,
     VectorStencil,
     VectorCover,
     Border,
     Scale,
     LineDecoration,
     Gradient,
+    Resolve,
 }
 
 #[derive(Clone, Debug, PartialEq)]
 pub enum GraphicsApi {
     OpenGL,
 }
 
 #[derive(Clone, Debug)]
@@ -1569,16 +1587,17 @@ impl LazyInitializedDebugRenderer {
 pub struct RendererVAOs {
     prim_vao: VAO,
     blur_vao: VAO,
     clip_vao: VAO,
     border_vao: VAO,
     line_vao: VAO,
     scale_vao: VAO,
     gradient_vao: VAO,
+    resolve_vao: VAO,
 }
 
 /// The renderer is responsible for submitting to the GPU the work prepared by the
 /// RenderBackend.
 ///
 /// We have a separate `Renderer` instance for each instance of WebRender (generally
 /// one per OS window), and all instances share the same thread.
 pub struct Renderer {
@@ -1739,19 +1758,24 @@ impl Renderer {
 
         let debug_server = DebugServer::new(api_tx.clone());
 
         let mut device = Device::new(
             gl,
             options.resource_override_path.clone(),
             options.upload_method.clone(),
             options.cached_programs.take(),
+            options.allow_pixel_local_storage_support,
         );
 
-        let ext_dual_source_blending = !options.disable_dual_source_blending &&
+        let ext_dual_source_blending =
+            !options.disable_dual_source_blending &&
+            // If using pixel local storage, subpixel AA isn't supported (we disable it on all
+            // mobile devices explicitly anyway).
+            !device.get_capabilities().supports_pixel_local_storage &&
             device.supports_extension("GL_ARB_blend_func_extended") &&
             device.supports_extension("GL_ARB_explicit_attrib_location");
 
         // 512 is the minimum that the texture cache can work with.
         const MIN_TEXTURE_SIZE: i32 = 512;
         if let Some(user_limit) = options.max_texture_size {
             assert!(user_limit >= MIN_TEXTURE_SIZE);
             device.clamp_max_texture_size(user_limit);
@@ -1886,16 +1910,17 @@ impl Renderer {
                                                             options.precache_flags));
 
         let blur_vao = device.create_vao_with_new_instances(&desc::BLUR, &prim_vao);
         let clip_vao = device.create_vao_with_new_instances(&desc::CLIP, &prim_vao);
         let border_vao = device.create_vao_with_new_instances(&desc::BORDER, &prim_vao);
         let scale_vao = device.create_vao_with_new_instances(&desc::SCALE, &prim_vao);
         let line_vao = device.create_vao_with_new_instances(&desc::LINE, &prim_vao);
         let gradient_vao = device.create_vao_with_new_instances(&desc::GRADIENT, &prim_vao);
+        let resolve_vao = device.create_vao_with_new_instances(&desc::RESOLVE, &prim_vao);
         let texture_cache_upload_pbo = device.create_pbo();
 
         let texture_resolver = TextureResolver::new(&mut device);
 
         let prim_header_f_texture = VertexDataTexture::new(&mut device, ImageFormat::RGBAF32);
         let prim_header_i_texture = VertexDataTexture::new(&mut device, ImageFormat::RGBAI32);
         let transforms_texture = VertexDataTexture::new(&mut device, ImageFormat::RGBAF32);
         let render_task_texture = VertexDataTexture::new(&mut device, ImageFormat::RGBAF32);
@@ -2090,16 +2115,17 @@ impl Renderer {
             gpu_glyph_renderer,
             vaos: RendererVAOs {
                 prim_vao,
                 blur_vao,
                 clip_vao,
                 border_vao,
                 scale_vao,
                 gradient_vao,
+                resolve_vao,
                 line_vao,
             },
             transforms_texture,
             prim_header_i_texture,
             prim_header_f_texture,
             render_task_texture,
             pipeline_info: PipelineInfo::default(),
             dither_matrix_texture,
@@ -2874,16 +2900,17 @@ impl Renderer {
             // See comment for texture_resolver.begin_frame() for explanation
             // of why this must be done after all rendering, including debug
             // overlays. The end_frame() call implicitly calls end_pass(), which
             // should ensure any left over render targets get invalidated and
             // returned to the pool correctly.
             self.texture_resolver.end_frame(&mut self.device, cpu_frame_id);
             self.device.end_frame();
         });
+
         if framebuffer_size.is_some() {
             self.last_time = current_time;
         }
 
         if self.renderer_errors.is_empty() {
             Ok(results)
         } else {
             Err(mem::replace(&mut self.renderer_errors, Vec::new()))
@@ -3522,16 +3549,33 @@ impl Renderer {
             }
 
             if !alpha_batch_container.alpha_batches.is_empty() {
                 let _gl = self.gpu_profile.start_marker("alpha batches");
                 let transparent_sampler = self.gpu_profile.start_sampler(GPU_SAMPLER_TAG_TRANSPARENT);
                 self.set_blend(true, framebuffer_kind);
                 let mut prev_blend_mode = BlendMode::None;
 
+                // If the device supports pixel local storage, initialize the PLS buffer for
+                // the transparent pass. This involves reading the current framebuffer value
+                // and storing that in PLS.
+                // TODO(gw): This is quite expensive and relies on framebuffer fetch being
+                //           available. We can probably switch the opaque pass over to use
+                //           PLS too, and remove this pass completely.
+                if self.device.get_capabilities().supports_pixel_local_storage {
+                    // TODO(gw): If using PLS, the fixed function blender is disabled. It's possible
+                    //           we could take advantage of this by skipping batching on the blend
+                    //           mode in these cases.
+                    self.init_pixel_local_storage(
+                        alpha_batch_container.task_rect,
+                        projection,
+                        stats,
+                    );
+                }
+
                 for batch in &alpha_batch_container.alpha_batches {
                     self.shaders.borrow_mut()
                         .get(&batch.key, self.debug_flags)
                         .bind(
                             &mut self.device, projection,
                             &mut self.renderer_errors,
                         );
 
@@ -3626,16 +3670,27 @@ impl Renderer {
                         }
                     );
 
                     if batch.key.blend_mode == BlendMode::SubpixelWithBgColor {
                         prev_blend_mode = BlendMode::None;
                     }
                 }
 
+                // If the device supports pixel local storage, resolve the PLS values.
+                // This pass reads the final PLS color value, and writes it to a normal
+                // fragment output.
+                if self.device.get_capabilities().supports_pixel_local_storage {
+                    self.resolve_pixel_local_storage(
+                        alpha_batch_container.task_rect,
+                        projection,
+                        stats,
+                    );
+                }
+
                 self.device.disable_depth();
                 self.set_blend(false, framebuffer_kind);
                 self.gpu_profile.finish_sampler(transparent_sampler);
             }
 
             if uses_scissor {
                 self.device.disable_scissor();
             }
@@ -4485,16 +4540,76 @@ impl Renderer {
                 false
             } else {
                 true
             });
 
         frame.has_been_rendered = true;
     }
 
+    /// Initialize the PLS block, by reading the current framebuffer color.
+    pub fn init_pixel_local_storage(
+        &mut self,
+        task_rect: DeviceIntRect,
+        projection: &Transform3D<f32>,
+        stats: &mut RendererStats,
+    ) {
+        self.device.enable_pixel_local_storage(true);
+
+        self.shaders
+            .borrow_mut()
+            .pls_init
+            .bind(
+                &mut self.device,
+                projection,
+                &mut self.renderer_errors,
+            );
+
+        let instances = [
+            ResolveInstanceData::new(task_rect),
+        ];
+
+        self.draw_instanced_batch(
+            &instances,
+            VertexArrayKind::Resolve,
+            &BatchTextures::no_texture(),
+            stats,
+        );
+    }
+
+    /// Resolve the current PLS structure, writing it to a fragment color output.
+    pub fn resolve_pixel_local_storage(
+        &mut self,
+        task_rect: DeviceIntRect,
+        projection: &Transform3D<f32>,
+        stats: &mut RendererStats,
+    ) {
+        self.shaders
+            .borrow_mut()
+            .pls_resolve
+            .bind(
+                &mut self.device,
+                projection,
+                &mut self.renderer_errors,
+            );
+
+        let instances = [
+            ResolveInstanceData::new(task_rect),
+        ];
+
+        self.draw_instanced_batch(
+            &instances,
+            VertexArrayKind::Resolve,
+            &BatchTextures::no_texture(),
+            stats,
+        );
+
+        self.device.enable_pixel_local_storage(false);
+    }
+
     pub fn debug_renderer<'b>(&'b mut self) -> Option<&'b mut DebugRenderer> {
         self.debug.get_mut(&mut self.device)
     }
 
     pub fn get_debug_flags(&self) -> DebugFlags {
         self.debug_flags
     }
 
@@ -4911,16 +5026,17 @@ impl Renderer {
         }
         self.transforms_texture.deinit(&mut self.device);
         self.prim_header_f_texture.deinit(&mut self.device);
         self.prim_header_i_texture.deinit(&mut self.device);
         self.render_task_texture.deinit(&mut self.device);
         self.device.delete_pbo(self.texture_cache_upload_pbo);
         self.texture_resolver.deinit(&mut self.device);
         self.device.delete_vao(self.vaos.prim_vao);
+        self.device.delete_vao(self.vaos.resolve_vao);
         self.device.delete_vao(self.vaos.clip_vao);
         self.device.delete_vao(self.vaos.gradient_vao);
         self.device.delete_vao(self.vaos.blur_vao);
         self.device.delete_vao(self.vaos.line_vao);
         self.device.delete_vao(self.vaos.border_vao);
         self.device.delete_vao(self.vaos.scale_vao);
 
         self.debug.deinit(&mut self.device);
@@ -5186,16 +5302,21 @@ pub struct RendererOptions {
     pub namespace_alloc_by_client: bool,
     pub enable_picture_caching: bool,
     pub testing: bool,
     /// Set to true if this GPU supports hardware fast clears as a performance
     /// optimization. Likely requires benchmarking on various GPUs to see if
     /// it is a performance win. The default is false, which tends to be best
     /// performance on lower end / integrated GPUs.
     pub gpu_supports_fast_clears: bool,
+    /// If true, allow WR to use pixel local storage if the device supports it.
+    /// For now, this defaults to false since the code is still experimental
+    /// and not complete. This option will probably be removed once support is
+    /// complete, and WR can implicitly choose whether to make use of PLS.
+    pub allow_pixel_local_storage_support: bool,
 }
 
 impl Default for RendererOptions {
     fn default() -> Self {
         RendererOptions {
             device_pixel_ratio: 1.0,
             resource_override_path: None,
             enable_aa: true,
@@ -5225,16 +5346,17 @@ impl Default for RendererOptions {
             scene_builder_hooks: None,
             sampler: None,
             chase_primitive: ChasePrimitive::Nothing,
             support_low_priority_transactions: false,
             namespace_alloc_by_client: false,
             enable_picture_caching: false,
             testing: false,
             gpu_supports_fast_clears: false,
+            allow_pixel_local_storage_support: false,
         }
     }
 }
 
 #[cfg(not(feature = "debugger"))]
 pub struct DebugServer; // unit struct, compiled only when the "debugger" feature is disabled
 
 #[cfg(not(feature = "debugger"))]
@@ -5691,16 +5813,17 @@ fn get_vao<'a>(vertex_array_kind: Vertex
         VertexArrayKind::Clip => &vaos.clip_vao,
         VertexArrayKind::Blur => &vaos.blur_vao,
         VertexArrayKind::VectorStencil => &gpu_glyph_renderer.vector_stencil_vao,
         VertexArrayKind::VectorCover => &gpu_glyph_renderer.vector_cover_vao,
         VertexArrayKind::Border => &vaos.border_vao,
         VertexArrayKind::Scale => &vaos.scale_vao,
         VertexArrayKind::LineDecoration => &vaos.line_vao,
         VertexArrayKind::Gradient => &vaos.gradient_vao,
+        VertexArrayKind::Resolve => &vaos.resolve_vao,
     }
 }
 
 #[cfg(not(feature = "pathfinder"))]
 fn get_vao<'a>(vertex_array_kind: VertexArrayKind,
                vaos: &'a RendererVAOs,
                _: &'a GpuGlyphRenderer)
                -> &'a VAO {
@@ -5708,16 +5831,17 @@ fn get_vao<'a>(vertex_array_kind: Vertex
         VertexArrayKind::Primitive => &vaos.prim_vao,
         VertexArrayKind::Clip => &vaos.clip_vao,
         VertexArrayKind::Blur => &vaos.blur_vao,
         VertexArrayKind::VectorStencil | VertexArrayKind::VectorCover => unreachable!(),
         VertexArrayKind::Border => &vaos.border_vao,
         VertexArrayKind::Scale => &vaos.scale_vao,
         VertexArrayKind::LineDecoration => &vaos.line_vao,
         VertexArrayKind::Gradient => &vaos.gradient_vao,
+        VertexArrayKind::Resolve => &vaos.resolve_vao,
     }
 }
 
 #[derive(Clone, Copy, PartialEq)]
 enum FramebufferKind { // NOTE(review): presumably Main = the primary output framebuffer, Other = any other target — confirm at the use sites
     Main,
     Other,
 }
--- a/gfx/wr/webrender/src/shade.rs
+++ b/gfx/wr/webrender/src/shade.rs
@@ -47,27 +47,29 @@ pub const IMAGE_BUFFER_KINDS: [ImageBuff
     ImageBufferKind::Texture2DArray,
 ];
 
 const ALPHA_FEATURE: &str = "ALPHA_PASS";
 const DEBUG_OVERDRAW_FEATURE: &str = "DEBUG_OVERDRAW";
 const DITHERING_FEATURE: &str = "DITHERING";
 const DUAL_SOURCE_FEATURE: &str = "DUAL_SOURCE_BLENDING";
 const FAST_PATH_FEATURE: &str = "FAST_PATH";
+const PIXEL_LOCAL_STORAGE_FEATURE: &str = "PIXEL_LOCAL_STORAGE"; // feature name pushed onto shader feature lists when pixel local storage is in use
 
 pub(crate) enum ShaderKind { // selects how `LazilyCompiledShader` creates its program and which vertex format it links against
     Primitive, // vertex format: VertexArrayKind::Primitive
     Cache(VertexArrayKind), // the carried kind is used directly as the vertex format
     ClipCache, // vertex format: VertexArrayKind::Clip
     Brush, // vertex format: VertexArrayKind::Primitive
     Text, // vertex format: VertexArrayKind::Primitive
     #[allow(dead_code)]
     VectorStencil, // vertex format: VertexArrayKind::VectorStencil
     #[allow(dead_code)]
     VectorCover, // vertex format: VertexArrayKind::VectorCover
+    Resolve, // used by the `pls_init` / `pls_resolve` shaders; vertex format: VertexArrayKind::Resolve
 }
 
 pub struct LazilyCompiledShader { // shader wrapper whose program is created on demand by `get_internal`
     program: Option<Program>, // `get_internal` creates the program when this is `None`
     name: &'static str, // shader name handed to the program-creation helpers
     kind: ShaderKind, // picks the creation path and the vertex format
     features: Vec<&'static str>, // feature names handed to the program-creation helpers
 }
@@ -120,17 +122,17 @@ impl LazilyCompiledShader {
 
     fn get_internal(
         &mut self,
         device: &mut Device,
         precache_flags: ShaderPrecacheFlags,
     ) -> Result<&mut Program, ShaderError> {
         if self.program.is_none() {
             let program = match self.kind {
-                ShaderKind::Primitive | ShaderKind::Brush | ShaderKind::Text => {
+                ShaderKind::Primitive | ShaderKind::Brush | ShaderKind::Text | ShaderKind::Resolve => {
                     create_prim_shader(
                         self.name,
                         device,
                         &self.features,
                     )
                 }
                 ShaderKind::Cache(..) => {
                     create_prim_shader(
@@ -170,28 +172,30 @@ impl LazilyCompiledShader {
             let vertex_format = match self.kind {
                 ShaderKind::Primitive |
                 ShaderKind::Brush |
                 ShaderKind::Text => VertexArrayKind::Primitive,
                 ShaderKind::Cache(format) => format,
                 ShaderKind::VectorStencil => VertexArrayKind::VectorStencil,
                 ShaderKind::VectorCover => VertexArrayKind::VectorCover,
                 ShaderKind::ClipCache => VertexArrayKind::Clip,
+                ShaderKind::Resolve => VertexArrayKind::Resolve,
             };
 
             let vertex_descriptor = match vertex_format {
                 VertexArrayKind::Primitive => &desc::PRIM_INSTANCES,
                 VertexArrayKind::LineDecoration => &desc::LINE,
                 VertexArrayKind::Gradient => &desc::GRADIENT,
                 VertexArrayKind::Blur => &desc::BLUR,
                 VertexArrayKind::Clip => &desc::CLIP,
                 VertexArrayKind::VectorStencil => &desc::VECTOR_STENCIL,
                 VertexArrayKind::VectorCover => &desc::VECTOR_COVER,
                 VertexArrayKind::Border => &desc::BORDER,
                 VertexArrayKind::Scale => &desc::SCALE,
+                VertexArrayKind::Resolve => &desc::RESOLVE,
             };
 
             device.link_program(program, vertex_descriptor)?;
             device.bind_program(program);
             match self.kind {
                 ShaderKind::ClipCache => {
                     device.bind_shader_samplers(
                         &program,
@@ -260,37 +264,44 @@ struct BrushShader {
 
 impl BrushShader {
     fn new(
         name: &'static str,
         device: &mut Device,
         features: &[&'static str],
         precache_flags: ShaderPrecacheFlags,
         dual_source: bool,
+        use_pixel_local_storage: bool,
     ) -> Result<Self, ShaderError> {
         let opaque = LazilyCompiledShader::new(
             ShaderKind::Brush,
             name,
             features,
             device,
             precache_flags,
         )?;
 
         let mut alpha_features = features.to_vec();
         alpha_features.push(ALPHA_FEATURE);
+        if use_pixel_local_storage {
+            alpha_features.push(PIXEL_LOCAL_STORAGE_FEATURE);
+        }
 
         let alpha = LazilyCompiledShader::new(
             ShaderKind::Brush,
             name,
             &alpha_features,
             device,
             precache_flags,
         )?;
 
-        let dual_source = if dual_source {
+        // If using PLS, we disable all subpixel AA implicitly. Subpixel AA is always
+        // disabled on mobile devices anyway, due to uncertainty over the subpixel
+        // layout configuration.
+        let dual_source = if dual_source && !use_pixel_local_storage {
             let mut dual_source_features = alpha_features.to_vec();
             dual_source_features.push(DUAL_SOURCE_FEATURE);
 
             let shader = LazilyCompiledShader::new(
                 ShaderKind::Brush,
                 name,
                 &dual_source_features,
                 device,
@@ -493,71 +504,86 @@ pub struct Shaders {
     // Most draw directly to the framebuffer, but some use inputs
     // from the cache shaders to draw. Specifically, the box
     // shadow primitive shader stretches the box shadow cache
     // output, and the cache_image shader blits the results of
     // a cache shader (e.g. blur) to the screen.
     pub ps_text_run: TextShader,
     pub ps_text_run_dual_source: TextShader,
 
+    // Helper shaders for pixel local storage render paths.
+    // pls_init: Initialize pixel local storage, based on current framebuffer value.
+    // pls_resolve: Convert pixel local storage, writing out to fragment value.
+    pub pls_init: LazilyCompiledShader,
+    pub pls_resolve: LazilyCompiledShader,
+
     ps_split_composite: LazilyCompiledShader,
 }
 
 impl Shaders {
     pub fn new(
         device: &mut Device,
         gl_type: GlType,
         options: &RendererOptions,
     ) -> Result<Self, ShaderError> {
+        let use_pixel_local_storage = device
+            .get_capabilities()
+            .supports_pixel_local_storage;
+
         let brush_solid = BrushShader::new(
             "brush_solid",
             device,
             &[],
             options.precache_flags,
             false,
+            use_pixel_local_storage,
         )?;
 
         let brush_blend = BrushShader::new(
             "brush_blend",
             device,
             &[],
             options.precache_flags,
             false,
+            use_pixel_local_storage,
         )?;
 
         let brush_mix_blend = BrushShader::new(
             "brush_mix_blend",
             device,
             &[],
             options.precache_flags,
             false,
+            use_pixel_local_storage,
         )?;
 
         let brush_radial_gradient = BrushShader::new(
             "brush_radial_gradient",
             device,
             if options.enable_dithering {
                &[DITHERING_FEATURE]
             } else {
                &[]
             },
             options.precache_flags,
             false,
+            use_pixel_local_storage,
         )?;
 
         let brush_linear_gradient = BrushShader::new(
             "brush_linear_gradient",
             device,
             if options.enable_dithering {
                &[DITHERING_FEATURE]
             } else {
                &[]
             },
             options.precache_flags,
             false,
+            use_pixel_local_storage,
         )?;
 
         let cs_blur_a8 = LazilyCompiledShader::new(
             ShaderKind::Cache(VertexArrayKind::Blur),
             "cs_blur",
             &["ALPHA_TARGET"],
             device,
             options.precache_flags,
@@ -598,50 +624,88 @@ impl Shaders {
         let cs_clip_image = LazilyCompiledShader::new(
             ShaderKind::ClipCache,
             "cs_clip_image",
             &[],
             device,
             options.precache_flags,
         )?;
 
+        let pls_precache_flags = if use_pixel_local_storage {
+            options.precache_flags
+        } else {
+            ShaderPrecacheFlags::empty()
+        };
+
+        let pls_init = LazilyCompiledShader::new(
+            ShaderKind::Resolve,
+            "pls_init",
+            &[PIXEL_LOCAL_STORAGE_FEATURE],
+            device,
+            pls_precache_flags,
+        )?;
+
+        let pls_resolve = LazilyCompiledShader::new(
+            ShaderKind::Resolve,
+            "pls_resolve",
+            &[PIXEL_LOCAL_STORAGE_FEATURE],
+            device,
+            pls_precache_flags,
+        )?;
+
         let cs_scale_a8 = LazilyCompiledShader::new(
             ShaderKind::Cache(VertexArrayKind::Scale),
             "cs_scale",
             &["ALPHA_TARGET"],
             device,
             options.precache_flags,
         )?;
 
         let cs_scale_rgba8 = LazilyCompiledShader::new(
             ShaderKind::Cache(VertexArrayKind::Scale),
             "cs_scale",
             &["COLOR_TARGET"],
             device,
             options.precache_flags,
         )?;
 
+        // TODO(gw): The split composite + text shader are special cases - the only
+        //           shaders used during normal scene rendering that aren't a brush
+        //           shader. Perhaps we can unify these in future?
+        let mut extra_features = Vec::new();
+        if use_pixel_local_storage {
+            extra_features.push(PIXEL_LOCAL_STORAGE_FEATURE);
+        }
+
         let ps_text_run = TextShader::new("ps_text_run",
             device,
-            &[],
+            &extra_features,
             options.precache_flags,
         )?;
 
         let dual_source_precache_flags = if options.disable_dual_source_blending {
             ShaderPrecacheFlags::empty()
         } else {
             options.precache_flags
         };
 
         let ps_text_run_dual_source = TextShader::new("ps_text_run",
             device,
             &[DUAL_SOURCE_FEATURE],
             dual_source_precache_flags,
         )?;
 
+        let ps_split_composite = LazilyCompiledShader::new(
+            ShaderKind::Primitive,
+            "ps_split_composite",
+            &extra_features,
+            device,
+            options.precache_flags,
+        )?;
+
         // All image configuration.
         let mut image_features = Vec::new();
         let mut brush_image = Vec::new();
         // PrimitiveShader is not clonable. Use push() to initialize the vec.
         for _ in 0 .. IMAGE_BUFFER_KINDS.len() {
             brush_image.push(None);
         }
         for buffer_kind in 0 .. IMAGE_BUFFER_KINDS.len() {
@@ -651,16 +715,17 @@ impl Shaders {
                     image_features.push(feature_string);
                 }
                 brush_image[buffer_kind] = Some(BrushShader::new(
                     "brush_image",
                     device,
                     &image_features,
                     options.precache_flags,
                     !options.disable_dual_source_blending,
+                    use_pixel_local_storage,
                 )?);
             }
             image_features.clear();
         }
 
         // All yuv_image configuration.
         let mut yuv_features = Vec::new();
         let yuv_shader_num = IMAGE_BUFFER_KINDS.len();
@@ -677,16 +742,17 @@ impl Shaders {
                 }
 
                 let shader = BrushShader::new(
                     "brush_yuv_image",
                     device,
                     &yuv_features,
                     options.precache_flags,
                     false,
+                    use_pixel_local_storage,
                 )?;
                 let index = Self::get_yuv_shader_index(
                     *image_buffer_kind,
                 );
                 brush_yuv_image[index] = Some(shader);
                 yuv_features.clear();
             }
         }
@@ -718,24 +784,16 @@ impl Shaders {
         let cs_border_solid = LazilyCompiledShader::new(
             ShaderKind::Cache(VertexArrayKind::Border),
             "cs_border_solid",
             &[],
             device,
             options.precache_flags,
         )?;
 
-        let ps_split_composite = LazilyCompiledShader::new(
-            ShaderKind::Primitive,
-            "ps_split_composite",
-            &[],
-            device,
-            options.precache_flags,
-        )?;
-
         Ok(Shaders {
             cs_blur_a8,
             cs_blur_rgba8,
             cs_border_segment,
             cs_line_decoration,
             cs_gradient,
             cs_border_solid,
             cs_scale_a8,
@@ -746,16 +804,18 @@ impl Shaders {
             brush_mix_blend,
             brush_yuv_image,
             brush_radial_gradient,
             brush_linear_gradient,
             cs_clip_rectangle_slow,
             cs_clip_rectangle_fast,
             cs_clip_box_shadow,
             cs_clip_image,
+            pls_init,
+            pls_resolve,
             ps_text_run,
             ps_text_run_dual_source,
             ps_split_composite,
         })
     }
 
     fn get_yuv_shader_index(buffer_kind: ImageBufferKind) -> usize {
         (buffer_kind as usize)
@@ -817,16 +877,18 @@ impl Shaders {
         self.brush_blend.deinit(device);
         self.brush_mix_blend.deinit(device);
         self.brush_radial_gradient.deinit(device);
         self.brush_linear_gradient.deinit(device);
         self.cs_clip_rectangle_slow.deinit(device);
         self.cs_clip_rectangle_fast.deinit(device);
         self.cs_clip_box_shadow.deinit(device);
         self.cs_clip_image.deinit(device);
+        self.pls_init.deinit(device);
+        self.pls_resolve.deinit(device);
         self.ps_text_run.deinit(device);
         self.ps_text_run_dual_source.deinit(device);
         for shader in self.brush_image {
             if let Some(shader) = shader {
                 shader.deinit(device);
             }
         }
         for shader in self.brush_yuv_image {
--- a/gfx/wr/webrender/src/tiling.rs
+++ b/gfx/wr/webrender/src/tiling.rs
@@ -445,16 +445,17 @@ impl RenderTarget for ColorRenderTarget 
                         transforms,
                         pic_task.root_spatial_node_index,
                         z_generator,
                     );
 
                     batch_builder.build(
                         &mut self.alpha_batch_containers,
                         &mut merged_batches,
+                        target_rect,
                     );
                 }
                 _ => {
                     unreachable!();
                 }
             }
         }
 
--- a/gfx/wr/wrench/src/main.rs
+++ b/gfx/wr/wrench/src/main.rs
@@ -583,17 +583,17 @@ fn render<'a>(
     let dim = window.get_inner_size();
     wrench.update(dim);
     thing.do_frame(wrench);
 
     let mut debug_flags = DebugFlags::empty();
 
     // Default the profile overlay on for android.
     if cfg!(target_os = "android") {
-        debug_flags.toggle(DebugFlags::PROFILER_DBG);
+        debug_flags.toggle(DebugFlags::PROFILER_DBG | DebugFlags::COMPACT_PROFILER);
         wrench.api.send_debug_cmd(DebugCommand::SetFlags(debug_flags));
     }
 
     let mut body = |wrench: &mut Wrench, events: Vec<winit::Event>| {
         let mut do_frame = false;
         let mut do_render = false;
 
         for event in events {