Bug 1531248 - Reduce the number of resolve / copy steps in WR on mobile devices. r=kvark
author: Glenn Watson <github@intuitionlibrary.com>
Thu, 28 Feb 2019 19:50:44 +0000
changeset: 461814 a901d60873b90ab874bfb0a741ca128c4e44a667
parent: 461813 8829698bebbbeccea5f8e3e0657a5214c13cde9d
child: 461815 ec57fe6f4e173478cfe52b69481db7a2f8145adc
push id: 35629
push user: opoprus@mozilla.com
push date: Fri, 01 Mar 2019 05:20:57 +0000
treeherder: mozilla-central@9d39099e5fc5 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: kvark
bugs: 1531248
milestone: 67.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1531248 - Reduce the number of resolve / copy steps in WR on mobile devices. r=kvark

This patch fixes some wasted GPU time on mobile devices due to redundant resolve / copy steps.

In the first case, we would previously do:
 - Global clear of color / depth on main framebuffer.
 - Bind and draw off-screen targets.
 - Bind main framebuffer and draw scene.

Between step 1 and 2, a resolve step is triggered on tiled GPU drivers, wasting a lot of GPU time. To fix this, the clear is now deferred until the framebuffer of the first document is drawn. This does slightly change the semantics of how WR does clear operations, but I think it works fine and makes more sense.

In the second case, we would previously do:
 - ...
 - Draw main framebuffer
 - End frame and invalidate the contents of input textures.
 - Bind main framebuffer and draw debug overlay.

This also introduces an extra resolve / copy step, even if the debug overlay is not enabled. To fix this, the invalidation step of the input textures to the main framebuffer pass is deferred until all drawing is complete on the main framebuffer, by doing the invalidation in the end_frame() call of the texture resolver.

Together, these save a very significant amount of ms per frame in GPU time on the mobile devices I tested.

Differential Revision: https://phabricator.services.mozilla.com/D21490
gfx/wr/webrender/src/renderer.rs
--- a/gfx/wr/webrender/src/renderer.rs
+++ b/gfx/wr/webrender/src/renderer.rs
@@ -2536,16 +2536,24 @@ impl Renderer {
         if self.active_documents.is_empty() {
             self.last_time = precise_time_ns();
             return Ok(results);
         }
 
         let mut frame_profiles = Vec::new();
         let mut profile_timers = RendererProfileTimers::new();
 
+        // The texture resolver scope should be outside of any rendering, including
+        // debug rendering. This ensures that when we return render targets to the
+        // pool via glInvalidateFramebuffer, we don't do any debug rendering after
+        // that point. Otherwise, the bind / invalidate / bind logic trips up the
+        // render pass logic in tiled / mobile GPUs, resulting in an extra copy /
+        // resolve step when the debug overlay is enabled.
+        self.texture_resolver.begin_frame();
+
         let profile_samplers = {
             let _gm = self.gpu_profile.start_marker("build samples");
             // Block CPU waiting for last frame's GPU profiles to arrive.
             // In general this shouldn't block unless heavily GPU limited.
             let (gpu_frame_id, timers, samplers) = self.gpu_profile.build_samples();
 
             if self.max_recorded_profiles > 0 {
                 while self.gpu_profiles.len() >= self.max_recorded_profiles {
@@ -2570,68 +2578,72 @@ impl Renderer {
             //self.update_shaders();
 
             self.update_texture_cache();
 
             frame_id
         });
 
         profile_timers.cpu_time.profile(|| {
-            let clear_depth_value = if self.are_documents_intersecting_depth() {
-                None
-            } else {
-                Some(1.0)
-            };
+            // If the documents don't intersect for depth, we can just do
+            // a single, global depth clear.
+            let clear_depth_per_doc = self.are_documents_intersecting_depth();
 
             //Note: another borrowck dance
             let mut active_documents = mem::replace(&mut self.active_documents, Vec::default());
             // sort by the document layer id
             active_documents.sort_by_key(|&(_, ref render_doc)| render_doc.frame.layer);
 
-            // don't clear the framebuffer if one of the rendered documents will overwrite it
-            if let Some(framebuffer_size) = framebuffer_size {
-                let needs_color_clear = !active_documents
-                    .iter()
-                    .any(|&(_, RenderedDocument { ref frame, .. })| {
-                        frame.background_color.is_some() &&
-                        frame.inner_rect.origin == DeviceIntPoint::zero() &&
-                        frame.inner_rect.size == framebuffer_size
-                    });
-
-                if needs_color_clear || clear_depth_value.is_some() {
-                    let clear_color = if needs_color_clear {
-                        self.clear_color.map(|color| color.to_array())
-                    } else {
-                        None
-                    };
-                    self.device.reset_draw_target();
-                    self.device.enable_depth_write();
-                    self.device.clear_target(clear_color, clear_depth_value, None);
-                    self.device.disable_depth_write();
-                }
-            }
-
             #[cfg(feature = "replay")]
             self.texture_resolver.external_images.extend(
                 self.owned_external_images.iter().map(|(key, value)| (*key, value.clone()))
             );
 
-            for &mut (_, RenderedDocument { ref mut frame, .. }) in &mut active_documents {
+            for (doc_index, (_, RenderedDocument { ref mut frame, .. })) in active_documents.iter_mut().enumerate() {
                 frame.profile_counters.reset_targets();
                 self.prepare_gpu_cache(frame);
                 assert!(frame.gpu_cache_frame_id <= self.gpu_cache_frame_id,
                     "Received frame depends on a later GPU cache epoch ({:?}) than one we received last via `UpdateGpuCache` ({:?})",
                     frame.gpu_cache_frame_id, self.gpu_cache_frame_id);
 
+                // Work out what color to clear the frame buffer for this document.
+                // The document's supplied clear color is used, unless:
+                //  (a) The document has no specified clear color AND
+                //  (b) We are rendering the first document.
+                // If both those conditions are true, the overall renderer
+                // clear color will be used, if specified.
+
+                // Get the default clear color from the renderer.
+                let mut fb_clear_color = if doc_index == 0 {
+                    self.clear_color
+                } else {
+                    None
+                };
+
+                // Override with document clear color if no overall clear
+                // color or not on the first document.
+                if fb_clear_color.is_none() {
+                    fb_clear_color = frame.background_color;
+                }
+
+                // Only clear the depth buffer for this document if this is
+                // the first document, or we need to clear depth per document.
+                let fb_clear_depth = if clear_depth_per_doc || doc_index == 0 {
+                    Some(1.0)
+                } else {
+                    None
+                };
+
                 self.draw_tile_frame(
                     frame,
                     framebuffer_size,
-                    clear_depth_value.is_some(),
                     cpu_frame_id,
-                    &mut results.stats
+                    &mut results.stats,
+                    fb_clear_color,
+                    fb_clear_depth,
                 );
 
                 if self.debug_flags.contains(DebugFlags::PROFILER_DBG) {
                     frame_profiles.push(frame.profile_counters.clone());
                 }
 
                 let dirty_regions =
                     mem::replace(&mut frame.recorded_dirty_regions, Vec::new());
@@ -2728,16 +2740,22 @@ impl Renderer {
         self.gpu_cache_upload_time = 0;
 
         profile_timers.cpu_time.profile(|| {
             let _gm = self.gpu_profile.start_marker("end frame");
             self.gpu_profile.end_frame();
             if let Some(debug_renderer) = self.debug.try_get_mut() {
                 debug_renderer.render(&mut self.device, framebuffer_size);
             }
+            // See comment for texture_resolver.begin_frame() for explanation
+            // of why this must be done after all rendering, including debug
+            // overlays. The end_frame() call implicitly calls end_pass(), which
+            // should ensure any left over render targets get invalidated and
+            // returned to the pool correctly.
+            self.texture_resolver.end_frame(&mut self.device, cpu_frame_id);
             self.device.end_frame();
         });
         if framebuffer_size.is_some() {
             self.last_time = current_time;
         }
 
         if self.renderer_errors.is_empty() {
             Ok(results)
@@ -3102,18 +3120,18 @@ impl Renderer {
         );
     }
 
     fn draw_color_target(
         &mut self,
         draw_target: DrawTarget,
         target: &ColorRenderTarget,
         framebuffer_target_rect: DeviceIntRect,
-        depth_is_ready: bool,
         clear_color: Option<[f32; 4]>,
+        clear_depth: Option<f32>,
         render_tasks: &RenderTaskTree,
         projection: &Transform3D<f32>,
         frame_id: GpuFrameId,
         stats: &mut RendererStats,
     ) {
         self.profile_counters.color_targets.inc();
         let _gm = self.gpu_profile.start_marker("color target");
 
@@ -3129,22 +3147,19 @@ impl Renderer {
         };
 
         {
             let _timer = self.gpu_profile.start_timer(GPU_TAG_SETUP_TARGET);
             self.device.bind_draw_target(draw_target);
             self.device.disable_depth();
             self.set_blend(false, framebuffer_kind);
 
-            let depth_clear = if !depth_is_ready && target.needs_depth() {
+            if clear_depth.is_some() {
                 self.device.enable_depth_write();
-                Some(1.0)
-            } else {
-                None
-            };
+            }
 
             let clear_rect = if !draw_target.is_default() {
                 if self.enable_clear_scissor {
                     // TODO(gw): Applying a scissor rect and minimal clear here
                     // is a very large performance win on the Intel and nVidia
                     // GPUs that I have tested with. It's possible it may be a
                     // performance penalty on other GPU types - we should test this
                     // and consider different code paths.
@@ -3164,19 +3179,23 @@ impl Renderer {
                 let mut rect = framebuffer_target_rect.to_i32();
                 // Note: `framebuffer_target_rect` needs a Y-flip before going to GL
                 // Note: at this point, the target rectangle is not guaranteed to be within the main framebuffer bounds
                 // but `clear_target_rect` is totally fine with negative origin, as long as width & height are positive
                 rect.origin.y = draw_target.dimensions().height as i32 - rect.origin.y - rect.size.height;
                 Some(rect)
             };
 
-            self.device.clear_target(clear_color, depth_clear, clear_rect);
-
-            if depth_clear.is_some() {
+            self.device.clear_target(
+                clear_color,
+                clear_depth,
+                clear_rect,
+            );
+
+            if clear_depth.is_some() {
                 self.device.disable_depth_write();
             }
         }
 
         // Handle any blits from the texture cache to this target.
         self.handle_blits(&target.blits, render_tasks);
 
         // Draw any blurs for this target.
@@ -4037,77 +4056,74 @@ impl Renderer {
         debug_assert!(self.texture_resolver.prev_pass_alpha.is_none());
         debug_assert!(self.texture_resolver.prev_pass_color.is_none());
     }
 
     fn draw_tile_frame(
         &mut self,
         frame: &mut Frame,
         framebuffer_size: Option<DeviceIntSize>,
-        framebuffer_depth_is_ready: bool,
         frame_id: GpuFrameId,
         stats: &mut RendererStats,
+        fb_clear_color: Option<ColorF>,
+        fb_clear_depth: Option<f32>,
     ) {
         let _gm = self.gpu_profile.start_marker("tile frame draw");
 
         if frame.passes.is_empty() {
             frame.has_been_rendered = true;
             return;
         }
 
         self.device.disable_depth_write();
         self.set_blend(false, FramebufferKind::Other);
         self.device.disable_stencil();
 
         self.bind_frame_data(frame);
-        self.texture_resolver.begin_frame();
 
         for (pass_index, pass) in frame.passes.iter_mut().enumerate() {
             let _gm = self.gpu_profile.start_marker(&format!("pass {}", pass_index));
 
             self.texture_resolver.bind(
                 &TextureSource::PrevPassAlpha,
                 TextureSampler::PrevPassAlpha,
                 &mut self.device,
             );
             self.texture_resolver.bind(
                 &TextureSource::PrevPassColor,
                 TextureSampler::PrevPassColor,
                 &mut self.device,
             );
 
-            let (cur_alpha, cur_color) = match pass.kind {
+            match pass.kind {
                 RenderPassKind::MainFramebuffer(ref target) => {
                     if let Some(framebuffer_size) = framebuffer_size {
                         stats.color_target_count += 1;
 
-                        let clear_color = frame.background_color.map(|color| color.to_array());
                         let projection = Transform3D::ortho(
                             0.0,
                             framebuffer_size.width as f32,
                             framebuffer_size.height as f32,
                             0.0,
                             ORTHO_NEAR_PLANE,
                             ORTHO_FAR_PLANE,
                         );
 
                         self.draw_color_target(
                             DrawTarget::Default(framebuffer_size),
                             target,
                             frame.inner_rect,
-                            framebuffer_depth_is_ready,
-                            clear_color,
+                            fb_clear_color.map(|color| color.to_array()),
+                            fb_clear_depth,
                             &frame.render_tasks,
                             &projection,
                             frame_id,
                             stats,
                         );
                     }
-
-                    (None, None)
                 }
                 RenderPassKind::OffScreen { ref mut alpha, ref mut color, ref mut texture_cache } => {
                     let alpha_tex = self.allocate_target_texture(alpha, &mut frame.profile_counters);
                     let color_tex = self.allocate_target_texture(color, &mut frame.profile_counters);
 
                     // If this frame has already been drawn, then any texture
                     // cache targets have already been updated and can be
                     // skipped this time.
@@ -4161,42 +4177,49 @@ impl Renderer {
                             0.0,
                             draw_target.dimensions().width as f32,
                             0.0,
                             draw_target.dimensions().height as f32,
                             ORTHO_NEAR_PLANE,
                             ORTHO_FAR_PLANE,
                         );
 
+                        let clear_depth = if target.needs_depth() {
+                            Some(1.0)
+                        } else {
+                            None
+                        };
+
                         self.draw_color_target(
                             draw_target,
                             target,
                             frame.inner_rect,
-                            false,
                             Some([0.0, 0.0, 0.0, 0.0]),
+                            clear_depth,
                             &frame.render_tasks,
                             &projection,
                             frame_id,
                             stats,
                         );
                     }
 
-                    (alpha_tex, color_tex)
+                    // Only end the pass here and invalidate previous textures for
+                    // off-screen targets. Deferring return of the inputs to the
+                    // frame buffer until the implicit end_pass in end_frame allows
+                    // debug draw overlays to be added without triggering a copy
+                    // resolve stage in mobile / tiled GPUs.
+                    self.texture_resolver.end_pass(
+                        &mut self.device,
+                        alpha_tex,
+                        color_tex,
+                    );
                 }
-            };
-
-            self.texture_resolver.end_pass(
-                &mut self.device,
-                cur_alpha,
-                cur_color,
-            );
+            }
         }
 
-        self.texture_resolver.end_frame(&mut self.device, frame_id);
-
         if let Some(framebuffer_size) = framebuffer_size {
             self.draw_frame_debug_items(&frame.debug_items);
             self.draw_render_target_debug(framebuffer_size);
             self.draw_texture_cache_debug(framebuffer_size);
             self.draw_gpu_cache_debug(framebuffer_size);
             self.draw_zoom_debug(framebuffer_size);
         }
         self.draw_epoch_debug();