servo: Merge #17192 - Parallel layout optimizations (from mbrubeck:layout); r=pcwalton
author: Matt Brubeck <mbrubeck@limpet.net>
Wed, 07 Jun 2017 19:57:57 -0700
changeset 413380 66124e23e8d8637de199d3ede303ec3cb9672f55
parent 413379 757a3c336d6e5e399fa944b4342372f233bb72c3
child 413381 b1628969b8330e235e62664d966b38fa560fbb23
push id: 1490
push user: mtabara@mozilla.com
push date: Mon, 31 Jul 2017 14:08:16 +0000
treeherder: mozilla-release@70e32e6bf15e [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: pcwalton
milestone: 55.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
servo: Merge #17192 - Parallel layout optimizations (from mbrubeck:layout); r=pcwalton

This takes some of the optimizations made to parallel styling in #16971 and applies them to parallel layout. Specifically:

* Reduce the chunk size, to increase chances for parallelism on trees with small fan-out.
* Reduce allocations by using SmallVec.
* Reduce task switching by processing up to one chunk of children within the same rayon task as the parent.

This cuts the "Primary Layout Pass" time in **half** on the MySpace page from [tp5n], and on my other real-world test pages it is a small improvement or close to no change.

[tp5n]: https://wiki.mozilla.org/Buildbot/Talos/Tests#tp5n_pages_set

---

- [x] `./mach build -d` does not report any errors
- [x] `./mach test-tidy` does not report any errors
- [x] These changes do not require tests because they affect performance only

Source-Repo: https://github.com/servo/servo
Source-Revision: c0f3ec87806a0d718d7f9ef1ccb912c78fc482d2
servo/components/layout/parallel.rs
servo/components/layout_thread/lib.rs
--- a/servo/components/layout/parallel.rs
+++ b/servo/components/layout/parallel.rs
@@ -9,29 +9,29 @@
 #![allow(unsafe_code)]
 
 use context::LayoutContext;
 use flow::{self, Flow, MutableFlowUtils, PostorderFlowTraversal, PreorderFlowTraversal};
 use flow_ref::FlowRef;
 use profile_traits::time::{self, TimerMetadata, profile};
 use rayon;
 use servo_config::opts;
+use smallvec::SmallVec;
 use std::mem;
 use std::sync::atomic::{AtomicIsize, Ordering};
 use style::dom::UnsafeNode;
 use traversal::{AssignISizes, BubbleISizes};
 use traversal::AssignBSizes;
 
 pub use style::parallel::traverse_dom;
 
 /// Traversal chunk size.
-///
-/// FIXME(bholley): This is all likely very inefficient and should probably be
-/// reworked to mirror the style system's parallel.rs.
-pub const CHUNK_SIZE: usize = 64;
+const CHUNK_SIZE: usize = 16;
+
+pub type FlowList = SmallVec<[UnsafeNode; CHUNK_SIZE]>;
 
 #[allow(dead_code)]
 fn static_assertion(node: UnsafeNode) {
     unsafe {
         let _: UnsafeFlow = ::std::intrinsics::transmute(node);
     }
 }
 
@@ -126,17 +126,17 @@ fn buttom_up_flow(mut unsafe_flow: Unsaf
     }
 }
 
 fn top_down_flow<'scope>(unsafe_flows: &[UnsafeFlow],
                          scope: &rayon::Scope<'scope>,
                          assign_isize_traversal: &'scope AssignISizes,
                          assign_bsize_traversal: &'scope AssignBSizes)
 {
-    let mut discovered_child_flows = vec![];
+    let mut discovered_child_flows = FlowList::new();
 
     for unsafe_flow in unsafe_flows {
         let mut had_children = false;
         unsafe {
             // Get a real flow.
             let flow: &mut Flow = mem::transmute(*unsafe_flow);
 
             // FIXME(emilio): With the switch to rayon we can no longer
@@ -159,39 +159,56 @@ fn top_down_flow<'scope>(unsafe_flows: &
         }
 
         // If there were no more children, start assigning block-sizes.
         if !had_children {
             buttom_up_flow(*unsafe_flow, &assign_bsize_traversal)
         }
     }
 
-    for chunk in discovered_child_flows.chunks(CHUNK_SIZE) {
-        let nodes = chunk.iter().cloned().collect::<Vec<_>>().into_boxed_slice();
+    if discovered_child_flows.is_empty() {
+        return
+    }
 
-        scope.spawn(move |scope| {
-            top_down_flow(&nodes, scope, &assign_isize_traversal, &assign_bsize_traversal);
-        });
+    if discovered_child_flows.len() <= CHUNK_SIZE {
+        // We can handle all the children in this work unit.
+        top_down_flow(&discovered_child_flows,
+                      scope,
+                      &assign_isize_traversal,
+                      &assign_bsize_traversal);
+    } else {
+        // Spawn a new work unit for each chunk after the first.
+        let mut chunks = discovered_child_flows.chunks(CHUNK_SIZE);
+        let first_chunk = chunks.next();
+        for chunk in chunks {
+            let nodes = chunk.iter().cloned().collect::<FlowList>();
+            scope.spawn(move |scope| {
+                top_down_flow(&nodes, scope, &assign_isize_traversal, &assign_bsize_traversal);
+            });
+        }
+        if let Some(chunk) = first_chunk {
+            top_down_flow(chunk, scope, &assign_isize_traversal, &assign_bsize_traversal);
+        }
     }
 }
 
 pub fn traverse_flow_tree_preorder(
         root: &mut Flow,
         profiler_metadata: Option<TimerMetadata>,
         time_profiler_chan: time::ProfilerChan,
         context: &LayoutContext,
         queue: &rayon::ThreadPool) {
     if opts::get().bubble_inline_sizes_separately {
         let bubble_inline_sizes = BubbleISizes { layout_context: &context };
         root.traverse_postorder(&bubble_inline_sizes);
     }
 
     let assign_isize_traversal = &AssignISizes { layout_context: &context };
     let assign_bsize_traversal = &AssignBSizes { layout_context: &context };
-    let nodes = vec![borrowed_flow_to_unsafe_flow(root)].into_boxed_slice();
+    let nodes = [borrowed_flow_to_unsafe_flow(root)];
 
     queue.install(move || {
         rayon::scope(move |scope| {
             profile(time::ProfilerCategory::LayoutParallelWarmup,
                     profiler_metadata, time_profiler_chan, move || {
                         top_down_flow(&nodes, scope, assign_isize_traversal, assign_bsize_traversal);
             });
         });
--- a/servo/components/layout_thread/lib.rs
+++ b/servo/components/layout_thread/lib.rs
@@ -438,17 +438,21 @@ impl LayoutThread {
            layout_threads: usize)
            -> LayoutThread {
         let device = Device::new(
             MediaType::Screen,
             opts::get().initial_window_size.to_f32() * ScaleFactor::new(1.0));
 
         let configuration =
             rayon::Configuration::new().num_threads(layout_threads);
-        let parallel_traversal = rayon::ThreadPool::new(configuration).ok();
+        let parallel_traversal = if layout_threads > 1 {
+            Some(rayon::ThreadPool::new(configuration).expect("ThreadPool creation failed"))
+        } else {
+            None
+        };
         debug!("Possible layout Threads: {}", layout_threads);
 
         // Create the channel on which new animations can be sent.
         let (new_animations_sender, new_animations_receiver) = channel();
 
         // Proxy IPC messages from the pipeline to the layout thread.
         let pipeline_receiver = ROUTER.route_ipc_receiver_to_new_mpsc_receiver(pipeline_port);
 
@@ -1069,17 +1073,17 @@ impl LayoutThread {
                 }
                 return;
             },
             Some(x) => x.as_element().unwrap(),
         };
 
         debug!("layout: processing reflow request for: {:?} ({}) (query={:?})",
                element, self.url, data.query_type);
-        debug!("{:?}", ShowSubtree(element.as_node()));
+        trace!("{:?}", ShowSubtree(element.as_node()));
 
         let initial_viewport = data.window_size.initial_viewport;
         let old_viewport_size = self.viewport_size;
         let current_screen_size = Size2D::new(Au::from_f32_px(initial_viewport.width),
                                               Au::from_f32_px(initial_viewport.height));
 
         // Calculate the actual viewport as per DEVICE-ADAPT § 6