servo: Merge #17235 - Increase the size of the style sharing cache to 31 (from bzbarsky:bigger-sharing-cache); r=bholley
authorBoris Zbarsky <bzbarsky@mit.edu>
Thu, 08 Jun 2017 12:22:38 -0700
changeset 413542 b63a5b39d75a772d36d3d57a1d8e9d19ea9dcf7d
parent 413541 7035543349b631b27540f2b82c974544328e93f1
child 413543 27f5231ba905f39bc90ee3c56a375b40b7d02569
push id1490
push usermtabara@mozilla.com
push dateMon, 31 Jul 2017 14:08:16 +0000
treeherdermozilla-release@70e32e6bf15e [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersbholley
milestone55.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
servo: Merge #17235 - Increase the size of the style sharing cache to 31 (from bzbarsky:bigger-sharing-cache); r=bholley <!-- Please describe your changes on the following line: --> --- <!-- Thank you for contributing to Servo! Please replace each `[ ]` by `[X]` when the step is complete, and replace `__` with appropriate data: --> - [X] `./mach build -d` does not report any errors - [X] `./mach test-tidy` does not report any errors - [X] These changes fix https://bugzilla.mozilla.org/show_bug.cgi?id=1369621 <!-- Either: --> - [ ] There are tests for these changes OR - [ ] These changes do not require tests because _____ <!-- Also, please make sure that "Allow edits from maintainers" checkbox is checked, so that we can help you if you get stuck somewhere along the way.--> <!-- Pull requests that do not address these steps are welcome, but they will require additional verification as part of the review process. --> Source-Repo: https://github.com/servo/servo Source-Revision: 612f2c1c2a9e56de2abe9ce32fcb6461a133686d
servo/components/style/parallel.rs
servo/components/style/scoped_tls.rs
servo/components/style/sharing/mod.rs
--- a/servo/components/style/parallel.rs
+++ b/servo/components/style/parallel.rs
@@ -21,41 +21,29 @@
 //! code in the parallel traversal.
 
 #![deny(missing_docs)]
 
 use context::TraversalStatistics;
 use dom::{OpaqueNode, SendNode, TElement, TNode};
 use rayon;
 use scoped_tls::ScopedTLS;
-use sharing::STYLE_SHARING_CANDIDATE_CACHE_SIZE;
 use smallvec::SmallVec;
 use std::borrow::Borrow;
 use std::mem;
 use time;
 use traversal::{DomTraversal, PerLevelTraversalData, PreTraverseToken};
 
 /// The maximum number of child nodes that we will process as a single unit.
 ///
 /// Larger values will increase style sharing cache hits and general DOM locality
-/// at the expense of decreased opportunities for parallelism. The style sharing
-/// cache can hold 8 entries, but not all styles are shareable, so we set this
-/// value to 16. These values have not been measured and could potentially be
-/// tuned.
+/// at the expense of decreased opportunities for parallelism. This value has not
+/// been measured and could potentially be tuned.
 pub const WORK_UNIT_MAX: usize = 16;
 
-/// Verify that the style sharing cache size doesn't change. If it does, we should
-/// reconsider the above. We do this, rather than defining WORK_UNIT_MAX in terms
-/// of STYLE_SHARING_CANDIDATE_CACHE_SIZE, so that altering the latter doesn't
-/// have surprising effects on the parallelism characteristics of the style system.
-#[allow(dead_code)]
-fn static_assert() {
-    unsafe { mem::transmute::<_, [u32; STYLE_SHARING_CANDIDATE_CACHE_SIZE]>([1; 8]); }
-}
-
 /// A list of node pointers.
 ///
 /// Note that the inline storage doesn't need to be sized to WORK_UNIT_MAX, but
 /// it generally seems sensible to do so.
 type NodeList<N> = SmallVec<[SendNode<N>; WORK_UNIT_MAX]>;
 
 /// Entry point for the parallel traversal.
 #[allow(unsafe_code)]
@@ -120,16 +108,29 @@ pub fn traverse_dom<E, D>(traversal: &D,
         });
         aggregate.finish(traversal, start_time.unwrap());
         if aggregate.is_large_traversal() {
             println!("{}", aggregate);
         }
     }
 }
 
+/// A callback to create our thread local context.  This needs to be
+/// out of line so we don't allocate stack space for the entire struct
+/// in the caller.
+#[inline(never)]
+fn create_thread_local_context<'scope, E, D>(
+    traversal: &'scope D,
+    slot: &mut Option<D::ThreadLocalContext>)
+    where E: TElement + 'scope,
+          D: DomTraversal<E>
+{
+    *slot = Some(traversal.create_thread_local_context())
+}
+
 /// A parallel top-down DOM traversal.
 ///
 /// This algorithm traverses the DOM in a breadth-first, top-down manner. The
 /// goals are:
 /// * Never process a child before its parent (since child style depends on
 ///   parent style). If this were to happen, the styling algorithm would panic.
 /// * Prioritize discovering nodes as quickly as possible to maximize
 ///   opportunities for parallelism.
@@ -148,17 +149,18 @@ fn top_down_dom<'a, 'scope, E, D>(nodes:
     where E: TElement + 'scope,
           D: DomTraversal<E>,
 {
     debug_assert!(nodes.len() <= WORK_UNIT_MAX);
     let mut discovered_child_nodes = NodeList::<E::ConcreteNode>::new();
     {
         // Scope the borrow of the TLS so that the borrow is dropped before
         // a potential recursive call when we pass TailCall.
-        let mut tlc = tls.ensure(|| traversal.create_thread_local_context());
+        let mut tlc = tls.ensure(
+            |slot: &mut Option<D::ThreadLocalContext>| create_thread_local_context(traversal, slot));
 
         for n in nodes {
             // If the last node we processed produced children, spawn them off
             // into a work item. We do this at the beginning of the loop (rather
             // than at the end) so that we can traverse the children of the last
             // sibling directly on this thread without a spawn call.
             //
             // This has the important effect of removing the allocation and
--- a/servo/components/style/scoped_tls.rs
+++ b/servo/components/style/scoped_tls.rs
@@ -4,16 +4,17 @@
 
 //! Stack-scoped thread-local storage for rayon thread pools.
 
 #![allow(unsafe_code)]
 #![deny(missing_docs)]
 
 use rayon;
 use std::cell::{Ref, RefCell, RefMut};
+use std::ops::DerefMut;
 
 /// A scoped TLS set, that is alive during the `'scope` lifetime.
 ///
 /// We use this on Servo to construct thread-local contexts, but clear them once
 /// we're done with restyling.
 pub struct ScopedTLS<'scope, T: Send> {
     pool: &'scope rayon::ThreadPool,
     slots: Box<[RefCell<Option<T>>]>,
@@ -47,21 +48,26 @@ impl<'scope, T: Send> ScopedTLS<'scope, 
 
     /// Return a mutable reference to the `Option<T>` that this thread owns.
     pub fn borrow_mut(&self) -> RefMut<Option<T>> {
         let idx = self.pool.current_thread_index().unwrap();
         self.slots[idx].borrow_mut()
     }
 
     /// Ensure that the current data this thread owns is initialized, or
-    /// initialize it using `f`.
-    pub fn ensure<F: FnOnce() -> T>(&self, f: F) -> RefMut<T> {
+    /// initialize it using `f`.  We want ensure() to be fast and inline, and we
+    /// want to inline the memmove that initializes the Option<T>.  But we don't
+    /// want to inline space for the entire large T struct in our stack frame.
+    /// That's why we hand `f` a mutable borrow to write to instead of just
+    /// having it return a T.
+    #[inline(always)]
+    pub fn ensure<F: FnOnce(&mut Option<T>)>(&self, f: F) -> RefMut<T> {
         let mut opt = self.borrow_mut();
         if opt.is_none() {
-            *opt = Some(f());
+            f(opt.deref_mut());
         }
 
         RefMut::map(opt, |x| x.as_mut().unwrap())
     }
 
     /// Unsafe access to the slots. This can be used to access the TLS when
     /// the caller knows that the pool does not have access to the TLS.
     pub unsafe fn unsafe_get(&self) -> &[RefCell<Option<T>>] {
--- a/servo/components/style/sharing/mod.rs
+++ b/servo/components/style/sharing/mod.rs
@@ -79,18 +79,26 @@ use selectors::matching::{ElementSelecto
 use smallvec::SmallVec;
 use std::mem;
 use std::ops::Deref;
 use stylist::{ApplicableDeclarationBlock, Stylist};
 
 mod checks;
 
 /// The amount of nodes that the style sharing candidate cache should hold at
-/// most.
-pub const STYLE_SHARING_CANDIDATE_CACHE_SIZE: usize = 8;
+/// most.  We'd somewhat like 32, but ArrayDeque only implements certain backing
+/// store sizes.  A cache size of 32 would mean a backing store of 33, but
+/// that's not an implemented size: we can do 32 or 40.
+///
+/// The cache size was chosen by measuring style sharing and resulting
+/// performance on a few pages; sizes up to about 32 were giving good sharing
+/// improvements (e.g. 3x fewer styles having to be resolved than at size 8) and
+/// slight performance improvements.  Sizes larger than 32 haven't really been
+/// tested.
+pub const STYLE_SHARING_CANDIDATE_CACHE_SIZE: usize = 31;
 
 /// Controls whether the style sharing cache is used.
 #[derive(Clone, Copy, PartialEq)]
 pub enum StyleSharingBehavior {
     /// Style sharing allowed.
     Allow,
     /// Style sharing disallowed.
     Disallow,