treemanifest: lazily load manifests
authorMartin von Zweigbergk <martinvonz@google.com>
Thu, 09 Apr 2015 17:14:35 -0700
changeset 25284 0de132d5328acc43a9f267b559c199cf20e0a362
parent 25283 eafa06e9edde4bb5254ef1f577440fe9c16222f5
child 25285 29be0450b667ddd66a7d1356793f1f40c19fdf33
push id13
push usergszorc@mozilla.com
push dateTue, 26 May 2015 00:39:25 +0000
treemanifest: lazily load manifests Most operations on treemanifests already visit only relevant submanifests. Notable examples include __getitem__, __contains__, walk/matches with matcher, diff. By making submanifests lazily loaded, we speed up all these operations. The lazy loading is achieved by adding a _load() method that gets defined where we currently eagerly parse the manifest. We make sure to call it before any access to _dirs, _files or _flags. Some timings on the Mozilla repo (with flat manifest timings for reference): hg cat -r . README.txt: 1.644s -> 0.096s (0.255s) hg diff -r .^ -r . : 1.746s -> 0.137s (0.431s) hg files -r . python : 1.508s -> 0.146s (0.335s) hg files -r . : 2.125s -> 2.203s (0.712s)
mercurial/manifest.py
tests/test-treemanifest.t
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -436,48 +436,55 @@ def _addlistdelta(addlist, x):
 
 def _splittopdir(f):
     if '/' in f:
         dir, subpath = f.split('/', 1)
         return dir + '/', subpath
     else:
         return '', f
 
+_noop = lambda: None
+
 class treemanifest(object):
     def __init__(self, dir='', text=''):
         self._dir = dir
         self._node = revlog.nullid
+        self._load = _noop
         self._dirty = False
         self._dirs = {}
         # Using _lazymanifest here is a little slower than plain old dicts
         self._files = {}
         self._flags = {}
         if text:
             def readsubtree(subdir, subm):
                 raise AssertionError('treemanifest constructor only accepts '
                                      'flat manifests')
             self.parse(text, readsubtree)
             self._dirty = True # Mark flat manifest dirty after parsing
 
     def _subpath(self, path):
         return self._dir + path
 
     def __len__(self):
+        self._load()
         size = len(self._files)
         for m in self._dirs.values():
             size += m.__len__()
         return size
 
     def _isempty(self):
+        self._load() # for consistency; already loaded by all callers
         return (not self._files and (not self._dirs or
                 all(m._isempty() for m in self._dirs.values())))
 
     def __str__(self):
-        return ('<treemanifest dir=%s, node=%s, dirty=%s>' %
-                (self._dir, revlog.hex(self._node), self._dirty))
+        return ('<treemanifest dir=%s, node=%s, loaded=%s, dirty=%s>' %
+                (self._dir, revlog.hex(self._node),
+                 bool(self._load is _noop),
+                 self._dirty))
 
     def dir(self):
         '''The directory that this tree manifest represents, including a
         trailing '/'. Empty string for the repo root directory.'''
         return self._dir
 
     def node(self):
         '''This node of this instance. nullid for unsaved instances. Should
@@ -486,134 +493,155 @@ class treemanifest(object):
         assert not self._dirty
         return self._node
 
     def setnode(self, node):
         self._node = node
         self._dirty = False
 
     def iteritems(self):
+        self._load()
         for p, n in sorted(self._dirs.items() + self._files.items()):
             if p in self._files:
                 yield self._subpath(p), n
             else:
                 for f, sn in n.iteritems():
                     yield f, sn
 
     def iterkeys(self):
+        self._load()
         for p in sorted(self._dirs.keys() + self._files.keys()):
             if p in self._files:
                 yield self._subpath(p)
             else:
                 for f in self._dirs[p].iterkeys():
                     yield f
 
     def keys(self):
         return list(self.iterkeys())
 
     def __iter__(self):
         return self.iterkeys()
 
     def __contains__(self, f):
         if f is None:
             return False
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             if dir not in self._dirs:
                 return False
             return self._dirs[dir].__contains__(subpath)
         else:
             return f in self._files
 
     def get(self, f, default=None):
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             if dir not in self._dirs:
                 return default
             return self._dirs[dir].get(subpath, default)
         else:
             return self._files.get(f, default)
 
     def __getitem__(self, f):
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             return self._dirs[dir].__getitem__(subpath)
         else:
             return self._files[f]
 
     def flags(self, f):
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             if dir not in self._dirs:
                 return ''
             return self._dirs[dir].flags(subpath)
         else:
             if f in self._dirs:
                 return ''
             return self._flags.get(f, '')
 
     def find(self, f):
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             return self._dirs[dir].find(subpath)
         else:
             return self._files[f], self._flags.get(f, '')
 
     def __delitem__(self, f):
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             self._dirs[dir].__delitem__(subpath)
             # If the directory is now empty, remove it
             if self._dirs[dir]._isempty():
                 del self._dirs[dir]
         else:
             del self._files[f]
             if f in self._flags:
                 del self._flags[f]
         self._dirty = True
 
     def __setitem__(self, f, n):
         assert n is not None
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             if dir not in self._dirs:
                 self._dirs[dir] = treemanifest(self._subpath(dir))
             self._dirs[dir].__setitem__(subpath, n)
         else:
             self._files[f] = n[:21] # to match manifestdict's behavior
         self._dirty = True
 
     def setflag(self, f, flags):
         """Set the flags (symlink, executable) for path f."""
         assert 'd' not in flags
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             if dir not in self._dirs:
                 self._dirs[dir] = treemanifest(self._subpath(dir))
             self._dirs[dir].setflag(subpath, flags)
         else:
             self._flags[f] = flags
         self._dirty = True
 
     def copy(self):
         copy = treemanifest(self._dir)
         copy._node = self._node
         copy._dirty = self._dirty
-        for d in self._dirs:
-            copy._dirs[d] = self._dirs[d].copy()
-        copy._files = dict.copy(self._files)
-        copy._flags = dict.copy(self._flags)
+        def _load():
+            self._load()
+            for d in self._dirs:
+                copy._dirs[d] = self._dirs[d].copy()
+            copy._files = dict.copy(self._files)
+            copy._flags = dict.copy(self._flags)
+            copy._load = _noop
+        copy._load = _load
+        if self._load == _noop:
+            # Chaining _load if it's _noop is functionally correct, but the
+            # chain may end up excessively long (stack overflow), and
+            # will prevent garbage collection of 'self'.
+            copy._load()
         return copy
 
     def filesnotin(self, m2):
         '''Set of files in this manifest that are not in the other'''
         files = set()
         def _filesnotin(t1, t2):
             if t1._node == t2._node and not t1._dirty and not t2._dirty:
                 return
+            t1._load()
+            t2._load()
             for d, m1 in t1._dirs.iteritems():
                 if d in t2._dirs:
                     m2 = t2._dirs[d]
                     _filesnotin(m1, m2)
                 else:
                     files.update(m1.iterkeys())
 
             for fn in t1._files.iterkeys():
@@ -626,16 +654,17 @@ class treemanifest(object):
     @propertycache
     def _alldirs(self):
         return util.dirs(self)
 
     def dirs(self):
         return self._alldirs
 
     def hasdir(self, dir):
+        self._load()
         topdir, subdir = _splittopdir(dir)
         if topdir:
             if topdir in self._dirs:
                 return self._dirs[topdir].hasdir(subdir)
             return False
         return (dir + '/') in self._dirs
 
     def walk(self, match):
@@ -668,16 +697,17 @@ class treemanifest(object):
                 match.bad(fn, None)
 
     def _walk(self, match):
         '''Recursively generates matching file names for walk().'''
         if not match.visitdir(self._dir[:-1] or '.'):
             return
 
         # yield this dir's files and walk its submanifests
+        self._load()
         for p in sorted(self._dirs.keys() + self._files.keys()):
             if p in self._files:
                 fullp = self._subpath(p)
                 if match(fullp):
                     yield fullp
             else:
                 for f in self._dirs[p]._walk(match):
                     yield f
@@ -692,16 +722,17 @@ class treemanifest(object):
     def _matches(self, match):
         '''recursively generate a new manifest filtered by the match argument.
         '''
         ret = treemanifest(self._dir)
 
         if not match.visitdir(self._dir[:-1] or '.'):
             return ret
 
+        self._load()
         for fn in self._files:
             fullp = self._subpath(fn)
             if not match(fullp):
                 continue
             ret._files[fn] = self._files[fn]
             if fn in self._flags:
                 ret._flags[fn] = self._flags[fn]
 
@@ -729,16 +760,18 @@ class treemanifest(object):
         the nodeid will be None and the flags will be the empty
         string.
         '''
         result = {}
         emptytree = treemanifest()
         def _diff(t1, t2):
             if t1._node == t2._node and not t1._dirty and not t2._dirty:
                 return
+            t1._load()
+            t2._load()
             for d, m1 in t1._dirs.iteritems():
                 m2 = t2._dirs.get(d, emptytree)
                 _diff(m1, m2)
 
             for d, m2 in t2._dirs.iteritems():
                 if d not in t1._dirs:
                     _diff(emptytree, m2)
 
@@ -779,30 +812,42 @@ class treemanifest(object):
                 # Assigning to _files and _flags avoids marking as dirty,
                 # and should be a little faster.
                 self._files[f] = n
                 if fl:
                     self._flags[f] = fl
 
     def text(self, usemanifestv2=False):
         """Get the full data of this manifest as a bytestring."""
+        self._load()
         flags = self.flags
         return _text(((f, self[f], flags(f)) for f in self.keys()),
                      usemanifestv2)
 
     def dirtext(self, usemanifestv2=False):
         """Get the full data of this directory as a bytestring. Make sure that
         any submanifests have been written first, so their nodeids are correct.
         """
+        self._load()
         flags = self.flags
         dirs = [(d[:-1], self._dirs[d]._node, 'd') for d in self._dirs]
         files = [(f, self._files[f], flags(f)) for f in self._files]
         return _text(sorted(dirs + files), usemanifestv2)
 
+    def read(self, gettext, readsubtree):
+        def _load():
+            # Mark as loaded already here, so __setitem__ and setflag() don't
+            # cause infinite loops when they try to load.
+            self._load = _noop
+            self.parse(gettext(), readsubtree)
+            self._dirty = False
+        self._load = _load
+
     def writesubtrees(self, m1, m2, writesubtree):
+        self._load() # for consistency; should never have any effect here
         emptytree = treemanifest()
         for d, subm in self._dirs.iteritems():
             subp1 = m1._dirs.get(d, emptytree)._node
             subp2 = m2._dirs.get(d, emptytree)._node
             if subp1 == revlog.nullid:
                 subp1, subp2 = subp2, subp1
             writesubtree(subm, subp1, subp2)
 
@@ -886,25 +931,27 @@ class manifest(revlog.revlog):
             return self.readdelta(node)
         return self.read(node)
 
     def read(self, node):
         if node == revlog.nullid:
             return self._newmanifest() # don't upset local cache
         if node in self._mancache:
             return self._mancache[node][0]
-        text = self.revision(node)
         if self._treeondisk:
+            def gettext():
+                return self.revision(node)
             def readsubtree(dir, subm):
                 return self.dirlog(dir).read(subm)
             m = self._newmanifest()
-            m.parse(text, readsubtree)
+            m.read(gettext, readsubtree)
             m.setnode(node)
             arraytext = None
         else:
+            text = self.revision(node)
             m = self._newmanifest(text)
             arraytext = array.array('c', text)
         self._mancache[node] = (m, arraytext)
         return m
 
     def find(self, node, f):
         '''look up entry for a single file efficiently.
         return (node, flags) pair if found, (None, None) if not.'''
--- a/tests/test-treemanifest.t
+++ b/tests/test-treemanifest.t
@@ -73,34 +73,38 @@ Removing directory does not create an re
   removing dir1/dir1/b (glob)
   $ hg debugindex --dir dir1/dir1 > before
   $ hg ci -qm 'remove dir1/dir1'
   $ hg debugindex --dir dir1/dir1 > after
   $ diff before after
   $ rm before after
 
 Check that hg files (calls treemanifest.walk()) works
+without loading all directory revlogs
 
   $ hg co 'desc("add dir2")'
   2 files updated, 0 files merged, 0 files removed, 0 files unresolved
+  $ mv .hg/store/meta/dir2 .hg/store/meta/dir2-backup
   $ hg files -r . dir1
   dir1/a (glob)
   dir1/b (glob)
   dir1/dir1/a (glob)
   dir1/dir1/b (glob)
   dir1/dir2/a (glob)
   dir1/dir2/b (glob)
 
 Check that status between revisions works (calls treemanifest.matches())
+without loading all directory revlogs
 
   $ hg status --rev 'desc("add dir1")' --rev . dir1
   A dir1/dir1/a
   A dir1/dir1/b
   A dir1/dir2/a
   A dir1/dir2/b
+  $ mv .hg/store/meta/dir2-backup .hg/store/meta/dir2
 
 Merge creates 2-parent revision of directory revlog
 
   $ echo 5 > dir1/a
   $ hg ci -Aqm 'modify dir1/a'
   $ hg co '.^'
   1 files updated, 0 files merged, 0 files removed, 0 files unresolved
   $ echo 6 > dir1/b