verify: check directory manifests
author Martin von Zweigbergk <martinvonz@google.com>
Sun, 07 Feb 2016 21:13:24 -0800
changeset 30315 7297e9e13a8a5132ba1bb78c51cf8ab962048df8
parent 30314 a4692267bc2d2b5809bd5eb449030a9115e7ad9a
child 30316 962921c330b00d53accdb3d4075cfe8e6d2ce440
push id 187
push user gszorc@mozilla.com
push date Sun, 28 Feb 2016 01:45:48 +0000
verify: check directory manifests. In repos with treemanifests, there is no specific verification of directory manifest revlogs. 'hg verify' simply collects all file nodes by reading each manifest delta. With treemanifests, that means calling manifest._slowreaddelta(). If there are missing revlog entries in a subdirectory revlog, 'hg verify' will simply report the exception that occurred while trying to read the root manifest: "manifest@0: reading delta 1700e2e92882: meta/b/00manifest.i@67688a370455: no node". This patch changes the verify code to load only the root manifest at first and verify all revisions of it, then verify all revisions of each direct subdirectory, and so on, recursively. The above message becomes "b/@0: parent-directory manifest refers to unknown revision 67688a370455". Since the new algorithm reads a single revlog at a time and in order, 'hg verify' on a treemanifest version of the hg core repo goes from ~50s to ~14s. As expected, there is no significant difference on a repo with flat manifests.
mercurial/manifest.py
mercurial/verify.py
tests/test-treemanifest.t
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -320,16 +320,19 @@ class manifestdict(object):
     def copy(self):
         c = manifestdict()
         c._lm = self._lm.copy()
         return c
 
     def iteritems(self):
         return (x[:2] for x in self._lm.iterentries())
 
+    def iterentries(self):
+        return self._lm.iterentries()
+
     def text(self, usemanifestv2=False):
         if usemanifestv2:
             return _textv2(self._lm.iterentries())
         else:
             # use (probably) native version for v1
             return self._lm.text()
 
     def fastdelta(self, base, changes):
@@ -915,17 +918,18 @@ class manifest(revlog.revlog):
             self._dirlogcache = {'': self}
 
     def _newmanifest(self, data=''):
         if self._treeinmem:
             return treemanifest(self._dir, data)
         return manifestdict(data)
 
     def dirlog(self, dir):
-        assert self._treeondisk
+        if dir:
+            assert self._treeondisk
         if dir not in self._dirlogcache:
             self._dirlogcache[dir] = manifest(self.opener, dir,
                                               self._dirlogcache)
         return self._dirlogcache[dir]
 
     def _slowreaddelta(self, node):
         r0 = self.deltaparent(self.rev(node))
         m0 = self.read(self.node(r0))
@@ -940,16 +944,32 @@ class manifest(revlog.revlog):
 
     def readdelta(self, node):
         if self._usemanifestv2 or self._treeondisk:
             return self._slowreaddelta(node)
         r = self.rev(node)
         d = mdiff.patchtext(self.revdiff(self.deltaparent(r), r))
         return self._newmanifest(d)
 
+    def readshallowdelta(self, node):
+        '''For flat manifests, this is the same as readdelta(). For
+        treemanifests, this will read the delta for this revlog's directory,
+        without recursively reading subdirectory manifests. Instead, any
+        subdirectory entry will be reported as it appears in the manifests, i.e.
+        the subdirectory will be reported among files and distinguished only by
+        its 't' flag.'''
+        if not self._treeondisk:
+            return self.readdelta(node)
+        if self._usemanifestv2:
+            raise error.Abort(
+                "readshallowdelta() not implemented for manifestv2")
+        r = self.rev(node)
+        d = mdiff.patchtext(self.revdiff(self.deltaparent(r), r))
+        return manifestdict(d)
+
     def readfast(self, node):
         '''use the faster of readdelta or read
 
         This will return a manifest which is either only the files
         added/modified relative to p1, or all files in the
         manifest. Which one is returned depends on the codepath used
         to retrieve the data.
         '''
--- a/mercurial/verify.py
+++ b/mercurial/verify.py
@@ -192,56 +192,83 @@ class verifier(object):
                     if _validpath(repo, f):
                         filelinkrevs.setdefault(_normpath(f), []).append(i)
             except Exception as inst:
                 self.refersmf = True
                 self.exc(i, _("unpacking changeset %s") % short(n), inst)
         ui.progress(_('checking'), None)
         return mflinkrevs, filelinkrevs
 
-    def _verifymanifest(self, mflinkrevs):
+    def _verifymanifest(self, mflinkrevs, dir=""):
         repo = self.repo
         ui = self.ui
-        mf = self.repo.manifest
+        mf = self.repo.manifest.dirlog(dir)
 
-        ui.status(_("checking manifests\n"))
+        if not dir:
+            self.ui.status(_("checking manifests\n"))
+
         filenodes = {}
+        subdirnodes = {}
         seen = {}
         label = "manifest"
+        if dir:
+            label = dir
         if self.refersmf:
             # Do not check manifest if there are only changelog entries with
             # null manifests.
             self.checklog(mf, label, 0)
         total = len(mf)
         for i in mf:
-            ui.progress(_('checking'), i, total=total, unit=_('manifests'))
+            if not dir:
+                ui.progress(_('checking'), i, total=total, unit=_('manifests'))
             n = mf.node(i)
             lr = self.checkentry(mf, i, n, seen, mflinkrevs.get(n, []), label)
             if n in mflinkrevs:
                 del mflinkrevs[n]
+            elif dir:
+                self.err(lr, _("%s not in parent-directory manifest") %
+                         short(n), label)
             else:
                 self.err(lr, _("%s not in changesets") % short(n), label)
 
             try:
-                for f, fn in mf.readdelta(n).iteritems():
+                for f, fn, fl in mf.readshallowdelta(n).iterentries():
                     if not f:
-                        self.err(lr, _("file without name in manifest"))
-                    elif f != "/dev/null": # ignore this in very old repos
-                        if _validpath(repo, f):
-                            filenodes.setdefault(
-                                _normpath(f), {}).setdefault(fn, lr)
+                        self.err(lr, _("entry without name in manifest"))
+                    elif f == "/dev/null":  # ignore this in very old repos
+                        continue
+                    fullpath = dir + _normpath(f)
+                    if not _validpath(repo, fullpath):
+                        continue
+                    if fl == 't':
+                        subdirnodes.setdefault(fullpath + '/', {}).setdefault(
+                            fn, []).append(lr)
+                    else:
+                        filenodes.setdefault(fullpath, {}).setdefault(fn, lr)
             except Exception as inst:
                 self.exc(lr, _("reading delta %s") % short(n), inst, label)
-        ui.progress(_('checking'), None)
+        if not dir:
+            ui.progress(_('checking'), None)
 
         if self.havemf:
             for c, m in sorted([(c, m) for m in mflinkrevs
                         for c in mflinkrevs[m]]):
-                self.err(c, _("changeset refers to unknown revision %s") %
-                         short(m), label)
+                if dir:
+                    self.err(c, _("parent-directory manifest refers to unknown "
+                                  "revision %s") % short(m), label)
+                else:
+                    self.err(c, _("changeset refers to unknown revision %s") %
+                             short(m), label)
+
+        if not dir and subdirnodes:
+            self.ui.status(_("checking directory manifests\n"))
+        for subdir, linkrevs in subdirnodes.iteritems():
+            subdirfilenodes = self._verifymanifest(linkrevs, subdir)
+            for f, onefilenodes in subdirfilenodes.iteritems():
+                filenodes.setdefault(f, {}).update(onefilenodes)
 
         return filenodes
 
     def _crosscheckfiles(self, filelinkrevs, filenodes):
         repo = self.repo
         ui = self.ui
         ui.status(_("crosschecking files in changesets and manifests\n"))
 
--- a/tests/test-treemanifest.t
+++ b/tests/test-treemanifest.t
@@ -466,16 +466,17 @@ Add some more changes to the deep repo
   $ hg ci -m narf
   $ echo troz >> b/bar/orange/fly/gnat.py
   $ hg ci -m troz
 
 Verify works
   $ hg verify
   checking changesets
   checking manifests
+  checking directory manifests
   crosschecking files in changesets and manifests
   checking files
   8 files, 3 changesets, 10 total revisions
 
 Dirlogs are included in fncache
   $ grep meta/.A/00manifest.i .hg/store/fncache
   meta/.A/00manifest.i
 
@@ -498,16 +499,66 @@ Rebuilt fncache includes dirlogs
   adding meta/b/foo/00manifest.i
   adding meta/b/foo/apple/00manifest.i
   adding meta/b/foo/apple/bees/00manifest.i
   16 items added, 0 removed from fncache
 
 Finish first server
   $ killdaemons.py
 
+Back up the recently added revlogs
+  $ cp -r .hg/store .hg/store-newcopy
+
+Verify reports missing dirlog
+  $ rm .hg/store/meta/b/00manifest.*
+  $ hg verify
+  checking changesets
+  checking manifests
+  checking directory manifests
+   0: empty or missing b/
+   b/@0: parent-directory manifest refers to unknown revision 67688a370455
+   b/@1: parent-directory manifest refers to unknown revision f38e85d334c5
+   b/@2: parent-directory manifest refers to unknown revision 99c9792fd4b0
+  crosschecking files in changesets and manifests
+   b/bar/fruits.txt@0: in changeset but not in manifest
+   b/bar/orange/fly/gnat.py@0: in changeset but not in manifest
+   b/bar/orange/fly/housefly.txt@0: in changeset but not in manifest
+   b/foo/apple/bees/flower.py@0: in changeset but not in manifest
+  checking files
+  8 files, 3 changesets, 10 total revisions
+  8 integrity errors encountered!
+  (first damaged changeset appears to be 0)
+  [1]
+  $ cp -rT .hg/store-newcopy .hg/store
+
+Verify reports missing dirlog entry
+  $ mv -f .hg/store-copy/meta/b/00manifest.* .hg/store/meta/b/
+  $ hg verify
+  checking changesets
+  checking manifests
+  checking directory manifests
+   b/@1: parent-directory manifest refers to unknown revision f38e85d334c5
+   b/@2: parent-directory manifest refers to unknown revision 99c9792fd4b0
+   b/bar/@?: rev 1 points to unexpected changeset 1
+   b/bar/@?: 5e03c4ee5e4a not in parent-directory manifest
+   b/bar/@?: rev 2 points to unexpected changeset 2
+   b/bar/@?: 1b16940d66d6 not in parent-directory manifest
+   b/bar/orange/@?: rev 1 points to unexpected changeset 2
+   (expected None)
+   b/bar/orange/fly/@?: rev 1 points to unexpected changeset 2
+   (expected None)
+  crosschecking files in changesets and manifests
+  checking files
+  8 files, 3 changesets, 10 total revisions
+  2 warnings encountered!
+  8 integrity errors encountered!
+  (first damaged changeset appears to be 1)
+  [1]
+  $ cp -rT .hg/store-newcopy .hg/store
+
 Test cloning a treemanifest repo over http.
   $ hg serve -p $HGPORT -d --pid-file=hg.pid --errorlog=errors.log
   $ cat hg.pid >> $DAEMON_PIDS
   $ cd ..
 We can clone even with the knob turned off and we'll get a treemanifest repo.
   $ hg clone --config experimental.treemanifest=False \
   >   --config experimental.changegroup3=True \
   >   http://localhost:$HGPORT deepclone
@@ -542,16 +593,17 @@ Tree manifest revlogs exist.
   deepclone/.hg/store/meta/b/foo/apple/bees/00manifest.i
   deepclone/.hg/store/meta/~2e_a
   deepclone/.hg/store/meta/~2e_a/00manifest.i
 Verify passes.
   $ cd deepclone
   $ hg verify
   checking changesets
   checking manifests
+  checking directory manifests
   crosschecking files in changesets and manifests
   checking files
   8 files, 3 changesets, 10 total revisions
   $ cd ..
 
 Create clones using old repo formats to use in later tests
   $ hg clone --config format.usestore=False \
   >   --config experimental.changegroup3=True \
@@ -586,79 +638,85 @@ Create clones using old repo formats to 
   $ cat hg.pid >> $DAEMON_PIDS
   $ cd ..
 
 Local clone with basicstore
   $ hg clone -U deeprepo-basicstore local-clone-basicstore
   $ hg -R local-clone-basicstore verify
   checking changesets
   checking manifests
+  checking directory manifests
   crosschecking files in changesets and manifests
   checking files
   8 files, 3 changesets, 10 total revisions
 
 Local clone with encodedstore
   $ hg clone -U deeprepo-encodedstore local-clone-encodedstore
   $ hg -R local-clone-encodedstore verify
   checking changesets
   checking manifests
+  checking directory manifests
   crosschecking files in changesets and manifests
   checking files
   8 files, 3 changesets, 10 total revisions
 
 Local clone with fncachestore
   $ hg clone -U deeprepo local-clone-fncachestore
   $ hg -R local-clone-fncachestore verify
   checking changesets
   checking manifests
+  checking directory manifests
   crosschecking files in changesets and manifests
   checking files
   8 files, 3 changesets, 10 total revisions
 
 Stream clone with basicstore
   $ hg clone --config experimental.changegroup3=True --uncompressed -U \
   >   http://localhost:$HGPORT1 stream-clone-basicstore
   streaming all changes
   18 files to transfer, * of data (glob)
   transferred * in * seconds (*) (glob)
   searching for changes
   no changes found
   $ hg -R stream-clone-basicstore verify
   checking changesets
   checking manifests
+  checking directory manifests
   crosschecking files in changesets and manifests
   checking files
   8 files, 3 changesets, 10 total revisions
 
 Stream clone with encodedstore
   $ hg clone --config experimental.changegroup3=True --uncompressed -U \
   >   http://localhost:$HGPORT2 stream-clone-encodedstore
   streaming all changes
   18 files to transfer, * of data (glob)
   transferred * in * seconds (*) (glob)
   searching for changes
   no changes found
   $ hg -R stream-clone-encodedstore verify
   checking changesets
   checking manifests
+  checking directory manifests
   crosschecking files in changesets and manifests
   checking files
   8 files, 3 changesets, 10 total revisions
 
 Stream clone with fncachestore
   $ hg clone --config experimental.changegroup3=True --uncompressed -U \
   >   http://localhost:$HGPORT stream-clone-fncachestore
   streaming all changes
   18 files to transfer, * of data (glob)
   transferred * in * seconds (*) (glob)
   searching for changes
   no changes found
   $ hg -R stream-clone-fncachestore verify
   checking changesets
   checking manifests
+  checking directory manifests
   crosschecking files in changesets and manifests
   checking files
   8 files, 3 changesets, 10 total revisions
 
 Packed bundle
   $ hg -R deeprepo debugcreatestreamclonebundle repo-packed.hg
   writing 3349 bytes for 18 files
   bundle requirements: generaldelta, revlogv1, treemanifest