setdiscovery: avoid a full changelog graph traversal
author: Siddharth Agarwal <sid0@fb.com>
Sun, 16 Nov 2014 00:40:29 -0800
changeset 23343 f8a2647fe020da7bf68e12083f10bef9183e7ee3
parent 23342 f710644e1ce99a9f422b5a3a7b53d3dae2af36eb
child 23347 49cdf51cbc6c42bb6785c1b3f64b7ed8258f79d3
push id: 1
push user: gszorc@mozilla.com
push date: Wed, 18 Mar 2015 16:34:57 +0000
setdiscovery: avoid a full changelog graph traversal

We were definitely being suboptimal here: we were constructing two full sets, one with the full set of common nodes (i.e. a graph traversal) and one with all nodes. Then we subtract one set from the other. This whole process is O(commits) and causes discovery to be significantly slower than it should be.

Instead, keep track of common incrementally and keep undecided as small as possible. This makes discovery massively faster on large repos: on one such repo, 'hg debugdiscovery' over SSH with one commit missing on the client and five on the server went from 4.5 seconds to 1.5. (An 'hg debugdiscovery' with no commits missing on the client, i.e. connection startup time, was 1.2 seconds.)
mercurial/setdiscovery.py
--- a/mercurial/setdiscovery.py
+++ b/mercurial/setdiscovery.py
@@ -35,17 +35,17 @@ sample of missing, start by sending all 
 repo is a subset, you computed the answer in one round trip.
 
 Then you can do something similar to the bisecting strategy used when
 finding faulty changesets. Instead of random samples, you can try picking
 nodes that will maximize the number of nodes that will be
 classified with it (since all ancestors or descendants will be marked as well).
 """
 
-from node import nullid
+from node import nullid, nullrev
 from i18n import _
 import random
 import util, dagutil
 
 def _updatesample(dag, nodes, sample, always, quicksamplesize=0):
     # if nodes is empty we scan the entire graph
     if nodes:
         heads = dag.headsetofconnecteds(nodes)
@@ -172,46 +172,42 @@ def findcommonheads(ui, local, remote,
         ownheadhashes = dag.externalizeall(ownheads)
         return (ownheadhashes, True, srvheadhashes,)
 
     # full blown discovery
 
     # own nodes where I don't know if remote knows them
     undecided = dag.nodeset()
     # own nodes I know we both know
-    common = set()
+    # treat remote heads (and maybe own heads) as a first implicit sample
+    # response
+    common = cl.incrementalmissingrevs(srvheads)
+    commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
+    common.addbases(commoninsample)
+    undecided = set(common.missingancestors(ownheads))
     # own nodes I know remote lacks
     missing = set()
 
-    # treat remote heads (and maybe own heads) as a first implicit sample
-    # response
-    common.update(dag.ancestorset(srvheads))
-    undecided.difference_update(common)
-
     full = False
     while undecided:
 
         if sample:
-            commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
-            common.update(dag.ancestorset(commoninsample, common))
-
             missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]
             missing.update(dag.descendantset(missinginsample, missing))
 
             undecided.difference_update(missing)
-            undecided.difference_update(common)
 
         if not undecided:
             break
 
         if full:
             ui.note(_("sampling from both directions\n"))
             sample = _takefullsample(dag, undecided, size=fullsamplesize)
             targetsize = fullsamplesize
-        elif common:
+        elif common.hasbases():
             # use cheapish initial sample
             ui.debug("taking initial sample\n")
             sample = _takefullsample(dag, undecided, size=fullsamplesize)
             targetsize = fullsamplesize
         else:
             # use even cheaper initial sample
             ui.debug("taking quick initial sample\n")
             sample = _takequicksample(dag, undecided, size=initialsamplesize,
@@ -223,17 +219,27 @@ def findcommonheads(ui, local, remote,
         ui.progress(_('searching'), roundtrips, unit=_('queries'))
         ui.debug("query %i; still undecided: %i, sample size is: %i\n"
                  % (roundtrips, len(undecided), len(sample)))
         # indices between sample and externalized version must match
         sample = list(sample)
         yesno = remote.known(dag.externalizeall(sample))
         full = True
 
-    result = dag.headsetofconnecteds(common)
+        if sample:
+            commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
+            common.addbases(commoninsample)
+            common.removeancestorsfrom(undecided)
+
+    # heads(common) == heads(common.bases) since common represents common.bases
+    # and all its ancestors
+    result = dag.headsetofconnecteds(common.bases)
+    # common.bases can include nullrev, but our contract requires us to not
+    # return any heads in that case, so discard that
+    result.discard(nullrev)
     ui.progress(_('searching'), None)
     ui.debug("%d total queries\n" % roundtrips)
 
     if not result and srvheadhashes != [nullid]:
         if abortwhenunrelated:
             raise util.Abort(_("repository is unrelated"))
         else:
             ui.warn(_("warning: repository is unrelated\n"))