Extract bug info from commit messages
author Gregory Szorc <gps@mozilla.com>
Thu, 25 Jul 2013 19:33:36 -0700
changeset 35 741341c5d5cd2d3002826f51222ec62003276420
parent 34 9bc323bcc74f4fc7e84d11fe9d30c53929c937d8
child 36 6121cfdb8eec79a4df3f3f0cb1865d2e7bee1097
push id 19
push user gszorc@mozilla.com
push date Fri, 26 Jul 2013 02:33:46 +0000
Extract bug info from commit messages
__init__.py
mozautomation/changetracker.py
--- a/__init__.py
+++ b/__init__.py
@@ -93,24 +93,34 @@ This extension supports downloading push
 Pushlog data records who pushed changesets where and when.
 
 To use this functionality, you'll need to sync pushlog data from the server to
 the local machine. This facilitates rapid data lookup and can be done by
 running `hg pushlogsync`.
 
 Once pushlog data is synced, you can use `hg changesetpushes` to look up push
 information for a specific changeset.
+
+Bug Info
+========
+
+Information about bugs is extracted from commit messages via the `hg bugsync`
+command. Once information about bugs is extracted, you can look up information
+about specific bugs via `hg buginfo`.
 """
 
 import datetime
 import errno
 import os
+import re
 import shutil
 import sys
 
+from operator import methodcaller
+
 import mercurial.commands as commands
 
 from mercurial.i18n import _
 from mercurial.commands import (
     push,
 )
 from mercurial.error import (
     RepoError,
@@ -385,48 +395,41 @@ def syncpushinfo(ui, repo, tree=None, **
     """Synchronize the pushlog information for all known Gecko trees.
 
     The pushlog info contains who, when, and where individual changesets were
     pushed.
 
     After running this command, you can query for push information for specific
     changesets.
     """
-    tracker = ChangeTracker(repo.join('changetracker.db'))
-
     for i, tree in enumerate(sorted(REPOS)):
-        tracker.load_pushlog(tree)
+        repo.changetracker.load_pushlog(tree)
         ui.progress('pushlogsync', i, total=len(REPOS))
 
     ui.progress('pushlogsync', None)
 
 
-@command('changesetpushes',
-    [('a', 'all', False, _('Show all trees, not just release trees.'), '')],
-    _('hg changesetpushes REV'))
-def changesetpushes(ui, repo, rev, all=False, **opts):
-    """Display pushlog information for a changeset.
+def print_changeset_pushes(ui, repo, rev, all=False):
+    ctx = repo[rev]
+    node = ctx.node()
 
-    This command prints pushlog entries for a given changeset. It is used to
-    answer the question: how did a changeset propagate to all the trees.
-    """
-    ctx = repo[rev]
-    node = ctx.hex()
+    pushes = repo.changetracker.pushes_for_changeset(node)
+    pushes = [p for p in pushes if all or p[0] in RELEASE_TREES]
+    if not pushes:
+        ui.warn('No pushes recorded for changeset: ', str(ctx), '\n')
+        return 1
 
-    tracker = ChangeTracker(repo.join('changetracker.db'))
-    pushes = [p for p in tracker.pushes_for_changeset(node) if all or p[0] in
-        RELEASE_TREES]
     longest_tree = max(len(p[0]) for p in pushes) + 2
     longest_user = max(len(p[3]) for p in pushes) + 2
 
     ui.write(ctx.rev(), ':', str(ctx), ' ', ctx.description(), '\n')
 
     ui.write('Release ', 'Tree'.ljust(longest_tree), 'Date'.ljust(20),
             'Username'.ljust(longest_user), 'Build Info\n')
-    for tree, push_id, when, user, head_changeset in pushes:
+    for tree, push_id, when, user, head_node in pushes:
         releases = set()
         release = ''
         versions = {}
 
         if tree == 'beta':
             versions = repo._beta_releases()
         elif tree == 'release':
             versions = repo._release_releases()
@@ -434,22 +437,72 @@ def changesetpushes(ui, repo, rev, all=F
         for version, e in versions.items():
             vctx = repo[e[0]]
             if ctx.descendant(vctx):
                 releases.add(version)
 
         if len(releases):
             release = sorted(releases)[0]
 
-        tbpl = tbpl_url(tree, head_changeset[0:12])
+        tbpl = tbpl_url(tree, hex(head_node)[0:12])
         date = datetime.datetime.fromtimestamp(when)
         ui.write(release.ljust(8), tree.ljust(longest_tree), date.isoformat(),
             ' ', user.ljust(longest_user), tbpl or '', '\n')
 
 
+@command('changesetpushes',
+    [('a', 'all', False, _('Show all trees, not just release trees.'), '')],
+    _('hg changesetpushes REV'))
+def changesetpushes(ui, repo, rev, all=False, **opts):
+    """Display pushlog information for a changeset.
+
+    This command prints pushlog entries for a given changeset. It is used to
+    answer the question: how did a changeset propagate to all the trees.
+    """
+    print_changeset_pushes(ui, repo, rev, all=all)
+
+
+@command('bugsync', [], 'hg bugsync')
+def syncbuginfo(ui, repo, **opts):
+    """Synchronize bug info with the local database.
+
+    This command must be performed before `hg buginfo` to ensure the data is up
+    to date.
+    """
+    for rev in repo:
+        ui.progress('changeset', rev, total=len(repo))
+        ctx = repo[rev]
+
+        bugs = repo._bugs_in_description(ctx.description())
+        if not bugs:
+            continue
+        repo.changetracker.associate_bugs_with_changeset(bugs, ctx.node())
+
+    ui.progress('changeset', None)
+
+
+@command('buginfo',
+    [('a', 'all', False, _('Show all trees, not just release trees.'), '')],
+    _('hg buginfo [BUG] ...'))
+def buginfo(ui, repo, *bugs, **opts):
+    tracker = ChangeTracker(repo.join('changetracker.db'))
+
+    nodes = set()
+    for bug in bugs:
+        nodes |= set(tracker.changesets_with_bug(bug))
+
+    # Sorting by topological order would probably be preferred. This is quick
+    # and easy.
+    contexts = sorted([repo[node] for node in nodes], key=methodcaller('rev'))
+
+    for ctx in contexts:
+        print_changeset_pushes(ui, repo, ctx.rev(), all=opts['all'])
+        ui.write('\n')
+
+
 def critic_hook(ui, repo, node=None, **opts):
     critique(ui, repo, node=node, **opts)
     return 0
 
 
 class remoterefs(dict):
     """Represents a remote refs file."""
 
@@ -505,16 +558,23 @@ def reposetup(ui, repo):
     orig_pull = repo.pull
     orig_push = repo.push
 
     class remotestrackingrepo(repo.__class__):
         @repofilecache('remoterefs')
         def remoterefs(self):
             return remoterefs(self)
 
+        @util.propertycache
+        def changetracker(self):
+            try:
+                return ChangeTracker(self.join('changetracker.db'))
+            except Exception as e:
+                raise util.Abort(e.message)
+
         # Resolve remote ref symbols. For some reason, we need both lookup
         # and findtags implemented.
         def lookup(self, key):
             try:
                 key = self.remoterefs[key]
             except (KeyError, TypeError):
                 pass
 
@@ -524,24 +584,33 @@ def reposetup(ui, repo):
             tags, tagtypes = orig_findtags()
             tags.update(self.remoterefs)
 
             return tags, tagtypes
 
         def pull(self, remote, *args, **kwargs):
             # Pulls from known repositories will automatically update our
             # remote tracking references.
+            old_rev = len(self)
             res = orig_pull(remote, *args, **kwargs)
             lock = self.wlock()
             try:
                 tree = resolve_uri_to_tree(remote.url())
 
                 if tree:
                     self._update_remote_refs(remote, tree)
 
+                # Sync bug info.
+                for rev in self.changelog.revs(old_rev + 1):
+                    ctx = self[rev]
+                    bugs = self._bugs_in_description(ctx.description())
+                    if bugs:
+                        self.changetracker.associate_bugs_with_changeset(bugs,
+                            ctx.node())
+
             finally:
                 lock.release()
 
             return res
 
         def push(self, remote, *args, **kwargs):
             res = orig_push(remote, *args, **kwargs)
             lock = self.wlock()
@@ -620,12 +689,29 @@ def reposetup(ui, repo):
                 version = '%s.%s' % (major, minor)
                 if marker:
                     version += '%s%s' % (marker, after)
 
                 d[version] = (key, node, major, minor, marker or None, after or None)
 
             return d
 
+        BUG_RE = re.compile(r'''# bug followed by any sequence of numbers, or
+                                # a standalone sequence of numbers
+                             (
+                               (?:
+                                 bug |
+                                 b= |
+                                 # a sequence of 5+ numbers preceded by whitespace
+                                 (?=\b\#?\d{5,}) |
+                                 # numbers at the very beginning
+                                 ^(?=\d)
+                               )
+                               (?:\s*\#?)(\d+)
+                             )''', re.I | re.X)
+
+        def _bugs_in_description(self, desc):
+            return [int(m[1]) for m in self.BUG_RE.findall(desc)]
+
     repo.__class__ = remotestrackingrepo
     if not ui.configbool('mozext', 'noautocritic'):
         ui.setconfig('hooks', 'commit.critic', critic_hook)
         ui.setconfig('hooks', 'qrefresh.critic', critic_hook)
--- a/mozautomation/changetracker.py
+++ b/mozautomation/changetracker.py
@@ -1,59 +1,84 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 from __future__ import unicode_literals
 
+import binascii
+import os
 import sqlite3
 
 from .repository import (
     MercurialRepository,
     resolve_trees_to_uris,
 )
 
 
 class ChangeTracker(object):
     """Data store for tracking changes and bugs and repository events."""
 
     def __init__(self, path):
+        self.path = path
+        self.created = False
+        if not os.path.exists(path):
+            self.created = True
+
         self._db = sqlite3.connect(path)
 
-        if not self._schema_current():
-            self._create_schema()
+        # We don't care about data loss because all data can be reconstructed
+        # relatively easily.
+        self._db.execute('PRAGMA SYNCHRONOUS=OFF')
+        self._db.execute('PRAGMA JOURNAL_MODE=WAL')
+
+        self._create_schema(self._schema_version)
 
-    def _schema_current(self):
-        return self._db.execute('PRAGMA user_version').fetchone()[0] == 1
+    @property
+    def _schema_version(self):
+        return self._db.execute('PRAGMA user_version').fetchone()[0]
 
-    def _create_schema(self):
+    def _create_schema(self, existing):
+        if existing < 2 and not self.created:
+            raise Exception("Incompatible local database detected. Delete "
+                "database file and try again: %s" % self.path)
+
         with self._db:
             self._db.execute('CREATE TABLE IF NOT EXISTS trees ('
                 'id INTEGER PRIMARY KEY AUTOINCREMENT, '
                 'name TEXT, '
                 'url TEXT '
                 ')')
 
             self._db.execute('CREATE TABLE IF NOT EXISTS pushes ('
                 'push_id INTEGER, '
                 'tree_id INTEGER, '
                 'time INTEGER, '
                 'user TEXT, '
                 'PRIMARY KEY (push_id, tree_id) '
                 ')')
 
             self._db.execute('CREATE TABLE IF NOT EXISTS changeset_pushes ('
-                'changeset TEXT, '
-                'head_changeset TEXT, '
+                'changeset BLOB, '
+                'head_changeset BLOB, '
                 'push_id INTEGER, '
                 'tree_id INTEGER, '
                 'UNIQUE (changeset, tree_id) '
                 ')')
 
-            self._db.execute('PRAGMA user_version=1')
+            self._db.execute('CREATE TABLE IF NOT EXISTS bug_changesets ('
+                'bug INTEGER, '
+                'changeset BLOB, '
+                'UNIQUE (bug, changeset) '
+                ')')
+
+            self._db.execute('CREATE INDEX IF NOT EXISTS i_bug_changesets_bug '
+                'ON bug_changesets (bug)')
+
+            self._db.execute('PRAGMA user_version=2')
 
     def tree_id(self, tree, url=None):
         with self._db:
             field = self._db.execute('SELECT id FROM trees WHERE name=? LIMIT 1',
                 [tree]).fetchone()
 
             if field:
                 return field[0]
@@ -76,23 +101,42 @@ class ChangeTracker(object):
         last_push_id = last_push_id[0] if last_push_id else -1
 
         with self._db:
             for push_id, push in repo.push_info(start_id=last_push_id + 1):
                 self._db.execute('INSERT INTO pushes (push_id, tree_id, time, '
                 'user) VALUES (?, ?, ?, ?)', [push_id, tree_id, push['date'],
                     push['user']])
 
-                head = push['changesets'][0]
+                head = buffer(binascii.unhexlify(push['changesets'][0]))
 
-                for changeset in push['changesets']:
-                    self._db.execute('INSERT INTO changeset_pushes VALUES '
-                        '(?, ?, ?, ?)', [changeset, head, push_id, tree_id])
+                params = [(buffer(binascii.unhexlify(c)), head, push_id,
+                    tree_id) for c in push['changesets']]
+                self._db.executemany('INSERT INTO changeset_pushes VALUES '
+                    '(?, ?, ?, ?)', params)
 
     def pushes_for_changeset(self, changeset):
         for row in self._db.execute('SELECT trees.name, pushes.push_id, '
             'pushes.time, pushes.user, changeset_pushes.head_changeset '
             'FROM trees, pushes, changeset_pushes '
             'WHERE pushes.push_id = changeset_pushes.push_id AND '
             'pushes.tree_id = changeset_pushes.tree_id AND '
             'trees.id = pushes.tree_id AND changeset_pushes.changeset=? '
-            'ORDER BY pushes.time ASC', [changeset]):
+            'ORDER BY pushes.time ASC', [buffer(changeset)]):
             yield row
+
+    def associate_bugs_with_changeset(self, bugs, changeset):
+        """Associate a numeric bug number with a changeset.
+
+        This facilitates rapidly looking up changesets associated with
+        bugs.
+        """
+        if len(changeset) != 20:
+            raise ValueError('Expected binary changesets, not hex.')
+
+        with self._db:
+            self._db.executemany('INSERT OR REPLACE INTO bug_changesets '
+                'VALUES (?, ?)', [(bug, buffer(changeset)) for bug in bugs])
+
+    def changesets_with_bug(self, bug):
+        for row in self._db.execute('SELECT changeset FROM bug_changesets WHERE '
+            'bug = ?', [bug]):
+            yield str(row[0])