hghooks/convert-pushlog-db.py
author Connor Sheehan <sheehan@mozilla.com>
Tue, 24 Aug 2021 16:03:49 +0000
changeset 7726 177006efefb39dca50b218119cdb14d69c000085
parent 710 6b560ece0c3ac2eceda7c272467d8ccb2c24f4f1
permissions -rw-r--r--
clonebundles: remove `replicatesync` logging message from `test-clonebundles.t` (Bug 1727344) r=zeid This was removed after bug 1714463. Depends on D123482 Differential Revision: https://phabricator.services.mozilla.com/D123483

# Copyright (C) 2010 Mozilla Foundation
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# This script imports data from both the older flat-file pushlog format
# and the existing sqlite pushlog db into a new sqlite schema.
#
# Unfortunately, when changing between the older format and the db,
# the semantics changed somewhat. The older format used to record the
# *HEAD* revision as of a push, but the db records the *first* changeset
# in a group of pushed changes. To make life simpler, the new schema
# will record all changesets for a push, but we need to migrate the old data
# over.
# To do so, we grab all logged pushes from the old log and the db,
# and then for each logged push, if it is in the old log, then it's a head,
# so store all changes since the previous push with this push. Otherwise,
# it's a 'first changeset', so store all changes up until the next push
# with this push. At the end we'll have one entry in the new pushlog
# table for every push, and one entry per-changeset in the changesets
# table, mapped back to the pushlog table.

try:
    import sqlite3 as sqlite
except ImportError:
    from pysqlite2 import dbapi2 as sqlite

import sys
import os.path
import re
from datetime import datetime
import time
from calendar import timegm
from rfc822 import parsedate_tz, mktime_tz
from mercurial import ui, hg
from mercurial.node import hex

# One flat-file pushlog line: three double-quoted, tab-separated fields
# ("<40 hex char node>", "<user>", "<date>").
reader = re.compile(r'^"([a-f0-9]{40})"\t"([^\t]*)"\t"([^\t]*)"$')
def readlog(logfile):
    """Read a flat-file pushlog and return a list of (node, user, date).

    Entries are returned in the order they appear in the file. |date| is
    converted to a timestamp with parsedate_tz/mktime_tz.

    Returns an empty list if |logfile| cannot be opened. Blank lines are
    skipped. Raises ValueError, naming the file and line number, for a
    line that does not match the expected format or whose date cannot be
    parsed."""
    try:
        fd = open(logfile)
    except IOError:
        # no flat-file pushlog for this repo
        return []
    entries = []
    # try/finally so the file is closed even when a line fails to parse
    try:
        for (index, line) in enumerate(fd):
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            match = reader.match(line)
            if match is None:
                raise ValueError("%s:%d: malformed pushlog line: %r"
                                 % (logfile, index + 1, line))
            (node, user, date) = match.group(1, 2, 3)
            # parsedate_tz signals an unparseable date by returning None
            parsed = parsedate_tz(date)
            if parsed is None:
                raise ValueError("%s:%d: unparseable date: %r"
                                 % (logfile, index + 1, date))
            entries.append((node, user, mktime_tz(parsed)))
    finally:
        fd.close()
    return entries

def readpushdb(pushdb):
    """Read an old-format pushlog db and return a list of (node, user, date).

    Entries are returned ordered by the db's date column, ascending.
    |date| is converted from a "%Y-%m-%dT%H:%M:%SZ" string to a timestamp.

    Returns an empty list if the db file does not exist, cannot be
    queried (e.g. it has no pushlog table), or holds a date that does not
    parse."""
    # Connecting to a missing path would leave a stray empty database file
    # behind, so bail out before touching sqlite.
    if not os.path.exists(pushdb):
        return []
    try:
        conn = sqlite.connect(pushdb)
    except sqlite.Error:
        return []
    # nested try blocks: close the connection on every path
    try:
        try:
            res = conn.execute("SELECT node, user, date FROM pushlog ORDER BY date ASC")
            return [(node, user, timegm(time.strptime(date, "%Y-%m-%dT%H:%M:%SZ")))
                    for (node, user, date) in res]
        except (sqlite.Error, ValueError, TypeError):
            # Best effort, as before: an unreadable db or bad date yields
            # no entries. Unlike a bare except, this lets
            # KeyboardInterrupt/SystemExit propagate.
            return []
    finally:
        conn.close()

def nodeindb(pushdb, node):
    """Return True if |node| already has a row in the changesets table.

    |pushdb| is an open connection to the new pushlog db; |node| is the
    value to look up in the node column."""
    # execute() hands back a cursor, not the count itself, so the row must
    # be fetched before comparing.
    row = pushdb.execute("SELECT COUNT(*) from changesets WHERE node = ?", (node,)).fetchone()
    return row[0] > 0

### Main entrypoint

def _usage_exit():
    """Print the usage message to stderr and exit with status 1."""
    sys.stderr.write("Must specify a repository as the only parameter (/path/to/repo/)\n")
    sys.exit(1)

# Exactly one argument is expected: the path to the repository.
if len(sys.argv) != 2:
    _usage_exit()

repo_path = os.path.abspath(sys.argv[1])
if not os.path.exists(repo_path):
    _usage_exit()

try:
    repo = hg.repository(ui.ui(), repo_path)
except Exception:
    # Any failure to open the path as a repository is reported as a usage
    # error. Unlike a bare except, this does not trap
    # KeyboardInterrupt/SystemExit.
    _usage_exit()

# Three stores live under .hg/: the old text pushlog, the older sqlite
# pushlog db ("pushlog.db"), and the new-schema db ("pushlog2.db") that
# both are migrated into.
pushlog = os.path.join(repo_path, ".hg", "pushlog")
oldpushdb = "%s.db" % pushlog
pushdb = "%s2.db" % pushlog

# Open or create the new db, then make sure its tables and indexes exist.
# Every statement is IF NOT EXISTS, so an already-initialised db is left
# alone.
conn = sqlite.connect(pushdb)
for ddl in (
    "CREATE TABLE IF NOT EXISTS changesets (pushid INTEGER, rev INTEGER, node text)",
    "CREATE TABLE IF NOT EXISTS pushlog (id INTEGER PRIMARY KEY AUTOINCREMENT, user TEXT, date INTEGER)",
    "CREATE UNIQUE INDEX IF NOT EXISTS changeset_node ON changesets (node)",
    "CREATE UNIQUE INDEX IF NOT EXISTS changeset_rev ON changesets (rev)",
    "CREATE INDEX IF NOT EXISTS pushlog_date ON pushlog (date)",
    "CREATE INDEX IF NOT EXISTS pushlog_user ON pushlog (user)",
):
    conn.execute(ddl)

# Read all entries from both pushlogs
flatlogentries = readlog(pushlog)
# set of nodes seen in the flat-file log, for easy membership lookup
flatnodes = set([node for (node, user, date) in flatlogentries])
logentries = readpushdb(oldpushdb)
if not logentries:
    # just in case someone is importing from an old flatfile log
    logentries = flatlogentries

# Pair every entry with its changectx, then sort by revision #, just in
# case we have two pushes with the same date. A key function gives the
# same (stable) ordering as the old cmp-based sort.
logentries = [(node, repo.changectx(node), user, date) for (node, user, date) in logentries]
logentries.sort(key=lambda entry: entry[1].rev())

def _insertrevs(conn, repo, pushid, start, stop):
    """Record revisions start..stop-1 in the changesets table as part of
    push |pushid|. Returns the last revision number inserted, or None if
    the range was empty."""
    last = None
    for i in range(start, stop):
        c = repo.changectx(i)
        conn.execute("INSERT INTO changesets (pushid,rev,node) VALUES(?,?,?)",
                     (pushid, c.rev(), hex(c.node())))
        last = c.rev()
    return last

# start at the beginning
# lastrev tracks the highest revision already attributed to a push
lastrev = -1
for (index, (node, ctx, user, date)) in enumerate(logentries):
    rev = ctx.rev()
    if nodeindb(conn, node):
        # already in the database, move along
        lastrev = rev
        continue
    res = conn.execute("INSERT INTO pushlog (user, date) VALUES(?,?)",
                       (user, date))
    pushid = res.lastrowid
    # insert this change first
    conn.execute("INSERT INTO changesets (pushid,rev,node) VALUES(?,?,?)",
                 (pushid, rev, node))
    if node in flatnodes:
        # this was a HEAD revision: any revisions between the previous
        # push and this one were pushed along with it
        _insertrevs(conn, repo, pushid, lastrev + 1, rev)
        lastrev = rev
    else:
        # this was the first change in a set of changes pushed: any
        # revisions up to the next logged push (or up to and including tip
        # for the last entry) were pushed along with it
        # Advance lastrev even when nothing follows this change, so a later
        # HEAD-style entry does not try to claim this revision again.
        lastrev = rev
        if index + 1 < len(logentries):
            stop = logentries[index + 1][1].rev()
        elif 'tip' not in ctx.tags():
            # end of the list: we want everything up to and including tip
            stop = repo.changectx('tip').rev() + 1
        else:
            # this change is tip itself, nothing follows it
            stop = rev + 1
        last = _insertrevs(conn, repo, pushid, rev + 1, stop)
        if last is not None:
            lastrev = last

conn.commit()