The new DB schema, but slow. Checkpoint. newdbschema
authorBenjamin Smedberg <benjamin@smedbergs.us>
Fri, 25 Sep 2009 12:23:53 -0400
branchnewdbschema
changeset 20 5c4f22089101
parent 19 f2f8dbf94a36
child 21 b792fdef1b26
push id13
push userbsmedberg@mozilla.com
push dateTue, 29 Sep 2009 17:12:47 +0000
The new DB schema, but slow. Checkpoint.
warning-parser.py
warningstep.py
--- a/warning-parser.py
+++ b/warning-parser.py
@@ -5,26 +5,26 @@ Reads a build log on stdin. Parse warnin
 and store them in a sqlite database for later consumption.
 
 Uses gmake "Entering directory" and "Leaving directory" messages to
 keep track of the working directory. Resolves relative file paths against
 these working directories, and also follows symlinks back into the
 source tree.
 """
 
-import sys, os, sqlite3, re, mercurial.hg, mercurial.ui, mercurial.revlog, mercurial.node
+import sys, os, re, mercurial.hg, mercurial.ui, mercurial.revlog, mercurial.node
 
 (srcdir, blamedb, logfile) = sys.argv[1:]
 
 srcdir = os.path.realpath(srcdir) + '/'
 
 blamedb = sqlite3.connect(blamedb)
 blamecur = blamedb.cursor()
 
-repo = mercurial.hg.repository(mercurial.ui.ui(), srcdir)
+repo = mercurial.hg.repository(mercurial.ui.ui(quiet=True, interactive=False), srcdir)
 ctx = repo.changectx('.')
 
 emptyBlame = (None, None, None, None, None)
 
 def getCVSBlame(file, line):
     """
     Get CVS blame information for a specific line that is blamed on the initial
     import of CVS code to Mercurial.
--- a/warningstep.py
+++ b/warningstep.py
@@ -1,186 +1,177 @@
 from buildbot.process.buildstep import SUCCESS, FAILURE
 from buildbot.steps.shell import ShellCommand
-import sqlite3, os.path
+import MySQLdb, os.path, sys
+
+class Location(object):
+    def __init__(self, type, file, lineno, rev, who):
+        self.type = type
+        self.file = file
+        self.lineno = lineno
+        self.rev = rev
+        self.who = who
+
+    def getLocationID(self, db, cur):
+        cur.execute('''INSERT INTO locations (ltype, lfile, lline, lrev, lwho)
+                       VALUES (%s, %s, %s, %s, %s)
+                       ON DUPLICATE KEY UPDATE locationid = LAST_INSERT_ID(locationid)''',
+                    (self.type, self.file, self.lineno, self.rev, self.who))
+        print "locationid: %i" % (db.insert_id(),)
+        return db.insert_id()
+
+class WarningLine(object):
+    def __init__(self, data):
+        file, lineno, msg, blametype, blamewho, blamefile, blamerev, blameline = data
+
+        if blametype is None:
+            blametype = 'system'
+        if lineno is None:
+            lineno = -1
+        if blamewho is None:
+            blamewho = ''
+        if blamefile is None:
+            blamefile = file.rsplit('/', 1)[-1]
+        if blameline is None:
+            blameline = lineno
+        if blamerev is None:
+            blamerev = ''
+
+        self.blame = Location(blametype, blamefile, blameline, blamerev, blamewho)
+        self.loc = Location('build', file, lineno, '', '')
+        self.msg = msg
+
+    def getMessageID(self, db, cur):
+        cur.execute('''INSERT INTO messages (msg) VALUES (%s)
+                       ON DUPLICATE KEY UPDATE msgid = LAST_INSERT_ID(msgid)''',
+                    (self.msg,))
+        print "msgid: %i" % (db.insert_id(),)
+        return db.insert_id()
+
+class Warning(object):
+    def __init__(self):
+        self.lines = []
+
+    def append(self, wl):
+        self.lines.append(wl)
+
+    def findWarningID(self, db, cur):
+        cols = 'linecount, primaryline, ' + ','.join(['l%i, msg%i' % (i, i) for i in range(0, len(self.lines))])
+        values = '%s, 0, ' + ','.join(['%s, %s' for i in range(0, len(self.lines))])
+        p = [len(self.lines)]
+        for i in range(0, len(self.lines)):
+            p += [self.lines[i].blame.getLocationID(db, cur), self.lines[i].getMessageID(db, cur)]
+
+        q = '''INSERT INTO warnings (%s) VALUES (%s)
+               ON DUPLICATE KEY UPDATE warningid = LAST_INSERT_ID(warningid)''' % (cols, values)
+        cur.execute(q, p)
+        print "warningid: %i" % (db.insert_id(),)
+        return db.insert_id()
+
+    def insertIntoBuild(self, buildnumber, warningid, db, cur):
+        cur.execute('''INSERT INTO buildwarnings (buildnumber, warningid) VALUES (%s, %s)
+                       ON DUPLICATE KEY UPDATE wcount = wcount + 1''', (buildnumber, warningid))
+        if cur.rowcount == 1: # New row, not DUPLICATE KEY
+            p = [[buildnumber, warningid, i, self.lines[i].loc.getLocationID(db, cur)] for i in range(0, len(self.lines))]
+            cur.executemany('''INSERT INTO buildwarninglines (buildnumber, warningid, wline, locationid)
+                               VALUES (%s, %s, %s, %s)''', p)
 
 class WarningParserStep(ShellCommand):
     """
     A subclass of shellcommand used for running warning-parser.py.
 
     This parses the output and saves interesting information to the
     master-side sqlite database. It produces output about "new" warnings, etc.
 
     If the dbfile does not exist, it will be created the first time it is needed.
     """
 
-    _createSchema = """
-    CREATE TABLE builds (
-      buildnumber INTEGER NOT NULL PRIMARY KEY,
-      rev TEXT NOT NULL
-    );
-
-    CREATE TABLE warnings (
-      buildnumber INTEGER NOT NULL CONSTRAINT warnings_buildnumer_fkey REFERENCES builds (buildnumber),
-      signature TEXT NOT NULL,
-      count INTEGER NOT NULL,
-      CONSTRAINT warnings_pkey PRIMARY KEY (buildnumber, signature)
-    );
-
-    CREATE TABLE wlines (
-      buildnumber INTEGER NOT NULL,
-      signature TEXT NOT NULL,
-      wline INTEGER NOT NULL,
-      file TEXT NOT NULL,
-      lineno INTEGER,
-      msg TEXT NOT NULL,
-      blametype TEXT,
-      blamefile TEXT,
-      blamerev TEXT,
-      blameline INTEGER,
-      blamewho TEXT,
-      CONSTRAINT wlines_refwarning_fkey FOREIGN KEY (buildnumber, signature) REFERENCES warnings (buildnumber, signature),
-      CONSTRAINT wlines_pkey PRIMARY KEY (buildnumber, signature, wline)
-    );
-    """
-
-    def __init__(self, dbfile, **kwargs):
+    def __init__(self, connectargs, **kwargs):
         ShellCommand.__init__(self, **kwargs)
 
-        self.dbfile = dbfile
-        self.addFactoryArguments(dbfile=dbfile)
+        self.connectargs = connectargs
+        self.addFactoryArguments(connectargs=connectargs)
 
     def createSummary(self, log):
         stepresults = self.step_status.getBuild().getSteps()
         for sr in stepresults:
             result, str = sr.getResults()
             if result == FAILURE:
                 self.addCompleteLog('skipped', 'Skipped analyzing warnings because a prior step failed.')
                 return
 
-        createSchema = not os.path.exists(self.dbfile)
-
-        db = sqlite3.connect(self.dbfile)
-        cur = db.cursor()
-
-        if createSchema:
-            cur.executescript(self._createSchema)
-
-        cur.execute('PRAGMA synchronous = NORMAL');
+        comparison = self.parseLog(log.getText(), self.getProperty('buildnumber'), self.getProperty('got_revision'))
+        self.addCompleteLog('comparison', '\n'.join(comparison))
 
-        buildnumber = self.getProperty('buildnumber')
-
-        cur.execute('INSERT INTO builds (buildnumber, rev) VALUES ( ?, ? )',
-                    (buildnumber, self.getProperty('got_revision')))
-
-        def lineSignature(line):
-            """
-            Given a line tuple, return a string signature for the line. This signature is based
-            on the blame location if available, else the reported location.
-            """
-            file, lineno, msg, blametype, blamewho, blamepath, blamerev, blameline = line
+    def parseLog(self, logdata, buildnumber, revision):
+        # Use different connections for the warningID-finder (which does table locking and can't have a long-lived transaction),
+        # and the buildwarnings inserter, which should have a long-lived transaction
+        warndb = MySQLdb.connect(**self.connectargs)
+        warndb.autocommit(True)
+        warncur = warndb.cursor()
 
-            if blametype is None:
-                return "%s:%s:%s" % (file, lineno, msg[:30])
-
-            return "%s:%s:%s:%s:%s" % (blametype, blamepath, blamerev[:12], blameline, msg[:30])
-
-        def processWarning(lines):
-            """
-            process a warning. A warning is a list of lines. Each line is a tuple of
-            (file, lineno, msg, blametype, blamewho, blamepath, blamerev, blameline)
-            """
-
-            signature = "\t".join( (lineSignature(line) for line in lines) )
+        db = MySQLdb.connect(**self.connectargs)
+        dbcur = db.cursor()
 
-            cur.execute('''UPDATE warnings
-                           SET count = count + 1
-                           WHERE buildnumber = ? AND signature = ?''', (buildnumber, signature))
-            if cur.rowcount == 0:
-                cur.execute('INSERT INTO warnings (buildnumber, signature, count) VALUES (?, ?, ?)', (buildnumber, signature, 1))
+        dbcur.execute('INSERT INTO builds (buildnumber, rev) VALUES ( %s, %s )',
+                      (buildnumber, revision))
 
-                for i in xrange(0, len(lines)):
-                    file, lineno, msg, blametype, blamewho, blamepath, blamerev, blameline = lines[i]
-                    cur.execute('''INSERT INTO wlines (buildnumber, signature, wline, file, lineno, msg, blametype, blamefile, blamerev, blameline, blamewho)
-                                   VALUES             (?,           ?,         ?,     ?,    ?,      ?,   ?,         ?,         ?,        ?,         ?)''',
-                                (buildnumber, signature, i, file, lineno, msg, blametype, blamepath, blamerev, blameline, blamewho))
-
+        def processWarning(w):
+            warningid = w.findWarningID(warndb, warncur)
+            w.insertIntoBuild(buildnumber, warningid, db, dbcur)
 
         curwarning = None
-        for line in log.getText().split("\n"):
+        for line in logdata.split("\n"):
             if line.startswith('WARN-DB: '):
-                id, ord, file, lineno, msg, blametype, blamewho, blamepath, blamerev, blameline = eval(line[9:].strip())
+                data = eval(line[9:].strip())
+                id, ord = data[0:2]
+                wl = WarningLine(data[2:])
 
                 if ord == 0:
                     if curwarning is not None:
+                        print "processing #%s" % id
                         processWarning(curwarning)
-                    curwarning = []
+                    curwarning = Warning()
 
-                curwarning.append( (file, lineno, msg, blametype, blamewho, blamepath, blamerev, blameline) )
+                curwarning.append(wl)
 
         if curwarning is not None:
             processWarning(curwarning)
 
         db.commit()
 
         # A list of line to stick in the comparison log
         comparison = []
 
-        cur.execute('SELECT sum(count) FROM warnings WHERE buildnumber = ?', (buildnumber,))
-        wcount, = cur.fetchone()
-        comparison.append('TinderboxPrint:<a href="http://office.smedbergs.us:8080/build?id=%i">warn:%i</a>' % (buildnumber, wcount))
+        dbcur.execute('''SELECT COUNT(*) as unqiue, SUM(wcount) AS total
+                       FROM buildwarnings
+                       WHERE buildnumber = %s''', (buildnumber,))
+        unique, total = cur.fetchone()
+        comparison.append('TinderboxPrint:<a href="http://office.smedbergs.us:8080/build?id=%i">warn:%i</a>' % (buildnumber, unique))
+        comparison.append('TinderboxPrint:(%s total)' % (total))
 
-        cur.execute('SELECT max(buildnumber) FROM builds WHERE buildnumber < ?', (buildnumber,))
-        r = cur.fetchone()
+        dbcur.execute('SELECT MAX(buildnumber) FROM builds WHERE buildnumber < %s', (buildnumber,))
+        r = dbcur.fetchone()
         if r is None:
             comparison.append('no prior build found: skipping comparison')
         else:
             prevbuildnumber, = r
 
-            cur.execute('''SELECT signature
-                           FROM warnings
-                           WHERE buildnumber = ? AND
-                             NOT EXISTS (SELECT *
-                                         FROM warnings AS oldwarnings
-                                         WHERE oldwarnings.signature = warnings.signature AND
-                                           oldwarnings.buildnumber = ?)''', (buildnumber, prevbuildnumber))
-            cur2 = db.cursor()
-
-            wnewcount = 0
-
-            for signature, in cur:
-                wnewcount = wnewcount + 1
-                cur2.execute('''SELECT file, lineno, msg, blametype, blamefile, blamerev, blameline, blamewho
-                                FROM wlines
-                                WHERE buildnumber = ? AND signature = ?
-                                ORDER BY wline''',
-                             (buildnumber, signature))
-                comparison.append('NEW WARNING:')
+            cbcur.execute('''SELECT COUNT(*)
+                             FROM buildwarnings
+                             WHERE buildnumber = %s AND
+                               NOT EXISTS (SELECT *
+                                           FROM buildwarnings AS oldwarnings
+                                           WHERE oldwarnings.warningid = warnings.warningid AND
+                                             oldwarnings.buildnumber = %s)''', (buildnumber, prevbuildnumber))
+            wnewcount, = cur.fetchone()
+            if wnewcount > 0:
+                comparison.append('TinderboxPrint:<a href="http://office.smedbergs.us:8080/build?id=%i">warn-new:%i</a>' % (buildnumber, wnewcount))
 
-                for file, lineno, msg, blametype, blamefile, blamerev, blameline, blamewho in cur2:
-                    out = "  %s" % file
-                    if lineno is not None:
-                        out += ":%i" % lineno
-
-                    out += ": %s" % msg
-
-                    if blamewho is not None:
-                        out += ' - Blamed on %s' % blamewho
+        dbcur.close()
+        db.close()
 
-                    if blametype == 'cvs':
-                        out += ' -  http://bonsai.mozilla.org/cvsblame.cgi?file=mozilla/%s&rev=HG_REPO_INITIAL_IMPORT&mark=%s#%s' % (blamefile, blameline, blameline)
-                    elif blametype == 'hg':
-                        out += ' - http://hg.mozilla.org/mozilla-central/annotate/%s/%s#l%i' % (blamerev, blamefile, blameline)
-
-                    comparison.append(out)
-
-        if wnewcount > 0:
-            comparison.append('TinderboxPrint:<a href="http://office.smedbergs.us:8080/build?id=%i">warn-new:%i</a>' % (buildnumber, wnewcount))
-
-        self.addCompleteLog('comparison', '\n'.join(comparison))
-
-        cur.close()
-        db.close()
+        return comparison
 
     def evaluateCommand(self, cmd):
         if cmd.rc != 0:
             return FAILURE
         return SUCCESS