Use regular expressions for findtoken as well... shaves another 20% off parsing time, but parsing is still the long pole by far.

author:     Benjamin Smedberg <benjamin@smedbergs.us>
date:       Fri, 13 Feb 2009 09:50:07 -0500
branch:     parser-perf
changeset:  114:e308b8e36c260cc5cb2103515a7146893fe1656a
parent:     113:df9087ec3e089c3f5126999731d6b40cd1cc318f
child:      116:9ff6065d262ddb313e16b0facbff1edadb97b1ee
push id:    62
push user:  bsmedberg@mozilla.com
push date:  Fri, 13 Feb 2009 14:50:18 +0000
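The heart of the patch: findtoken no longer slices the input once per candidate token. Each TokenList now precompiles a single alternation regex (wslist) whose first group captures the token and whose second group requires, and consumes, trailing whitespace or end-of-data, so one match() call replaces the Python-level loop. A minimal standalone sketch of the idea, using hypothetical names rather than pymake's actual classes:

import re

def compile_wslist(tokens):
    # One alternation for the whole token list; group 1 is the token,
    # group 2 requires (and consumes) whitespace or end-of-data.
    escaped = '|'.join(re.escape(t) for t in tokens)
    return re.compile('(' + escaped + r')(\s+|$)')

wslist = compile_wslist(('define', 'endef'))

def findtoken(data, offset):
    # A single C-level regex match replaces a per-token Python loop.
    m = wslist.match(data, offset)
    if m is not None:
        return m.group(1), m.end(0)
    return None, offset

print(findtoken('define FOO', 0))   # -> ('define', 7)
print(findtoken('definitely', 0))   # -> (None, 0)

One caveat: Python's re alternation is leftmost-first, not longest-match, so where one token is a prefix of another the longer one must appear earlier in the list; note that functiontokens below is sorted by length, descending, before its TokenList is built.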
pymake/parser.py
--- a/pymake/parser.py
+++ b/pymake/parser.py
@@ -137,32 +137,35 @@ class Data(object):
         """
         while offset < len(self.data):
             c = self.data[offset]
             if not c.isspace():
                 break
             offset += 1
         return offset
 
-    def findtoken(self, o, tlist, needws):
+    def findtoken(self, o, tlist, skipws):
         """
         Check data at position o for any of the tokens in tlist followed by whitespace
         or end-of-data.
 
         If a token is found, skip trailing whitespace and return (token, newoffset).
         Otherwise return None, oldoffset
         """
-        for t in tlist:
-            end = o + len(t)
-            if self.data[o:end] == t:
-                if not needws:
-                    return t, end
-                elif end == len(self.data) or self.data[end].isspace():
-                    end = self.skipwhitespace(end)
-                    return t, end
+        assert isinstance(tlist, TokenList)
+
+        if skipws:
+            m = tlist.wslist.match(self.data, pos=o)
+            if m is not None:
+                return m.group(1), m.end(0)
+        else:
+            m = tlist.simplere.match(self.data, pos=o)
+            if m is not None:
+                return m.group(0), m.end(0)
+
         return None, o
 
 makefiletokensescaped = [r'\\\\#', r'\\#', '\\\\\n', '\\\\\\s+\\\\\n', r'\\.', '#', '\n']
 continuationtokensescaped = ['\\\\\n', r'\\.', '\n']
 
 class TokenList(object):
     """
     A list of tokens to search. Because these lists are static, we can perform
@@ -170,16 +173,18 @@ class TokenList(object):
     """
     def __init__(self, tlist):
         self.emptylist = len(tlist) == 0
         escapedlist = [re.escape(t) for t in tlist]
         self.simplere = re.compile('|'.join(escapedlist))
         self.makefilere = re.compile('|'.join(escapedlist + makefiletokensescaped))
         self.continuationre = re.compile('|'.join(escapedlist + continuationtokensescaped))
 
+        self.wslist = re.compile('(' + '|'.join(escapedlist) + ')' + r'(\s+|$)')
+
     imap = {}
 
     @staticmethod
     def get(s):
         if s in TokenList.imap:
             return TokenList.imap[s]
 
         i = TokenList(s)
@@ -279,32 +284,32 @@ def itercommandchars(d, offset, tokenlis
         end = m.end(0)
 
         if token == '\n':
             assert end == len(d.data)
             yield d.data[offset:start], None, None, None
             return
 
         if token == '\\\n':
-            print "found newline"
             yield d.data[offset:end], None, None, None
             d.readline()
             offset = end
-            print "new len: %s offset: %s" % (len(d.data), offset)
             if offset < len(d.data) and d.data[offset] == '\t':
                 offset += 1
             continue
         
         if token.startswith('\\'):
             yield d.data[offset:end], None, None, None
         else:
             yield d.data[offset:start], token, start, end
 
         offset = end
 
+definestokenlist = TokenList.get(('define', 'endef'))
+
 def iterdefinechars(d, offset, tokenlist):
     """
     A Data generator yielding (char, offset). It will process define/endef
     according to define parsing rules.
     """
 
     def checkfortoken(o):
         """
@@ -313,17 +318,17 @@ def iterdefinechars(d, offset, tokenlist
         """
         if o >= len(d.data):
             return 0
 
         if d.data[o] == '\t':
             return 0
 
         o = d.skipwhitespace(o)
-        token, o = d.findtoken(o, ('define', 'endef'), True)
+        token, o = d.findtoken(o, definestokenlist, True)
         if token == 'define':
             return 1
 
         if token == 'endef':
             return -1
         
         return 0
 
@@ -459,19 +464,21 @@ def parsecommandlineargs(makefile, args)
             setvariable(makefile.variables, makefile.variables,
                         vname, t, d, 0, source=data.Variables.SOURCE_COMMANDLINE,
                         iterfunc=iterdata)
         else:
             r.append(a)
 
     return r
 
+eqargstokenlist = TokenList.get(('(', "'", '"'))
+
 def ifeq(d, offset, makefile):
     # the variety of formats for this directive is rather maddening
-    token, offset = d.findtoken(offset, ('(', "'", '"'), False)
+    token, offset = d.findtoken(offset, eqargstokenlist, False)
     if token is None:
         raise SyntaxError("No arguments after conditional", d.getloc(offset))
 
     if token == '(':
         arg1, t, offset = parsemakesyntax(d, offset, (',',), itermakefilechars)
         if t is None:
             raise SyntaxError("Expected two arguments in conditional", d.getloc(offset))
 
@@ -547,18 +554,20 @@ class Condition(object):
         if self.everactive:
             self.active = False
             return
 
         self.active = active
         if active:
             self.everactive = True
 
-directives = [k for k in conditionkeywords.iterkeys()] + \
-    ['else', 'endif', 'define', 'endef', 'override', 'include', '-include', 'vpath', 'export', 'unexport']
+conditiontokens = tuple(conditionkeywords.iterkeys())
+directivestokenlist = TokenList.get(conditiontokens + \
+    ('else', 'endif', 'define', 'endef', 'override', 'include', '-include', 'vpath', 'export', 'unexport'))
+conditionkeywordstokenlist = TokenList.get(conditiontokens)
 
 varsettokens = (':=', '+=', '?=', '=')
 
 def parsestream(fd, filename, makefile):
     """
     Parse a stream of makefile into a makefile data structure.
 
     @param fd A file-like object containing the makefile data.
@@ -584,32 +593,32 @@ def parsestream(fd, filename, makefile):
             currule.addcommand(e)
         else:
             # To parse Makefile syntax, we first strip leading whitespace and
             # look for initial keywords. If there are no keywords, it's either
             # setting a variable or writing a rule.
 
             offset = d.skipwhitespace(0)
 
-            kword, offset = d.findtoken(offset, directives, True)
+            kword, offset = d.findtoken(offset, directivestokenlist, True)
             if kword == 'endif':
                 ensureend(d, offset, "Unexpected data after 'endif' directive")
                 if not len(condstack):
                     raise SyntaxError("unmatched 'endif' directive",
                                       d.getloc(offset))
 
                 condstack.pop()
                 continue
             
             if kword == 'else':
                 if not len(condstack):
                     raise SyntaxError("unmatched 'else' directive",
                                       d.getloc(offset))
 
-                kword, offset = d.findtoken(offset, conditionkeywords, True)
+                kword, offset = d.findtoken(offset, conditionkeywordstokenlist, True)
                 if kword is None:
                     ensureend(d, offset, "Unexpected data after 'else' directive.")
                     condstack[-1].makeactive(True)
                 else:
                     if kword not in conditionkeywords:
                         raise SyntaxError("Unexpected condition after 'else' directive.",
                                           d.getloc(offset))
                         
@@ -810,18 +819,19 @@ class ParseStackFrame(object):
     def __init__(self, parsestate, expansion, tokenlist, closebrace, **kwargs):
         self.parsestate = parsestate
         self.expansion = expansion
         self.tokenlist = tokenlist
         self.closebrace = closebrace
         for key, value in kwargs.iteritems():
             setattr(self, key, value)
 
-functiontokens = [k for k in functions.functionmap.iterkeys()]
+functiontokens = list(functions.functionmap.iterkeys())
 functiontokens.sort(key=len, reverse=True)
+functiontokenlist = TokenList.get(tuple(functiontokens))
 
 def parsemakesyntax(d, startat, stopon, iterfunc):
     """
     Given Data, parse it into a data.Expansion.
 
     @param stopon (sequence)
         Indicate characters where toplevel parsing should stop.
 
@@ -867,17 +877,17 @@ def parsemakesyntax(d, startat, stopon, 
             c = d.data[offset]
             if c == '$':
                 stacktop.expansion.append('$')
                 offset = offset + 1
             elif c in ('(', '{'):
                 closebrace = c == '(' and ')' or '}'
 
                 # look forward for a function name
-                fname, offset = d.findtoken(offset + 1, functiontokens, True)
+                fname, offset = d.findtoken(offset + 1, functiontokenlist, True)
                 if fname is not None:
                     fn = functions.functionmap[fname](loc)
                     e = data.Expansion()
                     fn.append(e)
                     if len(fn) == fn.maxargs:
                         tokenlist = TokenList.get((closebrace, '$'))
                     else:
                         tokenlist = TokenList.get((',', closebrace, '$'))
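
The 20% figure in the commit message comes from profiling the parser-perf branch itself; as a rough, hypothetical illustration of why the regex path wins (made-up tokens and input, timings will vary by machine and Python version), the two findtoken strategies can be compared directly:

import re, timeit

tokens = ('ifdef', 'ifndef', 'ifeq', 'ifneq', 'define', 'endef',
          'include', 'override', 'vpath', 'export', 'unexport')
wsre = re.compile('(' + '|'.join(re.escape(t) for t in tokens) + r')(\s+|$)')
line = 'CFLAGS := -O2 -Wall\n'   # the common case: no directive present

def loop_findtoken(data, o):
    # Pre-patch approach: slice and compare once per candidate token.
    for t in tokens:
        end = o + len(t)
        if data[o:end] == t and (end == len(data) or data[end].isspace()):
            return t, end
    return None, o

def re_findtoken(data, o):
    # Post-patch approach: one precompiled alternation match.
    m = wsre.match(data, o)
    if m is not None:
        return m.group(1), m.end(0)
    return None, o

print(timeit.timeit(lambda: loop_findtoken(line, 0), number=200000))
print(timeit.timeit(lambda: re_findtoken(line, 0), number=200000))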