author | Kristen Wright <kwright@mozilla.com> |
Tue, 14 Apr 2020 17:17:38 +0000 |
changeset 523985 | 71493b05535924f61c3bb0a15887c0de1a919c36 |
parent 523984 | fe6f888d24b82bfc38b3767cb5e7fa4efad7c69b |
child 523986 | 6eb3134baecc69d6997ea383080831ec8c7a1674 |
push id | 37313 |
push user | ccoroiu@mozilla.com |
push date | Wed, 15 Apr 2020 04:09:17 +0000 |
treeherder | mozilla-central@c224327f2bdd |
reviewers | firefox-build-system-reviewers, rstewart |
bugs | 1621359 |
milestone | 77.0a1 |
first release with | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
last release without | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/ANNOUNCE @@ -0,0 +1,40 @@ +January 31, 2017 + + Announcing : PLY-3.10 (Python Lex-Yacc) + + http://www.dabeaz.com/ply + +I'm pleased to announce PLY-3.10--a pure Python implementation of the +common parsing tools lex and yacc. PLY-3.10 is a minor bug fix +release. It supports both Python 2 and Python 3. + +If you are new to PLY, here are a few highlights: + +- PLY is closely modeled after traditional lex/yacc. If you know how + to use these or similar tools in other languages, you will find + PLY to be comparable. + +- PLY provides very extensive error reporting and diagnostic + information to assist in parser construction. The original + implementation was developed for instructional purposes. As + a result, the system tries to identify the most common types + of errors made by novice users. + +- PLY provides full support for empty productions, error recovery, + precedence rules, and ambiguous grammars. + +- Parsing is based on LR-parsing which is fast, memory efficient, + better suited to large grammars, and which has a number of nice + properties when dealing with syntax errors and other parsing + problems. Currently, PLY can build its parsing tables using + either SLR or LALR(1) algorithms. + +More information about PLY can be obtained on the PLY webpage at: + + http://www.dabeaz.com/ply + +PLY is freely available. + +Cheers, + +David Beazley (http://www.dabeaz.com) \ No newline at end of file
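The ANNOUNCE text above notes that PLY can build its parsing tables with either the SLR or LALR(1) algorithm. For readers skimming this vendored copy, the algorithm is selected through the public yacc() entry point. A minimal sketch (the module-level token and grammar definitions that yacc() discovers by reflection are omitted here):

    import ply.yacc as yacc

    # LALR(1) is the default; SLR can be requested explicitly.
    parser = yacc.yacc(method='SLR')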
deleted file mode 100644 --- a/other-licenses/ply/COPYING +++ /dev/null @@ -1,28 +0,0 @@ -Copyright (C) 2001-2009, -David M. Beazley (Dabeaz LLC) -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -* Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. -* Neither the name of the David Beazley or Dabeaz LLC may be used to - endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/MANIFEST.in @@ -0,0 +1,8 @@ +recursive-include example * +recursive-include doc * +recursive-include test * +include ANNOUNCE +include README.md +include CHANGES +include TODO +global-exclude *.pyc
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/PKG-INFO @@ -0,0 +1,22 @@ +Metadata-Version: 1.1 +Name: ply +Version: 3.10 +Summary: Python Lex & Yacc +Home-page: http://www.dabeaz.com/ply/ +Author: David Beazley +Author-email: dave@dabeaz.com +License: BSD +Description: + PLY is yet another implementation of lex and yacc for Python. Some notable + features include the fact that it's implemented entirely in Python and it + uses LALR(1) parsing which is efficient and well suited for larger grammars. + + PLY provides most of the standard lex/yacc features including support for empty + productions, precedence rules, error recovery, and support for ambiguous grammars. + + PLY is extremely easy to use and provides very extensive error checking. + It is compatible with both Python 2 and Python 3. + +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 2
deleted file mode 100644 --- a/other-licenses/ply/README +++ /dev/null @@ -1,9 +0,0 @@ -David Beazley's PLY (Python Lex-Yacc) -http://www.dabeaz.com/ply/ - -Licensed under BSD. - -This directory contains just the code and license from PLY version 3.3; -the full distribution (see the URL) also contains examples, tests, -documentation, and a longer README. -
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/README.md @@ -0,0 +1,273 @@ +PLY (Python Lex-Yacc) Version 3.10 + +Copyright (C) 2001-2017 +David M. Beazley (Dabeaz LLC) +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of the David Beazley or Dabeaz LLC may be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Introduction +============ + +PLY is a 100% Python implementation of the common parsing tools lex +and yacc. Here are a few highlights: + + - PLY is very closely modeled after traditional lex/yacc. + If you know how to use these tools in C, you will find PLY + to be similar. + + - PLY provides *very* extensive error reporting and diagnostic + information to assist in parser construction. The original + implementation was developed for instructional purposes. As + a result, the system tries to identify the most common types + of errors made by novice users. + + - PLY provides full support for empty productions, error recovery, + precedence specifiers, and moderately ambiguous grammars. + + - Parsing is based on LR-parsing which is fast, memory efficient, + better suited to large grammars, and which has a number of nice + properties when dealing with syntax errors and other parsing problems. + Currently, PLY builds its parsing tables using the LALR(1) + algorithm used in yacc. + + - PLY uses Python introspection features to build lexers and parsers. + This greatly simplifies the task of parser construction since it reduces + the number of files and eliminates the need to run a separate lex/yacc + tool before running your program. + + - PLY can be used to build parsers for "real" programming languages. + Although it is not ultra-fast due to its Python implementation, + PLY can be used to parse grammars consisting of several hundred + rules (as might be found for a language like C). The lexer and LR + parser are also reasonably efficient when parsing typically + sized programs. People have used PLY to build parsers for + C, C++, ADA, and other real programming languages. + +How to Use +========== + +PLY consists of two files : lex.py and yacc.py. These are contained +within the 'ply' directory which may also be used as a Python package. 
+To use PLY, simply copy the 'ply' directory to your project and import
+lex and yacc from the associated 'ply' package. For example:
+
+ import ply.lex as lex
+ import ply.yacc as yacc
+
+Alternatively, you can copy just the files lex.py and yacc.py
+individually and use them as modules. For example:
+
+ import lex
+ import yacc
+
+The file setup.py can be used to install ply using distutils.
+
+The file doc/ply.html contains complete documentation on how to use
+the system.
+
+The example directory contains several different examples including a
+PLY specification for ANSI C as given in K&R 2nd Ed.
+
+A simple example is found at the end of this document.
+
+Requirements
+============
+PLY requires the use of Python 2.6 or greater. However, you should
+use the latest Python release if possible. It should work on just
+about any platform. PLY has been tested with both CPython and Jython.
+It also seems to work with IronPython.
+
+Resources
+=========
+More information about PLY can be obtained on the PLY webpage at:
+
+ http://www.dabeaz.com/ply
+
+For a detailed overview of parsing theory, consult the excellent
+book "Compilers : Principles, Techniques, and Tools" by Aho, Sethi, and
+Ullman. The topics found in "Lex & Yacc" by Levine, Mason, and Brown
+may also be useful.
+
+The GitHub page for PLY can be found at:
+
+ https://github.com/dabeaz/ply
+
+An old and relatively inactive discussion group for PLY is found at:
+
+ http://groups.google.com/group/ply-hack
+
+Acknowledgments
+===============
+A special thanks is in order for all of the students in CS326 who
+suffered through about 25 different versions of these tools :-).
+
+The CHANGES file acknowledges those who have contributed patches.
+
+Elias Ioup did the first implementation of LALR(1) parsing in PLY-1.x.
+Andrew Waters and Markus Schoepflin were instrumental in reporting bugs
+and testing a revised LALR(1) implementation for PLY-2.0.
+
+Special Note for PLY-3.0
+========================
+PLY-3.0 is the first PLY release to support Python 3. However, backwards
+compatibility with Python 2.6 is still preserved. PLY provides dual
+Python 2/3 compatibility by restricting its implementation to a common
+subset of basic language features. You should not convert PLY using
+2to3--it is not necessary and may in fact break the implementation.
+
+Example
+=======
+
+Here is a simple example showing a PLY implementation of a calculator
+with variables.
+
+ # -----------------------------------------------------------------------------
+ # calc.py
+ #
+ # A simple calculator with variables.
+ # ----------------------------------------------------------------------------- + + tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + + # Tokens + + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_EQUALS = r'=' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + # Ignored characters + t_ignore = " \t" + + def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer + import ply.lex as lex + lex.lex() + + # Precedence rules for the arithmetic operators + precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + + # dictionary of names (for storing variables) + names = { } + + def p_statement_assign(p): + 'statement : NAME EQUALS expression' + names[p[1]] = p[3] + + def p_statement_expr(p): + 'statement : expression' + print(p[1]) + + def p_expression_binop(p): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if p[2] == '+' : p[0] = p[1] + p[3] + elif p[2] == '-': p[0] = p[1] - p[3] + elif p[2] == '*': p[0] = p[1] * p[3] + elif p[2] == '/': p[0] = p[1] / p[3] + + def p_expression_uminus(p): + 'expression : MINUS expression %prec UMINUS' + p[0] = -p[2] + + def p_expression_group(p): + 'expression : LPAREN expression RPAREN' + p[0] = p[2] + + def p_expression_number(p): + 'expression : NUMBER' + p[0] = p[1] + + def p_expression_name(p): + 'expression : NAME' + try: + p[0] = names[p[1]] + except LookupError: + print("Undefined name '%s'" % p[1]) + p[0] = 0 + + def p_error(p): + print("Syntax error at '%s'" % p.value) + + import ply.yacc as yacc + yacc.yacc() + + while True: + try: + s = raw_input('calc > ') # use input() on Python 3 + except EOFError: + break + yacc.parse(s) + + +Bug Reports and Patches +======================= +My goal with PLY is to simply have a decent lex/yacc implementation +for Python. As a general rule, I don't spend huge amounts of time +working on it unless I receive very specific bug reports and/or +patches to fix problems. I also try to incorporate submitted feature +requests and enhancements into each new version. Please visit the PLY +github page at https://github.com/dabeaz/ply to submit issues and pull +requests. To contact me about bugs and/or new features, please send +email to dave@dabeaz.com. + +-- Dave + + + + + + + + +
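As a quick illustration of the calculator above, an interactive session would look roughly like the following. The output is reconstructed by hand from the grammar rules, not captured from a real run (note that TIMES binds tighter than PLUS, and an undefined name reduces to 0 after the error message):

    $ python calc.py
    calc > x = 12
    calc > x * 3 + 1
    37
    calc > 2 + y
    Undefined name 'y'
    2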
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/TODO @@ -0,0 +1,16 @@ +The PLY to-do list:
+
+1. Finish writing the C Preprocessor module. Started in the
+ file ply/cpp.py
+
+2. Create and document libraries of useful tokens.
+
+3. Expand the examples/yply tool that parses bison/yacc
+ files.
+
+4. Think of various diabolical things to do with the
+ new yacc internals. For example, it is now possible
+ to specify grammars using completely different schemes
+ than the reflection approach used by PLY.
+
+
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/doc/internal.html @@ -0,0 +1,874 @@ +<html>
+<head>
+<title>PLY Internals</title>
+</head>
+<body bgcolor="#ffffff">
+
+<h1>PLY Internals</h1>
+
+<b>
+David M. Beazley <br>
+dave@dabeaz.com<br>
+</b>
+
+<p>
+<b>PLY Version: 3.10</b>
+<p>
+
+<!-- INDEX -->
+<div class="sectiontoc">
+<ul>
+<li><a href="#internal_nn1">Introduction</a>
+<li><a href="#internal_nn2">Grammar Class</a>
+<li><a href="#internal_nn3">Productions</a>
+<li><a href="#internal_nn4">LRItems</a>
+<li><a href="#internal_nn5">LRTable</a>
+<li><a href="#internal_nn6">LRGeneratedTable</a>
+<li><a href="#internal_nn7">LRParser</a>
+<li><a href="#internal_nn8">ParserReflect</a>
+<li><a href="#internal_nn9">High-level operation</a>
+</ul>
+</div>
+<!-- INDEX -->
+
+
+<H2><a name="internal_nn1"></a>1. Introduction</H2>
+
+
+This document describes classes and functions that make up the internal
+operation of PLY. Using this programming interface, it is possible to
+manually build a parser using a different interface specification
+than what PLY normally uses. For example, you could build a grammar
+from information parsed in a completely different input format. Some of
+these objects may be useful for building more advanced parsing engines
+such as GLR.
+
+<p>
+It should be stressed that using PLY at this level is not for the
+faint of heart. Generally, it's assumed that you know a bit of
+the underlying compiler theory and how an LR parser is put together.
+
+<H2><a name="internal_nn2"></a>2. Grammar Class</H2>
+
+
+The file <tt>ply.yacc</tt> defines a class <tt>Grammar</tt> that
+is used to hold and manipulate information about a grammar
+specification. It encapsulates the same basic information
+about a grammar that is put into a YACC file including
+the list of tokens, precedence rules, and grammar rules.
+Various operations are provided to perform different validations
+on the grammar. In addition, there are operations to compute
+the first and follow sets that are needed by the various table
+generation algorithms.
+
+<p>
+<tt><b>Grammar(terminals)</b></tt>
+
+<blockquote>
+Creates a new grammar object. <tt>terminals</tt> is a list of strings
+specifying the terminals for the grammar. An instance <tt>g</tt> of
+<tt>Grammar</tt> has the following methods:
+</blockquote>
+
+<p>
+<b><tt>g.set_precedence(term,assoc,level)</tt></b>
+<blockquote>
+Sets the precedence level and associativity for a given terminal <tt>term</tt>.
+<tt>assoc</tt> is one of <tt>'right'</tt>,
+<tt>'left'</tt>, or <tt>'nonassoc'</tt> and <tt>level</tt> is a positive integer. The higher
+the value of <tt>level</tt>, the higher the precedence. Here is an example of typical
+precedence settings:
+
+<pre>
+g.set_precedence('PLUS', 'left',1)
+g.set_precedence('MINUS', 'left',1)
+g.set_precedence('TIMES', 'left',2)
+g.set_precedence('DIVIDE','left',2)
+g.set_precedence('UMINUS','left',3)
+</pre>
+
+This method must be called prior to adding any productions to the
+grammar with <tt>g.add_production()</tt>. The precedence of individual grammar
+rules is determined by the precedence of the right-most terminal.
+
+</blockquote>
+<p>
+<b><tt>g.add_production(name,syms,func=None,file='',line=0)</tt></b>
+<blockquote>
+Adds a new grammar rule. <tt>name</tt> is the name of the rule,
+<tt>syms</tt> is a list of symbols making up the right hand
+side of the rule, <tt>func</tt> is the function to call when
+reducing the rule. 
<tt>file</tt> and <tt>line</tt> specify
+the filename and line number of the rule and are used for
+generating error messages.
+
+<p>
+The list of symbols in <tt>syms</tt> may include character
+literals and <tt>%prec</tt> specifiers. Here are some
+examples:
+
+<pre>
+g.add_production('expr',['expr','PLUS','term'],func,file,line)
+g.add_production('expr',['expr','"+"','term'],func,file,line)
+g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line)
+</pre>
+
+<p>
+If any kind of error is detected, a <tt>GrammarError</tt> exception
+is raised with a message indicating the reason for the failure.
+</blockquote>
+
+<p>
+<b><tt>g.set_start(start=None)</tt></b>
+<blockquote>
+Sets the starting rule for the grammar. <tt>start</tt> is a string
+specifying the name of the start rule. If <tt>start</tt> is omitted,
+the first grammar rule added with <tt>add_production()</tt> is taken to be
+the starting rule. This method must always be called after all
+productions have been added.
+</blockquote>
+
+<p>
+<b><tt>g.find_unreachable()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of all unreachable non-terminals
+defined in the grammar. This is used to identify inactive parts of
+the grammar specification.
+</blockquote>
+
+<p>
+<b><tt>g.infinite_cycle()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of all non-terminals in the
+grammar that result in an infinite cycle. This condition occurs if
+there is no way for a grammar rule to expand to a string containing
+only terminal symbols.
+</blockquote>
+
+<p>
+<b><tt>g.undefined_symbols()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of tuples <tt>(name, prod)</tt>
+corresponding to undefined symbols in the grammar. <tt>name</tt> is the
+name of the undefined symbol and <tt>prod</tt> is an instance of
+<tt>Production</tt> which has information about the production rule
+where the undefined symbol was used.
+</blockquote>
+
+<p>
+<b><tt>g.unused_terminals()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of terminals that were defined,
+but never used in the grammar.
+</blockquote>
+
+<p>
+<b><tt>g.unused_rules()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of <tt>Production</tt> instances
+corresponding to production rules that were defined in the grammar,
+but never used anywhere. This is slightly different
+than <tt>find_unreachable()</tt>.
+</blockquote>
+
+<p>
+<b><tt>g.unused_precedence()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of tuples <tt>(term, assoc)</tt>
+corresponding to precedence rules that were set, but never used in the
+grammar. <tt>term</tt> is the terminal name and <tt>assoc</tt> is the
+precedence associativity (e.g., <tt>'left'</tt>, <tt>'right'</tt>,
+or <tt>'nonassoc'</tt>).
+</blockquote>
+
+<p>
+<b><tt>g.compute_first()</tt></b>
+<blockquote>
+Compute all of the first sets for all symbols in the grammar. Returns a dictionary
+mapping symbol names to a list of all first symbols.
+</blockquote>
+
+<p>
+<b><tt>g.compute_follow()</tt></b>
+<blockquote>
+Compute all of the follow sets for all non-terminals in the grammar.
+The follow set is the set of all possible symbols that might follow a
+given non-terminal. Returns a dictionary mapping non-terminal names
+to a list of symbols.
+</blockquote>
+
+<p>
+<b><tt>g.build_lritems()</tt></b>
+<blockquote>
+Calculates all of the LR items for all productions in the grammar. This
+step is required before using the grammar for any kind of table generation.
+See the section on LR items below. 
+</blockquote> + +<p> +The following attributes are set by the above methods and may be useful +in code that works with the grammar. All of these attributes should be +assumed to be read-only. Changing their values directly will likely +break the grammar. + +<p> +<b><tt>g.Productions</tt></b> +<blockquote> +A list of all productions added. The first entry is reserved for +a production representing the starting rule. The objects in this list +are instances of the <tt>Production</tt> class, described shortly. +</blockquote> + +<p> +<b><tt>g.Prodnames</tt></b> +<blockquote> +A dictionary mapping the names of nonterminals to a list of all +productions of that nonterminal. +</blockquote> + +<p> +<b><tt>g.Terminals</tt></b> +<blockquote> +A dictionary mapping the names of terminals to a list of the +production numbers where they are used. +</blockquote> + +<p> +<b><tt>g.Nonterminals</tt></b> +<blockquote> +A dictionary mapping the names of nonterminals to a list of the +production numbers where they are used. +</blockquote> + +<p> +<b><tt>g.First</tt></b> +<blockquote> +A dictionary representing the first sets for all grammar symbols. This is +computed and returned by the <tt>compute_first()</tt> method. +</blockquote> + +<p> +<b><tt>g.Follow</tt></b> +<blockquote> +A dictionary representing the follow sets for all grammar rules. This is +computed and returned by the <tt>compute_follow()</tt> method. +</blockquote> + +<p> +<b><tt>g.Start</tt></b> +<blockquote> +Starting symbol for the grammar. Set by the <tt>set_start()</tt> method. +</blockquote> + +For the purposes of debugging, a <tt>Grammar</tt> object supports the <tt>__len__()</tt> and +<tt>__getitem__()</tt> special methods. Accessing <tt>g[n]</tt> returns the nth production +from the grammar. + + +<H2><a name="internal_nn3"></a>3. Productions</H2> + + +<tt>Grammar</tt> objects store grammar rules as instances of a <tt>Production</tt> class. This +class has no public constructor--you should only create productions by calling <tt>Grammar.add_production()</tt>. +The following attributes are available on a <tt>Production</tt> instance <tt>p</tt>. + +<p> +<b><tt>p.name</tt></b> +<blockquote> +The name of the production. For a grammar rule such as <tt>A : B C D</tt>, this is <tt>'A'</tt>. +</blockquote> + +<p> +<b><tt>p.prod</tt></b> +<blockquote> +A tuple of symbols making up the right-hand side of the production. For a grammar rule such as <tt>A : B C D</tt>, this is <tt>('B','C','D')</tt>. +</blockquote> + +<p> +<b><tt>p.number</tt></b> +<blockquote> +Production number. An integer containing the index of the production in the grammar's <tt>Productions</tt> list. +</blockquote> + +<p> +<b><tt>p.func</tt></b> +<blockquote> +The name of the reduction function associated with the production. +This is the function that will execute when reducing the entire +grammar rule during parsing. +</blockquote> + +<p> +<b><tt>p.callable</tt></b> +<blockquote> +The callable object associated with the name in <tt>p.func</tt>. This is <tt>None</tt> +unless the production has been bound using <tt>bind()</tt>. +</blockquote> + +<p> +<b><tt>p.file</tt></b> +<blockquote> +Filename associated with the production. Typically this is the file where the production was defined. Used for error messages. +</blockquote> + +<p> +<b><tt>p.lineno</tt></b> +<blockquote> +Line number associated with the production. Typically this is the line number in <tt>p.file</tt> where the production was defined. Used for error messages. 
+</blockquote>
+
+<p>
+<b><tt>p.prec</tt></b>
+<blockquote>
+Precedence and associativity associated with the production. This is a tuple <tt>(assoc,level)</tt> where
+<tt>assoc</tt> is one of <tt>'left'</tt>,<tt>'right'</tt>, or <tt>'nonassoc'</tt> and <tt>level</tt> is
+an integer. This value is determined by the precedence of the right-most terminal symbol in the production
+or by use of the <tt>%prec</tt> specifier when adding the production.
+</blockquote>
+
+<p>
+<b><tt>p.usyms</tt></b>
+<blockquote>
+A list of all unique symbols found in the production.
+</blockquote>
+
+<p>
+<b><tt>p.lr_items</tt></b>
+<blockquote>
+A list of all LR items for this production. This attribute only has a meaningful value if the
+<tt>Grammar.build_lritems()</tt> method has been called. The items in this list are
+instances of <tt>LRItem</tt> described below.
+</blockquote>
+
+<p>
+<b><tt>p.lr_next</tt></b>
+<blockquote>
+The head of a linked-list representation of the LR items in <tt>p.lr_items</tt>.
+This attribute only has a meaningful value if the <tt>Grammar.build_lritems()</tt>
+method has been called. Each <tt>LRItem</tt> instance has a <tt>lr_next</tt> attribute
+to move to the next item. The list is terminated by <tt>None</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.bind(dict)</tt></b>
+<blockquote>
+Binds the production function name in <tt>p.func</tt> to a callable object in
+<tt>dict</tt>. This operation is typically carried out in the last step
+prior to running the parsing engine and is needed since parsing tables are typically
+read from files which only include the function names, not the functions themselves.
+</blockquote>
+
+<P>
+<tt>Production</tt> objects support
+the <tt>__len__()</tt>, <tt>__getitem__()</tt>, and <tt>__str__()</tt>
+special methods.
+<tt>len(p)</tt> returns the number of symbols in <tt>p.prod</tt>
+and <tt>p[n]</tt> is the same as <tt>p.prod[n]</tt>.
+
+<H2><a name="internal_nn4"></a>4. LRItems</H2>
+
+
+The construction of parsing tables in an LR-based parser generator is primarily
+done over a set of "LR Items". An LR item represents a stage of parsing one
+of the grammar rules. To compute the LR items, it is first necessary to
+call <tt>Grammar.build_lritems()</tt>. Once this step is performed, all of the productions
+in the grammar will have their LR items attached to them.
+
+<p>
+Here is an interactive example that shows what LR items look like.
+In this example, <tt>g</tt> is a <tt>Grammar</tt>
+object.
+
+<blockquote>
+<pre>
+>>> <b>g.build_lritems()</b>
+>>> <b>p = g[1]</b>
+>>> <b>p</b>
+Production(statement -> ID = expr)
+>>>
+</pre>
+</blockquote>
+
+In the above code, <tt>p</tt> represents the first grammar rule. In
+this case, a rule <tt>'statement -> ID = expr'</tt>.
+
+<p>
+Now, let's look at the LR items for <tt>p</tt>.
+
+<blockquote>
+<pre>
+>>> <b>p.lr_items</b>
+[LRItem(statement -> . ID = expr),
+ LRItem(statement -> ID . = expr),
+ LRItem(statement -> ID = . expr),
+ LRItem(statement -> ID = expr .)]
+>>>
+</pre>
+</blockquote>
+
+In each LR item, the dot (.) represents a specific stage of parsing. From one
+item to the next, the dot is advanced by one symbol. It is only when the dot
+reaches the very end that a production
+is successfully parsed.
+
+<p>
+An instance <tt>lr</tt> of <tt>LRItem</tt> has the following
+attributes that hold information related to that specific stage of
+parsing.
+
+<p>
+<b><tt>lr.name</tt></b>
+<blockquote>
+The name of the grammar rule. For example, <tt>'statement'</tt> in the above example. 
+</blockquote> + +<p> +<b><tt>lr.prod</tt></b> +<blockquote> +A tuple of symbols representing the right-hand side of the production, including the +special <tt>'.'</tt> character. For example, <tt>('ID','.','=','expr')</tt>. +</blockquote> + +<p> +<b><tt>lr.number</tt></b> +<blockquote> +An integer representing the production number in the grammar. +</blockquote> + +<p> +<b><tt>lr.usyms</tt></b> +<blockquote> +A set of unique symbols in the production. Inherited from the original <tt>Production</tt> instance. +</blockquote> + +<p> +<b><tt>lr.lr_index</tt></b> +<blockquote> +An integer representing the position of the dot (.). You should never use <tt>lr.prod.index()</tt> +to search for it--the result will be wrong if the grammar happens to also use (.) as a character +literal. +</blockquote> + +<p> +<b><tt>lr.lr_after</tt></b> +<blockquote> +A list of all productions that can legally appear immediately to the right of the +dot (.). This list contains <tt>Production</tt> instances. This attribute +represents all of the possible branches a parse can take from the current position. +For example, suppose that <tt>lr</tt> represents a stage immediately before +an expression like this: + +<pre> +>>> <b>lr</b> +LRItem(statement -> ID = . expr) +>>> +</pre> + +Then, the value of <tt>lr.lr_after</tt> might look like this, showing all productions that +can legally appear next: + +<pre> +>>> <b>lr.lr_after</b> +[Production(expr -> expr PLUS expr), + Production(expr -> expr MINUS expr), + Production(expr -> expr TIMES expr), + Production(expr -> expr DIVIDE expr), + Production(expr -> MINUS expr), + Production(expr -> LPAREN expr RPAREN), + Production(expr -> NUMBER), + Production(expr -> ID)] +>>> +</pre> + +</blockquote> + +<p> +<b><tt>lr.lr_before</tt></b> +<blockquote> +The grammar symbol that appears immediately before the dot (.) or <tt>None</tt> if +at the beginning of the parse. +</blockquote> + +<p> +<b><tt>lr.lr_next</tt></b> +<blockquote> +A link to the next LR item, representing the next stage of the parse. <tt>None</tt> if <tt>lr</tt> +is the last LR item. +</blockquote> + +<tt>LRItem</tt> instances also support the <tt>__len__()</tt> and <tt>__getitem__()</tt> special methods. +<tt>len(lr)</tt> returns the number of items in <tt>lr.prod</tt> including the dot (.). <tt>lr[n]</tt> +returns <tt>lr.prod[n]</tt>. + +<p> +It goes without saying that all of the attributes associated with LR +items should be assumed to be read-only. Modifications will very +likely create a small black-hole that will consume you and your code. + +<H2><a name="internal_nn5"></a>5. LRTable</H2> + + +The <tt>LRTable</tt> class is used to represent LR parsing table data. This +minimally includes the production list, action table, and goto table. + +<p> +<b><tt>LRTable()</tt></b> +<blockquote> +Create an empty LRTable object. This object contains only the information needed to +run an LR parser. +</blockquote> + +An instance <tt>lrtab</tt> of <tt>LRTable</tt> has the following methods: + +<p> +<b><tt>lrtab.read_table(module)</tt></b> +<blockquote> +Populates the LR table with information from the module specified in <tt>module</tt>. +<tt>module</tt> is either a module object already loaded with <tt>import</tt> or +the name of a Python module. If it's a string containing a module name, it is +loaded and parsing data is extracted. Returns the signature value that was used +when initially writing the tables. Raises a <tt>VersionError</tt> exception if +the module was created using an incompatible version of PLY. 
+</blockquote> + +<p> +<b><tt>lrtab.bind_callables(dict)</tt></b> +<blockquote> +This binds all of the function names used in productions to callable objects +found in the dictionary <tt>dict</tt>. During table generation and when reading +LR tables from files, PLY only uses the names of action functions such as <tt>'p_expr'</tt>, +<tt>'p_statement'</tt>, etc. In order to actually run the parser, these names +have to be bound to callable objects. This method is always called prior to +running a parser. +</blockquote> + +After <tt>lrtab</tt> has been populated, the following attributes are defined. + +<p> +<b><tt>lrtab.lr_method</tt></b> +<blockquote> +The LR parsing method used (e.g., <tt>'LALR'</tt>) +</blockquote> + + +<p> +<b><tt>lrtab.lr_productions</tt></b> +<blockquote> +The production list. If the parsing tables have been newly +constructed, this will be a list of <tt>Production</tt> instances. If +the parsing tables have been read from a file, it's a list +of <tt>MiniProduction</tt> instances. This, together +with <tt>lr_action</tt> and <tt>lr_goto</tt> contain all of the +information needed by the LR parsing engine. +</blockquote> + +<p> +<b><tt>lrtab.lr_action</tt></b> +<blockquote> +The LR action dictionary that implements the underlying state machine. +The keys of this dictionary are the LR states. +</blockquote> + +<p> +<b><tt>lrtab.lr_goto</tt></b> +<blockquote> +The LR goto table that contains information about grammar rule reductions. +</blockquote> + + +<H2><a name="internal_nn6"></a>6. LRGeneratedTable</H2> + + +The <tt>LRGeneratedTable</tt> class represents constructed LR parsing tables on a +grammar. It is a subclass of <tt>LRTable</tt>. + +<p> +<b><tt>LRGeneratedTable(grammar, method='LALR',log=None)</tt></b> +<blockquote> +Create the LR parsing tables on a grammar. <tt>grammar</tt> is an instance of <tt>Grammar</tt>, +<tt>method</tt> is a string with the parsing method (<tt>'SLR'</tt> or <tt>'LALR'</tt>), and +<tt>log</tt> is a logger object used to write debugging information. The debugging information +written to <tt>log</tt> is the same as what appears in the <tt>parser.out</tt> file created +by yacc. By supplying a custom logger with a different message format, it is possible to get +more information (e.g., the line number in <tt>yacc.py</tt> used for issuing each line of +output in the log). The result is an instance of <tt>LRGeneratedTable</tt>. +</blockquote> + +<p> +An instance <tt>lr</tt> of <tt>LRGeneratedTable</tt> has the following attributes. + +<p> +<b><tt>lr.grammar</tt></b> +<blockquote> +A link to the Grammar object used to construct the parsing tables. +</blockquote> + +<p> +<b><tt>lr.lr_method</tt></b> +<blockquote> +The LR parsing method used (e.g., <tt>'LALR'</tt>) +</blockquote> + + +<p> +<b><tt>lr.lr_productions</tt></b> +<blockquote> +A reference to <tt>grammar.Productions</tt>. This, together with <tt>lr_action</tt> and <tt>lr_goto</tt> +contain all of the information needed by the LR parsing engine. +</blockquote> + +<p> +<b><tt>lr.lr_action</tt></b> +<blockquote> +The LR action dictionary that implements the underlying state machine. The keys of this dictionary are +the LR states. +</blockquote> + +<p> +<b><tt>lr.lr_goto</tt></b> +<blockquote> +The LR goto table that contains information about grammar rule reductions. +</blockquote> + +<p> +<b><tt>lr.sr_conflicts</tt></b> +<blockquote> +A list of tuples <tt>(state,token,resolution)</tt> identifying all shift/reduce conflicts. 
<tt>state</tt> is the LR state
+number where the conflict occurred, <tt>token</tt> is the token causing the conflict, and <tt>resolution</tt> is
+a string describing the resolution taken. <tt>resolution</tt> is either <tt>'shift'</tt> or <tt>'reduce'</tt>.
+</blockquote>
+
+<p>
+<b><tt>lr.rr_conflicts</tt></b>
+<blockquote>
+A list of tuples <tt>(state,rule,rejected)</tt> identifying all reduce/reduce conflicts. <tt>state</tt> is the
+LR state number where the conflict occurred, <tt>rule</tt> is the production rule that was selected
+and <tt>rejected</tt> is the production rule that was rejected. Both <tt>rule</tt> and <tt>rejected</tt> are
+instances of <tt>Production</tt>. They can be inspected to provide the user with more information.
+</blockquote>
+
+<p>
+There are two public methods of <tt>LRGeneratedTable</tt>.
+
+<p>
+<b><tt>lr.write_table(modulename,outputdir="",signature="")</tt></b>
+<blockquote>
+Writes the LR parsing table information to a Python module. <tt>modulename</tt> is a string
+specifying the name of a module such as <tt>"parsetab"</tt>. <tt>outputdir</tt> is the name of a
+directory where the module should be created. <tt>signature</tt> is a string representing a
+grammar signature that's written into the output file. This can be used to detect when
+the data stored in a module file is out-of-sync with the grammar specification (and that
+the tables need to be regenerated). If <tt>modulename</tt> is a string <tt>"parsetab"</tt>,
+this function creates a file called <tt>parsetab.py</tt>. If the module name represents a
+package such as <tt>"foo.bar.parsetab"</tt>, then only the last component, <tt>"parsetab"</tt> is
+used.
+</blockquote>
+
+
+<H2><a name="internal_nn7"></a>7. LRParser</H2>
+
+
+The <tt>LRParser</tt> class implements the low-level LR parsing engine.
+
+
+<p>
+<b><tt>LRParser(lrtab, error_func)</tt></b>
+<blockquote>
+Create an LRParser. <tt>lrtab</tt> is an instance of <tt>LRTable</tt>
+containing the LR production and state tables. <tt>error_func</tt> is the
+error function to invoke in the event of a parsing error.
+</blockquote>
+
+An instance <tt>p</tt> of <tt>LRParser</tt> has the following methods:
+
+<p>
+<b><tt>p.parse(input=None,lexer=None,debug=0,tracking=0,tokenfunc=None)</tt></b>
+<blockquote>
+Run the parser. <tt>input</tt> is a string, which if supplied is fed into the
+lexer using its <tt>input()</tt> method. <tt>lexer</tt> is an instance of the
+<tt>Lexer</tt> class to use for tokenizing. If not supplied, the last lexer
+created with the <tt>lex</tt> module is used. <tt>debug</tt> is a boolean flag
+that enables debugging. <tt>tracking</tt> is a boolean flag that tells the
+parser to perform additional line number tracking. <tt>tokenfunc</tt> is a callable
+function that returns the next token. If supplied, the parser will use it to get
+all tokens.
+</blockquote>
+
+<p>
+<b><tt>p.restart()</tt></b>
+<blockquote>
+Resets the parser state for a parse already in progress.
+</blockquote>
+
+<H2><a name="internal_nn8"></a>8. ParserReflect</H2>
+
+
+<p>
+The <tt>ParserReflect</tt> class is used to collect parser specification data
+from a Python module or object. This class is what collects all of the
+<tt>p_rule()</tt> functions in a PLY file, performs basic error checking,
+and collects all of the needed information to build a grammar. Most of the
+high-level PLY interface as used by the <tt>yacc()</tt> function is actually
+implemented by this class. 
+
+<p>
+<b><tt>ParserReflect(pdict, log=None)</tt></b>
+<blockquote>
+Creates a <tt>ParserReflect</tt> instance. <tt>pdict</tt> is a dictionary
+containing parser specification data. This dictionary typically corresponds
+to the module or class dictionary of code that implements a PLY parser.
+<tt>log</tt> is a logger instance that will be used to report error
+messages.
+</blockquote>
+
+An instance <tt>p</tt> of <tt>ParserReflect</tt> has the following methods:
+
+<p>
+<b><tt>p.get_all()</tt></b>
+<blockquote>
+Collect and store all required parsing information.
+</blockquote>
+
+<p>
+<b><tt>p.validate_all()</tt></b>
+<blockquote>
+Validate all of the collected parsing information. This is a separate step
+from <tt>p.get_all()</tt> as a performance optimization. In order to
+reduce parser start-up time, a parser can elect to only validate the
+parsing data when regenerating the parsing tables. The validation
+step tries to collect as much information as possible rather than
+raising an exception at the first sign of trouble. The attribute
+<tt>p.error</tt> is set if there are any validation errors. The
+value of this attribute is also returned.
+</blockquote>
+
+<p>
+<b><tt>p.signature()</tt></b>
+<blockquote>
+Compute a signature representing the contents of the collected parsing
+data. The signature value should change if anything in the parser
+specification has changed in a way that would justify parser table
+regeneration. This method can be called after <tt>p.get_all()</tt>,
+but before <tt>p.validate_all()</tt>.
+</blockquote>
+
+The following attributes are set in the process of collecting data:
+
+<p>
+<b><tt>p.start</tt></b>
+<blockquote>
+The grammar start symbol, if any. Taken from <tt>pdict['start']</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.error_func</tt></b>
+<blockquote>
+The error handling function or <tt>None</tt>. Taken from <tt>pdict['p_error']</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.tokens</tt></b>
+<blockquote>
+The token list. Taken from <tt>pdict['tokens']</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.prec</tt></b>
+<blockquote>
+The precedence specifier. Taken from <tt>pdict['precedence']</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.preclist</tt></b>
+<blockquote>
+A parsed version of the precedence specifier. A list of tuples of the form
+<tt>(token,assoc,level)</tt> where <tt>token</tt> is the terminal symbol,
+<tt>assoc</tt> is the associativity (e.g., <tt>'left'</tt>) and <tt>level</tt>
+is a numeric precedence level.
+</blockquote>

+<p>
+<b><tt>p.grammar</tt></b>
+<blockquote>
+A list of tuples <tt>(name, rules)</tt> representing the grammar rules. <tt>name</tt> is the
+name of a Python function or method in <tt>pdict</tt> that starts with <tt>"p_"</tt>.
+<tt>rules</tt> is a list of tuples <tt>(filename,line,prodname,syms)</tt> representing
+the grammar rules found in the documentation string of that function. <tt>filename</tt> and <tt>line</tt> contain location
+information that can be used for debugging. <tt>prodname</tt> is the name of the
+production. <tt>syms</tt> is the right-hand side of the production. 
If you have a
+function like this
+
+<pre>
+def p_expr(p):
+ '''expr : expr PLUS expr
+ | expr MINUS expr
+ | expr TIMES expr
+ | expr DIVIDE expr'''
+</pre>
+
+then the corresponding entry in <tt>p.grammar</tt> might look like this:
+
+<pre>
+('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']),
+ ('calc.py',11,'expr', ['expr','MINUS','expr']),
+ ('calc.py',12,'expr', ['expr','TIMES','expr']),
+ ('calc.py',13,'expr', ['expr','DIVIDE','expr'])
+ ])
+</pre>
+</blockquote>
+
+<p>
+<b><tt>p.pfuncs</tt></b>
+<blockquote>
+A sorted list of tuples <tt>(line, file, name, doc)</tt> representing all of
+the <tt>p_</tt> functions found. <tt>line</tt> and <tt>file</tt> give location
+information. <tt>name</tt> is the name of the function. <tt>doc</tt> is the
+documentation string. This list is sorted in ascending order by line number.
+</blockquote>
+
+<p>
+<b><tt>p.files</tt></b>
+<blockquote>
+A dictionary holding all of the source filenames that were encountered
+while collecting parser information. Only the keys of this dictionary have
+any meaning.
+</blockquote>
+
+<p>
+<b><tt>p.error</tt></b>
+<blockquote>
+An attribute that indicates whether or not any critical errors
+occurred in validation. If this is set, it means that some kind
+of problem was detected and that no further processing should be
+performed.
+</blockquote>
+
+
+<H2><a name="internal_nn9"></a>9. High-level operation</H2>
+
+
+Using all of the above classes requires some attention to detail. The <tt>yacc()</tt>
+function carries out a very specific sequence of operations to create a grammar.
+This same sequence should be emulated if you build an alternative PLY interface.
+
+<ol>
+<li>A <tt>ParserReflect</tt> object is created and raw grammar specification data is
+collected.
+<li>A <tt>Grammar</tt> object is created and populated with information
+from the specification data.
+<li>An <tt>LRGeneratedTable</tt> object is created to run the LALR algorithm over
+the <tt>Grammar</tt> object.
+<li>Productions in the <tt>LRGeneratedTable</tt> are bound to callables using the <tt>bind_callables()</tt>
+method.
+<li>An <tt>LRParser</tt> object is created from the information in the
+<tt>LRGeneratedTable</tt> object.
+</ol>
+
+</body>
+</html>
+
+
+
+
+
+
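To make the five-step sequence above concrete, here is a minimal sketch that drives the documented internal API directly for a single-rule grammar. It uses only the classes and methods described in internal.html (Grammar, LRGeneratedTable, LRParser); the one-rule grammar, the file/line arguments, and the function names are invented for illustration, and a lexer producing NAME, EQUALS and NUMBER tokens would still be needed before calling parse():

    from ply.yacc import Grammar, LRGeneratedTable, LRParser

    # Step 1 is normally performed by ParserReflect; here the
    # specification data is written out by hand instead.
    def p_statement(p):        # reduction for: statement -> NAME EQUALS NUMBER
        p[0] = (p[1], p[3])

    def p_error(tok):          # error function handed to LRParser
        print("Syntax error at %r" % (tok,))

    # Step 2: create and populate a Grammar object.
    g = Grammar(['NAME', 'EQUALS', 'NUMBER'])
    g.add_production('statement', ['NAME', 'EQUALS', 'NUMBER'],
                     'p_statement', 'example.py', 1)
    g.set_start()              # first rule added becomes the start rule
    g.build_lritems()          # required before table generation

    # Step 3: run the LALR algorithm over the grammar.
    lrtab = LRGeneratedTable(g, method='LALR')

    # Step 4: bind production function names to callable objects.
    lrtab.bind_callables({'p_statement': p_statement})

    # Step 5: create the low-level parsing engine.
    parser = LRParser(lrtab, p_error)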
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/doc/makedoc.py @@ -0,0 +1,194 @@ +#!/usr/local/bin/python + +############################################################################### +# Takes a chapter as input and adds internal links and numbering to all +# of the H1, H2, H3, H4 and H5 sections. +# +# Every heading HTML tag (H1, H2 etc) is given an autogenerated name to link +# to. However, if the name is not an autogenerated name from a previous run, +# it will be kept. If it is autogenerated, it might change on subsequent runs +# of this program. Thus if you want to create links to one of the headings, +# then change the heading link name to something that does not look like an +# autogenerated link name. +############################################################################### + +import sys +import re +import string + +############################################################################### +# Functions +############################################################################### + +# Regexs for <a name="..."></a> +alink = re.compile(r"<a *name *= *\"(.*)\"></a>", re.IGNORECASE) +heading = re.compile(r"(_nn\d)", re.IGNORECASE) + +def getheadingname(m): + autogeneratedheading = True; + if m.group(1) != None: + amatch = alink.match(m.group(1)) + if amatch: + # A non-autogenerated heading - keep it + headingname = amatch.group(1) + autogeneratedheading = heading.match(headingname) + if autogeneratedheading: + # The heading name was either non-existent or autogenerated, + # We can create a new heading / change the existing heading + headingname = "%s_nn%d" % (filenamebase, nameindex) + return headingname + +############################################################################### +# Main program +############################################################################### + +if len(sys.argv) != 2: + print "usage: makedoc.py filename" + sys.exit(1) + +filename = sys.argv[1] +filenamebase = string.split(filename,".")[0] + +section = 0 +subsection = 0 +subsubsection = 0 +subsubsubsection = 0 +nameindex = 0 + +name = "" + +# Regexs for <h1>,... <h5> sections + +h1 = re.compile(r".*?<H1>(<a.*a>)*[\d\.\s]*(.*?)</H1>", re.IGNORECASE) +h2 = re.compile(r".*?<H2>(<a.*a>)*[\d\.\s]*(.*?)</H2>", re.IGNORECASE) +h3 = re.compile(r".*?<H3>(<a.*a>)*[\d\.\s]*(.*?)</H3>", re.IGNORECASE) +h4 = re.compile(r".*?<H4>(<a.*a>)*[\d\.\s]*(.*?)</H4>", re.IGNORECASE) +h5 = re.compile(r".*?<H5>(<a.*a>)*[\d\.\s]*(.*?)</H5>", re.IGNORECASE) + +data = open(filename).read() # Read data +open(filename+".bak","w").write(data) # Make backup + +lines = data.splitlines() +result = [ ] # This is the result of postprocessing the file +index = "<!-- INDEX -->\n<div class=\"sectiontoc\">\n" # index contains the index for adding at the top of the file. Also printed to stdout. + +skip = 0 +skipspace = 0 + +for s in lines: + if s == "<!-- INDEX -->": + if not skip: + result.append("@INDEX@") + skip = 1 + else: + skip = 0 + continue; + if skip: + continue + + if not s and skipspace: + continue + + if skipspace: + result.append("") + result.append("") + skipspace = 0 + + m = h2.match(s) + if m: + prevheadingtext = m.group(2) + nameindex += 1 + section += 1 + headingname = getheadingname(m) + result.append("""<H2><a name="%s"></a>%d. 
%s</H2>""" % (headingname,section, prevheadingtext)) + + if subsubsubsection: + index += "</ul>\n" + if subsubsection: + index += "</ul>\n" + if subsection: + index += "</ul>\n" + if section == 1: + index += "<ul>\n" + + index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) + subsection = 0 + subsubsection = 0 + subsubsubsection = 0 + skipspace = 1 + continue + m = h3.match(s) + if m: + prevheadingtext = m.group(2) + nameindex += 1 + subsection += 1 + headingname = getheadingname(m) + result.append("""<H3><a name="%s"></a>%d.%d %s</H3>""" % (headingname,section, subsection, prevheadingtext)) + + if subsubsubsection: + index += "</ul>\n" + if subsubsection: + index += "</ul>\n" + if subsection == 1: + index += "<ul>\n" + + index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) + subsubsection = 0 + skipspace = 1 + continue + m = h4.match(s) + if m: + prevheadingtext = m.group(2) + nameindex += 1 + subsubsection += 1 + subsubsubsection = 0 + headingname = getheadingname(m) + result.append("""<H4><a name="%s"></a>%d.%d.%d %s</H4>""" % (headingname,section, subsection, subsubsection, prevheadingtext)) + + if subsubsubsection: + index += "</ul>\n" + if subsubsection == 1: + index += "<ul>\n" + + index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) + skipspace = 1 + continue + m = h5.match(s) + if m: + prevheadingtext = m.group(2) + nameindex += 1 + subsubsubsection += 1 + headingname = getheadingname(m) + result.append("""<H5><a name="%s"></a>%d.%d.%d.%d %s</H5>""" % (headingname,section, subsection, subsubsection, subsubsubsection, prevheadingtext)) + + if subsubsubsection == 1: + index += "<ul>\n" + + index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) + skipspace = 1 + continue + + result.append(s) + +if subsubsubsection: + index += "</ul>\n" + +if subsubsection: + index += "</ul>\n" + +if subsection: + index += "</ul>\n" + +if section: + index += "</ul>\n" + +index += "</div>\n<!-- INDEX -->\n" + +data = "\n".join(result) + +data = data.replace("@INDEX@",index) + "\n"; + +# Write the file back out +open(filename,"w").write(data) + +
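A brief usage note on the script above, inferred from its source rather than from separate documentation: it is written for Python 2 (print statements, the string module) and rewrites the named chapter in place after saving a backup copy, so a run over one of the chapters in this directory would look roughly like:

    $ python2 makedoc.py ply.html     # renumbers headings and rebuilds the index
    $ ls ply.html*
    ply.html  ply.html.bak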
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/doc/ply.html @@ -0,0 +1,3496 @@ +<html> +<head> +<title>PLY (Python Lex-Yacc)</title> +</head> +<body bgcolor="#ffffff"> + +<h1>PLY (Python Lex-Yacc)</h1> + +<b> +David M. Beazley <br> +dave@dabeaz.com<br> +</b> + +<p> +<b>PLY Version: 3.10</b> +<p> + +<!-- INDEX --> +<div class="sectiontoc"> +<ul> +<li><a href="#ply_nn1">Preface and Requirements</a> +<li><a href="#ply_nn1">Introduction</a> +<li><a href="#ply_nn2">PLY Overview</a> +<li><a href="#ply_nn3">Lex</a> +<ul> +<li><a href="#ply_nn4">Lex Example</a> +<li><a href="#ply_nn5">The tokens list</a> +<li><a href="#ply_nn6">Specification of tokens</a> +<li><a href="#ply_nn7">Token values</a> +<li><a href="#ply_nn8">Discarded tokens</a> +<li><a href="#ply_nn9">Line numbers and positional information</a> +<li><a href="#ply_nn10">Ignored characters</a> +<li><a href="#ply_nn11">Literal characters</a> +<li><a href="#ply_nn12">Error handling</a> +<li><a href="#ply_nn14">EOF Handling</a> +<li><a href="#ply_nn13">Building and using the lexer</a> +<li><a href="#ply_nn14">The @TOKEN decorator</a> +<li><a href="#ply_nn15">Optimized mode</a> +<li><a href="#ply_nn16">Debugging</a> +<li><a href="#ply_nn17">Alternative specification of lexers</a> +<li><a href="#ply_nn18">Maintaining state</a> +<li><a href="#ply_nn19">Lexer cloning</a> +<li><a href="#ply_nn20">Internal lexer state</a> +<li><a href="#ply_nn21">Conditional lexing and start conditions</a> +<li><a href="#ply_nn21">Miscellaneous Issues</a> +</ul> +<li><a href="#ply_nn22">Parsing basics</a> +<li><a href="#ply_nn23">Yacc</a> +<ul> +<li><a href="#ply_nn24">An example</a> +<li><a href="#ply_nn25">Combining Grammar Rule Functions</a> +<li><a href="#ply_nn26">Character Literals</a> +<li><a href="#ply_nn26">Empty Productions</a> +<li><a href="#ply_nn28">Changing the starting symbol</a> +<li><a href="#ply_nn27">Dealing With Ambiguous Grammars</a> +<li><a href="#ply_nn28">The parser.out file</a> +<li><a href="#ply_nn29">Syntax Error Handling</a> +<ul> +<li><a href="#ply_nn30">Recovery and resynchronization with error rules</a> +<li><a href="#ply_nn31">Panic mode recovery</a> +<li><a href="#ply_nn35">Signalling an error from a production</a> +<li><a href="#ply_nn38">When Do Syntax Errors Get Reported</a> +<li><a href="#ply_nn32">General comments on error handling</a> +</ul> +<li><a href="#ply_nn33">Line Number and Position Tracking</a> +<li><a href="#ply_nn34">AST Construction</a> +<li><a href="#ply_nn35">Embedded Actions</a> +<li><a href="#ply_nn36">Miscellaneous Yacc Notes</a> +</ul> +<li><a href="#ply_nn37">Multiple Parsers and Lexers</a> +<li><a href="#ply_nn38">Using Python's Optimized Mode</a> +<li><a href="#ply_nn44">Advanced Debugging</a> +<ul> +<li><a href="#ply_nn45">Debugging the lex() and yacc() commands</a> +<li><a href="#ply_nn46">Run-time Debugging</a> +</ul> +<li><a href="#ply_nn49">Packaging Advice</a> +<li><a href="#ply_nn39">Where to go from here?</a> +</ul> +</div> +<!-- INDEX --> + + + + + + +<H2><a name="ply_nn1"></a>1. Preface and Requirements</H2> + + +<p> +This document provides an overview of lexing and parsing with PLY. +Given the intrinsic complexity of parsing, I would strongly advise +that you read (or at least skim) this entire document before jumping +into a big development project with PLY. +</p> + +<p> +PLY-3.5 is compatible with both Python 2 and Python 3. If you are using +Python 2, you have to use Python 2.6 or newer. +</p> + +<H2><a name="ply_nn1"></a>2. 
Introduction</H2>
+
+
+PLY is a pure-Python implementation of the popular compiler
+construction tools lex and yacc. The main goal of PLY is to stay
+fairly faithful to the way in which traditional lex/yacc tools work.
+This includes supporting LALR(1) parsing as well as providing
+extensive input validation, error reporting, and diagnostics. Thus,
+if you've used yacc in another programming language, it should be
+relatively straightforward to use PLY.
+
+<p>
+Early versions of PLY were developed to support an Introduction to
+Compilers Course I taught in 2001 at the University of Chicago.
+Since PLY was primarily developed as an instructional tool, you will
+find it to be fairly picky about token and grammar rule
+specification. In part, this
+added formality is meant to catch common programming mistakes made by
+novice users. However, advanced users will also find such features to
+be useful when building complicated grammars for real programming
+languages. It should also be noted that PLY does not provide much in
+the way of bells and whistles (e.g., automatic construction of
+abstract syntax trees, tree traversal, etc.). Nor would I consider it
+to be a parsing framework. Instead, you will find a bare-bones, yet
+fully capable lex/yacc implementation written entirely in Python.
+
+<p>
+The rest of this document assumes that you are somewhat familiar with
+parsing theory, syntax directed translation, and the use of compiler
+construction tools such as lex and yacc in other programming
+languages. If you are unfamiliar with these topics, you will probably
+want to consult an introductory text such as "Compilers: Principles,
+Techniques, and Tools", by Aho, Sethi, and Ullman. O'Reilly's "Lex
+and Yacc" by John Levine may also be handy. In fact, the O'Reilly book can be
+used as a reference for PLY as the concepts are virtually identical.
+
+<H2><a name="ply_nn2"></a>3. PLY Overview</H2>
+
+
+<p>
+PLY consists of two separate modules: <tt>lex.py</tt> and
+<tt>yacc.py</tt>, both of which are found in a Python package
+called <tt>ply</tt>. The <tt>lex.py</tt> module is used to break input text into a
+collection of tokens specified by a collection of regular expression
+rules. <tt>yacc.py</tt> is used to recognize language syntax that has
+been specified in the form of a context free grammar.
+</p>
+
+<p>
+The two tools are meant to work together. Specifically,
+<tt>lex.py</tt> provides an external interface in the form of a
+<tt>token()</tt> function that returns the next valid token on the
+input stream. <tt>yacc.py</tt> calls this repeatedly to retrieve
+tokens and invoke grammar rules. The output of <tt>yacc.py</tt> is
+often an Abstract Syntax Tree (AST). However, this is entirely up to
+the user. If desired, <tt>yacc.py</tt> can also be used to implement
+simple one-pass compilers.
+
+<p>
+Like its Unix counterpart, <tt>yacc.py</tt> provides most of the
+features you expect including extensive error checking, grammar
+validation, support for empty productions, error tokens, and ambiguity
+resolution via precedence rules. In fact, almost everything that is possible in traditional yacc
+should be supported in PLY.
+
+<p>
+The primary difference between
+<tt>yacc.py</tt> and Unix <tt>yacc</tt> is that <tt>yacc.py</tt>
+doesn't involve a separate code-generation process.
+Instead, PLY relies on reflection (introspection)
+to build its lexers and parsers. 
Unlike traditional lex/yacc, which
require a special input file that is converted into a separate source
file, the specifications given to PLY <em>are</em> valid Python
programs.  This means that there are no extra source files, nor is
there a special compiler construction step (e.g., running yacc to
generate Python code for the compiler).  Since the generation of the
parsing tables is relatively expensive, PLY caches the results and
saves them to a file.  If no changes are detected in the input source,
the tables are read from the cache.  Otherwise, they are regenerated.
+
+<H2><a name="ply_nn3"></a>4. Lex</H2>
+
+
+<tt>lex.py</tt> is used to tokenize an input string.  For example, suppose
+you're writing a programming language and a user supplied the following input string:
+
+<blockquote>
+<pre>
+x = 3 + 42 * (s - t)
+</pre>
+</blockquote>
+
+A tokenizer splits the string into individual tokens:
+
+<blockquote>
+<pre>
+'x','=', '3', '+', '42', '*', '(', 's', '-', 't', ')'
+</pre>
+</blockquote>
+
+Tokens are usually given names to indicate what they are. For example:
+
+<blockquote>
+<pre>
+'ID','EQUALS','NUMBER','PLUS','NUMBER','TIMES',
+'LPAREN','ID','MINUS','ID','RPAREN'
+</pre>
+</blockquote>
+
+More specifically, the input is broken into pairs of token types and values.  For example:
+
+<blockquote>
+<pre>
+('ID','x'), ('EQUALS','='), ('NUMBER','3'),
+('PLUS','+'), ('NUMBER','42'), ('TIMES','*'),
+('LPAREN','('), ('ID','s'), ('MINUS','-'),
+('ID','t'), ('RPAREN',')')
+</pre>
+</blockquote>
+
+The identification of tokens is typically done by writing a series of regular expression
+rules.  The next section shows how this is done using <tt>lex.py</tt>.
+
+<H3><a name="ply_nn4"></a>4.1 Lex Example</H3>
+
+
+The following example shows how <tt>lex.py</tt> is used to write a simple tokenizer.
+
+<blockquote>
+<pre>
+# ------------------------------------------------------------
+# calclex.py
+#
+# tokenizer for a simple expression evaluator for
+# numbers and +,-,*,/
+# ------------------------------------------------------------
+import ply.lex as lex
+
+# List of token names.   This is always required
+tokens = (
+   'NUMBER',
+   'PLUS',
+   'MINUS',
+   'TIMES',
+   'DIVIDE',
+   'LPAREN',
+   'RPAREN',
+)
+
+# Regular expression rules for simple tokens
+t_PLUS    = r'\+'
+t_MINUS   = r'-'
+t_TIMES   = r'\*'
+t_DIVIDE  = r'/'
+t_LPAREN  = r'\('
+t_RPAREN  = r'\)'
+
+# A regular expression rule with some action code
+def t_NUMBER(t):
+    r'\d+'
+    t.value = int(t.value)
+    return t
+
+# Define a rule so we can track line numbers
+def t_newline(t):
+    r'\n+'
+    t.lexer.lineno += len(t.value)
+
+# A string containing ignored characters (spaces and tabs)
+t_ignore  = ' \t'
+
+# Error handling rule
+def t_error(t):
+    print("Illegal character '%s'" % t.value[0])
+    t.lexer.skip(1)
+
+# Build the lexer
+lexer = lex.lex()
+
+</pre>
+</blockquote>
+To use the lexer, you first need to feed it some input text using
+its <tt>input()</tt> method.  After that, repeated calls
+to <tt>token()</tt> produce tokens.
The following code shows how this
+works:
+
+<blockquote>
+<pre>
+
+# Test it out
+data = '''
+3 + 4 * 10
+  + -20 *2
+'''
+
+# Give the lexer some input
+lexer.input(data)
+
+# Tokenize
+while True:
+    tok = lexer.token()
+    if not tok:
+        break      # No more input
+    print(tok)
+</pre>
+</blockquote>
+
+When executed, the example will produce the following output:
+
+<blockquote>
+<pre>
+$ python example.py
+LexToken(NUMBER,3,2,1)
+LexToken(PLUS,'+',2,3)
+LexToken(NUMBER,4,2,5)
+LexToken(TIMES,'*',2,7)
+LexToken(NUMBER,10,2,10)
+LexToken(PLUS,'+',3,14)
+LexToken(MINUS,'-',3,16)
+LexToken(NUMBER,20,3,18)
+LexToken(TIMES,'*',3,20)
+LexToken(NUMBER,2,3,21)
+</pre>
+</blockquote>
+
+Lexers also support the iteration protocol.  So, you can write the above loop as follows:
+
+<blockquote>
+<pre>
+for tok in lexer:
+    print(tok)
+</pre>
+</blockquote>
+
+The tokens returned by <tt>lexer.token()</tt> are instances
+of <tt>LexToken</tt>.  This object has
+attributes <tt>tok.type</tt>, <tt>tok.value</tt>,
+<tt>tok.lineno</tt>, and <tt>tok.lexpos</tt>.  The following code shows an example of
+accessing these attributes:
+
+<blockquote>
+<pre>
+# Tokenize
+while True:
+    tok = lexer.token()
+    if not tok:
+        break      # No more input
+    print(tok.type, tok.value, tok.lineno, tok.lexpos)
+</pre>
+</blockquote>
+
+The <tt>tok.type</tt> and <tt>tok.value</tt> attributes contain the
+type and value of the token itself.
+<tt>tok.lineno</tt> and <tt>tok.lexpos</tt> contain information about
+the location of the token.  <tt>tok.lexpos</tt> is the index of the
+token relative to the start of the input text.
+
+<H3><a name="ply_nn5"></a>4.2 The tokens list</H3>
+
+
+<p>
+All lexers must provide a list <tt>tokens</tt> that defines all of the possible token
+names that can be produced by the lexer.  This list is always required
+and is used to perform a variety of validation checks.  The tokens list is also used by the
+<tt>yacc.py</tt> module to identify terminals.
+</p>
+
+<p>
+In the example, the following code specifies the token names:
+
+<blockquote>
+<pre>
+tokens = (
+   'NUMBER',
+   'PLUS',
+   'MINUS',
+   'TIMES',
+   'DIVIDE',
+   'LPAREN',
+   'RPAREN',
+)
+</pre>
+</blockquote>
+
+<H3><a name="ply_nn6"></a>4.3 Specification of tokens</H3>
+
+
+Each token is specified by writing a regular expression rule compatible with Python's <tt>re</tt> module.  Each of these rules
+is defined by making a declaration with a special prefix <tt>t_</tt> to indicate that it
+defines a token.  For simple tokens, the regular expression can
+be specified as a string such as this (note: Python raw strings are used since they are the
+most convenient way to write regular expression strings):
+
+<blockquote>
+<pre>
+t_PLUS = r'\+'
+</pre>
+</blockquote>
+
+In this case, the name following the <tt>t_</tt> must exactly match one of the
+names supplied in <tt>tokens</tt>.  If some kind of action needs to be performed,
+a token rule can be specified as a function.  For example, this rule matches numbers and
+converts the string into a Python integer:
+
+<blockquote>
+<pre>
+def t_NUMBER(t):
+    r'\d+'
+    t.value = int(t.value)
+    return t
+</pre>
+</blockquote>
+
+When a function is used, the regular expression rule is specified in the function documentation string.
+The function always takes a single argument which is an instance of
+<tt>LexToken</tt>.
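+
+<p>
+As a quick illustration of a function rule in action, here is a hedged sketch (the
+<tt>t_LETTERS</tt> rule below is made up for this example and is not part of the
+calculator lexer):
+</p>
+
+<blockquote>
+<pre>
+def t_LETTERS(t):
+    r'[a-zA-Z]+'
+    # t is a LexToken; an action is free to normalize the matched text
+    # ('LETTERS' would also need to appear in the tokens list)
+    t.value = t.value.upper()
+    return t
+</pre>
+</blockquote>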
The <tt>LexToken</tt> object has attributes <tt>t.type</tt>, which is the token type (as a string);
+<tt>t.value</tt>, which is the lexeme (the actual text matched); <tt>t.lineno</tt>, which is the current line number; and <tt>t.lexpos</tt>, which
+is the position of the token relative to the beginning of the input text.
+By default, <tt>t.type</tt> is set to the name following the <tt>t_</tt> prefix.  The action
+function can modify the contents of the <tt>LexToken</tt> object as appropriate.  However,
+when it is done, the resulting token should be returned.  If no value is returned by the action
+function, the token is simply discarded and the next token is read.
+
+<p>
+Internally, <tt>lex.py</tt> uses the <tt>re</tt> module to do its pattern matching.  Patterns are compiled
+using the <tt>re.VERBOSE</tt> flag, which can be used to improve readability.  However, be aware that unescaped
+whitespace is ignored and comments are allowed in this mode.  If your pattern involves whitespace, make sure you
+use <tt>\s</tt>.  If you need to match the <tt>#</tt> character, use <tt>[#]</tt>.
+</p>
+
+<p>
+When building the master regular expression,
+rules are added in the following order:
+</p>
+
+<p>
+<ol>
+<li>All tokens defined by functions are added in the same order as they appear in the lexer file.
+<li>Tokens defined by strings are added next by sorting them in order of decreasing regular expression length (longer expressions
+are added first).
+</ol>
+<p>
+Without this ordering, it can be difficult to correctly match certain types of tokens.  For example, if you
+wanted to have separate tokens for "=" and "==", you need to make sure that "==" is checked first.  By sorting regular
+expressions in order of decreasing length, this problem is solved for rules defined as strings.  For functions,
+the order can be explicitly controlled since rules appearing first are checked first.
+
+<p>
+To handle reserved words, you should write a single rule to match an
+identifier and do a special name lookup in a function like this:
+
+<blockquote>
+<pre>
+reserved = {
+   'if' : 'IF',
+   'then' : 'THEN',
+   'else' : 'ELSE',
+   'while' : 'WHILE',
+   ...
+}
+
+tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values())
+
+def t_ID(t):
+    r'[a-zA-Z_][a-zA-Z_0-9]*'
+    t.type = reserved.get(t.value,'ID')    # Check for reserved words
+    return t
+</pre>
+</blockquote>
+
+This approach greatly reduces the number of regular expression rules and is likely to make things a little faster.
+
+<p>
+<b>Note:</b> You should avoid writing individual rules for reserved words.  For example, if you write rules like this,
+
+<blockquote>
+<pre>
+t_FOR   = r'for'
+t_PRINT = r'print'
+</pre>
+</blockquote>
+
+those rules will be triggered for identifiers that include those words as a prefix such as "forget" or "printed".  This is probably not
+what you want.
+
+<H3><a name="ply_nn7"></a>4.4 Token values</H3>
+
+
+When tokens are returned by lex, they have a value that is stored in the <tt>value</tt> attribute.  Normally, the value is the text
+that was matched.  However, the value can be assigned to any Python object.  For instance, when lexing identifiers, you may
+want to return both the identifier name and information from some sort of symbol table.  To do this, you might write a rule like this:
+
+<blockquote>
+<pre>
+def t_ID(t):
+    ...
+    # Look up symbol table information and return a tuple
+    t.value = (t.value, symbol_lookup(t.value))
+    ...
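+    # (symbol_lookup() here is a user-supplied helper standing in for your
+    # application's symbol table machinery; it is not part of PLY itself)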
+    return t
+</pre>
+</blockquote>
+
+It is important to note that storing data in other attribute names is <em>not</em> recommended.  The <tt>yacc.py</tt> module only exposes the
+contents of the <tt>value</tt> attribute.  Thus, accessing other attributes may be unnecessarily awkward.  If you
+need to store multiple values on a token, assign a tuple, dictionary, or instance to <tt>value</tt>.
+
+<H3><a name="ply_nn8"></a>4.5 Discarded tokens</H3>
+
+
+To discard a token, such as a comment, simply define a token rule that returns no value.  For example:
+
+<blockquote>
+<pre>
+def t_COMMENT(t):
+    r'\#.*'
+    pass
+    # No return value. Token discarded
+</pre>
+</blockquote>
+
+Alternatively, you can include the prefix "ignore_" in the token declaration to force a token to be ignored.  For example:
+
+<blockquote>
+<pre>
+t_ignore_COMMENT = r'\#.*'
+</pre>
+</blockquote>
+
+Be advised that if you are ignoring many different kinds of text, you may still want to use functions since these provide more precise
+control over the order in which regular expressions are matched (i.e., functions are matched in order of specification whereas strings are
+sorted by regular expression length).
+
+<H3><a name="ply_nn9"></a>4.6 Line numbers and positional information</H3>
+
+
+<p>By default, <tt>lex.py</tt> knows nothing about line numbers.  This is because <tt>lex.py</tt> doesn't know anything
+about what constitutes a "line" of input (e.g., the newline character or even if the input is textual data).
+To update this information, you need to write a special rule.  In the example, the <tt>t_newline()</tt> rule shows how to do this.
+
+<blockquote>
+<pre>
+# Define a rule so we can track line numbers
+def t_newline(t):
+    r'\n+'
+    t.lexer.lineno += len(t.value)
+</pre>
+</blockquote>
+Within the rule, the <tt>lineno</tt> attribute of the underlying lexer <tt>t.lexer</tt> is updated.
+After the line number is updated, the token is simply discarded since nothing is returned.
+
+<p>
+<tt>lex.py</tt> does not perform any kind of automatic column tracking.  However, it does record positional
+information related to each token in the <tt>lexpos</tt> attribute.  Using this, it is usually possible to compute
+column information as a separate step.  For instance, just count backwards until you reach a newline:
+
+<blockquote>
+<pre>
+# Compute column.
+#     input is the input text string
+#     token is a token instance
+def find_column(input, token):
+    line_start = input.rfind('\n', 0, token.lexpos) + 1
+    return (token.lexpos - line_start) + 1
+</pre>
+</blockquote>
+
+Since column information is often only useful in the context of error handling, calculating the column
+position can be performed when needed as opposed to doing it for each token.
+
+<H3><a name="ply_nn10"></a>4.7 Ignored characters</H3>
+
+
+<p>
+The special <tt>t_ignore</tt> rule is reserved by <tt>lex.py</tt> for characters
+that should be completely ignored in the input stream.
+Usually this is used to skip over whitespace and other non-essential characters.
+Although it is possible to define a regular expression rule for whitespace in a manner
+similar to <tt>t_newline()</tt>, the use of <tt>t_ignore</tt> provides substantially better
+lexing performance because it is handled as a special case and is checked in a much
+more efficient manner than the normal regular expression rules.
+</p>
+
+<p>
+The characters given in <tt>t_ignore</tt> are not ignored when such characters are part of
+other regular expression patterns.
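+Here is a minimal sketch of that interaction (the <tt>STRING</tt> rule below is
+illustrative and not part of the calculator lexer):
+
+<blockquote>
+<pre>
+t_ignore = ' \t'        # Spaces and tabs between tokens are skipped
+
+def t_STRING(t):
+    r'"[^"\n]*"'
+    # Any spaces or tabs inside the quotes are kept in t.value;
+    # t_ignore only applies between tokens, not within a match.
+    # ('STRING' would also need to appear in the tokens list)
+    return t
+</pre>
+</blockquote>
+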
In other words, a rule that captures quoted text
+will include any ignored characters that appear inside the quotes (they are captured in the normal way).  The
+main purpose of <tt>t_ignore</tt> is to ignore whitespace and other padding between the
+tokens that you actually want to parse.
+</p>
+
+<H3><a name="ply_nn11"></a>4.8 Literal characters</H3>
+
+
+<p>
+Literal characters can be specified by defining a variable <tt>literals</tt> in your lexing module.  For example:
+
+<blockquote>
+<pre>
+literals = [ '+','-','*','/' ]
+</pre>
+</blockquote>
+
+or alternatively
+
+<blockquote>
+<pre>
+literals = "+-*/"
+</pre>
+</blockquote>
+
+A literal character is simply a single character that is returned "as is" when encountered by the lexer.  Literals are checked
+after all of the defined regular expression rules.  Thus, if a rule starts with one of the literal characters, it will always
+take precedence.
+
+<p>
+When a literal token is returned, both its <tt>type</tt> and <tt>value</tt> attributes are set to the character itself.  For example, <tt>'+'</tt>.
+</p>
+
+<p>
+It's possible to write token functions that perform additional actions
+when literals are matched.  However, you'll need to set the token type
+appropriately. For example:
+</p>
+
+<blockquote>
+<pre>
+literals = [ '{', '}' ]
+
+def t_lbrace(t):
+    r'\{'
+    t.type = '{'      # Set token type to the expected literal
+    return t
+
+def t_rbrace(t):
+    r'\}'
+    t.type = '}'      # Set token type to the expected literal
+    return t
+</pre>
+</blockquote>
+
+<H3><a name="ply_nn12"></a>4.9 Error handling</H3>
+
+
+<p>
+The <tt>t_error()</tt>
+function is used to handle lexing errors that occur when illegal
+characters are detected.  In this case, the <tt>t.value</tt> attribute contains the
+rest of the input string that has not been tokenized.  In the example, the error function
+was defined as follows:
+
+<blockquote>
+<pre>
+# Error handling rule
+def t_error(t):
+    print("Illegal character '%s'" % t.value[0])
+    t.lexer.skip(1)
+</pre>
+</blockquote>
+
+In this case, we simply print the offending character and skip ahead one character by calling <tt>t.lexer.skip(1)</tt>.
+
+<H3><a name="ply_nn14"></a>4.10 EOF Handling</H3>
+
+
+<p>
+The <tt>t_eof()</tt> function is used to handle an end-of-file (EOF) condition in the input.  As input, it
+receives a token of type <tt>'eof'</tt> with the <tt>lineno</tt> and <tt>lexpos</tt> attributes set appropriately.
+The main use of this function is to provide more input to the lexer so that it can continue to parse.  Here is an
+example of how this works:
+</p>
+
+<blockquote>
+<pre>
+# EOF handling rule
+def t_eof(t):
+    # Get more input (Example)
+    more = raw_input('... ')
+    if more:
+        t.lexer.input(more)
+        return t.lexer.token()
+    return None
+</pre>
+</blockquote>
+
+<p>
+The EOF function should return the next available token (by calling <tt>t.lexer.token()</tt>) or <tt>None</tt> to
+indicate no more data.  Be aware that setting more input with the <tt>t.lexer.input()</tt> method does
+NOT reset the lexer state or the <tt>lineno</tt> attribute used for position tracking.  The <tt>lexpos</tt>
+attribute is reset, so be aware of that if you're using it in error reporting.
+</p>
+
+<H3><a name="ply_nn13"></a>4.11 Building and using the lexer</H3>
+
+
+<p>
+To build the lexer, the function <tt>lex.lex()</tt> is used.
For example:</p>
+
+<blockquote>
+<pre>
+lexer = lex.lex()
+</pre>
+</blockquote>
+
+<p>This function
+uses Python reflection (or introspection) to read the regular expression rules
+out of the calling context and build the lexer. Once the lexer has been built, two methods can
+be used to control the lexer.
+</p>
+<ul>
+<li><tt>lexer.input(data)</tt>.  Reset the lexer and store a new input string.
+<li><tt>lexer.token()</tt>.  Return the next token.  Returns a special <tt>LexToken</tt> instance on success or
+None if the end of the input text has been reached.
+</ul>
+
+<H3><a name="ply_nn14"></a>4.12 The @TOKEN decorator</H3>
+
+
+In some applications, you may want to build tokens from a series of
+more complex regular expression rules.  For example:
+
+<blockquote>
+<pre>
+digit            = r'([0-9])'
+nondigit         = r'([_A-Za-z])'
+identifier       = r'(' + nondigit + r'(' + digit + r'|' + nondigit + r')*)'
+
+def t_ID(t):
+    # want docstring to be identifier above. ?????
+    ...
+</pre>
+</blockquote>
+
+In this case, we want the regular expression rule for <tt>ID</tt> to be one of the variables above. However, there is no
+way to directly specify this using a normal documentation string.  To solve this problem, you can use the <tt>@TOKEN</tt>
+decorator.  For example:
+
+<blockquote>
+<pre>
+from ply.lex import TOKEN
+
+@TOKEN(identifier)
+def t_ID(t):
+    ...
+</pre>
+</blockquote>
+
+<p>
+This will attach <tt>identifier</tt> to the docstring for <tt>t_ID()</tt>, allowing <tt>lex.py</tt> to work normally.
+</p>
+
+<H3><a name="ply_nn15"></a>4.13 Optimized mode</H3>
+
+
+For improved performance, it may be desirable to use Python's
+optimized mode (e.g., running Python with the <tt>-O</tt>
+option).  However, doing so causes Python to ignore documentation
+strings.  This presents special problems for <tt>lex.py</tt>.  To
+handle this case, you can create your lexer using
+the <tt>optimize</tt> option as follows:
+
+<blockquote>
+<pre>
+lexer = lex.lex(optimize=1)
+</pre>
+</blockquote>
+
+Next, run Python in its normal operating mode.  When you do
+this, <tt>lex.py</tt> will write a file called <tt>lextab.py</tt> in
+the same directory as the module containing the lexer specification.
+This file contains all of the regular
+expression rules and tables used during lexing.  On subsequent
+executions,
+<tt>lextab.py</tt> will simply be imported to build the lexer.  This
+approach substantially improves the startup time of the lexer and it
+works in Python's optimized mode.
+
+<p>
+To change the name of the lexer-generated module, use the <tt>lextab</tt> keyword argument.  For example:
+</p>
+
+<blockquote>
+<pre>
+lexer = lex.lex(optimize=1,lextab="footab")
+</pre>
+</blockquote>
+
+When running in optimized mode, it is important to note that lex disables most error checking.  Thus, this is really only recommended
+if you're sure everything is working correctly and you're ready to start releasing production code.
+
+<H3><a name="ply_nn16"></a>4.14 Debugging</H3>
+
+
+For the purpose of debugging, you can run <tt>lex()</tt> in a debugging mode as follows:
+
+<blockquote>
+<pre>
+lexer = lex.lex(debug=1)
+</pre>
+</blockquote>
+
+<p>
+This will produce various sorts of debugging information including all of the added rules,
+the master regular expressions used by the lexer, and tokens generated during lexing.
+</p>
+
+<p>
+In addition, <tt>lex.py</tt> comes with a simple main function which
+will either tokenize input read from standard input or from a file specified
+on the command line.
To use it, simply put this in your lexer:
+</p>
+
+<blockquote>
+<pre>
+if __name__ == '__main__':
+     lex.runmain()
+</pre>
+</blockquote>
+
+Please refer to the "Debugging" section near the end for some more advanced details
+of debugging.
+
+<H3><a name="ply_nn17"></a>4.15 Alternative specification of lexers</H3>
+
+
+As shown in the example, lexers are specified all within one Python module.  If you want to
+put token rules in a different module from the one in which you invoke <tt>lex()</tt>, use the
+<tt>module</tt> keyword argument.
+
+<p>
+For example, you might have a dedicated module that just contains
+the token rules:
+
+<blockquote>
+<pre>
+# module: tokrules.py
+# This module just contains the lexing rules
+
+# List of token names.   This is always required
+tokens = (
+   'NUMBER',
+   'PLUS',
+   'MINUS',
+   'TIMES',
+   'DIVIDE',
+   'LPAREN',
+   'RPAREN',
+)
+
+# Regular expression rules for simple tokens
+t_PLUS    = r'\+'
+t_MINUS   = r'-'
+t_TIMES   = r'\*'
+t_DIVIDE  = r'/'
+t_LPAREN  = r'\('
+t_RPAREN  = r'\)'
+
+# A regular expression rule with some action code
+def t_NUMBER(t):
+    r'\d+'
+    t.value = int(t.value)
+    return t
+
+# Define a rule so we can track line numbers
+def t_newline(t):
+    r'\n+'
+    t.lexer.lineno += len(t.value)
+
+# A string containing ignored characters (spaces and tabs)
+t_ignore  = ' \t'
+
+# Error handling rule
+def t_error(t):
+    print("Illegal character '%s'" % t.value[0])
+    t.lexer.skip(1)
+</pre>
+</blockquote>
+
+Now, if you wanted to build a tokenizer from these rules from within a different module, you would do the following (shown for Python interactive mode):
+
+<blockquote>
+<pre>
+>>> import tokrules
+>>> <b>lexer = lex.lex(module=tokrules)</b>
+>>> lexer.input("3 + 4")
+>>> lexer.token()
+LexToken(NUMBER,3,1,0)
+>>> lexer.token()
+LexToken(PLUS,'+',1,2)
+>>> lexer.token()
+LexToken(NUMBER,4,1,4)
+>>> lexer.token()
+None
+>>>
+</pre>
+</blockquote>
+
+The <tt>module</tt> option can also be used to define lexers from instances of a class.  For example:
+
+<blockquote>
+<pre>
+import ply.lex as lex
+
+class MyLexer(object):
+    # List of token names.   This is always required
+    tokens = (
+       'NUMBER',
+       'PLUS',
+       'MINUS',
+       'TIMES',
+       'DIVIDE',
+       'LPAREN',
+       'RPAREN',
+    )
+
+    # Regular expression rules for simple tokens
+    t_PLUS    = r'\+'
+    t_MINUS   = r'-'
+    t_TIMES   = r'\*'
+    t_DIVIDE  = r'/'
+    t_LPAREN  = r'\('
+    t_RPAREN  = r'\)'
+
+    # A regular expression rule with some action code
+    # Note addition of self parameter since we're in a class
+    def t_NUMBER(self,t):
+        r'\d+'
+        t.value = int(t.value)
+        return t
+
+    # Define a rule so we can track line numbers
+    def t_newline(self,t):
+        r'\n+'
+        t.lexer.lineno += len(t.value)
+
+    # A string containing ignored characters (spaces and tabs)
+    t_ignore  = ' \t'
+
+    # Error handling rule
+    def t_error(self,t):
+        print("Illegal character '%s'" % t.value[0])
+        t.lexer.skip(1)
+
+    <b># Build the lexer
+    def build(self,**kwargs):
+        self.lexer = lex.lex(module=self, **kwargs)</b>
+
+    # Test it out
+    def test(self,data):
+        self.lexer.input(data)
+        while True:
+             tok = self.lexer.token()
+             if not tok:
+                 break
+             print(tok)
+
+# Build the lexer and try it out
+m = MyLexer()
+m.build()           # Build the lexer
+m.test("3 + 4")     # Test it
+</pre>
+</blockquote>
+
+
+When building a lexer from a class, <em>you should construct the lexer from
+an instance of the class</em>, not the class object itself.  This is because
+PLY only works properly if the lexer actions are defined by bound methods.
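+
+<p>
+As a brief sketch of the distinction, using the <tt>MyLexer</tt> class shown above:
+</p>
+
+<blockquote>
+<pre>
+m = MyLexer()
+lexer = lex.lex(module=m)          # Correct: rules are bound methods of m
+
+# lexer = lex.lex(module=MyLexer)  # Incorrect: rules are unbound class attributes
+</pre>
+</blockquote>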
+
+<p>
+When using the <tt>module</tt> option to <tt>lex()</tt>, PLY collects symbols
+from the underlying object using the <tt>dir()</tt> function. There is no
+direct access to the <tt>__dict__</tt> attribute of the object supplied as a
+module value.</p>
+
+<P>
+Finally, if you want to keep things nicely encapsulated, but don't want to use a
+full-fledged class definition, lexers can be defined using closures.  For example:
+
+<blockquote>
+<pre>
+import ply.lex as lex
+
+# List of token names.   This is always required
+tokens = (
+  'NUMBER',
+  'PLUS',
+  'MINUS',
+  'TIMES',
+  'DIVIDE',
+  'LPAREN',
+  'RPAREN',
+)
+
+def MyLexer():
+    # Regular expression rules for simple tokens
+    t_PLUS    = r'\+'
+    t_MINUS   = r'-'
+    t_TIMES   = r'\*'
+    t_DIVIDE  = r'/'
+    t_LPAREN  = r'\('
+    t_RPAREN  = r'\)'
+
+    # A regular expression rule with some action code
+    def t_NUMBER(t):
+        r'\d+'
+        t.value = int(t.value)
+        return t
+
+    # Define a rule so we can track line numbers
+    def t_newline(t):
+        r'\n+'
+        t.lexer.lineno += len(t.value)
+
+    # A string containing ignored characters (spaces and tabs)
+    t_ignore  = ' \t'
+
+    # Error handling rule
+    def t_error(t):
+        print("Illegal character '%s'" % t.value[0])
+        t.lexer.skip(1)
+
+    # Build the lexer from my environment and return it
+    return lex.lex()
+</pre>
+</blockquote>
+
+<p>
+<b>Important note:</b> If you are defining a lexer using a class or closure, be aware that PLY still requires you to only
+define a single lexer per module (source file).  PLY performs extensive validation and error checking, and it
+may falsely report errors if you don't follow this rule.
+</p>
+
+<H3><a name="ply_nn18"></a>4.16 Maintaining state</H3>
+
+
+In your lexer, you may want to maintain a variety of state
+information.  This might include mode settings, symbol tables, and
+other details.  As an example, suppose that you wanted to keep
+track of how many NUMBER tokens had been encountered.
+
+<p>
+One way to do this is to keep a set of global variables in the module
+where you created the lexer.  For example:
+
+<blockquote>
+<pre>
+num_count = 0
+def t_NUMBER(t):
+    r'\d+'
+    global num_count
+    num_count += 1
+    t.value = int(t.value)
+    return t
+</pre>
+</blockquote>
+
+If you don't like the use of a global variable, another place to store
+information is inside the Lexer object created by <tt>lex()</tt>.
+To do this, you can use the <tt>lexer</tt> attribute of tokens passed to
+the various rules. For example:
+
+<blockquote>
+<pre>
+def t_NUMBER(t):
+    r'\d+'
+    t.lexer.num_count += 1     # Note use of lexer attribute
+    t.value = int(t.value)
+    return t
+
+lexer = lex.lex()
+lexer.num_count = 0            # Set the initial count
+</pre>
+</blockquote>
+
+This latter approach has the advantage of being simple and working
+correctly in applications where multiple instantiations of a given
+lexer exist in the same application.  However, this might also feel
+like a gross violation of encapsulation to OO purists.
+Just to put your mind at ease, all
+internal attributes of the lexer (with the exception of <tt>lineno</tt>) have names that are prefixed
+by <tt>lex</tt> (e.g., <tt>lexdata</tt>, <tt>lexpos</tt>, etc.).  Thus,
+it is perfectly safe to store attributes in the lexer that
+don't have names starting with that prefix or a name that conflicts with one of the
+predefined methods (e.g., <tt>input()</tt>, <tt>token()</tt>, etc.).
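+
+<p>
+For example, under the naming rules just described, attribute choices such as these
+should be safe (the names themselves are purely illustrative):
+</p>
+
+<blockquote>
+<pre>
+lexer = lex.lex()
+lexer.symtab = {}         # Fine: no "lex" prefix, no clash with input()/token()
+lexer.paren_depth = 0     # Fine
+# lexer.lexdepth = 0      # Risky: the "lex" prefix is used by internal attributes
+</pre>
+</blockquote>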
+ +<p> +If you don't like assigning values on the lexer object, you can define your lexer as a class as +shown in the previous section: + +<blockquote> +<pre> +class MyLexer: + ... + def t_NUMBER(self,t): + r'\d+' + self.num_count += 1 + t.value = int(t.value) + return t + + def build(self, **kwargs): + self.lexer = lex.lex(object=self,**kwargs) + + def __init__(self): + self.num_count = 0 +</pre> +</blockquote> + +The class approach may be the easiest to manage if your application is +going to be creating multiple instances of the same lexer and you need +to manage a lot of state. + +<p> +State can also be managed through closures. For example, in Python 3: + +<blockquote> +<pre> +def MyLexer(): + num_count = 0 + ... + def t_NUMBER(t): + r'\d+' + nonlocal num_count + num_count += 1 + t.value = int(t.value) + return t + ... +</pre> +</blockquote> + +<H3><a name="ply_nn19"></a>4.17 Lexer cloning</H3> + + +<p> +If necessary, a lexer object can be duplicated by invoking its <tt>clone()</tt> method. For example: + +<blockquote> +<pre> +lexer = lex.lex() +... +newlexer = lexer.clone() +</pre> +</blockquote> + +When a lexer is cloned, the copy is exactly identical to the original lexer +including any input text and internal state. However, the clone allows a +different set of input text to be supplied which may be processed separately. +This may be useful in situations when you are writing a parser/compiler that +involves recursive or reentrant processing. For instance, if you +needed to scan ahead in the input for some reason, you could create a +clone and use it to look ahead. Or, if you were implementing some kind of preprocessor, +cloned lexers could be used to handle different input files. + +<p> +Creating a clone is different than calling <tt>lex.lex()</tt> in that +PLY doesn't regenerate any of the internal tables or regular expressions. + +<p> +Special considerations need to be made when cloning lexers that also +maintain their own internal state using classes or closures. Namely, +you need to be aware that the newly created lexers will share all of +this state with the original lexer. For example, if you defined a +lexer as a class and did this: + +<blockquote> +<pre> +m = MyLexer() +a = lex.lex(object=m) # Create a lexer + +b = a.clone() # Clone the lexer +</pre> +</blockquote> + +Then both <tt>a</tt> and <tt>b</tt> are going to be bound to the same +object <tt>m</tt> and any changes to <tt>m</tt> will be reflected in both lexers. It's +important to emphasize that <tt>clone()</tt> is only meant to create a new lexer +that reuses the regular expressions and environment of another lexer. If you +need to make a totally new copy of a lexer, then call <tt>lex()</tt> again. + +<H3><a name="ply_nn20"></a>4.18 Internal lexer state</H3> + + +A Lexer object <tt>lexer</tt> has a number of internal attributes that may be useful in certain +situations. + +<p> +<tt>lexer.lexpos</tt> +<blockquote> +This attribute is an integer that contains the current position within the input text. If you modify +the value, it will change the result of the next call to <tt>token()</tt>. Within token rule functions, this points +to the first character <em>after</em> the matched text. If the value is modified within a rule, the next returned token will be +matched at the new position. +</blockquote> + +<p> +<tt>lexer.lineno</tt> +<blockquote> +The current value of the line number attribute stored in the lexer. PLY only specifies that the attribute +exists---it never sets, updates, or performs any processing with it. 
If you want to track line numbers,
+you will need to add code yourself (see the section on line numbers and positional information).
+</blockquote>
+
+<p>
+<tt>lexer.lexdata</tt>
+<blockquote>
+The current input text stored in the lexer.  This is the string passed with the <tt>input()</tt> method. It
+would probably be a bad idea to modify this unless you really know what you're doing.
+</blockquote>
+
+<P>
+<tt>lexer.lexmatch</tt>
+<blockquote>
+This is the raw <tt>Match</tt> object returned by the Python <tt>re.match()</tt> function (used internally by PLY) for the
+current token.  If you have written a regular expression that contains named groups, you can use this to retrieve those values.
+Note:  This attribute is only updated when tokens are defined and processed by functions.
+</blockquote>
+
+<H3><a name="ply_nn21"></a>4.19 Conditional lexing and start conditions</H3>
+
+
+In advanced parsing applications, it may be useful to have different
+lexing states. For instance, you may want the occurrence of a certain
+token or syntactic construct to trigger a different kind of lexing.
+PLY supports a feature that allows the underlying lexer to be put into
+a series of different states.  Each state can have its own tokens,
+lexing rules, and so forth.  The implementation is based largely on
+the "start condition" feature of GNU flex.  Details of this can be found
+at <a
+href="http://flex.sourceforge.net/manual/Start-Conditions.html">http://flex.sourceforge.net/manual/Start-Conditions.html</a>.
+
+<p>
+To define a new lexing state, it must first be declared.  This is done by including a "states" declaration in your
+lex file.  For example:
+
+<blockquote>
+<pre>
+states = (
+   ('foo','exclusive'),
+   ('bar','inclusive'),
+)
+</pre>
+</blockquote>
+
+This declares two states, <tt>'foo'</tt>
+and <tt>'bar'</tt>.  States may be of two types: <tt>'exclusive'</tt>
+and <tt>'inclusive'</tt>.  An exclusive state completely overrides the
+default behavior of the lexer.  That is, lex will only return tokens
+and apply rules defined specifically for that state.  An inclusive
+state adds additional tokens and rules to the default set of rules.
+Thus, lex will return the tokens defined by default in addition
+to those defined for the inclusive state.
+
+<p>
+Once a state has been declared, tokens and rules are declared by including the
+state name in the token/rule declaration.  For example:
+
+<blockquote>
+<pre>
+t_foo_NUMBER = r'\d+'                      # Token 'NUMBER' in state 'foo'
+t_bar_ID     = r'[a-zA-Z_][a-zA-Z0-9_]*'   # Token 'ID' in state 'bar'
+
+def t_foo_newline(t):
+    r'\n'
+    t.lexer.lineno += 1
+</pre>
+</blockquote>
+
+A token can be declared in multiple states by including multiple state names in the declaration. For example:
+
+<blockquote>
+<pre>
+t_foo_bar_NUMBER = r'\d+'         # Defines token 'NUMBER' in both state 'foo' and 'bar'
+</pre>
+</blockquote>
+
+Alternatively, a token can be declared in all states by using 'ANY' in the name.
+
+<blockquote>
+<pre>
+t_ANY_NUMBER = r'\d+'         # Defines a token 'NUMBER' in all states
+</pre>
+</blockquote>
+
+If no state name is supplied, as is normally the case, the token is associated with a special state <tt>'INITIAL'</tt>.  For example,
+these two declarations are identical:
+
+<blockquote>
+<pre>
+t_NUMBER = r'\d+'
+t_INITIAL_NUMBER = r'\d+'
+</pre>
+</blockquote>
+
+<p>
+States are also associated with the special <tt>t_ignore</tt>, <tt>t_error()</tt>, and <tt>t_eof()</tt> declarations.
For example, if a state treats +these differently, you can declare:</p> + +<blockquote> +<pre> +t_foo_ignore = " \t\n" # Ignored characters for state 'foo' + +def t_bar_error(t): # Special error handler for state 'bar' + pass +</pre> +</blockquote> + +By default, lexing operates in the <tt>'INITIAL'</tt> state. This state includes all of the normally defined tokens. +For users who aren't using different states, this fact is completely transparent. If, during lexing or parsing, you want to change +the lexing state, use the <tt>begin()</tt> method. For example: + +<blockquote> +<pre> +def t_begin_foo(t): + r'start_foo' + t.lexer.begin('foo') # Starts 'foo' state +</pre> +</blockquote> + +To get out of a state, you use <tt>begin()</tt> to switch back to the initial state. For example: + +<blockquote> +<pre> +def t_foo_end(t): + r'end_foo' + t.lexer.begin('INITIAL') # Back to the initial state +</pre> +</blockquote> + +The management of states can also be done with a stack. For example: + +<blockquote> +<pre> +def t_begin_foo(t): + r'start_foo' + t.lexer.push_state('foo') # Starts 'foo' state + +def t_foo_end(t): + r'end_foo' + t.lexer.pop_state() # Back to the previous state +</pre> +</blockquote> + +<p> +The use of a stack would be useful in situations where there are many ways of entering a new lexing state and you merely want to go back +to the previous state afterwards. + +<P> +An example might help clarify. Suppose you were writing a parser and you wanted to grab sections of arbitrary C code enclosed by +curly braces. That is, whenever you encounter a starting brace '{', you want to read all of the enclosed code up to the ending brace '}' +and return it as a string. Doing this with a normal regular expression rule is nearly (if not actually) impossible. This is because braces can +be nested and can be included in comments and strings. Thus, simply matching up to the first matching '}' character isn't good enough. Here is how +you might use lexer states to do this: + +<blockquote> +<pre> +# Declare the state +states = ( + ('ccode','exclusive'), +) + +# Match the first {. Enter ccode state. +def t_ccode(t): + r'\{' + t.lexer.code_start = t.lexer.lexpos # Record the starting position + t.lexer.level = 1 # Initial brace level + t.lexer.begin('ccode') # Enter 'ccode' state + +# Rules for the ccode state +def t_ccode_lbrace(t): + r'\{' + t.lexer.level +=1 + +def t_ccode_rbrace(t): + r'\}' + t.lexer.level -=1 + + # If closing brace, return the code fragment + if t.lexer.level == 0: + t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1] + t.type = "CCODE" + t.lexer.lineno += t.value.count('\n') + t.lexer.begin('INITIAL') + return t + +# C or C++ comment (ignore) +def t_ccode_comment(t): + r'(/\*(.|\n)*?\*/)|(//.*)' + pass + +# C string +def t_ccode_string(t): + r'\"([^\\\n]|(\\.))*?\"' + +# C character literal +def t_ccode_char(t): + r'\'([^\\\n]|(\\.))*?\'' + +# Any sequence of non-whitespace characters (not braces, strings) +def t_ccode_nonspace(t): + r'[^\s\{\}\'\"]+' + +# Ignored characters (whitespace) +t_ccode_ignore = " \t\n" + +# For bad characters, we just skip over it +def t_ccode_error(t): + t.lexer.skip(1) +</pre> +</blockquote> + +In this example, the occurrence of the first '{' causes the lexer to record the starting position and enter a new state <tt>'ccode'</tt>. A collection of rules then match +various parts of the input that follow (comments, strings, etc.). All of these rules merely discard the token (by not returning a value). 
+However, if the closing right brace is encountered, the rule <tt>t_ccode_rbrace</tt> collects all of the code (using the earlier recorded starting
+position), stores it, and returns a token 'CCODE' containing all of that text.  When returning the token, the lexing state is restored back to its
+initial state.
+
+<H3><a name="ply_nn21"></a>4.20 Miscellaneous Issues</H3>
+
+
+<ul>
+<li>The lexer requires input to be supplied as a single input string.  Since most machines have more than enough memory, this
+rarely presents a performance concern.  However, it means that the lexer currently can't be used with streaming data
+such as open files or sockets.  This limitation is primarily a side-effect of using the <tt>re</tt> module.  You might be
+able to work around this by implementing an appropriate <tt>def t_eof()</tt> end-of-file handling rule.  The main complication
+here is that you'll probably need to ensure that data is fed to the lexer in a way so that it doesn't split in the middle
+of a token.
+
+<p>
+<li>The lexer should work properly with Unicode strings, both for the token and pattern matching rules and
+for the input text.
+
+<p>
+<li>If you need to supply optional flags to the re.compile() function, use the reflags option to lex.  For example:
+
+<blockquote>
+<pre>
+lex.lex(reflags=re.UNICODE)
+</pre>
+</blockquote>
+
+<p>
+<li>Since the lexer is written entirely in Python, its performance is
+largely determined by that of the Python <tt>re</tt> module.  Although
+the lexer has been written to be as efficient as possible, it's not
+blazingly fast when used on very large input files.  If
+performance is a concern, you might consider upgrading to the most
+recent version of Python, creating a hand-written lexer, or offloading
+the lexer into a C extension module.
+
+<p>
+If you are going to create a hand-written lexer and you plan to use it with <tt>yacc.py</tt>,
+it only needs to conform to the following requirements:
+
+<ul>
+<li>It must provide a <tt>token()</tt> method that returns the next token or <tt>None</tt> if no more
+tokens are available.
+<li>The <tt>token()</tt> method must return an object <tt>tok</tt> that has <tt>type</tt> and <tt>value</tt> attributes.  If
+line number tracking is being used, then the token should also define a <tt>lineno</tt> attribute.
+</ul>
+</ul>
+
+<H2><a name="ply_nn22"></a>5. Parsing basics</H2>
+
+
+<tt>yacc.py</tt> is used to parse language syntax.  Before showing an
+example, there are a few important bits of background that must be
+mentioned.  First, <em>syntax</em> is usually specified in terms of a BNF grammar.
+For example, if you wanted to parse
+simple arithmetic expressions, you might first write an unambiguous
+grammar specification like this:
+
+<blockquote>
+<pre>
+expression : expression + term
+           | expression - term
+           | term
+
+term       : term * factor
+           | term / factor
+           | factor
+
+factor     : NUMBER
+           | ( expression )
+</pre>
+</blockquote>
+
+In the grammar, symbols such as <tt>NUMBER</tt>, <tt>+</tt>, <tt>-</tt>, <tt>*</tt>, and <tt>/</tt> are known
+as <em>terminals</em> and correspond to raw input tokens.  Identifiers such as <tt>term</tt> and <tt>factor</tt> refer to
+grammar rules comprised of a collection of terminals and other rules.  These identifiers are known as <em>non-terminals</em>.
+<P>
+
+The semantic behavior of a language is often specified using a
+technique known as syntax directed translation.  In syntax directed
+translation, attributes are attached to each symbol in a given grammar
+rule along with an action.
Whenever a particular grammar rule is +recognized, the action describes what to do. For example, given the +expression grammar above, you might write the specification for a +simple calculator like this: + +<blockquote> +<pre> +Grammar Action +-------------------------------- -------------------------------------------- +expression0 : expression1 + term expression0.val = expression1.val + term.val + | expression1 - term expression0.val = expression1.val - term.val + | term expression0.val = term.val + +term0 : term1 * factor term0.val = term1.val * factor.val + | term1 / factor term0.val = term1.val / factor.val + | factor term0.val = factor.val + +factor : NUMBER factor.val = int(NUMBER.lexval) + | ( expression ) factor.val = expression.val +</pre> +</blockquote> + +A good way to think about syntax directed translation is to +view each symbol in the grammar as a kind of object. Associated +with each symbol is a value representing its "state" (for example, the +<tt>val</tt> attribute above). Semantic +actions are then expressed as a collection of functions or methods +that operate on the symbols and associated values. + +<p> +Yacc uses a parsing technique known as LR-parsing or shift-reduce parsing. LR parsing is a +bottom up technique that tries to recognize the right-hand-side of various grammar rules. +Whenever a valid right-hand-side is found in the input, the appropriate action code is triggered and the +grammar symbols are replaced by the grammar symbol on the left-hand-side. + +<p> +LR parsing is commonly implemented by shifting grammar symbols onto a +stack and looking at the stack and the next input token for patterns that +match one of the grammar rules. +The details of the algorithm can be found in a compiler textbook, but the +following example illustrates the steps that are performed if you +wanted to parse the expression +<tt>3 + 5 * (10 - 20)</tt> using the grammar defined above. In the example, +the special symbol <tt>$</tt> represents the end of input. + + +<blockquote> +<pre> +Step Symbol Stack Input Tokens Action +---- --------------------- --------------------- ------------------------------- +1 3 + 5 * ( 10 - 20 )$ Shift 3 +2 3 + 5 * ( 10 - 20 )$ Reduce factor : NUMBER +3 factor + 5 * ( 10 - 20 )$ Reduce term : factor +4 term + 5 * ( 10 - 20 )$ Reduce expr : term +5 expr + 5 * ( 10 - 20 )$ Shift + +6 expr + 5 * ( 10 - 20 )$ Shift 5 +7 expr + 5 * ( 10 - 20 )$ Reduce factor : NUMBER +8 expr + factor * ( 10 - 20 )$ Reduce term : factor +9 expr + term * ( 10 - 20 )$ Shift * +10 expr + term * ( 10 - 20 )$ Shift ( +11 expr + term * ( 10 - 20 )$ Shift 10 +12 expr + term * ( 10 - 20 )$ Reduce factor : NUMBER +13 expr + term * ( factor - 20 )$ Reduce term : factor +14 expr + term * ( term - 20 )$ Reduce expr : term +15 expr + term * ( expr - 20 )$ Shift - +16 expr + term * ( expr - 20 )$ Shift 20 +17 expr + term * ( expr - 20 )$ Reduce factor : NUMBER +18 expr + term * ( expr - factor )$ Reduce term : factor +19 expr + term * ( expr - term )$ Reduce expr : expr - term +20 expr + term * ( expr )$ Shift ) +21 expr + term * ( expr ) $ Reduce factor : (expr) +22 expr + term * factor $ Reduce term : term * factor +23 expr + term $ Reduce expr : expr + term +24 expr $ Reduce expr +25 $ Success! +</pre> +</blockquote> + +When parsing the expression, an underlying state machine and the +current input token determine what happens next. If the next token +looks like part of a valid grammar rule (based on other items on the +stack), it is generally shifted onto the stack. 
If the top of the
+stack contains a valid right-hand-side of a grammar rule, it is
+usually "reduced" and the symbols replaced with the symbol on the
+left-hand-side.  When this reduction occurs, the appropriate action is
+triggered (if defined).  If the input token can't be shifted and the
+top of stack doesn't match any grammar rules, a syntax error has
+occurred and the parser must take some kind of recovery step (or bail
+out).  A parse is only successful if the parser reaches a state where
+the symbol stack is empty and there are no more input tokens.
+
+<p>
+It is important to note that the underlying implementation is built
+around a large finite-state machine that is encoded in a collection of
+tables. The construction of these tables is non-trivial and
+beyond the scope of this discussion.  However, subtle details of this
+process explain why, in the example above, the parser chooses to shift
+a token onto the stack in step 9 rather than reducing the
+rule <tt>expr : expr + term</tt>.
+
+<H2><a name="ply_nn23"></a>6. Yacc</H2>
+
+
+The <tt>ply.yacc</tt> module implements the parsing component of PLY.
+The name "yacc" stands for "Yet Another Compiler Compiler" and is
+borrowed from the Unix tool of the same name.
+
+<H3><a name="ply_nn24"></a>6.1 An example</H3>
+
+
+Suppose you wanted to make a grammar for simple arithmetic expressions as previously described.  Here is
+how you would do it with <tt>yacc.py</tt>:
+
+<blockquote>
+<pre>
+# Yacc example
+
+import ply.yacc as yacc
+
+# Get the token map from the lexer.  This is required.
+from calclex import tokens
+
+def p_expression_plus(p):
+    'expression : expression PLUS term'
+    p[0] = p[1] + p[3]
+
+def p_expression_minus(p):
+    'expression : expression MINUS term'
+    p[0] = p[1] - p[3]
+
+def p_expression_term(p):
+    'expression : term'
+    p[0] = p[1]
+
+def p_term_times(p):
+    'term : term TIMES factor'
+    p[0] = p[1] * p[3]
+
+def p_term_div(p):
+    'term : term DIVIDE factor'
+    p[0] = p[1] / p[3]
+
+def p_term_factor(p):
+    'term : factor'
+    p[0] = p[1]
+
+def p_factor_num(p):
+    'factor : NUMBER'
+    p[0] = p[1]
+
+def p_factor_expr(p):
+    'factor : LPAREN expression RPAREN'
+    p[0] = p[2]
+
+# Error rule for syntax errors
+def p_error(p):
+    print("Syntax error in input!")
+
+# Build the parser
+parser = yacc.yacc()
+
+while True:
+   try:
+       s = raw_input('calc > ')
+   except EOFError:
+       break
+   if not s: continue
+   result = parser.parse(s)
+   print(result)
+</pre>
+</blockquote>
+
+In this example, each grammar rule is defined by a Python function
+where the docstring to that function contains the appropriate
+context-free grammar specification.  The statements that make up the
+function body implement the semantic actions of the rule.  Each function
+accepts a single argument <tt>p</tt> that is a sequence containing the
+values of each grammar symbol in the corresponding rule.  The values
+of <tt>p[i]</tt> are mapped to grammar symbols as shown here:
+
+<blockquote>
+<pre>
+def p_expression_plus(p):
+    'expression : expression PLUS term'
+    #   ^            ^        ^    ^
+    #  p[0]         p[1]     p[2] p[3]
+
+    p[0] = p[1] + p[3]
+</pre>
+</blockquote>
+
+<p>
+For tokens, the "value" of the corresponding <tt>p[i]</tt> is the
+<em>same</em> as the <tt>t.value</tt> attribute assigned in the lexer
+module.  For non-terminals, the value is determined by whatever is
+placed in <tt>p[0]</tt> when rules are reduced.  This value can be
+anything at all.  However, it is probably most common for the value to be
+a simple Python type, a tuple, or an instance.
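+
+<p>
+For instance, nothing prevents a rule from propagating a tree node instead of a
+computed number.  A hedged sketch (this variant is not part of the calculator example above):
+</p>
+
+<blockquote>
+<pre>
+def p_expression_plus(p):
+    'expression : expression PLUS term'
+    p[0] = ('+', p[1], p[3])     # Build a tuple node instead of an integer
+</pre>
+</blockquote>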
In the calculator example, we
+are relying on the fact that the <tt>NUMBER</tt> token stores an
+integer value in its value field.  All of the other rules simply
+perform various types of integer operations and propagate the result.
+</p>
+
+<p>
+Note: The use of negative indices has a special meaning in
+yacc---specifically, <tt>p[-1]</tt> does not have the same value
+as <tt>p[3]</tt> in this example.  Please see the section on "Embedded
+Actions" for further details.
+</p>
+
+<p>
+The first rule defined in the yacc specification determines the
+starting grammar symbol (in this case, a rule for <tt>expression</tt>
+appears first).  Whenever the starting rule is reduced by the parser
+and no more input is available, parsing stops and the final value is
+returned (this value will be whatever the top-most rule placed
+in <tt>p[0]</tt>). Note: an alternative starting symbol can be
+specified using the <tt>start</tt> keyword argument to
+<tt>yacc()</tt>.
+
+<p>The <tt>p_error(p)</tt> rule is defined to catch syntax errors.
+See the error handling section below for more detail.
+
+<p>
+To build the parser, call the <tt>yacc.yacc()</tt> function.  This
+function looks at the module and attempts to construct all of the LR
+parsing tables for the grammar you have specified.  The first
+time <tt>yacc.yacc()</tt> is invoked, you will get a message such as
+this:
+
+<blockquote>
+<pre>
+$ python calcparse.py
+Generating LALR tables
+calc >
+</pre>
+</blockquote>
+
+<p>
+Since table construction is relatively expensive (especially for large
+grammars), the resulting parsing table is written to
+a file called <tt>parsetab.py</tt>.  In addition, a
+debugging file called <tt>parser.out</tt> is created.  On subsequent
+executions, <tt>yacc</tt> will reload the table from
+<tt>parsetab.py</tt> unless it has detected a change in the underlying
+grammar (in which case the tables and <tt>parsetab.py</tt> file are
+regenerated).  Both of these files are written to the same directory
+as the module in which the parser is specified.
+The name of the <tt>parsetab</tt> module can be changed using the
+<tt>tabmodule</tt> keyword argument to <tt>yacc()</tt>.  For example:
+</p>
+
+<blockquote>
+<pre>
+parser = yacc.yacc(tabmodule='fooparsetab')
+</pre>
+</blockquote>
+
+<p>
+If any errors are detected in your grammar specification, <tt>yacc.py</tt> will produce
+diagnostic messages and possibly raise an exception.  Some of the errors that can be detected include:
+
+<ul>
+<li>Duplicated function names (if more than one rule function has the same name in the grammar file).
+<li>Shift/reduce and reduce/reduce conflicts generated by ambiguous grammars.
+<li>Badly specified grammar rules.
+<li>Infinite recursion (rules that can never terminate).
+<li>Unused rules and tokens.
+<li>Undefined rules and tokens.
+</ul>
+
+The next few sections discuss grammar specification in more detail.
+
+<p>
+The final part of the example shows how to actually run the parser
+created by
+<tt>yacc()</tt>.  To run the parser, you simply have to call
+<tt>parse()</tt> with a string of input text.  This will run all
+of the grammar rules and return the result of the entire parse.  The
+result returned is the value assigned to <tt>p[0]</tt> in the starting
+grammar rule.
+
+<H3><a name="ply_nn25"></a>6.2 Combining Grammar Rule Functions</H3>
+
+
+When grammar rules are similar, they can be combined into a single function.
+For example, consider the two rules in our earlier example:
+
+<blockquote>
+<pre>
+def p_expression_plus(p):
+    'expression : expression PLUS term'
+    p[0] = p[1] + p[3]
+
+def p_expression_minus(p):
+    'expression : expression MINUS term'
+    p[0] = p[1] - p[3]
+</pre>
+</blockquote>
+
+Instead of writing two functions, you might write a single function like this:
+
+<blockquote>
+<pre>
+def p_expression(p):
+    '''expression : expression PLUS term
+                  | expression MINUS term'''
+    if p[2] == '+':
+        p[0] = p[1] + p[3]
+    elif p[2] == '-':
+        p[0] = p[1] - p[3]
+</pre>
+</blockquote>
+
+In general, the doc string for any given function can contain multiple grammar rules.  So, it would
+have also been legal (although possibly confusing) to write this:
+
+<blockquote>
+<pre>
+def p_binary_operators(p):
+    '''expression : expression PLUS term
+                  | expression MINUS term
+       term       : term TIMES factor
+                  | term DIVIDE factor'''
+    if p[2] == '+':
+        p[0] = p[1] + p[3]
+    elif p[2] == '-':
+        p[0] = p[1] - p[3]
+    elif p[2] == '*':
+        p[0] = p[1] * p[3]
+    elif p[2] == '/':
+        p[0] = p[1] / p[3]
+</pre>
+</blockquote>
+
+When combining grammar rules into a single function, it is usually a good idea for all of the rules to have
+a similar structure (e.g., the same number of terms).  Otherwise, the corresponding action code may be more
+complicated than necessary.  However, it is possible to handle simple cases using len().  For example:
+
+<blockquote>
+<pre>
+def p_expressions(p):
+    '''expression : expression MINUS expression
+                  | MINUS expression'''
+    if (len(p) == 4):
+        p[0] = p[1] - p[3]
+    elif (len(p) == 3):
+        p[0] = -p[2]
+</pre>
+</blockquote>
+
+If parsing performance is a concern, you should resist the urge to put
+too much conditional processing into a single grammar rule as shown in
+these examples.  When you add checks to see which grammar rule is
+being handled, you are actually duplicating the work that the parser
+has already performed (i.e., the parser already knows exactly what rule it
+matched).  You can eliminate this overhead by using a
+separate <tt>p_rule()</tt> function for each grammar rule.
+
+<H3><a name="ply_nn26"></a>6.3 Character Literals</H3>
+
+
+If desired, a grammar may contain tokens defined as single character literals.  For example:
+
+<blockquote>
+<pre>
+def p_binary_operators(p):
+    '''expression : expression '+' term
+                  | expression '-' term
+       term       : term '*' factor
+                  | term '/' factor'''
+    if p[2] == '+':
+        p[0] = p[1] + p[3]
+    elif p[2] == '-':
+        p[0] = p[1] - p[3]
+    elif p[2] == '*':
+        p[0] = p[1] * p[3]
+    elif p[2] == '/':
+        p[0] = p[1] / p[3]
+</pre>
+</blockquote>
+
+A character literal must be enclosed in quotes such as <tt>'+'</tt>.  In addition, if literals are used, they must be declared in the
+corresponding <tt>lex</tt> file through the use of a special <tt>literals</tt> declaration.
+
+<blockquote>
+<pre>
+# Literals.  Should be placed in module given to lex()
+literals = ['+','-','*','/' ]
+</pre>
+</blockquote>
+
+<b>Character literals are limited to a single character</b>.  Thus, it is not legal to specify literals such as <tt>'&lt;='</tt> or <tt>'=='</tt>.  For this, use
+the normal lexing rules (e.g., define a rule such as <tt>t_EQ = r'=='</tt>).
+
+<H3><a name="ply_nn26"></a>6.4 Empty Productions</H3>
+
+
+<tt>yacc.py</tt> can handle empty productions by defining a rule like this:
+
+<blockquote>
+<pre>
+def p_empty(p):
+    'empty :'
+    pass
+</pre>
+</blockquote>
+
+Now to use the empty production, simply use 'empty' as a symbol.
For example:
+
+<blockquote>
+<pre>
+def p_optitem(p):
+    '''optitem : item
+               | empty'''
+    ...
+</pre>
+</blockquote>
+
+Note: You can write empty rules anywhere by simply specifying an empty
+right hand side.  However, I personally find that writing an "empty"
+rule and using "empty" to denote an empty production is easier to read
+and more clearly states your intentions.
+
+<H3><a name="ply_nn28"></a>6.5 Changing the starting symbol</H3>
+
+
+Normally, the first rule found in a yacc specification defines the starting grammar rule (top level rule).  To change this, simply
+supply a <tt>start</tt> specifier in your file.  For example:
+
+<blockquote>
+<pre>
+start = 'foo'
+
+def p_bar(p):
+    'bar : A B'
+
+# This is the starting rule due to the start specifier above
+def p_foo(p):
+    'foo : bar X'
+...
+</pre>
+</blockquote>
+
+The use of a <tt>start</tt> specifier may be useful during debugging
+since you can use it to have yacc build a subset of a larger grammar.
+For this purpose, it is also possible to specify a starting symbol as
+an argument to <tt>yacc()</tt>. For example:
+
+<blockquote>
+<pre>
+parser = yacc.yacc(start='foo')
+</pre>
+</blockquote>
+
+<H3><a name="ply_nn27"></a>6.6 Dealing With Ambiguous Grammars</H3>
+
+
+The expression grammar given in the earlier example has been written
+in a special format to eliminate ambiguity.  However, in many
+situations, it is extremely difficult or awkward to write grammars in
+this format.  A much more natural way to express the grammar is in a
+more compact form like this:
+
+<blockquote>
+<pre>
+expression : expression PLUS expression
+           | expression MINUS expression
+           | expression TIMES expression
+           | expression DIVIDE expression
+           | LPAREN expression RPAREN
+           | NUMBER
+</pre>
+</blockquote>
+
+Unfortunately, this grammar specification is ambiguous.  For example,
+if you are parsing the string "3 * 4 + 5", there is no way to tell how
+the operators are supposed to be grouped.  For example, does the
+expression mean "(3 * 4) + 5" or is it "3 * (4 + 5)"?
+
+<p>
+When an ambiguous grammar is given to <tt>yacc.py</tt> it will print
+messages about "shift/reduce conflicts" or "reduce/reduce conflicts".
+A shift/reduce conflict is caused when the parser generator can't
+decide whether to reduce a rule or shift a symbol on the
+parsing stack.  For example, consider the string "3 * 4 + 5" and the
+internal parsing stack:
+
+<blockquote>
+<pre>
+Step Symbol Stack           Input Tokens            Action
+---- ---------------------  ---------------------   -------------------------------
+1    $                                3 * 4 + 5$    Shift 3
+2    $ 3                                * 4 + 5$    Reduce expression : NUMBER
+3    $ expr                             * 4 + 5$    Shift *
+4    $ expr *                             4 + 5$    Shift 4
+5    $ expr * 4                             + 5$    Reduce expression : NUMBER
+6    $ expr * expr                          + 5$    SHIFT/REDUCE CONFLICT ????
+</pre>
+</blockquote>
+
+In this case, when the parser reaches step 6, it has two options.  One
+is to reduce the rule <tt>expr : expr * expr</tt> on the stack.  The
+other option is to shift the token <tt>+</tt> on the stack.  Both
+options are perfectly legal from the rules of the
+context-free-grammar.
+
+<p>
+By default, all shift/reduce conflicts are resolved in favor of
+shifting.  Therefore, in the above example, the parser will always
+shift the <tt>+</tt> instead of reducing.  Although this strategy
+works in many cases (for example, the case of
+"if-then" versus "if-then-else"), it is not enough for arithmetic expressions.
In fact, +in the above example, the decision to shift <tt>+</tt> is completely +wrong---we should have reduced <tt>expr * expr</tt> since +multiplication has higher mathematical precedence than addition. + +<p>To resolve ambiguity, especially in expression +grammars, <tt>yacc.py</tt> allows individual tokens to be assigned a +precedence level and associativity. This is done by adding a variable +<tt>precedence</tt> to the grammar file like this: + +<blockquote> +<pre> +precedence = ( + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), +) +</pre> +</blockquote> + +This declaration specifies that <tt>PLUS</tt>/<tt>MINUS</tt> have the +same precedence level and are left-associative and that +<tt>TIMES</tt>/<tt>DIVIDE</tt> have the same precedence and are +left-associative. Within the <tt>precedence</tt> declaration, tokens +are ordered from lowest to highest precedence. Thus, this declaration +specifies that <tt>TIMES</tt>/<tt>DIVIDE</tt> have higher precedence +than <tt>PLUS</tt>/<tt>MINUS</tt> (since they appear later in the +precedence specification). + +<p> +The precedence specification works by associating a numerical +precedence level value and associativity direction to the listed +tokens. For example, in the above example you get: + +<blockquote> +<pre> +PLUS : level = 1, assoc = 'left' +MINUS : level = 1, assoc = 'left' +TIMES : level = 2, assoc = 'left' +DIVIDE : level = 2, assoc = 'left' +</pre> +</blockquote> + +These values are then used to attach a numerical precedence value and +associativity direction to each grammar rule. <em>This is always +determined by looking at the precedence of the right-most terminal +symbol.</em> For example: + +<blockquote> +<pre> +expression : expression PLUS expression # level = 1, left + | expression MINUS expression # level = 1, left + | expression TIMES expression # level = 2, left + | expression DIVIDE expression # level = 2, left + | LPAREN expression RPAREN # level = None (not specified) + | NUMBER # level = None (not specified) +</pre> +</blockquote> + +When shift/reduce conflicts are encountered, the parser generator resolves the conflict by +looking at the precedence rules and associativity specifiers. + +<p> +<ol> +<li>If the current token has higher precedence than the rule on the stack, it is shifted. +<li>If the grammar rule on the stack has higher precedence, the rule is reduced. +<li>If the current token and the grammar rule have the same precedence, the +rule is reduced for left associativity, whereas the token is shifted for right associativity. +<li>If nothing is known about the precedence, shift/reduce conflicts are resolved in +favor of shifting (the default). +</ol> + +For example, if "expression PLUS expression" has been parsed and the +next token is "TIMES", the action is going to be a shift because +"TIMES" has a higher precedence level than "PLUS". On the other hand, +if "expression TIMES expression" has been parsed and the next token is +"PLUS", the action is going to be reduce because "PLUS" has a lower +precedence than "TIMES." + +<p> +When shift/reduce conflicts are resolved using the first three +techniques (with the help of precedence rules), <tt>yacc.py</tt> will +report no errors or conflicts in the grammar (although it will print +some information in the <tt>parser.out</tt> debugging file). + +<p> +One problem with the precedence specifier technique is that it is +sometimes necessary to change the precedence of an operator in certain +contexts. For example, consider a unary-minus operator in "3 + 4 * +-5". 
Mathematically, the unary minus is normally given a very high +precedence--being evaluated before the multiply. However, in our +precedence specifier, MINUS has a lower precedence than TIMES. To +deal with this, precedence rules can be given for so-called "fictitious tokens" +like this: + +<blockquote> +<pre> +precedence = ( + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ('right', 'UMINUS'), # Unary minus operator +) +</pre> +</blockquote> + +Now, in the grammar file, we can write our unary minus rule like this: + +<blockquote> +<pre> +def p_expr_uminus(p): + 'expression : MINUS expression %prec UMINUS' + p[0] = -p[2] +</pre> +</blockquote> + +In this case, <tt>%prec UMINUS</tt> overrides the default rule precedence--setting it to that +of UMINUS in the precedence specifier. + +<p> +At first, the use of UMINUS in this example may appear very confusing. +UMINUS is not an input token or a grammar rule. Instead, you should +think of it as the name of a special marker in the precedence table. When you use the <tt>%prec</tt> qualifier, you're simply +telling yacc that you want the precedence of the expression to be the same as for this special marker instead of the usual precedence. + +<p> +It is also possible to specify non-associativity in the <tt>precedence</tt> table. This would +be used when you <em>don't</em> want operations to chain together. For example, suppose +you wanted to support comparison operators like <tt><</tt> and <tt>></tt> but you didn't want to allow +combinations like <tt>a < b < c</tt>. To do this, simply specify a rule like this: + +<blockquote> +<pre> +precedence = ( + ('nonassoc', 'LESSTHAN', 'GREATERTHAN'), # Nonassociative operators + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ('right', 'UMINUS'), # Unary minus operator +) +</pre> +</blockquote> + +<p> +If you do this, the occurrence of input text such as <tt> a < b < c</tt> will result in a syntax error. However, simple +expressions such as <tt>a < b</tt> will still be fine. + +<p> +Reduce/reduce conflicts are caused when there are multiple grammar +rules that can be applied to a given set of symbols. This kind of +conflict is almost always bad and is always resolved by picking the +rule that appears first in the grammar file. Reduce/reduce conflicts +are almost always caused when different sets of grammar rules somehow +generate the same set of symbols. For example: + +<blockquote> +<pre> +assignment : ID EQUALS NUMBER + | ID EQUALS expression + +expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression + | LPAREN expression RPAREN + | NUMBER +</pre> +</blockquote> + +In this case, a reduce/reduce conflict exists between these two rules: + +<blockquote> +<pre> +assignment : ID EQUALS NUMBER +expression : NUMBER +</pre> +</blockquote> + +For example, if you wrote "a = 5", the parser can't figure out if this +is supposed to be reduced as <tt>assignment : ID EQUALS NUMBER</tt> or +whether it's supposed to reduce the 5 as an expression and then reduce +the rule <tt>assignment : ID EQUALS expression</tt>. + +<p> +It should be noted that reduce/reduce conflicts are notoriously +difficult to spot simply looking at the input grammar. 
When a +reduce/reduce conflict occurs, <tt>yacc()</tt> will try to help by +printing a warning message such as this: + +<blockquote> +<pre> +WARNING: 1 reduce/reduce conflict +WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER) +WARNING: rejected rule (expression -> NUMBER) +</pre> +</blockquote> + +This message identifies the two rules that are in conflict. However, +it may not tell you how the parser arrived at such a state. To try +and figure it out, you'll probably have to look at your grammar and +the contents of the +<tt>parser.out</tt> debugging file with an appropriately high level of +caffeination. + +<H3><a name="ply_nn28"></a>6.7 The parser.out file</H3> + + +Tracking down shift/reduce and reduce/reduce conflicts is one of the finer pleasures of using an LR +parsing algorithm. To assist in debugging, <tt>yacc.py</tt> creates a debugging file called +'parser.out' when it generates the parsing table. The contents of this file look like the following: + +<blockquote> +<pre> +Unused terminals: + + +Grammar + +Rule 1 expression -> expression PLUS expression +Rule 2 expression -> expression MINUS expression +Rule 3 expression -> expression TIMES expression +Rule 4 expression -> expression DIVIDE expression +Rule 5 expression -> NUMBER +Rule 6 expression -> LPAREN expression RPAREN + +Terminals, with rules where they appear + +TIMES : 3 +error : +MINUS : 2 +RPAREN : 6 +LPAREN : 6 +DIVIDE : 4 +PLUS : 1 +NUMBER : 5 + +Nonterminals, with rules where they appear + +expression : 1 1 2 2 3 3 4 4 6 0 + + +Parsing method: LALR + + +state 0 + + S' -> . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + +state 1 + + S' -> expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + PLUS shift and go to state 6 + MINUS shift and go to state 5 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + +state 2 + + expression -> LPAREN . expression RPAREN + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + +state 3 + + expression -> NUMBER . + + $ reduce using rule 5 + PLUS reduce using rule 5 + MINUS reduce using rule 5 + TIMES reduce using rule 5 + DIVIDE reduce using rule 5 + RPAREN reduce using rule 5 + + +state 4 + + expression -> expression TIMES . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + +state 5 + + expression -> expression MINUS . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . 
LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + +state 6 + + expression -> expression PLUS . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + +state 7 + + expression -> expression DIVIDE . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + +state 8 + + expression -> LPAREN expression . RPAREN + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + RPAREN shift and go to state 13 + PLUS shift and go to state 6 + MINUS shift and go to state 5 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + +state 9 + + expression -> expression TIMES expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 3 + PLUS reduce using rule 3 + MINUS reduce using rule 3 + TIMES reduce using rule 3 + DIVIDE reduce using rule 3 + RPAREN reduce using rule 3 + + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + ! TIMES [ shift and go to state 4 ] + ! DIVIDE [ shift and go to state 7 ] + +state 10 + + expression -> expression MINUS expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 2 + PLUS reduce using rule 2 + MINUS reduce using rule 2 + RPAREN reduce using rule 2 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + ! TIMES [ reduce using rule 2 ] + ! DIVIDE [ reduce using rule 2 ] + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + +state 11 + + expression -> expression PLUS expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 1 + PLUS reduce using rule 1 + MINUS reduce using rule 1 + RPAREN reduce using rule 1 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + ! TIMES [ reduce using rule 1 ] + ! DIVIDE [ reduce using rule 1 ] + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + +state 12 + + expression -> expression DIVIDE expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 4 + PLUS reduce using rule 4 + MINUS reduce using rule 4 + TIMES reduce using rule 4 + DIVIDE reduce using rule 4 + RPAREN reduce using rule 4 + + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + ! TIMES [ shift and go to state 4 ] + ! 
DIVIDE          [ shift and go to state 7 ]
+
+state 13
+
+    expression -> LPAREN expression RPAREN .
+
+    $               reduce using rule 6
+    PLUS            reduce using rule 6
+    MINUS           reduce using rule 6
+    TIMES           reduce using rule 6
+    DIVIDE          reduce using rule 6
+    RPAREN          reduce using rule 6
+</pre>
+</blockquote>
+
+The different states that appear in this file are a representation of
+every possible sequence of valid input tokens allowed by the grammar.
+When receiving input tokens, the parser is building up a stack and
+looking for matching rules.  Each state keeps track of the grammar
+rules that might be in the process of being matched at that point.  Within each
+rule, the "." character indicates the current location of the parse
+within that rule.  In addition, the actions for each valid input token
+are listed.  When a shift/reduce or reduce/reduce conflict arises,
+rules <em>not</em> selected are prefixed with an !.  For example:
+
+<blockquote>
+<pre>
+  ! TIMES           [ reduce using rule 2 ]
+  ! DIVIDE          [ reduce using rule 2 ]
+  ! PLUS            [ shift and go to state 6 ]
+  ! MINUS           [ shift and go to state 5 ]
+</pre>
+</blockquote>
+
+By looking at these rules (and with a little practice), you can usually track down the source
+of most parsing conflicts.  It should also be stressed that not all shift/reduce conflicts are
+bad.  However, the only way to be sure that they are resolved correctly is to look at <tt>parser.out</tt>.
+
+<H3><a name="ply_nn29"></a>6.8 Syntax Error Handling</H3>
+
+
+If you are creating a parser for production use, the handling of
+syntax errors is important.  As a general rule, you don't want a
+parser to simply throw up its hands and stop at the first sign of
+trouble.  Instead, you want it to report the error, recover if possible, and
+continue parsing so that all of the errors in the input get reported
+to the user at once.  This is the standard behavior found in compilers
+for languages such as C, C++, and Java.
+
+In PLY, when a syntax error occurs during parsing, the error is immediately
+detected (i.e., the parser does not read any more tokens beyond the
+source of the error).  However, at this point, the parser enters a
+recovery mode that can be used to try and continue further parsing.
+As a general rule, error recovery in LR parsers is a delicate
+topic that involves ancient rituals and black-magic.  The recovery mechanism
+provided by <tt>yacc.py</tt> is comparable to Unix yacc so you may want to
+consult a book like O'Reilly's "Lex and Yacc" for some of the finer details.
+
+<p>
+When a syntax error occurs, <tt>yacc.py</tt> performs the following steps:
+
+<ol>
+<li>On the first occurrence of an error, the user-defined <tt>p_error()</tt> function
+is called with the offending token as an argument.  However, if the syntax error is due to
+reaching the end-of-file, <tt>p_error()</tt> is called with an
+argument of <tt>None</tt>.
+Afterwards, the parser enters
+an "error-recovery" mode in which it will not make future calls to <tt>p_error()</tt> until it
+has successfully shifted at least 3 tokens onto the parsing stack.
+
+<p>
+<li>If no recovery action is taken in <tt>p_error()</tt>, the offending lookahead token is replaced
+with a special <tt>error</tt> token.
+
+<p>
+<li>If the offending lookahead token is already set to <tt>error</tt>, the top item of the parsing stack is
+deleted.
+
+<p>
+<li>If the entire parsing stack is unwound, the parser enters a restart state and attempts to start
+parsing from its initial state. 
+ +<p> +<li>If a grammar rule accepts <tt>error</tt> as a token, it will be +shifted onto the parsing stack. + +<p> +<li>If the top item of the parsing stack is <tt>error</tt>, lookahead tokens will be discarded until the +parser can successfully shift a new symbol or reduce a rule involving <tt>error</tt>. +</ol> + +<H4><a name="ply_nn30"></a>6.8.1 Recovery and resynchronization with error rules</H4> + + +The most well-behaved approach for handling syntax errors is to write grammar rules that include the <tt>error</tt> +token. For example, suppose your language had a grammar rule for a print statement like this: + +<blockquote> +<pre> +def p_statement_print(p): + 'statement : PRINT expr SEMI' + ... +</pre> +</blockquote> + +To account for the possibility of a bad expression, you might write an additional grammar rule like this: + +<blockquote> +<pre> +def p_statement_print_error(p): + 'statement : PRINT error SEMI' + print("Syntax error in print statement. Bad expression") + +</pre> +</blockquote> + +In this case, the <tt>error</tt> token will match any sequence of +tokens that might appear up to the first semicolon that is +encountered. Once the semicolon is reached, the rule will be +invoked and the <tt>error</tt> token will go away. + +<p> +This type of recovery is sometimes known as parser resynchronization. +The <tt>error</tt> token acts as a wildcard for any bad input text and +the token immediately following <tt>error</tt> acts as a +synchronization token. + +<p> +It is important to note that the <tt>error</tt> token usually does not appear as the last token +on the right in an error rule. For example: + +<blockquote> +<pre> +def p_statement_print_error(p): + 'statement : PRINT error' + print("Syntax error in print statement. Bad expression") +</pre> +</blockquote> + +This is because the first bad token encountered will cause the rule to +be reduced--which may make it difficult to recover if more bad tokens +immediately follow. + +<H4><a name="ply_nn31"></a>6.8.2 Panic mode recovery</H4> + + +An alternative error recovery scheme is to enter a panic mode recovery in which tokens are +discarded to a point where the parser might be able to recover in some sensible manner. + +<p> +Panic mode recovery is implemented entirely in the <tt>p_error()</tt> function. For example, this +function starts discarding tokens until it reaches a closing '}'. Then, it restarts the +parser in its initial state. + +<blockquote> +<pre> +def p_error(p): + print("Whoa. You are seriously hosed.") + if not p: + print("End of File!") + return + + # Read ahead looking for a closing '}' + while True: + tok = parser.token() # Get the next token + if not tok or tok.type == 'RBRACE': + break + parser.restart() +</pre> +</blockquote> + +<p> +This function simply discards the bad token and tells the parser that the error was ok. + +<blockquote> +<pre> +def p_error(p): + if p: + print("Syntax error at token", p.type) + # Just discard the token and tell the parser it's okay. + parser.errok() + else: + print("Syntax error at EOF") +</pre> +</blockquote> + +<P> +More information on these methods is as follows: +</p> + +<p> +<ul> +<li><tt>parser.errok()</tt>. This resets the parser state so it doesn't think it's in error-recovery +mode. This will prevent an <tt>error</tt> token from being generated and will reset the internal +error counters so that the next syntax error will call <tt>p_error()</tt> again. + +<p> +<li><tt>parser.token()</tt>. This returns the next token on the input stream. 
+
+<p>
+<li><tt>parser.restart()</tt>.  This discards the entire parsing stack and resets the parser
+to its initial state.
+</ul>
+
+<p>
+To supply the next lookahead token to the parser, <tt>p_error()</tt> can return a token.  This might be
+useful if trying to synchronize on special characters.  For example:
+
+<blockquote>
+<pre>
+def p_error(p):
+    # Read ahead looking for a terminating ";"
+    while True:
+        tok = parser.token()             # Get the next token
+        if not tok or tok.type == 'SEMI': break
+    parser.errok()
+
+    # Return SEMI to the parser as the next lookahead token
+    return tok
+</pre>
+</blockquote>
+
+<p>
+Keep in mind that in the above error handling functions,
+<tt>parser</tt> is an instance of the parser created by
+<tt>yacc()</tt>.  You'll need to save this instance someplace in your
+code so that you can refer to it during error handling.
+</p>
+
+<H4><a name="ply_nn35"></a>6.8.3 Signalling an error from a production</H4>
+
+
+If necessary, a production rule can manually force the parser to enter error recovery.  This
+is done by raising the <tt>SyntaxError</tt> exception like this:
+
+<blockquote>
+<pre>
+def p_production(p):
+    'production : some production ...'
+    raise SyntaxError
+</pre>
+</blockquote>
+
+The effect of raising <tt>SyntaxError</tt> is the same as if the last symbol shifted onto the
+parsing stack was actually a syntax error.  Thus, when you do this, the last symbol shifted is popped off
+of the parsing stack and the current lookahead token is set to an <tt>error</tt> token.  The parser
+then enters error-recovery mode where it tries to reduce rules that can accept <tt>error</tt> tokens.
+The steps that follow from this point are exactly the same as if a syntax error were detected and
+<tt>p_error()</tt> were called.
+
+<P>
+One important aspect of manually setting an error is that the <tt>p_error()</tt> function will <b>NOT</b> be
+called in this case.  If you need to issue an error message, make sure you do it in the production that
+raises <tt>SyntaxError</tt>.
+
+<P>
+Note: This feature of PLY is meant to mimic the behavior of the YYERROR macro in yacc.
+
+<H4><a name="ply_nn38"></a>6.8.4 When Do Syntax Errors Get Reported</H4>
+
+
+<p>
+In most cases, yacc will handle errors as soon as a bad input token is
+detected on the input.  However, be aware that yacc may choose to
+delay error handling until after it has reduced one or more grammar
+rules first.  This behavior might be unexpected, but it's related to
+special states in the underlying parsing table known as "defaulted
+states."  A defaulted state is a parsing condition where the same
+grammar rule will be reduced regardless of what <em>valid</em> token
+comes next on the input.  For such states, yacc chooses to go ahead
+and reduce the grammar rule <em>without reading the next input
+token</em>.  If the next token is bad, yacc will eventually get around to reading it and
+report a syntax error.  It's just a little unusual in that you might
+see some of your grammar rules firing immediately prior to the syntax
+error.
+</p>
+
+<p>
+Usually, the delayed error reporting with defaulted states is harmless
+(and there are other reasons for wanting PLY to behave in this way).
+However, you may need to turn this behavior off for some reason. 
You +can clear the defaulted states table like this: +</p> + +<blockquote> +<pre> +parser = yacc.yacc() +parser.defaulted_states = {} +</pre> +</blockquote> + +<p> +Disabling defaulted states is not recommended if your grammar makes use +of embedded actions as described in Section 6.11.</p> + +<H4><a name="ply_nn32"></a>6.8.5 General comments on error handling</H4> + + +For normal types of languages, error recovery with error rules and resynchronization characters is probably the most reliable +technique. This is because you can instrument the grammar to catch errors at selected places where it is relatively easy +to recover and continue parsing. Panic mode recovery is really only useful in certain specialized applications where you might want +to discard huge portions of the input text to find a valid restart point. + +<H3><a name="ply_nn33"></a>6.9 Line Number and Position Tracking</H3> + + +Position tracking is often a tricky problem when writing compilers. +By default, PLY tracks the line number and position of all tokens. +This information is available using the following functions: + +<ul> +<li><tt>p.lineno(num)</tt>. Return the line number for symbol <em>num</em> +<li><tt>p.lexpos(num)</tt>. Return the lexing position for symbol <em>num</em> +</ul> + +For example: + +<blockquote> +<pre> +def p_expression(p): + 'expression : expression PLUS expression' + line = p.lineno(2) # line number of the PLUS token + index = p.lexpos(2) # Position of the PLUS token +</pre> +</blockquote> + +As an optional feature, <tt>yacc.py</tt> can automatically track line +numbers and positions for all of the grammar symbols as well. +However, this extra tracking requires extra processing and can +significantly slow down parsing. Therefore, it must be enabled by +passing the +<tt>tracking=True</tt> option to <tt>yacc.parse()</tt>. For example: + +<blockquote> +<pre> +yacc.parse(data,tracking=True) +</pre> +</blockquote> + +Once enabled, the <tt>lineno()</tt> and <tt>lexpos()</tt> methods work +for all grammar symbols. In addition, two additional methods can be +used: + +<ul> +<li><tt>p.linespan(num)</tt>. Return a tuple (startline,endline) with the starting and ending line number for symbol <em>num</em>. +<li><tt>p.lexspan(num)</tt>. Return a tuple (start,end) with the starting and ending positions for symbol <em>num</em>. +</ul> + +For example: + +<blockquote> +<pre> +def p_expression(p): + 'expression : expression PLUS expression' + p.lineno(1) # Line number of the left expression + p.lineno(2) # line number of the PLUS operator + p.lineno(3) # line number of the right expression + ... + start,end = p.linespan(3) # Start,end lines of the right expression + starti,endi = p.lexspan(3) # Start,end positions of right expression + +</pre> +</blockquote> + +Note: The <tt>lexspan()</tt> function only returns the range of values up to the start of the last grammar symbol. + +<p> +Although it may be convenient for PLY to track position information on +all grammar symbols, this is often unnecessary. For example, if you +are merely using line number information in an error message, you can +often just key off of a specific token in the grammar rule. 
For
+example:
+
+<blockquote>
+<pre>
+def p_bad_func(p):
+    'funccall : fname LPAREN error RPAREN'
+    # Line number reported from LPAREN token
+    print("Bad function call at line", p.lineno(2))
+</pre>
+</blockquote>
+
+<p>
+Similarly, you may get better parsing performance if you only
+selectively propagate line number information where it's needed using
+the <tt>p.set_lineno()</tt> method.  For example:
+
+<blockquote>
+<pre>
+def p_fname(p):
+    'fname : ID'
+    p[0] = p[1]
+    p.set_lineno(0, p.lineno(1))
+</pre>
+</blockquote>
+
+PLY doesn't retain line number information from rules that have already been
+parsed.  If you are building an abstract syntax tree and need to have line numbers,
+you should make sure that the line numbers appear in the tree itself.
+
+<H3><a name="ply_nn34"></a>6.10 AST Construction</H3>
+
+
+<tt>yacc.py</tt> provides no special functions for constructing an
+abstract syntax tree.  However, such construction is easy enough to do
+on your own.
+
+<p>A minimal way to construct a tree is to simply create and
+propagate a tuple or list in each grammar rule function.  There
+are many possible ways to do this, but one example would be something
+like this:
+
+<blockquote>
+<pre>
+def p_expression_binop(p):
+    '''expression : expression PLUS expression
+                  | expression MINUS expression
+                  | expression TIMES expression
+                  | expression DIVIDE expression'''
+
+    p[0] = ('binary-expression', p[2], p[1], p[3])
+
+def p_expression_group(p):
+    'expression : LPAREN expression RPAREN'
+    p[0] = ('group-expression', p[2])
+
+def p_expression_number(p):
+    'expression : NUMBER'
+    p[0] = ('number-expression', p[1])
+</pre>
+</blockquote>
+
+<p>
+Another approach is to create a set of data structures for different
+kinds of abstract syntax tree nodes and assign nodes to <tt>p[0]</tt>
+in each rule.  For example:
+
+<blockquote>
+<pre>
+class Expr: pass
+
+class BinOp(Expr):
+    def __init__(self, left, op, right):
+        self.type = "binop"
+        self.left = left
+        self.right = right
+        self.op = op
+
+class Number(Expr):
+    def __init__(self, value):
+        self.type = "number"
+        self.value = value
+
+def p_expression_binop(p):
+    '''expression : expression PLUS expression
+                  | expression MINUS expression
+                  | expression TIMES expression
+                  | expression DIVIDE expression'''
+
+    p[0] = BinOp(p[1], p[2], p[3])
+
+def p_expression_group(p):
+    'expression : LPAREN expression RPAREN'
+    p[0] = p[2]
+
+def p_expression_number(p):
+    'expression : NUMBER'
+    p[0] = Number(p[1])
+</pre>
+</blockquote>
+
+The advantage to this approach is that it may make it easier to attach more complicated
+semantics, type checking, code generation, and other features to the node classes.
+
+<p>
+To simplify tree traversal, it may make sense to pick a very generic
+tree structure for your parse tree nodes.  For example:
+
+<blockquote>
+<pre>
+class Node:
+    def __init__(self, type, children=None, leaf=None):
+        self.type = type
+        if children:
+            self.children = children
+        else:
+            self.children = [ ]
+        self.leaf = leaf
+
+def p_expression_binop(p):
+    '''expression : expression PLUS expression
+                  | expression MINUS expression
+                  | expression TIMES expression
+                  | expression DIVIDE expression'''
+
+    p[0] = Node("binop", [p[1], p[3]], p[2])
+</pre>
+</blockquote>
+
+<H3><a name="ply_nn35"></a>6.11 Embedded Actions</H3>
+
+
+The parsing technique used by yacc only allows actions to be executed at the end of a rule. 
For example,
+suppose you have a rule like this:
+
+<blockquote>
+<pre>
+def p_foo(p):
+    "foo : A B C D"
+    print("Parsed a foo", p[1], p[2], p[3], p[4])
+</pre>
+</blockquote>
+
+<p>
+In this case, the supplied action code only executes after all of the
+symbols <tt>A</tt>, <tt>B</tt>, <tt>C</tt>, and <tt>D</tt> have been
+parsed.  Sometimes, however, it is useful to execute small code
+fragments during intermediate stages of parsing.  For example, suppose
+you wanted to perform some action immediately after <tt>A</tt> has
+been parsed.  To do this, write an empty rule like this:
+
+<blockquote>
+<pre>
+def p_foo(p):
+    "foo : A seen_A B C D"
+    print("Parsed a foo", p[1], p[3], p[4], p[5])
+    print("seen_A returned", p[2])
+
+def p_seen_A(p):
+    "seen_A :"
+    print("Saw an A = ", p[-1])   # Access grammar symbol to left
+    p[0] = some_value             # Assign value to seen_A
+
+</pre>
+</blockquote>
+
+<p>
+In this example, the empty <tt>seen_A</tt> rule executes immediately
+after <tt>A</tt> is shifted onto the parsing stack.  Within this
+rule, <tt>p[-1]</tt> refers to the symbol on the stack that appears
+immediately to the left of the <tt>seen_A</tt> symbol.  In this case,
+it would be the value of <tt>A</tt> in the <tt>foo</tt> rule
+immediately above.  Like other rules, a value can be returned from an
+embedded action by simply assigning it to <tt>p[0]</tt>.
+
+<p>
+The use of embedded actions can sometimes introduce extra shift/reduce conflicts.  For example,
+this grammar has no conflicts:
+
+<blockquote>
+<pre>
+def p_foo(p):
+    """foo : abcd
+           | abcx"""
+
+def p_abcd(p):
+    "abcd : A B C D"
+
+def p_abcx(p):
+    "abcx : A B C X"
+</pre>
+</blockquote>
+
+However, if you insert an embedded action into one of the rules like this,
+
+<blockquote>
+<pre>
+def p_foo(p):
+    """foo : abcd
+           | abcx"""
+
+def p_abcd(p):
+    "abcd : A B C D"
+
+def p_abcx(p):
+    "abcx : A B seen_AB C X"
+
+def p_seen_AB(p):
+    "seen_AB :"
+</pre>
+</blockquote>
+
+an extra shift/reduce conflict will be introduced.  This conflict is
+caused by the fact that the same symbol <tt>C</tt> appears next in
+both the <tt>abcd</tt> and <tt>abcx</tt> rules.  The parser can either
+shift the symbol (<tt>abcd</tt> rule) or reduce the empty
+rule <tt>seen_AB</tt> (<tt>abcx</tt> rule).
+
+<p>
+A common use of embedded rules is to control other aspects of parsing
+such as scoping of local variables.  For example, if you were parsing C code, you might
+write code like this:
+
+<blockquote>
+<pre>
+def p_statements_block(p):
+    "statements : LBRACE new_scope statements RBRACE"
+    # Action code
+    ...
+    pop_scope()        # Return to previous scope
+
+def p_new_scope(p):
+    "new_scope :"
+    # Create a new scope for local variables
+    s = new_scope()
+    push_scope(s)
+    ...
+</pre>
+</blockquote>
+
+In this case, the embedded action <tt>new_scope</tt> executes
+immediately after a <tt>LBRACE</tt> (<tt>{</tt>) symbol is parsed.
+This might adjust internal symbol tables and other aspects of the
+parser.  Upon completion of the rule <tt>statements_block</tt>, code
+might undo the operations performed in the embedded action
+(e.g., <tt>pop_scope()</tt>).
+
+<H3><a name="ply_nn36"></a>6.12 Miscellaneous Yacc Notes</H3>
+
+
+<ul>
+
+<li>By default, <tt>yacc.py</tt> relies on <tt>lex.py</tt> for tokenizing.  However, an alternative tokenizer
+can be supplied as follows:
+
+<blockquote>
+<pre>
+result = yacc.parse(lexer=x)
+</pre>
+</blockquote>
+In this case, <tt>x</tt> must be a Lexer object that minimally has a <tt>x.token()</tt> method for retrieving the next
+token. 
If an input string is given to
+<tt>yacc.parse()</tt>, the lexer must also have an <tt>x.input()</tt> method.
+
+<p>
+<li>By default, yacc generates tables in debugging mode (which produces the parser.out file and other output).
+To disable this, use
+
+<blockquote>
+<pre>
+parser = yacc.yacc(debug=False)
+</pre>
+</blockquote>
+
+<p>
+<li>To change the name of the <tt>parsetab.py</tt> file, use:
+
+<blockquote>
+<pre>
+parser = yacc.yacc(tabmodule="foo")
+</pre>
+</blockquote>
+
+<P>
+Normally, the <tt>parsetab.py</tt> file is placed into the same directory as
+the module where the parser is defined.  If you want it to go somewhere else, you can
+give an absolute package name for <tt>tabmodule</tt> instead.  In that case, the
+tables will be written there.
+</p>
+
+<p>
+<li>To change the directory in which the <tt>parsetab.py</tt> file (and other output files) are written, use:
+<blockquote>
+<pre>
+parser = yacc.yacc(tabmodule="foo", outputdir="somedirectory")
+</pre>
+</blockquote>
+
+<p>
+Note: Be aware that unless the directory specified is also on Python's path (<tt>sys.path</tt>), subsequent
+imports of the table file will fail.  As a general rule, it's better to specify a destination using the
+<tt>tabmodule</tt> argument instead of directly specifying a directory using the <tt>outputdir</tt> argument.
+</p>
+
+<p>
+<li>To prevent yacc from generating any kind of parser table file, use:
+<blockquote>
+<pre>
+parser = yacc.yacc(write_tables=False)
+</pre>
+</blockquote>
+
+Note: If you disable table generation, yacc() will regenerate the parsing tables
+each time it runs (which may take a while depending on how large your grammar is).
+
+<P>
+<li>To print copious amounts of debugging during parsing, use:
+
+<blockquote>
+<pre>
+result = yacc.parse(debug=True)
+</pre>
+</blockquote>
+
+<p>
+<li>Since the generation of the LALR tables is relatively expensive, previously generated tables are
+cached and reused if possible.  The decision to regenerate the tables is determined by taking an MD5
+checksum of all grammar rules and precedence rules.  Only in the event of a mismatch are the tables regenerated.
+
+<p>
+It should be noted that table generation is reasonably efficient, even for grammars that involve around 100 rules
+and several hundred states.  </li>
+
+
+<p>
+<li>Since LR parsing is driven by tables, the performance of the parser is largely independent of the
+size of the grammar.  The biggest bottlenecks will be the lexer and the complexity of the code in your grammar rules.
+</li>
+</p>
+
+<p>
+<li>Decorators of production rules have to update the wrapped function's line number, for example by setting 
<tt>wrapper.co_firstlineno = func.__code__.co_firstlineno</tt>:
+
+<blockquote>
+<pre>
+from functools import wraps
+from nodes import Collection
+
+
+def strict(*types):
+    def decorate(func):
+        @wraps(func)
+        def wrapper(p):
+            func(p)
+            if not isinstance(p[0], types):
+                raise TypeError
+
+        wrapper.co_firstlineno = func.__code__.co_firstlineno
+        return wrapper
+
+    return decorate
+
+@strict(Collection)
+def p_collection(p):
+    """
+    collection  : sequence
+                | map
+    """
+    p[0] = p[1]
+</pre>
+</blockquote>
+
+</li>
+</p>
+
+
+</ul>
+</p>
+
+
+<H2><a name="ply_nn37"></a>7. Multiple Parsers and Lexers</H2>
+
+
+In advanced parsing applications, you may want to have multiple
+parsers and lexers.
+
+<p>
+As a general rule, this isn't a problem.  However, to make it work,
+you need to carefully make sure everything gets hooked up correctly.
+First, make sure you save the objects returned by <tt>lex()</tt> and
+<tt>yacc()</tt>.  For example:
+
+<blockquote>
+<pre>
+lexer  = lex.lex()       # Return lexer object
+parser = yacc.yacc()     # Return parser object
+</pre>
+</blockquote>
+
+Next, when parsing, make sure you give the <tt>parse()</tt> function a reference to the lexer it
+should be using.  For example:
+
+<blockquote>
+<pre>
+parser.parse(text, lexer=lexer)
+</pre>
+</blockquote>
+
+If you forget to do this, the parser will use the last lexer
+created--which is not always what you want.
+
+<p>
+Within lexer and parser rule functions, these objects are also
+available.  In the lexer, the "lexer" attribute of a token refers to
+the lexer object that triggered the rule.  For example:
+
+<blockquote>
+<pre>
+def t_NUMBER(t):
+   r'\d+'
+   ...
+   print(t.lexer)           # Show lexer object
+</pre>
+</blockquote>
+
+In the parser, the "lexer" and "parser" attributes refer to the lexer
+and parser objects respectively.
+
+<blockquote>
+<pre>
+def p_expr_plus(p):
+   'expr : expr PLUS expr'
+   ...
+   print(p.parser)          # Show parser object
+   print(p.lexer)           # Show lexer object
+</pre>
+</blockquote>
+
+If necessary, arbitrary attributes can be attached to the lexer or parser object.
+For example, if you wanted to have different parsing modes, you could attach a mode
+attribute to the parser object and look at it later.
+
+<H2><a name="ply_nn38"></a>8. Using Python's Optimized Mode</H2>
+
+
+Because PLY uses information from doc-strings, parsing and lexing
+information must be gathered while running the Python interpreter in
+normal mode (i.e., not with the -O or -OO options).  However, if you
+specify optimized mode like this:
+
+<blockquote>
+<pre>
+lex.lex(optimize=1)
+yacc.yacc(optimize=1)
+</pre>
+</blockquote>
+
+then PLY can later be used when Python runs in optimized mode.  To make this work,
+make sure you first run Python in normal mode.  Once the lexing and parsing tables
+have been generated the first time, run Python in optimized mode.  PLY will use
+the tables without the need for doc strings.
+
+<p>
+Beware: running PLY in optimized mode disables a lot of error
+checking.  You should only do this when your project has stabilized
+and you don't need to do any debugging.  One of the purposes of
+optimized mode is to substantially decrease the startup time of
+your compiler (by assuming that everything is already properly
+specified and works).
+
+<H2><a name="ply_nn44"></a>9. Advanced Debugging</H2>
+
+
+<p>
+Debugging a compiler is typically not an easy task.  PLY provides some
+advanced diagnostic capabilities through the use of Python's
+<tt>logging</tt> module. 
The next two sections describe this: + +<H3><a name="ply_nn45"></a>9.1 Debugging the lex() and yacc() commands</H3> + + +<p> +Both the <tt>lex()</tt> and <tt>yacc()</tt> commands have a debugging +mode that can be enabled using the <tt>debug</tt> flag. For example: + +<blockquote> +<pre> +lex.lex(debug=True) +yacc.yacc(debug=True) +</pre> +</blockquote> + +Normally, the output produced by debugging is routed to either +standard error or, in the case of <tt>yacc()</tt>, to a file +<tt>parser.out</tt>. This output can be more carefully controlled +by supplying a logging object. Here is an example that adds +information about where different debugging messages are coming from: + +<blockquote> +<pre> +# Set up a logging object +import logging +logging.basicConfig( + level = logging.DEBUG, + filename = "parselog.txt", + filemode = "w", + format = "%(filename)10s:%(lineno)4d:%(message)s" +) +log = logging.getLogger() + +lex.lex(debug=True,debuglog=log) +yacc.yacc(debug=True,debuglog=log) +</pre> +</blockquote> + +If you supply a custom logger, the amount of debugging +information produced can be controlled by setting the logging level. +Typically, debugging messages are either issued at the <tt>DEBUG</tt>, +<tt>INFO</tt>, or <tt>WARNING</tt> levels. + +<p> +PLY's error messages and warnings are also produced using the logging +interface. This can be controlled by passing a logging object +using the <tt>errorlog</tt> parameter. + +<blockquote> +<pre> +lex.lex(errorlog=log) +yacc.yacc(errorlog=log) +</pre> +</blockquote> + +If you want to completely silence warnings, you can either pass in a +logging object with an appropriate filter level or use the <tt>NullLogger</tt> +object defined in either <tt>lex</tt> or <tt>yacc</tt>. For example: + +<blockquote> +<pre> +yacc.yacc(errorlog=yacc.NullLogger()) +</pre> +</blockquote> + +<H3><a name="ply_nn46"></a>9.2 Run-time Debugging</H3> + + +<p> +To enable run-time debugging of a parser, use the <tt>debug</tt> option to parse. This +option can either be an integer (which simply turns debugging on or off) or an instance +of a logger object. For example: + +<blockquote> +<pre> +log = logging.getLogger() +parser.parse(input,debug=log) +</pre> +</blockquote> + +If a logging object is passed, you can use its filtering level to control how much +output gets generated. The <tt>INFO</tt> level is used to produce information +about rule reductions. The <tt>DEBUG</tt> level will show information about the +parsing stack, token shifts, and other details. The <tt>ERROR</tt> level shows information +related to parsing errors. + +<p> +For very complicated problems, you should pass in a logging object that +redirects to a file where you can more easily inspect the output after +execution. + +<H2><a name="ply_nn49"></a>10. Packaging Advice</H2> + + +<p> +If you are distributing a package that makes use of PLY, you should +spend a few moments thinking about how you want to handle the files +that are automatically generated. For example, the <tt>parsetab.py</tt> +file generated by the <tt>yacc()</tt> function.</p> + +<p> +Starting in PLY-3.6, the table files are created in the same directory +as the file where a parser is defined. This means that the +<tt>parsetab.py</tt> file will live side-by-side with your parser +specification. In terms of packaging, this is probably the easiest and +most sane approach to manage. 
You don't need to give <tt>yacc()</tt>
+any extra arguments and it should just "work."</p>
+
+<p>
+One concern is the management of the <tt>parsetab.py</tt> file itself.
+For example, should you have this file checked into version control (e.g., GitHub),
+should it be included in a package distribution as a normal file, or should you
+just let PLY generate it automatically for the user when they install your package?
+</p>
+
+<p>
+As of PLY-3.6, the <tt>parsetab.py</tt> file should be compatible across all versions
+of Python including Python 2 and 3.  Thus, a table file generated in Python 2 should
+work fine if it's used on Python 3.  Because of this, it should be relatively harmless
+to distribute the <tt>parsetab.py</tt> file yourself if you need to.  However, be aware
+that older/newer versions of PLY may try to regenerate the file if there are future
+enhancements or changes to its format.
+</p>
+
+<p>
+To make the generation of table files easier for the purposes of installation, you might
+want to make your parser files executable using the <tt>-m</tt> option or similar.  For
+example:
+</p>
+
+<blockquote>
+<pre>
+# calc.py
+...
+...
+def make_parser():
+    parser = yacc.yacc()
+    return parser
+
+if __name__ == '__main__':
+    make_parser()
+</pre>
+</blockquote>
+
+<p>
+You can then use a command such as <tt>python -m calc</tt> to generate the tables.  Alternatively,
+a <tt>setup.py</tt> script can import the module and use <tt>make_parser()</tt> to create the
+parsing tables.
+</p>
+
+<p>
+If you're willing to sacrifice a little startup time, you can also instruct PLY to never write the
+tables using <tt>yacc.yacc(write_tables=False, debug=False)</tt>.  In this mode, PLY will regenerate
+the parsing tables from scratch each time.  For a small grammar, you probably won't notice.  For a
+large grammar, you should probably reconsider--the parsing tables are meant to dramatically speed up this process.
+</p>
+
+<p>
+During operation, it is normal for PLY to produce diagnostic error
+messages (usually printed to standard error).  These are generated
+entirely using the <tt>logging</tt> module.  If you want to redirect
+these messages or silence them, you can provide your own logging
+object to <tt>yacc()</tt>.  For example:
+</p>
+
+<blockquote>
+<pre>
+import logging
+log = logging.getLogger('ply')
+...
+parser = yacc.yacc(errorlog=log)
+</pre>
+</blockquote>
+
+<H2><a name="ply_nn39"></a>11. Where to go from here?</H2>
+
+
+The <tt>examples</tt> directory of the PLY distribution contains several simple examples.  Please consult a
+compilers textbook for the theory and underlying implementation details of LR parsing.
+
+</body>
+</html>




new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/README @@ -0,0 +1,79 @@ +Inspired by a September 14, 2006 Salon article "Why Johnny Can't Code" by +David Brin (http://www.salon.com/tech/feature/2006/09/14/basic/index.html), +I thought that a fully working BASIC interpreter might be an interesting, +if not questionable, PLY example. Uh, okay, so maybe it's just a bad idea, +but in any case, here it is. + +In this example, you'll find a rough implementation of 1964 Dartmouth BASIC +as described in the manual at: + + http://www.bitsavers.org/pdf/dartmouth/BASIC_Oct64.pdf + +See also: + + http://en.wikipedia.org/wiki/Dartmouth_BASIC + +This dialect is downright primitive---there are no string variables +and no facilities for interactive input. Moreover, subroutines and functions +are brain-dead even more than they usually are for BASIC. Of course, +the GOTO statement is provided. + +Nevertheless, there are a few interesting aspects of this example: + + - It illustrates a fully working interpreter including lexing, parsing, + and interpretation of instructions. + + - The parser shows how to catch and report various kinds of parsing + errors in a more graceful way. + + - The example both parses files (supplied on command line) and + interactive input entered line by line. + + - It shows how you might represent parsed information. In this case, + each BASIC statement is encoded into a Python tuple containing the + statement type and parameters. These tuples are then stored in + a dictionary indexed by program line numbers. + + - Even though it's just BASIC, the parser contains more than 80 + rules and 150 parsing states. Thus, it's a little more meaty than + the calculator example. + +To use the example, run it as follows: + + % python basic.py hello.bas + HELLO WORLD + % + +or use it interactively: + + % python basic.py + [BASIC] 10 PRINT "HELLO WORLD" + [BASIC] 20 END + [BASIC] RUN + HELLO WORLD + [BASIC] + +The following files are defined: + + basic.py - High level script that controls everything + basiclex.py - BASIC tokenizer + basparse.py - BASIC parser + basinterp.py - BASIC interpreter that runs parsed programs. + +In addition, a number of sample BASIC programs (.bas suffix) are +provided. These were taken out of the Dartmouth manual. + +Disclaimer: I haven't spent a ton of time testing this and it's likely that +I've skimped here and there on a few finer details (e.g., strictly enforcing +variable naming rules). However, the interpreter seems to be able to run +the examples in the BASIC manual. + +Have fun! + +-Dave + + + + + +
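+A note on the parsed representation (an illustrative sketch, not part of
+the original example): judging from how basinterp.py consumes statements,
+the dictionary returned by basparse.parse() for the two-line HELLO WORLD
+program above is assumed to look roughly like this:
+
+    prog = {
+        10: ('PRINT', [('HELLO WORLD', None)], None),
+        20: ('END',),
+    }
+
+where each key is a BASIC line number, each value is a tuple whose first
+element is the statement type, and the remaining elements are statement
+parameters.  The exact tuple layouts are defined by basparse.py.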
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/basic.py @@ -0,0 +1,65 @@ +# An implementation of Dartmouth BASIC (1964) +# + +import sys +sys.path.insert(0, "../..") + +if sys.version_info[0] >= 3: + raw_input = input + +import basiclex +import basparse +import basinterp + +# If a filename has been specified, we try to run it. +# If a runtime error occurs, we bail out and enter +# interactive mode below +if len(sys.argv) == 2: + data = open(sys.argv[1]).read() + prog = basparse.parse(data) + if not prog: + raise SystemExit + b = basinterp.BasicInterpreter(prog) + try: + b.run() + raise SystemExit + except RuntimeError: + pass + +else: + b = basinterp.BasicInterpreter({}) + +# Interactive mode. This incrementally adds/deletes statements +# from the program stored in the BasicInterpreter object. In +# addition, special commands 'NEW','LIST',and 'RUN' are added. +# Specifying a line number with no code deletes that line from +# the program. + +while 1: + try: + line = raw_input("[BASIC] ") + except EOFError: + raise SystemExit + if not line: + continue + line += "\n" + prog = basparse.parse(line) + if not prog: + continue + + keys = list(prog) + if keys[0] > 0: + b.add_statements(prog) + else: + stat = prog[keys[0]] + if stat[0] == 'RUN': + try: + b.run() + except RuntimeError: + pass + elif stat[0] == 'LIST': + b.list() + elif stat[0] == 'BLANK': + b.del_line(stat[1]) + elif stat[0] == 'NEW': + b.new()
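+
+# Note: basparse appears to key numbered statements by their (positive)
+# BASIC line number, while an immediate command such as RUN, LIST or NEW
+# comes back under a non-positive key.  That is what the keys[0] > 0 test
+# above relies on to distinguish stored program lines from immediate
+# commands.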
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/basiclex.py @@ -0,0 +1,61 @@ +# An implementation of Dartmouth BASIC (1964) + +from ply import * + +keywords = ( + 'LET', 'READ', 'DATA', 'PRINT', 'GOTO', 'IF', 'THEN', 'FOR', 'NEXT', 'TO', 'STEP', + 'END', 'STOP', 'DEF', 'GOSUB', 'DIM', 'REM', 'RETURN', 'RUN', 'LIST', 'NEW', +) + +tokens = keywords + ( + 'EQUALS', 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'POWER', + 'LPAREN', 'RPAREN', 'LT', 'LE', 'GT', 'GE', 'NE', + 'COMMA', 'SEMI', 'INTEGER', 'FLOAT', 'STRING', + 'ID', 'NEWLINE' +) + +t_ignore = ' \t' + + +def t_REM(t): + r'REM .*' + return t + + +def t_ID(t): + r'[A-Z][A-Z0-9]*' + if t.value in keywords: + t.type = t.value + return t + +t_EQUALS = r'=' +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_POWER = r'\^' +t_DIVIDE = r'/' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_LT = r'<' +t_LE = r'<=' +t_GT = r'>' +t_GE = r'>=' +t_NE = r'<>' +t_COMMA = r'\,' +t_SEMI = r';' +t_INTEGER = r'\d+' +t_FLOAT = r'((\d*\.\d+)(E[\+-]?\d+)?|([1-9]\d*E[\+-]?\d+))' +t_STRING = r'\".*?\"' + + +def t_NEWLINE(t): + r'\n' + t.lexer.lineno += 1 + return t + + +def t_error(t): + print("Illegal character %s" % t.value[0]) + t.lexer.skip(1) + +lex.lex(debug=0)
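+
+# Note: ply.lex adds string rules such as t_LE and t_LT to its master
+# regular expression in order of decreasing regex length, so two-character
+# operators like '<=', '>=' and '<>' are matched ahead of the
+# one-character '<' and '>' regardless of the declaration order above.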
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/basiclog.py @@ -0,0 +1,73 @@ +# An implementation of Dartmouth BASIC (1964) +# + +import sys +sys.path.insert(0, "../..") + +if sys.version_info[0] >= 3: + raw_input = input + +import logging +logging.basicConfig( + level=logging.INFO, + filename="parselog.txt", + filemode="w" +) +log = logging.getLogger() + +import basiclex +import basparse +import basinterp + +# If a filename has been specified, we try to run it. +# If a runtime error occurs, we bail out and enter +# interactive mode below +if len(sys.argv) == 2: + data = open(sys.argv[1]).read() + prog = basparse.parse(data, debug=log) + if not prog: + raise SystemExit + b = basinterp.BasicInterpreter(prog) + try: + b.run() + raise SystemExit + except RuntimeError: + pass + +else: + b = basinterp.BasicInterpreter({}) + +# Interactive mode. This incrementally adds/deletes statements +# from the program stored in the BasicInterpreter object. In +# addition, special commands 'NEW','LIST',and 'RUN' are added. +# Specifying a line number with no code deletes that line from +# the program. + +while 1: + try: + line = raw_input("[BASIC] ") + except EOFError: + raise SystemExit + if not line: + continue + line += "\n" + prog = basparse.parse(line, debug=log) + if not prog: + continue + + keys = list(prog) + if keys[0] > 0: + b.add_statements(prog) + else: + stat = prog[keys[0]] + if stat[0] == 'RUN': + try: + b.run() + except RuntimeError: + pass + elif stat[0] == 'LIST': + b.list() + elif stat[0] == 'BLANK': + b.del_line(stat[1]) + elif stat[0] == 'NEW': + b.new()
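+
+# Note: passing debug=log to basparse.parse() routes the parser's run-time
+# trace into parselog.txt via the logging object configured above.  Rule
+# reductions are reported at the INFO level and parsing-stack detail at
+# DEBUG, per the "Run-time Debugging" section of the PLY documentation.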
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/basinterp.py @@ -0,0 +1,496 @@ +# This file provides the runtime support for running a basic program +# Assumes the program has been parsed using basparse.py + +import sys +import math +import random + + +class BasicInterpreter: + + # Initialize the interpreter. prog is a dictionary + # containing (line,statement) mappings + def __init__(self, prog): + self.prog = prog + + self.functions = { # Built-in function table + 'SIN': lambda z: math.sin(self.eval(z)), + 'COS': lambda z: math.cos(self.eval(z)), + 'TAN': lambda z: math.tan(self.eval(z)), + 'ATN': lambda z: math.atan(self.eval(z)), + 'EXP': lambda z: math.exp(self.eval(z)), + 'ABS': lambda z: abs(self.eval(z)), + 'LOG': lambda z: math.log(self.eval(z)), + 'SQR': lambda z: math.sqrt(self.eval(z)), + 'INT': lambda z: int(self.eval(z)), + 'RND': lambda z: random.random() + } + + # Collect all data statements + def collect_data(self): + self.data = [] + for lineno in self.stat: + if self.prog[lineno][0] == 'DATA': + self.data = self.data + self.prog[lineno][1] + self.dc = 0 # Initialize the data counter + + # Check for end statements + def check_end(self): + has_end = 0 + for lineno in self.stat: + if self.prog[lineno][0] == 'END' and not has_end: + has_end = lineno + if not has_end: + print("NO END INSTRUCTION") + self.error = 1 + return + if has_end != lineno: + print("END IS NOT LAST") + self.error = 1 + + # Check loops + def check_loops(self): + for pc in range(len(self.stat)): + lineno = self.stat[pc] + if self.prog[lineno][0] == 'FOR': + forinst = self.prog[lineno] + loopvar = forinst[1] + for i in range(pc + 1, len(self.stat)): + if self.prog[self.stat[i]][0] == 'NEXT': + nextvar = self.prog[self.stat[i]][1] + if nextvar != loopvar: + continue + self.loopend[pc] = i + break + else: + print("FOR WITHOUT NEXT AT LINE %s" % self.stat[pc]) + self.error = 1 + + # Evaluate an expression + def eval(self, expr): + etype = expr[0] + if etype == 'NUM': + return expr[1] + elif etype == 'GROUP': + return self.eval(expr[1]) + elif etype == 'UNARY': + if expr[1] == '-': + return -self.eval(expr[2]) + elif etype == 'BINOP': + if expr[1] == '+': + return self.eval(expr[2]) + self.eval(expr[3]) + elif expr[1] == '-': + return self.eval(expr[2]) - self.eval(expr[3]) + elif expr[1] == '*': + return self.eval(expr[2]) * self.eval(expr[3]) + elif expr[1] == '/': + return float(self.eval(expr[2])) / self.eval(expr[3]) + elif expr[1] == '^': + return abs(self.eval(expr[2]))**self.eval(expr[3]) + elif etype == 'VAR': + var, dim1, dim2 = expr[1] + if not dim1 and not dim2: + if var in self.vars: + return self.vars[var] + else: + print("UNDEFINED VARIABLE %s AT LINE %s" % + (var, self.stat[self.pc])) + raise RuntimeError + # May be a list lookup or a function evaluation + if dim1 and not dim2: + if var in self.functions: + # A function + return self.functions[var](dim1) + else: + # A list evaluation + if var in self.lists: + dim1val = self.eval(dim1) + if dim1val < 1 or dim1val > len(self.lists[var]): + print("LIST INDEX OUT OF BOUNDS AT LINE %s" % + self.stat[self.pc]) + raise RuntimeError + return self.lists[var][dim1val - 1] + if dim1 and dim2: + if var in self.tables: + dim1val = self.eval(dim1) + dim2val = self.eval(dim2) + if dim1val < 1 or dim1val > len(self.tables[var]) or dim2val < 1 or dim2val > len(self.tables[var][0]): + print("TABLE INDEX OUT OUT BOUNDS AT LINE %s" % + self.stat[self.pc]) + raise RuntimeError + return self.tables[var][dim1val - 1][dim2val - 1] + 
print("UNDEFINED VARIABLE %s AT LINE %s" % + (var, self.stat[self.pc])) + raise RuntimeError + + # Evaluate a relational expression + def releval(self, expr): + etype = expr[1] + lhs = self.eval(expr[2]) + rhs = self.eval(expr[3]) + if etype == '<': + if lhs < rhs: + return 1 + else: + return 0 + + elif etype == '<=': + if lhs <= rhs: + return 1 + else: + return 0 + + elif etype == '>': + if lhs > rhs: + return 1 + else: + return 0 + + elif etype == '>=': + if lhs >= rhs: + return 1 + else: + return 0 + + elif etype == '=': + if lhs == rhs: + return 1 + else: + return 0 + + elif etype == '<>': + if lhs != rhs: + return 1 + else: + return 0 + + # Assignment + def assign(self, target, value): + var, dim1, dim2 = target + if not dim1 and not dim2: + self.vars[var] = self.eval(value) + elif dim1 and not dim2: + # List assignment + dim1val = self.eval(dim1) + if not var in self.lists: + self.lists[var] = [0] * 10 + + if dim1val > len(self.lists[var]): + print ("DIMENSION TOO LARGE AT LINE %s" % self.stat[self.pc]) + raise RuntimeError + self.lists[var][dim1val - 1] = self.eval(value) + elif dim1 and dim2: + dim1val = self.eval(dim1) + dim2val = self.eval(dim2) + if not var in self.tables: + temp = [0] * 10 + v = [] + for i in range(10): + v.append(temp[:]) + self.tables[var] = v + # Variable already exists + if dim1val > len(self.tables[var]) or dim2val > len(self.tables[var][0]): + print("DIMENSION TOO LARGE AT LINE %s" % self.stat[self.pc]) + raise RuntimeError + self.tables[var][dim1val - 1][dim2val - 1] = self.eval(value) + + # Change the current line number + def goto(self, linenum): + if not linenum in self.prog: + print("UNDEFINED LINE NUMBER %d AT LINE %d" % + (linenum, self.stat[self.pc])) + raise RuntimeError + self.pc = self.stat.index(linenum) + + # Run it + def run(self): + self.vars = {} # All variables + self.lists = {} # List variables + self.tables = {} # Tables + self.loops = [] # Currently active loops + self.loopend = {} # Mapping saying where loops end + self.gosub = None # Gosub return point (if any) + self.error = 0 # Indicates program error + + self.stat = list(self.prog) # Ordered list of all line numbers + self.stat.sort() + self.pc = 0 # Current program counter + + # Processing prior to running + + self.collect_data() # Collect all of the data statements + self.check_end() + self.check_loops() + + if self.error: + raise RuntimeError + + while 1: + line = self.stat[self.pc] + instr = self.prog[line] + + op = instr[0] + + # END and STOP statements + if op == 'END' or op == 'STOP': + break # We're done + + # GOTO statement + elif op == 'GOTO': + newline = instr[1] + self.goto(newline) + continue + + # PRINT statement + elif op == 'PRINT': + plist = instr[1] + out = "" + for label, val in plist: + if out: + out += ' ' * (15 - (len(out) % 15)) + out += label + if val: + if label: + out += " " + eval = self.eval(val) + out += str(eval) + sys.stdout.write(out) + end = instr[2] + if not (end == ',' or end == ';'): + sys.stdout.write("\n") + if end == ',': + sys.stdout.write(" " * (15 - (len(out) % 15))) + if end == ';': + sys.stdout.write(" " * (3 - (len(out) % 3))) + + # LET statement + elif op == 'LET': + target = instr[1] + value = instr[2] + self.assign(target, value) + + # READ statement + elif op == 'READ': + for target in instr[1]: + if self.dc < len(self.data): + value = ('NUM', self.data[self.dc]) + self.assign(target, value) + self.dc += 1 + else: + # No more data. 
Program ends + return + elif op == 'IF': + relop = instr[1] + newline = instr[2] + if (self.releval(relop)): + self.goto(newline) + continue + + elif op == 'FOR': + loopvar = instr[1] + initval = instr[2] + finval = instr[3] + stepval = instr[4] + + # Check to see if this is a new loop + if not self.loops or self.loops[-1][0] != self.pc: + # Looks like a new loop. Make the initial assignment + newvalue = initval + self.assign((loopvar, None, None), initval) + if not stepval: + stepval = ('NUM', 1) + stepval = self.eval(stepval) # Evaluate step here + self.loops.append((self.pc, stepval)) + else: + # It's a repeat of the previous loop + # Update the value of the loop variable according to the + # step + stepval = ('NUM', self.loops[-1][1]) + newvalue = ( + 'BINOP', '+', ('VAR', (loopvar, None, None)), stepval) + + if self.loops[-1][1] < 0: + relop = '>=' + else: + relop = '<=' + if not self.releval(('RELOP', relop, newvalue, finval)): + # Loop is done. Jump to the NEXT + self.pc = self.loopend[self.pc] + self.loops.pop() + else: + self.assign((loopvar, None, None), newvalue) + + elif op == 'NEXT': + if not self.loops: + print("NEXT WITHOUT FOR AT LINE %s" % line) + return + + nextvar = instr[1] + self.pc = self.loops[-1][0] + loopinst = self.prog[self.stat[self.pc]] + forvar = loopinst[1] + if nextvar != forvar: + print("NEXT DOESN'T MATCH FOR AT LINE %s" % line) + return + continue + elif op == 'GOSUB': + newline = instr[1] + if self.gosub: + print("ALREADY IN A SUBROUTINE AT LINE %s" % line) + return + self.gosub = self.stat[self.pc] + self.goto(newline) + continue + + elif op == 'RETURN': + if not self.gosub: + print("RETURN WITHOUT A GOSUB AT LINE %s" % line) + return + self.goto(self.gosub) + self.gosub = None + + elif op == 'FUNC': + fname = instr[1] + pname = instr[2] + expr = instr[3] + + def eval_func(pvalue, name=pname, self=self, expr=expr): + self.assign((pname, None, None), pvalue) + return self.eval(expr) + self.functions[fname] = eval_func + + elif op == 'DIM': + for vname, x, y in instr[1]: + if y == 0: + # Single dimension variable + self.lists[vname] = [0] * x + else: + # Double dimension variable + temp = [0] * y + v = [] + for i in range(x): + v.append(temp[:]) + self.tables[vname] = v + + self.pc += 1 + + # Utility functions for program listing + def expr_str(self, expr): + etype = expr[0] + if etype == 'NUM': + return str(expr[1]) + elif etype == 'GROUP': + return "(%s)" % self.expr_str(expr[1]) + elif etype == 'UNARY': + if expr[1] == '-': + return "-" + str(expr[2]) + elif etype == 'BINOP': + return "%s %s %s" % (self.expr_str(expr[2]), expr[1], self.expr_str(expr[3])) + elif etype == 'VAR': + return self.var_str(expr[1]) + + def relexpr_str(self, expr): + return "%s %s %s" % (self.expr_str(expr[2]), expr[1], self.expr_str(expr[3])) + + def var_str(self, var): + varname, dim1, dim2 = var + if not dim1 and not dim2: + return varname + if dim1 and not dim2: + return "%s(%s)" % (varname, self.expr_str(dim1)) + return "%s(%s,%s)" % (varname, self.expr_str(dim1), self.expr_str(dim2)) + + # Create a program listing + def list(self): + stat = list(self.prog) # Ordered list of all line numbers + stat.sort() + for line in stat: + instr = self.prog[line] + op = instr[0] + if op in ['END', 'STOP', 'RETURN']: + print("%s %s" % (line, op)) + continue + elif op == 'REM': + print("%s %s" % (line, instr[1])) + elif op == 'PRINT': + _out = "%s %s " % (line, op) + first = 1 + for p in instr[1]: + if not first: + _out += ", " + if p[0] and p[1]: + _out += '"%s"%s' % (p[0], 
self.expr_str(p[1])) + elif p[1]: + _out += self.expr_str(p[1]) + else: + _out += '"%s"' % (p[0],) + first = 0 + if instr[2]: + _out += instr[2] + print(_out) + elif op == 'LET': + print("%s LET %s = %s" % + (line, self.var_str(instr[1]), self.expr_str(instr[2]))) + elif op == 'READ': + _out = "%s READ " % line + first = 1 + for r in instr[1]: + if not first: + _out += "," + _out += self.var_str(r) + first = 0 + print(_out) + elif op == 'IF': + print("%s IF %s THEN %d" % + (line, self.relexpr_str(instr[1]), instr[2])) + elif op == 'GOTO' or op == 'GOSUB': + print("%s %s %s" % (line, op, instr[1])) + elif op == 'FOR': + _out = "%s FOR %s = %s TO %s" % ( + line, instr[1], self.expr_str(instr[2]), self.expr_str(instr[3])) + if instr[4]: + _out += " STEP %s" % (self.expr_str(instr[4])) + print(_out) + elif op == 'NEXT': + print("%s NEXT %s" % (line, instr[1])) + elif op == 'FUNC': + print("%s DEF %s(%s) = %s" % + (line, instr[1], instr[2], self.expr_str(instr[3]))) + elif op == 'DIM': + _out = "%s DIM " % line + first = 1 + for vname, x, y in instr[1]: + if not first: + _out += "," + first = 0 + if y == 0: + _out += "%s(%d)" % (vname, x) + else: + _out += "%s(%d,%d)" % (vname, x, y) + + print(_out) + elif op == 'DATA': + _out = "%s DATA " % line + first = 1 + for v in instr[1]: + if not first: + _out += "," + first = 0 + _out += v + print(_out) + + # Erase the current program + def new(self): + self.prog = {} + + # Insert statements + def add_statements(self, prog): + for line, stat in prog.items(): + self.prog[line] = stat + + # Delete a statement + def del_line(self, lineno): + try: + del self.prog[lineno] + except KeyError: + pass
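A note on the tuple format that eval() above walks: it can be exercised directly. A minimal sketch, assuming basinterp.py is importable from this example directory (an illustration only, not part of the file above):

    import basinterp

    # 1 + 2 * 3 in the interpreter's expression format
    expr = ('BINOP', '+', ('NUM', 1), ('BINOP', '*', ('NUM', 2), ('NUM', 3)))
    interp = basinterp.BasicInterpreter({})
    print(interp.eval(expr))  # prints 7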
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/basparse.py @@ -0,0 +1,474 @@ +# An implementation of Dartmouth BASIC (1964) +# + +from ply import * +import basiclex + +tokens = basiclex.tokens + +precedence = ( + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ('left', 'POWER'), + ('right', 'UMINUS') +) + +# A BASIC program is a series of statements. We represent the program as a +# dictionary of tuples indexed by line number. + + +def p_program(p): + '''program : program statement + | statement''' + + if len(p) == 2 and p[1]: + p[0] = {} + line, stat = p[1] + p[0][line] = stat + elif len(p) == 3: + p[0] = p[1] + if not p[0]: + p[0] = {} + if p[2]: + line, stat = p[2] + p[0][line] = stat + +# This catch-all rule is used for any catastrophic errors. In this case, +# we simply return nothing + + +def p_program_error(p): + '''program : error''' + p[0] = None + p.parser.error = 1 + +# Format of all BASIC statements. + + +def p_statement(p): + '''statement : INTEGER command NEWLINE''' + if isinstance(p[2], str): + print("%s %s %s" % (p[2], "AT LINE", p[1])) + p[0] = None + p.parser.error = 1 + else: + lineno = int(p[1]) + p[0] = (lineno, p[2]) + +# Interactive statements. + + +def p_statement_interactive(p): + '''statement : RUN NEWLINE + | LIST NEWLINE + | NEW NEWLINE''' + p[0] = (0, (p[1], 0)) + +# Blank line number + + +def p_statement_blank(p): + '''statement : INTEGER NEWLINE''' + p[0] = (0, ('BLANK', int(p[1]))) + +# Error handling for malformed statements + + +def p_statement_bad(p): + '''statement : INTEGER error NEWLINE''' + print("MALFORMED STATEMENT AT LINE %s" % p[1]) + p[0] = None + p.parser.error = 1 + +# Blank line + + +def p_statement_newline(p): + '''statement : NEWLINE''' + p[0] = None + +# LET statement + + +def p_command_let(p): + '''command : LET variable EQUALS expr''' + p[0] = ('LET', p[2], p[4]) + + +def p_command_let_bad(p): + '''command : LET variable EQUALS error''' + p[0] = "BAD EXPRESSION IN LET" + +# READ statement + + +def p_command_read(p): + '''command : READ varlist''' + p[0] = ('READ', p[2]) + + +def p_command_read_bad(p): + '''command : READ error''' + p[0] = "MALFORMED VARIABLE LIST IN READ" + +# DATA statement + + +def p_command_data(p): + '''command : DATA numlist''' + p[0] = ('DATA', p[2]) + + +def p_command_data_bad(p): + '''command : DATA error''' + p[0] = "MALFORMED NUMBER LIST IN DATA" + +# PRINT statement + + +def p_command_print(p): + '''command : PRINT plist optend''' + p[0] = ('PRINT', p[2], p[3]) + + +def p_command_print_bad(p): + '''command : PRINT error''' + p[0] = "MALFORMED PRINT STATEMENT" + +# Optional ending on PRINT. 
Either a comma (,) or semicolon (;) + + +def p_optend(p): + '''optend : COMMA + | SEMI + |''' + if len(p) == 2: + p[0] = p[1] + else: + p[0] = None + +# PRINT statement with no arguments + + +def p_command_print_empty(p): + '''command : PRINT''' + p[0] = ('PRINT', [], None) + +# GOTO statement + + +def p_command_goto(p): + '''command : GOTO INTEGER''' + p[0] = ('GOTO', int(p[2])) + + +def p_command_goto_bad(p): + '''command : GOTO error''' + p[0] = "INVALID LINE NUMBER IN GOTO" + +# IF-THEN statement + + +def p_command_if(p): + '''command : IF relexpr THEN INTEGER''' + p[0] = ('IF', p[2], int(p[4])) + + +def p_command_if_bad(p): + '''command : IF error THEN INTEGER''' + p[0] = "BAD RELATIONAL EXPRESSION" + + +def p_command_if_bad2(p): + '''command : IF relexpr THEN error''' + p[0] = "INVALID LINE NUMBER IN THEN" + +# FOR statement + + +def p_command_for(p): + '''command : FOR ID EQUALS expr TO expr optstep''' + p[0] = ('FOR', p[2], p[4], p[6], p[7]) + + +def p_command_for_bad_initial(p): + '''command : FOR ID EQUALS error TO expr optstep''' + p[0] = "BAD INITIAL VALUE IN FOR STATEMENT" + + +def p_command_for_bad_final(p): + '''command : FOR ID EQUALS expr TO error optstep''' + p[0] = "BAD FINAL VALUE IN FOR STATEMENT" + + +def p_command_for_bad_step(p): + '''command : FOR ID EQUALS expr TO expr STEP error''' + p[0] = "MALFORMED STEP IN FOR STATEMENT" + +# Optional STEP qualifier on FOR statement + + +def p_optstep(p): + '''optstep : STEP expr + | empty''' + if len(p) == 3: + p[0] = p[2] + else: + p[0] = None + +# NEXT statement + + +def p_command_next(p): + '''command : NEXT ID''' + + p[0] = ('NEXT', p[2]) + + +def p_command_next_bad(p): + '''command : NEXT error''' + p[0] = "MALFORMED NEXT" + +# END statement + + +def p_command_end(p): + '''command : END''' + p[0] = ('END',) + +# REM statement + + +def p_command_rem(p): + '''command : REM''' + p[0] = ('REM', p[1]) + +# STOP statement + + +def p_command_stop(p): + '''command : STOP''' + p[0] = ('STOP',) + +# DEF statement + + +def p_command_def(p): + '''command : DEF ID LPAREN ID RPAREN EQUALS expr''' + p[0] = ('FUNC', p[2], p[4], p[7]) + + +def p_command_def_bad_rhs(p): + '''command : DEF ID LPAREN ID RPAREN EQUALS error''' + p[0] = "BAD EXPRESSION IN DEF STATEMENT" + + +def p_command_def_bad_arg(p): + '''command : DEF ID LPAREN error RPAREN EQUALS expr''' + p[0] = "BAD ARGUMENT IN DEF STATEMENT" + +# GOSUB statement + + +def p_command_gosub(p): + '''command : GOSUB INTEGER''' + p[0] = ('GOSUB', int(p[2])) + + +def p_command_gosub_bad(p): + '''command : GOSUB error''' + p[0] = "INVALID LINE NUMBER IN GOSUB" + +# RETURN statement + + +def p_command_return(p): + '''command : RETURN''' + p[0] = ('RETURN',) + +# DIM statement + + +def p_command_dim(p): + '''command : DIM dimlist''' + p[0] = ('DIM', p[2]) + + +def p_command_dim_bad(p): + '''command : DIM error''' + p[0] = "MALFORMED VARIABLE LIST IN DIM" + +# List of variables supplied to DIM statement + + +def p_dimlist(p): + '''dimlist : dimlist COMMA dimitem + | dimitem''' + if len(p) == 4: + p[0] = p[1] + p[0].append(p[3]) + else: + p[0] = [p[1]] + +# DIM items + + +def p_dimitem_single(p): + '''dimitem : ID LPAREN INTEGER RPAREN''' + p[0] = (p[1], eval(p[3]), 0) + + +def p_dimitem_double(p): + '''dimitem : ID LPAREN INTEGER COMMA INTEGER RPAREN''' + p[0] = (p[1], eval(p[3]), eval(p[5])) + +# Arithmetic expressions + + +def p_expr_binary(p): + '''expr : expr PLUS expr + | expr MINUS expr + | expr TIMES expr + | expr DIVIDE expr + | expr POWER expr''' + + p[0] = ('BINOP', p[2], p[1], p[3]) 
+ + +def p_expr_number(p): + '''expr : INTEGER + | FLOAT''' + p[0] = ('NUM', eval(p[1])) + + +def p_expr_variable(p): + '''expr : variable''' + p[0] = ('VAR', p[1]) + + +def p_expr_group(p): + '''expr : LPAREN expr RPAREN''' + p[0] = ('GROUP', p[2]) + + +def p_expr_unary(p): + '''expr : MINUS expr %prec UMINUS''' + p[0] = ('UNARY', '-', p[2]) + +# Relational expressions + + +def p_relexpr(p): + '''relexpr : expr LT expr + | expr LE expr + | expr GT expr + | expr GE expr + | expr EQUALS expr + | expr NE expr''' + p[0] = ('RELOP', p[2], p[1], p[3]) + +# Variables + + +def p_variable(p): + '''variable : ID + | ID LPAREN expr RPAREN + | ID LPAREN expr COMMA expr RPAREN''' + if len(p) == 2: + p[0] = (p[1], None, None) + elif len(p) == 5: + p[0] = (p[1], p[3], None) + else: + p[0] = (p[1], p[3], p[5]) + +# Builds a list of variable targets as a Python list + + +def p_varlist(p): + '''varlist : varlist COMMA variable + | variable''' + if len(p) > 2: + p[0] = p[1] + p[0].append(p[3]) + else: + p[0] = [p[1]] + + +# Builds a list of numbers as a Python list + +def p_numlist(p): + '''numlist : numlist COMMA number + | number''' + + if len(p) > 2: + p[0] = p[1] + p[0].append(p[3]) + else: + p[0] = [p[1]] + +# A number. May be an integer or a float + + +def p_number(p): + '''number : INTEGER + | FLOAT''' + p[0] = eval(p[1]) + +# A signed number. + + +def p_number_signed(p): + '''number : MINUS INTEGER + | MINUS FLOAT''' + p[0] = eval("-" + p[2]) + +# List of targets for a print statement +# Returns a list of tuples (label,expr) + + +def p_plist(p): + '''plist : plist COMMA pitem + | pitem''' + if len(p) > 3: + p[0] = p[1] + p[0].append(p[3]) + else: + p[0] = [p[1]] + + +def p_item_string(p): + '''pitem : STRING''' + p[0] = (p[1][1:-1], None) + + +def p_item_string_expr(p): + '''pitem : STRING expr''' + p[0] = (p[1][1:-1], p[2]) + + +def p_item_expr(p): + '''pitem : expr''' + p[0] = ("", p[1]) + +# Empty + + +def p_empty(p): + '''empty : ''' + +# Catastrophic error handler + + +def p_error(p): + if not p: + print("SYNTAX ERROR AT EOF") + +bparser = yacc.yacc() + + +def parse(data, debug=0): + bparser.error = 0 + p = bparser.parse(data, debug=debug) + if bparser.error: + return None + return p
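How the two modules fit together, as a minimal driver sketch (basiclog.py above does the same thing with parser logging enabled; assumes PLY and this directory are on sys.path): basparse.parse() returns a dictionary mapping line numbers to statement tuples, or None on a parse error, and BasicInterpreter consumes that dictionary.

    import basparse
    import basinterp

    prog = basparse.parse('10 PRINT "HELLO"\n20 END\n')  # {10: ('PRINT', ...), 20: ('END',)}
    if prog:
        basinterp.BasicInterpreter(prog).run()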
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/dim.bas @@ -0,0 +1,14 @@ +5 DIM A(50,15) +10 FOR I = 1 TO 50 +20 FOR J = 1 TO 15 +30 LET A(I,J) = I + J +35 REM PRINT I,J, A(I,J) +40 NEXT J +50 NEXT I +100 FOR I = 1 TO 50 +110 FOR J = 1 TO 15 +120 PRINT A(I,J), +130 NEXT J +140 PRINT +150 NEXT I +999 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/func.bas @@ -0,0 +1,5 @@ +10 DEF FDX(X) = 2*X +20 FOR I = 0 TO 100 +30 PRINT FDX(I) +40 NEXT I +50 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/gcd.bas @@ -0,0 +1,22 @@ +10 PRINT "A","B","C","GCD" +20 READ A,B,C +30 LET X = A +40 LET Y = B +50 GOSUB 200 +60 LET X = G +70 LET Y = C +80 GOSUB 200 +90 PRINT A, B, C, G +100 GOTO 20 +110 DATA 60, 90, 120 +120 DATA 38456, 64872, 98765 +130 DATA 32, 384, 72 +200 LET Q = INT(X/Y) +210 LET R = X - Q*Y +220 IF R = 0 THEN 300 +230 LET X = Y +240 LET Y = R +250 GOTO 200 +300 LET G = Y +310 RETURN +999 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/gosub.bas @@ -0,0 +1,13 @@ +100 LET X = 3 +110 GOSUB 400 +120 PRINT U, V, W +200 LET X = 5 +210 GOSUB 400 +220 LET Z = U + 2*V + 3*W +230 PRINT Z +240 GOTO 999 +400 LET U = X*X +410 LET V = X*X*X +420 LET W = X*X*X*X + X*X*X + X*X + X +430 RETURN +999 END
new file mode 100644
--- /dev/null
+++ b/other-licenses/ply/example/BASIC/hello.bas
@@ -0,0 +1,4 @@
+5 REM HELLO WORLD PROGRAM
+10 PRINT "HELLO WORLD"
+99 END
+
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/linear.bas @@ -0,0 +1,17 @@ +1 REM ::: SOLVE A SYSTEM OF LINEAR EQUATIONS +2 REM ::: A1*X1 + A2*X2 = B1 +3 REM ::: A3*X1 + A4*X2 = B2 +4 REM -------------------------------------- +10 READ A1, A2, A3, A4 +15 LET D = A1 * A4 - A3 * A2 +20 IF D = 0 THEN 65 +30 READ B1, B2 +37 LET X1 = (B1*A4 - B2*A2) / D +42 LET X2 = (A1*B2 - A3*B1) / D +55 PRINT X1, X2 +60 GOTO 30 +65 PRINT "NO UNIQUE SOLUTION" +70 DATA 1, 2, 4 +80 DATA 2, -7, 5 +85 DATA 1, 3, 4, -7 +90 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/maxsin.bas @@ -0,0 +1,12 @@ +5 PRINT "X VALUE", "SINE", "RESOLUTION" +10 READ D +20 LET M = -1 +30 FOR X = 0 TO 3 STEP D +40 IF SIN(X) <= M THEN 80 +50 LET X0 = X +60 LET M = SIN(X) +80 NEXT X +85 PRINT X0, M, D +90 GOTO 10 +100 DATA .1, .01, .001 +110 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/powers.bas @@ -0,0 +1,13 @@ +5 PRINT "THIS PROGRAM COMPUTES AND PRINTS THE NTH POWERS" +6 PRINT "OF THE NUMBERS LESS THAN OR EQUAL TO N FOR VARIOUS" +7 PRINT "N FROM 1 THROUGH 7" +8 PRINT +10 FOR N = 1 TO 7 +15 PRINT "N = "N +20 FOR I = 1 TO N +30 PRINT I^N, +40 NEXT I +50 PRINT +60 PRINT +70 NEXT N +80 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/rand.bas @@ -0,0 +1,4 @@ +10 FOR I = 1 TO 20 +20 PRINT INT(10*RND(0)) +30 NEXT I +40 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/sales.bas @@ -0,0 +1,20 @@ +10 FOR I = 1 TO 3 +20 READ P(I) +30 NEXT I +40 FOR I = 1 TO 3 +50 FOR J = 1 TO 5 +60 READ S(I,J) +70 NEXT J +80 NEXT I +90 FOR J = 1 TO 5 +100 LET S = 0 +110 FOR I = 1 TO 3 +120 LET S = S + P(I) * S(I,J) +130 NEXT I +140 PRINT "TOTAL SALES FOR SALESMAN"J, "$"S +150 NEXT J +200 DATA 1.25, 4.30, 2.50 +210 DATA 40, 20, 37, 29, 42 +220 DATA 10, 16, 3, 21, 8 +230 DATA 35, 47, 29, 16, 33 +300 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/sears.bas @@ -0,0 +1,18 @@ +1 REM :: THIS PROGRAM COMPUTES HOW MANY TIMES YOU HAVE TO FOLD +2 REM :: A PIECE OF PAPER SO THAT IT IS TALLER THAN THE +3 REM :: SEARS TOWER. +4 REM :: S = HEIGHT OF TOWER (METERS) +5 REM :: T = THICKNESS OF PAPER (MILLIMETERS) +10 LET S = 442 +20 LET T = 0.1 +30 REM CONVERT T TO METERS +40 LET T = T * .001 +50 LET F = 1 +60 LET H = T +100 IF H > S THEN 200 +120 LET H = 2 * H +125 LET F = F + 1 +130 GOTO 100 +200 PRINT "NUMBER OF FOLDS ="F +220 PRINT "FINAL HEIGHT ="H +999 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/sqrt1.bas @@ -0,0 +1,5 @@ +10 LET X = 0 +20 LET X = X + 1 +30 PRINT X, SQR(X) +40 IF X < 100 THEN 20 +50 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/BASIC/sqrt2.bas @@ -0,0 +1,4 @@ +10 FOR X = 1 TO 100 +20 PRINT X, SQR(X) +30 NEXT X +40 END
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/GardenSnake/GardenSnake.py @@ -0,0 +1,777 @@ +# GardenSnake - a parser generator demonstration program +# +# This implements a modified version of a subset of Python: +# - only 'def', 'return' and 'if' statements +# - 'if' only has 'then' clause (no elif nor else) +# - single-quoted strings only, content in raw format +# - numbers are decimal.Decimal instances (not integers or floats) +# - no print statment; use the built-in 'print' function +# - only < > == + - / * implemented (and unary + -) +# - assignment and tuple assignment work +# - no generators of any sort +# - no ... well, no quite a lot + +# Why? I'm thinking about a new indentation-based configuration +# language for a project and wanted to figure out how to do it. Once +# I got that working I needed a way to test it out. My original AST +# was dumb so I decided to target Python's AST and compile it into +# Python code. Plus, it's pretty cool that it only took a day or so +# from sitting down with Ply to having working code. + +# This uses David Beazley's Ply from http://www.dabeaz.com/ply/ + +# This work is hereby released into the Public Domain. To view a copy of +# the public domain dedication, visit +# http://creativecommons.org/licenses/publicdomain/ or send a letter to +# Creative Commons, 543 Howard Street, 5th Floor, San Francisco, +# California, 94105, USA. +# +# Portions of this work are derived from Python's Grammar definition +# and may be covered under the Python copyright and license +# +# Andrew Dalke / Dalke Scientific Software, LLC +# 30 August 2006 / Cape Town, South Africa + +# Changelog: +# 30 August - added link to CC license; removed the "swapcase" encoding + +# Modifications for inclusion in PLY distribution +import sys +sys.path.insert(0, "../..") +from ply import * + +##### Lexer ###### +#import lex +import decimal + +tokens = ( + 'DEF', + 'IF', + 'NAME', + 'NUMBER', # Python decimals + 'STRING', # single quoted strings only; syntax of raw strings + 'LPAR', + 'RPAR', + 'COLON', + 'EQ', + 'ASSIGN', + 'LT', + 'GT', + 'PLUS', + 'MINUS', + 'MULT', + 'DIV', + 'RETURN', + 'WS', + 'NEWLINE', + 'COMMA', + 'SEMICOLON', + 'INDENT', + 'DEDENT', + 'ENDMARKER', +) + +#t_NUMBER = r'\d+' +# taken from decmial.py but without the leading sign + + +def t_NUMBER(t): + r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?""" + t.value = decimal.Decimal(t.value) + return t + + +def t_STRING(t): + r"'([^\\']+|\\'|\\\\)*'" # I think this is right ... + t.value = t.value[1:-1].decode("string-escape") # .swapcase() # for fun + return t + +t_COLON = r':' +t_EQ = r'==' +t_ASSIGN = r'=' +t_LT = r'<' +t_GT = r'>' +t_PLUS = r'\+' +t_MINUS = r'-' +t_MULT = r'\*' +t_DIV = r'/' +t_COMMA = r',' +t_SEMICOLON = r';' + +# Ply nicely documented how to do this. + +RESERVED = { + "def": "DEF", + "if": "IF", + "return": "RETURN", +} + + +def t_NAME(t): + r'[a-zA-Z_][a-zA-Z0-9_]*' + t.type = RESERVED.get(t.value, "NAME") + return t + +# Putting this before t_WS let it consume lines with only comments in +# them so the latter code never sees the WS part. Not consuming the +# newline. 
Needed for "if 1: #comment" + + +def t_comment(t): + r"[ ]*\043[^\n]*" # \043 is '#' + pass + + +# Whitespace +def t_WS(t): + r' [ ]+ ' + if t.lexer.at_line_start and t.lexer.paren_count == 0: + return t + +# Don't generate newline tokens when inside of parenthesis, eg +# a = (1, +# 2, 3) + + +def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + t.type = "NEWLINE" + if t.lexer.paren_count == 0: + return t + + +def t_LPAR(t): + r'\(' + t.lexer.paren_count += 1 + return t + + +def t_RPAR(t): + r'\)' + # check for underflow? should be the job of the parser + t.lexer.paren_count -= 1 + return t + + +def t_error(t): + raise SyntaxError("Unknown symbol %r" % (t.value[0],)) + print "Skipping", repr(t.value[0]) + t.lexer.skip(1) + +# I implemented INDENT / DEDENT generation as a post-processing filter + +# The original lex token stream contains WS and NEWLINE characters. +# WS will only occur before any other tokens on a line. + +# I have three filters. One tags tokens by adding two attributes. +# "must_indent" is True if the token must be indented from the +# previous code. The other is "at_line_start" which is True for WS +# and the first non-WS/non-NEWLINE on a line. It flags the check so +# see if the new line has changed indication level. + +# Python's syntax has three INDENT states +# 0) no colon hence no need to indent +# 1) "if 1: go()" - simple statements have a COLON but no need for an indent +# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent +NO_INDENT = 0 +MAY_INDENT = 1 +MUST_INDENT = 2 + +# only care about whitespace at the start of a line + + +def track_tokens_filter(lexer, tokens): + lexer.at_line_start = at_line_start = True + indent = NO_INDENT + saw_colon = False + for token in tokens: + token.at_line_start = at_line_start + + if token.type == "COLON": + at_line_start = False + indent = MAY_INDENT + token.must_indent = False + + elif token.type == "NEWLINE": + at_line_start = True + if indent == MAY_INDENT: + indent = MUST_INDENT + token.must_indent = False + + elif token.type == "WS": + assert token.at_line_start == True + at_line_start = True + token.must_indent = False + + else: + # A real token; only indent after COLON NEWLINE + if indent == MUST_INDENT: + token.must_indent = True + else: + token.must_indent = False + at_line_start = False + indent = NO_INDENT + + yield token + lexer.at_line_start = at_line_start + + +def _new_token(type, lineno): + tok = lex.LexToken() + tok.type = type + tok.value = None + tok.lineno = lineno + return tok + +# Synthesize a DEDENT tag + + +def DEDENT(lineno): + return _new_token("DEDENT", lineno) + +# Synthesize an INDENT tag + + +def INDENT(lineno): + return _new_token("INDENT", lineno) + + +# Track the indentation level and emit the right INDENT / DEDENT events. +def indentation_filter(tokens): + # A stack of indentation levels; will never pop item 0 + levels = [0] + token = None + depth = 0 + prev_was_ws = False + for token in tokens: + # if 1: + # print "Process", token, + # if token.at_line_start: + # print "at_line_start", + # if token.must_indent: + # print "must_indent", + # print + + # WS only occurs at the start of the line + # There may be WS followed by NEWLINE so + # only track the depth here. Don't indent/dedent + # until there's something real. 
+ if token.type == "WS": + assert depth == 0 + depth = len(token.value) + prev_was_ws = True + # WS tokens are never passed to the parser + continue + + if token.type == "NEWLINE": + depth = 0 + if prev_was_ws or token.at_line_start: + # ignore blank lines + continue + # pass the other cases on through + yield token + continue + + # then it must be a real token (not WS, not NEWLINE) + # which can affect the indentation level + + prev_was_ws = False + if token.must_indent: + # The current depth must be larger than the previous level + if not (depth > levels[-1]): + raise IndentationError("expected an indented block") + + levels.append(depth) + yield INDENT(token.lineno) + + elif token.at_line_start: + # Must be on the same level or one of the previous levels + if depth == levels[-1]: + # At the same level + pass + elif depth > levels[-1]: + raise IndentationError( + "indentation increase but not in new block") + else: + # Back up; but only if it matches a previous level + try: + i = levels.index(depth) + except ValueError: + raise IndentationError("inconsistent indentation") + for _ in range(i + 1, len(levels)): + yield DEDENT(token.lineno) + levels.pop() + + yield token + + ### Finished processing ### + + # Must dedent any remaining levels + if len(levels) > 1: + assert token is not None + for _ in range(1, len(levels)): + yield DEDENT(token.lineno) + + +# The top-level filter adds an ENDMARKER, if requested. +# Python's grammar uses it. +def filter(lexer, add_endmarker=True): + token = None + tokens = iter(lexer.token, None) + tokens = track_tokens_filter(lexer, tokens) + for token in indentation_filter(tokens): + yield token + + if add_endmarker: + lineno = 1 + if token is not None: + lineno = token.lineno + yield _new_token("ENDMARKER", lineno) + +# Combine Ply and my filters into a new lexer + + +class IndentLexer(object): + + def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0): + self.lexer = lex.lex(debug=debug, optimize=optimize, + lextab=lextab, reflags=reflags) + self.token_stream = None + + def input(self, s, add_endmarker=True): + self.lexer.paren_count = 0 + self.lexer.input(s) + self.token_stream = filter(self.lexer, add_endmarker) + + def token(self): + try: + return self.token_stream.next() + except StopIteration: + return None + +########## Parser (tokens -> AST) ###### + +# also part of Ply +#import yacc + +# I use the Python AST +from compiler import ast + +# Helper function + + +def Assign(left, right): + names = [] + if isinstance(left, ast.Name): + # Single assignment on left + return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right) + elif isinstance(left, ast.Tuple): + # List of things - make sure they are Name nodes + names = [] + for child in left.getChildren(): + if not isinstance(child, ast.Name): + raise SyntaxError("that assignment not supported") + names.append(child.name) + ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names] + return ast.Assign([ast.AssTuple(ass_list)], right) + else: + raise SyntaxError("Can't do that yet") + + +# The grammar comments come from Python's Grammar/Grammar file + +# NB: compound_stmt in single_input is followed by extra NEWLINE! 
+# file_input: (NEWLINE | stmt)* ENDMARKER +def p_file_input_end(p): + """file_input_end : file_input ENDMARKER""" + p[0] = ast.Stmt(p[1]) + + +def p_file_input(p): + """file_input : file_input NEWLINE + | file_input stmt + | NEWLINE + | stmt""" + if isinstance(p[len(p) - 1], basestring): + if len(p) == 3: + p[0] = p[1] + else: + p[0] = [] # p == 2 --> only a blank line + else: + if len(p) == 3: + p[0] = p[1] + p[2] + else: + p[0] = p[1] + + +# funcdef: [decorators] 'def' NAME parameters ':' suite +# ignoring decorators +def p_funcdef(p): + "funcdef : DEF NAME parameters COLON suite" + p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5]) + +# parameters: '(' [varargslist] ')' + + +def p_parameters(p): + """parameters : LPAR RPAR + | LPAR varargslist RPAR""" + if len(p) == 3: + p[0] = [] + else: + p[0] = p[2] + + +# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) | +# highly simplified +def p_varargslist(p): + """varargslist : varargslist COMMA NAME + | NAME""" + if len(p) == 4: + p[0] = p[1] + p[3] + else: + p[0] = [p[1]] + +# stmt: simple_stmt | compound_stmt + + +def p_stmt_simple(p): + """stmt : simple_stmt""" + # simple_stmt is a list + p[0] = p[1] + + +def p_stmt_compound(p): + """stmt : compound_stmt""" + p[0] = [p[1]] + +# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE + + +def p_simple_stmt(p): + """simple_stmt : small_stmts NEWLINE + | small_stmts SEMICOLON NEWLINE""" + p[0] = p[1] + + +def p_small_stmts(p): + """small_stmts : small_stmts SEMICOLON small_stmt + | small_stmt""" + if len(p) == 4: + p[0] = p[1] + [p[3]] + else: + p[0] = [p[1]] + +# small_stmt: expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt | +# import_stmt | global_stmt | exec_stmt | assert_stmt + + +def p_small_stmt(p): + """small_stmt : flow_stmt + | expr_stmt""" + p[0] = p[1] + +# expr_stmt: testlist (augassign (yield_expr|testlist) | +# ('=' (yield_expr|testlist))*) +# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | +# '<<=' | '>>=' | '**=' | '//=') + + +def p_expr_stmt(p): + """expr_stmt : testlist ASSIGN testlist + | testlist """ + if len(p) == 2: + # a list of expressions + p[0] = ast.Discard(p[1]) + else: + p[0] = Assign(p[1], p[3]) + + +def p_flow_stmt(p): + "flow_stmt : return_stmt" + p[0] = p[1] + +# return_stmt: 'return' [testlist] + + +def p_return_stmt(p): + "return_stmt : RETURN testlist" + p[0] = ast.Return(p[2]) + + +def p_compound_stmt(p): + """compound_stmt : if_stmt + | funcdef""" + p[0] = p[1] + + +def p_if_stmt(p): + 'if_stmt : IF test COLON suite' + p[0] = ast.If([(p[2], p[4])], None) + + +def p_suite(p): + """suite : simple_stmt + | NEWLINE INDENT stmts DEDENT""" + if len(p) == 2: + p[0] = ast.Stmt(p[1]) + else: + p[0] = ast.Stmt(p[3]) + + +def p_stmts(p): + """stmts : stmts stmt + | stmt""" + if len(p) == 3: + p[0] = p[1] + p[2] + else: + p[0] = p[1] + +# No using Python's approach because Ply supports precedence + +# comparison: expr (comp_op expr)* +# arith_expr: term (('+'|'-') term)* +# term: factor (('*'|'/'|'%'|'//') factor)* +# factor: ('+'|'-'|'~') factor | power +# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' + + +def make_lt_compare((left, right)): + return ast.Compare(left, [('<', right), ]) + + +def make_gt_compare((left, right)): + return ast.Compare(left, [('>', right), ]) + + +def make_eq_compare((left, right)): + return ast.Compare(left, [('==', right), ]) + + +binary_ops = { + "+": ast.Add, + "-": ast.Sub, + "*": ast.Mul, + "/": ast.Div, + "<": make_lt_compare, + ">": 
make_gt_compare, + "==": make_eq_compare, +} +unary_ops = { + "+": ast.UnaryAdd, + "-": ast.UnarySub, +} +precedence = ( + ("left", "EQ", "GT", "LT"), + ("left", "PLUS", "MINUS"), + ("left", "MULT", "DIV"), +) + + +def p_comparison(p): + """comparison : comparison PLUS comparison + | comparison MINUS comparison + | comparison MULT comparison + | comparison DIV comparison + | comparison LT comparison + | comparison EQ comparison + | comparison GT comparison + | PLUS comparison + | MINUS comparison + | power""" + if len(p) == 4: + p[0] = binary_ops[p[2]]((p[1], p[3])) + elif len(p) == 3: + p[0] = unary_ops[p[1]](p[2]) + else: + p[0] = p[1] + +# power: atom trailer* ['**' factor] +# trailers enables function calls. I only allow one level of calls +# so this is 'trailer' + + +def p_power(p): + """power : atom + | atom trailer""" + if len(p) == 2: + p[0] = p[1] + else: + if p[2][0] == "CALL": + p[0] = ast.CallFunc(p[1], p[2][1], None, None) + else: + raise AssertionError("not implemented") + + +def p_atom_name(p): + """atom : NAME""" + p[0] = ast.Name(p[1]) + + +def p_atom_number(p): + """atom : NUMBER + | STRING""" + p[0] = ast.Const(p[1]) + + +def p_atom_tuple(p): + """atom : LPAR testlist RPAR""" + p[0] = p[2] + +# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME + + +def p_trailer(p): + "trailer : LPAR arglist RPAR" + p[0] = ("CALL", p[2]) + +# testlist: test (',' test)* [','] +# Contains shift/reduce error + + +def p_testlist(p): + """testlist : testlist_multi COMMA + | testlist_multi """ + if len(p) == 2: + p[0] = p[1] + else: + # May need to promote singleton to tuple + if isinstance(p[1], list): + p[0] = p[1] + else: + p[0] = [p[1]] + # Convert into a tuple? + if isinstance(p[0], list): + p[0] = ast.Tuple(p[0]) + + +def p_testlist_multi(p): + """testlist_multi : testlist_multi COMMA test + | test""" + if len(p) == 2: + # singleton + p[0] = p[1] + else: + if isinstance(p[1], list): + p[0] = p[1] + [p[3]] + else: + # singleton -> tuple + p[0] = [p[1], p[3]] + + +# test: or_test ['if' or_test 'else' test] | lambdef +# as I don't support 'and', 'or', and 'not' this works down to 'comparison' +def p_test(p): + "test : comparison" + p[0] = p[1] + + +# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test) +# XXX INCOMPLETE: this doesn't allow the trailing comma +def p_arglist(p): + """arglist : arglist COMMA argument + | argument""" + if len(p) == 4: + p[0] = p[1] + [p[3]] + else: + p[0] = [p[1]] + +# argument: test [gen_for] | test '=' test # Really [keyword '='] test + + +def p_argument(p): + "argument : test" + p[0] = p[1] + + +def p_error(p): + # print "Error!", repr(p) + raise SyntaxError(p) + + +class GardenSnakeParser(object): + + def __init__(self, lexer=None): + if lexer is None: + lexer = IndentLexer() + self.lexer = lexer + self.parser = yacc.yacc(start="file_input_end") + + def parse(self, code): + self.lexer.input(code) + result = self.parser.parse(lexer=self.lexer) + return ast.Module(None, result) + + +###### Code generation ###### + +from compiler import misc, syntax, pycodegen + + +class GardenSnakeCompiler(object): + + def __init__(self): + self.parser = GardenSnakeParser() + + def compile(self, code, filename="<string>"): + tree = self.parser.parse(code) + # print tree + misc.set_filename(filename, tree) + syntax.check(tree) + gen = pycodegen.ModuleCodeGenerator(tree) + code = gen.getCode() + return code + +####### Test code ####### + +compile = GardenSnakeCompiler().compile + +code = r""" + +print('LET\'S TRY THIS \\OUT') + +#Comment 
here
+def x(a):
+    print('called with',a)
+    if a == 1:
+        return 2
+    if a*2 > 10: return 999 / 4
+    # Another comment here
+
+    return a+2*3
+
+ints = (1, 2,
+   3, 4,
+5)
+print('multiline-expression', ints)
+
+t = 4+1/3*2+6*(9-5+1)
+print('precedence test; should be 34+2/3:', t, t==(34+2/3))
+
+print('numbers', 1,2,3,4,5)
+if 1:
+    8
+    a=9
+    print(x(a))
+
+print(x(1))
+print(x(2))
+print(x(8),'3')
+print('this is decimal', 1/5)
+print('BIG DECIMAL', 1.234567891234567e12345)
+
+"""
+
+# Set up the GardenSnake run-time environment
+
+
+def print_(*args):
+    print "-->", " ".join(map(str, args))
+
+globals()["print"] = print_
+
+compiled_code = compile(code)
+
+exec compiled_code in globals()
+print "Done"
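The INDENT/DEDENT filtering described in the comments above can be observed on its own. A sketch (Python 2, like the file itself; note that importing GardenSnake.py as-is also runs the demo at the bottom, so this assumes the lexer portion has been split out or the demo tail removed):

    lexer = IndentLexer()
    lexer.input("if 1:\n    x = 2\n")
    for tok in iter(lexer.token, None):
        print tok.type
    # -> IF NUMBER COLON NEWLINE INDENT NAME ASSIGN NUMBER NEWLINE DEDENT ENDMARKER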
new file mode 100644
--- /dev/null
+++ b/other-licenses/ply/example/GardenSnake/README
@@ -0,0 +1,5 @@
+This example is Andrew Dalke's GardenSnake language. It shows how to process an
+indentation-based language like Python. Further details can be found here:
+
+http://dalkescientific.com/writings/diary/archive/2006/08/30/gardensnake_language.html
+
new file mode 100644
--- /dev/null
+++ b/other-licenses/ply/example/README
@@ -0,0 +1,10 @@
+Simple examples:
+   calc            - Simple calculator
+   classcalc       - Simple calculator defined as a class
+
+Complex examples:
+   ansic           - ANSI C grammar from K&R
+   BASIC           - A small BASIC interpreter
+   GardenSnake     - A simple Python-like language
+   yply            - Converts Unix yacc files to PLY programs.
+
new file mode 100644
--- /dev/null
+++ b/other-licenses/ply/example/ansic/README
@@ -0,0 +1,2 @@
+This example is incomplete. It was going to specify an ANSI C parser;
+this is part of it.
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/ansic/clex.py @@ -0,0 +1,168 @@ +# ---------------------------------------------------------------------- +# clex.py +# +# A lexer for ANSI C. +# ---------------------------------------------------------------------- + +import sys +sys.path.insert(0, "../..") + +import ply.lex as lex + +# Reserved words +reserved = ( + 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST', 'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', + 'ELSE', 'ENUM', 'EXTERN', 'FLOAT', 'FOR', 'GOTO', 'IF', 'INT', 'LONG', 'REGISTER', + 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT', 'SWITCH', 'TYPEDEF', + 'UNION', 'UNSIGNED', 'VOID', 'VOLATILE', 'WHILE', +) + +tokens = reserved + ( + # Literals (identifier, integer constant, float constant, string constant, + # char const) + 'ID', 'TYPEID', 'ICONST', 'FCONST', 'SCONST', 'CCONST', + + # Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=) + 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD', + 'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT', + 'LOR', 'LAND', 'LNOT', + 'LT', 'LE', 'GT', 'GE', 'EQ', 'NE', + + # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=) + 'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL', + 'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL', + + # Increment/decrement (++,--) + 'PLUSPLUS', 'MINUSMINUS', + + # Structure dereference (->) + 'ARROW', + + # Conditional operator (?) + 'CONDOP', + + # Delimeters ( ) [ ] { } , . ; : + 'LPAREN', 'RPAREN', + 'LBRACKET', 'RBRACKET', + 'LBRACE', 'RBRACE', + 'COMMA', 'PERIOD', 'SEMI', 'COLON', + + # Ellipsis (...) + 'ELLIPSIS', +) + +# Completely ignored characters +t_ignore = ' \t\x0c' + +# Newlines + + +def t_NEWLINE(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +# Operators +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_MOD = r'%' +t_OR = r'\|' +t_AND = r'&' +t_NOT = r'~' +t_XOR = r'\^' +t_LSHIFT = r'<<' +t_RSHIFT = r'>>' +t_LOR = r'\|\|' +t_LAND = r'&&' +t_LNOT = r'!' +t_LT = r'<' +t_GT = r'>' +t_LE = r'<=' +t_GE = r'>=' +t_EQ = r'==' +t_NE = r'!=' + +# Assignment operators + +t_EQUALS = r'=' +t_TIMESEQUAL = r'\*=' +t_DIVEQUAL = r'/=' +t_MODEQUAL = r'%=' +t_PLUSEQUAL = r'\+=' +t_MINUSEQUAL = r'-=' +t_LSHIFTEQUAL = r'<<=' +t_RSHIFTEQUAL = r'>>=' +t_ANDEQUAL = r'&=' +t_OREQUAL = r'\|=' +t_XOREQUAL = r'\^=' + +# Increment/decrement +t_PLUSPLUS = r'\+\+' +t_MINUSMINUS = r'--' + +# -> +t_ARROW = r'->' + +# ? +t_CONDOP = r'\?' + +# Delimeters +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_LBRACKET = r'\[' +t_RBRACKET = r'\]' +t_LBRACE = r'\{' +t_RBRACE = r'\}' +t_COMMA = r',' +t_PERIOD = r'\.' +t_SEMI = r';' +t_COLON = r':' +t_ELLIPSIS = r'\.\.\.' + +# Identifiers and reserved words + +reserved_map = {} +for r in reserved: + reserved_map[r.lower()] = r + + +def t_ID(t): + r'[A-Za-z_][\w_]*' + t.type = reserved_map.get(t.value, "ID") + return t + +# Integer literal +t_ICONST = r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?' + +# Floating literal +t_FCONST = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?' 
+ +# String literal +t_SCONST = r'\"([^\\\n]|(\\.))*?\"' + +# Character constant 'c' or L'c' +t_CCONST = r'(L)?\'([^\\\n]|(\\.))*?\'' + +# Comments + + +def t_comment(t): + r'/\*(.|\n)*?\*/' + t.lexer.lineno += t.value.count('\n') + +# Preprocessor directive (ignored) + + +def t_preprocessor(t): + r'\#(.)*?\n' + t.lexer.lineno += 1 + + +def t_error(t): + print("Illegal character %s" % repr(t.value[0])) + t.lexer.skip(1) + +lexer = lex.lex() +if __name__ == "__main__": + lex.runmain(lexer)
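A usage sketch for the lexer above: the module builds its lexer at import time (lexer = lex.lex()), so tokenizing a string only needs input() and token():

    import clex

    clex.lexer.input("while (x <= 10) x++;")
    for tok in iter(clex.lexer.token, None):
        print(tok)  # e.g. LexToken(WHILE,'while',1,0), LexToken(LPAREN,'(',1,6), ...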
new file mode 100644 --- /dev/null +++ b/other-licenses/ply/example/ansic/cparse.py @@ -0,0 +1,1048 @@ +# ----------------------------------------------------------------------------- +# cparse.py +# +# Simple parser for ANSI C. Based on the grammar in K&R, 2nd Ed. +# ----------------------------------------------------------------------------- + +import sys +import clex +import ply.yacc as yacc + +# Get the token map +tokens = clex.tokens + +# translation-unit: + + +def p_translation_unit_1(t): + 'translation_unit : external_declaration' + pass + + +def p_translation_unit_2(t): + 'translation_unit : translation_unit external_declaration' + pass + +# external-declaration: + + +def p_external_declaration_1(t): + 'external_declaration : function_definition' + pass + + +def p_external_declaration_2(t): + 'external_declaration : declaration' + pass + +# function-definition: + + +def p_function_definition_1(t): + 'function_definition : declaration_specifiers declarator declaration_list compound_statement' + pass + + +def p_function_definition_2(t): + 'function_definition : declarator declaration_list compound_statement' + pass + + +def p_function_definition_3(t): + 'function_definition : declarator compound_statement' + pass + + +def p_function_definition_4(t): + 'function_definition : declaration_specifiers declarator compound_statement' + pass + +# declaration: + + +def p_declaration_1(t): + 'declaration : declaration_specifiers init_declarator_list SEMI' + pass + + +def p_declaration_2(t): + 'declaration : declaration_specifiers SEMI' + pass + +# declaration-list: + + +def p_declaration_list_1(t): + 'declaration_list : declaration' + pass + + +def p_declaration_list_2(t): + 'declaration_list : declaration_list declaration ' + pass + +# declaration-specifiers + + +def p_declaration_specifiers_1(t): + 'declaration_specifiers : storage_class_specifier declaration_specifiers' + pass + + +def p_declaration_specifiers_2(t): + 'declaration_specifiers : type_specifier declaration_specifiers' + pass + + +def p_declaration_specifiers_3(t): + 'declaration_specifiers : type_qualifier declaration_specifiers' + pass + + +def p_declaration_specifiers_4(t): + 'declaration_specifiers : storage_class_specifier' + pass + + +def p_declaration_specifiers_5(t): + 'declaration_specifiers : type_specifier' + pass + + +def p_declaration_specifiers_6(t): + 'declaration_specifiers : type_qualifier' + pass + +# storage-class-specifier + + +def p_storage_class_specifier(t): + '''storage_class_specifier : AUTO + | REGISTER + | STATIC + | EXTERN + | TYPEDEF + ''' + pass + +# type-specifier: + + +def p_type_specifier(t): + '''type_specifier : VOID + | CHAR + | SHORT + | INT + | LONG + | FLOAT + | DOUBLE + | SIGNED + | UNSIGNED + | struct_or_union_specifier + | enum_specifier + | TYPEID + ''' + pass + +# type-qualifier: + + +def p_type_qualifier(t): + '''type_qualifier : CONST + | VOLATILE''' + pass + +# struct-or-union-specifier + + +def p_struct_or_union_specifier_1(t): + 'struct_or_union_specifier : struct_or_union ID LBRACE struct_declaration_list RBRACE' + pass + + +def p_struct_or_union_specifier_2(t): + 'struct_or_union_specifier : struct_or_union LBRACE struct_declaration_list RBRACE' + pass + + +def p_struct_or_union_specifier_3(t): + 'struct_or_union_specifier : struct_or_union ID' + pass + +# struct-or-union: + + +def p_struct_or_union(t): + '''struct_or_union : STRUCT + | UNION + ''' + pass + +# struct-declaration-list: + + +def p_struct_declaration_list_1(t): + 'struct_declaration_list : struct_declaration' + 
pass + + +def p_struct_declaration_list_2(t): + 'struct_declaration_list : struct_declaration_list struct_declaration' + pass + +# init-declarator-list: + + +def p_init_declarator_list_1(t): + 'init_declarator_list : init_declarator' + pass + + +def p_init_declarator_list_2(t): + 'init_declarator_list : init_declarator_list COMMA init_declarator' + pass + +# init-declarator + + +def p_init_declarator_1(t): + 'init_declarator : declarator' + pass + + +def p_init_declarator_2(t): + 'init_declarator : declarator EQUALS initializer' + pass + +# struct-declaration: + + +def p_struct_declaration(t): + 'struct_declaration : specifier_qualifier_list struct_declarator_list SEMI' + pass + +# specifier-qualifier-list: + + +def p_specifier_qualifier_list_1(t): + 'specifier_qualifier_list : type_specifier specifier_qualifier_list' + pass + + +def p_specifier_qualifier_list_2(t): + 'specifier_qualifier_list : type_specifier' + pass + + +def p_specifier_qualifier_list_3(t): + 'specifier_qualifier_list : type_qualifier specifier_qualifier_list' + pass + + +def p_specifier_qualifier_list_4(t): + 'specifier_qualifier_list : type_qualifier' + pass + +# struct-declarator-list: + + +def p_struct_declarator_list_1(t): + 'struct_declarator_list : struct_declarator' + pass + + +def p_struct_declarator_list_2(t): + 'struct_declarator_list : struct_declarator_list COMMA struct_declarator' + pass + +# struct-declarator: + + +def p_struct_declarator_1(t): + 'struct_declarator : declarator' + pass + + +def p_struct_declarator_2(t): + 'struct_declarator : declarator COLON constant_expression' + pass + + +def p_struct_declarator_3(t): + 'struct_declarator : COLON constant_expression' + pass + +# enum-specifier: + + +def p_enum_specifier_1(t): + 'enum_specifier : ENUM ID LBRACE enumerator_list RBRACE' + pass + + +def p_enum_specifier_2(t): + 'enum_specifier : ENUM LBRACE enumerator_list RBRACE' + pass + + +def p_enum_specifier_3(t): + 'enum_specifier : ENUM ID' + pass + +# enumerator_list: + + +def p_enumerator_list_1(t): + 'enumerator_list : enumerator' + pass + + +def p_enumerator_list_2(t): + 'enumerator_list : enumerator_list COMMA enumerator' + pass + +# enumerator: + + +def p_enumerator_1(t): + 'enumerator : ID' + pass + + +def p_enumerator_2(t): + 'enumerator : ID EQUALS constant_expression' + pass + +# declarator: + + +def p_declarator_1(t): + 'declarator : pointer direct_declarator' + pass + + +def p_declarator_2(t): + 'declarator : direct_declarator' + pass + +# direct-declarator: + + +def p_direct_declarator_1(t): + 'direct_declarator : ID' + pass + + +def p_direct_declarator_2(t): + 'direct_declarator : LPAREN declarator RPAREN' + pass + + +def p_direct_declarator_3(t): + 'direct_declarator : direct_declarator LBRACKET constant_expression_opt RBRACKET' + pass + + +def p_direct_declarator_4(t): + 'direct_declarator : direct_declarator LPAREN parameter_type_list RPAREN ' + pass + + +def p_direct_declarator_5(t): + 'direct_declarator : direct_declarator LPAREN identifier_list RPAREN ' + pass + + +def p_direct_declarator_6(t): + 'direct_declarator : direct_declarator LPAREN RPAREN ' + pass + +# pointer: + + +def p_pointer_1(t): + 'pointer : TIMES type_qualifier_list' + pass + + +def p_pointer_2(t): + 'pointer : TIMES' + pass + + +def p_pointer_3(t): + 'pointer : TIMES type_qualifier_list pointer' + pass + + +def p_pointer_4(t): + 'pointer : TIMES pointer' + pass + +# type-qualifier-list: + + +def p_type_qualifier_list_1(t): + 'type_qualifier_list : type_qualifier' + pass + + +def p_type_qualifier_list_2(t): + 
'type_qualifier_list : type_qualifier_list type_qualifier' + pass + +# parameter-type-list: + + +def p_parameter_type_list_1(t): + 'parameter_type_list : parameter_list' + pass + + +def p_parameter_type_list_2(t): + 'parameter_type_list : parameter_list COMMA ELLIPSIS' + pass + +# parameter-list: + + +def p_parameter_list_1(t): + 'parameter_list : parameter_declaration' + pass + + +def p_parameter_list_2(t): + 'parameter_list : parameter_list COMMA parameter_declaration' + pass + +# parameter-declaration: + + +def p_parameter_declaration_1(t): + 'parameter_declaration : declaration_specifiers declarator' + pass + + +def p_parameter_declaration_2(t): + 'parameter_declaration : declaration_specifiers abstract_declarator_opt' + pass + +# identifier-list: + + +def p_identifier_list_1(t): + 'identifier_list : ID' + pass + + +def p_identifier_list_2(t): + 'identifier_list : identifier_list COMMA ID' + pass + +# initializer: + + +def p_initializer_1(t): + 'initializer : assignment_expression' + pass + + +def p_initializer_2(t): + '''initializer : LBRACE initializer_list RBRACE + | LBRACE initializer_list COMMA RBRACE''' + pass + +# initializer-list: + + +def p_initializer_list_1(t): + 'initializer_list : initializer' + pass + + +def p_initializer_list_2(t): + 'initializer_list : initializer_list COMMA initializer' + pass + +# type-name: + + +def p_type_name(t): + 'type_name : specifier_qualifier_list abstract_declarator_opt' + pass + + +def p_abstract_declarator_opt_1(t): + 'abstract_declarator_opt : empty' + pass + + +def p_abstract_declarator_opt_2(t): + 'abstract_declarator_opt : abstract_declarator' + pass + +# abstract-declarator: + + +def p_abstract_declarator_1(t): + 'abstract_declarator : pointer ' + pass + + +def p_abstract_declarator_2(t): + 'abstract_declarator : pointer direct_abstract_declarator' + pass + + +def p_abstract_declarator_3(t): + 'abstract_declarator : direct_abstract_declarator' + pass + +# direct-abstract-declarator: + + +def p_direct_abstract_declarator_1(t): + 'direct_abstract_declarator : LPAREN abstract_declarator RPAREN' + pass + + +def p_direct_abstract_declarator_2(t): + 'direct_abstract_declarator : direct_abstract_declarator LBRACKET constant_expression_opt RBRACKET' + pass + + +def p_direct_abstract_declarator_3(t): + 'direct_abstract_declarator : LBRACKET constant_expression_opt RBRACKET' + pass + + +def p_direct_abstract_declarator_4(t): + 'direct_abstract_declarator : direct_abstract_declarator LPAREN parameter_type_list_opt RPAREN' + pass + + +def p_direct_abstract_declarator_5(t): + 'direct_abstract_declarator : LPAREN parameter_type_list_opt RPAREN' + pass + +# Optional fields in abstract declarators + + +def p_constant_expression_opt_1(t): + 'constant_expression_opt : empty' + pass + + +def p_constant_expression_opt_2(t): + 'constant_expression_opt : constant_expression' + pass + + +def p_parameter_type_list_opt_1(t): + 'parameter_type_list_opt : empty' + pass + + +def p_parameter_type_list_opt_2(t): + 'parameter_type_list_opt : parameter_type_list' + pass + +# statement: + + +def p_statement(t): + ''' + statement : labeled_statement + | expression_statement + | compound_statement + | selection_statement + | iteration_statement + | jump_statement + ''' + pass + +# labeled-statement: + + +def p_labeled_statement_1(t): + 'labeled_statement : ID COLON statement' + pass + + +def p_labeled_statement_2(t): + 'labeled_statement : CASE constant_expression COLON statement' + pass + + +def p_labeled_statement_3(t): + 'labeled_statement : DEFAULT COLON statement' 
+ pass + +# expression-statement: + + +def p_expression_statement(t): + 'expression_statement : expression_opt SEMI' + pass + +# compound-statement: + + +def p_compound_statement_1(t): + 'compound_statement : LBRACE declaration_list statement_list RBRACE' + pass + + +def p_compound_statement_2(t): + 'compound_statement : LBRACE statement_list RBRACE' + pass + + +def p_compound_statement_3(t): + 'compound_statement : LBRACE declaration_list RBRACE' + pass + + +def p_compound_statement_4(t): + 'compound_statement : LBRACE RBRACE' + pass + +# statement-list: + + +def p_statement_list_1(t): + 'statement_list : statement' + pass + + +def p_statement_list_2(t): + 'statement_list : statement_list statement' + pass + +# selection-statement + + +def p_selection_statement_1(t): + 'selection_statement : IF LPAREN expression RPAREN statement' + pass + + +def p_selection_statement_2(t): + 'selection_statement : IF LPAREN expression RPAREN statement ELSE statement ' + pass + + +def p_selection_statement_3(t): + 'selection_statement : SWITCH LPAREN expression RPAREN statement ' + pass + +# iteration_statement: + + +def p_iteration_statement_1(t): + 'iteration_statement : WHILE LPAREN expression RPAREN statement' + pass + + +def p_iteration_statement_2(t): + 'iteration_statement : FOR LPAREN expression_opt SEMI expression_opt SEMI expression_opt RPAREN statement ' + pass + + +def p_iteration_statement_3(t): + 'iteration_statement : DO statement WHILE LPAREN expression RPAREN SEMI' + pass + +# jump_statement: + + +def p_jump_statement_1(t): + 'jump_statement : GOTO ID SEMI' + pass + + +def p_jump_statement_2(t): + 'jump_statement : CONTINUE SEMI' + pass + + +def p_jump_statement_3(t): + 'jump_statement : BREAK SEMI' + pass + + +def p_jump_statement_4(t): + 'jump_statement : RETURN expression_opt SEMI' + pass + + +def p_expression_opt_1(t): + 'expression_opt : empty' + pass + + +def p_expression_opt_2(t): + 'expression_opt : expression' + pass + +# expression: + + +def p_expression_1(t): + 'expression : assignment_expression' + pass + + +def p_expression_2(t): + 'expression : expression COMMA assignment_expression' + pass + +# assigment_expression: + + +def p_assignment_expression_1(t): + 'assignment_expression : conditional_expression' + pass + + +def p_assignment_expression_2(t): + 'assignment_expression : unary_expression assignment_operator assignment_expression' + pass + +# assignment_operator: + + +def p_assignment_operator(t): + ''' + assignment_operator : EQUALS + | TIMESEQUAL + | DIVEQUAL + | MODEQUAL + | PLUSEQUAL + | MINUSEQUAL + | LSHIFTEQUAL + | RSHIFTEQUAL + | ANDEQUAL + | OREQUAL + | XOREQUAL + ''' + pass + +# conditional-expression + + +def p_conditional_expression_1(t): + 'conditional_expression : logical_or_expression' + pass + + +def p_conditional_expression_2(t): + 'conditional_expression : logical_or_expression CONDOP expression COLON conditional_expression ' + pass + +# constant-expression + + +def p_constant_expression(t): + 'constant_expression : conditional_expression' + pass + +# logical-or-expression + + +def p_logical_or_expression_1(t): + 'logical_or_expression : logical_and_expression' + pass + + +def p_logical_or_expression_2(t): + 'logical_or_expression : logical_or_expression LOR logical_and_expression' + pass + +# logical-and-expression + + +def p_logical_and_expression_1(t): + 'logical_and_expression : inclusive_or_expression' + pass + + +def p_logical_and_expression_2(t): + 'logical_and_expression : logical_and_expression LAND inclusive_or_expression' + pass + +# 
+
+
+def p_inclusive_or_expression_1(t):
+    'inclusive_or_expression : exclusive_or_expression'
+    pass
+
+
+def p_inclusive_or_expression_2(t):
+    'inclusive_or_expression : inclusive_or_expression OR exclusive_or_expression'
+    pass
+
+# exclusive-or-expression:
+
+
+def p_exclusive_or_expression_1(t):
+    'exclusive_or_expression : and_expression'
+    pass
+
+
+def p_exclusive_or_expression_2(t):
+    'exclusive_or_expression : exclusive_or_expression XOR and_expression'
+    pass
+
+# AND-expression:
+
+
+def p_and_expression_1(t):
+    'and_expression : equality_expression'
+    pass
+
+
+def p_and_expression_2(t):
+    'and_expression : and_expression AND equality_expression'
+    pass
+
+
+# equality-expression:
+def p_equality_expression_1(t):
+    'equality_expression : relational_expression'
+    pass
+
+
+def p_equality_expression_2(t):
+    'equality_expression : equality_expression EQ relational_expression'
+    pass
+
+
+def p_equality_expression_3(t):
+    'equality_expression : equality_expression NE relational_expression'
+    pass
+
+
+# relational-expression:
+def p_relational_expression_1(t):
+    'relational_expression : shift_expression'
+    pass
+
+
+def p_relational_expression_2(t):
+    'relational_expression : relational_expression LT shift_expression'
+    pass
+
+
+def p_relational_expression_3(t):
+    'relational_expression : relational_expression GT shift_expression'
+    pass
+
+
+def p_relational_expression_4(t):
+    'relational_expression : relational_expression LE shift_expression'
+    pass
+
+
+def p_relational_expression_5(t):
+    'relational_expression : relational_expression GE shift_expression'
+    pass
+
+# shift-expression:
+
+
+def p_shift_expression_1(t):
+    'shift_expression : additive_expression'
+    pass
+
+
+def p_shift_expression_2(t):
+    'shift_expression : shift_expression LSHIFT additive_expression'
+    pass
+
+
+def p_shift_expression_3(t):
+    'shift_expression : shift_expression RSHIFT additive_expression'
+    pass
+
+# additive-expression:
+
+
+def p_additive_expression_1(t):
+    'additive_expression : multiplicative_expression'
+    pass
+
+
+def p_additive_expression_2(t):
+    'additive_expression : additive_expression PLUS multiplicative_expression'
+    pass
+
+
+def p_additive_expression_3(t):
+    'additive_expression : additive_expression MINUS multiplicative_expression'
+    pass
+
+# multiplicative-expression:
+
+
+def p_multiplicative_expression_1(t):
+    'multiplicative_expression : cast_expression'
+    pass
+
+
+def p_multiplicative_expression_2(t):
+    'multiplicative_expression : multiplicative_expression TIMES cast_expression'
+    pass
+
+
+def p_multiplicative_expression_3(t):
+    'multiplicative_expression : multiplicative_expression DIVIDE cast_expression'
+    pass
+
+
+def p_multiplicative_expression_4(t):
+    'multiplicative_expression : multiplicative_expression MOD cast_expression'
+    pass
+
+# cast-expression:
+
+
+def p_cast_expression_1(t):
+    'cast_expression : unary_expression'
+    pass
+
+
+def p_cast_expression_2(t):
+    'cast_expression : LPAREN type_name RPAREN cast_expression'
+    pass
+
+# unary-expression:
+
+
+def p_unary_expression_1(t):
+    'unary_expression : postfix_expression'
+    pass
+
+
+def p_unary_expression_2(t):
+    'unary_expression : PLUSPLUS unary_expression'
+    pass
+
+
+def p_unary_expression_3(t):
+    'unary_expression : MINUSMINUS unary_expression'
+    pass
+
+
+def p_unary_expression_4(t):
+    'unary_expression : unary_operator cast_expression'
+    pass
+
+
+def p_unary_expression_5(t):
+    'unary_expression : SIZEOF unary_expression'
+    pass
+
+
+def p_unary_expression_6(t):
+    'unary_expression : SIZEOF LPAREN type_name RPAREN'
+    pass
+
+# unary-operator:
+
+
+def p_unary_operator(t):
+    '''unary_operator : AND
+                      | TIMES
+                      | PLUS
+                      | MINUS
+                      | NOT
+                      | LNOT'''
+    pass
+
+# postfix-expression:
+
+
+def p_postfix_expression_1(t):
+    'postfix_expression : primary_expression'
+    pass
+
+
+def p_postfix_expression_2(t):
+    'postfix_expression : postfix_expression LBRACKET expression RBRACKET'
+    pass
+
+
+def p_postfix_expression_3(t):
+    'postfix_expression : postfix_expression LPAREN argument_expression_list RPAREN'
+    pass
+
+
+def p_postfix_expression_4(t):
+    'postfix_expression : postfix_expression LPAREN RPAREN'
+    pass
+
+
+def p_postfix_expression_5(t):
+    'postfix_expression : postfix_expression PERIOD ID'
+    pass
+
+
+def p_postfix_expression_6(t):
+    'postfix_expression : postfix_expression ARROW ID'
+    pass
+
+
+def p_postfix_expression_7(t):
+    'postfix_expression : postfix_expression PLUSPLUS'
+    pass
+
+
+def p_postfix_expression_8(t):
+    'postfix_expression : postfix_expression MINUSMINUS'
+    pass
+
+# primary-expression:
+
+
+def p_primary_expression(t):
+    '''primary_expression : ID
+                          | constant
+                          | SCONST
+                          | LPAREN expression RPAREN'''
+    pass
+
+# argument-expression-list:
+
+
+def p_argument_expression_list(t):
+    '''argument_expression_list : assignment_expression
+                                | argument_expression_list COMMA assignment_expression'''
+    pass
+
+# constant:
+
+
+def p_constant(t):
+    '''constant : ICONST
+                | FCONST
+                | CCONST'''
+    pass
+
+
+def p_empty(t):
+    'empty : '
+    pass
+
+
+def p_error(t):
+    print("Whoa. We're hosed")
+
+import profile  # only needed for the commented-out profiling run below
+# Build the grammar
+
+yacc.yacc()
+# yacc.yacc(method='LALR', write_tables=False, debug=False)
+
+# profile.run("yacc.yacc(method='LALR')")
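Two things about the grammar file above are worth calling out. First, every p_* function encodes exactly one production (or a group of alternatives) in its docstring and leaves the action body as `pass`, so the parser recognizes ANSI C syntax without building any result; yacc.yacc() at the bottom constructs the LALR tables purely from those docstrings. Second, operator precedence is not declared with a precedence table here; it is baked into the rule layering itself (additive_expression is built from multiplicative_expression, and so on down to primary_expression). The following self-contained sketch illustrates both points with the same p_-function convention; the token and rule names are invented for the demo and are not part of the vendored file, and real actions assign to t[0] where the file above uses `pass`:

import ply.lex as lex
import ply.yacc as yacc

tokens = ('NUMBER', 'PLUS', 'TIMES')

t_PLUS = r'\+'
t_TIMES = r'\*'
t_ignore = ' '


def t_NUMBER(t):
    r'\d+'
    t.value = int(t.value)
    return t


def t_error(t):
    t.lexer.skip(1)


# Precedence falls out of the layering, as in the C grammar above:
# 'expr' (addition) is built from 'term' (multiplication), so TIMES
# binds tighter than PLUS without any precedence declarations.
def p_expr_plus(t):
    'expr : expr PLUS term'
    t[0] = t[1] + t[3]      # t[0] carries the computed value up the parse


def p_expr_term(t):
    'expr : term'
    t[0] = t[1]


def p_term_times(t):
    'term : term TIMES NUMBER'
    t[0] = t[1] * t[3]


def p_term_number(t):
    'term : NUMBER'
    t[0] = t[1]


def p_error(t):
    print("Syntax error")


lex.lex()
parser = yacc.yacc()
print(parser.parse("2 + 3 * 4"))    # prints 14, not 20

Had the multiplication rules been folded into 'expr' at the same level, the grammar would be ambiguous and PLY would report shift/reduce conflicts; the extra nonterminal is what resolves them, which is exactly why the C grammar needs its long conditional/logical/shift/additive/multiplicative cascade.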
new file mode 100644
--- /dev/null
+++ b/other-licenses/ply/example/calc/calc.py
@@ -0,0 +1,123 @@
+# -----------------------------------------------------------------------------
+# calc.py
+#
+# A simple calculator with variables. This is from O'Reilly's
+# "Lex and Yacc", p. 63.
+# -----------------------------------------------------------------------------
+
+import sys
+sys.path.insert(0, "../..")
+
+if sys.version_info[0] >= 3:
+    raw_input = input
+
+tokens = (
+    'NAME', 'NUMBER',
+)
+
+literals = ['=', '+', '-', '*', '/', '(', ')']
+
+# Tokens
+
+t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
+
+
+def t_NUMBER(t):
+    r'\d+'
+    t.value = int(t.value)
+    return t
+
+t_ignore = " \t"
+
+
+def t_newline(t):
+    r'\n+'
+    t.lexer.lineno += t.value.count("\n")
+
+
+def t_error(t):
+    print("Illegal character '%s'" % t.value[0])
+    t.lexer.skip(1)
+
+# Build the lexer
+import ply.lex as lex
+lex.lex()
+
+# Parsing rules
+
+precedence = (
+    ('left', '+', '-'),
+    ('left', '*', '/'),
+    ('right', 'UMINUS'),
+)
+
+# dictionary of names
+names = {}
+
+
+def p_statement_assign(p):
+    'statement : NAME "=" expression'
+    names[p[1]] = p[3]
+
+
+def p_statement_expr(p):
+    'statement : expression'
+    print(p[1])
+
+
+def p_expression_binop(p):
+    '''expression : expression '+' expression
+                  | expression '-' expression
+                  | expression '*' expression
+                  | expression '/' expression'''
+    if p[2] == '+':
+        p[0] = p[1] + p[3]
+    elif p[2] == '-':
+        p[0] = p[1] - p[3]
+    elif p[2] == '*':
+        p[0] = p[1] * p[3]
+    elif p[2] == '/':
+        p[0] = p[1] / p[3]
+
+
+def p_expression_uminus(p):
+    "expression : '-' expression %prec UMINUS"
+    p[0] = -p[2]
+
+
+def p_expression_group(p):
+    "expression : '(' expression ')'"
+    p[0] = p[2]
+
+
+def p_expression_number(p):
+    "expression : NUMBER"
+    p[0] = p[1]
+
+
+def p_expression_name(p):
+    "expression : NAME"
+    try:
+        p[0] = names[p[1]]
+    except LookupError:
+        print("Undefined name '%s'" % p[1])
+        p[0] = 0
+
+
+def p_error(p):
+    if p:
+        print("Syntax error at '%s'" % p.value)
+    else:
+        print("Syntax error at EOF")
+
+import ply.yacc as yacc
+yacc.yacc()
+
+while True:
+    try:
+        s = raw_input('calc > ')
+    except EOFError:
+        break
+    if not s:
+        continue
+    yacc.parse(s)
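Unlike the C grammar, calc.py declares precedence explicitly via the `precedence` table: '+'/'-' and '*'/'/' are left-associative, and the `%prec UMINUS` annotation gives unary minus the higher precedence of the fictitious UMINUS level. Run from its own directory (the sys.path.insert two levels up is what lets it find the in-tree ply package), a session might look like this; assignments print nothing because p_statement_assign only stores into the names dictionary, and an undefined name evaluates to 0 after a diagnostic:

$ python calc.py
calc > x = 3 * (4 + 5)
calc > x + 1
28
calc > y
Undefined name 'y'
0

On the first run, yacc.yacc() may also write its generated parser tables and a parser.out debug listing into the working directory so that later runs can skip table construction.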