1"Parses and creates Grammar objects"
2
3import os.path
4import sys
5from copy import copy, deepcopy
6from io import open
7
8from .utils import bfs, eval_escaping
9from .lexer import Token, TerminalDef, PatternStr, PatternRE
10
11from .parse_tree_builder import ParseTreeBuilder
12from .parser_frontends import LALR_TraditionalLexer
13from .common import LexerConf, ParserConf
14from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
15from .utils import classify, suppress, dedup_list, Str
16from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
17
18from .tree import Tree, SlottedTree as ST
19from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive
20inline_args = v_args(inline=True)
21
22__path__ = os.path.dirname(__file__)
23IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
24
25EXT = '.lark'
26
27_RE_FLAGS = 'imslux'
28
29_EMPTY = Symbol('__empty__')
30
31_TERMINAL_NAMES = {
32    '.' : 'DOT',
33    ',' : 'COMMA',
34    ':' : 'COLON',
35    ';' : 'SEMICOLON',
36    '+' : 'PLUS',
37    '-' : 'MINUS',
38    '*' : 'STAR',
39    '/' : 'SLASH',
40    '\\' : 'BACKSLASH',
41    '|' : 'VBAR',
42    '?' : 'QMARK',
43    '!' : 'BANG',
44    '@' : 'AT',
45    '#' : 'HASH',
46    '$' : 'DOLLAR',
47    '%' : 'PERCENT',
48    '^' : 'CIRCUMFLEX',
49    '&' : 'AMPERSAND',
50    '_' : 'UNDERSCORE',
51    '<' : 'LESSTHAN',
52    '>' : 'MORETHAN',
53    '=' : 'EQUAL',
54    '"' : 'DBLQUOTE',
55    '\'' : 'QUOTE',
56    '`' : 'BACKQUOTE',
57    '~' : 'TILDE',
58    '(' : 'LPAR',
59    ')' : 'RPAR',
60    '{' : 'LBRACE',
61    '}' : 'RBRACE',
62    '[' : 'LSQB',
63    ']' : 'RSQB',
64    '\n' : 'NEWLINE',
65    '\r\n' : 'CRLF',
66    '\t' : 'TAB',
67    ' ' : 'SPACE',
68}
69
70# Grammar Parser
71TERMINALS = {
72    '_LPAR': r'\(',
73    '_RPAR': r'\)',
74    '_LBRA': r'\[',
75    '_RBRA': r'\]',
76    '_LBRACE': r'\{',
77    '_RBRACE': r'\}',
78    'OP': '[+*]|[?](?![a-z])',
79    '_COLON': ':',
80    '_COMMA': ',',
81    '_OR': r'\|',
82    '_DOT': r'\.(?!\.)',
83    '_DOTDOT': r'\.\.',
84    'TILDE': '~',
85    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
86    'TERMINAL': '_?[A-Z][_A-Z0-9]*',
87    'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
88    'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
89    '_NL': r'(\r?\n)+\s*',
90    'WS': r'[ \t]+',
91    'COMMENT': r'\s*//[^\n]*',
92    '_TO': '->',
93    '_IGNORE': r'%ignore',
94    '_DECLARE': r'%declare',
95    '_IMPORT': r'%import',
96    'NUMBER': r'[+-]?\d+',
97}

RULES = {
    'start': ['_list'],
    '_list':  ['_item', '_list _item'],
    '_item':  ['rule', 'term', 'statement', '_NL'],

    'rule': ['RULE template_params _COLON expansions _NL',
             'RULE template_params _DOT NUMBER _COLON expansions _NL'],
    'template_params': ['_LBRACE _template_params _RBRACE',
                        ''],
    '_template_params': ['RULE',
                         '_template_params _COMMA RULE'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],

    '?alias':     ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP',
              'atom TILDE NUMBER',
              'atom TILDE NUMBER _DOTDOT NUMBER',
              ],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'value'],

    'value': ['terminal',
              'nonterminal',
              'literal',
              'range',
              'template_usage'],

    'terminal': ['TERMINAL'],
    'nonterminal': ['RULE'],

    '?name': ['RULE', 'TERMINAL'],

    'maybe': ['_LBRA expansions _RBRA'],
    'range': ['STRING _DOTDOT STRING'],

    'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
    '_template_args': ['value',
                       '_template_args _COMMA value'],

    'term': ['TERMINAL _COLON expansions _NL',
             'TERMINAL _DOT NUMBER _COLON expansions _NL'],
    'statement': ['ignore', 'import', 'declare'],
    'ignore': ['_IGNORE expansions _NL'],
    'declare': ['_DECLARE _declare_args _NL'],
    'import': ['_IMPORT _import_path _NL',
               '_IMPORT _import_path _LPAR name_list _RPAR _NL',
               '_IMPORT _import_path _TO name _NL'],

    '_import_path': ['import_lib', 'import_rel'],
    'import_lib': ['_import_args'],
    'import_rel': ['_DOT _import_args'],
    '_import_args': ['name', '_import_args _DOT name'],

    'name_list': ['_name_list'],
    '_name_list': ['name', '_name_list _COMMA name'],

    '_declare_args': ['name', '_declare_args name'],
    'literal': ['REGEXP', 'STRING'],
}
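
# For reference, a tiny grammar accepted by the meta-grammar defined above
# (an illustrative sketch, not one of the bundled grammar files):
#
#     start: NAME ("," NAME)*
#     NAME: /[a-z]+/
#     %ignore " "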

@inline_args
class EBNF_to_BNF(Transformer_InPlace):
    "Converts EBNF operators (?, *, +, ~) into plain BNF, adding helper rules as needed"

    def __init__(self):
        self.new_rules = []
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0
        self.rule_options = None

    def _add_recurse_rule(self, type_, expr):
        if expr in self.rules_by_expr:
            return self.rules_by_expr[expr]

        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = NonTerminal(new_name)
        tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
        self.new_rules.append((new_name, tree, self.rule_options))
        self.rules_by_expr[expr] = t
        return t

    def expr(self, rule, op, *args):
        if op.value == '?':
            empty = ST('expansion', [])
            return ST('expansions', [rule, empty])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return ST('expansions', [new_name, ST('expansion', [])])
        elif op.value == '~':
            if len(args) == 1:
                mn = mx = int(args[0])
            else:
                mn, mx = map(int, args)
                if mx < mn or mn < 0:
                    raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
        assert False, op

    def maybe(self, rule):
        keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens

        def will_not_get_removed(sym):
            if isinstance(sym, NonTerminal):
                return not sym.name.startswith('_')
            if isinstance(sym, Terminal):
                return keep_all_tokens or not sym.filter_out
            assert False

        if any(rule.scan_values(will_not_get_removed)):
            empty = _EMPTY
        else:
            empty = ST('expansion', [])

        return ST('expansions', [rule, empty])


class SimplifyRule_Visitor(Visitor):
    "Flattens nested expansions and distributes inlined alternations into separate expansions"

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, Tree) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #  -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        self._flatten(tree)

        for i, child in enumerate(tree.children):
            if isinstance(child, Tree) and child.data == 'expansions':
                tree.data = 'expansions'
                tree.children = [self.visit(ST('expansion', [option if i==j else other
                                                            for j, other in enumerate(tree.children)]))
                                    for option in dedup_list(child.children)]
                self._flatten(tree)
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(ST('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    def expansions(self, tree):
        self._flatten(tree)
        # Ensure all children are unique
        if len(set(tree.children)) != len(tree.children):
            tree.children = dedup_list(tree.children)   # dedup is expensive, so try to minimize its use


class RuleTreeToText(Transformer):
    "Converts rule trees into lists of (expansion, alias) pairs"
    def expansions(self, x):
        return x
    def expansion(self, symbols):
        return symbols, None
    def alias(self, x):
        (expansion, _alias), alias = x
        assert _alias is None, (alias, expansion, '-', _alias)  # Double alias not allowed
        return expansion, alias.value


@inline_args
class CanonizeTree(Transformer_InPlace):
    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]

class PrepareAnonTerminals(Transformer_InPlace):
    "Creates a unique list of anonymous terminals. Attempts to give meaningful names to them when we add them"

    def __init__(self, terminals):
        self.terminals = terminals
        self.term_set = {td.name for td in self.terminals}
        self.term_reverse = {td.pattern: td for td in terminals}
        self.i = 0


    @inline_args
    def pattern(self, p):
        value = p.value
        if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags:
            raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

        term_name = None

        if isinstance(p, PatternStr):
            try:
                # If already defined, use the user-defined terminal name
                term_name = self.term_reverse[p].name
            except KeyError:
                # Try to assign an indicative anon-terminal name
                try:
                    term_name = _TERMINAL_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
                        with suppress(UnicodeEncodeError):
                            value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
                            term_name = value.upper()

                if term_name in self.term_set:
                    term_name = None

        elif isinstance(p, PatternRE):
            if p in self.term_reverse: # Reuse the name of an existing terminal with the same pattern
                term_name = self.term_reverse[p].name
        else:
            assert False, p

        if term_name is None:
            term_name = '__ANON_%d' % self.i
            self.i += 1

        if term_name not in self.term_set:
            assert p not in self.term_reverse
            self.term_set.add(term_name)
            termdef = TerminalDef(term_name, p)
            self.term_reverse[p] = termdef
            self.terminals.append(termdef)

        return Terminal(term_name, filter_out=isinstance(p, PatternStr))
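
# For example: an anonymous string like "," gets the name COMMA via _TERMINAL_NAMES,
# "while" becomes the terminal WHILE (when that name is free and ASCII-safe), and
# patterns with no obvious name fall back to __ANON_0, __ANON_1, ...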

class _ReplaceSymbols(Transformer_InPlace):
    " Helper for ApplyTemplates "

    def __init__(self):
        self.names = {}

    def value(self, c):
        if len(c) == 1 and isinstance(c[0], Token) and c[0].value in self.names:
            return self.names[c[0].value]
        return self.__default__('value', c, None)

    def template_usage(self, c):
        if c[0] in self.names:
            return self.__default__('template_usage', [self.names[c[0]].name] + c[1:], None)
        return self.__default__('template_usage', c, None)

class ApplyTemplates(Transformer_InPlace):
    " Apply the templates, creating new rules that represent the used templates "

    def __init__(self, rule_defs):
        self.rule_defs = rule_defs
        self.replacer = _ReplaceSymbols()
        self.created_templates = set()

    def template_usage(self, c):
        name = c[0]
        args = c[1:]
        result_name = "%s{%s}" % (name, ",".join(a.name for a in args))
        if result_name not in self.created_templates:
            self.created_templates.add(result_name)
            (_n, params, tree, options) ,= (t for t in self.rule_defs if t[0] == name)
            assert len(params) == len(args), args
            result_tree = deepcopy(tree)
            self.replacer.names = dict(zip(params, args))
            self.replacer.transform(result_tree)
            self.rule_defs.append((result_name, [], result_tree, deepcopy(options)))
        return NonTerminal(result_name)


def _rfind(s, choices):
    return max(s.rfind(c) for c in choices)


def _literal_to_pattern(literal):
    v = literal.value
    flag_start = _rfind(v, '/"')+1
    assert flag_start > 0
    flags = v[flag_start:]
    assert all(f in _RE_FLAGS for f in flags), flags

    v = v[:flag_start]
    assert v[0] == v[-1] and v[0] in '"/'
    x = v[1:-1]

    s = eval_escaping(x)

    if literal.type == 'STRING':
        s = s.replace('\\\\', '\\')

    return { 'STRING': PatternStr,
             'REGEXP': PatternRE }[literal.type](s, flags)
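
# For example: the literal "abc"i becomes PatternStr('abc', 'i'), while the
# regexp literal /a+/i becomes PatternRE('a+', 'i').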


@inline_args
class PrepareLiterals(Transformer_InPlace):
    def literal(self, literal):
        return ST('pattern', [_literal_to_pattern(literal)])

    def range(self, start, end):
        assert start.type == end.type == 'STRING'
        start = start.value[1:-1]
        end = end.value[1:-1]
        assert len(eval_escaping(start)) == len(eval_escaping(end)) == 1, (start, end, len(eval_escaping(start)), len(eval_escaping(end)))
        regexp = '[%s-%s]' % (start, end)
        return ST('pattern', [PatternRE(regexp)])


class TerminalTreeToPattern(Transformer):
    "Converts a terminal's parse tree into a single Pattern (string or regexp)"
    def pattern(self, ps):
        p ,= ps
        return p

    def expansion(self, items):
        assert items
        if len(items) == 1:
            return items[0]
        if len({i.flags for i in items}) > 1:
            raise GrammarError("Lark doesn't support joining terminals with conflicting flags!")
        return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ())

    def expansions(self, exps):
        if len(exps) == 1:
            return exps[0]
        if len({i.flags for i in exps}) > 1:
            raise GrammarError("Lark doesn't support joining terminals with conflicting flags!")
        return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags)

    def expr(self, args):
        inner, op = args[:2]
        if op == '~':
            if len(args) == 3:
                op = "{%d}" % int(args[2])
            else:
                mn, mx = map(int, args[2:])
                if mx < mn:
                    raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx))
                op = "{%d,%d}" % (mn, mx)
        else:
            assert len(args) == 2
        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)

    def maybe(self, expr):
        return self.expr(expr + ['?'])

    def alias(self, t):
        raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")

    def value(self, v):
        return v[0]

class PrepareSymbols(Transformer_InPlace):
    "Turns RULE and TERMINAL tokens into NonTerminal and Terminal symbols"
    def value(self, v):
        v ,= v
        if isinstance(v, Tree):
            return v
        elif v.type == 'RULE':
            return NonTerminal(Str(v.value))
        elif v.type == 'TERMINAL':
            return Terminal(Str(v.value), filter_out=v.startswith('_'))
        assert False

def _choice_of_rules(rules):
    return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])

def nr_deepcopy_tree(t):
    "Deepcopy tree `t` without recursion"
    return Transformer_NonRecursive(False).transform(t)

class Grammar:
    def __init__(self, rule_defs, term_defs, ignore):
        self.term_defs = term_defs
        self.rule_defs = rule_defs
        self.ignore = ignore

    def compile(self, start):
        "Compile into TerminalDefs and Rules; returns (terminals, rules, ignore_names)"
        # We change the trees in-place (to support huge grammars)
        # So deepcopy allows calling compile more than once.
        term_defs = deepcopy(list(self.term_defs))
        rule_defs = [(n,p,nr_deepcopy_tree(t),o) for n,p,t,o in self.rule_defs]

        # ===================
        #  Compile Terminals
        # ===================

        # Convert terminal-trees to strings/regexps

        for name, (term_tree, priority) in term_defs:
            if term_tree is None:  # Terminal added through %declare
                continue
            expansions = list(term_tree.find_data('expansion'))
            if len(expansions) == 1 and not expansions[0].children:
                raise GrammarError("Terminals cannot be empty (%s)" % name)

        transformer = PrepareLiterals() * TerminalTreeToPattern()
        terminals = [TerminalDef(name, transformer.transform( term_tree ), priority)
                     for name, (term_tree, priority) in term_defs if term_tree]

        # =================
        #  Compile Rules
        # =================

        # 1. Pre-process terminals
        transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals)  # Adds to terminals

        # 2. Inline Templates

        transformer *= ApplyTemplates(rule_defs)

        # 3. Convert EBNF to BNF (and apply step 1 & 2)
        ebnf_to_bnf = EBNF_to_BNF()
        rules = []
        i = 0
        while i < len(rule_defs): # We have to do it like this because rule_defs might grow due to templates
            name, params, rule_tree, options = rule_defs[i]
            i += 1
            if len(params) != 0: # Don't transform templates
                continue
            ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options.keep_all_tokens else None
            ebnf_to_bnf.prefix = name
            tree = transformer.transform(rule_tree)
            res = ebnf_to_bnf.transform(tree)
            rules.append((name, res, options))
        rules += ebnf_to_bnf.new_rules

        assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"

        # 4. Compile tree to Rule objects
        rule_tree_to_text = RuleTreeToText()

        simplify_rule = SimplifyRule_Visitor()
        compiled_rules = []
        for rule_content in rules:
            name, tree, options = rule_content
            simplify_rule.visit(tree)
            expansions = rule_tree_to_text.transform(tree)

            for i, (expansion, alias) in enumerate(expansions):
                if alias and name.startswith('_'):
                    raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))

                empty_indices = [x==_EMPTY for x in expansion]
                if any(empty_indices):
                    exp_options = copy(options) or RuleOptions()
                    exp_options.empty_indices = empty_indices
                    expansion = [x for x in expansion if x!=_EMPTY]
                else:
                    exp_options = options

                assert all(isinstance(x, Symbol) for x in expansion), expansion
                rule = Rule(NonTerminal(name), expansion, i, alias, exp_options)
                compiled_rules.append(rule)

        # Remove duplicates of empty rules, throw error for non-empty duplicates
        if len(set(compiled_rules)) != len(compiled_rules):
            duplicates = classify(compiled_rules, lambda x: x)
            for dups in duplicates.values():
                if len(dups) > 1:
                    if dups[0].expansion:
                        raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)"
                                           % ''.join('\n  * %s' % i for i in dups))

                    # Empty rule; assert all other attributes are equal
                    assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)

            # Remove duplicates
            compiled_rules = list(set(compiled_rules))


        # Filter out unused rules
        while True:
            c = len(compiled_rules)
            used_rules = {s for r in compiled_rules
                                for s in r.expansion
                                if isinstance(s, NonTerminal)
                                and s != r.origin}
            used_rules |= {NonTerminal(s) for s in start}
            compiled_rules = [r for r in compiled_rules if r.origin in used_rules]
            if len(compiled_rules) == c:
                break

        # Filter out unused terminals
        used_terms = {t.name for r in compiled_rules
                             for t in r.expansion
                             if isinstance(t, Terminal)}
        terminals = [t for t in terminals if t.name in used_terms or t.name in self.ignore]

        return terminals, compiled_rules, self.ignore


_imported_grammars = {}
def import_grammar(grammar_path, re_, base_paths=[]):
    if grammar_path not in _imported_grammars:
        import_paths = base_paths + IMPORT_PATHS
        for import_path in import_paths:
            with suppress(IOError):
                joined_path = os.path.join(import_path, grammar_path)
                with open(joined_path, encoding='utf8') as f:
                    text = f.read()
                grammar = load_grammar(text, joined_path, re_)
                _imported_grammars[grammar_path] = grammar
                break
        else:
            # Not found in any import path; open() is expected to raise the appropriate IOError
            open(grammar_path, encoding='utf8')
            assert False

    return _imported_grammars[grammar_path]

def import_from_grammar_into_namespace(grammar, namespace, aliases):
    """Returns all rules and terminals of grammar, prepended
    with a 'namespace' prefix, except for those which are aliased.
    """

    imported_terms = dict(grammar.term_defs)
    imported_rules = {n:(n,p,deepcopy(t),o) for n,p,t,o in grammar.rule_defs}

    term_defs = []
    rule_defs = []

    def rule_dependencies(symbol):
        if symbol.type != 'RULE':
            return []
        try:
            _, params, tree, _ = imported_rules[symbol]
        except KeyError:
            raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
        return _find_used_symbols(tree) - set(params)

    def get_namespace_name(name, params):
        if params is not None:
            try:
                return params[name]
            except KeyError:
                pass
        try:
            return aliases[name].value
        except KeyError:
            if name[0] == '_':
                return '_%s__%s' % (namespace, name[1:])
            return '%s__%s' % (namespace, name)

    to_import = list(bfs(aliases, rule_dependencies))
    for symbol in to_import:
        if symbol.type == 'TERMINAL':
            term_defs.append([get_namespace_name(symbol, None), imported_terms[symbol]])
        else:
            assert symbol.type == 'RULE'
            _, params, tree, options = imported_rules[symbol]
            params_map = {p: ('%s__%s' if p[0]!='_' else '_%s__%s') % (namespace, p) for p in params}
            for t in tree.iter_subtrees():
                for i, c in enumerate(t.children):
                    if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
                        t.children[i] = Token(c.type, get_namespace_name(c, params_map))
            params = [params_map[p] for p in params] # We can not rely on ordered dictionaries

            rule_defs.append((get_namespace_name(symbol, params_map), params, tree, options))

    return term_defs, rule_defs
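
# Namespacing example for the function above: importing rule `x` and terminal `T`
# from a grammar under the namespace 'common' yields `common__x` and `common__T`;
# names starting with an underscore keep it in front (`_sep` -> `_common__sep`),
# and aliased names keep their alias instead of being prefixed.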


def resolve_term_references(term_defs):
    # TODO Solve with transitive closure (maybe)

    term_dict = {k:t for k, (t,_p) in term_defs}
    assert len(term_dict) == len(term_defs), "Same name defined twice?"

    while True:
        changed = False
        for name, (token_tree, _p) in term_defs:
            if token_tree is None:  # Terminal added through %declare
                continue
            for exp in token_tree.find_data('value'):
                item ,= exp.children
                if isinstance(item, Token):
                    if item.type == 'RULE':
                        raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name))
                    if item.type == 'TERMINAL':
                        term_value = term_dict[item]
                        assert term_value is not None
                        exp.children[0] = term_value
                        changed = True
        if not changed:
            break

    for name, term in term_dict.items():
        if term:    # Not just declared
            for child in term.children:
                ids = [id(x) for x in child.iter_subtrees()]
                if id(term) in ids:
                    raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name)
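
# Example for resolve_term_references above: given DIGIT: /[0-9]/ and INT: DIGIT+,
# the DIGIT reference inside INT's tree is replaced by DIGIT's own definition, so
# that INT can later be compiled into a single standalone pattern.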


def options_from_rule(name, params, *x):
    if len(x) > 1:
        priority, expansions = x
        priority = int(priority)
    else:
        expansions ,= x
        priority = None
    params = [t.value for t in params.children] if params is not None else [] # For the grammar parser

    keep_all_tokens = name.startswith('!')
    name = name.lstrip('!')
    expand1 = name.startswith('?')
    name = name.lstrip('?')

    return name, params, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority,
                                                 template_source=(name if params else None))
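
# For example: a rule named '?expr' is stored as 'expr' with expand1=True, '!stmt'
# becomes 'stmt' with keep_all_tokens=True, and a '.2' priority suffix in the rule
# definition arrives here as the NUMBER token and is stored as priority=2.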


def symbols_from_strcase(expansion):
    return [Terminal(x, filter_out=x.startswith('_')) if x.isupper() else NonTerminal(x) for x in expansion]

@inline_args
class PrepareGrammar(Transformer_InPlace):
    def terminal(self, name):
        return name
    def nonterminal(self, name):
        return name


def _find_used_symbols(tree):
    assert tree.data == 'expansions'
    return {t for x in tree.find_data('expansion')
              for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}

class GrammarLoader:
    "Parses grammar definitions (.lark syntax) and builds Grammar objects"

    def __init__(self, re_):
        self.re = re_
        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

        rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
        rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
        callback = ParseTreeBuilder(rules, ST).create_callback()
        lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])

        parser_conf = ParserConf(rules, callback, ['start'])
        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)

        self.canonize_tree = CanonizeTree()

    def load_grammar(self, grammar_text, grammar_name='<?>'):
        "Parse grammar_text, verify it, and create a Grammar object. Display nice messages on error."

        try:
            tree = self.canonize_tree.transform( self.parser.parse(grammar_text+'\n') )
        except UnexpectedCharacters as e:
            context = e.get_context(grammar_text)
            raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
                               (e.line, e.column, grammar_name, context))
        except UnexpectedToken as e:
            context = e.get_context(grammar_text)
            error = e.match_examples(self.parser.parse, {
                'Unclosed parenthesis': ['a: (\n'],
                'Unmatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
                'Expecting rule or terminal definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
                'Alias expects lowercase name': ['a: -> "a"\n'],
                'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
                'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
                'Expecting option ("|") or a new rule or terminal definition': ['a:a\n()\n'],
                '%import expects a name': ['%import "a"\n'],
                '%ignore expects a value': ['%ignore %import\n'],
            })
            if error:
                raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context))
            elif 'STRING' in e.expected:
                raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
            raise

        tree = PrepareGrammar().transform(tree)

        # Extract grammar items
        defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
        term_defs = defs.pop('term', [])
        rule_defs = defs.pop('rule', [])
        statements = defs.pop('statement', [])
        assert not defs

        term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs]
        term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]
        rule_defs = [options_from_rule(*x) for x in rule_defs]

        # Execute statements
        ignore, imports = [], {}
        for (stmt,) in statements:
            if stmt.data == 'ignore':
                t ,= stmt.children
                ignore.append(t)
            elif stmt.data == 'import':
                if len(stmt.children) > 1:
                    path_node, arg1 = stmt.children
                else:
                    path_node, = stmt.children
                    arg1 = None

                if isinstance(arg1, Tree):  # Multi import
                    dotted_path = tuple(path_node.children)
                    names = arg1.children
                    aliases = dict(zip(names, names))  # Can't have aliased multi import, so all aliases will be the same as names
                else:  # Single import
                    dotted_path = tuple(path_node.children[:-1])
                    name = path_node.children[-1]  # Get name from dotted path
                    aliases = {name: arg1 or name}  # Use the alias if it exists

                if path_node.data == 'import_lib':  # Import from library
                    base_paths = []
                else:  # Relative import
                    if grammar_name == '<string>':  # Import relative to script file path if grammar is coded in script
                        try:
                            base_file = os.path.abspath(sys.modules['__main__'].__file__)
                        except AttributeError:
                            base_file = None
                    else:
                        base_file = grammar_name  # Import relative to grammar file path if external grammar file
                    if base_file:
                        base_paths = [os.path.split(base_file)[0]]
                    else:
                        base_paths = [os.path.abspath(os.path.curdir)]

                try:
                    import_base_paths, import_aliases = imports[dotted_path]
                    assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path)
                    import_aliases.update(aliases)
                except KeyError:
                    imports[dotted_path] = base_paths, aliases

            elif stmt.data == 'declare':
                for t in stmt.children:
                    term_defs.append([t.value, (None, None)])
            else:
                assert False, stmt

        # import grammars
        for dotted_path, (base_paths, aliases) in imports.items():
            grammar_path = os.path.join(*dotted_path) + EXT
            g = import_grammar(grammar_path, self.re, base_paths=base_paths)
            new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

            term_defs += new_td
            rule_defs += new_rd

        # Verify correctness 1
        for name, _ in term_defs:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

        # Handle ignore tokens
        # XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
        #     inability to handle duplicate terminals (two names, one value)
        ignore_names = []
        for t in ignore:
            if t.data=='expansions' and len(t.children) == 1:
                t2 ,= t.children
                if t2.data=='expansion' and len(t2.children) == 1:
                    item ,= t2.children
                    if item.data == 'value':
                        item ,= item.children
                        if isinstance(item, Token) and item.type == 'TERMINAL':
                            ignore_names.append(item.value)
                            continue

            name = '__IGNORE_%d' % len(ignore_names)
            ignore_names.append(name)
            term_defs.append((name, (t, 1)))

        # Verify correctness 2
        terminal_names = set()
        for name, _ in term_defs:
            if name in terminal_names:
                raise GrammarError("Terminal '%s' defined more than once" % name)
            terminal_names.add(name)

        if set(ignore_names) - terminal_names:
            raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names))

        resolve_term_references(term_defs)

        rules = rule_defs

        rule_names = {}
        for name, params, _x, _o in rules:
            if name.startswith('__'):
                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
            if name in rule_names:
                raise GrammarError("Rule '%s' defined more than once" % name)
            rule_names[name] = len(params)

        for name, params, expansions, _o in rules:
            for i, p in enumerate(params):
                if p in rule_names:
                    raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name))
                if p in params[:i]:
                    raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name))
            for temp in expansions.find_data('template_usage'):
                sym = temp.children[0]
                args = temp.children[1:]
                if sym not in params:
                    if sym not in rule_names:
                        raise GrammarError("Template '%s' used but not defined (in rule %s)" % (sym, name))
                    if len(args) != rule_names[sym]:
                        raise GrammarError("Wrong number of template arguments used for %s "
                                           "(expected %s, got %s) (in rule %s)" % (sym, rule_names[sym], len(args), name))
            for sym in _find_used_symbols(expansions):
                if sym.type == 'TERMINAL':
                    if sym not in terminal_names:
                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
                else:
                    if sym not in rule_names and sym not in params:
                        raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))


        return Grammar(rules, term_defs, ignore_names)


def load_grammar(grammar, source, re_):
    return GrammarLoader(re_).load_grammar(grammar, source)
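
# A minimal usage sketch (assumptions: the standard `re` module is passed as `re_`,
# and '<string>' is used as the source name, as Lark itself does for inline grammars):
#
#     import re
#     g = load_grammar('start: "a"+\n', '<string>', re)
#     terminals, rules, ignore = g.compile(['start'])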