1"Parses and creates Grammar objects" 2 3import os.path 4import sys 5from copy import copy, deepcopy 6from io import open 7 8from .utils import bfs, eval_escaping 9from .lexer import Token, TerminalDef, PatternStr, PatternRE 10 11from .parse_tree_builder import ParseTreeBuilder 12from .parser_frontends import LALR_TraditionalLexer 13from .common import LexerConf, ParserConf 14from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol 15from .utils import classify, suppress, dedup_list, Str 16from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken 17 18from .tree import Tree, SlottedTree as ST 19from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive 20inline_args = v_args(inline=True) 21 22__path__ = os.path.dirname(__file__) 23IMPORT_PATHS = [os.path.join(__path__, 'grammars')] 24 25EXT = '.lark' 26 27_RE_FLAGS = 'imslux' 28 29_EMPTY = Symbol('__empty__') 30 31_TERMINAL_NAMES = { 32 '.' : 'DOT', 33 ',' : 'COMMA', 34 ':' : 'COLON', 35 ';' : 'SEMICOLON', 36 '+' : 'PLUS', 37 '-' : 'MINUS', 38 '*' : 'STAR', 39 '/' : 'SLASH', 40 '\\' : 'BACKSLASH', 41 '|' : 'VBAR', 42 '?' : 'QMARK', 43 '!' : 'BANG', 44 '@' : 'AT', 45 '#' : 'HASH', 46 '$' : 'DOLLAR', 47 '%' : 'PERCENT', 48 '^' : 'CIRCUMFLEX', 49 '&' : 'AMPERSAND', 50 '_' : 'UNDERSCORE', 51 '<' : 'LESSTHAN', 52 '>' : 'MORETHAN', 53 '=' : 'EQUAL', 54 '"' : 'DBLQUOTE', 55 '\'' : 'QUOTE', 56 '`' : 'BACKQUOTE', 57 '~' : 'TILDE', 58 '(' : 'LPAR', 59 ')' : 'RPAR', 60 '{' : 'LBRACE', 61 '}' : 'RBRACE', 62 '[' : 'LSQB', 63 ']' : 'RSQB', 64 '\n' : 'NEWLINE', 65 '\r\n' : 'CRLF', 66 '\t' : 'TAB', 67 ' ' : 'SPACE', 68} 69 70# Grammar Parser 71TERMINALS = { 72 '_LPAR': r'\(', 73 '_RPAR': r'\)', 74 '_LBRA': r'\[', 75 '_RBRA': r'\]', 76 '_LBRACE': r'\{', 77 '_RBRACE': r'\}', 78 'OP': '[+*]|[?](?![a-z])', 79 '_COLON': ':', 80 '_COMMA': ',', 81 '_OR': r'\|', 82 '_DOT': r'\.(?!\.)', 83 '_DOTDOT': r'\.\.', 84 'TILDE': '~', 85 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 86 'TERMINAL': '_?[A-Z][_A-Z0-9]*', 87 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', 88 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, 89 '_NL': r'(\r?\n)+\s*', 90 'WS': r'[ \t]+', 91 'COMMENT': r'\s*//[^\n]*', 92 '_TO': '->', 93 '_IGNORE': r'%ignore', 94 '_DECLARE': r'%declare', 95 '_IMPORT': r'%import', 96 'NUMBER': r'[+-]?\d+', 97} 98 99RULES = { 100 'start': ['_list'], 101 '_list': ['_item', '_list _item'], 102 '_item': ['rule', 'term', 'statement', '_NL'], 103 104 'rule': ['RULE template_params _COLON expansions _NL', 105 'RULE template_params _DOT NUMBER _COLON expansions _NL'], 106 'template_params': ['_LBRACE _template_params _RBRACE', 107 ''], 108 '_template_params': ['RULE', 109 '_template_params _COMMA RULE'], 110 'expansions': ['alias', 111 'expansions _OR alias', 112 'expansions _NL _OR alias'], 113 114 '?alias': ['expansion _TO RULE', 'expansion'], 115 'expansion': ['_expansion'], 116 117 '_expansion': ['', '_expansion expr'], 118 119 '?expr': ['atom', 120 'atom OP', 121 'atom TILDE NUMBER', 122 'atom TILDE NUMBER _DOTDOT NUMBER', 123 ], 124 125 '?atom': ['_LPAR expansions _RPAR', 126 'maybe', 127 'value'], 128 129 'value': ['terminal', 130 'nonterminal', 131 'literal', 132 'range', 133 'template_usage'], 134 135 'terminal': ['TERMINAL'], 136 'nonterminal': ['RULE'], 137 138 '?name': ['RULE', 'TERMINAL'], 139 140 'maybe': ['_LBRA expansions _RBRA'], 141 'range': ['STRING _DOTDOT STRING'], 142 143 'template_usage': ['RULE _LBRACE _template_args _RBRACE'], 144 '_template_args': ['value', 145 '_template_args _COMMA value'], 146 147 'term': ['TERMINAL _COLON expansions _NL', 148 'TERMINAL _DOT NUMBER _COLON expansions _NL'], 149 'statement': ['ignore', 'import', 'declare'], 150 'ignore': ['_IGNORE expansions _NL'], 151 'declare': ['_DECLARE _declare_args _NL'], 152 'import': ['_IMPORT _import_path _NL', 153 '_IMPORT _import_path _LPAR name_list _RPAR _NL', 154 '_IMPORT _import_path _TO name _NL'], 155 156 '_import_path': ['import_lib', 'import_rel'], 157 'import_lib': ['_import_args'], 158 'import_rel': ['_DOT _import_args'], 159 '_import_args': ['name', '_import_args _DOT name'], 160 161 'name_list': ['_name_list'], 162 '_name_list': ['name', '_name_list _COMMA name'], 163 164 '_declare_args': ['name', '_declare_args name'], 165 'literal': ['REGEXP', 'STRING'], 166} 167 168@inline_args 169class EBNF_to_BNF(Transformer_InPlace): 170 def __init__(self): 171 self.new_rules = [] 172 self.rules_by_expr = {} 173 self.prefix = 'anon' 174 self.i = 0 175 self.rule_options = None 176 177 def _add_recurse_rule(self, type_, expr): 178 if expr in self.rules_by_expr: 179 return self.rules_by_expr[expr] 180 181 new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) 182 self.i += 1 183 t = NonTerminal(new_name) 184 tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) 185 self.new_rules.append((new_name, tree, self.rule_options)) 186 self.rules_by_expr[expr] = t 187 return t 188 189 def expr(self, rule, op, *args): 190 if op.value == '?': 191 empty = ST('expansion', []) 192 return ST('expansions', [rule, empty]) 193 elif op.value == '+': 194 # a : b c+ d 195 # --> 196 # a : b _c d 197 # _c : _c c | c; 198 return self._add_recurse_rule('plus', rule) 199 elif op.value == '*': 200 # a : b c* d 201 # --> 202 # a : b _c? d 203 # _c : _c c | c; 204 new_name = self._add_recurse_rule('star', rule) 205 return ST('expansions', [new_name, ST('expansion', [])]) 206 elif op.value == '~': 207 if len(args) == 1: 208 mn = mx = int(args[0]) 209 else: 210 mn, mx = map(int, args) 211 if mx < mn or mn < 0: 212 raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) 213 return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)]) 214 assert False, op 215 216 def maybe(self, rule): 217 keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens 218 219 def will_not_get_removed(sym): 220 if isinstance(sym, NonTerminal): 221 return not sym.name.startswith('_') 222 if isinstance(sym, Terminal): 223 return keep_all_tokens or not sym.filter_out 224 assert False 225 226 if any(rule.scan_values(will_not_get_removed)): 227 empty = _EMPTY 228 else: 229 empty = ST('expansion', []) 230 231 return ST('expansions', [rule, empty]) 232 233 234class SimplifyRule_Visitor(Visitor): 235 236 @staticmethod 237 def _flatten(tree): 238 while True: 239 to_expand = [i for i, child in enumerate(tree.children) 240 if isinstance(child, Tree) and child.data == tree.data] 241 if not to_expand: 242 break 243 tree.expand_kids_by_index(*to_expand) 244 245 def expansion(self, tree): 246 # rules_list unpacking 247 # a : b (c|d) e 248 # --> 249 # a : b c e | b d e 250 # 251 # In AST terms: 252 # expansion(b, expansions(c, d), e) 253 # --> 254 # expansions( expansion(b, c, e), expansion(b, d, e) ) 255 256 self._flatten(tree) 257 258 for i, child in enumerate(tree.children): 259 if isinstance(child, Tree) and child.data == 'expansions': 260 tree.data = 'expansions' 261 tree.children = [self.visit(ST('expansion', [option if i==j else other 262 for j, other in enumerate(tree.children)])) 263 for option in dedup_list(child.children)] 264 self._flatten(tree) 265 break 266 267 def alias(self, tree): 268 rule, alias_name = tree.children 269 if rule.data == 'expansions': 270 aliases = [] 271 for child in tree.children[0].children: 272 aliases.append(ST('alias', [child, alias_name])) 273 tree.data = 'expansions' 274 tree.children = aliases 275 276 def expansions(self, tree): 277 self._flatten(tree) 278 # Ensure all children are unique 279 if len(set(tree.children)) != len(tree.children): 280 tree.children = dedup_list(tree.children) # dedup is expensive, so try to minimize its use 281 282 283class RuleTreeToText(Transformer): 284 def expansions(self, x): 285 return x 286 def expansion(self, symbols): 287 return symbols, None 288 def alias(self, x): 289 (expansion, _alias), alias = x 290 assert _alias is None, (alias, expansion, '-', _alias) # Double alias not allowed 291 return expansion, alias.value 292 293 294@inline_args 295class CanonizeTree(Transformer_InPlace): 296 def tokenmods(self, *args): 297 if len(args) == 1: 298 return list(args) 299 tokenmods, value = args 300 return tokenmods + [value] 301 302class PrepareAnonTerminals(Transformer_InPlace): 303 "Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them" 304 305 def __init__(self, terminals): 306 self.terminals = terminals 307 self.term_set = {td.name for td in self.terminals} 308 self.term_reverse = {td.pattern: td for td in terminals} 309 self.i = 0 310 311 312 @inline_args 313 def pattern(self, p): 314 value = p.value 315 if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags: 316 raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) 317 318 term_name = None 319 320 if isinstance(p, PatternStr): 321 try: 322 # If already defined, use the user-defined terminal name 323 term_name = self.term_reverse[p].name 324 except KeyError: 325 # Try to assign an indicative anon-terminal name 326 try: 327 term_name = _TERMINAL_NAMES[value] 328 except KeyError: 329 if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set: 330 with suppress(UnicodeEncodeError): 331 value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names 332 term_name = value.upper() 333 334 if term_name in self.term_set: 335 term_name = None 336 337 elif isinstance(p, PatternRE): 338 if p in self.term_reverse: # Kind of a wierd placement.name 339 term_name = self.term_reverse[p].name 340 else: 341 assert False, p 342 343 if term_name is None: 344 term_name = '__ANON_%d' % self.i 345 self.i += 1 346 347 if term_name not in self.term_set: 348 assert p not in self.term_reverse 349 self.term_set.add(term_name) 350 termdef = TerminalDef(term_name, p) 351 self.term_reverse[p] = termdef 352 self.terminals.append(termdef) 353 354 return Terminal(term_name, filter_out=isinstance(p, PatternStr)) 355 356class _ReplaceSymbols(Transformer_InPlace): 357 " Helper for ApplyTemplates " 358 359 def __init__(self): 360 self.names = {} 361 362 def value(self, c): 363 if len(c) == 1 and isinstance(c[0], Token) and c[0].value in self.names: 364 return self.names[c[0].value] 365 return self.__default__('value', c, None) 366 367 def template_usage(self, c): 368 if c[0] in self.names: 369 return self.__default__('template_usage', [self.names[c[0]].name] + c[1:], None) 370 return self.__default__('template_usage', c, None) 371 372class ApplyTemplates(Transformer_InPlace): 373 " Apply the templates, creating new rules that represent the used templates " 374 375 def __init__(self, rule_defs): 376 self.rule_defs = rule_defs 377 self.replacer = _ReplaceSymbols() 378 self.created_templates = set() 379 380 def template_usage(self, c): 381 name = c[0] 382 args = c[1:] 383 result_name = "%s{%s}" % (name, ",".join(a.name for a in args)) 384 if result_name not in self.created_templates: 385 self.created_templates.add(result_name) 386 (_n, params, tree, options) ,= (t for t in self.rule_defs if t[0] == name) 387 assert len(params) == len(args), args 388 result_tree = deepcopy(tree) 389 self.replacer.names = dict(zip(params, args)) 390 self.replacer.transform(result_tree) 391 self.rule_defs.append((result_name, [], result_tree, deepcopy(options))) 392 return NonTerminal(result_name) 393 394 395def _rfind(s, choices): 396 return max(s.rfind(c) for c in choices) 397 398 399 400 401def _literal_to_pattern(literal): 402 v = literal.value 403 flag_start = _rfind(v, '/"')+1 404 assert flag_start > 0 405 flags = v[flag_start:] 406 assert all(f in _RE_FLAGS for f in flags), flags 407 408 v = v[:flag_start] 409 assert v[0] == v[-1] and v[0] in '"/' 410 x = v[1:-1] 411 412 s = eval_escaping(x) 413 414 if literal.type == 'STRING': 415 s = s.replace('\\\\', '\\') 416 417 return { 'STRING': PatternStr, 418 'REGEXP': PatternRE }[literal.type](s, flags) 419 420 421@inline_args 422class PrepareLiterals(Transformer_InPlace): 423 def literal(self, literal): 424 return ST('pattern', [_literal_to_pattern(literal)]) 425 426 def range(self, start, end): 427 assert start.type == end.type == 'STRING' 428 start = start.value[1:-1] 429 end = end.value[1:-1] 430 assert len(eval_escaping(start)) == len(eval_escaping(end)) == 1, (start, end, len(eval_escaping(start)), len(eval_escaping(end))) 431 regexp = '[%s-%s]' % (start, end) 432 return ST('pattern', [PatternRE(regexp)]) 433 434 435class TerminalTreeToPattern(Transformer): 436 def pattern(self, ps): 437 p ,= ps 438 return p 439 440 def expansion(self, items): 441 assert items 442 if len(items) == 1: 443 return items[0] 444 if len({i.flags for i in items}) > 1: 445 raise GrammarError("Lark doesn't support joining terminals with conflicting flags!") 446 return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ()) 447 448 def expansions(self, exps): 449 if len(exps) == 1: 450 return exps[0] 451 if len({i.flags for i in exps}) > 1: 452 raise GrammarError("Lark doesn't support joining terminals with conflicting flags!") 453 return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags) 454 455 def expr(self, args): 456 inner, op = args[:2] 457 if op == '~': 458 if len(args) == 3: 459 op = "{%d}" % int(args[2]) 460 else: 461 mn, mx = map(int, args[2:]) 462 if mx < mn: 463 raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx)) 464 op = "{%d,%d}" % (mn, mx) 465 else: 466 assert len(args) == 2 467 return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags) 468 469 def maybe(self, expr): 470 return self.expr(expr + ['?']) 471 472 def alias(self, t): 473 raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") 474 475 def value(self, v): 476 return v[0] 477 478class PrepareSymbols(Transformer_InPlace): 479 def value(self, v): 480 v ,= v 481 if isinstance(v, Tree): 482 return v 483 elif v.type == 'RULE': 484 return NonTerminal(Str(v.value)) 485 elif v.type == 'TERMINAL': 486 return Terminal(Str(v.value), filter_out=v.startswith('_')) 487 assert False 488 489def _choice_of_rules(rules): 490 return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules]) 491 492def nr_deepcopy_tree(t): 493 "Deepcopy tree `t` without recursion" 494 return Transformer_NonRecursive(False).transform(t) 495 496class Grammar: 497 def __init__(self, rule_defs, term_defs, ignore): 498 self.term_defs = term_defs 499 self.rule_defs = rule_defs 500 self.ignore = ignore 501 502 def compile(self, start): 503 # We change the trees in-place (to support huge grammars) 504 # So deepcopy allows calling compile more than once. 505 term_defs = deepcopy(list(self.term_defs)) 506 rule_defs = [(n,p,nr_deepcopy_tree(t),o) for n,p,t,o in self.rule_defs] 507 508 # =================== 509 # Compile Terminals 510 # =================== 511 512 # Convert terminal-trees to strings/regexps 513 514 for name, (term_tree, priority) in term_defs: 515 if term_tree is None: # Terminal added through %declare 516 continue 517 expansions = list(term_tree.find_data('expansion')) 518 if len(expansions) == 1 and not expansions[0].children: 519 raise GrammarError("Terminals cannot be empty (%s)" % name) 520 521 transformer = PrepareLiterals() * TerminalTreeToPattern() 522 terminals = [TerminalDef(name, transformer.transform( term_tree ), priority) 523 for name, (term_tree, priority) in term_defs if term_tree] 524 525 # ================= 526 # Compile Rules 527 # ================= 528 529 # 1. Pre-process terminals 530 transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals) # Adds to terminals 531 532 # 2. Inline Templates 533 534 transformer *= ApplyTemplates(rule_defs) 535 536 # 3. Convert EBNF to BNF (and apply step 1 & 2) 537 ebnf_to_bnf = EBNF_to_BNF() 538 rules = [] 539 i = 0 540 while i < len(rule_defs): # We have to do it like this because rule_defs might grow due to templates 541 name, params, rule_tree, options = rule_defs[i] 542 i += 1 543 if len(params) != 0: # Dont transform templates 544 continue 545 ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options.keep_all_tokens else None 546 ebnf_to_bnf.prefix = name 547 tree = transformer.transform(rule_tree) 548 res = ebnf_to_bnf.transform(tree) 549 rules.append((name, res, options)) 550 rules += ebnf_to_bnf.new_rules 551 552 assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" 553 554 # 4. Compile tree to Rule objects 555 rule_tree_to_text = RuleTreeToText() 556 557 simplify_rule = SimplifyRule_Visitor() 558 compiled_rules = [] 559 for rule_content in rules: 560 name, tree, options = rule_content 561 simplify_rule.visit(tree) 562 expansions = rule_tree_to_text.transform(tree) 563 564 for i, (expansion, alias) in enumerate(expansions): 565 if alias and name.startswith('_'): 566 raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) 567 568 empty_indices = [x==_EMPTY for x in expansion] 569 if any(empty_indices): 570 exp_options = copy(options) or RuleOptions() 571 exp_options.empty_indices = empty_indices 572 expansion = [x for x in expansion if x!=_EMPTY] 573 else: 574 exp_options = options 575 576 assert all(isinstance(x, Symbol) for x in expansion), expansion 577 rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) 578 compiled_rules.append(rule) 579 580 # Remove duplicates of empty rules, throw error for non-empty duplicates 581 if len(set(compiled_rules)) != len(compiled_rules): 582 duplicates = classify(compiled_rules, lambda x: x) 583 for dups in duplicates.values(): 584 if len(dups) > 1: 585 if dups[0].expansion: 586 raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" 587 % ''.join('\n * %s' % i for i in dups)) 588 589 # Empty rule; assert all other attributes are equal 590 assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups) 591 592 # Remove duplicates 593 compiled_rules = list(set(compiled_rules)) 594 595 596 # Filter out unused rules 597 while True: 598 c = len(compiled_rules) 599 used_rules = {s for r in compiled_rules 600 for s in r.expansion 601 if isinstance(s, NonTerminal) 602 and s != r.origin} 603 used_rules |= {NonTerminal(s) for s in start} 604 compiled_rules = [r for r in compiled_rules if r.origin in used_rules] 605 if len(compiled_rules) == c: 606 break 607 608 # Filter out unused terminals 609 used_terms = {t.name for r in compiled_rules 610 for t in r.expansion 611 if isinstance(t, Terminal)} 612 terminals = [t for t in terminals if t.name in used_terms or t.name in self.ignore] 613 614 return terminals, compiled_rules, self.ignore 615 616 617 618_imported_grammars = {} 619def import_grammar(grammar_path, re_, base_paths=[]): 620 if grammar_path not in _imported_grammars: 621 import_paths = base_paths + IMPORT_PATHS 622 for import_path in import_paths: 623 with suppress(IOError): 624 joined_path = os.path.join(import_path, grammar_path) 625 with open(joined_path, encoding='utf8') as f: 626 text = f.read() 627 grammar = load_grammar(text, joined_path, re_) 628 _imported_grammars[grammar_path] = grammar 629 break 630 else: 631 open(grammar_path, encoding='utf8') 632 assert False 633 634 return _imported_grammars[grammar_path] 635 636def import_from_grammar_into_namespace(grammar, namespace, aliases): 637 """Returns all rules and terminals of grammar, prepended 638 with a 'namespace' prefix, except for those which are aliased. 639 """ 640 641 imported_terms = dict(grammar.term_defs) 642 imported_rules = {n:(n,p,deepcopy(t),o) for n,p,t,o in grammar.rule_defs} 643 644 term_defs = [] 645 rule_defs = [] 646 647 def rule_dependencies(symbol): 648 if symbol.type != 'RULE': 649 return [] 650 try: 651 _, params, tree,_ = imported_rules[symbol] 652 except KeyError: 653 raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace)) 654 return _find_used_symbols(tree) - set(params) 655 656 657 658 def get_namespace_name(name, params): 659 if params is not None: 660 try: 661 return params[name] 662 except KeyError: 663 pass 664 try: 665 return aliases[name].value 666 except KeyError: 667 if name[0] == '_': 668 return '_%s__%s' % (namespace, name[1:]) 669 return '%s__%s' % (namespace, name) 670 671 to_import = list(bfs(aliases, rule_dependencies)) 672 for symbol in to_import: 673 if symbol.type == 'TERMINAL': 674 term_defs.append([get_namespace_name(symbol, None), imported_terms[symbol]]) 675 else: 676 assert symbol.type == 'RULE' 677 _, params, tree, options = imported_rules[symbol] 678 params_map = {p: ('%s__%s' if p[0]!='_' else '_%s__%s' ) % (namespace, p) for p in params} 679 for t in tree.iter_subtrees(): 680 for i, c in enumerate(t.children): 681 if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'): 682 t.children[i] = Token(c.type, get_namespace_name(c, params_map)) 683 params = [params_map[p] for p in params] # We can not rely on ordered dictionaries 684 rule_defs.append((get_namespace_name(symbol, params_map), params, tree, options)) 685 686 687 return term_defs, rule_defs 688 689 690 691def resolve_term_references(term_defs): 692 # TODO Solve with transitive closure (maybe) 693 694 term_dict = {k:t for k, (t,_p) in term_defs} 695 assert len(term_dict) == len(term_defs), "Same name defined twice?" 696 697 while True: 698 changed = False 699 for name, (token_tree, _p) in term_defs: 700 if token_tree is None: # Terminal added through %declare 701 continue 702 for exp in token_tree.find_data('value'): 703 item ,= exp.children 704 if isinstance(item, Token): 705 if item.type == 'RULE': 706 raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name)) 707 if item.type == 'TERMINAL': 708 term_value = term_dict[item] 709 assert term_value is not None 710 exp.children[0] = term_value 711 changed = True 712 if not changed: 713 break 714 715 for name, term in term_dict.items(): 716 if term: # Not just declared 717 for child in term.children: 718 ids = [id(x) for x in child.iter_subtrees()] 719 if id(term) in ids: 720 raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name) 721 722 723def options_from_rule(name, params, *x): 724 if len(x) > 1: 725 priority, expansions = x 726 priority = int(priority) 727 else: 728 expansions ,= x 729 priority = None 730 params = [t.value for t in params.children] if params is not None else [] # For the grammar parser 731 732 keep_all_tokens = name.startswith('!') 733 name = name.lstrip('!') 734 expand1 = name.startswith('?') 735 name = name.lstrip('?') 736 737 return name, params, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority, 738 template_source=(name if params else None)) 739 740 741def symbols_from_strcase(expansion): 742 return [Terminal(x, filter_out=x.startswith('_')) if x.isupper() else NonTerminal(x) for x in expansion] 743 744@inline_args 745class PrepareGrammar(Transformer_InPlace): 746 def terminal(self, name): 747 return name 748 def nonterminal(self, name): 749 return name 750 751 752def _find_used_symbols(tree): 753 assert tree.data == 'expansions' 754 return {t for x in tree.find_data('expansion') 755 for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} 756 757class GrammarLoader: 758 def __init__(self, re_): 759 self.re = re_ 760 terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] 761 762 rules = [options_from_rule(name, None, x) for name, x in RULES.items()] 763 rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)] 764 callback = ParseTreeBuilder(rules, ST).create_callback() 765 lexer_conf = LexerConf(terminals, ['WS', 'COMMENT']) 766 767 parser_conf = ParserConf(rules, callback, ['start']) 768 self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_) 769 770 self.canonize_tree = CanonizeTree() 771 772 def load_grammar(self, grammar_text, grammar_name='<?>'): 773 "Parse grammar_text, verify, and create Grammar object. Display nice messages on error." 774 775 try: 776 tree = self.canonize_tree.transform( self.parser.parse(grammar_text+'\n') ) 777 except UnexpectedCharacters as e: 778 context = e.get_context(grammar_text) 779 raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % 780 (e.line, e.column, grammar_name, context)) 781 except UnexpectedToken as e: 782 context = e.get_context(grammar_text) 783 error = e.match_examples(self.parser.parse, { 784 'Unclosed parenthesis': ['a: (\n'], 785 'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'], 786 'Expecting rule or terminal definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'], 787 'Alias expects lowercase name': ['a: -> "a"\n'], 788 'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'], 789 'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'], 790 'Expecting option ("|") or a new rule or terminal definition': ['a:a\n()\n'], 791 '%import expects a name': ['%import "a"\n'], 792 '%ignore expects a value': ['%ignore %import\n'], 793 }) 794 if error: 795 raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context)) 796 elif 'STRING' in e.expected: 797 raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context)) 798 raise 799 800 tree = PrepareGrammar().transform(tree) 801 802 # Extract grammar items 803 defs = classify(tree.children, lambda c: c.data, lambda c: c.children) 804 term_defs = defs.pop('term', []) 805 rule_defs = defs.pop('rule', []) 806 statements = defs.pop('statement', []) 807 assert not defs 808 809 term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs] 810 term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs] 811 rule_defs = [options_from_rule(*x) for x in rule_defs] 812 813 # Execute statements 814 ignore, imports = [], {} 815 for (stmt,) in statements: 816 if stmt.data == 'ignore': 817 t ,= stmt.children 818 ignore.append(t) 819 elif stmt.data == 'import': 820 if len(stmt.children) > 1: 821 path_node, arg1 = stmt.children 822 else: 823 path_node, = stmt.children 824 arg1 = None 825 826 if isinstance(arg1, Tree): # Multi import 827 dotted_path = tuple(path_node.children) 828 names = arg1.children 829 aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names 830 else: # Single import 831 dotted_path = tuple(path_node.children[:-1]) 832 name = path_node.children[-1] # Get name from dotted path 833 aliases = {name: arg1 or name} # Aliases if exist 834 835 if path_node.data == 'import_lib': # Import from library 836 base_paths = [] 837 else: # Relative import 838 if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script 839 try: 840 base_file = os.path.abspath(sys.modules['__main__'].__file__) 841 except AttributeError: 842 base_file = None 843 else: 844 base_file = grammar_name # Import relative to grammar file path if external grammar file 845 if base_file: 846 base_paths = [os.path.split(base_file)[0]] 847 else: 848 base_paths = [os.path.abspath(os.path.curdir)] 849 850 try: 851 import_base_paths, import_aliases = imports[dotted_path] 852 assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path) 853 import_aliases.update(aliases) 854 except KeyError: 855 imports[dotted_path] = base_paths, aliases 856 857 elif stmt.data == 'declare': 858 for t in stmt.children: 859 term_defs.append([t.value, (None, None)]) 860 else: 861 assert False, stmt 862 863 # import grammars 864 for dotted_path, (base_paths, aliases) in imports.items(): 865 grammar_path = os.path.join(*dotted_path) + EXT 866 g = import_grammar(grammar_path, self.re, base_paths=base_paths) 867 new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases) 868 869 term_defs += new_td 870 rule_defs += new_rd 871 872 # Verify correctness 1 873 for name, _ in term_defs: 874 if name.startswith('__'): 875 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name) 876 877 # Handle ignore tokens 878 # XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's 879 # inability to handle duplicate terminals (two names, one value) 880 ignore_names = [] 881 for t in ignore: 882 if t.data=='expansions' and len(t.children) == 1: 883 t2 ,= t.children 884 if t2.data=='expansion' and len(t2.children) == 1: 885 item ,= t2.children 886 if item.data == 'value': 887 item ,= item.children 888 if isinstance(item, Token) and item.type == 'TERMINAL': 889 ignore_names.append(item.value) 890 continue 891 892 name = '__IGNORE_%d'% len(ignore_names) 893 ignore_names.append(name) 894 term_defs.append((name, (t, 1))) 895 896 # Verify correctness 2 897 terminal_names = set() 898 for name, _ in term_defs: 899 if name in terminal_names: 900 raise GrammarError("Terminal '%s' defined more than once" % name) 901 terminal_names.add(name) 902 903 if set(ignore_names) > terminal_names: 904 raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names)) 905 906 resolve_term_references(term_defs) 907 908 rules = rule_defs 909 910 rule_names = {} 911 for name, params, _x, _o in rules: 912 if name.startswith('__'): 913 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name) 914 if name in rule_names: 915 raise GrammarError("Rule '%s' defined more than once" % name) 916 rule_names[name] = len(params) 917 918 for name, params , expansions, _o in rules: 919 for i, p in enumerate(params): 920 if p in rule_names: 921 raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name)) 922 if p in params[:i]: 923 raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name)) 924 for temp in expansions.find_data('template_usage'): 925 sym = temp.children[0] 926 args = temp.children[1:] 927 if sym not in params: 928 if sym not in rule_names: 929 raise GrammarError("Template '%s' used but not defined (in rule %s)" % (sym, name)) 930 if len(args) != rule_names[sym]: 931 raise GrammarError("Wrong number of template arguments used for %s " 932 "(expected %s, got %s) (in rule %s)"%(sym, rule_names[sym], len(args), name)) 933 for sym in _find_used_symbols(expansions): 934 if sym.type == 'TERMINAL': 935 if sym not in terminal_names: 936 raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name)) 937 else: 938 if sym not in rule_names and sym not in params: 939 raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name)) 940 941 942 return Grammar(rules, term_defs, ignore_names) 943 944 945 946def load_grammar(grammar, source, re_): 947 return GrammarLoader(re_).load_grammar(grammar, source) 948