1#!/usr/bin/env python3 2 3import io 4import re 5import unittest 6import typing 7 8import jsparagus 9from jsparagus import gen, lexer, rewrites 10from jsparagus.grammar import (Grammar, Production, CallMethod, Nt, 11 Optional, LookaheadRule, NtDef, Var) 12from js_parser.parse_esgrammar import parse_esgrammar 13 14 15LispTokenizer = lexer.LexicalGrammar("( )", SYMBOL=r'[!%&*+:<=>?@A-Z^_a-z~]+') 16 17 18def prod(body, method_name): 19 return Production(body, CallMethod(method_name, list(range(len(body))))) 20 21 22class GenTestCase(unittest.TestCase): 23 def compile(self, tokenize, grammar, **kwargs): 24 """Compile a grammar. Use this when you expect compilation to 25 succeed.""" 26 self.tokenize = tokenize 27 self.parser_class = gen.compile(grammar, **kwargs) 28 29 def parse(self, text, goal=None): 30 if goal is None: 31 parser = self.parser_class() 32 else: 33 parser = self.parser_class(goal=goal) 34 lexer = self.tokenize(parser) 35 lexer.write(text) 36 return lexer.close() 37 38 def compile_multi(self, tokenize, grammar): 39 self.tokenize = tokenize 40 obj = gen.compile_multi(grammar) 41 for attr in dir(obj): 42 if attr.startswith("parse_"): 43 setattr(self, attr, getattr(obj, attr)) 44 45 def assertParse(self, s, expected=None, *, goal=None): 46 result = self.parse(s, goal=goal) 47 if expected is not None: 48 self.assertEqual(expected, result) 49 50 def assertNoParse(self, s, *, goal=None, message="banana"): 51 if goal is None: 52 kwargs = {} 53 else: 54 kwargs = {"goal": goal} 55 self.assertRaisesRegex( 56 SyntaxError, 57 re.escape(message), 58 lambda: self.parse(s, **kwargs)) 59 60 def testSimple(self): 61 grammar = parse_esgrammar( 62 """ 63 expr : 64 SYMBOL => $0 65 `(` tail 66 67 tail : 68 `)` => $0 69 expr tail 70 """, 71 terminal_names=["SYMBOL"] 72 ) 73 self.compile(LispTokenizer, grammar) 74 75 self.assertParse( 76 "(lambda (x) (* x x))", 77 ('expr_1', 78 '(', 79 ('tail_1', 80 'lambda', 81 ('tail_1', 82 ('expr_1', '(', ('tail_1', 'x', ')')), 83 ('tail_1', 84 ('expr_1', 85 '(', 86 ('tail_1', 87 '*', 88 ('tail_1', 89 'x', 90 ('tail_1', 'x', ')')))), 91 ')'))))) 92 93 def testEnd(self): 94 self.compile( 95 lexer.LexicalGrammar("ONE TWO"), 96 Grammar({ 97 'goal': [ 98 ['ONE', 'TWO'] 99 ] 100 }) 101 ) 102 self.assertNoParse("ONE TWO TWO", 103 message="expected 'end of input', got 'TWO'") 104 105 def testList(self): 106 list_grammar = Grammar({ 107 'prelist': [ 108 ['word', 'list'] 109 ], 110 'list': [ 111 ['word'], 112 ['list', 'word'], 113 ], 114 'word': [ 115 ['SYMBOL'] 116 ], 117 }) 118 self.compile(LispTokenizer, list_grammar) 119 self.assertParse( 120 "the quick brown fox jumped over the lazy dog", 121 ('prelist', 122 'the', 123 ('list_1', 124 ('list_1', 125 ('list_1', 126 ('list_1', 127 ('list_1', 128 ('list_1', 129 ('list_1', 130 'quick', 131 'brown'), 132 'fox'), 133 'jumped'), 134 'over'), 135 'the'), 136 'lazy'), 137 'dog'))) 138 139 def testArithmetic(self): 140 tokenize = lexer.LexicalGrammar( 141 "+ - * / ( )", 142 NUM=r'[0-9]\w*', 143 VAR=r'[A-Za-z]\w*') 144 arith_grammar = Grammar({ 145 'expr': [ 146 ['term'], 147 ['expr', '+', 'term'], 148 ['expr', '-', 'term'], 149 ], 150 'term': [ 151 ['prim'], 152 ['term', '*', 'prim'], 153 ['term', '/', 'prim'], 154 ], 155 'prim': [ 156 ['NUM'], 157 ['VAR'], 158 ['(', 'expr', ')'], 159 ], 160 }) 161 self.compile(tokenize, arith_grammar) 162 163 self.assertParse( 164 '2 * 3 + 4 * (5 + 7)', 165 ('expr_1', 166 ('term_1', '2', '*', '3'), 167 '+', 168 ('term_1', 169 '4', 170 '*', 171 ('prim_2', 172 '(', 173 ('expr_1', '5', '+', '7'), 174 ')')))) 175 176 self.assertNoParse( 177 "(", 178 message="unexpected end of input") 179 self.assertNoParse( 180 ")", 181 message="expected one of ['(', 'NUM', 'VAR'], got ')'") 182 183 def testAmbiguous(self): 184 # This grammar should fail verification. 185 # It's ambiguous: is ABC s(A)y(BC) or s(AB)y(C)? 186 grammar = Grammar({ 187 'goal': [ 188 ['s', 'y'], 189 ], 190 's': [ 191 ['A'], 192 ['s', 'B'], 193 ], 194 'y': [ 195 ['C'], 196 ['B', 'C'], 197 ], 198 }) 199 200 out = io.StringIO() 201 self.assertRaisesRegex(ValueError, r"conflict", 202 lambda: gen.generate_parser(out, grammar)) 203 204 def testAmbiguousEmpty(self): 205 """Reject grammars that are ambiguous due to empty productions. 206 207 (Empty productions are ones that match the empty string.)""" 208 209 def check(rules): 210 grammar = Grammar(rules, goal_nts=['goal']) 211 out = io.StringIO() 212 self.assertRaisesRegex( 213 ValueError, 214 r"ambiguous grammar|conflict", 215 lambda: gen.generate_parser(out, grammar)) 216 217 check({'goal': [[], []]}) 218 check({'goal': [[Optional('X')], []]}) 219 check({'goal': [[Optional('X')], [Optional('Y')]]}) 220 check({'goal': [[Optional('X'), Optional('Y')], [Optional('Z')]]}) 221 222 # Issue #3: This also has an abiguity; empty string matches either 223 # `goal ::= [empty]` or `goal ::= phrase, phrase ::= [empty]`. 224 check({ 225 'goal': [[Optional('phrase')]], 226 'phrase': [[Optional('X')]], 227 }) 228 229 # Input "X" is ambiguous, could be ('goal', ('a', None), ('a', 'X')) 230 # or the other 'a' could be the one that's missing. 231 check({ 232 'goal': [['a', 'a']], 233 'a': [[Optional('X')]], 234 }) 235 236 def testLeftFactor(self): 237 """Most basic left-factoring test.""" 238 tokenize = lexer.LexicalGrammar("A B") 239 grammar = Grammar({ 240 'goal': [ 241 ['A'], 242 ['A', 'B'], 243 ], 244 }) 245 246 self.compile(tokenize, grammar) 247 self.assertParse("A", 'A') 248 self.assertParse("A B", ('goal_1', 'A', 'B')) 249 250 def testLeftFactorMulti(self): 251 """Test left-factoring with common prefix of length >1.""" 252 tokenize = lexer.LexicalGrammar("A B C D E") 253 grammar = Grammar({ 254 'goal': [ 255 ['A', 'B', 'C', 'D'], 256 ['A', 'B', 'C', 'E'], 257 ], 258 }) 259 self.compile(tokenize, grammar) 260 self.assertParse( 261 "A B C D", 262 ('goal_0', 'A', 'B', 'C', 'D')) 263 self.assertParse( 264 "A B C E", 265 ('goal_1', 'A', 'B', 'C', 'E')) 266 267 def testLeftFactorMultiLevel(self): 268 """Test left-factoring again on a nonterminal introduced by 269 left-factoring.""" 270 tokenize = lexer.LexicalGrammar("FOR IN TO BY ( ) = ;", 271 VAR=r'[A-Za-z]+') 272 273 # The first left-factoring pass on `stmt` will left-factor `FOR ( VAR`. 274 # A second pass is needed to left-factor `= expr TO expr`. 275 grammar = Grammar({ 276 'stmt': [ 277 ['expr', ';'], 278 ['FOR', '(', 'VAR', 'IN', 'expr', ')', 'stmt'], 279 ['FOR', '(', 'VAR', '=', 'expr', 'TO', 'expr', ')', 'stmt'], 280 ['FOR', '(', 'VAR', '=', 'expr', 'TO', 'expr', 281 'BY', 'expr', ')', 'stmt'], 282 ['IF', '(', 'expr', ')', 'stmt'], 283 ], 284 'expr': [ 285 ['VAR'], 286 ], 287 }) 288 self.compile(tokenize, grammar) 289 self.assertParse( 290 "FOR (x IN y) z;", 291 ('stmt_1', 'FOR', '(', 'x', 'IN', 'y', ')', 292 ('stmt_0', 'z', ';'))) 293 self.assertParse( 294 "FOR (x = y TO z) x;", 295 ('stmt_2', 'FOR', '(', 'x', '=', 'y', 'TO', 'z', ')', 296 ('stmt_0', 'x', ';'))) 297 self.assertParse( 298 "FOR (x = y TO z BY w) x;", 299 ('stmt_3', 'FOR', '(', 'x', '=', 'y', 'TO', 'z', 'BY', 'w', ')', 300 ('stmt_0', 'x', ';'))) 301 302 def testFirstFirstConflict(self): 303 """This grammar is unambiguous, but is not LL(1) due to a first/first conflict. 304 305 Cribbed from: https://stackoverflow.com/a/17047370/94977 306 """ 307 308 tokenize = lexer.LexicalGrammar("A B C") 309 grammar = Grammar({ 310 's': [ 311 ['x', 'B'], 312 ['y', 'C'], 313 ], 314 'x': [ 315 prod(['A'], "x"), 316 ], 317 'y': [ 318 prod(['A'], "y"), 319 ], 320 }) 321 self.compile(tokenize, grammar) 322 323 self.assertParse("A B", ('s_0', ('x', 'A'), 'B')) 324 self.assertParse("A C", ('s_1', ('y', 'A'), 'C')) 325 326 def testLeftHandSideExpression(self): 327 """Example of a grammar that's in SLR(1) but hard to smoosh into an LL(1) form. 328 329 This is taken from the ECMAScript grammar. 330 331 ...Of course, it's not really possible to enforce the desired syntactic 332 restrictions in LR(k) either; the ES grammar matches `(x + y) = z` and 333 an additional attribute grammar (IsValidSimpleAssignmentTarget) is 334 necessary to rule it out. 335 """ 336 self.compile( 337 lexer.LexicalGrammar("= +", VAR=r'[a-z]+\b'), 338 Grammar({ 339 'AssignmentExpression': [ 340 ['AdditiveExpression'], 341 ['LeftHandSideExpression', '=', 'AssignmentExpression'], 342 ], 343 'AdditiveExpression': [ 344 ['LeftHandSideExpression'], 345 ['AdditiveExpression', '+', 'LeftHandSideExpression'], 346 ], 347 'LeftHandSideExpression': [ 348 ['VAR'], 349 ] 350 }) 351 ) 352 self.assertParse("z = x + y") 353 self.assertNoParse( 354 "x + y = z", 355 message="expected one of ['+', 'end of input'], got '='") 356 357 def testDeepRecursion(self): 358 grammar = Grammar({ 359 'expr': [ 360 ['SYMBOL'], 361 ['(', ')'], 362 ['(', 'exprs', ')'], 363 ], 364 'exprs': [ 365 ['expr'], 366 ['exprs', 'expr'], 367 ], 368 }) 369 self.compile(LispTokenizer, grammar) 370 371 N = 3000 372 s = "x" 373 t = ('expr_0', 'x') 374 for i in range(N): 375 s = "(" + s + ")" 376 t = ('expr_2', '(', t, ')') 377 378 result = self.parse(s) 379 380 # Python can't check that result == t; it causes a RecursionError. 381 # Testing that repr(result) == repr(t), same deal. So: 382 for i in range(N): 383 self.assertIsInstance(result, tuple) 384 self.assertEqual(len(result), 4) 385 self.assertEqual(result[0], 'expr_2') 386 self.assertEqual(result[1], '(') 387 self.assertEqual(result[3], ')') 388 result = result[2] 389 390 def testExpandOptional(self): 391 grammar = Grammar({'goal': [[]]}) 392 empties = {} 393 # Unit test for rewrites.expand_optional_symbols_in_rhs 394 self.assertEqual( 395 list(rewrites.expand_optional_symbols_in_rhs(['ONE', 'TWO', '3'], 396 grammar, empties)), 397 [(['ONE', 'TWO', '3'], {})]) 398 self.assertEqual( 399 list(rewrites.expand_optional_symbols_in_rhs( 400 ['a', 'b', Optional('c')], grammar, empties)), 401 [(['a', 'b'], {2: None}), 402 (['a', 'b', 'c'], {})]) 403 self.assertEqual( 404 list(rewrites.expand_optional_symbols_in_rhs( 405 [Optional('a'), Optional('b')], grammar, empties)), 406 [([], {0: None, 1: None}), 407 (['a'], {1: None}), 408 (['b'], {0: None}), 409 (['a', 'b'], {})]) 410 411 def testEmptyGrammar(self): 412 tokenize = lexer.LexicalGrammar("X") 413 self.compile(tokenize, Grammar({'goal': [[]]})) 414 self.assertParse("", ('goal',)) 415 self.assertNoParse( 416 "X", 417 message="expected 'end of input', got 'X' (line 1)") 418 419 def testOptionalEmpty(self): 420 tokenize = lexer.LexicalGrammar("X Y") 421 grammar = Grammar({ 422 'a': [ 423 [Optional('b'), Optional('c')], 424 ], 425 'b': [ 426 prod(['X'], 'b'), 427 ], 428 'c': [ 429 prod(['Y'], 'c'), 430 ] 431 }) 432 self.compile(tokenize, grammar) 433 self.assertParse("", ('a', None, None)) 434 self.assertParse("X", ('a', ('b', 'X'), None)) 435 self.assertParse("Y", ('a', None, ('c', 'Y'))) 436 self.assertParse("X Y", ('a', ('b', 'X'), ('c', 'Y'))) 437 438 def testOptional(self): 439 tokenize = lexer.LexicalGrammar('[ ] , X') 440 grammar = Grammar({ 441 'array': [ 442 ['[', Optional('elision'), ']'], 443 ['[', 'elements', ']'], 444 ['[', 'elements', ',', Optional('elision'), ']'] 445 ], 446 'elements': [ 447 [Optional('elision'), 'X'], 448 ['elements', ',', Optional('elision'), 'X'] 449 ], 450 'elision': [ 451 [','], 452 ['elision', ','] 453 ] 454 }) 455 self.compile(tokenize, grammar) 456 self.assertParse("[]", 457 ('array_0', '[', None, ']')) 458 self.assertParse("[,]", 459 ('array_0', '[', ',', ']')) 460 self.assertParse( 461 "[,,X,,X,]", 462 ('array_2', 463 '[', 464 ('elements_1', 465 ('elements_0', 466 ('elision_1', 467 ',', 468 ','), 469 'X'), 470 ',', 471 ',', 472 'X'), 473 ',', 474 None, 475 ']')) 476 477 def testPositiveLookahead(self): 478 self.compile( 479 lexer.LexicalGrammar('A B + ( )'), 480 Grammar({ 481 'goal': [ 482 [LookaheadRule(frozenset({'A', 'B'}), True), 'expr'], 483 ], 484 'expr': [ 485 ['term'], 486 ['expr', '+', 'term'], 487 ], 488 'term': [ 489 ['A'], 490 ['B'], 491 ['(', 'expr', ')'], 492 ] 493 }) 494 ) 495 self.assertNoParse( 496 "(A)", 497 message="expected one of ['A', 'B'], got '('") 498 self.assertParse("A + B") 499 500 def testNegativeLookahead(self): 501 tokenize = lexer.LexicalGrammar('a b') 502 rules = { 503 'goal': [ 504 [LookaheadRule(frozenset({'a'}), False), 'abs'], 505 ], 506 'abs': [ 507 ['a'], 508 ['b'], 509 ['abs', 'a'], 510 ['abs', 'b'], 511 ], 512 } 513 514 self.compile(tokenize, Grammar(rules)) 515 self.assertNoParse("a b", message="expected 'b', got 'a'") 516 self.assertParse( 517 'b a', 518 ('goal', ('abs_2', 'b', 'a'))) 519 520 # In simple cases like this, the lookahead restriction can even 521 # disambiguate a grammar that would otherwise be ambiguous. 522 rules['goal'].append(prod(['a'], 'goal_a')) 523 self.compile(tokenize, Grammar(rules)) 524 self.assertParse('a', ('goal_a', 'a')) 525 526 def disabledNegativeLookaheadDisambiguation(self): 527 tokenize = lexer.LexicalGrammar( 528 '( ) { } ; function =', 529 IDENT=r'[A-Za-z_][A-Za-z_0-9]*') 530 grammar = Grammar({ 531 'stmts': [ 532 ['stmt'], 533 ['stmts', 'stmt'], 534 ], 535 'stmt': [ 536 [LookaheadRule(set=frozenset({'function'}), positive=False), 537 'expr', ';'], 538 ['fndecl'], 539 ], 540 'fndecl': [ 541 ['function', 'IDENT', '(', ')', '{', Optional('stmt'), '}'], 542 ], 543 'expr': [ 544 ['term'], 545 ['IDENT', '=', 'expr'], 546 ], 547 'term': [ 548 ['(', 'expr', ')'], 549 ['fndecl'], 550 ['term', '(', 'expr', ')'], 551 ], 552 }) 553 self.compile(tokenize, grammar) 554 555 # Test that without the lookahead restriction, we reject this grammar 556 # (it's ambiguous): 557 del grammar['stmt'][0][0] 558 self.assertRaisesRegex(ValueError, 559 'banana', 560 lambda: gen.compile(grammar)) 561 562 self.assertParse( 563 'function f() { x = function y() {}; }', 564 ('stmt', 1, 565 ('fndecl', 566 'function', 'f', '(', ')', '{', 567 ('stmt', 0, 568 ('expr', 1, 569 'x', 570 '=', 571 ('expr', 0, 572 ('term', 1, 573 ('fndecl', 574 'function', 'y', '(', ')', 575 '{', None, '}')))), 576 ';')))) 577 578 self.assertParse( 579 '(function g(){});', 580 ('stmts', 0, 581 ('stmt', 0, 582 ('term', 1, 583 ('fndecl', 584 'function', 'g', '(', ')', '{', None, '}')), 585 ';'))) 586 587 def testTrailingLookahead(self): 588 """Lookahead at the end of a production is banned.""" 589 tokenize = lexer.LexicalGrammar('IF ( X ) ELSE OTHER ;') 590 grammar = gen.Grammar({ 591 'goal': [['stmt']], 592 'stmt': [ 593 ['OTHER', ';'], 594 ['IF', '(', 'X', ')', 'stmt', 595 LookaheadRule(frozenset({'ELSE'}), False)], 596 ['IF', '(', 'X', ')', 'stmt', 'ELSE', 'stmt'], 597 ], 598 }) 599 600 def stmt_0(): 601 return ('stmt_0', 'OTHER', ';') 602 603 def stmt_1(t): 604 return ('stmt_1', 'IF', '(', 'X', ')', t) 605 606 def stmt_2(t, e): 607 return ('stmt_2', 'IF', '(', 'X', ')', t, 'ELSE', e) 608 609 self.compile(tokenize, grammar) 610 self.assertParse('IF(X) OTHER;', stmt_1(stmt_0())) 611 self.assertParse('IF(X) OTHER; ELSE OTHER;', 612 stmt_2(stmt_0(), stmt_0())) 613 self.assertParse('IF(X) IF(X) OTHER; ELSE OTHER; ELSE OTHER;', 614 stmt_2(stmt_2(stmt_0(), stmt_0()), stmt_0())) 615 self.assertParse('IF(X) OTHER; ELSE IF(X) OTHER; ELSE OTHER;', 616 stmt_2(stmt_0(), stmt_2(stmt_0(), stmt_0()))) 617 self.assertParse('IF(X) IF(X) OTHER; ELSE OTHER;', 618 stmt_1(stmt_2(stmt_0(), stmt_0()))) 619 620 def testLookaheadBeforeOptional(self): 621 self.compile( 622 lexer.LexicalGrammar( 623 '= : _', 624 PUBLIC=r'public\b', 625 IDENT=r'[a-z]+\b', 626 NUM=r'[0-9]\b'), 627 Grammar({ 628 'decl': [ 629 [ 630 LookaheadRule(frozenset({'IDENT'}), True), 631 Optional('attrs'), 632 'pat', '=', 'NUM' 633 ], 634 ], 635 'attrs': [ 636 ['attr'], 637 ['attrs', 'attr'], 638 ], 639 'attr': [ 640 ['PUBLIC', ':'], 641 ['IDENT', ':'], 642 ], 643 'pat': [ 644 ['IDENT'], 645 ['_'], 646 ], 647 }) 648 ) 649 self.assertEqual( 650 self.parse("x = 0"), 651 ("decl", None, "x", "=", "0")) 652 self.assertParse("thread: x = 0") 653 self.assertNoParse( 654 "public: x = 0", 655 message="expected 'IDENT', got 'public'") 656 self.assertNoParse("_ = 0", message="expected 'IDENT', got '_'") 657 self.assertParse("funny: public: x = 0") 658 self.assertParse("funny: _ = 0") 659 660 def testForLookahead(self): 661 grammar = Grammar({ 662 'Stmt': [ 663 [';'], 664 ['ForStmt'], 665 ], 666 'ForStmt': [ 667 ["for", "(", LookaheadRule(frozenset({"let"}), False), 668 "Expr", ";", ";", ")", "Stmt"], 669 ], 670 'Expr': [ 671 ["0"], 672 ["let"], 673 ], 674 }) 675 self.compile(lexer.LexicalGrammar("for ( let ; ) 0"), grammar) 676 self.assertParse("for (0;;) ;") 677 self.assertNoParse("for (let;;) ;", message="expected '0', got 'let'") 678 679 def testLookaheadDisambiguation(self): 680 """A lookahead restriction should be able to rule out certain nonterminals entirely.""" 681 682 grammar = Grammar({ 683 'Script': [ 684 ['Statement'], 685 ['Statement', 'Statement'], 686 ], 687 'Statement': [ 688 [LookaheadRule(frozenset({'function'}), False), 'Expression', ';'], 689 ['Function'], 690 ], 691 'Function': [ 692 ['function', 'x', '(', ')', '{', '}'], 693 ], 694 'Expression': [ 695 ['Primary'], 696 ['++', 'Primary'], 697 ['Primary', '++'], 698 ], 699 'Primary': [ 700 ['Function'], 701 ['x'], 702 ], 703 }) 704 705 self.compile(lexer.LexicalGrammar("function x ( ) { } ++ ;"), grammar) 706 self.assertParse("function x() {}") 707 self.assertParse("++function x() {};") 708 self.assertNoParse("++function x() {}", message="unexpected end") 709 # TODO: The parser generator fails to handle this case because it does 710 # not forward the restriction from producting a Function to the 711 # Primitive rule. Therefore, `Function [lookahead: ;]` is incorrectly 712 # reduced to a `Primitive [lookahead: ;]` 713 # self.assertNoParse("function x() {}++;", message="got ';'") 714 self.assertParse("function x() {} ++x;") 715 716 # XXX to test: combination of lookaheads, ++, +-, -+, -- 717 # XXX todo: find an example where lookahead canonicalization matters 718 719 def testHugeExample(self): 720 grammar = Grammar( 721 { 722 'grammar': [['nt_def_or_blank_line'], 723 ['grammar', 'nt_def_or_blank_line']], 724 'arg': [['sigil', 'NT']], 725 'args': [['arg'], ['args', ',', 'arg']], 726 'definite_sigil': [['~'], ['+']], 727 'exclusion': [['terminal'], 728 ['nonterminal'], 729 ['CHR', 'through', 'CHR']], 730 'exclusion_list': [['exclusion'], 731 ['exclusion_list', 'or', 'exclusion']], 732 'ifdef': [['[', 'definite_sigil', 'NT', ']']], 733 'line_terminator': [['NT'], ['NTALT']], 734 'lookahead_assertion': [ 735 ['==', 'terminal'], 736 ['!=', 'terminal'], 737 ['<!', 'NT'], 738 ['<!', '{', 'lookahead_exclusions', '}']], 739 'lookahead_exclusion': [['lookahead_exclusion_element'], 740 ['lookahead_exclusion', 741 'lookahead_exclusion_element']], 742 'lookahead_exclusion_element': [['terminal'], 743 ['no_line_terminator_here']], 744 'lookahead_exclusions': [['lookahead_exclusion'], 745 ['lookahead_exclusions', ',', 746 'lookahead_exclusion']], 747 'no_line_terminator_here': [ 748 ['[', 'no', 'line_terminator', 'here', ']']], 749 'nonterminal': [['NT'], ['NTCALL', '[', 'args', ']']], 750 'nt_def': [['nt_lhs', 'EQ', 'NL', 'rhs_lines', 'NL'], 751 ['nt_lhs', 'EQ', 'one', 'of', 'NL', 752 't_list_lines', 'NL']], 753 'nt_def_or_blank_line': [['NL'], ['nt_def']], 754 'nt_lhs': [['NT'], ['NTCALL', '[', 'params', ']']], 755 'param': [['NT']], 756 'params': [['param'], ['params', ',', 'param']], 757 'rhs': [['symbols'], ['[', 'empty', ']']], 758 'rhs_line': [[Optional(inner='ifdef'), 'rhs', 759 Optional(inner='PRODID'), 'NL'], 760 ['PROSE', 'NL']], 761 'rhs_lines': [['rhs_line'], ['rhs_lines', 'rhs_line']], 762 'sigil': [['definite_sigil'], ['?']], 763 'symbol': [['terminal'], 764 ['nonterminal'], 765 ['nonterminal', '?'], 766 ['nonterminal', 'but', 'not', 'exclusion'], 767 ['nonterminal', 'but', 'not', 'one', 'of', 768 'exclusion_list'], 769 ['[', 'lookahead', 'lookahead_assertion', ']'], 770 ['no_line_terminator_here'], 771 ['WPROSE']], 772 'symbols': [['symbol'], ['symbols', 'symbol']], 773 't_list_line': [['terminal_seq', 'NL']], 774 't_list_lines': [['t_list_line'], 775 ['t_list_lines', 't_list_line']], 776 'terminal': [['T'], ['CHR']], 777 'terminal_seq': [['terminal'], ['terminal_seq', 'terminal']] 778 }, 779 variable_terminals='EQ T CHR NTCALL NT NTALT ' 780 'PRODID PROSE WPROSE'.split() 781 ) 782 783 # Note: This lexical grammar is not suitable for use with incremental 784 # parsing. 785 emu_grammar_lexer = lexer.LexicalGrammar( 786 # the operators and keywords: 787 "[ ] { } , ~ + ? <! == != " 788 "but empty here lookahead no not of one or through", 789 NL="\n", 790 # any number of colons together 791 EQ=r':+', 792 # terminals of the ES grammar, quoted with backticks 793 T=r'`[^` \n]+`|```', 794 # also terminals, denoting control characters 795 CHR=r'<[A-Z]+>|U\+[0-9A-f]{4}', 796 # nonterminals that will be followed by boolean parameters 797 NTCALL=r'(?:uri|[A-Z])\w*(?=\[)', 798 # nonterminals (also, boolean parameters) 799 NT=r'(?:uri|[A-Z])\w*', 800 # nonterminals wrapped in vertical bars for no apparent reason 801 NTALT=r'\|[A-Z]\w+\|', 802 # the spec also gives a few productions names 803 PRODID=r'#[A-Za-z]\w*', 804 # prose to the end of the line 805 PROSE=r'>.*', 806 # prose wrapped in square brackets 807 WPROSE=r'\[>[^]]*\]', 808 ) 809 810 self.compile(emu_grammar_lexer, grammar) 811 812 source = """\ 813 IdentifierReference[Yield, Await] : 814 Identifier 815 [~Yield] `yield` 816 [~Await] `await` 817 818 """ 819 820 self.assertParse(source) 821 822 def testParameterizedProductions(self): 823 passthru = ('Yield', Var('Yield')), 824 name = Nt("name", passthru) 825 stmt = Nt("stmt", passthru) 826 stmts = Nt("stmts", passthru) 827 grammar = Grammar({ 828 'script': [ 829 ['def'], 830 ['script', 'def'], 831 ], 832 'def': [ 833 [ 834 'function', 'IDENT', '(', ')', '{', 835 Nt('stmts', (('Yield', False),)), '}' 836 ], 837 [ 838 'function', '*', 'IDENT', '(', ')', '{', 839 Nt('stmts', (('Yield', True),)), '}' 840 ], 841 ], 842 'stmts': NtDef(('Yield',), [ 843 [stmt], 844 [stmts, stmt], 845 ], None), 846 'stmt': NtDef(('Yield',), [ 847 [name, "(", ")", ";"], 848 [name, "=", name, ";"], 849 Production(["yield", name, ";"], 850 reducer=CallMethod("yield_stmt", [1]), 851 condition=('Yield', True)), 852 ], None), 853 'name': NtDef(('Yield',), [ 854 ["IDENT"], 855 # Specifically ask for a method here, because otherwise we 856 # wouldn't get one and then type checking would fail. 857 Production(["yield"], 858 CallMethod("yield_as_name", []), 859 condition=('Yield', False)), 860 ], None), 861 }, variable_terminals=["IDENT"]) 862 self.compile(lexer.LexicalGrammar("( ) { } ; * = function yield", 863 IDENT=r'[A-Za-z]\w*'), 864 grammar) 865 self.assertParse("function* farm() { cow = pig; yield cow; }") 866 self.assertNoParse( 867 "function city() { yield toOncomingTraffic; }", 868 message="expected one of ['(', '='], got 'toOncomingTraffic'") 869 self.assertNoParse( 870 "function* farm() { yield = corn; yield yield; }", 871 message="expected 'IDENT', got '='") 872 873 def testMissingParameterError(self): 874 grammar = { 875 'Foo': [ 876 ['Bar'], 877 ], 878 'Bar': NtDef(('Arg',), [ 879 ['NUM'], 880 Production(['STR'], 881 reducer=0, 882 condition=('Arg', True)), 883 ], None), 884 } 885 886 self.assertRaisesRegex(ValueError, "missing parameters for 'Bar'", 887 lambda: Grammar(grammar)) 888 889 def testCanonicalLR(self): 890 """Example 4.39 (grammar 4.20) from the book.""" 891 892 # Modified as marked below 893 grammar = Grammar({ 894 "S": [ 895 ["L", "=", "R"], 896 ["R"], 897 ], 898 "L": [ 899 ["*", "R"], 900 ["id"], 901 ], 902 "R": [ 903 ["L"], 904 # added so we can have a negative test, showing that 905 # `R = R` is not an S: 906 ["7"], 907 ], 908 }) 909 self.compile(lexer.LexicalGrammar("id = * 7"), grammar) 910 self.assertParse("id = *id") 911 self.assertParse("*id = id") 912 self.assertParse("id = 7") 913 self.assertNoParse("7 = id", 914 message="expected 'end of input', got '='") 915 916 def testLookaheadWithCanonicalLR(self): 917 """Only a lookahead assertion makes this grammar unambiguous.""" 918 tokenize = lexer.LexicalGrammar("async => { } ;", Identifier=r'\w+') 919 grammar = Grammar({ 920 "script": [ 921 ["Expression", ";"], 922 ], 923 "Expression": [ 924 ["PrimaryExpression"], 925 ["async", "Identifier", "=>", "AsyncConciseBody"], 926 ], 927 "AsyncConciseBody": [ 928 [LookaheadRule(set=frozenset(["{"]), positive=False), 929 "Expression"], 930 ["{", "}"], 931 ], 932 "PrimaryExpression": [ 933 ["{", "}"], 934 ], 935 }) 936 937 self.compile(tokenize, grammar) 938 self.assertParse("{};") 939 self.assertParse("async x => {};") 940 self.assertParse("async x => async y => {};") 941 942 def testMultiGoal(self): 943 tokenize = lexer.LexicalGrammar("WHILE DEF FN { } ( ) -> ;", ID=r'\w+') 944 grammar = Grammar({ 945 "stmt": [ 946 ["expr", ";"], 947 ["{", "stmts", "}"], 948 ["WHILE", "(", "expr", ")", "stmt"], 949 ["DEF", "ID", "(", "ID", ")", "{", Optional("stmts"), "}"], 950 ], 951 "stmts": [ 952 ["stmt"], 953 ["stmts", "stmt"], 954 ], 955 "expr": [ 956 ["FN", "ID", "->", "expr"], 957 ["call_expr"], 958 ], 959 "call_expr": [ 960 ["ID"], 961 ["call_expr", "(", "expr", ")"], 962 ["(", "expr", ")"], 963 ], 964 }, goal_nts=["stmts", "expr"]) 965 self.compile(tokenize, grammar) 966 self.assertParse("WHILE ( x ) { decx ( x ) ; }", goal="stmts") 967 self.assertNoParse( 968 "WHILE ( x ) { decx ( x ) ; }", goal="expr", 969 message="expected one of ['(', 'FN', 'ID'], got 'WHILE'") 970 self.assertParse("f(x);", goal="stmts") 971 self.assertNoParse("f(x);", goal="expr", 972 message="expected 'end of input', got ';'") 973 self.assertParse("(FN x -> f ( x ))(x)", goal="expr") 974 self.assertNoParse("(FN x -> f ( x ))(x)", goal="stmts", 975 message="unexpected end of input") 976 977 def testStaggeredItems(self): 978 """Items in a state can have different amounts of leading context.""" 979 # In this example grammar, after "A" "B", we're in a state that 980 # contains these two items (ignoring lookahead): 981 # goal ::= "A" "B" · y 982 # x ::= "B" · stars "X" 983 # 984 # Likewise, after `"A" "B" stars`, we have: 985 # x ::= "B" stars · "X" 986 # y ::= stars · "Y" 987 # stars ::= stars · "*" 988 tokenize = lexer.LexicalGrammar("A B * X Y") 989 grammar = Grammar({ 990 "goal": [ 991 ["A", "x"], 992 ["A", "B", "y"], 993 ], 994 "x": [ 995 ["B", "stars", "X"], 996 ], 997 "y": [ 998 ["stars", "Y"], 999 ], 1000 "stars": [ 1001 ["*"], 1002 ["stars", "*"], 1003 ], 1004 }) 1005 self.compile(tokenize, grammar) 1006 self.assertParse("A B * * * X") 1007 self.assertParse("A B * * * Y") 1008 1009 def testCheckCycleFree(self): 1010 tokenize = lexer.LexicalGrammar("!") 1011 grammar = Grammar({ 1012 "problem": [ 1013 ["one", "two"], 1014 ], 1015 "one": [ 1016 ["!"], 1017 ], 1018 "two": [ 1019 [Optional("problem")], 1020 ], 1021 }) 1022 self.compile(tokenize, grammar) 1023 self.assertParse("! ! ! ! !") 1024 1025 def testReduceActions(self): 1026 tokenize = lexer.LexicalGrammar("+ - * / ( )", 1027 NUM=r'[0-9]\w*', 1028 VAR=r'[A-Za-z]\w*') 1029 grammar = Grammar({ 1030 "expr": [ 1031 ["term"], 1032 prod(["expr", "+", "term"], "add"), 1033 prod(["expr", "-", "term"], "sub"), 1034 ], 1035 "term": [ 1036 ["unary"], 1037 prod(["term", "*", "unary"], "mul"), 1038 prod(["term", "/", "unary"], "div"), 1039 ], 1040 "unary": [ 1041 ["prim"], 1042 prod(["-", "prim"], "neg"), 1043 ], 1044 "prim": [ 1045 prod(["(", "expr", ")"], "parens"), 1046 prod(["NUM"], "num"), 1047 prod(["VAR"], "var"), 1048 ], 1049 }, goal_nts=['expr']) 1050 1051 self.compile(tokenize, grammar) 1052 self.assertParse("X", ('var', 'X')) 1053 self.assertParse("3 + 4", ('add', ('num', '3'), '+', ('num', '4'))) 1054 self.assertParse( 1055 "2 * 3 + 4 * (5 + 7)", 1056 ( 1057 'add', 1058 ('mul', ('num', '2'), '*', ('num', '3')), 1059 '+', 1060 ( 1061 'mul', 1062 ('num', '4'), 1063 '*', 1064 ('parens', '(', 1065 ('add', ('num', '5'), '+', ('num', '7')), ')')))) 1066 self.assertParse( 1067 "1 / (1 + 1 / (1 + 1 / (1 + 1)))", 1068 ( 1069 'div', ('num', '1'), '/', ( 1070 'parens', '(', ( 1071 'add', ('num', '1'), '+', ( 1072 'div', ('num', '1'), '/', ( 1073 'parens', '(', ( 1074 'add', ('num', '1'), '+', ( 1075 'div', ('num', '1'), '/', ( 1076 'parens', '(', ( 1077 'add', ('num', '1'), '+', 1078 ('num', '1')), 1079 ')'))), 1080 ')'))), 1081 ')'))) 1082 1083 def testConvenienceMethodTypeInference(self): 1084 """A method can be called only in an intermediate reduce expression.""" 1085 1086 # The reduce expression `f(g($0))`. 1087 reducer = CallMethod("f", [CallMethod("g", [0])]) 1088 1089 # The grammar `goal ::= NAME => f(g($1))`. 1090 grammar = Grammar( 1091 { 1092 'goal': [Production(['NAME'], reducer)], 1093 }, 1094 variable_terminals=['NAME']) 1095 1096 # Since the return value of f() is used as the value of a `goal`, 1097 # we infer that f() returns a goal. 1098 self.assertEqual( 1099 grammar.methods['f'].return_type, 1100 jsparagus.types.Type('goal')) 1101 1102 # Since the return value of g() isn't used except as an argument, we 1103 # just give it the type `g`. 1104 self.assertEqual( 1105 grammar.methods['g'].return_type, 1106 jsparagus.types.Type('g')) 1107 1108 # Since g() is passed to f(), we infer this: 1109 self.assertEqual( 1110 grammar.methods['f'].argument_types, 1111 [jsparagus.types.Type('g')]) 1112 1113 def testEpsilonFreeTransform(self): 1114 tokenize = lexer.LexicalGrammar('{ } X') 1115 grammar = Grammar({ 1116 'goal': [ 1117 ['{', 'xlist', '}'], 1118 ], 1119 'xlist': [ 1120 [], 1121 ['xlist', 'X'], 1122 ], 1123 }) 1124 self.compile(tokenize, grammar) 1125 self.assertParse("{}", ('goal', '{', ('xlist_0',), '}')) 1126 1127 def compile_as_js( 1128 self, 1129 grammar_source: str, 1130 goals: typing.Optional[typing.Iterable[str]] = None, 1131 verbose: bool = False, 1132 ) -> None: 1133 """Like self.compile(), but generate a parser from ESGrammar, 1134 with ASI support, using the JS lexer. 1135 """ 1136 from js_parser.lexer import JSLexer 1137 from js_parser import load_es_grammar 1138 from js_parser import generate_js_parser_tables 1139 1140 grammar = parse_esgrammar( 1141 grammar_source, 1142 filename="es-simplified.esgrammar", 1143 extensions=[], 1144 goals=goals, 1145 synthetic_terminals=load_es_grammar.ECMASCRIPT_SYNTHETIC_TERMINALS, 1146 terminal_names=load_es_grammar.TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR) 1147 grammar = generate_js_parser_tables.hack_grammar(grammar) 1148 base_parser_class = gen.compile(grammar, verbose=verbose) 1149 1150 # "type: ignore" because poor mypy can't cope with the runtime codegen 1151 # we're doing here. 1152 class JSParser(base_parser_class): # type: ignore 1153 def __init__(self, goal='Script', builder=None): 1154 super().__init__(goal, builder) 1155 self._goal = goal 1156 # self.debug = True 1157 1158 def clone(self): 1159 return JSParser(self._goal, self.methods) 1160 1161 def on_recover(self, error_code, lexer, stv): 1162 """Check that ASI error recovery is really acceptable.""" 1163 if error_code == 'asi': 1164 if not self.closed and stv.term != '}' and not lexer.saw_line_terminator(): 1165 lexer.throw("missing semicolon") 1166 else: 1167 assert error_code == 'do_while_asi' 1168 1169 self.tokenize = JSLexer 1170 self.parser_class = JSParser 1171 1172 def testExtraGoal(self): 1173 1174 grammar_source = """ 1175StuffToIgnore_ThisWorksAroundAnUnrelatedBug: 1176 Identifier 1177 IdentifierName 1178 1179Hat : 1180 `^` 1181 1182ArrowFunction : 1183 `^` `=>` 1184 Hat `*` `=>` 1185 1186Script : 1187 `?` `?` ArrowFunction 1188 `?` `?` [lookahead <! {`async`} ] Hat `of` 1189 1190LazyArrowFunction : 1191 ArrowFunction 1192 """ 1193 1194 def try_it(goals): 1195 self.compile_as_js(grammar_source, goals=goals) 1196 self.assertParse("? ? ^ =>", goal='Script') 1197 self.assertParse("? ? ^ of", goal='Script') 1198 1199 try_it(['Script', 'LazyArrowFunction']) 1200 try_it(['Script']) 1201 1202 1203if __name__ == '__main__': 1204 unittest.main() 1205