#!/usr/bin/env python3

import io
import re
import unittest
import typing

import jsparagus
from jsparagus import gen, lexer, rewrites
from jsparagus.grammar import (Grammar, Production, CallMethod, Nt,
                               Optional, LookaheadRule, NtDef, Var)
from js_parser.parse_esgrammar import parse_esgrammar


LispTokenizer = lexer.LexicalGrammar("( )", SYMBOL=r'[!%&*+:<=>?@A-Z^_a-z~]+')


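# Helper used throughout these tests: wrap a production body in a Production
# whose reducer calls `method_name` with every matched element as an argument,
# e.g. prod(['expr', '+', 'term'], 'add') reduces via CallMethod('add', [0, 1, 2]).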
def prod(body, method_name):
    return Production(body, CallMethod(method_name, list(range(len(body)))))


class GenTestCase(unittest.TestCase):
    def compile(self, tokenize, grammar, **kwargs):
        """Compile a grammar. Use this when you expect compilation to
        succeed."""
        self.tokenize = tokenize
        self.parser_class = gen.compile(grammar, **kwargs)

    def parse(self, text, goal=None):
        if goal is None:
            parser = self.parser_class()
        else:
            parser = self.parser_class(goal=goal)
        lexer = self.tokenize(parser)
        lexer.write(text)
        return lexer.close()

    def compile_multi(self, tokenize, grammar):
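        # gen.compile_multi returns an object with one parse_<goal>() method
        # per goal nonterminal; copy each one onto this test case so tests
        # can call self.parse_<goal>(...) directly.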
        self.tokenize = tokenize
        obj = gen.compile_multi(grammar)
        for attr in dir(obj):
            if attr.startswith("parse_"):
                setattr(self, attr, getattr(obj, attr))

    def assertParse(self, s, expected=None, *, goal=None):
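        # When `expected` is None, we only check that parsing succeeds.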
        result = self.parse(s, goal=goal)
        if expected is not None:
            self.assertEqual(expected, result)

    def assertNoParse(self, s, *, goal=None, message="banana"):
        if goal is None:
            kwargs = {}
        else:
            kwargs = {"goal": goal}
        self.assertRaisesRegex(
            SyntaxError,
            re.escape(message),
            lambda: self.parse(s, **kwargs))

    def testSimple(self):
        grammar = parse_esgrammar(
            """
            expr :
                SYMBOL  => $0
                `(` tail

            tail :
                `)`  => $0
                expr tail
            """,
            terminal_names=["SYMBOL"]
        )
        self.compile(LispTokenizer, grammar)

        self.assertParse(
            "(lambda (x) (* x x))",
            ('expr_1',
                '(',
                ('tail_1',
                    'lambda',
                    ('tail_1',
                        ('expr_1', '(', ('tail_1', 'x', ')')),
                        ('tail_1',
                            ('expr_1',
                                '(',
                                ('tail_1',
                                    '*',
                                    ('tail_1',
                                        'x',
                                        ('tail_1', 'x', ')')))),
                            ')')))))

    def testEnd(self):
        self.compile(
            lexer.LexicalGrammar("ONE TWO"),
            Grammar({
                'goal': [
                    ['ONE', 'TWO']
                ]
            })
        )
        self.assertNoParse("ONE TWO TWO",
                           message="expected 'end of input', got 'TWO'")

    def testList(self):
        list_grammar = Grammar({
            'prelist': [
                ['word', 'list']
            ],
            'list': [
                ['word'],
                ['list', 'word'],
            ],
            'word': [
                ['SYMBOL']
            ],
        })
        self.compile(LispTokenizer, list_grammar)
        self.assertParse(
            "the quick brown fox jumped over the lazy dog",
            ('prelist',
                'the',
                ('list_1',
                    ('list_1',
                        ('list_1',
                            ('list_1',
                                ('list_1',
                                    ('list_1',
                                        ('list_1',
                                            'quick',
                                            'brown'),
                                        'fox'),
                                    'jumped'),
                                'over'),
                            'the'),
                        'lazy'),
                    'dog')))

    def testArithmetic(self):
        tokenize = lexer.LexicalGrammar(
            "+ - * / ( )",
            NUM=r'[0-9]\w*',
            VAR=r'[A-Za-z]\w*')
        arith_grammar = Grammar({
            'expr': [
                ['term'],
                ['expr', '+', 'term'],
                ['expr', '-', 'term'],
            ],
            'term': [
                ['prim'],
                ['term', '*', 'prim'],
                ['term', '/', 'prim'],
            ],
            'prim': [
                ['NUM'],
                ['VAR'],
                ['(', 'expr', ')'],
            ],
        })
        self.compile(tokenize, arith_grammar)

        self.assertParse(
            '2 * 3 + 4 * (5 + 7)',
            ('expr_1',
                ('term_1', '2', '*', '3'),
                '+',
                ('term_1',
                    '4',
                    '*',
                    ('prim_2',
                        '(',
                        ('expr_1', '5', '+', '7'),
                        ')'))))

        self.assertNoParse(
            "(",
            message="unexpected end of input")
        self.assertNoParse(
            ")",
            message="expected one of ['(', 'NUM', 'VAR'], got ')'")

    def testAmbiguous(self):
        # This grammar should fail verification.
        # It's ambiguous: is ABC s(A)y(BC) or s(AB)y(C)?
        grammar = Grammar({
            'goal': [
                ['s', 'y'],
            ],
            's': [
                ['A'],
                ['s', 'B'],
            ],
            'y': [
                ['C'],
                ['B', 'C'],
            ],
        })

        out = io.StringIO()
        self.assertRaisesRegex(ValueError, r"conflict",
                               lambda: gen.generate_parser(out, grammar))

    def testAmbiguousEmpty(self):
        """Reject grammars that are ambiguous due to empty productions.

        (Empty productions are ones that match the empty string.)"""

        def check(rules):
            grammar = Grammar(rules, goal_nts=['goal'])
            out = io.StringIO()
            self.assertRaisesRegex(
                ValueError,
                r"ambiguous grammar|conflict",
                lambda: gen.generate_parser(out, grammar))

        check({'goal': [[], []]})
        check({'goal': [[Optional('X')], []]})
        check({'goal': [[Optional('X')], [Optional('Y')]]})
        check({'goal': [[Optional('X'), Optional('Y')], [Optional('Z')]]})

        # Issue #3: This also has an ambiguity; the empty string matches
        # either `goal ::= [empty]` or `goal ::= phrase; phrase ::= [empty]`.
        check({
            'goal': [[Optional('phrase')]],
            'phrase': [[Optional('X')]],
        })

        # Input "X" is ambiguous, could be ('goal', ('a', None), ('a', 'X'))
        # or the other 'a' could be the one that's missing.
        check({
            'goal': [['a', 'a']],
            'a': [[Optional('X')]],
        })

    def testLeftFactor(self):
        """Most basic left-factoring test."""
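        # Left-factoring rewrites alternatives that share a prefix, here
        # `goal ::= A` and `goal ::= A B`, so the parser can commit to the
        # common `A` first and decide between the alternatives afterward.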
        tokenize = lexer.LexicalGrammar("A B")
        grammar = Grammar({
            'goal': [
                ['A'],
                ['A', 'B'],
            ],
        })

        self.compile(tokenize, grammar)
        self.assertParse("A", 'A')
        self.assertParse("A B", ('goal_1', 'A', 'B'))

    def testLeftFactorMulti(self):
        """Test left-factoring with common prefix of length >1."""
        tokenize = lexer.LexicalGrammar("A B C D E")
        grammar = Grammar({
            'goal': [
                ['A', 'B', 'C', 'D'],
                ['A', 'B', 'C', 'E'],
            ],
        })
        self.compile(tokenize, grammar)
        self.assertParse(
            "A B C D",
            ('goal_0', 'A', 'B', 'C', 'D'))
        self.assertParse(
            "A B C E",
            ('goal_1', 'A', 'B', 'C', 'E'))

    def testLeftFactorMultiLevel(self):
        """Test left-factoring again on a nonterminal introduced by
        left-factoring."""
        tokenize = lexer.LexicalGrammar("FOR IN TO BY ( ) = ;",
                                        VAR=r'[A-Za-z]+')

        # The first left-factoring pass on `stmt` will left-factor `FOR ( VAR`.
        # A second pass is needed to left-factor `= expr TO expr`.
        grammar = Grammar({
            'stmt': [
                ['expr', ';'],
                ['FOR', '(', 'VAR', 'IN', 'expr', ')', 'stmt'],
                ['FOR', '(', 'VAR', '=', 'expr', 'TO', 'expr', ')', 'stmt'],
                ['FOR', '(', 'VAR', '=', 'expr', 'TO', 'expr',
                 'BY', 'expr', ')', 'stmt'],
                ['IF', '(', 'expr', ')', 'stmt'],
            ],
            'expr': [
                ['VAR'],
            ],
        })
        self.compile(tokenize, grammar)
        self.assertParse(
            "FOR (x IN y) z;",
            ('stmt_1', 'FOR', '(', 'x', 'IN', 'y', ')',
             ('stmt_0', 'z', ';')))
        self.assertParse(
            "FOR (x = y TO z) x;",
            ('stmt_2', 'FOR', '(', 'x', '=', 'y', 'TO', 'z', ')',
             ('stmt_0', 'x', ';')))
        self.assertParse(
            "FOR (x = y TO z BY w) x;",
            ('stmt_3', 'FOR', '(', 'x', '=', 'y', 'TO', 'z', 'BY', 'w', ')',
             ('stmt_0', 'x', ';')))

    def testFirstFirstConflict(self):
        """This grammar is unambiguous, but is not LL(1) due to a first/first conflict.

        Cribbed from: https://stackoverflow.com/a/17047370/94977
        """

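        # Both alternatives of `s` start with a phrase whose FIRST set is
        # {A}, so an LL(1) parser cannot choose between `x` and `y` after one
        # token of lookahead. An LR parser can defer the choice until it sees
        # the following B or C.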
        tokenize = lexer.LexicalGrammar("A B C")
        grammar = Grammar({
            's': [
                ['x', 'B'],
                ['y', 'C'],
            ],
            'x': [
                prod(['A'], "x"),
            ],
            'y': [
                prod(['A'], "y"),
            ],
        })
        self.compile(tokenize, grammar)

        self.assertParse("A B", ('s_0', ('x', 'A'), 'B'))
        self.assertParse("A C", ('s_1', ('y', 'A'), 'C'))

    def testLeftHandSideExpression(self):
        """Example of a grammar that's in SLR(1) but hard to smoosh into an LL(1) form.

        This is taken from the ECMAScript grammar.

        ...Of course, it's not really possible to enforce the desired syntactic
        restrictions in LR(k) either; the ES grammar matches `(x + y) = z` and
        an additional attribute grammar (IsValidSimpleAssignmentTarget) is
        necessary to rule it out.
        """
        self.compile(
            lexer.LexicalGrammar("= +", VAR=r'[a-z]+\b'),
            Grammar({
                'AssignmentExpression': [
                    ['AdditiveExpression'],
                    ['LeftHandSideExpression', '=', 'AssignmentExpression'],
                ],
                'AdditiveExpression': [
                    ['LeftHandSideExpression'],
                    ['AdditiveExpression', '+', 'LeftHandSideExpression'],
                ],
                'LeftHandSideExpression': [
                    ['VAR'],
                ]
            })
        )
        self.assertParse("z = x + y")
        self.assertNoParse(
            "x + y = z",
            message="expected one of ['+', 'end of input'], got '='")

    def testDeepRecursion(self):
        grammar = Grammar({
            'expr': [
                ['SYMBOL'],
                ['(', ')'],
                ['(', 'exprs', ')'],
            ],
            'exprs': [
                ['expr'],
                ['exprs', 'expr'],
            ],
        })
        self.compile(LispTokenizer, grammar)

        N = 3000
        s = "x"
        t = ('expr_0', 'x')
        for i in range(N):
            s = "(" + s + ")"
            t = ('expr_2', '(', t, ')')

        result = self.parse(s)

        # Python can't check that result == t: comparing such deeply nested
        # tuples raises RecursionError, and comparing repr(result) with
        # repr(t) has the same problem. So unwrap one level at a time:
        for i in range(N):
            self.assertIsInstance(result, tuple)
            self.assertEqual(len(result), 4)
            self.assertEqual(result[0], 'expr_2')
            self.assertEqual(result[1], '(')
            self.assertEqual(result[3], ')')
            result = result[2]

    def testExpandOptional(self):
        grammar = Grammar({'goal': [[]]})
        empties = {}
        # Unit test for rewrites.expand_optional_symbols_in_rhs
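        # Each result is a pair (expanded_rhs, removals): one copy of the
        # right-hand side for each subset of Optional elements kept, with
        # `removals` mapping the index of each omitted element to None.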
        self.assertEqual(
            list(rewrites.expand_optional_symbols_in_rhs(['ONE', 'TWO', '3'],
                                                         grammar, empties)),
            [(['ONE', 'TWO', '3'], {})])
        self.assertEqual(
            list(rewrites.expand_optional_symbols_in_rhs(
                ['a', 'b', Optional('c')], grammar, empties)),
            [(['a', 'b'], {2: None}),
             (['a', 'b', 'c'], {})])
        self.assertEqual(
            list(rewrites.expand_optional_symbols_in_rhs(
                [Optional('a'), Optional('b')], grammar, empties)),
            [([], {0: None, 1: None}),
             (['a'], {1: None}),
             (['b'], {0: None}),
             (['a', 'b'], {})])

    def testEmptyGrammar(self):
        tokenize = lexer.LexicalGrammar("X")
        self.compile(tokenize, Grammar({'goal': [[]]}))
        self.assertParse("", ('goal',))
        self.assertNoParse(
            "X",
            message="expected 'end of input', got 'X' (line 1)")

    def testOptionalEmpty(self):
        tokenize = lexer.LexicalGrammar("X Y")
        grammar = Grammar({
            'a': [
                [Optional('b'), Optional('c')],
            ],
            'b': [
                prod(['X'], 'b'),
            ],
            'c': [
                prod(['Y'], 'c'),
            ]
        })
        self.compile(tokenize, grammar)
        self.assertParse("", ('a', None, None))
        self.assertParse("X", ('a', ('b', 'X'), None))
        self.assertParse("Y", ('a', None, ('c', 'Y')))
        self.assertParse("X Y", ('a', ('b', 'X'), ('c', 'Y')))

    def testOptional(self):
        tokenize = lexer.LexicalGrammar('[ ] , X')
        grammar = Grammar({
            'array': [
                ['[', Optional('elision'), ']'],
                ['[', 'elements', ']'],
                ['[', 'elements', ',', Optional('elision'), ']']
            ],
            'elements': [
                [Optional('elision'), 'X'],
                ['elements', ',', Optional('elision'), 'X']
            ],
            'elision': [
                [','],
                ['elision', ',']
            ]
        })
        self.compile(tokenize, grammar)
        self.assertParse("[]",
                         ('array_0', '[', None, ']'))
        self.assertParse("[,]",
                         ('array_0', '[', ',', ']'))
        self.assertParse(
            "[,,X,,X,]",
            ('array_2',
                '[',
                ('elements_1',
                    ('elements_0',
                        ('elision_1',
                            ',',
                            ','),
                        'X'),
                    ',',
                    ',',
                    'X'),
                ',',
                None,
                ']'))

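    # LookaheadRule(set, positive) places a restriction on the next token:
    # with positive=True the next token must be in `set`; with positive=False
    # it must not be. The next two tests exercise each direction.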
    def testPositiveLookahead(self):
        self.compile(
            lexer.LexicalGrammar('A B + ( )'),
            Grammar({
                'goal': [
                    [LookaheadRule(frozenset({'A', 'B'}), True), 'expr'],
                ],
                'expr': [
                    ['term'],
                    ['expr', '+', 'term'],
                ],
                'term': [
                    ['A'],
                    ['B'],
                    ['(', 'expr', ')'],
                ]
            })
        )
        self.assertNoParse(
            "(A)",
            message="expected one of ['A', 'B'], got '('")
        self.assertParse("A + B")

    def testNegativeLookahead(self):
        tokenize = lexer.LexicalGrammar('a b')
        rules = {
            'goal': [
                [LookaheadRule(frozenset({'a'}), False), 'abs'],
            ],
            'abs': [
                ['a'],
                ['b'],
                ['abs', 'a'],
                ['abs', 'b'],
            ],
        }

        self.compile(tokenize, Grammar(rules))
        self.assertNoParse("a b", message="expected 'b', got 'a'")
        self.assertParse(
            'b a',
            ('goal', ('abs_2', 'b', 'a')))

        # In simple cases like this, the lookahead restriction can even
        # disambiguate a grammar that would otherwise be ambiguous.
        rules['goal'].append(prod(['a'], 'goal_a'))
        self.compile(tokenize, Grammar(rules))
        self.assertParse('a', ('goal_a', 'a'))

    def disabledNegativeLookaheadDisambiguation(self):
        tokenize = lexer.LexicalGrammar(
            '( ) { } ; function =',
            IDENT=r'[A-Za-z_][A-Za-z_0-9]*')
        grammar = Grammar({
            'stmts': [
                ['stmt'],
                ['stmts', 'stmt'],
            ],
            'stmt': [
                [LookaheadRule(set=frozenset({'function'}), positive=False),
                 'expr', ';'],
                ['fndecl'],
            ],
            'fndecl': [
                ['function', 'IDENT', '(', ')', '{', Optional('stmt'), '}'],
            ],
            'expr': [
                ['term'],
                ['IDENT', '=', 'expr'],
            ],
            'term': [
                ['(', 'expr', ')'],
                ['fndecl'],
                ['term', '(', 'expr', ')'],
            ],
        })
        self.compile(tokenize, grammar)

        # Test that without the lookahead restriction, we reject this grammar
        # (it's ambiguous):
        del grammar['stmt'][0][0]
        self.assertRaisesRegex(ValueError,
                               'banana',
                               lambda: gen.compile(grammar))

        self.assertParse(
            'function f() { x = function y() {}; }',
            ('stmt', 1,
                ('fndecl',
                    'function', 'f', '(', ')', '{',
                    ('stmt', 0,
                        ('expr', 1,
                            'x',
                            '=',
                            ('expr', 0,
                                ('term', 1,
                                    ('fndecl',
                                        'function', 'y', '(', ')',
                                        '{', None, '}')))),
                        ';'))))

        self.assertParse(
            '(function g(){});',
            ('stmts', 0,
                ('stmt', 0,
                    ('term', 1,
                        ('fndecl',
                            'function', 'g', '(', ')', '{', None, '}')),
                    ';')))

    def testTrailingLookahead(self):
        """A lookahead restriction at the end of a production (dangling `else`)."""
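        # The [lookahead != ELSE] restriction on the else-less `IF` production
        # implements the usual dangling-`else` rule: each `ELSE` binds to the
        # nearest unmatched `IF`, as the nested assertions below check.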
        tokenize = lexer.LexicalGrammar('IF ( X ) ELSE OTHER ;')
        grammar = Grammar({
            'goal': [['stmt']],
            'stmt': [
                ['OTHER', ';'],
                ['IF', '(', 'X', ')', 'stmt',
                 LookaheadRule(frozenset({'ELSE'}), False)],
                ['IF', '(', 'X', ')', 'stmt', 'ELSE', 'stmt'],
            ],
        })

        def stmt_0():
            return ('stmt_0', 'OTHER', ';')

        def stmt_1(t):
            return ('stmt_1', 'IF', '(', 'X', ')', t)

        def stmt_2(t, e):
            return ('stmt_2', 'IF', '(', 'X', ')', t, 'ELSE', e)

        self.compile(tokenize, grammar)
        self.assertParse('IF(X) OTHER;', stmt_1(stmt_0()))
        self.assertParse('IF(X) OTHER; ELSE OTHER;',
                         stmt_2(stmt_0(), stmt_0()))
        self.assertParse('IF(X) IF(X) OTHER; ELSE OTHER; ELSE OTHER;',
                         stmt_2(stmt_2(stmt_0(), stmt_0()), stmt_0()))
        self.assertParse('IF(X) OTHER; ELSE IF(X) OTHER; ELSE OTHER;',
                         stmt_2(stmt_0(), stmt_2(stmt_0(), stmt_0())))
        self.assertParse('IF(X) IF(X) OTHER; ELSE OTHER;',
                         stmt_1(stmt_2(stmt_0(), stmt_0())))

    def testLookaheadBeforeOptional(self):
        self.compile(
            lexer.LexicalGrammar(
                '= : _',
                PUBLIC=r'public\b',
                IDENT=r'[a-z]+\b',
                NUM=r'[0-9]\b'),
            Grammar({
                'decl': [
                    [
                        LookaheadRule(frozenset({'IDENT'}), True),
                        Optional('attrs'),
                        'pat', '=', 'NUM'
                    ],
                ],
                'attrs': [
                    ['attr'],
                    ['attrs', 'attr'],
                ],
                'attr': [
                    ['PUBLIC', ':'],
                    ['IDENT', ':'],
                ],
                'pat': [
                    ['IDENT'],
                    ['_'],
                ],
            })
        )
        self.assertEqual(
            self.parse("x = 0"),
            ("decl", None, "x", "=", "0"))
        self.assertParse("thread: x = 0")
        self.assertNoParse(
            "public: x = 0",
            message="expected 'IDENT', got 'public'")
        self.assertNoParse("_ = 0", message="expected 'IDENT', got '_'")
        self.assertParse("funny: public: x = 0")
        self.assertParse("funny: _ = 0")

    def testForLookahead(self):
        grammar = Grammar({
            'Stmt': [
                [';'],
                ['ForStmt'],
            ],
            'ForStmt': [
                ["for", "(", LookaheadRule(frozenset({"let"}), False),
                 "Expr", ";", ";", ")", "Stmt"],
            ],
            'Expr': [
                ["0"],
                ["let"],
            ],
        })
        self.compile(lexer.LexicalGrammar("for ( let ; ) 0"), grammar)
        self.assertParse("for (0;;) ;")
        self.assertNoParse("for (let;;) ;", message="expected '0', got 'let'")

    def testLookaheadDisambiguation(self):
        """A lookahead restriction should be able to rule out certain nonterminals entirely."""

        grammar = Grammar({
            'Script': [
                ['Statement'],
                ['Statement', 'Statement'],
            ],
            'Statement': [
                [LookaheadRule(frozenset({'function'}), False), 'Expression', ';'],
                ['Function'],
            ],
            'Function': [
                ['function', 'x', '(', ')', '{', '}'],
            ],
            'Expression': [
                ['Primary'],
                ['++', 'Primary'],
                ['Primary', '++'],
            ],
            'Primary': [
                ['Function'],
                ['x'],
            ],
        })

        self.compile(lexer.LexicalGrammar("function x ( ) { } ++ ;"), grammar)
        self.assertParse("function x() {}")
        self.assertParse("++function x() {};")
        self.assertNoParse("++function x() {}", message="unexpected end")
        # TODO: The parser generator fails to handle this case because it does
        # not forward the restriction from producing a `Function` to the
        # `Primary` rule. Therefore, `Function [lookahead: ;]` is incorrectly
        # reduced to a `Primary [lookahead: ;]`.
        # self.assertNoParse("function x() {}++;", message="got ';'")
        self.assertParse("function x() {} ++x;")

    # XXX to test: combination of lookaheads, ++, +-, -+, --
    # XXX todo: find an example where lookahead canonicalization matters

    def testHugeExample(self):
        grammar = Grammar(
            {
                'grammar': [['nt_def_or_blank_line'],
                            ['grammar', 'nt_def_or_blank_line']],
                'arg': [['sigil', 'NT']],
                'args': [['arg'], ['args', ',', 'arg']],
                'definite_sigil': [['~'], ['+']],
                'exclusion': [['terminal'],
                              ['nonterminal'],
                              ['CHR', 'through', 'CHR']],
                'exclusion_list': [['exclusion'],
                                   ['exclusion_list', 'or', 'exclusion']],
                'ifdef': [['[', 'definite_sigil', 'NT', ']']],
                'line_terminator': [['NT'], ['NTALT']],
                'lookahead_assertion': [
                    ['==', 'terminal'],
                    ['!=', 'terminal'],
                    ['<!', 'NT'],
                    ['<!', '{', 'lookahead_exclusions', '}']],
                'lookahead_exclusion': [['lookahead_exclusion_element'],
                                        ['lookahead_exclusion',
                                         'lookahead_exclusion_element']],
                'lookahead_exclusion_element': [['terminal'],
                                                ['no_line_terminator_here']],
                'lookahead_exclusions': [['lookahead_exclusion'],
                                         ['lookahead_exclusions', ',',
                                          'lookahead_exclusion']],
                'no_line_terminator_here': [
                    ['[', 'no', 'line_terminator', 'here', ']']],
                'nonterminal': [['NT'], ['NTCALL', '[', 'args', ']']],
                'nt_def': [['nt_lhs', 'EQ', 'NL', 'rhs_lines', 'NL'],
                           ['nt_lhs', 'EQ', 'one', 'of', 'NL',
                            't_list_lines', 'NL']],
                'nt_def_or_blank_line': [['NL'], ['nt_def']],
                'nt_lhs': [['NT'], ['NTCALL', '[', 'params', ']']],
                'param': [['NT']],
                'params': [['param'], ['params', ',', 'param']],
                'rhs': [['symbols'], ['[', 'empty', ']']],
                'rhs_line': [[Optional(inner='ifdef'), 'rhs',
                              Optional(inner='PRODID'), 'NL'],
                             ['PROSE', 'NL']],
                'rhs_lines': [['rhs_line'], ['rhs_lines', 'rhs_line']],
                'sigil': [['definite_sigil'], ['?']],
                'symbol': [['terminal'],
                           ['nonterminal'],
                           ['nonterminal', '?'],
                           ['nonterminal', 'but', 'not', 'exclusion'],
                           ['nonterminal', 'but', 'not', 'one', 'of',
                            'exclusion_list'],
                           ['[', 'lookahead', 'lookahead_assertion', ']'],
                           ['no_line_terminator_here'],
                           ['WPROSE']],
                'symbols': [['symbol'], ['symbols', 'symbol']],
                't_list_line': [['terminal_seq', 'NL']],
                't_list_lines': [['t_list_line'],
                                 ['t_list_lines', 't_list_line']],
                'terminal': [['T'], ['CHR']],
                'terminal_seq': [['terminal'], ['terminal_seq', 'terminal']]
            },
            variable_terminals='EQ T CHR NTCALL NT NTALT '
                               'PRODID PROSE WPROSE'.split()
        )

        # Note: This lexical grammar is not suitable for use with incremental
        # parsing.
        emu_grammar_lexer = lexer.LexicalGrammar(
            # the operators and keywords:
            "[ ] { } , ~ + ? <! == != "
            "but empty here lookahead no not of one or through",
            NL="\n",
            # any number of colons together
            EQ=r':+',
            # terminals of the ES grammar, quoted with backticks
            T=r'`[^` \n]+`|```',
            # also terminals, denoting control characters
            CHR=r'<[A-Z]+>|U\+[0-9A-F]{4}',
            # nonterminals that will be followed by boolean parameters
            NTCALL=r'(?:uri|[A-Z])\w*(?=\[)',
            # nonterminals (also, boolean parameters)
            NT=r'(?:uri|[A-Z])\w*',
            # nonterminals wrapped in vertical bars for no apparent reason
            NTALT=r'\|[A-Z]\w+\|',
            # the spec also gives a few productions names
            PRODID=r'#[A-Za-z]\w*',
            # prose to the end of the line
            PROSE=r'>.*',
            # prose wrapped in square brackets
            WPROSE=r'\[>[^]]*\]',
        )

        self.compile(emu_grammar_lexer, grammar)

        source = """\
        IdentifierReference[Yield, Await] :
          Identifier
          [~Yield] `yield`
          [~Await] `await`

        """

        self.assertParse(source)

    def testParameterizedProductions(self):
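        # Note the trailing comma below: `passthru` is a 1-tuple containing
        # the single pair ('Yield', Var('Yield')), so each Nt built from it
        # passes the enclosing Yield parameter through unchanged.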
        passthru = ('Yield', Var('Yield')),
        name = Nt("name", passthru)
        stmt = Nt("stmt", passthru)
        stmts = Nt("stmts", passthru)
        grammar = Grammar({
            'script': [
                ['def'],
                ['script', 'def'],
            ],
            'def': [
                [
                    'function', 'IDENT', '(', ')', '{',
                    Nt('stmts', (('Yield', False),)), '}'
                ],
                [
                    'function', '*', 'IDENT', '(', ')', '{',
                    Nt('stmts', (('Yield', True),)), '}'
                ],
            ],
            'stmts': NtDef(('Yield',), [
                [stmt],
                [stmts, stmt],
            ], None),
            'stmt': NtDef(('Yield',), [
                [name, "(", ")", ";"],
                [name, "=", name, ";"],
                Production(["yield", name, ";"],
                           reducer=CallMethod("yield_stmt", [1]),
                           condition=('Yield', True)),
            ], None),
            'name': NtDef(('Yield',), [
                ["IDENT"],
                # Specifically ask for a method here, because otherwise we
                # wouldn't get one and then type checking would fail.
                Production(["yield"],
                           CallMethod("yield_as_name", []),
                           condition=('Yield', False)),
            ], None),
        }, variable_terminals=["IDENT"])
        self.compile(lexer.LexicalGrammar("( ) { } ; * = function yield",
                                          IDENT=r'[A-Za-z]\w*'),
                     grammar)
        self.assertParse("function* farm() { cow = pig; yield cow; }")
        self.assertNoParse(
            "function city() { yield toOncomingTraffic; }",
            message="expected one of ['(', '='], got 'toOncomingTraffic'")
        self.assertNoParse(
            "function* farm() { yield = corn; yield yield; }",
            message="expected 'IDENT', got '='")

    def testMissingParameterError(self):
        grammar = {
            'Foo': [
                ['Bar'],
            ],
            'Bar': NtDef(('Arg',), [
                ['NUM'],
                Production(['STR'],
                           reducer=0,
                           condition=('Arg', True)),
            ], None),
        }

        self.assertRaisesRegex(ValueError, "missing parameters for 'Bar'",
                               lambda: Grammar(grammar))

    def testCanonicalLR(self):
        """Example 4.39 (grammar 4.20) from the Dragon Book (Aho, Sethi &
        Ullman, "Compilers: Principles, Techniques, and Tools")."""

        # Modified as marked below
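        # This is the standard example of a grammar that is not SLR(1): in
        # the state containing both [S -> L . = R] and [R -> L .], '=' is in
        # FOLLOW(R), so SLR tables have a shift/reduce conflict on '=' that
        # LR(1)-style lookaheads resolve.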
        grammar = Grammar({
            "S": [
                ["L", "=", "R"],
                ["R"],
            ],
            "L": [
                ["*", "R"],
                ["id"],
            ],
            "R": [
                ["L"],
                # added so we can have a negative test, showing that
                # `R = R` is not an S:
                ["7"],
            ],
        })
        self.compile(lexer.LexicalGrammar("id = * 7"), grammar)
        self.assertParse("id = *id")
        self.assertParse("*id = id")
        self.assertParse("id = 7")
        self.assertNoParse("7 = id",
                           message="expected 'end of input', got '='")

    def testLookaheadWithCanonicalLR(self):
        """Only a lookahead assertion makes this grammar unambiguous."""
        tokenize = lexer.LexicalGrammar("async => { } ;", Identifier=r'\w+')
        grammar = Grammar({
            "script": [
                ["Expression", ";"],
            ],
            "Expression": [
                ["PrimaryExpression"],
                ["async", "Identifier", "=>", "AsyncConciseBody"],
            ],
            "AsyncConciseBody": [
                [LookaheadRule(set=frozenset(["{"]), positive=False),
                 "Expression"],
                ["{", "}"],
            ],
            "PrimaryExpression": [
                ["{", "}"],
            ],
        })

        self.compile(tokenize, grammar)
        self.assertParse("{};")
        self.assertParse("async x => {};")
        self.assertParse("async x => async y => {};")

    def testMultiGoal(self):
        tokenize = lexer.LexicalGrammar("WHILE DEF FN { } ( ) -> ;", ID=r'\w+')
        grammar = Grammar({
            "stmt": [
                ["expr", ";"],
                ["{", "stmts", "}"],
                ["WHILE", "(", "expr", ")", "stmt"],
                ["DEF", "ID", "(", "ID", ")", "{", Optional("stmts"), "}"],
            ],
            "stmts": [
                ["stmt"],
                ["stmts", "stmt"],
            ],
            "expr": [
                ["FN", "ID", "->", "expr"],
                ["call_expr"],
            ],
            "call_expr": [
                ["ID"],
                ["call_expr", "(", "expr", ")"],
                ["(", "expr", ")"],
            ],
        }, goal_nts=["stmts", "expr"])
        self.compile(tokenize, grammar)
        self.assertParse("WHILE ( x ) { decx ( x ) ; }", goal="stmts")
        self.assertNoParse(
            "WHILE ( x ) { decx ( x ) ; }", goal="expr",
            message="expected one of ['(', 'FN', 'ID'], got 'WHILE'")
        self.assertParse("f(x);", goal="stmts")
        self.assertNoParse("f(x);", goal="expr",
                           message="expected 'end of input', got ';'")
        self.assertParse("(FN x -> f ( x ))(x)", goal="expr")
        self.assertNoParse("(FN x -> f ( x ))(x)", goal="stmts",
                           message="unexpected end of input")

    def testStaggeredItems(self):
        """Items in a state can have different amounts of leading context."""
        # In this example grammar, after "A" "B", we're in a state that
        # contains these two items (ignoring lookahead):
        #       goal ::= "A" "B" · y
        #       x ::= "B" · stars "X"
        #
        # Likewise, after `"A" "B" stars`, we have:
        #       x ::= "B" stars · "X"
        #       y ::= stars · "Y"
        #       stars ::= stars · "*"
        tokenize = lexer.LexicalGrammar("A B * X Y")
        grammar = Grammar({
            "goal": [
                ["A", "x"],
                ["A", "B", "y"],
            ],
            "x": [
                ["B", "stars", "X"],
            ],
            "y": [
                ["stars", "Y"],
            ],
            "stars": [
                ["*"],
                ["stars", "*"],
            ],
        })
        self.compile(tokenize, grammar)
        self.assertParse("A B * * * X")
        self.assertParse("A B * * * Y")

    def testCheckCycleFree(self):
        tokenize = lexer.LexicalGrammar("!")
        grammar = Grammar({
            "problem": [
                ["one", "two"],
            ],
            "one": [
                ["!"],
            ],
            "two": [
                [Optional("problem")],
            ],
        })
        self.compile(tokenize, grammar)
        self.assertParse("! ! ! ! !")

    def testReduceActions(self):
        tokenize = lexer.LexicalGrammar("+ - * / ( )",
                                        NUM=r'[0-9]\w*',
                                        VAR=r'[A-Za-z]\w*')
        grammar = Grammar({
            "expr": [
                ["term"],
                prod(["expr", "+", "term"], "add"),
                prod(["expr", "-", "term"], "sub"),
            ],
            "term": [
                ["unary"],
                prod(["term", "*", "unary"], "mul"),
                prod(["term", "/", "unary"], "div"),
            ],
            "unary": [
                ["prim"],
                prod(["-", "prim"], "neg"),
            ],
            "prim": [
                prod(["(", "expr", ")"], "parens"),
                prod(["NUM"], "num"),
                prod(["VAR"], "var"),
            ],
        }, goal_nts=['expr'])

        self.compile(tokenize, grammar)
        self.assertParse("X", ('var', 'X'))
        self.assertParse("3 + 4", ('add', ('num', '3'), '+', ('num', '4')))
        self.assertParse(
            "2 * 3 + 4 * (5 + 7)",
            (
                'add',
                ('mul', ('num', '2'), '*', ('num', '3')),
                '+',
                (
                    'mul',
                    ('num', '4'),
                    '*',
                    ('parens', '(',
                        ('add', ('num', '5'), '+', ('num', '7')), ')'))))
        self.assertParse(
            "1 / (1 + 1 / (1 + 1 / (1 + 1)))",
            (
                'div', ('num', '1'), '/', (
                    'parens', '(', (
                        'add', ('num', '1'), '+', (
                            'div', ('num', '1'), '/', (
                                'parens', '(', (
                                    'add', ('num', '1'), '+', (
                                        'div', ('num', '1'), '/', (
                                            'parens', '(', (
                                                'add', ('num', '1'), '+',
                                                ('num', '1')),
                                            ')'))),
                                ')'))),
                    ')')))

    def testConvenienceMethodTypeInference(self):
        """Type inference for a method that's called only from an intermediate
        position within a reduce expression."""

        # The reduce expression `f(g($0))`.
        reducer = CallMethod("f", [CallMethod("g", [0])])

        # The grammar `goal ::= NAME => f(g($0))`.
        grammar = Grammar(
            {
                'goal': [Production(['NAME'], reducer)],
            },
            variable_terminals=['NAME'])

        # Since the return value of f() is used as the value of a `goal`,
        # we infer that f() returns a goal.
        self.assertEqual(
            grammar.methods['f'].return_type,
            jsparagus.types.Type('goal'))

        # Since the return value of g() isn't used except as an argument, we
        # just give it the type `g`.
        self.assertEqual(
            grammar.methods['g'].return_type,
            jsparagus.types.Type('g'))

        # Since g() is passed to f(), we infer this:
        self.assertEqual(
            grammar.methods['f'].argument_types,
            [jsparagus.types.Type('g')])

    def testEpsilonFreeTransform(self):
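        # `xlist` can match the empty string. The epsilon-free transform
        # rewrites the grammar so the parser never has to reduce an empty
        # production mid-parse; the empty list still appears in the result
        # below as the node ('xlist_0',).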
        tokenize = lexer.LexicalGrammar('{ } X')
        grammar = Grammar({
            'goal': [
                ['{', 'xlist', '}'],
            ],
            'xlist': [
                [],
                ['xlist', 'X'],
            ],
        })
        self.compile(tokenize, grammar)
        self.assertParse("{}", ('goal', '{', ('xlist_0',), '}'))

    def compile_as_js(
            self,
            grammar_source: str,
            goals: typing.Optional[typing.Iterable[str]] = None,
            verbose: bool = False,
    ) -> None:
        """Like self.compile(), but generate a parser from ESGrammar,
        with ASI support, using the JS lexer.
        """
        from js_parser.lexer import JSLexer
        from js_parser import load_es_grammar
        from js_parser import generate_js_parser_tables

        grammar = parse_esgrammar(
            grammar_source,
            filename="es-simplified.esgrammar",
            extensions=[],
            goals=goals,
            synthetic_terminals=load_es_grammar.ECMASCRIPT_SYNTHETIC_TERMINALS,
            terminal_names=load_es_grammar.TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR)
        grammar = generate_js_parser_tables.hack_grammar(grammar)
        base_parser_class = gen.compile(grammar, verbose=verbose)

        # "type: ignore" because poor mypy can't cope with the runtime codegen
        # we're doing here.
        class JSParser(base_parser_class):  # type: ignore
            def __init__(self, goal='Script', builder=None):
                super().__init__(goal, builder)
                self._goal = goal
                # self.debug = True

            def clone(self):
                return JSParser(self._goal, self.methods)

            def on_recover(self, error_code, lexer, stv):
                """Check that ASI error recovery is really acceptable."""
                if error_code == 'asi':
                    if not self.closed and stv.term != '}' and not lexer.saw_line_terminator():
                        lexer.throw("missing semicolon")
                else:
                    assert error_code == 'do_while_asi'

        self.tokenize = JSLexer
        self.parser_class = JSParser

    def testExtraGoal(self):

        grammar_source = """
StuffToIgnore_ThisWorksAroundAnUnrelatedBug:
  Identifier
  IdentifierName

Hat :
  `^`

ArrowFunction :
  `^` `=>`
  Hat `*` `=>`

Script :
  `?` `?` ArrowFunction
  `?` `?` [lookahead <! {`async`} ] Hat `of`

LazyArrowFunction :
  ArrowFunction
            """

        def try_it(goals):
            self.compile_as_js(grammar_source, goals=goals)
            self.assertParse("? ? ^ =>", goal='Script')
            self.assertParse("? ? ^ of", goal='Script')

        try_it(['Script', 'LazyArrowFunction'])
        try_it(['Script'])


if __name__ == '__main__':
    unittest.main()