from __future__ import annotations
# mypy: disallow-untyped-defs, disallow-incomplete-defs, disallow-untyped-calls

import collections
import hashlib
import os
import pickle
import typing
import itertools

from . import types
from .utils import consume, keep_until, split, default_id_dict, default_fwd_dict
from .ordered import OrderedSet, OrderedFrozenSet
from .actions import Action, Replay, Reduce, FilterStates, Seq
from .grammar import End, ErrorSymbol, InitNt, Nt
from .rewrites import CanonicalGrammar
from .lr0 import LR0Generator, Term
from .aps import APS, Edge, Path

# StateAndTransitions objects are indexed using a StateId, which is an integer.
StateId = int

# An Action, or an ordered sequence of Actions, which has to be performed.
DelayedAction = typing.Union[Action, typing.Tuple[Action, ...]]


class StateAndTransitions:
    """This is one state of the parse table, which has transitions based on
    terminals (text), non-terminals (grammar rules) and epsilon (reduce).

    In this model, epsilon transitions are used to represent code to be
    executed, such as reduce actions and any other actions.
    """

    __slots__ = ["index", "locations", "terminals", "nonterminals", "errors",
                 "epsilon", "delayed_actions", "arguments", "backedges", "_hash",
                 "stable_hash"]

    # Numerical index of this state.
    index: StateId

    # The stable_str of each LRItem we could be parsing in this state: the
    # places in grammar productions that tell what we've already parsed,
    # i.e. how we got to this state.
    locations: OrderedFrozenSet[str]

    # Ordered set of Actions which are pushed to the next state after a
    # conflict.
    delayed_actions: OrderedFrozenSet[DelayedAction]

    # Number of arguments of an action state.
    #
    # Instead of having action states with a non-empty replay list of terms,
    # we have a non-empty list of arguments whose size is described by this
    # variable.
    arguments: int

    # Outgoing edges taken when shifting terminals.
    terminals: typing.Dict[str, StateId]

    # Outgoing edges taken when shifting nonterminals after reducing.
    nonterminals: typing.Dict[Nt, StateId]

    # Error symbol transitions.
    errors: typing.Dict[ErrorSymbol, StateId]

    # List of epsilon transitions with associated actions.
    epsilon: typing.List[typing.Tuple[Action, StateId]]

    # Set of edges that lead to this state.
    backedges: OrderedSet[Edge]

    # Cached hash code. This class implements __hash__ and __eq__ in order to
    # help detect equivalent states (which must be merged, for correctness).
    _hash: int

    # A hash code computed the same way as _hash, but used only for
    # human-readable output. The stability is useful for debugging, to match
    # states across multiple runs of the parser generator.
    stable_hash: str

    def __init__(
            self,
            index: StateId,
            locations: OrderedFrozenSet[str],
            delayed_actions: OrderedFrozenSet[DelayedAction] = OrderedFrozenSet(),
            arguments: int = 0
    ) -> None:
        assert isinstance(locations, OrderedFrozenSet)
        assert isinstance(delayed_actions, OrderedFrozenSet)
        self.index = index
        self.terminals = {}
        self.nonterminals = {}
        self.errors = {}
        self.epsilon = []
        self.locations = locations
        self.delayed_actions = delayed_actions
        self.arguments = arguments
        self.backedges = OrderedSet()

        # NOTE: The hash of a state depends on its location in the LR0
        # parse-table, as well as the actions which have not yet been
        # executed.
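
        # Illustrative sketch (not part of the original sources): for a state
        # whose locations are {"A ::= B . C"}, with no delayed actions and no
        # arguments, hashed_content() below yields the stream
        #
        #     "A ::= B . C", "\n", "delayed_actions", "arguments", 0
        #
        # _hash is then the hash() of this stream collected into a tuple, and
        # stable_hash is the first 6 hex digits of the MD5 of its string
        # concatenation, which, unlike hash(), stays stable across runs.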
        def hashed_content() -> typing.Iterator[object]:
            for item in sorted(self.locations):
                yield item
                yield "\n"
            yield "delayed_actions"
            for action in self.delayed_actions:
                yield hash(action)
            yield "arguments"
            yield arguments

        self._hash = hash(tuple(hashed_content()))
        h = hashlib.md5()
        h.update("".join(map(str, hashed_content())).encode())
        self.stable_hash = h.hexdigest()[:6]

    def is_inconsistent(self) -> bool:
        "Returns True if the state transitions are inconsistent."
        # TODO: We could easily allow having a state with a non-terminal
        # transition and other epsilon transitions, as the non-terminal shift
        # transitions are a form of condition based on the fact that a
        # non-terminal, produced by a reduce action, is consumed by the
        # automaton.
        if len(self.terminals) + len(self.nonterminals) + len(self.errors) > 0 \
           and len(self.epsilon) > 0:
            return True
        elif len(self.epsilon) == 1:
            if any(k.is_inconsistent() for k, s in self.epsilon):
                return True
        elif len(self.epsilon) > 1:
            if any(k.is_inconsistent() for k, s in self.epsilon):
                return True
            # NOTE: We can accept multiple conditions as epsilon transitions
            # iff they are checking the same variable with non-overlapping
            # values. This implies that we can implement these conditions as a
            # deterministic switch statement in the code emitter.
            if any(not k.is_condition() for k, s in self.epsilon):
                return True
            iterator = iter(self.epsilon)
            first, _ = next(iterator)
            if any(not first.check_same_variable(k) for k, s in iterator):
                return True
            # "type: ignore" because mypy does not see that the preceding
            # if-statement means all k.condition() actions are FilterFlags.
            pairs = itertools.combinations((k for k, s in self.epsilon), 2)
            if any(not k1.check_different_values(k2) for k1, k2 in pairs):
                return True
        else:
            try:
                self.get_error_symbol()
            except ValueError:
                return True
        return False

    def shifted_edges(self) -> typing.Iterator[
            typing.Tuple[typing.Union[str, Nt, ErrorSymbol], StateId]
    ]:
        k: Term
        s: StateId
        for k, s in self.terminals.items():
            yield (k, s)
        for k, s in self.nonterminals.items():
            yield (k, s)
        for k, s in self.errors.items():
            yield (k, s)

    def edges(self) -> typing.Iterator[typing.Tuple[Term, StateId]]:
        k: Term
        s: StateId
        for k, s in self.terminals.items():
            yield (k, s)
        for k, s in self.nonterminals.items():
            yield (k, s)
        for k, s in self.errors.items():
            yield (k, s)
        for k, s in self.epsilon:
            yield (k, s)
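
    # Illustrative sketch (hypothetical indexes): if this state has index 3
    # and terminals {"a": 7}, calling rewrite_state_indexes() below with
    # state_map == {3: 0, 7: 1} leaves the state with index 0 and terminals
    # {"a": 1}; actions and backedges naming states are rewritten the same
    # way through Action.rewrite_state_indexes.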

    def rewrite_state_indexes(
            self,
            state_map: typing.Dict[StateId, StateId]
    ) -> None:
        def apply_on_term(term: typing.Union[Term, None]) -> Term:
            assert term is not None
            if isinstance(term, Action):
                return term.rewrite_state_indexes(state_map)
            return term

        self.index = state_map[self.index]
        self.terminals = {
            k: state_map[s] for k, s in self.terminals.items()
        }
        self.nonterminals = {
            k: state_map[s] for k, s in self.nonterminals.items()
        }
        self.errors = {
            k: state_map[s] for k, s in self.errors.items()
        }
        self.epsilon = [
            (k.rewrite_state_indexes(state_map), state_map[s])
            for k, s in self.epsilon
        ]
        # We cannot have multiple identical actions jumping to different
        # locations.
        assert len(self.epsilon) == len(set(k for k, _ in self.epsilon))
        self.backedges = OrderedSet(
            Edge(state_map[edge.src], apply_on_term(edge.term))
            for edge in self.backedges
        )

    def get_error_symbol(self) -> typing.Optional[ErrorSymbol]:
        if len(self.errors) > 1:
            raise ValueError("More than one error symbol on the same state.")
        else:
            return next(iter(self.errors), None)

    def __contains__(self, term: object) -> bool:
        if isinstance(term, Action):
            for t, s in self.epsilon:
                if t == term:
                    return True
            return False
        elif isinstance(term, Nt):
            return term in self.nonterminals
        elif isinstance(term, ErrorSymbol):
            return term in self.errors
        else:
            return term in self.terminals

    def __getitem__(self, term: Term) -> StateId:
        if isinstance(term, Action):
            for t, s in self.epsilon:
                if t == term:
                    return s
            raise KeyError(term)
        elif isinstance(term, Nt):
            return self.nonterminals[term]
        elif isinstance(term, ErrorSymbol):
            return self.errors[term]
        else:
            return self.terminals[term]

    def get(self, term: Term, default: object) -> object:
        try:
            return self.__getitem__(term)
        except KeyError:
            return default

    def stable_str(self, states: typing.List[StateAndTransitions]) -> str:
        conflict = ""
        if self.is_inconsistent():
            conflict = " (inconsistent)"
        return "{}{}:\n{}".format(self.stable_hash, conflict, "\n".join([
            "\t{} --> {}".format(k, states[s].stable_hash) for k, s in self.edges()]))

    def __str__(self) -> str:
        conflict = ""
        if self.is_inconsistent():
            conflict = " (inconsistent)"
        return "{}{}:\n{}".format(self.index, conflict, "\n".join([
            "\t{} --> {}".format(k, s) for k, s in self.edges()]))

    def __eq__(self, other: object) -> bool:
        return (isinstance(other, StateAndTransitions)
                and sorted(self.locations) == sorted(other.locations)
                and sorted(self.delayed_actions) == sorted(other.delayed_actions)
                and self.arguments == other.arguments)

    def __hash__(self) -> int:
        return self._hash


DebugInfo = typing.Dict[StateId, int]


class ParseTable:
    """The parser can be represented as a matrix of state transitions where on
    one side we have the current state, and on the other we have the expected
    terminal, non-terminal or epsilon transition.

              a   b   c   A   B   C   #1  #2  #3
            +---+---+---+---+---+---+----+----+----+
        s1  |   |   |   |   |   |   |    |    |    |
        s2  |   |   |   |   |   |   |    |    |    |
        s3  |   |   |   |   |   |   |    |    |    |
        .   |   |   |   |   |   |   |    |    |    |
        .   |   |   |   |   |   |   |    |    |    |
        .   |   |   |   |   |   |   |    |    |    |
        s67 |   |   |   |   |   |   |    |    |    |
        s68 |   |   |   |   |   |   |    |    |    |
        s69 |   |   |   |   |   |   |    |    |    |
            +---+---+---+---+---+---+----+----+----+

    The terminals `a` are the tokens which are read from the input. The
    non-terminals `A` are the symbols which are pushed by the reduce actions
    of the epsilon transitions. The epsilon transitions `#1` are the actions
    which have to be executed as code by the parser.

    A parse table is inconsistent if there is any state which has both an
    epsilon transition and terminal/non-terminal transitions (a shift-reduce
    conflict), or a state with more than one epsilon transition (a
    reduce-reduce conflict). This is equivalent to having a non-deterministic
    state machine.
    """
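
    # Illustrative example (hypothetical states): a state carrying both a
    # terminal edge `"+" --> s12` and an epsilon edge `Reduce(...) --> s13`
    # is a shift-reduce conflict, and a state with 2 Reduce epsilon edges is
    # a reduce-reduce conflict. Both are reported by is_inconsistent() and
    # repaired by fix_inconsistent_table().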

    __slots__ = [
        "actions", "states", "state_cache", "named_goals", "terminals",
        "nonterminals", "debug_info", "exec_modes", "assume_inconsistent"
    ]

    # Map of action identifiers to the corresponding objects.
    actions: typing.List[Action]

    # Map of state identifiers to the corresponding objects.
    states: typing.List[StateAndTransitions]

    # Hash table of state objects, ensuring we never have two equal states.
    state_cache: typing.Dict[StateAndTransitions, StateAndTransitions]

    # List of (Nt, state) tuples which are the entry points of the state
    # machine.
    named_goals: typing.List[typing.Tuple[Nt, StateId]]

    # Set of all terminals.
    terminals: OrderedFrozenSet[typing.Union[str, End]]

    # List of non-terminals.
    nonterminals: typing.List[Nt]

    # Carries the info to be used when generating debug_context. If False,
    # then no debug_context is ever produced.
    debug_info: typing.Union[bool, DebugInfo]

    # Execution modes are used by the code generator to decide which function
    # is executed when. This is a dictionary of OrderedSets, where the keys
    # are the various parsing modes, and the mapped set contains the list of
    # traits which have to be implemented, and consequently which functions
    # would be encoded.
    exec_modes: typing.Optional[typing.DefaultDict[str, OrderedSet[types.Type]]]

    # True if the parse table might be inconsistent. When this is False, we
    # add extra assertions when computing the reduce path.
    assume_inconsistent: bool

    def __init__(
            self,
            grammar: CanonicalGrammar,
            verbose: bool = False,
            progress: bool = False,
            debug: bool = False
    ) -> None:
        self.actions = []
        self.states = []
        self.state_cache = {}
        self.named_goals = []
        self.terminals = grammar.grammar.terminals
        self.nonterminals = typing.cast(
            typing.List[Nt],
            list(grammar.grammar.nonterminals.keys()))

        # typing.cast() doesn't actually check at run time, so let's do that:
        assert all(isinstance(nt, Nt) for nt in self.nonterminals)

        self.debug_info = debug
        self.exec_modes = grammar.grammar.exec_modes
        self.assume_inconsistent = True
        self.create_lr0_table(grammar, verbose, progress)
        self.fix_inconsistent_table(verbose, progress)
        # TODO: Optimize chains of actions into sequences.
        # Optimize by removing unused states.
        self.remove_all_unreachable_state(verbose, progress)
        # TODO: Statically compute replayed terms. (maybe?)
        # Replace reduce actions by programmatic stack manipulation.
        self.lower_reduce_actions(verbose, progress)
        # Fold Replay followed by Unwind instructions.
        self.fold_replay_unwind(verbose, progress)
        # Fold paths which have the same ending.
        self.fold_identical_endings(verbose, progress)
        # Group states with similar non-terminal edges close by, to improve
        # the generated Rust code by grouping matched state numbers.
        self.group_nonterminal_states(verbose, progress)
        # Split shift states from epsilon states.
        # self.group_epsilon_states(verbose, progress)
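
    # Usage sketch (illustrative; assumes a CanonicalGrammar built by the
    # caller):
    #
    #     table = ParseTable(CanonicalGrammar(grammar), verbose=True)
    #     assert not table.is_inconsistent()
    #     table.save("parse_table.pickle")
    #     table = ParseTable.load("parse_table.pickle")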

    def save(self, filename: os.PathLike) -> None:
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, filename: os.PathLike) -> ParseTable:
        with open(filename, 'rb') as f:
            obj = pickle.load(f)
            if len(f.read()) != 0:
                raise ValueError("file has unexpected extra bytes at end")
        if not isinstance(obj, cls):
            raise TypeError("file contains wrong kind of object: expected {}, got {}"
                            .format(cls.__name__, obj.__class__.__name__))
        return obj

    def is_inconsistent(self) -> bool:
        "Returns True if the grammar contains any inconsistent state."
        for s in self.states:
            if s is not None and s.is_inconsistent():
                return True
        return False

    def rewrite_state_indexes(self, state_map: typing.Dict[StateId, StateId]) -> None:
        for s in self.states:
            if s is not None:
                s.rewrite_state_indexes(state_map)
        self.named_goals = [
            (nt, state_map[s]) for nt, s in self.named_goals
        ]

        # After a rewrite, multiple actions (conditions) might jump to the
        # same target; attempt to fold these conditions based on having the
        # same target. If we can merge them, then remove the previous edges
        # (updating the backedges of successor states) and replace them by
        # the newly created edges.
        for s in self.states:
            if s is not None and len(s.epsilon) != 0:
                epsilon_by_dest = collections.defaultdict(list)
                for k, d in s.epsilon:
                    epsilon_by_dest[d].append(k)
                for d, ks in epsilon_by_dest.items():
                    if len(ks) == 1:
                        continue
                    new_ks = ks[0].fold_by_destination(ks)
                    if new_ks == ks:
                        continue
                    # This collection is required by `remove_edge`, but in
                    # this particular case we know for sure that at least one
                    # edge would be added back. Therefore there is no need to
                    # use the content of the set.
                    maybe_unreachable_set: OrderedSet[StateId] = OrderedSet()
                    assert len(new_ks) > 0
                    for k in ks:
                        self.remove_edge(s, k, maybe_unreachable_set)
                    for k in new_ks:
                        self.add_edge(s, k, d)

        self.assert_table_invariants()

    def rewrite_reordered_state_indexes(self) -> None:
        state_map = {
            s.index: i
            for i, s in enumerate(self.states)
            if s is not None
        }
        self.rewrite_state_indexes(state_map)
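
    # Cache sketch (illustrative, hypothetical location strings): new_state()
    # below returns the same object for equal inputs, so callers can test
    # `is_new` to decide whether a state still has to be explored.
    #
    #     is_new, s1 = table.new_state(OrderedFrozenSet(["E ::= E . '+' T"]))
    #     is_new2, s2 = table.new_state(OrderedFrozenSet(["E ::= E . '+' T"]))
    #     assert s1 is s2 and not is_new2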

    def new_state(
            self,
            locations: OrderedFrozenSet[str],
            delayed_actions: OrderedFrozenSet[DelayedAction] = OrderedFrozenSet(),
            arguments: int = 0
    ) -> typing.Tuple[bool, StateAndTransitions]:
        """Get or create a state with an LR0 location and delayed actions.
        Returns a tuple where the first element is whether the state is newly
        created, and the second element is the state object."""
        index = len(self.states)
        state = StateAndTransitions(index, locations, delayed_actions, arguments)
        try:
            return False, self.state_cache[state]
        except KeyError:
            self.state_cache[state] = state
            self.states.append(state)
            return True, state

    def get_state(
            self,
            locations: OrderedFrozenSet[str],
            delayed_actions: OrderedFrozenSet[DelayedAction] = OrderedFrozenSet(),
            arguments: int = 0
    ) -> StateAndTransitions:
        """Like new_state(), but only returns the state, without reporting
        whether it is newly created or not."""
        _, state = self.new_state(locations, delayed_actions, arguments)
        return state

    def remove_state(self, s: StateId, maybe_unreachable_set: OrderedSet[StateId]) -> None:
        state = self.states[s]
        self.clear_edges(state, maybe_unreachable_set)
        del self.state_cache[state]

        # "type: ignore" because the type annotation on `states` doesn't allow
        # entries to be `None`.
        self.states[s] = None  # type: ignore

    def add_edge(
            self,
            src: StateAndTransitions,
            term: Term,
            dest: StateId
    ) -> None:
        assert term not in src
        assert dest < len(self.states)
        if isinstance(term, Action):
            src.epsilon.append((term, dest))
        elif isinstance(term, Nt):
            src.nonterminals[term] = dest
        elif isinstance(term, ErrorSymbol):
            src.errors[term] = dest
        else:
            src.terminals[term] = dest
        self.states[dest].backedges.add(Edge(src.index, term))

    def remove_backedge(
            self,
            src: StateAndTransitions,
            term: Term,
            dest: StateId,
            maybe_unreachable_set: OrderedSet[StateId]
    ) -> None:
        self.states[dest].backedges.remove(Edge(src.index, term))
        maybe_unreachable_set.add(dest)

    def replace_edge(
            self,
            src: StateAndTransitions,
            term: Term,
            dest: StateId,
            maybe_unreachable_set: OrderedSet[StateId]
    ) -> None:
        assert isinstance(dest, int) and dest < len(self.states)

        edge_existed = term in src
        if edge_existed:
            old_dest = src[term]
            self.remove_backedge(src, term, old_dest, maybe_unreachable_set)

        if isinstance(term, Action):
            src.epsilon = [(t, d) for t, d in src.epsilon if t != term]
            src.epsilon.append((term, dest))
        elif isinstance(term, Nt):
            src.nonterminals[term] = dest
        elif isinstance(term, ErrorSymbol):
            src.errors[term] = dest
        else:
            src.terminals[term] = dest
        self.states[dest].backedges.add(Edge(src.index, term))

        self.assert_state_invariants(src)
        self.assert_state_invariants(dest)
        if edge_existed:
            self.assert_state_invariants(old_dest)

    def remove_edge(
            self,
            src: StateAndTransitions,
            term: Term,
            maybe_unreachable_set: OrderedSet[StateId]
    ) -> None:
        edge_existed = term in src
        if edge_existed:
            old_dest = src[term]
            self.remove_backedge(src, term, old_dest, maybe_unreachable_set)
        if isinstance(term, Action):
            src.epsilon = [(t, d) for t, d in src.epsilon if t != term]
        elif isinstance(term, Nt):
            del src.nonterminals[term]
        elif isinstance(term, ErrorSymbol):
            del src.errors[term]
        else:
            del src.terminals[term]

        self.assert_state_invariants(src)
        if edge_existed:
            self.assert_state_invariants(old_dest)
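
    # Invariant sketch (illustrative): the mutators above keep edges and
    # backedges in sync. After
    #
    #     table.add_edge(src, "a", dest.index)
    #
    # Edge(src.index, "a") is a member of states[dest.index].backedges, and
    # after remove_edge() the old destination is recorded in
    # maybe_unreachable_set, so that remove_unreachable_states() can
    # garbage-collect it later if no backedge remains.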

    def clear_edges(
            self,
            src: StateAndTransitions,
            maybe_unreachable_set: OrderedSet[StateId]
    ) -> None:
        """Remove all existing edges, in order to replace them by new ones.
        This is used when resolving shift-reduce conflicts."""
        assert isinstance(src, StateAndTransitions)
        old_dest = []
        for term, dest in src.edges():
            self.remove_backedge(src, term, dest, maybe_unreachable_set)
            old_dest.append(dest)
        src.terminals = {}
        src.nonterminals = {}
        src.errors = {}
        src.epsilon = []

        self.assert_state_invariants(src)
        for dest in old_dest:
            self.assert_state_invariants(dest)

    def assert_table_invariants(self) -> None:
        for s in self.states:
            if s is not None:
                self.assert_state_invariants(s)

    def assert_state_invariants(self, src: typing.Union[StateId, StateAndTransitions]) -> None:
        if not self.debug_info:
            return
        if isinstance(src, int):
            src = self.states[src]
        assert isinstance(src, StateAndTransitions)
        try:
            for term, dest in src.edges():
                assert Edge(src.index, term) in self.states[dest].backedges
            for e in src.backedges:
                assert e.term is not None
                assert self.states[e.src][e.term] == src.index
            if not self.assume_inconsistent:
                assert not src.is_inconsistent()
        except AssertionError as exc:
            print("assert_state_invariants for {}\n".format(src))
            for e in src.backedges:
                print("backedge {} from {}\n".format(e, self.states[e.src]))
            raise exc

    def remove_unreachable_states(
            self,
            maybe_unreachable_set: OrderedSet[StateId]
    ) -> None:
        # TODO: This function is incomplete in case of loops: some cycle might
        # remain isolated while not being reachable from the init states. We
        # should maintain a notion of depth per-state, such that we can
        # identify loops by noticing that all backedges have a larger depth
        # than the current state.
        init: OrderedSet[StateId]
        init = OrderedSet(goal for name, goal in self.named_goals)
        while maybe_unreachable_set:
            next_set: OrderedSet[StateId] = OrderedSet()
            for s in maybe_unreachable_set:
                # Check if the state is reachable; if not, remove the state
                # and fill next_set with all outgoing edges.
                if len(self.states[s].backedges) == 0 and s not in init:
                    self.remove_state(s, next_set)
            maybe_unreachable_set = next_set

    def is_reachable_state(self, s: StateId) -> bool:
        """Check whether the given state is reachable or not."""
        if self.states[s] is None:
            return False
        reachable_back: OrderedSet[StateId] = OrderedSet()
        todo = [s]
        while todo:
            s = todo.pop()
            reachable_back.add(s)
            for edge in self.states[s].backedges:
                if edge.src not in reachable_back:
                    todo.append(edge.src)
        for _, s in self.named_goals:
            if s in reachable_back:
                return True
        return False

    def debug_dump(self) -> None:
        # Sort the grammar by state hash, such that it can be compared
        # before/after grammar modifications.
        temp = [s for s in self.states if s is not None]
        temp = sorted(temp, key=lambda s: s.stable_hash)
        for s in temp:
            print(s.stable_str(self.states))
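
    # Construction sketch (illustrative, hypothetical goal `expr`): for the
    # goal non-terminal `expr`, create_lr0_table() below seeds the work queue
    # with the state for `InitNt(expr) ::= . expr`, then repeatedly asks the
    # LR0Generator for its transitions, creating one state per newly
    # discovered set of LR items and adding edges as it goes. This is the
    # usual LR(0) construction, except that actions are recorded as epsilon
    # edges instead of being resolved right away.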

    def create_lr0_table(
            self,
            grammar: CanonicalGrammar,
            verbose: bool,
            progress: bool
    ) -> None:
        if verbose or progress:
            print("Create LR(0) parse table.")

        goals = grammar.grammar.goals()
        self.named_goals = []

        # Temporary work queue.
        todo: typing.Deque[typing.Tuple[LR0Generator, StateAndTransitions]]
        todo = collections.deque()

        # Record the starting goals in the todo list.
        for nt in goals:
            init_nt = Nt(InitNt(nt), ())
            it = LR0Generator.start(grammar, init_nt)
            s = self.get_state(it.stable_locations())
            todo.append((it, s))
            self.named_goals.append((nt, s.index))

        # Iterate the grammar with sets of LR Items abstracted by the
        # LR0Generator, and create new states in the parse table as long as
        # new sets of LR Items are discovered.
        def visit_grammar() -> typing.Iterator[None]:
            while todo:
                yield  # progress bar.
                # TODO: Compare stack / queue, for the traversal of the states.
                s_it, s = todo.popleft()
                if verbose:
                    print("\nMapping state {} to LR0:\n{}".format(s.stable_hash, s_it))
                for k, sk_it in s_it.transitions().items():
                    locations = sk_it.stable_locations()
                    if not self.term_is_shifted(k):
                        locations = OrderedFrozenSet()
                    is_new, sk = self.new_state(locations)
                    if is_new:
                        todo.append((sk_it, sk))

                    # Add the edge from s to sk with k.
                    self.add_edge(s, k, sk.index)

        consume(visit_grammar(), progress)

        if verbose:
            print("Create LR(0) Table Result:")
            self.debug_dump()

    def term_is_shifted(self, term: typing.Optional[Term]) -> bool:
        return not isinstance(term, Action) or term.follow_edge()

    def is_valid_path(
            self,
            path: typing.Sequence[Edge],
            state: typing.Optional[StateId] = None
    ) -> bool:
        """This function is used to check a list of edges, and returns whether
        it corresponds to a valid path within the parse table. This is useful
        when merging sequences of edges from various locations."""
        if state is None and path != []:
            state = path[0].src
        while path:
            edge = path[0]
            path = path[1:]
            if state != edge.src:
                return False
            assert isinstance(state, StateId)

            term = edge.term
            if term is None and len(path) == 0:
                return True

            row = self.states[state]
            if term not in row:
                return False
            assert term is not None
            state = row[term]
        return True

    def term_is_stacked(self, term: typing.Optional[Term]) -> bool:
        # The `term` argument is annotated as Optional because `Edge.term` is
        # a common argument. If it's ever None in practice, the caller has a
        # bug.
        assert term is not None

        return not isinstance(term, Action)

    def aps_visitor(self, aps: APS, visit: typing.Callable[[APS], bool]) -> None:
        """Visit all the states of the parse table, as if we were running a
        Generalized LR parser.

        However, instead of parsing content, we use this algorithm to
        generate both the content which remains to be parsed as well as the
        context which might have led us to the state from which we started.

        This algorithm takes an APS (Abstract Parser State) and a callback,
        and considers all edges of the parse table, unless restricted by one
        of the previously encountered actions. These restrictions, such as
        replayed lookahead or the path which might be reduced, are used to
        filter out states which are not handled by this parse table.

        For each edge, this function calls the visit callback to know whether
        to stop or continue. The visit callback might capture the APS given
        as argument, to be used for other analyses.
        """
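        # Callback sketch (illustrative): collect every lane which has
        # accumulated at least one lookahead token, and stop exploring it,
        # in the spirit of lookahead_lanes() below.
        #
        #     def visit(aps: APS) -> bool:
        #         if aps.lookahead:
        #             found.append(aps)
        #             return False  # stop this lane
        #         return True       # keep exploring
        #
        #     table.aps_visitor(APS.start(state_id), visit)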
        todo = [aps]
        while todo:
            aps = todo.pop()
            cont = visit(aps)
            if not cont:
                continue
            todo.extend(aps.shift_next(self))

    def context_lanes(self, state: StateId) -> typing.Tuple[bool, typing.List[APS]]:
        """Compute lanes, such that each reduce action can have a set of
        unique stacks to reach the given state. The stacks are assumed to be
        loop-free, by reducing edges at most once.

        In order to avoid attempting to eagerly solve everything using
        context information, we break this loop as soon as we have one token
        of lookahead in a case which does not have enough context.

        The return value is a tuple where the first element is a boolean
        which is True if we should fall back on solving this issue with more
        lookahead, and the second is the list of APS lanes which provide
        enough context to disambiguate the inconsistency of the given
        state."""

        def not_interesting(aps: APS) -> bool:
            reduce_list = [e for e in aps.history if self.term_is_shifted(e.term)]
            has_reduce_loop = len(reduce_list) != len(set(reduce_list))
            return has_reduce_loop

        # The context is a dictionary which maps each stack suffix of an APS
        # stack to a list of edges: for each APS visited, the first edge of
        # its history, which is the action used to follow this path.
        context: typing.DefaultDict[typing.Tuple[Edge, ...], typing.List[Edge]]
        context = collections.defaultdict(lambda: [])

        def has_enough_context(aps: APS) -> bool:
            try:
                assert aps.history[0] in context[tuple(aps.stack)]
                # Check the number of different actions which can reach this
                # location. If there is more than 1, then we do not have
                # enough context.
                return len(set(context[tuple(aps.stack)])) <= 1
            except IndexError:
                return False

        collect = [APS.start(state)]
        enough_context = False
        while not enough_context:
            # print("collect.len = {}".format(len(collect)))
            # Fill the context dictionary with all the sub-stacks which might
            # be encountered by other APS.
            recurse = []
            context = collections.defaultdict(lambda: [])
            while collect:
                aps = collect.pop()
                recurse.append(aps)
                if aps.history == []:
                    continue
                for i in range(len(aps.stack)):
                    context[tuple(aps.stack[i:])].append(aps.history[0])
            assert collect == []

            # print("recurse.len = {}".format(len(recurse)))
            # Iterate over APS which do not yet have enough context
            # information to uniquely identify a single action.
            enough_context = True
            while recurse:
                aps = recurse.pop()
                if not_interesting(aps):
                    # print("discard uninteresting context lane:")
                    # print(aps.string("\tcontext"))
                    continue
                if has_enough_context(aps):
                    collect.append(aps)
                    continue
                # If we do not have enough context but some lookahead is
                # available, attempt to first solve this issue using more
                # lookahead before attempting to use context information.
                if len(aps.lookahead) >= 1:
                    # print("discard context_lanes due to lookahead:")
                    # for aps in itertools.chain(collect, recurse, [aps]):
                    #     print(aps.string("\tcontext"))
                    return True, []
                enough_context = False
                # print("extend starting at:\n{}".format(aps.string("\tcontext")))
                collect.extend(aps.shift_next(self))
            assert recurse == []

        # print("context_lanes:")
        # for aps in collect:
        #     print(aps.string("\tcontext"))

        return False, collect

    def lookahead_lanes(self, state: StateId) -> typing.List[APS]:
        """Compute lanes to collect all lookahead symbols available. After
        each reduce action, there is no need to consider the same
        non-terminal multiple times: we are only interested in lookahead
        tokens, and not in the context information provided by reduce
        actions."""

        record = []
        # After the first reduce action, we do not want to spend too many
        # resources visiting edges which would give us the same information.
        # Therefore, if we have already reduced to a given state, then we
        # skip looking for lookahead tokens that we have already visited.
        #
        # Set of (first-reduce-edge, reducing-base, last-reduce-edge).
        seen_edge_after_reduce: typing.Set[typing.Tuple[Edge, StateId, typing.Optional[Term]]]
        seen_edge_after_reduce = set()

        def find_first_reduce(
                edges: Path
        ) -> typing.Tuple[int, typing.Optional[Edge]]:
            for i, edge in enumerate(edges):
                if not self.term_is_shifted(edge.term):
                    return i, edge
            return 0, None

        def find_last_reduce(
                edges: Path
        ) -> typing.Tuple[int, typing.Optional[Edge]]:
            for i, edge in zip(reversed(range(len(edges))), reversed(edges)):
                if not self.term_is_shifted(edge.term):
                    return i, edge
            return 0, None

        def visit(aps: APS) -> bool:
            # Note: this supposes that we are not considering flags when
            # computing this, as flags might prevent some lookahead
            # investigations.
            reduce_key = None
            first_index, first_reduce = find_first_reduce(aps.history)
            last_index, last_reduce = find_last_reduce(aps.history)
            if first_index != last_index and first_reduce and last_reduce:
                if not isinstance(aps.history[-1].term, Action):
                    reduce_key = (first_reduce, aps.shift[0].src, last_reduce.term)
            has_seen_edge_after_reduce = reduce_key and reduce_key in seen_edge_after_reduce
            has_lookahead = len(aps.lookahead) >= 1
            stop = has_seen_edge_after_reduce or has_lookahead
            # print("stop: {}, size lookahead: {}, seen_edge_after_reduce: {}".format(
            #     stop, len(aps.lookahead), repr(reduce_key)
            # ))
            # print(aps.string("\tvisitor"))
            if stop:
                if has_lookahead:
                    record.append(aps)
                if reduce_key:
                    seen_edge_after_reduce.add(reduce_key)
            return not stop

        self.aps_visitor(APS.start(state), visit)
        return record

    def fix_with_context(self, s: StateId, aps_lanes: typing.List[APS]) -> None:
        raise ValueError("fix_with_context: Not Implemented")
        # # This strategy is about using context information. By using chains
        # # of reduce actions, we are able to increase the knowledge of the
        # # stack content. The stack content is the context which can be used
        # # to determine how to consider a reduction. The stack content is
        # # also called a lane, as defined in the Lane Table algorithm.
        # #
        # # To add context information to the current graph, we add flag
        # # manipulation actions.
        # #
        # # Consider each edge as having an implicit function which can map
        # # one flag value to another. The following implements a unification
        # # algorithm which attempts to solve the question of what the flag
        # # value is, and where it should be changed.
        # #
        # # NOTE: (nbp) I would not be surprised if there is a more
        # # specialized algorithm, but I failed to find one so far, and this
        # # problem definitely looks like a unification problem.
        # Id = collections.namedtuple("Id", "edge")
        # Eq = collections.namedtuple("Eq", "flag_in edge flag_out")
        # Var = collections.namedtuple("Var", "n")
        # SubSt = collections.namedtuple("SubSt", "var by")
        #
        # # Unify expressions, and return one substitution if both
        # # expressions can be unified.
        # def unify_expr(expr1, expr2, swapped=False):
        #     if isinstance(expr1, Eq) and isinstance(expr2, Id):
        #         if expr1.edge != expr2.edge:
        #             # Different edges are ok, but produce no substitutions.
        #             return True
        #         if isinstance(expr1.flag_in, Var):
        #             return SubSt(expr1.flag_in, expr1.flag_out)
        #         if isinstance(expr1.flag_out, Var):
        #             return SubSt(expr1.flag_out, expr1.flag_in)
        #         # We are unifying with a relation which considers the
        #         # current function as an identity function. Having
        #         # different values as input and output fails the
        #         # unification rule.
        #         return expr1.flag_out == expr1.flag_in
        #     if isinstance(expr1, Eq) and isinstance(expr2, Eq):
        #         if expr1.edge != expr2.edge:
        #             # Different edges are ok, but produce no substitutions.
        #             return True
        #         if expr1.flag_in is None and isinstance(expr2.flag_in, Var):
        #             return SubSt(expr2.flag_in, None)
        #         if expr1.flag_out is None and isinstance(expr2.flag_out, Var):
        #             return SubSt(expr2.flag_out, None)
        #         if expr1.flag_in == expr2.flag_in:
        #             if isinstance(expr1.flag_out, Var):
        #                 return SubSt(expr1.flag_out, expr2.flag_out)
        #             elif isinstance(expr2.flag_out, Var):
        #                 return SubSt(expr2.flag_out, expr1.flag_out)
        #             # Reject solutions which are not deterministic. We do
        #             # not want the same input flag to have multiple outputs.
        #             return expr1.flag_out == expr2.flag_out
        #         if expr1.flag_out == expr2.flag_out:
        #             if isinstance(expr1.flag_in, Var):
        #                 return SubSt(expr1.flag_in, expr2.flag_in)
        #             elif isinstance(expr2.flag_in, Var):
        #                 return SubSt(expr2.flag_in, expr1.flag_in)
        #             return True
        #     if not swapped:
        #         return unify_expr(expr2, expr1, True)
        #     return True
        #
        # # Apply a substitution rule to an expression.
        # def subst_expr(subst, expr):
        #     if expr == subst.var:
        #         return True, subst.by
        #     if isinstance(expr, Eq):
        #         subst1, flag_in = subst_expr(subst, expr.flag_in)
        #         subst2, flag_out = subst_expr(subst, expr.flag_out)
        #         return subst1 or subst2, Eq(flag_in, expr.edge, flag_out)
        #     return False, expr
        #
        # # Add an expression to an existing knowledge base which relies on
        # # a set of free variables.
        # def unify_with(expr, knowledge, free_vars):
        #     old_knowledge = knowledge
        #     old_free_vars = free_vars
        #     while True:
        #         subst = None
        #         for rel in knowledge:
        #             subst = unify_expr(rel, expr)
        #             if subst is False:
        #                 raise Error("Failed to find a coherent solution")
        #             if subst is True:
        #                 continue
        #             break
        #         else:
        #             return knowledge + [expr], free_vars
        #         free_vars = [fv for fv in free_vars if fv != subst.var]
        #         # Substitute variables, and re-add rules which have
        #         # substituted vars, to check the changes against other
        #         # rules, in case 2 rules are now in conflict or in case we
        #         # can propagate more variable changes.
        #         subst_rules = [subst_expr(subst, k) for k in knowledge]
        #         knowledge = [rule for changed, rule in subst_rules if not changed]
        #         for changed, rule in subst_rules:
        #             if not changed:
        #                 continue
        #             knowledge, free_vars = unify_with(rule, knowledge, free_vars)
        #
        # # Register boundary conditions as part of the knowledge base, i.e.
        # # that reduce actions are expecting to see the flag value matching
        # # the reduced non-terminal, and that we have no flag value at the
        # # start of every lane head.
        # #
        # # TODO: Catch exceptions from the unify function in case we do not
        # # yet have enough context to disambiguate.
        # rules = []
        # free_vars = []
        # last_free = 0
        # maybe_id_edges = set()
        # nts = set()
        # for aps in aps_lanes:
        #     assert len(aps.stack) >= 1
        #     flag_in = None
        #     for edge in aps.stack[-1]:
        #         i = last_free
        #         last_free += 1
        #         free_vars.append(Var(i))
        #         rule = Eq(flag_in, edge, Var(i))
        #         rules, free_vars = unify_with(rule, rules, free_vars)
        #         flag_in = Var(i)
        #         if flag_in is not None:
        #             maybe_id_edges.add(Id(edge))
        #     edge = aps.stack[-1]
        #     nt = edge.term.update_stack_with().nt
        #     rule = Eq(nt, edge, None)
        #     rules, free_vars = unify_with(rule, rules, free_vars)
        #     nts.add(nt)
        #
        # # We want to produce a parse table where most of the nodes are
        # # ignoring the content of the flag which is being added. Thus we
        # # want to find a solution where most edges are the identity
        # # function.
        # def fill_with_id_functions(rules, free_vars, maybe_id_edges):
        #     min_rules, min_vars = rules, free_vars
        #     for num_id_edges in reversed(range(len(maybe_id_edges))):
        #         for id_edges in itertools.combinations(edges, num_id_edges):
        #             for edge in id_edges:
        #                 new_rules, new_free_vars = unify_with(rule, rules, free_vars)
        #                 if new_free_vars == []:
        #                     return new_rules, new_free_vars
        #                 if len(new_free_vars) < len(min_vars):
        #                     min_vars = new_free_vars
        #                     min_rules = new_rules
        #     return rules, free_vars
        #
        # rules, free_vars = fill_with_id_functions(rules, free_vars, maybe_id_edges)
        # if free_vars != []:
        #     raise Error("Hum … maybe we can try to iterate over the remaining free-variables.")
        # print("debug: Great, we found a solution for a reduce-reduce conflict")
        #
        # # The set of rules describes the function that each edge is
        # # expected to support. If there is an Id(edge), then we know that
        # # we do not have to change the graph for the given edge. If the
        # # rule is Eq(A, edge, B), then we have to (filter A & pop) and
        # # push B, except if A or B is None.
        # #
        # # For each edge, collect the set of rules concerning the edge to
        # # determine which edges have to be transformed to add the
        # # filter&pop and push actions.
        # edge_rules = collections.defaultdict(lambda: [])
        # for rule in rules:
        #     if isinstance(rule, Id):
        #         edge_rules[rule.edge] = None
        #     elif isinstance(rule, Eq):
        #         if edge_rules[rule.edge] is not None:
        #             edge_rules[rule.edge].append(rule)
        #
        # maybe_unreachable_set = set()
        # flag_name = self.get_flag_for(nts)
        # for edge, rules in edge_rules.items():
        #     # If the edge is an identity function, then skip doing any
        #     # modifications on it.
        #     if rules is None:
        #         continue
        #     # Otherwise, create a new state and transition for each mapping.
        #     src = self.states[edge.src]
        #     dest = src[edge.term]
        #     dest_state = self.states[dest]
        #     # TODO: Add some information to avoid having identical hashes
        #     # as the destination.
        #     actions = []
        #     for rule in OrderedFrozenSet(rules):
        #         assert isinstance(rule, Eq)
        #         seq = []
        #         if rule.flag_in is not None:
        #             seq.append(FilterFlag(flag_name, True))
        #             if rule.flag_in != rule.flag_out:
        #                 seq.append(PopFlag(flag_name))
        #         if rule.flag_out is not None and rule.flag_in != rule.flag_out:
        #             seq.append(PushFlag(flag_name, rule.flag_out))
        #         actions.append(Seq(seq))
        #     # Assert that we do not map flag_in more than once.
        #     assert len(set(eq.flag_in for eq in rules)) < len(rules)
        #     # Create the new state and add edges.
        #     is_new, switch = self.new_state(dest.locations, OrderedFrozenSet(actions))
        #     assert is_new
        #     for seq in actions:
        #         self.add_edge(switch, seq, dest)
        #
        #     # Replace the edge from src to dest by an edge from src to the
        #     # newly created switch state, which then decides which flag to
        #     # set before going to the destination target.
        #     self.replace_edge(src, edge.term, switch, maybe_unreachable_set)
        #
        # self.remove_unreachable_states(maybe_unreachable_set)
        # pass

    def fix_with_lookahead(self, s: StateId, aps_lanes: typing.List[APS]) -> None:
        # Find the list of terminals following each action (even reduce
        # actions).
        assert all(len(aps.lookahead) >= 1 for aps in aps_lanes)
        if self.debug_info:
            for aps in aps_lanes:
                print(str(aps))
        maybe_unreachable_set: OrderedSet[StateId] = OrderedSet()

        # For each shifted term, associate a set of states and actions which
        # would have to be executed.
        shift_map: typing.DefaultDict[
            Term,
            typing.List[typing.Tuple[StateAndTransitions, typing.List[Edge]]]
        ]
        shift_map = collections.defaultdict(lambda: [])
        for aps in aps_lanes:
            actions = aps.history
            assert isinstance(actions[-1], Edge)
            src = actions[-1].src
            term = actions[-1].term
            assert term == aps.lookahead[0]
            assert isinstance(term, (str, End, ErrorSymbol, Nt))

            # No need to consider any action beyond the first reduced action,
            # since the reduced action is in charge of replaying the
            # lookahead terms.
            actions = list(keep_until(actions[:-1], lambda edge: not self.term_is_shifted(edge.term)))
            assert all(isinstance(edge.term, Action) for edge in actions)

            # Change the order of the shifted term: shift all actions by 1
            # with the given lookahead term, in order to match the newly
            # generated state machine.
            #
            # Shifting actions with the list of shifted terms is used to
            # record the number of terms to be replayed, as well as to verify
            # whether Lookahead filter actions should accept or reject this
            # lane.
            new_actions = []
            accept = True
            for edge in actions:
                edge_term = edge.term
                assert isinstance(edge_term, Action)
                new_term = edge_term.shifted_action(term)
                if isinstance(new_term, bool):
                    if new_term is False:
                        accept = False
                        break
                    else:
                        continue
                new_actions.append(Edge(edge.src, new_term))
            if accept:
                target_id = self.states[src][term]
                target = self.states[target_id]
                shift_map[term].append((target, new_actions))

        # Restore the new state machine, based on a given state to use as a
        # base and the shift_map corresponding to edges.
        def restore_edges(
                state: StateAndTransitions,
                shift_map: typing.DefaultDict[
                    Term,
                    typing.List[typing.Tuple[StateAndTransitions, typing.List[Edge]]]
                ],
                depth: str
        ) -> None:
            # print("{}starting with {}\n".format(depth, state))
            edges = {}
            for term, actions_list in shift_map.items():
                # print("{}term: {}, lists: {}\n".format(depth, repr(term), repr(actions_list)))
                # Collect all the states reachable after shifting the term.
                # Compute the unique name, based on the locations and actions
                # which are delayed.
                locations: OrderedSet[str] = OrderedSet()
                delayed: OrderedSet[DelayedAction] = OrderedSet()
                new_shift_map: typing.DefaultDict[
                    Term,
                    typing.List[typing.Tuple[StateAndTransitions, typing.List[Edge]]]
                ]
                new_shift_map = collections.defaultdict(lambda: [])
                recurse = False
                if not self.term_is_shifted(term):
                    # There is no more target after a reduce action.
                    actions_list = []
                for target, actions in actions_list:
                    assert isinstance(target, StateAndTransitions)
                    locations |= target.locations
                    delayed |= target.delayed_actions
                    if actions != []:
                        # Pull edges, with delayed actions.
                        edge = actions[0]
                        assert isinstance(edge, Edge)
                        for action in actions:
                            action_term = action.term
                            assert isinstance(action_term, Action)
                            delayed.add(action_term)
                        edge_term = edge.term
                        assert edge_term is not None
                        new_shift_map[edge_term].append((target, actions[1:]))
                        recurse = True
                    else:
                        # Pull edges, as a copy of existing edges.
                        for next_term, next_dest_id in target.edges():
                            next_dest = self.states[next_dest_id]
                            new_shift_map[next_term].append((next_dest, []))

                is_new, new_target = self.new_state(
                    OrderedFrozenSet(locations), OrderedFrozenSet(delayed))
                edges[term] = new_target.index
                if self.debug_info:
                    print("{}is_new = {}, index = {}".format(depth, is_new, new_target.index))
                    print("{}Add: {} -- {} --> {}".format(depth, state.index, str(term), new_target.index))
                    print("{}continue: (is_new: {}) or (recurse: {})".format(depth, is_new, recurse))
                if is_new or recurse:
                    restore_edges(new_target, new_shift_map, depth + " ")

            self.clear_edges(state, maybe_unreachable_set)
            for term, target_id in edges.items():
                self.add_edge(state, term, target_id)
            if self.debug_info:
                print("{}replaced by {}\n".format(depth, state))

        state = self.states[s]
        restore_edges(state, shift_map, "")
        self.remove_unreachable_states(maybe_unreachable_set)

    def fix_inconsistent_state(self, s: StateId, verbose: bool) -> bool:
        # Fixing inconsistent states works one state at a time. The goal is
        # to achieve the same result as the Lane Tracer, but instead of
        # building a table and then mutating the parse state, we mutate the
        # parse state directly.
        #
        # This strategy is simpler, and should be able to reproduce the same
        # graph mutations as seen with the Lane Table algorithm. One of the
        # problems with the Lane Table algorithm is that it assumes reduce
        # operations, and as such it does not apply simply to epsilon
        # transitions which are used as conditions on the parse table.
        #
        # By using push-flag and filter-flag actions, we are able to
        # decompose the Lane Table transformation of the parse table into
        # multiple steps which work one step at a time, and with less
        # duplication of table states.

        state = self.states[s]
        if state is None or not state.is_inconsistent():
            return False

        all_reduce = all(a.update_stack() for a, _ in state.epsilon)
        any_shift = (len(state.terminals) + len(state.nonterminals) + len(state.errors)) > 0
        try_with_context = all_reduce and not any_shift
        try_with_lookahead = not try_with_context
        # if verbose:
        #     print(aps_lanes_str(aps_lanes, "fix_inconsistent_state:", "\taps"))
        if try_with_context:
            if verbose:
                print("\tFix with context.")
            try_with_lookahead, aps_lanes = self.context_lanes(s)
            if not try_with_lookahead:
                assert aps_lanes != []
                self.fix_with_context(s, aps_lanes)
            elif verbose:
                print("\tFallback on fixing with lookahead.")
        if try_with_lookahead:
            if verbose:
                print("\tFix with lookahead.")
            aps_lanes = self.lookahead_lanes(s)
            assert aps_lanes != []
            self.fix_with_lookahead(s, aps_lanes)
        return True
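
    # Decision sketch (illustrative): a state whose epsilon edges are all
    # Reduce actions and which has no shift edges is attempted first with
    # context_lanes(); any other inconsistency, such as a shift-reduce
    # conflict, goes straight to lookahead_lanes() and fix_with_lookahead(),
    # and context fixing itself may fall back to lookahead when a lane runs
    # out of context.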

    def fix_inconsistent_table(self, verbose: bool, progress: bool) -> None:
        """The parse table might be inconsistent. We fix the parse table by
        looking around the inconsistent states for more context, either by
        looking at the potential stack states which might lead to the
        inconsistent state, or by increasing the lookahead."""
        self.assume_inconsistent = True
        if verbose or progress:
            print("Fix parse table inconsistencies.")

        todo: typing.Deque[StateId] = collections.deque()
        for state in self.states:
            if state.is_inconsistent():
                todo.append(state.index)

        if verbose and todo:
            print("\n".join([
                "\nGrammar is inconsistent.",
                "\tNumber of States = {}",
                "\tNumber of inconsistencies found = {}"]).format(
                    len(self.states), len(todo)))

        count = 0

        def visit_table() -> typing.Iterator[None]:
            nonlocal count
            unreachable = []
            while todo:
                while todo:
                    yield  # progress bar.
                    # TODO: Compare stack / queue, for the traversal of the
                    # states.
                    s = todo.popleft()
                    if not self.is_reachable_state(s):
                        # NOTE: We do not fix unreachable states, as we might
                        # not be able to compute the reduce actions. However,
                        # we should not clean up edges nor backedges, as the
                        # state might become reachable later on, since states
                        # are shared if they have the same locations.
                        unreachable.append(s)
                        continue
                    assert self.states[s].is_inconsistent()
                    start_len = len(self.states)
                    if verbose:
                        count = count + 1
                        print("Fixing state {}\n".format(self.states[s].stable_str(self.states)))
                    try:
                        self.fix_inconsistent_state(s, verbose)
                    except Exception as exc:
                        self.debug_info = True
                        raise ValueError(
                            "Error while fixing conflict in state {}\n\n"
                            "In the following grammar productions:\n{}"
                            .format(self.states[s].stable_str(self.states),
                                    self.debug_context(s, "\n", "\t"))
                        ) from exc
                    new_inconsistent_states = [
                        s.index for s in self.states[start_len:]
                        if s.is_inconsistent()
                    ]
                    if verbose:
                        print("\tAdding {} states".format(len(self.states[start_len:])))
                        print("\tWith {} inconsistent states".format(len(new_inconsistent_states)))
                    todo.extend(new_inconsistent_states)

                # Check whether any of the previously inconsistent and
                # unreachable states became reachable. If so, add them back
                # to the todo list.
                still_unreachable = []
                for s in unreachable:
                    if self.is_reachable_state(s):
                        todo.append(s)
                    else:
                        still_unreachable.append(s)
                unreachable = still_unreachable

        consume(visit_table(), progress)
        if verbose:
            print("\n".join([
                "\nGrammar is now consistent.",
                "\tNumber of States = {}",
                "\tNumber of inconsistencies solved = {}"]).format(
                    len(self.states), count))
        assert not self.is_inconsistent()
        self.assume_inconsistent = False

        if verbose:
            print("Fix Inconsistent Table Result:")
            self.debug_dump()

    def remove_all_unreachable_state(self, verbose: bool, progress: bool) -> None:
        self.states = [s for s in self.states if s is not None]
        self.rewrite_reordered_state_indexes()

    def lower_reduce_actions(self, verbose: bool, progress: bool) -> None:
        # Remove Reduce actions and replace them by the programmatic
        # equivalent.
        #
        # This transformation preserves the stack manipulations of the parse
        # table. It only changes them from being implicitly executed by the
        # LR parser, to being explicitly executed with actions.
        #
        # This transformation converts the hard-to-predict load of the shift
        # table into a branch which is potentially easier to predict.
        #
        # A side-effect of this transformation is that it removes the need
        # for replaying non-terminals, thus the backends can safely ignore
        # the ability of the shift function to handle non-terminals.
        if verbose or progress:
            print("Lower Reduce actions.")

        maybe_unreachable_set: OrderedSet[StateId] = OrderedSet()

        def transform() -> typing.Iterator[None]:
            for s in self.states:
                term, _ = next(iter(s.epsilon), (None, None))
                if self.term_is_shifted(term):
                    continue
                assert len(s.epsilon) == 1
                yield  # progress bar.
                reduce_state = s
                if verbose:
                    print("Inlining shift-operation for state {}".format(str(reduce_state)))

                # The reduced_aps should contain all the reduce paths of the
                # single Reduce action which is present on this state.
                # However, as states of the graph are shared, some reduce
                # paths might follow the same path and reach the same state.
                #
                # This code collects, for each replayed path, the tops of the
                # stack on top of which these states are replayed.
                aps = APS.start(s.index)
                states_by_replay_term = collections.defaultdict(list)
                # print("Start:\n{}".format(aps.string(name="\titer_aps")))
                # print(s.stable_str(self.states))
                for reduced_aps in aps.shift_next(self):
                    # As long as we have elements to replay, we should only
                    # have a single path for each reduced path. If the next
                    # state contains an action, then we stop here.
                    iter_aps = reduced_aps
                    next_is_action = self.states[iter_aps.state].epsilon != []
                    has_replay = iter_aps.replay != []
                    assert next_is_action is False and has_replay is True
                    while (not next_is_action) and has_replay:
                        # print("Step {}:\n{}".format(len(iter_aps.history),
                        #                             iter_aps.string(name="\titer_aps")))
                        next_aps = list(iter_aps.shift_next(self))
                        if len(next_aps) == 0:
                            # Note: this might happen, as we are adding
                            # lookahead tokens from any successor, and we
                            # might not always have a way to replay all
                            # tokens. In such a case an error should be
                            # produced, but in the meantime, let's use the
                            # shift function as usual.
                            break
                        assert len(next_aps) == 1
                        iter_aps = next_aps[0]
                        next_is_action = self.states[iter_aps.state].epsilon != []
                        has_replay = iter_aps.replay != []
                    # print("End at {}:\n{}".format(len(iter_aps.history),
                    #                               iter_aps.string(name="\titer_aps")))
                    replay_list = [e.src for e in iter_aps.shift]
                    assert len(replay_list) >= 2
                    replay_term = Replay(replay_list[1:])
                    states_by_replay_term[replay_term].append(replay_list[0])

                # Create FilterStates actions.
                filter_by_replay_term = {
                    replay_term: FilterStates(states)
                    for replay_term, states in states_by_replay_term.items()
                }

                # Convert the Reduce action to an Unwind action.
                reduce_term, _ = next(iter(s.epsilon))
                if isinstance(reduce_term, Reduce):
                    unwind_term: Action = reduce_term.unwind
                else:
                    assert isinstance(reduce_term, Seq)
                    assert isinstance(reduce_term.actions[-1], Reduce)
                    unwind_term = Seq(list(reduce_term.actions[:-1]) + [reduce_term.actions[-1].unwind])

                # Remove the old Reduce edge if still present.
                # print("Before:\n{}".format(reduce_state.stable_str(self.states)))
                self.remove_edge(reduce_state, reduce_term, maybe_unreachable_set)

                # Add the Unwind action.
                # print("After:\n")
                locations = reduce_state.locations
                delayed: OrderedFrozenSet[DelayedAction] = OrderedFrozenSet(filter_by_replay_term.items())
                replay_size = 1  # Replay the unwound non-terminal.
                is_new, filter_state = self.new_state(locations, delayed, replay_size)
                self.add_edge(reduce_state, unwind_term, filter_state.index)
                if not is_new:
                    # The destination state already exists. Assert that all
                    # outgoing edges match what we would have generated.
                    if len(filter_by_replay_term) == 1:
                        # There is only one predecessor, no need for a
                        # FilterStates condition.
                        replay_term = next(iter(filter_by_replay_term))
                        assert replay_term in filter_state
                        continue

                    for replay_term, filter_term in filter_by_replay_term.items():
                        assert filter_term in filter_state
                        replay_state = self.states[filter_state[filter_term]]
                        assert replay_term in replay_state
                    continue

                if len(filter_by_replay_term) == 1:
                    replay_term = next(iter(filter_by_replay_term))
                    dest_idx = replay_term.replay_steps[-1]
                    # Do not add the FilterStates action, as there is only
                    # one. Add the Replay action from the filter_state to the
                    # destination.
                    self.add_edge(filter_state, replay_term, dest_idx)
                else:
                    for replay_term, filter_term in filter_by_replay_term.items():
                        dest_idx = replay_term.replay_steps[-1]
                        dest = self.states[dest_idx]

                        # Add the FilterStates action from the filter_state
                        # to the replay_state.
                        locations = dest.locations
                        delayed = OrderedFrozenSet(itertools.chain(dest.delayed_actions, [replay_term]))
                        is_new, replay_state = self.new_state(locations, delayed, replay_size)
                        self.add_edge(filter_state, filter_term, replay_state.index)
                        assert (not is_new) == (replay_term in replay_state)

                        # Add the Replay action from the replay_state to the
                        # destination.
                        # Add the Replay action from the replay_state to the
                        # destination.
                        if is_new:
                            dest_idx = replay_term.replay_steps[-1]
                            self.add_edge(replay_state, replay_term, dest_idx)
                        # print(replay_state.stable_str(self.states))
                        assert not replay_state.is_inconsistent()

                # print(filter_state.stable_str(self.states))
                # print(reduce_state.stable_str(self.states))
                assert not reduce_state.is_inconsistent()
                assert not filter_state.is_inconsistent()

        consume(transform(), progress)

    def fold_replay_unwind(self, verbose: bool, progress: bool) -> None:
        """Convert a Replay action falling into an Unwind action into a single
        Unwind action which replays fewer terms."""
        if verbose or progress:
            print("Fold Replay followed by Unwind actions.")

        maybe_unreachable_set: OrderedSet[StateId] = OrderedSet()

        def try_transform(s: StateAndTransitions) -> bool:
            if len(s.epsilon) != 1:
                return False
            replay_term, replay_dest_idx = next(iter(s.epsilon))
            if not isinstance(replay_term, Replay):
                return False
            replay_dest = self.states[replay_dest_idx]
            if len(replay_dest.epsilon) != 1:
                return False
            unwind_term, unwind_dest_idx = next(iter(replay_dest.epsilon))
            if not unwind_term.update_stack():
                return False
            stack_diff = unwind_term.update_stack_with()
            if not stack_diff.reduce_stack():
                return False
            if stack_diff.pop + stack_diff.replay <= 0:
                return False

            # Remove replayed terms from the Unwind action.
            replayed = replay_term.replay_steps
            unshifted = min(stack_diff.replay + min(s.arguments, stack_diff.pop), len(replayed))
            if unshifted < len(replayed):
                # We do not have all the replayed terms as arguments, thus do
                # not consume arguments.
                unshifted = min(stack_diff.replay, len(replayed))
            if unshifted == 0:
                return False
            new_unwind_term = unwind_term.unshift_action(unshifted)
            new_replay = new_unwind_term.update_stack_with().replay

            # Replace the replay_term and unwind_term by terms which avoid
            # extra replay actions.
            self.remove_edge(s, replay_term, maybe_unreachable_set)
            if len(replayed) == unshifted:
                # The Unwind action replays at least as many terms as the
                # Replay action provides, so the Replay edge is replaced by
                # an Unwind edge instead.
                assert s.arguments >= -new_replay
                self.add_edge(s, new_unwind_term, unwind_dest_idx)
            else:
                # The Unwind action replays and pops fewer terms than what we
                # originally had. Thus the Replay action is shortened and a
                # new state is created to accommodate the new Unwind action.
                assert unshifted >= 1
                new_replay_term = Replay(replayed[:-unshifted])
                implicit_replay_term = Replay(replayed[-unshifted:])
                locations = replay_dest.locations
                delayed: OrderedFrozenSet[DelayedAction]
                delayed = OrderedFrozenSet(
                    itertools.chain(replay_dest.delayed_actions, [implicit_replay_term]))
                is_new, unwind_state = self.new_state(locations, delayed)
                assert (not is_new) == (new_unwind_term in unwind_state)
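
                # Illustrative example (names and counts invented for
                # exposition): with replayed == [a, b, c] and unshifted == 2,
                # the original chain
                #
                #     s -- Replay([a, b, c]) --> replay_dest -- Unwind --> dest
                #
                # becomes
                #
                #     s -- Replay([a]) --> unwind_state -- Unwind' --> dest
                #
                # where Unwind' == unwind_term.unshift_action(2) absorbs the
                # replay of b and c.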
                # Add the new Replay and the new Unwind actions.
                self.add_edge(s, new_replay_term, unwind_state.index)
                if is_new:
                    assert unwind_state.arguments >= -new_replay
                    self.add_edge(unwind_state, new_unwind_term, unwind_dest_idx)
                assert not unwind_state.is_inconsistent()
            assert not s.is_inconsistent()
            return True

        def transform() -> typing.Iterator[None]:
            for s in self.states:
                if try_transform(s):
                    yield  # progress bar.

        consume(transform(), progress)
        self.remove_unreachable_states(maybe_unreachable_set)

    def fold_identical_endings(self, verbose: bool, progress: bool) -> None:
        # If 2 states have the same outgoing edges, then we can merge the 2
        # states into a single state, and rewrite all the backedges leading to
        # these states into edges going to the reference state.
        if verbose or progress:
            print("Fold identical endings.")

        def rewrite_backedges(state_list: typing.List[StateAndTransitions],
                              state_map: typing.Dict[StateId, StateId],
                              backrefs: typing.Dict[
                                  StateId,
                                  typing.List[typing.Tuple[StateId, Action, StateId]]],
                              maybe_unreachable: OrderedSet[StateId]) -> bool:
            all_backrefs = []
            new_backrefs = set()
            for s in state_list:
                all_backrefs.extend(backrefs[s.index])
            # All states have the same outgoing edges, thus we replace all of
            # them by a single state. We do that by replacing the edges which
            # target states of the state_list by edges targeting the ref
            # state.
            ref = state_list.pop()
            tmp_state_map = default_fwd_dict(state_map)
            for s in state_list:
                tmp_state_map[s.index] = ref.index

            for ref_s, ref_t, _d in all_backrefs:
                new_backrefs.add((ref_s, ref_t.rewrite_state_indexes(tmp_state_map)))
            if len(all_backrefs) != len(new_backrefs):
                # Skip this rewrite if rewriting would cause aliasing between
                # actions which are going to different states.
                return False

            replace_edges = [e for s in state_list for e in s.backedges]
            hit = False
            for edge in replace_edges:
                edge_term = edge.term
                assert edge_term is not None
                src = self.states[edge.src]
                old_dest = src[edge_term]
                # print("replace {} -- {} --> {}, by {} -- {} --> {}"
                #       .format(src.index, edge_term, src[edge_term],
                #               src.index, edge_term, ref.index))
                self.replace_edge(src, edge_term, ref.index, maybe_unreachable)
                state_map[old_dest] = ref.index
                hit = True
            return hit

        def rewrite_if_same_outedges(state_list: typing.List[StateAndTransitions]) -> bool:
            maybe_unreachable: OrderedSet[StateId] = OrderedSet()
            backrefs = collections.defaultdict(list)
            outedges = collections.defaultdict(list)
            for s in state_list:
                # Iterate first over actions, then over ordinary states.
                self.assert_state_invariants(s)
                outedges[tuple(s.edges())].append(s)
                if s.epsilon == []:
                    continue
                for t, d in s.edges():
                    if not isinstance(t, Action):
                        continue
                    for r in t.state_refs():
                        backrefs[r].append((s.index, t, d))
            hit = False
            state_map: typing.Dict[StateId, StateId] = default_id_dict()
            for same in outedges.values():
                if len(same) > 1:
                    hit = rewrite_backedges(same, state_map, backrefs, maybe_unreachable) or hit
            if hit:
                self.remove_unreachable_states(maybe_unreachable)
                self.rewrite_state_indexes(state_map)
                self.remove_all_unreachable_state(verbose, progress)
            return hit

        def visit_table() -> typing.Iterator[None]:
            hit = True
            while hit:
                yield  # progress bar.
                hit = rewrite_if_same_outedges(self.states)

        consume(visit_table(), progress)

    def group_epsilon_states(self, verbose: bool, progress: bool) -> None:
        def all_action_inedges(s: StateAndTransitions) -> bool:
            return all(isinstance(e.term, Action) for e in s.backedges)

        shift_states, action_states = split(self.states, lambda s: len(s.epsilon) == 0)
        from_act_action_states, from_shf_action_states = split(action_states, all_action_inedges)
        self.states = []
        self.states.extend(shift_states)
        self.states.extend(from_shf_action_states)
        self.states.extend(from_act_action_states)
        self.rewrite_reordered_state_indexes()

    def group_nonterminal_states(self, verbose: bool, progress: bool) -> None:
        # This function is used to reduce the range of FilterStates values,
        # such that the Rust compiler can compile FilterStates match
        # statements to a table-switch.
        freq_count = collections.Counter(nt for s in self.states for nt in s.nonterminals)
        freq_nt, _ = zip(*freq_count.most_common())

        def state_value(s: StateAndTransitions) -> float:
            value = 0.0
            if len(s.epsilon) != 0:
                return 4.0
            if len(s.nonterminals) == 0:
                return 2.0
            i = 1.0
            for nt in freq_nt:
                if nt in s:
                    value += i
                i /= 2.0
            return -value

        self.states.sort(key=state_value)
        self.rewrite_reordered_state_indexes()

    def count_shift_states(self) -> int:
        return sum(1 for s in self.states if s is not None and len(s.epsilon) == 0)

    def count_action_states(self) -> int:
        return sum(1 for s in self.states if s is not None and len(s.epsilon) > 0)

    def count_action_from_shift_states(self) -> int:
        def from_shift_states(s: StateAndTransitions) -> bool:
            return any(not isinstance(e.term, Action) for e in s.backedges)

        return sum(1 for s in self.states if len(s.epsilon) > 0 and from_shift_states(s))

    def prepare_debug_context(self) -> DebugInfo:
        """To better filter the traversal of the grammar in a debug context, we
        pre-compute for each state the maximal depth at which it appears
        within a production. Therefore, if visiting a state does not increase
        the reducing depth beyond the ability to shrink the shift list to 0,
        then we can stop going deeper, as we have entered a different
        production.
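
        For example (derived from the computation below): if the maximal
        recorded depth for a state is 3, then a traversal whose shift list
        already holds more than 3 edges ending at that state cannot be fully
        reduced by any single production, and the traversal stops there.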
""" 1745 depths = collections.defaultdict(lambda: []) 1746 for s in self.states: 1747 if s is None or not s.epsilon: 1748 continue 1749 aps = APS.start(s.index) 1750 for aps_next in aps.shift_next(self): 1751 if not aps_next.reducing: 1752 continue 1753 for i, edge in enumerate(aps_next.stack): 1754 depths[edge.src].append(i + 1) 1755 return {s: max(ds) for s, ds in depths.items()} 1756 1757 def debug_context( 1758 self, 1759 state: StateId, 1760 split_txt: str = "; ", 1761 prefix: str = "" 1762 ) -> str: 1763 """Reconstruct the grammar production by traversing the parse table.""" 1764 if self.debug_info is False: 1765 return "" 1766 if self.debug_info is True: 1767 self.debug_info = self.prepare_debug_context() 1768 debug_info = typing.cast(typing.Dict[StateId, int], self.debug_info) 1769 1770 record = [] 1771 1772 def visit(aps: APS) -> bool: 1773 # Stop after reducing once. 1774 if aps.history == []: 1775 return True 1776 last = aps.history[-1].term 1777 is_unwind = isinstance(last, Action) and last.update_stack() 1778 has_shift_loop = len(aps.shift) != 1 + len(set(zip(aps.shift, aps.shift[1:]))) 1779 can_reduce_later = True 1780 try: 1781 can_reduce_later = debug_info[aps.shift[-1].src] >= len(aps.shift) 1782 except KeyError: 1783 can_reduce_later = False 1784 stop = is_unwind or has_shift_loop or not can_reduce_later 1785 # Record state which are reducing at most all the shifted states. 1786 save = stop and len(aps.shift) == 1 1787 save = save and is_unwind 1788 if save: 1789 assert isinstance(last, Action) 1790 save = last.update_stack_with().nt in self.states[aps.shift[0].src] 1791 if save: 1792 record.append(aps) 1793 return not stop 1794 1795 self.aps_visitor(APS.start(state), visit) 1796 1797 context: OrderedSet[str] = OrderedSet() 1798 for aps in record: 1799 assert aps.history != [] 1800 action = aps.history[-1].term 1801 assert isinstance(action, Action) 1802 assert action.update_stack() 1803 stack_diff = action.update_stack_with() 1804 replay = stack_diff.replay 1805 before = [repr(e.term) for e in aps.stack[:-1]] 1806 after = [repr(e.term) for e in aps.history[:-1]] 1807 prod = before + ["\N{MIDDLE DOT}"] + after 1808 if replay < len(after) and replay > 0: 1809 del prod[-replay:] 1810 replay = 0 1811 if replay > len(after): 1812 replay += 1 1813 if replay > 0: 1814 prod = prod[:-replay] + ["[lookahead:"] + prod[-replay:] + ["]"] 1815 txt = "{}{} ::= {}".format(prefix, repr(stack_diff.nt), " ".join(prod)) 1816 context.add(txt) 1817 1818 if split_txt is None: 1819 return context 1820 return split_txt.join(txt for txt in sorted(context)) 1821