1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. 2# Licensed to PSF under a Contributor Agreement. 3# 4# Modifications: 5# Copyright David Halter and Contributors 6# Modifications are dual-licensed: MIT and PSF. 7# 99% of the code is different from pgen2, now. 8# 9# A fork of `parso.pgen2.generator`. 10# https://github.com/davidhalter/parso/blob/master/parso/pgen2/generator.py 11# 12# The following changes were made: 13# - Type stubs were directly applied. 14# pyre-unsafe 15 16""" 17This module defines the data structures used to represent a grammar. 18 19Specifying grammars in pgen is possible with this grammar:: 20 21 grammar: (NEWLINE | rule)* ENDMARKER 22 rule: NAME ':' rhs NEWLINE 23 rhs: items ('|' items)* 24 items: item+ 25 item: '[' rhs ']' | atom ['+' | '*'] 26 atom: '(' rhs ')' | NAME | STRING 27 28This grammar is self-referencing. 29 30This parser generator (pgen2) was created by Guido Rossum and used for lib2to3. 31Most of the code has been refactored to make it more Pythonic. Since this was a 32"copy" of the CPython Parser parser "pgen", there was some work needed to make 33it more readable. It should also be slightly faster than the original pgen2, 34because we made some optimizations. 35""" 36 37from ast import literal_eval 38from typing import Any, Generic, Mapping, Sequence, Set, TypeVar, Union 39 40from libcst._parser.parso.pgen2.grammar_parser import GrammarParser, NFAState 41 42_TokenTypeT = TypeVar("_TokenTypeT") 43 44 45class DFAPlan: 46 """ 47 Plans are used for the parser to create stack nodes and do the proper 48 DFA state transitions. 49 """ 50 51 def __init__( 52 self, next_dfa: "DFAState", dfa_pushes: Sequence["DFAState"] = [] 53 ) -> None: 54 self.next_dfa = next_dfa 55 self.dfa_pushes = dfa_pushes 56 57 def __repr__(self) -> str: 58 return "%s(%s, %s)" % (self.__class__.__name__, self.next_dfa, self.dfa_pushes) 59 60 61class DFAState(Generic[_TokenTypeT]): 62 """ 63 The DFAState object is the core class for pretty much anything. DFAState 64 are the vertices of an ordered graph while arcs and transitions are the 65 edges. 66 67 Arcs are the initial edges, where most DFAStates are not connected and 68 transitions are then calculated to connect the DFA state machines that have 69 different nonterminals. 70 """ 71 72 def __init__(self, from_rule: str, nfa_set: Set[NFAState], final: NFAState) -> None: 73 self.from_rule = from_rule 74 self.nfa_set = nfa_set 75 self.arcs: Mapping[ 76 str, DFAState 77 ] = {} # map from terminals/nonterminals to DFAState 78 # In an intermediary step we set these nonterminal arcs (which has the 79 # same structure as arcs). These don't contain terminals anymore. 80 self.nonterminal_arcs: Mapping[str, DFAState] = {} 81 82 # Transitions are basically the only thing that the parser is using 83 # with is_final. Everyting else is purely here to create a parser. 84 self.transitions: Mapping[Union[_TokenTypeT, ReservedString], DFAPlan] = {} 85 self.is_final = final in nfa_set 86 87 def add_arc(self, next_, label): 88 assert isinstance(label, str) 89 assert label not in self.arcs 90 assert isinstance(next_, DFAState) 91 self.arcs[label] = next_ 92 93 def unifystate(self, old, new): 94 for label, next_ in self.arcs.items(): 95 if next_ is old: 96 self.arcs[label] = new 97 98 def __eq__(self, other): 99 # Equality test -- ignore the nfa_set instance variable 100 assert isinstance(other, DFAState) 101 if self.is_final != other.is_final: 102 return False 103 # Can't just return self.arcs == other.arcs, because that 104 # would invoke this method recursively, with cycles... 105 if len(self.arcs) != len(other.arcs): 106 return False 107 for label, next_ in self.arcs.items(): 108 if next_ is not other.arcs.get(label): 109 return False 110 return True 111 112 def __repr__(self) -> str: 113 return "<%s: %s is_final=%s>" % ( 114 self.__class__.__name__, 115 self.from_rule, 116 self.is_final, 117 ) 118 119 120class ReservedString: 121 """ 122 Most grammars will have certain keywords and operators that are mentioned 123 in the grammar as strings (e.g. "if") and not token types (e.g. NUMBER). 124 This class basically is the former. 125 """ 126 127 def __init__(self, value: str) -> None: 128 self.value = value 129 130 def __repr__(self) -> str: 131 return "%s(%s)" % (self.__class__.__name__, self.value) 132 133 134class Grammar(Generic[_TokenTypeT]): 135 """ 136 Once initialized, this class supplies the grammar tables for the 137 parsing engine implemented by parse.py. The parsing engine 138 accesses the instance variables directly. 139 140 The only important part in this parsers are dfas and transitions between 141 dfas. 142 """ 143 144 def __init__( 145 self, 146 start_nonterminal: str, 147 rule_to_dfas: Mapping[str, Sequence[DFAState[_TokenTypeT]]], 148 reserved_syntax_strings: Mapping[str, ReservedString], 149 ) -> None: 150 self.nonterminal_to_dfas = rule_to_dfas 151 self.reserved_syntax_strings = reserved_syntax_strings 152 self.start_nonterminal = start_nonterminal 153 154 155def _simplify_dfas(dfas): 156 """ 157 This is not theoretically optimal, but works well enough. 158 Algorithm: repeatedly look for two states that have the same 159 set of arcs (same labels pointing to the same nodes) and 160 unify them, until things stop changing. 161 162 dfas is a list of DFAState instances 163 """ 164 changes = True 165 while changes: 166 changes = False 167 for i, state_i in enumerate(dfas): 168 for j in range(i + 1, len(dfas)): 169 state_j = dfas[j] 170 if state_i == state_j: 171 # print " unify", i, j 172 del dfas[j] 173 for state in dfas: 174 state.unifystate(state_j, state_i) 175 changes = True 176 break 177 178 179def _make_dfas(start, finish): 180 """ 181 Uses the powerset construction algorithm to create DFA states from sets of 182 NFA states. 183 184 Also does state reduction if some states are not needed. 185 """ 186 # To turn an NFA into a DFA, we define the states of the DFA 187 # to correspond to *sets* of states of the NFA. Then do some 188 # state reduction. 189 assert isinstance(start, NFAState) 190 assert isinstance(finish, NFAState) 191 192 def addclosure(nfa_state, base_nfa_set): 193 assert isinstance(nfa_state, NFAState) 194 if nfa_state in base_nfa_set: 195 return 196 base_nfa_set.add(nfa_state) 197 for nfa_arc in nfa_state.arcs: 198 if nfa_arc.nonterminal_or_string is None: 199 addclosure(nfa_arc.next, base_nfa_set) 200 201 base_nfa_set = set() 202 addclosure(start, base_nfa_set) 203 states = [DFAState(start.from_rule, base_nfa_set, finish)] 204 for state in states: # NB states grows while we're iterating 205 arcs = {} 206 # Find state transitions and store them in arcs. 207 for nfa_state in state.nfa_set: 208 for nfa_arc in nfa_state.arcs: 209 if nfa_arc.nonterminal_or_string is not None: 210 nfa_set = arcs.setdefault(nfa_arc.nonterminal_or_string, set()) 211 addclosure(nfa_arc.next, nfa_set) 212 213 # Now create the dfa's with no None's in arcs anymore. All Nones have 214 # been eliminated and state transitions (arcs) are properly defined, we 215 # just need to create the dfa's. 216 for nonterminal_or_string, nfa_set in arcs.items(): 217 for nested_state in states: 218 if nested_state.nfa_set == nfa_set: 219 # The DFA state already exists for this rule. 220 break 221 else: 222 nested_state = DFAState(start.from_rule, nfa_set, finish) 223 states.append(nested_state) 224 225 state.add_arc(nested_state, nonterminal_or_string) 226 return states # List of DFAState instances; first one is start 227 228 229def generate_grammar(bnf_grammar: str, token_namespace: Any) -> Grammar[Any]: 230 """ 231 ``bnf_text`` is a grammar in extended BNF (using * for repetition, + for 232 at-least-once repetition, [] for optional parts, | for alternatives and () 233 for grouping). 234 235 It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its 236 own parser. 237 """ 238 rule_to_dfas = {} 239 start_nonterminal = None 240 for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse(): 241 dfas = _make_dfas(nfa_a, nfa_z) 242 _simplify_dfas(dfas) 243 rule_to_dfas[nfa_a.from_rule] = dfas 244 245 if start_nonterminal is None: 246 start_nonterminal = nfa_a.from_rule 247 248 reserved_strings = {} 249 for nonterminal, dfas in rule_to_dfas.items(): 250 for dfa_state in dfas: 251 for terminal_or_nonterminal, next_dfa in dfa_state.arcs.items(): 252 if terminal_or_nonterminal in rule_to_dfas: 253 dfa_state.nonterminal_arcs[terminal_or_nonterminal] = next_dfa 254 else: 255 transition = _make_transition( 256 token_namespace, reserved_strings, terminal_or_nonterminal 257 ) 258 dfa_state.transitions[transition] = DFAPlan(next_dfa) 259 260 _calculate_tree_traversal(rule_to_dfas) 261 if start_nonterminal is None: 262 raise Exception("could not find starting nonterminal!") 263 return Grammar(start_nonterminal, rule_to_dfas, reserved_strings) 264 265 266def _make_transition(token_namespace, reserved_syntax_strings, label): 267 """ 268 Creates a reserved string ("if", "for", "*", ...) or returns the token type 269 (NUMBER, STRING, ...) for a given grammar terminal. 270 """ 271 if label[0].isalpha(): 272 # A named token (e.g. NAME, NUMBER, STRING) 273 return getattr(token_namespace, label) 274 else: 275 # Either a keyword or an operator 276 assert label[0] in ('"', "'"), label 277 assert not label.startswith('"""') and not label.startswith("'''") 278 value = literal_eval(label) 279 try: 280 return reserved_syntax_strings[value] 281 except KeyError: 282 r = reserved_syntax_strings[value] = ReservedString(value) 283 return r 284 285 286def _calculate_tree_traversal(nonterminal_to_dfas): 287 """ 288 By this point we know how dfas can move around within a stack node, but we 289 don't know how we can add a new stack node (nonterminal transitions). 290 """ 291 # Map from grammar rule (nonterminal) name to a set of tokens. 292 first_plans = {} 293 294 nonterminals = list(nonterminal_to_dfas.keys()) 295 nonterminals.sort() 296 for nonterminal in nonterminals: 297 if nonterminal not in first_plans: 298 _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal) 299 300 # Now that we have calculated the first terminals, we are sure that 301 # there is no left recursion. 302 303 for dfas in nonterminal_to_dfas.values(): 304 for dfa_state in dfas: 305 transitions = dfa_state.transitions 306 for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items(): 307 for transition, pushes in first_plans[nonterminal].items(): 308 if transition in transitions: 309 prev_plan = transitions[transition] 310 # Make sure these are sorted so that error messages are 311 # at least deterministic 312 choices = sorted( 313 [ 314 ( 315 prev_plan.dfa_pushes[0].from_rule 316 if prev_plan.dfa_pushes 317 else prev_plan.next_dfa.from_rule 318 ), 319 (pushes[0].from_rule if pushes else next_dfa.from_rule), 320 ] 321 ) 322 raise ValueError( 323 ( 324 "Rule %s is ambiguous; given a %s token, we " 325 + "can't determine if we should evaluate %s or %s." 326 ) 327 % ((dfa_state.from_rule, transition) + tuple(choices)) 328 ) 329 transitions[transition] = DFAPlan(next_dfa, pushes) 330 331 332def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal): 333 """ 334 Calculates the first plan in the first_plans dictionary for every given 335 nonterminal. This is going to be used to know when to create stack nodes. 336 """ 337 dfas = nonterminal_to_dfas[nonterminal] 338 new_first_plans = {} 339 first_plans[nonterminal] = None # dummy to detect left recursion 340 # We only need to check the first dfa. All the following ones are not 341 # interesting to find first terminals. 342 state = dfas[0] 343 for transition, next_ in state.transitions.items(): 344 # It's a string. We have finally found a possible first token. 345 new_first_plans[transition] = [next_.next_dfa] 346 347 for nonterminal2, next_ in state.nonterminal_arcs.items(): 348 # It's a nonterminal and we have either a left recursion issue 349 # in the grammar or we have to recurse. 350 try: 351 first_plans2 = first_plans[nonterminal2] 352 except KeyError: 353 first_plans2 = _calculate_first_plans( 354 nonterminal_to_dfas, first_plans, nonterminal2 355 ) 356 else: 357 if first_plans2 is None: 358 raise ValueError("left recursion for rule %r" % nonterminal) 359 360 for t, pushes in first_plans2.items(): 361 new_first_plans[t] = [next_] + pushes 362 363 first_plans[nonterminal] = new_first_plans 364 return new_first_plans 365