1# 2# Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. 3# Use of this file is governed by the BSD 3-clause license that 4# can be found in the LICENSE.txt file in the project root. 5#/ 6 7# When we hit an accept state in either the DFA or the ATN, we 8# have to notify the character stream to start buffering characters 9# via {@link IntStream#mark} and record the current state. The current sim state 10# includes the current index into the input, the current line, 11# and current character position in that line. Note that the Lexer is 12# tracking the starting line and characterization of the token. These 13# variables track the "state" of the simulator when it hits an accept state. 14# 15# <p>We track these variables separately for the DFA and ATN simulation 16# because the DFA simulation often has to fail over to the ATN 17# simulation. If the ATN simulation fails, we need the DFA to fall 18# back to its previously accepted state, if any. If the ATN succeeds, 19# then the ATN does the accept and the DFA simulator that invoked it 20# can simply return the predicted token type.</p> 21#/ 22 23from antlr4.PredictionContext import PredictionContextCache, SingletonPredictionContext, PredictionContext 24from antlr4.InputStream import InputStream 25from antlr4.Token import Token 26from antlr4.atn.ATN import ATN 27from antlr4.atn.ATNConfig import LexerATNConfig 28from antlr4.atn.ATNSimulator import ATNSimulator 29from antlr4.atn.ATNConfigSet import ATNConfigSet, OrderedATNConfigSet 30from antlr4.atn.ATNState import RuleStopState, ATNState 31from antlr4.atn.LexerActionExecutor import LexerActionExecutor 32from antlr4.atn.Transition import Transition 33from antlr4.dfa.DFAState import DFAState 34from antlr4.error.Errors import LexerNoViableAltException, UnsupportedOperationException 35 36class SimState(object): 37 38 def __init__(self): 39 self.reset() 40 41 def reset(self): 42 self.index = -1 43 self.line = 0 44 self.column = -1 45 self.dfaState = None 46 47# need forward declaration 48Lexer = None 49LexerATNSimulator = None 50 51class LexerATNSimulator(ATNSimulator): 52 53 debug = False 54 dfa_debug = False 55 56 MIN_DFA_EDGE = 0 57 MAX_DFA_EDGE = 127 # forces unicode to stay in ATN 58 59 ERROR = None 60 61 match_calls = 0 62 63 def __init__(self, recog:Lexer, atn:ATN, decisionToDFA:list, sharedContextCache:PredictionContextCache): 64 super().__init__(atn, sharedContextCache) 65 self.decisionToDFA = decisionToDFA 66 self.recog = recog 67 # The current token's starting index into the character stream. 68 # Shared across DFA to ATN simulation in case the ATN fails and the 69 # DFA did not have a previous accept state. In this case, we use the 70 # ATN-generated exception object. 71 self.startIndex = -1 72 # line number 1..n within the input#/ 73 self.line = 1 74 # The index of the character relative to the beginning of the line 0..n-1#/ 75 self.column = 0 76 from antlr4.Lexer import Lexer 77 self.mode = Lexer.DEFAULT_MODE 78 # Used during DFA/ATN exec to record the most recent accept configuration info 79 self.prevAccept = SimState() 80 81 82 def copyState(self, simulator:LexerATNSimulator ): 83 self.column = simulator.column 84 self.line = simulator.line 85 self.mode = simulator.mode 86 self.startIndex = simulator.startIndex 87 88 def match(self, input:InputStream , mode:int): 89 self.match_calls += 1 90 self.mode = mode 91 mark = input.mark() 92 try: 93 self.startIndex = input.index 94 self.prevAccept.reset() 95 dfa = self.decisionToDFA[mode] 96 if dfa.s0 is None: 97 return self.matchATN(input) 98 else: 99 return self.execATN(input, dfa.s0) 100 finally: 101 input.release(mark) 102 103 def reset(self): 104 self.prevAccept.reset() 105 self.startIndex = -1 106 self.line = 1 107 self.column = 0 108 from antlr4.Lexer import Lexer 109 self.mode = Lexer.DEFAULT_MODE 110 111 def matchATN(self, input:InputStream): 112 startState = self.atn.modeToStartState[self.mode] 113 114 if LexerATNSimulator.debug: 115 print("matchATN mode " + str(self.mode) + " start: " + str(startState)) 116 117 old_mode = self.mode 118 s0_closure = self.computeStartState(input, startState) 119 suppressEdge = s0_closure.hasSemanticContext 120 s0_closure.hasSemanticContext = False 121 122 next = self.addDFAState(s0_closure) 123 if not suppressEdge: 124 self.decisionToDFA[self.mode].s0 = next 125 126 predict = self.execATN(input, next) 127 128 if LexerATNSimulator.debug: 129 print("DFA after matchATN: " + str(self.decisionToDFA[old_mode].toLexerString())) 130 131 return predict 132 133 def execATN(self, input:InputStream, ds0:DFAState): 134 if LexerATNSimulator.debug: 135 print("start state closure=" + str(ds0.configs)) 136 137 if ds0.isAcceptState: 138 # allow zero-length tokens 139 self.captureSimState(self.prevAccept, input, ds0) 140 141 t = input.LA(1) 142 s = ds0 # s is current/from DFA state 143 144 while True: # while more work 145 if LexerATNSimulator.debug: 146 print("execATN loop starting closure:", str(s.configs)) 147 148 # As we move src->trg, src->trg, we keep track of the previous trg to 149 # avoid looking up the DFA state again, which is expensive. 150 # If the previous target was already part of the DFA, we might 151 # be able to avoid doing a reach operation upon t. If s!=null, 152 # it means that semantic predicates didn't prevent us from 153 # creating a DFA state. Once we know s!=null, we check to see if 154 # the DFA state has an edge already for t. If so, we can just reuse 155 # it's configuration set; there's no point in re-computing it. 156 # This is kind of like doing DFA simulation within the ATN 157 # simulation because DFA simulation is really just a way to avoid 158 # computing reach/closure sets. Technically, once we know that 159 # we have a previously added DFA state, we could jump over to 160 # the DFA simulator. But, that would mean popping back and forth 161 # a lot and making things more complicated algorithmically. 162 # This optimization makes a lot of sense for loops within DFA. 163 # A character will take us back to an existing DFA state 164 # that already has lots of edges out of it. e.g., .* in comments. 165 # print("Target for:" + str(s) + " and:" + str(t)) 166 target = self.getExistingTargetState(s, t) 167 # print("Existing:" + str(target)) 168 if target is None: 169 target = self.computeTargetState(input, s, t) 170 # print("Computed:" + str(target)) 171 172 if target == self.ERROR: 173 break 174 175 # If this is a consumable input element, make sure to consume before 176 # capturing the accept state so the input index, line, and char 177 # position accurately reflect the state of the interpreter at the 178 # end of the token. 179 if t != Token.EOF: 180 self.consume(input) 181 182 if target.isAcceptState: 183 self.captureSimState(self.prevAccept, input, target) 184 if t == Token.EOF: 185 break 186 187 t = input.LA(1) 188 189 s = target # flip; current DFA target becomes new src/from state 190 191 return self.failOrAccept(self.prevAccept, input, s.configs, t) 192 193 # Get an existing target state for an edge in the DFA. If the target state 194 # for the edge has not yet been computed or is otherwise not available, 195 # this method returns {@code null}. 196 # 197 # @param s The current DFA state 198 # @param t The next input symbol 199 # @return The existing target DFA state for the given input symbol 200 # {@code t}, or {@code null} if the target state for this edge is not 201 # already cached 202 def getExistingTargetState(self, s:DFAState, t:int): 203 if s.edges is None or t < self.MIN_DFA_EDGE or t > self.MAX_DFA_EDGE: 204 return None 205 206 target = s.edges[t - self.MIN_DFA_EDGE] 207 if LexerATNSimulator.debug and target is not None: 208 print("reuse state", str(s.stateNumber), "edge to", str(target.stateNumber)) 209 210 return target 211 212 # Compute a target state for an edge in the DFA, and attempt to add the 213 # computed state and corresponding edge to the DFA. 214 # 215 # @param input The input stream 216 # @param s The current DFA state 217 # @param t The next input symbol 218 # 219 # @return The computed target DFA state for the given input symbol 220 # {@code t}. If {@code t} does not lead to a valid DFA state, this method 221 # returns {@link #ERROR}. 222 def computeTargetState(self, input:InputStream, s:DFAState, t:int): 223 reach = OrderedATNConfigSet() 224 225 # if we don't find an existing DFA state 226 # Fill reach starting from closure, following t transitions 227 self.getReachableConfigSet(input, s.configs, reach, t) 228 229 if len(reach)==0: # we got nowhere on t from s 230 if not reach.hasSemanticContext: 231 # we got nowhere on t, don't throw out this knowledge; it'd 232 # cause a failover from DFA later. 233 self. addDFAEdge(s, t, self.ERROR) 234 235 # stop when we can't match any more char 236 return self.ERROR 237 238 # Add an edge from s to target DFA found/created for reach 239 return self.addDFAEdge(s, t, cfgs=reach) 240 241 def failOrAccept(self, prevAccept:SimState , input:InputStream, reach:ATNConfigSet, t:int): 242 if self.prevAccept.dfaState is not None: 243 lexerActionExecutor = prevAccept.dfaState.lexerActionExecutor 244 self.accept(input, lexerActionExecutor, self.startIndex, prevAccept.index, prevAccept.line, prevAccept.column) 245 return prevAccept.dfaState.prediction 246 else: 247 # if no accept and EOF is first char, return EOF 248 if t==Token.EOF and input.index==self.startIndex: 249 return Token.EOF 250 raise LexerNoViableAltException(self.recog, input, self.startIndex, reach) 251 252 # Given a starting configuration set, figure out all ATN configurations 253 # we can reach upon input {@code t}. Parameter {@code reach} is a return 254 # parameter. 255 def getReachableConfigSet(self, input:InputStream, closure:ATNConfigSet, reach:ATNConfigSet, t:int): 256 # this is used to skip processing for configs which have a lower priority 257 # than a config that already reached an accept state for the same rule 258 skipAlt = ATN.INVALID_ALT_NUMBER 259 for cfg in closure: 260 currentAltReachedAcceptState = ( cfg.alt == skipAlt ) 261 if currentAltReachedAcceptState and cfg.passedThroughNonGreedyDecision: 262 continue 263 264 if LexerATNSimulator.debug: 265 print("testing", self.getTokenName(t), "at", str(cfg)) 266 267 for trans in cfg.state.transitions: # for each transition 268 target = self.getReachableTarget(trans, t) 269 if target is not None: 270 lexerActionExecutor = cfg.lexerActionExecutor 271 if lexerActionExecutor is not None: 272 lexerActionExecutor = lexerActionExecutor.fixOffsetBeforeMatch(input.index - self.startIndex) 273 274 treatEofAsEpsilon = (t == Token.EOF) 275 config = LexerATNConfig(state=target, lexerActionExecutor=lexerActionExecutor, config=cfg) 276 if self.closure(input, config, reach, currentAltReachedAcceptState, True, treatEofAsEpsilon): 277 # any remaining configs for this alt have a lower priority than 278 # the one that just reached an accept state. 279 skipAlt = cfg.alt 280 281 def accept(self, input:InputStream, lexerActionExecutor:LexerActionExecutor, startIndex:int, index:int, line:int, charPos:int): 282 if LexerATNSimulator.debug: 283 print("ACTION", lexerActionExecutor) 284 285 # seek to after last char in token 286 input.seek(index) 287 self.line = line 288 self.column = charPos 289 290 if lexerActionExecutor is not None and self.recog is not None: 291 lexerActionExecutor.execute(self.recog, input, startIndex) 292 293 def getReachableTarget(self, trans:Transition, t:int): 294 from antlr4.Lexer import Lexer 295 if trans.matches(t, 0, Lexer.MAX_CHAR_VALUE): 296 return trans.target 297 else: 298 return None 299 300 def computeStartState(self, input:InputStream, p:ATNState): 301 initialContext = PredictionContext.EMPTY 302 configs = OrderedATNConfigSet() 303 for i in range(0,len(p.transitions)): 304 target = p.transitions[i].target 305 c = LexerATNConfig(state=target, alt=i+1, context=initialContext) 306 self.closure(input, c, configs, False, False, False) 307 return configs 308 309 # Since the alternatives within any lexer decision are ordered by 310 # preference, this method stops pursuing the closure as soon as an accept 311 # state is reached. After the first accept state is reached by depth-first 312 # search from {@code config}, all other (potentially reachable) states for 313 # this rule would have a lower priority. 314 # 315 # @return {@code true} if an accept state is reached, otherwise 316 # {@code false}. 317 def closure(self, input:InputStream, config:LexerATNConfig, configs:ATNConfigSet, currentAltReachedAcceptState:bool, 318 speculative:bool, treatEofAsEpsilon:bool): 319 if LexerATNSimulator.debug: 320 print("closure(" + str(config) + ")") 321 322 if isinstance( config.state, RuleStopState ): 323 if LexerATNSimulator.debug: 324 if self.recog is not None: 325 print("closure at", self.recog.symbolicNames[config.state.ruleIndex], "rule stop", str(config)) 326 else: 327 print("closure at rule stop", str(config)) 328 329 if config.context is None or config.context.hasEmptyPath(): 330 if config.context is None or config.context.isEmpty(): 331 configs.add(config) 332 return True 333 else: 334 configs.add(LexerATNConfig(state=config.state, config=config, context=PredictionContext.EMPTY)) 335 currentAltReachedAcceptState = True 336 337 if config.context is not None and not config.context.isEmpty(): 338 for i in range(0,len(config.context)): 339 if config.context.getReturnState(i) != PredictionContext.EMPTY_RETURN_STATE: 340 newContext = config.context.getParent(i) # "pop" return state 341 returnState = self.atn.states[config.context.getReturnState(i)] 342 c = LexerATNConfig(state=returnState, config=config, context=newContext) 343 currentAltReachedAcceptState = self.closure(input, c, configs, 344 currentAltReachedAcceptState, speculative, treatEofAsEpsilon) 345 346 return currentAltReachedAcceptState 347 348 # optimization 349 if not config.state.epsilonOnlyTransitions: 350 if not currentAltReachedAcceptState or not config.passedThroughNonGreedyDecision: 351 configs.add(config) 352 353 for t in config.state.transitions: 354 c = self.getEpsilonTarget(input, config, t, configs, speculative, treatEofAsEpsilon) 355 if c is not None: 356 currentAltReachedAcceptState = self.closure(input, c, configs, currentAltReachedAcceptState, speculative, treatEofAsEpsilon) 357 358 return currentAltReachedAcceptState 359 360 # side-effect: can alter configs.hasSemanticContext 361 def getEpsilonTarget(self, input:InputStream, config:LexerATNConfig, t:Transition, configs:ATNConfigSet, 362 speculative:bool, treatEofAsEpsilon:bool): 363 c = None 364 if t.serializationType==Transition.RULE: 365 newContext = SingletonPredictionContext.create(config.context, t.followState.stateNumber) 366 c = LexerATNConfig(state=t.target, config=config, context=newContext) 367 368 elif t.serializationType==Transition.PRECEDENCE: 369 raise UnsupportedOperationException("Precedence predicates are not supported in lexers.") 370 371 elif t.serializationType==Transition.PREDICATE: 372 # Track traversing semantic predicates. If we traverse, 373 # we cannot add a DFA state for this "reach" computation 374 # because the DFA would not test the predicate again in the 375 # future. Rather than creating collections of semantic predicates 376 # like v3 and testing them on prediction, v4 will test them on the 377 # fly all the time using the ATN not the DFA. This is slower but 378 # semantically it's not used that often. One of the key elements to 379 # this predicate mechanism is not adding DFA states that see 380 # predicates immediately afterwards in the ATN. For example, 381 382 # a : ID {p1}? | ID {p2}? ; 383 384 # should create the start state for rule 'a' (to save start state 385 # competition), but should not create target of ID state. The 386 # collection of ATN states the following ID references includes 387 # states reached by traversing predicates. Since this is when we 388 # test them, we cannot cash the DFA state target of ID. 389 390 if LexerATNSimulator.debug: 391 print("EVAL rule "+ str(t.ruleIndex) + ":" + str(t.predIndex)) 392 configs.hasSemanticContext = True 393 if self.evaluatePredicate(input, t.ruleIndex, t.predIndex, speculative): 394 c = LexerATNConfig(state=t.target, config=config) 395 396 elif t.serializationType==Transition.ACTION: 397 if config.context is None or config.context.hasEmptyPath(): 398 # execute actions anywhere in the start rule for a token. 399 # 400 # TODO: if the entry rule is invoked recursively, some 401 # actions may be executed during the recursive call. The 402 # problem can appear when hasEmptyPath() is true but 403 # isEmpty() is false. In this case, the config needs to be 404 # split into two contexts - one with just the empty path 405 # and another with everything but the empty path. 406 # Unfortunately, the current algorithm does not allow 407 # getEpsilonTarget to return two configurations, so 408 # additional modifications are needed before we can support 409 # the split operation. 410 lexerActionExecutor = LexerActionExecutor.append(config.lexerActionExecutor, 411 self.atn.lexerActions[t.actionIndex]) 412 c = LexerATNConfig(state=t.target, config=config, lexerActionExecutor=lexerActionExecutor) 413 414 else: 415 # ignore actions in referenced rules 416 c = LexerATNConfig(state=t.target, config=config) 417 418 elif t.serializationType==Transition.EPSILON: 419 c = LexerATNConfig(state=t.target, config=config) 420 421 elif t.serializationType in [ Transition.ATOM, Transition.RANGE, Transition.SET ]: 422 if treatEofAsEpsilon: 423 from antlr4.Lexer import Lexer 424 if t.matches(Token.EOF, 0, Lexer.MAX_CHAR_VALUE): 425 c = LexerATNConfig(state=t.target, config=config) 426 427 return c 428 429 # Evaluate a predicate specified in the lexer. 430 # 431 # <p>If {@code speculative} is {@code true}, this method was called before 432 # {@link #consume} for the matched character. This method should call 433 # {@link #consume} before evaluating the predicate to ensure position 434 # sensitive values, including {@link Lexer#getText}, {@link Lexer#getLine}, 435 # and {@link Lexer#getcolumn}, properly reflect the current 436 # lexer state. This method should restore {@code input} and the simulator 437 # to the original state before returning (i.e. undo the actions made by the 438 # call to {@link #consume}.</p> 439 # 440 # @param input The input stream. 441 # @param ruleIndex The rule containing the predicate. 442 # @param predIndex The index of the predicate within the rule. 443 # @param speculative {@code true} if the current index in {@code input} is 444 # one character before the predicate's location. 445 # 446 # @return {@code true} if the specified predicate evaluates to 447 # {@code true}. 448 #/ 449 def evaluatePredicate(self, input:InputStream, ruleIndex:int, predIndex:int, speculative:bool): 450 # assume true if no recognizer was provided 451 if self.recog is None: 452 return True 453 454 if not speculative: 455 return self.recog.sempred(None, ruleIndex, predIndex) 456 457 savedcolumn = self.column 458 savedLine = self.line 459 index = input.index 460 marker = input.mark() 461 try: 462 self.consume(input) 463 return self.recog.sempred(None, ruleIndex, predIndex) 464 finally: 465 self.column = savedcolumn 466 self.line = savedLine 467 input.seek(index) 468 input.release(marker) 469 470 def captureSimState(self, settings:SimState, input:InputStream, dfaState:DFAState): 471 settings.index = input.index 472 settings.line = self.line 473 settings.column = self.column 474 settings.dfaState = dfaState 475 476 def addDFAEdge(self, from_:DFAState, tk:int, to:DFAState=None, cfgs:ATNConfigSet=None) -> DFAState: 477 478 if to is None and cfgs is not None: 479 # leading to this call, ATNConfigSet.hasSemanticContext is used as a 480 # marker indicating dynamic predicate evaluation makes this edge 481 # dependent on the specific input sequence, so the static edge in the 482 # DFA should be omitted. The target DFAState is still created since 483 # execATN has the ability to resynchronize with the DFA state cache 484 # following the predicate evaluation step. 485 # 486 # TJP notes: next time through the DFA, we see a pred again and eval. 487 # If that gets us to a previously created (but dangling) DFA 488 # state, we can continue in pure DFA mode from there. 489 #/ 490 suppressEdge = cfgs.hasSemanticContext 491 cfgs.hasSemanticContext = False 492 493 to = self.addDFAState(cfgs) 494 495 if suppressEdge: 496 return to 497 498 # add the edge 499 if tk < self.MIN_DFA_EDGE or tk > self.MAX_DFA_EDGE: 500 # Only track edges within the DFA bounds 501 return to 502 503 if LexerATNSimulator.debug: 504 print("EDGE " + str(from_) + " -> " + str(to) + " upon "+ chr(tk)) 505 506 if from_.edges is None: 507 # make room for tokens 1..n and -1 masquerading as index 0 508 from_.edges = [ None ] * (self.MAX_DFA_EDGE - self.MIN_DFA_EDGE + 1) 509 510 from_.edges[tk - self.MIN_DFA_EDGE] = to # connect 511 512 return to 513 514 515 # Add a new DFA state if there isn't one with this set of 516 # configurations already. This method also detects the first 517 # configuration containing an ATN rule stop state. Later, when 518 # traversing the DFA, we will know which rule to accept. 519 def addDFAState(self, configs:ATNConfigSet) -> DFAState: 520 521 proposed = DFAState(configs=configs) 522 firstConfigWithRuleStopState = next((cfg for cfg in configs if isinstance(cfg.state, RuleStopState)), None) 523 524 if firstConfigWithRuleStopState is not None: 525 proposed.isAcceptState = True 526 proposed.lexerActionExecutor = firstConfigWithRuleStopState.lexerActionExecutor 527 proposed.prediction = self.atn.ruleToTokenType[firstConfigWithRuleStopState.state.ruleIndex] 528 529 dfa = self.decisionToDFA[self.mode] 530 existing = dfa.states.get(proposed, None) 531 if existing is not None: 532 return existing 533 534 newState = proposed 535 536 newState.stateNumber = len(dfa.states) 537 configs.setReadonly(True) 538 newState.configs = configs 539 dfa.states[newState] = newState 540 return newState 541 542 def getDFA(self, mode:int): 543 return self.decisionToDFA[mode] 544 545 # Get the text matched so far for the current token. 546 def getText(self, input:InputStream): 547 # index is first lookahead char, don't include. 548 return input.getText(self.startIndex, input.index-1) 549 550 def consume(self, input:InputStream): 551 curChar = input.LA(1) 552 if curChar==ord('\n'): 553 self.line += 1 554 self.column = 0 555 else: 556 self.column += 1 557 input.consume() 558 559 def getTokenName(self, t:int): 560 if t==-1: 561 return "EOF" 562 else: 563 return "'" + chr(t) + "'" 564 565 566LexerATNSimulator.ERROR = DFAState(0x7FFFFFFF, ATNConfigSet()) 567 568del Lexer