1#
2# Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
3# Use of this file is governed by the BSD 3-clause license that
4# can be found in the LICENSE.txt file in the project root.
5#/
6
7# When we hit an accept state in either the DFA or the ATN, we
8#  have to notify the character stream to start buffering characters
9#  via {@link IntStream#mark} and record the current state. The current sim state
10#  includes the current index into the input, the current line,
11#  and current character position in that line. Note that the Lexer is
12#  tracking the starting line and characterization of the token. These
13#  variables track the "state" of the simulator when it hits an accept state.
14#
15#  <p>We track these variables separately for the DFA and ATN simulation
16#  because the DFA simulation often has to fail over to the ATN
17#  simulation. If the ATN simulation fails, we need the DFA to fall
18#  back to its previously accepted state, if any. If the ATN succeeds,
19#  then the ATN does the accept and the DFA simulator that invoked it
20#  can simply return the predicted token type.</p>
21#/
22
23from antlr4.PredictionContext import PredictionContextCache, SingletonPredictionContext, PredictionContext
24from antlr4.InputStream import InputStream
25from antlr4.Token import Token
26from antlr4.atn.ATN import ATN
27from antlr4.atn.ATNConfig import LexerATNConfig
28from antlr4.atn.ATNSimulator import ATNSimulator
29from antlr4.atn.ATNConfigSet import ATNConfigSet, OrderedATNConfigSet
30from antlr4.atn.ATNState import RuleStopState, ATNState
31from antlr4.atn.LexerActionExecutor import LexerActionExecutor
32from antlr4.atn.Transition import Transition
33from antlr4.dfa.DFAState import DFAState
34from antlr4.error.Errors import LexerNoViableAltException, UnsupportedOperationException
35
36class SimState(object):
37
38    def __init__(self):
39        self.reset()
40
41    def reset(self):
42        self.index = -1
43        self.line = 0
44        self.column = -1
45        self.dfaState = None
46
47# need forward declaration
48Lexer = None
49LexerATNSimulator = None
50
51class LexerATNSimulator(ATNSimulator):
52
53    debug = False
54    dfa_debug = False
55
56    MIN_DFA_EDGE = 0
57    MAX_DFA_EDGE = 127 # forces unicode to stay in ATN
58
59    ERROR = None
60
61    match_calls = 0
62
63    def __init__(self, recog:Lexer, atn:ATN, decisionToDFA:list, sharedContextCache:PredictionContextCache):
64        super().__init__(atn, sharedContextCache)
65        self.decisionToDFA = decisionToDFA
66        self.recog = recog
67        # The current token's starting index into the character stream.
68        #  Shared across DFA to ATN simulation in case the ATN fails and the
69        #  DFA did not have a previous accept state. In this case, we use the
70        #  ATN-generated exception object.
71        self.startIndex = -1
72        # line number 1..n within the input#/
73        self.line = 1
74        # The index of the character relative to the beginning of the line 0..n-1#/
75        self.column = 0
76        from antlr4.Lexer import Lexer
77        self.mode = Lexer.DEFAULT_MODE
78        # Used during DFA/ATN exec to record the most recent accept configuration info
79        self.prevAccept = SimState()
80
81
82    def copyState(self, simulator:LexerATNSimulator ):
83        self.column = simulator.column
84        self.line = simulator.line
85        self.mode = simulator.mode
86        self.startIndex = simulator.startIndex
87
88    def match(self, input:InputStream , mode:int):
89        self.match_calls += 1
90        self.mode = mode
91        mark = input.mark()
92        try:
93            self.startIndex = input.index
94            self.prevAccept.reset()
95            dfa = self.decisionToDFA[mode]
96            if dfa.s0 is None:
97                return self.matchATN(input)
98            else:
99                return self.execATN(input, dfa.s0)
100        finally:
101            input.release(mark)
102
103    def reset(self):
104        self.prevAccept.reset()
105        self.startIndex = -1
106        self.line = 1
107        self.column = 0
108        from antlr4.Lexer import Lexer
109        self.mode = Lexer.DEFAULT_MODE
110
111    def matchATN(self, input:InputStream):
112        startState = self.atn.modeToStartState[self.mode]
113
114        if LexerATNSimulator.debug:
115            print("matchATN mode " + str(self.mode) + " start: " + str(startState))
116
117        old_mode = self.mode
118        s0_closure = self.computeStartState(input, startState)
119        suppressEdge = s0_closure.hasSemanticContext
120        s0_closure.hasSemanticContext = False
121
122        next = self.addDFAState(s0_closure)
123        if not suppressEdge:
124            self.decisionToDFA[self.mode].s0 = next
125
126        predict = self.execATN(input, next)
127
128        if LexerATNSimulator.debug:
129            print("DFA after matchATN: " + str(self.decisionToDFA[old_mode].toLexerString()))
130
131        return predict
132
133    def execATN(self, input:InputStream, ds0:DFAState):
134        if LexerATNSimulator.debug:
135            print("start state closure=" + str(ds0.configs))
136
137        if ds0.isAcceptState:
138            # allow zero-length tokens
139            self.captureSimState(self.prevAccept, input, ds0)
140
141        t = input.LA(1)
142        s = ds0 # s is current/from DFA state
143
144        while True: # while more work
145            if LexerATNSimulator.debug:
146                print("execATN loop starting closure:", str(s.configs))
147
148            # As we move src->trg, src->trg, we keep track of the previous trg to
149            # avoid looking up the DFA state again, which is expensive.
150            # If the previous target was already part of the DFA, we might
151            # be able to avoid doing a reach operation upon t. If s!=null,
152            # it means that semantic predicates didn't prevent us from
153            # creating a DFA state. Once we know s!=null, we check to see if
154            # the DFA state has an edge already for t. If so, we can just reuse
155            # it's configuration set; there's no point in re-computing it.
156            # This is kind of like doing DFA simulation within the ATN
157            # simulation because DFA simulation is really just a way to avoid
158            # computing reach/closure sets. Technically, once we know that
159            # we have a previously added DFA state, we could jump over to
160            # the DFA simulator. But, that would mean popping back and forth
161            # a lot and making things more complicated algorithmically.
162            # This optimization makes a lot of sense for loops within DFA.
163            # A character will take us back to an existing DFA state
164            # that already has lots of edges out of it. e.g., .* in comments.
165            # print("Target for:" + str(s) + " and:" + str(t))
166            target = self.getExistingTargetState(s, t)
167            # print("Existing:" + str(target))
168            if target is None:
169                target = self.computeTargetState(input, s, t)
170                # print("Computed:" + str(target))
171
172            if target == self.ERROR:
173                break
174
175            # If this is a consumable input element, make sure to consume before
176            # capturing the accept state so the input index, line, and char
177            # position accurately reflect the state of the interpreter at the
178            # end of the token.
179            if t != Token.EOF:
180                self.consume(input)
181
182            if target.isAcceptState:
183                self.captureSimState(self.prevAccept, input, target)
184                if t == Token.EOF:
185                    break
186
187            t = input.LA(1)
188
189            s = target # flip; current DFA target becomes new src/from state
190
191        return self.failOrAccept(self.prevAccept, input, s.configs, t)
192
193    # Get an existing target state for an edge in the DFA. If the target state
194    # for the edge has not yet been computed or is otherwise not available,
195    # this method returns {@code null}.
196    #
197    # @param s The current DFA state
198    # @param t The next input symbol
199    # @return The existing target DFA state for the given input symbol
200    # {@code t}, or {@code null} if the target state for this edge is not
201    # already cached
202    def getExistingTargetState(self, s:DFAState, t:int):
203        if s.edges is None or t < self.MIN_DFA_EDGE or t > self.MAX_DFA_EDGE:
204            return None
205
206        target = s.edges[t - self.MIN_DFA_EDGE]
207        if LexerATNSimulator.debug and target is not None:
208            print("reuse state", str(s.stateNumber), "edge to", str(target.stateNumber))
209
210        return target
211
212    # Compute a target state for an edge in the DFA, and attempt to add the
213    # computed state and corresponding edge to the DFA.
214    #
215    # @param input The input stream
216    # @param s The current DFA state
217    # @param t The next input symbol
218    #
219    # @return The computed target DFA state for the given input symbol
220    # {@code t}. If {@code t} does not lead to a valid DFA state, this method
221    # returns {@link #ERROR}.
222    def computeTargetState(self, input:InputStream, s:DFAState, t:int):
223        reach = OrderedATNConfigSet()
224
225        # if we don't find an existing DFA state
226        # Fill reach starting from closure, following t transitions
227        self.getReachableConfigSet(input, s.configs, reach, t)
228
229        if len(reach)==0: # we got nowhere on t from s
230            if not reach.hasSemanticContext:
231                # we got nowhere on t, don't throw out this knowledge; it'd
232                # cause a failover from DFA later.
233               self. addDFAEdge(s, t, self.ERROR)
234
235            # stop when we can't match any more char
236            return self.ERROR
237
238        # Add an edge from s to target DFA found/created for reach
239        return self.addDFAEdge(s, t, cfgs=reach)
240
241    def failOrAccept(self, prevAccept:SimState , input:InputStream, reach:ATNConfigSet, t:int):
242        if self.prevAccept.dfaState is not None:
243            lexerActionExecutor = prevAccept.dfaState.lexerActionExecutor
244            self.accept(input, lexerActionExecutor, self.startIndex, prevAccept.index, prevAccept.line, prevAccept.column)
245            return prevAccept.dfaState.prediction
246        else:
247            # if no accept and EOF is first char, return EOF
248            if t==Token.EOF and input.index==self.startIndex:
249                return Token.EOF
250            raise LexerNoViableAltException(self.recog, input, self.startIndex, reach)
251
252    # Given a starting configuration set, figure out all ATN configurations
253    #  we can reach upon input {@code t}. Parameter {@code reach} is a return
254    #  parameter.
255    def getReachableConfigSet(self, input:InputStream, closure:ATNConfigSet, reach:ATNConfigSet, t:int):
256        # this is used to skip processing for configs which have a lower priority
257        # than a config that already reached an accept state for the same rule
258        skipAlt = ATN.INVALID_ALT_NUMBER
259        for cfg in closure:
260            currentAltReachedAcceptState = ( cfg.alt == skipAlt )
261            if currentAltReachedAcceptState and cfg.passedThroughNonGreedyDecision:
262                continue
263
264            if LexerATNSimulator.debug:
265                print("testing", self.getTokenName(t), "at",  str(cfg))
266
267            for trans in cfg.state.transitions:          # for each transition
268                target = self.getReachableTarget(trans, t)
269                if target is not None:
270                    lexerActionExecutor = cfg.lexerActionExecutor
271                    if lexerActionExecutor is not None:
272                        lexerActionExecutor = lexerActionExecutor.fixOffsetBeforeMatch(input.index - self.startIndex)
273
274                    treatEofAsEpsilon = (t == Token.EOF)
275                    config = LexerATNConfig(state=target, lexerActionExecutor=lexerActionExecutor, config=cfg)
276                    if self.closure(input, config, reach, currentAltReachedAcceptState, True, treatEofAsEpsilon):
277                        # any remaining configs for this alt have a lower priority than
278                        # the one that just reached an accept state.
279                        skipAlt = cfg.alt
280
281    def accept(self, input:InputStream, lexerActionExecutor:LexerActionExecutor, startIndex:int, index:int, line:int, charPos:int):
282        if LexerATNSimulator.debug:
283            print("ACTION", lexerActionExecutor)
284
285        # seek to after last char in token
286        input.seek(index)
287        self.line = line
288        self.column = charPos
289
290        if lexerActionExecutor is not None and self.recog is not None:
291            lexerActionExecutor.execute(self.recog, input, startIndex)
292
293    def getReachableTarget(self, trans:Transition, t:int):
294        from antlr4.Lexer import Lexer
295        if trans.matches(t, 0, Lexer.MAX_CHAR_VALUE):
296            return trans.target
297        else:
298            return None
299
300    def computeStartState(self, input:InputStream, p:ATNState):
301        initialContext = PredictionContext.EMPTY
302        configs = OrderedATNConfigSet()
303        for i in range(0,len(p.transitions)):
304            target = p.transitions[i].target
305            c = LexerATNConfig(state=target, alt=i+1, context=initialContext)
306            self.closure(input, c, configs, False, False, False)
307        return configs
308
309    # Since the alternatives within any lexer decision are ordered by
310    # preference, this method stops pursuing the closure as soon as an accept
311    # state is reached. After the first accept state is reached by depth-first
312    # search from {@code config}, all other (potentially reachable) states for
313    # this rule would have a lower priority.
314    #
315    # @return {@code true} if an accept state is reached, otherwise
316    # {@code false}.
317    def closure(self, input:InputStream, config:LexerATNConfig, configs:ATNConfigSet, currentAltReachedAcceptState:bool,
318                speculative:bool, treatEofAsEpsilon:bool):
319        if LexerATNSimulator.debug:
320            print("closure(" + str(config) + ")")
321
322        if isinstance( config.state, RuleStopState ):
323            if LexerATNSimulator.debug:
324                if self.recog is not None:
325                    print("closure at", self.recog.symbolicNames[config.state.ruleIndex],  "rule stop", str(config))
326                else:
327                    print("closure at rule stop", str(config))
328
329            if config.context is None or config.context.hasEmptyPath():
330                if config.context is None or config.context.isEmpty():
331                    configs.add(config)
332                    return True
333                else:
334                    configs.add(LexerATNConfig(state=config.state, config=config, context=PredictionContext.EMPTY))
335                    currentAltReachedAcceptState = True
336
337            if config.context is not None and not config.context.isEmpty():
338                for i in range(0,len(config.context)):
339                    if config.context.getReturnState(i) != PredictionContext.EMPTY_RETURN_STATE:
340                        newContext = config.context.getParent(i) # "pop" return state
341                        returnState = self.atn.states[config.context.getReturnState(i)]
342                        c = LexerATNConfig(state=returnState, config=config, context=newContext)
343                        currentAltReachedAcceptState = self.closure(input, c, configs,
344                                    currentAltReachedAcceptState, speculative, treatEofAsEpsilon)
345
346            return currentAltReachedAcceptState
347
348        # optimization
349        if not config.state.epsilonOnlyTransitions:
350            if not currentAltReachedAcceptState or not config.passedThroughNonGreedyDecision:
351                configs.add(config)
352
353        for t in config.state.transitions:
354            c = self.getEpsilonTarget(input, config, t, configs, speculative, treatEofAsEpsilon)
355            if c is not None:
356                currentAltReachedAcceptState = self.closure(input, c, configs, currentAltReachedAcceptState, speculative, treatEofAsEpsilon)
357
358        return currentAltReachedAcceptState
359
360    # side-effect: can alter configs.hasSemanticContext
361    def getEpsilonTarget(self, input:InputStream, config:LexerATNConfig, t:Transition, configs:ATNConfigSet,
362                                           speculative:bool, treatEofAsEpsilon:bool):
363        c = None
364        if t.serializationType==Transition.RULE:
365                newContext = SingletonPredictionContext.create(config.context, t.followState.stateNumber)
366                c = LexerATNConfig(state=t.target, config=config, context=newContext)
367
368        elif t.serializationType==Transition.PRECEDENCE:
369                raise UnsupportedOperationException("Precedence predicates are not supported in lexers.")
370
371        elif t.serializationType==Transition.PREDICATE:
372                #  Track traversing semantic predicates. If we traverse,
373                # we cannot add a DFA state for this "reach" computation
374                # because the DFA would not test the predicate again in the
375                # future. Rather than creating collections of semantic predicates
376                # like v3 and testing them on prediction, v4 will test them on the
377                # fly all the time using the ATN not the DFA. This is slower but
378                # semantically it's not used that often. One of the key elements to
379                # this predicate mechanism is not adding DFA states that see
380                # predicates immediately afterwards in the ATN. For example,
381
382                # a : ID {p1}? | ID {p2}? ;
383
384                # should create the start state for rule 'a' (to save start state
385                # competition), but should not create target of ID state. The
386                # collection of ATN states the following ID references includes
387                # states reached by traversing predicates. Since this is when we
388                # test them, we cannot cash the DFA state target of ID.
389
390                if LexerATNSimulator.debug:
391                    print("EVAL rule "+ str(t.ruleIndex) + ":" + str(t.predIndex))
392                configs.hasSemanticContext = True
393                if self.evaluatePredicate(input, t.ruleIndex, t.predIndex, speculative):
394                    c = LexerATNConfig(state=t.target, config=config)
395
396        elif t.serializationType==Transition.ACTION:
397                if config.context is None or config.context.hasEmptyPath():
398                    # execute actions anywhere in the start rule for a token.
399                    #
400                    # TODO: if the entry rule is invoked recursively, some
401                    # actions may be executed during the recursive call. The
402                    # problem can appear when hasEmptyPath() is true but
403                    # isEmpty() is false. In this case, the config needs to be
404                    # split into two contexts - one with just the empty path
405                    # and another with everything but the empty path.
406                    # Unfortunately, the current algorithm does not allow
407                    # getEpsilonTarget to return two configurations, so
408                    # additional modifications are needed before we can support
409                    # the split operation.
410                    lexerActionExecutor = LexerActionExecutor.append(config.lexerActionExecutor,
411                                    self.atn.lexerActions[t.actionIndex])
412                    c = LexerATNConfig(state=t.target, config=config, lexerActionExecutor=lexerActionExecutor)
413
414                else:
415                    # ignore actions in referenced rules
416                    c = LexerATNConfig(state=t.target, config=config)
417
418        elif t.serializationType==Transition.EPSILON:
419            c = LexerATNConfig(state=t.target, config=config)
420
421        elif t.serializationType in [ Transition.ATOM, Transition.RANGE, Transition.SET ]:
422            if treatEofAsEpsilon:
423                from antlr4.Lexer import Lexer
424                if t.matches(Token.EOF, 0, Lexer.MAX_CHAR_VALUE):
425                    c = LexerATNConfig(state=t.target, config=config)
426
427        return c
428
429    # Evaluate a predicate specified in the lexer.
430    #
431    # <p>If {@code speculative} is {@code true}, this method was called before
432    # {@link #consume} for the matched character. This method should call
433    # {@link #consume} before evaluating the predicate to ensure position
434    # sensitive values, including {@link Lexer#getText}, {@link Lexer#getLine},
435    # and {@link Lexer#getcolumn}, properly reflect the current
436    # lexer state. This method should restore {@code input} and the simulator
437    # to the original state before returning (i.e. undo the actions made by the
438    # call to {@link #consume}.</p>
439    #
440    # @param input The input stream.
441    # @param ruleIndex The rule containing the predicate.
442    # @param predIndex The index of the predicate within the rule.
443    # @param speculative {@code true} if the current index in {@code input} is
444    # one character before the predicate's location.
445    #
446    # @return {@code true} if the specified predicate evaluates to
447    # {@code true}.
448    #/
449    def evaluatePredicate(self, input:InputStream, ruleIndex:int, predIndex:int, speculative:bool):
450        # assume true if no recognizer was provided
451        if self.recog is None:
452            return True
453
454        if not speculative:
455            return self.recog.sempred(None, ruleIndex, predIndex)
456
457        savedcolumn = self.column
458        savedLine = self.line
459        index = input.index
460        marker = input.mark()
461        try:
462            self.consume(input)
463            return self.recog.sempred(None, ruleIndex, predIndex)
464        finally:
465            self.column = savedcolumn
466            self.line = savedLine
467            input.seek(index)
468            input.release(marker)
469
470    def captureSimState(self, settings:SimState, input:InputStream, dfaState:DFAState):
471        settings.index = input.index
472        settings.line = self.line
473        settings.column = self.column
474        settings.dfaState = dfaState
475
476    def addDFAEdge(self, from_:DFAState, tk:int, to:DFAState=None, cfgs:ATNConfigSet=None) -> DFAState:
477
478        if to is None and cfgs is not None:
479            # leading to this call, ATNConfigSet.hasSemanticContext is used as a
480            # marker indicating dynamic predicate evaluation makes this edge
481            # dependent on the specific input sequence, so the static edge in the
482            # DFA should be omitted. The target DFAState is still created since
483            # execATN has the ability to resynchronize with the DFA state cache
484            # following the predicate evaluation step.
485            #
486            # TJP notes: next time through the DFA, we see a pred again and eval.
487            # If that gets us to a previously created (but dangling) DFA
488            # state, we can continue in pure DFA mode from there.
489            #/
490            suppressEdge = cfgs.hasSemanticContext
491            cfgs.hasSemanticContext = False
492
493            to = self.addDFAState(cfgs)
494
495            if suppressEdge:
496                return to
497
498        # add the edge
499        if tk < self.MIN_DFA_EDGE or tk > self.MAX_DFA_EDGE:
500            # Only track edges within the DFA bounds
501            return to
502
503        if LexerATNSimulator.debug:
504            print("EDGE " + str(from_) + " -> " + str(to) + " upon "+ chr(tk))
505
506        if from_.edges is None:
507            #  make room for tokens 1..n and -1 masquerading as index 0
508            from_.edges = [ None ] * (self.MAX_DFA_EDGE - self.MIN_DFA_EDGE + 1)
509
510        from_.edges[tk - self.MIN_DFA_EDGE] = to # connect
511
512        return to
513
514
515    # Add a new DFA state if there isn't one with this set of
516    # configurations already. This method also detects the first
517    # configuration containing an ATN rule stop state. Later, when
518    # traversing the DFA, we will know which rule to accept.
519    def addDFAState(self, configs:ATNConfigSet) -> DFAState:
520
521        proposed = DFAState(configs=configs)
522        firstConfigWithRuleStopState = next((cfg for cfg in configs if isinstance(cfg.state, RuleStopState)), None)
523
524        if firstConfigWithRuleStopState is not None:
525            proposed.isAcceptState = True
526            proposed.lexerActionExecutor = firstConfigWithRuleStopState.lexerActionExecutor
527            proposed.prediction = self.atn.ruleToTokenType[firstConfigWithRuleStopState.state.ruleIndex]
528
529        dfa = self.decisionToDFA[self.mode]
530        existing = dfa.states.get(proposed, None)
531        if existing is not None:
532            return existing
533
534        newState = proposed
535
536        newState.stateNumber = len(dfa.states)
537        configs.setReadonly(True)
538        newState.configs = configs
539        dfa.states[newState] = newState
540        return newState
541
542    def getDFA(self, mode:int):
543        return self.decisionToDFA[mode]
544
545    # Get the text matched so far for the current token.
546    def getText(self, input:InputStream):
547        # index is first lookahead char, don't include.
548        return input.getText(self.startIndex, input.index-1)
549
550    def consume(self, input:InputStream):
551        curChar = input.LA(1)
552        if curChar==ord('\n'):
553            self.line += 1
554            self.column = 0
555        else:
556            self.column += 1
557        input.consume()
558
559    def getTokenName(self, t:int):
560        if t==-1:
561            return "EOF"
562        else:
563            return "'" + chr(t) + "'"
564
565
566LexerATNSimulator.ERROR = DFAState(0x7FFFFFFF, ATNConfigSet())
567
568del Lexer