jsparagus/jsparagus/runtime.py

"""Runtime support for jsparagus-generated parsers."""

# Nt is unused here, but we re-export it.
from .grammar import Nt, InitNt, End
from .lexer import UnexpectedEndError
import collections
from dataclasses import dataclass


__all__ = ['ACCEPT', 'ERROR', 'Nt', 'InitNt', 'End', 'Parser', 'ErrorToken']

# Actions are encoded as 64-bit signed integers, with the following meanings:
# - n in range(0, 0x8000_0000_0000_0000) - shift to state n
# - n in range(0x8000_0000_0000_0000, 0xc000_0000_0000_0000) - call special_case(n & SPECIAL_CASE_MASK)
# - n == ERROR (0xbfff_ffff_ffff_fffe)
# - n == ACCEPT (0xbfff_ffff_ffff_ffff)
# - n in range(0xc000_0000_0000_0000, 0x1_0000_0000_0000_0000) - reduce by production -n - 1

SPECIAL_CASE_MASK = 0x3fff_ffff_ffff_ffff
SPECIAL_CASE_TAG = -0x8000_0000_0000_0000
ACCEPT = 0x_bfff_ffff_ffff_ffff - (1 << 64)
ERROR = ACCEPT - 1


@dataclass(frozen=True)
class ErrorTokenClass:
    def __repr__(self):
        return 'ErrorToken'


ErrorToken = ErrorTokenClass()


def throw_syntax_error(actions, state, t, tokens):
    assert t is not None
    if isinstance(state, StateTermValue):
        state = state.state
    expected = set(actions[state].keys())
    expected = set(e for e in expected if not isinstance(e, Nt))

    # Tidy up the `expected` set a bit.
    if End() in expected:
        expected.remove(End())
        expected.add("end of input")
    if ErrorToken in expected:
        # This is possible because we restore the stack in _try_error_handling
        # after reducing and then failing to find a recovery rule after all.
        # But don't tell people in error messages that an error is one of the
        # things we expect. It makes no sense.
        expected.remove(ErrorToken)

    if len(expected) < 2:
        tokens.throw("expected {!r}, got {!r}".format(list(expected)[0], t))
    else:
        tokens.throw("expected one of {!r}, got {!r}"
                     .format(sorted(expected), t))


StateTermValue = collections.namedtuple("StateTermValue", "state term value new_line")


class ShiftError(Exception):
    pass


class ShiftAccept(Exception):
    pass


class Parser:
    """Parser using jsparagus-generated tables.

    The usual design is, a parser object consumes a token iterator.
    This Parser is not like that. Instead, the lexer feeds tokens to it
    by calling `parser.write_terminal(lexer, token)` repeatedly, then
    `parser.close(lexer)`.

    The parser uses these methods of the lexer object:

    *   lexer.take() - Return data associated with a token, like the
        numeric value of an int literal token.

    *   lexer.throw(message) - Throw a syntax error. (This is on the lexer
        because the lexer has the current position.)

    *   lexer.throw_unexpected_end() - Throw a syntax error after we
        successfully parsed the whole file except more tokens were expected at
        the end.

    """

    def __init__(self, actions, error_codes, entry_state, methods):
        self.actions = actions
        self.stack = [StateTermValue(entry_state, None, None, False)]
        self.replay = []
        self.flags = collections.defaultdict(lambda: [])
        self.error_codes = error_codes
        self.methods = methods
        self.closed = False
        self.debug = False
        self.is_simulator = False
        self.last_shift = None

    def clone(self):
        return Parser(self.actions, self.error_codes, 0, self.methods)

    def simulator_clone(self):
        """Make a copy of this parser for simulation.

        The copy has a version of the self.reductions table that never actually
        does anything.

        This is absurdly expensive and is for very odd and special use cases.
        """
        p = self.clone()
        p.stack = self.stack[:]
        p.replay = self.replay[:]
        p.debug = self.debug
        p.is_simulator = True
        return p

    def _str_stv(self, stv):
        # NOTE: replace this function by repr(), to inspect wrong computations.
        val = ''
        if stv.value:
            val = '*'
        return "-- {} {}--> {}".format(stv.term, val, stv.state)

    def _dbg_where(self, t=""):
        name = "stack"
        if self.is_simulator:
            name = "simulator"
        print("{}: {}; {}\nexpect one of: {}".format(
            name,
            " ".join(self._str_stv(s) for s in self.stack), t,
            repr(self.actions[self.stack[-1].state])
        ))

    def _shift(self, stv, lexer):
        state = self.stack[-1].state
        if self.debug:
            self._dbg_where("shift: {}".format(str(stv.term)))
        if not isinstance(self.actions[state], dict):
            # This happens after raising a ShiftAccept error.
            if stv.term == End():
                raise ShiftAccept()
            raise ShiftError()
        self.last_shift = (state, stv)
        while True:
            goto = self.actions[state].get(stv.term, ERROR)
            if goto == ERROR:
                if self.debug:
                    self._dbg_where("(error)")
                self._try_error_handling(lexer, stv)
                stv = self.replay.pop()
                if self.debug:
                    self._dbg_where("error: {}".format(str(stv.term)))
                continue
            state = goto
            self.stack.append(StateTermValue(state, stv.term, stv.value, stv.new_line))
            action = self.actions[state]
            if not isinstance(action, dict):  # Action
                if self.debug:
                    self._dbg_where("(action {})".format(state))
                action(self, lexer)
                state = self.stack[-1].state
                action = self.actions[state]
                # Actions should always unwind or do an epsilon transition to a
                # shift state.
                assert isinstance(action, dict)
            if self.replay != []:
                stv = self.replay.pop()
                if self.debug:
                    self._dbg_where("replay: {}".format(repr(stv.term)))
            else:
                break

    def replay_action(self, dest):
        # This code emulates the code which would be executed by the shift
        # function, if we were to return to this shift function instead of
        # staying within the action functions. The destination provided as
        # argument should match the content of the parse table, otherwise this
        # would imply that the replay action does not encode a transition from
        # the parse table.
        state = self.stack[-1].state
        stv = self.replay.pop()
        if self.debug:
            self._dbg_where("(inline-replay: {})".format(repr(stv.term)))
        goto = self.actions[state].get(stv.term, ERROR)
        assert goto == dest
        self.stack.append(StateTermValue(dest, stv.term, stv.value, stv.new_line))

    def shift_list(self, stv_list, lexer):
        self.replay.extend(reversed(stv_list))

    def write_terminal(self, lexer, t):
        assert not self.closed
        try:
            stv = StateTermValue(0, t, lexer.take(), lexer.saw_line_terminator())
            self._shift(stv, lexer)
        except ShiftAccept:
            if self.debug:
                self._dbg_where("(write_terminal accept)")
            if self.replay != []:
                state, stv = self.last_shift
                throw_syntax_error(self.actions, state, lexer.take(), lexer)
        except ShiftError:
            state, stv = self.last_shift
            throw_syntax_error(self.actions, state, lexer.take(), lexer)

    def close(self, lexer):
        assert not self.closed
        self.closed = True
        try:
            self._shift(StateTermValue(0, End(), End(), False), lexer)
        except ShiftAccept:
            if self.debug:
                self._dbg_where("(close accept)")
                print(repr(self.stack))
            while self.stack[-1].term == End():
                self.stack.pop()
            assert len(self.stack) == 2
            assert self.stack[0].term is None
            assert isinstance(self.stack[1].term, Nt)
            return self.stack[1].value

    def top_state(self):
        return self.stack[-1].state

    def check_not_on_new_line(self, lexer, peek):
        if peek <= 0:
            raise ValueError("check_not_on_new_line got an impossible peek offset")
        if not self.stack[-peek].new_line:
            return True
        for _ in range(peek - 1):
            self.replay.append(self.stack.pop())
        stv = self.stack.pop()
        self._try_error_handling(lexer, stv)
        return False

    def _try_error_handling(self, lexer, stv):
        # Error recovery version of the code in write_terminal. Three differences
        # between this and write_terminal are commented below.
        if stv.term is ErrorToken:
            if stv.value == End():
                lexer.throw_unexpected_end()
                raise
            throw_syntax_error(self.actions, self.stack[-1], stv.value, lexer)
            raise

        state = self.stack[-1].state
        error_code = self.error_codes[state]
        if error_code is not None:
            self.on_recover(error_code, lexer, stv)
            self.replay.append(stv)
            self.replay.append(StateTermValue(0, ErrorToken, stv.value, stv.new_line))
        elif stv.term == End():
            lexer.throw_unexpected_end()
            raise
        else:
            throw_syntax_error(self.actions, self.stack[-1], stv.value, lexer)
            raise

    def on_recover(self, error_code, lexer, stv):
        """Called when the grammar says to recover from a parse error.

        Subclasses can override this to add custom code when an ErrorSymbol in
        a production is matched. This base-class implementation does nothing,
        allowing the parser to recover from the error silently.
        """
        pass

    def can_accept_terminal(self, lexer, t):
        """Return True if the terminal `t` is OK next.

        False if it's an error. `t` can be None, querying if we can accept
        end-of-input.
        """
        class BogusLexer:
            def throw_unexpected_end(self):
                raise UnexpectedEndError("")

            def throw(self, message):
                raise SyntaxError(message)

            def take(self):
                return str(t)

            def saw_line_terminator(self):
                return lexer.saw_line_terminator()

        sim = self.simulator_clone()
        try:
            sim.write_terminal(BogusLexer(), t)
        except Exception:
            return False
        return True

    def can_close(self):
        """Return True if self.close() would succeed."""

        # The easy case: no error, parsing just succeeds.
        # The hard case: maybe error-handling would succeed?
        # The easiest thing is simply to run the method.
        class BogusLexer:
            def throw_unexpected_end(self):
                raise UnexpectedEndError("")

            def throw(self, message):
                raise SyntaxError(message)

        sim = self.simulator_clone()
        try:
            sim.close(BogusLexer())
        except SyntaxError:
            return False
        return True