# -*- coding: utf-8 -*-
"""Lexer for xonsh code.

Written using a hybrid of ``tokenize`` and PLY.
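
A minimal, illustrative use of the ``Lexer`` class defined below::

    lexer = Lexer()
    lexer.input("print(1)")
    for tok in lexer:
        print(tok.type, tok.value)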
5"""
6import io
7
8# 'keyword' interferes with ast.keyword
9import keyword as kwmod
10
11try:
12    from ply.lex import LexToken
13except ImportError:
14    from xonsh.ply.ply.lex import LexToken
15
16from xonsh.lazyasd import lazyobject
17from xonsh.platform import PYTHON_VERSION_INFO
from xonsh.tokenize import (
    OP,
    IOREDIRECT,
    STRING,
    DOLLARNAME,
    NUMBER,
    SEARCHPATH,
    NEWLINE,
    INDENT,
    DEDENT,
    NL,
    COMMENT,
    ENCODING,
    ENDMARKER,
    NAME,
    ERRORTOKEN,
    GREATER,
    LESS,
    RIGHTSHIFT,
    tokenize,
    TokenError,
)


@lazyobject
def token_map():
    """Mapping from ``tokenize`` tokens (or token types) to PLY token types. If
    a simple one-to-one mapping from ``tokenize`` to PLY exists, the lexer will
    look it up here and generate a single PLY token of the given type.
    Otherwise, it will fall back to handling that token using one of the
    handlers in ``special_handlers``.
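
    For example, once the mapping is constructed::

        token_map[(OP, "+")]   # -> "PLUS"
        token_map[NUMBER]      # -> "NUMBER"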
49    """
50    tm = {}
51    # operators
52    _op_map = {
53        # punctuation
54        ",": "COMMA",
55        ".": "PERIOD",
56        ";": "SEMI",
57        ":": "COLON",
58        "...": "ELLIPSIS",
59        # basic operators
60        "+": "PLUS",
61        "-": "MINUS",
62        "*": "TIMES",
63        "@": "AT",
64        "/": "DIVIDE",
65        "//": "DOUBLEDIV",
66        "%": "MOD",
67        "**": "POW",
68        "|": "PIPE",
69        "~": "TILDE",
70        "^": "XOR",
71        "<<": "LSHIFT",
72        ">>": "RSHIFT",
73        "<": "LT",
74        "<=": "LE",
75        ">": "GT",
76        ">=": "GE",
77        "==": "EQ",
78        "!=": "NE",
79        "->": "RARROW",
80        # assignment operators
81        "=": "EQUALS",
82        "+=": "PLUSEQUAL",
83        "-=": "MINUSEQUAL",
84        "*=": "TIMESEQUAL",
85        "@=": "ATEQUAL",
86        "/=": "DIVEQUAL",
87        "%=": "MODEQUAL",
88        "**=": "POWEQUAL",
89        "<<=": "LSHIFTEQUAL",
90        ">>=": "RSHIFTEQUAL",
91        "&=": "AMPERSANDEQUAL",
92        "^=": "XOREQUAL",
93        "|=": "PIPEEQUAL",
94        "//=": "DOUBLEDIVEQUAL",
95        # extra xonsh operators
96        "?": "QUESTION",
97        "??": "DOUBLE_QUESTION",
98        "@$": "ATDOLLAR",
99        "&": "AMPERSAND",
100    }
101    for (op, typ) in _op_map.items():
102        tm[(OP, op)] = typ
103    tm[IOREDIRECT] = "IOREDIRECT"
104    tm[STRING] = "STRING"
105    tm[DOLLARNAME] = "DOLLAR_NAME"
106    tm[NUMBER] = "NUMBER"
107    tm[SEARCHPATH] = "SEARCHPATH"
108    tm[NEWLINE] = "NEWLINE"
109    tm[INDENT] = "INDENT"
110    tm[DEDENT] = "DEDENT"
111    if (3, 5, 0) <= PYTHON_VERSION_INFO < (3, 7, 0):
112        from xonsh.tokenize import ASYNC, AWAIT
113
114        tm[ASYNC] = "ASYNC"
115        tm[AWAIT] = "AWAIT"
116    return tm


def handle_name(state, token):
    """Function for handling name tokens.

    In Python mode, names matching Python keywords are emitted as their own
    token types (e.g. ``IF``).  In subprocess mode, only ``and``/``or`` that
    are separated from the previous token by whitespace are treated as
    keywords; every other name is emitted as a plain ``NAME``.
    """
    typ = "NAME"
    if state["pymode"][-1][0]:
        if token.string in kwmod.kwlist:
            typ = token.string.upper()
        state["last"] = token
        yield _new_token(typ, token.string, token.start)
    else:
        prev = state["last"]
        state["last"] = token
        has_whitespace = prev.end != token.start
        if token.string == "and" and has_whitespace:
            yield _new_token("AND", token.string, token.start)
        elif token.string == "or" and has_whitespace:
            yield _new_token("OR", token.string, token.start)
        else:
            yield _new_token("NAME", token.string, token.start)


def _end_delimiter(state, token):
    """Returns an error message if ``token`` does not properly close the most
    recently opened delimiter, or if there is no delimiter left to close;
    returns None on a successful match.
    """
    py = state["pymode"]
    s = token.string
    l, c = token.start
    if len(py) > 1:
        mode, orig, match, pos = py.pop()
        if s != match:
            e = '"{}" at {} ends "{}" at {} (expected "{}")'
            return e.format(s, (l, c), orig, pos, match)
    else:
        return 'Unmatched "{}" at line {}, column {}'.format(s, l, c)


def handle_rparen(state, token):
    """Function for handling ``)``"""
    e = _end_delimiter(state, token)
    if e is None:
        state["last"] = token
        yield _new_token("RPAREN", ")", token.start)
    else:
        yield _new_token("ERRORTOKEN", e, token.start)


def handle_rbrace(state, token):
    """Function for handling ``}``"""
    e = _end_delimiter(state, token)
    if e is None:
        state["last"] = token
        yield _new_token("RBRACE", "}", token.start)
    else:
        yield _new_token("ERRORTOKEN", e, token.start)


def handle_rbracket(state, token):
    """Function for handling ``]``"""
    e = _end_delimiter(state, token)
    if e is None:
        state["last"] = token
        yield _new_token("RBRACKET", "]", token.start)
    else:
        yield _new_token("ERRORTOKEN", e, token.start)


def handle_error_space(state, token):
    """Function for handling special whitespace characters in subprocess mode"""
    if not state["pymode"][-1][0]:
        state["last"] = token
        yield _new_token("WS", token.string, token.start)
    else:
        yield from []


def handle_error_linecont(state, token):
    """Function for handling special line continuations as whitespace
    characters in subprocess mode.
    """
    if state["pymode"][-1][0]:
        return
    prev = state["last"]
    if prev.end != token.start:
        return  # previous token is separated by whitespace
    state["last"] = token
    yield _new_token("WS", "\\", token.start)


def handle_error_token(state, token):
    """Function for handling error tokens"""
    state["last"] = token
    if token.string == "!":
        typ = "BANG"
    elif not state["pymode"][-1][0]:
        typ = "NAME"
    else:
        typ = "ERRORTOKEN"
    yield _new_token(typ, token.string, token.start)


def handle_ignore(state, token):
    """Function for handling tokens that should be ignored"""
    yield from []


def handle_double_amps(state, token):
    """Function for handling ``&&`` as the ``and`` operator"""
    yield _new_token("AND", "and", token.start)


def handle_double_pipe(state, token):
    """Function for handling ``||`` as the ``or`` operator"""
    yield _new_token("OR", "or", token.start)


def handle_redirect(state, token):
    """Function for handling redirection tokens (``<``, ``>``, ``>>``, etc.)."""
    # The parser expects whitespace after a redirection in subproc mode.
    # If whitespace does not exist, we'll issue an empty whitespace
    # token before proceeding.
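    # For example (illustrative): in subprocess mode, ``echo hi >out`` has no
    # whitespace between ``>`` and ``out``, so a zero-width WS token is
    # emitted between the redirect token and the token for ``out``.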
    state["last"] = token
    typ = token.type
    st = token.string
    key = (typ, st) if (typ, st) in token_map else typ
    yield _new_token(token_map[key], st, token.start)
    if state["pymode"][-1][0]:
        return
    # add a whitespace token after a redirection, if we need to
    next_tok = next(state["stream"])
    if next_tok.start == token.end:
        yield _new_token("WS", "", token.end)
    yield from handle_token(state, next_tok)


def _make_matcher_handler(tok, typ, pymode, ender, handlers):
    """Registers a handler in ``handlers`` for the opening delimiter ``tok``.

    The handler yields a PLY token of type ``typ`` and pushes a new mode
    context onto ``state["pymode"]`` (Python mode if ``pymode`` is True,
    subprocess mode otherwise).  The matching closing delimiter is derived
    from ``tok``; ``ender`` is currently unused.
    """
    matcher = (
        ")"
        if tok.endswith("(")
        else "}"
        if tok.endswith("{")
        else "]"
        if tok.endswith("[")
        else None
    )

    def _inner_handler(state, token):
        state["pymode"].append((pymode, tok, matcher, token.start))
        state["last"] = token
        yield _new_token(typ, tok, token.start)

    handlers[(OP, tok)] = _inner_handler


@lazyobject
def special_handlers():
    """Mapping from ``tokenize`` tokens (or token types) to the proper
    function for generating PLY tokens from them.  In addition to
    yielding PLY tokens, these functions may manipulate the Lexer's state.
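
    For instance, ``(OP, "&&")`` dispatches to a handler that yields an
    ``AND`` token, while openers registered via ``_make_matcher_handler``
    push a new mode context::

        # illustrative
        special_handlers[(OP, "&&")]   # -> handle_double_amps
        special_handlers[(OP, "$(")]   # -> handler entering subprocess mode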
279    """
280    sh = {
281        NL: handle_ignore,
282        COMMENT: handle_ignore,
283        ENCODING: handle_ignore,
284        ENDMARKER: handle_ignore,
285        NAME: handle_name,
286        ERRORTOKEN: handle_error_token,
287        LESS: handle_redirect,
288        GREATER: handle_redirect,
289        RIGHTSHIFT: handle_redirect,
290        IOREDIRECT: handle_redirect,
291        (OP, "<"): handle_redirect,
292        (OP, ">"): handle_redirect,
293        (OP, ">>"): handle_redirect,
294        (OP, ")"): handle_rparen,
295        (OP, "}"): handle_rbrace,
296        (OP, "]"): handle_rbracket,
297        (OP, "&&"): handle_double_amps,
298        (OP, "||"): handle_double_pipe,
299        (ERRORTOKEN, " "): handle_error_space,
300        (ERRORTOKEN, "\\\n"): handle_error_linecont,
301        (ERRORTOKEN, "\\\r\n"): handle_error_linecont,
302    }
303    _make_matcher_handler("(", "LPAREN", True, ")", sh)
304    _make_matcher_handler("[", "LBRACKET", True, "]", sh)
305    _make_matcher_handler("{", "LBRACE", True, "}", sh)
306    _make_matcher_handler("$(", "DOLLAR_LPAREN", False, ")", sh)
307    _make_matcher_handler("$[", "DOLLAR_LBRACKET", False, "]", sh)
308    _make_matcher_handler("${", "DOLLAR_LBRACE", True, "}", sh)
309    _make_matcher_handler("!(", "BANG_LPAREN", False, ")", sh)
310    _make_matcher_handler("![", "BANG_LBRACKET", False, "]", sh)
311    _make_matcher_handler("@(", "AT_LPAREN", True, ")", sh)
312    _make_matcher_handler("@$(", "ATDOLLAR_LPAREN", False, ")", sh)
313    return sh


def handle_token(state, token):
    """
    General-purpose token handler.  Makes use of ``token_map`` or
    ``special_handlers`` to yield one or more PLY tokens from the given input.

    Parameters
    ----------
    state :
        The current state of the lexer, including information about whether
        we are in Python mode or subprocess mode, which changes the lexer's
        behavior.  Also includes the stream of tokens yet to be considered.
    token :
        The token (from ``tokenize``) currently under consideration.
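
    Notes
    -----
    Dispatch prefers an exact ``(type, string)`` match in ``special_handlers``,
    then in ``token_map``, before falling back to the bare token type, e.g.
    (illustrative)::

        (OP, "&&")  -> handle_double_amps  -> AND
        (OP, "+")   -> token_map           -> PLUS
        NUMBER      -> token_map           -> NUMBER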
330    """
331    typ = token.type
332    st = token.string
333    pymode = state["pymode"][-1][0]
334    if not pymode:
335        if state["last"] is not None and state["last"].end != token.start:
336            cur = token.start
337            old = state["last"].end
338            if cur[0] == old[0] and cur[1] > old[1]:
339                yield _new_token("WS", token.line[old[1] : cur[1]], old)
340    if (typ, st) in special_handlers:
341        yield from special_handlers[(typ, st)](state, token)
342    elif (typ, st) in token_map:
343        state["last"] = token
344        yield _new_token(token_map[(typ, st)], st, token.start)
345    elif typ in special_handlers:
346        yield from special_handlers[typ](state, token)
347    elif typ in token_map:
348        state["last"] = token
349        yield _new_token(token_map[typ], st, token.start)
350    else:
351        m = "Unexpected token: {0}".format(token)
352        yield _new_token("ERRORTOKEN", m, token.start)


def get_tokens(s):
    """
    Given a string containing xonsh code, generates a stream of relevant PLY
    tokens using ``handle_token``.
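
    A minimal, illustrative example (token types shown are indicative)::

        for tok in get_tokens("x = 42"):
            print(tok.type, tok.value)   # e.g. NAME, EQUALS, NUMBER, NEWLINE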
359    """
360    state = {
361        "indents": [0],
362        "last": None,
363        "pymode": [(True, "", "", (0, 0))],
364        "stream": tokenize(io.BytesIO(s.encode("utf-8")).readline),
365    }
366    while True:
367        try:
368            token = next(state["stream"])
369            yield from handle_token(state, token)
370        except StopIteration:
371            if len(state["pymode"]) > 1:
372                pm, o, m, p = state["pymode"][-1]
373                l, c = p
374                e = 'Unmatched "{}" at line {}, column {}'
375                yield _new_token("ERRORTOKEN", e.format(o, l, c), (0, 0))
376            break
377        except TokenError as e:
378            # this is recoverable in single-line mode (from the shell)
379            # (e.g., EOF while scanning string literal)
380            yield _new_token("ERRORTOKEN", e.args[0], (0, 0))
381            break
382        except IndentationError as e:
383            # this is never recoverable
384            yield _new_token("ERRORTOKEN", e, (0, 0))
385            break


def _new_token(type, value, pos):
    """Synthesizes a new PLY ``LexToken`` with the given type, value, and
    ``(lineno, lexpos)`` position.
    """
    o = LexToken()
    o.type = type
    o.value = value
    o.lineno, o.lexpos = pos
    return o


class Lexer(object):
    """Implements a lexer for the xonsh language."""

    _tokens = None

    def __init__(self):
        """
        Attributes
        ----------
        fname : str
            Filename.
        last : token
            The last token seen.
        beforelast : token
            The second-to-last token seen.
        """
        self.fname = ""
        self.last = None
        self.beforelast = None

    def build(self, **kwargs):
        """Part of the PLY lexer API."""
        pass

    def reset(self):
        pass

    def input(self, s):
        """Calls the lexer on the string ``s``."""
        self.token_stream = get_tokens(s)

    def token(self):
        """Retrieves the next token."""
        self.beforelast = self.last
        self.last = next(self.token_stream, None)
        return self.last

    def __iter__(self):
        t = self.token()
        while t is not None:
            yield t
            t = self.token()

    def split(self, s):
        """Splits a string into a list of strings that are whitespace-separated
        tokens.
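
        An illustrative example (exact splitting depends on the token
        stream)::

            Lexer().split("ls /tmp")   # -> ["ls", "/tmp"]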
444        """
445        vals = []
446        self.input(s)
447        l = c = -1
448        ws = "WS"
449        nl = "\n"
450        for t in self:
451            if t.type == ws:
452                continue
453            elif l < t.lineno:
454                vals.append(t.value)
455            elif len(vals) > 0 and c == t.lexpos:
456                vals[-1] = vals[-1] + t.value
457            else:
458                vals.append(t.value)
459            nnl = t.value.count(nl)
460            if nnl == 0:
461                l = t.lineno
462                c = t.lexpos + len(t.value)
463            else:
464                l = t.lineno + nnl
465                c = len(t.value.rpartition(nl)[-1])
466        return vals

    #
    # All the tokens recognized by the lexer
    #
    @property
    def tokens(self):
        if self._tokens is None:
            t = (
                tuple(token_map.values())
                + (
                    "NAME",  # name tokens
                    "BANG",  # ! tokens
                    "WS",  # whitespace in subprocess mode
                    "LPAREN",
                    "RPAREN",  # ( )
                    "LBRACKET",
                    "RBRACKET",  # [ ]
                    "LBRACE",
                    "RBRACE",  # { }
                    "AT_LPAREN",  # @(
                    "BANG_LPAREN",  # !(
                    "BANG_LBRACKET",  # ![
                    "DOLLAR_LPAREN",  # $(
                    "DOLLAR_LBRACE",  # ${
                    "DOLLAR_LBRACKET",  # $[
                    "ATDOLLAR_LPAREN",  # @$(
                    "ERRORTOKEN",  # whoops!
                )
                + tuple(i.upper() for i in kwmod.kwlist)
            )
            self._tokens = t
        return self._tokens