1"""Tokenization help for xonsh programs.
2
This file is a modified version of tokenize.py from the Python 3.4 and 3.5
4standard libraries (licensed under the Python Software Foundation License,
5version 2), which provides tokenization help for Python programs.
6
It is modified to properly tokenize xonsh code, including backtick regex
search paths and several xonsh-specific operators.
9
10A few pieces of this file are specific to the version of Python being used.
To find these pieces, search for PY35.
12
13Original file credits:
14   __author__ = 'Ka-Ping Yee <ping@lfw.org>'
15   __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
16                  'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
17                  'Michael Foord')
18"""
19
20import re
21import io
22import sys
23import codecs
24import builtins
25import itertools
26import collections
27import token
28from token import (
29    AMPER,
30    AMPEREQUAL,
31    AT,
32    CIRCUMFLEX,
33    CIRCUMFLEXEQUAL,
34    COLON,
35    COMMA,
36    DEDENT,
37    DOT,
38    DOUBLESLASH,
39    DOUBLESLASHEQUAL,
40    DOUBLESTAR,
41    DOUBLESTAREQUAL,
42    ENDMARKER,
43    EQEQUAL,
44    EQUAL,
45    ERRORTOKEN,
46    GREATER,
47    GREATEREQUAL,
48    INDENT,
49    LBRACE,
50    LEFTSHIFT,
51    LEFTSHIFTEQUAL,
52    LESS,
53    LESSEQUAL,
54    LPAR,
55    LSQB,
56    MINEQUAL,
57    MINUS,
58    NAME,
59    NEWLINE,
60    NOTEQUAL,
61    NUMBER,
62    N_TOKENS,
63    OP,
64    PERCENT,
65    PERCENTEQUAL,
66    PLUS,
67    PLUSEQUAL,
68    RBRACE,
69    RIGHTSHIFT,
70    RIGHTSHIFTEQUAL,
71    RPAR,
72    RSQB,
73    SEMI,
74    SLASH,
75    SLASHEQUAL,
76    STAR,
77    STAREQUAL,
78    STRING,
79    TILDE,
80    VBAR,
81    VBAREQUAL,
82    tok_name,
83)
84
85from xonsh.lazyasd import LazyObject
86from xonsh.platform import PYTHON_VERSION_INFO
87
88cookie_re = LazyObject(
89    lambda: re.compile(r"^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)", re.ASCII),
90    globals(),
91    "cookie_re",
92)
93blank_re = LazyObject(
94    lambda: re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII), globals(), "blank_re"
95)
96
97#
98# token modifications
99#
100tok_name = tok_name.copy()
101__all__ = token.__all__ + [
102    "COMMENT",
103    "tokenize",
104    "detect_encoding",
105    "NL",
106    "untokenize",
107    "ENCODING",
108    "TokenInfo",
109    "TokenError",
110    "SEARCHPATH",
111    "ATDOLLAR",
112    "ATEQUAL",
113    "DOLLARNAME",
114    "IOREDIRECT",
115]
116HAS_ASYNC = (3, 5, 0) <= PYTHON_VERSION_INFO < (3, 7, 0)
117if HAS_ASYNC:
118    ASYNC = token.ASYNC
119    AWAIT = token.AWAIT
120    ADDSPACE_TOKS = (NAME, NUMBER, ASYNC, AWAIT)
121else:
122    ADDSPACE_TOKS = (NAME, NUMBER)
123del token  # must clean up token
124PY35 = (3, 5, 0) <= PYTHON_VERSION_INFO
125AUGASSIGN_OPS = r"[+\-*/%&@|^=<>]=?"
126if not PY35:
127    AUGASSIGN_OPS = AUGASSIGN_OPS.replace("@", "")
128
129
130COMMENT = N_TOKENS
131tok_name[COMMENT] = "COMMENT"
132NL = N_TOKENS + 1
133tok_name[NL] = "NL"
134ENCODING = N_TOKENS + 2
135tok_name[ENCODING] = "ENCODING"
136N_TOKENS += 3
SEARCHPATH = N_TOKENS
tok_name[SEARCHPATH] = "SEARCHPATH"
N_TOKENS += 1
IOREDIRECT = N_TOKENS
tok_name[IOREDIRECT] = "IOREDIRECT"
N_TOKENS += 1
DOLLARNAME = N_TOKENS
tok_name[DOLLARNAME] = "DOLLARNAME"
N_TOKENS += 1
ATDOLLAR = N_TOKENS
tok_name[ATDOLLAR] = "ATDOLLAR"
N_TOKENS += 1
ATEQUAL = N_TOKENS
tok_name[ATEQUAL] = "ATEQUAL"
N_TOKENS += 1
152_xonsh_tokens = {
153    "?": "QUESTION",
154    "@=": "ATEQUAL",
155    "@$": "ATDOLLAR",
156    "||": "DOUBLEPIPE",
157    "&&": "DOUBLEAMPER",
158    "@(": "ATLPAREN",
159    "!(": "BANGLPAREN",
160    "![": "BANGLBRACKET",
161    "$(": "DOLLARLPAREN",
162    "$[": "DOLLARLBRACKET",
163    "${": "DOLLARLBRACE",
164    "??": "DOUBLEQUESTION",
165    "@$(": "ATDOLLARLPAREN",
166}
167
168additional_parenlevs = frozenset({"@(", "!(", "![", "$(", "$[", "${", "@$("})
169
170_glbs = globals()
171for v in _xonsh_tokens.values():
172    _glbs[v] = N_TOKENS
173    tok_name[N_TOKENS] = v
174    N_TOKENS += 1
175    __all__.append(v)
176del _glbs, v
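# The loop above creates a module-level integer constant for each xonsh token
# listed in _xonsh_tokens (e.g. QUESTION, DOUBLEPIPE, ATLPAREN) and exports it
# via __all__.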
177
178EXACT_TOKEN_TYPES = {
179    "(": LPAR,
180    ")": RPAR,
181    "[": LSQB,
182    "]": RSQB,
183    ":": COLON,
184    ",": COMMA,
185    ";": SEMI,
186    "+": PLUS,
187    "-": MINUS,
188    "*": STAR,
189    "/": SLASH,
190    "|": VBAR,
191    "&": AMPER,
192    "<": LESS,
193    ">": GREATER,
194    "=": EQUAL,
195    ".": DOT,
196    "%": PERCENT,
197    "{": LBRACE,
198    "}": RBRACE,
199    "==": EQEQUAL,
200    "!=": NOTEQUAL,
201    "<=": LESSEQUAL,
202    ">=": GREATEREQUAL,
203    "~": TILDE,
204    "^": CIRCUMFLEX,
205    "<<": LEFTSHIFT,
206    ">>": RIGHTSHIFT,
207    "**": DOUBLESTAR,
208    "+=": PLUSEQUAL,
209    "-=": MINEQUAL,
210    "*=": STAREQUAL,
211    "/=": SLASHEQUAL,
212    "%=": PERCENTEQUAL,
213    "&=": AMPEREQUAL,
214    "|=": VBAREQUAL,
215    "^=": CIRCUMFLEXEQUAL,
216    "<<=": LEFTSHIFTEQUAL,
217    ">>=": RIGHTSHIFTEQUAL,
218    "**=": DOUBLESTAREQUAL,
219    "//": DOUBLESLASH,
220    "//=": DOUBLESLASHEQUAL,
221    "@": AT,
222}
223
224EXACT_TOKEN_TYPES.update(_xonsh_tokens)
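# Note: the xonsh-specific entries added here map operator strings to token
# *names* (strings) rather than to integer token types, so
# TokenInfo.exact_type returns a string for those operators.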
225
226
227class TokenInfo(collections.namedtuple("TokenInfo", "type string start end line")):
228    def __repr__(self):
229        annotated_type = "%d (%s)" % (self.type, tok_name[self.type])
230        return (
231            "TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)"
232            % self._replace(type=annotated_type)
233        )
234
235    @property
236    def exact_type(self):
237        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
238            return EXACT_TOKEN_TYPES[self.string]
239        else:
240            return self.type
241
242
243def group(*choices):
244    return "(" + "|".join(choices) + ")"
245
246
247def tokany(*choices):
248    return group(*choices) + "*"
249
250
251def maybe(*choices):
252    return group(*choices) + "?"
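

# For example (illustrative): group("a", "b") produces "(a|b)",
# maybe("a") produces "(a)?", and tokany("a", "b") produces "(a|b)*".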
253
254
255# Note: we use unicode matching for names ("\w") but ascii matching for
256# number literals.
257Whitespace = r"[ \f\t]*"
258Comment = r"#[^\r\n]*"
259Ignore = Whitespace + tokany(r"\\\r?\n" + Whitespace) + maybe(Comment)
260Name_RE = r"\$?\w+"
261
262Hexnumber = r"0[xX](?:_?[0-9a-fA-F])+"
263Binnumber = r"0[bB](?:_?[01])+"
264Octnumber = r"0[oO](?:_?[0-7])+"
265Decnumber = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)"
266Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
267Exponent = r"[eE][-+]?[0-9](?:_?[0-9])*"
268Pointfloat = group(
269    r"[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?", r"\.[0-9](?:_?[0-9])*"
270) + maybe(Exponent)
271Expfloat = r"[0-9](?:_?[0-9])*" + Exponent
272Floatnumber = group(Pointfloat, Expfloat)
273Imagnumber = group(r"[0-9](?:_?[0-9])*[jJ]", Floatnumber + r"[jJ]")
274Number = group(Imagnumber, Floatnumber, Intnumber)
275
276StringPrefix = r"(?:[bBp][rR]?|[rR][bBpfF]?|[uU]|[fF][rR]?)?"
277
278# Tail end of ' string.
279Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
280# Tail end of " string.
281Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
282# Tail end of ''' string.
283Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
284# Tail end of """ string.
285Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
286Triple = group(StringPrefix + "'''", StringPrefix + '"""')
287# Single-line ' or " string.
288String = group(
289    StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
290    StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
291)
292
293# Xonsh-specific Syntax
294SearchPath = r"((?:[rgp]+|@\w*)?)`([^\n`\\]*(?:\\.[^\n`\\]*)*)`"
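# Illustrative SearchPath matches: `.*\.py`, g`*.txt`, and @myfunc`arg`
# (the name "myfunc" here is hypothetical; any "@" + identifier prefix works).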
295
296# Because of leftmost-then-longest match semantics, be sure to put the
297# longest operators first (e.g., if = came before ==, == would get
298# recognized as two instances of =).
299_redir_names = ("out", "all", "err", "e", "2", "a", "&", "1", "o")
300_redir_map = (
301    # stderr to stdout
302    "err>out",
303    "err>&1",
304    "2>out",
305    "err>o",
306    "err>1",
307    "e>out",
308    "e>&1",
309    "2>&1",
310    "e>o",
311    "2>o",
312    "e>1",
313    "2>1",
314    # stdout to stderr
315    "out>err",
316    "out>&2",
317    "1>err",
318    "out>e",
319    "out>2",
320    "o>err",
321    "o>&2",
322    "1>&2",
323    "o>e",
324    "1>e",
325    "o>2",
326    "1>2",
327)
328IORedirect = group(group(*_redir_map), "{}>>?".format(group(*_redir_names)))
329_redir_check = set(_redir_map)
330_redir_check = {"{}>".format(i) for i in _redir_names}.union(_redir_check)
331_redir_check = {"{}>>".format(i) for i in _redir_names}.union(_redir_check)
332_redir_check = frozenset(_redir_check)
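# _redir_check therefore contains the explicit redirection forms above
# (e.g. "2>&1") plus every "name>" and "name>>" form such as "out>", "err>>",
# and "2>>".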
333Operator = group(
334    r"\*\*=?",
335    r">>=?",
336    r"<<=?",
337    r"!=",
338    r"//=?",
339    r"->",
340    r"@\$\(?",
341    r"\|\|",
342    "&&",
343    r"@\(",
344    r"!\(",
345    r"!\[",
346    r"\$\(",
347    r"\$\[",
348    "\${",
349    r"\?\?",
350    r"\?",
351    AUGASSIGN_OPS,
352    r"~",
353)
354
355Bracket = "[][(){}]"
356Special = group(r"\r?\n", r"\.\.\.", r"[:;.,@]")
357Funny = group(Operator, Bracket, Special)
358
359PlainToken = group(IORedirect, Number, Funny, String, Name_RE, SearchPath)
360
361# First (or only) line of ' or " string.
362ContStr = group(
363    StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
364    StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
365)
366PseudoExtras = group(r"\\\r?\n|\Z", Comment, Triple, SearchPath)
367PseudoToken = Whitespace + group(
368    PseudoExtras, IORedirect, Number, Funny, ContStr, Name_RE
369)
370
371
372def _compile(expr):
373    return re.compile(expr, re.UNICODE)
374
375
376endpats = {
377    "'": Single,
378    '"': Double,
379    "'''": Single3,
380    '"""': Double3,
381    "r'''": Single3,
382    'r"""': Double3,
383    "b'''": Single3,
384    'b"""': Double3,
385    "f'''": Single3,
386    'f"""': Double3,
387    "R'''": Single3,
388    'R"""': Double3,
389    "B'''": Single3,
390    'B"""': Double3,
391    "F'''": Single3,
392    'F"""': Double3,
393    "br'''": Single3,
394    'br"""': Double3,
395    "fr'''": Single3,
396    'fr"""': Double3,
397    "bR'''": Single3,
398    'bR"""': Double3,
399    "Br'''": Single3,
400    'Br"""': Double3,
401    "BR'''": Single3,
402    'BR"""': Double3,
403    "rb'''": Single3,
404    'rb"""': Double3,
405    "rf'''": Single3,
406    'rf"""': Double3,
407    "Rb'''": Single3,
408    'Rb"""': Double3,
409    "Fr'''": Single3,
410    'Fr"""': Double3,
411    "rB'''": Single3,
412    'rB"""': Double3,
413    "rF'''": Single3,
414    'rF"""': Double3,
415    "RB'''": Single3,
416    'RB"""': Double3,
417    "RF'''": Single3,
418    'RF"""': Double3,
419    "u'''": Single3,
420    'u"""': Double3,
421    "U'''": Single3,
422    'U"""': Double3,
423    "p'''": Single3,
424    'p"""': Double3,
425    "pr'''": Single3,
426    'pr"""': Double3,
427    "pR'''": Single3,
428    'pR"""': Double3,
429    "rp'''": Single3,
430    'rp"""': Double3,
431    "Rp'''": Single3,
432    'Rp"""': Double3,
433    "r": None,
434    "R": None,
435    "b": None,
436    "B": None,
437    "u": None,
438    "U": None,
439    "p": None,
440    "f": None,
441    "F": None,
442}
443
444triple_quoted = {}
445for t in (
446    "'''",
447    '"""',
448    "r'''",
449    'r"""',
450    "R'''",
451    'R"""',
452    "b'''",
453    'b"""',
454    "B'''",
455    'B"""',
456    "f'''",
457    'f"""',
458    "F'''",
459    'F"""',
460    "br'''",
461    'br"""',
462    "Br'''",
463    'Br"""',
464    "bR'''",
465    'bR"""',
466    "BR'''",
467    'BR"""',
468    "rb'''",
469    'rb"""',
470    "rB'''",
471    'rB"""',
472    "Rb'''",
473    'Rb"""',
474    "RB'''",
475    'RB"""',
476    "fr'''",
477    'fr"""',
478    "Fr'''",
479    'Fr"""',
480    "fR'''",
481    'fR"""',
482    "FR'''",
483    'FR"""',
484    "rf'''",
485    'rf"""',
486    "rF'''",
487    'rF"""',
488    "Rf'''",
489    'Rf"""',
490    "RF'''",
491    'RF"""',
492    "u'''",
493    'u"""',
494    "U'''",
495    'U"""',
496    "p'''",
497    'p""""',
498    "pr'''",
499    'pr""""',
500    "pR'''",
501    'pR""""',
502    "rp'''",
503    'rp""""',
504    "Rp'''",
505    'Rp""""',
506):
507    triple_quoted[t] = t
508single_quoted = {}
509for t in (
510    "'",
511    '"',
512    "r'",
513    'r"',
514    "R'",
515    'R"',
516    "b'",
517    'b"',
518    "B'",
519    'B"',
520    "f'",
521    'f"',
522    "F'",
523    'F"',
524    "br'",
525    'br"',
526    "Br'",
527    'Br"',
528    "bR'",
529    'bR"',
530    "BR'",
531    'BR"',
532    "rb'",
533    'rb"',
534    "rB'",
535    'rB"',
536    "Rb'",
537    'Rb"',
538    "RB'",
539    'RB"',
540    "fr'",
541    'fr"',
542    "Fr'",
543    'Fr"',
544    "fR'",
545    'fR"',
546    "FR'",
547    'FR"',
548    "rf'",
549    'rf"',
550    "rF'",
551    'rF"',
552    "Rf'",
553    'Rf"',
554    "RF'",
555    'RF"',
556    "u'",
557    'u"',
558    "U'",
559    'U"',
560    "p'",
561    'p"',
562    "pr'",
563    'pr"',
564    "pR'",
565    'pR"',
566    "rp'",
567    'rp"',
568    "Rp'",
569    'Rp"',
570):
571    single_quoted[t] = t
572
573tabsize = 8
574
575
576class TokenError(Exception):
577    pass
578
579
580class StopTokenizing(Exception):
581    pass
582
583
584class Untokenizer:
585    def __init__(self):
586        self.tokens = []
587        self.prev_row = 1
588        self.prev_col = 0
589        self.encoding = None
590
591    def add_whitespace(self, start):
592        row, col = start
593        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
594            raise ValueError(
595                "start ({},{}) precedes previous end ({},{})".format(
596                    row, col, self.prev_row, self.prev_col
597                )
598            )
599        row_offset = row - self.prev_row
600        if row_offset:
601            self.tokens.append("\\\n" * row_offset)
602            self.prev_col = 0
603        col_offset = col - self.prev_col
604        if col_offset:
605            self.tokens.append(" " * col_offset)
606
607    def untokenize(self, iterable):
608        it = iter(iterable)
609        indents = []
610        startline = False
611        for t in it:
612            if len(t) == 2:
613                self.compat(t, it)
614                break
615            tok_type, token, start, end, line = t
616            if tok_type == ENCODING:
617                self.encoding = token
618                continue
619            if tok_type == ENDMARKER:
620                break
621            if tok_type == INDENT:
622                indents.append(token)
623                continue
624            elif tok_type == DEDENT:
625                indents.pop()
626                self.prev_row, self.prev_col = end
627                continue
628            elif tok_type in (NEWLINE, NL):
629                startline = True
630            elif startline and indents:
631                indent = indents[-1]
632                if start[1] >= len(indent):
633                    self.tokens.append(indent)
634                    self.prev_col = len(indent)
635                startline = False
636            self.add_whitespace(start)
637            self.tokens.append(token)
638            self.prev_row, self.prev_col = end
639            if tok_type in (NEWLINE, NL):
640                self.prev_row += 1
641                self.prev_col = 0
642        return "".join(self.tokens)
643
644    def compat(self, token, iterable):
645        indents = []
646        toks_append = self.tokens.append
647        startline = token[0] in (NEWLINE, NL)
648        prevstring = False
649
650        for tok in itertools.chain([token], iterable):
651            toknum, tokval = tok[:2]
652            if toknum == ENCODING:
653                self.encoding = tokval
654                continue
655
656            if toknum in ADDSPACE_TOKS:
657                tokval += " "
658
659            # Insert a space between two consecutive strings
660            if toknum == STRING:
661                if prevstring:
662                    tokval = " " + tokval
663                prevstring = True
664            else:
665                prevstring = False
666
667            if toknum == INDENT:
668                indents.append(tokval)
669                continue
670            elif toknum == DEDENT:
671                indents.pop()
672                continue
673            elif toknum in (NEWLINE, NL):
674                startline = True
675            elif startline and indents:
676                toks_append(indents[-1])
677                startline = False
678            toks_append(tokval)
679
680
681def untokenize(iterable):
682    """Transform tokens back into Python source code.
683    It returns a bytes object, encoded using the ENCODING
684    token, which is the first token sequence output by tokenize.
685
    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only those two elements are given, the resulting output loses the
    original spacing.
689
690    Round-trip invariant for full input:
691        Untokenized source will match input source exactly
692
    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
695        t1 = [tok[:2] for tok in tokenize(f.readline)]
696        newcode = untokenize(t1)
697        readline = BytesIO(newcode).readline
698        t2 = [tok[:2] for tok in tokenize(readline)]
699        assert t1 == t2
700    """
701    ut = Untokenizer()
702    out = ut.untokenize(iterable)
703    if ut.encoding is not None:
704        out = out.encode(ut.encoding)
705    return out
706
707
708def _get_normal_name(orig_enc):
709    """Imitates get_normal_name in tokenizer.c."""
710    # Only care about the first 12 characters.
711    enc = orig_enc[:12].lower().replace("_", "-")
712    if enc == "utf-8" or enc.startswith("utf-8-"):
713        return "utf-8"
714    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
715        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
716    ):
717        return "iso-8859-1"
718    return orig_enc
719
720
721def detect_encoding(readline):
722    """
723    The detect_encoding() function is used to detect the encoding that should
724    be used to decode a Python source file.  It requires one argument, readline,
725    in the same way as the tokenize() generator.
726
727    It will call readline a maximum of twice, and return the encoding used
728    (as a string) and a list of any lines (left as bytes) it has read in.
729
730    It detects the encoding from the presence of a utf-8 bom or an encoding
731    cookie as specified in pep-0263.  If both a bom and a cookie are present,
732    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
733    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
734    'utf-8-sig' is returned.
735
736    If no encoding is specified, then the default of 'utf-8' will be returned.
737    """
738    try:
739        filename = readline.__self__.name
740    except AttributeError:
741        filename = None
742    bom_found = False
743    encoding = None
744    default = "utf-8"
745
746    def read_or_stop():
747        try:
748            return readline()
749        except StopIteration:
750            return b""
751
752    def find_cookie(line):
753        try:
754            # Decode as UTF-8. Either the line is an encoding declaration,
755            # in which case it should be pure ASCII, or it must be UTF-8
756            # per default encoding.
757            line_string = line.decode("utf-8")
758        except UnicodeDecodeError:
759            msg = "invalid or missing encoding declaration"
760            if filename is not None:
761                msg = "{} for {!r}".format(msg, filename)
762            raise SyntaxError(msg)
763
764        match = cookie_re.match(line_string)
765        if not match:
766            return None
767        encoding = _get_normal_name(match.group(1))
768        try:
769            codecs.lookup(encoding)
770        except LookupError:
771            # This behaviour mimics the Python interpreter
772            if filename is None:
773                msg = "unknown encoding: " + encoding
774            else:
775                msg = "unknown encoding for {!r}: {}".format(filename, encoding)
776            raise SyntaxError(msg)
777
778        if bom_found:
779            if encoding != "utf-8":
780                # This behaviour mimics the Python interpreter
781                if filename is None:
782                    msg = "encoding problem: utf-8"
783                else:
784                    msg = "encoding problem for {!r}: utf-8".format(filename)
785                raise SyntaxError(msg)
786            encoding += "-sig"
787        return encoding
788
789    first = read_or_stop()
790    if first.startswith(codecs.BOM_UTF8):
791        bom_found = True
792        first = first[3:]
793        default = "utf-8-sig"
794    if not first:
795        return default, []
796
797    encoding = find_cookie(first)
798    if encoding:
799        return encoding, [first]
800    if not blank_re.match(first):
801        return default, [first]
802
803    second = read_or_stop()
804    if not second:
805        return default, [first]
806
807    encoding = find_cookie(second)
808    if encoding:
809        return encoding, [first, second]
810
811    return default, [first, second]
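
# Example (illustrative sketch) of detect_encoding() on an in-memory buffer:
#
#     import io
#     buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     enc, lines = detect_encoding(buf.readline)
#     # enc == "iso-8859-1"; lines holds the single line already consumed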
812
813
814def tokopen(filename):
815    """Open a file in read only mode using the encoding detected by
816    detect_encoding().
817    """
818    buffer = builtins.open(filename, "rb")
819    try:
820        encoding, lines = detect_encoding(buffer.readline)
821        buffer.seek(0)
822        text = io.TextIOWrapper(buffer, encoding, line_buffering=True)
823        text.mode = "r"
824        return text
825    except Exception:
826        buffer.close()
827        raise
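
# Example (illustrative; "script.xsh" is a hypothetical path):
#
#     with tokopen("script.xsh") as f:
#         src = f.read()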
828
829
830def _tokenize(readline, encoding):
831    lnum = parenlev = continued = 0
832    numchars = "0123456789"
833    contstr, needcont = "", 0
834    contline = None
835    indents = [0]
836
837    # 'stashed' and 'async_*' are used for async/await parsing
838    stashed = None
839    async_def = False
840    async_def_indent = 0
841    async_def_nl = False
842
843    if encoding is not None:
844        if encoding == "utf-8-sig":
845            # BOM will already have been stripped.
846            encoding = "utf-8"
847        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), "")
848    while True:  # loop over lines in stream
849        try:
850            line = readline()
851        except StopIteration:
852            line = b""
853
854        if encoding is not None:
855            line = line.decode(encoding)
856        lnum += 1
857        pos, max = 0, len(line)
858
859        if contstr:  # continued string
860            if not line:
861                raise TokenError("EOF in multi-line string", strstart)
862            endmatch = endprog.match(line)
863            if endmatch:
864                pos = end = endmatch.end(0)
865                yield TokenInfo(
866                    STRING, contstr + line[:end], strstart, (lnum, end), contline + line
867                )
868                contstr, needcont = "", 0
869                contline = None
870            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
871                yield TokenInfo(
872                    ERRORTOKEN, contstr + line, strstart, (lnum, len(line)), contline
873                )
874                contstr = ""
875                contline = None
876                continue
877            else:
878                contstr = contstr + line
879                contline = contline + line
880                continue
881
882        elif parenlev == 0 and not continued:  # new statement
883            if not line:
884                break
885            column = 0
886            while pos < max:  # measure leading whitespace
887                if line[pos] == " ":
888                    column += 1
889                elif line[pos] == "\t":
890                    column = (column // tabsize + 1) * tabsize
891                elif line[pos] == "\f":
892                    column = 0
893                else:
894                    break
895                pos += 1
896            if pos == max:
897                break
898
899            if line[pos] in "#\r\n":  # skip comments or blank lines
900                if line[pos] == "#":
901                    comment_token = line[pos:].rstrip("\r\n")
902                    nl_pos = pos + len(comment_token)
903                    yield TokenInfo(
904                        COMMENT,
905                        comment_token,
906                        (lnum, pos),
907                        (lnum, pos + len(comment_token)),
908                        line,
909                    )
910                    yield TokenInfo(
911                        NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line
912                    )
                else:
                    # line[pos] is "\r" or "\n" here; "#" was handled above.
                    yield TokenInfo(
                        NL, line[pos:], (lnum, pos), (lnum, len(line)), line
                    )
921                continue
922
923            if column > indents[-1]:  # count indents or dedents
924                indents.append(column)
925                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
926            while column < indents[-1]:
927                if column not in indents:
928                    raise IndentationError(
929                        "unindent does not match any outer indentation level",
930                        ("<tokenize>", lnum, pos, line),
931                    )
932                indents = indents[:-1]
933
934                if async_def and async_def_indent >= indents[-1]:
935                    async_def = False
936                    async_def_nl = False
937                    async_def_indent = 0
938
939                yield TokenInfo(DEDENT, "", (lnum, pos), (lnum, pos), line)
940
941            if async_def and async_def_nl and async_def_indent >= indents[-1]:
942                async_def = False
943                async_def_nl = False
944                async_def_indent = 0
945
946        else:  # continued statement
947            if not line:
948                raise TokenError("EOF in multi-line statement", (lnum, 0))
949            continued = 0
950
951        while pos < max:
952            pseudomatch = _compile(PseudoToken).match(line, pos)
953            if pseudomatch:  # scan for tokens
954                start, end = pseudomatch.span(1)
955                spos, epos, pos = (lnum, start), (lnum, end), end
956                if start == end:
957                    continue
958                token, initial = line[start:end], line[start]
959
960                if token in _redir_check:
961                    yield TokenInfo(IOREDIRECT, token, spos, epos, line)
962                elif initial in numchars or (  # ordinary number
963                    initial == "." and token != "." and token != "..."
964                ):
965                    yield TokenInfo(NUMBER, token, spos, epos, line)
966                elif initial in "\r\n":
967                    if stashed:
968                        yield stashed
969                        stashed = None
970                    if parenlev > 0:
971                        yield TokenInfo(NL, token, spos, epos, line)
972                    else:
973                        yield TokenInfo(NEWLINE, token, spos, epos, line)
974                        if async_def:
975                            async_def_nl = True
976
977                elif initial == "#":
978                    assert not token.endswith("\n")
979                    if stashed:
980                        yield stashed
981                        stashed = None
982                    yield TokenInfo(COMMENT, token, spos, epos, line)
983                # Xonsh-specific Regex Globbing
984                elif re.match(SearchPath, token):
985                    yield TokenInfo(SEARCHPATH, token, spos, epos, line)
986                elif token in triple_quoted:
987                    endprog = _compile(endpats[token])
988                    endmatch = endprog.match(line, pos)
989                    if endmatch:  # all on one line
990                        pos = endmatch.end(0)
991                        token = line[start:pos]
992                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
993                    else:
994                        strstart = (lnum, start)  # multiple lines
995                        contstr = line[start:]
996                        contline = line
997                        break
998                elif (
999                    initial in single_quoted
1000                    or token[:2] in single_quoted
1001                    or token[:3] in single_quoted
1002                ):
1003                    if token[-1] == "\n":  # continued string
1004                        strstart = (lnum, start)
1005                        endprog = _compile(
1006                            endpats[initial] or endpats[token[1]] or endpats[token[2]]
1007                        )
1008                        contstr, needcont = line[start:], 1
1009                        contline = line
1010                        break
1011                    else:  # ordinary string
1012                        yield TokenInfo(STRING, token, spos, epos, line)
1013                elif token.startswith("$") and token[1:].isidentifier():
1014                    yield TokenInfo(DOLLARNAME, token, spos, epos, line)
1015                elif initial.isidentifier():  # ordinary name
1016                    if token in ("async", "await"):
1017                        if async_def:
1018                            yield TokenInfo(
1019                                ASYNC if token == "async" else AWAIT,
1020                                token,
1021                                spos,
1022                                epos,
1023                                line,
1024                            )
1025                            continue
1026
1027                    tok = TokenInfo(NAME, token, spos, epos, line)
1028                    if token == "async" and not stashed:
1029                        stashed = tok
1030                        continue
1031
1032                    if (
1033                        HAS_ASYNC
1034                        and token == "def"
1035                        and (
1036                            stashed
1037                            and stashed.type == NAME
1038                            and stashed.string == "async"
1039                        )
1040                    ):
1041                        async_def = True
1042                        async_def_indent = indents[-1]
1043
1044                        yield TokenInfo(
1045                            ASYNC,
1046                            stashed.string,
1047                            stashed.start,
1048                            stashed.end,
1049                            stashed.line,
1050                        )
1051                        stashed = None
1052
1053                    if stashed:
1054                        yield stashed
1055                        stashed = None
1056
1057                    yield tok
1058                elif token == "\\\n" or token == "\\\r\n":  # continued stmt
1059                    continued = 1
1060                    yield TokenInfo(ERRORTOKEN, token, spos, epos, line)
1061                elif initial == "\\":  # continued stmt
1062                    # for cases like C:\\path\\to\\file
1063                    continued = 1
1064                else:
1065                    if initial in "([{":
1066                        parenlev += 1
1067                    elif initial in ")]}":
1068                        parenlev -= 1
1069                    elif token in additional_parenlevs:
1070                        parenlev += 1
1071                    if stashed:
1072                        yield stashed
1073                        stashed = None
1074                    yield TokenInfo(OP, token, spos, epos, line)
1075            else:
1076                yield TokenInfo(
1077                    ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line
1078                )
1079                pos += 1
1080
1081    if stashed:
1082        yield stashed
1083        stashed = None
1084
1085    for indent in indents[1:]:  # pop remaining indent levels
1086        yield TokenInfo(DEDENT, "", (lnum, 0), (lnum, 0), "")
1087    yield TokenInfo(ENDMARKER, "", (lnum, 0), (lnum, 0), "")
1088
1089
1090def tokenize(readline):
1091    """
1092    The tokenize() generator requires one argument, readline, which
1093    must be a callable object which provides the same interface as the
1094    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
1097        readline = open(myfile, 'rb').__next__  # Example of alternate readline
1098
1099    The generator produces 5-tuples with these members: the token type; the
1100    token string; a 2-tuple (srow, scol) of ints specifying the row and
1101    column where the token begins in the source; a 2-tuple (erow, ecol) of
1102    ints specifying the row and column where the token ends in the source;
1103    and the line on which the token was found.  The line passed is the
1104    logical line; continuation lines are included.
1105
1106    The first token sequence will always be an ENCODING token
1107    which tells you which encoding was used to decode the bytes stream.
1108    """
1109    encoding, consumed = detect_encoding(readline)
1110    rl_gen = iter(readline, b"")
1111    empty = itertools.repeat(b"")
1112    return _tokenize(itertools.chain(consumed, rl_gen, empty).__next__, encoding)
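
# Example (illustrative sketch): tokenizing an in-memory xonsh snippet.
#
#     import io
#     src = b"x = $(ls) + 'done'\n"
#     for tok in tokenize(io.BytesIO(src).readline):
#         print(tok)
#
# The first token is always ENCODING, and xonsh operators such as "$(" are
# emitted as OP tokens.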
1113
1114
# An undocumented, backwards compatible API for all the places in the standard
# library that expect to be able to use tokenize with strings
1117def generate_tokens(readline):
1118    return _tokenize(readline, None)
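
# Example (illustrative sketch): feeding already-decoded source as str lines.
#
#     import io
#     toks = list(generate_tokens(io.StringIO("x = 42\n").readline))
#     # no ENCODING token is produced here; toks[-1].type == ENDMARKER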
1119
1120
1121def tokenize_main():
1122    import argparse
1123
1124    # Helper error handling routines
1125    def perror(message):
1126        print(message, file=sys.stderr)
1127
1128    def error(message, filename=None, location=None):
1129        if location:
1130            args = (filename,) + location + (message,)
1131            perror("%s:%d:%d: error: %s" % args)
1132        elif filename:
1133            perror("%s: error: %s" % (filename, message))
1134        else:
1135            perror("error: %s" % message)
1136        sys.exit(1)
1137
1138    # Parse the arguments and options
1139    parser = argparse.ArgumentParser(prog="python -m tokenize")
1140    parser.add_argument(
1141        dest="filename",
1142        nargs="?",
1143        metavar="filename.py",
1144        help="the file to tokenize; defaults to stdin",
1145    )
1146    parser.add_argument(
1147        "-e",
1148        "--exact",
1149        dest="exact",
1150        action="store_true",
1151        help="display token names using the exact type",
1152    )
1153    args = parser.parse_args()
1154
1155    try:
1156        # Tokenize the input
1157        if args.filename:
1158            filename = args.filename
1159            with builtins.open(filename, "rb") as f:
1160                tokens = list(tokenize(f.readline))
1161        else:
1162            filename = "<stdin>"
1163            tokens = _tokenize(sys.stdin.readline, None)
1164
1165        # Output the tokenization
1166        for token in tokens:
1167            token_type = token.type
1168            if args.exact:
1169                token_type = token.exact_type
1170            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
1171            print("%-20s%-15s%-15r" % (token_range, tok_name[token_type], token.string))
1172    except IndentationError as err:
1173        line, column = err.args[1][1:3]
1174        error(err.args[0], filename, (line, column))
1175    except TokenError as err:
1176        line, column = err.args[1]
1177        error(err.args[0], filename, (line, column))
1178    except SyntaxError as err:
1179        error(err, filename)
1180    except OSError as err:
1181        error(err)
1182    except KeyboardInterrupt:
1183        print("interrupted\n")
1184    except Exception as err:
1185        perror("unexpected error: %s" % err)
1186        raise
1187