"""Tokenization help for xonsh programs.

This file is a modified version of tokenize.py from the Python 3.4 and 3.5
standard libraries (licensed under the Python Software Foundation License,
version 2), which provides tokenization help for Python programs.

It is modified to properly tokenize xonsh code, including backtick regex
path search and several xonsh-specific operators.

A few pieces of this file are specific to the version of Python being used.
To find these pieces, search for PY35.

Original file credits:
    __author__ = 'Ka-Ping Yee <ping@lfw.org>'
    __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
                   'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
                   'Michael Foord')
"""

import re
import io
import sys
import codecs
import builtins
import itertools
import collections
import token
from token import (
    AMPER,
    AMPEREQUAL,
    AT,
    CIRCUMFLEX,
    CIRCUMFLEXEQUAL,
    COLON,
    COMMA,
    DEDENT,
    DOT,
    DOUBLESLASH,
    DOUBLESLASHEQUAL,
    DOUBLESTAR,
    DOUBLESTAREQUAL,
    ENDMARKER,
    EQEQUAL,
    EQUAL,
    ERRORTOKEN,
    GREATER,
    GREATEREQUAL,
    INDENT,
    LBRACE,
    LEFTSHIFT,
    LEFTSHIFTEQUAL,
    LESS,
    LESSEQUAL,
    LPAR,
    LSQB,
    MINEQUAL,
    MINUS,
    NAME,
    NEWLINE,
    NOTEQUAL,
    NUMBER,
    N_TOKENS,
    OP,
    PERCENT,
    PERCENTEQUAL,
    PLUS,
    PLUSEQUAL,
    RBRACE,
    RIGHTSHIFT,
    RIGHTSHIFTEQUAL,
    RPAR,
    RSQB,
    SEMI,
    SLASH,
    SLASHEQUAL,
    STAR,
    STAREQUAL,
    STRING,
    TILDE,
    VBAR,
    VBAREQUAL,
    tok_name,
)

from xonsh.lazyasd import LazyObject
from xonsh.platform import PYTHON_VERSION_INFO

cookie_re = LazyObject(
    lambda: re.compile(r"^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)", re.ASCII),
    globals(),
    "cookie_re",
)
blank_re = LazyObject(
    lambda: re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII), globals(), "blank_re"
)

#
# token modifications
#
tok_name = tok_name.copy()
__all__ = token.__all__ + [
    "COMMENT",
    "tokenize",
    "detect_encoding",
    "NL",
    "untokenize",
    "ENCODING",
    "TokenInfo",
    "TokenError",
    "SEARCHPATH",
    "ATDOLLAR",
    "ATEQUAL",
    "DOLLARNAME",
    "IOREDIRECT",
]
HAS_ASYNC = (3, 5, 0) <= PYTHON_VERSION_INFO < (3, 7, 0)
if HAS_ASYNC:
    ASYNC = token.ASYNC
    AWAIT = token.AWAIT
    ADDSPACE_TOKS = (NAME, NUMBER, ASYNC, AWAIT)
else:
    ADDSPACE_TOKS = (NAME, NUMBER)
del token  # must clean up token
PY35 = (3, 5, 0) <= PYTHON_VERSION_INFO
AUGASSIGN_OPS = r"[+\-*/%&@|^=<>]=?"
if not PY35:
    AUGASSIGN_OPS = AUGASSIGN_OPS.replace("@", "")
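# The block below registers tokenizer-level token types (COMMENT, NL,
# ENCODING) and xonsh-specific token types on top of the stdlib numbering.
# Net effect, sketched for one entry (nothing here is executed): SEARCHPATH
# becomes a fresh integer above the stdlib's N_TOKENS and
# tok_name[SEARCHPATH] is "SEARCHPATH", so the new kinds can be handled
# exactly like the builtin ones.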
COMMENT = N_TOKENS
tok_name[COMMENT] = "COMMENT"
NL = N_TOKENS + 1
tok_name[NL] = "NL"
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = "ENCODING"
N_TOKENS += 3
SEARCHPATH = N_TOKENS
tok_name[N_TOKENS] = "SEARCHPATH"
N_TOKENS += 1
IOREDIRECT = N_TOKENS
tok_name[N_TOKENS] = "IOREDIRECT"
N_TOKENS += 1
DOLLARNAME = N_TOKENS
tok_name[N_TOKENS] = "DOLLARNAME"
N_TOKENS += 1
ATDOLLAR = N_TOKENS
tok_name[N_TOKENS] = "ATDOLLAR"
N_TOKENS += 1
ATEQUAL = N_TOKENS
tok_name[N_TOKENS] = "ATEQUAL"
N_TOKENS += 1
_xonsh_tokens = {
    "?": "QUESTION",
    "@=": "ATEQUAL",
    "@$": "ATDOLLAR",
    "||": "DOUBLEPIPE",
    "&&": "DOUBLEAMPER",
    "@(": "ATLPAREN",
    "!(": "BANGLPAREN",
    "![": "BANGLBRACKET",
    "$(": "DOLLARLPAREN",
    "$[": "DOLLARLBRACKET",
    "${": "DOLLARLBRACE",
    "??": "DOUBLEQUESTION",
    "@$(": "ATDOLLARLPAREN",
}

additional_parenlevs = frozenset({"@(", "!(", "![", "$(", "$[", "${", "@$("})

_glbs = globals()
for v in _xonsh_tokens.values():
    _glbs[v] = N_TOKENS
    tok_name[N_TOKENS] = v
    N_TOKENS += 1
    __all__.append(v)
del _glbs, v

EXACT_TOKEN_TYPES = {
    "(": LPAR,
    ")": RPAR,
    "[": LSQB,
    "]": RSQB,
    ":": COLON,
    ",": COMMA,
    ";": SEMI,
    "+": PLUS,
    "-": MINUS,
    "*": STAR,
    "/": SLASH,
    "|": VBAR,
    "&": AMPER,
    "<": LESS,
    ">": GREATER,
    "=": EQUAL,
    ".": DOT,
    "%": PERCENT,
    "{": LBRACE,
    "}": RBRACE,
    "==": EQEQUAL,
    "!=": NOTEQUAL,
    "<=": LESSEQUAL,
    ">=": GREATEREQUAL,
    "~": TILDE,
    "^": CIRCUMFLEX,
    "<<": LEFTSHIFT,
    ">>": RIGHTSHIFT,
    "**": DOUBLESTAR,
    "+=": PLUSEQUAL,
    "-=": MINEQUAL,
    "*=": STAREQUAL,
    "/=": SLASHEQUAL,
    "%=": PERCENTEQUAL,
    "&=": AMPEREQUAL,
    "|=": VBAREQUAL,
    "^=": CIRCUMFLEXEQUAL,
    "<<=": LEFTSHIFTEQUAL,
    ">>=": RIGHTSHIFTEQUAL,
    "**=": DOUBLESTAREQUAL,
    "//": DOUBLESLASH,
    "//=": DOUBLESLASHEQUAL,
    "@": AT,
}

EXACT_TOKEN_TYPES.update(_xonsh_tokens)


class TokenInfo(collections.namedtuple("TokenInfo", "type string start end line")):
    def __repr__(self):
        annotated_type = "%d (%s)" % (self.type, tok_name[self.type])
        return (
            "TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)"
            % self._replace(type=annotated_type)
        )

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type


def group(*choices):
    return "(" + "|".join(choices) + ")"


def tokany(*choices):
    return group(*choices) + "*"


def maybe(*choices):
    return group(*choices) + "?"
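# A quick sketch of how these helpers compose (illustrative only, mirroring
# the definitions above rather than adding behavior):
#
#     group("'", '"')   ->  ('|")
#     maybe("abc")      ->  (abc)?
#     tokany("x", "y")  ->  (x|y)*
#
# The regular expressions below are built almost entirely from these three
# combinators.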
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + tokany(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name_RE = r"\$?\w+"

Hexnumber = r"0[xX](?:_?[0-9a-fA-F])+"
Binnumber = r"0[bB](?:_?[01])+"
Octnumber = r"0[oO](?:_?[0-7])+"
Decnumber = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)"
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?[0-9](?:_?[0-9])*"
Pointfloat = group(
    r"[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?", r"\.[0-9](?:_?[0-9])*"
) + maybe(Exponent)
Expfloat = r"[0-9](?:_?[0-9])*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"[0-9](?:_?[0-9])*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)

StringPrefix = r"(?:[bBp][rR]?|[rR][bBpfF]?|[uU]|[fF][rR]?)?"

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(
    StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Xonsh-specific Syntax
SearchPath = r"((?:[rgp]+|@\w*)?)`([^\n`\\]*(?:\\.[^\n`\\]*)*)`"

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
_redir_names = ("out", "all", "err", "e", "2", "a", "&", "1", "o")
_redir_map = (
    # stderr to stdout
    "err>out",
    "err>&1",
    "2>out",
    "err>o",
    "err>1",
    "e>out",
    "e>&1",
    "2>&1",
    "e>o",
    "2>o",
    "e>1",
    "2>1",
    # stdout to stderr
    "out>err",
    "out>&2",
    "1>err",
    "out>e",
    "out>2",
    "o>err",
    "o>&2",
    "1>&2",
    "o>e",
    "1>e",
    "o>2",
    "1>2",
)
IORedirect = group(group(*_redir_map), "{}>>?".format(group(*_redir_names)))
_redir_check = set(_redir_map)
_redir_check = {"{}>".format(i) for i in _redir_names}.union(_redir_check)
_redir_check = {"{}>>".format(i) for i in _redir_names}.union(_redir_check)
_redir_check = frozenset(_redir_check)
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"!=",
    r"//=?",
    r"->",
    r"@\$\(?",
    r"\|\|",
    "&&",
    r"@\(",
    r"!\(",
    r"!\[",
    r"\$\(",
    r"\$\[",
    r"\${",
    r"\?\?",
    r"\?",
    AUGASSIGN_OPS,
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"\.\.\.", r"[:;.,@]")
Funny = group(Operator, Bracket, Special)

PlainToken = group(IORedirect, Number, Funny, String, Name_RE, SearchPath)
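# Illustrative examples of the xonsh-specific patterns above (a sketch of
# typical inputs, not an exhaustive specification):
#
#     re.match(SearchPath, "r`foo`").groups()     ->  ("r", "foo")
#     re.match(SearchPath, "@f`ba.*r`").groups()  ->  ("@f", "ba.*r")
#     re.match(IORedirect, "err>out")             ->  matches as one token
#
# i.e. group 1 of SearchPath is the optional prefix and group 2 is the body
# between the backticks.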
# First (or only) line of ' or " string.
ContStr = group(
    StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n|\Z", Comment, Triple, SearchPath)
PseudoToken = Whitespace + group(
    PseudoExtras, IORedirect, Number, Funny, ContStr, Name_RE
)


def _compile(expr):
    return re.compile(expr, re.UNICODE)


endpats = {
    "'": Single,
    '"': Double,
    "'''": Single3,
    '"""': Double3,
    "r'''": Single3,
    'r"""': Double3,
    "b'''": Single3,
    'b"""': Double3,
    "f'''": Single3,
    'f"""': Double3,
    "R'''": Single3,
    'R"""': Double3,
    "B'''": Single3,
    'B"""': Double3,
    "F'''": Single3,
    'F"""': Double3,
    "br'''": Single3,
    'br"""': Double3,
    "fr'''": Single3,
    'fr"""': Double3,
    "bR'''": Single3,
    'bR"""': Double3,
    "Br'''": Single3,
    'Br"""': Double3,
    "BR'''": Single3,
    'BR"""': Double3,
    "rb'''": Single3,
    'rb"""': Double3,
    "rf'''": Single3,
    'rf"""': Double3,
    "Rb'''": Single3,
    'Rb"""': Double3,
    "Fr'''": Single3,
    'Fr"""': Double3,
    "rB'''": Single3,
    'rB"""': Double3,
    "rF'''": Single3,
    'rF"""': Double3,
    "RB'''": Single3,
    'RB"""': Double3,
    "RF'''": Single3,
    'RF"""': Double3,
    "u'''": Single3,
    'u"""': Double3,
    "U'''": Single3,
    'U"""': Double3,
    "p'''": Single3,
    'p"""': Double3,
    "pr'''": Single3,
    'pr"""': Double3,
    "pR'''": Single3,
    'pR"""': Double3,
    "rp'''": Single3,
    'rp"""': Double3,
    "Rp'''": Single3,
    'Rp"""': Double3,
    "r": None,
    "R": None,
    "b": None,
    "B": None,
    "u": None,
    "U": None,
    "p": None,
    "f": None,
    "F": None,
}

triple_quoted = {}
for t in (
    "'''",
    '"""',
    "r'''",
    'r"""',
    "R'''",
    'R"""',
    "b'''",
    'b"""',
    "B'''",
    'B"""',
    "f'''",
    'f"""',
    "F'''",
    'F"""',
    "br'''",
    'br"""',
    "Br'''",
    'Br"""',
    "bR'''",
    'bR"""',
    "BR'''",
    'BR"""',
    "rb'''",
    'rb"""',
    "rB'''",
    'rB"""',
    "Rb'''",
    'Rb"""',
    "RB'''",
    'RB"""',
    "fr'''",
    'fr"""',
    "Fr'''",
    'Fr"""',
    "fR'''",
    'fR"""',
    "FR'''",
    'FR"""',
    "rf'''",
    'rf"""',
    "rF'''",
    'rF"""',
    "Rf'''",
    'Rf"""',
    "RF'''",
    'RF"""',
    "u'''",
    'u"""',
    "U'''",
    'U"""',
    "p'''",
    'p"""',
    "pr'''",
    'pr"""',
    "pR'''",
    'pR"""',
    "rp'''",
    'rp"""',
    "Rp'''",
    'Rp"""',
):
    triple_quoted[t] = t
single_quoted = {}
for t in (
    "'",
    '"',
    "r'",
    'r"',
    "R'",
    'R"',
    "b'",
    'b"',
    "B'",
    'B"',
    "f'",
    'f"',
    "F'",
    'F"',
    "br'",
    'br"',
    "Br'",
    'Br"',
    "bR'",
    'bR"',
    "BR'",
    'BR"',
    "rb'",
    'rb"',
    "rB'",
    'rB"',
    "Rb'",
    'Rb"',
    "RB'",
    'RB"',
    "fr'",
    'fr"',
    "Fr'",
    'Fr"',
    "fR'",
    'fR"',
    "FR'",
    'FR"',
    "rf'",
    'rf"',
    "rF'",
    'rF"',
    "Rf'",
    'Rf"',
    "RF'",
    'RF"',
    "u'",
    'u"',
    "U'",
    'U"',
    "p'",
    'p"',
    "pr'",
    'pr"',
    "pR'",
    'pR"',
    "rp'",
    'rp"',
    "Rp'",
    'Rp"',
):
    single_quoted[t] = t
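# How these tables are consulted (a descriptive sketch of the scanner below,
# not new behavior): when a token such as rb''' is seen, membership in
# ``triple_quoted`` selects the multi-line string path and ``endpats[token]``
# supplies the pattern that finds the closing quotes; one- and two-character
# prefixes are handled via ``token[:2]``/``token[:3]`` lookups in
# ``single_quoted``.  The ``p`` prefixes are xonsh path-string variants that
# are tokenized like ordinary strings here.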
tabsize = 8


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


class Untokenizer:
    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError(
                "start ({},{}) precedes previous end ({},{})".format(
                    row, col, self.prev_row, self.prev_col
                )
            )
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in ADDSPACE_TOKS:
                tokval += " "

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = " " + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc
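# A few illustrative values (not a full test of the normalization):
#
#     _get_normal_name("UTF_8")    ->  "utf-8"
#     _get_normal_name("Latin-1")  ->  "iso-8859-1"
#     _get_normal_name("cp1252")   ->  "cp1252"   (returned unchanged)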
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b""

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode("utf-8")
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = "{} for {!r}".format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename, encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != "utf-8":
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = "encoding problem: utf-8"
                else:
                    msg = "encoding problem for {!r}: utf-8".format(filename)
                raise SyntaxError(msg)
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
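# Typical usage (an illustrative sketch; the file name is hypothetical):
#
#     with builtins.open("some_script.xsh", "rb") as f:
#         encoding, first_lines = detect_encoding(f.readline)
#
# ``tokopen`` below wraps exactly this dance and returns a text-mode file
# object decoded with the detected encoding.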
def tokopen(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, "rb")
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = io.TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = "r"
        return text
    except Exception:
        buffer.close()
        raise


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = "0123456789"
    contstr, needcont = "", 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), "")
    while True:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b""

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(
                    STRING, contstr + line[:end], strstart, (lnum, end), contline + line
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield TokenInfo(
                    ERRORTOKEN, contstr + line, strstart, (lnum, len(line)), contline
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in "#\r\n":  # skip comments or blank lines
                if line[pos] == "#":
                    comment_token = line[pos:].rstrip("\r\n")
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(
                        COMMENT,
                        comment_token,
                        (lnum, pos),
                        (lnum, pos + len(comment_token)),
                        line,
                    )
                    yield TokenInfo(
                        NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line
                    )
                else:
                    yield TokenInfo(
                        (NL, COMMENT)[line[pos] == "#"],
                        line[pos:],
                        (lnum, pos),
                        (lnum, len(line)),
                        line,
                    )
                continue

            if column > indents[-1]:  # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield TokenInfo(DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0
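        # Scan the rest of the logical line token by token.  Each iteration
        # matches PseudoToken at ``pos``; xonsh-specific forms (IO redirects,
        # backtick search paths, $NAME) are recognized before falling back to
        # the regular Python token kinds.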
        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if token in _redir_check:
                    yield TokenInfo(IOREDIRECT, token, spos, epos, line)
                elif initial in numchars or (  # ordinary number
                    initial == "." and token != "." and token != "..."
                ):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    if stashed:
                        yield stashed
                        stashed = None
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)
                        if async_def:
                            async_def_nl = True

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                # Xonsh-specific Regex Globbing
                elif re.match(SearchPath, token):
                    yield TokenInfo(SEARCHPATH, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        endprog = _compile(
                            endpats[initial] or endpats[token[1]] or endpats[token[2]]
                        )
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif token.startswith("$") and token[1:].isidentifier():
                    yield TokenInfo(DOLLARNAME, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_def:
                            yield TokenInfo(
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = TokenInfo(NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if (
                        HAS_ASYNC
                        and token == "def"
                        and (
                            stashed
                            and stashed.type == NAME
                            and stashed.string == "async"
                        )
                    ):
                        async_def = True
                        async_def_indent = indents[-1]

                        yield TokenInfo(
                            ASYNC,
                            stashed.string,
                            stashed.start,
                            stashed.end,
                            stashed.line,
                        )
                        stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif token == "\\\n" or token == "\\\r\n":  # continued stmt
                    continued = 1
                    yield TokenInfo(ERRORTOKEN, token, spos, epos, line)
                elif initial == "\\":  # continued stmt
                    # for cases like C:\\path\\to\\file
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    elif token in additional_parenlevs:
                        parenlev += 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(
                    ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line
                )
                pos += 1

        if stashed:
            yield stashed
            stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield TokenInfo(DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield TokenInfo(ENDMARKER, "", (lnum, 0), (lnum, 0), "")


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as bytes. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = itertools.repeat(b"")
    return _tokenize(itertools.chain(consumed, rl_gen, empty).__next__, encoding)


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)


def tokenize_main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog="python -m tokenize")
    parser.add_argument(
        dest="filename",
        nargs="?",
        metavar="filename.py",
        help="the file to tokenize; defaults to stdin",
    )
    parser.add_argument(
        "-e",
        "--exact",
        dest="exact",
        action="store_true",
        help="display token names using the exact type",
    )
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, "rb") as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" % (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise
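# Example usage (an illustrative sketch, not part of the public xonsh API;
# the source snippet is made up):
#
#     from io import BytesIO
#     src = b"echo = $(ls)\n"
#     for tok in tokenize(BytesIO(src).readline):
#         print(tok)
#
# The first token is ENCODING; xonsh constructs such as "$(" and ")" come out
# as OP tokens and adjust the tokenizer's paren level, so the trailing newline
# is reported as NEWLINE rather than NL.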