1"""Tokenization help for Python programs. 2 3tokenize(readline) is a generator that breaks a stream of bytes into 4Python tokens. It decodes the bytes according to PEP-0263 for 5determining source file encoding. 6 7It accepts a readline-like method which is called repeatedly to get the 8next line of input (or b"" for EOF). It generates 5-tuples with these 9members: 10 11 the token type (see token.py) 12 the token (a string) 13 the starting (row, column) indices of the token (a 2-tuple of ints) 14 the ending (row, column) indices of the token (a 2-tuple of ints) 15 the original line (string) 16 17It is designed to match the working of the Python tokenizer exactly, except 18that it produces COMMENT tokens for comments and gives type OP for all 19operators. Additionally, all token lists start with an ENCODING token 20which tells you which encoding was used to decode the bytes stream. 21""" 22 23__author__ = 'Ka-Ping Yee <ping@lfw.org>' 24__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 25 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' 26 'Michael Foord') 27from builtins import open as _builtin_open 28from codecs import lookup, BOM_UTF8 29import collections 30from io import TextIOWrapper 31import itertools as _itertools 32import re 33import sys 34from token import * 35from token import EXACT_TOKEN_TYPES 36 37cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) 38blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) 39 40import token 41__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", 42 "untokenize", "TokenInfo"] 43del token 44 45class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): 46 def __repr__(self): 47 annotated_type = '%d (%s)' % (self.type, tok_name[self.type]) 48 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' % 49 self._replace(type=annotated_type)) 50 51 @property 52 def exact_type(self): 53 if self.type == OP and self.string in EXACT_TOKEN_TYPES: 54 return EXACT_TOKEN_TYPES[self.string] 55 else: 56 return self.type 57 58def group(*choices): return '(' + '|'.join(choices) + ')' 59def any(*choices): return group(*choices) + '*' 60def maybe(*choices): return group(*choices) + '?' 61 62# Note: we use unicode matching for names ("\w") but ascii matching for 63# number literals. 64Whitespace = r'[ \f\t]*' 65Comment = r'#[^\r\n]*' 66Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 67Name = r'\w+' 68 69Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' 70Binnumber = r'0[bB](?:_?[01])+' 71Octnumber = r'0[oO](?:_?[0-7])+' 72Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' 73Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) 74Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' 75Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', 76 r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) 77Expfloat = r'[0-9](?:_?[0-9])*' + Exponent 78Floatnumber = group(Pointfloat, Expfloat) 79Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') 80Number = group(Imagnumber, Floatnumber, Intnumber) 81 82# Return the empty string, plus all of the valid string prefixes. 83def _all_string_prefixes(): 84 # The valid string prefixes. Only contain the lower case versions, 85 # and don't contain any permutations (include 'fr', but not 86 # 'rf'). The various permutations will be generated. 

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3
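
# Illustrative note (added comment): e.g. endpats["r'"] is Single and
# endpats['f"""'] is Double3, i.e. the pattern that matches the remainder of
# a string once its prefix and opening quote have been consumed.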

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
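            # Hypothetical example (added comment): a cookie line such as
            # b'# -*- coding: latin-1 -*-' decodes cleanly here, and
            # cookie_re below then extracts 'latin-1' from it.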
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    empty = _itertools.repeat(b"")
    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
    return _tokenize(rl_gen.__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
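            # (Clarifying note, added: detect_encoding() reports 'utf-8-sig'
            # when a BOM was found, but the decoded text no longer carries
            # it, so the ENCODING token below says plain 'utf-8'.)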
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                                (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                # they're in the single_quoted set. If so, they start
                # a string.
                # We're using the first 3, because we're looking for
                # "rb'" (for example) at the start of the token. If
                # we switch to longer prefixes, this needs to be
                # adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                # triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        # token. This is looking for the matching end
                        # regex for the correct type of quote
                        # character. So it's really looking for
                        # endpats["'"] or endpats['"'], by trying to
                        # skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _tokenize(readline, None)
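
# Illustrative usage sketch (added comment, not part of the original module):
#
#     import io
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok.start, tok_name[tok.exact_type], repr(tok.string))
#
# generate_tokens() takes a str-producing readline; tokenize() takes a
# bytes-producing readline and yields an initial ENCODING token.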
617 """ 618 return _tokenize(readline, None) 619 620def main(): 621 import argparse 622 623 # Helper error handling routines 624 def perror(message): 625 sys.stderr.write(message) 626 sys.stderr.write('\n') 627 628 def error(message, filename=None, location=None): 629 if location: 630 args = (filename,) + location + (message,) 631 perror("%s:%d:%d: error: %s" % args) 632 elif filename: 633 perror("%s: error: %s" % (filename, message)) 634 else: 635 perror("error: %s" % message) 636 sys.exit(1) 637 638 # Parse the arguments and options 639 parser = argparse.ArgumentParser(prog='python -m tokenize') 640 parser.add_argument(dest='filename', nargs='?', 641 metavar='filename.py', 642 help='the file to tokenize; defaults to stdin') 643 parser.add_argument('-e', '--exact', dest='exact', action='store_true', 644 help='display token names using the exact type') 645 args = parser.parse_args() 646 647 try: 648 # Tokenize the input 649 if args.filename: 650 filename = args.filename 651 with _builtin_open(filename, 'rb') as f: 652 tokens = list(tokenize(f.readline)) 653 else: 654 filename = "<stdin>" 655 tokens = _tokenize(sys.stdin.readline, None) 656 657 # Output the tokenization 658 for token in tokens: 659 token_type = token.type 660 if args.exact: 661 token_type = token.exact_type 662 token_range = "%d,%d-%d,%d:" % (token.start + token.end) 663 print("%-20s%-15s%-15r" % 664 (token_range, tok_name[token_type], token.string)) 665 except IndentationError as err: 666 line, column = err.args[1][1:3] 667 error(err.args[0], filename, (line, column)) 668 except TokenError as err: 669 line, column = err.args[1] 670 error(err.args[0], filename, (line, column)) 671 except SyntaxError as err: 672 error(err, filename) 673 except OSError as err: 674 error(err) 675 except KeyboardInterrupt: 676 print("interrupted\n") 677 except Exception as err: 678 perror("unexpected error: %s" % err) 679 raise 680 681if __name__ == "__main__": 682 main() 683