# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
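# A minimal usage sketch (illustrative only, not part of the API; Python 2
# syntax to match this module):
#
#     from StringIO import StringIO
#     source = StringIO("x = 1\n")
#     for tok_type, tok_string, start, end, line in generate_tokens(source.readline):
#         print tok_name[tok_type], repr(tok_string), start, end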

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
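# For reference: group('a', 'b') yields '(a|b)'; any() appends '*' for
# zero-or-more repetitions and maybe() appends '?' for an optional match.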

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
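# For reference, Number matches literals such as 0b101, 0xffL, 0o17, 42L,
# 3.14e-2 and 1j (the long suffix and 0b/0o forms follow Python 2.6 syntax).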

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t
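# Both dicts are used purely as sets: membership of a token's prefix-plus-quote
# (any casing of u/b/r) decides below whether it opens a triple-quoted or a
# single-quoted string.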

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
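
# A sketch of tokenize() with a custom tokeneater (the filename and helper
# below are hypothetical; any callable taking the five token fields works):
#
#     names = []
#     def collect_names(type, token, start, end, line):
#         if type == NAME:
#             names.append(token)
#     tokenize(open("example.py").readline, collect_names)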

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
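# For example, _get_normal_name("UTF_8") returns "utf-8", and
# _get_normal_name("Latin_1") normalizes to "iso-8859-1".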

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError is raised. If the encoding cookie is an invalid
    charset, a SyntaxError is also raised.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
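
# Typical use, as a sketch (``path`` is a hypothetical filename; the file is
# opened in binary mode because detect_encoding() reads bytes):
#
#     fp = open(path, "rb")
#     encoding, first_lines = detect_encoding(fp.readline)
#     fp.seek(0)        # rewind before tokenizing the whole file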

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two elements are passed, the resulting output is poor because
    the position information is lost.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]
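    # State for the main loop: contstr/contline accumulate a string literal
    # that spans physical lines, needcont flags a single-quoted string that
    # must continue via a trailing backslash, parenlev tracks open brackets
    # (newlines inside brackets are reported as NL rather than NEWLINE), and
    # indents is the stack of indentation columns seen so far.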

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)