1"""Tokenization help for Python programs.
2
3generate_tokens(readline) is a generator that breaks a stream of
4text into Python tokens.  It accepts a readline-like method which is called
5repeatedly to get the next line of input (or "" for EOF).  It generates
65-tuples with these members:
7
8    the token type (see token.py)
9    the token (a string)
10    the starting (row, column) indices of the token (a 2-tuple of ints)
11    the ending (row, column) indices of the token (a 2-tuple of ints)
12    the original line (string)
13
14It is designed to match the working of the Python tokenizer exactly, except
15that it produces COMMENT tokens for comments and gives type OP for all
16operators
17
18Older entry points
19    tokenize_loop(readline, tokeneater)
20    tokenize(readline, tokeneater=printtoken)
21are the same, except instead of generating tokens, tokeneater is a callback
22function to which the 5 fields described above are passed as 5 arguments,
23each time a new token is found."""
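
# A minimal usage sketch of generate_tokens() (the inline source string is
# just an illustrative example):
#
#     from StringIO import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print tok
#
# This prints one 5-tuple per token: NAME 'x', OP '=', NUMBER '1',
# NEWLINE '\n', and finally ENDMARKER ''.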

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
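
# For example, group('==', '=') expands to '(==|=)', any(r'\d') to r'(\d)*',
# and maybe(Exponent) wraps its argument and appends '?'.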

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
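
# Illustrative matches for Number: '0x1fL' (Hexnumber), '0b101' (Binnumber),
# '017' (old-style Octnumber), '1.5e-3' (Floatnumber), '3j' (Imagnumber).
# Imagnumber is tried first so that '3j' is not split into '3' plus a name.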

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Python's re module tries alternatives left to right and takes the first
# one that matches (not the longest), so be sure to put the longest
# operators first (e.g., if = came before ==, == would get recognized as
# two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
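
# Illustrative behavior of the pseudo-token pattern used by the scanner:
# leading whitespace is consumed outside group 1, so, for example,
# pseudoprog.match('  x = 1', 0).span(1) == (2, 3), i.e. the token 'x'.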

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}
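
# For example, endprogs["'''"] matches the *tail* of an already-opened
# triple-quoted string: single3prog.match("body'''rest").end() is 7,
# just past the closing quotes.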

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
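
# A sketch of a custom tokeneater callback (the filename here is only an
# example): collect comment text from a source file.
#
#     comments = []
#     def grab_comment(type, token, start, end, line):
#         if type == COMMENT:
#             comments.append(token)
#     tokenize(open("example.py").readline, grab_comment)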

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

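    # compat() is the fallback for 2-tuple (type, value) input: exact
    # positions are unknown, so spacing is reconstructed heuristically
    # (hence the "resulting output is poor" caveat in untokenize()).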
    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements: a token number and a token value.  If
    only the two-element form is passed, the resulting output is poor
    (exact spacing is lost).

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be any callable that signals end of input by raising StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
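    # Scanner state: contstr/contline accumulate a string token that spans
    # multiple lines; parenlev tracks bracket nesting (newlines inside
    # brackets are NL, not NEWLINE); continued is set after a backslash
    # line continuation; indents is the stack of indentation columns.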
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
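                    # a tab advances to the next multiple of tabsize,
                    # e.g. with tabsize=8: column 3 -> 8, column 8 -> 16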
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' here: a blank line
                    yield (NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)