# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import sys
import token
import tokenize

from coverage import env
from coverage.backward import iternext, unicode_class
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens().

    """
    last_line = None
    last_lineno = -1
    last_ttext = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttext.endswith("\\"):
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
            last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            last_ttext = ttext
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno

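
# Illustrative sketch, not part of coverage.py's API: shows how phys_tokens()
# restores the backslash-continuation token that tokenize drops, using the
# module-level `generate_tokens` cache defined below.  The helper name is made
# up for illustration and is never called at import time.
def _example_phys_tokens():                         # pragma: no cover
    src = u"a = 1 + \\\n    2\n"
    for ttype, ttext, _, _, _ in phys_tokens(generate_tokens(src)):
        print((ttype, ttext))
    # One of the printed tuples is (99999, '\\\n'): the re-created backslash
    # token, with the fake token type used above.
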

@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0

    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)

    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", u" " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line

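
# Illustrative sketch, not part of coverage.py's API: shows what the
# source_token_lines() pairs look like for a small program.  The helper name
# is made up for illustration and is never called at import time.
def _example_source_token_lines():                  # pragma: no cover
    for line in source_token_lines(u"def hello():\n    return 42\n"):
        print(line)
    # The first yielded line looks roughly like (exact quoting differs
    # between Python 2 and 3):
    #   [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ('op', ')'), ('op', ':')]
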

class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it.  Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iternext(text.splitlines(True))
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens

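
# Illustrative sketch, not part of coverage.py's API: because of the
# one-element cache, tokenizing the same text twice in a row returns the very
# same list object.  The helper name is made up and is never called here.
def _example_cached_tokenizer():                    # pragma: no cover
    text = u"x = 1\n"
    first = generate_tokens(text)
    second = generate_tokens(text)
    assert first is second      # the second call reuses the cached token list
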

COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)

@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    assert isinstance(source, bytes)

    # Do this so the detect_encoding code we copied will work.
    readline = iternext(source.splitlines(True))

    # This is mostly code adapted from Py3.2's tokenize module.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
    # cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    # invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default


@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2

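
# Illustrative sketch, not part of coverage.py's API: a PEP 263 cookie in the
# first two lines of the byte string decides the reported encoding.  The
# helper name is made up and is never called at import time.
def _example_source_encoding():                     # pragma: no cover
    print(source_encoding(b"# -*- coding: iso-8859-1 -*-\nx = 1\n"))
    # -> 'iso-8859-1'
    print(source_encoding(b"x = 1\n"))
    # -> the default: 'ascii' on Python 2, 'utf-8' on Python 3
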

@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it.  Why not?
    Who knows!  It also decodes to utf8, and then tries to interpret those utf8
    bytes according to the encoding declaration.  Why? Who knows!

    This function neuters the coding declaration, and then compiles the source.

    """
    source = neuter_encoding_declaration(source)
    if env.PY2 and isinstance(filename, unicode_class):
        filename = filename.encode(sys.getfilesystemencoding(), "replace")
    code = compile(source, filename, mode)
    return code


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    if COOKIE_RE.search(source):
        source_lines = source.splitlines(True)
        for lineno in range(min(2, len(source_lines))):
            source_lines[lineno] = COOKIE_RE.sub("# (deleted declaration)", source_lines[lineno])
        source = "".join(source_lines)
    return source

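
# Illustrative sketch, not part of coverage.py's API: the coding cookie is
# replaced so that compile_unicode() can hand Unicode text straight to the
# builtin compile().  The helper name is made up and is never called here.
def _example_neuter_and_compile():                  # pragma: no cover
    src = u"# coding: latin-1\nx = 1\n"
    print(neuter_encoding_declaration(src))
    # -> "# (deleted declaration)\nx = 1\n"
    code = compile_unicode(src, "<example>", "exec")
    namespace = {}
    exec(code, namespace)       # tuple form of exec works on Python 2 as well
    print(namespace["x"])       # -> 1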