# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import sys
import token
import tokenize

from coverage import env
from coverage.backward import iternext, unicode_class
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens().

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
            last_line = ltext
            last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
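

# A minimal sketch, not part of coverage.py's API, showing the extra token
# that phys_tokens() injects for a backslash continuation.  The helper name
# `_phys_tokens_demo` is hypothetical and purely illustrative.
def _phys_tokens_demo():
    """Return (type, text) pairs for a two-line continued assignment."""
    source = u"x = 1 + \\\n    2\n"
    toks = generate_tokens(source)
    # The injected continuation token has the fake type 99999 and text "\\\n".
    return [(ttype, ttext) for ttype, ttext, _, _, _ in phys_tokens(toks)]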


@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0

    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)

    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", u" " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
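

# A hedged sketch, not part of coverage.py itself, of the round-trip property
# described in the docstring above: joining the token texts of every line with
# newlines reproduces the original source (minus the final newline here).  The
# helper name `_source_token_lines_demo` is hypothetical.
def _source_token_lines_demo():
    """Rebuild a tiny program from its token-line pairs."""
    source = u"def hello():\n    return 'hi'\n"
    rebuilt = "\n".join(
        "".join(text for _, text in line)
        for line in source_token_lines(source)
    )
    return rebuilt == source.rstrip("\n")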


class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it.  Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iternext(text.splitlines(True))
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens
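

# A small sketch, hypothetical and not part of coverage.py's API, illustrating
# the one-element cache: tokenizing the same text twice returns the identical
# token list, so the second call does no tokenizing work.
def _cached_tokenizer_demo():
    """Show that repeated calls for the same text share one token list."""
    text = u"a = 1\n"
    first = generate_tokens(text)
    second = generate_tokens(text)
    # Same list object, not merely an equal list, thanks to the cache.
    return first is second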


COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)

@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    assert isinstance(source, bytes)

    # Do this so the detect_encoding code we copied will work.
    readline = iternext(source.splitlines(True))

    # This is mostly code adapted from Py3.2's tokenize module.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
    # cookie as specified in PEP 263.  If both a BOM and a cookie are present
    # but disagree, a SyntaxError is raised.  If the encoding cookie is an
    # invalid charset, a SyntaxError is raised.  Note that if a UTF-8 BOM is
    # found, 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default


@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2
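

# A hedged sketch, not part of coverage.py, of PEP 263 detection through the
# version-appropriate `source_encoding` chosen above.  The helper name
# `_source_encoding_demo` is hypothetical.
def _source_encoding_demo():
    """Detect the declared encoding of a byte-string program."""
    source = b"# -*- coding: iso-8859-1 -*-\nx = 1\n"
    # The cookie on the first line declares iso-8859-1.
    return source_encoding(source)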


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it.  Why not?
    Who knows!  It also decodes to utf8, and then tries to interpret those utf8
    bytes according to the encoding declaration.  Why? Who knows!

    This function neuters the coding declaration, and compiles the source.

    """
    source = neuter_encoding_declaration(source)
    if env.PY2 and isinstance(filename, unicode_class):
        filename = filename.encode(sys.getfilesystemencoding(), "replace")
    code = compile(source, filename, mode)
    return code
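

# A minimal sketch with a hypothetical helper name, not coverage.py API:
# compiling a Unicode source string that carries an encoding declaration,
# which Python 2's plain `compile` builtin would reject.
def _compile_unicode_demo():
    """Compile declared-encoding source and run it in a fresh namespace."""
    source = u"# coding: utf-8\nanswer = 42\n"
    code = compile_unicode(source, "<demo>", "exec")
    namespace = {}
    exec(code, namespace)
    return namespace["answer"]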


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    if COOKIE_RE.search(source):
        source_lines = source.splitlines(True)
        for lineno in range(min(2, len(source_lines))):
            source_lines[lineno] = COOKIE_RE.sub("# (deleted declaration)", source_lines[lineno])
        source = "".join(source_lines)
    return source
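

# A hedged illustration with a hypothetical helper name: the declaration found
# by COOKIE_RE is overwritten in place, so line numbers are preserved.
def _neuter_declaration_demo():
    """Return the neutered form of a two-line source with a coding cookie."""
    source = u"# -*- coding: latin-1 -*-\nname = 'value'\n"
    # First line becomes "# (deleted declaration) -*-"; the rest is untouched.
    return neuter_encoding_declaration(source)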