# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import sys
import token
import tokenize

from coverage import env
from coverage.backward import iternext, unicode_class
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.
88 89 """ 90 91 ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]) 92 line = [] 93 col = 0 94 95 source = source.expandtabs(8).replace('\r\n', '\n') 96 tokgen = generate_tokens(source) 97 98 for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen): 99 mark_start = True 100 for part in re.split('(\n)', ttext): 101 if part == '\n': 102 yield line 103 line = [] 104 col = 0 105 mark_end = False 106 elif part == '': 107 mark_end = False 108 elif ttype in ws_tokens: 109 mark_end = False 110 else: 111 if mark_start and scol > col: 112 line.append(("ws", u" " * (scol - col))) 113 mark_start = False 114 tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3] 115 if ttype == token.NAME and keyword.iskeyword(ttext): 116 tok_class = "key" 117 line.append((tok_class, part)) 118 mark_end = True 119 scol = 0 120 if mark_end: 121 col = ecol 122 123 if line: 124 yield line 125 126 127class CachedTokenizer(object): 128 """A one-element cache around tokenize.generate_tokens. 129 130 When reporting, coverage.py tokenizes files twice, once to find the 131 structure of the file, and once to syntax-color it. Tokenizing is 132 expensive, and easily cached. 133 134 This is a one-element cache so that our twice-in-a-row tokenizing doesn't 135 actually tokenize twice. 136 137 """ 138 def __init__(self): 139 self.last_text = None 140 self.last_tokens = None 141 142 @contract(text='unicode') 143 def generate_tokens(self, text): 144 """A stand-in for `tokenize.generate_tokens`.""" 145 if text != self.last_text: 146 self.last_text = text 147 readline = iternext(text.splitlines(True)) 148 self.last_tokens = list(tokenize.generate_tokens(readline)) 149 return self.last_tokens 150 151# Create our generate_tokens cache as a callable replacement function. 152generate_tokens = CachedTokenizer().generate_tokens 153 154 155COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE) 156 157@contract(source='bytes') 158def _source_encoding_py2(source): 159 """Determine the encoding for `source`, according to PEP 263. 160 161 `source` is a byte string, the text of the program. 162 163 Returns a string, the name of the encoding. 164 165 """ 166 assert isinstance(source, bytes) 167 168 # Do this so the detect_encode code we copied will work. 169 readline = iternext(source.splitlines(True)) 170 171 # This is mostly code adapted from Py3.2's tokenize module. 172 173 def _get_normal_name(orig_enc): 174 """Imitates get_normal_name in tokenizer.c.""" 175 # Only care about the first 12 characters. 176 enc = orig_enc[:12].lower().replace("_", "-") 177 if re.match(r"^utf-8($|-)", enc): 178 return "utf-8" 179 if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc): 180 return "iso-8859-1" 181 return orig_enc 182 183 # From detect_encode(): 184 # It detects the encoding from the presence of a UTF-8 BOM or an encoding 185 # cookie as specified in PEP-0263. If both a BOM and a cookie are present, 186 # but disagree, a SyntaxError will be raised. If the encoding cookie is an 187 # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found, 188 # 'utf-8-sig' is returned. 189 190 # If no encoding is specified, then the default will be returned. 
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default


@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it. Why not?
    Who knows! It also decodes to utf8, and then tries to interpret those utf8
    bytes according to the encoding declaration. Why? Who knows!

    This function neuters the coding declaration, and compiles it.

    """
    source = neuter_encoding_declaration(source)
    if env.PY2 and isinstance(filename, unicode_class):
        filename = filename.encode(sys.getfilesystemencoding(), "replace")
    code = compile(source, filename, mode)
    return code


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    if COOKIE_RE.search(source):
        source_lines = source.splitlines(True)
        for lineno in range(min(2, len(source_lines))):
            source_lines[lineno] = COOKIE_RE.sub("# (deleted declaration)", source_lines[lineno])
        source = "".join(source_lines)
    return source
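

# Usage sketch (an illustrative addition, not part of the original module):
# it shows what source_token_lines() yields for a small snippet and how
# source_encoding() picks up a PEP 263 cookie. The sample text below is made
# up for demonstration only; importing the module normally is unaffected.
if __name__ == '__main__':
    sample = u"# -*- coding: utf-8 -*-\ndef hello():\n    return 'hi'\n"

    # Each yielded line is a list of (token class, token text) pairs, e.g.
    # [('com', u'# -*- coding: utf-8 -*-')] for the first line and
    # [('key', u'def'), ('ws', u' '), ('nam', u'hello'), ('op', u'('), ...]
    # for the second.
    for token_line in source_token_lines(sample):
        print(token_line)

    # The encoding helpers take bytes; the cookie above should be detected.
    print(source_encoding(sample.encode('utf-8')))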