# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import sys
import token
import tokenize

from coverage import env
from coverage.backward import iternext, unicode_class
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttext = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttext.endswith("\\"):
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            last_ttext = ttext
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
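

# A minimal usage sketch (illustrative only; the source snippet below is an
# assumption, not a fixture from this project). Running generate_tokens()
# output through phys_tokens() makes the line continuation visible as its own
# token, with the fake type 99999, between the '+' and the '2'::
#
#     toks = generate_tokens(u"a = 1 + \\\n    2\n")
#     for ttype, ttext, start, end, ltext in phys_tokens(toks):
#         print(ttype, ttext)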


@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0

    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)

    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", u" " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
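

# A minimal sketch of the output (illustrative only; the two-line program and
# the token lines shown are an assumption, not a project fixture)::
#
#     for line in source_token_lines(u"def hello():\n    return 'hi'\n"):
#         print(line)
#
#     # [('key', 'def'), ('ws', ' '), ('nam', 'hello'),
#     #  ('op', '('), ('op', ')'), ('op', ':')]
#     # [('ws', '    '), ('key', 'return'), ('ws', ' '), ('str', "'hi'")]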


class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it. Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iternext(text.splitlines(True))
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens


COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)

@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    assert isinstance(source, bytes)

    # Do this so the detect_encoding code we copied will work.
    readline = iternext(source.splitlines(True))

    # This is mostly code adapted from Py3.2's tokenize module.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
    # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
    # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default


@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it. Why not?
    Who knows! It also decodes to utf8, and then tries to interpret those utf8
    bytes according to the encoding declaration. Why? Who knows!

    This function neuters the coding declaration, and compiles it.

    """
    source = neuter_encoding_declaration(source)
    if env.PY2 and isinstance(filename, unicode_class):
        filename = filename.encode(sys.getfilesystemencoding(), "replace")
    code = compile(source, filename, mode)
    return code


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    if COOKIE_RE.search(source):
        source_lines = source.splitlines(True)
        for lineno in range(min(2, len(source_lines))):
            source_lines[lineno] = COOKIE_RE.sub("# (deleted declaration)", source_lines[lineno])
        source = "".join(source_lines)
    return source
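

# A minimal end-to-end sketch (illustrative only; the byte string and the
# "example.py" filename are assumptions, not project fixtures).
# source_encoding() reads the PEP 263 cookie, and compile_unicode() neuters
# it so the already-decoded text can be compiled::
#
#     src = b"# coding: iso-8859-1\nname = u'caf\xe9'\n"
#     source_encoding(src)                # returns 'iso-8859-1'
#     text = src.decode('iso-8859-1')
#     code = compile_unicode(text, "example.py", "exec")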