# -*- coding: utf-8 -*-
"""
    pygments.util
    ~~~~~~~~~~~~~

    Utility functions.

    :copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import sys
from io import TextIOWrapper


# Splits a shebang line into path components / arguments.
split_path_re = re.compile(r'[/\\ ]')
# Group 1: optional processing instruction; group 2: the DOCTYPE name plus an
# optional "<keyword> "<quoted id>"" part.
doctype_lookup_re = re.compile(r'''
    (<\?.*?\?>)?\s*
    <!DOCTYPE\s+(
     [a-zA-Z_][a-zA-Z0-9]*
     (?: \s+      # optional in HTML5
     [a-zA-Z_][a-zA-Z0-9]*\s+
     "[^"]*")?
     )
     [^>]*>
''', re.DOTALL | re.MULTILINE | re.VERBOSE)
# An opening tag followed (anywhere later) by some closing tag.
tag_re = re.compile(r'<(.+?)(\s.*?)?>.*?</.+?>',
                    re.UNICODE | re.IGNORECASE | re.DOTALL | re.MULTILINE)
xml_decl_re = re.compile(r'\s*<\?xml[^>]*\?>', re.I)


class ClassNotFound(ValueError):
    """Raised if one of the lookup functions didn't find a matching class."""


class OptionError(Exception):
    """Raised by the ``get_*_opt`` helpers when an option value is invalid."""


def get_choice_opt(options, optname, allowed, default=None, normcase=False):
    """Return the value of option *optname*, which must be one of *allowed*.

    The value is looked up in the *options* mapping, falling back to
    *default*.  If *normcase* is true, string values are lower-cased before
    being checked against *allowed*.

    Raises `OptionError` if the resulting value is not in *allowed*.
    """
    string = options.get(optname, default)
    # Only strings can be case-normalised; anything else (e.g. a ``None``
    # default) is left untouched so that it is validated against *allowed*
    # instead of failing with an AttributeError.
    if normcase and isinstance(string, str):
        string = string.lower()
    if string not in allowed:
        raise OptionError('Value for option %s must be one of %s' %
                          (optname, ', '.join(map(str, allowed))))
    return string


def get_bool_opt(options, optname, default=None):
    """Interpret the value of option *optname* as a boolean.

    Booleans are returned unchanged and integers are converted with
    ``bool()``.  Strings are compared case-insensitively: ``1``, ``yes``,
    ``true`` and ``on`` give ``True``; ``0``, ``no``, ``false`` and ``off``
    give ``False``.

    Raises `OptionError` for any other type or string value.
    """
    string = options.get(optname, default)
    if isinstance(string, bool):
        return string
    elif isinstance(string, int):
        return bool(string)
    elif not isinstance(string, str):
        raise OptionError('Invalid type %r for option %s; use '
                          '1/0, yes/no, true/false, on/off' % (
                              string, optname))
    elif string.lower() in ('1', 'yes', 'true', 'on'):
        return True
    elif string.lower() in ('0', 'no', 'false', 'off'):
        return False
    else:
        raise OptionError('Invalid value %r for option %s; use '
                          '1/0, yes/no, true/false, on/off' % (
                              string, optname))


def get_int_opt(options, optname, default=None):
    """Return the value of option *optname* converted with ``int()``.

    Raises `OptionError` if the conversion fails with a ``TypeError`` or a
    ``ValueError``.
    """
    string = options.get(optname, default)
    try:
        return int(string)
    except TypeError:
        raise OptionError('Invalid type %r for option %s; you '
                          'must give an integer value' % (
                              string, optname))
    except ValueError:
        raise OptionError('Invalid value %r for option %s; you '
                          'must give an integer value' % (
                              string, optname))


def get_list_opt(options, optname, default=None):
    """Return the value of option *optname* as a list.

    A string is split on whitespace; a list or tuple is copied into a new
    list.  Raises `OptionError` for any other type.
    """
    val = options.get(optname, default)
    if isinstance(val, str):
        return val.split()
    elif isinstance(val, (list, tuple)):
        return list(val)
    else:
        raise OptionError('Invalid type %r for option %s; you '
                          'must give a list value' % (
                              val, optname))


def docstring_headline(obj):
    """Return the first paragraph of *obj*'s docstring as a single line.

    Lines are stripped and joined with single spaces up to the first blank
    line.  Returns ``''`` if *obj* has no docstring.
    """
    if not obj.__doc__:
        return ''
    res = []
    for line in obj.__doc__.strip().splitlines():
        if line.strip():
            res.append(" " + line.strip())
        else:
            # A blank line ends the headline paragraph.
            break
    return ''.join(res).lstrip()


def make_analysator(f):
    """Return a static text analyser function that returns float values."""
    def text_analyse(text):
        # Any failure inside the wrapped function counts as "no match".
        try:
            rv = f(text)
        except Exception:
            return 0.0
        if not rv:
            return 0.0
        # Clamp the result into the range [0.0, 1.0].
        try:
            return min(1.0, max(0.0, float(rv)))
        except (ValueError, TypeError):
            return 0.0
    text_analyse.__doc__ = f.__doc__
    return staticmethod(text_analyse)


def shebang_matches(text, regex):
    r"""Check if the given regular expression matches the last part of the
    shebang if one exists.

        >>> from pygments.util import shebang_matches
        >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?')
        True
        >>> shebang_matches('#!/usr/bin/python2.4', r'python(2\.\d)?')
        True
        >>> shebang_matches('#!/usr/bin/python-ruby', r'python(2\.\d)?')
        False
        >>> shebang_matches('#!/usr/bin/python/ruby', r'python(2\.\d)?')
        False
        >>> shebang_matches('#!/usr/bin/startsomethingwith python',
        ...                 r'python(2\.\d)?')
        True

    It also checks for common windows executable file extensions::

        >>> shebang_matches('#!C:\\Python2.4\\Python.exe', r'python(2\.\d)?')
        True

    Parameters (``'-f'`` or ``'--foo'`` are ignored so ``'perl'`` does
    the same as ``'perl -e'``)

    Note that this method automatically searches the whole string (eg:
    the regular expression is wrapped in ``'^$'``)
    """
    index = text.find('\n')
    if index >= 0:
        first_line = text[:index].lower()
    else:
        first_line = text.lower()
    if first_line.startswith('#!'):
        try:
            # Last path component / word that is not a ``-option``.
            found = [x for x in split_path_re.split(first_line[2:].strip())
                     if x and not x.startswith('-')][-1]
        except IndexError:
            return False
        regex = re.compile(r'^%s(\.(exe|cmd|bat|bin))?$' % regex, re.IGNORECASE)
        if regex.search(found) is not None:
            return True
    return False


def doctype_matches(text, regex):
    """Check if the doctype matches a regular expression (if present).

    Note that this method only checks the first part of a DOCTYPE.
    eg: 'html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
    """
    m = doctype_lookup_re.search(text)
    if m is None:
        return False
    doctype = m.group(2)
    return re.compile(regex, re.I).match(doctype.strip()) is not None


def html_doctype_matches(text):
    """Check if the file looks like it has a html doctype."""
    return doctype_matches(text, r'html')


# Maps ``hash(text)`` to the result of the tag search in `looks_like_xml`.
_looks_like_xml_cache = {}


def looks_like_xml(text):
    """Check if a doctype exists or if we have some tags."""
    if xml_decl_re.match(text):
        return True
    key = hash(text)
    try:
        return _looks_like_xml_cache[key]
    except KeyError:
        m = doctype_lookup_re.search(text)
        if m is not None:
            return True
        # Only the first 1000 characters are searched for tags.
        rv = tag_re.search(text[:1000]) is not None
        _looks_like_xml_cache[key] = rv
        return rv


def surrogatepair(c):
    """Given a unicode character code with length greater than 16 bits,
    return the two 16 bit surrogate pair.
    """
    # From example D28 of:
    # http://www.unicode.org/book/ch03.pdf
    return (0xd7c0 + (c >> 10), (0xdc00 + (c & 0x3ff)))


def format_lines(var_name, seq, raw=False, indent_level=0):
    """Formats a sequence of strings for output."""
    lines = []
    base_indent = ' ' * indent_level * 4
    inner_indent = ' ' * (indent_level + 1) * 4
    lines.append(base_indent + var_name + ' = (')
    if raw:
        # These should be preformatted reprs of, say, tuples.
        for i in seq:
            lines.append(inner_indent + i + ',')
    else:
        for i in seq:
            # Force use of single quotes: appending a double quote makes
            # repr() pick single quotes; the extra character is then dropped.
            r = repr(i + '"')
            lines.append(inner_indent + r[:-2] + r[-1] + ',')
    lines.append(base_indent + ')')
    return '\n'.join(lines)


def duplicates_removed(it, already_seen=()):
    """
    Returns a list with duplicates removed from the iterable `it`.

    Items contained in `already_seen` are skipped as well.

    Order is preserved.
    """
    lst = []
    seen = set()
    for i in it:
        if i in seen or i in already_seen:
            continue
        lst.append(i)
        seen.add(i)
    return lst


class Future:
    """Generic class to defer some work.

    Handled specially in RegexLexerMeta, to support regex string construction at
    first use.
    """
    def get(self):
        """Return the deferred value; must be implemented by subclasses."""
        raise NotImplementedError


def guess_decode(text):
    """Decode *text* with guessed encoding.

    First try UTF-8; this should fail for non-UTF-8 encodings.
    Then try the preferred locale encoding.
    Fall back to latin-1, which always works.

    Returns a ``(decoded_text, encoding_name)`` tuple.
    """
    try:
        text = text.decode('utf-8')
        return text, 'utf-8'
    except UnicodeDecodeError:
        try:
            import locale
            prefencoding = locale.getpreferredencoding()
            # Decode with the locale encoding explicitly; a bare ``decode()``
            # would just retry UTF-8, which has already failed above.
            text = text.decode(prefencoding)
            return text, prefencoding
        except (UnicodeDecodeError, LookupError):
            text = text.decode('latin1')
            return text, 'latin1'


def guess_decode_from_terminal(text, term):
    """Decode *text* coming from terminal *term*.

    First try the terminal encoding, if given.
    Then try UTF-8.  Then try the preferred locale encoding.
    Fall back to latin-1, which always works.

    Returns a ``(decoded_text, encoding_name)`` tuple.
    """
    if getattr(term, 'encoding', None):
        try:
            text = text.decode(term.encoding)
        except UnicodeDecodeError:
            pass
        else:
            return text, term.encoding
    return guess_decode(text)


def terminal_encoding(term):
    """Return our best guess of encoding for the given *term*."""
    if getattr(term, 'encoding', None):
        return term.encoding
    import locale
    return locale.getpreferredencoding()


class UnclosingTextIOWrapper(TextIOWrapper):
    """A `TextIOWrapper` whose ``close()`` only flushes."""

    # Don't close underlying buffer on destruction.
    def close(self):
        self.flush()