import codecs
import io
import re
from typing import Iterable, Union, overload


# https://mypy.readthedocs.io/en/stable/more_types.html#function-overloading

@overload
def always_bytes(str_or_bytes: None, *encode_args) -> None:
    ...


@overload
def always_bytes(str_or_bytes: Union[str, bytes], *encode_args) -> bytes:
    ...


def always_bytes(str_or_bytes: Union[None, str, bytes], *encode_args) -> Union[None, bytes]:
    """
    Coerce the input to bytes.

    Returns:
        None unmodified if the input is None, the input unmodified if it is
        already bytes, otherwise the str encoded via str.encode(*encode_args)
        (e.g. encoding name and error handler).

    Raises:
        TypeError: if the input is neither None, str, nor bytes.
    """
    if str_or_bytes is None or isinstance(str_or_bytes, bytes):
        return str_or_bytes
    elif isinstance(str_or_bytes, str):
        return str_or_bytes.encode(*encode_args)
    else:
        raise TypeError("Expected str or bytes, but got {}.".format(type(str_or_bytes).__name__))


@overload
def always_str(str_or_bytes: None, *decode_args) -> None:
    ...


@overload
def always_str(str_or_bytes: Union[str, bytes], *decode_args) -> str:
    ...


def always_str(str_or_bytes: Union[None, str, bytes], *decode_args) -> Union[None, str]:
    """
    Coerce the input to str.

    Returns:
        None unmodified if the input is None, the input unmodified if it is
        already str, otherwise the bytes decoded via bytes.decode(*decode_args)
        (e.g. encoding name and error handler).

    Raises:
        TypeError: if the input is neither None, str, nor bytes.
    """
    if str_or_bytes is None or isinstance(str_or_bytes, str):
        return str_or_bytes
    elif isinstance(str_or_bytes, bytes):
        return str_or_bytes.decode(*decode_args)
    else:
        raise TypeError("Expected str or bytes, but got {}.".format(type(str_or_bytes).__name__))


# Translate control characters to "safe" characters. This implementation
# initially replaced them with the matching control pictures
# (http://unicode.org/charts/PDF/U2400.pdf), but that turned out to render badly
# with monospace fonts. We are back to "." therefore.
# Map every C0 control character (0-31) and DEL (127) to ".".
_control_char_trans = {
    x: ord(".")  # x + 0x2400 for unicode control group pictures
    for x in range(32)
}
_control_char_trans[127] = ord(".")  # 0x2421
# Variant table that leaves \r, \n and \t untouched for callers that want
# to preserve spacing.
_control_char_trans_newline = _control_char_trans.copy()
for x in ("\r", "\n", "\t"):
    del _control_char_trans_newline[ord(x)]

_control_char_trans = str.maketrans(_control_char_trans)
_control_char_trans_newline = str.maketrans(_control_char_trans_newline)


def escape_control_characters(text: str, keep_spacing=True) -> str:
    """
    Replace all C0 control characters (and DEL) in the given text with a single "."

    Args:
        keep_spacing: If True, tabs and newlines will not be replaced.

    Raises:
        ValueError: if text is not a str.
    """
    if not isinstance(text, str):
        raise ValueError("text type must be unicode but is {}".format(type(text).__name__))

    trans = _control_char_trans_newline if keep_spacing else _control_char_trans
    return text.translate(trans)


def bytes_to_escaped_str(data: bytes, keep_spacing: bool = False, escape_single_quotes: bool = False) -> str:
    """
    Take bytes and return a safe string that can be displayed to the user.

    Single quotes are always escaped, double quotes are never escaped:
        "'" + bytes_to_escaped_str(...) + "'"
    gives a valid Python string.

    Args:
        keep_spacing: If True, tabs and newlines will not be escaped.

    Raises:
        ValueError: if data is not bytes.
    """

    if not isinstance(data, bytes):
        raise ValueError(f"data must be bytes, but is {data.__class__.__name__}")
    # We always insert a double-quote here so that we get a single-quoted string back
    # https://stackoverflow.com/questions/29019340/why-does-python-use-different-quotes-for-representing-strings-depending-on-their
    ret = repr(b'"' + data).lstrip("b")[2:-1]
    if not escape_single_quotes:
        # Undo repr's \' escaping, but only where the backslash is not itself escaped.
        ret = re.sub(r"(?<!\\)(\\\\)*\\'", lambda m: (m.group(1) or "") + "'", ret)
    if keep_spacing:
        # Turn literal \n, \r, \t escape sequences back into real whitespace.
        ret = re.sub(
            r"(?<!\\)(\\\\)*\\([nrt])",
            lambda m: (m.group(1) or "") + dict(n="\n", r="\r", t="\t")[m.group(2)],
            ret
        )
    return ret


def escaped_str_to_bytes(data: str) -> bytes:
    """
    Take an escaped string and return the unescaped bytes equivalent.

    Raises:
        ValueError, if the escape sequence is invalid.
    """
    if not isinstance(data, str):
        raise ValueError(f"data must be str, but is {data.__class__.__name__}")

    # This one is difficult - we use an undocumented Python API here
    # as per http://stackoverflow.com/a/23151714/934719
    return codecs.escape_decode(data)[0]  # type: ignore


def is_mostly_bin(s: bytes) -> bool:
    """
    Heuristic: True if more than 30% of the first 100 bytes are not
    printable ASCII or common whitespace (\\t, \\n, \\v, \\f, \\r).
    Empty input is considered not binary.
    """
    if not s:
        return False

    sample = s[:100]
    return sum(
        i < 9 or 13 < i < 32 or 126 < i
        for i in sample
    ) / len(sample) > 0.3


def is_xml(s: bytes) -> bool:
    """
    Heuristic: True if the first byte that is not tab, newline or space
    is "<". Empty (or all-whitespace) input returns False.
    """
    for char in s:
        if char in (9, 10, 32):  # is space?
            continue
        return char == 60  # is a "<"?
    return False


def clean_hanging_newline(t):
    """
    Many editors will silently add a newline to the final line of a
    document (I'm looking at you, Vim). This function fixes this common
    problem at the risk of removing a hanging newline in the rare cases
    where the user actually intends it.
    """
    if t and t[-1] == "\n":
        return t[:-1]
    return t


def hexdump(s):
    """
    Returns:
        A generator of (offset, hex, str) tuples for 16-byte rows of s.
    """
    for i in range(0, len(s), 16):
        offset = f"{i:0=10x}"
        part = s[i:i + 16]
        # Use a distinct name for the inner loop variable so it does not
        # shadow the row offset `i`.
        hexed = " ".join(f"{byte:0=2x}" for byte in part)
        hexed = hexed.ljust(47)  # 16*2 + 15
        # escape_control_characters already returns str, so no extra
        # always_str() coercion is needed here.
        part_repr = escape_control_characters(
            part.decode("ascii", "replace").replace("\ufffd", "."),
            False
        )
        yield (offset, hexed, part_repr)


def _move_to_private_code_plane(matchobj):
    # Shift a single character into the Unicode private use area (U+E000...).
    return chr(ord(matchobj.group(0)) + 0xE000)


def _restore_from_private_code_plane(matchobj):
    # Inverse of _move_to_private_code_plane.
    return chr(ord(matchobj.group(0)) - 0xE000)


NO_ESCAPE = r"(?<!\\)(?:\\\\)*"
MULTILINE_CONTENT = r"[\s\S]*?"
SINGLELINE_CONTENT = r".*?"
MULTILINE_CONTENT_LINE_CONTINUATION = r"(?:.|(?<=\\)\n)*?"


def split_special_areas(
    data: str,
    area_delimiter: Iterable[str],
):
    """
    Split a string of code into a [code, special area, code, special area, ..., code] list.

    For example,

    >>> split_special_areas(
    >>>     "test /* don't modify me */ foo",
    >>>     [r"/\\*[\\s\\S]*?\\*/"])  # (regex matching comments)
    ["test ", "/* don't modify me */", " foo"]

    "".join(split_special_areas(x, ...)) == x always holds true.
    """
    # The capturing group around the alternation makes re.split keep the
    # delimiters (the special areas) in the result at odd indices.
    return re.split(
        "({})".format("|".join(area_delimiter)),
        data,
        flags=re.MULTILINE
    )


def escape_special_areas(
    data: str,
    area_delimiter: Iterable[str],
    control_characters,
):
    """
    Escape all control characters present in special areas with UTF8 symbols
    in the private use plane (U+E000 + ord(char)).
    This is useful so that one can then use regex replacements on the resulting string without
    interfering with special areas.

    control_characters must be 0 < ord(x) < 256.

    Example:

    >>> print(x)
    if (true) { console.log('{}'); }
    >>> x = escape_special_areas(x, ["'" + SINGLELINE_CONTENT + "'"], "{")
    >>> print(x)
    if (true) { console.log('�}'); }
    >>> x = re.sub(r"\\s*{\\s*", " {\n    ", x)
    >>> x = unescape_special_areas(x)
    >>> print(x)
    if (true) {
        console.log('{}'); }
    """
    buf = io.StringIO()
    parts = split_special_areas(data, area_delimiter)
    rex = re.compile(fr"[{control_characters}]")
    for i, x in enumerate(parts):
        if i % 2:
            # Odd indices are the special areas (see split_special_areas).
            x = rex.sub(_move_to_private_code_plane, x)
        buf.write(x)
    return buf.getvalue()


def unescape_special_areas(data: str):
    """
    Invert escape_special_areas.

    x == unescape_special_areas(escape_special_areas(x)) always holds true.
    """
    return re.sub(r"[\ue000-\ue0ff]", _restore_from_private_code_plane, data)