1import codecs
2import io
3import re
4from typing import Iterable, Union, overload
5
6
7# https://mypy.readthedocs.io/en/stable/more_types.html#function-overloading
8
9@overload
10def always_bytes(str_or_bytes: None, *encode_args) -> None:
11    ...
12
13
14@overload
15def always_bytes(str_or_bytes: Union[str, bytes], *encode_args) -> bytes:
16    ...
17
18
19def always_bytes(str_or_bytes: Union[None, str, bytes], *encode_args) -> Union[None, bytes]:
20    if str_or_bytes is None or isinstance(str_or_bytes, bytes):
21        return str_or_bytes
22    elif isinstance(str_or_bytes, str):
23        return str_or_bytes.encode(*encode_args)
24    else:
25        raise TypeError("Expected str or bytes, but got {}.".format(type(str_or_bytes).__name__))
26
27
28@overload
29def always_str(str_or_bytes: None, *encode_args) -> None:
30    ...
31
32
33@overload
34def always_str(str_or_bytes: Union[str, bytes], *encode_args) -> str:
35    ...
36
37
38def always_str(str_or_bytes: Union[None, str, bytes], *decode_args) -> Union[None, str]:
39    """
40    Returns,
41        str_or_bytes unmodified, if
42    """
43    if str_or_bytes is None or isinstance(str_or_bytes, str):
44        return str_or_bytes
45    elif isinstance(str_or_bytes, bytes):
46        return str_or_bytes.decode(*decode_args)
47    else:
48        raise TypeError("Expected str or bytes, but got {}.".format(type(str_or_bytes).__name__))
49
50
51# Translate control characters to "safe" characters. This implementation
52# initially replaced them with the matching control pictures
53# (http://unicode.org/charts/PDF/U2400.pdf), but that turned out to render badly
54# with monospace fonts. We are back to "." therefore.
55_control_char_trans = {
56    x: ord(".")  # x + 0x2400 for unicode control group pictures
57    for x in range(32)
58}
59_control_char_trans[127] = ord(".")  # 0x2421
60_control_char_trans_newline = _control_char_trans.copy()
61for x in ("\r", "\n", "\t"):
62    del _control_char_trans_newline[ord(x)]
63
64_control_char_trans = str.maketrans(_control_char_trans)
65_control_char_trans_newline = str.maketrans(_control_char_trans_newline)
66
67
68def escape_control_characters(text: str, keep_spacing=True) -> str:
69    """
70    Replace all unicode C1 control characters from the given text with a single "."
71
72    Args:
73        keep_spacing: If True, tabs and newlines will not be replaced.
74    """
75    if not isinstance(text, str):
76        raise ValueError("text type must be unicode but is {}".format(type(text).__name__))
77
78    trans = _control_char_trans_newline if keep_spacing else _control_char_trans
79    return text.translate(trans)
80
81
82def bytes_to_escaped_str(data: bytes, keep_spacing: bool = False, escape_single_quotes: bool = False) -> str:
83    """
84    Take bytes and return a safe string that can be displayed to the user.
85
86    Single quotes are always escaped, double quotes are never escaped:
87        "'" + bytes_to_escaped_str(...) + "'"
88    gives a valid Python string.
89
90    Args:
91        keep_spacing: If True, tabs and newlines will not be escaped.
92    """
93
94    if not isinstance(data, bytes):
95        raise ValueError(f"data must be bytes, but is {data.__class__.__name__}")
96    # We always insert a double-quote here so that we get a single-quoted string back
97    # https://stackoverflow.com/questions/29019340/why-does-python-use-different-quotes-for-representing-strings-depending-on-their
98    ret = repr(b'"' + data).lstrip("b")[2:-1]
99    if not escape_single_quotes:
100        ret = re.sub(r"(?<!\\)(\\\\)*\\'", lambda m: (m.group(1) or "") + "'", ret)
101    if keep_spacing:
102        ret = re.sub(
103            r"(?<!\\)(\\\\)*\\([nrt])",
104            lambda m: (m.group(1) or "") + dict(n="\n", r="\r", t="\t")[m.group(2)],
105            ret
106        )
107    return ret
108
109
110def escaped_str_to_bytes(data: str) -> bytes:
111    """
112    Take an escaped string and return the unescaped bytes equivalent.
113
114    Raises:
115        ValueError, if the escape sequence is invalid.
116    """
117    if not isinstance(data, str):
118        raise ValueError(f"data must be str, but is {data.__class__.__name__}")
119
120    # This one is difficult - we use an undocumented Python API here
121    # as per http://stackoverflow.com/a/23151714/934719
122    return codecs.escape_decode(data)[0]  # type: ignore
123
124
125def is_mostly_bin(s: bytes) -> bool:
126    if not s or len(s) == 0:
127        return False
128
129    return sum(
130        i < 9 or 13 < i < 32 or 126 < i
131        for i in s[:100]
132    ) / len(s[:100]) > 0.3
133
134
135def is_xml(s: bytes) -> bool:
136    for char in s:
137        if char in (9, 10, 32):  # is space?
138            continue
139        return char == 60  # is a "<"?
140    return False
141
142
143def clean_hanging_newline(t):
144    """
145        Many editors will silently add a newline to the final line of a
146        document (I'm looking at you, Vim). This function fixes this common
147        problem at the risk of removing a hanging newline in the rare cases
148        where the user actually intends it.
149    """
150    if t and t[-1] == "\n":
151        return t[:-1]
152    return t
153
154
155def hexdump(s):
156    """
157        Returns:
158            A generator of (offset, hex, str) tuples
159    """
160    for i in range(0, len(s), 16):
161        offset = f"{i:0=10x}"
162        part = s[i:i + 16]
163        x = " ".join(f"{i:0=2x}" for i in part)
164        x = x.ljust(47)  # 16*2 + 15
165        part_repr = always_str(escape_control_characters(
166            part.decode("ascii", "replace").replace("\ufffd", "."),
167            False
168        ))
169        yield (offset, x, part_repr)
170
171
172def _move_to_private_code_plane(matchobj):
173    return chr(ord(matchobj.group(0)) + 0xE000)
174
175
176def _restore_from_private_code_plane(matchobj):
177    return chr(ord(matchobj.group(0)) - 0xE000)
178
179
180NO_ESCAPE = r"(?<!\\)(?:\\\\)*"
181MULTILINE_CONTENT = r"[\s\S]*?"
182SINGLELINE_CONTENT = r".*?"
183MULTILINE_CONTENT_LINE_CONTINUATION = r"(?:.|(?<=\\)\n)*?"
184
185
186def split_special_areas(
187        data: str,
188        area_delimiter: Iterable[str],
189):
190    """
191    Split a string of code into a [code, special area, code, special area, ..., code] list.
192
193    For example,
194
195    >>> split_special_areas(
196    >>>     "test /* don't modify me */ foo",
197    >>>     [r"/\\*[\\s\\S]*?\\*/"])  # (regex matching comments)
198    ["test ", "/* don't modify me */", " foo"]
199
200    "".join(split_special_areas(x, ...)) == x always holds true.
201    """
202    return re.split(
203        "({})".format("|".join(area_delimiter)),
204        data,
205        flags=re.MULTILINE
206    )
207
208
209def escape_special_areas(
210        data: str,
211        area_delimiter: Iterable[str],
212        control_characters,
213):
214    """
215    Escape all control characters present in special areas with UTF8 symbols
216    in the private use plane (U+E000 t+ ord(char)).
217    This is useful so that one can then use regex replacements on the resulting string without
218    interfering with special areas.
219
220    control_characters must be 0 < ord(x) < 256.
221
222    Example:
223
224    >>> print(x)
225    if (true) { console.log('{}'); }
226    >>> x = escape_special_areas(x, "{", ["'" + SINGLELINE_CONTENT + "'"])
227    >>> print(x)
228    if (true) { console.log('�}'); }
229    >>> x = re.sub(r"\\s*{\\s*", " {\n    ", x)
230    >>> x = unescape_special_areas(x)
231    >>> print(x)
232    if (true) {
233        console.log('{}'); }
234    """
235    buf = io.StringIO()
236    parts = split_special_areas(data, area_delimiter)
237    rex = re.compile(fr"[{control_characters}]")
238    for i, x in enumerate(parts):
239        if i % 2:
240            x = rex.sub(_move_to_private_code_plane, x)
241        buf.write(x)
242    return buf.getvalue()
243
244
245def unescape_special_areas(data: str):
246    """
247    Invert escape_special_areas.
248
249    x == unescape_special_areas(escape_special_areas(x)) always holds true.
250    """
251    return re.sub(r"[\ue000-\ue0ff]", _restore_from_private_code_plane, data)
252