1"""Utilities for parsing source text
2"""
3import html
4import re
5from typing import Any
6
7from .entities import entities
8
9
def charCodeAt(src: str, pos: int) -> Any:
    """
    Return the Unicode code point of the character at index ``pos`` of ``src``.

    Python counterpart of JavaScript's ``String.prototype.charCodeAt``.

    :param src: the string to read from.
    :param pos: the zero-based index of the desired character.
    :return: the code point as an ``int``, or ``None`` when there is no
        character at that index (where JavaScript would return ``NaN``).
    """
    # A negative index would make Python count from the end of the string;
    # for a zero-based index it simply means "no character here".
    if pos < 0:
        return None
    try:
        return ord(src[pos])
    except IndexError:
        return None
23
24
25# Merge objects
26#
def assign(obj):
    """Merge the keys of further source objects into ``obj`` (not ported).

    Placeholder for the JavaScript helper
    ``assign(obj /*from1, from2, from3, ...*/)``. It has not been translated
    to Python, so calling it always fails.

    :raises NotImplementedError: always.
    """
    raise NotImplementedError
    # Original JavaScript logic, kept for reference only (never executed):
    #
    # sources = Array.prototype.slice.call(arguments, 1)

    # sources.forEach(function (source) {
    #   if (!source) { return; }

    #   if (typeof source !== 'object') {
    #     throw new TypeError(source + 'must be object')
    #   }

    #   Object.keys(source).forEach(function (key) {
    #     obj[key] = source[key]
    #   })
    # })

    # return obj
45
46
def arrayReplaceAt(src: list, pos: int, newElements: list) -> list:
    """
    Build a new list in which the element at ``pos`` is replaced by all of
    ``newElements``. ``src`` itself is left untouched.

    Handy when one token has to be swapped for several tokens.
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return before + newElements + after
53
54
55######################################################################
56
57
def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` may be produced from a numeric entity.

    Rejected are: surrogates, the never-used block U+FDD0..U+FDEF, every code
    point whose low 16 bits are 0xFFFE or 0xFFFF, control codes (except tab,
    line feed, form feed and carriage return) and anything above U+10FFFF.
    """
    rejected_ranges = (
        (0xD800, 0xDFFF),  # broken sequence (surrogates)
        (0xFDD0, 0xFDEF),  # never used
        (0x00, 0x08),  # control codes
        (0x0B, 0x0B),
        (0x0E, 0x1F),
        (0x7F, 0x9F),
    )
    if any(low <= c <= high for low, high in rejected_ranges):
        return False
    # the last two code points of every 64K plane are never used either
    if (c & 0xFFFF) in (0xFFFE, 0xFFFF):
        return False
    # whatever remains is valid unless it is out of range
    return c <= 0x10FFFF
81
82
def fromCodePoint(c: int) -> str:
    """Return the one-character string for code point ``c``.

    The JavaScript original needed two string units for code points above
    ``0xFFFF``; a Python 3 string holds any code point as a single character,
    so the built-in ``chr`` is all that is required.
    """
    character = chr(c)
    return character
91
92
# A backslash followed by one ASCII punctuation character
# (group 1: the punctuation character).
UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g       = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
# Either a backslash escape (group 1: the escaped character) or an
# entity-like reference ``&name;`` (group 2: the name without ``&`` and ``;``).
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# Start of a numeric entity name: ``#`` followed by 1-8 decimal digits, or by
# ``x`` and 1-8 hexadecimal digits (group 1: everything after the ``#``).
# The pattern is anchored at the start only, not at the end.
DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
100
101
def replaceEntityPattern(match: str, name: str) -> str:
    """Resolve one HTML entity reference to the text it stands for.

    ::

        replaceEntityPattern("&#35;", "#35")     -> "#"
        replaceEntityPattern("&#x23;", "#x23")   -> "#"
        replaceEntityPattern("&#12ab;", "#12ab") -> "&#12ab;"

    :param match: the complete reference as found in the text
        (``&``, the name, ``;``); returned unchanged when it cannot be resolved.
    :param name: the part between ``&`` and ``;``.
    :return: the replacement text, or ``match`` itself for unknown names,
        malformed numeric references and invalid code points.
    """
    if name in entities:
        return entities[name]

    # The whole name must be a numeric reference. Checking only its prefix
    # would let names such as "#12ab" reach int(), which raises ValueError.
    numeric = DIGITAL_ENTITY_TEST_RE.fullmatch(name)
    if numeric is not None:
        digits = numeric.group(1)
        # a leading "x"/"X" marks hexadecimal digits, otherwise they are decimal
        code = int(digits[1:], 16) if digits[0] in "xX" else int(digits, 10)
        if isValidEntityCode(code):
            return fromCodePoint(code)

    return match
121
122
123# def replaceEntities(string):
124#   if (string.indexOf('&') < 0):
125#       return string
126#   return string.replace(ENTITY_RE, replaceEntityPattern)
127
128
def unescapeMd(string: str) -> str:
    """Remove the backslash from backslash-escaped punctuation (not ported).

    This helper has not been translated to Python yet, so calling it
    always fails.

    :raises NotImplementedError: always.
    """
    raise NotImplementedError
    # Draft translation, kept for reference only (never executed).
    # NOTE(review): the guard below looks inverted - presumably it should
    # return early when the string contains NO backslash; also ``str.replace``
    # does not accept a compiled pattern. Confirm before enabling.
    # if "\\" in string:
    #     return string
    # return string.replace(UNESCAPE_MD_RE, "$1")
134
135
def unescapeAll(string: str) -> str:
    """Undo backslash escapes and resolve entity references in ``string``.

    Every match of ``UNESCAPE_ALL_RE`` is rewritten: a backslash escape
    becomes the bare escaped character, an entity-like reference is handed to
    ``replaceEntityPattern``.
    """
    # Nothing can match without a backslash or an ampersand: skip the regex.
    if not ("\\" in string or "&" in string):
        return string

    def _expand(found):
        # group 1 is only set when the backslash alternative matched
        backslashed = found.group(1)
        if backslashed:
            return backslashed
        return replaceEntityPattern(found.group(), found.group(2))

    return UNESCAPE_ALL_RE.sub(_expand, string)
147
148
# Characters that a backslash may escape, written so the text can be dropped
# straight into a regex character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (group 1: that character).
ESCAPE_CHAR = re.compile(r"\\([%s])" % ESCAPABLE)


def stripEscape(string: str) -> str:
    """Drop the backslash in front of every escapable character."""
    return ESCAPE_CHAR.sub(lambda found: found.group(1), string)
156
157
158# //////////////////////////////////////////////////////////////////////////////
159
160# TODO This section changed quite a lot, should re-check
161
162# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")
163# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")
164# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')
165
166
167# def escapeHtml(string: str):
168
169#     if HTML_ESCAPE_REPLACE_RE.search(string):
170
171#         string = UNESCAPE_HTML_RE.sub("&", string)
172#         string = ESCAPE_AND_HTML.sub("&amp;", string)
173#         for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():
174#             string = string.replace(k, v)
175
176#     return string
177
178
def escapeHtml(raw: str) -> str:
    """Escape ``&``, ``<``, ``>`` and ``"`` for HTML; apostrophes stay literal."""
    # html.escape also turns "'" into "&#x27;", which is undone afterwards.
    # (Alternative that was considered: unescape ``raw`` first via html.unescape.)
    escaped = html.escape(raw, quote=True)
    return escaped.replace("&#x27;", "'")
182
183
184# //////////////////////////////////////////////////////////////////////////////
185
# Every character that has a special meaning in a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    r"""Backslash-escape every regex metacharacter in ``string``.

    ::

        escapeRE("a.b*") -> "a\.b\*"

    :param string: arbitrary text.
    :return: the text with a backslash in front of each of
        ``. ? * + ^ $ [ ] \ ( ) { } | -``, so that it matches literally when
        embedded in a regular expression.
    """
    # "$&" is JavaScript's spelling of "the whole match". In a Python
    # replacement template it is r"\g<0>", and r"\\" is a literal backslash.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)
192
193
194# //////////////////////////////////////////////////////////////////////////////
195
196
def isSpace(code: object) -> bool:
    """Tell whether ``code`` is the code point of a tab (0x09) or a space (0x20)."""
    space_codes = {0x20, 0x09}
    return code in space_codes
199
200
# Whitespace code points that are looked up individually; the contiguous
# run U+2000..U+200A is handled by a range check in isWhiteSpace.
MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Tell whether ``code`` is whitespace: Zs (unicode class) || [\t\f\v\r\n]"""
    in_2000_block = 0x2000 <= code <= 0x200A
    if in_2000_block:
        return True
    return code in MD_WHITESPACE
221
222
223# //////////////////////////////////////////////////////////////////////////////
224
# Matches one punctuation character. The first alternative is a character
# class of code points up to U+FFFF; the remaining alternatives spell astral
# characters as UTF-16 surrogate pairs.
# NOTE(review): the surrogate-pair alternatives presumably come from the
# JavaScript original and likely never match ordinary Python strings, which
# hold an astral character as a single code point - confirm.
UNICODE_PUNCT_RE = re.compile(
    r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]"  # noqa: E501
)
228
229
# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """Tell whether ``UNICODE_PUNCT_RE`` finds a punctuation character in ``ch``."""
    found = UNICODE_PUNCT_RE.search(ch)
    return found is not None
233
234
# Code points of the 32 ASCII punctuation characters recognised by Markdown:
# 0x21-0x2F, 0x3A-0x40, 0x5B-0x60 and 0x7B-0x7E.
MD_ASCII_PUNCT = {ord(symbol) for symbol in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"}


def isMdAsciiPunct(ch: int) -> bool:
    r"""Tell whether code point ``ch`` is a Markdown ASCII punctuation character.

    These are::

        ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    Not to be confused with unicode punctuation, which lacks some
    characters of the ASCII range.
    """
    return ch in MD_ASCII_PUNCT
284
285
def normalizeReference(string: str) -> str:
    """Return the canonical form of a [reference label].

    Surrounding whitespace is removed, inner whitespace runs collapse to one
    space, and letter case is unified, so that labels differing only in
    spacing or case compare equal.
    """
    # Step 1: trim, then collapse every run of whitespace into one space.
    trimmed = string.strip()
    single_spaced = re.sub(r"\s+", " ", trimmed)

    # Step 2: unify letter case by lowercasing first and uppercasing second.
    #
    # One conversion alone is not enough. Lowercasing leaves 125 code points
    # un-normalized, uppercasing leaves 6 (İ, ϴ, ẞ, Ω, K, Å - uppercase already,
    # yet with a differently uppercased twin).
    #
    # Example with the greek letter theta: uppercase U+0398 (Θ) and U+03F4 (ϴ),
    # lowercase U+03B8 (θ) and U+03D1 (ϑ). Unicode entries:
    #
    # 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
    # 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    # 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    # 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
    #
    # A case-insensitive comparison has to treat all four as the same letter,
    # but lowercasing keeps ϑ and uppercasing keeps ϴ. Doing both in sequence
    # maps '\u0398\u03f4\u03b8\u03d1' to '\u0398\u0398\u0398\u0398'.
    #
    # This corresponds to unicode case folding; unicode normalization is a
    # separate step that is not needed here.
    #
    # The JavaScript original finishes in upper case because the label is
    # later used as an object key, which avoids clashes with
    # Object.prototype members such as `__proto__`. It also special-cased
    # 'ẞ' for a node v10 lowercasing quirk; that workaround is not ported.
    lowered = single_spaced.lower()
    return lowered.upper()
335