"""Utilities for parsing source text
"""
import html
import re
from typing import Any

from .entities import entities


def charCodeAt(src: str, pos: int) -> Any:
    """Return the Unicode code point of the character at ``pos``.

    Python counterpart of JavaScript's ``String.prototype.charCodeAt``,
    kept for compatibility with the original implementation.

    :param src: the string to index
    :param pos: zero-based index of the desired character
    :return: the code point as an ``int``, or ``None`` (where JavaScript
        returns ``NaN``) if there is no character at ``pos``
    """
    # Plain Python indexing would let a negative index wrap around to the end
    # of the string; JavaScript treats it as "no character", so do the same.
    if 0 <= pos < len(src):
        return ord(src[pos])
    return None


# Merge objects
#
def assign(obj):
    """Merge objects /*from1, from2, from3, ...*/) -- not implemented.

    :raises NotImplementedError: always; the JavaScript original is kept
        below for reference only.
    """
    raise NotImplementedError
    # sources = Array.prototype.slice.call(arguments, 1)

    # sources.forEach(function (source) {
    #   if (!source) { return; }

    #   if (typeof source !== 'object') {
    #     throw new TypeError(source + 'must be object')
    #   }

    #   Object.keys(source).forEach(function (key) {
    #     obj[key] = source[key]
    #   })
    # })

    # return obj


def arrayReplaceAt(src: list, pos: int, newElements: list) -> list:
    """Return a copy of ``src`` with the element at ``pos`` replaced.

    The single element at ``pos`` is removed and all of ``newElements`` are
    spliced in at that position. Useful for some operations with tokens.
    ``src`` itself is not modified.
    """
    return src[:pos] + newElements + src[pos + 1 :]


######################################################################


def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` may be produced by a numeric entity.

    :param c: the candidate code point
    :return: ``False`` for surrogates, non-characters, control codes and
        values beyond the Unicode range; ``True`` otherwise
    """
    # broken sequence
    if 0xD800 <= c <= 0xDFFF:
        return False
    # never used
    if 0xFDD0 <= c <= 0xFDEF:
        return False
    if (c & 0xFFFF) in (0xFFFF, 0xFFFE):
        return False
    # control codes
    if 0x00 <= c <= 0x08:
        return False
    if c == 0x0B:
        return False
    if 0x0E <= c <= 0x1F:
        return False
    if 0x7F <= c <= 0x9F:
        return False
    # out of range
    if c > 0x10FFFF:
        return False
    return True


def fromCodePoint(c: int) -> str:
    """Convert ordinal to unicode.

    Note, in the original Javascript two string characters were required,
    for codepoints larger than `0xFFFF`.
    But Python 3 can represent any unicode codepoint in one character.
    """
    return chr(c)


UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)


def replaceEntityPattern(match: str, name: str) -> str:
    """Convert HTML entity patterns

    ::

        replaceEntityPattern("&#35;", "#35") -> "#"
        replaceEntityPattern("&#x23;", "#x23") -> "#"

    :param match: the complete matched text (e.g. ``"&#35;"``), returned
        unchanged when the entity cannot be decoded
    :param name: the entity name without the leading ``&`` and trailing ``;``
    :return: the decoded character(s), or ``match`` if ``name`` is neither a
        known named entity nor a valid numeric character reference
    """
    if name in entities:
        return entities[name]

    if name.startswith("#"):
        # The whole name must be a numeric reference. A prefix-only check
        # would accept e.g. "#12ab" and then crash in int().
        numeric = DIGITAL_ENTITY_TEST_RE.fullmatch(name)
        if numeric:
            digits = numeric.group(1)
            if digits[0] in "xX":
                code = int(digits[1:], 16)
            else:
                code = int(digits, 10)
            if isValidEntityCode(code):
                return fromCodePoint(code)

    return match


# def replaceEntities(string):
#     if (string.indexOf('&') < 0):
#         return string
#     return string.replace(ENTITY_RE, replaceEntityPattern)


def unescapeMd(string: str) -> str:
    """Remove markdown backslash escapes -- not implemented.

    :raises NotImplementedError: always; a sketch of the intended logic is
        kept below for reference only.
    """
    raise NotImplementedError
    # if "\\" not in string:
    #     return string
    # return UNESCAPE_MD_RE.sub(r"\1", string)


def unescapeAll(string: str) -> str:
    """Resolve both backslash escapes and HTML entities in ``string``.

    :param string: the raw text
    :return: the text with ``\\<punct>`` replaced by ``<punct>`` and each
        decodable ``&name;`` / ``&#N;`` / ``&#xN;`` replaced by its character;
        undecodable entities are left as they are
    """

    def replacer_func(match):
        escaped = match.group(1)
        if escaped:
            # first alternative: a backslash-escaped punctuation character
            return escaped
        entity = match.group(2)
        return replaceEntityPattern(match.group(), entity)

    # Fast path: nothing that either alternative of the pattern could match.
    if "\\" not in string and "&" not in string:
        return string
    return UNESCAPE_ALL_RE.sub(replacer_func, string)


ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])")


def stripEscape(string: str) -> str:
    """Strip escape \\ characters"""
    return ESCAPE_CHAR.sub(r"\1", string)


# //////////////////////////////////////////////////////////////////////////////

# TODO This section changed quite a lot, should re-check

# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")
# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")
# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')


# def escapeHtml(string: str):

#     if HTML_ESCAPE_REPLACE_RE.search(string):

#         string = UNESCAPE_HTML_RE.sub("&", string)
#         string = ESCAPE_AND_HTML.sub("&amp;", string)
#         for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():
#             string = string.replace(k, v)

#     return string


def escapeHtml(raw: str) -> str:
    """Escape ``&``, ``<``, ``>`` and ``"`` for safe inclusion in HTML.

    :param raw: the text to escape
    :return: the escaped text; single quotes are left untouched
    """
    # return html.escape(html.unescape(raw)).replace("&#x27;", "'")
    # html.escape() also turns "'" into "&#x27;"; undo that so that only the
    # four characters listed above are escaped.
    return html.escape(raw).replace("&#x27;", "'")


# //////////////////////////////////////////////////////////////////////////////

REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape every regex metacharacter in ``string``.

    :param string: literal text that is going to be embedded in a pattern
    :return: the text with each character matched by ``REGEXP_ESCAPE_RE``
        prefixed by a backslash
    """
    # "\g<0>" is Python's spelling of JavaScript's "$&" (the whole match);
    # "$&" itself would be inserted literally by re.sub.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)


# //////////////////////////////////////////////////////////////////////////////


def isSpace(code: object) -> bool:
    """Tell whether ``code`` is the code point of a tab or a space."""
    return code in {0x09, 0x20}


MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]"""
    # U+2000..U+200A is a contiguous run, so it is tested as a range instead
    # of being listed in MD_WHITESPACE.
    if 0x2000 <= code <= 0x200A:
        return True
    return code in MD_WHITESPACE


# //////////////////////////////////////////////////////////////////////////////

UNICODE_PUNCT_RE = re.compile(
    r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]"  # noqa: E501
)


# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """Tell whether ``UNICODE_PUNCT_RE`` matches anywhere in ``ch``.

    :param ch: the text to test (normally a single character)
    :return: ``True`` if a match is found, ``False`` otherwise
    """
    return bool(UNICODE_PUNCT_RE.search(ch))


# Code points of the 32 ASCII punctuation characters recognised by markdown:
# 0x21-0x2F, 0x3A-0x40, 0x5B-0x60 and 0x7B-0x7E.
MD_ASCII_PUNCT = {
    ord(char)
    for char in (
        "!\"#$%&'()*+,-./"  # 0x21 .. 0x2F
        ":;<=>?@"  # 0x3A .. 0x40
        "[\\]^_`"  # 0x5B .. 0x60
        "{|}~"  # 0x7B .. 0x7E
    )
}


def isMdAsciiPunct(ch: int) -> bool:
    r"""Tell whether code point ``ch`` is a markdown ASCII punctuation character.

    The recognised characters are::

        ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    This is not the same thing as unicode punctuation, which lacks some of
    the characters in the ascii range.

    :param ch: a code point (not a one-character string)
    """
    return ch in MD_ASCII_PUNCT


def normalizeReference(string: str) -> str:
    """Bring a [reference label] into canonical form for comparison.

    :param string: the raw label text
    :return: the label with surrounding whitespace removed, inner whitespace
        runs collapsed to a single space, and case differences removed by
        lowercasing and then uppercasing
    """
    # Step 1: trim, then squeeze every whitespace run down to one space.
    collapsed = re.sub(r"\s+", " ", string.strip())

    # Step 2: lowercase first, uppercase second.
    #
    # One conversion alone does not unify all letter variants: lowercasing
    # alone misses 125 code points, and uppercasing alone misses 6 (İ, ϴ, ẞ,
    # Ω, K, Å -- already uppercase, but with a differently uppercased twin).
    #
    # Example with the greek letter theta: uppercase U+0398 (Θ) and
    # U+03f4 (ϴ), lowercase U+03b8 (θ) and U+03d1 (ϑ).
    #
    #   0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
    #   03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    #   03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    #   03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
    #
    # A case-insensitive comparison has to treat all four as the same letter,
    # but lowercasing leaves ϑ alone (already lowercase) and uppercasing
    # leaves ϴ alone (already uppercase). Lower-then-upper maps all of them
    # to U+0398:
    #
    #   '\u0398\u03f4\u03b8\u03d1' -> '\u0398\u0398\u0398\u0398'
    #
    # This is equivalent to unicode case folding; unicode normalization is a
    # separate step that is not needed here.
    #
    # Notes carried over from the JavaScript original:
    # - node v10 lowercased 'ẞ' to 'Ṿ' (presumed to be a bug, fixed in v12),
    #   so the original special-cased 'ẞ' -> 'ß' before normalizing; that
    #   workaround is not reproduced here.
    # - the result is uppercase because the original stores it as an object
    #   key, and uppercase keys cannot clash with Object.prototype members
    #   such as `__proto__`.
    lowered = collapsed.lower()
    return lowered.upper()