1import re
2from html.entities import entitydefs
3
4# Helpers for xhtml-im
5
# Named entities that may be expanded to their replacement text: every entry
# of ``entitydefs`` except amp, quot, apos, gt and lt, which are never
# resolved through this table.
SAFE_ENTITIES = {
    name: replacement
    for name, replacement in entitydefs.items()
    if name not in {"amp", "quot", "apos", "gt", "lt"}
}
9
# Code points that a numeric character reference must not produce; a
# reference to one of them is replaced by the empty string unless
# ``_invalid_charrefs`` provides a substitute for it.
_invalid_codepoints = {
    *range(0x1, 0x8 + 1),  # 0x0001 to 0x0008
    0xB,
    *range(0xE, 0x1F + 1),  # 0x000E to 0x001F
    *range(0x7F, 0x9F + 1),  # 0x007F to 0x009F
    *range(0xFDD0, 0xFDEF + 1),  # 0xFDD0 to 0xFDEF
    # 0xFFFE and 0xFFFF at every 0x10000 offset from 0x0 up to 0x100000
    *(
        base + tail
        for base in range(0x0, 0x100000 + 1, 0x10000)
        for tail in (0xFFFE, 0xFFFF)
    ),
}
143
# Replacement text for specific numeric character references, keyed by the
# number that was referenced.
_invalid_charrefs = {
    0x00: "\ufffd",  # REPLACEMENT CHARACTER
    0x0D: "\r",  # CARRIAGE RETURN
    # The remaining keys are the contiguous run 0x80..0x9F; the replacements
    # are listed in ascending key order and numbered by enumerate().
    **dict(
        enumerate(
            (
                "\u20ac",  # 0x80 EURO SIGN
                "\x81",  # 0x81 <control>
                "\u201a",  # 0x82 SINGLE LOW-9 QUOTATION MARK
                "\u0192",  # 0x83 LATIN SMALL LETTER F WITH HOOK
                "\u201e",  # 0x84 DOUBLE LOW-9 QUOTATION MARK
                "\u2026",  # 0x85 HORIZONTAL ELLIPSIS
                "\u2020",  # 0x86 DAGGER
                "\u2021",  # 0x87 DOUBLE DAGGER
                "\u02c6",  # 0x88 MODIFIER LETTER CIRCUMFLEX ACCENT
                "\u2030",  # 0x89 PER MILLE SIGN
                "\u0160",  # 0x8A LATIN CAPITAL LETTER S WITH CARON
                "\u2039",  # 0x8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK
                "\u0152",  # 0x8C LATIN CAPITAL LIGATURE OE
                "\x8d",  # 0x8D <control>
                "\u017d",  # 0x8E LATIN CAPITAL LETTER Z WITH CARON
                "\x8f",  # 0x8F <control>
                "\x90",  # 0x90 <control>
                "\u2018",  # 0x91 LEFT SINGLE QUOTATION MARK
                "\u2019",  # 0x92 RIGHT SINGLE QUOTATION MARK
                "\u201c",  # 0x93 LEFT DOUBLE QUOTATION MARK
                "\u201d",  # 0x94 RIGHT DOUBLE QUOTATION MARK
                "\u2022",  # 0x95 BULLET
                "\u2013",  # 0x96 EN DASH
                "\u2014",  # 0x97 EM DASH
                "\u02dc",  # 0x98 SMALL TILDE
                "\u2122",  # 0x99 TRADE MARK SIGN
                "\u0161",  # 0x9A LATIN SMALL LETTER S WITH CARON
                "\u203a",  # 0x9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
                "\u0153",  # 0x9C LATIN SMALL LIGATURE OE
                "\x9d",  # 0x9D <control>
                "\u017e",  # 0x9E LATIN SMALL LETTER Z WITH CARON
                "\u0178",  # 0x9F LATIN CAPITAL LETTER Y WITH DIAERESIS
            ),
            start=0x80,
        )
    ),
}
180
181
def _replace_charref(s):
    """Return the replacement text for one matched character reference.

    Intended as the ``repl`` callback for ``_charref.sub``.

    ``s`` is a regex match object whose first group holds the reference
    without its leading ``&`` and with an optional trailing ``;`` (for
    example ``#38;``, ``#x26`` or ``nbsp;``).

    Numeric references: numbers listed in ``_invalid_charrefs`` map to the
    substitute stored there; surrogates (0xD800-0xDFFF) and numbers above
    0x10FFFF become U+FFFD; numbers in ``_invalid_codepoints`` become the
    empty string; anything else becomes ``chr(number)``.

    Named references: resolved through ``SAFE_ENTITIES`` only.  If the whole
    name is unknown, the longest known prefix of at least two characters is
    replaced and the remaining characters are kept verbatim.  When nothing
    matches, the reference is returned unchanged, ``&`` included.
    """
    ref = s.group(1)

    if ref[0] == "#":
        # numeric charref
        digits = ref[1:].rstrip(";")
        try:
            if digits[0] in "xX":
                num = int(digits[1:], 16)
            else:
                num = int(digits)
        except ValueError:
            # The regex guarantees well-formed digits, so the only failure is
            # the interpreter's cap on decimal string -> int conversion
            # (Python 3.11+).  A number that long is far above 0x10FFFF, so
            # it gets the same answer as any other out-of-range value.
            return "\uFFFD"
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return "\uFFFD"
        if num in _invalid_codepoints:
            return ""
        return chr(num)

    # named charref
    # SAFE_ENTITIES keys carry no trailing ";" while the matched text may, so
    # the terminator is stripped before the exact lookup; otherwise "nbsp;"
    # would only match as a prefix and leave a stray ";" in the output.
    name = ref[:-1] if ref.endswith(";") else ref
    if name in SAFE_ENTITIES:
        return SAFE_ENTITIES[name]

    # find the longest matching name; the unmatched tail is kept as text
    for end in range(len(ref) - 1, 1, -1):
        prefix = ref[:end]
        if prefix in SAFE_ENTITIES:
            return SAFE_ENTITIES[prefix] + ref[end:]

    # unknown (or deliberately excluded) entity: leave the reference as-is
    return "&" + ref
207
208
# Matches one character reference.  Group 1 is everything after the "&",
# including the optional trailing ";".
_charref = re.compile(
    # decimal numeric reference, e.g. "&#38;"
    r"&(#[0-9]+;?"
    # hexadecimal numeric reference, e.g. "&#x26;"
    r"|#[xX][0-9a-fA-F]+;?"
    # named reference: 1 to 32 characters other than tab, newline, form
    # feed, space, "<", "&", "#" and ";"
    r"|[^\t\n\f <&#;]{1,32};?)"
)
212
213
def unescape(s):
    """Return ``s`` with its character references replaced.

    Every match of ``_charref`` is rewritten by ``_replace_charref``.  A
    string that contains no ``&`` cannot hold a reference and is returned
    untouched without running the regex.
    """
    return _charref.sub(_replace_charref, s) if "&" in s else s
218