import re
from html.entities import entitydefs

# Helpers for xhtml-im

# The five entities predefined by XML. They are excluded from SAFE_ENTITIES,
# so unescape() leaves their named references untouched (presumably so that
# the result stays well-formed XML -- confirm against callers).
_XML_PREDEFINED = ("amp", "quot", "apos", "gt", "lt")

# Entity name (without "&" or ";") -> replacement text, for every entry of
# html.entities.entitydefs except the XML-predefined ones.
SAFE_ENTITIES = {
    name: char for name, char in entitydefs.items() if name not in _XML_PREDEFINED
}

# Code points whose numeric character reference expands to nothing.
_invalid_codepoints = {
    *range(0x1, 0x9),  # 0x0001 to 0x0008
    *range(0xE, 0x20),  # 0x000E to 0x001F
    *range(0x7F, 0xA0),  # 0x007F to 0x009F
    *range(0xFDD0, 0xFDF0),  # 0xFDD0 to 0xFDEF
    # others
    0xB,
    # 0xFFFE and 0xFFFF of every plane, from 0x00 up to and including 0x10
    *(plane << 16 | low for plane in range(0x11) for low in (0xFFFE, 0xFFFF)),
}

# Numeric character references that are remapped to another character.
# This table is consulted before _invalid_codepoints, so it wins for the
# 0x80-0x9F entries that appear in both.
_invalid_charrefs = {
    0x00: "\ufffd",  # REPLACEMENT CHARACTER
    0x0D: "\r",  # CARRIAGE RETURN
    0x80: "\u20ac",  # EURO SIGN
    0x81: "\x81",  # <control>
    0x82: "\u201a",  # SINGLE LOW-9 QUOTATION MARK
    0x83: "\u0192",  # LATIN SMALL LETTER F WITH HOOK
    0x84: "\u201e",  # DOUBLE LOW-9 QUOTATION MARK
    0x85: "\u2026",  # HORIZONTAL ELLIPSIS
    0x86: "\u2020",  # DAGGER
    0x87: "\u2021",  # DOUBLE DAGGER
    0x88: "\u02c6",  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: "\u2030",  # PER MILLE SIGN
    0x8A: "\u0160",  # LATIN CAPITAL LETTER S WITH CARON
    0x8B: "\u2039",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8C: "\u0152",  # LATIN CAPITAL LIGATURE OE
    0x8D: "\x8d",  # <control>
    0x8E: "\u017d",  # LATIN CAPITAL LETTER Z WITH CARON
    0x8F: "\x8f",  # <control>
    0x90: "\x90",  # <control>
    0x91: "\u2018",  # LEFT SINGLE QUOTATION MARK
    0x92: "\u2019",  # RIGHT SINGLE QUOTATION MARK
    0x93: "\u201c",  # LEFT DOUBLE QUOTATION MARK
    0x94: "\u201d",  # RIGHT DOUBLE QUOTATION MARK
    0x95: "\u2022",  # BULLET
    0x96: "\u2013",  # EN DASH
    0x97: "\u2014",  # EM DASH
    0x98: "\u02dc",  # SMALL TILDE
    0x99: "\u2122",  # TRADE MARK SIGN
    0x9A: "\u0161",  # LATIN SMALL LETTER S WITH CARON
    0x9B: "\u203a",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9C: "\u0153",  # LATIN SMALL LIGATURE OE
    0x9D: "\x9d",  # <control>
    0x9E: "\u017e",  # LATIN SMALL LETTER Z WITH CARON
    0x9F: "\u0178",  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}


def _replace_charref(s):
    """Return the replacement text for a single ``_charref`` match.

    Args:
        s: Match object produced by ``_charref``; group 1 is the reference
            without its leading ``&`` and with its optional trailing ``;``.

    Returns:
        The decoded character(s), an empty string for code points listed in
        ``_invalid_codepoints``, U+FFFD for surrogates and out-of-range
        values, or the original text when the name is not in
        ``SAFE_ENTITIES``.
    """
    ref = s.group(1)
    if ref[0] == "#":
        # numeric charref
        digits = ref[1:].rstrip(";")
        if digits[0] in "xX":
            num = int(digits[1:], 16)
        else:
            try:
                num = int(digits)
            except ValueError:
                # The regex guarantees only decimal digits here, so the sole
                # failure mode is the interpreter's limit on decimal string
                # length; such a value is far beyond 0x10FFFF anyway.
                return "\uFFFD"
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return "\uFFFD"
        if num in _invalid_codepoints:
            return ""
        # NOTE(review): numeric references to "&", "<", ... are decoded here
        # even though their named forms are kept escaped -- confirm that this
        # is what callers expect.
        return chr(num)

    # named charref
    # SAFE_ENTITIES keys carry no trailing ";", so drop the terminator before
    # the exact lookup; otherwise "&nbsp;" would only match as a prefix and
    # the ";" would leak into the output.
    name = ref[:-1] if ref.endswith(";") else ref
    if name in SAFE_ENTITIES:
        return SAFE_ENTITIES[name]
    # find the longest matching name (as defined by the standard)
    for end in range(len(name) - 1, 1, -1):
        prefix = ref[:end]
        if prefix in SAFE_ENTITIES:
            return SAFE_ENTITIES[prefix] + ref[end:]
    # unknown (or deliberately excluded) name: leave the reference as-is
    return "&" + ref


# "&" followed by a decimal reference, a hexadecimal reference, or a name of
# at most 32 characters, each with an optional trailing ";".
_charref = re.compile(
    r"&(#[0-9]+;?"
    r"|#[xX][0-9a-fA-F]+;?"
    r"|[^\t\n\f <&#;]{1,32};?)"
)


def unescape(s):
    """Expand character references in *s*, keeping XML-predefined names.

    Numeric references (``&#65;``, ``&#x41;``) and named references found in
    ``SAFE_ENTITIES`` are replaced by the characters they stand for. Named
    references that are not in ``SAFE_ENTITIES`` -- including ``&amp;``,
    ``&lt;``, ``&gt;``, ``&quot;`` and ``&apos;`` -- are returned unchanged.

    Args:
        s: Text that may contain character references.

    Returns:
        The text with the recognised references expanded.
    """
    if "&" not in s:
        # fast path: nothing that could be a reference
        return s
    return _charref.sub(_replace_charref, s)