#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2010, Kovid Goyal <kovid at kovidgoyal.net>


import re

from calibre.constants import preferred_encoding
from calibre_extensions.speedup import clean_xml_chars as _ncxc
from polyglot.builtins import codepoint_to_chr
from polyglot.html_entities import name2codepoint


def native_clean_xml_chars(x):
    '''
    Pass x through the clean_xml_chars implementation imported from
    calibre_extensions.speedup.

    Bytes input is first decoded using preferred_encoding. The result is
    whatever the imported function returns; judging by
    test_clean_xml_chars() below, that is the text with control characters
    and lone surrogates removed.
    '''
    if isinstance(x, bytes):
        x = x.decode(preferred_encoding)
    return _ncxc(x)


def ascii_pat(for_binary=False):
    '''
    Return a compiled regular expression matching any single ASCII control
    character: codepoints 0-31 except tab (9), LF (10) and CR (13), plus
    DEL (127).

    If for_binary is True the pattern is a bytes pattern, otherwise a text
    pattern. Each variant is compiled once and cached as an attribute on
    this function.
    '''
    attr = 'binary' if for_binary else 'text'
    ans = getattr(ascii_pat, attr, None)
    if ans is None:
        chars = set(range(32)) - {9, 10, 13}
        chars.add(127)
        pat = '|'.join(map(codepoint_to_chr, chars))
        if for_binary:
            pat = pat.encode('ascii')
        ans = re.compile(pat)
        setattr(ascii_pat, attr, ans)
    return ans


def clean_ascii_chars(txt, charlist=None):
    r'''
    Remove ASCII control chars.
    This is all control chars except \t, \n and \r

    txt may be text or bytes; the result has the same type. Falsy input
    yields an empty string (empty bytes if txt is a bytes object).

    If charlist is given it must be an iterable of integer codepoints;
    those characters are removed instead of the default set.
    '''
    is_binary = isinstance(txt, bytes)
    empty = b'' if is_binary else ''
    if not txt:
        return empty

    if charlist is None:
        pat = ascii_pat(is_binary)
    else:
        # Escape every character so regex metacharacters in charlist are
        # matched literally, and compile: a bare string has no .sub().
        pat = '|'.join(re.escape(codepoint_to_chr(c)) for c in charlist)
        if is_binary:
            pat = pat.encode('utf-8')
        pat = re.compile(pat)
    return pat.sub(empty, txt)


def allowed(x):
    '''
    Return True if the single character x is kept by py_clean_xml_chars().

    Accepted are tab, LF, CR and the ranges U+0020-U+D7FF, U+E000-U+FFFD
    and U+10000-U+10FFFF (bounds inclusive), with DEL (127) rejected.
    '''
    x = ord(x)
    if x == 127:
        return False
    return (
        x in (9, 10, 13) or 32 <= x <= 0xd7ff or
        0xe000 <= x <= 0xfffd or 0x10000 <= x <= 0x10ffff
    )


def py_clean_xml_chars(unicode_string):
    '''
    Pure Python cleaner: return unicode_string with every character for
    which allowed() is False removed.
    '''
    return ''.join(filter(allowed, unicode_string))


# native_clean_xml_chars is a function object and therefore always truthy, so
# this expression always selects the native implementation.
clean_xml_chars = native_clean_xml_chars or py_clean_xml_chars


def test_clean_xml_chars():
    '''
    Check native_clean_xml_chars() on a string containing a control
    character, a non-BMP character and lone surrogates; raise ValueError if
    the result is not the expected cleaned text.
    '''
    raw = 'asd\x02a\U00010437x\ud801b\udffe\ud802'
    if native_clean_xml_chars(raw) != 'asda\U00010437xb':
        raise ValueError('Failed to XML clean: %r' % raw)


# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.

def unescape(text, rm=False, rchar=''):
    '''
    Replace HTML/XML character references (&#65;, &#x41;) and named
    entities (&amp;) in text with the characters they stand for.

    :param text: The HTML (or XML) source text.
    :param rm: If True, references that cannot be resolved (unknown entity
        name, malformed or out of range number) are replaced by rchar.
        If False they are left in the output unchanged.
    :param rchar: Replacement used for unresolvable references when rm is
        True.
    :return: The text with all resolvable references substituted.
    '''
    def fixup(m, rm=rm, rchar=rchar):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference; the regex guarantees at least one
            # character after "&#", so text[2] always exists
            is_hex = text[2] in 'xX'
            try:
                if is_hex:
                    return codepoint_to_chr(int(text[3:-1], 16))
                return codepoint_to_chr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the Unicode range.
                # OverflowError: number too large to be converted at all.
                pass
        else:
            # named entity
            try:
                return codepoint_to_chr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        # Only unresolvable references reach this point
        if rm:
            return rchar  # replace by char
        return text  # leave as is
    return re.sub("&#?\\w+;", fixup, text)