1#!/usr/local/bin/python3.8
2# vim:fileencoding=utf-8
3# License: GPLv3 Copyright: 2010, Kovid Goyal <kovid at kovidgoyal.net>
4
5
6import re
7
8from calibre.constants import preferred_encoding
9from calibre_extensions.speedup import clean_xml_chars as _ncxc
10from polyglot.builtins import codepoint_to_chr
11from polyglot.html_entities import name2codepoint
12
13
def native_clean_xml_chars(x):
    '''
    Clean ``x`` using the ``clean_xml_chars`` function from the compiled
    ``speedup`` extension.

    Bytes input is first decoded with ``preferred_encoding``; any other
    input is handed to the extension function unchanged.
    '''
    text = x.decode(preferred_encoding) if isinstance(x, bytes) else x
    return _ncxc(text)
18
19
def ascii_pat(for_binary=False):
    '''
    Return a compiled regular expression matching a single ASCII control
    character: code points 0-31 except 9, 10 and 13, plus 127.

    When ``for_binary`` is true the pattern is compiled from bytes,
    otherwise from text. Each variant is compiled once and stored as an
    attribute (``binary`` or ``text``) on this function for reuse.
    '''
    cache_name = 'binary' if for_binary else 'text'
    cached = getattr(ascii_pat, cache_name, None)
    if cached is not None:
        return cached

    codepoints = set(range(32)) - {9, 10, 13}
    codepoints.add(127)
    source = '|'.join(map(codepoint_to_chr, codepoints))
    if for_binary:
        source = source.encode('ascii')

    cached = re.compile(source)
    setattr(ascii_pat, cache_name, cached)
    return cached
32
33
def clean_ascii_chars(txt, charlist=None):
    r'''
    Remove ASCII control chars.
    This is all control chars except \t, \n and \r

    :param txt: The text to clean, either ``str`` or ``bytes``. A falsy
        value yields an empty string of the matching type.
    :param charlist: Optional iterable of integer code points. When given,
        exactly these characters are removed instead of the default set of
        control characters.
    :return: ``txt`` with the selected characters removed, of the same type
        (``str`` or ``bytes``) as the input.
    '''
    is_binary = isinstance(txt, bytes)
    empty = b'' if is_binary else ''
    if not txt:
        return empty

    if charlist is None:
        pat = ascii_pat(is_binary)
    else:
        # Escape every character so that regex metacharacters in charlist
        # (e.g. '.', '*', '|') are matched literally.
        alternatives = [re.escape(chr(c)) for c in charlist]
        if not alternatives:
            # Nothing was requested for removal.
            return txt
        source = '|'.join(alternatives)
        if is_binary:
            source = source.encode('utf-8')
        # The joined pattern must be compiled: a plain str/bytes object has
        # no .sub() method.
        pat = re.compile(source)
    return pat.sub(empty, txt)
51
52
def allowed(x):
    '''
    Return True if the single character ``x`` may be kept in XML text.

    The accepted code points are tab, newline and carriage return
    (9, 10, 13) plus the inclusive ranges 0x20-0xD7FF, 0xE000-0xFFFD and
    0x10000-0x10FFFF, i.e. the XML 1.0 ``Char`` production. DEL (0x7F) is
    additionally rejected.
    '''
    x = ord(x)
    if x in (9, 10, 13):
        return True
    # Range ends are inclusive: U+D7FF, U+E000, U+FFFD (REPLACEMENT
    # CHARACTER), U+10000 and U+10FFFF are all valid XML characters.
    return (
        (0x20 <= x <= 0xd7ff and x != 0x7f)
        or 0xe000 <= x <= 0xfffd
        or 0x10000 <= x <= 0x10ffff
    )
56
57
def py_clean_xml_chars(unicode_string):
    '''
    Pure Python XML cleaner: return ``unicode_string`` with every character
    for which ``allowed()`` is false dropped.
    '''
    kept = [ch for ch in unicode_string if allowed(ch)]
    return ''.join(kept)
60
61
# native_clean_xml_chars is a function defined in this module and is therefore
# always truthy, so an ``or py_clean_xml_chars`` fallback could never be
# selected; a missing speedup extension already fails at import time.
clean_xml_chars = native_clean_xml_chars
63
64
def test_clean_xml_chars():
    '''
    Self-check for native_clean_xml_chars(): a control character and lone
    surrogates must be stripped while a non-BMP character is preserved.

    Raises ValueError if the cleaned output differs from the expected text.
    '''
    raw = 'asd\x02a\U00010437x\ud801b\udffe\ud802'
    expected = 'asda\U00010437xb'
    cleaned = native_clean_xml_chars(raw)
    if cleaned == expected:
        return
    raise ValueError('Failed to XML clean: %r' % raw)
69
70
71# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
72# Removes HTML or XML character references and entities from a text string.
73#
74# @param text The HTML (or XML) source text.
75# @return The plain text, as a Unicode string, if necessary.
76
def unescape(text, rm=False, rchar=''):
    '''
    Replace HTML/XML character references and named entities in ``text``
    with the characters they denote.

    :param text: The HTML (or XML) source text.
    :param rm: When false (the default), anything matching the entity
        syntax that cannot be decoded is left in the text unchanged. When
        true, it is replaced by ``rchar`` instead.
    :param rchar: Replacement string used when ``rm`` is true.
    :return: The text with references substituted.

    NOTE(review): with ``rm`` true, named entities are replaced by ``rchar``
    even when they are known, whereas numeric references that decode
    successfully are still converted to their character. This asymmetry is
    long-standing behaviour and is preserved here -- confirm it is intended.
    '''
    def fixup(m, rm=rm, rchar=rchar):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return chr(int(text[3:-1], 16))
                else:
                    return chr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: malformed digits or a code point above
                # 0x10FFFF. OverflowError: the number is too large for
                # chr() to accept at all (e.g. "&#99999999999999999999;").
                pass
        else:
            # named entity
            try:
                text = chr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        if rm:
            return rchar  # replace by char
        return text  # leave as is
    return re.sub("&#?\\w+;", fixup, text)
99