1#!/usr/local/bin/python3.8
2# vim:fileencoding=utf-8
3
4
5__license__ = 'GPL v3'
6__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
7
8import sys
9from collections import defaultdict, Counter
10
11from calibre import replace_entities
12from calibre.spell.break_iterator import split_into_words, index_of
13from calibre.spell.dictionary import parse_lang_code
14from calibre.ebooks.oeb.base import barename
15from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
16from calibre.ebooks.oeb.polish.parsing import parse
17from calibre.ebooks.oeb.polish.toc import find_existing_ncx_toc, find_existing_nav_toc
18from calibre.utils.icu import ord_string
19from polyglot.builtins import iteritems
20
# Process-wide cache for the compiled regex patterns; populated lazily by patterns()
_patterns = None
22
23
class Patterns:
    """Holder for the compiled regular expressions used by this module.

    Built lazily via the module-level patterns() accessor so the (relatively
    expensive) compilation happens at most once per process. Uses the
    third-party ``regex`` module rather than stdlib ``re`` for its
    VERSION1/WORD/FULLCASE Unicode semantics.
    """

    __slots__ = ('sanitize_invisible_pat', 'split_pat', 'digit_pat', 'fr_elision_pat')

    def __init__(self):
        import regex
        # Remove soft hyphens/zero width spaces/control codes
        self.sanitize_invisible_pat = regex.compile(
            r'[\u00ad\u200b\u200c\u200d\ufeff\0-\x08\x0b\x0c\x0e-\x1f\x7f]', regex.VERSION1 | regex.UNICODE)
        # Split text on runs of non-word characters
        self.split_pat = regex.compile(
            r'\W+', flags=regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE)
        # Matches tokens consisting entirely of decimal digits (skipped by filter_words)
        self.digit_pat = regex.compile(
            r'^\d+$', flags=regex.VERSION1 | regex.WORD | regex.UNICODE)
        # French words with prefixes are reduced to the stem word, so that the
        # words appear only once in the word list
        self.fr_elision_pat = regex.compile(
            "^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE)
41
42
def patterns():
    """Return the process-wide Patterns instance, creating it on first use."""
    global _patterns
    cached = _patterns
    if cached is None:
        cached = _patterns = Patterns()
    return cached
48
49
class CharCounter:
    """Accumulates character statistics for a book.

    counter maps codepoint -> total occurrences; chars maps codepoint -> the
    set of file names in which it appears. update is an alias for
    counter.update so callers can feed counts in directly.
    """

    def __init__(self):
        # Which files contain each codepoint
        self.chars = defaultdict(set)
        # Total occurrences of each codepoint
        self.counter = Counter()
        # Expose the Counter's update method directly on this object
        self.update = self.counter.update
56
57
class Location:
    """One occurrence of a word inside a checked file.

    location_node/node_item identify where the word came from (the lxml node
    and whether it was in an attribute or in text/tail); elided_prefix holds a
    stripped French elision such as "l'" so replacements can restore it.
    """

    __slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item', 'elided_prefix')

    def __init__(self, file_name=None, elided_prefix='', original_word=None, location_node=None, node_item=(None, None)):
        self.file_name = file_name
        self.elided_prefix = elided_prefix
        self.original_word = original_word
        self.location_node = location_node
        self.node_item = node_item
        # Cache the source line for sorting, since nodes may be replaced later
        self.sourceline = location_node.sourceline

    def __repr__(self):
        return f'{self.original_word} @ {self.file_name}:{self.sourceline}'
    __str__ = __repr__

    def replace(self, new_word):
        # Re-attach the elided prefix so the stored word matches the file text
        self.original_word = self.elided_prefix + new_word
72
73
def filter_words(word):
    """Return True for words worth recording: non-empty and not purely digits."""
    if not word:
        return False
    return patterns().digit_pat.match(word) is None
81
82
def get_words(text, lang):
    """Split *text* into spell-checkable words for *lang*.

    Returns an empty tuple when the text cannot be split (bad type or value).
    """
    try:
        candidates = split_into_words(str(text), lang)
    except (TypeError, ValueError):
        return ()
    return [w for w in candidates if filter_words(w)]
89
90
def add_words(text, node, words, file_name, locale, node_item):
    """Split *text* into words and record a Location for each in *words*.

    words maps (sanitized_word, locale) -> list of Locations, with the special
    key None holding a running total word count. For French, a leading elision
    (l', d', qu'...) is stripped so only the stem is recorded; the prefix is
    kept on the Location for faithful replacement later.
    """
    candidates = get_words(text, locale.langcode)
    if candidates:
        p = patterns()
        is_fr = locale.langcode == 'fra'
        for word in candidates:
            sword = p.sanitize_invisible_pat.sub('', word).strip()
            elided_prefix = ''
            if is_fr:
                m = p.fr_elision_pat.match(sword)
                # Only strip the elision when a stem remains. The previous
                # check compared against len(elided_prefix), which is always 0
                # here, so a word consisting solely of the elision (e.g. "l'")
                # was reduced to the empty string and recorded as a word.
                if m is not None and len(m.group()) < len(sword):
                    elided_prefix = m.group()
                    sword = sword[len(elided_prefix):]
            loc = Location(file_name, elided_prefix, word, node, node_item)
            words[(sword, locale)].append(loc)
            words[None] += 1
107
108
def add_chars(text, counter, file_name):
    """Fold the characters of *text* into *counter* (a CharCounter), recording
    *file_name* as a user of each codepoint seen."""
    if not text:
        return
    if isinstance(text, bytes):
        # Best-effort decode; malformed bytes are dropped rather than raising
        text = text.decode('utf-8', 'ignore')
    freq = Counter(ord_string(text))
    counter.update(freq)
    for codepoint in freq:
        counter.chars[codepoint].add(file_name)
117
118
def add_words_from_attr(node, attr, words, file_name, locale):
    """Record words found in the value of *attr* on *node*, if present."""
    value = node.get(attr, None)
    if not value:
        return
    add_words(value, node, words, file_name, locale, (True, attr))
123
124
def count_chars_in_attr(node, attr, counter, file_name, locale):
    """Count characters in the value of *attr* on *node*.

    *locale* is unused; kept so all count_* helpers share one signature.
    """
    value = node.get(attr, None)
    if not value:
        return
    add_chars(value, counter, file_name)
129
130
def add_words_from_text(node, attr, words, file_name, locale):
    """Record words from a text-bearing property of *node* ('text' or 'tail')."""
    content = getattr(node, attr)
    add_words(content, node, words, file_name, locale, (False, attr))
133
134
def count_chars_in_text(node, attr, counter, file_name, locale):
    """Count characters in a text-bearing property of *node* ('text' or 'tail').

    *locale* is unused; kept so all count_* helpers share one signature.
    """
    content = getattr(node, attr)
    add_chars(content, counter, file_name)
137
138
def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
    """Harvest words from HTML stored as escaped text (e.g. OPF <description>).

    The markup is unescaped, parsed into a throwaway document and read like
    normal HTML; the resulting Locations are then re-pointed at the original
    *node*/*attr* so replacements edit the real file.
    """
    markup = '<html><body><div>%s</div></body></html>' % replace_entities(text)
    root = parse(markup, decoder=lambda raw: raw.decode('utf-8'))
    ewords = defaultdict(list)
    ewords[None] = 0
    read_words_from_html(root, ewords, file_name, locale)
    words[None] += ewords.pop(None)
    for key, locations in iteritems(ewords):
        for location in locations:
            location.location_node, location.node_item = node, (False, attr)
        words[key].extend(locations)
150
151
def count_chars_in_escaped_html(text, counter, file_name, node, attr, locale):
    """Count characters in HTML stored as escaped text (e.g. OPF <description>)."""
    markup = '<html><body><div>%s</div></body></html>' % replace_entities(text)
    root = parse(markup, decoder=lambda raw: raw.decode('utf-8'))
    count_chars_in_html(root, counter, file_name, locale)
156
157
# Clark-notation name of the opf:file-as attribute (e.g. on <dc:creator>)
_opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
# OPF metadata elements whose text content is spell checked
opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}
160
161# We can only use barename() for tag names and simple attribute checks so that
162# this code matches up with the syntax highlighter base spell checking
163
164
def read_words_from_opf(root, words, file_name, book_locale):
    """Harvest words from the spell-checkable metadata elements of an OPF."""
    for elem in root.iterdescendants('*'):
        if elem.text is not None:
            tag_name = barename(elem.tag)
            if tag_name in opf_spell_tags:
                if tag_name == 'description':
                    # Description content is stored as escaped HTML
                    add_words_from_escaped_html(elem.text, words, file_name, elem, 'text', book_locale)
                else:
                    add_words_from_text(elem, 'text', words, file_name, book_locale)
        add_words_from_attr(elem, _opf_file_as, words, file_name, book_locale)
173
174
def count_chars_in_opf(root, counter, file_name, book_locale):
    """Count characters in the spell-checkable metadata elements of an OPF."""
    for elem in root.iterdescendants('*'):
        if elem.text is not None:
            tag_name = barename(elem.tag)
            if tag_name in opf_spell_tags:
                if tag_name == 'description':
                    # Description content is stored as escaped HTML
                    count_chars_in_escaped_html(elem.text, counter, file_name, elem, 'text', book_locale)
                else:
                    count_chars_in_text(elem, 'text', counter, file_name, book_locale)
        count_chars_in_attr(elem, _opf_file_as, counter, file_name, book_locale)
183
184
# NCX elements whose text content is spell checked
ncx_spell_tags = {'text'}
# All XML (non-HTML) tag names subject to spell checking
xml_spell_tags = opf_spell_tags | ncx_spell_tags
187
188
def read_words_from_ncx(root, words, file_name, book_locale):
    """Harvest words from every <text> element of an NCX table of contents."""
    for elem in root.xpath('//*[local-name()="text"]'):
        if elem.text is None:
            continue
        add_words_from_text(elem, 'text', words, file_name, book_locale)
193
194
def count_chars_in_ncx(root, counter, file_name, book_locale):
    """Count characters in every <text> element of an NCX table of contents."""
    for elem in root.xpath('//*[local-name()="text"]'):
        if elem.text is None:
            continue
        count_chars_in_text(elem, 'text', counter, file_name, book_locale)
199
200
# HTML tags whose content is EXCLUDED from spell checking (despite the name)
html_spell_tags = {'script', 'style', 'link'}
202
203
def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
    """Harvest words from one HTML element: its text, selected attributes and
    its tail. The tail text belongs to the parent element's locale."""
    if tag.text is not None and barename(tag.tag) not in html_spell_tags:
        add_words_from_text(tag, 'text', words, file_name, locale)
    for attr in {'alt', 'title'}:
        add_words_from_attr(tag, attr, words, file_name, locale)
    parent = tag.getparent()
    if tag.tail is not None and parent is not None and barename(parent.tag) not in html_spell_tags:
        add_words_from_text(tag, 'tail', words, file_name, parent_locale)
211
212
def count_chars_in_html_tag(tag, counter, file_name, parent_locale, locale):
    """Count characters in one HTML element: its text, selected attributes and
    its tail. The tail text belongs to the parent element's locale."""
    if tag.text is not None and barename(tag.tag) not in html_spell_tags:
        count_chars_in_text(tag, 'text', counter, file_name, locale)
    for attr in {'alt', 'title'}:
        count_chars_in_attr(tag, attr, counter, file_name, locale)
    parent = tag.getparent()
    if tag.tail is not None and parent is not None and barename(parent.tag) not in html_spell_tags:
        count_chars_in_text(tag, 'tail', counter, file_name, parent_locale)
220
221
def locale_from_tag(tag):
    """Return the locale declared on *tag* via its lang or xml:lang attribute.

    lang= takes precedence over xml:lang=. Returns None when neither attribute
    is present or neither parses as a valid language code.
    """
    for attr in ('lang', '{http://www.w3.org/XML/1998/namespace}lang'):
        if attr not in tag.attrib:
            continue
        try:
            loc = parse_lang_code(tag.get(attr))
        except ValueError:
            loc = None
        if loc is not None:
            return loc
237
238
def read_words_from_html(root, words, file_name, book_locale):
    """Walk the HTML tree collecting words, tracking per-element locale
    overrides (lang/xml:lang) as the walk descends."""
    pending = [(root, book_locale)]
    while pending:
        elem, inherited_locale = pending.pop()
        effective_locale = locale_from_tag(elem) or inherited_locale
        read_words_from_html_tag(elem, words, file_name, inherited_locale, effective_locale)
        for child in elem.iterchildren('*'):
            pending.append((child, effective_locale))
246
247
def count_chars_in_html(root, counter, file_name, book_locale):
    """Walk the HTML tree counting characters, tracking per-element locale
    overrides (lang/xml:lang) as the walk descends."""
    pending = [(root, book_locale)]
    while pending:
        elem, inherited_locale = pending.pop()
        effective_locale = locale_from_tag(elem) or inherited_locale
        count_chars_in_html_tag(elem, counter, file_name, inherited_locale, effective_locale)
        for child in elem.iterchildren('*'):
            pending.append((child, effective_locale))
255
256
def group_sort(locations):
    """Sort Locations by file (in first-seen order) then by source line."""
    file_order = {}
    for location in locations:
        # First appearance of a file fixes its rank
        file_order.setdefault(location.file_name, len(file_order))
    return sorted(locations, key=lambda loc: (file_order[loc.file_name], loc.sourceline))
263
264
def get_checkable_file_names(container):
    """Return (file_names, ncx_toc) for spell/char checking.

    file_names is the spine plus the OPF, plus any NCX and nav ToC files that
    exist and are not already listed. ncx_toc is the NCX file name, or None
    when there is no usable NCX.
    """
    file_names = [name for name, linear in container.spine_names]
    file_names.append(container.opf_name)
    ncx_toc = find_existing_ncx_toc(container)
    if ncx_toc is None or not container.exists(ncx_toc) or ncx_toc in file_names:
        ncx_toc = None
    else:
        file_names.append(ncx_toc)
    nav_toc = find_existing_nav_toc(container)
    if nav_toc is not None and container.exists(nav_toc) and nav_toc not in file_names:
        file_names.append(nav_toc)
    return file_names, ncx_toc
276
277
def root_is_excluded_from_spell_check(root):
    """Return True if any direct child of *root* carries the
    'calibre-no-spell-check' opt-out marker (whitespace/case-insensitive)."""
    marker = 'calibre-no-spell-check'
    return any(
        (getattr(child, 'text', '') or '').strip().lower() == marker
        for child in root)
284
285
def get_all_words(container, book_locale, get_word_count=False, excluded_files=()):
    """Collect every spell-checkable word in the book.

    Returns a dict mapping (word, locale) -> sorted list of Locations, or
    (total_word_count, that_dict) when get_word_count is True. Missing files,
    files in excluded_files and files that opt out via the
    calibre-no-spell-check marker are skipped.
    """
    words = defaultdict(list)
    words[None] = 0  # running total of words seen
    file_names, ncx_toc = get_checkable_file_names(container)
    for file_name in file_names:
        if not container.exists(file_name) or file_name in excluded_files:
            continue
        root = container.parsed(file_name)
        if root_is_excluded_from_spell_check(root):
            continue
        if file_name == container.opf_name:
            read_words_from_opf(root, words, file_name, book_locale)
        elif file_name == ncx_toc:
            read_words_from_ncx(root, words, file_name, book_locale)
        elif hasattr(root, 'xpath'):
            read_words_from_html(root, words, file_name, book_locale)
    total = words.pop(None)
    grouped = {key: group_sort(locs) for key, locs in iteritems(words)}
    return (total, grouped) if get_word_count else grouped
307
308
def count_all_chars(container, book_locale):
    """Count every character in the book's checkable files.

    Returns a CharCounter holding per-codepoint totals and the set of files
    that use each codepoint.
    """
    counter = CharCounter()
    file_names, ncx_toc = get_checkable_file_names(container)
    for file_name in file_names:
        if not container.exists(file_name):
            continue
        root = container.parsed(file_name)
        if file_name == container.opf_name:
            count_chars_in_opf(root, counter, file_name, book_locale)
        elif file_name == ncx_toc:
            count_chars_in_ncx(root, counter, file_name, book_locale)
        elif hasattr(root, 'xpath'):
            count_chars_in_html(root, counter, file_name, book_locale)
    return counter
323
324
def merge_locations(locs1, locs2):
    """Merge two Location lists, restoring file-order/source-line sorting."""
    combined = list(locs1)
    combined.extend(locs2)
    return group_sort(combined)
327
328
def replace(text, original_word, new_word, lang):
    """Replace every whole-word occurrence of *original_word* in *text* with
    *new_word*, using locale-aware word matching (index_of).

    Returns (new_text, replaced) where replaced is True if at least one
    occurrence was found.
    """
    original_word, new_word, text = str(original_word), str(new_word), str(text)
    # First pass: find all match start offsets in the original text
    positions = []
    consumed = 0
    remainder = text
    while True:
        found = index_of(original_word, remainder, lang=lang)
        if found == -1:
            break
        positions.append(consumed + found)
        consumed += found + len(original_word)
        remainder = text[consumed:]
    # Second pass: splice in replacements right-to-left so earlier offsets stay valid
    for start in reversed(positions):
        text = text[:start] + new_word + text[start + len(original_word):]
    return text, bool(positions)
344
345
def replace_word(container, new_word, locations, locale, undo_cache=None):
    """Replace the word at each Location with *new_word* (plus any elided
    prefix) and write the modified trees back into *container*.

    When undo_cache (a dict) is supplied, the pre-replacement text is stored
    in it so undo_replace_word() can revert the edit. Returns the set of file
    names that were changed.
    """
    changed = set()
    for loc in locations:
        node = loc.location_node
        is_attr, attr = loc.node_item
        text = node.get(attr) if is_attr else getattr(node, attr)
        replacement = loc.elided_prefix + new_word
        new_text, did_replace = replace(text, loc.original_word, replacement, locale.langcode)
        if not did_replace:
            continue
        if undo_cache is not None:
            # Remember the pre-replacement text keyed by file/node/slot
            undo_cache[(loc.file_name, node, is_attr, attr)] = text
        if is_attr:
            node.set(attr, new_text)
        else:
            setattr(node, attr, new_text)
        container.replace(loc.file_name, node.getroottree().getroot())
        changed.add(loc.file_name)
    return changed
367
368
def undo_replace_word(container, undo_cache):
    """Revert edits made by replace_word() using its undo_cache.

    Returns the set of file names that were restored.
    """
    changed = set()
    for (file_name, node, is_attr, attr), text in iteritems(undo_cache):
        if is_attr:
            node.set(attr, text)
        else:
            setattr(node, attr, text)
        container.replace(file_name, node.getroottree().getroot())
        changed.add(file_name)
    return changed
376
377
if __name__ == '__main__':
    # Ad-hoc CLI for debugging: print all words found in the book whose path
    # is the last command-line argument, using the book's own locale.
    import pprint
    from calibre.gui2.tweak_book import set_book_locale, dictionaries
    container = get_container(sys.argv[-1], tweak_mode=True)
    set_book_locale(container.mi.language)
    pprint.pprint(get_all_words(container, dictionaries.default_locale))
384