#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8


__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

import sys
from collections import defaultdict, Counter

from calibre import replace_entities
from calibre.spell.break_iterator import split_into_words, index_of
from calibre.spell.dictionary import parse_lang_code
from calibre.ebooks.oeb.base import barename
from calibre.ebooks.oeb.polish.container import OPF_NAMESPACES, get_container
from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.polish.toc import find_existing_ncx_toc, find_existing_nav_toc
from calibre.utils.icu import ord_string
from polyglot.builtins import iteritems

# Lazily created singleton instance of Patterns, see patterns()
_patterns = None


class Patterns:

    '''Compiled regular expressions shared by all the scanning functions below.'''

    __slots__ = ('sanitize_invisible_pat', 'split_pat', 'digit_pat', 'fr_elision_pat')

    def __init__(self):
        import regex
        # Remove soft hyphens/zero width spaces/control codes
        self.sanitize_invisible_pat = regex.compile(
            r'[\u00ad\u200b\u200c\u200d\ufeff\0-\x08\x0b\x0c\x0e-\x1f\x7f]', regex.VERSION1 | regex.UNICODE)
        self.split_pat = regex.compile(
            r'\W+', flags=regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE)
        # Tokens that are purely decimal digits are never spell checked,
        # see filter_words()
        self.digit_pat = regex.compile(
            r'^\d+$', flags=regex.VERSION1 | regex.WORD | regex.UNICODE)
        # French words with prefixes are reduced to the stem word, so that the
        # words appear only once in the word list
        self.fr_elision_pat = regex.compile(
            "^(?:l|d|m|t|s|j|c|ç|lorsqu|puisqu|quoiqu|qu)['’]", flags=regex.UNICODE | regex.VERSION1 | regex.IGNORECASE)


def patterns():
    '''Return the process-wide Patterns singleton, creating it on first use.'''
    global _patterns
    if _patterns is None:
        _patterns = Patterns()
    return _patterns


class CharCounter:

    '''Counts codepoint frequencies and records which files each codepoint occurs in.'''

    def __init__(self):
        self.counter = Counter()       # codepoint -> total number of occurrences
        self.chars = defaultdict(set)  # codepoint -> set of file names containing it
        self.update = self.counter.update


class Location:

    '''A single occurrence of a word: the file, node and node slot it was found in.'''

    __slots__ = ('file_name', 'sourceline', 'original_word', 'location_node', 'node_item', 'elided_prefix')

    def __init__(self, file_name=None, elided_prefix='', original_word=None, location_node=None, node_item=(None, None)):
        self.file_name, self.elided_prefix, self.original_word = file_name, elided_prefix, original_word
        self.location_node, self.node_item, self.sourceline = location_node, node_item, location_node.sourceline

    def __repr__(self):
        return '%s @ %s:%s' % (self.original_word, self.file_name, self.sourceline)
    __str__ = __repr__

    def replace(self, new_word):
        # Keep any elided French prefix (l', d', ...) when substituting the stem
        self.original_word = self.elided_prefix + new_word


def filter_words(word):
    '''Return True if the word should be spell checked (non-empty, not purely numeric).'''
    if not word:
        return False
    p = patterns()
    if p.digit_pat.match(word) is not None:
        return False
    return True


def get_words(text, lang):
    '''Split text into checkable words using the locale-aware break iterator.

    Returns an empty tuple if the text cannot be split for this language.
    '''
    try:
        ans = split_into_words(str(text), lang)
    except (TypeError, ValueError):
        return ()
    return list(filter(filter_words, ans))


def add_words(text, node, words, file_name, locale, node_item):
    '''Record every checkable word in text into the words mapping.

    words maps (stem, locale) -> list of Location, with the special key None
    holding a running total word count. For French, elided prefixes are
    stripped so that each stem appears only once in the word list.
    '''
    candidates = get_words(text, locale.langcode)
    if candidates:
        p = patterns()
        is_fr = locale.langcode == 'fra'
        for word in candidates:
            sword = p.sanitize_invisible_pat.sub('', word).strip()
            elided_prefix = ''
            if is_fr:
                m = p.fr_elision_pat.match(sword)
                # Strip the prefix only when something remains after it,
                # otherwise a token like "l'" would be reduced to an empty
                # stem. (The previous comparison against the always-empty
                # elided_prefix made this guard a no-op.)
                if m is not None and len(m.group()) < len(sword):
                    elided_prefix = m.group()
                    sword = sword[len(elided_prefix):]
            loc = Location(file_name, elided_prefix, word, node, node_item)
            words[(sword, locale)].append(loc)
            words[None] += 1


def add_chars(text, counter, file_name):
    '''Count the codepoints of text into counter, attributing them to file_name.'''
    if text:
        if isinstance(text, bytes):
            text = text.decode('utf-8', 'ignore')
        counts = Counter(ord_string(text))
        counter.update(counts)
        for codepoint in counts:
            counter.chars[codepoint].add(file_name)


def add_words_from_attr(node, attr, words, file_name, locale):
    '''Collect words from the value of the attribute attr on node, if present.'''
    text = node.get(attr, None)
    if text:
        add_words(text, node, words, file_name, locale, (True, attr))


def count_chars_in_attr(node, attr, counter, file_name, locale):
    text = node.get(attr, None)
    if text:
        add_chars(text, counter, file_name)


def add_words_from_text(node, attr, words, file_name, locale):
    # attr is 'text' or 'tail' (lxml element text content slots)
    add_words(getattr(node, attr), node, words, file_name, locale, (False, attr))


def count_chars_in_text(node, attr, counter, file_name, locale):
    add_chars(getattr(node, attr), counter, file_name)


def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
    '''Collect words from HTML markup stored escaped inside text (e.g. OPF <description>).'''
    text = replace_entities(text)
    root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
    ewords = defaultdict(list)
    ewords[None] = 0
    read_words_from_html(root, ewords, file_name, locale)
    words[None] += ewords.pop(None)
    for k, locs in iteritems(ewords):
        for loc in locs:
            # Point the locations at the original escaped node, not at the
            # throwaway parsed tree, so replacements land in the real document
            loc.location_node, loc.node_item = node, (False, attr)
        words[k].extend(locs)


def count_chars_in_escaped_html(text, counter, file_name, node, attr, locale):
    text = replace_entities(text)
    root = parse('<html><body><div>%s</div></body></html>' % text, decoder=lambda x:x.decode('utf-8'))
    count_chars_in_html(root, counter, file_name, locale)


_opf_file_as = '{%s}file-as' % OPF_NAMESPACES['opf']
opf_spell_tags = {'title', 'creator', 'subject', 'description', 'publisher'}

# We can only use barename() for tag names and simple attribute checks so that
# this code matches up with the syntax highlighter base spell checking


def read_words_from_opf(root, words, file_name, book_locale):
    '''Collect words from the spell-checkable metadata tags of an OPF document.'''
    for tag in root.iterdescendants('*'):
        if tag.text is not None and barename(tag.tag) in opf_spell_tags:
            if barename(tag.tag) == 'description':
                # The description contains escaped HTML markup
                add_words_from_escaped_html(tag.text, words, file_name, tag, 'text', book_locale)
            else:
                add_words_from_text(tag, 'text', words, file_name, book_locale)
        add_words_from_attr(tag, _opf_file_as, words, file_name, book_locale)


def count_chars_in_opf(root, counter, file_name, book_locale):
    '''Count codepoints in the spell-checkable metadata tags of an OPF document.'''
    for tag in root.iterdescendants('*'):
        if tag.text is not None and barename(tag.tag) in opf_spell_tags:
            if barename(tag.tag) == 'description':
                count_chars_in_escaped_html(tag.text, counter, file_name, tag, 'text', book_locale)
            else:
                count_chars_in_text(tag, 'text', counter, file_name, book_locale)
        count_chars_in_attr(tag, _opf_file_as, counter, file_name, book_locale)


ncx_spell_tags = {'text'}
xml_spell_tags = opf_spell_tags | ncx_spell_tags


def read_words_from_ncx(root, words, file_name, book_locale):
    for tag in root.xpath('//*[local-name()="text"]'):
        if tag.text is not None:
            add_words_from_text(tag, 'text', words, file_name, book_locale)


def count_chars_in_ncx(root, counter, file_name, book_locale):
    for tag in root.xpath('//*[local-name()="text"]'):
        if tag.text is not None:
            count_chars_in_text(tag, 'text', counter, file_name, book_locale)


# HTML tags whose textual content is never spell checked
html_spell_tags = {'script', 'style', 'link'}


def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
    '''Collect words from one HTML element: its text, alt/title attributes and tail.'''
    if tag.text is not None and barename(tag.tag) not in html_spell_tags:
        add_words_from_text(tag, 'text', words, file_name, locale)
    for attr in {'alt', 'title'}:
        add_words_from_attr(tag, attr, words, file_name, locale)
    if tag.tail is not None and tag.getparent() is not None and barename(tag.getparent().tag) not in html_spell_tags:
        # The tail belongs to the parent element, so use the parent's locale
        add_words_from_text(tag, 'tail', words, file_name, parent_locale)


def count_chars_in_html_tag(tag, counter, file_name, parent_locale, locale):
    '''Count codepoints of one HTML element: its text, alt/title attributes and tail.'''
    if tag.text is not None and barename(tag.tag) not in html_spell_tags:
        count_chars_in_text(tag, 'text', counter, file_name, locale)
    for attr in {'alt', 'title'}:
        count_chars_in_attr(tag, attr, counter, file_name, locale)
    if tag.tail is not None and tag.getparent() is not None and barename(tag.getparent().tag) not in html_spell_tags:
        count_chars_in_text(tag, 'tail', counter, file_name, parent_locale)


def locale_from_tag(tag):
    '''Return the locale declared on tag via lang/xml:lang, or None if absent/invalid.'''
    if 'lang' in tag.attrib:
        try:
            loc = parse_lang_code(tag.get('lang'))
        except ValueError:
            loc = None
        if loc is not None:
            return loc
    if '{http://www.w3.org/XML/1998/namespace}lang' in tag.attrib:
        try:
            loc = parse_lang_code(tag.get('{http://www.w3.org/XML/1998/namespace}lang'))
        except ValueError:
            loc = None
        if loc is not None:
            return loc


def read_words_from_html(root, words, file_name, book_locale):
    '''Walk the whole HTML tree collecting words, honoring per-element locale overrides.'''
    stack = [(root, book_locale)]
    while stack:
        parent, parent_locale = stack.pop()
        locale = locale_from_tag(parent) or parent_locale
        read_words_from_html_tag(parent, words, file_name, parent_locale, locale)
        stack.extend((tag, locale) for tag in parent.iterchildren('*'))


def count_chars_in_html(root, counter, file_name, book_locale):
    '''Walk the whole HTML tree counting codepoints, honoring per-element locale overrides.'''
    stack = [(root, book_locale)]
    while stack:
        parent, parent_locale = stack.pop()
        locale = locale_from_tag(parent) or parent_locale
        count_chars_in_html_tag(parent, counter, file_name, parent_locale, locale)
        stack.extend((tag, locale) for tag in parent.iterchildren('*'))


def group_sort(locations):
    '''Sort locations by first-appearance order of their file, then by source line.'''
    order = {}
    for loc in locations:
        if loc.file_name not in order:
            order[loc.file_name] = len(order)
    return sorted(locations, key=lambda l:(order[l.file_name], l.sourceline))


def get_checkable_file_names(container):
    '''Return (file_names, ncx_toc): the names to spell check and the NCX ToC name, if any.

    file_names is the spine plus the OPF, plus the NCX and nav ToC documents
    when they exist and are not already in the spine.
    '''
    file_names = [name for name, linear in container.spine_names] + [container.opf_name]
    ncx_toc = find_existing_ncx_toc(container)
    if ncx_toc is not None and container.exists(ncx_toc) and ncx_toc not in file_names:
        file_names.append(ncx_toc)
    else:
        ncx_toc = None
    toc = find_existing_nav_toc(container)
    if toc is not None and container.exists(toc) and toc not in file_names:
        file_names.append(toc)
    return file_names, ncx_toc
def root_is_excluded_from_spell_check(root):
    '''Return True if the parsed file opts out of spell checking via a
    "calibre-no-spell-check" marker among the root's direct children.'''
    for child in root:
        # NOTE(review): getattr() with a default suggests some children may
        # lack a usable text attribute (e.g. comments/PIs) -- confirm
        q = (getattr(child, 'text', '') or '').strip().lower()
        if q == 'calibre-no-spell-check':
            return True
    return False


def get_all_words(container, book_locale, get_word_count=False, excluded_files=()):
    '''Collect every spell-checkable word in the book.

    Returns a dict mapping (word, locale) to a group_sort()-ed list of
    Location objects. If get_word_count is True, returns
    (total_word_count, dict) instead. Files listed in excluded_files and
    files that opt out via root_is_excluded_from_spell_check() are skipped.
    '''
    words = defaultdict(list)
    # The None key accumulates the total number of words seen
    words[None] = 0
    file_names, ncx_toc = get_checkable_file_names(container)
    for file_name in file_names:
        if not container.exists(file_name) or file_name in excluded_files:
            continue
        root = container.parsed(file_name)
        if root_is_excluded_from_spell_check(root):
            continue
        # Dispatch on file type: OPF metadata, NCX ToC, or regular (X)HTML
        if file_name == container.opf_name:
            read_words_from_opf(root, words, file_name, book_locale)
        elif file_name == ncx_toc:
            read_words_from_ncx(root, words, file_name, book_locale)
        elif hasattr(root, 'xpath'):
            read_words_from_html(root, words, file_name, book_locale)
    count = words.pop(None)
    ans = {k:group_sort(v) for k, v in iteritems(words)}
    if get_word_count:
        return count, ans
    return ans


def count_all_chars(container, book_locale):
    '''Count codepoint usage across all checkable files, returning a CharCounter.'''
    ans = CharCounter()
    file_names, ncx_toc = get_checkable_file_names(container)
    for file_name in file_names:
        if not container.exists(file_name):
            continue
        root = container.parsed(file_name)
        if file_name == container.opf_name:
            count_chars_in_opf(root, ans, file_name, book_locale)
        elif file_name == ncx_toc:
            count_chars_in_ncx(root, ans, file_name, book_locale)
        elif hasattr(root, 'xpath'):
            count_chars_in_html(root, ans, file_name, book_locale)
    return ans


def merge_locations(locs1, locs2):
    '''Merge two location lists, restoring the group_sort() ordering.'''
    return group_sort(locs1 + locs2)


def replace(text, original_word, new_word, lang):
    '''Replace all occurrences of original_word in text with new_word.

    Returns (new_text, changed_flag). Occurrences are located with
    index_of(). NOTE(review): index_of comes from
    calibre.spell.break_iterator, so the search is presumably
    locale/word-boundary aware rather than a plain substring match -- verify.
    '''
    indices = []
    original_word, new_word, text = str(original_word), str(new_word), str(text)
    q = text
    offset = 0
    # First pass: collect the absolute index of every match
    while True:
        idx = index_of(original_word, q, lang=lang)
        if idx == -1:
            break
        indices.append(offset + idx)
        offset += idx + len(original_word)
        q = text[offset:]
    # Second pass: substitute back to front so earlier indices stay valid
    for idx in reversed(indices):
        text = text[:idx] + new_word + text[idx+len(original_word):]
    return text, bool(indices)


def replace_word(container, new_word, locations, locale, undo_cache=None):
    '''Replace the word at every location with new_word, keeping any elided prefix.

    Returns the set of file names actually modified. When undo_cache is a
    dict, the pre-replacement text of each modified node slot is stored in
    it, keyed by (file_name, node, is_attr, attr), for undo_replace_word().
    '''
    changed = set()
    for loc in locations:
        node = loc.location_node
        # node_item records where the word lives: an attribute value or a
        # text/tail slot of the element
        is_attr, attr = loc.node_item
        if is_attr:
            text = node.get(attr)
        else:
            text = getattr(node, attr)
        replacement = loc.elided_prefix + new_word
        rtext, replaced = replace(text, loc.original_word, replacement, locale.langcode)
        if replaced:
            if undo_cache is not None:
                undo_cache[(loc.file_name, node, is_attr, attr)] = text
            if is_attr:
                node.set(attr, rtext)
            else:
                setattr(node, attr, rtext)
            # Re-serialize the whole tree into the container so it is marked dirty
            container.replace(loc.file_name, node.getroottree().getroot())
            changed.add(loc.file_name)
    return changed


def undo_replace_word(container, undo_cache):
    '''Restore the texts saved by replace_word(), returning the set of changed files.'''
    changed = set()
    for (file_name, node, is_attr, attr), text in iteritems(undo_cache):
        node.set(attr, text) if is_attr else setattr(node, attr, text)
        container.replace(file_name, node.getroottree().getroot())
        changed.add(file_name)
    return changed


if __name__ == '__main__':
    # Command line usage: print all words found in the book at the given path
    import pprint
    from calibre.gui2.tweak_book import set_book_locale, dictionaries
    container = get_container(sys.argv[-1], tweak_mode=True)
    set_book_locale(container.mi.language)
    pprint.pprint(get_all_words(container, dictionaries.default_locale))