# -*- coding: utf-8 -*-
# vim:et sts=4 sw=4
#
# ibus-typing-booster - A completion input method for IBus
#
# Copyright (c) 2012-2013 Anish Patil <apatil@redhat.com>
# Copyright (c) 2012-2016 Mike FABIAN <mfabian@redhat.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>

'''A module used by ibus-typing-booster to suggest words by using the
hunspell dictionaries.

'''

from typing import Dict
from typing import Iterable
from typing import Tuple
from typing import List
import os
import sys
import unicodedata
IMPORT_REGEX_SUCCESFUL = False
try:
    import regex # type: ignore
    IMPORT_REGEX_SUCCESFUL = True
    # Enable new improved regex engine instead of backwards compatible
    # v0. regex.match('ß', 'SS', regex.IGNORECASE) matches only with
    # the improved version! See also: https://pypi.org/project/regex/
    regex.DEFAULT_VERSION = regex.VERSION1
except (ImportError,):
    # Use standard “re” module as a fallback:
    import re
import logging
import itb_util

LOGGER = logging.getLogger('ibus-typing-booster')

DEBUG_LEVEL = int(0)

IMPORT_ENCHANT_SUCCESSFUL = False
IMPORT_HUNSPELL_SUCCESSFUL = False
try:
    import enchant # type: ignore
    IMPORT_ENCHANT_SUCCESSFUL = True
except (ImportError,):
    try:
        import hunspell # type: ignore
        IMPORT_HUNSPELL_SUCCESSFUL = True
    except (ImportError,):
        pass

IMPORT_LIBVOIKKO_SUCCESSFUL = False
try:
    import libvoikko # type: ignore
    IMPORT_LIBVOIKKO_SUCCESSFUL = True
except (ImportError,):
    pass

# Maximum words that should be returned.
# This should be a rather big number in order not
# to throw away useful matches. But making it very huge
# makes the performance worse. For example when setting
# it to 1000, I see a noticeable delay when typing the first
# letter of a word until the candidate lookup table pops up.
MAX_WORDS = 100

def _compile_prefix_pattern(prefix: str):
    '''Compile a case insensitive pattern matching the literal prefix

    Uses the “regex” module if it could be imported, the standard
    “re” module otherwise.

    :param prefix: The text which must match literally (ignoring case)
                   at the beginning of a word
    :return: The compiled pattern, use its match() method on the words
    '''
    # “re” is only imported when importing “regex” failed, the
    # conditional expression makes sure only the available module
    # name is evaluated:
    engine = regex if IMPORT_REGEX_SUCCESFUL else re
    return engine.compile(engine.escape(prefix), engine.IGNORECASE)

class Dictionary:
    '''A class to hold a hunspell dictionary
    '''
    def __init__(self, name: str = 'en_US') -> None:
        '''Initialize the dictionary and load it unless name is 'None'

        :param name: Name of the dictionary, for example 'en_US'.
                     The special name 'None' creates an empty
                     dictionary without word list and without
                     spellchecking.
        '''
        if DEBUG_LEVEL > 1:
            LOGGER.debug('Dictionary.__init__(name=%s)\n', name)
        self.name = name
        self.language = self.name.split('_')[0]
        self.dic_path = ''
        self.encoding = 'UTF-8'
        self.words: List[str] = []
        # Pairs of (word, word with accents removed), only filled for
        # the languages listed in itb_util.ACCENT_LANGUAGES:
        self.word_pairs: List[Tuple[str, str]] = []
        self.max_word_len = 0 # maximum length of words in this dictionary
        self.enchant_dict = None
        self.pyhunspell_object = None
        self.voikko: libvoikko.Voikko = None
        if self.name != 'None':
            self.load_dictionary()

    def load_dictionary(self) -> None:
        '''Load a hunspell dictionary and instantiate a
        enchant.Dict() or a hunspell.Hunspell() object.

        For Finnish ('fi') a libvoikko.Voikko() object is used
        instead of enchant or pyhunspell.
        '''
        if DEBUG_LEVEL > 0:
            LOGGER.debug('load_dictionary() ...\n')
        (self.dic_path,
         self.encoding,
         self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
        if not self.words:
            # Without a word list no spellchecker is set up either.
            return
        if self.language in itb_util.ACCENT_LANGUAGES:
            keep = itb_util.ACCENT_LANGUAGES[self.language]
            self.word_pairs = [
                (word, itb_util.remove_accents(word, keep=keep))
                for word in self.words
            ]
        # Never shrink a previously calculated maximum:
        self.max_word_len = max(
            self.max_word_len, max(len(word) for word in self.words))
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'max_word_len = %s\n', self.max_word_len)
        if self.language == 'fi':
            self.enchant_dict = None
            self.pyhunspell_object = None
            self.voikko = None
            if not IMPORT_LIBVOIKKO_SUCCESSFUL:
                LOGGER.warning(
                    'Language is “fi” but “import libvoikko” failed.')
                return
            self.voikko = libvoikko.Voikko('fi')
            return
        if IMPORT_ENCHANT_SUCCESSFUL:
            try:
                self.enchant_dict = enchant.Dict(self.name)
            except enchant.errors.DictNotFoundError:
                LOGGER.exception(
                    'Error initializing enchant for %s', self.name)
                self.enchant_dict = None
            except Exception:
                LOGGER.exception(
                    'Unknown error initializing enchant for %s',
                    self.name)
                self.enchant_dict = None
        elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
            # Replace only the file name suffix, a plain
            # str.replace() would also change a “.dic” occurring in
            # a directory name:
            if self.dic_path.endswith('.dic'):
                aff_path = self.dic_path[:-len('.dic')] + '.aff'
            else:
                aff_path = self.dic_path.replace('.dic', '.aff')
            try:
                self.pyhunspell_object = hunspell.HunSpell(
                    self.dic_path, aff_path)
            except hunspell.HunSpellError:
                # Log with traceback, same as for enchant above:
                LOGGER.exception(
                    'Error initializing hunspell for %s', self.name)
                self.pyhunspell_object = None
            except Exception:
                LOGGER.exception(
                    'Unknown error initializing hunspell for %s',
                    self.name)
                self.pyhunspell_object = None

    def spellcheck_enchant(self, word: str) -> bool:
        '''
        Spellcheck a word using enchant

        :param word: The word to spellcheck
        :return: True if spelling is correct, False if not or unknown
        '''
        if not self.enchant_dict:
            return False
        # enchant does the right thing for all languages, including
        # Korean, if the input is a Unicode string in NFC.
        return self.enchant_dict.check(unicodedata.normalize('NFC', word))

    def spellcheck_pyhunspell(self, word: str) -> bool:
        '''
        Spellcheck a word using pyhunspell

        :param word: The word to spellcheck
        :return: True if spelling is correct, False if not or unknown
        '''
        if not self.pyhunspell_object:
            return False
        # pyhunspell needs its input passed in dictionary encoding.
        # and also returns in dictionary encoding.
        return self.pyhunspell_object.spell(
            unicodedata.normalize('NFC', word).encode(
                self.encoding, 'replace'))

    def spellcheck_voikko(self, word: str) -> bool:
        '''
        Spellcheck a word using voikko

        :param word: The word to spellcheck
        :return: True if spelling is correct, False if not or unknown
        '''
        if not self.voikko:
            return False
        # voikko works correctly if the input is a Unicode string in NFC.
        return self.voikko.spell(unicodedata.normalize('NFC', word))

    def spellcheck(self, word: str) -> bool:
        '''
        Spellcheck a word using enchant, pyhunspell, or voikko

        :param word: The word to spellcheck
        :return: True if spelling is correct, False if not or unknown

        >>> d = Dictionary('en_US')
        >>> d.spellcheck('winter')
        True

        >>> d.spellcheck('winxer')
        False

        >>> d = Dictionary('None')
        >>> d.spellcheck('winter')
        False

        >>> d.spellcheck('winxer')
        False
        '''
        if self.enchant_dict:
            return self.spellcheck_enchant(word)
        if self.pyhunspell_object:
            return self.spellcheck_pyhunspell(word)
        if self.voikko:
            # Go through spellcheck_voikko() to get the NFC
            # normalization, callers like Hunspell.suggest() pass
            # the word in the internal normalization form.
            return self.spellcheck_voikko(word)
        return False

    def has_spellchecking(self) -> bool:
        '''
        Returns whether this dictionary supports spellchecking or not

        :return: True if this dictionary supports spellchecking, False if not

        Examples:

        >>> d = Dictionary('en_US')
        >>> d.has_spellchecking()
        True

        >>> d = Dictionary('zh_CN')
        >>> d.has_spellchecking()
        False

        >>> d = Dictionary('None')
        >>> d.has_spellchecking()
        False
        '''
        if self.enchant_dict or self.pyhunspell_object or self.voikko:
            return True
        return False

    def spellcheck_suggest_enchant(self, word: str) -> List[str]:
        '''
        Return spellchecking suggestions for word using enchant

        :param word: The word to return spellchecking suggestions for
        :return: List of spellchecking suggestions, possibly empty.
        '''
        if not word or not self.enchant_dict:
            return []
        # enchant does the right thing for all languages, including
        # Korean, if the input is NFC. It takes Unicode strings and
        # returns Unicode strings, no encoding and decoding is
        # necessary, neither for Python2 nor for Python3.
        return [
            unicodedata.normalize(
                itb_util.NORMALIZATION_FORM_INTERNAL, x)
            for x in
            self.enchant_dict.suggest(unicodedata.normalize('NFC', word))
        ]

    def spellcheck_suggest_pyhunspell(self, word: str) -> List[str]:
        '''
        Return spellchecking suggestions for word using pyhunspell

        :param word: The word to return spellchecking suggestions for
        :return: List of spellchecking suggestions, possibly empty.
        '''
        if not word or not self.pyhunspell_object:
            return []
        # pyhunspell needs its input passed in dictionary encoding.
        return [
            unicodedata.normalize(
                itb_util.NORMALIZATION_FORM_INTERNAL, x)
            for x in
            self.pyhunspell_object.suggest(
                unicodedata.normalize('NFC', word).encode(
                    self.encoding, 'replace'))
        ]

    def spellcheck_suggest_voikko(self, word: str) -> List[str]:
        '''
        Return spellchecking suggestions for word using voikko

        :param word: The word to return spellchecking suggestions for
        :return: List of spellchecking suggestions, possibly empty.
        '''
        if not word or not self.voikko:
            return []
        return [
            unicodedata.normalize(
                itb_util.NORMALIZATION_FORM_INTERNAL, x)
            for x in
            self.voikko.suggest(unicodedata.normalize('NFC', word))
        ]

    def spellcheck_suggest(self, word: str) -> List[str]:
        '''
        Return spellchecking suggestions for word using enchant, pyhunspell or voikko

        :param word: The word to return spellchecking suggestions for
        :return: List of spellchecking suggestions, possibly empty.

        Results can be quite different depending on whether enchant or pyhunspell
        is used and in case of enchant whether hunspell, aspell,
        myspell, or ispell are used. So for the doctests, better don’t
        check the complete list of suggestions returned.

        Examples:

        >>> d = Dictionary('en_US')
        >>> 'Camel' in d.spellcheck_suggest('kamel')
        True

        >>> d.spellcheck_suggest('')
        []

        >>> d = Dictionary('None')
        >>> d.spellcheck_suggest('kamel')
        []
        '''
        if self.enchant_dict:
            return self.spellcheck_suggest_enchant(word)
        if self.pyhunspell_object:
            return self.spellcheck_suggest_pyhunspell(word)
        if self.voikko:
            return self.spellcheck_suggest_voikko(word)
        return []

class Hunspell:
    '''A class to suggest completions or corrections
    using a list of Hunspell dictionaries
    '''
    def __init__(self, dictionary_names: Iterable[str] = ()) -> None:
        '''Read the debug level from the environment and load the
        dictionaries

        :param dictionary_names: Names of the dictionaries to use, for
                                 example ['en_US', 'de_DE']
        '''
        global DEBUG_LEVEL
        try:
            DEBUG_LEVEL = int(str(os.getenv('IBUS_TYPING_BOOSTER_DEBUG_LEVEL')))
        except (TypeError, ValueError):
            DEBUG_LEVEL = int(0)
        if DEBUG_LEVEL > 1:
            if dictionary_names:
                LOGGER.debug(
                    'Hunspell.__init__(dictionary_names=%s)\n',
                    dictionary_names)
            else:
                LOGGER.debug('Hunspell.__init__(dictionary_names=())\n')
        self._suggest_cache: Dict[str, List[Tuple[str, int]]] = {}
        # Store a private copy, changes the caller makes to the list
        # passed in later must not affect the private member variable:
        self._dictionary_names: List[str] = list(dictionary_names)
        self._dictionaries: List[Dictionary] = []
        self.init_dictionaries()

    def init_dictionaries(self) -> None:
        '''Initialize the hunspell dictionaries

        Also empties the cache of suggestions.
        '''
        if DEBUG_LEVEL > 1:
            if self._dictionary_names:
                LOGGER.debug(
                    'Hunspell.init_dictionaries() dictionary_names=%s\n',
                    self._dictionary_names)
            else:
                LOGGER.debug(
                    'Hunspell.init_dictionaries() dictionary_names=()\n')
        self._suggest_cache = {}
        self._dictionaries = [
            Dictionary(name=dictionary_name)
            for dictionary_name in self._dictionary_names
        ]

    def get_dictionary_names(self) -> List[str]:
        '''Returns a copy of the list of dictionary names.

        It is important to return a copy, we do not want to change
        the private member variable directly.'''
        return list(self._dictionary_names)

    def set_dictionary_names(self, dictionary_names: List[str]) -> None:
        '''Sets the list of dictionary names.

        If the new list of dictionary names differs from the existing
        one, re-initialize the dictionaries.

        :param dictionary_names: The new names of the dictionaries
        '''
        # Work on a private copy to avoid sharing the list with the caller:
        new_names = list(dictionary_names)
        if new_names != self._dictionary_names:
            if set(new_names) != set(self._dictionary_names):
                # Some dictionaries are really different, reinitialize:
                self._dictionary_names = new_names
                self.init_dictionaries()
            else:
                # Only the order of dictionaries has changed.
                # Reinitializing wastes time, just reorder the
                # dictionaries:
                self._dictionary_names = new_names
                dictionary_by_name = {
                    dictionary.name: dictionary
                    for dictionary in self._dictionaries
                }
                self._dictionaries = [
                    dictionary_by_name[name] for name in new_names]
        if DEBUG_LEVEL > 1:
            LOGGER.debug('set_dictionary_names(%s):\n', dictionary_names)
            for dictionary in self._dictionaries:
                LOGGER.debug('%s\n', dictionary.name)

    def spellcheck(self, input_phrase: str) -> bool:
        '''
        Checks if a string is likely to be spelled correctly checking
        multiple dictionaries

        :param input_phrase: A string to spellcheck
        :return: True if it is more likely to be spelled correctly,
                 False if it is more likely to be spelled incorrectly.
                 In detail this means:
                 True:
                     - If it is a correctly spelled word in at least one of
                       the dictionaries supporting spellchecking
                     - None of the dictionaries support spellchecking
                     - Contains spaces, spellchecking cannot work
                 else False.

        Examples:

        >>> h = Hunspell(['en_US', 'de_DE', 'ja_JP'])
        >>> h.spellcheck('Hello')
        True

        >>> h.spellcheck('Grüße')
        True

        >>> h.spellcheck('Gruße')
        False

        >>> h = Hunspell(['en_US', 'ja_JP'])
        >>> h.spellcheck('Grüße')
        False

        >>> h = Hunspell(['ja_JP'])
        >>> h.spellcheck('Grüße')
        True

        >>> h = Hunspell(['en_US', 'None'])
        >>> h.spellcheck('Grüße')
        False

        >>> h = Hunspell(['None'])
        >>> h.spellcheck('Grüße')
        True
        '''
        if ' ' in input_phrase:
            return True
        spellchecking_dictionaries_available = False
        spellcheck_total = False
        for dictionary in self._dictionaries:
            if dictionary.has_spellchecking():
                spellchecking_dictionaries_available = True
                spellcheck_total |= dictionary.spellcheck(input_phrase)
        if not spellcheck_total and spellchecking_dictionaries_available:
            return False
        return True

    def suggest(self, input_phrase: str) -> List[Tuple[str, int]]:
        # pylint: disable=line-too-long
        '''Return completions or corrections for the input phrase

        :param input_phrase: A string to find completions or corrections for

        Returns a list of tuples of the form (<word>, <score>)
                <score> can have these values:
                    0: This is a completion, i.e. input_phrase matches
                       the beginning of <word> (accent insensitive match)
                   -1: This is a spell checking correction from hunspell
                       (i.e. either from enchant or pyhunspell)

        Examples:

        (Attention, the return values are in internal
        normalization form ('NFD'))

        >>> h = Hunspell(['de_DE', 'cs_CZ'])
        >>> h.suggest('Geschwindigkeitsubertre')[0]
        ('Geschwindigkeitsübertretungsverfahren', 0)

        >>> h.suggest('Geschwindigkeitsübertretungsverfahren')[0]
        ('Geschwindigkeitsübertretungsverfahren', 0)

        >>> h.suggest('Glühwürmchen')[0]
        ('Glühwürmchen', 0)

        >>> h.suggest('Alpengluhen')[0]
        ('Alpenglühen', 0)

        >>> h.suggest('filosofictejs')[0]
        ('filosofičtější', 0)

        >>> h.suggest('filosofičtější')[0]
        ('filosofičtější', 0)

        >>> h.suggest('filosofičtějš')[0]
        ('filosofičtější', 0)

        >>> h = Hunspell(['it_IT'])
        >>> h.suggest('principianti')
        [('principianti', 0), ('principiati', -1), ('principiante', -1), ('principiarti', -1), ('principiasti', -1)]

        >>> h = Hunspell(['es_ES'])
        >>> h.suggest('teneis')
        [('tenéis', 0), ('tenes', -1), ('tenis', -1), ('teneos', -1), ('tienes', -1), ('te neis', -1), ('te-neis', -1)]

        >>> h.suggest('tenéis')[0]
        ('tenéis', 0)

        >>> h = Hunspell(['en_US'])
        >>> ('Camelot', 0) in h.suggest('camel') # Should work with aspell and hunspell
        True

        >>> h = Hunspell(['fr_FR'])
        >>> h.suggest('differemmen')
        [('différemment', 0)]

        >>> h = Hunspell(['None'])
        >>> h.suggest('camel')
        []

        >>> h = Hunspell(['None', 'en_US'])
        >>> ('Camelot', 0) in h.suggest('camel') # Should work with aspell and hunspell
        True
        '''
        # pylint: enable=line-too-long
        # make sure input_phrase is in the internal normalization form (NFD).
        # This is done before looking into the cache because the
        # results are stored in the cache using the normalized phrase
        # as the key. Looking up the phrase as it was passed in would
        # never find the cached result if the phrase was not
        # passed in the internal normalization form.
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
        if input_phrase in self._suggest_cache:
            return self._suggest_cache[input_phrase]
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                "Hunspell.suggest() input_phrase=%(ip)s\n",
                {'ip': input_phrase.encode('UTF-8')})
        # http://pwet.fr/man/linux/fichiers_speciaux/hunspell says:
        #
        # > A dictionary file (*.dic) contains a list of words, one per
        # > line. The first line of the dictionaries (except personal
        # > dictionaries) contains the word count. Each word may
        # > optionally be followed by a slash ("/") and one or more
        # > flags, which represents affixes or special attributes.
        #
        # I.e. if '/' is already contained in the input, it cannot
        # match a word in the dictionary and we return an empty list
        # immediately:
        if '/' in input_phrase:
            self._suggest_cache[input_phrase] = []
            return []

        suggested_words: Dict[str, int] = {}
        for dictionary in self._dictionaries:
            if not dictionary.words:
                continue
            # Only used when the dictionary has word_pairs:
            input_phrase_no_accents = ''
            if dictionary.word_pairs:
                input_phrase_no_accents = itb_util.remove_accents(
                    input_phrase,
                    keep=itb_util.ACCENT_LANGUAGES[dictionary.language])
            # If the input phrase is longer than the maximum
            # word length in a dictionary, don’t try
            # complete it, it just wastes time then.
            if len(input_phrase) <= dictionary.max_word_len:
                if dictionary.word_pairs:
                    # Accent insensitive match against the words
                    # with the accents removed:
                    pattern = _compile_prefix_pattern(
                        input_phrase_no_accents)
                    suggested_words.update(
                        (word, 0)
                        for (word, word_no_accents) in dictionary.word_pairs
                        if pattern.match(word_no_accents))
                else:
                    pattern = _compile_prefix_pattern(input_phrase)
                    suggested_words.update(
                        (word, 0)
                        for word in dictionary.words
                        if pattern.match(word))
            if len(input_phrase) >= 4:
                if dictionary.spellcheck(input_phrase):
                    # This is a valid word in this dictionary.
                    # It might have been missed by the
                    # matching above because the dictionary
                    # might not contain all possible word
                    # forms (The prefix and suffix information
                    # has been ignored). But the spell checker
                    # knows about this, if the spell checker
                    # thinks it is a correct word, it must be
                    # counted as a match of course:
                    suggested_words[input_phrase] = 0
                extra_suggestions = [
                    unicodedata.normalize(
                        itb_util.NORMALIZATION_FORM_INTERNAL, x)
                    for x in
                    dictionary.spellcheck_suggest(input_phrase)
                ]
                for suggestion in extra_suggestions:
                    if suggestion in suggested_words:
                        continue
                    # A correction which differs from the input
                    # only in accents counts as a completion:
                    if (dictionary.word_pairs
                            and
                            itb_util.remove_accents(
                                suggestion,
                                keep=itb_util.ACCENT_LANGUAGES[
                                    dictionary.language])
                            == input_phrase_no_accents):
                        suggested_words[suggestion] = 0
                    else:
                        suggested_words[suggestion] = -1
        sorted_suggestions = sorted(
            suggested_words.items(),
            key=lambda x: (
                - x[1],    # 0: in dictionary, -1: hunspell
                len(x[0]), # length of word ascending
                x[0],      # alphabetical
            ))[0:MAX_WORDS]
        self._suggest_cache[input_phrase] = sorted_suggestions
        return sorted_suggestions

BENCHMARK = True

def main() -> None:
    '''
    Used for testing and profiling.

    “python3 hunspell_suggest.py”

    runs some tests and prints profiling data.

    Exits with the number of failed doctests as the exit status.
    '''
    log_handler = logging.StreamHandler(stream=sys.stderr)
    LOGGER.setLevel(logging.DEBUG)
    LOGGER.addHandler(log_handler)

    if BENCHMARK:
        import cProfile
        import pstats
        profile = cProfile.Profile()
        profile.enable()

    import doctest
    (failed, dummy_attempted) = doctest.testmod()

    if BENCHMARK:
        profile.disable()
        stats = pstats.Stats(profile)
        stats.strip_dirs()
        stats.sort_stats('cumulative')
        stats.print_stats('hunspell', 25)
        stats.print_stats('enchant', 25)

    LOGGER.info('itb_util.remove_accents() cache info: %s',
                itb_util.remove_accents.cache_info())

    sys.exit(failed)

if __name__ == "__main__":
    main()