1# -*- coding: utf-8 -*-
2# vim:et sts=4 sw=4
3#
4# ibus-typing-booster - A completion input method for IBus
5#
6# Copyright (c) 2012-2013 Anish Patil <apatil@redhat.com>
7# Copyright (c) 2012-2016 Mike FABIAN <mfabian@redhat.com>
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program.  If not, see <http://www.gnu.org/licenses/>
21
'''A module used by ibus-typing-booster to suggest words by using the
hunspell dictionaries.
24
25'''
26
27from typing import Dict
28from typing import Tuple
29from typing import List
30import os
31import sys
32import unicodedata
33IMPORT_REGEX_SUCCESFUL = False
34try:
35    import regex # type: ignore
36    IMPORT_REGEX_SUCCESFUL = True
37    # Enable new improved regex engine instead of backwards compatible
38    # v0.  regex.match('ß', 'SS', regex.IGNORECASE) matches only with
39    # the improved version!  See also: https://pypi.org/project/regex/
40    regex.DEFAULT_VERSION = regex.VERSION1
41except (ImportError,):
42    # Use standard “re” module as a fallback:
43    import re
44import logging
45import itb_util
46
47LOGGER = logging.getLogger('ibus-typing-booster')
48
49DEBUG_LEVEL = int(0)
50
51IMPORT_ENCHANT_SUCCESSFUL = False
52IMPORT_HUNSPELL_SUCCESSFUL = False
53try:
54    import enchant # type: ignore
55    IMPORT_ENCHANT_SUCCESSFUL = True
56except (ImportError,):
57    try:
58        import hunspell # type: ignore
59        IMPORT_HUNSPELL_SUCCESSFUL = True
60    except (ImportError,):
61        pass
62
63IMPORT_LIBVOIKKO_SUCCESSFUL = False
64try:
65    import libvoikko # type: ignore
66    IMPORT_LIBVOIKKO_SUCCESSFUL = True
67except (ImportError,):
68    pass
69
# Maximum words that should be returned.
# This should be a rather big number in order not
# to throw away useful matches. But making it very huge
# makes the performance worse. For example when setting
# it to 1000, I see a noticeable delay when typing the first
# letter of a word until the candidate lookup table pops up.
76MAX_WORDS = 100
77
class Dictionary:
    '''A class to hold a hunspell dictionary

    Besides the word list of the dictionary, an object of one of the
    spellchecking backends (enchant, pyhunspell, or voikko) is kept
    when such a backend could be imported and initialized.
    '''
    def __init__(self, name='en_US') -> None:
        '''
        Initialize the dictionary and load it

        :param name: The name of the dictionary, for example “en_US”.
                     If the name is the string “None”, nothing is
                     loaded and the dictionary stays empty.
        '''
        if DEBUG_LEVEL > 1:
            LOGGER.debug('Dictionary.__init__(name=%s)\n', name)
        self.name = name
        self.language = self.name.split('_')[0]
        self.dic_path = ''
        self.encoding = 'UTF-8'
        self.words: List[str] = []
        # Tuples of (word, word with accents removed), only filled
        # for languages listed in itb_util.ACCENT_LANGUAGES:
        self.word_pairs: List[Tuple[str, str]] = []
        self.max_word_len = 0 # maximum length of words in this dictionary
        self.enchant_dict = None
        self.pyhunspell_object = None
        self.voikko: libvoikko.Voikko = None
        if self.name != 'None':
            self.load_dictionary()

    def load_dictionary(self) -> None:
        '''Load a hunspell dictionary and instantiate a
        enchant.Dict(), a hunspell.HunSpell(), or a libvoikko.Voikko()
        object.

        Nothing but the word list lookup is done when the word list
        of the dictionary is empty.
        '''
        if DEBUG_LEVEL > 0:
            LOGGER.debug('load_dictionary() ...\n')
        (self.dic_path,
         self.encoding,
         self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
        if not self.words:
            return
        if self.language in itb_util.ACCENT_LANGUAGES:
            keep = itb_util.ACCENT_LANGUAGES[self.language]
            self.word_pairs = [
                (word, itb_util.remove_accents(word, keep=keep))
                for word in self.words
            ]
        self.max_word_len = max(
            self.max_word_len, max(len(word) for word in self.words))
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'max_word_len = %s\n', self.max_word_len)
        if self.language == 'fi':
            # Finnish is spellchecked only with voikko, never with
            # enchant or pyhunspell:
            self.enchant_dict = None
            self.pyhunspell_object = None
            self.voikko = None
            if not IMPORT_LIBVOIKKO_SUCCESSFUL:
                LOGGER.warning(
                    'Language is “fi” but “import libvoikko” failed.')
                return
            self.voikko = libvoikko.Voikko('fi')
            return
        if IMPORT_ENCHANT_SUCCESSFUL:
            try:
                self.enchant_dict = enchant.Dict(self.name)
            except enchant.errors.DictNotFoundError:
                LOGGER.exception(
                    'Error initializing enchant for %s', self.name)
                self.enchant_dict = None
            except Exception:
                LOGGER.exception(
                    'Unknown error initializing enchant for %s',
                    self.name)
                self.enchant_dict = None
        elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
            # Exchange only the file name extension. A plain
            # str.replace('.dic', '.aff') would also rewrite “.dic”
            # occurring in a directory name of the path.
            aff_path = os.path.splitext(self.dic_path)[0] + '.aff'
            try:
                self.pyhunspell_object = hunspell.HunSpell(
                    self.dic_path, aff_path)
            except hunspell.HunSpellError:
                LOGGER.debug(
                    'Error initializing hunspell for %s', self.name)
                self.pyhunspell_object = None
            except Exception:
                LOGGER.debug(
                    'Unknown error initializing hunspell for %s',
                    self.name)
                self.pyhunspell_object = None

    def spellcheck_enchant(self, word: str) -> bool:
        '''
        Spellcheck a word using enchant

        :param word: The word to spellcheck
        :return: True if spelling is correct, False if not or unknown
        '''
        if not self.enchant_dict:
            return False
        # enchant does the right thing for all languages, including
        # Korean, if the input is a Unicode string in NFC.
        return self.enchant_dict.check(unicodedata.normalize('NFC', word))

    def spellcheck_pyhunspell(self, word: str) -> bool:
        '''
        Spellcheck a word using pyhunspell

        :param word: The word to spellcheck
        :return: True if spelling is correct, False if not or unknown
        '''
        if not self.pyhunspell_object:
            return False
        # pyhunspell needs its input passed in dictionary encoding.
        # and also returns in dictionary encoding.
        return self.pyhunspell_object.spell(
            unicodedata.normalize('NFC', word).encode(
                self.encoding, 'replace'))

    def spellcheck_voikko(self, word: str) -> bool:
        '''
        Spellcheck a word using voikko

        :param word: The word to spellcheck
        :return: True if spelling is correct, False if not or unknown
        '''
        if not self.voikko:
            return False
        # voikko works correctly if the input is a Unicode string in NFC.
        return self.voikko.spell(unicodedata.normalize('NFC', word))

    def spellcheck(self, word: str) -> bool:
        '''
        Spellcheck a word using enchant, pyhunspell, or voikko

        The first available backend in that order is used. Each
        backend gets the word normalized to NFC.

        :param word: The word to spellcheck
        :return: True if spelling is correct, False if not or unknown

        >>> d = Dictionary('en_US')
        >>> d.spellcheck('winter')
        True

        >>> d.spellcheck('winxer')
        False

        >>> d = Dictionary('None')
        >>> d.spellcheck('winter')
        False

        >>> d.spellcheck('winxer')
        False
        '''
        if self.enchant_dict:
            return self.spellcheck_enchant(word)
        if self.pyhunspell_object:
            return self.spellcheck_pyhunspell(word)
        if self.voikko:
            # Go through spellcheck_voikko() to get the NFC
            # normalization, callers may pass decomposed input.
            return self.spellcheck_voikko(word)
        return False

    def has_spellchecking(self) -> bool:
        '''
        Returns whether this dictionary supports spellchecking or not

        :return: True if this dictionary supports spellchecking, False if not

        Examples:

        >>> d = Dictionary('en_US')
        >>> d.has_spellchecking()
        True

        >>> d = Dictionary('zh_CN')
        >>> d.has_spellchecking()
        False

        >>> d = Dictionary('None')
        >>> d.has_spellchecking()
        False
        '''
        return bool(
            self.enchant_dict or self.pyhunspell_object or self.voikko)

    def spellcheck_suggest_enchant(self, word: str) -> List[str]:
        '''
        Return spellchecking suggestions for word using enchant

        :param word: The word to return spellchecking suggestions for
        :return: List of spellchecking suggestions, possibly empty.
        '''
        if not word or not self.enchant_dict:
            return []
        # enchant does the right thing for all languages, including
        # Korean, if the input is NFC. It takes Unicode strings and
        # returns Unicode strings, no encoding and decoding is
        # necessary, neither for Python2 nor for Python3.
        return [
            unicodedata.normalize(
                itb_util.NORMALIZATION_FORM_INTERNAL, x)
            for x in
            self.enchant_dict.suggest(unicodedata.normalize('NFC', word))
            ]

    def spellcheck_suggest_pyhunspell(self, word: str) -> List[str]:
        '''
        Return spellchecking suggestions for word using pyhunspell

        :param word: The word to return spellchecking suggestions for
        :return: List of spellchecking suggestions, possibly empty.
        '''
        if not word or not self.pyhunspell_object:
            return []
        # pyhunspell needs its input passed in dictionary encoding.
        return [
            unicodedata.normalize(
                itb_util.NORMALIZATION_FORM_INTERNAL, x)
            for x in
            self.pyhunspell_object.suggest(
                unicodedata.normalize('NFC', word).encode(
                    self.encoding, 'replace'))
            ]

    def spellcheck_suggest_voikko(self, word: str) -> List[str]:
        '''
        Return spellchecking suggestions for word using voikko

        :param word: The word to return spellchecking suggestions for
        :return: List of spellchecking suggestions, possibly empty.
        '''
        if not word or not self.voikko:
            return []
        return [
            unicodedata.normalize(
                itb_util.NORMALIZATION_FORM_INTERNAL, x)
            for x in
            self.voikko.suggest(unicodedata.normalize('NFC', word))
            ]

    def spellcheck_suggest(self, word: str) -> List[str]:
        '''
        Return spellchecking suggestions for word using enchant, pyhunspell or voikko

        :param word: The word to return spellchecking suggestions for
        :return: List of spellchecking suggestions, possibly empty.

        Results can be quite different depending on whether enchant or pyhunspell
        is used and in case of enchant whether hunspell, aspell,
        myspell, or ispell are used. So for the doctests, better don’t
        check the complete list of suggestions returned.

        Examples:

        >>> d = Dictionary('en_US')
        >>> 'Camel' in d.spellcheck_suggest('kamel')
        True

        >>> d.spellcheck_suggest('')
        []

        >>> d = Dictionary('None')
        >>> d.spellcheck_suggest('kamel')
        []
        '''
        if self.enchant_dict:
            return self.spellcheck_suggest_enchant(word)
        if self.pyhunspell_object:
            return self.spellcheck_suggest_pyhunspell(word)
        if self.voikko:
            return self.spellcheck_suggest_voikko(word)
        return []
337
class Hunspell:
    '''A class to suggest completions or corrections
    using a list of Hunspell dictionaries
    '''
    def __init__(self, dictionary_names=()) -> None:
        '''
        Initialize the dictionaries for the given names

        The debug level is read from the environment variable
        IBUS_TYPING_BOOSTER_DEBUG_LEVEL, it falls back to 0 if that
        variable is not set or is not an integer.

        :param dictionary_names: The names of the dictionaries to use,
                                 for example ['en_US', 'de_DE']
        '''
        global DEBUG_LEVEL
        try:
            DEBUG_LEVEL = int(str(os.getenv('IBUS_TYPING_BOOSTER_DEBUG_LEVEL')))
        except (TypeError, ValueError):
            DEBUG_LEVEL = int(0)
        if DEBUG_LEVEL > 1:
            if dictionary_names:
                LOGGER.debug(
                    'Hunspell.__init__(dictionary_names=%s)\n',
                    dictionary_names)
            else:
                LOGGER.debug('Hunspell.__init__(dictionary_names=())\n')
        self._suggest_cache: Dict[str, List[Tuple[str, int]]] = {}
        # Keep a private copy, the caller may change its own sequence
        # later without that going unnoticed here:
        self._dictionary_names: List[str] = list(dictionary_names)
        self._dictionaries: List[Dictionary] = []
        self.init_dictionaries()

    def init_dictionaries(self) -> None:
        '''Initialize the hunspell dictionaries

        Also empties the cache of suggestions.
        '''
        if DEBUG_LEVEL > 1:
            if self._dictionary_names:
                LOGGER.debug(
                    'Hunspell.init_dictionaries() dictionary_names=%s\n',
                    self._dictionary_names)
            else:
                LOGGER.debug(
                    'Hunspell.init_dictionaries() dictionary_names=()\n')
        self._suggest_cache = {}
        self._dictionaries = [
            Dictionary(name=dictionary_name)
            for dictionary_name in self._dictionary_names
        ]

    def get_dictionary_names(self) -> List[str]:
        '''Returns a copy of the list of dictionary names.

        It is important to return a copy, we do not want to change
        the private member variable directly.'''
        return list(self._dictionary_names)

    def set_dictionary_names(self, dictionary_names: List[str]):
        '''Sets the list of dictionary names.

        If the new list of dictionary names differs from the existing
        one, re-initialize the dictionaries.

        :param dictionary_names: The new names of the dictionaries to use
        '''
        # Private copy, see __init__():
        new_names = list(dictionary_names)
        if new_names != list(self._dictionary_names):
            if set(new_names) != set(self._dictionary_names):
                # Some dictionaries are really different, reinitialize:
                self._dictionary_names = new_names
                self.init_dictionaries()
            else:
                # Only the order of dictionaries has changed.
                # Reinitializing wastes time, just reorder the
                # dictionaries:
                self._dictionary_names = new_names
                self._dictionaries = [
                    dictionary
                    for name in new_names
                    for dictionary in self._dictionaries
                    if dictionary.name == name
                ]
                # The score of a suggestion may depend on the order
                # in which the dictionaries are consulted, cached
                # results are therefore not valid anymore:
                self._suggest_cache = {}
        if DEBUG_LEVEL > 1:
            LOGGER.debug('set_dictionary_names(%s):\n', dictionary_names)
            for dictionary in self._dictionaries:
                LOGGER.debug('%s\n', dictionary.name)

    def spellcheck(self, input_phrase: str) -> bool:
        '''
        Checks if a string is likely to be spelled correctly checking
        multiple dictionaries

        :param input_phrase: A string to spellcheck
        :return: True if it is more likely to be spelled correctly,
                 False if it is more likely to be spelled incorrectly.
                 In detail this means:
                 True:
                     - If it is a correctly spelled word in at least one of
                       the dictionaries supporting spellchecking
                     - None of the dictionaries support spellchecking
                     - Contains spaces, spellchecking cannot work
                 else False.

        Examples:

        >>> h = Hunspell(['en_US', 'de_DE', 'ja_JP'])
        >>> h.spellcheck('Hello')
        True

        >>> h.spellcheck('Grüße')
        True

        >>> h.spellcheck('Gruße')
        False

        >>> h = Hunspell(['en_US', 'ja_JP'])
        >>> h.spellcheck('Grüße')
        False

        >>> h = Hunspell(['ja_JP'])
        >>> h.spellcheck('Grüße')
        True

        >>> h = Hunspell(['en_US', 'None'])
        >>> h.spellcheck('Grüße')
        False

        >>> h = Hunspell(['None'])
        >>> h.spellcheck('Grüße')
        True
        '''
        if ' ' in input_phrase:
            return True
        spellchecking_dictionaries = [
            dictionary
            for dictionary in self._dictionaries
            if dictionary.has_spellchecking()
        ]
        if not spellchecking_dictionaries:
            # Nothing can judge the spelling, assume it is correct:
            return True
        return any(
            dictionary.spellcheck(input_phrase)
            for dictionary in spellchecking_dictionaries)

    @staticmethod
    def _compile_prefix_pattern(prefix: str):
        '''Compile a case insensitive pattern matching the literal prefix

        Uses the “regex” module if it could be imported and the
        standard “re” module as a fallback.

        :param prefix: The text candidates have to start with
        :return: A compiled pattern, to be used with its match() method
        '''
        if IMPORT_REGEX_SUCCESFUL:
            return regex.compile(regex.escape(prefix), regex.IGNORECASE)
        return re.compile(re.escape(prefix), re.IGNORECASE)

    def suggest(self, input_phrase: str) -> List[Tuple[str, int]]:
        # pylint: disable=line-too-long
        '''Return completions or corrections for the input phrase

        :param input_phrase: A string to find completions or corrections for

        Returns a list of tuples of the form (<word>, <score>)
                <score> can have these values:
                    0: This is a completion, i.e. input_phrase matches
                       the beginning of <word> (accent insensitive match)
                   -1: This is a spell checking correction from hunspell
                       (i.e. either from enchant or pyhunspell)

        At most MAX_WORDS tuples are returned. Results are cached by
        the input phrase in internal normalization form.

        Examples:

        (Attention, the return values are in internal
        normalization form ('NFD'))

        >>> h = Hunspell(['de_DE', 'cs_CZ'])
        >>> h.suggest('Geschwindigkeitsubertre')[0]
        ('Geschwindigkeitsübertretungsverfahren', 0)

        >>> h.suggest('Geschwindigkeitsübertretungsverfahren')[0]
        ('Geschwindigkeitsübertretungsverfahren', 0)

        >>> h.suggest('Glühwürmchen')[0]
        ('Glühwürmchen', 0)

        >>> h.suggest('Alpengluhen')[0]
        ('Alpenglühen', 0)

        >>> h.suggest('filosofictejs')[0]
        ('filosofičtější', 0)

        >>> h.suggest('filosofičtější')[0]
        ('filosofičtější', 0)

        >>> h.suggest('filosofičtějš')[0]
        ('filosofičtější', 0)

        >>> h = Hunspell(['it_IT'])
        >>> h.suggest('principianti')
        [('principianti', 0), ('principiati', -1), ('principiante', -1), ('principiarti', -1), ('principiasti', -1)]

        >>> h = Hunspell(['es_ES'])
        >>> h.suggest('teneis')
        [('tenéis', 0), ('tenes', -1), ('tenis', -1), ('teneos', -1), ('tienes', -1), ('te neis', -1), ('te-neis', -1)]

        >>> h.suggest('tenéis')[0]
        ('tenéis', 0)

        >>> h = Hunspell(['en_US'])
        >>> ('Camelot', 0) in h.suggest('camel') # Should work with aspell and hunspell
        True

        >>> h = Hunspell(['fr_FR'])
        >>> h.suggest('differemmen')
        [('différemment', 0)]

        >>> h = Hunspell(['None'])
        >>> h.suggest('camel')
        []

        >>> h = Hunspell(['None', 'en_US'])
        >>> ('Camelot', 0) in h.suggest('camel') # Should work with aspell and hunspell
        True
        '''
        # pylint: enable=line-too-long
        # Make sure input_phrase is in the internal normalization form
        # (NFD). This has to be done before the cache is consulted,
        # the cache is filled using the normalized phrase as the key.
        # Looking it up with the phrase as typed would never find the
        # results for input which is not already in that form.
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
        if input_phrase in self._suggest_cache:
            return self._suggest_cache[input_phrase]
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                "Hunspell.suggest() input_phrase=%(ip)s\n",
                {'ip': input_phrase.encode('UTF-8')})
        # http://pwet.fr/man/linux/fichiers_speciaux/hunspell says:
        #
        # > A dictionary file (*.dic) contains a list of words, one per
        # > line. The first line of the dictionaries (except personal
        # > dictionaries) contains the word count. Each word may
        # > optionally be followed by a slash ("/") and one or more
        # > flags, which represents affixes or special attributes.
        #
        # I.e. if '/' is already contained in the input, it cannot
        # match a word in the dictionary and we return an empty list
        # immediately:
        if '/' in input_phrase:
            self._suggest_cache[input_phrase] = []
            return []

        suggested_words: Dict[str, int] = {}
        for dictionary in self._dictionaries:
            if not dictionary.words:
                continue
            # Only used for dictionaries having word_pairs:
            input_phrase_no_accents = ''
            if dictionary.word_pairs:
                input_phrase_no_accents = itb_util.remove_accents(
                    input_phrase,
                    keep=itb_util.ACCENT_LANGUAGES[dictionary.language])
            # If the input phrase is longer than the maximum
            # word length in a dictionary, don’t try
            # complete it, it just wastes time then.
            if len(input_phrase) <= dictionary.max_word_len:
                if dictionary.word_pairs:
                    # Accent insensitive match against the variants
                    # of the words which have the accents removed:
                    pattern = self._compile_prefix_pattern(
                        input_phrase_no_accents)
                    suggested_words.update(
                        (word, 0)
                        for (word, word_no_accents) in dictionary.word_pairs
                        if pattern.match(word_no_accents))
                else:
                    pattern = self._compile_prefix_pattern(input_phrase)
                    suggested_words.update(
                        (word, 0)
                        for word in dictionary.words
                        if pattern.match(word))
            # Spellchecking is used only for input of 4 or more
            # characters:
            if len(input_phrase) < 4:
                continue
            if dictionary.spellcheck(input_phrase):
                # This is a valid word in this dictionary.
                # It might have been missed by the
                # matching above because the dictionary
                # might not contain all possible word
                # forms (The prefix and suffix information
                # has been ignored). But the spell checker
                # knows about this, if the spell checker
                # thinks it is a correct word, it must be
                # counted as a match of course:
                suggested_words[input_phrase] = 0
            for raw_suggestion in dictionary.spellcheck_suggest(input_phrase):
                suggestion = unicodedata.normalize(
                    itb_util.NORMALIZATION_FORM_INTERNAL, raw_suggestion)
                if suggestion in suggested_words:
                    continue
                if (dictionary.word_pairs
                        and
                        itb_util.remove_accents(
                            suggestion,
                            keep=itb_util.ACCENT_LANGUAGES[
                                dictionary.language])
                        == input_phrase_no_accents):
                    # Differs from the input only in accents, count
                    # it like a completion:
                    suggested_words[suggestion] = 0
                else:
                    suggested_words[suggestion] = -1
        sorted_suggestions = sorted(
            suggested_words.items(),
            key=lambda x: (
                - x[1],    # 0: in dictionary, -1: hunspell
                len(x[0]), # length of word ascending
                x[0],      # alphabetical
            ))[0:MAX_WORDS]
        self._suggest_cache[input_phrase] = sorted_suggestions
        return sorted_suggestions
642
# When True, main() profiles the doctest run with cProfile and
# prints the profiling statistics afterwards.
BENCHMARK = True
644
def main():
    '''
    Entry point for testing and profiling.

    “python3 hunspell_suggest.py”

    runs the doctests of this module and, if BENCHMARK is set,
    prints profiling data. The exit status is the number of
    failed doctests.
    '''
    # Send all log messages of this module to stderr:
    stderr_handler = logging.StreamHandler(stream=sys.stderr)
    LOGGER.setLevel(logging.DEBUG)
    LOGGER.addHandler(stderr_handler)

    if BENCHMARK:
        import cProfile
        import pstats
        profiler = cProfile.Profile()
        profiler.enable()

    import doctest
    test_results = doctest.testmod()
    failure_count = test_results[0]

    if BENCHMARK:
        profiler.disable()
        profile_stats = pstats.Stats(profiler)
        profile_stats.strip_dirs()
        profile_stats.sort_stats('cumulative')
        # Print at most 25 entries for each of the two name filters:
        for name_filter in ('hunspell', 'enchant'):
            profile_stats.print_stats(name_filter, 25)

    LOGGER.info('itb_util.remove_accents() cache info: %s',
                itb_util.remove_accents.cache_info())

    sys.exit(failure_count)
678
# Run the doctests (and the optional profiling) when this file is
# executed as a script:
if __name__ == "__main__":
    main()
681