1"""
2The main "suggest correction for this misspelling" module.
3
4On a bird-eye view level, suggest does:
5
6* tries small word "edits" (remove letters, insert letters, swap letters) and checks (with the help
7  of :mod:`lookup  <spylls.hunspell.algo.lookup>`) there are any valid ones
8* if no good suggestions found, tries "ngram-based" suggestions (calculating ngram-based distance to
9  all dictionary words and select the closest ones), handled by
10  :mod:`ngram_suggest <spylls.hunspell.algo.ngram_suggest>`
11* if possible, tries metaphone-based suggestions, handled by :mod:`phonet_suggest <spylls.hunspell.algo.phonet_suggest>`
12
13Note that Spylls's implementation takes one liberty comparing to Hunspell's:
14In Hunspell, ngram suggestions (select all words from dictionary that ngram-similar => produce suggestions)
15and phonetic suggestions (select all words from dictionary that phonetically similar => produce suggestions)
16are done in the same cycle, because they both iterate through entire dictionary. Spylls does it
17in two separate cycles, again, for the sake of clarity (note that dictionaries with metaphone
18transformation rules defined are extremely rare).
19
20To follow algorithm details, start reading from :meth:`Suggest.__call__`
21
22.. toctree::
23  :caption: Specific algorithms
24
25  algo_ngram_suggest
26  algo_phonet_suggest
27
28.. autoclass:: Suggest
29
30Suggestion classes
31^^^^^^^^^^^^^^^^^^
32
33.. autoclass:: Suggestion
34    :members:
35.. autoclass:: MultiWordSuggestion
36    :members:
37
38"""
39
40from typing import Iterator, List, Set, Union
41
42import dataclasses
43from dataclasses import dataclass
44
45from spylls.hunspell import data
46from spylls.hunspell.algo.capitalization import Type as CapType
47from spylls.hunspell.algo import ngram_suggest, phonet_suggest, permutations as pmt
48
49MAXPHONSUGS = 2
50MAXSUGGESTIONS = 15
51GOOD_EDITS = ['spaceword', 'uppercase', 'replchars']
52
53
54@dataclass
55class Suggestion:
56    """
57    Suggestions is what Suggest produces internally to store enough information about some suggestion
58    to make sure it is a good one.
59    """
60
61    #: Actual suggestion text
62    text: str
63    #: How suggestion was produced, useful for debugging, typically same as the method
64    #: of the editing which led to this suggestion, like "badchar", "twowords", "phonet" etc.
65    kind: str
66
67    def __repr__(self):
68        return f"Suggestion[{self.kind}]({self.text})"
69
70    def replace(self, **changes):
71        return dataclasses.replace(self, **changes)
72
73
74@dataclass
75class MultiWordSuggestion:
76    """
77    Represents suggestion to split words into several.
78    """
79
80    #: List of words
81    words: List[str]
82    #: Same as :attr:`Suggestion.source`
83    source: str
84
85    #: Whether those words are allowed to be joined by dash. We should disallow it if the multi-word
86    #: suggestion was produced by :attr:`Aff.REP <spylls.hunspell.data.aff.Aff.REP>` table, see
87    #: :meth:`Suggest.edits` for details.
88    allow_dash: bool = True
89
90    def stringify(self, separator=' '):
91        return Suggestion(separator.join(self.words), self.source)
92
93    def __repr__(self):
94        return f"Suggestion[{self.source}]({self.words!r})"
95
96
97class Suggest:
98    """
99    ``Suggest`` object is created on :class:`Dictionary <spylls.hunspell.Dictionary>` reading. Typically,
100    you would not use it directly, but you might want for experiments::
101
102        >>> dictionary = Dictionary.from_files('dictionaries/en_US')
103        >>> suggest = dictionary.suggester
104
105        >>> [*suggest('spylls')]
106        ['spells', 'spills']
107
108        >>> for suggestion in suggest.suggestions('spylls'):
109        ...    print(suggestion)
110        Suggestion[badchar](spell)
111        Suggestion[badchar](spill)
112
113    See :meth:`__call__` as the main entry point for algorithm explanation.
114
115    **Main methods**
116
117    .. automethod:: __call__
118    .. automethod:: suggestions
119
120    **Suggestion types**
121
122    .. automethod:: edits
123    .. automethod:: ngram_suggestions
124    .. automethod:: phonet_suggestions
125    """
126    def __init__(self, aff: data.Aff, dic: data.Dic, lookup):
127        self.aff = aff
128        self.dic = dic
129        self.lookup = lookup
130
131        # TODO: also NONGRAMSUGGEST and ONLYUPCASE
132        bad_flags = {*filter(None, [self.aff.FORBIDDENWORD, self.aff.NOSUGGEST, self.aff.ONLYINCOMPOUND])}
133
134        self.words_for_ngram = [word for word in self.dic.words if not bad_flags.intersection(word.flags)]
135
136    def __call__(self, word: str) -> Iterator[str]:
137        """
138        Outer "public" interface: returns a list of all valid suggestions, as strings.
139
140        Method returns a generator, so it is up to client code to fetch as many suggestions as it
141        needs::
142
143            >>> suggestions = suggester('unredable')
144            <generator object Suggest.__call__ at 0x7f74f5056350>
145            >>> suggestions.__next__()
146            'unreadable'
147
148        Note that suggestion to split words in two also returned as a single string, with a space::
149
150            >>> [*suggester('badcat')]
151            ['bad cat', 'bad-cat', 'baccarat']
152
153        Internally, the method just calls :meth:`suggestions` (which returns instances of :class:`Suggestion`)
154        and yields suggestion texts.
155
156        Args:
157            word: Word to check
158        """
159        yield from (suggestion.text for suggestion in self.suggestions(word))
160
161    def suggestions(self, word: str) -> Iterator[Suggestion]:  # pylint: disable=too-many-statements
162        """
163        Main suggestion search loop. What it does, in general, is:
164
165        * generates possible misspelled word cases (for ex., "KIttens" in dictionary might've been
166          'KIttens', 'kIttens', 'kittens', or 'Kittens')
167        * produces word edits with :meth:`edits` (with the help of
168          :mod:`permutations <spylls.hunspell.algo.permutations>` module), checks them with
169          :class:`Lookup <spylls.hunspell.algo.lookup.Lookup>`, and decides if that's maybe enough
170        * but if it is not (and if .aff settings allow), ngram-based suggestions are produced with
171          :meth:`ngram_suggestions`, and phonetically similar suggestions with :meth:`phonet_suggestions`
172
173        That's very simplified explanation, read the code!
174
175        Args:
176            word: Word to check
177        """
178
179        # Whether some suggestion (permutation of the word) is an existing and allowed word,
180        # just delegates to Lookup
181        def is_good_suggestion(word):
182            # Note that instead of using Lookup's main method, we just see if there is any good forms
183            # of this exact word, avoiding ICONV and trying to break word by dashes.
184            return any(self.lookup.good_forms(word, capitalization=False, allow_nosuggest=False))
185
186        # The suggestion is considered forbidden if there is ANY homonym in dictionary with flag
187        # FORBIDDENWORD. Besides marking swearing words, this feature also allows to include in
188        # dictionaries known "correctly-looking but actually non-existent" forms, which might important
189        # with very flexive languages.
190        def is_forbidden(word):
191            return self.aff.FORBIDDENWORD and self.dic.has_flag(word, self.aff.FORBIDDENWORD)
192
193        # This set will gather all good suggestions that were already returned (in order, for example,
194        # to not return same suggestion twice)
195        handled: Set[str] = set()
196
197        # Suggestions that are already considered good are passed through this method, which converts
198        # it to proper capitalization form, and then either yields it (if it is not forbidden,
199        # hadn't already seen, etc), or just does nothing.
200        # Method is quite lengthy, but is nested because updates and reuses ``handled`` local var
201        def handle_found(suggestion, *, check_inclusion=False):
202            text = suggestion.text
203
204            # If any of the homonyms has KEEPCASE flag, we shouldn't coerce it from the base form.
205            # But CHECKSHARPS flag presence changes the meaning of KEEPCASE...
206            if (self.aff.KEEPCASE and self.dic.has_flag(text, self.aff.KEEPCASE) and not
207                    (self.aff.CHECKSHARPS and 'ß' in text)):
208                # Don't try to change text's case
209                pass
210            else:
211                # "Coerce" suggested text from the capitalization that it has in the dictionary, to
212                # the capitalization of the misspelled word. E.g., if misspelled was "Kiten", suggestion
213                # is "kitten" (how it is in the dictionary), and coercion (what we really want
214                # to return to user) is "Kitten"
215                text = self.aff.casing.coerce(text, captype)
216                # ...but if this particular capitalized form is forbidden, return back to original text
217                if text != suggestion.text and is_forbidden(text):
218                    text = suggestion.text
219
220                # "aNew" will suggest "a new", here we fix it back to "a New"
221                if captype in [CapType.HUH, CapType.HUHINIT] and ' ' in text:
222                    pos = text.find(' ')
223                    if text[pos + 1] != word[pos] and text[pos + 1].upper() == word[pos]:
224                        text = text[:pos+1] + word[pos] + text[pos+2:]
225
226            # If the word is forbidden, nothing more to do
227            if is_forbidden(text):
228                return
229
230            # Finally, OCONV table in .aff-file might specify what chars to replace in suggestions
231            # (for example, "'" to proper typographic "’", or common digraphs)
232            text = self.aff.OCONV(text) if self.aff.OCONV else text
233
234            # If we already seen this suggestion, nothing to do
235            # Note that this should happen AFTER the OCONV: it sometimes changes the content significantly.
236            # For example, Dutch dictionary recodes "ij" into ligature (one character representing this
237            # two letters) to enforce proper capitalization (and decodes it back on OCONV).
238            if text in handled:
239                return
240
241            # Sometimes we want to skip suggestions even if they are same as PARTS of already
242            # seen ones: for examle, ngram-based suggestions might produce very similar forms, like
243            # "impermanent" and "permanent" -- both of them are correct, but if the first is
244            # closer (by length/content) to misspelling, there is no point in suggesting the second
245            if check_inclusion and any(previous.lower() in text.lower() for previous in handled):
246                return
247
248            # Remember we seen it
249            handled.add(text)
250
251            # And here we are!
252            yield suggestion.replace(text=text)
253
254        # **Start of the main suggest code**
255
256        # First, produce all possible "good capitalizations" from the provided word. For example,
257        # if the word is "MsDonalds", good capitalizations (what it might've been in dictionary) are
258        # "msdonalds" (full lowercase) "msDonalds" (first letter lowercased), or maybe "Msdonalds"
259        # (only first letter capitalized). Note that "MSDONALDS" (it should've been all caps) is not
260        # produced as a possible good form, but checked separately in ``edits``
261        captype, variants = self.aff.casing.corrections(word)
262
263        # Check a special case: if it is possible that words would be possible to be capitalized
264        # on compounding, then we check capitalized form of the word. If it is correct, that's the
265        # only suggestion we ever need.
266        if self.aff.FORCEUCASE and captype == CapType.NO:
267            for capitalized in self.aff.casing.capitalize(word):
268                if is_good_suggestion(capitalized):
269                    yield from handle_found(Suggestion(capitalized.capitalize(), 'forceucase'))
270                    return  # No more need to check anything
271
272        good_edits_found = False
273
274        # Now, for all capitalization variant
275        for idx, variant in enumerate(variants):
276            # If it is different from original capitalization, and is good, we suggest it
277            if idx > 0 and is_good_suggestion(variant):
278                yield from handle_found(Suggestion(variant, 'case'))
279
280            # Now we do two rounds:
281            # * generate edits and check whether they are valid non-compound words,
282            # * generate the same edits again and check whether they are valid compound words
283            #
284            # The reason of splitting the checks is: a) optimization (checking whether the word is
285            # a valid compound is much more pricey) and b) suggestion quality (it is considered that
286            # non-compound words are more probable).
287            # There are cases when this algorithm brings problems: for example, "11thhour" will NOT
288            # suggest "11th hour" -- because "11th" is compound word, and "hour" is not, but we
289            # first check whether the splitting is correct assuming all words are non-compound, and
290            # then check whether it is correct assuming both are compound. That's how it is in Hunspell,
291            # and Spylls doesn't tries to fix that.
292
293            nocompound = False
294
295            # 1. Only generate suggestions that are correct NOT COMPOUND words.
296            for suggestion in self.edit_suggestions(variant, handle_found, limit=MAXSUGGESTIONS, compounds=False):
297                yield suggestion
298                # remember if any "good" edits was found -- in this case we don't need
299                # ngram suggestions
300                good_edits_found = good_edits_found or (suggestion.kind in GOOD_EDITS)
301                # If any of those kinds are found, we don't need to try _any_ compound suggestions
302                if suggestion.kind in ['uppercase', 'replchars', 'mapchars']:
303                    nocompound = True
304                # We might break from the method immediately if the suggestion is to split word
305                # with space, and this phrase is _in the dictionary as a whole_: "alot" => "a lot".
306                # Then we don't need ANY other suggestions. This is a trick to enforce suggesting
307                # to break commonly misspelled phrases withot oversuggesting
308                if suggestion.kind == 'spaceword':
309                    return
310
311            # 2. Only generate suggestions that are correct COMPOUND words
312            if not nocompound:
313                for suggestion in self.edit_suggestions(variant, handle_found,
314                                                        limit=self.aff.MAXCPDSUGS, compounds=True):
315                    yield suggestion
316                    good_edits_found = good_edits_found or (suggestion.kind in GOOD_EDITS)
317
318        if good_edits_found:
319            return
320
321        # One additional pass at suggesting: if the word contains dashes, and there were no dash-containing
322        # suggestions, we should try to spellcheck it separately... But only one misspelled part is allowed.
323        if '-' in word and not any('-' in sug for sug in handled):
324            chunks = word.split('-')
325            for idx, chunk in enumerate(chunks):
326                # If this chunk of the word is misspelled...
327                if not is_good_suggestion(chunk):
328                    # ...try all suggestions for this separate chunk
329                    for sug in self(chunk):
330                        candidate = '-'.join([*chunks[:idx], sug, *chunks[idx+1:]])
331                        # And check if the whole word with this chunk replaced is a good word. Note that
332                        # we use lookup's main method here, which will also break it into words by
333                        # dashes. It is done (instead of just checking chunk-by-chunk), because there
334                        # are ambiguities how to split: if we are fixing "quas-Afro-American"->"quasi-Afro-American",
335                        # and "Afro-American" is present in the dictionary as a whole word, it is better
336                        # to delegate search for the right breaking to the Lookup.
337                        if self.lookup(candidate):
338                            yield Suggestion(candidate, 'dashes')
339                    # after only one misspelled chunk, we stop the search anyways, whether we found some
340                    # sugestions or not
341                    break
342
343        # If there was no good edits that were valid words, we might try
344        # ngram-based suggestion algorithm: it is slower, but able to correct severely misspelled words
345
346        ngrams_seen = 0
347        for sug in self.ngram_suggestions(word, handled=handled):
348            for res in handle_found(Suggestion(sug, 'ngram'), check_inclusion=True):
349                ngrams_seen += 1
350                yield res
351            if ngrams_seen >= self.aff.MAXNGRAMSUGS:
352                break
353
354        # Also, if metaphone transformations (phonetic coding of words) were defined in the .aff file,
355        # we might try to use them to produce suggestions
356
357        phonet_seen = 0
358        for sug in self.phonet_suggestions(word):
359            for res in handle_found(Suggestion(sug, 'phonet'), check_inclusion=True):
360                phonet_seen += 1
361                yield res
362            if phonet_seen >= MAXPHONSUGS:
363                break
364
365    def edit_suggestions(self, word: str, handle_found, *, compounds: bool, limit: int) -> Iterator[Suggestion]:
366        def is_good_suggestion(word):
367            # Note that instead of using Lookup's main method, we just see if there is any good forms
368            # of this exact word, avoiding ICONV and trying to break word by dashes.
369            if compounds:
370                return any(self.lookup.good_forms(word, capitalization=False, allow_nosuggest=False, affix_forms=False))
371            return any(self.lookup.good_forms(word, capitalization=False, allow_nosuggest=False, compound_forms=False))
372
373        # For some set of suggestions, produces only good ones:
374        def filter_suggestions(suggestions):
375            for suggestion in suggestions:
376                # For multiword suggestion,
377                if isinstance(suggestion, MultiWordSuggestion):
378                    # ...if all of the words is correct
379                    if all(is_good_suggestion(word) for word in suggestion.words):
380                        # ...we just convert it to plain text suggestion "word1 word2"
381                        yield suggestion.stringify()
382                        if suggestion.allow_dash:
383                            # ...and "word1-word2" if allowed
384                            yield suggestion.stringify('-')
385                else:
386                    # Singleword suggestion is just yielded if it is good
387                    if is_good_suggestion(suggestion.text):
388                        yield suggestion
389
390        count = 0
391
392        for suggestion in filter_suggestions(self.edits(word)):
393            # Then, for each of those correct suggestions,
394            # we run them through handle_found (which performs some final transformations and
395            # checks, and may handle 1 or 0 suggestions; 0 in the case it was already seen or
396            # is a forbidden word)
397            for res in handle_found(suggestion):
398                yield res
399                count += 1
400
401                if count > limit:
402                    return
403
404    def edits(self, word: str) -> Iterator[Union[Suggestion, MultiWordSuggestion]]:
405        """
406        Produces all possible word edits in a form of :class:`Suggestion` or :class:`MultiWordSuggestion`.
407        Note that:
408
409        * order is important, that's the order user will receive the suggestions (and the further the
410          suggestion type in the order, the more probably it would be dropped due to suggestion count
411          limit)
412        * suggestion "source" tag is important: :meth:`suggestions` uses it to distinguish between
413          good and questionble edits (if there were any good ones, ngram suggestion wouldn't
414          be used)
415
416        Args:
417            word: Word to mutate
418        """
419        # suggestions for an uppercase word (html -> HTML)
420        yield Suggestion(self.aff.casing.upper(word), 'uppercase')
421
422        # REP table in affix file specifies "typical misspellings", and we try to replace them.
423        # Note that the content of REP table taken not only from aff file, but also from "ph:" tag
424        # in dictionary (lines looking like ``hello ph:helo`` put word "hello" in dictionary, and
425        # "helo -> hello" in REP-table), see read_dic.
426        #
427        # It might return several words if REP table has "REP <something> <some>_<thing>" (_ is code
428        # for space).
429        #
430        # ...in this case we should suggest both "<word1> <word2>" as one dictionary entry, and
431        # "<word1>" "<word2>" as a sequence -- but clarifying this sequence might NOT be joined by "-"
432        for suggestion in pmt.replchars(word, self.aff.REP):
433            if isinstance(suggestion, list):
434                yield Suggestion(' '.join(suggestion), 'replchars')
435                yield MultiWordSuggestion(suggestion, 'replchars', allow_dash=False)
436            else:
437                yield Suggestion(suggestion, 'replchars')
438
439        for words in pmt.twowords(word):
440            yield Suggestion(' '.join(words), 'spaceword')
441
442            if self.use_dash():
443                # "alot" => "a-lot"
444                yield Suggestion('-'.join(words), 'spaceword')
445
446        # MAP in aff file specifies related chars (for example, "ïi"), and mapchars produces all
447        # changes of the word with related chars replaced. For example, "naive" produces "naïve".
448        for suggestion in pmt.mapchars(word, self.aff.MAP):
449            yield Suggestion(suggestion, 'mapchars')
450
451        # Try to swap adjacent characters (ktiten -> kitten), produces all possible forms with ONE
452        # swap; but for 4- and 5-letter words tries also two swaps "ahev -> have"
453        for suggestion in pmt.swapchar(word):
454            yield Suggestion(suggestion, 'swapchar')
455
456        # Try longer swaps (up to 4 chars distance)
457        for suggestion in pmt.longswapchar(word):
458            yield Suggestion(suggestion, 'longswapchar')
459
460        # Try to replace chars by those close on keyboard ("wueue" -> "queue"), KEY in aff file specifies
461        # keyboard layout.
462        for suggestion in pmt.badcharkey(word, self.aff.KEY):
463            yield Suggestion(suggestion, 'badcharkey')
464
465        # Try remove character (produces all forms with one char removed: "clat" => "lat", "cat", "clt", "cla")
466        for suggestion in pmt.extrachar(word):
467            yield Suggestion(suggestion, 'extrachar')
468
469        # Try insert character (from set of all possible language chars specified in aff), produces
470        # all forms with any of the TRY chars inserted in all possible positions
471        for suggestion in pmt.forgotchar(word, self.aff.TRY):
472            yield Suggestion(suggestion, 'forgotchar')
473
474        # Try to move a character forward and backwars:
475        #  "rnai" => "nari", "nair", "rain" (forward: r moved 2 and 3 chars, n moved 2 chars)
476        #         => "rina", "irna", "arni" (backward: i moved 2 and 3 chars, a moved 2 chars)
477        # (no 1-char movements necessary, they are covered by swapchar above)
478        for suggestion in pmt.movechar(word):
479            yield Suggestion(suggestion, 'movechar')
480
481        # Try replace each character with any of other language characters
482        for suggestion in pmt.badchar(word, self.aff.TRY):
483            yield Suggestion(suggestion, 'badchar')
484
485        # Try fix two-character doubling: "chickcken" -> "chicken" (one-character doubling is
486        # already covered by extrachar)
487        for suggestion in pmt.doubletwochars(word):
488            yield Suggestion(suggestion, 'doubletwochars')
489
490        if not self.aff.NOSPLITSUGS:
491            # Try split word by space in all possible positions
492            # NOSPLITSUGS option in aff prohibits it, it is important, say, for Scandinavian languages
493            for suggestion_pair in pmt.twowords(word):
494                yield MultiWordSuggestion(suggestion_pair, 'twowords', allow_dash=self.use_dash())
495
496    def ngram_suggestions(self, word: str, handled: Set[str]) -> Iterator[str]:
497        """
498        Produces ngram-based suggestions, by passing to
499        :meth:`ngram_suggest.ngram_suggest <spylls.hunspell.algo.ngram_suggest.ngram_suggest>` current
500        misspelling, already found suggestions and settings from .aff file.
501
502        See :mod:`ngram_suggest <spylls.hunspell.algo.ngram_suggest>`.
503
504        Args:
505            word: Misspelled word
506            handled: List of already handled (known) suggestions; it is reused in
507                     :meth:`ngram_suggest.filter_guesses <spylls.hunspell.algo.ngram_suggest.filter_guesses>`
508                     to decide whether we add "not really good" ngram-based suggestions to result
509        """
510        if self.aff.MAXNGRAMSUGS == 0:
511            return
512
513        yield from ngram_suggest.ngram_suggest(
514                    word.lower(),
515                    dictionary_words=self.words_for_ngram,
516                    prefixes=self.aff.PFX, suffixes=self.aff.SFX,
517                    known={*(word.lower() for word in handled)},
518                    maxdiff=self.aff.MAXDIFF,
519                    onlymaxdiff=self.aff.ONLYMAXDIFF,
520                    has_phonetic=(self.aff.PHONE is not None))
521
522    def phonet_suggestions(self, word: str) -> Iterator[str]:
523        """
524        Produces phonetical similarity-based suggestions, by passing to
525        :meth:`phonet_suggest.phonet_suggest <spylls.hunspell.algo.phonet_suggest.phonet_suggest>` current
526        misspelling and settings from .aff file.
527
528        See :mod:`phonet_suggest <spylls.hunspell.algo.phonet_suggest.phonet_suggest>`.
529
530        Args:
531            word: Misspelled word
532        """
533        if not self.aff.PHONE:
534            return
535
536        yield from phonet_suggest.phonet_suggest(word, dictionary_words=self.words_for_ngram, table=self.aff.PHONE)
537
538    def use_dash(self) -> bool:
539        """
540        Yeah, that's how hunspell defines whether words can be split by dash in this language:
541        either dash is explicitly mentioned in TRY directive, or TRY directive indicates the
542        language uses Latinic script. So dictionaries omiting TRY directive, or for languages with,
543        say, Cyrillic script not including "-" in it, will never suggest "foobar" => "foo-bar",
544        even if it is the perfectly normal way to spell.
545        """
546        return '-' in self.aff.TRY or 'a' in self.aff.TRY
547