1"""
2.. autofunction:: ngram_suggest
3
4.. autofunction:: forms_for
5.. autofunction:: filter_guesses
6
7Scoring
8^^^^^^^
9
10.. autofunction:: detect_threshold
11.. autofunction:: root_score
12.. autofunction:: rough_affix_score
13.. autofunction:: precise_affix_score
14
15"""
16
17from typing import Iterator, Tuple, List, Set, Dict
18from operator import itemgetter
19import heapq
20
21from spylls.hunspell import data
22import spylls.hunspell.algo.string_metrics as sm
23
24
25MAX_ROOTS = 100
26MAX_GUESSES = 200
27
28
29def ngram_suggest(misspelling: str, *,
30                  dictionary_words: List[data.dic.Word],
31                  prefixes: Dict[str, List[data.aff.Prefix]],
32                  suffixes: Dict[str, List[data.aff.Suffix]],
33                  known: Set[str], maxdiff: int, onlymaxdiff: bool = False,
34                  has_phonetic: bool = False) -> Iterator[str]:
35    """
36    Try to suggest all possible variants for misspelling based on ngram-similarity.
37
38    Internally:
39
40    * calculates misspelling similarity to all dictionary word stems with :meth:`root_score`, and
41      chooses the best ones
42    * of those words, produces all forms possible with suffixes/prefixes by :meth:`forms_for`,
43      calculates their score against misspelling with :meth:`rough_affix_score` and chooses the best ones,
44      using threshold calculated in :meth:`detect_threshold`
45    * calculates more precise (but more time-consuming) score for those with :meth:`precise_affix_score` and
46      sorts by it
47    * filters suggestions depending on their score with :meth:`filter_guesses`
48
49    Args:
50        misspelling: Misspelled word
51        dictionary_words: all entries from dictionary to iterate against (without forbidden, ``ONLYINCOMPOUND``
52                          and such)
53        prefixes: all prefixes from .aff file to try produce forms with
54        suffixes: all suffixes from .aff file to try produce forms with
55        maxdiff: contents of :attr:`Aff.MAXDIFF <spylls.hunspell.data.aff.Aff.MAXDIFF>` (changes amount of suggestions)
56        onlymaxdiff: contents of :attr:`Aff.ONLYMAXDIFF <spylls.hunspell.data.aff.Aff.ONLYMAXDIFF>`
57                     (exlcudes not very good suggestions, see :meth:`filter_guesses`)
58        has_phonetic: whether there are :attr:`Aff.PHONE <spylls.hunspell.data.aff.Aff.PHONE>`
59                      definitions present (it changes ngram thresholds a bit, in order to produce
60                      less ngrams)
61    """
62
63    root_scores: List[Tuple[float, data.dic.Word]] = []
64
65    # First, find MAX_ROOTS candidate dictionary entries, by calculating stem score against the
66    # misspelled word.
67    for word in dictionary_words:
68        if abs(len(word.stem) - len(misspelling)) > 4:
69            continue
70
71        # TODO: hunspell has more exceptions/flag checks here (part of it we cover later in suggest,
72        # deciding, for example, if the suggestion is forbidden)
73
74        score = root_score(misspelling, word.stem)
75
76        # If dictionary word have alternative spellings provided via `pp:` data tag, calculate
77        # score against them, too. Note that only simple ph:spelling are listed in alt_spellings,
78        # more complicated tags like ph:spellin* or ph:spellng->spelling are ignored in ngrams
79        if word.alt_spellings:
80            for variant in word.alt_spellings:
81                score = max(score, root_score(misspelling, variant))
82
83        # Pythons stdlib heapq used to always keep only MAX_ROOTS of best results
84        if len(root_scores) > MAX_ROOTS:
85            heapq.heappushpop(root_scores, (score, word))
86        else:
87            heapq.heappush(root_scores, (score, word))
88
89    roots = heapq.nlargest(MAX_ROOTS, root_scores)
90
91    # "Minimum passable" suggestion threshold (decided by replacing some chars in word with * and
92    # calculating what score it would have).
93    threshold = detect_threshold(misspelling)
94
95    guess_scores: List[Tuple[float, str, str]] = []
96
97    # Now, for all "good" dictionary words, generate all of their forms with suffixes/prefixes, and
98    # calculate their scores.
99    # Produced structure is (score, word_variant_to_calculate_score, word_form_to_suggest)
100    # The second item is, again, to support alternative spellings suggested in dictionary by ``ph:``
101    # tag.
102    for (_, root) in roots:
103        if root.alt_spellings:
104            # If any of alternative spelling passes the threshold
105            for variant in root.alt_spellings:
106                score = rough_affix_score(misspelling, variant)
107                if score > threshold:
108                    # ...we add them to the final suggestion list (but don't try to produce affix forms)
109                    heapq.heappush(guess_scores, (score, variant, root.stem))
110
111        # For all acceptable forms from current dictionary word (with all possible suffixes and prefixes)...
112        for form in forms_for(root, prefixes, suffixes, similar_to=misspelling):
113            score = rough_affix_score(misspelling, form.lower())
114            if score > threshold:
115                # ...push them to final suggestion list if they pass the threshold
116                heapq.heappush(guess_scores, (score, form, form))
117
118    # We are done generating guesses. Take only limited amount, and sort in order of decreasing score.
119    guesses = heapq.nlargest(MAX_GUESSES, guess_scores)
120
121    fact = (10.0 - maxdiff) / 5.0 if maxdiff >= 0 else 1.0
122
123    # Now, calculate more precise scores for all good suggestions
124    guesses2 = [
125        (precise_affix_score(misspelling, compared.lower(), fact, base=score, has_phonetic=has_phonetic), real)
126        for (score, compared, real) in guesses
127    ]
128
129    # ...and sort them based on that score.
130    # (NB: actually, we might not need ``key`` here, but it is
131    # added for sorting stability; doesn't changes the objective quality of suggestions, but passes
132    # hunspell test ``phone.sug``!)
133    guesses2 = sorted(guesses2, key=itemgetter(0), reverse=True)
134
135    # We can return suggestions now (but filter them to not overflow with)
136    yield from filter_guesses(guesses2, known=known, onlymaxdiff=onlymaxdiff)
137
138# Scoring algorithms
139# ------------------
140
141
142def root_score(word1: str, word2: str) -> float:
143    """
144    Scoring, stage 1: Simple score for first dictionary words choosing: 3-gram score + longest start
145    substring.
146
147    Args:
148        word1: misspelled word
149        word2: possible suggestion
150    """
151
152    return (
153        sm.ngram(3, word1, word2.lower(), longer_worse=True) +
154        sm.leftcommonsubstring(word1, word2.lower())
155    )
156
157
158def rough_affix_score(word1: str, word2: str) -> float:
159    """
160    Scoring, stage 2: First (rough and quick) score of affixed forms: n-gram score with n=length of
161    the misspelled word + longest start substring
162
163    Args:
164        word1: misspelled word
165        word2: possible suggestion
166    """
167
168    return (
169        sm.ngram(len(word1), word1, word2, any_mismatch=True) +
170        sm.leftcommonsubstring(word1, word2)
171    )
172
173
174def precise_affix_score(word1: str, word2: str, diff_factor: float, *, base: float, has_phonetic: bool) -> float:
175    """
176    Scoring, stage 3: Hardcore final score for affixed forms!
177
178    It actually produces score of one of 3 groups:
179
180    * > 1000: if the words are actually same with different casing (surprisingly enough, not all of
181      those are caught in the "editing" suggest; example: "unesco's" => "UNESCO's")
182    * < -100: if the word difference is too much (what is "too much" defined by ``diff_factor``), only
183      one of those questionable suggestions would be returned
184    * -100...1000: just a normal suggestion score, defining its sorting position
185
186    See also :meth:`filter_guesses` below which uses this separation into "groups" to drop some results.
187
188    Args:
189        word1: misspelled word
190        word2: possible suggestion
191        diff_factor: factor changing amount of suggestions (:attr:`Aff.MAXDIFF <spylls.hunspell.data.aff.Aff.MAXDIFF>`)
192        base: initial score of word1 against word2
193        has_phonetic: whether there are :attr:`Aff.PHONE <spylls.hunspell.data.aff.Aff.PHONE>`
194                      definitions present (it changes ngram thresholds a bit, in order to produce
195                      less ngrams)
196    """
197
198    lcs = sm.lcslen(word1, word2)
199
200    # same characters with different casing -- "very good" suggestion class
201    if len(word1) == len(word2) and len(word1) == lcs:
202        return base + 2000
203
204    # Score is: length of longest common subsequent minus length difference...
205    result = 2 * lcs - abs(len(word1) - len(word2))
206
207    # increase score by length of common start substring
208    result += sm.leftcommonsubstring(word1, word2)
209
210    # Add 1 if there were _any_ occurence of "same chars in same positions" in two words
211    if sm.commoncharacters(word1, word2.lower()):
212        result += 1
213
214    # Add regular four-gram weight
215    result += sm.ngram(4, word1, word2, any_mismatch=True)
216
217    # Sum of weighted bigrams used to estimate result quality
218    bigrams = (
219        sm.ngram(2, word1, word2, any_mismatch=True, weighted=True) +
220        sm.ngram(2, word2, word1, any_mismatch=True, weighted=True)
221    )
222
223    result += bigrams
224
225    # diff_factor's ranges from 0 to 2 (depending of aff.MAXDIFF=0..10, with 10 meaning "give me all
226    # possible ngrams" and 0 meaninig "avoid most of the questionable ngrams"); with MAXDIFF=10 the
227    # factor would be 0, and this branch will be avoided; with MAXDIFF=0 the factor would be 2, and
228    # lots of "slihtly similar" words would be dropped into "questionable" bag.
229    #
230    # In a presence of ``PHONE`` definitions table in aff-file (used for phonetic similarity search),
231    # the threshold is a bit different. NB: I (zverok) believe it is a bug in Hunspell, because this
232    # threshold difference probably (?) meant to produce _less_ questionable ngram suggestions in
233    # a presence of phonetic ones, but actually produces more (branches confused?)
234    if has_phonetic:
235        questionable_limit = len(word2) * diff_factor
236    else:
237        questionable_limit = (len(word1) + len(word2)) * diff_factor
238    if bigrams < questionable_limit:
239        result -= 1000
240
241    return result
242
243
244def detect_threshold(word: str) -> float:
245    """
246    Find minimum threshold for a passable suggestion
247
248    Mangle original word three differnt ways (by replacing each 4th character with "*", starting from
249    1st, 2nd or 3rd), and score them to generate a minimum acceptable score.
250
251    Args:
252        word: misspelled word
253    """
254
255    thresh = 0.0
256
257    for start_pos in range(1, 4):
258        mangled = list(word)
259        for pos in range(start_pos, len(word), 4):
260            mangled[pos] = '*'
261
262        mangled_word = ''.join(mangled)
263
264        thresh += sm.ngram(len(word), word, mangled_word, any_mismatch=True)
265
266    # Take average of the three scores
267    return thresh // 3 - 1
268
269
270def forms_for(word: data.dic.Word, all_prefixes, all_suffixes, *, similar_to: str):
271    """
272    Produce forms with all possible affixes and prefixes from the dictionary word, but only those
273    the ``candidate`` can have. Note that there is no comprehensive flag checks (like "this prefix
274    is prohibited with suffix with this flag"). Probably main suggest's code should check it
275    (e.g. use ``filter_guesses`` (in
276    :meth:`suggestions <spylls.hunspell.algo.suggest.Suggest.suggestions>`)
277    for ngram-based suggestions, too).
278
279    Args:
280        word: dictionary stem to produce forms for
281        all_prefixes:
282        all_suffixes:
283        similar_to: initial misspelling (to filter suffixes/prefixes against it)
284    """
285
286    # word without prefixes/suffixes is also present
287    res = [word.stem]
288
289    suffixes = [
290        suffix
291        for flag in word.flags
292        for suffix in all_suffixes.get(flag, [])
293        if suffix.cond_regexp.search(word.stem) and similar_to.endswith(suffix.add)
294    ]
295    prefixes = [
296        prefix
297        for flag in word.flags
298        for prefix in all_prefixes.get(flag, [])
299        if prefix.cond_regexp.search(word.stem) and similar_to.startswith(prefix.add)
300    ]
301
302    cross = [
303        (prefix, suffix)
304        for prefix in prefixes
305        for suffix in suffixes
306        if suffix.crossproduct and prefix.crossproduct
307    ]
308
309    for suf in suffixes:
310        # FIXME: this things should be more atomic
311        root = word.stem[0:-len(suf.strip)] if suf.strip else word.stem
312        res.append(root + suf.add)
313
314    for pref, suf in cross:
315        root = word.stem[len(pref.strip):-len(suf.strip)] if suf.strip else word.stem[len(pref.strip):]
316        res.append(pref.add + root + suf.add)
317
318    for pref in prefixes:
319        root = word.stem[len(pref.strip):]
320        res.append(pref.add + root)
321
322    return res
323
324
325def filter_guesses(guesses: List[Tuple[float, str]], *, known: Set[str], onlymaxdiff=True) -> Iterator[str]:
326    """
327    Filter guesses by score, to decide which ones we'll yield to the client, considering the "suggestion
328    bags" -- "very good", "normal", "questionable" (see :meth:`precise_affix_score` for bags definition).
329
330    Args:
331        guesses: All possible suggestions
332        known: Passed from main Suggest, list of already produced suggestions
333        onlymaxdiff: contents of :attr:`Aff.ONLYMAXDIFF <spylls.hunspell.data.aff.Aff.ONLYMAXDIFF>`
334                     (excludes not very good suggestions, see code)
335    """
336
337    seen = False
338    found = 0
339
340    for (score, value) in guesses:
341        if seen and score <= 1000:
342            return
343
344        if score > 1000:
345            # If very good suggestion exists, we set the flag so that only other very good suggestions
346            # would be returned, and then the cycle would stop
347            seen = True
348        elif score < -100:
349            # If we found first questionable suggestion,
350            # we stop immediately if there were any better suggestion, or if aff.ONLYMAXDIFF says
351            # to avoid questionable ones alltogether
352            if found > 0 or onlymaxdiff:
353                return
354
355            # ...and then we set flag so the cycle would end on
356            # the next pass (suggestions are sorted by score, so everythig below is questionable, too,
357            # and we allow only one suggestion from "questionable" bag)
358            seen = True
359
360        # This condition, and ``found`` variable somewhat duplicates tracking of found suggestions
361        # in the main suggest cycle. It is this way because we need a counter of "how many ngram-based
362        # suggestions were yielded and successfully consumed", to decide whether we want "questionable
363        # ngram-suggestions" at all.
364        # (Another possible approach is to return pairs (suggestion, quality) from ngram_suggest,
365        # and handle "how many good/normal/questionable do we want" in main suggest.py)
366        if not any(known_word in value for known_word in known):
367            found += 1
368
369            yield value
370