1""" 2.. autofunction:: ngram_suggest 3 4.. autofunction:: forms_for 5.. autofunction:: filter_guesses 6 7Scoring 8^^^^^^^ 9 10.. autofunction:: detect_threshold 11.. autofunction:: root_score 12.. autofunction:: rough_affix_score 13.. autofunction:: precise_affix_score 14 15""" 16 17from typing import Iterator, Tuple, List, Set, Dict 18from operator import itemgetter 19import heapq 20 21from spylls.hunspell import data 22import spylls.hunspell.algo.string_metrics as sm 23 24 25MAX_ROOTS = 100 26MAX_GUESSES = 200 27 28 29def ngram_suggest(misspelling: str, *, 30 dictionary_words: List[data.dic.Word], 31 prefixes: Dict[str, List[data.aff.Prefix]], 32 suffixes: Dict[str, List[data.aff.Suffix]], 33 known: Set[str], maxdiff: int, onlymaxdiff: bool = False, 34 has_phonetic: bool = False) -> Iterator[str]: 35 """ 36 Try to suggest all possible variants for misspelling based on ngram-similarity. 37 38 Internally: 39 40 * calculates misspelling similarity to all dictionary word stems with :meth:`root_score`, and 41 chooses the best ones 42 * of those words, produces all forms possible with suffixes/prefixes by :meth:`forms_for`, 43 calculates their score against misspelling with :meth:`rough_affix_score` and chooses the best ones, 44 using threshold calculated in :meth:`detect_threshold` 45 * calculates more precise (but more time-consuming) score for those with :meth:`precise_affix_score` and 46 sorts by it 47 * filters suggestions depending on their score with :meth:`filter_guesses` 48 49 Args: 50 misspelling: Misspelled word 51 dictionary_words: all entries from dictionary to iterate against (without forbidden, ``ONLYINCOMPOUND`` 52 and such) 53 prefixes: all prefixes from .aff file to try produce forms with 54 suffixes: all suffixes from .aff file to try produce forms with 55 maxdiff: contents of :attr:`Aff.MAXDIFF <spylls.hunspell.data.aff.Aff.MAXDIFF>` (changes amount of suggestions) 56 onlymaxdiff: contents of :attr:`Aff.ONLYMAXDIFF <spylls.hunspell.data.aff.Aff.ONLYMAXDIFF>` 57 (exlcudes not very good suggestions, see :meth:`filter_guesses`) 58 has_phonetic: whether there are :attr:`Aff.PHONE <spylls.hunspell.data.aff.Aff.PHONE>` 59 definitions present (it changes ngram thresholds a bit, in order to produce 60 less ngrams) 61 """ 62 63 root_scores: List[Tuple[float, data.dic.Word]] = [] 64 65 # First, find MAX_ROOTS candidate dictionary entries, by calculating stem score against the 66 # misspelled word. 67 for word in dictionary_words: 68 if abs(len(word.stem) - len(misspelling)) > 4: 69 continue 70 71 # TODO: hunspell has more exceptions/flag checks here (part of it we cover later in suggest, 72 # deciding, for example, if the suggestion is forbidden) 73 74 score = root_score(misspelling, word.stem) 75 76 # If dictionary word have alternative spellings provided via `pp:` data tag, calculate 77 # score against them, too. Note that only simple ph:spelling are listed in alt_spellings, 78 # more complicated tags like ph:spellin* or ph:spellng->spelling are ignored in ngrams 79 if word.alt_spellings: 80 for variant in word.alt_spellings: 81 score = max(score, root_score(misspelling, variant)) 82 83 # Pythons stdlib heapq used to always keep only MAX_ROOTS of best results 84 if len(root_scores) > MAX_ROOTS: 85 heapq.heappushpop(root_scores, (score, word)) 86 else: 87 heapq.heappush(root_scores, (score, word)) 88 89 roots = heapq.nlargest(MAX_ROOTS, root_scores) 90 91 # "Minimum passable" suggestion threshold (decided by replacing some chars in word with * and 92 # calculating what score it would have). 93 threshold = detect_threshold(misspelling) 94 95 guess_scores: List[Tuple[float, str, str]] = [] 96 97 # Now, for all "good" dictionary words, generate all of their forms with suffixes/prefixes, and 98 # calculate their scores. 99 # Produced structure is (score, word_variant_to_calculate_score, word_form_to_suggest) 100 # The second item is, again, to support alternative spellings suggested in dictionary by ``ph:`` 101 # tag. 102 for (_, root) in roots: 103 if root.alt_spellings: 104 # If any of alternative spelling passes the threshold 105 for variant in root.alt_spellings: 106 score = rough_affix_score(misspelling, variant) 107 if score > threshold: 108 # ...we add them to the final suggestion list (but don't try to produce affix forms) 109 heapq.heappush(guess_scores, (score, variant, root.stem)) 110 111 # For all acceptable forms from current dictionary word (with all possible suffixes and prefixes)... 112 for form in forms_for(root, prefixes, suffixes, similar_to=misspelling): 113 score = rough_affix_score(misspelling, form.lower()) 114 if score > threshold: 115 # ...push them to final suggestion list if they pass the threshold 116 heapq.heappush(guess_scores, (score, form, form)) 117 118 # We are done generating guesses. Take only limited amount, and sort in order of decreasing score. 119 guesses = heapq.nlargest(MAX_GUESSES, guess_scores) 120 121 fact = (10.0 - maxdiff) / 5.0 if maxdiff >= 0 else 1.0 122 123 # Now, calculate more precise scores for all good suggestions 124 guesses2 = [ 125 (precise_affix_score(misspelling, compared.lower(), fact, base=score, has_phonetic=has_phonetic), real) 126 for (score, compared, real) in guesses 127 ] 128 129 # ...and sort them based on that score. 130 # (NB: actually, we might not need ``key`` here, but it is 131 # added for sorting stability; doesn't changes the objective quality of suggestions, but passes 132 # hunspell test ``phone.sug``!) 133 guesses2 = sorted(guesses2, key=itemgetter(0), reverse=True) 134 135 # We can return suggestions now (but filter them to not overflow with) 136 yield from filter_guesses(guesses2, known=known, onlymaxdiff=onlymaxdiff) 137 138# Scoring algorithms 139# ------------------ 140 141 142def root_score(word1: str, word2: str) -> float: 143 """ 144 Scoring, stage 1: Simple score for first dictionary words choosing: 3-gram score + longest start 145 substring. 146 147 Args: 148 word1: misspelled word 149 word2: possible suggestion 150 """ 151 152 return ( 153 sm.ngram(3, word1, word2.lower(), longer_worse=True) + 154 sm.leftcommonsubstring(word1, word2.lower()) 155 ) 156 157 158def rough_affix_score(word1: str, word2: str) -> float: 159 """ 160 Scoring, stage 2: First (rough and quick) score of affixed forms: n-gram score with n=length of 161 the misspelled word + longest start substring 162 163 Args: 164 word1: misspelled word 165 word2: possible suggestion 166 """ 167 168 return ( 169 sm.ngram(len(word1), word1, word2, any_mismatch=True) + 170 sm.leftcommonsubstring(word1, word2) 171 ) 172 173 174def precise_affix_score(word1: str, word2: str, diff_factor: float, *, base: float, has_phonetic: bool) -> float: 175 """ 176 Scoring, stage 3: Hardcore final score for affixed forms! 177 178 It actually produces score of one of 3 groups: 179 180 * > 1000: if the words are actually same with different casing (surprisingly enough, not all of 181 those are caught in the "editing" suggest; example: "unesco's" => "UNESCO's") 182 * < -100: if the word difference is too much (what is "too much" defined by ``diff_factor``), only 183 one of those questionable suggestions would be returned 184 * -100...1000: just a normal suggestion score, defining its sorting position 185 186 See also :meth:`filter_guesses` below which uses this separation into "groups" to drop some results. 187 188 Args: 189 word1: misspelled word 190 word2: possible suggestion 191 diff_factor: factor changing amount of suggestions (:attr:`Aff.MAXDIFF <spylls.hunspell.data.aff.Aff.MAXDIFF>`) 192 base: initial score of word1 against word2 193 has_phonetic: whether there are :attr:`Aff.PHONE <spylls.hunspell.data.aff.Aff.PHONE>` 194 definitions present (it changes ngram thresholds a bit, in order to produce 195 less ngrams) 196 """ 197 198 lcs = sm.lcslen(word1, word2) 199 200 # same characters with different casing -- "very good" suggestion class 201 if len(word1) == len(word2) and len(word1) == lcs: 202 return base + 2000 203 204 # Score is: length of longest common subsequent minus length difference... 205 result = 2 * lcs - abs(len(word1) - len(word2)) 206 207 # increase score by length of common start substring 208 result += sm.leftcommonsubstring(word1, word2) 209 210 # Add 1 if there were _any_ occurence of "same chars in same positions" in two words 211 if sm.commoncharacters(word1, word2.lower()): 212 result += 1 213 214 # Add regular four-gram weight 215 result += sm.ngram(4, word1, word2, any_mismatch=True) 216 217 # Sum of weighted bigrams used to estimate result quality 218 bigrams = ( 219 sm.ngram(2, word1, word2, any_mismatch=True, weighted=True) + 220 sm.ngram(2, word2, word1, any_mismatch=True, weighted=True) 221 ) 222 223 result += bigrams 224 225 # diff_factor's ranges from 0 to 2 (depending of aff.MAXDIFF=0..10, with 10 meaning "give me all 226 # possible ngrams" and 0 meaninig "avoid most of the questionable ngrams"); with MAXDIFF=10 the 227 # factor would be 0, and this branch will be avoided; with MAXDIFF=0 the factor would be 2, and 228 # lots of "slihtly similar" words would be dropped into "questionable" bag. 229 # 230 # In a presence of ``PHONE`` definitions table in aff-file (used for phonetic similarity search), 231 # the threshold is a bit different. NB: I (zverok) believe it is a bug in Hunspell, because this 232 # threshold difference probably (?) meant to produce _less_ questionable ngram suggestions in 233 # a presence of phonetic ones, but actually produces more (branches confused?) 234 if has_phonetic: 235 questionable_limit = len(word2) * diff_factor 236 else: 237 questionable_limit = (len(word1) + len(word2)) * diff_factor 238 if bigrams < questionable_limit: 239 result -= 1000 240 241 return result 242 243 244def detect_threshold(word: str) -> float: 245 """ 246 Find minimum threshold for a passable suggestion 247 248 Mangle original word three differnt ways (by replacing each 4th character with "*", starting from 249 1st, 2nd or 3rd), and score them to generate a minimum acceptable score. 250 251 Args: 252 word: misspelled word 253 """ 254 255 thresh = 0.0 256 257 for start_pos in range(1, 4): 258 mangled = list(word) 259 for pos in range(start_pos, len(word), 4): 260 mangled[pos] = '*' 261 262 mangled_word = ''.join(mangled) 263 264 thresh += sm.ngram(len(word), word, mangled_word, any_mismatch=True) 265 266 # Take average of the three scores 267 return thresh // 3 - 1 268 269 270def forms_for(word: data.dic.Word, all_prefixes, all_suffixes, *, similar_to: str): 271 """ 272 Produce forms with all possible affixes and prefixes from the dictionary word, but only those 273 the ``candidate`` can have. Note that there is no comprehensive flag checks (like "this prefix 274 is prohibited with suffix with this flag"). Probably main suggest's code should check it 275 (e.g. use ``filter_guesses`` (in 276 :meth:`suggestions <spylls.hunspell.algo.suggest.Suggest.suggestions>`) 277 for ngram-based suggestions, too). 278 279 Args: 280 word: dictionary stem to produce forms for 281 all_prefixes: 282 all_suffixes: 283 similar_to: initial misspelling (to filter suffixes/prefixes against it) 284 """ 285 286 # word without prefixes/suffixes is also present 287 res = [word.stem] 288 289 suffixes = [ 290 suffix 291 for flag in word.flags 292 for suffix in all_suffixes.get(flag, []) 293 if suffix.cond_regexp.search(word.stem) and similar_to.endswith(suffix.add) 294 ] 295 prefixes = [ 296 prefix 297 for flag in word.flags 298 for prefix in all_prefixes.get(flag, []) 299 if prefix.cond_regexp.search(word.stem) and similar_to.startswith(prefix.add) 300 ] 301 302 cross = [ 303 (prefix, suffix) 304 for prefix in prefixes 305 for suffix in suffixes 306 if suffix.crossproduct and prefix.crossproduct 307 ] 308 309 for suf in suffixes: 310 # FIXME: this things should be more atomic 311 root = word.stem[0:-len(suf.strip)] if suf.strip else word.stem 312 res.append(root + suf.add) 313 314 for pref, suf in cross: 315 root = word.stem[len(pref.strip):-len(suf.strip)] if suf.strip else word.stem[len(pref.strip):] 316 res.append(pref.add + root + suf.add) 317 318 for pref in prefixes: 319 root = word.stem[len(pref.strip):] 320 res.append(pref.add + root) 321 322 return res 323 324 325def filter_guesses(guesses: List[Tuple[float, str]], *, known: Set[str], onlymaxdiff=True) -> Iterator[str]: 326 """ 327 Filter guesses by score, to decide which ones we'll yield to the client, considering the "suggestion 328 bags" -- "very good", "normal", "questionable" (see :meth:`precise_affix_score` for bags definition). 329 330 Args: 331 guesses: All possible suggestions 332 known: Passed from main Suggest, list of already produced suggestions 333 onlymaxdiff: contents of :attr:`Aff.ONLYMAXDIFF <spylls.hunspell.data.aff.Aff.ONLYMAXDIFF>` 334 (excludes not very good suggestions, see code) 335 """ 336 337 seen = False 338 found = 0 339 340 for (score, value) in guesses: 341 if seen and score <= 1000: 342 return 343 344 if score > 1000: 345 # If very good suggestion exists, we set the flag so that only other very good suggestions 346 # would be returned, and then the cycle would stop 347 seen = True 348 elif score < -100: 349 # If we found first questionable suggestion, 350 # we stop immediately if there were any better suggestion, or if aff.ONLYMAXDIFF says 351 # to avoid questionable ones alltogether 352 if found > 0 or onlymaxdiff: 353 return 354 355 # ...and then we set flag so the cycle would end on 356 # the next pass (suggestions are sorted by score, so everythig below is questionable, too, 357 # and we allow only one suggestion from "questionable" bag) 358 seen = True 359 360 # This condition, and ``found`` variable somewhat duplicates tracking of found suggestions 361 # in the main suggest cycle. It is this way because we need a counter of "how many ngram-based 362 # suggestions were yielded and successfully consumed", to decide whether we want "questionable 363 # ngram-suggestions" at all. 364 # (Another possible approach is to return pairs (suggestion, quality) from ngram_suggest, 365 # and handle "how many good/normal/questionable do we want" in main suggest.py) 366 if not any(known_word in value for known_word in known): 367 found += 1 368 369 yield value 370