1""" 2The main "suggest correction for this misspelling" module. 3 4On a bird-eye view level, suggest does: 5 6* tries small word "edits" (remove letters, insert letters, swap letters) and checks (with the help 7 of :mod:`lookup <spylls.hunspell.algo.lookup>`) there are any valid ones 8* if no good suggestions found, tries "ngram-based" suggestions (calculating ngram-based distance to 9 all dictionary words and select the closest ones), handled by 10 :mod:`ngram_suggest <spylls.hunspell.algo.ngram_suggest>` 11* if possible, tries metaphone-based suggestions, handled by :mod:`phonet_suggest <spylls.hunspell.algo.phonet_suggest>` 12 13Note that Spylls's implementation takes one liberty comparing to Hunspell's: 14In Hunspell, ngram suggestions (select all words from dictionary that ngram-similar => produce suggestions) 15and phonetic suggestions (select all words from dictionary that phonetically similar => produce suggestions) 16are done in the same cycle, because they both iterate through entire dictionary. Spylls does it 17in two separate cycles, again, for the sake of clarity (note that dictionaries with metaphone 18transformation rules defined are extremely rare). 19 20To follow algorithm details, start reading from :meth:`Suggest.__call__` 21 22.. toctree:: 23 :caption: Specific algorithms 24 25 algo_ngram_suggest 26 algo_phonet_suggest 27 28.. autoclass:: Suggest 29 30Suggestion classes 31^^^^^^^^^^^^^^^^^^ 32 33.. autoclass:: Suggestion 34 :members: 35.. autoclass:: MultiWordSuggestion 36 :members: 37 38""" 39 40from typing import Iterator, List, Set, Union 41 42import dataclasses 43from dataclasses import dataclass 44 45from spylls.hunspell import data 46from spylls.hunspell.algo.capitalization import Type as CapType 47from spylls.hunspell.algo import ngram_suggest, phonet_suggest, permutations as pmt 48 49MAXPHONSUGS = 2 50MAXSUGGESTIONS = 15 51GOOD_EDITS = ['spaceword', 'uppercase', 'replchars'] 52 53 54@dataclass 55class Suggestion: 56 """ 57 Suggestions is what Suggest produces internally to store enough information about some suggestion 58 to make sure it is a good one. 59 """ 60 61 #: Actual suggestion text 62 text: str 63 #: How suggestion was produced, useful for debugging, typically same as the method 64 #: of the editing which led to this suggestion, like "badchar", "twowords", "phonet" etc. 65 kind: str 66 67 def __repr__(self): 68 return f"Suggestion[{self.kind}]({self.text})" 69 70 def replace(self, **changes): 71 return dataclasses.replace(self, **changes) 72 73 74@dataclass 75class MultiWordSuggestion: 76 """ 77 Represents suggestion to split words into several. 78 """ 79 80 #: List of words 81 words: List[str] 82 #: Same as :attr:`Suggestion.source` 83 source: str 84 85 #: Whether those words are allowed to be joined by dash. We should disallow it if the multi-word 86 #: suggestion was produced by :attr:`Aff.REP <spylls.hunspell.data.aff.Aff.REP>` table, see 87 #: :meth:`Suggest.edits` for details. 88 allow_dash: bool = True 89 90 def stringify(self, separator=' '): 91 return Suggestion(separator.join(self.words), self.source) 92 93 def __repr__(self): 94 return f"Suggestion[{self.source}]({self.words!r})" 95 96 97class Suggest: 98 """ 99 ``Suggest`` object is created on :class:`Dictionary <spylls.hunspell.Dictionary>` reading. Typically, 100 you would not use it directly, but you might want for experiments:: 101 102 >>> dictionary = Dictionary.from_files('dictionaries/en_US') 103 >>> suggest = dictionary.suggester 104 105 >>> [*suggest('spylls')] 106 ['spells', 'spills'] 107 108 >>> for suggestion in suggest.suggestions('spylls'): 109 ... print(suggestion) 110 Suggestion[badchar](spell) 111 Suggestion[badchar](spill) 112 113 See :meth:`__call__` as the main entry point for algorithm explanation. 114 115 **Main methods** 116 117 .. automethod:: __call__ 118 .. automethod:: suggestions 119 120 **Suggestion types** 121 122 .. automethod:: edits 123 .. automethod:: ngram_suggestions 124 .. automethod:: phonet_suggestions 125 """ 126 def __init__(self, aff: data.Aff, dic: data.Dic, lookup): 127 self.aff = aff 128 self.dic = dic 129 self.lookup = lookup 130 131 # TODO: also NONGRAMSUGGEST and ONLYUPCASE 132 bad_flags = {*filter(None, [self.aff.FORBIDDENWORD, self.aff.NOSUGGEST, self.aff.ONLYINCOMPOUND])} 133 134 self.words_for_ngram = [word for word in self.dic.words if not bad_flags.intersection(word.flags)] 135 136 def __call__(self, word: str) -> Iterator[str]: 137 """ 138 Outer "public" interface: returns a list of all valid suggestions, as strings. 139 140 Method returns a generator, so it is up to client code to fetch as many suggestions as it 141 needs:: 142 143 >>> suggestions = suggester('unredable') 144 <generator object Suggest.__call__ at 0x7f74f5056350> 145 >>> suggestions.__next__() 146 'unreadable' 147 148 Note that suggestion to split words in two also returned as a single string, with a space:: 149 150 >>> [*suggester('badcat')] 151 ['bad cat', 'bad-cat', 'baccarat'] 152 153 Internally, the method just calls :meth:`suggestions` (which returns instances of :class:`Suggestion`) 154 and yields suggestion texts. 155 156 Args: 157 word: Word to check 158 """ 159 yield from (suggestion.text for suggestion in self.suggestions(word)) 160 161 def suggestions(self, word: str) -> Iterator[Suggestion]: # pylint: disable=too-many-statements 162 """ 163 Main suggestion search loop. What it does, in general, is: 164 165 * generates possible misspelled word cases (for ex., "KIttens" in dictionary might've been 166 'KIttens', 'kIttens', 'kittens', or 'Kittens') 167 * produces word edits with :meth:`edits` (with the help of 168 :mod:`permutations <spylls.hunspell.algo.permutations>` module), checks them with 169 :class:`Lookup <spylls.hunspell.algo.lookup.Lookup>`, and decides if that's maybe enough 170 * but if it is not (and if .aff settings allow), ngram-based suggestions are produced with 171 :meth:`ngram_suggestions`, and phonetically similar suggestions with :meth:`phonet_suggestions` 172 173 That's very simplified explanation, read the code! 174 175 Args: 176 word: Word to check 177 """ 178 179 # Whether some suggestion (permutation of the word) is an existing and allowed word, 180 # just delegates to Lookup 181 def is_good_suggestion(word): 182 # Note that instead of using Lookup's main method, we just see if there is any good forms 183 # of this exact word, avoiding ICONV and trying to break word by dashes. 184 return any(self.lookup.good_forms(word, capitalization=False, allow_nosuggest=False)) 185 186 # The suggestion is considered forbidden if there is ANY homonym in dictionary with flag 187 # FORBIDDENWORD. Besides marking swearing words, this feature also allows to include in 188 # dictionaries known "correctly-looking but actually non-existent" forms, which might important 189 # with very flexive languages. 190 def is_forbidden(word): 191 return self.aff.FORBIDDENWORD and self.dic.has_flag(word, self.aff.FORBIDDENWORD) 192 193 # This set will gather all good suggestions that were already returned (in order, for example, 194 # to not return same suggestion twice) 195 handled: Set[str] = set() 196 197 # Suggestions that are already considered good are passed through this method, which converts 198 # it to proper capitalization form, and then either yields it (if it is not forbidden, 199 # hadn't already seen, etc), or just does nothing. 200 # Method is quite lengthy, but is nested because updates and reuses ``handled`` local var 201 def handle_found(suggestion, *, check_inclusion=False): 202 text = suggestion.text 203 204 # If any of the homonyms has KEEPCASE flag, we shouldn't coerce it from the base form. 205 # But CHECKSHARPS flag presence changes the meaning of KEEPCASE... 206 if (self.aff.KEEPCASE and self.dic.has_flag(text, self.aff.KEEPCASE) and not 207 (self.aff.CHECKSHARPS and 'ß' in text)): 208 # Don't try to change text's case 209 pass 210 else: 211 # "Coerce" suggested text from the capitalization that it has in the dictionary, to 212 # the capitalization of the misspelled word. E.g., if misspelled was "Kiten", suggestion 213 # is "kitten" (how it is in the dictionary), and coercion (what we really want 214 # to return to user) is "Kitten" 215 text = self.aff.casing.coerce(text, captype) 216 # ...but if this particular capitalized form is forbidden, return back to original text 217 if text != suggestion.text and is_forbidden(text): 218 text = suggestion.text 219 220 # "aNew" will suggest "a new", here we fix it back to "a New" 221 if captype in [CapType.HUH, CapType.HUHINIT] and ' ' in text: 222 pos = text.find(' ') 223 if text[pos + 1] != word[pos] and text[pos + 1].upper() == word[pos]: 224 text = text[:pos+1] + word[pos] + text[pos+2:] 225 226 # If the word is forbidden, nothing more to do 227 if is_forbidden(text): 228 return 229 230 # Finally, OCONV table in .aff-file might specify what chars to replace in suggestions 231 # (for example, "'" to proper typographic "’", or common digraphs) 232 text = self.aff.OCONV(text) if self.aff.OCONV else text 233 234 # If we already seen this suggestion, nothing to do 235 # Note that this should happen AFTER the OCONV: it sometimes changes the content significantly. 236 # For example, Dutch dictionary recodes "ij" into ligature (one character representing this 237 # two letters) to enforce proper capitalization (and decodes it back on OCONV). 238 if text in handled: 239 return 240 241 # Sometimes we want to skip suggestions even if they are same as PARTS of already 242 # seen ones: for examle, ngram-based suggestions might produce very similar forms, like 243 # "impermanent" and "permanent" -- both of them are correct, but if the first is 244 # closer (by length/content) to misspelling, there is no point in suggesting the second 245 if check_inclusion and any(previous.lower() in text.lower() for previous in handled): 246 return 247 248 # Remember we seen it 249 handled.add(text) 250 251 # And here we are! 252 yield suggestion.replace(text=text) 253 254 # **Start of the main suggest code** 255 256 # First, produce all possible "good capitalizations" from the provided word. For example, 257 # if the word is "MsDonalds", good capitalizations (what it might've been in dictionary) are 258 # "msdonalds" (full lowercase) "msDonalds" (first letter lowercased), or maybe "Msdonalds" 259 # (only first letter capitalized). Note that "MSDONALDS" (it should've been all caps) is not 260 # produced as a possible good form, but checked separately in ``edits`` 261 captype, variants = self.aff.casing.corrections(word) 262 263 # Check a special case: if it is possible that words would be possible to be capitalized 264 # on compounding, then we check capitalized form of the word. If it is correct, that's the 265 # only suggestion we ever need. 266 if self.aff.FORCEUCASE and captype == CapType.NO: 267 for capitalized in self.aff.casing.capitalize(word): 268 if is_good_suggestion(capitalized): 269 yield from handle_found(Suggestion(capitalized.capitalize(), 'forceucase')) 270 return # No more need to check anything 271 272 good_edits_found = False 273 274 # Now, for all capitalization variant 275 for idx, variant in enumerate(variants): 276 # If it is different from original capitalization, and is good, we suggest it 277 if idx > 0 and is_good_suggestion(variant): 278 yield from handle_found(Suggestion(variant, 'case')) 279 280 # Now we do two rounds: 281 # * generate edits and check whether they are valid non-compound words, 282 # * generate the same edits again and check whether they are valid compound words 283 # 284 # The reason of splitting the checks is: a) optimization (checking whether the word is 285 # a valid compound is much more pricey) and b) suggestion quality (it is considered that 286 # non-compound words are more probable). 287 # There are cases when this algorithm brings problems: for example, "11thhour" will NOT 288 # suggest "11th hour" -- because "11th" is compound word, and "hour" is not, but we 289 # first check whether the splitting is correct assuming all words are non-compound, and 290 # then check whether it is correct assuming both are compound. That's how it is in Hunspell, 291 # and Spylls doesn't tries to fix that. 292 293 nocompound = False 294 295 # 1. Only generate suggestions that are correct NOT COMPOUND words. 296 for suggestion in self.edit_suggestions(variant, handle_found, limit=MAXSUGGESTIONS, compounds=False): 297 yield suggestion 298 # remember if any "good" edits was found -- in this case we don't need 299 # ngram suggestions 300 good_edits_found = good_edits_found or (suggestion.kind in GOOD_EDITS) 301 # If any of those kinds are found, we don't need to try _any_ compound suggestions 302 if suggestion.kind in ['uppercase', 'replchars', 'mapchars']: 303 nocompound = True 304 # We might break from the method immediately if the suggestion is to split word 305 # with space, and this phrase is _in the dictionary as a whole_: "alot" => "a lot". 306 # Then we don't need ANY other suggestions. This is a trick to enforce suggesting 307 # to break commonly misspelled phrases withot oversuggesting 308 if suggestion.kind == 'spaceword': 309 return 310 311 # 2. Only generate suggestions that are correct COMPOUND words 312 if not nocompound: 313 for suggestion in self.edit_suggestions(variant, handle_found, 314 limit=self.aff.MAXCPDSUGS, compounds=True): 315 yield suggestion 316 good_edits_found = good_edits_found or (suggestion.kind in GOOD_EDITS) 317 318 if good_edits_found: 319 return 320 321 # One additional pass at suggesting: if the word contains dashes, and there were no dash-containing 322 # suggestions, we should try to spellcheck it separately... But only one misspelled part is allowed. 323 if '-' in word and not any('-' in sug for sug in handled): 324 chunks = word.split('-') 325 for idx, chunk in enumerate(chunks): 326 # If this chunk of the word is misspelled... 327 if not is_good_suggestion(chunk): 328 # ...try all suggestions for this separate chunk 329 for sug in self(chunk): 330 candidate = '-'.join([*chunks[:idx], sug, *chunks[idx+1:]]) 331 # And check if the whole word with this chunk replaced is a good word. Note that 332 # we use lookup's main method here, which will also break it into words by 333 # dashes. It is done (instead of just checking chunk-by-chunk), because there 334 # are ambiguities how to split: if we are fixing "quas-Afro-American"->"quasi-Afro-American", 335 # and "Afro-American" is present in the dictionary as a whole word, it is better 336 # to delegate search for the right breaking to the Lookup. 337 if self.lookup(candidate): 338 yield Suggestion(candidate, 'dashes') 339 # after only one misspelled chunk, we stop the search anyways, whether we found some 340 # sugestions or not 341 break 342 343 # If there was no good edits that were valid words, we might try 344 # ngram-based suggestion algorithm: it is slower, but able to correct severely misspelled words 345 346 ngrams_seen = 0 347 for sug in self.ngram_suggestions(word, handled=handled): 348 for res in handle_found(Suggestion(sug, 'ngram'), check_inclusion=True): 349 ngrams_seen += 1 350 yield res 351 if ngrams_seen >= self.aff.MAXNGRAMSUGS: 352 break 353 354 # Also, if metaphone transformations (phonetic coding of words) were defined in the .aff file, 355 # we might try to use them to produce suggestions 356 357 phonet_seen = 0 358 for sug in self.phonet_suggestions(word): 359 for res in handle_found(Suggestion(sug, 'phonet'), check_inclusion=True): 360 phonet_seen += 1 361 yield res 362 if phonet_seen >= MAXPHONSUGS: 363 break 364 365 def edit_suggestions(self, word: str, handle_found, *, compounds: bool, limit: int) -> Iterator[Suggestion]: 366 def is_good_suggestion(word): 367 # Note that instead of using Lookup's main method, we just see if there is any good forms 368 # of this exact word, avoiding ICONV and trying to break word by dashes. 369 if compounds: 370 return any(self.lookup.good_forms(word, capitalization=False, allow_nosuggest=False, affix_forms=False)) 371 return any(self.lookup.good_forms(word, capitalization=False, allow_nosuggest=False, compound_forms=False)) 372 373 # For some set of suggestions, produces only good ones: 374 def filter_suggestions(suggestions): 375 for suggestion in suggestions: 376 # For multiword suggestion, 377 if isinstance(suggestion, MultiWordSuggestion): 378 # ...if all of the words is correct 379 if all(is_good_suggestion(word) for word in suggestion.words): 380 # ...we just convert it to plain text suggestion "word1 word2" 381 yield suggestion.stringify() 382 if suggestion.allow_dash: 383 # ...and "word1-word2" if allowed 384 yield suggestion.stringify('-') 385 else: 386 # Singleword suggestion is just yielded if it is good 387 if is_good_suggestion(suggestion.text): 388 yield suggestion 389 390 count = 0 391 392 for suggestion in filter_suggestions(self.edits(word)): 393 # Then, for each of those correct suggestions, 394 # we run them through handle_found (which performs some final transformations and 395 # checks, and may handle 1 or 0 suggestions; 0 in the case it was already seen or 396 # is a forbidden word) 397 for res in handle_found(suggestion): 398 yield res 399 count += 1 400 401 if count > limit: 402 return 403 404 def edits(self, word: str) -> Iterator[Union[Suggestion, MultiWordSuggestion]]: 405 """ 406 Produces all possible word edits in a form of :class:`Suggestion` or :class:`MultiWordSuggestion`. 407 Note that: 408 409 * order is important, that's the order user will receive the suggestions (and the further the 410 suggestion type in the order, the more probably it would be dropped due to suggestion count 411 limit) 412 * suggestion "source" tag is important: :meth:`suggestions` uses it to distinguish between 413 good and questionble edits (if there were any good ones, ngram suggestion wouldn't 414 be used) 415 416 Args: 417 word: Word to mutate 418 """ 419 # suggestions for an uppercase word (html -> HTML) 420 yield Suggestion(self.aff.casing.upper(word), 'uppercase') 421 422 # REP table in affix file specifies "typical misspellings", and we try to replace them. 423 # Note that the content of REP table taken not only from aff file, but also from "ph:" tag 424 # in dictionary (lines looking like ``hello ph:helo`` put word "hello" in dictionary, and 425 # "helo -> hello" in REP-table), see read_dic. 426 # 427 # It might return several words if REP table has "REP <something> <some>_<thing>" (_ is code 428 # for space). 429 # 430 # ...in this case we should suggest both "<word1> <word2>" as one dictionary entry, and 431 # "<word1>" "<word2>" as a sequence -- but clarifying this sequence might NOT be joined by "-" 432 for suggestion in pmt.replchars(word, self.aff.REP): 433 if isinstance(suggestion, list): 434 yield Suggestion(' '.join(suggestion), 'replchars') 435 yield MultiWordSuggestion(suggestion, 'replchars', allow_dash=False) 436 else: 437 yield Suggestion(suggestion, 'replchars') 438 439 for words in pmt.twowords(word): 440 yield Suggestion(' '.join(words), 'spaceword') 441 442 if self.use_dash(): 443 # "alot" => "a-lot" 444 yield Suggestion('-'.join(words), 'spaceword') 445 446 # MAP in aff file specifies related chars (for example, "ïi"), and mapchars produces all 447 # changes of the word with related chars replaced. For example, "naive" produces "naïve". 448 for suggestion in pmt.mapchars(word, self.aff.MAP): 449 yield Suggestion(suggestion, 'mapchars') 450 451 # Try to swap adjacent characters (ktiten -> kitten), produces all possible forms with ONE 452 # swap; but for 4- and 5-letter words tries also two swaps "ahev -> have" 453 for suggestion in pmt.swapchar(word): 454 yield Suggestion(suggestion, 'swapchar') 455 456 # Try longer swaps (up to 4 chars distance) 457 for suggestion in pmt.longswapchar(word): 458 yield Suggestion(suggestion, 'longswapchar') 459 460 # Try to replace chars by those close on keyboard ("wueue" -> "queue"), KEY in aff file specifies 461 # keyboard layout. 462 for suggestion in pmt.badcharkey(word, self.aff.KEY): 463 yield Suggestion(suggestion, 'badcharkey') 464 465 # Try remove character (produces all forms with one char removed: "clat" => "lat", "cat", "clt", "cla") 466 for suggestion in pmt.extrachar(word): 467 yield Suggestion(suggestion, 'extrachar') 468 469 # Try insert character (from set of all possible language chars specified in aff), produces 470 # all forms with any of the TRY chars inserted in all possible positions 471 for suggestion in pmt.forgotchar(word, self.aff.TRY): 472 yield Suggestion(suggestion, 'forgotchar') 473 474 # Try to move a character forward and backwars: 475 # "rnai" => "nari", "nair", "rain" (forward: r moved 2 and 3 chars, n moved 2 chars) 476 # => "rina", "irna", "arni" (backward: i moved 2 and 3 chars, a moved 2 chars) 477 # (no 1-char movements necessary, they are covered by swapchar above) 478 for suggestion in pmt.movechar(word): 479 yield Suggestion(suggestion, 'movechar') 480 481 # Try replace each character with any of other language characters 482 for suggestion in pmt.badchar(word, self.aff.TRY): 483 yield Suggestion(suggestion, 'badchar') 484 485 # Try fix two-character doubling: "chickcken" -> "chicken" (one-character doubling is 486 # already covered by extrachar) 487 for suggestion in pmt.doubletwochars(word): 488 yield Suggestion(suggestion, 'doubletwochars') 489 490 if not self.aff.NOSPLITSUGS: 491 # Try split word by space in all possible positions 492 # NOSPLITSUGS option in aff prohibits it, it is important, say, for Scandinavian languages 493 for suggestion_pair in pmt.twowords(word): 494 yield MultiWordSuggestion(suggestion_pair, 'twowords', allow_dash=self.use_dash()) 495 496 def ngram_suggestions(self, word: str, handled: Set[str]) -> Iterator[str]: 497 """ 498 Produces ngram-based suggestions, by passing to 499 :meth:`ngram_suggest.ngram_suggest <spylls.hunspell.algo.ngram_suggest.ngram_suggest>` current 500 misspelling, already found suggestions and settings from .aff file. 501 502 See :mod:`ngram_suggest <spylls.hunspell.algo.ngram_suggest>`. 503 504 Args: 505 word: Misspelled word 506 handled: List of already handled (known) suggestions; it is reused in 507 :meth:`ngram_suggest.filter_guesses <spylls.hunspell.algo.ngram_suggest.filter_guesses>` 508 to decide whether we add "not really good" ngram-based suggestions to result 509 """ 510 if self.aff.MAXNGRAMSUGS == 0: 511 return 512 513 yield from ngram_suggest.ngram_suggest( 514 word.lower(), 515 dictionary_words=self.words_for_ngram, 516 prefixes=self.aff.PFX, suffixes=self.aff.SFX, 517 known={*(word.lower() for word in handled)}, 518 maxdiff=self.aff.MAXDIFF, 519 onlymaxdiff=self.aff.ONLYMAXDIFF, 520 has_phonetic=(self.aff.PHONE is not None)) 521 522 def phonet_suggestions(self, word: str) -> Iterator[str]: 523 """ 524 Produces phonetical similarity-based suggestions, by passing to 525 :meth:`phonet_suggest.phonet_suggest <spylls.hunspell.algo.phonet_suggest.phonet_suggest>` current 526 misspelling and settings from .aff file. 527 528 See :mod:`phonet_suggest <spylls.hunspell.algo.phonet_suggest.phonet_suggest>`. 529 530 Args: 531 word: Misspelled word 532 """ 533 if not self.aff.PHONE: 534 return 535 536 yield from phonet_suggest.phonet_suggest(word, dictionary_words=self.words_for_ngram, table=self.aff.PHONE) 537 538 def use_dash(self) -> bool: 539 """ 540 Yeah, that's how hunspell defines whether words can be split by dash in this language: 541 either dash is explicitly mentioned in TRY directive, or TRY directive indicates the 542 language uses Latinic script. So dictionaries omiting TRY directive, or for languages with, 543 say, Cyrillic script not including "-" in it, will never suggest "foobar" => "foo-bar", 544 even if it is the perfectly normal way to spell. 545 """ 546 return '-' in self.aff.TRY or 'a' in self.aff.TRY 547