1"""
2The module represents data from Hunspell's ``*.aff`` file.
3
4This text file has the following format:
5
6.. code-block:: text
7
8    # pseudo-comment
    DIRECTIVE_NAME value1 value2 value3
10
11    # directives with large array of values
12    DIRECTIVE_NAME <num_of_values>
13    DIRECTIVE_NAME value1_1 value1_2 value1_3
14    DIRECTIVE_NAME value2_1 value2_2 value2_3
15    # ...
16
How many values follow ``DIRECTIVE_NAME`` is defined by the directive itself. Values are separated
by any number of spaces (so, if a value should include a literal " ", it is encoded as "_").
19
    *Note:* We say "pseudo-comment" above because it is just a convention. In fact, Hunspell has
    no code explicitly interpreting anything starting with ``#`` as a comment -- it rather ignores everything
    that is not a known directive name, and everything after the expected number of directive values. But it is
    important NOT to drop ``#`` and the content after it before interpreting, as it might be meaningful!
24    Some dictionaries define ``#`` to be a flag, or a ``BREAK`` character. For example ``en_GB`` in
25    LibreOffice does this:
26
27    .. code-block:: text
28
29        # in .aff file:
30        COMPOUNDRULE #*0{
31        # reads: rule of producing compound words:
32        #  any words with flag "#", 0 or more times (*),
33        #  then any word with flag "0",
34        #  then any word with flag "{"
35
36        # in .dic file:
37        1/#0
38        # reads: "1" is a word, having flags "#" and "0"
39
The :class:`Aff` class stores all data from the file -- read the class docs to better understand the
conventions and usage of directives.
42
43``Aff``
44-------
45
46.. autoclass:: Aff
47
48``Prefix`` and ``Suffix``
49-------------------------
50
51.. autoclass:: Affix
52    :members:
53.. autoclass:: Prefix
54.. autoclass:: Suffix
55
56Helper pattern-alike classes
57----------------------------
58
These classes wrap several types of somewhat pattern-alike objects that can be defined in the ``*.aff`` file,
"compiling" them into something applicable, much like Python's ``re`` module compiles regexps.
61
62.. autoclass:: BreakPattern
63.. autoclass:: Ignore
64.. autoclass:: RepPattern
65.. autoclass:: ConvTable
66.. autoclass:: CompoundPattern
67.. autoclass:: CompoundRule
68.. autoclass:: PhonetTable
69"""
70
71import re
72import functools
73import itertools
74from operator import itemgetter
75
76from collections import defaultdict
77
78from dataclasses import dataclass, field
79from typing import List, Set, Dict, Tuple, Optional
80
81from spylls.hunspell.algo.capitalization import Casing, GermanCasing, TurkicCasing
82from spylls.hunspell.algo.trie import Trie
83
84
85@dataclass
86class BreakPattern:
87    """
88    Contents of the :attr:`Aff.BREAK` directive, pattern for splitting the word, compiled to regexp.
89
90    Directives are stored this way:
91
92    .. code-block:: text
93
94        BREAK 3
95        BREAK -
96        BREAK ^-
97        BREAK -$
98
    (That's, by the way, the default value of ``BREAK``.) It means that Hunspell, while checking a word
    like "left-right", will check "left" and "right" separately; it will also ignore "-" at the beginning
    and end of the word (second and third lines). Note that ``BREAK -`` without any special chars
    will NOT ignore "-" at the beginning/end.
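
    A small sketch of what the compiled regexp does (using the default pattern shown above):

    >>> bp = BreakPattern('-')
    >>> bool(bp.regexp.search('left-right'))   # "-" surrounded by other characters: a split point
    True
    >>> bool(bp.regexp.search('-left'))        # leading "-" is not matched by the plain "-" pattern
    False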
103    """
104    pattern: str
105
106    def __post_init__(self):
107        # special chars like #, -, * etc should be escaped, but ^ and $ should be treated as in regexps
108        pattern = re.escape(self.pattern).replace('\\^', '^').replace('\\$', '$')
109        if pattern.startswith('^') or pattern.endswith('$'):
110            self.regexp = re.compile(f"({pattern})")
111        else:
112            self.regexp = re.compile(f".({pattern}).")
113
114
115@dataclass
116class Ignore:
117    """
118    Contents of the :attr:`Aff.IGNORE` directive, chars to ignore on lookup/suggest, compiled with
119    ``str.maketrans``.
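
    A tiny illustration of the effect (the value here is made up; real dictionaries list diacritical marks):

    >>> ignore = Ignore('aeiou')
    >>> 'education'.translate(ignore.tr)
    'dctn'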
120    """
121    chars: str
122
123    def __post_init__(self):
124        self.tr = str.maketrans('', '', self.chars)
125
126
127@dataclass
128class RepPattern:
129    """
130    Contents of the :attr:`Aff.REP` directive, pair of ``(frequent typo, its replacement)``. Typo pattern
131    compiled to regexp.
132
133    Example from Hunspell's docs, showing all the features:
134
135    .. code-block:: text
136
137        REP 5
138        REP f ph
139        REP ph f
140        REP tion$ shun
141        REP ^cooccurr co-occurr
142        REP ^alot$ a_lot
143
144    This means:
145
    * a table of 5 replacements (first line);
147    * try to see if "f -> ph" produces good word,
148    * try "ph -> f",
149    * at the end of the word try "tion -> shun",
150    * at the beginning of the word try "cooccurr -> co-occurr",
151    * and try to replace the whole word "alot" with "a lot" (``_`` stands for space).
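
    A quick sketch of a single compiled entry (values are illustrative):

    >>> rep = RepPattern('tion$', 'shun')
    >>> rep.regexp.search('absolution').group()
    'tion'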
152    """
153    pattern: str
154    replacement: str
155
156    def __post_init__(self):
157        self.regexp = re.compile(self.pattern)
158
159
160@dataclass
161class Affix:
162    """
163    Common base for :class:`Prefix` and :class:`Suffix`.
164
165    Affixes are stored in table looking this way:
166
167    .. code-block:: text
168
169        SFX X Y 1
170        SFX X   0 able/CD . ds:able
171
172    Meaning of the first line (table header):
173
174    * Suffix (can be ``PFX`` for prefix)
175    * ...designated by flag ``X``
    * ...supports cross-product (Y or N; "cross-product" means a form with this suffix is also allowed to
      have prefixes)
178    * ...and there is 1 of them below
179
180    Meaning of the table row:
181
182    * Suffix X (should be same as table header)
    * ...when applied, doesn't change the stem (0 = "", but it could be "...removes this part at the end of the stem")
    * ...when applied, adds "able" to the stem
    * ...and the whole form will also have flags "C", "D"
186    * ...condition of application is "any stem" (``.`` -- read it as regexp's "any char")
187    * ...and the whole form would have data tags (morphology) ``ds:able``
188
189    Then, if in the dictionary we have ``drink/X`` (can have the suffix marked by ``X``), the whole
190    thing means "'drinkable' is a valid word form, has additional flags 'C', 'D' and some morphological info".
191
192    Another example (from ``en_US.aff``):
193
194    .. code-block:: text
195
196        SFX N Y 3
197        SFX N   e     ion        e
198        SFX N   y     ication    y
199        SFX N   0     en         [^ey]
200
    This defines a suffix designated by flag ``N``, cross-productable (note the "Y"), with 3 forms:
202
203    * removes "e" and adds "ion" for words ending with "e" (animate => animation)
204    * removes "y" and adds "icaton" for words ending with "y" (amplify => amplification)
205    * removes nothing and adds "en" for words ending with neither (befall => befallen)
206
    *(TBH, I don't have the slightest idea why the third option is grouped with the two previous ones... Probably
    because dictionary building is a semi-automated process of "packing" word lists into dic+aff, and
    the "affixes" don't actually need to bear any grammatical sense.)*
210    """
211
    #: Flag this affix is marked with. Note that several affixes can have the same flag (and in this case,
    #: which of them is relevant for a word is decided by its :attr:`condition`)
214    flag: str
215    #: Whether this affix is compatible with opposite affix (e.g. if the word has both suffix and prefix,
216    #: both of them should have ``crossproduct=True``)
217    crossproduct: bool
218    #: What is stripped from the stem when the affix is applied
219    strip: str
220    #: What is added when the affix is applied
221    add: str
    #: Condition against which the stem should be checked to understand whether this affix is relevant
223    condition: str
224    #: Flags this affix has
225    flags: Set[str] = field(default_factory=set)
226
227
228@dataclass
229class Prefix(Affix):
230    """
231    :class:`Affix` at the beginning of the word, stored in :attr:`Aff.PFX` directive.
232    """
233
234    def __post_init__(self):
235        # "-" does NOT have a special regex-meaning, while might happen as a regular word char (for ex., hu_HU)
236        condition = self.condition.replace('-', '\\-')
237        self.cond_regexp = re.compile('^' + condition)
238
239        cond_parts = re.findall(r'(\[.+\]|[^\[])', condition)
240        cond_parts = cond_parts[len(self.strip):]
241
242        if cond_parts and cond_parts != ['.']:
243            cond = '(?=' + ''.join(cond_parts) + ')'
244        else:
245            cond = ''
246
247        self.lookup_regexp = re.compile('^' + self.add + cond)
248        self.replace_regexp = re.compile('^' + self.add)
249
250    def __repr__(self):
251        return (
252            f"Prefix({self.add}: {self.flag}{'×' if self.crossproduct else ''}" +
253            (f"/{','.join(self.flags)}" if self.flags else '') +
254            f", on ^{self.strip}[{self.condition}])"
255        )
256
257
258@dataclass
259class Suffix(Affix):
260    """
261    :class:`Affix` at the end of the word, stored in :attr:`Aff.SFX` directive.
262    """
263
264    def __post_init__(self):
265        # "-" does NOT have a special regex-meaning, while might happen as a regular word char (for ex., hu_HU)
266        self.cond_regexp = re.compile(self.condition.replace('-', '\\-') + '$')
267
268        cond_parts = re.findall(r'(\[.+\]|[^\[])', self.condition)
269        if self.strip:
270            cond_parts = cond_parts[:-len(self.strip)]
271
272        if cond_parts and cond_parts != ['.']:
273            cond = '(' + ''.join(cond_parts) + ')'
274        else:
275            cond = ''
276
277        cond = cond.replace('-', '\\-')
278        self.lookup_regexp = re.compile(cond + self.add + '$')
279        self.replace_regexp = re.compile(self.add + '$')
280
281    def __repr__(self):
282        return (
283            f"Suffix({self.add}: {self.flag}{'×' if self.crossproduct else ''}" +
284            (f"/{','.join(self.flags)}" if self.flags else '') +
285            f", on [{self.condition}]{self.strip}$)"
286        )
287
288
289@dataclass
290class CompoundRule:
291    """
292    Regexp-alike rule for generating compound words, content of :attr:`Aff.COMPOUNDRULE` directive.
293    It is a way of specifying compounding alternative (and unrelated) to :attr:`Aff.COMPOUNDFLAG` and
294    similar. Rules look this way:
295
296    .. code-block:: text
297
298        COMPOUNDRULE A*B?CD
299
    ...reading: a compound word might consist of any number of words with flag ``A``, then 0 or 1 words
    with flag ``B``, then a word with flag ``C``, then a word with flag ``D``.
302
303    ``en_US.aff`` uses this feature to specify spelling of numerals. In .aff-file, it has
304
305    .. code-block:: text
306
307        COMPOUNDRULE 2
308        COMPOUNDRULE n*1t
309        COMPOUNDRULE n*mp
310
311    And, in .dic-file:
312
313    .. code-block:: text
314
315        0/nm
316        0th/pt
317        1/n1
318        1st/p
319        1th/tc
320        2/nm
321        2nd/p
322        2th/tc
323        # ...and so on...
324
325    Which makes "111th" valid (one hundred eleventh): "1" with "n", "1" with "1" and "1th" with "t"
326    is valid by rule ``n*1t``, but "121th" is not valid (should be "121st")
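
    A small sketch of how the compiled rule is consulted (the flag sets are what the .dic entries above
    would produce):

    >>> rule = CompoundRule('n*1t')
    >>> rule.fullmatch([{'n', '1'}, {'n', '1'}, {'t', 'c'}])   # "1" + "1" + "1th"
    True
    >>> rule.fullmatch([{'n', '1'}, {'p'}])                    # "1" + "1st" -- not allowed by this rule
    False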
327    """
328
329    text: str
330
331    def __post_init__(self):
332        # TODO: proper flag parsing! Long is (aa)(bb)*(cc), numeric is (1001)(1002)*(1003)
333        # This works but is super ad-hoc!
334        if '(' in self.text:
335            self.flags = set(re.findall(r'\((.+?)\)', self.text))
336            parts = re.findall(r'\([^*?]+?\)[*?]?', self.text)
337        else:
338            self.flags = set(re.sub(r'[\*\?]', '', self.text))
339            # There are ) flags used in real-life sv_* dictionaries
340            # Obviously it is quite ad-hoc (other chars that have special meaning in regexp might be
341            # used eventually)
342            parts = [part.replace(')', '\\)') for part in re.findall(r'[^*?][*?]?', self.text)]
343
344        self.re = re.compile(''.join(parts))
345        self.partial_re = re.compile(
346            functools.reduce(lambda res, part: f"{part}({res})?", parts[::-1])
347        )
348
349    def fullmatch(self, flag_sets):
350        relevant_flags = [self.flags.intersection(f) for f in flag_sets]
351        return any(
352            self.re.fullmatch(''.join(fc))
353            for fc in itertools.product(*relevant_flags)
354        )
355
356    def partial_match(self, flag_sets):
357        relevant_flags = [self.flags.intersection(f) for f in flag_sets]
358        return any(
359            self.partial_re.fullmatch(''.join(fc))
360            for fc in itertools.product(*relevant_flags)
361        )
362
363
364@dataclass
365class CompoundPattern:
366    """
367    Pattern to check whether compound word is correct, stored in :attr:`Aff.CHECKCOMPOUNDPATTERN` directive.
368    Format of the pattern:
369
370    .. code-block:: text
371
372        endchars[/flag] beginchars[/flag] [replacement]
373
    The pattern matches (telling that this compound is not allowed) if some pair of adjacent words inside
    the compound matches the conditions:

    * the first word ends with ``endchars`` (and has the ``flag`` from the first element, if it is specified)
    * the second word starts with ``beginchars`` (and has the ``flag`` from the second element, if it is
      specified)
380
381    ``endchars`` can be 0, specifying "word has zero affixes".
382
    ``replacement`` complicates things, allowing one to specify "...but if this string occurs at the border of the
    words, it should be unpacked into this ``endchars`` and that ``beginchars``, and then the compound is
    allowed"... It complicates the algorithm significantly, and **no known dictionary** uses this feature,
    so ``replacement`` is just ignored by Spylls.
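
    A minimal sketch of how the first two elements of the directive are parsed (values are illustrative):

    >>> pattern = CompoundPattern('er/W', 's/X')
    >>> (pattern.left_stem, pattern.left_flag, pattern.right_stem, pattern.right_flag)
    ('er', 'W', 's', 'X')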
387    """
388
389    left: str
390    right: str
391    replacement: Optional[str] = None
392
393    def __post_init__(self):
394        self.left_stem, _, self.left_flag = self.left.partition('/')
395        self.right_stem, _, self.right_flag = self.right.partition('/')
396
397        if self.left_stem == '0':
398            self.left_stem = ''
399            self.left_no_affix = True
400        else:
401            self.left_no_affix = False
402
        # FIXME: Hunspell docs say 0 is only allowed for the first pattern
404        if self.right_stem == '0':
405            self.right_stem = ''
406            self.right_no_affix = True
407        else:
408            self.right_no_affix = False
409
410    def match(self, left, right):
411        return (left.stem.endswith(self.left_stem)) and (right.stem.startswith(self.right_stem)) and \
412               (not self.left_no_affix or not left.is_base()) and \
413               (not self.right_no_affix or not right.is_base()) and \
414               (not self.left_flag or self.left_flag in left.flags()) and \
415               (not self.right_flag or self.right_flag in right.flags())
416
417
418@dataclass
419class ConvTable:
420    """
421    Table of conversions that should be applied on pre- or post-processing, stored in :attr:`Aff.ICONV` and
422    :attr:`Aff.OCONV`. Format is as follows (as far as I can guess from code and tests, documentation
423    is very sparse):
424
425    .. code-block:: text
426
427        ICONV <number of entries>
428        ICONV <pattern> <replacement>
429
430    Typically, ``pattern`` and ``replacement`` are just simple strings, used mostly for replacing
431    typographics (like trigraphs and "nice" apostrophes) before/after processing.
432
    But if there is a ``_`` in ``pattern``, it is treated as: regexp ``^`` if at the beginning of
    the pattern, regexp ``$`` if at the end, and it is just ignored otherwise. This seems to be a "hidden"
    feature, demonstrated by the ``nepali.*`` set of tests in the Hunspell distribution.
436
437    Conversion rules are applied as follows:
438
439    * for each position in word
    * ...find all matching rules
    * ...choose the one with the longest pattern
    * ...apply it, and shift to the position right after the match (so there can't be recursive application
      of several rules on top of each other).
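
    A small illustration of the "longest pattern wins" behavior (the conversion pairs are made up):

    >>> conv = ConvTable([('f', 'F'), ('ffi', 'ﬃ')])
    >>> conv('office')
    'oﬃce'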
444    """
445
446    pairs: List[Tuple[str, str]]
447
448    def __post_init__(self):
449        def compile_row(pat1, pat2):
450            pat1clean = pat1.replace('_', '')
451            pat1re = pat1clean
452            if pat1.startswith('_'):
453                pat1re = '^' + pat1re
454            if pat1.endswith('_'):
455                pat1re = pat1re + '$'
456
457            return (pat1clean, re.compile(pat1re), pat2.replace('_', ' '))
458
459        # TODO: don't need key=?.. (default behavior)
460        self.table = sorted([compile_row(*row) for row in self.pairs], key=itemgetter(0))
461
462    def __call__(self, word):
463        pos = 0
464        res = ''
465        while pos < len(word):
466            matches = sorted(
467                [(search, pattern, replacement)
468                 for search, pattern, replacement in self.table
469                 if pattern.match(word, pos)],
470                key=lambda r: len(r[0]),
471                reverse=True
472            )
473            if matches:
474                search, pattern, replacement = matches[0]
475                res += replacement
476                pos += len(search)
477            else:
478                res += word[pos]
479                pos += 1
480
481        return res
482
483
484@dataclass
485class PhonetTable:
486    """
487    Represents table of metaphone transformations stored in :attr:`Aff.PHONE`. Format is borrowed
488    from aspell and described `in its docs <http://aspell.net/man-html/Phonetic-Code.html>`_.
489
    Basically, each line of the table specifies a pair of "pattern"/"replacement". The replacement is
    a literal string (with "_" meaning "empty string"), and the pattern is ... complicated. Spylls, as
    of now, parses rules fully (see the ``parse_rule`` method in the source), but doesn't implement all
    the algorithm's details (like rule prioritizing, the concept of a "follow-up rule", etc.)

    It is enough to pass Hunspell's (small) test for the PHONE implementation, but it is definitely more naive
    than expected. But as it is a marginal feature (and there are enough metaphone implementations in
    Python), we aren't (yet?) bothered by this fact.
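
    A small sketch of how one aspell-style rule is parsed (the rule itself is illustrative):

    >>> table = PhonetTable([('AH(AEIOUY)-^', '*H')])
    >>> rule = table.rules['A'][0]
    >>> rule.search.pattern, rule.replacement, rule.start
    ('AH(?=[AEIOUY])', '*H', True)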
498    """
499    table: List[Tuple[str, str]]
500
501    RULE_PATTERN = re.compile(
502        r'(?P<letters>\w+)(\((?P<optional>\w+)\))?(?P<lookahead>[-]+)?(?P<flags>[\^$<]*)(?P<priority>\d)?'
503    )
504
505    @dataclass
506    class Rule:             # pylint: disable=missing-class-docstring
507        search: re.Pattern
508        replacement: str
509
510        start: bool = False
511        end: bool = False
512
513        priority: int = 5
514
515        followup: bool = True
516
517        def match(self, word, pos):
518            if self.start and pos > 0:
519                return False
520            if self.end:
521                return self.search.fullmatch(word, pos)
522            return self.search.match(word, pos)
523
524    def __post_init__(self):
525        self.rules = defaultdict(list)
526
527        for search, replacement in self.table:
528            self.rules[search[0]].append(self.parse_rule(search, replacement))
529
530    def parse_rule(self, search: str, replacement: str) -> Rule:
531        m = self.RULE_PATTERN.fullmatch(search)
532
533        if not m:
534            raise ValueError(f'Not a proper rule: {search!r}')
535
536        text = [*m.group('letters')]
537        if m.group('optional'):
538            text.append('[' + m.group('optional') + ']')
539        if m.group('lookahead'):
540            la = len(m.group('lookahead'))
541            regex = ''.join(text[:-la]) + '(?=' + ''.join(text[-la:]) + ')'
542        else:
543            regex = ''.join(text)
544
545        return PhonetTable.Rule(
546            search=re.compile(regex),
547            replacement=replacement,
548            start=('^' in m.group('flags')),
549            end=('$' in m.group('flags')),
550            followup=(m.group('lookahead') is not None)
551        )
552
553
554@dataclass
555class Aff:
556    """
557    The class contains all directives from .aff file in its attributes.
558
    Attribute **names** are exactly the same as the directives they were read from
    (they are upper-case, which is un-Pythonic, but allows one to unambiguously relate directives to attrs and
    grep them in code).
562
    Attribute **values** are either appropriate primitive data types (strings, numbers, arrays, etc.),
    or simple objects wrapping this data to make it easily usable in algorithms (mostly pattern-alike
    objects, like the result of Python's standard ``re.compile``, but specific to the Hunspell domain).
567
568    Attribute **docs** include explanations derived from
569    `Hunspell's man page <https://www.manpagez.com/man/5/hunspell/>`_ (sometimes rephrased/abbreviated),
    plus links to the relevant chunks of ``spylls`` code which use the directive.
571
    Note that **all** directives are optional; an empty .aff file is a valid one.
573
574    **General**
575
576    .. autoattribute:: SET
577    .. autoattribute:: FLAG
578    .. autoattribute:: LANG
579    .. autoattribute:: WORDCHARS
580    .. autoattribute:: IGNORE
581    .. autoattribute:: CHECKSHARPS
582    .. autoattribute:: FORBIDDENWORD
583
584    **Suggestions**
585
586    .. autoattribute:: KEY
587    .. autoattribute:: TRY
588    .. autoattribute:: NOSUGGEST
589    .. autoattribute:: KEEPCASE
590    .. autoattribute:: REP
591    .. autoattribute:: MAP
592    .. autoattribute:: NOSPLITSUGS
593    .. autoattribute:: PHONE
594    .. autoattribute:: MAXCPDSUGS
595
596    **N-gram suggestions**
597
598    .. autoattribute:: MAXNGRAMSUGS
599    .. autoattribute:: MAXDIFF
600    .. autoattribute:: ONLYMAXDIFF
601
602    **Stemming**
603
604    .. autoattribute:: PFX
605    .. autoattribute:: SFX
606    .. autoattribute:: NEEDAFFIX
607    .. autoattribute:: CIRCUMFIX
608    .. autoattribute:: COMPLEXPREFIXES
609    .. autoattribute:: FULLSTRIP
610
611    **Compounding**
612
613    .. autoattribute:: BREAK
614    .. autoattribute:: COMPOUNDRULE
615    .. autoattribute:: COMPOUNDMIN
616    .. autoattribute:: COMPOUNDWORDMAX
617    .. autoattribute:: COMPOUNDFLAG
618    .. autoattribute:: COMPOUNDBEGIN
619    .. autoattribute:: COMPOUNDMIDDLE
620    .. autoattribute:: COMPOUNDEND
621    .. autoattribute:: ONLYINCOMPOUND
622    .. autoattribute:: COMPOUNDPERMITFLAG
623    .. autoattribute:: COMPOUNDFORBIDFLAG
624    .. autoattribute:: FORCEUCASE
625    .. autoattribute:: CHECKCOMPOUNDCASE
626    .. autoattribute:: CHECKCOMPOUNDDUP
627    .. autoattribute:: CHECKCOMPOUNDREP
628    .. autoattribute:: CHECKCOMPOUNDTRIPLE
629    .. autoattribute:: CHECKCOMPOUNDPATTERN
630    .. autoattribute:: SIMPLIFIEDTRIPLE
631    .. autoattribute:: COMPOUNDSYLLABLE
632    .. autoattribute:: COMPOUNDMORESUFFIXES
633    .. autoattribute:: COMPOUNDROOT
634
635    **Pre/post-processing**
636
637    .. autoattribute:: ICONV
638    .. autoattribute:: OCONV
639
640    **Aliasing**
641
642    .. autoattribute:: AF
643    .. autoattribute:: AM
644
645    **Other/Ignored**
646
647    .. autoattribute:: WARN
648    .. autoattribute:: FORBIDWARN
649    .. autoattribute:: SYLLABLENUM
650    .. autoattribute:: SUBSTANDARD
651
652    Some other directives that are in docs, but are deprecated/not used (and never implemented by Spylls):
653
654    * ``LEMMA_PRESENT``
655
656    **Derived attributes**
657
    These attributes are calculated after the Aff is read and initialized.
659
660    .. py:attribute:: casing
661        :type: spylls.hunspell.algo.capitalization.Casing
662
663        "Casing" class (defining how the words in this language lowercased/uppercased). See
664        :class:`Casing <spylls.hunspell.algo.capitalization.Casing>` for details. In ``Aff``, basically, it is
665
666        * :class:`GermanCasing <spylls.hunspell.algo.capitalization.GermanCasing>` if :attr:`CHECKSHARPS`
667          is ``True``,
668        * :class:`TurkicCasing <spylls.hunspell.algo.capitalization.TurkicCasing>` if :attr:`LANG` is
669          one of Turkic languages (Turkish, Azerbaijani, Crimean Tatar),
670        * regular ``Casing`` otherwise.
671
672    .. py:attribute:: suffixes_index
673        :type: spylls.hunspell.algo.trie.Trie
674
        `Trie <https://en.wikipedia.org/wiki/Trie>`_ structure for fast selection of all possible suffixes
        for some word, created from :attr:`SFX`
677
678    .. py:attribute:: prefixes_index
679        :type: spylls.hunspell.algo.trie.Trie
680
        `Trie <https://en.wikipedia.org/wiki/Trie>`_ structure for fast selection of all possible prefixes
        for some word, created from :attr:`PFX`
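
    A quick way to inspect these attributes on a real dictionary (a sketch; the path is illustrative):

    .. code-block:: python

        from spylls.hunspell import Dictionary

        dictionary = Dictionary.from_files('en_US')
        aff = dictionary.aff

        print(aff.SET, aff.FLAG)      # encoding and flag format
        print(aff.TRY)                # characters to try in edit-based suggestions
        print(aff.SFX.get('N', []))   # all Suffix objects with the flag "N", if any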
683    """
684
685    #: .aff and .dic encoding.
686    #:
687    #: *Usage*: Stored in :class:`readers.aff.Context <spylls.hunspell.readers.aff.Context>` and used
688    #: for reopening .aff file (after the directive was read) in
689    #: :meth:`reader_aff <spylls.hunspell.readers.aff.read_aff>`, and for opening .dic file
690    #: in :meth:`reader_dic <spylls.hunspell.readers.dic.read_dic>`
691    SET: str = 'Windows-1252'
692
693    #: .aff file declares one of the possible flag formats:
694    #:
695    #: * ``short`` (default) -- each flag is one ASCII character
696    #: * ``long`` -- each flag is two ASCII characters
    #: * ``numeric`` -- each flag is a number; flags in a set are separated with ``,``
698    #: * ``UTF-8`` -- each flag is one UTF-8 character
699    #:
700    #: Flag format defines how flag sets attached to stems and affixes are parsed. For example,
    #: the .dic file entry ``cat/ABCD`` can be read as having flags ``{"A", "B", "C", "D"}``
    #: (default flag format, "short"), or ``{"AB", "CD"}`` (flag format "long")
703    #:
704    #: *Usage*: Stored in :class:`readers.aff.Context <spylls.hunspell.readers.aff.Context>` and used
705    #: in :meth:`reader_aff <spylls.hunspell.readers.aff.read_aff>`, and
706    #: in :meth:`reader_dic <spylls.hunspell.readers.dic.read_dic>`
707    FLAG: str = 'short'  # TODO: Enum of possible values, in fact
708
    #: ISO language code. The only codes that change behavior are those of Turkic languages, which
    #: have different I/i capitalization logic.
711    #:
712    #: *Usage*: Abstracted into :attr:`casing` which is used in both lookup and suggest.
713    LANG: Optional[str] = None
714
715    #: Extends tokenizer of Hunspell command line interface with additional word characters, for example,
716    #: dot, dash, n-dash, numbers.
717    #:
718    #: *Usage*: Not used in Spylls at all, as it doesn't do tokenization.
719    WORDCHARS: Optional[str] = None
720
    #: Sets characters to ignore in dictionary words, affixes and input words. Useful for optional characters,
    #: such as Arabic (harakat) or Hebrew (niqqud) diacritical marks.
723    #:
724    #: *Usage*: in :meth:`Lookup.__call__ <spylls.hunspell.algo.lookup.Lookup.__call__>` for preparing
725    #: input word, and in :meth:`reader_aff <spylls.hunspell.readers.aff.read_aff>`, and
726    #: in :meth:`reader_dic <spylls.hunspell.readers.dic.read_dic>`.
727    IGNORE: Optional[Ignore] = None
728
    #: Specifies that this language has the German "sharp s" (ß), so this language is probably German
    #: :)
    #:
    #: The effect of this declaration is that an uppercase word containing "ß" is considered correct (the
    #: uppercase form of "ß" is "SS", but it is allowed to leave "ß" lowercased). This effect can be prohibited
    #: for particular words by marking them with the :attr:`KEEPCASE` flag (which in other situations has a
    #: different meaning).
736    #:
737    #: *Usage:* To define whether to use
738    #: :class:`GermanCasing <spylls.hunspell.algo.capitalization.GermanCasing>` in :attr:`casing`
739    #: (which changes word lower/upper-casing slightly), and in
740    #: :meth:`Lookup.good_forms <spylls.hunspell.algo.lookup.Lookup.good_forms>` to drop forms where
741    #: lowercase "ß" is prohibited.
742    CHECKSHARPS: bool = False
743
744    #: Flag that marks word as forbidden. The main usage of this flag is to specify that some form
745    #: that is logically possible (by affixing/suffixing or compounding) is in fact non-existent.
746    #:
    #: Imaginary example (not from the actual English dictionary!): let's say the word "create" can have suffixes
    #: "-d", "-s", "-ion", and prefixes "un-", "re-", "de-", but of all possible forms (created,
    #: creates, creation, uncreates, uncreation, ...) we decide that "decreated" is not an existing word.
    #: Then we mark (in the .dic file) the word "create" with flags for all those suffixes and prefixes,
    #: but also add the separate word "decreated" to the dictionary, marked with the flag specified
    #: in .aff's FORBIDDENWORD directive. Now, this word won't be considered correct, but all other
    #: combinations will.
754    #:
755    #: *Usage:* multiple times in both :class:`Lookup <spylls.hunspell.algo.lookup.Lookup>` and
756    #: :class:`Suggest <spylls.hunspell.algo.suggest.Suggest>`
757    FORBIDDENWORD: Optional[str] = None
758
759    #: Flag to mark words which shouldn't be considered correct unless their casing is exactly like in
760    #: the dictionary.
761    #:
762    #:      Note: With :attr:`CHECKSHARPS` declaration, words with sharp s (ß) and ``KEEPCASE`` flag
763    #:      may be capitalized and uppercased, but uppercased forms of these words may not contain "ß",
764    #:      only "SS".
765    #:
766    #: *Usage:* :meth:`Suggest.suggestions <spylls.hunspell.algo.suggest.Suggest.suggestions>`
767    #: to produce suggestions in proper case,
768    #: :meth:`Lookup.is_good_form <spylls.hunspell.algo.lookup.Lookup.is_good_form>`.
769    KEEPCASE: Optional[str] = None
770
771    # **Suggestions**
772
773    #: Flag to mark word/affix as "shouldn't be suggested" (but considered correct on lookup), like
774    #: obscenities.
775    #:
776    #: *Usage:* on :class:`Suggest <spylls.hunspell.algo.suggest.Suggest>` creation (to make list of
777    #: dictionary words for ngram-check), and in
778    #: :meth:`Lookup.is_good_form <spylls.hunspell.algo.lookup.Lookup.is_good_form>` (if the lookup is
779    #: called from suggest, with ``allow_nosuggest=False``)
780    NOSUGGEST: Optional[str] = None
781
    #: String that specifies sets of adjacent characters on the keyboard (so suggest could understand
    #: that "kitteb" is most probably a misspelling of "kitten"). Format is "abc|def|xyz". For an English
    #: QWERTY keyboard it might be ``qwertyuiop|asdfghjkl|zxcvbnm``
785    #:
786    #: *Usage:*
787    #: :meth:`Suggest.edits <spylls.hunspell.algo.suggest.Suggest.edits>`
788    #: to pass to :meth:`permutations.badcharkey <spylls.hunspell.algo.permutations.badcharkey>`.
789    KEY: str = 'qwertyuiop|asdfghjkl|zxcvbnm'
790
791    #: List of all characters that can be used in words, *in order of probability* (most probable first),
792    #: used on edits for suggestions (trying to add missing, or replace erroneous character).
793    #:
794    #: *Usage:*
795    #: :meth:`Suggest.edits <spylls.hunspell.algo.suggest.Suggest.edits>`
796    #: to pass to :meth:`permutations.badchar <spylls.hunspell.algo.permutations.badchar>` and
    #: :meth:`permutations.forgotchar <spylls.hunspell.algo.permutations.forgotchar>`. Note that,
    #: obscurely enough, Suggest checks this option to
    #: decide whether a dash should be used when suggesting two words (e.g. for misspelled "foobar",
    #: when it is decided that it is two words erroneously joined, suggest either returns only
    #: "foo bar", or also "foo-bar"). Whether the dash is suggested is decided by the presence of ``"-"`` in
    #: ``TRY``, or by the presence of Latin ``"a"`` (= "the language uses Latin script, and all such languages
    #: allow dashes between words")... That's how it is in Hunspell!
804    TRY: str = ''
805
806    #: *Table* of replacements for typical typos (like "shun"->"tion") to try on suggest. See :class:`RepPattern`
807    #: for details of format.
808    #:
809    #: *Usage:* :meth:`Suggest.edits <spylls.hunspell.algo.suggest.Suggest.edits>` to pass to
810    #: :meth:`permutations.replchars <spylls.hunspell.algo.permutations.replchars>`.
    #: Note that the table is populated from the aff's ``REP`` directive, *and* from the .dic file's ``ph:``
    #: tags (see :class:`Word <spylls.hunspell.data.dic.Word>` and
813    #: :meth:`read_dic <spylls.hunspell.readers.dic.read_dic>` for detailed explanations).
814    REP: List[RepPattern] = field(default_factory=list)
815
816    #: Sets of "similar" chars to try in suggestion (like ``aáã`` -- if they all exist in the language,
817    #: replacing one in another would be a frequent typo). Several chars as a single entry should be
818    #: grouped by parentheses: ``MAP ß(ss)`` (German "sharp s" and "ss" sequence are more or less the same).
819    #:
820    #: *Usage:*
821    #: :meth:`Suggest.edits <spylls.hunspell.algo.suggest.Suggest.edits>`
822    #: to pass to :meth:`permutations.mapchars <spylls.hunspell.algo.permutations.mapchars>`.
823    MAP: List[Set[str]] = field(default_factory=list)
824
825    #: Never try to suggest "this word should be split in two". LibreOffice sv_SE dictionary says
826    #: "it is a must for Swedish". (Interestingly enough, Hunspell's tests doesn't check this flag at
827    #: all).
828    #:
829    #: *Usage:*
830    #: :meth:`Suggest.edits <spylls.hunspell.algo.suggest.Suggest.edits>`
831    NOSPLITSUGS: bool = False
832
833    #: Table for metaphone transformations. Format is borrowed from aspell and described
834    #: `in its docs <http://aspell.net/man-html/Phonetic-Code.html>`_.
835    #:
    #: Note that dictionaries with a ``PHONE`` table are extremely rare: of all the LibreOffice/Firefox
    #: dictionaries, only en_ZA (South Africa) contains it -- though it is a generic set of English metaphone
    #: rules, and it is quite weird they are not used more frequently.
839    #:
840    #: Showcase (with LibreOffice dictionaries)::
841    #:
842    #:  >>> misspelled = 'excersized'
843    #:
844    #:  >>> nometaphone = Dictionary.from_files('en/en_US')
    #:  >>> [*nometaphone.suggest(misspelled)]
846    #:  ['supersized']
847    #:
848    #:  >>> withmetaphone = Dictionary.from_files('en/en_ZA')
849    #:  >>> [*withmetaphone.suggest(misspelled)]
850    #:  ['excerpted', 'exercised', 'excessive']
851    #:
852    #: *Usage:* :mod:`phonet_suggest <spylls.hunspell.algo.phonet_suggest>`
853    PHONE: Optional[PhonetTable] = None
854
    #: Limits the number of compound suggestions.
    #:
    #: *Usage:* :meth:`Suggest.suggestions <spylls.hunspell.algo.suggest.Suggest.suggestions>`
857    #: to limit number of edit-based suggestions which are compound words.
858    MAXCPDSUGS: int = 3
859
    # **N-gram suggestions**
861
862    #: Set max. number of n-gram suggestions. Value 0 switches off the n-gram suggestions (see also
863    #: :attr:`MAXDIFF`).
864    #:
865    #: *Usage:* :meth:`Suggest.ngram_suggestions <spylls.hunspell.algo.suggest.Suggest.ngram_suggestions>`
866    #: (to decide whether ``ngram_suggest`` should be called at all) and
867    #: :meth:`Suggest.suggestions <spylls.hunspell.algo.suggest.Suggest.suggestions>` (to limit
868    #: amount of ngram-based suggestions).
869    MAXNGRAMSUGS: int = 4
870
871    #: Set the similarity factor for the n-gram based suggestions:
872    #:
873    #: * 5 = default value
874    #: * 0 = fewer n-gram suggestions, but at least one;
875    #: * 10 (max) = :attr:`MAXNGRAMSUGS` n-gram suggestions.
876    #:
877    #: *Usage:* :meth:`Suggest.ngram_suggestions <spylls.hunspell.algo.suggest.Suggest.ngram_suggestions>` where
878    #: it is passed to :mod:`ngram_suggest <spylls.hunspell.algo.ngram_suggest>` module, and used in
879    #: :meth:`detailed_affix_score <spylls.hunspell.algo.ngram_suggest.detailed_affix_score>`.
880    MAXDIFF: int = -1
881
882    #: Remove all bad n-gram suggestions (default mode keeps one, see :attr:`MAXDIFF`).
883    #:
884    #: *Usage:* :meth:`Suggest.ngram_suggestions <spylls.hunspell.algo.suggest.Suggest.ngram_suggestions>` where
885    #: it is passed to :mod:`ngram_suggest <spylls.hunspell.algo.ngram_suggest>` module, and used in
886    #: :meth:`filter_guesses <spylls.hunspell.algo.ngram_suggest.filter_guesses>`.
887    ONLYMAXDIFF: bool = False
888
889    # **Stemming**
890
891    #: Dictionary of ``flag => prefixes with this flag``. See :class:`Affix` for detailed format and
892    #: meaning description.
893    #:
894    #: Usage:
895    #:
896    #: * in :meth:`Suggest.ngram_suggestions <spylls.hunspell.algo.suggest.Suggest.ngram_suggestions>`
897    #:   to pass to :mod:`ngram_suggest <spylls.hunspell.algo.ngram_suggest>`
898    #:   (and there to construct all possible forms).
    #: * also parsed into the :attr:`prefixes_index` Trie, which is then used in
    #:   :meth:`Lookup.deprefix <spylls.hunspell.algo.lookup.Lookup.deprefix>`
901    PFX: Dict[str, List[Prefix]] = field(default_factory=dict)
902
903    #: Dictionary of ``flag => suffixes with this flag``. See :class:`Affix` for detailed format and
904    #: meaning description.
905    #:
906    #: Usage:
907    #:
908    #: * in :meth:`Suggest.ngram_suggestions <spylls.hunspell.algo.suggest.Suggest.ngram_suggestions>`
909    #:   to pass to :mod:`ngram_suggest <spylls.hunspell.algo.ngram_suggest>`
910    #:   (and there to construct all possible forms).
    #: * also parsed into the :attr:`suffixes_index` Trie, which is then used in
    #:   :meth:`Lookup.desuffix <spylls.hunspell.algo.lookup.Lookup.desuffix>`
913    SFX: Dict[str, List[Suffix]] = field(default_factory=dict)
914
915    #: Flag saying "this stem can't be used without affixes". Can be also assigned to suffix/prefix,
916    #: meaning "there should be other affixes besides this one".
917    #:
918    #: *Usage:* :meth:`Lookup.is_good_form <spylls.hunspell.algo.lookup.Lookup.is_good_form>`
919    NEEDAFFIX: Optional[str] = None
920
921    #: Suffixes signed with this flag may be on a word when this word also has a prefix with
922    #: this flag, and vice versa.
923    #:
924    #: *Usage:* :meth:`Lookup.is_good_form <spylls.hunspell.algo.lookup.Lookup.is_good_form>`
925    CIRCUMFIX: Optional[str] = None
926
    #: Whether stripping two prefixes is allowed (by default, only one prefix is). Random fun fact:
928    #: of all currently available LibreOffice and Firefox dictionaries, only Firefox's Zulu has this
929    #: flag.
930    #:
931    #: *Usage:* :meth:`Lookup.deprefix <spylls.hunspell.algo.lookup.Lookup.deprefix>`
932    COMPLEXPREFIXES: bool = False
933
    #: Whether affixes are allowed to remove the entire stem.
    #:
    #: Not used in Spylls (e.g. Spylls doesn't fail when this option is False and the entire word is removed,
    #: so Hunspell's ``fullstrip.*`` tests are passing).
938    FULLSTRIP: bool = False
939
940    # **Compounding**
941
942    #: Defines break points for breaking words and checking word parts separately. See :class:`BreakPattern`
943    #: for format definition.
944    #:
945    #: *Usage:* :meth:`Lookup.break_word <spylls.hunspell.algo.lookup.Lookup.break_word>`
946    BREAK: List[BreakPattern] = \
947        field(default_factory=lambda: [BreakPattern('-'), BreakPattern('^-'), BreakPattern('-$')])
948
949    #: Rule of producing compound words, with regexp-like syntax. See :class:`CompoundRule` for
950    #: format definition.
951    #:
952    #: *Usage:* :meth:`Lookup.compounds_by_rules <spylls.hunspell.algo.lookup.Lookup.compounds_by_rules>`
953    COMPOUNDRULE: List[CompoundRule] = field(default_factory=list)
954
955    #: Minimum length of words used for compounding.
956    #:
957    #: *Usage:* :meth:`Lookup.compounds_by_rules <spylls.hunspell.algo.lookup.Lookup.compounds_by_rules>` &
958    #: :meth:`Lookup.compounds_by_flags <spylls.hunspell.algo.lookup.Lookup.compounds_by_flags>`
959    COMPOUNDMIN: int = 3
960
961    #: Set maximum word count in a compound word.
962    #:
963    #: *Usage:* :meth:`Lookup.compounds_by_rules <spylls.hunspell.algo.lookup.Lookup.compounds_by_rules>` &
964    #: :meth:`Lookup.compounds_by_flags <spylls.hunspell.algo.lookup.Lookup.compounds_by_flags>`
965    COMPOUNDWORDMAX: Optional[int] = None
966
967    #: Forms with this flag (marking either stem, or one of affixes) can be part of the compound.
    #: Note that the triple of flags :attr:`COMPOUNDBEGIN`, :attr:`COMPOUNDMIDDLE`, :attr:`COMPOUNDEND`
    #: is a more precise way of marking ("this word can be at the beginning of a compound").
970    #:
    #: *Usage:* :meth:`Lookup.is_good_form <spylls.hunspell.algo.lookup.Lookup.is_good_form>` to compare
    #: form's compound position (or lack thereof) with the presence of the flag.
973    COMPOUNDFLAG: Optional[str] = None
974
975    #: Forms with this flag (marking either stem, or one of affixes) can be at the beginning of the
976    #: compound.
977    #: Part of the triple of flags :attr:`COMPOUNDBEGIN`, :attr:`COMPOUNDMIDDLE`, :attr:`COMPOUNDEND`;
978    #: alternative to the triple is just :attr:`COMPOUNDFLAG` ("this form can be at any place in compound").
979    #:
980    #: *Usage:* :meth:`Lookup.is_good_form <spylls.hunspell.algo.lookup.Lookup.is_good_form>`
981    #: to compare form's compound position (or lack thereof) with the presence of the flag.
982    COMPOUNDBEGIN: Optional[str] = None
983
984    #: Forms with this flag (marking either stem, or one of affixes) can be in the middle of the
985    #: compound (not the last part, and not the first).
986    #: Part of the triple of flags :attr:`COMPOUNDBEGIN`, :attr:`COMPOUNDMIDDLE`, :attr:`COMPOUNDEND`;
987    #: alternative to the triple is just :attr:`COMPOUNDFLAG` ("this form can be at any place in compound").
988    #:
989    #: *Usage:* :meth:`Lookup.is_good_form <spylls.hunspell.algo.lookup.Lookup.is_good_form>`
990    #: to compare form's compound position (or lack thereof) with the presence of the flag.
991    COMPOUNDMIDDLE: Optional[str] = None
992
993    #: Forms with this flag (marking either stem, or one of affixes) can be at the end of the
994    #: compound.
995    #: Part of the triple of flags :attr:`COMPOUNDBEGIN`, :attr:`COMPOUNDMIDDLE`, :attr:`COMPOUNDEND`;
996    #: alternative to the triple is just :attr:`COMPOUNDFLAG` ("this form can be at any place in compound").
997    #:
998    #: *Usage:* :meth:`Lookup.is_good_form <spylls.hunspell.algo.lookup.Lookup.is_good_form>`
999    #: to compare form's compound position (or lack thereof) with the presence of the flag.
1000    COMPOUNDEND: Optional[str] = None
1001
1002    #: Forms with this flag (marking either stem, or one of affixes) can only be part of the compound
1003    #: word, and never standalone.
1004    #:
1005    #: *Usage:* :meth:`Lookup.is_good_form <spylls.hunspell.algo.lookup.Lookup.is_good_form>`
1006    #: to compare form's compound position (or lack thereof) with the presence of the flag.
    #: Also in :class:`Suggest <spylls.hunspell.algo.suggest.Suggest>` to produce the list of words
    #: suitable for ngram search.
1009    ONLYINCOMPOUND: Optional[str] = None
1010
1011    #: Prefixes are allowed at the beginning of compounds, suffixes are allowed at the end of compounds
1012    #: by default. Affixes with ``COMPOUNDPERMITFLAG`` may be inside of compounds.
1013    #:
1014    #: *Usage:* :meth:`Lookup.compounds_by_flags <spylls.hunspell.algo.lookup.Lookup.compounds_by_flags>`
1015    #: to make list of flags passed to
1016    #: :meth:`Lookup.produce_affix_forms <spylls.hunspell.algo.lookup.Lookup.produce_affix_forms>`
    #: ("for this part of the compound, try to find affixed spellings; you can use affixes with this flag").
1018    COMPOUNDPERMITFLAG: Optional[str] = None
1019
1020    #: Prefixes are allowed at the beginning of compounds, suffixes are allowed at the end of compounds
    #: by default. Suffixes with ``COMPOUNDFORBIDFLAG`` may not be used even at the end, and prefixes with
    #: this flag may not be used even at the beginning.
1023    #:
1024    #: *Usage:* :meth:`Lookup.compounds_by_flags <spylls.hunspell.algo.lookup.Lookup.compounds_by_flags>`
1025    #: to make list of flags passed to
1026    #: :meth:`Lookup.produce_affix_forms <spylls.hunspell.algo.lookup.Lookup.produce_affix_forms>`
    #: ("for this part of the compound, try to find affixed spellings; you can use affixes with this flag").
1028    COMPOUNDFORBIDFLAG: Optional[str] = None
1029
    #: The last word part of a compound with the FORCEUCASE flag forces capitalization of the whole compound
    #: word. E.g. the Dutch word "straat" (street) with the FORCEUCASE flag will be allowed only in capitalized
    #: compound forms, according to the Dutch spelling rules for proper names.
1033    #:
1034    #: *Usage:* :meth:`Lookup.is_bad_compound <spylls.hunspell.algo.lookup.Lookup.is_bad_compound>`
1035    #: and
1036    #: :meth:`Suggest.suggestions <spylls.hunspell.algo.suggest.Suggest.suggestions>` (if
1037    #: this flag is present in the .aff file, we check that maybe
1038    #: just capitalization of misspelled word would make it right).
1039    FORCEUCASE: Optional[str] = None
1040
1041    #: Forbid upper case characters at word boundaries in compounds.
1042    #:
1043    #: *Usage:* :meth:`Lookup.is_bad_compound <spylls.hunspell.algo.lookup.Lookup.is_bad_compound>`
1044    CHECKCOMPOUNDCASE: bool = False
1045
1046    #: Forbid word duplication in compounds (e.g. "foofoo").
1047    #:
1048    #: *Usage:* :meth:`Lookup.is_bad_compound <spylls.hunspell.algo.lookup.Lookup.is_bad_compound>`
1049    CHECKCOMPOUNDDUP: bool = False
1050
1051    #: Forbid compounding, if the (usually bad) compound word may be a non-compound word if some
1052    #: replacement by :attr:`REP` table (frequent misspellings) is made. Useful for languages with
1053    #: "compound friendly" orthography.
1054    #:
1055    #: *Usage:* :meth:`Lookup.is_bad_compound <spylls.hunspell.algo.lookup.Lookup.is_bad_compound>`
1056    CHECKCOMPOUNDREP: bool = False
1057
    #: Forbid compounding, if the compound word contains triple repeating letters (e.g. ``foo|ox`` or ``xo|oof``).
1059    #:
1060    #: *Usage:* :meth:`Lookup.is_bad_compound <spylls.hunspell.algo.lookup.Lookup.is_bad_compound>`
1061    CHECKCOMPOUNDTRIPLE: bool = False
1062
1063    #: List of patterns which forbid compound words when pair of words in compound matches this
1064    #: pattern. See :class:`CompoundPattern` for explanation about format.
1065    #:
1066    #: *Usage:* :meth:`Lookup.is_bad_compound <spylls.hunspell.algo.lookup.Lookup.is_bad_compound>`
1067    CHECKCOMPOUNDPATTERN: List[CompoundPattern] = field(default_factory=list)
1068
1069    #: Allow simplified 2-letter forms of the compounds forbidden by :attr:`CHECKCOMPOUNDTRIPLE`.
1070    #: Example: "Schiff"+"fahrt" -> "Schiffahrt"
1071    #:
1072    #: *Usage:* :meth:`Lookup.compounds_by_flags <spylls.hunspell.algo.lookup.Lookup.compounds_by_flags>`,
1073    #: after the main splitting cycle, we also try the
1074    #: hypothesis that if the letter on the current boundary is duplicated, we should triplicate it.
1075    SIMPLIFIEDTRIPLE: bool = False
1076
    #: Needed for special compounding rules in Hungarian.
1078    #:
1079    #: Not implemented in Spylls
1080    COMPOUNDSYLLABLE: Optional[Tuple[int, str]] = None
1081
1082    #: Allow twofold suffixes within compounds.
1083    #:
1084    #: Not used in Spylls and doesn't have tests in Hunspell
1085    COMPOUNDMORESUFFIXES: bool = False
1086
1087    # *Hu-only, COMPLICATED!*
1088
    #: Needed for special compounding rules in Hungarian. (The previous phrase is the only documentation Hunspell provides ``:)``)
1090    #:
1091    #: Not used in Spylls.
1092    SYLLABLENUM: Optional[str] = None
1093
    #: Flag that marks compounds in the dictionary (currently it is used only in the Hungarian-specific code).
1095    #:
1096    #: Not used in Spylls.
1097    COMPOUNDROOT: Optional[str] = None
1098
1099    # **Pre/post-processing**
1100
1101    #: Input conversion table (what to do with word before checking if it is valid). See :class:`ConvTable`
1102    #: for format description.
1103    #:
1104    #: *Usage:* :meth:`Lookup.__call__ <spylls.hunspell.algo.lookup.Lookup.__call__>`
1105    ICONV: Optional[ConvTable] = None
1106
1107    #: Output conversion table (what to do with suggestion before returning it to the user). See :class:`ConvTable`
1108    #: for format description.
1109    #:
1110    #: *Usage:* :meth:`Suggest.suggestions <spylls.hunspell.algo.suggest.Suggest.suggestions>`
1111    OCONV: Optional[ConvTable] = None
1112
1113    # **Aliasing**
1114
1115    #: Table of flag set aliases. Defined in .aff-file this way:
1116    #:
1117    #: .. code-block:: text
1118    #:
1119    #:    AF 3
1120    #:    AF ABC
1121    #:    AF BCD
1122    #:    AF DE
1123    #:
    #: This means the set of flags "ABC" has the alias "1", "BCD" the alias "2", "DE" the alias "3" (an alias is
    #: just the sequential number of the row in the table). Now, in the .dic file, ``foo/1`` would be the
    #: equivalent of ``foo/ABC``, meaning stem ``foo`` has flags ``A, B, C``.
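    #:
    #: With the table above, the parsed attribute would look roughly like this (a sketch of the resulting mapping)::
    #:
    #:    {'1': {'A', 'B', 'C'}, '2': {'B', 'C', 'D'}, '3': {'D', 'E'}}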
1127    #:
1128    #: *Usage:* Stored in :class:`readers.aff.Context <spylls.hunspell.readers.aff.Context>` to decode
1129    #: flags on reading .aff and .dic files.
1130    AF: Dict[str, Set[str]] = field(default_factory=dict)
1131
    #: Table of word data aliases. The logic of aliasing is the same as for :attr:`AF`.
1133    #:
1134    #: *Usage:* :meth:`read_dic <spylls.hunspell.readers.dic.read_dic>`
1135    AM: Dict[str, Set[str]] = field(default_factory=dict)
1136
1137    # **Other**
1138
1139    #: This flag is for rare words, which are also often spelling mistakes.
1140    #: With command-line flag ``-r``, Hunspell will warn about words with this flag in input text.
1141    #:
1142    #: Not implemented in Spylls
1143    WARN: Optional[str] = None
1144
    #: Sets whether words with the :attr:`WARN` flag should be considered misspellings (errors, not warnings).
1146    #:
1147    #: Not used in any known dictionary, and not implemented in Spylls (even in aff-reader).
1148    FORBIDWARN: bool = False
1149
    #: Flag marking affix rules and dictionary words (allomorphs) that are not used in morphological generation,
    #: and root words that should be removed from suggestions.
1152    #:
1153    #: Not implemented in Spylls
1154    SUBSTANDARD: Optional[str] = None
1155
1156    def __post_init__(self):
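        # Suffixes are indexed by the *reversed* text they add, so that the word can be walked from its end
        # (in Lookup.desuffix) to find all suffixes that could have produced it; prefixes are indexed by the
        # added text as-is.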
1157        suffixes = defaultdict(list)
1158        for suf in itertools.chain.from_iterable(self.SFX.values()):
1159            suffixes[suf.add[::-1]].append(suf)
1160
1161        self.suffixes_index = Trie(suffixes)
1162
1163        prefixes = defaultdict(list)
1164        for pref in itertools.chain.from_iterable(self.PFX.values()):
1165            prefixes[pref.add].append(pref)
1166
1167        self.prefixes_index = Trie(prefixes)
1168
1169        if self.CHECKSHARPS:
1170            self.casing = GermanCasing()
1171        elif self.LANG in ['tr', 'tr_TR', 'az', 'crh']:     # TODO: more robust language code check!
1172            self.casing = TurkicCasing()
1173        else:
1174            self.casing = Casing()
1175