1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 /*
38  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39  * And Contributors.  All rights reserved.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  *
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  *
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  *
52  * 3. All modifications to the source code must be clearly marked as
53  *    such.  Binary redistributions based on modified source code
54  *    must be clearly marked as modified versions in the documentation
55  *    and/or other materials provided with the distribution.
56  *
57  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
61  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  */
70 
71 #ifndef AFFIXMGR_HXX_
72 #define AFFIXMGR_HXX_
73 
74 #include <stdio.h>
75 
76 #include <string>
77 #include <vector>
78 
79 #include "atypes.hxx"
80 #include "baseaffix.hxx"
81 #include "hashmgr.hxx"
82 #include "phonet.hxx"
83 #include "replist.hxx"
84 
// Bit masks for the flag-duplication check: one bit for suffixes, one for
// prefixes, so both can be recorded in a single char.
#define dupSFX 0x01
#define dupPFX 0x02

// Forward declarations; this header only uses pointers to these types.
class SfxEntry;
class PfxEntry;
91 
92 class AffixMgr {
93   PfxEntry* pStart[SETSIZE];
94   SfxEntry* sStart[SETSIZE];
95   PfxEntry* pFlag[SETSIZE];
96   SfxEntry* sFlag[SETSIZE];
97   const std::vector<HashMgr*>& alldic;
98   const HashMgr* pHMgr;
99   std::string keystring;
100   std::string trystring;
101   std::string encoding;
102   struct cs_info* csconv;
103   int utf8;
104   int complexprefixes;
105   FLAG compoundflag;
106   FLAG compoundbegin;
107   FLAG compoundmiddle;
108   FLAG compoundend;
109   FLAG compoundroot;
110   FLAG compoundforbidflag;
111   FLAG compoundpermitflag;
112   int compoundmoresuffixes;
113   int checkcompounddup;
114   int checkcompoundrep;
115   int checkcompoundcase;
116   int checkcompoundtriple;
117   int simplifiedtriple;
118   FLAG forbiddenword;
119   FLAG nosuggest;
120   FLAG nongramsuggest;
121   FLAG needaffix;
122   int cpdmin;
123   bool parsedrep;
124   std::vector<replentry> reptable;
125   RepList* iconvtable;
126   RepList* oconvtable;
127   bool parsedmaptable;
128   std::vector<mapentry> maptable;
129   bool parsedbreaktable;
130   std::vector<std::string> breaktable;
131   bool parsedcheckcpd;
132   std::vector<patentry> checkcpdtable;
133   int simplifiedcpd;
134   bool parseddefcpd;
135   std::vector<flagentry> defcpdtable;
136   phonetable* phone;
137   int maxngramsugs;
138   int maxcpdsugs;
139   int maxdiff;
140   int onlymaxdiff;
141   int nosplitsugs;
142   int sugswithdots;
143   int cpdwordmax;
144   int cpdmaxsyllable;
145   std::string cpdvowels; // vowels (for calculating of Hungarian compounding limit,
146   std::vector<w_char> cpdvowels_utf16; //vowels for UTF-8 encoding
147   std::string cpdsyllablenum; // syllable count incrementing flag
148   const char* pfxappnd;  // BUG: not stateless
149   const char* sfxappnd;  // BUG: not stateless
150   int sfxextra;          // BUG: not stateless
151   FLAG sfxflag;          // BUG: not stateless
152   char* derived;         // BUG: not stateless
153   SfxEntry* sfx;         // BUG: not stateless
154   PfxEntry* pfx;         // BUG: not stateless
155   int checknum;
156   std::string wordchars; // letters + spec. word characters
157   std::vector<w_char> wordchars_utf16;
158   std::string ignorechars; // letters + spec. word characters
159   std::vector<w_char> ignorechars_utf16;
160   std::string version;   // affix and dictionary file version string
161   std::string lang;	 // language
162   int langnum;
163   FLAG lemma_present;
164   FLAG circumfix;
165   FLAG onlyincompound;
166   FLAG keepcase;
167   FLAG forceucase;
168   FLAG warn;
169   int forbidwarn;
170   FLAG substandard;
171   int checksharps;
172   int fullstrip;
173 
174   int havecontclass;           // boolean variable
175   char contclasses[CONTSIZE];  // flags of possible continuing classes (twofold
176                                // affix)
177 
178  public:
179   AffixMgr(const char* affpath, const std::vector<HashMgr*>& ptr, const char* key = NULL);
180   ~AffixMgr();
181   struct hentry* affix_check(const char* word,
182                              int len,
183                              const unsigned short needflag = (unsigned short)0,
184                              char in_compound = IN_CPD_NOT);
185   struct hentry* prefix_check(const char* word,
186                               int len,
187                               char in_compound,
188                               const FLAG needflag = FLAG_NULL);
189   inline int isSubset(const char* s1, const char* s2);
190   struct hentry* prefix_check_twosfx(const char* word,
191                                      int len,
192                                      char in_compound,
193                                      const FLAG needflag = FLAG_NULL);
194   inline int isRevSubset(const char* s1, const char* end_of_s2, int len);
195   struct hentry* suffix_check(const char* word,
196                               int len,
197                               int sfxopts,
198                               PfxEntry* ppfx,
199                               const FLAG cclass = FLAG_NULL,
200                               const FLAG needflag = FLAG_NULL,
201                               char in_compound = IN_CPD_NOT);
202   struct hentry* suffix_check_twosfx(const char* word,
203                                      int len,
204                                      int sfxopts,
205                                      PfxEntry* ppfx,
206                                      const FLAG needflag = FLAG_NULL);
207 
208   std::string affix_check_morph(const char* word,
209                                 int len,
210                                 const FLAG needflag = FLAG_NULL,
211                                 char in_compound = IN_CPD_NOT);
212   std::string prefix_check_morph(const char* word,
213                                  int len,
214                                  char in_compound,
215                                  const FLAG needflag = FLAG_NULL);
216   std::string suffix_check_morph(const char* word,
217                                  int len,
218                                  int sfxopts,
219                                  PfxEntry* ppfx,
220                                  const FLAG cclass = FLAG_NULL,
221                                  const FLAG needflag = FLAG_NULL,
222                                  char in_compound = IN_CPD_NOT);
223 
224   std::string prefix_check_twosfx_morph(const char* word,
225                                         int len,
226                                         char in_compound,
227                                         const FLAG needflag = FLAG_NULL);
228   std::string suffix_check_twosfx_morph(const char* word,
229                                         int len,
230                                         int sfxopts,
231                                         PfxEntry* ppfx,
232                                         const FLAG needflag = FLAG_NULL);
233 
234   std::string morphgen(const char* ts,
235                        int wl,
236                        const unsigned short* ap,
237                        unsigned short al,
238                        const char* morph,
239                        const char* targetmorph,
240                        int level);
241 
242   int expand_rootword(struct guessword* wlst,
243                       int maxn,
244                       const char* ts,
245                       int wl,
246                       const unsigned short* ap,
247                       unsigned short al,
248                       const char* bad,
249                       int,
250                       const char*);
251 
252   short get_syllable(const std::string& word);
253   int cpdrep_check(const char* word, int len);
254   int cpdpat_check(const char* word,
255                    int len,
256                    hentry* r1,
257                    hentry* r2,
258                    const char affixed);
259   int defcpd_check(hentry*** words,
260                    short wnum,
261                    hentry* rv,
262                    hentry** rwords,
263                    char all);
264   int cpdcase_check(const char* word, int len);
265   inline int candidate_check(const char* word, int len);
266   void setcminmax(int* cmin, int* cmax, const char* word, int len);
267   struct hentry* compound_check(const std::string& word,
268                                 short wordnum,
269                                 short numsyllable,
270                                 short maxwordnum,
271                                 short wnum,
272                                 hentry** words,
273                                 hentry** rwords,
274                                 char hu_mov_rule,
275                                 char is_sug,
276                                 int* info);
277 
278   int compound_check_morph(const char* word,
279                            int len,
280                            short wordnum,
281                            short numsyllable,
282                            short maxwordnum,
283                            short wnum,
284                            hentry** words,
285                            hentry** rwords,
286                            char hu_mov_rule,
287                            std::string& result,
288                            const std::string* partresult);
289 
290   std::vector<std::string> get_suffix_words(short unsigned* suff,
291                        int len,
292                        const char* root_word);
293 
294   struct hentry* lookup(const char* word);
295   const std::vector<replentry>& get_reptable() const;
296   RepList* get_iconvtable() const;
297   RepList* get_oconvtable() const;
298   struct phonetable* get_phonetable() const;
299   const std::vector<mapentry>& get_maptable() const;
300   const std::vector<std::string>& get_breaktable() const;
301   const std::string& get_encoding();
302   int get_langnum() const;
303   char* get_key_string();
304   char* get_try_string() const;
305   const std::string& get_wordchars() const;
306   const std::vector<w_char>& get_wordchars_utf16() const;
307   const char* get_ignore() const;
308   const std::vector<w_char>& get_ignore_utf16() const;
309   int get_compound() const;
310   FLAG get_compoundflag() const;
311   FLAG get_forbiddenword() const;
312   FLAG get_nosuggest() const;
313   FLAG get_nongramsuggest() const;
314   FLAG get_needaffix() const;
315   FLAG get_onlyincompound() const;
316   const char* get_derived() const;
317   const std::string& get_version() const;
318   int have_contclass() const;
319   int get_utf8() const;
320   int get_complexprefixes() const;
321   char* get_suffixed(char) const;
322   int get_maxngramsugs() const;
323   int get_maxcpdsugs() const;
324   int get_maxdiff() const;
325   int get_onlymaxdiff() const;
326   int get_nosplitsugs() const;
327   int get_sugswithdots(void) const;
328   FLAG get_keepcase(void) const;
329   FLAG get_forceucase(void) const;
330   FLAG get_warn(void) const;
331   int get_forbidwarn(void) const;
332   int get_checksharps(void) const;
333   char* encode_flag(unsigned short aflag) const;
334   int get_fullstrip() const;
335 
336  private:
337   int parse_file(const char* affpath, const char* key);
338   bool parse_flag(const std::string& line, unsigned short* out, FileMgr* af);
339   bool parse_num(const std::string& line, int* out, FileMgr* af);
340   bool parse_cpdsyllable(const std::string& line, FileMgr* af);
341   bool parse_reptable(const std::string& line, FileMgr* af);
342   bool parse_convtable(const std::string& line,
343                       FileMgr* af,
344                       RepList** rl,
345                       const std::string& keyword);
346   bool parse_phonetable(const std::string& line, FileMgr* af);
347   bool parse_maptable(const std::string& line, FileMgr* af);
348   bool parse_breaktable(const std::string& line, FileMgr* af);
349   bool parse_checkcpdtable(const std::string& line, FileMgr* af);
350   bool parse_defcpdtable(const std::string& line, FileMgr* af);
351   bool parse_affix(const std::string& line, const char at, FileMgr* af, char* dupflags);
352 
353   void reverse_condition(std::string&);
354   std::string& debugflag(std::string& result, unsigned short flag);
355   int condlen(const char*);
356   int encodeit(AffEntry& entry, const char* cs);
357   int build_pfxtree(PfxEntry* pfxptr);
358   int build_sfxtree(SfxEntry* sfxptr);
359   int process_pfx_order();
360   int process_sfx_order();
361   PfxEntry* process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr);
362   SfxEntry* process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr);
363   int process_pfx_tree_to_list();
364   int process_sfx_tree_to_list();
365   int redundant_condition(char, const char* strip, int stripl, const char* cond, int);
366   void finishFileMgr(FileMgr* afflst);
367 };
368 
369 #endif
370