1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 /*
38  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39  * And Contributors.  All rights reserved.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  *
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  *
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  *
52  * 3. All modifications to the source code must be clearly marked as
53  *    such.  Binary redistributions based on modified source code
54  *    must be clearly marked as modified versions in the documentation
55  *    and/or other materials provided with the distribution.
56  *
57  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
61  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  */
70 
71 #include <stdlib.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <ctype.h>
75 #include <time.h>
76 
77 #include <algorithm>
78 #include <limits>
79 #include <string>
80 #include <vector>
81 
82 #include "affixmgr.hxx"
83 #include "affentry.hxx"
84 #include "langnum.hxx"
85 
86 #include "csutil.hxx"
87 
AffixMgr(const char * affpath,const std::vector<HashMgr * > & ptr,const char * key)88 AffixMgr::AffixMgr(const char* affpath,
89                    const std::vector<HashMgr*>& ptr,
90                    const char* key)
91   : alldic(ptr)
92   , pHMgr(ptr[0]) {
93 
94   // register hash manager and load affix data from aff file
95   csconv = NULL;
96   utf8 = 0;
97   complexprefixes = 0;
98   parsedmaptable = false;
99   parsedbreaktable = false;
100   iconvtable = NULL;
101   oconvtable = NULL;
102   // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
103   simplifiedcpd = 0;
104   parsedcheckcpd = false;
105   parseddefcpd = false;
106   phone = NULL;
107   compoundflag = FLAG_NULL;        // permits word in compound forms
108   compoundbegin = FLAG_NULL;       // may be first word in compound forms
109   compoundmiddle = FLAG_NULL;      // may be middle word in compound forms
110   compoundend = FLAG_NULL;         // may be last word in compound forms
111   compoundroot = FLAG_NULL;        // compound word signing flag
112   compoundpermitflag = FLAG_NULL;  // compound permitting flag for suffixed word
113   compoundforbidflag = FLAG_NULL;  // compound fordidden flag for suffixed word
114   compoundmoresuffixes = 0;        // allow more suffixes within compound words
115   checkcompounddup = 0;            // forbid double words in compounds
116   checkcompoundrep = 0;  // forbid bad compounds (may be non-compound word with
117                          // a REP substitution)
118   checkcompoundcase =
119       0;  // forbid upper and lowercase combinations at word bounds
120   checkcompoundtriple = 0;  // forbid compounds with triple letters
121   simplifiedtriple = 0;     // allow simplified triple letters in compounds
122                             // (Schiff+fahrt -> Schiffahrt)
123   forbiddenword = FORBIDDENWORD;  // forbidden word signing flag
124   nosuggest = FLAG_NULL;  // don't suggest words signed with NOSUGGEST flag
125   nongramsuggest = FLAG_NULL;
126   langnum = 0;  // language code (see http://l10n.openoffice.org/languages.html)
127   needaffix = FLAG_NULL;  // forbidden root, allowed only with suffixes
128   cpdwordmax = -1;        // default: unlimited wordcount in compound words
129   cpdmin = -1;            // undefined
130   cpdmaxsyllable = 0;     // default: unlimited syllablecount in compound words
131   pfxappnd = NULL;  // previous prefix for counting syllables of the prefix BUG
132   sfxappnd = NULL;  // previous suffix for counting syllables of the suffix BUG
133   sfxextra = 0;     // modifier for syllable count of sfxappnd BUG
134   checknum = 0;               // checking numbers, and word with numbers
135   havecontclass = 0;  // flags of possible continuing classes (double affix)
136   // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
137   // in morhological description in dictionary file. It's often combined with
138   // PSEUDOROOT.
139   lemma_present = FLAG_NULL;
140   circumfix = FLAG_NULL;
141   onlyincompound = FLAG_NULL;
142   maxngramsugs = -1;  // undefined
143   maxdiff = -1;       // undefined
144   onlymaxdiff = 0;
145   maxcpdsugs = -1;  // undefined
146   nosplitsugs = 0;
147   sugswithdots = 0;
148   keepcase = 0;
149   forceucase = 0;
150   warn = 0;
151   forbidwarn = 0;
152   checksharps = 0;
153   substandard = FLAG_NULL;
154   fullstrip = 0;
155 
156   sfx = NULL;
157   pfx = NULL;
158 
159   for (int i = 0; i < SETSIZE; i++) {
160     pStart[i] = NULL;
161     sStart[i] = NULL;
162     pFlag[i] = NULL;
163     sFlag[i] = NULL;
164   }
165 
166   for (int j = 0; j < CONTSIZE; j++) {
167     contclasses[j] = 0;
168   }
169 
170   if (parse_file(affpath, key)) {
171     HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n", affpath);
172   }
173 
174   if (cpdmin == -1)
175     cpdmin = MINCPDLEN;
176 }
177 
~AffixMgr()178 AffixMgr::~AffixMgr() {
179   // pass through linked prefix entries and clean up
180   for (int i = 0; i < SETSIZE; i++) {
181     pFlag[i] = NULL;
182     PfxEntry* ptr = pStart[i];
183     PfxEntry* nptr = NULL;
184     while (ptr) {
185       nptr = ptr->getNext();
186       delete (ptr);
187       ptr = nptr;
188       nptr = NULL;
189     }
190   }
191 
192   // pass through linked suffix entries and clean up
193   for (int j = 0; j < SETSIZE; j++) {
194     sFlag[j] = NULL;
195     SfxEntry* ptr = sStart[j];
196     SfxEntry* nptr = NULL;
197     while (ptr) {
198       nptr = ptr->getNext();
199       delete (ptr);
200       ptr = nptr;
201       nptr = NULL;
202     }
203     sStart[j] = NULL;
204   }
205 
206   delete iconvtable;
207   delete oconvtable;
208   delete phone;
209 
210   FREE_FLAG(compoundflag);
211   FREE_FLAG(compoundbegin);
212   FREE_FLAG(compoundmiddle);
213   FREE_FLAG(compoundend);
214   FREE_FLAG(compoundpermitflag);
215   FREE_FLAG(compoundforbidflag);
216   FREE_FLAG(compoundroot);
217   FREE_FLAG(forbiddenword);
218   FREE_FLAG(nosuggest);
219   FREE_FLAG(nongramsuggest);
220   FREE_FLAG(needaffix);
221   FREE_FLAG(lemma_present);
222   FREE_FLAG(circumfix);
223   FREE_FLAG(onlyincompound);
224 
225   cpdwordmax = 0;
226   pHMgr = NULL;
227   cpdmin = 0;
228   cpdmaxsyllable = 0;
229   free_utf_tbl();
230   checknum = 0;
231 #ifdef MOZILLA_CLIENT
232   delete[] csconv;
233 #endif
234 }
235 
finishFileMgr(FileMgr * afflst)236 void AffixMgr::finishFileMgr(FileMgr* afflst) {
237   delete afflst;
238 
239   // convert affix trees to sorted list
240   process_pfx_tree_to_list();
241   process_sfx_tree_to_list();
242 }
243 
244 // read in aff file and build up prefix and suffix entry objects
parse_file(const char * affpath,const char * key)245 int AffixMgr::parse_file(const char* affpath, const char* key) {
246 
247   // checking flag duplication
248   char dupflags[CONTSIZE];
249   char dupflags_ini = 1;
250 
251   // first line indicator for removing byte order mark
252   int firstline = 1;
253 
254   // open the affix file
255   FileMgr* afflst = new FileMgr(affpath, key);
256   if (!afflst) {
257     HUNSPELL_WARNING(
258         stderr, "error: could not open affix description file %s\n", affpath);
259     return 1;
260   }
261 
262   // step one is to parse the affix file building up the internal
263   // affix data structures
264 
265   // read in each line ignoring any that do not
266   // start with a known line type indicator
267   std::string line;
268   while (afflst->getline(line)) {
269     mychomp(line);
270 
271     /* remove byte order mark */
272     if (firstline) {
273       firstline = 0;
274       // Affix file begins with byte order mark: possible incompatibility with
275       // old Hunspell versions
276       if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
277         line.erase(0, 3);
278       }
279     }
280 
281     /* parse in the keyboard string */
282     if (line.compare(0, 3, "KEY", 3) == 0) {
283       if (!parse_string(line, keystring, afflst->getlinenum())) {
284         finishFileMgr(afflst);
285         return 1;
286       }
287     }
288 
289     /* parse in the try string */
290     if (line.compare(0, 3, "TRY", 3) == 0) {
291       if (!parse_string(line, trystring, afflst->getlinenum())) {
292         finishFileMgr(afflst);
293         return 1;
294       }
295     }
296 
297     /* parse in the name of the character set used by the .dict and .aff */
298     if (line.compare(0, 3, "SET", 3) == 0) {
299       if (!parse_string(line, encoding, afflst->getlinenum())) {
300         finishFileMgr(afflst);
301         return 1;
302       }
303       if (encoding == "UTF-8") {
304         utf8 = 1;
305 #ifndef OPENOFFICEORG
306 #ifndef MOZILLA_CLIENT
307         initialize_utf_tbl();
308 #endif
309 #endif
310       }
311     }
312 
313     /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left
314      * writing system */
315     if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
316       complexprefixes = 1;
317 
318     /* parse in the flag used by the controlled compound words */
319     if (line.compare(0, 12, "COMPOUNDFLAG", 12) == 0) {
320       if (!parse_flag(line, &compoundflag, afflst)) {
321         finishFileMgr(afflst);
322         return 1;
323       }
324     }
325 
326     /* parse in the flag used by compound words */
327     if (line.compare(0, 13, "COMPOUNDBEGIN", 13) == 0) {
328       if (complexprefixes) {
329         if (!parse_flag(line, &compoundend, afflst)) {
330           finishFileMgr(afflst);
331           return 1;
332         }
333       } else {
334         if (!parse_flag(line, &compoundbegin, afflst)) {
335           finishFileMgr(afflst);
336           return 1;
337         }
338       }
339     }
340 
341     /* parse in the flag used by compound words */
342     if (line.compare(0, 14, "COMPOUNDMIDDLE", 14) == 0) {
343       if (!parse_flag(line, &compoundmiddle, afflst)) {
344         finishFileMgr(afflst);
345         return 1;
346       }
347     }
348 
349     /* parse in the flag used by compound words */
350     if (line.compare(0, 11, "COMPOUNDEND", 11) == 0) {
351       if (complexprefixes) {
352         if (!parse_flag(line, &compoundbegin, afflst)) {
353           finishFileMgr(afflst);
354           return 1;
355         }
356       } else {
357         if (!parse_flag(line, &compoundend, afflst)) {
358           finishFileMgr(afflst);
359           return 1;
360         }
361       }
362     }
363 
364     /* parse in the data used by compound_check() method */
365     if (line.compare(0, 15, "COMPOUNDWORDMAX", 15) == 0) {
366       if (!parse_num(line, &cpdwordmax, afflst)) {
367         finishFileMgr(afflst);
368         return 1;
369       }
370     }
371 
372     /* parse in the flag sign compounds in dictionary */
373     if (line.compare(0, 12, "COMPOUNDROOT", 12) == 0) {
374       if (!parse_flag(line, &compoundroot, afflst)) {
375         finishFileMgr(afflst);
376         return 1;
377       }
378     }
379 
380     /* parse in the flag used by compound_check() method */
381     if (line.compare(0, 18, "COMPOUNDPERMITFLAG", 18) == 0) {
382       if (!parse_flag(line, &compoundpermitflag, afflst)) {
383         finishFileMgr(afflst);
384         return 1;
385       }
386     }
387 
388     /* parse in the flag used by compound_check() method */
389     if (line.compare(0, 18, "COMPOUNDFORBIDFLAG", 18) == 0) {
390       if (!parse_flag(line, &compoundforbidflag, afflst)) {
391         finishFileMgr(afflst);
392         return 1;
393       }
394     }
395 
396     if (line.compare(0, 20, "COMPOUNDMORESUFFIXES", 20) == 0) {
397       compoundmoresuffixes = 1;
398     }
399 
400     if (line.compare(0, 16, "CHECKCOMPOUNDDUP", 16) == 0) {
401       checkcompounddup = 1;
402     }
403 
404     if (line.compare(0, 16, "CHECKCOMPOUNDREP", 16) == 0) {
405       checkcompoundrep = 1;
406     }
407 
408     if (line.compare(0, 19, "CHECKCOMPOUNDTRIPLE", 19) == 0) {
409       checkcompoundtriple = 1;
410     }
411 
412     if (line.compare(0, 16, "SIMPLIFIEDTRIPLE", 16) == 0) {
413       simplifiedtriple = 1;
414     }
415 
416     if (line.compare(0, 17, "CHECKCOMPOUNDCASE", 17) == 0) {
417       checkcompoundcase = 1;
418     }
419 
420     if (line.compare(0, 9, "NOSUGGEST", 9) == 0) {
421       if (!parse_flag(line, &nosuggest, afflst)) {
422         finishFileMgr(afflst);
423         return 1;
424       }
425     }
426 
427     if (line.compare(0, 14, "NONGRAMSUGGEST", 14) == 0) {
428       if (!parse_flag(line, &nongramsuggest, afflst)) {
429         finishFileMgr(afflst);
430         return 1;
431       }
432     }
433 
434     /* parse in the flag used by forbidden words */
435     if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
436       if (!parse_flag(line, &forbiddenword, afflst)) {
437         finishFileMgr(afflst);
438         return 1;
439       }
440     }
441 
442     /* parse in the flag used by forbidden words (is deprecated) */
443     if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) {
444       if (!parse_flag(line, &lemma_present, afflst)) {
445         finishFileMgr(afflst);
446         return 1;
447       }
448     }
449 
450     /* parse in the flag used by circumfixes */
451     if (line.compare(0, 9, "CIRCUMFIX", 9) == 0) {
452       if (!parse_flag(line, &circumfix, afflst)) {
453         finishFileMgr(afflst);
454         return 1;
455       }
456     }
457 
458     /* parse in the flag used by fogemorphemes */
459     if (line.compare(0, 14, "ONLYINCOMPOUND", 14) == 0) {
460       if (!parse_flag(line, &onlyincompound, afflst)) {
461         finishFileMgr(afflst);
462         return 1;
463       }
464     }
465 
466     /* parse in the flag used by `needaffixs' (is deprecated) */
467     if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) {
468       if (!parse_flag(line, &needaffix, afflst)) {
469         finishFileMgr(afflst);
470         return 1;
471       }
472     }
473 
474     /* parse in the flag used by `needaffixs' */
475     if (line.compare(0, 9, "NEEDAFFIX", 9) == 0) {
476       if (!parse_flag(line, &needaffix, afflst)) {
477         finishFileMgr(afflst);
478         return 1;
479       }
480     }
481 
482     /* parse in the minimal length for words in compounds */
483     if (line.compare(0, 11, "COMPOUNDMIN", 11) == 0) {
484       if (!parse_num(line, &cpdmin, afflst)) {
485         finishFileMgr(afflst);
486         return 1;
487       }
488       if (cpdmin < 1)
489         cpdmin = 1;
490     }
491 
492     /* parse in the max. words and syllables in compounds */
493     if (line.compare(0, 16, "COMPOUNDSYLLABLE", 16) == 0) {
494       if (!parse_cpdsyllable(line, afflst)) {
495         finishFileMgr(afflst);
496         return 1;
497       }
498     }
499 
500     /* parse in the flag used by compound_check() method */
501     if (line.compare(0, 11, "SYLLABLENUM", 11) == 0) {
502       if (!parse_string(line, cpdsyllablenum, afflst->getlinenum())) {
503         finishFileMgr(afflst);
504         return 1;
505       }
506     }
507 
508     /* parse in the flag used by the controlled compound words */
509     if (line.compare(0, 8, "CHECKNUM", 8) == 0) {
510       checknum = 1;
511     }
512 
513     /* parse in the extra word characters */
514     if (line.compare(0, 9, "WORDCHARS", 9) == 0) {
515       if (!parse_array(line, wordchars, wordchars_utf16,
516                        utf8, afflst->getlinenum())) {
517         finishFileMgr(afflst);
518         return 1;
519       }
520     }
521 
522     /* parse in the ignored characters (for example, Arabic optional diacretics
523      * charachters */
524     if (line.compare(0, 6, "IGNORE", 6) == 0) {
525       if (!parse_array(line, ignorechars, ignorechars_utf16,
526                        utf8, afflst->getlinenum())) {
527         finishFileMgr(afflst);
528         return 1;
529       }
530     }
531 
532     /* parse in the input conversion table */
533     if (line.compare(0, 5, "ICONV", 5) == 0) {
534       if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) {
535         finishFileMgr(afflst);
536         return 1;
537       }
538     }
539 
540     /* parse in the output conversion table */
541     if (line.compare(0, 5, "OCONV", 5) == 0) {
542       if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) {
543         finishFileMgr(afflst);
544         return 1;
545       }
546     }
547 
548     /* parse in the phonetic translation table */
549     if (line.compare(0, 5, "PHONE", 5) == 0) {
550       if (!parse_phonetable(line, afflst)) {
551         finishFileMgr(afflst);
552         return 1;
553       }
554     }
555 
556     /* parse in the checkcompoundpattern table */
557     if (line.compare(0, 20, "CHECKCOMPOUNDPATTERN", 20) == 0) {
558       if (!parse_checkcpdtable(line, afflst)) {
559         finishFileMgr(afflst);
560         return 1;
561       }
562     }
563 
564     /* parse in the defcompound table */
565     if (line.compare(0, 12, "COMPOUNDRULE", 12) == 0) {
566       if (!parse_defcpdtable(line, afflst)) {
567         finishFileMgr(afflst);
568         return 1;
569       }
570     }
571 
572     /* parse in the related character map table */
573     if (line.compare(0, 3, "MAP", 3) == 0) {
574       if (!parse_maptable(line, afflst)) {
575         finishFileMgr(afflst);
576         return 1;
577       }
578     }
579 
580     /* parse in the word breakpoints table */
581     if (line.compare(0, 5, "BREAK", 5) == 0) {
582       if (!parse_breaktable(line, afflst)) {
583         finishFileMgr(afflst);
584         return 1;
585       }
586     }
587 
588     /* parse in the language for language specific codes */
589     if (line.compare(0, 4, "LANG", 4) == 0) {
590       if (!parse_string(line, lang, afflst->getlinenum())) {
591         finishFileMgr(afflst);
592         return 1;
593       }
594       langnum = get_lang_num(lang);
595     }
596 
597     if (line.compare(0, 7, "VERSION", 7) == 0) {
598       size_t startpos = line.find_first_not_of(" \t", 7);
599       if (startpos != std::string::npos) {
600           version = line.substr(startpos);
601       }
602     }
603 
604     if (line.compare(0, 12, "MAXNGRAMSUGS", 12) == 0) {
605       if (!parse_num(line, &maxngramsugs, afflst)) {
606         finishFileMgr(afflst);
607         return 1;
608       }
609     }
610 
611     if (line.compare(0, 11, "ONLYMAXDIFF", 11) == 0)
612       onlymaxdiff = 1;
613 
614     if (line.compare(0, 7, "MAXDIFF", 7) == 0) {
615       if (!parse_num(line, &maxdiff, afflst)) {
616         finishFileMgr(afflst);
617         return 1;
618       }
619     }
620 
621     if (line.compare(0, 10, "MAXCPDSUGS", 10) == 0) {
622       if (!parse_num(line, &maxcpdsugs, afflst)) {
623         finishFileMgr(afflst);
624         return 1;
625       }
626     }
627 
628     if (line.compare(0, 11, "NOSPLITSUGS", 11) == 0) {
629       nosplitsugs = 1;
630     }
631 
632     if (line.compare(0, 9, "FULLSTRIP", 9) == 0) {
633       fullstrip = 1;
634     }
635 
636     if (line.compare(0, 12, "SUGSWITHDOTS", 12) == 0) {
637       sugswithdots = 1;
638     }
639 
640     /* parse in the flag used by forbidden words */
641     if (line.compare(0, 8, "KEEPCASE", 8) == 0) {
642       if (!parse_flag(line, &keepcase, afflst)) {
643         finishFileMgr(afflst);
644         return 1;
645       }
646     }
647 
648     /* parse in the flag used by `forceucase' */
649     if (line.compare(0, 10, "FORCEUCASE", 10) == 0) {
650       if (!parse_flag(line, &forceucase, afflst)) {
651         finishFileMgr(afflst);
652         return 1;
653       }
654     }
655 
656     /* parse in the flag used by `warn' */
657     if (line.compare(0, 4, "WARN", 4) == 0) {
658       if (!parse_flag(line, &warn, afflst)) {
659         finishFileMgr(afflst);
660         return 1;
661       }
662     }
663 
664     if (line.compare(0, 10, "FORBIDWARN", 10) == 0) {
665       forbidwarn = 1;
666     }
667 
668     /* parse in the flag used by the affix generator */
669     if (line.compare(0, 11, "SUBSTANDARD", 11) == 0) {
670       if (!parse_flag(line, &substandard, afflst)) {
671         finishFileMgr(afflst);
672         return 1;
673       }
674     }
675 
676     if (line.compare(0, 11, "CHECKSHARPS", 11) == 0) {
677       checksharps = 1;
678     }
679 
680     /* parse this affix: P - prefix, S - suffix */
681     // affix type
682     char ft = ' ';
683     if (line.compare(0, 3, "PFX", 3) == 0)
684       ft = complexprefixes ? 'S' : 'P';
685     if (line.compare(0, 3, "SFX", 3) == 0)
686       ft = complexprefixes ? 'P' : 'S';
687     if (ft != ' ') {
688       if (dupflags_ini) {
689         memset(dupflags, 0, sizeof(dupflags));
690         dupflags_ini = 0;
691       }
692       if (!parse_affix(line, ft, afflst, dupflags)) {
693         finishFileMgr(afflst);
694         return 1;
695       }
696     }
697   }
698 
699   finishFileMgr(afflst);
700   // affix trees are sorted now
701 
702   // now we can speed up performance greatly taking advantage of the
703   // relationship between the affixes and the idea of "subsets".
704 
705   // View each prefix as a potential leading subset of another and view
706   // each suffix (reversed) as a potential trailing subset of another.
707 
708   // To illustrate this relationship if we know the prefix "ab" is found in the
709   // word to examine, only prefixes that "ab" is a leading subset of need be
710   // examined.
711   // Furthermore is "ab" is not present then none of the prefixes that "ab" is
712   // is a subset need be examined.
713   // The same argument goes for suffix string that are reversed.
714 
715   // Then to top this off why not examine the first char of the word to quickly
716   // limit the set of prefixes to examine (i.e. the prefixes to examine must
717   // be leading supersets of the first character of the word (if they exist)
718 
719   // To take advantage of this "subset" relationship, we need to add two links
720   // from entry.  One to take next if the current prefix is found (call it
721   // nexteq)
722   // and one to take next if the current prefix is not found (call it nextne).
723 
724   // Since we have built ordered lists, all that remains is to properly
725   // initialize
726   // the nextne and nexteq pointers that relate them
727 
728   process_pfx_order();
729   process_sfx_order();
730 
731   /* get encoding for CHECKCOMPOUNDCASE */
732   if (!utf8) {
733     csconv = get_current_cs(get_encoding());
734     for (int i = 0; i <= 255; i++) {
735       if ((csconv[i].cupper != csconv[i].clower) &&
736           (wordchars.find((char)i) == std::string::npos)) {
737         wordchars.push_back((char)i);
738       }
739     }
740 
741   }
742 
743   // default BREAK definition
744   if (!parsedbreaktable) {
745     breaktable.push_back("-");
746     breaktable.push_back("^-");
747     breaktable.push_back("-$");
748     parsedbreaktable = true;
749   }
750   return 0;
751 }
752 
753 // we want to be able to quickly access prefix information
754 // both by prefix flag, and sorted by prefix string itself
755 // so we need to set up two indexes
756 
build_pfxtree(PfxEntry * pfxptr)757 int AffixMgr::build_pfxtree(PfxEntry* pfxptr) {
758   PfxEntry* ptr;
759   PfxEntry* pptr;
760   PfxEntry* ep = pfxptr;
761 
762   // get the right starting points
763   const char* key = ep->getKey();
764   const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
765 
766   // first index by flag which must exist
767   ptr = pFlag[flg];
768   ep->setFlgNxt(ptr);
769   pFlag[flg] = ep;
770 
771   // handle the special case of null affix string
772   if (strlen(key) == 0) {
773     // always inset them at head of list at element 0
774     ptr = pStart[0];
775     ep->setNext(ptr);
776     pStart[0] = ep;
777     return 0;
778   }
779 
780   // now handle the normal case
781   ep->setNextEQ(NULL);
782   ep->setNextNE(NULL);
783 
784   unsigned char sp = *((const unsigned char*)key);
785   ptr = pStart[sp];
786 
787   // handle the first insert
788   if (!ptr) {
789     pStart[sp] = ep;
790     return 0;
791   }
792 
793   // otherwise use binary tree insertion so that a sorted
794   // list can easily be generated later
795   pptr = NULL;
796   for (;;) {
797     pptr = ptr;
798     if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
799       ptr = ptr->getNextEQ();
800       if (!ptr) {
801         pptr->setNextEQ(ep);
802         break;
803       }
804     } else {
805       ptr = ptr->getNextNE();
806       if (!ptr) {
807         pptr->setNextNE(ep);
808         break;
809       }
810     }
811   }
812   return 0;
813 }
814 
815 // we want to be able to quickly access suffix information
816 // both by suffix flag, and sorted by the reverse of the
817 // suffix string itself; so we need to set up two indexes
build_sfxtree(SfxEntry * sfxptr)818 int AffixMgr::build_sfxtree(SfxEntry* sfxptr) {
819 
820   sfxptr->initReverseWord();
821 
822   SfxEntry* ptr;
823   SfxEntry* pptr;
824   SfxEntry* ep = sfxptr;
825 
826   /* get the right starting point */
827   const char* key = ep->getKey();
828   const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
829 
830   // first index by flag which must exist
831   ptr = sFlag[flg];
832   ep->setFlgNxt(ptr);
833   sFlag[flg] = ep;
834 
835   // next index by affix string
836 
837   // handle the special case of null affix string
838   if (strlen(key) == 0) {
839     // always inset them at head of list at element 0
840     ptr = sStart[0];
841     ep->setNext(ptr);
842     sStart[0] = ep;
843     return 0;
844   }
845 
846   // now handle the normal case
847   ep->setNextEQ(NULL);
848   ep->setNextNE(NULL);
849 
850   unsigned char sp = *((const unsigned char*)key);
851   ptr = sStart[sp];
852 
853   // handle the first insert
854   if (!ptr) {
855     sStart[sp] = ep;
856     return 0;
857   }
858 
859   // otherwise use binary tree insertion so that a sorted
860   // list can easily be generated later
861   pptr = NULL;
862   for (;;) {
863     pptr = ptr;
864     if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
865       ptr = ptr->getNextEQ();
866       if (!ptr) {
867         pptr->setNextEQ(ep);
868         break;
869       }
870     } else {
871       ptr = ptr->getNextNE();
872       if (!ptr) {
873         pptr->setNextNE(ep);
874         break;
875       }
876     }
877   }
878   return 0;
879 }
880 
881 // convert from binary tree to sorted list
process_pfx_tree_to_list()882 int AffixMgr::process_pfx_tree_to_list() {
883   for (int i = 1; i < SETSIZE; i++) {
884     pStart[i] = process_pfx_in_order(pStart[i], NULL);
885   }
886   return 0;
887 }
888 
process_pfx_in_order(PfxEntry * ptr,PfxEntry * nptr)889 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) {
890   if (ptr) {
891     nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
892     ptr->setNext(nptr);
893     nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
894   }
895   return nptr;
896 }
897 
898 // convert from binary tree to sorted list
process_sfx_tree_to_list()899 int AffixMgr::process_sfx_tree_to_list() {
900   for (int i = 1; i < SETSIZE; i++) {
901     sStart[i] = process_sfx_in_order(sStart[i], NULL);
902   }
903   return 0;
904 }
905 
process_sfx_in_order(SfxEntry * ptr,SfxEntry * nptr)906 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) {
907   if (ptr) {
908     nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
909     ptr->setNext(nptr);
910     nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
911   }
912   return nptr;
913 }
914 
915 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
916 // using the idea of leading subsets this time
process_pfx_order()917 int AffixMgr::process_pfx_order() {
918   PfxEntry* ptr;
919 
920   // loop through each prefix list starting point
921   for (int i = 1; i < SETSIZE; i++) {
922     ptr = pStart[i];
923 
924     // look through the remainder of the list
925     //  and find next entry with affix that
926     // the current one is not a subset of
927     // mark that as destination for NextNE
928     // use next in list that you are a subset
929     // of as NextEQ
930 
931     for (; ptr != NULL; ptr = ptr->getNext()) {
932       PfxEntry* nptr = ptr->getNext();
933       for (; nptr != NULL; nptr = nptr->getNext()) {
934         if (!isSubset(ptr->getKey(), nptr->getKey()))
935           break;
936       }
937       ptr->setNextNE(nptr);
938       ptr->setNextEQ(NULL);
939       if ((ptr->getNext()) &&
940           isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
941         ptr->setNextEQ(ptr->getNext());
942     }
943 
944     // now clean up by adding smart search termination strings:
945     // if you are already a superset of the previous prefix
946     // but not a subset of the next, search can end here
947     // so set NextNE properly
948 
949     ptr = pStart[i];
950     for (; ptr != NULL; ptr = ptr->getNext()) {
951       PfxEntry* nptr = ptr->getNext();
952       PfxEntry* mptr = NULL;
953       for (; nptr != NULL; nptr = nptr->getNext()) {
954         if (!isSubset(ptr->getKey(), nptr->getKey()))
955           break;
956         mptr = nptr;
957       }
958       if (mptr)
959         mptr->setNextNE(NULL);
960     }
961   }
962   return 0;
963 }
964 
965 // initialize the SfxEntry links NextEQ and NextNE to speed searching
966 // using the idea of leading subsets this time
process_sfx_order()967 int AffixMgr::process_sfx_order() {
968   SfxEntry* ptr;
969 
970   // loop through each prefix list starting point
971   for (int i = 1; i < SETSIZE; i++) {
972     ptr = sStart[i];
973 
974     // look through the remainder of the list
975     //  and find next entry with affix that
976     // the current one is not a subset of
977     // mark that as destination for NextNE
978     // use next in list that you are a subset
979     // of as NextEQ
980 
981     for (; ptr != NULL; ptr = ptr->getNext()) {
982       SfxEntry* nptr = ptr->getNext();
983       for (; nptr != NULL; nptr = nptr->getNext()) {
984         if (!isSubset(ptr->getKey(), nptr->getKey()))
985           break;
986       }
987       ptr->setNextNE(nptr);
988       ptr->setNextEQ(NULL);
989       if ((ptr->getNext()) &&
990           isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
991         ptr->setNextEQ(ptr->getNext());
992     }
993 
994     // now clean up by adding smart search termination strings:
995     // if you are already a superset of the previous suffix
996     // but not a subset of the next, search can end here
997     // so set NextNE properly
998 
999     ptr = sStart[i];
1000     for (; ptr != NULL; ptr = ptr->getNext()) {
1001       SfxEntry* nptr = ptr->getNext();
1002       SfxEntry* mptr = NULL;
1003       for (; nptr != NULL; nptr = nptr->getNext()) {
1004         if (!isSubset(ptr->getKey(), nptr->getKey()))
1005           break;
1006         mptr = nptr;
1007       }
1008       if (mptr)
1009         mptr->setNextNE(NULL);
1010     }
1011   }
1012   return 0;
1013 }
1014 
1015 // add flags to the result for dictionary debugging
debugflag(std::string & result,unsigned short flag)1016 std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
1017   char* st = encode_flag(flag);
1018   result.push_back(MSEP_FLD);
1019   result.append(MORPH_FLAG);
1020   if (st) {
1021     result.append(st);
1022     free(st);
1023   }
1024   return result;
1025 }
1026 
1027 // calculate the character length of the condition
condlen(const char * st)1028 int AffixMgr::condlen(const char* st) {
1029   int l = 0;
1030   bool group = false;
1031   for (; *st; st++) {
1032     if (*st == '[') {
1033       group = true;
1034       l++;
1035     } else if (*st == ']')
1036       group = false;
1037     else if (!group && (!utf8 || (!(*st & 0x80) || ((*st & 0xc0) == 0x80))))
1038       l++;
1039   }
1040   return l;
1041 }
1042 
encodeit(AffEntry & entry,const char * cs)1043 int AffixMgr::encodeit(AffEntry& entry, const char* cs) {
1044   if (strcmp(cs, ".") != 0) {
1045     entry.numconds = (char)condlen(cs);
1046     const size_t cslen = strlen(cs);
1047     const size_t short_part = std::min<size_t>(MAXCONDLEN, cslen);
1048     memcpy(entry.c.conds, cs, short_part);
1049     if (short_part < MAXCONDLEN) {
1050       //blank out the remaining space
1051       memset(entry.c.conds + short_part, 0, MAXCONDLEN - short_part);
1052     } else if (cs[MAXCONDLEN]) {
1053       //there is more conditions than fit in fixed space, so its
1054       //a long condition
1055       entry.opts += aeLONGCOND;
1056       entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
1057       if (!entry.c.l.conds2)
1058         return 1;
1059     }
1060   } else {
1061     entry.numconds = 0;
1062     entry.c.conds[0] = '\0';
1063   }
1064   return 0;
1065 }
1066 
1067 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
isSubset(const char * s1,const char * s2)1068 inline int AffixMgr::isSubset(const char* s1, const char* s2) {
1069   while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
1070     s1++;
1071     s2++;
1072   }
1073   return (*s1 == '\0');
1074 }
1075 
1076 // check word for prefixes
prefix_check(const char * word,int len,char in_compound,const FLAG needflag)1077 struct hentry* AffixMgr::prefix_check(const char* word,
1078                                       int len,
1079                                       char in_compound,
1080                                       const FLAG needflag) {
1081   struct hentry* rv = NULL;
1082 
1083   pfx = NULL;
1084   pfxappnd = NULL;
1085   sfxappnd = NULL;
1086   sfxextra = 0;
1087 
1088   // first handle the special case of 0 length prefixes
1089   PfxEntry* pe = pStart[0];
1090   while (pe) {
1091     if (
1092         // fogemorpheme
1093         ((in_compound != IN_CPD_NOT) ||
1094          !(pe->getCont() &&
1095            (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
1096         // permit prefixes in compounds
1097         ((in_compound != IN_CPD_END) ||
1098          (pe->getCont() &&
1099           (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))) {
1100       // check prefix
1101       rv = pe->checkword(word, len, in_compound, needflag);
1102       if (rv) {
1103         pfx = pe;  // BUG: pfx not stateless
1104         return rv;
1105       }
1106     }
1107     pe = pe->getNext();
1108   }
1109 
1110   // now handle the general case
1111   unsigned char sp = *((const unsigned char*)word);
1112   PfxEntry* pptr = pStart[sp];
1113 
1114   while (pptr) {
1115     if (isSubset(pptr->getKey(), word)) {
1116       if (
1117           // fogemorpheme
1118           ((in_compound != IN_CPD_NOT) ||
1119            !(pptr->getCont() &&
1120              (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
1121           // permit prefixes in compounds
1122           ((in_compound != IN_CPD_END) ||
1123            (pptr->getCont() && (TESTAFF(pptr->getCont(), compoundpermitflag,
1124                                         pptr->getContLen()))))) {
1125         // check prefix
1126         rv = pptr->checkword(word, len, in_compound, needflag);
1127         if (rv) {
1128           pfx = pptr;  // BUG: pfx not stateless
1129           return rv;
1130         }
1131       }
1132       pptr = pptr->getNextEQ();
1133     } else {
1134       pptr = pptr->getNextNE();
1135     }
1136   }
1137 
1138   return NULL;
1139 }
1140 
1141 // check word for prefixes and two-level suffixes
prefix_check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)1142 struct hentry* AffixMgr::prefix_check_twosfx(const char* word,
1143                                              int len,
1144                                              char in_compound,
1145                                              const FLAG needflag) {
1146   struct hentry* rv = NULL;
1147 
1148   pfx = NULL;
1149   sfxappnd = NULL;
1150   sfxextra = 0;
1151 
1152   // first handle the special case of 0 length prefixes
1153   PfxEntry* pe = pStart[0];
1154 
1155   while (pe) {
1156     rv = pe->check_twosfx(word, len, in_compound, needflag);
1157     if (rv)
1158       return rv;
1159     pe = pe->getNext();
1160   }
1161 
1162   // now handle the general case
1163   unsigned char sp = *((const unsigned char*)word);
1164   PfxEntry* pptr = pStart[sp];
1165 
1166   while (pptr) {
1167     if (isSubset(pptr->getKey(), word)) {
1168       rv = pptr->check_twosfx(word, len, in_compound, needflag);
1169       if (rv) {
1170         pfx = pptr;
1171         return rv;
1172       }
1173       pptr = pptr->getNextEQ();
1174     } else {
1175       pptr = pptr->getNextNE();
1176     }
1177   }
1178 
1179   return NULL;
1180 }
1181 
1182 // check word for prefixes and morph
prefix_check_morph(const char * word,int len,char in_compound,const FLAG needflag)1183 std::string AffixMgr::prefix_check_morph(const char* word,
1184                                          int len,
1185                                          char in_compound,
1186                                          const FLAG needflag) {
1187 
1188   std::string result;
1189 
1190   pfx = NULL;
1191   sfxappnd = NULL;
1192   sfxextra = 0;
1193 
1194   // first handle the special case of 0 length prefixes
1195   PfxEntry* pe = pStart[0];
1196   while (pe) {
1197     std::string st = pe->check_morph(word, len, in_compound, needflag);
1198     if (!st.empty()) {
1199       result.append(st);
1200     }
1201     pe = pe->getNext();
1202   }
1203 
1204   // now handle the general case
1205   unsigned char sp = *((const unsigned char*)word);
1206   PfxEntry* pptr = pStart[sp];
1207 
1208   while (pptr) {
1209     if (isSubset(pptr->getKey(), word)) {
1210       std::string st = pptr->check_morph(word, len, in_compound, needflag);
1211       if (!st.empty()) {
1212         // fogemorpheme
1213         if ((in_compound != IN_CPD_NOT) ||
1214             !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound,
1215                                            pptr->getContLen()))))) {
1216           result.append(st);
1217           pfx = pptr;
1218         }
1219       }
1220       pptr = pptr->getNextEQ();
1221     } else {
1222       pptr = pptr->getNextNE();
1223     }
1224   }
1225 
1226   return result;
1227 }
1228 
1229 // check word for prefixes and morph and two-level suffixes
prefix_check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)1230 std::string AffixMgr::prefix_check_twosfx_morph(const char* word,
1231                                                 int len,
1232                                                 char in_compound,
1233                                                 const FLAG needflag) {
1234   std::string result;
1235 
1236   pfx = NULL;
1237   sfxappnd = NULL;
1238   sfxextra = 0;
1239 
1240   // first handle the special case of 0 length prefixes
1241   PfxEntry* pe = pStart[0];
1242   while (pe) {
1243     std::string st = pe->check_twosfx_morph(word, len, in_compound, needflag);
1244     if (!st.empty()) {
1245       result.append(st);
1246     }
1247     pe = pe->getNext();
1248   }
1249 
1250   // now handle the general case
1251   unsigned char sp = *((const unsigned char*)word);
1252   PfxEntry* pptr = pStart[sp];
1253 
1254   while (pptr) {
1255     if (isSubset(pptr->getKey(), word)) {
1256       std::string st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
1257       if (!st.empty()) {
1258         result.append(st);
1259         pfx = pptr;
1260       }
1261       pptr = pptr->getNextEQ();
1262     } else {
1263       pptr = pptr->getNextNE();
1264     }
1265   }
1266 
1267   return result;
1268 }
1269 
1270 // Is word a non-compound with a REP substitution (see checkcompoundrep)?
cpdrep_check(const char * word,int wl)1271 int AffixMgr::cpdrep_check(const char* word, int wl) {
1272 
1273   if ((wl < 2) || get_reptable().empty())
1274     return 0;
1275 
1276   for (size_t i = 0; i < get_reptable().size(); ++i) {
1277     // use only available mid patterns
1278     if (!get_reptable()[i].outstrings[0].empty()) {
1279       const char* r = word;
1280       const size_t lenp = get_reptable()[i].pattern.size();
1281       // search every occurence of the pattern in the word
1282       while ((r = strstr(r, get_reptable()[i].pattern.c_str())) != NULL) {
1283         std::string candidate(word);
1284         candidate.replace(r - word, lenp, get_reptable()[i].outstrings[0]);
1285         if (candidate_check(candidate.c_str(), candidate.size()))
1286           return 1;
1287         ++r;  // search for the next letter
1288       }
1289     }
1290   }
1291 
1292  return 0;
1293 }
1294 
1295 // forbid compound words, if they are in the dictionary as a
1296 // word pair separated by space
cpdwordpair_check(const char * word,int wl)1297 int AffixMgr::cpdwordpair_check(const char * word, int wl) {
1298   if (wl > 2) {
1299     std::string candidate(word);
1300     for (size_t i = 1; i < candidate.size(); i++) {
1301       // go to end of the UTF-8 character
1302       if (utf8 && ((word[i] & 0xc0) == 0x80))
1303           continue;
1304       candidate.insert(i, 1, ' ');
1305       if (candidate_check(candidate.c_str(), candidate.size()))
1306         return 1;
1307       candidate.erase(i, 1);
1308     }
1309   }
1310 
1311   return 0;
1312 }
1313 
1314 // forbid compoundings when there are special patterns at word bound
cpdpat_check(const char * word,int pos,hentry * r1,hentry * r2,const char)1315 int AffixMgr::cpdpat_check(const char* word,
1316                            int pos,
1317                            hentry* r1,
1318                            hentry* r2,
1319                            const char /*affixed*/) {
1320   for (size_t i = 0; i < checkcpdtable.size(); ++i) {
1321     size_t len;
1322     if (isSubset(checkcpdtable[i].pattern2.c_str(), word + pos) &&
1323         (!r1 || !checkcpdtable[i].cond ||
1324          (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
1325         (!r2 || !checkcpdtable[i].cond2 ||
1326          (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
1327         // zero length pattern => only TESTAFF
1328         // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
1329         (checkcpdtable[i].pattern.empty() ||
1330          ((checkcpdtable[i].pattern[0] == '0' && r1->blen <= pos &&
1331            strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
1332           (checkcpdtable[i].pattern[0] != '0' &&
1333            ((len = checkcpdtable[i].pattern.size()) != 0) &&
1334            strncmp(word + pos - len, checkcpdtable[i].pattern.c_str(), len) == 0)))) {
1335       return 1;
1336     }
1337   }
1338   return 0;
1339 }
1340 
1341 // forbid compounding with neighbouring upper and lower case characters at word
1342 // bounds
cpdcase_check(const char * word,int pos)1343 int AffixMgr::cpdcase_check(const char* word, int pos) {
1344   if (utf8) {
1345     const char* p;
1346     for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--)
1347       ;
1348     std::string pair(p);
1349     std::vector<w_char> pair_u;
1350     u8_u16(pair_u, pair);
1351     unsigned short a = pair_u.size() > 1 ? ((pair_u[1].h << 8) + pair_u[1].l) : 0;
1352     unsigned short b = !pair_u.empty() ? ((pair_u[0].h << 8) + pair_u[0].l) : 0;
1353     if (((unicodetoupper(a, langnum) == a) ||
1354          (unicodetoupper(b, langnum) == b)) &&
1355         (a != '-') && (b != '-'))
1356       return 1;
1357   } else {
1358     unsigned char a = *(word + pos - 1);
1359     unsigned char b = *(word + pos);
1360     if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-'))
1361       return 1;
1362   }
1363   return 0;
1364 }
1365 
// Backtracking record for one metacharacter (*, ?) met while matching a
// compound pattern.
struct metachar_data {
  short btpp;  // pattern position of the metacharacter, used for backtracking
  short btwp;  // word position at which the metacharacter started matching
  int btnum;   // number of matched characters in metacharacter
};
1371 
// check compound patterns
//
// Stores rv at index wnum of the word array and tests whether the words at
// indices 0..wnum match one of the rules in defcpdtable. In a rule, a flag
// followed by '*' or '?' is treated as a metacharacter item; any other flag
// must be carried by exactly one word.
//
// words - in/out: address of the word array. If *words is NULL on entry, def
//         is used as the array and *words is pointed at it; on every
//         "return 0" taken after rv has been stored, *words is reset to NULL
//         again in that case.
// wnum  - index at which rv is stored
// rv    - entry to test; dereferenced unconditionally, so it must not be NULL
// def   - fallback array used when *words is NULL
// all   - when zero, the acceptance test after the matching loop does not
//         require the pattern position to have reached the end of the rule
//
// Returns 1 when a rule matches (rv stays stored at index wnum), 0 otherwise
// (the slot at wnum is set back to NULL, except on the early return taken
// when both *words and def are NULL).
int AffixMgr::defcpd_check(hentry*** words,
                           short wnum,
                           hentry* rv,
                           hentry** def,
                           char all) {
  // w remembers that *words was borrowed from def and has to be reset
  int w = 0;

  if (!*words) {
    w = 1;
    *words = def;
  }

  if (!*words) {
    return 0;
  }

  // stack of backtracking records; bt is the number of records in use and
  // btinfo always keeps one spare slot at index bt
  std::vector<metachar_data> btinfo(1);

  short bt = 0;

  (*words)[wnum] = rv;

  // has the last word COMPOUNDRULE flag?
  if (rv->alen == 0) {
    (*words)[wnum] = NULL;
    if (w)
      *words = NULL;
    return 0;
  }
  // quick reject: rv must carry at least one flag that occurs in some rule
  // at a position that is not '*' or '?'. The break only leaves the inner
  // loop; the remaining rules are still scanned.
  int ok = 0;
  for (size_t i = 0; i < defcpdtable.size(); ++i) {
    for (size_t j = 0; j < defcpdtable[i].size(); ++j) {
      if (defcpdtable[i][j] != '*' && defcpdtable[i][j] != '?' &&
          TESTAFF(rv->astr, defcpdtable[i][j], rv->alen)) {
        ok = 1;
        break;
      }
    }
  }
  if (ok == 0) {
    (*words)[wnum] = NULL;
    if (w)
      *words = NULL;
    return 0;
  }

  // try every rule in turn
  for (size_t i = 0; i < defcpdtable.size(); ++i) {
    size_t pp = 0;  // pattern position
    signed short wp = 0;  // "words" position
    int ok2;
    ok = 1;
    ok2 = 1;
    do {
      while ((pp < defcpdtable[i].size()) && (wp <= wnum)) {
        if (((pp + 1) < defcpdtable[i].size()) &&
            ((defcpdtable[i][pp + 1] == '*') ||
             (defcpdtable[i][pp + 1] == '?'))) {
          // metacharacter item: '?' may consume at most the current word,
          // '*' may consume words up to index wnum
          int wend = (defcpdtable[i][pp + 1] == '?') ? wp : wnum;
          ok2 = 1;
          pp += 2;
          // record where to resume if this item has to give words back
          btinfo[bt].btpp = pp;
          btinfo[bt].btwp = wp;
          // consume words for as long as they carry the item's flag
          while (wp <= wend) {
            if (!(*words)[wp]->alen ||
                !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp - 2],
                         (*words)[wp]->alen)) {
              ok2 = 0;
              break;
            }
            wp++;
          }
          // words are left over after this item
          if (wp <= wnum)
            ok2 = 0;
          // keep the record only if the item consumed at least one word
          btinfo[bt].btnum = wp - btinfo[bt].btwp;
          if (btinfo[bt].btnum > 0) {
            ++bt;
            btinfo.resize(bt+1);
          }
          if (ok2)
            break;
        } else {
          // plain item: the current word must exist and carry this flag
          ok2 = 1;
          if (!(*words)[wp] || !(*words)[wp]->alen ||
              !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp],
                       (*words)[wp]->alen)) {
            ok = 0;
            break;
          }
          pp++;
          wp++;
          // the rule is exhausted although words remain
          if ((defcpdtable[i].size() == pp) && !(wp > wnum))
            ok = 0;
        }
      }
      if (ok && ok2) {
        // skip the remaining metacharacter items; the rule matches if
        // nothing else is left
        size_t r = pp;
        while ((defcpdtable[i].size() > r) && ((r + 1) < defcpdtable[i].size()) &&
               ((defcpdtable[i][r + 1] == '*') ||
                (defcpdtable[i][r + 1] == '?')))
          r += 2;
        if (defcpdtable[i].size() <= r)
          return 1;
      }
      // backtrack: take one word back from the most recent record and resume
      // right after its metacharacter; records whose count drops below zero
      // are popped
      if (bt)
        do {
          ok = 1;
          btinfo[bt - 1].btnum--;
          pp = btinfo[bt - 1].btpp;
          wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum;
        } while ((btinfo[bt - 1].btnum < 0) && --bt);
    } while (bt);

    // with all == 0 a match is accepted without requiring the end of the rule
    if (ok && ok2 && (!all || (defcpdtable[i].size() <= pp)))
      return 1;

    // check zero ending
    while (ok && ok2 && (defcpdtable[i].size() > pp) &&
           ((pp + 1) < defcpdtable[i].size()) &&
           ((defcpdtable[i][pp + 1] == '*') ||
            (defcpdtable[i][pp + 1] == '?')))
      pp += 2;
    if (ok && ok2 && (defcpdtable[i].size() <= pp))
      return 1;
  }
  // no rule matched: undo the store and, if needed, the borrowing of def
  (*words)[wnum] = NULL;
  if (w)
    *words = NULL;
  return 0;
}
1503 
candidate_check(const char * word,int len)1504 inline int AffixMgr::candidate_check(const char* word, int len) {
1505 
1506   struct hentry* rv = lookup(word);
1507   if (rv)
1508     return 1;
1509 
1510   //  rv = prefix_check(word,len,1);
1511   //  if (rv) return 1;
1512 
1513   rv = affix_check(word, len);
1514   if (rv)
1515     return 1;
1516   return 0;
1517 }
1518 
1519 // calculate number of syllable for compound-checking
get_syllable(const std::string & word)1520 short AffixMgr::get_syllable(const std::string& word) {
1521   if (cpdmaxsyllable == 0)
1522     return 0;
1523 
1524   short num = 0;
1525 
1526   if (!utf8) {
1527     for (size_t i = 0; i < word.size(); ++i) {
1528       if (std::binary_search(cpdvowels.begin(), cpdvowels.end(),
1529                              word[i])) {
1530         ++num;
1531       }
1532     }
1533   } else if (!cpdvowels_utf16.empty()) {
1534     std::vector<w_char> w;
1535     u8_u16(w, word);
1536     for (size_t i = 0; i < w.size(); ++i) {
1537       if (std::binary_search(cpdvowels_utf16.begin(),
1538                              cpdvowels_utf16.end(),
1539                              w[i])) {
1540         ++num;
1541       }
1542     }
1543   }
1544 
1545   return num;
1546 }
1547 
setcminmax(int * cmin,int * cmax,const char * word,int len)1548 void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) {
1549   if (utf8) {
1550     int i;
1551     for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) {
1552       for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
1553         ;
1554     }
1555     for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax >= 0; i++) {
1556       for ((*cmax)--; *cmax >= 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
1557         ;
1558     }
1559   } else {
1560     *cmin = cpdmin;
1561     *cmax = len - cpdmin + 1;
1562   }
1563 }
1564 
1565 // check if compound word is correctly spelled
1566 // hu_mov_rule = spec. Hungarian rule (XXX)
compound_check(const std::string & word,short wordnum,short numsyllable,short maxwordnum,short wnum,hentry ** words=NULL,hentry ** rwords=NULL,char hu_mov_rule=0,char is_sug=0,int * info=NULL)1567 struct hentry* AffixMgr::compound_check(const std::string& word,
1568                                         short wordnum,
1569                                         short numsyllable,
1570                                         short maxwordnum,
1571                                         short wnum,
1572                                         hentry** words = NULL,
1573                                         hentry** rwords = NULL,
1574                                         char hu_mov_rule = 0,
1575                                         char is_sug = 0,
1576                                         int* info = NULL) {
1577   int i;
1578   short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1579   struct hentry* rv = NULL;
1580   struct hentry* rv_first;
1581   std::string st;
1582   char ch = '\0';
1583   int cmin;
1584   int cmax;
1585   int striple = 0;
1586   size_t scpd = 0;
1587   int soldi = 0;
1588   int oldcmin = 0;
1589   int oldcmax = 0;
1590   int oldlen = 0;
1591   int checkedstriple = 0;
1592   char affixed = 0;
1593   hentry** oldwords = words;
1594   size_t len = word.size();
1595 
1596   int checked_prefix;
1597 
1598   // add a time limit to handle possible
1599   // combinatorical explosion of the overlapping words
1600 
1601   HUNSPELL_THREAD_LOCAL clock_t timelimit;
1602 
1603   if (wordnum == 0)
1604       timelimit = clock();
1605   else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
1606       timelimit = 0;
1607   }
1608 
1609   setcminmax(&cmin, &cmax, word.c_str(), len);
1610 
1611   st.assign(word);
1612 
1613   for (i = cmin; i < cmax; i++) {
1614     // go to end of the UTF-8 character
1615     if (utf8) {
1616       for (; (st[i] & 0xc0) == 0x80; i++)
1617         ;
1618       if (i >= cmax)
1619         return NULL;
1620     }
1621 
1622     words = oldwords;
1623     int onlycpdrule = (words) ? 1 : 0;
1624 
1625     do {  // onlycpdrule loop
1626 
1627       oldnumsyllable = numsyllable;
1628       oldwordnum = wordnum;
1629       checked_prefix = 0;
1630 
1631       do {  // simplified checkcompoundpattern loop
1632 
1633         if (timelimit == 0)
1634           return 0;
1635 
1636         if (scpd > 0) {
1637           for (; scpd <= checkcpdtable.size() &&
1638                  (checkcpdtable[scpd - 1].pattern3.empty() ||
1639                   strncmp(word.c_str() + i, checkcpdtable[scpd - 1].pattern3.c_str(),
1640                           checkcpdtable[scpd - 1].pattern3.size()) != 0);
1641                scpd++)
1642             ;
1643 
1644           if (scpd > checkcpdtable.size())
1645             break;  // break simplified checkcompoundpattern loop
1646           st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern);
1647           soldi = i;
1648           i += checkcpdtable[scpd - 1].pattern.size();
1649           st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2);
1650           st.replace(i + checkcpdtable[scpd - 1].pattern2.size(), std::string::npos,
1651                  word.substr(soldi + checkcpdtable[scpd - 1].pattern3.size()));
1652 
1653           oldlen = len;
1654           len += checkcpdtable[scpd - 1].pattern.size() +
1655                  checkcpdtable[scpd - 1].pattern2.size() -
1656                  checkcpdtable[scpd - 1].pattern3.size();
1657           oldcmin = cmin;
1658           oldcmax = cmax;
1659           setcminmax(&cmin, &cmax, st.c_str(), len);
1660 
1661           cmax = len - cpdmin + 1;
1662         }
1663 
1664         ch = st[i];
1665         st[i] = '\0';
1666 
1667         sfx = NULL;
1668         pfx = NULL;
1669 
1670         // FIRST WORD
1671 
1672         affixed = 1;
1673         rv = lookup(st.c_str());  // perhaps without prefix
1674 
1675         // forbid dictionary stems with COMPOUNDFORBIDFLAG in
1676         // compound words, overriding the effect of COMPOUNDPERMITFLAG
1677         if ((rv) && compoundforbidflag &&
1678                 TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
1679             continue;
1680 
1681         // search homonym with compound flag
1682         while ((rv) && !hu_mov_rule &&
1683                ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1684                 !((compoundflag && !words && !onlycpdrule &&
1685                    TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1686                   (compoundbegin && !wordnum && !onlycpdrule &&
1687                    TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1688                   (compoundmiddle && wordnum && !words && !onlycpdrule &&
1689                    TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
1690                   (!defcpdtable.empty() && onlycpdrule &&
1691                    ((!words && !wordnum &&
1692                      defcpd_check(&words, wnum, rv, rwords, 0)) ||
1693                     (words &&
1694                      defcpd_check(&words, wnum, rv, rwords, 0))))) ||
1695                 (scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL &&
1696                  !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) {
1697           rv = rv->next_homonym;
1698         }
1699 
1700         if (rv)
1701           affixed = 0;
1702 
1703         if (!rv) {
1704           if (onlycpdrule)
1705             break;
1706           if (compoundflag &&
1707               !(rv = prefix_check(st.c_str(), i,
1708                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1709                                   compoundflag))) {
1710             if (((rv = suffix_check(
1711                       st.c_str(), i, 0, NULL, FLAG_NULL, compoundflag,
1712                       hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1713                  (compoundmoresuffixes &&
1714                   (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
1715                 !hu_mov_rule && sfx->getCont() &&
1716                 ((compoundforbidflag &&
1717                   TESTAFF(sfx->getCont(), compoundforbidflag,
1718                           sfx->getContLen())) ||
1719                  (compoundend &&
1720                   TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
1721               rv = NULL;
1722             }
1723           }
1724 
1725           if (rv ||
1726               (((wordnum == 0) && compoundbegin &&
1727                 ((rv = suffix_check(
1728                       st.c_str(), i, 0, NULL, FLAG_NULL, compoundbegin,
1729                       hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1730                  (compoundmoresuffixes &&
1731                   (rv = suffix_check_twosfx(
1732                        st.c_str(), i, 0, NULL,
1733                        compoundbegin))) ||  // twofold suffixes + compound
1734                  (rv = prefix_check(st.c_str(), i,
1735                                     hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1736                                     compoundbegin)))) ||
1737                ((wordnum > 0) && compoundmiddle &&
1738                 ((rv = suffix_check(
1739                       st.c_str(), i, 0, NULL, FLAG_NULL, compoundmiddle,
1740                       hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1741                  (compoundmoresuffixes &&
1742                   (rv = suffix_check_twosfx(
1743                        st.c_str(), i, 0, NULL,
1744                        compoundmiddle))) ||  // twofold suffixes + compound
1745                  (rv = prefix_check(st.c_str(), i,
1746                                     hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1747                                     compoundmiddle))))))
1748             checked_prefix = 1;
1749           // else check forbiddenwords and needaffix
1750         } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1751                                 TESTAFF(rv->astr, needaffix, rv->alen) ||
1752                                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1753                                 (is_sug && nosuggest &&
1754                                  TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1755           st[i] = ch;
1756           // continue;
1757           break;
1758         }
1759 
1760         // check non_compound flag in suffix and prefix
1761         if ((rv) && !hu_mov_rule &&
1762             ((pfx && pfx->getCont() &&
1763               TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
1764              (sfx && sfx->getCont() &&
1765               TESTAFF(sfx->getCont(), compoundforbidflag,
1766                       sfx->getContLen())))) {
1767           rv = NULL;
1768         }
1769 
1770         // check compoundend flag in suffix and prefix
1771         if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
1772             ((pfx && pfx->getCont() &&
1773               TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
1774              (sfx && sfx->getCont() &&
1775               TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
1776           rv = NULL;
1777         }
1778 
1779         // check compoundmiddle flag in suffix and prefix
1780         if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
1781             !hu_mov_rule &&
1782             ((pfx && pfx->getCont() &&
1783               TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
1784              (sfx && sfx->getCont() &&
1785               TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
1786           rv = NULL;
1787         }
1788 
1789         // check forbiddenwords
1790         if ((rv) && (rv->astr) &&
1791             (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1792              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1793              (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1794           return NULL;
1795         }
1796 
1797         // increment word number, if the second root has a compoundroot flag
1798         if ((rv) && compoundroot &&
1799             (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1800           wordnum++;
1801         }
1802 
1803         // first word is acceptable in compound words?
1804         if (((rv) &&
1805              (checked_prefix || (words && words[wnum]) ||
1806               (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1807               ((oldwordnum == 0) && compoundbegin &&
1808                TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1809               ((oldwordnum > 0) && compoundmiddle &&
1810                TESTAFF(rv->astr, compoundmiddle, rv->alen))
1811 
1812               // LANG_hu section: spec. Hungarian rule
1813               || ((langnum == LANG_hu) && hu_mov_rule &&
1814                   (TESTAFF(
1815                        rv->astr, 'F',
1816                        rv->alen) ||  // XXX hardwired Hungarian dictionary codes
1817                    TESTAFF(rv->astr, 'G', rv->alen) ||
1818                    TESTAFF(rv->astr, 'H', rv->alen)))
1819               // END of LANG_hu section
1820               ) &&
1821              (
1822                  // test CHECKCOMPOUNDPATTERN conditions
1823                  scpd == 0 || checkcpdtable[scpd - 1].cond == FLAG_NULL ||
1824                  TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)) &&
1825              !((checkcompoundtriple && scpd == 0 &&
1826                 !words &&  // test triple letters
1827                 (word[i - 1] == word[i]) &&
1828                 (((i > 1) && (word[i - 1] == word[i - 2])) ||
1829                  ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
1830                  )) ||
1831                (checkcompoundcase && scpd == 0 && !words &&
1832                 cpdcase_check(word.c_str(), i))))
1833             // LANG_hu section: spec. Hungarian rule
1834             || ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
1835                 (rv = affix_check(st.c_str(), i)) &&
1836                 (sfx && sfx->getCont() &&
1837                  (  // XXX hardwired Hungarian dic. codes
1838                      TESTAFF(sfx->getCont(), (unsigned short)'x',
1839                              sfx->getContLen()) ||
1840                      TESTAFF(
1841                          sfx->getCont(), (unsigned short)'%',
1842                          sfx->getContLen()))))) {  // first word is ok condition
1843 
1844           // LANG_hu section: spec. Hungarian rule
1845           if (langnum == LANG_hu) {
1846             // calculate syllable number of the word
1847             numsyllable += get_syllable(st.substr(0, i));
1848             // + 1 word, if syllable number of the prefix > 1 (hungarian
1849             // convention)
1850             if (pfx && (get_syllable(pfx->getKey()) > 1))
1851               wordnum++;
1852           }
1853           // END of LANG_hu section
1854 
1855           // NEXT WORD(S)
1856           rv_first = rv;
1857           st[i] = ch;
1858 
1859           do {  // striple loop
1860 
1861             // check simplifiedtriple
1862             if (simplifiedtriple) {
1863               if (striple) {
1864                 checkedstriple = 1;
1865                 i--;  // check "fahrt" instead of "ahrt" in "Schiffahrt"
1866               } else if (i > 2 && word[i - 1] == word[i - 2])
1867                 striple = 1;
1868             }
1869 
1870             rv = lookup(st.c_str() + i);  // perhaps without prefix
1871 
1872             // search homonym with compound flag
1873             while ((rv) &&
1874                    ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1875                     !((compoundflag && !words &&
1876                        TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1877                       (compoundend && !words &&
1878                        TESTAFF(rv->astr, compoundend, rv->alen)) ||
1879                       (!defcpdtable.empty() && words &&
1880                        defcpd_check(&words, wnum + 1, rv, NULL, 1))) ||
1881                     (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL &&
1882                      !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2,
1883                               rv->alen)))) {
1884               rv = rv->next_homonym;
1885             }
1886 
1887             // check FORCEUCASE
1888             if (rv && forceucase && (rv) &&
1889                 (TESTAFF(rv->astr, forceucase, rv->alen)) &&
1890                 !(info && *info & SPELL_ORIGCAP))
1891               rv = NULL;
1892 
1893             if (rv && words && words[wnum + 1])
1894               return rv_first;
1895 
1896             oldnumsyllable2 = numsyllable;
1897             oldwordnum2 = wordnum;
1898 
1899             // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary
1900             // code
1901             if ((rv) && (langnum == LANG_hu) &&
1902                 (TESTAFF(rv->astr, 'I', rv->alen)) &&
1903                 !(TESTAFF(rv->astr, 'J', rv->alen))) {
1904               numsyllable--;
1905             }
1906             // END of LANG_hu section
1907 
1908             // increment word number, if the second root has a compoundroot flag
1909             if ((rv) && (compoundroot) &&
1910                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1911               wordnum++;
1912             }
1913 
1914             // check forbiddenwords
1915             if ((rv) && (rv->astr) &&
1916                 (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1917                  TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1918                  (is_sug && nosuggest &&
1919                   TESTAFF(rv->astr, nosuggest, rv->alen))))
1920               return NULL;
1921 
1922             // second word is acceptable, as a root?
1923             // hungarian conventions: compounding is acceptable,
1924             // when compound forms consist of 2 words, or if more,
1925             // then the syllable number of root words must be 6, or lesser.
1926 
1927             if ((rv) &&
1928                 ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1929                  (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
1930                 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
1931                  ((cpdmaxsyllable != 0) &&
1932                   (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
1933                    cpdmaxsyllable))) &&
1934                 (
1935                     // test CHECKCOMPOUNDPATTERN
1936                     checkcpdtable.empty() || scpd != 0 ||
1937                     !cpdpat_check(word.c_str(), i, rv_first, rv, 0)) &&
1938                 ((!checkcompounddup || (rv != rv_first)))
1939                 // test CHECKCOMPOUNDPATTERN conditions
1940                 &&
1941                 (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
1942                  TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) {
1943               // forbid compound word, if it is a non-compound word with typical
1944               // fault
1945               if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
1946                       cpdwordpair_check(word.c_str(), len))
1947                 return NULL;
1948               return rv_first;
1949             }
1950 
1951             numsyllable = oldnumsyllable2;
1952             wordnum = oldwordnum2;
1953 
1954             // perhaps second word has prefix or/and suffix
1955             sfx = NULL;
1956             sfxflag = FLAG_NULL;
1957             rv = (compoundflag && !onlycpdrule)
1958                      ? affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundflag,
1959                                    IN_CPD_END)
1960                      : NULL;
1961             if (!rv && compoundend && !onlycpdrule) {
1962               sfx = NULL;
1963               pfx = NULL;
1964               rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundend,
1965                                IN_CPD_END);
1966             }
1967 
1968             if (!rv && !defcpdtable.empty() && words) {
1969               rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), 0, IN_CPD_END);
1970               if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1))
1971                 return rv_first;
1972               rv = NULL;
1973             }
1974 
1975             // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1976             if (rv &&
1977                 !(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
1978                   TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen)))
1979               rv = NULL;
1980 
1981             // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
1982             if (rv && !checkcpdtable.empty() && scpd == 0 &&
1983                 cpdpat_check(word.c_str(), i, rv_first, rv, affixed))
1984               rv = NULL;
1985 
1986             // check non_compound flag in suffix and prefix
1987             if ((rv) && ((pfx && pfx->getCont() &&
1988                           TESTAFF(pfx->getCont(), compoundforbidflag,
1989                                   pfx->getContLen())) ||
1990                          (sfx && sfx->getCont() &&
1991                           TESTAFF(sfx->getCont(), compoundforbidflag,
1992                                   sfx->getContLen())))) {
1993               rv = NULL;
1994             }
1995 
1996             // check FORCEUCASE
1997             if (rv && forceucase && (rv) &&
1998                 (TESTAFF(rv->astr, forceucase, rv->alen)) &&
1999                 !(info && *info & SPELL_ORIGCAP))
2000               rv = NULL;
2001 
2002             // check forbiddenwords
2003             if ((rv) && (rv->astr) &&
2004                 (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2005                  TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
2006                  (is_sug && nosuggest &&
2007                   TESTAFF(rv->astr, nosuggest, rv->alen))))
2008               return NULL;
2009 
2010             // pfxappnd = prefix of word+i, or NULL
2011             // calculate syllable number of prefix.
2012             // hungarian convention: when syllable number of prefix is more,
2013             // than 1, the prefix+word counts as two words.
2014 
2015             if (langnum == LANG_hu) {
2016               // calculate syllable number of the word
2017               numsyllable += get_syllable(word.c_str() + i);
2018 
2019               // - affix syllable num.
2020               // XXX only second suffix (inflections, not derivations)
2021               if (sfxappnd) {
2022                 std::string tmp(sfxappnd);
2023                 reverseword(tmp);
2024                 numsyllable -= short(get_syllable(tmp) + sfxextra);
2025               } else {
2026                 numsyllable -= short(sfxextra);
2027               }
2028 
2029               // + 1 word, if syllable number of the prefix > 1 (hungarian
2030               // convention)
2031               if (pfx && (get_syllable(pfx->getKey()) > 1))
2032                 wordnum++;
2033 
2034               // increment syllable num, if last word has a SYLLABLENUM flag
2035               // and the suffix is beginning `s'
2036 
2037               if (!cpdsyllablenum.empty()) {
2038                 switch (sfxflag) {
2039                   case 'c': {
2040                     numsyllable += 2;
2041                     break;
2042                   }
2043                   case 'J': {
2044                     numsyllable += 1;
2045                     break;
2046                   }
2047                   case 'I': {
2048                     if (rv && TESTAFF(rv->astr, 'J', rv->alen))
2049                       numsyllable += 1;
2050                     break;
2051                   }
2052                 }
2053               }
2054             }
2055 
2056             // increment word number, if the second word has a compoundroot flag
2057             if ((rv) && (compoundroot) &&
2058                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2059               wordnum++;
2060             }
2061             // second word is acceptable, as a word with prefix or/and suffix?
2062             // hungarian conventions: compounding is acceptable,
2063             // when compound forms consist 2 word, otherwise
2064             // the syllable number of root words is 6, or lesser.
2065             if ((rv) &&
2066                 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2067                  ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
2068                 ((!checkcompounddup || (rv != rv_first)))) {
2069               // forbid compound word, if it is a non-compound word with typical
2070               // fault
2071               if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
2072                       cpdwordpair_check(word.c_str(), len))
2073                 return NULL;
2074               return rv_first;
2075             }
2076 
2077             numsyllable = oldnumsyllable2;
2078             wordnum = oldwordnum2;
2079 
2080             // perhaps second word is a compound word (recursive call)
2081             if (wordnum + 2 < maxwordnum) {
2082               rv = compound_check(st.substr(i), wordnum + 1,
2083                                   numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
2084                                   is_sug, info);
2085 
2086               if (rv && !checkcpdtable.empty() &&
2087                   ((scpd == 0 &&
2088                     cpdpat_check(word.c_str(), i, rv_first, rv, affixed)) ||
2089                    (scpd != 0 &&
2090                     !cpdpat_check(word.c_str(), i, rv_first, rv, affixed))))
2091                 rv = NULL;
2092             } else {
2093               rv = NULL;
2094             }
2095             if (rv) {
2096               // forbid compound word, if it is a non-compound word with typical
2097               // fault, or a dictionary word pair
2098 
2099               if (cpdwordpair_check(word.c_str(), len))
2100                   return NULL;
2101 
2102               if (checkcompoundrep || forbiddenword) {
2103 
2104                 if (checkcompoundrep && cpdrep_check(word.c_str(), len))
2105                   return NULL;
2106 
2107                 // check first part
2108                 if (strncmp(rv->word, word.c_str() + i, rv->blen) == 0) {
2109                   char r = st[i + rv->blen];
2110                   st[i + rv->blen] = '\0';
2111 
2112                   if ((checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) ||
2113                       cpdwordpair_check(st.c_str(), i + rv->blen)) {
2114                     st[ + i + rv->blen] = r;
2115                     continue;
2116                   }
2117 
2118                   if (forbiddenword) {
2119                     struct hentry* rv2 = lookup(word.c_str());
2120                     if (!rv2)
2121                       rv2 = affix_check(word.c_str(), len);
2122                     if (rv2 && rv2->astr &&
2123                         TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
2124                         (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) {
2125                       return NULL;
2126                     }
2127                   }
2128                   st[i + rv->blen] = r;
2129                 }
2130               }
2131               return rv_first;
2132             }
2133           } while (striple && !checkedstriple);  // end of striple loop
2134 
2135           if (checkedstriple) {
2136             i++;
2137             checkedstriple = 0;
2138             striple = 0;
2139           }
2140 
2141         }  // first word is ok condition
2142 
2143         if (soldi != 0) {
2144           i = soldi;
2145           soldi = 0;
2146           len = oldlen;
2147           cmin = oldcmin;
2148           cmax = oldcmax;
2149         }
2150         scpd++;
2151 
2152       } while (!onlycpdrule && simplifiedcpd &&
2153                scpd <= checkcpdtable.size());  // end of simplifiedcpd loop
2154 
2155       scpd = 0;
2156       wordnum = oldwordnum;
2157       numsyllable = oldnumsyllable;
2158 
2159       if (soldi != 0) {
2160         i = soldi;
2161         st.assign(word);  // XXX add more optim.
2162         soldi = 0;
2163       } else
2164         st[i] = ch;
2165 
2166     } while (!defcpdtable.empty() && oldwordnum == 0 &&
2167              onlycpdrule++ < 1);  // end of onlycpd loop
2168   }
2169 
2170   return NULL;
2171 }
2172 
2173 // check if compound word is correctly spelled and append the morphological
2174 // fields of its parts to `result`; hu_mov_rule = spec. Hungarian rule (XXX)
compound_check_morph(const char * word,int len,short wordnum,short numsyllable,short maxwordnum,short wnum,hentry ** words,hentry ** rwords,char hu_mov_rule,std::string & result,const std::string * partresult)2175 int AffixMgr::compound_check_morph(const char* word,
2176                                    int len,
2177                                    short wordnum,
2178                                    short numsyllable,
2179                                    short maxwordnum,
2180                                    short wnum,
2181                                    hentry** words,
2182                                    hentry** rwords,
2183                                    char hu_mov_rule,
2184                                    std::string& result,
2185                                    const std::string* partresult) {
2186   int i;
2187   short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
2188   int ok = 0;
2189 
2190   struct hentry* rv = NULL;
2191   struct hentry* rv_first;
2192   std::string st;
2193   char ch;
2194 
2195   int checked_prefix;
2196   std::string presult;
2197 
2198   int cmin;
2199   int cmax;
2200 
2201   char affixed = 0;
2202   hentry** oldwords = words;
2203 
2204   // add a time limit to handle possible
2205   // combinatorical explosion of the overlapping words
2206 
2207   HUNSPELL_THREAD_LOCAL clock_t timelimit;
2208 
2209   if (wordnum == 0)
2210       timelimit = clock();
2211   else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
2212       timelimit = 0;
2213   }
2214 
2215   setcminmax(&cmin, &cmax, word, len);
2216 
2217   st.assign(word);
2218 
2219   for (i = cmin; i < cmax; i++) {
2220     // go to end of the UTF-8 character
2221     if (utf8) {
2222       for (; (st[i] & 0xc0) == 0x80; i++)
2223         ;
2224       if (i >= cmax)
2225         return 0;
2226     }
2227 
2228     words = oldwords;
2229     int onlycpdrule = (words) ? 1 : 0;
2230 
2231     do {  // onlycpdrule loop
2232 
2233       if (timelimit == 0)
2234         return 0;
2235 
2236       oldnumsyllable = numsyllable;
2237       oldwordnum = wordnum;
2238       checked_prefix = 0;
2239 
2240       ch = st[i];
2241       st[i] = '\0';
2242       sfx = NULL;
2243 
2244       // FIRST WORD
2245 
2246       affixed = 1;
2247 
2248       presult.clear();
2249       if (partresult)
2250         presult.append(*partresult);
2251 
2252       rv = lookup(st.c_str());  // perhaps without prefix
2253 
2254       // forbid dictionary stems with COMPOUNDFORBIDFLAG in
2255       // compound words, overriding the effect of COMPOUNDPERMITFLAG
2256       if ((rv) && compoundforbidflag &&
2257               TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
2258           continue;
2259 
2260       // search homonym with compound flag
2261       while ((rv) && !hu_mov_rule &&
2262              ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2263               !((compoundflag && !words && !onlycpdrule &&
2264                  TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2265                 (compoundbegin && !wordnum && !onlycpdrule &&
2266                  TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2267                 (compoundmiddle && wordnum && !words && !onlycpdrule &&
2268                  TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
2269                 (!defcpdtable.empty() && onlycpdrule &&
2270                  ((!words && !wordnum &&
2271                    defcpd_check(&words, wnum, rv, rwords, 0)) ||
2272                   (words &&
2273                    defcpd_check(&words, wnum, rv, rwords, 0))))))) {
2274         rv = rv->next_homonym;
2275       }
2276 
2277       if (timelimit == 0)
2278         return 0;
2279 
2280       if (rv)
2281         affixed = 0;
2282 
2283       if (rv) {
2284         presult.push_back(MSEP_FLD);
2285         presult.append(MORPH_PART);
2286         presult.append(st.c_str());
2287         if (!HENTRY_FIND(rv, MORPH_STEM)) {
2288           presult.push_back(MSEP_FLD);
2289           presult.append(MORPH_STEM);
2290           presult.append(st.c_str());
2291         }
2292         if (HENTRY_DATA(rv)) {
2293           presult.push_back(MSEP_FLD);
2294           presult.append(HENTRY_DATA2(rv));
2295         }
2296       }
2297 
2298       if (!rv) {
2299         if (compoundflag &&
2300             !(rv =
2301                   prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2302                                compoundflag))) {
2303           if (((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
2304                                   compoundflag,
2305                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2306                (compoundmoresuffixes &&
2307                 (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
2308               !hu_mov_rule && sfx->getCont() &&
2309               ((compoundforbidflag &&
2310                 TESTAFF(sfx->getCont(), compoundforbidflag,
2311                         sfx->getContLen())) ||
2312                (compoundend &&
2313                 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
2314             rv = NULL;
2315           }
2316         }
2317 
2318         if (rv ||
2319             (((wordnum == 0) && compoundbegin &&
2320               ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
2321                                   compoundbegin,
2322                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2323                (compoundmoresuffixes &&
2324                 (rv = suffix_check_twosfx(
2325                      st.c_str(), i, 0, NULL,
2326                      compoundbegin))) ||  // twofold suffix+compound
2327                (rv = prefix_check(st.c_str(), i,
2328                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2329                                   compoundbegin)))) ||
2330              ((wordnum > 0) && compoundmiddle &&
2331               ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
2332                                   compoundmiddle,
2333                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2334                (compoundmoresuffixes &&
2335                 (rv = suffix_check_twosfx(
2336                      st.c_str(), i, 0, NULL,
2337                      compoundmiddle))) ||  // twofold suffix+compound
2338                (rv = prefix_check(st.c_str(), i,
2339                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2340                                   compoundmiddle)))))) {
2341           std::string p;
2342           if (compoundflag)
2343             p = affix_check_morph(st.c_str(), i, compoundflag);
2344           if (p.empty()) {
2345             if ((wordnum == 0) && compoundbegin) {
2346               p = affix_check_morph(st.c_str(), i, compoundbegin);
2347             } else if ((wordnum > 0) && compoundmiddle) {
2348               p = affix_check_morph(st.c_str(), i, compoundmiddle);
2349             }
2350           }
2351           if (!p.empty()) {
2352             presult.push_back(MSEP_FLD);
2353             presult.append(MORPH_PART);
2354             presult.append(st.c_str());
2355             line_uniq_app(p, MSEP_REC);
2356             presult.append(p);
2357           }
2358           checked_prefix = 1;
2359         }
2360         // else check forbiddenwords
2361       } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2362                               TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
2363                               TESTAFF(rv->astr, needaffix, rv->alen))) {
2364         st[i] = ch;
2365         continue;
2366       }
2367 
2368       // check non_compound flag in suffix and prefix
2369       if ((rv) && !hu_mov_rule &&
2370           ((pfx && pfx->getCont() &&
2371             TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
2372            (sfx && sfx->getCont() &&
2373             TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) {
2374         continue;
2375       }
2376 
2377       // check compoundend flag in suffix and prefix
2378       if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
2379           ((pfx && pfx->getCont() &&
2380             TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
2381            (sfx && sfx->getCont() &&
2382             TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
2383         continue;
2384       }
2385 
2386       // check compoundmiddle flag in suffix and prefix
2387       if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
2388           !hu_mov_rule &&
2389           ((pfx && pfx->getCont() &&
2390             TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
2391            (sfx && sfx->getCont() &&
2392             TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
2393         rv = NULL;
2394       }
2395 
2396       // check forbiddenwords
2397       if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2398                                  TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)))
2399         continue;
2400 
2401       // increment word number, if the second root has a compoundroot flag
2402       if ((rv) && (compoundroot) &&
2403           (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2404         wordnum++;
2405       }
2406 
2407       // first word is acceptable in compound words?
2408       if (((rv) &&
2409            (checked_prefix || (words && words[wnum]) ||
2410             (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2411             ((oldwordnum == 0) && compoundbegin &&
2412              TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2413             ((oldwordnum > 0) && compoundmiddle &&
2414              TESTAFF(rv->astr, compoundmiddle, rv->alen))
2415             // LANG_hu section: spec. Hungarian rule
2416             || ((langnum == LANG_hu) &&  // hu_mov_rule
2417                 hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) ||
2418                                 TESTAFF(rv->astr, 'G', rv->alen) ||
2419                                 TESTAFF(rv->astr, 'H', rv->alen)))
2420             // END of LANG_hu section
2421             ) &&
2422            !((checkcompoundtriple && !words &&  // test triple letters
2423               (word[i - 1] == word[i]) &&
2424               (((i > 1) && (word[i - 1] == word[i - 2])) ||
2425                ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
2426                )) ||
2427              (
2428                  // test CHECKCOMPOUNDPATTERN
2429                  !checkcpdtable.empty() && !words &&
2430                  cpdpat_check(word, i, rv, NULL, affixed)) ||
2431              (checkcompoundcase && !words && cpdcase_check(word, i))))
2432           // LANG_hu section: spec. Hungarian rule
2433           ||
2434           ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
2435            (rv = affix_check(st.c_str(), i)) &&
2436            (sfx && sfx->getCont() &&
2437             (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) ||
2438              TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen()))))
2439           // END of LANG_hu section
2440           ) {
2441         // LANG_hu section: spec. Hungarian rule
2442         if (langnum == LANG_hu) {
2443           // calculate syllable number of the word
2444           numsyllable += get_syllable(st.substr(0, i));
2445 
2446           // + 1 word, if syllable number of the prefix > 1 (hungarian
2447           // convention)
2448           if (pfx && (get_syllable(pfx->getKey()) > 1))
2449             wordnum++;
2450         }
2451         // END of LANG_hu section
2452 
2453         // NEXT WORD(S)
2454         rv_first = rv;
2455         rv = lookup((word + i));  // perhaps without prefix
2456 
2457         // search homonym with compound flag
2458         while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2459                         !((compoundflag && !words &&
2460                            TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2461                           (compoundend && !words &&
2462                            TESTAFF(rv->astr, compoundend, rv->alen)) ||
2463                           (!defcpdtable.empty() && words &&
2464                            defcpd_check(&words, wnum + 1, rv, NULL, 1))))) {
2465           rv = rv->next_homonym;
2466         }
2467 
2468         if (rv && words && words[wnum + 1]) {
2469           result.append(presult);
2470           result.push_back(MSEP_FLD);
2471           result.append(MORPH_PART);
2472           result.append(word + i);
2473           if (complexprefixes && HENTRY_DATA(rv))
2474             result.append(HENTRY_DATA2(rv));
2475           if (!HENTRY_FIND(rv, MORPH_STEM)) {
2476             result.push_back(MSEP_FLD);
2477             result.append(MORPH_STEM);
2478             result.append(HENTRY_WORD(rv));
2479           }
2480           // store the pointer of the hash entry
2481           if (!complexprefixes && HENTRY_DATA(rv)) {
2482             result.push_back(MSEP_FLD);
2483             result.append(HENTRY_DATA2(rv));
2484           }
2485           result.push_back(MSEP_REC);
2486           return 0;
2487         }
2488 
2489         oldnumsyllable2 = numsyllable;
2490         oldwordnum2 = wordnum;
2491 
2492         // LANG_hu section: spec. Hungarian rule
2493         if ((rv) && (langnum == LANG_hu) &&
2494             (TESTAFF(rv->astr, 'I', rv->alen)) &&
2495             !(TESTAFF(rv->astr, 'J', rv->alen))) {
2496           numsyllable--;
2497         }
2498         // END of LANG_hu section
2499         // increment word number, if the second root has a compoundroot flag
2500         if ((rv) && (compoundroot) &&
2501             (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2502           wordnum++;
2503         }
2504 
2505         // check forbiddenwords
2506         if ((rv) && (rv->astr) &&
2507             (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2508              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
2509           st[i] = ch;
2510           continue;
2511         }
2512 
2513         // second word is acceptable, as a root?
2514         // hungarian conventions: compounding is acceptable,
2515         // when compound forms consist of 2 words, or if more,
2516         // then the syllable number of root words must be 6, or lesser.
2517         if ((rv) &&
2518             ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2519              (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
2520             (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2521              ((cpdmaxsyllable != 0) &&
2522               (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
2523                cpdmaxsyllable))) &&
2524             ((!checkcompounddup || (rv != rv_first)))) {
2525           // bad compound word
2526           result.append(presult);
2527           result.push_back(MSEP_FLD);
2528           result.append(MORPH_PART);
2529           result.append(word + i);
2530 
2531           if (HENTRY_DATA(rv)) {
2532             if (complexprefixes)
2533               result.append(HENTRY_DATA2(rv));
2534             if (!HENTRY_FIND(rv, MORPH_STEM)) {
2535               result.push_back(MSEP_FLD);
2536               result.append(MORPH_STEM);
2537               result.append(HENTRY_WORD(rv));
2538             }
2539             // store the pointer of the hash entry
2540             if (!complexprefixes) {
2541               result.push_back(MSEP_FLD);
2542               result.append(HENTRY_DATA2(rv));
2543             }
2544           }
2545           result.push_back(MSEP_REC);
2546           ok = 1;
2547         }
2548 
2549         numsyllable = oldnumsyllable2;
2550         wordnum = oldwordnum2;
2551 
2552         // perhaps second word has prefix or/and suffix
2553         sfx = NULL;
2554         sfxflag = FLAG_NULL;
2555 
2556         if (compoundflag && !onlycpdrule)
2557           rv = affix_check((word + i), strlen(word + i), compoundflag);
2558         else
2559           rv = NULL;
2560 
2561         if (!rv && compoundend && !onlycpdrule) {
2562           sfx = NULL;
2563           pfx = NULL;
2564           rv = affix_check((word + i), strlen(word + i), compoundend);
2565         }
2566 
2567         if (!rv && !defcpdtable.empty() && words) {
2568           rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END);
2569           if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
2570             std::string m;
2571             if (compoundflag)
2572               m = affix_check_morph((word + i), strlen(word + i), compoundflag);
2573             if (m.empty() && compoundend) {
2574               m = affix_check_morph((word + i), strlen(word + i), compoundend);
2575             }
2576             result.append(presult);
2577             if (!m.empty()) {
2578               result.push_back(MSEP_FLD);
2579               result.append(MORPH_PART);
2580               result.append(word + i);
2581               line_uniq_app(m, MSEP_REC);
2582               result.append(m);
2583             }
2584             result.push_back(MSEP_REC);
2585             ok = 1;
2586           }
2587         }
2588 
2589         // check non_compound flag in suffix and prefix
2590         if ((rv) &&
2591             ((pfx && pfx->getCont() &&
2592               TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
2593              (sfx && sfx->getCont() &&
2594               TESTAFF(sfx->getCont(), compoundforbidflag,
2595                       sfx->getContLen())))) {
2596           rv = NULL;
2597         }
2598 
2599         // check forbiddenwords
2600         if ((rv) && (rv->astr) &&
2601             (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2602              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) &&
2603             (!TESTAFF(rv->astr, needaffix, rv->alen))) {
2604           st[i] = ch;
2605           continue;
2606         }
2607 
2608         if (langnum == LANG_hu) {
2609           // calculate syllable number of the word
2610           numsyllable += get_syllable(word + i);
2611 
2612           // - affix syllable num.
2613           // XXX only second suffix (inflections, not derivations)
2614           if (sfxappnd) {
2615             std::string tmp(sfxappnd);
2616             reverseword(tmp);
2617             numsyllable -= short(get_syllable(tmp) + sfxextra);
2618           } else {
2619             numsyllable -= short(sfxextra);
2620           }
2621 
2622           // + 1 word, if syllable number of the prefix > 1 (hungarian
2623           // convention)
2624           if (pfx && (get_syllable(pfx->getKey()) > 1))
2625             wordnum++;
2626 
2627           // increment syllable num, if last word has a SYLLABLENUM flag
2628           // and the suffix is beginning `s'
2629 
2630           if (!cpdsyllablenum.empty()) {
2631             switch (sfxflag) {
2632               case 'c': {
2633                 numsyllable += 2;
2634                 break;
2635               }
2636               case 'J': {
2637                 numsyllable += 1;
2638                 break;
2639               }
2640               case 'I': {
2641                 if (rv && TESTAFF(rv->astr, 'J', rv->alen))
2642                   numsyllable += 1;
2643                 break;
2644               }
2645             }
2646           }
2647         }
2648 
2649         // increment word number, if the second word has a compoundroot flag
2650         if ((rv) && (compoundroot) &&
2651             (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2652           wordnum++;
2653         }
2654         // second word is acceptable, as a word with prefix or/and suffix?
2655         // hungarian conventions: compounding is acceptable,
2656         // when compound forms consist 2 word, otherwise
2657         // the syllable number of root words is 6, or lesser.
2658         if ((rv) &&
2659             (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2660              ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
2661             ((!checkcompounddup || (rv != rv_first)))) {
2662           std::string m;
2663           if (compoundflag)
2664             m = affix_check_morph((word + i), strlen(word + i), compoundflag);
2665           if (m.empty() && compoundend) {
2666             m = affix_check_morph((word + i), strlen(word + i), compoundend);
2667           }
2668           result.append(presult);
2669           if (!m.empty()) {
2670             result.push_back(MSEP_FLD);
2671             result.append(MORPH_PART);
2672             result.append(word + i);
2673             line_uniq_app(m, MSEP_REC);
2674             result.push_back(MSEP_FLD);
2675             result.append(m);
2676           }
2677           result.push_back(MSEP_REC);
2678           ok = 1;
2679         }
2680 
2681         numsyllable = oldnumsyllable2;
2682         wordnum = oldwordnum2;
2683 
2684         // perhaps second word is a compound word (recursive call)
2685         if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
2686           compound_check_morph((word + i), strlen(word + i), wordnum + 1,
2687                                numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
2688                                result, &presult);
2689         } else {
2690           rv = NULL;
2691         }
2692       }
2693       st[i] = ch;
2694       wordnum = oldwordnum;
2695       numsyllable = oldnumsyllable;
2696 
2697     } while (!defcpdtable.empty() && oldwordnum == 0 &&
2698              onlycpdrule++ < 1);  // end of onlycpd loop
2699   }
2700   return 0;
2701 }
2702 
2703 
isRevSubset(const char * s1,const char * end_of_s2,int len)2704 inline int AffixMgr::isRevSubset(const char* s1,
2705                                  const char* end_of_s2,
2706                                  int len) {
2707   while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
2708     s1++;
2709     end_of_s2--;
2710     len--;
2711   }
2712   return (*s1 == '\0');
2713 }
2714 
2715 // check word for suffixes
suffix_check(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,char in_compound)2716 struct hentry* AffixMgr::suffix_check(const char* word,
2717                                       int len,
2718                                       int sfxopts,
2719                                       PfxEntry* ppfx,
2720                                       const FLAG cclass,
2721                                       const FLAG needflag,
2722                                       char in_compound) {
2723   struct hentry* rv = NULL;
2724   PfxEntry* ep = ppfx;
2725 
2726   // first handle the special case of 0 length suffixes
2727   SfxEntry* se = sStart[0];
2728 
2729   while (se) {
2730     if (!cclass || se->getCont()) {
2731       // suffixes are not allowed in beginning of compounds
2732       if ((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
2733            // except when signed with compoundpermitflag flag
2734            (se->getCont() && compoundpermitflag &&
2735             TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
2736           (!circumfix ||
2737            // no circumfix flag in prefix and suffix
2738            ((!ppfx || !(ep->getCont()) ||
2739              !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2740             (!se->getCont() ||
2741              !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
2742            // circumfix flag in prefix AND suffix
2743            ((ppfx && (ep->getCont()) &&
2744              TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2745             (se->getCont() &&
2746              (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
2747           // fogemorpheme
2748           (in_compound ||
2749            !(se->getCont() &&
2750              (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
2751           // needaffix on prefix or first suffix
2752           (cclass ||
2753            !(se->getCont() &&
2754              TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2755            (ppfx &&
2756             !((ep->getCont()) &&
2757               TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) {
2758         rv = se->checkword(word, len, sfxopts, ppfx,
2759                            (FLAG)cclass, needflag,
2760                            (in_compound ? 0 : onlyincompound));
2761         if (rv) {
2762           sfx = se;  // BUG: sfx not stateless
2763           return rv;
2764         }
2765       }
2766     }
2767     se = se->getNext();
2768   }
2769 
2770   // now handle the general case
2771   if (len == 0)
2772     return NULL;  // FULLSTRIP
2773   unsigned char sp = *((const unsigned char*)(word + len - 1));
2774   SfxEntry* sptr = sStart[sp];
2775 
2776   while (sptr) {
2777     if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2778       // suffixes are not allowed in beginning of compounds
2779       if ((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
2780            // except when signed with compoundpermitflag flag
2781            (sptr->getCont() && compoundpermitflag &&
2782             TESTAFF(sptr->getCont(), compoundpermitflag,
2783                     sptr->getContLen()))) &&
2784           (!circumfix ||
2785            // no circumfix flag in prefix and suffix
2786            ((!ppfx || !(ep->getCont()) ||
2787              !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2788             (!sptr->getCont() ||
2789              !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
2790            // circumfix flag in prefix AND suffix
2791            ((ppfx && (ep->getCont()) &&
2792              TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2793             (sptr->getCont() &&
2794              (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
2795           // fogemorpheme
2796           (in_compound ||
2797            !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
2798                                           sptr->getContLen()))))) &&
2799           // needaffix on prefix or first suffix
2800           (cclass ||
2801            !(sptr->getCont() &&
2802              TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
2803            (ppfx &&
2804             !((ep->getCont()) &&
2805               TESTAFF(ep->getCont(), needaffix, ep->getContLen())))))
2806         if (in_compound != IN_CPD_END || ppfx ||
2807             !(sptr->getCont() &&
2808               TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
2809           rv = sptr->checkword(word, len, sfxopts, ppfx,
2810                                cclass, needflag,
2811                                (in_compound ? 0 : onlyincompound));
2812           if (rv) {
2813             sfx = sptr;                 // BUG: sfx not stateless
2814             sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
2815             if (!sptr->getCont())
2816               sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
2817             // LANG_hu section: spec. Hungarian rule
2818             else if (langnum == LANG_hu && sptr->getKeyLen() &&
2819                      sptr->getKey()[0] == 'i' && sptr->getKey()[1] != 'y' &&
2820                      sptr->getKey()[1] != 't') {
2821               sfxextra = 1;
2822             }
2823             // END of LANG_hu section
2824             return rv;
2825           }
2826         }
2827       sptr = sptr->getNextEQ();
2828     } else {
2829       sptr = sptr->getNextNE();
2830     }
2831   }
2832 
2833   return NULL;
2834 }
2835 
2836 // check word for two-level suffixes
suffix_check_twosfx(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2837 struct hentry* AffixMgr::suffix_check_twosfx(const char* word,
2838                                              int len,
2839                                              int sfxopts,
2840                                              PfxEntry* ppfx,
2841                                              const FLAG needflag) {
2842   struct hentry* rv = NULL;
2843 
2844   // first handle the special case of 0 length suffixes
2845   SfxEntry* se = sStart[0];
2846   while (se) {
2847     if (contclasses[se->getFlag()]) {
2848       rv = se->check_twosfx(word, len, sfxopts, ppfx, needflag);
2849       if (rv)
2850         return rv;
2851     }
2852     se = se->getNext();
2853   }
2854 
2855   // now handle the general case
2856   if (len == 0)
2857     return NULL;  // FULLSTRIP
2858   unsigned char sp = *((const unsigned char*)(word + len - 1));
2859   SfxEntry* sptr = sStart[sp];
2860 
2861   while (sptr) {
2862     if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2863       if (contclasses[sptr->getFlag()]) {
2864         rv = sptr->check_twosfx(word, len, sfxopts, ppfx, needflag);
2865         if (rv) {
2866           sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
2867           if (!sptr->getCont())
2868             sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
2869           return rv;
2870         }
2871       }
2872       sptr = sptr->getNextEQ();
2873     } else {
2874       sptr = sptr->getNextNE();
2875     }
2876   }
2877 
2878   return NULL;
2879 }
2880 
2881 // check word for two-level suffixes and morph
suffix_check_twosfx_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2882 std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
2883                                                 int len,
2884                                                 int sfxopts,
2885                                                 PfxEntry* ppfx,
2886                                                 const FLAG needflag) {
2887   std::string result;
2888   std::string result2;
2889   std::string result3;
2890 
2891   // first handle the special case of 0 length suffixes
2892   SfxEntry* se = sStart[0];
2893   while (se) {
2894     if (contclasses[se->getFlag()]) {
2895       std::string st = se->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
2896       if (!st.empty()) {
2897         if (ppfx) {
2898           if (ppfx->getMorph()) {
2899             result.append(ppfx->getMorph());
2900             result.push_back(MSEP_FLD);
2901           } else
2902             debugflag(result, ppfx->getFlag());
2903         }
2904         result.append(st);
2905         if (se->getMorph()) {
2906           result.push_back(MSEP_FLD);
2907           result.append(se->getMorph());
2908         } else
2909           debugflag(result, se->getFlag());
2910         result.push_back(MSEP_REC);
2911       }
2912     }
2913     se = se->getNext();
2914   }
2915 
2916   // now handle the general case
2917   if (len == 0)
2918     return std::string();  // FULLSTRIP
2919   unsigned char sp = *((const unsigned char*)(word + len - 1));
2920   SfxEntry* sptr = sStart[sp];
2921 
2922   while (sptr) {
2923     if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2924       if (contclasses[sptr->getFlag()]) {
2925         std::string st = sptr->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
2926         if (!st.empty()) {
2927           sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
2928           if (!sptr->getCont())
2929             sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
2930           result2.assign(st);
2931 
2932           result3.clear();
2933 
2934           if (sptr->getMorph()) {
2935             result3.push_back(MSEP_FLD);
2936             result3.append(sptr->getMorph());
2937           } else
2938             debugflag(result3, sptr->getFlag());
2939           strlinecat(result2, result3);
2940           result2.push_back(MSEP_REC);
2941           result.append(result2);
2942         }
2943       }
2944       sptr = sptr->getNextEQ();
2945     } else {
2946       sptr = sptr->getNextNE();
2947     }
2948   }
2949 
2950   return result;
2951 }
2952 
suffix_check_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,char in_compound)2953 std::string AffixMgr::suffix_check_morph(const char* word,
2954                                          int len,
2955                                          int sfxopts,
2956                                          PfxEntry* ppfx,
2957                                          const FLAG cclass,
2958                                          const FLAG needflag,
2959                                          char in_compound) {
2960   std::string result;
2961 
2962   struct hentry* rv = NULL;
2963 
2964   PfxEntry* ep = ppfx;
2965 
2966   // first handle the special case of 0 length suffixes
2967   SfxEntry* se = sStart[0];
2968   while (se) {
2969     if (!cclass || se->getCont()) {
2970       // suffixes are not allowed in beginning of compounds
2971       if (((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
2972             // except when signed with compoundpermitflag flag
2973             (se->getCont() && compoundpermitflag &&
2974              TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
2975            (!circumfix ||
2976             // no circumfix flag in prefix and suffix
2977             ((!ppfx || !(ep->getCont()) ||
2978               !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2979              (!se->getCont() ||
2980               !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
2981             // circumfix flag in prefix AND suffix
2982             ((ppfx && (ep->getCont()) &&
2983               TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2984              (se->getCont() &&
2985               (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
2986            // fogemorpheme
2987            (in_compound ||
2988             !((se->getCont() &&
2989                (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
2990            // needaffix on prefix or first suffix
2991            (cclass ||
2992             !(se->getCont() &&
2993               TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2994             (ppfx &&
2995              !((ep->getCont()) &&
2996                TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))))
2997         rv = se->checkword(word, len, sfxopts, ppfx, cclass,
2998                            needflag, FLAG_NULL);
2999       while (rv) {
3000         if (ppfx) {
3001           if (ppfx->getMorph()) {
3002             result.append(ppfx->getMorph());
3003             result.push_back(MSEP_FLD);
3004           } else
3005             debugflag(result, ppfx->getFlag());
3006         }
3007         if (complexprefixes && HENTRY_DATA(rv))
3008           result.append(HENTRY_DATA2(rv));
3009         if (!HENTRY_FIND(rv, MORPH_STEM)) {
3010           result.push_back(MSEP_FLD);
3011           result.append(MORPH_STEM);
3012           result.append(HENTRY_WORD(rv));
3013         }
3014 
3015         if (!complexprefixes && HENTRY_DATA(rv)) {
3016           result.push_back(MSEP_FLD);
3017           result.append(HENTRY_DATA2(rv));
3018         }
3019         if (se->getMorph()) {
3020           result.push_back(MSEP_FLD);
3021           result.append(se->getMorph());
3022         } else
3023           debugflag(result, se->getFlag());
3024         result.push_back(MSEP_REC);
3025         rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
3026       }
3027     }
3028     se = se->getNext();
3029   }
3030 
3031   // now handle the general case
3032   if (len == 0)
3033     return std::string();  // FULLSTRIP
3034   unsigned char sp = *((const unsigned char*)(word + len - 1));
3035   SfxEntry* sptr = sStart[sp];
3036 
3037   while (sptr) {
3038     if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
3039       // suffixes are not allowed in beginning of compounds
3040       if (((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
3041             // except when signed with compoundpermitflag flag
3042             (sptr->getCont() && compoundpermitflag &&
3043              TESTAFF(sptr->getCont(), compoundpermitflag,
3044                      sptr->getContLen()))) &&
3045            (!circumfix ||
3046             // no circumfix flag in prefix and suffix
3047             ((!ppfx || !(ep->getCont()) ||
3048               !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
3049              (!sptr->getCont() ||
3050               !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
3051             // circumfix flag in prefix AND suffix
3052             ((ppfx && (ep->getCont()) &&
3053               TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
3054              (sptr->getCont() &&
3055               (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
3056            // fogemorpheme
3057            (in_compound ||
3058             !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
3059                                            sptr->getContLen()))))) &&
3060            // needaffix on first suffix
3061            (cclass ||
3062             !(sptr->getCont() &&
3063               TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))))
3064         rv = sptr->checkword(word, len, sfxopts, ppfx, cclass,
3065                              needflag, FLAG_NULL);
3066       while (rv) {
3067         if (ppfx) {
3068           if (ppfx->getMorph()) {
3069             result.append(ppfx->getMorph());
3070             result.push_back(MSEP_FLD);
3071           } else
3072             debugflag(result, ppfx->getFlag());
3073         }
3074         if (complexprefixes && HENTRY_DATA(rv))
3075           result.append(HENTRY_DATA2(rv));
3076         if (!HENTRY_FIND(rv, MORPH_STEM)) {
3077           result.push_back(MSEP_FLD);
3078           result.append(MORPH_STEM);
3079           result.append(HENTRY_WORD(rv));
3080         }
3081 
3082         if (!complexprefixes && HENTRY_DATA(rv)) {
3083           result.push_back(MSEP_FLD);
3084           result.append(HENTRY_DATA2(rv));
3085         }
3086 
3087         if (sptr->getMorph()) {
3088           result.push_back(MSEP_FLD);
3089           result.append(sptr->getMorph());
3090         } else
3091           debugflag(result, sptr->getFlag());
3092         result.push_back(MSEP_REC);
3093         rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
3094       }
3095       sptr = sptr->getNextEQ();
3096     } else {
3097       sptr = sptr->getNextNE();
3098     }
3099   }
3100 
3101   return result;
3102 }
3103 
3104 // check if word with affixes is correctly spelled
affix_check(const char * word,int len,const FLAG needflag,char in_compound)3105 struct hentry* AffixMgr::affix_check(const char* word,
3106                                      int len,
3107                                      const FLAG needflag,
3108                                      char in_compound) {
3109 
3110   // check all prefixes (also crossed with suffixes if allowed)
3111   struct hentry* rv = prefix_check(word, len, in_compound, needflag);
3112   if (rv)
3113     return rv;
3114 
3115   // if still not found check all suffixes
3116   rv = suffix_check(word, len, 0, NULL, FLAG_NULL, needflag, in_compound);
3117 
3118   if (havecontclass) {
3119     sfx = NULL;
3120     pfx = NULL;
3121 
3122     if (rv)
3123       return rv;
3124     // if still not found check all two-level suffixes
3125     rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
3126 
3127     if (rv)
3128       return rv;
3129     // if still not found check all two-level suffixes
3130     rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
3131   }
3132 
3133   return rv;
3134 }
3135 
3136 // check if word with affixes is correctly spelled
affix_check_morph(const char * word,int len,const FLAG needflag,char in_compound)3137 std::string AffixMgr::affix_check_morph(const char* word,
3138                                   int len,
3139                                   const FLAG needflag,
3140                                   char in_compound) {
3141   std::string result;
3142 
3143   // check all prefixes (also crossed with suffixes if allowed)
3144   std::string st = prefix_check_morph(word, len, in_compound);
3145   if (!st.empty()) {
3146     result.append(st);
3147   }
3148 
3149   // if still not found check all suffixes
3150   st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
3151   if (!st.empty()) {
3152     result.append(st);
3153   }
3154 
3155   if (havecontclass) {
3156     sfx = NULL;
3157     pfx = NULL;
3158     // if still not found check all two-level suffixes
3159     st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
3160     if (!st.empty()) {
3161       result.append(st);
3162     }
3163 
3164     // if still not found check all two-level suffixes
3165     st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
3166     if (!st.empty()) {
3167       result.append(st);
3168     }
3169   }
3170 
3171   return result;
3172 }
3173 
3174 // morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields
3175 // in the first line of the inputs
3176 // return 0, if inputs equal
3177 // return 1, if inputs may equal with a secondary suffix
3178 // otherwise return -1
morphcmp(const char * s,const char * t)3179 static int morphcmp(const char* s, const char* t) {
3180   int se = 0;
3181   int te = 0;
3182   const char* sl;
3183   const char* tl;
3184   const char* olds;
3185   const char* oldt;
3186   if (!s || !t)
3187     return 1;
3188   olds = s;
3189   sl = strchr(s, '\n');
3190   s = strstr(s, MORPH_DERI_SFX);
3191   if (!s || (sl && sl < s))
3192     s = strstr(olds, MORPH_INFL_SFX);
3193   if (!s || (sl && sl < s)) {
3194     s = strstr(olds, MORPH_TERM_SFX);
3195     olds = NULL;
3196   }
3197   oldt = t;
3198   tl = strchr(t, '\n');
3199   t = strstr(t, MORPH_DERI_SFX);
3200   if (!t || (tl && tl < t))
3201     t = strstr(oldt, MORPH_INFL_SFX);
3202   if (!t || (tl && tl < t)) {
3203     t = strstr(oldt, MORPH_TERM_SFX);
3204     oldt = NULL;
3205   }
3206   while (s && t && (!sl || sl > s) && (!tl || tl > t)) {
3207     s += MORPH_TAG_LEN;
3208     t += MORPH_TAG_LEN;
3209     se = 0;
3210     te = 0;
3211     while ((*s == *t) && !se && !te) {
3212       s++;
3213       t++;
3214       switch (*s) {
3215         case ' ':
3216         case '\n':
3217         case '\t':
3218         case '\0':
3219           se = 1;
3220       }
3221       switch (*t) {
3222         case ' ':
3223         case '\n':
3224         case '\t':
3225         case '\0':
3226           te = 1;
3227       }
3228     }
3229     if (!se || !te) {
3230       // not terminal suffix difference
3231       if (olds)
3232         return -1;
3233       return 1;
3234     }
3235     olds = s;
3236     s = strstr(s, MORPH_DERI_SFX);
3237     if (!s || (sl && sl < s))
3238       s = strstr(olds, MORPH_INFL_SFX);
3239     if (!s || (sl && sl < s)) {
3240       s = strstr(olds, MORPH_TERM_SFX);
3241       olds = NULL;
3242     }
3243     oldt = t;
3244     t = strstr(t, MORPH_DERI_SFX);
3245     if (!t || (tl && tl < t))
3246       t = strstr(oldt, MORPH_INFL_SFX);
3247     if (!t || (tl && tl < t)) {
3248       t = strstr(oldt, MORPH_TERM_SFX);
3249       oldt = NULL;
3250     }
3251   }
3252   if (!s && !t && se && te)
3253     return 0;
3254   return 1;
3255 }
3256 
morphgen(const char * ts,int wl,const unsigned short * ap,unsigned short al,const char * morph,const char * targetmorph,int level)3257 std::string AffixMgr::morphgen(const char* ts,
3258                                int wl,
3259                                const unsigned short* ap,
3260                                unsigned short al,
3261                                const char* morph,
3262                                const char* targetmorph,
3263                          int level) {
3264   // handle suffixes
3265   if (!morph)
3266     return std::string();
3267 
3268   // check substandard flag
3269   if (TESTAFF(ap, substandard, al))
3270     return std::string();
3271 
3272   if (morphcmp(morph, targetmorph) == 0)
3273     return ts;
3274 
3275   size_t stemmorphcatpos;
3276   std::string mymorph;
3277 
3278   // use input suffix fields, if exist
3279   if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
3280     mymorph.assign(morph);
3281     mymorph.push_back(MSEP_FLD);
3282     stemmorphcatpos = mymorph.size();
3283   } else {
3284     stemmorphcatpos = std::string::npos;
3285   }
3286 
3287   for (int i = 0; i < al; i++) {
3288     const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
3289     SfxEntry* sptr = sFlag[c];
3290     while (sptr) {
3291       if (sptr->getFlag() == ap[i] && sptr->getMorph() &&
3292           ((sptr->getContLen() == 0) ||
3293            // don't generate forms with substandard affixes
3294            !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
3295         const char* stemmorph;
3296         if (stemmorphcatpos != std::string::npos) {
3297           mymorph.replace(stemmorphcatpos, std::string::npos, sptr->getMorph());
3298           stemmorph = mymorph.c_str();
3299         } else {
3300           stemmorph = sptr->getMorph();
3301         }
3302 
3303         int cmp = morphcmp(stemmorph, targetmorph);
3304 
3305         if (cmp == 0) {
3306           std::string newword = sptr->add(ts, wl);
3307           if (!newword.empty()) {
3308             hentry* check = pHMgr->lookup(newword.c_str());  // XXX extra dic
3309             if (!check || !check->astr ||
3310                 !(TESTAFF(check->astr, forbiddenword, check->alen) ||
3311                   TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {
3312               return newword;
3313             }
3314           }
3315         }
3316 
3317         // recursive call for secondary suffixes
3318         if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
3319             !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
3320           std::string newword = sptr->add(ts, wl);
3321           if (!newword.empty()) {
3322             std::string newword2 =
3323                 morphgen(newword.c_str(), newword.size(), sptr->getCont(),
3324                          sptr->getContLen(), stemmorph, targetmorph, 1);
3325 
3326             if (!newword2.empty()) {
3327               return newword2;
3328             }
3329           }
3330         }
3331       }
3332       sptr = sptr->getFlgNxt();
3333     }
3334   }
3335   return std::string();
3336 }
3337 
expand_rootword(struct guessword * wlst,int maxn,const char * ts,int wl,const unsigned short * ap,unsigned short al,const char * bad,int badl,const char * phon)3338 int AffixMgr::expand_rootword(struct guessword* wlst,
3339                               int maxn,
3340                               const char* ts,
3341                               int wl,
3342                               const unsigned short* ap,
3343                               unsigned short al,
3344                               const char* bad,
3345                               int badl,
3346                               const char* phon) {
3347   int nh = 0;
3348   // first add root word to list
3349   if ((nh < maxn) &&
3350       !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
3351                (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
3352     wlst[nh].word = mystrdup(ts);
3353     if (!wlst[nh].word)
3354       return 0;
3355     wlst[nh].allow = false;
3356     wlst[nh].orig = NULL;
3357     nh++;
3358     // add special phonetic version
3359     if (phon && (nh < maxn)) {
3360       wlst[nh].word = mystrdup(phon);
3361       if (!wlst[nh].word)
3362         return nh - 1;
3363       wlst[nh].allow = false;
3364       wlst[nh].orig = mystrdup(ts);
3365       if (!wlst[nh].orig)
3366         return nh - 1;
3367       nh++;
3368     }
3369   }
3370 
3371   // handle suffixes
3372   for (int i = 0; i < al; i++) {
3373     const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
3374     SfxEntry* sptr = sFlag[c];
3375     while (sptr) {
3376       if ((sptr->getFlag() == ap[i]) &&
3377           (!sptr->getKeyLen() ||
3378            ((badl > sptr->getKeyLen()) &&
3379             (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
3380           // check needaffix flag
3381           !(sptr->getCont() &&
3382             ((needaffix &&
3383               TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
3384              (circumfix &&
3385               TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
3386              (onlyincompound &&
3387               TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) {
3388         std::string newword = sptr->add(ts, wl);
3389         if (!newword.empty()) {
3390           if (nh < maxn) {
3391             wlst[nh].word = mystrdup(newword.c_str());
3392             wlst[nh].allow = sptr->allowCross();
3393             wlst[nh].orig = NULL;
3394             nh++;
3395             // add special phonetic version
3396             if (phon && (nh < maxn)) {
3397               std::string prefix(phon);
3398               std::string key(sptr->getKey());
3399               reverseword(key);
3400               prefix.append(key);
3401               wlst[nh].word = mystrdup(prefix.c_str());
3402               if (!wlst[nh].word)
3403                 return nh - 1;
3404               wlst[nh].allow = false;
3405               wlst[nh].orig = mystrdup(newword.c_str());
3406               if (!wlst[nh].orig)
3407                 return nh - 1;
3408               nh++;
3409             }
3410           }
3411         }
3412       }
3413       sptr = sptr->getFlgNxt();
3414     }
3415   }
3416 
3417   int n = nh;
3418 
3419   // handle cross products of prefixes and suffixes
3420   for (int j = 1; j < n; j++)
3421     if (wlst[j].allow) {
3422       for (int k = 0; k < al; k++) {
3423         const unsigned char c = (unsigned char)(ap[k] & 0x00FF);
3424         PfxEntry* cptr = pFlag[c];
3425         while (cptr) {
3426           if ((cptr->getFlag() == ap[k]) && cptr->allowCross() &&
3427               (!cptr->getKeyLen() ||
3428                ((badl > cptr->getKeyLen()) &&
3429                 (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
3430             int l1 = strlen(wlst[j].word);
3431             std::string newword = cptr->add(wlst[j].word, l1);
3432             if (!newword.empty()) {
3433               if (nh < maxn) {
3434                 wlst[nh].word = mystrdup(newword.c_str());
3435                 wlst[nh].allow = cptr->allowCross();
3436                 wlst[nh].orig = NULL;
3437                 nh++;
3438               }
3439             }
3440           }
3441           cptr = cptr->getFlgNxt();
3442         }
3443       }
3444     }
3445 
3446   // now handle pure prefixes
3447   for (int m = 0; m < al; m++) {
3448     const unsigned char c = (unsigned char)(ap[m] & 0x00FF);
3449     PfxEntry* ptr = pFlag[c];
3450     while (ptr) {
3451       if ((ptr->getFlag() == ap[m]) &&
3452           (!ptr->getKeyLen() ||
3453            ((badl > ptr->getKeyLen()) &&
3454             (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
3455           // check needaffix flag
3456           !(ptr->getCont() &&
3457             ((needaffix &&
3458               TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
3459              (circumfix &&
3460               TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
3461              (onlyincompound &&
3462               TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))) {
3463         std::string newword = ptr->add(ts, wl);
3464         if (!newword.empty()) {
3465           if (nh < maxn) {
3466             wlst[nh].word = mystrdup(newword.c_str());
3467             wlst[nh].allow = ptr->allowCross();
3468             wlst[nh].orig = NULL;
3469             nh++;
3470           }
3471         }
3472       }
3473       ptr = ptr->getFlgNxt();
3474     }
3475   }
3476 
3477   return nh;
3478 }
3479 
3480 // return replacing table
get_reptable() const3481 const std::vector<replentry>& AffixMgr::get_reptable() const {
3482   return pHMgr->get_reptable();
3483 }
3484 
3485 // return iconv table
get_iconvtable() const3486 RepList* AffixMgr::get_iconvtable() const {
3487   if (!iconvtable)
3488     return NULL;
3489   return iconvtable;
3490 }
3491 
3492 // return oconv table
get_oconvtable() const3493 RepList* AffixMgr::get_oconvtable() const {
3494   if (!oconvtable)
3495     return NULL;
3496   return oconvtable;
3497 }
3498 
3499 // return replacing table
get_phonetable() const3500 struct phonetable* AffixMgr::get_phonetable() const {
3501   if (!phone)
3502     return NULL;
3503   return phone;
3504 }
3505 
3506 // return character map table
get_maptable() const3507 const std::vector<mapentry>& AffixMgr::get_maptable() const {
3508   return maptable;
3509 }
3510 
3511 // return character map table
get_breaktable() const3512 const std::vector<std::string>& AffixMgr::get_breaktable() const {
3513   return breaktable;
3514 }
3515 
3516 // return text encoding of dictionary
get_encoding()3517 const std::string& AffixMgr::get_encoding() {
3518   if (encoding.empty())
3519     encoding = SPELL_ENCODING;
3520   return encoding;
3521 }
3522 
3523 // return text encoding of dictionary
get_langnum() const3524 int AffixMgr::get_langnum() const {
3525   return langnum;
3526 }
3527 
3528 // return double prefix option
get_complexprefixes() const3529 int AffixMgr::get_complexprefixes() const {
3530   return complexprefixes;
3531 }
3532 
3533 // return FULLSTRIP option
get_fullstrip() const3534 int AffixMgr::get_fullstrip() const {
3535   return fullstrip;
3536 }
3537 
get_keepcase() const3538 FLAG AffixMgr::get_keepcase() const {
3539   return keepcase;
3540 }
3541 
get_forceucase() const3542 FLAG AffixMgr::get_forceucase() const {
3543   return forceucase;
3544 }
3545 
get_warn() const3546 FLAG AffixMgr::get_warn() const {
3547   return warn;
3548 }
3549 
get_forbidwarn() const3550 int AffixMgr::get_forbidwarn() const {
3551   return forbidwarn;
3552 }
3553 
get_checksharps() const3554 int AffixMgr::get_checksharps() const {
3555   return checksharps;
3556 }
3557 
encode_flag(unsigned short aflag) const3558 char* AffixMgr::encode_flag(unsigned short aflag) const {
3559   return pHMgr->encode_flag(aflag);
3560 }
3561 
3562 // return the preferred ignore string for suggestions
get_ignore() const3563 const char* AffixMgr::get_ignore() const {
3564   if (ignorechars.empty())
3565     return NULL;
3566   return ignorechars.c_str();
3567 }
3568 
3569 // return the preferred ignore string for suggestions
get_ignore_utf16() const3570 const std::vector<w_char>& AffixMgr::get_ignore_utf16() const {
3571   return ignorechars_utf16;
3572 }
3573 
3574 // return the keyboard string for suggestions
get_key_string()3575 char* AffixMgr::get_key_string() {
3576   if (keystring.empty())
3577     keystring = SPELL_KEYSTRING;
3578   return mystrdup(keystring.c_str());
3579 }
3580 
3581 // return the preferred try string for suggestions
get_try_string() const3582 char* AffixMgr::get_try_string() const {
3583   if (trystring.empty())
3584     return NULL;
3585   return mystrdup(trystring.c_str());
3586 }
3587 
3588 // return the preferred try string for suggestions
get_wordchars() const3589 const std::string& AffixMgr::get_wordchars() const {
3590   return wordchars;
3591 }
3592 
get_wordchars_utf16() const3593 const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const {
3594   return wordchars_utf16;
3595 }
3596 
3597 // is there compounding?
get_compound() const3598 int AffixMgr::get_compound() const {
3599   return compoundflag || compoundbegin || !defcpdtable.empty();
3600 }
3601 
3602 // return the compound words control flag
get_compoundflag() const3603 FLAG AffixMgr::get_compoundflag() const {
3604   return compoundflag;
3605 }
3606 
3607 // return the forbidden words control flag
get_forbiddenword() const3608 FLAG AffixMgr::get_forbiddenword() const {
3609   return forbiddenword;
3610 }
3611 
3612 // return the forbidden words control flag
get_nosuggest() const3613 FLAG AffixMgr::get_nosuggest() const {
3614   return nosuggest;
3615 }
3616 
3617 // return the forbidden words control flag
get_nongramsuggest() const3618 FLAG AffixMgr::get_nongramsuggest() const {
3619   return nongramsuggest;
3620 }
3621 
3622 // return the substandard root/affix control flag
get_substandard() const3623 FLAG AffixMgr::get_substandard() const {
3624   return substandard;
3625 }
3626 
3627 // return the forbidden words flag modify flag
get_needaffix() const3628 FLAG AffixMgr::get_needaffix() const {
3629   return needaffix;
3630 }
3631 
3632 // return the onlyincompound flag
get_onlyincompound() const3633 FLAG AffixMgr::get_onlyincompound() const {
3634   return onlyincompound;
3635 }
3636 
3637 // return the value of suffix
get_version() const3638 const std::string& AffixMgr::get_version() const {
3639   return version;
3640 }
3641 
3642 // utility method to look up root words in hash table
lookup(const char * word)3643 struct hentry* AffixMgr::lookup(const char* word) {
3644   struct hentry* he = NULL;
3645   for (size_t i = 0; i < alldic.size() && !he; ++i) {
3646     he = alldic[i]->lookup(word);
3647   }
3648   return he;
3649 }
3650 
3651 // return the value of suffix
have_contclass() const3652 int AffixMgr::have_contclass() const {
3653   return havecontclass;
3654 }
3655 
3656 // return utf8
get_utf8() const3657 int AffixMgr::get_utf8() const {
3658   return utf8;
3659 }
3660 
get_maxngramsugs(void) const3661 int AffixMgr::get_maxngramsugs(void) const {
3662   return maxngramsugs;
3663 }
3664 
get_maxcpdsugs(void) const3665 int AffixMgr::get_maxcpdsugs(void) const {
3666   return maxcpdsugs;
3667 }
3668 
get_maxdiff(void) const3669 int AffixMgr::get_maxdiff(void) const {
3670   return maxdiff;
3671 }
3672 
get_onlymaxdiff(void) const3673 int AffixMgr::get_onlymaxdiff(void) const {
3674   return onlymaxdiff;
3675 }
3676 
3677 // return nosplitsugs
get_nosplitsugs(void) const3678 int AffixMgr::get_nosplitsugs(void) const {
3679   return nosplitsugs;
3680 }
3681 
3682 // return sugswithdots
get_sugswithdots(void) const3683 int AffixMgr::get_sugswithdots(void) const {
3684   return sugswithdots;
3685 }
3686 
3687 /* parse flag */
parse_flag(const std::string & line,unsigned short * out,FileMgr * af)3688 bool AffixMgr::parse_flag(const std::string& line, unsigned short* out, FileMgr* af) {
3689   if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
3690     HUNSPELL_WARNING(
3691         stderr,
3692         "error: line %d: multiple definitions of an affix file parameter\n",
3693         af->getlinenum());
3694     return false;
3695   }
3696   std::string s;
3697   if (!parse_string(line, s, af->getlinenum()))
3698     return false;
3699   *out = pHMgr->decode_flag(s.c_str());
3700   return true;
3701 }
3702 
3703 /* parse num */
parse_num(const std::string & line,int * out,FileMgr * af)3704 bool AffixMgr::parse_num(const std::string& line, int* out, FileMgr* af) {
3705   if (*out != -1) {
3706     HUNSPELL_WARNING(
3707         stderr,
3708         "error: line %d: multiple definitions of an affix file parameter\n",
3709         af->getlinenum());
3710     return false;
3711   }
3712   std::string s;
3713   if (!parse_string(line, s, af->getlinenum()))
3714     return false;
3715   *out = atoi(s.c_str());
3716   return true;
3717 }
3718 
3719 /* parse in the max syllablecount of compound words and  */
parse_cpdsyllable(const std::string & line,FileMgr * af)3720 bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) {
3721   int i = 0;
3722   int np = 0;
3723   std::string::const_iterator iter = line.begin();
3724   std::string::const_iterator start_piece = mystrsep(line, iter);
3725   while (start_piece != line.end()) {
3726     switch (i) {
3727       case 0: {
3728         np++;
3729         break;
3730       }
3731       case 1: {
3732         cpdmaxsyllable = atoi(std::string(start_piece, iter).c_str());
3733         np++;
3734         break;
3735       }
3736       case 2: {
3737         if (!utf8) {
3738           cpdvowels.assign(start_piece, iter);
3739           std::sort(cpdvowels.begin(), cpdvowels.end());
3740         } else {
3741           std::string piece(start_piece, iter);
3742           u8_u16(cpdvowels_utf16, piece);
3743           std::sort(cpdvowels_utf16.begin(), cpdvowels_utf16.end());
3744         }
3745         np++;
3746         break;
3747       }
3748       default:
3749         break;
3750     }
3751     ++i;
3752     start_piece = mystrsep(line, iter);
3753   }
3754   if (np < 2) {
3755     HUNSPELL_WARNING(stderr,
3756                      "error: line %d: missing compoundsyllable information\n",
3757                      af->getlinenum());
3758     return false;
3759   }
3760   if (np == 2)
3761     cpdvowels = "AEIOUaeiou";
3762   return true;
3763 }
3764 
parse_convtable(const std::string & line,FileMgr * af,RepList ** rl,const std::string & keyword)3765 bool AffixMgr::parse_convtable(const std::string& line,
3766                               FileMgr* af,
3767                               RepList** rl,
3768                               const std::string& keyword) {
3769   if (*rl) {
3770     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3771                      af->getlinenum());
3772     return false;
3773   }
3774   int i = 0;
3775   int np = 0;
3776   int numrl = 0;
3777   std::string::const_iterator iter = line.begin();
3778   std::string::const_iterator start_piece = mystrsep(line, iter);
3779   while (start_piece != line.end()) {
3780     switch (i) {
3781       case 0: {
3782         np++;
3783         break;
3784       }
3785       case 1: {
3786         numrl = atoi(std::string(start_piece, iter).c_str());
3787         if (numrl < 1) {
3788           HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
3789                            af->getlinenum());
3790           return false;
3791         }
3792         *rl = new RepList(numrl);
3793         if (!*rl)
3794           return false;
3795         np++;
3796         break;
3797       }
3798       default:
3799         break;
3800     }
3801     ++i;
3802     start_piece = mystrsep(line, iter);
3803   }
3804   if (np != 2) {
3805     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3806                      af->getlinenum());
3807     return false;
3808   }
3809 
3810   /* now parse the num lines to read in the remainder of the table */
3811   for (int j = 0; j < numrl; j++) {
3812     std::string nl;
3813     if (!af->getline(nl))
3814       return false;
3815     mychomp(nl);
3816     i = 0;
3817     std::string pattern;
3818     std::string pattern2;
3819     iter = nl.begin();
3820     start_piece = mystrsep(nl, iter);
3821     while (start_piece != nl.end()) {
3822       {
3823         switch (i) {
3824           case 0: {
3825             if (nl.compare(start_piece - nl.begin(), keyword.size(), keyword, 0, keyword.size()) != 0) {
3826               HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3827                                af->getlinenum());
3828               delete *rl;
3829               *rl = NULL;
3830               return false;
3831             }
3832             break;
3833           }
3834           case 1: {
3835             pattern.assign(start_piece, iter);
3836             break;
3837           }
3838           case 2: {
3839             pattern2.assign(start_piece, iter);
3840             break;
3841           }
3842           default:
3843             break;
3844         }
3845         ++i;
3846       }
3847       start_piece = mystrsep(nl, iter);
3848     }
3849     if (pattern.empty() || pattern2.empty()) {
3850       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3851                        af->getlinenum());
3852       return false;
3853     }
3854     (*rl)->add(pattern, pattern2);
3855   }
3856   return true;
3857 }
3858 
3859 /* parse in the typical fault correcting table */
parse_phonetable(const std::string & line,FileMgr * af)3860 bool AffixMgr::parse_phonetable(const std::string& line, FileMgr* af) {
3861   if (phone) {
3862     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3863                      af->getlinenum());
3864     return false;
3865   }
3866   int num = -1;
3867   int i = 0;
3868   int np = 0;
3869   std::string::const_iterator iter = line.begin();
3870   std::string::const_iterator start_piece = mystrsep(line, iter);
3871   while (start_piece != line.end()) {
3872     switch (i) {
3873       case 0: {
3874         np++;
3875         break;
3876       }
3877       case 1: {
3878         num = atoi(std::string(start_piece, iter).c_str());
3879         if (num < 1) {
3880           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
3881                            af->getlinenum());
3882           return false;
3883         }
3884         phone = new phonetable;
3885         phone->utf8 = (char)utf8;
3886         np++;
3887         break;
3888       }
3889       default:
3890         break;
3891     }
3892     ++i;
3893     start_piece = mystrsep(line, iter);
3894   }
3895   if (np != 2) {
3896     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3897                      af->getlinenum());
3898     return false;
3899   }
3900 
3901   /* now parse the phone->num lines to read in the remainder of the table */
3902   for (int j = 0; j < num; ++j) {
3903     std::string nl;
3904     if (!af->getline(nl))
3905       return false;
3906     mychomp(nl);
3907     i = 0;
3908     const size_t old_size = phone->rules.size();
3909     iter = nl.begin();
3910     start_piece = mystrsep(nl, iter);
3911     while (start_piece != nl.end()) {
3912       {
3913         switch (i) {
3914           case 0: {
3915             if (nl.compare(start_piece - nl.begin(), 5, "PHONE", 5) != 0) {
3916               HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3917                                af->getlinenum());
3918               return false;
3919             }
3920             break;
3921           }
3922           case 1: {
3923             phone->rules.push_back(std::string(start_piece, iter));
3924             break;
3925           }
3926           case 2: {
3927             phone->rules.push_back(std::string(start_piece, iter));
3928             mystrrep(phone->rules.back(), "_", "");
3929             break;
3930           }
3931           default:
3932             break;
3933         }
3934         ++i;
3935       }
3936       start_piece = mystrsep(nl, iter);
3937     }
3938     if (phone->rules.size() != old_size + 2) {
3939       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3940                        af->getlinenum());
3941       phone->rules.clear();
3942       return false;
3943     }
3944   }
3945   phone->rules.push_back("");
3946   phone->rules.push_back("");
3947   init_phonet_hash(*phone);
3948   return true;
3949 }
3950 
3951 /* parse in the checkcompoundpattern table */
parse_checkcpdtable(const std::string & line,FileMgr * af)3952 bool AffixMgr::parse_checkcpdtable(const std::string& line, FileMgr* af) {
3953   if (parsedcheckcpd) {
3954     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3955                      af->getlinenum());
3956     return false;
3957   }
3958   parsedcheckcpd = true;
3959   int numcheckcpd = -1;
3960   int i = 0;
3961   int np = 0;
3962   std::string::const_iterator iter = line.begin();
3963   std::string::const_iterator start_piece = mystrsep(line, iter);
3964   while (start_piece != line.end()) {
3965     switch (i) {
3966       case 0: {
3967         np++;
3968         break;
3969       }
3970       case 1: {
3971         numcheckcpd = atoi(std::string(start_piece, iter).c_str());
3972         if (numcheckcpd < 1) {
3973           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
3974                            af->getlinenum());
3975           return false;
3976         }
3977         checkcpdtable.reserve(numcheckcpd);
3978         np++;
3979         break;
3980       }
3981       default:
3982         break;
3983     }
3984     ++i;
3985     start_piece = mystrsep(line, iter);
3986   }
3987   if (np != 2) {
3988     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3989                      af->getlinenum());
3990     return false;
3991   }
3992 
3993   /* now parse the numcheckcpd lines to read in the remainder of the table */
3994   for (int j = 0; j < numcheckcpd; ++j) {
3995     std::string nl;
3996     if (!af->getline(nl))
3997       return false;
3998     mychomp(nl);
3999     i = 0;
4000     checkcpdtable.push_back(patentry());
4001     iter = nl.begin();
4002     start_piece = mystrsep(nl, iter);
4003     while (start_piece != nl.end()) {
4004       switch (i) {
4005         case 0: {
4006           if (nl.compare(start_piece - nl.begin(), 20, "CHECKCOMPOUNDPATTERN", 20) != 0) {
4007             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4008                              af->getlinenum());
4009             return false;
4010           }
4011           break;
4012         }
4013         case 1: {
4014           checkcpdtable.back().pattern.assign(start_piece, iter);
4015           size_t slash_pos = checkcpdtable.back().pattern.find('/');
4016           if (slash_pos != std::string::npos) {
4017             std::string chunk(checkcpdtable.back().pattern, slash_pos + 1);
4018             checkcpdtable.back().pattern.resize(slash_pos);
4019             checkcpdtable.back().cond = pHMgr->decode_flag(chunk.c_str());
4020           }
4021           break;
4022         }
4023         case 2: {
4024           checkcpdtable.back().pattern2.assign(start_piece, iter);
4025           size_t slash_pos = checkcpdtable.back().pattern2.find('/');
4026           if (slash_pos != std::string::npos) {
4027             std::string chunk(checkcpdtable.back().pattern2, slash_pos + 1);
4028             checkcpdtable.back().pattern2.resize(slash_pos);
4029             checkcpdtable.back().cond2 = pHMgr->decode_flag(chunk.c_str());
4030           }
4031           break;
4032         }
4033         case 3: {
4034           checkcpdtable.back().pattern3.assign(start_piece, iter);
4035           simplifiedcpd = 1;
4036           break;
4037         }
4038         default:
4039           break;
4040       }
4041       i++;
4042       start_piece = mystrsep(nl, iter);
4043     }
4044   }
4045   return true;
4046 }
4047 
4048 /* parse in the compound rule table */
parse_defcpdtable(const std::string & line,FileMgr * af)4049 bool AffixMgr::parse_defcpdtable(const std::string& line, FileMgr* af) {
4050   if (parseddefcpd) {
4051     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4052                      af->getlinenum());
4053     return false;
4054   }
4055   parseddefcpd = true;
4056   int numdefcpd = -1;
4057   int i = 0;
4058   int np = 0;
4059   std::string::const_iterator iter = line.begin();
4060   std::string::const_iterator start_piece = mystrsep(line, iter);
4061   while (start_piece != line.end()) {
4062     switch (i) {
4063       case 0: {
4064         np++;
4065         break;
4066       }
4067       case 1: {
4068         numdefcpd = atoi(std::string(start_piece, iter).c_str());
4069         if (numdefcpd < 1) {
4070           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4071                            af->getlinenum());
4072           return false;
4073         }
4074         defcpdtable.reserve(numdefcpd);
4075         np++;
4076         break;
4077       }
4078       default:
4079         break;
4080     }
4081     ++i;
4082     start_piece = mystrsep(line, iter);
4083   }
4084   if (np != 2) {
4085     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4086                      af->getlinenum());
4087     return false;
4088   }
4089 
4090   /* now parse the numdefcpd lines to read in the remainder of the table */
4091   for (int j = 0; j < numdefcpd; ++j) {
4092     std::string nl;
4093     if (!af->getline(nl))
4094       return false;
4095     mychomp(nl);
4096     i = 0;
4097     defcpdtable.push_back(flagentry());
4098     iter = nl.begin();
4099     start_piece = mystrsep(nl, iter);
4100     while (start_piece != nl.end()) {
4101       switch (i) {
4102         case 0: {
4103           if (nl.compare(start_piece - nl.begin(), 12, "COMPOUNDRULE", 12) != 0) {
4104             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4105                              af->getlinenum());
4106             numdefcpd = 0;
4107             return false;
4108           }
4109           break;
4110         }
4111         case 1: {  // handle parenthesized flags
4112           if (std::find(start_piece, iter, '(') != iter) {
4113             for (std::string::const_iterator k = start_piece; k != iter; ++k) {
4114               std::string::const_iterator chb = k;
4115               std::string::const_iterator che = k + 1;
4116               if (*k == '(') {
4117                 std::string::const_iterator parpos = std::find(k, iter, ')');
4118                 if (parpos != iter) {
4119                   chb = k + 1;
4120                   che = parpos;
4121                   k = parpos;
4122                 }
4123               }
4124 
4125               if (*chb == '*' || *chb == '?') {
4126                 defcpdtable.back().push_back((FLAG)*chb);
4127               } else {
4128                 pHMgr->decode_flags(defcpdtable.back(), std::string(chb, che), af);
4129               }
4130             }
4131           } else {
4132             pHMgr->decode_flags(defcpdtable.back(), std::string(start_piece, iter), af);
4133           }
4134           break;
4135         }
4136         default:
4137           break;
4138       }
4139       ++i;
4140       start_piece = mystrsep(nl, iter);
4141     }
4142     if (defcpdtable.back().empty()) {
4143       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4144                        af->getlinenum());
4145       return false;
4146     }
4147   }
4148   return true;
4149 }
4150 
4151 /* parse in the character map table */
parse_maptable(const std::string & line,FileMgr * af)4152 bool AffixMgr::parse_maptable(const std::string& line, FileMgr* af) {
4153   if (parsedmaptable) {
4154     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4155                      af->getlinenum());
4156     return false;
4157   }
4158   parsedmaptable = true;
4159   int nummap = -1;
4160   int i = 0;
4161   int np = 0;
4162   std::string::const_iterator iter = line.begin();
4163   std::string::const_iterator start_piece = mystrsep(line, iter);
4164   while (start_piece != line.end()) {
4165     switch (i) {
4166       case 0: {
4167         np++;
4168         break;
4169       }
4170       case 1: {
4171         nummap = atoi(std::string(start_piece, iter).c_str());
4172         if (nummap < 1) {
4173           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4174                            af->getlinenum());
4175           return false;
4176         }
4177         maptable.reserve(nummap);
4178         np++;
4179         break;
4180       }
4181       default:
4182         break;
4183     }
4184     ++i;
4185     start_piece = mystrsep(line, iter);
4186   }
4187   if (np != 2) {
4188     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4189                      af->getlinenum());
4190     return false;
4191   }
4192 
4193   /* now parse the nummap lines to read in the remainder of the table */
4194   for (int j = 0; j < nummap; ++j) {
4195     std::string nl;
4196     if (!af->getline(nl))
4197       return false;
4198     mychomp(nl);
4199     i = 0;
4200     maptable.push_back(mapentry());
4201     iter = nl.begin();
4202     start_piece = mystrsep(nl, iter);
4203     while (start_piece != nl.end()) {
4204       switch (i) {
4205         case 0: {
4206           if (nl.compare(start_piece - nl.begin(), 3, "MAP", 3) != 0) {
4207             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4208                              af->getlinenum());
4209             nummap = 0;
4210             return false;
4211           }
4212           break;
4213         }
4214         case 1: {
4215           for (std::string::const_iterator k = start_piece; k != iter; ++k) {
4216             std::string::const_iterator chb = k;
4217             std::string::const_iterator che = k + 1;
4218             if (*k == '(') {
4219               std::string::const_iterator parpos = std::find(k, iter, ')');
4220               if (parpos != iter) {
4221                 chb = k + 1;
4222                 che = parpos;
4223                 k = parpos;
4224               }
4225             } else {
4226               if (utf8 && (*k & 0xc0) == 0xc0) {
4227                 ++k;
4228                 while (k != iter && (*k & 0xc0) == 0x80)
4229                     ++k;
4230                 che = k;
4231                 --k;
4232               }
4233             }
4234             maptable.back().push_back(std::string(chb, che));
4235           }
4236           break;
4237         }
4238         default:
4239           break;
4240       }
4241       ++i;
4242       start_piece = mystrsep(nl, iter);
4243     }
4244     if (maptable.back().empty()) {
4245       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4246                        af->getlinenum());
4247       return false;
4248     }
4249   }
4250   return true;
4251 }
4252 
4253 /* parse in the word breakpoint table */
parse_breaktable(const std::string & line,FileMgr * af)4254 bool AffixMgr::parse_breaktable(const std::string& line, FileMgr* af) {
4255   if (parsedbreaktable) {
4256     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4257                      af->getlinenum());
4258     return false;
4259   }
4260   parsedbreaktable = true;
4261   int numbreak = -1;
4262   int i = 0;
4263   int np = 0;
4264   std::string::const_iterator iter = line.begin();
4265   std::string::const_iterator start_piece = mystrsep(line, iter);
4266   while (start_piece != line.end()) {
4267     switch (i) {
4268       case 0: {
4269         np++;
4270         break;
4271       }
4272       case 1: {
4273         numbreak = atoi(std::string(start_piece, iter).c_str());
4274         if (numbreak < 0) {
4275           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4276                            af->getlinenum());
4277           return false;
4278         }
4279         if (numbreak == 0)
4280           return true;
4281         breaktable.reserve(numbreak);
4282         np++;
4283         break;
4284       }
4285       default:
4286         break;
4287     }
4288     ++i;
4289     start_piece = mystrsep(line, iter);
4290   }
4291   if (np != 2) {
4292     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4293                      af->getlinenum());
4294     return false;
4295   }
4296 
4297   /* now parse the numbreak lines to read in the remainder of the table */
4298   for (int j = 0; j < numbreak; ++j) {
4299     std::string nl;
4300     if (!af->getline(nl))
4301       return false;
4302     mychomp(nl);
4303     i = 0;
4304     iter = nl.begin();
4305     start_piece = mystrsep(nl, iter);
4306     while (start_piece != nl.end()) {
4307       switch (i) {
4308         case 0: {
4309           if (nl.compare(start_piece - nl.begin(), 5, "BREAK", 5) != 0) {
4310             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4311                              af->getlinenum());
4312             numbreak = 0;
4313             return false;
4314           }
4315           break;
4316         }
4317         case 1: {
4318           breaktable.push_back(std::string(start_piece, iter));
4319           break;
4320         }
4321         default:
4322           break;
4323       }
4324       ++i;
4325       start_piece = mystrsep(nl, iter);
4326     }
4327   }
4328 
4329   if (breaktable.size() != static_cast<size_t>(numbreak)) {
4330     HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4331                      af->getlinenum());
4332     return false;
4333   }
4334 
4335   return true;
4336 }
4337 
reverse_condition(std::string & piece)4338 void AffixMgr::reverse_condition(std::string& piece) {
4339   if (piece.empty())
4340       return;
4341 
4342   int neg = 0;
4343   for (std::string::reverse_iterator k = piece.rbegin(); k != piece.rend(); ++k) {
4344     switch (*k) {
4345       case '[': {
4346         if (neg)
4347           *(k - 1) = '[';
4348         else
4349           *k = ']';
4350         break;
4351       }
4352       case ']': {
4353         *k = '[';
4354         if (neg)
4355           *(k - 1) = '^';
4356         neg = 0;
4357         break;
4358       }
4359       case '^': {
4360         if (*(k - 1) == ']')
4361           neg = 1;
4362         else
4363           *(k - 1) = *k;
4364         break;
4365       }
4366       default: {
4367         if (neg)
4368           *(k - 1) = *k;
4369       }
4370     }
4371   }
4372 }
4373 
4374 class entries_container {
4375   std::vector<AffEntry*> entries;
4376   AffixMgr* m_mgr;
4377   char m_at;
4378 public:
entries_container(char at,AffixMgr * mgr)4379   entries_container(char at, AffixMgr* mgr)
4380     : m_mgr(mgr)
4381     , m_at(at) {
4382   }
release()4383   void release() {
4384     entries.clear();
4385   }
initialize(int numents,char opts,unsigned short aflag)4386   void initialize(int numents,
4387                   char opts, unsigned short aflag) {
4388     entries.reserve(numents);
4389 
4390     if (m_at == 'P') {
4391       entries.push_back(new PfxEntry(m_mgr));
4392     } else {
4393       entries.push_back(new SfxEntry(m_mgr));
4394     }
4395 
4396     entries.back()->opts = opts;
4397     entries.back()->aflag = aflag;
4398   }
4399 
add_entry(char opts)4400   AffEntry* add_entry(char opts) {
4401     if (m_at == 'P') {
4402       entries.push_back(new PfxEntry(m_mgr));
4403     } else {
4404       entries.push_back(new SfxEntry(m_mgr));
4405     }
4406     AffEntry* ret = entries.back();
4407     ret->opts = entries[0]->opts & opts;
4408     return ret;
4409   }
4410 
first_entry()4411   AffEntry* first_entry() {
4412     return entries.empty() ? NULL : entries[0];
4413   }
4414 
~entries_container()4415   ~entries_container() {
4416     for (size_t i = 0; i < entries.size(); ++i) {
4417         delete entries[i];
4418     }
4419   }
4420 
begin()4421   std::vector<AffEntry*>::iterator begin() { return entries.begin(); }
end()4422   std::vector<AffEntry*>::iterator end() { return entries.end(); }
4423 };
4424 
// Parse one affix block: the header line passed in `line` plus the entry
// lines that follow it in `af`, and link the resulting entries into the
// prefix or suffix tree.
//
// line     - header line; its pieces (as split by mystrsep) are the type
//            keyword, the affix flag, the cross product indicator and the
//            number of entry lines
// at       - 'P' builds PfxEntry objects and calls build_pfxtree(); any
//            other value builds SfxEntry objects and calls build_sfxtree()
// af       - source of the entry lines and of line numbers for messages
// dupflags - table indexed by the decoded flag, used to notice a flag that
//            is defined more than once (this only produces a warning)
//
// Returns false when the entry count is not positive, the header does not
// have four pieces, an entry line cannot be read, an entry's flag differs
// from the header's, encodeit() returns non-zero, mystrdup() fails, or an
// entry line has fewer than four pieces.  Returns true otherwise.
bool AffixMgr::parse_affix(const std::string& line,
                          const char at,
                          FileMgr* af,
                          char* dupflags) {
  int numents = 0;  // number of AffEntry structures to parse

  unsigned short aflag = 0;  // affix char identifier

  char ff = 0;  // option bits gathered from the header (aeXPRODUCT or 0)
  // owns the entries until they are handed over at the end of the function;
  // on any early return its destructor deletes them
  entries_container affentries(at, this);

  int i = 0;  // index of the piece currently examined

// checking lines with bad syntax
#ifdef DEBUG
  int basefieldnum = 0;
#endif

  // split affix header line into pieces

  int np = 0;  // number of recognised pieces
  std::string::const_iterator iter = line.begin();
  std::string::const_iterator start_piece = mystrsep(line, iter);
  while (start_piece != line.end()) {
    switch (i) {
      // piece 1 - is type of affix
      case 0: {
        np++;
        break;
      }

      // piece 2 - is affix char
      case 1: {
        np++;
        aflag = pHMgr->decode_flag(std::string(start_piece, iter).c_str());
        // a repeated definition is reported but parsing continues
        if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
            ((at == 'P') && (dupflags[aflag] & dupPFX))) {
          HUNSPELL_WARNING(
              stderr,
              "error: line %d: multiple definitions of an affix flag\n",
              af->getlinenum());
        }
        dupflags[aflag] += (char)((at == 'S') ? dupSFX : dupPFX);
        break;
      }
      // piece 3 - is cross product indicator
      case 2: {
        np++;
        if (*start_piece == 'Y')
          ff = aeXPRODUCT;
        break;
      }

      // piece 4 - is number of affentries
      case 3: {
        np++;
        numents = atoi(std::string(start_piece, iter).c_str());
        if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
                                sizeof(AffEntry)) < static_cast<size_t>(numents))) {
          // err is only used to decide whether the message is printed
          char* err = pHMgr->encode_flag(aflag);
          if (err) {
            HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
                             af->getlinenum());
            free(err);
          }
          return false;
        }

        // options shared by all entries of this block
        char opts = ff;
        if (utf8)
          opts += aeUTF8;
        if (pHMgr->is_aliasf())
          opts += aeALIASF;
        if (pHMgr->is_aliasm())
          opts += aeALIASM;
        // creates the first entry with these options and the flag
        affentries.initialize(numents, opts, aflag);
        // no break here: control falls through to the empty default below
      }

      default:
        break;
    }
    ++i;
    start_piece = mystrsep(line, iter);
  }
  // check to make sure we parsed enough pieces
  if (np != 4) {
    char* err = pHMgr->encode_flag(aflag);
    if (err) {
      HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
                       af->getlinenum());
      free(err);
    }
    return false;
  }

  // now parse numents affentries for this affix
  // the first line fills the entry created by initialize(); every further
  // line gets a fresh entry from add_entry() when its first piece is seen
  AffEntry* entry = affentries.first_entry();
  for (int ent = 0; ent < numents; ++ent) {
    std::string nl;
    if (!af->getline(nl))
      return false;
    mychomp(nl);

    iter = nl.begin();
    i = 0;
    np = 0;

    // split line into pieces
    start_piece = mystrsep(nl, iter);
    while (start_piece != nl.end()) {
      switch (i) {
        // piece 1 - is type
        case 0: {
          np++;
          // the mask keeps only these four option bits of the first entry
          if (ent != 0)
            entry = affentries.add_entry((char)(aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM));
          break;
        }

        // piece 2 - is affix char
        case 1: {
          np++;
          std::string chunk(start_piece, iter);
          // every entry line has to carry the flag of the header
          if (pHMgr->decode_flag(chunk.c_str()) != aflag) {
            char* err = pHMgr->encode_flag(aflag);
            if (err) {
              HUNSPELL_WARNING(stderr,
                               "error: line %d: affix %s is corrupt\n",
                               af->getlinenum(), err);
              free(err);
            }
            return false;
          }

          if (ent != 0) {
            AffEntry* start_entry = affentries.first_entry();
            entry->aflag = start_entry->aflag;
          }
          break;
        }

        // piece 3 - is string to strip or 0 for null
        case 2: {
          np++;
          entry->strip = std::string(start_piece, iter);
          if (complexprefixes) {
            if (utf8)
              reverseword_utf(entry->strip);
            else
              reverseword(entry->strip);
          }
          if (entry->strip.compare("0") == 0) {
            entry->strip.clear();
          }
          break;
        }

        // piece 4 - is affix string or 0 for null
        case 3: {
          entry->morphcode = NULL;
          entry->contclass = NULL;
          entry->contclasslen = 0;
          np++;
          // text after a '/' holds the continuation class flags (or, with
          // flag aliases, the number of an alias)
          std::string::const_iterator dash = std::find(start_piece, iter, '/');
          if (dash != iter) {
            entry->appnd = std::string(start_piece, dash);
            std::string dash_str(dash + 1, iter);

            if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
              if (utf8) {
                remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
              } else {
                remove_ignored_chars(entry->appnd, ignorechars);
              }
            }

            if (complexprefixes) {
              if (utf8)
                reverseword_utf(entry->appnd);
              else
                reverseword(entry->appnd);
            }

            if (pHMgr->is_aliasf()) {
              int index = atoi(dash_str.c_str());
              entry->contclasslen = (unsigned short)pHMgr->get_aliasf(
                  index, &(entry->contclass), af);
              // a zero length is reported but does not abort parsing
              if (!entry->contclasslen)
                HUNSPELL_WARNING(stderr,
                                 "error: bad affix flag alias: \"%s\"\n",
                                 dash_str.c_str());
            } else {
              entry->contclasslen = (unsigned short)pHMgr->decode_flags(
                  &(entry->contclass), dash_str.c_str(), af);
              std::sort(entry->contclass, entry->contclass + entry->contclasslen);
            }

            // remember that continuation classes exist and which flags
            // occur in them
            havecontclass = 1;
            for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
              contclasses[(entry->contclass)[_i]] = 1;
            }
          } else {
            // no continuation class: the whole piece is the affix string
            entry->appnd = std::string(start_piece, iter);

            if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
              if (utf8) {
                remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
              } else {
                remove_ignored_chars(entry->appnd, ignorechars);
              }
            }

            if (complexprefixes) {
              if (utf8)
                reverseword_utf(entry->appnd);
              else
                reverseword(entry->appnd);
            }
          }

          if (entry->appnd.compare("0") == 0) {
            entry->appnd.clear();
          }
          break;
        }

        // piece 5 - is the conditions descriptions
        case 4: {
          std::string chunk(start_piece, iter);
          np++;
          if (complexprefixes) {
            if (utf8)
              reverseword_utf(chunk);
            else
              reverseword(chunk);
            reverse_condition(chunk);
          }
          // a condition that redundant_condition() reports as redundant
          // for this strip string is replaced by "."
          if (!entry->strip.empty() && chunk != "." &&
              redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(),
                                  af->getlinenum()))
            chunk = ".";
          // suffix conditions are handed to encodeit() reversed
          if (at == 'S') {
            reverseword(chunk);
            reverse_condition(chunk);
          }
          // a non-zero result from encodeit() aborts the whole block
          if (encodeit(*entry, chunk.c_str()))
            return false;
          break;
        }

        // piece 6 - is the morphological description (or its alias number)
        case 5: {
          std::string chunk(start_piece, iter);
          np++;
          if (pHMgr->is_aliasm()) {
            int index = atoi(chunk.c_str());
            entry->morphcode = pHMgr->get_aliasm(index);
          } else {
            if (complexprefixes) {  // XXX - fix me for morph. gen.
              if (utf8)
                reverseword_utf(chunk);
              else
                reverseword(chunk);
            }
            // add the remaining of the line
            std::string::const_iterator end = nl.end();
            if (iter != end) {
              chunk.append(iter, end);
            }
            entry->morphcode = mystrdup(chunk.c_str());
            if (!entry->morphcode)
              return false;
          }
          break;
        }
        default:
          break;
      }
      i++;
      start_piece = mystrsep(nl, iter);
    }
    // check to make sure we parsed enough pieces
    // (pieces 5 and 6 may be absent, the first four may not)
    if (np < 4) {
      char* err = pHMgr->encode_flag(aflag);
      if (err) {
        HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
                         af->getlinenum(), err);
        free(err);
      }
      return false;
    }

#ifdef DEBUG
    // detect unnecessary fields, excepting comments
    // a line counts as having 6 fields when it has a morphcode that does
    // not start with '#', otherwise 5; the first line sets the reference
    if (basefieldnum) {
      int fieldnum =
          !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
      if (fieldnum != basefieldnum)
        HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n",
                         af->getlinenum());
    } else {
      basefieldnum =
          !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
    }
#endif
  }

  // now create SfxEntry or PfxEntry objects and use links to
  // build an ordered (sorted by affix string) list
  std::vector<AffEntry*>::iterator start = affentries.begin();
  std::vector<AffEntry*>::iterator end = affentries.end();
  for (std::vector<AffEntry*>::iterator affentry = start; affentry != end; ++affentry) {
    if (at == 'P') {
      build_pfxtree(static_cast<PfxEntry*>(*affentry));
    } else {
      build_sfxtree(static_cast<SfxEntry*>(*affentry));
    }
  }

  //contents belong to AffixMgr now
  // release() empties the container so its destructor deletes nothing
  affentries.release();

  return true;
}
4748 
redundant_condition(char ft,const char * strip,int stripl,const char * cond,int linenum)4749 int AffixMgr::redundant_condition(char ft,
4750                                   const char* strip,
4751                                   int stripl,
4752                                   const char* cond,
4753                                   int linenum) {
4754   int condl = strlen(cond);
4755   int i;
4756   int j;
4757   int neg;
4758   int in;
4759   if (ft == 'P') {  // prefix
4760     if (strncmp(strip, cond, condl) == 0)
4761       return 1;
4762     if (utf8) {
4763     } else {
4764       for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
4765         if (cond[j] != '[') {
4766           if (cond[j] != strip[i]) {
4767             HUNSPELL_WARNING(stderr,
4768                              "warning: line %d: incompatible stripping "
4769                              "characters and condition\n",
4770                              linenum);
4771             return 0;
4772           }
4773         } else {
4774           neg = (cond[j + 1] == '^') ? 1 : 0;
4775           in = 0;
4776           do {
4777             j++;
4778             if (strip[i] == cond[j])
4779               in = 1;
4780           } while ((j < (condl - 1)) && (cond[j] != ']'));
4781           if (j == (condl - 1) && (cond[j] != ']')) {
4782             HUNSPELL_WARNING(stderr,
4783                              "error: line %d: missing ] in condition:\n%s\n",
4784                              linenum, cond);
4785             return 0;
4786           }
4787           if ((!neg && !in) || (neg && in)) {
4788             HUNSPELL_WARNING(stderr,
4789                              "warning: line %d: incompatible stripping "
4790                              "characters and condition\n",
4791                              linenum);
4792             return 0;
4793           }
4794         }
4795       }
4796       if (j >= condl)
4797         return 1;
4798     }
4799   } else {  // suffix
4800     if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0)
4801       return 1;
4802     if (utf8) {
4803     } else {
4804       for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
4805         if (cond[j] != ']') {
4806           if (cond[j] != strip[i]) {
4807             HUNSPELL_WARNING(stderr,
4808                              "warning: line %d: incompatible stripping "
4809                              "characters and condition\n",
4810                              linenum);
4811             return 0;
4812           }
4813         } else {
4814           in = 0;
4815           do {
4816             j--;
4817             if (strip[i] == cond[j])
4818               in = 1;
4819           } while ((j > 0) && (cond[j] != '['));
4820           if ((j == 0) && (cond[j] != '[')) {
4821             HUNSPELL_WARNING(stderr,
4822                              "error: line: %d: missing ] in condition:\n%s\n",
4823                              linenum, cond);
4824             return 0;
4825           }
4826           neg = (cond[j + 1] == '^') ? 1 : 0;
4827           if ((!neg && !in) || (neg && in)) {
4828             HUNSPELL_WARNING(stderr,
4829                              "warning: line %d: incompatible stripping "
4830                              "characters and condition\n",
4831                              linenum);
4832             return 0;
4833           }
4834         }
4835       }
4836       if (j < 0)
4837         return 1;
4838     }
4839   }
4840   return 0;
4841 }
4842 
get_suffix_words(short unsigned * suff,int len,const char * root_word)4843 std::vector<std::string> AffixMgr::get_suffix_words(short unsigned* suff,
4844                                int len,
4845                                const char* root_word) {
4846   std::vector<std::string> slst;
4847   short unsigned* start_ptr = suff;
4848   for (int j = 0; j < SETSIZE; j++) {
4849     SfxEntry* ptr = sStart[j];
4850     while (ptr) {
4851       suff = start_ptr;
4852       for (int i = 0; i < len; i++) {
4853         if ((*suff) == ptr->getFlag()) {
4854           std::string nw(root_word);
4855           nw.append(ptr->getAffix());
4856           hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, 0, 0, 0);
4857           if (ht) {
4858             slst.push_back(nw);
4859           }
4860         }
4861         suff++;
4862       }
4863       ptr = ptr->getNext();
4864     }
4865   }
4866   return slst;
4867 }
4868