1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 /*
38  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39  * And Contributors.  All rights reserved.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  *
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  *
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  *
52  * 3. All modifications to the source code must be clearly marked as
53  *    such.  Binary redistributions based on modified source code
54  *    must be clearly marked as modified versions in the documentation
55  *    and/or other materials provided with the distribution.
56  *
57  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
61  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  */
70 
71 #include <stdlib.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <ctype.h>
75 
76 #include <algorithm>
77 #include <limits>
78 #include <string>
79 #include <vector>
80 
81 #include "affixmgr.hxx"
82 #include "affentry.hxx"
83 #include "langnum.hxx"
84 
85 #include "csutil.hxx"
86 
AffixMgr(const char * affpath,const std::vector<HashMgr * > & ptr,const char * key)87 AffixMgr::AffixMgr(const char* affpath,
88                    const std::vector<HashMgr*>& ptr,
89                    const char* key)
90   : alldic(ptr)
91   , pHMgr(ptr[0]) {
92 
93   // register hash manager and load affix data from aff file
94   csconv = NULL;
95   utf8 = 0;
96   complexprefixes = 0;
97   parsedmaptable = false;
98   parsedbreaktable = false;
99   parsedrep = false;
100   iconvtable = NULL;
101   oconvtable = NULL;
102   // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
103   simplifiedcpd = 0;
104   parsedcheckcpd = false;
105   parseddefcpd = false;
106   phone = NULL;
107   compoundflag = FLAG_NULL;        // permits word in compound forms
108   compoundbegin = FLAG_NULL;       // may be first word in compound forms
109   compoundmiddle = FLAG_NULL;      // may be middle word in compound forms
110   compoundend = FLAG_NULL;         // may be last word in compound forms
111   compoundroot = FLAG_NULL;        // compound word signing flag
112   compoundpermitflag = FLAG_NULL;  // compound permitting flag for suffixed word
113   compoundforbidflag = FLAG_NULL;  // compound fordidden flag for suffixed word
114   compoundmoresuffixes = 0;        // allow more suffixes within compound words
115   checkcompounddup = 0;            // forbid double words in compounds
116   checkcompoundrep = 0;  // forbid bad compounds (may be non compound word with
117                          // a REP substitution)
118   checkcompoundcase =
119       0;  // forbid upper and lowercase combinations at word bounds
120   checkcompoundtriple = 0;  // forbid compounds with triple letters
121   simplifiedtriple = 0;     // allow simplified triple letters in compounds
122                             // (Schiff+fahrt -> Schiffahrt)
123   forbiddenword = FORBIDDENWORD;  // forbidden word signing flag
124   nosuggest = FLAG_NULL;  // don't suggest words signed with NOSUGGEST flag
125   nongramsuggest = FLAG_NULL;
126   langnum = 0;  // language code (see http://l10n.openoffice.org/languages.html)
127   needaffix = FLAG_NULL;  // forbidden root, allowed only with suffixes
128   cpdwordmax = -1;        // default: unlimited wordcount in compound words
129   cpdmin = -1;            // undefined
130   cpdmaxsyllable = 0;     // default: unlimited syllablecount in compound words
131   pfxappnd = NULL;  // previous prefix for counting syllables of the prefix BUG
132   sfxappnd = NULL;  // previous suffix for counting syllables of the suffix BUG
133   sfxextra = 0;     // modifier for syllable count of sfxappnd BUG
134   checknum = 0;               // checking numbers, and word with numbers
135   havecontclass = 0;  // flags of possible continuing classes (double affix)
136   // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
137   // in morhological description in dictionary file. It's often combined with
138   // PSEUDOROOT.
139   lemma_present = FLAG_NULL;
140   circumfix = FLAG_NULL;
141   onlyincompound = FLAG_NULL;
142   maxngramsugs = -1;  // undefined
143   maxdiff = -1;       // undefined
144   onlymaxdiff = 0;
145   maxcpdsugs = -1;  // undefined
146   nosplitsugs = 0;
147   sugswithdots = 0;
148   keepcase = 0;
149   forceucase = 0;
150   warn = 0;
151   forbidwarn = 0;
152   checksharps = 0;
153   substandard = FLAG_NULL;
154   fullstrip = 0;
155 
156   sfx = NULL;
157   pfx = NULL;
158 
159   for (int i = 0; i < SETSIZE; i++) {
160     pStart[i] = NULL;
161     sStart[i] = NULL;
162     pFlag[i] = NULL;
163     sFlag[i] = NULL;
164   }
165 
166   for (int j = 0; j < CONTSIZE; j++) {
167     contclasses[j] = 0;
168   }
169 
170   if (parse_file(affpath, key)) {
171     HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n", affpath);
172   }
173 
174   if (cpdmin == -1)
175     cpdmin = MINCPDLEN;
176 }
177 
~AffixMgr()178 AffixMgr::~AffixMgr() {
179   // pass through linked prefix entries and clean up
180   for (int i = 0; i < SETSIZE; i++) {
181     pFlag[i] = NULL;
182     PfxEntry* ptr = pStart[i];
183     PfxEntry* nptr = NULL;
184     while (ptr) {
185       nptr = ptr->getNext();
186       delete (ptr);
187       ptr = nptr;
188       nptr = NULL;
189     }
190   }
191 
192   // pass through linked suffix entries and clean up
193   for (int j = 0; j < SETSIZE; j++) {
194     sFlag[j] = NULL;
195     SfxEntry* ptr = sStart[j];
196     SfxEntry* nptr = NULL;
197     while (ptr) {
198       nptr = ptr->getNext();
199       delete (ptr);
200       ptr = nptr;
201       nptr = NULL;
202     }
203     sStart[j] = NULL;
204   }
205 
206   delete iconvtable;
207   delete oconvtable;
208   delete phone;
209 
210   FREE_FLAG(compoundflag);
211   FREE_FLAG(compoundbegin);
212   FREE_FLAG(compoundmiddle);
213   FREE_FLAG(compoundend);
214   FREE_FLAG(compoundpermitflag);
215   FREE_FLAG(compoundforbidflag);
216   FREE_FLAG(compoundroot);
217   FREE_FLAG(forbiddenword);
218   FREE_FLAG(nosuggest);
219   FREE_FLAG(nongramsuggest);
220   FREE_FLAG(needaffix);
221   FREE_FLAG(lemma_present);
222   FREE_FLAG(circumfix);
223   FREE_FLAG(onlyincompound);
224 
225   cpdwordmax = 0;
226   pHMgr = NULL;
227   cpdmin = 0;
228   cpdmaxsyllable = 0;
229   free_utf_tbl();
230   checknum = 0;
231 #ifdef MOZILLA_CLIENT
232   delete[] csconv;
233 #endif
234 }
235 
finishFileMgr(FileMgr * afflst)236 void AffixMgr::finishFileMgr(FileMgr* afflst) {
237   delete afflst;
238 
239   // convert affix trees to sorted list
240   process_pfx_tree_to_list();
241   process_sfx_tree_to_list();
242 }
243 
244 // read in aff file and build up prefix and suffix entry objects
parse_file(const char * affpath,const char * key)245 int AffixMgr::parse_file(const char* affpath, const char* key) {
246 
247   // checking flag duplication
248   char dupflags[CONTSIZE];
249   char dupflags_ini = 1;
250 
251   // first line indicator for removing byte order mark
252   int firstline = 1;
253 
254   // open the affix file
255   FileMgr* afflst = new FileMgr(affpath, key);
256   if (!afflst) {
257     HUNSPELL_WARNING(
258         stderr, "error: could not open affix description file %s\n", affpath);
259     return 1;
260   }
261 
262   // step one is to parse the affix file building up the internal
263   // affix data structures
264 
265   // read in each line ignoring any that do not
266   // start with a known line type indicator
267   std::string line;
268   while (afflst->getline(line)) {
269     mychomp(line);
270 
271     /* remove byte order mark */
272     if (firstline) {
273       firstline = 0;
274       // Affix file begins with byte order mark: possible incompatibility with
275       // old Hunspell versions
276       if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
277         line.erase(0, 3);
278       }
279     }
280 
281     /* parse in the keyboard string */
282     if (line.compare(0, 3, "KEY", 3) == 0) {
283       if (!parse_string(line, keystring, afflst->getlinenum())) {
284         finishFileMgr(afflst);
285         return 1;
286       }
287     }
288 
289     /* parse in the try string */
290     if (line.compare(0, 3, "TRY", 3) == 0) {
291       if (!parse_string(line, trystring, afflst->getlinenum())) {
292         finishFileMgr(afflst);
293         return 1;
294       }
295     }
296 
297     /* parse in the name of the character set used by the .dict and .aff */
298     if (line.compare(0, 3, "SET", 3) == 0) {
299       if (!parse_string(line, encoding, afflst->getlinenum())) {
300         finishFileMgr(afflst);
301         return 1;
302       }
303       if (encoding == "UTF-8") {
304         utf8 = 1;
305 #ifndef OPENOFFICEORG
306 #ifndef MOZILLA_CLIENT
307         initialize_utf_tbl();
308 #endif
309 #endif
310       }
311     }
312 
313     /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left
314      * writing system */
315     if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
316       complexprefixes = 1;
317 
318     /* parse in the flag used by the controlled compound words */
319     if (line.compare(0, 12, "COMPOUNDFLAG", 12) == 0) {
320       if (!parse_flag(line, &compoundflag, afflst)) {
321         finishFileMgr(afflst);
322         return 1;
323       }
324     }
325 
326     /* parse in the flag used by compound words */
327     if (line.compare(0, 13, "COMPOUNDBEGIN", 13) == 0) {
328       if (complexprefixes) {
329         if (!parse_flag(line, &compoundend, afflst)) {
330           finishFileMgr(afflst);
331           return 1;
332         }
333       } else {
334         if (!parse_flag(line, &compoundbegin, afflst)) {
335           finishFileMgr(afflst);
336           return 1;
337         }
338       }
339     }
340 
341     /* parse in the flag used by compound words */
342     if (line.compare(0, 14, "COMPOUNDMIDDLE", 14) == 0) {
343       if (!parse_flag(line, &compoundmiddle, afflst)) {
344         finishFileMgr(afflst);
345         return 1;
346       }
347     }
348 
349     /* parse in the flag used by compound words */
350     if (line.compare(0, 11, "COMPOUNDEND", 11) == 0) {
351       if (complexprefixes) {
352         if (!parse_flag(line, &compoundbegin, afflst)) {
353           finishFileMgr(afflst);
354           return 1;
355         }
356       } else {
357         if (!parse_flag(line, &compoundend, afflst)) {
358           finishFileMgr(afflst);
359           return 1;
360         }
361       }
362     }
363 
364     /* parse in the data used by compound_check() method */
365     if (line.compare(0, 15, "COMPOUNDWORDMAX", 15) == 0) {
366       if (!parse_num(line, &cpdwordmax, afflst)) {
367         finishFileMgr(afflst);
368         return 1;
369       }
370     }
371 
372     /* parse in the flag sign compounds in dictionary */
373     if (line.compare(0, 12, "COMPOUNDROOT", 12) == 0) {
374       if (!parse_flag(line, &compoundroot, afflst)) {
375         finishFileMgr(afflst);
376         return 1;
377       }
378     }
379 
380     /* parse in the flag used by compound_check() method */
381     if (line.compare(0, 18, "COMPOUNDPERMITFLAG", 18) == 0) {
382       if (!parse_flag(line, &compoundpermitflag, afflst)) {
383         finishFileMgr(afflst);
384         return 1;
385       }
386     }
387 
388     /* parse in the flag used by compound_check() method */
389     if (line.compare(0, 18, "COMPOUNDFORBIDFLAG", 18) == 0) {
390       if (!parse_flag(line, &compoundforbidflag, afflst)) {
391         finishFileMgr(afflst);
392         return 1;
393       }
394     }
395 
396     if (line.compare(0, 20, "COMPOUNDMORESUFFIXES", 20) == 0) {
397       compoundmoresuffixes = 1;
398     }
399 
400     if (line.compare(0, 16, "CHECKCOMPOUNDDUP", 16) == 0) {
401       checkcompounddup = 1;
402     }
403 
404     if (line.compare(0, 16, "CHECKCOMPOUNDREP", 16) == 0) {
405       checkcompoundrep = 1;
406     }
407 
408     if (line.compare(0, 19, "CHECKCOMPOUNDTRIPLE", 19) == 0) {
409       checkcompoundtriple = 1;
410     }
411 
412     if (line.compare(0, 16, "SIMPLIFIEDTRIPLE", 16) == 0) {
413       simplifiedtriple = 1;
414     }
415 
416     if (line.compare(0, 17, "CHECKCOMPOUNDCASE", 17) == 0) {
417       checkcompoundcase = 1;
418     }
419 
420     if (line.compare(0, 9, "NOSUGGEST", 9) == 0) {
421       if (!parse_flag(line, &nosuggest, afflst)) {
422         finishFileMgr(afflst);
423         return 1;
424       }
425     }
426 
427     if (line.compare(0, 14, "NONGRAMSUGGEST", 14) == 0) {
428       if (!parse_flag(line, &nongramsuggest, afflst)) {
429         finishFileMgr(afflst);
430         return 1;
431       }
432     }
433 
434     /* parse in the flag used by forbidden words */
435     if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
436       if (!parse_flag(line, &forbiddenword, afflst)) {
437         finishFileMgr(afflst);
438         return 1;
439       }
440     }
441 
442     /* parse in the flag used by forbidden words */
443     if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) {
444       if (!parse_flag(line, &lemma_present, afflst)) {
445         finishFileMgr(afflst);
446         return 1;
447       }
448     }
449 
450     /* parse in the flag used by circumfixes */
451     if (line.compare(0, 9, "CIRCUMFIX", 9) == 0) {
452       if (!parse_flag(line, &circumfix, afflst)) {
453         finishFileMgr(afflst);
454         return 1;
455       }
456     }
457 
458     /* parse in the flag used by fogemorphemes */
459     if (line.compare(0, 14, "ONLYINCOMPOUND", 14) == 0) {
460       if (!parse_flag(line, &onlyincompound, afflst)) {
461         finishFileMgr(afflst);
462         return 1;
463       }
464     }
465 
466     /* parse in the flag used by `needaffixs' */
467     if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) {
468       if (!parse_flag(line, &needaffix, afflst)) {
469         finishFileMgr(afflst);
470         return 1;
471       }
472     }
473 
474     /* parse in the flag used by `needaffixs' */
475     if (line.compare(0, 9, "NEEDAFFIX", 9) == 0) {
476       if (!parse_flag(line, &needaffix, afflst)) {
477         finishFileMgr(afflst);
478         return 1;
479       }
480     }
481 
482     /* parse in the minimal length for words in compounds */
483     if (line.compare(0, 11, "COMPOUNDMIN", 11) == 0) {
484       if (!parse_num(line, &cpdmin, afflst)) {
485         finishFileMgr(afflst);
486         return 1;
487       }
488       if (cpdmin < 1)
489         cpdmin = 1;
490     }
491 
492     /* parse in the max. words and syllables in compounds */
493     if (line.compare(0, 16, "COMPOUNDSYLLABLE", 16) == 0) {
494       if (!parse_cpdsyllable(line, afflst)) {
495         finishFileMgr(afflst);
496         return 1;
497       }
498     }
499 
500     /* parse in the flag used by compound_check() method */
501     if (line.compare(0, 11, "SYLLABLENUM", 11) == 0) {
502       if (!parse_string(line, cpdsyllablenum, afflst->getlinenum())) {
503         finishFileMgr(afflst);
504         return 1;
505       }
506     }
507 
508     /* parse in the flag used by the controlled compound words */
509     if (line.compare(0, 8, "CHECKNUM", 8) == 0) {
510       checknum = 1;
511     }
512 
513     /* parse in the extra word characters */
514     if (line.compare(0, 9, "WORDCHARS", 9) == 0) {
515       if (!parse_array(line, wordchars, wordchars_utf16,
516                        utf8, afflst->getlinenum())) {
517         finishFileMgr(afflst);
518         return 1;
519       }
520     }
521 
522     /* parse in the ignored characters (for example, Arabic optional diacretics
523      * charachters */
524     if (line.compare(0, 6, "IGNORE", 6) == 0) {
525       if (!parse_array(line, ignorechars, ignorechars_utf16,
526                        utf8, afflst->getlinenum())) {
527         finishFileMgr(afflst);
528         return 1;
529       }
530     }
531 
532     /* parse in the typical fault correcting table */
533     if (line.compare(0, 3, "REP", 3) == 0) {
534       if (!parse_reptable(line, afflst)) {
535         finishFileMgr(afflst);
536         return 1;
537       }
538     }
539 
540     /* parse in the input conversion table */
541     if (line.compare(0, 5, "ICONV", 5) == 0) {
542       if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) {
543         finishFileMgr(afflst);
544         return 1;
545       }
546     }
547 
548     /* parse in the input conversion table */
549     if (line.compare(0, 5, "OCONV", 5) == 0) {
550       if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) {
551         finishFileMgr(afflst);
552         return 1;
553       }
554     }
555 
556     /* parse in the phonetic translation table */
557     if (line.compare(0, 5, "PHONE", 5) == 0) {
558       if (!parse_phonetable(line, afflst)) {
559         finishFileMgr(afflst);
560         return 1;
561       }
562     }
563 
564     /* parse in the checkcompoundpattern table */
565     if (line.compare(0, 20, "CHECKCOMPOUNDPATTERN", 20) == 0) {
566       if (!parse_checkcpdtable(line, afflst)) {
567         finishFileMgr(afflst);
568         return 1;
569       }
570     }
571 
572     /* parse in the defcompound table */
573     if (line.compare(0, 12, "COMPOUNDRULE", 12) == 0) {
574       if (!parse_defcpdtable(line, afflst)) {
575         finishFileMgr(afflst);
576         return 1;
577       }
578     }
579 
580     /* parse in the related character map table */
581     if (line.compare(0, 3, "MAP", 3) == 0) {
582       if (!parse_maptable(line, afflst)) {
583         finishFileMgr(afflst);
584         return 1;
585       }
586     }
587 
588     /* parse in the word breakpoints table */
589     if (line.compare(0, 5, "BREAK", 5) == 0) {
590       if (!parse_breaktable(line, afflst)) {
591         finishFileMgr(afflst);
592         return 1;
593       }
594     }
595 
596     /* parse in the language for language specific codes */
597     if (line.compare(0, 4, "LANG", 4) == 0) {
598       if (!parse_string(line, lang, afflst->getlinenum())) {
599         finishFileMgr(afflst);
600         return 1;
601       }
602       langnum = get_lang_num(lang);
603     }
604 
605     if (line.compare(0, 7, "VERSION", 7) == 0) {
606       size_t startpos = line.find_first_not_of(" \t", 7);
607       if (startpos != std::string::npos) {
608           version = line.substr(startpos);
609       }
610     }
611 
612     if (line.compare(0, 12, "MAXNGRAMSUGS", 12) == 0) {
613       if (!parse_num(line, &maxngramsugs, afflst)) {
614         finishFileMgr(afflst);
615         return 1;
616       }
617     }
618 
619     if (line.compare(0, 11, "ONLYMAXDIFF", 11) == 0)
620       onlymaxdiff = 1;
621 
622     if (line.compare(0, 7, "MAXDIFF", 7) == 0) {
623       if (!parse_num(line, &maxdiff, afflst)) {
624         finishFileMgr(afflst);
625         return 1;
626       }
627     }
628 
629     if (line.compare(0, 10, "MAXCPDSUGS", 10) == 0) {
630       if (!parse_num(line, &maxcpdsugs, afflst)) {
631         finishFileMgr(afflst);
632         return 1;
633       }
634     }
635 
636     if (line.compare(0, 11, "NOSPLITSUGS", 11) == 0) {
637       nosplitsugs = 1;
638     }
639 
640     if (line.compare(0, 9, "FULLSTRIP", 9) == 0) {
641       fullstrip = 1;
642     }
643 
644     if (line.compare(0, 12, "SUGSWITHDOTS", 12) == 0) {
645       sugswithdots = 1;
646     }
647 
648     /* parse in the flag used by forbidden words */
649     if (line.compare(0, 8, "KEEPCASE", 8) == 0) {
650       if (!parse_flag(line, &keepcase, afflst)) {
651         finishFileMgr(afflst);
652         return 1;
653       }
654     }
655 
656     /* parse in the flag used by `forceucase' */
657     if (line.compare(0, 10, "FORCEUCASE", 10) == 0) {
658       if (!parse_flag(line, &forceucase, afflst)) {
659         finishFileMgr(afflst);
660         return 1;
661       }
662     }
663 
664     /* parse in the flag used by `warn' */
665     if (line.compare(0, 4, "WARN", 4) == 0) {
666       if (!parse_flag(line, &warn, afflst)) {
667         finishFileMgr(afflst);
668         return 1;
669       }
670     }
671 
672     if (line.compare(0, 10, "FORBIDWARN", 10) == 0) {
673       forbidwarn = 1;
674     }
675 
676     /* parse in the flag used by the affix generator */
677     if (line.compare(0, 11, "SUBSTANDARD", 11) == 0) {
678       if (!parse_flag(line, &substandard, afflst)) {
679         finishFileMgr(afflst);
680         return 1;
681       }
682     }
683 
684     if (line.compare(0, 11, "CHECKSHARPS", 11) == 0) {
685       checksharps = 1;
686     }
687 
688     /* parse this affix: P - prefix, S - suffix */
689     // affix type
690     char ft = ' ';
691     if (line.compare(0, 3, "PFX", 3) == 0)
692       ft = complexprefixes ? 'S' : 'P';
693     if (line.compare(0, 3, "SFX", 3) == 0)
694       ft = complexprefixes ? 'P' : 'S';
695     if (ft != ' ') {
696       if (dupflags_ini) {
697         memset(dupflags, 0, sizeof(dupflags));
698         dupflags_ini = 0;
699       }
700       if (!parse_affix(line, ft, afflst, dupflags)) {
701         finishFileMgr(afflst);
702         return 1;
703       }
704     }
705   }
706 
707   finishFileMgr(afflst);
708   // affix trees are sorted now
709 
710   // now we can speed up performance greatly taking advantage of the
711   // relationship between the affixes and the idea of "subsets".
712 
713   // View each prefix as a potential leading subset of another and view
714   // each suffix (reversed) as a potential trailing subset of another.
715 
716   // To illustrate this relationship if we know the prefix "ab" is found in the
717   // word to examine, only prefixes that "ab" is a leading subset of need be
718   // examined.
719   // Furthermore is "ab" is not present then none of the prefixes that "ab" is
720   // is a subset need be examined.
721   // The same argument goes for suffix string that are reversed.
722 
723   // Then to top this off why not examine the first char of the word to quickly
724   // limit the set of prefixes to examine (i.e. the prefixes to examine must
725   // be leading supersets of the first character of the word (if they exist)
726 
727   // To take advantage of this "subset" relationship, we need to add two links
728   // from entry.  One to take next if the current prefix is found (call it
729   // nexteq)
730   // and one to take next if the current prefix is not found (call it nextne).
731 
732   // Since we have built ordered lists, all that remains is to properly
733   // initialize
734   // the nextne and nexteq pointers that relate them
735 
736   process_pfx_order();
737   process_sfx_order();
738 
739   /* get encoding for CHECKCOMPOUNDCASE */
740   if (!utf8) {
741     csconv = get_current_cs(get_encoding());
742     for (int i = 0; i <= 255; i++) {
743       if ((csconv[i].cupper != csconv[i].clower) &&
744           (wordchars.find((char)i) == std::string::npos)) {
745         wordchars.push_back((char)i);
746       }
747     }
748 
749   }
750 
751   // default BREAK definition
752   if (!parsedbreaktable) {
753     breaktable.push_back("-");
754     breaktable.push_back("^-");
755     breaktable.push_back("-$");
756     parsedbreaktable = true;
757   }
758   return 0;
759 }
760 
761 // we want to be able to quickly access prefix information
762 // both by prefix flag, and sorted by prefix string itself
763 // so we need to set up two indexes
764 
build_pfxtree(PfxEntry * pfxptr)765 int AffixMgr::build_pfxtree(PfxEntry* pfxptr) {
766   PfxEntry* ptr;
767   PfxEntry* pptr;
768   PfxEntry* ep = pfxptr;
769 
770   // get the right starting points
771   const char* key = ep->getKey();
772   const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
773 
774   // first index by flag which must exist
775   ptr = pFlag[flg];
776   ep->setFlgNxt(ptr);
777   pFlag[flg] = ep;
778 
779   // handle the special case of null affix string
780   if (strlen(key) == 0) {
781     // always inset them at head of list at element 0
782     ptr = pStart[0];
783     ep->setNext(ptr);
784     pStart[0] = ep;
785     return 0;
786   }
787 
788   // now handle the normal case
789   ep->setNextEQ(NULL);
790   ep->setNextNE(NULL);
791 
792   unsigned char sp = *((const unsigned char*)key);
793   ptr = pStart[sp];
794 
795   // handle the first insert
796   if (!ptr) {
797     pStart[sp] = ep;
798     return 0;
799   }
800 
801   // otherwise use binary tree insertion so that a sorted
802   // list can easily be generated later
803   pptr = NULL;
804   for (;;) {
805     pptr = ptr;
806     if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
807       ptr = ptr->getNextEQ();
808       if (!ptr) {
809         pptr->setNextEQ(ep);
810         break;
811       }
812     } else {
813       ptr = ptr->getNextNE();
814       if (!ptr) {
815         pptr->setNextNE(ep);
816         break;
817       }
818     }
819   }
820   return 0;
821 }
822 
823 // we want to be able to quickly access suffix information
824 // both by suffix flag, and sorted by the reverse of the
825 // suffix string itself; so we need to set up two indexes
build_sfxtree(SfxEntry * sfxptr)826 int AffixMgr::build_sfxtree(SfxEntry* sfxptr) {
827 
828   sfxptr->initReverseWord();
829 
830   SfxEntry* ptr;
831   SfxEntry* pptr;
832   SfxEntry* ep = sfxptr;
833 
834   /* get the right starting point */
835   const char* key = ep->getKey();
836   const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
837 
838   // first index by flag which must exist
839   ptr = sFlag[flg];
840   ep->setFlgNxt(ptr);
841   sFlag[flg] = ep;
842 
843   // next index by affix string
844 
845   // handle the special case of null affix string
846   if (strlen(key) == 0) {
847     // always inset them at head of list at element 0
848     ptr = sStart[0];
849     ep->setNext(ptr);
850     sStart[0] = ep;
851     return 0;
852   }
853 
854   // now handle the normal case
855   ep->setNextEQ(NULL);
856   ep->setNextNE(NULL);
857 
858   unsigned char sp = *((const unsigned char*)key);
859   ptr = sStart[sp];
860 
861   // handle the first insert
862   if (!ptr) {
863     sStart[sp] = ep;
864     return 0;
865   }
866 
867   // otherwise use binary tree insertion so that a sorted
868   // list can easily be generated later
869   pptr = NULL;
870   for (;;) {
871     pptr = ptr;
872     if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
873       ptr = ptr->getNextEQ();
874       if (!ptr) {
875         pptr->setNextEQ(ep);
876         break;
877       }
878     } else {
879       ptr = ptr->getNextNE();
880       if (!ptr) {
881         pptr->setNextNE(ep);
882         break;
883       }
884     }
885   }
886   return 0;
887 }
888 
889 // convert from binary tree to sorted list
process_pfx_tree_to_list()890 int AffixMgr::process_pfx_tree_to_list() {
891   for (int i = 1; i < SETSIZE; i++) {
892     pStart[i] = process_pfx_in_order(pStart[i], NULL);
893   }
894   return 0;
895 }
896 
process_pfx_in_order(PfxEntry * ptr,PfxEntry * nptr)897 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) {
898   if (ptr) {
899     nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
900     ptr->setNext(nptr);
901     nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
902   }
903   return nptr;
904 }
905 
906 // convert from binary tree to sorted list
process_sfx_tree_to_list()907 int AffixMgr::process_sfx_tree_to_list() {
908   for (int i = 1; i < SETSIZE; i++) {
909     sStart[i] = process_sfx_in_order(sStart[i], NULL);
910   }
911   return 0;
912 }
913 
process_sfx_in_order(SfxEntry * ptr,SfxEntry * nptr)914 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) {
915   if (ptr) {
916     nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
917     ptr->setNext(nptr);
918     nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
919   }
920   return nptr;
921 }
922 
923 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
924 // using the idea of leading subsets this time
process_pfx_order()925 int AffixMgr::process_pfx_order() {
926   PfxEntry* ptr;
927 
928   // loop through each prefix list starting point
929   for (int i = 1; i < SETSIZE; i++) {
930     ptr = pStart[i];
931 
932     // look through the remainder of the list
933     //  and find next entry with affix that
934     // the current one is not a subset of
935     // mark that as destination for NextNE
936     // use next in list that you are a subset
937     // of as NextEQ
938 
939     for (; ptr != NULL; ptr = ptr->getNext()) {
940       PfxEntry* nptr = ptr->getNext();
941       for (; nptr != NULL; nptr = nptr->getNext()) {
942         if (!isSubset(ptr->getKey(), nptr->getKey()))
943           break;
944       }
945       ptr->setNextNE(nptr);
946       ptr->setNextEQ(NULL);
947       if ((ptr->getNext()) &&
948           isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
949         ptr->setNextEQ(ptr->getNext());
950     }
951 
952     // now clean up by adding smart search termination strings:
953     // if you are already a superset of the previous prefix
954     // but not a subset of the next, search can end here
955     // so set NextNE properly
956 
957     ptr = pStart[i];
958     for (; ptr != NULL; ptr = ptr->getNext()) {
959       PfxEntry* nptr = ptr->getNext();
960       PfxEntry* mptr = NULL;
961       for (; nptr != NULL; nptr = nptr->getNext()) {
962         if (!isSubset(ptr->getKey(), nptr->getKey()))
963           break;
964         mptr = nptr;
965       }
966       if (mptr)
967         mptr->setNextNE(NULL);
968     }
969   }
970   return 0;
971 }
972 
973 // initialize the SfxEntry links NextEQ and NextNE to speed searching
974 // using the idea of leading subsets this time
process_sfx_order()975 int AffixMgr::process_sfx_order() {
976   SfxEntry* ptr;
977 
978   // loop through each prefix list starting point
979   for (int i = 1; i < SETSIZE; i++) {
980     ptr = sStart[i];
981 
982     // look through the remainder of the list
983     //  and find next entry with affix that
984     // the current one is not a subset of
985     // mark that as destination for NextNE
986     // use next in list that you are a subset
987     // of as NextEQ
988 
989     for (; ptr != NULL; ptr = ptr->getNext()) {
990       SfxEntry* nptr = ptr->getNext();
991       for (; nptr != NULL; nptr = nptr->getNext()) {
992         if (!isSubset(ptr->getKey(), nptr->getKey()))
993           break;
994       }
995       ptr->setNextNE(nptr);
996       ptr->setNextEQ(NULL);
997       if ((ptr->getNext()) &&
998           isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
999         ptr->setNextEQ(ptr->getNext());
1000     }
1001 
1002     // now clean up by adding smart search termination strings:
1003     // if you are already a superset of the previous suffix
1004     // but not a subset of the next, search can end here
1005     // so set NextNE properly
1006 
1007     ptr = sStart[i];
1008     for (; ptr != NULL; ptr = ptr->getNext()) {
1009       SfxEntry* nptr = ptr->getNext();
1010       SfxEntry* mptr = NULL;
1011       for (; nptr != NULL; nptr = nptr->getNext()) {
1012         if (!isSubset(ptr->getKey(), nptr->getKey()))
1013           break;
1014         mptr = nptr;
1015       }
1016       if (mptr)
1017         mptr->setNextNE(NULL);
1018     }
1019   }
1020   return 0;
1021 }
1022 
1023 // add flags to the result for dictionary debugging
debugflag(std::string & result,unsigned short flag)1024 std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
1025   char* st = encode_flag(flag);
1026   result.append(" ");
1027   result.append(MORPH_FLAG);
1028   if (st) {
1029     result.append(st);
1030     free(st);
1031   }
1032   return result;
1033 }
1034 
1035 // calculate the character length of the condition
condlen(const char * st)1036 int AffixMgr::condlen(const char* st) {
1037   int l = 0;
1038   bool group = false;
1039   for (; *st; st++) {
1040     if (*st == '[') {
1041       group = true;
1042       l++;
1043     } else if (*st == ']')
1044       group = false;
1045     else if (!group && (!utf8 || (!(*st & 0x80) || ((*st & 0xc0) == 0x80))))
1046       l++;
1047   }
1048   return l;
1049 }
1050 
encodeit(AffEntry & entry,const char * cs)1051 int AffixMgr::encodeit(AffEntry& entry, const char* cs) {
1052   if (strcmp(cs, ".") != 0) {
1053     entry.numconds = (char)condlen(cs);
1054     const size_t cslen = strlen(cs);
1055     const size_t short_part = std::min<size_t>(MAXCONDLEN, cslen);
1056     memcpy(entry.c.conds, cs, short_part);
1057     if (short_part < MAXCONDLEN) {
1058       //blank out the remaining space
1059       memset(entry.c.conds + short_part, 0, MAXCONDLEN - short_part);
1060     } else if (cs[MAXCONDLEN]) {
1061       //there is more conditions than fit in fixed space, so its
1062       //a long condition
1063       entry.opts += aeLONGCOND;
1064       entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
1065       if (!entry.c.l.conds2)
1066         return 1;
1067     }
1068   } else {
1069     entry.numconds = 0;
1070     entry.c.conds[0] = '\0';
1071   }
1072   return 0;
1073 }
1074 
1075 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
isSubset(const char * s1,const char * s2)1076 inline int AffixMgr::isSubset(const char* s1, const char* s2) {
1077   while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
1078     s1++;
1079     s2++;
1080   }
1081   return (*s1 == '\0');
1082 }
1083 
1084 // check word for prefixes
prefix_check(const char * word,int len,char in_compound,const FLAG needflag)1085 struct hentry* AffixMgr::prefix_check(const char* word,
1086                                       int len,
1087                                       char in_compound,
1088                                       const FLAG needflag) {
1089   struct hentry* rv = NULL;
1090 
1091   pfx = NULL;
1092   pfxappnd = NULL;
1093   sfxappnd = NULL;
1094   sfxextra = 0;
1095 
1096   // first handle the special case of 0 length prefixes
1097   PfxEntry* pe = pStart[0];
1098   while (pe) {
1099     if (
1100         // fogemorpheme
1101         ((in_compound != IN_CPD_NOT) ||
1102          !(pe->getCont() &&
1103            (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
1104         // permit prefixes in compounds
1105         ((in_compound != IN_CPD_END) ||
1106          (pe->getCont() &&
1107           (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))) {
1108       // check prefix
1109       rv = pe->checkword(word, len, in_compound, needflag);
1110       if (rv) {
1111         pfx = pe;  // BUG: pfx not stateless
1112         return rv;
1113       }
1114     }
1115     pe = pe->getNext();
1116   }
1117 
1118   // now handle the general case
1119   unsigned char sp = *((const unsigned char*)word);
1120   PfxEntry* pptr = pStart[sp];
1121 
1122   while (pptr) {
1123     if (isSubset(pptr->getKey(), word)) {
1124       if (
1125           // fogemorpheme
1126           ((in_compound != IN_CPD_NOT) ||
1127            !(pptr->getCont() &&
1128              (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
1129           // permit prefixes in compounds
1130           ((in_compound != IN_CPD_END) ||
1131            (pptr->getCont() && (TESTAFF(pptr->getCont(), compoundpermitflag,
1132                                         pptr->getContLen()))))) {
1133         // check prefix
1134         rv = pptr->checkword(word, len, in_compound, needflag);
1135         if (rv) {
1136           pfx = pptr;  // BUG: pfx not stateless
1137           return rv;
1138         }
1139       }
1140       pptr = pptr->getNextEQ();
1141     } else {
1142       pptr = pptr->getNextNE();
1143     }
1144   }
1145 
1146   return NULL;
1147 }
1148 
1149 // check word for prefixes
prefix_check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)1150 struct hentry* AffixMgr::prefix_check_twosfx(const char* word,
1151                                              int len,
1152                                              char in_compound,
1153                                              const FLAG needflag) {
1154   struct hentry* rv = NULL;
1155 
1156   pfx = NULL;
1157   sfxappnd = NULL;
1158   sfxextra = 0;
1159 
1160   // first handle the special case of 0 length prefixes
1161   PfxEntry* pe = pStart[0];
1162 
1163   while (pe) {
1164     rv = pe->check_twosfx(word, len, in_compound, needflag);
1165     if (rv)
1166       return rv;
1167     pe = pe->getNext();
1168   }
1169 
1170   // now handle the general case
1171   unsigned char sp = *((const unsigned char*)word);
1172   PfxEntry* pptr = pStart[sp];
1173 
1174   while (pptr) {
1175     if (isSubset(pptr->getKey(), word)) {
1176       rv = pptr->check_twosfx(word, len, in_compound, needflag);
1177       if (rv) {
1178         pfx = pptr;
1179         return rv;
1180       }
1181       pptr = pptr->getNextEQ();
1182     } else {
1183       pptr = pptr->getNextNE();
1184     }
1185   }
1186 
1187   return NULL;
1188 }
1189 
1190 // check word for prefixes
prefix_check_morph(const char * word,int len,char in_compound,const FLAG needflag)1191 std::string AffixMgr::prefix_check_morph(const char* word,
1192                                          int len,
1193                                          char in_compound,
1194                                          const FLAG needflag) {
1195 
1196   std::string result;
1197 
1198   pfx = NULL;
1199   sfxappnd = NULL;
1200   sfxextra = 0;
1201 
1202   // first handle the special case of 0 length prefixes
1203   PfxEntry* pe = pStart[0];
1204   while (pe) {
1205     std::string st = pe->check_morph(word, len, in_compound, needflag);
1206     if (!st.empty()) {
1207       result.append(st);
1208     }
1209     pe = pe->getNext();
1210   }
1211 
1212   // now handle the general case
1213   unsigned char sp = *((const unsigned char*)word);
1214   PfxEntry* pptr = pStart[sp];
1215 
1216   while (pptr) {
1217     if (isSubset(pptr->getKey(), word)) {
1218       std::string st = pptr->check_morph(word, len, in_compound, needflag);
1219       if (!st.empty()) {
1220         // fogemorpheme
1221         if ((in_compound != IN_CPD_NOT) ||
1222             !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound,
1223                                            pptr->getContLen()))))) {
1224           result.append(st);
1225           pfx = pptr;
1226         }
1227       }
1228       pptr = pptr->getNextEQ();
1229     } else {
1230       pptr = pptr->getNextNE();
1231     }
1232   }
1233 
1234   return result;
1235 }
1236 
1237 // check word for prefixes
prefix_check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)1238 std::string AffixMgr::prefix_check_twosfx_morph(const char* word,
1239                                                 int len,
1240                                                 char in_compound,
1241                                                 const FLAG needflag) {
1242   std::string result;
1243 
1244   pfx = NULL;
1245   sfxappnd = NULL;
1246   sfxextra = 0;
1247 
1248   // first handle the special case of 0 length prefixes
1249   PfxEntry* pe = pStart[0];
1250   while (pe) {
1251     std::string st = pe->check_twosfx_morph(word, len, in_compound, needflag);
1252     if (!st.empty()) {
1253       result.append(st);
1254     }
1255     pe = pe->getNext();
1256   }
1257 
1258   // now handle the general case
1259   unsigned char sp = *((const unsigned char*)word);
1260   PfxEntry* pptr = pStart[sp];
1261 
1262   while (pptr) {
1263     if (isSubset(pptr->getKey(), word)) {
1264       std::string st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
1265       if (!st.empty()) {
1266         result.append(st);
1267         pfx = pptr;
1268       }
1269       pptr = pptr->getNextEQ();
1270     } else {
1271       pptr = pptr->getNextNE();
1272     }
1273   }
1274 
1275   return result;
1276 }
1277 
1278 // Is word a non compound with a REP substitution (see checkcompoundrep)?
cpdrep_check(const char * word,int wl)1279 int AffixMgr::cpdrep_check(const char* word, int wl) {
1280 
1281   if ((wl < 2) || reptable.empty())
1282     return 0;
1283 
1284   for (size_t i = 0; i < reptable.size(); ++i) {
1285     const char* r = word;
1286     const size_t lenp = reptable[i].pattern.size();
1287     // search every occurence of the pattern in the word
1288     while ((r = strstr(r, reptable[i].pattern.c_str())) != NULL) {
1289       std::string candidate(word);
1290       size_t type = r == word && langnum != LANG_hu ? 1 : 0;
1291       if (r - word + reptable[i].pattern.size() == lenp && langnum != LANG_hu)
1292         type += 2;
1293       candidate.replace(r - word, lenp, reptable[i].outstrings[type]);
1294       if (candidate_check(candidate.c_str(), candidate.size()))
1295         return 1;
1296       ++r;  // search for the next letter
1297     }
1298   }
1299 
1300   return 0;
1301 }
1302 
1303 // forbid compoundings when there are special patterns at word bound
cpdpat_check(const char * word,int pos,hentry * r1,hentry * r2,const char)1304 int AffixMgr::cpdpat_check(const char* word,
1305                            int pos,
1306                            hentry* r1,
1307                            hentry* r2,
1308                            const char /*affixed*/) {
1309   for (size_t i = 0; i < checkcpdtable.size(); ++i) {
1310     size_t len;
1311     if (isSubset(checkcpdtable[i].pattern2.c_str(), word + pos) &&
1312         (!r1 || !checkcpdtable[i].cond ||
1313          (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
1314         (!r2 || !checkcpdtable[i].cond2 ||
1315          (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
1316         // zero length pattern => only TESTAFF
1317         // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
1318         (checkcpdtable[i].pattern.empty() ||
1319          ((checkcpdtable[i].pattern[0] == '0' && r1->blen <= pos &&
1320            strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
1321           (checkcpdtable[i].pattern[0] != '0' &&
1322            ((len = checkcpdtable[i].pattern.size()) != 0) &&
1323            strncmp(word + pos - len, checkcpdtable[i].pattern.c_str(), len) == 0)))) {
1324       return 1;
1325     }
1326   }
1327   return 0;
1328 }
1329 
1330 // forbid compounding with neighbouring upper and lower case characters at word
1331 // bounds
cpdcase_check(const char * word,int pos)1332 int AffixMgr::cpdcase_check(const char* word, int pos) {
1333   if (utf8) {
1334     const char* p;
1335     for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--)
1336       ;
1337     std::string pair(p);
1338     std::vector<w_char> pair_u;
1339     u8_u16(pair_u, pair);
1340     unsigned short a = pair_u.size() > 1 ? ((pair_u[1].h << 8) + pair_u[1].l) : 0;
1341     unsigned short b = !pair_u.empty() ? ((pair_u[0].h << 8) + pair_u[0].l) : 0;
1342     if (((unicodetoupper(a, langnum) == a) ||
1343          (unicodetoupper(b, langnum) == b)) &&
1344         (a != '-') && (b != '-'))
1345       return 1;
1346   } else {
1347     unsigned char a = *(word + pos - 1);
1348     unsigned char b = *(word + pos);
1349     if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-'))
1350       return 1;
1351   }
1352   return 0;
1353 }
1354 
// One backtracking record kept while a compound pattern containing the
// metacharacters '*' and '?' is matched against a list of words.
struct metachar_data {
  short btpp;  // pattern position right after the metacharacter (*, ?)
  short btwp;  // word position where the metacharacter started matching
  int btnum;   // number of words currently matched by the metacharacter
};
1360 
1361 // check compound patterns
defcpd_check(hentry *** words,short wnum,hentry * rv,hentry ** def,char all)1362 int AffixMgr::defcpd_check(hentry*** words,
1363                            short wnum,
1364                            hentry* rv,
1365                            hentry** def,
1366                            char all) {
1367   int w = 0;
1368 
1369   if (!*words) {
1370     w = 1;
1371     *words = def;
1372   }
1373 
1374   if (!*words) {
1375     return 0;
1376   }
1377 
1378   std::vector<metachar_data> btinfo(1);
1379 
1380   short bt = 0;
1381 
1382   (*words)[wnum] = rv;
1383 
1384   // has the last word COMPOUNDRULE flag?
1385   if (rv->alen == 0) {
1386     (*words)[wnum] = NULL;
1387     if (w)
1388       *words = NULL;
1389     return 0;
1390   }
1391   int ok = 0;
1392   for (size_t i = 0; i < defcpdtable.size(); ++i) {
1393     for (size_t j = 0; j < defcpdtable[i].size(); ++j) {
1394       if (defcpdtable[i][j] != '*' && defcpdtable[i][j] != '?' &&
1395           TESTAFF(rv->astr, defcpdtable[i][j], rv->alen)) {
1396         ok = 1;
1397         break;
1398       }
1399     }
1400   }
1401   if (ok == 0) {
1402     (*words)[wnum] = NULL;
1403     if (w)
1404       *words = NULL;
1405     return 0;
1406   }
1407 
1408   for (size_t i = 0; i < defcpdtable.size(); ++i) {
1409     size_t pp = 0;  // pattern position
1410     signed short wp = 0;  // "words" position
1411     int ok2;
1412     ok = 1;
1413     ok2 = 1;
1414     do {
1415       while ((pp < defcpdtable[i].size()) && (wp <= wnum)) {
1416         if (((pp + 1) < defcpdtable[i].size()) &&
1417             ((defcpdtable[i][pp + 1] == '*') ||
1418              (defcpdtable[i][pp + 1] == '?'))) {
1419           int wend = (defcpdtable[i][pp + 1] == '?') ? wp : wnum;
1420           ok2 = 1;
1421           pp += 2;
1422           btinfo[bt].btpp = pp;
1423           btinfo[bt].btwp = wp;
1424           while (wp <= wend) {
1425             if (!(*words)[wp]->alen ||
1426                 !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp - 2],
1427                          (*words)[wp]->alen)) {
1428               ok2 = 0;
1429               break;
1430             }
1431             wp++;
1432           }
1433           if (wp <= wnum)
1434             ok2 = 0;
1435           btinfo[bt].btnum = wp - btinfo[bt].btwp;
1436           if (btinfo[bt].btnum > 0) {
1437             ++bt;
1438             btinfo.resize(bt+1);
1439           }
1440           if (ok2)
1441             break;
1442         } else {
1443           ok2 = 1;
1444           if (!(*words)[wp] || !(*words)[wp]->alen ||
1445               !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp],
1446                        (*words)[wp]->alen)) {
1447             ok = 0;
1448             break;
1449           }
1450           pp++;
1451           wp++;
1452           if ((defcpdtable[i].size() == pp) && !(wp > wnum))
1453             ok = 0;
1454         }
1455       }
1456       if (ok && ok2) {
1457         size_t r = pp;
1458         while ((defcpdtable[i].size() > r) && ((r + 1) < defcpdtable[i].size()) &&
1459                ((defcpdtable[i][r + 1] == '*') ||
1460                 (defcpdtable[i][r + 1] == '?')))
1461           r += 2;
1462         if (defcpdtable[i].size() <= r)
1463           return 1;
1464       }
1465       // backtrack
1466       if (bt)
1467         do {
1468           ok = 1;
1469           btinfo[bt - 1].btnum--;
1470           pp = btinfo[bt - 1].btpp;
1471           wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum;
1472         } while ((btinfo[bt - 1].btnum < 0) && --bt);
1473     } while (bt);
1474 
1475     if (ok && ok2 && (!all || (defcpdtable[i].size() <= pp)))
1476       return 1;
1477 
1478     // check zero ending
1479     while (ok && ok2 && (defcpdtable[i].size() > pp) &&
1480            ((pp + 1) < defcpdtable[i].size()) &&
1481            ((defcpdtable[i][pp + 1] == '*') ||
1482             (defcpdtable[i][pp + 1] == '?')))
1483       pp += 2;
1484     if (ok && ok2 && (defcpdtable[i].size() <= pp))
1485       return 1;
1486   }
1487   (*words)[wnum] = NULL;
1488   if (w)
1489     *words = NULL;
1490   return 0;
1491 }
1492 
candidate_check(const char * word,int len)1493 inline int AffixMgr::candidate_check(const char* word, int len) {
1494 
1495   struct hentry* rv = lookup(word);
1496   if (rv)
1497     return 1;
1498 
1499   //  rv = prefix_check(word,len,1);
1500   //  if (rv) return 1;
1501 
1502   rv = affix_check(word, len);
1503   if (rv)
1504     return 1;
1505   return 0;
1506 }
1507 
1508 // calculate number of syllable for compound-checking
get_syllable(const std::string & word)1509 short AffixMgr::get_syllable(const std::string& word) {
1510   if (cpdmaxsyllable == 0)
1511     return 0;
1512 
1513   short num = 0;
1514 
1515   if (!utf8) {
1516     for (size_t i = 0; i < word.size(); ++i) {
1517       if (std::binary_search(cpdvowels.begin(), cpdvowels.end(),
1518                              word[i])) {
1519         ++num;
1520       }
1521     }
1522   } else if (!cpdvowels_utf16.empty()) {
1523     std::vector<w_char> w;
1524     u8_u16(w, word);
1525     for (size_t i = 0; i < w.size(); ++i) {
1526       if (std::binary_search(cpdvowels_utf16.begin(),
1527                              cpdvowels_utf16.end(),
1528                              w[i])) {
1529         ++num;
1530       }
1531     }
1532   }
1533 
1534   return num;
1535 }
1536 
setcminmax(int * cmin,int * cmax,const char * word,int len)1537 void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) {
1538   if (utf8) {
1539     int i;
1540     for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) {
1541       for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
1542         ;
1543     }
1544     for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax >= 0; i++) {
1545       for ((*cmax)--; *cmax >= 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
1546         ;
1547     }
1548   } else {
1549     *cmin = cpdmin;
1550     *cmax = len - cpdmin + 1;
1551   }
1552 }
1553 
1554 // check if compound word is correctly spelled
1555 // hu_mov_rule = spec. Hungarian rule (XXX)
compound_check(const std::string & word,short wordnum,short numsyllable,short maxwordnum,short wnum,hentry ** words=NULL,hentry ** rwords=NULL,char hu_mov_rule=0,char is_sug=0,int * info=NULL)1556 struct hentry* AffixMgr::compound_check(const std::string& word,
1557                                         short wordnum,
1558                                         short numsyllable,
1559                                         short maxwordnum,
1560                                         short wnum,
1561                                         hentry** words = NULL,
1562                                         hentry** rwords = NULL,
1563                                         char hu_mov_rule = 0,
1564                                         char is_sug = 0,
1565                                         int* info = NULL) {
1566   int i;
1567   short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1568   struct hentry* rv = NULL;
1569   struct hentry* rv_first;
1570   std::string st;
1571   char ch = '\0';
1572   int cmin;
1573   int cmax;
1574   int striple = 0;
1575   size_t scpd = 0;
1576   int soldi = 0;
1577   int oldcmin = 0;
1578   int oldcmax = 0;
1579   int oldlen = 0;
1580   int checkedstriple = 0;
1581   char affixed = 0;
1582   hentry** oldwords = words;
1583   size_t len = word.size();
1584 
1585   int checked_prefix;
1586 
1587   setcminmax(&cmin, &cmax, word.c_str(), len);
1588 
1589   st.assign(word);
1590 
1591   for (i = cmin; i < cmax; i++) {
1592     // go to end of the UTF-8 character
1593     if (utf8) {
1594       for (; (st[i] & 0xc0) == 0x80; i++)
1595         ;
1596       if (i >= cmax)
1597         return NULL;
1598     }
1599 
1600     words = oldwords;
1601     int onlycpdrule = (words) ? 1 : 0;
1602 
1603     do {  // onlycpdrule loop
1604 
1605       oldnumsyllable = numsyllable;
1606       oldwordnum = wordnum;
1607       checked_prefix = 0;
1608 
1609       do {  // simplified checkcompoundpattern loop
1610 
1611         if (scpd > 0) {
1612           for (; scpd <= checkcpdtable.size() &&
1613                  (checkcpdtable[scpd - 1].pattern3.empty() ||
1614                   strncmp(word.c_str() + i, checkcpdtable[scpd - 1].pattern3.c_str(),
1615                           checkcpdtable[scpd - 1].pattern3.size()) != 0);
1616                scpd++)
1617             ;
1618 
1619           if (scpd > checkcpdtable.size())
1620             break;  // break simplified checkcompoundpattern loop
1621           st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern);
1622           soldi = i;
1623           i += checkcpdtable[scpd - 1].pattern.size();
1624           st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2);
1625           st.replace(i + checkcpdtable[scpd - 1].pattern2.size(), std::string::npos,
1626                  word.substr(soldi + checkcpdtable[scpd - 1].pattern3.size()));
1627 
1628           oldlen = len;
1629           len += checkcpdtable[scpd - 1].pattern.size() +
1630                  checkcpdtable[scpd - 1].pattern2.size() -
1631                  checkcpdtable[scpd - 1].pattern3.size();
1632           oldcmin = cmin;
1633           oldcmax = cmax;
1634           setcminmax(&cmin, &cmax, st.c_str(), len);
1635 
1636           cmax = len - cpdmin + 1;
1637         }
1638 
1639         ch = st[i];
1640         st[i] = '\0';
1641 
1642         sfx = NULL;
1643         pfx = NULL;
1644 
1645         // FIRST WORD
1646 
1647         affixed = 1;
1648         rv = lookup(st.c_str());  // perhaps without prefix
1649 
1650         // search homonym with compound flag
1651         while ((rv) && !hu_mov_rule &&
1652                ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1653                 !((compoundflag && !words && !onlycpdrule &&
1654                    TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1655                   (compoundbegin && !wordnum && !onlycpdrule &&
1656                    TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1657                   (compoundmiddle && wordnum && !words && !onlycpdrule &&
1658                    TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
1659                   (!defcpdtable.empty() && onlycpdrule &&
1660                    ((!words && !wordnum &&
1661                      defcpd_check(&words, wnum, rv, rwords, 0)) ||
1662                     (words &&
1663                      defcpd_check(&words, wnum, rv, rwords, 0))))) ||
1664                 (scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL &&
1665                  !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) {
1666           rv = rv->next_homonym;
1667         }
1668 
1669         if (rv)
1670           affixed = 0;
1671 
1672         if (!rv) {
1673           if (onlycpdrule)
1674             break;
1675           if (compoundflag &&
1676               !(rv = prefix_check(st.c_str(), i,
1677                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1678                                   compoundflag))) {
1679             if (((rv = suffix_check(
1680                       st.c_str(), i, 0, NULL, FLAG_NULL, compoundflag,
1681                       hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1682                  (compoundmoresuffixes &&
1683                   (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
1684                 !hu_mov_rule && sfx->getCont() &&
1685                 ((compoundforbidflag &&
1686                   TESTAFF(sfx->getCont(), compoundforbidflag,
1687                           sfx->getContLen())) ||
1688                  (compoundend &&
1689                   TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
1690               rv = NULL;
1691             }
1692           }
1693 
1694           if (rv ||
1695               (((wordnum == 0) && compoundbegin &&
1696                 ((rv = suffix_check(
1697                       st.c_str(), i, 0, NULL, FLAG_NULL, compoundbegin,
1698                       hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1699                  (compoundmoresuffixes &&
1700                   (rv = suffix_check_twosfx(
1701                        st.c_str(), i, 0, NULL,
1702                        compoundbegin))) ||  // twofold suffixes + compound
1703                  (rv = prefix_check(st.c_str(), i,
1704                                     hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1705                                     compoundbegin)))) ||
1706                ((wordnum > 0) && compoundmiddle &&
1707                 ((rv = suffix_check(
1708                       st.c_str(), i, 0, NULL, FLAG_NULL, compoundmiddle,
1709                       hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1710                  (compoundmoresuffixes &&
1711                   (rv = suffix_check_twosfx(
1712                        st.c_str(), i, 0, NULL,
1713                        compoundmiddle))) ||  // twofold suffixes + compound
1714                  (rv = prefix_check(st.c_str(), i,
1715                                     hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1716                                     compoundmiddle))))))
1717             checked_prefix = 1;
1718           // else check forbiddenwords and needaffix
1719         } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1720                                 TESTAFF(rv->astr, needaffix, rv->alen) ||
1721                                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1722                                 (is_sug && nosuggest &&
1723                                  TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1724           st[i] = ch;
1725           // continue;
1726           break;
1727         }
1728 
1729         // check non_compound flag in suffix and prefix
1730         if ((rv) && !hu_mov_rule &&
1731             ((pfx && pfx->getCont() &&
1732               TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
1733              (sfx && sfx->getCont() &&
1734               TESTAFF(sfx->getCont(), compoundforbidflag,
1735                       sfx->getContLen())))) {
1736           rv = NULL;
1737         }
1738 
1739         // check compoundend flag in suffix and prefix
1740         if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
1741             ((pfx && pfx->getCont() &&
1742               TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
1743              (sfx && sfx->getCont() &&
1744               TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
1745           rv = NULL;
1746         }
1747 
1748         // check compoundmiddle flag in suffix and prefix
1749         if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
1750             !hu_mov_rule &&
1751             ((pfx && pfx->getCont() &&
1752               TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
1753              (sfx && sfx->getCont() &&
1754               TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
1755           rv = NULL;
1756         }
1757 
1758         // check forbiddenwords
1759         if ((rv) && (rv->astr) &&
1760             (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1761              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1762              (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1763           return NULL;
1764         }
1765 
1766         // increment word number, if the second root has a compoundroot flag
1767         if ((rv) && compoundroot &&
1768             (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1769           wordnum++;
1770         }
1771 
1772         // first word is acceptable in compound words?
1773         if (((rv) &&
1774              (checked_prefix || (words && words[wnum]) ||
1775               (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1776               ((oldwordnum == 0) && compoundbegin &&
1777                TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1778               ((oldwordnum > 0) && compoundmiddle &&
1779                TESTAFF(rv->astr, compoundmiddle, rv->alen))
1780 
1781               // LANG_hu section: spec. Hungarian rule
1782               || ((langnum == LANG_hu) && hu_mov_rule &&
1783                   (TESTAFF(
1784                        rv->astr, 'F',
1785                        rv->alen) ||  // XXX hardwired Hungarian dictionary codes
1786                    TESTAFF(rv->astr, 'G', rv->alen) ||
1787                    TESTAFF(rv->astr, 'H', rv->alen)))
1788               // END of LANG_hu section
1789               ) &&
1790              (
1791                  // test CHECKCOMPOUNDPATTERN conditions
1792                  scpd == 0 || checkcpdtable[scpd - 1].cond == FLAG_NULL ||
1793                  TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)) &&
1794              !((checkcompoundtriple && scpd == 0 &&
1795                 !words &&  // test triple letters
1796                 (word[i - 1] == word[i]) &&
1797                 (((i > 1) && (word[i - 1] == word[i - 2])) ||
1798                  ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
1799                  )) ||
1800                (checkcompoundcase && scpd == 0 && !words &&
1801                 cpdcase_check(word.c_str(), i))))
1802             // LANG_hu section: spec. Hungarian rule
1803             || ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
1804                 (rv = affix_check(st.c_str(), i)) &&
1805                 (sfx && sfx->getCont() &&
1806                  (  // XXX hardwired Hungarian dic. codes
1807                      TESTAFF(sfx->getCont(), (unsigned short)'x',
1808                              sfx->getContLen()) ||
1809                      TESTAFF(
1810                          sfx->getCont(), (unsigned short)'%',
1811                          sfx->getContLen()))))) {  // first word is ok condition
1812 
1813           // LANG_hu section: spec. Hungarian rule
1814           if (langnum == LANG_hu) {
1815             // calculate syllable number of the word
1816             numsyllable += get_syllable(st.substr(0, i));
1817             // + 1 word, if syllable number of the prefix > 1 (hungarian
1818             // convention)
1819             if (pfx && (get_syllable(pfx->getKey()) > 1))
1820               wordnum++;
1821           }
1822           // END of LANG_hu section
1823 
1824           // NEXT WORD(S)
1825           rv_first = rv;
1826           st[i] = ch;
1827 
1828           do {  // striple loop
1829 
1830             // check simplifiedtriple
1831             if (simplifiedtriple) {
1832               if (striple) {
1833                 checkedstriple = 1;
1834                 i--;  // check "fahrt" instead of "ahrt" in "Schiffahrt"
1835               } else if (i > 2 && word[i - 1] == word[i - 2])
1836                 striple = 1;
1837             }
1838 
1839             rv = lookup(st.c_str() + i);  // perhaps without prefix
1840 
1841             // search homonym with compound flag
1842             while ((rv) &&
1843                    ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1844                     !((compoundflag && !words &&
1845                        TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1846                       (compoundend && !words &&
1847                        TESTAFF(rv->astr, compoundend, rv->alen)) ||
1848                       (!defcpdtable.empty() && words &&
1849                        defcpd_check(&words, wnum + 1, rv, NULL, 1))) ||
1850                     (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL &&
1851                      !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2,
1852                               rv->alen)))) {
1853               rv = rv->next_homonym;
1854             }
1855 
1856             // check FORCEUCASE
1857             if (rv && forceucase && (rv) &&
1858                 (TESTAFF(rv->astr, forceucase, rv->alen)) &&
1859                 !(info && *info & SPELL_ORIGCAP))
1860               rv = NULL;
1861 
1862             if (rv && words && words[wnum + 1])
1863               return rv_first;
1864 
1865             oldnumsyllable2 = numsyllable;
1866             oldwordnum2 = wordnum;
1867 
1868             // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary
1869             // code
1870             if ((rv) && (langnum == LANG_hu) &&
1871                 (TESTAFF(rv->astr, 'I', rv->alen)) &&
1872                 !(TESTAFF(rv->astr, 'J', rv->alen))) {
1873               numsyllable--;
1874             }
1875             // END of LANG_hu section
1876 
1877             // increment word number, if the second root has a compoundroot flag
1878             if ((rv) && (compoundroot) &&
1879                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1880               wordnum++;
1881             }
1882 
1883             // check forbiddenwords
1884             if ((rv) && (rv->astr) &&
1885                 (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1886                  TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1887                  (is_sug && nosuggest &&
1888                   TESTAFF(rv->astr, nosuggest, rv->alen))))
1889               return NULL;
1890 
1891             // second word is acceptable, as a root?
1892             // hungarian conventions: compounding is acceptable,
1893             // when compound forms consist of 2 words, or if more,
1894             // then the syllable number of root words must be 6, or lesser.
1895 
1896             if ((rv) &&
1897                 ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1898                  (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
1899                 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
1900                  ((cpdmaxsyllable != 0) &&
1901                   (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
1902                    cpdmaxsyllable))) &&
1903                 (
1904                     // test CHECKCOMPOUNDPATTERN
1905                     checkcpdtable.empty() || scpd != 0 ||
1906                     !cpdpat_check(word.c_str(), i, rv_first, rv, 0)) &&
1907                 ((!checkcompounddup || (rv != rv_first)))
1908                 // test CHECKCOMPOUNDPATTERN conditions
1909                 &&
1910                 (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
1911                  TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) {
1912               // forbid compound word, if it is a non compound word with typical
1913               // fault
1914               if (checkcompoundrep && cpdrep_check(word.c_str(), len))
1915                 return NULL;
1916               return rv_first;
1917             }
1918 
1919             numsyllable = oldnumsyllable2;
1920             wordnum = oldwordnum2;
1921 
1922             // perhaps second word has prefix or/and suffix
1923             sfx = NULL;
1924             sfxflag = FLAG_NULL;
1925             rv = (compoundflag && !onlycpdrule)
1926                      ? affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundflag,
1927                                    IN_CPD_END)
1928                      : NULL;
1929             if (!rv && compoundend && !onlycpdrule) {
1930               sfx = NULL;
1931               pfx = NULL;
1932               rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundend,
1933                                IN_CPD_END);
1934             }
1935 
1936             if (!rv && !defcpdtable.empty() && words) {
1937               rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), 0, IN_CPD_END);
1938               if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1))
1939                 return rv_first;
1940               rv = NULL;
1941             }
1942 
1943             // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1944             if (rv &&
1945                 !(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
1946                   TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen)))
1947               rv = NULL;
1948 
1949             // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
1950             if (rv && !checkcpdtable.empty() && scpd == 0 &&
1951                 cpdpat_check(word.c_str(), i, rv_first, rv, affixed))
1952               rv = NULL;
1953 
1954             // check non_compound flag in suffix and prefix
1955             if ((rv) && ((pfx && pfx->getCont() &&
1956                           TESTAFF(pfx->getCont(), compoundforbidflag,
1957                                   pfx->getContLen())) ||
1958                          (sfx && sfx->getCont() &&
1959                           TESTAFF(sfx->getCont(), compoundforbidflag,
1960                                   sfx->getContLen())))) {
1961               rv = NULL;
1962             }
1963 
1964             // check FORCEUCASE
1965             if (rv && forceucase && (rv) &&
1966                 (TESTAFF(rv->astr, forceucase, rv->alen)) &&
1967                 !(info && *info & SPELL_ORIGCAP))
1968               rv = NULL;
1969 
1970             // check forbiddenwords
1971             if ((rv) && (rv->astr) &&
1972                 (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1973                  TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1974                  (is_sug && nosuggest &&
1975                   TESTAFF(rv->astr, nosuggest, rv->alen))))
1976               return NULL;
1977 
1978             // pfxappnd = prefix of word+i, or NULL
1979             // calculate syllable number of prefix.
1980             // hungarian convention: when syllable number of prefix is more,
1981             // than 1, the prefix+word counts as two words.
1982 
1983             if (langnum == LANG_hu) {
1984               // calculate syllable number of the word
1985               numsyllable += get_syllable(word.c_str() + i);
1986 
1987               // - affix syllable num.
1988               // XXX only second suffix (inflections, not derivations)
1989               if (sfxappnd) {
1990                 std::string tmp(sfxappnd);
1991                 reverseword(tmp);
1992                 numsyllable -= get_syllable(tmp) + sfxextra;
1993               }
1994 
1995               // + 1 word, if syllable number of the prefix > 1 (hungarian
1996               // convention)
1997               if (pfx && (get_syllable(pfx->getKey()) > 1))
1998                 wordnum++;
1999 
2000               // increment syllable num, if last word has a SYLLABLENUM flag
2001               // and the suffix is beginning `s'
2002 
2003               if (!cpdsyllablenum.empty()) {
2004                 switch (sfxflag) {
2005                   case 'c': {
2006                     numsyllable += 2;
2007                     break;
2008                   }
2009                   case 'J': {
2010                     numsyllable += 1;
2011                     break;
2012                   }
2013                   case 'I': {
2014                     if (rv && TESTAFF(rv->astr, 'J', rv->alen))
2015                       numsyllable += 1;
2016                     break;
2017                   }
2018                 }
2019               }
2020             }
2021 
2022             // increment word number, if the second word has a compoundroot flag
2023             if ((rv) && (compoundroot) &&
2024                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2025               wordnum++;
2026             }
2027 
2028             // second word is acceptable, as a word with prefix or/and suffix?
2029             // hungarian conventions: compounding is acceptable,
2030             // when compound forms consist 2 word, otherwise
2031             // the syllable number of root words is 6, or lesser.
2032             if ((rv) &&
2033                 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2034                  ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
2035                 ((!checkcompounddup || (rv != rv_first)))) {
2036               // forbid compound word, if it is a non compound word with typical
2037               // fault
2038               if (checkcompoundrep && cpdrep_check(word.c_str(), len))
2039                 return NULL;
2040               return rv_first;
2041             }
2042 
2043             numsyllable = oldnumsyllable2;
2044             wordnum = oldwordnum2;
2045 
2046             // perhaps second word is a compound word (recursive call)
2047             if (wordnum + 2 < maxwordnum) {
2048               rv = compound_check(st.substr(i), wordnum + 1,
2049                                   numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
2050                                   is_sug, info);
2051 
2052               if (rv && !checkcpdtable.empty() &&
2053                   ((scpd == 0 &&
2054                     cpdpat_check(word.c_str(), i, rv_first, rv, affixed)) ||
2055                    (scpd != 0 &&
2056                     !cpdpat_check(word.c_str(), i, rv_first, rv, affixed))))
2057                 rv = NULL;
2058             } else {
2059               rv = NULL;
2060             }
2061             if (rv) {
2062               // forbid compound word, if it is a non compound word with typical
2063               // fault
2064               if (checkcompoundrep || forbiddenword) {
2065 
2066                 if (checkcompoundrep && cpdrep_check(word.c_str(), len))
2067                   return NULL;
2068 
2069                 // check first part
2070                 if (strncmp(rv->word, word.c_str() + i, rv->blen) == 0) {
2071                   char r = st[i + rv->blen];
2072                   st[i + rv->blen] = '\0';
2073 
2074                   if (checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) {
2075                     st[ + i + rv->blen] = r;
2076                     continue;
2077                   }
2078 
2079                   if (forbiddenword) {
2080                     struct hentry* rv2 = lookup(word.c_str());
2081                     if (!rv2)
2082                       rv2 = affix_check(word.c_str(), len);
2083                     if (rv2 && rv2->astr &&
2084                         TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
2085                         (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) {
2086                       return NULL;
2087                     }
2088                   }
2089                   st[i + rv->blen] = r;
2090                 }
2091               }
2092               return rv_first;
2093             }
2094           } while (striple && !checkedstriple);  // end of striple loop
2095 
2096           if (checkedstriple) {
2097             i++;
2098             checkedstriple = 0;
2099             striple = 0;
2100           }
2101 
2102         }  // first word is ok condition
2103 
2104         if (soldi != 0) {
2105           i = soldi;
2106           soldi = 0;
2107           len = oldlen;
2108           cmin = oldcmin;
2109           cmax = oldcmax;
2110         }
2111         scpd++;
2112 
2113       } while (!onlycpdrule && simplifiedcpd &&
2114                scpd <= checkcpdtable.size());  // end of simplifiedcpd loop
2115 
2116       scpd = 0;
2117       wordnum = oldwordnum;
2118       numsyllable = oldnumsyllable;
2119 
2120       if (soldi != 0) {
2121         i = soldi;
2122         st.assign(word);  // XXX add more optim.
2123         soldi = 0;
2124       } else
2125         st[i] = ch;
2126 
2127     } while (!defcpdtable.empty() && oldwordnum == 0 &&
2128              onlycpdrule++ < 1);  // end of onlycpd loop
2129   }
2130 
2131   return NULL;
2132 }
2133 
2134 // check if compound word is correctly spelled; each accepted split appends its
2135 // morphological description to `result`. hu_mov_rule = spec. Hungarian rule (XXX)
compound_check_morph(const char * word,int len,short wordnum,short numsyllable,short maxwordnum,short wnum,hentry ** words,hentry ** rwords,char hu_mov_rule,std::string & result,const std::string * partresult)2136 int AffixMgr::compound_check_morph(const char* word,
2137                                    int len,
2138                                    short wordnum,
2139                                    short numsyllable,
2140                                    short maxwordnum,
2141                                    short wnum,
2142                                    hentry** words,
2143                                    hentry** rwords,
2144                                    char hu_mov_rule,
2145                                    std::string& result,
2146                                    const std::string* partresult) {
2147   int i;
2148   short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
2149   int ok = 0;
2150 
2151   struct hentry* rv = NULL;
2152   struct hentry* rv_first;
2153   std::string st;
2154   char ch;
2155 
2156   int checked_prefix;
2157   std::string presult;
2158 
2159   int cmin;
2160   int cmax;
2161 
2162   char affixed = 0;
2163   hentry** oldwords = words;
2164 
2165   setcminmax(&cmin, &cmax, word, len);
2166 
2167   st.assign(word);
2168 
2169   for (i = cmin; i < cmax; i++) {
2170     // go to end of the UTF-8 character
2171     if (utf8) {
2172       for (; (st[i] & 0xc0) == 0x80; i++)
2173         ;
2174       if (i >= cmax)
2175         return 0;
2176     }
2177 
2178     words = oldwords;
2179     int onlycpdrule = (words) ? 1 : 0;
2180 
2181     do {  // onlycpdrule loop
2182 
2183       oldnumsyllable = numsyllable;
2184       oldwordnum = wordnum;
2185       checked_prefix = 0;
2186 
2187       ch = st[i];
2188       st[i] = '\0';
2189       sfx = NULL;
2190 
2191       // FIRST WORD
2192 
2193       affixed = 1;
2194 
2195       presult.clear();
2196       if (partresult)
2197         presult.append(*partresult);
2198 
2199       rv = lookup(st.c_str());  // perhaps without prefix
2200 
2201       // search homonym with compound flag
2202       while ((rv) && !hu_mov_rule &&
2203              ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2204               !((compoundflag && !words && !onlycpdrule &&
2205                  TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2206                 (compoundbegin && !wordnum && !onlycpdrule &&
2207                  TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2208                 (compoundmiddle && wordnum && !words && !onlycpdrule &&
2209                  TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
2210                 (!defcpdtable.empty() && onlycpdrule &&
2211                  ((!words && !wordnum &&
2212                    defcpd_check(&words, wnum, rv, rwords, 0)) ||
2213                   (words &&
2214                    defcpd_check(&words, wnum, rv, rwords, 0))))))) {
2215         rv = rv->next_homonym;
2216       }
2217 
2218       if (rv)
2219         affixed = 0;
2220 
2221       if (rv) {
2222         presult.push_back(MSEP_FLD);
2223         presult.append(MORPH_PART);
2224         presult.append(st.c_str());
2225         if (!HENTRY_FIND(rv, MORPH_STEM)) {
2226           presult.push_back(MSEP_FLD);
2227           presult.append(MORPH_STEM);
2228           presult.append(st.c_str());
2229         }
2230         if (HENTRY_DATA(rv)) {
2231           presult.push_back(MSEP_FLD);
2232           presult.append(HENTRY_DATA2(rv));
2233         }
2234       }
2235 
2236       if (!rv) {
2237         if (compoundflag &&
2238             !(rv =
2239                   prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2240                                compoundflag))) {
2241           if (((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
2242                                   compoundflag,
2243                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2244                (compoundmoresuffixes &&
2245                 (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
2246               !hu_mov_rule && sfx->getCont() &&
2247               ((compoundforbidflag &&
2248                 TESTAFF(sfx->getCont(), compoundforbidflag,
2249                         sfx->getContLen())) ||
2250                (compoundend &&
2251                 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
2252             rv = NULL;
2253           }
2254         }
2255 
2256         if (rv ||
2257             (((wordnum == 0) && compoundbegin &&
2258               ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
2259                                   compoundbegin,
2260                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2261                (compoundmoresuffixes &&
2262                 (rv = suffix_check_twosfx(
2263                      st.c_str(), i, 0, NULL,
2264                      compoundbegin))) ||  // twofold suffix+compound
2265                (rv = prefix_check(st.c_str(), i,
2266                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2267                                   compoundbegin)))) ||
2268              ((wordnum > 0) && compoundmiddle &&
2269               ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
2270                                   compoundmiddle,
2271                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2272                (compoundmoresuffixes &&
2273                 (rv = suffix_check_twosfx(
2274                      st.c_str(), i, 0, NULL,
2275                      compoundmiddle))) ||  // twofold suffix+compound
2276                (rv = prefix_check(st.c_str(), i,
2277                                   hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2278                                   compoundmiddle)))))) {
2279           std::string p;
2280           if (compoundflag)
2281             p = affix_check_morph(st.c_str(), i, compoundflag);
2282           if (p.empty()) {
2283             if ((wordnum == 0) && compoundbegin) {
2284               p = affix_check_morph(st.c_str(), i, compoundbegin);
2285             } else if ((wordnum > 0) && compoundmiddle) {
2286               p = affix_check_morph(st.c_str(), i, compoundmiddle);
2287             }
2288           }
2289           if (!p.empty()) {
2290             presult.push_back(MSEP_FLD);
2291             presult.append(MORPH_PART);
2292             presult.append(st.c_str());
2293             line_uniq_app(p, MSEP_REC);
2294             presult.append(p);
2295           }
2296           checked_prefix = 1;
2297         }
2298         // else check forbiddenwords
2299       } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2300                               TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
2301                               TESTAFF(rv->astr, needaffix, rv->alen))) {
2302         st[i] = ch;
2303         continue;
2304       }
2305 
2306       // check non_compound flag in suffix and prefix
2307       if ((rv) && !hu_mov_rule &&
2308           ((pfx && pfx->getCont() &&
2309             TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
2310            (sfx && sfx->getCont() &&
2311             TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) {
2312         continue;
2313       }
2314 
2315       // check compoundend flag in suffix and prefix
2316       if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
2317           ((pfx && pfx->getCont() &&
2318             TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
2319            (sfx && sfx->getCont() &&
2320             TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
2321         continue;
2322       }
2323 
2324       // check compoundmiddle flag in suffix and prefix
2325       if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
2326           !hu_mov_rule &&
2327           ((pfx && pfx->getCont() &&
2328             TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
2329            (sfx && sfx->getCont() &&
2330             TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
2331         rv = NULL;
2332       }
2333 
2334       // check forbiddenwords
2335       if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2336                                  TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)))
2337         continue;
2338 
2339       // increment word number, if the second root has a compoundroot flag
2340       if ((rv) && (compoundroot) &&
2341           (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2342         wordnum++;
2343       }
2344 
2345       // first word is acceptable in compound words?
2346       if (((rv) &&
2347            (checked_prefix || (words && words[wnum]) ||
2348             (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2349             ((oldwordnum == 0) && compoundbegin &&
2350              TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2351             ((oldwordnum > 0) && compoundmiddle &&
2352              TESTAFF(rv->astr, compoundmiddle, rv->alen))
2353             // LANG_hu section: spec. Hungarian rule
2354             || ((langnum == LANG_hu) &&  // hu_mov_rule
2355                 hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) ||
2356                                 TESTAFF(rv->astr, 'G', rv->alen) ||
2357                                 TESTAFF(rv->astr, 'H', rv->alen)))
2358             // END of LANG_hu section
2359             ) &&
2360            !((checkcompoundtriple && !words &&  // test triple letters
2361               (word[i - 1] == word[i]) &&
2362               (((i > 1) && (word[i - 1] == word[i - 2])) ||
2363                ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
2364                )) ||
2365              (
2366                  // test CHECKCOMPOUNDPATTERN
2367                  !checkcpdtable.empty() && !words &&
2368                  cpdpat_check(word, i, rv, NULL, affixed)) ||
2369              (checkcompoundcase && !words && cpdcase_check(word, i))))
2370           // LANG_hu section: spec. Hungarian rule
2371           ||
2372           ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
2373            (rv = affix_check(st.c_str(), i)) &&
2374            (sfx && sfx->getCont() &&
2375             (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) ||
2376              TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen()))))
2377           // END of LANG_hu section
2378           ) {
2379         // LANG_hu section: spec. Hungarian rule
2380         if (langnum == LANG_hu) {
2381           // calculate syllable number of the word
2382           numsyllable += get_syllable(st.substr(0, i));
2383 
2384           // + 1 word, if syllable number of the prefix > 1 (hungarian
2385           // convention)
2386           if (pfx && (get_syllable(pfx->getKey()) > 1))
2387             wordnum++;
2388         }
2389         // END of LANG_hu section
2390 
2391         // NEXT WORD(S)
2392         rv_first = rv;
2393         rv = lookup((word + i));  // perhaps without prefix
2394 
2395         // search homonym with compound flag
2396         while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2397                         !((compoundflag && !words &&
2398                            TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2399                           (compoundend && !words &&
2400                            TESTAFF(rv->astr, compoundend, rv->alen)) ||
2401                           (!defcpdtable.empty() && words &&
2402                            defcpd_check(&words, wnum + 1, rv, NULL, 1))))) {
2403           rv = rv->next_homonym;
2404         }
2405 
2406         if (rv && words && words[wnum + 1]) {
2407           result.append(presult);
2408           result.append(" ");
2409           result.append(MORPH_PART);
2410           result.append(word + i);
2411           if (complexprefixes && HENTRY_DATA(rv))
2412             result.append(HENTRY_DATA2(rv));
2413           if (!HENTRY_FIND(rv, MORPH_STEM)) {
2414             result.append(" ");
2415             result.append(MORPH_STEM);
2416             result.append(HENTRY_WORD(rv));
2417           }
2418           // store the pointer of the hash entry
2419           if (!complexprefixes && HENTRY_DATA(rv)) {
2420             result.append(" ");
2421             result.append(HENTRY_DATA2(rv));
2422           }
2423           result.append("\n");
2424           return 0;
2425         }
2426 
2427         oldnumsyllable2 = numsyllable;
2428         oldwordnum2 = wordnum;
2429 
2430         // LANG_hu section: spec. Hungarian rule
2431         if ((rv) && (langnum == LANG_hu) &&
2432             (TESTAFF(rv->astr, 'I', rv->alen)) &&
2433             !(TESTAFF(rv->astr, 'J', rv->alen))) {
2434           numsyllable--;
2435         }
2436         // END of LANG_hu section
2437         // increment word number, if the second root has a compoundroot flag
2438         if ((rv) && (compoundroot) &&
2439             (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2440           wordnum++;
2441         }
2442 
2443         // check forbiddenwords
2444         if ((rv) && (rv->astr) &&
2445             (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2446              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
2447           st[i] = ch;
2448           continue;
2449         }
2450 
2451         // second word is acceptable, as a root?
2452         // hungarian conventions: compounding is acceptable,
2453         // when compound forms consist of 2 words, or if more,
2454         // then the syllable number of root words must be 6, or lesser.
2455         if ((rv) &&
2456             ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2457              (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
2458             (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2459              ((cpdmaxsyllable != 0) &&
2460               (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
2461                cpdmaxsyllable))) &&
2462             ((!checkcompounddup || (rv != rv_first)))) {
2463           // bad compound word
2464           result.append(presult);
2465           result.append(" ");
2466           result.append(MORPH_PART);
2467           result.append(word + i);
2468 
2469           if (HENTRY_DATA(rv)) {
2470             if (complexprefixes)
2471               result.append(HENTRY_DATA2(rv));
2472             if (!HENTRY_FIND(rv, MORPH_STEM)) {
2473               result.append(" ");
2474               result.append(MORPH_STEM);
2475               result.append(HENTRY_WORD(rv));
2476             }
2477             // store the pointer of the hash entry
2478             if (!complexprefixes) {
2479               result.append(" ");
2480               result.append(HENTRY_DATA2(rv));
2481             }
2482           }
2483           result.append("\n");
2484           ok = 1;
2485         }
2486 
2487         numsyllable = oldnumsyllable2;
2488         wordnum = oldwordnum2;
2489 
2490         // perhaps second word has prefix or/and suffix
2491         sfx = NULL;
2492         sfxflag = FLAG_NULL;
2493 
2494         if (compoundflag && !onlycpdrule)
2495           rv = affix_check((word + i), strlen(word + i), compoundflag);
2496         else
2497           rv = NULL;
2498 
2499         if (!rv && compoundend && !onlycpdrule) {
2500           sfx = NULL;
2501           pfx = NULL;
2502           rv = affix_check((word + i), strlen(word + i), compoundend);
2503         }
2504 
2505         if (!rv && !defcpdtable.empty() && words) {
2506           rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END);
2507           if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
2508             std::string m;
2509             if (compoundflag)
2510               m = affix_check_morph((word + i), strlen(word + i), compoundflag);
2511             if (m.empty() && compoundend) {
2512               m = affix_check_morph((word + i), strlen(word + i), compoundend);
2513             }
2514             result.append(presult);
2515             if (!m.empty()) {
2516               result.push_back(MSEP_FLD);
2517               result.append(MORPH_PART);
2518               result.append(word + i);
2519               line_uniq_app(m, MSEP_REC);
2520               result.append(m);
2521             }
2522             result.append("\n");
2523             ok = 1;
2524           }
2525         }
2526 
2527         // check non_compound flag in suffix and prefix
2528         if ((rv) &&
2529             ((pfx && pfx->getCont() &&
2530               TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
2531              (sfx && sfx->getCont() &&
2532               TESTAFF(sfx->getCont(), compoundforbidflag,
2533                       sfx->getContLen())))) {
2534           rv = NULL;
2535         }
2536 
2537         // check forbiddenwords
2538         if ((rv) && (rv->astr) &&
2539             (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2540              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) &&
2541             (!TESTAFF(rv->astr, needaffix, rv->alen))) {
2542           st[i] = ch;
2543           continue;
2544         }
2545 
2546         if (langnum == LANG_hu) {
2547           // calculate syllable number of the word
2548           numsyllable += get_syllable(word + i);
2549 
2550           // - affix syllable num.
2551           // XXX only second suffix (inflections, not derivations)
2552           if (sfxappnd) {
2553             std::string tmp(sfxappnd);
2554             reverseword(tmp);
2555             numsyllable -= get_syllable(tmp) + sfxextra;
2556           }
2557 
2558           // + 1 word, if syllable number of the prefix > 1 (hungarian
2559           // convention)
2560           if (pfx && (get_syllable(pfx->getKey()) > 1))
2561             wordnum++;
2562 
2563           // increment syllable num, if last word has a SYLLABLENUM flag
2564           // and the suffix is beginning `s'
2565 
2566           if (!cpdsyllablenum.empty()) {
2567             switch (sfxflag) {
2568               case 'c': {
2569                 numsyllable += 2;
2570                 break;
2571               }
2572               case 'J': {
2573                 numsyllable += 1;
2574                 break;
2575               }
2576               case 'I': {
2577                 if (rv && TESTAFF(rv->astr, 'J', rv->alen))
2578                   numsyllable += 1;
2579                 break;
2580               }
2581             }
2582           }
2583         }
2584 
2585         // increment word number, if the second word has a compoundroot flag
2586         if ((rv) && (compoundroot) &&
2587             (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2588           wordnum++;
2589         }
2590         // second word is acceptable, as a word with prefix or/and suffix?
2591         // hungarian conventions: compounding is acceptable,
2592         // when compound forms consist 2 word, otherwise
2593         // the syllable number of root words is 6, or lesser.
2594         if ((rv) &&
2595             (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2596              ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
2597             ((!checkcompounddup || (rv != rv_first)))) {
2598           std::string m;
2599           if (compoundflag)
2600             m = affix_check_morph((word + i), strlen(word + i), compoundflag);
2601           if (m.empty() && compoundend) {
2602             m = affix_check_morph((word + i), strlen(word + i), compoundend);
2603           }
2604           result.append(presult);
2605           if (!m.empty()) {
2606             result.push_back(MSEP_FLD);
2607             result.append(MORPH_PART);
2608             result.append(word + 1);
2609             line_uniq_app(m, MSEP_REC);
2610             result.append(m);
2611           }
2612           result.push_back(MSEP_REC);
2613           ok = 1;
2614         }
2615 
2616         numsyllable = oldnumsyllable2;
2617         wordnum = oldwordnum2;
2618 
2619         // perhaps second word is a compound word (recursive call)
2620         if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
2621           compound_check_morph((word + i), strlen(word + i), wordnum + 1,
2622                                numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
2623                                result, &presult);
2624         } else {
2625           rv = NULL;
2626         }
2627       }
2628       st[i] = ch;
2629       wordnum = oldwordnum;
2630       numsyllable = oldnumsyllable;
2631 
2632     } while (!defcpdtable.empty() && oldwordnum == 0 &&
2633              onlycpdrule++ < 1);  // end of onlycpd loop
2634   }
2635   return 0;
2636 }
2637 
2638 
isRevSubset(const char * s1,const char * end_of_s2,int len)2639 inline int AffixMgr::isRevSubset(const char* s1,
2640                                  const char* end_of_s2,
2641                                  int len) {
2642   while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
2643     s1++;
2644     end_of_s2--;
2645     len--;
2646   }
2647   return (*s1 == '\0');
2648 }
2649 
2650 // check word for suffixes
suffix_check(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,char in_compound)2651 struct hentry* AffixMgr::suffix_check(const char* word,
2652                                       int len,
2653                                       int sfxopts,
2654                                       PfxEntry* ppfx,
2655                                       const FLAG cclass,
2656                                       const FLAG needflag,
2657                                       char in_compound) {
2658   struct hentry* rv = NULL;
2659   PfxEntry* ep = ppfx;
2660 
2661   // first handle the special case of 0 length suffixes
2662   SfxEntry* se = sStart[0];
2663 
2664   while (se) {
2665     if (!cclass || se->getCont()) {
2666       // suffixes are not allowed in beginning of compounds
2667       if ((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
2668            // except when signed with compoundpermitflag flag
2669            (se->getCont() && compoundpermitflag &&
2670             TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
2671           (!circumfix ||
2672            // no circumfix flag in prefix and suffix
2673            ((!ppfx || !(ep->getCont()) ||
2674              !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2675             (!se->getCont() ||
2676              !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
2677            // circumfix flag in prefix AND suffix
2678            ((ppfx && (ep->getCont()) &&
2679              TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2680             (se->getCont() &&
2681              (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
2682           // fogemorpheme
2683           (in_compound ||
2684            !(se->getCont() &&
2685              (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
2686           // needaffix on prefix or first suffix
2687           (cclass ||
2688            !(se->getCont() &&
2689              TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2690            (ppfx &&
2691             !((ep->getCont()) &&
2692               TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) {
2693         rv = se->checkword(word, len, sfxopts, ppfx,
2694                            (FLAG)cclass, needflag,
2695                            (in_compound ? 0 : onlyincompound));
2696         if (rv) {
2697           sfx = se;  // BUG: sfx not stateless
2698           return rv;
2699         }
2700       }
2701     }
2702     se = se->getNext();
2703   }
2704 
2705   // now handle the general case
2706   if (len == 0)
2707     return NULL;  // FULLSTRIP
2708   unsigned char sp = *((const unsigned char*)(word + len - 1));
2709   SfxEntry* sptr = sStart[sp];
2710 
2711   while (sptr) {
2712     if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2713       // suffixes are not allowed in beginning of compounds
2714       if ((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
2715            // except when signed with compoundpermitflag flag
2716            (sptr->getCont() && compoundpermitflag &&
2717             TESTAFF(sptr->getCont(), compoundpermitflag,
2718                     sptr->getContLen()))) &&
2719           (!circumfix ||
2720            // no circumfix flag in prefix and suffix
2721            ((!ppfx || !(ep->getCont()) ||
2722              !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2723             (!sptr->getCont() ||
2724              !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
2725            // circumfix flag in prefix AND suffix
2726            ((ppfx && (ep->getCont()) &&
2727              TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2728             (sptr->getCont() &&
2729              (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
2730           // fogemorpheme
2731           (in_compound ||
2732            !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
2733                                           sptr->getContLen()))))) &&
2734           // needaffix on prefix or first suffix
2735           (cclass ||
2736            !(sptr->getCont() &&
2737              TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
2738            (ppfx &&
2739             !((ep->getCont()) &&
2740               TESTAFF(ep->getCont(), needaffix, ep->getContLen())))))
2741         if (in_compound != IN_CPD_END || ppfx ||
2742             !(sptr->getCont() &&
2743               TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
2744           rv = sptr->checkword(word, len, sfxopts, ppfx,
2745                                cclass, needflag,
2746                                (in_compound ? 0 : onlyincompound));
2747           if (rv) {
2748             sfx = sptr;                 // BUG: sfx not stateless
2749             sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
2750             if (!sptr->getCont())
2751               sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
2752             // LANG_hu section: spec. Hungarian rule
2753             else if (langnum == LANG_hu && sptr->getKeyLen() &&
2754                      sptr->getKey()[0] == 'i' && sptr->getKey()[1] != 'y' &&
2755                      sptr->getKey()[1] != 't') {
2756               sfxextra = 1;
2757             }
2758             // END of LANG_hu section
2759             return rv;
2760           }
2761         }
2762       sptr = sptr->getNextEQ();
2763     } else {
2764       sptr = sptr->getNextNE();
2765     }
2766   }
2767 
2768   return NULL;
2769 }
2770 
2771 // check word for two-level suffixes
2772 
suffix_check_twosfx(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2773 struct hentry* AffixMgr::suffix_check_twosfx(const char* word,
2774                                              int len,
2775                                              int sfxopts,
2776                                              PfxEntry* ppfx,
2777                                              const FLAG needflag) {
2778   struct hentry* rv = NULL;
2779 
2780   // first handle the special case of 0 length suffixes
2781   SfxEntry* se = sStart[0];
2782   while (se) {
2783     if (contclasses[se->getFlag()]) {
2784       rv = se->check_twosfx(word, len, sfxopts, ppfx, needflag);
2785       if (rv)
2786         return rv;
2787     }
2788     se = se->getNext();
2789   }
2790 
2791   // now handle the general case
2792   if (len == 0)
2793     return NULL;  // FULLSTRIP
2794   unsigned char sp = *((const unsigned char*)(word + len - 1));
2795   SfxEntry* sptr = sStart[sp];
2796 
2797   while (sptr) {
2798     if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2799       if (contclasses[sptr->getFlag()]) {
2800         rv = sptr->check_twosfx(word, len, sfxopts, ppfx, needflag);
2801         if (rv) {
2802           sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
2803           if (!sptr->getCont())
2804             sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
2805           return rv;
2806         }
2807       }
2808       sptr = sptr->getNextEQ();
2809     } else {
2810       sptr = sptr->getNextNE();
2811     }
2812   }
2813 
2814   return NULL;
2815 }
2816 
suffix_check_twosfx_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2817 std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
2818                                                 int len,
2819                                                 int sfxopts,
2820                                                 PfxEntry* ppfx,
2821                                                 const FLAG needflag) {
2822   std::string result;
2823   std::string result2;
2824   std::string result3;
2825 
2826   // first handle the special case of 0 length suffixes
2827   SfxEntry* se = sStart[0];
2828   while (se) {
2829     if (contclasses[se->getFlag()]) {
2830       std::string st = se->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
2831       if (!st.empty()) {
2832         if (ppfx) {
2833           if (ppfx->getMorph()) {
2834             result.append(ppfx->getMorph());
2835             result.append(" ");
2836           } else
2837             debugflag(result, ppfx->getFlag());
2838         }
2839         result.append(st);
2840         if (se->getMorph()) {
2841           result.append(" ");
2842           result.append(se->getMorph());
2843         } else
2844           debugflag(result, se->getFlag());
2845         result.append("\n");
2846       }
2847     }
2848     se = se->getNext();
2849   }
2850 
2851   // now handle the general case
2852   if (len == 0)
2853     return std::string();  // FULLSTRIP
2854   unsigned char sp = *((const unsigned char*)(word + len - 1));
2855   SfxEntry* sptr = sStart[sp];
2856 
2857   while (sptr) {
2858     if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2859       if (contclasses[sptr->getFlag()]) {
2860         std::string st = sptr->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
2861         if (!st.empty()) {
2862           sfxflag = sptr->getFlag();  // BUG: sfxflag not stateless
2863           if (!sptr->getCont())
2864             sfxappnd = sptr->getKey();  // BUG: sfxappnd not stateless
2865           result2.assign(st);
2866 
2867           result3.clear();
2868 
2869           if (sptr->getMorph()) {
2870             result3.append(" ");
2871             result3.append(sptr->getMorph());
2872           } else
2873             debugflag(result3, sptr->getFlag());
2874           strlinecat(result2, result3);
2875           result2.append("\n");
2876           result.append(result2);
2877         }
2878       }
2879       sptr = sptr->getNextEQ();
2880     } else {
2881       sptr = sptr->getNextNE();
2882     }
2883   }
2884 
2885   return result;
2886 }
2887 
suffix_check_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,char in_compound)2888 std::string AffixMgr::suffix_check_morph(const char* word,
2889                                          int len,
2890                                          int sfxopts,
2891                                          PfxEntry* ppfx,
2892                                          const FLAG cclass,
2893                                          const FLAG needflag,
2894                                          char in_compound) {
2895   std::string result;
2896 
2897   struct hentry* rv = NULL;
2898 
2899   PfxEntry* ep = ppfx;
2900 
2901   // first handle the special case of 0 length suffixes
2902   SfxEntry* se = sStart[0];
2903   while (se) {
2904     if (!cclass || se->getCont()) {
2905       // suffixes are not allowed in beginning of compounds
2906       if (((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
2907             // except when signed with compoundpermitflag flag
2908             (se->getCont() && compoundpermitflag &&
2909              TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
2910            (!circumfix ||
2911             // no circumfix flag in prefix and suffix
2912             ((!ppfx || !(ep->getCont()) ||
2913               !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2914              (!se->getCont() ||
2915               !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
2916             // circumfix flag in prefix AND suffix
2917             ((ppfx && (ep->getCont()) &&
2918               TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2919              (se->getCont() &&
2920               (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
2921            // fogemorpheme
2922            (in_compound ||
2923             !((se->getCont() &&
2924                (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
2925            // needaffix on prefix or first suffix
2926            (cclass ||
2927             !(se->getCont() &&
2928               TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2929             (ppfx &&
2930              !((ep->getCont()) &&
2931                TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))))
2932         rv = se->checkword(word, len, sfxopts, ppfx, cclass,
2933                            needflag, FLAG_NULL);
2934       while (rv) {
2935         if (ppfx) {
2936           if (ppfx->getMorph()) {
2937             result.append(ppfx->getMorph());
2938             result.append(" ");
2939           } else
2940             debugflag(result, ppfx->getFlag());
2941         }
2942         if (complexprefixes && HENTRY_DATA(rv))
2943           result.append(HENTRY_DATA2(rv));
2944         if (!HENTRY_FIND(rv, MORPH_STEM)) {
2945           result.append(" ");
2946           result.append(MORPH_STEM);
2947           result.append(HENTRY_WORD(rv));
2948         }
2949 
2950         if (!complexprefixes && HENTRY_DATA(rv)) {
2951           result.append(" ");
2952           result.append(HENTRY_DATA2(rv));
2953         }
2954         if (se->getMorph()) {
2955           result.append(" ");
2956           result.append(se->getMorph());
2957         } else
2958           debugflag(result, se->getFlag());
2959         result.append("\n");
2960         rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
2961       }
2962     }
2963     se = se->getNext();
2964   }
2965 
2966   // now handle the general case
2967   if (len == 0)
2968     return std::string();  // FULLSTRIP
2969   unsigned char sp = *((const unsigned char*)(word + len - 1));
2970   SfxEntry* sptr = sStart[sp];
2971 
2972   while (sptr) {
2973     if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2974       // suffixes are not allowed in beginning of compounds
2975       if (((((in_compound != IN_CPD_BEGIN)) ||  // && !cclass
2976             // except when signed with compoundpermitflag flag
2977             (sptr->getCont() && compoundpermitflag &&
2978              TESTAFF(sptr->getCont(), compoundpermitflag,
2979                      sptr->getContLen()))) &&
2980            (!circumfix ||
2981             // no circumfix flag in prefix and suffix
2982             ((!ppfx || !(ep->getCont()) ||
2983               !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2984              (!sptr->getCont() ||
2985               !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
2986             // circumfix flag in prefix AND suffix
2987             ((ppfx && (ep->getCont()) &&
2988               TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2989              (sptr->getCont() &&
2990               (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
2991            // fogemorpheme
2992            (in_compound ||
2993             !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
2994                                            sptr->getContLen()))))) &&
2995            // needaffix on first suffix
2996            (cclass ||
2997             !(sptr->getCont() &&
2998               TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))))
2999         rv = sptr->checkword(word, len, sfxopts, ppfx, cclass,
3000                              needflag, FLAG_NULL);
3001       while (rv) {
3002         if (ppfx) {
3003           if (ppfx->getMorph()) {
3004             result.append(ppfx->getMorph());
3005             result.append(" ");
3006           } else
3007             debugflag(result, ppfx->getFlag());
3008         }
3009         if (complexprefixes && HENTRY_DATA(rv))
3010           result.append(HENTRY_DATA2(rv));
3011         if (!HENTRY_FIND(rv, MORPH_STEM)) {
3012           result.append(" ");
3013           result.append(MORPH_STEM);
3014           result.append(HENTRY_WORD(rv));
3015         }
3016 
3017         if (!complexprefixes && HENTRY_DATA(rv)) {
3018           result.append(" ");
3019           result.append(HENTRY_DATA2(rv));
3020         }
3021 
3022         if (sptr->getMorph()) {
3023           result.append(" ");
3024           result.append(sptr->getMorph());
3025         } else
3026           debugflag(result, sptr->getFlag());
3027         result.append("\n");
3028         rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
3029       }
3030       sptr = sptr->getNextEQ();
3031     } else {
3032       sptr = sptr->getNextNE();
3033     }
3034   }
3035 
3036   return result;
3037 }
3038 
3039 // check if word with affixes is correctly spelled
affix_check(const char * word,int len,const FLAG needflag,char in_compound)3040 struct hentry* AffixMgr::affix_check(const char* word,
3041                                      int len,
3042                                      const FLAG needflag,
3043                                      char in_compound) {
3044 
3045   // check all prefixes (also crossed with suffixes if allowed)
3046   struct hentry* rv = prefix_check(word, len, in_compound, needflag);
3047   if (rv)
3048     return rv;
3049 
3050   // if still not found check all suffixes
3051   rv = suffix_check(word, len, 0, NULL, FLAG_NULL, needflag, in_compound);
3052 
3053   if (havecontclass) {
3054     sfx = NULL;
3055     pfx = NULL;
3056 
3057     if (rv)
3058       return rv;
3059     // if still not found check all two-level suffixes
3060     rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
3061 
3062     if (rv)
3063       return rv;
3064     // if still not found check all two-level suffixes
3065     rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
3066   }
3067 
3068   return rv;
3069 }
3070 
3071 // check if word with affixes is correctly spelled
affix_check_morph(const char * word,int len,const FLAG needflag,char in_compound)3072 std::string AffixMgr::affix_check_morph(const char* word,
3073                                   int len,
3074                                   const FLAG needflag,
3075                                   char in_compound) {
3076   std::string result;
3077 
3078   // check all prefixes (also crossed with suffixes if allowed)
3079   std::string st = prefix_check_morph(word, len, in_compound);
3080   if (!st.empty()) {
3081     result.append(st);
3082   }
3083 
3084   // if still not found check all suffixes
3085   st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
3086   if (!st.empty()) {
3087     result.append(st);
3088   }
3089 
3090   if (havecontclass) {
3091     sfx = NULL;
3092     pfx = NULL;
3093     // if still not found check all two-level suffixes
3094     st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
3095     if (!st.empty()) {
3096       result.append(st);
3097     }
3098 
3099     // if still not found check all two-level suffixes
3100     st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
3101     if (!st.empty()) {
3102       result.append(st);
3103     }
3104   }
3105 
3106   return result;
3107 }
3108 
3109 // morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields
3110 // in the first line of the inputs
3111 // return 0, if inputs equal
3112 // return 1, if inputs may equal with a secondary suffix
3113 // otherwise return -1
morphcmp(const char * s,const char * t)3114 static int morphcmp(const char* s, const char* t) {
3115   int se = 0;
3116   int te = 0;
3117   const char* sl;
3118   const char* tl;
3119   const char* olds;
3120   const char* oldt;
3121   if (!s || !t)
3122     return 1;
3123   olds = s;
3124   sl = strchr(s, '\n');
3125   s = strstr(s, MORPH_DERI_SFX);
3126   if (!s || (sl && sl < s))
3127     s = strstr(olds, MORPH_INFL_SFX);
3128   if (!s || (sl && sl < s)) {
3129     s = strstr(olds, MORPH_TERM_SFX);
3130     olds = NULL;
3131   }
3132   oldt = t;
3133   tl = strchr(t, '\n');
3134   t = strstr(t, MORPH_DERI_SFX);
3135   if (!t || (tl && tl < t))
3136     t = strstr(oldt, MORPH_INFL_SFX);
3137   if (!t || (tl && tl < t)) {
3138     t = strstr(oldt, MORPH_TERM_SFX);
3139     oldt = NULL;
3140   }
3141   while (s && t && (!sl || sl > s) && (!tl || tl > t)) {
3142     s += MORPH_TAG_LEN;
3143     t += MORPH_TAG_LEN;
3144     se = 0;
3145     te = 0;
3146     while ((*s == *t) && !se && !te) {
3147       s++;
3148       t++;
3149       switch (*s) {
3150         case ' ':
3151         case '\n':
3152         case '\t':
3153         case '\0':
3154           se = 1;
3155       }
3156       switch (*t) {
3157         case ' ':
3158         case '\n':
3159         case '\t':
3160         case '\0':
3161           te = 1;
3162       }
3163     }
3164     if (!se || !te) {
3165       // not terminal suffix difference
3166       if (olds)
3167         return -1;
3168       return 1;
3169     }
3170     olds = s;
3171     s = strstr(s, MORPH_DERI_SFX);
3172     if (!s || (sl && sl < s))
3173       s = strstr(olds, MORPH_INFL_SFX);
3174     if (!s || (sl && sl < s)) {
3175       s = strstr(olds, MORPH_TERM_SFX);
3176       olds = NULL;
3177     }
3178     oldt = t;
3179     t = strstr(t, MORPH_DERI_SFX);
3180     if (!t || (tl && tl < t))
3181       t = strstr(oldt, MORPH_INFL_SFX);
3182     if (!t || (tl && tl < t)) {
3183       t = strstr(oldt, MORPH_TERM_SFX);
3184       oldt = NULL;
3185     }
3186   }
3187   if (!s && !t && se && te)
3188     return 0;
3189   return 1;
3190 }
3191 
morphgen(const char * ts,int wl,const unsigned short * ap,unsigned short al,const char * morph,const char * targetmorph,int level)3192 std::string AffixMgr::morphgen(const char* ts,
3193                                int wl,
3194                                const unsigned short* ap,
3195                                unsigned short al,
3196                                const char* morph,
3197                                const char* targetmorph,
3198                          int level) {
3199   // handle suffixes
3200   if (!morph)
3201     return std::string();
3202 
3203   // check substandard flag
3204   if (TESTAFF(ap, substandard, al))
3205     return std::string();
3206 
3207   if (morphcmp(morph, targetmorph) == 0)
3208     return ts;
3209 
3210   size_t stemmorphcatpos;
3211   std::string mymorph;
3212 
3213   // use input suffix fields, if exist
3214   if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
3215     mymorph.assign(morph);
3216     mymorph.append(" ");
3217     stemmorphcatpos = mymorph.size();
3218   } else {
3219     stemmorphcatpos = std::string::npos;
3220   }
3221 
3222   for (int i = 0; i < al; i++) {
3223     const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
3224     SfxEntry* sptr = sFlag[c];
3225     while (sptr) {
3226       if (sptr->getFlag() == ap[i] && sptr->getMorph() &&
3227           ((sptr->getContLen() == 0) ||
3228            // don't generate forms with substandard affixes
3229            !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
3230         const char* stemmorph;
3231         if (stemmorphcatpos != std::string::npos) {
3232           mymorph.replace(stemmorphcatpos, std::string::npos, sptr->getMorph());
3233           stemmorph = mymorph.c_str();
3234         } else {
3235           stemmorph = sptr->getMorph();
3236         }
3237 
3238         int cmp = morphcmp(stemmorph, targetmorph);
3239 
3240         if (cmp == 0) {
3241           std::string newword = sptr->add(ts, wl);
3242           if (!newword.empty()) {
3243             hentry* check = pHMgr->lookup(newword.c_str());  // XXX extra dic
3244             if (!check || !check->astr ||
3245                 !(TESTAFF(check->astr, forbiddenword, check->alen) ||
3246                   TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {
3247               return newword;
3248             }
3249           }
3250         }
3251 
3252         // recursive call for secondary suffixes
3253         if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
3254             !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
3255           std::string newword = sptr->add(ts, wl);
3256           if (!newword.empty()) {
3257             std::string newword2 =
3258                 morphgen(newword.c_str(), newword.size(), sptr->getCont(),
3259                          sptr->getContLen(), stemmorph, targetmorph, 1);
3260 
3261             if (!newword2.empty()) {
3262               return newword2;
3263             }
3264           }
3265         }
3266       }
3267       sptr = sptr->getFlgNxt();
3268     }
3269   }
3270   return std::string();
3271 }
3272 
expand_rootword(struct guessword * wlst,int maxn,const char * ts,int wl,const unsigned short * ap,unsigned short al,const char * bad,int badl,const char * phon)3273 int AffixMgr::expand_rootword(struct guessword* wlst,
3274                               int maxn,
3275                               const char* ts,
3276                               int wl,
3277                               const unsigned short* ap,
3278                               unsigned short al,
3279                               const char* bad,
3280                               int badl,
3281                               const char* phon) {
3282   int nh = 0;
3283   // first add root word to list
3284   if ((nh < maxn) &&
3285       !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
3286                (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
3287     wlst[nh].word = mystrdup(ts);
3288     if (!wlst[nh].word)
3289       return 0;
3290     wlst[nh].allow = false;
3291     wlst[nh].orig = NULL;
3292     nh++;
3293     // add special phonetic version
3294     if (phon && (nh < maxn)) {
3295       wlst[nh].word = mystrdup(phon);
3296       if (!wlst[nh].word)
3297         return nh - 1;
3298       wlst[nh].allow = false;
3299       wlst[nh].orig = mystrdup(ts);
3300       if (!wlst[nh].orig)
3301         return nh - 1;
3302       nh++;
3303     }
3304   }
3305 
3306   // handle suffixes
3307   for (int i = 0; i < al; i++) {
3308     const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
3309     SfxEntry* sptr = sFlag[c];
3310     while (sptr) {
3311       if ((sptr->getFlag() == ap[i]) &&
3312           (!sptr->getKeyLen() ||
3313            ((badl > sptr->getKeyLen()) &&
3314             (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
3315           // check needaffix flag
3316           !(sptr->getCont() &&
3317             ((needaffix &&
3318               TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
3319              (circumfix &&
3320               TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
3321              (onlyincompound &&
3322               TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) {
3323         std::string newword = sptr->add(ts, wl);
3324         if (!newword.empty()) {
3325           if (nh < maxn) {
3326             wlst[nh].word = mystrdup(newword.c_str());
3327             wlst[nh].allow = sptr->allowCross();
3328             wlst[nh].orig = NULL;
3329             nh++;
3330             // add special phonetic version
3331             if (phon && (nh < maxn)) {
3332               std::string prefix(phon);
3333               std::string key(sptr->getKey());
3334               reverseword(key);
3335               prefix.append(key);
3336               wlst[nh].word = mystrdup(prefix.c_str());
3337               if (!wlst[nh].word)
3338                 return nh - 1;
3339               wlst[nh].allow = false;
3340               wlst[nh].orig = mystrdup(newword.c_str());
3341               if (!wlst[nh].orig)
3342                 return nh - 1;
3343               nh++;
3344             }
3345           }
3346         }
3347       }
3348       sptr = sptr->getFlgNxt();
3349     }
3350   }
3351 
3352   int n = nh;
3353 
3354   // handle cross products of prefixes and suffixes
3355   for (int j = 1; j < n; j++)
3356     if (wlst[j].allow) {
3357       for (int k = 0; k < al; k++) {
3358         const unsigned char c = (unsigned char)(ap[k] & 0x00FF);
3359         PfxEntry* cptr = pFlag[c];
3360         while (cptr) {
3361           if ((cptr->getFlag() == ap[k]) && cptr->allowCross() &&
3362               (!cptr->getKeyLen() ||
3363                ((badl > cptr->getKeyLen()) &&
3364                 (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
3365             int l1 = strlen(wlst[j].word);
3366             std::string newword = cptr->add(wlst[j].word, l1);
3367             if (!newword.empty()) {
3368               if (nh < maxn) {
3369                 wlst[nh].word = mystrdup(newword.c_str());
3370                 wlst[nh].allow = cptr->allowCross();
3371                 wlst[nh].orig = NULL;
3372                 nh++;
3373               }
3374             }
3375           }
3376           cptr = cptr->getFlgNxt();
3377         }
3378       }
3379     }
3380 
3381   // now handle pure prefixes
3382   for (int m = 0; m < al; m++) {
3383     const unsigned char c = (unsigned char)(ap[m] & 0x00FF);
3384     PfxEntry* ptr = pFlag[c];
3385     while (ptr) {
3386       if ((ptr->getFlag() == ap[m]) &&
3387           (!ptr->getKeyLen() ||
3388            ((badl > ptr->getKeyLen()) &&
3389             (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
3390           // check needaffix flag
3391           !(ptr->getCont() &&
3392             ((needaffix &&
3393               TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
3394              (circumfix &&
3395               TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
3396              (onlyincompound &&
3397               TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))) {
3398         std::string newword = ptr->add(ts, wl);
3399         if (!newword.empty()) {
3400           if (nh < maxn) {
3401             wlst[nh].word = mystrdup(newword.c_str());
3402             wlst[nh].allow = ptr->allowCross();
3403             wlst[nh].orig = NULL;
3404             nh++;
3405           }
3406         }
3407       }
3408       ptr = ptr->getFlgNxt();
3409     }
3410   }
3411 
3412   return nh;
3413 }
3414 
3415 // return replacing table
get_reptable() const3416 const std::vector<replentry>& AffixMgr::get_reptable() const {
3417   return reptable;
3418 }
3419 
3420 // return iconv table
get_iconvtable() const3421 RepList* AffixMgr::get_iconvtable() const {
3422   if (!iconvtable)
3423     return NULL;
3424   return iconvtable;
3425 }
3426 
3427 // return oconv table
get_oconvtable() const3428 RepList* AffixMgr::get_oconvtable() const {
3429   if (!oconvtable)
3430     return NULL;
3431   return oconvtable;
3432 }
3433 
3434 // return replacing table
get_phonetable() const3435 struct phonetable* AffixMgr::get_phonetable() const {
3436   if (!phone)
3437     return NULL;
3438   return phone;
3439 }
3440 
3441 // return character map table
get_maptable() const3442 const std::vector<mapentry>& AffixMgr::get_maptable() const {
3443   return maptable;
3444 }
3445 
3446 // return character map table
get_breaktable() const3447 const std::vector<std::string>& AffixMgr::get_breaktable() const {
3448   return breaktable;
3449 }
3450 
3451 // return text encoding of dictionary
get_encoding()3452 const std::string& AffixMgr::get_encoding() {
3453   if (encoding.empty())
3454     encoding = SPELL_ENCODING;
3455   return encoding;
3456 }
3457 
3458 // return text encoding of dictionary
get_langnum() const3459 int AffixMgr::get_langnum() const {
3460   return langnum;
3461 }
3462 
3463 // return double prefix option
get_complexprefixes() const3464 int AffixMgr::get_complexprefixes() const {
3465   return complexprefixes;
3466 }
3467 
3468 // return FULLSTRIP option
get_fullstrip() const3469 int AffixMgr::get_fullstrip() const {
3470   return fullstrip;
3471 }
3472 
get_keepcase() const3473 FLAG AffixMgr::get_keepcase() const {
3474   return keepcase;
3475 }
3476 
get_forceucase() const3477 FLAG AffixMgr::get_forceucase() const {
3478   return forceucase;
3479 }
3480 
get_warn() const3481 FLAG AffixMgr::get_warn() const {
3482   return warn;
3483 }
3484 
get_forbidwarn() const3485 int AffixMgr::get_forbidwarn() const {
3486   return forbidwarn;
3487 }
3488 
get_checksharps() const3489 int AffixMgr::get_checksharps() const {
3490   return checksharps;
3491 }
3492 
encode_flag(unsigned short aflag) const3493 char* AffixMgr::encode_flag(unsigned short aflag) const {
3494   return pHMgr->encode_flag(aflag);
3495 }
3496 
3497 // return the preferred ignore string for suggestions
get_ignore() const3498 const char* AffixMgr::get_ignore() const {
3499   if (ignorechars.empty())
3500     return NULL;
3501   return ignorechars.c_str();
3502 }
3503 
3504 // return the preferred ignore string for suggestions
get_ignore_utf16() const3505 const std::vector<w_char>& AffixMgr::get_ignore_utf16() const {
3506   return ignorechars_utf16;
3507 }
3508 
3509 // return the keyboard string for suggestions
get_key_string()3510 char* AffixMgr::get_key_string() {
3511   if (keystring.empty())
3512     keystring = SPELL_KEYSTRING;
3513   return mystrdup(keystring.c_str());
3514 }
3515 
3516 // return the preferred try string for suggestions
get_try_string() const3517 char* AffixMgr::get_try_string() const {
3518   if (trystring.empty())
3519     return NULL;
3520   return mystrdup(trystring.c_str());
3521 }
3522 
3523 // return the preferred try string for suggestions
get_wordchars() const3524 const std::string& AffixMgr::get_wordchars() const {
3525   return wordchars;
3526 }
3527 
get_wordchars_utf16() const3528 const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const {
3529   return wordchars_utf16;
3530 }
3531 
3532 // is there compounding?
get_compound() const3533 int AffixMgr::get_compound() const {
3534   return compoundflag || compoundbegin || !defcpdtable.empty();
3535 }
3536 
3537 // return the compound words control flag
get_compoundflag() const3538 FLAG AffixMgr::get_compoundflag() const {
3539   return compoundflag;
3540 }
3541 
3542 // return the forbidden words control flag
get_forbiddenword() const3543 FLAG AffixMgr::get_forbiddenword() const {
3544   return forbiddenword;
3545 }
3546 
3547 // return the forbidden words control flag
get_nosuggest() const3548 FLAG AffixMgr::get_nosuggest() const {
3549   return nosuggest;
3550 }
3551 
3552 // return the forbidden words control flag
get_nongramsuggest() const3553 FLAG AffixMgr::get_nongramsuggest() const {
3554   return nongramsuggest;
3555 }
3556 
3557 // return the forbidden words flag modify flag
get_needaffix() const3558 FLAG AffixMgr::get_needaffix() const {
3559   return needaffix;
3560 }
3561 
3562 // return the onlyincompound flag
get_onlyincompound() const3563 FLAG AffixMgr::get_onlyincompound() const {
3564   return onlyincompound;
3565 }
3566 
3567 // return the value of suffix
get_version() const3568 const std::string& AffixMgr::get_version() const {
3569   return version;
3570 }
3571 
3572 // utility method to look up root words in hash table
lookup(const char * word)3573 struct hentry* AffixMgr::lookup(const char* word) {
3574   struct hentry* he = NULL;
3575   for (size_t i = 0; i < alldic.size() && !he; ++i) {
3576     he = alldic[i]->lookup(word);
3577   }
3578   return he;
3579 }
3580 
3581 // return the value of suffix
have_contclass() const3582 int AffixMgr::have_contclass() const {
3583   return havecontclass;
3584 }
3585 
3586 // return utf8
get_utf8() const3587 int AffixMgr::get_utf8() const {
3588   return utf8;
3589 }
3590 
get_maxngramsugs(void) const3591 int AffixMgr::get_maxngramsugs(void) const {
3592   return maxngramsugs;
3593 }
3594 
get_maxcpdsugs(void) const3595 int AffixMgr::get_maxcpdsugs(void) const {
3596   return maxcpdsugs;
3597 }
3598 
get_maxdiff(void) const3599 int AffixMgr::get_maxdiff(void) const {
3600   return maxdiff;
3601 }
3602 
get_onlymaxdiff(void) const3603 int AffixMgr::get_onlymaxdiff(void) const {
3604   return onlymaxdiff;
3605 }
3606 
3607 // return nosplitsugs
get_nosplitsugs(void) const3608 int AffixMgr::get_nosplitsugs(void) const {
3609   return nosplitsugs;
3610 }
3611 
3612 // return sugswithdots
get_sugswithdots(void) const3613 int AffixMgr::get_sugswithdots(void) const {
3614   return sugswithdots;
3615 }
3616 
3617 /* parse flag */
parse_flag(const std::string & line,unsigned short * out,FileMgr * af)3618 bool AffixMgr::parse_flag(const std::string& line, unsigned short* out, FileMgr* af) {
3619   if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
3620     HUNSPELL_WARNING(
3621         stderr,
3622         "error: line %d: multiple definitions of an affix file parameter\n",
3623         af->getlinenum());
3624     return false;
3625   }
3626   std::string s;
3627   if (!parse_string(line, s, af->getlinenum()))
3628     return false;
3629   *out = pHMgr->decode_flag(s.c_str());
3630   return true;
3631 }
3632 
3633 /* parse num */
parse_num(const std::string & line,int * out,FileMgr * af)3634 bool AffixMgr::parse_num(const std::string& line, int* out, FileMgr* af) {
3635   if (*out != -1) {
3636     HUNSPELL_WARNING(
3637         stderr,
3638         "error: line %d: multiple definitions of an affix file parameter\n",
3639         af->getlinenum());
3640     return false;
3641   }
3642   std::string s;
3643   if (!parse_string(line, s, af->getlinenum()))
3644     return false;
3645   *out = atoi(s.c_str());
3646   return true;
3647 }
3648 
3649 /* parse in the max syllablecount of compound words and  */
parse_cpdsyllable(const std::string & line,FileMgr * af)3650 bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) {
3651   int i = 0;
3652   int np = 0;
3653   std::string::const_iterator iter = line.begin();
3654   std::string::const_iterator start_piece = mystrsep(line, iter);
3655   while (start_piece != line.end()) {
3656     switch (i) {
3657       case 0: {
3658         np++;
3659         break;
3660       }
3661       case 1: {
3662         cpdmaxsyllable = atoi(std::string(start_piece, iter).c_str());
3663         np++;
3664         break;
3665       }
3666       case 2: {
3667         if (!utf8) {
3668           cpdvowels.assign(start_piece, iter);
3669           std::sort(cpdvowels.begin(), cpdvowels.end());
3670         } else {
3671           std::string piece(start_piece, iter);
3672           u8_u16(cpdvowels_utf16, piece);
3673           std::sort(cpdvowels_utf16.begin(), cpdvowels_utf16.end());
3674         }
3675         np++;
3676         break;
3677       }
3678       default:
3679         break;
3680     }
3681     ++i;
3682     start_piece = mystrsep(line, iter);
3683   }
3684   if (np < 2) {
3685     HUNSPELL_WARNING(stderr,
3686                      "error: line %d: missing compoundsyllable information\n",
3687                      af->getlinenum());
3688     return false;
3689   }
3690   if (np == 2)
3691     cpdvowels = "AEIOUaeiou";
3692   return true;
3693 }
3694 
3695 /* parse in the typical fault correcting table */
parse_reptable(const std::string & line,FileMgr * af)3696 bool AffixMgr::parse_reptable(const std::string& line, FileMgr* af) {
3697   if (parsedrep) {
3698     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3699                      af->getlinenum());
3700     return false;
3701   }
3702   parsedrep = true;
3703   int numrep = -1;
3704   int i = 0;
3705   int np = 0;
3706   std::string::const_iterator iter = line.begin();
3707   std::string::const_iterator start_piece = mystrsep(line, iter);
3708   while (start_piece != line.end()) {
3709     switch (i) {
3710       case 0: {
3711         np++;
3712         break;
3713       }
3714       case 1: {
3715         numrep = atoi(std::string(start_piece, iter).c_str());
3716         if (numrep < 1) {
3717           HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
3718                            af->getlinenum());
3719           return false;
3720         }
3721         reptable.reserve(numrep);
3722         np++;
3723         break;
3724       }
3725       default:
3726         break;
3727     }
3728     ++i;
3729     start_piece = mystrsep(line, iter);
3730   }
3731   if (np != 2) {
3732     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3733                      af->getlinenum());
3734     return false;
3735   }
3736 
3737   /* now parse the numrep lines to read in the remainder of the table */
3738   for (int j = 0; j < numrep; ++j) {
3739     std::string nl;
3740     if (!af->getline(nl))
3741       return false;
3742     mychomp(nl);
3743     reptable.push_back(replentry());
3744     iter = nl.begin();
3745     i = 0;
3746     int type = 0;
3747     start_piece = mystrsep(nl, iter);
3748     while (start_piece != nl.end()) {
3749       switch (i) {
3750         case 0: {
3751           if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) {
3752             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3753                              af->getlinenum());
3754             reptable.clear();
3755             return false;
3756           }
3757           break;
3758         }
3759         case 1: {
3760           if (*start_piece == '^')
3761             type = 1;
3762           reptable.back().pattern.assign(start_piece + type, iter);
3763           mystrrep(reptable.back().pattern, "_", " ");
3764           if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') {
3765             type += 2;
3766             reptable.back().pattern.resize(reptable.back().pattern.size() - 1);
3767           }
3768           break;
3769         }
3770         case 2: {
3771           reptable.back().outstrings[type].assign(start_piece, iter);
3772           mystrrep(reptable.back().outstrings[type], "_", " ");
3773           break;
3774         }
3775         default:
3776           break;
3777       }
3778       ++i;
3779       start_piece = mystrsep(nl, iter);
3780     }
3781     if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) {
3782       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3783                        af->getlinenum());
3784       reptable.clear();
3785       return false;
3786     }
3787   }
3788   return true;
3789 }
3790 
3791 /* parse in the typical fault correcting table */
parse_convtable(const std::string & line,FileMgr * af,RepList ** rl,const std::string & keyword)3792 bool AffixMgr::parse_convtable(const std::string& line,
3793                               FileMgr* af,
3794                               RepList** rl,
3795                               const std::string& keyword) {
3796   if (*rl) {
3797     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3798                      af->getlinenum());
3799     return false;
3800   }
3801   int i = 0;
3802   int np = 0;
3803   int numrl = 0;
3804   std::string::const_iterator iter = line.begin();
3805   std::string::const_iterator start_piece = mystrsep(line, iter);
3806   while (start_piece != line.end()) {
3807     switch (i) {
3808       case 0: {
3809         np++;
3810         break;
3811       }
3812       case 1: {
3813         numrl = atoi(std::string(start_piece, iter).c_str());
3814         if (numrl < 1) {
3815           HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
3816                            af->getlinenum());
3817           return false;
3818         }
3819         *rl = new RepList(numrl);
3820         if (!*rl)
3821           return false;
3822         np++;
3823         break;
3824       }
3825       default:
3826         break;
3827     }
3828     ++i;
3829     start_piece = mystrsep(line, iter);
3830   }
3831   if (np != 2) {
3832     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3833                      af->getlinenum());
3834     return false;
3835   }
3836 
3837   /* now parse the num lines to read in the remainder of the table */
3838   for (int j = 0; j < numrl; j++) {
3839     std::string nl;
3840     if (!af->getline(nl))
3841       return false;
3842     mychomp(nl);
3843     i = 0;
3844     std::string pattern;
3845     std::string pattern2;
3846     iter = nl.begin();
3847     start_piece = mystrsep(nl, iter);
3848     while (start_piece != nl.end()) {
3849       {
3850         switch (i) {
3851           case 0: {
3852             if (nl.compare(start_piece - nl.begin(), keyword.size(), keyword, 0, keyword.size()) != 0) {
3853               HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3854                                af->getlinenum());
3855               delete *rl;
3856               *rl = NULL;
3857               return false;
3858             }
3859             break;
3860           }
3861           case 1: {
3862             pattern.assign(start_piece, iter);
3863             break;
3864           }
3865           case 2: {
3866             pattern2.assign(start_piece, iter);
3867             break;
3868           }
3869           default:
3870             break;
3871         }
3872         ++i;
3873       }
3874       start_piece = mystrsep(nl, iter);
3875     }
3876     if (pattern.empty() || pattern2.empty()) {
3877       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3878                        af->getlinenum());
3879       return false;
3880     }
3881     (*rl)->add(pattern, pattern2);
3882   }
3883   return true;
3884 }
3885 
3886 /* parse in the typical fault correcting table */
parse_phonetable(const std::string & line,FileMgr * af)3887 bool AffixMgr::parse_phonetable(const std::string& line, FileMgr* af) {
3888   if (phone) {
3889     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3890                      af->getlinenum());
3891     return false;
3892   }
3893   int num = -1;
3894   int i = 0;
3895   int np = 0;
3896   std::string::const_iterator iter = line.begin();
3897   std::string::const_iterator start_piece = mystrsep(line, iter);
3898   while (start_piece != line.end()) {
3899     switch (i) {
3900       case 0: {
3901         np++;
3902         break;
3903       }
3904       case 1: {
3905         num = atoi(std::string(start_piece, iter).c_str());
3906         if (num < 1) {
3907           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
3908                            af->getlinenum());
3909           return false;
3910         }
3911         phone = new phonetable;
3912         phone->utf8 = (char)utf8;
3913         np++;
3914         break;
3915       }
3916       default:
3917         break;
3918     }
3919     ++i;
3920     start_piece = mystrsep(line, iter);
3921   }
3922   if (np != 2) {
3923     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3924                      af->getlinenum());
3925     return false;
3926   }
3927 
3928   /* now parse the phone->num lines to read in the remainder of the table */
3929   for (int j = 0; j < num; ++j) {
3930     std::string nl;
3931     if (!af->getline(nl))
3932       return false;
3933     mychomp(nl);
3934     i = 0;
3935     const size_t old_size = phone->rules.size();
3936     iter = nl.begin();
3937     start_piece = mystrsep(nl, iter);
3938     while (start_piece != nl.end()) {
3939       {
3940         switch (i) {
3941           case 0: {
3942             if (nl.compare(start_piece - nl.begin(), 5, "PHONE", 5) != 0) {
3943               HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3944                                af->getlinenum());
3945               return false;
3946             }
3947             break;
3948           }
3949           case 1: {
3950             phone->rules.push_back(std::string(start_piece, iter));
3951             break;
3952           }
3953           case 2: {
3954             phone->rules.push_back(std::string(start_piece, iter));
3955             mystrrep(phone->rules.back(), "_", "");
3956             break;
3957           }
3958           default:
3959             break;
3960         }
3961         ++i;
3962       }
3963       start_piece = mystrsep(nl, iter);
3964     }
3965     if (phone->rules.size() != old_size + 2) {
3966       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3967                        af->getlinenum());
3968       phone->rules.clear();
3969       return false;
3970     }
3971   }
3972   phone->rules.push_back("");
3973   phone->rules.push_back("");
3974   init_phonet_hash(*phone);
3975   return true;
3976 }
3977 
3978 /* parse in the checkcompoundpattern table */
parse_checkcpdtable(const std::string & line,FileMgr * af)3979 bool AffixMgr::parse_checkcpdtable(const std::string& line, FileMgr* af) {
3980   if (parsedcheckcpd) {
3981     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3982                      af->getlinenum());
3983     return false;
3984   }
3985   parsedcheckcpd = true;
3986   int numcheckcpd = -1;
3987   int i = 0;
3988   int np = 0;
3989   std::string::const_iterator iter = line.begin();
3990   std::string::const_iterator start_piece = mystrsep(line, iter);
3991   while (start_piece != line.end()) {
3992     switch (i) {
3993       case 0: {
3994         np++;
3995         break;
3996       }
3997       case 1: {
3998         numcheckcpd = atoi(std::string(start_piece, iter).c_str());
3999         if (numcheckcpd < 1) {
4000           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4001                            af->getlinenum());
4002           return false;
4003         }
4004         checkcpdtable.reserve(numcheckcpd);
4005         np++;
4006         break;
4007       }
4008       default:
4009         break;
4010     }
4011     ++i;
4012     start_piece = mystrsep(line, iter);
4013   }
4014   if (np != 2) {
4015     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4016                      af->getlinenum());
4017     return false;
4018   }
4019 
4020   /* now parse the numcheckcpd lines to read in the remainder of the table */
4021   for (int j = 0; j < numcheckcpd; ++j) {
4022     std::string nl;
4023     if (!af->getline(nl))
4024       return false;
4025     mychomp(nl);
4026     i = 0;
4027     checkcpdtable.push_back(patentry());
4028     iter = nl.begin();
4029     start_piece = mystrsep(nl, iter);
4030     while (start_piece != nl.end()) {
4031       switch (i) {
4032         case 0: {
4033           if (nl.compare(start_piece - nl.begin(), 20, "CHECKCOMPOUNDPATTERN", 20) != 0) {
4034             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4035                              af->getlinenum());
4036             return false;
4037           }
4038           break;
4039         }
4040         case 1: {
4041           checkcpdtable.back().pattern.assign(start_piece, iter);
4042           size_t slash_pos = checkcpdtable.back().pattern.find('/');
4043           if (slash_pos != std::string::npos) {
4044             std::string chunk(checkcpdtable.back().pattern, slash_pos + 1);
4045             checkcpdtable.back().pattern.resize(slash_pos);
4046             checkcpdtable.back().cond = pHMgr->decode_flag(chunk.c_str());
4047           }
4048           break;
4049         }
4050         case 2: {
4051           checkcpdtable.back().pattern2.assign(start_piece, iter);
4052           size_t slash_pos = checkcpdtable.back().pattern2.find('/');
4053           if (slash_pos != std::string::npos) {
4054             std::string chunk(checkcpdtable.back().pattern2, slash_pos + 1);
4055             checkcpdtable.back().pattern2.resize(slash_pos);
4056             checkcpdtable.back().cond2 = pHMgr->decode_flag(chunk.c_str());
4057           }
4058           break;
4059         }
4060         case 3: {
4061           checkcpdtable.back().pattern3.assign(start_piece, iter);
4062           simplifiedcpd = 1;
4063           break;
4064         }
4065         default:
4066           break;
4067       }
4068       i++;
4069       start_piece = mystrsep(nl, iter);
4070     }
4071   }
4072   return true;
4073 }
4074 
4075 /* parse in the compound rule table */
parse_defcpdtable(const std::string & line,FileMgr * af)4076 bool AffixMgr::parse_defcpdtable(const std::string& line, FileMgr* af) {
4077   if (parseddefcpd) {
4078     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4079                      af->getlinenum());
4080     return false;
4081   }
4082   parseddefcpd = true;
4083   int numdefcpd = -1;
4084   int i = 0;
4085   int np = 0;
4086   std::string::const_iterator iter = line.begin();
4087   std::string::const_iterator start_piece = mystrsep(line, iter);
4088   while (start_piece != line.end()) {
4089     switch (i) {
4090       case 0: {
4091         np++;
4092         break;
4093       }
4094       case 1: {
4095         numdefcpd = atoi(std::string(start_piece, iter).c_str());
4096         if (numdefcpd < 1) {
4097           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4098                            af->getlinenum());
4099           return false;
4100         }
4101         defcpdtable.reserve(numdefcpd);
4102         np++;
4103         break;
4104       }
4105       default:
4106         break;
4107     }
4108     ++i;
4109     start_piece = mystrsep(line, iter);
4110   }
4111   if (np != 2) {
4112     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4113                      af->getlinenum());
4114     return false;
4115   }
4116 
4117   /* now parse the numdefcpd lines to read in the remainder of the table */
4118   for (int j = 0; j < numdefcpd; ++j) {
4119     std::string nl;
4120     if (!af->getline(nl))
4121       return false;
4122     mychomp(nl);
4123     i = 0;
4124     defcpdtable.push_back(flagentry());
4125     iter = nl.begin();
4126     start_piece = mystrsep(nl, iter);
4127     while (start_piece != nl.end()) {
4128       switch (i) {
4129         case 0: {
4130           if (nl.compare(start_piece - nl.begin(), 12, "COMPOUNDRULE", 12) != 0) {
4131             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4132                              af->getlinenum());
4133             numdefcpd = 0;
4134             return false;
4135           }
4136           break;
4137         }
4138         case 1: {  // handle parenthesized flags
4139           if (std::find(start_piece, iter, '(') != iter) {
4140             for (std::string::const_iterator k = start_piece; k != iter; ++k) {
4141               std::string::const_iterator chb = k;
4142               std::string::const_iterator che = k + 1;
4143               if (*k == '(') {
4144                 std::string::const_iterator parpos = std::find(k, iter, ')');
4145                 if (parpos != iter) {
4146                   chb = k + 1;
4147                   che = parpos;
4148                   k = parpos;
4149                 }
4150               }
4151 
4152               if (*chb == '*' || *chb == '?') {
4153                 defcpdtable.back().push_back((FLAG)*chb);
4154               } else {
4155                 pHMgr->decode_flags(defcpdtable.back(), std::string(chb, che), af);
4156               }
4157             }
4158           } else {
4159             pHMgr->decode_flags(defcpdtable.back(), std::string(start_piece, iter), af);
4160           }
4161           break;
4162         }
4163         default:
4164           break;
4165       }
4166       ++i;
4167       start_piece = mystrsep(nl, iter);
4168     }
4169     if (defcpdtable.back().empty()) {
4170       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4171                        af->getlinenum());
4172       return false;
4173     }
4174   }
4175   return true;
4176 }
4177 
4178 /* parse in the character map table */
parse_maptable(const std::string & line,FileMgr * af)4179 bool AffixMgr::parse_maptable(const std::string& line, FileMgr* af) {
4180   if (parsedmaptable) {
4181     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4182                      af->getlinenum());
4183     return false;
4184   }
4185   parsedmaptable = true;
4186   int nummap = -1;
4187   int i = 0;
4188   int np = 0;
4189   std::string::const_iterator iter = line.begin();
4190   std::string::const_iterator start_piece = mystrsep(line, iter);
4191   while (start_piece != line.end()) {
4192     switch (i) {
4193       case 0: {
4194         np++;
4195         break;
4196       }
4197       case 1: {
4198         nummap = atoi(std::string(start_piece, iter).c_str());
4199         if (nummap < 1) {
4200           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4201                            af->getlinenum());
4202           return false;
4203         }
4204         maptable.reserve(nummap);
4205         np++;
4206         break;
4207       }
4208       default:
4209         break;
4210     }
4211     ++i;
4212     start_piece = mystrsep(line, iter);
4213   }
4214   if (np != 2) {
4215     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4216                      af->getlinenum());
4217     return false;
4218   }
4219 
4220   /* now parse the nummap lines to read in the remainder of the table */
4221   for (int j = 0; j < nummap; ++j) {
4222     std::string nl;
4223     if (!af->getline(nl))
4224       return false;
4225     mychomp(nl);
4226     i = 0;
4227     maptable.push_back(mapentry());
4228     iter = nl.begin();
4229     start_piece = mystrsep(nl, iter);
4230     while (start_piece != nl.end()) {
4231       switch (i) {
4232         case 0: {
4233           if (nl.compare(start_piece - nl.begin(), 3, "MAP", 3) != 0) {
4234             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4235                              af->getlinenum());
4236             nummap = 0;
4237             return false;
4238           }
4239           break;
4240         }
4241         case 1: {
4242           for (std::string::const_iterator k = start_piece; k != iter; ++k) {
4243             std::string::const_iterator chb = k;
4244             std::string::const_iterator che = k + 1;
4245             if (*k == '(') {
4246               std::string::const_iterator parpos = std::find(k, iter, ')');
4247               if (parpos != iter) {
4248                 chb = k + 1;
4249                 che = parpos;
4250                 k = parpos;
4251               }
4252             } else {
4253               if (utf8 && (*k & 0xc0) == 0xc0) {
4254                 ++k;
4255                 while (k != iter && (*k & 0xc0) == 0x80)
4256                     ++k;
4257                 che = k;
4258                 --k;
4259               }
4260             }
4261             maptable.back().push_back(std::string(chb, che));
4262           }
4263           break;
4264         }
4265         default:
4266           break;
4267       }
4268       ++i;
4269       start_piece = mystrsep(nl, iter);
4270     }
4271     if (maptable.back().empty()) {
4272       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4273                        af->getlinenum());
4274       return false;
4275     }
4276   }
4277   return true;
4278 }
4279 
4280 /* parse in the word breakpoint table */
parse_breaktable(const std::string & line,FileMgr * af)4281 bool AffixMgr::parse_breaktable(const std::string& line, FileMgr* af) {
4282   if (parsedbreaktable) {
4283     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4284                      af->getlinenum());
4285     return false;
4286   }
4287   parsedbreaktable = true;
4288   int numbreak = -1;
4289   int i = 0;
4290   int np = 0;
4291   std::string::const_iterator iter = line.begin();
4292   std::string::const_iterator start_piece = mystrsep(line, iter);
4293   while (start_piece != line.end()) {
4294     switch (i) {
4295       case 0: {
4296         np++;
4297         break;
4298       }
4299       case 1: {
4300         numbreak = atoi(std::string(start_piece, iter).c_str());
4301         if (numbreak < 0) {
4302           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4303                            af->getlinenum());
4304           return false;
4305         }
4306         if (numbreak == 0)
4307           return true;
4308         breaktable.reserve(numbreak);
4309         np++;
4310         break;
4311       }
4312       default:
4313         break;
4314     }
4315     ++i;
4316     start_piece = mystrsep(line, iter);
4317   }
4318   if (np != 2) {
4319     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4320                      af->getlinenum());
4321     return false;
4322   }
4323 
4324   /* now parse the numbreak lines to read in the remainder of the table */
4325   for (int j = 0; j < numbreak; ++j) {
4326     std::string nl;
4327     if (!af->getline(nl))
4328       return false;
4329     mychomp(nl);
4330     i = 0;
4331     iter = nl.begin();
4332     start_piece = mystrsep(nl, iter);
4333     while (start_piece != nl.end()) {
4334       switch (i) {
4335         case 0: {
4336           if (nl.compare(start_piece - nl.begin(), 5, "BREAK", 5) != 0) {
4337             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4338                              af->getlinenum());
4339             numbreak = 0;
4340             return false;
4341           }
4342           break;
4343         }
4344         case 1: {
4345           breaktable.push_back(std::string(start_piece, iter));
4346           break;
4347         }
4348         default:
4349           break;
4350       }
4351       ++i;
4352       start_piece = mystrsep(nl, iter);
4353     }
4354   }
4355 
4356   if (breaktable.size() != static_cast<size_t>(numbreak)) {
4357     HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4358                      af->getlinenum());
4359     return false;
4360   }
4361 
4362   return true;
4363 }
4364 
reverse_condition(std::string & piece)4365 void AffixMgr::reverse_condition(std::string& piece) {
4366   if (piece.empty())
4367       return;
4368 
4369   int neg = 0;
4370   for (std::string::reverse_iterator k = piece.rbegin(); k != piece.rend(); ++k) {
4371     switch (*k) {
4372       case '[': {
4373         if (neg)
4374           *(k - 1) = '[';
4375         else
4376           *k = ']';
4377         break;
4378       }
4379       case ']': {
4380         *k = '[';
4381         if (neg)
4382           *(k - 1) = '^';
4383         neg = 0;
4384         break;
4385       }
4386       case '^': {
4387         if (*(k - 1) == ']')
4388           neg = 1;
4389         else
4390           *(k - 1) = *k;
4391         break;
4392       }
4393       default: {
4394         if (neg)
4395           *(k - 1) = *k;
4396       }
4397     }
4398   }
4399 }
4400 
4401 class entries_container {
4402   std::vector<AffEntry*> entries;
4403   AffixMgr* m_mgr;
4404   char m_at;
4405 public:
entries_container(char at,AffixMgr * mgr)4406   entries_container(char at, AffixMgr* mgr)
4407     : m_mgr(mgr)
4408     , m_at(at) {
4409   }
release()4410   void release() {
4411     entries.clear();
4412   }
initialize(int numents,char opts,unsigned short aflag)4413   void initialize(int numents,
4414                   char opts, unsigned short aflag) {
4415     entries.reserve(numents);
4416 
4417     if (m_at == 'P') {
4418       entries.push_back(new PfxEntry(m_mgr));
4419     } else {
4420       entries.push_back(new SfxEntry(m_mgr));
4421     }
4422 
4423     entries.back()->opts = opts;
4424     entries.back()->aflag = aflag;
4425   }
4426 
add_entry(char opts)4427   AffEntry* add_entry(char opts) {
4428     if (m_at == 'P') {
4429       entries.push_back(new PfxEntry(m_mgr));
4430     } else {
4431       entries.push_back(new SfxEntry(m_mgr));
4432     }
4433     AffEntry* ret = entries.back();
4434     ret->opts = entries[0]->opts & opts;
4435     return ret;
4436   }
4437 
first_entry()4438   AffEntry* first_entry() {
4439     return entries.empty() ? NULL : entries[0];
4440   }
4441 
~entries_container()4442   ~entries_container() {
4443     for (size_t i = 0; i < entries.size(); ++i) {
4444         delete entries[i];
4445     }
4446   }
4447 
begin()4448   std::vector<AffEntry*>::iterator begin() { return entries.begin(); }
end()4449   std::vector<AffEntry*>::iterator end() { return entries.end(); }
4450 };
4451 
parse_affix(const std::string & line,const char at,FileMgr * af,char * dupflags)4452 bool AffixMgr::parse_affix(const std::string& line,
4453                           const char at,
4454                           FileMgr* af,
4455                           char* dupflags) {
4456   int numents = 0;  // number of AffEntry structures to parse
4457 
4458   unsigned short aflag = 0;  // affix char identifier
4459 
4460   char ff = 0;
4461   entries_container affentries(at, this);
4462 
4463   int i = 0;
4464 
4465 // checking lines with bad syntax
4466 #ifdef DEBUG
4467   int basefieldnum = 0;
4468 #endif
4469 
4470   // split affix header line into pieces
4471 
4472   int np = 0;
4473   std::string::const_iterator iter = line.begin();
4474   std::string::const_iterator start_piece = mystrsep(line, iter);
4475   while (start_piece != line.end()) {
4476     switch (i) {
4477       // piece 1 - is type of affix
4478       case 0: {
4479         np++;
4480         break;
4481       }
4482 
4483       // piece 2 - is affix char
4484       case 1: {
4485         np++;
4486         aflag = pHMgr->decode_flag(std::string(start_piece, iter).c_str());
4487         if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
4488             ((at == 'P') && (dupflags[aflag] & dupPFX))) {
4489           HUNSPELL_WARNING(
4490               stderr,
4491               "error: line %d: multiple definitions of an affix flag\n",
4492               af->getlinenum());
4493         }
4494         dupflags[aflag] += (char)((at == 'S') ? dupSFX : dupPFX);
4495         break;
4496       }
4497       // piece 3 - is cross product indicator
4498       case 2: {
4499         np++;
4500         if (*start_piece == 'Y')
4501           ff = aeXPRODUCT;
4502         break;
4503       }
4504 
4505       // piece 4 - is number of affentries
4506       case 3: {
4507         np++;
4508         numents = atoi(std::string(start_piece, iter).c_str());
4509         if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
4510                                 sizeof(AffEntry)) < static_cast<size_t>(numents))) {
4511           char* err = pHMgr->encode_flag(aflag);
4512           if (err) {
4513             HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4514                              af->getlinenum());
4515             free(err);
4516           }
4517           return false;
4518         }
4519 
4520         char opts = ff;
4521         if (utf8)
4522           opts += aeUTF8;
4523         if (pHMgr->is_aliasf())
4524           opts += aeALIASF;
4525         if (pHMgr->is_aliasm())
4526           opts += aeALIASM;
4527         affentries.initialize(numents, opts, aflag);
4528       }
4529 
4530       default:
4531         break;
4532     }
4533     ++i;
4534     start_piece = mystrsep(line, iter);
4535   }
4536   // check to make sure we parsed enough pieces
4537   if (np != 4) {
4538     char* err = pHMgr->encode_flag(aflag);
4539     if (err) {
4540       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4541                        af->getlinenum());
4542       free(err);
4543     }
4544     return false;
4545   }
4546 
4547   // now parse numents affentries for this affix
4548   AffEntry* entry = affentries.first_entry();
4549   for (int ent = 0; ent < numents; ++ent) {
4550     std::string nl;
4551     if (!af->getline(nl))
4552       return false;
4553     mychomp(nl);
4554 
4555     iter = nl.begin();
4556     i = 0;
4557     np = 0;
4558 
4559     // split line into pieces
4560     start_piece = mystrsep(nl, iter);
4561     while (start_piece != nl.end()) {
4562       switch (i) {
4563         // piece 1 - is type
4564         case 0: {
4565           np++;
4566           if (ent != 0)
4567             entry = affentries.add_entry((char)(aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM));
4568           break;
4569         }
4570 
4571         // piece 2 - is affix char
4572         case 1: {
4573           np++;
4574           std::string chunk(start_piece, iter);
4575           if (pHMgr->decode_flag(chunk.c_str()) != aflag) {
4576             char* err = pHMgr->encode_flag(aflag);
4577             if (err) {
4578               HUNSPELL_WARNING(stderr,
4579                                "error: line %d: affix %s is corrupt\n",
4580                                af->getlinenum(), err);
4581               free(err);
4582             }
4583             return false;
4584           }
4585 
4586           if (ent != 0) {
4587             AffEntry* start_entry = affentries.first_entry();
4588             entry->aflag = start_entry->aflag;
4589           }
4590           break;
4591         }
4592 
4593         // piece 3 - is string to strip or 0 for null
4594         case 2: {
4595           np++;
4596           entry->strip = std::string(start_piece, iter);
4597           if (complexprefixes) {
4598             if (utf8)
4599               reverseword_utf(entry->strip);
4600             else
4601               reverseword(entry->strip);
4602           }
4603           if (entry->strip.compare("0") == 0) {
4604             entry->strip.clear();
4605           }
4606           break;
4607         }
4608 
4609         // piece 4 - is affix string or 0 for null
4610         case 3: {
4611           entry->morphcode = NULL;
4612           entry->contclass = NULL;
4613           entry->contclasslen = 0;
4614           np++;
4615           std::string::const_iterator dash = std::find(start_piece, iter, '/');
4616           if (dash != iter) {
4617             entry->appnd = std::string(start_piece, dash);
4618             std::string dash_str(dash + 1, iter);
4619 
4620             if (!ignorechars.empty()) {
4621               if (utf8) {
4622                 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
4623               } else {
4624                 remove_ignored_chars(entry->appnd, ignorechars);
4625               }
4626             }
4627 
4628             if (complexprefixes) {
4629               if (utf8)
4630                 reverseword_utf(entry->appnd);
4631               else
4632                 reverseword(entry->appnd);
4633             }
4634 
4635             if (pHMgr->is_aliasf()) {
4636               int index = atoi(dash_str.c_str());
4637               entry->contclasslen = (unsigned short)pHMgr->get_aliasf(
4638                   index, &(entry->contclass), af);
4639               if (!entry->contclasslen)
4640                 HUNSPELL_WARNING(stderr,
4641                                  "error: bad affix flag alias: \"%s\"\n",
4642                                  dash_str.c_str());
4643             } else {
4644               entry->contclasslen = (unsigned short)pHMgr->decode_flags(
4645                   &(entry->contclass), dash_str.c_str(), af);
4646               std::sort(entry->contclass, entry->contclass + entry->contclasslen);
4647             }
4648 
4649             havecontclass = 1;
4650             for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
4651               contclasses[(entry->contclass)[_i]] = 1;
4652             }
4653           } else {
4654             entry->appnd = std::string(start_piece, iter);
4655 
4656             if (!ignorechars.empty()) {
4657               if (utf8) {
4658                 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
4659               } else {
4660                 remove_ignored_chars(entry->appnd, ignorechars);
4661               }
4662             }
4663 
4664             if (complexprefixes) {
4665               if (utf8)
4666                 reverseword_utf(entry->appnd);
4667               else
4668                 reverseword(entry->appnd);
4669             }
4670           }
4671 
4672           if (entry->appnd.compare("0") == 0) {
4673             entry->appnd.clear();
4674           }
4675           break;
4676         }
4677 
4678         // piece 5 - is the conditions descriptions
4679         case 4: {
4680           std::string chunk(start_piece, iter);
4681           np++;
4682           if (complexprefixes) {
4683             if (utf8)
4684               reverseword_utf(chunk);
4685             else
4686               reverseword(chunk);
4687             reverse_condition(chunk);
4688           }
4689           if (!entry->strip.empty() && chunk != "." &&
4690               redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(),
4691                                   af->getlinenum()))
4692             chunk = ".";
4693           if (at == 'S') {
4694             reverseword(chunk);
4695             reverse_condition(chunk);
4696           }
4697           if (encodeit(*entry, chunk.c_str()))
4698             return false;
4699           break;
4700         }
4701 
4702         case 5: {
4703           std::string chunk(start_piece, iter);
4704           np++;
4705           if (pHMgr->is_aliasm()) {
4706             int index = atoi(chunk.c_str());
4707             entry->morphcode = pHMgr->get_aliasm(index);
4708           } else {
4709             if (complexprefixes) {  // XXX - fix me for morph. gen.
4710               if (utf8)
4711                 reverseword_utf(chunk);
4712               else
4713                 reverseword(chunk);
4714             }
4715             // add the remaining of the line
4716             std::string::const_iterator end = nl.end();
4717             if (iter != end) {
4718               chunk.append(iter, end);
4719             }
4720             entry->morphcode = mystrdup(chunk.c_str());
4721             if (!entry->morphcode)
4722               return false;
4723           }
4724           break;
4725         }
4726         default:
4727           break;
4728       }
4729       i++;
4730       start_piece = mystrsep(nl, iter);
4731     }
4732     // check to make sure we parsed enough pieces
4733     if (np < 4) {
4734       char* err = pHMgr->encode_flag(aflag);
4735       if (err) {
4736         HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4737                          af->getlinenum(), err);
4738         free(err);
4739       }
4740       return false;
4741     }
4742 
4743 #ifdef DEBUG
4744     // detect unnecessary fields, excepting comments
4745     if (basefieldnum) {
4746       int fieldnum =
4747           !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
4748       if (fieldnum != basefieldnum)
4749         HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n",
4750                          af->getlinenum());
4751     } else {
4752       basefieldnum =
4753           !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
4754     }
4755 #endif
4756   }
4757 
4758   // now create SfxEntry or PfxEntry objects and use links to
4759   // build an ordered (sorted by affix string) list
4760   std::vector<AffEntry*>::iterator start = affentries.begin();
4761   std::vector<AffEntry*>::iterator end = affentries.end();
4762   for (std::vector<AffEntry*>::iterator affentry = start; affentry != end; ++affentry) {
4763     if (at == 'P') {
4764       build_pfxtree(static_cast<PfxEntry*>(*affentry));
4765     } else {
4766       build_sfxtree(static_cast<SfxEntry*>(*affentry));
4767     }
4768   }
4769 
4770   //contents belong to AffixMgr now
4771   affentries.release();
4772 
4773   return true;
4774 }
4775 
redundant_condition(char ft,const char * strip,int stripl,const char * cond,int linenum)4776 int AffixMgr::redundant_condition(char ft,
4777                                   const char* strip,
4778                                   int stripl,
4779                                   const char* cond,
4780                                   int linenum) {
4781   int condl = strlen(cond);
4782   int i;
4783   int j;
4784   int neg;
4785   int in;
4786   if (ft == 'P') {  // prefix
4787     if (strncmp(strip, cond, condl) == 0)
4788       return 1;
4789     if (utf8) {
4790     } else {
4791       for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
4792         if (cond[j] != '[') {
4793           if (cond[j] != strip[i]) {
4794             HUNSPELL_WARNING(stderr,
4795                              "warning: line %d: incompatible stripping "
4796                              "characters and condition\n",
4797                              linenum);
4798             return 0;
4799           }
4800         } else {
4801           neg = (cond[j + 1] == '^') ? 1 : 0;
4802           in = 0;
4803           do {
4804             j++;
4805             if (strip[i] == cond[j])
4806               in = 1;
4807           } while ((j < (condl - 1)) && (cond[j] != ']'));
4808           if (j == (condl - 1) && (cond[j] != ']')) {
4809             HUNSPELL_WARNING(stderr,
4810                              "error: line %d: missing ] in condition:\n%s\n",
4811                              linenum, cond);
4812             return 0;
4813           }
4814           if ((!neg && !in) || (neg && in)) {
4815             HUNSPELL_WARNING(stderr,
4816                              "warning: line %d: incompatible stripping "
4817                              "characters and condition\n",
4818                              linenum);
4819             return 0;
4820           }
4821         }
4822       }
4823       if (j >= condl)
4824         return 1;
4825     }
4826   } else {  // suffix
4827     if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0)
4828       return 1;
4829     if (utf8) {
4830     } else {
4831       for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
4832         if (cond[j] != ']') {
4833           if (cond[j] != strip[i]) {
4834             HUNSPELL_WARNING(stderr,
4835                              "warning: line %d: incompatible stripping "
4836                              "characters and condition\n",
4837                              linenum);
4838             return 0;
4839           }
4840         } else {
4841           in = 0;
4842           do {
4843             j--;
4844             if (strip[i] == cond[j])
4845               in = 1;
4846           } while ((j > 0) && (cond[j] != '['));
4847           if ((j == 0) && (cond[j] != '[')) {
4848             HUNSPELL_WARNING(stderr,
4849                              "error: line: %d: missing ] in condition:\n%s\n",
4850                              linenum, cond);
4851             return 0;
4852           }
4853           neg = (cond[j + 1] == '^') ? 1 : 0;
4854           if ((!neg && !in) || (neg && in)) {
4855             HUNSPELL_WARNING(stderr,
4856                              "warning: line %d: incompatible stripping "
4857                              "characters and condition\n",
4858                              linenum);
4859             return 0;
4860           }
4861         }
4862       }
4863       if (j < 0)
4864         return 1;
4865     }
4866   }
4867   return 0;
4868 }
4869 
get_suffix_words(short unsigned * suff,int len,const char * root_word)4870 std::vector<std::string> AffixMgr::get_suffix_words(short unsigned* suff,
4871                                int len,
4872                                const char* root_word) {
4873   std::vector<std::string> slst;
4874   short unsigned* start_ptr = suff;
4875   for (int j = 0; j < SETSIZE; j++) {
4876     SfxEntry* ptr = sStart[j];
4877     while (ptr) {
4878       suff = start_ptr;
4879       for (int i = 0; i < len; i++) {
4880         if ((*suff) == ptr->getFlag()) {
4881           std::string nw(root_word);
4882           nw.append(ptr->getAffix());
4883           hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, 0, 0, 0);
4884           if (ht) {
4885             slst.push_back(nw);
4886           }
4887         }
4888         suff++;
4889       }
4890       ptr = ptr->getNext();
4891     }
4892   }
4893   return slst;
4894 }
4895