1 #include "license.hunspell"
2 #include "license.myspell"
3 
4 #include <stdlib.h>
5 #include <string.h>
6 #include <stdio.h>
7 #include <ctype.h>
8 
9 #include <vector>
10 
11 #include "affixmgr.hxx"
12 #include "affentry.hxx"
13 #include "langnum.hxx"
14 
15 #include "csutil.hxx"
16 
AffixMgr(const char * affpath,HashMgr ** ptr,int * md,const char * key)17 AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
18 {
19   // register hash manager and load affix data from aff file
20   pHMgr = ptr[0];
21   alldic = ptr;
22   maxdic = md;
23   keystring = NULL;
24   trystring = NULL;
25   encoding=NULL;
26   csconv=NULL;
27   utf8 = 0;
28   complexprefixes = 0;
29   maptable = NULL;
30   nummap = 0;
31   breaktable = NULL;
32   numbreak = -1;
33   reptable = NULL;
34   numrep = 0;
35   iconvtable = NULL;
36   oconvtable = NULL;
37   checkcpdtable = NULL;
38   // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
39   simplifiedcpd = 0;
40   numcheckcpd = 0;
41   defcpdtable = NULL;
42   numdefcpd = 0;
43   phone = NULL;
44   compoundflag = FLAG_NULL; // permits word in compound forms
45   compoundbegin = FLAG_NULL; // may be first word in compound forms
46   compoundmiddle = FLAG_NULL; // may be middle word in compound forms
47   compoundend = FLAG_NULL; // may be last word in compound forms
48   compoundroot = FLAG_NULL; // compound word signing flag
49   compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
50   compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
51   compoundmoresuffixes = 0; // allow more suffixes within compound words
52   checkcompounddup = 0; // forbid double words in compounds
53   checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
54   checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
55   checkcompoundtriple = 0; // forbid compounds with triple letters
56   simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)
57   forbiddenword = FORBIDDENWORD; // forbidden word signing flag
58   nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
59   nongramsuggest = FLAG_NULL;
60   lang = NULL; // language
61   langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
62   needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
63   cpdwordmax = -1; // default: unlimited wordcount in compound words
64   cpdmin = -1;  // undefined
65   cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
66   cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
67   cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
68   cpdvowels_utf16_len=0; // vowels
69   pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
70   sfxappnd=NULL; // previous suffix for counting a special syllables BUG
71   cpdsyllablenum=NULL; // syllable count incrementing flag
72   checknum=0; // checking numbers, and word with numbers
73   wordchars=NULL; // letters + spec. word characters
74   wordchars_utf16=NULL; // letters + spec. word characters
75   wordchars_utf16_len=0; // letters + spec. word characters
76   ignorechars=NULL; // letters + spec. word characters
77   ignorechars_utf16=NULL; // letters + spec. word characters
78   ignorechars_utf16_len=0; // letters + spec. word characters
79   version=NULL; // affix and dictionary file version string
80   havecontclass=0; // flags of possible continuing classes (double affix)
81   // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
82   // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
83   lemma_present = FLAG_NULL;
84   circumfix = FLAG_NULL;
85   onlyincompound = FLAG_NULL;
86   maxngramsugs = -1; // undefined
87   maxdiff = -1; // undefined
88   onlymaxdiff = 0;
89   maxcpdsugs = -1; // undefined
90   nosplitsugs = 0;
91   sugswithdots = 0;
92   keepcase = 0;
93   forceucase = 0;
94   warn = 0;
95   forbidwarn = 0;
96   checksharps = 0;
97   substandard = FLAG_NULL;
98   fullstrip = 0;
99 
100   sfx = NULL;
101   pfx = NULL;
102 
103   for (int i=0; i < SETSIZE; i++) {
104      pStart[i] = NULL;
105      sStart[i] = NULL;
106      pFlag[i] = NULL;
107      sFlag[i] = NULL;
108   }
109 
110   for (int j=0; j < CONTSIZE; j++) {
111     contclasses[j] = 0;
112   }
113 
114   if (parse_file(affpath, key)) {
115      HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
116   }
117 
118   if (cpdmin == -1) cpdmin = MINCPDLEN;
119 
120 }
121 
122 
~AffixMgr()123 AffixMgr::~AffixMgr()
124 {
125   // pass through linked prefix entries and clean up
126   for (int i=0; i < SETSIZE ;i++) {
127        pFlag[i] = NULL;
128        PfxEntry * ptr = pStart[i];
129        PfxEntry * nptr = NULL;
130        while (ptr) {
131             nptr = ptr->getNext();
132             delete(ptr);
133             ptr = nptr;
134             nptr = NULL;
135        }
136   }
137 
138   // pass through linked suffix entries and clean up
139   for (int j=0; j < SETSIZE ; j++) {
140        sFlag[j] = NULL;
141        SfxEntry * ptr = sStart[j];
142        SfxEntry * nptr = NULL;
143        while (ptr) {
144             nptr = ptr->getNext();
145             delete(ptr);
146             ptr = nptr;
147             nptr = NULL;
148        }
149        sStart[j] = NULL;
150   }
151 
152   if (keystring) free(keystring);
153   keystring=NULL;
154   if (trystring) free(trystring);
155   trystring=NULL;
156   if (encoding) free(encoding);
157   encoding=NULL;
158   if (maptable) {
159      for (int j=0; j < nummap; j++) {
160         for (int k=0; k < maptable[j].len; k++) {
161            if (maptable[j].set[k]) free(maptable[j].set[k]);
162         }
163         free(maptable[j].set);
164         maptable[j].set = NULL;
165         maptable[j].len = 0;
166      }
167      free(maptable);
168      maptable = NULL;
169   }
170   nummap = 0;
171   if (breaktable) {
172      for (int j=0; j < numbreak; j++) {
173         if (breaktable[j]) free(breaktable[j]);
174         breaktable[j] = NULL;
175      }
176      free(breaktable);
177      breaktable = NULL;
178   }
179   numbreak = 0;
180   if (reptable) {
181      for (int j=0; j < numrep; j++) {
182         free(reptable[j].pattern);
183         free(reptable[j].pattern2);
184      }
185      free(reptable);
186      reptable = NULL;
187   }
188   if (iconvtable) delete iconvtable;
189   if (oconvtable) delete oconvtable;
190   if (phone && phone->rules) {
191      for (int j=0; j < phone->num + 1; j++) {
192         free(phone->rules[j * 2]);
193         free(phone->rules[j * 2 + 1]);
194      }
195      free(phone->rules);
196      free(phone);
197      phone = NULL;
198   }
199 
200   if (defcpdtable) {
201      for (int j=0; j < numdefcpd; j++) {
202         free(defcpdtable[j].def);
203         defcpdtable[j].def = NULL;
204      }
205      free(defcpdtable);
206      defcpdtable = NULL;
207   }
208   numrep = 0;
209   if (checkcpdtable) {
210      for (int j=0; j < numcheckcpd; j++) {
211         free(checkcpdtable[j].pattern);
212         free(checkcpdtable[j].pattern2);
213         free(checkcpdtable[j].pattern3);
214         checkcpdtable[j].pattern = NULL;
215         checkcpdtable[j].pattern2 = NULL;
216         checkcpdtable[j].pattern3 = NULL;
217      }
218      free(checkcpdtable);
219      checkcpdtable = NULL;
220   }
221   numcheckcpd = 0;
222   FREE_FLAG(compoundflag);
223   FREE_FLAG(compoundbegin);
224   FREE_FLAG(compoundmiddle);
225   FREE_FLAG(compoundend);
226   FREE_FLAG(compoundpermitflag);
227   FREE_FLAG(compoundforbidflag);
228   FREE_FLAG(compoundroot);
229   FREE_FLAG(forbiddenword);
230   FREE_FLAG(nosuggest);
231   FREE_FLAG(nongramsuggest);
232   FREE_FLAG(needaffix);
233   FREE_FLAG(lemma_present);
234   FREE_FLAG(circumfix);
235   FREE_FLAG(onlyincompound);
236 
237   cpdwordmax = 0;
238   pHMgr = NULL;
239   cpdmin = 0;
240   cpdmaxsyllable = 0;
241   if (cpdvowels) free(cpdvowels);
242   if (cpdvowels_utf16) free(cpdvowels_utf16);
243   if (cpdsyllablenum) free(cpdsyllablenum);
244   free_utf_tbl();
245   if (lang) free(lang);
246   if (wordchars) free(wordchars);
247   if (wordchars_utf16) free(wordchars_utf16);
248   if (ignorechars) free(ignorechars);
249   if (ignorechars_utf16) free(ignorechars_utf16);
250   if (version) free(version);
251   checknum=0;
252 #ifdef MOZILLA_CLIENT
253   delete [] csconv;
254 #endif
255 }
256 
finishFileMgr(FileMgr * afflst)257 void AffixMgr::finishFileMgr(FileMgr *afflst)
258 {
259     delete afflst;
260 
261     // convert affix trees to sorted list
262     process_pfx_tree_to_list();
263     process_sfx_tree_to_list();
264 }
265 
266 // read in aff file and build up prefix and suffix entry objects
parse_file(const char * affpath,const char * key)267 int  AffixMgr::parse_file(const char * affpath, const char * key)
268 {
269   char * line; // io buffers
270   char ft;     // affix type
271 
272   // checking flag duplication
273   char dupflags[CONTSIZE];
274   char dupflags_ini = 1;
275 
276   // first line indicator for removing byte order mark
277   int firstline = 1;
278 
279   // open the affix file
280   FileMgr * afflst = new FileMgr(affpath, key);
281   if (!afflst) {
282     HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
283     return 1;
284   }
285 
286   // step one is to parse the affix file building up the internal
287   // affix data structures
288 
289     // read in each line ignoring any that do not
290     // start with a known line type indicator
291     while ((line = afflst->getline()) != NULL) {
292        mychomp(line);
293 
294        /* remove byte order mark */
295        if (firstline) {
296          firstline = 0;
297          // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions
298          if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
299             memmove(line, line+3, strlen(line+3)+1);
300          }
301        }
302 
303        /* parse in the keyboard string */
304        if (strncmp(line,"KEY",3) == 0) {
305           if (parse_string(line, &keystring, afflst->getlinenum())) {
306              finishFileMgr(afflst);
307              return 1;
308           }
309        }
310 
311        /* parse in the try string */
312        if (strncmp(line,"TRY",3) == 0) {
313           if (parse_string(line, &trystring, afflst->getlinenum())) {
314              finishFileMgr(afflst);
315              return 1;
316           }
317        }
318 
319        /* parse in the name of the character set used by the .dict and .aff */
320        if (strncmp(line,"SET",3) == 0) {
321           if (parse_string(line, &encoding, afflst->getlinenum())) {
322              finishFileMgr(afflst);
323              return 1;
324           }
325           if (strcmp(encoding, "UTF-8") == 0) {
326              utf8 = 1;
327 #ifndef OPENOFFICEORG
328 #ifndef MOZILLA_CLIENT
329              if (initialize_utf_tbl()) return 1;
330 #endif
331 #endif
332           }
333        }
334 
335        /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
336        if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
337                    complexprefixes = 1;
338 
339        /* parse in the flag used by the controlled compound words */
340        if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
341           if (parse_flag(line, &compoundflag, afflst)) {
342              finishFileMgr(afflst);
343              return 1;
344           }
345        }
346 
347        /* parse in the flag used by compound words */
348        if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
349           if (complexprefixes) {
350             if (parse_flag(line, &compoundend, afflst)) {
351               finishFileMgr(afflst);
352               return 1;
353             }
354           } else {
355             if (parse_flag(line, &compoundbegin, afflst)) {
356               finishFileMgr(afflst);
357               return 1;
358             }
359           }
360        }
361 
362        /* parse in the flag used by compound words */
363        if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
364           if (parse_flag(line, &compoundmiddle, afflst)) {
365              finishFileMgr(afflst);
366              return 1;
367           }
368        }
369        /* parse in the flag used by compound words */
370        if (strncmp(line,"COMPOUNDEND",11) == 0) {
371           if (complexprefixes) {
372             if (parse_flag(line, &compoundbegin, afflst)) {
373               finishFileMgr(afflst);
374               return 1;
375             }
376           } else {
377             if (parse_flag(line, &compoundend, afflst)) {
378               finishFileMgr(afflst);
379               return 1;
380             }
381           }
382        }
383 
384        /* parse in the data used by compound_check() method */
385        if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
386           if (parse_num(line, &cpdwordmax, afflst)) {
387              finishFileMgr(afflst);
388              return 1;
389           }
390        }
391 
392        /* parse in the flag sign compounds in dictionary */
393        if (strncmp(line,"COMPOUNDROOT",12) == 0) {
394           if (parse_flag(line, &compoundroot, afflst)) {
395              finishFileMgr(afflst);
396              return 1;
397           }
398        }
399 
400        /* parse in the flag used by compound_check() method */
401        if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
402           if (parse_flag(line, &compoundpermitflag, afflst)) {
403              finishFileMgr(afflst);
404              return 1;
405           }
406        }
407 
408        /* parse in the flag used by compound_check() method */
409        if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
410           if (parse_flag(line, &compoundforbidflag, afflst)) {
411              finishFileMgr(afflst);
412              return 1;
413           }
414        }
415 
416        if (strncmp(line,"COMPOUNDMORESUFFIXES",20) == 0) {
417                    compoundmoresuffixes = 1;
418        }
419 
420        if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
421                    checkcompounddup = 1;
422        }
423 
424        if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
425                    checkcompoundrep = 1;
426        }
427 
428        if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
429                    checkcompoundtriple = 1;
430        }
431 
432        if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) {
433                    simplifiedtriple = 1;
434        }
435 
436        if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
437                    checkcompoundcase = 1;
438        }
439 
440        if (strncmp(line,"NOSUGGEST",9) == 0) {
441           if (parse_flag(line, &nosuggest, afflst)) {
442              finishFileMgr(afflst);
443              return 1;
444           }
445        }
446 
447        if (strncmp(line,"NONGRAMSUGGEST",14) == 0) {
448           if (parse_flag(line, &nongramsuggest, afflst)) {
449              finishFileMgr(afflst);
450              return 1;
451           }
452        }
453 
454        /* parse in the flag used by forbidden words */
455        if (strncmp(line,"FORBIDDENWORD",13) == 0) {
456           if (parse_flag(line, &forbiddenword, afflst)) {
457              finishFileMgr(afflst);
458              return 1;
459           }
460        }
461 
462        /* parse in the flag used by forbidden words */
463        if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
464           if (parse_flag(line, &lemma_present, afflst)) {
465              finishFileMgr(afflst);
466              return 1;
467           }
468        }
469 
470        /* parse in the flag used by circumfixes */
471        if (strncmp(line,"CIRCUMFIX",9) == 0) {
472           if (parse_flag(line, &circumfix, afflst)) {
473              finishFileMgr(afflst);
474              return 1;
475           }
476        }
477 
478        /* parse in the flag used by fogemorphemes */
479        if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
480           if (parse_flag(line, &onlyincompound, afflst)) {
481              finishFileMgr(afflst);
482              return 1;
483           }
484        }
485 
486        /* parse in the flag used by `needaffixs' */
487        if (strncmp(line,"PSEUDOROOT",10) == 0) {
488           if (parse_flag(line, &needaffix, afflst)) {
489              finishFileMgr(afflst);
490              return 1;
491           }
492        }
493 
494        /* parse in the flag used by `needaffixs' */
495        if (strncmp(line,"NEEDAFFIX",9) == 0) {
496           if (parse_flag(line, &needaffix, afflst)) {
497              finishFileMgr(afflst);
498              return 1;
499           }
500        }
501 
502        /* parse in the minimal length for words in compounds */
503        if (strncmp(line,"COMPOUNDMIN",11) == 0) {
504           if (parse_num(line, &cpdmin, afflst)) {
505              finishFileMgr(afflst);
506              return 1;
507           }
508           if (cpdmin < 1) cpdmin = 1;
509        }
510 
511        /* parse in the max. words and syllables in compounds */
512        if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
513           if (parse_cpdsyllable(line, afflst)) {
514              finishFileMgr(afflst);
515              return 1;
516           }
517        }
518 
519        /* parse in the flag used by compound_check() method */
520        if (strncmp(line,"SYLLABLENUM",11) == 0) {
521           if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) {
522              finishFileMgr(afflst);
523              return 1;
524           }
525        }
526 
527        /* parse in the flag used by the controlled compound words */
528        if (strncmp(line,"CHECKNUM",8) == 0) {
529            checknum=1;
530        }
531 
532        /* parse in the extra word characters */
533        if (strncmp(line,"WORDCHARS",9) == 0) {
534           if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) {
535              finishFileMgr(afflst);
536              return 1;
537           }
538        }
539 
540        /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
541        if (strncmp(line,"IGNORE",6) == 0) {
542           if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
543              finishFileMgr(afflst);
544              return 1;
545           }
546        }
547 
548        /* parse in the typical fault correcting table */
549        if (strncmp(line,"REP",3) == 0) {
550           if (parse_reptable(line, afflst)) {
551              finishFileMgr(afflst);
552              return 1;
553           }
554        }
555 
556        /* parse in the input conversion table */
557        if (strncmp(line,"ICONV",5) == 0) {
558           if (parse_convtable(line, afflst, &iconvtable, "ICONV")) {
559              finishFileMgr(afflst);
560              return 1;
561           }
562        }
563 
564        /* parse in the input conversion table */
565        if (strncmp(line,"OCONV",5) == 0) {
566           if (parse_convtable(line, afflst, &oconvtable, "OCONV")) {
567              finishFileMgr(afflst);
568              return 1;
569           }
570        }
571 
572        /* parse in the phonetic translation table */
573        if (strncmp(line,"PHONE",5) == 0) {
574           if (parse_phonetable(line, afflst)) {
575              finishFileMgr(afflst);
576              return 1;
577           }
578        }
579 
580        /* parse in the checkcompoundpattern table */
581        if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
582           if (parse_checkcpdtable(line, afflst)) {
583              finishFileMgr(afflst);
584              return 1;
585           }
586        }
587 
588        /* parse in the defcompound table */
589        if (strncmp(line,"COMPOUNDRULE",12) == 0) {
590           if (parse_defcpdtable(line, afflst)) {
591              finishFileMgr(afflst);
592              return 1;
593           }
594        }
595 
596        /* parse in the related character map table */
597        if (strncmp(line,"MAP",3) == 0) {
598           if (parse_maptable(line, afflst)) {
599              finishFileMgr(afflst);
600              return 1;
601           }
602        }
603 
604        /* parse in the word breakpoints table */
605        if (strncmp(line,"BREAK",5) == 0) {
606           if (parse_breaktable(line, afflst)) {
607              finishFileMgr(afflst);
608              return 1;
609           }
610        }
611 
612        /* parse in the language for language specific codes */
613        if (strncmp(line,"LANG",4) == 0) {
614           if (parse_string(line, &lang, afflst->getlinenum())) {
615              finishFileMgr(afflst);
616              return 1;
617           }
618           langnum = get_lang_num(lang);
619        }
620 
621        if (strncmp(line,"VERSION",7) == 0) {
622           for(line = line + 7; *line == ' ' || *line == '\t'; line++);
623           version = mystrdup(line);
624        }
625 
626        if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
627           if (parse_num(line, &maxngramsugs, afflst)) {
628              finishFileMgr(afflst);
629              return 1;
630           }
631        }
632 
633        if (strncmp(line,"ONLYMAXDIFF", 11) == 0)
634                    onlymaxdiff = 1;
635 
636        if (strncmp(line,"MAXDIFF",7) == 0) {
637           if (parse_num(line, &maxdiff, afflst)) {
638              finishFileMgr(afflst);
639              return 1;
640           }
641        }
642 
643        if (strncmp(line,"MAXCPDSUGS",10) == 0) {
644           if (parse_num(line, &maxcpdsugs, afflst)) {
645              finishFileMgr(afflst);
646              return 1;
647           }
648        }
649 
650        if (strncmp(line,"NOSPLITSUGS",11) == 0) {
651                    nosplitsugs=1;
652        }
653 
654        if (strncmp(line,"FULLSTRIP",9) == 0) {
655                    fullstrip=1;
656        }
657 
658        if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
659                    sugswithdots=1;
660        }
661 
662        /* parse in the flag used by forbidden words */
663        if (strncmp(line,"KEEPCASE",8) == 0) {
664           if (parse_flag(line, &keepcase, afflst)) {
665              finishFileMgr(afflst);
666              return 1;
667           }
668        }
669 
670        /* parse in the flag used by `forceucase' */
671        if (strncmp(line,"FORCEUCASE",10) == 0) {
672           if (parse_flag(line, &forceucase, afflst)) {
673              finishFileMgr(afflst);
674              return 1;
675           }
676        }
677 
678        /* parse in the flag used by `warn' */
679        if (strncmp(line,"WARN",4) == 0) {
680           if (parse_flag(line, &warn, afflst)) {
681              finishFileMgr(afflst);
682              return 1;
683           }
684        }
685 
686        if (strncmp(line,"FORBIDWARN",10) == 0) {
687                    forbidwarn=1;
688        }
689 
690        /* parse in the flag used by the affix generator */
691        if (strncmp(line,"SUBSTANDARD",11) == 0) {
692           if (parse_flag(line, &substandard, afflst)) {
693              finishFileMgr(afflst);
694              return 1;
695           }
696        }
697 
698        if (strncmp(line,"CHECKSHARPS",11) == 0) {
699                    checksharps=1;
700        }
701 
702        /* parse this affix: P - prefix, S - suffix */
703        ft = ' ';
704        if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
705        if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
706        if (ft != ' ') {
707           if (dupflags_ini) {
708             memset(dupflags, 0, sizeof(dupflags));
709             dupflags_ini = 0;
710           }
711           if (parse_affix(line, ft, afflst, dupflags)) {
712              finishFileMgr(afflst);
713              return 1;
714           }
715        }
716     }
717 
718     finishFileMgr(afflst);
719     // affix trees are sorted now
720 
721     // now we can speed up performance greatly taking advantage of the
722     // relationship between the affixes and the idea of "subsets".
723 
724     // View each prefix as a potential leading subset of another and view
725     // each suffix (reversed) as a potential trailing subset of another.
726 
727     // To illustrate this relationship if we know the prefix "ab" is found in the
728     // word to examine, only prefixes that "ab" is a leading subset of need be examined.
729     // Furthermore is "ab" is not present then none of the prefixes that "ab" is
730     // is a subset need be examined.
731     // The same argument goes for suffix string that are reversed.
732 
733     // Then to top this off why not examine the first char of the word to quickly
734     // limit the set of prefixes to examine (i.e. the prefixes to examine must
735     // be leading supersets of the first character of the word (if they exist)
736 
737     // To take advantage of this "subset" relationship, we need to add two links
738     // from entry.  One to take next if the current prefix is found (call it nexteq)
739     // and one to take next if the current prefix is not found (call it nextne).
740 
741     // Since we have built ordered lists, all that remains is to properly initialize
742     // the nextne and nexteq pointers that relate them
743 
744     process_pfx_order();
745     process_sfx_order();
746 
747     /* get encoding for CHECKCOMPOUNDCASE */
748     if (!utf8) {
749     char * enc = get_encoding();
750     csconv = get_current_cs(enc);
751     free(enc);
752     enc = NULL;
753 
754     char expw[MAXLNLEN];
755     if (wordchars) {
756         strcpy(expw, wordchars);
757         free(wordchars);
758     } else *expw = '\0';
759 
760     for (int i = 0; i <= 255; i++) {
761         if ( (csconv[i].cupper != csconv[i].clower) &&
762             (! strchr(expw, (char) i))) {
763                 *(expw + strlen(expw) + 1) = '\0';
764                 *(expw + strlen(expw)) = (char) i;
765         }
766     }
767 
768     wordchars = mystrdup(expw);
769     }
770 
771     // default BREAK definition
772     if (numbreak == -1) {
773         breaktable = (char **) malloc(sizeof(char *) * 3);
774         if (!breaktable) return 1;
775         breaktable[0] = mystrdup("-");
776         breaktable[1] = mystrdup("^-");
777         breaktable[2] = mystrdup("-$");
778         if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3;
779     }
780     return 0;
781 }
782 
783 
784 // we want to be able to quickly access prefix information
785 // both by prefix flag, and sorted by prefix string itself
786 // so we need to set up two indexes
787 
build_pfxtree(PfxEntry * pfxptr)788 int AffixMgr::build_pfxtree(PfxEntry* pfxptr)
789 {
790   PfxEntry * ptr;
791   PfxEntry * pptr;
792   PfxEntry * ep = pfxptr;
793 
794   // get the right starting points
795   const char * key = ep->getKey();
796   const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
797 
798   // first index by flag which must exist
799   ptr = pFlag[flg];
800   ep->setFlgNxt(ptr);
801   pFlag[flg] = ep;
802 
803 
804   // handle the special case of null affix string
805   if (strlen(key) == 0) {
806     // always inset them at head of list at element 0
807      ptr = pStart[0];
808      ep->setNext(ptr);
809      pStart[0] = ep;
810      return 0;
811   }
812 
813   // now handle the normal case
814   ep->setNextEQ(NULL);
815   ep->setNextNE(NULL);
816 
817   unsigned char sp = *((const unsigned char *)key);
818   ptr = pStart[sp];
819 
820   // handle the first insert
821   if (!ptr) {
822      pStart[sp] = ep;
823      return 0;
824   }
825 
826 
827   // otherwise use binary tree insertion so that a sorted
828   // list can easily be generated later
829   pptr = NULL;
830   for (;;) {
831     pptr = ptr;
832     if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
833        ptr = ptr->getNextEQ();
834        if (!ptr) {
835           pptr->setNextEQ(ep);
836           break;
837        }
838     } else {
839        ptr = ptr->getNextNE();
840        if (!ptr) {
841           pptr->setNextNE(ep);
842           break;
843        }
844     }
845   }
846   return 0;
847 }
848 
849 // we want to be able to quickly access suffix information
850 // both by suffix flag, and sorted by the reverse of the
851 // suffix string itself; so we need to set up two indexes
build_sfxtree(SfxEntry * sfxptr)852 int AffixMgr::build_sfxtree(SfxEntry* sfxptr)
853 {
854   SfxEntry * ptr;
855   SfxEntry * pptr;
856   SfxEntry * ep = sfxptr;
857 
858   /* get the right starting point */
859   const char * key = ep->getKey();
860   const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
861 
862   // first index by flag which must exist
863   ptr = sFlag[flg];
864   ep->setFlgNxt(ptr);
865   sFlag[flg] = ep;
866 
867   // next index by affix string
868 
869   // handle the special case of null affix string
870   if (strlen(key) == 0) {
871     // always inset them at head of list at element 0
872      ptr = sStart[0];
873      ep->setNext(ptr);
874      sStart[0] = ep;
875      return 0;
876   }
877 
878   // now handle the normal case
879   ep->setNextEQ(NULL);
880   ep->setNextNE(NULL);
881 
882   unsigned char sp = *((const unsigned char *)key);
883   ptr = sStart[sp];
884 
885   // handle the first insert
886   if (!ptr) {
887      sStart[sp] = ep;
888      return 0;
889   }
890 
891   // otherwise use binary tree insertion so that a sorted
892   // list can easily be generated later
893   pptr = NULL;
894   for (;;) {
895     pptr = ptr;
896     if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
897        ptr = ptr->getNextEQ();
898        if (!ptr) {
899           pptr->setNextEQ(ep);
900           break;
901        }
902     } else {
903        ptr = ptr->getNextNE();
904        if (!ptr) {
905           pptr->setNextNE(ep);
906           break;
907        }
908     }
909   }
910   return 0;
911 }
912 
913 // convert from binary tree to sorted list
process_pfx_tree_to_list()914 int AffixMgr::process_pfx_tree_to_list()
915 {
916   for (int i=1; i< SETSIZE; i++) {
917     pStart[i] = process_pfx_in_order(pStart[i],NULL);
918   }
919   return 0;
920 }
921 
922 
process_pfx_in_order(PfxEntry * ptr,PfxEntry * nptr)923 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr)
924 {
925   if (ptr) {
926     nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
927     ptr->setNext(nptr);
928     nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
929   }
930   return nptr;
931 }
932 
933 
934 // convert from binary tree to sorted list
process_sfx_tree_to_list()935 int AffixMgr:: process_sfx_tree_to_list()
936 {
937   for (int i=1; i< SETSIZE; i++) {
938     sStart[i] = process_sfx_in_order(sStart[i],NULL);
939   }
940   return 0;
941 }
942 
process_sfx_in_order(SfxEntry * ptr,SfxEntry * nptr)943 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr)
944 {
945   if (ptr) {
946     nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
947     ptr->setNext(nptr);
948     nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
949   }
950   return nptr;
951 }
952 
953 
954 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
955 // using the idea of leading subsets this time
process_pfx_order()956 int AffixMgr::process_pfx_order()
957 {
958     PfxEntry* ptr;
959 
960     // loop through each prefix list starting point
961     for (int i=1; i < SETSIZE; i++) {
962 
963          ptr = pStart[i];
964 
965          // look through the remainder of the list
966          //  and find next entry with affix that
967          // the current one is not a subset of
968          // mark that as destination for NextNE
969          // use next in list that you are a subset
970          // of as NextEQ
971 
972          for (; ptr != NULL; ptr = ptr->getNext()) {
973 
974              PfxEntry * nptr = ptr->getNext();
975              for (; nptr != NULL; nptr = nptr->getNext()) {
976                  if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
977              }
978              ptr->setNextNE(nptr);
979              ptr->setNextEQ(NULL);
980              if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
981                  ptr->setNextEQ(ptr->getNext());
982          }
983 
984          // now clean up by adding smart search termination strings:
985          // if you are already a superset of the previous prefix
986          // but not a subset of the next, search can end here
987          // so set NextNE properly
988 
989          ptr = pStart[i];
990          for (; ptr != NULL; ptr = ptr->getNext()) {
991              PfxEntry * nptr = ptr->getNext();
992              PfxEntry * mptr = NULL;
993              for (; nptr != NULL; nptr = nptr->getNext()) {
994                  if (! isSubset(ptr->getKey(),nptr->getKey())) break;
995                  mptr = nptr;
996              }
997              if (mptr) mptr->setNextNE(NULL);
998          }
999     }
1000     return 0;
1001 }
1002 
1003 // initialize the SfxEntry links NextEQ and NextNE to speed searching
1004 // using the idea of leading subsets this time
process_sfx_order()1005 int AffixMgr::process_sfx_order()
1006 {
1007     SfxEntry* ptr;
1008 
1009     // loop through each prefix list starting point
1010     for (int i=1; i < SETSIZE; i++) {
1011 
1012          ptr = sStart[i];
1013 
1014          // look through the remainder of the list
1015          //  and find next entry with affix that
1016          // the current one is not a subset of
1017          // mark that as destination for NextNE
1018          // use next in list that you are a subset
1019          // of as NextEQ
1020 
1021          for (; ptr != NULL; ptr = ptr->getNext()) {
1022              SfxEntry * nptr = ptr->getNext();
1023              for (; nptr != NULL; nptr = nptr->getNext()) {
1024                  if (! isSubset(ptr->getKey(),nptr->getKey())) break;
1025              }
1026              ptr->setNextNE(nptr);
1027              ptr->setNextEQ(NULL);
1028              if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
1029                  ptr->setNextEQ(ptr->getNext());
1030          }
1031 
1032 
1033          // now clean up by adding smart search termination strings:
1034          // if you are already a superset of the previous suffix
1035          // but not a subset of the next, search can end here
1036          // so set NextNE properly
1037 
1038          ptr = sStart[i];
1039          for (; ptr != NULL; ptr = ptr->getNext()) {
1040              SfxEntry * nptr = ptr->getNext();
1041              SfxEntry * mptr = NULL;
1042              for (; nptr != NULL; nptr = nptr->getNext()) {
1043                  if (! isSubset(ptr->getKey(),nptr->getKey())) break;
1044                  mptr = nptr;
1045              }
1046              if (mptr) mptr->setNextNE(NULL);
1047          }
1048     }
1049     return 0;
1050 }
1051 
1052 // add flags to the result for dictionary debugging
debugflag(char * result,unsigned short flag)1053 void AffixMgr::debugflag(char * result, unsigned short flag) {
1054     char * st = encode_flag(flag);
1055     mystrcat(result, " ", MAXLNLEN);
1056     mystrcat(result, MORPH_FLAG, MAXLNLEN);
1057     if (st) {
1058         mystrcat(result, st, MAXLNLEN);
1059         free(st);
1060     }
1061 }
1062 
1063 // calculate the character length of the condition
condlen(char * st)1064 int AffixMgr::condlen(char * st)
1065 {
1066   int l = 0;
1067   bool group = false;
1068   for(; *st; st++) {
1069     if (*st == '[') {
1070         group = true;
1071         l++;
1072     } else if (*st == ']') group = false;
1073     else if (!group && (!utf8 ||
1074         (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;
1075   }
1076   return l;
1077 }
1078 
encodeit(affentry & entry,char * cs)1079 int AffixMgr::encodeit(affentry &entry, char * cs)
1080 {
1081   if (strcmp(cs,".") != 0) {
1082     entry.numconds = (char) condlen(cs);
1083     strncpy(entry.c.conds, cs, MAXCONDLEN);
1084     // long condition (end of conds padded by strncpy)
1085     if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {
1086       entry.opts += aeLONGCOND;
1087       entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
1088       if (!entry.c.l.conds2) return 1;
1089     }
1090   } else {
1091     entry.numconds = 0;
1092     entry.c.conds[0] = '\0';
1093   }
1094   return 0;
1095 }
1096 
1097 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
isSubset(const char * s1,const char * s2)1098 inline int AffixMgr::isSubset(const char * s1, const char * s2)
1099  {
1100     while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
1101         s1++;
1102         s2++;
1103     }
1104     return (*s1 == '\0');
1105  }
1106 
1107 
1108 // check word for prefixes
prefix_check(const char * word,int len,char in_compound,const FLAG needflag)1109 struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
1110     const FLAG needflag)
1111 {
1112     struct hentry * rv= NULL;
1113 
1114     pfx = NULL;
1115     pfxappnd = NULL;
1116     sfxappnd = NULL;
1117 
1118     // first handle the special case of 0 length prefixes
1119     PfxEntry * pe = pStart[0];
1120     while (pe) {
1121         if (
1122             // fogemorpheme
1123               ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
1124                   (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
1125             // permit prefixes in compounds
1126               ((in_compound != IN_CPD_END) || (pe->getCont() &&
1127                   (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
1128               ) {
1129                     // check prefix
1130                     rv = pe->checkword(word, len, in_compound, needflag);
1131                     if (rv) {
1132                         pfx=pe; // BUG: pfx not stateless
1133                         return rv;
1134                     }
1135              }
1136        pe = pe->getNext();
1137     }
1138 
1139     // now handle the general case
1140     unsigned char sp = *((const unsigned char *)word);
1141     PfxEntry * pptr = pStart[sp];
1142 
1143     while (pptr) {
1144         if (isSubset(pptr->getKey(),word)) {
1145              if (
1146             // fogemorpheme
1147               ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
1148                   (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
1149             // permit prefixes in compounds
1150               ((in_compound != IN_CPD_END) || (pptr->getCont() &&
1151                   (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
1152               ) {
1153             // check prefix
1154                   rv = pptr->checkword(word, len, in_compound, needflag);
1155                   if (rv) {
1156                     pfx=pptr; // BUG: pfx not stateless
1157                     return rv;
1158                   }
1159              }
1160              pptr = pptr->getNextEQ();
1161         } else {
1162              pptr = pptr->getNextNE();
1163         }
1164     }
1165 
1166     return NULL;
1167 }
1168 
1169 // check word for prefixes
prefix_check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)1170 struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
1171     char in_compound, const FLAG needflag)
1172 {
1173     struct hentry * rv= NULL;
1174 
1175     pfx = NULL;
1176     sfxappnd = NULL;
1177 
1178     // first handle the special case of 0 length prefixes
1179     PfxEntry * pe = pStart[0];
1180 
1181     while (pe) {
1182         rv = pe->check_twosfx(word, len, in_compound, needflag);
1183         if (rv) return rv;
1184         pe = pe->getNext();
1185     }
1186 
1187     // now handle the general case
1188     unsigned char sp = *((const unsigned char *)word);
1189     PfxEntry * pptr = pStart[sp];
1190 
1191     while (pptr) {
1192         if (isSubset(pptr->getKey(),word)) {
1193             rv = pptr->check_twosfx(word, len, in_compound, needflag);
1194             if (rv) {
1195                 pfx = pptr;
1196                 return rv;
1197             }
1198             pptr = pptr->getNextEQ();
1199         } else {
1200              pptr = pptr->getNextNE();
1201         }
1202     }
1203 
1204     return NULL;
1205 }
1206 
1207 // check word for prefixes
prefix_check_morph(const char * word,int len,char in_compound,const FLAG needflag)1208 char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
1209     const FLAG needflag)
1210 {
1211     char * st;
1212 
1213     char result[MAXLNLEN];
1214     result[0] = '\0';
1215 
1216     pfx = NULL;
1217     sfxappnd = NULL;
1218 
1219     // first handle the special case of 0 length prefixes
1220     PfxEntry * pe = pStart[0];
1221     while (pe) {
1222        st = pe->check_morph(word,len,in_compound, needflag);
1223        if (st) {
1224             mystrcat(result, st, MAXLNLEN);
1225             free(st);
1226        }
1227        // if (rv) return rv;
1228        pe = pe->getNext();
1229     }
1230 
1231     // now handle the general case
1232     unsigned char sp = *((const unsigned char *)word);
1233     PfxEntry * pptr = pStart[sp];
1234 
1235     while (pptr) {
1236         if (isSubset(pptr->getKey(),word)) {
1237             st = pptr->check_morph(word,len,in_compound, needflag);
1238             if (st) {
1239               // fogemorpheme
1240               if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
1241                         (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
1242                     mystrcat(result, st, MAXLNLEN);
1243                     pfx = pptr;
1244                 }
1245                 free(st);
1246             }
1247             pptr = pptr->getNextEQ();
1248         } else {
1249             pptr = pptr->getNextNE();
1250         }
1251     }
1252 
1253     if (*result) return mystrdup(result);
1254     return NULL;
1255 }
1256 
1257 
1258 // check word for prefixes
prefix_check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)1259 char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
1260     char in_compound, const FLAG needflag)
1261 {
1262     char * st;
1263 
1264     char result[MAXLNLEN];
1265     result[0] = '\0';
1266 
1267     pfx = NULL;
1268     sfxappnd = NULL;
1269 
1270     // first handle the special case of 0 length prefixes
1271     PfxEntry * pe = pStart[0];
1272     while (pe) {
1273         st = pe->check_twosfx_morph(word,len,in_compound, needflag);
1274         if (st) {
1275             mystrcat(result, st, MAXLNLEN);
1276             free(st);
1277         }
1278         pe = pe->getNext();
1279     }
1280 
1281     // now handle the general case
1282     unsigned char sp = *((const unsigned char *)word);
1283     PfxEntry * pptr = pStart[sp];
1284 
1285     while (pptr) {
1286         if (isSubset(pptr->getKey(),word)) {
1287             st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
1288             if (st) {
1289                 mystrcat(result, st, MAXLNLEN);
1290                 free(st);
1291                 pfx = pptr;
1292             }
1293             pptr = pptr->getNextEQ();
1294         } else {
1295             pptr = pptr->getNextNE();
1296         }
1297     }
1298 
1299     if (*result) return mystrdup(result);
1300     return NULL;
1301 }
1302 
1303 // Is word a non compound with a REP substitution (see checkcompoundrep)?
cpdrep_check(const char * word,int wl)1304 int AffixMgr::cpdrep_check(const char * word, int wl)
1305 {
1306   char candidate[MAXLNLEN];
1307   const char * r;
1308   int lenr, lenp;
1309 
1310   if ((wl < 2) || !numrep) return 0;
1311 
1312   for (int i=0; i < numrep; i++ ) {
1313       r = word;
1314       lenr = strlen(reptable[i].pattern2);
1315       lenp = strlen(reptable[i].pattern);
1316       // search every occurence of the pattern in the word
1317       while ((r=strstr(r, reptable[i].pattern)) != NULL) {
1318           strcpy(candidate, word);
1319           if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
1320           strcpy(candidate+(r-word),reptable[i].pattern2);
1321           strcpy(candidate+(r-word)+lenr, r+lenp);
1322           if (candidate_check(candidate,strlen(candidate))) return 1;
1323           r++; // search for the next letter
1324       }
1325    }
1326    return 0;
1327 }
1328 
1329 // forbid compoundings when there are special patterns at word bound
cpdpat_check(const char * word,int pos,hentry * r1,hentry * r2,const char)1330 int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2, const char /*affixed*/)
1331 {
1332   int len;
1333   for (int i = 0; i < numcheckcpd; i++) {
1334       if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
1335         (!r1 || !checkcpdtable[i].cond ||
1336           (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
1337         (!r2 || !checkcpdtable[i].cond2 ||
1338           (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
1339         // zero length pattern => only TESTAFF
1340         // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
1341         (!*(checkcpdtable[i].pattern) || (
1342             (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
1343             (*(checkcpdtable[i].pattern)!='0' && ((len = strlen(checkcpdtable[i].pattern)) != 0) &&
1344                 strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) {
1345             return 1;
1346         }
1347   }
1348   return 0;
1349 }
1350 
1351 // forbid compounding with neighbouring upper and lower case characters at word bounds
cpdcase_check(const char * word,int pos)1352 int AffixMgr::cpdcase_check(const char * word, int pos)
1353 {
1354   if (utf8) {
1355       w_char u, w;
1356       const char * p;
1357       u8_u16(&u, 1, word + pos);
1358       for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
1359       u8_u16(&w, 1, p);
1360       unsigned short a = (u.h << 8) + u.l;
1361       unsigned short b = (w.h << 8) + w.l;
1362       if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) &&
1363           (a != '-') && (b != '-')) return 1;
1364   } else {
1365       unsigned char a = *(word + pos - 1);
1366       unsigned char b = *(word + pos);
1367       if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
1368   }
1369   return 0;
1370 }
1371 
1372 // check compound patterns
// Match the compound members collected so far against the compound rule
// table (defcpdtable with numdefcpd rules). A rule is a sequence of flags in
// which a flag may be followed by the metacharacter '*' or '?'.
//
//   words - in/out: address of the member buffer. If *words is NULL the
//           buffer `def` is installed; when the check fails afterwards,
//           *words is reset to NULL again.
//   wnum  - index at which `rv` is stored; members 0..wnum are matched.
//   rv    - dictionary entry of the newest member (dereferenced without a
//           NULL test).
//   def   - fallback buffer, may be NULL.
//   all   - non-zero: after matching, the rule must be used up (apart from
//           trailing optional "x*" / "x?" items); zero: it is sufficient
//           that the members match the beginning of a rule.
//
// Returns 1 if a rule matches, otherwise 0. On a failure that happens after
// `rv` was stored, (*words)[wnum] is cleared again.
defcpd_check(hentry *** words,short wnum,hentry * rv,hentry ** def,char all)1373 int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
1374 {
1375   signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
1376   signed short btwp[MAXWORDLEN]; // word positions for metacharacters
1377   int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
1378   short bt = 0; // number of backtracking records currently in use
1379   int i, j;
1380   int ok;
1381   int w = 0; // set to 1 when *words was installed here (must be undone on failure)
1382 
  // no buffer from the caller: use the fallback buffer
1383   if (!*words) {
1384     w = 1;
1385     *words = def;
1386   }
1387 
  // still no buffer (def was NULL too): nothing can be checked
1388   if (!*words) {
1389     return 0;
1390   }
1391 
1392   (*words)[wnum] = rv;
1393 
1394   // has the last word COMPOUNDRULE flag?
  // (a member without any flag cannot match a rule)
1395   if (rv->alen == 0) {
1396     (*words)[wnum] = NULL;
1397     if (w) *words = NULL;
1398     return 0;
1399   }
  // quick rejection: rv must carry at least one flag that occurs literally
  // (not as '*' or '?') in one of the rules; the break only leaves the
  // inner loop, the remaining rules are still scanned
1400   ok = 0;
1401   for (i = 0; i < numdefcpd; i++) {
1402     for (j = 0; j < defcpdtable[i].len; j++) {
1403        if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&
1404           TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) {
1405          ok = 1;
1406          break;
1407        }
1408     }
1409   }
1410   if (ok == 0) {
1411     (*words)[wnum] = NULL;
1412     if (w) *words = NULL;
1413     return 0;
1414   }
1415 
  // full match of the members 0..wnum against every rule, backtracking over
  // the '*' and '?' items
1416   for (i = 0; i < numdefcpd; i++) {
1417     signed short pp = 0; // pattern position
1418     signed short wp = 0; // "words" position
1419     int ok2;
1420     ok = 1;
1421     ok2 = 1;
1422     do {
1423       while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
        // flag followed by '*' or '?': consume as many members as allowed
        // ('?' at most one, '*' up to the last member) and remember the
        // position for backtracking
1424         if (((pp+1) < defcpdtable[i].len) &&
1425           ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
1426             int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
1427             ok2 = 1;
1428             pp+=2;
1429             btpp[bt] = pp;
1430             btwp[bt] = wp;
1431             while (wp <= wend) {
1432                 if (!(*words)[wp]->alen ||
1433                   !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
1434                     ok2 = 0;
1435                     break;
1436                 }
1437                 wp++;
1438             }
            // members are left over after this item
1439             if (wp <= wnum) ok2 = 0;
            // number of members consumed by this item; a record is only
            // kept when at least one member was consumed
1440             btnum[bt] = wp - btwp[bt];
1441             if (btnum[bt] > 0) bt++;
1442             if (ok2) break;
1443         } else {
        // plain flag: the current member must exist and carry it
1444             ok2 = 1;
1445             if (!(*words)[wp] || !(*words)[wp]->alen ||
1446               !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
1447                 ok = 0;
1448                 break;
1449             }
1450             pp++;
1451             wp++;
            // rule used up although members remain
1452             if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
1453         }
1454       }
    // matched so far: accept if the rest of the rule consists only of
    // optional "x*" / "x?" items
1455     if (ok && ok2) {
1456         int r = pp;
1457         while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
1458             ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
1459         if (defcpdtable[i].len <= r) return 1;
1460     }
1461     // backtrack
    // retry from the most recent metacharacter with one member fewer;
    // records whose count drops below zero are discarded
1462     if (bt) do {
1463         ok = 1;
1464         btnum[bt - 1]--;
1465         pp = btpp[bt - 1];
1466         wp = btwp[bt - 1] + (signed short) btnum[bt - 1];
1467     } while ((btnum[bt - 1] < 0) && --bt);
1468   } while (bt);
1469 
  // with all == 0 a match of the beginning of the rule is enough
1470   if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
1471 
1472   // check zero ending
1473   while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
1474     ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
1475   if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
1476   }
  // no rule matched: undo the registration of rv
1477   (*words)[wnum] = NULL;
1478   if (w) *words = NULL;
1479   return 0;
1480 }
1481 
candidate_check(const char * word,int len)1482 inline int AffixMgr::candidate_check(const char * word, int len)
1483 {
1484   struct hentry * rv=NULL;
1485 
1486   rv = lookup(word);
1487   if (rv) return 1;
1488 
1489 //  rv = prefix_check(word,len,1);
1490 //  if (rv) return 1;
1491 
1492   rv = affix_check(word,len);
1493   if (rv) return 1;
1494   return 0;
1495 }
1496 
1497 // calculate number of syllable for compound-checking
get_syllable(const char * word,int wlen)1498 short AffixMgr::get_syllable(const char * word, int wlen)
1499 {
1500     if (cpdmaxsyllable==0) return 0;
1501 
1502     short num=0;
1503 
1504     if (!utf8) {
1505         for (int i=0; i<wlen; i++) {
1506             if (strchr(cpdvowels, word[i])) num++;
1507         }
1508     } else if (cpdvowels_utf16) {
1509         w_char w[MAXWORDUTF8LEN];
1510         int i = u8_u16(w, MAXWORDUTF8LEN, word);
1511         for (; i > 0; i--) {
1512             if (flag_bsearch((unsigned short *) cpdvowels_utf16,
1513                 ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
1514         }
1515     }
1516     return num;
1517 }
1518 
setcminmax(int * cmin,int * cmax,const char * word,int len)1519 void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) {
1520     if (utf8) {
1521         int i;
1522         for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {
1523           for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++);
1524         }
1525         for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {
1526           for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--);
1527         }
1528     } else {
1529         *cmin = cpdmin;
1530         *cmax = len - cpdmin + 1;
1531     }
1532 }
1533 
1534 
1535 // check if compound word is correctly spelled
1536 // hu_mov_rule = spec. Hungarian rule (XXX)
compound_check(const char * word,int len,short wordnum,short numsyllable,short maxwordnum,short wnum,hentry ** words=NULL,char hu_mov_rule=0,char is_sug=0,int * info=NULL)1537 struct hentry * AffixMgr::compound_check(const char * word, int len,
1538     short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
1539     char hu_mov_rule = 0, char is_sug = 0, int * info = NULL)
1540 {
1541     int i;
1542     short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1543     struct hentry * rv = NULL;
1544     struct hentry * rv_first;
1545     struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
1546     char st [MAXWORDUTF8LEN + 4];
1547     char ch = '\0';
1548     int cmin;
1549     int cmax;
1550     int striple = 0;
1551     int scpd = 0;
1552     int soldi = 0;
1553     int oldcmin = 0;
1554     int oldcmax = 0;
1555     int oldlen = 0;
1556     int checkedstriple = 0;
1557     int onlycpdrule;
1558     char affixed = 0;
1559     hentry ** oldwords = words;
1560 
1561     int checked_prefix;
1562 
1563     setcminmax(&cmin, &cmax, word, len);
1564 
1565     strcpy(st, word);
1566 
1567     for (i = cmin; i < cmax; i++) {
1568         // go to end of the UTF-8 character
1569         if (utf8) {
1570             for (; (st[i] & 0xc0) == 0x80; i++);
1571             if (i >= cmax) return NULL;
1572         }
1573 
1574         words = oldwords;
1575         onlycpdrule = (words) ? 1 : 0;
1576 
1577         do { // onlycpdrule loop
1578 
1579         oldnumsyllable = numsyllable;
1580         oldwordnum = wordnum;
1581         checked_prefix = 0;
1582 
1583 
1584         do { // simplified checkcompoundpattern loop
1585 
1586         if (scpd > 0) {
1587           for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 ||
1588             strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++);
1589 
1590           if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop
1591           strcpy(st + i, checkcpdtable[scpd-1].pattern);
1592           soldi = i;
1593           i += strlen(checkcpdtable[scpd-1].pattern);
1594           strcpy(st + i, checkcpdtable[scpd-1].pattern2);
1595           strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3));
1596 
1597           oldlen = len;
1598           len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3);
1599           oldcmin = cmin;
1600           oldcmax = cmax;
1601           setcminmax(&cmin, &cmax, st, len);
1602 
1603           cmax = len - cpdmin + 1;
1604         }
1605 
1606         ch = st[i];
1607         st[i] = '\0';
1608 
1609         sfx = NULL;
1610         pfx = NULL;
1611 
1612         // FIRST WORD
1613 
1614         affixed = 1;
1615         rv = lookup(st); // perhaps without prefix
1616 
1617         // search homonym with compound flag
1618         while ((rv) && !hu_mov_rule &&
1619             ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1620                 !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1621                   (compoundbegin && !wordnum && !onlycpdrule &&
1622                         TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1623                   (compoundmiddle && wordnum && !words && !onlycpdrule &&
1624                     TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
1625                   (numdefcpd && onlycpdrule &&
1626                     ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
1627                     (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) ||
1628                   (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL &&
1629                     !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)))
1630                   ) {
1631             rv = rv->next_homonym;
1632         }
1633 
1634         if (rv) affixed = 0;
1635 
1636         if (!rv) {
1637             if (onlycpdrule) break;
1638             if (compoundflag &&
1639              !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
1640                 if (((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
1641                         FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1642                         (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundflag)))) && !hu_mov_rule &&
1643                     sfx->getCont() &&
1644                         ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,
1645                             sfx->getContLen())) || (compoundend &&
1646                         TESTAFF(sfx->getCont(), compoundend,
1647                             sfx->getContLen())))) {
1648                         rv = NULL;
1649                 }
1650             }
1651 
1652             if (rv ||
1653               (((wordnum == 0) && compoundbegin &&
1654                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1655                 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundbegin))) || // twofold suffixes + compound
1656                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
1657               ((wordnum > 0) && compoundmiddle &&
1658                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1659                 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundmiddle))) || // twofold suffixes + compound
1660                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
1661               ) checked_prefix = 1;
1662         // else check forbiddenwords and needaffix
1663         } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1664             TESTAFF(rv->astr, needaffix, rv->alen) ||
1665             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1666             (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
1667              )) {
1668                 st[i] = ch;
1669                 //continue;
1670                 break;
1671         }
1672 
1673             // check non_compound flag in suffix and prefix
1674             if ((rv) && !hu_mov_rule &&
1675                 ((pfx && pfx->getCont() &&
1676                     TESTAFF(pfx->getCont(), compoundforbidflag,
1677                         pfx->getContLen())) ||
1678                 (sfx && sfx->getCont() &&
1679                     TESTAFF(sfx->getCont(), compoundforbidflag,
1680                         sfx->getContLen())))) {
1681                     rv = NULL;
1682             }
1683 
1684             // check compoundend flag in suffix and prefix
1685             if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
1686                 ((pfx && pfx->getCont() &&
1687                     TESTAFF(pfx->getCont(), compoundend,
1688                         pfx->getContLen())) ||
1689                 (sfx && sfx->getCont() &&
1690                     TESTAFF(sfx->getCont(), compoundend,
1691                         sfx->getContLen())))) {
1692                     rv = NULL;
1693             }
1694 
1695             // check compoundmiddle flag in suffix and prefix
1696             if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
1697                 ((pfx && pfx->getCont() &&
1698                     TESTAFF(pfx->getCont(), compoundmiddle,
1699                         pfx->getContLen())) ||
1700                 (sfx && sfx->getCont() &&
1701                     TESTAFF(sfx->getCont(), compoundmiddle,
1702                         sfx->getContLen())))) {
1703                     rv = NULL;
1704             }
1705 
1706         // check forbiddenwords
1707         if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1708             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1709             (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1710                 return NULL;
1711             }
1712 
1713         // increment word number, if the second root has a compoundroot flag
1714         if ((rv) && compoundroot &&
1715             (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1716                 wordnum++;
1717         }
1718 
1719         // first word is acceptable in compound words?
1720         if (((rv) &&
1721           ( checked_prefix || (words && words[wnum]) ||
1722             (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1723             ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1724             ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||
1725 //            (numdefcpd && )
1726 
1727 // LANG_hu section: spec. Hungarian rule
1728             || ((langnum == LANG_hu) && hu_mov_rule && (
1729                     TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes
1730                     TESTAFF(rv->astr, 'G', rv->alen) ||
1731                     TESTAFF(rv->astr, 'H', rv->alen)
1732                 )
1733               )
1734 // END of LANG_hu section
1735           ) &&
1736           (
1737              // test CHECKCOMPOUNDPATTERN conditions
1738              scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL ||
1739                 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)
1740           )
1741           && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple letters
1742                    (word[i-1]==word[i]) && (
1743                       ((i>1) && (word[i-1]==word[i-2])) ||
1744                       ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
1745                    )
1746                ) ||
1747                (
1748                  checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i)
1749                ))
1750          )
1751 // LANG_hu section: spec. Hungarian rule
1752          || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
1753               (sfx && sfx->getCont() && ( // XXX hardwired Hungarian dic. codes
1754                         TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) ||
1755                         TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen())
1756                     )
1757                )
1758              )
1759          ) { // first word is ok condition
1760 
1761 // LANG_hu section: spec. Hungarian rule
1762             if (langnum == LANG_hu) {
1763                 // calculate syllable number of the word
1764                 numsyllable += get_syllable(st, i);
1765                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1766                 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
1767             }
1768 // END of LANG_hu section
1769 
1770             // NEXT WORD(S)
1771             rv_first = rv;
1772             st[i] = ch;
1773 
1774         do { // striple loop
1775 
1776             // check simplifiedtriple
1777             if (simplifiedtriple) {
1778               if (striple) {
1779                 checkedstriple = 1;
1780                 i--; // check "fahrt" instead of "ahrt" in "Schiffahrt"
1781               } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1;
1782             }
1783 
1784             rv = lookup((st+i)); // perhaps without prefix
1785 
1786         // search homonym with compound flag
1787         while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1788                         !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1789                           (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
1790                            (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) ||
1791                              (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL &&
1792                                 !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))
1793                            )) {
1794             rv = rv->next_homonym;
1795         }
1796 
1797             // check FORCEUCASE
1798             if (rv && forceucase && (rv) &&
1799                 (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL;
1800 
1801             if (rv && words && words[wnum + 1]) return rv_first;
1802 
1803             oldnumsyllable2 = numsyllable;
1804             oldwordnum2 = wordnum;
1805 
1806 
1807 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
1808             if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
1809                 numsyllable--;
1810             }
1811 // END of LANG_hu section
1812 
1813             // increment word number, if the second root has a compoundroot flag
1814             if ((rv) && (compoundroot) &&
1815                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1816                     wordnum++;
1817             }
1818 
1819             // check forbiddenwords
1820             if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1821                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1822                (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
1823 
1824             // second word is acceptable, as a root?
1825             // hungarian conventions: compounding is acceptable,
1826             // when compound forms consist of 2 words, or if more,
1827             // then the syllable number of root words must be 6, or lesser.
1828 
1829             if ((rv) && (
1830                       (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1831                       (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
1832                     )
1833                 && (
1834                       ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
1835                       ((cpdmaxsyllable!=0) &&
1836                           (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable))
1837                     ) &&
1838                (
1839                  // test CHECKCOMPOUNDPATTERN
1840                  !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv, 0)
1841                ) &&
1842                 (
1843                      (!checkcompounddup || (rv != rv_first))
1844                    )
1845             // test CHECKCOMPOUNDPATTERN conditions
1846                 && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
1847                       TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))
1848                 )
1849                  {
1850                       // forbid compound word, if it is a non compound word with typical fault
1851                       if (checkcompoundrep && cpdrep_check(word,len)) return NULL;
1852                       return rv_first;
1853             }
1854 
1855             numsyllable = oldnumsyllable2;
1856             wordnum = oldwordnum2;
1857 
1858             // perhaps second word has prefix or/and suffix
1859             sfx = NULL;
1860             sfxflag = FLAG_NULL;
1861             rv = (compoundflag && !onlycpdrule) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL;
1862             if (!rv && compoundend && !onlycpdrule) {
1863                 sfx = NULL;
1864                 pfx = NULL;
1865                 rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END);
1866             }
1867 
1868             if (!rv && numdefcpd && words) {
1869                 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
1870                 if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first;
1871                 rv = NULL;
1872             }
1873 
1874             // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1875             if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
1876                 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL;
1877 
1878             // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
1879             if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) rv = NULL;
1880 
1881             // check non_compound flag in suffix and prefix
1882             if ((rv) &&
1883                 ((pfx && pfx->getCont() &&
1884                     TESTAFF(pfx->getCont(), compoundforbidflag,
1885                         pfx->getContLen())) ||
1886                 (sfx && sfx->getCont() &&
1887                     TESTAFF(sfx->getCont(), compoundforbidflag,
1888                         sfx->getContLen())))) {
1889                     rv = NULL;
1890             }
1891 
1892             // check FORCEUCASE
1893             if (rv && forceucase && (rv) &&
1894                 (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL;
1895 
1896             // check forbiddenwords
1897             if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1898                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1899                (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
1900 
1901             // pfxappnd = prefix of word+i, or NULL
1902             // calculate syllable number of prefix.
1903             // hungarian convention: when syllable number of prefix is more,
1904             // than 1, the prefix+word counts as two words.
1905 
1906             if (langnum == LANG_hu) {
1907                 // calculate syllable number of the word
1908                 numsyllable += get_syllable(word + i, strlen(word + i));
1909 
1910                 // - affix syllable num.
1911                 // XXX only second suffix (inflections, not derivations)
1912                 if (sfxappnd) {
1913                     char * tmp = myrevstrdup(sfxappnd);
1914                     numsyllable -= get_syllable(tmp, strlen(tmp));
1915                     free(tmp);
1916                 }
1917 
1918                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1919                 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
1920 
1921                 // increment syllable num, if last word has a SYLLABLENUM flag
1922                 // and the suffix is beginning `s'
1923 
1924                 if (cpdsyllablenum) {
1925                     switch (sfxflag) {
1926                         case 'c': { numsyllable+=2; break; }
1927                         case 'J': { numsyllable += 1; break; }
1928                         case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
1929                     }
1930                 }
1931             }
1932 
1933             // increment word number, if the second word has a compoundroot flag
1934             if ((rv) && (compoundroot) &&
1935                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1936                     wordnum++;
1937             }
1938 
1939             // second word is acceptable, as a word with prefix or/and suffix?
1940             // hungarian conventions: compounding is acceptable,
1941             // when compound forms consist 2 word, otherwise
1942             // the syllable number of root words is 6, or lesser.
1943             if ((rv) &&
1944                     (
1945                       ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
1946                       ((cpdmaxsyllable != 0) &&
1947                           (numsyllable <= cpdmaxsyllable))
1948                     )
1949                 && (
1950                    (!checkcompounddup || (rv != rv_first))
1951                    )) {
1952                     // forbid compound word, if it is a non compound word with typical fault
1953                     if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
1954                     return rv_first;
1955             }
1956 
1957             numsyllable = oldnumsyllable2;
1958             wordnum = oldwordnum2;
1959 
1960             // perhaps second word is a compound word (recursive call)
1961             if (wordnum < maxwordnum) {
1962                 rv = compound_check((st+i),strlen(st+i), wordnum+1,
1963                      numsyllable, maxwordnum, wnum + 1, words, 0, is_sug, info);
1964 
1965                 if (rv && numcheckcpd && ((scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) ||
1966                    (scpd != 0 && !cpdpat_check(word, i, rv_first, rv, affixed)))) rv = NULL;
1967             } else {
1968                 rv=NULL;
1969             }
1970             if (rv) {
1971                 // forbid compound word, if it is a non compound word with typical fault
1972                 if (checkcompoundrep || forbiddenword) {
1973                     struct hentry * rv2 = NULL;
1974 
1975                     if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
1976 
1977                     // check first part
1978                     if (strncmp(rv->word, word + i, rv->blen) == 0) {
1979                         char r = *(st + i + rv->blen);
1980                         *(st + i + rv->blen) = '\0';
1981 
1982                         if (checkcompoundrep && cpdrep_check(st, i + rv->blen)) {
1983                             *(st + i + rv->blen) = r;
1984                             continue;
1985                         }
1986 
1987                         if (forbiddenword) {
1988                             rv2 = lookup(word);
1989                             if (!rv2) rv2 = affix_check(word, len);
1990                             if (rv2 && rv2->astr && TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
1991                                 (strncmp(rv2->word, st, i + rv->blen) == 0)) {
1992                                     return NULL;
1993                             }
1994                         }
1995                         *(st + i + rv->blen) = r;
1996                     }
1997                 }
1998                 return rv_first;
1999             }
2000           } while (striple && !checkedstriple); // end of striple loop
2001 
2002           if (checkedstriple) {
2003             i++;
2004             checkedstriple = 0;
2005             striple = 0;
2006           }
2007 
2008         } // first word is ok condition
2009 
2010         if (soldi != 0) {
2011           i = soldi;
2012           soldi = 0;
2013           len = oldlen;
2014           cmin = oldcmin;
2015           cmax = oldcmax;
2016         }
2017         scpd++;
2018 
2019 
2020         } while (!onlycpdrule && simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop
2021 
2022         scpd = 0;
2023         wordnum = oldwordnum;
2024         numsyllable = oldnumsyllable;
2025 
2026         if (soldi != 0) {
2027           i = soldi;
2028           strcpy(st, word); // XXX add more optim.
2029           soldi = 0;
2030         } else st[i] = ch;
2031 
2032         } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop
2033 
2034     }
2035 
2036     return NULL;
2037 }
2038 
2039 // check a compound word and append the morphological description of its parts to *result
2040 // hu_mov_rule = special Hungarian rule (XXX)
compound_check_morph(const char * word,int len,short wordnum,short numsyllable,short maxwordnum,short wnum,hentry ** words,char hu_mov_rule=0,char ** result=NULL,char * partresult=NULL)2041 int AffixMgr::compound_check_morph(const char * word, int len,
2042     short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
2043     char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL)
2044 {
2045     int i;
2046     short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
2047     int ok = 0;
2048 
2049     struct hentry * rv = NULL;
2050     struct hentry * rv_first;
2051     struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
2052     char st [MAXWORDUTF8LEN + 4];
2053     char ch;
2054 
2055     int checked_prefix;
2056     char presult[MAXLNLEN];
2057 
2058     int cmin;
2059     int cmax;
2060 
2061     int onlycpdrule;
2062     char affixed = 0;
2063     hentry ** oldwords = words;
2064 
2065     setcminmax(&cmin, &cmax, word, len);
2066 
2067     strcpy(st, word);
2068 
2069     for (i = cmin; i < cmax; i++) {
2070         oldnumsyllable = numsyllable;
2071         oldwordnum = wordnum;
2072         checked_prefix = 0;
2073 
2074         // go to end of the UTF-8 character
2075         if (utf8) {
2076             for (; (st[i] & 0xc0) == 0x80; i++);
2077             if (i >= cmax) return 0;
2078         }
2079 
2080         words = oldwords;
2081         onlycpdrule = (words) ? 1 : 0;
2082 
2083         do { // onlycpdrule loop
2084 
2085         oldnumsyllable = numsyllable;
2086         oldwordnum = wordnum;
2087         checked_prefix = 0;
2088 
2089         ch = st[i];
2090         st[i] = '\0';
2091         sfx = NULL;
2092 
2093         // FIRST WORD
2094 
2095         affixed = 1;
2096 
2097         *presult = '\0';
2098         if (partresult) mystrcat(presult, partresult, MAXLNLEN);
2099 
2100         rv = lookup(st); // perhaps without prefix
2101 
2102         // search homonym with compound flag
2103         while ((rv) && !hu_mov_rule &&
2104             ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2105                 !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2106                 (compoundbegin && !wordnum && !onlycpdrule &&
2107                         TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2108                 (compoundmiddle && wordnum && !words && !onlycpdrule &&
2109                     TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
2110                   (numdefcpd && onlycpdrule &&
2111                     ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
2112                     (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
2113                   ))) {
2114             rv = rv->next_homonym;
2115         }
2116 
2117         if (rv) affixed = 0;
2118 
2119         if (rv)  {
2120             sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st);
2121             if (!HENTRY_FIND(rv, MORPH_STEM)) {
2122                 sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st);
2123             }
2124             // store the pointer of the hash entry
2125 //            sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv);
2126             if (HENTRY_DATA(rv)) {
2127                 sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv));
2128             }
2129         }
2130 
2131         if (!rv) {
2132             if (onlycpdrule && strlen(*result) > MAXLNLEN/10) break;
2133             if (compoundflag &&
2134              !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
2135                 if (((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
2136                         FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2137                         (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundflag)))) && !hu_mov_rule &&
2138                     sfx->getCont() &&
2139                         ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,
2140                             sfx->getContLen())) || (compoundend &&
2141                         TESTAFF(sfx->getCont(), compoundend,
2142                             sfx->getContLen())))) {
2143                         rv = NULL;
2144                 }
2145             }
2146 
2147             if (rv ||
2148               (((wordnum == 0) && compoundbegin &&
2149                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2150                 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundbegin))) ||  // twofold suffix+compound
2151                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
2152               ((wordnum > 0) && compoundmiddle &&
2153                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2154                 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundmiddle))) ||  // twofold suffix+compound
2155                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
2156               ) {
2157                 // char * p = prefix_check_morph(st, i, 0, compound);
2158                 char * p = NULL;
2159                 if (compoundflag) p = affix_check_morph(st, i, compoundflag);
2160                 if (!p || (*p == '\0')) {
2161                    if (p) free(p);
2162                    p = NULL;
2163                    if ((wordnum == 0) && compoundbegin) {
2164                      p = affix_check_morph(st, i, compoundbegin);
2165                    } else if ((wordnum > 0) && compoundmiddle) {
2166                      p = affix_check_morph(st, i, compoundmiddle);
2167                    }
2168                 }
2169                 if (p && (*p != '\0')) {
2170                     sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD,
2171                         MORPH_PART, st, line_uniq_app(&p, MSEP_REC));
2172                 }
2173                 if (p) free(p);
2174                 checked_prefix = 1;
2175             }
2176         // else check forbiddenwords
2177         } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2178             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
2179             TESTAFF(rv->astr, needaffix, rv->alen))) {
2180                 st[i] = ch;
2181                 continue;
2182         }
2183 
2184             // check non_compound flag in suffix and prefix
2185             if ((rv) && !hu_mov_rule &&
2186                 ((pfx && pfx->getCont() &&
2187                     TESTAFF(pfx->getCont(), compoundforbidflag,
2188                         pfx->getContLen())) ||
2189                 (sfx && sfx->getCont() &&
2190                     TESTAFF(sfx->getCont(), compoundforbidflag,
2191                         sfx->getContLen())))) {
2192                     continue;
2193             }
2194 
2195             // check compoundend flag in suffix and prefix
2196             if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
2197                 ((pfx && pfx->getCont() &&
2198                     TESTAFF(pfx->getCont(), compoundend,
2199                         pfx->getContLen())) ||
2200                 (sfx && sfx->getCont() &&
2201                     TESTAFF(sfx->getCont(), compoundend,
2202                         sfx->getContLen())))) {
2203                     continue;
2204             }
2205 
2206             // check compoundmiddle flag in suffix and prefix
2207             if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
2208                 ((pfx && pfx->getCont() &&
2209                     TESTAFF(pfx->getCont(), compoundmiddle,
2210                         pfx->getContLen())) ||
2211                 (sfx && sfx->getCont() &&
2212                     TESTAFF(sfx->getCont(), compoundmiddle,
2213                         sfx->getContLen())))) {
2214                     rv = NULL;
2215             }
2216 
2217         // check forbiddenwords
2218         if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen)
2219             || TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) continue;
2220 
2221         // increment word number, if the second root has a compoundroot flag
2222         if ((rv) && (compoundroot) &&
2223             (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2224                 wordnum++;
2225         }
2226 
2227         // first word is acceptable in compound words?
2228         if (((rv) &&
2229           ( checked_prefix || (words && words[wnum]) ||
2230             (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2231             ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2232             ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))
2233 // LANG_hu section: spec. Hungarian rule
2234             || ((langnum == LANG_hu) && // hu_mov_rule
2235                 hu_mov_rule && (
2236                     TESTAFF(rv->astr, 'F', rv->alen) ||
2237                     TESTAFF(rv->astr, 'G', rv->alen) ||
2238                     TESTAFF(rv->astr, 'H', rv->alen)
2239                 )
2240               )
2241 // END of LANG_hu section
2242           )
2243           && ! (( checkcompoundtriple && !words && // test triple letters
2244                    (word[i-1]==word[i]) && (
2245                       ((i>1) && (word[i-1]==word[i-2])) ||
2246                       ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
2247                    )
2248                ) ||
2249                (
2250                    // test CHECKCOMPOUNDPATTERN
2251                    numcheckcpd && !words && cpdpat_check(word, i, rv, NULL, affixed)
2252                ) ||
2253                (
2254                  checkcompoundcase && !words && cpdcase_check(word, i)
2255                ))
2256          )
2257 // LANG_hu section: spec. Hungarian rule
2258          || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
2259               (sfx && sfx->getCont() && (
2260                         TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) ||
2261                         TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen())
2262                     )
2263                )
2264              )
2265 // END of LANG_hu section
2266          ) {
2267 
2268 // LANG_hu section: spec. Hungarian rule
2269             if (langnum == LANG_hu) {
2270                 // calculate syllable number of the word
2271                 numsyllable += get_syllable(st, i);
2272 
2273                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2274                 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
2275             }
2276 // END of LANG_hu section
2277 
2278             // NEXT WORD(S)
2279             rv_first = rv;
2280             rv = lookup((word+i)); // perhaps without prefix
2281 
2282         // search homonym with compound flag
2283         while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2284                         !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2285                           (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
2286                            (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
2287             rv = rv->next_homonym;
2288         }
2289 
2290             if (rv && words && words[wnum + 1]) {
2291                   mystrcat(*result, presult, MAXLNLEN);
2292                   mystrcat(*result, " ", MAXLNLEN);
2293                   mystrcat(*result, MORPH_PART, MAXLNLEN);
2294                   mystrcat(*result, word+i, MAXLNLEN);
2295                   if (complexprefixes && HENTRY_DATA(rv)) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2296                   if (!HENTRY_FIND(rv, MORPH_STEM)) {
2297                     mystrcat(*result, " ", MAXLNLEN);
2298                     mystrcat(*result, MORPH_STEM, MAXLNLEN);
2299                     mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN);
2300                   }
2301                   // store the pointer of the hash entry
2302 //                  sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2303                   if (!complexprefixes && HENTRY_DATA(rv)) {
2304                     mystrcat(*result, " ", MAXLNLEN);
2305                     mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2306                   }
2307                   mystrcat(*result, "\n", MAXLNLEN);
2308                   ok = 1;
2309                   return 0;
2310             }
2311 
2312             oldnumsyllable2 = numsyllable;
2313             oldwordnum2 = wordnum;
2314 
2315 // LANG_hu section: spec. Hungarian rule
2316             if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
2317                 numsyllable--;
2318             }
2319 // END of LANG_hu section
2320             // increment word number, if the second root has a compoundroot flag
2321             if ((rv) && (compoundroot) &&
2322                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2323                     wordnum++;
2324             }
2325 
2326             // check forbiddenwords
2327             if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2328                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
2329                 st[i] = ch;
2330                 continue;
2331             }
2332 
2333             // second word is acceptable, as a root?
2334             // hungarian conventions: compounding is acceptable,
2335             // when compound forms consist of 2 words, or if more,
2336             // then the syllable number of root words must be 6, or lesser.
2337             if ((rv) && (
2338                       (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2339                       (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
2340                     )
2341                 && (
2342                       ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
2343                       ((cpdmaxsyllable!=0) &&
2344                           (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable))
2345                     )
2346                 && (
2347                      (!checkcompounddup || (rv != rv_first))
2348                    )
2349                 )
2350                  {
2351                       // bad compound word
2352                       mystrcat(*result, presult, MAXLNLEN);
2353                       mystrcat(*result, " ", MAXLNLEN);
2354                       mystrcat(*result, MORPH_PART, MAXLNLEN);
2355                       mystrcat(*result, word+i, MAXLNLEN);
2356 
2357                       if (HENTRY_DATA(rv)) {
2358                         if (complexprefixes) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2359                         if (! HENTRY_FIND(rv, MORPH_STEM)) {
2360                            mystrcat(*result, " ", MAXLNLEN);
2361                            mystrcat(*result, MORPH_STEM, MAXLNLEN);
2362                            mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN);
2363                         }
2364                         // store the pointer of the hash entry
2365 //                        sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2366                         if (!complexprefixes) {
2367                             mystrcat(*result, " ", MAXLNLEN);
2368                             mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2369                         }
2370                       }
2371                       mystrcat(*result, "\n", MAXLNLEN);
2372                               ok = 1;
2373             }
2374 
2375             numsyllable = oldnumsyllable2 ;
2376             wordnum = oldwordnum2;
2377 
2378             // perhaps second word has prefix or/and suffix
2379             sfx = NULL;
2380             sfxflag = FLAG_NULL;
2381 
2382             if (compoundflag && !onlycpdrule) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL;
2383 
2384             if (!rv && compoundend && !onlycpdrule) {
2385                 sfx = NULL;
2386                 pfx = NULL;
2387                 rv = affix_check((word+i),strlen(word+i), compoundend);
2388             }
2389 
2390             if (!rv && numdefcpd && words) {
2391                 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
2392                 if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
2393                       char * m = NULL;
2394                       if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
2395                       if ((!m || *m == '\0') && compoundend) {
2396                             if (m) free(m);
2397                             m = affix_check_morph((word+i),strlen(word+i), compoundend);
2398                       }
2399                       mystrcat(*result, presult, MAXLNLEN);
2400                       if (m || (*m != '\0')) {
2401                         sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
2402                             MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
2403                       }
2404                       if (m) free(m);
2405                       mystrcat(*result, "\n", MAXLNLEN);
2406                       ok = 1;
2407                 }
2408             }
2409 
2410             // check non_compound flag in suffix and prefix
2411             if ((rv) &&
2412                 ((pfx && pfx->getCont() &&
2413                     TESTAFF(pfx->getCont(), compoundforbidflag,
2414                         pfx->getContLen())) ||
2415                 (sfx && sfx->getCont() &&
2416                     TESTAFF(sfx->getCont(), compoundforbidflag,
2417                         sfx->getContLen())))) {
2418                     rv = NULL;
2419             }
2420 
2421             // check forbiddenwords
2422             if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen) ||
2423                     TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))
2424                     && (! TESTAFF(rv->astr, needaffix, rv->alen))) {
2425                         st[i] = ch;
2426                         continue;
2427                     }
2428 
2429             if (langnum == LANG_hu) {
2430                 // calculate syllable number of the word
2431                 numsyllable += get_syllable(word + i, strlen(word + i));
2432 
2433                 // - affix syllable num.
2434                 // XXX only second suffix (inflections, not derivations)
2435                 if (sfxappnd) {
2436                     char * tmp = myrevstrdup(sfxappnd);
2437                     numsyllable -= get_syllable(tmp, strlen(tmp));
2438                     free(tmp);
2439                 }
2440 
2441                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2442                 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
2443 
2444                 // increment syllable num, if last word has a SYLLABLENUM flag
2445                 // and the suffix is beginning `s'
2446 
2447                 if (cpdsyllablenum) {
2448                     switch (sfxflag) {
2449                         case 'c': { numsyllable+=2; break; }
2450                         case 'J': { numsyllable += 1; break; }
2451                         case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
2452                     }
2453                 }
2454             }
2455 
2456             // increment word number, if the second word has a compoundroot flag
2457             if ((rv) && (compoundroot) &&
2458                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2459                     wordnum++;
2460             }
2461             // second word is acceptable, as a word with prefix or/and suffix?
2462             // hungarian conventions: compounding is acceptable,
2463             // when compound forms consist 2 word, otherwise
2464             // the syllable number of root words is 6, or lesser.
2465             if ((rv) &&
2466                     (
2467                       ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
2468                       ((cpdmaxsyllable!=0) &&
2469                           (numsyllable <= cpdmaxsyllable))
2470                     )
2471                 && (
2472                    (!checkcompounddup || (rv != rv_first))
2473                    )) {
2474                       char * m = NULL;
2475                       if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
2476                       if ((!m || *m == '\0') && compoundend) {
2477                             if (m) free(m);
2478                             m = affix_check_morph((word+i),strlen(word+i), compoundend);
2479                       }
2480                       mystrcat(*result, presult, MAXLNLEN);
2481                       if (m && (*m != '\0')) {
2482                         sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
2483                             MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
2484                       }
2485                       if (m) free(m);
2486                       sprintf(*result + strlen(*result), "%c", MSEP_REC);
2487                       ok = 1;
2488             }
2489 
2490             numsyllable = oldnumsyllable2;
2491             wordnum = oldwordnum2;
2492 
2493             // perhaps second word is a compound word (recursive call)
2494             if ((wordnum < maxwordnum) && (ok == 0)) {
2495                         compound_check_morph((word+i),strlen(word+i), wordnum+1,
2496                              numsyllable, maxwordnum, wnum + 1, words, 0, result, presult);
2497             } else {
2498                 rv=NULL;
2499             }
2500         }
2501         st[i] = ch;
2502         wordnum = oldwordnum;
2503         numsyllable = oldnumsyllable;
2504 
2505         } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop
2506 
2507     }
2508     return 0;
2509 }
2510 
2511  // return 1 if s1 (reversed) is a leading subset of end of s2
2512 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2513  {
2514     while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {
2515         s1++;
2516         end_of_s2--;
2517         len--;
2518     }
2519     return (*s1 == '\0');
2520  }
2521  */
2522 
isRevSubset(const char * s1,const char * end_of_s2,int len)2523 inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2524  {
2525     while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
2526         s1++;
2527         end_of_s2--;
2528         len--;
2529     }
2530     return (*s1 == '\0');
2531  }
2532 
// check word for suffixes
//
// Walks the suffix entries that can match the end of `word` and returns the
// first dictionary entry for which SfxEntry::checkword() succeeds, or NULL
// when no suffix entry yields a hit.
//
//   word, len    - word to analyse and its length
//   sfxopts, ppfx, wlst, maxSug, ns, needflag
//                - forwarded unchanged to SfxEntry::checkword(); ppfx is also
//                  consulted here for the circumfix/needaffix tests
//   cclass       - when non-zero, zero-length suffix entries without
//                  continuation flags are skipped and the needaffix test
//                  is bypassed
//   in_compound  - position of the word inside a compound (compared against
//                  IN_CPD_BEGIN / IN_CPD_END below)
//
// Side effects: on success the matching entry is stored in the member `sfx`;
// the general (non-empty suffix) path additionally updates `sfxflag` and,
// for entries without continuation flags, `sfxappnd`.

struct hentry * AffixMgr::suffix_check (const char * word, int len,
       int sfxopts, PfxEntry * ppfx, char ** wlst, int maxSug, int * ns,
       const FLAG cclass, const FLAG needflag, char in_compound)
{
    struct hentry * rv = NULL;
    // alias of ppfx; only dereferenced after a ppfx NULL test in the conditions below
    PfxEntry* ep = ppfx;

    // first handle the special case of 0 length suffixes
    SfxEntry * se = sStart[0];

    while (se) {
        // with a continuation class requested, only entries that carry
        // continuation flags are candidates
        if (!cclass || se->getCont()) {
            // suffixes are not allowed in beginning of compounds
            if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
             // except when signed with compoundpermitflag flag
             (se->getCont() && compoundpermitflag &&
                TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
              // no circumfix flag in prefix and suffix
              ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
                   circumfix, ep->getContLen())) &&
               (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
              // circumfix flag in prefix AND suffix
              ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
                   circumfix, ep->getContLen())) &&
               (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen())))))  &&
            // fogemorpheme: a suffix flagged onlyincompound is accepted only inside compounds
              (in_compound ||
                 !(se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
            // needaffix on prefix or first suffix
              (cclass ||
                   !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
                   (ppfx && !((ep->getCont()) &&
                     TESTAFF(ep->getCont(), needaffix,
                       ep->getContLen())))
              )) {
                // outside compounds, onlyincompound is passed on as the last checkword() argument
                rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass,
                    needflag, (in_compound ? 0 : onlyincompound));
                if (rv) {
                    // NOTE: unlike the general case below, sfxflag and sfxappnd are left untouched here
                    sfx=se; // BUG: sfx not stateless
                    return rv;
                }
            }
        }
       se = se->getNext();
    }

    // now handle the general case
    if (len == 0) return NULL; // FULLSTRIP
    // the suffix table is indexed by the last byte of the word
    unsigned char sp= *((const unsigned char *)(word + len - 1));
    SfxEntry * sptr = sStart[sp];

    while (sptr) {
        // the entry key is compared against the word read backwards from its last character
        if (isRevSubset(sptr->getKey(), word + len - 1, len)
        ) {
            // suffixes are not allowed in beginning of compounds
            if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
             // except when signed with compoundpermitflag flag
             (sptr->getCont() && compoundpermitflag &&
                TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
              // no circumfix flag in prefix and suffix
              ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
                   circumfix, ep->getContLen())) &&
               (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
              // circumfix flag in prefix AND suffix
              ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
                   circumfix, ep->getContLen())) &&
               (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))))  &&
            // fogemorpheme: a suffix flagged onlyincompound is accepted only inside compounds
              (in_compound ||
                 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
            // needaffix on prefix or first suffix
              (cclass ||
                  !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
                  (ppfx && !((ep->getCont()) &&
                     TESTAFF(ep->getCont(), needaffix,
                       ep->getContLen())))
              )
            // extra filter (general case only): at the end of a compound and without a
            // prefix, entries flagged onlyincompound are rejected
            ) if (in_compound != IN_CPD_END || ppfx || !(sptr->getCont() && TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
                rv = sptr->checkword(word,len, sfxopts, ppfx, wlst,
                    maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound));
                if (rv) {
                    // remember the match in member state for the caller
                    sfx=sptr; // BUG: sfx not stateless
                    sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
                    if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
                    return rv;
                }
             }
             // key matched: continue along the "equal" link
             sptr = sptr->getNextEQ();
        } else {
             // key did not match: follow the "not equal" link
             sptr = sptr->getNextNE();
        }
    }

    return NULL;
}
2630 
2631 // check word for two-level suffixes
2632 
suffix_check_twosfx(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2633 struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len,
2634        int sfxopts, PfxEntry * ppfx, const FLAG needflag)
2635 {
2636     struct hentry * rv = NULL;
2637 
2638     // first handle the special case of 0 length suffixes
2639     SfxEntry * se = sStart[0];
2640     while (se) {
2641         if (contclasses[se->getFlag()])
2642         {
2643             rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag);
2644             if (rv) return rv;
2645         }
2646         se = se->getNext();
2647     }
2648 
2649     // now handle the general case
2650     if (len == 0) return NULL; // FULLSTRIP
2651     unsigned char sp = *((const unsigned char *)(word + len - 1));
2652     SfxEntry * sptr = sStart[sp];
2653 
2654     while (sptr) {
2655         if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2656             if (contclasses[sptr->getFlag()])
2657             {
2658                 rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag);
2659                 if (rv) {
2660                     sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2661                     if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2662                     return rv;
2663                 }
2664             }
2665             sptr = sptr->getNextEQ();
2666         } else {
2667              sptr = sptr->getNextNE();
2668         }
2669     }
2670 
2671     return NULL;
2672 }
2673 
suffix_check_twosfx_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2674 char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len,
2675        int sfxopts, PfxEntry * ppfx, const FLAG needflag)
2676 {
2677     char result[MAXLNLEN];
2678     char result2[MAXLNLEN];
2679     char result3[MAXLNLEN];
2680 
2681     char * st;
2682 
2683     result[0] = '\0';
2684     result2[0] = '\0';
2685     result3[0] = '\0';
2686 
2687     // first handle the special case of 0 length suffixes
2688     SfxEntry * se = sStart[0];
2689     while (se) {
2690         if (contclasses[se->getFlag()])
2691         {
2692             st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
2693             if (st) {
2694                 if (ppfx) {
2695                     if (ppfx->getMorph()) {
2696                         mystrcat(result, ppfx->getMorph(), MAXLNLEN);
2697                         mystrcat(result, " ", MAXLNLEN);
2698                     } else debugflag(result, ppfx->getFlag());
2699                 }
2700                 mystrcat(result, st, MAXLNLEN);
2701                 free(st);
2702                 if (se->getMorph()) {
2703                     mystrcat(result, " ", MAXLNLEN);
2704                     mystrcat(result, se->getMorph(), MAXLNLEN);
2705                 } else debugflag(result, se->getFlag());
2706                 mystrcat(result, "\n", MAXLNLEN);
2707             }
2708         }
2709         se = se->getNext();
2710     }
2711 
2712     // now handle the general case
2713     if (len == 0) return NULL; // FULLSTRIP
2714     unsigned char sp = *((const unsigned char *)(word + len - 1));
2715     SfxEntry * sptr = sStart[sp];
2716 
2717     while (sptr) {
2718         if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2719             if (contclasses[sptr->getFlag()])
2720             {
2721                 st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
2722                 if (st) {
2723                     sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2724                     if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2725                     strcpy(result2, st);
2726                     free(st);
2727 
2728                 result3[0] = '\0';
2729 
2730                 if (sptr->getMorph()) {
2731                     mystrcat(result3, " ", MAXLNLEN);
2732                     mystrcat(result3, sptr->getMorph(), MAXLNLEN);
2733                 } else debugflag(result3, sptr->getFlag());
2734                 strlinecat(result2, result3);
2735                 mystrcat(result2, "\n", MAXLNLEN);
2736                 mystrcat(result,  result2, MAXLNLEN);
2737                 }
2738             }
2739             sptr = sptr->getNextEQ();
2740         } else {
2741              sptr = sptr->getNextNE();
2742         }
2743     }
2744     if (*result) return mystrdup(result);
2745     return NULL;
2746 }
2747 
// Morphological analysis of a word by suffix stripping.
//
// Mirrors suffix_check(), but instead of stopping at the first hit it
// collects one '\n'-terminated description line for every dictionary entry
// (including homonyms) reachable through an admissible suffix entry.
//
//   word, len    - word to analyse and its length
//   sfxopts, ppfx, cclass, needflag
//                - forwarded to SfxEntry::checkword()/get_next_homonym();
//                  ppfx also contributes its morphological data (or debug
//                  flag) to every output line
//   in_compound  - position of the word inside a compound
//
// Returns a mystrdup() copy of the collected lines, or NULL when nothing was
// collected. Callers in this file release the result with free().
char * AffixMgr::suffix_check_morph(const char * word, int len,
       int sfxopts, PfxEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound)
{
    char result[MAXLNLEN];

    // rv is NULL at the top of every loop iteration: the inner while loops
    // below only terminate once it has become NULL again
    struct hentry * rv = NULL;

    result[0] = '\0';

    // alias of ppfx; only dereferenced after a ppfx NULL test in the conditions below
    PfxEntry* ep = ppfx;

    // first handle the special case of 0 length suffixes
    SfxEntry * se = sStart[0];
    while (se) {
        if (!cclass || se->getCont()) {
            // suffixes are not allowed in beginning of compounds
            if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
             // except when signed with compoundpermitflag flag
             (se->getCont() && compoundpermitflag &&
                TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
              // no circumfix flag in prefix and suffix
              ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
                   circumfix, ep->getContLen())) &&
               (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
              // circumfix flag in prefix AND suffix
              ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
                   circumfix, ep->getContLen())) &&
               (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen())))))  &&
            // fogemorpheme: a suffix flagged onlyincompound is accepted only inside compounds
              (in_compound ||
                 !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
            // needaffix on prefix or first suffix
              (cclass ||
                   !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
                   (ppfx && !((ep->getCont()) &&
                     TESTAFF(ep->getCont(), needaffix,
                       ep->getContLen())))
              )
            ))
            // only this single statement is governed by the condition above
            rv = se->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
         // emit one line per homonym; skipped entirely when rv is still NULL
         while (rv) {
           // prefix data (or its debug flag) comes first
           if (ppfx) {
                if (ppfx->getMorph()) {
                    mystrcat(result, ppfx->getMorph(), MAXLNLEN);
                    mystrcat(result, " ", MAXLNLEN);
                } else debugflag(result, ppfx->getFlag());
            }
            // with complexprefixes the entry data precedes the stem field
            if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
            // add a stem field built from the entry's word unless the entry already provides one
            if (! HENTRY_FIND(rv, MORPH_STEM)) {
                mystrcat(result, " ", MAXLNLEN);
                mystrcat(result, MORPH_STEM, MAXLNLEN);
                mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
            }
            // store the pointer of the hash entry
//            sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);

            // without complexprefixes the entry data follows the stem field
            if (!complexprefixes && HENTRY_DATA(rv)) {
                    mystrcat(result, " ", MAXLNLEN);
                    mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
            }
            // suffix data (or its debug flag) comes last
            if (se->getMorph()) {
                mystrcat(result, " ", MAXLNLEN);
                mystrcat(result, se->getMorph(), MAXLNLEN);
            } else debugflag(result, se->getFlag());
            mystrcat(result, "\n", MAXLNLEN);
            rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
         }
       }
       se = se->getNext();
    }

    // now handle the general case
    // NOTE(review): for len == 0 this returns NULL even if the loop above
    // appended lines to result - confirm that discarding them is intended
    if (len == 0) return NULL; // FULLSTRIP
    // the suffix table is indexed by the last byte of the word
    unsigned char sp = *((const unsigned char *)(word + len - 1));
    SfxEntry * sptr = sStart[sp];

    while (sptr) {
        // the entry key is compared against the word read backwards from its last character
        if (isRevSubset(sptr->getKey(), word + len - 1, len)
        ) {
            // suffixes are not allowed in beginning of compounds
            if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
             // except when signed with compoundpermitflag flag
             (sptr->getCont() && compoundpermitflag &&
                TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
              // no circumfix flag in prefix and suffix
              ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
                   circumfix, ep->getContLen())) &&
               (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
              // circumfix flag in prefix AND suffix
              ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
                   circumfix, ep->getContLen())) &&
               (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))))  &&
            // fogemorpheme: a suffix flagged onlyincompound is accepted only inside compounds
              (in_compound ||
                 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
            // needaffix on first suffix
            // NOTE(review): unlike the zero-length loop above, the prefix is
            // not consulted in this needaffix test - confirm this is intended
              (cclass || !(sptr->getCont() &&
                   TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))
            )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
            // emit one line per homonym; skipped entirely when rv is still NULL
            while (rv) {
                    // prefix data (or its debug flag) comes first
                    if (ppfx) {
                        if (ppfx->getMorph()) {
                            mystrcat(result, ppfx->getMorph(), MAXLNLEN);
                            mystrcat(result, " ", MAXLNLEN);
                        } else debugflag(result, ppfx->getFlag());
                    }
                    // with complexprefixes the entry data precedes the stem field
                    if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
                    // add a stem field built from the entry's word unless the entry already provides one
                    if (! HENTRY_FIND(rv, MORPH_STEM)) {
                            mystrcat(result, " ", MAXLNLEN);
                            mystrcat(result, MORPH_STEM, MAXLNLEN);
                            mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
                    }
                    // store the pointer of the hash entry
//                    sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);

                    // without complexprefixes the entry data follows the stem field
                    if (!complexprefixes && HENTRY_DATA(rv)) {
                        mystrcat(result, " ", MAXLNLEN);
                        mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
                    }

                // suffix data (or its debug flag) comes last
                if (sptr->getMorph()) {
                    mystrcat(result, " ", MAXLNLEN);
                    mystrcat(result, sptr->getMorph(), MAXLNLEN);
                } else debugflag(result, sptr->getFlag());
                mystrcat(result, "\n", MAXLNLEN);
                rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
            }
             // key matched: continue along the "equal" link
             sptr = sptr->getNextEQ();
        } else {
             // key did not match: follow the "not equal" link
             sptr = sptr->getNextNE();
        }
    }

    if (*result) return mystrdup(result);
    return NULL;
}
2884 
2885 // check if word with affixes is correctly spelled
affix_check(const char * word,int len,const FLAG needflag,char in_compound)2886 struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound)
2887 {
2888     struct hentry * rv= NULL;
2889 
2890     // check all prefixes (also crossed with suffixes if allowed)
2891     rv = prefix_check(word, len, in_compound, needflag);
2892     if (rv) return rv;
2893 
2894     // if still not found check all suffixes
2895     rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound);
2896 
2897     if (havecontclass) {
2898         sfx = NULL;
2899         pfx = NULL;
2900 
2901         if (rv) return rv;
2902         // if still not found check all two-level suffixes
2903         rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
2904 
2905         if (rv) return rv;
2906         // if still not found check all two-level suffixes
2907         rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
2908     }
2909 
2910     return rv;
2911 }
2912 
2913 // check if word with affixes is correctly spelled
affix_check_morph(const char * word,int len,const FLAG needflag,char in_compound)2914 char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound)
2915 {
2916     char result[MAXLNLEN];
2917     char * st = NULL;
2918 
2919     *result = '\0';
2920 
2921     // check all prefixes (also crossed with suffixes if allowed)
2922     st = prefix_check_morph(word, len, in_compound);
2923     if (st) {
2924         mystrcat(result, st, MAXLNLEN);
2925         free(st);
2926     }
2927 
2928     // if still not found check all suffixes
2929     st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
2930     if (st) {
2931         mystrcat(result, st, MAXLNLEN);
2932         free(st);
2933     }
2934 
2935     if (havecontclass) {
2936         sfx = NULL;
2937         pfx = NULL;
2938         // if still not found check all two-level suffixes
2939         st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
2940         if (st) {
2941             mystrcat(result, st, MAXLNLEN);
2942             free(st);
2943         }
2944 
2945         // if still not found check all two-level suffixes
2946         st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
2947         if (st) {
2948             mystrcat(result, st, MAXLNLEN);
2949             free(st);
2950         }
2951     }
2952 
2953     return mystrdup(result);
2954 }
2955 
morphgen(char * ts,int wl,const unsigned short * ap,unsigned short al,char * morph,char * targetmorph,int level)2956 char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap,
2957     unsigned short al, char * morph, char * targetmorph, int level)
2958 {
2959     // handle suffixes
2960     char * stemmorph;
2961     char * stemmorphcatpos;
2962     char mymorph[MAXLNLEN];
2963 
2964     if (!morph) return NULL;
2965 
2966     // check substandard flag
2967     if (TESTAFF(ap, substandard, al)) return NULL;
2968 
2969     if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts);
2970 
2971 //    int targetcount = get_sfxcount(targetmorph);
2972 
2973     // use input suffix fields, if exist
2974     if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
2975         stemmorph = mymorph;
2976         strcpy(stemmorph, morph);
2977         mystrcat(stemmorph, " ", MAXLNLEN);
2978         stemmorphcatpos = stemmorph + strlen(stemmorph);
2979     } else {
2980         stemmorph = morph;
2981         stemmorphcatpos = NULL;
2982     }
2983 
2984     for (int i = 0; i < al; i++) {
2985         const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
2986         SfxEntry * sptr = sFlag[c];
2987         while (sptr) {
2988             if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) ||
2989                 // don't generate forms with substandard affixes
2990                 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
2991 
2992                 if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph());
2993                 else stemmorph = (char *) sptr->getMorph();
2994 
2995                 int cmp = morphcmp(stemmorph, targetmorph);
2996 
2997                 if (cmp == 0) {
2998                     char * newword = sptr->add(ts, wl);
2999                     if (newword) {
3000                         hentry * check = pHMgr->lookup(newword); // XXX extra dic
3001                         if (!check || !check->astr ||
3002                             !(TESTAFF(check->astr, forbiddenword, check->alen) ||
3003                               TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {
3004                                 return newword;
3005                         }
3006                         free(newword);
3007                     }
3008                 }
3009 
3010                 // recursive call for secondary suffixes
3011                 if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
3012 //                    (get_sfxcount(stemmorph) < targetcount) &&
3013                     !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
3014                     char * newword = sptr->add(ts, wl);
3015                     if (newword) {
3016                         char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(),
3017                             sptr->getContLen(), stemmorph, targetmorph, 1);
3018 
3019                         if (newword2) {
3020                             free(newword);
3021                             return newword2;
3022                         }
3023                         free(newword);
3024                         newword = NULL;
3025                     }
3026                 }
3027             }
3028             sptr = sptr->getFlgNxt();
3029         }
3030     }
3031    return NULL;
3032 }
3033 
3034 
expand_rootword(struct guessword * wlst,int maxn,const char * ts,int wl,const unsigned short * ap,unsigned short al,char * bad,int badl,char * phon)3035 int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts,
3036     int wl, const unsigned short * ap, unsigned short al, char * bad, int badl,
3037     char * phon)
3038 {
3039     int nh=0;
3040     // first add root word to list
3041     if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
3042          (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
3043        wlst[nh].word = mystrdup(ts);
3044        if (!wlst[nh].word) return 0;
3045        wlst[nh].allow = (1 == 0);
3046        wlst[nh].orig = NULL;
3047        nh++;
3048        // add special phonetic version
3049        if (phon && (nh < maxn)) {
3050     	    wlst[nh].word = mystrdup(phon);
3051             if (!wlst[nh].word) return nh - 1;
3052     	    wlst[nh].allow = (1 == 0);
3053     	    wlst[nh].orig = mystrdup(ts);
3054             if (!wlst[nh].orig) return nh - 1;
3055     	    nh++;
3056        }
3057     }
3058 
3059     // handle suffixes
3060     for (int i = 0; i < al; i++) {
3061        const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
3062        SfxEntry * sptr = sFlag[c];
3063        while (sptr) {
3064          if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) &&
3065                 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
3066                 // check needaffix flag
3067                 !(sptr->getCont() && ((needaffix &&
3068                       TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
3069                   (circumfix &&
3070                       TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
3071                   (onlyincompound &&
3072                       TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))
3073                 ) {
3074             char * newword = sptr->add(ts, wl);
3075             if (newword) {
3076                 if (nh < maxn) {
3077                     wlst[nh].word = newword;
3078                     wlst[nh].allow = sptr->allowCross();
3079                     wlst[nh].orig = NULL;
3080                     nh++;
3081                     // add special phonetic version
3082     		    if (phon && (nh < maxn)) {
3083     			char st[MAXWORDUTF8LEN];
3084     			strcpy(st, phon);
3085     			strcat(st, sptr->getKey());
3086     			reverseword(st + strlen(phon));
3087     			wlst[nh].word = mystrdup(st);
3088     			if (!wlst[nh].word) return nh - 1;
3089     			wlst[nh].allow = (1 == 0);
3090     			wlst[nh].orig = mystrdup(newword);
3091                         if (!wlst[nh].orig) return nh - 1;
3092     			nh++;
3093     		    }
3094                 } else {
3095                     free(newword);
3096                 }
3097             }
3098          }
3099          sptr = sptr->getFlgNxt();
3100        }
3101     }
3102 
3103     int n = nh;
3104 
3105     // handle cross products of prefixes and suffixes
3106     for (int j=1;j<n ;j++)
3107        if (wlst[j].allow) {
3108           for (int k = 0; k < al; k++) {
3109              const unsigned char c = (unsigned char) (ap[k] & 0x00FF);
3110              PfxEntry * cptr = pFlag[c];
3111              while (cptr) {
3112                 if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) &&
3113                         (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
3114                     int l1 = strlen(wlst[j].word);
3115                     char * newword = cptr->add(wlst[j].word, l1);
3116                     if (newword) {
3117                        if (nh < maxn) {
3118                           wlst[nh].word = newword;
3119                           wlst[nh].allow = cptr->allowCross();
3120                           wlst[nh].orig = NULL;
3121                           nh++;
3122                        } else {
3123                           free(newword);
3124                        }
3125                     }
3126                 }
3127                 cptr = cptr->getFlgNxt();
3128              }
3129           }
3130        }
3131 
3132 
3133     // now handle pure prefixes
3134     for (int m = 0; m < al; m ++) {
3135        const unsigned char c = (unsigned char) (ap[m] & 0x00FF);
3136        PfxEntry * ptr = pFlag[c];
3137        while (ptr) {
3138          if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) &&
3139                 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
3140                 // check needaffix flag
3141                 !(ptr->getCont() && ((needaffix &&
3142                       TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
3143                      (circumfix &&
3144                       TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
3145                   (onlyincompound &&
3146                       TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))
3147                 ) {
3148             char * newword = ptr->add(ts, wl);
3149             if (newword) {
3150                 if (nh < maxn) {
3151                     wlst[nh].word = newword;
3152                     wlst[nh].allow = ptr->allowCross();
3153                     wlst[nh].orig = NULL;
3154                     nh++;
3155                 } else {
3156                     free(newword);
3157                 }
3158             }
3159          }
3160          ptr = ptr->getFlgNxt();
3161        }
3162     }
3163 
3164     return nh;
3165 }
3166 
3167 // return length of replacing table
get_numrep() const3168 int AffixMgr::get_numrep() const
3169 {
3170   return numrep;
3171 }
3172 
3173 // return replacing table
get_reptable() const3174 struct replentry * AffixMgr::get_reptable() const
3175 {
3176   if (! reptable ) return NULL;
3177   return reptable;
3178 }
3179 
3180 // return iconv table
get_iconvtable() const3181 RepList * AffixMgr::get_iconvtable() const
3182 {
3183   if (! iconvtable ) return NULL;
3184   return iconvtable;
3185 }
3186 
3187 // return oconv table
get_oconvtable() const3188 RepList * AffixMgr::get_oconvtable() const
3189 {
3190   if (! oconvtable ) return NULL;
3191   return oconvtable;
3192 }
3193 
3194 // return replacing table
get_phonetable() const3195 struct phonetable * AffixMgr::get_phonetable() const
3196 {
3197   if (! phone ) return NULL;
3198   return phone;
3199 }
3200 
3201 // return length of character map table
get_nummap() const3202 int AffixMgr::get_nummap() const
3203 {
3204   return nummap;
3205 }
3206 
3207 // return character map table
get_maptable() const3208 struct mapentry * AffixMgr::get_maptable() const
3209 {
3210   if (! maptable ) return NULL;
3211   return maptable;
3212 }
3213 
3214 // return length of word break table
get_numbreak() const3215 int AffixMgr::get_numbreak() const
3216 {
3217   return numbreak;
3218 }
3219 
3220 // return character map table
get_breaktable() const3221 char ** AffixMgr::get_breaktable() const
3222 {
3223   if (! breaktable ) return NULL;
3224   return breaktable;
3225 }
3226 
3227 // return text encoding of dictionary
get_encoding()3228 char * AffixMgr::get_encoding()
3229 {
3230   if (! encoding ) encoding = mystrdup(SPELL_ENCODING);
3231   return mystrdup(encoding);
3232 }
3233 
3234 // return text encoding of dictionary
get_langnum() const3235 int AffixMgr::get_langnum() const
3236 {
3237   return langnum;
3238 }
3239 
3240 // return double prefix option
get_complexprefixes() const3241 int AffixMgr::get_complexprefixes() const
3242 {
3243   return complexprefixes;
3244 }
3245 
3246 // return FULLSTRIP option
get_fullstrip() const3247 int AffixMgr::get_fullstrip() const
3248 {
3249   return fullstrip;
3250 }
3251 
get_keepcase() const3252 FLAG AffixMgr::get_keepcase() const
3253 {
3254   return keepcase;
3255 }
3256 
get_forceucase() const3257 FLAG AffixMgr::get_forceucase() const
3258 {
3259   return forceucase;
3260 }
3261 
get_warn() const3262 FLAG AffixMgr::get_warn() const
3263 {
3264   return warn;
3265 }
3266 
get_forbidwarn() const3267 int AffixMgr::get_forbidwarn() const
3268 {
3269   return forbidwarn;
3270 }
3271 
get_checksharps() const3272 int AffixMgr::get_checksharps() const
3273 {
3274   return checksharps;
3275 }
3276 
encode_flag(unsigned short aflag) const3277 char * AffixMgr::encode_flag(unsigned short aflag) const
3278 {
3279   return pHMgr->encode_flag(aflag);
3280 }
3281 
3282 
3283 // return the preferred ignore string for suggestions
get_ignore() const3284 char * AffixMgr::get_ignore() const
3285 {
3286   if (!ignorechars) return NULL;
3287   return ignorechars;
3288 }
3289 
3290 // return the preferred ignore string for suggestions
get_ignore_utf16(int * len) const3291 unsigned short * AffixMgr::get_ignore_utf16(int * len) const
3292 {
3293   *len = ignorechars_utf16_len;
3294   return ignorechars_utf16;
3295 }
3296 
3297 // return the keyboard string for suggestions
get_key_string()3298 char * AffixMgr::get_key_string()
3299 {
3300   if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING);
3301   return mystrdup(keystring);
3302 }
3303 
3304 // return the preferred try string for suggestions
get_try_string() const3305 char * AffixMgr::get_try_string() const
3306 {
3307   if (! trystring ) return NULL;
3308   return mystrdup(trystring);
3309 }
3310 
3311 // return the preferred try string for suggestions
get_wordchars() const3312 const char * AffixMgr::get_wordchars() const
3313 {
3314   return wordchars;
3315 }
3316 
get_wordchars_utf16(int * len) const3317 unsigned short * AffixMgr::get_wordchars_utf16(int * len) const
3318 {
3319   *len = wordchars_utf16_len;
3320   return wordchars_utf16;
3321 }
3322 
3323 // is there compounding?
get_compound() const3324 int AffixMgr::get_compound() const
3325 {
3326   return compoundflag || compoundbegin || numdefcpd;
3327 }
3328 
3329 // return the compound words control flag
get_compoundflag() const3330 FLAG AffixMgr::get_compoundflag() const
3331 {
3332   return compoundflag;
3333 }
3334 
3335 // return the forbidden words control flag
get_forbiddenword() const3336 FLAG AffixMgr::get_forbiddenword() const
3337 {
3338   return forbiddenword;
3339 }
3340 
3341 // return the forbidden words control flag
get_nosuggest() const3342 FLAG AffixMgr::get_nosuggest() const
3343 {
3344   return nosuggest;
3345 }
3346 
3347 // return the forbidden words control flag
get_nongramsuggest() const3348 FLAG AffixMgr::get_nongramsuggest() const
3349 {
3350   return nongramsuggest;
3351 }
3352 
3353 // return the forbidden words flag modify flag
get_needaffix() const3354 FLAG AffixMgr::get_needaffix() const
3355 {
3356   return needaffix;
3357 }
3358 
3359 // return the onlyincompound flag
get_onlyincompound() const3360 FLAG AffixMgr::get_onlyincompound() const
3361 {
3362   return onlyincompound;
3363 }
3364 
3365 // return the compound word signal flag
get_compoundroot() const3366 FLAG AffixMgr::get_compoundroot() const
3367 {
3368   return compoundroot;
3369 }
3370 
3371 // return the compound begin signal flag
get_compoundbegin() const3372 FLAG AffixMgr::get_compoundbegin() const
3373 {
3374   return compoundbegin;
3375 }
3376 
3377 // return the value of checknum
get_checknum() const3378 int AffixMgr::get_checknum() const
3379 {
3380   return checknum;
3381 }
3382 
3383 // return the value of prefix
get_prefix() const3384 const char * AffixMgr::get_prefix() const
3385 {
3386   if (pfx) return pfx->getKey();
3387   return NULL;
3388 }
3389 
3390 // return the value of suffix
get_suffix() const3391 const char * AffixMgr::get_suffix() const
3392 {
3393   return sfxappnd;
3394 }
3395 
3396 // return the value of suffix
get_version() const3397 const char * AffixMgr::get_version() const
3398 {
3399   return version;
3400 }
3401 
3402 // return lemma_present flag
get_lemma_present() const3403 FLAG AffixMgr::get_lemma_present() const
3404 {
3405   return lemma_present;
3406 }
3407 
3408 // utility method to look up root words in hash table
lookup(const char * word)3409 struct hentry * AffixMgr::lookup(const char * word)
3410 {
3411   int i;
3412   struct hentry * he = NULL;
3413   for (i = 0; i < *maxdic && !he; i++) {
3414     he = (alldic[i])->lookup(word);
3415   }
3416   return he;
3417 }
3418 
3419 // return the value of suffix
have_contclass() const3420 int AffixMgr::have_contclass() const
3421 {
3422   return havecontclass;
3423 }
3424 
3425 // return utf8
get_utf8() const3426 int AffixMgr::get_utf8() const
3427 {
3428   return utf8;
3429 }
3430 
get_maxngramsugs(void) const3431 int AffixMgr::get_maxngramsugs(void) const
3432 {
3433   return maxngramsugs;
3434 }
3435 
get_maxcpdsugs(void) const3436 int AffixMgr::get_maxcpdsugs(void) const
3437 {
3438   return maxcpdsugs;
3439 }
3440 
get_maxdiff(void) const3441 int AffixMgr::get_maxdiff(void) const
3442 {
3443   return maxdiff;
3444 }
3445 
get_onlymaxdiff(void) const3446 int AffixMgr::get_onlymaxdiff(void) const
3447 {
3448   return onlymaxdiff;
3449 }
3450 
3451 // return nosplitsugs
get_nosplitsugs(void) const3452 int AffixMgr::get_nosplitsugs(void) const
3453 {
3454   return nosplitsugs;
3455 }
3456 
3457 // return sugswithdots
get_sugswithdots(void) const3458 int AffixMgr::get_sugswithdots(void) const
3459 {
3460   return sugswithdots;
3461 }
3462 
3463 /* parse flag */
parse_flag(char * line,unsigned short * out,FileMgr * af)3464 int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) {
3465    char * s = NULL;
3466    if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
3467       HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());
3468       return 1;
3469    }
3470    if (parse_string(line, &s, af->getlinenum())) return 1;
3471    *out = pHMgr->decode_flag(s);
3472    free(s);
3473    return 0;
3474 }
3475 
3476 /* parse num */
parse_num(char * line,int * out,FileMgr * af)3477 int AffixMgr::parse_num(char * line, int * out, FileMgr * af) {
3478    char * s = NULL;
3479    if (*out != -1) {
3480       HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());
3481       return 1;
3482    }
3483    if (parse_string(line, &s, af->getlinenum())) return 1;
3484    *out = atoi(s);
3485    free(s);
3486    return 0;
3487 }
3488 
3489 /* parse in the max syllablecount of compound words and  */
parse_cpdsyllable(char * line,FileMgr * af)3490 int  AffixMgr::parse_cpdsyllable(char * line, FileMgr * af)
3491 {
3492    char * tp = line;
3493    char * piece;
3494    int i = 0;
3495    int np = 0;
3496    w_char w[MAXWORDLEN];
3497    piece = mystrsep(&tp, 0);
3498    while (piece) {
3499       if (*piece != '\0') {
3500           switch(i) {
3501              case 0: { np++; break; }
3502              case 1: { cpdmaxsyllable = atoi(piece); np++; break; }
3503              case 2: {
3504                 if (!utf8) {
3505                     cpdvowels = mystrdup(piece);
3506                 } else {
3507                     int n = u8_u16(w, MAXWORDLEN, piece);
3508                     if (n > 0) {
3509                         flag_qsort((unsigned short *) w, 0, n);
3510                         cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char));
3511                         if (!cpdvowels_utf16) return 1;
3512                         memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
3513                     }
3514                     cpdvowels_utf16_len = n;
3515                 }
3516                 np++;
3517                 break;
3518              }
3519              default: break;
3520           }
3521           i++;
3522       }
3523       piece = mystrsep(&tp, 0);
3524    }
3525    if (np < 2) {
3526       HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum());
3527       return 1;
3528    }
3529    if (np == 2) cpdvowels = mystrdup("aeiouAEIOU");
3530    return 0;
3531 }
3532 
3533 /* parse in the typical fault correcting table */
parse_reptable(char * line,FileMgr * af)3534 int  AffixMgr::parse_reptable(char * line, FileMgr * af)
3535 {
3536    if (numrep != 0) {
3537       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3538       return 1;
3539    }
3540    char * tp = line;
3541    char * piece;
3542    int i = 0;
3543    int np = 0;
3544    piece = mystrsep(&tp, 0);
3545    while (piece) {
3546        if (*piece != '\0') {
3547           switch(i) {
3548              case 0: { np++; break; }
3549              case 1: {
3550                        numrep = atoi(piece);
3551                        if (numrep < 1) {
3552                           HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());
3553                           return 1;
3554                        }
3555                        reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
3556                        if (!reptable) return 1;
3557                        np++;
3558                        break;
3559                      }
3560              default: break;
3561           }
3562           i++;
3563        }
3564        piece = mystrsep(&tp, 0);
3565    }
3566    if (np != 2) {
3567       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3568       return 1;
3569    }
3570 
3571    /* now parse the numrep lines to read in the remainder of the table */
3572    char * nl;
3573    for (int j=0; j < numrep; j++) {
3574         if ((nl = af->getline()) == NULL) return 1;
3575         mychomp(nl);
3576         tp = nl;
3577         i = 0;
3578         reptable[j].pattern = NULL;
3579         reptable[j].pattern2 = NULL;
3580         piece = mystrsep(&tp, 0);
3581         while (piece) {
3582            if (*piece != '\0') {
3583                switch(i) {
3584                   case 0: {
3585                              if (strncmp(piece,"REP",3) != 0) {
3586                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3587                                  numrep = 0;
3588                                  return 1;
3589                              }
3590                              break;
3591                           }
3592                   case 1: {
3593                             if (*piece == '^') reptable[j].start = true; else reptable[j].start = false;
3594                             reptable[j].pattern = mystrrep(mystrdup(piece + int(reptable[j].start)),"_"," ");
3595                             int lr = strlen(reptable[j].pattern) - 1;
3596                             if (reptable[j].pattern[lr] == '$') {
3597                                 reptable[j].end = true;
3598                                 reptable[j].pattern[lr] = '\0';
3599                             } else reptable[j].end = false;
3600                             break;
3601                           }
3602                   case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; }
3603                   default: break;
3604                }
3605                i++;
3606            }
3607            piece = mystrsep(&tp, 0);
3608         }
3609         if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {
3610              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3611              numrep = 0;
3612              return 1;
3613         }
3614    }
3615    return 0;
3616 }
3617 
3618 /* parse in the typical fault correcting table */
parse_convtable(char * line,FileMgr * af,RepList ** rl,const char * keyword)3619 int  AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword)
3620 {
3621    if (*rl) {
3622       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3623       return 1;
3624    }
3625    char * tp = line;
3626    char * piece;
3627    int i = 0;
3628    int np = 0;
3629    int numrl = 0;
3630    piece = mystrsep(&tp, 0);
3631    while (piece) {
3632        if (*piece != '\0') {
3633           switch(i) {
3634              case 0: { np++; break; }
3635              case 1: {
3636                        numrl = atoi(piece);
3637                        if (numrl < 1) {
3638                           HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());
3639                           return 1;
3640                        }
3641                        *rl = new RepList(numrl);
3642                        if (!*rl) return 1;
3643                        np++;
3644                        break;
3645                      }
3646              default: break;
3647           }
3648           i++;
3649        }
3650        piece = mystrsep(&tp, 0);
3651    }
3652    if (np != 2) {
3653       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3654       return 1;
3655    }
3656 
3657    /* now parse the num lines to read in the remainder of the table */
3658    char * nl;
3659    for (int j=0; j < numrl; j++) {
3660         if (!(nl = af->getline())) return 1;
3661         mychomp(nl);
3662         tp = nl;
3663         i = 0;
3664         char * pattern = NULL;
3665         char * pattern2 = NULL;
3666         piece = mystrsep(&tp, 0);
3667         while (piece) {
3668            if (*piece != '\0') {
3669                switch(i) {
3670                   case 0: {
3671                              if (strncmp(piece, keyword, strlen(keyword)) != 0) {
3672                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3673                                  delete *rl;
3674                                  *rl = NULL;
3675                                  return 1;
3676                              }
3677                              break;
3678                           }
3679                   case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; }
3680                   case 2: {
3681                     pattern2 = mystrrep(mystrdup(piece),"_"," ");
3682                     break;
3683                   }
3684                   default: break;
3685                }
3686                i++;
3687            }
3688            piece = mystrsep(&tp, 0);
3689         }
3690         if (!pattern || !pattern2) {
3691             if (pattern)
3692                 free(pattern);
3693             if (pattern2)
3694                 free(pattern2);
3695             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3696             return 1;
3697         }
3698         (*rl)->add(pattern, pattern2);
3699    }
3700    return 0;
3701 }
3702 
3703 
3704 /* parse in the typical fault correcting table */
parse_phonetable(char * line,FileMgr * af)3705 int  AffixMgr::parse_phonetable(char * line, FileMgr * af)
3706 {
3707    if (phone) {
3708       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3709       return 1;
3710    }
3711    char * tp = line;
3712    char * piece;
3713    int i = 0;
3714    int np = 0;
3715    piece = mystrsep(&tp, 0);
3716    while (piece) {
3717        if (*piece != '\0') {
3718           switch(i) {
3719              case 0: { np++; break; }
3720              case 1: {
3721                        phone = (phonetable *) malloc(sizeof(struct phonetable));
3722                        if (!phone) return 1;
3723                        phone->num = atoi(piece);
3724                        phone->rules = NULL;
3725                        phone->utf8 = (char) utf8;
3726                        if (phone->num < 1) {
3727                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3728                           return 1;
3729                        }
3730                        phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *));
3731                        if (!phone->rules) {
3732                           free(phone);
3733                           phone = NULL;
3734                           return 1;
3735                        }
3736                        np++;
3737                        break;
3738                      }
3739              default: break;
3740           }
3741           i++;
3742        }
3743        piece = mystrsep(&tp, 0);
3744    }
3745    if (np != 2) {
3746       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3747       return 1;
3748    }
3749 
3750    /* now parse the phone->num lines to read in the remainder of the table */
3751    char * nl;
3752    for (int j=0; j < phone->num; j++) {
3753         if (!(nl = af->getline())) return 1;
3754         mychomp(nl);
3755         tp = nl;
3756         i = 0;
3757         phone->rules[j * 2] = NULL;
3758         phone->rules[j * 2 + 1] = NULL;
3759         piece = mystrsep(&tp, 0);
3760         while (piece) {
3761            if (*piece != '\0') {
3762                switch(i) {
3763                   case 0: {
3764                              if (strncmp(piece,"PHONE",5) != 0) {
3765                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3766                                  phone->num = 0;
3767                                  return 1;
3768                              }
3769                              break;
3770                           }
3771                   case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; }
3772                   case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; }
3773                   default: break;
3774                }
3775                i++;
3776            }
3777            piece = mystrsep(&tp, 0);
3778         }
3779         if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) {
3780              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3781              phone->num = 0;
3782              return 1;
3783         }
3784    }
3785    phone->rules[phone->num * 2] = mystrdup("");
3786    phone->rules[phone->num * 2 + 1] = mystrdup("");
3787    init_phonet_hash(*phone);
3788    return 0;
3789 }
3790 
3791 /* parse in the checkcompoundpattern table */
parse_checkcpdtable(char * line,FileMgr * af)3792 int  AffixMgr::parse_checkcpdtable(char * line, FileMgr * af)
3793 {
3794    if (numcheckcpd != 0) {
3795       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3796       return 1;
3797    }
3798    char * tp = line;
3799    char * piece;
3800    int i = 0;
3801    int np = 0;
3802    piece = mystrsep(&tp, 0);
3803    while (piece) {
3804        if (*piece != '\0') {
3805           switch(i) {
3806              case 0: { np++; break; }
3807              case 1: {
3808                        numcheckcpd = atoi(piece);
3809                        if (numcheckcpd < 1) {
3810                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3811                           return 1;
3812                        }
3813                        checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry));
3814                        if (!checkcpdtable) return 1;
3815                        np++;
3816                        break;
3817                      }
3818              default: break;
3819           }
3820           i++;
3821        }
3822        piece = mystrsep(&tp, 0);
3823    }
3824    if (np != 2) {
3825       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",  af->getlinenum());
3826       return 1;
3827    }
3828 
3829    /* now parse the numcheckcpd lines to read in the remainder of the table */
3830    char * nl;
3831    for (int j=0; j < numcheckcpd; j++) {
3832         if (!(nl = af->getline())) return 1;
3833         mychomp(nl);
3834         tp = nl;
3835         i = 0;
3836         checkcpdtable[j].pattern = NULL;
3837         checkcpdtable[j].pattern2 = NULL;
3838         checkcpdtable[j].pattern3 = NULL;
3839         checkcpdtable[j].cond = FLAG_NULL;
3840         checkcpdtable[j].cond2 = FLAG_NULL;
3841         piece = mystrsep(&tp, 0);
3842         while (piece) {
3843            if (*piece != '\0') {
3844                switch(i) {
3845                   case 0: {
3846                              if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) {
3847                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3848                                  numcheckcpd = 0;
3849                                  return 1;
3850                              }
3851                              break;
3852                           }
3853                   case 1: {
3854                     checkcpdtable[j].pattern = mystrdup(piece);
3855                     char * p = strchr(checkcpdtable[j].pattern, '/');
3856                     if (p) {
3857                       *p = '\0';
3858                     checkcpdtable[j].cond = pHMgr->decode_flag(p + 1);
3859                     }
3860                     break; }
3861                   case 2: {
3862                     checkcpdtable[j].pattern2 = mystrdup(piece);
3863                     char * p = strchr(checkcpdtable[j].pattern2, '/');
3864                     if (p) {
3865                       *p = '\0';
3866                       checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1);
3867                     }
3868                     break;
3869                     }
3870                   case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; }
3871                   default: break;
3872                }
3873                i++;
3874            }
3875            piece = mystrsep(&tp, 0);
3876         }
3877         if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) {
3878              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3879              numcheckcpd = 0;
3880              return 1;
3881         }
3882    }
3883    return 0;
3884 }
3885 
3886 /* parse in the compound rule table */
parse_defcpdtable(char * line,FileMgr * af)3887 int  AffixMgr::parse_defcpdtable(char * line, FileMgr * af)
3888 {
3889    if (numdefcpd != 0) {
3890       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3891       return 1;
3892    }
3893    char * tp = line;
3894    char * piece;
3895    int i = 0;
3896    int np = 0;
3897    piece = mystrsep(&tp, 0);
3898    while (piece) {
3899        if (*piece != '\0') {
3900           switch(i) {
3901              case 0: { np++; break; }
3902              case 1: {
3903                        numdefcpd = atoi(piece);
3904                        if (numdefcpd < 1) {
3905                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3906                           return 1;
3907                        }
3908                        defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry));
3909                        if (!defcpdtable) return 1;
3910                        np++;
3911                        break;
3912                      }
3913              default: break;
3914           }
3915           i++;
3916        }
3917        piece = mystrsep(&tp, 0);
3918    }
3919    if (np != 2) {
3920       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3921       return 1;
3922    }
3923 
3924    /* now parse the numdefcpd lines to read in the remainder of the table */
3925    char * nl;
3926    for (int j=0; j < numdefcpd; j++) {
3927         if (!(nl = af->getline())) return 1;
3928         mychomp(nl);
3929         tp = nl;
3930         i = 0;
3931         defcpdtable[j].def = NULL;
3932         piece = mystrsep(&tp, 0);
3933         while (piece) {
3934            if (*piece != '\0') {
3935                switch(i) {
3936                   case 0: {
3937                              if (strncmp(piece, "COMPOUNDRULE", 12) != 0) {
3938                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3939                                  numdefcpd = 0;
3940                                  return 1;
3941                              }
3942                              break;
3943                           }
3944                   case 1: { // handle parenthesized flags
3945                             if (strchr(piece, '(')) {
3946                                 defcpdtable[j].def = (FLAG *) malloc(strlen(piece) * sizeof(FLAG));
3947                                 defcpdtable[j].len = 0;
3948                                 int end = 0;
3949                                 FLAG * conv;
3950                                 while (!end) {
3951                                     char * par = piece + 1;
3952                                     while (*par != '(' && *par != ')' && *par != '\0') par++;
3953                                     if (*par == '\0') end = 1; else *par = '\0';
3954                                     if (*piece == '(') piece++;
3955                                     if (*piece == '*' || *piece == '?') {
3956                                         defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece;
3957                                     } else if (*piece != '\0') {
3958                                         int l = pHMgr->decode_flags(&conv, piece, af);
3959                                         for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k];
3960                                         free(conv);
3961                                     }
3962                                     piece = par + 1;
3963                                 }
3964                             } else {
3965                                 defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece, af);
3966                             }
3967                             break;
3968                            }
3969                   default: break;
3970                }
3971                i++;
3972            }
3973            piece = mystrsep(&tp, 0);
3974         }
3975         if (!defcpdtable[j].len) {
3976              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3977              numdefcpd = 0;
3978              return 1;
3979         }
3980    }
3981    return 0;
3982 }
3983 
3984 
3985 /* parse in the character map table */
parse_maptable(char * line,FileMgr * af)3986 int  AffixMgr::parse_maptable(char * line, FileMgr * af)
3987 {
3988    if (nummap != 0) {
3989       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3990       return 1;
3991    }
3992    char * tp = line;
3993    char * piece;
3994    int i = 0;
3995    int np = 0;
3996    piece = mystrsep(&tp, 0);
3997    while (piece) {
3998        if (*piece != '\0') {
3999           switch(i) {
4000              case 0: { np++; break; }
4001              case 1: {
4002                        nummap = atoi(piece);
4003                        if (nummap < 1) {
4004                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
4005                           return 1;
4006                        }
4007                        maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
4008                        if (!maptable) return 1;
4009                        np++;
4010                        break;
4011                      }
4012              default: break;
4013           }
4014           i++;
4015        }
4016        piece = mystrsep(&tp, 0);
4017    }
4018    if (np != 2) {
4019       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
4020       return 1;
4021    }
4022 
4023    /* now parse the nummap lines to read in the remainder of the table */
4024    char * nl;
4025    for (int j=0; j < nummap; j++) {
4026         if (!(nl = af->getline())) return 1;
4027         mychomp(nl);
4028         tp = nl;
4029         i = 0;
4030         maptable[j].set = NULL;
4031         maptable[j].len = 0;
4032         piece = mystrsep(&tp, 0);
4033         while (piece) {
4034            if (*piece != '\0') {
4035                switch(i) {
4036                   case 0: {
4037                              if (strncmp(piece,"MAP",3) != 0) {
4038                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4039                                  nummap = 0;
4040                                  return 1;
4041                              }
4042                              break;
4043                           }
4044                   case 1: {
4045 			    int setn = 0;
4046                             maptable[j].len = strlen(piece);
4047                             maptable[j].set = (char **) malloc(maptable[j].len * sizeof(char*));
4048                             if (!maptable[j].set) return 1;
4049 			    for (int k = 0; k < maptable[j].len; k++) {
4050 				int chl = 1;
4051 				int chb = k;
4052 			        if (piece[k] == '(') {
4053 				    char * parpos = strchr(piece + k, ')');
4054 				    if (parpos != NULL) {
4055 					chb = k + 1;
4056 					chl = (int)(parpos - piece) - k - 1;
4057 					k = k + chl + 1;
4058 				    }
4059 				} else {
4060 				    if (utf8 && (piece[k] & 0xc0) == 0xc0) {
4061 					for (k++; utf8 && (piece[k] & 0xc0) == 0x80; k++);
4062 					chl = k - chb;
4063 					k--;
4064 				    }
4065 				}
4066 				maptable[j].set[setn] = (char *) malloc(chl + 1);
4067 				if (!maptable[j].set[setn]) return 1;
4068 				strncpy(maptable[j].set[setn], piece + chb, chl);
4069 				maptable[j].set[setn][chl] = '\0';
4070 				setn++;
4071 			    }
4072                             maptable[j].len = setn;
4073                             break; }
4074                   default: break;
4075                }
4076                i++;
4077            }
4078            piece = mystrsep(&tp, 0);
4079         }
4080         if (!maptable[j].set || !maptable[j].len) {
4081              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4082              nummap = 0;
4083              return 1;
4084         }
4085    }
4086    return 0;
4087 }
4088 
4089 /* parse in the word breakpoint table */
parse_breaktable(char * line,FileMgr * af)4090 int  AffixMgr::parse_breaktable(char * line, FileMgr * af)
4091 {
4092    if (numbreak > -1) {
4093       HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
4094       return 1;
4095    }
4096    char * tp = line;
4097    char * piece;
4098    int i = 0;
4099    int np = 0;
4100    piece = mystrsep(&tp, 0);
4101    while (piece) {
4102        if (*piece != '\0') {
4103           switch(i) {
4104              case 0: { np++; break; }
4105              case 1: {
4106                        numbreak = atoi(piece);
4107                        if (numbreak < 0) {
4108                           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
4109                           return 1;
4110                        }
4111                        if (numbreak == 0) return 0;
4112                        breaktable = (char **) malloc(numbreak * sizeof(char *));
4113                        if (!breaktable) return 1;
4114                        np++;
4115                        break;
4116                      }
4117              default: break;
4118           }
4119           i++;
4120        }
4121        piece = mystrsep(&tp, 0);
4122    }
4123    if (np != 2) {
4124       HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
4125       return 1;
4126    }
4127 
4128    /* now parse the numbreak lines to read in the remainder of the table */
4129    char * nl;
4130    for (int j=0; j < numbreak; j++) {
4131         if (!(nl = af->getline())) return 1;
4132         mychomp(nl);
4133         tp = nl;
4134         i = 0;
4135         piece = mystrsep(&tp, 0);
4136         while (piece) {
4137            if (*piece != '\0') {
4138                switch(i) {
4139                   case 0: {
4140                              if (strncmp(piece,"BREAK",5) != 0) {
4141                                  HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4142                                  numbreak = 0;
4143                                  return 1;
4144                              }
4145                              break;
4146                           }
4147                   case 1: {
4148                             breaktable[j] = mystrdup(piece);
4149                             break;
4150                           }
4151                   default: break;
4152                }
4153                i++;
4154            }
4155            piece = mystrsep(&tp, 0);
4156         }
4157         if (!breaktable) {
4158              HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4159              numbreak = 0;
4160              return 1;
4161         }
4162    }
4163    return 0;
4164 }
4165 
reverse_condition(char * piece)4166 void AffixMgr::reverse_condition(char * piece) {
4167     int neg = 0;
4168     for (char * k = piece + strlen(piece) - 1; k >= piece; k--) {
4169         switch(*k) {
4170           case '[': {
4171                 if (neg) *(k+1) = '['; else *k = ']';
4172                     break;
4173             }
4174           case ']': {
4175                 *k = '[';
4176                 if (neg) *(k+1) = '^';
4177                 neg = 0;
4178                 break;
4179             }
4180           case '^': {
4181                if (*(k+1) == ']') neg = 1; else *(k+1) = *k;
4182                break;
4183                 }
4184           default: {
4185             if (neg) *(k+1) = *k;
4186           }
4187        }
4188     }
4189 }
4190 
parse_affix(char * line,const char at,FileMgr * af,char * dupflags)4191 int  AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags)
4192 {
4193    int numents = 0;      // number of affentry structures to parse
4194 
4195    unsigned short aflag = 0;      // affix char identifier
4196 
4197    char ff=0;
4198    std::vector<affentry> affentries;
4199 
4200    char * tp = line;
4201    char * nl = line;
4202    char * piece;
4203    int i = 0;
4204 
4205    // checking lines with bad syntax
4206 #ifdef DEBUG
4207    int basefieldnum = 0;
4208 #endif
4209 
4210    // split affix header line into pieces
4211 
4212    int np = 0;
4213 
4214    piece = mystrsep(&tp, 0);
4215    while (piece) {
4216       if (*piece != '\0') {
4217           switch(i) {
4218              // piece 1 - is type of affix
4219              case 0: { np++; break; }
4220 
4221              // piece 2 - is affix char
4222              case 1: {
4223                     np++;
4224                     aflag = pHMgr->decode_flag(piece);
4225                     if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
4226                         ((at == 'P') && (dupflags[aflag] & dupPFX))) {
4227                         HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n",
4228                             af->getlinenum());
4229                         // return 1; XXX permissive mode for bad dictionaries
4230                     }
4231                     dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX);
4232                     break;
4233                     }
4234              // piece 3 - is cross product indicator
4235              case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }
4236 
4237              // piece 4 - is number of affentries
4238              case 3: {
4239                        np++;
4240                        numents = atoi(piece);
4241                        if (numents == 0) {
4242                            char * err = pHMgr->encode_flag(aflag);
4243                            if (err) {
4244                                 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4245                                    af->getlinenum());
4246                                 free(err);
4247                            }
4248                            return 1;
4249                        }
4250                        affentries.resize(numents);
4251                        affentries[0].opts = ff;
4252                        if (utf8) affentries[0].opts += aeUTF8;
4253                        if (pHMgr->is_aliasf()) affentries[0].opts += aeALIASF;
4254                        if (pHMgr->is_aliasm()) affentries[0].opts += aeALIASM;
4255                        affentries[0].aflag = aflag;
4256                      }
4257 
4258              default: break;
4259           }
4260           i++;
4261       }
4262       piece = mystrsep(&tp, 0);
4263    }
4264    // check to make sure we parsed enough pieces
4265    if (np != 4) {
4266        char * err = pHMgr->encode_flag(aflag);
4267        if (err) {
4268             HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
4269             free(err);
4270        }
4271        return 1;
4272    }
4273 
4274    // now parse numents affentries for this affix
4275    std::vector<affentry>::iterator start = affentries.begin();
4276    std::vector<affentry>::iterator end = affentries.end();
4277    for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) {
4278       if ((nl = af->getline()) == NULL) return 1;
4279       mychomp(nl);
4280       tp = nl;
4281       i = 0;
4282       np = 0;
4283 
4284       // split line into pieces
4285       piece = mystrsep(&tp, 0);
4286       while (piece) {
4287          if (*piece != '\0') {
4288              switch(i) {
4289                 // piece 1 - is type
4290                 case 0: {
4291                           np++;
4292                           if (entry != start) entry->opts = start->opts &
4293                              (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM);
4294                           break;
4295                         }
4296 
4297                 // piece 2 - is affix char
4298                 case 1: {
4299                           np++;
4300                           if (pHMgr->decode_flag(piece) != aflag) {
4301                               char * err = pHMgr->encode_flag(aflag);
4302                               if (err) {
4303                                 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4304                                     af->getlinenum(), err);
4305                                 free(err);
4306                               }
4307                               return 1;
4308                           }
4309 
4310                           if (entry != start) entry->aflag = start->aflag;
4311                           break;
4312                         }
4313 
4314                 // piece 3 - is string to strip or 0 for null
4315                 case 2: {
4316                           np++;
4317                           if (complexprefixes) {
4318                             if (utf8) reverseword_utf(piece); else reverseword(piece);
4319                           }
4320                           entry->strip = mystrdup(piece);
4321                           entry->stripl = (unsigned char) strlen(entry->strip);
4322                           if (strcmp(entry->strip,"0") == 0) {
4323                               free(entry->strip);
4324                               entry->strip=mystrdup("");
4325                               entry->stripl = 0;
4326                           }
4327                           break;
4328                         }
4329 
4330                 // piece 4 - is affix string or 0 for null
4331                 case 3: {
4332                           char * dash;
4333                           entry->morphcode = NULL;
4334                           entry->contclass = NULL;
4335                           entry->contclasslen = 0;
4336                           np++;
4337                           dash = strchr(piece, '/');
4338                           if (dash) {
4339                             *dash = '\0';
4340 
4341                             if (ignorechars) {
4342                               if (utf8) {
4343                                 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
4344                               } else {
4345                                 remove_ignored_chars(piece,ignorechars);
4346                               }
4347                             }
4348 
4349                             if (complexprefixes) {
4350                                 if (utf8) reverseword_utf(piece); else reverseword(piece);
4351                             }
4352                             entry->appnd = mystrdup(piece);
4353 
4354                             if (pHMgr->is_aliasf()) {
4355                                 int index = atoi(dash + 1);
4356                                 entry->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(entry->contclass), af);
4357                                 if (!entry->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1);
4358                             } else {
4359                                 entry->contclasslen = (unsigned short) pHMgr->decode_flags(&(entry->contclass), dash + 1, af);
4360                                 flag_qsort(entry->contclass, 0, entry->contclasslen);
4361                             }
4362                             *dash = '/';
4363 
4364                             havecontclass = 1;
4365                             for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
4366                               contclasses[(entry->contclass)[_i]] = 1;
4367                             }
4368                           } else {
4369                             if (ignorechars) {
4370                               if (utf8) {
4371                                 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
4372                               } else {
4373                                 remove_ignored_chars(piece,ignorechars);
4374                               }
4375                             }
4376 
4377                             if (complexprefixes) {
4378                                 if (utf8) reverseword_utf(piece); else reverseword(piece);
4379                             }
4380                             entry->appnd = mystrdup(piece);
4381                           }
4382 
4383                           entry->appndl = (unsigned char) strlen(entry->appnd);
4384                           if (strcmp(entry->appnd,"0") == 0) {
4385                               free(entry->appnd);
4386                               entry->appnd=mystrdup("");
4387                               entry->appndl = 0;
4388                           }
4389                           break;
4390                         }
4391 
4392                 // piece 5 - is the conditions descriptions
4393                 case 4: {
4394                           np++;
4395                           if (complexprefixes) {
4396                             if (utf8) reverseword_utf(piece); else reverseword(piece);
4397                             reverse_condition(piece);
4398                           }
4399                           if (entry->stripl && (strcmp(piece, ".") != 0) &&
4400                             redundant_condition(at, entry->strip, entry->stripl, piece, af->getlinenum()))
4401                                 strcpy(piece, ".");
4402                           if (at == 'S') {
4403                             reverseword(piece);
4404                             reverse_condition(piece);
4405                           }
4406                           if (encodeit(*entry, piece)) return 1;
4407                          break;
4408                 }
4409 
4410                 case 5: {
4411                           np++;
4412                           if (pHMgr->is_aliasm()) {
4413                             int index = atoi(piece);
4414                             entry->morphcode = pHMgr->get_aliasm(index);
4415                           } else {
4416                             if (complexprefixes) { // XXX - fix me for morph. gen.
4417                                 if (utf8) reverseword_utf(piece); else reverseword(piece);
4418                             }
4419                             // add the remaining of the line
4420                             if (*tp) {
4421                                 *(tp - 1) = ' ';
4422                                 tp = tp + strlen(tp);
4423                             }
4424                             entry->morphcode = mystrdup(piece);
4425                             if (!entry->morphcode) return 1;
4426                           }
4427                           break;
4428                 }
4429                 default: break;
4430              }
4431              i++;
4432          }
4433          piece = mystrsep(&tp, 0);
4434       }
4435       // check to make sure we parsed enough pieces
4436       if (np < 4) {
4437           char * err = pHMgr->encode_flag(aflag);
4438           if (err) {
4439             HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4440                 af->getlinenum(), err);
4441             free(err);
4442           }
4443           return 1;
4444       }
4445 
4446 #ifdef DEBUG
4447       // detect unnecessary fields, excepting comments
4448       if (basefieldnum) {
4449         int fieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6);
4450           if (fieldnum != basefieldnum)
4451             HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum());
4452       } else {
4453         basefieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6);
4454       }
4455 #endif
4456    }
4457 
4458    // now create SfxEntry or PfxEntry objects and use links to
4459    // build an ordered (sorted by affix string) list
4460    for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) {
4461       if (at == 'P') {
4462           PfxEntry * pfxptr = new PfxEntry(this,&(*entry));
4463           build_pfxtree(pfxptr);
4464       } else {
4465           SfxEntry * sfxptr = new SfxEntry(this,&(*entry));
4466           build_sfxtree(sfxptr);
4467       }
4468    }
4469    return 0;
4470 }
4471 
redundant_condition(char ft,char * strip,int stripl,const char * cond,int linenum)4472 int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) {
4473   int condl = strlen(cond);
4474   int i;
4475   int j;
4476   int neg;
4477   int in;
4478   if (ft == 'P') { // prefix
4479     if (strncmp(strip, cond, condl) == 0) return 1;
4480     if (utf8) {
4481     } else {
4482       for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
4483         if (cond[j] != '[') {
4484           if (cond[j] != strip[i]) {
4485             HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4486             return 0;
4487           }
4488         } else {
4489           neg = (cond[j+1] == '^') ? 1 : 0;
4490           in = 0;
4491           do {
4492             j++;
4493             if (strip[i] == cond[j]) in = 1;
4494           } while ((j < (condl - 1)) && (cond[j] != ']'));
4495           if (j == (condl - 1) && (cond[j] != ']')) {
4496             HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum, cond);
4497             return 0;
4498           }
4499           if ((!neg && !in) || (neg && in)) {
4500             HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4501             return 0;
4502           }
4503         }
4504       }
4505       if (j >= condl) return 1;
4506     }
4507   } else { // suffix
4508     if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1;
4509     if (utf8) {
4510     } else {
4511       for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
4512         if (cond[j] != ']') {
4513           if (cond[j] != strip[i]) {
4514             HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4515             return 0;
4516           }
4517         } else {
4518           in = 0;
4519           do {
4520             j--;
4521             if (strip[i] == cond[j]) in = 1;
4522           } while ((j > 0) && (cond[j] != '['));
4523           if ((j == 0) && (cond[j] != '[')) {
4524             HUNSPELL_WARNING(stderr, "error: line: %d: missing ] in condition:\n%s\n", linenum, cond);
4525             return 0;
4526           }
4527           neg = (cond[j+1] == '^') ? 1 : 0;
4528           if ((!neg && !in) || (neg && in)) {
4529             HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4530             return 0;
4531           }
4532         }
4533       }
4534       if (j < 0) return 1;
4535     }
4536   }
4537   return 0;
4538 }
4539