1 #include "license.hunspell"
2 #include "license.myspell"
3
4 #include <stdlib.h>
5 #include <string.h>
6 #include <stdio.h>
7 #include <ctype.h>
8
9 #include <vector>
10
11 #include "affixmgr.hxx"
12 #include "affentry.hxx"
13 #include "langnum.hxx"
14
15 #include "csutil.hxx"
16
AffixMgr(const char * affpath,HashMgr ** ptr,int * md,const char * key)17 AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
18 {
19 // register hash manager and load affix data from aff file
20 pHMgr = ptr[0];
21 alldic = ptr;
22 maxdic = md;
23 keystring = NULL;
24 trystring = NULL;
25 encoding=NULL;
26 csconv=NULL;
27 utf8 = 0;
28 complexprefixes = 0;
29 maptable = NULL;
30 nummap = 0;
31 breaktable = NULL;
32 numbreak = -1;
33 reptable = NULL;
34 numrep = 0;
35 iconvtable = NULL;
36 oconvtable = NULL;
37 checkcpdtable = NULL;
38 // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
39 simplifiedcpd = 0;
40 numcheckcpd = 0;
41 defcpdtable = NULL;
42 numdefcpd = 0;
43 phone = NULL;
44 compoundflag = FLAG_NULL; // permits word in compound forms
45 compoundbegin = FLAG_NULL; // may be first word in compound forms
46 compoundmiddle = FLAG_NULL; // may be middle word in compound forms
47 compoundend = FLAG_NULL; // may be last word in compound forms
48 compoundroot = FLAG_NULL; // compound word signing flag
49 compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
50 compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
51 compoundmoresuffixes = 0; // allow more suffixes within compound words
52 checkcompounddup = 0; // forbid double words in compounds
53 checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
54 checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
55 checkcompoundtriple = 0; // forbid compounds with triple letters
56 simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)
57 forbiddenword = FORBIDDENWORD; // forbidden word signing flag
58 nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
59 nongramsuggest = FLAG_NULL;
60 lang = NULL; // language
61 langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
62 needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
63 cpdwordmax = -1; // default: unlimited wordcount in compound words
64 cpdmin = -1; // undefined
65 cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
66 cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
67 cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
68 cpdvowels_utf16_len=0; // vowels
69 pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
70 sfxappnd=NULL; // previous suffix for counting a special syllables BUG
71 cpdsyllablenum=NULL; // syllable count incrementing flag
72 checknum=0; // checking numbers, and word with numbers
73 wordchars=NULL; // letters + spec. word characters
74 wordchars_utf16=NULL; // letters + spec. word characters
75 wordchars_utf16_len=0; // letters + spec. word characters
76 ignorechars=NULL; // letters + spec. word characters
77 ignorechars_utf16=NULL; // letters + spec. word characters
78 ignorechars_utf16_len=0; // letters + spec. word characters
79 version=NULL; // affix and dictionary file version string
80 havecontclass=0; // flags of possible continuing classes (double affix)
81 // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
82 // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
83 lemma_present = FLAG_NULL;
84 circumfix = FLAG_NULL;
85 onlyincompound = FLAG_NULL;
86 maxngramsugs = -1; // undefined
87 maxdiff = -1; // undefined
88 onlymaxdiff = 0;
89 maxcpdsugs = -1; // undefined
90 nosplitsugs = 0;
91 sugswithdots = 0;
92 keepcase = 0;
93 forceucase = 0;
94 warn = 0;
95 forbidwarn = 0;
96 checksharps = 0;
97 substandard = FLAG_NULL;
98 fullstrip = 0;
99
100 sfx = NULL;
101 pfx = NULL;
102
103 for (int i=0; i < SETSIZE; i++) {
104 pStart[i] = NULL;
105 sStart[i] = NULL;
106 pFlag[i] = NULL;
107 sFlag[i] = NULL;
108 }
109
110 for (int j=0; j < CONTSIZE; j++) {
111 contclasses[j] = 0;
112 }
113
114 if (parse_file(affpath, key)) {
115 HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
116 }
117
118 if (cpdmin == -1) cpdmin = MINCPDLEN;
119
120 }
121
122
~AffixMgr()123 AffixMgr::~AffixMgr()
124 {
125 // pass through linked prefix entries and clean up
126 for (int i=0; i < SETSIZE ;i++) {
127 pFlag[i] = NULL;
128 PfxEntry * ptr = pStart[i];
129 PfxEntry * nptr = NULL;
130 while (ptr) {
131 nptr = ptr->getNext();
132 delete(ptr);
133 ptr = nptr;
134 nptr = NULL;
135 }
136 }
137
138 // pass through linked suffix entries and clean up
139 for (int j=0; j < SETSIZE ; j++) {
140 sFlag[j] = NULL;
141 SfxEntry * ptr = sStart[j];
142 SfxEntry * nptr = NULL;
143 while (ptr) {
144 nptr = ptr->getNext();
145 delete(ptr);
146 ptr = nptr;
147 nptr = NULL;
148 }
149 sStart[j] = NULL;
150 }
151
152 if (keystring) free(keystring);
153 keystring=NULL;
154 if (trystring) free(trystring);
155 trystring=NULL;
156 if (encoding) free(encoding);
157 encoding=NULL;
158 if (maptable) {
159 for (int j=0; j < nummap; j++) {
160 for (int k=0; k < maptable[j].len; k++) {
161 if (maptable[j].set[k]) free(maptable[j].set[k]);
162 }
163 free(maptable[j].set);
164 maptable[j].set = NULL;
165 maptable[j].len = 0;
166 }
167 free(maptable);
168 maptable = NULL;
169 }
170 nummap = 0;
171 if (breaktable) {
172 for (int j=0; j < numbreak; j++) {
173 if (breaktable[j]) free(breaktable[j]);
174 breaktable[j] = NULL;
175 }
176 free(breaktable);
177 breaktable = NULL;
178 }
179 numbreak = 0;
180 if (reptable) {
181 for (int j=0; j < numrep; j++) {
182 free(reptable[j].pattern);
183 free(reptable[j].pattern2);
184 }
185 free(reptable);
186 reptable = NULL;
187 }
188 if (iconvtable) delete iconvtable;
189 if (oconvtable) delete oconvtable;
190 if (phone && phone->rules) {
191 for (int j=0; j < phone->num + 1; j++) {
192 free(phone->rules[j * 2]);
193 free(phone->rules[j * 2 + 1]);
194 }
195 free(phone->rules);
196 free(phone);
197 phone = NULL;
198 }
199
200 if (defcpdtable) {
201 for (int j=0; j < numdefcpd; j++) {
202 free(defcpdtable[j].def);
203 defcpdtable[j].def = NULL;
204 }
205 free(defcpdtable);
206 defcpdtable = NULL;
207 }
208 numrep = 0;
209 if (checkcpdtable) {
210 for (int j=0; j < numcheckcpd; j++) {
211 free(checkcpdtable[j].pattern);
212 free(checkcpdtable[j].pattern2);
213 free(checkcpdtable[j].pattern3);
214 checkcpdtable[j].pattern = NULL;
215 checkcpdtable[j].pattern2 = NULL;
216 checkcpdtable[j].pattern3 = NULL;
217 }
218 free(checkcpdtable);
219 checkcpdtable = NULL;
220 }
221 numcheckcpd = 0;
222 FREE_FLAG(compoundflag);
223 FREE_FLAG(compoundbegin);
224 FREE_FLAG(compoundmiddle);
225 FREE_FLAG(compoundend);
226 FREE_FLAG(compoundpermitflag);
227 FREE_FLAG(compoundforbidflag);
228 FREE_FLAG(compoundroot);
229 FREE_FLAG(forbiddenword);
230 FREE_FLAG(nosuggest);
231 FREE_FLAG(nongramsuggest);
232 FREE_FLAG(needaffix);
233 FREE_FLAG(lemma_present);
234 FREE_FLAG(circumfix);
235 FREE_FLAG(onlyincompound);
236
237 cpdwordmax = 0;
238 pHMgr = NULL;
239 cpdmin = 0;
240 cpdmaxsyllable = 0;
241 if (cpdvowels) free(cpdvowels);
242 if (cpdvowels_utf16) free(cpdvowels_utf16);
243 if (cpdsyllablenum) free(cpdsyllablenum);
244 free_utf_tbl();
245 if (lang) free(lang);
246 if (wordchars) free(wordchars);
247 if (wordchars_utf16) free(wordchars_utf16);
248 if (ignorechars) free(ignorechars);
249 if (ignorechars_utf16) free(ignorechars_utf16);
250 if (version) free(version);
251 checknum=0;
252 #ifdef MOZILLA_CLIENT
253 delete [] csconv;
254 #endif
255 }
256
finishFileMgr(FileMgr * afflst)257 void AffixMgr::finishFileMgr(FileMgr *afflst)
258 {
259 delete afflst;
260
261 // convert affix trees to sorted list
262 process_pfx_tree_to_list();
263 process_sfx_tree_to_list();
264 }
265
266 // read in aff file and build up prefix and suffix entry objects
parse_file(const char * affpath,const char * key)267 int AffixMgr::parse_file(const char * affpath, const char * key)
268 {
269 char * line; // io buffers
270 char ft; // affix type
271
272 // checking flag duplication
273 char dupflags[CONTSIZE];
274 char dupflags_ini = 1;
275
276 // first line indicator for removing byte order mark
277 int firstline = 1;
278
279 // open the affix file
280 FileMgr * afflst = new FileMgr(affpath, key);
281 if (!afflst) {
282 HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
283 return 1;
284 }
285
286 // step one is to parse the affix file building up the internal
287 // affix data structures
288
289 // read in each line ignoring any that do not
290 // start with a known line type indicator
291 while ((line = afflst->getline()) != NULL) {
292 mychomp(line);
293
294 /* remove byte order mark */
295 if (firstline) {
296 firstline = 0;
297 // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions
298 if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
299 memmove(line, line+3, strlen(line+3)+1);
300 }
301 }
302
303 /* parse in the keyboard string */
304 if (strncmp(line,"KEY",3) == 0) {
305 if (parse_string(line, &keystring, afflst->getlinenum())) {
306 finishFileMgr(afflst);
307 return 1;
308 }
309 }
310
311 /* parse in the try string */
312 if (strncmp(line,"TRY",3) == 0) {
313 if (parse_string(line, &trystring, afflst->getlinenum())) {
314 finishFileMgr(afflst);
315 return 1;
316 }
317 }
318
319 /* parse in the name of the character set used by the .dict and .aff */
320 if (strncmp(line,"SET",3) == 0) {
321 if (parse_string(line, &encoding, afflst->getlinenum())) {
322 finishFileMgr(afflst);
323 return 1;
324 }
325 if (strcmp(encoding, "UTF-8") == 0) {
326 utf8 = 1;
327 #ifndef OPENOFFICEORG
328 #ifndef MOZILLA_CLIENT
329 if (initialize_utf_tbl()) return 1;
330 #endif
331 #endif
332 }
333 }
334
335 /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
336 if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
337 complexprefixes = 1;
338
339 /* parse in the flag used by the controlled compound words */
340 if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
341 if (parse_flag(line, &compoundflag, afflst)) {
342 finishFileMgr(afflst);
343 return 1;
344 }
345 }
346
347 /* parse in the flag used by compound words */
348 if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
349 if (complexprefixes) {
350 if (parse_flag(line, &compoundend, afflst)) {
351 finishFileMgr(afflst);
352 return 1;
353 }
354 } else {
355 if (parse_flag(line, &compoundbegin, afflst)) {
356 finishFileMgr(afflst);
357 return 1;
358 }
359 }
360 }
361
362 /* parse in the flag used by compound words */
363 if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
364 if (parse_flag(line, &compoundmiddle, afflst)) {
365 finishFileMgr(afflst);
366 return 1;
367 }
368 }
369 /* parse in the flag used by compound words */
370 if (strncmp(line,"COMPOUNDEND",11) == 0) {
371 if (complexprefixes) {
372 if (parse_flag(line, &compoundbegin, afflst)) {
373 finishFileMgr(afflst);
374 return 1;
375 }
376 } else {
377 if (parse_flag(line, &compoundend, afflst)) {
378 finishFileMgr(afflst);
379 return 1;
380 }
381 }
382 }
383
384 /* parse in the data used by compound_check() method */
385 if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
386 if (parse_num(line, &cpdwordmax, afflst)) {
387 finishFileMgr(afflst);
388 return 1;
389 }
390 }
391
392 /* parse in the flag sign compounds in dictionary */
393 if (strncmp(line,"COMPOUNDROOT",12) == 0) {
394 if (parse_flag(line, &compoundroot, afflst)) {
395 finishFileMgr(afflst);
396 return 1;
397 }
398 }
399
400 /* parse in the flag used by compound_check() method */
401 if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
402 if (parse_flag(line, &compoundpermitflag, afflst)) {
403 finishFileMgr(afflst);
404 return 1;
405 }
406 }
407
408 /* parse in the flag used by compound_check() method */
409 if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
410 if (parse_flag(line, &compoundforbidflag, afflst)) {
411 finishFileMgr(afflst);
412 return 1;
413 }
414 }
415
416 if (strncmp(line,"COMPOUNDMORESUFFIXES",20) == 0) {
417 compoundmoresuffixes = 1;
418 }
419
420 if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
421 checkcompounddup = 1;
422 }
423
424 if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
425 checkcompoundrep = 1;
426 }
427
428 if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
429 checkcompoundtriple = 1;
430 }
431
432 if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) {
433 simplifiedtriple = 1;
434 }
435
436 if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
437 checkcompoundcase = 1;
438 }
439
440 if (strncmp(line,"NOSUGGEST",9) == 0) {
441 if (parse_flag(line, &nosuggest, afflst)) {
442 finishFileMgr(afflst);
443 return 1;
444 }
445 }
446
447 if (strncmp(line,"NONGRAMSUGGEST",14) == 0) {
448 if (parse_flag(line, &nongramsuggest, afflst)) {
449 finishFileMgr(afflst);
450 return 1;
451 }
452 }
453
454 /* parse in the flag used by forbidden words */
455 if (strncmp(line,"FORBIDDENWORD",13) == 0) {
456 if (parse_flag(line, &forbiddenword, afflst)) {
457 finishFileMgr(afflst);
458 return 1;
459 }
460 }
461
462 /* parse in the flag used by forbidden words */
463 if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
464 if (parse_flag(line, &lemma_present, afflst)) {
465 finishFileMgr(afflst);
466 return 1;
467 }
468 }
469
470 /* parse in the flag used by circumfixes */
471 if (strncmp(line,"CIRCUMFIX",9) == 0) {
472 if (parse_flag(line, &circumfix, afflst)) {
473 finishFileMgr(afflst);
474 return 1;
475 }
476 }
477
478 /* parse in the flag used by fogemorphemes */
479 if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
480 if (parse_flag(line, &onlyincompound, afflst)) {
481 finishFileMgr(afflst);
482 return 1;
483 }
484 }
485
486 /* parse in the flag used by `needaffixs' */
487 if (strncmp(line,"PSEUDOROOT",10) == 0) {
488 if (parse_flag(line, &needaffix, afflst)) {
489 finishFileMgr(afflst);
490 return 1;
491 }
492 }
493
494 /* parse in the flag used by `needaffixs' */
495 if (strncmp(line,"NEEDAFFIX",9) == 0) {
496 if (parse_flag(line, &needaffix, afflst)) {
497 finishFileMgr(afflst);
498 return 1;
499 }
500 }
501
502 /* parse in the minimal length for words in compounds */
503 if (strncmp(line,"COMPOUNDMIN",11) == 0) {
504 if (parse_num(line, &cpdmin, afflst)) {
505 finishFileMgr(afflst);
506 return 1;
507 }
508 if (cpdmin < 1) cpdmin = 1;
509 }
510
511 /* parse in the max. words and syllables in compounds */
512 if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
513 if (parse_cpdsyllable(line, afflst)) {
514 finishFileMgr(afflst);
515 return 1;
516 }
517 }
518
519 /* parse in the flag used by compound_check() method */
520 if (strncmp(line,"SYLLABLENUM",11) == 0) {
521 if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) {
522 finishFileMgr(afflst);
523 return 1;
524 }
525 }
526
527 /* parse in the flag used by the controlled compound words */
528 if (strncmp(line,"CHECKNUM",8) == 0) {
529 checknum=1;
530 }
531
532 /* parse in the extra word characters */
533 if (strncmp(line,"WORDCHARS",9) == 0) {
534 if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) {
535 finishFileMgr(afflst);
536 return 1;
537 }
538 }
539
540 /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
541 if (strncmp(line,"IGNORE",6) == 0) {
542 if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
543 finishFileMgr(afflst);
544 return 1;
545 }
546 }
547
548 /* parse in the typical fault correcting table */
549 if (strncmp(line,"REP",3) == 0) {
550 if (parse_reptable(line, afflst)) {
551 finishFileMgr(afflst);
552 return 1;
553 }
554 }
555
556 /* parse in the input conversion table */
557 if (strncmp(line,"ICONV",5) == 0) {
558 if (parse_convtable(line, afflst, &iconvtable, "ICONV")) {
559 finishFileMgr(afflst);
560 return 1;
561 }
562 }
563
564 /* parse in the input conversion table */
565 if (strncmp(line,"OCONV",5) == 0) {
566 if (parse_convtable(line, afflst, &oconvtable, "OCONV")) {
567 finishFileMgr(afflst);
568 return 1;
569 }
570 }
571
572 /* parse in the phonetic translation table */
573 if (strncmp(line,"PHONE",5) == 0) {
574 if (parse_phonetable(line, afflst)) {
575 finishFileMgr(afflst);
576 return 1;
577 }
578 }
579
580 /* parse in the checkcompoundpattern table */
581 if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
582 if (parse_checkcpdtable(line, afflst)) {
583 finishFileMgr(afflst);
584 return 1;
585 }
586 }
587
588 /* parse in the defcompound table */
589 if (strncmp(line,"COMPOUNDRULE",12) == 0) {
590 if (parse_defcpdtable(line, afflst)) {
591 finishFileMgr(afflst);
592 return 1;
593 }
594 }
595
596 /* parse in the related character map table */
597 if (strncmp(line,"MAP",3) == 0) {
598 if (parse_maptable(line, afflst)) {
599 finishFileMgr(afflst);
600 return 1;
601 }
602 }
603
604 /* parse in the word breakpoints table */
605 if (strncmp(line,"BREAK",5) == 0) {
606 if (parse_breaktable(line, afflst)) {
607 finishFileMgr(afflst);
608 return 1;
609 }
610 }
611
612 /* parse in the language for language specific codes */
613 if (strncmp(line,"LANG",4) == 0) {
614 if (parse_string(line, &lang, afflst->getlinenum())) {
615 finishFileMgr(afflst);
616 return 1;
617 }
618 langnum = get_lang_num(lang);
619 }
620
621 if (strncmp(line,"VERSION",7) == 0) {
622 for(line = line + 7; *line == ' ' || *line == '\t'; line++);
623 version = mystrdup(line);
624 }
625
626 if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
627 if (parse_num(line, &maxngramsugs, afflst)) {
628 finishFileMgr(afflst);
629 return 1;
630 }
631 }
632
633 if (strncmp(line,"ONLYMAXDIFF", 11) == 0)
634 onlymaxdiff = 1;
635
636 if (strncmp(line,"MAXDIFF",7) == 0) {
637 if (parse_num(line, &maxdiff, afflst)) {
638 finishFileMgr(afflst);
639 return 1;
640 }
641 }
642
643 if (strncmp(line,"MAXCPDSUGS",10) == 0) {
644 if (parse_num(line, &maxcpdsugs, afflst)) {
645 finishFileMgr(afflst);
646 return 1;
647 }
648 }
649
650 if (strncmp(line,"NOSPLITSUGS",11) == 0) {
651 nosplitsugs=1;
652 }
653
654 if (strncmp(line,"FULLSTRIP",9) == 0) {
655 fullstrip=1;
656 }
657
658 if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
659 sugswithdots=1;
660 }
661
662 /* parse in the flag used by forbidden words */
663 if (strncmp(line,"KEEPCASE",8) == 0) {
664 if (parse_flag(line, &keepcase, afflst)) {
665 finishFileMgr(afflst);
666 return 1;
667 }
668 }
669
670 /* parse in the flag used by `forceucase' */
671 if (strncmp(line,"FORCEUCASE",10) == 0) {
672 if (parse_flag(line, &forceucase, afflst)) {
673 finishFileMgr(afflst);
674 return 1;
675 }
676 }
677
678 /* parse in the flag used by `warn' */
679 if (strncmp(line,"WARN",4) == 0) {
680 if (parse_flag(line, &warn, afflst)) {
681 finishFileMgr(afflst);
682 return 1;
683 }
684 }
685
686 if (strncmp(line,"FORBIDWARN",10) == 0) {
687 forbidwarn=1;
688 }
689
690 /* parse in the flag used by the affix generator */
691 if (strncmp(line,"SUBSTANDARD",11) == 0) {
692 if (parse_flag(line, &substandard, afflst)) {
693 finishFileMgr(afflst);
694 return 1;
695 }
696 }
697
698 if (strncmp(line,"CHECKSHARPS",11) == 0) {
699 checksharps=1;
700 }
701
702 /* parse this affix: P - prefix, S - suffix */
703 ft = ' ';
704 if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
705 if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
706 if (ft != ' ') {
707 if (dupflags_ini) {
708 memset(dupflags, 0, sizeof(dupflags));
709 dupflags_ini = 0;
710 }
711 if (parse_affix(line, ft, afflst, dupflags)) {
712 finishFileMgr(afflst);
713 return 1;
714 }
715 }
716 }
717
718 finishFileMgr(afflst);
719 // affix trees are sorted now
720
721 // now we can speed up performance greatly taking advantage of the
722 // relationship between the affixes and the idea of "subsets".
723
724 // View each prefix as a potential leading subset of another and view
725 // each suffix (reversed) as a potential trailing subset of another.
726
727 // To illustrate this relationship if we know the prefix "ab" is found in the
728 // word to examine, only prefixes that "ab" is a leading subset of need be examined.
729 // Furthermore is "ab" is not present then none of the prefixes that "ab" is
730 // is a subset need be examined.
731 // The same argument goes for suffix string that are reversed.
732
733 // Then to top this off why not examine the first char of the word to quickly
734 // limit the set of prefixes to examine (i.e. the prefixes to examine must
735 // be leading supersets of the first character of the word (if they exist)
736
737 // To take advantage of this "subset" relationship, we need to add two links
738 // from entry. One to take next if the current prefix is found (call it nexteq)
739 // and one to take next if the current prefix is not found (call it nextne).
740
741 // Since we have built ordered lists, all that remains is to properly initialize
742 // the nextne and nexteq pointers that relate them
743
744 process_pfx_order();
745 process_sfx_order();
746
747 /* get encoding for CHECKCOMPOUNDCASE */
748 if (!utf8) {
749 char * enc = get_encoding();
750 csconv = get_current_cs(enc);
751 free(enc);
752 enc = NULL;
753
754 char expw[MAXLNLEN];
755 if (wordchars) {
756 strcpy(expw, wordchars);
757 free(wordchars);
758 } else *expw = '\0';
759
760 for (int i = 0; i <= 255; i++) {
761 if ( (csconv[i].cupper != csconv[i].clower) &&
762 (! strchr(expw, (char) i))) {
763 *(expw + strlen(expw) + 1) = '\0';
764 *(expw + strlen(expw)) = (char) i;
765 }
766 }
767
768 wordchars = mystrdup(expw);
769 }
770
771 // default BREAK definition
772 if (numbreak == -1) {
773 breaktable = (char **) malloc(sizeof(char *) * 3);
774 if (!breaktable) return 1;
775 breaktable[0] = mystrdup("-");
776 breaktable[1] = mystrdup("^-");
777 breaktable[2] = mystrdup("-$");
778 if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3;
779 }
780 return 0;
781 }
782
783
784 // we want to be able to quickly access prefix information
785 // both by prefix flag, and sorted by prefix string itself
786 // so we need to set up two indexes
787
build_pfxtree(PfxEntry * pfxptr)788 int AffixMgr::build_pfxtree(PfxEntry* pfxptr)
789 {
790 PfxEntry * ptr;
791 PfxEntry * pptr;
792 PfxEntry * ep = pfxptr;
793
794 // get the right starting points
795 const char * key = ep->getKey();
796 const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
797
798 // first index by flag which must exist
799 ptr = pFlag[flg];
800 ep->setFlgNxt(ptr);
801 pFlag[flg] = ep;
802
803
804 // handle the special case of null affix string
805 if (strlen(key) == 0) {
806 // always inset them at head of list at element 0
807 ptr = pStart[0];
808 ep->setNext(ptr);
809 pStart[0] = ep;
810 return 0;
811 }
812
813 // now handle the normal case
814 ep->setNextEQ(NULL);
815 ep->setNextNE(NULL);
816
817 unsigned char sp = *((const unsigned char *)key);
818 ptr = pStart[sp];
819
820 // handle the first insert
821 if (!ptr) {
822 pStart[sp] = ep;
823 return 0;
824 }
825
826
827 // otherwise use binary tree insertion so that a sorted
828 // list can easily be generated later
829 pptr = NULL;
830 for (;;) {
831 pptr = ptr;
832 if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
833 ptr = ptr->getNextEQ();
834 if (!ptr) {
835 pptr->setNextEQ(ep);
836 break;
837 }
838 } else {
839 ptr = ptr->getNextNE();
840 if (!ptr) {
841 pptr->setNextNE(ep);
842 break;
843 }
844 }
845 }
846 return 0;
847 }
848
849 // we want to be able to quickly access suffix information
850 // both by suffix flag, and sorted by the reverse of the
851 // suffix string itself; so we need to set up two indexes
build_sfxtree(SfxEntry * sfxptr)852 int AffixMgr::build_sfxtree(SfxEntry* sfxptr)
853 {
854 SfxEntry * ptr;
855 SfxEntry * pptr;
856 SfxEntry * ep = sfxptr;
857
858 /* get the right starting point */
859 const char * key = ep->getKey();
860 const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
861
862 // first index by flag which must exist
863 ptr = sFlag[flg];
864 ep->setFlgNxt(ptr);
865 sFlag[flg] = ep;
866
867 // next index by affix string
868
869 // handle the special case of null affix string
870 if (strlen(key) == 0) {
871 // always inset them at head of list at element 0
872 ptr = sStart[0];
873 ep->setNext(ptr);
874 sStart[0] = ep;
875 return 0;
876 }
877
878 // now handle the normal case
879 ep->setNextEQ(NULL);
880 ep->setNextNE(NULL);
881
882 unsigned char sp = *((const unsigned char *)key);
883 ptr = sStart[sp];
884
885 // handle the first insert
886 if (!ptr) {
887 sStart[sp] = ep;
888 return 0;
889 }
890
891 // otherwise use binary tree insertion so that a sorted
892 // list can easily be generated later
893 pptr = NULL;
894 for (;;) {
895 pptr = ptr;
896 if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
897 ptr = ptr->getNextEQ();
898 if (!ptr) {
899 pptr->setNextEQ(ep);
900 break;
901 }
902 } else {
903 ptr = ptr->getNextNE();
904 if (!ptr) {
905 pptr->setNextNE(ep);
906 break;
907 }
908 }
909 }
910 return 0;
911 }
912
913 // convert from binary tree to sorted list
process_pfx_tree_to_list()914 int AffixMgr::process_pfx_tree_to_list()
915 {
916 for (int i=1; i< SETSIZE; i++) {
917 pStart[i] = process_pfx_in_order(pStart[i],NULL);
918 }
919 return 0;
920 }
921
922
process_pfx_in_order(PfxEntry * ptr,PfxEntry * nptr)923 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr)
924 {
925 if (ptr) {
926 nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
927 ptr->setNext(nptr);
928 nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
929 }
930 return nptr;
931 }
932
933
934 // convert from binary tree to sorted list
process_sfx_tree_to_list()935 int AffixMgr:: process_sfx_tree_to_list()
936 {
937 for (int i=1; i< SETSIZE; i++) {
938 sStart[i] = process_sfx_in_order(sStart[i],NULL);
939 }
940 return 0;
941 }
942
process_sfx_in_order(SfxEntry * ptr,SfxEntry * nptr)943 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr)
944 {
945 if (ptr) {
946 nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
947 ptr->setNext(nptr);
948 nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
949 }
950 return nptr;
951 }
952
953
954 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
955 // using the idea of leading subsets this time
process_pfx_order()956 int AffixMgr::process_pfx_order()
957 {
958 PfxEntry* ptr;
959
960 // loop through each prefix list starting point
961 for (int i=1; i < SETSIZE; i++) {
962
963 ptr = pStart[i];
964
965 // look through the remainder of the list
966 // and find next entry with affix that
967 // the current one is not a subset of
968 // mark that as destination for NextNE
969 // use next in list that you are a subset
970 // of as NextEQ
971
972 for (; ptr != NULL; ptr = ptr->getNext()) {
973
974 PfxEntry * nptr = ptr->getNext();
975 for (; nptr != NULL; nptr = nptr->getNext()) {
976 if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
977 }
978 ptr->setNextNE(nptr);
979 ptr->setNextEQ(NULL);
980 if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
981 ptr->setNextEQ(ptr->getNext());
982 }
983
984 // now clean up by adding smart search termination strings:
985 // if you are already a superset of the previous prefix
986 // but not a subset of the next, search can end here
987 // so set NextNE properly
988
989 ptr = pStart[i];
990 for (; ptr != NULL; ptr = ptr->getNext()) {
991 PfxEntry * nptr = ptr->getNext();
992 PfxEntry * mptr = NULL;
993 for (; nptr != NULL; nptr = nptr->getNext()) {
994 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
995 mptr = nptr;
996 }
997 if (mptr) mptr->setNextNE(NULL);
998 }
999 }
1000 return 0;
1001 }
1002
1003 // initialize the SfxEntry links NextEQ and NextNE to speed searching
1004 // using the idea of leading subsets this time
process_sfx_order()1005 int AffixMgr::process_sfx_order()
1006 {
1007 SfxEntry* ptr;
1008
1009 // loop through each prefix list starting point
1010 for (int i=1; i < SETSIZE; i++) {
1011
1012 ptr = sStart[i];
1013
1014 // look through the remainder of the list
1015 // and find next entry with affix that
1016 // the current one is not a subset of
1017 // mark that as destination for NextNE
1018 // use next in list that you are a subset
1019 // of as NextEQ
1020
1021 for (; ptr != NULL; ptr = ptr->getNext()) {
1022 SfxEntry * nptr = ptr->getNext();
1023 for (; nptr != NULL; nptr = nptr->getNext()) {
1024 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
1025 }
1026 ptr->setNextNE(nptr);
1027 ptr->setNextEQ(NULL);
1028 if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
1029 ptr->setNextEQ(ptr->getNext());
1030 }
1031
1032
1033 // now clean up by adding smart search termination strings:
1034 // if you are already a superset of the previous suffix
1035 // but not a subset of the next, search can end here
1036 // so set NextNE properly
1037
1038 ptr = sStart[i];
1039 for (; ptr != NULL; ptr = ptr->getNext()) {
1040 SfxEntry * nptr = ptr->getNext();
1041 SfxEntry * mptr = NULL;
1042 for (; nptr != NULL; nptr = nptr->getNext()) {
1043 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
1044 mptr = nptr;
1045 }
1046 if (mptr) mptr->setNextNE(NULL);
1047 }
1048 }
1049 return 0;
1050 }
1051
1052 // add flags to the result for dictionary debugging
debugflag(char * result,unsigned short flag)1053 void AffixMgr::debugflag(char * result, unsigned short flag) {
1054 char * st = encode_flag(flag);
1055 mystrcat(result, " ", MAXLNLEN);
1056 mystrcat(result, MORPH_FLAG, MAXLNLEN);
1057 if (st) {
1058 mystrcat(result, st, MAXLNLEN);
1059 free(st);
1060 }
1061 }
1062
1063 // calculate the character length of the condition
condlen(char * st)1064 int AffixMgr::condlen(char * st)
1065 {
1066 int l = 0;
1067 bool group = false;
1068 for(; *st; st++) {
1069 if (*st == '[') {
1070 group = true;
1071 l++;
1072 } else if (*st == ']') group = false;
1073 else if (!group && (!utf8 ||
1074 (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;
1075 }
1076 return l;
1077 }
1078
encodeit(affentry & entry,char * cs)1079 int AffixMgr::encodeit(affentry &entry, char * cs)
1080 {
1081 if (strcmp(cs,".") != 0) {
1082 entry.numconds = (char) condlen(cs);
1083 strncpy(entry.c.conds, cs, MAXCONDLEN);
1084 // long condition (end of conds padded by strncpy)
1085 if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {
1086 entry.opts += aeLONGCOND;
1087 entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
1088 if (!entry.c.l.conds2) return 1;
1089 }
1090 } else {
1091 entry.numconds = 0;
1092 entry.c.conds[0] = '\0';
1093 }
1094 return 0;
1095 }
1096
1097 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
isSubset(const char * s1,const char * s2)1098 inline int AffixMgr::isSubset(const char * s1, const char * s2)
1099 {
1100 while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
1101 s1++;
1102 s2++;
1103 }
1104 return (*s1 == '\0');
1105 }
1106
1107
1108 // check word for prefixes
prefix_check(const char * word,int len,char in_compound,const FLAG needflag)1109 struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
1110 const FLAG needflag)
1111 {
1112 struct hentry * rv= NULL;
1113
1114 pfx = NULL;
1115 pfxappnd = NULL;
1116 sfxappnd = NULL;
1117
1118 // first handle the special case of 0 length prefixes
1119 PfxEntry * pe = pStart[0];
1120 while (pe) {
1121 if (
1122 // fogemorpheme
1123 ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
1124 (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
1125 // permit prefixes in compounds
1126 ((in_compound != IN_CPD_END) || (pe->getCont() &&
1127 (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
1128 ) {
1129 // check prefix
1130 rv = pe->checkword(word, len, in_compound, needflag);
1131 if (rv) {
1132 pfx=pe; // BUG: pfx not stateless
1133 return rv;
1134 }
1135 }
1136 pe = pe->getNext();
1137 }
1138
1139 // now handle the general case
1140 unsigned char sp = *((const unsigned char *)word);
1141 PfxEntry * pptr = pStart[sp];
1142
1143 while (pptr) {
1144 if (isSubset(pptr->getKey(),word)) {
1145 if (
1146 // fogemorpheme
1147 ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
1148 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
1149 // permit prefixes in compounds
1150 ((in_compound != IN_CPD_END) || (pptr->getCont() &&
1151 (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
1152 ) {
1153 // check prefix
1154 rv = pptr->checkword(word, len, in_compound, needflag);
1155 if (rv) {
1156 pfx=pptr; // BUG: pfx not stateless
1157 return rv;
1158 }
1159 }
1160 pptr = pptr->getNextEQ();
1161 } else {
1162 pptr = pptr->getNextNE();
1163 }
1164 }
1165
1166 return NULL;
1167 }
1168
1169 // check word for prefixes
prefix_check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)1170 struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
1171 char in_compound, const FLAG needflag)
1172 {
1173 struct hentry * rv= NULL;
1174
1175 pfx = NULL;
1176 sfxappnd = NULL;
1177
1178 // first handle the special case of 0 length prefixes
1179 PfxEntry * pe = pStart[0];
1180
1181 while (pe) {
1182 rv = pe->check_twosfx(word, len, in_compound, needflag);
1183 if (rv) return rv;
1184 pe = pe->getNext();
1185 }
1186
1187 // now handle the general case
1188 unsigned char sp = *((const unsigned char *)word);
1189 PfxEntry * pptr = pStart[sp];
1190
1191 while (pptr) {
1192 if (isSubset(pptr->getKey(),word)) {
1193 rv = pptr->check_twosfx(word, len, in_compound, needflag);
1194 if (rv) {
1195 pfx = pptr;
1196 return rv;
1197 }
1198 pptr = pptr->getNextEQ();
1199 } else {
1200 pptr = pptr->getNextNE();
1201 }
1202 }
1203
1204 return NULL;
1205 }
1206
1207 // check word for prefixes
prefix_check_morph(const char * word,int len,char in_compound,const FLAG needflag)1208 char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
1209 const FLAG needflag)
1210 {
1211 char * st;
1212
1213 char result[MAXLNLEN];
1214 result[0] = '\0';
1215
1216 pfx = NULL;
1217 sfxappnd = NULL;
1218
1219 // first handle the special case of 0 length prefixes
1220 PfxEntry * pe = pStart[0];
1221 while (pe) {
1222 st = pe->check_morph(word,len,in_compound, needflag);
1223 if (st) {
1224 mystrcat(result, st, MAXLNLEN);
1225 free(st);
1226 }
1227 // if (rv) return rv;
1228 pe = pe->getNext();
1229 }
1230
1231 // now handle the general case
1232 unsigned char sp = *((const unsigned char *)word);
1233 PfxEntry * pptr = pStart[sp];
1234
1235 while (pptr) {
1236 if (isSubset(pptr->getKey(),word)) {
1237 st = pptr->check_morph(word,len,in_compound, needflag);
1238 if (st) {
1239 // fogemorpheme
1240 if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
1241 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
1242 mystrcat(result, st, MAXLNLEN);
1243 pfx = pptr;
1244 }
1245 free(st);
1246 }
1247 pptr = pptr->getNextEQ();
1248 } else {
1249 pptr = pptr->getNextNE();
1250 }
1251 }
1252
1253 if (*result) return mystrdup(result);
1254 return NULL;
1255 }
1256
1257
1258 // check word for prefixes
prefix_check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)1259 char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
1260 char in_compound, const FLAG needflag)
1261 {
1262 char * st;
1263
1264 char result[MAXLNLEN];
1265 result[0] = '\0';
1266
1267 pfx = NULL;
1268 sfxappnd = NULL;
1269
1270 // first handle the special case of 0 length prefixes
1271 PfxEntry * pe = pStart[0];
1272 while (pe) {
1273 st = pe->check_twosfx_morph(word,len,in_compound, needflag);
1274 if (st) {
1275 mystrcat(result, st, MAXLNLEN);
1276 free(st);
1277 }
1278 pe = pe->getNext();
1279 }
1280
1281 // now handle the general case
1282 unsigned char sp = *((const unsigned char *)word);
1283 PfxEntry * pptr = pStart[sp];
1284
1285 while (pptr) {
1286 if (isSubset(pptr->getKey(),word)) {
1287 st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
1288 if (st) {
1289 mystrcat(result, st, MAXLNLEN);
1290 free(st);
1291 pfx = pptr;
1292 }
1293 pptr = pptr->getNextEQ();
1294 } else {
1295 pptr = pptr->getNextNE();
1296 }
1297 }
1298
1299 if (*result) return mystrdup(result);
1300 return NULL;
1301 }
1302
1303 // Is word a non compound with a REP substitution (see checkcompoundrep)?
cpdrep_check(const char * word,int wl)1304 int AffixMgr::cpdrep_check(const char * word, int wl)
1305 {
1306 char candidate[MAXLNLEN];
1307 const char * r;
1308 int lenr, lenp;
1309
1310 if ((wl < 2) || !numrep) return 0;
1311
1312 for (int i=0; i < numrep; i++ ) {
1313 r = word;
1314 lenr = strlen(reptable[i].pattern2);
1315 lenp = strlen(reptable[i].pattern);
1316 // search every occurence of the pattern in the word
1317 while ((r=strstr(r, reptable[i].pattern)) != NULL) {
1318 strcpy(candidate, word);
1319 if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
1320 strcpy(candidate+(r-word),reptable[i].pattern2);
1321 strcpy(candidate+(r-word)+lenr, r+lenp);
1322 if (candidate_check(candidate,strlen(candidate))) return 1;
1323 r++; // search for the next letter
1324 }
1325 }
1326 return 0;
1327 }
1328
1329 // forbid compoundings when there are special patterns at word bound
cpdpat_check(const char * word,int pos,hentry * r1,hentry * r2,const char)1330 int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2, const char /*affixed*/)
1331 {
1332 int len;
1333 for (int i = 0; i < numcheckcpd; i++) {
1334 if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
1335 (!r1 || !checkcpdtable[i].cond ||
1336 (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
1337 (!r2 || !checkcpdtable[i].cond2 ||
1338 (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
1339 // zero length pattern => only TESTAFF
1340 // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
1341 (!*(checkcpdtable[i].pattern) || (
1342 (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
1343 (*(checkcpdtable[i].pattern)!='0' && ((len = strlen(checkcpdtable[i].pattern)) != 0) &&
1344 strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) {
1345 return 1;
1346 }
1347 }
1348 return 0;
1349 }
1350
1351 // forbid compounding with neighbouring upper and lower case characters at word bounds
cpdcase_check(const char * word,int pos)1352 int AffixMgr::cpdcase_check(const char * word, int pos)
1353 {
1354 if (utf8) {
1355 w_char u, w;
1356 const char * p;
1357 u8_u16(&u, 1, word + pos);
1358 for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
1359 u8_u16(&w, 1, p);
1360 unsigned short a = (u.h << 8) + u.l;
1361 unsigned short b = (w.h << 8) + w.l;
1362 if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) &&
1363 (a != '-') && (b != '-')) return 1;
1364 } else {
1365 unsigned char a = *(word + pos - 1);
1366 unsigned char b = *(word + pos);
1367 if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
1368 }
1369 return 0;
1370 }
1371
1372 // check compound patterns
defcpd_check(hentry *** words,short wnum,hentry * rv,hentry ** def,char all)1373 int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
1374 {
1375 signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
1376 signed short btwp[MAXWORDLEN]; // word positions for metacharacters
1377 int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
1378 short bt = 0;
1379 int i, j;
1380 int ok;
1381 int w = 0;
1382
1383 if (!*words) {
1384 w = 1;
1385 *words = def;
1386 }
1387
1388 if (!*words) {
1389 return 0;
1390 }
1391
1392 (*words)[wnum] = rv;
1393
1394 // has the last word COMPOUNDRULE flag?
1395 if (rv->alen == 0) {
1396 (*words)[wnum] = NULL;
1397 if (w) *words = NULL;
1398 return 0;
1399 }
1400 ok = 0;
1401 for (i = 0; i < numdefcpd; i++) {
1402 for (j = 0; j < defcpdtable[i].len; j++) {
1403 if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&
1404 TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) {
1405 ok = 1;
1406 break;
1407 }
1408 }
1409 }
1410 if (ok == 0) {
1411 (*words)[wnum] = NULL;
1412 if (w) *words = NULL;
1413 return 0;
1414 }
1415
1416 for (i = 0; i < numdefcpd; i++) {
1417 signed short pp = 0; // pattern position
1418 signed short wp = 0; // "words" position
1419 int ok2;
1420 ok = 1;
1421 ok2 = 1;
1422 do {
1423 while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
1424 if (((pp+1) < defcpdtable[i].len) &&
1425 ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
1426 int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
1427 ok2 = 1;
1428 pp+=2;
1429 btpp[bt] = pp;
1430 btwp[bt] = wp;
1431 while (wp <= wend) {
1432 if (!(*words)[wp]->alen ||
1433 !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
1434 ok2 = 0;
1435 break;
1436 }
1437 wp++;
1438 }
1439 if (wp <= wnum) ok2 = 0;
1440 btnum[bt] = wp - btwp[bt];
1441 if (btnum[bt] > 0) bt++;
1442 if (ok2) break;
1443 } else {
1444 ok2 = 1;
1445 if (!(*words)[wp] || !(*words)[wp]->alen ||
1446 !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
1447 ok = 0;
1448 break;
1449 }
1450 pp++;
1451 wp++;
1452 if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
1453 }
1454 }
1455 if (ok && ok2) {
1456 int r = pp;
1457 while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
1458 ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
1459 if (defcpdtable[i].len <= r) return 1;
1460 }
1461 // backtrack
1462 if (bt) do {
1463 ok = 1;
1464 btnum[bt - 1]--;
1465 pp = btpp[bt - 1];
1466 wp = btwp[bt - 1] + (signed short) btnum[bt - 1];
1467 } while ((btnum[bt - 1] < 0) && --bt);
1468 } while (bt);
1469
1470 if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
1471
1472 // check zero ending
1473 while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
1474 ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
1475 if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
1476 }
1477 (*words)[wnum] = NULL;
1478 if (w) *words = NULL;
1479 return 0;
1480 }
1481
candidate_check(const char * word,int len)1482 inline int AffixMgr::candidate_check(const char * word, int len)
1483 {
1484 struct hentry * rv=NULL;
1485
1486 rv = lookup(word);
1487 if (rv) return 1;
1488
1489 // rv = prefix_check(word,len,1);
1490 // if (rv) return 1;
1491
1492 rv = affix_check(word,len);
1493 if (rv) return 1;
1494 return 0;
1495 }
1496
1497 // calculate number of syllable for compound-checking
get_syllable(const char * word,int wlen)1498 short AffixMgr::get_syllable(const char * word, int wlen)
1499 {
1500 if (cpdmaxsyllable==0) return 0;
1501
1502 short num=0;
1503
1504 if (!utf8) {
1505 for (int i=0; i<wlen; i++) {
1506 if (strchr(cpdvowels, word[i])) num++;
1507 }
1508 } else if (cpdvowels_utf16) {
1509 w_char w[MAXWORDUTF8LEN];
1510 int i = u8_u16(w, MAXWORDUTF8LEN, word);
1511 for (; i > 0; i--) {
1512 if (flag_bsearch((unsigned short *) cpdvowels_utf16,
1513 ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
1514 }
1515 }
1516 return num;
1517 }
1518
setcminmax(int * cmin,int * cmax,const char * word,int len)1519 void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) {
1520 if (utf8) {
1521 int i;
1522 for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {
1523 for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++);
1524 }
1525 for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {
1526 for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--);
1527 }
1528 } else {
1529 *cmin = cpdmin;
1530 *cmax = len - cpdmin + 1;
1531 }
1532 }
1533
1534
1535 // check if compound word is correctly spelled
1536 // hu_mov_rule = spec. Hungarian rule (XXX)
compound_check(const char * word,int len,short wordnum,short numsyllable,short maxwordnum,short wnum,hentry ** words=NULL,char hu_mov_rule=0,char is_sug=0,int * info=NULL)1537 struct hentry * AffixMgr::compound_check(const char * word, int len,
1538 short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
1539 char hu_mov_rule = 0, char is_sug = 0, int * info = NULL)
1540 {
1541 int i;
1542 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1543 struct hentry * rv = NULL;
1544 struct hentry * rv_first;
1545 struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
1546 char st [MAXWORDUTF8LEN + 4];
1547 char ch = '\0';
1548 int cmin;
1549 int cmax;
1550 int striple = 0;
1551 int scpd = 0;
1552 int soldi = 0;
1553 int oldcmin = 0;
1554 int oldcmax = 0;
1555 int oldlen = 0;
1556 int checkedstriple = 0;
1557 int onlycpdrule;
1558 char affixed = 0;
1559 hentry ** oldwords = words;
1560
1561 int checked_prefix;
1562
1563 setcminmax(&cmin, &cmax, word, len);
1564
1565 strcpy(st, word);
1566
1567 for (i = cmin; i < cmax; i++) {
1568 // go to end of the UTF-8 character
1569 if (utf8) {
1570 for (; (st[i] & 0xc0) == 0x80; i++);
1571 if (i >= cmax) return NULL;
1572 }
1573
1574 words = oldwords;
1575 onlycpdrule = (words) ? 1 : 0;
1576
1577 do { // onlycpdrule loop
1578
1579 oldnumsyllable = numsyllable;
1580 oldwordnum = wordnum;
1581 checked_prefix = 0;
1582
1583
1584 do { // simplified checkcompoundpattern loop
1585
1586 if (scpd > 0) {
1587 for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 ||
1588 strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++);
1589
1590 if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop
1591 strcpy(st + i, checkcpdtable[scpd-1].pattern);
1592 soldi = i;
1593 i += strlen(checkcpdtable[scpd-1].pattern);
1594 strcpy(st + i, checkcpdtable[scpd-1].pattern2);
1595 strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3));
1596
1597 oldlen = len;
1598 len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3);
1599 oldcmin = cmin;
1600 oldcmax = cmax;
1601 setcminmax(&cmin, &cmax, st, len);
1602
1603 cmax = len - cpdmin + 1;
1604 }
1605
1606 ch = st[i];
1607 st[i] = '\0';
1608
1609 sfx = NULL;
1610 pfx = NULL;
1611
1612 // FIRST WORD
1613
1614 affixed = 1;
1615 rv = lookup(st); // perhaps without prefix
1616
1617 // search homonym with compound flag
1618 while ((rv) && !hu_mov_rule &&
1619 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1620 !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1621 (compoundbegin && !wordnum && !onlycpdrule &&
1622 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1623 (compoundmiddle && wordnum && !words && !onlycpdrule &&
1624 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
1625 (numdefcpd && onlycpdrule &&
1626 ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
1627 (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) ||
1628 (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL &&
1629 !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)))
1630 ) {
1631 rv = rv->next_homonym;
1632 }
1633
1634 if (rv) affixed = 0;
1635
1636 if (!rv) {
1637 if (onlycpdrule) break;
1638 if (compoundflag &&
1639 !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
1640 if (((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
1641 FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1642 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundflag)))) && !hu_mov_rule &&
1643 sfx->getCont() &&
1644 ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,
1645 sfx->getContLen())) || (compoundend &&
1646 TESTAFF(sfx->getCont(), compoundend,
1647 sfx->getContLen())))) {
1648 rv = NULL;
1649 }
1650 }
1651
1652 if (rv ||
1653 (((wordnum == 0) && compoundbegin &&
1654 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1655 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundbegin))) || // twofold suffixes + compound
1656 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
1657 ((wordnum > 0) && compoundmiddle &&
1658 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1659 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundmiddle))) || // twofold suffixes + compound
1660 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
1661 ) checked_prefix = 1;
1662 // else check forbiddenwords and needaffix
1663 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1664 TESTAFF(rv->astr, needaffix, rv->alen) ||
1665 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1666 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
1667 )) {
1668 st[i] = ch;
1669 //continue;
1670 break;
1671 }
1672
1673 // check non_compound flag in suffix and prefix
1674 if ((rv) && !hu_mov_rule &&
1675 ((pfx && pfx->getCont() &&
1676 TESTAFF(pfx->getCont(), compoundforbidflag,
1677 pfx->getContLen())) ||
1678 (sfx && sfx->getCont() &&
1679 TESTAFF(sfx->getCont(), compoundforbidflag,
1680 sfx->getContLen())))) {
1681 rv = NULL;
1682 }
1683
1684 // check compoundend flag in suffix and prefix
1685 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
1686 ((pfx && pfx->getCont() &&
1687 TESTAFF(pfx->getCont(), compoundend,
1688 pfx->getContLen())) ||
1689 (sfx && sfx->getCont() &&
1690 TESTAFF(sfx->getCont(), compoundend,
1691 sfx->getContLen())))) {
1692 rv = NULL;
1693 }
1694
1695 // check compoundmiddle flag in suffix and prefix
1696 if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
1697 ((pfx && pfx->getCont() &&
1698 TESTAFF(pfx->getCont(), compoundmiddle,
1699 pfx->getContLen())) ||
1700 (sfx && sfx->getCont() &&
1701 TESTAFF(sfx->getCont(), compoundmiddle,
1702 sfx->getContLen())))) {
1703 rv = NULL;
1704 }
1705
1706 // check forbiddenwords
1707 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1708 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1709 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1710 return NULL;
1711 }
1712
1713 // increment word number, if the second root has a compoundroot flag
1714 if ((rv) && compoundroot &&
1715 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1716 wordnum++;
1717 }
1718
1719 // first word is acceptable in compound words?
1720 if (((rv) &&
1721 ( checked_prefix || (words && words[wnum]) ||
1722 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1723 ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1724 ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||
1725 // (numdefcpd && )
1726
1727 // LANG_hu section: spec. Hungarian rule
1728 || ((langnum == LANG_hu) && hu_mov_rule && (
1729 TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes
1730 TESTAFF(rv->astr, 'G', rv->alen) ||
1731 TESTAFF(rv->astr, 'H', rv->alen)
1732 )
1733 )
1734 // END of LANG_hu section
1735 ) &&
1736 (
1737 // test CHECKCOMPOUNDPATTERN conditions
1738 scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL ||
1739 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen)
1740 )
1741 && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple letters
1742 (word[i-1]==word[i]) && (
1743 ((i>1) && (word[i-1]==word[i-2])) ||
1744 ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
1745 )
1746 ) ||
1747 (
1748 checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i)
1749 ))
1750 )
1751 // LANG_hu section: spec. Hungarian rule
1752 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
1753 (sfx && sfx->getCont() && ( // XXX hardwired Hungarian dic. codes
1754 TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) ||
1755 TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen())
1756 )
1757 )
1758 )
1759 ) { // first word is ok condition
1760
1761 // LANG_hu section: spec. Hungarian rule
1762 if (langnum == LANG_hu) {
1763 // calculate syllable number of the word
1764 numsyllable += get_syllable(st, i);
1765 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1766 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
1767 }
1768 // END of LANG_hu section
1769
1770 // NEXT WORD(S)
1771 rv_first = rv;
1772 st[i] = ch;
1773
1774 do { // striple loop
1775
1776 // check simplifiedtriple
1777 if (simplifiedtriple) {
1778 if (striple) {
1779 checkedstriple = 1;
1780 i--; // check "fahrt" instead of "ahrt" in "Schiffahrt"
1781 } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1;
1782 }
1783
1784 rv = lookup((st+i)); // perhaps without prefix
1785
1786 // search homonym with compound flag
1787 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1788 !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1789 (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
1790 (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) ||
1791 (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL &&
1792 !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))
1793 )) {
1794 rv = rv->next_homonym;
1795 }
1796
1797 // check FORCEUCASE
1798 if (rv && forceucase && (rv) &&
1799 (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL;
1800
1801 if (rv && words && words[wnum + 1]) return rv_first;
1802
1803 oldnumsyllable2 = numsyllable;
1804 oldwordnum2 = wordnum;
1805
1806
1807 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
1808 if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
1809 numsyllable--;
1810 }
1811 // END of LANG_hu section
1812
1813 // increment word number, if the second root has a compoundroot flag
1814 if ((rv) && (compoundroot) &&
1815 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1816 wordnum++;
1817 }
1818
1819 // check forbiddenwords
1820 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1821 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1822 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
1823
1824 // second word is acceptable, as a root?
1825 // hungarian conventions: compounding is acceptable,
1826 // when compound forms consist of 2 words, or if more,
1827 // then the syllable number of root words must be 6, or lesser.
1828
1829 if ((rv) && (
1830 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1831 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
1832 )
1833 && (
1834 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
1835 ((cpdmaxsyllable!=0) &&
1836 (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable))
1837 ) &&
1838 (
1839 // test CHECKCOMPOUNDPATTERN
1840 !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv, 0)
1841 ) &&
1842 (
1843 (!checkcompounddup || (rv != rv_first))
1844 )
1845 // test CHECKCOMPOUNDPATTERN conditions
1846 && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
1847 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))
1848 )
1849 {
1850 // forbid compound word, if it is a non compound word with typical fault
1851 if (checkcompoundrep && cpdrep_check(word,len)) return NULL;
1852 return rv_first;
1853 }
1854
1855 numsyllable = oldnumsyllable2;
1856 wordnum = oldwordnum2;
1857
1858 // perhaps second word has prefix or/and suffix
1859 sfx = NULL;
1860 sfxflag = FLAG_NULL;
1861 rv = (compoundflag && !onlycpdrule) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL;
1862 if (!rv && compoundend && !onlycpdrule) {
1863 sfx = NULL;
1864 pfx = NULL;
1865 rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END);
1866 }
1867
1868 if (!rv && numdefcpd && words) {
1869 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
1870 if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first;
1871 rv = NULL;
1872 }
1873
1874 // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1875 if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL ||
1876 TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL;
1877
1878 // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
1879 if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) rv = NULL;
1880
1881 // check non_compound flag in suffix and prefix
1882 if ((rv) &&
1883 ((pfx && pfx->getCont() &&
1884 TESTAFF(pfx->getCont(), compoundforbidflag,
1885 pfx->getContLen())) ||
1886 (sfx && sfx->getCont() &&
1887 TESTAFF(sfx->getCont(), compoundforbidflag,
1888 sfx->getContLen())))) {
1889 rv = NULL;
1890 }
1891
1892 // check FORCEUCASE
1893 if (rv && forceucase && (rv) &&
1894 (TESTAFF(rv->astr, forceucase, rv->alen)) && !(info && *info & SPELL_ORIGCAP)) rv = NULL;
1895
1896 // check forbiddenwords
1897 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1898 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1899 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
1900
1901 // pfxappnd = prefix of word+i, or NULL
1902 // calculate syllable number of prefix.
1903 // hungarian convention: when syllable number of prefix is more,
1904 // than 1, the prefix+word counts as two words.
1905
1906 if (langnum == LANG_hu) {
1907 // calculate syllable number of the word
1908 numsyllable += get_syllable(word + i, strlen(word + i));
1909
1910 // - affix syllable num.
1911 // XXX only second suffix (inflections, not derivations)
1912 if (sfxappnd) {
1913 char * tmp = myrevstrdup(sfxappnd);
1914 numsyllable -= get_syllable(tmp, strlen(tmp));
1915 free(tmp);
1916 }
1917
1918 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1919 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
1920
1921 // increment syllable num, if last word has a SYLLABLENUM flag
1922 // and the suffix is beginning `s'
1923
1924 if (cpdsyllablenum) {
1925 switch (sfxflag) {
1926 case 'c': { numsyllable+=2; break; }
1927 case 'J': { numsyllable += 1; break; }
1928 case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
1929 }
1930 }
1931 }
1932
1933 // increment word number, if the second word has a compoundroot flag
1934 if ((rv) && (compoundroot) &&
1935 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1936 wordnum++;
1937 }
1938
1939 // second word is acceptable, as a word with prefix or/and suffix?
1940 // hungarian conventions: compounding is acceptable,
1941 // when compound forms consist 2 word, otherwise
1942 // the syllable number of root words is 6, or lesser.
1943 if ((rv) &&
1944 (
1945 ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
1946 ((cpdmaxsyllable != 0) &&
1947 (numsyllable <= cpdmaxsyllable))
1948 )
1949 && (
1950 (!checkcompounddup || (rv != rv_first))
1951 )) {
1952 // forbid compound word, if it is a non compound word with typical fault
1953 if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
1954 return rv_first;
1955 }
1956
1957 numsyllable = oldnumsyllable2;
1958 wordnum = oldwordnum2;
1959
1960 // perhaps second word is a compound word (recursive call)
1961 if (wordnum < maxwordnum) {
1962 rv = compound_check((st+i),strlen(st+i), wordnum+1,
1963 numsyllable, maxwordnum, wnum + 1, words, 0, is_sug, info);
1964
1965 if (rv && numcheckcpd && ((scpd == 0 && cpdpat_check(word, i, rv_first, rv, affixed)) ||
1966 (scpd != 0 && !cpdpat_check(word, i, rv_first, rv, affixed)))) rv = NULL;
1967 } else {
1968 rv=NULL;
1969 }
1970 if (rv) {
1971 // forbid compound word, if it is a non compound word with typical fault
1972 if (checkcompoundrep || forbiddenword) {
1973 struct hentry * rv2 = NULL;
1974
1975 if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
1976
1977 // check first part
1978 if (strncmp(rv->word, word + i, rv->blen) == 0) {
1979 char r = *(st + i + rv->blen);
1980 *(st + i + rv->blen) = '\0';
1981
1982 if (checkcompoundrep && cpdrep_check(st, i + rv->blen)) {
1983 *(st + i + rv->blen) = r;
1984 continue;
1985 }
1986
1987 if (forbiddenword) {
1988 rv2 = lookup(word);
1989 if (!rv2) rv2 = affix_check(word, len);
1990 if (rv2 && rv2->astr && TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
1991 (strncmp(rv2->word, st, i + rv->blen) == 0)) {
1992 return NULL;
1993 }
1994 }
1995 *(st + i + rv->blen) = r;
1996 }
1997 }
1998 return rv_first;
1999 }
2000 } while (striple && !checkedstriple); // end of striple loop
2001
2002 if (checkedstriple) {
2003 i++;
2004 checkedstriple = 0;
2005 striple = 0;
2006 }
2007
2008 } // first word is ok condition
2009
2010 if (soldi != 0) {
2011 i = soldi;
2012 soldi = 0;
2013 len = oldlen;
2014 cmin = oldcmin;
2015 cmax = oldcmax;
2016 }
2017 scpd++;
2018
2019
2020 } while (!onlycpdrule && simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop
2021
2022 scpd = 0;
2023 wordnum = oldwordnum;
2024 numsyllable = oldnumsyllable;
2025
2026 if (soldi != 0) {
2027 i = soldi;
2028 strcpy(st, word); // XXX add more optim.
2029 soldi = 0;
2030 } else st[i] = ch;
2031
2032 } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop
2033
2034 }
2035
2036 return NULL;
2037 }
2038
2039 // check if compound word is correctly spelled
2040 // hu_mov_rule = spec. Hungarian rule (XXX)
compound_check_morph(const char * word,int len,short wordnum,short numsyllable,short maxwordnum,short wnum,hentry ** words,char hu_mov_rule=0,char ** result=NULL,char * partresult=NULL)2041 int AffixMgr::compound_check_morph(const char * word, int len,
2042 short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
2043 char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL)
2044 {
2045 int i;
2046 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
2047 int ok = 0;
2048
2049 struct hentry * rv = NULL;
2050 struct hentry * rv_first;
2051 struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
2052 char st [MAXWORDUTF8LEN + 4];
2053 char ch;
2054
2055 int checked_prefix;
2056 char presult[MAXLNLEN];
2057
2058 int cmin;
2059 int cmax;
2060
2061 int onlycpdrule;
2062 char affixed = 0;
2063 hentry ** oldwords = words;
2064
2065 setcminmax(&cmin, &cmax, word, len);
2066
2067 strcpy(st, word);
2068
2069 for (i = cmin; i < cmax; i++) {
2070 oldnumsyllable = numsyllable;
2071 oldwordnum = wordnum;
2072 checked_prefix = 0;
2073
2074 // go to end of the UTF-8 character
2075 if (utf8) {
2076 for (; (st[i] & 0xc0) == 0x80; i++);
2077 if (i >= cmax) return 0;
2078 }
2079
2080 words = oldwords;
2081 onlycpdrule = (words) ? 1 : 0;
2082
2083 do { // onlycpdrule loop
2084
2085 oldnumsyllable = numsyllable;
2086 oldwordnum = wordnum;
2087 checked_prefix = 0;
2088
2089 ch = st[i];
2090 st[i] = '\0';
2091 sfx = NULL;
2092
2093 // FIRST WORD
2094
2095 affixed = 1;
2096
2097 *presult = '\0';
2098 if (partresult) mystrcat(presult, partresult, MAXLNLEN);
2099
2100 rv = lookup(st); // perhaps without prefix
2101
2102 // search homonym with compound flag
2103 while ((rv) && !hu_mov_rule &&
2104 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2105 !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2106 (compoundbegin && !wordnum && !onlycpdrule &&
2107 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2108 (compoundmiddle && wordnum && !words && !onlycpdrule &&
2109 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
2110 (numdefcpd && onlycpdrule &&
2111 ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
2112 (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
2113 ))) {
2114 rv = rv->next_homonym;
2115 }
2116
2117 if (rv) affixed = 0;
2118
2119 if (rv) {
2120 sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st);
2121 if (!HENTRY_FIND(rv, MORPH_STEM)) {
2122 sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st);
2123 }
2124 // store the pointer of the hash entry
2125 // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv);
2126 if (HENTRY_DATA(rv)) {
2127 sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv));
2128 }
2129 }
2130
2131 if (!rv) {
2132 if (onlycpdrule && strlen(*result) > MAXLNLEN/10) break;
2133 if (compoundflag &&
2134 !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
2135 if (((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
2136 FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2137 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundflag)))) && !hu_mov_rule &&
2138 sfx->getCont() &&
2139 ((compoundforbidflag && TESTAFF(sfx->getCont(), compoundforbidflag,
2140 sfx->getContLen())) || (compoundend &&
2141 TESTAFF(sfx->getCont(), compoundend,
2142 sfx->getContLen())))) {
2143 rv = NULL;
2144 }
2145 }
2146
2147 if (rv ||
2148 (((wordnum == 0) && compoundbegin &&
2149 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2150 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundbegin))) || // twofold suffix+compound
2151 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
2152 ((wordnum > 0) && compoundmiddle &&
2153 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2154 (compoundmoresuffixes && (rv = suffix_check_twosfx(st, i, 0, NULL, compoundmiddle))) || // twofold suffix+compound
2155 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
2156 ) {
2157 // char * p = prefix_check_morph(st, i, 0, compound);
2158 char * p = NULL;
2159 if (compoundflag) p = affix_check_morph(st, i, compoundflag);
2160 if (!p || (*p == '\0')) {
2161 if (p) free(p);
2162 p = NULL;
2163 if ((wordnum == 0) && compoundbegin) {
2164 p = affix_check_morph(st, i, compoundbegin);
2165 } else if ((wordnum > 0) && compoundmiddle) {
2166 p = affix_check_morph(st, i, compoundmiddle);
2167 }
2168 }
2169 if (p && (*p != '\0')) {
2170 sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD,
2171 MORPH_PART, st, line_uniq_app(&p, MSEP_REC));
2172 }
2173 if (p) free(p);
2174 checked_prefix = 1;
2175 }
2176 // else check forbiddenwords
2177 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2178 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
2179 TESTAFF(rv->astr, needaffix, rv->alen))) {
2180 st[i] = ch;
2181 continue;
2182 }
2183
2184 // check non_compound flag in suffix and prefix
2185 if ((rv) && !hu_mov_rule &&
2186 ((pfx && pfx->getCont() &&
2187 TESTAFF(pfx->getCont(), compoundforbidflag,
2188 pfx->getContLen())) ||
2189 (sfx && sfx->getCont() &&
2190 TESTAFF(sfx->getCont(), compoundforbidflag,
2191 sfx->getContLen())))) {
2192 continue;
2193 }
2194
2195 // check compoundend flag in suffix and prefix
2196 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
2197 ((pfx && pfx->getCont() &&
2198 TESTAFF(pfx->getCont(), compoundend,
2199 pfx->getContLen())) ||
2200 (sfx && sfx->getCont() &&
2201 TESTAFF(sfx->getCont(), compoundend,
2202 sfx->getContLen())))) {
2203 continue;
2204 }
2205
2206 // check compoundmiddle flag in suffix and prefix
2207 if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
2208 ((pfx && pfx->getCont() &&
2209 TESTAFF(pfx->getCont(), compoundmiddle,
2210 pfx->getContLen())) ||
2211 (sfx && sfx->getCont() &&
2212 TESTAFF(sfx->getCont(), compoundmiddle,
2213 sfx->getContLen())))) {
2214 rv = NULL;
2215 }
2216
2217 // check forbiddenwords
2218 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen)
2219 || TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) continue;
2220
2221 // increment word number, if the second root has a compoundroot flag
2222 if ((rv) && (compoundroot) &&
2223 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2224 wordnum++;
2225 }
2226
2227 // first word is acceptable in compound words?
2228 if (((rv) &&
2229 ( checked_prefix || (words && words[wnum]) ||
2230 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2231 ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2232 ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))
2233 // LANG_hu section: spec. Hungarian rule
2234 || ((langnum == LANG_hu) && // hu_mov_rule
2235 hu_mov_rule && (
2236 TESTAFF(rv->astr, 'F', rv->alen) ||
2237 TESTAFF(rv->astr, 'G', rv->alen) ||
2238 TESTAFF(rv->astr, 'H', rv->alen)
2239 )
2240 )
2241 // END of LANG_hu section
2242 )
2243 && ! (( checkcompoundtriple && !words && // test triple letters
2244 (word[i-1]==word[i]) && (
2245 ((i>1) && (word[i-1]==word[i-2])) ||
2246 ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
2247 )
2248 ) ||
2249 (
2250 // test CHECKCOMPOUNDPATTERN
2251 numcheckcpd && !words && cpdpat_check(word, i, rv, NULL, affixed)
2252 ) ||
2253 (
2254 checkcompoundcase && !words && cpdcase_check(word, i)
2255 ))
2256 )
2257 // LANG_hu section: spec. Hungarian rule
2258 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
2259 (sfx && sfx->getCont() && (
2260 TESTAFF(sfx->getCont(), (unsigned short) 'x', sfx->getContLen()) ||
2261 TESTAFF(sfx->getCont(), (unsigned short) '%', sfx->getContLen())
2262 )
2263 )
2264 )
2265 // END of LANG_hu section
2266 ) {
2267
2268 // LANG_hu section: spec. Hungarian rule
2269 if (langnum == LANG_hu) {
2270 // calculate syllable number of the word
2271 numsyllable += get_syllable(st, i);
2272
2273 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2274 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
2275 }
2276 // END of LANG_hu section
2277
2278 // NEXT WORD(S)
2279 rv_first = rv;
2280 rv = lookup((word+i)); // perhaps without prefix
2281
2282 // search homonym with compound flag
2283 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2284 !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2285 (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
2286 (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
2287 rv = rv->next_homonym;
2288 }
2289
2290 if (rv && words && words[wnum + 1]) {
2291 mystrcat(*result, presult, MAXLNLEN);
2292 mystrcat(*result, " ", MAXLNLEN);
2293 mystrcat(*result, MORPH_PART, MAXLNLEN);
2294 mystrcat(*result, word+i, MAXLNLEN);
2295 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2296 if (!HENTRY_FIND(rv, MORPH_STEM)) {
2297 mystrcat(*result, " ", MAXLNLEN);
2298 mystrcat(*result, MORPH_STEM, MAXLNLEN);
2299 mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN);
2300 }
2301 // store the pointer of the hash entry
2302 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2303 if (!complexprefixes && HENTRY_DATA(rv)) {
2304 mystrcat(*result, " ", MAXLNLEN);
2305 mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2306 }
2307 mystrcat(*result, "\n", MAXLNLEN);
2308 ok = 1;
2309 return 0;
2310 }
2311
2312 oldnumsyllable2 = numsyllable;
2313 oldwordnum2 = wordnum;
2314
2315 // LANG_hu section: spec. Hungarian rule
2316 if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
2317 numsyllable--;
2318 }
2319 // END of LANG_hu section
2320 // increment word number, if the second root has a compoundroot flag
2321 if ((rv) && (compoundroot) &&
2322 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2323 wordnum++;
2324 }
2325
2326 // check forbiddenwords
2327 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2328 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
2329 st[i] = ch;
2330 continue;
2331 }
2332
2333 // second word is acceptable, as a root?
2334 // hungarian conventions: compounding is acceptable,
2335 // when compound forms consist of 2 words, or if more,
2336 // then the syllable number of root words must be 6, or lesser.
2337 if ((rv) && (
2338 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2339 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
2340 )
2341 && (
2342 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
2343 ((cpdmaxsyllable!=0) &&
2344 (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable))
2345 )
2346 && (
2347 (!checkcompounddup || (rv != rv_first))
2348 )
2349 )
2350 {
2351 // bad compound word
2352 mystrcat(*result, presult, MAXLNLEN);
2353 mystrcat(*result, " ", MAXLNLEN);
2354 mystrcat(*result, MORPH_PART, MAXLNLEN);
2355 mystrcat(*result, word+i, MAXLNLEN);
2356
2357 if (HENTRY_DATA(rv)) {
2358 if (complexprefixes) mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2359 if (! HENTRY_FIND(rv, MORPH_STEM)) {
2360 mystrcat(*result, " ", MAXLNLEN);
2361 mystrcat(*result, MORPH_STEM, MAXLNLEN);
2362 mystrcat(*result, HENTRY_WORD(rv), MAXLNLEN);
2363 }
2364 // store the pointer of the hash entry
2365 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2366 if (!complexprefixes) {
2367 mystrcat(*result, " ", MAXLNLEN);
2368 mystrcat(*result, HENTRY_DATA2(rv), MAXLNLEN);
2369 }
2370 }
2371 mystrcat(*result, "\n", MAXLNLEN);
2372 ok = 1;
2373 }
2374
2375 numsyllable = oldnumsyllable2 ;
2376 wordnum = oldwordnum2;
2377
2378 // perhaps second word has prefix or/and suffix
2379 sfx = NULL;
2380 sfxflag = FLAG_NULL;
2381
2382 if (compoundflag && !onlycpdrule) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL;
2383
2384 if (!rv && compoundend && !onlycpdrule) {
2385 sfx = NULL;
2386 pfx = NULL;
2387 rv = affix_check((word+i),strlen(word+i), compoundend);
2388 }
2389
2390 if (!rv && numdefcpd && words) {
2391 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
2392 if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
2393 char * m = NULL;
2394 if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
2395 if ((!m || *m == '\0') && compoundend) {
2396 if (m) free(m);
2397 m = affix_check_morph((word+i),strlen(word+i), compoundend);
2398 }
2399 mystrcat(*result, presult, MAXLNLEN);
2400 if (m || (*m != '\0')) {
2401 sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
2402 MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
2403 }
2404 if (m) free(m);
2405 mystrcat(*result, "\n", MAXLNLEN);
2406 ok = 1;
2407 }
2408 }
2409
2410 // check non_compound flag in suffix and prefix
2411 if ((rv) &&
2412 ((pfx && pfx->getCont() &&
2413 TESTAFF(pfx->getCont(), compoundforbidflag,
2414 pfx->getContLen())) ||
2415 (sfx && sfx->getCont() &&
2416 TESTAFF(sfx->getCont(), compoundforbidflag,
2417 sfx->getContLen())))) {
2418 rv = NULL;
2419 }
2420
2421 // check forbiddenwords
2422 if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen) ||
2423 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))
2424 && (! TESTAFF(rv->astr, needaffix, rv->alen))) {
2425 st[i] = ch;
2426 continue;
2427 }
2428
2429 if (langnum == LANG_hu) {
2430 // calculate syllable number of the word
2431 numsyllable += get_syllable(word + i, strlen(word + i));
2432
2433 // - affix syllable num.
2434 // XXX only second suffix (inflections, not derivations)
2435 if (sfxappnd) {
2436 char * tmp = myrevstrdup(sfxappnd);
2437 numsyllable -= get_syllable(tmp, strlen(tmp));
2438 free(tmp);
2439 }
2440
2441 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2442 if (pfx && (get_syllable(pfx->getKey(),strlen(pfx->getKey())) > 1)) wordnum++;
2443
2444 // increment syllable num, if last word has a SYLLABLENUM flag
2445 // and the suffix is beginning `s'
2446
2447 if (cpdsyllablenum) {
2448 switch (sfxflag) {
2449 case 'c': { numsyllable+=2; break; }
2450 case 'J': { numsyllable += 1; break; }
2451 case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
2452 }
2453 }
2454 }
2455
2456 // increment word number, if the second word has a compoundroot flag
2457 if ((rv) && (compoundroot) &&
2458 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2459 wordnum++;
2460 }
2461 // second word is acceptable, as a word with prefix or/and suffix?
2462 // hungarian conventions: compounding is acceptable,
2463 // when compound forms consist 2 word, otherwise
2464 // the syllable number of root words is 6, or lesser.
2465 if ((rv) &&
2466 (
2467 ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
2468 ((cpdmaxsyllable!=0) &&
2469 (numsyllable <= cpdmaxsyllable))
2470 )
2471 && (
2472 (!checkcompounddup || (rv != rv_first))
2473 )) {
2474 char * m = NULL;
2475 if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
2476 if ((!m || *m == '\0') && compoundend) {
2477 if (m) free(m);
2478 m = affix_check_morph((word+i),strlen(word+i), compoundend);
2479 }
2480 mystrcat(*result, presult, MAXLNLEN);
2481 if (m && (*m != '\0')) {
2482 sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD,
2483 MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC));
2484 }
2485 if (m) free(m);
2486 sprintf(*result + strlen(*result), "%c", MSEP_REC);
2487 ok = 1;
2488 }
2489
2490 numsyllable = oldnumsyllable2;
2491 wordnum = oldwordnum2;
2492
2493 // perhaps second word is a compound word (recursive call)
2494 if ((wordnum < maxwordnum) && (ok == 0)) {
2495 compound_check_morph((word+i),strlen(word+i), wordnum+1,
2496 numsyllable, maxwordnum, wnum + 1, words, 0, result, presult);
2497 } else {
2498 rv=NULL;
2499 }
2500 }
2501 st[i] = ch;
2502 wordnum = oldwordnum;
2503 numsyllable = oldnumsyllable;
2504
2505 } while (numdefcpd && oldwordnum == 0 && !onlycpdrule && (onlycpdrule = 1)); // end of onlycpd loop
2506
2507 }
2508 return 0;
2509 }
2510
2511 // return 1 if s1 (reversed) is a leading subset of end of s2
2512 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2513 {
2514 while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {
2515 s1++;
2516 end_of_s2--;
2517 len--;
2518 }
2519 return (*s1 == '\0');
2520 }
2521 */
2522
isRevSubset(const char * s1,const char * end_of_s2,int len)2523 inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2524 {
2525 while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
2526 s1++;
2527 end_of_s2--;
2528 len--;
2529 }
2530 return (*s1 == '\0');
2531 }
2532
2533 // check word for suffixes
2534
suffix_check(const char * word,int len,int sfxopts,PfxEntry * ppfx,char ** wlst,int maxSug,int * ns,const FLAG cclass,const FLAG needflag,char in_compound)2535 struct hentry * AffixMgr::suffix_check (const char * word, int len,
2536 int sfxopts, PfxEntry * ppfx, char ** wlst, int maxSug, int * ns,
2537 const FLAG cclass, const FLAG needflag, char in_compound)
2538 {
2539 struct hentry * rv = NULL;
2540 PfxEntry* ep = ppfx;
2541
2542 // first handle the special case of 0 length suffixes
2543 SfxEntry * se = sStart[0];
2544
2545 while (se) {
2546 if (!cclass || se->getCont()) {
2547 // suffixes are not allowed in beginning of compounds
2548 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2549 // except when signed with compoundpermitflag flag
2550 (se->getCont() && compoundpermitflag &&
2551 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
2552 // no circumfix flag in prefix and suffix
2553 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2554 circumfix, ep->getContLen())) &&
2555 (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
2556 // circumfix flag in prefix AND suffix
2557 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2558 circumfix, ep->getContLen())) &&
2559 (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
2560 // fogemorpheme
2561 (in_compound ||
2562 !(se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
2563 // needaffix on prefix or first suffix
2564 (cclass ||
2565 !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2566 (ppfx && !((ep->getCont()) &&
2567 TESTAFF(ep->getCont(), needaffix,
2568 ep->getContLen())))
2569 )) {
2570 rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass,
2571 needflag, (in_compound ? 0 : onlyincompound));
2572 if (rv) {
2573 sfx=se; // BUG: sfx not stateless
2574 return rv;
2575 }
2576 }
2577 }
2578 se = se->getNext();
2579 }
2580
2581 // now handle the general case
2582 if (len == 0) return NULL; // FULLSTRIP
2583 unsigned char sp= *((const unsigned char *)(word + len - 1));
2584 SfxEntry * sptr = sStart[sp];
2585
2586 while (sptr) {
2587 if (isRevSubset(sptr->getKey(), word + len - 1, len)
2588 ) {
2589 // suffixes are not allowed in beginning of compounds
2590 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2591 // except when signed with compoundpermitflag flag
2592 (sptr->getCont() && compoundpermitflag &&
2593 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
2594 // no circumfix flag in prefix and suffix
2595 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2596 circumfix, ep->getContLen())) &&
2597 (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
2598 // circumfix flag in prefix AND suffix
2599 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2600 circumfix, ep->getContLen())) &&
2601 (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
2602 // fogemorpheme
2603 (in_compound ||
2604 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
2605 // needaffix on prefix or first suffix
2606 (cclass ||
2607 !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
2608 (ppfx && !((ep->getCont()) &&
2609 TESTAFF(ep->getCont(), needaffix,
2610 ep->getContLen())))
2611 )
2612 ) if (in_compound != IN_CPD_END || ppfx || !(sptr->getCont() && TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
2613 rv = sptr->checkword(word,len, sfxopts, ppfx, wlst,
2614 maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound));
2615 if (rv) {
2616 sfx=sptr; // BUG: sfx not stateless
2617 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2618 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2619 return rv;
2620 }
2621 }
2622 sptr = sptr->getNextEQ();
2623 } else {
2624 sptr = sptr->getNextNE();
2625 }
2626 }
2627
2628 return NULL;
2629 }
2630
2631 // check word for two-level suffixes
2632
suffix_check_twosfx(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2633 struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len,
2634 int sfxopts, PfxEntry * ppfx, const FLAG needflag)
2635 {
2636 struct hentry * rv = NULL;
2637
2638 // first handle the special case of 0 length suffixes
2639 SfxEntry * se = sStart[0];
2640 while (se) {
2641 if (contclasses[se->getFlag()])
2642 {
2643 rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag);
2644 if (rv) return rv;
2645 }
2646 se = se->getNext();
2647 }
2648
2649 // now handle the general case
2650 if (len == 0) return NULL; // FULLSTRIP
2651 unsigned char sp = *((const unsigned char *)(word + len - 1));
2652 SfxEntry * sptr = sStart[sp];
2653
2654 while (sptr) {
2655 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2656 if (contclasses[sptr->getFlag()])
2657 {
2658 rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag);
2659 if (rv) {
2660 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2661 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2662 return rv;
2663 }
2664 }
2665 sptr = sptr->getNextEQ();
2666 } else {
2667 sptr = sptr->getNextNE();
2668 }
2669 }
2670
2671 return NULL;
2672 }
2673
suffix_check_twosfx_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2674 char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len,
2675 int sfxopts, PfxEntry * ppfx, const FLAG needflag)
2676 {
2677 char result[MAXLNLEN];
2678 char result2[MAXLNLEN];
2679 char result3[MAXLNLEN];
2680
2681 char * st;
2682
2683 result[0] = '\0';
2684 result2[0] = '\0';
2685 result3[0] = '\0';
2686
2687 // first handle the special case of 0 length suffixes
2688 SfxEntry * se = sStart[0];
2689 while (se) {
2690 if (contclasses[se->getFlag()])
2691 {
2692 st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
2693 if (st) {
2694 if (ppfx) {
2695 if (ppfx->getMorph()) {
2696 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
2697 mystrcat(result, " ", MAXLNLEN);
2698 } else debugflag(result, ppfx->getFlag());
2699 }
2700 mystrcat(result, st, MAXLNLEN);
2701 free(st);
2702 if (se->getMorph()) {
2703 mystrcat(result, " ", MAXLNLEN);
2704 mystrcat(result, se->getMorph(), MAXLNLEN);
2705 } else debugflag(result, se->getFlag());
2706 mystrcat(result, "\n", MAXLNLEN);
2707 }
2708 }
2709 se = se->getNext();
2710 }
2711
2712 // now handle the general case
2713 if (len == 0) return NULL; // FULLSTRIP
2714 unsigned char sp = *((const unsigned char *)(word + len - 1));
2715 SfxEntry * sptr = sStart[sp];
2716
2717 while (sptr) {
2718 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2719 if (contclasses[sptr->getFlag()])
2720 {
2721 st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
2722 if (st) {
2723 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2724 if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2725 strcpy(result2, st);
2726 free(st);
2727
2728 result3[0] = '\0';
2729
2730 if (sptr->getMorph()) {
2731 mystrcat(result3, " ", MAXLNLEN);
2732 mystrcat(result3, sptr->getMorph(), MAXLNLEN);
2733 } else debugflag(result3, sptr->getFlag());
2734 strlinecat(result2, result3);
2735 mystrcat(result2, "\n", MAXLNLEN);
2736 mystrcat(result, result2, MAXLNLEN);
2737 }
2738 }
2739 sptr = sptr->getNextEQ();
2740 } else {
2741 sptr = sptr->getNextNE();
2742 }
2743 }
2744 if (*result) return mystrdup(result);
2745 return NULL;
2746 }
2747
suffix_check_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,char in_compound)2748 char * AffixMgr::suffix_check_morph(const char * word, int len,
2749 int sfxopts, PfxEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound)
2750 {
2751 char result[MAXLNLEN];
2752
2753 struct hentry * rv = NULL;
2754
2755 result[0] = '\0';
2756
2757 PfxEntry* ep = ppfx;
2758
2759 // first handle the special case of 0 length suffixes
2760 SfxEntry * se = sStart[0];
2761 while (se) {
2762 if (!cclass || se->getCont()) {
2763 // suffixes are not allowed in beginning of compounds
2764 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2765 // except when signed with compoundpermitflag flag
2766 (se->getCont() && compoundpermitflag &&
2767 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
2768 // no circumfix flag in prefix and suffix
2769 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2770 circumfix, ep->getContLen())) &&
2771 (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
2772 // circumfix flag in prefix AND suffix
2773 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2774 circumfix, ep->getContLen())) &&
2775 (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
2776 // fogemorpheme
2777 (in_compound ||
2778 !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
2779 // needaffix on prefix or first suffix
2780 (cclass ||
2781 !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2782 (ppfx && !((ep->getCont()) &&
2783 TESTAFF(ep->getCont(), needaffix,
2784 ep->getContLen())))
2785 )
2786 ))
2787 rv = se->checkword(word, len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
2788 while (rv) {
2789 if (ppfx) {
2790 if (ppfx->getMorph()) {
2791 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
2792 mystrcat(result, " ", MAXLNLEN);
2793 } else debugflag(result, ppfx->getFlag());
2794 }
2795 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2796 if (! HENTRY_FIND(rv, MORPH_STEM)) {
2797 mystrcat(result, " ", MAXLNLEN);
2798 mystrcat(result, MORPH_STEM, MAXLNLEN);
2799 mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
2800 }
2801 // store the pointer of the hash entry
2802 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
2803
2804 if (!complexprefixes && HENTRY_DATA(rv)) {
2805 mystrcat(result, " ", MAXLNLEN);
2806 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2807 }
2808 if (se->getMorph()) {
2809 mystrcat(result, " ", MAXLNLEN);
2810 mystrcat(result, se->getMorph(), MAXLNLEN);
2811 } else debugflag(result, se->getFlag());
2812 mystrcat(result, "\n", MAXLNLEN);
2813 rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
2814 }
2815 }
2816 se = se->getNext();
2817 }
2818
2819 // now handle the general case
2820 if (len == 0) return NULL; // FULLSTRIP
2821 unsigned char sp = *((const unsigned char *)(word + len - 1));
2822 SfxEntry * sptr = sStart[sp];
2823
2824 while (sptr) {
2825 if (isRevSubset(sptr->getKey(), word + len - 1, len)
2826 ) {
2827 // suffixes are not allowed in beginning of compounds
2828 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2829 // except when signed with compoundpermitflag flag
2830 (sptr->getCont() && compoundpermitflag &&
2831 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
2832 // no circumfix flag in prefix and suffix
2833 ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2834 circumfix, ep->getContLen())) &&
2835 (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
2836 // circumfix flag in prefix AND suffix
2837 ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2838 circumfix, ep->getContLen())) &&
2839 (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
2840 // fogemorpheme
2841 (in_compound ||
2842 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
2843 // needaffix on first suffix
2844 (cclass || !(sptr->getCont() &&
2845 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))
2846 )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
2847 while (rv) {
2848 if (ppfx) {
2849 if (ppfx->getMorph()) {
2850 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
2851 mystrcat(result, " ", MAXLNLEN);
2852 } else debugflag(result, ppfx->getFlag());
2853 }
2854 if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2855 if (! HENTRY_FIND(rv, MORPH_STEM)) {
2856 mystrcat(result, " ", MAXLNLEN);
2857 mystrcat(result, MORPH_STEM, MAXLNLEN);
2858 mystrcat(result, HENTRY_WORD(rv), MAXLNLEN);
2859 }
2860 // store the pointer of the hash entry
2861 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
2862
2863 if (!complexprefixes && HENTRY_DATA(rv)) {
2864 mystrcat(result, " ", MAXLNLEN);
2865 mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
2866 }
2867
2868 if (sptr->getMorph()) {
2869 mystrcat(result, " ", MAXLNLEN);
2870 mystrcat(result, sptr->getMorph(), MAXLNLEN);
2871 } else debugflag(result, sptr->getFlag());
2872 mystrcat(result, "\n", MAXLNLEN);
2873 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
2874 }
2875 sptr = sptr->getNextEQ();
2876 } else {
2877 sptr = sptr->getNextNE();
2878 }
2879 }
2880
2881 if (*result) return mystrdup(result);
2882 return NULL;
2883 }
2884
2885 // check if word with affixes is correctly spelled
affix_check(const char * word,int len,const FLAG needflag,char in_compound)2886 struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound)
2887 {
2888 struct hentry * rv= NULL;
2889
2890 // check all prefixes (also crossed with suffixes if allowed)
2891 rv = prefix_check(word, len, in_compound, needflag);
2892 if (rv) return rv;
2893
2894 // if still not found check all suffixes
2895 rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound);
2896
2897 if (havecontclass) {
2898 sfx = NULL;
2899 pfx = NULL;
2900
2901 if (rv) return rv;
2902 // if still not found check all two-level suffixes
2903 rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
2904
2905 if (rv) return rv;
2906 // if still not found check all two-level suffixes
2907 rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
2908 }
2909
2910 return rv;
2911 }
2912
2913 // check if word with affixes is correctly spelled
affix_check_morph(const char * word,int len,const FLAG needflag,char in_compound)2914 char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound)
2915 {
2916 char result[MAXLNLEN];
2917 char * st = NULL;
2918
2919 *result = '\0';
2920
2921 // check all prefixes (also crossed with suffixes if allowed)
2922 st = prefix_check_morph(word, len, in_compound);
2923 if (st) {
2924 mystrcat(result, st, MAXLNLEN);
2925 free(st);
2926 }
2927
2928 // if still not found check all suffixes
2929 st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
2930 if (st) {
2931 mystrcat(result, st, MAXLNLEN);
2932 free(st);
2933 }
2934
2935 if (havecontclass) {
2936 sfx = NULL;
2937 pfx = NULL;
2938 // if still not found check all two-level suffixes
2939 st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
2940 if (st) {
2941 mystrcat(result, st, MAXLNLEN);
2942 free(st);
2943 }
2944
2945 // if still not found check all two-level suffixes
2946 st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
2947 if (st) {
2948 mystrcat(result, st, MAXLNLEN);
2949 free(st);
2950 }
2951 }
2952
2953 return mystrdup(result);
2954 }
2955
morphgen(char * ts,int wl,const unsigned short * ap,unsigned short al,char * morph,char * targetmorph,int level)2956 char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap,
2957 unsigned short al, char * morph, char * targetmorph, int level)
2958 {
2959 // handle suffixes
2960 char * stemmorph;
2961 char * stemmorphcatpos;
2962 char mymorph[MAXLNLEN];
2963
2964 if (!morph) return NULL;
2965
2966 // check substandard flag
2967 if (TESTAFF(ap, substandard, al)) return NULL;
2968
2969 if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts);
2970
2971 // int targetcount = get_sfxcount(targetmorph);
2972
2973 // use input suffix fields, if exist
2974 if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
2975 stemmorph = mymorph;
2976 strcpy(stemmorph, morph);
2977 mystrcat(stemmorph, " ", MAXLNLEN);
2978 stemmorphcatpos = stemmorph + strlen(stemmorph);
2979 } else {
2980 stemmorph = morph;
2981 stemmorphcatpos = NULL;
2982 }
2983
2984 for (int i = 0; i < al; i++) {
2985 const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
2986 SfxEntry * sptr = sFlag[c];
2987 while (sptr) {
2988 if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) ||
2989 // don't generate forms with substandard affixes
2990 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
2991
2992 if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph());
2993 else stemmorph = (char *) sptr->getMorph();
2994
2995 int cmp = morphcmp(stemmorph, targetmorph);
2996
2997 if (cmp == 0) {
2998 char * newword = sptr->add(ts, wl);
2999 if (newword) {
3000 hentry * check = pHMgr->lookup(newword); // XXX extra dic
3001 if (!check || !check->astr ||
3002 !(TESTAFF(check->astr, forbiddenword, check->alen) ||
3003 TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {
3004 return newword;
3005 }
3006 free(newword);
3007 }
3008 }
3009
3010 // recursive call for secondary suffixes
3011 if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
3012 // (get_sfxcount(stemmorph) < targetcount) &&
3013 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
3014 char * newword = sptr->add(ts, wl);
3015 if (newword) {
3016 char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(),
3017 sptr->getContLen(), stemmorph, targetmorph, 1);
3018
3019 if (newword2) {
3020 free(newword);
3021 return newword2;
3022 }
3023 free(newword);
3024 newword = NULL;
3025 }
3026 }
3027 }
3028 sptr = sptr->getFlgNxt();
3029 }
3030 }
3031 return NULL;
3032 }
3033
3034
expand_rootword(struct guessword * wlst,int maxn,const char * ts,int wl,const unsigned short * ap,unsigned short al,char * bad,int badl,char * phon)3035 int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts,
3036 int wl, const unsigned short * ap, unsigned short al, char * bad, int badl,
3037 char * phon)
3038 {
3039 int nh=0;
3040 // first add root word to list
3041 if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
3042 (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
3043 wlst[nh].word = mystrdup(ts);
3044 if (!wlst[nh].word) return 0;
3045 wlst[nh].allow = (1 == 0);
3046 wlst[nh].orig = NULL;
3047 nh++;
3048 // add special phonetic version
3049 if (phon && (nh < maxn)) {
3050 wlst[nh].word = mystrdup(phon);
3051 if (!wlst[nh].word) return nh - 1;
3052 wlst[nh].allow = (1 == 0);
3053 wlst[nh].orig = mystrdup(ts);
3054 if (!wlst[nh].orig) return nh - 1;
3055 nh++;
3056 }
3057 }
3058
3059 // handle suffixes
3060 for (int i = 0; i < al; i++) {
3061 const unsigned char c = (unsigned char) (ap[i] & 0x00FF);
3062 SfxEntry * sptr = sFlag[c];
3063 while (sptr) {
3064 if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) &&
3065 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
3066 // check needaffix flag
3067 !(sptr->getCont() && ((needaffix &&
3068 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
3069 (circumfix &&
3070 TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
3071 (onlyincompound &&
3072 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))
3073 ) {
3074 char * newword = sptr->add(ts, wl);
3075 if (newword) {
3076 if (nh < maxn) {
3077 wlst[nh].word = newword;
3078 wlst[nh].allow = sptr->allowCross();
3079 wlst[nh].orig = NULL;
3080 nh++;
3081 // add special phonetic version
3082 if (phon && (nh < maxn)) {
3083 char st[MAXWORDUTF8LEN];
3084 strcpy(st, phon);
3085 strcat(st, sptr->getKey());
3086 reverseword(st + strlen(phon));
3087 wlst[nh].word = mystrdup(st);
3088 if (!wlst[nh].word) return nh - 1;
3089 wlst[nh].allow = (1 == 0);
3090 wlst[nh].orig = mystrdup(newword);
3091 if (!wlst[nh].orig) return nh - 1;
3092 nh++;
3093 }
3094 } else {
3095 free(newword);
3096 }
3097 }
3098 }
3099 sptr = sptr->getFlgNxt();
3100 }
3101 }
3102
3103 int n = nh;
3104
3105 // handle cross products of prefixes and suffixes
3106 for (int j=1;j<n ;j++)
3107 if (wlst[j].allow) {
3108 for (int k = 0; k < al; k++) {
3109 const unsigned char c = (unsigned char) (ap[k] & 0x00FF);
3110 PfxEntry * cptr = pFlag[c];
3111 while (cptr) {
3112 if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) &&
3113 (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
3114 int l1 = strlen(wlst[j].word);
3115 char * newword = cptr->add(wlst[j].word, l1);
3116 if (newword) {
3117 if (nh < maxn) {
3118 wlst[nh].word = newword;
3119 wlst[nh].allow = cptr->allowCross();
3120 wlst[nh].orig = NULL;
3121 nh++;
3122 } else {
3123 free(newword);
3124 }
3125 }
3126 }
3127 cptr = cptr->getFlgNxt();
3128 }
3129 }
3130 }
3131
3132
3133 // now handle pure prefixes
3134 for (int m = 0; m < al; m ++) {
3135 const unsigned char c = (unsigned char) (ap[m] & 0x00FF);
3136 PfxEntry * ptr = pFlag[c];
3137 while (ptr) {
3138 if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) &&
3139 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
3140 // check needaffix flag
3141 !(ptr->getCont() && ((needaffix &&
3142 TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
3143 (circumfix &&
3144 TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
3145 (onlyincompound &&
3146 TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))
3147 ) {
3148 char * newword = ptr->add(ts, wl);
3149 if (newword) {
3150 if (nh < maxn) {
3151 wlst[nh].word = newword;
3152 wlst[nh].allow = ptr->allowCross();
3153 wlst[nh].orig = NULL;
3154 nh++;
3155 } else {
3156 free(newword);
3157 }
3158 }
3159 }
3160 ptr = ptr->getFlgNxt();
3161 }
3162 }
3163
3164 return nh;
3165 }
3166
3167 // return length of replacing table
get_numrep() const3168 int AffixMgr::get_numrep() const
3169 {
3170 return numrep;
3171 }
3172
3173 // return replacing table
get_reptable() const3174 struct replentry * AffixMgr::get_reptable() const
3175 {
3176 if (! reptable ) return NULL;
3177 return reptable;
3178 }
3179
3180 // return iconv table
get_iconvtable() const3181 RepList * AffixMgr::get_iconvtable() const
3182 {
3183 if (! iconvtable ) return NULL;
3184 return iconvtable;
3185 }
3186
3187 // return oconv table
get_oconvtable() const3188 RepList * AffixMgr::get_oconvtable() const
3189 {
3190 if (! oconvtable ) return NULL;
3191 return oconvtable;
3192 }
3193
3194 // return replacing table
get_phonetable() const3195 struct phonetable * AffixMgr::get_phonetable() const
3196 {
3197 if (! phone ) return NULL;
3198 return phone;
3199 }
3200
3201 // return length of character map table
get_nummap() const3202 int AffixMgr::get_nummap() const
3203 {
3204 return nummap;
3205 }
3206
3207 // return character map table
get_maptable() const3208 struct mapentry * AffixMgr::get_maptable() const
3209 {
3210 if (! maptable ) return NULL;
3211 return maptable;
3212 }
3213
3214 // return length of word break table
get_numbreak() const3215 int AffixMgr::get_numbreak() const
3216 {
3217 return numbreak;
3218 }
3219
3220 // return character map table
get_breaktable() const3221 char ** AffixMgr::get_breaktable() const
3222 {
3223 if (! breaktable ) return NULL;
3224 return breaktable;
3225 }
3226
3227 // return text encoding of dictionary
get_encoding()3228 char * AffixMgr::get_encoding()
3229 {
3230 if (! encoding ) encoding = mystrdup(SPELL_ENCODING);
3231 return mystrdup(encoding);
3232 }
3233
3234 // return text encoding of dictionary
get_langnum() const3235 int AffixMgr::get_langnum() const
3236 {
3237 return langnum;
3238 }
3239
3240 // return double prefix option
get_complexprefixes() const3241 int AffixMgr::get_complexprefixes() const
3242 {
3243 return complexprefixes;
3244 }
3245
3246 // return FULLSTRIP option
get_fullstrip() const3247 int AffixMgr::get_fullstrip() const
3248 {
3249 return fullstrip;
3250 }
3251
get_keepcase() const3252 FLAG AffixMgr::get_keepcase() const
3253 {
3254 return keepcase;
3255 }
3256
get_forceucase() const3257 FLAG AffixMgr::get_forceucase() const
3258 {
3259 return forceucase;
3260 }
3261
get_warn() const3262 FLAG AffixMgr::get_warn() const
3263 {
3264 return warn;
3265 }
3266
get_forbidwarn() const3267 int AffixMgr::get_forbidwarn() const
3268 {
3269 return forbidwarn;
3270 }
3271
get_checksharps() const3272 int AffixMgr::get_checksharps() const
3273 {
3274 return checksharps;
3275 }
3276
encode_flag(unsigned short aflag) const3277 char * AffixMgr::encode_flag(unsigned short aflag) const
3278 {
3279 return pHMgr->encode_flag(aflag);
3280 }
3281
3282
3283 // return the preferred ignore string for suggestions
get_ignore() const3284 char * AffixMgr::get_ignore() const
3285 {
3286 if (!ignorechars) return NULL;
3287 return ignorechars;
3288 }
3289
3290 // return the preferred ignore string for suggestions
get_ignore_utf16(int * len) const3291 unsigned short * AffixMgr::get_ignore_utf16(int * len) const
3292 {
3293 *len = ignorechars_utf16_len;
3294 return ignorechars_utf16;
3295 }
3296
3297 // return the keyboard string for suggestions
get_key_string()3298 char * AffixMgr::get_key_string()
3299 {
3300 if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING);
3301 return mystrdup(keystring);
3302 }
3303
3304 // return the preferred try string for suggestions
get_try_string() const3305 char * AffixMgr::get_try_string() const
3306 {
3307 if (! trystring ) return NULL;
3308 return mystrdup(trystring);
3309 }
3310
3311 // return the preferred try string for suggestions
get_wordchars() const3312 const char * AffixMgr::get_wordchars() const
3313 {
3314 return wordchars;
3315 }
3316
get_wordchars_utf16(int * len) const3317 unsigned short * AffixMgr::get_wordchars_utf16(int * len) const
3318 {
3319 *len = wordchars_utf16_len;
3320 return wordchars_utf16;
3321 }
3322
3323 // is there compounding?
get_compound() const3324 int AffixMgr::get_compound() const
3325 {
3326 return compoundflag || compoundbegin || numdefcpd;
3327 }
3328
3329 // return the compound words control flag
get_compoundflag() const3330 FLAG AffixMgr::get_compoundflag() const
3331 {
3332 return compoundflag;
3333 }
3334
3335 // return the forbidden words control flag
get_forbiddenword() const3336 FLAG AffixMgr::get_forbiddenword() const
3337 {
3338 return forbiddenword;
3339 }
3340
3341 // return the forbidden words control flag
get_nosuggest() const3342 FLAG AffixMgr::get_nosuggest() const
3343 {
3344 return nosuggest;
3345 }
3346
3347 // return the forbidden words control flag
get_nongramsuggest() const3348 FLAG AffixMgr::get_nongramsuggest() const
3349 {
3350 return nongramsuggest;
3351 }
3352
3353 // return the forbidden words flag modify flag
get_needaffix() const3354 FLAG AffixMgr::get_needaffix() const
3355 {
3356 return needaffix;
3357 }
3358
3359 // return the onlyincompound flag
get_onlyincompound() const3360 FLAG AffixMgr::get_onlyincompound() const
3361 {
3362 return onlyincompound;
3363 }
3364
3365 // return the compound word signal flag
get_compoundroot() const3366 FLAG AffixMgr::get_compoundroot() const
3367 {
3368 return compoundroot;
3369 }
3370
3371 // return the compound begin signal flag
get_compoundbegin() const3372 FLAG AffixMgr::get_compoundbegin() const
3373 {
3374 return compoundbegin;
3375 }
3376
3377 // return the value of checknum
get_checknum() const3378 int AffixMgr::get_checknum() const
3379 {
3380 return checknum;
3381 }
3382
3383 // return the value of prefix
get_prefix() const3384 const char * AffixMgr::get_prefix() const
3385 {
3386 if (pfx) return pfx->getKey();
3387 return NULL;
3388 }
3389
3390 // return the value of suffix
get_suffix() const3391 const char * AffixMgr::get_suffix() const
3392 {
3393 return sfxappnd;
3394 }
3395
3396 // return the value of suffix
get_version() const3397 const char * AffixMgr::get_version() const
3398 {
3399 return version;
3400 }
3401
3402 // return lemma_present flag
get_lemma_present() const3403 FLAG AffixMgr::get_lemma_present() const
3404 {
3405 return lemma_present;
3406 }
3407
3408 // utility method to look up root words in hash table
lookup(const char * word)3409 struct hentry * AffixMgr::lookup(const char * word)
3410 {
3411 int i;
3412 struct hentry * he = NULL;
3413 for (i = 0; i < *maxdic && !he; i++) {
3414 he = (alldic[i])->lookup(word);
3415 }
3416 return he;
3417 }
3418
3419 // return the value of suffix
have_contclass() const3420 int AffixMgr::have_contclass() const
3421 {
3422 return havecontclass;
3423 }
3424
3425 // return utf8
get_utf8() const3426 int AffixMgr::get_utf8() const
3427 {
3428 return utf8;
3429 }
3430
get_maxngramsugs(void) const3431 int AffixMgr::get_maxngramsugs(void) const
3432 {
3433 return maxngramsugs;
3434 }
3435
get_maxcpdsugs(void) const3436 int AffixMgr::get_maxcpdsugs(void) const
3437 {
3438 return maxcpdsugs;
3439 }
3440
get_maxdiff(void) const3441 int AffixMgr::get_maxdiff(void) const
3442 {
3443 return maxdiff;
3444 }
3445
get_onlymaxdiff(void) const3446 int AffixMgr::get_onlymaxdiff(void) const
3447 {
3448 return onlymaxdiff;
3449 }
3450
3451 // return nosplitsugs
get_nosplitsugs(void) const3452 int AffixMgr::get_nosplitsugs(void) const
3453 {
3454 return nosplitsugs;
3455 }
3456
3457 // return sugswithdots
get_sugswithdots(void) const3458 int AffixMgr::get_sugswithdots(void) const
3459 {
3460 return sugswithdots;
3461 }
3462
3463 /* parse flag */
parse_flag(char * line,unsigned short * out,FileMgr * af)3464 int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) {
3465 char * s = NULL;
3466 if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
3467 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());
3468 return 1;
3469 }
3470 if (parse_string(line, &s, af->getlinenum())) return 1;
3471 *out = pHMgr->decode_flag(s);
3472 free(s);
3473 return 0;
3474 }
3475
3476 /* parse num */
parse_num(char * line,int * out,FileMgr * af)3477 int AffixMgr::parse_num(char * line, int * out, FileMgr * af) {
3478 char * s = NULL;
3479 if (*out != -1) {
3480 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix file parameter\n", af->getlinenum());
3481 return 1;
3482 }
3483 if (parse_string(line, &s, af->getlinenum())) return 1;
3484 *out = atoi(s);
3485 free(s);
3486 return 0;
3487 }
3488
3489 /* parse in the max syllablecount of compound words and */
parse_cpdsyllable(char * line,FileMgr * af)3490 int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af)
3491 {
3492 char * tp = line;
3493 char * piece;
3494 int i = 0;
3495 int np = 0;
3496 w_char w[MAXWORDLEN];
3497 piece = mystrsep(&tp, 0);
3498 while (piece) {
3499 if (*piece != '\0') {
3500 switch(i) {
3501 case 0: { np++; break; }
3502 case 1: { cpdmaxsyllable = atoi(piece); np++; break; }
3503 case 2: {
3504 if (!utf8) {
3505 cpdvowels = mystrdup(piece);
3506 } else {
3507 int n = u8_u16(w, MAXWORDLEN, piece);
3508 if (n > 0) {
3509 flag_qsort((unsigned short *) w, 0, n);
3510 cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char));
3511 if (!cpdvowels_utf16) return 1;
3512 memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
3513 }
3514 cpdvowels_utf16_len = n;
3515 }
3516 np++;
3517 break;
3518 }
3519 default: break;
3520 }
3521 i++;
3522 }
3523 piece = mystrsep(&tp, 0);
3524 }
3525 if (np < 2) {
3526 HUNSPELL_WARNING(stderr, "error: line %d: missing compoundsyllable information\n", af->getlinenum());
3527 return 1;
3528 }
3529 if (np == 2) cpdvowels = mystrdup("aeiouAEIOU");
3530 return 0;
3531 }
3532
3533 /* parse in the typical fault correcting table */
parse_reptable(char * line,FileMgr * af)3534 int AffixMgr::parse_reptable(char * line, FileMgr * af)
3535 {
3536 if (numrep != 0) {
3537 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3538 return 1;
3539 }
3540 char * tp = line;
3541 char * piece;
3542 int i = 0;
3543 int np = 0;
3544 piece = mystrsep(&tp, 0);
3545 while (piece) {
3546 if (*piece != '\0') {
3547 switch(i) {
3548 case 0: { np++; break; }
3549 case 1: {
3550 numrep = atoi(piece);
3551 if (numrep < 1) {
3552 HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());
3553 return 1;
3554 }
3555 reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
3556 if (!reptable) return 1;
3557 np++;
3558 break;
3559 }
3560 default: break;
3561 }
3562 i++;
3563 }
3564 piece = mystrsep(&tp, 0);
3565 }
3566 if (np != 2) {
3567 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3568 return 1;
3569 }
3570
3571 /* now parse the numrep lines to read in the remainder of the table */
3572 char * nl;
3573 for (int j=0; j < numrep; j++) {
3574 if ((nl = af->getline()) == NULL) return 1;
3575 mychomp(nl);
3576 tp = nl;
3577 i = 0;
3578 reptable[j].pattern = NULL;
3579 reptable[j].pattern2 = NULL;
3580 piece = mystrsep(&tp, 0);
3581 while (piece) {
3582 if (*piece != '\0') {
3583 switch(i) {
3584 case 0: {
3585 if (strncmp(piece,"REP",3) != 0) {
3586 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3587 numrep = 0;
3588 return 1;
3589 }
3590 break;
3591 }
3592 case 1: {
3593 if (*piece == '^') reptable[j].start = true; else reptable[j].start = false;
3594 reptable[j].pattern = mystrrep(mystrdup(piece + int(reptable[j].start)),"_"," ");
3595 int lr = strlen(reptable[j].pattern) - 1;
3596 if (reptable[j].pattern[lr] == '$') {
3597 reptable[j].end = true;
3598 reptable[j].pattern[lr] = '\0';
3599 } else reptable[j].end = false;
3600 break;
3601 }
3602 case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; }
3603 default: break;
3604 }
3605 i++;
3606 }
3607 piece = mystrsep(&tp, 0);
3608 }
3609 if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {
3610 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3611 numrep = 0;
3612 return 1;
3613 }
3614 }
3615 return 0;
3616 }
3617
3618 /* parse in the typical fault correcting table */
parse_convtable(char * line,FileMgr * af,RepList ** rl,const char * keyword)3619 int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword)
3620 {
3621 if (*rl) {
3622 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3623 return 1;
3624 }
3625 char * tp = line;
3626 char * piece;
3627 int i = 0;
3628 int np = 0;
3629 int numrl = 0;
3630 piece = mystrsep(&tp, 0);
3631 while (piece) {
3632 if (*piece != '\0') {
3633 switch(i) {
3634 case 0: { np++; break; }
3635 case 1: {
3636 numrl = atoi(piece);
3637 if (numrl < 1) {
3638 HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum());
3639 return 1;
3640 }
3641 *rl = new RepList(numrl);
3642 if (!*rl) return 1;
3643 np++;
3644 break;
3645 }
3646 default: break;
3647 }
3648 i++;
3649 }
3650 piece = mystrsep(&tp, 0);
3651 }
3652 if (np != 2) {
3653 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3654 return 1;
3655 }
3656
3657 /* now parse the num lines to read in the remainder of the table */
3658 char * nl;
3659 for (int j=0; j < numrl; j++) {
3660 if (!(nl = af->getline())) return 1;
3661 mychomp(nl);
3662 tp = nl;
3663 i = 0;
3664 char * pattern = NULL;
3665 char * pattern2 = NULL;
3666 piece = mystrsep(&tp, 0);
3667 while (piece) {
3668 if (*piece != '\0') {
3669 switch(i) {
3670 case 0: {
3671 if (strncmp(piece, keyword, strlen(keyword)) != 0) {
3672 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3673 delete *rl;
3674 *rl = NULL;
3675 return 1;
3676 }
3677 break;
3678 }
3679 case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; }
3680 case 2: {
3681 pattern2 = mystrrep(mystrdup(piece),"_"," ");
3682 break;
3683 }
3684 default: break;
3685 }
3686 i++;
3687 }
3688 piece = mystrsep(&tp, 0);
3689 }
3690 if (!pattern || !pattern2) {
3691 if (pattern)
3692 free(pattern);
3693 if (pattern2)
3694 free(pattern2);
3695 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3696 return 1;
3697 }
3698 (*rl)->add(pattern, pattern2);
3699 }
3700 return 0;
3701 }
3702
3703
3704 /* parse in the typical fault correcting table */
parse_phonetable(char * line,FileMgr * af)3705 int AffixMgr::parse_phonetable(char * line, FileMgr * af)
3706 {
3707 if (phone) {
3708 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3709 return 1;
3710 }
3711 char * tp = line;
3712 char * piece;
3713 int i = 0;
3714 int np = 0;
3715 piece = mystrsep(&tp, 0);
3716 while (piece) {
3717 if (*piece != '\0') {
3718 switch(i) {
3719 case 0: { np++; break; }
3720 case 1: {
3721 phone = (phonetable *) malloc(sizeof(struct phonetable));
3722 if (!phone) return 1;
3723 phone->num = atoi(piece);
3724 phone->rules = NULL;
3725 phone->utf8 = (char) utf8;
3726 if (phone->num < 1) {
3727 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3728 return 1;
3729 }
3730 phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *));
3731 if (!phone->rules) {
3732 free(phone);
3733 phone = NULL;
3734 return 1;
3735 }
3736 np++;
3737 break;
3738 }
3739 default: break;
3740 }
3741 i++;
3742 }
3743 piece = mystrsep(&tp, 0);
3744 }
3745 if (np != 2) {
3746 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3747 return 1;
3748 }
3749
3750 /* now parse the phone->num lines to read in the remainder of the table */
3751 char * nl;
3752 for (int j=0; j < phone->num; j++) {
3753 if (!(nl = af->getline())) return 1;
3754 mychomp(nl);
3755 tp = nl;
3756 i = 0;
3757 phone->rules[j * 2] = NULL;
3758 phone->rules[j * 2 + 1] = NULL;
3759 piece = mystrsep(&tp, 0);
3760 while (piece) {
3761 if (*piece != '\0') {
3762 switch(i) {
3763 case 0: {
3764 if (strncmp(piece,"PHONE",5) != 0) {
3765 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3766 phone->num = 0;
3767 return 1;
3768 }
3769 break;
3770 }
3771 case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; }
3772 case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; }
3773 default: break;
3774 }
3775 i++;
3776 }
3777 piece = mystrsep(&tp, 0);
3778 }
3779 if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) {
3780 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3781 phone->num = 0;
3782 return 1;
3783 }
3784 }
3785 phone->rules[phone->num * 2] = mystrdup("");
3786 phone->rules[phone->num * 2 + 1] = mystrdup("");
3787 init_phonet_hash(*phone);
3788 return 0;
3789 }
3790
3791 /* parse in the checkcompoundpattern table */
parse_checkcpdtable(char * line,FileMgr * af)3792 int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af)
3793 {
3794 if (numcheckcpd != 0) {
3795 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3796 return 1;
3797 }
3798 char * tp = line;
3799 char * piece;
3800 int i = 0;
3801 int np = 0;
3802 piece = mystrsep(&tp, 0);
3803 while (piece) {
3804 if (*piece != '\0') {
3805 switch(i) {
3806 case 0: { np++; break; }
3807 case 1: {
3808 numcheckcpd = atoi(piece);
3809 if (numcheckcpd < 1) {
3810 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3811 return 1;
3812 }
3813 checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry));
3814 if (!checkcpdtable) return 1;
3815 np++;
3816 break;
3817 }
3818 default: break;
3819 }
3820 i++;
3821 }
3822 piece = mystrsep(&tp, 0);
3823 }
3824 if (np != 2) {
3825 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3826 return 1;
3827 }
3828
3829 /* now parse the numcheckcpd lines to read in the remainder of the table */
3830 char * nl;
3831 for (int j=0; j < numcheckcpd; j++) {
3832 if (!(nl = af->getline())) return 1;
3833 mychomp(nl);
3834 tp = nl;
3835 i = 0;
3836 checkcpdtable[j].pattern = NULL;
3837 checkcpdtable[j].pattern2 = NULL;
3838 checkcpdtable[j].pattern3 = NULL;
3839 checkcpdtable[j].cond = FLAG_NULL;
3840 checkcpdtable[j].cond2 = FLAG_NULL;
3841 piece = mystrsep(&tp, 0);
3842 while (piece) {
3843 if (*piece != '\0') {
3844 switch(i) {
3845 case 0: {
3846 if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) {
3847 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3848 numcheckcpd = 0;
3849 return 1;
3850 }
3851 break;
3852 }
3853 case 1: {
3854 checkcpdtable[j].pattern = mystrdup(piece);
3855 char * p = strchr(checkcpdtable[j].pattern, '/');
3856 if (p) {
3857 *p = '\0';
3858 checkcpdtable[j].cond = pHMgr->decode_flag(p + 1);
3859 }
3860 break; }
3861 case 2: {
3862 checkcpdtable[j].pattern2 = mystrdup(piece);
3863 char * p = strchr(checkcpdtable[j].pattern2, '/');
3864 if (p) {
3865 *p = '\0';
3866 checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1);
3867 }
3868 break;
3869 }
3870 case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; }
3871 default: break;
3872 }
3873 i++;
3874 }
3875 piece = mystrsep(&tp, 0);
3876 }
3877 if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) {
3878 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3879 numcheckcpd = 0;
3880 return 1;
3881 }
3882 }
3883 return 0;
3884 }
3885
3886 /* parse in the compound rule table */
parse_defcpdtable(char * line,FileMgr * af)3887 int AffixMgr::parse_defcpdtable(char * line, FileMgr * af)
3888 {
3889 if (numdefcpd != 0) {
3890 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3891 return 1;
3892 }
3893 char * tp = line;
3894 char * piece;
3895 int i = 0;
3896 int np = 0;
3897 piece = mystrsep(&tp, 0);
3898 while (piece) {
3899 if (*piece != '\0') {
3900 switch(i) {
3901 case 0: { np++; break; }
3902 case 1: {
3903 numdefcpd = atoi(piece);
3904 if (numdefcpd < 1) {
3905 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
3906 return 1;
3907 }
3908 defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry));
3909 if (!defcpdtable) return 1;
3910 np++;
3911 break;
3912 }
3913 default: break;
3914 }
3915 i++;
3916 }
3917 piece = mystrsep(&tp, 0);
3918 }
3919 if (np != 2) {
3920 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
3921 return 1;
3922 }
3923
3924 /* now parse the numdefcpd lines to read in the remainder of the table */
3925 char * nl;
3926 for (int j=0; j < numdefcpd; j++) {
3927 if (!(nl = af->getline())) return 1;
3928 mychomp(nl);
3929 tp = nl;
3930 i = 0;
3931 defcpdtable[j].def = NULL;
3932 piece = mystrsep(&tp, 0);
3933 while (piece) {
3934 if (*piece != '\0') {
3935 switch(i) {
3936 case 0: {
3937 if (strncmp(piece, "COMPOUNDRULE", 12) != 0) {
3938 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3939 numdefcpd = 0;
3940 return 1;
3941 }
3942 break;
3943 }
3944 case 1: { // handle parenthesized flags
3945 if (strchr(piece, '(')) {
3946 defcpdtable[j].def = (FLAG *) malloc(strlen(piece) * sizeof(FLAG));
3947 defcpdtable[j].len = 0;
3948 int end = 0;
3949 FLAG * conv;
3950 while (!end) {
3951 char * par = piece + 1;
3952 while (*par != '(' && *par != ')' && *par != '\0') par++;
3953 if (*par == '\0') end = 1; else *par = '\0';
3954 if (*piece == '(') piece++;
3955 if (*piece == '*' || *piece == '?') {
3956 defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece;
3957 } else if (*piece != '\0') {
3958 int l = pHMgr->decode_flags(&conv, piece, af);
3959 for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k];
3960 free(conv);
3961 }
3962 piece = par + 1;
3963 }
3964 } else {
3965 defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece, af);
3966 }
3967 break;
3968 }
3969 default: break;
3970 }
3971 i++;
3972 }
3973 piece = mystrsep(&tp, 0);
3974 }
3975 if (!defcpdtable[j].len) {
3976 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
3977 numdefcpd = 0;
3978 return 1;
3979 }
3980 }
3981 return 0;
3982 }
3983
3984
3985 /* parse in the character map table */
parse_maptable(char * line,FileMgr * af)3986 int AffixMgr::parse_maptable(char * line, FileMgr * af)
3987 {
3988 if (nummap != 0) {
3989 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
3990 return 1;
3991 }
3992 char * tp = line;
3993 char * piece;
3994 int i = 0;
3995 int np = 0;
3996 piece = mystrsep(&tp, 0);
3997 while (piece) {
3998 if (*piece != '\0') {
3999 switch(i) {
4000 case 0: { np++; break; }
4001 case 1: {
4002 nummap = atoi(piece);
4003 if (nummap < 1) {
4004 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
4005 return 1;
4006 }
4007 maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
4008 if (!maptable) return 1;
4009 np++;
4010 break;
4011 }
4012 default: break;
4013 }
4014 i++;
4015 }
4016 piece = mystrsep(&tp, 0);
4017 }
4018 if (np != 2) {
4019 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
4020 return 1;
4021 }
4022
4023 /* now parse the nummap lines to read in the remainder of the table */
4024 char * nl;
4025 for (int j=0; j < nummap; j++) {
4026 if (!(nl = af->getline())) return 1;
4027 mychomp(nl);
4028 tp = nl;
4029 i = 0;
4030 maptable[j].set = NULL;
4031 maptable[j].len = 0;
4032 piece = mystrsep(&tp, 0);
4033 while (piece) {
4034 if (*piece != '\0') {
4035 switch(i) {
4036 case 0: {
4037 if (strncmp(piece,"MAP",3) != 0) {
4038 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4039 nummap = 0;
4040 return 1;
4041 }
4042 break;
4043 }
4044 case 1: {
4045 int setn = 0;
4046 maptable[j].len = strlen(piece);
4047 maptable[j].set = (char **) malloc(maptable[j].len * sizeof(char*));
4048 if (!maptable[j].set) return 1;
4049 for (int k = 0; k < maptable[j].len; k++) {
4050 int chl = 1;
4051 int chb = k;
4052 if (piece[k] == '(') {
4053 char * parpos = strchr(piece + k, ')');
4054 if (parpos != NULL) {
4055 chb = k + 1;
4056 chl = (int)(parpos - piece) - k - 1;
4057 k = k + chl + 1;
4058 }
4059 } else {
4060 if (utf8 && (piece[k] & 0xc0) == 0xc0) {
4061 for (k++; utf8 && (piece[k] & 0xc0) == 0x80; k++);
4062 chl = k - chb;
4063 k--;
4064 }
4065 }
4066 maptable[j].set[setn] = (char *) malloc(chl + 1);
4067 if (!maptable[j].set[setn]) return 1;
4068 strncpy(maptable[j].set[setn], piece + chb, chl);
4069 maptable[j].set[setn][chl] = '\0';
4070 setn++;
4071 }
4072 maptable[j].len = setn;
4073 break; }
4074 default: break;
4075 }
4076 i++;
4077 }
4078 piece = mystrsep(&tp, 0);
4079 }
4080 if (!maptable[j].set || !maptable[j].len) {
4081 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4082 nummap = 0;
4083 return 1;
4084 }
4085 }
4086 return 0;
4087 }
4088
4089 /* parse in the word breakpoint table */
parse_breaktable(char * line,FileMgr * af)4090 int AffixMgr::parse_breaktable(char * line, FileMgr * af)
4091 {
4092 if (numbreak > -1) {
4093 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
4094 return 1;
4095 }
4096 char * tp = line;
4097 char * piece;
4098 int i = 0;
4099 int np = 0;
4100 piece = mystrsep(&tp, 0);
4101 while (piece) {
4102 if (*piece != '\0') {
4103 switch(i) {
4104 case 0: { np++; break; }
4105 case 1: {
4106 numbreak = atoi(piece);
4107 if (numbreak < 0) {
4108 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
4109 return 1;
4110 }
4111 if (numbreak == 0) return 0;
4112 breaktable = (char **) malloc(numbreak * sizeof(char *));
4113 if (!breaktable) return 1;
4114 np++;
4115 break;
4116 }
4117 default: break;
4118 }
4119 i++;
4120 }
4121 piece = mystrsep(&tp, 0);
4122 }
4123 if (np != 2) {
4124 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
4125 return 1;
4126 }
4127
4128 /* now parse the numbreak lines to read in the remainder of the table */
4129 char * nl;
4130 for (int j=0; j < numbreak; j++) {
4131 if (!(nl = af->getline())) return 1;
4132 mychomp(nl);
4133 tp = nl;
4134 i = 0;
4135 piece = mystrsep(&tp, 0);
4136 while (piece) {
4137 if (*piece != '\0') {
4138 switch(i) {
4139 case 0: {
4140 if (strncmp(piece,"BREAK",5) != 0) {
4141 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4142 numbreak = 0;
4143 return 1;
4144 }
4145 break;
4146 }
4147 case 1: {
4148 breaktable[j] = mystrdup(piece);
4149 break;
4150 }
4151 default: break;
4152 }
4153 i++;
4154 }
4155 piece = mystrsep(&tp, 0);
4156 }
4157 if (!breaktable) {
4158 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
4159 numbreak = 0;
4160 return 1;
4161 }
4162 }
4163 return 0;
4164 }
4165
reverse_condition(char * piece)4166 void AffixMgr::reverse_condition(char * piece) {
4167 int neg = 0;
4168 for (char * k = piece + strlen(piece) - 1; k >= piece; k--) {
4169 switch(*k) {
4170 case '[': {
4171 if (neg) *(k+1) = '['; else *k = ']';
4172 break;
4173 }
4174 case ']': {
4175 *k = '[';
4176 if (neg) *(k+1) = '^';
4177 neg = 0;
4178 break;
4179 }
4180 case '^': {
4181 if (*(k+1) == ']') neg = 1; else *(k+1) = *k;
4182 break;
4183 }
4184 default: {
4185 if (neg) *(k+1) = *k;
4186 }
4187 }
4188 }
4189 }
4190
parse_affix(char * line,const char at,FileMgr * af,char * dupflags)4191 int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags)
4192 {
4193 int numents = 0; // number of affentry structures to parse
4194
4195 unsigned short aflag = 0; // affix char identifier
4196
4197 char ff=0;
4198 std::vector<affentry> affentries;
4199
4200 char * tp = line;
4201 char * nl = line;
4202 char * piece;
4203 int i = 0;
4204
4205 // checking lines with bad syntax
4206 #ifdef DEBUG
4207 int basefieldnum = 0;
4208 #endif
4209
4210 // split affix header line into pieces
4211
4212 int np = 0;
4213
4214 piece = mystrsep(&tp, 0);
4215 while (piece) {
4216 if (*piece != '\0') {
4217 switch(i) {
4218 // piece 1 - is type of affix
4219 case 0: { np++; break; }
4220
4221 // piece 2 - is affix char
4222 case 1: {
4223 np++;
4224 aflag = pHMgr->decode_flag(piece);
4225 if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
4226 ((at == 'P') && (dupflags[aflag] & dupPFX))) {
4227 HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n",
4228 af->getlinenum());
4229 // return 1; XXX permissive mode for bad dictionaries
4230 }
4231 dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX);
4232 break;
4233 }
4234 // piece 3 - is cross product indicator
4235 case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }
4236
4237 // piece 4 - is number of affentries
4238 case 3: {
4239 np++;
4240 numents = atoi(piece);
4241 if (numents == 0) {
4242 char * err = pHMgr->encode_flag(aflag);
4243 if (err) {
4244 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4245 af->getlinenum());
4246 free(err);
4247 }
4248 return 1;
4249 }
4250 affentries.resize(numents);
4251 affentries[0].opts = ff;
4252 if (utf8) affentries[0].opts += aeUTF8;
4253 if (pHMgr->is_aliasf()) affentries[0].opts += aeALIASF;
4254 if (pHMgr->is_aliasm()) affentries[0].opts += aeALIASM;
4255 affentries[0].aflag = aflag;
4256 }
4257
4258 default: break;
4259 }
4260 i++;
4261 }
4262 piece = mystrsep(&tp, 0);
4263 }
4264 // check to make sure we parsed enough pieces
4265 if (np != 4) {
4266 char * err = pHMgr->encode_flag(aflag);
4267 if (err) {
4268 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
4269 free(err);
4270 }
4271 return 1;
4272 }
4273
4274 // now parse numents affentries for this affix
4275 std::vector<affentry>::iterator start = affentries.begin();
4276 std::vector<affentry>::iterator end = affentries.end();
4277 for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) {
4278 if ((nl = af->getline()) == NULL) return 1;
4279 mychomp(nl);
4280 tp = nl;
4281 i = 0;
4282 np = 0;
4283
4284 // split line into pieces
4285 piece = mystrsep(&tp, 0);
4286 while (piece) {
4287 if (*piece != '\0') {
4288 switch(i) {
4289 // piece 1 - is type
4290 case 0: {
4291 np++;
4292 if (entry != start) entry->opts = start->opts &
4293 (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM);
4294 break;
4295 }
4296
4297 // piece 2 - is affix char
4298 case 1: {
4299 np++;
4300 if (pHMgr->decode_flag(piece) != aflag) {
4301 char * err = pHMgr->encode_flag(aflag);
4302 if (err) {
4303 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4304 af->getlinenum(), err);
4305 free(err);
4306 }
4307 return 1;
4308 }
4309
4310 if (entry != start) entry->aflag = start->aflag;
4311 break;
4312 }
4313
4314 // piece 3 - is string to strip or 0 for null
4315 case 2: {
4316 np++;
4317 if (complexprefixes) {
4318 if (utf8) reverseword_utf(piece); else reverseword(piece);
4319 }
4320 entry->strip = mystrdup(piece);
4321 entry->stripl = (unsigned char) strlen(entry->strip);
4322 if (strcmp(entry->strip,"0") == 0) {
4323 free(entry->strip);
4324 entry->strip=mystrdup("");
4325 entry->stripl = 0;
4326 }
4327 break;
4328 }
4329
4330 // piece 4 - is affix string or 0 for null
4331 case 3: {
4332 char * dash;
4333 entry->morphcode = NULL;
4334 entry->contclass = NULL;
4335 entry->contclasslen = 0;
4336 np++;
4337 dash = strchr(piece, '/');
4338 if (dash) {
4339 *dash = '\0';
4340
4341 if (ignorechars) {
4342 if (utf8) {
4343 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
4344 } else {
4345 remove_ignored_chars(piece,ignorechars);
4346 }
4347 }
4348
4349 if (complexprefixes) {
4350 if (utf8) reverseword_utf(piece); else reverseword(piece);
4351 }
4352 entry->appnd = mystrdup(piece);
4353
4354 if (pHMgr->is_aliasf()) {
4355 int index = atoi(dash + 1);
4356 entry->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(entry->contclass), af);
4357 if (!entry->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1);
4358 } else {
4359 entry->contclasslen = (unsigned short) pHMgr->decode_flags(&(entry->contclass), dash + 1, af);
4360 flag_qsort(entry->contclass, 0, entry->contclasslen);
4361 }
4362 *dash = '/';
4363
4364 havecontclass = 1;
4365 for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
4366 contclasses[(entry->contclass)[_i]] = 1;
4367 }
4368 } else {
4369 if (ignorechars) {
4370 if (utf8) {
4371 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
4372 } else {
4373 remove_ignored_chars(piece,ignorechars);
4374 }
4375 }
4376
4377 if (complexprefixes) {
4378 if (utf8) reverseword_utf(piece); else reverseword(piece);
4379 }
4380 entry->appnd = mystrdup(piece);
4381 }
4382
4383 entry->appndl = (unsigned char) strlen(entry->appnd);
4384 if (strcmp(entry->appnd,"0") == 0) {
4385 free(entry->appnd);
4386 entry->appnd=mystrdup("");
4387 entry->appndl = 0;
4388 }
4389 break;
4390 }
4391
4392 // piece 5 - is the conditions descriptions
4393 case 4: {
4394 np++;
4395 if (complexprefixes) {
4396 if (utf8) reverseword_utf(piece); else reverseword(piece);
4397 reverse_condition(piece);
4398 }
4399 if (entry->stripl && (strcmp(piece, ".") != 0) &&
4400 redundant_condition(at, entry->strip, entry->stripl, piece, af->getlinenum()))
4401 strcpy(piece, ".");
4402 if (at == 'S') {
4403 reverseword(piece);
4404 reverse_condition(piece);
4405 }
4406 if (encodeit(*entry, piece)) return 1;
4407 break;
4408 }
4409
4410 case 5: {
4411 np++;
4412 if (pHMgr->is_aliasm()) {
4413 int index = atoi(piece);
4414 entry->morphcode = pHMgr->get_aliasm(index);
4415 } else {
4416 if (complexprefixes) { // XXX - fix me for morph. gen.
4417 if (utf8) reverseword_utf(piece); else reverseword(piece);
4418 }
4419 // add the remaining of the line
4420 if (*tp) {
4421 *(tp - 1) = ' ';
4422 tp = tp + strlen(tp);
4423 }
4424 entry->morphcode = mystrdup(piece);
4425 if (!entry->morphcode) return 1;
4426 }
4427 break;
4428 }
4429 default: break;
4430 }
4431 i++;
4432 }
4433 piece = mystrsep(&tp, 0);
4434 }
4435 // check to make sure we parsed enough pieces
4436 if (np < 4) {
4437 char * err = pHMgr->encode_flag(aflag);
4438 if (err) {
4439 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4440 af->getlinenum(), err);
4441 free(err);
4442 }
4443 return 1;
4444 }
4445
4446 #ifdef DEBUG
4447 // detect unnecessary fields, excepting comments
4448 if (basefieldnum) {
4449 int fieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6);
4450 if (fieldnum != basefieldnum)
4451 HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum());
4452 } else {
4453 basefieldnum = !(entry->morphcode) ? 5 : ((*(entry->morphcode)=='#') ? 5 : 6);
4454 }
4455 #endif
4456 }
4457
4458 // now create SfxEntry or PfxEntry objects and use links to
4459 // build an ordered (sorted by affix string) list
4460 for (std::vector<affentry>::iterator entry = start; entry != end; ++entry) {
4461 if (at == 'P') {
4462 PfxEntry * pfxptr = new PfxEntry(this,&(*entry));
4463 build_pfxtree(pfxptr);
4464 } else {
4465 SfxEntry * sfxptr = new SfxEntry(this,&(*entry));
4466 build_sfxtree(sfxptr);
4467 }
4468 }
4469 return 0;
4470 }
4471
redundant_condition(char ft,char * strip,int stripl,const char * cond,int linenum)4472 int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) {
4473 int condl = strlen(cond);
4474 int i;
4475 int j;
4476 int neg;
4477 int in;
4478 if (ft == 'P') { // prefix
4479 if (strncmp(strip, cond, condl) == 0) return 1;
4480 if (utf8) {
4481 } else {
4482 for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
4483 if (cond[j] != '[') {
4484 if (cond[j] != strip[i]) {
4485 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4486 return 0;
4487 }
4488 } else {
4489 neg = (cond[j+1] == '^') ? 1 : 0;
4490 in = 0;
4491 do {
4492 j++;
4493 if (strip[i] == cond[j]) in = 1;
4494 } while ((j < (condl - 1)) && (cond[j] != ']'));
4495 if (j == (condl - 1) && (cond[j] != ']')) {
4496 HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum, cond);
4497 return 0;
4498 }
4499 if ((!neg && !in) || (neg && in)) {
4500 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4501 return 0;
4502 }
4503 }
4504 }
4505 if (j >= condl) return 1;
4506 }
4507 } else { // suffix
4508 if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1;
4509 if (utf8) {
4510 } else {
4511 for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
4512 if (cond[j] != ']') {
4513 if (cond[j] != strip[i]) {
4514 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4515 return 0;
4516 }
4517 } else {
4518 in = 0;
4519 do {
4520 j--;
4521 if (strip[i] == cond[j]) in = 1;
4522 } while ((j > 0) && (cond[j] != '['));
4523 if ((j == 0) && (cond[j] != '[')) {
4524 HUNSPELL_WARNING(stderr, "error: line: %d: missing ] in condition:\n%s\n", linenum, cond);
4525 return 0;
4526 }
4527 neg = (cond[j+1] == '^') ? 1 : 0;
4528 if ((!neg && !in) || (neg && in)) {
4529 HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum);
4530 return 0;
4531 }
4532 }
4533 }
4534 if (j < 0) return 1;
4535 }
4536 }
4537 return 0;
4538 }
4539