1 #include "license.hunspell"
2 #include "license.myspell"
3
4 #include <stdlib.h>
5 #include <string.h>
6 #include <stdio.h>
7 #include <ctype.h>
8
9 #include "affentry.hxx"
10 #include "csutil.hxx"
11
12 #define MAXTEMPWORDLEN (MAXWORDUTF8LEN + 4)
13
PfxEntry(AffixMgr * pmgr,affentry * dp)14 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
15 // register affix manager
16 : pmyMgr(pmgr)
17 , next(NULL)
18 , nexteq(NULL)
19 , nextne(NULL)
20 , flgnxt(NULL)
21 {
22 // set up its initial values
23 aflag = dp->aflag; // flag
24 strip = dp->strip; // string to strip
25 appnd = dp->appnd; // string to append
26 stripl = dp->stripl; // length of strip string
27 appndl = dp->appndl; // length of append string
28 numconds = dp->numconds; // length of the condition
29 opts = dp->opts; // cross product flag
30 // then copy over all of the conditions
31 if (opts & aeLONGCOND) {
32 memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
33 c.l.conds2 = dp->c.l.conds2;
34 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
35 morphcode = dp->morphcode;
36 contclass = dp->contclass;
37 contclasslen = dp->contclasslen;
38 }
39
40
~PfxEntry()41 PfxEntry::~PfxEntry()
42 {
43 aflag = 0;
44 if (appnd) free(appnd);
45 if (strip) free(strip);
46 pmyMgr = NULL;
47 appnd = NULL;
48 strip = NULL;
49 if (opts & aeLONGCOND) free(c.l.conds2);
50 if (morphcode && !(opts & aeALIASM)) free(morphcode);
51 if (contclass && !(opts & aeALIASF)) free(contclass);
52 }
53
54 // add prefix to this word assuming conditions hold
add(const char * word,int len)55 char * PfxEntry::add(const char * word, int len)
56 {
57 char tword[MAXTEMPWORDLEN];
58
59 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
60 (len >= numconds) && test_condition(word) &&
61 (!stripl || (strncmp(word, strip, stripl) == 0)) &&
62 ((MAXTEMPWORDLEN) > (len + appndl - stripl))) {
63 /* we have a match so add prefix */
64 char * pp = tword;
65 if (appndl) {
66 strncpy(tword, appnd, MAXTEMPWORDLEN-1);
67 tword[MAXTEMPWORDLEN-1] = '\0';
68 pp += appndl;
69 }
70 strcpy(pp, (word + stripl));
71 return mystrdup(tword);
72 }
73 return NULL;
74 }
75
nextchar(char * p)76 inline char * PfxEntry::nextchar(char * p) {
77 if (p) {
78 p++;
79 if (opts & aeLONGCOND) {
80 // jump to the 2nd part of the condition
81 if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
82 // end of the MAXCONDLEN length condition
83 } else if (p == c.conds + MAXCONDLEN) return NULL;
84 return *p ? p : NULL;
85 }
86 return NULL;
87 }
88
test_condition(const char * st)89 inline int PfxEntry::test_condition(const char * st)
90 {
91 const char * pos = NULL; // group with pos input position
92 bool neg = false; // complementer
93 bool ingroup = false; // character in the group
94 if (numconds == 0) return 1;
95 char * p = c.conds;
96 while (1) {
97 switch (*p) {
98 case '\0': return 1;
99 case '[': {
100 neg = false;
101 ingroup = false;
102 p = nextchar(p);
103 pos = st; break;
104 }
105 case '^': { p = nextchar(p); neg = true; break; }
106 case ']': {
107 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
108 pos = NULL;
109 p = nextchar(p);
110 // skip the next character
111 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
112 if (*st == '\0' && p) return 0; // word <= condition
113 break;
114 }
115 case '.':
116 if (!pos) { // dots are not metacharacters in groups: [.]
117 p = nextchar(p);
118 // skip the next character
119 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
120 if (*st == '\0' && p) return 0; // word <= condition
121 break;
122 }
123 /* FALLTHROUGH */
124 default: {
125 if (*st == *p) {
126 st++;
127 p = nextchar(p);
128 if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
129 while (p && (*p & 0xc0) == 0x80) { // character
130 if (*p != *st) {
131 if (!pos) return 0;
132 st = pos;
133 break;
134 }
135 p = nextchar(p);
136 st++;
137 }
138 if (pos && st != pos) {
139 ingroup = true;
140 while (p && *p != ']' && ((p = nextchar(p)) != NULL));
141 }
142 } else if (pos) {
143 ingroup = true;
144 while (p && *p != ']' && ((p = nextchar(p)) != NULL));
145 }
146 } else if (pos) { // group
147 p = nextchar(p);
148 } else return 0;
149 }
150 }
151 if (!p) return 1;
152 }
153 }
154
155 // check if this prefix entry matches
checkword(const char * word,int len,char in_compound,const FLAG needflag)156 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
157 {
158 int tmpl; // length of tmpword
159 struct hentry * he; // hash entry of root word or NULL
160 char tmpword[MAXTEMPWORDLEN];
161
162 // on entry prefix is 0 length or already matches the beginning of the word.
163 // So if the remaining root word has positive length
164 // and if there are enough chars in root word and added back strip chars
165 // to meet the number of characters conditions, then test it
166
167 tmpl = len - appndl;
168
169 if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
170
171 // generate new root word by removing prefix and adding
172 // back any characters that would have been stripped
173
174 if (stripl) {
175 strncpy(tmpword, strip, MAXTEMPWORDLEN-1);
176 tmpword[MAXTEMPWORDLEN-1] = '\0';
177 }
178 strcpy ((tmpword + stripl), (word + appndl));
179
180 // now make sure all of the conditions on characters
181 // are met. Please see the appendix at the end of
182 // this file for more info on exactly what is being
183 // tested
184
185 // if all conditions are met then check if resulting
186 // root word in the dictionary
187
188 if (test_condition(tmpword)) {
189 tmpl += stripl;
190 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
191 do {
192 if (TESTAFF(he->astr, aflag, he->alen) &&
193 // forbid single prefixes with needaffix flag
194 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
195 // needflag
196 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
197 (contclass && TESTAFF(contclass, needflag, contclasslen))))
198 return he;
199 he = he->next_homonym; // check homonyms
200 } while (he);
201 }
202
203 // prefix matched but no root word was found
204 // if aeXPRODUCT is allowed, try again but now
205 // ross checked combined with a suffix
206
207 //if ((opts & aeXPRODUCT) && in_compound) {
208 if ((opts & aeXPRODUCT)) {
209 he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
210 0, NULL, FLAG_NULL, needflag, in_compound);
211 if (he) return he;
212 }
213 }
214 }
215 return NULL;
216 }
217
218 // check if this prefix entry matches
check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)219 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
220 char in_compound, const FLAG needflag)
221 {
222 int tmpl; // length of tmpword
223 struct hentry * he; // hash entry of root word or NULL
224 char tmpword[MAXTEMPWORDLEN];
225
226 // on entry prefix is 0 length or already matches the beginning of the word.
227 // So if the remaining root word has positive length
228 // and if there are enough chars in root word and added back strip chars
229 // to meet the number of characters conditions, then test it
230
231 tmpl = len - appndl;
232
233 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
234 (tmpl + stripl >= numconds)) {
235
236 // generate new root word by removing prefix and adding
237 // back any characters that would have been stripped
238
239 if (stripl) {
240 strncpy(tmpword, strip, MAXTEMPWORDLEN-1);
241 tmpword[MAXTEMPWORDLEN-1] = '\0';
242 }
243 strcpy ((tmpword + stripl), (word + appndl));
244
245 // now make sure all of the conditions on characters
246 // are met. Please see the appendix at the end of
247 // this file for more info on exactly what is being
248 // tested
249
250 // if all conditions are met then check if resulting
251 // root word in the dictionary
252
253 if (test_condition(tmpword)) {
254 tmpl += stripl;
255
256 // prefix matched but no root word was found
257 // if aeXPRODUCT is allowed, try again but now
258 // cross checked combined with a suffix
259
260 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
261 he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
262 if (he) return he;
263 }
264 }
265 }
266 return NULL;
267 }
268
269 // check if this prefix entry matches
check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)270 char * PfxEntry::check_twosfx_morph(const char * word, int len,
271 char in_compound, const FLAG needflag)
272 {
273 int tmpl; // length of tmpword
274 char tmpword[MAXTEMPWORDLEN];
275
276 // on entry prefix is 0 length or already matches the beginning of the word.
277 // So if the remaining root word has positive length
278 // and if there are enough chars in root word and added back strip chars
279 // to meet the number of characters conditions, then test it
280
281 tmpl = len - appndl;
282
283 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
284 (tmpl + stripl >= numconds)) {
285
286 // generate new root word by removing prefix and adding
287 // back any characters that would have been stripped
288
289 if (stripl) {
290 strncpy(tmpword, strip, MAXTEMPWORDLEN-1);
291 tmpword[MAXTEMPWORDLEN-1] = '\0';
292 }
293 strcpy ((tmpword + stripl), (word + appndl));
294
295 // now make sure all of the conditions on characters
296 // are met. Please see the appendix at the end of
297 // this file for more info on exactly what is being
298 // tested
299
300 // if all conditions are met then check if resulting
301 // root word in the dictionary
302
303 if (test_condition(tmpword)) {
304 tmpl += stripl;
305
306 // prefix matched but no root word was found
307 // if aeXPRODUCT is allowed, try again but now
308 // ross checked combined with a suffix
309
310 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
311 return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
312 aeXPRODUCT, this, needflag);
313 }
314 }
315 }
316 return NULL;
317 }
318
319 // check if this prefix entry matches
check_morph(const char * word,int len,char in_compound,const FLAG needflag)320 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
321 {
322 int tmpl; // length of tmpword
323 struct hentry * he; // hash entry of root word or NULL
324 char tmpword[MAXTEMPWORDLEN];
325 char result[MAXLNLEN];
326 char * st;
327
328 *result = '\0';
329
330 // on entry prefix is 0 length or already matches the beginning of the word.
331 // So if the remaining root word has positive length
332 // and if there are enough chars in root word and added back strip chars
333 // to meet the number of characters conditions, then test it
334
335 tmpl = len - appndl;
336
337 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
338 (tmpl + stripl >= numconds)) {
339
340 // generate new root word by removing prefix and adding
341 // back any characters that would have been stripped
342
343 if (stripl) {
344 strncpy(tmpword, strip, MAXTEMPWORDLEN-1);
345 tmpword[MAXTEMPWORDLEN-1] = '\0';
346 }
347 strcpy ((tmpword + stripl), (word + appndl));
348
349 // now make sure all of the conditions on characters
350 // are met. Please see the appendix at the end of
351 // this file for more info on exactly what is being
352 // tested
353
354 // if all conditions are met then check if resulting
355 // root word in the dictionary
356
357 if (test_condition(tmpword)) {
358 tmpl += stripl;
359 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
360 do {
361 if (TESTAFF(he->astr, aflag, he->alen) &&
362 // forbid single prefixes with needaffix flag
363 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
364 // needflag
365 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
366 (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
367 if (morphcode) {
368 mystrcat(result, " ", MAXLNLEN);
369 mystrcat(result, morphcode, MAXLNLEN);
370 } else mystrcat(result,getKey(), MAXLNLEN);
371 if (!HENTRY_FIND(he, MORPH_STEM)) {
372 mystrcat(result, " ", MAXLNLEN);
373 mystrcat(result, MORPH_STEM, MAXLNLEN);
374 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
375 }
376 // store the pointer of the hash entry
377 if (HENTRY_DATA(he)) {
378 mystrcat(result, " ", MAXLNLEN);
379 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
380 } else {
381 // return with debug information
382 char * flag = pmyMgr->encode_flag(getFlag());
383 mystrcat(result, " ", MAXLNLEN);
384 mystrcat(result, MORPH_FLAG, MAXLNLEN);
385 mystrcat(result, flag, MAXLNLEN);
386 free(flag);
387 }
388 mystrcat(result, "\n", MAXLNLEN);
389 }
390 he = he->next_homonym;
391 } while (he);
392 }
393
394 // prefix matched but no root word was found
395 // if aeXPRODUCT is allowed, try again but now
396 // ross checked combined with a suffix
397
398 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
399 st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
400 FLAG_NULL, needflag);
401 if (st) {
402 mystrcat(result, st, MAXLNLEN);
403 free(st);
404 }
405 }
406 }
407 }
408
409 if (*result) return mystrdup(result);
410 return NULL;
411 }
412
SfxEntry(AffixMgr * pmgr,affentry * dp)413 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
414 : pmyMgr(pmgr) // register affix manager
415 , next(NULL)
416 , nexteq(NULL)
417 , nextne(NULL)
418 , flgnxt(NULL)
419 , l_morph(NULL)
420 , r_morph(NULL)
421 , eq_morph(NULL)
422 {
423 // set up its initial values
424 aflag = dp->aflag; // char flag
425 strip = dp->strip; // string to strip
426 appnd = dp->appnd; // string to append
427 stripl = dp->stripl; // length of strip string
428 appndl = dp->appndl; // length of append string
429 numconds = dp->numconds; // length of the condition
430 opts = dp->opts; // cross product flag
431
432 // then copy over all of the conditions
433 if (opts & aeLONGCOND) {
434 memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
435 c.l.conds2 = dp->c.l.conds2;
436 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
437 rappnd = myrevstrdup(appnd);
438 morphcode = dp->morphcode;
439 contclass = dp->contclass;
440 contclasslen = dp->contclasslen;
441 }
442
443
~SfxEntry()444 SfxEntry::~SfxEntry()
445 {
446 aflag = 0;
447 if (appnd) free(appnd);
448 if (rappnd) free(rappnd);
449 if (strip) free(strip);
450 pmyMgr = NULL;
451 appnd = NULL;
452 strip = NULL;
453 if (opts & aeLONGCOND) free(c.l.conds2);
454 if (morphcode && !(opts & aeALIASM)) free(morphcode);
455 if (contclass && !(opts & aeALIASF)) free(contclass);
456 }
457
458 // add suffix to this word assuming conditions hold
add(const char * word,int len)459 char * SfxEntry::add(const char * word, int len)
460 {
461 char tword[MAXTEMPWORDLEN];
462
463 /* make sure all conditions match */
464 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
465 (len >= numconds) && test_condition(word + len, word) &&
466 (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
467 ((MAXTEMPWORDLEN) > (len + appndl - stripl))) {
468 /* we have a match so add suffix */
469 strncpy(tword, word, MAXTEMPWORDLEN-1);
470 tword[MAXTEMPWORDLEN-1] = '\0';
471 if (appndl) {
472 strcpy(tword + len - stripl, appnd);
473 } else {
474 *(tword + len - stripl) = '\0';
475 }
476 return mystrdup(tword);
477 }
478 return NULL;
479 }
480
nextchar(char * p)481 inline char * SfxEntry::nextchar(char * p) {
482 if (p) {
483 p++;
484 if (opts & aeLONGCOND) {
485 // jump to the 2nd part of the condition
486 if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
487 // end of the MAXCONDLEN length condition
488 } else if (p == c.conds + MAXCONDLEN) return NULL;
489 return *p ? p : NULL;
490 }
491 return NULL;
492 }
493
test_condition(const char * st,const char * beg)494 inline int SfxEntry::test_condition(const char * st, const char * beg)
495 {
496 const char * pos = NULL; // group with pos input position
497 bool neg = false; // complementer
498 bool ingroup = false; // character in the group
499 if (numconds == 0) return 1;
500 char * p = c.conds;
501 st--;
502 int i = 1;
503 while (1) {
504 switch (*p) {
505 case '\0':
506 return 1;
507 case '[':
508 p = nextchar(p);
509 pos = st;
510 break;
511 case '^':
512 p = nextchar(p);
513 neg = true;
514 break;
515 case ']':
516 if (!neg && !ingroup)
517 return 0;
518 i++;
519 // skip the next character
520 if (!ingroup)
521 {
522 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
523 st--;
524 }
525 pos = NULL;
526 neg = false;
527 ingroup = false;
528 p = nextchar(p);
529 if (st < beg && p)
530 return 0; // word <= condition
531 break;
532 case '.':
533 if (!pos)
534 {
535 // dots are not metacharacters in groups: [.]
536 p = nextchar(p);
537 // skip the next character
538 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
539 if (st < beg) { // word <= condition
540 if (p) return 0; else return 1;
541 }
542 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
543 st--;
544 if (st < beg) { // word <= condition
545 if (p) return 0; else return 1;
546 }
547 }
548 break;
549 }
550 /* FALLTHROUGH */
551 default: {
552 if (*st == *p) {
553 p = nextchar(p);
554 if ((opts & aeUTF8) && (*st & 0x80)) {
555 st--;
556 while (p && (st >= beg)) {
557 if (*p != *st) {
558 if (!pos) return 0;
559 st = pos;
560 break;
561 }
562 // first byte of the UTF-8 multibyte character
563 if ((*p & 0xc0) != 0x80) break;
564 p = nextchar(p);
565 st--;
566 }
567 if (pos && st != pos) {
568 if (neg) return 0;
569 else if (i == numconds) return 1;
570 ingroup = true;
571 while (p && *p != ']' && ((p = nextchar(p)) != NULL));
572 st--;
573 }
574 if (p && *p != ']') p = nextchar(p);
575 } else if (pos) {
576 if (neg) return 0;
577 else if (i == numconds) return 1;
578 ingroup = true;
579 while (p && *p != ']' && ((p = nextchar(p)) != NULL));
580 // if (p && *p != ']') p = nextchar(p);
581 st--;
582 }
583 if (!pos) {
584 i++;
585 st--;
586 }
587 if (st < beg && p && *p != ']') return 0; // word <= condition
588 } else if (pos) { // group
589 p = nextchar(p);
590 } else return 0;
591 }
592 }
593 if (!p) return 1;
594 }
595 }
596
597 // see if this suffix is present in the word
checkword(const char * word,int len,int optflags,PfxEntry * ppfx,char ** wlst,int maxSug,int * ns,const FLAG cclass,const FLAG needflag,const FLAG badflag)598 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
599 PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
600 const FLAG badflag)
601 {
602 int tmpl; // length of tmpword
603 struct hentry * he; // hash entry pointer
604 unsigned char * cp;
605 char tmpword[MAXTEMPWORDLEN];
606 PfxEntry* ep = ppfx;
607
608 // if this suffix is being cross checked with a prefix
609 // but it does not support cross products skip it
610
611 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
612 return NULL;
613
614 // upon entry suffix is 0 length or already matches the end of the word.
615 // So if the remaining root word has positive length
616 // and if there are enough chars in root word and added back strip chars
617 // to meet the number of characters conditions, then test it
618
619 tmpl = len - appndl;
620 // the second condition is not enough for UTF-8 strings
621 // it checked in test_condition()
622
623 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
624 (tmpl + stripl >= numconds)) {
625
626 // generate new root word by removing suffix and adding
627 // back any characters that would have been stripped or
628 // or null terminating the shorter string
629
630 strncpy (tmpword, word, MAXTEMPWORDLEN-1);
631 tmpword[MAXTEMPWORDLEN-1] = '\0';
632 cp = (unsigned char *)(tmpword + tmpl);
633 if (stripl) {
634 strcpy ((char *)cp, strip);
635 tmpl += stripl;
636 cp = (unsigned char *)(tmpword + tmpl);
637 } else *cp = '\0';
638
639 // now make sure all of the conditions on characters
640 // are met. Please see the appendix at the end of
641 // this file for more info on exactly what is being
642 // tested
643
644 // if all conditions are met then check if resulting
645 // root word in the dictionary
646
647 if (test_condition((char *) cp, (char *) tmpword)) {
648
649 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
650 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
651 #endif
652 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
653 do {
654 // check conditional suffix (enabled by prefix)
655 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
656 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
657 (((optflags & aeXPRODUCT) == 0) ||
658 (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
659 // enabled by prefix
660 ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
661 ) &&
662 // handle cont. class
663 ((!cclass) ||
664 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
665 ) &&
666 // check only in compound homonyms (bad flags)
667 (!badflag || !TESTAFF(he->astr, badflag, he->alen)
668 ) &&
669 // handle required flag
670 ((!needflag) ||
671 (TESTAFF(he->astr, needflag, he->alen) ||
672 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
673 )
674 ) return he;
675 he = he->next_homonym; // check homonyms
676 } while (he);
677
678 // obsolote stemming code (used only by the
679 // experimental SuffixMgr:suggest_pos_stems)
680 // store resulting root in wlst
681 } else if (wlst && (*ns < maxSug)) {
682 int cwrd = 1;
683 for (int k=0; k < *ns; k++)
684 if (strcmp(tmpword, wlst[k]) == 0) {
685 cwrd = 0;
686 break;
687 }
688 if (cwrd) {
689 wlst[*ns] = mystrdup(tmpword);
690 if (wlst[*ns] == NULL) {
691 for (int j=0; j<*ns; j++) free(wlst[j]);
692 *ns = -1;
693 return NULL;
694 }
695 (*ns)++;
696 }
697 }
698 }
699 }
700 return NULL;
701 }
702
703 // see if two-level suffix is present in the word
check_twosfx(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)704 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
705 PfxEntry* ppfx, const FLAG needflag)
706 {
707 int tmpl; // length of tmpword
708 struct hentry * he; // hash entry pointer
709 unsigned char * cp;
710 char tmpword[MAXTEMPWORDLEN];
711 PfxEntry* ep = ppfx;
712
713
714 // if this suffix is being cross checked with a prefix
715 // but it does not support cross products skip it
716
717 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
718 return NULL;
719
720 // upon entry suffix is 0 length or already matches the end of the word.
721 // So if the remaining root word has positive length
722 // and if there are enough chars in root word and added back strip chars
723 // to meet the number of characters conditions, then test it
724
725 tmpl = len - appndl;
726
727 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
728 (tmpl + stripl >= numconds)) {
729
730 // generate new root word by removing suffix and adding
731 // back any characters that would have been stripped or
732 // or null terminating the shorter string
733
734 strncpy(tmpword, word, MAXTEMPWORDLEN-1);
735 tmpword[MAXTEMPWORDLEN-1] = '\0';
736 cp = (unsigned char *)(tmpword + tmpl);
737 if (stripl) {
738 strcpy ((char *)cp, strip);
739 tmpl += stripl;
740 cp = (unsigned char *)(tmpword + tmpl);
741 } else *cp = '\0';
742
743 // now make sure all of the conditions on characters
744 // are met. Please see the appendix at the end of
745 // this file for more info on exactly what is being
746 // tested
747
748 // if all conditions are met then recall suffix_check
749
750 if (test_condition((char *) cp, (char *) tmpword)) {
751 if (ppfx) {
752 // handle conditional suffix
753 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
754 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
755 else
756 he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
757 } else {
758 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
759 }
760 if (he) return he;
761 }
762 }
763 return NULL;
764 }
765
766 // see if two-level suffix is present in the word
check_twosfx_morph(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)767 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
768 PfxEntry* ppfx, const FLAG needflag)
769 {
770 int tmpl; // length of tmpword
771 unsigned char * cp;
772 char tmpword[MAXTEMPWORDLEN];
773 PfxEntry* ep = ppfx;
774 char * st;
775
776 char result[MAXLNLEN];
777
778 *result = '\0';
779
780 // if this suffix is being cross checked with a prefix
781 // but it does not support cross products skip it
782
783 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
784 return NULL;
785
786 // upon entry suffix is 0 length or already matches the end of the word.
787 // So if the remaining root word has positive length
788 // and if there are enough chars in root word and added back strip chars
789 // to meet the number of characters conditions, then test it
790
791 tmpl = len - appndl;
792
793 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
794 (tmpl + stripl >= numconds)) {
795
796 // generate new root word by removing suffix and adding
797 // back any characters that would have been stripped or
798 // or null terminating the shorter string
799
800 strncpy(tmpword, word, MAXTEMPWORDLEN-1);
801 tmpword[MAXTEMPWORDLEN-1] = '\0';
802 cp = (unsigned char *)(tmpword + tmpl);
803 if (stripl) {
804 strcpy ((char *)cp, strip);
805 tmpl += stripl;
806 cp = (unsigned char *)(tmpword + tmpl);
807 } else *cp = '\0';
808
809 // now make sure all of the conditions on characters
810 // are met. Please see the appendix at the end of
811 // this file for more info on exactly what is being
812 // tested
813
814 // if all conditions are met then recall suffix_check
815
816 if (test_condition((char *) cp, (char *) tmpword)) {
817 if (ppfx) {
818 // handle conditional suffix
819 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
820 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
821 if (st) {
822 if (ppfx->getMorph()) {
823 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
824 mystrcat(result, " ", MAXLNLEN);
825 }
826 mystrcat(result,st, MAXLNLEN);
827 free(st);
828 mychomp(result);
829 }
830 } else {
831 st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
832 if (st) {
833 mystrcat(result, st, MAXLNLEN);
834 free(st);
835 mychomp(result);
836 }
837 }
838 } else {
839 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
840 if (st) {
841 mystrcat(result, st, MAXLNLEN);
842 free(st);
843 mychomp(result);
844 }
845 }
846 if (*result) return mystrdup(result);
847 }
848 }
849 return NULL;
850 }
851
852 // get next homonym with same affix
get_next_homonym(struct hentry * he,int optflags,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag)853 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
854 const FLAG cclass, const FLAG needflag)
855 {
856 PfxEntry* ep = ppfx;
857 FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
858
859 while (he->next_homonym) {
860 he = he->next_homonym;
861 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
862 ((optflags & aeXPRODUCT) == 0 ||
863 TESTAFF(he->astr, eFlag, he->alen) ||
864 // handle conditional suffix
865 ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
866 ) &&
867 // handle cont. class
868 ((!cclass) ||
869 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
870 ) &&
871 // handle required flag
872 ((!needflag) ||
873 (TESTAFF(he->astr, needflag, he->alen) ||
874 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
875 )
876 ) return he;
877 }
878 return NULL;
879 }
880
881
882 #if 0
883
884 Appendix: Understanding Affix Code
885
886
887 An affix is either a prefix or a suffix attached to root words to make
888 other words.
889
890 Basically a Prefix or a Suffix is set of AffEntry objects
891 which store information about the prefix or suffix along
892 with supporting routines to check if a word has a particular
893 prefix or suffix or a combination.
894
895 The structure affentry is defined as follows:
896
897 struct affentry
898 {
899 unsigned short aflag; // ID used to represent the affix
900 char * strip; // string to strip before adding affix
901 char * appnd; // the affix string to add
902 unsigned char stripl; // length of the strip string
903 unsigned char appndl; // length of the affix string
904 char numconds; // the number of conditions that must be met
905 char opts; // flag: aeXPRODUCT- combine both prefix and suffix
906 char conds[SETSIZE]; // array which encodes the conditions to be met
907 };
908
909
910 Here is a suffix borrowed from the en_US.aff file. This file
911 is whitespace delimited.
912
913 SFX D Y 4
SFX D 0 d e
915 SFX D y ied [^aeiou]y
916 SFX D 0 ed [^ey]
917 SFX D 0 ed [aeiou]y
918
919 This information can be interpreted as follows:
920
The first line has 4 fields:
922
923 Field
924 -----
925 1 SFX - indicates this is a suffix
926 2 D - is the name of the character flag which represents this suffix
927 3 Y - indicates it can be combined with prefixes (cross product)
928 4 4 - indicates that sequence of 4 affentry structures are needed to
929 properly store the affix information
930
The remaining lines describe the unique information for the 4 SfxEntry
objects that make up this affix. Each line can be interpreted
as follows (note that fields 1 and 2 serve as a check against the line 1 info):
934
935 Field
936 -----
937 1 SFX - indicates this is a suffix
938 2 D - is the name of the character flag for this affix
939 3 y - the string of chars to strip off before adding affix
940 (a 0 here indicates the NULL string)
941 4 ied - the string of affix characters to add
942 5 [^aeiou]y - the conditions which must be met before the affix
943 can be applied
944
Field 5 is interesting. Since this is a suffix, field 5 tells us that
there are 2 conditions that must be met. The first condition is that
the next to the last character in the word must *NOT* be any of the
following: "a", "e", "i", "o" or "u". The second condition is that
the last character of the word must be "y".
950
So how can we encode this information concisely and be able to
test for both conditions in a fast manner? The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et al.
(now available under a normal BSD license).
955
956 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
957 using a character (cast to an unsigned char) of a string, we have 8 bits
958 of information we can store about that character. Specifically we
959 could use each bit to say if that character is allowed in any of the
960 last (or first for prefixes) 8 characters of the word.
961
Basically, each character at one end of the word (up to the number
of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
specific character position in the word.
966
967 For prefixes, it does this by setting bit 0 if that char is valid
968 in the first position, bit 1 if valid in the second position, and so on.
969
If a bit is not set, then that char is not valid for that position in the
word.
972
973 If working with suffixes bit 0 is used for the character closest
974 to the front, bit 1 for the next character towards the end, ...,
975 with bit numconds-1 representing the last char at the end of the string.
976
Note: since entries in the conds[] are 8 bits, only 8 conditions
(read that as: only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).
980
So to make this clearer, let's encode the conds array values for the
first two affentries for the suffix D described earlier.
983
984
985 For the first affentry:
986 numconds = 1 (only examine the last character)
987
988 conds['e'] = (1 << 0) (the word must end in an E)
989 all others are all 0
990
991 For the second affentry:
992 numconds = 2 (only examine the last two characters)
993
994 conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
995 where X is all characters *but* a, e, i, o, or u
996
997
998 conds['y'] = (1 << 1) (the last char must be a y)
999 all other bits for all other entries in the conds array are zero
1000
1001
1002 #endif
1003
1004