1 #include "license.hunspell"
2 #include "license.myspell"
3 
4 #include <stdlib.h>
5 #include <string.h>
6 #include <stdio.h>
7 #include <ctype.h>
8 
9 #include "affentry.hxx"
10 #include "csutil.hxx"
11 
12 #define MAXTEMPWORDLEN (MAXWORDUTF8LEN + 4)
13 
PfxEntry(AffixMgr * pmgr,affentry * dp)14 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
15     // register affix manager
16     : pmyMgr(pmgr)
17     , next(NULL)
18     , nexteq(NULL)
19     , nextne(NULL)
20     , flgnxt(NULL)
21 {
22   // set up its initial values
23   aflag = dp->aflag;         // flag
24   strip = dp->strip;         // string to strip
25   appnd = dp->appnd;         // string to append
26   stripl = dp->stripl;       // length of strip string
27   appndl = dp->appndl;       // length of append string
28   numconds = dp->numconds;   // length of the condition
29   opts = dp->opts;           // cross product flag
30   // then copy over all of the conditions
31   if (opts & aeLONGCOND) {
32     memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
33     c.l.conds2 = dp->c.l.conds2;
34   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
35   morphcode = dp->morphcode;
36   contclass = dp->contclass;
37   contclasslen = dp->contclasslen;
38 }
39 
40 
~PfxEntry()41 PfxEntry::~PfxEntry()
42 {
43     aflag = 0;
44     if (appnd) free(appnd);
45     if (strip) free(strip);
46     pmyMgr = NULL;
47     appnd = NULL;
48     strip = NULL;
49     if (opts & aeLONGCOND) free(c.l.conds2);
50     if (morphcode && !(opts & aeALIASM)) free(morphcode);
51     if (contclass && !(opts & aeALIASF)) free(contclass);
52 }
53 
54 // add prefix to this word assuming conditions hold
add(const char * word,int len)55 char * PfxEntry::add(const char * word, int len)
56 {
57     char tword[MAXTEMPWORDLEN];
58 
59     if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
60        (len >= numconds) && test_condition(word) &&
61        (!stripl || (strncmp(word, strip, stripl) == 0)) &&
62        ((MAXTEMPWORDLEN) > (len + appndl - stripl))) {
63     /* we have a match so add prefix */
64               char * pp = tword;
65               if (appndl) {
66                   strncpy(tword, appnd, MAXTEMPWORDLEN-1);
67                   tword[MAXTEMPWORDLEN-1] = '\0';
68                   pp += appndl;
69                }
70                strcpy(pp, (word + stripl));
71                return mystrdup(tword);
72      }
73      return NULL;
74 }
75 
nextchar(char * p)76 inline char * PfxEntry::nextchar(char * p) {
77     if (p) {
78         p++;
79         if (opts & aeLONGCOND) {
80             // jump to the 2nd part of the condition
81             if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
82         // end of the MAXCONDLEN length condition
83         } else if (p == c.conds + MAXCONDLEN) return NULL;
84 	return *p ? p : NULL;
85     }
86     return NULL;
87 }
88 
test_condition(const char * st)89 inline int PfxEntry::test_condition(const char * st)
90 {
91     const char * pos = NULL; // group with pos input position
92     bool neg = false;        // complementer
93     bool ingroup = false;    // character in the group
94     if (numconds == 0) return 1;
95     char * p = c.conds;
96     while (1) {
97       switch (*p) {
98         case '\0': return 1;
99         case '[': {
100                 neg = false;
101                 ingroup = false;
102                 p = nextchar(p);
103                 pos = st; break;
104             }
105         case '^': { p = nextchar(p); neg = true; break; }
106         case ']': {
107                 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
108                 pos = NULL;
109                 p = nextchar(p);
110                 // skip the next character
111                 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
112                 if (*st == '\0' && p) return 0; // word <= condition
113                 break;
114             }
115          case '.':
116             if (!pos) { // dots are not metacharacters in groups: [.]
117                 p = nextchar(p);
118                 // skip the next character
119                 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
120                 if (*st == '\0' && p) return 0; // word <= condition
121                 break;
122             }
123             /* FALLTHROUGH */
124     default: {
125                 if (*st == *p) {
126                     st++;
127                     p = nextchar(p);
128                     if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
129                         while (p && (*p & 0xc0) == 0x80) {       // character
130                             if (*p != *st) {
131                                 if (!pos) return 0;
132                                 st = pos;
133                                 break;
134                             }
135                             p = nextchar(p);
136                             st++;
137                         }
138                         if (pos && st != pos) {
139                             ingroup = true;
140                             while (p && *p != ']' && ((p = nextchar(p)) != NULL));
141                         }
142                     } else if (pos) {
143                         ingroup = true;
144                         while (p && *p != ']' && ((p = nextchar(p)) != NULL));
145                     }
146                 } else if (pos) { // group
147                     p = nextchar(p);
148                 } else return 0;
149             }
150       }
151       if (!p) return 1;
152     }
153 }
154 
155 // check if this prefix entry matches
checkword(const char * word,int len,char in_compound,const FLAG needflag)156 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
157 {
158     int                 tmpl;   // length of tmpword
159     struct hentry *     he;     // hash entry of root word or NULL
160     char                tmpword[MAXTEMPWORDLEN];
161 
162     // on entry prefix is 0 length or already matches the beginning of the word.
163     // So if the remaining root word has positive length
164     // and if there are enough chars in root word and added back strip chars
165     // to meet the number of characters conditions, then test it
166 
167      tmpl = len - appndl;
168 
169      if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
170 
171             // generate new root word by removing prefix and adding
172             // back any characters that would have been stripped
173 
174             if (stripl) {
175                 strncpy(tmpword, strip, MAXTEMPWORDLEN-1);
176                 tmpword[MAXTEMPWORDLEN-1] = '\0';
177             }
178             strcpy ((tmpword + stripl), (word + appndl));
179 
180             // now make sure all of the conditions on characters
181             // are met.  Please see the appendix at the end of
182             // this file for more info on exactly what is being
183             // tested
184 
185             // if all conditions are met then check if resulting
186             // root word in the dictionary
187 
188             if (test_condition(tmpword)) {
189                 tmpl += stripl;
190                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
191                    do {
192                       if (TESTAFF(he->astr, aflag, he->alen) &&
193                         // forbid single prefixes with needaffix flag
194                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
195                         // needflag
196                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
197                          (contclass && TESTAFF(contclass, needflag, contclasslen))))
198                             return he;
199                       he = he->next_homonym; // check homonyms
200                    } while (he);
201                 }
202 
203                 // prefix matched but no root word was found
204                 // if aeXPRODUCT is allowed, try again but now
205                 // ross checked combined with a suffix
206 
207                 //if ((opts & aeXPRODUCT) && in_compound) {
208                 if ((opts & aeXPRODUCT)) {
209                    he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
210                         0, NULL, FLAG_NULL, needflag, in_compound);
211                    if (he) return he;
212                 }
213             }
214      }
215     return NULL;
216 }
217 
218 // check if this prefix entry matches
check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)219 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
220     char in_compound, const FLAG needflag)
221 {
222     int                 tmpl;   // length of tmpword
223     struct hentry *     he;     // hash entry of root word or NULL
224     char                tmpword[MAXTEMPWORDLEN];
225 
226     // on entry prefix is 0 length or already matches the beginning of the word.
227     // So if the remaining root word has positive length
228     // and if there are enough chars in root word and added back strip chars
229     // to meet the number of characters conditions, then test it
230 
231      tmpl = len - appndl;
232 
233      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
234         (tmpl + stripl >= numconds)) {
235 
236             // generate new root word by removing prefix and adding
237             // back any characters that would have been stripped
238 
239             if (stripl) {
240                 strncpy(tmpword, strip, MAXTEMPWORDLEN-1);
241                 tmpword[MAXTEMPWORDLEN-1] = '\0';
242             }
243             strcpy ((tmpword + stripl), (word + appndl));
244 
245             // now make sure all of the conditions on characters
246             // are met.  Please see the appendix at the end of
247             // this file for more info on exactly what is being
248             // tested
249 
250             // if all conditions are met then check if resulting
251             // root word in the dictionary
252 
253             if (test_condition(tmpword)) {
254                 tmpl += stripl;
255 
256                 // prefix matched but no root word was found
257                 // if aeXPRODUCT is allowed, try again but now
258                 // cross checked combined with a suffix
259 
260                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
261                    he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
262                    if (he) return he;
263                 }
264             }
265      }
266     return NULL;
267 }
268 
269 // check if this prefix entry matches
check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)270 char * PfxEntry::check_twosfx_morph(const char * word, int len,
271          char in_compound, const FLAG needflag)
272 {
273     int                 tmpl;   // length of tmpword
274     char                tmpword[MAXTEMPWORDLEN];
275 
276     // on entry prefix is 0 length or already matches the beginning of the word.
277     // So if the remaining root word has positive length
278     // and if there are enough chars in root word and added back strip chars
279     // to meet the number of characters conditions, then test it
280 
281      tmpl = len - appndl;
282 
283      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
284         (tmpl + stripl >= numconds)) {
285 
286             // generate new root word by removing prefix and adding
287             // back any characters that would have been stripped
288 
289             if (stripl) {
290                 strncpy(tmpword, strip, MAXTEMPWORDLEN-1);
291                 tmpword[MAXTEMPWORDLEN-1] = '\0';
292             }
293             strcpy ((tmpword + stripl), (word + appndl));
294 
295             // now make sure all of the conditions on characters
296             // are met.  Please see the appendix at the end of
297             // this file for more info on exactly what is being
298             // tested
299 
300             // if all conditions are met then check if resulting
301             // root word in the dictionary
302 
303             if (test_condition(tmpword)) {
304                 tmpl += stripl;
305 
306                 // prefix matched but no root word was found
307                 // if aeXPRODUCT is allowed, try again but now
308                 // ross checked combined with a suffix
309 
310                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
311                     return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
312                              aeXPRODUCT, this, needflag);
313                 }
314             }
315      }
316     return NULL;
317 }
318 
319 // check if this prefix entry matches
check_morph(const char * word,int len,char in_compound,const FLAG needflag)320 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
321 {
322     int                 tmpl;   // length of tmpword
323     struct hentry *     he;     // hash entry of root word or NULL
324     char                tmpword[MAXTEMPWORDLEN];
325     char                result[MAXLNLEN];
326     char * st;
327 
328     *result = '\0';
329 
330     // on entry prefix is 0 length or already matches the beginning of the word.
331     // So if the remaining root word has positive length
332     // and if there are enough chars in root word and added back strip chars
333     // to meet the number of characters conditions, then test it
334 
335      tmpl = len - appndl;
336 
337      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
338         (tmpl + stripl >= numconds)) {
339 
340             // generate new root word by removing prefix and adding
341             // back any characters that would have been stripped
342 
343             if (stripl) {
344                 strncpy(tmpword, strip, MAXTEMPWORDLEN-1);
345                 tmpword[MAXTEMPWORDLEN-1] = '\0';
346             }
347             strcpy ((tmpword + stripl), (word + appndl));
348 
349             // now make sure all of the conditions on characters
350             // are met.  Please see the appendix at the end of
351             // this file for more info on exactly what is being
352             // tested
353 
354             // if all conditions are met then check if resulting
355             // root word in the dictionary
356 
357             if (test_condition(tmpword)) {
358                 tmpl += stripl;
359                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
360                     do {
361                       if (TESTAFF(he->astr, aflag, he->alen) &&
362                         // forbid single prefixes with needaffix flag
363                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
364                         // needflag
365                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
366                          (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
367                             if (morphcode) {
368                                 mystrcat(result, " ", MAXLNLEN);
369                                 mystrcat(result, morphcode, MAXLNLEN);
370                             } else mystrcat(result,getKey(), MAXLNLEN);
371                             if (!HENTRY_FIND(he, MORPH_STEM)) {
372                                 mystrcat(result, " ", MAXLNLEN);
373                                 mystrcat(result, MORPH_STEM, MAXLNLEN);
374                                 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
375                             }
376                             // store the pointer of the hash entry
377                             if (HENTRY_DATA(he)) {
378                                 mystrcat(result, " ", MAXLNLEN);
379                                 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
380                             } else {
381                                 // return with debug information
382                                 char * flag = pmyMgr->encode_flag(getFlag());
383                                 mystrcat(result, " ", MAXLNLEN);
384                                 mystrcat(result, MORPH_FLAG, MAXLNLEN);
385                                 mystrcat(result, flag, MAXLNLEN);
386                                 free(flag);
387                             }
388                             mystrcat(result, "\n", MAXLNLEN);
389                       }
390                       he = he->next_homonym;
391                     } while (he);
392                 }
393 
394                 // prefix matched but no root word was found
395                 // if aeXPRODUCT is allowed, try again but now
396                 // ross checked combined with a suffix
397 
398                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
399                    st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
400                      FLAG_NULL, needflag);
401                    if (st) {
402                         mystrcat(result, st, MAXLNLEN);
403                         free(st);
404                    }
405                 }
406             }
407      }
408 
409     if (*result) return mystrdup(result);
410     return NULL;
411 }
412 
SfxEntry(AffixMgr * pmgr,affentry * dp)413 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
414     : pmyMgr(pmgr) // register affix manager
415     , next(NULL)
416     , nexteq(NULL)
417     , nextne(NULL)
418     , flgnxt(NULL)
419     , l_morph(NULL)
420     , r_morph(NULL)
421     , eq_morph(NULL)
422 {
423   // set up its initial values
424   aflag = dp->aflag;         // char flag
425   strip = dp->strip;         // string to strip
426   appnd = dp->appnd;         // string to append
427   stripl = dp->stripl;       // length of strip string
428   appndl = dp->appndl;       // length of append string
429   numconds = dp->numconds;   // length of the condition
430   opts = dp->opts;           // cross product flag
431 
432   // then copy over all of the conditions
433   if (opts & aeLONGCOND) {
434     memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
435     c.l.conds2 = dp->c.l.conds2;
436   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
437   rappnd = myrevstrdup(appnd);
438   morphcode = dp->morphcode;
439   contclass = dp->contclass;
440   contclasslen = dp->contclasslen;
441 }
442 
443 
~SfxEntry()444 SfxEntry::~SfxEntry()
445 {
446     aflag = 0;
447     if (appnd) free(appnd);
448     if (rappnd) free(rappnd);
449     if (strip) free(strip);
450     pmyMgr = NULL;
451     appnd = NULL;
452     strip = NULL;
453     if (opts & aeLONGCOND) free(c.l.conds2);
454     if (morphcode && !(opts & aeALIASM)) free(morphcode);
455     if (contclass && !(opts & aeALIASF)) free(contclass);
456 }
457 
458 // add suffix to this word assuming conditions hold
add(const char * word,int len)459 char * SfxEntry::add(const char * word, int len)
460 {
461     char tword[MAXTEMPWORDLEN];
462 
463      /* make sure all conditions match */
464      if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
465         (len >= numconds) && test_condition(word + len, word) &&
466         (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
467         ((MAXTEMPWORDLEN) > (len + appndl - stripl))) {
468               /* we have a match so add suffix */
469               strncpy(tword, word, MAXTEMPWORDLEN-1);
470               tword[MAXTEMPWORDLEN-1] = '\0';
471               if (appndl) {
472                   strcpy(tword + len - stripl, appnd);
473               } else {
474                   *(tword + len - stripl) = '\0';
475               }
476               return mystrdup(tword);
477      }
478      return NULL;
479 }
480 
nextchar(char * p)481 inline char * SfxEntry::nextchar(char * p) {
482     if (p) {
483 	p++;
484 	if (opts & aeLONGCOND) {
485     	    // jump to the 2nd part of the condition
486     	    if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
487 	// end of the MAXCONDLEN length condition
488 	} else if (p == c.conds + MAXCONDLEN) return NULL;
489 	return *p ? p : NULL;
490     }
491     return NULL;
492 }
493 
test_condition(const char * st,const char * beg)494 inline int SfxEntry::test_condition(const char * st, const char * beg)
495 {
496     const char * pos = NULL;    // group with pos input position
497     bool neg = false;           // complementer
498     bool ingroup = false;       // character in the group
499     if (numconds == 0) return 1;
500     char * p = c.conds;
501     st--;
502     int i = 1;
503     while (1) {
504       switch (*p) {
505         case '\0':
506             return 1;
507         case '[':
508             p = nextchar(p);
509             pos = st;
510             break;
511         case '^':
512             p = nextchar(p);
513             neg = true;
514             break;
515         case ']':
516             if (!neg && !ingroup)
517               return 0;
518             i++;
519             // skip the next character
520             if (!ingroup)
521             {
522                 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
523                 st--;
524             }
525             pos = NULL;
526             neg = false;
527             ingroup = false;
528             p = nextchar(p);
529             if (st < beg && p)
530                 return 0; // word <= condition
531             break;
532         case '.':
533             if (!pos)
534             {
535                 // dots are not metacharacters in groups: [.]
536                 p = nextchar(p);
537                 // skip the next character
538                 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
539                 if (st < beg) { // word <= condition
540 		    if (p) return 0; else return 1;
541 		}
542                 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
543                     st--;
544                     if (st < beg) { // word <= condition
545 			if (p) return 0; else return 1;
546 		    }
547                 }
548                 break;
549             }
550             /* FALLTHROUGH */
551     default: {
552                 if (*st == *p) {
553                     p = nextchar(p);
554                     if ((opts & aeUTF8) && (*st & 0x80)) {
555                         st--;
556                         while (p && (st >= beg)) {
557                             if (*p != *st) {
558                                 if (!pos) return 0;
559                                 st = pos;
560                                 break;
561                             }
562                             // first byte of the UTF-8 multibyte character
563                             if ((*p & 0xc0) != 0x80) break;
564                             p = nextchar(p);
565                             st--;
566                         }
567                         if (pos && st != pos) {
568                             if (neg) return 0;
569                             else if (i == numconds) return 1;
570                             ingroup = true;
571                             while (p && *p != ']' && ((p = nextchar(p)) != NULL));
572 			    st--;
573                         }
574                         if (p && *p != ']') p = nextchar(p);
575                     } else if (pos) {
576                         if (neg) return 0;
577                         else if (i == numconds) return 1;
578                         ingroup = true;
579 			while (p && *p != ']' && ((p = nextchar(p)) != NULL));
580 //			if (p && *p != ']') p = nextchar(p);
581                         st--;
582                     }
583                     if (!pos) {
584                         i++;
585                         st--;
586                     }
587                     if (st < beg && p && *p != ']') return 0; // word <= condition
588                 } else if (pos) { // group
589                     p = nextchar(p);
590                 } else return 0;
591             }
592       }
593       if (!p) return 1;
594     }
595 }
596 
597 // see if this suffix is present in the word
checkword(const char * word,int len,int optflags,PfxEntry * ppfx,char ** wlst,int maxSug,int * ns,const FLAG cclass,const FLAG needflag,const FLAG badflag)598 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
599     PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
600     const FLAG badflag)
601 {
602     int                 tmpl;            // length of tmpword
603     struct hentry *     he;              // hash entry pointer
604     unsigned char *     cp;
605     char                tmpword[MAXTEMPWORDLEN];
606     PfxEntry* ep = ppfx;
607 
608     // if this suffix is being cross checked with a prefix
609     // but it does not support cross products skip it
610 
611     if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
612         return NULL;
613 
614     // upon entry suffix is 0 length or already matches the end of the word.
615     // So if the remaining root word has positive length
616     // and if there are enough chars in root word and added back strip chars
617     // to meet the number of characters conditions, then test it
618 
619     tmpl = len - appndl;
620     // the second condition is not enough for UTF-8 strings
621     // it checked in test_condition()
622 
623     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
624         (tmpl + stripl >= numconds)) {
625 
626             // generate new root word by removing suffix and adding
627             // back any characters that would have been stripped or
628             // or null terminating the shorter string
629 
630             strncpy (tmpword, word, MAXTEMPWORDLEN-1);
631             tmpword[MAXTEMPWORDLEN-1] = '\0';
632             cp = (unsigned char *)(tmpword + tmpl);
633             if (stripl) {
634                 strcpy ((char *)cp, strip);
635                 tmpl += stripl;
636                 cp = (unsigned char *)(tmpword + tmpl);
637             } else *cp = '\0';
638 
639             // now make sure all of the conditions on characters
640             // are met.  Please see the appendix at the end of
641             // this file for more info on exactly what is being
642             // tested
643 
644             // if all conditions are met then check if resulting
645             // root word in the dictionary
646 
647             if (test_condition((char *) cp, (char *) tmpword)) {
648 
649 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
650                 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
651 #endif
652                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
653                     do {
654                         // check conditional suffix (enabled by prefix)
655                         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
656                                     TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
657                             (((optflags & aeXPRODUCT) == 0) ||
658                             (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
659                              // enabled by prefix
660                             ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
661                             ) &&
662                             // handle cont. class
663                             ((!cclass) ||
664                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
665                             ) &&
666                             // check only in compound homonyms (bad flags)
667                             (!badflag || !TESTAFF(he->astr, badflag, he->alen)
668                             ) &&
669                             // handle required flag
670                             ((!needflag) ||
671                               (TESTAFF(he->astr, needflag, he->alen) ||
672                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
673                             )
674                         ) return he;
675                         he = he->next_homonym; // check homonyms
676                     } while (he);
677 
678                 // obsolote stemming code (used only by the
679                 // experimental SuffixMgr:suggest_pos_stems)
680                 // store resulting root in wlst
681                 } else if (wlst && (*ns < maxSug)) {
682                     int cwrd = 1;
683                     for (int k=0; k < *ns; k++)
684                         if (strcmp(tmpword, wlst[k]) == 0) {
685                            cwrd = 0;
686                            break;
687                         }
688                     if (cwrd) {
689                         wlst[*ns] = mystrdup(tmpword);
690                         if (wlst[*ns] == NULL) {
691                             for (int j=0; j<*ns; j++) free(wlst[j]);
692                             *ns = -1;
693                             return NULL;
694                         }
695                         (*ns)++;
696                     }
697                 }
698             }
699     }
700     return NULL;
701 }
702 
703 // see if two-level suffix is present in the word
check_twosfx(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)704 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
705     PfxEntry* ppfx, const FLAG needflag)
706 {
707     int                 tmpl;            // length of tmpword
708     struct hentry *     he;              // hash entry pointer
709     unsigned char *     cp;
710     char                tmpword[MAXTEMPWORDLEN];
711     PfxEntry* ep = ppfx;
712 
713 
714     // if this suffix is being cross checked with a prefix
715     // but it does not support cross products skip it
716 
717     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
718         return NULL;
719 
720     // upon entry suffix is 0 length or already matches the end of the word.
721     // So if the remaining root word has positive length
722     // and if there are enough chars in root word and added back strip chars
723     // to meet the number of characters conditions, then test it
724 
725     tmpl = len - appndl;
726 
727     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
728        (tmpl + stripl >= numconds)) {
729 
730             // generate new root word by removing suffix and adding
731             // back any characters that would have been stripped or
732             // or null terminating the shorter string
733 
734             strncpy(tmpword, word, MAXTEMPWORDLEN-1);
735             tmpword[MAXTEMPWORDLEN-1] = '\0';
736             cp = (unsigned char *)(tmpword + tmpl);
737             if (stripl) {
738                 strcpy ((char *)cp, strip);
739                 tmpl += stripl;
740                 cp = (unsigned char *)(tmpword + tmpl);
741             } else *cp = '\0';
742 
743             // now make sure all of the conditions on characters
744             // are met.  Please see the appendix at the end of
745             // this file for more info on exactly what is being
746             // tested
747 
748             // if all conditions are met then recall suffix_check
749 
750             if (test_condition((char *) cp, (char *) tmpword)) {
751                 if (ppfx) {
752                     // handle conditional suffix
753                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
754                         he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
755                     else
756                         he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
757                 } else {
758                     he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
759                 }
760                 if (he) return he;
761             }
762     }
763     return NULL;
764 }
765 
766 // see if two-level suffix is present in the word
check_twosfx_morph(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)767 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
768     PfxEntry* ppfx, const FLAG needflag)
769 {
770     int                 tmpl;            // length of tmpword
771     unsigned char *     cp;
772     char                tmpword[MAXTEMPWORDLEN];
773     PfxEntry* ep = ppfx;
774     char * st;
775 
776     char result[MAXLNLEN];
777 
778     *result = '\0';
779 
780     // if this suffix is being cross checked with a prefix
781     // but it does not support cross products skip it
782 
783     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
784         return NULL;
785 
786     // upon entry suffix is 0 length or already matches the end of the word.
787     // So if the remaining root word has positive length
788     // and if there are enough chars in root word and added back strip chars
789     // to meet the number of characters conditions, then test it
790 
791     tmpl = len - appndl;
792 
793     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
794        (tmpl + stripl >= numconds)) {
795 
796             // generate new root word by removing suffix and adding
797             // back any characters that would have been stripped or
798             // or null terminating the shorter string
799 
800             strncpy(tmpword, word, MAXTEMPWORDLEN-1);
801             tmpword[MAXTEMPWORDLEN-1] = '\0';
802             cp = (unsigned char *)(tmpword + tmpl);
803             if (stripl) {
804                 strcpy ((char *)cp, strip);
805                 tmpl += stripl;
806                 cp = (unsigned char *)(tmpword + tmpl);
807             } else *cp = '\0';
808 
809             // now make sure all of the conditions on characters
810             // are met.  Please see the appendix at the end of
811             // this file for more info on exactly what is being
812             // tested
813 
814             // if all conditions are met then recall suffix_check
815 
816             if (test_condition((char *) cp, (char *) tmpword)) {
817                 if (ppfx) {
818                     // handle conditional suffix
819                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
820                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
821                         if (st) {
822                             if (ppfx->getMorph()) {
823                                 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
824                                 mystrcat(result, " ", MAXLNLEN);
825                             }
826                             mystrcat(result,st, MAXLNLEN);
827                             free(st);
828                             mychomp(result);
829                         }
830                     } else {
831                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
832                         if (st) {
833                             mystrcat(result, st, MAXLNLEN);
834                             free(st);
835                             mychomp(result);
836                         }
837                     }
838                 } else {
839                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
840                         if (st) {
841                             mystrcat(result, st, MAXLNLEN);
842                             free(st);
843                             mychomp(result);
844                         }
845                 }
846                 if (*result) return mystrdup(result);
847             }
848     }
849     return NULL;
850 }
851 
852 // get next homonym with same affix
get_next_homonym(struct hentry * he,int optflags,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag)853 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
854     const FLAG cclass, const FLAG needflag)
855 {
856     PfxEntry* ep = ppfx;
857     FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
858 
859     while (he->next_homonym) {
860         he = he->next_homonym;
861         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
862                             ((optflags & aeXPRODUCT) == 0 ||
863                             TESTAFF(he->astr, eFlag, he->alen) ||
864                              // handle conditional suffix
865                             ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
866                             ) &&
867                             // handle cont. class
868                             ((!cclass) ||
869                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
870                             ) &&
871                             // handle required flag
872                             ((!needflag) ||
873                               (TESTAFF(he->astr, needflag, he->alen) ||
874                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
875                             )
876                         ) return he;
877     }
878     return NULL;
879 }
880 
881 
882 #if 0
883 
884 Appendix:  Understanding Affix Code
885 
886 
887 An affix is either a  prefix or a suffix attached to root words to make
888 other words.
889 
Basically a Prefix or a Suffix is a set of AffEntry objects
which store information about the prefix or suffix along
with supporting routines to check if a word has a particular
prefix or suffix or a combination.
894 
895 The structure affentry is defined as follows:
896 
897 struct affentry
898 {
899    unsigned short aflag;    // ID used to represent the affix
900    char * strip;            // string to strip before adding affix
901    char * appnd;            // the affix string to add
902    unsigned char stripl;    // length of the strip string
903    unsigned char appndl;    // length of the affix string
904    char numconds;           // the number of conditions that must be met
905    char opts;               // flag: aeXPRODUCT- combine both prefix and suffix
906    char   conds[SETSIZE];   // array which encodes the conditions to be met
907 };
908 
909 
910 Here is a suffix borrowed from the en_US.aff file.  This file
911 is whitespace delimited.
912 
913 SFX D Y 4
914 SFX D   0     e          d
915 SFX D   y     ied        [^aeiou]y
916 SFX D   0     ed         [^ey]
917 SFX D   0     ed         [aeiou]y
918 
919 This information can be interpreted as follows:
920 
The first line has 4 fields
922 
923 Field
924 -----
925 1     SFX - indicates this is a suffix
926 2     D   - is the name of the character flag which represents this suffix
927 3     Y   - indicates it can be combined with prefixes (cross product)
4     4   - indicates that a sequence of 4 affentry structures is needed to
               properly store the affix information
930 
931 The remaining lines describe the unique information for the 4 SfxEntry
932 objects that make up this affix.  Each line can be interpreted
933 as follows: (note fields 1 and 2 are as a check against line 1 info)
934 
935 Field
936 -----
937 1     SFX         - indicates this is a suffix
938 2     D           - is the name of the character flag for this affix
939 3     y           - the string of chars to strip off before adding affix
940                          (a 0 here indicates the NULL string)
941 4     ied         - the string of affix characters to add
942 5     [^aeiou]y   - the conditions which must be met before the affix
943                     can be applied
944 
945 Field 5 is interesting.  Since this is a suffix, field 5 tells us that
946 there are 2 conditions that must be met.  The first condition is that
947 the next to the last character in the word must *NOT* be any of the
948 following "a", "e", "i", "o" or "u".  The second condition is that
949 the last character of the word must end in "y".
950 
So how can we encode this information concisely and be able to
test for both conditions in a fast manner?  The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et al.
(now available under a normal BSD license).
955 
956 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
957 using a character (cast to an unsigned char) of a string, we have 8 bits
958 of information we can store about that character.  Specifically we
959 could use each bit to say if that character is allowed in any of the
960 last (or first for prefixes) 8 characters of the word.
961 
Basically, each character at one end of the word (up to the number
of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
specific character position in the word.
966 
967 For prefixes, it does this by setting bit 0 if that char is valid
968 in the first position, bit 1 if valid in the second position, and so on.
969 
If a bit is not set, then that char is not valid for that position in the
word.
972 
973 If working with suffixes bit 0 is used for the character closest
974 to the front, bit 1 for the next character towards the end, ...,
975 with bit numconds-1 representing the last char at the end of the string.
976 
Note: since entries in the conds[] are 8 bits, only 8 conditions
(read that only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).
980 
981 So to make this clearer, lets encode the conds array values for the
982 first two affentries for the suffix D described earlier.
983 
984 
985   For the first affentry:
986      numconds = 1             (only examine the last character)
987 
988      conds['e'] =  (1 << 0)   (the word must end in an E)
989      all others are all 0
990 
991   For the second affentry:
992      numconds = 2             (only examine the last two characters)
993 
994      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
995          where X is all characters *but* a, e, i, o, or u
996 
997 
998      conds['y'] = (1 << 1)     (the last char must be a y)
999      all other bits for all other entries in the conds array are zero
1000 
1001 
1002 #endif
1003 
1004