1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 /*
38  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39  * And Contributors.  All rights reserved.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  *
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  *
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  *
52  * 3. All modifications to the source code must be clearly marked as
53  *    such.  Binary redistributions based on modified source code
54  *    must be clearly marked as modified versions in the documentation
55  *    and/or other materials provided with the distribution.
56  *
57  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
61  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  */
70 
71 #include <stdlib.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <ctype.h>
75 
76 #include "affentry.hxx"
77 #include "csutil.hxx"
78 
~AffEntry()79 AffEntry::~AffEntry() {
80   if (opts & aeLONGCOND)
81     free(c.l.conds2);
82   if (morphcode && !(opts & aeALIASM))
83     free(morphcode);
84   if (contclass && !(opts & aeALIASF))
85     free(contclass);
86 }
87 
PfxEntry(AffixMgr * pmgr)88 PfxEntry::PfxEntry(AffixMgr* pmgr)
89     // register affix manager
90     : pmyMgr(pmgr),
91       next(NULL),
92       nexteq(NULL),
93       nextne(NULL),
94       flgnxt(NULL) {
95 }
96 
97 // add prefix to this word assuming conditions hold
add(const char * word,size_t len)98 std::string PfxEntry::add(const char* word, size_t len) {
99   std::string result;
100   if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
101       (len >= numconds) && test_condition(word) &&
102       (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) {
103     /* we have a match so add prefix */
104     result.assign(appnd);
105     result.append(word + strip.size());
106   }
107   return result;
108 }
109 
nextchar(char * p)110 inline char* PfxEntry::nextchar(char* p) {
111   if (p) {
112     p++;
113     if (opts & aeLONGCOND) {
114       // jump to the 2nd part of the condition
115       if (p == c.conds + MAXCONDLEN_1)
116         return c.l.conds2;
117       // end of the MAXCONDLEN length condition
118     } else if (p == c.conds + MAXCONDLEN)
119       return NULL;
120     return *p ? p : NULL;
121   }
122   return NULL;
123 }
124 
test_condition(const char * st)125 inline int PfxEntry::test_condition(const char* st) {
126   const char* pos = NULL;  // group with pos input position
127   bool neg = false;        // complementer
128   bool ingroup = false;    // character in the group
129   if (numconds == 0)
130     return 1;
131   char* p = c.conds;
132   while (1) {
133     switch (*p) {
134       case '\0':
135         return 1;
136       case '[': {
137         neg = false;
138         ingroup = false;
139         p = nextchar(p);
140         pos = st;
141         break;
142       }
143       case '^': {
144         p = nextchar(p);
145         neg = true;
146         break;
147       }
148       case ']': {
149         if ((neg && ingroup) || (!neg && !ingroup))
150           return 0;
151         pos = NULL;
152         p = nextchar(p);
153         // skip the next character
154         if (!ingroup && *st)
155           for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
156             ;
157         if (*st == '\0' && p)
158           return 0;  // word <= condition
159         break;
160       }
161       case '.':
162         if (!pos) {  // dots are not metacharacters in groups: [.]
163           p = nextchar(p);
164           // skip the next character
165           for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
166             ;
167           if (*st == '\0' && p)
168             return 0;  // word <= condition
169           break;
170         }
171       /* FALLTHROUGH */
172       default: {
173         if (*st == *p) {
174           st++;
175           p = nextchar(p);
176           if ((opts & aeUTF8) && (*(st - 1) & 0x80)) {  // multibyte
177             while (p && (*p & 0xc0) == 0x80) {          // character
178               if (*p != *st) {
179                 if (!pos)
180                   return 0;
181                 st = pos;
182                 break;
183               }
184               p = nextchar(p);
185               st++;
186             }
187             if (pos && st != pos) {
188               ingroup = true;
189               while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
190               }
191             }
192           } else if (pos) {
193             ingroup = true;
194             while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
195             }
196           }
197         } else if (pos) {  // group
198           p = nextchar(p);
199         } else
200           return 0;
201       }
202     }
203     if (!p)
204       return 1;
205   }
206 }
207 
208 // check if this prefix entry matches
checkword(const char * word,int len,char in_compound,const FLAG needflag)209 struct hentry* PfxEntry::checkword(const char* word,
210                                    int len,
211                                    char in_compound,
212                                    const FLAG needflag) {
213   struct hentry* he;  // hash entry of root word or NULL
214 
215   // on entry prefix is 0 length or already matches the beginning of the word.
216   // So if the remaining root word has positive length
217   // and if there are enough chars in root word and added back strip chars
218   // to meet the number of characters conditions, then test it
219 
220   int tmpl = len - appnd.size(); // length of tmpword
221 
222   if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
223     // generate new root word by removing prefix and adding
224     // back any characters that would have been stripped
225 
226     std::string tmpword(strip);
227     tmpword.append(word + appnd.size());
228 
229     // now make sure all of the conditions on characters
230     // are met.  Please see the appendix at the end of
231     // this file for more info on exactly what is being
232     // tested
233 
234     // if all conditions are met then check if resulting
235     // root word in the dictionary
236 
237     if (test_condition(tmpword.c_str())) {
238       tmpl += strip.size();
239       if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
240         do {
241           if (TESTAFF(he->astr, aflag, he->alen) &&
242               // forbid single prefixes with needaffix flag
243               !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
244               // needflag
245               ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
246                (contclass && TESTAFF(contclass, needflag, contclasslen))))
247             return he;
248           he = he->next_homonym;  // check homonyms
249         } while (he);
250       }
251 
252       // prefix matched but no root word was found
253       // if aeXPRODUCT is allowed, try again but now
254       // ross checked combined with a suffix
255 
256       // if ((opts & aeXPRODUCT) && in_compound) {
257       if ((opts & aeXPRODUCT)) {
258         he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this,
259                                   FLAG_NULL, needflag, in_compound);
260         if (he)
261           return he;
262       }
263     }
264   }
265   return NULL;
266 }
267 
268 // check if this prefix entry matches
check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)269 struct hentry* PfxEntry::check_twosfx(const char* word,
270                                       int len,
271                                       char in_compound,
272                                       const FLAG needflag) {
273   // on entry prefix is 0 length or already matches the beginning of the word.
274   // So if the remaining root word has positive length
275   // and if there are enough chars in root word and added back strip chars
276   // to meet the number of characters conditions, then test it
277 
278   int tmpl = len - appnd.size(); // length of tmpword
279 
280   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
281       (tmpl + strip.size() >= numconds)) {
282     // generate new root word by removing prefix and adding
283     // back any characters that would have been stripped
284 
285     std::string tmpword(strip);
286     tmpword.append(word + appnd.size());
287 
288     // now make sure all of the conditions on characters
289     // are met.  Please see the appendix at the end of
290     // this file for more info on exactly what is being
291     // tested
292 
293     // if all conditions are met then check if resulting
294     // root word in the dictionary
295 
296     if (test_condition(tmpword.c_str())) {
297       tmpl += strip.size();
298 
299       // prefix matched but no root word was found
300       // if aeXPRODUCT is allowed, try again but now
301       // cross checked combined with a suffix
302 
303       if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
304         // hash entry of root word or NULL
305         struct hentry* he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this,
306                                                         needflag);
307         if (he)
308           return he;
309       }
310     }
311   }
312   return NULL;
313 }
314 
315 // check if this prefix entry matches
check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)316 std::string PfxEntry::check_twosfx_morph(const char* word,
317                                          int len,
318                                          char in_compound,
319                                          const FLAG needflag) {
320   std::string result;
321   // on entry prefix is 0 length or already matches the beginning of the word.
322   // So if the remaining root word has positive length
323   // and if there are enough chars in root word and added back strip chars
324   // to meet the number of characters conditions, then test it
325   int tmpl = len - appnd.size(); // length of tmpword
326 
327   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
328       (tmpl + strip.size() >= numconds)) {
329     // generate new root word by removing prefix and adding
330     // back any characters that would have been stripped
331 
332     std::string tmpword(strip);
333     tmpword.append(word + appnd.size());
334 
335     // now make sure all of the conditions on characters
336     // are met.  Please see the appendix at the end of
337     // this file for more info on exactly what is being
338     // tested
339 
340     // if all conditions are met then check if resulting
341     // root word in the dictionary
342 
343     if (test_condition(tmpword.c_str())) {
344       tmpl += strip.size();
345 
346       // prefix matched but no root word was found
347       // if aeXPRODUCT is allowed, try again but now
348       // ross checked combined with a suffix
349 
350       if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
351         result = pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl,
352                                                    aeXPRODUCT,
353                                                    this, needflag);
354       }
355     }
356   }
357   return result;
358 }
359 
360 // check if this prefix entry matches
check_morph(const char * word,int len,char in_compound,const FLAG needflag)361 std::string PfxEntry::check_morph(const char* word,
362                                   int len,
363                                   char in_compound,
364                                   const FLAG needflag) {
365   std::string result;
366 
367   // on entry prefix is 0 length or already matches the beginning of the word.
368   // So if the remaining root word has positive length
369   // and if there are enough chars in root word and added back strip chars
370   // to meet the number of characters conditions, then test it
371 
372   int tmpl = len - appnd.size(); // length of tmpword
373 
374   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
375       (tmpl + strip.size() >= numconds)) {
376     // generate new root word by removing prefix and adding
377     // back any characters that would have been stripped
378 
379     std::string tmpword(strip);
380     tmpword.append(word + appnd.size());
381 
382     // now make sure all of the conditions on characters
383     // are met.  Please see the appendix at the end of
384     // this file for more info on exactly what is being
385     // tested
386 
387     // if all conditions are met then check if resulting
388     // root word in the dictionary
389 
390     if (test_condition(tmpword.c_str())) {
391       tmpl += strip.size();
392       struct hentry* he;  // hash entry of root word or NULL
393       if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
394         do {
395           if (TESTAFF(he->astr, aflag, he->alen) &&
396               // forbid single prefixes with needaffix flag
397               !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
398               // needflag
399               ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
400                (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
401             if (morphcode) {
402               result.append(" ");
403               result.append(morphcode);
404             } else
405               result.append(getKey());
406             if (!HENTRY_FIND(he, MORPH_STEM)) {
407               result.append(" ");
408               result.append(MORPH_STEM);
409               result.append(HENTRY_WORD(he));
410             }
411             // store the pointer of the hash entry
412             if (HENTRY_DATA(he)) {
413               result.append(" ");
414               result.append(HENTRY_DATA2(he));
415             } else {
416               // return with debug information
417               char* flag = pmyMgr->encode_flag(getFlag());
418               result.append(" ");
419               result.append(MORPH_FLAG);
420               result.append(flag);
421               free(flag);
422             }
423             result.append("\n");
424           }
425           he = he->next_homonym;
426         } while (he);
427       }
428 
429       // prefix matched but no root word was found
430       // if aeXPRODUCT is allowed, try again but now
431       // ross checked combined with a suffix
432 
433       if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
434         std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this,
435                                                     FLAG_NULL, needflag);
436         if (!st.empty()) {
437           result.append(st);
438         }
439       }
440     }
441   }
442 
443   return result;
444 }
445 
SfxEntry(AffixMgr * pmgr)446 SfxEntry::SfxEntry(AffixMgr* pmgr)
447     : pmyMgr(pmgr)  // register affix manager
448       ,
449       next(NULL),
450       nexteq(NULL),
451       nextne(NULL),
452       flgnxt(NULL),
453       l_morph(NULL),
454       r_morph(NULL),
455       eq_morph(NULL) {
456 }
457 
458 // add suffix to this word assuming conditions hold
add(const char * word,size_t len)459 std::string SfxEntry::add(const char* word, size_t len) {
460   std::string result;
461   /* make sure all conditions match */
462   if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
463       (len >= numconds) && test_condition(word + len, word) &&
464       (!strip.size() ||
465        (strcmp(word + len - strip.size(), strip.c_str()) == 0))) {
466     result.assign(word);
467     /* we have a match so add suffix */
468     result.replace(len - strip.size(), std::string::npos, appnd);
469   }
470   return result;
471 }
472 
nextchar(char * p)473 inline char* SfxEntry::nextchar(char* p) {
474   if (p) {
475     p++;
476     if (opts & aeLONGCOND) {
477       // jump to the 2nd part of the condition
478       if (p == c.l.conds1 + MAXCONDLEN_1)
479         return c.l.conds2;
480       // end of the MAXCONDLEN length condition
481     } else if (p == c.conds + MAXCONDLEN)
482       return NULL;
483     return *p ? p : NULL;
484   }
485   return NULL;
486 }
487 
test_condition(const char * st,const char * beg)488 inline int SfxEntry::test_condition(const char* st, const char* beg) {
489   const char* pos = NULL;  // group with pos input position
490   bool neg = false;        // complementer
491   bool ingroup = false;    // character in the group
492   if (numconds == 0)
493     return 1;
494   char* p = c.conds;
495   st--;
496   int i = 1;
497   while (1) {
498     switch (*p) {
499       case '\0':
500         return 1;
501       case '[':
502         p = nextchar(p);
503         pos = st;
504         break;
505       case '^':
506         p = nextchar(p);
507         neg = true;
508         break;
509       case ']':
510         if (!neg && !ingroup)
511           return 0;
512         i++;
513         // skip the next character
514         if (!ingroup) {
515           for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--)
516             ;
517           st--;
518         }
519         pos = NULL;
520         neg = false;
521         ingroup = false;
522         p = nextchar(p);
523         if (st < beg && p)
524           return 0;  // word <= condition
525         break;
526       case '.':
527         if (!pos) {
528           // dots are not metacharacters in groups: [.]
529           p = nextchar(p);
530           // skip the next character
531           for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80;
532                st--)
533             ;
534           if (st < beg) {  // word <= condition
535             if (p)
536               return 0;
537             else
538               return 1;
539           }
540           if ((opts & aeUTF8) && (*st & 0x80)) {  // head of the UTF-8 character
541             st--;
542             if (st < beg) {  // word <= condition
543               if (p)
544                 return 0;
545               else
546                 return 1;
547             }
548           }
549           break;
550         }
551       /* FALLTHROUGH */
552       default: {
553         if (*st == *p) {
554           p = nextchar(p);
555           if ((opts & aeUTF8) && (*st & 0x80)) {
556             st--;
557             while (p && (st >= beg)) {
558               if (*p != *st) {
559                 if (!pos)
560                   return 0;
561                 st = pos;
562                 break;
563               }
564               // first byte of the UTF-8 multibyte character
565               if ((*p & 0xc0) != 0x80)
566                 break;
567               p = nextchar(p);
568               st--;
569             }
570             if (pos && st != pos) {
571               if (neg)
572                 return 0;
573               else if (i == numconds)
574                 return 1;
575               ingroup = true;
576               while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
577               }
578               st--;
579             }
580             if (p && *p != ']')
581               p = nextchar(p);
582           } else if (pos) {
583             if (neg)
584               return 0;
585             else if (i == numconds)
586               return 1;
587             ingroup = true;
588             while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
589             }
590             //			if (p && *p != ']') p = nextchar(p);
591             st--;
592           }
593           if (!pos) {
594             i++;
595             st--;
596           }
597           if (st < beg && p && *p != ']')
598             return 0;      // word <= condition
599         } else if (pos) {  // group
600           p = nextchar(p);
601         } else
602           return 0;
603       }
604     }
605     if (!p)
606       return 1;
607   }
608 }
609 
610 // see if this suffix is present in the word
checkword(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,const FLAG badflag)611 struct hentry* SfxEntry::checkword(const char* word,
612                                    int len,
613                                    int optflags,
614                                    PfxEntry* ppfx,
615                                    const FLAG cclass,
616                                    const FLAG needflag,
617                                    const FLAG badflag) {
618   struct hentry* he;  // hash entry pointer
619   PfxEntry* ep = ppfx;
620 
621   // if this suffix is being cross checked with a prefix
622   // but it does not support cross products skip it
623 
624   if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
625     return NULL;
626 
627   // upon entry suffix is 0 length or already matches the end of the word.
628   // So if the remaining root word has positive length
629   // and if there are enough chars in root word and added back strip chars
630   // to meet the number of characters conditions, then test it
631 
632   int tmpl = len - appnd.size(); // length of tmpword
633   // the second condition is not enough for UTF-8 strings
634   // it checked in test_condition()
635 
636   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
637       (tmpl + strip.size() >= numconds)) {
638     // generate new root word by removing suffix and adding
639     // back any characters that would have been stripped or
640     // or null terminating the shorter string
641 
642     std::string tmpstring(word, tmpl);
643     if (strip.size()) {
644       tmpstring.append(strip);
645     }
646 
647     const char* tmpword = tmpstring.c_str();
648     const char* endword = tmpword + tmpstring.size();
649 
650     // now make sure all of the conditions on characters
651     // are met.  Please see the appendix at the end of
652     // this file for more info on exactly what is being
653     // tested
654 
655     // if all conditions are met then check if resulting
656     // root word in the dictionary
657 
658     if (test_condition(endword, tmpword)) {
659 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
660       fprintf(stdout, "%s %s %c\n", word, tmpword, aflag);
661 #endif
662       if ((he = pmyMgr->lookup(tmpword)) != NULL) {
663         do {
664           // check conditional suffix (enabled by prefix)
665           if ((TESTAFF(he->astr, aflag, he->alen) ||
666                (ep && ep->getCont() &&
667                 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
668               (((optflags & aeXPRODUCT) == 0) ||
669                (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
670                // enabled by prefix
671                ((contclass) &&
672                 (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))) &&
673               // handle cont. class
674               ((!cclass) ||
675                ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
676               // check only in compound homonyms (bad flags)
677               (!badflag || !TESTAFF(he->astr, badflag, he->alen)) &&
678               // handle required flag
679               ((!needflag) ||
680                (TESTAFF(he->astr, needflag, he->alen) ||
681                 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
682             return he;
683           he = he->next_homonym;  // check homonyms
684         } while (he);
685       }
686     }
687   }
688   return NULL;
689 }
690 
691 // see if two-level suffix is present in the word
check_twosfx(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)692 struct hentry* SfxEntry::check_twosfx(const char* word,
693                                       int len,
694                                       int optflags,
695                                       PfxEntry* ppfx,
696                                       const FLAG needflag) {
697   PfxEntry* ep = ppfx;
698 
699   // if this suffix is being cross checked with a prefix
700   // but it does not support cross products skip it
701 
702   if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
703     return NULL;
704 
705   // upon entry suffix is 0 length or already matches the end of the word.
706   // So if the remaining root word has positive length
707   // and if there are enough chars in root word and added back strip chars
708   // to meet the number of characters conditions, then test it
709 
710   int tmpl = len - appnd.size(); // length of tmpword
711 
712   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
713       (tmpl + strip.size() >= numconds)) {
714     // generate new root word by removing suffix and adding
715     // back any characters that would have been stripped or
716     // or null terminating the shorter string
717 
718     std::string tmpword(word);
719     tmpword.resize(tmpl);
720     tmpword.append(strip);
721     tmpl += strip.size();
722 
723     const char* beg = tmpword.c_str();
724     const char* end = beg + tmpl;
725 
726     // now make sure all of the conditions on characters
727     // are met.  Please see the appendix at the end of
728     // this file for more info on exactly what is being
729     // tested
730 
731     // if all conditions are met then recall suffix_check
732 
733     if (test_condition(end, beg)) {
734       struct hentry* he;  // hash entry pointer
735       if (ppfx) {
736         // handle conditional suffix
737         if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
738           he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
739                                     (FLAG)aflag, needflag, IN_CPD_NOT);
740         else
741           he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx,
742                                     (FLAG)aflag, needflag, IN_CPD_NOT);
743       } else {
744         he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
745                                   (FLAG)aflag, needflag, IN_CPD_NOT);
746       }
747       if (he)
748         return he;
749     }
750   }
751   return NULL;
752 }
753 
754 // see if two-level suffix is present in the word
check_twosfx_morph(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)755 std::string SfxEntry::check_twosfx_morph(const char* word,
756                                          int len,
757                                          int optflags,
758                                          PfxEntry* ppfx,
759                                          const FLAG needflag) {
760   PfxEntry* ep = ppfx;
761 
762   std::string result;
763 
764   // if this suffix is being cross checked with a prefix
765   // but it does not support cross products skip it
766 
767   if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
768     return result;
769 
770   // upon entry suffix is 0 length or already matches the end of the word.
771   // So if the remaining root word has positive length
772   // and if there are enough chars in root word and added back strip chars
773   // to meet the number of characters conditions, then test it
774 
775   int tmpl = len - appnd.size(); // length of tmpword
776 
777   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
778       (tmpl + strip.size() >= numconds)) {
779     // generate new root word by removing suffix and adding
780     // back any characters that would have been stripped or
781     // or null terminating the shorter string
782 
783     std::string tmpword(word);
784     tmpword.resize(tmpl);
785     tmpword.append(strip);
786     tmpl += strip.size();
787 
788     const char* beg = tmpword.c_str();
789     const char* end = beg + tmpl;
790 
791     // now make sure all of the conditions on characters
792     // are met.  Please see the appendix at the end of
793     // this file for more info on exactly what is being
794     // tested
795 
796     // if all conditions are met then recall suffix_check
797 
798     if (test_condition(end, beg)) {
799       if (ppfx) {
800         // handle conditional suffix
801         if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
802           std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag,
803                                                       needflag);
804           if (!st.empty()) {
805             if (ppfx->getMorph()) {
806               result.append(ppfx->getMorph());
807               result.append(" ");
808             }
809             result.append(st);
810             mychomp(result);
811           }
812         } else {
813           std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag,
814                                                       needflag);
815           if (!st.empty()) {
816             result.append(st);
817             mychomp(result);
818           }
819         }
820       } else {
821         std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag);
822         if (!st.empty()) {
823           result.append(st);
824           mychomp(result);
825         }
826       }
827     }
828   }
829   return result;
830 }
831 
832 // get next homonym with same affix
get_next_homonym(struct hentry * he,int optflags,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag)833 struct hentry* SfxEntry::get_next_homonym(struct hentry* he,
834                                           int optflags,
835                                           PfxEntry* ppfx,
836                                           const FLAG cclass,
837                                           const FLAG needflag) {
838   PfxEntry* ep = ppfx;
839   FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
840 
841   while (he->next_homonym) {
842     he = he->next_homonym;
843     if ((TESTAFF(he->astr, aflag, he->alen) ||
844          (ep && ep->getCont() &&
845           TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
846         ((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) ||
847          // handle conditional suffix
848          ((contclass) && TESTAFF(contclass, eFlag, contclasslen))) &&
849         // handle cont. class
850         ((!cclass) ||
851          ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
852         // handle required flag
853         ((!needflag) ||
854          (TESTAFF(he->astr, needflag, he->alen) ||
855           ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
856       return he;
857   }
858   return NULL;
859 }
860 
initReverseWord()861 void SfxEntry::initReverseWord() {
862   rappnd = appnd;
863   reverseword(rappnd);
864 }
865 
866 #if 0
867 
868 Appendix:  Understanding Affix Code
869 
870 
871 An affix is either a  prefix or a suffix attached to root words to make
872 other words.
873 
Basically a Prefix or a Suffix is a set of AffEntry objects
875 which store information about the prefix or suffix along
876 with supporting routines to check if a word has a particular
877 prefix or suffix or a combination.
878 
879 The structure affentry is defined as follows:
880 
881 struct affentry
882 {
883    unsigned short aflag;    // ID used to represent the affix
884    std::string strip;       // string to strip before adding affix
885    std::string appnd;       // the affix string to add
886    char numconds;           // the number of conditions that must be met
887    char opts;               // flag: aeXPRODUCT- combine both prefix and suffix
888    char   conds[SETSIZE];   // array which encodes the conditions to be met
889 };
890 
891 
892 Here is a suffix borrowed from the en_US.aff file.  This file
893 is whitespace delimited.
894 
895 SFX D Y 4
SFX D   0     d          e
897 SFX D   y     ied        [^aeiou]y
898 SFX D   0     ed         [^ey]
899 SFX D   0     ed         [aeiou]y
900 
901 This information can be interpreted as follows:
902 
The first line has 4 fields
904 
905 Field
906 -----
907 1     SFX - indicates this is a suffix
908 2     D   - is the name of the character flag which represents this suffix
909 3     Y   - indicates it can be combined with prefixes (cross product)
4     4   - indicates that a sequence of 4 affentry structures is needed to
               properly store the affix information
912 
913 The remaining lines describe the unique information for the 4 SfxEntry
914 objects that make up this affix.  Each line can be interpreted
915 as follows: (note fields 1 and 2 are as a check against line 1 info)
916 
917 Field
918 -----
919 1     SFX         - indicates this is a suffix
920 2     D           - is the name of the character flag for this affix
921 3     y           - the string of chars to strip off before adding affix
922                          (a 0 here indicates the NULL string)
923 4     ied         - the string of affix characters to add
924 5     [^aeiou]y   - the conditions which must be met before the affix
925                     can be applied
926 
927 Field 5 is interesting.  Since this is a suffix, field 5 tells us that
928 there are 2 conditions that must be met.  The first condition is that
929 the next to the last character in the word must *NOT* be any of the
930 following "a", "e", "i", "o" or "u".  The second condition is that
931 the last character of the word must end in "y".
932 
933 So how can we encode this information concisely and be able to
934 test for both conditions in a fast manner?  The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et.al.
936 (now available under a normal BSD license).
937 
938 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
939 using a character (cast to an unsigned char) of a string, we have 8 bits
940 of information we can store about that character.  Specifically we
941 could use each bit to say if that character is allowed in any of the
942 last (or first for prefixes) 8 characters of the word.
943 
944 Basically, each character at one end of the word (up to the number
945 of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
947 specific character position in the word.
948 
949 For prefixes, it does this by setting bit 0 if that char is valid
950 in the first position, bit 1 if valid in the second position, and so on.
951 
If a bit is not set, then that char is not valid for that position in the
953 word.
954 
955 If working with suffixes bit 0 is used for the character closest
956 to the front, bit 1 for the next character towards the end, ...,
957 with bit numconds-1 representing the last char at the end of the string.
958 
959 Note: since entries in the conds[] are 8 bits, only 8 conditions
960 (read that only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).
962 
So to make this clearer, let's encode the conds array values for the
964 first two affentries for the suffix D described earlier.
965 
966 
967   For the first affentry:
968      numconds = 1             (only examine the last character)
969 
     conds['e'] =  (1 << 0)   (the word must end in an e)
971      all others are all 0
972 
973   For the second affentry:
974      numconds = 2             (only examine the last two characters)
975 
976      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
977          where X is all characters *but* a, e, i, o, or u
978 
979 
980      conds['y'] = (1 << 1)     (the last char must be a y)
981      all other bits for all other entries in the conds array are zero
982 
983 #endif
984