1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * The contents of this file are subject to the Mozilla Public License Version
5  * 1.1 (the "License"); you may not use this file except in compliance with
6  * the License. You may obtain a copy of the License at
7  * http://www.mozilla.org/MPL/
8  *
9  * Software distributed under the License is distributed on an "AS IS" basis,
10  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11  * for the specific language governing rights and limitations under the
12  * License.
13  *
14  * The Original Code is Hunspell, based on MySpell.
15  *
16  * The Initial Developers of the Original Code are
17  * Kevin Hendricks (MySpell) and Németh László (Hunspell).
18  * Portions created by the Initial Developers are Copyright (C) 2002-2005
19  * the Initial Developers. All Rights Reserved.
20  *
21  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
22  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
23  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
24  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
25  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
26  *
27  * Alternatively, the contents of this file may be used under the terms of
28  * either the GNU General Public License Version 2 or later (the "GPL"), or
29  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30  * in which case the provisions of the GPL or the LGPL are applicable instead
31  * of those above. If you wish to allow use of your version of this file only
32  * under the terms of either the GPL or the LGPL, and not to allow others to
33  * use your version of this file under the terms of the MPL, indicate your
34  * decision by deleting the provisions above and replace them with the notice
35  * and other provisions required by the GPL or the LGPL. If you do not delete
36  * the provisions above, a recipient may use your version of this file under
37  * the terms of any one of the MPL, the GPL or the LGPL.
38  *
39  * ***** END LICENSE BLOCK ***** */
40 /*
41  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
42  * And Contributors.  All rights reserved.
43  *
44  * Redistribution and use in source and binary forms, with or without
45  * modification, are permitted provided that the following conditions
46  * are met:
47  *
48  * 1. Redistributions of source code must retain the above copyright
49  *    notice, this list of conditions and the following disclaimer.
50  *
51  * 2. Redistributions in binary form must reproduce the above copyright
52  *    notice, this list of conditions and the following disclaimer in the
53  *    documentation and/or other materials provided with the distribution.
54  *
55  * 3. All modifications to the source code must be clearly marked as
56  *    such.  Binary redistributions based on modified source code
57  *    must be clearly marked as modified versions in the documentation
58  *    and/or other materials provided with the distribution.
59  *
60  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
61  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
62  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
63  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
64  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
65  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
66  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
67  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71  * SUCH DAMAGE.
72  */
73 
74 #include <stdlib.h>
75 #include <string.h>
76 #include <stdio.h>
77 #include <ctype.h>
78 
79 #include "affentry.hxx"
80 #include "csutil.hxx"
81 
PfxEntry(AffixMgr * pmgr,affentry * dp)82 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
83     // register affix manager
84     : pmyMgr(pmgr),
85       next(NULL),
86       nexteq(NULL),
87       nextne(NULL),
88       flgnxt(NULL) {
89   // set up its initial values
90   aflag = dp->aflag;        // flag
91   strip = dp->strip;        // string to strip
92   appnd = dp->appnd;        // string to append
93   numconds = dp->numconds;  // length of the condition
94   opts = dp->opts;          // cross product flag
95   // then copy over all of the conditions
96   if (opts & aeLONGCOND) {
97     memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
98     c.l.conds2 = dp->c.l.conds2;
99   } else
100     memcpy(c.conds, dp->c.conds, MAXCONDLEN);
101   morphcode = dp->morphcode;
102   contclass = dp->contclass;
103   contclasslen = dp->contclasslen;
104 }
105 
~PfxEntry()106 PfxEntry::~PfxEntry() {
107   aflag = 0;
108   pmyMgr = NULL;
109   if (opts & aeLONGCOND)
110     free(c.l.conds2);
111   if (morphcode && !(opts & aeALIASM))
112     free(morphcode);
113   if (contclass && !(opts & aeALIASF))
114     free(contclass);
115 }
116 
117 // add prefix to this word assuming conditions hold
add(const char * word,size_t len)118 char* PfxEntry::add(const char* word, size_t len) {
119   if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
120       (len >= numconds) && test_condition(word) &&
121       (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) {
122     /* we have a match so add prefix */
123     std::string tword(appnd);
124     tword.append(word + strip.size());
125     return mystrdup(tword.c_str());
126   }
127   return NULL;
128 }
129 
nextchar(char * p)130 inline char* PfxEntry::nextchar(char* p) {
131   if (p) {
132     p++;
133     if (opts & aeLONGCOND) {
134       // jump to the 2nd part of the condition
135       if (p == c.conds + MAXCONDLEN_1)
136         return c.l.conds2;
137       // end of the MAXCONDLEN length condition
138     } else if (p == c.conds + MAXCONDLEN)
139       return NULL;
140     return *p ? p : NULL;
141   }
142   return NULL;
143 }
144 
test_condition(const char * st)145 inline int PfxEntry::test_condition(const char* st) {
146   const char* pos = NULL;  // group with pos input position
147   bool neg = false;        // complementer
148   bool ingroup = false;    // character in the group
149   if (numconds == 0)
150     return 1;
151   char* p = c.conds;
152   while (1) {
153     switch (*p) {
154       case '\0':
155         return 1;
156       case '[': {
157         neg = false;
158         ingroup = false;
159         p = nextchar(p);
160         pos = st;
161         break;
162       }
163       case '^': {
164         p = nextchar(p);
165         neg = true;
166         break;
167       }
168       case ']': {
169         if ((neg && ingroup) || (!neg && !ingroup))
170           return 0;
171         pos = NULL;
172         p = nextchar(p);
173         // skip the next character
174         if (!ingroup && *st)
175           for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
176             ;
177         if (*st == '\0' && p)
178           return 0;  // word <= condition
179         break;
180       }
181       case '.':
182         if (!pos) {  // dots are not metacharacters in groups: [.]
183           p = nextchar(p);
184           // skip the next character
185           for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
186             ;
187           if (*st == '\0' && p)
188             return 0;  // word <= condition
189           break;
190         }
191       /* FALLTHROUGH */
192       default: {
193         if (*st == *p) {
194           st++;
195           p = nextchar(p);
196           if ((opts & aeUTF8) && (*(st - 1) & 0x80)) {  // multibyte
197             while (p && (*p & 0xc0) == 0x80) {          // character
198               if (*p != *st) {
199                 if (!pos)
200                   return 0;
201                 st = pos;
202                 break;
203               }
204               p = nextchar(p);
205               st++;
206             }
207             if (pos && st != pos) {
208               ingroup = true;
209               while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
210               }
211             }
212           } else if (pos) {
213             ingroup = true;
214             while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
215             }
216           }
217         } else if (pos) {  // group
218           p = nextchar(p);
219         } else
220           return 0;
221       }
222     }
223     if (!p)
224       return 1;
225   }
226 }
227 
228 // check if this prefix entry matches
checkword(const char * word,int len,char in_compound,const FLAG needflag)229 struct hentry* PfxEntry::checkword(const char* word,
230                                    int len,
231                                    char in_compound,
232                                    const FLAG needflag) {
233   struct hentry* he;  // hash entry of root word or NULL
234 
235   // on entry prefix is 0 length or already matches the beginning of the word.
236   // So if the remaining root word has positive length
237   // and if there are enough chars in root word and added back strip chars
238   // to meet the number of characters conditions, then test it
239 
240   int tmpl = len - appnd.size(); // length of tmpword
241 
242   if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
243     // generate new root word by removing prefix and adding
244     // back any characters that would have been stripped
245 
246     std::string tmpword(strip);
247     tmpword.append(word + appnd.size());
248 
249     // now make sure all of the conditions on characters
250     // are met.  Please see the appendix at the end of
251     // this file for more info on exactly what is being
252     // tested
253 
254     // if all conditions are met then check if resulting
255     // root word in the dictionary
256 
257     if (test_condition(tmpword.c_str())) {
258       tmpl += strip.size();
259       if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
260         do {
261           if (TESTAFF(he->astr, aflag, he->alen) &&
262               // forbid single prefixes with needaffix flag
263               !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
264               // needflag
265               ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
266                (contclass && TESTAFF(contclass, needflag, contclasslen))))
267             return he;
268           he = he->next_homonym;  // check homonyms
269         } while (he);
270       }
271 
272       // prefix matched but no root word was found
273       // if aeXPRODUCT is allowed, try again but now
274       // ross checked combined with a suffix
275 
276       // if ((opts & aeXPRODUCT) && in_compound) {
277       if ((opts & aeXPRODUCT)) {
278         he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this,
279                                   NULL, 0, NULL, FLAG_NULL, needflag,
280                                   in_compound);
281         if (he)
282           return he;
283       }
284     }
285   }
286   return NULL;
287 }
288 
289 // check if this prefix entry matches
check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)290 struct hentry* PfxEntry::check_twosfx(const char* word,
291                                       int len,
292                                       char in_compound,
293                                       const FLAG needflag) {
294   struct hentry* he;  // hash entry of root word or NULL
295 
296   // on entry prefix is 0 length or already matches the beginning of the word.
297   // So if the remaining root word has positive length
298   // and if there are enough chars in root word and added back strip chars
299   // to meet the number of characters conditions, then test it
300 
301   int tmpl = len - appnd.size(); // length of tmpword
302 
303   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
304       (tmpl + strip.size() >= numconds)) {
305     // generate new root word by removing prefix and adding
306     // back any characters that would have been stripped
307 
308     std::string tmpword(strip);
309     tmpword.append(word + appnd.size());
310 
311     // now make sure all of the conditions on characters
312     // are met.  Please see the appendix at the end of
313     // this file for more info on exactly what is being
314     // tested
315 
316     // if all conditions are met then check if resulting
317     // root word in the dictionary
318 
319     if (test_condition(tmpword.c_str())) {
320       tmpl += strip.size();
321 
322       // prefix matched but no root word was found
323       // if aeXPRODUCT is allowed, try again but now
324       // cross checked combined with a suffix
325 
326       if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
327         he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this,
328                                          needflag);
329         if (he)
330           return he;
331       }
332     }
333   }
334   return NULL;
335 }
336 
337 // check if this prefix entry matches
check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)338 char* PfxEntry::check_twosfx_morph(const char* word,
339                                    int len,
340                                    char in_compound,
341                                    const FLAG needflag) {
342   // on entry prefix is 0 length or already matches the beginning of the word.
343   // So if the remaining root word has positive length
344   // and if there are enough chars in root word and added back strip chars
345   // to meet the number of characters conditions, then test it
346 
347   int tmpl = len - appnd.size(); // length of tmpword
348 
349   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
350       (tmpl + strip.size() >= numconds)) {
351     // generate new root word by removing prefix and adding
352     // back any characters that would have been stripped
353 
354     std::string tmpword(strip);
355     tmpword.append(word + appnd.size());
356 
357     // now make sure all of the conditions on characters
358     // are met.  Please see the appendix at the end of
359     // this file for more info on exactly what is being
360     // tested
361 
362     // if all conditions are met then check if resulting
363     // root word in the dictionary
364 
365     if (test_condition(tmpword.c_str())) {
366       tmpl += strip.size();
367 
368       // prefix matched but no root word was found
369       // if aeXPRODUCT is allowed, try again but now
370       // ross checked combined with a suffix
371 
372       if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
373         return pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl,
374                                                  aeXPRODUCT,
375                                                  this, needflag);
376       }
377     }
378   }
379   return NULL;
380 }
381 
382 // check if this prefix entry matches
check_morph(const char * word,int len,char in_compound,const FLAG needflag)383 char* PfxEntry::check_morph(const char* word,
384                             int len,
385                             char in_compound,
386                             const FLAG needflag) {
387   struct hentry* he;  // hash entry of root word or NULL
388   char* st;
389 
390   // on entry prefix is 0 length or already matches the beginning of the word.
391   // So if the remaining root word has positive length
392   // and if there are enough chars in root word and added back strip chars
393   // to meet the number of characters conditions, then test it
394 
395   int tmpl = len - appnd.size(); // length of tmpword
396 
397   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
398       (tmpl + strip.size() >= numconds)) {
399     // generate new root word by removing prefix and adding
400     // back any characters that would have been stripped
401 
402     std::string tmpword(strip);
403     tmpword.append(word + appnd.size());
404 
405     // now make sure all of the conditions on characters
406     // are met.  Please see the appendix at the end of
407     // this file for more info on exactly what is being
408     // tested
409 
410     // if all conditions are met then check if resulting
411     // root word in the dictionary
412 
413     if (test_condition(tmpword.c_str())) {
414       std::string result;
415 
416       tmpl += strip.size();
417       if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
418         do {
419           if (TESTAFF(he->astr, aflag, he->alen) &&
420               // forbid single prefixes with needaffix flag
421               !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
422               // needflag
423               ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
424                (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
425             if (morphcode) {
426               result.append(" ");
427               result.append(morphcode);
428             } else
429               result.append(getKey());
430             if (!HENTRY_FIND(he, MORPH_STEM)) {
431               result.append(" ");
432               result.append(MORPH_STEM);
433               result.append(HENTRY_WORD(he));
434             }
435             // store the pointer of the hash entry
436             if (HENTRY_DATA(he)) {
437               result.append(" ");
438               result.append(HENTRY_DATA2(he));
439             } else {
440               // return with debug information
441               char* flag = pmyMgr->encode_flag(getFlag());
442               result.append(" ");
443               result.append(MORPH_FLAG);
444               result.append(flag);
445               free(flag);
446             }
447             result.append("\n");
448           }
449           he = he->next_homonym;
450         } while (he);
451       }
452 
453       // prefix matched but no root word was found
454       // if aeXPRODUCT is allowed, try again but now
455       // ross checked combined with a suffix
456 
457       if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
458         st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this,
459                                         FLAG_NULL, needflag);
460         if (st) {
461           result.append(st);
462           free(st);
463         }
464       }
465 
466       if (!result.empty())
467         return mystrdup(result.c_str());
468     }
469   }
470 
471   return NULL;
472 }
473 
SfxEntry(AffixMgr * pmgr,affentry * dp)474 SfxEntry::SfxEntry(AffixMgr* pmgr, affentry* dp)
475     : pmyMgr(pmgr)  // register affix manager
476       ,
477       next(NULL),
478       nexteq(NULL),
479       nextne(NULL),
480       flgnxt(NULL),
481       l_morph(NULL),
482       r_morph(NULL),
483       eq_morph(NULL) {
484   // set up its initial values
485   aflag = dp->aflag;        // char flag
486   strip = dp->strip;        // string to strip
487   appnd = dp->appnd;        // string to append
488   numconds = dp->numconds;  // length of the condition
489   opts = dp->opts;          // cross product flag
490 
491   // then copy over all of the conditions
492   if (opts & aeLONGCOND) {
493     memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
494     c.l.conds2 = dp->c.l.conds2;
495   } else
496     memcpy(c.conds, dp->c.conds, MAXCONDLEN);
497   rappnd = appnd;
498   reverseword(rappnd);
499   morphcode = dp->morphcode;
500   contclass = dp->contclass;
501   contclasslen = dp->contclasslen;
502 }
503 
~SfxEntry()504 SfxEntry::~SfxEntry() {
505   aflag = 0;
506   pmyMgr = NULL;
507   if (opts & aeLONGCOND)
508     free(c.l.conds2);
509   if (morphcode && !(opts & aeALIASM))
510     free(morphcode);
511   if (contclass && !(opts & aeALIASF))
512     free(contclass);
513 }
514 
515 // add suffix to this word assuming conditions hold
add(const char * word,size_t len)516 char* SfxEntry::add(const char* word, size_t len) {
517   /* make sure all conditions match */
518   if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
519       (len >= numconds) && test_condition(word + len, word) &&
520       (!strip.size() ||
521        (strcmp(word + len - strip.size(), strip.c_str()) == 0))) {
522     std::string tword(word);
523     /* we have a match so add suffix */
524     tword.replace(len - strip.size(), std::string::npos, appnd);
525     return mystrdup(tword.c_str());
526   }
527   return NULL;
528 }
529 
nextchar(char * p)530 inline char* SfxEntry::nextchar(char* p) {
531   if (p) {
532     p++;
533     if (opts & aeLONGCOND) {
534       // jump to the 2nd part of the condition
535       if (p == c.l.conds1 + MAXCONDLEN_1)
536         return c.l.conds2;
537       // end of the MAXCONDLEN length condition
538     } else if (p == c.conds + MAXCONDLEN)
539       return NULL;
540     return *p ? p : NULL;
541   }
542   return NULL;
543 }
544 
test_condition(const char * st,const char * beg)545 inline int SfxEntry::test_condition(const char* st, const char* beg) {
546   const char* pos = NULL;  // group with pos input position
547   bool neg = false;        // complementer
548   bool ingroup = false;    // character in the group
549   if (numconds == 0)
550     return 1;
551   char* p = c.conds;
552   st--;
553   int i = 1;
554   while (1) {
555     switch (*p) {
556       case '\0':
557         return 1;
558       case '[':
559         p = nextchar(p);
560         pos = st;
561         break;
562       case '^':
563         p = nextchar(p);
564         neg = true;
565         break;
566       case ']':
567         if (!neg && !ingroup)
568           return 0;
569         i++;
570         // skip the next character
571         if (!ingroup) {
572           for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--)
573             ;
574           st--;
575         }
576         pos = NULL;
577         neg = false;
578         ingroup = false;
579         p = nextchar(p);
580         if (st < beg && p)
581           return 0;  // word <= condition
582         break;
583       case '.':
584         if (!pos) {
585           // dots are not metacharacters in groups: [.]
586           p = nextchar(p);
587           // skip the next character
588           for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80;
589                st--)
590             ;
591           if (st < beg) {  // word <= condition
592             if (p)
593               return 0;
594             else
595               return 1;
596           }
597           if ((opts & aeUTF8) && (*st & 0x80)) {  // head of the UTF-8 character
598             st--;
599             if (st < beg) {  // word <= condition
600               if (p)
601                 return 0;
602               else
603                 return 1;
604             }
605           }
606           break;
607         }
608       /* FALLTHROUGH */
609       default: {
610         if (*st == *p) {
611           p = nextchar(p);
612           if ((opts & aeUTF8) && (*st & 0x80)) {
613             st--;
614             while (p && (st >= beg)) {
615               if (*p != *st) {
616                 if (!pos)
617                   return 0;
618                 st = pos;
619                 break;
620               }
621               // first byte of the UTF-8 multibyte character
622               if ((*p & 0xc0) != 0x80)
623                 break;
624               p = nextchar(p);
625               st--;
626             }
627             if (pos && st != pos) {
628               if (neg)
629                 return 0;
630               else if (i == numconds)
631                 return 1;
632               ingroup = true;
633               while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
634               }
635               st--;
636             }
637             if (p && *p != ']')
638               p = nextchar(p);
639           } else if (pos) {
640             if (neg)
641               return 0;
642             else if (i == numconds)
643               return 1;
644             ingroup = true;
645             while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
646             }
647             //			if (p && *p != ']') p = nextchar(p);
648             st--;
649           }
650           if (!pos) {
651             i++;
652             st--;
653           }
654           if (st < beg && p && *p != ']')
655             return 0;      // word <= condition
656         } else if (pos) {  // group
657           p = nextchar(p);
658         } else
659           return 0;
660       }
661     }
662     if (!p)
663       return 1;
664   }
665 }
666 
667 // see if this suffix is present in the word
checkword(const char * word,int len,int optflags,PfxEntry * ppfx,char ** wlst,int maxSug,int * ns,const FLAG cclass,const FLAG needflag,const FLAG badflag)668 struct hentry* SfxEntry::checkword(const char* word,
669                                    int len,
670                                    int optflags,
671                                    PfxEntry* ppfx,
672                                    char** wlst,
673                                    int maxSug,
674                                    int* ns,
675                                    const FLAG cclass,
676                                    const FLAG needflag,
677                                    const FLAG badflag) {
678   struct hentry* he;  // hash entry pointer
679   PfxEntry* ep = ppfx;
680 
681   // if this suffix is being cross checked with a prefix
682   // but it does not support cross products skip it
683 
684   if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
685     return NULL;
686 
687   // upon entry suffix is 0 length or already matches the end of the word.
688   // So if the remaining root word has positive length
689   // and if there are enough chars in root word and added back strip chars
690   // to meet the number of characters conditions, then test it
691 
692   int tmpl = len - appnd.size(); // length of tmpword
693   // the second condition is not enough for UTF-8 strings
694   // it checked in test_condition()
695 
696   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
697       (tmpl + strip.size() >= numconds)) {
698     // generate new root word by removing suffix and adding
699     // back any characters that would have been stripped or
700     // or null terminating the shorter string
701 
702     std::string tmpstring(word, tmpl);
703     if (strip.size()) {
704       tmpstring.append(strip);
705     }
706 
707     const char* tmpword = tmpstring.c_str();
708     const char* endword = tmpword + tmpstring.size();
709 
710     // now make sure all of the conditions on characters
711     // are met.  Please see the appendix at the end of
712     // this file for more info on exactly what is being
713     // tested
714 
715     // if all conditions are met then check if resulting
716     // root word in the dictionary
717 
718     if (test_condition(endword, tmpword)) {
719 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
720       fprintf(stdout, "%s %s %c\n", word, tmpword, aflag);
721 #endif
722       if ((he = pmyMgr->lookup(tmpword)) != NULL) {
723         do {
724           // check conditional suffix (enabled by prefix)
725           if ((TESTAFF(he->astr, aflag, he->alen) ||
726                (ep && ep->getCont() &&
727                 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
728               (((optflags & aeXPRODUCT) == 0) ||
729                (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
730                // enabled by prefix
731                ((contclass) &&
732                 (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))) &&
733               // handle cont. class
734               ((!cclass) ||
735                ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
736               // check only in compound homonyms (bad flags)
737               (!badflag || !TESTAFF(he->astr, badflag, he->alen)) &&
738               // handle required flag
739               ((!needflag) ||
740                (TESTAFF(he->astr, needflag, he->alen) ||
741                 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
742             return he;
743           he = he->next_homonym;  // check homonyms
744         } while (he);
745 
746         // obsolote stemming code (used only by the
747         // experimental SuffixMgr:suggest_pos_stems)
748         // store resulting root in wlst
749       } else if (wlst && (*ns < maxSug)) {
750         int cwrd = 1;
751         for (int k = 0; k < *ns; k++)
752           if (strcmp(tmpword, wlst[k]) == 0) {
753             cwrd = 0;
754             break;
755           }
756         if (cwrd) {
757           wlst[*ns] = mystrdup(tmpword);
758           if (wlst[*ns] == NULL) {
759             for (int j = 0; j < *ns; j++)
760               free(wlst[j]);
761             *ns = -1;
762             return NULL;
763           }
764           (*ns)++;
765         }
766       }
767     }
768   }
769   return NULL;
770 }
771 
772 // see if two-level suffix is present in the word
check_twosfx(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)773 struct hentry* SfxEntry::check_twosfx(const char* word,
774                                       int len,
775                                       int optflags,
776                                       PfxEntry* ppfx,
777                                       const FLAG needflag) {
778   struct hentry* he;  // hash entry pointer
779   PfxEntry* ep = ppfx;
780 
781   // if this suffix is being cross checked with a prefix
782   // but it does not support cross products skip it
783 
784   if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
785     return NULL;
786 
787   // upon entry suffix is 0 length or already matches the end of the word.
788   // So if the remaining root word has positive length
789   // and if there are enough chars in root word and added back strip chars
790   // to meet the number of characters conditions, then test it
791 
792   int tmpl = len - appnd.size(); // length of tmpword
793 
794   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
795       (tmpl + strip.size() >= numconds)) {
796     // generate new root word by removing suffix and adding
797     // back any characters that would have been stripped or
798     // or null terminating the shorter string
799 
800     std::string tmpword(word);
801     tmpword.resize(tmpl);
802     tmpword.append(strip);
803     tmpl += strip.size();
804 
805     const char* beg = tmpword.c_str();
806     const char* end = beg + tmpl;
807 
808     // now make sure all of the conditions on characters
809     // are met.  Please see the appendix at the end of
810     // this file for more info on exactly what is being
811     // tested
812 
813     // if all conditions are met then recall suffix_check
814 
815     if (test_condition(end, beg)) {
816       if (ppfx) {
817         // handle conditional suffix
818         if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
819           he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, NULL, 0, NULL,
820                                     (FLAG)aflag, needflag);
821         else
822           he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx, NULL, 0,
823                                     NULL, (FLAG)aflag, needflag);
824       } else {
825         he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, NULL, 0, NULL,
826                                   (FLAG)aflag, needflag);
827       }
828       if (he)
829         return he;
830     }
831   }
832   return NULL;
833 }
834 
835 // see if two-level suffix is present in the word
check_twosfx_morph(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)836 char* SfxEntry::check_twosfx_morph(const char* word,
837                                    int len,
838                                    int optflags,
839                                    PfxEntry* ppfx,
840                                    const FLAG needflag) {
841   PfxEntry* ep = ppfx;
842   char* st;
843 
844   char result[MAXLNLEN];
845 
846   *result = '\0';
847 
848   // if this suffix is being cross checked with a prefix
849   // but it does not support cross products skip it
850 
851   if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
852     return NULL;
853 
854   // upon entry suffix is 0 length or already matches the end of the word.
855   // So if the remaining root word has positive length
856   // and if there are enough chars in root word and added back strip chars
857   // to meet the number of characters conditions, then test it
858 
859   int tmpl = len - appnd.size(); // length of tmpword
860 
861   if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
862       (tmpl + strip.size() >= numconds)) {
863     // generate new root word by removing suffix and adding
864     // back any characters that would have been stripped or
865     // or null terminating the shorter string
866 
867     std::string tmpword(word);
868     tmpword.resize(tmpl);
869     tmpword.append(strip);
870     tmpl += strip.size();
871 
872     const char* beg = tmpword.c_str();
873     const char* end = beg + tmpl;
874 
875     // now make sure all of the conditions on characters
876     // are met.  Please see the appendix at the end of
877     // this file for more info on exactly what is being
878     // tested
879 
880     // if all conditions are met then recall suffix_check
881 
882     if (test_condition(end, beg)) {
883       if (ppfx) {
884         // handle conditional suffix
885         if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
886           st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag,
887                                           needflag);
888           if (st) {
889             if (ppfx->getMorph()) {
890               mystrcat(result, ppfx->getMorph(), MAXLNLEN);
891               mystrcat(result, " ", MAXLNLEN);
892             }
893             mystrcat(result, st, MAXLNLEN);
894             free(st);
895             mychomp(result);
896           }
897         } else {
898           st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag,
899                                           needflag);
900           if (st) {
901             mystrcat(result, st, MAXLNLEN);
902             free(st);
903             mychomp(result);
904           }
905         }
906       } else {
907         st =
908             pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag);
909         if (st) {
910           mystrcat(result, st, MAXLNLEN);
911           free(st);
912           mychomp(result);
913         }
914       }
915       if (*result)
916         return mystrdup(result);
917     }
918   }
919   return NULL;
920 }
921 
922 // get next homonym with same affix
get_next_homonym(struct hentry * he,int optflags,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag)923 struct hentry* SfxEntry::get_next_homonym(struct hentry* he,
924                                           int optflags,
925                                           PfxEntry* ppfx,
926                                           const FLAG cclass,
927                                           const FLAG needflag) {
928   PfxEntry* ep = ppfx;
929   FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
930 
931   while (he->next_homonym) {
932     he = he->next_homonym;
933     if ((TESTAFF(he->astr, aflag, he->alen) ||
934          (ep && ep->getCont() &&
935           TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
936         ((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) ||
937          // handle conditional suffix
938          ((contclass) && TESTAFF(contclass, eFlag, contclasslen))) &&
939         // handle cont. class
940         ((!cclass) ||
941          ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
942         // handle required flag
943         ((!needflag) ||
944          (TESTAFF(he->astr, needflag, he->alen) ||
945           ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
946       return he;
947   }
948   return NULL;
949 }
950 
951 #if 0
952 
953 Appendix:  Understanding Affix Code
954 
955 
956 An affix is either a  prefix or a suffix attached to root words to make
957 other words.
958 
959 Basically a Prefix or a Suffix is a set of AffEntry objects
960 which store information about the prefix or suffix along
961 with supporting routines to check if a word has a particular
962 prefix or suffix or a combination.
963 
964 The structure affentry is defined as follows:
965 
966 struct affentry
967 {
968    unsigned short aflag;    // ID used to represent the affix
969    std::string strip;       // string to strip before adding affix
970    std::string appnd;       // the affix string to add
971    char numconds;           // the number of conditions that must be met
972    char opts;               // flag: aeXPRODUCT- combine both prefix and suffix
973    char   conds[SETSIZE];   // array which encodes the conditions to be met
974 };
975 
976 
977 Here is a suffix borrowed from the en_US.aff file.  This file
978 is whitespace delimited.
979 
980 SFX D Y 4
981 SFX D   0     e          d
982 SFX D   y     ied        [^aeiou]y
983 SFX D   0     ed         [^ey]
984 SFX D   0     ed         [aeiou]y
985 
986 This information can be interpreted as follows:
987 
988 The first line has 4 fields
989 
990 Field
991 -----
992 1     SFX - indicates this is a suffix
993 2     D   - is the name of the character flag which represents this suffix
994 3     Y   - indicates it can be combined with prefixes (cross product)
995 4     4   - indicates that a sequence of 4 affentry structures is needed to
996                properly store the affix information
997 
998 The remaining lines describe the unique information for the 4 SfxEntry
999 objects that make up this affix.  Each line can be interpreted
1000 as follows: (note fields 1 and 2 are as a check against line 1 info)
1001 
1002 Field
1003 -----
1004 1     SFX         - indicates this is a suffix
1005 2     D           - is the name of the character flag for this affix
1006 3     y           - the string of chars to strip off before adding affix
1007                          (a 0 here indicates the NULL string)
1008 4     ied         - the string of affix characters to add
1009 5     [^aeiou]y   - the conditions which must be met before the affix
1010                     can be applied
1011 
1012 Field 5 is interesting.  Since this is a suffix, field 5 tells us that
1013 there are 2 conditions that must be met.  The first condition is that
1014 the next to the last character in the word must *NOT* be any of the
1015 following "a", "e", "i", "o" or "u".  The second condition is that
1016 the last character of the word must be "y".
1017 
1018 So how can we encode this information concisely and be able to
1019 test for both conditions in a fast manner?  The answer is found
1020 by studying the wonderful ispell code of Geoff Kuenning, et al.
1021 (now available under a normal BSD license).
1022 
1023 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
1024 using a character (cast to an unsigned char) of a string, we have 8 bits
1025 of information we can store about that character.  Specifically we
1026 could use each bit to say if that character is allowed in any of the
1027 last (or first for prefixes) 8 characters of the word.
1028 
1029 Basically, each character at one end of the word (up to the number
1030 of conditions) is used to index into the conds array and the resulting
1031 value found there says whether that character is valid for a
1032 specific character position in the word.
1033 
1034 For prefixes, it does this by setting bit 0 if that char is valid
1035 in the first position, bit 1 if valid in the second position, and so on.
1036 
1037 If a bit is not set, then that char is not valid for that position in the
1038 word.
1039 
1040 If working with suffixes bit 0 is used for the character closest
1041 to the front, bit 1 for the next character towards the end, ...,
1042 with bit numconds-1 representing the last char at the end of the string.
1043 
1044 Note: since entries in the conds[] are 8 bits, only 8 conditions
1045 (read that only 8 character positions) can be examined at one
1046 end of a word (the beginning for prefixes and the end for suffixes).
1047 
1048 So to make this clearer, let's encode the conds array values for the
1049 first two affentries for the suffix D described earlier.
1050 
1051 
1052   For the first affentry:
1053      numconds = 1             (only examine the last character)
1054 
1055      conds['e'] =  (1 << 0)   (the word must end in an E)
1056      all others are all 0
1057 
1058   For the second affentry:
1059      numconds = 2             (only examine the last two characters)
1060 
1061      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
1062          where X is all characters *but* a, e, i, o, or u
1063 
1064 
1065      conds['y'] = (1 << 1)     (the last char must be a y)
1066      all other bits for all other entries in the conds array are zero
1067 
1068 #endif
1069