1 // This file is part of The New Aspell
2 // Copyright (C) 2004 by Kevin Atkinson under the GNU LGPL
3 // license version 2.0 or 2.1.  You should have received a copy of the
4 // LGPL license along with this library if you did not you can find it
5 // at http://www.gnu.org/.
6 //
// This code is based on the MySpell affix code:
8 //
9 /*
10  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada And
11  * Contributors.  All rights reserved.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  *
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  *
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  *
24  * 3. All modifications to the source code must be clearly marked as
25  *    such.  Binary redistributions based on modified source code
26  *    must be clearly marked as modified versions in the documentation
27  *    and/or other materials provided with the distribution.
28  *
29  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
30  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
32  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
33  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
36  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  */
43 
44 #include <cstdlib>
45 #include <cstring>
46 #include <cstdio>
47 
48 //#include "iostream.hpp"
49 
50 #include "affix.hpp"
51 #include "errors.hpp"
52 #include "getdata.hpp"
53 #include "parm_string.hpp"
54 #include "check_list.hpp"
55 #include "speller_impl.hpp"
56 #include "vararray.hpp"
57 #include "lsort.hpp"
58 #include "hash-t.hpp"
59 
60 #include "gettext.h"
61 
62 using namespace std;
63 
64 namespace aspeller {
65 
// Unsigned character type.  Used throughout this file to index the
// SETSIZE-sized tables (by affix flag or by the first character of a
// key) with a non-negative value.
typedef unsigned char byte;
// Shared empty string.  expand_suffix() compares the result of
// SfxEntry::add() against it and uses it as the flag list of the
// words it generates.
static char EMPTY[1] = {0};
68 
69 //////////////////////////////////////////////////////////////////////
70 //
// Entry struct definitions
72 //
73 
// Compiled form of one affix condition string.  Instances are created
// and filled in by encodeit() and are shared between affix entries via
// a CondsLookup table keyed on `str`.
struct Conds
{
  char * str;           // condition string this table was built from
  unsigned num;         // number of condition positions encoded in `conds`
  char conds[SETSIZE];  // per character bit mask: encodeit() sets bit n
                        // when the character is accepted at position n
  // Returns the bit mask stored for character `i`.
  char get(byte i) const {return conds[i];}
};
81 
// Data common to prefix and suffix entries.  The fields are filled in
// by AffixMgr::parse_file() from one entry line of the affix file.
struct AffEntry
{
  const char *   appnd;   // affix string ("" when the file gives "0")
  const char *   strip;   // string to strip ("" when the file gives "0")
  byte           appndl;  // length of appnd
  byte           stripl;  // length of strip
  byte           xpflg;   // XPRODUCT when the header line says 'Y', else 0
  char           achar;   // affix flag character of this entry
  const Conds *  conds;   // compiled condition, set by encodeit()
  //unsigned int numconds;
  //char         conds[SETSIZE];
};
94 
95 // A Prefix Entry
96 
97 struct PfxEntry : public AffEntry
98 {
99   PfxEntry * next;
100   PfxEntry * next_eq;
101   PfxEntry * next_ne;
102   PfxEntry * flag_next;
PfxEntryaspeller::PfxEntry103   PfxEntry() {}
104 
105   bool check(const LookupInfo &, const AffixMgr * pmyMgr,
106              ParmString, CheckInfo &, GuessInfo *, bool cross = true) const;
107 
allow_crossaspeller::PfxEntry108   inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
flagaspeller::PfxEntry109   inline byte flag() const { return achar;  }
keyaspeller::PfxEntry110   inline const char *  key() const  { return appnd;  }
111   bool applicable(SimpleString) const;
112   SimpleString add(SimpleString, ObjStack & buf) const;
113 };
114 
115 // A Suffix Entry
116 
117 struct SfxEntry : public AffEntry
118 {
119   const char * rappnd; // this is set in AffixMgr::build_sfxlist
120 
121   SfxEntry *   next;
122   SfxEntry *   next_eq;
123   SfxEntry *   next_ne;
124   SfxEntry *   flag_next;
125 
SfxEntryaspeller::SfxEntry126   SfxEntry() {}
127 
128   bool check(const LookupInfo &, ParmString, CheckInfo &, GuessInfo *,
129              int optflags, AffEntry * ppfx);
130 
allow_crossaspeller::SfxEntry131   inline bool          allow_cross() const { return ((xpflg & XPRODUCT) != 0); }
flagaspeller::SfxEntry132   inline byte flag() const { return achar;  }
keyaspeller::SfxEntry133   inline const char *  key() const  { return rappnd; }
134   bool applicable(SimpleString) const;
135   SimpleString add(SimpleString, ObjStack & buf, int limit, SimpleString) const;
136 };
137 
138 //////////////////////////////////////////////////////////////////////
139 //
140 // Utility functions declarations
141 //
142 
/* Returns true when s1 is a leading substring (prefix) of s2.
 * An empty s1 is a prefix of every string. */
static bool isSubset(const char * s1, const char * s2)
{
  for (; *s1 != '\0'; ++s1, ++s2) {
    // a mismatch also covers s2 ending first, since then *s2 is '\0'
    if (*s1 != *s2) return false;
  }
  return true;
}
152 
// Returns true when s1, read forwards, matches the characters of the
// other string read backwards starting at end_of_s2.  At most len
// characters of that string are examined; s1 needing more than len
// characters counts as a mismatch.
static bool isRevSubset(const char * s1, const char * end_of_s2, int len)
{
  for (; *s1 != '\0'; ++s1, --end_of_s2, --len) {
    if (len <= 0) return false;            // ran out of word
    if (*s1 != *end_of_s2) return false;   // characters differ
  }
  return true;
}
163 
// Strict weak ordering on affix entries: compares the key() strings of
// the two entries with strcmp.  Used to sort the per-character lists.
template <class T>
struct AffixLess
{
  bool operator() (T * x, T * y) const
  {
    const int order = strcmp(x->key(), y->key());
    return order < 0;
  }
};
169 
170 // struct StringLookup {
171 //   struct Parms {
172 //     typedef const char * Value;
173 //     typedef const char * Key;
174 //     static const bool is_multi = false;
175 //     hash<const char *> hfun;
176 //     size_t hash(const char * s) {return hfun(s);}
177 //     bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
178 //     const char * key(const char * c) {return c;}
179 //   };
180 //   typedef HashTable<Parms> Lookup;
181 //   Lookup lookup;
182 //   ObjStack * data_buf;
183 //   StringLookup(ObjStack * b) : data_buf(b) {}
184 //   const char * dup(const char * orig) {
185 //     pair<Lookup::iterator, bool> res = lookup.insert(orig);
186 //     if (res.second) *res.first = data_buf->dup(orig);
187 //     return *res.first;
188 //     //return data_buf->dup(orig);
189 //   }
190 // };
191 
192 struct CondsLookupParms {
193   typedef const Conds * Value;
194   typedef const char * Key;
195   static const bool is_multi = false;
196   acommon::hash<const char *> hfun;
hashaspeller::CondsLookupParms197   size_t hash(const char * s) {return hfun(s);}
equalaspeller::CondsLookupParms198   bool equal(const char * x, const char * y) {return strcmp(x,y) == 0;}
keyaspeller::CondsLookupParms199   const char * key(const Conds * c) {return c->str;}
200 };
201 
202 typedef HashTable<CondsLookupParms> CondsLookup;
203 
// Normalizes and checks the condition string `str` in place.
//
//  - a group with exactly one member, "[x]", is replaced by "x"
//  - the members of any other group are sorted by unsigned character
//    value (a '^' directly after the '[' stays in front), so that
//    equivalent groups end up with the same spelling
//  - everything outside a group is copied unchanged
//
// Returns the length of the normalized string, or -1 if the string is
// invalid: an empty group, a '[' inside a group, or a group that is
// not closed by ']'.  On failure `str` may be partly rewritten.
static int normalize_cond_str(char * str)
{
  char * s = str;   // read position
  char * d = str;   // write position, never ahead of s
  while (*s) {
    if (*s != '[') {
      *d++ = *s++;
    } else if (s[1] == '\0' || s[1] == ']') {
      return -1;
    } else if (s[2] == ']') {
      // single member group: keep just the member
      *d++ = s[1];
      s += 3;
    } else {
      *d++ = *s++;
      if (*s == '^') *d++ = *s++;
      // selection sort of the members up to the closing ']'
      while (*s != ']') {
        if (*s == '\0' || *s == '[') return -1;
        char * min = s;
        for (char * i = s + 1; *i != ']'; ++i) {
          // the group is not terminated: stop instead of scanning (and
          // swapping) past the end of the string
          if (*i == '\0') return -1;
          if ((unsigned char)*i < (unsigned char)*min) min = i;
        }
        char c = *s;
        *d++ = *min;
        *min = c;
        ++s;
      }
      *d++ = *s++;
    }
  }
  *d = '\0';
  return (int)(d - str);
}
237 
// Forward declaration, the definition follows the AffixMgr list
// building code.  Sets ptr->conds to the Conds object for the
// condition string `cs`, creating it if it is not yet in the lookup.
static void encodeit(CondsLookup &, ObjStack &,
                     AffEntry * ptr, char * cs);
240 
241 //////////////////////////////////////////////////////////////////////
242 //
243 // Affix Manager
244 //
245 
setup(ParmString affpath,Conv & iconv)246 PosibErr<void> AffixMgr::setup(ParmString affpath, Conv & iconv)
247 {
248   // register hash manager and load affix data from aff file
249   //cpdmin = 3;  // default value
250   max_strip_ = 0;
251   for (int i=0; i < SETSIZE; i++) {
252     pStart[i] = NULL;
253     sStart[i] = NULL;
254     pFlag[i] = NULL;
255     sFlag[i] = NULL;
256     max_strip_f[i] = 0;
257   }
258   return parse_file(affpath, iconv);
259 }
260 
AffixMgr(const Language * l)261 AffixMgr::AffixMgr(const Language * l)
262   : lang(l), data_buf(1024*16) {}
263 
~AffixMgr()264 AffixMgr::~AffixMgr() {}
265 
// Raises lhs to rhs when rhs is the larger value; otherwise lhs is
// left untouched.
static inline void max_(int & lhs, int rhs)
{
  if (rhs > lhs) {
    lhs = rhs;
  }
}
270 
271 // read in aff file and build up prefix and suffix entry objects
parse_file(const char * affpath,Conv & iconv)272 PosibErr<void> AffixMgr::parse_file(const char * affpath, Conv & iconv)
273 {
274   // io buffers
275   String buf; DataPair dp;
276 
277   CondsLookup conds_lookup;
278 
279   // open the affix file
280   affix_file = data_buf.dup(affpath);
281   FStream afflst;
282   RET_ON_ERR(afflst.open(affpath,"r"));
283 
284   // step one is to parse the affix file building up the internal
285   // affix data structures
286 
287   // read in each line ignoring any that do not
288   // start with a known line type indicator
289 
290   char prev_aff = '\0';
291 
292   while (getdata_pair(afflst,dp,buf)) {
293     char affix_type = ' ';
294 
295     /* parse in the name of the character set used by the .dict and .aff */
296 
297     if (dp.key == "SET") {
298       String buf;
299       encoding = data_buf.dup(fix_encoding_str(dp.value, buf));
300       if (strcmp(encoding, lang->data_encoding()) != 0)
301         return make_err(incorrect_encoding, affix_file, lang->data_encoding(), encoding);
302     }
303 
304     /* parse in the flag used by the controlled compound words */
305     //else if (d.key == "COMPOUNDFLAG")
306     //  compound = data_buf.dup(d.value);
307 
308     /* parse in the flag used by the controlled compound words */
309     //else if (d.key == "COMPOUNDMIN")
310     //  cpdmin = atoi(d.value); // FiXME
311 
312     //else if (dp.key == "TRY" || dp.key == "REP");
313 
314     else if (dp.key == "PFX" || dp.key == "SFX")
315       affix_type = dp.key[0];
316 
317     if (affix_type == ' ') continue;
318 
319     //
320     // parse this affix: P - prefix, S - suffix
321     //
322 
323     int numents = 0;      // number of affentry structures to parse
324     char achar='\0';      // affix char identifier
325     short xpflg=0;
326     AffEntry * nptr;
327     {
328       // split affix header line into pieces
329       split(dp);
330       if (dp.key.empty()) goto error;
331       // key is affix char
332       const char * astr = iconv(dp.key);
333       if (astr[0] == '\0' || astr[1] != '\0') goto error;
334       achar = astr[0];
335       if (achar == prev_aff) goto error_count;
336       prev_aff = achar;
337 
338       split(dp);
339       if (dp.key.size != 1 ||
340           !(dp.key[0] == 'Y' || dp.key[0] == 'N')) goto error;
341       // key is cross product indicator
342       if (dp.key[0] == 'Y') xpflg = XPRODUCT;
343 
344       split(dp);
345       if (dp.key.empty()) goto error;
346       // key is number of affentries
347 
348       numents = atoi(dp.key);
349 
350       for (int j = 0; j < numents; j++) {
351         getdata_pair(afflst, dp, buf);
352 
353         if (affix_type == 'P') {
354           nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(PfxEntry));
355           new (nptr) PfxEntry;
356         } else {
357           nptr = (AffEntry *) data_buf.alloc_bottom(sizeof(SfxEntry));
358           new (nptr) SfxEntry;
359         }
360 
361         nptr->xpflg = xpflg;
362 
363         split(dp);
364         if (dp.key.empty()) goto error;
365         // key is affix charter
366         if (iconv(dp.key)[0] != achar) goto error_count;
367         nptr->achar = achar;
368 
369         split(dp);
370         if (dp.key.empty()) goto error;
371         // key is strip
372         if (dp.key != "0") {
373           ParmString s0(iconv(dp.key));
374           max_(max_strip_, s0.size());
375           max_(max_strip_f[(byte)achar], s0.size());
376           nptr->strip = data_buf.dup(s0);
377           nptr->stripl = s0.size();
378         } else {
379           nptr->strip  = "";
380           nptr->stripl = 0;
381         }
382 
383         split(dp);
384         if (dp.key.empty()) goto error;
385         // key is affix string or 0 for null
386         if (dp.key != "0") {
387           nptr->appnd  = data_buf.dup(iconv(dp.key));
388           nptr->appndl = strlen(nptr->appnd);
389         } else {
390           nptr->appnd  = "";
391           nptr->appndl = 0;
392         }
393 
394         split(dp);
395         if (dp.key.empty()) goto error;
396         // key is the conditions descriptions
397         char * cond = iconv(dp.key);
398         int cond_len = normalize_cond_str(cond);
399         if (cond_len < 0)
400           return (make_err(invalid_cond, MsgConv(lang)(cond))
401                   .with_file(affix_file, dp.line_num));
402         if (nptr->stripl != 0) {
403           char * cc = cond;
404           if (affix_type == 'S') cc += cond_len - nptr->stripl;
405           if (cond_len < nptr->stripl ||
406               memcmp(cc, nptr->strip, nptr->stripl) != 0)
407             return (make_err(invalid_cond_strip,
408                              MsgConv(lang)(cond), MsgConv(lang)(nptr->strip))
409                     .with_file(affix_file, dp.line_num));
410         }
411         encodeit(conds_lookup, data_buf, nptr, cond);
412 
413         // now create SfxEntry or PfxEntry objects and use links to
414         // build an ordered (sorted by affix string) list
415         if (affix_type == 'P')
416           build_pfxlist(static_cast<PfxEntry *>(nptr));
417         else
418           build_sfxlist(static_cast<SfxEntry *>(nptr));
419       }
420     }
421     continue;
422   error:
423     return make_err(corrupt_affix, MsgConv(lang)(achar)).with_file(affix_file, dp.line_num);
424   error_count:
425     return make_err(corrupt_affix, MsgConv(lang)(achar),
426                     _("Possibly incorrect count.")).with_file(affix_file, dp.line_num);
427   }
428   afflst.close();
429 
430   // now we can speed up performance greatly taking advantage of the
431   // relationship between the affixes and the idea of "subsets".
432 
433   // View each prefix as a potential leading subset of another and view
434   // each suffix (reversed) as a potential trailing subset of another.
435 
436   // To illustrate this relationship if we know the prefix "ab" is
437   // found in the word to examine, only prefixes that "ab" is a
438   // leading subset of need be examined.  Furthermore is "ab" is not
439   // present then none of the prefixes that "ab" is is a subset need
440   // be examined.
441 
442   // The same argument goes for suffix string that are reversed.
443 
444   // Then to top this off why not examine the first char of the word
445   // to quickly limit the set of prefixes to examine (i.e. the
446   // prefixes to examine must be leading supersets of the first
447   // character of the word (if they exist)
448 
449   // To take advantage of this "subset" relationship, we need to add
450   // two links from entry.  One to take next if the current prefix
451   // is found (call it nexteq) and one to take next if the current
452   // prefix is not found (call it nextne).
453 
454   // Since we have built ordered lists, all that remains is to
455   // properly initialize the nextne and nexteq pointers that relate
456   // them
457 
458   process_pfx_order();
459   process_sfx_order();
460 
461   //CERR.printf("%u\n", data_buf.calc_size()/1024);
462 
463   return no_err;
464 
465 }
466 
467 
468 // we want to be able to quickly access prefix information
469 // both by prefix flag, and sorted by prefix string itself
470 // so we need to set up two indexes
471 
build_pfxlist(PfxEntry * pfxptr)472 PosibErr<void> AffixMgr::build_pfxlist(PfxEntry* pfxptr)
473 {
474   PfxEntry * ptr;
475   PfxEntry * ep = pfxptr;
476 
477   // get the right starting point
478   const char * key = ep->key();
479   const byte flg = ep->flag();
480 
481   // first index by flag which must exist
482   ptr = pFlag[flg];
483   ep->flag_next = ptr;
484   pFlag[flg] = ep;
485 
486   // next insert the affix string, it will be sorted latter
487 
488   byte sp = *((const byte *)key);
489   ptr = pStart[sp];
490   ep->next = ptr;
491   pStart[sp] = ep;
492   return no_err;
493 }
494 
495 // we want to be able to quickly access suffix information
496 // both by suffix flag, and sorted by the reverse of the
497 // suffix string itself; so we need to set up two indexes
498 
build_sfxlist(SfxEntry * sfxptr)499 PosibErr<void> AffixMgr::build_sfxlist(SfxEntry* sfxptr)
500 {
501   SfxEntry * ptr;
502   SfxEntry * ep = sfxptr;
503   char * tmp = (char *)data_buf.alloc(sfxptr->appndl + 1);
504   sfxptr->rappnd = tmp;
505 
506   // reverse the string
507   char * dest = tmp + sfxptr->appndl;
508   *dest-- = 0;
509   const char * src = sfxptr->appnd;
510   for (; dest >= tmp; --dest, ++src)
511     *dest = *src;
512 
513   /* get the right starting point */
514   const char * key = ep->key();
515   const byte flg = ep->flag();
516 
517   // first index by flag which must exist
518   ptr = sFlag[flg];
519   ep->flag_next = ptr;
520   sFlag[flg] = ep;
521 
522   // next insert the affix string, it will be sorted latter
523 
524   byte sp = *((const byte *)key);
525   ptr = sStart[sp];
526   ep->next = ptr;
527   sStart[sp] = ep;
528   return no_err;
529 }
530 
531 
532 
533 // initialize the PfxEntry links NextEQ and NextNE to speed searching
process_pfx_order()534 PosibErr<void> AffixMgr::process_pfx_order()
535 {
536   PfxEntry* ptr;
537 
538   // loop through each prefix list starting point
539   for (int i=1; i < SETSIZE; i++) {
540 
541     ptr = pStart[i];
542 
543     if (ptr && ptr->next)
544       ptr = pStart[i] = sort(ptr, AffixLess<PfxEntry>());
545 
546     // look through the remainder of the list
547     //  and find next entry with affix that
548     // the current one is not a subset of
549     // mark that as destination for NextNE
550     // use next in list that you are a subset
551     // of as NextEQ
552 
553     for (; ptr != NULL; ptr = ptr->next) {
554 
555       PfxEntry * nptr = ptr->next;
556       for (; nptr != NULL; nptr = nptr->next) {
557         if (! isSubset( ptr->key() , nptr->key() )) break;
558       }
559       ptr->next_ne = nptr;
560       ptr->next_eq = NULL;
561       if ((ptr->next) && isSubset(ptr->key() ,
562                                   (ptr->next)->key()))
563         ptr->next_eq = ptr->next;
564     }
565 
566     // now clean up by adding smart search termination strings
567     // if you are already a superset of the previous prefix
568     // but not a subset of the next, search can end here
569     // so set NextNE properly
570 
571     ptr = pStart[i];
572     for (; ptr != NULL; ptr = ptr->next) {
573       PfxEntry * nptr = ptr->next;
574       PfxEntry * mptr = NULL;
575       for (; nptr != NULL; nptr = nptr->next) {
576         if (! isSubset(ptr->key(),nptr->key())) break;
577         mptr = nptr;
578       }
579       if (mptr) mptr->next_ne = NULL;
580     }
581   }
582   return no_err;
583 }
584 
585 
586 
587 // initialize the SfxEntry links NextEQ and NextNE to speed searching
process_sfx_order()588 PosibErr<void> AffixMgr::process_sfx_order()
589 {
590   SfxEntry* ptr;
591 
592   // loop through each prefix list starting point
593   for (int i=1; i < SETSIZE; i++) {
594 
595     ptr = sStart[i];
596 
597     if (ptr && ptr->next)
598       ptr = sStart[i] = sort(ptr, AffixLess<SfxEntry>());
599 
600     // look through the remainder of the list
601     //  and find next entry with affix that
602     // the current one is not a subset of
603     // mark that as destination for NextNE
604     // use next in list that you are a subset
605     // of as NextEQ
606 
607     for (; ptr != NULL; ptr = ptr->next) {
608       SfxEntry * nptr = ptr->next;
609       for (; nptr != NULL; nptr = nptr->next) {
610         if (! isSubset(ptr->key(),nptr->key())) break;
611       }
612       ptr->next_ne = nptr;
613       ptr->next_eq = NULL;
614       if ((ptr->next) && isSubset(ptr->key(),(ptr->next)->key()))
615         ptr->next_eq = ptr->next;
616     }
617 
618 
619     // now clean up by adding smart search termination strings:
620     // if you are already a superset of the previous suffix
621     // but not a subset of the next, search can end here
622     // so set NextNE properly
623 
624     ptr = sStart[i];
625     for (; ptr != NULL; ptr = ptr->next) {
626       SfxEntry * nptr = ptr->next;
627       SfxEntry * mptr = NULL;
628       for (; nptr != NULL; nptr = nptr->next) {
629         if (! isSubset(ptr->key(),nptr->key())) break;
630         mptr = nptr;
631       }
632       if (mptr) mptr->next_ne = NULL;
633     }
634   }
635   return no_err;
636 }
637 
// takes aff file condition string and creates the
// conds array - please see the appendix at the end of the
// file affentry.cxx which describes what is going on here
// in much more detail
//
// l    lookup of the Conds objects built so far, keyed by condition
//      string; an existing object is reused, a new one is inserted
// buf  storage the new Conds object and its copy of `cs` come from
// ptr  affix entry whose `conds` member is set
// cs   condition string (parse_file passes the normalized form)
//
// In the resulting table bit n of conds[c] is set when character c is
// accepted at condition position n, and `num` is the number of
// positions.  The string "." alone means "no condition" (num == 0).
//
// NOTE(review): the table elements are of type char, so positions
// beyond the width of a char presumably can not be represented -
// confirm whether the number of conditions is limited elsewhere.

static void encodeit(CondsLookup & l, ObjStack & buf,
                     AffEntry * ptr, char * cs)
{
  byte c;
  int i, j, k;

  // see if we already have this conds matrix

  CondsLookup::iterator itr = l.find(cs);
  if (!(itr == l.end())) {
    ptr->conds = *itr;
    return;
  }

  // not seen before: create a new object and register it right away;
  // its table is filled in below
  Conds * cds = (Conds *)buf.alloc_bottom(sizeof(Conds));
  cds->str = buf.dup(cs);
  l.insert(cds);
  ptr->conds = cds;

  int nc = strlen(cs);
  // scratch list of the members of the group currently being parsed
  VARARRAYM(byte, mbr, nc + 1, MAXLNLEN);

  // now clear the conditions array
  memset(cds->conds, 0, sizeof(cds->conds));

  // now parse the string to create the conds array

  int neg = 0;   // complement indicator
  int grp = 0;   // group indicator
  int n = 0;     // number of conditions
  int ec = 0;    // end condition indicator
  int nm = 0;    // number of member in group

  // if no condition just return
  if (strcmp(cs,".")==0) {
    cds->num = 0;
    return;
  }

  // Each character is run through the tests below in order; a test
  // that consumes the character sets c to 0 so later tests skip it.
  i = 0;
  while (i < nc) {
    c = *((byte *)(cs + i));

    // start group indicator
    if (c == '[') {
      grp = 1;
      c = 0;
    }

    // complement flag
    if ((grp == 1) && (c == '^')) {
      neg = 1;
      c = 0;
    }

    // end group indicator
    if (c == ']') {
      ec = 1;
      c = 0;
    }

    // add character of group to list
    if ((grp == 1) && (c != 0)) {
      *(mbr + nm) = c;
      nm++;
      c = 0;
    }

    // end of condition
    // (c is still set only for a character outside of any group)
    if (c != 0) {
      ec = 1;
    }


    if (ec) {
      if (grp == 1) {
        if (neg == 0) {
          // set the proper bits in the condition array vals for those chars
          for (j=0;j<nm;j++) {
            k = (unsigned int) mbr[j];
            cds->conds[k] = cds->conds[k] | (1 << n);
          }
        } else {
          // complement so set all of them and then unset indicated ones
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
          for (j=0;j<nm;j++) {
            k = (unsigned int) mbr[j];
            cds->conds[k] = cds->conds[k] & ~(1 << n);
          }
        }
        // group finished, reset the group state for the next one
        neg = 0;
        grp = 0;
        nm = 0;
      } else {
        // not a group so just set the proper bit for this char
        // but first handle special case of . inside condition
        if (c == '.') {
          // wild card character so set them all
          for (j=0;j<SETSIZE;j++) cds->conds[j] = cds->conds[j] | (1 << n);
        } else {
          // note: a ']' outside of a group arrives here with c == 0
          cds->conds[(unsigned int)c] = cds->conds[(unsigned int)c] | (1 << n);
        }
      }
      n++;
      ec = 0;
    }


    i++;
  }
  cds->num = n;
  return;
}
756 
757 
758 // check word for prefixes
prefix_check(const LookupInfo & linf,ParmString word,CheckInfo & ci,GuessInfo * gi,bool cross) const759 bool AffixMgr::prefix_check (const LookupInfo & linf, ParmString word,
760                              CheckInfo & ci, GuessInfo * gi, bool cross) const
761 {
762   if (word.empty()) return false;
763 
764   // first handle the special case of 0 length prefixes
765   PfxEntry * pe = pStart[0];
766   while (pe) {
767     if (pe->check(linf,this,word,ci,gi)) return true;
768     pe = pe->next;
769   }
770 
771   // now handle the general case
772   byte sp = *reinterpret_cast<const byte *>(word.str());
773   PfxEntry * pptr = pStart[sp];
774 
775   while (pptr) {
776     if (isSubset(pptr->key(),word)) {
777       if (pptr->check(linf,this,word,ci,gi,cross)) return true;
778       pptr = pptr->next_eq;
779     } else {
780       pptr = pptr->next_ne;
781     }
782   }
783 
784   return false;
785 }
786 
787 
788 // check word for suffixes
suffix_check(const LookupInfo & linf,ParmString word,CheckInfo & ci,GuessInfo * gi,int sfxopts,AffEntry * ppfx) const789 bool AffixMgr::suffix_check (const LookupInfo & linf, ParmString word,
790                              CheckInfo & ci, GuessInfo * gi,
791                              int sfxopts, AffEntry * ppfx) const
792 {
793   if (word.empty()) return false;
794 
795   // first handle the special case of 0 length suffixes
796   SfxEntry * se = sStart[0];
797   while (se) {
798     if (se->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
799     se = se->next;
800   }
801 
802   if (word.size() == 0)
803     return false;
804 
805   // now handle the general case
806   byte sp = *((const byte *)(word + word.size() - 1));
807   SfxEntry * sptr = sStart[sp];
808 
809   while (sptr) {
810     if (isRevSubset(sptr->key(), word + word.size() - 1, word.size())) {
811       if (sptr->check(linf, word, ci, gi, sfxopts, ppfx)) return true;
812       sptr = sptr->next_eq;
813     } else {
814       sptr = sptr->next_ne;
815     }
816   }
817 
818   return false;
819 }
820 
821 // check if word with affixes is correctly spelled
affix_check(const LookupInfo & linf,ParmString word,CheckInfo & ci,GuessInfo * gi) const822 bool AffixMgr::affix_check(const LookupInfo & linf, ParmString word,
823                            CheckInfo & ci, GuessInfo * gi) const
824 {
825   if (word.empty()) return false;
826 
827   // Deal With Case in a semi-intelligent manner
828   CasePattern cp = lang->LangImpl::case_pattern(word);
829   ParmString pword = word;
830   ParmString sword = word;
831   CharVector lower;
832   if (cp == FirstUpper) {
833     lower.append(word, word.size() + 1);
834     lower[0] = lang->to_lower(word[0]);
835     pword = ParmString(lower.data(), lower.size() - 1);
836   } else if (cp == AllUpper) {
837     lower.resize(word.size() + 1);
838     unsigned int i = 0;
839     for (; i != word.size(); ++i)
840       lower[i] = lang->to_lower(word[i]);
841     lower[i] = '\0';
842     pword = ParmString(lower.data(), lower.size() - 1);
843     sword = pword;
844   }
845 
846   // check all prefixes (also crossed with suffixes if allowed)
847   if (prefix_check(linf, pword, ci, gi)) return true;
848 
849   // if still not found check all suffixes
850   if (suffix_check(linf, sword, ci, gi, 0, NULL)) return true;
851 
852   // if still not found check again but with the lower case version
853   // which can make a difference if the entire word matches the cond
854   // string
855   if (cp == FirstUpper) {
856     return suffix_check(linf, pword, ci, gi, 0, NULL);
857   } else {
858     return false;
859   }
860 }
861 
munch(ParmString word,GuessInfo * gi,bool cross) const862 void AffixMgr::munch(ParmString word, GuessInfo * gi, bool cross) const
863 {
864   LookupInfo li(0, LookupInfo::AlwaysTrue);
865   CheckInfo ci;
866   gi->reset();
867   CasePattern cp = lang->LangImpl::case_pattern(word);
868   if (cp == AllUpper) return;
869   if (cp != FirstUpper)
870     prefix_check(li, word, ci, gi, cross);
871   suffix_check(li, word, ci, gi, 0, NULL);
872 }
873 
// Expands `word` with the affix flags in `aff` and returns the result
// as a linked list of WordAff nodes allocated from `buf`.
//
// The first node is a copy of `word` itself.  One node is added for
// every prefix entry of a flag in `aff` whose add() yields a word.
// When `limit` is 0 the nodes keep flag lists: the first node the
// flags of `aff` that have suffix entries, a prefixed word either the
// suffix flags that allow a cross product or the empty list.
//
// When `limit` is not 0, expand_suffix() is additionally run on each
// of these nodes (unless its word length minus max_strip_ is at least
// `limit`); the words it produces are appended to the end of the list
// and the node's flag list is replaced by the one expand_suffix()
// writes.
WordAff * AffixMgr::expand(ParmString word, ParmString aff,
                           ObjStack & buf, int limit) const
{
  // empty flag list for prefixed words that may not take suffixes
  byte * empty = (byte *)buf.alloc(1);
  *empty = 0;

  // suf:  flags of aff that have suffix entries
  // csuf: those of them that allow a cross product
  // both are filled in by the loop below and terminated after it
  byte * suf  = (byte *)buf.alloc(aff.size() + 1);
  byte * suf_e = suf;
  byte * csuf = (byte *)buf.alloc(aff.size() + 1);
  byte * csuf_e = csuf;

  WordAff * head = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
  WordAff * cur = head;
  cur->word = buf.dup(word);
  cur->aff  = suf;

  for (const byte * c = (const byte *)aff.str(), * end = c + aff.size();
       c != end;
       ++c)
  {
    if (sFlag[*c]) *suf_e++ = *c;
    if (sFlag[*c] && sFlag[*c]->allow_cross()) *csuf_e++ = *c;

    for (PfxEntry * p = pFlag[*c]; p; p = p->flag_next) {
      SimpleString newword = p->add(word, buf);
      if (!newword) continue;
      cur->next = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
      cur = cur->next;
      cur->word = newword;
      // csuf is shared by all nodes and still being filled at this point
      cur->aff = p->allow_cross() ? csuf : empty;
    }
  }

  *suf_e = 0;
  *csuf_e = 0;
  cur->next = 0;

  if (limit == 0) return head;

  // end:      fixed end of the nodes built above, the loop stops there
  // very_end: end of the whole list, advanced by expand_suffix()
  WordAff * * end = &cur->next;
  WordAff * * very_end = end;
  // room for all of suf plus the terminator; no list is longer
  size_t nsuf_s = suf_e - suf + 1;

  for (WordAff * * cur = &head; cur != end; cur = &(*cur)->next) {
    if ((int)(*cur)->word.size - max_strip_ >= limit) continue;
    byte * nsuf = (byte *)buf.alloc(nsuf_s);
    expand_suffix((*cur)->word, (*cur)->aff, buf, limit, nsuf, &very_end, word);
    (*cur)->aff = nsuf;
  }

  return head;
}
926 
// Applies the suffix entries of the flags in `aff` (a null terminated
// flag list) to `word` and links the resulting words into a list of
// WordAff nodes allocated from `buf`.
//
// limit      a flag is only tried when word.size() minus the longest
//            strip string of that flag is below `limit`; also passed on
//            to SfxEntry::add()
// new_aff    if not null, receives a null terminated list of the flags
//            selected by the test at the end of the loop below
// l          if not null, *l is the link where new nodes are appended
//            and it is moved to the new end of the list; if null a new
//            list is built
// orig_word  passed on to SfxEntry::add(); `word` is used when it is
//            not given
//
// Returns the head of the new list when `l` is null.  With `l` given
// the return value is the pointer that was stored in **l on entry.
WordAff * AffixMgr::expand_suffix(ParmString word, const byte * aff,
                                  ObjStack & buf, int limit,
                                  byte * new_aff, WordAff * * * l,
                                  ParmString orig_word) const
{
  WordAff * head = 0;
  if (l) head = **l;
  WordAff * * cur = l ? *l : &head;
  // NOTE(review): both flags are set once and never reset, so they
  // describe all flags handled so far rather than the current one -
  // confirm that this is intended.
  bool expanded     = false;
  bool not_expanded = false;
  if (!orig_word) orig_word = word;

  while (*aff) {
    if ((int)word.size() - max_strip_f[*aff] < limit) {
      for (SfxEntry * p = sFlag[*aff]; p; p = p->flag_next) {
        SimpleString newword = p->add(word, buf, limit, orig_word);
        if (!newword) continue;
        // add() answering with EMPTY is recorded but produces no node
        if (newword == EMPTY) {not_expanded = true; continue;}
        *cur = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
        (*cur)->word = newword;
        (*cur)->aff  = (const byte *)EMPTY;
        cur = &(*cur)->next;
        expanded = true;
      }
    }
    if (new_aff && (!expanded || not_expanded)) *new_aff++ = *aff;
    ++aff;
  }
  // terminate the list and the flag list
  *cur = 0;
  if (new_aff) *new_aff = 0;
  if (l) *l = cur;
  return head;
}
960 
check_affix(ParmString word,char aff) const961 CheckAffixRes AffixMgr::check_affix(ParmString word, char aff) const
962 {
963   CheckAffixRes res = InvalidAffix;
964 
965   for (PfxEntry * p = pFlag[(unsigned char)aff]; p; p = p->flag_next) {
966     res = InapplicableAffix;
967     if (p->applicable(word)) return ValidAffix;
968   }
969 
970   for (SfxEntry * p = sFlag[(unsigned char)aff]; p; p = p->flag_next) {
971     if (res == InvalidAffix) res = InapplicableAffix;
972     if (p->applicable(word)) return ValidAffix;
973   }
974 
975   return res;
976 }
977 
978 
979 
980 //////////////////////////////////////////////////////////////////////
981 //
982 // LookupInfo
983 //
984 
lookup(ParmString word,const SensitiveCompare * c,char achar,WordEntry & o,GuessInfo * gi) const985 int LookupInfo::lookup (ParmString word, const SensitiveCompare * c,
986                         char achar,
987                         WordEntry & o, GuessInfo * gi) const
988 {
989   SpellerImpl::WS::const_iterator i = begin;
990   const char * g = 0;
991   if (mode == Word) {
992     do {
993       (*i)->lookup(word, c, o);
994       for (;!o.at_end(); o.adv()) {
995         if (TESTAFF(o.aff, achar))
996           return 1;
997         else
998           g = o.word;
999       }
1000       ++i;
1001     } while (i != end);
1002   } else if (mode == Clean) {
1003     do {
1004       (*i)->clean_lookup(word, o);
1005       for (;!o.at_end(); o.adv()) {
1006         if (TESTAFF(o.aff, achar))
1007           return 1;
1008         else
1009           g = o.word;
1010       }
1011       ++i;
1012     } while (i != end);
1013   } else if (gi) {
1014     g = gi->dup(word);
1015   }
1016   if (gi && g) {
1017     CheckInfo * ci = gi->add();
1018     ci->word = g;
1019     return -1;
1020   }
1021   return 0;
1022 }
1023 
1024 //////////////////////////////////////////////////////////////////////
1025 //
1026 // Affix Entry
1027 //
1028 
applicable(SimpleString word) const1029 bool PfxEntry::applicable(SimpleString word) const
1030 {
1031   unsigned int cond;
1032   /* make sure all conditions match */
1033   if ((word.size > stripl) && (word.size >= conds->num)) {
1034     const byte * cp = (const byte *) word.str;
1035     for (cond = 0;  cond < conds->num;  cond++) {
1036       if ((conds->get(*cp++) & (1 << cond)) == 0)
1037         break;
1038     }
1039     if (cond >= conds->num) return true;
1040   }
1041   return false;
1042 }
1043 
1044 // add prefix to this word assuming conditions hold
add(SimpleString word,ObjStack & buf) const1045 SimpleString PfxEntry::add(SimpleString word, ObjStack & buf) const
1046 {
1047   unsigned int cond;
1048   /* make sure all conditions match */
1049   if ((word.size > stripl) && (word.size >= conds->num)) {
1050     const byte * cp = (const byte *) word.str;
1051     for (cond = 0;  cond < conds->num;  cond++) {
1052       if ((conds->get(*cp++) & (1 << cond)) == 0)
1053         break;
1054     }
1055     if (cond >= conds->num) {
1056       /* */
1057       int alen = word.size - stripl;
1058       char * newword = (char *)buf.alloc(alen + appndl + 1);
1059       if (appndl) memcpy(newword, appnd, appndl);
1060       memcpy(newword + appndl, word + stripl, alen + 1);
1061       return SimpleString(newword, alen + appndl);
1062     }
1063   }
1064   return SimpleString();
1065 }
1066 
1067 // check if this prefix entry matches
check(const LookupInfo & linf,const AffixMgr * pmyMgr,ParmString word,CheckInfo & ci,GuessInfo * gi,bool cross) const1068 bool PfxEntry::check(const LookupInfo & linf, const AffixMgr * pmyMgr,
1069                      ParmString word,
1070                      CheckInfo & ci, GuessInfo * gi, bool cross) const
1071 {
1072   unsigned int		cond;	// condition number being examined
1073   unsigned              tmpl;   // length of tmpword
1074   WordEntry             wordinfo;     // hash entry of root word or NULL
1075   byte *	cp;
1076   VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1077 
1078   // on entry prefix is 0 length or already matches the beginning of the word.
1079   // So if the remaining root word has positive length
1080   // and if there are enough chars in root word and added back strip chars
1081   // to meet the number of characters conditions, then test it
1082 
1083   tmpl = word.size() - appndl;
1084 
1085   if ((tmpl > 0) &&  (tmpl + stripl >= conds->num)) {
1086 
1087     // generate new root word by removing prefix and adding
1088     // back any characters that would have been stripped
1089 
1090     if (stripl) strcpy (tmpword, strip);
1091     strcpy ((tmpword + stripl), (word + appndl));
1092 
1093     // now make sure all of the conditions on characters
1094     // are met.  Please see the appendix at the end of
1095     // this file for more info on exactly what is being
1096     // tested
1097 
1098     cp = (byte *)tmpword;
1099     for (cond = 0;  cond < conds->num;  cond++) {
1100       if ((conds->get(*cp++) & (1 << cond)) == 0) break;
1101     }
1102 
1103     // if all conditions are met then check if resulting
1104     // root word in the dictionary
1105 
1106     if (cond >= conds->num) {
1107       CheckInfo * lci = 0;
1108       CheckInfo * guess = 0;
1109       tmpl += stripl;
1110 
1111       int res = linf.lookup(tmpword, &linf.sp->s_cmp_end, achar, wordinfo, gi);
1112 
1113       if (res == 1) {
1114 
1115         lci = &ci;
1116         lci->word = wordinfo.word;
1117         goto quit;
1118 
1119       } else if (res == -1) {
1120 
1121         guess = gi->head;
1122 
1123       }
1124 
1125       // prefix matched but no root word was found
1126       // if XPRODUCT is allowed, try again but now
1127       // cross checked combined with a suffix
1128 
1129       if (gi)
1130         lci = gi->head;
1131 
1132       if (cross && xpflg & XPRODUCT) {
1133         if (pmyMgr->suffix_check(linf, ParmString(tmpword, tmpl),
1134                                  ci, gi,
1135                                  XPRODUCT, (AffEntry *)this)) {
1136           lci = &ci;
1137 
1138         } else if (gi) {
1139 
1140           CheckInfo * stop = lci;
1141           for (lci = gi->head;
1142                lci != stop;
1143                lci = const_cast<CheckInfo *>(lci->next))
1144           {
1145             lci->pre_flag = achar;
1146             lci->pre_strip_len = stripl;
1147             lci->pre_add_len = appndl;
1148             lci->pre_add = appnd;
1149           }
1150 
1151         } else {
1152 
1153           lci = 0;
1154 
1155         }
1156       }
1157 
1158       if (guess)
1159         lci = guess;
1160 
1161     quit:
1162       if (lci) {
1163         lci->pre_flag = achar;
1164         lci->pre_strip_len = stripl;
1165         lci->pre_add_len = appndl;
1166         lci->pre_add = appnd;
1167       }
1168       if (lci == &ci) return true;
1169     }
1170   }
1171   return false;
1172 }
1173 
applicable(SimpleString word) const1174 bool SfxEntry::applicable(SimpleString word) const
1175 {
1176   int cond;
1177   /* make sure all conditions match */
1178   if ((word.size > stripl) && (word.size >= conds->num)) {
1179     const byte * cp = (const byte *) (word + word.size);
1180     for (cond = conds->num; --cond >=0; ) {
1181       if ((conds->get(*--cp) & (1 << cond)) == 0)
1182         break;
1183     }
1184     if (cond < 0) return true;
1185   }
1186   return false;
1187 }
1188 
1189 // add suffix to this word assuming conditions hold
add(SimpleString word,ObjStack & buf,int limit,SimpleString orig_word) const1190 SimpleString SfxEntry::add(SimpleString word, ObjStack & buf,
1191                            int limit, SimpleString orig_word) const
1192 {
1193   int cond;
1194   /* make sure all conditions match */
1195   if ((orig_word.size > stripl) && (orig_word.size >= conds->num)) {
1196     const byte * cp = (const byte *) (orig_word + orig_word.size);
1197     for (cond = conds->num; --cond >=0; ) {
1198       if ((conds->get(*--cp) & (1 << cond)) == 0)
1199         break;
1200     }
1201     if (cond < 0) {
1202       int alen = word.size - stripl;
1203       if (alen >= limit) return EMPTY;
1204       /* we have a match so add suffix */
1205       char * newword = (char *)buf.alloc(alen + appndl + 1);
1206       memcpy(newword, word, alen);
1207       memcpy(newword + alen, appnd, appndl + 1);
1208       return SimpleString(newword, alen + appndl);
1209     }
1210   }
1211   return SimpleString();
1212 }
1213 
1214 // see if this suffix is present in the word
check(const LookupInfo & linf,ParmString word,CheckInfo & ci,GuessInfo * gi,int optflags,AffEntry * ppfx)1215 bool SfxEntry::check(const LookupInfo & linf, ParmString word,
1216                      CheckInfo & ci, GuessInfo * gi,
1217                      int optflags, AffEntry* ppfx)
1218 {
1219   unsigned              tmpl;		 // length of tmpword
1220   int			cond;		 // condition beng examined
1221   WordEntry             wordinfo;        // hash entry pointer
1222   byte *	cp;
1223   VARARRAYM(char, tmpword, word.size()+stripl+1, MAXWORDLEN+1);
1224   PfxEntry* ep = (PfxEntry *) ppfx;
1225 
1226   // if this suffix is being cross checked with a prefix
1227   // but it does not support cross products skip it
1228 
1229   if ((optflags & XPRODUCT) != 0 &&  (xpflg & XPRODUCT) == 0)
1230     return false;
1231 
1232   // upon entry suffix is 0 length or already matches the end of the word.
1233   // So if the remaining root word has positive length
1234   // and if there are enough chars in root word and added back strip chars
1235   // to meet the number of characters conditions, then test it
1236 
1237   tmpl = word.size() - appndl;
1238 
1239   if ((tmpl > 0)  &&  (tmpl + stripl >= conds->num)) {
1240 
1241     // generate new root word by removing suffix and adding
1242     // back any characters that would have been stripped or
1243     // or null terminating the shorter string
1244 
1245     strcpy (tmpword, word);
1246     cp = (byte *)(tmpword + tmpl);
1247     if (stripl) {
1248       strcpy ((char *)cp, strip);
1249       tmpl += stripl;
1250       cp = (byte *)(tmpword + tmpl);
1251     } else *cp = '\0';
1252 
1253     // now make sure all of the conditions on characters
1254     // are met.  Please see the appendix at the end of
1255     // this file for more info on exactly what is being
1256     // tested
1257 
1258     for (cond = conds->num;  --cond >= 0; ) {
1259       if ((conds->get(*--cp) & (1 << cond)) == 0) break;
1260     }
1261 
1262     // if all conditions are met then check if resulting
1263     // root word in the dictionary
1264 
1265     if (cond < 0) {
1266       CheckInfo * lci = 0;
1267       tmpl += stripl;
1268       const SensitiveCompare * cmp =
1269         optflags & XPRODUCT ? &linf.sp->s_cmp_middle : &linf.sp->s_cmp_begin;
1270       int res = linf.lookup(tmpword, cmp, achar, wordinfo, gi);
1271       if (res == 1
1272           && ((optflags & XPRODUCT) == 0 || TESTAFF(wordinfo.aff, ep->achar)))
1273       {
1274         lci = &ci;
1275         lci->word = wordinfo.word;
1276       } else if (res == 1 && gi) {
1277         lci = gi->add();
1278         lci->word = wordinfo.word;
1279       } else if (res == -1) { // gi must be defined
1280         lci = gi->head;
1281       }
1282 
1283       if (lci) {
1284         lci->suf_flag = achar;
1285         lci->suf_strip_len = stripl;
1286         lci->suf_add_len = appndl;
1287         lci->suf_add = appnd;
1288       }
1289 
1290       if (lci == &ci) return true;
1291     }
1292   }
1293   return false;
1294 }
1295 
1296 //////////////////////////////////////////////////////////////////////
1297 //
1298 // new_affix_mgr
1299 //
1300 
1301 
new_affix_mgr(ParmString name,Conv & iconv,const Language * lang)1302 PosibErr<AffixMgr *> new_affix_mgr(ParmString name,
1303                                    Conv & iconv,
1304                                    const Language * lang)
1305 {
1306   if (name == "none")
1307     return 0;
1308   //CERR << "NEW AFFIX MGR\n";
1309   String file;
1310   file += lang->data_dir();
1311   file += '/';
1312   file += lang->name();
1313   file += "_affix.dat";
1314   AffixMgr * affix;
1315   affix = new AffixMgr(lang);
1316   PosibErrBase pe = affix->setup(file, iconv);
1317   if (pe.has_err()) {
1318     delete affix;
1319     return pe;
1320   } else {
1321     return affix;
1322   }
1323 }
1324 }
1325 
1326 /**************************************************************************
1327 
1328 Appendix:  Understanding Affix Code
1329 
1330 
1331 An affix is either a  prefix or a suffix attached to root words to make
1332 other words.
1333 
Basically a Prefix or a Suffix is a set of AffEntry objects
1335 which store information about the prefix or suffix along
1336 with supporting routines to check if a word has a particular
1337 prefix or suffix or a combination.
1338 
1339 The structure affentry is defined as follows:
1340 
1341 struct AffEntry
1342 {
1343    unsigned char achar;   // char used to represent the affix
1344    char * strip;          // string to strip before adding affix
1345    char * appnd;          // the affix string to add
1346    short  stripl;         // length of the strip string
1347    short  appndl;         // length of the affix string
1348    short  numconds;       // the number of conditions that must be met
1349    short  xpflg;          // flag: XPRODUCT- combine both prefix and suffix
1350    char   conds[SETSIZE]; // array which encodes the conditions to be met
1351 };
1352 
1353 
1354 Here is a suffix borrowed from the en_US.aff file.  This file
1355 is whitespace delimited.
1356 
1357 SFX D Y 4
1358 SFX D   0     e          d
1359 SFX D   y     ied        [^aeiou]y
1360 SFX D   0     ed         [^ey]
1361 SFX D   0     ed         [aeiou]y
1362 
1363 This information can be interpreted as follows:
1364 
The first line has 4 fields
1366 
1367 Field
1368 -----
1369 1     SFX - indicates this is a suffix
1370 2     D   - is the name of the character flag which represents this suffix
1371 3     Y   - indicates it can be combined with prefixes (cross product)
1372 4     4   - indicates that sequence of 4 affentry structures are needed to
1373                properly store the affix information
1374 
1375 The remaining lines describe the unique information for the 4 SfxEntry
1376 objects that make up this affix.  Each line can be interpreted
as follows: (note fields 1 and 2 serve as a check against line 1 info)
1378 
1379 Field
1380 -----
1381 1     SFX         - indicates this is a suffix
1382 2     D           - is the name of the character flag for this affix
1383 3     y           - the string of chars to strip off before adding affix
1384                          (a 0 here indicates the NULL string)
1385 4     ied         - the string of affix characters to add
1386 5     [^aeiou]y   - the conditions which must be met before the affix
1387                     can be applied
1388 
1389 Field 5 is interesting.  Since this is a suffix, field 5 tells us that
1390 there are 2 conditions that must be met.  The first condition is that
1391 the next to the last character in the word must *NOT* be any of the
1392 following "a", "e", "i", "o" or "u".  The second condition is that
the last character of the word must be "y".
1394 
1395 So how can we encode this information concisely and be able to
test for both conditions in a fast manner?  The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et al.
1398 (now available under a normal BSD license).
1399 
1400 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
1401 using a character (cast to an unsigned char) of a string, we have 8 bits
1402 of information we can store about that character.  Specifically we
1403 could use each bit to say if that character is allowed in any of the
1404 last (or first for prefixes) 8 characters of the word.
1405 
1406 Basically, each character at one end of the word (up to the number
1407 of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
1409 specific character position in the word.
1410 
1411 For prefixes, it does this by setting bit 0 if that char is valid
1412 in the first position, bit 1 if valid in the second position, and so on.
1413 
1414 If a bit is not set, then that char is not valid for that position in the
1415 word.
1416 
1417 If working with suffixes bit 0 is used for the character closest
1418 to the front, bit 1 for the next character towards the end, ...,
1419 with bit numconds-1 representing the last char at the end of the string.
1420 
1421 Note: since entries in the conds[] are 8 bits, only 8 conditions
1422 (read that only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).
1424 
So to make this clearer, let's encode the conds array values for the
1426 first two affentries for the suffix D described earlier.
1427 
1428 
1429   For the first affentry:
1430      numconds = 1             (only examine the last character)
1431 
1432      conds['e'] =  (1 << 0)   (the word must end in an E)
1433      all others are all 0
1434 
1435   For the second affentry:
1436      numconds = 2             (only examine the last two characters)
1437 
1438      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
1439          where X is all characters *but* a, e, i, o, or u
1440 
1441 
1442      conds['y'] = (1 << 1)     (the last char must be a y)
1443      all other bits for all other entries in the conds array are zero
1444 
1445 
1446 **************************************************************************/
1447