1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
13 *
14 * The Original Code is Hunspell, based on MySpell.
15 *
16 * The Initial Developers of the Original Code are
17 * Kevin Hendricks (MySpell) and Németh László (Hunspell).
18 * Portions created by the Initial Developers are Copyright (C) 2002-2005
19 * the Initial Developers. All Rights Reserved.
20 *
21 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
22 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
23 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
24 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
25 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
26 *
27 * Alternatively, the contents of this file may be used under the terms of
28 * either the GNU General Public License Version 2 or later (the "GPL"), or
29 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 * in which case the provisions of the GPL or the LGPL are applicable instead
31 * of those above. If you wish to allow use of your version of this file only
32 * under the terms of either the GPL or the LGPL, and not to allow others to
33 * use your version of this file under the terms of the MPL, indicate your
34 * decision by deleting the provisions above and replace them with the notice
35 * and other provisions required by the GPL or the LGPL. If you do not delete
36 * the provisions above, a recipient may use your version of this file under
37 * the terms of any one of the MPL, the GPL or the LGPL.
38 *
39 * ***** END LICENSE BLOCK ***** */
40 /*
41 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
42 * And Contributors. All rights reserved.
43 *
44 * Redistribution and use in source and binary forms, with or without
45 * modification, are permitted provided that the following conditions
46 * are met:
47 *
48 * 1. Redistributions of source code must retain the above copyright
49 * notice, this list of conditions and the following disclaimer.
50 *
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 *
55 * 3. All modifications to the source code must be clearly marked as
56 * such. Binary redistributions based on modified source code
57 * must be clearly marked as modified versions in the documentation
58 * and/or other materials provided with the distribution.
59 *
60 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
61 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
62 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
63 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
64 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
65 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
66 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
67 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71 * SUCH DAMAGE.
72 */
73
74 #include <stdlib.h>
75 #include <string.h>
76 #include <stdio.h>
77 #include <ctype.h>
78
79 #include "affentry.hxx"
80 #include "csutil.hxx"
81
PfxEntry(AffixMgr * pmgr,affentry * dp)82 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
83 // register affix manager
84 : pmyMgr(pmgr),
85 next(NULL),
86 nexteq(NULL),
87 nextne(NULL),
88 flgnxt(NULL) {
89 // set up its initial values
90 aflag = dp->aflag; // flag
91 strip = dp->strip; // string to strip
92 appnd = dp->appnd; // string to append
93 numconds = dp->numconds; // length of the condition
94 opts = dp->opts; // cross product flag
95 // then copy over all of the conditions
96 if (opts & aeLONGCOND) {
97 memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
98 c.l.conds2 = dp->c.l.conds2;
99 } else
100 memcpy(c.conds, dp->c.conds, MAXCONDLEN);
101 morphcode = dp->morphcode;
102 contclass = dp->contclass;
103 contclasslen = dp->contclasslen;
104 }
105
~PfxEntry()106 PfxEntry::~PfxEntry() {
107 aflag = 0;
108 pmyMgr = NULL;
109 if (opts & aeLONGCOND)
110 free(c.l.conds2);
111 if (morphcode && !(opts & aeALIASM))
112 free(morphcode);
113 if (contclass && !(opts & aeALIASF))
114 free(contclass);
115 }
116
117 // add prefix to this word assuming conditions hold
add(const char * word,size_t len)118 char* PfxEntry::add(const char* word, size_t len) {
119 if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
120 (len >= numconds) && test_condition(word) &&
121 (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) {
122 /* we have a match so add prefix */
123 std::string tword(appnd);
124 tword.append(word + strip.size());
125 return mystrdup(tword.c_str());
126 }
127 return NULL;
128 }
129
nextchar(char * p)130 inline char* PfxEntry::nextchar(char* p) {
131 if (p) {
132 p++;
133 if (opts & aeLONGCOND) {
134 // jump to the 2nd part of the condition
135 if (p == c.conds + MAXCONDLEN_1)
136 return c.l.conds2;
137 // end of the MAXCONDLEN length condition
138 } else if (p == c.conds + MAXCONDLEN)
139 return NULL;
140 return *p ? p : NULL;
141 }
142 return NULL;
143 }
144
test_condition(const char * st)145 inline int PfxEntry::test_condition(const char* st) {
146 const char* pos = NULL; // group with pos input position
147 bool neg = false; // complementer
148 bool ingroup = false; // character in the group
149 if (numconds == 0)
150 return 1;
151 char* p = c.conds;
152 while (1) {
153 switch (*p) {
154 case '\0':
155 return 1;
156 case '[': {
157 neg = false;
158 ingroup = false;
159 p = nextchar(p);
160 pos = st;
161 break;
162 }
163 case '^': {
164 p = nextchar(p);
165 neg = true;
166 break;
167 }
168 case ']': {
169 if ((neg && ingroup) || (!neg && !ingroup))
170 return 0;
171 pos = NULL;
172 p = nextchar(p);
173 // skip the next character
174 if (!ingroup && *st)
175 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
176 ;
177 if (*st == '\0' && p)
178 return 0; // word <= condition
179 break;
180 }
181 case '.':
182 if (!pos) { // dots are not metacharacters in groups: [.]
183 p = nextchar(p);
184 // skip the next character
185 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
186 ;
187 if (*st == '\0' && p)
188 return 0; // word <= condition
189 break;
190 }
191 /* FALLTHROUGH */
192 default: {
193 if (*st == *p) {
194 st++;
195 p = nextchar(p);
196 if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
197 while (p && (*p & 0xc0) == 0x80) { // character
198 if (*p != *st) {
199 if (!pos)
200 return 0;
201 st = pos;
202 break;
203 }
204 p = nextchar(p);
205 st++;
206 }
207 if (pos && st != pos) {
208 ingroup = true;
209 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
210 }
211 }
212 } else if (pos) {
213 ingroup = true;
214 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
215 }
216 }
217 } else if (pos) { // group
218 p = nextchar(p);
219 } else
220 return 0;
221 }
222 }
223 if (!p)
224 return 1;
225 }
226 }
227
228 // check if this prefix entry matches
checkword(const char * word,int len,char in_compound,const FLAG needflag)229 struct hentry* PfxEntry::checkword(const char* word,
230 int len,
231 char in_compound,
232 const FLAG needflag) {
233 struct hentry* he; // hash entry of root word or NULL
234
235 // on entry prefix is 0 length or already matches the beginning of the word.
236 // So if the remaining root word has positive length
237 // and if there are enough chars in root word and added back strip chars
238 // to meet the number of characters conditions, then test it
239
240 int tmpl = len - appnd.size(); // length of tmpword
241
242 if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
243 // generate new root word by removing prefix and adding
244 // back any characters that would have been stripped
245
246 std::string tmpword(strip);
247 tmpword.append(word + appnd.size());
248
249 // now make sure all of the conditions on characters
250 // are met. Please see the appendix at the end of
251 // this file for more info on exactly what is being
252 // tested
253
254 // if all conditions are met then check if resulting
255 // root word in the dictionary
256
257 if (test_condition(tmpword.c_str())) {
258 tmpl += strip.size();
259 if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
260 do {
261 if (TESTAFF(he->astr, aflag, he->alen) &&
262 // forbid single prefixes with needaffix flag
263 !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
264 // needflag
265 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
266 (contclass && TESTAFF(contclass, needflag, contclasslen))))
267 return he;
268 he = he->next_homonym; // check homonyms
269 } while (he);
270 }
271
272 // prefix matched but no root word was found
273 // if aeXPRODUCT is allowed, try again but now
274 // ross checked combined with a suffix
275
276 // if ((opts & aeXPRODUCT) && in_compound) {
277 if ((opts & aeXPRODUCT)) {
278 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this,
279 NULL, 0, NULL, FLAG_NULL, needflag,
280 in_compound);
281 if (he)
282 return he;
283 }
284 }
285 }
286 return NULL;
287 }
288
289 // check if this prefix entry matches
check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)290 struct hentry* PfxEntry::check_twosfx(const char* word,
291 int len,
292 char in_compound,
293 const FLAG needflag) {
294 struct hentry* he; // hash entry of root word or NULL
295
296 // on entry prefix is 0 length or already matches the beginning of the word.
297 // So if the remaining root word has positive length
298 // and if there are enough chars in root word and added back strip chars
299 // to meet the number of characters conditions, then test it
300
301 int tmpl = len - appnd.size(); // length of tmpword
302
303 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
304 (tmpl + strip.size() >= numconds)) {
305 // generate new root word by removing prefix and adding
306 // back any characters that would have been stripped
307
308 std::string tmpword(strip);
309 tmpword.append(word + appnd.size());
310
311 // now make sure all of the conditions on characters
312 // are met. Please see the appendix at the end of
313 // this file for more info on exactly what is being
314 // tested
315
316 // if all conditions are met then check if resulting
317 // root word in the dictionary
318
319 if (test_condition(tmpword.c_str())) {
320 tmpl += strip.size();
321
322 // prefix matched but no root word was found
323 // if aeXPRODUCT is allowed, try again but now
324 // cross checked combined with a suffix
325
326 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
327 he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this,
328 needflag);
329 if (he)
330 return he;
331 }
332 }
333 }
334 return NULL;
335 }
336
337 // check if this prefix entry matches
check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)338 char* PfxEntry::check_twosfx_morph(const char* word,
339 int len,
340 char in_compound,
341 const FLAG needflag) {
342 // on entry prefix is 0 length or already matches the beginning of the word.
343 // So if the remaining root word has positive length
344 // and if there are enough chars in root word and added back strip chars
345 // to meet the number of characters conditions, then test it
346
347 int tmpl = len - appnd.size(); // length of tmpword
348
349 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
350 (tmpl + strip.size() >= numconds)) {
351 // generate new root word by removing prefix and adding
352 // back any characters that would have been stripped
353
354 std::string tmpword(strip);
355 tmpword.append(word + appnd.size());
356
357 // now make sure all of the conditions on characters
358 // are met. Please see the appendix at the end of
359 // this file for more info on exactly what is being
360 // tested
361
362 // if all conditions are met then check if resulting
363 // root word in the dictionary
364
365 if (test_condition(tmpword.c_str())) {
366 tmpl += strip.size();
367
368 // prefix matched but no root word was found
369 // if aeXPRODUCT is allowed, try again but now
370 // ross checked combined with a suffix
371
372 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
373 return pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl,
374 aeXPRODUCT,
375 this, needflag);
376 }
377 }
378 }
379 return NULL;
380 }
381
382 // check if this prefix entry matches
check_morph(const char * word,int len,char in_compound,const FLAG needflag)383 char* PfxEntry::check_morph(const char* word,
384 int len,
385 char in_compound,
386 const FLAG needflag) {
387 struct hentry* he; // hash entry of root word or NULL
388 char* st;
389
390 // on entry prefix is 0 length or already matches the beginning of the word.
391 // So if the remaining root word has positive length
392 // and if there are enough chars in root word and added back strip chars
393 // to meet the number of characters conditions, then test it
394
395 int tmpl = len - appnd.size(); // length of tmpword
396
397 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
398 (tmpl + strip.size() >= numconds)) {
399 // generate new root word by removing prefix and adding
400 // back any characters that would have been stripped
401
402 std::string tmpword(strip);
403 tmpword.append(word + appnd.size());
404
405 // now make sure all of the conditions on characters
406 // are met. Please see the appendix at the end of
407 // this file for more info on exactly what is being
408 // tested
409
410 // if all conditions are met then check if resulting
411 // root word in the dictionary
412
413 if (test_condition(tmpword.c_str())) {
414 std::string result;
415
416 tmpl += strip.size();
417 if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
418 do {
419 if (TESTAFF(he->astr, aflag, he->alen) &&
420 // forbid single prefixes with needaffix flag
421 !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
422 // needflag
423 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
424 (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
425 if (morphcode) {
426 result.append(" ");
427 result.append(morphcode);
428 } else
429 result.append(getKey());
430 if (!HENTRY_FIND(he, MORPH_STEM)) {
431 result.append(" ");
432 result.append(MORPH_STEM);
433 result.append(HENTRY_WORD(he));
434 }
435 // store the pointer of the hash entry
436 if (HENTRY_DATA(he)) {
437 result.append(" ");
438 result.append(HENTRY_DATA2(he));
439 } else {
440 // return with debug information
441 char* flag = pmyMgr->encode_flag(getFlag());
442 result.append(" ");
443 result.append(MORPH_FLAG);
444 result.append(flag);
445 free(flag);
446 }
447 result.append("\n");
448 }
449 he = he->next_homonym;
450 } while (he);
451 }
452
453 // prefix matched but no root word was found
454 // if aeXPRODUCT is allowed, try again but now
455 // ross checked combined with a suffix
456
457 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
458 st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this,
459 FLAG_NULL, needflag);
460 if (st) {
461 result.append(st);
462 free(st);
463 }
464 }
465
466 if (!result.empty())
467 return mystrdup(result.c_str());
468 }
469 }
470
471 return NULL;
472 }
473
SfxEntry(AffixMgr * pmgr,affentry * dp)474 SfxEntry::SfxEntry(AffixMgr* pmgr, affentry* dp)
475 : pmyMgr(pmgr) // register affix manager
476 ,
477 next(NULL),
478 nexteq(NULL),
479 nextne(NULL),
480 flgnxt(NULL),
481 l_morph(NULL),
482 r_morph(NULL),
483 eq_morph(NULL) {
484 // set up its initial values
485 aflag = dp->aflag; // char flag
486 strip = dp->strip; // string to strip
487 appnd = dp->appnd; // string to append
488 numconds = dp->numconds; // length of the condition
489 opts = dp->opts; // cross product flag
490
491 // then copy over all of the conditions
492 if (opts & aeLONGCOND) {
493 memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
494 c.l.conds2 = dp->c.l.conds2;
495 } else
496 memcpy(c.conds, dp->c.conds, MAXCONDLEN);
497 rappnd = appnd;
498 reverseword(rappnd);
499 morphcode = dp->morphcode;
500 contclass = dp->contclass;
501 contclasslen = dp->contclasslen;
502 }
503
~SfxEntry()504 SfxEntry::~SfxEntry() {
505 aflag = 0;
506 pmyMgr = NULL;
507 if (opts & aeLONGCOND)
508 free(c.l.conds2);
509 if (morphcode && !(opts & aeALIASM))
510 free(morphcode);
511 if (contclass && !(opts & aeALIASF))
512 free(contclass);
513 }
514
515 // add suffix to this word assuming conditions hold
add(const char * word,size_t len)516 char* SfxEntry::add(const char* word, size_t len) {
517 /* make sure all conditions match */
518 if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
519 (len >= numconds) && test_condition(word + len, word) &&
520 (!strip.size() ||
521 (strcmp(word + len - strip.size(), strip.c_str()) == 0))) {
522 std::string tword(word);
523 /* we have a match so add suffix */
524 tword.replace(len - strip.size(), std::string::npos, appnd);
525 return mystrdup(tword.c_str());
526 }
527 return NULL;
528 }
529
nextchar(char * p)530 inline char* SfxEntry::nextchar(char* p) {
531 if (p) {
532 p++;
533 if (opts & aeLONGCOND) {
534 // jump to the 2nd part of the condition
535 if (p == c.l.conds1 + MAXCONDLEN_1)
536 return c.l.conds2;
537 // end of the MAXCONDLEN length condition
538 } else if (p == c.conds + MAXCONDLEN)
539 return NULL;
540 return *p ? p : NULL;
541 }
542 return NULL;
543 }
544
test_condition(const char * st,const char * beg)545 inline int SfxEntry::test_condition(const char* st, const char* beg) {
546 const char* pos = NULL; // group with pos input position
547 bool neg = false; // complementer
548 bool ingroup = false; // character in the group
549 if (numconds == 0)
550 return 1;
551 char* p = c.conds;
552 st--;
553 int i = 1;
554 while (1) {
555 switch (*p) {
556 case '\0':
557 return 1;
558 case '[':
559 p = nextchar(p);
560 pos = st;
561 break;
562 case '^':
563 p = nextchar(p);
564 neg = true;
565 break;
566 case ']':
567 if (!neg && !ingroup)
568 return 0;
569 i++;
570 // skip the next character
571 if (!ingroup) {
572 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--)
573 ;
574 st--;
575 }
576 pos = NULL;
577 neg = false;
578 ingroup = false;
579 p = nextchar(p);
580 if (st < beg && p)
581 return 0; // word <= condition
582 break;
583 case '.':
584 if (!pos) {
585 // dots are not metacharacters in groups: [.]
586 p = nextchar(p);
587 // skip the next character
588 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80;
589 st--)
590 ;
591 if (st < beg) { // word <= condition
592 if (p)
593 return 0;
594 else
595 return 1;
596 }
597 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
598 st--;
599 if (st < beg) { // word <= condition
600 if (p)
601 return 0;
602 else
603 return 1;
604 }
605 }
606 break;
607 }
608 /* FALLTHROUGH */
609 default: {
610 if (*st == *p) {
611 p = nextchar(p);
612 if ((opts & aeUTF8) && (*st & 0x80)) {
613 st--;
614 while (p && (st >= beg)) {
615 if (*p != *st) {
616 if (!pos)
617 return 0;
618 st = pos;
619 break;
620 }
621 // first byte of the UTF-8 multibyte character
622 if ((*p & 0xc0) != 0x80)
623 break;
624 p = nextchar(p);
625 st--;
626 }
627 if (pos && st != pos) {
628 if (neg)
629 return 0;
630 else if (i == numconds)
631 return 1;
632 ingroup = true;
633 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
634 }
635 st--;
636 }
637 if (p && *p != ']')
638 p = nextchar(p);
639 } else if (pos) {
640 if (neg)
641 return 0;
642 else if (i == numconds)
643 return 1;
644 ingroup = true;
645 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
646 }
647 // if (p && *p != ']') p = nextchar(p);
648 st--;
649 }
650 if (!pos) {
651 i++;
652 st--;
653 }
654 if (st < beg && p && *p != ']')
655 return 0; // word <= condition
656 } else if (pos) { // group
657 p = nextchar(p);
658 } else
659 return 0;
660 }
661 }
662 if (!p)
663 return 1;
664 }
665 }
666
667 // see if this suffix is present in the word
checkword(const char * word,int len,int optflags,PfxEntry * ppfx,char ** wlst,int maxSug,int * ns,const FLAG cclass,const FLAG needflag,const FLAG badflag)668 struct hentry* SfxEntry::checkword(const char* word,
669 int len,
670 int optflags,
671 PfxEntry* ppfx,
672 char** wlst,
673 int maxSug,
674 int* ns,
675 const FLAG cclass,
676 const FLAG needflag,
677 const FLAG badflag) {
678 struct hentry* he; // hash entry pointer
679 PfxEntry* ep = ppfx;
680
681 // if this suffix is being cross checked with a prefix
682 // but it does not support cross products skip it
683
684 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
685 return NULL;
686
687 // upon entry suffix is 0 length or already matches the end of the word.
688 // So if the remaining root word has positive length
689 // and if there are enough chars in root word and added back strip chars
690 // to meet the number of characters conditions, then test it
691
692 int tmpl = len - appnd.size(); // length of tmpword
693 // the second condition is not enough for UTF-8 strings
694 // it checked in test_condition()
695
696 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
697 (tmpl + strip.size() >= numconds)) {
698 // generate new root word by removing suffix and adding
699 // back any characters that would have been stripped or
700 // or null terminating the shorter string
701
702 std::string tmpstring(word, tmpl);
703 if (strip.size()) {
704 tmpstring.append(strip);
705 }
706
707 const char* tmpword = tmpstring.c_str();
708 const char* endword = tmpword + tmpstring.size();
709
710 // now make sure all of the conditions on characters
711 // are met. Please see the appendix at the end of
712 // this file for more info on exactly what is being
713 // tested
714
715 // if all conditions are met then check if resulting
716 // root word in the dictionary
717
718 if (test_condition(endword, tmpword)) {
719 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
720 fprintf(stdout, "%s %s %c\n", word, tmpword, aflag);
721 #endif
722 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
723 do {
724 // check conditional suffix (enabled by prefix)
725 if ((TESTAFF(he->astr, aflag, he->alen) ||
726 (ep && ep->getCont() &&
727 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
728 (((optflags & aeXPRODUCT) == 0) ||
729 (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
730 // enabled by prefix
731 ((contclass) &&
732 (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))) &&
733 // handle cont. class
734 ((!cclass) ||
735 ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
736 // check only in compound homonyms (bad flags)
737 (!badflag || !TESTAFF(he->astr, badflag, he->alen)) &&
738 // handle required flag
739 ((!needflag) ||
740 (TESTAFF(he->astr, needflag, he->alen) ||
741 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
742 return he;
743 he = he->next_homonym; // check homonyms
744 } while (he);
745
746 // obsolote stemming code (used only by the
747 // experimental SuffixMgr:suggest_pos_stems)
748 // store resulting root in wlst
749 } else if (wlst && (*ns < maxSug)) {
750 int cwrd = 1;
751 for (int k = 0; k < *ns; k++)
752 if (strcmp(tmpword, wlst[k]) == 0) {
753 cwrd = 0;
754 break;
755 }
756 if (cwrd) {
757 wlst[*ns] = mystrdup(tmpword);
758 if (wlst[*ns] == NULL) {
759 for (int j = 0; j < *ns; j++)
760 free(wlst[j]);
761 *ns = -1;
762 return NULL;
763 }
764 (*ns)++;
765 }
766 }
767 }
768 }
769 return NULL;
770 }
771
772 // see if two-level suffix is present in the word
check_twosfx(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)773 struct hentry* SfxEntry::check_twosfx(const char* word,
774 int len,
775 int optflags,
776 PfxEntry* ppfx,
777 const FLAG needflag) {
778 struct hentry* he; // hash entry pointer
779 PfxEntry* ep = ppfx;
780
781 // if this suffix is being cross checked with a prefix
782 // but it does not support cross products skip it
783
784 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
785 return NULL;
786
787 // upon entry suffix is 0 length or already matches the end of the word.
788 // So if the remaining root word has positive length
789 // and if there are enough chars in root word and added back strip chars
790 // to meet the number of characters conditions, then test it
791
792 int tmpl = len - appnd.size(); // length of tmpword
793
794 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
795 (tmpl + strip.size() >= numconds)) {
796 // generate new root word by removing suffix and adding
797 // back any characters that would have been stripped or
798 // or null terminating the shorter string
799
800 std::string tmpword(word);
801 tmpword.resize(tmpl);
802 tmpword.append(strip);
803 tmpl += strip.size();
804
805 const char* beg = tmpword.c_str();
806 const char* end = beg + tmpl;
807
808 // now make sure all of the conditions on characters
809 // are met. Please see the appendix at the end of
810 // this file for more info on exactly what is being
811 // tested
812
813 // if all conditions are met then recall suffix_check
814
815 if (test_condition(end, beg)) {
816 if (ppfx) {
817 // handle conditional suffix
818 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
819 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, NULL, 0, NULL,
820 (FLAG)aflag, needflag);
821 else
822 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx, NULL, 0,
823 NULL, (FLAG)aflag, needflag);
824 } else {
825 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL, NULL, 0, NULL,
826 (FLAG)aflag, needflag);
827 }
828 if (he)
829 return he;
830 }
831 }
832 return NULL;
833 }
834
835 // see if two-level suffix is present in the word
check_twosfx_morph(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)836 char* SfxEntry::check_twosfx_morph(const char* word,
837 int len,
838 int optflags,
839 PfxEntry* ppfx,
840 const FLAG needflag) {
841 PfxEntry* ep = ppfx;
842 char* st;
843
844 char result[MAXLNLEN];
845
846 *result = '\0';
847
848 // if this suffix is being cross checked with a prefix
849 // but it does not support cross products skip it
850
851 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
852 return NULL;
853
854 // upon entry suffix is 0 length or already matches the end of the word.
855 // So if the remaining root word has positive length
856 // and if there are enough chars in root word and added back strip chars
857 // to meet the number of characters conditions, then test it
858
859 int tmpl = len - appnd.size(); // length of tmpword
860
861 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
862 (tmpl + strip.size() >= numconds)) {
863 // generate new root word by removing suffix and adding
864 // back any characters that would have been stripped or
865 // or null terminating the shorter string
866
867 std::string tmpword(word);
868 tmpword.resize(tmpl);
869 tmpword.append(strip);
870 tmpl += strip.size();
871
872 const char* beg = tmpword.c_str();
873 const char* end = beg + tmpl;
874
875 // now make sure all of the conditions on characters
876 // are met. Please see the appendix at the end of
877 // this file for more info on exactly what is being
878 // tested
879
880 // if all conditions are met then recall suffix_check
881
882 if (test_condition(end, beg)) {
883 if (ppfx) {
884 // handle conditional suffix
885 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
886 st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag,
887 needflag);
888 if (st) {
889 if (ppfx->getMorph()) {
890 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
891 mystrcat(result, " ", MAXLNLEN);
892 }
893 mystrcat(result, st, MAXLNLEN);
894 free(st);
895 mychomp(result);
896 }
897 } else {
898 st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag,
899 needflag);
900 if (st) {
901 mystrcat(result, st, MAXLNLEN);
902 free(st);
903 mychomp(result);
904 }
905 }
906 } else {
907 st =
908 pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag);
909 if (st) {
910 mystrcat(result, st, MAXLNLEN);
911 free(st);
912 mychomp(result);
913 }
914 }
915 if (*result)
916 return mystrdup(result);
917 }
918 }
919 return NULL;
920 }
921
922 // get next homonym with same affix
get_next_homonym(struct hentry * he,int optflags,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag)923 struct hentry* SfxEntry::get_next_homonym(struct hentry* he,
924 int optflags,
925 PfxEntry* ppfx,
926 const FLAG cclass,
927 const FLAG needflag) {
928 PfxEntry* ep = ppfx;
929 FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
930
931 while (he->next_homonym) {
932 he = he->next_homonym;
933 if ((TESTAFF(he->astr, aflag, he->alen) ||
934 (ep && ep->getCont() &&
935 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
936 ((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) ||
937 // handle conditional suffix
938 ((contclass) && TESTAFF(contclass, eFlag, contclasslen))) &&
939 // handle cont. class
940 ((!cclass) ||
941 ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
942 // handle required flag
943 ((!needflag) ||
944 (TESTAFF(he->astr, needflag, he->alen) ||
945 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
946 return he;
947 }
948 return NULL;
949 }
950
951 #if 0
952
953 Appendix: Understanding Affix Code
954
955
956 An affix is either a prefix or a suffix attached to root words to make
957 other words.
958
959 Basically a Prefix or a Suffix is set of AffEntry objects
960 which store information about the prefix or suffix along
961 with supporting routines to check if a word has a particular
962 prefix or suffix or a combination.
963
964 The structure affentry is defined as follows:
965
966 struct affentry
967 {
968 unsigned short aflag; // ID used to represent the affix
969 std::string strip; // string to strip before adding affix
970 std::string appnd; // the affix string to add
971 char numconds; // the number of conditions that must be met
972 char opts; // flag: aeXPRODUCT- combine both prefix and suffix
973 char conds[SETSIZE]; // array which encodes the conditions to be met
974 };
975
976
977 Here is a suffix borrowed from the en_US.aff file. This file
978 is whitespace delimited.
979
980 SFX D Y 4
981 SFX D 0 e d
982 SFX D y ied [^aeiou]y
983 SFX D 0 ed [^ey]
984 SFX D 0 ed [aeiou]y
985
986 This information can be interpreted as follows:
987
The first line has 4 fields
989
990 Field
991 -----
992 1 SFX - indicates this is a suffix
993 2 D - is the name of the character flag which represents this suffix
994 3 Y - indicates it can be combined with prefixes (cross product)
995 4 4 - indicates that sequence of 4 affentry structures are needed to
996 properly store the affix information
997
998 The remaining lines describe the unique information for the 4 SfxEntry
999 objects that make up this affix. Each line can be interpreted
1000 as follows: (note fields 1 and 2 are as a check against line 1 info)
1001
1002 Field
1003 -----
1004 1 SFX - indicates this is a suffix
1005 2 D - is the name of the character flag for this affix
1006 3 y - the string of chars to strip off before adding affix
1007 (a 0 here indicates the NULL string)
1008 4 ied - the string of affix characters to add
1009 5 [^aeiou]y - the conditions which must be met before the affix
1010 can be applied
1011
1012 Field 5 is interesting. Since this is a suffix, field 5 tells us that
1013 there are 2 conditions that must be met. The first condition is that
1014 the next to the last character in the word must *NOT* be any of the
1015 following "a", "e", "i", "o" or "u". The second condition is that
1016 the last character of the word must end in "y".
1017
So how can we encode this information concisely and be able to
test for both conditions in a fast manner? The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et al.
(now available under a normal BSD license).
1022
1023 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
1024 using a character (cast to an unsigned char) of a string, we have 8 bits
1025 of information we can store about that character. Specifically we
1026 could use each bit to say if that character is allowed in any of the
1027 last (or first for prefixes) 8 characters of the word.
1028
Basically, each character at one end of the word (up to the number
of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
specific character position in the word.
1033
1034 For prefixes, it does this by setting bit 0 if that char is valid
1035 in the first position, bit 1 if valid in the second position, and so on.
1036
If a bit is not set, then that char is not valid for that position in the
word.
1039
1040 If working with suffixes bit 0 is used for the character closest
1041 to the front, bit 1 for the next character towards the end, ...,
1042 with bit numconds-1 representing the last char at the end of the string.
1043
Note: since entries in the conds[] are 8 bits, only 8 conditions
(read that only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).
1047
So to make this clearer, let's encode the conds array values for the
first two affentries for the suffix D described earlier.
1050
1051
1052 For the first affentry:
1053 numconds = 1 (only examine the last character)
1054
1055 conds['e'] = (1 << 0) (the word must end in an E)
1056 all others are all 0
1057
1058 For the second affentry:
1059 numconds = 2 (only examine the last two characters)
1060
1061 conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
1062 where X is all characters *but* a, e, i, o, or u
1063
1064
1065 conds['y'] = (1 << 1) (the last char must be a y)
1066 all other bits for all other entries in the conds array are zero
1067
1068 #endif
1069