1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2017 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37 /*
38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 * And Contributors. All rights reserved.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 *
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 *
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 *
52 * 3. All modifications to the source code must be clearly marked as
53 * such. Binary redistributions based on modified source code
54 * must be clearly marked as modified versions in the documentation
55 * and/or other materials provided with the distribution.
56 *
57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71 #include <stdlib.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <ctype.h>
75
76 #include "affentry.hxx"
77 #include "csutil.hxx"
78
~AffEntry()79 AffEntry::~AffEntry() {
80 if (opts & aeLONGCOND)
81 free(c.l.conds2);
82 if (morphcode && !(opts & aeALIASM))
83 free(morphcode);
84 if (contclass && !(opts & aeALIASF))
85 free(contclass);
86 }
87
PfxEntry(AffixMgr * pmgr)88 PfxEntry::PfxEntry(AffixMgr* pmgr)
89 // register affix manager
90 : pmyMgr(pmgr),
91 next(NULL),
92 nexteq(NULL),
93 nextne(NULL),
94 flgnxt(NULL) {
95 }
96
97 // add prefix to this word assuming conditions hold
add(const char * word,size_t len)98 std::string PfxEntry::add(const char* word, size_t len) {
99 std::string result;
100 if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
101 (len >= numconds) && test_condition(word) &&
102 (!strip.size() || (strncmp(word, strip.c_str(), strip.size()) == 0))) {
103 /* we have a match so add prefix */
104 result.assign(appnd);
105 result.append(word + strip.size());
106 }
107 return result;
108 }
109
nextchar(char * p)110 inline char* PfxEntry::nextchar(char* p) {
111 if (p) {
112 p++;
113 if (opts & aeLONGCOND) {
114 // jump to the 2nd part of the condition
115 if (p == c.conds + MAXCONDLEN_1)
116 return c.l.conds2;
117 // end of the MAXCONDLEN length condition
118 } else if (p == c.conds + MAXCONDLEN)
119 return NULL;
120 return *p ? p : NULL;
121 }
122 return NULL;
123 }
124
test_condition(const char * st)125 inline int PfxEntry::test_condition(const char* st) {
126 const char* pos = NULL; // group with pos input position
127 bool neg = false; // complementer
128 bool ingroup = false; // character in the group
129 if (numconds == 0)
130 return 1;
131 char* p = c.conds;
132 while (1) {
133 switch (*p) {
134 case '\0':
135 return 1;
136 case '[': {
137 neg = false;
138 ingroup = false;
139 p = nextchar(p);
140 pos = st;
141 break;
142 }
143 case '^': {
144 p = nextchar(p);
145 neg = true;
146 break;
147 }
148 case ']': {
149 if ((neg && ingroup) || (!neg && !ingroup))
150 return 0;
151 pos = NULL;
152 p = nextchar(p);
153 // skip the next character
154 if (!ingroup && *st)
155 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
156 ;
157 if (*st == '\0' && p)
158 return 0; // word <= condition
159 break;
160 }
161 case '.':
162 if (!pos) { // dots are not metacharacters in groups: [.]
163 p = nextchar(p);
164 // skip the next character
165 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
166 ;
167 if (*st == '\0' && p)
168 return 0; // word <= condition
169 break;
170 }
171 /* FALLTHROUGH */
172 default: {
173 if (*st == *p) {
174 st++;
175 p = nextchar(p);
176 if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
177 while (p && (*p & 0xc0) == 0x80) { // character
178 if (*p != *st) {
179 if (!pos)
180 return 0;
181 st = pos;
182 break;
183 }
184 p = nextchar(p);
185 st++;
186 }
187 if (pos && st != pos) {
188 ingroup = true;
189 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
190 }
191 }
192 } else if (pos) {
193 ingroup = true;
194 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
195 }
196 }
197 } else if (pos) { // group
198 p = nextchar(p);
199 } else
200 return 0;
201 }
202 }
203 if (!p)
204 return 1;
205 }
206 }
207
208 // check if this prefix entry matches
checkword(const char * word,int len,char in_compound,const FLAG needflag)209 struct hentry* PfxEntry::checkword(const char* word,
210 int len,
211 char in_compound,
212 const FLAG needflag) {
213 struct hentry* he; // hash entry of root word or NULL
214
215 // on entry prefix is 0 length or already matches the beginning of the word.
216 // So if the remaining root word has positive length
217 // and if there are enough chars in root word and added back strip chars
218 // to meet the number of characters conditions, then test it
219
220 int tmpl = len - appnd.size(); // length of tmpword
221
222 if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
223 // generate new root word by removing prefix and adding
224 // back any characters that would have been stripped
225
226 std::string tmpword(strip);
227 tmpword.append(word + appnd.size());
228
229 // now make sure all of the conditions on characters
230 // are met. Please see the appendix at the end of
231 // this file for more info on exactly what is being
232 // tested
233
234 // if all conditions are met then check if resulting
235 // root word in the dictionary
236
237 if (test_condition(tmpword.c_str())) {
238 tmpl += strip.size();
239 if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
240 do {
241 if (TESTAFF(he->astr, aflag, he->alen) &&
242 // forbid single prefixes with needaffix flag
243 !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
244 // needflag
245 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
246 (contclass && TESTAFF(contclass, needflag, contclasslen))))
247 return he;
248 he = he->next_homonym; // check homonyms
249 } while (he);
250 }
251
252 // prefix matched but no root word was found
253 // if aeXPRODUCT is allowed, try again but now
254 // ross checked combined with a suffix
255
256 // if ((opts & aeXPRODUCT) && in_compound) {
257 if ((opts & aeXPRODUCT)) {
258 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, aeXPRODUCT, this,
259 FLAG_NULL, needflag, in_compound);
260 if (he)
261 return he;
262 }
263 }
264 }
265 return NULL;
266 }
267
268 // check if this prefix entry matches
check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)269 struct hentry* PfxEntry::check_twosfx(const char* word,
270 int len,
271 char in_compound,
272 const FLAG needflag) {
273 // on entry prefix is 0 length or already matches the beginning of the word.
274 // So if the remaining root word has positive length
275 // and if there are enough chars in root word and added back strip chars
276 // to meet the number of characters conditions, then test it
277
278 int tmpl = len - appnd.size(); // length of tmpword
279
280 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
281 (tmpl + strip.size() >= numconds)) {
282 // generate new root word by removing prefix and adding
283 // back any characters that would have been stripped
284
285 std::string tmpword(strip);
286 tmpword.append(word + appnd.size());
287
288 // now make sure all of the conditions on characters
289 // are met. Please see the appendix at the end of
290 // this file for more info on exactly what is being
291 // tested
292
293 // if all conditions are met then check if resulting
294 // root word in the dictionary
295
296 if (test_condition(tmpword.c_str())) {
297 tmpl += strip.size();
298
299 // prefix matched but no root word was found
300 // if aeXPRODUCT is allowed, try again but now
301 // cross checked combined with a suffix
302
303 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
304 // hash entry of root word or NULL
305 struct hentry* he = pmyMgr->suffix_check_twosfx(tmpword.c_str(), tmpl, aeXPRODUCT, this,
306 needflag);
307 if (he)
308 return he;
309 }
310 }
311 }
312 return NULL;
313 }
314
315 // check if this prefix entry matches
check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)316 std::string PfxEntry::check_twosfx_morph(const char* word,
317 int len,
318 char in_compound,
319 const FLAG needflag) {
320 std::string result;
321 // on entry prefix is 0 length or already matches the beginning of the word.
322 // So if the remaining root word has positive length
323 // and if there are enough chars in root word and added back strip chars
324 // to meet the number of characters conditions, then test it
325 int tmpl = len - appnd.size(); // length of tmpword
326
327 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
328 (tmpl + strip.size() >= numconds)) {
329 // generate new root word by removing prefix and adding
330 // back any characters that would have been stripped
331
332 std::string tmpword(strip);
333 tmpword.append(word + appnd.size());
334
335 // now make sure all of the conditions on characters
336 // are met. Please see the appendix at the end of
337 // this file for more info on exactly what is being
338 // tested
339
340 // if all conditions are met then check if resulting
341 // root word in the dictionary
342
343 if (test_condition(tmpword.c_str())) {
344 tmpl += strip.size();
345
346 // prefix matched but no root word was found
347 // if aeXPRODUCT is allowed, try again but now
348 // ross checked combined with a suffix
349
350 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
351 result = pmyMgr->suffix_check_twosfx_morph(tmpword.c_str(), tmpl,
352 aeXPRODUCT,
353 this, needflag);
354 }
355 }
356 }
357 return result;
358 }
359
360 // check if this prefix entry matches
check_morph(const char * word,int len,char in_compound,const FLAG needflag)361 std::string PfxEntry::check_morph(const char* word,
362 int len,
363 char in_compound,
364 const FLAG needflag) {
365 std::string result;
366
367 // on entry prefix is 0 length or already matches the beginning of the word.
368 // So if the remaining root word has positive length
369 // and if there are enough chars in root word and added back strip chars
370 // to meet the number of characters conditions, then test it
371
372 int tmpl = len - appnd.size(); // length of tmpword
373
374 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
375 (tmpl + strip.size() >= numconds)) {
376 // generate new root word by removing prefix and adding
377 // back any characters that would have been stripped
378
379 std::string tmpword(strip);
380 tmpword.append(word + appnd.size());
381
382 // now make sure all of the conditions on characters
383 // are met. Please see the appendix at the end of
384 // this file for more info on exactly what is being
385 // tested
386
387 // if all conditions are met then check if resulting
388 // root word in the dictionary
389
390 if (test_condition(tmpword.c_str())) {
391 tmpl += strip.size();
392 struct hentry* he; // hash entry of root word or NULL
393 if ((he = pmyMgr->lookup(tmpword.c_str())) != NULL) {
394 do {
395 if (TESTAFF(he->astr, aflag, he->alen) &&
396 // forbid single prefixes with needaffix flag
397 !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
398 // needflag
399 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
400 (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
401 if (morphcode) {
402 result.push_back(MSEP_FLD);
403 result.append(morphcode);
404 } else
405 result.append(getKey());
406 if (!HENTRY_FIND(he, MORPH_STEM)) {
407 result.push_back(MSEP_FLD);
408 result.append(MORPH_STEM);
409 result.append(HENTRY_WORD(he));
410 }
411 // store the pointer of the hash entry
412 if (HENTRY_DATA(he)) {
413 result.push_back(MSEP_FLD);
414 result.append(HENTRY_DATA2(he));
415 } else {
416 // return with debug information
417 char* flag = pmyMgr->encode_flag(getFlag());
418 result.push_back(MSEP_FLD);
419 result.append(MORPH_FLAG);
420 result.append(flag);
421 free(flag);
422 }
423 result.push_back(MSEP_REC);
424 }
425 he = he->next_homonym;
426 } while (he);
427 }
428
429 // prefix matched but no root word was found
430 // if aeXPRODUCT is allowed, try again but now
431 // ross checked combined with a suffix
432
433 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
434 std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, aeXPRODUCT, this,
435 FLAG_NULL, needflag);
436 if (!st.empty()) {
437 result.append(st);
438 }
439 }
440 }
441 }
442
443 return result;
444 }
445
SfxEntry(AffixMgr * pmgr)446 SfxEntry::SfxEntry(AffixMgr* pmgr)
447 : pmyMgr(pmgr) // register affix manager
448 ,
449 next(NULL),
450 nexteq(NULL),
451 nextne(NULL),
452 flgnxt(NULL),
453 l_morph(NULL),
454 r_morph(NULL),
455 eq_morph(NULL) {
456 }
457
458 // add suffix to this word assuming conditions hold
add(const char * word,size_t len)459 std::string SfxEntry::add(const char* word, size_t len) {
460 std::string result;
461 /* make sure all conditions match */
462 if ((len > strip.size() || (len == 0 && pmyMgr->get_fullstrip())) &&
463 (len >= numconds) && test_condition(word + len, word) &&
464 (!strip.size() ||
465 (strcmp(word + len - strip.size(), strip.c_str()) == 0))) {
466 result.assign(word);
467 /* we have a match so add suffix */
468 result.replace(len - strip.size(), std::string::npos, appnd);
469 }
470 return result;
471 }
472
nextchar(char * p)473 inline char* SfxEntry::nextchar(char* p) {
474 if (p) {
475 p++;
476 if (opts & aeLONGCOND) {
477 // jump to the 2nd part of the condition
478 if (p == c.l.conds1 + MAXCONDLEN_1)
479 return c.l.conds2;
480 // end of the MAXCONDLEN length condition
481 } else if (p == c.conds + MAXCONDLEN)
482 return NULL;
483 return *p ? p : NULL;
484 }
485 return NULL;
486 }
487
test_condition(const char * st,const char * beg)488 inline int SfxEntry::test_condition(const char* st, const char* beg) {
489 const char* pos = NULL; // group with pos input position
490 bool neg = false; // complementer
491 bool ingroup = false; // character in the group
492 if (numconds == 0)
493 return 1;
494 char* p = c.conds;
495 st--;
496 int i = 1;
497 while (1) {
498 switch (*p) {
499 case '\0':
500 return 1;
501 case '[':
502 p = nextchar(p);
503 pos = st;
504 break;
505 case '^':
506 p = nextchar(p);
507 neg = true;
508 break;
509 case ']':
510 if (!neg && !ingroup)
511 return 0;
512 i++;
513 // skip the next character
514 if (!ingroup) {
515 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--)
516 ;
517 st--;
518 }
519 pos = NULL;
520 neg = false;
521 ingroup = false;
522 p = nextchar(p);
523 if (st < beg && p)
524 return 0; // word <= condition
525 break;
526 case '.':
527 if (!pos) {
528 // dots are not metacharacters in groups: [.]
529 p = nextchar(p);
530 // skip the next character
531 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80;
532 st--)
533 ;
534 if (st < beg) { // word <= condition
535 if (p)
536 return 0;
537 else
538 return 1;
539 }
540 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
541 st--;
542 if (st < beg) { // word <= condition
543 if (p)
544 return 0;
545 else
546 return 1;
547 }
548 }
549 break;
550 }
551 /* FALLTHROUGH */
552 default: {
553 if (*st == *p) {
554 p = nextchar(p);
555 if ((opts & aeUTF8) && (*st & 0x80)) {
556 st--;
557 while (p && (st >= beg)) {
558 if (*p != *st) {
559 if (!pos)
560 return 0;
561 st = pos;
562 break;
563 }
564 // first byte of the UTF-8 multibyte character
565 if ((*p & 0xc0) != 0x80)
566 break;
567 p = nextchar(p);
568 st--;
569 }
570 if (pos && st != pos) {
571 if (neg)
572 return 0;
573 else if (i == numconds)
574 return 1;
575 ingroup = true;
576 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
577 }
578 st--;
579 }
580 if (p && *p != ']')
581 p = nextchar(p);
582 } else if (pos) {
583 if (neg)
584 return 0;
585 else if (i == numconds)
586 return 1;
587 ingroup = true;
588 while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
589 }
590 // if (p && *p != ']') p = nextchar(p);
591 st--;
592 }
593 if (!pos) {
594 i++;
595 st--;
596 }
597 if (st < beg && p && *p != ']')
598 return 0; // word <= condition
599 } else if (pos) { // group
600 p = nextchar(p);
601 } else
602 return 0;
603 }
604 }
605 if (!p)
606 return 1;
607 }
608 }
609
610 // see if this suffix is present in the word
checkword(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,const FLAG badflag)611 struct hentry* SfxEntry::checkword(const char* word,
612 int len,
613 int optflags,
614 PfxEntry* ppfx,
615 const FLAG cclass,
616 const FLAG needflag,
617 const FLAG badflag) {
618 struct hentry* he; // hash entry pointer
619 PfxEntry* ep = ppfx;
620
621 // if this suffix is being cross checked with a prefix
622 // but it does not support cross products skip it
623
624 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
625 return NULL;
626
627 // upon entry suffix is 0 length or already matches the end of the word.
628 // So if the remaining root word has positive length
629 // and if there are enough chars in root word and added back strip chars
630 // to meet the number of characters conditions, then test it
631
632 int tmpl = len - appnd.size(); // length of tmpword
633 // the second condition is not enough for UTF-8 strings
634 // it checked in test_condition()
635
636 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
637 (tmpl + strip.size() >= numconds)) {
638 // generate new root word by removing suffix and adding
639 // back any characters that would have been stripped or
640 // or null terminating the shorter string
641
642 std::string tmpstring(word, tmpl);
643 if (strip.size()) {
644 tmpstring.append(strip);
645 }
646
647 const char* tmpword = tmpstring.c_str();
648 const char* endword = tmpword + tmpstring.size();
649
650 // now make sure all of the conditions on characters
651 // are met. Please see the appendix at the end of
652 // this file for more info on exactly what is being
653 // tested
654
655 // if all conditions are met then check if resulting
656 // root word in the dictionary
657
658 if (test_condition(endword, tmpword)) {
659 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
660 fprintf(stdout, "%s %s %c\n", word, tmpword, aflag);
661 #endif
662 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
663 do {
664 // check conditional suffix (enabled by prefix)
665 if ((TESTAFF(he->astr, aflag, he->alen) ||
666 (ep && ep->getCont() &&
667 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
668 (((optflags & aeXPRODUCT) == 0) ||
669 (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
670 // enabled by prefix
671 ((contclass) &&
672 (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))) &&
673 // handle cont. class
674 ((!cclass) ||
675 ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
676 // check only in compound homonyms (bad flags)
677 (!badflag || !TESTAFF(he->astr, badflag, he->alen)) &&
678 // handle required flag
679 ((!needflag) ||
680 (TESTAFF(he->astr, needflag, he->alen) ||
681 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
682 return he;
683 he = he->next_homonym; // check homonyms
684 } while (he);
685 }
686 }
687 }
688 return NULL;
689 }
690
691 // see if two-level suffix is present in the word
check_twosfx(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)692 struct hentry* SfxEntry::check_twosfx(const char* word,
693 int len,
694 int optflags,
695 PfxEntry* ppfx,
696 const FLAG needflag) {
697 PfxEntry* ep = ppfx;
698
699 // if this suffix is being cross checked with a prefix
700 // but it does not support cross products skip it
701
702 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
703 return NULL;
704
705 // upon entry suffix is 0 length or already matches the end of the word.
706 // So if the remaining root word has positive length
707 // and if there are enough chars in root word and added back strip chars
708 // to meet the number of characters conditions, then test it
709
710 int tmpl = len - appnd.size(); // length of tmpword
711
712 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
713 (tmpl + strip.size() >= numconds)) {
714 // generate new root word by removing suffix and adding
715 // back any characters that would have been stripped or
716 // or null terminating the shorter string
717
718 std::string tmpword(word);
719 tmpword.resize(tmpl);
720 tmpword.append(strip);
721 tmpl += strip.size();
722
723 const char* beg = tmpword.c_str();
724 const char* end = beg + tmpl;
725
726 // now make sure all of the conditions on characters
727 // are met. Please see the appendix at the end of
728 // this file for more info on exactly what is being
729 // tested
730
731 // if all conditions are met then recall suffix_check
732
733 if (test_condition(end, beg)) {
734 struct hentry* he; // hash entry pointer
735 if (ppfx) {
736 // handle conditional suffix
737 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
738 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
739 (FLAG)aflag, needflag, IN_CPD_NOT);
740 else
741 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx,
742 (FLAG)aflag, needflag, IN_CPD_NOT);
743 } else {
744 he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
745 (FLAG)aflag, needflag, IN_CPD_NOT);
746 }
747 if (he)
748 return he;
749 }
750 }
751 return NULL;
752 }
753
754 // see if two-level suffix is present in the word
check_twosfx_morph(const char * word,int len,int optflags,PfxEntry * ppfx,const FLAG needflag)755 std::string SfxEntry::check_twosfx_morph(const char* word,
756 int len,
757 int optflags,
758 PfxEntry* ppfx,
759 const FLAG needflag) {
760 PfxEntry* ep = ppfx;
761
762 std::string result;
763
764 // if this suffix is being cross checked with a prefix
765 // but it does not support cross products skip it
766
767 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
768 return result;
769
770 // upon entry suffix is 0 length or already matches the end of the word.
771 // So if the remaining root word has positive length
772 // and if there are enough chars in root word and added back strip chars
773 // to meet the number of characters conditions, then test it
774
775 int tmpl = len - appnd.size(); // length of tmpword
776
777 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
778 (tmpl + strip.size() >= numconds)) {
779 // generate new root word by removing suffix and adding
780 // back any characters that would have been stripped or
781 // or null terminating the shorter string
782
783 std::string tmpword(word);
784 tmpword.resize(tmpl);
785 tmpword.append(strip);
786 tmpl += strip.size();
787
788 const char* beg = tmpword.c_str();
789 const char* end = beg + tmpl;
790
791 // now make sure all of the conditions on characters
792 // are met. Please see the appendix at the end of
793 // this file for more info on exactly what is being
794 // tested
795
796 // if all conditions are met then recall suffix_check
797
798 if (test_condition(end, beg)) {
799 if (ppfx) {
800 // handle conditional suffix
801 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
802 std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag,
803 needflag);
804 if (!st.empty()) {
805 if (ppfx->getMorph()) {
806 result.append(ppfx->getMorph());
807 result.push_back(MSEP_FLD);
808 }
809 result.append(st);
810 mychomp(result);
811 }
812 } else {
813 std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag,
814 needflag);
815 if (!st.empty()) {
816 result.append(st);
817 mychomp(result);
818 }
819 }
820 } else {
821 std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag);
822 if (!st.empty()) {
823 result.append(st);
824 mychomp(result);
825 }
826 }
827 }
828 }
829 return result;
830 }
831
832 // get next homonym with same affix
get_next_homonym(struct hentry * he,int optflags,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag)833 struct hentry* SfxEntry::get_next_homonym(struct hentry* he,
834 int optflags,
835 PfxEntry* ppfx,
836 const FLAG cclass,
837 const FLAG needflag) {
838 PfxEntry* ep = ppfx;
839 FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
840
841 while (he->next_homonym) {
842 he = he->next_homonym;
843 if ((TESTAFF(he->astr, aflag, he->alen) ||
844 (ep && ep->getCont() &&
845 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
846 ((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) ||
847 // handle conditional suffix
848 ((contclass) && TESTAFF(contclass, eFlag, contclasslen))) &&
849 // handle cont. class
850 ((!cclass) ||
851 ((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
852 // handle required flag
853 ((!needflag) ||
854 (TESTAFF(he->astr, needflag, he->alen) ||
855 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
856 return he;
857 }
858 return NULL;
859 }
860
initReverseWord()861 void SfxEntry::initReverseWord() {
862 rappnd = appnd;
863 reverseword(rappnd);
864 }
865
866 #if 0
867
868 Appendix: Understanding Affix Code
869
870
871 An affix is either a prefix or a suffix attached to root words to make
872 other words.
873
874 Basically a Prefix or a Suffix is set of AffEntry objects
875 which store information about the prefix or suffix along
876 with supporting routines to check if a word has a particular
877 prefix or suffix or a combination.
878
879 The structure affentry is defined as follows:
880
881 struct affentry
882 {
883 unsigned short aflag; // ID used to represent the affix
884 std::string strip; // string to strip before adding affix
885 std::string appnd; // the affix string to add
886 char numconds; // the number of conditions that must be met
887 char opts; // flag: aeXPRODUCT- combine both prefix and suffix
888 char conds[SETSIZE]; // array which encodes the conditions to be met
889 };
890
891
892 Here is a suffix borrowed from the en_US.aff file. This file
893 is whitespace delimited.
894
895 SFX D Y 4
896 SFX D 0 e d
897 SFX D y ied [^aeiou]y
898 SFX D 0 ed [^ey]
899 SFX D 0 ed [aeiou]y
900
901 This information can be interpreted as follows:
902
903 The first line has 4 fields
904
905 Field
906 -----
907 1 SFX - indicates this is a suffix
908 2 D - is the name of the character flag which represents this suffix
909 3 Y - indicates it can be combined with prefixes (cross product)
910 4 4 - indicates that sequence of 4 affentry structures are needed to
911 properly store the affix information
912
913 The remaining lines describe the unique information for the 4 SfxEntry
914 objects that make up this affix. Each line can be interpreted
915 as follows: (note fields 1 and 2 are as a check against line 1 info)
916
917 Field
918 -----
919 1 SFX - indicates this is a suffix
920 2 D - is the name of the character flag for this affix
921 3 y - the string of chars to strip off before adding affix
922 (a 0 here indicates the NULL string)
923 4 ied - the string of affix characters to add
924 5 [^aeiou]y - the conditions which must be met before the affix
925 can be applied
926
927 Field 5 is interesting. Since this is a suffix, field 5 tells us that
928 there are 2 conditions that must be met. The first condition is that
929 the next to the last character in the word must *NOT* be any of the
930 following "a", "e", "i", "o" or "u". The second condition is that
931 the last character of the word must end in "y".
932
933 So how can we encode this information concisely and be able to
934 test for both conditions in a fast manner? The answer is found
935 by studying the wonderful ispell code of Geoff Kuenning, et al.
936 (now available under a normal BSD license).
937
938 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
939 using a character (cast to an unsigned char) of a string, we have 8 bits
940 of information we can store about that character. Specifically we
941 could use each bit to say if that character is allowed in any of the
942 last (or first for prefixes) 8 characters of the word.
943
944 Basically, each character at one end of the word (up to the number
945 of conditions) is used to index into the conds array and the resulting
946 value found there says whether that character is valid for a
947 specific character position in the word.
948
949 For prefixes, it does this by setting bit 0 if that char is valid
950 in the first position, bit 1 if valid in the second position, and so on.
951
952 If a bit is not set, then that char is not valid for that position in the
953 word.
954
955 If working with suffixes bit 0 is used for the character closest
956 to the front, bit 1 for the next character towards the end, ...,
957 with bit numconds-1 representing the last char at the end of the string.
958
959 Note: since entries in the conds[] are 8 bits, only 8 conditions
960 (read that only 8 character positions) can be examined at one
961 end of a word (the beginning for prefixes and the end for suffixes).
962
963 So to make this clearer, let's encode the conds array values for the
964 first two affentries for the suffix D described earlier.
965
966
967 For the first affentry:
968 numconds = 1 (only examine the last character)
969
970 conds['e'] = (1 << 0) (the word must end in an E)
971 all others are all 0
972
973 For the second affentry:
974 numconds = 2 (only examine the last two characters)
975
976 conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
977 where X is all characters *but* a, e, i, o, or u
978
979
980 conds['y'] = (1 << 1) (the last char must be a y)
981 all other bits for all other entries in the conds array are zero
982
983 #endif
984