1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2017 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37 /*
38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 * And Contributors. All rights reserved.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 *
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 *
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 *
52 * 3. All modifications to the source code must be clearly marked as
53 * such. Binary redistributions based on modified source code
54 * must be clearly marked as modified versions in the documentation
55 * and/or other materials provided with the distribution.
56 *
57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71 #include <stdlib.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <ctype.h>
75 #include <time.h>
76
77 #include <algorithm>
78 #include <limits>
79 #include <string>
80 #include <vector>
81
82 #include "affixmgr.hxx"
83 #include "affentry.hxx"
84 #include "langnum.hxx"
85
86 #include "csutil.hxx"
87
AffixMgr(const char * affpath,const std::vector<HashMgr * > & ptr,const char * key)88 AffixMgr::AffixMgr(const char* affpath,
89 const std::vector<HashMgr*>& ptr,
90 const char* key)
91 : alldic(ptr)
92 , pHMgr(ptr[0]) {
93
94 // register hash manager and load affix data from aff file
95 csconv = NULL;
96 utf8 = 0;
97 complexprefixes = 0;
98 parsedmaptable = false;
99 parsedbreaktable = false;
100 iconvtable = NULL;
101 oconvtable = NULL;
102 // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
103 simplifiedcpd = 0;
104 parsedcheckcpd = false;
105 parseddefcpd = false;
106 phone = NULL;
107 compoundflag = FLAG_NULL; // permits word in compound forms
108 compoundbegin = FLAG_NULL; // may be first word in compound forms
109 compoundmiddle = FLAG_NULL; // may be middle word in compound forms
110 compoundend = FLAG_NULL; // may be last word in compound forms
111 compoundroot = FLAG_NULL; // compound word signing flag
112 compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
113 compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
114 compoundmoresuffixes = 0; // allow more suffixes within compound words
115 checkcompounddup = 0; // forbid double words in compounds
116 checkcompoundrep = 0; // forbid bad compounds (may be non-compound word with
117 // a REP substitution)
118 checkcompoundcase =
119 0; // forbid upper and lowercase combinations at word bounds
120 checkcompoundtriple = 0; // forbid compounds with triple letters
121 simplifiedtriple = 0; // allow simplified triple letters in compounds
122 // (Schiff+fahrt -> Schiffahrt)
123 forbiddenword = FORBIDDENWORD; // forbidden word signing flag
124 nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
125 nongramsuggest = FLAG_NULL;
126 langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
127 needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
128 cpdwordmax = -1; // default: unlimited wordcount in compound words
129 cpdmin = -1; // undefined
130 cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
131 pfxappnd = NULL; // previous prefix for counting syllables of the prefix BUG
132 sfxappnd = NULL; // previous suffix for counting syllables of the suffix BUG
133 sfxextra = 0; // modifier for syllable count of sfxappnd BUG
134 checknum = 0; // checking numbers, and word with numbers
135 havecontclass = 0; // flags of possible continuing classes (double affix)
136 // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
137 // in morhological description in dictionary file. It's often combined with
138 // PSEUDOROOT.
139 lemma_present = FLAG_NULL;
140 circumfix = FLAG_NULL;
141 onlyincompound = FLAG_NULL;
142 maxngramsugs = -1; // undefined
143 maxdiff = -1; // undefined
144 onlymaxdiff = 0;
145 maxcpdsugs = -1; // undefined
146 nosplitsugs = 0;
147 sugswithdots = 0;
148 keepcase = 0;
149 forceucase = 0;
150 warn = 0;
151 forbidwarn = 0;
152 checksharps = 0;
153 substandard = FLAG_NULL;
154 fullstrip = 0;
155
156 sfx = NULL;
157 pfx = NULL;
158
159 for (int i = 0; i < SETSIZE; i++) {
160 pStart[i] = NULL;
161 sStart[i] = NULL;
162 pFlag[i] = NULL;
163 sFlag[i] = NULL;
164 }
165
166 for (int j = 0; j < CONTSIZE; j++) {
167 contclasses[j] = 0;
168 }
169
170 if (parse_file(affpath, key)) {
171 HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n", affpath);
172 }
173
174 if (cpdmin == -1)
175 cpdmin = MINCPDLEN;
176 }
177
~AffixMgr()178 AffixMgr::~AffixMgr() {
179 // pass through linked prefix entries and clean up
180 for (int i = 0; i < SETSIZE; i++) {
181 pFlag[i] = NULL;
182 PfxEntry* ptr = pStart[i];
183 PfxEntry* nptr = NULL;
184 while (ptr) {
185 nptr = ptr->getNext();
186 delete (ptr);
187 ptr = nptr;
188 nptr = NULL;
189 }
190 }
191
192 // pass through linked suffix entries and clean up
193 for (int j = 0; j < SETSIZE; j++) {
194 sFlag[j] = NULL;
195 SfxEntry* ptr = sStart[j];
196 SfxEntry* nptr = NULL;
197 while (ptr) {
198 nptr = ptr->getNext();
199 delete (ptr);
200 ptr = nptr;
201 nptr = NULL;
202 }
203 sStart[j] = NULL;
204 }
205
206 delete iconvtable;
207 delete oconvtable;
208 delete phone;
209
210 FREE_FLAG(compoundflag);
211 FREE_FLAG(compoundbegin);
212 FREE_FLAG(compoundmiddle);
213 FREE_FLAG(compoundend);
214 FREE_FLAG(compoundpermitflag);
215 FREE_FLAG(compoundforbidflag);
216 FREE_FLAG(compoundroot);
217 FREE_FLAG(forbiddenword);
218 FREE_FLAG(nosuggest);
219 FREE_FLAG(nongramsuggest);
220 FREE_FLAG(needaffix);
221 FREE_FLAG(lemma_present);
222 FREE_FLAG(circumfix);
223 FREE_FLAG(onlyincompound);
224
225 cpdwordmax = 0;
226 pHMgr = NULL;
227 cpdmin = 0;
228 cpdmaxsyllable = 0;
229 free_utf_tbl();
230 checknum = 0;
231 #ifdef MOZILLA_CLIENT
232 delete[] csconv;
233 #endif
234 }
235
finishFileMgr(FileMgr * afflst)236 void AffixMgr::finishFileMgr(FileMgr* afflst) {
237 delete afflst;
238
239 // convert affix trees to sorted list
240 process_pfx_tree_to_list();
241 process_sfx_tree_to_list();
242 }
243
244 // read in aff file and build up prefix and suffix entry objects
parse_file(const char * affpath,const char * key)245 int AffixMgr::parse_file(const char* affpath, const char* key) {
246
247 // checking flag duplication
248 char dupflags[CONTSIZE];
249 char dupflags_ini = 1;
250
251 // first line indicator for removing byte order mark
252 int firstline = 1;
253
254 // open the affix file
255 FileMgr* afflst = new FileMgr(affpath, key);
256 if (!afflst) {
257 HUNSPELL_WARNING(
258 stderr, "error: could not open affix description file %s\n", affpath);
259 return 1;
260 }
261
262 // step one is to parse the affix file building up the internal
263 // affix data structures
264
265 // read in each line ignoring any that do not
266 // start with a known line type indicator
267 std::string line;
268 while (afflst->getline(line)) {
269 mychomp(line);
270
271 /* remove byte order mark */
272 if (firstline) {
273 firstline = 0;
274 // Affix file begins with byte order mark: possible incompatibility with
275 // old Hunspell versions
276 if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
277 line.erase(0, 3);
278 }
279 }
280
281 /* parse in the keyboard string */
282 if (line.compare(0, 3, "KEY", 3) == 0) {
283 if (!parse_string(line, keystring, afflst->getlinenum())) {
284 finishFileMgr(afflst);
285 return 1;
286 }
287 }
288
289 /* parse in the try string */
290 if (line.compare(0, 3, "TRY", 3) == 0) {
291 if (!parse_string(line, trystring, afflst->getlinenum())) {
292 finishFileMgr(afflst);
293 return 1;
294 }
295 }
296
297 /* parse in the name of the character set used by the .dict and .aff */
298 if (line.compare(0, 3, "SET", 3) == 0) {
299 if (!parse_string(line, encoding, afflst->getlinenum())) {
300 finishFileMgr(afflst);
301 return 1;
302 }
303 if (encoding == "UTF-8") {
304 utf8 = 1;
305 #ifndef OPENOFFICEORG
306 #ifndef MOZILLA_CLIENT
307 initialize_utf_tbl();
308 #endif
309 #endif
310 }
311 }
312
313 /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left
314 * writing system */
315 if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
316 complexprefixes = 1;
317
318 /* parse in the flag used by the controlled compound words */
319 if (line.compare(0, 12, "COMPOUNDFLAG", 12) == 0) {
320 if (!parse_flag(line, &compoundflag, afflst)) {
321 finishFileMgr(afflst);
322 return 1;
323 }
324 }
325
326 /* parse in the flag used by compound words */
327 if (line.compare(0, 13, "COMPOUNDBEGIN", 13) == 0) {
328 if (complexprefixes) {
329 if (!parse_flag(line, &compoundend, afflst)) {
330 finishFileMgr(afflst);
331 return 1;
332 }
333 } else {
334 if (!parse_flag(line, &compoundbegin, afflst)) {
335 finishFileMgr(afflst);
336 return 1;
337 }
338 }
339 }
340
341 /* parse in the flag used by compound words */
342 if (line.compare(0, 14, "COMPOUNDMIDDLE", 14) == 0) {
343 if (!parse_flag(line, &compoundmiddle, afflst)) {
344 finishFileMgr(afflst);
345 return 1;
346 }
347 }
348
349 /* parse in the flag used by compound words */
350 if (line.compare(0, 11, "COMPOUNDEND", 11) == 0) {
351 if (complexprefixes) {
352 if (!parse_flag(line, &compoundbegin, afflst)) {
353 finishFileMgr(afflst);
354 return 1;
355 }
356 } else {
357 if (!parse_flag(line, &compoundend, afflst)) {
358 finishFileMgr(afflst);
359 return 1;
360 }
361 }
362 }
363
364 /* parse in the data used by compound_check() method */
365 if (line.compare(0, 15, "COMPOUNDWORDMAX", 15) == 0) {
366 if (!parse_num(line, &cpdwordmax, afflst)) {
367 finishFileMgr(afflst);
368 return 1;
369 }
370 }
371
372 /* parse in the flag sign compounds in dictionary */
373 if (line.compare(0, 12, "COMPOUNDROOT", 12) == 0) {
374 if (!parse_flag(line, &compoundroot, afflst)) {
375 finishFileMgr(afflst);
376 return 1;
377 }
378 }
379
380 /* parse in the flag used by compound_check() method */
381 if (line.compare(0, 18, "COMPOUNDPERMITFLAG", 18) == 0) {
382 if (!parse_flag(line, &compoundpermitflag, afflst)) {
383 finishFileMgr(afflst);
384 return 1;
385 }
386 }
387
388 /* parse in the flag used by compound_check() method */
389 if (line.compare(0, 18, "COMPOUNDFORBIDFLAG", 18) == 0) {
390 if (!parse_flag(line, &compoundforbidflag, afflst)) {
391 finishFileMgr(afflst);
392 return 1;
393 }
394 }
395
396 if (line.compare(0, 20, "COMPOUNDMORESUFFIXES", 20) == 0) {
397 compoundmoresuffixes = 1;
398 }
399
400 if (line.compare(0, 16, "CHECKCOMPOUNDDUP", 16) == 0) {
401 checkcompounddup = 1;
402 }
403
404 if (line.compare(0, 16, "CHECKCOMPOUNDREP", 16) == 0) {
405 checkcompoundrep = 1;
406 }
407
408 if (line.compare(0, 19, "CHECKCOMPOUNDTRIPLE", 19) == 0) {
409 checkcompoundtriple = 1;
410 }
411
412 if (line.compare(0, 16, "SIMPLIFIEDTRIPLE", 16) == 0) {
413 simplifiedtriple = 1;
414 }
415
416 if (line.compare(0, 17, "CHECKCOMPOUNDCASE", 17) == 0) {
417 checkcompoundcase = 1;
418 }
419
420 if (line.compare(0, 9, "NOSUGGEST", 9) == 0) {
421 if (!parse_flag(line, &nosuggest, afflst)) {
422 finishFileMgr(afflst);
423 return 1;
424 }
425 }
426
427 if (line.compare(0, 14, "NONGRAMSUGGEST", 14) == 0) {
428 if (!parse_flag(line, &nongramsuggest, afflst)) {
429 finishFileMgr(afflst);
430 return 1;
431 }
432 }
433
434 /* parse in the flag used by forbidden words */
435 if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
436 if (!parse_flag(line, &forbiddenword, afflst)) {
437 finishFileMgr(afflst);
438 return 1;
439 }
440 }
441
442 /* parse in the flag used by forbidden words (is deprecated) */
443 if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) {
444 if (!parse_flag(line, &lemma_present, afflst)) {
445 finishFileMgr(afflst);
446 return 1;
447 }
448 }
449
450 /* parse in the flag used by circumfixes */
451 if (line.compare(0, 9, "CIRCUMFIX", 9) == 0) {
452 if (!parse_flag(line, &circumfix, afflst)) {
453 finishFileMgr(afflst);
454 return 1;
455 }
456 }
457
458 /* parse in the flag used by fogemorphemes */
459 if (line.compare(0, 14, "ONLYINCOMPOUND", 14) == 0) {
460 if (!parse_flag(line, &onlyincompound, afflst)) {
461 finishFileMgr(afflst);
462 return 1;
463 }
464 }
465
466 /* parse in the flag used by `needaffixs' (is deprecated) */
467 if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) {
468 if (!parse_flag(line, &needaffix, afflst)) {
469 finishFileMgr(afflst);
470 return 1;
471 }
472 }
473
474 /* parse in the flag used by `needaffixs' */
475 if (line.compare(0, 9, "NEEDAFFIX", 9) == 0) {
476 if (!parse_flag(line, &needaffix, afflst)) {
477 finishFileMgr(afflst);
478 return 1;
479 }
480 }
481
482 /* parse in the minimal length for words in compounds */
483 if (line.compare(0, 11, "COMPOUNDMIN", 11) == 0) {
484 if (!parse_num(line, &cpdmin, afflst)) {
485 finishFileMgr(afflst);
486 return 1;
487 }
488 if (cpdmin < 1)
489 cpdmin = 1;
490 }
491
492 /* parse in the max. words and syllables in compounds */
493 if (line.compare(0, 16, "COMPOUNDSYLLABLE", 16) == 0) {
494 if (!parse_cpdsyllable(line, afflst)) {
495 finishFileMgr(afflst);
496 return 1;
497 }
498 }
499
500 /* parse in the flag used by compound_check() method */
501 if (line.compare(0, 11, "SYLLABLENUM", 11) == 0) {
502 if (!parse_string(line, cpdsyllablenum, afflst->getlinenum())) {
503 finishFileMgr(afflst);
504 return 1;
505 }
506 }
507
508 /* parse in the flag used by the controlled compound words */
509 if (line.compare(0, 8, "CHECKNUM", 8) == 0) {
510 checknum = 1;
511 }
512
513 /* parse in the extra word characters */
514 if (line.compare(0, 9, "WORDCHARS", 9) == 0) {
515 if (!parse_array(line, wordchars, wordchars_utf16,
516 utf8, afflst->getlinenum())) {
517 finishFileMgr(afflst);
518 return 1;
519 }
520 }
521
522 /* parse in the ignored characters (for example, Arabic optional diacretics
523 * charachters */
524 if (line.compare(0, 6, "IGNORE", 6) == 0) {
525 if (!parse_array(line, ignorechars, ignorechars_utf16,
526 utf8, afflst->getlinenum())) {
527 finishFileMgr(afflst);
528 return 1;
529 }
530 }
531
532 /* parse in the input conversion table */
533 if (line.compare(0, 5, "ICONV", 5) == 0) {
534 if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) {
535 finishFileMgr(afflst);
536 return 1;
537 }
538 }
539
540 /* parse in the output conversion table */
541 if (line.compare(0, 5, "OCONV", 5) == 0) {
542 if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) {
543 finishFileMgr(afflst);
544 return 1;
545 }
546 }
547
548 /* parse in the phonetic translation table */
549 if (line.compare(0, 5, "PHONE", 5) == 0) {
550 if (!parse_phonetable(line, afflst)) {
551 finishFileMgr(afflst);
552 return 1;
553 }
554 }
555
556 /* parse in the checkcompoundpattern table */
557 if (line.compare(0, 20, "CHECKCOMPOUNDPATTERN", 20) == 0) {
558 if (!parse_checkcpdtable(line, afflst)) {
559 finishFileMgr(afflst);
560 return 1;
561 }
562 }
563
564 /* parse in the defcompound table */
565 if (line.compare(0, 12, "COMPOUNDRULE", 12) == 0) {
566 if (!parse_defcpdtable(line, afflst)) {
567 finishFileMgr(afflst);
568 return 1;
569 }
570 }
571
572 /* parse in the related character map table */
573 if (line.compare(0, 3, "MAP", 3) == 0) {
574 if (!parse_maptable(line, afflst)) {
575 finishFileMgr(afflst);
576 return 1;
577 }
578 }
579
580 /* parse in the word breakpoints table */
581 if (line.compare(0, 5, "BREAK", 5) == 0) {
582 if (!parse_breaktable(line, afflst)) {
583 finishFileMgr(afflst);
584 return 1;
585 }
586 }
587
588 /* parse in the language for language specific codes */
589 if (line.compare(0, 4, "LANG", 4) == 0) {
590 if (!parse_string(line, lang, afflst->getlinenum())) {
591 finishFileMgr(afflst);
592 return 1;
593 }
594 langnum = get_lang_num(lang);
595 }
596
597 if (line.compare(0, 7, "VERSION", 7) == 0) {
598 size_t startpos = line.find_first_not_of(" \t", 7);
599 if (startpos != std::string::npos) {
600 version = line.substr(startpos);
601 }
602 }
603
604 if (line.compare(0, 12, "MAXNGRAMSUGS", 12) == 0) {
605 if (!parse_num(line, &maxngramsugs, afflst)) {
606 finishFileMgr(afflst);
607 return 1;
608 }
609 }
610
611 if (line.compare(0, 11, "ONLYMAXDIFF", 11) == 0)
612 onlymaxdiff = 1;
613
614 if (line.compare(0, 7, "MAXDIFF", 7) == 0) {
615 if (!parse_num(line, &maxdiff, afflst)) {
616 finishFileMgr(afflst);
617 return 1;
618 }
619 }
620
621 if (line.compare(0, 10, "MAXCPDSUGS", 10) == 0) {
622 if (!parse_num(line, &maxcpdsugs, afflst)) {
623 finishFileMgr(afflst);
624 return 1;
625 }
626 }
627
628 if (line.compare(0, 11, "NOSPLITSUGS", 11) == 0) {
629 nosplitsugs = 1;
630 }
631
632 if (line.compare(0, 9, "FULLSTRIP", 9) == 0) {
633 fullstrip = 1;
634 }
635
636 if (line.compare(0, 12, "SUGSWITHDOTS", 12) == 0) {
637 sugswithdots = 1;
638 }
639
640 /* parse in the flag used by forbidden words */
641 if (line.compare(0, 8, "KEEPCASE", 8) == 0) {
642 if (!parse_flag(line, &keepcase, afflst)) {
643 finishFileMgr(afflst);
644 return 1;
645 }
646 }
647
648 /* parse in the flag used by `forceucase' */
649 if (line.compare(0, 10, "FORCEUCASE", 10) == 0) {
650 if (!parse_flag(line, &forceucase, afflst)) {
651 finishFileMgr(afflst);
652 return 1;
653 }
654 }
655
656 /* parse in the flag used by `warn' */
657 if (line.compare(0, 4, "WARN", 4) == 0) {
658 if (!parse_flag(line, &warn, afflst)) {
659 finishFileMgr(afflst);
660 return 1;
661 }
662 }
663
664 if (line.compare(0, 10, "FORBIDWARN", 10) == 0) {
665 forbidwarn = 1;
666 }
667
668 /* parse in the flag used by the affix generator */
669 if (line.compare(0, 11, "SUBSTANDARD", 11) == 0) {
670 if (!parse_flag(line, &substandard, afflst)) {
671 finishFileMgr(afflst);
672 return 1;
673 }
674 }
675
676 if (line.compare(0, 11, "CHECKSHARPS", 11) == 0) {
677 checksharps = 1;
678 }
679
680 /* parse this affix: P - prefix, S - suffix */
681 // affix type
682 char ft = ' ';
683 if (line.compare(0, 3, "PFX", 3) == 0)
684 ft = complexprefixes ? 'S' : 'P';
685 if (line.compare(0, 3, "SFX", 3) == 0)
686 ft = complexprefixes ? 'P' : 'S';
687 if (ft != ' ') {
688 if (dupflags_ini) {
689 memset(dupflags, 0, sizeof(dupflags));
690 dupflags_ini = 0;
691 }
692 if (!parse_affix(line, ft, afflst, dupflags)) {
693 finishFileMgr(afflst);
694 return 1;
695 }
696 }
697 }
698
699 finishFileMgr(afflst);
700 // affix trees are sorted now
701
702 // now we can speed up performance greatly taking advantage of the
703 // relationship between the affixes and the idea of "subsets".
704
705 // View each prefix as a potential leading subset of another and view
706 // each suffix (reversed) as a potential trailing subset of another.
707
708 // To illustrate this relationship if we know the prefix "ab" is found in the
709 // word to examine, only prefixes that "ab" is a leading subset of need be
710 // examined.
711 // Furthermore is "ab" is not present then none of the prefixes that "ab" is
712 // is a subset need be examined.
713 // The same argument goes for suffix string that are reversed.
714
715 // Then to top this off why not examine the first char of the word to quickly
716 // limit the set of prefixes to examine (i.e. the prefixes to examine must
717 // be leading supersets of the first character of the word (if they exist)
718
719 // To take advantage of this "subset" relationship, we need to add two links
720 // from entry. One to take next if the current prefix is found (call it
721 // nexteq)
722 // and one to take next if the current prefix is not found (call it nextne).
723
724 // Since we have built ordered lists, all that remains is to properly
725 // initialize
726 // the nextne and nexteq pointers that relate them
727
728 process_pfx_order();
729 process_sfx_order();
730
731 /* get encoding for CHECKCOMPOUNDCASE */
732 if (!utf8) {
733 csconv = get_current_cs(get_encoding());
734 for (int i = 0; i <= 255; i++) {
735 if ((csconv[i].cupper != csconv[i].clower) &&
736 (wordchars.find((char)i) == std::string::npos)) {
737 wordchars.push_back((char)i);
738 }
739 }
740
741 }
742
743 // default BREAK definition
744 if (!parsedbreaktable) {
745 breaktable.push_back("-");
746 breaktable.push_back("^-");
747 breaktable.push_back("-$");
748 parsedbreaktable = true;
749 }
750 return 0;
751 }
752
753 // we want to be able to quickly access prefix information
754 // both by prefix flag, and sorted by prefix string itself
755 // so we need to set up two indexes
756
build_pfxtree(PfxEntry * pfxptr)757 int AffixMgr::build_pfxtree(PfxEntry* pfxptr) {
758 PfxEntry* ptr;
759 PfxEntry* pptr;
760 PfxEntry* ep = pfxptr;
761
762 // get the right starting points
763 const char* key = ep->getKey();
764 const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
765
766 // first index by flag which must exist
767 ptr = pFlag[flg];
768 ep->setFlgNxt(ptr);
769 pFlag[flg] = ep;
770
771 // handle the special case of null affix string
772 if (strlen(key) == 0) {
773 // always inset them at head of list at element 0
774 ptr = pStart[0];
775 ep->setNext(ptr);
776 pStart[0] = ep;
777 return 0;
778 }
779
780 // now handle the normal case
781 ep->setNextEQ(NULL);
782 ep->setNextNE(NULL);
783
784 unsigned char sp = *((const unsigned char*)key);
785 ptr = pStart[sp];
786
787 // handle the first insert
788 if (!ptr) {
789 pStart[sp] = ep;
790 return 0;
791 }
792
793 // otherwise use binary tree insertion so that a sorted
794 // list can easily be generated later
795 pptr = NULL;
796 for (;;) {
797 pptr = ptr;
798 if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
799 ptr = ptr->getNextEQ();
800 if (!ptr) {
801 pptr->setNextEQ(ep);
802 break;
803 }
804 } else {
805 ptr = ptr->getNextNE();
806 if (!ptr) {
807 pptr->setNextNE(ep);
808 break;
809 }
810 }
811 }
812 return 0;
813 }
814
815 // we want to be able to quickly access suffix information
816 // both by suffix flag, and sorted by the reverse of the
817 // suffix string itself; so we need to set up two indexes
build_sfxtree(SfxEntry * sfxptr)818 int AffixMgr::build_sfxtree(SfxEntry* sfxptr) {
819
820 sfxptr->initReverseWord();
821
822 SfxEntry* ptr;
823 SfxEntry* pptr;
824 SfxEntry* ep = sfxptr;
825
826 /* get the right starting point */
827 const char* key = ep->getKey();
828 const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
829
830 // first index by flag which must exist
831 ptr = sFlag[flg];
832 ep->setFlgNxt(ptr);
833 sFlag[flg] = ep;
834
835 // next index by affix string
836
837 // handle the special case of null affix string
838 if (strlen(key) == 0) {
839 // always inset them at head of list at element 0
840 ptr = sStart[0];
841 ep->setNext(ptr);
842 sStart[0] = ep;
843 return 0;
844 }
845
846 // now handle the normal case
847 ep->setNextEQ(NULL);
848 ep->setNextNE(NULL);
849
850 unsigned char sp = *((const unsigned char*)key);
851 ptr = sStart[sp];
852
853 // handle the first insert
854 if (!ptr) {
855 sStart[sp] = ep;
856 return 0;
857 }
858
859 // otherwise use binary tree insertion so that a sorted
860 // list can easily be generated later
861 pptr = NULL;
862 for (;;) {
863 pptr = ptr;
864 if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
865 ptr = ptr->getNextEQ();
866 if (!ptr) {
867 pptr->setNextEQ(ep);
868 break;
869 }
870 } else {
871 ptr = ptr->getNextNE();
872 if (!ptr) {
873 pptr->setNextNE(ep);
874 break;
875 }
876 }
877 }
878 return 0;
879 }
880
881 // convert from binary tree to sorted list
process_pfx_tree_to_list()882 int AffixMgr::process_pfx_tree_to_list() {
883 for (int i = 1; i < SETSIZE; i++) {
884 pStart[i] = process_pfx_in_order(pStart[i], NULL);
885 }
886 return 0;
887 }
888
process_pfx_in_order(PfxEntry * ptr,PfxEntry * nptr)889 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) {
890 if (ptr) {
891 nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
892 ptr->setNext(nptr);
893 nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
894 }
895 return nptr;
896 }
897
898 // convert from binary tree to sorted list
process_sfx_tree_to_list()899 int AffixMgr::process_sfx_tree_to_list() {
900 for (int i = 1; i < SETSIZE; i++) {
901 sStart[i] = process_sfx_in_order(sStart[i], NULL);
902 }
903 return 0;
904 }
905
process_sfx_in_order(SfxEntry * ptr,SfxEntry * nptr)906 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) {
907 if (ptr) {
908 nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
909 ptr->setNext(nptr);
910 nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
911 }
912 return nptr;
913 }
914
915 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
916 // using the idea of leading subsets this time
process_pfx_order()917 int AffixMgr::process_pfx_order() {
918 PfxEntry* ptr;
919
920 // loop through each prefix list starting point
921 for (int i = 1; i < SETSIZE; i++) {
922 ptr = pStart[i];
923
924 // look through the remainder of the list
925 // and find next entry with affix that
926 // the current one is not a subset of
927 // mark that as destination for NextNE
928 // use next in list that you are a subset
929 // of as NextEQ
930
931 for (; ptr != NULL; ptr = ptr->getNext()) {
932 PfxEntry* nptr = ptr->getNext();
933 for (; nptr != NULL; nptr = nptr->getNext()) {
934 if (!isSubset(ptr->getKey(), nptr->getKey()))
935 break;
936 }
937 ptr->setNextNE(nptr);
938 ptr->setNextEQ(NULL);
939 if ((ptr->getNext()) &&
940 isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
941 ptr->setNextEQ(ptr->getNext());
942 }
943
944 // now clean up by adding smart search termination strings:
945 // if you are already a superset of the previous prefix
946 // but not a subset of the next, search can end here
947 // so set NextNE properly
948
949 ptr = pStart[i];
950 for (; ptr != NULL; ptr = ptr->getNext()) {
951 PfxEntry* nptr = ptr->getNext();
952 PfxEntry* mptr = NULL;
953 for (; nptr != NULL; nptr = nptr->getNext()) {
954 if (!isSubset(ptr->getKey(), nptr->getKey()))
955 break;
956 mptr = nptr;
957 }
958 if (mptr)
959 mptr->setNextNE(NULL);
960 }
961 }
962 return 0;
963 }
964
965 // initialize the SfxEntry links NextEQ and NextNE to speed searching
966 // using the idea of leading subsets this time
process_sfx_order()967 int AffixMgr::process_sfx_order() {
968 SfxEntry* ptr;
969
970 // loop through each prefix list starting point
971 for (int i = 1; i < SETSIZE; i++) {
972 ptr = sStart[i];
973
974 // look through the remainder of the list
975 // and find next entry with affix that
976 // the current one is not a subset of
977 // mark that as destination for NextNE
978 // use next in list that you are a subset
979 // of as NextEQ
980
981 for (; ptr != NULL; ptr = ptr->getNext()) {
982 SfxEntry* nptr = ptr->getNext();
983 for (; nptr != NULL; nptr = nptr->getNext()) {
984 if (!isSubset(ptr->getKey(), nptr->getKey()))
985 break;
986 }
987 ptr->setNextNE(nptr);
988 ptr->setNextEQ(NULL);
989 if ((ptr->getNext()) &&
990 isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
991 ptr->setNextEQ(ptr->getNext());
992 }
993
994 // now clean up by adding smart search termination strings:
995 // if you are already a superset of the previous suffix
996 // but not a subset of the next, search can end here
997 // so set NextNE properly
998
999 ptr = sStart[i];
1000 for (; ptr != NULL; ptr = ptr->getNext()) {
1001 SfxEntry* nptr = ptr->getNext();
1002 SfxEntry* mptr = NULL;
1003 for (; nptr != NULL; nptr = nptr->getNext()) {
1004 if (!isSubset(ptr->getKey(), nptr->getKey()))
1005 break;
1006 mptr = nptr;
1007 }
1008 if (mptr)
1009 mptr->setNextNE(NULL);
1010 }
1011 }
1012 return 0;
1013 }
1014
1015 // add flags to the result for dictionary debugging
debugflag(std::string & result,unsigned short flag)1016 std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
1017 char* st = encode_flag(flag);
1018 result.push_back(MSEP_FLD);
1019 result.append(MORPH_FLAG);
1020 if (st) {
1021 result.append(st);
1022 free(st);
1023 }
1024 return result;
1025 }
1026
1027 // calculate the character length of the condition
condlen(const char * st)1028 int AffixMgr::condlen(const char* st) {
1029 int l = 0;
1030 bool group = false;
1031 for (; *st; st++) {
1032 if (*st == '[') {
1033 group = true;
1034 l++;
1035 } else if (*st == ']')
1036 group = false;
1037 else if (!group && (!utf8 || (!(*st & 0x80) || ((*st & 0xc0) == 0x80))))
1038 l++;
1039 }
1040 return l;
1041 }
1042
encodeit(AffEntry & entry,const char * cs)1043 int AffixMgr::encodeit(AffEntry& entry, const char* cs) {
1044 if (strcmp(cs, ".") != 0) {
1045 entry.numconds = (char)condlen(cs);
1046 const size_t cslen = strlen(cs);
1047 const size_t short_part = std::min<size_t>(MAXCONDLEN, cslen);
1048 memcpy(entry.c.conds, cs, short_part);
1049 if (short_part < MAXCONDLEN) {
1050 //blank out the remaining space
1051 memset(entry.c.conds + short_part, 0, MAXCONDLEN - short_part);
1052 } else if (cs[MAXCONDLEN]) {
1053 //there is more conditions than fit in fixed space, so its
1054 //a long condition
1055 entry.opts += aeLONGCOND;
1056 entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
1057 if (!entry.c.l.conds2)
1058 return 1;
1059 }
1060 } else {
1061 entry.numconds = 0;
1062 entry.c.conds[0] = '\0';
1063 }
1064 return 0;
1065 }
1066
1067 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
isSubset(const char * s1,const char * s2)1068 inline int AffixMgr::isSubset(const char* s1, const char* s2) {
1069 while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
1070 s1++;
1071 s2++;
1072 }
1073 return (*s1 == '\0');
1074 }
1075
1076 // check word for prefixes
prefix_check(const char * word,int len,char in_compound,const FLAG needflag)1077 struct hentry* AffixMgr::prefix_check(const char* word,
1078 int len,
1079 char in_compound,
1080 const FLAG needflag) {
1081 struct hentry* rv = NULL;
1082
1083 pfx = NULL;
1084 pfxappnd = NULL;
1085 sfxappnd = NULL;
1086 sfxextra = 0;
1087
1088 // first handle the special case of 0 length prefixes
1089 PfxEntry* pe = pStart[0];
1090 while (pe) {
1091 if (
1092 // fogemorpheme
1093 ((in_compound != IN_CPD_NOT) ||
1094 !(pe->getCont() &&
1095 (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
1096 // permit prefixes in compounds
1097 ((in_compound != IN_CPD_END) ||
1098 (pe->getCont() &&
1099 (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))) {
1100 // check prefix
1101 rv = pe->checkword(word, len, in_compound, needflag);
1102 if (rv) {
1103 pfx = pe; // BUG: pfx not stateless
1104 return rv;
1105 }
1106 }
1107 pe = pe->getNext();
1108 }
1109
1110 // now handle the general case
1111 unsigned char sp = *((const unsigned char*)word);
1112 PfxEntry* pptr = pStart[sp];
1113
1114 while (pptr) {
1115 if (isSubset(pptr->getKey(), word)) {
1116 if (
1117 // fogemorpheme
1118 ((in_compound != IN_CPD_NOT) ||
1119 !(pptr->getCont() &&
1120 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
1121 // permit prefixes in compounds
1122 ((in_compound != IN_CPD_END) ||
1123 (pptr->getCont() && (TESTAFF(pptr->getCont(), compoundpermitflag,
1124 pptr->getContLen()))))) {
1125 // check prefix
1126 rv = pptr->checkword(word, len, in_compound, needflag);
1127 if (rv) {
1128 pfx = pptr; // BUG: pfx not stateless
1129 return rv;
1130 }
1131 }
1132 pptr = pptr->getNextEQ();
1133 } else {
1134 pptr = pptr->getNextNE();
1135 }
1136 }
1137
1138 return NULL;
1139 }
1140
1141 // check word for prefixes and two-level suffixes
prefix_check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)1142 struct hentry* AffixMgr::prefix_check_twosfx(const char* word,
1143 int len,
1144 char in_compound,
1145 const FLAG needflag) {
1146 struct hentry* rv = NULL;
1147
1148 pfx = NULL;
1149 sfxappnd = NULL;
1150 sfxextra = 0;
1151
1152 // first handle the special case of 0 length prefixes
1153 PfxEntry* pe = pStart[0];
1154
1155 while (pe) {
1156 rv = pe->check_twosfx(word, len, in_compound, needflag);
1157 if (rv)
1158 return rv;
1159 pe = pe->getNext();
1160 }
1161
1162 // now handle the general case
1163 unsigned char sp = *((const unsigned char*)word);
1164 PfxEntry* pptr = pStart[sp];
1165
1166 while (pptr) {
1167 if (isSubset(pptr->getKey(), word)) {
1168 rv = pptr->check_twosfx(word, len, in_compound, needflag);
1169 if (rv) {
1170 pfx = pptr;
1171 return rv;
1172 }
1173 pptr = pptr->getNextEQ();
1174 } else {
1175 pptr = pptr->getNextNE();
1176 }
1177 }
1178
1179 return NULL;
1180 }
1181
1182 // check word for prefixes and morph
prefix_check_morph(const char * word,int len,char in_compound,const FLAG needflag)1183 std::string AffixMgr::prefix_check_morph(const char* word,
1184 int len,
1185 char in_compound,
1186 const FLAG needflag) {
1187
1188 std::string result;
1189
1190 pfx = NULL;
1191 sfxappnd = NULL;
1192 sfxextra = 0;
1193
1194 // first handle the special case of 0 length prefixes
1195 PfxEntry* pe = pStart[0];
1196 while (pe) {
1197 std::string st = pe->check_morph(word, len, in_compound, needflag);
1198 if (!st.empty()) {
1199 result.append(st);
1200 }
1201 pe = pe->getNext();
1202 }
1203
1204 // now handle the general case
1205 unsigned char sp = *((const unsigned char*)word);
1206 PfxEntry* pptr = pStart[sp];
1207
1208 while (pptr) {
1209 if (isSubset(pptr->getKey(), word)) {
1210 std::string st = pptr->check_morph(word, len, in_compound, needflag);
1211 if (!st.empty()) {
1212 // fogemorpheme
1213 if ((in_compound != IN_CPD_NOT) ||
1214 !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound,
1215 pptr->getContLen()))))) {
1216 result.append(st);
1217 pfx = pptr;
1218 }
1219 }
1220 pptr = pptr->getNextEQ();
1221 } else {
1222 pptr = pptr->getNextNE();
1223 }
1224 }
1225
1226 return result;
1227 }
1228
1229 // check word for prefixes and morph and two-level suffixes
prefix_check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)1230 std::string AffixMgr::prefix_check_twosfx_morph(const char* word,
1231 int len,
1232 char in_compound,
1233 const FLAG needflag) {
1234 std::string result;
1235
1236 pfx = NULL;
1237 sfxappnd = NULL;
1238 sfxextra = 0;
1239
1240 // first handle the special case of 0 length prefixes
1241 PfxEntry* pe = pStart[0];
1242 while (pe) {
1243 std::string st = pe->check_twosfx_morph(word, len, in_compound, needflag);
1244 if (!st.empty()) {
1245 result.append(st);
1246 }
1247 pe = pe->getNext();
1248 }
1249
1250 // now handle the general case
1251 unsigned char sp = *((const unsigned char*)word);
1252 PfxEntry* pptr = pStart[sp];
1253
1254 while (pptr) {
1255 if (isSubset(pptr->getKey(), word)) {
1256 std::string st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
1257 if (!st.empty()) {
1258 result.append(st);
1259 pfx = pptr;
1260 }
1261 pptr = pptr->getNextEQ();
1262 } else {
1263 pptr = pptr->getNextNE();
1264 }
1265 }
1266
1267 return result;
1268 }
1269
1270 // Is word a non-compound with a REP substitution (see checkcompoundrep)?
cpdrep_check(const char * word,int wl)1271 int AffixMgr::cpdrep_check(const char* word, int wl) {
1272
1273 if ((wl < 2) || get_reptable().empty())
1274 return 0;
1275
1276 for (size_t i = 0; i < get_reptable().size(); ++i) {
1277 // use only available mid patterns
1278 if (!get_reptable()[i].outstrings[0].empty()) {
1279 const char* r = word;
1280 const size_t lenp = get_reptable()[i].pattern.size();
1281 // search every occurence of the pattern in the word
1282 while ((r = strstr(r, get_reptable()[i].pattern.c_str())) != NULL) {
1283 std::string candidate(word);
1284 candidate.replace(r - word, lenp, get_reptable()[i].outstrings[0]);
1285 if (candidate_check(candidate.c_str(), candidate.size()))
1286 return 1;
1287 ++r; // search for the next letter
1288 }
1289 }
1290 }
1291
1292 return 0;
1293 }
1294
1295 // forbid compound words, if they are in the dictionary as a
1296 // word pair separated by space
cpdwordpair_check(const char * word,int wl)1297 int AffixMgr::cpdwordpair_check(const char * word, int wl) {
1298 if (wl > 2) {
1299 std::string candidate(word);
1300 for (size_t i = 1; i < candidate.size(); i++) {
1301 // go to end of the UTF-8 character
1302 if (utf8 && ((word[i] & 0xc0) == 0x80))
1303 continue;
1304 candidate.insert(i, 1, ' ');
1305 if (candidate_check(candidate.c_str(), candidate.size()))
1306 return 1;
1307 candidate.erase(i, 1);
1308 }
1309 }
1310
1311 return 0;
1312 }
1313
1314 // forbid compoundings when there are special patterns at word bound
cpdpat_check(const char * word,int pos,hentry * r1,hentry * r2,const char)1315 int AffixMgr::cpdpat_check(const char* word,
1316 int pos,
1317 hentry* r1,
1318 hentry* r2,
1319 const char /*affixed*/) {
1320 for (size_t i = 0; i < checkcpdtable.size(); ++i) {
1321 size_t len;
1322 if (isSubset(checkcpdtable[i].pattern2.c_str(), word + pos) &&
1323 (!r1 || !checkcpdtable[i].cond ||
1324 (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
1325 (!r2 || !checkcpdtable[i].cond2 ||
1326 (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
1327 // zero length pattern => only TESTAFF
1328 // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
1329 (checkcpdtable[i].pattern.empty() ||
1330 ((checkcpdtable[i].pattern[0] == '0' && r1->blen <= pos &&
1331 strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
1332 (checkcpdtable[i].pattern[0] != '0' &&
1333 ((len = checkcpdtable[i].pattern.size()) != 0) &&
1334 strncmp(word + pos - len, checkcpdtable[i].pattern.c_str(), len) == 0)))) {
1335 return 1;
1336 }
1337 }
1338 return 0;
1339 }
1340
1341 // forbid compounding with neighbouring upper and lower case characters at word
1342 // bounds
cpdcase_check(const char * word,int pos)1343 int AffixMgr::cpdcase_check(const char* word, int pos) {
1344 if (utf8) {
1345 const char* p;
1346 for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--)
1347 ;
1348 std::string pair(p);
1349 std::vector<w_char> pair_u;
1350 u8_u16(pair_u, pair);
1351 unsigned short a = pair_u.size() > 1 ? ((pair_u[1].h << 8) + pair_u[1].l) : 0;
1352 unsigned short b = !pair_u.empty() ? ((pair_u[0].h << 8) + pair_u[0].l) : 0;
1353 if (((unicodetoupper(a, langnum) == a) ||
1354 (unicodetoupper(b, langnum) == b)) &&
1355 (a != '-') && (b != '-'))
1356 return 1;
1357 } else {
1358 unsigned char a = *(word + pos - 1);
1359 unsigned char b = *(word + pos);
1360 if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-'))
1361 return 1;
1362 }
1363 return 0;
1364 }
1365
// Backtracking record for one metacharacter (*, ?) of a compound pattern.
struct metachar_data {
  // pattern position of the metacharacter, used for backtracking
  short btpp;
  // word position at which the metacharacter started matching
  short btwp;
  // number of words matched by the metacharacter
  int btnum;
};
1371
1372 // check compound patterns
defcpd_check(hentry *** words,short wnum,hentry * rv,hentry ** def,char all)1373 int AffixMgr::defcpd_check(hentry*** words,
1374 short wnum,
1375 hentry* rv,
1376 hentry** def,
1377 char all) {
1378 int w = 0;
1379
1380 if (!*words) {
1381 w = 1;
1382 *words = def;
1383 }
1384
1385 if (!*words) {
1386 return 0;
1387 }
1388
1389 std::vector<metachar_data> btinfo(1);
1390
1391 short bt = 0;
1392
1393 (*words)[wnum] = rv;
1394
1395 // has the last word COMPOUNDRULE flag?
1396 if (rv->alen == 0) {
1397 (*words)[wnum] = NULL;
1398 if (w)
1399 *words = NULL;
1400 return 0;
1401 }
1402 int ok = 0;
1403 for (size_t i = 0; i < defcpdtable.size(); ++i) {
1404 for (size_t j = 0; j < defcpdtable[i].size(); ++j) {
1405 if (defcpdtable[i][j] != '*' && defcpdtable[i][j] != '?' &&
1406 TESTAFF(rv->astr, defcpdtable[i][j], rv->alen)) {
1407 ok = 1;
1408 break;
1409 }
1410 }
1411 }
1412 if (ok == 0) {
1413 (*words)[wnum] = NULL;
1414 if (w)
1415 *words = NULL;
1416 return 0;
1417 }
1418
1419 for (size_t i = 0; i < defcpdtable.size(); ++i) {
1420 size_t pp = 0; // pattern position
1421 signed short wp = 0; // "words" position
1422 int ok2;
1423 ok = 1;
1424 ok2 = 1;
1425 do {
1426 while ((pp < defcpdtable[i].size()) && (wp <= wnum)) {
1427 if (((pp + 1) < defcpdtable[i].size()) &&
1428 ((defcpdtable[i][pp + 1] == '*') ||
1429 (defcpdtable[i][pp + 1] == '?'))) {
1430 int wend = (defcpdtable[i][pp + 1] == '?') ? wp : wnum;
1431 ok2 = 1;
1432 pp += 2;
1433 btinfo[bt].btpp = pp;
1434 btinfo[bt].btwp = wp;
1435 while (wp <= wend) {
1436 if (!(*words)[wp]->alen ||
1437 !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp - 2],
1438 (*words)[wp]->alen)) {
1439 ok2 = 0;
1440 break;
1441 }
1442 wp++;
1443 }
1444 if (wp <= wnum)
1445 ok2 = 0;
1446 btinfo[bt].btnum = wp - btinfo[bt].btwp;
1447 if (btinfo[bt].btnum > 0) {
1448 ++bt;
1449 btinfo.resize(bt+1);
1450 }
1451 if (ok2)
1452 break;
1453 } else {
1454 ok2 = 1;
1455 if (!(*words)[wp] || !(*words)[wp]->alen ||
1456 !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp],
1457 (*words)[wp]->alen)) {
1458 ok = 0;
1459 break;
1460 }
1461 pp++;
1462 wp++;
1463 if ((defcpdtable[i].size() == pp) && !(wp > wnum))
1464 ok = 0;
1465 }
1466 }
1467 if (ok && ok2) {
1468 size_t r = pp;
1469 while ((defcpdtable[i].size() > r) && ((r + 1) < defcpdtable[i].size()) &&
1470 ((defcpdtable[i][r + 1] == '*') ||
1471 (defcpdtable[i][r + 1] == '?')))
1472 r += 2;
1473 if (defcpdtable[i].size() <= r)
1474 return 1;
1475 }
1476 // backtrack
1477 if (bt)
1478 do {
1479 ok = 1;
1480 btinfo[bt - 1].btnum--;
1481 pp = btinfo[bt - 1].btpp;
1482 wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum;
1483 } while ((btinfo[bt - 1].btnum < 0) && --bt);
1484 } while (bt);
1485
1486 if (ok && ok2 && (!all || (defcpdtable[i].size() <= pp)))
1487 return 1;
1488
1489 // check zero ending
1490 while (ok && ok2 && (defcpdtable[i].size() > pp) &&
1491 ((pp + 1) < defcpdtable[i].size()) &&
1492 ((defcpdtable[i][pp + 1] == '*') ||
1493 (defcpdtable[i][pp + 1] == '?')))
1494 pp += 2;
1495 if (ok && ok2 && (defcpdtable[i].size() <= pp))
1496 return 1;
1497 }
1498 (*words)[wnum] = NULL;
1499 if (w)
1500 *words = NULL;
1501 return 0;
1502 }
1503
candidate_check(const char * word,int len)1504 inline int AffixMgr::candidate_check(const char* word, int len) {
1505
1506 struct hentry* rv = lookup(word);
1507 if (rv)
1508 return 1;
1509
1510 // rv = prefix_check(word,len,1);
1511 // if (rv) return 1;
1512
1513 rv = affix_check(word, len);
1514 if (rv)
1515 return 1;
1516 return 0;
1517 }
1518
1519 // calculate number of syllable for compound-checking
get_syllable(const std::string & word)1520 short AffixMgr::get_syllable(const std::string& word) {
1521 if (cpdmaxsyllable == 0)
1522 return 0;
1523
1524 short num = 0;
1525
1526 if (!utf8) {
1527 for (size_t i = 0; i < word.size(); ++i) {
1528 if (std::binary_search(cpdvowels.begin(), cpdvowels.end(),
1529 word[i])) {
1530 ++num;
1531 }
1532 }
1533 } else if (!cpdvowels_utf16.empty()) {
1534 std::vector<w_char> w;
1535 u8_u16(w, word);
1536 for (size_t i = 0; i < w.size(); ++i) {
1537 if (std::binary_search(cpdvowels_utf16.begin(),
1538 cpdvowels_utf16.end(),
1539 w[i])) {
1540 ++num;
1541 }
1542 }
1543 }
1544
1545 return num;
1546 }
1547
setcminmax(int * cmin,int * cmax,const char * word,int len)1548 void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) {
1549 if (utf8) {
1550 int i;
1551 for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) {
1552 for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
1553 ;
1554 }
1555 for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax >= 0; i++) {
1556 for ((*cmax)--; *cmax >= 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
1557 ;
1558 }
1559 } else {
1560 *cmin = cpdmin;
1561 *cmax = len - cpdmin + 1;
1562 }
1563 }
1564
1565 // check if compound word is correctly spelled
1566 // hu_mov_rule = spec. Hungarian rule (XXX)
compound_check(const std::string & word,short wordnum,short numsyllable,short maxwordnum,short wnum,hentry ** words=NULL,hentry ** rwords=NULL,char hu_mov_rule=0,char is_sug=0,int * info=NULL)1567 struct hentry* AffixMgr::compound_check(const std::string& word,
1568 short wordnum,
1569 short numsyllable,
1570 short maxwordnum,
1571 short wnum,
1572 hentry** words = NULL,
1573 hentry** rwords = NULL,
1574 char hu_mov_rule = 0,
1575 char is_sug = 0,
1576 int* info = NULL) {
1577 int i;
1578 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1579 struct hentry* rv = NULL;
1580 struct hentry* rv_first;
1581 std::string st;
1582 char ch = '\0';
1583 int cmin;
1584 int cmax;
1585 int striple = 0;
1586 size_t scpd = 0;
1587 int soldi = 0;
1588 int oldcmin = 0;
1589 int oldcmax = 0;
1590 int oldlen = 0;
1591 int checkedstriple = 0;
1592 char affixed = 0;
1593 hentry** oldwords = words;
1594 size_t len = word.size();
1595
1596 int checked_prefix;
1597
1598 // add a time limit to handle possible
1599 // combinatorical explosion of the overlapping words
1600
1601 HUNSPELL_THREAD_LOCAL clock_t timelimit;
1602
1603 if (wordnum == 0)
1604 timelimit = clock();
1605 else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
1606 timelimit = 0;
1607 }
1608
1609 setcminmax(&cmin, &cmax, word.c_str(), len);
1610
1611 st.assign(word);
1612
1613 for (i = cmin; i < cmax; i++) {
1614 // go to end of the UTF-8 character
1615 if (utf8) {
1616 for (; (st[i] & 0xc0) == 0x80; i++)
1617 ;
1618 if (i >= cmax)
1619 return NULL;
1620 }
1621
1622 words = oldwords;
1623 int onlycpdrule = (words) ? 1 : 0;
1624
1625 do { // onlycpdrule loop
1626
1627 oldnumsyllable = numsyllable;
1628 oldwordnum = wordnum;
1629 checked_prefix = 0;
1630
1631 do { // simplified checkcompoundpattern loop
1632
1633 if (timelimit == 0)
1634 return 0;
1635
1636 if (scpd > 0) {
1637 for (; scpd <= checkcpdtable.size() &&
1638 (checkcpdtable[scpd - 1].pattern3.empty() ||
1639 strncmp(word.c_str() + i, checkcpdtable[scpd - 1].pattern3.c_str(),
1640 checkcpdtable[scpd - 1].pattern3.size()) != 0);
1641 scpd++)
1642 ;
1643
1644 if (scpd > checkcpdtable.size())
1645 break; // break simplified checkcompoundpattern loop
1646 st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern);
1647 soldi = i;
1648 i += checkcpdtable[scpd - 1].pattern.size();
1649 st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2);
1650 st.replace(i + checkcpdtable[scpd - 1].pattern2.size(), std::string::npos,
1651 word.substr(soldi + checkcpdtable[scpd - 1].pattern3.size()));
1652
1653 oldlen = len;
1654 len += checkcpdtable[scpd - 1].pattern.size() +
1655 checkcpdtable[scpd - 1].pattern2.size() -
1656 checkcpdtable[scpd - 1].pattern3.size();
1657 oldcmin = cmin;
1658 oldcmax = cmax;
1659 setcminmax(&cmin, &cmax, st.c_str(), len);
1660
1661 cmax = len - cpdmin + 1;
1662 }
1663
1664 ch = st[i];
1665 st[i] = '\0';
1666
1667 sfx = NULL;
1668 pfx = NULL;
1669
1670 // FIRST WORD
1671
1672 affixed = 1;
1673 rv = lookup(st.c_str()); // perhaps without prefix
1674
1675 // forbid dictionary stems with COMPOUNDFORBIDFLAG in
1676 // compound words, overriding the effect of COMPOUNDPERMITFLAG
1677 if ((rv) && compoundforbidflag &&
1678 TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
1679 continue;
1680
1681 // search homonym with compound flag
1682 while ((rv) && !hu_mov_rule &&
1683 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1684 !((compoundflag && !words && !onlycpdrule &&
1685 TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1686 (compoundbegin && !wordnum && !onlycpdrule &&
1687 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1688 (compoundmiddle && wordnum && !words && !onlycpdrule &&
1689 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
1690 (!defcpdtable.empty() && onlycpdrule &&
1691 ((!words && !wordnum &&
1692 defcpd_check(&words, wnum, rv, rwords, 0)) ||
1693 (words &&
1694 defcpd_check(&words, wnum, rv, rwords, 0))))) ||
1695 (scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL &&
1696 !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) {
1697 rv = rv->next_homonym;
1698 }
1699
1700 if (rv)
1701 affixed = 0;
1702
1703 if (!rv) {
1704 if (onlycpdrule)
1705 break;
1706 if (compoundflag &&
1707 !(rv = prefix_check(st.c_str(), i,
1708 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1709 compoundflag))) {
1710 if (((rv = suffix_check(
1711 st.c_str(), i, 0, NULL, FLAG_NULL, compoundflag,
1712 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1713 (compoundmoresuffixes &&
1714 (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
1715 !hu_mov_rule && sfx->getCont() &&
1716 ((compoundforbidflag &&
1717 TESTAFF(sfx->getCont(), compoundforbidflag,
1718 sfx->getContLen())) ||
1719 (compoundend &&
1720 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
1721 rv = NULL;
1722 }
1723 }
1724
1725 if (rv ||
1726 (((wordnum == 0) && compoundbegin &&
1727 ((rv = suffix_check(
1728 st.c_str(), i, 0, NULL, FLAG_NULL, compoundbegin,
1729 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1730 (compoundmoresuffixes &&
1731 (rv = suffix_check_twosfx(
1732 st.c_str(), i, 0, NULL,
1733 compoundbegin))) || // twofold suffixes + compound
1734 (rv = prefix_check(st.c_str(), i,
1735 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1736 compoundbegin)))) ||
1737 ((wordnum > 0) && compoundmiddle &&
1738 ((rv = suffix_check(
1739 st.c_str(), i, 0, NULL, FLAG_NULL, compoundmiddle,
1740 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1741 (compoundmoresuffixes &&
1742 (rv = suffix_check_twosfx(
1743 st.c_str(), i, 0, NULL,
1744 compoundmiddle))) || // twofold suffixes + compound
1745 (rv = prefix_check(st.c_str(), i,
1746 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
1747 compoundmiddle))))))
1748 checked_prefix = 1;
1749 // else check forbiddenwords and needaffix
1750 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1751 TESTAFF(rv->astr, needaffix, rv->alen) ||
1752 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1753 (is_sug && nosuggest &&
1754 TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1755 st[i] = ch;
1756 // continue;
1757 break;
1758 }
1759
1760 // check non_compound flag in suffix and prefix
1761 if ((rv) && !hu_mov_rule &&
1762 ((pfx && pfx->getCont() &&
1763 TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
1764 (sfx && sfx->getCont() &&
1765 TESTAFF(sfx->getCont(), compoundforbidflag,
1766 sfx->getContLen())))) {
1767 rv = NULL;
1768 }
1769
1770 // check compoundend flag in suffix and prefix
1771 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
1772 ((pfx && pfx->getCont() &&
1773 TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
1774 (sfx && sfx->getCont() &&
1775 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
1776 rv = NULL;
1777 }
1778
1779 // check compoundmiddle flag in suffix and prefix
1780 if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
1781 !hu_mov_rule &&
1782 ((pfx && pfx->getCont() &&
1783 TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
1784 (sfx && sfx->getCont() &&
1785 TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
1786 rv = NULL;
1787 }
1788
1789 // check forbiddenwords
1790 if ((rv) && (rv->astr) &&
1791 (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1792 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1793 (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1794 return NULL;
1795 }
1796
1797 // increment word number, if the second root has a compoundroot flag
1798 if ((rv) && compoundroot &&
1799 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1800 wordnum++;
1801 }
1802
1803 // first word is acceptable in compound words?
1804 if (((rv) &&
1805 (checked_prefix || (words && words[wnum]) ||
1806 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1807 ((oldwordnum == 0) && compoundbegin &&
1808 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1809 ((oldwordnum > 0) && compoundmiddle &&
1810 TESTAFF(rv->astr, compoundmiddle, rv->alen))
1811
1812 // LANG_hu section: spec. Hungarian rule
1813 || ((langnum == LANG_hu) && hu_mov_rule &&
1814 (TESTAFF(
1815 rv->astr, 'F',
1816 rv->alen) || // XXX hardwired Hungarian dictionary codes
1817 TESTAFF(rv->astr, 'G', rv->alen) ||
1818 TESTAFF(rv->astr, 'H', rv->alen)))
1819 // END of LANG_hu section
1820 ) &&
1821 (
1822 // test CHECKCOMPOUNDPATTERN conditions
1823 scpd == 0 || checkcpdtable[scpd - 1].cond == FLAG_NULL ||
1824 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)) &&
1825 !((checkcompoundtriple && scpd == 0 &&
1826 !words && // test triple letters
1827 (word[i - 1] == word[i]) &&
1828 (((i > 1) && (word[i - 1] == word[i - 2])) ||
1829 ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0'
1830 )) ||
1831 (checkcompoundcase && scpd == 0 && !words &&
1832 cpdcase_check(word.c_str(), i))))
1833 // LANG_hu section: spec. Hungarian rule
1834 || ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
1835 (rv = affix_check(st.c_str(), i)) &&
1836 (sfx && sfx->getCont() &&
1837 ( // XXX hardwired Hungarian dic. codes
1838 TESTAFF(sfx->getCont(), (unsigned short)'x',
1839 sfx->getContLen()) ||
1840 TESTAFF(
1841 sfx->getCont(), (unsigned short)'%',
1842 sfx->getContLen()))))) { // first word is ok condition
1843
1844 // LANG_hu section: spec. Hungarian rule
1845 if (langnum == LANG_hu) {
1846 // calculate syllable number of the word
1847 numsyllable += get_syllable(st.substr(0, i));
1848 // + 1 word, if syllable number of the prefix > 1 (hungarian
1849 // convention)
1850 if (pfx && (get_syllable(pfx->getKey()) > 1))
1851 wordnum++;
1852 }
1853 // END of LANG_hu section
1854
1855 // NEXT WORD(S)
1856 rv_first = rv;
1857 st[i] = ch;
1858
1859 do { // striple loop
1860
1861 // check simplifiedtriple
1862 if (simplifiedtriple) {
1863 if (striple) {
1864 checkedstriple = 1;
1865 i--; // check "fahrt" instead of "ahrt" in "Schiffahrt"
1866 } else if (i > 2 && word[i - 1] == word[i - 2])
1867 striple = 1;
1868 }
1869
1870 rv = lookup(st.c_str() + i); // perhaps without prefix
1871
1872 // search homonym with compound flag
1873 while ((rv) &&
1874 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
1875 !((compoundflag && !words &&
1876 TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1877 (compoundend && !words &&
1878 TESTAFF(rv->astr, compoundend, rv->alen)) ||
1879 (!defcpdtable.empty() && words &&
1880 defcpd_check(&words, wnum + 1, rv, NULL, 1))) ||
1881 (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL &&
1882 !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2,
1883 rv->alen)))) {
1884 rv = rv->next_homonym;
1885 }
1886
1887 // check FORCEUCASE
1888 if (rv && forceucase && (rv) &&
1889 (TESTAFF(rv->astr, forceucase, rv->alen)) &&
1890 !(info && *info & SPELL_ORIGCAP))
1891 rv = NULL;
1892
1893 if (rv && words && words[wnum + 1])
1894 return rv_first;
1895
1896 oldnumsyllable2 = numsyllable;
1897 oldwordnum2 = wordnum;
1898
1899 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary
1900 // code
1901 if ((rv) && (langnum == LANG_hu) &&
1902 (TESTAFF(rv->astr, 'I', rv->alen)) &&
1903 !(TESTAFF(rv->astr, 'J', rv->alen))) {
1904 numsyllable--;
1905 }
1906 // END of LANG_hu section
1907
1908 // increment word number, if the second root has a compoundroot flag
1909 if ((rv) && (compoundroot) &&
1910 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1911 wordnum++;
1912 }
1913
1914 // check forbiddenwords
1915 if ((rv) && (rv->astr) &&
1916 (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1917 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1918 (is_sug && nosuggest &&
1919 TESTAFF(rv->astr, nosuggest, rv->alen))))
1920 return NULL;
1921
1922 // second word is acceptable, as a root?
1923 // hungarian conventions: compounding is acceptable,
1924 // when compound forms consist of 2 words, or if more,
1925 // then the syllable number of root words must be 6, or lesser.
1926
1927 if ((rv) &&
1928 ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1929 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
1930 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
1931 ((cpdmaxsyllable != 0) &&
1932 (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
1933 cpdmaxsyllable))) &&
1934 (
1935 // test CHECKCOMPOUNDPATTERN
1936 checkcpdtable.empty() || scpd != 0 ||
1937 !cpdpat_check(word.c_str(), i, rv_first, rv, 0)) &&
1938 ((!checkcompounddup || (rv != rv_first)))
1939 // test CHECKCOMPOUNDPATTERN conditions
1940 &&
1941 (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
1942 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) {
1943 // forbid compound word, if it is a non-compound word with typical
1944 // fault
1945 if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
1946 cpdwordpair_check(word.c_str(), len))
1947 return NULL;
1948 return rv_first;
1949 }
1950
1951 numsyllable = oldnumsyllable2;
1952 wordnum = oldwordnum2;
1953
1954 // perhaps second word has prefix or/and suffix
1955 sfx = NULL;
1956 sfxflag = FLAG_NULL;
1957 rv = (compoundflag && !onlycpdrule)
1958 ? affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundflag,
1959 IN_CPD_END)
1960 : NULL;
1961 if (!rv && compoundend && !onlycpdrule) {
1962 sfx = NULL;
1963 pfx = NULL;
1964 rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundend,
1965 IN_CPD_END);
1966 }
1967
1968 if (!rv && !defcpdtable.empty() && words) {
1969 rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), 0, IN_CPD_END);
1970 if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1))
1971 return rv_first;
1972 rv = NULL;
1973 }
1974
1975 // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1976 if (rv &&
1977 !(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
1978 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen)))
1979 rv = NULL;
1980
1981 // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
1982 if (rv && !checkcpdtable.empty() && scpd == 0 &&
1983 cpdpat_check(word.c_str(), i, rv_first, rv, affixed))
1984 rv = NULL;
1985
1986 // check non_compound flag in suffix and prefix
1987 if ((rv) && ((pfx && pfx->getCont() &&
1988 TESTAFF(pfx->getCont(), compoundforbidflag,
1989 pfx->getContLen())) ||
1990 (sfx && sfx->getCont() &&
1991 TESTAFF(sfx->getCont(), compoundforbidflag,
1992 sfx->getContLen())))) {
1993 rv = NULL;
1994 }
1995
1996 // check FORCEUCASE
1997 if (rv && forceucase && (rv) &&
1998 (TESTAFF(rv->astr, forceucase, rv->alen)) &&
1999 !(info && *info & SPELL_ORIGCAP))
2000 rv = NULL;
2001
2002 // check forbiddenwords
2003 if ((rv) && (rv->astr) &&
2004 (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2005 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
2006 (is_sug && nosuggest &&
2007 TESTAFF(rv->astr, nosuggest, rv->alen))))
2008 return NULL;
2009
2010 // pfxappnd = prefix of word+i, or NULL
2011 // calculate syllable number of prefix.
2012 // hungarian convention: when syllable number of prefix is more,
2013 // than 1, the prefix+word counts as two words.
2014
2015 if (langnum == LANG_hu) {
2016 // calculate syllable number of the word
2017 numsyllable += get_syllable(word.c_str() + i);
2018
2019 // - affix syllable num.
2020 // XXX only second suffix (inflections, not derivations)
2021 if (sfxappnd) {
2022 std::string tmp(sfxappnd);
2023 reverseword(tmp);
2024 numsyllable -= short(get_syllable(tmp) + sfxextra);
2025 } else {
2026 numsyllable -= short(sfxextra);
2027 }
2028
2029 // + 1 word, if syllable number of the prefix > 1 (hungarian
2030 // convention)
2031 if (pfx && (get_syllable(pfx->getKey()) > 1))
2032 wordnum++;
2033
2034 // increment syllable num, if last word has a SYLLABLENUM flag
2035 // and the suffix is beginning `s'
2036
2037 if (!cpdsyllablenum.empty()) {
2038 switch (sfxflag) {
2039 case 'c': {
2040 numsyllable += 2;
2041 break;
2042 }
2043 case 'J': {
2044 numsyllable += 1;
2045 break;
2046 }
2047 case 'I': {
2048 if (rv && TESTAFF(rv->astr, 'J', rv->alen))
2049 numsyllable += 1;
2050 break;
2051 }
2052 }
2053 }
2054 }
2055
2056 // increment word number, if the second word has a compoundroot flag
2057 if ((rv) && (compoundroot) &&
2058 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2059 wordnum++;
2060 }
2061 // second word is acceptable, as a word with prefix or/and suffix?
2062 // hungarian conventions: compounding is acceptable,
2063 // when compound forms consist 2 word, otherwise
2064 // the syllable number of root words is 6, or lesser.
2065 if ((rv) &&
2066 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2067 ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
2068 ((!checkcompounddup || (rv != rv_first)))) {
2069 // forbid compound word, if it is a non-compound word with typical
2070 // fault
2071 if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
2072 cpdwordpair_check(word.c_str(), len))
2073 return NULL;
2074 return rv_first;
2075 }
2076
2077 numsyllable = oldnumsyllable2;
2078 wordnum = oldwordnum2;
2079
2080 // perhaps second word is a compound word (recursive call)
2081 if (wordnum + 2 < maxwordnum) {
2082 rv = compound_check(st.substr(i), wordnum + 1,
2083 numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
2084 is_sug, info);
2085
2086 if (rv && !checkcpdtable.empty() &&
2087 ((scpd == 0 &&
2088 cpdpat_check(word.c_str(), i, rv_first, rv, affixed)) ||
2089 (scpd != 0 &&
2090 !cpdpat_check(word.c_str(), i, rv_first, rv, affixed))))
2091 rv = NULL;
2092 } else {
2093 rv = NULL;
2094 }
2095 if (rv) {
2096 // forbid compound word, if it is a non-compound word with typical
2097 // fault, or a dictionary word pair
2098
2099 if (cpdwordpair_check(word.c_str(), len))
2100 return NULL;
2101
2102 if (checkcompoundrep || forbiddenword) {
2103
2104 if (checkcompoundrep && cpdrep_check(word.c_str(), len))
2105 return NULL;
2106
2107 // check first part
2108 if (strncmp(rv->word, word.c_str() + i, rv->blen) == 0) {
2109 char r = st[i + rv->blen];
2110 st[i + rv->blen] = '\0';
2111
2112 if ((checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) ||
2113 cpdwordpair_check(st.c_str(), i + rv->blen)) {
2114 st[ + i + rv->blen] = r;
2115 continue;
2116 }
2117
2118 if (forbiddenword) {
2119 struct hentry* rv2 = lookup(word.c_str());
2120 if (!rv2)
2121 rv2 = affix_check(word.c_str(), len);
2122 if (rv2 && rv2->astr &&
2123 TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
2124 (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) {
2125 return NULL;
2126 }
2127 }
2128 st[i + rv->blen] = r;
2129 }
2130 }
2131 return rv_first;
2132 }
2133 } while (striple && !checkedstriple); // end of striple loop
2134
2135 if (checkedstriple) {
2136 i++;
2137 checkedstriple = 0;
2138 striple = 0;
2139 }
2140
2141 } // first word is ok condition
2142
2143 if (soldi != 0) {
2144 i = soldi;
2145 soldi = 0;
2146 len = oldlen;
2147 cmin = oldcmin;
2148 cmax = oldcmax;
2149 }
2150 scpd++;
2151
2152 } while (!onlycpdrule && simplifiedcpd &&
2153 scpd <= checkcpdtable.size()); // end of simplifiedcpd loop
2154
2155 scpd = 0;
2156 wordnum = oldwordnum;
2157 numsyllable = oldnumsyllable;
2158
2159 if (soldi != 0) {
2160 i = soldi;
2161 st.assign(word); // XXX add more optim.
2162 soldi = 0;
2163 } else
2164 st[i] = ch;
2165
2166 } while (!defcpdtable.empty() && oldwordnum == 0 &&
2167 onlycpdrule++ < 1); // end of onlycpd loop
2168 }
2169
2170 return NULL;
2171 }
2172
// Morphological-analysis variant of the compound check.
//
// Walks over the split points of `word` between the limits computed by
// setcminmax(). For each split the first part is looked up (as a dictionary
// stem, then with affixes) and, if it may start or continue a compound, the
// remainder is tried as a stem, as an affixed form, and finally by a
// recursive call. Every accepted segmentation appends one record to `result`.
//
// Parameters:
//   word, len    - word to analyse; len is only handed to setcminmax(), the
//                  remainder after a split is measured with strlen()
//   wordnum      - number of compound members counted so far; a call with
//                  wordnum == 0 (re)starts the time limit
//   numsyllable  - running syllable count, maintained for LANG_hu
//   maxwordnum   - recursion is attempted only while wordnum + 2 < maxwordnum
//   wnum         - index of the current member in `words`
//   words,rwords - arrays passed to defcpd_check(); a non-NULL `words` makes
//                  the split loop start in "compound rule only" mode
//   hu_mov_rule  - special Hungarian rule (XXX); when set, several of the
//                  compound-flag filters on the first part are skipped
//   result       - output; analysis records are appended to it
//   partresult   - optional analysis of the preceding members, copied in
//                  front of the first part's analysis
//
// Returns 0 on every path; callers have to inspect `result`.
int AffixMgr::compound_check_morph(const char* word,
                                   int len,
                                   short wordnum,
                                   short numsyllable,
                                   short maxwordnum,
                                   short wnum,
                                   hentry** words,
                                   hentry** rwords,
                                   char hu_mov_rule,
                                   std::string& result,
                                   const std::string* partresult) {
  int i;
  short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  // set to 1 as soon as one analysis has been emitted; it is never reset, so
  // the recursive step below is skipped for all later split points as well
  int ok = 0;

  struct hentry* rv = NULL;
  struct hentry* rv_first;
  std::string st;  // working copy of word, cut at the split point with '\0'
  char ch;         // character overwritten by that '\0'

  int checked_prefix;
  std::string presult;  // analysis of everything up to the split point

  int cmin;
  int cmax;

  char affixed = 0;
  hentry** oldwords = words;

  // add a time limit to handle possible
  // combinatorial explosion of the overlapping words

  HUNSPELL_THREAD_LOCAL clock_t timelimit;

  // A call with wordnum == 0 records the start time. Other calls zero the
  // marker once TIMELIMIT clock ticks have passed; a zero marker makes the
  // loops below give up.
  // NOTE(review): presumably HUNSPELL_THREAD_LOCAL gives static storage, so
  // the value survives between calls - confirm against its definition.
  if (wordnum == 0)
    timelimit = clock();
  else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
    timelimit = 0;
  }

  // range of admissible split positions
  setcminmax(&cmin, &cmax, word, len);

  st.assign(word);

  for (i = cmin; i < cmax; i++) {
    // go to end of the UTF-8 character
    // (bytes of the form 10xxxxxx are continuation bytes)
    if (utf8) {
      for (; (st[i] & 0xc0) == 0x80; i++)
        ;
      if (i >= cmax)
        return 0;
    }

    words = oldwords;
    // 0: ordinary compound flags, 1: compound rules (defcpdtable) only
    int onlycpdrule = (words) ? 1 : 0;

    do {  // onlycpdrule loop

      if (timelimit == 0)
        return 0;

      // saved so that both counters can be restored at the end of the pass
      oldnumsyllable = numsyllable;
      oldwordnum = wordnum;
      checked_prefix = 0;

      // cut st at the split point: st.c_str() is now the first part
      ch = st[i];
      st[i] = '\0';
      sfx = NULL;

      // FIRST WORD

      affixed = 1;

      presult.clear();
      if (partresult)
        presult.append(*partresult);

      rv = lookup(st.c_str());  // perhaps without prefix

      // forbid dictionary stems with COMPOUNDFORBIDFLAG in
      // compound words, overriding the effect of COMPOUNDPERMITFLAG
      // NOTE(review): `continue` inside a do-while jumps to the loop
      // condition; st[i] is not restored on this path, unlike the paths
      // below that execute `st[i] = ch` first - confirm this is intended.
      if ((rv) && compoundforbidflag &&
          TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
        continue;

      // search homonym with compound flag
      // (skip entries that need an affix, or that carry none of the flags /
      // rule matches which would admit them at this position)
      while ((rv) && !hu_mov_rule &&
             ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
              !((compoundflag && !words && !onlycpdrule &&
                 TESTAFF(rv->astr, compoundflag, rv->alen)) ||
                (compoundbegin && !wordnum && !onlycpdrule &&
                 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
                (compoundmiddle && wordnum && !words && !onlycpdrule &&
                 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
                (!defcpdtable.empty() && onlycpdrule &&
                 ((!words && !wordnum &&
                   defcpd_check(&words, wnum, rv, rwords, 0)) ||
                  (words &&
                   defcpd_check(&words, wnum, rv, rwords, 0))))))) {
        rv = rv->next_homonym;
      }

      if (timelimit == 0)
        return 0;

      // a plain dictionary hit means the first part carries no affix
      if (rv)
        affixed = 0;

      // describe the dictionary hit: the part itself, a stem field unless the
      // entry already provides one, then the entry's own morphological data
      if (rv) {
        presult.push_back(MSEP_FLD);
        presult.append(MORPH_PART);
        presult.append(st.c_str());
        if (!HENTRY_FIND(rv, MORPH_STEM)) {
          presult.push_back(MSEP_FLD);
          presult.append(MORPH_STEM);
          presult.append(st.c_str());
        }
        if (HENTRY_DATA(rv)) {
          presult.push_back(MSEP_FLD);
          presult.append(HENTRY_DATA2(rv));
        }
      }

      if (!rv) {
        // no usable stem: try the first part as an affixed form whose stem
        // has compoundflag; a suffixed match is dropped again when the
        // suffix's continuation flags contain compoundforbidflag or
        // compoundend (unless hu_mov_rule is set)
        // NOTE(review): `sfx` is dereferenced after a possible
        // suffix_check_twosfx() hit; that function, as visible in this file,
        // does not assign `sfx` itself - verify it cannot be NULL here.
        if (compoundflag &&
            !(rv =
                  prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
                               compoundflag))) {
          if (((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
                                  compoundflag,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
               (compoundmoresuffixes &&
                (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
              !hu_mov_rule && sfx->getCont() &&
              ((compoundforbidflag &&
                TESTAFF(sfx->getCont(), compoundforbidflag,
                        sfx->getContLen())) ||
               (compoundend &&
                TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
            rv = NULL;
          }
        }

        // otherwise retry with the position-specific flag: compoundbegin for
        // the first member (wordnum == 0), compoundmiddle for later ones
        if (rv ||
            (((wordnum == 0) && compoundbegin &&
              ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
                                  compoundbegin,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
               (compoundmoresuffixes &&
                (rv = suffix_check_twosfx(
                     st.c_str(), i, 0, NULL,
                     compoundbegin))) ||  // twofold suffix+compound
               (rv = prefix_check(st.c_str(), i,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
                                  compoundbegin)))) ||
             ((wordnum > 0) && compoundmiddle &&
              ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
                                  compoundmiddle,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
               (compoundmoresuffixes &&
                (rv = suffix_check_twosfx(
                     st.c_str(), i, 0, NULL,
                     compoundmiddle))) ||  // twofold suffix+compound
               (rv = prefix_check(st.c_str(), i,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
                                  compoundmiddle)))))) {
          // fetch the textual analysis of the affixed first part, trying
          // compoundflag first and the position-specific flag as fallback
          std::string p;
          if (compoundflag)
            p = affix_check_morph(st.c_str(), i, compoundflag);
          if (p.empty()) {
            if ((wordnum == 0) && compoundbegin) {
              p = affix_check_morph(st.c_str(), i, compoundbegin);
            } else if ((wordnum > 0) && compoundmiddle) {
              p = affix_check_morph(st.c_str(), i, compoundmiddle);
            }
          }
          if (!p.empty()) {
            presult.push_back(MSEP_FLD);
            presult.append(MORPH_PART);
            presult.append(st.c_str());
            line_uniq_app(p, MSEP_REC);
            presult.append(p);
          }
          checked_prefix = 1;
        }
        // else check forbiddenwords
      } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
                              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
                              TESTAFF(rv->astr, needaffix, rv->alen))) {
        st[i] = ch;
        continue;
      }

      // check non_compound flag in suffix and prefix
      // NOTE(review): st[i] is not restored before this `continue` or the
      // two `continue` statements that follow.
      if ((rv) && !hu_mov_rule &&
          ((pfx && pfx->getCont() &&
            TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
           (sfx && sfx->getCont() &&
            TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) {
        continue;
      }

      // check compoundend flag in suffix and prefix
      if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
          ((pfx && pfx->getCont() &&
            TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
           (sfx && sfx->getCont() &&
            TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
        continue;
      }

      // check compoundmiddle flag in suffix and prefix
      if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
          !hu_mov_rule &&
          ((pfx && pfx->getCont() &&
            TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
           (sfx && sfx->getCont() &&
            TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
        rv = NULL;
      }

      // check forbiddenwords
      if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
                                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)))
        continue;

      // increment word number, if the second root has a compoundroot flag
      if ((rv) && (compoundroot) &&
          (TESTAFF(rv->astr, compoundroot, rv->alen))) {
        wordnum++;
      }

      // first word is acceptable in compound words?
      // Accepted when a candidate exists that was admitted above (affixed
      // form, compound-rule match or a matching compound flag) and none of
      // the triple-letter, CHECKCOMPOUNDPATTERN or case checks objects; or,
      // for LANG_hu with hu_mov_rule, when an affixed form is found whose
      // suffix continuation flags contain 'x' or '%'.
      if (((rv) &&
           (checked_prefix || (words && words[wnum]) ||
            (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
            ((oldwordnum == 0) && compoundbegin &&
             TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
            ((oldwordnum > 0) && compoundmiddle &&
             TESTAFF(rv->astr, compoundmiddle, rv->alen))
            // LANG_hu section: spec. Hungarian rule
            || ((langnum == LANG_hu) &&  // hu_mov_rule
                hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) ||
                                TESTAFF(rv->astr, 'G', rv->alen) ||
                                TESTAFF(rv->astr, 'H', rv->alen)))
            // END of LANG_hu section
            ) &&
           !((checkcompoundtriple && !words &&  // test triple letters
              (word[i - 1] == word[i]) &&
              (((i > 1) && (word[i - 1] == word[i - 2])) ||
               ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
               )) ||
             (
                 // test CHECKCOMPOUNDPATTERN
                 !checkcpdtable.empty() && !words &&
                 cpdpat_check(word, i, rv, NULL, affixed)) ||
             (checkcompoundcase && !words && cpdcase_check(word, i))))
          // LANG_hu section: spec. Hungarian rule
          ||
          ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
           (rv = affix_check(st.c_str(), i)) &&
           (sfx && sfx->getCont() &&
            (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) ||
             TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen()))))
          // END of LANG_hu section
          ) {
        // LANG_hu section: spec. Hungarian rule
        if (langnum == LANG_hu) {
          // calculate syllable number of the word
          numsyllable += get_syllable(st.substr(0, i));

          // + 1 word, if syllable number of the prefix > 1 (hungarian
          // convention)
          if (pfx && (get_syllable(pfx->getKey()) > 1))
            wordnum++;
        }
        // END of LANG_hu section

        // NEXT WORD(S)
        // keep the first member; rv is reused for the rest of the word
        rv_first = rv;
        rv = lookup((word + i));  // perhaps without prefix

        // search homonym with compound flag
        while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
                        !((compoundflag && !words &&
                           TESTAFF(rv->astr, compoundflag, rv->alen)) ||
                          (compoundend && !words &&
                           TESTAFF(rv->astr, compoundend, rv->alen)) ||
                          (!defcpdtable.empty() && words &&
                           defcpd_check(&words, wnum + 1, rv, NULL, 1))))) {
          rv = rv->next_homonym;
        }

        // compound-rule mode: the remainder matched the rule, so emit the
        // record and stop
        // NOTE(review): this return leaves st[i] cut, which is harmless only
        // because st is a local copy.
        if (rv && words && words[wnum + 1]) {
          result.append(presult);
          result.push_back(MSEP_FLD);
          result.append(MORPH_PART);
          result.append(word + i);
          if (complexprefixes && HENTRY_DATA(rv))
            result.append(HENTRY_DATA2(rv));
          if (!HENTRY_FIND(rv, MORPH_STEM)) {
            result.push_back(MSEP_FLD);
            result.append(MORPH_STEM);
            result.append(HENTRY_WORD(rv));
          }
          // store the pointer of the hash entry
          if (!complexprefixes && HENTRY_DATA(rv)) {
            result.push_back(MSEP_FLD);
            result.append(HENTRY_DATA2(rv));
          }
          result.push_back(MSEP_REC);
          return 0;
        }

        // counters as they stand after the first part; restored before each
        // further attempt on the remainder
        oldnumsyllable2 = numsyllable;
        oldwordnum2 = wordnum;

        // LANG_hu section: spec. Hungarian rule
        if ((rv) && (langnum == LANG_hu) &&
            (TESTAFF(rv->astr, 'I', rv->alen)) &&
            !(TESTAFF(rv->astr, 'J', rv->alen))) {
          numsyllable--;
        }
        // END of LANG_hu section
        // increment word number, if the second root has a compoundroot flag
        if ((rv) && (compoundroot) &&
            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
          wordnum++;
        }

        // check forbiddenwords
        // NOTE(review): wordnum/numsyllable may already have been changed
        // above and are not restored before this `continue`.
        if ((rv) && (rv->astr) &&
            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
          st[i] = ch;
          continue;
        }

        // second word is acceptable, as a root?
        // hungarian conventions: compounding is acceptable,
        // when compound forms consist of 2 words, or if more,
        // then the syllable number of root words must be 6, or lesser.
        if ((rv) &&
            ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
             (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
            (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
             ((cpdmaxsyllable != 0) &&
              (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
               cpdmaxsyllable))) &&
            ((!checkcompounddup || (rv != rv_first)))) {
          // the remainder is accepted as a bare stem: emit the record
          result.append(presult);
          result.push_back(MSEP_FLD);
          result.append(MORPH_PART);
          result.append(word + i);

          if (HENTRY_DATA(rv)) {
            if (complexprefixes)
              result.append(HENTRY_DATA2(rv));
            if (!HENTRY_FIND(rv, MORPH_STEM)) {
              result.push_back(MSEP_FLD);
              result.append(MORPH_STEM);
              result.append(HENTRY_WORD(rv));
            }
            // store the pointer of the hash entry
            if (!complexprefixes) {
              result.push_back(MSEP_FLD);
              result.append(HENTRY_DATA2(rv));
            }
          }
          result.push_back(MSEP_REC);
          ok = 1;
        }

        numsyllable = oldnumsyllable2;
        wordnum = oldwordnum2;

        // perhaps second word has prefix or/and suffix
        sfx = NULL;
        sfxflag = FLAG_NULL;

        if (compoundflag && !onlycpdrule)
          rv = affix_check((word + i), strlen(word + i), compoundflag);
        else
          rv = NULL;

        if (!rv && compoundend && !onlycpdrule) {
          sfx = NULL;
          pfx = NULL;
          rv = affix_check((word + i), strlen(word + i), compoundend);
        }

        // compound-rule mode: accept an affixed remainder that satisfies the
        // rule table
        if (!rv && !defcpdtable.empty() && words) {
          rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END);
          if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
            std::string m;
            if (compoundflag)
              m = affix_check_morph((word + i), strlen(word + i), compoundflag);
            if (m.empty() && compoundend) {
              m = affix_check_morph((word + i), strlen(word + i), compoundend);
            }
            result.append(presult);
            if (!m.empty()) {
              result.push_back(MSEP_FLD);
              result.append(MORPH_PART);
              result.append(word + i);
              line_uniq_app(m, MSEP_REC);
              result.append(m);
            }
            result.push_back(MSEP_REC);
            ok = 1;
          }
        }

        // check non_compound flag in suffix and prefix
        if ((rv) &&
            ((pfx && pfx->getCont() &&
              TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
             (sfx && sfx->getCont() &&
              TESTAFF(sfx->getCont(), compoundforbidflag,
                      sfx->getContLen())))) {
          rv = NULL;
        }

        // check forbiddenwords
        if ((rv) && (rv->astr) &&
            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) &&
            (!TESTAFF(rv->astr, needaffix, rv->alen))) {
          st[i] = ch;
          continue;
        }

        if (langnum == LANG_hu) {
          // calculate syllable number of the word
          numsyllable += get_syllable(word + i);

          // - affix syllable num.
          // XXX only second suffix (inflections, not derivations)
          // (sfxappnd / sfxextra are members written by suffix_check())
          if (sfxappnd) {
            std::string tmp(sfxappnd);
            reverseword(tmp);
            numsyllable -= short(get_syllable(tmp) + sfxextra);
          } else {
            numsyllable -= short(sfxextra);
          }

          // + 1 word, if syllable number of the prefix > 1 (hungarian
          // convention)
          if (pfx && (get_syllable(pfx->getKey()) > 1))
            wordnum++;

          // increment syllable num, if last word has a SYLLABLENUM flag
          // and the suffix is beginning `s'

          if (!cpdsyllablenum.empty()) {
            switch (sfxflag) {
              case 'c': {
                numsyllable += 2;
                break;
              }
              case 'J': {
                numsyllable += 1;
                break;
              }
              case 'I': {
                if (rv && TESTAFF(rv->astr, 'J', rv->alen))
                  numsyllable += 1;
                break;
              }
            }
          }
        }

        // increment word number, if the second word has a compoundroot flag
        if ((rv) && (compoundroot) &&
            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
          wordnum++;
        }
        // second word is acceptable, as a word with prefix or/and suffix?
        // hungarian conventions: compounding is acceptable,
        // when compound forms consist 2 word, otherwise
        // the syllable number of root words is 6, or lesser.
        if ((rv) &&
            (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
             ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
            ((!checkcompounddup || (rv != rv_first)))) {
          std::string m;
          if (compoundflag)
            m = affix_check_morph((word + i), strlen(word + i), compoundflag);
          if (m.empty() && compoundend) {
            m = affix_check_morph((word + i), strlen(word + i), compoundend);
          }
          result.append(presult);
          if (!m.empty()) {
            result.push_back(MSEP_FLD);
            result.append(MORPH_PART);
            result.append(word + i);
            line_uniq_app(m, MSEP_REC);
            // NOTE(review): unlike the two similar blocks above, a field
            // separator is inserted before `m` here - confirm which form
            // is intended.
            result.push_back(MSEP_FLD);
            result.append(m);
          }
          result.push_back(MSEP_REC);
          ok = 1;
        }

        numsyllable = oldnumsyllable2;
        wordnum = oldwordnum2;

        // perhaps second word is a compound word (recursive call)
        // The return value is ignored; the nested call appends its records
        // to `result` and receives presult as the analysis so far.
        if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
          compound_check_morph((word + i), strlen(word + i), wordnum + 1,
                               numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
                               result, &presult);
        } else {
          rv = NULL;
        }
      }
      // undo the cut and the counter changes made during this pass
      st[i] = ch;
      wordnum = oldwordnum;
      numsyllable = oldnumsyllable;

      // a second pass (compound rules only) runs when a rule table exists,
      // this pass started with wordnum == 0 and the first pass ran with
      // onlycpdrule == 0
    } while (!defcpdtable.empty() && oldwordnum == 0 &&
             onlycpdrule++ < 1);  // end of onlycpd loop
  }
  return 0;
}
2702
2703
isRevSubset(const char * s1,const char * end_of_s2,int len)2704 inline int AffixMgr::isRevSubset(const char* s1,
2705 const char* end_of_s2,
2706 int len) {
2707 while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
2708 s1++;
2709 end_of_s2--;
2710 len--;
2711 }
2712 return (*s1 == '\0');
2713 }
2714
2715 // check word for suffixes
suffix_check(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,char in_compound)2716 struct hentry* AffixMgr::suffix_check(const char* word,
2717 int len,
2718 int sfxopts,
2719 PfxEntry* ppfx,
2720 const FLAG cclass,
2721 const FLAG needflag,
2722 char in_compound) {
2723 struct hentry* rv = NULL;
2724 PfxEntry* ep = ppfx;
2725
2726 // first handle the special case of 0 length suffixes
2727 SfxEntry* se = sStart[0];
2728
2729 while (se) {
2730 if (!cclass || se->getCont()) {
2731 // suffixes are not allowed in beginning of compounds
2732 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2733 // except when signed with compoundpermitflag flag
2734 (se->getCont() && compoundpermitflag &&
2735 TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
2736 (!circumfix ||
2737 // no circumfix flag in prefix and suffix
2738 ((!ppfx || !(ep->getCont()) ||
2739 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2740 (!se->getCont() ||
2741 !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
2742 // circumfix flag in prefix AND suffix
2743 ((ppfx && (ep->getCont()) &&
2744 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2745 (se->getCont() &&
2746 (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
2747 // fogemorpheme
2748 (in_compound ||
2749 !(se->getCont() &&
2750 (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
2751 // needaffix on prefix or first suffix
2752 (cclass ||
2753 !(se->getCont() &&
2754 TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2755 (ppfx &&
2756 !((ep->getCont()) &&
2757 TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) {
2758 rv = se->checkword(word, len, sfxopts, ppfx,
2759 (FLAG)cclass, needflag,
2760 (in_compound ? 0 : onlyincompound));
2761 if (rv) {
2762 sfx = se; // BUG: sfx not stateless
2763 return rv;
2764 }
2765 }
2766 }
2767 se = se->getNext();
2768 }
2769
2770 // now handle the general case
2771 if (len == 0)
2772 return NULL; // FULLSTRIP
2773 unsigned char sp = *((const unsigned char*)(word + len - 1));
2774 SfxEntry* sptr = sStart[sp];
2775
2776 while (sptr) {
2777 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2778 // suffixes are not allowed in beginning of compounds
2779 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2780 // except when signed with compoundpermitflag flag
2781 (sptr->getCont() && compoundpermitflag &&
2782 TESTAFF(sptr->getCont(), compoundpermitflag,
2783 sptr->getContLen()))) &&
2784 (!circumfix ||
2785 // no circumfix flag in prefix and suffix
2786 ((!ppfx || !(ep->getCont()) ||
2787 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2788 (!sptr->getCont() ||
2789 !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
2790 // circumfix flag in prefix AND suffix
2791 ((ppfx && (ep->getCont()) &&
2792 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2793 (sptr->getCont() &&
2794 (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
2795 // fogemorpheme
2796 (in_compound ||
2797 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
2798 sptr->getContLen()))))) &&
2799 // needaffix on prefix or first suffix
2800 (cclass ||
2801 !(sptr->getCont() &&
2802 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
2803 (ppfx &&
2804 !((ep->getCont()) &&
2805 TESTAFF(ep->getCont(), needaffix, ep->getContLen())))))
2806 if (in_compound != IN_CPD_END || ppfx ||
2807 !(sptr->getCont() &&
2808 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
2809 rv = sptr->checkword(word, len, sfxopts, ppfx,
2810 cclass, needflag,
2811 (in_compound ? 0 : onlyincompound));
2812 if (rv) {
2813 sfx = sptr; // BUG: sfx not stateless
2814 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2815 if (!sptr->getCont())
2816 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless
2817 // LANG_hu section: spec. Hungarian rule
2818 else if (langnum == LANG_hu && sptr->getKeyLen() &&
2819 sptr->getKey()[0] == 'i' && sptr->getKey()[1] != 'y' &&
2820 sptr->getKey()[1] != 't') {
2821 sfxextra = 1;
2822 }
2823 // END of LANG_hu section
2824 return rv;
2825 }
2826 }
2827 sptr = sptr->getNextEQ();
2828 } else {
2829 sptr = sptr->getNextNE();
2830 }
2831 }
2832
2833 return NULL;
2834 }
2835
2836 // check word for two-level suffixes
suffix_check_twosfx(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2837 struct hentry* AffixMgr::suffix_check_twosfx(const char* word,
2838 int len,
2839 int sfxopts,
2840 PfxEntry* ppfx,
2841 const FLAG needflag) {
2842 struct hentry* rv = NULL;
2843
2844 // first handle the special case of 0 length suffixes
2845 SfxEntry* se = sStart[0];
2846 while (se) {
2847 if (contclasses[se->getFlag()]) {
2848 rv = se->check_twosfx(word, len, sfxopts, ppfx, needflag);
2849 if (rv)
2850 return rv;
2851 }
2852 se = se->getNext();
2853 }
2854
2855 // now handle the general case
2856 if (len == 0)
2857 return NULL; // FULLSTRIP
2858 unsigned char sp = *((const unsigned char*)(word + len - 1));
2859 SfxEntry* sptr = sStart[sp];
2860
2861 while (sptr) {
2862 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2863 if (contclasses[sptr->getFlag()]) {
2864 rv = sptr->check_twosfx(word, len, sfxopts, ppfx, needflag);
2865 if (rv) {
2866 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2867 if (!sptr->getCont())
2868 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless
2869 return rv;
2870 }
2871 }
2872 sptr = sptr->getNextEQ();
2873 } else {
2874 sptr = sptr->getNextNE();
2875 }
2876 }
2877
2878 return NULL;
2879 }
2880
2881 // check word for two-level suffixes and morph
suffix_check_twosfx_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2882 std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
2883 int len,
2884 int sfxopts,
2885 PfxEntry* ppfx,
2886 const FLAG needflag) {
2887 std::string result;
2888 std::string result2;
2889 std::string result3;
2890
2891 // first handle the special case of 0 length suffixes
2892 SfxEntry* se = sStart[0];
2893 while (se) {
2894 if (contclasses[se->getFlag()]) {
2895 std::string st = se->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
2896 if (!st.empty()) {
2897 if (ppfx) {
2898 if (ppfx->getMorph()) {
2899 result.append(ppfx->getMorph());
2900 result.push_back(MSEP_FLD);
2901 } else
2902 debugflag(result, ppfx->getFlag());
2903 }
2904 result.append(st);
2905 if (se->getMorph()) {
2906 result.push_back(MSEP_FLD);
2907 result.append(se->getMorph());
2908 } else
2909 debugflag(result, se->getFlag());
2910 result.push_back(MSEP_REC);
2911 }
2912 }
2913 se = se->getNext();
2914 }
2915
2916 // now handle the general case
2917 if (len == 0)
2918 return std::string(); // FULLSTRIP
2919 unsigned char sp = *((const unsigned char*)(word + len - 1));
2920 SfxEntry* sptr = sStart[sp];
2921
2922 while (sptr) {
2923 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2924 if (contclasses[sptr->getFlag()]) {
2925 std::string st = sptr->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
2926 if (!st.empty()) {
2927 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2928 if (!sptr->getCont())
2929 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless
2930 result2.assign(st);
2931
2932 result3.clear();
2933
2934 if (sptr->getMorph()) {
2935 result3.push_back(MSEP_FLD);
2936 result3.append(sptr->getMorph());
2937 } else
2938 debugflag(result3, sptr->getFlag());
2939 strlinecat(result2, result3);
2940 result2.push_back(MSEP_REC);
2941 result.append(result2);
2942 }
2943 }
2944 sptr = sptr->getNextEQ();
2945 } else {
2946 sptr = sptr->getNextNE();
2947 }
2948 }
2949
2950 return result;
2951 }
2952
suffix_check_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,char in_compound)2953 std::string AffixMgr::suffix_check_morph(const char* word,
2954 int len,
2955 int sfxopts,
2956 PfxEntry* ppfx,
2957 const FLAG cclass,
2958 const FLAG needflag,
2959 char in_compound) {
2960 std::string result;
2961
2962 struct hentry* rv = NULL;
2963
2964 PfxEntry* ep = ppfx;
2965
2966 // first handle the special case of 0 length suffixes
2967 SfxEntry* se = sStart[0];
2968 while (se) {
2969 if (!cclass || se->getCont()) {
2970 // suffixes are not allowed in beginning of compounds
2971 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2972 // except when signed with compoundpermitflag flag
2973 (se->getCont() && compoundpermitflag &&
2974 TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
2975 (!circumfix ||
2976 // no circumfix flag in prefix and suffix
2977 ((!ppfx || !(ep->getCont()) ||
2978 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2979 (!se->getCont() ||
2980 !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
2981 // circumfix flag in prefix AND suffix
2982 ((ppfx && (ep->getCont()) &&
2983 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2984 (se->getCont() &&
2985 (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
2986 // fogemorpheme
2987 (in_compound ||
2988 !((se->getCont() &&
2989 (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
2990 // needaffix on prefix or first suffix
2991 (cclass ||
2992 !(se->getCont() &&
2993 TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2994 (ppfx &&
2995 !((ep->getCont()) &&
2996 TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))))
2997 rv = se->checkword(word, len, sfxopts, ppfx, cclass,
2998 needflag, FLAG_NULL);
2999 while (rv) {
3000 if (ppfx) {
3001 if (ppfx->getMorph()) {
3002 result.append(ppfx->getMorph());
3003 result.push_back(MSEP_FLD);
3004 } else
3005 debugflag(result, ppfx->getFlag());
3006 }
3007 if (complexprefixes && HENTRY_DATA(rv))
3008 result.append(HENTRY_DATA2(rv));
3009 if (!HENTRY_FIND(rv, MORPH_STEM)) {
3010 result.push_back(MSEP_FLD);
3011 result.append(MORPH_STEM);
3012 result.append(HENTRY_WORD(rv));
3013 }
3014
3015 if (!complexprefixes && HENTRY_DATA(rv)) {
3016 result.push_back(MSEP_FLD);
3017 result.append(HENTRY_DATA2(rv));
3018 }
3019 if (se->getMorph()) {
3020 result.push_back(MSEP_FLD);
3021 result.append(se->getMorph());
3022 } else
3023 debugflag(result, se->getFlag());
3024 result.push_back(MSEP_REC);
3025 rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
3026 }
3027 }
3028 se = se->getNext();
3029 }
3030
3031 // now handle the general case
3032 if (len == 0)
3033 return std::string(); // FULLSTRIP
3034 unsigned char sp = *((const unsigned char*)(word + len - 1));
3035 SfxEntry* sptr = sStart[sp];
3036
3037 while (sptr) {
3038 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
3039 // suffixes are not allowed in beginning of compounds
3040 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
3041 // except when signed with compoundpermitflag flag
3042 (sptr->getCont() && compoundpermitflag &&
3043 TESTAFF(sptr->getCont(), compoundpermitflag,
3044 sptr->getContLen()))) &&
3045 (!circumfix ||
3046 // no circumfix flag in prefix and suffix
3047 ((!ppfx || !(ep->getCont()) ||
3048 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
3049 (!sptr->getCont() ||
3050 !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
3051 // circumfix flag in prefix AND suffix
3052 ((ppfx && (ep->getCont()) &&
3053 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
3054 (sptr->getCont() &&
3055 (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
3056 // fogemorpheme
3057 (in_compound ||
3058 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
3059 sptr->getContLen()))))) &&
3060 // needaffix on first suffix
3061 (cclass ||
3062 !(sptr->getCont() &&
3063 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))))
3064 rv = sptr->checkword(word, len, sfxopts, ppfx, cclass,
3065 needflag, FLAG_NULL);
3066 while (rv) {
3067 if (ppfx) {
3068 if (ppfx->getMorph()) {
3069 result.append(ppfx->getMorph());
3070 result.push_back(MSEP_FLD);
3071 } else
3072 debugflag(result, ppfx->getFlag());
3073 }
3074 if (complexprefixes && HENTRY_DATA(rv))
3075 result.append(HENTRY_DATA2(rv));
3076 if (!HENTRY_FIND(rv, MORPH_STEM)) {
3077 result.push_back(MSEP_FLD);
3078 result.append(MORPH_STEM);
3079 result.append(HENTRY_WORD(rv));
3080 }
3081
3082 if (!complexprefixes && HENTRY_DATA(rv)) {
3083 result.push_back(MSEP_FLD);
3084 result.append(HENTRY_DATA2(rv));
3085 }
3086
3087 if (sptr->getMorph()) {
3088 result.push_back(MSEP_FLD);
3089 result.append(sptr->getMorph());
3090 } else
3091 debugflag(result, sptr->getFlag());
3092 result.push_back(MSEP_REC);
3093 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
3094 }
3095 sptr = sptr->getNextEQ();
3096 } else {
3097 sptr = sptr->getNextNE();
3098 }
3099 }
3100
3101 return result;
3102 }
3103
3104 // check if word with affixes is correctly spelled
affix_check(const char * word,int len,const FLAG needflag,char in_compound)3105 struct hentry* AffixMgr::affix_check(const char* word,
3106 int len,
3107 const FLAG needflag,
3108 char in_compound) {
3109
3110 // check all prefixes (also crossed with suffixes if allowed)
3111 struct hentry* rv = prefix_check(word, len, in_compound, needflag);
3112 if (rv)
3113 return rv;
3114
3115 // if still not found check all suffixes
3116 rv = suffix_check(word, len, 0, NULL, FLAG_NULL, needflag, in_compound);
3117
3118 if (havecontclass) {
3119 sfx = NULL;
3120 pfx = NULL;
3121
3122 if (rv)
3123 return rv;
3124 // if still not found check all two-level suffixes
3125 rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
3126
3127 if (rv)
3128 return rv;
3129 // if still not found check all two-level suffixes
3130 rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
3131 }
3132
3133 return rv;
3134 }
3135
3136 // check if word with affixes is correctly spelled
affix_check_morph(const char * word,int len,const FLAG needflag,char in_compound)3137 std::string AffixMgr::affix_check_morph(const char* word,
3138 int len,
3139 const FLAG needflag,
3140 char in_compound) {
3141 std::string result;
3142
3143 // check all prefixes (also crossed with suffixes if allowed)
3144 std::string st = prefix_check_morph(word, len, in_compound);
3145 if (!st.empty()) {
3146 result.append(st);
3147 }
3148
3149 // if still not found check all suffixes
3150 st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
3151 if (!st.empty()) {
3152 result.append(st);
3153 }
3154
3155 if (havecontclass) {
3156 sfx = NULL;
3157 pfx = NULL;
3158 // if still not found check all two-level suffixes
3159 st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
3160 if (!st.empty()) {
3161 result.append(st);
3162 }
3163
3164 // if still not found check all two-level suffixes
3165 st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
3166 if (!st.empty()) {
3167 result.append(st);
3168 }
3169 }
3170
3171 return result;
3172 }
3173
3174 // morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields
3175 // in the first line of the inputs
3176 // return 0, if inputs equal
3177 // return 1, if inputs may equal with a secondary suffix
3178 // otherwise return -1
morphcmp(const char * s,const char * t)3179 static int morphcmp(const char* s, const char* t) {
3180 int se = 0;
3181 int te = 0;
3182 const char* sl;
3183 const char* tl;
3184 const char* olds;
3185 const char* oldt;
3186 if (!s || !t)
3187 return 1;
3188 olds = s;
3189 sl = strchr(s, '\n');
3190 s = strstr(s, MORPH_DERI_SFX);
3191 if (!s || (sl && sl < s))
3192 s = strstr(olds, MORPH_INFL_SFX);
3193 if (!s || (sl && sl < s)) {
3194 s = strstr(olds, MORPH_TERM_SFX);
3195 olds = NULL;
3196 }
3197 oldt = t;
3198 tl = strchr(t, '\n');
3199 t = strstr(t, MORPH_DERI_SFX);
3200 if (!t || (tl && tl < t))
3201 t = strstr(oldt, MORPH_INFL_SFX);
3202 if (!t || (tl && tl < t)) {
3203 t = strstr(oldt, MORPH_TERM_SFX);
3204 oldt = NULL;
3205 }
3206 while (s && t && (!sl || sl > s) && (!tl || tl > t)) {
3207 s += MORPH_TAG_LEN;
3208 t += MORPH_TAG_LEN;
3209 se = 0;
3210 te = 0;
3211 while ((*s == *t) && !se && !te) {
3212 s++;
3213 t++;
3214 switch (*s) {
3215 case ' ':
3216 case '\n':
3217 case '\t':
3218 case '\0':
3219 se = 1;
3220 }
3221 switch (*t) {
3222 case ' ':
3223 case '\n':
3224 case '\t':
3225 case '\0':
3226 te = 1;
3227 }
3228 }
3229 if (!se || !te) {
3230 // not terminal suffix difference
3231 if (olds)
3232 return -1;
3233 return 1;
3234 }
3235 olds = s;
3236 s = strstr(s, MORPH_DERI_SFX);
3237 if (!s || (sl && sl < s))
3238 s = strstr(olds, MORPH_INFL_SFX);
3239 if (!s || (sl && sl < s)) {
3240 s = strstr(olds, MORPH_TERM_SFX);
3241 olds = NULL;
3242 }
3243 oldt = t;
3244 t = strstr(t, MORPH_DERI_SFX);
3245 if (!t || (tl && tl < t))
3246 t = strstr(oldt, MORPH_INFL_SFX);
3247 if (!t || (tl && tl < t)) {
3248 t = strstr(oldt, MORPH_TERM_SFX);
3249 oldt = NULL;
3250 }
3251 }
3252 if (!s && !t && se && te)
3253 return 0;
3254 return 1;
3255 }
3256
morphgen(const char * ts,int wl,const unsigned short * ap,unsigned short al,const char * morph,const char * targetmorph,int level)3257 std::string AffixMgr::morphgen(const char* ts,
3258 int wl,
3259 const unsigned short* ap,
3260 unsigned short al,
3261 const char* morph,
3262 const char* targetmorph,
3263 int level) {
3264 // handle suffixes
3265 if (!morph)
3266 return std::string();
3267
3268 // check substandard flag
3269 if (TESTAFF(ap, substandard, al))
3270 return std::string();
3271
3272 if (morphcmp(morph, targetmorph) == 0)
3273 return ts;
3274
3275 size_t stemmorphcatpos;
3276 std::string mymorph;
3277
3278 // use input suffix fields, if exist
3279 if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
3280 mymorph.assign(morph);
3281 mymorph.push_back(MSEP_FLD);
3282 stemmorphcatpos = mymorph.size();
3283 } else {
3284 stemmorphcatpos = std::string::npos;
3285 }
3286
3287 for (int i = 0; i < al; i++) {
3288 const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
3289 SfxEntry* sptr = sFlag[c];
3290 while (sptr) {
3291 if (sptr->getFlag() == ap[i] && sptr->getMorph() &&
3292 ((sptr->getContLen() == 0) ||
3293 // don't generate forms with substandard affixes
3294 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
3295 const char* stemmorph;
3296 if (stemmorphcatpos != std::string::npos) {
3297 mymorph.replace(stemmorphcatpos, std::string::npos, sptr->getMorph());
3298 stemmorph = mymorph.c_str();
3299 } else {
3300 stemmorph = sptr->getMorph();
3301 }
3302
3303 int cmp = morphcmp(stemmorph, targetmorph);
3304
3305 if (cmp == 0) {
3306 std::string newword = sptr->add(ts, wl);
3307 if (!newword.empty()) {
3308 hentry* check = pHMgr->lookup(newword.c_str()); // XXX extra dic
3309 if (!check || !check->astr ||
3310 !(TESTAFF(check->astr, forbiddenword, check->alen) ||
3311 TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {
3312 return newword;
3313 }
3314 }
3315 }
3316
3317 // recursive call for secondary suffixes
3318 if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
3319 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
3320 std::string newword = sptr->add(ts, wl);
3321 if (!newword.empty()) {
3322 std::string newword2 =
3323 morphgen(newword.c_str(), newword.size(), sptr->getCont(),
3324 sptr->getContLen(), stemmorph, targetmorph, 1);
3325
3326 if (!newword2.empty()) {
3327 return newword2;
3328 }
3329 }
3330 }
3331 }
3332 sptr = sptr->getFlgNxt();
3333 }
3334 }
3335 return std::string();
3336 }
3337
expand_rootword(struct guessword * wlst,int maxn,const char * ts,int wl,const unsigned short * ap,unsigned short al,const char * bad,int badl,const char * phon)3338 int AffixMgr::expand_rootword(struct guessword* wlst,
3339 int maxn,
3340 const char* ts,
3341 int wl,
3342 const unsigned short* ap,
3343 unsigned short al,
3344 const char* bad,
3345 int badl,
3346 const char* phon) {
3347 int nh = 0;
3348 // first add root word to list
3349 if ((nh < maxn) &&
3350 !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
3351 (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
3352 wlst[nh].word = mystrdup(ts);
3353 if (!wlst[nh].word)
3354 return 0;
3355 wlst[nh].allow = false;
3356 wlst[nh].orig = NULL;
3357 nh++;
3358 // add special phonetic version
3359 if (phon && (nh < maxn)) {
3360 wlst[nh].word = mystrdup(phon);
3361 if (!wlst[nh].word)
3362 return nh - 1;
3363 wlst[nh].allow = false;
3364 wlst[nh].orig = mystrdup(ts);
3365 if (!wlst[nh].orig)
3366 return nh - 1;
3367 nh++;
3368 }
3369 }
3370
3371 // handle suffixes
3372 for (int i = 0; i < al; i++) {
3373 const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
3374 SfxEntry* sptr = sFlag[c];
3375 while (sptr) {
3376 if ((sptr->getFlag() == ap[i]) &&
3377 (!sptr->getKeyLen() ||
3378 ((badl > sptr->getKeyLen()) &&
3379 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
3380 // check needaffix flag
3381 !(sptr->getCont() &&
3382 ((needaffix &&
3383 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
3384 (circumfix &&
3385 TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
3386 (onlyincompound &&
3387 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) {
3388 std::string newword = sptr->add(ts, wl);
3389 if (!newword.empty()) {
3390 if (nh < maxn) {
3391 wlst[nh].word = mystrdup(newword.c_str());
3392 wlst[nh].allow = sptr->allowCross();
3393 wlst[nh].orig = NULL;
3394 nh++;
3395 // add special phonetic version
3396 if (phon && (nh < maxn)) {
3397 std::string prefix(phon);
3398 std::string key(sptr->getKey());
3399 reverseword(key);
3400 prefix.append(key);
3401 wlst[nh].word = mystrdup(prefix.c_str());
3402 if (!wlst[nh].word)
3403 return nh - 1;
3404 wlst[nh].allow = false;
3405 wlst[nh].orig = mystrdup(newword.c_str());
3406 if (!wlst[nh].orig)
3407 return nh - 1;
3408 nh++;
3409 }
3410 }
3411 }
3412 }
3413 sptr = sptr->getFlgNxt();
3414 }
3415 }
3416
3417 int n = nh;
3418
3419 // handle cross products of prefixes and suffixes
3420 for (int j = 1; j < n; j++)
3421 if (wlst[j].allow) {
3422 for (int k = 0; k < al; k++) {
3423 const unsigned char c = (unsigned char)(ap[k] & 0x00FF);
3424 PfxEntry* cptr = pFlag[c];
3425 while (cptr) {
3426 if ((cptr->getFlag() == ap[k]) && cptr->allowCross() &&
3427 (!cptr->getKeyLen() ||
3428 ((badl > cptr->getKeyLen()) &&
3429 (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
3430 int l1 = strlen(wlst[j].word);
3431 std::string newword = cptr->add(wlst[j].word, l1);
3432 if (!newword.empty()) {
3433 if (nh < maxn) {
3434 wlst[nh].word = mystrdup(newword.c_str());
3435 wlst[nh].allow = cptr->allowCross();
3436 wlst[nh].orig = NULL;
3437 nh++;
3438 }
3439 }
3440 }
3441 cptr = cptr->getFlgNxt();
3442 }
3443 }
3444 }
3445
3446 // now handle pure prefixes
3447 for (int m = 0; m < al; m++) {
3448 const unsigned char c = (unsigned char)(ap[m] & 0x00FF);
3449 PfxEntry* ptr = pFlag[c];
3450 while (ptr) {
3451 if ((ptr->getFlag() == ap[m]) &&
3452 (!ptr->getKeyLen() ||
3453 ((badl > ptr->getKeyLen()) &&
3454 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
3455 // check needaffix flag
3456 !(ptr->getCont() &&
3457 ((needaffix &&
3458 TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
3459 (circumfix &&
3460 TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
3461 (onlyincompound &&
3462 TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))) {
3463 std::string newword = ptr->add(ts, wl);
3464 if (!newword.empty()) {
3465 if (nh < maxn) {
3466 wlst[nh].word = mystrdup(newword.c_str());
3467 wlst[nh].allow = ptr->allowCross();
3468 wlst[nh].orig = NULL;
3469 nh++;
3470 }
3471 }
3472 }
3473 ptr = ptr->getFlgNxt();
3474 }
3475 }
3476
3477 return nh;
3478 }
3479
3480 // return replacing table
get_reptable() const3481 const std::vector<replentry>& AffixMgr::get_reptable() const {
3482 return pHMgr->get_reptable();
3483 }
3484
3485 // return iconv table
get_iconvtable() const3486 RepList* AffixMgr::get_iconvtable() const {
3487 if (!iconvtable)
3488 return NULL;
3489 return iconvtable;
3490 }
3491
3492 // return oconv table
get_oconvtable() const3493 RepList* AffixMgr::get_oconvtable() const {
3494 if (!oconvtable)
3495 return NULL;
3496 return oconvtable;
3497 }
3498
3499 // return replacing table
get_phonetable() const3500 struct phonetable* AffixMgr::get_phonetable() const {
3501 if (!phone)
3502 return NULL;
3503 return phone;
3504 }
3505
3506 // return character map table
get_maptable() const3507 const std::vector<mapentry>& AffixMgr::get_maptable() const {
3508 return maptable;
3509 }
3510
3511 // return character map table
get_breaktable() const3512 const std::vector<std::string>& AffixMgr::get_breaktable() const {
3513 return breaktable;
3514 }
3515
3516 // return text encoding of dictionary
get_encoding()3517 const std::string& AffixMgr::get_encoding() {
3518 if (encoding.empty())
3519 encoding = SPELL_ENCODING;
3520 return encoding;
3521 }
3522
3523 // return text encoding of dictionary
get_langnum() const3524 int AffixMgr::get_langnum() const {
3525 return langnum;
3526 }
3527
3528 // return double prefix option
get_complexprefixes() const3529 int AffixMgr::get_complexprefixes() const {
3530 return complexprefixes;
3531 }
3532
3533 // return FULLSTRIP option
get_fullstrip() const3534 int AffixMgr::get_fullstrip() const {
3535 return fullstrip;
3536 }
3537
get_keepcase() const3538 FLAG AffixMgr::get_keepcase() const {
3539 return keepcase;
3540 }
3541
get_forceucase() const3542 FLAG AffixMgr::get_forceucase() const {
3543 return forceucase;
3544 }
3545
get_warn() const3546 FLAG AffixMgr::get_warn() const {
3547 return warn;
3548 }
3549
get_forbidwarn() const3550 int AffixMgr::get_forbidwarn() const {
3551 return forbidwarn;
3552 }
3553
get_checksharps() const3554 int AffixMgr::get_checksharps() const {
3555 return checksharps;
3556 }
3557
encode_flag(unsigned short aflag) const3558 char* AffixMgr::encode_flag(unsigned short aflag) const {
3559 return pHMgr->encode_flag(aflag);
3560 }
3561
3562 // return the preferred ignore string for suggestions
get_ignore() const3563 const char* AffixMgr::get_ignore() const {
3564 if (ignorechars.empty())
3565 return NULL;
3566 return ignorechars.c_str();
3567 }
3568
3569 // return the preferred ignore string for suggestions
get_ignore_utf16() const3570 const std::vector<w_char>& AffixMgr::get_ignore_utf16() const {
3571 return ignorechars_utf16;
3572 }
3573
3574 // return the keyboard string for suggestions
get_key_string()3575 char* AffixMgr::get_key_string() {
3576 if (keystring.empty())
3577 keystring = SPELL_KEYSTRING;
3578 return mystrdup(keystring.c_str());
3579 }
3580
3581 // return the preferred try string for suggestions
get_try_string() const3582 char* AffixMgr::get_try_string() const {
3583 if (trystring.empty())
3584 return NULL;
3585 return mystrdup(trystring.c_str());
3586 }
3587
3588 // return the preferred try string for suggestions
get_wordchars() const3589 const std::string& AffixMgr::get_wordchars() const {
3590 return wordchars;
3591 }
3592
get_wordchars_utf16() const3593 const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const {
3594 return wordchars_utf16;
3595 }
3596
3597 // is there compounding?
get_compound() const3598 int AffixMgr::get_compound() const {
3599 return compoundflag || compoundbegin || !defcpdtable.empty();
3600 }
3601
3602 // return the compound words control flag
get_compoundflag() const3603 FLAG AffixMgr::get_compoundflag() const {
3604 return compoundflag;
3605 }
3606
3607 // return the forbidden words control flag
get_forbiddenword() const3608 FLAG AffixMgr::get_forbiddenword() const {
3609 return forbiddenword;
3610 }
3611
3612 // return the forbidden words control flag
get_nosuggest() const3613 FLAG AffixMgr::get_nosuggest() const {
3614 return nosuggest;
3615 }
3616
3617 // return the forbidden words control flag
get_nongramsuggest() const3618 FLAG AffixMgr::get_nongramsuggest() const {
3619 return nongramsuggest;
3620 }
3621
3622 // return the substandard root/affix control flag
get_substandard() const3623 FLAG AffixMgr::get_substandard() const {
3624 return substandard;
3625 }
3626
3627 // return the forbidden words flag modify flag
get_needaffix() const3628 FLAG AffixMgr::get_needaffix() const {
3629 return needaffix;
3630 }
3631
3632 // return the onlyincompound flag
get_onlyincompound() const3633 FLAG AffixMgr::get_onlyincompound() const {
3634 return onlyincompound;
3635 }
3636
3637 // return the value of suffix
get_version() const3638 const std::string& AffixMgr::get_version() const {
3639 return version;
3640 }
3641
3642 // utility method to look up root words in hash table
lookup(const char * word)3643 struct hentry* AffixMgr::lookup(const char* word) {
3644 struct hentry* he = NULL;
3645 for (size_t i = 0; i < alldic.size() && !he; ++i) {
3646 he = alldic[i]->lookup(word);
3647 }
3648 return he;
3649 }
3650
3651 // return the value of suffix
have_contclass() const3652 int AffixMgr::have_contclass() const {
3653 return havecontclass;
3654 }
3655
3656 // return utf8
get_utf8() const3657 int AffixMgr::get_utf8() const {
3658 return utf8;
3659 }
3660
get_maxngramsugs(void) const3661 int AffixMgr::get_maxngramsugs(void) const {
3662 return maxngramsugs;
3663 }
3664
get_maxcpdsugs(void) const3665 int AffixMgr::get_maxcpdsugs(void) const {
3666 return maxcpdsugs;
3667 }
3668
get_maxdiff(void) const3669 int AffixMgr::get_maxdiff(void) const {
3670 return maxdiff;
3671 }
3672
get_onlymaxdiff(void) const3673 int AffixMgr::get_onlymaxdiff(void) const {
3674 return onlymaxdiff;
3675 }
3676
3677 // return nosplitsugs
get_nosplitsugs(void) const3678 int AffixMgr::get_nosplitsugs(void) const {
3679 return nosplitsugs;
3680 }
3681
3682 // return sugswithdots
get_sugswithdots(void) const3683 int AffixMgr::get_sugswithdots(void) const {
3684 return sugswithdots;
3685 }
3686
3687 /* parse flag */
parse_flag(const std::string & line,unsigned short * out,FileMgr * af)3688 bool AffixMgr::parse_flag(const std::string& line, unsigned short* out, FileMgr* af) {
3689 if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
3690 HUNSPELL_WARNING(
3691 stderr,
3692 "error: line %d: multiple definitions of an affix file parameter\n",
3693 af->getlinenum());
3694 return false;
3695 }
3696 std::string s;
3697 if (!parse_string(line, s, af->getlinenum()))
3698 return false;
3699 *out = pHMgr->decode_flag(s.c_str());
3700 return true;
3701 }
3702
3703 /* parse num */
parse_num(const std::string & line,int * out,FileMgr * af)3704 bool AffixMgr::parse_num(const std::string& line, int* out, FileMgr* af) {
3705 if (*out != -1) {
3706 HUNSPELL_WARNING(
3707 stderr,
3708 "error: line %d: multiple definitions of an affix file parameter\n",
3709 af->getlinenum());
3710 return false;
3711 }
3712 std::string s;
3713 if (!parse_string(line, s, af->getlinenum()))
3714 return false;
3715 *out = atoi(s.c_str());
3716 return true;
3717 }
3718
3719 /* parse in the max syllablecount of compound words and */
parse_cpdsyllable(const std::string & line,FileMgr * af)3720 bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) {
3721 int i = 0;
3722 int np = 0;
3723 std::string::const_iterator iter = line.begin();
3724 std::string::const_iterator start_piece = mystrsep(line, iter);
3725 while (start_piece != line.end()) {
3726 switch (i) {
3727 case 0: {
3728 np++;
3729 break;
3730 }
3731 case 1: {
3732 cpdmaxsyllable = atoi(std::string(start_piece, iter).c_str());
3733 np++;
3734 break;
3735 }
3736 case 2: {
3737 if (!utf8) {
3738 cpdvowels.assign(start_piece, iter);
3739 std::sort(cpdvowels.begin(), cpdvowels.end());
3740 } else {
3741 std::string piece(start_piece, iter);
3742 u8_u16(cpdvowels_utf16, piece);
3743 std::sort(cpdvowels_utf16.begin(), cpdvowels_utf16.end());
3744 }
3745 np++;
3746 break;
3747 }
3748 default:
3749 break;
3750 }
3751 ++i;
3752 start_piece = mystrsep(line, iter);
3753 }
3754 if (np < 2) {
3755 HUNSPELL_WARNING(stderr,
3756 "error: line %d: missing compoundsyllable information\n",
3757 af->getlinenum());
3758 return false;
3759 }
3760 if (np == 2)
3761 cpdvowels = "AEIOUaeiou";
3762 return true;
3763 }
3764
parse_convtable(const std::string & line,FileMgr * af,RepList ** rl,const std::string & keyword)3765 bool AffixMgr::parse_convtable(const std::string& line,
3766 FileMgr* af,
3767 RepList** rl,
3768 const std::string& keyword) {
3769 if (*rl) {
3770 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3771 af->getlinenum());
3772 return false;
3773 }
3774 int i = 0;
3775 int np = 0;
3776 int numrl = 0;
3777 std::string::const_iterator iter = line.begin();
3778 std::string::const_iterator start_piece = mystrsep(line, iter);
3779 while (start_piece != line.end()) {
3780 switch (i) {
3781 case 0: {
3782 np++;
3783 break;
3784 }
3785 case 1: {
3786 numrl = atoi(std::string(start_piece, iter).c_str());
3787 if (numrl < 1) {
3788 HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
3789 af->getlinenum());
3790 return false;
3791 }
3792 *rl = new RepList(numrl);
3793 if (!*rl)
3794 return false;
3795 np++;
3796 break;
3797 }
3798 default:
3799 break;
3800 }
3801 ++i;
3802 start_piece = mystrsep(line, iter);
3803 }
3804 if (np != 2) {
3805 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3806 af->getlinenum());
3807 return false;
3808 }
3809
3810 /* now parse the num lines to read in the remainder of the table */
3811 for (int j = 0; j < numrl; j++) {
3812 std::string nl;
3813 if (!af->getline(nl))
3814 return false;
3815 mychomp(nl);
3816 i = 0;
3817 std::string pattern;
3818 std::string pattern2;
3819 iter = nl.begin();
3820 start_piece = mystrsep(nl, iter);
3821 while (start_piece != nl.end()) {
3822 {
3823 switch (i) {
3824 case 0: {
3825 if (nl.compare(start_piece - nl.begin(), keyword.size(), keyword, 0, keyword.size()) != 0) {
3826 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3827 af->getlinenum());
3828 delete *rl;
3829 *rl = NULL;
3830 return false;
3831 }
3832 break;
3833 }
3834 case 1: {
3835 pattern.assign(start_piece, iter);
3836 break;
3837 }
3838 case 2: {
3839 pattern2.assign(start_piece, iter);
3840 break;
3841 }
3842 default:
3843 break;
3844 }
3845 ++i;
3846 }
3847 start_piece = mystrsep(nl, iter);
3848 }
3849 if (pattern.empty() || pattern2.empty()) {
3850 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3851 af->getlinenum());
3852 return false;
3853 }
3854 (*rl)->add(pattern, pattern2);
3855 }
3856 return true;
3857 }
3858
3859 /* parse in the typical fault correcting table */
parse_phonetable(const std::string & line,FileMgr * af)3860 bool AffixMgr::parse_phonetable(const std::string& line, FileMgr* af) {
3861 if (phone) {
3862 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3863 af->getlinenum());
3864 return false;
3865 }
3866 int num = -1;
3867 int i = 0;
3868 int np = 0;
3869 std::string::const_iterator iter = line.begin();
3870 std::string::const_iterator start_piece = mystrsep(line, iter);
3871 while (start_piece != line.end()) {
3872 switch (i) {
3873 case 0: {
3874 np++;
3875 break;
3876 }
3877 case 1: {
3878 num = atoi(std::string(start_piece, iter).c_str());
3879 if (num < 1) {
3880 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
3881 af->getlinenum());
3882 return false;
3883 }
3884 phone = new phonetable;
3885 phone->utf8 = (char)utf8;
3886 np++;
3887 break;
3888 }
3889 default:
3890 break;
3891 }
3892 ++i;
3893 start_piece = mystrsep(line, iter);
3894 }
3895 if (np != 2) {
3896 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3897 af->getlinenum());
3898 return false;
3899 }
3900
3901 /* now parse the phone->num lines to read in the remainder of the table */
3902 for (int j = 0; j < num; ++j) {
3903 std::string nl;
3904 if (!af->getline(nl))
3905 return false;
3906 mychomp(nl);
3907 i = 0;
3908 const size_t old_size = phone->rules.size();
3909 iter = nl.begin();
3910 start_piece = mystrsep(nl, iter);
3911 while (start_piece != nl.end()) {
3912 {
3913 switch (i) {
3914 case 0: {
3915 if (nl.compare(start_piece - nl.begin(), 5, "PHONE", 5) != 0) {
3916 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3917 af->getlinenum());
3918 return false;
3919 }
3920 break;
3921 }
3922 case 1: {
3923 phone->rules.push_back(std::string(start_piece, iter));
3924 break;
3925 }
3926 case 2: {
3927 phone->rules.push_back(std::string(start_piece, iter));
3928 mystrrep(phone->rules.back(), "_", "");
3929 break;
3930 }
3931 default:
3932 break;
3933 }
3934 ++i;
3935 }
3936 start_piece = mystrsep(nl, iter);
3937 }
3938 if (phone->rules.size() != old_size + 2) {
3939 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3940 af->getlinenum());
3941 phone->rules.clear();
3942 return false;
3943 }
3944 }
3945 phone->rules.push_back("");
3946 phone->rules.push_back("");
3947 init_phonet_hash(*phone);
3948 return true;
3949 }
3950
3951 /* parse in the checkcompoundpattern table */
parse_checkcpdtable(const std::string & line,FileMgr * af)3952 bool AffixMgr::parse_checkcpdtable(const std::string& line, FileMgr* af) {
3953 if (parsedcheckcpd) {
3954 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3955 af->getlinenum());
3956 return false;
3957 }
3958 parsedcheckcpd = true;
3959 int numcheckcpd = -1;
3960 int i = 0;
3961 int np = 0;
3962 std::string::const_iterator iter = line.begin();
3963 std::string::const_iterator start_piece = mystrsep(line, iter);
3964 while (start_piece != line.end()) {
3965 switch (i) {
3966 case 0: {
3967 np++;
3968 break;
3969 }
3970 case 1: {
3971 numcheckcpd = atoi(std::string(start_piece, iter).c_str());
3972 if (numcheckcpd < 1) {
3973 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
3974 af->getlinenum());
3975 return false;
3976 }
3977 checkcpdtable.reserve(numcheckcpd);
3978 np++;
3979 break;
3980 }
3981 default:
3982 break;
3983 }
3984 ++i;
3985 start_piece = mystrsep(line, iter);
3986 }
3987 if (np != 2) {
3988 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3989 af->getlinenum());
3990 return false;
3991 }
3992
3993 /* now parse the numcheckcpd lines to read in the remainder of the table */
3994 for (int j = 0; j < numcheckcpd; ++j) {
3995 std::string nl;
3996 if (!af->getline(nl))
3997 return false;
3998 mychomp(nl);
3999 i = 0;
4000 checkcpdtable.push_back(patentry());
4001 iter = nl.begin();
4002 start_piece = mystrsep(nl, iter);
4003 while (start_piece != nl.end()) {
4004 switch (i) {
4005 case 0: {
4006 if (nl.compare(start_piece - nl.begin(), 20, "CHECKCOMPOUNDPATTERN", 20) != 0) {
4007 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4008 af->getlinenum());
4009 return false;
4010 }
4011 break;
4012 }
4013 case 1: {
4014 checkcpdtable.back().pattern.assign(start_piece, iter);
4015 size_t slash_pos = checkcpdtable.back().pattern.find('/');
4016 if (slash_pos != std::string::npos) {
4017 std::string chunk(checkcpdtable.back().pattern, slash_pos + 1);
4018 checkcpdtable.back().pattern.resize(slash_pos);
4019 checkcpdtable.back().cond = pHMgr->decode_flag(chunk.c_str());
4020 }
4021 break;
4022 }
4023 case 2: {
4024 checkcpdtable.back().pattern2.assign(start_piece, iter);
4025 size_t slash_pos = checkcpdtable.back().pattern2.find('/');
4026 if (slash_pos != std::string::npos) {
4027 std::string chunk(checkcpdtable.back().pattern2, slash_pos + 1);
4028 checkcpdtable.back().pattern2.resize(slash_pos);
4029 checkcpdtable.back().cond2 = pHMgr->decode_flag(chunk.c_str());
4030 }
4031 break;
4032 }
4033 case 3: {
4034 checkcpdtable.back().pattern3.assign(start_piece, iter);
4035 simplifiedcpd = 1;
4036 break;
4037 }
4038 default:
4039 break;
4040 }
4041 i++;
4042 start_piece = mystrsep(nl, iter);
4043 }
4044 }
4045 return true;
4046 }
4047
4048 /* parse in the compound rule table */
parse_defcpdtable(const std::string & line,FileMgr * af)4049 bool AffixMgr::parse_defcpdtable(const std::string& line, FileMgr* af) {
4050 if (parseddefcpd) {
4051 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4052 af->getlinenum());
4053 return false;
4054 }
4055 parseddefcpd = true;
4056 int numdefcpd = -1;
4057 int i = 0;
4058 int np = 0;
4059 std::string::const_iterator iter = line.begin();
4060 std::string::const_iterator start_piece = mystrsep(line, iter);
4061 while (start_piece != line.end()) {
4062 switch (i) {
4063 case 0: {
4064 np++;
4065 break;
4066 }
4067 case 1: {
4068 numdefcpd = atoi(std::string(start_piece, iter).c_str());
4069 if (numdefcpd < 1) {
4070 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4071 af->getlinenum());
4072 return false;
4073 }
4074 defcpdtable.reserve(numdefcpd);
4075 np++;
4076 break;
4077 }
4078 default:
4079 break;
4080 }
4081 ++i;
4082 start_piece = mystrsep(line, iter);
4083 }
4084 if (np != 2) {
4085 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4086 af->getlinenum());
4087 return false;
4088 }
4089
4090 /* now parse the numdefcpd lines to read in the remainder of the table */
4091 for (int j = 0; j < numdefcpd; ++j) {
4092 std::string nl;
4093 if (!af->getline(nl))
4094 return false;
4095 mychomp(nl);
4096 i = 0;
4097 defcpdtable.push_back(flagentry());
4098 iter = nl.begin();
4099 start_piece = mystrsep(nl, iter);
4100 while (start_piece != nl.end()) {
4101 switch (i) {
4102 case 0: {
4103 if (nl.compare(start_piece - nl.begin(), 12, "COMPOUNDRULE", 12) != 0) {
4104 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4105 af->getlinenum());
4106 numdefcpd = 0;
4107 return false;
4108 }
4109 break;
4110 }
4111 case 1: { // handle parenthesized flags
4112 if (std::find(start_piece, iter, '(') != iter) {
4113 for (std::string::const_iterator k = start_piece; k != iter; ++k) {
4114 std::string::const_iterator chb = k;
4115 std::string::const_iterator che = k + 1;
4116 if (*k == '(') {
4117 std::string::const_iterator parpos = std::find(k, iter, ')');
4118 if (parpos != iter) {
4119 chb = k + 1;
4120 che = parpos;
4121 k = parpos;
4122 }
4123 }
4124
4125 if (*chb == '*' || *chb == '?') {
4126 defcpdtable.back().push_back((FLAG)*chb);
4127 } else {
4128 pHMgr->decode_flags(defcpdtable.back(), std::string(chb, che), af);
4129 }
4130 }
4131 } else {
4132 pHMgr->decode_flags(defcpdtable.back(), std::string(start_piece, iter), af);
4133 }
4134 break;
4135 }
4136 default:
4137 break;
4138 }
4139 ++i;
4140 start_piece = mystrsep(nl, iter);
4141 }
4142 if (defcpdtable.back().empty()) {
4143 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4144 af->getlinenum());
4145 return false;
4146 }
4147 }
4148 return true;
4149 }
4150
4151 /* parse in the character map table */
parse_maptable(const std::string & line,FileMgr * af)4152 bool AffixMgr::parse_maptable(const std::string& line, FileMgr* af) {
4153 if (parsedmaptable) {
4154 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4155 af->getlinenum());
4156 return false;
4157 }
4158 parsedmaptable = true;
4159 int nummap = -1;
4160 int i = 0;
4161 int np = 0;
4162 std::string::const_iterator iter = line.begin();
4163 std::string::const_iterator start_piece = mystrsep(line, iter);
4164 while (start_piece != line.end()) {
4165 switch (i) {
4166 case 0: {
4167 np++;
4168 break;
4169 }
4170 case 1: {
4171 nummap = atoi(std::string(start_piece, iter).c_str());
4172 if (nummap < 1) {
4173 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4174 af->getlinenum());
4175 return false;
4176 }
4177 maptable.reserve(nummap);
4178 np++;
4179 break;
4180 }
4181 default:
4182 break;
4183 }
4184 ++i;
4185 start_piece = mystrsep(line, iter);
4186 }
4187 if (np != 2) {
4188 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4189 af->getlinenum());
4190 return false;
4191 }
4192
4193 /* now parse the nummap lines to read in the remainder of the table */
4194 for (int j = 0; j < nummap; ++j) {
4195 std::string nl;
4196 if (!af->getline(nl))
4197 return false;
4198 mychomp(nl);
4199 i = 0;
4200 maptable.push_back(mapentry());
4201 iter = nl.begin();
4202 start_piece = mystrsep(nl, iter);
4203 while (start_piece != nl.end()) {
4204 switch (i) {
4205 case 0: {
4206 if (nl.compare(start_piece - nl.begin(), 3, "MAP", 3) != 0) {
4207 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4208 af->getlinenum());
4209 nummap = 0;
4210 return false;
4211 }
4212 break;
4213 }
4214 case 1: {
4215 for (std::string::const_iterator k = start_piece; k != iter; ++k) {
4216 std::string::const_iterator chb = k;
4217 std::string::const_iterator che = k + 1;
4218 if (*k == '(') {
4219 std::string::const_iterator parpos = std::find(k, iter, ')');
4220 if (parpos != iter) {
4221 chb = k + 1;
4222 che = parpos;
4223 k = parpos;
4224 }
4225 } else {
4226 if (utf8 && (*k & 0xc0) == 0xc0) {
4227 ++k;
4228 while (k != iter && (*k & 0xc0) == 0x80)
4229 ++k;
4230 che = k;
4231 --k;
4232 }
4233 }
4234 maptable.back().push_back(std::string(chb, che));
4235 }
4236 break;
4237 }
4238 default:
4239 break;
4240 }
4241 ++i;
4242 start_piece = mystrsep(nl, iter);
4243 }
4244 if (maptable.back().empty()) {
4245 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4246 af->getlinenum());
4247 return false;
4248 }
4249 }
4250 return true;
4251 }
4252
4253 /* parse in the word breakpoint table */
parse_breaktable(const std::string & line,FileMgr * af)4254 bool AffixMgr::parse_breaktable(const std::string& line, FileMgr* af) {
4255 if (parsedbreaktable) {
4256 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4257 af->getlinenum());
4258 return false;
4259 }
4260 parsedbreaktable = true;
4261 int numbreak = -1;
4262 int i = 0;
4263 int np = 0;
4264 std::string::const_iterator iter = line.begin();
4265 std::string::const_iterator start_piece = mystrsep(line, iter);
4266 while (start_piece != line.end()) {
4267 switch (i) {
4268 case 0: {
4269 np++;
4270 break;
4271 }
4272 case 1: {
4273 numbreak = atoi(std::string(start_piece, iter).c_str());
4274 if (numbreak < 0) {
4275 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4276 af->getlinenum());
4277 return false;
4278 }
4279 if (numbreak == 0)
4280 return true;
4281 breaktable.reserve(numbreak);
4282 np++;
4283 break;
4284 }
4285 default:
4286 break;
4287 }
4288 ++i;
4289 start_piece = mystrsep(line, iter);
4290 }
4291 if (np != 2) {
4292 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4293 af->getlinenum());
4294 return false;
4295 }
4296
4297 /* now parse the numbreak lines to read in the remainder of the table */
4298 for (int j = 0; j < numbreak; ++j) {
4299 std::string nl;
4300 if (!af->getline(nl))
4301 return false;
4302 mychomp(nl);
4303 i = 0;
4304 iter = nl.begin();
4305 start_piece = mystrsep(nl, iter);
4306 while (start_piece != nl.end()) {
4307 switch (i) {
4308 case 0: {
4309 if (nl.compare(start_piece - nl.begin(), 5, "BREAK", 5) != 0) {
4310 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4311 af->getlinenum());
4312 numbreak = 0;
4313 return false;
4314 }
4315 break;
4316 }
4317 case 1: {
4318 breaktable.push_back(std::string(start_piece, iter));
4319 break;
4320 }
4321 default:
4322 break;
4323 }
4324 ++i;
4325 start_piece = mystrsep(nl, iter);
4326 }
4327 }
4328
4329 if (breaktable.size() != static_cast<size_t>(numbreak)) {
4330 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4331 af->getlinenum());
4332 return false;
4333 }
4334
4335 return true;
4336 }
4337
reverse_condition(std::string & piece)4338 void AffixMgr::reverse_condition(std::string& piece) {
4339 if (piece.empty())
4340 return;
4341
4342 int neg = 0;
4343 for (std::string::reverse_iterator k = piece.rbegin(); k != piece.rend(); ++k) {
4344 switch (*k) {
4345 case '[': {
4346 if (neg)
4347 *(k - 1) = '[';
4348 else
4349 *k = ']';
4350 break;
4351 }
4352 case ']': {
4353 *k = '[';
4354 if (neg)
4355 *(k - 1) = '^';
4356 neg = 0;
4357 break;
4358 }
4359 case '^': {
4360 if (*(k - 1) == ']')
4361 neg = 1;
4362 else
4363 *(k - 1) = *k;
4364 break;
4365 }
4366 default: {
4367 if (neg)
4368 *(k - 1) = *k;
4369 }
4370 }
4371 }
4372 }
4373
4374 class entries_container {
4375 std::vector<AffEntry*> entries;
4376 AffixMgr* m_mgr;
4377 char m_at;
4378 public:
entries_container(char at,AffixMgr * mgr)4379 entries_container(char at, AffixMgr* mgr)
4380 : m_mgr(mgr)
4381 , m_at(at) {
4382 }
release()4383 void release() {
4384 entries.clear();
4385 }
initialize(int numents,char opts,unsigned short aflag)4386 void initialize(int numents,
4387 char opts, unsigned short aflag) {
4388 entries.reserve(numents);
4389
4390 if (m_at == 'P') {
4391 entries.push_back(new PfxEntry(m_mgr));
4392 } else {
4393 entries.push_back(new SfxEntry(m_mgr));
4394 }
4395
4396 entries.back()->opts = opts;
4397 entries.back()->aflag = aflag;
4398 }
4399
add_entry(char opts)4400 AffEntry* add_entry(char opts) {
4401 if (m_at == 'P') {
4402 entries.push_back(new PfxEntry(m_mgr));
4403 } else {
4404 entries.push_back(new SfxEntry(m_mgr));
4405 }
4406 AffEntry* ret = entries.back();
4407 ret->opts = entries[0]->opts & opts;
4408 return ret;
4409 }
4410
first_entry()4411 AffEntry* first_entry() {
4412 return entries.empty() ? NULL : entries[0];
4413 }
4414
~entries_container()4415 ~entries_container() {
4416 for (size_t i = 0; i < entries.size(); ++i) {
4417 delete entries[i];
4418 }
4419 }
4420
begin()4421 std::vector<AffEntry*>::iterator begin() { return entries.begin(); }
end()4422 std::vector<AffEntry*>::iterator end() { return entries.end(); }
4423 };
4424
parse_affix(const std::string & line,const char at,FileMgr * af,char * dupflags)4425 bool AffixMgr::parse_affix(const std::string& line,
4426 const char at,
4427 FileMgr* af,
4428 char* dupflags) {
4429 int numents = 0; // number of AffEntry structures to parse
4430
4431 unsigned short aflag = 0; // affix char identifier
4432
4433 char ff = 0;
4434 entries_container affentries(at, this);
4435
4436 int i = 0;
4437
4438 // checking lines with bad syntax
4439 #ifdef DEBUG
4440 int basefieldnum = 0;
4441 #endif
4442
4443 // split affix header line into pieces
4444
4445 int np = 0;
4446 std::string::const_iterator iter = line.begin();
4447 std::string::const_iterator start_piece = mystrsep(line, iter);
4448 while (start_piece != line.end()) {
4449 switch (i) {
4450 // piece 1 - is type of affix
4451 case 0: {
4452 np++;
4453 break;
4454 }
4455
4456 // piece 2 - is affix char
4457 case 1: {
4458 np++;
4459 aflag = pHMgr->decode_flag(std::string(start_piece, iter).c_str());
4460 if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
4461 ((at == 'P') && (dupflags[aflag] & dupPFX))) {
4462 HUNSPELL_WARNING(
4463 stderr,
4464 "error: line %d: multiple definitions of an affix flag\n",
4465 af->getlinenum());
4466 }
4467 dupflags[aflag] += (char)((at == 'S') ? dupSFX : dupPFX);
4468 break;
4469 }
4470 // piece 3 - is cross product indicator
4471 case 2: {
4472 np++;
4473 if (*start_piece == 'Y')
4474 ff = aeXPRODUCT;
4475 break;
4476 }
4477
4478 // piece 4 - is number of affentries
4479 case 3: {
4480 np++;
4481 numents = atoi(std::string(start_piece, iter).c_str());
4482 if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
4483 sizeof(AffEntry)) < static_cast<size_t>(numents))) {
4484 char* err = pHMgr->encode_flag(aflag);
4485 if (err) {
4486 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4487 af->getlinenum());
4488 free(err);
4489 }
4490 return false;
4491 }
4492
4493 char opts = ff;
4494 if (utf8)
4495 opts += aeUTF8;
4496 if (pHMgr->is_aliasf())
4497 opts += aeALIASF;
4498 if (pHMgr->is_aliasm())
4499 opts += aeALIASM;
4500 affentries.initialize(numents, opts, aflag);
4501 }
4502
4503 default:
4504 break;
4505 }
4506 ++i;
4507 start_piece = mystrsep(line, iter);
4508 }
4509 // check to make sure we parsed enough pieces
4510 if (np != 4) {
4511 char* err = pHMgr->encode_flag(aflag);
4512 if (err) {
4513 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4514 af->getlinenum());
4515 free(err);
4516 }
4517 return false;
4518 }
4519
4520 // now parse numents affentries for this affix
4521 AffEntry* entry = affentries.first_entry();
4522 for (int ent = 0; ent < numents; ++ent) {
4523 std::string nl;
4524 if (!af->getline(nl))
4525 return false;
4526 mychomp(nl);
4527
4528 iter = nl.begin();
4529 i = 0;
4530 np = 0;
4531
4532 // split line into pieces
4533 start_piece = mystrsep(nl, iter);
4534 while (start_piece != nl.end()) {
4535 switch (i) {
4536 // piece 1 - is type
4537 case 0: {
4538 np++;
4539 if (ent != 0)
4540 entry = affentries.add_entry((char)(aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM));
4541 break;
4542 }
4543
4544 // piece 2 - is affix char
4545 case 1: {
4546 np++;
4547 std::string chunk(start_piece, iter);
4548 if (pHMgr->decode_flag(chunk.c_str()) != aflag) {
4549 char* err = pHMgr->encode_flag(aflag);
4550 if (err) {
4551 HUNSPELL_WARNING(stderr,
4552 "error: line %d: affix %s is corrupt\n",
4553 af->getlinenum(), err);
4554 free(err);
4555 }
4556 return false;
4557 }
4558
4559 if (ent != 0) {
4560 AffEntry* start_entry = affentries.first_entry();
4561 entry->aflag = start_entry->aflag;
4562 }
4563 break;
4564 }
4565
4566 // piece 3 - is string to strip or 0 for null
4567 case 2: {
4568 np++;
4569 entry->strip = std::string(start_piece, iter);
4570 if (complexprefixes) {
4571 if (utf8)
4572 reverseword_utf(entry->strip);
4573 else
4574 reverseword(entry->strip);
4575 }
4576 if (entry->strip.compare("0") == 0) {
4577 entry->strip.clear();
4578 }
4579 break;
4580 }
4581
4582 // piece 4 - is affix string or 0 for null
4583 case 3: {
4584 entry->morphcode = NULL;
4585 entry->contclass = NULL;
4586 entry->contclasslen = 0;
4587 np++;
4588 std::string::const_iterator dash = std::find(start_piece, iter, '/');
4589 if (dash != iter) {
4590 entry->appnd = std::string(start_piece, dash);
4591 std::string dash_str(dash + 1, iter);
4592
4593 if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
4594 if (utf8) {
4595 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
4596 } else {
4597 remove_ignored_chars(entry->appnd, ignorechars);
4598 }
4599 }
4600
4601 if (complexprefixes) {
4602 if (utf8)
4603 reverseword_utf(entry->appnd);
4604 else
4605 reverseword(entry->appnd);
4606 }
4607
4608 if (pHMgr->is_aliasf()) {
4609 int index = atoi(dash_str.c_str());
4610 entry->contclasslen = (unsigned short)pHMgr->get_aliasf(
4611 index, &(entry->contclass), af);
4612 if (!entry->contclasslen)
4613 HUNSPELL_WARNING(stderr,
4614 "error: bad affix flag alias: \"%s\"\n",
4615 dash_str.c_str());
4616 } else {
4617 entry->contclasslen = (unsigned short)pHMgr->decode_flags(
4618 &(entry->contclass), dash_str.c_str(), af);
4619 std::sort(entry->contclass, entry->contclass + entry->contclasslen);
4620 }
4621
4622 havecontclass = 1;
4623 for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
4624 contclasses[(entry->contclass)[_i]] = 1;
4625 }
4626 } else {
4627 entry->appnd = std::string(start_piece, iter);
4628
4629 if (!ignorechars.empty() && !has_no_ignored_chars(entry->appnd, ignorechars)) {
4630 if (utf8) {
4631 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
4632 } else {
4633 remove_ignored_chars(entry->appnd, ignorechars);
4634 }
4635 }
4636
4637 if (complexprefixes) {
4638 if (utf8)
4639 reverseword_utf(entry->appnd);
4640 else
4641 reverseword(entry->appnd);
4642 }
4643 }
4644
4645 if (entry->appnd.compare("0") == 0) {
4646 entry->appnd.clear();
4647 }
4648 break;
4649 }
4650
4651 // piece 5 - is the conditions descriptions
4652 case 4: {
4653 std::string chunk(start_piece, iter);
4654 np++;
4655 if (complexprefixes) {
4656 if (utf8)
4657 reverseword_utf(chunk);
4658 else
4659 reverseword(chunk);
4660 reverse_condition(chunk);
4661 }
4662 if (!entry->strip.empty() && chunk != "." &&
4663 redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(),
4664 af->getlinenum()))
4665 chunk = ".";
4666 if (at == 'S') {
4667 reverseword(chunk);
4668 reverse_condition(chunk);
4669 }
4670 if (encodeit(*entry, chunk.c_str()))
4671 return false;
4672 break;
4673 }
4674
4675 case 5: {
4676 std::string chunk(start_piece, iter);
4677 np++;
4678 if (pHMgr->is_aliasm()) {
4679 int index = atoi(chunk.c_str());
4680 entry->morphcode = pHMgr->get_aliasm(index);
4681 } else {
4682 if (complexprefixes) { // XXX - fix me for morph. gen.
4683 if (utf8)
4684 reverseword_utf(chunk);
4685 else
4686 reverseword(chunk);
4687 }
4688 // add the remaining of the line
4689 std::string::const_iterator end = nl.end();
4690 if (iter != end) {
4691 chunk.append(iter, end);
4692 }
4693 entry->morphcode = mystrdup(chunk.c_str());
4694 if (!entry->morphcode)
4695 return false;
4696 }
4697 break;
4698 }
4699 default:
4700 break;
4701 }
4702 i++;
4703 start_piece = mystrsep(nl, iter);
4704 }
4705 // check to make sure we parsed enough pieces
4706 if (np < 4) {
4707 char* err = pHMgr->encode_flag(aflag);
4708 if (err) {
4709 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4710 af->getlinenum(), err);
4711 free(err);
4712 }
4713 return false;
4714 }
4715
4716 #ifdef DEBUG
4717 // detect unnecessary fields, excepting comments
4718 if (basefieldnum) {
4719 int fieldnum =
4720 !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
4721 if (fieldnum != basefieldnum)
4722 HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n",
4723 af->getlinenum());
4724 } else {
4725 basefieldnum =
4726 !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
4727 }
4728 #endif
4729 }
4730
4731 // now create SfxEntry or PfxEntry objects and use links to
4732 // build an ordered (sorted by affix string) list
4733 std::vector<AffEntry*>::iterator start = affentries.begin();
4734 std::vector<AffEntry*>::iterator end = affentries.end();
4735 for (std::vector<AffEntry*>::iterator affentry = start; affentry != end; ++affentry) {
4736 if (at == 'P') {
4737 build_pfxtree(static_cast<PfxEntry*>(*affentry));
4738 } else {
4739 build_sfxtree(static_cast<SfxEntry*>(*affentry));
4740 }
4741 }
4742
4743 //contents belong to AffixMgr now
4744 affentries.release();
4745
4746 return true;
4747 }
4748
redundant_condition(char ft,const char * strip,int stripl,const char * cond,int linenum)4749 int AffixMgr::redundant_condition(char ft,
4750 const char* strip,
4751 int stripl,
4752 const char* cond,
4753 int linenum) {
4754 int condl = strlen(cond);
4755 int i;
4756 int j;
4757 int neg;
4758 int in;
4759 if (ft == 'P') { // prefix
4760 if (strncmp(strip, cond, condl) == 0)
4761 return 1;
4762 if (utf8) {
4763 } else {
4764 for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
4765 if (cond[j] != '[') {
4766 if (cond[j] != strip[i]) {
4767 HUNSPELL_WARNING(stderr,
4768 "warning: line %d: incompatible stripping "
4769 "characters and condition\n",
4770 linenum);
4771 return 0;
4772 }
4773 } else {
4774 neg = (cond[j + 1] == '^') ? 1 : 0;
4775 in = 0;
4776 do {
4777 j++;
4778 if (strip[i] == cond[j])
4779 in = 1;
4780 } while ((j < (condl - 1)) && (cond[j] != ']'));
4781 if (j == (condl - 1) && (cond[j] != ']')) {
4782 HUNSPELL_WARNING(stderr,
4783 "error: line %d: missing ] in condition:\n%s\n",
4784 linenum, cond);
4785 return 0;
4786 }
4787 if ((!neg && !in) || (neg && in)) {
4788 HUNSPELL_WARNING(stderr,
4789 "warning: line %d: incompatible stripping "
4790 "characters and condition\n",
4791 linenum);
4792 return 0;
4793 }
4794 }
4795 }
4796 if (j >= condl)
4797 return 1;
4798 }
4799 } else { // suffix
4800 if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0)
4801 return 1;
4802 if (utf8) {
4803 } else {
4804 for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
4805 if (cond[j] != ']') {
4806 if (cond[j] != strip[i]) {
4807 HUNSPELL_WARNING(stderr,
4808 "warning: line %d: incompatible stripping "
4809 "characters and condition\n",
4810 linenum);
4811 return 0;
4812 }
4813 } else {
4814 in = 0;
4815 do {
4816 j--;
4817 if (strip[i] == cond[j])
4818 in = 1;
4819 } while ((j > 0) && (cond[j] != '['));
4820 if ((j == 0) && (cond[j] != '[')) {
4821 HUNSPELL_WARNING(stderr,
4822 "error: line: %d: missing ] in condition:\n%s\n",
4823 linenum, cond);
4824 return 0;
4825 }
4826 neg = (cond[j + 1] == '^') ? 1 : 0;
4827 if ((!neg && !in) || (neg && in)) {
4828 HUNSPELL_WARNING(stderr,
4829 "warning: line %d: incompatible stripping "
4830 "characters and condition\n",
4831 linenum);
4832 return 0;
4833 }
4834 }
4835 }
4836 if (j < 0)
4837 return 1;
4838 }
4839 }
4840 return 0;
4841 }
4842
get_suffix_words(short unsigned * suff,int len,const char * root_word)4843 std::vector<std::string> AffixMgr::get_suffix_words(short unsigned* suff,
4844 int len,
4845 const char* root_word) {
4846 std::vector<std::string> slst;
4847 short unsigned* start_ptr = suff;
4848 for (int j = 0; j < SETSIZE; j++) {
4849 SfxEntry* ptr = sStart[j];
4850 while (ptr) {
4851 suff = start_ptr;
4852 for (int i = 0; i < len; i++) {
4853 if ((*suff) == ptr->getFlag()) {
4854 std::string nw(root_word);
4855 nw.append(ptr->getAffix());
4856 hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, 0, 0, 0);
4857 if (ht) {
4858 slst.push_back(nw);
4859 }
4860 }
4861 suff++;
4862 }
4863 ptr = ptr->getNext();
4864 }
4865 }
4866 return slst;
4867 }
4868