1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2017 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37 /*
38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 * And Contributors. All rights reserved.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 *
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 *
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 *
52 * 3. All modifications to the source code must be clearly marked as
53 * such. Binary redistributions based on modified source code
54 * must be clearly marked as modified versions in the documentation
55 * and/or other materials provided with the distribution.
56 *
57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71 #include <stdlib.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <ctype.h>
75
76 #include <algorithm>
77 #include <limits>
78 #include <string>
79 #include <vector>
80
81 #include "affixmgr.hxx"
82 #include "affentry.hxx"
83 #include "langnum.hxx"
84
85 #include "csutil.hxx"
86
AffixMgr(const char * affpath,const std::vector<HashMgr * > & ptr,const char * key)87 AffixMgr::AffixMgr(const char* affpath,
88 const std::vector<HashMgr*>& ptr,
89 const char* key)
90 : alldic(ptr)
91 , pHMgr(ptr[0]) {
92
93 // register hash manager and load affix data from aff file
94 csconv = NULL;
95 utf8 = 0;
96 complexprefixes = 0;
97 parsedmaptable = false;
98 parsedbreaktable = false;
99 parsedrep = false;
100 iconvtable = NULL;
101 oconvtable = NULL;
102 // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
103 simplifiedcpd = 0;
104 parsedcheckcpd = false;
105 parseddefcpd = false;
106 phone = NULL;
107 compoundflag = FLAG_NULL; // permits word in compound forms
108 compoundbegin = FLAG_NULL; // may be first word in compound forms
109 compoundmiddle = FLAG_NULL; // may be middle word in compound forms
110 compoundend = FLAG_NULL; // may be last word in compound forms
111 compoundroot = FLAG_NULL; // compound word signing flag
112 compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
113 compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
114 compoundmoresuffixes = 0; // allow more suffixes within compound words
115 checkcompounddup = 0; // forbid double words in compounds
116 checkcompoundrep = 0; // forbid bad compounds (may be non compound word with
117 // a REP substitution)
118 checkcompoundcase =
119 0; // forbid upper and lowercase combinations at word bounds
120 checkcompoundtriple = 0; // forbid compounds with triple letters
121 simplifiedtriple = 0; // allow simplified triple letters in compounds
122 // (Schiff+fahrt -> Schiffahrt)
123 forbiddenword = FORBIDDENWORD; // forbidden word signing flag
124 nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
125 nongramsuggest = FLAG_NULL;
126 langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
127 needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
128 cpdwordmax = -1; // default: unlimited wordcount in compound words
129 cpdmin = -1; // undefined
130 cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
131 pfxappnd = NULL; // previous prefix for counting syllables of the prefix BUG
132 sfxappnd = NULL; // previous suffix for counting syllables of the suffix BUG
133 sfxextra = 0; // modifier for syllable count of sfxappnd BUG
134 checknum = 0; // checking numbers, and word with numbers
135 havecontclass = 0; // flags of possible continuing classes (double affix)
136 // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
137 // in morhological description in dictionary file. It's often combined with
138 // PSEUDOROOT.
139 lemma_present = FLAG_NULL;
140 circumfix = FLAG_NULL;
141 onlyincompound = FLAG_NULL;
142 maxngramsugs = -1; // undefined
143 maxdiff = -1; // undefined
144 onlymaxdiff = 0;
145 maxcpdsugs = -1; // undefined
146 nosplitsugs = 0;
147 sugswithdots = 0;
148 keepcase = 0;
149 forceucase = 0;
150 warn = 0;
151 forbidwarn = 0;
152 checksharps = 0;
153 substandard = FLAG_NULL;
154 fullstrip = 0;
155
156 sfx = NULL;
157 pfx = NULL;
158
159 for (int i = 0; i < SETSIZE; i++) {
160 pStart[i] = NULL;
161 sStart[i] = NULL;
162 pFlag[i] = NULL;
163 sFlag[i] = NULL;
164 }
165
166 for (int j = 0; j < CONTSIZE; j++) {
167 contclasses[j] = 0;
168 }
169
170 if (parse_file(affpath, key)) {
171 HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n", affpath);
172 }
173
174 if (cpdmin == -1)
175 cpdmin = MINCPDLEN;
176 }
177
~AffixMgr()178 AffixMgr::~AffixMgr() {
179 // pass through linked prefix entries and clean up
180 for (int i = 0; i < SETSIZE; i++) {
181 pFlag[i] = NULL;
182 PfxEntry* ptr = pStart[i];
183 PfxEntry* nptr = NULL;
184 while (ptr) {
185 nptr = ptr->getNext();
186 delete (ptr);
187 ptr = nptr;
188 nptr = NULL;
189 }
190 }
191
192 // pass through linked suffix entries and clean up
193 for (int j = 0; j < SETSIZE; j++) {
194 sFlag[j] = NULL;
195 SfxEntry* ptr = sStart[j];
196 SfxEntry* nptr = NULL;
197 while (ptr) {
198 nptr = ptr->getNext();
199 delete (ptr);
200 ptr = nptr;
201 nptr = NULL;
202 }
203 sStart[j] = NULL;
204 }
205
206 delete iconvtable;
207 delete oconvtable;
208 delete phone;
209
210 FREE_FLAG(compoundflag);
211 FREE_FLAG(compoundbegin);
212 FREE_FLAG(compoundmiddle);
213 FREE_FLAG(compoundend);
214 FREE_FLAG(compoundpermitflag);
215 FREE_FLAG(compoundforbidflag);
216 FREE_FLAG(compoundroot);
217 FREE_FLAG(forbiddenword);
218 FREE_FLAG(nosuggest);
219 FREE_FLAG(nongramsuggest);
220 FREE_FLAG(needaffix);
221 FREE_FLAG(lemma_present);
222 FREE_FLAG(circumfix);
223 FREE_FLAG(onlyincompound);
224
225 cpdwordmax = 0;
226 pHMgr = NULL;
227 cpdmin = 0;
228 cpdmaxsyllable = 0;
229 free_utf_tbl();
230 checknum = 0;
231 #ifdef MOZILLA_CLIENT
232 delete[] csconv;
233 #endif
234 }
235
finishFileMgr(FileMgr * afflst)236 void AffixMgr::finishFileMgr(FileMgr* afflst) {
237 delete afflst;
238
239 // convert affix trees to sorted list
240 process_pfx_tree_to_list();
241 process_sfx_tree_to_list();
242 }
243
244 // read in aff file and build up prefix and suffix entry objects
parse_file(const char * affpath,const char * key)245 int AffixMgr::parse_file(const char* affpath, const char* key) {
246
247 // checking flag duplication
248 char dupflags[CONTSIZE];
249 char dupflags_ini = 1;
250
251 // first line indicator for removing byte order mark
252 int firstline = 1;
253
254 // open the affix file
255 FileMgr* afflst = new FileMgr(affpath, key);
256 if (!afflst) {
257 HUNSPELL_WARNING(
258 stderr, "error: could not open affix description file %s\n", affpath);
259 return 1;
260 }
261
262 // step one is to parse the affix file building up the internal
263 // affix data structures
264
265 // read in each line ignoring any that do not
266 // start with a known line type indicator
267 std::string line;
268 while (afflst->getline(line)) {
269 mychomp(line);
270
271 /* remove byte order mark */
272 if (firstline) {
273 firstline = 0;
274 // Affix file begins with byte order mark: possible incompatibility with
275 // old Hunspell versions
276 if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
277 line.erase(0, 3);
278 }
279 }
280
281 /* parse in the keyboard string */
282 if (line.compare(0, 3, "KEY", 3) == 0) {
283 if (!parse_string(line, keystring, afflst->getlinenum())) {
284 finishFileMgr(afflst);
285 return 1;
286 }
287 }
288
289 /* parse in the try string */
290 if (line.compare(0, 3, "TRY", 3) == 0) {
291 if (!parse_string(line, trystring, afflst->getlinenum())) {
292 finishFileMgr(afflst);
293 return 1;
294 }
295 }
296
297 /* parse in the name of the character set used by the .dict and .aff */
298 if (line.compare(0, 3, "SET", 3) == 0) {
299 if (!parse_string(line, encoding, afflst->getlinenum())) {
300 finishFileMgr(afflst);
301 return 1;
302 }
303 if (encoding == "UTF-8") {
304 utf8 = 1;
305 #ifndef OPENOFFICEORG
306 #ifndef MOZILLA_CLIENT
307 initialize_utf_tbl();
308 #endif
309 #endif
310 }
311 }
312
313 /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left
314 * writing system */
315 if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
316 complexprefixes = 1;
317
318 /* parse in the flag used by the controlled compound words */
319 if (line.compare(0, 12, "COMPOUNDFLAG", 12) == 0) {
320 if (!parse_flag(line, &compoundflag, afflst)) {
321 finishFileMgr(afflst);
322 return 1;
323 }
324 }
325
326 /* parse in the flag used by compound words */
327 if (line.compare(0, 13, "COMPOUNDBEGIN", 13) == 0) {
328 if (complexprefixes) {
329 if (!parse_flag(line, &compoundend, afflst)) {
330 finishFileMgr(afflst);
331 return 1;
332 }
333 } else {
334 if (!parse_flag(line, &compoundbegin, afflst)) {
335 finishFileMgr(afflst);
336 return 1;
337 }
338 }
339 }
340
341 /* parse in the flag used by compound words */
342 if (line.compare(0, 14, "COMPOUNDMIDDLE", 14) == 0) {
343 if (!parse_flag(line, &compoundmiddle, afflst)) {
344 finishFileMgr(afflst);
345 return 1;
346 }
347 }
348
349 /* parse in the flag used by compound words */
350 if (line.compare(0, 11, "COMPOUNDEND", 11) == 0) {
351 if (complexprefixes) {
352 if (!parse_flag(line, &compoundbegin, afflst)) {
353 finishFileMgr(afflst);
354 return 1;
355 }
356 } else {
357 if (!parse_flag(line, &compoundend, afflst)) {
358 finishFileMgr(afflst);
359 return 1;
360 }
361 }
362 }
363
364 /* parse in the data used by compound_check() method */
365 if (line.compare(0, 15, "COMPOUNDWORDMAX", 15) == 0) {
366 if (!parse_num(line, &cpdwordmax, afflst)) {
367 finishFileMgr(afflst);
368 return 1;
369 }
370 }
371
372 /* parse in the flag sign compounds in dictionary */
373 if (line.compare(0, 12, "COMPOUNDROOT", 12) == 0) {
374 if (!parse_flag(line, &compoundroot, afflst)) {
375 finishFileMgr(afflst);
376 return 1;
377 }
378 }
379
380 /* parse in the flag used by compound_check() method */
381 if (line.compare(0, 18, "COMPOUNDPERMITFLAG", 18) == 0) {
382 if (!parse_flag(line, &compoundpermitflag, afflst)) {
383 finishFileMgr(afflst);
384 return 1;
385 }
386 }
387
388 /* parse in the flag used by compound_check() method */
389 if (line.compare(0, 18, "COMPOUNDFORBIDFLAG", 18) == 0) {
390 if (!parse_flag(line, &compoundforbidflag, afflst)) {
391 finishFileMgr(afflst);
392 return 1;
393 }
394 }
395
396 if (line.compare(0, 20, "COMPOUNDMORESUFFIXES", 20) == 0) {
397 compoundmoresuffixes = 1;
398 }
399
400 if (line.compare(0, 16, "CHECKCOMPOUNDDUP", 16) == 0) {
401 checkcompounddup = 1;
402 }
403
404 if (line.compare(0, 16, "CHECKCOMPOUNDREP", 16) == 0) {
405 checkcompoundrep = 1;
406 }
407
408 if (line.compare(0, 19, "CHECKCOMPOUNDTRIPLE", 19) == 0) {
409 checkcompoundtriple = 1;
410 }
411
412 if (line.compare(0, 16, "SIMPLIFIEDTRIPLE", 16) == 0) {
413 simplifiedtriple = 1;
414 }
415
416 if (line.compare(0, 17, "CHECKCOMPOUNDCASE", 17) == 0) {
417 checkcompoundcase = 1;
418 }
419
420 if (line.compare(0, 9, "NOSUGGEST", 9) == 0) {
421 if (!parse_flag(line, &nosuggest, afflst)) {
422 finishFileMgr(afflst);
423 return 1;
424 }
425 }
426
427 if (line.compare(0, 14, "NONGRAMSUGGEST", 14) == 0) {
428 if (!parse_flag(line, &nongramsuggest, afflst)) {
429 finishFileMgr(afflst);
430 return 1;
431 }
432 }
433
434 /* parse in the flag used by forbidden words */
435 if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
436 if (!parse_flag(line, &forbiddenword, afflst)) {
437 finishFileMgr(afflst);
438 return 1;
439 }
440 }
441
442 /* parse in the flag used by forbidden words */
443 if (line.compare(0, 13, "LEMMA_PRESENT", 13) == 0) {
444 if (!parse_flag(line, &lemma_present, afflst)) {
445 finishFileMgr(afflst);
446 return 1;
447 }
448 }
449
450 /* parse in the flag used by circumfixes */
451 if (line.compare(0, 9, "CIRCUMFIX", 9) == 0) {
452 if (!parse_flag(line, &circumfix, afflst)) {
453 finishFileMgr(afflst);
454 return 1;
455 }
456 }
457
458 /* parse in the flag used by fogemorphemes */
459 if (line.compare(0, 14, "ONLYINCOMPOUND", 14) == 0) {
460 if (!parse_flag(line, &onlyincompound, afflst)) {
461 finishFileMgr(afflst);
462 return 1;
463 }
464 }
465
466 /* parse in the flag used by `needaffixs' */
467 if (line.compare(0, 10, "PSEUDOROOT", 10) == 0) {
468 if (!parse_flag(line, &needaffix, afflst)) {
469 finishFileMgr(afflst);
470 return 1;
471 }
472 }
473
474 /* parse in the flag used by `needaffixs' */
475 if (line.compare(0, 9, "NEEDAFFIX", 9) == 0) {
476 if (!parse_flag(line, &needaffix, afflst)) {
477 finishFileMgr(afflst);
478 return 1;
479 }
480 }
481
482 /* parse in the minimal length for words in compounds */
483 if (line.compare(0, 11, "COMPOUNDMIN", 11) == 0) {
484 if (!parse_num(line, &cpdmin, afflst)) {
485 finishFileMgr(afflst);
486 return 1;
487 }
488 if (cpdmin < 1)
489 cpdmin = 1;
490 }
491
492 /* parse in the max. words and syllables in compounds */
493 if (line.compare(0, 16, "COMPOUNDSYLLABLE", 16) == 0) {
494 if (!parse_cpdsyllable(line, afflst)) {
495 finishFileMgr(afflst);
496 return 1;
497 }
498 }
499
500 /* parse in the flag used by compound_check() method */
501 if (line.compare(0, 11, "SYLLABLENUM", 11) == 0) {
502 if (!parse_string(line, cpdsyllablenum, afflst->getlinenum())) {
503 finishFileMgr(afflst);
504 return 1;
505 }
506 }
507
508 /* parse in the flag used by the controlled compound words */
509 if (line.compare(0, 8, "CHECKNUM", 8) == 0) {
510 checknum = 1;
511 }
512
513 /* parse in the extra word characters */
514 if (line.compare(0, 9, "WORDCHARS", 9) == 0) {
515 if (!parse_array(line, wordchars, wordchars_utf16,
516 utf8, afflst->getlinenum())) {
517 finishFileMgr(afflst);
518 return 1;
519 }
520 }
521
522 /* parse in the ignored characters (for example, Arabic optional diacretics
523 * charachters */
524 if (line.compare(0, 6, "IGNORE", 6) == 0) {
525 if (!parse_array(line, ignorechars, ignorechars_utf16,
526 utf8, afflst->getlinenum())) {
527 finishFileMgr(afflst);
528 return 1;
529 }
530 }
531
532 /* parse in the typical fault correcting table */
533 if (line.compare(0, 3, "REP", 3) == 0) {
534 if (!parse_reptable(line, afflst)) {
535 finishFileMgr(afflst);
536 return 1;
537 }
538 }
539
540 /* parse in the input conversion table */
541 if (line.compare(0, 5, "ICONV", 5) == 0) {
542 if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) {
543 finishFileMgr(afflst);
544 return 1;
545 }
546 }
547
548 /* parse in the input conversion table */
549 if (line.compare(0, 5, "OCONV", 5) == 0) {
550 if (!parse_convtable(line, afflst, &oconvtable, "OCONV")) {
551 finishFileMgr(afflst);
552 return 1;
553 }
554 }
555
556 /* parse in the phonetic translation table */
557 if (line.compare(0, 5, "PHONE", 5) == 0) {
558 if (!parse_phonetable(line, afflst)) {
559 finishFileMgr(afflst);
560 return 1;
561 }
562 }
563
564 /* parse in the checkcompoundpattern table */
565 if (line.compare(0, 20, "CHECKCOMPOUNDPATTERN", 20) == 0) {
566 if (!parse_checkcpdtable(line, afflst)) {
567 finishFileMgr(afflst);
568 return 1;
569 }
570 }
571
572 /* parse in the defcompound table */
573 if (line.compare(0, 12, "COMPOUNDRULE", 12) == 0) {
574 if (!parse_defcpdtable(line, afflst)) {
575 finishFileMgr(afflst);
576 return 1;
577 }
578 }
579
580 /* parse in the related character map table */
581 if (line.compare(0, 3, "MAP", 3) == 0) {
582 if (!parse_maptable(line, afflst)) {
583 finishFileMgr(afflst);
584 return 1;
585 }
586 }
587
588 /* parse in the word breakpoints table */
589 if (line.compare(0, 5, "BREAK", 5) == 0) {
590 if (!parse_breaktable(line, afflst)) {
591 finishFileMgr(afflst);
592 return 1;
593 }
594 }
595
596 /* parse in the language for language specific codes */
597 if (line.compare(0, 4, "LANG", 4) == 0) {
598 if (!parse_string(line, lang, afflst->getlinenum())) {
599 finishFileMgr(afflst);
600 return 1;
601 }
602 langnum = get_lang_num(lang);
603 }
604
605 if (line.compare(0, 7, "VERSION", 7) == 0) {
606 size_t startpos = line.find_first_not_of(" \t", 7);
607 if (startpos != std::string::npos) {
608 version = line.substr(startpos);
609 }
610 }
611
612 if (line.compare(0, 12, "MAXNGRAMSUGS", 12) == 0) {
613 if (!parse_num(line, &maxngramsugs, afflst)) {
614 finishFileMgr(afflst);
615 return 1;
616 }
617 }
618
619 if (line.compare(0, 11, "ONLYMAXDIFF", 11) == 0)
620 onlymaxdiff = 1;
621
622 if (line.compare(0, 7, "MAXDIFF", 7) == 0) {
623 if (!parse_num(line, &maxdiff, afflst)) {
624 finishFileMgr(afflst);
625 return 1;
626 }
627 }
628
629 if (line.compare(0, 10, "MAXCPDSUGS", 10) == 0) {
630 if (!parse_num(line, &maxcpdsugs, afflst)) {
631 finishFileMgr(afflst);
632 return 1;
633 }
634 }
635
636 if (line.compare(0, 11, "NOSPLITSUGS", 11) == 0) {
637 nosplitsugs = 1;
638 }
639
640 if (line.compare(0, 9, "FULLSTRIP", 9) == 0) {
641 fullstrip = 1;
642 }
643
644 if (line.compare(0, 12, "SUGSWITHDOTS", 12) == 0) {
645 sugswithdots = 1;
646 }
647
648 /* parse in the flag used by forbidden words */
649 if (line.compare(0, 8, "KEEPCASE", 8) == 0) {
650 if (!parse_flag(line, &keepcase, afflst)) {
651 finishFileMgr(afflst);
652 return 1;
653 }
654 }
655
656 /* parse in the flag used by `forceucase' */
657 if (line.compare(0, 10, "FORCEUCASE", 10) == 0) {
658 if (!parse_flag(line, &forceucase, afflst)) {
659 finishFileMgr(afflst);
660 return 1;
661 }
662 }
663
664 /* parse in the flag used by `warn' */
665 if (line.compare(0, 4, "WARN", 4) == 0) {
666 if (!parse_flag(line, &warn, afflst)) {
667 finishFileMgr(afflst);
668 return 1;
669 }
670 }
671
672 if (line.compare(0, 10, "FORBIDWARN", 10) == 0) {
673 forbidwarn = 1;
674 }
675
676 /* parse in the flag used by the affix generator */
677 if (line.compare(0, 11, "SUBSTANDARD", 11) == 0) {
678 if (!parse_flag(line, &substandard, afflst)) {
679 finishFileMgr(afflst);
680 return 1;
681 }
682 }
683
684 if (line.compare(0, 11, "CHECKSHARPS", 11) == 0) {
685 checksharps = 1;
686 }
687
688 /* parse this affix: P - prefix, S - suffix */
689 // affix type
690 char ft = ' ';
691 if (line.compare(0, 3, "PFX", 3) == 0)
692 ft = complexprefixes ? 'S' : 'P';
693 if (line.compare(0, 3, "SFX", 3) == 0)
694 ft = complexprefixes ? 'P' : 'S';
695 if (ft != ' ') {
696 if (dupflags_ini) {
697 memset(dupflags, 0, sizeof(dupflags));
698 dupflags_ini = 0;
699 }
700 if (!parse_affix(line, ft, afflst, dupflags)) {
701 finishFileMgr(afflst);
702 return 1;
703 }
704 }
705 }
706
707 finishFileMgr(afflst);
708 // affix trees are sorted now
709
710 // now we can speed up performance greatly taking advantage of the
711 // relationship between the affixes and the idea of "subsets".
712
713 // View each prefix as a potential leading subset of another and view
714 // each suffix (reversed) as a potential trailing subset of another.
715
716 // To illustrate this relationship if we know the prefix "ab" is found in the
717 // word to examine, only prefixes that "ab" is a leading subset of need be
718 // examined.
719 // Furthermore is "ab" is not present then none of the prefixes that "ab" is
720 // is a subset need be examined.
721 // The same argument goes for suffix string that are reversed.
722
723 // Then to top this off why not examine the first char of the word to quickly
724 // limit the set of prefixes to examine (i.e. the prefixes to examine must
725 // be leading supersets of the first character of the word (if they exist)
726
727 // To take advantage of this "subset" relationship, we need to add two links
728 // from entry. One to take next if the current prefix is found (call it
729 // nexteq)
730 // and one to take next if the current prefix is not found (call it nextne).
731
732 // Since we have built ordered lists, all that remains is to properly
733 // initialize
734 // the nextne and nexteq pointers that relate them
735
736 process_pfx_order();
737 process_sfx_order();
738
739 /* get encoding for CHECKCOMPOUNDCASE */
740 if (!utf8) {
741 csconv = get_current_cs(get_encoding());
742 for (int i = 0; i <= 255; i++) {
743 if ((csconv[i].cupper != csconv[i].clower) &&
744 (wordchars.find((char)i) == std::string::npos)) {
745 wordchars.push_back((char)i);
746 }
747 }
748
749 }
750
751 // default BREAK definition
752 if (!parsedbreaktable) {
753 breaktable.push_back("-");
754 breaktable.push_back("^-");
755 breaktable.push_back("-$");
756 parsedbreaktable = true;
757 }
758 return 0;
759 }
760
761 // we want to be able to quickly access prefix information
762 // both by prefix flag, and sorted by prefix string itself
763 // so we need to set up two indexes
764
build_pfxtree(PfxEntry * pfxptr)765 int AffixMgr::build_pfxtree(PfxEntry* pfxptr) {
766 PfxEntry* ptr;
767 PfxEntry* pptr;
768 PfxEntry* ep = pfxptr;
769
770 // get the right starting points
771 const char* key = ep->getKey();
772 const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
773
774 // first index by flag which must exist
775 ptr = pFlag[flg];
776 ep->setFlgNxt(ptr);
777 pFlag[flg] = ep;
778
779 // handle the special case of null affix string
780 if (strlen(key) == 0) {
781 // always inset them at head of list at element 0
782 ptr = pStart[0];
783 ep->setNext(ptr);
784 pStart[0] = ep;
785 return 0;
786 }
787
788 // now handle the normal case
789 ep->setNextEQ(NULL);
790 ep->setNextNE(NULL);
791
792 unsigned char sp = *((const unsigned char*)key);
793 ptr = pStart[sp];
794
795 // handle the first insert
796 if (!ptr) {
797 pStart[sp] = ep;
798 return 0;
799 }
800
801 // otherwise use binary tree insertion so that a sorted
802 // list can easily be generated later
803 pptr = NULL;
804 for (;;) {
805 pptr = ptr;
806 if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
807 ptr = ptr->getNextEQ();
808 if (!ptr) {
809 pptr->setNextEQ(ep);
810 break;
811 }
812 } else {
813 ptr = ptr->getNextNE();
814 if (!ptr) {
815 pptr->setNextNE(ep);
816 break;
817 }
818 }
819 }
820 return 0;
821 }
822
823 // we want to be able to quickly access suffix information
824 // both by suffix flag, and sorted by the reverse of the
825 // suffix string itself; so we need to set up two indexes
build_sfxtree(SfxEntry * sfxptr)826 int AffixMgr::build_sfxtree(SfxEntry* sfxptr) {
827
828 sfxptr->initReverseWord();
829
830 SfxEntry* ptr;
831 SfxEntry* pptr;
832 SfxEntry* ep = sfxptr;
833
834 /* get the right starting point */
835 const char* key = ep->getKey();
836 const unsigned char flg = (unsigned char)(ep->getFlag() & 0x00FF);
837
838 // first index by flag which must exist
839 ptr = sFlag[flg];
840 ep->setFlgNxt(ptr);
841 sFlag[flg] = ep;
842
843 // next index by affix string
844
845 // handle the special case of null affix string
846 if (strlen(key) == 0) {
847 // always inset them at head of list at element 0
848 ptr = sStart[0];
849 ep->setNext(ptr);
850 sStart[0] = ep;
851 return 0;
852 }
853
854 // now handle the normal case
855 ep->setNextEQ(NULL);
856 ep->setNextNE(NULL);
857
858 unsigned char sp = *((const unsigned char*)key);
859 ptr = sStart[sp];
860
861 // handle the first insert
862 if (!ptr) {
863 sStart[sp] = ep;
864 return 0;
865 }
866
867 // otherwise use binary tree insertion so that a sorted
868 // list can easily be generated later
869 pptr = NULL;
870 for (;;) {
871 pptr = ptr;
872 if (strcmp(ep->getKey(), ptr->getKey()) <= 0) {
873 ptr = ptr->getNextEQ();
874 if (!ptr) {
875 pptr->setNextEQ(ep);
876 break;
877 }
878 } else {
879 ptr = ptr->getNextNE();
880 if (!ptr) {
881 pptr->setNextNE(ep);
882 break;
883 }
884 }
885 }
886 return 0;
887 }
888
889 // convert from binary tree to sorted list
process_pfx_tree_to_list()890 int AffixMgr::process_pfx_tree_to_list() {
891 for (int i = 1; i < SETSIZE; i++) {
892 pStart[i] = process_pfx_in_order(pStart[i], NULL);
893 }
894 return 0;
895 }
896
process_pfx_in_order(PfxEntry * ptr,PfxEntry * nptr)897 PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr) {
898 if (ptr) {
899 nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
900 ptr->setNext(nptr);
901 nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
902 }
903 return nptr;
904 }
905
906 // convert from binary tree to sorted list
process_sfx_tree_to_list()907 int AffixMgr::process_sfx_tree_to_list() {
908 for (int i = 1; i < SETSIZE; i++) {
909 sStart[i] = process_sfx_in_order(sStart[i], NULL);
910 }
911 return 0;
912 }
913
process_sfx_in_order(SfxEntry * ptr,SfxEntry * nptr)914 SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr) {
915 if (ptr) {
916 nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
917 ptr->setNext(nptr);
918 nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
919 }
920 return nptr;
921 }
922
923 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
924 // using the idea of leading subsets this time
process_pfx_order()925 int AffixMgr::process_pfx_order() {
926 PfxEntry* ptr;
927
928 // loop through each prefix list starting point
929 for (int i = 1; i < SETSIZE; i++) {
930 ptr = pStart[i];
931
932 // look through the remainder of the list
933 // and find next entry with affix that
934 // the current one is not a subset of
935 // mark that as destination for NextNE
936 // use next in list that you are a subset
937 // of as NextEQ
938
939 for (; ptr != NULL; ptr = ptr->getNext()) {
940 PfxEntry* nptr = ptr->getNext();
941 for (; nptr != NULL; nptr = nptr->getNext()) {
942 if (!isSubset(ptr->getKey(), nptr->getKey()))
943 break;
944 }
945 ptr->setNextNE(nptr);
946 ptr->setNextEQ(NULL);
947 if ((ptr->getNext()) &&
948 isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
949 ptr->setNextEQ(ptr->getNext());
950 }
951
952 // now clean up by adding smart search termination strings:
953 // if you are already a superset of the previous prefix
954 // but not a subset of the next, search can end here
955 // so set NextNE properly
956
957 ptr = pStart[i];
958 for (; ptr != NULL; ptr = ptr->getNext()) {
959 PfxEntry* nptr = ptr->getNext();
960 PfxEntry* mptr = NULL;
961 for (; nptr != NULL; nptr = nptr->getNext()) {
962 if (!isSubset(ptr->getKey(), nptr->getKey()))
963 break;
964 mptr = nptr;
965 }
966 if (mptr)
967 mptr->setNextNE(NULL);
968 }
969 }
970 return 0;
971 }
972
973 // initialize the SfxEntry links NextEQ and NextNE to speed searching
974 // using the idea of leading subsets this time
process_sfx_order()975 int AffixMgr::process_sfx_order() {
976 SfxEntry* ptr;
977
978 // loop through each prefix list starting point
979 for (int i = 1; i < SETSIZE; i++) {
980 ptr = sStart[i];
981
982 // look through the remainder of the list
983 // and find next entry with affix that
984 // the current one is not a subset of
985 // mark that as destination for NextNE
986 // use next in list that you are a subset
987 // of as NextEQ
988
989 for (; ptr != NULL; ptr = ptr->getNext()) {
990 SfxEntry* nptr = ptr->getNext();
991 for (; nptr != NULL; nptr = nptr->getNext()) {
992 if (!isSubset(ptr->getKey(), nptr->getKey()))
993 break;
994 }
995 ptr->setNextNE(nptr);
996 ptr->setNextEQ(NULL);
997 if ((ptr->getNext()) &&
998 isSubset(ptr->getKey(), (ptr->getNext())->getKey()))
999 ptr->setNextEQ(ptr->getNext());
1000 }
1001
1002 // now clean up by adding smart search termination strings:
1003 // if you are already a superset of the previous suffix
1004 // but not a subset of the next, search can end here
1005 // so set NextNE properly
1006
1007 ptr = sStart[i];
1008 for (; ptr != NULL; ptr = ptr->getNext()) {
1009 SfxEntry* nptr = ptr->getNext();
1010 SfxEntry* mptr = NULL;
1011 for (; nptr != NULL; nptr = nptr->getNext()) {
1012 if (!isSubset(ptr->getKey(), nptr->getKey()))
1013 break;
1014 mptr = nptr;
1015 }
1016 if (mptr)
1017 mptr->setNextNE(NULL);
1018 }
1019 }
1020 return 0;
1021 }
1022
1023 // add flags to the result for dictionary debugging
debugflag(std::string & result,unsigned short flag)1024 std::string& AffixMgr::debugflag(std::string& result, unsigned short flag) {
1025 char* st = encode_flag(flag);
1026 result.append(" ");
1027 result.append(MORPH_FLAG);
1028 if (st) {
1029 result.append(st);
1030 free(st);
1031 }
1032 return result;
1033 }
1034
1035 // calculate the character length of the condition
condlen(const char * st)1036 int AffixMgr::condlen(const char* st) {
1037 int l = 0;
1038 bool group = false;
1039 for (; *st; st++) {
1040 if (*st == '[') {
1041 group = true;
1042 l++;
1043 } else if (*st == ']')
1044 group = false;
1045 else if (!group && (!utf8 || (!(*st & 0x80) || ((*st & 0xc0) == 0x80))))
1046 l++;
1047 }
1048 return l;
1049 }
1050
encodeit(AffEntry & entry,const char * cs)1051 int AffixMgr::encodeit(AffEntry& entry, const char* cs) {
1052 if (strcmp(cs, ".") != 0) {
1053 entry.numconds = (char)condlen(cs);
1054 const size_t cslen = strlen(cs);
1055 const size_t short_part = std::min<size_t>(MAXCONDLEN, cslen);
1056 memcpy(entry.c.conds, cs, short_part);
1057 if (short_part < MAXCONDLEN) {
1058 //blank out the remaining space
1059 memset(entry.c.conds + short_part, 0, MAXCONDLEN - short_part);
1060 } else if (cs[MAXCONDLEN]) {
1061 //there is more conditions than fit in fixed space, so its
1062 //a long condition
1063 entry.opts += aeLONGCOND;
1064 entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
1065 if (!entry.c.l.conds2)
1066 return 1;
1067 }
1068 } else {
1069 entry.numconds = 0;
1070 entry.c.conds[0] = '\0';
1071 }
1072 return 0;
1073 }
1074
1075 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
isSubset(const char * s1,const char * s2)1076 inline int AffixMgr::isSubset(const char* s1, const char* s2) {
1077 while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
1078 s1++;
1079 s2++;
1080 }
1081 return (*s1 == '\0');
1082 }
1083
1084 // check word for prefixes
prefix_check(const char * word,int len,char in_compound,const FLAG needflag)1085 struct hentry* AffixMgr::prefix_check(const char* word,
1086 int len,
1087 char in_compound,
1088 const FLAG needflag) {
1089 struct hentry* rv = NULL;
1090
1091 pfx = NULL;
1092 pfxappnd = NULL;
1093 sfxappnd = NULL;
1094 sfxextra = 0;
1095
1096 // first handle the special case of 0 length prefixes
1097 PfxEntry* pe = pStart[0];
1098 while (pe) {
1099 if (
1100 // fogemorpheme
1101 ((in_compound != IN_CPD_NOT) ||
1102 !(pe->getCont() &&
1103 (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
1104 // permit prefixes in compounds
1105 ((in_compound != IN_CPD_END) ||
1106 (pe->getCont() &&
1107 (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))) {
1108 // check prefix
1109 rv = pe->checkword(word, len, in_compound, needflag);
1110 if (rv) {
1111 pfx = pe; // BUG: pfx not stateless
1112 return rv;
1113 }
1114 }
1115 pe = pe->getNext();
1116 }
1117
1118 // now handle the general case
1119 unsigned char sp = *((const unsigned char*)word);
1120 PfxEntry* pptr = pStart[sp];
1121
1122 while (pptr) {
1123 if (isSubset(pptr->getKey(), word)) {
1124 if (
1125 // fogemorpheme
1126 ((in_compound != IN_CPD_NOT) ||
1127 !(pptr->getCont() &&
1128 (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
1129 // permit prefixes in compounds
1130 ((in_compound != IN_CPD_END) ||
1131 (pptr->getCont() && (TESTAFF(pptr->getCont(), compoundpermitflag,
1132 pptr->getContLen()))))) {
1133 // check prefix
1134 rv = pptr->checkword(word, len, in_compound, needflag);
1135 if (rv) {
1136 pfx = pptr; // BUG: pfx not stateless
1137 return rv;
1138 }
1139 }
1140 pptr = pptr->getNextEQ();
1141 } else {
1142 pptr = pptr->getNextNE();
1143 }
1144 }
1145
1146 return NULL;
1147 }
1148
1149 // check word for prefixes
prefix_check_twosfx(const char * word,int len,char in_compound,const FLAG needflag)1150 struct hentry* AffixMgr::prefix_check_twosfx(const char* word,
1151 int len,
1152 char in_compound,
1153 const FLAG needflag) {
1154 struct hentry* rv = NULL;
1155
1156 pfx = NULL;
1157 sfxappnd = NULL;
1158 sfxextra = 0;
1159
1160 // first handle the special case of 0 length prefixes
1161 PfxEntry* pe = pStart[0];
1162
1163 while (pe) {
1164 rv = pe->check_twosfx(word, len, in_compound, needflag);
1165 if (rv)
1166 return rv;
1167 pe = pe->getNext();
1168 }
1169
1170 // now handle the general case
1171 unsigned char sp = *((const unsigned char*)word);
1172 PfxEntry* pptr = pStart[sp];
1173
1174 while (pptr) {
1175 if (isSubset(pptr->getKey(), word)) {
1176 rv = pptr->check_twosfx(word, len, in_compound, needflag);
1177 if (rv) {
1178 pfx = pptr;
1179 return rv;
1180 }
1181 pptr = pptr->getNextEQ();
1182 } else {
1183 pptr = pptr->getNextNE();
1184 }
1185 }
1186
1187 return NULL;
1188 }
1189
1190 // check word for prefixes
prefix_check_morph(const char * word,int len,char in_compound,const FLAG needflag)1191 std::string AffixMgr::prefix_check_morph(const char* word,
1192 int len,
1193 char in_compound,
1194 const FLAG needflag) {
1195
1196 std::string result;
1197
1198 pfx = NULL;
1199 sfxappnd = NULL;
1200 sfxextra = 0;
1201
1202 // first handle the special case of 0 length prefixes
1203 PfxEntry* pe = pStart[0];
1204 while (pe) {
1205 std::string st = pe->check_morph(word, len, in_compound, needflag);
1206 if (!st.empty()) {
1207 result.append(st);
1208 }
1209 pe = pe->getNext();
1210 }
1211
1212 // now handle the general case
1213 unsigned char sp = *((const unsigned char*)word);
1214 PfxEntry* pptr = pStart[sp];
1215
1216 while (pptr) {
1217 if (isSubset(pptr->getKey(), word)) {
1218 std::string st = pptr->check_morph(word, len, in_compound, needflag);
1219 if (!st.empty()) {
1220 // fogemorpheme
1221 if ((in_compound != IN_CPD_NOT) ||
1222 !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound,
1223 pptr->getContLen()))))) {
1224 result.append(st);
1225 pfx = pptr;
1226 }
1227 }
1228 pptr = pptr->getNextEQ();
1229 } else {
1230 pptr = pptr->getNextNE();
1231 }
1232 }
1233
1234 return result;
1235 }
1236
1237 // check word for prefixes
prefix_check_twosfx_morph(const char * word,int len,char in_compound,const FLAG needflag)1238 std::string AffixMgr::prefix_check_twosfx_morph(const char* word,
1239 int len,
1240 char in_compound,
1241 const FLAG needflag) {
1242 std::string result;
1243
1244 pfx = NULL;
1245 sfxappnd = NULL;
1246 sfxextra = 0;
1247
1248 // first handle the special case of 0 length prefixes
1249 PfxEntry* pe = pStart[0];
1250 while (pe) {
1251 std::string st = pe->check_twosfx_morph(word, len, in_compound, needflag);
1252 if (!st.empty()) {
1253 result.append(st);
1254 }
1255 pe = pe->getNext();
1256 }
1257
1258 // now handle the general case
1259 unsigned char sp = *((const unsigned char*)word);
1260 PfxEntry* pptr = pStart[sp];
1261
1262 while (pptr) {
1263 if (isSubset(pptr->getKey(), word)) {
1264 std::string st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
1265 if (!st.empty()) {
1266 result.append(st);
1267 pfx = pptr;
1268 }
1269 pptr = pptr->getNextEQ();
1270 } else {
1271 pptr = pptr->getNextNE();
1272 }
1273 }
1274
1275 return result;
1276 }
1277
1278 // Is word a non compound with a REP substitution (see checkcompoundrep)?
cpdrep_check(const char * word,int wl)1279 int AffixMgr::cpdrep_check(const char* word, int wl) {
1280
1281 if ((wl < 2) || reptable.empty())
1282 return 0;
1283
1284 for (size_t i = 0; i < reptable.size(); ++i) {
1285 const char* r = word;
1286 const size_t lenp = reptable[i].pattern.size();
1287 // search every occurence of the pattern in the word
1288 while ((r = strstr(r, reptable[i].pattern.c_str())) != NULL) {
1289 std::string candidate(word);
1290 size_t type = r == word && langnum != LANG_hu ? 1 : 0;
1291 if (r - word + reptable[i].pattern.size() == lenp && langnum != LANG_hu)
1292 type += 2;
1293 candidate.replace(r - word, lenp, reptable[i].outstrings[type]);
1294 if (candidate_check(candidate.c_str(), candidate.size()))
1295 return 1;
1296 ++r; // search for the next letter
1297 }
1298 }
1299
1300 return 0;
1301 }
1302
1303 // forbid compoundings when there are special patterns at word bound
cpdpat_check(const char * word,int pos,hentry * r1,hentry * r2,const char)1304 int AffixMgr::cpdpat_check(const char* word,
1305 int pos,
1306 hentry* r1,
1307 hentry* r2,
1308 const char /*affixed*/) {
1309 for (size_t i = 0; i < checkcpdtable.size(); ++i) {
1310 size_t len;
1311 if (isSubset(checkcpdtable[i].pattern2.c_str(), word + pos) &&
1312 (!r1 || !checkcpdtable[i].cond ||
1313 (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
1314 (!r2 || !checkcpdtable[i].cond2 ||
1315 (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
1316 // zero length pattern => only TESTAFF
1317 // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
1318 (checkcpdtable[i].pattern.empty() ||
1319 ((checkcpdtable[i].pattern[0] == '0' && r1->blen <= pos &&
1320 strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
1321 (checkcpdtable[i].pattern[0] != '0' &&
1322 ((len = checkcpdtable[i].pattern.size()) != 0) &&
1323 strncmp(word + pos - len, checkcpdtable[i].pattern.c_str(), len) == 0)))) {
1324 return 1;
1325 }
1326 }
1327 return 0;
1328 }
1329
1330 // forbid compounding with neighbouring upper and lower case characters at word
1331 // bounds
cpdcase_check(const char * word,int pos)1332 int AffixMgr::cpdcase_check(const char* word, int pos) {
1333 if (utf8) {
1334 const char* p;
1335 for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--)
1336 ;
1337 std::string pair(p);
1338 std::vector<w_char> pair_u;
1339 u8_u16(pair_u, pair);
1340 unsigned short a = pair_u.size() > 1 ? ((pair_u[1].h << 8) + pair_u[1].l) : 0;
1341 unsigned short b = !pair_u.empty() ? ((pair_u[0].h << 8) + pair_u[0].l) : 0;
1342 if (((unicodetoupper(a, langnum) == a) ||
1343 (unicodetoupper(b, langnum) == b)) &&
1344 (a != '-') && (b != '-'))
1345 return 1;
1346 } else {
1347 unsigned char a = *(word + pos - 1);
1348 unsigned char b = *(word + pos);
1349 if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-'))
1350 return 1;
1351 }
1352 return 0;
1353 }
1354
// Backtracking record for one metacharacter (*, ?) of a compound pattern.
struct metachar_data {
  short btpp;  // pattern position to resume from when backtracking
  short btwp;  // word position at which the metacharacter started matching
  int btnum;   // number of words matched by the metacharacter
};
1360
// Check compound patterns: does the sequence of compound members collected so
// far, extended by `rv`, match one of the patterns stored in defcpdtable?
//
// words  - in/out pointer to the array of members; when *words is NULL the
//          array `def` is used instead for the duration of the call
// wnum   - index at which `rv` is stored in that array
// rv     - dictionary entry of the newest member
// def    - fallback array (may be NULL; then the function returns 0)
// all    - when non-zero, a pattern only matches if it is consumed completely
//          (apart from trailing optional "x*" / "x?" elements)
//
// Returns 1 on a match; in that case (*words)[wnum] keeps pointing to rv.
// Returns 0 otherwise; then (*words)[wnum] is reset to NULL and, if `def` had
// been substituted, *words is reset to NULL as well.
int AffixMgr::defcpd_check(hentry*** words,
                           short wnum,
                           hentry* rv,
                           hentry** def,
                           char all) {
  int w = 0;  // set when *words was NULL and `def` has been substituted

  if (!*words) {
    w = 1;
    *words = def;
  }

  if (!*words) {
    return 0;
  }

  // stack of backtracking records, one per metacharacter that matched
  // at least one word
  std::vector<metachar_data> btinfo(1);

  short bt = 0;  // number of active backtracking records

  (*words)[wnum] = rv;

  // has the last word COMPOUNDRULE flag?
  if (rv->alen == 0) {
    (*words)[wnum] = NULL;
    if (w)
      *words = NULL;
    return 0;
  }
  // quick rejection: rv must carry at least one flag that occurs (as a
  // non-metacharacter) in some pattern
  int ok = 0;
  for (size_t i = 0; i < defcpdtable.size(); ++i) {
    for (size_t j = 0; j < defcpdtable[i].size(); ++j) {
      if (defcpdtable[i][j] != '*' && defcpdtable[i][j] != '?' &&
          TESTAFF(rv->astr, defcpdtable[i][j], rv->alen)) {
        ok = 1;
        break;
      }
    }
  }
  if (ok == 0) {
    (*words)[wnum] = NULL;
    if (w)
      *words = NULL;
    return 0;
  }

  // try every pattern in turn
  for (size_t i = 0; i < defcpdtable.size(); ++i) {
    size_t pp = 0;        // pattern position
    signed short wp = 0;  // "words" position
    int ok2;
    ok = 1;
    ok2 = 1;
    do {
      while ((pp < defcpdtable[i].size()) && (wp <= wnum)) {
        if (((pp + 1) < defcpdtable[i].size()) &&
            ((defcpdtable[i][pp + 1] == '*') ||
             (defcpdtable[i][pp + 1] == '?'))) {
          // pattern element followed by a metacharacter: match greedily,
          // '?' up to one word, '*' up to the last word
          int wend = (defcpdtable[i][pp + 1] == '?') ? wp : wnum;
          ok2 = 1;
          pp += 2;
          btinfo[bt].btpp = pp;
          btinfo[bt].btwp = wp;
          while (wp <= wend) {
            if (!(*words)[wp]->alen ||
                !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp - 2],
                         (*words)[wp]->alen)) {
              ok2 = 0;
              break;
            }
            wp++;
          }
          if (wp <= wnum)
            ok2 = 0;
          btinfo[bt].btnum = wp - btinfo[bt].btwp;
          // keep the record only if something was consumed, so that
          // backtracking can give words back later
          if (btinfo[bt].btnum > 0) {
            ++bt;
            btinfo.resize(bt+1);
          }
          if (ok2)
            break;
        } else {
          // plain pattern element: the current word must carry this flag
          ok2 = 1;
          if (!(*words)[wp] || !(*words)[wp]->alen ||
              !TESTAFF((*words)[wp]->astr, defcpdtable[i][pp],
                       (*words)[wp]->alen)) {
            ok = 0;
            break;
          }
          pp++;
          wp++;
          // pattern exhausted while words remain: no match
          if ((defcpdtable[i].size() == pp) && !(wp > wnum))
            ok = 0;
        }
      }
      if (ok && ok2) {
        // skip trailing optional elements ("x*" / "x?"); if nothing else is
        // left, the whole pattern has been matched
        size_t r = pp;
        while ((defcpdtable[i].size() > r) && ((r + 1) < defcpdtable[i].size()) &&
               ((defcpdtable[i][r + 1] == '*') ||
                (defcpdtable[i][r + 1] == '?')))
          r += 2;
        if (defcpdtable[i].size() <= r)
          return 1;
      }
      // backtrack
      if (bt)
        do {
          ok = 1;
          btinfo[bt - 1].btnum--;
          pp = btinfo[bt - 1].btpp;
          wp = btinfo[bt - 1].btwp + (signed short)btinfo[bt - 1].btnum;
        } while ((btinfo[bt - 1].btnum < 0) && --bt);
    } while (bt);

    // a partial match is sufficient unless `all` is requested
    if (ok && ok2 && (!all || (defcpdtable[i].size() <= pp)))
      return 1;

    // check zero ending
    while (ok && ok2 && (defcpdtable[i].size() > pp) &&
           ((pp + 1) < defcpdtable[i].size()) &&
           ((defcpdtable[i][pp + 1] == '*') ||
            (defcpdtable[i][pp + 1] == '?')))
      pp += 2;
    if (ok && ok2 && (defcpdtable[i].size() <= pp))
      return 1;
  }
  // no pattern matched: undo the changes made to the words array
  (*words)[wnum] = NULL;
  if (w)
    *words = NULL;
  return 0;
}
1492
candidate_check(const char * word,int len)1493 inline int AffixMgr::candidate_check(const char* word, int len) {
1494
1495 struct hentry* rv = lookup(word);
1496 if (rv)
1497 return 1;
1498
1499 // rv = prefix_check(word,len,1);
1500 // if (rv) return 1;
1501
1502 rv = affix_check(word, len);
1503 if (rv)
1504 return 1;
1505 return 0;
1506 }
1507
1508 // calculate number of syllable for compound-checking
get_syllable(const std::string & word)1509 short AffixMgr::get_syllable(const std::string& word) {
1510 if (cpdmaxsyllable == 0)
1511 return 0;
1512
1513 short num = 0;
1514
1515 if (!utf8) {
1516 for (size_t i = 0; i < word.size(); ++i) {
1517 if (std::binary_search(cpdvowels.begin(), cpdvowels.end(),
1518 word[i])) {
1519 ++num;
1520 }
1521 }
1522 } else if (!cpdvowels_utf16.empty()) {
1523 std::vector<w_char> w;
1524 u8_u16(w, word);
1525 for (size_t i = 0; i < w.size(); ++i) {
1526 if (std::binary_search(cpdvowels_utf16.begin(),
1527 cpdvowels_utf16.end(),
1528 w[i])) {
1529 ++num;
1530 }
1531 }
1532 }
1533
1534 return num;
1535 }
1536
setcminmax(int * cmin,int * cmax,const char * word,int len)1537 void AffixMgr::setcminmax(int* cmin, int* cmax, const char* word, int len) {
1538 if (utf8) {
1539 int i;
1540 for (*cmin = 0, i = 0; (i < cpdmin) && *cmin < len; i++) {
1541 for ((*cmin)++; *cmin < len && (word[*cmin] & 0xc0) == 0x80; (*cmin)++)
1542 ;
1543 }
1544 for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax >= 0; i++) {
1545 for ((*cmax)--; *cmax >= 0 && (word[*cmax] & 0xc0) == 0x80; (*cmax)--)
1546 ;
1547 }
1548 } else {
1549 *cmin = cpdmin;
1550 *cmax = len - cpdmin + 1;
1551 }
1552 }
1553
// Check if a compound word is correctly spelled.
//
// The word is split at every admissible position (see setcminmax). For each
// split the left part is validated as a first compound member (dictionary
// lookup, then affix analysis), and the remainder is validated as a last
// member (lookup, then affix analysis) or, recursively, as a compound itself.
//
// word        - the word to analyse
// wordnum     - number of compound members already counted before this word
// numsyllable - number of syllables already counted before this word
// maxwordnum  - recursion is only attempted while wordnum + 2 < maxwordnum
// wnum        - index of the current member in `words` (COMPOUNDRULE check)
// words       - members collected so far for defcpd_check(), or NULL
// rwords      - fallback array handed to defcpd_check() as `def`
// hu_mov_rule - spec. Hungarian rule (XXX)
// is_sug      - when set, entries carrying the nosuggest flag are rejected
// info        - optional flags; SPELL_ORIGCAP is consulted for FORCEUCASE
//
// Returns the dictionary entry of the first member when a valid compound
// analysis exists, NULL otherwise. The sfx, pfx and sfxflag members are
// modified as a side effect.
struct hentry* AffixMgr::compound_check(const std::string& word,
                                        short wordnum,
                                        short numsyllable,
                                        short maxwordnum,
                                        short wnum,
                                        hentry** words = NULL,
                                        hentry** rwords = NULL,
                                        char hu_mov_rule = 0,
                                        char is_sug = 0,
                                        int* info = NULL) {
  int i;
  short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  struct hentry* rv = NULL;
  struct hentry* rv_first;
  std::string st;  // working copy of the word; temporarily cut at position i
  char ch = '\0';  // character overwritten by the temporary terminator
  int cmin;
  int cmax;
  int striple = 0;
  size_t scpd = 0;  // 1-based index of the active checkcpdtable entry, 0 = none
  int soldi = 0;    // split position saved before a pattern substitution
  int oldcmin = 0;
  int oldcmax = 0;
  int oldlen = 0;
  int checkedstriple = 0;
  char affixed = 0;
  hentry** oldwords = words;
  size_t len = word.size();

  int checked_prefix;

  setcminmax(&cmin, &cmax, word.c_str(), len);

  st.assign(word);

  // try every split position between cmin and cmax
  for (i = cmin; i < cmax; i++) {
    // go to end of the UTF-8 character
    if (utf8) {
      for (; (st[i] & 0xc0) == 0x80; i++)
        ;
      if (i >= cmax)
        return NULL;
    }

    words = oldwords;
    int onlycpdrule = (words) ? 1 : 0;

    do {  // onlycpdrule loop

      oldnumsyllable = numsyllable;
      oldwordnum = wordnum;
      checked_prefix = 0;

      do {  // simplified checkcompoundpattern loop

        if (scpd > 0) {
          // find the next table entry whose pattern3 occurs at the split
          for (; scpd <= checkcpdtable.size() &&
                 (checkcpdtable[scpd - 1].pattern3.empty() ||
                  strncmp(word.c_str() + i, checkcpdtable[scpd - 1].pattern3.c_str(),
                          checkcpdtable[scpd - 1].pattern3.size()) != 0);
               scpd++)
            ;

          if (scpd > checkcpdtable.size())
            break;  // break simplified checkcompoundpattern loop
          // rebuild st as: <prefix of word> pattern | pattern2 <rest of word
          // after pattern3>, with the split moved behind pattern
          st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern);
          soldi = i;
          i += checkcpdtable[scpd - 1].pattern.size();
          st.replace(i, std::string::npos, checkcpdtable[scpd - 1].pattern2);
          st.replace(i + checkcpdtable[scpd - 1].pattern2.size(), std::string::npos,
                     word.substr(soldi + checkcpdtable[scpd - 1].pattern3.size()));

          oldlen = len;
          len += checkcpdtable[scpd - 1].pattern.size() +
                 checkcpdtable[scpd - 1].pattern2.size() -
                 checkcpdtable[scpd - 1].pattern3.size();
          oldcmin = cmin;
          oldcmax = cmax;
          setcminmax(&cmin, &cmax, st.c_str(), len);

          cmax = len - cpdmin + 1;
        }

        // cut st at the split position; ch keeps the overwritten character
        ch = st[i];
        st[i] = '\0';

        sfx = NULL;
        pfx = NULL;

        // FIRST WORD

        affixed = 1;
        rv = lookup(st.c_str());  // perhaps without prefix

        // search homonym with compound flag
        while ((rv) && !hu_mov_rule &&
               ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
                !((compoundflag && !words && !onlycpdrule &&
                   TESTAFF(rv->astr, compoundflag, rv->alen)) ||
                  (compoundbegin && !wordnum && !onlycpdrule &&
                   TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
                  (compoundmiddle && wordnum && !words && !onlycpdrule &&
                   TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
                  (!defcpdtable.empty() && onlycpdrule &&
                   ((!words && !wordnum &&
                     defcpd_check(&words, wnum, rv, rwords, 0)) ||
                    (words &&
                     defcpd_check(&words, wnum, rv, rwords, 0))))) ||
                (scpd != 0 && checkcpdtable[scpd - 1].cond != FLAG_NULL &&
                 !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)))) {
          rv = rv->next_homonym;
        }

        // a direct dictionary hit means the first member is not affixed
        if (rv)
          affixed = 0;

        if (!rv) {
          if (onlycpdrule)
            break;
          // no plain stem: try the first member with prefix / suffix(es),
          // first requiring compoundflag ...
          if (compoundflag &&
              !(rv = prefix_check(st.c_str(), i,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
                                  compoundflag))) {
            if (((rv = suffix_check(
                      st.c_str(), i, 0, NULL, FLAG_NULL, compoundflag,
                      hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
                 (compoundmoresuffixes &&
                  (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
                !hu_mov_rule && sfx->getCont() &&
                ((compoundforbidflag &&
                  TESTAFF(sfx->getCont(), compoundforbidflag,
                          sfx->getContLen())) ||
                 (compoundend &&
                  TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
              rv = NULL;
            }
          }

          // ... then compoundbegin (first member) or compoundmiddle (later
          // members)
          if (rv ||
              (((wordnum == 0) && compoundbegin &&
                ((rv = suffix_check(
                      st.c_str(), i, 0, NULL, FLAG_NULL, compoundbegin,
                      hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
                 (compoundmoresuffixes &&
                  (rv = suffix_check_twosfx(
                       st.c_str(), i, 0, NULL,
                       compoundbegin))) ||  // twofold suffixes + compound
                 (rv = prefix_check(st.c_str(), i,
                                    hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
                                    compoundbegin)))) ||
               ((wordnum > 0) && compoundmiddle &&
                ((rv = suffix_check(
                      st.c_str(), i, 0, NULL, FLAG_NULL, compoundmiddle,
                      hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
                 (compoundmoresuffixes &&
                  (rv = suffix_check_twosfx(
                       st.c_str(), i, 0, NULL,
                       compoundmiddle))) ||  // twofold suffixes + compound
                 (rv = prefix_check(st.c_str(), i,
                                    hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
                                    compoundmiddle))))))
            checked_prefix = 1;
          // else check forbiddenwords and needaffix
        } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
                                TESTAFF(rv->astr, needaffix, rv->alen) ||
                                TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
                                (is_sug && nosuggest &&
                                 TESTAFF(rv->astr, nosuggest, rv->alen)))) {
          st[i] = ch;
          // continue;
          break;
        }

        // check non_compound flag in suffix and prefix
        if ((rv) && !hu_mov_rule &&
            ((pfx && pfx->getCont() &&
              TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
             (sfx && sfx->getCont() &&
              TESTAFF(sfx->getCont(), compoundforbidflag,
                      sfx->getContLen())))) {
          rv = NULL;
        }

        // check compoundend flag in suffix and prefix
        if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
            ((pfx && pfx->getCont() &&
              TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
             (sfx && sfx->getCont() &&
              TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
          rv = NULL;
        }

        // check compoundmiddle flag in suffix and prefix
        if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
            !hu_mov_rule &&
            ((pfx && pfx->getCont() &&
              TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
             (sfx && sfx->getCont() &&
              TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
          rv = NULL;
        }

        // check forbiddenwords
        if ((rv) && (rv->astr) &&
            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
             (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
          return NULL;
        }

        // increment word number, if the first root has a compoundroot flag
        if ((rv) && compoundroot &&
            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
          wordnum++;
        }

        // first word is acceptable in compound words?
        if (((rv) &&
             (checked_prefix || (words && words[wnum]) ||
              (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
              ((oldwordnum == 0) && compoundbegin &&
               TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
              ((oldwordnum > 0) && compoundmiddle &&
               TESTAFF(rv->astr, compoundmiddle, rv->alen))

              // LANG_hu section: spec. Hungarian rule
              || ((langnum == LANG_hu) && hu_mov_rule &&
                  (TESTAFF(
                       rv->astr, 'F',
                       rv->alen) ||  // XXX hardwired Hungarian dictionary codes
                   TESTAFF(rv->astr, 'G', rv->alen) ||
                   TESTAFF(rv->astr, 'H', rv->alen)))
              // END of LANG_hu section
              ) &&
             (
                 // test CHECKCOMPOUNDPATTERN conditions
                 scpd == 0 || checkcpdtable[scpd - 1].cond == FLAG_NULL ||
                 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond, rv->alen)) &&
             !((checkcompoundtriple && scpd == 0 &&
                !words &&  // test triple letters
                (word[i - 1] == word[i]) &&
                (((i > 1) && (word[i - 1] == word[i - 2])) ||
                 ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
                 )) ||
               (checkcompoundcase && scpd == 0 && !words &&
                cpdcase_check(word.c_str(), i))))
            // LANG_hu section: spec. Hungarian rule
            || ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
                (rv = affix_check(st.c_str(), i)) &&
                (sfx && sfx->getCont() &&
                 (  // XXX hardwired Hungarian dic. codes
                     TESTAFF(sfx->getCont(), (unsigned short)'x',
                             sfx->getContLen()) ||
                     TESTAFF(
                         sfx->getCont(), (unsigned short)'%',
                         sfx->getContLen()))))) {  // first word is ok condition

          // LANG_hu section: spec. Hungarian rule
          if (langnum == LANG_hu) {
            // calculate syllable number of the word
            numsyllable += get_syllable(st.substr(0, i));
            // + 1 word, if syllable number of the prefix > 1 (hungarian
            // convention)
            if (pfx && (get_syllable(pfx->getKey()) > 1))
              wordnum++;
          }
          // END of LANG_hu section

          // NEXT WORD(S)
          rv_first = rv;
          st[i] = ch;  // restore the character hidden by the terminator

          do {  // striple loop

            // check simplifiedtriple
            if (simplifiedtriple) {
              if (striple) {
                checkedstriple = 1;
                i--;  // check "fahrt" instead of "ahrt" in "Schiffahrt"
              } else if (i > 2 && word[i - 1] == word[i - 2])
                striple = 1;
            }

            rv = lookup(st.c_str() + i);  // perhaps without prefix

            // search homonym with compound flag
            while ((rv) &&
                   ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
                    !((compoundflag && !words &&
                       TESTAFF(rv->astr, compoundflag, rv->alen)) ||
                      (compoundend && !words &&
                       TESTAFF(rv->astr, compoundend, rv->alen)) ||
                      (!defcpdtable.empty() && words &&
                       defcpd_check(&words, wnum + 1, rv, NULL, 1))) ||
                    (scpd != 0 && checkcpdtable[scpd - 1].cond2 != FLAG_NULL &&
                     !TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2,
                              rv->alen)))) {
              rv = rv->next_homonym;
            }

            // check FORCEUCASE
            if (rv && forceucase && (rv) &&
                (TESTAFF(rv->astr, forceucase, rv->alen)) &&
                !(info && *info & SPELL_ORIGCAP))
              rv = NULL;

            // accepted through a COMPOUNDRULE pattern (defcpd_check stored
            // the second member in words[wnum + 1])
            if (rv && words && words[wnum + 1])
              return rv_first;

            oldnumsyllable2 = numsyllable;
            oldwordnum2 = wordnum;

            // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary
            // code
            if ((rv) && (langnum == LANG_hu) &&
                (TESTAFF(rv->astr, 'I', rv->alen)) &&
                !(TESTAFF(rv->astr, 'J', rv->alen))) {
              numsyllable--;
            }
            // END of LANG_hu section

            // increment word number, if the second root has a compoundroot flag
            if ((rv) && (compoundroot) &&
                (TESTAFF(rv->astr, compoundroot, rv->alen))) {
              wordnum++;
            }

            // check forbiddenwords
            if ((rv) && (rv->astr) &&
                (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
                 (is_sug && nosuggest &&
                  TESTAFF(rv->astr, nosuggest, rv->alen))))
              return NULL;

            // second word is acceptable, as a root?
            // hungarian conventions: compounding is acceptable,
            // when compound forms consist of 2 words, or if more,
            // then the syllable number of root words must be 6, or lesser.

            if ((rv) &&
                ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
                 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
                (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
                 ((cpdmaxsyllable != 0) &&
                  (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
                   cpdmaxsyllable))) &&
                (
                    // test CHECKCOMPOUNDPATTERN
                    checkcpdtable.empty() || scpd != 0 ||
                    !cpdpat_check(word.c_str(), i, rv_first, rv, 0)) &&
                ((!checkcompounddup || (rv != rv_first)))
                // test CHECKCOMPOUNDPATTERN conditions
                &&
                (scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
                 TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) {
              // forbid compound word, if it is a non compound word with typical
              // fault
              if (checkcompoundrep && cpdrep_check(word.c_str(), len))
                return NULL;
              return rv_first;
            }

            // the second part is not acceptable as a bare root: restore the
            // counters changed above
            numsyllable = oldnumsyllable2;
            wordnum = oldwordnum2;

            // perhaps second word has prefix or/and suffix
            sfx = NULL;
            sfxflag = FLAG_NULL;
            rv = (compoundflag && !onlycpdrule)
                     ? affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundflag,
                                   IN_CPD_END)
                     : NULL;
            if (!rv && compoundend && !onlycpdrule) {
              sfx = NULL;
              pfx = NULL;
              rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), compoundend,
                               IN_CPD_END);
            }

            // affixed second part checked against the COMPOUNDRULE patterns
            if (!rv && !defcpdtable.empty() && words) {
              rv = affix_check((word.c_str() + i), strlen(word.c_str() + i), 0, IN_CPD_END);
              if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1))
                return rv_first;
              rv = NULL;
            }

            // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
            if (rv &&
                !(scpd == 0 || checkcpdtable[scpd - 1].cond2 == FLAG_NULL ||
                  TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen)))
              rv = NULL;

            // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
            if (rv && !checkcpdtable.empty() && scpd == 0 &&
                cpdpat_check(word.c_str(), i, rv_first, rv, affixed))
              rv = NULL;

            // check non_compound flag in suffix and prefix
            if ((rv) && ((pfx && pfx->getCont() &&
                          TESTAFF(pfx->getCont(), compoundforbidflag,
                                  pfx->getContLen())) ||
                         (sfx && sfx->getCont() &&
                          TESTAFF(sfx->getCont(), compoundforbidflag,
                                  sfx->getContLen())))) {
              rv = NULL;
            }

            // check FORCEUCASE
            if (rv && forceucase && (rv) &&
                (TESTAFF(rv->astr, forceucase, rv->alen)) &&
                !(info && *info & SPELL_ORIGCAP))
              rv = NULL;

            // check forbiddenwords
            if ((rv) && (rv->astr) &&
                (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
                 (is_sug && nosuggest &&
                  TESTAFF(rv->astr, nosuggest, rv->alen))))
              return NULL;

            // pfxappnd = prefix of word+i, or NULL
            // calculate syllable number of prefix.
            // hungarian convention: when syllable number of prefix is more,
            // than 1, the prefix+word counts as two words.

            if (langnum == LANG_hu) {
              // calculate syllable number of the word
              numsyllable += get_syllable(word.c_str() + i);

              // - affix syllable num.
              // XXX only second suffix (inflections, not derivations)
              if (sfxappnd) {
                std::string tmp(sfxappnd);
                reverseword(tmp);
                numsyllable -= get_syllable(tmp) + sfxextra;
              }

              // + 1 word, if syllable number of the prefix > 1 (hungarian
              // convention)
              if (pfx && (get_syllable(pfx->getKey()) > 1))
                wordnum++;

              // increment syllable num, if last word has a SYLLABLENUM flag
              // and the suffix is beginning `s'

              if (!cpdsyllablenum.empty()) {
                switch (sfxflag) {
                  case 'c': {
                    numsyllable += 2;
                    break;
                  }
                  case 'J': {
                    numsyllable += 1;
                    break;
                  }
                  case 'I': {
                    if (rv && TESTAFF(rv->astr, 'J', rv->alen))
                      numsyllable += 1;
                    break;
                  }
                }
              }
            }

            // increment word number, if the second word has a compoundroot flag
            if ((rv) && (compoundroot) &&
                (TESTAFF(rv->astr, compoundroot, rv->alen))) {
              wordnum++;
            }

            // second word is acceptable, as a word with prefix or/and suffix?
            // hungarian conventions: compounding is acceptable,
            // when compound forms consist 2 word, otherwise
            // the syllable number of root words is 6, or lesser.
            if ((rv) &&
                (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
                 ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
                ((!checkcompounddup || (rv != rv_first)))) {
              // forbid compound word, if it is a non compound word with typical
              // fault
              if (checkcompoundrep && cpdrep_check(word.c_str(), len))
                return NULL;
              return rv_first;
            }

            numsyllable = oldnumsyllable2;
            wordnum = oldwordnum2;

            // perhaps second word is a compound word (recursive call)
            if (wordnum + 2 < maxwordnum) {
              rv = compound_check(st.substr(i), wordnum + 1,
                                  numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
                                  is_sug, info);

              // with no active pattern a matching CHECKCOMPOUNDPATTERN entry
              // forbids the compound; with an active pattern (scpd != 0) a
              // match is required
              if (rv && !checkcpdtable.empty() &&
                  ((scpd == 0 &&
                    cpdpat_check(word.c_str(), i, rv_first, rv, affixed)) ||
                   (scpd != 0 &&
                    !cpdpat_check(word.c_str(), i, rv_first, rv, affixed))))
                rv = NULL;
            } else {
              rv = NULL;
            }
            if (rv) {
              // forbid compound word, if it is a non compound word with typical
              // fault
              if (checkcompoundrep || forbiddenword) {

                if (checkcompoundrep && cpdrep_check(word.c_str(), len))
                  return NULL;

                // check first part
                if (strncmp(rv->word, word.c_str() + i, rv->blen) == 0) {
                  // temporarily cut st after the first two members
                  char r = st[i + rv->blen];
                  st[i + rv->blen] = '\0';

                  if (checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) {
                    st[ + i + rv->blen] = r;
                    continue;
                  }

                  if (forbiddenword) {
                    struct hentry* rv2 = lookup(word.c_str());
                    if (!rv2)
                      rv2 = affix_check(word.c_str(), len);
                    if (rv2 && rv2->astr &&
                        TESTAFF(rv2->astr, forbiddenword, rv2->alen) &&
                        (strncmp(rv2->word, st.c_str(), i + rv->blen) == 0)) {
                      return NULL;
                    }
                  }
                  st[i + rv->blen] = r;
                }
              }
              return rv_first;
            }
          } while (striple && !checkedstriple);  // end of striple loop

          // undo the position shift made for the simplified triple check
          if (checkedstriple) {
            i++;
            checkedstriple = 0;
            striple = 0;
          }

        }  // first word is ok condition

        // undo the changes made for the active checkcompoundpattern entry
        if (soldi != 0) {
          i = soldi;
          soldi = 0;
          len = oldlen;
          cmin = oldcmin;
          cmax = oldcmax;
        }
        scpd++;

      } while (!onlycpdrule && simplifiedcpd &&
               scpd <= checkcpdtable.size());  // end of simplifiedcpd loop

      // restore the state for the next attempt at this split position
      scpd = 0;
      wordnum = oldwordnum;
      numsyllable = oldnumsyllable;

      if (soldi != 0) {
        i = soldi;
        st.assign(word);  // XXX add more optim.
        soldi = 0;
      } else
        st[i] = ch;

    } while (!defcpdtable.empty() && oldwordnum == 0 &&
             onlycpdrule++ < 1);  // end of onlycpd loop
  }

  return NULL;
}
2133
2134 // check if compound word is correctly spelled
2135 // hu_mov_rule = spec. Hungarian rule (XXX)
compound_check_morph(const char * word,int len,short wordnum,short numsyllable,short maxwordnum,short wnum,hentry ** words,hentry ** rwords,char hu_mov_rule,std::string & result,const std::string * partresult)2136 int AffixMgr::compound_check_morph(const char* word,
2137 int len,
2138 short wordnum,
2139 short numsyllable,
2140 short maxwordnum,
2141 short wnum,
2142 hentry** words,
2143 hentry** rwords,
2144 char hu_mov_rule,
2145 std::string& result,
2146 const std::string* partresult) {
2147 int i;
2148 short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
2149 int ok = 0;
2150
2151 struct hentry* rv = NULL;
2152 struct hentry* rv_first;
2153 std::string st;
2154 char ch;
2155
2156 int checked_prefix;
2157 std::string presult;
2158
2159 int cmin;
2160 int cmax;
2161
2162 char affixed = 0;
2163 hentry** oldwords = words;
2164
2165 setcminmax(&cmin, &cmax, word, len);
2166
2167 st.assign(word);
2168
2169 for (i = cmin; i < cmax; i++) {
2170 // go to end of the UTF-8 character
2171 if (utf8) {
2172 for (; (st[i] & 0xc0) == 0x80; i++)
2173 ;
2174 if (i >= cmax)
2175 return 0;
2176 }
2177
2178 words = oldwords;
2179 int onlycpdrule = (words) ? 1 : 0;
2180
2181 do { // onlycpdrule loop
2182
2183 oldnumsyllable = numsyllable;
2184 oldwordnum = wordnum;
2185 checked_prefix = 0;
2186
2187 ch = st[i];
2188 st[i] = '\0';
2189 sfx = NULL;
2190
2191 // FIRST WORD
2192
2193 affixed = 1;
2194
2195 presult.clear();
2196 if (partresult)
2197 presult.append(*partresult);
2198
2199 rv = lookup(st.c_str()); // perhaps without prefix
2200
2201 // search homonym with compound flag
2202 while ((rv) && !hu_mov_rule &&
2203 ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2204 !((compoundflag && !words && !onlycpdrule &&
2205 TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2206 (compoundbegin && !wordnum && !onlycpdrule &&
2207 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2208 (compoundmiddle && wordnum && !words && !onlycpdrule &&
2209 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
2210 (!defcpdtable.empty() && onlycpdrule &&
2211 ((!words && !wordnum &&
2212 defcpd_check(&words, wnum, rv, rwords, 0)) ||
2213 (words &&
2214 defcpd_check(&words, wnum, rv, rwords, 0))))))) {
2215 rv = rv->next_homonym;
2216 }
2217
2218 if (rv)
2219 affixed = 0;
2220
2221 if (rv) {
2222 presult.push_back(MSEP_FLD);
2223 presult.append(MORPH_PART);
2224 presult.append(st.c_str());
2225 if (!HENTRY_FIND(rv, MORPH_STEM)) {
2226 presult.push_back(MSEP_FLD);
2227 presult.append(MORPH_STEM);
2228 presult.append(st.c_str());
2229 }
2230 if (HENTRY_DATA(rv)) {
2231 presult.push_back(MSEP_FLD);
2232 presult.append(HENTRY_DATA2(rv));
2233 }
2234 }
2235
2236 if (!rv) {
2237 if (compoundflag &&
2238 !(rv =
2239 prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2240 compoundflag))) {
2241 if (((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
2242 compoundflag,
2243 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2244 (compoundmoresuffixes &&
2245 (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
2246 !hu_mov_rule && sfx->getCont() &&
2247 ((compoundforbidflag &&
2248 TESTAFF(sfx->getCont(), compoundforbidflag,
2249 sfx->getContLen())) ||
2250 (compoundend &&
2251 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
2252 rv = NULL;
2253 }
2254 }
2255
2256 if (rv ||
2257 (((wordnum == 0) && compoundbegin &&
2258 ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
2259 compoundbegin,
2260 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2261 (compoundmoresuffixes &&
2262 (rv = suffix_check_twosfx(
2263 st.c_str(), i, 0, NULL,
2264 compoundbegin))) || // twofold suffix+compound
2265 (rv = prefix_check(st.c_str(), i,
2266 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2267 compoundbegin)))) ||
2268 ((wordnum > 0) && compoundmiddle &&
2269 ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
2270 compoundmiddle,
2271 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
2272 (compoundmoresuffixes &&
2273 (rv = suffix_check_twosfx(
2274 st.c_str(), i, 0, NULL,
2275 compoundmiddle))) || // twofold suffix+compound
2276 (rv = prefix_check(st.c_str(), i,
2277 hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
2278 compoundmiddle)))))) {
2279 std::string p;
2280 if (compoundflag)
2281 p = affix_check_morph(st.c_str(), i, compoundflag);
2282 if (p.empty()) {
2283 if ((wordnum == 0) && compoundbegin) {
2284 p = affix_check_morph(st.c_str(), i, compoundbegin);
2285 } else if ((wordnum > 0) && compoundmiddle) {
2286 p = affix_check_morph(st.c_str(), i, compoundmiddle);
2287 }
2288 }
2289 if (!p.empty()) {
2290 presult.push_back(MSEP_FLD);
2291 presult.append(MORPH_PART);
2292 presult.append(st.c_str());
2293 line_uniq_app(p, MSEP_REC);
2294 presult.append(p);
2295 }
2296 checked_prefix = 1;
2297 }
2298 // else check forbiddenwords
2299 } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2300 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
2301 TESTAFF(rv->astr, needaffix, rv->alen))) {
2302 st[i] = ch;
2303 continue;
2304 }
2305
2306 // check non_compound flag in suffix and prefix
2307 if ((rv) && !hu_mov_rule &&
2308 ((pfx && pfx->getCont() &&
2309 TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
2310 (sfx && sfx->getCont() &&
2311 TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) {
2312 continue;
2313 }
2314
2315 // check compoundend flag in suffix and prefix
2316 if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
2317 ((pfx && pfx->getCont() &&
2318 TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
2319 (sfx && sfx->getCont() &&
2320 TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
2321 continue;
2322 }
2323
2324 // check compoundmiddle flag in suffix and prefix
2325 if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
2326 !hu_mov_rule &&
2327 ((pfx && pfx->getCont() &&
2328 TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
2329 (sfx && sfx->getCont() &&
2330 TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
2331 rv = NULL;
2332 }
2333
2334 // check forbiddenwords
2335 if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2336 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)))
2337 continue;
2338
2339 // increment word number, if the second root has a compoundroot flag
2340 if ((rv) && (compoundroot) &&
2341 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2342 wordnum++;
2343 }
2344
2345 // first word is acceptable in compound words?
2346 if (((rv) &&
2347 (checked_prefix || (words && words[wnum]) ||
2348 (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2349 ((oldwordnum == 0) && compoundbegin &&
2350 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2351 ((oldwordnum > 0) && compoundmiddle &&
2352 TESTAFF(rv->astr, compoundmiddle, rv->alen))
2353 // LANG_hu section: spec. Hungarian rule
2354 || ((langnum == LANG_hu) && // hu_mov_rule
2355 hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) ||
2356 TESTAFF(rv->astr, 'G', rv->alen) ||
2357 TESTAFF(rv->astr, 'H', rv->alen)))
2358 // END of LANG_hu section
2359 ) &&
2360 !((checkcompoundtriple && !words && // test triple letters
2361 (word[i - 1] == word[i]) &&
2362 (((i > 1) && (word[i - 1] == word[i - 2])) ||
2363 ((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0'
2364 )) ||
2365 (
2366 // test CHECKCOMPOUNDPATTERN
2367 !checkcpdtable.empty() && !words &&
2368 cpdpat_check(word, i, rv, NULL, affixed)) ||
2369 (checkcompoundcase && !words && cpdcase_check(word, i))))
2370 // LANG_hu section: spec. Hungarian rule
2371 ||
2372 ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
2373 (rv = affix_check(st.c_str(), i)) &&
2374 (sfx && sfx->getCont() &&
2375 (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) ||
2376 TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen()))))
2377 // END of LANG_hu section
2378 ) {
2379 // LANG_hu section: spec. Hungarian rule
2380 if (langnum == LANG_hu) {
2381 // calculate syllable number of the word
2382 numsyllable += get_syllable(st.substr(0, i));
2383
2384 // + 1 word, if syllable number of the prefix > 1 (hungarian
2385 // convention)
2386 if (pfx && (get_syllable(pfx->getKey()) > 1))
2387 wordnum++;
2388 }
2389 // END of LANG_hu section
2390
2391 // NEXT WORD(S)
2392 rv_first = rv;
2393 rv = lookup((word + i)); // perhaps without prefix
2394
2395 // search homonym with compound flag
2396 while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
2397 !((compoundflag && !words &&
2398 TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2399 (compoundend && !words &&
2400 TESTAFF(rv->astr, compoundend, rv->alen)) ||
2401 (!defcpdtable.empty() && words &&
2402 defcpd_check(&words, wnum + 1, rv, NULL, 1))))) {
2403 rv = rv->next_homonym;
2404 }
2405
2406 if (rv && words && words[wnum + 1]) {
2407 result.append(presult);
2408 result.append(" ");
2409 result.append(MORPH_PART);
2410 result.append(word + i);
2411 if (complexprefixes && HENTRY_DATA(rv))
2412 result.append(HENTRY_DATA2(rv));
2413 if (!HENTRY_FIND(rv, MORPH_STEM)) {
2414 result.append(" ");
2415 result.append(MORPH_STEM);
2416 result.append(HENTRY_WORD(rv));
2417 }
2418 // store the pointer of the hash entry
2419 if (!complexprefixes && HENTRY_DATA(rv)) {
2420 result.append(" ");
2421 result.append(HENTRY_DATA2(rv));
2422 }
2423 result.append("\n");
2424 return 0;
2425 }
2426
2427 oldnumsyllable2 = numsyllable;
2428 oldwordnum2 = wordnum;
2429
2430 // LANG_hu section: spec. Hungarian rule
2431 if ((rv) && (langnum == LANG_hu) &&
2432 (TESTAFF(rv->astr, 'I', rv->alen)) &&
2433 !(TESTAFF(rv->astr, 'J', rv->alen))) {
2434 numsyllable--;
2435 }
2436 // END of LANG_hu section
2437 // increment word number, if the second root has a compoundroot flag
2438 if ((rv) && (compoundroot) &&
2439 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2440 wordnum++;
2441 }
2442
2443 // check forbiddenwords
2444 if ((rv) && (rv->astr) &&
2445 (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2446 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
2447 st[i] = ch;
2448 continue;
2449 }
2450
2451 // second word is acceptable, as a root?
2452 // hungarian conventions: compounding is acceptable,
2453 // when compound forms consist of 2 words, or if more,
2454 // then the syllable number of root words must be 6, or lesser.
2455 if ((rv) &&
2456 ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2457 (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
2458 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2459 ((cpdmaxsyllable != 0) &&
2460 (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
2461 cpdmaxsyllable))) &&
2462 ((!checkcompounddup || (rv != rv_first)))) {
2463 // bad compound word
2464 result.append(presult);
2465 result.append(" ");
2466 result.append(MORPH_PART);
2467 result.append(word + i);
2468
2469 if (HENTRY_DATA(rv)) {
2470 if (complexprefixes)
2471 result.append(HENTRY_DATA2(rv));
2472 if (!HENTRY_FIND(rv, MORPH_STEM)) {
2473 result.append(" ");
2474 result.append(MORPH_STEM);
2475 result.append(HENTRY_WORD(rv));
2476 }
2477 // store the pointer of the hash entry
2478 if (!complexprefixes) {
2479 result.append(" ");
2480 result.append(HENTRY_DATA2(rv));
2481 }
2482 }
2483 result.append("\n");
2484 ok = 1;
2485 }
2486
2487 numsyllable = oldnumsyllable2;
2488 wordnum = oldwordnum2;
2489
2490 // perhaps second word has prefix or/and suffix
2491 sfx = NULL;
2492 sfxflag = FLAG_NULL;
2493
2494 if (compoundflag && !onlycpdrule)
2495 rv = affix_check((word + i), strlen(word + i), compoundflag);
2496 else
2497 rv = NULL;
2498
2499 if (!rv && compoundend && !onlycpdrule) {
2500 sfx = NULL;
2501 pfx = NULL;
2502 rv = affix_check((word + i), strlen(word + i), compoundend);
2503 }
2504
2505 if (!rv && !defcpdtable.empty() && words) {
2506 rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END);
2507 if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
2508 std::string m;
2509 if (compoundflag)
2510 m = affix_check_morph((word + i), strlen(word + i), compoundflag);
2511 if (m.empty() && compoundend) {
2512 m = affix_check_morph((word + i), strlen(word + i), compoundend);
2513 }
2514 result.append(presult);
2515 if (!m.empty()) {
2516 result.push_back(MSEP_FLD);
2517 result.append(MORPH_PART);
2518 result.append(word + i);
2519 line_uniq_app(m, MSEP_REC);
2520 result.append(m);
2521 }
2522 result.append("\n");
2523 ok = 1;
2524 }
2525 }
2526
2527 // check non_compound flag in suffix and prefix
2528 if ((rv) &&
2529 ((pfx && pfx->getCont() &&
2530 TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
2531 (sfx && sfx->getCont() &&
2532 TESTAFF(sfx->getCont(), compoundforbidflag,
2533 sfx->getContLen())))) {
2534 rv = NULL;
2535 }
2536
2537 // check forbiddenwords
2538 if ((rv) && (rv->astr) &&
2539 (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2540 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) &&
2541 (!TESTAFF(rv->astr, needaffix, rv->alen))) {
2542 st[i] = ch;
2543 continue;
2544 }
2545
2546 if (langnum == LANG_hu) {
2547 // calculate syllable number of the word
2548 numsyllable += get_syllable(word + i);
2549
2550 // - affix syllable num.
2551 // XXX only second suffix (inflections, not derivations)
2552 if (sfxappnd) {
2553 std::string tmp(sfxappnd);
2554 reverseword(tmp);
2555 numsyllable -= get_syllable(tmp) + sfxextra;
2556 }
2557
2558 // + 1 word, if syllable number of the prefix > 1 (hungarian
2559 // convention)
2560 if (pfx && (get_syllable(pfx->getKey()) > 1))
2561 wordnum++;
2562
2563 // increment syllable num, if last word has a SYLLABLENUM flag
2564 // and the suffix is beginning `s'
2565
2566 if (!cpdsyllablenum.empty()) {
2567 switch (sfxflag) {
2568 case 'c': {
2569 numsyllable += 2;
2570 break;
2571 }
2572 case 'J': {
2573 numsyllable += 1;
2574 break;
2575 }
2576 case 'I': {
2577 if (rv && TESTAFF(rv->astr, 'J', rv->alen))
2578 numsyllable += 1;
2579 break;
2580 }
2581 }
2582 }
2583 }
2584
2585 // increment word number, if the second word has a compoundroot flag
2586 if ((rv) && (compoundroot) &&
2587 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2588 wordnum++;
2589 }
2590 // second word is acceptable, as a word with prefix or/and suffix?
2591 // hungarian conventions: compounding is acceptable,
2592 // when compound forms consist 2 word, otherwise
2593 // the syllable number of root words is 6, or lesser.
2594 if ((rv) &&
2595 (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
2596 ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
2597 ((!checkcompounddup || (rv != rv_first)))) {
2598 std::string m;
2599 if (compoundflag)
2600 m = affix_check_morph((word + i), strlen(word + i), compoundflag);
2601 if (m.empty() && compoundend) {
2602 m = affix_check_morph((word + i), strlen(word + i), compoundend);
2603 }
2604 result.append(presult);
2605 if (!m.empty()) {
2606 result.push_back(MSEP_FLD);
2607 result.append(MORPH_PART);
2608 result.append(word + 1);
2609 line_uniq_app(m, MSEP_REC);
2610 result.append(m);
2611 }
2612 result.push_back(MSEP_REC);
2613 ok = 1;
2614 }
2615
2616 numsyllable = oldnumsyllable2;
2617 wordnum = oldwordnum2;
2618
2619 // perhaps second word is a compound word (recursive call)
2620 if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
2621 compound_check_morph((word + i), strlen(word + i), wordnum + 1,
2622 numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
2623 result, &presult);
2624 } else {
2625 rv = NULL;
2626 }
2627 }
2628 st[i] = ch;
2629 wordnum = oldwordnum;
2630 numsyllable = oldnumsyllable;
2631
2632 } while (!defcpdtable.empty() && oldwordnum == 0 &&
2633 onlycpdrule++ < 1); // end of onlycpd loop
2634 }
2635 return 0;
2636 }
2637
2638
isRevSubset(const char * s1,const char * end_of_s2,int len)2639 inline int AffixMgr::isRevSubset(const char* s1,
2640 const char* end_of_s2,
2641 int len) {
2642 while ((len > 0) && (*s1 != '\0') && ((*s1 == *end_of_s2) || (*s1 == '.'))) {
2643 s1++;
2644 end_of_s2--;
2645 len--;
2646 }
2647 return (*s1 == '\0');
2648 }
2649
2650 // check word for suffixes
suffix_check(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,char in_compound)2651 struct hentry* AffixMgr::suffix_check(const char* word,
2652 int len,
2653 int sfxopts,
2654 PfxEntry* ppfx,
2655 const FLAG cclass,
2656 const FLAG needflag,
2657 char in_compound) {
2658 struct hentry* rv = NULL;
2659 PfxEntry* ep = ppfx;
2660
2661 // first handle the special case of 0 length suffixes
2662 SfxEntry* se = sStart[0];
2663
2664 while (se) {
2665 if (!cclass || se->getCont()) {
2666 // suffixes are not allowed in beginning of compounds
2667 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2668 // except when signed with compoundpermitflag flag
2669 (se->getCont() && compoundpermitflag &&
2670 TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
2671 (!circumfix ||
2672 // no circumfix flag in prefix and suffix
2673 ((!ppfx || !(ep->getCont()) ||
2674 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2675 (!se->getCont() ||
2676 !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
2677 // circumfix flag in prefix AND suffix
2678 ((ppfx && (ep->getCont()) &&
2679 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2680 (se->getCont() &&
2681 (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
2682 // fogemorpheme
2683 (in_compound ||
2684 !(se->getCont() &&
2685 (TESTAFF(se->getCont(), onlyincompound, se->getContLen())))) &&
2686 // needaffix on prefix or first suffix
2687 (cclass ||
2688 !(se->getCont() &&
2689 TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2690 (ppfx &&
2691 !((ep->getCont()) &&
2692 TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))) {
2693 rv = se->checkword(word, len, sfxopts, ppfx,
2694 (FLAG)cclass, needflag,
2695 (in_compound ? 0 : onlyincompound));
2696 if (rv) {
2697 sfx = se; // BUG: sfx not stateless
2698 return rv;
2699 }
2700 }
2701 }
2702 se = se->getNext();
2703 }
2704
2705 // now handle the general case
2706 if (len == 0)
2707 return NULL; // FULLSTRIP
2708 unsigned char sp = *((const unsigned char*)(word + len - 1));
2709 SfxEntry* sptr = sStart[sp];
2710
2711 while (sptr) {
2712 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2713 // suffixes are not allowed in beginning of compounds
2714 if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2715 // except when signed with compoundpermitflag flag
2716 (sptr->getCont() && compoundpermitflag &&
2717 TESTAFF(sptr->getCont(), compoundpermitflag,
2718 sptr->getContLen()))) &&
2719 (!circumfix ||
2720 // no circumfix flag in prefix and suffix
2721 ((!ppfx || !(ep->getCont()) ||
2722 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2723 (!sptr->getCont() ||
2724 !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
2725 // circumfix flag in prefix AND suffix
2726 ((ppfx && (ep->getCont()) &&
2727 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2728 (sptr->getCont() &&
2729 (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
2730 // fogemorpheme
2731 (in_compound ||
2732 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
2733 sptr->getContLen()))))) &&
2734 // needaffix on prefix or first suffix
2735 (cclass ||
2736 !(sptr->getCont() &&
2737 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
2738 (ppfx &&
2739 !((ep->getCont()) &&
2740 TESTAFF(ep->getCont(), needaffix, ep->getContLen())))))
2741 if (in_compound != IN_CPD_END || ppfx ||
2742 !(sptr->getCont() &&
2743 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))) {
2744 rv = sptr->checkword(word, len, sfxopts, ppfx,
2745 cclass, needflag,
2746 (in_compound ? 0 : onlyincompound));
2747 if (rv) {
2748 sfx = sptr; // BUG: sfx not stateless
2749 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2750 if (!sptr->getCont())
2751 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless
2752 // LANG_hu section: spec. Hungarian rule
2753 else if (langnum == LANG_hu && sptr->getKeyLen() &&
2754 sptr->getKey()[0] == 'i' && sptr->getKey()[1] != 'y' &&
2755 sptr->getKey()[1] != 't') {
2756 sfxextra = 1;
2757 }
2758 // END of LANG_hu section
2759 return rv;
2760 }
2761 }
2762 sptr = sptr->getNextEQ();
2763 } else {
2764 sptr = sptr->getNextNE();
2765 }
2766 }
2767
2768 return NULL;
2769 }
2770
2771 // check word for two-level suffixes
2772
suffix_check_twosfx(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2773 struct hentry* AffixMgr::suffix_check_twosfx(const char* word,
2774 int len,
2775 int sfxopts,
2776 PfxEntry* ppfx,
2777 const FLAG needflag) {
2778 struct hentry* rv = NULL;
2779
2780 // first handle the special case of 0 length suffixes
2781 SfxEntry* se = sStart[0];
2782 while (se) {
2783 if (contclasses[se->getFlag()]) {
2784 rv = se->check_twosfx(word, len, sfxopts, ppfx, needflag);
2785 if (rv)
2786 return rv;
2787 }
2788 se = se->getNext();
2789 }
2790
2791 // now handle the general case
2792 if (len == 0)
2793 return NULL; // FULLSTRIP
2794 unsigned char sp = *((const unsigned char*)(word + len - 1));
2795 SfxEntry* sptr = sStart[sp];
2796
2797 while (sptr) {
2798 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2799 if (contclasses[sptr->getFlag()]) {
2800 rv = sptr->check_twosfx(word, len, sfxopts, ppfx, needflag);
2801 if (rv) {
2802 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2803 if (!sptr->getCont())
2804 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless
2805 return rv;
2806 }
2807 }
2808 sptr = sptr->getNextEQ();
2809 } else {
2810 sptr = sptr->getNextNE();
2811 }
2812 }
2813
2814 return NULL;
2815 }
2816
suffix_check_twosfx_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG needflag)2817 std::string AffixMgr::suffix_check_twosfx_morph(const char* word,
2818 int len,
2819 int sfxopts,
2820 PfxEntry* ppfx,
2821 const FLAG needflag) {
2822 std::string result;
2823 std::string result2;
2824 std::string result3;
2825
2826 // first handle the special case of 0 length suffixes
2827 SfxEntry* se = sStart[0];
2828 while (se) {
2829 if (contclasses[se->getFlag()]) {
2830 std::string st = se->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
2831 if (!st.empty()) {
2832 if (ppfx) {
2833 if (ppfx->getMorph()) {
2834 result.append(ppfx->getMorph());
2835 result.append(" ");
2836 } else
2837 debugflag(result, ppfx->getFlag());
2838 }
2839 result.append(st);
2840 if (se->getMorph()) {
2841 result.append(" ");
2842 result.append(se->getMorph());
2843 } else
2844 debugflag(result, se->getFlag());
2845 result.append("\n");
2846 }
2847 }
2848 se = se->getNext();
2849 }
2850
2851 // now handle the general case
2852 if (len == 0)
2853 return std::string(); // FULLSTRIP
2854 unsigned char sp = *((const unsigned char*)(word + len - 1));
2855 SfxEntry* sptr = sStart[sp];
2856
2857 while (sptr) {
2858 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2859 if (contclasses[sptr->getFlag()]) {
2860 std::string st = sptr->check_twosfx_morph(word, len, sfxopts, ppfx, needflag);
2861 if (!st.empty()) {
2862 sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2863 if (!sptr->getCont())
2864 sfxappnd = sptr->getKey(); // BUG: sfxappnd not stateless
2865 result2.assign(st);
2866
2867 result3.clear();
2868
2869 if (sptr->getMorph()) {
2870 result3.append(" ");
2871 result3.append(sptr->getMorph());
2872 } else
2873 debugflag(result3, sptr->getFlag());
2874 strlinecat(result2, result3);
2875 result2.append("\n");
2876 result.append(result2);
2877 }
2878 }
2879 sptr = sptr->getNextEQ();
2880 } else {
2881 sptr = sptr->getNextNE();
2882 }
2883 }
2884
2885 return result;
2886 }
2887
suffix_check_morph(const char * word,int len,int sfxopts,PfxEntry * ppfx,const FLAG cclass,const FLAG needflag,char in_compound)2888 std::string AffixMgr::suffix_check_morph(const char* word,
2889 int len,
2890 int sfxopts,
2891 PfxEntry* ppfx,
2892 const FLAG cclass,
2893 const FLAG needflag,
2894 char in_compound) {
2895 std::string result;
2896
2897 struct hentry* rv = NULL;
2898
2899 PfxEntry* ep = ppfx;
2900
2901 // first handle the special case of 0 length suffixes
2902 SfxEntry* se = sStart[0];
2903 while (se) {
2904 if (!cclass || se->getCont()) {
2905 // suffixes are not allowed in beginning of compounds
2906 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2907 // except when signed with compoundpermitflag flag
2908 (se->getCont() && compoundpermitflag &&
2909 TESTAFF(se->getCont(), compoundpermitflag, se->getContLen()))) &&
2910 (!circumfix ||
2911 // no circumfix flag in prefix and suffix
2912 ((!ppfx || !(ep->getCont()) ||
2913 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2914 (!se->getCont() ||
2915 !(TESTAFF(se->getCont(), circumfix, se->getContLen())))) ||
2916 // circumfix flag in prefix AND suffix
2917 ((ppfx && (ep->getCont()) &&
2918 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2919 (se->getCont() &&
2920 (TESTAFF(se->getCont(), circumfix, se->getContLen()))))) &&
2921 // fogemorpheme
2922 (in_compound ||
2923 !((se->getCont() &&
2924 (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
2925 // needaffix on prefix or first suffix
2926 (cclass ||
2927 !(se->getCont() &&
2928 TESTAFF(se->getCont(), needaffix, se->getContLen())) ||
2929 (ppfx &&
2930 !((ep->getCont()) &&
2931 TESTAFF(ep->getCont(), needaffix, ep->getContLen()))))))
2932 rv = se->checkword(word, len, sfxopts, ppfx, cclass,
2933 needflag, FLAG_NULL);
2934 while (rv) {
2935 if (ppfx) {
2936 if (ppfx->getMorph()) {
2937 result.append(ppfx->getMorph());
2938 result.append(" ");
2939 } else
2940 debugflag(result, ppfx->getFlag());
2941 }
2942 if (complexprefixes && HENTRY_DATA(rv))
2943 result.append(HENTRY_DATA2(rv));
2944 if (!HENTRY_FIND(rv, MORPH_STEM)) {
2945 result.append(" ");
2946 result.append(MORPH_STEM);
2947 result.append(HENTRY_WORD(rv));
2948 }
2949
2950 if (!complexprefixes && HENTRY_DATA(rv)) {
2951 result.append(" ");
2952 result.append(HENTRY_DATA2(rv));
2953 }
2954 if (se->getMorph()) {
2955 result.append(" ");
2956 result.append(se->getMorph());
2957 } else
2958 debugflag(result, se->getFlag());
2959 result.append("\n");
2960 rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
2961 }
2962 }
2963 se = se->getNext();
2964 }
2965
2966 // now handle the general case
2967 if (len == 0)
2968 return std::string(); // FULLSTRIP
2969 unsigned char sp = *((const unsigned char*)(word + len - 1));
2970 SfxEntry* sptr = sStart[sp];
2971
2972 while (sptr) {
2973 if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2974 // suffixes are not allowed in beginning of compounds
2975 if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2976 // except when signed with compoundpermitflag flag
2977 (sptr->getCont() && compoundpermitflag &&
2978 TESTAFF(sptr->getCont(), compoundpermitflag,
2979 sptr->getContLen()))) &&
2980 (!circumfix ||
2981 // no circumfix flag in prefix and suffix
2982 ((!ppfx || !(ep->getCont()) ||
2983 !TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2984 (!sptr->getCont() ||
2985 !(TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())))) ||
2986 // circumfix flag in prefix AND suffix
2987 ((ppfx && (ep->getCont()) &&
2988 TESTAFF(ep->getCont(), circumfix, ep->getContLen())) &&
2989 (sptr->getCont() &&
2990 (TESTAFF(sptr->getCont(), circumfix, sptr->getContLen()))))) &&
2991 // fogemorpheme
2992 (in_compound ||
2993 !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound,
2994 sptr->getContLen()))))) &&
2995 // needaffix on first suffix
2996 (cclass ||
2997 !(sptr->getCont() &&
2998 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())))))
2999 rv = sptr->checkword(word, len, sfxopts, ppfx, cclass,
3000 needflag, FLAG_NULL);
3001 while (rv) {
3002 if (ppfx) {
3003 if (ppfx->getMorph()) {
3004 result.append(ppfx->getMorph());
3005 result.append(" ");
3006 } else
3007 debugflag(result, ppfx->getFlag());
3008 }
3009 if (complexprefixes && HENTRY_DATA(rv))
3010 result.append(HENTRY_DATA2(rv));
3011 if (!HENTRY_FIND(rv, MORPH_STEM)) {
3012 result.append(" ");
3013 result.append(MORPH_STEM);
3014 result.append(HENTRY_WORD(rv));
3015 }
3016
3017 if (!complexprefixes && HENTRY_DATA(rv)) {
3018 result.append(" ");
3019 result.append(HENTRY_DATA2(rv));
3020 }
3021
3022 if (sptr->getMorph()) {
3023 result.append(" ");
3024 result.append(sptr->getMorph());
3025 } else
3026 debugflag(result, sptr->getFlag());
3027 result.append("\n");
3028 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
3029 }
3030 sptr = sptr->getNextEQ();
3031 } else {
3032 sptr = sptr->getNextNE();
3033 }
3034 }
3035
3036 return result;
3037 }
3038
3039 // check if word with affixes is correctly spelled
affix_check(const char * word,int len,const FLAG needflag,char in_compound)3040 struct hentry* AffixMgr::affix_check(const char* word,
3041 int len,
3042 const FLAG needflag,
3043 char in_compound) {
3044
3045 // check all prefixes (also crossed with suffixes if allowed)
3046 struct hentry* rv = prefix_check(word, len, in_compound, needflag);
3047 if (rv)
3048 return rv;
3049
3050 // if still not found check all suffixes
3051 rv = suffix_check(word, len, 0, NULL, FLAG_NULL, needflag, in_compound);
3052
3053 if (havecontclass) {
3054 sfx = NULL;
3055 pfx = NULL;
3056
3057 if (rv)
3058 return rv;
3059 // if still not found check all two-level suffixes
3060 rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
3061
3062 if (rv)
3063 return rv;
3064 // if still not found check all two-level suffixes
3065 rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
3066 }
3067
3068 return rv;
3069 }
3070
3071 // check if word with affixes is correctly spelled
affix_check_morph(const char * word,int len,const FLAG needflag,char in_compound)3072 std::string AffixMgr::affix_check_morph(const char* word,
3073 int len,
3074 const FLAG needflag,
3075 char in_compound) {
3076 std::string result;
3077
3078 // check all prefixes (also crossed with suffixes if allowed)
3079 std::string st = prefix_check_morph(word, len, in_compound);
3080 if (!st.empty()) {
3081 result.append(st);
3082 }
3083
3084 // if still not found check all suffixes
3085 st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
3086 if (!st.empty()) {
3087 result.append(st);
3088 }
3089
3090 if (havecontclass) {
3091 sfx = NULL;
3092 pfx = NULL;
3093 // if still not found check all two-level suffixes
3094 st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
3095 if (!st.empty()) {
3096 result.append(st);
3097 }
3098
3099 // if still not found check all two-level suffixes
3100 st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
3101 if (!st.empty()) {
3102 result.append(st);
3103 }
3104 }
3105
3106 return result;
3107 }
3108
3109 // morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields
3110 // in the first line of the inputs
3111 // return 0, if inputs equal
3112 // return 1, if inputs may equal with a secondary suffix
3113 // otherwise return -1
morphcmp(const char * s,const char * t)3114 static int morphcmp(const char* s, const char* t) {
3115 int se = 0;
3116 int te = 0;
3117 const char* sl;
3118 const char* tl;
3119 const char* olds;
3120 const char* oldt;
3121 if (!s || !t)
3122 return 1;
3123 olds = s;
3124 sl = strchr(s, '\n');
3125 s = strstr(s, MORPH_DERI_SFX);
3126 if (!s || (sl && sl < s))
3127 s = strstr(olds, MORPH_INFL_SFX);
3128 if (!s || (sl && sl < s)) {
3129 s = strstr(olds, MORPH_TERM_SFX);
3130 olds = NULL;
3131 }
3132 oldt = t;
3133 tl = strchr(t, '\n');
3134 t = strstr(t, MORPH_DERI_SFX);
3135 if (!t || (tl && tl < t))
3136 t = strstr(oldt, MORPH_INFL_SFX);
3137 if (!t || (tl && tl < t)) {
3138 t = strstr(oldt, MORPH_TERM_SFX);
3139 oldt = NULL;
3140 }
3141 while (s && t && (!sl || sl > s) && (!tl || tl > t)) {
3142 s += MORPH_TAG_LEN;
3143 t += MORPH_TAG_LEN;
3144 se = 0;
3145 te = 0;
3146 while ((*s == *t) && !se && !te) {
3147 s++;
3148 t++;
3149 switch (*s) {
3150 case ' ':
3151 case '\n':
3152 case '\t':
3153 case '\0':
3154 se = 1;
3155 }
3156 switch (*t) {
3157 case ' ':
3158 case '\n':
3159 case '\t':
3160 case '\0':
3161 te = 1;
3162 }
3163 }
3164 if (!se || !te) {
3165 // not terminal suffix difference
3166 if (olds)
3167 return -1;
3168 return 1;
3169 }
3170 olds = s;
3171 s = strstr(s, MORPH_DERI_SFX);
3172 if (!s || (sl && sl < s))
3173 s = strstr(olds, MORPH_INFL_SFX);
3174 if (!s || (sl && sl < s)) {
3175 s = strstr(olds, MORPH_TERM_SFX);
3176 olds = NULL;
3177 }
3178 oldt = t;
3179 t = strstr(t, MORPH_DERI_SFX);
3180 if (!t || (tl && tl < t))
3181 t = strstr(oldt, MORPH_INFL_SFX);
3182 if (!t || (tl && tl < t)) {
3183 t = strstr(oldt, MORPH_TERM_SFX);
3184 oldt = NULL;
3185 }
3186 }
3187 if (!s && !t && se && te)
3188 return 0;
3189 return 1;
3190 }
3191
morphgen(const char * ts,int wl,const unsigned short * ap,unsigned short al,const char * morph,const char * targetmorph,int level)3192 std::string AffixMgr::morphgen(const char* ts,
3193 int wl,
3194 const unsigned short* ap,
3195 unsigned short al,
3196 const char* morph,
3197 const char* targetmorph,
3198 int level) {
3199 // handle suffixes
3200 if (!morph)
3201 return std::string();
3202
3203 // check substandard flag
3204 if (TESTAFF(ap, substandard, al))
3205 return std::string();
3206
3207 if (morphcmp(morph, targetmorph) == 0)
3208 return ts;
3209
3210 size_t stemmorphcatpos;
3211 std::string mymorph;
3212
3213 // use input suffix fields, if exist
3214 if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) {
3215 mymorph.assign(morph);
3216 mymorph.append(" ");
3217 stemmorphcatpos = mymorph.size();
3218 } else {
3219 stemmorphcatpos = std::string::npos;
3220 }
3221
3222 for (int i = 0; i < al; i++) {
3223 const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
3224 SfxEntry* sptr = sFlag[c];
3225 while (sptr) {
3226 if (sptr->getFlag() == ap[i] && sptr->getMorph() &&
3227 ((sptr->getContLen() == 0) ||
3228 // don't generate forms with substandard affixes
3229 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) {
3230 const char* stemmorph;
3231 if (stemmorphcatpos != std::string::npos) {
3232 mymorph.replace(stemmorphcatpos, std::string::npos, sptr->getMorph());
3233 stemmorph = mymorph.c_str();
3234 } else {
3235 stemmorph = sptr->getMorph();
3236 }
3237
3238 int cmp = morphcmp(stemmorph, targetmorph);
3239
3240 if (cmp == 0) {
3241 std::string newword = sptr->add(ts, wl);
3242 if (!newword.empty()) {
3243 hentry* check = pHMgr->lookup(newword.c_str()); // XXX extra dic
3244 if (!check || !check->astr ||
3245 !(TESTAFF(check->astr, forbiddenword, check->alen) ||
3246 TESTAFF(check->astr, ONLYUPCASEFLAG, check->alen))) {
3247 return newword;
3248 }
3249 }
3250 }
3251
3252 // recursive call for secondary suffixes
3253 if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) &&
3254 !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) {
3255 std::string newword = sptr->add(ts, wl);
3256 if (!newword.empty()) {
3257 std::string newword2 =
3258 morphgen(newword.c_str(), newword.size(), sptr->getCont(),
3259 sptr->getContLen(), stemmorph, targetmorph, 1);
3260
3261 if (!newword2.empty()) {
3262 return newword2;
3263 }
3264 }
3265 }
3266 }
3267 sptr = sptr->getFlgNxt();
3268 }
3269 }
3270 return std::string();
3271 }
3272
expand_rootword(struct guessword * wlst,int maxn,const char * ts,int wl,const unsigned short * ap,unsigned short al,const char * bad,int badl,const char * phon)3273 int AffixMgr::expand_rootword(struct guessword* wlst,
3274 int maxn,
3275 const char* ts,
3276 int wl,
3277 const unsigned short* ap,
3278 unsigned short al,
3279 const char* bad,
3280 int badl,
3281 const char* phon) {
3282 int nh = 0;
3283 // first add root word to list
3284 if ((nh < maxn) &&
3285 !(al && ((needaffix && TESTAFF(ap, needaffix, al)) ||
3286 (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
3287 wlst[nh].word = mystrdup(ts);
3288 if (!wlst[nh].word)
3289 return 0;
3290 wlst[nh].allow = false;
3291 wlst[nh].orig = NULL;
3292 nh++;
3293 // add special phonetic version
3294 if (phon && (nh < maxn)) {
3295 wlst[nh].word = mystrdup(phon);
3296 if (!wlst[nh].word)
3297 return nh - 1;
3298 wlst[nh].allow = false;
3299 wlst[nh].orig = mystrdup(ts);
3300 if (!wlst[nh].orig)
3301 return nh - 1;
3302 nh++;
3303 }
3304 }
3305
3306 // handle suffixes
3307 for (int i = 0; i < al; i++) {
3308 const unsigned char c = (unsigned char)(ap[i] & 0x00FF);
3309 SfxEntry* sptr = sFlag[c];
3310 while (sptr) {
3311 if ((sptr->getFlag() == ap[i]) &&
3312 (!sptr->getKeyLen() ||
3313 ((badl > sptr->getKeyLen()) &&
3314 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
3315 // check needaffix flag
3316 !(sptr->getCont() &&
3317 ((needaffix &&
3318 TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) ||
3319 (circumfix &&
3320 TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
3321 (onlyincompound &&
3322 TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) {
3323 std::string newword = sptr->add(ts, wl);
3324 if (!newword.empty()) {
3325 if (nh < maxn) {
3326 wlst[nh].word = mystrdup(newword.c_str());
3327 wlst[nh].allow = sptr->allowCross();
3328 wlst[nh].orig = NULL;
3329 nh++;
3330 // add special phonetic version
3331 if (phon && (nh < maxn)) {
3332 std::string prefix(phon);
3333 std::string key(sptr->getKey());
3334 reverseword(key);
3335 prefix.append(key);
3336 wlst[nh].word = mystrdup(prefix.c_str());
3337 if (!wlst[nh].word)
3338 return nh - 1;
3339 wlst[nh].allow = false;
3340 wlst[nh].orig = mystrdup(newword.c_str());
3341 if (!wlst[nh].orig)
3342 return nh - 1;
3343 nh++;
3344 }
3345 }
3346 }
3347 }
3348 sptr = sptr->getFlgNxt();
3349 }
3350 }
3351
3352 int n = nh;
3353
3354 // handle cross products of prefixes and suffixes
3355 for (int j = 1; j < n; j++)
3356 if (wlst[j].allow) {
3357 for (int k = 0; k < al; k++) {
3358 const unsigned char c = (unsigned char)(ap[k] & 0x00FF);
3359 PfxEntry* cptr = pFlag[c];
3360 while (cptr) {
3361 if ((cptr->getFlag() == ap[k]) && cptr->allowCross() &&
3362 (!cptr->getKeyLen() ||
3363 ((badl > cptr->getKeyLen()) &&
3364 (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
3365 int l1 = strlen(wlst[j].word);
3366 std::string newword = cptr->add(wlst[j].word, l1);
3367 if (!newword.empty()) {
3368 if (nh < maxn) {
3369 wlst[nh].word = mystrdup(newword.c_str());
3370 wlst[nh].allow = cptr->allowCross();
3371 wlst[nh].orig = NULL;
3372 nh++;
3373 }
3374 }
3375 }
3376 cptr = cptr->getFlgNxt();
3377 }
3378 }
3379 }
3380
3381 // now handle pure prefixes
3382 for (int m = 0; m < al; m++) {
3383 const unsigned char c = (unsigned char)(ap[m] & 0x00FF);
3384 PfxEntry* ptr = pFlag[c];
3385 while (ptr) {
3386 if ((ptr->getFlag() == ap[m]) &&
3387 (!ptr->getKeyLen() ||
3388 ((badl > ptr->getKeyLen()) &&
3389 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
3390 // check needaffix flag
3391 !(ptr->getCont() &&
3392 ((needaffix &&
3393 TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) ||
3394 (circumfix &&
3395 TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
3396 (onlyincompound &&
3397 TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))) {
3398 std::string newword = ptr->add(ts, wl);
3399 if (!newword.empty()) {
3400 if (nh < maxn) {
3401 wlst[nh].word = mystrdup(newword.c_str());
3402 wlst[nh].allow = ptr->allowCross();
3403 wlst[nh].orig = NULL;
3404 nh++;
3405 }
3406 }
3407 }
3408 ptr = ptr->getFlgNxt();
3409 }
3410 }
3411
3412 return nh;
3413 }
3414
3415 // return replacing table
get_reptable() const3416 const std::vector<replentry>& AffixMgr::get_reptable() const {
3417 return reptable;
3418 }
3419
3420 // return iconv table
get_iconvtable() const3421 RepList* AffixMgr::get_iconvtable() const {
3422 if (!iconvtable)
3423 return NULL;
3424 return iconvtable;
3425 }
3426
3427 // return oconv table
get_oconvtable() const3428 RepList* AffixMgr::get_oconvtable() const {
3429 if (!oconvtable)
3430 return NULL;
3431 return oconvtable;
3432 }
3433
3434 // return replacing table
get_phonetable() const3435 struct phonetable* AffixMgr::get_phonetable() const {
3436 if (!phone)
3437 return NULL;
3438 return phone;
3439 }
3440
3441 // return character map table
get_maptable() const3442 const std::vector<mapentry>& AffixMgr::get_maptable() const {
3443 return maptable;
3444 }
3445
3446 // return character map table
get_breaktable() const3447 const std::vector<std::string>& AffixMgr::get_breaktable() const {
3448 return breaktable;
3449 }
3450
3451 // return text encoding of dictionary
get_encoding()3452 const std::string& AffixMgr::get_encoding() {
3453 if (encoding.empty())
3454 encoding = SPELL_ENCODING;
3455 return encoding;
3456 }
3457
3458 // return text encoding of dictionary
get_langnum() const3459 int AffixMgr::get_langnum() const {
3460 return langnum;
3461 }
3462
3463 // return double prefix option
get_complexprefixes() const3464 int AffixMgr::get_complexprefixes() const {
3465 return complexprefixes;
3466 }
3467
3468 // return FULLSTRIP option
get_fullstrip() const3469 int AffixMgr::get_fullstrip() const {
3470 return fullstrip;
3471 }
3472
get_keepcase() const3473 FLAG AffixMgr::get_keepcase() const {
3474 return keepcase;
3475 }
3476
get_forceucase() const3477 FLAG AffixMgr::get_forceucase() const {
3478 return forceucase;
3479 }
3480
get_warn() const3481 FLAG AffixMgr::get_warn() const {
3482 return warn;
3483 }
3484
get_forbidwarn() const3485 int AffixMgr::get_forbidwarn() const {
3486 return forbidwarn;
3487 }
3488
get_checksharps() const3489 int AffixMgr::get_checksharps() const {
3490 return checksharps;
3491 }
3492
encode_flag(unsigned short aflag) const3493 char* AffixMgr::encode_flag(unsigned short aflag) const {
3494 return pHMgr->encode_flag(aflag);
3495 }
3496
3497 // return the preferred ignore string for suggestions
get_ignore() const3498 const char* AffixMgr::get_ignore() const {
3499 if (ignorechars.empty())
3500 return NULL;
3501 return ignorechars.c_str();
3502 }
3503
3504 // return the preferred ignore string for suggestions
get_ignore_utf16() const3505 const std::vector<w_char>& AffixMgr::get_ignore_utf16() const {
3506 return ignorechars_utf16;
3507 }
3508
3509 // return the keyboard string for suggestions
get_key_string()3510 char* AffixMgr::get_key_string() {
3511 if (keystring.empty())
3512 keystring = SPELL_KEYSTRING;
3513 return mystrdup(keystring.c_str());
3514 }
3515
3516 // return the preferred try string for suggestions
get_try_string() const3517 char* AffixMgr::get_try_string() const {
3518 if (trystring.empty())
3519 return NULL;
3520 return mystrdup(trystring.c_str());
3521 }
3522
3523 // return the preferred try string for suggestions
get_wordchars() const3524 const std::string& AffixMgr::get_wordchars() const {
3525 return wordchars;
3526 }
3527
get_wordchars_utf16() const3528 const std::vector<w_char>& AffixMgr::get_wordchars_utf16() const {
3529 return wordchars_utf16;
3530 }
3531
3532 // is there compounding?
get_compound() const3533 int AffixMgr::get_compound() const {
3534 return compoundflag || compoundbegin || !defcpdtable.empty();
3535 }
3536
3537 // return the compound words control flag
get_compoundflag() const3538 FLAG AffixMgr::get_compoundflag() const {
3539 return compoundflag;
3540 }
3541
3542 // return the forbidden words control flag
get_forbiddenword() const3543 FLAG AffixMgr::get_forbiddenword() const {
3544 return forbiddenword;
3545 }
3546
3547 // return the forbidden words control flag
get_nosuggest() const3548 FLAG AffixMgr::get_nosuggest() const {
3549 return nosuggest;
3550 }
3551
3552 // return the forbidden words control flag
get_nongramsuggest() const3553 FLAG AffixMgr::get_nongramsuggest() const {
3554 return nongramsuggest;
3555 }
3556
3557 // return the forbidden words flag modify flag
get_needaffix() const3558 FLAG AffixMgr::get_needaffix() const {
3559 return needaffix;
3560 }
3561
3562 // return the onlyincompound flag
get_onlyincompound() const3563 FLAG AffixMgr::get_onlyincompound() const {
3564 return onlyincompound;
3565 }
3566
3567 // return the value of suffix
get_version() const3568 const std::string& AffixMgr::get_version() const {
3569 return version;
3570 }
3571
3572 // utility method to look up root words in hash table
lookup(const char * word)3573 struct hentry* AffixMgr::lookup(const char* word) {
3574 struct hentry* he = NULL;
3575 for (size_t i = 0; i < alldic.size() && !he; ++i) {
3576 he = alldic[i]->lookup(word);
3577 }
3578 return he;
3579 }
3580
3581 // return the value of suffix
have_contclass() const3582 int AffixMgr::have_contclass() const {
3583 return havecontclass;
3584 }
3585
3586 // return utf8
get_utf8() const3587 int AffixMgr::get_utf8() const {
3588 return utf8;
3589 }
3590
get_maxngramsugs(void) const3591 int AffixMgr::get_maxngramsugs(void) const {
3592 return maxngramsugs;
3593 }
3594
get_maxcpdsugs(void) const3595 int AffixMgr::get_maxcpdsugs(void) const {
3596 return maxcpdsugs;
3597 }
3598
get_maxdiff(void) const3599 int AffixMgr::get_maxdiff(void) const {
3600 return maxdiff;
3601 }
3602
get_onlymaxdiff(void) const3603 int AffixMgr::get_onlymaxdiff(void) const {
3604 return onlymaxdiff;
3605 }
3606
3607 // return nosplitsugs
get_nosplitsugs(void) const3608 int AffixMgr::get_nosplitsugs(void) const {
3609 return nosplitsugs;
3610 }
3611
3612 // return sugswithdots
get_sugswithdots(void) const3613 int AffixMgr::get_sugswithdots(void) const {
3614 return sugswithdots;
3615 }
3616
3617 /* parse flag */
parse_flag(const std::string & line,unsigned short * out,FileMgr * af)3618 bool AffixMgr::parse_flag(const std::string& line, unsigned short* out, FileMgr* af) {
3619 if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) {
3620 HUNSPELL_WARNING(
3621 stderr,
3622 "error: line %d: multiple definitions of an affix file parameter\n",
3623 af->getlinenum());
3624 return false;
3625 }
3626 std::string s;
3627 if (!parse_string(line, s, af->getlinenum()))
3628 return false;
3629 *out = pHMgr->decode_flag(s.c_str());
3630 return true;
3631 }
3632
3633 /* parse num */
parse_num(const std::string & line,int * out,FileMgr * af)3634 bool AffixMgr::parse_num(const std::string& line, int* out, FileMgr* af) {
3635 if (*out != -1) {
3636 HUNSPELL_WARNING(
3637 stderr,
3638 "error: line %d: multiple definitions of an affix file parameter\n",
3639 af->getlinenum());
3640 return false;
3641 }
3642 std::string s;
3643 if (!parse_string(line, s, af->getlinenum()))
3644 return false;
3645 *out = atoi(s.c_str());
3646 return true;
3647 }
3648
3649 /* parse in the max syllablecount of compound words and */
parse_cpdsyllable(const std::string & line,FileMgr * af)3650 bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) {
3651 int i = 0;
3652 int np = 0;
3653 std::string::const_iterator iter = line.begin();
3654 std::string::const_iterator start_piece = mystrsep(line, iter);
3655 while (start_piece != line.end()) {
3656 switch (i) {
3657 case 0: {
3658 np++;
3659 break;
3660 }
3661 case 1: {
3662 cpdmaxsyllable = atoi(std::string(start_piece, iter).c_str());
3663 np++;
3664 break;
3665 }
3666 case 2: {
3667 if (!utf8) {
3668 cpdvowels.assign(start_piece, iter);
3669 std::sort(cpdvowels.begin(), cpdvowels.end());
3670 } else {
3671 std::string piece(start_piece, iter);
3672 u8_u16(cpdvowels_utf16, piece);
3673 std::sort(cpdvowels_utf16.begin(), cpdvowels_utf16.end());
3674 }
3675 np++;
3676 break;
3677 }
3678 default:
3679 break;
3680 }
3681 ++i;
3682 start_piece = mystrsep(line, iter);
3683 }
3684 if (np < 2) {
3685 HUNSPELL_WARNING(stderr,
3686 "error: line %d: missing compoundsyllable information\n",
3687 af->getlinenum());
3688 return false;
3689 }
3690 if (np == 2)
3691 cpdvowels = "AEIOUaeiou";
3692 return true;
3693 }
3694
3695 /* parse in the typical fault correcting table */
parse_reptable(const std::string & line,FileMgr * af)3696 bool AffixMgr::parse_reptable(const std::string& line, FileMgr* af) {
3697 if (parsedrep) {
3698 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3699 af->getlinenum());
3700 return false;
3701 }
3702 parsedrep = true;
3703 int numrep = -1;
3704 int i = 0;
3705 int np = 0;
3706 std::string::const_iterator iter = line.begin();
3707 std::string::const_iterator start_piece = mystrsep(line, iter);
3708 while (start_piece != line.end()) {
3709 switch (i) {
3710 case 0: {
3711 np++;
3712 break;
3713 }
3714 case 1: {
3715 numrep = atoi(std::string(start_piece, iter).c_str());
3716 if (numrep < 1) {
3717 HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
3718 af->getlinenum());
3719 return false;
3720 }
3721 reptable.reserve(numrep);
3722 np++;
3723 break;
3724 }
3725 default:
3726 break;
3727 }
3728 ++i;
3729 start_piece = mystrsep(line, iter);
3730 }
3731 if (np != 2) {
3732 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3733 af->getlinenum());
3734 return false;
3735 }
3736
3737 /* now parse the numrep lines to read in the remainder of the table */
3738 for (int j = 0; j < numrep; ++j) {
3739 std::string nl;
3740 if (!af->getline(nl))
3741 return false;
3742 mychomp(nl);
3743 reptable.push_back(replentry());
3744 iter = nl.begin();
3745 i = 0;
3746 int type = 0;
3747 start_piece = mystrsep(nl, iter);
3748 while (start_piece != nl.end()) {
3749 switch (i) {
3750 case 0: {
3751 if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) {
3752 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3753 af->getlinenum());
3754 reptable.clear();
3755 return false;
3756 }
3757 break;
3758 }
3759 case 1: {
3760 if (*start_piece == '^')
3761 type = 1;
3762 reptable.back().pattern.assign(start_piece + type, iter);
3763 mystrrep(reptable.back().pattern, "_", " ");
3764 if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') {
3765 type += 2;
3766 reptable.back().pattern.resize(reptable.back().pattern.size() - 1);
3767 }
3768 break;
3769 }
3770 case 2: {
3771 reptable.back().outstrings[type].assign(start_piece, iter);
3772 mystrrep(reptable.back().outstrings[type], "_", " ");
3773 break;
3774 }
3775 default:
3776 break;
3777 }
3778 ++i;
3779 start_piece = mystrsep(nl, iter);
3780 }
3781 if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) {
3782 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3783 af->getlinenum());
3784 reptable.clear();
3785 return false;
3786 }
3787 }
3788 return true;
3789 }
3790
3791 /* parse in the typical fault correcting table */
parse_convtable(const std::string & line,FileMgr * af,RepList ** rl,const std::string & keyword)3792 bool AffixMgr::parse_convtable(const std::string& line,
3793 FileMgr* af,
3794 RepList** rl,
3795 const std::string& keyword) {
3796 if (*rl) {
3797 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3798 af->getlinenum());
3799 return false;
3800 }
3801 int i = 0;
3802 int np = 0;
3803 int numrl = 0;
3804 std::string::const_iterator iter = line.begin();
3805 std::string::const_iterator start_piece = mystrsep(line, iter);
3806 while (start_piece != line.end()) {
3807 switch (i) {
3808 case 0: {
3809 np++;
3810 break;
3811 }
3812 case 1: {
3813 numrl = atoi(std::string(start_piece, iter).c_str());
3814 if (numrl < 1) {
3815 HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
3816 af->getlinenum());
3817 return false;
3818 }
3819 *rl = new RepList(numrl);
3820 if (!*rl)
3821 return false;
3822 np++;
3823 break;
3824 }
3825 default:
3826 break;
3827 }
3828 ++i;
3829 start_piece = mystrsep(line, iter);
3830 }
3831 if (np != 2) {
3832 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3833 af->getlinenum());
3834 return false;
3835 }
3836
3837 /* now parse the num lines to read in the remainder of the table */
3838 for (int j = 0; j < numrl; j++) {
3839 std::string nl;
3840 if (!af->getline(nl))
3841 return false;
3842 mychomp(nl);
3843 i = 0;
3844 std::string pattern;
3845 std::string pattern2;
3846 iter = nl.begin();
3847 start_piece = mystrsep(nl, iter);
3848 while (start_piece != nl.end()) {
3849 {
3850 switch (i) {
3851 case 0: {
3852 if (nl.compare(start_piece - nl.begin(), keyword.size(), keyword, 0, keyword.size()) != 0) {
3853 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3854 af->getlinenum());
3855 delete *rl;
3856 *rl = NULL;
3857 return false;
3858 }
3859 break;
3860 }
3861 case 1: {
3862 pattern.assign(start_piece, iter);
3863 break;
3864 }
3865 case 2: {
3866 pattern2.assign(start_piece, iter);
3867 break;
3868 }
3869 default:
3870 break;
3871 }
3872 ++i;
3873 }
3874 start_piece = mystrsep(nl, iter);
3875 }
3876 if (pattern.empty() || pattern2.empty()) {
3877 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3878 af->getlinenum());
3879 return false;
3880 }
3881 (*rl)->add(pattern, pattern2);
3882 }
3883 return true;
3884 }
3885
3886 /* parse in the typical fault correcting table */
parse_phonetable(const std::string & line,FileMgr * af)3887 bool AffixMgr::parse_phonetable(const std::string& line, FileMgr* af) {
3888 if (phone) {
3889 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3890 af->getlinenum());
3891 return false;
3892 }
3893 int num = -1;
3894 int i = 0;
3895 int np = 0;
3896 std::string::const_iterator iter = line.begin();
3897 std::string::const_iterator start_piece = mystrsep(line, iter);
3898 while (start_piece != line.end()) {
3899 switch (i) {
3900 case 0: {
3901 np++;
3902 break;
3903 }
3904 case 1: {
3905 num = atoi(std::string(start_piece, iter).c_str());
3906 if (num < 1) {
3907 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
3908 af->getlinenum());
3909 return false;
3910 }
3911 phone = new phonetable;
3912 phone->utf8 = (char)utf8;
3913 np++;
3914 break;
3915 }
3916 default:
3917 break;
3918 }
3919 ++i;
3920 start_piece = mystrsep(line, iter);
3921 }
3922 if (np != 2) {
3923 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
3924 af->getlinenum());
3925 return false;
3926 }
3927
3928 /* now parse the phone->num lines to read in the remainder of the table */
3929 for (int j = 0; j < num; ++j) {
3930 std::string nl;
3931 if (!af->getline(nl))
3932 return false;
3933 mychomp(nl);
3934 i = 0;
3935 const size_t old_size = phone->rules.size();
3936 iter = nl.begin();
3937 start_piece = mystrsep(nl, iter);
3938 while (start_piece != nl.end()) {
3939 {
3940 switch (i) {
3941 case 0: {
3942 if (nl.compare(start_piece - nl.begin(), 5, "PHONE", 5) != 0) {
3943 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3944 af->getlinenum());
3945 return false;
3946 }
3947 break;
3948 }
3949 case 1: {
3950 phone->rules.push_back(std::string(start_piece, iter));
3951 break;
3952 }
3953 case 2: {
3954 phone->rules.push_back(std::string(start_piece, iter));
3955 mystrrep(phone->rules.back(), "_", "");
3956 break;
3957 }
3958 default:
3959 break;
3960 }
3961 ++i;
3962 }
3963 start_piece = mystrsep(nl, iter);
3964 }
3965 if (phone->rules.size() != old_size + 2) {
3966 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
3967 af->getlinenum());
3968 phone->rules.clear();
3969 return false;
3970 }
3971 }
3972 phone->rules.push_back("");
3973 phone->rules.push_back("");
3974 init_phonet_hash(*phone);
3975 return true;
3976 }
3977
3978 /* parse in the checkcompoundpattern table */
parse_checkcpdtable(const std::string & line,FileMgr * af)3979 bool AffixMgr::parse_checkcpdtable(const std::string& line, FileMgr* af) {
3980 if (parsedcheckcpd) {
3981 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
3982 af->getlinenum());
3983 return false;
3984 }
3985 parsedcheckcpd = true;
3986 int numcheckcpd = -1;
3987 int i = 0;
3988 int np = 0;
3989 std::string::const_iterator iter = line.begin();
3990 std::string::const_iterator start_piece = mystrsep(line, iter);
3991 while (start_piece != line.end()) {
3992 switch (i) {
3993 case 0: {
3994 np++;
3995 break;
3996 }
3997 case 1: {
3998 numcheckcpd = atoi(std::string(start_piece, iter).c_str());
3999 if (numcheckcpd < 1) {
4000 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4001 af->getlinenum());
4002 return false;
4003 }
4004 checkcpdtable.reserve(numcheckcpd);
4005 np++;
4006 break;
4007 }
4008 default:
4009 break;
4010 }
4011 ++i;
4012 start_piece = mystrsep(line, iter);
4013 }
4014 if (np != 2) {
4015 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4016 af->getlinenum());
4017 return false;
4018 }
4019
4020 /* now parse the numcheckcpd lines to read in the remainder of the table */
4021 for (int j = 0; j < numcheckcpd; ++j) {
4022 std::string nl;
4023 if (!af->getline(nl))
4024 return false;
4025 mychomp(nl);
4026 i = 0;
4027 checkcpdtable.push_back(patentry());
4028 iter = nl.begin();
4029 start_piece = mystrsep(nl, iter);
4030 while (start_piece != nl.end()) {
4031 switch (i) {
4032 case 0: {
4033 if (nl.compare(start_piece - nl.begin(), 20, "CHECKCOMPOUNDPATTERN", 20) != 0) {
4034 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4035 af->getlinenum());
4036 return false;
4037 }
4038 break;
4039 }
4040 case 1: {
4041 checkcpdtable.back().pattern.assign(start_piece, iter);
4042 size_t slash_pos = checkcpdtable.back().pattern.find('/');
4043 if (slash_pos != std::string::npos) {
4044 std::string chunk(checkcpdtable.back().pattern, slash_pos + 1);
4045 checkcpdtable.back().pattern.resize(slash_pos);
4046 checkcpdtable.back().cond = pHMgr->decode_flag(chunk.c_str());
4047 }
4048 break;
4049 }
4050 case 2: {
4051 checkcpdtable.back().pattern2.assign(start_piece, iter);
4052 size_t slash_pos = checkcpdtable.back().pattern2.find('/');
4053 if (slash_pos != std::string::npos) {
4054 std::string chunk(checkcpdtable.back().pattern2, slash_pos + 1);
4055 checkcpdtable.back().pattern2.resize(slash_pos);
4056 checkcpdtable.back().cond2 = pHMgr->decode_flag(chunk.c_str());
4057 }
4058 break;
4059 }
4060 case 3: {
4061 checkcpdtable.back().pattern3.assign(start_piece, iter);
4062 simplifiedcpd = 1;
4063 break;
4064 }
4065 default:
4066 break;
4067 }
4068 i++;
4069 start_piece = mystrsep(nl, iter);
4070 }
4071 }
4072 return true;
4073 }
4074
4075 /* parse in the compound rule table */
parse_defcpdtable(const std::string & line,FileMgr * af)4076 bool AffixMgr::parse_defcpdtable(const std::string& line, FileMgr* af) {
4077 if (parseddefcpd) {
4078 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4079 af->getlinenum());
4080 return false;
4081 }
4082 parseddefcpd = true;
4083 int numdefcpd = -1;
4084 int i = 0;
4085 int np = 0;
4086 std::string::const_iterator iter = line.begin();
4087 std::string::const_iterator start_piece = mystrsep(line, iter);
4088 while (start_piece != line.end()) {
4089 switch (i) {
4090 case 0: {
4091 np++;
4092 break;
4093 }
4094 case 1: {
4095 numdefcpd = atoi(std::string(start_piece, iter).c_str());
4096 if (numdefcpd < 1) {
4097 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4098 af->getlinenum());
4099 return false;
4100 }
4101 defcpdtable.reserve(numdefcpd);
4102 np++;
4103 break;
4104 }
4105 default:
4106 break;
4107 }
4108 ++i;
4109 start_piece = mystrsep(line, iter);
4110 }
4111 if (np != 2) {
4112 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4113 af->getlinenum());
4114 return false;
4115 }
4116
4117 /* now parse the numdefcpd lines to read in the remainder of the table */
4118 for (int j = 0; j < numdefcpd; ++j) {
4119 std::string nl;
4120 if (!af->getline(nl))
4121 return false;
4122 mychomp(nl);
4123 i = 0;
4124 defcpdtable.push_back(flagentry());
4125 iter = nl.begin();
4126 start_piece = mystrsep(nl, iter);
4127 while (start_piece != nl.end()) {
4128 switch (i) {
4129 case 0: {
4130 if (nl.compare(start_piece - nl.begin(), 12, "COMPOUNDRULE", 12) != 0) {
4131 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4132 af->getlinenum());
4133 numdefcpd = 0;
4134 return false;
4135 }
4136 break;
4137 }
4138 case 1: { // handle parenthesized flags
4139 if (std::find(start_piece, iter, '(') != iter) {
4140 for (std::string::const_iterator k = start_piece; k != iter; ++k) {
4141 std::string::const_iterator chb = k;
4142 std::string::const_iterator che = k + 1;
4143 if (*k == '(') {
4144 std::string::const_iterator parpos = std::find(k, iter, ')');
4145 if (parpos != iter) {
4146 chb = k + 1;
4147 che = parpos;
4148 k = parpos;
4149 }
4150 }
4151
4152 if (*chb == '*' || *chb == '?') {
4153 defcpdtable.back().push_back((FLAG)*chb);
4154 } else {
4155 pHMgr->decode_flags(defcpdtable.back(), std::string(chb, che), af);
4156 }
4157 }
4158 } else {
4159 pHMgr->decode_flags(defcpdtable.back(), std::string(start_piece, iter), af);
4160 }
4161 break;
4162 }
4163 default:
4164 break;
4165 }
4166 ++i;
4167 start_piece = mystrsep(nl, iter);
4168 }
4169 if (defcpdtable.back().empty()) {
4170 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4171 af->getlinenum());
4172 return false;
4173 }
4174 }
4175 return true;
4176 }
4177
4178 /* parse in the character map table */
parse_maptable(const std::string & line,FileMgr * af)4179 bool AffixMgr::parse_maptable(const std::string& line, FileMgr* af) {
4180 if (parsedmaptable) {
4181 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4182 af->getlinenum());
4183 return false;
4184 }
4185 parsedmaptable = true;
4186 int nummap = -1;
4187 int i = 0;
4188 int np = 0;
4189 std::string::const_iterator iter = line.begin();
4190 std::string::const_iterator start_piece = mystrsep(line, iter);
4191 while (start_piece != line.end()) {
4192 switch (i) {
4193 case 0: {
4194 np++;
4195 break;
4196 }
4197 case 1: {
4198 nummap = atoi(std::string(start_piece, iter).c_str());
4199 if (nummap < 1) {
4200 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4201 af->getlinenum());
4202 return false;
4203 }
4204 maptable.reserve(nummap);
4205 np++;
4206 break;
4207 }
4208 default:
4209 break;
4210 }
4211 ++i;
4212 start_piece = mystrsep(line, iter);
4213 }
4214 if (np != 2) {
4215 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4216 af->getlinenum());
4217 return false;
4218 }
4219
4220 /* now parse the nummap lines to read in the remainder of the table */
4221 for (int j = 0; j < nummap; ++j) {
4222 std::string nl;
4223 if (!af->getline(nl))
4224 return false;
4225 mychomp(nl);
4226 i = 0;
4227 maptable.push_back(mapentry());
4228 iter = nl.begin();
4229 start_piece = mystrsep(nl, iter);
4230 while (start_piece != nl.end()) {
4231 switch (i) {
4232 case 0: {
4233 if (nl.compare(start_piece - nl.begin(), 3, "MAP", 3) != 0) {
4234 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4235 af->getlinenum());
4236 nummap = 0;
4237 return false;
4238 }
4239 break;
4240 }
4241 case 1: {
4242 for (std::string::const_iterator k = start_piece; k != iter; ++k) {
4243 std::string::const_iterator chb = k;
4244 std::string::const_iterator che = k + 1;
4245 if (*k == '(') {
4246 std::string::const_iterator parpos = std::find(k, iter, ')');
4247 if (parpos != iter) {
4248 chb = k + 1;
4249 che = parpos;
4250 k = parpos;
4251 }
4252 } else {
4253 if (utf8 && (*k & 0xc0) == 0xc0) {
4254 ++k;
4255 while (k != iter && (*k & 0xc0) == 0x80)
4256 ++k;
4257 che = k;
4258 --k;
4259 }
4260 }
4261 maptable.back().push_back(std::string(chb, che));
4262 }
4263 break;
4264 }
4265 default:
4266 break;
4267 }
4268 ++i;
4269 start_piece = mystrsep(nl, iter);
4270 }
4271 if (maptable.back().empty()) {
4272 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4273 af->getlinenum());
4274 return false;
4275 }
4276 }
4277 return true;
4278 }
4279
4280 /* parse in the word breakpoint table */
parse_breaktable(const std::string & line,FileMgr * af)4281 bool AffixMgr::parse_breaktable(const std::string& line, FileMgr* af) {
4282 if (parsedbreaktable) {
4283 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
4284 af->getlinenum());
4285 return false;
4286 }
4287 parsedbreaktable = true;
4288 int numbreak = -1;
4289 int i = 0;
4290 int np = 0;
4291 std::string::const_iterator iter = line.begin();
4292 std::string::const_iterator start_piece = mystrsep(line, iter);
4293 while (start_piece != line.end()) {
4294 switch (i) {
4295 case 0: {
4296 np++;
4297 break;
4298 }
4299 case 1: {
4300 numbreak = atoi(std::string(start_piece, iter).c_str());
4301 if (numbreak < 0) {
4302 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4303 af->getlinenum());
4304 return false;
4305 }
4306 if (numbreak == 0)
4307 return true;
4308 breaktable.reserve(numbreak);
4309 np++;
4310 break;
4311 }
4312 default:
4313 break;
4314 }
4315 ++i;
4316 start_piece = mystrsep(line, iter);
4317 }
4318 if (np != 2) {
4319 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4320 af->getlinenum());
4321 return false;
4322 }
4323
4324 /* now parse the numbreak lines to read in the remainder of the table */
4325 for (int j = 0; j < numbreak; ++j) {
4326 std::string nl;
4327 if (!af->getline(nl))
4328 return false;
4329 mychomp(nl);
4330 i = 0;
4331 iter = nl.begin();
4332 start_piece = mystrsep(nl, iter);
4333 while (start_piece != nl.end()) {
4334 switch (i) {
4335 case 0: {
4336 if (nl.compare(start_piece - nl.begin(), 5, "BREAK", 5) != 0) {
4337 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4338 af->getlinenum());
4339 numbreak = 0;
4340 return false;
4341 }
4342 break;
4343 }
4344 case 1: {
4345 breaktable.push_back(std::string(start_piece, iter));
4346 break;
4347 }
4348 default:
4349 break;
4350 }
4351 ++i;
4352 start_piece = mystrsep(nl, iter);
4353 }
4354 }
4355
4356 if (breaktable.size() != static_cast<size_t>(numbreak)) {
4357 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
4358 af->getlinenum());
4359 return false;
4360 }
4361
4362 return true;
4363 }
4364
reverse_condition(std::string & piece)4365 void AffixMgr::reverse_condition(std::string& piece) {
4366 if (piece.empty())
4367 return;
4368
4369 int neg = 0;
4370 for (std::string::reverse_iterator k = piece.rbegin(); k != piece.rend(); ++k) {
4371 switch (*k) {
4372 case '[': {
4373 if (neg)
4374 *(k - 1) = '[';
4375 else
4376 *k = ']';
4377 break;
4378 }
4379 case ']': {
4380 *k = '[';
4381 if (neg)
4382 *(k - 1) = '^';
4383 neg = 0;
4384 break;
4385 }
4386 case '^': {
4387 if (*(k - 1) == ']')
4388 neg = 1;
4389 else
4390 *(k - 1) = *k;
4391 break;
4392 }
4393 default: {
4394 if (neg)
4395 *(k - 1) = *k;
4396 }
4397 }
4398 }
4399 }
4400
4401 class entries_container {
4402 std::vector<AffEntry*> entries;
4403 AffixMgr* m_mgr;
4404 char m_at;
4405 public:
entries_container(char at,AffixMgr * mgr)4406 entries_container(char at, AffixMgr* mgr)
4407 : m_mgr(mgr)
4408 , m_at(at) {
4409 }
release()4410 void release() {
4411 entries.clear();
4412 }
initialize(int numents,char opts,unsigned short aflag)4413 void initialize(int numents,
4414 char opts, unsigned short aflag) {
4415 entries.reserve(numents);
4416
4417 if (m_at == 'P') {
4418 entries.push_back(new PfxEntry(m_mgr));
4419 } else {
4420 entries.push_back(new SfxEntry(m_mgr));
4421 }
4422
4423 entries.back()->opts = opts;
4424 entries.back()->aflag = aflag;
4425 }
4426
add_entry(char opts)4427 AffEntry* add_entry(char opts) {
4428 if (m_at == 'P') {
4429 entries.push_back(new PfxEntry(m_mgr));
4430 } else {
4431 entries.push_back(new SfxEntry(m_mgr));
4432 }
4433 AffEntry* ret = entries.back();
4434 ret->opts = entries[0]->opts & opts;
4435 return ret;
4436 }
4437
first_entry()4438 AffEntry* first_entry() {
4439 return entries.empty() ? NULL : entries[0];
4440 }
4441
~entries_container()4442 ~entries_container() {
4443 for (size_t i = 0; i < entries.size(); ++i) {
4444 delete entries[i];
4445 }
4446 }
4447
begin()4448 std::vector<AffEntry*>::iterator begin() { return entries.begin(); }
end()4449 std::vector<AffEntry*>::iterator end() { return entries.end(); }
4450 };
4451
parse_affix(const std::string & line,const char at,FileMgr * af,char * dupflags)4452 bool AffixMgr::parse_affix(const std::string& line,
4453 const char at,
4454 FileMgr* af,
4455 char* dupflags) {
4456 int numents = 0; // number of AffEntry structures to parse
4457
4458 unsigned short aflag = 0; // affix char identifier
4459
4460 char ff = 0;
4461 entries_container affentries(at, this);
4462
4463 int i = 0;
4464
4465 // checking lines with bad syntax
4466 #ifdef DEBUG
4467 int basefieldnum = 0;
4468 #endif
4469
4470 // split affix header line into pieces
4471
4472 int np = 0;
4473 std::string::const_iterator iter = line.begin();
4474 std::string::const_iterator start_piece = mystrsep(line, iter);
4475 while (start_piece != line.end()) {
4476 switch (i) {
4477 // piece 1 - is type of affix
4478 case 0: {
4479 np++;
4480 break;
4481 }
4482
4483 // piece 2 - is affix char
4484 case 1: {
4485 np++;
4486 aflag = pHMgr->decode_flag(std::string(start_piece, iter).c_str());
4487 if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
4488 ((at == 'P') && (dupflags[aflag] & dupPFX))) {
4489 HUNSPELL_WARNING(
4490 stderr,
4491 "error: line %d: multiple definitions of an affix flag\n",
4492 af->getlinenum());
4493 }
4494 dupflags[aflag] += (char)((at == 'S') ? dupSFX : dupPFX);
4495 break;
4496 }
4497 // piece 3 - is cross product indicator
4498 case 2: {
4499 np++;
4500 if (*start_piece == 'Y')
4501 ff = aeXPRODUCT;
4502 break;
4503 }
4504
4505 // piece 4 - is number of affentries
4506 case 3: {
4507 np++;
4508 numents = atoi(std::string(start_piece, iter).c_str());
4509 if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
4510 sizeof(AffEntry)) < static_cast<size_t>(numents))) {
4511 char* err = pHMgr->encode_flag(aflag);
4512 if (err) {
4513 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
4514 af->getlinenum());
4515 free(err);
4516 }
4517 return false;
4518 }
4519
4520 char opts = ff;
4521 if (utf8)
4522 opts += aeUTF8;
4523 if (pHMgr->is_aliasf())
4524 opts += aeALIASF;
4525 if (pHMgr->is_aliasm())
4526 opts += aeALIASM;
4527 affentries.initialize(numents, opts, aflag);
4528 }
4529
4530 default:
4531 break;
4532 }
4533 ++i;
4534 start_piece = mystrsep(line, iter);
4535 }
4536 // check to make sure we parsed enough pieces
4537 if (np != 4) {
4538 char* err = pHMgr->encode_flag(aflag);
4539 if (err) {
4540 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
4541 af->getlinenum());
4542 free(err);
4543 }
4544 return false;
4545 }
4546
4547 // now parse numents affentries for this affix
4548 AffEntry* entry = affentries.first_entry();
4549 for (int ent = 0; ent < numents; ++ent) {
4550 std::string nl;
4551 if (!af->getline(nl))
4552 return false;
4553 mychomp(nl);
4554
4555 iter = nl.begin();
4556 i = 0;
4557 np = 0;
4558
4559 // split line into pieces
4560 start_piece = mystrsep(nl, iter);
4561 while (start_piece != nl.end()) {
4562 switch (i) {
4563 // piece 1 - is type
4564 case 0: {
4565 np++;
4566 if (ent != 0)
4567 entry = affentries.add_entry((char)(aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM));
4568 break;
4569 }
4570
4571 // piece 2 - is affix char
4572 case 1: {
4573 np++;
4574 std::string chunk(start_piece, iter);
4575 if (pHMgr->decode_flag(chunk.c_str()) != aflag) {
4576 char* err = pHMgr->encode_flag(aflag);
4577 if (err) {
4578 HUNSPELL_WARNING(stderr,
4579 "error: line %d: affix %s is corrupt\n",
4580 af->getlinenum(), err);
4581 free(err);
4582 }
4583 return false;
4584 }
4585
4586 if (ent != 0) {
4587 AffEntry* start_entry = affentries.first_entry();
4588 entry->aflag = start_entry->aflag;
4589 }
4590 break;
4591 }
4592
4593 // piece 3 - is string to strip or 0 for null
4594 case 2: {
4595 np++;
4596 entry->strip = std::string(start_piece, iter);
4597 if (complexprefixes) {
4598 if (utf8)
4599 reverseword_utf(entry->strip);
4600 else
4601 reverseword(entry->strip);
4602 }
4603 if (entry->strip.compare("0") == 0) {
4604 entry->strip.clear();
4605 }
4606 break;
4607 }
4608
4609 // piece 4 - is affix string or 0 for null
4610 case 3: {
4611 entry->morphcode = NULL;
4612 entry->contclass = NULL;
4613 entry->contclasslen = 0;
4614 np++;
4615 std::string::const_iterator dash = std::find(start_piece, iter, '/');
4616 if (dash != iter) {
4617 entry->appnd = std::string(start_piece, dash);
4618 std::string dash_str(dash + 1, iter);
4619
4620 if (!ignorechars.empty()) {
4621 if (utf8) {
4622 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
4623 } else {
4624 remove_ignored_chars(entry->appnd, ignorechars);
4625 }
4626 }
4627
4628 if (complexprefixes) {
4629 if (utf8)
4630 reverseword_utf(entry->appnd);
4631 else
4632 reverseword(entry->appnd);
4633 }
4634
4635 if (pHMgr->is_aliasf()) {
4636 int index = atoi(dash_str.c_str());
4637 entry->contclasslen = (unsigned short)pHMgr->get_aliasf(
4638 index, &(entry->contclass), af);
4639 if (!entry->contclasslen)
4640 HUNSPELL_WARNING(stderr,
4641 "error: bad affix flag alias: \"%s\"\n",
4642 dash_str.c_str());
4643 } else {
4644 entry->contclasslen = (unsigned short)pHMgr->decode_flags(
4645 &(entry->contclass), dash_str.c_str(), af);
4646 std::sort(entry->contclass, entry->contclass + entry->contclasslen);
4647 }
4648
4649 havecontclass = 1;
4650 for (unsigned short _i = 0; _i < entry->contclasslen; _i++) {
4651 contclasses[(entry->contclass)[_i]] = 1;
4652 }
4653 } else {
4654 entry->appnd = std::string(start_piece, iter);
4655
4656 if (!ignorechars.empty()) {
4657 if (utf8) {
4658 remove_ignored_chars_utf(entry->appnd, ignorechars_utf16);
4659 } else {
4660 remove_ignored_chars(entry->appnd, ignorechars);
4661 }
4662 }
4663
4664 if (complexprefixes) {
4665 if (utf8)
4666 reverseword_utf(entry->appnd);
4667 else
4668 reverseword(entry->appnd);
4669 }
4670 }
4671
4672 if (entry->appnd.compare("0") == 0) {
4673 entry->appnd.clear();
4674 }
4675 break;
4676 }
4677
4678 // piece 5 - is the conditions descriptions
4679 case 4: {
4680 std::string chunk(start_piece, iter);
4681 np++;
4682 if (complexprefixes) {
4683 if (utf8)
4684 reverseword_utf(chunk);
4685 else
4686 reverseword(chunk);
4687 reverse_condition(chunk);
4688 }
4689 if (!entry->strip.empty() && chunk != "." &&
4690 redundant_condition(at, entry->strip.c_str(), entry->strip.size(), chunk.c_str(),
4691 af->getlinenum()))
4692 chunk = ".";
4693 if (at == 'S') {
4694 reverseword(chunk);
4695 reverse_condition(chunk);
4696 }
4697 if (encodeit(*entry, chunk.c_str()))
4698 return false;
4699 break;
4700 }
4701
4702 case 5: {
4703 std::string chunk(start_piece, iter);
4704 np++;
4705 if (pHMgr->is_aliasm()) {
4706 int index = atoi(chunk.c_str());
4707 entry->morphcode = pHMgr->get_aliasm(index);
4708 } else {
4709 if (complexprefixes) { // XXX - fix me for morph. gen.
4710 if (utf8)
4711 reverseword_utf(chunk);
4712 else
4713 reverseword(chunk);
4714 }
4715 // add the remaining of the line
4716 std::string::const_iterator end = nl.end();
4717 if (iter != end) {
4718 chunk.append(iter, end);
4719 }
4720 entry->morphcode = mystrdup(chunk.c_str());
4721 if (!entry->morphcode)
4722 return false;
4723 }
4724 break;
4725 }
4726 default:
4727 break;
4728 }
4729 i++;
4730 start_piece = mystrsep(nl, iter);
4731 }
4732 // check to make sure we parsed enough pieces
4733 if (np < 4) {
4734 char* err = pHMgr->encode_flag(aflag);
4735 if (err) {
4736 HUNSPELL_WARNING(stderr, "error: line %d: affix %s is corrupt\n",
4737 af->getlinenum(), err);
4738 free(err);
4739 }
4740 return false;
4741 }
4742
4743 #ifdef DEBUG
4744 // detect unnecessary fields, excepting comments
4745 if (basefieldnum) {
4746 int fieldnum =
4747 !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
4748 if (fieldnum != basefieldnum)
4749 HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n",
4750 af->getlinenum());
4751 } else {
4752 basefieldnum =
4753 !(entry->morphcode) ? 5 : ((*(entry->morphcode) == '#') ? 5 : 6);
4754 }
4755 #endif
4756 }
4757
4758 // now create SfxEntry or PfxEntry objects and use links to
4759 // build an ordered (sorted by affix string) list
4760 std::vector<AffEntry*>::iterator start = affentries.begin();
4761 std::vector<AffEntry*>::iterator end = affentries.end();
4762 for (std::vector<AffEntry*>::iterator affentry = start; affentry != end; ++affentry) {
4763 if (at == 'P') {
4764 build_pfxtree(static_cast<PfxEntry*>(*affentry));
4765 } else {
4766 build_sfxtree(static_cast<SfxEntry*>(*affentry));
4767 }
4768 }
4769
4770 //contents belong to AffixMgr now
4771 affentries.release();
4772
4773 return true;
4774 }
4775
redundant_condition(char ft,const char * strip,int stripl,const char * cond,int linenum)4776 int AffixMgr::redundant_condition(char ft,
4777 const char* strip,
4778 int stripl,
4779 const char* cond,
4780 int linenum) {
4781 int condl = strlen(cond);
4782 int i;
4783 int j;
4784 int neg;
4785 int in;
4786 if (ft == 'P') { // prefix
4787 if (strncmp(strip, cond, condl) == 0)
4788 return 1;
4789 if (utf8) {
4790 } else {
4791 for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
4792 if (cond[j] != '[') {
4793 if (cond[j] != strip[i]) {
4794 HUNSPELL_WARNING(stderr,
4795 "warning: line %d: incompatible stripping "
4796 "characters and condition\n",
4797 linenum);
4798 return 0;
4799 }
4800 } else {
4801 neg = (cond[j + 1] == '^') ? 1 : 0;
4802 in = 0;
4803 do {
4804 j++;
4805 if (strip[i] == cond[j])
4806 in = 1;
4807 } while ((j < (condl - 1)) && (cond[j] != ']'));
4808 if (j == (condl - 1) && (cond[j] != ']')) {
4809 HUNSPELL_WARNING(stderr,
4810 "error: line %d: missing ] in condition:\n%s\n",
4811 linenum, cond);
4812 return 0;
4813 }
4814 if ((!neg && !in) || (neg && in)) {
4815 HUNSPELL_WARNING(stderr,
4816 "warning: line %d: incompatible stripping "
4817 "characters and condition\n",
4818 linenum);
4819 return 0;
4820 }
4821 }
4822 }
4823 if (j >= condl)
4824 return 1;
4825 }
4826 } else { // suffix
4827 if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0)
4828 return 1;
4829 if (utf8) {
4830 } else {
4831 for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
4832 if (cond[j] != ']') {
4833 if (cond[j] != strip[i]) {
4834 HUNSPELL_WARNING(stderr,
4835 "warning: line %d: incompatible stripping "
4836 "characters and condition\n",
4837 linenum);
4838 return 0;
4839 }
4840 } else {
4841 in = 0;
4842 do {
4843 j--;
4844 if (strip[i] == cond[j])
4845 in = 1;
4846 } while ((j > 0) && (cond[j] != '['));
4847 if ((j == 0) && (cond[j] != '[')) {
4848 HUNSPELL_WARNING(stderr,
4849 "error: line: %d: missing ] in condition:\n%s\n",
4850 linenum, cond);
4851 return 0;
4852 }
4853 neg = (cond[j + 1] == '^') ? 1 : 0;
4854 if ((!neg && !in) || (neg && in)) {
4855 HUNSPELL_WARNING(stderr,
4856 "warning: line %d: incompatible stripping "
4857 "characters and condition\n",
4858 linenum);
4859 return 0;
4860 }
4861 }
4862 }
4863 if (j < 0)
4864 return 1;
4865 }
4866 }
4867 return 0;
4868 }
4869
get_suffix_words(short unsigned * suff,int len,const char * root_word)4870 std::vector<std::string> AffixMgr::get_suffix_words(short unsigned* suff,
4871 int len,
4872 const char* root_word) {
4873 std::vector<std::string> slst;
4874 short unsigned* start_ptr = suff;
4875 for (int j = 0; j < SETSIZE; j++) {
4876 SfxEntry* ptr = sStart[j];
4877 while (ptr) {
4878 suff = start_ptr;
4879 for (int i = 0; i < len; i++) {
4880 if ((*suff) == ptr->getFlag()) {
4881 std::string nw(root_word);
4882 nw.append(ptr->getAffix());
4883 hentry* ht = ptr->checkword(nw.c_str(), nw.size(), 0, NULL, 0, 0, 0);
4884 if (ht) {
4885 slst.push_back(nw);
4886 }
4887 }
4888 suff++;
4889 }
4890 ptr = ptr->getNext();
4891 }
4892 }
4893 return slst;
4894 }
4895