1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2017 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37 /*
38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 * And Contributors. All rights reserved.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 *
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 *
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 *
52 * 3. All modifications to the source code must be clearly marked as
53 * such. Binary redistributions based on modified source code
54 * must be clearly marked as modified versions in the documentation
55 * and/or other materials provided with the distribution.
56 *
57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71 #include <stdlib.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <ctype.h>
75 #include <limits>
76 #include <sstream>
77
78 #include "hashmgr.hxx"
79 #include "csutil.hxx"
80 #include "atypes.hxx"
81
82 // build a hash table from a munched word list
83
HashMgr(const char * tpath,const char * apath,const char * key)84 HashMgr::HashMgr(const char* tpath, const char* apath, const char* key)
85 : tablesize(0),
86 tableptr(NULL),
87 flag_mode(FLAG_CHAR),
88 complexprefixes(0),
89 utf8(0),
90 forbiddenword(FORBIDDENWORD) // forbidden word signing flag
91 ,
92 numaliasf(0),
93 aliasf(NULL),
94 aliasflen(0),
95 numaliasm(0),
96 aliasm(NULL) {
97 langnum = 0;
98 csconv = 0;
99 load_config(apath, key);
100 int ec = load_tables(tpath, key);
101 if (ec) {
102 /* error condition - what should we do here */
103 HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", ec);
104 free(tableptr);
105 //keep tablesize to 1 to fix possible division with zero
106 tablesize = 1;
107 tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*));
108 if (!tableptr) {
109 tablesize = 0;
110 }
111 }
112 }
113
~HashMgr()114 HashMgr::~HashMgr() {
115 if (tableptr) {
116 // now pass through hash table freeing up everything
117 // go through column by column of the table
118 for (int i = 0; i < tablesize; i++) {
119 struct hentry* pt = tableptr[i];
120 struct hentry* nt = NULL;
121 while (pt) {
122 nt = pt->next;
123 if (pt->astr &&
124 (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen)))
125 free(pt->astr);
126 free(pt);
127 pt = nt;
128 }
129 }
130 free(tableptr);
131 }
132 tablesize = 0;
133
134 if (aliasf) {
135 for (int j = 0; j < (numaliasf); j++)
136 free(aliasf[j]);
137 free(aliasf);
138 aliasf = NULL;
139 if (aliasflen) {
140 free(aliasflen);
141 aliasflen = NULL;
142 }
143 }
144 if (aliasm) {
145 for (int j = 0; j < (numaliasm); j++)
146 free(aliasm[j]);
147 free(aliasm);
148 aliasm = NULL;
149 }
150
151 #ifndef OPENOFFICEORG
152 #ifndef MOZILLA_CLIENT
153 if (utf8)
154 free_utf_tbl();
155 #endif
156 #endif
157
158 #ifdef MOZILLA_CLIENT
159 delete[] csconv;
160 #endif
161 }
162
163 // lookup a root word in the hashtable
164
lookup(const char * word) const165 struct hentry* HashMgr::lookup(const char* word) const {
166 struct hentry* dp;
167 if (tableptr) {
168 dp = tableptr[hash(word)];
169 if (!dp)
170 return NULL;
171 for (; dp != NULL; dp = dp->next) {
172 if (strcmp(word, dp->word) == 0)
173 return dp;
174 }
175 }
176 return NULL;
177 }
178
179 // add a word to the hash table (private)
add_word(const std::string & in_word,int wcl,unsigned short * aff,int al,const std::string * in_desc,bool onlyupcase)180 int HashMgr::add_word(const std::string& in_word,
181 int wcl,
182 unsigned short* aff,
183 int al,
184 const std::string* in_desc,
185 bool onlyupcase) {
186 const std::string* word = &in_word;
187 const std::string* desc = in_desc;
188
189 std::string *word_copy = NULL;
190 std::string *desc_copy = NULL;
191 if (!ignorechars.empty() || complexprefixes) {
192 word_copy = new std::string(in_word);
193
194 if (!ignorechars.empty()) {
195 if (utf8) {
196 wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16);
197 } else {
198 remove_ignored_chars(*word_copy, ignorechars);
199 }
200 }
201
202 if (complexprefixes) {
203 if (utf8)
204 wcl = reverseword_utf(*word_copy);
205 else
206 reverseword(*word_copy);
207
208 if (in_desc && !aliasm) {
209 desc_copy = new std::string(*in_desc);
210
211 if (complexprefixes) {
212 if (utf8)
213 reverseword_utf(*desc_copy);
214 else
215 reverseword(*desc_copy);
216 }
217 desc = desc_copy;
218 }
219 }
220
221 word = word_copy;
222 }
223
224 bool upcasehomonym = false;
225 int descl = desc ? (aliasm ? sizeof(char*) : desc->size() + 1) : 0;
226 // variable-length hash record with word and optional fields
227 struct hentry* hp =
228 (struct hentry*)malloc(sizeof(struct hentry) + word->size() + descl);
229 if (!hp) {
230 delete desc_copy;
231 delete word_copy;
232 return 1;
233 }
234
235 char* hpw = hp->word;
236 strcpy(hpw, word->c_str());
237
238 int i = hash(hpw);
239
240 hp->blen = (unsigned char)word->size();
241 hp->clen = (unsigned char)wcl;
242 hp->alen = (short)al;
243 hp->astr = aff;
244 hp->next = NULL;
245 hp->next_homonym = NULL;
246
247 // store the description string or its pointer
248 if (desc) {
249 hp->var = H_OPT;
250 if (aliasm) {
251 hp->var += H_OPT_ALIASM;
252 store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str())));
253 } else {
254 strcpy(hpw + word->size() + 1, desc->c_str());
255 }
256 if (strstr(HENTRY_DATA(hp), MORPH_PHON))
257 hp->var += H_OPT_PHON;
258 } else
259 hp->var = 0;
260
261 struct hentry* dp = tableptr[i];
262 if (!dp) {
263 tableptr[i] = hp;
264 delete desc_copy;
265 delete word_copy;
266 return 0;
267 }
268 while (dp->next != NULL) {
269 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
270 // remove hidden onlyupcase homonym
271 if (!onlyupcase) {
272 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
273 free(dp->astr);
274 dp->astr = hp->astr;
275 dp->alen = hp->alen;
276 free(hp);
277 delete desc_copy;
278 delete word_copy;
279 return 0;
280 } else {
281 dp->next_homonym = hp;
282 }
283 } else {
284 upcasehomonym = true;
285 }
286 }
287 dp = dp->next;
288 }
289 if (strcmp(hp->word, dp->word) == 0) {
290 // remove hidden onlyupcase homonym
291 if (!onlyupcase) {
292 if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
293 free(dp->astr);
294 dp->astr = hp->astr;
295 dp->alen = hp->alen;
296 free(hp);
297 delete desc_copy;
298 delete word_copy;
299 return 0;
300 } else {
301 dp->next_homonym = hp;
302 }
303 } else {
304 upcasehomonym = true;
305 }
306 }
307 if (!upcasehomonym) {
308 dp->next = hp;
309 } else {
310 // remove hidden onlyupcase homonym
311 if (hp->astr)
312 free(hp->astr);
313 free(hp);
314 }
315
316 delete desc_copy;
317 delete word_copy;
318 return 0;
319 }
320
add_hidden_capitalized_word(const std::string & word,int wcl,unsigned short * flags,int flagslen,const std::string * dp,int captype)321 int HashMgr::add_hidden_capitalized_word(const std::string& word,
322 int wcl,
323 unsigned short* flags,
324 int flagslen,
325 const std::string* dp,
326 int captype) {
327 if (flags == NULL)
328 flagslen = 0;
329
330 // add inner capitalized forms to handle the following allcap forms:
331 // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
332 // Allcaps with suffixes: CIA's -> CIA'S
333 if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
334 ((captype == ALLCAP) && (flagslen != 0))) &&
335 !((flagslen != 0) && TESTAFF(flags, forbiddenword, flagslen))) {
336 unsigned short* flags2 =
337 (unsigned short*)malloc(sizeof(unsigned short) * (flagslen + 1));
338 if (!flags2)
339 return 1;
340 if (flagslen)
341 memcpy(flags2, flags, flagslen * sizeof(unsigned short));
342 flags2[flagslen] = ONLYUPCASEFLAG;
343 if (utf8) {
344 std::string st;
345 std::vector<w_char> w;
346 u8_u16(w, word);
347 mkallsmall_utf(w, langnum);
348 mkinitcap_utf(w, langnum);
349 u16_u8(st, w);
350 return add_word(st, wcl, flags2, flagslen + 1, dp, true);
351 } else {
352 std::string new_word(word);
353 mkallsmall(new_word, csconv);
354 mkinitcap(new_word, csconv);
355 int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true);
356 return ret;
357 }
358 }
359 return 0;
360 }
361
362 // detect captype and modify word length for UTF-8 encoding
get_clen_and_captype(const std::string & word,int * captype,std::vector<w_char> & workbuf)363 int HashMgr::get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf) {
364 int len;
365 if (utf8) {
366 len = u8_u16(workbuf, word);
367 *captype = get_captype_utf8(workbuf, langnum);
368 } else {
369 len = word.size();
370 *captype = get_captype(word, csconv);
371 }
372 return len;
373 }
374
get_clen_and_captype(const std::string & word,int * captype)375 int HashMgr::get_clen_and_captype(const std::string& word, int* captype) {
376 std::vector<w_char> workbuf;
377 return get_clen_and_captype(word, captype, workbuf);
378 }
379
380 // remove word (personal dictionary function for standalone applications)
remove(const std::string & word)381 int HashMgr::remove(const std::string& word) {
382 struct hentry* dp = lookup(word.c_str());
383 while (dp) {
384 if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
385 unsigned short* flags =
386 (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen + 1));
387 if (!flags)
388 return 1;
389 for (int i = 0; i < dp->alen; i++)
390 flags[i] = dp->astr[i];
391 flags[dp->alen] = forbiddenword;
392 free(dp->astr);
393 dp->astr = flags;
394 dp->alen++;
395 std::sort(flags, flags + dp->alen);
396 }
397 dp = dp->next_homonym;
398 }
399 return 0;
400 }
401
402 /* remove forbidden flag to add a personal word to the hash */
remove_forbidden_flag(const std::string & word)403 int HashMgr::remove_forbidden_flag(const std::string& word) {
404 struct hentry* dp = lookup(word.c_str());
405 if (!dp)
406 return 1;
407 while (dp) {
408 if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
409 if (dp->alen == 1)
410 dp->alen = 0; // XXX forbidden words of personal dic.
411 else {
412 unsigned short* flags2 =
413 (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen - 1));
414 if (!flags2)
415 return 1;
416 int i, j = 0;
417 for (i = 0; i < dp->alen; i++) {
418 if (dp->astr[i] != forbiddenword)
419 flags2[j++] = dp->astr[i];
420 }
421 dp->alen--;
422 free(dp->astr);
423 dp->astr = flags2; // XXX allowed forbidden words
424 }
425 }
426 dp = dp->next_homonym;
427 }
428 return 0;
429 }
430
431 // add a custom dic. word to the hash table (public)
add(const std::string & word)432 int HashMgr::add(const std::string& word) {
433 if (remove_forbidden_flag(word)) {
434 int captype;
435 int al = 0;
436 unsigned short* flags = NULL;
437 int wcl = get_clen_and_captype(word, &captype);
438 add_word(word, wcl, flags, al, NULL, false);
439 return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
440 captype);
441 }
442 return 0;
443 }
444
add_with_affix(const std::string & word,const std::string & example)445 int HashMgr::add_with_affix(const std::string& word, const std::string& example) {
446 // detect captype and modify word length for UTF-8 encoding
447 struct hentry* dp = lookup(example.c_str());
448 remove_forbidden_flag(word);
449 if (dp && dp->astr) {
450 int captype;
451 int wcl = get_clen_and_captype(word, &captype);
452 if (aliasf) {
453 add_word(word, wcl, dp->astr, dp->alen, NULL, false);
454 } else {
455 unsigned short* flags =
456 (unsigned short*)malloc(dp->alen * sizeof(unsigned short));
457 if (flags) {
458 memcpy((void*)flags, (void*)dp->astr,
459 dp->alen * sizeof(unsigned short));
460 add_word(word, wcl, flags, dp->alen, NULL, false);
461 } else
462 return 1;
463 }
464 return add_hidden_capitalized_word(word, wcl, dp->astr,
465 dp->alen, NULL, captype);
466 }
467 return 1;
468 }
469
470 // walk the hash table entry by entry - null at end
471 // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
walk_hashtable(int & col,struct hentry * hp) const472 struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const {
473 if (hp && hp->next != NULL)
474 return hp->next;
475 for (col++; col < tablesize; col++) {
476 if (tableptr[col])
477 return tableptr[col];
478 }
479 // null at end and reset to start
480 col = -1;
481 return NULL;
482 }
483
484 // load a munched word list and build a hash table on the fly
load_tables(const char * tpath,const char * key)485 int HashMgr::load_tables(const char* tpath, const char* key) {
486 // open dictionary file
487 FileMgr* dict = new FileMgr(tpath, key);
488 if (dict == NULL)
489 return 1;
490
491 // first read the first line of file to get hash table size */
492 std::string ts;
493 if (!dict->getline(ts)) {
494 HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath);
495 delete dict;
496 return 2;
497 }
498 mychomp(ts);
499
500 /* remove byte order mark */
501 if (ts.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
502 ts.erase(0, 3);
503 }
504
505 tablesize = atoi(ts.c_str());
506
507 int nExtra = 5 + USERWORD;
508
509 if (tablesize <= 0 ||
510 (tablesize >= (std::numeric_limits<int>::max() - 1 - nExtra) /
511 int(sizeof(struct hentry*)))) {
512 HUNSPELL_WARNING(
513 stderr, "error: line 1: missing or bad word count in the dic file\n");
514 delete dict;
515 return 4;
516 }
517 tablesize += nExtra;
518 if ((tablesize % 2) == 0)
519 tablesize++;
520
521 // allocate the hash table
522 tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*));
523 if (!tableptr) {
524 delete dict;
525 return 3;
526 }
527
528 // loop through all words on much list and add to hash
529 // table and create word and affix strings
530
531 std::vector<w_char> workbuf;
532
533 while (dict->getline(ts)) {
534 mychomp(ts);
535 // split each line into word and morphological description
536 size_t dp_pos = 0;
537 while ((dp_pos = ts.find(':', dp_pos)) != std::string::npos) {
538 if ((dp_pos > 3) && (ts[dp_pos - 3] == ' ' || ts[dp_pos - 3] == '\t')) {
539 for (dp_pos -= 3; dp_pos > 0 && (ts[dp_pos-1] == ' ' || ts[dp_pos-1] == '\t'); --dp_pos)
540 ;
541 if (dp_pos == 0) { // missing word
542 dp_pos = std::string::npos;
543 } else {
544 ++dp_pos;
545 }
546 break;
547 }
548 ++dp_pos;
549 }
550
551 // tabulator is the old morphological field separator
552 size_t dp2_pos = ts.find('\t');
553 if (dp2_pos != std::string::npos && (dp_pos == std::string::npos || dp2_pos < dp_pos)) {
554 dp_pos = dp2_pos + 1;
555 }
556
557 std::string dp;
558 if (dp_pos != std::string::npos) {
559 dp.assign(ts.substr(dp_pos));
560 ts.resize(dp_pos - 1);
561 }
562
563 // split each line into word and affix char strings
564 // "\/" signs slash in words (not affix separator)
565 // "/" at beginning of the line is word character (not affix separator)
566 size_t ap_pos = ts.find('/');
567 while (ap_pos != std::string::npos) {
568 if (ap_pos == 0) {
569 ++ap_pos;
570 continue;
571 } else if (ts[ap_pos - 1] != '\\')
572 break;
573 // replace "\/" with "/"
574 ts.erase(ap_pos - 1, 1);
575 ap_pos = ts.find('/', ap_pos);
576 }
577
578 unsigned short* flags;
579 int al;
580 if (ap_pos != std::string::npos && ap_pos != ts.size()) {
581 std::string ap(ts.substr(ap_pos + 1));
582 ts.resize(ap_pos);
583 if (aliasf) {
584 int index = atoi(ap.c_str());
585 al = get_aliasf(index, &flags, dict);
586 if (!al) {
587 HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n",
588 dict->getlinenum());
589 }
590 } else {
591 al = decode_flags(&flags, ap.c_str(), dict);
592 if (al == -1) {
593 HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
594 delete dict;
595 return 6;
596 }
597 std::sort(flags, flags + al);
598 }
599 } else {
600 al = 0;
601 flags = NULL;
602 }
603
604 int captype;
605 int wcl = get_clen_and_captype(ts, &captype, workbuf);
606 const std::string *dp_str = dp.empty() ? NULL : &dp;
607 // add the word and its index plus its capitalized form optionally
608 if (add_word(ts, wcl, flags, al, dp_str, false) ||
609 add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) {
610 delete dict;
611 return 5;
612 }
613 }
614
615 delete dict;
616 return 0;
617 }
618
619 // the hash function is a simple load and rotate
620 // algorithm borrowed
hash(const char * word) const621 int HashMgr::hash(const char* word) const {
622 unsigned long hv = 0;
623 for (int i = 0; i < 4 && *word != 0; i++)
624 hv = (hv << 8) | (*word++);
625 while (*word != 0) {
626 ROTATE(hv, ROTATE_LEN);
627 hv ^= (*word++);
628 }
629 return (unsigned long)hv % tablesize;
630 }
631
decode_flags(unsigned short ** result,const std::string & flags,FileMgr * af) const632 int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const {
633 int len;
634 if (flags.empty()) {
635 *result = NULL;
636 return 0;
637 }
638 switch (flag_mode) {
639 case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
640 len = flags.size();
641 if (len % 2 == 1)
642 HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
643 af->getlinenum());
644 len /= 2;
645 *result = (unsigned short*)malloc(len * sizeof(unsigned short));
646 if (!*result)
647 return -1;
648 for (int i = 0; i < len; i++) {
649 (*result)[i] = ((unsigned short)((unsigned char)flags[i * 2]) << 8) +
650 (unsigned char)flags[i * 2 + 1];
651 }
652 break;
653 }
654 case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521
655 // 23 233)
656 len = 1;
657 unsigned short* dest;
658 for (size_t i = 0; i < flags.size(); ++i) {
659 if (flags[i] == ',')
660 len++;
661 }
662 *result = (unsigned short*)malloc(len * sizeof(unsigned short));
663 if (!*result)
664 return -1;
665 dest = *result;
666 const char* src = flags.c_str();
667 for (const char* p = src; *p; p++) {
668 if (*p == ',') {
669 int i = atoi(src);
670 if (i >= DEFAULTFLAGS)
671 HUNSPELL_WARNING(
672 stderr, "error: line %d: flag id %d is too large (max: %d)\n",
673 af->getlinenum(), i, DEFAULTFLAGS - 1);
674 *dest = (unsigned short)i;
675 if (*dest == 0)
676 HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
677 af->getlinenum());
678 src = p + 1;
679 dest++;
680 }
681 }
682 int i = atoi(src);
683 if (i >= DEFAULTFLAGS)
684 HUNSPELL_WARNING(stderr,
685 "error: line %d: flag id %d is too large (max: %d)\n",
686 af->getlinenum(), i, DEFAULTFLAGS - 1);
687 *dest = (unsigned short)i;
688 if (*dest == 0)
689 HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
690 af->getlinenum());
691 break;
692 }
693 case FLAG_UNI: { // UTF-8 characters
694 std::vector<w_char> w;
695 u8_u16(w, flags);
696 len = w.size();
697 *result = (unsigned short*)malloc(len * sizeof(unsigned short));
698 if (!*result)
699 return -1;
700 memcpy(*result, &w[0], len * sizeof(short));
701 break;
702 }
703 default: { // Ispell's one-character flags (erfg -> e r f g)
704 unsigned short* dest;
705 len = flags.size();
706 *result = (unsigned short*)malloc(len * sizeof(unsigned short));
707 if (!*result)
708 return -1;
709 dest = *result;
710 for (size_t i = 0; i < flags.size(); ++i) {
711 *dest = (unsigned char)flags[i];
712 dest++;
713 }
714 }
715 }
716 return len;
717 }
718
decode_flags(std::vector<unsigned short> & result,const std::string & flags,FileMgr * af) const719 bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const {
720 if (flags.empty()) {
721 return false;
722 }
723 switch (flag_mode) {
724 case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
725 size_t len = flags.size();
726 if (len % 2 == 1)
727 HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
728 af->getlinenum());
729 len /= 2;
730 result.reserve(result.size() + len);
731 for (size_t i = 0; i < len; ++i) {
732 result.push_back(((unsigned short)((unsigned char)flags[i * 2]) << 8) +
733 (unsigned char)flags[i * 2 + 1]);
734 }
735 break;
736 }
737 case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521
738 // 23 233)
739 const char* src = flags.c_str();
740 for (const char* p = src; *p; p++) {
741 if (*p == ',') {
742 int i = atoi(src);
743 if (i >= DEFAULTFLAGS)
744 HUNSPELL_WARNING(
745 stderr, "error: line %d: flag id %d is too large (max: %d)\n",
746 af->getlinenum(), i, DEFAULTFLAGS - 1);
747 result.push_back((unsigned short)i);
748 if (result.back() == 0)
749 HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
750 af->getlinenum());
751 src = p + 1;
752 }
753 }
754 int i = atoi(src);
755 if (i >= DEFAULTFLAGS)
756 HUNSPELL_WARNING(stderr,
757 "error: line %d: flag id %d is too large (max: %d)\n",
758 af->getlinenum(), i, DEFAULTFLAGS - 1);
759 result.push_back((unsigned short)i);
760 if (result.back() == 0)
761 HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
762 af->getlinenum());
763 break;
764 }
765 case FLAG_UNI: { // UTF-8 characters
766 std::vector<w_char> w;
767 u8_u16(w, flags);
768 size_t len = w.size();
769 size_t origsize = result.size();
770 result.resize(origsize + len);
771 memcpy(&result[origsize], &w[0], len * sizeof(short));
772 break;
773 }
774 default: { // Ispell's one-character flags (erfg -> e r f g)
775 result.reserve(flags.size());
776 for (size_t i = 0; i < flags.size(); ++i) {
777 result.push_back((unsigned char)flags[i]);
778 }
779 }
780 }
781 return true;
782 }
783
decode_flag(const char * f) const784 unsigned short HashMgr::decode_flag(const char* f) const {
785 unsigned short s = 0;
786 int i;
787 switch (flag_mode) {
788 case FLAG_LONG:
789 s = ((unsigned short)((unsigned char)f[0]) << 8) + (unsigned char)f[1];
790 break;
791 case FLAG_NUM:
792 i = atoi(f);
793 if (i >= DEFAULTFLAGS)
794 HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n",
795 i, DEFAULTFLAGS - 1);
796 s = (unsigned short)i;
797 break;
798 case FLAG_UNI: {
799 std::vector<w_char> w;
800 u8_u16(w, f);
801 if (!w.empty())
802 memcpy(&s, &w[0], 1 * sizeof(short));
803 break;
804 }
805 default:
806 s = *(unsigned char*)f;
807 }
808 if (s == 0)
809 HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
810 return s;
811 }
812
encode_flag(unsigned short f) const813 char* HashMgr::encode_flag(unsigned short f) const {
814 if (f == 0)
815 return mystrdup("(NULL)");
816 std::string ch;
817 if (flag_mode == FLAG_LONG) {
818 ch.push_back((unsigned char)(f >> 8));
819 ch.push_back((unsigned char)(f - ((f >> 8) << 8)));
820 } else if (flag_mode == FLAG_NUM) {
821 std::ostringstream stream;
822 stream << f;
823 ch = stream.str();
824 } else if (flag_mode == FLAG_UNI) {
825 const w_char* w_c = (const w_char*)&f;
826 std::vector<w_char> w(w_c, w_c + 1);
827 u16_u8(ch, w);
828 } else {
829 ch.push_back((unsigned char)(f));
830 }
831 return mystrdup(ch.c_str());
832 }
833
834 // read in aff file and set flag mode
load_config(const char * affpath,const char * key)835 int HashMgr::load_config(const char* affpath, const char* key) {
836 int firstline = 1;
837
838 // open the affix file
839 FileMgr* afflst = new FileMgr(affpath, key);
840 if (!afflst) {
841 HUNSPELL_WARNING(
842 stderr, "Error - could not open affix description file %s\n", affpath);
843 return 1;
844 }
845
846 // read in each line ignoring any that do not
847 // start with a known line type indicator
848
849 std::string line;
850 while (afflst->getline(line)) {
851 mychomp(line);
852
853 /* remove byte order mark */
854 if (firstline) {
855 firstline = 0;
856 if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
857 line.erase(0, 3);
858 }
859 }
860
861 /* parse in the try string */
862 if ((line.compare(0, 4, "FLAG", 4) == 0) && line.size() > 4 && isspace(line[4])) {
863 if (flag_mode != FLAG_CHAR) {
864 HUNSPELL_WARNING(stderr,
865 "error: line %d: multiple definitions of the FLAG "
866 "affix file parameter\n",
867 afflst->getlinenum());
868 }
869 if (line.find("long") != std::string::npos)
870 flag_mode = FLAG_LONG;
871 if (line.find("num") != std::string::npos)
872 flag_mode = FLAG_NUM;
873 if (line.find("UTF-8") != std::string::npos)
874 flag_mode = FLAG_UNI;
875 if (flag_mode == FLAG_CHAR) {
876 HUNSPELL_WARNING(
877 stderr,
878 "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n",
879 afflst->getlinenum());
880 }
881 }
882
883 if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
884 std::string st;
885 if (!parse_string(line, st, afflst->getlinenum())) {
886 delete afflst;
887 return 1;
888 }
889 forbiddenword = decode_flag(st.c_str());
890 }
891
892 if (line.compare(0, 3, "SET", 3) == 0) {
893 if (!parse_string(line, enc, afflst->getlinenum())) {
894 delete afflst;
895 return 1;
896 }
897 if (enc == "UTF-8") {
898 utf8 = 1;
899 #ifndef OPENOFFICEORG
900 #ifndef MOZILLA_CLIENT
901 initialize_utf_tbl();
902 #endif
903 #endif
904 } else
905 csconv = get_current_cs(enc);
906 }
907
908 if (line.compare(0, 4, "LANG", 4) == 0) {
909 if (!parse_string(line, lang, afflst->getlinenum())) {
910 delete afflst;
911 return 1;
912 }
913 langnum = get_lang_num(lang);
914 }
915
916 /* parse in the ignored characters (for example, Arabic optional diacritics
917 * characters */
918 if (line.compare(0, 6, "IGNORE", 6) == 0) {
919 if (!parse_array(line, ignorechars, ignorechars_utf16,
920 utf8, afflst->getlinenum())) {
921 delete afflst;
922 return 1;
923 }
924 }
925
926 if ((line.compare(0, 2, "AF", 2) == 0) && line.size() > 2 && isspace(line[2])) {
927 if (!parse_aliasf(line, afflst)) {
928 delete afflst;
929 return 1;
930 }
931 }
932
933 if ((line.compare(0, 2, "AM", 2) == 0) && line.size() > 2 && isspace(line[2])) {
934 if (!parse_aliasm(line, afflst)) {
935 delete afflst;
936 return 1;
937 }
938 }
939
940 if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
941 complexprefixes = 1;
942
943 if (((line.compare(0, 3, "SFX", 3) == 0) ||
944 (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3]))
945 break;
946 }
947
948 if (csconv == NULL)
949 csconv = get_current_cs(SPELL_ENCODING);
950 delete afflst;
951 return 0;
952 }
953
954 /* parse in the ALIAS table */
parse_aliasf(const std::string & line,FileMgr * af)955 bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) {
956 if (numaliasf != 0) {
957 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
958 af->getlinenum());
959 return false;
960 }
961 int i = 0;
962 int np = 0;
963 std::string::const_iterator iter = line.begin();
964 std::string::const_iterator start_piece = mystrsep(line, iter);
965 while (start_piece != line.end()) {
966 switch (i) {
967 case 0: {
968 np++;
969 break;
970 }
971 case 1: {
972 numaliasf = atoi(std::string(start_piece, iter).c_str());
973 if (numaliasf < 1) {
974 numaliasf = 0;
975 aliasf = NULL;
976 aliasflen = NULL;
977 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
978 af->getlinenum());
979 return false;
980 }
981 aliasf =
982 (unsigned short**)malloc(numaliasf * sizeof(unsigned short*));
983 aliasflen =
984 (unsigned short*)malloc(numaliasf * sizeof(unsigned short));
985 if (!aliasf || !aliasflen) {
986 numaliasf = 0;
987 if (aliasf)
988 free(aliasf);
989 if (aliasflen)
990 free(aliasflen);
991 aliasf = NULL;
992 aliasflen = NULL;
993 return false;
994 }
995 np++;
996 break;
997 }
998 default:
999 break;
1000 }
1001 ++i;
1002 start_piece = mystrsep(line, iter);
1003 }
1004 if (np != 2) {
1005 numaliasf = 0;
1006 free(aliasf);
1007 free(aliasflen);
1008 aliasf = NULL;
1009 aliasflen = NULL;
1010 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1011 af->getlinenum());
1012 return false;
1013 }
1014
1015 /* now parse the numaliasf lines to read in the remainder of the table */
1016 for (int j = 0; j < numaliasf; j++) {
1017 std::string nl;
1018 if (!af->getline(nl))
1019 return false;
1020 mychomp(nl);
1021 i = 0;
1022 aliasf[j] = NULL;
1023 aliasflen[j] = 0;
1024 iter = nl.begin();
1025 start_piece = mystrsep(nl, iter);
1026 while (start_piece != nl.end()) {
1027 switch (i) {
1028 case 0: {
1029 if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) {
1030 numaliasf = 0;
1031 free(aliasf);
1032 free(aliasflen);
1033 aliasf = NULL;
1034 aliasflen = NULL;
1035 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1036 af->getlinenum());
1037 return false;
1038 }
1039 break;
1040 }
1041 case 1: {
1042 std::string piece(start_piece, iter);
1043 aliasflen[j] =
1044 (unsigned short)decode_flags(&(aliasf[j]), piece, af);
1045 std::sort(aliasf[j], aliasf[j] + aliasflen[j]);
1046 break;
1047 }
1048 default:
1049 break;
1050 }
1051 ++i;
1052 start_piece = mystrsep(nl, iter);
1053 }
1054 if (!aliasf[j]) {
1055 free(aliasf);
1056 free(aliasflen);
1057 aliasf = NULL;
1058 aliasflen = NULL;
1059 numaliasf = 0;
1060 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1061 af->getlinenum());
1062 return false;
1063 }
1064 }
1065 return true;
1066 }
1067
is_aliasf() const1068 int HashMgr::is_aliasf() const {
1069 return (aliasf != NULL);
1070 }
1071
get_aliasf(int index,unsigned short ** fvec,FileMgr * af) const1072 int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) const {
1073 if ((index > 0) && (index <= numaliasf)) {
1074 *fvec = aliasf[index - 1];
1075 return aliasflen[index - 1];
1076 }
1077 HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n",
1078 af->getlinenum(), index);
1079 *fvec = NULL;
1080 return 0;
1081 }
1082
1083 /* parse morph alias definitions */
parse_aliasm(const std::string & line,FileMgr * af)1084 bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) {
1085 if (numaliasm != 0) {
1086 HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
1087 af->getlinenum());
1088 return false;
1089 }
1090 int i = 0;
1091 int np = 0;
1092 std::string::const_iterator iter = line.begin();
1093 std::string::const_iterator start_piece = mystrsep(line, iter);
1094 while (start_piece != line.end()) {
1095 switch (i) {
1096 case 0: {
1097 np++;
1098 break;
1099 }
1100 case 1: {
1101 numaliasm = atoi(std::string(start_piece, iter).c_str());
1102 if (numaliasm < 1) {
1103 HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
1104 af->getlinenum());
1105 return false;
1106 }
1107 aliasm = (char**)malloc(numaliasm * sizeof(char*));
1108 if (!aliasm) {
1109 numaliasm = 0;
1110 return false;
1111 }
1112 np++;
1113 break;
1114 }
1115 default:
1116 break;
1117 }
1118 ++i;
1119 start_piece = mystrsep(line, iter);
1120 }
1121 if (np != 2) {
1122 numaliasm = 0;
1123 free(aliasm);
1124 aliasm = NULL;
1125 HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1126 af->getlinenum());
1127 return false;
1128 }
1129
1130 /* now parse the numaliasm lines to read in the remainder of the table */
1131 for (int j = 0; j < numaliasm; j++) {
1132 std::string nl;
1133 if (!af->getline(nl))
1134 return false;
1135 mychomp(nl);
1136 aliasm[j] = NULL;
1137 iter = nl.begin();
1138 i = 0;
1139 start_piece = mystrsep(nl, iter);
1140 while (start_piece != nl.end()) {
1141 switch (i) {
1142 case 0: {
1143 if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) {
1144 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1145 af->getlinenum());
1146 numaliasm = 0;
1147 free(aliasm);
1148 aliasm = NULL;
1149 return false;
1150 }
1151 break;
1152 }
1153 case 1: {
1154 // add the remaining of the line
1155 std::string::const_iterator end = nl.end();
1156 std::string chunk(start_piece, end);
1157 if (complexprefixes) {
1158 if (utf8)
1159 reverseword_utf(chunk);
1160 else
1161 reverseword(chunk);
1162 }
1163 aliasm[j] = mystrdup(chunk.c_str());
1164 break;
1165 }
1166 default:
1167 break;
1168 }
1169 ++i;
1170 start_piece = mystrsep(nl, iter);
1171 }
1172 if (!aliasm[j]) {
1173 numaliasm = 0;
1174 free(aliasm);
1175 aliasm = NULL;
1176 HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1177 af->getlinenum());
1178 return false;
1179 }
1180 }
1181 return true;
1182 }
1183
is_aliasm() const1184 int HashMgr::is_aliasm() const {
1185 return (aliasm != NULL);
1186 }
1187
get_aliasm(int index) const1188 char* HashMgr::get_aliasm(int index) const {
1189 if ((index > 0) && (index <= numaliasm))
1190 return aliasm[index - 1];
1191 HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
1192 return NULL;
1193 }
1194