1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 /*
38  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39  * And Contributors.  All rights reserved.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  *
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  *
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  *
52  * 3. All modifications to the source code must be clearly marked as
53  *    such.  Binary redistributions based on modified source code
54  *    must be clearly marked as modified versions in the documentation
55  *    and/or other materials provided with the distribution.
56  *
57  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
61  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  */
70 
71 #include <stdlib.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <ctype.h>
75 #include <limits>
76 #include <sstream>
77 
78 #include "hashmgr.hxx"
79 #include "csutil.hxx"
80 #include "atypes.hxx"
81 
82 // build a hash table from a munched word list
83 
HashMgr(const char * tpath,const char * apath,const char * key)84 HashMgr::HashMgr(const char* tpath, const char* apath, const char* key)
85     : tablesize(0),
86       tableptr(NULL),
87       flag_mode(FLAG_CHAR),
88       complexprefixes(0),
89       utf8(0),
90       forbiddenword(FORBIDDENWORD)  // forbidden word signing flag
91       ,
92       numaliasf(0),
93       aliasf(NULL),
94       aliasflen(0),
95       numaliasm(0),
96       aliasm(NULL) {
97   langnum = 0;
98   csconv = 0;
99   load_config(apath, key);
100   int ec = load_tables(tpath, key);
101   if (ec) {
102     /* error condition - what should we do here */
103     HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", ec);
104     free(tableptr);
105     //keep tablesize to 1 to fix possible division with zero
106     tablesize = 1;
107     tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*));
108     if (!tableptr) {
109       tablesize = 0;
110     }
111   }
112 }
113 
~HashMgr()114 HashMgr::~HashMgr() {
115   if (tableptr) {
116     // now pass through hash table freeing up everything
117     // go through column by column of the table
118     for (int i = 0; i < tablesize; i++) {
119       struct hentry* pt = tableptr[i];
120       struct hentry* nt = NULL;
121       while (pt) {
122         nt = pt->next;
123         if (pt->astr &&
124             (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen)))
125           free(pt->astr);
126         free(pt);
127         pt = nt;
128       }
129     }
130     free(tableptr);
131   }
132   tablesize = 0;
133 
134   if (aliasf) {
135     for (int j = 0; j < (numaliasf); j++)
136       free(aliasf[j]);
137     free(aliasf);
138     aliasf = NULL;
139     if (aliasflen) {
140       free(aliasflen);
141       aliasflen = NULL;
142     }
143   }
144   if (aliasm) {
145     for (int j = 0; j < (numaliasm); j++)
146       free(aliasm[j]);
147     free(aliasm);
148     aliasm = NULL;
149   }
150 
151 #ifndef OPENOFFICEORG
152 #ifndef MOZILLA_CLIENT
153   if (utf8)
154     free_utf_tbl();
155 #endif
156 #endif
157 
158 #ifdef MOZILLA_CLIENT
159   delete[] csconv;
160 #endif
161 }
162 
163 // lookup a root word in the hashtable
164 
lookup(const char * word) const165 struct hentry* HashMgr::lookup(const char* word) const {
166   struct hentry* dp;
167   if (tableptr) {
168     dp = tableptr[hash(word)];
169     if (!dp)
170       return NULL;
171     for (; dp != NULL; dp = dp->next) {
172       if (strcmp(word, dp->word) == 0)
173         return dp;
174     }
175   }
176   return NULL;
177 }
178 
179 // add a word to the hash table (private)
add_word(const std::string & in_word,int wcl,unsigned short * aff,int al,const std::string * in_desc,bool onlyupcase)180 int HashMgr::add_word(const std::string& in_word,
181                       int wcl,
182                       unsigned short* aff,
183                       int al,
184                       const std::string* in_desc,
185                       bool onlyupcase) {
186   const std::string* word = &in_word;
187   const std::string* desc = in_desc;
188 
189   std::string *word_copy = NULL;
190   std::string *desc_copy = NULL;
191   if (!ignorechars.empty() || complexprefixes) {
192     word_copy = new std::string(in_word);
193 
194     if (!ignorechars.empty()) {
195       if (utf8) {
196         wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16);
197       } else {
198         remove_ignored_chars(*word_copy, ignorechars);
199       }
200     }
201 
202     if (complexprefixes) {
203       if (utf8)
204         wcl = reverseword_utf(*word_copy);
205       else
206         reverseword(*word_copy);
207 
208       if (in_desc && !aliasm) {
209         desc_copy = new std::string(*in_desc);
210 
211         if (complexprefixes) {
212           if (utf8)
213             reverseword_utf(*desc_copy);
214           else
215             reverseword(*desc_copy);
216         }
217         desc = desc_copy;
218       }
219     }
220 
221     word = word_copy;
222   }
223 
224   bool upcasehomonym = false;
225   int descl = desc ? (aliasm ? sizeof(char*) : desc->size() + 1) : 0;
226   // variable-length hash record with word and optional fields
227   struct hentry* hp =
228       (struct hentry*)malloc(sizeof(struct hentry) + word->size() + descl);
229   if (!hp) {
230     delete desc_copy;
231     delete word_copy;
232     return 1;
233   }
234 
235   char* hpw = hp->word;
236   strcpy(hpw, word->c_str());
237 
238   int i = hash(hpw);
239 
240   hp->blen = (unsigned char)word->size();
241   hp->clen = (unsigned char)wcl;
242   hp->alen = (short)al;
243   hp->astr = aff;
244   hp->next = NULL;
245   hp->next_homonym = NULL;
246 
247   // store the description string or its pointer
248   if (desc) {
249     hp->var = H_OPT;
250     if (aliasm) {
251       hp->var += H_OPT_ALIASM;
252       store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str())));
253     } else {
254       strcpy(hpw + word->size() + 1, desc->c_str());
255     }
256     if (strstr(HENTRY_DATA(hp), MORPH_PHON))
257       hp->var += H_OPT_PHON;
258   } else
259     hp->var = 0;
260 
261   struct hentry* dp = tableptr[i];
262   if (!dp) {
263     tableptr[i] = hp;
264     delete desc_copy;
265     delete word_copy;
266     return 0;
267   }
268   while (dp->next != NULL) {
269     if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
270       // remove hidden onlyupcase homonym
271       if (!onlyupcase) {
272         if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
273           free(dp->astr);
274           dp->astr = hp->astr;
275           dp->alen = hp->alen;
276           free(hp);
277           delete desc_copy;
278           delete word_copy;
279           return 0;
280         } else {
281           dp->next_homonym = hp;
282         }
283       } else {
284         upcasehomonym = true;
285       }
286     }
287     dp = dp->next;
288   }
289   if (strcmp(hp->word, dp->word) == 0) {
290     // remove hidden onlyupcase homonym
291     if (!onlyupcase) {
292       if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
293         free(dp->astr);
294         dp->astr = hp->astr;
295         dp->alen = hp->alen;
296         free(hp);
297         delete desc_copy;
298         delete word_copy;
299         return 0;
300       } else {
301         dp->next_homonym = hp;
302       }
303     } else {
304       upcasehomonym = true;
305     }
306   }
307   if (!upcasehomonym) {
308     dp->next = hp;
309   } else {
310     // remove hidden onlyupcase homonym
311     if (hp->astr)
312       free(hp->astr);
313     free(hp);
314   }
315 
316   delete desc_copy;
317   delete word_copy;
318   return 0;
319 }
320 
add_hidden_capitalized_word(const std::string & word,int wcl,unsigned short * flags,int flagslen,const std::string * dp,int captype)321 int HashMgr::add_hidden_capitalized_word(const std::string& word,
322                                          int wcl,
323                                          unsigned short* flags,
324                                          int flagslen,
325                                          const std::string* dp,
326                                          int captype) {
327   if (flags == NULL)
328     flagslen = 0;
329 
330   // add inner capitalized forms to handle the following allcap forms:
331   // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
332   // Allcaps with suffixes: CIA's -> CIA'S
333   if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
334        ((captype == ALLCAP) && (flagslen != 0))) &&
335       !((flagslen != 0) && TESTAFF(flags, forbiddenword, flagslen))) {
336     unsigned short* flags2 =
337         (unsigned short*)malloc(sizeof(unsigned short) * (flagslen + 1));
338     if (!flags2)
339       return 1;
340     if (flagslen)
341       memcpy(flags2, flags, flagslen * sizeof(unsigned short));
342     flags2[flagslen] = ONLYUPCASEFLAG;
343     if (utf8) {
344       std::string st;
345       std::vector<w_char> w;
346       u8_u16(w, word);
347       mkallsmall_utf(w, langnum);
348       mkinitcap_utf(w, langnum);
349       u16_u8(st, w);
350       return add_word(st, wcl, flags2, flagslen + 1, dp, true);
351     } else {
352       std::string new_word(word);
353       mkallsmall(new_word, csconv);
354       mkinitcap(new_word, csconv);
355       int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true);
356       return ret;
357     }
358   }
359   return 0;
360 }
361 
362 // detect captype and modify word length for UTF-8 encoding
get_clen_and_captype(const std::string & word,int * captype,std::vector<w_char> & workbuf)363 int HashMgr::get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf) {
364   int len;
365   if (utf8) {
366     len = u8_u16(workbuf, word);
367     *captype = get_captype_utf8(workbuf, langnum);
368   } else {
369     len = word.size();
370     *captype = get_captype(word, csconv);
371   }
372   return len;
373 }
374 
get_clen_and_captype(const std::string & word,int * captype)375 int HashMgr::get_clen_and_captype(const std::string& word, int* captype) {
376   std::vector<w_char> workbuf;
377   return get_clen_and_captype(word, captype, workbuf);
378 }
379 
380 // remove word (personal dictionary function for standalone applications)
remove(const std::string & word)381 int HashMgr::remove(const std::string& word) {
382   struct hentry* dp = lookup(word.c_str());
383   while (dp) {
384     if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
385       unsigned short* flags =
386           (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen + 1));
387       if (!flags)
388         return 1;
389       for (int i = 0; i < dp->alen; i++)
390         flags[i] = dp->astr[i];
391       flags[dp->alen] = forbiddenword;
392       free(dp->astr);
393       dp->astr = flags;
394       dp->alen++;
395       std::sort(flags, flags + dp->alen);
396     }
397     dp = dp->next_homonym;
398   }
399   return 0;
400 }
401 
402 /* remove forbidden flag to add a personal word to the hash */
remove_forbidden_flag(const std::string & word)403 int HashMgr::remove_forbidden_flag(const std::string& word) {
404   struct hentry* dp = lookup(word.c_str());
405   if (!dp)
406     return 1;
407   while (dp) {
408     if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
409       if (dp->alen == 1)
410         dp->alen = 0;  // XXX forbidden words of personal dic.
411       else {
412         unsigned short* flags2 =
413             (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen - 1));
414         if (!flags2)
415           return 1;
416         int i, j = 0;
417         for (i = 0; i < dp->alen; i++) {
418           if (dp->astr[i] != forbiddenword)
419             flags2[j++] = dp->astr[i];
420         }
421         dp->alen--;
422         free(dp->astr);
423         dp->astr = flags2;  // XXX allowed forbidden words
424       }
425     }
426     dp = dp->next_homonym;
427   }
428   return 0;
429 }
430 
431 // add a custom dic. word to the hash table (public)
add(const std::string & word)432 int HashMgr::add(const std::string& word) {
433   if (remove_forbidden_flag(word)) {
434     int captype;
435     int al = 0;
436     unsigned short* flags = NULL;
437     int wcl = get_clen_and_captype(word, &captype);
438     add_word(word, wcl, flags, al, NULL, false);
439     return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
440                                        captype);
441   }
442   return 0;
443 }
444 
add_with_affix(const std::string & word,const std::string & example)445 int HashMgr::add_with_affix(const std::string& word, const std::string& example) {
446   // detect captype and modify word length for UTF-8 encoding
447   struct hentry* dp = lookup(example.c_str());
448   remove_forbidden_flag(word);
449   if (dp && dp->astr) {
450     int captype;
451     int wcl = get_clen_and_captype(word, &captype);
452     if (aliasf) {
453       add_word(word, wcl, dp->astr, dp->alen, NULL, false);
454     } else {
455       unsigned short* flags =
456           (unsigned short*)malloc(dp->alen * sizeof(unsigned short));
457       if (flags) {
458         memcpy((void*)flags, (void*)dp->astr,
459                dp->alen * sizeof(unsigned short));
460         add_word(word, wcl, flags, dp->alen, NULL, false);
461       } else
462         return 1;
463     }
464     return add_hidden_capitalized_word(word, wcl, dp->astr,
465                                        dp->alen, NULL, captype);
466   }
467   return 1;
468 }
469 
470 // walk the hash table entry by entry - null at end
471 // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
walk_hashtable(int & col,struct hentry * hp) const472 struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const {
473   if (hp && hp->next != NULL)
474     return hp->next;
475   for (col++; col < tablesize; col++) {
476     if (tableptr[col])
477       return tableptr[col];
478   }
479   // null at end and reset to start
480   col = -1;
481   return NULL;
482 }
483 
484 // load a munched word list and build a hash table on the fly
load_tables(const char * tpath,const char * key)485 int HashMgr::load_tables(const char* tpath, const char* key) {
486   // open dictionary file
487   FileMgr* dict = new FileMgr(tpath, key);
488   if (dict == NULL)
489     return 1;
490 
491   // first read the first line of file to get hash table size */
492   std::string ts;
493   if (!dict->getline(ts)) {
494     HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath);
495     delete dict;
496     return 2;
497   }
498   mychomp(ts);
499 
500   /* remove byte order mark */
501   if (ts.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
502     ts.erase(0, 3);
503   }
504 
505   tablesize = atoi(ts.c_str());
506 
507   int nExtra = 5 + USERWORD;
508 
509   if (tablesize <= 0 ||
510       (tablesize >= (std::numeric_limits<int>::max() - 1 - nExtra) /
511                         int(sizeof(struct hentry*)))) {
512     HUNSPELL_WARNING(
513         stderr, "error: line 1: missing or bad word count in the dic file\n");
514     delete dict;
515     return 4;
516   }
517   tablesize += nExtra;
518   if ((tablesize % 2) == 0)
519     tablesize++;
520 
521   // allocate the hash table
522   tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*));
523   if (!tableptr) {
524     delete dict;
525     return 3;
526   }
527 
528   // loop through all words on much list and add to hash
529   // table and create word and affix strings
530 
531   std::vector<w_char> workbuf;
532 
533   while (dict->getline(ts)) {
534     mychomp(ts);
535     // split each line into word and morphological description
536     size_t dp_pos = 0;
537     while ((dp_pos = ts.find(':', dp_pos)) != std::string::npos) {
538       if ((dp_pos > 3) && (ts[dp_pos - 3] == ' ' || ts[dp_pos - 3] == '\t')) {
539         for (dp_pos -= 3; dp_pos > 0 && (ts[dp_pos-1] == ' ' || ts[dp_pos-1] == '\t'); --dp_pos)
540           ;
541         if (dp_pos == 0) {  // missing word
542           dp_pos = std::string::npos;
543         } else {
544           ++dp_pos;
545         }
546         break;
547       }
548       ++dp_pos;
549     }
550 
551     // tabulator is the old morphological field separator
552     size_t dp2_pos = ts.find('\t');
553     if (dp2_pos != std::string::npos && (dp_pos == std::string::npos || dp2_pos < dp_pos)) {
554       dp_pos = dp2_pos + 1;
555     }
556 
557     std::string dp;
558     if (dp_pos != std::string::npos) {
559       dp.assign(ts.substr(dp_pos));
560       ts.resize(dp_pos - 1);
561     }
562 
563     // split each line into word and affix char strings
564     // "\/" signs slash in words (not affix separator)
565     // "/" at beginning of the line is word character (not affix separator)
566     size_t ap_pos = ts.find('/');
567     while (ap_pos != std::string::npos) {
568       if (ap_pos == 0) {
569         ++ap_pos;
570         continue;
571       } else if (ts[ap_pos - 1] != '\\')
572         break;
573       // replace "\/" with "/"
574       ts.erase(ap_pos - 1, 1);
575       ap_pos = ts.find('/', ap_pos);
576     }
577 
578     unsigned short* flags;
579     int al;
580     if (ap_pos != std::string::npos && ap_pos != ts.size()) {
581       std::string ap(ts.substr(ap_pos + 1));
582       ts.resize(ap_pos);
583       if (aliasf) {
584         int index = atoi(ap.c_str());
585         al = get_aliasf(index, &flags, dict);
586         if (!al) {
587           HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n",
588                            dict->getlinenum());
589         }
590       } else {
591         al = decode_flags(&flags, ap.c_str(), dict);
592         if (al == -1) {
593           HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
594           delete dict;
595           return 6;
596         }
597         std::sort(flags, flags + al);
598       }
599     } else {
600       al = 0;
601       flags = NULL;
602     }
603 
604     int captype;
605     int wcl = get_clen_and_captype(ts, &captype, workbuf);
606     const std::string *dp_str = dp.empty() ? NULL : &dp;
607     // add the word and its index plus its capitalized form optionally
608     if (add_word(ts, wcl, flags, al, dp_str, false) ||
609         add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) {
610       delete dict;
611       return 5;
612     }
613   }
614 
615   delete dict;
616   return 0;
617 }
618 
619 // the hash function is a simple load and rotate
620 // algorithm borrowed
hash(const char * word) const621 int HashMgr::hash(const char* word) const {
622   unsigned long hv = 0;
623   for (int i = 0; i < 4 && *word != 0; i++)
624     hv = (hv << 8) | (*word++);
625   while (*word != 0) {
626     ROTATE(hv, ROTATE_LEN);
627     hv ^= (*word++);
628   }
629   return (unsigned long)hv % tablesize;
630 }
631 
decode_flags(unsigned short ** result,const std::string & flags,FileMgr * af) const632 int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const {
633   int len;
634   if (flags.empty()) {
635     *result = NULL;
636     return 0;
637   }
638   switch (flag_mode) {
639     case FLAG_LONG: {  // two-character flags (1x2yZz -> 1x 2y Zz)
640       len = flags.size();
641       if (len % 2 == 1)
642         HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
643                          af->getlinenum());
644       len /= 2;
645       *result = (unsigned short*)malloc(len * sizeof(unsigned short));
646       if (!*result)
647         return -1;
648       for (int i = 0; i < len; i++) {
649         (*result)[i] = ((unsigned short)((unsigned char)flags[i * 2]) << 8) +
650                        (unsigned char)flags[i * 2 + 1];
651       }
652       break;
653     }
654     case FLAG_NUM: {  // decimal numbers separated by comma (4521,23,233 -> 4521
655                       // 23 233)
656       len = 1;
657       unsigned short* dest;
658       for (size_t i = 0; i < flags.size(); ++i) {
659         if (flags[i] == ',')
660           len++;
661       }
662       *result = (unsigned short*)malloc(len * sizeof(unsigned short));
663       if (!*result)
664         return -1;
665       dest = *result;
666       const char* src = flags.c_str();
667       for (const char* p = src; *p; p++) {
668         if (*p == ',') {
669           int i = atoi(src);
670           if (i >= DEFAULTFLAGS)
671             HUNSPELL_WARNING(
672                 stderr, "error: line %d: flag id %d is too large (max: %d)\n",
673                 af->getlinenum(), i, DEFAULTFLAGS - 1);
674           *dest = (unsigned short)i;
675           if (*dest == 0)
676             HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
677                              af->getlinenum());
678           src = p + 1;
679           dest++;
680         }
681       }
682       int i = atoi(src);
683       if (i >= DEFAULTFLAGS)
684         HUNSPELL_WARNING(stderr,
685                          "error: line %d: flag id %d is too large (max: %d)\n",
686                          af->getlinenum(), i, DEFAULTFLAGS - 1);
687       *dest = (unsigned short)i;
688       if (*dest == 0)
689         HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
690                          af->getlinenum());
691       break;
692     }
693     case FLAG_UNI: {  // UTF-8 characters
694       std::vector<w_char> w;
695       u8_u16(w, flags);
696       len = w.size();
697       *result = (unsigned short*)malloc(len * sizeof(unsigned short));
698       if (!*result)
699         return -1;
700       memcpy(*result, &w[0], len * sizeof(short));
701       break;
702     }
703     default: {  // Ispell's one-character flags (erfg -> e r f g)
704       unsigned short* dest;
705       len = flags.size();
706       *result = (unsigned short*)malloc(len * sizeof(unsigned short));
707       if (!*result)
708         return -1;
709       dest = *result;
710       for (size_t i = 0; i < flags.size(); ++i) {
711         *dest = (unsigned char)flags[i];
712         dest++;
713       }
714     }
715   }
716   return len;
717 }
718 
decode_flags(std::vector<unsigned short> & result,const std::string & flags,FileMgr * af) const719 bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const {
720   if (flags.empty()) {
721     return false;
722   }
723   switch (flag_mode) {
724     case FLAG_LONG: {  // two-character flags (1x2yZz -> 1x 2y Zz)
725       size_t len = flags.size();
726       if (len % 2 == 1)
727         HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
728                          af->getlinenum());
729       len /= 2;
730       result.reserve(result.size() + len);
731       for (size_t i = 0; i < len; ++i) {
732         result.push_back(((unsigned short)((unsigned char)flags[i * 2]) << 8) +
733                          (unsigned char)flags[i * 2 + 1]);
734       }
735       break;
736     }
737     case FLAG_NUM: {  // decimal numbers separated by comma (4521,23,233 -> 4521
738                       // 23 233)
739       const char* src = flags.c_str();
740       for (const char* p = src; *p; p++) {
741         if (*p == ',') {
742           int i = atoi(src);
743           if (i >= DEFAULTFLAGS)
744             HUNSPELL_WARNING(
745                 stderr, "error: line %d: flag id %d is too large (max: %d)\n",
746                 af->getlinenum(), i, DEFAULTFLAGS - 1);
747           result.push_back((unsigned short)i);
748           if (result.back() == 0)
749             HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
750                              af->getlinenum());
751           src = p + 1;
752         }
753       }
754       int i = atoi(src);
755       if (i >= DEFAULTFLAGS)
756         HUNSPELL_WARNING(stderr,
757                          "error: line %d: flag id %d is too large (max: %d)\n",
758                          af->getlinenum(), i, DEFAULTFLAGS - 1);
759       result.push_back((unsigned short)i);
760       if (result.back() == 0)
761         HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
762                          af->getlinenum());
763       break;
764     }
765     case FLAG_UNI: {  // UTF-8 characters
766       std::vector<w_char> w;
767       u8_u16(w, flags);
768       size_t len = w.size();
769       size_t origsize = result.size();
770       result.resize(origsize + len);
771       memcpy(&result[origsize], &w[0], len * sizeof(short));
772       break;
773     }
774     default: {  // Ispell's one-character flags (erfg -> e r f g)
775       result.reserve(flags.size());
776       for (size_t i = 0; i < flags.size(); ++i) {
777         result.push_back((unsigned char)flags[i]);
778       }
779     }
780   }
781   return true;
782 }
783 
decode_flag(const char * f) const784 unsigned short HashMgr::decode_flag(const char* f) const {
785   unsigned short s = 0;
786   int i;
787   switch (flag_mode) {
788     case FLAG_LONG:
789       s = ((unsigned short)((unsigned char)f[0]) << 8) + (unsigned char)f[1];
790       break;
791     case FLAG_NUM:
792       i = atoi(f);
793       if (i >= DEFAULTFLAGS)
794         HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n",
795                          i, DEFAULTFLAGS - 1);
796       s = (unsigned short)i;
797       break;
798     case FLAG_UNI: {
799       std::vector<w_char> w;
800       u8_u16(w, f);
801       if (!w.empty())
802           memcpy(&s, &w[0], 1 * sizeof(short));
803       break;
804     }
805     default:
806       s = *(unsigned char*)f;
807   }
808   if (s == 0)
809     HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
810   return s;
811 }
812 
encode_flag(unsigned short f) const813 char* HashMgr::encode_flag(unsigned short f) const {
814   if (f == 0)
815     return mystrdup("(NULL)");
816   std::string ch;
817   if (flag_mode == FLAG_LONG) {
818     ch.push_back((unsigned char)(f >> 8));
819     ch.push_back((unsigned char)(f - ((f >> 8) << 8)));
820   } else if (flag_mode == FLAG_NUM) {
821     std::ostringstream stream;
822     stream << f;
823     ch = stream.str();
824   } else if (flag_mode == FLAG_UNI) {
825     const w_char* w_c = (const w_char*)&f;
826     std::vector<w_char> w(w_c, w_c + 1);
827     u16_u8(ch, w);
828   } else {
829     ch.push_back((unsigned char)(f));
830   }
831   return mystrdup(ch.c_str());
832 }
833 
834 // read in aff file and set flag mode
load_config(const char * affpath,const char * key)835 int HashMgr::load_config(const char* affpath, const char* key) {
836   int firstline = 1;
837 
838   // open the affix file
839   FileMgr* afflst = new FileMgr(affpath, key);
840   if (!afflst) {
841     HUNSPELL_WARNING(
842         stderr, "Error - could not open affix description file %s\n", affpath);
843     return 1;
844   }
845 
846   // read in each line ignoring any that do not
847   // start with a known line type indicator
848 
849   std::string line;
850   while (afflst->getline(line)) {
851     mychomp(line);
852 
853     /* remove byte order mark */
854     if (firstline) {
855       firstline = 0;
856       if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
857         line.erase(0, 3);
858       }
859     }
860 
861     /* parse in the try string */
862     if ((line.compare(0, 4, "FLAG", 4) == 0) && line.size() > 4 && isspace(line[4])) {
863       if (flag_mode != FLAG_CHAR) {
864         HUNSPELL_WARNING(stderr,
865                          "error: line %d: multiple definitions of the FLAG "
866                          "affix file parameter\n",
867                          afflst->getlinenum());
868       }
869       if (line.find("long") != std::string::npos)
870         flag_mode = FLAG_LONG;
871       if (line.find("num") != std::string::npos)
872         flag_mode = FLAG_NUM;
873       if (line.find("UTF-8") != std::string::npos)
874         flag_mode = FLAG_UNI;
875       if (flag_mode == FLAG_CHAR) {
876         HUNSPELL_WARNING(
877             stderr,
878             "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n",
879             afflst->getlinenum());
880       }
881     }
882 
883     if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
884       std::string st;
885       if (!parse_string(line, st, afflst->getlinenum())) {
886         delete afflst;
887         return 1;
888       }
889       forbiddenword = decode_flag(st.c_str());
890     }
891 
892     if (line.compare(0, 3, "SET", 3) == 0) {
893       if (!parse_string(line, enc, afflst->getlinenum())) {
894         delete afflst;
895         return 1;
896       }
897       if (enc == "UTF-8") {
898         utf8 = 1;
899 #ifndef OPENOFFICEORG
900 #ifndef MOZILLA_CLIENT
901         initialize_utf_tbl();
902 #endif
903 #endif
904       } else
905         csconv = get_current_cs(enc);
906     }
907 
908     if (line.compare(0, 4, "LANG", 4) == 0) {
909       if (!parse_string(line, lang, afflst->getlinenum())) {
910         delete afflst;
911         return 1;
912       }
913       langnum = get_lang_num(lang);
914     }
915 
    /* parse in the ignored characters (for example, Arabic optional
     * diacritic characters) */
918     if (line.compare(0, 6, "IGNORE", 6) == 0) {
919       if (!parse_array(line, ignorechars, ignorechars_utf16,
920                        utf8, afflst->getlinenum())) {
921         delete afflst;
922         return 1;
923       }
924     }
925 
926     if ((line.compare(0, 2, "AF", 2) == 0) && line.size() > 2 && isspace(line[2])) {
927       if (!parse_aliasf(line, afflst)) {
928         delete afflst;
929         return 1;
930       }
931     }
932 
933     if ((line.compare(0, 2, "AM", 2) == 0) && line.size() > 2 && isspace(line[2])) {
934       if (!parse_aliasm(line, afflst)) {
935         delete afflst;
936         return 1;
937       }
938     }
939 
940     if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
941       complexprefixes = 1;
942 
943     if (((line.compare(0, 3, "SFX", 3) == 0) ||
944          (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3]))
945       break;
946   }
947 
948   if (csconv == NULL)
949     csconv = get_current_cs(SPELL_ENCODING);
950   delete afflst;
951   return 0;
952 }
953 
954 /* parse in the ALIAS table */
parse_aliasf(const std::string & line,FileMgr * af)955 bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) {
956   if (numaliasf != 0) {
957     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
958                      af->getlinenum());
959     return false;
960   }
961   int i = 0;
962   int np = 0;
963   std::string::const_iterator iter = line.begin();
964   std::string::const_iterator start_piece = mystrsep(line, iter);
965   while (start_piece != line.end()) {
966     switch (i) {
967       case 0: {
968         np++;
969         break;
970       }
971       case 1: {
972         numaliasf = atoi(std::string(start_piece, iter).c_str());
973         if (numaliasf < 1) {
974           numaliasf = 0;
975           aliasf = NULL;
976           aliasflen = NULL;
977           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
978                            af->getlinenum());
979           return false;
980         }
981         aliasf =
982             (unsigned short**)malloc(numaliasf * sizeof(unsigned short*));
983         aliasflen =
984             (unsigned short*)malloc(numaliasf * sizeof(unsigned short));
985         if (!aliasf || !aliasflen) {
986           numaliasf = 0;
987           if (aliasf)
988             free(aliasf);
989           if (aliasflen)
990             free(aliasflen);
991           aliasf = NULL;
992           aliasflen = NULL;
993           return false;
994         }
995         np++;
996         break;
997       }
998       default:
999         break;
1000     }
1001     ++i;
1002     start_piece = mystrsep(line, iter);
1003   }
1004   if (np != 2) {
1005     numaliasf = 0;
1006     free(aliasf);
1007     free(aliasflen);
1008     aliasf = NULL;
1009     aliasflen = NULL;
1010     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1011                      af->getlinenum());
1012     return false;
1013   }
1014 
1015   /* now parse the numaliasf lines to read in the remainder of the table */
1016   for (int j = 0; j < numaliasf; j++) {
1017     std::string nl;
1018     if (!af->getline(nl))
1019       return false;
1020     mychomp(nl);
1021     i = 0;
1022     aliasf[j] = NULL;
1023     aliasflen[j] = 0;
1024     iter = nl.begin();
1025     start_piece = mystrsep(nl, iter);
1026     while (start_piece != nl.end()) {
1027       switch (i) {
1028         case 0: {
1029           if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) {
1030             numaliasf = 0;
1031             free(aliasf);
1032             free(aliasflen);
1033             aliasf = NULL;
1034             aliasflen = NULL;
1035             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1036                              af->getlinenum());
1037             return false;
1038           }
1039           break;
1040         }
1041         case 1: {
1042           std::string piece(start_piece, iter);
1043           aliasflen[j] =
1044               (unsigned short)decode_flags(&(aliasf[j]), piece, af);
1045           std::sort(aliasf[j], aliasf[j] + aliasflen[j]);
1046           break;
1047         }
1048         default:
1049           break;
1050       }
1051       ++i;
1052       start_piece = mystrsep(nl, iter);
1053     }
1054     if (!aliasf[j]) {
1055       free(aliasf);
1056       free(aliasflen);
1057       aliasf = NULL;
1058       aliasflen = NULL;
1059       numaliasf = 0;
1060       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1061                        af->getlinenum());
1062       return false;
1063     }
1064   }
1065   return true;
1066 }
1067 
is_aliasf() const1068 int HashMgr::is_aliasf() const {
1069   return (aliasf != NULL);
1070 }
1071 
get_aliasf(int index,unsigned short ** fvec,FileMgr * af) const1072 int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) const {
1073   if ((index > 0) && (index <= numaliasf)) {
1074     *fvec = aliasf[index - 1];
1075     return aliasflen[index - 1];
1076   }
1077   HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n",
1078                    af->getlinenum(), index);
1079   *fvec = NULL;
1080   return 0;
1081 }
1082 
1083 /* parse morph alias definitions */
parse_aliasm(const std::string & line,FileMgr * af)1084 bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) {
1085   if (numaliasm != 0) {
1086     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
1087                      af->getlinenum());
1088     return false;
1089   }
1090   int i = 0;
1091   int np = 0;
1092   std::string::const_iterator iter = line.begin();
1093   std::string::const_iterator start_piece = mystrsep(line, iter);
1094   while (start_piece != line.end()) {
1095     switch (i) {
1096       case 0: {
1097         np++;
1098         break;
1099       }
1100       case 1: {
1101         numaliasm = atoi(std::string(start_piece, iter).c_str());
1102         if (numaliasm < 1) {
1103           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
1104                            af->getlinenum());
1105           return false;
1106         }
1107         aliasm = (char**)malloc(numaliasm * sizeof(char*));
1108         if (!aliasm) {
1109           numaliasm = 0;
1110           return false;
1111         }
1112         np++;
1113         break;
1114       }
1115       default:
1116         break;
1117     }
1118     ++i;
1119     start_piece = mystrsep(line, iter);
1120   }
1121   if (np != 2) {
1122     numaliasm = 0;
1123     free(aliasm);
1124     aliasm = NULL;
1125     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1126                      af->getlinenum());
1127     return false;
1128   }
1129 
1130   /* now parse the numaliasm lines to read in the remainder of the table */
1131   for (int j = 0; j < numaliasm; j++) {
1132     std::string nl;
1133     if (!af->getline(nl))
1134       return false;
1135     mychomp(nl);
1136     aliasm[j] = NULL;
1137     iter = nl.begin();
1138     i = 0;
1139     start_piece = mystrsep(nl, iter);
1140     while (start_piece != nl.end()) {
1141       switch (i) {
1142         case 0: {
1143           if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) {
1144             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1145                              af->getlinenum());
1146             numaliasm = 0;
1147             free(aliasm);
1148             aliasm = NULL;
1149             return false;
1150           }
1151           break;
1152         }
1153         case 1: {
1154           // add the remaining of the line
1155           std::string::const_iterator end = nl.end();
1156           std::string chunk(start_piece, end);
1157           if (complexprefixes) {
1158             if (utf8)
1159               reverseword_utf(chunk);
1160             else
1161               reverseword(chunk);
1162           }
1163           aliasm[j] = mystrdup(chunk.c_str());
1164           break;
1165         }
1166         default:
1167           break;
1168       }
1169       ++i;
1170       start_piece = mystrsep(nl, iter);
1171     }
1172     if (!aliasm[j]) {
1173       numaliasm = 0;
1174       free(aliasm);
1175       aliasm = NULL;
1176       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1177                        af->getlinenum());
1178       return false;
1179     }
1180   }
1181   return true;
1182 }
1183 
is_aliasm() const1184 int HashMgr::is_aliasm() const {
1185   return (aliasm != NULL);
1186 }
1187 
get_aliasm(int index) const1188 char* HashMgr::get_aliasm(int index) const {
1189   if ((index > 0) && (index <= numaliasm))
1190     return aliasm[index - 1];
1191   HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
1192   return NULL;
1193 }
1194