1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2017 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37 /*
38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 * And Contributors. All rights reserved.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 *
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 *
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 *
52 * 3. All modifications to the source code must be clearly marked as
53 * such. Binary redistributions based on modified source code
54 * must be clearly marked as modified versions in the documentation
55 * and/or other materials provided with the distribution.
56 *
57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71 #include <stdlib.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <time.h>
75
76 #include "affixmgr.hxx"
77 #include "hunspell.hxx"
78 #include "suggestmgr.hxx"
79 #include "hunspell.h"
80 #include "csutil.hxx"
81
82 #include <limits>
83 #include <string>
84
// Byte-length limit for input words when the dictionary is UTF-8 encoded:
// spell_internal() and suggest_internal() reject words whose size() reaches
// this value. Defined as three times MAXWORDLEN (presumably to allow up to
// three UTF-8 bytes per character -- TODO confirm).
#define MAXWORDUTF8LEN (MAXWORDLEN * 3)
86
// Implementation class behind the spell checker. It owns one affix manager,
// one or more hash (dictionary) managers and one suggestion manager, and
// offers both a std::string based interface and a C-style (char*) interface.
class HunspellImpl
{
public:
  // affpath: affix file, dpath: dictionary file, key: optional key that is
  // passed through to the hash and affix managers.
  HunspellImpl(const char* affpath, const char* dpath, const char* key = NULL);
  ~HunspellImpl();
  // load an extra dictionary that shares the affix file given to the constructor
  int add_dic(const char* dpath, const char* key = NULL);

  // std::string / std::vector based interface
  std::vector<std::string> suffix_suggest(const std::string& root_word);
  std::vector<std::string> generate(const std::string& word, const std::vector<std::string>& pl);
  std::vector<std::string> generate(const std::string& word, const std::string& pattern);
  std::vector<std::string> stem(const std::string& word);
  std::vector<std::string> stem(const std::vector<std::string>& morph);
  std::vector<std::string> analyze(const std::string& word);
  int get_langnum() const;
  bool input_conv(const std::string& word, std::string& dest);
  // info (optional) receives SPELL_* flags, root (optional) receives the root word
  bool spell(const std::string& word, int* info = NULL, std::string* root = NULL);
  std::vector<std::string> suggest(const std::string& word);
  const std::string& get_wordchars_cpp() const;
  const std::vector<w_char>& get_wordchars_utf16() const;
  const std::string& get_dict_encoding() const;
  int add(const std::string& word);
  int add_with_affix(const std::string& word, const std::string& example);
  int remove(const std::string& word);
  const std::string& get_version_cpp() const;
  struct cs_info* get_csconv();

  // C-style (char*) counterparts of the interface above
  int spell(const char* word, int* info = NULL, char** root = NULL);
  int suggest(char*** slst, const char* word);
  int suffix_suggest(char*** slst, const char* root_word);
  void free_list(char*** slst, int n);
  char* get_dic_encoding();
  int analyze(char*** slst, const char* word);
  int stem(char*** slst, const char* word);
  int stem(char*** slst, char** morph, int n);
  int generate(char*** slst, const char* word, const char* word2);
  int generate(char*** slst, const char* word, char** desc, int n);
  const char* get_wordchars() const;
  const char* get_version() const;
  int input_conv(const char* word, char* dest, size_t destsize);

private:
  AffixMgr* pAMgr;                     // affix manager, created in the constructor
  std::vector<HashMgr*> m_HMgrs;       // one hash manager per loaded dictionary
  SuggestMgr* pSMgr;                   // suggestion manager, created in the constructor
  char* affixpath;                     // copy of the affix file path (reused by add_dic)
  std::string encoding;                // dictionary encoding reported by the affix manager
  struct cs_info* csconv;              // character table, set only when utf8 is 0
  int langnum;                         // language number reported by the affix manager
  int utf8;                            // non-zero when the dictionary is UTF-8 encoded
  int complexprefixes;                 // non-zero: words are reversed before lookup
  std::vector<std::string> wordbreak;  // break patterns used by spell_internal()

private:
  std::vector<std::string> analyze_internal(const std::string& word);
  bool spell_internal(const std::string& word, int* info = NULL, std::string* root = NULL);
  std::vector<std::string> suggest_internal(const std::string& word,
                                            bool& capitalized, size_t& abbreviated, int& captype);
  void cleanword(std::string& dest, const std::string&, int* pcaptype, int* pabbrev);
  size_t cleanword2(std::string& dest,
                    std::vector<w_char>& dest_u,
                    const std::string& src,
                    int* pcaptype,
                    size_t* pabbrev);
  void clean_ignore(std::string& dest, const std::string& src);
  void mkinitcap(std::string& u8);
  int mkinitcap2(std::string& u8, std::vector<w_char>& u16);
  int mkinitsmall2(std::string& u8, std::vector<w_char>& u16);
  void mkallcap(std::string& u8);
  int mkallsmall2(std::string& u8, std::vector<w_char>& u16);
  struct hentry* checkword(const std::string& source, int* info, std::string* root);
  std::string sharps_u8_l1(const std::string& source);
  hentry*
  spellsharps(std::string& base, size_t start_pos, int, int, int* info, std::string* root);
  int is_keepcase(const hentry* rv);
  void insert_sug(std::vector<std::string>& slst, const std::string& word);
  void cat_result(std::string& result, const std::string& st);
  std::vector<std::string> spellml(const std::string& word);
  std::string get_xml_par(const char* par);
  const char* get_xml_pos(const char* s, const char* attr);
  std::vector<std::string> get_xml_list(const char* list, const char* tag);
  int check_xml_par(const char* q, const char* attr, const char* value);
private:
  // declared but not defined here: copying is not supported
  HunspellImpl(const HunspellImpl&);
  HunspellImpl& operator=(const HunspellImpl&);
};
171
HunspellImpl(const char * affpath,const char * dpath,const char * key)172 HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) {
173 csconv = NULL;
174 utf8 = 0;
175 complexprefixes = 0;
176 affixpath = mystrdup(affpath);
177
178 /* first set up the hash manager */
179 m_HMgrs.push_back(new HashMgr(dpath, affpath, key));
180
181 /* next set up the affix manager */
182 /* it needs access to the hash manager lookup methods */
183 pAMgr = new AffixMgr(affpath, m_HMgrs, key);
184
185 /* get the preferred try string and the dictionary */
186 /* encoding from the Affix Manager for that dictionary */
187 char* try_string = pAMgr->get_try_string();
188 encoding = pAMgr->get_encoding();
189 langnum = pAMgr->get_langnum();
190 utf8 = pAMgr->get_utf8();
191 if (!utf8)
192 csconv = get_current_cs(encoding);
193 complexprefixes = pAMgr->get_complexprefixes();
194 wordbreak = pAMgr->get_breaktable();
195
196 /* and finally set up the suggestion manager */
197 pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr);
198 if (try_string)
199 free(try_string);
200 }
201
~HunspellImpl()202 HunspellImpl::~HunspellImpl() {
203 delete pSMgr;
204 delete pAMgr;
205 for (size_t i = 0; i < m_HMgrs.size(); ++i)
206 delete m_HMgrs[i];
207 pSMgr = NULL;
208 pAMgr = NULL;
209 #ifdef MOZILLA_CLIENT
210 delete[] csconv;
211 #endif
212 csconv = NULL;
213 if (affixpath)
214 free(affixpath);
215 affixpath = NULL;
216 }
217
218 // load extra dictionaries
add_dic(const char * dpath,const char * key)219 int HunspellImpl::add_dic(const char* dpath, const char* key) {
220 if (!affixpath)
221 return 1;
222 m_HMgrs.push_back(new HashMgr(dpath, affixpath, key));
223 return 0;
224 }
225
226
227 // make a copy of src at dest while removing all characters
228 // specified in IGNORE rule
clean_ignore(std::string & dest,const std::string & src)229 void HunspellImpl::clean_ignore(std::string& dest,
230 const std::string& src) {
231 dest.clear();
232 dest.assign(src);
233 const char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL;
234 if (ignoredchars != NULL) {
235 if (utf8) {
236 const std::vector<w_char>& ignoredchars_utf16 =
237 pAMgr->get_ignore_utf16();
238 remove_ignored_chars_utf(dest, ignoredchars_utf16);
239 } else {
240 remove_ignored_chars(dest, ignoredchars);
241 }
242 }
243 }
244
245
246 // make a copy of src at destination while removing all leading
247 // blanks and removing any trailing periods after recording
248 // their presence with the abbreviation flag
249 // also since already going through character by character,
250 // set the capitalization type
251 // return the length of the "cleaned" (and UTF-8 encoded) word
252
cleanword2(std::string & dest,std::vector<w_char> & dest_utf,const std::string & src,int * pcaptype,size_t * pabbrev)253 size_t HunspellImpl::cleanword2(std::string& dest,
254 std::vector<w_char>& dest_utf,
255 const std::string& src,
256 int* pcaptype,
257 size_t* pabbrev) {
258 dest.clear();
259 dest_utf.clear();
260
261 // remove IGNORE characters from the string
262 std::string w2;
263 clean_ignore(w2, src);
264
265 const char* q = w2.c_str();
266
267 // first skip over any leading blanks
268 while (*q == ' ')
269 ++q;
270
271 // now strip off any trailing periods (recording their presence)
272 *pabbrev = 0;
273 int nl = strlen(q);
274 while ((nl > 0) && (*(q + nl - 1) == '.')) {
275 nl--;
276 (*pabbrev)++;
277 }
278
279 // if no characters are left it can't be capitalized
280 if (nl <= 0) {
281 *pcaptype = NOCAP;
282 return 0;
283 }
284
285 dest.append(q, nl);
286 nl = dest.size();
287 if (utf8) {
288 u8_u16(dest_utf, dest);
289 *pcaptype = get_captype_utf8(dest_utf, langnum);
290 } else {
291 *pcaptype = get_captype(dest, csconv);
292 }
293 return nl;
294 }
295
cleanword(std::string & dest,const std::string & src,int * pcaptype,int * pabbrev)296 void HunspellImpl::cleanword(std::string& dest,
297 const std::string& src,
298 int* pcaptype,
299 int* pabbrev) {
300 dest.clear();
301 const unsigned char* q = (const unsigned char*)src.c_str();
302 int firstcap = 0;
303
304 // first skip over any leading blanks
305 while (*q == ' ')
306 ++q;
307
308 // now strip off any trailing periods (recording their presence)
309 *pabbrev = 0;
310 int nl = strlen((const char*)q);
311 while ((nl > 0) && (*(q + nl - 1) == '.')) {
312 nl--;
313 (*pabbrev)++;
314 }
315
316 // if no characters are left it can't be capitalized
317 if (nl <= 0) {
318 *pcaptype = NOCAP;
319 return;
320 }
321
322 // now determine the capitalization type of the first nl letters
323 int ncap = 0;
324 int nneutral = 0;
325 int nc = 0;
326
327 if (!utf8) {
328 while (nl > 0) {
329 nc++;
330 if (csconv[(*q)].ccase)
331 ncap++;
332 if (csconv[(*q)].cupper == csconv[(*q)].clower)
333 nneutral++;
334 dest.push_back(*q++);
335 nl--;
336 }
337 // remember to terminate the destination string
338 firstcap = csconv[static_cast<unsigned char>(dest[0])].ccase;
339 } else {
340 std::vector<w_char> t;
341 u8_u16(t, src);
342 for (size_t i = 0; i < t.size(); ++i) {
343 unsigned short idx = (t[i].h << 8) + t[i].l;
344 unsigned short low = unicodetolower(idx, langnum);
345 if (idx != low)
346 ncap++;
347 if (unicodetoupper(idx, langnum) == low)
348 nneutral++;
349 }
350 u16_u8(dest, t);
351 if (ncap) {
352 unsigned short idx = (t[0].h << 8) + t[0].l;
353 firstcap = (idx != unicodetolower(idx, langnum));
354 }
355 }
356
357 // now finally set the captype
358 if (ncap == 0) {
359 *pcaptype = NOCAP;
360 } else if ((ncap == 1) && firstcap) {
361 *pcaptype = INITCAP;
362 } else if ((ncap == nc) || ((ncap + nneutral) == nc)) {
363 *pcaptype = ALLCAP;
364 } else if ((ncap > 1) && firstcap) {
365 *pcaptype = HUHINITCAP;
366 } else {
367 *pcaptype = HUHCAP;
368 }
369 }
370
mkallcap(std::string & u8)371 void HunspellImpl::mkallcap(std::string& u8) {
372 if (utf8) {
373 std::vector<w_char> u16;
374 u8_u16(u16, u8);
375 ::mkallcap_utf(u16, langnum);
376 u16_u8(u8, u16);
377 } else {
378 ::mkallcap(u8, csconv);
379 }
380 }
381
mkallsmall2(std::string & u8,std::vector<w_char> & u16)382 int HunspellImpl::mkallsmall2(std::string& u8, std::vector<w_char>& u16) {
383 if (utf8) {
384 ::mkallsmall_utf(u16, langnum);
385 u16_u8(u8, u16);
386 } else {
387 ::mkallsmall(u8, csconv);
388 }
389 return u8.size();
390 }
391
392 // convert UTF-8 sharp S codes to latin 1
sharps_u8_l1(const std::string & source)393 std::string HunspellImpl::sharps_u8_l1(const std::string& source) {
394 std::string dest(source);
395 mystrrep(dest, "\xC3\x9F", "\xDF");
396 return dest;
397 }
398
399 // recursive search for right ss - sharp s permutations
spellsharps(std::string & base,size_t n_pos,int n,int repnum,int * info,std::string * root)400 hentry* HunspellImpl::spellsharps(std::string& base,
401 size_t n_pos,
402 int n,
403 int repnum,
404 int* info,
405 std::string* root) {
406 size_t pos = base.find("ss", n_pos);
407 if (pos != std::string::npos && (n < MAXSHARPS)) {
408 base[pos] = '\xC3';
409 base[pos + 1] = '\x9F';
410 hentry* h = spellsharps(base, pos + 2, n + 1, repnum + 1, info, root);
411 if (h)
412 return h;
413 base[pos] = 's';
414 base[pos + 1] = 's';
415 h = spellsharps(base, pos + 2, n + 1, repnum, info, root);
416 if (h)
417 return h;
418 } else if (repnum > 0) {
419 if (utf8)
420 return checkword(base, info, root);
421 std::string tmp(sharps_u8_l1(base));
422 return checkword(tmp, info, root);
423 }
424 return NULL;
425 }
426
is_keepcase(const hentry * rv)427 int HunspellImpl::is_keepcase(const hentry* rv) {
428 return pAMgr && rv->astr && pAMgr->get_keepcase() &&
429 TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen);
430 }
431
432 /* insert a word to the beginning of the suggestion array */
insert_sug(std::vector<std::string> & slst,const std::string & word)433 void HunspellImpl::insert_sug(std::vector<std::string>& slst, const std::string& word) {
434 slst.insert(slst.begin(), word);
435 }
436
spell(const std::string & word,int * info,std::string * root)437 bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) {
438 bool r = spell_internal(word, info, root);
439 if (r && root) {
440 // output conversion
441 RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
442 if (rl) {
443 std::string wspace;
444 if (rl->conv(*root, wspace)) {
445 *root = wspace;
446 }
447 }
448 }
449 return r;
450 }
451
// Core of the spell check: clean the input word, accept plain numbers, try
// the capitalization variants that fit the detected capitalization type and
// finally try to split the word at the configured break patterns.
//
// word: word to check
// info: optional; reset to 0 and then updated with SPELL_* flags
// root: optional; cleared and then filled by checkword() where it sets it
//
// Returns true when the word is accepted.
bool HunspellImpl::spell_internal(const std::string& word, int* info, std::string* root) {
  struct hentry* rv = NULL;

  // make sure *info can be written unconditionally below
  int info2 = 0;
  if (!info)
    info = &info2;
  else
    *info = 0;

  // Hunspell supports XML input of the simplified API (see manual)
  if (word == SPELL_XML)
    return true;
  // words at or above the length limit are rejected without checking
  if (utf8) {
    if (word.size() >= MAXWORDUTF8LEN)
      return false;
  } else {
    if (word.size() >= MAXWORDLEN)
      return false;
  }
  int captype = NOCAP;
  size_t abbv = 0;  // number of trailing periods stripped by cleanword2()
  size_t wl = 0;    // length of the cleaned word

  std::string scw;             // cleaned word
  std::vector<w_char> sunicw;  // UTF-16 form of the cleaned word (UTF-8 mode)

  // input conversion
  RepList* rl = pAMgr ? pAMgr->get_iconvtable() : NULL;
  {
    std::string wspace;

    // use the converted word only if the conversion reports success
    bool convstatus = rl ? rl->conv(word, wspace) : false;
    if (convstatus)
      wl = cleanword2(scw, sunicw, wspace, &captype, &abbv);
    else
      wl = cleanword2(scw, sunicw, word, &captype, &abbv);
  }

#ifdef MOZILLA_CLIENT
  // accept the abbreviated words without dots
  // workaround for the incomplete tokenization of Mozilla
  abbv = 1;
#endif

  // nothing left after cleaning, or no dictionary loaded: accept
  if (wl == 0 || m_HMgrs.empty())
    return true;
  if (root)
    root->clear();

  // allow numbers with dots, dashes and commas (but forbid double separators:
  // "..", "--" etc.)
  enum { NBEGIN, NNUM, NSEP };
  int nstate = NBEGIN;
  size_t i;

  for (i = 0; (i < wl); i++) {
    if ((scw[i] <= '9') && (scw[i] >= '0')) {
      nstate = NNUM;
    } else if ((scw[i] == ',') || (scw[i] == '.') || (scw[i] == '-')) {
      if ((nstate == NSEP) || (i == 0))
        break;
      nstate = NSEP;
    } else
      break;
  }
  // the whole word was scanned and it ends with a digit
  if ((i == wl) && (nstate == NNUM))
    return true;

  switch (captype) {
    case HUHCAP:
    /* FALLTHROUGH */
    case HUHINITCAP:
      *info += SPELL_ORIGCAP;
    /* FALLTHROUGH */
    case NOCAP:
      rv = checkword(scw, info, root);
      // retry with one trailing period when periods were stripped
      if ((abbv) && !(rv)) {
        std::string u8buffer(scw);
        u8buffer.push_back('.');
        rv = checkword(u8buffer, info, root);
      }
      break;
    case ALLCAP: {
      *info += SPELL_ORIGCAP;
      rv = checkword(scw, info, root);
      if (rv)
        break;
      if (abbv) {
        std::string u8buffer(scw);
        u8buffer.push_back('.');
        rv = checkword(u8buffer, info, root);
        if (rv)
          break;
      }
      // Spec. prefix handling for Catalan, French, Italian:
      // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
      size_t apos = pAMgr ? scw.find('\'') : std::string::npos;
      if (apos != std::string::npos) {
        mkallsmall2(scw, sunicw);
        //conversion may result in string with different len to pre-mkallsmall2
        //so re-scan
        if (apos != std::string::npos && apos < scw.size() - 1) {
          std::string part1 = scw.substr(0, apos+1);
          std::string part2 = scw.substr(apos+1);
          if (utf8) {
            std::vector<w_char> part1u, part2u;
            u8_u16(part1u, part1);
            u8_u16(part2u, part2);
            // capitalize only the part after the apostrophe
            mkinitcap2(part2, part2u);
            scw = part1 + part2;
            sunicw = part1u;
            sunicw.insert(sunicw.end(), part2u.begin(), part2u.end());
            rv = checkword(scw, info, root);
            if (rv)
              break;
          } else {
            mkinitcap2(part2, sunicw);
            scw = part1 + part2;
            rv = checkword(scw, info, root);
            if (rv)
              break;
          }
          // also try with the first letter of the whole word capitalized
          mkinitcap2(scw, sunicw);
          rv = checkword(scw, info, root);
          if (rv)
            break;
        }
      }
      // CHECKSHARPS: try the sharp s permutations of the lower case and the
      // initial capital forms
      if (pAMgr && pAMgr->get_checksharps() && scw.find("SS") != std::string::npos) {

        mkallsmall2(scw, sunicw);
        std::string u8buffer(scw);
        rv = spellsharps(u8buffer, 0, 0, 0, info, root);
        if (!rv) {
          mkinitcap2(scw, sunicw);
          rv = spellsharps(scw, 0, 0, 0, info, root);
        }
        if ((abbv) && !(rv)) {
          u8buffer.push_back('.');
          rv = spellsharps(u8buffer, 0, 0, 0, info, root);
          if (!rv) {
            u8buffer = std::string(scw);
            u8buffer.push_back('.');
            rv = spellsharps(u8buffer, 0, 0, 0, info, root);
          }
        }
        if (rv)
          break;
      }
    }
    /* FALLTHROUGH */
    case INITCAP: {
      // handle special capitalization of dotted I
      bool Idot = (utf8 && (unsigned char) scw[0] == 0xc4 && (unsigned char) scw[1] == 0xb0);
      *info += SPELL_ORIGCAP;
      if (captype == ALLCAP) {
        mkallsmall2(scw, sunicw);
        mkinitcap2(scw, sunicw);
        // put the dotted capital I (bytes 0xC4 0xB0) back in front
        if (Idot)
          scw.replace(0, 1, "\xc4\xb0");
      }
      // SPELL_INITCAP is set only for the duration of the checkword() call
      if (captype == INITCAP)
        *info += SPELL_INITCAP;
      rv = checkword(scw, info, root);
      if (captype == INITCAP)
        *info -= SPELL_INITCAP;
      // forbid bad capitalization
      // (for example, ijs -> Ijs instead of IJs in Dutch)
      // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
      if (*info & SPELL_FORBIDDEN) {
        rv = NULL;
        break;
      }
      // a KEEPCASE entry must not match an all capital input
      if (rv && is_keepcase(rv) && (captype == ALLCAP))
        rv = NULL;
      if (rv || (Idot && langnum != LANG_az && langnum != LANG_tr && langnum != LANG_crh))
        break;

      // u8buffer holds the lower case form, scw the initial capital form
      mkallsmall2(scw, sunicw);
      std::string u8buffer(scw);
      mkinitcap2(scw, sunicw);

      rv = checkword(u8buffer, info, root);
      if (abbv && !rv) {
        u8buffer.push_back('.');
        rv = checkword(u8buffer, info, root);
        if (!rv) {
          u8buffer = scw;
          u8buffer.push_back('.');
          if (captype == INITCAP)
            *info += SPELL_INITCAP;
          rv = checkword(u8buffer, info, root);
          if (captype == INITCAP)
            *info -= SPELL_INITCAP;
          if (rv && is_keepcase(rv) && (captype == ALLCAP))
            rv = NULL;
          break;
        }
      }
      if (rv && is_keepcase(rv) &&
          ((captype == ALLCAP) ||
           // if CHECKSHARPS: KEEPCASE words with \xDF are allowed
           // in INITCAP form, too.
           !(pAMgr->get_checksharps() &&
             ((utf8 && u8buffer.find("\xC3\x9F") != std::string::npos) ||
              (!utf8 && u8buffer.find('\xDF') != std::string::npos)))))
        rv = NULL;
      break;
    }
  }

  if (rv) {
    // entries carrying the WARN flag are reported through SPELL_WARN and are
    // rejected when FORBIDWARN is set
    if (pAMgr && pAMgr->get_warn() && rv->astr &&
        TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) {
      *info += SPELL_WARN;
      if (pAMgr->get_forbidwarn())
        return false;
      return true;
    }
    return true;
  }

  // recursive breaking at break points
  if (!wordbreak.empty() && !(*info & SPELL_FORBIDDEN)) {

    int nbr = 0;
    wl = scw.size();

    // calculate break points for recursion limit
    for (size_t j = 0; j < wordbreak.size(); ++j) {
      size_t pos = 0;
      while ((pos = scw.find(wordbreak[j], pos)) != std::string::npos) {
        ++nbr;
        pos += wordbreak[j].size();
      }
    }
    // too many break points: give up instead of recursing
    if (nbr >= 10)
      return false;

    // check boundary patterns (^begin and end$)
    for (size_t j = 0; j < wordbreak.size(); ++j) {
      size_t plen = wordbreak[j].size();
      if (plen == 1 || plen > wl)
        continue;

      // "^xyz": the word starts with xyz and the rest is a correct word
      if (wordbreak[j][0] == '^' &&
          scw.compare(0, plen - 1, wordbreak[j], 1, plen -1) == 0 && spell(scw.substr(plen - 1)))
        return true;

      // "xyz$": the word ends with xyz and the part before it is a correct word
      if (wordbreak[j][plen - 1] == '$' &&
          scw.compare(wl - plen + 1, plen - 1, wordbreak[j], 0, plen - 1) == 0) {
        std::string suffix(scw.substr(wl - plen + 1));
        scw.resize(wl - plen + 1);
        if (spell(scw))
          return true;
        // restore the full word for the following patterns
        scw.append(suffix);
      }
    }

    // other patterns
    for (size_t j = 0; j < wordbreak.size(); ++j) {
      size_t plen = wordbreak[j].size();
      size_t found = scw.find(wordbreak[j]);
      // npos never passes the second comparison, so only real inner matches
      // get here
      if ((found > 0) && (found < wl - plen)) {
        size_t found2 = scw.find(wordbreak[j], found + 1);
        // try to break at the second occurrence
        // to recognize dictionary words with wordbreak
        if (found2 > 0 && (found2 < wl - plen))
          found = found2;
        if (!spell(scw.substr(found + plen)))
          continue;
        std::string suffix(scw.substr(found));
        scw.resize(found);
        // examine 2 sides of the break point
        if (spell(scw))
          return true;
        scw.append(suffix);

        // LANG_hu: spec. dash rule
        if (langnum == LANG_hu && wordbreak[j] == "-") {
          suffix = scw.substr(found + 1);
          scw.resize(found + 1);
          if (spell(scw))
            return true;  // check the first part with dash
          scw.append(suffix);
        }
        // end of LANG specific region
      }
    }

    // other patterns (break at first break point)
    for (size_t j = 0; j < wordbreak.size(); ++j) {
      size_t plen = wordbreak[j].size();
      size_t found = scw.find(wordbreak[j]);
      if ((found > 0) && (found < wl - plen)) {
        if (!spell(scw.substr(found + plen)))
          continue;
        std::string suffix(scw.substr(found));
        scw.resize(found);
        // examine 2 sides of the break point
        if (spell(scw))
          return true;
        scw.append(suffix);

        // LANG_hu: spec. dash rule
        if (langnum == LANG_hu && wordbreak[j] == "-") {
          suffix = scw.substr(found + 1);
          scw.resize(found + 1);
          if (spell(scw))
            return true;  // check the first part with dash
          scw.append(suffix);
        }
        // end of LANG specific region
      }
    }
  }

  return false;
}
771
checkword(const std::string & w,int * info,std::string * root)772 struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::string* root) {
773 bool usebuffer = false;
774 std::string w2;
775 const char* word;
776 int len;
777
778 // remove IGNORE characters from the string
779 clean_ignore(w2, w);
780
781 word = w2.c_str();
782 len = w2.size();
783 usebuffer = true;
784
785 if (!len)
786 return NULL;
787
788 // word reversing wrapper for complex prefixes
789 if (complexprefixes) {
790 if (!usebuffer) {
791 w2.assign(word);
792 usebuffer = true;
793 }
794 if (utf8)
795 reverseword_utf(w2);
796 else
797 reverseword(w2);
798 }
799
800 if (usebuffer) {
801 word = w2.c_str();
802 }
803
804 // look word in hash table
805 struct hentry* he = NULL;
806 for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) {
807 he = m_HMgrs[i]->lookup(word);
808
809 // check forbidden and onlyincompound words
810 if ((he) && (he->astr) && (pAMgr) &&
811 TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
812 if (info)
813 *info += SPELL_FORBIDDEN;
814 // LANG_hu section: set dash information for suggestions
815 if (langnum == LANG_hu) {
816 if (pAMgr->get_compoundflag() &&
817 TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
818 if (info)
819 *info += SPELL_COMPOUND;
820 }
821 }
822 return NULL;
823 }
824
825 // he = next not needaffix, onlyincompound homonym or onlyupcase word
826 while (he && (he->astr) && pAMgr &&
827 ((pAMgr->get_needaffix() &&
828 TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) ||
829 (pAMgr->get_onlyincompound() &&
830 TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
831 (info && (*info & SPELL_INITCAP) &&
832 TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen))))
833 he = he->next_homonym;
834 }
835
836 // check with affixes
837 if (!he && pAMgr) {
838 // try stripping off affixes */
839 he = pAMgr->affix_check(word, len, 0);
840
841 // check compound restriction and onlyupcase
842 if (he && he->astr &&
843 ((pAMgr->get_onlyincompound() &&
844 TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
845 (info && (*info & SPELL_INITCAP) &&
846 TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) {
847 he = NULL;
848 }
849
850 if (he) {
851 if ((he->astr) && (pAMgr) &&
852 TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
853 if (info)
854 *info += SPELL_FORBIDDEN;
855 return NULL;
856 }
857 if (root) {
858 root->assign(he->word);
859 if (complexprefixes) {
860 if (utf8)
861 reverseword_utf(*root);
862 else
863 reverseword(*root);
864 }
865 }
866 // try check compound word
867 } else if (pAMgr->get_compound()) {
868 struct hentry* rwords[100]; // buffer for COMPOUND pattern checking
869 he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info);
870 // LANG_hu section: `moving rule' with last dash
871 if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) {
872 std::string dup(word, len - 1);
873 he = pAMgr->compound_check(dup, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0, info);
874 }
875 // end of LANG specific region
876 if (he) {
877 if (root) {
878 root->assign(he->word);
879 if (complexprefixes) {
880 if (utf8)
881 reverseword_utf(*root);
882 else
883 reverseword(*root);
884 }
885 }
886 if (info)
887 *info += SPELL_COMPOUND;
888 }
889 }
890 }
891
892 return he;
893 }
894
suggest(const std::string & word)895 std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
896 bool capwords;
897 size_t abbv;
898 int captype;
899 std::vector<std::string> slst = suggest_internal(word, capwords, abbv, captype);
900 // word reversing wrapper for complex prefixes
901 if (complexprefixes) {
902 for (size_t j = 0; j < slst.size(); ++j) {
903 if (utf8)
904 reverseword_utf(slst[j]);
905 else
906 reverseword(slst[j]);
907 }
908 }
909
910 // capitalize
911 if (capwords)
912 for (size_t j = 0; j < slst.size(); ++j) {
913 mkinitcap(slst[j]);
914 }
915
916 // expand suggestions with dot(s)
917 if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
918 for (size_t j = 0; j < slst.size(); ++j) {
919 slst[j].append(word.substr(word.size() - abbv));
920 }
921 }
922
923 // remove bad capitalized and forbidden forms
924 if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
925 switch (captype) {
926 case INITCAP:
927 case ALLCAP: {
928 size_t l = 0;
929 for (size_t j = 0; j < slst.size(); ++j) {
930 if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) {
931 std::string s;
932 std::vector<w_char> w;
933 if (utf8) {
934 u8_u16(w, slst[j]);
935 } else {
936 s = slst[j];
937 }
938 mkallsmall2(s, w);
939 if (spell(s)) {
940 slst[l] = s;
941 ++l;
942 } else {
943 mkinitcap2(s, w);
944 if (spell(s)) {
945 slst[l] = s;
946 ++l;
947 }
948 }
949 } else {
950 slst[l] = slst[j];
951 ++l;
952 }
953 }
954 slst.resize(l);
955 }
956 }
957 }
958
959 // remove duplications
960 size_t l = 0;
961 for (size_t j = 0; j < slst.size(); ++j) {
962 slst[l] = slst[j];
963 for (size_t k = 0; k < l; ++k) {
964 if (slst[k] == slst[j]) {
965 --l;
966 break;
967 }
968 }
969 ++l;
970 }
971 slst.resize(l);
972
973 // output conversion
974 RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
975 if (rl) {
976 for (size_t i = 0; rl && i < slst.size(); ++i) {
977 std::string wspace;
978 if (rl->conv(slst[i], wspace)) {
979 slst[i] = wspace;
980 }
981 }
982 }
983 return slst;
984 }
985
suggest_internal(const std::string & word,bool & capwords,size_t & abbv,int & captype)986 std::vector<std::string> HunspellImpl::suggest_internal(const std::string& word,
987 bool& capwords, size_t& abbv, int& captype) {
988 captype = NOCAP;
989 abbv = 0;
990 capwords = false;
991
992 std::vector<std::string> slst;
993
994 int onlycmpdsug = 0;
995 if (!pSMgr || m_HMgrs.empty())
996 return slst;
997
998 // process XML input of the simplified API (see manual)
999 if (word.compare(0, sizeof(SPELL_XML) - 3, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
1000 return spellml(word);
1001 }
1002 if (utf8) {
1003 if (word.size() >= MAXWORDUTF8LEN)
1004 return slst;
1005 } else {
1006 if (word.size() >= MAXWORDLEN)
1007 return slst;
1008 }
1009 size_t wl = 0;
1010
1011 std::string scw;
1012 std::vector<w_char> sunicw;
1013
1014 // input conversion
1015 RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
1016 {
1017 std::string wspace;
1018
1019 bool convstatus = rl ? rl->conv(word, wspace) : false;
1020 if (convstatus)
1021 wl = cleanword2(scw, sunicw, wspace, &captype, &abbv);
1022 else
1023 wl = cleanword2(scw, sunicw, word, &captype, &abbv);
1024
1025 if (wl == 0)
1026 return slst;
1027 }
1028
1029 bool good = false;
1030
1031 clock_t timelimit;
1032 // initialize in every suggestion call
1033 timelimit = clock();
1034
1035 // check capitalized form for FORCEUCASE
1036 if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
1037 int info = SPELL_ORIGCAP;
1038 if (checkword(scw, &info, NULL)) {
1039 std::string form(scw);
1040 mkinitcap(form);
1041 slst.push_back(form);
1042 return slst;
1043 }
1044 }
1045
1046 switch (captype) {
1047 case NOCAP: {
1048 good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
1049 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1050 return slst;
1051 if (abbv) {
1052 std::string wspace(scw);
1053 wspace.push_back('.');
1054 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1055 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1056 return slst;
1057 }
1058 break;
1059 }
1060
1061 case INITCAP: {
1062 capwords = true;
1063 good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
1064 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1065 return slst;
1066 std::string wspace(scw);
1067 mkallsmall2(wspace, sunicw);
1068 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1069 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1070 return slst;
1071 break;
1072 }
1073 case HUHINITCAP:
1074 capwords = true;
1075 /* FALLTHROUGH */
1076 case HUHCAP: {
1077 good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
1078 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1079 return slst;
1080 // something.The -> something. The
1081 size_t dot_pos = scw.find('.');
1082 if (dot_pos != std::string::npos) {
1083 std::string postdot = scw.substr(dot_pos + 1);
1084 int captype_;
1085 if (utf8) {
1086 std::vector<w_char> postdotu;
1087 u8_u16(postdotu, postdot);
1088 captype_ = get_captype_utf8(postdotu, langnum);
1089 } else {
1090 captype_ = get_captype(postdot, csconv);
1091 }
1092 if (captype_ == INITCAP) {
1093 std::string str(scw);
1094 str.insert(dot_pos + 1, 1, ' ');
1095 insert_sug(slst, str);
1096 }
1097 }
1098
1099 std::string wspace;
1100
1101 if (captype == HUHINITCAP) {
1102 // TheOpenOffice.org -> The OpenOffice.org
1103 wspace = scw;
1104 mkinitsmall2(wspace, sunicw);
1105 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1106 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1107 return slst;
1108 }
1109 wspace = scw;
1110 mkallsmall2(wspace, sunicw);
1111 if (spell(wspace.c_str()))
1112 insert_sug(slst, wspace);
1113 size_t prevns = slst.size();
1114 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1115 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1116 return slst;
1117 if (captype == HUHINITCAP) {
1118 mkinitcap2(wspace, sunicw);
1119 if (spell(wspace.c_str()))
1120 insert_sug(slst, wspace);
1121 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1122 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1123 return slst;
1124 }
1125 // aNew -> "a New" (instead of "a new")
1126 for (size_t j = prevns; j < slst.size(); ++j) {
1127 const char* space = strchr(slst[j].c_str(), ' ');
1128 if (space) {
1129 size_t slen = strlen(space + 1);
1130 // different case after space (need capitalisation)
1131 if ((slen < wl) && strcmp(scw.c_str() + wl - slen, space + 1)) {
1132 std::string first(slst[j].c_str(), space + 1);
1133 std::string second(space + 1);
1134 std::vector<w_char> w;
1135 if (utf8)
1136 u8_u16(w, second);
1137 mkinitcap2(second, w);
1138 // set as first suggestion
1139 slst.erase(slst.begin() + j);
1140 slst.insert(slst.begin(), first + second);
1141 }
1142 }
1143 }
1144 break;
1145 }
1146
1147 case ALLCAP: {
1148 std::string wspace(scw);
1149 mkallsmall2(wspace, sunicw);
1150 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1151 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1152 return slst;
1153 if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
1154 insert_sug(slst, wspace);
1155 mkinitcap2(wspace, sunicw);
1156 good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
1157 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1158 return slst;
1159 for (size_t j = 0; j < slst.size(); ++j) {
1160 mkallcap(slst[j]);
1161 if (pAMgr && pAMgr->get_checksharps()) {
1162 if (utf8) {
1163 mystrrep(slst[j], "\xC3\x9F", "SS");
1164 } else {
1165 mystrrep(slst[j], "\xDF", "SS");
1166 }
1167 }
1168 }
1169 break;
1170 }
1171 }
1172
1173 // LANG_hu section: replace '-' with ' ' in Hungarian
1174 if (langnum == LANG_hu) {
1175 for (size_t j = 0; j < slst.size(); ++j) {
1176 size_t pos = slst[j].find('-');
1177 if (pos != std::string::npos) {
1178 int info;
1179 std::string w(slst[j].substr(0, pos));
1180 w.append(slst[j].substr(pos + 1));
1181 (void)spell(w, &info, NULL);
1182 if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
1183 slst[j][pos] = ' ';
1184 } else
1185 slst[j][pos] = '-';
1186 }
1187 }
1188 }
1189 // END OF LANG_hu section
1190 // try ngram approach since found nothing good suggestion
1191 if (!good && pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) {
1192 switch (captype) {
1193 case NOCAP: {
1194 pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs, NOCAP);
1195 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1196 return slst;
1197 break;
1198 }
1199 /* FALLTHROUGH */
1200 case HUHINITCAP:
1201 capwords = true;
1202 /* FALLTHROUGH */
1203 case HUHCAP: {
1204 std::string wspace(scw);
1205 mkallsmall2(wspace, sunicw);
1206 pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, HUHCAP);
1207 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1208 return slst;
1209 break;
1210 }
1211 case INITCAP: {
1212 capwords = true;
1213 std::string wspace(scw);
1214 mkallsmall2(wspace, sunicw);
1215 pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, INITCAP);
1216 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1217 return slst;
1218 break;
1219 }
1220 case ALLCAP: {
1221 std::string wspace(scw);
1222 mkallsmall2(wspace, sunicw);
1223 size_t oldns = slst.size();
1224 pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, ALLCAP);
1225 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1226 return slst;
1227 for (size_t j = oldns; j < slst.size(); ++j) {
1228 mkallcap(slst[j]);
1229 }
1230 break;
1231 }
1232 }
1233 }
1234
1235 // try dash suggestion (Afo-American -> Afro-American)
1236 // Note: LibreOffice was modified to treat dashes as word
1237 // characters to check "scot-free" etc. word forms, but
1238 // we need to handle suggestions for "Afo-American", etc.,
1239 // while "Afro-American" is missing from the dictionary.
1240 // TODO avoid possible overgeneration
1241 size_t dash_pos = scw.find('-');
1242 if (dash_pos != std::string::npos) {
1243 int nodashsug = 1;
1244 for (size_t j = 0; j < slst.size() && nodashsug == 1; ++j) {
1245 if (slst[j].find('-') != std::string::npos)
1246 nodashsug = 0;
1247 }
1248
1249 size_t prev_pos = 0;
1250 bool last = false;
1251
1252 while (!good && nodashsug && !last) {
1253 if (dash_pos == scw.size())
1254 last = 1;
1255 std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos);
1256 if (!spell(chunk.c_str())) {
1257 std::vector<std::string> nlst = suggest(chunk.c_str());
1258 if (clock() > timelimit + TIMELIMIT_GLOBAL)
1259 return slst;
1260 for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) {
1261 std::string wspace = scw.substr(0, prev_pos);
1262 wspace.append(*j);
1263 if (!last) {
1264 wspace.append("-");
1265 wspace.append(scw.substr(dash_pos + 1));
1266 }
1267 int info = 0;
1268 if (pAMgr && pAMgr->get_forbiddenword())
1269 checkword(wspace, &info, NULL);
1270 if (!(info & SPELL_FORBIDDEN))
1271 insert_sug(slst, wspace);
1272 }
1273 nodashsug = 0;
1274 }
1275 if (!last) {
1276 prev_pos = dash_pos + 1;
1277 dash_pos = scw.find('-', prev_pos);
1278 }
1279 if (dash_pos == std::string::npos)
1280 dash_pos = scw.size();
1281 }
1282 }
1283 return slst;
1284 }
1285
get_dict_encoding() const1286 const std::string& HunspellImpl::get_dict_encoding() const {
1287 return encoding;
1288 }
1289
stem(const std::vector<std::string> & desc)1290 std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc) {
1291 std::vector<std::string> slst;
1292
1293 std::string result2;
1294 if (desc.empty())
1295 return slst;
1296 for (size_t i = 0; i < desc.size(); ++i) {
1297
1298 std::string result;
1299
1300 // add compound word parts (except the last one)
1301 const char* s = desc[i].c_str();
1302 const char* part = strstr(s, MORPH_PART);
1303 if (part) {
1304 const char* nextpart = strstr(part + 1, MORPH_PART);
1305 while (nextpart) {
1306 std::string field;
1307 copy_field(field, part, MORPH_PART);
1308 result.append(field);
1309 part = nextpart;
1310 nextpart = strstr(part + 1, MORPH_PART);
1311 }
1312 s = part;
1313 }
1314
1315 std::string tok(s);
1316 size_t alt = 0;
1317 while ((alt = tok.find(" | ", alt)) != std::string::npos) {
1318 tok[alt + 1] = MSEP_ALT;
1319 }
1320 std::vector<std::string> pl = line_tok(tok, MSEP_ALT);
1321 for (size_t k = 0; k < pl.size(); ++k) {
1322 // add derivational suffixes
1323 if (pl[k].find(MORPH_DERI_SFX) != std::string::npos) {
1324 // remove inflectional suffixes
1325 const size_t is = pl[k].find(MORPH_INFL_SFX);
1326 if (is != std::string::npos)
1327 pl[k].resize(is);
1328 std::vector<std::string> singlepl;
1329 singlepl.push_back(pl[k]);
1330 std::string sg = pSMgr->suggest_gen(singlepl, pl[k]);
1331 if (!sg.empty()) {
1332 std::vector<std::string> gen = line_tok(sg, MSEP_REC);
1333 for (size_t j = 0; j < gen.size(); ++j) {
1334 result2.push_back(MSEP_REC);
1335 result2.append(result);
1336 result2.append(gen[j]);
1337 }
1338 }
1339 } else {
1340 result2.push_back(MSEP_REC);
1341 result2.append(result);
1342 if (pl[k].find(MORPH_SURF_PFX) != std::string::npos) {
1343 std::string field;
1344 copy_field(field, pl[k], MORPH_SURF_PFX);
1345 result2.append(field);
1346 }
1347 std::string field;
1348 copy_field(field, pl[k], MORPH_STEM);
1349 result2.append(field);
1350 }
1351 }
1352 }
1353 slst = line_tok(result2, MSEP_REC);
1354 uniqlist(slst);
1355 return slst;
1356 }
1357
stem(const std::string & word)1358 std::vector<std::string> HunspellImpl::stem(const std::string& word) {
1359 return stem(analyze(word));
1360 }
1361
get_wordchars_cpp() const1362 const std::string& HunspellImpl::get_wordchars_cpp() const {
1363 return pAMgr->get_wordchars();
1364 }
1365
get_wordchars_utf16() const1366 const std::vector<w_char>& HunspellImpl::get_wordchars_utf16() const {
1367 return pAMgr->get_wordchars_utf16();
1368 }
1369
mkinitcap(std::string & u8)1370 void HunspellImpl::mkinitcap(std::string& u8) {
1371 if (utf8) {
1372 std::vector<w_char> u16;
1373 u8_u16(u16, u8);
1374 ::mkinitcap_utf(u16, langnum);
1375 u16_u8(u8, u16);
1376 } else {
1377 ::mkinitcap(u8, csconv);
1378 }
1379 }
1380
mkinitcap2(std::string & u8,std::vector<w_char> & u16)1381 int HunspellImpl::mkinitcap2(std::string& u8, std::vector<w_char>& u16) {
1382 if (utf8) {
1383 ::mkinitcap_utf(u16, langnum);
1384 u16_u8(u8, u16);
1385 } else {
1386 ::mkinitcap(u8, csconv);
1387 }
1388 return u8.size();
1389 }
1390
mkinitsmall2(std::string & u8,std::vector<w_char> & u16)1391 int HunspellImpl::mkinitsmall2(std::string& u8, std::vector<w_char>& u16) {
1392 if (utf8) {
1393 ::mkinitsmall_utf(u16, langnum);
1394 u16_u8(u8, u16);
1395 } else {
1396 ::mkinitsmall(u8, csconv);
1397 }
1398 return u8.size();
1399 }
1400
add(const std::string & word)1401 int HunspellImpl::add(const std::string& word) {
1402 if (!m_HMgrs.empty())
1403 return m_HMgrs[0]->add(word);
1404 return 0;
1405 }
1406
add_with_affix(const std::string & word,const std::string & example)1407 int HunspellImpl::add_with_affix(const std::string& word, const std::string& example) {
1408 if (!m_HMgrs.empty())
1409 return m_HMgrs[0]->add_with_affix(word, example);
1410 return 0;
1411 }
1412
remove(const std::string & word)1413 int HunspellImpl::remove(const std::string& word) {
1414 if (!m_HMgrs.empty())
1415 return m_HMgrs[0]->remove(word);
1416 return 0;
1417 }
1418
get_version_cpp() const1419 const std::string& HunspellImpl::get_version_cpp() const {
1420 return pAMgr->get_version();
1421 }
1422
get_csconv()1423 struct cs_info* HunspellImpl::get_csconv() {
1424 return csconv;
1425 }
1426
cat_result(std::string & result,const std::string & st)1427 void HunspellImpl::cat_result(std::string& result, const std::string& st) {
1428 if (!st.empty()) {
1429 if (!result.empty())
1430 result.append("\n");
1431 result.append(st);
1432 }
1433 }
1434
analyze(const std::string & word)1435 std::vector<std::string> HunspellImpl::analyze(const std::string& word) {
1436 std::vector<std::string> slst = analyze_internal(word);
1437 // output conversion
1438 RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
1439 if (rl) {
1440 for (size_t i = 0; rl && i < slst.size(); ++i) {
1441 std::string wspace;
1442 if (rl->conv(slst[i], wspace)) {
1443 slst[i] = wspace;
1444 }
1445 }
1446 }
1447 return slst;
1448 }
1449
// Morphological analysis without output conversion.
//
// Cleans `word` (after the optional input-conversion table), then asks
// pSMgr->suggest_morph() for the analysis of the capitalisation variants that
// match the detected captype. The newline-joined results are split with
// line_tok(..., MSEP_REC). For Hungarian (langnum == LANG_hu) there are two
// extra passes: one for numbers at the start of the word and one for words
// containing a dash.
//
// Returns an empty list when there is no suggestion manager / dictionary,
// when the word is too long, or when nothing could be analysed.
std::vector<std::string> HunspellImpl::analyze_internal(const std::string& word) {
  std::vector<std::string> slst;
  if (!pSMgr || m_HMgrs.empty())
    return slst;
  if (utf8) {
    if (word.size() >= MAXWORDUTF8LEN)
      return slst;
  } else {
    if (word.size() >= MAXWORDLEN)
      return slst;
  }
  int captype = NOCAP;
  size_t abbv = 0;
  size_t wl = 0;

  std::string scw;
  std::vector<w_char> sunicw;

  // input conversion
  RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
  {
    std::string wspace;

    bool convstatus = rl ? rl->conv(word, wspace) : false;
    if (convstatus)
      wl = cleanword2(scw, sunicw, wspace, &captype, &abbv);
    else
      wl = cleanword2(scw, sunicw, word, &captype, &abbv);
  }

  if (wl == 0) {
    if (abbv) {
      // nothing left after cleaning but cleanword2() reported `abbv`:
      // analyse a word made of `abbv` dots instead
      scw.clear();
      for (wl = 0; wl < abbv; wl++)
        scw.push_back('.');
      abbv = 0;
    } else
      return slst;
  }

  // newline-separated analysis lines, built up with cat_result()
  std::string result;

  size_t n = 0;
  // test numbers
  // LANG_hu section: set dash information for suggestions
  if (langnum == LANG_hu) {
    size_t n2 = 0;  // number of '.' / ',' separators seen
    size_t n3 = 0;  // position of the last separator

    // advance n over the leading run of digits and '.' / ',' separators
    // (a separator is not accepted at position 0)
    while ((n < wl) && (((scw[n] <= '9') && (scw[n] >= '0')) ||
                        (((scw[n] == '.') || (scw[n] == ',')) && (n > 0)))) {
      n++;
      if ((scw[n] == '.') || (scw[n] == ',')) {
        // stop when the first separator comes after more than 3 characters
        // or when two separators are adjacent
        if (((n2 == 0) && (n > 3)) ||
            ((n2 > 0) && ((scw[n - 1] == '.') || (scw[n - 1] == ','))))
          break;
        n2++;
        n3 = n;
      }
    }

    // whole word consumed but more than 3 characters follow the last
    // separator: no analysis
    if ((n == wl) && (n3 > 0) && (n - n3 > 3))
      return slst;
    // whole word consumed, or a number followed by '%' / '\xB0' whose
    // remainder is accepted by checkword()
    if ((n == wl) || ((n > 0) && ((scw[n] == '%') || (scw[n] == '\xB0')) &&
                      checkword(scw.substr(n), NULL, NULL))) {
      // keep everything before the last consumed character as plain text
      result.append(scw);
      result.resize(n - 1);
      if (n == wl)
        cat_result(result, pSMgr->suggest_morph(scw.substr(n - 1)));
      else {
        std::string chunk = scw.substr(n - 1, 1);
        cat_result(result, pSMgr->suggest_morph(chunk));
        result.push_back('+');  // XXX SPEC. MORPHCODE
        cat_result(result, pSMgr->suggest_morph(scw.substr(n)));
      }
      return line_tok(result, MSEP_REC);
    }
  }
  // END OF LANG_hu section

  switch (captype) {
    case HUHCAP:
    case HUHINITCAP:
    case NOCAP: {
      // analyse the word as written, plus the dotted form when abbv is set
      cat_result(result, pSMgr->suggest_morph(scw));
      if (abbv) {
        std::string u8buffer(scw);
        u8buffer.push_back('.');
        cat_result(result, pSMgr->suggest_morph(u8buffer));
      }
      break;
    }
    case INITCAP: {
      // u8buffer holds the all-lowercase form, scw the init-capitalised one
      mkallsmall2(scw, sunicw);
      std::string u8buffer(scw);
      mkinitcap2(scw, sunicw);
      cat_result(result, pSMgr->suggest_morph(u8buffer));
      cat_result(result, pSMgr->suggest_morph(scw));
      if (abbv) {
        u8buffer.push_back('.');
        cat_result(result, pSMgr->suggest_morph(u8buffer));

        u8buffer = scw;
        u8buffer.push_back('.');

        cat_result(result, pSMgr->suggest_morph(u8buffer));
      }
      break;
    }
    case ALLCAP: {
      // first the all-caps form as written ...
      cat_result(result, pSMgr->suggest_morph(scw));
      if (abbv) {
        std::string u8buffer(scw);
        u8buffer.push_back('.');
        cat_result(result, pSMgr->suggest_morph(u8buffer));
      }
      // ... then the all-lowercase (u8buffer) and init-capitalised (scw)
      // forms
      mkallsmall2(scw, sunicw);
      std::string u8buffer(scw);
      mkinitcap2(scw, sunicw);

      cat_result(result, pSMgr->suggest_morph(u8buffer));
      cat_result(result, pSMgr->suggest_morph(scw));
      if (abbv) {
        u8buffer.push_back('.');
        cat_result(result, pSMgr->suggest_morph(u8buffer));

        u8buffer = scw;
        u8buffer.push_back('.');

        cat_result(result, pSMgr->suggest_morph(u8buffer));
      }
      break;
    }
  }

  if (!result.empty()) {
    // word reversing wrapper for complex prefixes
    if (complexprefixes) {
      if (utf8)
        reverseword_utf(result);
      else
        reverseword(result);
    }
    return line_tok(result, MSEP_REC);
  }

  // compound word with dash (HU) I18n
  // LANG_hu section: set dash information for suggestions

  // only Hungarian gets the dash handling below; note that scw may have been
  // re-capitalised by the INITCAP / ALLCAP branches above
  size_t dash_pos = langnum == LANG_hu ? scw.find('-') : std::string::npos;
  if (dash_pos != std::string::npos) {
    int nresult = 0;

    std::string part1 = scw.substr(0, dash_pos);
    std::string part2 = scw.substr(dash_pos+1);

    // examine 2 sides of the dash
    if (part2.empty()) {  // base word ending with dash
      if (spell(part1)) {
        std::string p = pSMgr->suggest_morph(part1);
        if (!p.empty()) {
          slst = line_tok(p, MSEP_REC);
          return slst;
        }
      }
    } else if (part2.size() == 1 && part2[0] == 'e') {  // XXX (HU) -e hat.
      if (spell(part1) && (spell("-e"))) {
        std::string st = pSMgr->suggest_morph(part1);
        if (!st.empty()) {
          result.append(st);
        }
        result.push_back('+');  // XXX spec. separator in MORPHCODE
        st = pSMgr->suggest_morph("-e");
        if (!st.empty()) {
          result.append(st);
        }
        return line_tok(result, MSEP_REC);
      }
    } else {
      // first word ending with dash: word- XXX ???
      // part1 is spell-checked with a temporary trailing space, which is
      // removed again right after the call
      part1.push_back(' ');
      nresult = spell(part1);
      part1.erase(part1.size() - 1);
      if (nresult && spell(part2) &&
          ((part2.size() > 1) || ((part2[0] > '0') && (part2[0] < '9')))) {
        std::string st = pSMgr->suggest_morph(part1);
        if (!st.empty()) {
          result.append(st);
          result.push_back('+');  // XXX spec. separator in MORPHCODE
        }
        st = pSMgr->suggest_morph(part2);
        if (!st.empty()) {
          result.append(st);
        }
        return line_tok(result, MSEP_REC);
      }
    }
    // affixed number in correct word
    if (nresult && (dash_pos > 0) &&
        (((scw[dash_pos - 1] <= '9') && (scw[dash_pos - 1] >= '0')) ||
         (scw[dash_pos - 1] == '.'))) {
      // n counts characters to the left of the dash
      n = 1;
      if (scw[dash_pos - n] == '.')
        n++;
      // search first not a number character to left from dash
      while ((dash_pos >= n) && ((scw[dash_pos - n] == '0') || (n < 3)) &&
             (n < 6)) {
        n++;
      }
      // do not step past the start of the word
      if (dash_pos < n)
        n--;
      // numbers: valami1000000-hoz
      // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
      // 56-hoz, 6-hoz
      for (; n >= 1; n--) {
        // candidates must start with a digit
        if (scw[dash_pos - n] < '0' || scw[dash_pos - n] > '9') {
          continue;
        }
        std::string chunk = scw.substr(dash_pos - n);
        if (checkword(chunk, NULL, NULL)) {
          result.append(chunk);
          std::string st = pSMgr->suggest_morph(chunk);
          if (!st.empty()) {
            result.append(st);
          }
          return line_tok(result, MSEP_REC);
        }
      }
    }
  }
  return slst;
}
1682
generate(const std::string & word,const std::vector<std::string> & pl)1683 std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::vector<std::string>& pl) {
1684 std::vector<std::string> slst;
1685 if (!pSMgr || pl.empty())
1686 return slst;
1687 std::vector<std::string> pl2 = analyze(word);
1688 int captype = NOCAP;
1689 int abbv = 0;
1690 std::string cw;
1691 cleanword(cw, word, &captype, &abbv);
1692 std::string result;
1693
1694 for (size_t i = 0; i < pl.size(); ++i) {
1695 cat_result(result, pSMgr->suggest_gen(pl2, pl[i]));
1696 }
1697
1698 if (!result.empty()) {
1699 // allcap
1700 if (captype == ALLCAP)
1701 mkallcap(result);
1702
1703 // line split
1704 slst = line_tok(result, MSEP_REC);
1705
1706 // capitalize
1707 if (captype == INITCAP || captype == HUHINITCAP) {
1708 for (size_t j = 0; j < slst.size(); ++j) {
1709 mkinitcap(slst[j]);
1710 }
1711 }
1712
1713 // temporary filtering of prefix related errors (eg.
1714 // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
1715 std::vector<std::string>::iterator it = slst.begin();
1716 while (it != slst.end()) {
1717 if (!spell(*it)) {
1718 it = slst.erase(it);
1719 } else {
1720 ++it;
1721 }
1722 }
1723 }
1724 return slst;
1725 }
1726
generate(const std::string & word,const std::string & pattern)1727 std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::string& pattern) {
1728 std::vector<std::string> pl = analyze(pattern);
1729 std::vector<std::string> slst = generate(word, pl);
1730 uniqlist(slst);
1731 return slst;
1732 }
1733
1734 // minimal XML parser functions
get_xml_par(const char * par)1735 std::string HunspellImpl::get_xml_par(const char* par) {
1736 std::string dest;
1737 if (!par)
1738 return dest;
1739 char end = *par;
1740 if (end == '>')
1741 end = '<';
1742 else if (end != '\'' && end != '"')
1743 return dest; // bad XML
1744 for (par++; *par != '\0' && *par != end; ++par) {
1745 dest.push_back(*par);
1746 }
1747 mystrrep(dest, "<", "<");
1748 mystrrep(dest, "&", "&");
1749 return dest;
1750 }
1751
get_langnum() const1752 int HunspellImpl::get_langnum() const {
1753 return langnum;
1754 }
1755
input_conv(const std::string & word,std::string & dest)1756 bool HunspellImpl::input_conv(const std::string& word, std::string& dest) {
1757 RepList* rl = pAMgr ? pAMgr->get_iconvtable() : NULL;
1758 if (rl) {
1759 return rl->conv(word, dest);
1760 }
1761 dest.assign(word);
1762 return false;
1763 }
1764
1765 // return the beginning of the element (attr == NULL) or the attribute
get_xml_pos(const char * s,const char * attr)1766 const char* HunspellImpl::get_xml_pos(const char* s, const char* attr) {
1767 const char* end = strchr(s, '>');
1768 if (attr == NULL)
1769 return end;
1770 const char* p = s;
1771 while (1) {
1772 p = strstr(p, attr);
1773 if (!p || p >= end)
1774 return 0;
1775 if (*(p - 1) == ' ' || *(p - 1) == '\n')
1776 break;
1777 p += strlen(attr);
1778 }
1779 return p + strlen(attr);
1780 }
1781
check_xml_par(const char * q,const char * attr,const char * value)1782 int HunspellImpl::check_xml_par(const char* q,
1783 const char* attr,
1784 const char* value) {
1785 std::string cw = get_xml_par(get_xml_pos(q, attr));
1786 if (cw == value)
1787 return 1;
1788 return 0;
1789 }
1790
get_xml_list(const char * list,const char * tag)1791 std::vector<std::string> HunspellImpl::get_xml_list(const char* list, const char* tag) {
1792 std::vector<std::string> slst;
1793 if (!list)
1794 return slst;
1795 const char* p = list;
1796 for (size_t n = 0; ((p = strstr(p, tag)) != NULL); ++p, ++n) {
1797 std::string cw = get_xml_par(p + strlen(tag) - 1);
1798 if (cw.empty()) {
1799 break;
1800 }
1801 slst.push_back(cw);
1802 }
1803 return slst;
1804 }
1805
spellml(const std::string & in_word)1806 std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) {
1807 std::vector<std::string> slst;
1808
1809 const char* word = in_word.c_str();
1810
1811 const char* q = strstr(word, "<query");
1812 if (!q)
1813 return slst; // bad XML input
1814 const char* q2 = strchr(q, '>');
1815 if (!q2)
1816 return slst; // bad XML input
1817 q2 = strstr(q2, "<word");
1818 if (!q2)
1819 return slst; // bad XML input
1820 if (check_xml_par(q, "type=", "analyze")) {
1821 std::string cw = get_xml_par(strchr(q2, '>'));
1822 if (!cw.empty())
1823 slst = analyze(cw);
1824 if (slst.empty())
1825 return slst;
1826 // convert the result to <code><a>ana1</a><a>ana2</a></code> format
1827 std::string r;
1828 r.append("<code>");
1829 for (size_t i = 0; i < slst.size(); ++i) {
1830 r.append("<a>");
1831
1832 std::string entry(slst[i]);
1833 mystrrep(entry, "\t", " ");
1834 mystrrep(entry, "&", "&");
1835 mystrrep(entry, "<", "<");
1836 r.append(entry);
1837
1838 r.append("</a>");
1839 }
1840 r.append("</code>");
1841 slst.clear();
1842 slst.push_back(r);
1843 return slst;
1844 } else if (check_xml_par(q, "type=", "stem")) {
1845 std::string cw = get_xml_par(strchr(q2, '>'));
1846 if (!cw.empty())
1847 return stem(cw);
1848 } else if (check_xml_par(q, "type=", "generate")) {
1849 std::string cw = get_xml_par(strchr(q2, '>'));
1850 if (cw.empty())
1851 return slst;
1852 const char* q3 = strstr(q2 + 1, "<word");
1853 if (q3) {
1854 std::string cw2 = get_xml_par(strchr(q3, '>'));
1855 if (!cw2.empty()) {
1856 return generate(cw, cw2);
1857 }
1858 } else {
1859 if ((q2 = strstr(q2 + 1, "<code")) != NULL) {
1860 std::vector<std::string> slst2 = get_xml_list(strchr(q2, '>'), "<a>");
1861 if (!slst2.empty()) {
1862 slst = generate(cw, slst2);
1863 uniqlist(slst);
1864 return slst;
1865 }
1866 }
1867 }
1868 } else if (check_xml_par(q, "type=", "add")) {
1869 std::string cw = get_xml_par(strchr(q2, '>'));
1870 if (cw.empty())
1871 return slst;
1872 const char* q3 = strstr(q2 + 1, "<word");
1873 if (q3) {
1874 std::string cw2 = get_xml_par(strchr(q3, '>'));
1875 if (!cw2.empty()) {
1876 add_with_affix(cw, cw2);
1877 } else {
1878 add(cw);
1879 }
1880 } else {
1881 add(cw);
1882 }
1883 }
1884 return slst;
1885 }
1886
suffix_suggest(const std::string & root_word)1887 std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_word) {
1888 std::vector<std::string> slst;
1889 struct hentry* he = NULL;
1890 int len;
1891 std::string w2;
1892 const char* word;
1893 const char* ignoredchars = pAMgr->get_ignore();
1894 if (ignoredchars != NULL) {
1895 w2.assign(root_word);
1896 if (utf8) {
1897 const std::vector<w_char>& ignoredchars_utf16 =
1898 pAMgr->get_ignore_utf16();
1899 remove_ignored_chars_utf(w2, ignoredchars_utf16);
1900 } else {
1901 remove_ignored_chars(w2, ignoredchars);
1902 }
1903 word = w2.c_str();
1904 } else
1905 word = root_word.c_str();
1906
1907 len = strlen(word);
1908
1909 if (!len)
1910 return slst;
1911
1912 for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) {
1913 he = m_HMgrs[i]->lookup(word);
1914 }
1915 if (he) {
1916 slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str());
1917 }
1918 return slst;
1919 }
1920
1921 namespace {
munge_vector(char *** slst,const std::vector<std::string> & items)1922 int munge_vector(char*** slst, const std::vector<std::string>& items) {
1923 if (items.empty()) {
1924 *slst = NULL;
1925 return 0;
1926 } else {
1927 *slst = (char**)malloc(sizeof(char*) * items.size());
1928 if (!*slst)
1929 return 0;
1930 for (size_t i = 0; i < items.size(); ++i)
1931 (*slst)[i] = mystrdup(items[i].c_str());
1932 }
1933 return items.size();
1934 }
1935 }
1936
spell(const char * word,int * info,char ** root)1937 int HunspellImpl::spell(const char* word, int* info, char** root) {
1938 std::string sroot;
1939 bool ret = spell(word, info, root ? &sroot : NULL);
1940 if (root) {
1941 if (sroot.empty()) {
1942 *root = NULL;
1943 } else {
1944 *root = mystrdup(sroot.c_str());
1945 }
1946 }
1947 return ret;
1948 }
1949
suggest(char *** slst,const char * word)1950 int HunspellImpl::suggest(char*** slst, const char* word) {
1951 std::vector<std::string> suggests = suggest(word);
1952 return munge_vector(slst, suggests);
1953 }
1954
suffix_suggest(char *** slst,const char * root_word)1955 int HunspellImpl::suffix_suggest(char*** slst, const char* root_word) {
1956 std::vector<std::string> stems = suffix_suggest(root_word);
1957 return munge_vector(slst, stems);
1958 }
1959
free_list(char *** slst,int n)1960 void HunspellImpl::free_list(char*** slst, int n) {
1961 if (slst && *slst) {
1962 for (int i = 0; i < n; i++)
1963 free((*slst)[i]);
1964 free(*slst);
1965 *slst = NULL;
1966 }
1967 }
1968
get_dic_encoding()1969 char* HunspellImpl::get_dic_encoding() {
1970 return &encoding[0];
1971 }
1972
analyze(char *** slst,const char * word)1973 int HunspellImpl::analyze(char*** slst, const char* word) {
1974 std::vector<std::string> stems = analyze(word);
1975 return munge_vector(slst, stems);
1976 }
1977
stem(char *** slst,const char * word)1978 int HunspellImpl::stem(char*** slst, const char* word) {
1979 std::vector<std::string> stems = stem(word);
1980 return munge_vector(slst, stems);
1981 }
1982
stem(char *** slst,char ** desc,int n)1983 int HunspellImpl::stem(char*** slst, char** desc, int n) {
1984 std::vector<std::string> morph;
1985 for (int i = 0; i < n; ++i)
1986 morph.push_back(desc[i]);
1987
1988 std::vector<std::string> stems = stem(morph);
1989 return munge_vector(slst, stems);
1990 }
1991
generate(char *** slst,const char * word,const char * pattern)1992 int HunspellImpl::generate(char*** slst, const char* word, const char* pattern) {
1993 std::vector<std::string> stems = generate(word, pattern);
1994 return munge_vector(slst, stems);
1995 }
1996
generate(char *** slst,const char * word,char ** pl,int pln)1997 int HunspellImpl::generate(char*** slst, const char* word, char** pl, int pln) {
1998 std::vector<std::string> morph;
1999 for (int i = 0; i < pln; ++i)
2000 morph.push_back(pl[i]);
2001
2002 std::vector<std::string> stems = generate(word, morph);
2003 return munge_vector(slst, stems);
2004 }
2005
get_wordchars() const2006 const char* HunspellImpl::get_wordchars() const {
2007 return get_wordchars_cpp().c_str();
2008 }
2009
get_version() const2010 const char* HunspellImpl::get_version() const {
2011 return get_version_cpp().c_str();
2012 }
2013
input_conv(const char * word,char * dest,size_t destsize)2014 int HunspellImpl::input_conv(const char* word, char* dest, size_t destsize) {
2015 std::string d;
2016 bool ret = input_conv(word, d);
2017 if (ret && d.size() < destsize) {
2018 strncpy(dest, d.c_str(), destsize);
2019 return 1;
2020 }
2021 return 0;
2022 }
2023
// Construct the public facade: allocates the HunspellImpl that does the
// actual work, passing the three arguments through unchanged.
Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key)
  : m_Impl(new HunspellImpl(affpath, dpath, key)) {
}
2027
// Destroy the implementation object allocated by the constructor.
Hunspell::~Hunspell() {
  delete m_Impl;
}
2031
2032 // load extra dictionaries
add_dic(const char * dpath,const char * key)2033 int Hunspell::add_dic(const char* dpath, const char* key) {
2034 return m_Impl->add_dic(dpath, key);
2035 }
2036
spell(const std::string & word,int * info,std::string * root)2037 bool Hunspell::spell(const std::string& word, int* info, std::string* root) {
2038 return m_Impl->spell(word, info, root);
2039 }
2040
suggest(const std::string & word)2041 std::vector<std::string> Hunspell::suggest(const std::string& word) {
2042 return m_Impl->suggest(word);
2043 }
2044
suffix_suggest(const std::string & root_word)2045 std::vector<std::string> Hunspell::suffix_suggest(const std::string& root_word) {
2046 return m_Impl->suffix_suggest(root_word);
2047 }
2048
get_dict_encoding() const2049 const std::string& Hunspell::get_dict_encoding() const {
2050 return m_Impl->get_dict_encoding();
2051 }
2052
stem(const std::vector<std::string> & desc)2053 std::vector<std::string> Hunspell::stem(const std::vector<std::string>& desc) {
2054 return m_Impl->stem(desc);
2055 }
2056
stem(const std::string & word)2057 std::vector<std::string> Hunspell::stem(const std::string& word) {
2058 return m_Impl->stem(word);
2059 }
2060
get_wordchars_cpp() const2061 const std::string& Hunspell::get_wordchars_cpp() const {
2062 return m_Impl->get_wordchars_cpp();
2063 }
2064
get_wordchars_utf16() const2065 const std::vector<w_char>& Hunspell::get_wordchars_utf16() const {
2066 return m_Impl->get_wordchars_utf16();
2067 }
2068
add(const std::string & word)2069 int Hunspell::add(const std::string& word) {
2070 return m_Impl->add(word);
2071 }
2072
add_with_affix(const std::string & word,const std::string & example)2073 int Hunspell::add_with_affix(const std::string& word, const std::string& example) {
2074 return m_Impl->add_with_affix(word, example);
2075 }
2076
remove(const std::string & word)2077 int Hunspell::remove(const std::string& word) {
2078 return m_Impl->remove(word);
2079 }
2080
get_version_cpp() const2081 const std::string& Hunspell::get_version_cpp() const {
2082 return m_Impl->get_version_cpp();
2083 }
2084
get_csconv()2085 struct cs_info* Hunspell::get_csconv() {
2086 return m_Impl->get_csconv();
2087 }
2088
analyze(const std::string & word)2089 std::vector<std::string> Hunspell::analyze(const std::string& word) {
2090 return m_Impl->analyze(word);
2091 }
2092
generate(const std::string & word,const std::vector<std::string> & pl)2093 std::vector<std::string> Hunspell::generate(const std::string& word, const std::vector<std::string>& pl) {
2094 return m_Impl->generate(word, pl);
2095 }
2096
generate(const std::string & word,const std::string & pattern)2097 std::vector<std::string> Hunspell::generate(const std::string& word, const std::string& pattern) {
2098 return m_Impl->generate(word, pattern);
2099 }
2100
get_langnum() const2101 int Hunspell::get_langnum() const {
2102 return m_Impl->get_langnum();
2103 }
2104
input_conv(const std::string & word,std::string & dest)2105 bool Hunspell::input_conv(const std::string& word, std::string& dest) {
2106 return m_Impl->input_conv(word, dest);
2107 }
2108
spell(const char * word,int * info,char ** root)2109 int Hunspell::spell(const char* word, int* info, char** root) {
2110 return m_Impl->spell(word, info, root);
2111 }
2112
suggest(char *** slst,const char * word)2113 int Hunspell::suggest(char*** slst, const char* word) {
2114 return m_Impl->suggest(slst, word);
2115 }
2116
suffix_suggest(char *** slst,const char * root_word)2117 int Hunspell::suffix_suggest(char*** slst, const char* root_word) {
2118 return m_Impl->suffix_suggest(slst, root_word);
2119 }
2120
free_list(char *** slst,int n)2121 void Hunspell::free_list(char*** slst, int n) {
2122 m_Impl->free_list(slst, n);
2123 }
2124
get_dic_encoding()2125 char* Hunspell::get_dic_encoding() {
2126 return m_Impl->get_dic_encoding();
2127 }
2128
analyze(char *** slst,const char * word)2129 int Hunspell::analyze(char*** slst, const char* word) {
2130 return m_Impl->analyze(slst, word);
2131 }
2132
stem(char *** slst,const char * word)2133 int Hunspell::stem(char*** slst, const char* word) {
2134 return m_Impl->stem(slst, word);
2135 }
2136
stem(char *** slst,char ** desc,int n)2137 int Hunspell::stem(char*** slst, char** desc, int n) {
2138 return m_Impl->stem(slst, desc, n);
2139 }
2140
generate(char *** slst,const char * word,const char * pattern)2141 int Hunspell::generate(char*** slst, const char* word, const char* pattern) {
2142 return m_Impl->generate(slst, word, pattern);
2143 }
2144
generate(char *** slst,const char * word,char ** pl,int pln)2145 int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) {
2146 return m_Impl->generate(slst, word, pl, pln);
2147 }
2148
get_wordchars() const2149 const char* Hunspell::get_wordchars() const {
2150 return m_Impl->get_wordchars();
2151 }
2152
get_version() const2153 const char* Hunspell::get_version() const {
2154 return m_Impl->get_version();
2155 }
2156
input_conv(const char * word,char * dest,size_t destsize)2157 int Hunspell::input_conv(const char* word, char* dest, size_t destsize) {
2158 return m_Impl->input_conv(word, dest, destsize);
2159 }
2160
Hunspell_create(const char * affpath,const char * dpath)2161 Hunhandle* Hunspell_create(const char* affpath, const char* dpath) {
2162 return reinterpret_cast<Hunhandle*>(new HunspellImpl(affpath, dpath));
2163 }
2164
Hunspell_create_key(const char * affpath,const char * dpath,const char * key)2165 Hunhandle* Hunspell_create_key(const char* affpath,
2166 const char* dpath,
2167 const char* key) {
2168 return reinterpret_cast<Hunhandle*>(new HunspellImpl(affpath, dpath, key));
2169 }
2170
Hunspell_destroy(Hunhandle * pHunspell)2171 void Hunspell_destroy(Hunhandle* pHunspell) {
2172 delete reinterpret_cast<HunspellImpl*>(pHunspell);
2173 }
2174
Hunspell_add_dic(Hunhandle * pHunspell,const char * dpath)2175 int Hunspell_add_dic(Hunhandle* pHunspell, const char* dpath) {
2176 return reinterpret_cast<HunspellImpl*>(pHunspell)->add_dic(dpath);
2177 }
2178
Hunspell_spell(Hunhandle * pHunspell,const char * word)2179 int Hunspell_spell(Hunhandle* pHunspell, const char* word) {
2180 return reinterpret_cast<HunspellImpl*>(pHunspell)->spell(word);
2181 }
2182
Hunspell_get_dic_encoding(Hunhandle * pHunspell)2183 char* Hunspell_get_dic_encoding(Hunhandle* pHunspell) {
2184 return reinterpret_cast<HunspellImpl*>(pHunspell)->get_dic_encoding();
2185 }
2186
Hunspell_suggest(Hunhandle * pHunspell,char *** slst,const char * word)2187 int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) {
2188 return reinterpret_cast<HunspellImpl*>(pHunspell)->suggest(slst, word);
2189 }
2190
Hunspell_analyze(Hunhandle * pHunspell,char *** slst,const char * word)2191 int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) {
2192 return reinterpret_cast<HunspellImpl*>(pHunspell)->analyze(slst, word);
2193 }
2194
Hunspell_stem(Hunhandle * pHunspell,char *** slst,const char * word)2195 int Hunspell_stem(Hunhandle* pHunspell, char*** slst, const char* word) {
2196 return reinterpret_cast<HunspellImpl*>(pHunspell)->stem(slst, word);
2197 }
2198
Hunspell_stem2(Hunhandle * pHunspell,char *** slst,char ** desc,int n)2199 int Hunspell_stem2(Hunhandle* pHunspell, char*** slst, char** desc, int n) {
2200 return reinterpret_cast<HunspellImpl*>(pHunspell)->stem(slst, desc, n);
2201 }
2202
Hunspell_generate(Hunhandle * pHunspell,char *** slst,const char * word,const char * pattern)2203 int Hunspell_generate(Hunhandle* pHunspell,
2204 char*** slst,
2205 const char* word,
2206 const char* pattern)
2207 {
2208 return reinterpret_cast<HunspellImpl*>(pHunspell)->generate(slst, word, pattern);
2209 }
2210
Hunspell_generate2(Hunhandle * pHunspell,char *** slst,const char * word,char ** desc,int n)2211 int Hunspell_generate2(Hunhandle* pHunspell,
2212 char*** slst,
2213 const char* word,
2214 char** desc,
2215 int n)
2216 {
2217 return reinterpret_cast<HunspellImpl*>(pHunspell)->generate(slst, word, desc, n);
2218 }
2219
2220 /* functions for run-time modification of the dictionary */
2221
2222 /* add word to the run-time dictionary */
2223
Hunspell_add(Hunhandle * pHunspell,const char * word)2224 int Hunspell_add(Hunhandle* pHunspell, const char* word) {
2225 return reinterpret_cast<HunspellImpl*>(pHunspell)->add(word);
2226 }
2227
2228 /* add word to the run-time dictionary with affix flags of
2229 * the example (a dictionary word): Hunspell will recognize
2230 * affixed forms of the new word, too.
2231 */
2232
Hunspell_add_with_affix(Hunhandle * pHunspell,const char * word,const char * example)2233 int Hunspell_add_with_affix(Hunhandle* pHunspell,
2234 const char* word,
2235 const char* example) {
2236 return reinterpret_cast<HunspellImpl*>(pHunspell)->add_with_affix(word, example);
2237 }
2238
2239 /* remove word from the run-time dictionary */
2240
Hunspell_remove(Hunhandle * pHunspell,const char * word)2241 int Hunspell_remove(Hunhandle* pHunspell, const char* word) {
2242 return reinterpret_cast<HunspellImpl*>(pHunspell)->remove(word);
2243 }
2244
Hunspell_free_list(Hunhandle * pHunspell,char *** list,int n)2245 void Hunspell_free_list(Hunhandle* pHunspell, char*** list, int n) {
2246 reinterpret_cast<HunspellImpl*>(pHunspell)->free_list(list, n);
2247 }
2248