1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 /*
38  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39  * And Contributors.  All rights reserved.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  *
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  *
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  *
52  * 3. All modifications to the source code must be clearly marked as
53  *    such.  Binary redistributions based on modified source code
54  *    must be clearly marked as modified versions in the documentation
55  *    and/or other materials provided with the distribution.
56  *
57  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
61  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  */
70 
71 #include <stdlib.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <ctype.h>
75 #include <time.h>
76 
77 #include "suggestmgr.hxx"
78 #include "htypes.hxx"
79 #include "csutil.hxx"
80 
81 const w_char W_VLINE = {'\0', '|'};
82 
83 #define MAX_CHAR_DISTANCE 4
84 
SuggestMgr(const char * tryme,unsigned int maxn,AffixMgr * aptr)85 SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) {
86   // register affix manager and check in string of chars to
87   // try when building candidate suggestions
88   pAMgr = aptr;
89 
90   csconv = NULL;
91 
92   ckeyl = 0;
93   ckey = NULL;
94 
95   ctryl = 0;
96   ctry = NULL;
97 
98   utf8 = 0;
99   langnum = 0;
100   complexprefixes = 0;
101 
102   maxSug = maxn;
103   nosplitsugs = 0;
104   maxngramsugs = MAXNGRAMSUGS;
105   maxcpdsugs = MAXCOMPOUNDSUGS;
106 
107   if (pAMgr) {
108     langnum = pAMgr->get_langnum();
109     ckey = pAMgr->get_key_string();
110     nosplitsugs = pAMgr->get_nosplitsugs();
111     if (pAMgr->get_maxngramsugs() >= 0)
112       maxngramsugs = pAMgr->get_maxngramsugs();
113     utf8 = pAMgr->get_utf8();
114     if (pAMgr->get_maxcpdsugs() >= 0)
115       maxcpdsugs = pAMgr->get_maxcpdsugs();
116     if (!utf8) {
117       csconv = get_current_cs(pAMgr->get_encoding());
118     }
119     complexprefixes = pAMgr->get_complexprefixes();
120   }
121 
122   if (ckey) {
123     if (utf8) {
124       ckeyl = u8_u16(ckey_utf, ckey);
125     } else {
126       ckeyl = strlen(ckey);
127     }
128   }
129 
130   if (tryme) {
131     ctry = mystrdup(tryme);
132     if (ctry)
133       ctryl = strlen(ctry);
134     if (ctry && utf8) {
135       ctryl = u8_u16(ctry_utf, tryme);
136     }
137   }
138 
139   // language with possible dash usage
140   // (latin letters or dash in TRY characters)
141   lang_with_dash_usage = (ctry &&
142       ((strchr(ctry, '-') != NULL) || (strchr(ctry, 'a') != NULL)));
143 }
144 
~SuggestMgr()145 SuggestMgr::~SuggestMgr() {
146   pAMgr = NULL;
147   if (ckey)
148     free(ckey);
149   ckey = NULL;
150   ckeyl = 0;
151   if (ctry)
152     free(ctry);
153   ctry = NULL;
154   ctryl = 0;
155   maxSug = 0;
156 #ifdef MOZILLA_CLIENT
157   delete[] csconv;
158 #endif
159 }
160 
testsug(std::vector<std::string> & wlst,const std::string & candidate,int cpdsuggest,int * timer,clock_t * timelimit)161 void SuggestMgr::testsug(std::vector<std::string>& wlst,
162                         const std::string& candidate,
163                         int cpdsuggest,
164                         int* timer,
165                         clock_t* timelimit) {
166   int cwrd = 1;
167   if (wlst.size() == maxSug)
168     return;
169   for (size_t k = 0; k < wlst.size(); ++k) {
170     if (wlst[k] == candidate) {
171       cwrd = 0;
172       break;
173     }
174   }
175   if ((cwrd) && checkword(candidate, cpdsuggest, timer, timelimit)) {
176     wlst.push_back(candidate);
177   }
178 }
179 
180 /* generate suggestions for a misspelled word
181  *    pass in address of array of char * pointers
182  * onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
183  * return value: true, if there is a good suggestion
184  * (REP, ph: or a dictionary word pair)
185  */
suggest(std::vector<std::string> & slst,const char * w,int * onlycompoundsug)186 bool SuggestMgr::suggest(std::vector<std::string>& slst,
187                         const char* w,
188                         int* onlycompoundsug) {
189   int nocompoundtwowords = 0;
190   std::vector<w_char> word_utf;
191   int wl = 0;
192   size_t nsugorig = slst.size();
193   std::string w2;
194   const char* word = w;
195   size_t oldSug = 0;
196   bool good_suggestion = false;
197 
198   // word reversing wrapper for complex prefixes
199   if (complexprefixes) {
200     w2.assign(w);
201     if (utf8)
202       reverseword_utf(w2);
203     else
204       reverseword(w2);
205     word = w2.c_str();
206   }
207 
208   if (utf8) {
209     wl = u8_u16(word_utf, word);
210     if (wl == -1) {
211       return false;
212     }
213   }
214 
215   for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0) && !good_suggestion;
216        cpdsuggest++) {
217 
218     clock_t timelimit;
219     // initialize both in non-compound and compound cycles
220     timelimit = clock();
221 
222     // limit compound suggestion
223     if (cpdsuggest > 0)
224       oldSug = slst.size();
225 
226     // suggestions for an uppercase word (html -> HTML)
227     if (slst.size() < maxSug) {
228       size_t i = slst.size();
229       if (utf8)
230         capchars_utf(slst, &word_utf[0], wl, cpdsuggest);
231       else
232         capchars(slst, word, cpdsuggest);
233       if (slst.size() > i)
234         good_suggestion = true;
235     }
236 
237     // perhaps we made a typical fault of spelling
238     if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
239       size_t i = slst.size();
240       replchars(slst, word, cpdsuggest);
241       if (slst.size() > i)
242         good_suggestion = true;
243     }
244     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
245       return good_suggestion;
246 
247     // perhaps we made chose the wrong char from a related set
248     if ((slst.size() < maxSug) &&
249         (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
250       mapchars(slst, word, cpdsuggest);
251     }
252     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
253       return good_suggestion;
254 
255     // only suggest compound words when no other suggestion
256     if ((cpdsuggest == 0) && (slst.size() > nsugorig))
257       nocompoundtwowords = 1;
258 
259     // did we swap the order of chars by mistake
260     if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
261       if (utf8)
262         swapchar_utf(slst, &word_utf[0], wl, cpdsuggest);
263       else
264         swapchar(slst, word, cpdsuggest);
265     }
266     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
267       return good_suggestion;
268 
269     // did we swap the order of non adjacent chars by mistake
270     if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
271       if (utf8)
272         longswapchar_utf(slst, &word_utf[0], wl, cpdsuggest);
273       else
274         longswapchar(slst, word, cpdsuggest);
275     }
276     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
277       return good_suggestion;
278 
279     // did we just hit the wrong key in place of a good char (case and keyboard)
280     if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
281       if (utf8)
282         badcharkey_utf(slst, &word_utf[0], wl, cpdsuggest);
283       else
284         badcharkey(slst, word, cpdsuggest);
285     }
286     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
287       return good_suggestion;
288 
289     // did we add a char that should not be there
290     if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
291       if (utf8)
292         extrachar_utf(slst, &word_utf[0], wl, cpdsuggest);
293       else
294         extrachar(slst, word, cpdsuggest);
295     }
296     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
297       return good_suggestion;
298 
299     // did we forgot a char
300     if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
301       if (utf8)
302         forgotchar_utf(slst, &word_utf[0], wl, cpdsuggest);
303       else
304         forgotchar(slst, word, cpdsuggest);
305     }
306     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
307       return good_suggestion;
308 
309     // did we move a char
310     if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
311       if (utf8)
312         movechar_utf(slst, &word_utf[0], wl, cpdsuggest);
313       else
314         movechar(slst, word, cpdsuggest);
315     }
316     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
317       return good_suggestion;
318 
319     // did we just hit the wrong key in place of a good char
320     if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
321       if (utf8)
322         badchar_utf(slst, &word_utf[0], wl, cpdsuggest);
323       else
324         badchar(slst, word, cpdsuggest);
325     }
326     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
327       return good_suggestion;
328 
329     // did we double two characters
330     if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
331       if (utf8)
332         doubletwochars_utf(slst, &word_utf[0], wl, cpdsuggest);
333       else
334         doubletwochars(slst, word, cpdsuggest);
335     }
336     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
337       return good_suggestion;
338 
339     // perhaps we forgot to hit space and two words ran together
340     // (dictionary word pairs have top priority here, so
341     // we always suggest them, in despite of nosplitsugs, and
342     // drop compound word and other suggestions)
343     if (!cpdsuggest || (!nosplitsugs && slst.size() < oldSug + maxcpdsugs)) {
344       good_suggestion = twowords(slst, word, cpdsuggest, good_suggestion);
345     }
346     if (clock() > timelimit + TIMELIMIT_SUGGESTION)
347       return good_suggestion;
348 
349   }  // repeating ``for'' statement compounding support
350 
351   if (!nocompoundtwowords && (!slst.empty()) && onlycompoundsug)
352     *onlycompoundsug = 1;
353 
354   return good_suggestion;
355 }
356 
357 // suggestions for an uppercase word (html -> HTML)
capchars_utf(std::vector<std::string> & wlst,const w_char * word,int wl,int cpdsuggest)358 void SuggestMgr::capchars_utf(std::vector<std::string>& wlst,
359                               const w_char* word,
360                               int wl,
361                               int cpdsuggest) {
362   std::vector<w_char> candidate_utf(word, word + wl);
363   mkallcap_utf(candidate_utf, langnum);
364   std::string candidate;
365   u16_u8(candidate, candidate_utf);
366   testsug(wlst, candidate, cpdsuggest, NULL, NULL);
367 }
368 
369 // suggestions for an uppercase word (html -> HTML)
capchars(std::vector<std::string> & wlst,const char * word,int cpdsuggest)370 void SuggestMgr::capchars(std::vector<std::string>& wlst,
371                           const char* word,
372                           int cpdsuggest) {
373   std::string candidate(word);
374   mkallcap(candidate, csconv);
375   testsug(wlst, candidate, cpdsuggest, NULL, NULL);
376 }
377 
378 // suggestions for when chose the wrong char out of a related set
mapchars(std::vector<std::string> & wlst,const char * word,int cpdsuggest)379 int SuggestMgr::mapchars(std::vector<std::string>& wlst,
380                          const char* word,
381                          int cpdsuggest) {
382   std::string candidate;
383   clock_t timelimit;
384   int timer;
385 
386   int wl = strlen(word);
387   if (wl < 2 || !pAMgr)
388     return wlst.size();
389 
390   const std::vector<mapentry>& maptable = pAMgr->get_maptable();
391   if (maptable.empty())
392     return wlst.size();
393 
394   timelimit = clock();
395   timer = MINTIMER;
396   return map_related(word, candidate, 0, wlst, cpdsuggest,
397                      maptable, &timer, &timelimit);
398 }
399 
map_related(const char * word,std::string & candidate,int wn,std::vector<std::string> & wlst,int cpdsuggest,const std::vector<mapentry> & maptable,int * timer,clock_t * timelimit)400 int SuggestMgr::map_related(const char* word,
401                             std::string& candidate,
402                             int wn,
403                             std::vector<std::string>& wlst,
404                             int cpdsuggest,
405                             const std::vector<mapentry>& maptable,
406                             int* timer,
407                             clock_t* timelimit) {
408   if (*(word + wn) == '\0') {
409     int cwrd = 1;
410     for (size_t m = 0; m < wlst.size(); ++m) {
411       if (wlst[m] == candidate) {
412         cwrd = 0;
413         break;
414       }
415     }
416     if ((cwrd) && checkword(candidate, cpdsuggest, timer, timelimit)) {
417       if (wlst.size() < maxSug) {
418         wlst.push_back(candidate);
419       }
420     }
421     return wlst.size();
422   }
423   int in_map = 0;
424   for (size_t j = 0; j < maptable.size(); ++j) {
425     for (size_t k = 0; k < maptable[j].size(); ++k) {
426       size_t len = maptable[j][k].size();
427       if (strncmp(maptable[j][k].c_str(), word + wn, len) == 0) {
428         in_map = 1;
429         size_t cn = candidate.size();
430         for (size_t l = 0; l < maptable[j].size(); ++l) {
431           candidate.resize(cn);
432           candidate.append(maptable[j][l]);
433           map_related(word, candidate, wn + len, wlst,
434                            cpdsuggest, maptable, timer, timelimit);
435           if (!(*timer))
436             return wlst.size();
437         }
438       }
439     }
440   }
441   if (!in_map) {
442     candidate.push_back(*(word + wn));
443     map_related(word, candidate, wn + 1, wlst, cpdsuggest,
444                 maptable, timer, timelimit);
445   }
446   return wlst.size();
447 }
448 
449 // suggestions for a typical fault of spelling, that
450 // differs with more, than 1 letter from the right form.
replchars(std::vector<std::string> & wlst,const char * word,int cpdsuggest)451 int SuggestMgr::replchars(std::vector<std::string>& wlst,
452                           const char* word,
453                           int cpdsuggest) {
454   std::string candidate;
455   int wl = strlen(word);
456   if (wl < 2 || !pAMgr)
457     return wlst.size();
458   const std::vector<replentry>& reptable = pAMgr->get_reptable();
459   for (size_t i = 0; i < reptable.size(); ++i) {
460     const char* r = word;
461     // search every occurence of the pattern in the word
462     while ((r = strstr(r, reptable[i].pattern.c_str())) != NULL) {
463       int type = (r == word) ? 1 : 0;
464       if (r - word + reptable[i].pattern.size() == strlen(word))
465         type += 2;
466       while (type && reptable[i].outstrings[type].empty())
467         type = (type == 2 && r != word) ? 0 : type - 1;
468       const std::string&out = reptable[i].outstrings[type];
469       if (out.empty()) {
470         ++r;
471         continue;
472       }
473       candidate.assign(word);
474       candidate.resize(r - word);
475       candidate.append(reptable[i].outstrings[type]);
476       candidate.append(r + reptable[i].pattern.size());
477       testsug(wlst, candidate, cpdsuggest, NULL, NULL);
478       // check REP suggestions with space
479       size_t sp = candidate.find(' ');
480       if (sp != std::string::npos) {
481         size_t prev = 0;
482         while (sp != std::string::npos) {
483           std::string prev_chunk = candidate.substr(prev, sp - prev);
484           if (checkword(prev_chunk, 0, NULL, NULL)) {
485             size_t oldns = wlst.size();
486             std::string post_chunk = candidate.substr(sp + 1);
487             testsug(wlst, post_chunk, cpdsuggest, NULL, NULL);
488             if (oldns < wlst.size()) {
489               wlst[wlst.size() - 1] = candidate;
490             }
491           }
492           prev = sp + 1;
493           sp = candidate.find(' ', prev);
494         }
495       }
496       r++;  // search for the next letter
497     }
498   }
499   return wlst.size();
500 }
501 
502 // perhaps we doubled two characters
503 // (for example vacation -> vacacation)
504 // The recognized pattern with regex back-references:
505 // "(.)(.)\1\2\1" or "..(.)(.)\1\2"
506 
doubletwochars(std::vector<std::string> & wlst,const char * word,int cpdsuggest)507 int SuggestMgr::doubletwochars(std::vector<std::string>& wlst,
508                                const char* word,
509                                int cpdsuggest) {
510   int state = 0;
511   int wl = strlen(word);
512   if (wl < 5 || !pAMgr)
513     return wlst.size();
514   for (int i = 2; i < wl; i++) {
515     if (word[i] == word[i - 2]) {
516       state++;
517       if (state == 3 || (state == 2 && i >= 4)) {
518         std::string candidate(word, word + i - 1);
519         candidate.insert(candidate.end(), word + i + 1, word + wl);
520         testsug(wlst, candidate, cpdsuggest, NULL, NULL);
521         state = 0;
522       }
523     } else {
524       state = 0;
525     }
526   }
527   return wlst.size();
528 }
529 
530 // perhaps we doubled two characters
531 // (for example vacation -> vacacation)
532 // The recognized pattern with regex back-references:
533 // "(.)(.)\1\2\1" or "..(.)(.)\1\2"
534 
doubletwochars_utf(std::vector<std::string> & wlst,const w_char * word,int wl,int cpdsuggest)535 int SuggestMgr::doubletwochars_utf(std::vector<std::string>& wlst,
536                                    const w_char* word,
537                                    int wl,
538                                    int cpdsuggest) {
539   int state = 0;
540   if (wl < 5 || !pAMgr)
541     return wlst.size();
542   for (int i = 2; i < wl; i++) {
543     if (word[i] == word[i - 2]) {
544       state++;
545       if (state == 3 || (state == 2 && i >= 4)) {
546         std::vector<w_char> candidate_utf(word, word + i - 1);
547         candidate_utf.insert(candidate_utf.end(), word + i + 1, word + wl);
548         std::string candidate;
549         u16_u8(candidate, candidate_utf);
550         testsug(wlst, candidate, cpdsuggest, NULL, NULL);
551         state = 0;
552       }
553     } else {
554       state = 0;
555     }
556   }
557   return wlst.size();
558 }
559 
560 // error is wrong char in place of correct one (case and keyboard related
561 // version)
badcharkey(std::vector<std::string> & wlst,const char * word,int cpdsuggest)562 int SuggestMgr::badcharkey(std::vector<std::string>& wlst,
563                            const char* word,
564                            int cpdsuggest) {
565   std::string candidate(word);
566 
567   // swap out each char one by one and try uppercase and neighbor
568   // keyboard chars in its place to see if that makes a good word
569   for (size_t i = 0; i < candidate.size(); ++i) {
570     char tmpc = candidate[i];
571     // check with uppercase letters
572     candidate[i] = csconv[((unsigned char)tmpc)].cupper;
573     if (tmpc != candidate[i]) {
574       testsug(wlst, candidate, cpdsuggest, NULL, NULL);
575       candidate[i] = tmpc;
576     }
577     // check neighbor characters in keyboard string
578     if (!ckey)
579       continue;
580     char* loc = strchr(ckey, tmpc);
581     while (loc) {
582       if ((loc > ckey) && (*(loc - 1) != '|')) {
583         candidate[i] = *(loc - 1);
584         testsug(wlst, candidate, cpdsuggest, NULL, NULL);
585       }
586       if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) {
587         candidate[i] = *(loc + 1);
588         testsug(wlst, candidate, cpdsuggest, NULL, NULL);
589       }
590       loc = strchr(loc + 1, tmpc);
591     }
592     candidate[i] = tmpc;
593   }
594   return wlst.size();
595 }
596 
597 // error is wrong char in place of correct one (case and keyboard related
598 // version)
badcharkey_utf(std::vector<std::string> & wlst,const w_char * word,int wl,int cpdsuggest)599 int SuggestMgr::badcharkey_utf(std::vector<std::string>& wlst,
600                                const w_char* word,
601                                int wl,
602                                int cpdsuggest) {
603   std::string candidate;
604   std::vector<w_char> candidate_utf(word, word + wl);
605   // swap out each char one by one and try all the tryme
606   // chars in its place to see if that makes a good word
607   for (int i = 0; i < wl; i++) {
608     w_char tmpc = candidate_utf[i];
609     // check with uppercase letters
610     candidate_utf[i] = upper_utf(candidate_utf[i], 1);
611     if (tmpc != candidate_utf[i]) {
612       u16_u8(candidate, candidate_utf);
613       testsug(wlst, candidate, cpdsuggest, NULL, NULL);
614       candidate_utf[i] = tmpc;
615     }
616     // check neighbor characters in keyboard string
617     if (!ckey)
618       continue;
619     size_t loc = 0;
620     while ((loc < ckeyl) && ckey_utf[loc] != tmpc)
621       ++loc;
622     while (loc < ckeyl) {
623       if ((loc > 0) && ckey_utf[loc - 1] != W_VLINE) {
624         candidate_utf[i] = ckey_utf[loc - 1];
625         u16_u8(candidate, candidate_utf);
626         testsug(wlst, candidate, cpdsuggest, NULL, NULL);
627       }
628       if (((loc + 1) < ckeyl) && (ckey_utf[loc + 1] != W_VLINE)) {
629         candidate_utf[i] = ckey_utf[loc + 1];
630         u16_u8(candidate, candidate_utf);
631         testsug(wlst, candidate, cpdsuggest, NULL, NULL);
632       }
633       do {
634         loc++;
635       } while ((loc < ckeyl) && ckey_utf[loc] != tmpc);
636     }
637     candidate_utf[i] = tmpc;
638   }
639   return wlst.size();
640 }
641 
642 // error is wrong char in place of correct one
badchar(std::vector<std::string> & wlst,const char * word,int cpdsuggest)643 int SuggestMgr::badchar(std::vector<std::string>& wlst, const char* word, int cpdsuggest) {
644   std::string candidate(word);
645   clock_t timelimit = clock();
646   int timer = MINTIMER;
647   // swap out each char one by one and try all the tryme
648   // chars in its place to see if that makes a good word
649   for (size_t j = 0; j < ctryl; ++j) {
650     for (std::string::reverse_iterator aI = candidate.rbegin(), aEnd = candidate.rend(); aI != aEnd; ++aI) {
651       char tmpc = *aI;
652       if (ctry[j] == tmpc)
653         continue;
654       *aI = ctry[j];
655       testsug(wlst, candidate, cpdsuggest, &timer, &timelimit);
656       if (!timer)
657         return wlst.size();
658       *aI = tmpc;
659     }
660   }
661   return wlst.size();
662 }
663 
664 // error is wrong char in place of correct one
badchar_utf(std::vector<std::string> & wlst,const w_char * word,int wl,int cpdsuggest)665 int SuggestMgr::badchar_utf(std::vector<std::string>& wlst,
666                             const w_char* word,
667                             int wl,
668                             int cpdsuggest) {
669   std::vector<w_char> candidate_utf(word, word + wl);
670   std::string candidate;
671   clock_t timelimit = clock();
672   int timer = MINTIMER;
673   // swap out each char one by one and try all the tryme
674   // chars in its place to see if that makes a good word
675   for (size_t j = 0; j < ctryl; ++j) {
676     for (int i = wl - 1; i >= 0; i--) {
677       w_char tmpc = candidate_utf[i];
678       if (tmpc == ctry_utf[j])
679         continue;
680       candidate_utf[i] = ctry_utf[j];
681       u16_u8(candidate, candidate_utf);
682       testsug(wlst, candidate, cpdsuggest, &timer, &timelimit);
683       if (!timer)
684         return wlst.size();
685       candidate_utf[i] = tmpc;
686     }
687   }
688   return wlst.size();
689 }
690 
691 // error is word has an extra letter it does not need
extrachar_utf(std::vector<std::string> & wlst,const w_char * word,int wl,int cpdsuggest)692 int SuggestMgr::extrachar_utf(std::vector<std::string>& wlst,
693                               const w_char* word,
694                               int wl,
695                               int cpdsuggest) {
696   std::vector<w_char> candidate_utf(word, word + wl);
697   if (candidate_utf.size() < 2)
698     return wlst.size();
699   // try omitting one char of word at a time
700   for (size_t i = 0; i < candidate_utf.size(); ++i) {
701     size_t index = candidate_utf.size() - 1 - i;
702     w_char tmpc = candidate_utf[index];
703     candidate_utf.erase(candidate_utf.begin() + index);
704     std::string candidate;
705     u16_u8(candidate, candidate_utf);
706     testsug(wlst, candidate, cpdsuggest, NULL, NULL);
707     candidate_utf.insert(candidate_utf.begin() + index, tmpc);
708   }
709   return wlst.size();
710 }
711 
712 // error is word has an extra letter it does not need
extrachar(std::vector<std::string> & wlst,const char * word,int cpdsuggest)713 int SuggestMgr::extrachar(std::vector<std::string>& wlst,
714                           const char* word,
715                           int cpdsuggest) {
716   std::string candidate(word);
717   if (candidate.size() < 2)
718     return wlst.size();
719   // try omitting one char of word at a time
720   for (size_t i = 0; i < candidate.size(); ++i) {
721     size_t index = candidate.size() - 1 - i;
722     char tmpc = candidate[index];
723     candidate.erase(candidate.begin() + index);
724     testsug(wlst, candidate, cpdsuggest, NULL, NULL);
725     candidate.insert(candidate.begin() + index, tmpc);
726   }
727   return wlst.size();
728 }
729 
730 // error is missing a letter it needs
forgotchar(std::vector<std::string> & wlst,const char * word,int cpdsuggest)731 int SuggestMgr::forgotchar(std::vector<std::string>& wlst,
732                            const char* word,
733                            int cpdsuggest) {
734   std::string candidate(word);
735   clock_t timelimit = clock();
736   int timer = MINTIMER;
737 
738   // try inserting a tryme character before every letter (and the null
739   // terminator)
740   for (size_t k = 0; k < ctryl; ++k) {
741     for (size_t i = 0; i <= candidate.size(); ++i) {
742       size_t index = candidate.size() - i;
743       candidate.insert(candidate.begin() + index, ctry[k]);
744       testsug(wlst, candidate, cpdsuggest, &timer, &timelimit);
745       if (!timer)
746         return wlst.size();
747       candidate.erase(candidate.begin() + index);
748     }
749   }
750   return wlst.size();
751 }
752 
753 // error is missing a letter it needs
forgotchar_utf(std::vector<std::string> & wlst,const w_char * word,int wl,int cpdsuggest)754 int SuggestMgr::forgotchar_utf(std::vector<std::string>& wlst,
755                                const w_char* word,
756                                int wl,
757                                int cpdsuggest) {
758   std::vector<w_char> candidate_utf(word, word + wl);
759   clock_t timelimit = clock();
760   int timer = MINTIMER;
761 
762   // try inserting a tryme character at the end of the word and before every
763   // letter
764   for (size_t k = 0; k < ctryl; ++k) {
765     for (size_t i = 0; i <= candidate_utf.size(); ++i) {
766       size_t index = candidate_utf.size() - i;
767       candidate_utf.insert(candidate_utf.begin() + index, ctry_utf[k]);
768       std::string candidate;
769       u16_u8(candidate, candidate_utf);
770       testsug(wlst, candidate, cpdsuggest, &timer, &timelimit);
771       if (!timer)
772         return wlst.size();
773       candidate_utf.erase(candidate_utf.begin() + index);
774     }
775   }
776   return wlst.size();
777 }
778 
779 /* error is should have been two words
780  * return value is true, if there is a dictionary word pair,
781  * or there was already a good suggestion before calling
782  * this function.
783  */
twowords(std::vector<std::string> & wlst,const char * word,int cpdsuggest,bool good)784 bool SuggestMgr::twowords(std::vector<std::string>& wlst,
785                          const char* word,
786                          int cpdsuggest,
787                          bool good) {
788   int c2;
789   int forbidden = 0;
790   int cwrd;
791 
792   int wl = strlen(word);
793   if (wl < 3)
794     return false;
795 
796   if (langnum == LANG_hu)
797     forbidden = check_forbidden(word, wl);
798 
799   char* candidate = (char*)malloc(wl + 2);
800   strcpy(candidate + 1, word);
801 
802   // split the string into two pieces after every char
803   // if both pieces are good words make them a suggestion
804   for (char* p = candidate + 1; p[1] != '\0'; p++) {
805     p[-1] = *p;
806     // go to end of the UTF-8 character
807     while (utf8 && ((p[1] & 0xc0) == 0x80)) {
808       *p = p[1];
809       p++;
810     }
811     if (utf8 && p[1] == '\0')
812       break;  // last UTF-8 character
813 
814     // Suggest only word pairs, if they are listed in the dictionary.
815     // For example, adding "a lot" to the English dic file will
816     // result only "alot" -> "a lot" suggestion instead of
817     // "alto, slot, alt, lot, allot, aloft, aloe, clot, plot, blot, a lot".
818     // Note: using "ph:alot" keeps the other suggestions:
819     // a lot ph:alot
820     // alot -> a lot, alto, slot...
821     *p = ' ';
822     if (!cpdsuggest && checkword(candidate, cpdsuggest, NULL, NULL)) {
823       // remove not word pair suggestions
824       if (!good) {
825         good = true;
826         wlst.clear();
827       }
828       wlst.insert(wlst.begin(), candidate);
829     }
830 
831     // word pairs with dash?
832     if (lang_with_dash_usage) {
833       *p = '-';
834 
835       if (!cpdsuggest && checkword(candidate, cpdsuggest, NULL, NULL)) {
836         // remove not word pair suggestions
837         if (!good) {
838           good = true;
839           wlst.clear();
840         }
841         wlst.insert(wlst.begin(), candidate);
842       }
843     }
844 
845     if (wlst.size() < maxSug && !nosplitsugs && !good) {
846       *p = '\0';
847       int c1 = checkword(candidate, cpdsuggest, NULL, NULL);
848       if (c1) {
849         c2 = checkword((p + 1), cpdsuggest, NULL, NULL);
850         if (c2) {
851           // spec. Hungarian code (TODO need a better compound word support)
852           if ((langnum == LANG_hu) && !forbidden &&
853               // if 3 repeating letter, use - instead of space
854               (((p[-1] == p[1]) &&
855               (((p > candidate + 1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
856               // or multiple compounding, with more, than 6 syllables
857               ((c1 == 3) && (c2 >= 2))))
858             *p = '-';
859           else
860             *p = ' ';
861 
862           cwrd = 1;
863           for (size_t k = 0; k < wlst.size(); ++k) {
864             if (wlst[k] == candidate) {
865               cwrd = 0;
866               break;
867             }
868           }
869 
870           if (cwrd && (wlst.size() < maxSug))
871               wlst.push_back(candidate);
872 
873           // add two word suggestion with dash, depending on the language
874           // Note that cwrd doesn't modified for REP twoword sugg.
875           if ( !nosplitsugs && lang_with_dash_usage &&
876               mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) {
877             *p = '-';
878             for (size_t k = 0; k < wlst.size(); ++k) {
879               if (wlst[k] == candidate) {
880                 cwrd = 0;
881                 break;
882               }
883             }
884 
885             if ((wlst.size() < maxSug) && cwrd)
886               wlst.push_back(candidate);
887           }
888         }
889       }
890     }
891   }
892   free(candidate);
893   return good;
894 }
895 
896 // error is adjacent letter were swapped
swapchar(std::vector<std::string> & wlst,const char * word,int cpdsuggest)897 int SuggestMgr::swapchar(std::vector<std::string>& wlst,
898                          const char* word,
899                          int cpdsuggest) {
900   std::string candidate(word);
901   if (candidate.size() < 2)
902     return wlst.size();
903 
904   // try swapping adjacent chars one by one
905   for (size_t i = 0; i < candidate.size() - 1; ++i) {
906     std::swap(candidate[i], candidate[i+1]);
907     testsug(wlst, candidate, cpdsuggest, NULL, NULL);
908     std::swap(candidate[i], candidate[i+1]);
909   }
910 
911   // try double swaps for short words
912   // ahev -> have, owudl -> would
913   if (candidate.size() == 4 || candidate.size() == 5) {
914     candidate[0] = word[1];
915     candidate[1] = word[0];
916     candidate[2] = word[2];
917     candidate[candidate.size() - 2] = word[candidate.size() - 1];
918     candidate[candidate.size() - 1] = word[candidate.size() - 2];
919     testsug(wlst, candidate, cpdsuggest, NULL, NULL);
920     if (candidate.size() == 5) {
921       candidate[0] = word[0];
922       candidate[1] = word[2];
923       candidate[2] = word[1];
924       testsug(wlst, candidate, cpdsuggest, NULL, NULL);
925     }
926   }
927 
928   return wlst.size();
929 }
930 
931 // error is adjacent letter were swapped
swapchar_utf(std::vector<std::string> & wlst,const w_char * word,int wl,int cpdsuggest)932 int SuggestMgr::swapchar_utf(std::vector<std::string>& wlst,
933                              const w_char* word,
934                              int wl,
935                              int cpdsuggest) {
936   std::vector<w_char> candidate_utf(word, word + wl);
937   if (candidate_utf.size() < 2)
938     return wlst.size();
939 
940   std::string candidate;
941   // try swapping adjacent chars one by one
942   for (size_t i = 0; i < candidate_utf.size() - 1; ++i) {
943     std::swap(candidate_utf[i], candidate_utf[i+1]);
944     u16_u8(candidate, candidate_utf);
945     testsug(wlst, candidate, cpdsuggest, NULL, NULL);
946     std::swap(candidate_utf[i], candidate_utf[i+1]);
947   }
948 
949   // try double swaps for short words
950   // ahev -> have, owudl -> would, suodn -> sound
951   if (candidate_utf.size() == 4 || candidate_utf.size() == 5) {
952     candidate_utf[0] = word[1];
953     candidate_utf[1] = word[0];
954     candidate_utf[2] = word[2];
955     candidate_utf[candidate_utf.size() - 2] = word[candidate_utf.size() - 1];
956     candidate_utf[candidate_utf.size() - 1] = word[candidate_utf.size() - 2];
957     u16_u8(candidate, candidate_utf);
958     testsug(wlst, candidate, cpdsuggest, NULL, NULL);
959     if (candidate_utf.size() == 5) {
960       candidate_utf[0] = word[0];
961       candidate_utf[1] = word[2];
962       candidate_utf[2] = word[1];
963       u16_u8(candidate, candidate_utf);
964       testsug(wlst, candidate, cpdsuggest, NULL, NULL);
965     }
966   }
967   return wlst.size();
968 }
969 
970 // error is not adjacent letter were swapped
longswapchar(std::vector<std::string> & wlst,const char * word,int cpdsuggest)971 int SuggestMgr::longswapchar(std::vector<std::string>& wlst,
972                              const char* word,
973                              int cpdsuggest) {
974   std::string candidate(word);
975   // try swapping not adjacent chars one by one
976   for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) {
977     for (std::string::iterator q = candidate.begin(); q < candidate.end(); ++q) {
978       size_t distance = std::abs(std::distance(q, p));
979       if (distance > 1 && distance <= MAX_CHAR_DISTANCE) {
980         std::swap(*p, *q);
981         testsug(wlst, candidate, cpdsuggest, NULL, NULL);
982         std::swap(*p, *q);
983       }
984     }
985   }
986   return wlst.size();
987 }
988 
989 // error is adjacent letter were swapped
longswapchar_utf(std::vector<std::string> & wlst,const w_char * word,int wl,int cpdsuggest)990 int SuggestMgr::longswapchar_utf(std::vector<std::string>& wlst,
991                                  const w_char* word,
992                                  int wl,
993                                  int cpdsuggest) {
994   std::vector<w_char> candidate_utf(word, word + wl);
995   // try swapping not adjacent chars
996   for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) {
997     for (std::vector<w_char>::iterator q = candidate_utf.begin(); q < candidate_utf.end(); ++q) {
998       size_t distance = std::abs(std::distance(q, p));
999       if (distance > 1 && distance <= MAX_CHAR_DISTANCE) {
1000         std::swap(*p, *q);
1001         std::string candidate;
1002         u16_u8(candidate, candidate_utf);
1003         testsug(wlst, candidate, cpdsuggest, NULL, NULL);
1004         std::swap(*p, *q);
1005       }
1006     }
1007   }
1008   return wlst.size();
1009 }
1010 
1011 // error is a letter was moved
movechar(std::vector<std::string> & wlst,const char * word,int cpdsuggest)1012 int SuggestMgr::movechar(std::vector<std::string>& wlst,
1013                          const char* word,
1014                          int cpdsuggest) {
1015   std::string candidate(word);
1016   if (candidate.size() < 2)
1017     return wlst.size();
1018 
1019   // try moving a char
1020   for (std::string::iterator p = candidate.begin(); p < candidate.end(); ++p) {
1021     for (std::string::iterator q = p + 1; q < candidate.end() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
1022       std::swap(*q, *(q - 1));
1023       if (std::distance(p, q) < 2)
1024         continue;  // omit swap char
1025       testsug(wlst, candidate, cpdsuggest, NULL, NULL);
1026     }
1027     std::copy(word, word + candidate.size(), candidate.begin());
1028   }
1029 
1030   for (std::string::reverse_iterator p = candidate.rbegin(), pEnd = candidate.rend() - 1; p != pEnd; ++p) {
1031     for (std::string::reverse_iterator q = p + 1, qEnd = candidate.rend(); q != qEnd && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
1032       std::swap(*q, *(q - 1));
1033       if (std::distance(p, q) < 2)
1034         continue;  // omit swap char
1035       testsug(wlst, candidate, cpdsuggest, NULL, NULL);
1036     }
1037     std::copy(word, word + candidate.size(), candidate.begin());
1038   }
1039 
1040   return wlst.size();
1041 }
1042 
1043 // error is a letter was moved
movechar_utf(std::vector<std::string> & wlst,const w_char * word,int wl,int cpdsuggest)1044 int SuggestMgr::movechar_utf(std::vector<std::string>& wlst,
1045                              const w_char* word,
1046                              int wl,
1047                              int cpdsuggest) {
1048   std::vector<w_char> candidate_utf(word, word + wl);
1049   if (candidate_utf.size() < 2)
1050     return wlst.size();
1051 
1052   // try moving a char
1053   for (std::vector<w_char>::iterator p = candidate_utf.begin(); p < candidate_utf.end(); ++p) {
1054     for (std::vector<w_char>::iterator q = p + 1; q < candidate_utf.end() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
1055       std::swap(*q, *(q - 1));
1056       if (std::distance(p, q) < 2)
1057         continue;  // omit swap char
1058       std::string candidate;
1059       u16_u8(candidate, candidate_utf);
1060       testsug(wlst, candidate, cpdsuggest, NULL, NULL);
1061     }
1062     std::copy(word, word + candidate_utf.size(), candidate_utf.begin());
1063   }
1064 
1065   for (std::vector<w_char>::reverse_iterator p = candidate_utf.rbegin(); p < candidate_utf.rend(); ++p) {
1066     for (std::vector<w_char>::reverse_iterator q = p + 1; q < candidate_utf.rend() && std::distance(p, q) <= MAX_CHAR_DISTANCE; ++q) {
1067       std::swap(*q, *(q - 1));
1068       if (std::distance(p, q) < 2)
1069         continue;  // omit swap char
1070       std::string candidate;
1071       u16_u8(candidate, candidate_utf);
1072       testsug(wlst, candidate, cpdsuggest, NULL, NULL);
1073     }
1074     std::copy(word, word + candidate_utf.size(), candidate_utf.begin());
1075   }
1076 
1077   return wlst.size();
1078 }
1079 
1080 // generate a set of suggestions for very poorly spelled words
ngsuggest(std::vector<std::string> & wlst,const char * w,const std::vector<HashMgr * > & rHMgr,int captype)1081 void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
1082                           const char* w,
1083                           const std::vector<HashMgr*>& rHMgr,
1084                           int captype) {
1085   int lval;
1086   int sc;
1087   int lp, lpphon;
1088   int nonbmp = 0;
1089 
1090   // exhaustively search through all root words
1091   // keeping track of the MAX_ROOTS most similar root words
1092   struct hentry* roots[MAX_ROOTS];
1093   char* rootsphon[MAX_ROOTS];
1094   int scores[MAX_ROOTS];
1095   int scoresphon[MAX_ROOTS];
1096   for (int i = 0; i < MAX_ROOTS; i++) {
1097     roots[i] = NULL;
1098     scores[i] = -100 * i;
1099     rootsphon[i] = NULL;
1100     scoresphon[i] = -100 * i;
1101   }
1102   lp = MAX_ROOTS - 1;
1103   lpphon = MAX_ROOTS - 1;
1104   int low = NGRAM_LOWERING;
1105 
1106   std::string w2;
1107   const char* word = w;
1108 
1109   // word reversing wrapper for complex prefixes
1110   if (complexprefixes) {
1111     w2.assign(w);
1112     if (utf8)
1113       reverseword_utf(w2);
1114     else
1115       reverseword(w2);
1116     word = w2.c_str();
1117   }
1118 
1119   std::vector<w_char> u8;
1120   int nc = strlen(word);
1121   int n = (utf8) ? u8_u16(u8, word) : nc;
1122 
1123   // set character based ngram suggestion for words with non-BMP Unicode
1124   // characters
1125   if (n == -1) {
1126     utf8 = 0;  // XXX not state-free
1127     n = nc;
1128     nonbmp = 1;
1129     low = 0;
1130   }
1131 
1132   struct hentry* hp = NULL;
1133   int col = -1;
1134   phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
1135   std::string target;
1136   std::string candidate;
1137   std::vector<w_char> w_candidate;
1138   if (ph) {
1139     if (utf8) {
1140       u8_u16(w_candidate, word);
1141       mkallcap_utf(w_candidate, langnum);
1142       u16_u8(candidate, w_candidate);
1143     } else {
1144       candidate.assign(word);
1145       if (!nonbmp)
1146         mkallcap(candidate, csconv);
1147     }
1148     target = phonet(candidate, *ph);  // XXX phonet() is 8-bit (nc, not n)
1149   }
1150 
1151   FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL;
1152   FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL;
1153   FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;
1154   FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;
1155 
1156   std::vector<w_char> w_word, w_target;
1157   if (utf8) {
1158     u8_u16(w_word, word);
1159     u8_u16(w_target, target);
1160   }
1161 
1162   std::string f;
1163   std::vector<w_char> w_f;
1164 
1165   for (size_t i = 0; i < rHMgr.size(); ++i) {
1166     while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
1167       // skip exceptions
1168       if (
1169            // skip it, if the word length different by 5 or
1170            // more characters (to avoid strange suggestions)
1171            // (except Unicode characters over BMP)
1172            (((abs(n - hp->clen) > 4) && !nonbmp)) ||
1173            // don't suggest capitalized dictionary words for
1174            // lower case misspellings in ngram suggestions, except
1175            // - PHONE usage, or
1176            // - in the case of German, where not only proper
1177            //   nouns are capitalized, or
1178            // - the capitalized word has special pronunciation
1179            ((captype == NOCAP) && (hp->var & H_OPT_INITCAP) &&
1180               !ph && (langnum != LANG_de) && !(hp->var & H_OPT_PHON)) ||
1181            // or it has one of the following special flags
1182            ((hp->astr) && (pAMgr) &&
1183              (TESTAFF(hp->astr, forbiddenword, hp->alen) ||
1184              TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
1185              TESTAFF(hp->astr, nosuggest, hp->alen) ||
1186              TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
1187              TESTAFF(hp->astr, onlyincompound, hp->alen)))
1188          )
1189         continue;
1190 
1191       if (utf8) {
1192         u8_u16(w_f, HENTRY_WORD(hp));
1193 
1194         int leftcommon = leftcommonsubstring(w_word, w_f);
1195         if (low) {
1196           // lowering dictionary word
1197           mkallsmall_utf(w_f, langnum);
1198         }
1199         sc = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon;
1200       } else {
1201         f.assign(HENTRY_WORD(hp));
1202 
1203         int leftcommon = leftcommonsubstring(word, f.c_str());
1204         if (low) {
1205           // lowering dictionary word
1206           mkallsmall(f, csconv);
1207         }
1208         sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
1209       }
1210 
1211       // check special pronunciation
1212       f.clear();
1213       if ((hp->var & H_OPT_PHON) &&
1214           copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
1215         int sc2;
1216         if (utf8) {
1217           u8_u16(w_f, f);
1218 
1219           int leftcommon = leftcommonsubstring(w_word, w_f);
1220           if (low) {
1221             // lowering dictionary word
1222             mkallsmall_utf(w_f, langnum);
1223           }
1224           sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon;
1225         } else {
1226           int leftcommon = leftcommonsubstring(word, f.c_str());
1227           if (low) {
1228             // lowering dictionary word
1229             mkallsmall(f, csconv);
1230           }
1231           sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
1232         }
1233         if (sc2 > sc)
1234           sc = sc2;
1235       }
1236 
1237       int scphon = -20000;
1238       if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) {
1239         if (utf8) {
1240           u8_u16(w_candidate, HENTRY_WORD(hp));
1241           mkallcap_utf(w_candidate, langnum);
1242           u16_u8(candidate, w_candidate);
1243         } else {
1244           candidate = HENTRY_WORD(hp);
1245           mkallcap(candidate, csconv);
1246         }
1247         f = phonet(candidate, *ph);
1248         if (utf8) {
1249           u8_u16(w_f, f);
1250           scphon = 2 * ngram(3, w_target, w_f,
1251                              NGRAM_LONGER_WORSE);
1252         } else {
1253           scphon = 2 * ngram(3, target, f,
1254                              NGRAM_LONGER_WORSE);
1255         }
1256       }
1257 
1258       if (sc > scores[lp]) {
1259         scores[lp] = sc;
1260         roots[lp] = hp;
1261         lval = sc;
1262         for (int j = 0; j < MAX_ROOTS; j++)
1263           if (scores[j] < lval) {
1264             lp = j;
1265             lval = scores[j];
1266           }
1267       }
1268 
1269       if (scphon > scoresphon[lpphon]) {
1270         scoresphon[lpphon] = scphon;
1271         rootsphon[lpphon] = HENTRY_WORD(hp);
1272         lval = scphon;
1273         for (int j = 0; j < MAX_ROOTS; j++)
1274           if (scoresphon[j] < lval) {
1275             lpphon = j;
1276             lval = scoresphon[j];
1277           }
1278       }
1279     }
1280   }
1281 
1282   // find minimum threshold for a passable suggestion
1283   // mangle original word three differnt ways
1284   // and score them to generate a minimum acceptable score
1285   std::vector<w_char> w_mw;
1286   int thresh = 0;
1287   for (int sp = 1; sp < 4; sp++) {
1288     if (utf8) {
1289       w_mw = w_word;
1290       for (int k = sp; k < n; k += 4) {
1291         w_mw[k].l = '*';
1292         w_mw[k].h = 0;
1293       }
1294 
1295       if (low) {
1296         // lowering dictionary word
1297         mkallsmall_utf(w_mw, langnum);
1298       }
1299 
1300       thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH);
1301     } else {
1302       std::string mw = word;
1303       for (int k = sp; k < n; k += 4)
1304         mw[k] = '*';
1305 
1306       if (low) {
1307         // lowering dictionary word
1308         mkallsmall(mw, csconv);
1309       }
1310 
1311       thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH);
1312     }
1313   }
1314   thresh = thresh / 3;
1315   thresh--;
1316 
1317   // now expand affixes on each of these root words and
1318   // and use length adjusted ngram scores to select
1319   // possible suggestions
1320   char* guess[MAX_GUESS];
1321   char* guessorig[MAX_GUESS];
1322   int gscore[MAX_GUESS];
1323   for (int i = 0; i < MAX_GUESS; i++) {
1324     guess[i] = NULL;
1325     guessorig[i] = NULL;
1326     gscore[i] = -100 * i;
1327   }
1328 
1329   lp = MAX_GUESS - 1;
1330 
1331   struct guessword* glst;
1332   glst = (struct guessword*)calloc(MAX_WORDS, sizeof(struct guessword));
1333   if (!glst) {
1334     if (nonbmp)
1335       utf8 = 1;
1336     return;
1337   }
1338 
1339   for (int i = 0; i < MAX_ROOTS; i++) {
1340     if (roots[i]) {
1341       struct hentry* rp = roots[i];
1342 
1343       f.clear();
1344       const char *field = NULL;
1345       if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON))
1346           field = f.c_str();
1347       int nw = pAMgr->expand_rootword(
1348           glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, rp->astr, rp->alen, word,
1349           nc, field);
1350 
1351       for (int k = 0; k < nw; k++) {
1352         if (utf8) {
1353           u8_u16(w_f, glst[k].word);
1354 
1355           int leftcommon = leftcommonsubstring(w_word, w_f);
1356           if (low) {
1357             // lowering dictionary word
1358             mkallsmall_utf(w_f, langnum);
1359           }
1360 
1361           sc = ngram(n, w_word, w_f, NGRAM_ANY_MISMATCH) + leftcommon;
1362         } else {
1363           f = glst[k].word;
1364 
1365           int leftcommon = leftcommonsubstring(word, f.c_str());
1366           if (low) {
1367             // lowering dictionary word
1368             mkallsmall(f, csconv);
1369           }
1370 
1371           sc = ngram(n, word, f, NGRAM_ANY_MISMATCH) + leftcommon;
1372         }
1373 
1374         if (sc > thresh) {
1375           if (sc > gscore[lp]) {
1376             if (guess[lp]) {
1377               free(guess[lp]);
1378               if (guessorig[lp]) {
1379                 free(guessorig[lp]);
1380                 guessorig[lp] = NULL;
1381               }
1382             }
1383             gscore[lp] = sc;
1384             guess[lp] = glst[k].word;
1385             guessorig[lp] = glst[k].orig;
1386             lval = sc;
1387             for (int j = 0; j < MAX_GUESS; j++)
1388               if (gscore[j] < lval) {
1389                 lp = j;
1390                 lval = gscore[j];
1391               }
1392           } else {
1393             free(glst[k].word);
1394             if (glst[k].orig)
1395               free(glst[k].orig);
1396           }
1397         } else {
1398           free(glst[k].word);
1399           if (glst[k].orig)
1400             free(glst[k].orig);
1401         }
1402       }
1403     }
1404   }
1405   free(glst);
1406 
1407   // now we are done generating guesses
1408   // sort in order of decreasing score
1409 
1410   bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1411   if (ph)
1412     bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1413 
1414   // weight suggestions with a similarity index, based on
1415   // the longest common subsequent algorithm and resort
1416 
1417   int is_swap = 0;
1418   int re = 0;
1419   double fact = 1.0;
1420   if (pAMgr) {
1421     int maxd = pAMgr->get_maxdiff();
1422     if (maxd >= 0)
1423       fact = (10.0 - maxd) / 5.0;
1424   }
1425 
1426   std::vector<w_char> w_gl;
1427   for (int i = 0; i < MAX_GUESS; i++) {
1428     if (guess[i]) {
1429       // lowering guess[i]
1430       std::string gl;
1431       int len;
1432       if (utf8) {
1433         len = u8_u16(w_gl, guess[i]);
1434         mkallsmall_utf(w_gl, langnum);
1435         u16_u8(gl, w_gl);
1436       } else {
1437         gl.assign(guess[i]);
1438         if (!nonbmp)
1439           mkallsmall(gl, csconv);
1440         len = strlen(guess[i]);
1441       }
1442 
1443       int _lcs = lcslen(word, gl.c_str());
1444 
1445       // same characters with different casing
1446       if ((n == len) && (n == _lcs)) {
1447         gscore[i] += 2000;
1448         break;
1449       }
1450       // using 2-gram instead of 3, and other weightening
1451 
1452       if (utf8) {
1453         u8_u16(w_gl, gl);
1454         //w_gl is lowercase already at this point
1455         re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
1456         if (low) {
1457           w_f = w_word;
1458           // lowering dictionary word
1459           mkallsmall_utf(w_f, langnum);
1460           re += ngram(2, w_gl, w_f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
1461         } else {
1462           re += ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
1463         }
1464       } else {
1465         //gl is lowercase already at this point
1466         re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
1467         if (low) {
1468           f = word;
1469           // lowering dictionary word
1470           mkallsmall(f, csconv);
1471           re += ngram(2, gl, f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
1472         } else {
1473           re += ngram(2, gl, word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
1474         }
1475       }
1476 
1477       int ngram_score, leftcommon_score;
1478       if (utf8) {
1479         //w_gl is lowercase already at this point
1480         ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH);
1481         leftcommon_score = leftcommonsubstring(w_word, w_gl);
1482       } else {
1483         //gl is lowercase already at this point
1484         ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH);
1485         leftcommon_score = leftcommonsubstring(word, gl.c_str());
1486       }
1487       gscore[i] =
1488           // length of longest common subsequent minus length difference
1489           2 * _lcs - abs((int)(n - len)) +
1490           // weight length of the left common substring
1491           leftcommon_score +
1492           // weight equal character positions
1493           (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap)
1494                ? 1
1495                : 0) +
1496           // swap character (not neighboring)
1497           ((is_swap) ? 10 : 0) +
1498           // ngram
1499           ngram_score +
1500           // weighted ngrams
1501           re +
1502           // different limit for dictionaries with PHONE rules
1503           (ph ? (re < len * fact ? -1000 : 0)
1504               : (re < (n + len) * fact ? -1000 : 0));
1505     }
1506   }
1507 
1508   bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1509 
1510   // phonetic version
1511   if (ph)
1512     for (int i = 0; i < MAX_ROOTS; i++) {
1513       if (rootsphon[i]) {
1514         // lowering rootphon[i]
1515         std::string gl;
1516         int len;
1517         if (utf8) {
1518           len = u8_u16(w_gl, rootsphon[i]);
1519           mkallsmall_utf(w_gl, langnum);
1520           u16_u8(gl, w_gl);
1521         } else {
1522           gl.assign(rootsphon[i]);
1523           if (!nonbmp)
1524             mkallsmall(gl, csconv);
1525           len = strlen(rootsphon[i]);
1526         }
1527 
1528         // weight length of the left common substring
1529         int leftcommon_score;
1530         if (utf8)
1531           leftcommon_score = leftcommonsubstring(w_word, w_gl);
1532         else
1533           leftcommon_score = leftcommonsubstring(word, gl.c_str());
1534         // heuristic weigthing of ngram scores
1535         scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) +
1536                          leftcommon_score;
1537       }
1538     }
1539 
1540   if (ph)
1541     bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1542 
1543   // copy over
1544   size_t oldns = wlst.size();
1545 
1546   int same = 0;
1547   for (int i = 0; i < MAX_GUESS; i++) {
1548     if (guess[i]) {
1549       if ((wlst.size() < oldns + maxngramsugs) && (wlst.size() < maxSug) &&
1550           (!same || (gscore[i] > 1000))) {
1551         int unique = 1;
1552         // leave only excellent suggestions, if exists
1553         if (gscore[i] > 1000)
1554           same = 1;
1555         else if (gscore[i] < -100) {
1556           same = 1;
1557           // keep the best ngram suggestions, unless in ONLYMAXDIFF mode
1558           if (wlst.size() > oldns || (pAMgr && pAMgr->get_onlymaxdiff())) {
1559             free(guess[i]);
1560             if (guessorig[i])
1561               free(guessorig[i]);
1562             continue;
1563           }
1564         }
1565         for (size_t j = 0; j < wlst.size(); ++j) {
1566           // don't suggest previous suggestions or a previous suggestion with
1567           // prefixes or affixes
1568           if ((!guessorig[i] && strstr(guess[i], wlst[j].c_str())) ||
1569               (guessorig[i] && strstr(guessorig[i], wlst[j].c_str())) ||
1570               // check forbidden words
1571               !checkword(guess[i], 0, NULL, NULL)) {
1572             unique = 0;
1573             break;
1574           }
1575         }
1576         if (unique) {
1577           if (guessorig[i]) {
1578             wlst.push_back(guessorig[i]);
1579           } else {
1580             wlst.push_back(guess[i]);
1581           }
1582         }
1583         free(guess[i]);
1584         if (guessorig[i])
1585           free(guessorig[i]);
1586       } else {
1587         free(guess[i]);
1588         if (guessorig[i])
1589           free(guessorig[i]);
1590       }
1591     }
1592   }
1593 
1594   oldns = wlst.size();
1595   if (ph)
1596     for (int i = 0; i < MAX_ROOTS; i++) {
1597       if (rootsphon[i]) {
1598         if ((wlst.size() < oldns + MAXPHONSUGS) && (wlst.size() < maxSug)) {
1599           int unique = 1;
1600           for (size_t j = 0; j < wlst.size(); ++j) {
1601             // don't suggest previous suggestions or a previous suggestion with
1602             // prefixes or affixes
1603             if (strstr(rootsphon[i], wlst[j].c_str()) ||
1604                 // check forbidden words
1605                 !checkword(rootsphon[i], 0, NULL, NULL)) {
1606               unique = 0;
1607               break;
1608             }
1609           }
1610           if (unique) {
1611             wlst.push_back(rootsphon[i]);
1612           }
1613         }
1614       }
1615     }
1616 
1617   if (nonbmp)
1618     utf8 = 1;
1619 }
1620 
1621 // see if a candidate suggestion is spelled correctly
1622 // needs to check both root words and words with affixes
1623 
1624 // obsolote MySpell-HU modifications:
1625 // return value 2 and 3 marks compounding with hyphen (-)
1626 // `3' marks roots without suffix
checkword(const std::string & word,int cpdsuggest,int * timer,clock_t * timelimit)1627 int SuggestMgr::checkword(const std::string& word,
1628                           int cpdsuggest,
1629                           int* timer,
1630                           clock_t* timelimit) {
1631   // check time limit
1632   if (timer) {
1633     (*timer)--;
1634     if (!(*timer) && timelimit) {
1635       if ((clock() - *timelimit) > TIMELIMIT)
1636         return 0;
1637       *timer = MAXPLUSTIMER;
1638     }
1639   }
1640 
1641   if (pAMgr) {
1642     struct hentry* rv = NULL;
1643     int nosuffix = 0;
1644 
1645     if (cpdsuggest == 1) {
1646       if (pAMgr->get_compound()) {
1647         struct hentry* rv2 = NULL;
1648         struct hentry* rwords[100];  // buffer for COMPOUND pattern checking
1649         rv = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 1, 0);  // EXT
1650         if (rv &&
1651             (!(rv2 = pAMgr->lookup(word.c_str())) || !rv2->astr ||
1652              !(TESTAFF(rv2->astr, pAMgr->get_forbiddenword(), rv2->alen) ||
1653                TESTAFF(rv2->astr, pAMgr->get_nosuggest(), rv2->alen))))
1654           return 3;  // XXX obsolote categorisation + only ICONV needs affix
1655                      // flag check?
1656       }
1657       return 0;
1658     }
1659 
1660     rv = pAMgr->lookup(word.c_str());
1661 
1662     if (rv) {
1663       if ((rv->astr) &&
1664           (TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
1665            TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen) ||
1666            TESTAFF(rv->astr, pAMgr->get_substandard(), rv->alen)))
1667         return 0;
1668       while (rv) {
1669         if (rv->astr &&
1670             (TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
1671              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1672              TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen))) {
1673           rv = rv->next_homonym;
1674         } else
1675           break;
1676       }
1677     } else
1678       rv = pAMgr->prefix_check(word.c_str(), word.size(),
1679                                0);  // only prefix, and prefix + suffix XXX
1680 
1681     if (rv) {
1682       nosuffix = 1;
1683     } else {
1684       rv = pAMgr->suffix_check(word.c_str(), word.size(), 0, NULL,
1685                                FLAG_NULL, FLAG_NULL, IN_CPD_NOT);  // only suffix
1686     }
1687 
1688     if (!rv && pAMgr->have_contclass()) {
1689       rv = pAMgr->suffix_check_twosfx(word.c_str(), word.size(), 0, NULL, FLAG_NULL);
1690       if (!rv)
1691         rv = pAMgr->prefix_check_twosfx(word.c_str(), word.size(), 0, FLAG_NULL);
1692     }
1693 
1694     // check forbidden words
1695     if ((rv) && (rv->astr) &&
1696         (TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
1697          TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1698          TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen) ||
1699          TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen)))
1700       return 0;
1701 
1702     if (rv) {  // XXX obsolote
1703       if ((pAMgr->get_compoundflag()) &&
1704           TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen))
1705         return 2 + nosuffix;
1706       return 1;
1707     }
1708   }
1709   return 0;
1710 }
1711 
check_forbidden(const char * word,int len)1712 int SuggestMgr::check_forbidden(const char* word, int len) {
1713   if (pAMgr) {
1714     struct hentry* rv = pAMgr->lookup(word);
1715     if (rv && rv->astr &&
1716         (TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
1717          TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen)))
1718       rv = NULL;
1719     if (!(pAMgr->prefix_check(word, len, 1)))
1720       rv = pAMgr->suffix_check(word, len, 0, NULL,
1721                                FLAG_NULL, FLAG_NULL, IN_CPD_NOT);  // prefix+suffix, suffix
1722     // check forbidden words
1723     if ((rv) && (rv->astr) &&
1724         TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen))
1725       return 1;
1726   }
1727   return 0;
1728 }
1729 
suggest_morph(const std::string & in_w)1730 std::string SuggestMgr::suggest_morph(const std::string& in_w) {
1731   std::string result;
1732 
1733   struct hentry* rv = NULL;
1734 
1735   if (!pAMgr)
1736     return std::string();
1737 
1738   std::string w(in_w);
1739 
1740   // word reversing wrapper for complex prefixes
1741   if (complexprefixes) {
1742     if (utf8)
1743       reverseword_utf(w);
1744     else
1745       reverseword(w);
1746   }
1747 
1748   rv = pAMgr->lookup(w.c_str());
1749 
1750   while (rv) {
1751     if ((!rv->astr) ||
1752         !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
1753           TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
1754           TESTAFF(rv->astr, pAMgr->get_onlyincompound(), rv->alen))) {
1755       if (!HENTRY_FIND(rv, MORPH_STEM)) {
1756         result.push_back(MSEP_FLD);
1757         result.append(MORPH_STEM);
1758         result.append(w);
1759       }
1760       if (HENTRY_DATA(rv)) {
1761         result.push_back(MSEP_FLD);
1762         result.append(HENTRY_DATA2(rv));
1763       }
1764       result.push_back(MSEP_REC);
1765     }
1766     rv = rv->next_homonym;
1767   }
1768 
1769   std::string st = pAMgr->affix_check_morph(w.c_str(), w.size());
1770   if (!st.empty()) {
1771     result.append(st);
1772   }
1773 
1774   if (pAMgr->get_compound() && result.empty()) {
1775     struct hentry* rwords[100];  // buffer for COMPOUND pattern checking
1776     pAMgr->compound_check_morph(w.c_str(), w.size(), 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, result,
1777                                 NULL);
1778   }
1779 
1780   line_uniq(result, MSEP_REC);
1781 
1782   return result;
1783 }
1784 
get_sfxcount(const char * morph)1785 static int get_sfxcount(const char* morph) {
1786   if (!morph || !*morph)
1787     return 0;
1788   int n = 0;
1789   const char* old = morph;
1790   morph = strstr(morph, MORPH_DERI_SFX);
1791   if (!morph)
1792     morph = strstr(old, MORPH_INFL_SFX);
1793   if (!morph)
1794     morph = strstr(old, MORPH_TERM_SFX);
1795   while (morph) {
1796     n++;
1797     old = morph;
1798     morph = strstr(morph + 1, MORPH_DERI_SFX);
1799     if (!morph)
1800       morph = strstr(old + 1, MORPH_INFL_SFX);
1801     if (!morph)
1802       morph = strstr(old + 1, MORPH_TERM_SFX);
1803   }
1804   return n;
1805 }
1806 
1807 /* affixation */
suggest_hentry_gen(hentry * rv,const char * pattern)1808 std::string SuggestMgr::suggest_hentry_gen(hentry* rv, const char* pattern) {
1809   std::string result;
1810   int sfxcount = get_sfxcount(pattern);
1811 
1812   if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount)
1813     return result;
1814 
1815   if (HENTRY_DATA(rv)) {
1816     std::string aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen,
1817                                       HENTRY_DATA(rv), pattern, 0);
1818     if (!aff.empty()) {
1819       result.append(aff);
1820       result.push_back(MSEP_REC);
1821     }
1822   }
1823 
1824   // check all allomorphs
1825   char* p = NULL;
1826   if (HENTRY_DATA(rv))
1827     p = (char*)strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH);
1828   while (p) {
1829     p += MORPH_TAG_LEN;
1830     int plen = fieldlen(p);
1831     std::string allomorph(p, plen);
1832     struct hentry* rv2 = pAMgr->lookup(allomorph.c_str());
1833     while (rv2) {
1834       //            if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <=
1835       //            sfxcount) {
1836       if (HENTRY_DATA(rv2)) {
1837         char* st = (char*)strstr(HENTRY_DATA2(rv2), MORPH_STEM);
1838         if (st && (strncmp(st + MORPH_TAG_LEN, HENTRY_WORD(rv),
1839                            fieldlen(st + MORPH_TAG_LEN)) == 0)) {
1840           std::string aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr,
1841                                             rv2->alen, HENTRY_DATA(rv2), pattern, 0);
1842           if (!aff.empty()) {
1843             result.append(aff);
1844             result.push_back(MSEP_REC);
1845           }
1846         }
1847       }
1848       rv2 = rv2->next_homonym;
1849     }
1850     p = strstr(p + plen, MORPH_ALLOMORPH);
1851   }
1852 
1853   return result;
1854 }
1855 
suggest_gen(const std::vector<std::string> & desc,const std::string & in_pattern)1856 std::string SuggestMgr::suggest_gen(const std::vector<std::string>& desc, const std::string& in_pattern) {
1857   if (desc.empty() || !pAMgr)
1858     return std::string();
1859 
1860   const char* pattern = in_pattern.c_str();
1861   std::string result2;
1862   std::string newpattern;
1863   struct hentry* rv = NULL;
1864 
1865   // search affixed forms with and without derivational suffixes
1866   while (1) {
1867     for (size_t k = 0; k < desc.size(); ++k) {
1868       std::string result;
1869 
1870       // add compound word parts (except the last one)
1871       const char* s = desc[k].c_str();
1872       const char* part = strstr(s, MORPH_PART);
1873       if (part) {
1874         const char* nextpart = strstr(part + 1, MORPH_PART);
1875         while (nextpart) {
1876           std::string field;
1877           copy_field(field, part, MORPH_PART);
1878           result.append(field);
1879           part = nextpart;
1880           nextpart = strstr(part + 1, MORPH_PART);
1881         }
1882         s = part;
1883       }
1884 
1885       std::string tok(s);
1886       size_t pos = tok.find(" | ");
1887       while (pos != std::string::npos) {
1888         tok[pos + 1] = MSEP_ALT;
1889         pos = tok.find(" | ", pos);
1890       }
1891       std::vector<std::string> pl = line_tok(tok, MSEP_ALT);
1892       for (size_t i = 0; i < pl.size(); ++i) {
1893         // remove inflectional and terminal suffixes
1894         size_t is = pl[i].find(MORPH_INFL_SFX);
1895         if (is != std::string::npos)
1896           pl[i].resize(is);
1897         size_t ts = pl[i].find(MORPH_TERM_SFX);
1898         while (ts != std::string::npos) {
1899           pl[i][ts] = '_';
1900           ts = pl[i].find(MORPH_TERM_SFX);
1901         }
1902         const char* st = strstr(s, MORPH_STEM);
1903         if (st) {
1904           copy_field(tok, st, MORPH_STEM);
1905           rv = pAMgr->lookup(tok.c_str());
1906           while (rv) {
1907             std::string newpat(pl[i]);
1908             newpat.append(pattern);
1909             std::string sg = suggest_hentry_gen(rv, newpat.c_str());
1910             if (sg.empty())
1911               sg = suggest_hentry_gen(rv, pattern);
1912             if (!sg.empty()) {
1913               std::vector<std::string> gen = line_tok(sg, MSEP_REC);
1914               for (size_t j = 0; j < gen.size(); ++j) {
1915                 result2.push_back(MSEP_REC);
1916                 result2.append(result);
1917                 if (pl[i].find(MORPH_SURF_PFX) != std::string::npos) {
1918                   std::string field;
1919                   copy_field(field, pl[i], MORPH_SURF_PFX);
1920                   result2.append(field);
1921                 }
1922                 result2.append(gen[j]);
1923               }
1924             }
1925             rv = rv->next_homonym;
1926           }
1927         }
1928       }
1929     }
1930 
1931     if (!result2.empty() || !strstr(pattern, MORPH_DERI_SFX))
1932       break;
1933 
1934     newpattern.assign(pattern);
1935     mystrrep(newpattern, MORPH_DERI_SFX, MORPH_TERM_SFX);
1936     pattern = newpattern.c_str();
1937   }
1938   return result2;
1939 }
1940 
1941 // generate an n-gram score comparing s1 and s2, UTF16 version
ngram(int n,const std::vector<w_char> & su1,const std::vector<w_char> & su2,int opt)1942 int SuggestMgr::ngram(int n,
1943                       const std::vector<w_char>& su1,
1944                       const std::vector<w_char>& su2,
1945                       int opt) {
1946   int nscore = 0;
1947   int ns;
1948   int l1;
1949   int l2;
1950   int test = 0;
1951 
1952   l1 = su1.size();
1953   l2 = su2.size();
1954   if (l2 == 0)
1955     return 0;
1956   for (int j = 1; j <= n; j++) {
1957     ns = 0;
1958     for (int i = 0; i <= (l1 - j); i++) {
1959       int k = 0;
1960       for (int l = 0; l <= (l2 - j); l++) {
1961         for (k = 0; k < j; k++) {
1962           const w_char& c1 = su1[i + k];
1963           const w_char& c2 = su2[l + k];
1964           if ((c1.l != c2.l) || (c1.h != c2.h))
1965             break;
1966         }
1967         if (k == j) {
1968           ns++;
1969           break;
1970         }
1971       }
1972       if (k != j && opt & NGRAM_WEIGHTED) {
1973         ns--;
1974         test++;
1975         if (i == 0 || i == l1 - j)
1976           ns--;  // side weight
1977       }
1978     }
1979     nscore = nscore + ns;
1980     if (ns < 2 && !(opt & NGRAM_WEIGHTED))
1981       break;
1982   }
1983 
1984   ns = 0;
1985   if (opt & NGRAM_LONGER_WORSE)
1986     ns = (l2 - l1) - 2;
1987   if (opt & NGRAM_ANY_MISMATCH)
1988     ns = abs(l2 - l1) - 2;
1989   ns = (nscore - ((ns > 0) ? ns : 0));
1990   return ns;
1991 }
1992 
1993 // generate an n-gram score comparing s1 and s2, non-UTF16 version
ngram(int n,const std::string & s1,const std::string & s2,int opt)1994 int SuggestMgr::ngram(int n,
1995                       const std::string& s1,
1996                       const std::string& s2,
1997                       int opt) {
1998   int nscore = 0;
1999   int ns;
2000   int l1;
2001   int l2;
2002   int test = 0;
2003 
2004   l2 = s2.size();
2005   if (l2 == 0)
2006     return 0;
2007   l1 = s1.size();
2008   for (int j = 1; j <= n; j++) {
2009     ns = 0;
2010     for (int i = 0; i <= (l1 - j); i++) {
2011       //s2 is haystack, s1[i..i+j) is needle
2012       if (s2.find(s1.c_str()+i, 0, j) != std::string::npos) {
2013         ns++;
2014       } else if (opt & NGRAM_WEIGHTED) {
2015         ns--;
2016         test++;
2017         if (i == 0 || i == l1 - j)
2018           ns--;  // side weight
2019       }
2020     }
2021     nscore = nscore + ns;
2022     if (ns < 2 && !(opt & NGRAM_WEIGHTED))
2023       break;
2024   }
2025 
2026   ns = 0;
2027   if (opt & NGRAM_LONGER_WORSE)
2028     ns = (l2 - l1) - 2;
2029   if (opt & NGRAM_ANY_MISMATCH)
2030     ns = abs(l2 - l1) - 2;
2031   ns = (nscore - ((ns > 0) ? ns : 0));
2032   return ns;
2033 }
2034 
2035 // length of the left common substring of s1 and (decapitalised) s2, UTF version
leftcommonsubstring(const std::vector<w_char> & su1,const std::vector<w_char> & su2)2036 int SuggestMgr::leftcommonsubstring(
2037     const std::vector<w_char>& su1,
2038     const std::vector<w_char>& su2) {
2039   int l1 = su1.size();
2040   int l2 = su2.size();
2041   // decapitalize dictionary word
2042   if (complexprefixes) {
2043     if (su1[l1 - 1] == su2[l2 - 1])
2044       return 1;
2045   } else {
2046     unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;
2047     unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;
2048     if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))
2049       return 0;
2050     int i;
2051     for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&
2052                 (su1[i].h == su2[i].h);
2053          i++)
2054       ;
2055     return i;
2056   }
2057   return 0;
2058 }
2059 
2060 // length of the left common substring of s1 and (decapitalised) s2, non-UTF
leftcommonsubstring(const char * s1,const char * s2)2061 int SuggestMgr::leftcommonsubstring(
2062     const char* s1,
2063     const char* s2) {
2064   if (complexprefixes) {
2065     int l1 = strlen(s1);
2066     int l2 = strlen(s2);
2067     if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])
2068       return 1;
2069   } else if (csconv) {
2070     const char* olds = s1;
2071     // decapitalise dictionary word
2072     if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))
2073       return 0;
2074     do {
2075       s1++;
2076       s2++;
2077     } while ((*s1 == *s2) && (*s1 != '\0'));
2078     return (int)(s1 - olds);
2079   }
2080   return 0;
2081 }
2082 
commoncharacterpositions(const char * s1,const char * s2,int * is_swap)2083 int SuggestMgr::commoncharacterpositions(const char* s1,
2084                                          const char* s2,
2085                                          int* is_swap) {
2086   int num = 0;
2087   int diff = 0;
2088   int diffpos[2];
2089   *is_swap = 0;
2090   if (utf8) {
2091     std::vector<w_char> su1;
2092     std::vector<w_char> su2;
2093     int l1 = u8_u16(su1, s1);
2094     int l2 = u8_u16(su2, s2);
2095 
2096     if (l1 <= 0 || l2 <= 0)
2097       return 0;
2098 
2099     // decapitalize dictionary word
2100     if (complexprefixes) {
2101       su2[l2 - 1] = lower_utf(su2[l2 - 1], langnum);
2102     } else {
2103       su2[0] = lower_utf(su2[0], langnum);
2104     }
2105     for (int i = 0; (i < l1) && (i < l2); i++) {
2106       if (su1[i] == su2[i]) {
2107         num++;
2108       } else {
2109         if (diff < 2)
2110           diffpos[diff] = i;
2111         diff++;
2112       }
2113     }
2114     if ((diff == 2) && (l1 == l2) &&
2115         (su1[diffpos[0]] == su2[diffpos[1]]) &&
2116         (su1[diffpos[1]] == su2[diffpos[0]]))
2117       *is_swap = 1;
2118   } else {
2119     size_t i;
2120     std::string t(s2);
2121     // decapitalize dictionary word
2122     if (complexprefixes) {
2123       size_t l2 = t.size();
2124       t[l2 - 1] = csconv[(unsigned char)t[l2 - 1]].clower;
2125     } else {
2126       mkallsmall(t, csconv);
2127     }
2128     for (i = 0; i < t.size() && (*(s1 + i) != 0); ++i) {
2129       if (*(s1 + i) == t[i]) {
2130         num++;
2131       } else {
2132         if (diff < 2)
2133           diffpos[diff] = i;
2134         diff++;
2135       }
2136     }
2137     if ((diff == 2) && (*(s1 + i) == 0) && i == t.size() &&
2138         (*(s1 + diffpos[0]) == t[diffpos[1]]) &&
2139         (*(s1 + diffpos[1]) == t[diffpos[0]]))
2140       *is_swap = 1;
2141   }
2142   return num;
2143 }
2144 
mystrlen(const char * word)2145 int SuggestMgr::mystrlen(const char* word) {
2146   if (utf8) {
2147     std::vector<w_char> w;
2148     return u8_u16(w, word);
2149   } else
2150     return strlen(word);
2151 }
2152 
2153 // sort in decreasing order of score
bubblesort(char ** rword,char ** rword2,int * rsc,int n)2154 void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n) {
2155   int m = 1;
2156   while (m < n) {
2157     int j = m;
2158     while (j > 0) {
2159       if (rsc[j - 1] < rsc[j]) {
2160         int sctmp = rsc[j - 1];
2161         char* wdtmp = rword[j - 1];
2162         rsc[j - 1] = rsc[j];
2163         rword[j - 1] = rword[j];
2164         rsc[j] = sctmp;
2165         rword[j] = wdtmp;
2166         if (rword2) {
2167           wdtmp = rword2[j - 1];
2168           rword2[j - 1] = rword2[j];
2169           rword2[j] = wdtmp;
2170         }
2171         j--;
2172       } else
2173         break;
2174     }
2175     m++;
2176   }
2177   return;
2178 }
2179 
2180 // longest common subsequence
lcs(const char * s,const char * s2,int * l1,int * l2,char ** result)2181 void SuggestMgr::lcs(const char* s,
2182                      const char* s2,
2183                      int* l1,
2184                      int* l2,
2185                      char** result) {
2186   int n, m;
2187   std::vector<w_char> su;
2188   std::vector<w_char> su2;
2189   char* b;
2190   char* c;
2191   int i;
2192   int j;
2193   if (utf8) {
2194     m = u8_u16(su, s);
2195     n = u8_u16(su2, s2);
2196   } else {
2197     m = strlen(s);
2198     n = strlen(s2);
2199   }
2200   c = (char*)malloc((m + 1) * (n + 1));
2201   b = (char*)malloc((m + 1) * (n + 1));
2202   if (!c || !b) {
2203     if (c)
2204       free(c);
2205     if (b)
2206       free(b);
2207     *result = NULL;
2208     return;
2209   }
2210   for (i = 1; i <= m; i++)
2211     c[i * (n + 1)] = 0;
2212   for (j = 0; j <= n; j++)
2213     c[j] = 0;
2214   for (i = 1; i <= m; i++) {
2215     for (j = 1; j <= n; j++) {
2216       if (((utf8) && (su[i - 1] == su2[j - 1])) ||
2217           ((!utf8) && (s[i - 1] == s2[j - 1]))) {
2218         c[i * (n + 1) + j] = c[(i - 1) * (n + 1) + j - 1] + 1;
2219         b[i * (n + 1) + j] = LCS_UPLEFT;
2220       } else if (c[(i - 1) * (n + 1) + j] >= c[i * (n + 1) + j - 1]) {
2221         c[i * (n + 1) + j] = c[(i - 1) * (n + 1) + j];
2222         b[i * (n + 1) + j] = LCS_UP;
2223       } else {
2224         c[i * (n + 1) + j] = c[i * (n + 1) + j - 1];
2225         b[i * (n + 1) + j] = LCS_LEFT;
2226       }
2227     }
2228   }
2229   *result = b;
2230   free(c);
2231   *l1 = m;
2232   *l2 = n;
2233 }
2234 
lcslen(const char * s,const char * s2)2235 int SuggestMgr::lcslen(const char* s, const char* s2) {
2236   int m;
2237   int n;
2238   int i;
2239   int j;
2240   char* result;
2241   int len = 0;
2242   lcs(s, s2, &m, &n, &result);
2243   if (!result)
2244     return 0;
2245   i = m;
2246   j = n;
2247   while ((i != 0) && (j != 0)) {
2248     if (result[i * (n + 1) + j] == LCS_UPLEFT) {
2249       len++;
2250       i--;
2251       j--;
2252     } else if (result[i * (n + 1) + j] == LCS_UP) {
2253       i--;
2254     } else
2255       j--;
2256   }
2257   free(result);
2258   return len;
2259 }
2260 
lcslen(const std::string & s,const std::string & s2)2261 int SuggestMgr::lcslen(const std::string& s, const std::string& s2) {
2262   return lcslen(s.c_str(), s2.c_str());
2263 }
2264