1 #ifdef HAVE_CONFIG_H
2 #include "config.h"
3 #endif
4 
5 #include <algorithm>
6 #include <cctype>
7 #include <cstring>
8 #include <stdexcept>
9 
10 #include <glib/gstdio.h>
11 #include <sys/stat.h>
12 #include <zlib.h>
13 
14 #include "distance.hpp"
15 #include "mapfile.hpp"
16 #include "utils.hpp"
17 
18 #include "stardict_lib.hpp"
19 
// Two-level stringification: TO_STR1 expands its argument (e.g. __LINE__)
// before TO_STR2 turns the result into a string literal.
#define TO_STR2(xstr) #xstr
#define TO_STR1(xstr) TO_STR2(xstr)

// THROW_IF_ERROR(expr): asserts that expr is true and, when assertions are
// compiled out, throws std::runtime_error naming the expression, the file and
// the line instead.
// expr is evaluated exactly once and its result is cached, so an expression
// with side effects behaves the same with and without assertions.
#define THROW_IF_ERROR(expr)                                                                 \
    do {                                                                                     \
        const bool throw_if_error_ok_ = static_cast<bool>(expr);                             \
        assert(throw_if_error_ok_ && #expr);                                                 \
        if (!throw_if_error_ok_)                                                             \
            throw std::runtime_error(#expr " not true at " __FILE__ ": " TO_STR1(__LINE__)); \
    } while (false)
29 
30 // Notice: read src/tools/DICTFILE_FORMAT for the dictionary
31 // file's format information!
32 
33 namespace
34 {
// Plain record pairing a word pointer with an integer distance value.
// NOTE(review): presumably one candidate of a fuzzy search together with its
// edit distance; the users of this struct are outside this part of the file.
struct Fuzzystruct {
    char *pMatchWord;       // the candidate word (ownership not visible here)
    int iMatchWordDistance; // distance value associated with pMatchWord
};
39 
bIsVowel(gchar inputchar)40 static inline bool bIsVowel(gchar inputchar)
41 {
42     gchar ch = g_ascii_toupper(inputchar);
43     return (ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U');
44 }
45 
bIsPureEnglish(const gchar * str)46 static bool bIsPureEnglish(const gchar *str)
47 {
48     // i think this should work even when it is UTF8 string :).
49     for (int i = 0; str[i] != 0; i++)
50         //if(str[i]<0)
51         //if(str[i]<32 || str[i]>126) // tab equal 9,so this is not OK.
52         // Better use isascii() but not str[i]<0 while char is default unsigned in arm
53         if (!isascii(str[i]))
54             return false;
55     return true;
56 }
57 
stardict_strcmp(const gchar * s1,const gchar * s2)58 static inline gint stardict_strcmp(const gchar *s1, const gchar *s2)
59 {
60     const gint a = g_ascii_strcasecmp(s1, s2);
61     if (a == 0)
62         return strcmp(s1, s2);
63     else
64         return a;
65 }
66 
unicode_strdown(gunichar * str)67 static void unicode_strdown(gunichar *str)
68 {
69     while (*str) {
70         *str = g_unichar_tolower(*str);
71         ++str;
72     }
73 }
74 }
75 
load_from_ifo_file(const std::string & ifofilename,bool istreedict)76 bool DictInfo::load_from_ifo_file(const std::string &ifofilename,
77                                   bool istreedict)
78 {
79     ifo_file_name = ifofilename;
80     glib::CharStr buffer;
81     if (!g_file_get_contents(ifofilename.c_str(), get_addr(buffer), nullptr, nullptr))
82         return false;
83 
84     static const char TREEDICT_MAGIC_DATA[] = "StarDict's treedict ifo file";
85     static const char DICT_MAGIC_DATA[] = "StarDict's dict ifo file";
86 
87     const gchar *magic_data = istreedict ? TREEDICT_MAGIC_DATA : DICT_MAGIC_DATA;
88     static const unsigned char utf8_bom[] = { 0xEF, 0xBB, 0xBF, '\0' };
89     if (!g_str_has_prefix(
90             g_str_has_prefix(get_impl(buffer), (const gchar *)(utf8_bom)) ? get_impl(buffer) + 3 : get_impl(buffer),
91             magic_data)) {
92         return false;
93     }
94 
95     gchar *p1 = get_impl(buffer) + strlen(magic_data) - 1;
96 
97     gchar *p2 = strstr(p1, "\nwordcount=");
98     if (p2 == nullptr)
99         return false;
100 
101     gchar *p3 = strchr(p2 + sizeof("\nwordcount=") - 1, '\n');
102 
103     wordcount = atol(std::string(p2 + sizeof("\nwordcount=") - 1, p3 - (p2 + sizeof("\nwordcount=") - 1)).c_str());
104 
105     if (istreedict) {
106         p2 = strstr(p1, "\ntdxfilesize=");
107         if (p2 == nullptr)
108             return false;
109 
110         p3 = strchr(p2 + sizeof("\ntdxfilesize=") - 1, '\n');
111 
112         index_file_size = atol(std::string(p2 + sizeof("\ntdxfilesize=") - 1, p3 - (p2 + sizeof("\ntdxfilesize=") - 1)).c_str());
113 
114     } else {
115 
116         p2 = strstr(p1, "\nidxfilesize=");
117         if (p2 == nullptr)
118             return false;
119 
120         p3 = strchr(p2 + sizeof("\nidxfilesize=") - 1, '\n');
121         index_file_size = atol(std::string(p2 + sizeof("\nidxfilesize=") - 1, p3 - (p2 + sizeof("\nidxfilesize=") - 1)).c_str());
122     }
123 
124     p2 = strstr(p1, "\nbookname=");
125 
126     if (p2 == nullptr)
127         return false;
128 
129     p2 = p2 + sizeof("\nbookname=") - 1;
130     p3 = strchr(p2, '\n');
131     bookname.assign(p2, p3 - p2);
132 
133     p2 = strstr(p1, "\nauthor=");
134     if (p2) {
135         p2 = p2 + sizeof("\nauthor=") - 1;
136         p3 = strchr(p2, '\n');
137         author.assign(p2, p3 - p2);
138     }
139 
140     p2 = strstr(p1, "\nemail=");
141     if (p2) {
142         p2 = p2 + sizeof("\nemail=") - 1;
143         p3 = strchr(p2, '\n');
144         email.assign(p2, p3 - p2);
145     }
146 
147     p2 = strstr(p1, "\nwebsite=");
148     if (p2) {
149         p2 = p2 + sizeof("\nwebsite=") - 1;
150         p3 = strchr(p2, '\n');
151         website.assign(p2, p3 - p2);
152     }
153 
154     p2 = strstr(p1, "\ndate=");
155     if (p2) {
156         p2 = p2 + sizeof("\ndate=") - 1;
157         p3 = strchr(p2, '\n');
158         date.assign(p2, p3 - p2);
159     }
160 
161     p2 = strstr(p1, "\ndescription=");
162     if (p2) {
163         p2 = p2 + sizeof("\ndescription=") - 1;
164         p3 = strchr(p2, '\n');
165         description.assign(p2, p3 - p2);
166     }
167 
168     p2 = strstr(p1, "\nsametypesequence=");
169     if (p2) {
170         p2 += sizeof("\nsametypesequence=") - 1;
171         p3 = strchr(p2, '\n');
172         sametypesequence.assign(p2, p3 - p2);
173     }
174 
175     p2 = strstr(p1, "\nsynwordcount=");
176     syn_wordcount = 0;
177     if (p2) {
178         p2 += sizeof("\nsynwordcount=") - 1;
179         p3 = strchr(p2, '\n');
180         syn_wordcount = atol(std::string(p2, p3 - p2).c_str());
181     }
182 
183     return true;
184 }
185 
GetWordData(guint32 idxitem_offset,guint32 idxitem_size)186 gchar *DictBase::GetWordData(guint32 idxitem_offset, guint32 idxitem_size)
187 {
188     for (int i = 0; i < WORDDATA_CACHE_NUM; i++)
189         if (cache[i].data && cache[i].offset == idxitem_offset)
190             return cache[i].data;
191 
192     if (dictfile)
193         fseek(dictfile, idxitem_offset, SEEK_SET);
194 
195     gchar *data;
196     if (!sametypesequence.empty()) {
197         glib::CharStr origin_data((gchar *)g_malloc(idxitem_size));
198 
199         if (dictfile) {
200             const size_t nitems = fread(get_impl(origin_data), idxitem_size, 1, dictfile);
201             THROW_IF_ERROR(nitems == 1);
202         } else
203             dictdzfile->read(get_impl(origin_data), idxitem_offset, idxitem_size);
204 
205         guint32 data_size;
206         gint sametypesequence_len = sametypesequence.length();
207         //there have sametypesequence_len char being omitted.
208         data_size = idxitem_size + sizeof(guint32) + sametypesequence_len;
209         //if the last item's size is determined by the end up '\0',then +=sizeof(gchar);
210         //if the last item's size is determined by the head guint32 type data,then +=sizeof(guint32);
211         switch (sametypesequence[sametypesequence_len - 1]) {
212         case 'm':
213         case 't':
214         case 'y':
215         case 'l':
216         case 'g':
217         case 'x':
218         case 'k':
219             data_size += sizeof(gchar);
220             break;
221         case 'W':
222         case 'P':
223             data_size += sizeof(guint32);
224             break;
225         default:
226             if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1]))
227                 data_size += sizeof(guint32);
228             else
229                 data_size += sizeof(gchar);
230             break;
231         }
232         data = (gchar *)g_malloc(data_size);
233         gchar *p1, *p2;
234         p1 = data + sizeof(guint32);
235         p2 = get_impl(origin_data);
236         guint32 sec_size;
237         //copy the head items.
238         for (int i = 0; i < sametypesequence_len - 1; i++) {
239             *p1 = sametypesequence[i];
240             p1 += sizeof(gchar);
241             switch (sametypesequence[i]) {
242             case 'm':
243             case 't':
244             case 'y':
245             case 'l':
246             case 'g':
247             case 'x':
248             case 'k':
249                 sec_size = strlen(p2) + 1;
250                 memcpy(p1, p2, sec_size);
251                 p1 += sec_size;
252                 p2 += sec_size;
253                 break;
254             case 'W':
255             case 'P':
256                 sec_size = get_uint32(p2);
257                 sec_size += sizeof(guint32);
258                 memcpy(p1, p2, sec_size);
259                 p1 += sec_size;
260                 p2 += sec_size;
261                 break;
262             default:
263                 if (g_ascii_isupper(sametypesequence[i])) {
264                     sec_size = get_uint32(p2);
265                     sec_size += sizeof(guint32);
266                 } else {
267                     sec_size = strlen(p2) + 1;
268                 }
269                 memcpy(p1, p2, sec_size);
270                 p1 += sec_size;
271                 p2 += sec_size;
272                 break;
273             }
274         }
275         //calculate the last item 's size.
276         sec_size = idxitem_size - (p2 - get_impl(origin_data));
277         *p1 = sametypesequence[sametypesequence_len - 1];
278         p1 += sizeof(gchar);
279         switch (sametypesequence[sametypesequence_len - 1]) {
280         case 'm':
281         case 't':
282         case 'y':
283         case 'l':
284         case 'g':
285         case 'x':
286         case 'k':
287             memcpy(p1, p2, sec_size);
288             p1 += sec_size;
289             *p1 = '\0'; //add the end up '\0';
290             break;
291         case 'W':
292         case 'P':
293             set_uint32(p1, sec_size);
294             p1 += sizeof(guint32);
295             memcpy(p1, p2, sec_size);
296             break;
297         default:
298             if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1])) {
299                 set_uint32(p1, sec_size);
300                 p1 += sizeof(guint32);
301                 memcpy(p1, p2, sec_size);
302             } else {
303                 memcpy(p1, p2, sec_size);
304                 p1 += sec_size;
305                 *p1 = '\0';
306             }
307             break;
308         }
309         set_uint32(data, data_size);
310     } else {
311         data = (gchar *)g_malloc(idxitem_size + sizeof(guint32));
312         if (dictfile) {
313             const size_t nitems = fread(data + sizeof(guint32), idxitem_size, 1, dictfile);
314             THROW_IF_ERROR(nitems == 1);
315         } else
316             dictdzfile->read(data + sizeof(guint32), idxitem_offset, idxitem_size);
317         set_uint32(data, idxitem_size + sizeof(guint32));
318     }
319     g_free(cache[cache_cur].data);
320 
321     cache[cache_cur].data = data;
322     cache[cache_cur].offset = idxitem_offset;
323     cache_cur++;
324     if (cache_cur == WORDDATA_CACHE_NUM)
325         cache_cur = 0;
326     return data;
327 }
328 
SearchData(std::vector<std::string> & SearchWords,guint32 idxitem_offset,guint32 idxitem_size,gchar * origin_data)329 bool DictBase::SearchData(std::vector<std::string> &SearchWords, guint32 idxitem_offset, guint32 idxitem_size, gchar *origin_data)
330 {
331     int nWord = SearchWords.size();
332     std::vector<bool> WordFind(nWord, false);
333     int nfound = 0;
334 
335     if (dictfile)
336         fseek(dictfile, idxitem_offset, SEEK_SET);
337     if (dictfile) {
338         const size_t nitems = fread(origin_data, idxitem_size, 1, dictfile);
339         THROW_IF_ERROR(nitems == 1);
340     } else
341         dictdzfile->read(origin_data, idxitem_offset, idxitem_size);
342     gchar *p = origin_data;
343     guint32 sec_size;
344     int j;
345     if (!sametypesequence.empty()) {
346         gint sametypesequence_len = sametypesequence.length();
347         for (int i = 0; i < sametypesequence_len - 1; i++) {
348             switch (sametypesequence[i]) {
349             case 'm':
350             case 't':
351             case 'y':
352             case 'l':
353             case 'g':
354             case 'x':
355             case 'k':
356                 for (j = 0; j < nWord; j++)
357                     if (!WordFind[j] && strstr(p, SearchWords[j].c_str())) {
358                         WordFind[j] = true;
359                         ++nfound;
360                     }
361 
362                 if (nfound == nWord)
363                     return true;
364                 sec_size = strlen(p) + 1;
365                 p += sec_size;
366                 break;
367             default:
368                 if (g_ascii_isupper(sametypesequence[i])) {
369                     sec_size = get_uint32(p);
370                     sec_size += sizeof(guint32);
371                 } else {
372                     sec_size = strlen(p) + 1;
373                 }
374                 p += sec_size;
375             }
376         }
377         switch (sametypesequence[sametypesequence_len - 1]) {
378         case 'm':
379         case 't':
380         case 'y':
381         case 'l':
382         case 'g':
383         case 'x':
384         case 'k':
385             sec_size = idxitem_size - (p - origin_data);
386             for (j = 0; j < nWord; j++)
387                 if (!WordFind[j] && g_strstr_len(p, sec_size, SearchWords[j].c_str())) {
388                     WordFind[j] = true;
389                     ++nfound;
390                 }
391 
392             if (nfound == nWord)
393                 return true;
394             break;
395         }
396     } else {
397         while (guint32(p - origin_data) < idxitem_size) {
398             switch (*p) {
399             case 'm':
400             case 't':
401             case 'y':
402             case 'l':
403             case 'g':
404             case 'x':
405             case 'k':
406                 for (j = 0; j < nWord; j++)
407                     if (!WordFind[j] && strstr(p, SearchWords[j].c_str())) {
408                         WordFind[j] = true;
409                         ++nfound;
410                     }
411 
412                 if (nfound == nWord)
413                     return true;
414                 sec_size = strlen(p) + 1;
415                 p += sec_size;
416                 break;
417             default:
418                 if (g_ascii_isupper(*p)) {
419                     sec_size = get_uint32(p);
420                     sec_size += sizeof(guint32);
421                 } else {
422                     sec_size = strlen(p) + 1;
423                 }
424                 p += sec_size;
425             }
426         }
427     }
428     return false;
429 }
430 
431 namespace
432 {
433 class OffsetIndex : public IIndexFile
434 {
435 public:
OffsetIndex()436     OffsetIndex()
437         : idxfile(nullptr)
438     {
439     }
~OffsetIndex()440     ~OffsetIndex()
441     {
442         if (idxfile)
443             fclose(idxfile);
444     }
445     bool load(const std::string &url, gulong wc, gulong fsize, bool verbose) override;
446     const gchar *get_key(glong idx) override;
get_data(glong idx)447     void get_data(glong idx) override { get_key(idx); }
get_key_and_data(glong idx)448     const gchar *get_key_and_data(glong idx) override
449     {
450         return get_key(idx);
451     }
452     bool lookup(const char *str, glong &idx) override;
453 
454 private:
455     static const gint ENTR_PER_PAGE = 32;
456     static const char *CACHE_MAGIC;
457 
458     std::vector<guint32> wordoffset;
459     FILE *idxfile;
460     gulong wordcount;
461 
462     gchar wordentry_buf[256 + sizeof(guint32) * 2]; // The length of "word_str" should be less than 256. See src/tools/DICTFILE_FORMAT.
463     struct index_entry {
464         glong idx;
465         std::string keystr;
assign__anon48ed784c0211::OffsetIndex::index_entry466         void assign(glong i, const std::string &str)
467         {
468             idx = i;
469             keystr.assign(str);
470         }
471     };
472     index_entry first, last, middle, real_last;
473 
474     struct page_entry {
475         gchar *keystr;
476         guint32 off, size;
477     };
478     std::vector<gchar> page_data;
479     struct page_t {
480         glong idx = -1;
481         page_entry entries[ENTR_PER_PAGE];
482 
page_t__anon48ed784c0211::OffsetIndex::page_t483         page_t() {}
484         void fill(gchar *data, gint nent, glong idx_);
485     } page;
486     gulong load_page(glong page_idx);
487     const gchar *read_first_on_page_key(glong page_idx);
488     const gchar *get_first_on_page_key(glong page_idx);
489     bool load_cache(const std::string &url);
490     bool save_cache(const std::string &url, bool verbose);
491     static std::list<std::string> get_cache_variant(const std::string &url);
492 };
493 
494 const char *OffsetIndex::CACHE_MAGIC = "StarDict's Cache, Version: 0.1";
495 
496 class WordListIndex : public IIndexFile
497 {
498 public:
WordListIndex()499     WordListIndex()
500         : idxdatabuf(nullptr)
501     {
502     }
~WordListIndex()503     ~WordListIndex() { g_free(idxdatabuf); }
504     bool load(const std::string &url, gulong wc, gulong fsize, bool verbose) override;
get_key(glong idx)505     const gchar *get_key(glong idx) override { return wordlist[idx]; }
506     void get_data(glong idx) override;
get_key_and_data(glong idx)507     const gchar *get_key_and_data(glong idx) override
508     {
509         get_data(idx);
510         return get_key(idx);
511     }
512     bool lookup(const char *str, glong &idx) override;
513 
514 private:
515     gchar *idxdatabuf;
516     std::vector<gchar *> wordlist;
517 };
518 
fill(gchar * data,gint nent,glong idx_)519 void OffsetIndex::page_t::fill(gchar *data, gint nent, glong idx_)
520 {
521     idx = idx_;
522     gchar *p = data;
523     glong len;
524     for (gint i = 0; i < nent; ++i) {
525         entries[i].keystr = p;
526         len = strlen(p);
527         p += len + 1;
528         entries[i].off = g_ntohl(get_uint32(p));
529         p += sizeof(guint32);
530         entries[i].size = g_ntohl(get_uint32(p));
531         p += sizeof(guint32);
532     }
533 }
534 
read_first_on_page_key(glong page_idx)535 inline const gchar *OffsetIndex::read_first_on_page_key(glong page_idx)
536 {
537     fseek(idxfile, wordoffset[page_idx], SEEK_SET);
538     guint32 page_size = wordoffset[page_idx + 1] - wordoffset[page_idx];
539     const size_t nitems = fread(wordentry_buf,
540                                 std::min(sizeof(wordentry_buf), static_cast<size_t>(page_size)),
541                                 1, idxfile);
542     THROW_IF_ERROR(nitems == 1);
543     //TODO: check returned values, deal with word entry that strlen>255.
544     return wordentry_buf;
545 }
546 
get_first_on_page_key(glong page_idx)547 inline const gchar *OffsetIndex::get_first_on_page_key(glong page_idx)
548 {
549     if (page_idx < middle.idx) {
550         if (page_idx == first.idx)
551             return first.keystr.c_str();
552         return read_first_on_page_key(page_idx);
553     } else if (page_idx > middle.idx) {
554         if (page_idx == last.idx)
555             return last.keystr.c_str();
556         return read_first_on_page_key(page_idx);
557     } else
558         return middle.keystr.c_str();
559 }
560 
load_cache(const std::string & url)561 bool OffsetIndex::load_cache(const std::string &url)
562 {
563     const std::list<std::string> vars = get_cache_variant(url);
564 
565     for (const std::string &item : vars) {
566         struct ::stat idxstat, cachestat;
567         if (g_stat(url.c_str(), &idxstat) != 0 || g_stat(item.c_str(), &cachestat) != 0)
568             continue;
569         if (cachestat.st_mtime < idxstat.st_mtime)
570             continue;
571         MapFile mf;
572         if (!mf.open(item.c_str(), cachestat.st_size))
573             continue;
574         if (strncmp(mf.begin(), CACHE_MAGIC, strlen(CACHE_MAGIC)) != 0)
575             continue;
576         memcpy(&wordoffset[0], mf.begin() + strlen(CACHE_MAGIC), wordoffset.size() * sizeof(wordoffset[0]));
577         return true;
578     }
579 
580     return false;
581 }
582 
get_cache_variant(const std::string & url)583 std::list<std::string> OffsetIndex::get_cache_variant(const std::string &url)
584 {
585     std::list<std::string> res = { url + ".oft" };
586     if (!g_file_test(g_get_user_cache_dir(), G_FILE_TEST_EXISTS) && g_mkdir(g_get_user_cache_dir(), 0700) == -1)
587         return res;
588 
589     const std::string cache_dir = std::string(g_get_user_cache_dir()) + G_DIR_SEPARATOR_S + "sdcv";
590 
591     if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_EXISTS)) {
592         if (g_mkdir(cache_dir.c_str(), 0700) == -1)
593             return res;
594     } else if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_IS_DIR))
595         return res;
596 
597     gchar *base = g_path_get_basename(url.c_str());
598     res.push_back(cache_dir + G_DIR_SEPARATOR_S + base + ".oft");
599     g_free(base);
600     return res;
601 }
602 
save_cache(const std::string & url,bool verbose)603 bool OffsetIndex::save_cache(const std::string &url, bool verbose)
604 {
605     const std::list<std::string> vars = get_cache_variant(url);
606     for (const std::string &item : vars) {
607         FILE *out = fopen(item.c_str(), "wb");
608         if (!out)
609             continue;
610         if (fwrite(CACHE_MAGIC, 1, strlen(CACHE_MAGIC), out) != strlen(CACHE_MAGIC))
611             continue;
612         if (fwrite(&wordoffset[0], sizeof(wordoffset[0]), wordoffset.size(), out) != wordoffset.size())
613             continue;
614         fclose(out);
615         if (verbose) {
616             printf("save to cache %s\n", url.c_str());
617         }
618         return true;
619     }
620     return false;
621 }
622 
load(const std::string & url,gulong wc,gulong fsize,bool verbose)623 bool OffsetIndex::load(const std::string &url, gulong wc, gulong fsize, bool verbose)
624 {
625     wordcount = wc;
626     gulong npages = (wc - 1) / ENTR_PER_PAGE + 2;
627     wordoffset.resize(npages);
628     if (!load_cache(url)) { //map file will close after finish of block
629         MapFile map_file;
630         if (!map_file.open(url.c_str(), fsize))
631             return false;
632         const gchar *idxdatabuffer = map_file.begin();
633 
634         const gchar *p1 = idxdatabuffer;
635         gulong index_size;
636         guint32 j = 0;
637         for (guint32 i = 0; i < wc; i++) {
638             index_size = strlen(p1) + 1 + 2 * sizeof(guint32);
639             if (i % ENTR_PER_PAGE == 0) {
640                 wordoffset[j] = p1 - idxdatabuffer;
641                 ++j;
642             }
643             p1 += index_size;
644         }
645         wordoffset[j] = p1 - idxdatabuffer;
646         if (!save_cache(url, verbose))
647             fprintf(stderr, "cache update failed\n");
648     }
649 
650     if (!(idxfile = fopen(url.c_str(), "rb"))) {
651         wordoffset.resize(0);
652         return false;
653     }
654 
655     first.assign(0, read_first_on_page_key(0));
656     last.assign(wordoffset.size() - 2, read_first_on_page_key(wordoffset.size() - 2));
657     middle.assign((wordoffset.size() - 2) / 2, read_first_on_page_key((wordoffset.size() - 2) / 2));
658     real_last.assign(wc - 1, get_key(wc - 1));
659 
660     return true;
661 }
662 
load_page(glong page_idx)663 inline gulong OffsetIndex::load_page(glong page_idx)
664 {
665     gulong nentr = ENTR_PER_PAGE;
666     if (page_idx == glong(wordoffset.size() - 2))
667         if ((nentr = (wordcount % ENTR_PER_PAGE)) == 0)
668             nentr = ENTR_PER_PAGE;
669 
670     if (page_idx != page.idx) {
671         page_data.resize(wordoffset[page_idx + 1] - wordoffset[page_idx]);
672         fseek(idxfile, wordoffset[page_idx], SEEK_SET);
673         const size_t nitems = fread(&page_data[0], 1, page_data.size(), idxfile);
674         THROW_IF_ERROR(nitems == page_data.size());
675 
676         page.fill(&page_data[0], nentr, page_idx);
677     }
678 
679     return nentr;
680 }
681 
get_key(glong idx)682 const gchar *OffsetIndex::get_key(glong idx)
683 {
684     load_page(idx / ENTR_PER_PAGE);
685     glong idx_in_page = idx % ENTR_PER_PAGE;
686     wordentry_offset = page.entries[idx_in_page].off;
687     wordentry_size = page.entries[idx_in_page].size;
688 
689     return page.entries[idx_in_page].keystr;
690 }
691 
lookup(const char * str,glong & idx)692 bool OffsetIndex::lookup(const char *str, glong &idx)
693 {
694     bool bFound = false;
695     glong iFrom;
696     glong iTo = wordoffset.size() - 2;
697     gint cmpint;
698     glong iThisIndex;
699     if (stardict_strcmp(str, first.keystr.c_str()) < 0) {
700         idx = 0;
701         return false;
702     } else if (stardict_strcmp(str, real_last.keystr.c_str()) > 0) {
703         idx = INVALID_INDEX;
704         return false;
705     } else {
706         iFrom = 0;
707         iThisIndex = 0;
708         while (iFrom <= iTo) {
709             iThisIndex = (iFrom + iTo) / 2;
710             cmpint = stardict_strcmp(str, get_first_on_page_key(iThisIndex));
711             if (cmpint > 0)
712                 iFrom = iThisIndex + 1;
713             else if (cmpint < 0)
714                 iTo = iThisIndex - 1;
715             else {
716                 bFound = true;
717                 break;
718             }
719         }
720         if (!bFound)
721             idx = iTo; //prev
722         else
723             idx = iThisIndex;
724     }
725     if (!bFound) {
726         gulong netr = load_page(idx);
727         iFrom = 1; // Needn't search the first word anymore.
728         iTo = netr - 1;
729         iThisIndex = 0;
730         while (iFrom <= iTo) {
731             iThisIndex = (iFrom + iTo) / 2;
732             cmpint = stardict_strcmp(str, page.entries[iThisIndex].keystr);
733             if (cmpint > 0)
734                 iFrom = iThisIndex + 1;
735             else if (cmpint < 0)
736                 iTo = iThisIndex - 1;
737             else {
738                 bFound = true;
739                 break;
740             }
741         }
742         idx *= ENTR_PER_PAGE;
743         if (!bFound)
744             idx += iFrom; //next
745         else
746             idx += iThisIndex;
747     } else {
748         idx *= ENTR_PER_PAGE;
749     }
750     return bFound;
751 }
752 
load(const std::string & url,gulong wc,gulong fsize,bool verbose)753 bool WordListIndex::load(const std::string &url, gulong wc, gulong fsize, bool verbose)
754 {
755     gzFile in = gzopen(url.c_str(), "rb");
756     if (in == nullptr)
757         return false;
758 
759     idxdatabuf = (gchar *)g_malloc(fsize);
760 
761     const int len = gzread(in, idxdatabuf, fsize);
762     gzclose(in);
763     if (len < 0)
764         return false;
765 
766     if (gulong(len) != fsize)
767         return false;
768 
769     wordlist.resize(wc + 1);
770     gchar *p1 = idxdatabuf;
771     guint32 i;
772     for (i = 0; i < wc; i++) {
773         wordlist[i] = p1;
774         p1 += strlen(p1) + 1 + 2 * sizeof(guint32);
775     }
776     wordlist[wc] = p1;
777 
778     return true;
779 }
780 
get_data(glong idx)781 void WordListIndex::get_data(glong idx)
782 {
783     gchar *p1 = wordlist[idx] + strlen(wordlist[idx]) + sizeof(gchar);
784     wordentry_offset = g_ntohl(get_uint32(p1));
785     p1 += sizeof(guint32);
786     wordentry_size = g_ntohl(get_uint32(p1));
787 }
788 
lookup(const char * str,glong & idx)789 bool WordListIndex::lookup(const char *str, glong &idx)
790 {
791     bool bFound = false;
792     glong iTo = wordlist.size() - 2;
793 
794     if (stardict_strcmp(str, get_key(0)) < 0) {
795         idx = 0;
796     } else if (stardict_strcmp(str, get_key(iTo)) > 0) {
797         idx = INVALID_INDEX;
798     } else {
799         glong iThisIndex = 0;
800         glong iFrom = 0;
801         gint cmpint;
802         while (iFrom <= iTo) {
803             iThisIndex = (iFrom + iTo) / 2;
804             cmpint = stardict_strcmp(str, get_key(iThisIndex));
805             if (cmpint > 0)
806                 iFrom = iThisIndex + 1;
807             else if (cmpint < 0)
808                 iTo = iThisIndex - 1;
809             else {
810                 bFound = true;
811                 break;
812             }
813         }
814         if (!bFound)
815             idx = iFrom; //next
816         else
817             idx = iThisIndex;
818     }
819     return bFound;
820 }
821 }
822 
load(const std::string & url,gulong wc)823 bool SynFile::load(const std::string &url, gulong wc)
824 {
825     struct stat stat_buf;
826     if (!stat(url.c_str(), &stat_buf)) {
827         MapFile syn;
828         if (!syn.open(url.c_str(), stat_buf.st_size))
829             return false;
830         const gchar *current = syn.begin();
831         for (unsigned long i = 0; i < wc; i++) {
832             // each entry in a syn-file is:
833             // - 0-terminated string
834             // 4-byte index into .dict file in network byte order
835             glib::CharStr lower_string{ g_utf8_casefold(current, -1) };
836             std::string synonym{ get_impl(lower_string) };
837             current += synonym.length() + 1;
838             const guint32 idx = g_ntohl(get_uint32(current));
839             current += sizeof(idx);
840             synonyms[synonym] = idx;
841         }
842         return true;
843     } else {
844         return false;
845     }
846 }
847 
lookup(const char * str,glong & idx)848 bool SynFile::lookup(const char *str, glong &idx)
849 {
850     glib::CharStr lower_string{ g_utf8_casefold(str, -1) };
851     auto it = synonyms.find(get_impl(lower_string));
852     if (it != synonyms.end()) {
853         idx = it->second;
854         return true;
855     }
856     return false;
857 }
858 
Lookup(const char * str,glong & idx)859 bool Dict::Lookup(const char *str, glong &idx)
860 {
861     return syn_file->lookup(str, idx) || idx_file->lookup(str, idx);
862 }
863 
load(const std::string & ifofilename,bool verbose)864 bool Dict::load(const std::string &ifofilename, bool verbose)
865 {
866     gulong idxfilesize;
867     if (!load_ifofile(ifofilename, idxfilesize))
868         return false;
869 
870     std::string fullfilename(ifofilename);
871     fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "dict.dz");
872 
873     if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS)) {
874         dictdzfile.reset(new DictData);
875         if (!dictdzfile->open(fullfilename, 0)) {
876             //g_print("open file %s failed!\n",fullfilename);
877             return false;
878         }
879     } else {
880         fullfilename.erase(fullfilename.length() - sizeof(".dz") + 1, sizeof(".dz") - 1);
881         dictfile = fopen(fullfilename.c_str(), "rb");
882         if (!dictfile) {
883             //g_print("open file %s failed!\n",fullfilename);
884             return false;
885         }
886     }
887 
888     fullfilename = ifofilename;
889     fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "idx.gz");
890 
891     if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS)) {
892         idx_file.reset(new WordListIndex);
893     } else {
894         fullfilename.erase(fullfilename.length() - sizeof(".gz") + 1, sizeof(".gz") - 1);
895         idx_file.reset(new OffsetIndex);
896     }
897 
898     if (!idx_file->load(fullfilename, wordcount, idxfilesize, verbose))
899         return false;
900 
901     fullfilename = ifofilename;
902     fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "syn");
903     syn_file.reset(new SynFile);
904     syn_file->load(fullfilename, syn_wordcount);
905 
906     //g_print("bookname: %s , wordcount %lu\n", bookname.c_str(), narticles());
907     return true;
908 }
909 
load_ifofile(const std::string & ifofilename,gulong & idxfilesize)910 bool Dict::load_ifofile(const std::string &ifofilename, gulong &idxfilesize)
911 {
912     DictInfo dict_info;
913     if (!dict_info.load_from_ifo_file(ifofilename, false))
914         return false;
915     if (dict_info.wordcount == 0)
916         return false;
917 
918     ifo_file_name = dict_info.ifo_file_name;
919     wordcount = dict_info.wordcount;
920     syn_wordcount = dict_info.syn_wordcount;
921     bookname = dict_info.bookname;
922 
923     idxfilesize = dict_info.index_file_size;
924 
925     sametypesequence = dict_info.sametypesequence;
926 
927     return true;
928 }
929 
LookupWithRule(GPatternSpec * pspec,glong * aIndex,int iBuffLen)930 bool Dict::LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen)
931 {
932     int iIndexCount = 0;
933 
934     for (guint32 i = 0; i < narticles() && iIndexCount < (iBuffLen - 1); i++)
935         if (g_pattern_match_string(pspec, get_key(i)))
936             aIndex[iIndexCount++] = i;
937 
938     aIndex[iIndexCount] = -1; // -1 is the end.
939 
940     return iIndexCount > 0;
941 }
942 
~Libs()943 Libs::~Libs()
944 {
945     for (Dict *p : oLib)
946         delete p;
947 }
948 
load_dict(const std::string & url)949 void Libs::load_dict(const std::string &url)
950 {
951     Dict *lib = new Dict;
952     if (lib->load(url, verbose_))
953         oLib.push_back(lib);
954     else
955         delete lib;
956 }
957 
load(const std::list<std::string> & dicts_dirs,const std::list<std::string> & order_list,const std::list<std::string> & disable_list)958 void Libs::load(const std::list<std::string> &dicts_dirs,
959                 const std::list<std::string> &order_list,
960                 const std::list<std::string> &disable_list)
961 {
962     for_each_file(dicts_dirs, ".ifo", order_list, disable_list,
963                   [this](const std::string &url, bool disable) -> void {
964                       if (!disable)
965                           load_dict(url);
966                   });
967 }
968 
poGetCurrentWord(glong * iCurrent)969 const gchar *Libs::poGetCurrentWord(glong *iCurrent)
970 {
971     const gchar *poCurrentWord = nullptr;
972     const gchar *word;
973     for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) {
974         if (iCurrent[iLib] == INVALID_INDEX)
975             continue;
976         if (iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
977             continue;
978         if (poCurrentWord == nullptr) {
979             poCurrentWord = poGetWord(iCurrent[iLib], iLib);
980         } else {
981             word = poGetWord(iCurrent[iLib], iLib);
982 
983             if (stardict_strcmp(poCurrentWord, word) > 0)
984                 poCurrentWord = word;
985         }
986     }
987     return poCurrentWord;
988 }
989 
// Advances the per-dictionary cursors in `iCurrent` past the current word
// and returns the word the cursors point at afterwards.
//
// Two calling modes:
//   (word, iCurrent)    - the cursors are first positioned by looking `word`
//                         up in every dictionary; used by TopWin::NextCallback().
//   (nullptr, iCurrent) - the cursors are used as passed in; used by
//                         AppCore::ListWords().
// Returns nullptr when no cursor pointed at a valid entry to begin with.
const gchar *Libs::poGetNextWord(const gchar *sWord, glong *iCurrent)
{
    const gchar *lowest = nullptr;
    size_t lowest_lib = 0;
    const size_t lib_count = oLib.size();

    // Pass 1: find the smallest word under the cursors.
    for (size_t lib = 0; lib < lib_count; ++lib) {
        if (sWord)
            oLib[lib]->Lookup(sWord, iCurrent[lib]);

        const glong pos = iCurrent[lib];
        if (pos == INVALID_INDEX || pos >= narticles(lib) || pos < 0)
            continue;

        const gchar *candidate = poGetWord(pos, lib);
        if (lowest == nullptr || stardict_strcmp(lowest, candidate) > 0) {
            lowest = candidate;
            lowest_lib = lib;
        }
    }
    if (lowest == nullptr)
        return nullptr;

    // Pass 2: step over that word in the dictionary that supplied it and in
    // every other dictionary whose cursor sits on the byte-identical word.
    iCurrent[lowest_lib]++;
    for (size_t lib = 0; lib < lib_count; ++lib) {
        if (lib == lowest_lib)
            continue;

        const glong pos = iCurrent[lib];
        if (pos == INVALID_INDEX || pos >= narticles(lib) || pos < 0)
            continue;

        if (strcmp(lowest, poGetWord(pos, lib)) == 0)
            iCurrent[lib]++;
    }

    return poGetCurrentWord(iCurrent);
}
1034 
1035 const gchar *
poGetPreWord(glong * iCurrent)1036 Libs::poGetPreWord(glong *iCurrent)
1037 {
1038     // used by TopWin::PreviousCallback(); the iCurrent is cached by AppCore::TopWinWordChange();
1039     const gchar *poCurrentWord = nullptr;
1040     std::vector<Dict *>::size_type iCurrentLib = 0;
1041     const gchar *word;
1042 
1043     for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) {
1044         if (iCurrent[iLib] == INVALID_INDEX)
1045             iCurrent[iLib] = narticles(iLib);
1046         else {
1047             if (iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
1048                 continue;
1049         }
1050         if (poCurrentWord == nullptr) {
1051             poCurrentWord = poGetWord(iCurrent[iLib] - 1, iLib);
1052             iCurrentLib = iLib;
1053         } else {
1054             word = poGetWord(iCurrent[iLib] - 1, iLib);
1055             if (stardict_strcmp(poCurrentWord, word) < 0) {
1056                 poCurrentWord = word;
1057                 iCurrentLib = iLib;
1058             }
1059         }
1060     }
1061 
1062     if (poCurrentWord) {
1063         iCurrent[iCurrentLib]--;
1064         for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) {
1065             if (iLib == iCurrentLib)
1066                 continue;
1067             if (iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
1068                 continue;
1069             if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib] - 1, iLib)) == 0) {
1070                 iCurrent[iLib]--;
1071             } else {
1072                 if (iCurrent[iLib] == narticles(iLib))
1073                     iCurrent[iLib] = INVALID_INDEX;
1074             }
1075         }
1076     }
1077     return poCurrentWord;
1078 }
1079 
LookupSimilarWord(const gchar * sWord,glong & iWordIndex,int iLib)1080 bool Libs::LookupSimilarWord(const gchar *sWord, glong &iWordIndex, int iLib)
1081 {
1082     glong iIndex;
1083     bool bFound = false;
1084     gchar *casestr;
1085 
1086     if (!bFound) {
1087         // to lower case.
1088         casestr = g_utf8_strdown(sWord, -1);
1089         if (strcmp(casestr, sWord)) {
1090             if (oLib[iLib]->Lookup(casestr, iIndex))
1091                 bFound = true;
1092         }
1093         g_free(casestr);
1094         // to upper case.
1095         if (!bFound) {
1096             casestr = g_utf8_strup(sWord, -1);
1097             if (strcmp(casestr, sWord)) {
1098                 if (oLib[iLib]->Lookup(casestr, iIndex))
1099                     bFound = true;
1100             }
1101             g_free(casestr);
1102         }
1103         // Upper the first character and lower others.
1104         if (!bFound) {
1105             gchar *nextchar = g_utf8_next_char(sWord);
1106             gchar *firstchar = g_utf8_strup(sWord, nextchar - sWord);
1107             nextchar = g_utf8_strdown(nextchar, -1);
1108             casestr = g_strdup_printf("%s%s", firstchar, nextchar);
1109             g_free(firstchar);
1110             g_free(nextchar);
1111             if (strcmp(casestr, sWord)) {
1112                 if (oLib[iLib]->Lookup(casestr, iIndex))
1113                     bFound = true;
1114             }
1115             g_free(casestr);
1116         }
1117     }
1118 
1119     if (bIsPureEnglish(sWord)) {
1120         // If not Found , try other status of sWord.
1121         int iWordLen = strlen(sWord);
1122         bool isupcase;
1123 
1124         gchar *sNewWord = (gchar *)g_malloc(iWordLen + 1);
1125 
1126         //cut one char "s" or "d"
1127         if (!bFound && iWordLen > 1) {
1128             isupcase = sWord[iWordLen - 1] == 'S' || !strncmp(&sWord[iWordLen - 2], "ED", 2);
1129             if (isupcase || sWord[iWordLen - 1] == 's' || !strncmp(&sWord[iWordLen - 2], "ed", 2)) {
1130                 strcpy(sNewWord, sWord);
1131                 sNewWord[iWordLen - 1] = '\0'; // cut "s" or "d"
1132                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1133                     bFound = true;
1134                 else if (isupcase || g_ascii_isupper(sWord[0])) {
1135                     casestr = g_ascii_strdown(sNewWord, -1);
1136                     if (strcmp(casestr, sNewWord)) {
1137                         if (oLib[iLib]->Lookup(casestr, iIndex))
1138                             bFound = true;
1139                     }
1140                     g_free(casestr);
1141                 }
1142             }
1143         }
1144 
1145         //cut "ly"
1146         if (!bFound && iWordLen > 2) {
1147             isupcase = !strncmp(&sWord[iWordLen - 2], "LY", 2);
1148             if (isupcase || (!strncmp(&sWord[iWordLen - 2], "ly", 2))) {
1149                 strcpy(sNewWord, sWord);
1150                 sNewWord[iWordLen - 2] = '\0'; // cut "ly"
1151                 if (iWordLen > 5 && sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4]
1152                     && !bIsVowel(sNewWord[iWordLen - 4]) && bIsVowel(sNewWord[iWordLen - 5])) { //doubled
1153 
1154                     sNewWord[iWordLen - 3] = '\0';
1155                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1156                         bFound = true;
1157                     else {
1158                         if (isupcase || g_ascii_isupper(sWord[0])) {
1159                             casestr = g_ascii_strdown(sNewWord, -1);
1160                             if (strcmp(casestr, sNewWord)) {
1161                                 if (oLib[iLib]->Lookup(casestr, iIndex))
1162                                     bFound = true;
1163                             }
1164                             g_free(casestr);
1165                         }
1166                         if (!bFound)
1167                             sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore
1168                     }
1169                 }
1170                 if (!bFound) {
1171                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1172                         bFound = true;
1173                     else if (isupcase || g_ascii_isupper(sWord[0])) {
1174                         casestr = g_ascii_strdown(sNewWord, -1);
1175                         if (strcmp(casestr, sNewWord)) {
1176                             if (oLib[iLib]->Lookup(casestr, iIndex))
1177                                 bFound = true;
1178                         }
1179                         g_free(casestr);
1180                     }
1181                 }
1182             }
1183         }
1184 
1185         //cut "ing"
1186         if (!bFound && iWordLen > 3) {
1187             isupcase = !strncmp(&sWord[iWordLen - 3], "ING", 3);
1188             if (isupcase || !strncmp(&sWord[iWordLen - 3], "ing", 3)) {
1189                 strcpy(sNewWord, sWord);
1190                 sNewWord[iWordLen - 3] = '\0';
1191                 if (iWordLen > 6 && (sNewWord[iWordLen - 4] == sNewWord[iWordLen - 5])
1192                     && !bIsVowel(sNewWord[iWordLen - 5]) && bIsVowel(sNewWord[iWordLen - 6])) { //doubled
1193                     sNewWord[iWordLen - 4] = '\0';
1194                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1195                         bFound = true;
1196                     else {
1197                         if (isupcase || g_ascii_isupper(sWord[0])) {
1198                             casestr = g_ascii_strdown(sNewWord, -1);
1199                             if (strcmp(casestr, sNewWord)) {
1200                                 if (oLib[iLib]->Lookup(casestr, iIndex))
1201                                     bFound = true;
1202                             }
1203                             g_free(casestr);
1204                         }
1205                         if (!bFound)
1206                             sNewWord[iWordLen - 4] = sNewWord[iWordLen - 5]; //restore
1207                     }
1208                 }
1209                 if (!bFound) {
1210                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1211                         bFound = true;
1212                     else if (isupcase || g_ascii_isupper(sWord[0])) {
1213                         casestr = g_ascii_strdown(sNewWord, -1);
1214                         if (strcmp(casestr, sNewWord)) {
1215                             if (oLib[iLib]->Lookup(casestr, iIndex))
1216                                 bFound = true;
1217                         }
1218                         g_free(casestr);
1219                     }
1220                 }
1221                 if (!bFound) {
1222                     if (isupcase)
1223                         strcat(sNewWord, "E"); // add a char "E"
1224                     else
1225                         strcat(sNewWord, "e"); // add a char "e"
1226                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1227                         bFound = true;
1228                     else if (isupcase || g_ascii_isupper(sWord[0])) {
1229                         casestr = g_ascii_strdown(sNewWord, -1);
1230                         if (strcmp(casestr, sNewWord)) {
1231                             if (oLib[iLib]->Lookup(casestr, iIndex))
1232                                 bFound = true;
1233                         }
1234                         g_free(casestr);
1235                     }
1236                 }
1237             }
1238         }
1239 
1240         //cut two char "es"
1241         if (!bFound && iWordLen > 3) {
1242             isupcase = (!strncmp(&sWord[iWordLen - 2], "ES", 2) && (sWord[iWordLen - 3] == 'S' || sWord[iWordLen - 3] == 'X' || sWord[iWordLen - 3] == 'O' || (iWordLen > 4 && sWord[iWordLen - 3] == 'H' && (sWord[iWordLen - 4] == 'C' || sWord[iWordLen - 4] == 'S'))));
1243             if (isupcase || (!strncmp(&sWord[iWordLen - 2], "es", 2) && (sWord[iWordLen - 3] == 's' || sWord[iWordLen - 3] == 'x' || sWord[iWordLen - 3] == 'o' || (iWordLen > 4 && sWord[iWordLen - 3] == 'h' && (sWord[iWordLen - 4] == 'c' || sWord[iWordLen - 4] == 's'))))) {
1244                 strcpy(sNewWord, sWord);
1245                 sNewWord[iWordLen - 2] = '\0';
1246                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1247                     bFound = true;
1248                 else if (isupcase || g_ascii_isupper(sWord[0])) {
1249                     casestr = g_ascii_strdown(sNewWord, -1);
1250                     if (strcmp(casestr, sNewWord)) {
1251                         if (oLib[iLib]->Lookup(casestr, iIndex))
1252                             bFound = true;
1253                     }
1254                     g_free(casestr);
1255                 }
1256             }
1257         }
1258 
1259         //cut "ed"
1260         if (!bFound && iWordLen > 3) {
1261             isupcase = !strncmp(&sWord[iWordLen - 2], "ED", 2);
1262             if (isupcase || !strncmp(&sWord[iWordLen - 2], "ed", 2)) {
1263                 strcpy(sNewWord, sWord);
1264                 sNewWord[iWordLen - 2] = '\0';
1265                 if (iWordLen > 5 && (sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4])
1266                     && !bIsVowel(sNewWord[iWordLen - 4]) && bIsVowel(sNewWord[iWordLen - 5])) { //doubled
1267                     sNewWord[iWordLen - 3] = '\0';
1268                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1269                         bFound = true;
1270                     else {
1271                         if (isupcase || g_ascii_isupper(sWord[0])) {
1272                             casestr = g_ascii_strdown(sNewWord, -1);
1273                             if (strcmp(casestr, sNewWord)) {
1274                                 if (oLib[iLib]->Lookup(casestr, iIndex))
1275                                     bFound = true;
1276                             }
1277                             g_free(casestr);
1278                         }
1279                         if (!bFound)
1280                             sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore
1281                     }
1282                 }
1283                 if (!bFound) {
1284                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1285                         bFound = true;
1286                     else if (isupcase || g_ascii_isupper(sWord[0])) {
1287                         casestr = g_ascii_strdown(sNewWord, -1);
1288                         if (strcmp(casestr, sNewWord)) {
1289                             if (oLib[iLib]->Lookup(casestr, iIndex))
1290                                 bFound = true;
1291                         }
1292                         g_free(casestr);
1293                     }
1294                 }
1295             }
1296         }
1297 
1298         // cut "ied" , add "y".
1299         if (!bFound && iWordLen > 3) {
1300             isupcase = !strncmp(&sWord[iWordLen - 3], "IED", 3);
1301             if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ied", 3))) {
1302                 strcpy(sNewWord, sWord);
1303                 sNewWord[iWordLen - 3] = '\0';
1304                 if (isupcase)
1305                     strcat(sNewWord, "Y"); // add a char "Y"
1306                 else
1307                     strcat(sNewWord, "y"); // add a char "y"
1308                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1309                     bFound = true;
1310                 else if (isupcase || g_ascii_isupper(sWord[0])) {
1311                     casestr = g_ascii_strdown(sNewWord, -1);
1312                     if (strcmp(casestr, sNewWord)) {
1313                         if (oLib[iLib]->Lookup(casestr, iIndex))
1314                             bFound = true;
1315                     }
1316                     g_free(casestr);
1317                 }
1318             }
1319         }
1320 
1321         // cut "ies" , add "y".
1322         if (!bFound && iWordLen > 3) {
1323             isupcase = !strncmp(&sWord[iWordLen - 3], "IES", 3);
1324             if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ies", 3))) {
1325                 strcpy(sNewWord, sWord);
1326                 sNewWord[iWordLen - 3] = '\0';
1327                 if (isupcase)
1328                     strcat(sNewWord, "Y"); // add a char "Y"
1329                 else
1330                     strcat(sNewWord, "y"); // add a char "y"
1331                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1332                     bFound = true;
1333                 else if (isupcase || g_ascii_isupper(sWord[0])) {
1334                     casestr = g_ascii_strdown(sNewWord, -1);
1335                     if (strcmp(casestr, sNewWord)) {
1336                         if (oLib[iLib]->Lookup(casestr, iIndex))
1337                             bFound = true;
1338                     }
1339                     g_free(casestr);
1340                 }
1341             }
1342         }
1343 
1344         // cut "er".
1345         if (!bFound && iWordLen > 2) {
1346             isupcase = !strncmp(&sWord[iWordLen - 2], "ER", 2);
1347             if (isupcase || (!strncmp(&sWord[iWordLen - 2], "er", 2))) {
1348                 strcpy(sNewWord, sWord);
1349                 sNewWord[iWordLen - 2] = '\0';
1350                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1351                     bFound = true;
1352                 else if (isupcase || g_ascii_isupper(sWord[0])) {
1353                     casestr = g_ascii_strdown(sNewWord, -1);
1354                     if (strcmp(casestr, sNewWord)) {
1355                         if (oLib[iLib]->Lookup(casestr, iIndex))
1356                             bFound = true;
1357                     }
1358                     g_free(casestr);
1359                 }
1360             }
1361         }
1362 
1363         // cut "est".
1364         if (!bFound && iWordLen > 3) {
1365             isupcase = !strncmp(&sWord[iWordLen - 3], "EST", 3);
1366             if (isupcase || (!strncmp(&sWord[iWordLen - 3], "est", 3))) {
1367                 strcpy(sNewWord, sWord);
1368                 sNewWord[iWordLen - 3] = '\0';
1369                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1370                     bFound = true;
1371                 else if (isupcase || g_ascii_isupper(sWord[0])) {
1372                     casestr = g_ascii_strdown(sNewWord, -1);
1373                     if (strcmp(casestr, sNewWord)) {
1374                         if (oLib[iLib]->Lookup(casestr, iIndex))
1375                             bFound = true;
1376                     }
1377                     g_free(casestr);
1378                 }
1379             }
1380         }
1381 
1382         g_free(sNewWord);
1383     }
1384 
1385     if (bFound)
1386         iWordIndex = iIndex;
1387 #if 0
1388 	else {
1389 		//don't change iWordIndex here.
1390 		//when LookupSimilarWord all failed too, we want to use the old LookupWord index to list words.
1391 		//iWordIndex = INVALID_INDEX;
1392 	}
1393 #endif
1394     return bFound;
1395 }
1396 
SimpleLookupWord(const gchar * sWord,glong & iWordIndex,int iLib)1397 bool Libs::SimpleLookupWord(const gchar *sWord, glong &iWordIndex, int iLib)
1398 {
1399     bool bFound = oLib[iLib]->Lookup(sWord, iWordIndex);
1400     if (!bFound && fuzzy_)
1401         bFound = LookupSimilarWord(sWord, iWordIndex, iLib);
1402     return bFound;
1403 }
1404 
LookupWithFuzzy(const gchar * sWord,gchar * reslist[],gint reslist_size)1405 bool Libs::LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size)
1406 {
1407     if (sWord[0] == '\0')
1408         return false;
1409 
1410     Fuzzystruct oFuzzystruct[reslist_size];
1411 
1412     for (int i = 0; i < reslist_size; i++) {
1413         oFuzzystruct[i].pMatchWord = nullptr;
1414         oFuzzystruct[i].iMatchWordDistance = iMaxFuzzyDistance;
1415     }
1416     int iMaxDistance = iMaxFuzzyDistance;
1417     int iDistance;
1418     bool Found = false;
1419     EditDistance oEditDistance;
1420 
1421     glong iCheckWordLen;
1422     const char *sCheck;
1423     gunichar *ucs4_str1, *ucs4_str2;
1424     glong ucs4_str2_len;
1425 
1426     ucs4_str2 = g_utf8_to_ucs4_fast(sWord, -1, &ucs4_str2_len);
1427     unicode_strdown(ucs4_str2);
1428 
1429     for (size_t iLib = 0; iLib < oLib.size(); ++iLib) {
1430         if (progress_func)
1431             progress_func();
1432 
1433         //if (stardict_strcmp(sWord, poGetWord(0,iLib))>=0 && stardict_strcmp(sWord, poGetWord(narticles(iLib)-1,iLib))<=0) {
1434         //there are Chinese dicts and English dicts...
1435 
1436         const int iwords = narticles(iLib);
1437         for (int index = 0; index < iwords; index++) {
1438             sCheck = poGetWord(index, iLib);
1439             // tolower and skip too long or too short words
1440             iCheckWordLen = g_utf8_strlen(sCheck, -1);
1441             if (iCheckWordLen - ucs4_str2_len >= iMaxDistance || ucs4_str2_len - iCheckWordLen >= iMaxDistance)
1442                 continue;
1443             ucs4_str1 = g_utf8_to_ucs4_fast(sCheck, -1, nullptr);
1444             if (iCheckWordLen > ucs4_str2_len)
1445                 ucs4_str1[ucs4_str2_len] = 0;
1446             unicode_strdown(ucs4_str1);
1447 
1448             iDistance = oEditDistance.CalEditDistance(ucs4_str1, ucs4_str2, iMaxDistance);
1449             g_free(ucs4_str1);
1450             if (iDistance < iMaxDistance && iDistance < ucs4_str2_len) {
1451                 // when ucs4_str2_len=1,2 we need less fuzzy.
1452                 Found = true;
1453                 bool bAlreadyInList = false;
1454                 int iMaxDistanceAt = 0;
1455                 for (int j = 0; j < reslist_size; j++) {
1456                     if (oFuzzystruct[j].pMatchWord && strcmp(oFuzzystruct[j].pMatchWord, sCheck) == 0) { //already in list
1457                         bAlreadyInList = true;
1458                         break;
1459                     }
1460                     //find the position,it will certainly be found (include the first time) as iMaxDistance is set by last time.
1461                     if (oFuzzystruct[j].iMatchWordDistance == iMaxDistance) {
1462                         iMaxDistanceAt = j;
1463                     }
1464                 }
1465                 if (!bAlreadyInList) {
1466                     if (oFuzzystruct[iMaxDistanceAt].pMatchWord)
1467                         g_free(oFuzzystruct[iMaxDistanceAt].pMatchWord);
1468                     oFuzzystruct[iMaxDistanceAt].pMatchWord = g_strdup(sCheck);
1469                     oFuzzystruct[iMaxDistanceAt].iMatchWordDistance = iDistance;
1470                     // calc new iMaxDistance
1471                     iMaxDistance = iDistance;
1472                     for (int j = 0; j < reslist_size; j++) {
1473                         if (oFuzzystruct[j].iMatchWordDistance > iMaxDistance)
1474                             iMaxDistance = oFuzzystruct[j].iMatchWordDistance;
1475                     } // calc new iMaxDistance
1476                 } // add to list
1477             } // find one
1478         } // each word
1479 
1480     } // each lib
1481     g_free(ucs4_str2);
1482 
1483     if (Found) // sort with distance
1484         std::sort(oFuzzystruct, oFuzzystruct + reslist_size, [](const Fuzzystruct &lh, const Fuzzystruct &rh) -> bool {
1485             if (lh.iMatchWordDistance != rh.iMatchWordDistance)
1486                 return lh.iMatchWordDistance < rh.iMatchWordDistance;
1487 
1488             if (lh.pMatchWord && rh.pMatchWord)
1489                 return stardict_strcmp(lh.pMatchWord, rh.pMatchWord) < 0;
1490 
1491             return false;
1492         });
1493 
1494     for (gint i = 0; i < reslist_size; ++i)
1495         reslist[i] = oFuzzystruct[i].pMatchWord;
1496 
1497     return Found;
1498 }
1499 
LookupWithRule(const gchar * word,gchar ** ppMatchWord)1500 gint Libs::LookupWithRule(const gchar *word, gchar **ppMatchWord)
1501 {
1502     glong aiIndex[MAX_MATCH_ITEM_PER_LIB + 1];
1503     gint iMatchCount = 0;
1504     GPatternSpec *pspec = g_pattern_spec_new(word);
1505 
1506     for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) {
1507         //if(oLibs.LookdupWordsWithRule(pspec,aiIndex,MAX_MATCH_ITEM_PER_LIB+1-iMatchCount,iLib))
1508         // -iMatchCount,so save time,but may got less result and the word may repeat.
1509 
1510         if (oLib[iLib]->LookupWithRule(pspec, aiIndex, MAX_MATCH_ITEM_PER_LIB + 1)) {
1511             if (progress_func)
1512                 progress_func();
1513             for (int i = 0; aiIndex[i] != -1; i++) {
1514                 const gchar *sMatchWord = poGetWord(aiIndex[i], iLib);
1515                 bool bAlreadyInList = false;
1516                 for (int j = 0; j < iMatchCount; j++) {
1517                     if (strcmp(ppMatchWord[j], sMatchWord) == 0) { //already in list
1518                         bAlreadyInList = true;
1519                         break;
1520                     }
1521                 }
1522                 if (!bAlreadyInList)
1523                     ppMatchWord[iMatchCount++] = g_strdup(sMatchWord);
1524             }
1525         }
1526     }
1527     g_pattern_spec_free(pspec);
1528 
1529     if (iMatchCount) // sort it.
1530         std::sort(ppMatchWord, ppMatchWord + iMatchCount, [](const char *lh, const char *rh) -> bool {
1531             return stardict_strcmp(lh, rh) < 0;
1532         });
1533 
1534     return iMatchCount;
1535 }
1536 
LookupData(const gchar * sWord,std::vector<gchar * > * reslist)1537 bool Libs::LookupData(const gchar *sWord, std::vector<gchar *> *reslist)
1538 {
1539     std::vector<std::string> SearchWords;
1540     std::string SearchWord;
1541     const char *p = sWord;
1542     while (*p) {
1543         if (*p == '\\') {
1544             p++;
1545             switch (*p) {
1546             case ' ':
1547                 SearchWord += ' ';
1548                 break;
1549             case '\\':
1550                 SearchWord += '\\';
1551                 break;
1552             case 't':
1553                 SearchWord += '\t';
1554                 break;
1555             case 'n':
1556                 SearchWord += '\n';
1557                 break;
1558             default:
1559                 SearchWord += *p;
1560             }
1561         } else if (*p == ' ') {
1562             if (!SearchWord.empty()) {
1563                 SearchWords.push_back(SearchWord);
1564                 SearchWord.clear();
1565             }
1566         } else {
1567             SearchWord += *p;
1568         }
1569         p++;
1570     }
1571     if (!SearchWord.empty()) {
1572         SearchWords.push_back(SearchWord);
1573         SearchWord.clear();
1574     }
1575     if (SearchWords.empty())
1576         return false;
1577 
1578     guint32 max_size = 0;
1579     gchar *origin_data = nullptr;
1580     for (std::vector<Dict *>::size_type i = 0; i < oLib.size(); ++i) {
1581         if (!oLib[i]->containSearchData())
1582             continue;
1583         if (progress_func)
1584             progress_func();
1585         const gulong iwords = narticles(i);
1586         const gchar *key;
1587         guint32 offset, size;
1588         for (gulong j = 0; j < iwords; ++j) {
1589             oLib[i]->get_key_and_data(j, &key, &offset, &size);
1590             if (size > max_size) {
1591                 origin_data = (gchar *)g_realloc(origin_data, size);
1592                 max_size = size;
1593             }
1594             if (oLib[i]->SearchData(SearchWords, offset, size, origin_data))
1595                 reslist[i].push_back(g_strdup(key));
1596         }
1597     }
1598     g_free(origin_data);
1599 
1600     std::vector<Dict *>::size_type i;
1601     for (i = 0; i < oLib.size(); ++i)
1602         if (!reslist[i].empty())
1603             break;
1604 
1605     return i != oLib.size();
1606 }
1607 
1608 /**************************************************/
analyze_query(const char * s,std::string & res)1609 query_t analyze_query(const char *s, std::string &res)
1610 {
1611     if (!s || !*s) {
1612         res = "";
1613         return qtSIMPLE;
1614     }
1615     if (*s == '/') {
1616         res = s + 1;
1617         return qtFUZZY;
1618     }
1619 
1620     if (*s == '|') {
1621         res = s + 1;
1622         return qtDATA;
1623     }
1624 
1625     bool regexp = false;
1626     const char *p = s;
1627     res = "";
1628     for (; *p; res += *p, ++p) {
1629         if (*p == '\\') {
1630             ++p;
1631             if (!*p)
1632                 break;
1633             continue;
1634         }
1635         if (*p == '*' || *p == '?')
1636             regexp = true;
1637     }
1638     if (regexp)
1639         return qtREGEXP;
1640 
1641     return qtSIMPLE;
1642 }
1643