1 #ifdef HAVE_CONFIG_H
2 #include "config.h"
3 #endif
4
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstring>
#include <memory>
#include <stdexcept>
9
10 #include <glib/gstdio.h>
11 #include <sys/stat.h>
12 #include <zlib.h>
13
14 #include "distance.hpp"
15 #include "mapfile.hpp"
16 #include "utils.hpp"
17
18 #include "stardict_lib.hpp"
19
// Two-step stringification: the extra level lets macro arguments such as
// __LINE__ expand before they are turned into a string literal.
#define TO_STR2(xstr) #xstr
#define TO_STR1(xstr) TO_STR2(xstr)

// Runtime check: asserts in debug builds and throws std::runtime_error (with
// the failed expression and source location) when assertions are compiled out.
// The expression is evaluated exactly once, so expressions with side effects
// behave the same whether or not assertions are enabled.
#define THROW_IF_ERROR(expr) \
    do { \
        const bool throw_if_error_ok_ = static_cast<bool>(expr); \
        assert(throw_if_error_ok_ && #expr); \
        if (!throw_if_error_ok_) \
            throw std::runtime_error(#expr " not true at " __FILE__ ": " TO_STR1(__LINE__)); \
    } while (false)
29
30 // Notice: read src/tools/DICTFILE_FORMAT for the dictionary
31 // file's format information!
32
33 namespace
34 {
// One fuzzy-match candidate.
// NOTE(review): the code that fills and reads this struct is outside this
// chunk; presumably the distance is the edit distance to the query - confirm.
struct Fuzzystruct {
    // Candidate word (ownership is decided by the code using the struct).
    char *pMatchWord;
    // Distance value associated with pMatchWord.
    int iMatchWordDistance;
};
39
bIsVowel(gchar inputchar)40 static inline bool bIsVowel(gchar inputchar)
41 {
42 gchar ch = g_ascii_toupper(inputchar);
43 return (ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U');
44 }
45
bIsPureEnglish(const gchar * str)46 static bool bIsPureEnglish(const gchar *str)
47 {
48 // i think this should work even when it is UTF8 string :).
49 for (int i = 0; str[i] != 0; i++)
50 //if(str[i]<0)
51 //if(str[i]<32 || str[i]>126) // tab equal 9,so this is not OK.
52 // Better use isascii() but not str[i]<0 while char is default unsigned in arm
53 if (!isascii(str[i]))
54 return false;
55 return true;
56 }
57
stardict_strcmp(const gchar * s1,const gchar * s2)58 static inline gint stardict_strcmp(const gchar *s1, const gchar *s2)
59 {
60 const gint a = g_ascii_strcasecmp(s1, s2);
61 if (a == 0)
62 return strcmp(s1, s2);
63 else
64 return a;
65 }
66
unicode_strdown(gunichar * str)67 static void unicode_strdown(gunichar *str)
68 {
69 while (*str) {
70 *str = g_unichar_tolower(*str);
71 ++str;
72 }
73 }
74 }
75
load_from_ifo_file(const std::string & ifofilename,bool istreedict)76 bool DictInfo::load_from_ifo_file(const std::string &ifofilename,
77 bool istreedict)
78 {
79 ifo_file_name = ifofilename;
80 glib::CharStr buffer;
81 if (!g_file_get_contents(ifofilename.c_str(), get_addr(buffer), nullptr, nullptr))
82 return false;
83
84 static const char TREEDICT_MAGIC_DATA[] = "StarDict's treedict ifo file";
85 static const char DICT_MAGIC_DATA[] = "StarDict's dict ifo file";
86
87 const gchar *magic_data = istreedict ? TREEDICT_MAGIC_DATA : DICT_MAGIC_DATA;
88 static const unsigned char utf8_bom[] = { 0xEF, 0xBB, 0xBF, '\0' };
89 if (!g_str_has_prefix(
90 g_str_has_prefix(get_impl(buffer), (const gchar *)(utf8_bom)) ? get_impl(buffer) + 3 : get_impl(buffer),
91 magic_data)) {
92 return false;
93 }
94
95 gchar *p1 = get_impl(buffer) + strlen(magic_data) - 1;
96
97 gchar *p2 = strstr(p1, "\nwordcount=");
98 if (p2 == nullptr)
99 return false;
100
101 gchar *p3 = strchr(p2 + sizeof("\nwordcount=") - 1, '\n');
102
103 wordcount = atol(std::string(p2 + sizeof("\nwordcount=") - 1, p3 - (p2 + sizeof("\nwordcount=") - 1)).c_str());
104
105 if (istreedict) {
106 p2 = strstr(p1, "\ntdxfilesize=");
107 if (p2 == nullptr)
108 return false;
109
110 p3 = strchr(p2 + sizeof("\ntdxfilesize=") - 1, '\n');
111
112 index_file_size = atol(std::string(p2 + sizeof("\ntdxfilesize=") - 1, p3 - (p2 + sizeof("\ntdxfilesize=") - 1)).c_str());
113
114 } else {
115
116 p2 = strstr(p1, "\nidxfilesize=");
117 if (p2 == nullptr)
118 return false;
119
120 p3 = strchr(p2 + sizeof("\nidxfilesize=") - 1, '\n');
121 index_file_size = atol(std::string(p2 + sizeof("\nidxfilesize=") - 1, p3 - (p2 + sizeof("\nidxfilesize=") - 1)).c_str());
122 }
123
124 p2 = strstr(p1, "\nbookname=");
125
126 if (p2 == nullptr)
127 return false;
128
129 p2 = p2 + sizeof("\nbookname=") - 1;
130 p3 = strchr(p2, '\n');
131 bookname.assign(p2, p3 - p2);
132
133 p2 = strstr(p1, "\nauthor=");
134 if (p2) {
135 p2 = p2 + sizeof("\nauthor=") - 1;
136 p3 = strchr(p2, '\n');
137 author.assign(p2, p3 - p2);
138 }
139
140 p2 = strstr(p1, "\nemail=");
141 if (p2) {
142 p2 = p2 + sizeof("\nemail=") - 1;
143 p3 = strchr(p2, '\n');
144 email.assign(p2, p3 - p2);
145 }
146
147 p2 = strstr(p1, "\nwebsite=");
148 if (p2) {
149 p2 = p2 + sizeof("\nwebsite=") - 1;
150 p3 = strchr(p2, '\n');
151 website.assign(p2, p3 - p2);
152 }
153
154 p2 = strstr(p1, "\ndate=");
155 if (p2) {
156 p2 = p2 + sizeof("\ndate=") - 1;
157 p3 = strchr(p2, '\n');
158 date.assign(p2, p3 - p2);
159 }
160
161 p2 = strstr(p1, "\ndescription=");
162 if (p2) {
163 p2 = p2 + sizeof("\ndescription=") - 1;
164 p3 = strchr(p2, '\n');
165 description.assign(p2, p3 - p2);
166 }
167
168 p2 = strstr(p1, "\nsametypesequence=");
169 if (p2) {
170 p2 += sizeof("\nsametypesequence=") - 1;
171 p3 = strchr(p2, '\n');
172 sametypesequence.assign(p2, p3 - p2);
173 }
174
175 p2 = strstr(p1, "\nsynwordcount=");
176 syn_wordcount = 0;
177 if (p2) {
178 p2 += sizeof("\nsynwordcount=") - 1;
179 p3 = strchr(p2, '\n');
180 syn_wordcount = atol(std::string(p2, p3 - p2).c_str());
181 }
182
183 return true;
184 }
185
GetWordData(guint32 idxitem_offset,guint32 idxitem_size)186 gchar *DictBase::GetWordData(guint32 idxitem_offset, guint32 idxitem_size)
187 {
188 for (int i = 0; i < WORDDATA_CACHE_NUM; i++)
189 if (cache[i].data && cache[i].offset == idxitem_offset)
190 return cache[i].data;
191
192 if (dictfile)
193 fseek(dictfile, idxitem_offset, SEEK_SET);
194
195 gchar *data;
196 if (!sametypesequence.empty()) {
197 glib::CharStr origin_data((gchar *)g_malloc(idxitem_size));
198
199 if (dictfile) {
200 const size_t nitems = fread(get_impl(origin_data), idxitem_size, 1, dictfile);
201 THROW_IF_ERROR(nitems == 1);
202 } else
203 dictdzfile->read(get_impl(origin_data), idxitem_offset, idxitem_size);
204
205 guint32 data_size;
206 gint sametypesequence_len = sametypesequence.length();
207 //there have sametypesequence_len char being omitted.
208 data_size = idxitem_size + sizeof(guint32) + sametypesequence_len;
209 //if the last item's size is determined by the end up '\0',then +=sizeof(gchar);
210 //if the last item's size is determined by the head guint32 type data,then +=sizeof(guint32);
211 switch (sametypesequence[sametypesequence_len - 1]) {
212 case 'm':
213 case 't':
214 case 'y':
215 case 'l':
216 case 'g':
217 case 'x':
218 case 'k':
219 data_size += sizeof(gchar);
220 break;
221 case 'W':
222 case 'P':
223 data_size += sizeof(guint32);
224 break;
225 default:
226 if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1]))
227 data_size += sizeof(guint32);
228 else
229 data_size += sizeof(gchar);
230 break;
231 }
232 data = (gchar *)g_malloc(data_size);
233 gchar *p1, *p2;
234 p1 = data + sizeof(guint32);
235 p2 = get_impl(origin_data);
236 guint32 sec_size;
237 //copy the head items.
238 for (int i = 0; i < sametypesequence_len - 1; i++) {
239 *p1 = sametypesequence[i];
240 p1 += sizeof(gchar);
241 switch (sametypesequence[i]) {
242 case 'm':
243 case 't':
244 case 'y':
245 case 'l':
246 case 'g':
247 case 'x':
248 case 'k':
249 sec_size = strlen(p2) + 1;
250 memcpy(p1, p2, sec_size);
251 p1 += sec_size;
252 p2 += sec_size;
253 break;
254 case 'W':
255 case 'P':
256 sec_size = get_uint32(p2);
257 sec_size += sizeof(guint32);
258 memcpy(p1, p2, sec_size);
259 p1 += sec_size;
260 p2 += sec_size;
261 break;
262 default:
263 if (g_ascii_isupper(sametypesequence[i])) {
264 sec_size = get_uint32(p2);
265 sec_size += sizeof(guint32);
266 } else {
267 sec_size = strlen(p2) + 1;
268 }
269 memcpy(p1, p2, sec_size);
270 p1 += sec_size;
271 p2 += sec_size;
272 break;
273 }
274 }
275 //calculate the last item 's size.
276 sec_size = idxitem_size - (p2 - get_impl(origin_data));
277 *p1 = sametypesequence[sametypesequence_len - 1];
278 p1 += sizeof(gchar);
279 switch (sametypesequence[sametypesequence_len - 1]) {
280 case 'm':
281 case 't':
282 case 'y':
283 case 'l':
284 case 'g':
285 case 'x':
286 case 'k':
287 memcpy(p1, p2, sec_size);
288 p1 += sec_size;
289 *p1 = '\0'; //add the end up '\0';
290 break;
291 case 'W':
292 case 'P':
293 set_uint32(p1, sec_size);
294 p1 += sizeof(guint32);
295 memcpy(p1, p2, sec_size);
296 break;
297 default:
298 if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1])) {
299 set_uint32(p1, sec_size);
300 p1 += sizeof(guint32);
301 memcpy(p1, p2, sec_size);
302 } else {
303 memcpy(p1, p2, sec_size);
304 p1 += sec_size;
305 *p1 = '\0';
306 }
307 break;
308 }
309 set_uint32(data, data_size);
310 } else {
311 data = (gchar *)g_malloc(idxitem_size + sizeof(guint32));
312 if (dictfile) {
313 const size_t nitems = fread(data + sizeof(guint32), idxitem_size, 1, dictfile);
314 THROW_IF_ERROR(nitems == 1);
315 } else
316 dictdzfile->read(data + sizeof(guint32), idxitem_offset, idxitem_size);
317 set_uint32(data, idxitem_size + sizeof(guint32));
318 }
319 g_free(cache[cache_cur].data);
320
321 cache[cache_cur].data = data;
322 cache[cache_cur].offset = idxitem_offset;
323 cache_cur++;
324 if (cache_cur == WORDDATA_CACHE_NUM)
325 cache_cur = 0;
326 return data;
327 }
328
SearchData(std::vector<std::string> & SearchWords,guint32 idxitem_offset,guint32 idxitem_size,gchar * origin_data)329 bool DictBase::SearchData(std::vector<std::string> &SearchWords, guint32 idxitem_offset, guint32 idxitem_size, gchar *origin_data)
330 {
331 int nWord = SearchWords.size();
332 std::vector<bool> WordFind(nWord, false);
333 int nfound = 0;
334
335 if (dictfile)
336 fseek(dictfile, idxitem_offset, SEEK_SET);
337 if (dictfile) {
338 const size_t nitems = fread(origin_data, idxitem_size, 1, dictfile);
339 THROW_IF_ERROR(nitems == 1);
340 } else
341 dictdzfile->read(origin_data, idxitem_offset, idxitem_size);
342 gchar *p = origin_data;
343 guint32 sec_size;
344 int j;
345 if (!sametypesequence.empty()) {
346 gint sametypesequence_len = sametypesequence.length();
347 for (int i = 0; i < sametypesequence_len - 1; i++) {
348 switch (sametypesequence[i]) {
349 case 'm':
350 case 't':
351 case 'y':
352 case 'l':
353 case 'g':
354 case 'x':
355 case 'k':
356 for (j = 0; j < nWord; j++)
357 if (!WordFind[j] && strstr(p, SearchWords[j].c_str())) {
358 WordFind[j] = true;
359 ++nfound;
360 }
361
362 if (nfound == nWord)
363 return true;
364 sec_size = strlen(p) + 1;
365 p += sec_size;
366 break;
367 default:
368 if (g_ascii_isupper(sametypesequence[i])) {
369 sec_size = get_uint32(p);
370 sec_size += sizeof(guint32);
371 } else {
372 sec_size = strlen(p) + 1;
373 }
374 p += sec_size;
375 }
376 }
377 switch (sametypesequence[sametypesequence_len - 1]) {
378 case 'm':
379 case 't':
380 case 'y':
381 case 'l':
382 case 'g':
383 case 'x':
384 case 'k':
385 sec_size = idxitem_size - (p - origin_data);
386 for (j = 0; j < nWord; j++)
387 if (!WordFind[j] && g_strstr_len(p, sec_size, SearchWords[j].c_str())) {
388 WordFind[j] = true;
389 ++nfound;
390 }
391
392 if (nfound == nWord)
393 return true;
394 break;
395 }
396 } else {
397 while (guint32(p - origin_data) < idxitem_size) {
398 switch (*p) {
399 case 'm':
400 case 't':
401 case 'y':
402 case 'l':
403 case 'g':
404 case 'x':
405 case 'k':
406 for (j = 0; j < nWord; j++)
407 if (!WordFind[j] && strstr(p, SearchWords[j].c_str())) {
408 WordFind[j] = true;
409 ++nfound;
410 }
411
412 if (nfound == nWord)
413 return true;
414 sec_size = strlen(p) + 1;
415 p += sec_size;
416 break;
417 default:
418 if (g_ascii_isupper(*p)) {
419 sec_size = get_uint32(p);
420 sec_size += sizeof(guint32);
421 } else {
422 sec_size = strlen(p) + 1;
423 }
424 p += sec_size;
425 }
426 }
427 }
428 return false;
429 }
430
431 namespace
432 {
433 class OffsetIndex : public IIndexFile
434 {
435 public:
OffsetIndex()436 OffsetIndex()
437 : idxfile(nullptr)
438 {
439 }
~OffsetIndex()440 ~OffsetIndex()
441 {
442 if (idxfile)
443 fclose(idxfile);
444 }
445 bool load(const std::string &url, gulong wc, gulong fsize, bool verbose) override;
446 const gchar *get_key(glong idx) override;
get_data(glong idx)447 void get_data(glong idx) override { get_key(idx); }
get_key_and_data(glong idx)448 const gchar *get_key_and_data(glong idx) override
449 {
450 return get_key(idx);
451 }
452 bool lookup(const char *str, glong &idx) override;
453
454 private:
455 static const gint ENTR_PER_PAGE = 32;
456 static const char *CACHE_MAGIC;
457
458 std::vector<guint32> wordoffset;
459 FILE *idxfile;
460 gulong wordcount;
461
462 gchar wordentry_buf[256 + sizeof(guint32) * 2]; // The length of "word_str" should be less than 256. See src/tools/DICTFILE_FORMAT.
463 struct index_entry {
464 glong idx;
465 std::string keystr;
assign__anon48ed784c0211::OffsetIndex::index_entry466 void assign(glong i, const std::string &str)
467 {
468 idx = i;
469 keystr.assign(str);
470 }
471 };
472 index_entry first, last, middle, real_last;
473
474 struct page_entry {
475 gchar *keystr;
476 guint32 off, size;
477 };
478 std::vector<gchar> page_data;
479 struct page_t {
480 glong idx = -1;
481 page_entry entries[ENTR_PER_PAGE];
482
page_t__anon48ed784c0211::OffsetIndex::page_t483 page_t() {}
484 void fill(gchar *data, gint nent, glong idx_);
485 } page;
486 gulong load_page(glong page_idx);
487 const gchar *read_first_on_page_key(glong page_idx);
488 const gchar *get_first_on_page_key(glong page_idx);
489 bool load_cache(const std::string &url);
490 bool save_cache(const std::string &url, bool verbose);
491 static std::list<std::string> get_cache_variant(const std::string &url);
492 };
493
494 const char *OffsetIndex::CACHE_MAGIC = "StarDict's Cache, Version: 0.1";
495
496 class WordListIndex : public IIndexFile
497 {
498 public:
WordListIndex()499 WordListIndex()
500 : idxdatabuf(nullptr)
501 {
502 }
~WordListIndex()503 ~WordListIndex() { g_free(idxdatabuf); }
504 bool load(const std::string &url, gulong wc, gulong fsize, bool verbose) override;
get_key(glong idx)505 const gchar *get_key(glong idx) override { return wordlist[idx]; }
506 void get_data(glong idx) override;
get_key_and_data(glong idx)507 const gchar *get_key_and_data(glong idx) override
508 {
509 get_data(idx);
510 return get_key(idx);
511 }
512 bool lookup(const char *str, glong &idx) override;
513
514 private:
515 gchar *idxdatabuf;
516 std::vector<gchar *> wordlist;
517 };
518
fill(gchar * data,gint nent,glong idx_)519 void OffsetIndex::page_t::fill(gchar *data, gint nent, glong idx_)
520 {
521 idx = idx_;
522 gchar *p = data;
523 glong len;
524 for (gint i = 0; i < nent; ++i) {
525 entries[i].keystr = p;
526 len = strlen(p);
527 p += len + 1;
528 entries[i].off = g_ntohl(get_uint32(p));
529 p += sizeof(guint32);
530 entries[i].size = g_ntohl(get_uint32(p));
531 p += sizeof(guint32);
532 }
533 }
534
read_first_on_page_key(glong page_idx)535 inline const gchar *OffsetIndex::read_first_on_page_key(glong page_idx)
536 {
537 fseek(idxfile, wordoffset[page_idx], SEEK_SET);
538 guint32 page_size = wordoffset[page_idx + 1] - wordoffset[page_idx];
539 const size_t nitems = fread(wordentry_buf,
540 std::min(sizeof(wordentry_buf), static_cast<size_t>(page_size)),
541 1, idxfile);
542 THROW_IF_ERROR(nitems == 1);
543 //TODO: check returned values, deal with word entry that strlen>255.
544 return wordentry_buf;
545 }
546
get_first_on_page_key(glong page_idx)547 inline const gchar *OffsetIndex::get_first_on_page_key(glong page_idx)
548 {
549 if (page_idx < middle.idx) {
550 if (page_idx == first.idx)
551 return first.keystr.c_str();
552 return read_first_on_page_key(page_idx);
553 } else if (page_idx > middle.idx) {
554 if (page_idx == last.idx)
555 return last.keystr.c_str();
556 return read_first_on_page_key(page_idx);
557 } else
558 return middle.keystr.c_str();
559 }
560
load_cache(const std::string & url)561 bool OffsetIndex::load_cache(const std::string &url)
562 {
563 const std::list<std::string> vars = get_cache_variant(url);
564
565 for (const std::string &item : vars) {
566 struct ::stat idxstat, cachestat;
567 if (g_stat(url.c_str(), &idxstat) != 0 || g_stat(item.c_str(), &cachestat) != 0)
568 continue;
569 if (cachestat.st_mtime < idxstat.st_mtime)
570 continue;
571 MapFile mf;
572 if (!mf.open(item.c_str(), cachestat.st_size))
573 continue;
574 if (strncmp(mf.begin(), CACHE_MAGIC, strlen(CACHE_MAGIC)) != 0)
575 continue;
576 memcpy(&wordoffset[0], mf.begin() + strlen(CACHE_MAGIC), wordoffset.size() * sizeof(wordoffset[0]));
577 return true;
578 }
579
580 return false;
581 }
582
get_cache_variant(const std::string & url)583 std::list<std::string> OffsetIndex::get_cache_variant(const std::string &url)
584 {
585 std::list<std::string> res = { url + ".oft" };
586 if (!g_file_test(g_get_user_cache_dir(), G_FILE_TEST_EXISTS) && g_mkdir(g_get_user_cache_dir(), 0700) == -1)
587 return res;
588
589 const std::string cache_dir = std::string(g_get_user_cache_dir()) + G_DIR_SEPARATOR_S + "sdcv";
590
591 if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_EXISTS)) {
592 if (g_mkdir(cache_dir.c_str(), 0700) == -1)
593 return res;
594 } else if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_IS_DIR))
595 return res;
596
597 gchar *base = g_path_get_basename(url.c_str());
598 res.push_back(cache_dir + G_DIR_SEPARATOR_S + base + ".oft");
599 g_free(base);
600 return res;
601 }
602
save_cache(const std::string & url,bool verbose)603 bool OffsetIndex::save_cache(const std::string &url, bool verbose)
604 {
605 const std::list<std::string> vars = get_cache_variant(url);
606 for (const std::string &item : vars) {
607 FILE *out = fopen(item.c_str(), "wb");
608 if (!out)
609 continue;
610 if (fwrite(CACHE_MAGIC, 1, strlen(CACHE_MAGIC), out) != strlen(CACHE_MAGIC))
611 continue;
612 if (fwrite(&wordoffset[0], sizeof(wordoffset[0]), wordoffset.size(), out) != wordoffset.size())
613 continue;
614 fclose(out);
615 if (verbose) {
616 printf("save to cache %s\n", url.c_str());
617 }
618 return true;
619 }
620 return false;
621 }
622
load(const std::string & url,gulong wc,gulong fsize,bool verbose)623 bool OffsetIndex::load(const std::string &url, gulong wc, gulong fsize, bool verbose)
624 {
625 wordcount = wc;
626 gulong npages = (wc - 1) / ENTR_PER_PAGE + 2;
627 wordoffset.resize(npages);
628 if (!load_cache(url)) { //map file will close after finish of block
629 MapFile map_file;
630 if (!map_file.open(url.c_str(), fsize))
631 return false;
632 const gchar *idxdatabuffer = map_file.begin();
633
634 const gchar *p1 = idxdatabuffer;
635 gulong index_size;
636 guint32 j = 0;
637 for (guint32 i = 0; i < wc; i++) {
638 index_size = strlen(p1) + 1 + 2 * sizeof(guint32);
639 if (i % ENTR_PER_PAGE == 0) {
640 wordoffset[j] = p1 - idxdatabuffer;
641 ++j;
642 }
643 p1 += index_size;
644 }
645 wordoffset[j] = p1 - idxdatabuffer;
646 if (!save_cache(url, verbose))
647 fprintf(stderr, "cache update failed\n");
648 }
649
650 if (!(idxfile = fopen(url.c_str(), "rb"))) {
651 wordoffset.resize(0);
652 return false;
653 }
654
655 first.assign(0, read_first_on_page_key(0));
656 last.assign(wordoffset.size() - 2, read_first_on_page_key(wordoffset.size() - 2));
657 middle.assign((wordoffset.size() - 2) / 2, read_first_on_page_key((wordoffset.size() - 2) / 2));
658 real_last.assign(wc - 1, get_key(wc - 1));
659
660 return true;
661 }
662
load_page(glong page_idx)663 inline gulong OffsetIndex::load_page(glong page_idx)
664 {
665 gulong nentr = ENTR_PER_PAGE;
666 if (page_idx == glong(wordoffset.size() - 2))
667 if ((nentr = (wordcount % ENTR_PER_PAGE)) == 0)
668 nentr = ENTR_PER_PAGE;
669
670 if (page_idx != page.idx) {
671 page_data.resize(wordoffset[page_idx + 1] - wordoffset[page_idx]);
672 fseek(idxfile, wordoffset[page_idx], SEEK_SET);
673 const size_t nitems = fread(&page_data[0], 1, page_data.size(), idxfile);
674 THROW_IF_ERROR(nitems == page_data.size());
675
676 page.fill(&page_data[0], nentr, page_idx);
677 }
678
679 return nentr;
680 }
681
get_key(glong idx)682 const gchar *OffsetIndex::get_key(glong idx)
683 {
684 load_page(idx / ENTR_PER_PAGE);
685 glong idx_in_page = idx % ENTR_PER_PAGE;
686 wordentry_offset = page.entries[idx_in_page].off;
687 wordentry_size = page.entries[idx_in_page].size;
688
689 return page.entries[idx_in_page].keystr;
690 }
691
lookup(const char * str,glong & idx)692 bool OffsetIndex::lookup(const char *str, glong &idx)
693 {
694 bool bFound = false;
695 glong iFrom;
696 glong iTo = wordoffset.size() - 2;
697 gint cmpint;
698 glong iThisIndex;
699 if (stardict_strcmp(str, first.keystr.c_str()) < 0) {
700 idx = 0;
701 return false;
702 } else if (stardict_strcmp(str, real_last.keystr.c_str()) > 0) {
703 idx = INVALID_INDEX;
704 return false;
705 } else {
706 iFrom = 0;
707 iThisIndex = 0;
708 while (iFrom <= iTo) {
709 iThisIndex = (iFrom + iTo) / 2;
710 cmpint = stardict_strcmp(str, get_first_on_page_key(iThisIndex));
711 if (cmpint > 0)
712 iFrom = iThisIndex + 1;
713 else if (cmpint < 0)
714 iTo = iThisIndex - 1;
715 else {
716 bFound = true;
717 break;
718 }
719 }
720 if (!bFound)
721 idx = iTo; //prev
722 else
723 idx = iThisIndex;
724 }
725 if (!bFound) {
726 gulong netr = load_page(idx);
727 iFrom = 1; // Needn't search the first word anymore.
728 iTo = netr - 1;
729 iThisIndex = 0;
730 while (iFrom <= iTo) {
731 iThisIndex = (iFrom + iTo) / 2;
732 cmpint = stardict_strcmp(str, page.entries[iThisIndex].keystr);
733 if (cmpint > 0)
734 iFrom = iThisIndex + 1;
735 else if (cmpint < 0)
736 iTo = iThisIndex - 1;
737 else {
738 bFound = true;
739 break;
740 }
741 }
742 idx *= ENTR_PER_PAGE;
743 if (!bFound)
744 idx += iFrom; //next
745 else
746 idx += iThisIndex;
747 } else {
748 idx *= ENTR_PER_PAGE;
749 }
750 return bFound;
751 }
752
load(const std::string & url,gulong wc,gulong fsize,bool verbose)753 bool WordListIndex::load(const std::string &url, gulong wc, gulong fsize, bool verbose)
754 {
755 gzFile in = gzopen(url.c_str(), "rb");
756 if (in == nullptr)
757 return false;
758
759 idxdatabuf = (gchar *)g_malloc(fsize);
760
761 const int len = gzread(in, idxdatabuf, fsize);
762 gzclose(in);
763 if (len < 0)
764 return false;
765
766 if (gulong(len) != fsize)
767 return false;
768
769 wordlist.resize(wc + 1);
770 gchar *p1 = idxdatabuf;
771 guint32 i;
772 for (i = 0; i < wc; i++) {
773 wordlist[i] = p1;
774 p1 += strlen(p1) + 1 + 2 * sizeof(guint32);
775 }
776 wordlist[wc] = p1;
777
778 return true;
779 }
780
get_data(glong idx)781 void WordListIndex::get_data(glong idx)
782 {
783 gchar *p1 = wordlist[idx] + strlen(wordlist[idx]) + sizeof(gchar);
784 wordentry_offset = g_ntohl(get_uint32(p1));
785 p1 += sizeof(guint32);
786 wordentry_size = g_ntohl(get_uint32(p1));
787 }
788
lookup(const char * str,glong & idx)789 bool WordListIndex::lookup(const char *str, glong &idx)
790 {
791 bool bFound = false;
792 glong iTo = wordlist.size() - 2;
793
794 if (stardict_strcmp(str, get_key(0)) < 0) {
795 idx = 0;
796 } else if (stardict_strcmp(str, get_key(iTo)) > 0) {
797 idx = INVALID_INDEX;
798 } else {
799 glong iThisIndex = 0;
800 glong iFrom = 0;
801 gint cmpint;
802 while (iFrom <= iTo) {
803 iThisIndex = (iFrom + iTo) / 2;
804 cmpint = stardict_strcmp(str, get_key(iThisIndex));
805 if (cmpint > 0)
806 iFrom = iThisIndex + 1;
807 else if (cmpint < 0)
808 iTo = iThisIndex - 1;
809 else {
810 bFound = true;
811 break;
812 }
813 }
814 if (!bFound)
815 idx = iFrom; //next
816 else
817 idx = iThisIndex;
818 }
819 return bFound;
820 }
821 }
822
load(const std::string & url,gulong wc)823 bool SynFile::load(const std::string &url, gulong wc)
824 {
825 struct stat stat_buf;
826 if (!stat(url.c_str(), &stat_buf)) {
827 MapFile syn;
828 if (!syn.open(url.c_str(), stat_buf.st_size))
829 return false;
830 const gchar *current = syn.begin();
831 for (unsigned long i = 0; i < wc; i++) {
832 // each entry in a syn-file is:
833 // - 0-terminated string
834 // 4-byte index into .dict file in network byte order
835 glib::CharStr lower_string{ g_utf8_casefold(current, -1) };
836 std::string synonym{ get_impl(lower_string) };
837 current += synonym.length() + 1;
838 const guint32 idx = g_ntohl(get_uint32(current));
839 current += sizeof(idx);
840 synonyms[synonym] = idx;
841 }
842 return true;
843 } else {
844 return false;
845 }
846 }
847
lookup(const char * str,glong & idx)848 bool SynFile::lookup(const char *str, glong &idx)
849 {
850 glib::CharStr lower_string{ g_utf8_casefold(str, -1) };
851 auto it = synonyms.find(get_impl(lower_string));
852 if (it != synonyms.end()) {
853 idx = it->second;
854 return true;
855 }
856 return false;
857 }
858
Lookup(const char * str,glong & idx)859 bool Dict::Lookup(const char *str, glong &idx)
860 {
861 return syn_file->lookup(str, idx) || idx_file->lookup(str, idx);
862 }
863
load(const std::string & ifofilename,bool verbose)864 bool Dict::load(const std::string &ifofilename, bool verbose)
865 {
866 gulong idxfilesize;
867 if (!load_ifofile(ifofilename, idxfilesize))
868 return false;
869
870 std::string fullfilename(ifofilename);
871 fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "dict.dz");
872
873 if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS)) {
874 dictdzfile.reset(new DictData);
875 if (!dictdzfile->open(fullfilename, 0)) {
876 //g_print("open file %s failed!\n",fullfilename);
877 return false;
878 }
879 } else {
880 fullfilename.erase(fullfilename.length() - sizeof(".dz") + 1, sizeof(".dz") - 1);
881 dictfile = fopen(fullfilename.c_str(), "rb");
882 if (!dictfile) {
883 //g_print("open file %s failed!\n",fullfilename);
884 return false;
885 }
886 }
887
888 fullfilename = ifofilename;
889 fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "idx.gz");
890
891 if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS)) {
892 idx_file.reset(new WordListIndex);
893 } else {
894 fullfilename.erase(fullfilename.length() - sizeof(".gz") + 1, sizeof(".gz") - 1);
895 idx_file.reset(new OffsetIndex);
896 }
897
898 if (!idx_file->load(fullfilename, wordcount, idxfilesize, verbose))
899 return false;
900
901 fullfilename = ifofilename;
902 fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "syn");
903 syn_file.reset(new SynFile);
904 syn_file->load(fullfilename, syn_wordcount);
905
906 //g_print("bookname: %s , wordcount %lu\n", bookname.c_str(), narticles());
907 return true;
908 }
909
load_ifofile(const std::string & ifofilename,gulong & idxfilesize)910 bool Dict::load_ifofile(const std::string &ifofilename, gulong &idxfilesize)
911 {
912 DictInfo dict_info;
913 if (!dict_info.load_from_ifo_file(ifofilename, false))
914 return false;
915 if (dict_info.wordcount == 0)
916 return false;
917
918 ifo_file_name = dict_info.ifo_file_name;
919 wordcount = dict_info.wordcount;
920 syn_wordcount = dict_info.syn_wordcount;
921 bookname = dict_info.bookname;
922
923 idxfilesize = dict_info.index_file_size;
924
925 sametypesequence = dict_info.sametypesequence;
926
927 return true;
928 }
929
LookupWithRule(GPatternSpec * pspec,glong * aIndex,int iBuffLen)930 bool Dict::LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen)
931 {
932 int iIndexCount = 0;
933
934 for (guint32 i = 0; i < narticles() && iIndexCount < (iBuffLen - 1); i++)
935 if (g_pattern_match_string(pspec, get_key(i)))
936 aIndex[iIndexCount++] = i;
937
938 aIndex[iIndexCount] = -1; // -1 is the end.
939
940 return iIndexCount > 0;
941 }
942
~Libs()943 Libs::~Libs()
944 {
945 for (Dict *p : oLib)
946 delete p;
947 }
948
load_dict(const std::string & url)949 void Libs::load_dict(const std::string &url)
950 {
951 Dict *lib = new Dict;
952 if (lib->load(url, verbose_))
953 oLib.push_back(lib);
954 else
955 delete lib;
956 }
957
load(const std::list<std::string> & dicts_dirs,const std::list<std::string> & order_list,const std::list<std::string> & disable_list)958 void Libs::load(const std::list<std::string> &dicts_dirs,
959 const std::list<std::string> &order_list,
960 const std::list<std::string> &disable_list)
961 {
962 for_each_file(dicts_dirs, ".ifo", order_list, disable_list,
963 [this](const std::string &url, bool disable) -> void {
964 if (!disable)
965 load_dict(url);
966 });
967 }
968
poGetCurrentWord(glong * iCurrent)969 const gchar *Libs::poGetCurrentWord(glong *iCurrent)
970 {
971 const gchar *poCurrentWord = nullptr;
972 const gchar *word;
973 for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) {
974 if (iCurrent[iLib] == INVALID_INDEX)
975 continue;
976 if (iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
977 continue;
978 if (poCurrentWord == nullptr) {
979 poCurrentWord = poGetWord(iCurrent[iLib], iLib);
980 } else {
981 word = poGetWord(iCurrent[iLib], iLib);
982
983 if (stardict_strcmp(poCurrentWord, word) > 0)
984 poCurrentWord = word;
985 }
986 }
987 return poCurrentWord;
988 }
989
// Advances the merged word list by one word and returns the new current word
// (nullptr when there is none).
//
// The input can be:
//   (word, iCurrent):    each dictionary's position is first set by looking
//                        `word` up. Used by TopWin::NextCallback().
//   (nullptr, iCurrent): the positions in iCurrent are used as they are.
//                        Used by AppCore::ListWords().
// In both cases iCurrent is updated to the positions of the next word.
const gchar *Libs::poGetNextWord(const gchar *sWord, glong *iCurrent)
{
    const size_t nLibs = oLib.size();

    // True when dictionary `lib` currently points at an existing entry.
    const auto has_entry = [this, iCurrent](size_t lib) -> bool {
        if (iCurrent[lib] == INVALID_INDEX)
            return false;
        return !(iCurrent[lib] >= narticles(lib) || iCurrent[lib] < 0);
    };

    // Find the smallest current word and the dictionary providing it.
    const gchar *smallest = nullptr;
    size_t smallest_lib = 0;
    for (size_t iLib = 0; iLib < nLibs; ++iLib) {
        if (sWord)
            oLib[iLib]->Lookup(sWord, iCurrent[iLib]);
        if (!has_entry(iLib))
            continue;

        const gchar *candidate = poGetWord(iCurrent[iLib], iLib);
        if (smallest == nullptr || stardict_strcmp(smallest, candidate) > 0) {
            smallest = candidate;
            smallest_lib = iLib;
        }
    }
    if (smallest == nullptr)
        return nullptr;

    // Step past that word in every dictionary that is positioned on it.
    iCurrent[smallest_lib]++;
    for (size_t iLib = 0; iLib < nLibs; ++iLib) {
        if (iLib == smallest_lib || !has_entry(iLib))
            continue;
        if (strcmp(smallest, poGetWord(iCurrent[iLib], iLib)) == 0)
            iCurrent[iLib]++;
    }
    return poGetCurrentWord(iCurrent);
}
1034
1035 const gchar *
poGetPreWord(glong * iCurrent)1036 Libs::poGetPreWord(glong *iCurrent)
1037 {
1038 // used by TopWin::PreviousCallback(); the iCurrent is cached by AppCore::TopWinWordChange();
1039 const gchar *poCurrentWord = nullptr;
1040 std::vector<Dict *>::size_type iCurrentLib = 0;
1041 const gchar *word;
1042
1043 for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) {
1044 if (iCurrent[iLib] == INVALID_INDEX)
1045 iCurrent[iLib] = narticles(iLib);
1046 else {
1047 if (iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
1048 continue;
1049 }
1050 if (poCurrentWord == nullptr) {
1051 poCurrentWord = poGetWord(iCurrent[iLib] - 1, iLib);
1052 iCurrentLib = iLib;
1053 } else {
1054 word = poGetWord(iCurrent[iLib] - 1, iLib);
1055 if (stardict_strcmp(poCurrentWord, word) < 0) {
1056 poCurrentWord = word;
1057 iCurrentLib = iLib;
1058 }
1059 }
1060 }
1061
1062 if (poCurrentWord) {
1063 iCurrent[iCurrentLib]--;
1064 for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) {
1065 if (iLib == iCurrentLib)
1066 continue;
1067 if (iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
1068 continue;
1069 if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib] - 1, iLib)) == 0) {
1070 iCurrent[iLib]--;
1071 } else {
1072 if (iCurrent[iLib] == narticles(iLib))
1073 iCurrent[iLib] = INVALID_INDEX;
1074 }
1075 }
1076 }
1077 return poCurrentWord;
1078 }
1079
LookupSimilarWord(const gchar * sWord,glong & iWordIndex,int iLib)1080 bool Libs::LookupSimilarWord(const gchar *sWord, glong &iWordIndex, int iLib)
1081 {
1082 glong iIndex;
1083 bool bFound = false;
1084 gchar *casestr;
1085
1086 if (!bFound) {
1087 // to lower case.
1088 casestr = g_utf8_strdown(sWord, -1);
1089 if (strcmp(casestr, sWord)) {
1090 if (oLib[iLib]->Lookup(casestr, iIndex))
1091 bFound = true;
1092 }
1093 g_free(casestr);
1094 // to upper case.
1095 if (!bFound) {
1096 casestr = g_utf8_strup(sWord, -1);
1097 if (strcmp(casestr, sWord)) {
1098 if (oLib[iLib]->Lookup(casestr, iIndex))
1099 bFound = true;
1100 }
1101 g_free(casestr);
1102 }
1103 // Upper the first character and lower others.
1104 if (!bFound) {
1105 gchar *nextchar = g_utf8_next_char(sWord);
1106 gchar *firstchar = g_utf8_strup(sWord, nextchar - sWord);
1107 nextchar = g_utf8_strdown(nextchar, -1);
1108 casestr = g_strdup_printf("%s%s", firstchar, nextchar);
1109 g_free(firstchar);
1110 g_free(nextchar);
1111 if (strcmp(casestr, sWord)) {
1112 if (oLib[iLib]->Lookup(casestr, iIndex))
1113 bFound = true;
1114 }
1115 g_free(casestr);
1116 }
1117 }
1118
1119 if (bIsPureEnglish(sWord)) {
1120 // If not Found , try other status of sWord.
1121 int iWordLen = strlen(sWord);
1122 bool isupcase;
1123
1124 gchar *sNewWord = (gchar *)g_malloc(iWordLen + 1);
1125
1126 //cut one char "s" or "d"
1127 if (!bFound && iWordLen > 1) {
1128 isupcase = sWord[iWordLen - 1] == 'S' || !strncmp(&sWord[iWordLen - 2], "ED", 2);
1129 if (isupcase || sWord[iWordLen - 1] == 's' || !strncmp(&sWord[iWordLen - 2], "ed", 2)) {
1130 strcpy(sNewWord, sWord);
1131 sNewWord[iWordLen - 1] = '\0'; // cut "s" or "d"
1132 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1133 bFound = true;
1134 else if (isupcase || g_ascii_isupper(sWord[0])) {
1135 casestr = g_ascii_strdown(sNewWord, -1);
1136 if (strcmp(casestr, sNewWord)) {
1137 if (oLib[iLib]->Lookup(casestr, iIndex))
1138 bFound = true;
1139 }
1140 g_free(casestr);
1141 }
1142 }
1143 }
1144
1145 //cut "ly"
1146 if (!bFound && iWordLen > 2) {
1147 isupcase = !strncmp(&sWord[iWordLen - 2], "LY", 2);
1148 if (isupcase || (!strncmp(&sWord[iWordLen - 2], "ly", 2))) {
1149 strcpy(sNewWord, sWord);
1150 sNewWord[iWordLen - 2] = '\0'; // cut "ly"
1151 if (iWordLen > 5 && sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4]
1152 && !bIsVowel(sNewWord[iWordLen - 4]) && bIsVowel(sNewWord[iWordLen - 5])) { //doubled
1153
1154 sNewWord[iWordLen - 3] = '\0';
1155 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1156 bFound = true;
1157 else {
1158 if (isupcase || g_ascii_isupper(sWord[0])) {
1159 casestr = g_ascii_strdown(sNewWord, -1);
1160 if (strcmp(casestr, sNewWord)) {
1161 if (oLib[iLib]->Lookup(casestr, iIndex))
1162 bFound = true;
1163 }
1164 g_free(casestr);
1165 }
1166 if (!bFound)
1167 sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore
1168 }
1169 }
1170 if (!bFound) {
1171 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1172 bFound = true;
1173 else if (isupcase || g_ascii_isupper(sWord[0])) {
1174 casestr = g_ascii_strdown(sNewWord, -1);
1175 if (strcmp(casestr, sNewWord)) {
1176 if (oLib[iLib]->Lookup(casestr, iIndex))
1177 bFound = true;
1178 }
1179 g_free(casestr);
1180 }
1181 }
1182 }
1183 }
1184
1185 //cut "ing"
1186 if (!bFound && iWordLen > 3) {
1187 isupcase = !strncmp(&sWord[iWordLen - 3], "ING", 3);
1188 if (isupcase || !strncmp(&sWord[iWordLen - 3], "ing", 3)) {
1189 strcpy(sNewWord, sWord);
1190 sNewWord[iWordLen - 3] = '\0';
1191 if (iWordLen > 6 && (sNewWord[iWordLen - 4] == sNewWord[iWordLen - 5])
1192 && !bIsVowel(sNewWord[iWordLen - 5]) && bIsVowel(sNewWord[iWordLen - 6])) { //doubled
1193 sNewWord[iWordLen - 4] = '\0';
1194 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1195 bFound = true;
1196 else {
1197 if (isupcase || g_ascii_isupper(sWord[0])) {
1198 casestr = g_ascii_strdown(sNewWord, -1);
1199 if (strcmp(casestr, sNewWord)) {
1200 if (oLib[iLib]->Lookup(casestr, iIndex))
1201 bFound = true;
1202 }
1203 g_free(casestr);
1204 }
1205 if (!bFound)
1206 sNewWord[iWordLen - 4] = sNewWord[iWordLen - 5]; //restore
1207 }
1208 }
1209 if (!bFound) {
1210 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1211 bFound = true;
1212 else if (isupcase || g_ascii_isupper(sWord[0])) {
1213 casestr = g_ascii_strdown(sNewWord, -1);
1214 if (strcmp(casestr, sNewWord)) {
1215 if (oLib[iLib]->Lookup(casestr, iIndex))
1216 bFound = true;
1217 }
1218 g_free(casestr);
1219 }
1220 }
1221 if (!bFound) {
1222 if (isupcase)
1223 strcat(sNewWord, "E"); // add a char "E"
1224 else
1225 strcat(sNewWord, "e"); // add a char "e"
1226 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1227 bFound = true;
1228 else if (isupcase || g_ascii_isupper(sWord[0])) {
1229 casestr = g_ascii_strdown(sNewWord, -1);
1230 if (strcmp(casestr, sNewWord)) {
1231 if (oLib[iLib]->Lookup(casestr, iIndex))
1232 bFound = true;
1233 }
1234 g_free(casestr);
1235 }
1236 }
1237 }
1238 }
1239
1240 //cut two char "es"
1241 if (!bFound && iWordLen > 3) {
1242 isupcase = (!strncmp(&sWord[iWordLen - 2], "ES", 2) && (sWord[iWordLen - 3] == 'S' || sWord[iWordLen - 3] == 'X' || sWord[iWordLen - 3] == 'O' || (iWordLen > 4 && sWord[iWordLen - 3] == 'H' && (sWord[iWordLen - 4] == 'C' || sWord[iWordLen - 4] == 'S'))));
1243 if (isupcase || (!strncmp(&sWord[iWordLen - 2], "es", 2) && (sWord[iWordLen - 3] == 's' || sWord[iWordLen - 3] == 'x' || sWord[iWordLen - 3] == 'o' || (iWordLen > 4 && sWord[iWordLen - 3] == 'h' && (sWord[iWordLen - 4] == 'c' || sWord[iWordLen - 4] == 's'))))) {
1244 strcpy(sNewWord, sWord);
1245 sNewWord[iWordLen - 2] = '\0';
1246 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1247 bFound = true;
1248 else if (isupcase || g_ascii_isupper(sWord[0])) {
1249 casestr = g_ascii_strdown(sNewWord, -1);
1250 if (strcmp(casestr, sNewWord)) {
1251 if (oLib[iLib]->Lookup(casestr, iIndex))
1252 bFound = true;
1253 }
1254 g_free(casestr);
1255 }
1256 }
1257 }
1258
1259 //cut "ed"
1260 if (!bFound && iWordLen > 3) {
1261 isupcase = !strncmp(&sWord[iWordLen - 2], "ED", 2);
1262 if (isupcase || !strncmp(&sWord[iWordLen - 2], "ed", 2)) {
1263 strcpy(sNewWord, sWord);
1264 sNewWord[iWordLen - 2] = '\0';
1265 if (iWordLen > 5 && (sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4])
1266 && !bIsVowel(sNewWord[iWordLen - 4]) && bIsVowel(sNewWord[iWordLen - 5])) { //doubled
1267 sNewWord[iWordLen - 3] = '\0';
1268 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1269 bFound = true;
1270 else {
1271 if (isupcase || g_ascii_isupper(sWord[0])) {
1272 casestr = g_ascii_strdown(sNewWord, -1);
1273 if (strcmp(casestr, sNewWord)) {
1274 if (oLib[iLib]->Lookup(casestr, iIndex))
1275 bFound = true;
1276 }
1277 g_free(casestr);
1278 }
1279 if (!bFound)
1280 sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore
1281 }
1282 }
1283 if (!bFound) {
1284 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1285 bFound = true;
1286 else if (isupcase || g_ascii_isupper(sWord[0])) {
1287 casestr = g_ascii_strdown(sNewWord, -1);
1288 if (strcmp(casestr, sNewWord)) {
1289 if (oLib[iLib]->Lookup(casestr, iIndex))
1290 bFound = true;
1291 }
1292 g_free(casestr);
1293 }
1294 }
1295 }
1296 }
1297
1298 // cut "ied" , add "y".
1299 if (!bFound && iWordLen > 3) {
1300 isupcase = !strncmp(&sWord[iWordLen - 3], "IED", 3);
1301 if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ied", 3))) {
1302 strcpy(sNewWord, sWord);
1303 sNewWord[iWordLen - 3] = '\0';
1304 if (isupcase)
1305 strcat(sNewWord, "Y"); // add a char "Y"
1306 else
1307 strcat(sNewWord, "y"); // add a char "y"
1308 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1309 bFound = true;
1310 else if (isupcase || g_ascii_isupper(sWord[0])) {
1311 casestr = g_ascii_strdown(sNewWord, -1);
1312 if (strcmp(casestr, sNewWord)) {
1313 if (oLib[iLib]->Lookup(casestr, iIndex))
1314 bFound = true;
1315 }
1316 g_free(casestr);
1317 }
1318 }
1319 }
1320
1321 // cut "ies" , add "y".
1322 if (!bFound && iWordLen > 3) {
1323 isupcase = !strncmp(&sWord[iWordLen - 3], "IES", 3);
1324 if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ies", 3))) {
1325 strcpy(sNewWord, sWord);
1326 sNewWord[iWordLen - 3] = '\0';
1327 if (isupcase)
1328 strcat(sNewWord, "Y"); // add a char "Y"
1329 else
1330 strcat(sNewWord, "y"); // add a char "y"
1331 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1332 bFound = true;
1333 else if (isupcase || g_ascii_isupper(sWord[0])) {
1334 casestr = g_ascii_strdown(sNewWord, -1);
1335 if (strcmp(casestr, sNewWord)) {
1336 if (oLib[iLib]->Lookup(casestr, iIndex))
1337 bFound = true;
1338 }
1339 g_free(casestr);
1340 }
1341 }
1342 }
1343
1344 // cut "er".
1345 if (!bFound && iWordLen > 2) {
1346 isupcase = !strncmp(&sWord[iWordLen - 2], "ER", 2);
1347 if (isupcase || (!strncmp(&sWord[iWordLen - 2], "er", 2))) {
1348 strcpy(sNewWord, sWord);
1349 sNewWord[iWordLen - 2] = '\0';
1350 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1351 bFound = true;
1352 else if (isupcase || g_ascii_isupper(sWord[0])) {
1353 casestr = g_ascii_strdown(sNewWord, -1);
1354 if (strcmp(casestr, sNewWord)) {
1355 if (oLib[iLib]->Lookup(casestr, iIndex))
1356 bFound = true;
1357 }
1358 g_free(casestr);
1359 }
1360 }
1361 }
1362
1363 // cut "est".
1364 if (!bFound && iWordLen > 3) {
1365 isupcase = !strncmp(&sWord[iWordLen - 3], "EST", 3);
1366 if (isupcase || (!strncmp(&sWord[iWordLen - 3], "est", 3))) {
1367 strcpy(sNewWord, sWord);
1368 sNewWord[iWordLen - 3] = '\0';
1369 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1370 bFound = true;
1371 else if (isupcase || g_ascii_isupper(sWord[0])) {
1372 casestr = g_ascii_strdown(sNewWord, -1);
1373 if (strcmp(casestr, sNewWord)) {
1374 if (oLib[iLib]->Lookup(casestr, iIndex))
1375 bFound = true;
1376 }
1377 g_free(casestr);
1378 }
1379 }
1380 }
1381
1382 g_free(sNewWord);
1383 }
1384
1385 if (bFound)
1386 iWordIndex = iIndex;
1387 #if 0
1388 else {
1389 //don't change iWordIndex here.
1390 //when LookupSimilarWord all failed too, we want to use the old LookupWord index to list words.
1391 //iWordIndex = INVALID_INDEX;
1392 }
1393 #endif
1394 return bFound;
1395 }
1396
SimpleLookupWord(const gchar * sWord,glong & iWordIndex,int iLib)1397 bool Libs::SimpleLookupWord(const gchar *sWord, glong &iWordIndex, int iLib)
1398 {
1399 bool bFound = oLib[iLib]->Lookup(sWord, iWordIndex);
1400 if (!bFound && fuzzy_)
1401 bFound = LookupSimilarWord(sWord, iWordIndex, iLib);
1402 return bFound;
1403 }
1404
LookupWithFuzzy(const gchar * sWord,gchar * reslist[],gint reslist_size)1405 bool Libs::LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size)
1406 {
1407 if (sWord[0] == '\0')
1408 return false;
1409
1410 Fuzzystruct oFuzzystruct[reslist_size];
1411
1412 for (int i = 0; i < reslist_size; i++) {
1413 oFuzzystruct[i].pMatchWord = nullptr;
1414 oFuzzystruct[i].iMatchWordDistance = iMaxFuzzyDistance;
1415 }
1416 int iMaxDistance = iMaxFuzzyDistance;
1417 int iDistance;
1418 bool Found = false;
1419 EditDistance oEditDistance;
1420
1421 glong iCheckWordLen;
1422 const char *sCheck;
1423 gunichar *ucs4_str1, *ucs4_str2;
1424 glong ucs4_str2_len;
1425
1426 ucs4_str2 = g_utf8_to_ucs4_fast(sWord, -1, &ucs4_str2_len);
1427 unicode_strdown(ucs4_str2);
1428
1429 for (size_t iLib = 0; iLib < oLib.size(); ++iLib) {
1430 if (progress_func)
1431 progress_func();
1432
1433 //if (stardict_strcmp(sWord, poGetWord(0,iLib))>=0 && stardict_strcmp(sWord, poGetWord(narticles(iLib)-1,iLib))<=0) {
1434 //there are Chinese dicts and English dicts...
1435
1436 const int iwords = narticles(iLib);
1437 for (int index = 0; index < iwords; index++) {
1438 sCheck = poGetWord(index, iLib);
1439 // tolower and skip too long or too short words
1440 iCheckWordLen = g_utf8_strlen(sCheck, -1);
1441 if (iCheckWordLen - ucs4_str2_len >= iMaxDistance || ucs4_str2_len - iCheckWordLen >= iMaxDistance)
1442 continue;
1443 ucs4_str1 = g_utf8_to_ucs4_fast(sCheck, -1, nullptr);
1444 if (iCheckWordLen > ucs4_str2_len)
1445 ucs4_str1[ucs4_str2_len] = 0;
1446 unicode_strdown(ucs4_str1);
1447
1448 iDistance = oEditDistance.CalEditDistance(ucs4_str1, ucs4_str2, iMaxDistance);
1449 g_free(ucs4_str1);
1450 if (iDistance < iMaxDistance && iDistance < ucs4_str2_len) {
1451 // when ucs4_str2_len=1,2 we need less fuzzy.
1452 Found = true;
1453 bool bAlreadyInList = false;
1454 int iMaxDistanceAt = 0;
1455 for (int j = 0; j < reslist_size; j++) {
1456 if (oFuzzystruct[j].pMatchWord && strcmp(oFuzzystruct[j].pMatchWord, sCheck) == 0) { //already in list
1457 bAlreadyInList = true;
1458 break;
1459 }
1460 //find the position,it will certainly be found (include the first time) as iMaxDistance is set by last time.
1461 if (oFuzzystruct[j].iMatchWordDistance == iMaxDistance) {
1462 iMaxDistanceAt = j;
1463 }
1464 }
1465 if (!bAlreadyInList) {
1466 if (oFuzzystruct[iMaxDistanceAt].pMatchWord)
1467 g_free(oFuzzystruct[iMaxDistanceAt].pMatchWord);
1468 oFuzzystruct[iMaxDistanceAt].pMatchWord = g_strdup(sCheck);
1469 oFuzzystruct[iMaxDistanceAt].iMatchWordDistance = iDistance;
1470 // calc new iMaxDistance
1471 iMaxDistance = iDistance;
1472 for (int j = 0; j < reslist_size; j++) {
1473 if (oFuzzystruct[j].iMatchWordDistance > iMaxDistance)
1474 iMaxDistance = oFuzzystruct[j].iMatchWordDistance;
1475 } // calc new iMaxDistance
1476 } // add to list
1477 } // find one
1478 } // each word
1479
1480 } // each lib
1481 g_free(ucs4_str2);
1482
1483 if (Found) // sort with distance
1484 std::sort(oFuzzystruct, oFuzzystruct + reslist_size, [](const Fuzzystruct &lh, const Fuzzystruct &rh) -> bool {
1485 if (lh.iMatchWordDistance != rh.iMatchWordDistance)
1486 return lh.iMatchWordDistance < rh.iMatchWordDistance;
1487
1488 if (lh.pMatchWord && rh.pMatchWord)
1489 return stardict_strcmp(lh.pMatchWord, rh.pMatchWord) < 0;
1490
1491 return false;
1492 });
1493
1494 for (gint i = 0; i < reslist_size; ++i)
1495 reslist[i] = oFuzzystruct[i].pMatchWord;
1496
1497 return Found;
1498 }
1499
LookupWithRule(const gchar * word,gchar ** ppMatchWord)1500 gint Libs::LookupWithRule(const gchar *word, gchar **ppMatchWord)
1501 {
1502 glong aiIndex[MAX_MATCH_ITEM_PER_LIB + 1];
1503 gint iMatchCount = 0;
1504 GPatternSpec *pspec = g_pattern_spec_new(word);
1505
1506 for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) {
1507 //if(oLibs.LookdupWordsWithRule(pspec,aiIndex,MAX_MATCH_ITEM_PER_LIB+1-iMatchCount,iLib))
1508 // -iMatchCount,so save time,but may got less result and the word may repeat.
1509
1510 if (oLib[iLib]->LookupWithRule(pspec, aiIndex, MAX_MATCH_ITEM_PER_LIB + 1)) {
1511 if (progress_func)
1512 progress_func();
1513 for (int i = 0; aiIndex[i] != -1; i++) {
1514 const gchar *sMatchWord = poGetWord(aiIndex[i], iLib);
1515 bool bAlreadyInList = false;
1516 for (int j = 0; j < iMatchCount; j++) {
1517 if (strcmp(ppMatchWord[j], sMatchWord) == 0) { //already in list
1518 bAlreadyInList = true;
1519 break;
1520 }
1521 }
1522 if (!bAlreadyInList)
1523 ppMatchWord[iMatchCount++] = g_strdup(sMatchWord);
1524 }
1525 }
1526 }
1527 g_pattern_spec_free(pspec);
1528
1529 if (iMatchCount) // sort it.
1530 std::sort(ppMatchWord, ppMatchWord + iMatchCount, [](const char *lh, const char *rh) -> bool {
1531 return stardict_strcmp(lh, rh) < 0;
1532 });
1533
1534 return iMatchCount;
1535 }
1536
LookupData(const gchar * sWord,std::vector<gchar * > * reslist)1537 bool Libs::LookupData(const gchar *sWord, std::vector<gchar *> *reslist)
1538 {
1539 std::vector<std::string> SearchWords;
1540 std::string SearchWord;
1541 const char *p = sWord;
1542 while (*p) {
1543 if (*p == '\\') {
1544 p++;
1545 switch (*p) {
1546 case ' ':
1547 SearchWord += ' ';
1548 break;
1549 case '\\':
1550 SearchWord += '\\';
1551 break;
1552 case 't':
1553 SearchWord += '\t';
1554 break;
1555 case 'n':
1556 SearchWord += '\n';
1557 break;
1558 default:
1559 SearchWord += *p;
1560 }
1561 } else if (*p == ' ') {
1562 if (!SearchWord.empty()) {
1563 SearchWords.push_back(SearchWord);
1564 SearchWord.clear();
1565 }
1566 } else {
1567 SearchWord += *p;
1568 }
1569 p++;
1570 }
1571 if (!SearchWord.empty()) {
1572 SearchWords.push_back(SearchWord);
1573 SearchWord.clear();
1574 }
1575 if (SearchWords.empty())
1576 return false;
1577
1578 guint32 max_size = 0;
1579 gchar *origin_data = nullptr;
1580 for (std::vector<Dict *>::size_type i = 0; i < oLib.size(); ++i) {
1581 if (!oLib[i]->containSearchData())
1582 continue;
1583 if (progress_func)
1584 progress_func();
1585 const gulong iwords = narticles(i);
1586 const gchar *key;
1587 guint32 offset, size;
1588 for (gulong j = 0; j < iwords; ++j) {
1589 oLib[i]->get_key_and_data(j, &key, &offset, &size);
1590 if (size > max_size) {
1591 origin_data = (gchar *)g_realloc(origin_data, size);
1592 max_size = size;
1593 }
1594 if (oLib[i]->SearchData(SearchWords, offset, size, origin_data))
1595 reslist[i].push_back(g_strdup(key));
1596 }
1597 }
1598 g_free(origin_data);
1599
1600 std::vector<Dict *>::size_type i;
1601 for (i = 0; i < oLib.size(); ++i)
1602 if (!reslist[i].empty())
1603 break;
1604
1605 return i != oLib.size();
1606 }
1607
1608 /**************************************************/
analyze_query(const char * s,std::string & res)1609 query_t analyze_query(const char *s, std::string &res)
1610 {
1611 if (!s || !*s) {
1612 res = "";
1613 return qtSIMPLE;
1614 }
1615 if (*s == '/') {
1616 res = s + 1;
1617 return qtFUZZY;
1618 }
1619
1620 if (*s == '|') {
1621 res = s + 1;
1622 return qtDATA;
1623 }
1624
1625 bool regexp = false;
1626 const char *p = s;
1627 res = "";
1628 for (; *p; res += *p, ++p) {
1629 if (*p == '\\') {
1630 ++p;
1631 if (!*p)
1632 break;
1633 continue;
1634 }
1635 if (*p == '*' || *p == '?')
1636 regexp = true;
1637 }
1638 if (regexp)
1639 return qtREGEXP;
1640
1641 return qtSIMPLE;
1642 }
1643