1 #pragma once
2
3 #include <cstdio>
4 #include <cstring>
5 #include <functional>
6 #include <list>
7 #include <map>
8 #include <memory>
9 #include <string>
10 #include <vector>
11
12 #include "dictziplib.hpp"
13
// Limit on match items per library, going by the name; the code that
// consumes it is not part of this header.
constexpr int MAX_MATCH_ITEM_PER_LIB = 100;
// When searching for similar words, at most MAX_FUZZY_DISTANCE - 1
// differences are allowed.
constexpr int MAX_FUZZY_DISTANCE = 3;
16
get_uint32(const gchar * addr)17 inline guint32 get_uint32(const gchar *addr)
18 {
19 guint32 result;
20 memcpy(&result, addr, sizeof(guint32));
21 return result;
22 }
23
set_uint32(gchar * addr,guint32 val)24 inline void set_uint32(gchar *addr, guint32 val)
25 {
26 memcpy(addr, &val, sizeof(guint32));
27 }
28
29 struct cacheItem {
30 guint32 offset;
31 gchar *data;
32 //write code here to make it inline
cacheItemcacheItem33 cacheItem() { data = nullptr; }
~cacheItemcacheItem34 ~cacheItem() { g_free(data); }
35 };
36
// Number of slots in the DictBase word-data cache array.
constexpr int WORDDATA_CACHE_NUM = 10;
// Sentinel word index: Libs::poGetWordData() returns nullptr when given it.
constexpr int INVALID_INDEX = -100;
39
40 class DictBase
41 {
42 public:
DictBase()43 DictBase() {}
~DictBase()44 ~DictBase()
45 {
46 if (dictfile)
47 fclose(dictfile);
48 }
49 DictBase(const DictBase &) = delete;
50 DictBase &operator=(const DictBase &) = delete;
51 gchar *GetWordData(guint32 idxitem_offset, guint32 idxitem_size);
containSearchData() const52 bool containSearchData() const
53 {
54 if (sametypesequence.empty())
55 return true;
56 return sametypesequence.find_first_of("mlgxty") != std::string::npos;
57 }
58 bool SearchData(std::vector<std::string> &SearchWords, guint32 idxitem_offset, guint32 idxitem_size, gchar *origin_data);
59
60 protected:
61 std::string sametypesequence;
62 FILE *dictfile = nullptr;
63 std::unique_ptr<DictData> dictdzfile;
64
65 private:
66 cacheItem cache[WORDDATA_CACHE_NUM];
67 gint cache_cur = 0;
68 };
69
70 //this structure contain all information about dictionary
71 struct DictInfo {
72 std::string ifo_file_name;
73 guint32 wordcount;
74 guint32 syn_wordcount;
75 std::string bookname;
76 std::string author;
77 std::string email;
78 std::string website;
79 std::string date;
80 std::string description;
81 guint32 index_file_size;
82 guint32 syn_file_size;
83 std::string sametypesequence;
84
85 bool load_from_ifo_file(const std::string &ifofilename, bool istreedict);
86 };
87
88 class IIndexFile
89 {
90 public:
91 guint32 wordentry_offset;
92 guint32 wordentry_size;
93
~IIndexFile()94 virtual ~IIndexFile() {}
95 virtual bool load(const std::string &url, gulong wc, gulong fsize, bool verbose) = 0;
96 virtual const gchar *get_key(glong idx) = 0;
97 virtual void get_data(glong idx) = 0;
98 virtual const gchar *get_key_and_data(glong idx) = 0;
99 virtual bool lookup(const char *str, glong &idx) = 0;
100 };
101
102 class SynFile
103 {
104 public:
105 bool load(const std::string &url, gulong wc);
106 bool lookup(const char *str, glong &idx);
107
108 private:
109 std::map<std::string, gulong> synonyms;
110 };
111
112 class Dict : public DictBase
113 {
114 public:
Dict()115 Dict() {}
116 Dict(const Dict &) = delete;
117 Dict &operator=(const Dict &) = delete;
118 bool load(const std::string &ifofilename, bool verbose);
119
narticles() const120 gulong narticles() const { return wordcount; }
dict_name() const121 const std::string &dict_name() const { return bookname; }
ifofilename() const122 const std::string &ifofilename() const { return ifo_file_name; }
123
get_key(glong index)124 const gchar *get_key(glong index) { return idx_file->get_key(index); }
get_data(glong index)125 gchar *get_data(glong index)
126 {
127 idx_file->get_data(index);
128 return DictBase::GetWordData(idx_file->wordentry_offset, idx_file->wordentry_size);
129 }
get_key_and_data(glong index,const gchar ** key,guint32 * offset,guint32 * size)130 void get_key_and_data(glong index, const gchar **key, guint32 *offset, guint32 *size)
131 {
132 *key = idx_file->get_key_and_data(index);
133 *offset = idx_file->wordentry_offset;
134 *size = idx_file->wordentry_size;
135 }
136 bool Lookup(const char *str, glong &idx);
137
138 bool LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen);
139
140 private:
141 std::string ifo_file_name;
142 gulong wordcount;
143 gulong syn_wordcount;
144 std::string bookname;
145
146 std::unique_ptr<IIndexFile> idx_file;
147 std::unique_ptr<SynFile> syn_file;
148
149 bool load_ifofile(const std::string &ifofilename, gulong &idxfilesize);
150 };
151
152 class Libs
153 {
154 public:
Libs(std::function<void (void)> f=std::function<void (void)> ())155 Libs(std::function<void(void)> f = std::function<void(void)>())
156 {
157 progress_func = f;
158 iMaxFuzzyDistance = MAX_FUZZY_DISTANCE; //need to read from cfg.
159 }
setVerbose(bool verbose)160 void setVerbose(bool verbose) { verbose_ = verbose; }
setFuzzy(bool fuzzy)161 void setFuzzy(bool fuzzy) { fuzzy_ = fuzzy; }
162 ~Libs();
163 Libs(const Libs &) = delete;
164 Libs &operator=(const Libs &) = delete;
165
166 void load_dict(const std::string &url);
167 void load(const std::list<std::string> &dicts_dirs,
168 const std::list<std::string> &order_list,
169 const std::list<std::string> &disable_list);
narticles(int idict) const170 glong narticles(int idict) const { return oLib[idict]->narticles(); }
dict_name(int idict) const171 const std::string &dict_name(int idict) const { return oLib[idict]->dict_name(); }
ndicts() const172 gint ndicts() const { return oLib.size(); }
173
poGetWord(glong iIndex,int iLib)174 const gchar *poGetWord(glong iIndex, int iLib)
175 {
176 return oLib[iLib]->get_key(iIndex);
177 }
poGetWordData(glong iIndex,int iLib)178 gchar *poGetWordData(glong iIndex, int iLib)
179 {
180 if (iIndex == INVALID_INDEX)
181 return nullptr;
182 return oLib[iLib]->get_data(iIndex);
183 }
184 const gchar *poGetCurrentWord(glong *iCurrent);
185 const gchar *poGetNextWord(const gchar *word, glong *iCurrent);
186 const gchar *poGetPreWord(glong *iCurrent);
LookupWord(const gchar * sWord,glong & iWordIndex,int iLib)187 bool LookupWord(const gchar *sWord, glong &iWordIndex, int iLib)
188 {
189 return oLib[iLib]->Lookup(sWord, iWordIndex);
190 }
191 bool LookupSimilarWord(const gchar *sWord, glong &iWordIndex, int iLib);
192 bool SimpleLookupWord(const gchar *sWord, glong &iWordIndex, int iLib);
193
194 bool LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size);
195 gint LookupWithRule(const gchar *sWord, gchar *reslist[]);
196 bool LookupData(const gchar *sWord, std::vector<gchar *> *reslist);
197
198 protected:
199 bool fuzzy_;
200
201 private:
202 std::vector<Dict *> oLib; // word Libs.
203 int iMaxFuzzyDistance;
204 std::function<void(void)> progress_func;
205 bool verbose_;
206 };
207
/// Kinds of query; this is the return type of analyze_query().
/// The enumerator names suggest simple, regular-expression, fuzzy and data
/// searches respectively; the mapping is decided in analyze_query().
enum query_t {
    qtSIMPLE = 0,
    qtREGEXP = 1,
    qtFUZZY = 2,
    qtDATA = 3
};
214
215 extern query_t analyze_query(const char *s, std::string &res);
216