1 #pragma once
2 
3 #include <cstdio>
4 #include <cstring>
5 #include <functional>
6 #include <list>
7 #include <map>
8 #include <memory>
9 #include <string>
10 #include <vector>
11 
12 #include "dictziplib.hpp"
13 
// NOTE(review): presumably the cap on matches gathered from one dictionary;
// the code using it is not part of this header.
constexpr int MAX_MATCH_ITEM_PER_LIB = 100;
// When looking for similar words, at most MAX_FUZZY_DISTANCE-1 differences
// are allowed.
constexpr int MAX_FUZZY_DISTANCE = 3;
16 
get_uint32(const gchar * addr)17 inline guint32 get_uint32(const gchar *addr)
18 {
19     guint32 result;
20     memcpy(&result, addr, sizeof(guint32));
21     return result;
22 }
23 
set_uint32(gchar * addr,guint32 val)24 inline void set_uint32(gchar *addr, guint32 val)
25 {
26     memcpy(addr, &val, sizeof(guint32));
27 }
28 
29 struct cacheItem {
30     guint32 offset;
31     gchar *data;
32     //write code here to make it inline
cacheItemcacheItem33     cacheItem() { data = nullptr; }
~cacheItemcacheItem34     ~cacheItem() { g_free(data); }
35 };
36 
// Number of cacheItem slots kept by DictBase.
constexpr int WORDDATA_CACHE_NUM = 10;
// Sentinel index value: Libs::poGetWordData() returns nullptr when it is
// passed as the word index.
constexpr int INVALID_INDEX = -100;
39 
40 class DictBase
41 {
42 public:
DictBase()43     DictBase() {}
~DictBase()44     ~DictBase()
45     {
46         if (dictfile)
47             fclose(dictfile);
48     }
49     DictBase(const DictBase &) = delete;
50     DictBase &operator=(const DictBase &) = delete;
51     gchar *GetWordData(guint32 idxitem_offset, guint32 idxitem_size);
containSearchData() const52     bool containSearchData() const
53     {
54         if (sametypesequence.empty())
55             return true;
56         return sametypesequence.find_first_of("mlgxty") != std::string::npos;
57     }
58     bool SearchData(std::vector<std::string> &SearchWords, guint32 idxitem_offset, guint32 idxitem_size, gchar *origin_data);
59 
60 protected:
61     std::string sametypesequence;
62     FILE *dictfile = nullptr;
63     std::unique_ptr<DictData> dictdzfile;
64 
65 private:
66     cacheItem cache[WORDDATA_CACHE_NUM];
67     gint cache_cur = 0;
68 };
69 
70 //this structure contain all information about dictionary
71 struct DictInfo {
72     std::string ifo_file_name;
73     guint32 wordcount;
74     guint32 syn_wordcount;
75     std::string bookname;
76     std::string author;
77     std::string email;
78     std::string website;
79     std::string date;
80     std::string description;
81     guint32 index_file_size;
82     guint32 syn_file_size;
83     std::string sametypesequence;
84 
85     bool load_from_ifo_file(const std::string &ifofilename, bool istreedict);
86 };
87 
88 class IIndexFile
89 {
90 public:
91     guint32 wordentry_offset;
92     guint32 wordentry_size;
93 
~IIndexFile()94     virtual ~IIndexFile() {}
95     virtual bool load(const std::string &url, gulong wc, gulong fsize, bool verbose) = 0;
96     virtual const gchar *get_key(glong idx) = 0;
97     virtual void get_data(glong idx) = 0;
98     virtual const gchar *get_key_and_data(glong idx) = 0;
99     virtual bool lookup(const char *str, glong &idx) = 0;
100 };
101 
102 class SynFile
103 {
104 public:
105     bool load(const std::string &url, gulong wc);
106     bool lookup(const char *str, glong &idx);
107 
108 private:
109     std::map<std::string, gulong> synonyms;
110 };
111 
112 class Dict : public DictBase
113 {
114 public:
Dict()115     Dict() {}
116     Dict(const Dict &) = delete;
117     Dict &operator=(const Dict &) = delete;
118     bool load(const std::string &ifofilename, bool verbose);
119 
narticles() const120     gulong narticles() const { return wordcount; }
dict_name() const121     const std::string &dict_name() const { return bookname; }
ifofilename() const122     const std::string &ifofilename() const { return ifo_file_name; }
123 
get_key(glong index)124     const gchar *get_key(glong index) { return idx_file->get_key(index); }
get_data(glong index)125     gchar *get_data(glong index)
126     {
127         idx_file->get_data(index);
128         return DictBase::GetWordData(idx_file->wordentry_offset, idx_file->wordentry_size);
129     }
get_key_and_data(glong index,const gchar ** key,guint32 * offset,guint32 * size)130     void get_key_and_data(glong index, const gchar **key, guint32 *offset, guint32 *size)
131     {
132         *key = idx_file->get_key_and_data(index);
133         *offset = idx_file->wordentry_offset;
134         *size = idx_file->wordentry_size;
135     }
136     bool Lookup(const char *str, glong &idx);
137 
138     bool LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen);
139 
140 private:
141     std::string ifo_file_name;
142     gulong wordcount;
143     gulong syn_wordcount;
144     std::string bookname;
145 
146     std::unique_ptr<IIndexFile> idx_file;
147     std::unique_ptr<SynFile> syn_file;
148 
149     bool load_ifofile(const std::string &ifofilename, gulong &idxfilesize);
150 };
151 
152 class Libs
153 {
154 public:
Libs(std::function<void (void)> f=std::function<void (void)> ())155     Libs(std::function<void(void)> f = std::function<void(void)>())
156     {
157         progress_func = f;
158         iMaxFuzzyDistance = MAX_FUZZY_DISTANCE; //need to read from cfg.
159     }
setVerbose(bool verbose)160     void setVerbose(bool verbose) { verbose_ = verbose; }
setFuzzy(bool fuzzy)161     void setFuzzy(bool fuzzy) { fuzzy_ = fuzzy; }
162     ~Libs();
163     Libs(const Libs &) = delete;
164     Libs &operator=(const Libs &) = delete;
165 
166     void load_dict(const std::string &url);
167     void load(const std::list<std::string> &dicts_dirs,
168               const std::list<std::string> &order_list,
169               const std::list<std::string> &disable_list);
narticles(int idict) const170     glong narticles(int idict) const { return oLib[idict]->narticles(); }
dict_name(int idict) const171     const std::string &dict_name(int idict) const { return oLib[idict]->dict_name(); }
ndicts() const172     gint ndicts() const { return oLib.size(); }
173 
poGetWord(glong iIndex,int iLib)174     const gchar *poGetWord(glong iIndex, int iLib)
175     {
176         return oLib[iLib]->get_key(iIndex);
177     }
poGetWordData(glong iIndex,int iLib)178     gchar *poGetWordData(glong iIndex, int iLib)
179     {
180         if (iIndex == INVALID_INDEX)
181             return nullptr;
182         return oLib[iLib]->get_data(iIndex);
183     }
184     const gchar *poGetCurrentWord(glong *iCurrent);
185     const gchar *poGetNextWord(const gchar *word, glong *iCurrent);
186     const gchar *poGetPreWord(glong *iCurrent);
LookupWord(const gchar * sWord,glong & iWordIndex,int iLib)187     bool LookupWord(const gchar *sWord, glong &iWordIndex, int iLib)
188     {
189         return oLib[iLib]->Lookup(sWord, iWordIndex);
190     }
191     bool LookupSimilarWord(const gchar *sWord, glong &iWordIndex, int iLib);
192     bool SimpleLookupWord(const gchar *sWord, glong &iWordIndex, int iLib);
193 
194     bool LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size);
195     gint LookupWithRule(const gchar *sWord, gchar *reslist[]);
196     bool LookupData(const gchar *sWord, std::vector<gchar *> *reslist);
197 
198 protected:
199     bool fuzzy_;
200 
201 private:
202     std::vector<Dict *> oLib; // word Libs.
203     int iMaxFuzzyDistance;
204     std::function<void(void)> progress_func;
205     bool verbose_;
206 };
207 
// Kind of query, as returned by analyze_query().
enum query_t {
    qtSIMPLE = 0,
    qtREGEXP = 1,
    qtFUZZY = 2,
    qtDATA = 3,
};

// Determines the query kind of `s`; defined outside this header.
// NOTE(review): `res` is an output parameter, presumably receiving the query
// text to use for the actual lookup — confirm against the implementation.
query_t analyze_query(const char *s, std::string &res);
216