1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef PINYINIME_INCLUDE_DICTTRIE_H__
18 #define PINYINIME_INCLUDE_DICTTRIE_H__
19 
20 #include <stdlib.h>
21 #include "./atomdictbase.h"
22 #include "./dictdef.h"
23 #include "./dictlist.h"
24 #include "./searchutility.h"
25 #include <QFile>
26 
27 namespace ime_pinyin {
28 
// Read-only atom dictionary implemented as a trie indexed by full spelling
// ids. The trie nodes, lemma index buffer and score data are loaded from a
// binary dictionary file (or built via DictBuilder when ___BUILD_MODEL___ is
// defined); the actual hanzi strings live in the owned DictList instance.
// NOTE: inherits AtomDictBase privately (class default), matching the
// original project convention — see atomdictbase.h for the interface
// contract of the overridden methods below.
class DictTrie : AtomDictBase {
 private:
  // Marks one partially-matched node range inside the trie. Packed into a
  // single size_t via bit-fields: 24 bits offset + 8 bits count.
  struct ParsingMark {
    size_t node_offset:24;       // Offset of the first matched node.
    size_t node_num:8;           // Number of nodes with this spelling id given
                                 // by spl_id. If spl_id is a Shengmu, for nodes
                                 // in the first layer of DictTrie, it equals to
                                 // SpellingTrie::shm2full_num(); but for those
                                 // nodes which are not in the first layer,
                                 // node_num < SpellingTrie::shm2full_num().
                                 // For a full spelling id, node_num = 1;
  };

  // Used to indicate an extended mile stone.
  // An extended mile stone is used to mark a partial match in the dictionary
  // trie to speed up further potential extending.
  // For example, when the user inputs "w", a mile stone is created to mark the
  // partial match status, so that when user inputs another char 'm', it will be
  // faster to extend search space based on this mile stone.
  //
  // For partial match status of "wm", there can be more than one sub mile
  // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so
  // there may be more than one parsing mark used to mark these partial
  // matchings. A mile stone records the starting position in the mark list
  // and number of marks.
  struct MileStone {
    uint16 mark_start;           // Index of the first ParsingMark in
                                 // parsing_marks_ belonging to this mile stone.
    uint16 mark_num;             // Number of ParsingMarks for this mile stone.
  };

  // Owned list holding the hanzi strings for the lemmas in this trie.
  DictList* dict_list_;

  // Singleton spelling trie used to interpret spelling ids (not owned).
  const SpellingTrie *spl_trie_;

  LmaNodeLE0* root_;        // Nodes for root and the first layer.
  LmaNodeGE1* nodes_ge1_;   // Nodes for other layers.

  // A quick index from spelling id to the LmaNodeLE0 node buffer, or
  // to the root_ buffer.
  // Index length:
  // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used
  // to get the end.
  // All Shengmu ids are not indexed because they will be converted into
  // corresponding full ids.
  // So, given an id splid, the son is:
  // root_[splid_le0_index_[splid - kFullSplIdStart]]
  uint16 *splid_le0_index_;

  uint32 lma_node_num_le0_;  // Number of nodes in root_.
  uint32 lma_node_num_ge1_;  // Number of nodes in nodes_ge1_.

  // The first part is for homophones, and the last top_lmas_num_ items are
  // lemmas with highest scores.
  unsigned char *lma_idx_buf_;
  uint32 lma_idx_buf_len_;  // The total size of lma_idx_buf_ in byte.
  uint32 total_lma_num_;    // Total number of lemmas in this dictionary.
  uint32 top_lmas_num_;     // Number of lemma with highest scores.

  // Parsing mark list used to mark the detailed extended statuses.
  ParsingMark *parsing_marks_;
  // The position for next available mark.
  uint16 parsing_marks_pos_;

  // Mile stone list used to mark the extended status.
  MileStone *mile_stones_;
  // The position for the next available mile stone. We use positions (except 0)
  // as handles.
  MileStoneHandle mile_stones_pos_;

  // Get the offset of sons for a node.
  inline size_t get_son_offset(const LmaNodeGE1 *node);

  // Get the offset of homonious ids for a node.
  inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node);

  // Get the lemma id by the offset.
  inline LemmaIdType get_lemma_id(size_t id_offset);

  // Release the buffers owned by this object. When free_dict_list is true,
  // dict_list_ is destroyed as well.
  void free_resource(bool free_dict_list);

  // Load the binary dictionary content from an already-opened file.
  // Returns true on success.
  bool load_dict(QFile *fp);

  // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
  // them into the lpi_items buffer.
  // This function is called by the search engine.
  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
                         LmaNodeLE0 *node);

  // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
  // them into the lpi_items buffer.
  // This function is called by inner functions extend_dict0(), extend_dict1()
  // and extend_dict2().
  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
                         size_t homo_buf_off, LmaNodeGE1 *node,
                         uint16 lma_len);

  // Extend in the trie from level 0.
  MileStoneHandle extend_dict0(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Extend in the trie from level 1.
  MileStoneHandle extend_dict1(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Extend in the trie from level 2.
  MileStoneHandle extend_dict2(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Try to extend the given spelling id buffer, and if the given id_lemma can
  // be successfully gotten, return true;
  // The given spelling ids are all valid full ids.
  bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);

#ifdef ___BUILD_MODEL___
  // Write the trie data to an already-opened file (model-building only).
  bool save_dict(FILE *fp);
#endif  // ___BUILD_MODEL___

  // Capacity limits for the extension bookkeeping buffers above.
  static const int kMaxMileStone = 100;
  static const int kMaxParsingMark = 600;
  // Handle 0 is reserved as invalid; valid handles start at 1.
  static const MileStoneHandle kFirstValidMileStoneHandle = 1;

  friend class DictParser;
  friend class DictBuilder;

 public:

  DictTrie();
  ~DictTrie();

#ifdef ___BUILD_MODEL___
  // Construct the tree from the file fn_raw.
  // fn_validhzs provide the valid hanzi list. If fn_validhzs is
  // NULL, only chars in GB2312 will be included.
  bool build_dict(const char *fn_raw, const char *fn_validhzs);

  // Save the binary dictionary
  // Actually, the SpellingTrie/DictList instance will be also saved.
  bool save_dict(const char *filename);
#endif  // ___BUILD_MODEL___

  // In-place conversion of str between internal id form and hanzi form —
  // presumably delegating to dict_list_; TODO confirm against dictlist.h.
  void convert_to_hanzis(char16 *str, uint16 str_len);

  void convert_to_scis_ids(char16 *str, uint16 str_len);

  // Load a binary dictionary
  // The SpellingTrie instance/DictList will be also loaded
  bool load_dict(const char *filename, LemmaIdType start_id,
                 LemmaIdType end_id);
  // Load a binary dictionary from an open file descriptor, reading length
  // bytes starting at start_offset.
  bool load_dict_fd(int sys_fd, long start_offset, long length,
                    LemmaIdType start_id, LemmaIdType end_id);
  // No-op: the memory-resident trie needs no explicit close.
  bool close_dict() {return true;}
  // Not meaningful for this implementation; always returns 0.
  size_t number_of_lemmas() {return 0;}

  // Implements AtomDictBase::reset_milestones(); discards mile stones and
  // parsing marks from from_step / from_handle onward.
  void reset_milestones(uint16 from_step, MileStoneHandle from_handle);

  // Implements AtomDictBase::extend_dict(); dispatches to extend_dict0/1/2
  // depending on the level of from_handle — see atomdictbase.h.
  MileStoneHandle extend_dict(MileStoneHandle from_handle,
                              const DictExtPara *dep,
                              LmaPsbItem *lpi_items,
                              size_t lpi_max, size_t *lpi_num);

  // Fill lpi_items (up to lpi_max) with the lemmas matching the given full
  // spelling id string; returns the number of items filled.
  size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
                  LmaPsbItem *lpi_items, size_t lpi_max);

  // Copy the hanzi string of lemma id_lemma into str_buf (capacity str_max);
  // returns the copied length — see atomdictbase.h for the exact contract.
  uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);

  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                          uint16 splids_max, bool arg_valid);

  // Fill npre_items with predictions following the history hanzi string
  // last_hzs; b4_used is the number of items already used before this call.
  size_t predict(const char16 *last_hzs, uint16 hzs_len,
                 NPredictItem *npre_items, size_t npre_max,
                 size_t b4_used);

  // The system dictionary is read-only: the mutation APIs below are stubs
  // that always report failure / zero (see atomdictbase.h; the user
  // dictionary implements them for real).
  LemmaIdType put_lemma(char16 /*lemma_str*/[], uint16 /*splids*/[],
                        uint16 /*lemma_len*/, uint16 /*count*/) {return 0;}

  LemmaIdType update_lemma(LemmaIdType /*lemma_id*/, int16 /*delta_count*/,
                           bool /*selected*/) {return 0;}

  LemmaIdType get_lemma_id(char16 /*lemma_str*/[], uint16 /*splids*/[],
                           uint16 /*lemma_len*/) {return 0;}

  LmaScoreType get_lemma_score(LemmaIdType /*lemma_id*/) {return 0;}

  LmaScoreType get_lemma_score(char16 /*lemma_str*/[], uint16 /*splids*/[],
                        uint16 /*lemma_len*/) {return 0;}

  bool remove_lemma(LemmaIdType /*lemma_id*/) {return false;}

  size_t get_total_lemma_count() {return 0;}
  void set_total_lemma_count_of_others(size_t count);

  // No-op: this dictionary keeps no cache to flush.
  void flush_cache() {}

  // Look up the lemma id for the given hanzi string (non-stub overload).
  LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);

  // Fill the lemmas with highest scores to the prediction buffer.
  // his_len is the history length to fill in the prediction buffer.
  size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items,
                          size_t npre_max, size_t b4_used);
};
232 }
233 
234 #endif  // PINYINIME_INCLUDE_DICTTRIE_H__
235