1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef PINYINIME_INCLUDE_DICTTRIE_H__ 18 #define PINYINIME_INCLUDE_DICTTRIE_H__ 19 20 #include <stdlib.h> 21 #include "./atomdictbase.h" 22 #include "./dictdef.h" 23 #include "./dictlist.h" 24 #include "./searchutility.h" 25 #include <QFile> 26 27 namespace ime_pinyin { 28 29 class DictTrie : AtomDictBase { 30 private: 31 struct ParsingMark { 32 size_t node_offset:24; 33 size_t node_num:8; // Number of nodes with this spelling id given 34 // by spl_id. If spl_id is a Shengmu, for nodes 35 // in the first layer of DictTrie, it equals to 36 // SpellingTrie::shm2full_num(); but for those 37 // nodes which are not in the first layer, 38 // node_num < SpellingTrie::shm2full_num(). 39 // For a full spelling id, node_num = 1; 40 }; 41 42 // Used to indicate an extended mile stone. 43 // An extended mile stone is used to mark a partial match in the dictionary 44 // trie to speed up further potential extending. 45 // For example, when the user inputs "w", a mile stone is created to mark the 46 // partial match status, so that when user inputs another char 'm', it will be 47 // faster to extend search space based on this mile stone. 48 // 49 // For partial match status of "wm", there can be more than one sub mile 50 // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so 51 // there may be more one parsing mark used to mark these partial matchings. 52 // A mile stone records the starting position in the mark list and number of 53 // marks. 54 struct MileStone { 55 uint16 mark_start; 56 uint16 mark_num; 57 }; 58 59 DictList* dict_list_; 60 61 const SpellingTrie *spl_trie_; 62 63 LmaNodeLE0* root_; // Nodes for root and the first layer. 64 LmaNodeGE1* nodes_ge1_; // Nodes for other layers. 65 66 // An quick index from spelling id to the LmaNodeLE0 node buffer, or 67 // to the root_ buffer. 68 // Index length: 69 // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used 70 // to get the end. 71 // All Shengmu ids are not indexed because they will be converted into 72 // corresponding full ids. 73 // So, given an id splid, the son is: 74 // root_[splid_le0_index_[splid - kFullSplIdStart]] 75 uint16 *splid_le0_index_; 76 77 uint32 lma_node_num_le0_; 78 uint32 lma_node_num_ge1_; 79 80 // The first part is for homophnies, and the last top_lma_num_ items are 81 // lemmas with highest scores. 82 unsigned char *lma_idx_buf_; 83 uint32 lma_idx_buf_len_; // The total size of lma_idx_buf_ in byte. 84 uint32 total_lma_num_; // Total number of lemmas in this dictionary. 85 uint32 top_lmas_num_; // Number of lemma with highest scores. 86 87 // Parsing mark list used to mark the detailed extended statuses. 88 ParsingMark *parsing_marks_; 89 // The position for next available mark. 90 uint16 parsing_marks_pos_; 91 92 // Mile stone list used to mark the extended status. 93 MileStone *mile_stones_; 94 // The position for the next available mile stone. We use positions (except 0) 95 // as handles. 96 MileStoneHandle mile_stones_pos_; 97 98 // Get the offset of sons for a node. 99 inline size_t get_son_offset(const LmaNodeGE1 *node); 100 101 // Get the offset of homonious ids for a node. 102 inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node); 103 104 // Get the lemma id by the offset. 105 inline LemmaIdType get_lemma_id(size_t id_offset); 106 107 void free_resource(bool free_dict_list); 108 109 bool load_dict(QFile *fp); 110 111 // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill 112 // them into the lpi_items buffer. 113 // This function is called by the search engine. 114 size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, 115 LmaNodeLE0 *node); 116 117 // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill 118 // them into the lpi_items buffer. 119 // This function is called by inner functions extend_dict0(), extend_dict1() 120 // and extend_dict2(). 121 size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, 122 size_t homo_buf_off, LmaNodeGE1 *node, 123 uint16 lma_len); 124 125 // Extend in the trie from level 0. 126 MileStoneHandle extend_dict0(MileStoneHandle from_handle, 127 const DictExtPara *dep, LmaPsbItem *lpi_items, 128 size_t lpi_max, size_t *lpi_num); 129 130 // Extend in the trie from level 1. 131 MileStoneHandle extend_dict1(MileStoneHandle from_handle, 132 const DictExtPara *dep, LmaPsbItem *lpi_items, 133 size_t lpi_max, size_t *lpi_num); 134 135 // Extend in the trie from level 2. 136 MileStoneHandle extend_dict2(MileStoneHandle from_handle, 137 const DictExtPara *dep, LmaPsbItem *lpi_items, 138 size_t lpi_max, size_t *lpi_num); 139 140 // Try to extend the given spelling id buffer, and if the given id_lemma can 141 // be successfully gotten, return true; 142 // The given spelling ids are all valid full ids. 143 bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma); 144 145 #ifdef ___BUILD_MODEL___ 146 bool save_dict(FILE *fp); 147 #endif // ___BUILD_MODEL___ 148 149 static const int kMaxMileStone = 100; 150 static const int kMaxParsingMark = 600; 151 static const MileStoneHandle kFirstValidMileStoneHandle = 1; 152 153 friend class DictParser; 154 friend class DictBuilder; 155 156 public: 157 158 DictTrie(); 159 ~DictTrie(); 160 161 #ifdef ___BUILD_MODEL___ 162 // Construct the tree from the file fn_raw. 163 // fn_validhzs provide the valid hanzi list. If fn_validhzs is 164 // NULL, only chars in GB2312 will be included. 165 bool build_dict(const char *fn_raw, const char *fn_validhzs); 166 167 // Save the binary dictionary 168 // Actually, the SpellingTrie/DictList instance will be also saved. 169 bool save_dict(const char *filename); 170 #endif // ___BUILD_MODEL___ 171 172 void convert_to_hanzis(char16 *str, uint16 str_len); 173 174 void convert_to_scis_ids(char16 *str, uint16 str_len); 175 176 // Load a binary dictionary 177 // The SpellingTrie instance/DictList will be also loaded 178 bool load_dict(const char *filename, LemmaIdType start_id, 179 LemmaIdType end_id); 180 bool load_dict_fd(int sys_fd, long start_offset, long length, 181 LemmaIdType start_id, LemmaIdType end_id); close_dict()182 bool close_dict() {return true;} number_of_lemmas()183 size_t number_of_lemmas() {return 0;} 184 185 void reset_milestones(uint16 from_step, MileStoneHandle from_handle); 186 187 MileStoneHandle extend_dict(MileStoneHandle from_handle, 188 const DictExtPara *dep, 189 LmaPsbItem *lpi_items, 190 size_t lpi_max, size_t *lpi_num); 191 192 size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, 193 LmaPsbItem *lpi_items, size_t lpi_max); 194 195 uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); 196 197 uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 198 uint16 splids_max, bool arg_valid); 199 200 size_t predict(const char16 *last_hzs, uint16 hzs_len, 201 NPredictItem *npre_items, size_t npre_max, 202 size_t b4_used); 203 put_lemma(char16[],uint16[],uint16,uint16)204 LemmaIdType put_lemma(char16 /*lemma_str*/[], uint16 /*splids*/[], 205 uint16 /*lemma_len*/, uint16 /*count*/) {return 0;} 206 update_lemma(LemmaIdType,int16,bool)207 LemmaIdType update_lemma(LemmaIdType /*lemma_id*/, int16 /*delta_count*/, 208 bool /*selected*/) {return 0;} 209 get_lemma_id(char16[],uint16[],uint16)210 LemmaIdType get_lemma_id(char16 /*lemma_str*/[], uint16 /*splids*/[], 211 uint16 /*lemma_len*/) {return 0;} 212 get_lemma_score(LemmaIdType)213 LmaScoreType get_lemma_score(LemmaIdType /*lemma_id*/) {return 0;} 214 get_lemma_score(char16[],uint16[],uint16)215 LmaScoreType get_lemma_score(char16 /*lemma_str*/[], uint16 /*splids*/[], 216 uint16 /*lemma_len*/) {return 0;} 217 remove_lemma(LemmaIdType)218 bool remove_lemma(LemmaIdType /*lemma_id*/) {return false;} 219 get_total_lemma_count()220 size_t get_total_lemma_count() {return 0;} 221 void set_total_lemma_count_of_others(size_t count); 222 flush_cache()223 void flush_cache() {} 224 225 LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len); 226 227 // Fill the lemmas with highest scores to the prediction buffer. 228 // his_len is the history length to fill in the prediction buffer. 229 size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items, 230 size_t npre_max, size_t b4_used); 231 }; 232 } 233 234 #endif // PINYINIME_INCLUDE_DICTTRIE_H__ 235