1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "dictionary/system/system_dictionary_builder.h"
31
32 #include <algorithm>
33 #include <climits>
34 #include <cstring>
35 #include <sstream>
36
37 #include "base/file_stream.h"
38 #include "base/flags.h"
39 #include "base/logging.h"
40 #include "base/mozc_hash_set.h"
41 #include "base/util.h"
42 #include "dictionary/dictionary_token.h"
43 #include "dictionary/file/codec_factory.h"
44 #include "dictionary/file/codec_interface.h"
45 #include "dictionary/pos_matcher.h"
46 #include "dictionary/system/codec.h"
47 #include "dictionary/system/codec_interface.h"
48 #include "dictionary/system/words_info.h"
49 #include "dictionary/text_dictionary_loader.h"
50 #include "storage/louds/bit_vector_based_array_builder.h"
51 #include "storage/louds/louds_trie_builder.h"
52
53 DEFINE_bool(preserve_intermediate_dictionary, false,
54 "preserve inetemediate dictionary file.");
55 DEFINE_int32(min_key_length_to_use_small_cost_encoding, 6,
56 "minimum key length to use 1 byte cost encoding.");
57
58 namespace mozc {
59 namespace dictionary {
60
61 using mozc::storage::louds::LoudsTrieBuilder;
62 using mozc::storage::louds::BitVectorBasedArrayBuilder;
63
64 namespace {
65
66 struct TokenGreaterThan {
operator ()mozc::dictionary::__anon6b0ee3740111::TokenGreaterThan67 inline bool operator()(const TokenInfo& lhs,
68 const TokenInfo& rhs) const {
69 if (lhs.token->lid != rhs.token->lid) {
70 return lhs.token->lid > rhs.token->lid;
71 }
72 if (lhs.token->rid != rhs.token->rid) {
73 return lhs.token->rid > rhs.token->rid;
74 }
75 if (lhs.id_in_value_trie != rhs.id_in_value_trie) {
76 return lhs.id_in_value_trie < rhs.id_in_value_trie;
77 }
78 return lhs.token->attributes < rhs.token->attributes;
79 }
80 };
81
WriteSectionToFile(const DictionaryFileSection & section,const string & filename)82 void WriteSectionToFile(const DictionaryFileSection §ion,
83 const string &filename) {
84 OutputFileStream ofs(filename.c_str(), std::ios::binary | std::ios::out);
85 ofs.write(section.ptr, section.len);
86 }
87
88 } // namespace
89
SystemDictionaryBuilder()90 SystemDictionaryBuilder::SystemDictionaryBuilder()
91 : value_trie_builder_(new LoudsTrieBuilder),
92 key_trie_builder_(new LoudsTrieBuilder),
93 token_array_builder_(new BitVectorBasedArrayBuilder),
94 codec_(SystemDictionaryCodecFactory::GetCodec()),
95 file_codec_(DictionaryFileCodecFactory::GetCodec()) {}
96
97 // This class does not have the ownership of |codec|.
SystemDictionaryBuilder(const SystemDictionaryCodecInterface * codec,const DictionaryFileCodecInterface * file_codec)98 SystemDictionaryBuilder::SystemDictionaryBuilder(
99 const SystemDictionaryCodecInterface *codec,
100 const DictionaryFileCodecInterface *file_codec)
101 : value_trie_builder_(new LoudsTrieBuilder),
102 key_trie_builder_(new LoudsTrieBuilder),
103 token_array_builder_(new BitVectorBasedArrayBuilder),
104 codec_(codec),
105 file_codec_(file_codec) {}
106
~SystemDictionaryBuilder()107 SystemDictionaryBuilder::~SystemDictionaryBuilder() {}
108
BuildFromTokens(const std::vector<Token * > & tokens)109 void SystemDictionaryBuilder::BuildFromTokens(
110 const std::vector<Token *> &tokens) {
111 KeyInfoList key_info_list;
112 ReadTokens(tokens, &key_info_list);
113
114 BuildFrequentPos(key_info_list);
115 BuildValueTrie(key_info_list);
116 BuildKeyTrie(key_info_list);
117
118 SetIdForValue(&key_info_list);
119 SetIdForKey(&key_info_list);
120 SortTokenInfo(&key_info_list);
121 SetCostType(&key_info_list);
122 SetPosType(&key_info_list);
123 SetValueType(&key_info_list);
124
125 BuildTokenArray(key_info_list);
126 }
127
WriteToFile(const string & output_file) const128 void SystemDictionaryBuilder::WriteToFile(const string &output_file) const {
129 OutputFileStream ofs(output_file.c_str(), std::ios::binary | std::ios::out);
130 WriteToStream(output_file, &ofs);
131 }
132
WriteToStream(const string & intermediate_output_file_base_path,std::ostream * output_stream) const133 void SystemDictionaryBuilder::WriteToStream(
134 const string &intermediate_output_file_base_path,
135 std::ostream *output_stream) const {
136 // Memory images of each section
137 std::vector<DictionaryFileSection> sections;
138 DictionaryFileSection value_trie_section(
139 value_trie_builder_->image().data(),
140 value_trie_builder_->image().size(),
141 file_codec_->GetSectionName(codec_->GetSectionNameForValue()));
142 sections.push_back(value_trie_section);
143
144 DictionaryFileSection key_trie_section(
145 key_trie_builder_->image().data(),
146 key_trie_builder_->image().size(),
147 file_codec_->GetSectionName(codec_->GetSectionNameForKey()));
148 sections.push_back(key_trie_section);
149
150 DictionaryFileSection token_array_section(
151 token_array_builder_->image().data(),
152 token_array_builder_->image().size(),
153 file_codec_->GetSectionName(codec_->GetSectionNameForTokens()));
154
155 sections.push_back(token_array_section);
156 uint32 frequent_pos_array[256] = {0};
157 for (std::map<uint32, int>::const_iterator i = frequent_pos_.begin();
158 i != frequent_pos_.end(); ++i) {
159 frequent_pos_array[i->second] = i->first;
160 }
161 DictionaryFileSection frequent_pos_section(
162 reinterpret_cast<const char *>(frequent_pos_array),
163 sizeof frequent_pos_array,
164 file_codec_->GetSectionName(codec_->GetSectionNameForPos()));
165 sections.push_back(frequent_pos_section);
166
167 if (FLAGS_preserve_intermediate_dictionary &&
168 !intermediate_output_file_base_path.empty()) {
169 // Write out intermediate results to files.
170 const string &basepath = intermediate_output_file_base_path;
171 LOG(INFO) << "Writing intermediate files.";
172 WriteSectionToFile(value_trie_section, basepath + ".value");
173 WriteSectionToFile(key_trie_section, basepath + ".key");
174 WriteSectionToFile(token_array_section, basepath + ".tokens");
175 WriteSectionToFile(frequent_pos_section, basepath + ".freq_pos");
176 }
177
178 LOG(INFO) << "Start writing dictionary file.";
179 file_codec_->WriteSections(sections, output_stream);
180 LOG(INFO) << "Start writing dictionary file... done.";
181 }
182
183 namespace {
GetCombinedPos(uint16 lid,uint16 rid)184 uint32 GetCombinedPos(uint16 lid, uint16 rid) {
185 return (lid << 16) | rid;
186 }
187
GetValueType(const Token * token)188 TokenInfo::ValueType GetValueType(const Token *token) {
189 if (token->value == token->key) {
190 return TokenInfo::AS_IS_HIRAGANA;
191 }
192 string katakana;
193 Util::HiraganaToKatakana(token->key, &katakana);
194 if (token->value == katakana) {
195 return TokenInfo::AS_IS_KATAKANA;
196 }
197 return TokenInfo::DEFAULT_VALUE;
198 }
199
HasHomonymsInSamePos(const SystemDictionaryBuilder::KeyInfo & key_info)200 bool HasHomonymsInSamePos(
201 const SystemDictionaryBuilder::KeyInfo &key_info) {
202 // Early exit path mainly for performance.
203 if (key_info.tokens.size() == 1) {
204 return false;
205 }
206
207 mozc_hash_set<uint32> seen;
208 for (size_t i = 0; i < key_info.tokens.size(); ++i) {
209 const Token *token = key_info.tokens[i].token;
210 const uint32 pos = GetCombinedPos(token->lid, token->rid);
211 if (!seen.insert(pos).second) {
212 // Insertion failed, which means we already have |pos|.
213 return true;
214 }
215 }
216 return false;
217 }
218
219 struct TokenPtrLessThan {
operator ()mozc::dictionary::__anon6b0ee3740211::TokenPtrLessThan220 inline bool operator()(const Token* lhs, const Token* rhs) const {
221 return lhs->key < rhs->key;
222 }
223 };
224
225 } // namespace
226
ReadTokens(const std::vector<Token * > & tokens,KeyInfoList * key_info_list) const227 void SystemDictionaryBuilder::ReadTokens(const std::vector<Token *> &tokens,
228 KeyInfoList *key_info_list) const {
229 // Create KeyInfoList in two steps.
230 // 1. Create an array of Token with (stably) sorting Token::key.
231 // [Token 1(key:aaa)][Token 2(key:aaa)][Token 3(key:abc)][...]
232 // 2. Group Token(s) by Token::Key and convert them into KeyInfo.
233 // [KeyInfo(key:aaa)[Token 1][Token 2]][KeyInfo(key:abc)[Token 3]][...]
234
235 // Step 1.
236 typedef std::vector<Token *> ReduceBuffer;
237 ReduceBuffer reduce_buffer;
238 reduce_buffer.reserve(tokens.size());
239 for (std::vector<Token *>::const_iterator iter = tokens.begin();
240 iter != tokens.end(); ++iter) {
241 Token *token = *iter;
242 CHECK(!token->key.empty()) << "empty key string in input";
243 CHECK(!token->value.empty()) << "empty value string in input";
244 reduce_buffer.push_back(token);
245 }
246 std::stable_sort(reduce_buffer.begin(), reduce_buffer.end(),
247 TokenPtrLessThan());
248
249 // Step 2.
250 key_info_list->clear();
251 if (reduce_buffer.size() == 0) {
252 return;
253 }
254 KeyInfo last_key_info;
255 last_key_info.key = reduce_buffer.front()->key;
256 for (ReduceBuffer::const_iterator iter = reduce_buffer.begin();
257 iter != reduce_buffer.end(); ++iter) {
258 Token *token = *iter;
259 if (last_key_info.key != token->key) {
260 key_info_list->push_back(last_key_info);
261 last_key_info = KeyInfo();
262 last_key_info.key = token->key;
263 }
264 last_key_info.tokens.push_back(TokenInfo(token));
265 last_key_info.tokens.back().value_type = GetValueType(token);
266 }
267 key_info_list->push_back(last_key_info);
268 }
269
BuildFrequentPos(const KeyInfoList & key_info_list)270 void SystemDictionaryBuilder::BuildFrequentPos(
271 const KeyInfoList &key_info_list) {
272 // Calculate frequency of each pos
273 // TODO(toshiyuki): It might be better to count frequency
274 // with considering same_as_prev_pos.
275 std::map<uint32, int> pos_map;
276 for (KeyInfoList::const_iterator itr = key_info_list.begin();
277 itr != key_info_list.end(); ++itr) {
278 const KeyInfo &key_info = *itr;
279 for (size_t i = 0; i < key_info.tokens.size(); ++i) {
280 const Token *token = key_info.tokens[i].token;
281 pos_map[GetCombinedPos(token->lid, token->rid)]++;
282 }
283 }
284
285 // Get histgram of frequency
286 std::map<int, int> freq_map;
287 for (std::map<uint32, int>::const_iterator jt = pos_map.begin();
288 jt != pos_map.end(); ++jt) {
289 freq_map[jt->second]++;
290 }
291 // Compute the lower threshold of frequence
292 int num_freq_pos = 0;
293 int freq_threshold = INT_MAX;
294 for (std::map<int, int>::reverse_iterator kt = freq_map.rbegin();
295 kt != freq_map.rend(); ++kt) {
296 if (num_freq_pos + kt->second > 255) {
297 break;
298 }
299 freq_threshold = kt->first;
300 num_freq_pos += kt->second;
301 }
302
303 // Collect high frequent pos
304 VLOG(1) << "num_freq_pos" << num_freq_pos;
305 VLOG(1) << "Pos threshold=" << freq_threshold;
306 int freq_pos_idx = 0;
307 int num_tokens = 0;
308 std::map<uint32, int>::iterator lt;
309 for (lt = pos_map.begin(); lt != pos_map.end(); ++lt) {
310 if (lt->second >= freq_threshold) {
311 frequent_pos_[lt->first] = freq_pos_idx;
312 freq_pos_idx++;
313 num_tokens += lt->second;
314 }
315 }
316 CHECK(freq_pos_idx == num_freq_pos)
317 << "inconsistent result to find frequent pos";
318 VLOG(1) << freq_pos_idx << " high frequent Pos has "
319 << num_tokens << " tokens";
320 }
321
322
BuildValueTrie(const KeyInfoList & key_info_list)323 void SystemDictionaryBuilder::BuildValueTrie(const KeyInfoList &key_info_list) {
324 for (KeyInfoList::const_iterator itr = key_info_list.begin();
325 itr != key_info_list.end(); ++itr) {
326 const KeyInfo &key_info = *itr;
327 for (size_t i = 0; i < key_info.tokens.size(); ++i) {
328 const TokenInfo &token_info = key_info.tokens[i];
329 if (token_info.value_type == TokenInfo::AS_IS_HIRAGANA ||
330 token_info.value_type == TokenInfo::AS_IS_KATAKANA) {
331 // These values will be stored in token array as flags
332 continue;
333 }
334 string value_str;
335 codec_->EncodeValue(token_info.token->value, &value_str);
336 value_trie_builder_->Add(value_str);
337 }
338 }
339 value_trie_builder_->Build();
340 }
341
SetIdForValue(KeyInfoList * key_info_list) const342 void SystemDictionaryBuilder::SetIdForValue(KeyInfoList *key_info_list) const {
343 for (KeyInfoList::iterator itr = key_info_list->begin();
344 itr != key_info_list->end(); ++itr) {
345 for (size_t i = 0; i < itr->tokens.size(); ++i) {
346 TokenInfo *token_info = &(itr->tokens[i]);
347 string value_str;
348 codec_->EncodeValue(token_info->token->value, &value_str);
349 token_info->id_in_value_trie =
350 value_trie_builder_->GetId(value_str);
351 }
352 }
353 }
354
SortTokenInfo(KeyInfoList * key_info_list) const355 void SystemDictionaryBuilder::SortTokenInfo(KeyInfoList *key_info_list) const {
356 for (KeyInfoList::iterator itr = key_info_list->begin();
357 itr != key_info_list->end(); ++itr) {
358 KeyInfo *key_info = &(*itr);
359 std::sort(key_info->tokens.begin(), key_info->tokens.end(),
360 TokenGreaterThan());
361 }
362 }
363
SetCostType(KeyInfoList * key_info_list) const364 void SystemDictionaryBuilder::SetCostType(KeyInfoList *key_info_list) const {
365 for (KeyInfoList::iterator itr = key_info_list->begin();
366 itr != key_info_list->end(); ++itr) {
367 KeyInfo *key_info = &(*itr);
368 if (HasHomonymsInSamePos(*key_info)) {
369 continue;
370 }
371 for (size_t i = 0; i < key_info->tokens.size(); ++i) {
372 TokenInfo *token_info = &key_info->tokens[i];
373 const int key_len = Util::CharsLen(token_info->token->key);
374 if (key_len >= FLAGS_min_key_length_to_use_small_cost_encoding) {
375 token_info->cost_type = TokenInfo::CAN_USE_SMALL_ENCODING;
376 }
377 }
378 }
379 }
380
SetPosType(KeyInfoList * key_info_list) const381 void SystemDictionaryBuilder::SetPosType(KeyInfoList *key_info_list) const {
382 for (KeyInfoList::iterator itr = key_info_list->begin();
383 itr != key_info_list->end(); ++itr) {
384 KeyInfo *key_info = &(*itr);
385 for (size_t i = 0; i < key_info->tokens.size(); ++i) {
386 TokenInfo *token_info = &(key_info->tokens[i]);
387 const uint32 pos = GetCombinedPos(token_info->token->lid,
388 token_info->token->rid);
389 std::map<uint32, int>::const_iterator itr = frequent_pos_.find(pos);
390 if (itr != frequent_pos_.end()) {
391 token_info->pos_type = TokenInfo::FREQUENT_POS;
392 token_info->id_in_frequent_pos_map = itr->second;
393 }
394 if (i >= 1) {
395 const TokenInfo &prev_token_info = key_info->tokens[i - 1];
396 const uint32 prev_pos = GetCombinedPos(prev_token_info.token->lid,
397 prev_token_info.token->rid);
398 if (prev_pos == pos) {
399 // we can overwrite FREQUENT_POS
400 token_info->pos_type = TokenInfo::SAME_AS_PREV_POS;
401 }
402 }
403 }
404 }
405 }
406
SetValueType(KeyInfoList * key_info_list) const407 void SystemDictionaryBuilder::SetValueType(KeyInfoList *key_info_list) const {
408 for (KeyInfoList::iterator itr = key_info_list->begin();
409 itr != key_info_list->end(); ++itr) {
410 KeyInfo *key_info = &(*itr);
411 for (size_t i = 1; i < key_info->tokens.size(); ++i) {
412 const TokenInfo *prev_token_info = &(key_info->tokens[i - 1]);
413 TokenInfo *token_info = &(key_info->tokens[i]);
414 if (token_info->value_type != TokenInfo::AS_IS_HIRAGANA &&
415 token_info->value_type != TokenInfo::AS_IS_KATAKANA &&
416 (token_info->token->value == prev_token_info->token->value)) {
417 token_info->value_type = TokenInfo::SAME_AS_PREV_VALUE;
418 }
419 }
420 }
421 }
422
BuildKeyTrie(const KeyInfoList & key_info_list)423 void SystemDictionaryBuilder::BuildKeyTrie(const KeyInfoList &key_info_list) {
424 for (KeyInfoList::const_iterator itr = key_info_list.begin();
425 itr != key_info_list.end(); ++itr) {
426 string key_str;
427 codec_->EncodeKey(itr->key, &key_str);
428 key_trie_builder_->Add(key_str);
429 }
430 key_trie_builder_->Build();
431 }
432
SetIdForKey(KeyInfoList * key_info_list) const433 void SystemDictionaryBuilder::SetIdForKey(KeyInfoList *key_info_list) const {
434 for (KeyInfoList::iterator itr = key_info_list->begin();
435 itr != key_info_list->end(); ++itr) {
436 KeyInfo *key_info = &(*itr);
437 string key_str;
438 codec_->EncodeKey(key_info->key, &key_str);
439 key_info->id_in_key_trie =
440 key_trie_builder_->GetId(key_str);
441 }
442 }
443
BuildTokenArray(const KeyInfoList & key_info_list)444 void SystemDictionaryBuilder::BuildTokenArray(
445 const KeyInfoList &key_info_list) {
446 // Here we make a reverse lookup table as follows:
447 // |key_info_list[X].id_in_key_trie| -> |key_info_list[X]|
448 // assuming |key_info_list[X].id_in_key_trie| is unique and successive.
449 {
450 std::vector<const KeyInfo *> id_to_keyinfo_table;
451 id_to_keyinfo_table.resize(key_info_list.size());
452 for (KeyInfoList::const_iterator itr = key_info_list.begin();
453 itr != key_info_list.end(); ++itr) {
454 const KeyInfo &key_info = *itr;
455 const int id = key_info.id_in_key_trie;
456 id_to_keyinfo_table[id] = &key_info;
457 }
458
459 for (size_t i = 0; i < id_to_keyinfo_table.size(); ++i) {
460 const KeyInfo &key_info = *id_to_keyinfo_table[i];
461 string tokens_str;
462 codec_->EncodeTokens(key_info.tokens, &tokens_str);
463 token_array_builder_->Add(tokens_str);
464 }
465 }
466
467 token_array_builder_->Add(string(1, codec_->GetTokensTerminationFlag()));
468 token_array_builder_->Build();
469 }
470
471 } // namespace dictionary
472 } // namespace mozc
473