1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "dictionary/system/system_dictionary_builder.h"
31 
32 #include <algorithm>
33 #include <climits>
34 #include <cstring>
35 #include <sstream>
36 
37 #include "base/file_stream.h"
38 #include "base/flags.h"
39 #include "base/logging.h"
40 #include "base/mozc_hash_set.h"
41 #include "base/util.h"
42 #include "dictionary/dictionary_token.h"
43 #include "dictionary/file/codec_factory.h"
44 #include "dictionary/file/codec_interface.h"
45 #include "dictionary/pos_matcher.h"
46 #include "dictionary/system/codec.h"
47 #include "dictionary/system/codec_interface.h"
48 #include "dictionary/system/words_info.h"
49 #include "dictionary/text_dictionary_loader.h"
50 #include "storage/louds/bit_vector_based_array_builder.h"
51 #include "storage/louds/louds_trie_builder.h"
52 
53 DEFINE_bool(preserve_intermediate_dictionary, false,
54             "preserve inetemediate dictionary file.");
55 DEFINE_int32(min_key_length_to_use_small_cost_encoding, 6,
56              "minimum key length to use 1 byte cost encoding.");
57 
58 namespace mozc {
59 namespace dictionary {
60 
61 using mozc::storage::louds::LoudsTrieBuilder;
62 using mozc::storage::louds::BitVectorBasedArrayBuilder;
63 
64 namespace {
65 
66 struct TokenGreaterThan {
operator ()mozc::dictionary::__anon6b0ee3740111::TokenGreaterThan67   inline bool operator()(const TokenInfo& lhs,
68                          const TokenInfo& rhs) const {
69     if (lhs.token->lid != rhs.token->lid) {
70       return lhs.token->lid > rhs.token->lid;
71     }
72     if (lhs.token->rid != rhs.token->rid) {
73       return lhs.token->rid > rhs.token->rid;
74     }
75     if (lhs.id_in_value_trie != rhs.id_in_value_trie) {
76       return lhs.id_in_value_trie < rhs.id_in_value_trie;
77     }
78     return lhs.token->attributes < rhs.token->attributes;
79   }
80 };
81 
WriteSectionToFile(const DictionaryFileSection & section,const string & filename)82 void WriteSectionToFile(const DictionaryFileSection &section,
83                         const string &filename) {
84   OutputFileStream ofs(filename.c_str(), std::ios::binary | std::ios::out);
85   ofs.write(section.ptr, section.len);
86 }
87 
88 }  // namespace
89 
SystemDictionaryBuilder()90 SystemDictionaryBuilder::SystemDictionaryBuilder()
91     : value_trie_builder_(new LoudsTrieBuilder),
92       key_trie_builder_(new LoudsTrieBuilder),
93       token_array_builder_(new BitVectorBasedArrayBuilder),
94       codec_(SystemDictionaryCodecFactory::GetCodec()),
95       file_codec_(DictionaryFileCodecFactory::GetCodec()) {}
96 
97 // This class does not have the ownership of |codec|.
SystemDictionaryBuilder(const SystemDictionaryCodecInterface * codec,const DictionaryFileCodecInterface * file_codec)98 SystemDictionaryBuilder::SystemDictionaryBuilder(
99     const SystemDictionaryCodecInterface *codec,
100     const DictionaryFileCodecInterface *file_codec)
101     : value_trie_builder_(new LoudsTrieBuilder),
102       key_trie_builder_(new LoudsTrieBuilder),
103       token_array_builder_(new BitVectorBasedArrayBuilder),
104       codec_(codec),
105       file_codec_(file_codec) {}
106 
~SystemDictionaryBuilder()107 SystemDictionaryBuilder::~SystemDictionaryBuilder() {}
108 
BuildFromTokens(const std::vector<Token * > & tokens)109 void SystemDictionaryBuilder::BuildFromTokens(
110     const std::vector<Token *> &tokens) {
111   KeyInfoList key_info_list;
112   ReadTokens(tokens, &key_info_list);
113 
114   BuildFrequentPos(key_info_list);
115   BuildValueTrie(key_info_list);
116   BuildKeyTrie(key_info_list);
117 
118   SetIdForValue(&key_info_list);
119   SetIdForKey(&key_info_list);
120   SortTokenInfo(&key_info_list);
121   SetCostType(&key_info_list);
122   SetPosType(&key_info_list);
123   SetValueType(&key_info_list);
124 
125   BuildTokenArray(key_info_list);
126 }
127 
WriteToFile(const string & output_file) const128 void SystemDictionaryBuilder::WriteToFile(const string &output_file) const {
129   OutputFileStream ofs(output_file.c_str(), std::ios::binary | std::ios::out);
130   WriteToStream(output_file, &ofs);
131 }
132 
WriteToStream(const string & intermediate_output_file_base_path,std::ostream * output_stream) const133 void SystemDictionaryBuilder::WriteToStream(
134     const string &intermediate_output_file_base_path,
135     std::ostream *output_stream) const {
136   // Memory images of each section
137   std::vector<DictionaryFileSection> sections;
138   DictionaryFileSection value_trie_section(
139     value_trie_builder_->image().data(),
140     value_trie_builder_->image().size(),
141     file_codec_->GetSectionName(codec_->GetSectionNameForValue()));
142   sections.push_back(value_trie_section);
143 
144   DictionaryFileSection key_trie_section(
145     key_trie_builder_->image().data(),
146     key_trie_builder_->image().size(),
147     file_codec_->GetSectionName(codec_->GetSectionNameForKey()));
148   sections.push_back(key_trie_section);
149 
150   DictionaryFileSection token_array_section(
151     token_array_builder_->image().data(),
152     token_array_builder_->image().size(),
153     file_codec_->GetSectionName(codec_->GetSectionNameForTokens()));
154 
155   sections.push_back(token_array_section);
156   uint32 frequent_pos_array[256] = {0};
157   for (std::map<uint32, int>::const_iterator i = frequent_pos_.begin();
158        i != frequent_pos_.end(); ++i) {
159     frequent_pos_array[i->second] = i->first;
160   }
161   DictionaryFileSection frequent_pos_section(
162     reinterpret_cast<const char *>(frequent_pos_array),
163     sizeof frequent_pos_array,
164     file_codec_->GetSectionName(codec_->GetSectionNameForPos()));
165   sections.push_back(frequent_pos_section);
166 
167   if (FLAGS_preserve_intermediate_dictionary &&
168       !intermediate_output_file_base_path.empty()) {
169     // Write out intermediate results to files.
170     const string &basepath = intermediate_output_file_base_path;
171     LOG(INFO) << "Writing intermediate files.";
172     WriteSectionToFile(value_trie_section, basepath + ".value");
173     WriteSectionToFile(key_trie_section, basepath + ".key");
174     WriteSectionToFile(token_array_section, basepath + ".tokens");
175     WriteSectionToFile(frequent_pos_section, basepath + ".freq_pos");
176   }
177 
178   LOG(INFO) << "Start writing dictionary file.";
179   file_codec_->WriteSections(sections, output_stream);
180   LOG(INFO) << "Start writing dictionary file... done.";
181 }
182 
183 namespace {
GetCombinedPos(uint16 lid,uint16 rid)184 uint32 GetCombinedPos(uint16 lid, uint16 rid) {
185   return (lid << 16) | rid;
186 }
187 
GetValueType(const Token * token)188 TokenInfo::ValueType GetValueType(const Token *token) {
189   if (token->value == token->key) {
190     return TokenInfo::AS_IS_HIRAGANA;
191   }
192   string katakana;
193   Util::HiraganaToKatakana(token->key, &katakana);
194   if (token->value == katakana) {
195     return TokenInfo::AS_IS_KATAKANA;
196   }
197   return TokenInfo::DEFAULT_VALUE;
198 }
199 
HasHomonymsInSamePos(const SystemDictionaryBuilder::KeyInfo & key_info)200 bool HasHomonymsInSamePos(
201     const SystemDictionaryBuilder::KeyInfo &key_info) {
202   // Early exit path mainly for performance.
203   if (key_info.tokens.size() == 1) {
204     return false;
205   }
206 
207   mozc_hash_set<uint32> seen;
208   for (size_t i = 0; i < key_info.tokens.size(); ++i) {
209     const Token *token = key_info.tokens[i].token;
210     const uint32 pos = GetCombinedPos(token->lid, token->rid);
211     if (!seen.insert(pos).second) {
212       // Insertion failed, which means we already have |pos|.
213       return true;
214     }
215   }
216   return false;
217 }
218 
219 struct TokenPtrLessThan {
operator ()mozc::dictionary::__anon6b0ee3740211::TokenPtrLessThan220   inline bool operator()(const Token* lhs, const Token* rhs) const {
221     return lhs->key < rhs->key;
222   }
223 };
224 
225 }  // namespace
226 
ReadTokens(const std::vector<Token * > & tokens,KeyInfoList * key_info_list) const227 void SystemDictionaryBuilder::ReadTokens(const std::vector<Token *> &tokens,
228                                          KeyInfoList *key_info_list) const {
229   // Create KeyInfoList in two steps.
230   // 1. Create an array of Token with (stably) sorting Token::key.
231   //    [Token 1(key:aaa)][Token 2(key:aaa)][Token 3(key:abc)][...]
232   // 2. Group Token(s) by Token::Key and convert them into KeyInfo.
233   //    [KeyInfo(key:aaa)[Token 1][Token 2]][KeyInfo(key:abc)[Token 3]][...]
234 
235   // Step 1.
236   typedef std::vector<Token *> ReduceBuffer;
237   ReduceBuffer reduce_buffer;
238   reduce_buffer.reserve(tokens.size());
239   for (std::vector<Token *>::const_iterator iter = tokens.begin();
240        iter != tokens.end(); ++iter) {
241     Token *token = *iter;
242     CHECK(!token->key.empty()) << "empty key string in input";
243     CHECK(!token->value.empty()) << "empty value string in input";
244     reduce_buffer.push_back(token);
245   }
246   std::stable_sort(reduce_buffer.begin(), reduce_buffer.end(),
247                    TokenPtrLessThan());
248 
249   // Step 2.
250   key_info_list->clear();
251   if (reduce_buffer.size() == 0) {
252     return;
253   }
254   KeyInfo last_key_info;
255   last_key_info.key = reduce_buffer.front()->key;
256   for (ReduceBuffer::const_iterator iter = reduce_buffer.begin();
257        iter != reduce_buffer.end(); ++iter) {
258     Token *token = *iter;
259     if (last_key_info.key != token->key) {
260       key_info_list->push_back(last_key_info);
261       last_key_info = KeyInfo();
262       last_key_info.key = token->key;
263     }
264     last_key_info.tokens.push_back(TokenInfo(token));
265     last_key_info.tokens.back().value_type = GetValueType(token);
266   }
267   key_info_list->push_back(last_key_info);
268 }
269 
BuildFrequentPos(const KeyInfoList & key_info_list)270 void SystemDictionaryBuilder::BuildFrequentPos(
271     const KeyInfoList &key_info_list) {
272   // Calculate frequency of each pos
273   // TODO(toshiyuki): It might be better to count frequency
274   // with considering same_as_prev_pos.
275   std::map<uint32, int> pos_map;
276   for (KeyInfoList::const_iterator itr = key_info_list.begin();
277        itr != key_info_list.end(); ++itr) {
278     const KeyInfo &key_info = *itr;
279     for (size_t i = 0; i < key_info.tokens.size(); ++i) {
280       const Token *token = key_info.tokens[i].token;
281       pos_map[GetCombinedPos(token->lid, token->rid)]++;
282     }
283   }
284 
285   // Get histgram of frequency
286   std::map<int, int> freq_map;
287   for (std::map<uint32, int>::const_iterator jt = pos_map.begin();
288        jt != pos_map.end(); ++jt) {
289     freq_map[jt->second]++;
290   }
291   // Compute the lower threshold of frequence
292   int num_freq_pos = 0;
293   int freq_threshold = INT_MAX;
294   for (std::map<int, int>::reverse_iterator kt = freq_map.rbegin();
295        kt != freq_map.rend(); ++kt) {
296     if (num_freq_pos + kt->second > 255) {
297       break;
298     }
299     freq_threshold = kt->first;
300     num_freq_pos += kt->second;
301   }
302 
303   // Collect high frequent pos
304   VLOG(1) << "num_freq_pos" << num_freq_pos;
305   VLOG(1) << "Pos threshold=" << freq_threshold;
306   int freq_pos_idx = 0;
307   int num_tokens = 0;
308   std::map<uint32, int>::iterator lt;
309   for (lt = pos_map.begin(); lt != pos_map.end(); ++lt) {
310     if (lt->second >= freq_threshold) {
311       frequent_pos_[lt->first] = freq_pos_idx;
312       freq_pos_idx++;
313       num_tokens += lt->second;
314     }
315   }
316   CHECK(freq_pos_idx == num_freq_pos)
317       << "inconsistent result to find frequent pos";
318   VLOG(1) << freq_pos_idx << " high frequent Pos has "
319           << num_tokens << " tokens";
320 }
321 
322 
BuildValueTrie(const KeyInfoList & key_info_list)323 void SystemDictionaryBuilder::BuildValueTrie(const KeyInfoList &key_info_list) {
324   for (KeyInfoList::const_iterator itr = key_info_list.begin();
325        itr != key_info_list.end(); ++itr) {
326     const KeyInfo &key_info = *itr;
327     for (size_t i = 0; i < key_info.tokens.size(); ++i) {
328       const TokenInfo &token_info = key_info.tokens[i];
329       if (token_info.value_type == TokenInfo::AS_IS_HIRAGANA ||
330           token_info.value_type == TokenInfo::AS_IS_KATAKANA) {
331         // These values will be stored in token array as flags
332         continue;
333       }
334       string value_str;
335       codec_->EncodeValue(token_info.token->value, &value_str);
336       value_trie_builder_->Add(value_str);
337     }
338   }
339   value_trie_builder_->Build();
340 }
341 
SetIdForValue(KeyInfoList * key_info_list) const342 void SystemDictionaryBuilder::SetIdForValue(KeyInfoList *key_info_list) const {
343   for (KeyInfoList::iterator itr = key_info_list->begin();
344        itr != key_info_list->end(); ++itr) {
345     for (size_t i = 0; i < itr->tokens.size(); ++i) {
346       TokenInfo *token_info = &(itr->tokens[i]);
347       string value_str;
348       codec_->EncodeValue(token_info->token->value, &value_str);
349       token_info->id_in_value_trie =
350           value_trie_builder_->GetId(value_str);
351     }
352   }
353 }
354 
SortTokenInfo(KeyInfoList * key_info_list) const355 void SystemDictionaryBuilder::SortTokenInfo(KeyInfoList *key_info_list) const {
356   for (KeyInfoList::iterator itr = key_info_list->begin();
357        itr != key_info_list->end(); ++itr) {
358     KeyInfo *key_info = &(*itr);
359     std::sort(key_info->tokens.begin(), key_info->tokens.end(),
360               TokenGreaterThan());
361   }
362 }
363 
SetCostType(KeyInfoList * key_info_list) const364 void SystemDictionaryBuilder::SetCostType(KeyInfoList *key_info_list) const {
365   for (KeyInfoList::iterator itr = key_info_list->begin();
366        itr != key_info_list->end(); ++itr) {
367     KeyInfo *key_info = &(*itr);
368     if (HasHomonymsInSamePos(*key_info)) {
369       continue;
370     }
371     for (size_t i = 0; i < key_info->tokens.size(); ++i) {
372       TokenInfo *token_info = &key_info->tokens[i];
373       const int key_len = Util::CharsLen(token_info->token->key);
374       if (key_len >= FLAGS_min_key_length_to_use_small_cost_encoding) {
375         token_info->cost_type = TokenInfo::CAN_USE_SMALL_ENCODING;
376       }
377     }
378   }
379 }
380 
SetPosType(KeyInfoList * key_info_list) const381 void SystemDictionaryBuilder::SetPosType(KeyInfoList *key_info_list) const {
382   for (KeyInfoList::iterator itr = key_info_list->begin();
383        itr != key_info_list->end(); ++itr) {
384     KeyInfo *key_info = &(*itr);
385     for (size_t i = 0; i < key_info->tokens.size(); ++i) {
386       TokenInfo *token_info = &(key_info->tokens[i]);
387       const uint32 pos = GetCombinedPos(token_info->token->lid,
388                                         token_info->token->rid);
389       std::map<uint32, int>::const_iterator itr = frequent_pos_.find(pos);
390       if (itr != frequent_pos_.end()) {
391         token_info->pos_type = TokenInfo::FREQUENT_POS;
392         token_info->id_in_frequent_pos_map = itr->second;
393       }
394       if (i >= 1) {
395         const TokenInfo &prev_token_info = key_info->tokens[i - 1];
396         const uint32 prev_pos = GetCombinedPos(prev_token_info.token->lid,
397                                                prev_token_info.token->rid);
398         if (prev_pos == pos) {
399           // we can overwrite FREQUENT_POS
400           token_info->pos_type = TokenInfo::SAME_AS_PREV_POS;
401         }
402       }
403     }
404   }
405 }
406 
SetValueType(KeyInfoList * key_info_list) const407 void SystemDictionaryBuilder::SetValueType(KeyInfoList *key_info_list) const {
408   for (KeyInfoList::iterator itr = key_info_list->begin();
409        itr != key_info_list->end(); ++itr) {
410     KeyInfo *key_info = &(*itr);
411     for (size_t i = 1; i < key_info->tokens.size(); ++i) {
412       const TokenInfo *prev_token_info = &(key_info->tokens[i - 1]);
413       TokenInfo *token_info = &(key_info->tokens[i]);
414       if (token_info->value_type != TokenInfo::AS_IS_HIRAGANA &&
415           token_info->value_type != TokenInfo::AS_IS_KATAKANA &&
416           (token_info->token->value == prev_token_info->token->value)) {
417         token_info->value_type = TokenInfo::SAME_AS_PREV_VALUE;
418       }
419     }
420   }
421 }
422 
BuildKeyTrie(const KeyInfoList & key_info_list)423 void SystemDictionaryBuilder::BuildKeyTrie(const KeyInfoList &key_info_list) {
424   for (KeyInfoList::const_iterator itr = key_info_list.begin();
425        itr != key_info_list.end(); ++itr) {
426     string key_str;
427     codec_->EncodeKey(itr->key, &key_str);
428     key_trie_builder_->Add(key_str);
429   }
430   key_trie_builder_->Build();
431 }
432 
SetIdForKey(KeyInfoList * key_info_list) const433 void SystemDictionaryBuilder::SetIdForKey(KeyInfoList *key_info_list) const {
434   for (KeyInfoList::iterator itr = key_info_list->begin();
435        itr != key_info_list->end(); ++itr) {
436     KeyInfo *key_info = &(*itr);
437     string key_str;
438     codec_->EncodeKey(key_info->key, &key_str);
439     key_info->id_in_key_trie =
440         key_trie_builder_->GetId(key_str);
441   }
442 }
443 
BuildTokenArray(const KeyInfoList & key_info_list)444 void SystemDictionaryBuilder::BuildTokenArray(
445     const KeyInfoList &key_info_list) {
446   // Here we make a reverse lookup table as follows:
447   //   |key_info_list[X].id_in_key_trie| -> |key_info_list[X]|
448   // assuming |key_info_list[X].id_in_key_trie| is unique and successive.
449   {
450     std::vector<const KeyInfo *> id_to_keyinfo_table;
451     id_to_keyinfo_table.resize(key_info_list.size());
452     for (KeyInfoList::const_iterator itr = key_info_list.begin();
453          itr != key_info_list.end(); ++itr) {
454       const KeyInfo &key_info = *itr;
455       const int id = key_info.id_in_key_trie;
456       id_to_keyinfo_table[id] = &key_info;
457     }
458 
459     for (size_t i = 0; i < id_to_keyinfo_table.size(); ++i) {
460       const KeyInfo &key_info = *id_to_keyinfo_table[i];
461       string tokens_str;
462       codec_->EncodeTokens(key_info.tokens, &tokens_str);
463       token_array_builder_->Add(tokens_str);
464     }
465   }
466 
467   token_array_builder_->Add(string(1, codec_->GetTokensTerminationFlag()));
468   token_array_builder_->Build();
469 }
470 
471 }  // namespace dictionary
472 }  // namespace mozc
473