// Copyright 2010-2018, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The system dictionary maintains the following sections:
//  (1) Key trie
//       Trie containing encoded keys. Returns an id for each lookup.
//       We can recover the key from the id by performing a reverse lookup
//       against the trie.
//  (2) Value trie
//       Trie containing encoded values. Returns an id for each lookup.
//       We can recover the value from the id by performing a reverse lookup
//       against the trie.
//  (3) Token array
//       Array containing encoded tokens. The array index is the id in the
//       key trie.  A token contains the cost, POS, the id in the value trie,
//       etc.
//  (4) Table for highly frequent POS (left/right IDs)
//       Frequently appearing POSs are stored as POS ids in the token info to
//       reduce the binary size.  This table maps such a POS id to the actual
//       left/right ids.
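//
//  Illustrative forward-lookup flow (a sketch inferred from the code below,
//  not an exhaustive description):
//    encoded_key = codec->EncodeKey(key)
//    key_id      = key_trie.ExactSearch(encoded_key)   // (1) Key trie
//    tokens      = token_array.Get(key_id)             // (3) Token array
//  Each decoded token then resolves its value via a reverse lookup of its
//  value id in the value trie (2), and resolves compressed POS ids via the
//  frequent POS table (4).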

#include "dictionary/system/system_dictionary.h"

#include <algorithm>
#include <cstring>
#include <limits>
#include <map>
#include <memory>
#include <queue>
#include <string>
#include <utility>
#include <vector>

#include "base/logging.h"
#include "base/mmap.h"
#include "base/port.h"
#include "base/string_piece.h"
#include "base/util.h"
#include "dictionary/dictionary_token.h"
#include "dictionary/file/codec_factory.h"
#include "dictionary/file/dictionary_file.h"
#include "dictionary/system/codec_interface.h"
#include "dictionary/system/token_decode_iterator.h"
#include "dictionary/system/words_info.h"
#include "storage/louds/bit_vector_based_array.h"
#include "storage/louds/louds_trie.h"

namespace mozc {
namespace dictionary {

using ::mozc::storage::louds::BitVectorBasedArray;
using ::mozc::storage::louds::LoudsTrie;

namespace {

const int kMinTokenArrayBlobSize = 4;

// TODO(noriyukit): The following parameters may not be well optimized.  In our
// experiments, Select1 is a computational burden, so increasing the cache size
// for lb1/select1 may improve performance.
const size_t kKeyTrieLb0CacheSize = 1 * 1024;
const size_t kKeyTrieLb1CacheSize = 1 * 1024;
const size_t kKeyTrieSelect0CacheSize = 4 * 1024;
const size_t kKeyTrieSelect1CacheSize = 4 * 1024;
const size_t kKeyTrieTermvecCacheSize = 1 * 1024;

const size_t kValueTrieLb0CacheSize = 1 * 1024;
const size_t kValueTrieLb1CacheSize = 1 * 1024;
const size_t kValueTrieSelect0CacheSize = 1 * 1024;
const size_t kValueTrieSelect1CacheSize = 16 * 1024;
const size_t kValueTrieTermvecCacheSize = 4 * 1024;

// Expansion table format:
// "<Character to expand>[<Expanded character 1><Expanded character 2>...]"
//
// Only characters that are encoded into one-byte ASCII characters are allowed
// in the table.
//
// Note that this implementation has a potential issue: keys and values may be
// mixed.
// TODO(hidehiko): Clean up this hacky implementation.
const char *kHiraganaExpansionTable[] = {
  "ああぁ",
  "いいぃ",
  "ううぅゔ",
  "ええぇ",
  "おおぉ",
  "かかが",
  "ききぎ",
  "くくぐ",
  "けけげ",
  "ここご",
  "ささざ",
  "ししじ",
  "すすず",
  "せせぜ",
  "そそぞ",
  "たただ",
  "ちちぢ",
  "つつっづ",
  "ててで",
  "ととど",
  "ははばぱ",
  "ひひびぴ",
  "ふふぶぷ",
  "へへべぺ",
  "ほほぼぽ",
  "ややゃ",
  "ゆゆゅ",
  "よよょ",
  "わわゎ",
};

const uint32 kAsciiRange = 0x80;

// Confirm that all the characters are within ASCII range.
bool ContainsAsciiCodeOnly(const string &str) {
  for (string::const_iterator it = str.begin(); it != str.end(); ++it) {
    if (static_cast<unsigned char>(*it) >= kAsciiRange) {
      return false;
    }
  }
  return true;
}

void SetKeyExpansion(char key, const string &expansion,
                     KeyExpansionTable *key_expansion_table) {
  key_expansion_table->Add(key, expansion);
}

void BuildHiraganaExpansionTable(
    const SystemDictionaryCodecInterface &codec,
    KeyExpansionTable *encoded_table) {
  for (size_t index = 0; index < arraysize(kHiraganaExpansionTable); ++index) {
    string encoded;
    codec.EncodeKey(kHiraganaExpansionTable[index], &encoded);
    DCHECK(ContainsAsciiCodeOnly(encoded)) <<
        "Encoded expansion data are supposed to fit within ASCII";

    DCHECK_GT(encoded.size(), 0) << "Expansion data is empty";

    if (encoded.size() == 1) {
      continue;
    } else {
      SetKeyExpansion(encoded[0], encoded.substr(1), encoded_table);
    }
  }
}

inline const uint8 *GetTokenArrayPtr(const BitVectorBasedArray &token_array,
                                     int key_id) {
  size_t length = 0;
  return reinterpret_cast<const uint8*>(token_array.Get(key_id, &length));
}

// Iterator for scanning the token array.
// This iterator does not return the actual token info; it returns only the
// id data and the position.
// It is used only for reverse lookup.  Forward lookup does not need such an
// iterator because it can access a token directly without a linear scan.
//
//  Usage:
//    for (TokenScanIterator iter(codec_, token_array_);
//         !iter.Done(); iter.Next()) {
//      const TokenScanIterator::Result &result = iter.Get();
//      // Do something with |result|.
//    }
class TokenScanIterator {
 public:
  struct Result {
    // Value id for the current token
    int value_id;
    // Index (= key id) for the current token
    int index;
    // Offset from the tokens section beginning.
    // (token_array_->Get(id_in_key_trie) ==
    //  token_array_->Get(0) + tokens_offset)
    int tokens_offset;
  };

  TokenScanIterator(
      const SystemDictionaryCodecInterface *codec,
      const BitVectorBasedArray &token_array)
      : codec_(codec),
        termination_flag_(codec->GetTokensTerminationFlag()),
        state_(HAS_NEXT),
        offset_(0),
        tokens_offset_(0),
        index_(0) {
    encoded_tokens_ptr_ = GetTokenArrayPtr(token_array, 0);
    NextInternal();
  }

  ~TokenScanIterator() {
  }

  const Result &Get() const { return result_; }

  bool Done() const { return state_ == DONE; }

  void Next() {
    DCHECK_NE(state_, DONE);
    NextInternal();
  }

 private:
  enum State {
    HAS_NEXT,
    DONE,
  };

  void NextInternal() {
    if (encoded_tokens_ptr_[offset_] == termination_flag_) {
      state_ = DONE;
      return;
    }
    int read_bytes;
    result_.value_id = -1;
    result_.index = index_;
    result_.tokens_offset = tokens_offset_;
    const bool is_last_token =
        !(codec_->ReadTokenForReverseLookup(encoded_tokens_ptr_ + offset_,
                                            &result_.value_id, &read_bytes));
    if (is_last_token) {
      int tokens_size = offset_ + read_bytes - tokens_offset_;
      if (tokens_size < kMinTokenArrayBlobSize) {
        tokens_size = kMinTokenArrayBlobSize;
      }
      tokens_offset_ += tokens_size;
      ++index_;
      offset_ = tokens_offset_;
    } else {
      offset_ += read_bytes;
    }
  }

  const SystemDictionaryCodecInterface *codec_;
  const uint8 *encoded_tokens_ptr_;
  const uint8 termination_flag_;
  State state_;
  Result result_;
  int offset_;
  int tokens_offset_;
  int index_;

 private:
  DISALLOW_COPY_AND_ASSIGN(TokenScanIterator);
};

struct ReverseLookupResult {
  ReverseLookupResult() : tokens_offset(-1), id_in_key_trie(-1) {}
  // Offset from the tokens section beginning.
  // (token_array_.Get(id_in_key_trie) == token_array_.Get(0) + tokens_offset)
  int tokens_offset;
  // Id in key trie
  int id_in_key_trie;
};

}  // namespace

class SystemDictionary::ReverseLookupCache {
 public:
  ReverseLookupCache() {}

  bool IsAvailable(const std::set<int> &id_set) const {
    for (std::set<int>::const_iterator itr = id_set.begin();
         itr != id_set.end();
         ++itr) {
      if (results.find(*itr) == results.end()) {
        return false;
      }
    }
    return true;
  }

  std::multimap<int, ReverseLookupResult> results;

 private:
  DISALLOW_COPY_AND_ASSIGN(ReverseLookupCache);
};

class SystemDictionary::ReverseLookupIndex {
 public:
  ReverseLookupIndex(
      const SystemDictionaryCodecInterface *codec,
      const BitVectorBasedArray &token_array) {
    // Gets the maximum value id to determine the index size.
    int value_id_max = -1;
    for (TokenScanIterator iter(codec, token_array);
         !iter.Done(); iter.Next()) {
      const TokenScanIterator::Result &result = iter.Get();
      value_id_max = std::max(value_id_max, result.value_id);
    }

    CHECK_GE(value_id_max, 0);
    index_size_ = value_id_max + 1;
    index_.reset(new ReverseLookupResultArray[index_size_]);

    // Gets the result size for each id.
    for (TokenScanIterator iter(codec, token_array);
         !iter.Done(); iter.Next()) {
      const TokenScanIterator::Result &result = iter.Get();
      if (result.value_id != -1) {
        DCHECK_LT(result.value_id, index_size_);
        ++(index_[result.value_id].size);
      }
    }

    for (size_t i = 0; i < index_size_; ++i) {
      index_[i].results.reset(new ReverseLookupResult[index_[i].size]);
    }

    // Builds the index.
    for (TokenScanIterator iter(codec, token_array);
         !iter.Done(); iter.Next()) {
      const TokenScanIterator::Result &result = iter.Get();
      if (result.value_id == -1) {
        continue;
      }

      DCHECK_LT(result.value_id, index_size_);
      ReverseLookupResultArray *result_array = &index_[result.value_id];

      // Finds an uninitialized result slot.
      size_t result_index = 0;
      for (result_index = 0;
           result_index < result_array->size;
           ++result_index) {
        const ReverseLookupResult &lookup_result =
            result_array->results[result_index];
        if (lookup_result.tokens_offset == -1 &&
            lookup_result.id_in_key_trie == -1) {
          result_array->results[result_index].tokens_offset =
              result.tokens_offset;
          result_array->results[result_index].id_in_key_trie = result.index;
          break;
        }
      }
    }

    CHECK(index_ != nullptr);
  }

  ~ReverseLookupIndex() {}

  void FillResultMap(const std::set<int> &id_set,
                     std::multimap<int, ReverseLookupResult> *result_map) {
    for (std::set<int>::const_iterator id_itr = id_set.begin();
         id_itr != id_set.end(); ++id_itr) {
      const ReverseLookupResultArray &result_array = index_[*id_itr];
      for (size_t i = 0; i < result_array.size; ++i) {
        result_map->insert(std::make_pair(*id_itr, result_array.results[i]));
      }
    }
  }

 private:
  struct ReverseLookupResultArray {
    ReverseLookupResultArray() : size(0) {}
    // Use std::unique_ptr to reduce memory consumption as much as possible.
    // Using vector requires 90 MB even when we call resize explicitly.
    // On the other hand, std::unique_ptr requires 57 MB.
    std::unique_ptr<ReverseLookupResult[]> results;
    size_t size;
  };

  // Use a scoped array to reduce memory consumption as much as possible.
  std::unique_ptr<ReverseLookupResultArray[]> index_;
  size_t index_size_;

  DISALLOW_COPY_AND_ASSIGN(ReverseLookupIndex);
};

struct SystemDictionary::PredictiveLookupSearchState {
  PredictiveLookupSearchState() : key_pos(0), is_expanded(false) {}
  PredictiveLookupSearchState(const storage::louds::LoudsTrie::Node &n,
                              size_t pos, bool expanded)
      : node(n), key_pos(pos), is_expanded(expanded) {}

  storage::louds::LoudsTrie::Node node;
  size_t key_pos;
  bool is_expanded;
};

struct SystemDictionary::Builder::Specification {
  enum InputType {
    FILENAME,
    IMAGE,
  };

  Specification(InputType t, const string &fn, const char *p, int l, Options o,
                const SystemDictionaryCodecInterface *codec,
                const DictionaryFileCodecInterface *file_codec)
      : type(t), filename(fn), ptr(p), len(l), options(o), codec(codec),
        file_codec(file_codec) {}

  InputType type;

  // For InputType::FILENAME
  const string filename;

  // For InputType::IMAGE
  const char *ptr;
  const int len;

  Options options;
  const SystemDictionaryCodecInterface *codec;
  const DictionaryFileCodecInterface *file_codec;
};

SystemDictionary::Builder::Builder(const string &filename)
    : spec_(new Specification(Specification::FILENAME,
                              filename, nullptr, -1, NONE, nullptr, nullptr)) {}

SystemDictionary::Builder::Builder(const char *ptr, int len)
    : spec_(new Specification(Specification::IMAGE,
                              "", ptr, len, NONE, nullptr, nullptr)) {}

SystemDictionary::Builder::~Builder() {}

SystemDictionary::Builder &SystemDictionary::Builder::SetOptions(
    Options options) {
  spec_->options = options;
  return *this;
}

SystemDictionary::Builder &SystemDictionary::Builder::SetCodec(
    const SystemDictionaryCodecInterface *codec) {
  spec_->codec = codec;
  return *this;
}

SystemDictionary *SystemDictionary::Builder::Build() {
  if (spec_->codec == nullptr) {
    spec_->codec = SystemDictionaryCodecFactory::GetCodec();
  }

  if (spec_->file_codec == nullptr) {
    spec_->file_codec = DictionaryFileCodecFactory::GetCodec();
  }

  std::unique_ptr<SystemDictionary> instance(
      new SystemDictionary(spec_->codec, spec_->file_codec));

  switch (spec_->type) {
    case Specification::FILENAME:
      if (!instance->dictionary_file_->OpenFromFile(spec_->filename)) {
        LOG(ERROR) << "Failed to open system dictionary file";
        return nullptr;
      }
      break;
    case Specification::IMAGE:
      // Prevent the dictionary from being paged out.
      // We don't check the return value because the process doesn't
      // necessarily have the privilege to mlock.
      // Note that we don't munlock the space because it's always better to
      // keep the singleton system dictionary paged in as long as the process
      // runs.
      Mmap::MaybeMLock(spec_->ptr, spec_->len);
      if (!instance->dictionary_file_->OpenFromImage(spec_->ptr, spec_->len)) {
        LOG(ERROR) << "Failed to open system dictionary image";
        return nullptr;
      }
      break;
    default:
      LOG(ERROR) << "Invalid input type.";
      return nullptr;
  }

  if (!instance->OpenDictionaryFile(
          (spec_->options & ENABLE_REVERSE_LOOKUP_INDEX) != 0)) {
    LOG(ERROR) << "Failed to create system dictionary";
    return nullptr;
  }

  return instance.release();
}

SystemDictionary::SystemDictionary(
    const SystemDictionaryCodecInterface *codec,
    const DictionaryFileCodecInterface *file_codec)
    : frequent_pos_(nullptr),
      codec_(codec),
      dictionary_file_(new DictionaryFile(file_codec)) {}

SystemDictionary::~SystemDictionary() {}

bool SystemDictionary::OpenDictionaryFile(bool enable_reverse_lookup_index) {
  int len;

  const uint8 *key_image = reinterpret_cast<const uint8 *>(
      dictionary_file_->GetSection(codec_->GetSectionNameForKey(), &len));
  if (!key_trie_.Open(key_image,
                      kKeyTrieLb0CacheSize,
                      kKeyTrieLb1CacheSize,
                      kKeyTrieSelect0CacheSize,
                      kKeyTrieSelect1CacheSize,
                      kKeyTrieTermvecCacheSize)) {
    LOG(ERROR) << "cannot open key trie";
    return false;
  }

  BuildHiraganaExpansionTable(*codec_, &hiragana_expansion_table_);

  const uint8 *value_image = reinterpret_cast<const uint8 *>(
      dictionary_file_->GetSection(codec_->GetSectionNameForValue(), &len));
  if (!value_trie_.Open(value_image,
                        kValueTrieLb0CacheSize,
                        kValueTrieLb1CacheSize,
                        kValueTrieSelect0CacheSize,
                        kValueTrieSelect1CacheSize,
                        kValueTrieTermvecCacheSize)) {
    LOG(ERROR) << "can not open value trie";
    return false;
  }

  const unsigned char *token_image = reinterpret_cast<const unsigned char *>(
      dictionary_file_->GetSection(codec_->GetSectionNameForTokens(), &len));
  token_array_.Open(token_image);

  frequent_pos_ = reinterpret_cast<const uint32*>(
      dictionary_file_->GetSection(codec_->GetSectionNameForPos(), &len));
  if (frequent_pos_ == nullptr) {
    LOG(ERROR) << "can not find frequent pos section";
    return false;
  }

  if (enable_reverse_lookup_index) {
    InitReverseLookupIndex();
  }

  return true;
}

void SystemDictionary::InitReverseLookupIndex() {
  if (reverse_lookup_index_ != nullptr) {
    return;
  }
  reverse_lookup_index_.reset(new ReverseLookupIndex(codec_, token_array_));
}

bool SystemDictionary::HasKey(StringPiece key) const {
  string encoded_key;
  codec_->EncodeKey(key, &encoded_key);
  return key_trie_.HasKey(encoded_key);
}

bool SystemDictionary::HasValue(StringPiece value) const {
  string encoded_value;
  codec_->EncodeValue(value, &encoded_value);
  if (value_trie_.HasKey(encoded_value)) {
    return true;
  }

  // Hiragana, Katakana, and alphabet words are not stored in the value_trie,
  // for data compression.  They are stored only in the key_trie with flags,
  // so we also need to check for their existence in the key_trie.

  // Normalize the value as the key.  This process depends on the
  // implementation of SystemDictionaryBuilder::BuildValueTrie.
  string key;
  Util::KatakanaToHiragana(value, &key);

  string encoded_key;
  codec_->EncodeKey(key, &encoded_key);
  const int key_id = key_trie_.ExactSearch(encoded_key);
  if (key_id == -1) {
    return false;
  }

  // We need to check the contents of value_trie for Katakana values.
  // If (key, value) = (かな, カナ) is in the dictionary, "カナ" is
  // not used as a key for value_trie or key_trie.  Only "かな" is
  // used as a key for key_trie.  If we accept this limitation, we can
  // skip the following code.
  //
  // If we add "if (key == value) return true;" here, we can check
  // almost all cases of Hiragana and Alphabet words without the
  // following iteration.  However, when (mozc, MOZC) is stored but
  // (mozc, mozc) is NOT stored, HasValue("mozc") wrongly returns
  // true.

  // Get the block of tokens for this key.
  const uint8 *encoded_tokens_ptr = GetTokenArrayPtr(token_array_, key_id);

  // Check tokens.
  for (TokenDecodeIterator iter(codec_, value_trie_, frequent_pos_, key,
                                encoded_tokens_ptr);
       !iter.Done(); iter.Next()) {
    const Token *token = iter.Get().token;
    if (value == token->value) {
      return true;
    }
  }
  return false;
}

void SystemDictionary::CollectPredictiveNodesInBfsOrder(
    StringPiece encoded_key,
    const KeyExpansionTable &table,
    size_t limit,
    std::vector<PredictiveLookupSearchState> *result) const {
  std::queue<PredictiveLookupSearchState> queue;
  queue.push(PredictiveLookupSearchState(LoudsTrie::Node(), 0, false));
  do {
    PredictiveLookupSearchState state = queue.front();
    queue.pop();

    // Update traversal state for |encoded_key| and its expanded keys.
    if (state.key_pos < encoded_key.size()) {
      const char target_char = encoded_key[state.key_pos];
      const ExpandedKey &chars = table.ExpandKey(target_char);

      for (key_trie_.MoveToFirstChild(&state.node);
           key_trie_.IsValidNode(state.node);
           key_trie_.MoveToNextSibling(&state.node)) {
        const char c = key_trie_.GetEdgeLabelToParentNode(state.node);
        if (!chars.IsHit(c)) {
          continue;
        }
        const bool is_expanded = state.is_expanded || c != target_char;
        queue.push(PredictiveLookupSearchState(state.node,
                                               state.key_pos + 1,
                                               is_expanded));
      }
      continue;
    }

    // Collect prediction keys (state.key_pos >= encoded_key.size()).
    if (key_trie_.IsTerminalNode(state.node)) {
      result->push_back(state);
    }

    // Once we have collected enough entries, collect all the remaining keys
    // that have the same length as the longest key.
    if (result->size() > limit) {
      // The current key is the longest because of the BFS property.
      const size_t max_key_len = state.key_pos;
      while (!queue.empty()) {
        state = queue.front();
        queue.pop();
        if (state.key_pos > max_key_len) {
          // Key lengths in the queue increase monotonically because of the BFS
          // property, so we don't need to check all the elements in the queue.
          break;
        }
        DCHECK_EQ(state.key_pos, max_key_len);
        if (key_trie_.IsTerminalNode(state.node)) {
          result->push_back(state);
        }
      }
      break;
    }

    // Update traversal state for children.
    for (key_trie_.MoveToFirstChild(&state.node);
         key_trie_.IsValidNode(state.node);
         key_trie_.MoveToNextSibling(&state.node)) {
      queue.push(PredictiveLookupSearchState(state.node,
                                             state.key_pos + 1,
                                             state.is_expanded));
    }
  } while (!queue.empty());
}

void SystemDictionary::LookupPredictive(
    StringPiece key,
    const ConversionRequest &conversion_request,
    Callback *callback) const {
  // Do nothing for an empty key, although looking up all the entries with an
  // empty string might seem natural.
  if (key.empty()) {
    return;
  }

  string encoded_key;
  codec_->EncodeKey(key, &encoded_key);
  if (encoded_key.size() > LoudsTrie::kMaxDepth) {
    return;
  }

  const KeyExpansionTable &table =
      conversion_request.IsKanaModifierInsensitiveConversion() ?
      hiragana_expansion_table_ : KeyExpansionTable::GetDefaultInstance();

  // TODO(noriyukit): The lookup limit should be implemented at the caller side
  // by using the callback mechanism.  This hard-coding limits the capability
  // and generality of the dictionary module.
  // CollectPredictiveNodesInBfsOrder() and the following callback loop should
  // be integrated for this purpose.
  const size_t kLookupLimit = 64;
  std::vector<PredictiveLookupSearchState> result;
  result.reserve(kLookupLimit);
  CollectPredictiveNodesInBfsOrder(encoded_key, table, kLookupLimit, &result);

  // Reused buffer and instances inside the following loop.
  char encoded_actual_key_buffer[LoudsTrie::kMaxDepth + 1];
  string decoded_key, actual_key_str;
  decoded_key.reserve(key.size() * 2);
  actual_key_str.reserve(key.size() * 2);
  for (size_t i = 0; i < result.size(); ++i) {
    const PredictiveLookupSearchState &state = result[i];

    // Computes the actual key.  For example:
    // key = "くー"
    // encoded_actual_key = encode("ぐーぐる")  [expanded]
    // encoded_actual_key_prediction_suffix = encode("ぐる")
    const StringPiece encoded_actual_key =
        key_trie_.RestoreKeyString(state.node, encoded_actual_key_buffer);
    const StringPiece encoded_actual_key_prediction_suffix =
        ClippedSubstr(encoded_actual_key, encoded_key.size(),
                      encoded_actual_key.size() - encoded_key.size());

    // decoded_key = "くーぐる" (= key + prediction suffix)
    decoded_key.clear();
    decoded_key.assign(key.data(), key.size());
    codec_->DecodeKey(encoded_actual_key_prediction_suffix, &decoded_key);
    switch (callback->OnKey(decoded_key)) {
      case Callback::TRAVERSE_DONE:
        return;
      case Callback::TRAVERSE_NEXT_KEY:
        continue;
      case DictionaryInterface::Callback::TRAVERSE_CULL:
        LOG(FATAL) << "Culling is not implemented.";
        continue;
      default:
        break;
    }

    StringPiece actual_key;
    if (state.is_expanded) {
      actual_key_str.clear();
      codec_->DecodeKey(encoded_actual_key, &actual_key_str);
      actual_key = actual_key_str;
    } else {
      actual_key = decoded_key;
    }
    switch (callback->OnActualKey(decoded_key, actual_key, state.is_expanded)) {
      case Callback::TRAVERSE_DONE:
        return;
      case Callback::TRAVERSE_NEXT_KEY:
        continue;
      case Callback::TRAVERSE_CULL:
        LOG(FATAL) << "Culling is not implemented.";
        continue;
      default:
        break;
    }

    const int key_id = key_trie_.GetKeyIdOfTerminalNode(state.node);
    for (TokenDecodeIterator iter(codec_, value_trie_,
                                  frequent_pos_, actual_key,
                                  GetTokenArrayPtr(token_array_, key_id));
         !iter.Done(); iter.Next()) {
      const TokenInfo &token_info = iter.Get();
      const Callback::ResultType result =
          callback->OnToken(decoded_key, actual_key, *token_info.token);
      if (result == Callback::TRAVERSE_DONE) {
        return;
      }
      if (result == Callback::TRAVERSE_NEXT_KEY) {
        break;
      }
      DCHECK_NE(Callback::TRAVERSE_CULL, result) << "Not implemented";
    }
  }
}

namespace {

// An implementation of prefix search without key expansion.  Runs |callback|
// on the prefixes of |encoded_key| found in |key_trie|.
// Args:
//   key_trie, value_trie, token_array, codec, frequent_pos:
//     Members of SystemDictionary.
//   key:
//     The head address of the original key before applying the codec.
//   encoded_key:
//     The encoded |key|.
//   callback:
//     A callback function to be called.
//   token_filter:
//     A functor of signature bool(const TokenInfo &).  Only tokens for which
//     this functor returns true are passed to the callback function.
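//
// Usage sketch (this mirrors how LookupPrefix below calls it when key
// expansion is disabled):
//   RunCallbackOnEachPrefix(key_trie_, value_trie_, token_array_, codec_,
//                           frequent_pos_, key.data(), encoded_key, callback,
//                           SelectAllTokens());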
template <typename Func>
void RunCallbackOnEachPrefix(const LoudsTrie &key_trie,
                             const LoudsTrie &value_trie,
                             const BitVectorBasedArray &token_array,
                             const SystemDictionaryCodecInterface *codec,
                             const uint32 *frequent_pos,
                             const char *key,
                             StringPiece encoded_key,
                             DictionaryInterface::Callback *callback,
                             Func token_filter) {
  typedef DictionaryInterface::Callback Callback;
  LoudsTrie::Node node;
  for (StringPiece::size_type i = 0; i < encoded_key.size(); ) {
    if (!key_trie.MoveToChildByLabel(encoded_key[i], &node)) {
      return;
    }
    ++i;  // Increment here for next loop and |encoded_prefix| defined below.
    if (!key_trie.IsTerminalNode(node)) {
      continue;
    }
    const StringPiece encoded_prefix = encoded_key.substr(0, i);
    const StringPiece prefix(key, codec->GetDecodedKeyLength(encoded_prefix));

    switch (callback->OnKey(prefix)) {
      case Callback::TRAVERSE_DONE:
      case Callback::TRAVERSE_CULL:
        return;
      case Callback::TRAVERSE_NEXT_KEY:
        continue;
      default:
        break;
    }

    switch (callback->OnActualKey(prefix, prefix, false)) {
      case Callback::TRAVERSE_DONE:
      case Callback::TRAVERSE_CULL:
        return;
      case Callback::TRAVERSE_NEXT_KEY:
        continue;
      default:
        break;
    }

    const int key_id = key_trie.GetKeyIdOfTerminalNode(node);
    for (TokenDecodeIterator iter(codec, value_trie, frequent_pos, prefix,
                                  GetTokenArrayPtr(token_array, key_id));
         !iter.Done(); iter.Next()) {
      const TokenInfo &token_info = iter.Get();
      if (!token_filter(token_info)) {
        continue;
      }
      const Callback::ResultType res =
          callback->OnToken(prefix, prefix, *token_info.token);
      if (res == Callback::TRAVERSE_DONE || res == Callback::TRAVERSE_CULL) {
        return;
      }
      if (res == Callback::TRAVERSE_NEXT_KEY) {
        break;
      }
    }
  }
}

struct SelectAllTokens {
  bool operator()(const TokenInfo &token_info) const { return true; }
};

class ReverseLookupCallbackWrapper : public DictionaryInterface::Callback {
 public:
  explicit ReverseLookupCallbackWrapper(DictionaryInterface::Callback *callback)
      : callback_(callback) {}
  virtual ~ReverseLookupCallbackWrapper() {}
  virtual SystemDictionary::Callback::ResultType OnToken(
      StringPiece key, StringPiece actual_key, const Token &token) {
    Token modified_token = token;
    modified_token.key.swap(modified_token.value);
    return callback_->OnToken(key, actual_key, modified_token);
  }

  DictionaryInterface::Callback *callback_;
};

}  // namespace

// Recursive implementation of depth-first prefix search with key expansion.
// Input parameters:
//   key:
//     The head address of the original key before applying the codec.
//   encoded_key:
//     The encoded |key|.
//   table:
//     Key expansion table.
//   callback:
//     A callback function to be called.
// Parameters for recursion:
//   node:
//     Stores the current location in |key_trie_|.
//   key_pos:
//     Depth of node, i.e., encoded_key.substr(0, key_pos) is the current
//     search prefix.
//   is_expanded:
//     True if the current node was reached via key expansion.
//   actual_key_buffer:
//     Buffer for storing the characters actually used to reach this node,
//     i.e., StringPiece(actual_key_buffer, key_pos) is the prefix matched
//     using key expansion.
//   actual_prefix:
//     A reused string for the decoded actual key.  This exists only for
//     performance reasons.
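//
// Initial call (a sketch; see LookupPrefix below for the actual invocation):
//   char actual_key_buffer[LoudsTrie::kMaxDepth + 1];
//   string actual_prefix;
//   LookupPrefixWithKeyExpansionImpl(key.data(), encoded_key,
//                                    hiragana_expansion_table_, callback,
//                                    LoudsTrie::Node(), 0, false,
//                                    actual_key_buffer, &actual_prefix);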
DictionaryInterface::Callback::ResultType
SystemDictionary::LookupPrefixWithKeyExpansionImpl(
    const char *key,
    StringPiece encoded_key,
    const KeyExpansionTable &table,
    Callback *callback,
    LoudsTrie::Node node,
    StringPiece::size_type key_pos,
    bool is_expanded,
    char *actual_key_buffer,
    string *actual_prefix) const {
  // This do-block handles a terminal node and its callbacks.  A do-block is
  // used so that we can break out of it and continue to the subsequent
  // traversal phase.
  do {
    if (!key_trie_.IsTerminalNode(node)) {
      break;
    }

    const StringPiece encoded_prefix = encoded_key.substr(0, key_pos);
    const StringPiece prefix(key, codec_->GetDecodedKeyLength(encoded_prefix));
    Callback::ResultType result = callback->OnKey(prefix);
    if (result == Callback::TRAVERSE_DONE ||
        result == Callback::TRAVERSE_CULL) {
      return result;
    }
    if (result == Callback::TRAVERSE_NEXT_KEY) {
      break;  // Go to the traversal phase.
    }

    const StringPiece encoded_actual_prefix(actual_key_buffer, key_pos);
    actual_prefix->clear();
    codec_->DecodeKey(encoded_actual_prefix, actual_prefix);
    result = callback->OnActualKey(prefix, *actual_prefix, is_expanded);
    if (result == Callback::TRAVERSE_DONE ||
        result == Callback::TRAVERSE_CULL) {
      return result;
    }
    if (result == Callback::TRAVERSE_NEXT_KEY) {
      break;  // Go to the traversal phase.
    }

    const int key_id = key_trie_.GetKeyIdOfTerminalNode(node);
    for (TokenDecodeIterator iter(codec_, value_trie_, frequent_pos_,
                                  *actual_prefix,
                                  GetTokenArrayPtr(token_array_, key_id));
         !iter.Done(); iter.Next()) {
      const TokenInfo &token_info = iter.Get();
      result = callback->OnToken(prefix, *actual_prefix, *token_info.token);
      if (result == Callback::TRAVERSE_DONE ||
          result == Callback::TRAVERSE_CULL) {
        return result;
      }
      if (result == Callback::TRAVERSE_NEXT_KEY) {
        break;  // Go to the traversal phase.
      }
    }
  } while (false);

  // Traversal phase.
  if (key_pos == encoded_key.size()) {
    return Callback::TRAVERSE_CONTINUE;
  }
  const char current_char = encoded_key[key_pos];
  const ExpandedKey &chars = table.ExpandKey(current_char);
  for (key_trie_.MoveToFirstChild(&node); key_trie_.IsValidNode(node);
       key_trie_.MoveToNextSibling(&node)) {
    const char c = key_trie_.GetEdgeLabelToParentNode(node);
    if (!chars.IsHit(c)) {
      continue;
    }
    actual_key_buffer[key_pos] = c;
    const Callback::ResultType result = LookupPrefixWithKeyExpansionImpl(
        key, encoded_key, table, callback, node, key_pos + 1,
        is_expanded || c != current_char,
        actual_key_buffer, actual_prefix);
    if (result == Callback::TRAVERSE_DONE) {
      return Callback::TRAVERSE_DONE;
    }
  }

  return Callback::TRAVERSE_CONTINUE;
}

void SystemDictionary::LookupPrefix(
    StringPiece key,
    const ConversionRequest &conversion_request,
    Callback *callback) const {
  string encoded_key;
  codec_->EncodeKey(key, &encoded_key);

  if (!conversion_request.IsKanaModifierInsensitiveConversion()) {
    RunCallbackOnEachPrefix(key_trie_, value_trie_, token_array_, codec_,
                            frequent_pos_, key.data(), encoded_key, callback,
                            SelectAllTokens());
    return;
  }

  char actual_key_buffer[LoudsTrie::kMaxDepth + 1];
  string actual_prefix;
  actual_prefix.reserve(key.size() * 3);
  LookupPrefixWithKeyExpansionImpl(key.data(), encoded_key,
                                   hiragana_expansion_table_, callback,
                                   LoudsTrie::Node(), 0, false,
                                   actual_key_buffer, &actual_prefix);
}

void SystemDictionary::LookupExact(
    StringPiece key,
    const ConversionRequest &conversion_request,
    Callback *callback) const {
  // Find the key in the key trie.
  string encoded_key;
  codec_->EncodeKey(key, &encoded_key);
  const int key_id = key_trie_.ExactSearch(encoded_key);
  if (key_id == -1) {
    return;
  }
  if (callback->OnKey(key) != Callback::TRAVERSE_CONTINUE) {
    return;
  }

  // Callback on each token.
  for (TokenDecodeIterator iter(codec_, value_trie_, frequent_pos_, key,
                                GetTokenArrayPtr(token_array_, key_id));
       !iter.Done(); iter.Next()) {
    if (callback->OnToken(key, key, *iter.Get().token) !=
        Callback::TRAVERSE_CONTINUE) {
      break;
    }
  }
}

void SystemDictionary::LookupReverse(
    StringPiece str,
    const ConversionRequest &conversion_request,
    Callback *callback) const {
  // 1st step: Hiragana/Katakana are not in the value trie
  // 2nd step: Reverse lookup in value trie
  ReverseLookupCallbackWrapper callback_wrapper(callback);
  RegisterReverseLookupTokensForT13N(str, &callback_wrapper);
  RegisterReverseLookupTokensForValue(str, &callback_wrapper);
}

namespace {

class AddKeyIdsToSet {
 public:
  explicit AddKeyIdsToSet(std::set<int> *output) : output_(output) {}

  void operator()(StringPiece key, size_t prefix_len,
                  const LoudsTrie &trie, LoudsTrie::Node node) {
    output_->insert(trie.GetKeyIdOfTerminalNode(node));
  }

 private:
  std::set<int> *output_;
};

inline void AddKeyIdsOfAllPrefixes(const LoudsTrie &trie, StringPiece key,
                                   std::set<int> *key_ids) {
  trie.PrefixSearch(key, AddKeyIdsToSet(key_ids));
}

}  // namespace

void SystemDictionary::PopulateReverseLookupCache(StringPiece str) const {
  if (reverse_lookup_index_ != nullptr) {
    // We don't need to prepare a cache for the current reverse conversion,
    // as we have already built the index for reverse lookup.
    return;
  }
  reverse_lookup_cache_.reset(new ReverseLookupCache);
  DCHECK(reverse_lookup_cache_.get());

  // Iterate over each suffix and collect the IDs of all substrings.
  std::set<int> id_set;
  int pos = 0;
  string lookup_key;
  lookup_key.reserve(str.size());
  while (pos < str.size()) {
    const StringPiece suffix = ClippedSubstr(str, pos);
    lookup_key.clear();
    codec_->EncodeValue(suffix, &lookup_key);
    AddKeyIdsOfAllPrefixes(value_trie_, lookup_key, &id_set);
    pos += Util::OneCharLen(suffix.data());
  }
  // Collect tokens for all IDs.
  ScanTokens(id_set, reverse_lookup_cache_.get());
}

void SystemDictionary::ClearReverseLookupCache() const {
  reverse_lookup_cache_.reset();
}

namespace {

class FilterTokenForRegisterReverseLookupTokensForT13N {
 public:
  FilterTokenForRegisterReverseLookupTokensForT13N() {
    tmp_str_.reserve(LoudsTrie::kMaxDepth * 3);
  }

  bool operator()(const TokenInfo &token_info) {
    // Skip spelling corrections.
    if (token_info.token->attributes & Token::SPELLING_CORRECTION) {
      return false;
    }
    if (token_info.value_type != TokenInfo::AS_IS_HIRAGANA &&
        token_info.value_type != TokenInfo::AS_IS_KATAKANA) {
      // SAME_AS_PREV_VALUE may be a t13n token.
      tmp_str_.clear();
      Util::KatakanaToHiragana(token_info.token->value, &tmp_str_);
      if (token_info.token->key != tmp_str_) {
        return false;
      }
    }
    return true;
  }

 private:
  string tmp_str_;
};

}  // namespace

void SystemDictionary::RegisterReverseLookupTokensForT13N(
    StringPiece value, Callback *callback) const {
  string hiragana_value, encoded_key;
  Util::KatakanaToHiragana(value, &hiragana_value);
  codec_->EncodeKey(hiragana_value, &encoded_key);
  RunCallbackOnEachPrefix(key_trie_, value_trie_, token_array_, codec_,
                          frequent_pos_, hiragana_value.data(),
                          encoded_key, callback,
                          FilterTokenForRegisterReverseLookupTokensForT13N());
}

void SystemDictionary::RegisterReverseLookupTokensForValue(
    StringPiece value, Callback *callback) const {
  string lookup_key;
  codec_->EncodeValue(value, &lookup_key);

  std::set<int> id_set;
  AddKeyIdsOfAllPrefixes(value_trie_, lookup_key, &id_set);

  ReverseLookupCache *results = nullptr;
  ReverseLookupCache non_cached_results;
  if (reverse_lookup_index_ != nullptr) {
    reverse_lookup_index_->FillResultMap(id_set, &non_cached_results.results);
    results = &non_cached_results;
  } else if (reverse_lookup_cache_ != nullptr &&
             reverse_lookup_cache_->IsAvailable(id_set)) {
    results = reverse_lookup_cache_.get();
  } else {
    // Cache is not available. Get token for each ID.
    ScanTokens(id_set, &non_cached_results);
    results = &non_cached_results;
  }
  DCHECK(results != nullptr);

  RegisterReverseLookupResults(id_set, *results, callback);
}

void SystemDictionary::ScanTokens(
    const std::set<int> &id_set, ReverseLookupCache *cache) const {
  for (TokenScanIterator iter(codec_, token_array_);
       !iter.Done(); iter.Next()) {
    const TokenScanIterator::Result &result = iter.Get();
    if (result.value_id != -1 &&
        id_set.find(result.value_id) != id_set.end()) {
      ReverseLookupResult lookup_result;
      lookup_result.tokens_offset = result.tokens_offset;
      lookup_result.id_in_key_trie = result.index;
      cache->results.insert(std::make_pair(result.value_id, lookup_result));
    }
  }
}

void SystemDictionary::RegisterReverseLookupResults(
    const std::set<int> &id_set,
    const ReverseLookupCache &cache,
    Callback *callback) const {
  const uint8 *encoded_tokens_ptr = GetTokenArrayPtr(token_array_, 0);
  char buffer[LoudsTrie::kMaxDepth + 1];
  for (std::set<int>::const_iterator set_itr = id_set.begin();
       set_itr != id_set.end();
       ++set_itr) {
    const int value_id = *set_itr;
    typedef std::multimap<int, ReverseLookupResult>::const_iterator ResultItr;
    std::pair<ResultItr, ResultItr> range = cache.results.equal_range(*set_itr);
    for (ResultItr result_itr = range.first;
         result_itr != range.second;
         ++result_itr) {
      const ReverseLookupResult &reverse_result = result_itr->second;

      const StringPiece encoded_key =
          key_trie_.RestoreKeyString(reverse_result.id_in_key_trie, buffer);
      string tokens_key;
      codec_->DecodeKey(encoded_key, &tokens_key);
      if (callback->OnKey(tokens_key) != Callback::TRAVERSE_CONTINUE) {
        continue;
      }
      for (TokenDecodeIterator iter(
               codec_, value_trie_, frequent_pos_, tokens_key,
               encoded_tokens_ptr + reverse_result.tokens_offset);
           !iter.Done(); iter.Next()) {
        const TokenInfo &token_info = iter.Get();
        if (token_info.token->attributes & Token::SPELLING_CORRECTION ||
            token_info.id_in_value_trie != value_id) {
          continue;
        }
        callback->OnToken(tokens_key, tokens_key, *token_info.token);
      }
    }
  }
}

}  // namespace dictionary
}  // namespace mozc