1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "prediction/user_history_predictor.h"
31 
32 #include <algorithm>
33 #include <cctype>
34 #include <climits>
35 #include <memory>
36 #include <string>
37 
38 #include "base/clock.h"
39 #include "base/config_file_stream.h"
40 #include "base/flags.h"
41 #include "base/hash.h"
42 #include "base/logging.h"
43 #include "base/mozc_hash_set.h"
44 #include "base/thread.h"
45 #include "base/trie.h"
46 #include "base/util.h"
47 #include "composer/composer.h"
48 #include "converter/segments.h"
49 #include "dictionary/dictionary_interface.h"
50 #include "dictionary/pos_matcher.h"
51 #include "dictionary/suppression_dictionary.h"
52 #include "prediction/predictor_interface.h"
53 #include "prediction/user_history_predictor.pb.h"
54 #include "protocol/commands.pb.h"
55 #include "protocol/config.pb.h"
56 #include "request/conversion_request.h"
57 #include "rewriter/variants_rewriter.h"
58 #include "storage/encrypted_string_storage.h"
59 #include "storage/lru_cache.h"
60 #include "usage_stats/usage_stats.h"
61 
62 // This flag is set by predictor.cc
63 // We can remove this after the ambiguity expansion feature get stable.
64 DEFINE_bool(enable_expansion_for_user_history_predictor,
65             false,
66             "enable ambiguity expansion for user_history_predictor.");
67 
68 namespace mozc {
69 namespace {
70 
71 using std::unique_ptr;
72 
73 using commands::Request;
74 using dictionary::SuppressionDictionary;
75 using dictionary::DictionaryInterface;
76 using dictionary::POSMatcher;
77 using usage_stats::UsageStats;
78 
79 // Finds suggestion candidates from the most recent 3000 history in LRU.
80 // We don't check all history, since suggestion is called every key event
81 const size_t kMaxSuggestionTrial = 3000;
82 
83 // Finds suffix matches of history_segments from the most recent 500 histories
84 // in LRU.
85 const size_t kMaxPrevValueTrial = 500;
86 
87 // Cache size
88 // Typically memory/storage footprint becomes kLRUCacheSize * 70 bytes.
89 #ifdef OS_ANDROID
90 const size_t kLRUCacheSize = 2000;
91 #else  // OS_ANDROID
92 const size_t kLRUCacheSize = 10000;
93 #endif  // OS_ANDROID
94 
95 // Don't save key/value that are
96 // longer than kMaxCandidateSize to avoid memory explosion
97 const size_t kMaxStringLength = 256;
98 
99 // Maximum size of next_entries
100 const size_t kMaxNextEntriesSize = 4;
101 
102 // Revert id for user_history_predictor
103 const uint16 kRevertId = 1;
104 
105 // Default object pool size for EntryPriorityQueue
106 const size_t kEntryPoolSize = 16;
107 
108 // File name for the history
109 #ifdef OS_WIN
110 const char kFileName[] = "user://history.db";
111 #else
112 const char kFileName[] = "user://.history.db";
113 #endif
114 
115 // Uses '\t' as a key/value delimiter
116 const char kDelimiter[] = "\t";
117 const char kEmojiDescription[] = "絵文字";
118 
119 // TODO(peria, hidehiko): Unify this checker and IsEmojiCandidate in
120 //     EmojiRewriter.  If you make similar functions before the merging in
121 //     case, put a similar note to avoid twisted dependency.
IsEmojiEntry(const UserHistoryPredictor::Entry & entry)122 bool IsEmojiEntry(const UserHistoryPredictor::Entry &entry) {
123   return (entry.has_description() &&
124           entry.description().find(kEmojiDescription) != string::npos);
125 }
126 
IsPunctuation(const string & value)127 bool IsPunctuation(const string &value) {
128   return (value == "。" || value == "." ||
129           value == "、" || value == "," ||
130           value == "?" || value == "?" ||
131           value == "!" || value == "!" ||
132           value == "," || value == ".");
133 }
134 
IsSentenceLikeCandidate(const Segment::Candidate & candidate)135 bool IsSentenceLikeCandidate(const Segment::Candidate &candidate) {
136   // A sentence should have a long reading.  Length check is done using key to
137   // absorb length difference in value variation, e.g.,
138   // "〜ください" and "〜下さい".
139   if (candidate.value.empty() || Util::CharsLen(candidate.key) < 8) {
140     return false;
141   }
142   // Our primary target sentence ends with Hiragana, e.g., "〜ます".
143   const ConstChar32ReverseIterator iter(candidate.value);
144   bool ret = Util::GetScriptType(iter.Get()) == Util::HIRAGANA;
145   return ret;
146 }
147 
148 // Returns romanaized string.
ToRoman(const string & str)149 string ToRoman(const string &str) {
150   string result;
151   Util::HiraganaToRomanji(str, &result);
152   return result;
153 }
154 
155 // Returns true if value looks like a content word.
156 // Currently, just checks the script type.
IsContentWord(const string & value)157 bool IsContentWord(const string &value) {
158   return Util::CharsLen(value) > 1 ||
159       Util::GetScriptType(value) != Util::UNKNOWN_SCRIPT;
160 }
161 
162 // Returns candidate description.
163 // If candidate is spelling correction, typing correction
164 // or auto partial suggestion,
165 // don't use the description, since "did you mean" like description must be
166 // provided at an appropriate timing and context.
GetDescription(const Segment::Candidate & candidate)167 string GetDescription(const Segment::Candidate &candidate) {
168   if (candidate.attributes &
169       (Segment::Candidate::SPELLING_CORRECTION |
170        Segment::Candidate::TYPING_CORRECTION |
171        Segment::Candidate::AUTO_PARTIAL_SUGGESTION)) {
172     return "";
173   }
174   return candidate.description;
175 }
176 
177 }  // namespace
178 
179 // Returns true if the input first candidate seems to be a privacy sensitive
180 // such like password.
IsPrivacySensitive(const Segments * segments) const181 bool UserHistoryPredictor::IsPrivacySensitive(const Segments *segments) const {
182   const bool kNonSensitive = false;
183   const bool kSensitive = true;
184 
185   // Skips privacy sensitive check if |segments| consists of multiple conversion
186   // segment. That is, segments like "パスワードは|x7LAGhaR" where '|'
187   // represents segment boundary is not considered to be privacy sensitive.
188   // TODO(team): Revisit this rule if necessary.
189   if (segments->conversion_segments_size() != 1) {
190     return kNonSensitive;
191   }
192 
193   // Hereafter, we must have only one conversion segment.
194   const Segment &conversion_segment = segments->conversion_segment(0);
195   const string &segment_key = conversion_segment.key();
196 
197   // The top candidate, which is about to be committed.
198   const Segment::Candidate &candidate = conversion_segment.candidate(0);
199   const string &candidate_value = candidate.value;
200 
201   // If |candidate_value| contains any non-ASCII character, do not treat
202   // it as privacy sensitive information.
203   // TODO(team): Improve the following rule. For example,
204   //     "0000-0000-0000-0000" is not treated as privacy sensitive
205   //     because of this rule. When a user commits his password in
206   //     full-width form by mistake, like "x7LAGhaR", it is not
207   //     treated as privacy sensitive too.
208   if (Util::GetCharacterSet(candidate_value) != Util::ASCII) {
209     return kNonSensitive;
210   }
211 
212   // Hereafter, |candidate_value| consists of ASCII characters only.
213 
214   // Note: if the key looks like hiragana, the candidate might be Katakana to
215   // English transliteration. Don't suppress transliterated candidates.
216   // http://b/4394325
217 
218   // If the key consists of number characters only, treat it as privacy
219   // sensitive.
220   if (Util::GetScriptType(segment_key) == Util::NUMBER) {
221     return kSensitive;
222   }
223 
224   // If the key contains any alphabetical character but it is in our dictionary,
225   // it can be treated as privacy nonsensitive word; cf. b/5995529. Besides,
226   // short words would be considered as privacy nonsensitive word as well.
227   if (segment_key.size() <= 3) {
228     return kNonSensitive;
229   }
230 
231   // Dictionary-based sensitivity test. If the word user typed is in dictionary,
232   // treat it as privacy insensitive. For English (ASCII) words,
233   // dictionary-based test is extended to the following forms:
234   //   1) All lower case (e.g., hello)
235   //   2) All upper case (e.g., HELLO)
236   //   3) Capitalized (e.g., Hello)
237   //   4) As-is (e.g., HeLlO)
238   // Since English words are stored in lower case, in case of upper case and
239   // capitalized keys, we convert it to lower case in advance.
240   if (Util::IsUpperOrCapitalizedAscii(candidate_value)) {
241     // Look up for keys that are all in upper case or capitalized ASCII.
242     string lower_case_value(candidate_value);
243     Util::LowerString(&lower_case_value);
244     if (dictionary_->HasValue(lower_case_value)) {
245       return kNonSensitive;
246     }
247   } else {
248     // Looks up for the original key, including those that are all in lower case
249     // ASCII.
250     if (dictionary_->HasValue(candidate_value)) {
251       return kNonSensitive;
252     }
253   }
254   // If the key contains any alphabetical character and is not in our
255   // dictionary, treat it as privacy sensitive. There also remains some cases to
256   // be considered. Compare following two cases.
257   //   Case A:
258   //     1. Type "ywwz1sxm" in Roman-input style then get "yっwz1sxm".
259   //     2. Hit F10 key to convert it to "ywwz1sxm" by
260   //        ConvertToHalfAlphanumeric command.
261   //     3. Commit it.
262   //     In this case, |segment_key| is "yっwz1sxm" and actually contains
263   //     alphabetical characters. So kSensitive will be returned.
264   //     So far so good.
265   //   Case B:
266   //     1. type "ia1bo3xu" in Roman-input style then get "いあ1ぼ3ぅ".
267   //     2. hit F10 key to convert it to "ia1bo3xu" by
268   //        ConvertToHalfAlphanumeric command.
269   //     3. commit it.
270   //     In this case, |segment_key| is "ia1bo3xu" and contains no
271   //     alphabetical character. So the following check does nothing.
272   // TODO(team): Improve the following rule so that our user experience
273   //     can be consistent between case A and B.
274   if (Util::ContainsScriptType(segment_key, Util::ALPHABET)) {
275     return kSensitive;
276   }
277 
278   return kNonSensitive;
279 }
280 
UserHistoryStorage(const string & filename)281 UserHistoryStorage::UserHistoryStorage(const string &filename)
282     : storage_(new storage::EncryptedStringStorage(filename)) {
283 }
284 
~UserHistoryStorage()285 UserHistoryStorage::~UserHistoryStorage() {}
286 
Load()287 bool UserHistoryStorage::Load() {
288   string input;
289   if (!storage_->Load(&input)) {
290     LOG(ERROR) << "Can't load user history data.";
291     return false;
292   }
293 
294   if (!user_history_base.ParseFromString(input)) {
295     LOG(ERROR) << "ParseFromString failed. message looks broken";
296     return false;
297   }
298 
299   VLOG(1) << "Loaded user histroy, size=" << user_history_base.entries_size();
300   return true;
301 }
302 
Save() const303 bool UserHistoryStorage::Save() const {
304   if (user_history_base.entries_size() == 0) {
305     LOG(WARNING) << "etries size is 0. Not saved";
306     return false;
307   }
308 
309   string output;
310   if (!user_history_base.AppendToString(&output)) {
311     LOG(ERROR) << "AppendToString failed";
312     return false;
313   }
314 
315   if (!storage_->Save(output)) {
316     LOG(ERROR) << "Can't save user history data.";
317     return false;
318   }
319 
320   return true;
321 }
322 
EntryPriorityQueue()323 UserHistoryPredictor::EntryPriorityQueue::EntryPriorityQueue()
324     : pool_(kEntryPoolSize) {}
325 
~EntryPriorityQueue()326 UserHistoryPredictor::EntryPriorityQueue::~EntryPriorityQueue() {}
327 
Push(Entry * entry)328 bool UserHistoryPredictor::EntryPriorityQueue::Push(Entry *entry) {
329   DCHECK(entry);
330   if (!seen_.insert(Hash::Fingerprint32(entry->value())).second) {
331     VLOG(2) << "found dups";
332     return false;
333   }
334   const uint32 score = UserHistoryPredictor::GetScore(*entry);
335   agenda_.push(std::make_pair(score, entry));
336   return true;
337 }
338 
339 UserHistoryPredictor::Entry *
Pop()340 UserHistoryPredictor::EntryPriorityQueue::Pop() {
341   if (agenda_.empty()) {
342     return nullptr;
343   }
344   const QueueElement &element = agenda_.top();
345   Entry *result = element.second;
346   DCHECK(result);
347   agenda_.pop();
348   return result;
349 }
350 
351 UserHistoryPredictor::Entry *
NewEntry()352 UserHistoryPredictor::EntryPriorityQueue::NewEntry() {
353   return pool_.Alloc();
354 }
355 
356 class UserHistoryPredictorSyncer : public Thread {
357  public:
358   enum RequestType {
359     LOAD,
360     SAVE
361   };
362 
UserHistoryPredictorSyncer(UserHistoryPredictor * predictor,RequestType type)363   UserHistoryPredictorSyncer(UserHistoryPredictor *predictor,
364                              RequestType type)
365       : predictor_(predictor), type_(type) {
366     DCHECK(predictor_);
367   }
368 
Run()369   virtual void Run() {
370     switch (type_) {
371       case LOAD:
372         VLOG(1) << "Executing Reload method";
373         predictor_->Load();
374         break;
375       case SAVE:
376         VLOG(1) << "Executing Sync method";
377         predictor_->Save();
378         break;
379       default:
380         LOG(ERROR) << "Unknown request: " << static_cast<int>(type_);
381     }
382   }
383 
~UserHistoryPredictorSyncer()384   virtual ~UserHistoryPredictorSyncer() {
385     Join();
386   }
387 
388   UserHistoryPredictor *predictor_;
389   RequestType type_;
390 };
391 
UserHistoryPredictor(const DictionaryInterface * dictionary,const POSMatcher * pos_matcher,const SuppressionDictionary * suppression_dictionary,bool enable_content_word_learning)392 UserHistoryPredictor::UserHistoryPredictor(
393     const DictionaryInterface *dictionary,
394     const POSMatcher *pos_matcher,
395     const SuppressionDictionary *suppression_dictionary,
396     bool enable_content_word_learning)
397     : dictionary_(dictionary),
398       pos_matcher_(pos_matcher),
399       suppression_dictionary_(suppression_dictionary),
400       predictor_name_("UserHistoryPredictor"),
401       content_word_learning_enabled_(enable_content_word_learning),
402       updated_(false),
403       dic_(new DicCache(UserHistoryPredictor::cache_size())) {
404   AsyncLoad();  // non-blocking
405   // Load()  blocking version can be used if any
406 }
407 
~UserHistoryPredictor()408 UserHistoryPredictor::~UserHistoryPredictor() {
409   // In destructor, must call blocking version
410   WaitForSyncer();
411   Save();   // blocking
412 }
413 
GetUserHistoryFileName()414 string UserHistoryPredictor::GetUserHistoryFileName() {
415   return ConfigFileStream::GetFileName(kFileName);
416 }
417 
418 // Returns revert id
419 // static
revert_id()420 uint16 UserHistoryPredictor::revert_id() {
421   return kRevertId;
422 }
423 
WaitForSyncer()424 void UserHistoryPredictor::WaitForSyncer() {
425   if (syncer_.get() != nullptr) {
426     syncer_->Join();
427     syncer_.reset();
428   }
429 }
430 
Wait()431 bool UserHistoryPredictor::Wait() {
432   WaitForSyncer();
433   return true;
434 }
435 
CheckSyncerAndDelete() const436 bool UserHistoryPredictor::CheckSyncerAndDelete() const {
437   if (syncer_.get() != nullptr) {
438     if (syncer_->IsRunning()) {
439       return false;
440     } else {
441       syncer_.reset();
442     }
443   }
444 
445   return true;
446 }
447 
Sync()448 bool UserHistoryPredictor::Sync() {
449   return AsyncSave();
450   // return Save();   blocking version
451 }
452 
Reload()453 bool UserHistoryPredictor::Reload() {
454   WaitForSyncer();
455   return AsyncLoad();
456 }
457 
AsyncLoad()458 bool UserHistoryPredictor::AsyncLoad() {
459   if (!CheckSyncerAndDelete()) {  // now loading/saving
460     return true;
461   }
462 
463   syncer_.reset(new UserHistoryPredictorSyncer(
464       this,
465       UserHistoryPredictorSyncer::LOAD));
466   syncer_->Start("UserHistoryPredictor:Load");
467 
468   return true;
469 }
470 
AsyncSave()471 bool UserHistoryPredictor::AsyncSave() {
472   if (!updated_) {
473     return true;
474   }
475 
476   if (!CheckSyncerAndDelete()) {  // now loading/saving
477     return true;
478   }
479 
480   syncer_.reset(new UserHistoryPredictorSyncer(
481       this,
482       UserHistoryPredictorSyncer::SAVE));
483   syncer_->Start("UserHistoryPredictor:Save");
484 
485   return true;
486 }
487 
Load()488 bool UserHistoryPredictor::Load() {
489   const string filename = GetUserHistoryFileName();
490 
491   UserHistoryStorage history(filename);
492   if (!history.Load()) {
493     LOG(ERROR) << "UserHistoryStorage::Load() failed";
494     return false;
495   }
496 
497   for (size_t i = 0; i < history.user_history_base.entries_size(); ++i) {
498     dic_->Insert(EntryFingerprint(history.user_history_base.entries(i)),
499                  history.user_history_base.entries(i));
500   }
501 
502   VLOG(1) << "Loaded user histroy, size=" << history.user_history_base.entries_size();
503 
504   return true;
505 }
506 
Save()507 bool UserHistoryPredictor::Save() {
508   if (!updated_) {
509     return true;
510   }
511 
512   // Do not check incognito_mode or use_history_suggest in Config here.
513   // The input data should not have been inserted when those flags are on.
514 
515   const DicElement *tail = dic_->Tail();
516   if (tail == nullptr) {
517     return true;
518   }
519 
520   const string filename = GetUserHistoryFileName();
521 
522   UserHistoryStorage history(filename);
523   for (const DicElement *elm = tail; elm != nullptr; elm = elm->prev) {
524     history.user_history_base.add_entries()->CopyFrom(elm->value);
525   }
526 
527   // Updates usage stats here.
528   UsageStats::SetInteger(
529       "UserHistoryPredictorEntrySize",
530       static_cast<int>(history.user_history_base.entries_size()));
531 
532   if (!history.Save()) {
533     LOG(ERROR) << "UserHistoryStorage::Save() failed";
534     return false;
535   }
536 
537   updated_ = false;
538 
539   return true;
540 }
541 
ClearAllHistory()542 bool UserHistoryPredictor::ClearAllHistory() {
543   // Waits until syncer finishes
544   WaitForSyncer();
545 
546   VLOG(1) << "Clearing user prediction";
547   // Renews DicCache as LRUCache tries to reuse the internal value by
548   // using FreeList
549   dic_.reset(new DicCache(UserHistoryPredictor::cache_size()));
550 
551   // insert a dummy event entry.
552   InsertEvent(Entry::CLEAN_ALL_EVENT);
553 
554   updated_ = true;
555 
556   Sync();
557 
558   return true;
559 }
560 
ClearUnusedHistory()561 bool UserHistoryPredictor::ClearUnusedHistory() {
562   // Waits until syncer finishes
563   WaitForSyncer();
564 
565   VLOG(1) << "Clearing unused prediction";
566   const DicElement *head = dic_->Head();
567   if (head == nullptr) {
568     VLOG(2) << "dic head is nullptr";
569     return false;
570   }
571 
572   std::vector<uint32> keys;
573   for (const DicElement *elm = head; elm != nullptr; elm = elm->next) {
574     VLOG(3) << elm->key << " " << elm->value.suggestion_freq();
575     if (elm->value.suggestion_freq() == 0) {
576       keys.push_back(elm->key);
577     }
578   }
579 
580   for (size_t i = 0; i < keys.size(); ++i) {
581     VLOG(2) << "Removing: " << keys[i];
582     if (!dic_->Erase(keys[i])) {
583       LOG(ERROR) << "cannot erase " << keys[i];
584     }
585   }
586 
587   // Inserts a dummy event entry.
588   InsertEvent(Entry::CLEAN_UNUSED_EVENT);
589 
590   updated_ = true;
591 
592   Sync();
593 
594   VLOG(1) << keys.size() << " removed";
595 
596   return true;
597 }
598 
599 // Erases all the next_entries whose entry_fp field equals |fp|.
EraseNextEntries(uint32 fp,Entry * entry)600 void UserHistoryPredictor::EraseNextEntries(uint32 fp, Entry *entry) {
601   const size_t orig_size = entry->next_entries_size();
602   size_t new_size = orig_size;
603   for (size_t pos = 0; pos < new_size; ) {
604     if (entry->next_entries(pos).entry_fp() == fp) {
605       entry->mutable_next_entries()->SwapElements(pos, --new_size);
606     } else {
607       ++pos;
608     }
609   }
610   for (size_t i = 0; i < orig_size - new_size; ++i) {
611     entry->mutable_next_entries()->RemoveLast();
612   }
613 }
614 
615 // Recursively finds the Ngram history that produces |target_key| and
616 // |target_value| and removes the last link. For example, if there exists a
617 // chain like
618 //    ("aaa", "AAA") -- ("bbb", "BBB") -- ("ccc", "CCC"),
619 // and if target_key == "aaabbbccc" and target_value == "AAABBBCCC", the link
620 // from ("bbb", "BBB") to ("ccc", "CCC") is removed. If a link was removed, this
621 // method returns DONE. If no history entries can produce the target key
622 // value, then NOT_FOUND is returned. TAIL is returned only when the
623 // tail was found, e.g., in the above example, when the method finds the tail
624 // node ("ccc", "CCC").
625 UserHistoryPredictor::RemoveNgramChainResult
RemoveNgramChain(const string & target_key,const string & target_value,Entry * entry,std::vector<StringPiece> * key_ngrams,size_t key_ngrams_len,std::vector<StringPiece> * value_ngrams,size_t value_ngrams_len)626 UserHistoryPredictor::RemoveNgramChain(const string &target_key,
627                                        const string &target_value,
628                                        Entry *entry,
629                                        std::vector<StringPiece> *key_ngrams,
630                                        size_t key_ngrams_len,
631                                        std::vector<StringPiece> *value_ngrams,
632                                        size_t value_ngrams_len) {
633   DCHECK(entry);
634   DCHECK(key_ngrams);
635   DCHECK(value_ngrams);
636 
637   // Updates the lengths with the current entry node.
638   key_ngrams_len += entry->key().size();
639   value_ngrams_len += entry->value().size();
640 
641   // This is the case where ngram key and value are shorter than the target key
642   // and value, respectively. In this case, we need to find further entries to
643   // concatenate in order to make |target_key| and |target_value|.
644   if (key_ngrams_len < target_key.size() &&
645       value_ngrams_len < target_value.size()) {
646     key_ngrams->push_back(entry->key());
647     value_ngrams->push_back(entry->value());
648     for (size_t i = 0; i < entry->next_entries().size(); ++i) {
649       const uint32 fp = entry->next_entries(i).entry_fp();
650       Entry *e = dic_->MutableLookupWithoutInsert(fp);
651       if (e == nullptr) {
652         continue;
653       }
654       const RemoveNgramChainResult r = RemoveNgramChain(target_key,
655                                                         target_value,
656                                                         e,
657                                                         key_ngrams,
658                                                         key_ngrams_len,
659                                                         value_ngrams,
660                                                         value_ngrams_len);
661       switch (r) {
662         case DONE:
663           return DONE;
664         case TAIL:
665           // |entry| is the second-to-the-last node. So cut the link to the
666           // child entry.
667           EraseNextEntries(fp, entry);
668           return DONE;
669         default:
670           break;
671       }
672     }
673     // Recovers the state.
674     key_ngrams->pop_back();
675     value_ngrams->pop_back();
676     return NOT_FOUND;
677   }
678 
679   // This is the case where the current ngram key and value have the same
680   // lengths as those of |target_key| and |target_value|, respectively.
681   if (key_ngrams_len == target_key.size() &&
682       value_ngrams_len == target_value.size()) {
683     key_ngrams->push_back(entry->key());
684     value_ngrams->push_back(entry->value());
685     string ngram_key, ngram_value;
686     Util::JoinStringPieces(*key_ngrams, "", &ngram_key);
687     Util::JoinStringPieces(*value_ngrams, "", &ngram_value);
688     if (ngram_key == target_key && ngram_value == target_value) {
689       // |entry| is the last node. So return TAIL to tell the caller so
690       // that it can remove the link to this last node.
691       return TAIL;
692     }
693     key_ngrams->pop_back();
694     value_ngrams->pop_back();
695     return NOT_FOUND;
696   }
697 
698   return NOT_FOUND;
699 }
700 
ClearHistoryEntry(const string & key,const string & value)701 bool UserHistoryPredictor::ClearHistoryEntry(const string &key,
702                                              const string &value) {
703   bool deleted = false;
704   {
705     // Finds the history entry that has the exactly same key and value and has
706     // not been removed yet. If exists, remove it.
707     Entry *entry = dic_->MutableLookupWithoutInsert(Fingerprint(key, value));
708     if (entry != nullptr && !entry->removed()) {
709       entry->set_suggestion_freq(0);
710       entry->set_conversion_freq(0);
711       entry->set_removed(true);
712       // We don't clear entry->next_entries() so that we can generate prediction
713       // by chaining.
714       deleted = true;
715     }
716   }
717   {
718     // Finds a chain of history entries that produces key and value. If exists,
719     // remove the link so that N-gram history prediction never generates this
720     // key value pair..
721     for (DicElement *elm = dic_->MutableHead(); elm != nullptr;
722          elm = elm->next) {
723       Entry *entry = &elm->value;
724       if (!Util::StartsWith(key, entry->key()) ||
725           !Util::StartsWith(value, entry->value())) {
726         continue;
727       }
728       std::vector<StringPiece> key_ngrams, value_ngrams;
729       if (RemoveNgramChain(
730               key, value, entry, &key_ngrams, 0, &value_ngrams, 0) == DONE) {
731         deleted = true;
732       }
733     }
734   }
735   if (deleted) {
736     updated_ = true;
737   }
738   return deleted;
739 }
740 
741 // Returns true if prev_entry has a next_fp link to entry
742 // static
HasBigramEntry(const Entry & entry,const Entry & prev_entry)743 bool UserHistoryPredictor::HasBigramEntry(
744     const Entry &entry, const Entry &prev_entry) {
745   const uint32 fp = EntryFingerprint(entry);
746   for (int i = 0; i < prev_entry.next_entries_size(); ++i) {
747     if (fp == prev_entry.next_entries(i).entry_fp()) {
748       return true;
749     }
750   }
751   return false;
752 }
753 
754 // static
GetRomanMisspelledKey(const ConversionRequest & request,const Segments & segments)755 string UserHistoryPredictor::GetRomanMisspelledKey(
756     const ConversionRequest &request,
757     const Segments &segments) {
758   if (request.config().preedit_method() != config::Config::ROMAN) {
759     return "";
760   }
761 
762   const string &preedit = segments.conversion_segment(0).key();
763   // TODO(team): Use composer if it is available.
764   // segments.composer()->GetQueryForConversion(&preedit);
765   // Since ConverterInterface doesn't have StartPredictionWithComposer,
766   // we cannot use composer currently.
767   if (!preedit.empty() && MaybeRomanMisspelledKey(preedit)) {
768     return ToRoman(preedit);
769   }
770 
771   return "";
772 }
773 
774 // static
MaybeRomanMisspelledKey(const string & key)775 bool UserHistoryPredictor::MaybeRomanMisspelledKey(const string &key) {
776   int num_alpha = 0;
777   int num_hiragana = 0;
778   int num_unknown = 0;
779   for (ConstChar32Iterator iter(key); !iter.Done(); iter.Next()) {
780     const char32 w = iter.Get();
781     const Util::ScriptType type = Util::GetScriptType(w);
782     if (type == Util::HIRAGANA || w == 0x30FC) {  // "ー".
783       ++num_hiragana;
784       continue;
785     }
786     if (type == Util::UNKNOWN_SCRIPT && num_unknown <= 0) {
787       ++num_unknown;
788       continue;
789     }
790     if (type == Util::ALPHABET && num_alpha <= 0) {
791       ++num_alpha;
792       continue;
793     }
794     return false;
795   }
796 
797   return (num_hiragana > 0 &&
798           ((num_alpha == 1 && num_unknown == 0) ||
799            (num_alpha == 0 && num_unknown == 1)));
800 }
801 
802 // static
RomanFuzzyPrefixMatch(const string & str,const string & prefix)803 bool UserHistoryPredictor::RomanFuzzyPrefixMatch(
804     const string &str, const string &prefix) {
805   if (prefix.empty() || prefix.size() > str.size()) {
806     return false;
807   }
808 
809   // 1. Allows one character delete in Romanji sequence.
810   // 2. Allows one swap in Romanji sequence.
811   for (size_t i = 0; i < prefix.size(); ++i) {
812     if (prefix[i] == str[i]) {
813       continue;
814     }
815 
816     if (str[i] == '-') {
817       // '-' voice sound mark can be matched to any
818       // non-alphanum character.
819       if (!isalnum(prefix[i])) {
820         string replaced_prefix = prefix;
821         replaced_prefix[i] = str[i];
822         if (Util::StartsWith(str, replaced_prefix)) {
823           return true;
824         }
825       }
826     } else {
827       // deletion.
828       string inserted_prefix = prefix;
829       inserted_prefix.insert(i, 1, str[i]);
830       if (Util::StartsWith(str, inserted_prefix)) {
831         return true;
832       }
833 
834       // swap.
835       if (i + 1 < prefix.size()) {
836         string swapped_prefix = prefix;
837         using std::swap;
838         swap(swapped_prefix[i], swapped_prefix[i + 1]);
839         if (Util::StartsWith(str, swapped_prefix)) {
840           return true;
841         }
842       }
843     }
844 
845     return false;
846   }
847 
848   // |prefix| is an exact suffix of |str|.
849   return false;
850 }
851 
RomanFuzzyLookupEntry(const string & roman_input_key,const Entry * entry,EntryPriorityQueue * results) const852 bool UserHistoryPredictor::RomanFuzzyLookupEntry(
853     const string &roman_input_key, const Entry *entry,
854     EntryPriorityQueue *results) const {
855   if (roman_input_key.empty()) {
856     return false;
857   }
858 
859   DCHECK(entry);
860   DCHECK(results);
861 
862   if (!RomanFuzzyPrefixMatch(ToRoman(entry->key()),
863                              roman_input_key)) {
864     return false;
865   }
866 
867   Entry *result = results->NewEntry();
868   DCHECK(result);
869   result->Clear();
870   result->CopyFrom(*entry);
871   result->set_spelling_correction(true);
872   results->Push(result);
873 
874   return true;
875 }
876 
AddEntry(const Entry & entry,EntryPriorityQueue * results) const877 UserHistoryPredictor::Entry *UserHistoryPredictor::AddEntry(
878     const Entry &entry, EntryPriorityQueue *results) const {
879   // We add an entry even if it was marked as removed so that it can be used to
880   // generate prediction by entry chaining. The deleted entry itself is never
881   // shown in the final prediction result as it is filtered finally.
882   Entry *new_entry = results->NewEntry();
883   new_entry->Clear();
884   new_entry->CopyFrom(entry);
885   return new_entry;
886 }
887 
AddEntryWithNewKeyValue(const string & key,const string & value,const Entry & entry,EntryPriorityQueue * results) const888 UserHistoryPredictor::Entry *UserHistoryPredictor::AddEntryWithNewKeyValue(
889     const string &key, const string &value, const Entry &entry,
890     EntryPriorityQueue *results) const {
891   // We add an entry even if it was marked as removed so that it can be used to
892   // generate prediction by entry chaining. The deleted entry itself is never
893   // shown in the final prediction result as it is filtered finally.
894   Entry *new_entry = results->NewEntry();
895   new_entry->Clear();
896   new_entry->CopyFrom(entry);
897   new_entry->set_key(key);
898   new_entry->set_value(value);
899 
900   // Sets removed field true if the new key and value were removed.
901   const Entry *e = dic_->LookupWithoutInsert(Fingerprint(key, value));
902   new_entry->set_removed(e != nullptr && e->removed());
903 
904   return new_entry;
905 }
906 
GetKeyValueForExactAndRightPrefixMatch(const string & input_key,const Entry * entry,const Entry ** result_last_entry,uint64 * left_last_access_time,uint64 * left_most_last_access_time,string * result_key,string * result_value) const907 bool UserHistoryPredictor::GetKeyValueForExactAndRightPrefixMatch(
908     const string &input_key,
909     const Entry *entry, const Entry **result_last_entry,
910     uint64 *left_last_access_time,
911     uint64 *left_most_last_access_time,
912     string *result_key, string *result_value) const {
913   string key = entry->key();
914   string value = entry->value();
915   const Entry *current_entry = entry;
916   mozc_hash_set<uint32> seen;
917   seen.insert(EntryFingerprint(*current_entry));
918   // Until target entry gets longer than input_key.
919   while (key.size() <= input_key.size()) {
920     const Entry *latest_entry = nullptr;
921     const Entry *left_same_timestamp_entry = nullptr;
922     const Entry *left_most_same_timestamp_entry = nullptr;
923     for (size_t i = 0; i < current_entry->next_entries_size(); ++i) {
924       const Entry *tmp_next_entry = dic_->LookupWithoutInsert(
925           current_entry->next_entries(i).entry_fp());
926       if (tmp_next_entry == nullptr || tmp_next_entry->key().empty()) {
927         continue;
928       }
929       const MatchType mtype_joined = GetMatchType(key + tmp_next_entry->key(),
930                                             input_key);
931       if (mtype_joined == NO_MATCH || mtype_joined == LEFT_EMPTY_MATCH) {
932         continue;
933       }
934       if (latest_entry == nullptr ||
935           latest_entry->last_access_time() <
936           tmp_next_entry->last_access_time()) {
937         latest_entry = tmp_next_entry;
938       }
939       if (tmp_next_entry->last_access_time() == *left_last_access_time) {
940         left_same_timestamp_entry = tmp_next_entry;
941       }
942       if (tmp_next_entry->last_access_time() == *left_most_last_access_time) {
943         left_most_same_timestamp_entry = tmp_next_entry;
944       }
945     }
946 
947     // Prefer bigrams which are generated at the same time.
948     // When last_access_time are the same, these two bigrams were
949     // input together.
950     // The preferences:
951     // (1). The current entry's time stamp is equal to that of
952     //      left most content word
953     // (2). The current entry's time stamp is equal to that of
954     //      left closest content word
955     // (3). The current entry is the latest
956     const Entry *next_entry = left_most_same_timestamp_entry;
957     if (next_entry == nullptr) {
958       next_entry = left_same_timestamp_entry;
959     }
960     if (next_entry == nullptr) {
961       next_entry = latest_entry;
962     }
963 
964     if (next_entry == nullptr || next_entry->key().empty()) {
965       break;
966     }
967 
968     // If duplicate entry is found, don't expand more.
969     // This is because an entry only has one timestamp.
970     // we cannot trust the timestamp if there are duplicate values
971     // in one input.
972     if (!seen.insert(EntryFingerprint(*next_entry)).second) {
973       break;
974     }
975 
976     key += next_entry->key();
977     value += next_entry->value();
978     current_entry = next_entry;
979     *result_last_entry = next_entry;
980 
981     // Don't update left_access_time if the current entry is
982     // not a content word.
983     // The time-stamp of non-content-word will be updated frequently.
984     // The time-stamp of the previous candidate is more trustful.
985     // It partially fixes the bug http://b/2843371.
986     const bool is_content_word = IsContentWord(current_entry->value());
987 
988     if (is_content_word) {
989       *left_last_access_time = current_entry->last_access_time();
990     }
991 
992     // If left_most entry is a functional word (symbols/punctuations),
993     // we don't take it as a canonical candidate.
994     if (*left_most_last_access_time == 0 && is_content_word) {
995       *left_most_last_access_time = current_entry->last_access_time();
996     }
997   }
998 
999   if (key.size() < input_key.size()) {
1000     VLOG(3) << "Cannot find prefix match even after chain rules";
1001     return false;
1002   }
1003 
1004   *result_key = key;
1005   *result_value = value;
1006   return true;
1007 }
1008 
LookupEntry(RequestType request_type,const string & input_key,const string & key_base,const Trie<string> * key_expanded,const Entry * entry,const Entry * prev_entry,EntryPriorityQueue * results) const1009 bool UserHistoryPredictor::LookupEntry(
1010     RequestType request_type,
1011     const string &input_key,
1012     const string &key_base,
1013     const Trie<string> *key_expanded,
1014     const Entry *entry,
1015     const Entry *prev_entry,
1016     EntryPriorityQueue *results) const {
1017   CHECK(entry);
1018   CHECK(results);
1019 
1020   Entry *result = nullptr;
1021 
1022   const Entry *last_entry = nullptr;
1023 
1024   // last_access_time of the left-closest content word.
1025   uint64 left_last_access_time = 0;
1026 
1027   // last_access_time of the left-most content word.
1028   uint64 left_most_last_access_time = 0;
1029 
1030   // Example: [a|B|c|D]
1031   // a,c: functional word
1032   // B,D: content word
1033   // left_last_access_time:   timestamp of D
1034   // left_most_last_access_time:   timestamp of B
1035 
1036   // |input_key| is a query user is now typing.
1037   // |entry->key()| is a target value saved in the database.
1038   //  const string input_key = key_base;
1039 
1040   const MatchType mtype = GetMatchTypeFromInput(
1041       input_key, key_base, key_expanded, entry->key());
1042   if (mtype == NO_MATCH) {
1043     return false;
1044   } else if (mtype == LEFT_EMPTY_MATCH) {  // zero-query-suggestion
1045     // if |input_key| is empty, the |prev_entry| and |entry| must
1046     // have bigram relation.
1047     if (prev_entry != nullptr && HasBigramEntry(*entry, *prev_entry)) {
1048       result = AddEntry(*entry, results);
1049       if (result) {
1050         last_entry = entry;
1051         left_last_access_time = entry->last_access_time();
1052         left_most_last_access_time = IsContentWord(entry->value()) ?
1053             left_last_access_time : 0;
1054       }
1055     } else {
1056       return false;
1057     }
1058   } else if (mtype == LEFT_PREFIX_MATCH) {
1059     // |input_key| is shorter than |entry->key()|
1060     // This scenario is a simple prefix match.
1061     // e.g., |input_key|="foo", |entry->key()|="foobar"
1062     result = AddEntry(*entry, results);
1063     if (result) {
1064       last_entry = entry;
1065       left_last_access_time = entry->last_access_time();
1066       left_most_last_access_time = IsContentWord(entry->value()) ?
1067           left_last_access_time : 0;
1068     }
1069   } else if (mtype == RIGHT_PREFIX_MATCH || mtype == EXACT_MATCH) {
1070     // |input_key| is longer than or the same as |entry->key()|.
1071     // In this case, recursively traverse "next_entries" until
1072     // target entry gets longer than input_key.
1073     // e.g., |input_key|="foobar", |entry->key()|="foo"
1074     if (request_type == ZERO_QUERY_SUGGESTION && mtype == EXACT_MATCH) {
1075       // For mobile, we don't generate joined result.
1076       result = AddEntry(*entry, results);
1077       if (result) {
1078         last_entry = entry;
1079         left_last_access_time = entry->last_access_time();
1080         left_most_last_access_time = IsContentWord(entry->value()) ?
1081             left_last_access_time : 0;
1082       }
1083     } else {
1084       string key, value;
1085       left_last_access_time = entry->last_access_time();
1086       left_most_last_access_time = IsContentWord(entry->value()) ?
1087           left_last_access_time : 0;
1088       if (!GetKeyValueForExactAndRightPrefixMatch(
1089               input_key, entry, &last_entry,
1090               &left_last_access_time, &left_most_last_access_time,
1091               &key, &value)) {
1092         return false;
1093       }
1094       result = AddEntryWithNewKeyValue(key, value, *entry, results);
1095     }
1096   } else {
1097     LOG(ERROR) << "Unknown match mode: " << mtype;
1098     return false;
1099   }
1100 
1101   if (result == nullptr) {
1102     return false;
1103   }
1104 
1105   // If prev entry is not nullptr, check whether there is a bigram
1106   // from |prev_entry| to |entry|.
1107   result->set_bigram_boost(false);
1108 
1109   if (prev_entry != nullptr && HasBigramEntry(*entry, *prev_entry)) {
1110     // Sets bigram_boost flag so that this entry is boosted
1111     // against LRU policy.
1112     result->set_bigram_boost(true);
1113   }
1114 
1115   if (!result->removed()) {
1116     results->Push(result);
1117   }
1118 
1119   if (request_type == ZERO_QUERY_SUGGESTION) {
1120     // For mobile, we don't generate joined result.
1121     return true;
1122   }
1123 
1124   // Generates joined result using |last_entry|.
1125   if (last_entry != nullptr &&
1126       Util::CharsLen(result->key()) >= 1 &&
1127       2 * Util::CharsLen(input_key) >= Util::CharsLen(result->key())) {
1128     const Entry *latest_entry = nullptr;
1129     const Entry *left_same_timestamp_entry = nullptr;
1130     const Entry *left_most_same_timestamp_entry = nullptr;
1131     for (int i = 0; i < last_entry->next_entries_size(); ++i) {
1132       const Entry *tmp_entry = dic_->LookupWithoutInsert(
1133           last_entry->next_entries(i).entry_fp());
1134       if (tmp_entry == nullptr || tmp_entry->key().empty()) {
1135         continue;
1136       }
1137       if (latest_entry == nullptr ||
1138           latest_entry->last_access_time() < tmp_entry->last_access_time()) {
1139         latest_entry = tmp_entry;
1140       }
1141       if (tmp_entry->last_access_time() == left_last_access_time) {
1142         left_same_timestamp_entry = tmp_entry;
1143       }
1144       if (tmp_entry->last_access_time() == left_most_last_access_time) {
1145         left_most_same_timestamp_entry = tmp_entry;
1146       }
1147     }
1148 
1149     const Entry *next_entry = left_most_same_timestamp_entry;
1150     if (next_entry == nullptr) {
1151       next_entry = left_same_timestamp_entry;
1152     }
1153     if (next_entry == nullptr) {
1154       next_entry = latest_entry;
1155     }
1156 
1157     // The new entry was input within 10 seconds.
1158     // TODO(taku): This is a simple heuristics.
1159     if (next_entry != nullptr && !next_entry->key().empty() &&
1160         abs(static_cast<int32>(next_entry->last_access_time() -
1161                                last_entry->last_access_time())) <= 10 &&
1162         IsContentWord(next_entry->value())) {
1163       Entry *result2 = AddEntryWithNewKeyValue(
1164           result->key() + next_entry->key(),
1165           result->value() + next_entry->value(),
1166           *result,
1167           results);
1168       if (!result2->removed()) {
1169         results->Push(result2);
1170       }
1171     }
1172   }
1173 
1174   return true;
1175 }
1176 
Predict(Segments * segments) const1177 bool UserHistoryPredictor::Predict(Segments *segments) const {
1178   ConversionRequest default_request;
1179   return PredictForRequest(default_request, segments);
1180 }
1181 
PredictForRequest(const ConversionRequest & request,Segments * segments) const1182 bool UserHistoryPredictor::PredictForRequest(const ConversionRequest &request,
1183                                              Segments *segments) const {
1184   if (!CheckSyncerAndDelete()) {
1185     LOG(WARNING) << "Syncer is running";
1186     return false;
1187   }
1188 
1189   if (request.config().incognito_mode()) {
1190     VLOG(2) << "incognito mode";
1191     return false;
1192   }
1193 
1194   if (request.config().history_learning_level() == config::Config::NO_HISTORY) {
1195     VLOG(2) << "history learning level is NO_HISTORY";
1196     return false;
1197   }
1198 
1199   if (segments->request_type() == Segments::CONVERSION) {
1200     VLOG(2) << "request type is CONVERSION";
1201     return false;
1202   }
1203 
1204   if (!request.config().use_history_suggest() &&
1205       segments->request_type() == Segments::SUGGESTION) {
1206     VLOG(2) << "no history suggest";
1207     return false;
1208   }
1209 
1210   if (segments->conversion_segments_size() < 1) {
1211     VLOG(2) << "segment size < 1";
1212     return false;
1213   }
1214 
1215   if (dic_->Head() == nullptr) {
1216     VLOG(2) << "dic head is nullptr";
1217     return false;
1218   }
1219 
1220   const RequestType request_type = request.request().zero_query_suggestion() ?
1221       ZERO_QUERY_SUGGESTION : DEFAULT;
1222   const string &input_key = segments->conversion_segment(0).key();
1223   if (IsPunctuation(Util::SubString(input_key, 0, 1))) {
1224     VLOG(2) << "input_key starts with punctuations";
1225     return false;
1226   }
1227 
1228   const size_t input_key_len = Util::CharsLen(input_key);
1229   if (input_key_len == 0 && request_type == DEFAULT) {
1230     VLOG(2) << "key length is 0";
1231     return false;
1232   }
1233 
1234   const Entry *prev_entry =
1235       LookupPrevEntry(*segments, request.request().available_emoji_carrier());
1236   if (input_key_len == 0 && prev_entry == nullptr) {
1237     VLOG(1) << "If input_key_len is 0, prev_entry must be set";
1238     return false;
1239   }
1240 
1241   EntryPriorityQueue results;
1242   GetResultsFromHistoryDictionary(
1243       request_type, request, *segments, prev_entry, &results);
1244   if (results.size() == 0) {
1245     VLOG(2) << "no prefix match candiate is found.";
1246     return false;
1247   }
1248 
1249   return InsertCandidates(request_type, request, segments, &results);
1250 }
1251 
LookupPrevEntry(const Segments & segments,uint32 available_emoji_carrier) const1252 const UserHistoryPredictor::Entry *UserHistoryPredictor::LookupPrevEntry(
1253     const Segments &segments, uint32 available_emoji_carrier) const {
1254   const size_t history_segments_size = segments.history_segments_size();
1255   const Entry *prev_entry = nullptr;
1256   // When threre are non-zero history segments, lookup an entry
1257   // from the LRU dictionary, which is correspoinding to the last
1258   // history segment.
1259   if (history_segments_size == 0) {
1260     return nullptr;
1261   }
1262 
1263   const Segment &history_segment =
1264       segments.history_segment(history_segments_size - 1);
1265 
1266   // Simply lookup the history_segment.
1267   prev_entry = dic_->LookupWithoutInsert(SegmentFingerprint(history_segment));
1268 
1269   // When |prev_entry| is nullptr or |prev_entry| has no valid next_entries,
1270   // do linear-search over the LRU.
1271   if ((prev_entry == nullptr && history_segment.candidates_size() > 0) ||
1272       (prev_entry != nullptr && prev_entry->next_entries_size() == 0)) {
1273     const string &prev_value = prev_entry == nullptr ?
1274         history_segment.candidate(0).value : prev_entry->value();
1275     int trial = 0;
1276     for (const DicElement *elm = dic_->Head();
1277          trial++ < kMaxPrevValueTrial && elm != nullptr; elm = elm->next) {
1278       const Entry *entry = &(elm->value);
1279       // entry->value() equals to the prev_value or
1280       // entry->value() is a SUFFIX of prev_value.
1281       // length of entry->value() must be >= 2, as single-length
1282       // match would be noisy.
1283       if (IsValidEntry(*entry, available_emoji_carrier) &&
1284           entry != prev_entry &&
1285           entry->next_entries_size() > 0 &&
1286           Util::CharsLen(entry->value()) >= 2 &&
1287           (entry->value() == prev_value ||
1288            Util::EndsWith(prev_value, entry->value()))) {
1289         prev_entry = entry;
1290         break;
1291       }
1292     }
1293   }
1294   return prev_entry;
1295 }
1296 
GetResultsFromHistoryDictionary(RequestType request_type,const ConversionRequest & request,const Segments & segments,const Entry * prev_entry,EntryPriorityQueue * results) const1297 void UserHistoryPredictor::GetResultsFromHistoryDictionary(
1298     RequestType request_type,
1299     const ConversionRequest &request,
1300     const Segments &segments, const Entry *prev_entry,
1301     EntryPriorityQueue *results) const {
1302   DCHECK(results);
1303   const size_t max_results_size = 5 * segments.max_prediction_candidates_size();
1304 
1305   // Gets romanized input key if the given preedit looks misspelled.
1306   const string roman_input_key = GetRomanMisspelledKey(request, segments);
1307 
1308   // TODO(team): make GetKanaMisspelledKey(segments);
1309   // const string kana_input_key = GetKanaMisspelledKey(segments);
1310 
1311   // If we have ambiguity for the input, get expanded key.
1312   // Example1 roman input: for "あk", we will get |base|, "あ" and |expanded|,
1313   // "か", "き", etc
1314   // Example2 kana input: for "あか", we will get |base|, "あ" and |expanded|,
1315   // "か", and "が".
1316 
1317   // |base_key| and |input_key| could be different
1318   // For kana-input, we will expand the ambiguity for "゛".
1319   // When we input "もす",
1320   // |base_key|: "も"
1321   // |expanded|: "す", "ず"
1322   // |input_key|: "もす"
1323   // In this case, we want to show candidates for "もす" as EXACT match,
1324   // and candidates for "もず" as LEFT_PREFIX_MATCH
1325   //
1326   // For roman-input, when we input "あk",
1327   // |input_key| is "あk" and |base_key| is "あ"
1328   string input_key;
1329   string base_key;
1330   unique_ptr<Trie<string>> expanded;
1331   GetInputKeyFromSegments(request, segments, &input_key, &base_key, &expanded);
1332 
1333   int trial = 0;
1334   for (const DicElement *elm = dic_->Head(); elm != nullptr; elm = elm->next) {
1335     if (!IsValidEntryIgnoringRemovedField(
1336             elm->value, request.request().available_emoji_carrier())) {
1337       continue;
1338     }
1339     if (segments.request_type() == Segments::SUGGESTION &&
1340         trial++ >= kMaxSuggestionTrial) {
1341       VLOG(2) << "too many trials";
1342       break;
1343     }
1344 
1345     // Lookup key from elm_value and prev_entry.
1346     // If a new entry is found, the entry is pushed to the results.
1347     // TODO(team): make KanaFuzzyLookupEntry().
1348     if (!LookupEntry(request_type, input_key, base_key, expanded.get(),
1349                      &(elm->value), prev_entry, results) &&
1350         !RomanFuzzyLookupEntry(roman_input_key, &(elm->value), results)) {
1351       continue;
1352     }
1353 
1354     // already found enough results.
1355     if (results->size() >= max_results_size) {
1356       break;
1357     }
1358   }
1359 }
1360 
1361 // static
GetInputKeyFromSegments(const ConversionRequest & request,const Segments & segments,string * input_key,string * base,unique_ptr<Trie<string>> * expanded)1362 void UserHistoryPredictor::GetInputKeyFromSegments(
1363     const ConversionRequest &request, const Segments &segments,
1364     string *input_key, string *base,
1365     unique_ptr<Trie<string>> *expanded) {
1366   DCHECK(input_key);
1367   DCHECK(base);
1368 
1369   if (!request.has_composer() ||
1370       !FLAGS_enable_expansion_for_user_history_predictor) {
1371     *input_key = segments.conversion_segment(0).key();
1372     *base = segments.conversion_segment(0).key();
1373     return;
1374   }
1375 
1376   request.composer().GetStringForPreedit(input_key);
1377   std::set<string> expanded_set;
1378   request.composer().GetQueriesForPrediction(base, &expanded_set);
1379   if (expanded_set.size() > 0) {
1380     expanded->reset(new Trie<string>);
1381     for (std::set<string>::const_iterator itr = expanded_set.begin();
1382          itr != expanded_set.end(); ++itr) {
1383       // For getting matched key, insert values
1384       (*expanded)->AddEntry(*itr, *itr);
1385     }
1386   }
1387 }
1388 
InsertCandidates(RequestType request_type,const ConversionRequest & request,Segments * segments,EntryPriorityQueue * results) const1389 bool UserHistoryPredictor::InsertCandidates(RequestType request_type,
1390                                             const ConversionRequest &request,
1391                                             Segments *segments,
1392                                             EntryPriorityQueue *results) const {
1393   DCHECK(results);
1394   Segment *segment = segments->mutable_conversion_segment(0);
1395   if (segment == nullptr) {
1396     LOG(ERROR) << "segment is nullptr";
1397     return false;
1398   }
1399   const uint32 input_key_len = Util::CharsLen(segment->key());
1400   while (segment->candidates_size() <
1401          segments->max_prediction_candidates_size()) {
1402     // |results| is a priority queue where the elemtnt
1403     // in the queue is sorted by the score defined in GetScore().
1404     const Entry *result_entry = results->Pop();
1405     if (result_entry == nullptr) {
1406       // Pop() returns nullptr when no more valid entry exists.
1407       break;
1408     }
1409     bool is_valid_candidate = false;
1410     if (segments->request_type() == Segments::PREDICTION) {
1411       is_valid_candidate = true;
1412     } else if (segments->request_type() == Segments::SUGGESTION) {
1413       // The top result of suggestion should be a VALID suggestion candidate.
1414       // i.e., SuggestionTrigerFunc should return true for the first
1415       // candidate.
1416       // If user types "デスノート" too many times, "デスノート" will be
1417       // suggested when user types "で". It is expected, but if user types
1418       // "です" after that,  showing "デスノート" is annoying.
1419       // In this situation, "です" is in the LRU, but SuggestionTrigerFunc
1420       // returns false for "です", since it is short.
1421       if (IsValidSuggestion(request_type,
1422                             input_key_len, *result_entry)) {
1423         is_valid_candidate = true;
1424       } else if (segment->candidates_size() == 0) {
1425         VLOG(2) << "candidates size is 0";
1426         return false;
1427       }
1428     } else {
1429       LOG(ERROR) << "Unknown mode";
1430       return false;
1431     }
1432 
1433     if (!is_valid_candidate) {
1434       VLOG(2) << "not a valid candidate: " << result_entry->key();
1435       continue;
1436     }
1437 
1438     if (request.request().mixed_conversion() &&
1439         result_entry->suggestion_freq() < 2 &&
1440         Util::CharsLen(result_entry->value()) > 8) {
1441       // Don't show long history for mixed conversion
1442       // TODO(toshiyuki): Better to merge this into IsValidSuggestion logic.
1443       VLOG(2) << "long candidate: " << result_entry->value();
1444       continue;
1445     }
1446 
1447     Segment::Candidate *candidate = segment->push_back_candidate();
1448     DCHECK(candidate);
1449     candidate->Init();
1450     candidate->key = result_entry->key();
1451     candidate->content_key = result_entry->key();
1452     candidate->value = result_entry->value();
1453     candidate->content_value = result_entry->value();
1454     candidate->attributes |=
1455         Segment::Candidate::USER_HISTORY_PREDICTION |
1456         Segment::Candidate::NO_VARIANTS_EXPANSION;
1457     candidate->source_info |= Segment::Candidate::USER_HISTORY_PREDICTOR;
1458     if (result_entry->spelling_correction()) {
1459       candidate->attributes |= Segment::Candidate::SPELLING_CORRECTION;
1460     }
1461     const string &description = result_entry->description();
1462     // If we have stored description, set it exactly.
1463     if (!description.empty()) {
1464       candidate->description = description;
1465       candidate->attributes |= Segment::Candidate::NO_EXTRA_DESCRIPTION;
1466     } else {
1467       VariantsRewriter::SetDescriptionForPrediction(*pos_matcher_, candidate);
1468     }
1469 #if DEBUG
1470     if (candidate->description.find("History") == string::npos) {
1471       candidate->description += " History";
1472     }
1473 #endif  // DEBUG
1474   }
1475 
1476   return (segment->candidates_size() > 0);
1477 }
1478 
InsertNextEntry(const NextEntry & next_entry,Entry * entry) const1479 void UserHistoryPredictor::InsertNextEntry(
1480     const NextEntry &next_entry, Entry *entry) const {
1481   if (next_entry.entry_fp() == 0 || entry == nullptr) {
1482     return;
1483   }
1484 
1485   NextEntry *target_next_entry = nullptr;
1486 
1487   // If next_entries_size is less than kMaxNextEntriesSize,
1488   // we simply allocate a new entry.
1489   if (entry->next_entries_size() < max_next_entries_size()) {
1490     target_next_entry = entry->add_next_entries();
1491   } else {
1492     // Otherwise, find the oldest next_entry.
1493     uint64 last_access_time = kuint64max;
1494     for (int i = 0; i < entry->next_entries_size(); ++i) {
1495       // Already has the same id
1496       if (next_entry.entry_fp() == entry->next_entries(i).entry_fp()) {
1497         target_next_entry = entry->mutable_next_entries(i);
1498         break;
1499       }
1500       const Entry *found_entry = dic_->LookupWithoutInsert(
1501           entry->next_entries(i).entry_fp());
1502       // Reuses the entry if it is already removed from the LRU.
1503       if (found_entry == nullptr) {
1504         target_next_entry = entry->mutable_next_entries(i);
1505         break;
1506       }
1507       // Preserves the oldest entry
1508       if (target_next_entry == nullptr ||
1509           last_access_time > found_entry->last_access_time()) {
1510         target_next_entry = entry->mutable_next_entries(i);
1511         last_access_time = found_entry->last_access_time();
1512       }
1513     }
1514   }
1515 
1516   if (target_next_entry == nullptr) {
1517     LOG(ERROR) << "cannot find a room for inserting next fp";
1518     return;
1519   }
1520 
1521   target_next_entry->CopyFrom(next_entry);
1522 }
1523 
IsValidEntry(const Entry & entry,uint32 available_emoji_carrier) const1524 bool UserHistoryPredictor::IsValidEntry(
1525     const Entry &entry, uint32 available_emoji_carrier) const {
1526   return !entry.removed() &&
1527       IsValidEntryIgnoringRemovedField(entry, available_emoji_carrier);
1528 }
1529 
IsValidEntryIgnoringRemovedField(const Entry & entry,uint32 available_emoji_carrier) const1530 bool UserHistoryPredictor::IsValidEntryIgnoringRemovedField(
1531     const Entry &entry, uint32 available_emoji_carrier) const {
1532   if (entry.entry_type() != Entry::DEFAULT_ENTRY ||
1533       suppression_dictionary_->SuppressEntry(entry.key(), entry.value())) {
1534     return false;
1535   }
1536 
1537   if (IsEmojiEntry(entry)) {
1538     if (Util::IsAndroidPuaEmoji(entry.value())) {
1539       // Android carrier dependent emoji.
1540       const uint32 kAndroidCarrier =
1541           Request::DOCOMO_EMOJI |
1542           Request::SOFTBANK_EMOJI |
1543           Request::KDDI_EMOJI;
1544       if (!(available_emoji_carrier & kAndroidCarrier)) {
1545         return false;
1546       }
1547     } else {
1548       // Unicode 6.0 emoji.
1549       if (!(available_emoji_carrier & Request::UNICODE_EMOJI)) {
1550         return false;
1551       }
1552     }
1553   }
1554 
1555   return true;
1556 }
1557 
InsertEvent(EntryType type)1558 void UserHistoryPredictor::InsertEvent(EntryType type) {
1559   if (type == Entry::DEFAULT_ENTRY) {
1560     return;
1561   }
1562 
1563   const uint64 last_access_time = Clock::GetTime();
1564   const uint32 dic_key = Fingerprint("", "", type);
1565 
1566   CHECK(dic_.get());
1567   DicElement *e = dic_->Insert(dic_key);
1568   if (e == nullptr) {
1569     VLOG(2) << "insert failed";
1570     return;
1571   }
1572 
1573   Entry *entry = &(e->value);
1574   DCHECK(entry);
1575   entry->Clear();
1576   entry->set_entry_type(type);
1577   entry->set_last_access_time(last_access_time);
1578 }
1579 
TryInsert(RequestType request_type,const string & key,const string & value,const string & description,bool is_suggestion_selected,uint32 next_fp,uint64 last_access_time,Segments * segments)1580 void UserHistoryPredictor::TryInsert(RequestType request_type,
1581                                      const string &key,
1582                                      const string &value,
1583                                      const string &description,
1584                                      bool is_suggestion_selected,
1585                                      uint32 next_fp,
1586                                      uint64 last_access_time,
1587                                      Segments *segments) {
1588   if (key.empty() || value.empty() ||
1589       key.size() > kMaxStringLength ||
1590       value.size() > kMaxStringLength ||
1591       description.size() > kMaxStringLength) {
1592     return;
1593   }
1594 
1595   // For mobile, we do not learn candidates that ends with puctuation.
1596   if (request_type == ZERO_QUERY_SUGGESTION &&
1597       Util::CharsLen(value) > 1 &&
1598       IsPunctuation(Util::SubString(value, Util::CharsLen(value) - 1, 1))) {
1599     return;
1600   }
1601 
1602   Insert(key, value, description, is_suggestion_selected, next_fp,
1603          last_access_time, segments);
1604 }
1605 
Insert(const string & key,const string & value,const string & description,bool is_suggestion_selected,uint32 next_fp,uint64 last_access_time,Segments * segments)1606 void UserHistoryPredictor::Insert(const string &key,
1607                                   const string &value,
1608                                   const string &description,
1609                                   bool is_suggestion_selected,
1610                                   uint32 next_fp,
1611                                   uint64 last_access_time,
1612                                   Segments *segments) {
1613   const uint32 dic_key = Fingerprint(key, value);
1614 
1615   if (!dic_->HasKey(dic_key)) {
1616     // The key is a new key inserted in the last Finish method.
1617     // Here we push a new RevertEntry so that the new "key" can be
1618     // removed when Revert() method is called.
1619     Segments::RevertEntry *revert_entry = segments->push_back_revert_entry();
1620     DCHECK(revert_entry);
1621     revert_entry->key = Uint32ToString(dic_key);
1622     revert_entry->id = UserHistoryPredictor::revert_id();
1623     revert_entry->revert_entry_type = Segments::RevertEntry::CREATE_ENTRY;
1624   } else {
1625     // The key is a old key not inserted in the last Finish method
1626     // TODO(taku):
1627     // add a treatment for UPDATE_ENTRY mode
1628   }
1629 
1630   DicElement *e = dic_->Insert(dic_key);
1631   if (e == nullptr) {
1632     VLOG(2) << "insert failed";
1633     return;
1634   }
1635 
1636   Entry *entry = &(e->value);
1637   DCHECK(entry);
1638 
1639   entry->set_key(key);
1640   entry->set_value(value);
1641   entry->set_removed(false);
1642 
1643   if (description.empty()) {
1644     entry->clear_description();
1645   } else {
1646     entry->set_description(description);
1647   }
1648 
1649   entry->set_last_access_time(last_access_time);
1650   if (is_suggestion_selected) {
1651     entry->set_suggestion_freq(entry->suggestion_freq() + 1);
1652   } else {
1653     entry->set_conversion_freq(entry->conversion_freq() + 1);
1654   }
1655 
1656   // Inserts next_fp to the entry
1657   if (next_fp != 0) {
1658     NextEntry next_entry;
1659     next_entry.set_entry_fp(next_fp);
1660     InsertNextEntry(next_entry, entry);
1661   }
1662 
1663   VLOG(2) << key << " " << value << " has inserted: "
1664           << entry->Utf8DebugString();
1665 
1666   // New entry is inserted to the cache
1667   updated_ = true;
1668 }
1669 
MaybeRecordUsageStats(const Segments & segments) const1670 void UserHistoryPredictor::MaybeRecordUsageStats(
1671     const Segments &segments) const {
1672   const Segment &segment = segments.conversion_segment(0);
1673   if (segment.candidates_size() < 1) {
1674     VLOG(2) << "candidates size < 1";
1675     return;
1676   }
1677 
1678   const Segment::Candidate &candidate = segment.candidate(0);
1679   if (segment.segment_type() != Segment::FIXED_VALUE) {
1680     VLOG(2) << "segment is not FIXED_VALUE" << candidate.value;
1681     return;
1682   }
1683 
1684   if (!(candidate.source_info & Segment::Candidate::USER_HISTORY_PREDICTOR)) {
1685     VLOG(2) << "candidate is not from user_history_predictor";
1686     return;
1687   }
1688 
1689   UsageStats::IncrementCount("CommitUserHistoryPredictor");
1690   if (segment.key().empty()) {
1691     UsageStats::IncrementCount("CommitUserHistoryPredictorZeroQuery");
1692   }
1693 }
1694 
Finish(const ConversionRequest & request,Segments * segments)1695 void UserHistoryPredictor::Finish(const ConversionRequest &request,
1696                                   Segments *segments) {
1697   if (segments->request_type() == Segments::REVERSE_CONVERSION) {
1698     // Do nothing for REVERSE_CONVERSION.
1699     return;
1700   }
1701 
1702   if (request.config().incognito_mode()) {
1703     VLOG(2) << "incognito mode";
1704     return;
1705   }
1706 
1707   if (request.config().history_learning_level() !=
1708       config::Config::DEFAULT_HISTORY) {
1709     VLOG(2) << "history learning level is not DEFAULT_HISTORY: "
1710             << request.config().history_learning_level();
1711     return;
1712   }
1713 
1714   if (!request.config().use_history_suggest()) {
1715     VLOG(2) << "no history suggest";
1716     return;
1717   }
1718 
1719   if (!CheckSyncerAndDelete()) {
1720     LOG(WARNING) << "Syncer is running";
1721     return;
1722   }
1723 
1724   MaybeRecordUsageStats(*segments);
1725 
1726   const RequestType request_type = request.request().zero_query_suggestion() ?
1727       ZERO_QUERY_SUGGESTION : DEFAULT;
1728   const bool is_suggestion = segments->request_type() != Segments::CONVERSION;
1729   const uint64 last_access_time = Clock::GetTime();
1730 
1731   // If user inputs a punctuation just after some long sentence,
1732   // we make a new candidate by concatinating the top element in LRU and
1733   // the punctuation user input. The top element in LRU is supposed to be
1734   // the long sentence user input before.
1735   // This is a fix for http://b/issue?id=2216838
1736   //
1737   // Note: We don't make such candidates for mobile.
1738   if (request_type != ZERO_QUERY_SUGGESTION &&
1739       dic_->Head() != nullptr &&
1740       dic_->Head()->value.last_access_time() + 5 > last_access_time &&
1741       // Check if the current value is a punctuation.
1742       segments->conversion_segments_size() == 1 &&
1743       segments->conversion_segment(0).candidates_size() > 0 &&
1744       IsPunctuation(segments->conversion_segment(0).candidate(0).value) &&
1745       // Check if the previous value looks like a sentence.
1746       segments->history_segments_size() > 0 &&
1747       segments->history_segment(
1748           segments->history_segments_size() - 1).candidates_size() > 0 &&
1749       IsSentenceLikeCandidate(segments->history_segment(
1750           segments->history_segments_size() - 1).candidate(0))) {
1751     const Entry *entry = &(dic_->Head()->value);
1752     DCHECK(entry);
1753     const string &last_value =
1754         segments->history_segment(
1755             segments->history_segments_size() - 1).candidate(0).value;
1756     // Check if the head value in LRU ends with the candidate value in history
1757     // segments.
1758     if (Util::EndsWith(entry->value(), last_value)) {
1759       const Segment::Candidate &candidate =
1760           segments->conversion_segment(0).candidate(0);
1761       const string key = entry->key() + candidate.key;
1762       const string value = entry->value() + candidate.value;
1763       // Uses the same last_access_time stored in the top element
1764       // so that this item can be grouped together.
1765       TryInsert(request_type,
1766                 key, value, entry->description(), is_suggestion, 0,
1767                 entry->last_access_time(), segments);
1768     }
1769   }
1770 
1771   const size_t history_segments_size = segments->history_segments_size();
1772 
1773   // Checks every segment is valid.
1774   for (size_t i = history_segments_size; i < segments->segments_size(); ++i) {
1775     const Segment &segment = segments->segment(i);
1776     if (segment.candidates_size() < 1) {
1777       VLOG(2) << "candidates size < 1";
1778       return;
1779     }
1780     if (segment.segment_type() != Segment::FIXED_VALUE) {
1781       VLOG(2) << "segment is not FIXED_VALUE";
1782       return;
1783     }
1784     const Segment::Candidate &candidate = segment.candidate(0);
1785     if (candidate.attributes &
1786         Segment::Candidate::NO_SUGGEST_LEARNING) {
1787       VLOG(2) << "NO_SUGGEST_LEARNING";
1788       return;
1789     }
1790   }
1791 
1792   if (IsPrivacySensitive(segments)) {
1793     VLOG(2) << "do not remember privacy sensitive input";
1794     return;
1795   }
1796 
1797   InsertHistory(request_type, is_suggestion, last_access_time, segments);
1798   return;
1799 }
1800 
MakeLearningSegments(const Segments & segments,SegmentsForLearning * learning_segments) const1801 void UserHistoryPredictor::MakeLearningSegments(
1802     const Segments &segments, SegmentsForLearning *learning_segments) const {
1803   DCHECK(learning_segments);
1804 
1805   for (size_t i = 0; i < segments.history_segments_size(); ++i) {
1806     const Segment &segment = segments.history_segment(i);
1807     SegmentForLearning learning_segment;
1808     DCHECK_LE(1, segment.candidates_size());
1809     learning_segment.key = segment.candidate(0).key;
1810     learning_segment.value = segment.candidate(0).value;
1811     learning_segment.content_key = segment.candidate(0).content_key;
1812     learning_segment.content_value = segment.candidate(0).content_value;
1813     learning_segment.description = GetDescription(segment.candidate(0));
1814     learning_segments->push_back_history_segment(learning_segment);
1815   }
1816   for (size_t i = 0; i < segments.conversion_segments_size(); ++i) {
1817     const Segment &segment = segments.conversion_segment(i);
1818     const Segment::Candidate &candidate = segment.candidate(0);
1819     if (candidate.inner_segment_boundary.empty()) {
1820       SegmentForLearning learning_segment;
1821       learning_segment.key = candidate.key;
1822       learning_segment.value = candidate.value;
1823       learning_segment.content_key = candidate.content_key;
1824       learning_segment.content_value = candidate.content_value;
1825       learning_segment.description = GetDescription(candidate);
1826       learning_segments->push_back_conversion_segment(learning_segment);
1827     } else {
1828       SegmentForLearning learning_segment;
1829       for (Segment::Candidate::InnerSegmentIterator iter(&candidate);
1830            !iter.Done(); iter.Next()) {
1831         learning_segment.key.assign(iter.GetKey().data(), iter.GetKey().size());
1832         learning_segment.value.assign(iter.GetValue().data(),
1833                                       iter.GetValue().size());
1834         learning_segment.content_key.assign(iter.GetContentKey().data(),
1835                                             iter.GetContentKey().size());
1836         learning_segment.content_value.assign(iter.GetContentValue().data(),
1837                                               iter.GetContentValue().size());
1838         learning_segments->push_back_conversion_segment(learning_segment);
1839       }
1840     }
1841   }
1842 }
1843 
InsertHistory(RequestType request_type,bool is_suggestion_selected,uint64 last_access_time,Segments * segments)1844 void UserHistoryPredictor::InsertHistory(RequestType request_type,
1845                                          bool is_suggestion_selected,
1846                                          uint64 last_access_time,
1847                                          Segments *segments) {
1848   SegmentsForLearning learning_segments;
1849   MakeLearningSegments(*segments, &learning_segments);
1850 
1851   string all_key, all_value;
1852   mozc_hash_set<uint32> seen;
1853   bool this_was_seen = false;
1854   const size_t history_segments_size =
1855       learning_segments.history_segments_size();
1856 
1857   for (size_t i = history_segments_size;
1858        i < learning_segments.all_segments_size(); ++i) {
1859     const SegmentForLearning &segment = learning_segments.all_segment(i);
1860     all_key += segment.key;
1861     all_value += segment.value;
1862     uint32 next_fp = (i == learning_segments.all_segments_size() - 1) ?
1863         0 : LearningSegmentFingerprint(learning_segments.all_segment(i + 1));
1864     // remember the first segment
1865     if (i == history_segments_size) {
1866       seen.insert(LearningSegmentFingerprint(segment));
1867     }
1868     uint32 next_fp_to_set = next_fp;
1869     // If two duplicate segments exist, kills the link
1870     // TO/FROM the second one to prevent loops.
1871     // Only killing "TO" link caused bug #2982886:
1872     // after converting "らいおん(もうじゅう)とぞうりむし(びせいぶつ)"
1873     // and typing "ぞうりむし", "ゾウリムシ(猛獣" was suggested.
1874     if (this_was_seen) {
1875       next_fp_to_set = 0;
1876     }
1877     if (!seen.insert(next_fp).second) {
1878       next_fp_to_set = 0;
1879       this_was_seen = true;
1880     } else {
1881       this_was_seen = false;
1882     }
1883     TryInsert(request_type,
1884               segment.key,
1885               segment.value,
1886               segment.description,
1887               is_suggestion_selected, next_fp_to_set,
1888               last_access_time, segments);
1889     if (content_word_learning_enabled_ &&
1890         segment.content_key != segment.key &&
1891         segment.content_value != segment.value) {
1892       TryInsert(request_type,
1893                 segment.content_key,
1894                 segment.content_value,
1895                 segment.description,
1896                 is_suggestion_selected, 0,
1897                 last_access_time, segments);
1898     }
1899   }
1900 
1901   // Inserts all_key/all_value.
1902   // We don't insert it for mobile.
1903   if (request_type != ZERO_QUERY_SUGGESTION &&
1904       learning_segments.conversion_segments_size() > 1 &&
1905       !all_key.empty() && !all_value.empty()) {
1906     TryInsert(request_type,
1907               all_key, all_value, "",
1908               is_suggestion_selected,
1909               0, last_access_time, segments);
1910   }
1911 
1912   // Makes a link from the last history_segment to the first conversion segment
1913   // or to the entire user input.
1914   if (learning_segments.history_segments_size() > 0 &&
1915       learning_segments.conversion_segments_size() > 0) {
1916     const SegmentForLearning &history_segment =
1917         learning_segments.history_segment(
1918             segments->history_segments_size() - 1);
1919     const SegmentForLearning &conversion_segment =
1920         learning_segments.conversion_segment(0);
1921     const string &history_value = history_segment.value;
1922     if (history_value.empty() || conversion_segment.value.empty()) {
1923       return;
1924     }
1925     // 1) Don't learn a link from a history which ends with punctuation.
1926     if (IsPunctuation(Util::SubString(history_value,
1927                                       Util::CharsLen(history_value) - 1,
1928                                       1))) {
1929       return;
1930     }
1931     // 2) Don't learn a link to a punctuation.
1932     // Exception: For zero query suggestion, we learn a link to a single
1933     //            punctuation segment.
1934     // Example: "よろしく|。" -> OK
1935     //          "よろしく|。でも" -> NG
1936     //          "よろしく|。。" -> NG
1937     // Note that another piece of code handles learning for
1938     // (sentence + punctuation) form; see Finish().
1939     if (IsPunctuation(Util::SubString(conversion_segment.value, 0, 1)) &&
1940         (request_type != ZERO_QUERY_SUGGESTION ||
1941          Util::CharsLen(conversion_segment.value) > 1)) {
1942       return;
1943     }
1944     Entry *history_entry = dic_->MutableLookupWithoutInsert(
1945         LearningSegmentFingerprint(history_segment));
1946     NextEntry next_entry;
1947     if (segments->request_type() == Segments::CONVERSION) {
1948       next_entry.set_entry_fp(LearningSegmentFingerprint(conversion_segment));
1949       InsertNextEntry(next_entry, history_entry);
1950     }
1951 
1952     // Entire user input or SUGGESTION
1953     if (segments->request_type() != Segments::CONVERSION ||
1954         learning_segments.conversion_segments_size() > 1) {
1955       next_entry.set_entry_fp(Fingerprint(all_key, all_value));
1956       InsertNextEntry(next_entry, history_entry);
1957     }
1958   }
1959 
1960   return;
1961 }
1962 
Revert(Segments * segments)1963 void UserHistoryPredictor::Revert(Segments *segments) {
1964   if (!CheckSyncerAndDelete()) {
1965     LOG(WARNING) << "Syncer is running";
1966     return;
1967   }
1968 
1969   for (size_t i = 0; i < segments->revert_entries_size(); ++i) {
1970     const Segments::RevertEntry &revert_entry =
1971         segments->revert_entry(i);
1972     if (revert_entry.id == UserHistoryPredictor::revert_id() &&
1973         revert_entry.revert_entry_type == Segments::RevertEntry::CREATE_ENTRY) {
1974       VLOG(2) << "Erasing the key: " << StringToUint32(revert_entry.key);
1975       dic_->Erase(StringToUint32(revert_entry.key));
1976     }
1977   }
1978 }
1979 
1980 // static
GetMatchType(const string & lstr,const string & rstr)1981 UserHistoryPredictor::MatchType UserHistoryPredictor::GetMatchType(
1982     const string &lstr, const string &rstr) {
1983   if (lstr.empty() && !rstr.empty()) {
1984     return LEFT_EMPTY_MATCH;
1985   }
1986 
1987   const size_t size = std::min(lstr.size(), rstr.size());
1988   if (size == 0) {
1989     return NO_MATCH;
1990   }
1991 
1992   const int result = memcmp(lstr.data(), rstr.data(), size);
1993   if (result != 0) {
1994     return NO_MATCH;
1995   }
1996 
1997   if (lstr.size() == rstr.size()) {
1998     return EXACT_MATCH;
1999   } else if (lstr.size() < rstr.size()) {
2000     return LEFT_PREFIX_MATCH;
2001   } else {
2002     return RIGHT_PREFIX_MATCH;
2003   }
2004 
2005   return NO_MATCH;
2006 }
2007 
2008 // static
GetMatchTypeFromInput(const string & input_key,const string & key_base,const Trie<string> * key_expanded,const string & target)2009 UserHistoryPredictor::MatchType UserHistoryPredictor::GetMatchTypeFromInput(
2010     const string &input_key,
2011     const string &key_base, const Trie<string> *key_expanded,
2012     const string &target) {
2013   if (key_expanded == nullptr) {
2014     // |input_key| and |key_base| can be different by compoesr modification.
2015     // For example, |input_key|, "8,+", and |base| "8、+".
2016     return GetMatchType(key_base, target);
2017   }
2018 
2019   // We can assume key_expanded != nullptr from here.
2020   if (key_base.empty()) {
2021       string value;
2022       size_t key_length = 0;
2023       bool has_subtrie = false;
2024       if (!key_expanded->LookUpPrefix(target, &value,
2025                                       &key_length, &has_subtrie)) {
2026         return NO_MATCH;
2027       } else if (value == target && value == input_key) {
2028         return EXACT_MATCH;
2029       } else {
2030         return LEFT_PREFIX_MATCH;
2031       }
2032   } else {
2033     const size_t size = std::min(key_base.size(), target.size());
2034     if (size == 0) {
2035       return NO_MATCH;
2036     }
2037     const int result = memcmp(key_base.data(), target.data(), size);
2038     if (result != 0) {
2039       return NO_MATCH;
2040     }
2041     if (target.size() <= key_base.size()) {
2042       return RIGHT_PREFIX_MATCH;
2043     }
2044     string value;
2045     size_t key_length = 0;
2046     bool has_subtrie = false;
2047     if (!key_expanded->LookUpPrefix(target.data() + key_base.size(),
2048                                     &value,
2049                                     &key_length, &has_subtrie)) {
2050       return NO_MATCH;
2051     }
2052     const string matched = key_base + value;
2053     if (matched == target && matched == input_key) {
2054       return EXACT_MATCH;
2055     } else {
2056       return LEFT_PREFIX_MATCH;
2057     }
2058   }
2059 
2060   DCHECK(false) << "Should not come here";
2061   return NO_MATCH;
2062 }
2063 
2064 // static
Fingerprint(const string & key,const string & value,EntryType type)2065 uint32 UserHistoryPredictor::Fingerprint(const string &key,
2066                                          const string &value,
2067                                          EntryType type) {
2068   if (type == Entry::DEFAULT_ENTRY) {
2069     // Since we have already used the fingerprint function for next entries and
2070     // next entries are saved in user's local machine, we are not able
2071     // to change the Fingerprint function for the old key/value type.
2072     return Hash::Fingerprint32(key + kDelimiter + value);
2073   } else {
2074     return Hash::Fingerprint32(static_cast<uint8>(type));
2075   }
2076 }
2077 
2078 // static
Fingerprint(const string & key,const string & value)2079 uint32 UserHistoryPredictor::Fingerprint(const string &key,
2080                                          const string &value) {
2081   return Fingerprint(key, value, Entry::DEFAULT_ENTRY);
2082 }
2083 
EntryFingerprint(const Entry & entry)2084 uint32 UserHistoryPredictor::EntryFingerprint(const Entry &entry) {
2085   return Fingerprint(entry.key(), entry.value());
2086 }
2087 
2088 // static
SegmentFingerprint(const Segment & segment)2089 uint32 UserHistoryPredictor::SegmentFingerprint(const Segment &segment) {
2090   if (segment.candidates_size() > 0) {
2091     return Fingerprint(segment.candidate(0).key, segment.candidate(0).value);
2092   }
2093   return 0;
2094 }
2095 
2096 // static
LearningSegmentFingerprint(const SegmentForLearning & segment)2097 uint32 UserHistoryPredictor::LearningSegmentFingerprint(
2098     const SegmentForLearning &segment) {
2099   return Fingerprint(segment.key, segment.value);
2100 }
2101 
2102 // static
Uint32ToString(uint32 fp)2103 string UserHistoryPredictor::Uint32ToString(uint32 fp) {
2104   string buf(reinterpret_cast<const char *>(&fp), sizeof(fp));
2105   return buf;
2106 }
2107 
2108 // static
StringToUint32(const string & input)2109 uint32 UserHistoryPredictor::StringToUint32(const string &input) {
2110   uint32 result = 0;
2111   if (input.size() == sizeof(result)) {
2112     memcpy(reinterpret_cast<char *>(&result), input.data(), input.size());
2113   }
2114   return result;
2115 }
2116 
2117 // static
IsValidSuggestion(RequestType request_type,uint32 prefix_len,const Entry & entry)2118 bool UserHistoryPredictor::IsValidSuggestion(
2119     RequestType request_type, uint32 prefix_len, const Entry &entry) {
2120   // When bigram_boost is true, that means that previous user input
2121   // and current input have bigram relation.
2122   if (entry.bigram_boost()) {
2123     return true;
2124   }
2125   // When zero_query_suggestion is true, that means that
2126   // predictor is running on mobile device. In this case,
2127   // make the behavior more aggressive.
2128   if (request_type == ZERO_QUERY_SUGGESTION) {
2129     return true;
2130   }
2131   // Handles suggestion_freq and conversion_freq differently.
2132   // conversion_freq is less aggressively affecting to the final decision.
2133   const uint32 freq =
2134       std::max(entry.suggestion_freq(), entry.conversion_freq() / 4);
2135 
2136   // TODO(taku,komatsu): better to make it simpler and easier to be understood.
2137   const uint32 base_prefix_len = 3 - std::min(static_cast<uint32>(2), freq);
2138   return (prefix_len >= base_prefix_len);
2139 }
2140 
2141 // static
2142 // 1) sort by last_access_time, which is basically the same as LRU policy.
2143 // 2) boost shorter candidate, if having the same last_access_time.
2144 // 3) add a bigram boost as a special bonus.
2145 // TODO(taku): better to take "frequency" into consideration
GetScore(const Entry & entry)2146 uint32 UserHistoryPredictor::GetScore(const Entry &entry) {
2147   const uint32 kBigramBoostAsTime = 7 * 24 * 60 * 60;   // 1 week.
2148   return
2149       entry.last_access_time() - Util::CharsLen(entry.value()) +
2150       (entry.bigram_boost() ? kBigramBoostAsTime : 0);
2151 }
2152 
2153 // Returns the size of cache.
2154 // static
cache_size()2155 uint32 UserHistoryPredictor::cache_size() {
2156   return kLRUCacheSize;
2157 }
2158 
2159 // Returns the size of next entries.
2160 // static
max_next_entries_size()2161 uint32 UserHistoryPredictor::max_next_entries_size() {
2162   return kMaxNextEntriesSize;
2163 }
2164 
2165 }  // namespace mozc
2166