1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "prediction/user_history_predictor.h"
31
32 #include <algorithm>
33 #include <cctype>
34 #include <climits>
35 #include <memory>
36 #include <string>
37
38 #include "base/clock.h"
39 #include "base/config_file_stream.h"
40 #include "base/flags.h"
41 #include "base/hash.h"
42 #include "base/logging.h"
43 #include "base/mozc_hash_set.h"
44 #include "base/thread.h"
45 #include "base/trie.h"
46 #include "base/util.h"
47 #include "composer/composer.h"
48 #include "converter/segments.h"
49 #include "dictionary/dictionary_interface.h"
50 #include "dictionary/pos_matcher.h"
51 #include "dictionary/suppression_dictionary.h"
52 #include "prediction/predictor_interface.h"
53 #include "prediction/user_history_predictor.pb.h"
54 #include "protocol/commands.pb.h"
55 #include "protocol/config.pb.h"
56 #include "request/conversion_request.h"
57 #include "rewriter/variants_rewriter.h"
58 #include "storage/encrypted_string_storage.h"
59 #include "storage/lru_cache.h"
60 #include "usage_stats/usage_stats.h"
61
62 // This flag is set by predictor.cc
63 // We can remove this after the ambiguity expansion feature get stable.
64 DEFINE_bool(enable_expansion_for_user_history_predictor,
65 false,
66 "enable ambiguity expansion for user_history_predictor.");
67
68 namespace mozc {
69 namespace {
70
71 using std::unique_ptr;
72
73 using commands::Request;
74 using dictionary::SuppressionDictionary;
75 using dictionary::DictionaryInterface;
76 using dictionary::POSMatcher;
77 using usage_stats::UsageStats;
78
79 // Finds suggestion candidates from the most recent 3000 history in LRU.
80 // We don't check all history, since suggestion is called every key event
81 const size_t kMaxSuggestionTrial = 3000;
82
83 // Finds suffix matches of history_segments from the most recent 500 histories
84 // in LRU.
85 const size_t kMaxPrevValueTrial = 500;
86
87 // Cache size
88 // Typically memory/storage footprint becomes kLRUCacheSize * 70 bytes.
89 #ifdef OS_ANDROID
90 const size_t kLRUCacheSize = 2000;
91 #else // OS_ANDROID
92 const size_t kLRUCacheSize = 10000;
93 #endif // OS_ANDROID
94
95 // Don't save key/value that are
96 // longer than kMaxCandidateSize to avoid memory explosion
97 const size_t kMaxStringLength = 256;
98
99 // Maximum size of next_entries
100 const size_t kMaxNextEntriesSize = 4;
101
102 // Revert id for user_history_predictor
103 const uint16 kRevertId = 1;
104
105 // Default object pool size for EntryPriorityQueue
106 const size_t kEntryPoolSize = 16;
107
108 // File name for the history
109 #ifdef OS_WIN
110 const char kFileName[] = "user://history.db";
111 #else
112 const char kFileName[] = "user://.history.db";
113 #endif
114
115 // Uses '\t' as a key/value delimiter
116 const char kDelimiter[] = "\t";
117 const char kEmojiDescription[] = "絵文字";
118
119 // TODO(peria, hidehiko): Unify this checker and IsEmojiCandidate in
120 // EmojiRewriter. If you make similar functions before the merging in
121 // case, put a similar note to avoid twisted dependency.
IsEmojiEntry(const UserHistoryPredictor::Entry & entry)122 bool IsEmojiEntry(const UserHistoryPredictor::Entry &entry) {
123 return (entry.has_description() &&
124 entry.description().find(kEmojiDescription) != string::npos);
125 }
126
IsPunctuation(const string & value)127 bool IsPunctuation(const string &value) {
128 return (value == "。" || value == "." ||
129 value == "、" || value == "," ||
130 value == "?" || value == "?" ||
131 value == "!" || value == "!" ||
132 value == "," || value == ".");
133 }
134
IsSentenceLikeCandidate(const Segment::Candidate & candidate)135 bool IsSentenceLikeCandidate(const Segment::Candidate &candidate) {
136 // A sentence should have a long reading. Length check is done using key to
137 // absorb length difference in value variation, e.g.,
138 // "〜ください" and "〜下さい".
139 if (candidate.value.empty() || Util::CharsLen(candidate.key) < 8) {
140 return false;
141 }
142 // Our primary target sentence ends with Hiragana, e.g., "〜ます".
143 const ConstChar32ReverseIterator iter(candidate.value);
144 bool ret = Util::GetScriptType(iter.Get()) == Util::HIRAGANA;
145 return ret;
146 }
147
148 // Returns romanaized string.
ToRoman(const string & str)149 string ToRoman(const string &str) {
150 string result;
151 Util::HiraganaToRomanji(str, &result);
152 return result;
153 }
154
155 // Returns true if value looks like a content word.
156 // Currently, just checks the script type.
IsContentWord(const string & value)157 bool IsContentWord(const string &value) {
158 return Util::CharsLen(value) > 1 ||
159 Util::GetScriptType(value) != Util::UNKNOWN_SCRIPT;
160 }
161
162 // Returns candidate description.
163 // If candidate is spelling correction, typing correction
164 // or auto partial suggestion,
165 // don't use the description, since "did you mean" like description must be
166 // provided at an appropriate timing and context.
GetDescription(const Segment::Candidate & candidate)167 string GetDescription(const Segment::Candidate &candidate) {
168 if (candidate.attributes &
169 (Segment::Candidate::SPELLING_CORRECTION |
170 Segment::Candidate::TYPING_CORRECTION |
171 Segment::Candidate::AUTO_PARTIAL_SUGGESTION)) {
172 return "";
173 }
174 return candidate.description;
175 }
176
177 } // namespace
178
179 // Returns true if the input first candidate seems to be a privacy sensitive
180 // such like password.
IsPrivacySensitive(const Segments * segments) const181 bool UserHistoryPredictor::IsPrivacySensitive(const Segments *segments) const {
182 const bool kNonSensitive = false;
183 const bool kSensitive = true;
184
185 // Skips privacy sensitive check if |segments| consists of multiple conversion
186 // segment. That is, segments like "パスワードは|x7LAGhaR" where '|'
187 // represents segment boundary is not considered to be privacy sensitive.
188 // TODO(team): Revisit this rule if necessary.
189 if (segments->conversion_segments_size() != 1) {
190 return kNonSensitive;
191 }
192
193 // Hereafter, we must have only one conversion segment.
194 const Segment &conversion_segment = segments->conversion_segment(0);
195 const string &segment_key = conversion_segment.key();
196
197 // The top candidate, which is about to be committed.
198 const Segment::Candidate &candidate = conversion_segment.candidate(0);
199 const string &candidate_value = candidate.value;
200
201 // If |candidate_value| contains any non-ASCII character, do not treat
202 // it as privacy sensitive information.
203 // TODO(team): Improve the following rule. For example,
204 // "0000-0000-0000-0000" is not treated as privacy sensitive
205 // because of this rule. When a user commits his password in
206 // full-width form by mistake, like "x7LAGhaR", it is not
207 // treated as privacy sensitive too.
208 if (Util::GetCharacterSet(candidate_value) != Util::ASCII) {
209 return kNonSensitive;
210 }
211
212 // Hereafter, |candidate_value| consists of ASCII characters only.
213
214 // Note: if the key looks like hiragana, the candidate might be Katakana to
215 // English transliteration. Don't suppress transliterated candidates.
216 // http://b/4394325
217
218 // If the key consists of number characters only, treat it as privacy
219 // sensitive.
220 if (Util::GetScriptType(segment_key) == Util::NUMBER) {
221 return kSensitive;
222 }
223
224 // If the key contains any alphabetical character but it is in our dictionary,
225 // it can be treated as privacy nonsensitive word; cf. b/5995529. Besides,
226 // short words would be considered as privacy nonsensitive word as well.
227 if (segment_key.size() <= 3) {
228 return kNonSensitive;
229 }
230
231 // Dictionary-based sensitivity test. If the word user typed is in dictionary,
232 // treat it as privacy insensitive. For English (ASCII) words,
233 // dictionary-based test is extended to the following forms:
234 // 1) All lower case (e.g., hello)
235 // 2) All upper case (e.g., HELLO)
236 // 3) Capitalized (e.g., Hello)
237 // 4) As-is (e.g., HeLlO)
238 // Since English words are stored in lower case, in case of upper case and
239 // capitalized keys, we convert it to lower case in advance.
240 if (Util::IsUpperOrCapitalizedAscii(candidate_value)) {
241 // Look up for keys that are all in upper case or capitalized ASCII.
242 string lower_case_value(candidate_value);
243 Util::LowerString(&lower_case_value);
244 if (dictionary_->HasValue(lower_case_value)) {
245 return kNonSensitive;
246 }
247 } else {
248 // Looks up for the original key, including those that are all in lower case
249 // ASCII.
250 if (dictionary_->HasValue(candidate_value)) {
251 return kNonSensitive;
252 }
253 }
254 // If the key contains any alphabetical character and is not in our
255 // dictionary, treat it as privacy sensitive. There also remains some cases to
256 // be considered. Compare following two cases.
257 // Case A:
258 // 1. Type "ywwz1sxm" in Roman-input style then get "yっwz1sxm".
259 // 2. Hit F10 key to convert it to "ywwz1sxm" by
260 // ConvertToHalfAlphanumeric command.
261 // 3. Commit it.
262 // In this case, |segment_key| is "yっwz1sxm" and actually contains
263 // alphabetical characters. So kSensitive will be returned.
264 // So far so good.
265 // Case B:
266 // 1. type "ia1bo3xu" in Roman-input style then get "いあ1ぼ3ぅ".
267 // 2. hit F10 key to convert it to "ia1bo3xu" by
268 // ConvertToHalfAlphanumeric command.
269 // 3. commit it.
270 // In this case, |segment_key| is "ia1bo3xu" and contains no
271 // alphabetical character. So the following check does nothing.
272 // TODO(team): Improve the following rule so that our user experience
273 // can be consistent between case A and B.
274 if (Util::ContainsScriptType(segment_key, Util::ALPHABET)) {
275 return kSensitive;
276 }
277
278 return kNonSensitive;
279 }
280
UserHistoryStorage(const string & filename)281 UserHistoryStorage::UserHistoryStorage(const string &filename)
282 : storage_(new storage::EncryptedStringStorage(filename)) {
283 }
284
~UserHistoryStorage()285 UserHistoryStorage::~UserHistoryStorage() {}
286
Load()287 bool UserHistoryStorage::Load() {
288 string input;
289 if (!storage_->Load(&input)) {
290 LOG(ERROR) << "Can't load user history data.";
291 return false;
292 }
293
294 if (!user_history_base.ParseFromString(input)) {
295 LOG(ERROR) << "ParseFromString failed. message looks broken";
296 return false;
297 }
298
299 VLOG(1) << "Loaded user histroy, size=" << user_history_base.entries_size();
300 return true;
301 }
302
Save() const303 bool UserHistoryStorage::Save() const {
304 if (user_history_base.entries_size() == 0) {
305 LOG(WARNING) << "etries size is 0. Not saved";
306 return false;
307 }
308
309 string output;
310 if (!user_history_base.AppendToString(&output)) {
311 LOG(ERROR) << "AppendToString failed";
312 return false;
313 }
314
315 if (!storage_->Save(output)) {
316 LOG(ERROR) << "Can't save user history data.";
317 return false;
318 }
319
320 return true;
321 }
322
EntryPriorityQueue()323 UserHistoryPredictor::EntryPriorityQueue::EntryPriorityQueue()
324 : pool_(kEntryPoolSize) {}
325
~EntryPriorityQueue()326 UserHistoryPredictor::EntryPriorityQueue::~EntryPriorityQueue() {}
327
Push(Entry * entry)328 bool UserHistoryPredictor::EntryPriorityQueue::Push(Entry *entry) {
329 DCHECK(entry);
330 if (!seen_.insert(Hash::Fingerprint32(entry->value())).second) {
331 VLOG(2) << "found dups";
332 return false;
333 }
334 const uint32 score = UserHistoryPredictor::GetScore(*entry);
335 agenda_.push(std::make_pair(score, entry));
336 return true;
337 }
338
339 UserHistoryPredictor::Entry *
Pop()340 UserHistoryPredictor::EntryPriorityQueue::Pop() {
341 if (agenda_.empty()) {
342 return nullptr;
343 }
344 const QueueElement &element = agenda_.top();
345 Entry *result = element.second;
346 DCHECK(result);
347 agenda_.pop();
348 return result;
349 }
350
351 UserHistoryPredictor::Entry *
NewEntry()352 UserHistoryPredictor::EntryPriorityQueue::NewEntry() {
353 return pool_.Alloc();
354 }
355
356 class UserHistoryPredictorSyncer : public Thread {
357 public:
358 enum RequestType {
359 LOAD,
360 SAVE
361 };
362
UserHistoryPredictorSyncer(UserHistoryPredictor * predictor,RequestType type)363 UserHistoryPredictorSyncer(UserHistoryPredictor *predictor,
364 RequestType type)
365 : predictor_(predictor), type_(type) {
366 DCHECK(predictor_);
367 }
368
Run()369 virtual void Run() {
370 switch (type_) {
371 case LOAD:
372 VLOG(1) << "Executing Reload method";
373 predictor_->Load();
374 break;
375 case SAVE:
376 VLOG(1) << "Executing Sync method";
377 predictor_->Save();
378 break;
379 default:
380 LOG(ERROR) << "Unknown request: " << static_cast<int>(type_);
381 }
382 }
383
~UserHistoryPredictorSyncer()384 virtual ~UserHistoryPredictorSyncer() {
385 Join();
386 }
387
388 UserHistoryPredictor *predictor_;
389 RequestType type_;
390 };
391
UserHistoryPredictor(const DictionaryInterface * dictionary,const POSMatcher * pos_matcher,const SuppressionDictionary * suppression_dictionary,bool enable_content_word_learning)392 UserHistoryPredictor::UserHistoryPredictor(
393 const DictionaryInterface *dictionary,
394 const POSMatcher *pos_matcher,
395 const SuppressionDictionary *suppression_dictionary,
396 bool enable_content_word_learning)
397 : dictionary_(dictionary),
398 pos_matcher_(pos_matcher),
399 suppression_dictionary_(suppression_dictionary),
400 predictor_name_("UserHistoryPredictor"),
401 content_word_learning_enabled_(enable_content_word_learning),
402 updated_(false),
403 dic_(new DicCache(UserHistoryPredictor::cache_size())) {
404 AsyncLoad(); // non-blocking
405 // Load() blocking version can be used if any
406 }
407
~UserHistoryPredictor()408 UserHistoryPredictor::~UserHistoryPredictor() {
409 // In destructor, must call blocking version
410 WaitForSyncer();
411 Save(); // blocking
412 }
413
GetUserHistoryFileName()414 string UserHistoryPredictor::GetUserHistoryFileName() {
415 return ConfigFileStream::GetFileName(kFileName);
416 }
417
418 // Returns revert id
419 // static
revert_id()420 uint16 UserHistoryPredictor::revert_id() {
421 return kRevertId;
422 }
423
WaitForSyncer()424 void UserHistoryPredictor::WaitForSyncer() {
425 if (syncer_.get() != nullptr) {
426 syncer_->Join();
427 syncer_.reset();
428 }
429 }
430
Wait()431 bool UserHistoryPredictor::Wait() {
432 WaitForSyncer();
433 return true;
434 }
435
CheckSyncerAndDelete() const436 bool UserHistoryPredictor::CheckSyncerAndDelete() const {
437 if (syncer_.get() != nullptr) {
438 if (syncer_->IsRunning()) {
439 return false;
440 } else {
441 syncer_.reset();
442 }
443 }
444
445 return true;
446 }
447
Sync()448 bool UserHistoryPredictor::Sync() {
449 return AsyncSave();
450 // return Save(); blocking version
451 }
452
Reload()453 bool UserHistoryPredictor::Reload() {
454 WaitForSyncer();
455 return AsyncLoad();
456 }
457
AsyncLoad()458 bool UserHistoryPredictor::AsyncLoad() {
459 if (!CheckSyncerAndDelete()) { // now loading/saving
460 return true;
461 }
462
463 syncer_.reset(new UserHistoryPredictorSyncer(
464 this,
465 UserHistoryPredictorSyncer::LOAD));
466 syncer_->Start("UserHistoryPredictor:Load");
467
468 return true;
469 }
470
AsyncSave()471 bool UserHistoryPredictor::AsyncSave() {
472 if (!updated_) {
473 return true;
474 }
475
476 if (!CheckSyncerAndDelete()) { // now loading/saving
477 return true;
478 }
479
480 syncer_.reset(new UserHistoryPredictorSyncer(
481 this,
482 UserHistoryPredictorSyncer::SAVE));
483 syncer_->Start("UserHistoryPredictor:Save");
484
485 return true;
486 }
487
Load()488 bool UserHistoryPredictor::Load() {
489 const string filename = GetUserHistoryFileName();
490
491 UserHistoryStorage history(filename);
492 if (!history.Load()) {
493 LOG(ERROR) << "UserHistoryStorage::Load() failed";
494 return false;
495 }
496
497 for (size_t i = 0; i < history.user_history_base.entries_size(); ++i) {
498 dic_->Insert(EntryFingerprint(history.user_history_base.entries(i)),
499 history.user_history_base.entries(i));
500 }
501
502 VLOG(1) << "Loaded user histroy, size=" << history.user_history_base.entries_size();
503
504 return true;
505 }
506
Save()507 bool UserHistoryPredictor::Save() {
508 if (!updated_) {
509 return true;
510 }
511
512 // Do not check incognito_mode or use_history_suggest in Config here.
513 // The input data should not have been inserted when those flags are on.
514
515 const DicElement *tail = dic_->Tail();
516 if (tail == nullptr) {
517 return true;
518 }
519
520 const string filename = GetUserHistoryFileName();
521
522 UserHistoryStorage history(filename);
523 for (const DicElement *elm = tail; elm != nullptr; elm = elm->prev) {
524 history.user_history_base.add_entries()->CopyFrom(elm->value);
525 }
526
527 // Updates usage stats here.
528 UsageStats::SetInteger(
529 "UserHistoryPredictorEntrySize",
530 static_cast<int>(history.user_history_base.entries_size()));
531
532 if (!history.Save()) {
533 LOG(ERROR) << "UserHistoryStorage::Save() failed";
534 return false;
535 }
536
537 updated_ = false;
538
539 return true;
540 }
541
ClearAllHistory()542 bool UserHistoryPredictor::ClearAllHistory() {
543 // Waits until syncer finishes
544 WaitForSyncer();
545
546 VLOG(1) << "Clearing user prediction";
547 // Renews DicCache as LRUCache tries to reuse the internal value by
548 // using FreeList
549 dic_.reset(new DicCache(UserHistoryPredictor::cache_size()));
550
551 // insert a dummy event entry.
552 InsertEvent(Entry::CLEAN_ALL_EVENT);
553
554 updated_ = true;
555
556 Sync();
557
558 return true;
559 }
560
ClearUnusedHistory()561 bool UserHistoryPredictor::ClearUnusedHistory() {
562 // Waits until syncer finishes
563 WaitForSyncer();
564
565 VLOG(1) << "Clearing unused prediction";
566 const DicElement *head = dic_->Head();
567 if (head == nullptr) {
568 VLOG(2) << "dic head is nullptr";
569 return false;
570 }
571
572 std::vector<uint32> keys;
573 for (const DicElement *elm = head; elm != nullptr; elm = elm->next) {
574 VLOG(3) << elm->key << " " << elm->value.suggestion_freq();
575 if (elm->value.suggestion_freq() == 0) {
576 keys.push_back(elm->key);
577 }
578 }
579
580 for (size_t i = 0; i < keys.size(); ++i) {
581 VLOG(2) << "Removing: " << keys[i];
582 if (!dic_->Erase(keys[i])) {
583 LOG(ERROR) << "cannot erase " << keys[i];
584 }
585 }
586
587 // Inserts a dummy event entry.
588 InsertEvent(Entry::CLEAN_UNUSED_EVENT);
589
590 updated_ = true;
591
592 Sync();
593
594 VLOG(1) << keys.size() << " removed";
595
596 return true;
597 }
598
599 // Erases all the next_entries whose entry_fp field equals |fp|.
EraseNextEntries(uint32 fp,Entry * entry)600 void UserHistoryPredictor::EraseNextEntries(uint32 fp, Entry *entry) {
601 const size_t orig_size = entry->next_entries_size();
602 size_t new_size = orig_size;
603 for (size_t pos = 0; pos < new_size; ) {
604 if (entry->next_entries(pos).entry_fp() == fp) {
605 entry->mutable_next_entries()->SwapElements(pos, --new_size);
606 } else {
607 ++pos;
608 }
609 }
610 for (size_t i = 0; i < orig_size - new_size; ++i) {
611 entry->mutable_next_entries()->RemoveLast();
612 }
613 }
614
615 // Recursively finds the Ngram history that produces |target_key| and
616 // |target_value| and removes the last link. For example, if there exists a
617 // chain like
618 // ("aaa", "AAA") -- ("bbb", "BBB") -- ("ccc", "CCC"),
619 // and if target_key == "aaabbbccc" and target_value == "AAABBBCCC", the link
620 // from ("bbb", "BBB") to ("ccc", "CCC") is removed. If a link was removed, this
621 // method returns DONE. If no history entries can produce the target key
622 // value, then NOT_FOUND is returned. TAIL is returned only when the
623 // tail was found, e.g., in the above example, when the method finds the tail
624 // node ("ccc", "CCC").
625 UserHistoryPredictor::RemoveNgramChainResult
RemoveNgramChain(const string & target_key,const string & target_value,Entry * entry,std::vector<StringPiece> * key_ngrams,size_t key_ngrams_len,std::vector<StringPiece> * value_ngrams,size_t value_ngrams_len)626 UserHistoryPredictor::RemoveNgramChain(const string &target_key,
627 const string &target_value,
628 Entry *entry,
629 std::vector<StringPiece> *key_ngrams,
630 size_t key_ngrams_len,
631 std::vector<StringPiece> *value_ngrams,
632 size_t value_ngrams_len) {
633 DCHECK(entry);
634 DCHECK(key_ngrams);
635 DCHECK(value_ngrams);
636
637 // Updates the lengths with the current entry node.
638 key_ngrams_len += entry->key().size();
639 value_ngrams_len += entry->value().size();
640
641 // This is the case where ngram key and value are shorter than the target key
642 // and value, respectively. In this case, we need to find further entries to
643 // concatenate in order to make |target_key| and |target_value|.
644 if (key_ngrams_len < target_key.size() &&
645 value_ngrams_len < target_value.size()) {
646 key_ngrams->push_back(entry->key());
647 value_ngrams->push_back(entry->value());
648 for (size_t i = 0; i < entry->next_entries().size(); ++i) {
649 const uint32 fp = entry->next_entries(i).entry_fp();
650 Entry *e = dic_->MutableLookupWithoutInsert(fp);
651 if (e == nullptr) {
652 continue;
653 }
654 const RemoveNgramChainResult r = RemoveNgramChain(target_key,
655 target_value,
656 e,
657 key_ngrams,
658 key_ngrams_len,
659 value_ngrams,
660 value_ngrams_len);
661 switch (r) {
662 case DONE:
663 return DONE;
664 case TAIL:
665 // |entry| is the second-to-the-last node. So cut the link to the
666 // child entry.
667 EraseNextEntries(fp, entry);
668 return DONE;
669 default:
670 break;
671 }
672 }
673 // Recovers the state.
674 key_ngrams->pop_back();
675 value_ngrams->pop_back();
676 return NOT_FOUND;
677 }
678
679 // This is the case where the current ngram key and value have the same
680 // lengths as those of |target_key| and |target_value|, respectively.
681 if (key_ngrams_len == target_key.size() &&
682 value_ngrams_len == target_value.size()) {
683 key_ngrams->push_back(entry->key());
684 value_ngrams->push_back(entry->value());
685 string ngram_key, ngram_value;
686 Util::JoinStringPieces(*key_ngrams, "", &ngram_key);
687 Util::JoinStringPieces(*value_ngrams, "", &ngram_value);
688 if (ngram_key == target_key && ngram_value == target_value) {
689 // |entry| is the last node. So return TAIL to tell the caller so
690 // that it can remove the link to this last node.
691 return TAIL;
692 }
693 key_ngrams->pop_back();
694 value_ngrams->pop_back();
695 return NOT_FOUND;
696 }
697
698 return NOT_FOUND;
699 }
700
ClearHistoryEntry(const string & key,const string & value)701 bool UserHistoryPredictor::ClearHistoryEntry(const string &key,
702 const string &value) {
703 bool deleted = false;
704 {
705 // Finds the history entry that has the exactly same key and value and has
706 // not been removed yet. If exists, remove it.
707 Entry *entry = dic_->MutableLookupWithoutInsert(Fingerprint(key, value));
708 if (entry != nullptr && !entry->removed()) {
709 entry->set_suggestion_freq(0);
710 entry->set_conversion_freq(0);
711 entry->set_removed(true);
712 // We don't clear entry->next_entries() so that we can generate prediction
713 // by chaining.
714 deleted = true;
715 }
716 }
717 {
718 // Finds a chain of history entries that produces key and value. If exists,
719 // remove the link so that N-gram history prediction never generates this
720 // key value pair..
721 for (DicElement *elm = dic_->MutableHead(); elm != nullptr;
722 elm = elm->next) {
723 Entry *entry = &elm->value;
724 if (!Util::StartsWith(key, entry->key()) ||
725 !Util::StartsWith(value, entry->value())) {
726 continue;
727 }
728 std::vector<StringPiece> key_ngrams, value_ngrams;
729 if (RemoveNgramChain(
730 key, value, entry, &key_ngrams, 0, &value_ngrams, 0) == DONE) {
731 deleted = true;
732 }
733 }
734 }
735 if (deleted) {
736 updated_ = true;
737 }
738 return deleted;
739 }
740
741 // Returns true if prev_entry has a next_fp link to entry
742 // static
HasBigramEntry(const Entry & entry,const Entry & prev_entry)743 bool UserHistoryPredictor::HasBigramEntry(
744 const Entry &entry, const Entry &prev_entry) {
745 const uint32 fp = EntryFingerprint(entry);
746 for (int i = 0; i < prev_entry.next_entries_size(); ++i) {
747 if (fp == prev_entry.next_entries(i).entry_fp()) {
748 return true;
749 }
750 }
751 return false;
752 }
753
754 // static
GetRomanMisspelledKey(const ConversionRequest & request,const Segments & segments)755 string UserHistoryPredictor::GetRomanMisspelledKey(
756 const ConversionRequest &request,
757 const Segments &segments) {
758 if (request.config().preedit_method() != config::Config::ROMAN) {
759 return "";
760 }
761
762 const string &preedit = segments.conversion_segment(0).key();
763 // TODO(team): Use composer if it is available.
764 // segments.composer()->GetQueryForConversion(&preedit);
765 // Since ConverterInterface doesn't have StartPredictionWithComposer,
766 // we cannot use composer currently.
767 if (!preedit.empty() && MaybeRomanMisspelledKey(preedit)) {
768 return ToRoman(preedit);
769 }
770
771 return "";
772 }
773
774 // static
MaybeRomanMisspelledKey(const string & key)775 bool UserHistoryPredictor::MaybeRomanMisspelledKey(const string &key) {
776 int num_alpha = 0;
777 int num_hiragana = 0;
778 int num_unknown = 0;
779 for (ConstChar32Iterator iter(key); !iter.Done(); iter.Next()) {
780 const char32 w = iter.Get();
781 const Util::ScriptType type = Util::GetScriptType(w);
782 if (type == Util::HIRAGANA || w == 0x30FC) { // "ー".
783 ++num_hiragana;
784 continue;
785 }
786 if (type == Util::UNKNOWN_SCRIPT && num_unknown <= 0) {
787 ++num_unknown;
788 continue;
789 }
790 if (type == Util::ALPHABET && num_alpha <= 0) {
791 ++num_alpha;
792 continue;
793 }
794 return false;
795 }
796
797 return (num_hiragana > 0 &&
798 ((num_alpha == 1 && num_unknown == 0) ||
799 (num_alpha == 0 && num_unknown == 1)));
800 }
801
802 // static
RomanFuzzyPrefixMatch(const string & str,const string & prefix)803 bool UserHistoryPredictor::RomanFuzzyPrefixMatch(
804 const string &str, const string &prefix) {
805 if (prefix.empty() || prefix.size() > str.size()) {
806 return false;
807 }
808
809 // 1. Allows one character delete in Romanji sequence.
810 // 2. Allows one swap in Romanji sequence.
811 for (size_t i = 0; i < prefix.size(); ++i) {
812 if (prefix[i] == str[i]) {
813 continue;
814 }
815
816 if (str[i] == '-') {
817 // '-' voice sound mark can be matched to any
818 // non-alphanum character.
819 if (!isalnum(prefix[i])) {
820 string replaced_prefix = prefix;
821 replaced_prefix[i] = str[i];
822 if (Util::StartsWith(str, replaced_prefix)) {
823 return true;
824 }
825 }
826 } else {
827 // deletion.
828 string inserted_prefix = prefix;
829 inserted_prefix.insert(i, 1, str[i]);
830 if (Util::StartsWith(str, inserted_prefix)) {
831 return true;
832 }
833
834 // swap.
835 if (i + 1 < prefix.size()) {
836 string swapped_prefix = prefix;
837 using std::swap;
838 swap(swapped_prefix[i], swapped_prefix[i + 1]);
839 if (Util::StartsWith(str, swapped_prefix)) {
840 return true;
841 }
842 }
843 }
844
845 return false;
846 }
847
848 // |prefix| is an exact suffix of |str|.
849 return false;
850 }
851
RomanFuzzyLookupEntry(const string & roman_input_key,const Entry * entry,EntryPriorityQueue * results) const852 bool UserHistoryPredictor::RomanFuzzyLookupEntry(
853 const string &roman_input_key, const Entry *entry,
854 EntryPriorityQueue *results) const {
855 if (roman_input_key.empty()) {
856 return false;
857 }
858
859 DCHECK(entry);
860 DCHECK(results);
861
862 if (!RomanFuzzyPrefixMatch(ToRoman(entry->key()),
863 roman_input_key)) {
864 return false;
865 }
866
867 Entry *result = results->NewEntry();
868 DCHECK(result);
869 result->Clear();
870 result->CopyFrom(*entry);
871 result->set_spelling_correction(true);
872 results->Push(result);
873
874 return true;
875 }
876
AddEntry(const Entry & entry,EntryPriorityQueue * results) const877 UserHistoryPredictor::Entry *UserHistoryPredictor::AddEntry(
878 const Entry &entry, EntryPriorityQueue *results) const {
879 // We add an entry even if it was marked as removed so that it can be used to
880 // generate prediction by entry chaining. The deleted entry itself is never
881 // shown in the final prediction result as it is filtered finally.
882 Entry *new_entry = results->NewEntry();
883 new_entry->Clear();
884 new_entry->CopyFrom(entry);
885 return new_entry;
886 }
887
AddEntryWithNewKeyValue(const string & key,const string & value,const Entry & entry,EntryPriorityQueue * results) const888 UserHistoryPredictor::Entry *UserHistoryPredictor::AddEntryWithNewKeyValue(
889 const string &key, const string &value, const Entry &entry,
890 EntryPriorityQueue *results) const {
891 // We add an entry even if it was marked as removed so that it can be used to
892 // generate prediction by entry chaining. The deleted entry itself is never
893 // shown in the final prediction result as it is filtered finally.
894 Entry *new_entry = results->NewEntry();
895 new_entry->Clear();
896 new_entry->CopyFrom(entry);
897 new_entry->set_key(key);
898 new_entry->set_value(value);
899
900 // Sets removed field true if the new key and value were removed.
901 const Entry *e = dic_->LookupWithoutInsert(Fingerprint(key, value));
902 new_entry->set_removed(e != nullptr && e->removed());
903
904 return new_entry;
905 }
906
GetKeyValueForExactAndRightPrefixMatch(const string & input_key,const Entry * entry,const Entry ** result_last_entry,uint64 * left_last_access_time,uint64 * left_most_last_access_time,string * result_key,string * result_value) const907 bool UserHistoryPredictor::GetKeyValueForExactAndRightPrefixMatch(
908 const string &input_key,
909 const Entry *entry, const Entry **result_last_entry,
910 uint64 *left_last_access_time,
911 uint64 *left_most_last_access_time,
912 string *result_key, string *result_value) const {
913 string key = entry->key();
914 string value = entry->value();
915 const Entry *current_entry = entry;
916 mozc_hash_set<uint32> seen;
917 seen.insert(EntryFingerprint(*current_entry));
918 // Until target entry gets longer than input_key.
919 while (key.size() <= input_key.size()) {
920 const Entry *latest_entry = nullptr;
921 const Entry *left_same_timestamp_entry = nullptr;
922 const Entry *left_most_same_timestamp_entry = nullptr;
923 for (size_t i = 0; i < current_entry->next_entries_size(); ++i) {
924 const Entry *tmp_next_entry = dic_->LookupWithoutInsert(
925 current_entry->next_entries(i).entry_fp());
926 if (tmp_next_entry == nullptr || tmp_next_entry->key().empty()) {
927 continue;
928 }
929 const MatchType mtype_joined = GetMatchType(key + tmp_next_entry->key(),
930 input_key);
931 if (mtype_joined == NO_MATCH || mtype_joined == LEFT_EMPTY_MATCH) {
932 continue;
933 }
934 if (latest_entry == nullptr ||
935 latest_entry->last_access_time() <
936 tmp_next_entry->last_access_time()) {
937 latest_entry = tmp_next_entry;
938 }
939 if (tmp_next_entry->last_access_time() == *left_last_access_time) {
940 left_same_timestamp_entry = tmp_next_entry;
941 }
942 if (tmp_next_entry->last_access_time() == *left_most_last_access_time) {
943 left_most_same_timestamp_entry = tmp_next_entry;
944 }
945 }
946
947 // Prefer bigrams which are generated at the same time.
948 // When last_access_time are the same, these two bigrams were
949 // input together.
950 // The preferences:
951 // (1). The current entry's time stamp is equal to that of
952 // left most content word
953 // (2). The current entry's time stamp is equal to that of
954 // left closest content word
955 // (3). The current entry is the latest
956 const Entry *next_entry = left_most_same_timestamp_entry;
957 if (next_entry == nullptr) {
958 next_entry = left_same_timestamp_entry;
959 }
960 if (next_entry == nullptr) {
961 next_entry = latest_entry;
962 }
963
964 if (next_entry == nullptr || next_entry->key().empty()) {
965 break;
966 }
967
968 // If duplicate entry is found, don't expand more.
969 // This is because an entry only has one timestamp.
970 // we cannot trust the timestamp if there are duplicate values
971 // in one input.
972 if (!seen.insert(EntryFingerprint(*next_entry)).second) {
973 break;
974 }
975
976 key += next_entry->key();
977 value += next_entry->value();
978 current_entry = next_entry;
979 *result_last_entry = next_entry;
980
981 // Don't update left_access_time if the current entry is
982 // not a content word.
983 // The time-stamp of non-content-word will be updated frequently.
984 // The time-stamp of the previous candidate is more trustful.
985 // It partially fixes the bug http://b/2843371.
986 const bool is_content_word = IsContentWord(current_entry->value());
987
988 if (is_content_word) {
989 *left_last_access_time = current_entry->last_access_time();
990 }
991
992 // If left_most entry is a functional word (symbols/punctuations),
993 // we don't take it as a canonical candidate.
994 if (*left_most_last_access_time == 0 && is_content_word) {
995 *left_most_last_access_time = current_entry->last_access_time();
996 }
997 }
998
999 if (key.size() < input_key.size()) {
1000 VLOG(3) << "Cannot find prefix match even after chain rules";
1001 return false;
1002 }
1003
1004 *result_key = key;
1005 *result_value = value;
1006 return true;
1007 }
1008
LookupEntry(RequestType request_type,const string & input_key,const string & key_base,const Trie<string> * key_expanded,const Entry * entry,const Entry * prev_entry,EntryPriorityQueue * results) const1009 bool UserHistoryPredictor::LookupEntry(
1010 RequestType request_type,
1011 const string &input_key,
1012 const string &key_base,
1013 const Trie<string> *key_expanded,
1014 const Entry *entry,
1015 const Entry *prev_entry,
1016 EntryPriorityQueue *results) const {
1017 CHECK(entry);
1018 CHECK(results);
1019
1020 Entry *result = nullptr;
1021
1022 const Entry *last_entry = nullptr;
1023
1024 // last_access_time of the left-closest content word.
1025 uint64 left_last_access_time = 0;
1026
1027 // last_access_time of the left-most content word.
1028 uint64 left_most_last_access_time = 0;
1029
1030 // Example: [a|B|c|D]
1031 // a,c: functional word
1032 // B,D: content word
1033 // left_last_access_time: timestamp of D
1034 // left_most_last_access_time: timestamp of B
1035
1036 // |input_key| is a query user is now typing.
1037 // |entry->key()| is a target value saved in the database.
1038 // const string input_key = key_base;
1039
1040 const MatchType mtype = GetMatchTypeFromInput(
1041 input_key, key_base, key_expanded, entry->key());
1042 if (mtype == NO_MATCH) {
1043 return false;
1044 } else if (mtype == LEFT_EMPTY_MATCH) { // zero-query-suggestion
1045 // if |input_key| is empty, the |prev_entry| and |entry| must
1046 // have bigram relation.
1047 if (prev_entry != nullptr && HasBigramEntry(*entry, *prev_entry)) {
1048 result = AddEntry(*entry, results);
1049 if (result) {
1050 last_entry = entry;
1051 left_last_access_time = entry->last_access_time();
1052 left_most_last_access_time = IsContentWord(entry->value()) ?
1053 left_last_access_time : 0;
1054 }
1055 } else {
1056 return false;
1057 }
1058 } else if (mtype == LEFT_PREFIX_MATCH) {
1059 // |input_key| is shorter than |entry->key()|
1060 // This scenario is a simple prefix match.
1061 // e.g., |input_key|="foo", |entry->key()|="foobar"
1062 result = AddEntry(*entry, results);
1063 if (result) {
1064 last_entry = entry;
1065 left_last_access_time = entry->last_access_time();
1066 left_most_last_access_time = IsContentWord(entry->value()) ?
1067 left_last_access_time : 0;
1068 }
1069 } else if (mtype == RIGHT_PREFIX_MATCH || mtype == EXACT_MATCH) {
1070 // |input_key| is longer than or the same as |entry->key()|.
1071 // In this case, recursively traverse "next_entries" until
1072 // target entry gets longer than input_key.
1073 // e.g., |input_key|="foobar", |entry->key()|="foo"
1074 if (request_type == ZERO_QUERY_SUGGESTION && mtype == EXACT_MATCH) {
1075 // For mobile, we don't generate joined result.
1076 result = AddEntry(*entry, results);
1077 if (result) {
1078 last_entry = entry;
1079 left_last_access_time = entry->last_access_time();
1080 left_most_last_access_time = IsContentWord(entry->value()) ?
1081 left_last_access_time : 0;
1082 }
1083 } else {
1084 string key, value;
1085 left_last_access_time = entry->last_access_time();
1086 left_most_last_access_time = IsContentWord(entry->value()) ?
1087 left_last_access_time : 0;
1088 if (!GetKeyValueForExactAndRightPrefixMatch(
1089 input_key, entry, &last_entry,
1090 &left_last_access_time, &left_most_last_access_time,
1091 &key, &value)) {
1092 return false;
1093 }
1094 result = AddEntryWithNewKeyValue(key, value, *entry, results);
1095 }
1096 } else {
1097 LOG(ERROR) << "Unknown match mode: " << mtype;
1098 return false;
1099 }
1100
1101 if (result == nullptr) {
1102 return false;
1103 }
1104
1105 // If prev entry is not nullptr, check whether there is a bigram
1106 // from |prev_entry| to |entry|.
1107 result->set_bigram_boost(false);
1108
1109 if (prev_entry != nullptr && HasBigramEntry(*entry, *prev_entry)) {
1110 // Sets bigram_boost flag so that this entry is boosted
1111 // against LRU policy.
1112 result->set_bigram_boost(true);
1113 }
1114
1115 if (!result->removed()) {
1116 results->Push(result);
1117 }
1118
1119 if (request_type == ZERO_QUERY_SUGGESTION) {
1120 // For mobile, we don't generate joined result.
1121 return true;
1122 }
1123
1124 // Generates joined result using |last_entry|.
1125 if (last_entry != nullptr &&
1126 Util::CharsLen(result->key()) >= 1 &&
1127 2 * Util::CharsLen(input_key) >= Util::CharsLen(result->key())) {
1128 const Entry *latest_entry = nullptr;
1129 const Entry *left_same_timestamp_entry = nullptr;
1130 const Entry *left_most_same_timestamp_entry = nullptr;
1131 for (int i = 0; i < last_entry->next_entries_size(); ++i) {
1132 const Entry *tmp_entry = dic_->LookupWithoutInsert(
1133 last_entry->next_entries(i).entry_fp());
1134 if (tmp_entry == nullptr || tmp_entry->key().empty()) {
1135 continue;
1136 }
1137 if (latest_entry == nullptr ||
1138 latest_entry->last_access_time() < tmp_entry->last_access_time()) {
1139 latest_entry = tmp_entry;
1140 }
1141 if (tmp_entry->last_access_time() == left_last_access_time) {
1142 left_same_timestamp_entry = tmp_entry;
1143 }
1144 if (tmp_entry->last_access_time() == left_most_last_access_time) {
1145 left_most_same_timestamp_entry = tmp_entry;
1146 }
1147 }
1148
1149 const Entry *next_entry = left_most_same_timestamp_entry;
1150 if (next_entry == nullptr) {
1151 next_entry = left_same_timestamp_entry;
1152 }
1153 if (next_entry == nullptr) {
1154 next_entry = latest_entry;
1155 }
1156
1157 // The new entry was input within 10 seconds.
1158 // TODO(taku): This is a simple heuristics.
1159 if (next_entry != nullptr && !next_entry->key().empty() &&
1160 abs(static_cast<int32>(next_entry->last_access_time() -
1161 last_entry->last_access_time())) <= 10 &&
1162 IsContentWord(next_entry->value())) {
1163 Entry *result2 = AddEntryWithNewKeyValue(
1164 result->key() + next_entry->key(),
1165 result->value() + next_entry->value(),
1166 *result,
1167 results);
1168 if (!result2->removed()) {
1169 results->Push(result2);
1170 }
1171 }
1172 }
1173
1174 return true;
1175 }
1176
Predict(Segments * segments) const1177 bool UserHistoryPredictor::Predict(Segments *segments) const {
1178 ConversionRequest default_request;
1179 return PredictForRequest(default_request, segments);
1180 }
1181
PredictForRequest(const ConversionRequest & request,Segments * segments) const1182 bool UserHistoryPredictor::PredictForRequest(const ConversionRequest &request,
1183 Segments *segments) const {
1184 if (!CheckSyncerAndDelete()) {
1185 LOG(WARNING) << "Syncer is running";
1186 return false;
1187 }
1188
1189 if (request.config().incognito_mode()) {
1190 VLOG(2) << "incognito mode";
1191 return false;
1192 }
1193
1194 if (request.config().history_learning_level() == config::Config::NO_HISTORY) {
1195 VLOG(2) << "history learning level is NO_HISTORY";
1196 return false;
1197 }
1198
1199 if (segments->request_type() == Segments::CONVERSION) {
1200 VLOG(2) << "request type is CONVERSION";
1201 return false;
1202 }
1203
1204 if (!request.config().use_history_suggest() &&
1205 segments->request_type() == Segments::SUGGESTION) {
1206 VLOG(2) << "no history suggest";
1207 return false;
1208 }
1209
1210 if (segments->conversion_segments_size() < 1) {
1211 VLOG(2) << "segment size < 1";
1212 return false;
1213 }
1214
1215 if (dic_->Head() == nullptr) {
1216 VLOG(2) << "dic head is nullptr";
1217 return false;
1218 }
1219
1220 const RequestType request_type = request.request().zero_query_suggestion() ?
1221 ZERO_QUERY_SUGGESTION : DEFAULT;
1222 const string &input_key = segments->conversion_segment(0).key();
1223 if (IsPunctuation(Util::SubString(input_key, 0, 1))) {
1224 VLOG(2) << "input_key starts with punctuations";
1225 return false;
1226 }
1227
1228 const size_t input_key_len = Util::CharsLen(input_key);
1229 if (input_key_len == 0 && request_type == DEFAULT) {
1230 VLOG(2) << "key length is 0";
1231 return false;
1232 }
1233
1234 const Entry *prev_entry =
1235 LookupPrevEntry(*segments, request.request().available_emoji_carrier());
1236 if (input_key_len == 0 && prev_entry == nullptr) {
1237 VLOG(1) << "If input_key_len is 0, prev_entry must be set";
1238 return false;
1239 }
1240
1241 EntryPriorityQueue results;
1242 GetResultsFromHistoryDictionary(
1243 request_type, request, *segments, prev_entry, &results);
1244 if (results.size() == 0) {
1245 VLOG(2) << "no prefix match candiate is found.";
1246 return false;
1247 }
1248
1249 return InsertCandidates(request_type, request, segments, &results);
1250 }
1251
LookupPrevEntry(const Segments & segments,uint32 available_emoji_carrier) const1252 const UserHistoryPredictor::Entry *UserHistoryPredictor::LookupPrevEntry(
1253 const Segments &segments, uint32 available_emoji_carrier) const {
1254 const size_t history_segments_size = segments.history_segments_size();
1255 const Entry *prev_entry = nullptr;
1256 // When threre are non-zero history segments, lookup an entry
1257 // from the LRU dictionary, which is correspoinding to the last
1258 // history segment.
1259 if (history_segments_size == 0) {
1260 return nullptr;
1261 }
1262
1263 const Segment &history_segment =
1264 segments.history_segment(history_segments_size - 1);
1265
1266 // Simply lookup the history_segment.
1267 prev_entry = dic_->LookupWithoutInsert(SegmentFingerprint(history_segment));
1268
1269 // When |prev_entry| is nullptr or |prev_entry| has no valid next_entries,
1270 // do linear-search over the LRU.
1271 if ((prev_entry == nullptr && history_segment.candidates_size() > 0) ||
1272 (prev_entry != nullptr && prev_entry->next_entries_size() == 0)) {
1273 const string &prev_value = prev_entry == nullptr ?
1274 history_segment.candidate(0).value : prev_entry->value();
1275 int trial = 0;
1276 for (const DicElement *elm = dic_->Head();
1277 trial++ < kMaxPrevValueTrial && elm != nullptr; elm = elm->next) {
1278 const Entry *entry = &(elm->value);
1279 // entry->value() equals to the prev_value or
1280 // entry->value() is a SUFFIX of prev_value.
1281 // length of entry->value() must be >= 2, as single-length
1282 // match would be noisy.
1283 if (IsValidEntry(*entry, available_emoji_carrier) &&
1284 entry != prev_entry &&
1285 entry->next_entries_size() > 0 &&
1286 Util::CharsLen(entry->value()) >= 2 &&
1287 (entry->value() == prev_value ||
1288 Util::EndsWith(prev_value, entry->value()))) {
1289 prev_entry = entry;
1290 break;
1291 }
1292 }
1293 }
1294 return prev_entry;
1295 }
1296
GetResultsFromHistoryDictionary(RequestType request_type,const ConversionRequest & request,const Segments & segments,const Entry * prev_entry,EntryPriorityQueue * results) const1297 void UserHistoryPredictor::GetResultsFromHistoryDictionary(
1298 RequestType request_type,
1299 const ConversionRequest &request,
1300 const Segments &segments, const Entry *prev_entry,
1301 EntryPriorityQueue *results) const {
1302 DCHECK(results);
1303 const size_t max_results_size = 5 * segments.max_prediction_candidates_size();
1304
1305 // Gets romanized input key if the given preedit looks misspelled.
1306 const string roman_input_key = GetRomanMisspelledKey(request, segments);
1307
1308 // TODO(team): make GetKanaMisspelledKey(segments);
1309 // const string kana_input_key = GetKanaMisspelledKey(segments);
1310
1311 // If we have ambiguity for the input, get expanded key.
1312 // Example1 roman input: for "あk", we will get |base|, "あ" and |expanded|,
1313 // "か", "き", etc
1314 // Example2 kana input: for "あか", we will get |base|, "あ" and |expanded|,
1315 // "か", and "が".
1316
1317 // |base_key| and |input_key| could be different
1318 // For kana-input, we will expand the ambiguity for "゛".
1319 // When we input "もす",
1320 // |base_key|: "も"
1321 // |expanded|: "す", "ず"
1322 // |input_key|: "もす"
1323 // In this case, we want to show candidates for "もす" as EXACT match,
1324 // and candidates for "もず" as LEFT_PREFIX_MATCH
1325 //
1326 // For roman-input, when we input "あk",
1327 // |input_key| is "あk" and |base_key| is "あ"
1328 string input_key;
1329 string base_key;
1330 unique_ptr<Trie<string>> expanded;
1331 GetInputKeyFromSegments(request, segments, &input_key, &base_key, &expanded);
1332
1333 int trial = 0;
1334 for (const DicElement *elm = dic_->Head(); elm != nullptr; elm = elm->next) {
1335 if (!IsValidEntryIgnoringRemovedField(
1336 elm->value, request.request().available_emoji_carrier())) {
1337 continue;
1338 }
1339 if (segments.request_type() == Segments::SUGGESTION &&
1340 trial++ >= kMaxSuggestionTrial) {
1341 VLOG(2) << "too many trials";
1342 break;
1343 }
1344
1345 // Lookup key from elm_value and prev_entry.
1346 // If a new entry is found, the entry is pushed to the results.
1347 // TODO(team): make KanaFuzzyLookupEntry().
1348 if (!LookupEntry(request_type, input_key, base_key, expanded.get(),
1349 &(elm->value), prev_entry, results) &&
1350 !RomanFuzzyLookupEntry(roman_input_key, &(elm->value), results)) {
1351 continue;
1352 }
1353
1354 // already found enough results.
1355 if (results->size() >= max_results_size) {
1356 break;
1357 }
1358 }
1359 }
1360
1361 // static
GetInputKeyFromSegments(const ConversionRequest & request,const Segments & segments,string * input_key,string * base,unique_ptr<Trie<string>> * expanded)1362 void UserHistoryPredictor::GetInputKeyFromSegments(
1363 const ConversionRequest &request, const Segments &segments,
1364 string *input_key, string *base,
1365 unique_ptr<Trie<string>> *expanded) {
1366 DCHECK(input_key);
1367 DCHECK(base);
1368
1369 if (!request.has_composer() ||
1370 !FLAGS_enable_expansion_for_user_history_predictor) {
1371 *input_key = segments.conversion_segment(0).key();
1372 *base = segments.conversion_segment(0).key();
1373 return;
1374 }
1375
1376 request.composer().GetStringForPreedit(input_key);
1377 std::set<string> expanded_set;
1378 request.composer().GetQueriesForPrediction(base, &expanded_set);
1379 if (expanded_set.size() > 0) {
1380 expanded->reset(new Trie<string>);
1381 for (std::set<string>::const_iterator itr = expanded_set.begin();
1382 itr != expanded_set.end(); ++itr) {
1383 // For getting matched key, insert values
1384 (*expanded)->AddEntry(*itr, *itr);
1385 }
1386 }
1387 }
1388
InsertCandidates(RequestType request_type,const ConversionRequest & request,Segments * segments,EntryPriorityQueue * results) const1389 bool UserHistoryPredictor::InsertCandidates(RequestType request_type,
1390 const ConversionRequest &request,
1391 Segments *segments,
1392 EntryPriorityQueue *results) const {
1393 DCHECK(results);
1394 Segment *segment = segments->mutable_conversion_segment(0);
1395 if (segment == nullptr) {
1396 LOG(ERROR) << "segment is nullptr";
1397 return false;
1398 }
1399 const uint32 input_key_len = Util::CharsLen(segment->key());
1400 while (segment->candidates_size() <
1401 segments->max_prediction_candidates_size()) {
1402 // |results| is a priority queue where the elemtnt
1403 // in the queue is sorted by the score defined in GetScore().
1404 const Entry *result_entry = results->Pop();
1405 if (result_entry == nullptr) {
1406 // Pop() returns nullptr when no more valid entry exists.
1407 break;
1408 }
1409 bool is_valid_candidate = false;
1410 if (segments->request_type() == Segments::PREDICTION) {
1411 is_valid_candidate = true;
1412 } else if (segments->request_type() == Segments::SUGGESTION) {
1413 // The top result of suggestion should be a VALID suggestion candidate.
1414 // i.e., SuggestionTrigerFunc should return true for the first
1415 // candidate.
1416 // If user types "デスノート" too many times, "デスノート" will be
1417 // suggested when user types "で". It is expected, but if user types
1418 // "です" after that, showing "デスノート" is annoying.
1419 // In this situation, "です" is in the LRU, but SuggestionTrigerFunc
1420 // returns false for "です", since it is short.
1421 if (IsValidSuggestion(request_type,
1422 input_key_len, *result_entry)) {
1423 is_valid_candidate = true;
1424 } else if (segment->candidates_size() == 0) {
1425 VLOG(2) << "candidates size is 0";
1426 return false;
1427 }
1428 } else {
1429 LOG(ERROR) << "Unknown mode";
1430 return false;
1431 }
1432
1433 if (!is_valid_candidate) {
1434 VLOG(2) << "not a valid candidate: " << result_entry->key();
1435 continue;
1436 }
1437
1438 if (request.request().mixed_conversion() &&
1439 result_entry->suggestion_freq() < 2 &&
1440 Util::CharsLen(result_entry->value()) > 8) {
1441 // Don't show long history for mixed conversion
1442 // TODO(toshiyuki): Better to merge this into IsValidSuggestion logic.
1443 VLOG(2) << "long candidate: " << result_entry->value();
1444 continue;
1445 }
1446
1447 Segment::Candidate *candidate = segment->push_back_candidate();
1448 DCHECK(candidate);
1449 candidate->Init();
1450 candidate->key = result_entry->key();
1451 candidate->content_key = result_entry->key();
1452 candidate->value = result_entry->value();
1453 candidate->content_value = result_entry->value();
1454 candidate->attributes |=
1455 Segment::Candidate::USER_HISTORY_PREDICTION |
1456 Segment::Candidate::NO_VARIANTS_EXPANSION;
1457 candidate->source_info |= Segment::Candidate::USER_HISTORY_PREDICTOR;
1458 if (result_entry->spelling_correction()) {
1459 candidate->attributes |= Segment::Candidate::SPELLING_CORRECTION;
1460 }
1461 const string &description = result_entry->description();
1462 // If we have stored description, set it exactly.
1463 if (!description.empty()) {
1464 candidate->description = description;
1465 candidate->attributes |= Segment::Candidate::NO_EXTRA_DESCRIPTION;
1466 } else {
1467 VariantsRewriter::SetDescriptionForPrediction(*pos_matcher_, candidate);
1468 }
1469 #if DEBUG
1470 if (candidate->description.find("History") == string::npos) {
1471 candidate->description += " History";
1472 }
1473 #endif // DEBUG
1474 }
1475
1476 return (segment->candidates_size() > 0);
1477 }
1478
InsertNextEntry(const NextEntry & next_entry,Entry * entry) const1479 void UserHistoryPredictor::InsertNextEntry(
1480 const NextEntry &next_entry, Entry *entry) const {
1481 if (next_entry.entry_fp() == 0 || entry == nullptr) {
1482 return;
1483 }
1484
1485 NextEntry *target_next_entry = nullptr;
1486
1487 // If next_entries_size is less than kMaxNextEntriesSize,
1488 // we simply allocate a new entry.
1489 if (entry->next_entries_size() < max_next_entries_size()) {
1490 target_next_entry = entry->add_next_entries();
1491 } else {
1492 // Otherwise, find the oldest next_entry.
1493 uint64 last_access_time = kuint64max;
1494 for (int i = 0; i < entry->next_entries_size(); ++i) {
1495 // Already has the same id
1496 if (next_entry.entry_fp() == entry->next_entries(i).entry_fp()) {
1497 target_next_entry = entry->mutable_next_entries(i);
1498 break;
1499 }
1500 const Entry *found_entry = dic_->LookupWithoutInsert(
1501 entry->next_entries(i).entry_fp());
1502 // Reuses the entry if it is already removed from the LRU.
1503 if (found_entry == nullptr) {
1504 target_next_entry = entry->mutable_next_entries(i);
1505 break;
1506 }
1507 // Preserves the oldest entry
1508 if (target_next_entry == nullptr ||
1509 last_access_time > found_entry->last_access_time()) {
1510 target_next_entry = entry->mutable_next_entries(i);
1511 last_access_time = found_entry->last_access_time();
1512 }
1513 }
1514 }
1515
1516 if (target_next_entry == nullptr) {
1517 LOG(ERROR) << "cannot find a room for inserting next fp";
1518 return;
1519 }
1520
1521 target_next_entry->CopyFrom(next_entry);
1522 }
1523
IsValidEntry(const Entry & entry,uint32 available_emoji_carrier) const1524 bool UserHistoryPredictor::IsValidEntry(
1525 const Entry &entry, uint32 available_emoji_carrier) const {
1526 return !entry.removed() &&
1527 IsValidEntryIgnoringRemovedField(entry, available_emoji_carrier);
1528 }
1529
IsValidEntryIgnoringRemovedField(const Entry & entry,uint32 available_emoji_carrier) const1530 bool UserHistoryPredictor::IsValidEntryIgnoringRemovedField(
1531 const Entry &entry, uint32 available_emoji_carrier) const {
1532 if (entry.entry_type() != Entry::DEFAULT_ENTRY ||
1533 suppression_dictionary_->SuppressEntry(entry.key(), entry.value())) {
1534 return false;
1535 }
1536
1537 if (IsEmojiEntry(entry)) {
1538 if (Util::IsAndroidPuaEmoji(entry.value())) {
1539 // Android carrier dependent emoji.
1540 const uint32 kAndroidCarrier =
1541 Request::DOCOMO_EMOJI |
1542 Request::SOFTBANK_EMOJI |
1543 Request::KDDI_EMOJI;
1544 if (!(available_emoji_carrier & kAndroidCarrier)) {
1545 return false;
1546 }
1547 } else {
1548 // Unicode 6.0 emoji.
1549 if (!(available_emoji_carrier & Request::UNICODE_EMOJI)) {
1550 return false;
1551 }
1552 }
1553 }
1554
1555 return true;
1556 }
1557
InsertEvent(EntryType type)1558 void UserHistoryPredictor::InsertEvent(EntryType type) {
1559 if (type == Entry::DEFAULT_ENTRY) {
1560 return;
1561 }
1562
1563 const uint64 last_access_time = Clock::GetTime();
1564 const uint32 dic_key = Fingerprint("", "", type);
1565
1566 CHECK(dic_.get());
1567 DicElement *e = dic_->Insert(dic_key);
1568 if (e == nullptr) {
1569 VLOG(2) << "insert failed";
1570 return;
1571 }
1572
1573 Entry *entry = &(e->value);
1574 DCHECK(entry);
1575 entry->Clear();
1576 entry->set_entry_type(type);
1577 entry->set_last_access_time(last_access_time);
1578 }
1579
TryInsert(RequestType request_type,const string & key,const string & value,const string & description,bool is_suggestion_selected,uint32 next_fp,uint64 last_access_time,Segments * segments)1580 void UserHistoryPredictor::TryInsert(RequestType request_type,
1581 const string &key,
1582 const string &value,
1583 const string &description,
1584 bool is_suggestion_selected,
1585 uint32 next_fp,
1586 uint64 last_access_time,
1587 Segments *segments) {
1588 if (key.empty() || value.empty() ||
1589 key.size() > kMaxStringLength ||
1590 value.size() > kMaxStringLength ||
1591 description.size() > kMaxStringLength) {
1592 return;
1593 }
1594
1595 // For mobile, we do not learn candidates that ends with puctuation.
1596 if (request_type == ZERO_QUERY_SUGGESTION &&
1597 Util::CharsLen(value) > 1 &&
1598 IsPunctuation(Util::SubString(value, Util::CharsLen(value) - 1, 1))) {
1599 return;
1600 }
1601
1602 Insert(key, value, description, is_suggestion_selected, next_fp,
1603 last_access_time, segments);
1604 }
1605
Insert(const string & key,const string & value,const string & description,bool is_suggestion_selected,uint32 next_fp,uint64 last_access_time,Segments * segments)1606 void UserHistoryPredictor::Insert(const string &key,
1607 const string &value,
1608 const string &description,
1609 bool is_suggestion_selected,
1610 uint32 next_fp,
1611 uint64 last_access_time,
1612 Segments *segments) {
1613 const uint32 dic_key = Fingerprint(key, value);
1614
1615 if (!dic_->HasKey(dic_key)) {
1616 // The key is a new key inserted in the last Finish method.
1617 // Here we push a new RevertEntry so that the new "key" can be
1618 // removed when Revert() method is called.
1619 Segments::RevertEntry *revert_entry = segments->push_back_revert_entry();
1620 DCHECK(revert_entry);
1621 revert_entry->key = Uint32ToString(dic_key);
1622 revert_entry->id = UserHistoryPredictor::revert_id();
1623 revert_entry->revert_entry_type = Segments::RevertEntry::CREATE_ENTRY;
1624 } else {
1625 // The key is a old key not inserted in the last Finish method
1626 // TODO(taku):
1627 // add a treatment for UPDATE_ENTRY mode
1628 }
1629
1630 DicElement *e = dic_->Insert(dic_key);
1631 if (e == nullptr) {
1632 VLOG(2) << "insert failed";
1633 return;
1634 }
1635
1636 Entry *entry = &(e->value);
1637 DCHECK(entry);
1638
1639 entry->set_key(key);
1640 entry->set_value(value);
1641 entry->set_removed(false);
1642
1643 if (description.empty()) {
1644 entry->clear_description();
1645 } else {
1646 entry->set_description(description);
1647 }
1648
1649 entry->set_last_access_time(last_access_time);
1650 if (is_suggestion_selected) {
1651 entry->set_suggestion_freq(entry->suggestion_freq() + 1);
1652 } else {
1653 entry->set_conversion_freq(entry->conversion_freq() + 1);
1654 }
1655
1656 // Inserts next_fp to the entry
1657 if (next_fp != 0) {
1658 NextEntry next_entry;
1659 next_entry.set_entry_fp(next_fp);
1660 InsertNextEntry(next_entry, entry);
1661 }
1662
1663 VLOG(2) << key << " " << value << " has inserted: "
1664 << entry->Utf8DebugString();
1665
1666 // New entry is inserted to the cache
1667 updated_ = true;
1668 }
1669
MaybeRecordUsageStats(const Segments & segments) const1670 void UserHistoryPredictor::MaybeRecordUsageStats(
1671 const Segments &segments) const {
1672 const Segment &segment = segments.conversion_segment(0);
1673 if (segment.candidates_size() < 1) {
1674 VLOG(2) << "candidates size < 1";
1675 return;
1676 }
1677
1678 const Segment::Candidate &candidate = segment.candidate(0);
1679 if (segment.segment_type() != Segment::FIXED_VALUE) {
1680 VLOG(2) << "segment is not FIXED_VALUE" << candidate.value;
1681 return;
1682 }
1683
1684 if (!(candidate.source_info & Segment::Candidate::USER_HISTORY_PREDICTOR)) {
1685 VLOG(2) << "candidate is not from user_history_predictor";
1686 return;
1687 }
1688
1689 UsageStats::IncrementCount("CommitUserHistoryPredictor");
1690 if (segment.key().empty()) {
1691 UsageStats::IncrementCount("CommitUserHistoryPredictorZeroQuery");
1692 }
1693 }
1694
Finish(const ConversionRequest & request,Segments * segments)1695 void UserHistoryPredictor::Finish(const ConversionRequest &request,
1696 Segments *segments) {
1697 if (segments->request_type() == Segments::REVERSE_CONVERSION) {
1698 // Do nothing for REVERSE_CONVERSION.
1699 return;
1700 }
1701
1702 if (request.config().incognito_mode()) {
1703 VLOG(2) << "incognito mode";
1704 return;
1705 }
1706
1707 if (request.config().history_learning_level() !=
1708 config::Config::DEFAULT_HISTORY) {
1709 VLOG(2) << "history learning level is not DEFAULT_HISTORY: "
1710 << request.config().history_learning_level();
1711 return;
1712 }
1713
1714 if (!request.config().use_history_suggest()) {
1715 VLOG(2) << "no history suggest";
1716 return;
1717 }
1718
1719 if (!CheckSyncerAndDelete()) {
1720 LOG(WARNING) << "Syncer is running";
1721 return;
1722 }
1723
1724 MaybeRecordUsageStats(*segments);
1725
1726 const RequestType request_type = request.request().zero_query_suggestion() ?
1727 ZERO_QUERY_SUGGESTION : DEFAULT;
1728 const bool is_suggestion = segments->request_type() != Segments::CONVERSION;
1729 const uint64 last_access_time = Clock::GetTime();
1730
1731 // If user inputs a punctuation just after some long sentence,
1732 // we make a new candidate by concatinating the top element in LRU and
1733 // the punctuation user input. The top element in LRU is supposed to be
1734 // the long sentence user input before.
1735 // This is a fix for http://b/issue?id=2216838
1736 //
1737 // Note: We don't make such candidates for mobile.
1738 if (request_type != ZERO_QUERY_SUGGESTION &&
1739 dic_->Head() != nullptr &&
1740 dic_->Head()->value.last_access_time() + 5 > last_access_time &&
1741 // Check if the current value is a punctuation.
1742 segments->conversion_segments_size() == 1 &&
1743 segments->conversion_segment(0).candidates_size() > 0 &&
1744 IsPunctuation(segments->conversion_segment(0).candidate(0).value) &&
1745 // Check if the previous value looks like a sentence.
1746 segments->history_segments_size() > 0 &&
1747 segments->history_segment(
1748 segments->history_segments_size() - 1).candidates_size() > 0 &&
1749 IsSentenceLikeCandidate(segments->history_segment(
1750 segments->history_segments_size() - 1).candidate(0))) {
1751 const Entry *entry = &(dic_->Head()->value);
1752 DCHECK(entry);
1753 const string &last_value =
1754 segments->history_segment(
1755 segments->history_segments_size() - 1).candidate(0).value;
1756 // Check if the head value in LRU ends with the candidate value in history
1757 // segments.
1758 if (Util::EndsWith(entry->value(), last_value)) {
1759 const Segment::Candidate &candidate =
1760 segments->conversion_segment(0).candidate(0);
1761 const string key = entry->key() + candidate.key;
1762 const string value = entry->value() + candidate.value;
1763 // Uses the same last_access_time stored in the top element
1764 // so that this item can be grouped together.
1765 TryInsert(request_type,
1766 key, value, entry->description(), is_suggestion, 0,
1767 entry->last_access_time(), segments);
1768 }
1769 }
1770
1771 const size_t history_segments_size = segments->history_segments_size();
1772
1773 // Checks every segment is valid.
1774 for (size_t i = history_segments_size; i < segments->segments_size(); ++i) {
1775 const Segment &segment = segments->segment(i);
1776 if (segment.candidates_size() < 1) {
1777 VLOG(2) << "candidates size < 1";
1778 return;
1779 }
1780 if (segment.segment_type() != Segment::FIXED_VALUE) {
1781 VLOG(2) << "segment is not FIXED_VALUE";
1782 return;
1783 }
1784 const Segment::Candidate &candidate = segment.candidate(0);
1785 if (candidate.attributes &
1786 Segment::Candidate::NO_SUGGEST_LEARNING) {
1787 VLOG(2) << "NO_SUGGEST_LEARNING";
1788 return;
1789 }
1790 }
1791
1792 if (IsPrivacySensitive(segments)) {
1793 VLOG(2) << "do not remember privacy sensitive input";
1794 return;
1795 }
1796
1797 InsertHistory(request_type, is_suggestion, last_access_time, segments);
1798 return;
1799 }
1800
MakeLearningSegments(const Segments & segments,SegmentsForLearning * learning_segments) const1801 void UserHistoryPredictor::MakeLearningSegments(
1802 const Segments &segments, SegmentsForLearning *learning_segments) const {
1803 DCHECK(learning_segments);
1804
1805 for (size_t i = 0; i < segments.history_segments_size(); ++i) {
1806 const Segment &segment = segments.history_segment(i);
1807 SegmentForLearning learning_segment;
1808 DCHECK_LE(1, segment.candidates_size());
1809 learning_segment.key = segment.candidate(0).key;
1810 learning_segment.value = segment.candidate(0).value;
1811 learning_segment.content_key = segment.candidate(0).content_key;
1812 learning_segment.content_value = segment.candidate(0).content_value;
1813 learning_segment.description = GetDescription(segment.candidate(0));
1814 learning_segments->push_back_history_segment(learning_segment);
1815 }
1816 for (size_t i = 0; i < segments.conversion_segments_size(); ++i) {
1817 const Segment &segment = segments.conversion_segment(i);
1818 const Segment::Candidate &candidate = segment.candidate(0);
1819 if (candidate.inner_segment_boundary.empty()) {
1820 SegmentForLearning learning_segment;
1821 learning_segment.key = candidate.key;
1822 learning_segment.value = candidate.value;
1823 learning_segment.content_key = candidate.content_key;
1824 learning_segment.content_value = candidate.content_value;
1825 learning_segment.description = GetDescription(candidate);
1826 learning_segments->push_back_conversion_segment(learning_segment);
1827 } else {
1828 SegmentForLearning learning_segment;
1829 for (Segment::Candidate::InnerSegmentIterator iter(&candidate);
1830 !iter.Done(); iter.Next()) {
1831 learning_segment.key.assign(iter.GetKey().data(), iter.GetKey().size());
1832 learning_segment.value.assign(iter.GetValue().data(),
1833 iter.GetValue().size());
1834 learning_segment.content_key.assign(iter.GetContentKey().data(),
1835 iter.GetContentKey().size());
1836 learning_segment.content_value.assign(iter.GetContentValue().data(),
1837 iter.GetContentValue().size());
1838 learning_segments->push_back_conversion_segment(learning_segment);
1839 }
1840 }
1841 }
1842 }
1843
InsertHistory(RequestType request_type,bool is_suggestion_selected,uint64 last_access_time,Segments * segments)1844 void UserHistoryPredictor::InsertHistory(RequestType request_type,
1845 bool is_suggestion_selected,
1846 uint64 last_access_time,
1847 Segments *segments) {
1848 SegmentsForLearning learning_segments;
1849 MakeLearningSegments(*segments, &learning_segments);
1850
1851 string all_key, all_value;
1852 mozc_hash_set<uint32> seen;
1853 bool this_was_seen = false;
1854 const size_t history_segments_size =
1855 learning_segments.history_segments_size();
1856
1857 for (size_t i = history_segments_size;
1858 i < learning_segments.all_segments_size(); ++i) {
1859 const SegmentForLearning &segment = learning_segments.all_segment(i);
1860 all_key += segment.key;
1861 all_value += segment.value;
1862 uint32 next_fp = (i == learning_segments.all_segments_size() - 1) ?
1863 0 : LearningSegmentFingerprint(learning_segments.all_segment(i + 1));
1864 // remember the first segment
1865 if (i == history_segments_size) {
1866 seen.insert(LearningSegmentFingerprint(segment));
1867 }
1868 uint32 next_fp_to_set = next_fp;
1869 // If two duplicate segments exist, kills the link
1870 // TO/FROM the second one to prevent loops.
1871 // Only killing "TO" link caused bug #2982886:
1872 // after converting "らいおん(もうじゅう)とぞうりむし(びせいぶつ)"
1873 // and typing "ぞうりむし", "ゾウリムシ(猛獣" was suggested.
1874 if (this_was_seen) {
1875 next_fp_to_set = 0;
1876 }
1877 if (!seen.insert(next_fp).second) {
1878 next_fp_to_set = 0;
1879 this_was_seen = true;
1880 } else {
1881 this_was_seen = false;
1882 }
1883 TryInsert(request_type,
1884 segment.key,
1885 segment.value,
1886 segment.description,
1887 is_suggestion_selected, next_fp_to_set,
1888 last_access_time, segments);
1889 if (content_word_learning_enabled_ &&
1890 segment.content_key != segment.key &&
1891 segment.content_value != segment.value) {
1892 TryInsert(request_type,
1893 segment.content_key,
1894 segment.content_value,
1895 segment.description,
1896 is_suggestion_selected, 0,
1897 last_access_time, segments);
1898 }
1899 }
1900
1901 // Inserts all_key/all_value.
1902 // We don't insert it for mobile.
1903 if (request_type != ZERO_QUERY_SUGGESTION &&
1904 learning_segments.conversion_segments_size() > 1 &&
1905 !all_key.empty() && !all_value.empty()) {
1906 TryInsert(request_type,
1907 all_key, all_value, "",
1908 is_suggestion_selected,
1909 0, last_access_time, segments);
1910 }
1911
1912 // Makes a link from the last history_segment to the first conversion segment
1913 // or to the entire user input.
1914 if (learning_segments.history_segments_size() > 0 &&
1915 learning_segments.conversion_segments_size() > 0) {
1916 const SegmentForLearning &history_segment =
1917 learning_segments.history_segment(
1918 segments->history_segments_size() - 1);
1919 const SegmentForLearning &conversion_segment =
1920 learning_segments.conversion_segment(0);
1921 const string &history_value = history_segment.value;
1922 if (history_value.empty() || conversion_segment.value.empty()) {
1923 return;
1924 }
1925 // 1) Don't learn a link from a history which ends with punctuation.
1926 if (IsPunctuation(Util::SubString(history_value,
1927 Util::CharsLen(history_value) - 1,
1928 1))) {
1929 return;
1930 }
1931 // 2) Don't learn a link to a punctuation.
1932 // Exception: For zero query suggestion, we learn a link to a single
1933 // punctuation segment.
1934 // Example: "よろしく|。" -> OK
1935 // "よろしく|。でも" -> NG
1936 // "よろしく|。。" -> NG
1937 // Note that another piece of code handles learning for
1938 // (sentence + punctuation) form; see Finish().
1939 if (IsPunctuation(Util::SubString(conversion_segment.value, 0, 1)) &&
1940 (request_type != ZERO_QUERY_SUGGESTION ||
1941 Util::CharsLen(conversion_segment.value) > 1)) {
1942 return;
1943 }
1944 Entry *history_entry = dic_->MutableLookupWithoutInsert(
1945 LearningSegmentFingerprint(history_segment));
1946 NextEntry next_entry;
1947 if (segments->request_type() == Segments::CONVERSION) {
1948 next_entry.set_entry_fp(LearningSegmentFingerprint(conversion_segment));
1949 InsertNextEntry(next_entry, history_entry);
1950 }
1951
1952 // Entire user input or SUGGESTION
1953 if (segments->request_type() != Segments::CONVERSION ||
1954 learning_segments.conversion_segments_size() > 1) {
1955 next_entry.set_entry_fp(Fingerprint(all_key, all_value));
1956 InsertNextEntry(next_entry, history_entry);
1957 }
1958 }
1959
1960 return;
1961 }
1962
Revert(Segments * segments)1963 void UserHistoryPredictor::Revert(Segments *segments) {
1964 if (!CheckSyncerAndDelete()) {
1965 LOG(WARNING) << "Syncer is running";
1966 return;
1967 }
1968
1969 for (size_t i = 0; i < segments->revert_entries_size(); ++i) {
1970 const Segments::RevertEntry &revert_entry =
1971 segments->revert_entry(i);
1972 if (revert_entry.id == UserHistoryPredictor::revert_id() &&
1973 revert_entry.revert_entry_type == Segments::RevertEntry::CREATE_ENTRY) {
1974 VLOG(2) << "Erasing the key: " << StringToUint32(revert_entry.key);
1975 dic_->Erase(StringToUint32(revert_entry.key));
1976 }
1977 }
1978 }
1979
1980 // static
GetMatchType(const string & lstr,const string & rstr)1981 UserHistoryPredictor::MatchType UserHistoryPredictor::GetMatchType(
1982 const string &lstr, const string &rstr) {
1983 if (lstr.empty() && !rstr.empty()) {
1984 return LEFT_EMPTY_MATCH;
1985 }
1986
1987 const size_t size = std::min(lstr.size(), rstr.size());
1988 if (size == 0) {
1989 return NO_MATCH;
1990 }
1991
1992 const int result = memcmp(lstr.data(), rstr.data(), size);
1993 if (result != 0) {
1994 return NO_MATCH;
1995 }
1996
1997 if (lstr.size() == rstr.size()) {
1998 return EXACT_MATCH;
1999 } else if (lstr.size() < rstr.size()) {
2000 return LEFT_PREFIX_MATCH;
2001 } else {
2002 return RIGHT_PREFIX_MATCH;
2003 }
2004
2005 return NO_MATCH;
2006 }
2007
2008 // static
GetMatchTypeFromInput(const string & input_key,const string & key_base,const Trie<string> * key_expanded,const string & target)2009 UserHistoryPredictor::MatchType UserHistoryPredictor::GetMatchTypeFromInput(
2010 const string &input_key,
2011 const string &key_base, const Trie<string> *key_expanded,
2012 const string &target) {
2013 if (key_expanded == nullptr) {
2014 // |input_key| and |key_base| can be different by compoesr modification.
2015 // For example, |input_key|, "8,+", and |base| "8、+".
2016 return GetMatchType(key_base, target);
2017 }
2018
2019 // We can assume key_expanded != nullptr from here.
2020 if (key_base.empty()) {
2021 string value;
2022 size_t key_length = 0;
2023 bool has_subtrie = false;
2024 if (!key_expanded->LookUpPrefix(target, &value,
2025 &key_length, &has_subtrie)) {
2026 return NO_MATCH;
2027 } else if (value == target && value == input_key) {
2028 return EXACT_MATCH;
2029 } else {
2030 return LEFT_PREFIX_MATCH;
2031 }
2032 } else {
2033 const size_t size = std::min(key_base.size(), target.size());
2034 if (size == 0) {
2035 return NO_MATCH;
2036 }
2037 const int result = memcmp(key_base.data(), target.data(), size);
2038 if (result != 0) {
2039 return NO_MATCH;
2040 }
2041 if (target.size() <= key_base.size()) {
2042 return RIGHT_PREFIX_MATCH;
2043 }
2044 string value;
2045 size_t key_length = 0;
2046 bool has_subtrie = false;
2047 if (!key_expanded->LookUpPrefix(target.data() + key_base.size(),
2048 &value,
2049 &key_length, &has_subtrie)) {
2050 return NO_MATCH;
2051 }
2052 const string matched = key_base + value;
2053 if (matched == target && matched == input_key) {
2054 return EXACT_MATCH;
2055 } else {
2056 return LEFT_PREFIX_MATCH;
2057 }
2058 }
2059
2060 DCHECK(false) << "Should not come here";
2061 return NO_MATCH;
2062 }
2063
2064 // static
Fingerprint(const string & key,const string & value,EntryType type)2065 uint32 UserHistoryPredictor::Fingerprint(const string &key,
2066 const string &value,
2067 EntryType type) {
2068 if (type == Entry::DEFAULT_ENTRY) {
2069 // Since we have already used the fingerprint function for next entries and
2070 // next entries are saved in user's local machine, we are not able
2071 // to change the Fingerprint function for the old key/value type.
2072 return Hash::Fingerprint32(key + kDelimiter + value);
2073 } else {
2074 return Hash::Fingerprint32(static_cast<uint8>(type));
2075 }
2076 }
2077
2078 // static
Fingerprint(const string & key,const string & value)2079 uint32 UserHistoryPredictor::Fingerprint(const string &key,
2080 const string &value) {
2081 return Fingerprint(key, value, Entry::DEFAULT_ENTRY);
2082 }
2083
EntryFingerprint(const Entry & entry)2084 uint32 UserHistoryPredictor::EntryFingerprint(const Entry &entry) {
2085 return Fingerprint(entry.key(), entry.value());
2086 }
2087
2088 // static
SegmentFingerprint(const Segment & segment)2089 uint32 UserHistoryPredictor::SegmentFingerprint(const Segment &segment) {
2090 if (segment.candidates_size() > 0) {
2091 return Fingerprint(segment.candidate(0).key, segment.candidate(0).value);
2092 }
2093 return 0;
2094 }
2095
2096 // static
LearningSegmentFingerprint(const SegmentForLearning & segment)2097 uint32 UserHistoryPredictor::LearningSegmentFingerprint(
2098 const SegmentForLearning &segment) {
2099 return Fingerprint(segment.key, segment.value);
2100 }
2101
2102 // static
Uint32ToString(uint32 fp)2103 string UserHistoryPredictor::Uint32ToString(uint32 fp) {
2104 string buf(reinterpret_cast<const char *>(&fp), sizeof(fp));
2105 return buf;
2106 }
2107
2108 // static
StringToUint32(const string & input)2109 uint32 UserHistoryPredictor::StringToUint32(const string &input) {
2110 uint32 result = 0;
2111 if (input.size() == sizeof(result)) {
2112 memcpy(reinterpret_cast<char *>(&result), input.data(), input.size());
2113 }
2114 return result;
2115 }
2116
2117 // static
IsValidSuggestion(RequestType request_type,uint32 prefix_len,const Entry & entry)2118 bool UserHistoryPredictor::IsValidSuggestion(
2119 RequestType request_type, uint32 prefix_len, const Entry &entry) {
2120 // When bigram_boost is true, that means that previous user input
2121 // and current input have bigram relation.
2122 if (entry.bigram_boost()) {
2123 return true;
2124 }
2125 // When zero_query_suggestion is true, that means that
2126 // predictor is running on mobile device. In this case,
2127 // make the behavior more aggressive.
2128 if (request_type == ZERO_QUERY_SUGGESTION) {
2129 return true;
2130 }
2131 // Handles suggestion_freq and conversion_freq differently.
2132 // conversion_freq is less aggressively affecting to the final decision.
2133 const uint32 freq =
2134 std::max(entry.suggestion_freq(), entry.conversion_freq() / 4);
2135
2136 // TODO(taku,komatsu): better to make it simpler and easier to be understood.
2137 const uint32 base_prefix_len = 3 - std::min(static_cast<uint32>(2), freq);
2138 return (prefix_len >= base_prefix_len);
2139 }
2140
2141 // static
2142 // 1) sort by last_access_time, which is basically the same as LRU policy.
2143 // 2) boost shorter candidate, if having the same last_access_time.
2144 // 3) add a bigram boost as a special bonus.
2145 // TODO(taku): better to take "frequency" into consideration
GetScore(const Entry & entry)2146 uint32 UserHistoryPredictor::GetScore(const Entry &entry) {
2147 const uint32 kBigramBoostAsTime = 7 * 24 * 60 * 60; // 1 week.
2148 return
2149 entry.last_access_time() - Util::CharsLen(entry.value()) +
2150 (entry.bigram_boost() ? kBigramBoostAsTime : 0);
2151 }
2152
2153 // Returns the size of cache.
2154 // static
cache_size()2155 uint32 UserHistoryPredictor::cache_size() {
2156 return kLRUCacheSize;
2157 }
2158
2159 // Returns the size of next entries.
2160 // static
max_next_entries_size()2161 uint32 UserHistoryPredictor::max_next_entries_size() {
2162 return kMaxNextEntriesSize;
2163 }
2164
2165 } // namespace mozc
2166