1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "dictionary/user_dictionary.h"
31 
32 #include <algorithm>
33 #include <limits>
34 #include <memory>
35 #include <set>
36 #include <string>
37 
38 #include "base/compiler_specific.h"
39 #include "base/file_util.h"
40 #include "base/hash.h"
41 #include "base/logging.h"
42 #include "base/mutex.h"
43 #include "base/singleton.h"
44 #include "base/stl_util.h"
45 #include "base/string_piece.h"
46 #include "base/thread.h"
47 #include "base/util.h"
48 #include "dictionary/dictionary_token.h"
49 #include "dictionary/pos_matcher.h"
50 #include "dictionary/suppression_dictionary.h"
51 #include "dictionary/user_dictionary_storage.h"
52 #include "dictionary/user_dictionary_util.h"
53 #include "dictionary/user_pos.h"
54 #include "protocol/config.pb.h"
55 #include "usage_stats/usage_stats.h"
56 
57 namespace mozc {
58 namespace dictionary {
59 namespace {
60 
61 struct OrderByKey {
operator ()mozc::dictionary::__anon691865b20111::OrderByKey62   bool operator()(const UserPOS::Token *token, StringPiece key) const {
63     return token->key < key;
64   }
65 
operator ()mozc::dictionary::__anon691865b20111::OrderByKey66   bool operator()(StringPiece key, const UserPOS::Token *token) const {
67     return key < token->key;
68   }
69 };
70 
71 struct OrderByKeyPrefix {
operator ()mozc::dictionary::__anon691865b20111::OrderByKeyPrefix72   bool operator()(const UserPOS::Token *token, StringPiece prefix) const {
73     return StringPiece(token->key).substr(0, prefix.size()) < prefix;
74   }
75 
operator ()mozc::dictionary::__anon691865b20111::OrderByKeyPrefix76   bool operator()(StringPiece prefix, const UserPOS::Token *token) const {
77     return prefix < StringPiece(token->key).substr(0, prefix.size());
78   }
79 };
80 
81 struct OrderByKeyThenById {
operator ()mozc::dictionary::__anon691865b20111::OrderByKeyThenById82   bool operator()(const UserPOS::Token *lhs, const UserPOS::Token *rhs) const {
83     const int comp = lhs->key.compare(rhs->key);
84     return comp == 0 ? (lhs->id < rhs->id) : (comp < 0);
85   }
86 };
87 
88 class UserDictionaryFileManager {
89  public:
UserDictionaryFileManager()90   UserDictionaryFileManager() {}
91 
GetFileName()92   const string GetFileName() {
93     scoped_lock l(&mutex_);
94     if (filename_.empty()) {
95       return UserDictionaryUtil::GetUserDictionaryFileName();
96     } else {
97       return filename_;
98     }
99   }
100 
SetFileName(const string & filename)101   void SetFileName(const string &filename) {
102     scoped_lock l(&mutex_);
103     filename_ = filename;
104   }
105 
106  private:
107   string filename_;
108   Mutex mutex_;
109   DISALLOW_COPY_AND_ASSIGN(UserDictionaryFileManager);
110 };
111 
FillTokenFromUserPOSToken(const UserPOS::Token & user_pos_token,Token * token)112 void FillTokenFromUserPOSToken(const UserPOS::Token &user_pos_token,
113                                Token *token) {
114   token->key = user_pos_token.key;
115   token->value = user_pos_token.value;
116   token->cost = user_pos_token.cost;
117   token->lid = user_pos_token.id;
118   token->rid = user_pos_token.id;
119   token->attributes = Token::USER_DICTIONARY;
120 }
121 
122 }  // namespace
123 
124 class UserDictionary::TokensIndex : public std::vector<UserPOS::Token *> {
125  public:
TokensIndex(const UserPOSInterface * user_pos,SuppressionDictionary * suppression_dictionary)126   TokensIndex(const UserPOSInterface *user_pos,
127               SuppressionDictionary *suppression_dictionary)
128       : user_pos_(user_pos),
129         suppression_dictionary_(suppression_dictionary) {}
130 
~TokensIndex()131   ~TokensIndex() {
132     Clear();
133   }
134 
Clear()135   void Clear() {
136     STLDeleteElements(this);
137     clear();
138   }
139 
Load(const user_dictionary::UserDictionaryStorage & storage)140   void Load(const user_dictionary::UserDictionaryStorage &storage) {
141     Clear();
142     std::set<uint64> seen;
143     std::vector<UserPOS::Token> tokens;
144 
145     if (!suppression_dictionary_->IsLocked()) {
146       LOG(ERROR) << "SuppressionDictionary must be locked first";
147     }
148     suppression_dictionary_->Clear();
149 
150     for (size_t i = 0; i < storage.dictionaries_size(); ++i) {
151       const UserDictionaryStorage::UserDictionary &dic =
152           storage.dictionaries(i);
153       if (!dic.enabled() || dic.entries_size() == 0) {
154         continue;
155       }
156 
157       for (size_t j = 0; j < dic.entries_size(); ++j) {
158         const UserDictionaryStorage::UserDictionaryEntry &entry =
159             dic.entries(j);
160 
161         if (!UserDictionaryUtil::IsValidEntry(*user_pos_, entry)) {
162           continue;
163         }
164 
165         string tmp, reading;
166         UserDictionaryUtil::NormalizeReading(entry.key(), &tmp);
167 
168         // We cannot call NormalizeVoiceSoundMark inside NormalizeReading,
169         // because the normalization is user-visible.
170         // http://b/2480844
171         Util::NormalizeVoicedSoundMark(tmp, &reading);
172 
173         DCHECK_LE(0, entry.pos());
174 MOZC_CLANG_PUSH_WARNING();
175 #if MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
176 MOZC_CLANG_DISABLE_WARNING(tautological-constant-out-of-range-compare);
177 #endif  // MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
178         DCHECK_LE(entry.pos(), 255);
179 MOZC_CLANG_POP_WARNING();
180         const uint64 fp = Hash::Fingerprint(reading +
181                                             "\t" +
182                                             entry.value() +
183                                             "\t" +
184                                             static_cast<char>(entry.pos()));
185         if (!seen.insert(fp).second) {
186           VLOG(1) << "Found dup item";
187           continue;
188         }
189 
190         // "抑制単語"
191         if (entry.pos() == user_dictionary::UserDictionary::SUPPRESSION_WORD) {
192           suppression_dictionary_->AddEntry(reading, entry.value());
193         } else {
194           tokens.clear();
195           user_pos_->GetTokens(
196               reading, entry.value(),
197               UserDictionaryUtil::GetStringPosType(entry.pos()), &tokens);
198           for (size_t k = 0; k < tokens.size(); ++k) {
199             this->push_back(new UserPOS::Token(tokens[k]));
200             Util::StripWhiteSpaces(entry.comment(), &this->back()->comment);
201           }
202         }
203       }
204     }
205 
206     // Sort first by key and then by POS ID.
207     std::sort(this->begin(), this->end(), OrderByKeyThenById());
208 
209     suppression_dictionary_->UnLock();
210 
211     VLOG(1) << this->size() << " user dic entries loaded";
212 
213     usage_stats::UsageStats::SetInteger("UserRegisteredWord",
214                                         static_cast<int>(this->size()));
215   }
216 
217  private:
218   const UserPOSInterface *user_pos_;
219   SuppressionDictionary *suppression_dictionary_;
220 };
221 
222 class UserDictionary::UserDictionaryReloader : public Thread {
223  public:
UserDictionaryReloader(UserDictionary * dic)224   explicit UserDictionaryReloader(UserDictionary *dic)
225       : modified_at_(0), auto_register_mode_(false), dic_(dic) {
226     DCHECK(dic_);
227   }
228 
~UserDictionaryReloader()229   ~UserDictionaryReloader() override {
230     Join();
231   }
232 
StartAutoRegistration(const string & key,const string & value,user_dictionary::UserDictionary::PosType pos)233   void StartAutoRegistration(const string &key,
234                              const string &value,
235                              user_dictionary::UserDictionary::PosType pos) {
236     {
237       scoped_lock l(&mutex_);
238       auto_register_mode_ = true;
239       key_ = key;
240       value_ = value;
241       pos_ = pos;
242     }
243     Start("UserDictionaryReloader");
244   }
245 
246   // When the user dictionary exists AND the modification time has been updated,
247   // reloads the dictionary.  Returns true when reloader thread is started.
MaybeStartReload()248   bool MaybeStartReload() {
249     FileTimeStamp modification_time;
250     if (!FileUtil::GetModificationTime(
251         Singleton<UserDictionaryFileManager>::get()->GetFileName(),
252         &modification_time)) {
253       // If the file doesn't exist, return doing nothing.
254       // Therefore if the file is deleted after first reload,
255       // second reload does nothing so the content loaded by first reload
256       // is kept as is.
257       return false;
258     }
259     if (modified_at_ == modification_time) {
260       return false;
261     }
262     modified_at_ = modification_time;
263     Start("UserDictionaryReloader");
264     return true;
265   }
266 
Run()267   void Run() override {
268     std::unique_ptr<UserDictionaryStorage> storage(new UserDictionaryStorage(
269         Singleton<UserDictionaryFileManager>::get()->GetFileName()));
270 
271     // Load from file
272     if (!storage->Load()) {
273       return;
274     }
275 
276     if (storage->ConvertSyncDictionariesToNormalDictionaries()) {
277       LOG(INFO) << "Syncable dictionaries are converted to normal dictionaries";
278       if (storage->Lock()) {
279         storage->Save();
280         storage->UnLock();
281       }
282     }
283 
284     if (auto_register_mode_ &&
285         !storage->AddToAutoRegisteredDictionary(key_, value_, pos_)) {
286       LOG(ERROR) << "failed to execute AddToAutoRegisteredDictionary";
287       auto_register_mode_ = false;
288       return;
289     }
290 
291     auto_register_mode_ = false;
292     dic_->Load(storage.get()->user_dictionary_storage_base);
293   }
294 
295  private:
296   FileTimeStamp modified_at_;
297   Mutex mutex_;
298   bool auto_register_mode_;
299   UserDictionary *dic_;
300   string key_;
301   string value_;
302   user_dictionary::UserDictionary::PosType pos_;
303 
304   DISALLOW_COPY_AND_ASSIGN(UserDictionaryReloader);
305 };
306 
UserDictionary(const UserPOSInterface * user_pos,POSMatcher pos_matcher,SuppressionDictionary * suppression_dictionary)307 UserDictionary::UserDictionary(const UserPOSInterface *user_pos,
308                                POSMatcher pos_matcher,
309                                SuppressionDictionary *suppression_dictionary)
310     : ALLOW_THIS_IN_INITIALIZER_LIST(
311           reloader_(new UserDictionaryReloader(this))),
312       user_pos_(user_pos),
313       pos_matcher_(pos_matcher),
314       suppression_dictionary_(suppression_dictionary),
315       tokens_(new TokensIndex(user_pos_.get(), suppression_dictionary)),
316       mutex_(new ReaderWriterMutex) {
317   DCHECK(user_pos_.get());
318   DCHECK(suppression_dictionary_);
319   Reload();
320 }
321 
~UserDictionary()322 UserDictionary::~UserDictionary() {
323   reloader_->Join();
324   delete tokens_;
325 }
326 
HasKey(StringPiece key) const327 bool UserDictionary::HasKey(StringPiece key) const {
328   // TODO(noriyukit): Currently, we don't support HasKey() for user dictionary
329   // because we need to search tokens linearly, which might be slow in extreme
330   // cases where 100K entries exist.
331   return false;
332 }
333 
HasValue(StringPiece value) const334 bool UserDictionary::HasValue(StringPiece value) const {
335   // TODO(noriyukit): Currently, we don't support HasValue() for user dictionary
336   // because we need to search tokens linearly, which might be slow in extreme
337   // cases where 100K entries exist.  Note: HasValue() method is used only in
338   // UserHistoryPredictor for privacy sensitivity check.
339   return false;
340 }
341 
LookupPredictive(StringPiece key,const ConversionRequest & conversion_request,Callback * callback) const342 void UserDictionary::LookupPredictive(
343     StringPiece key,
344     const ConversionRequest &conversion_request,
345     Callback *callback) const {
346   scoped_reader_lock l(mutex_.get());
347 
348   if (key.empty()) {
349     VLOG(2) << "string of length zero is passed.";
350     return;
351   }
352   if (tokens_->empty()) {
353     return;
354   }
355   if (conversion_request.config().incognito_mode()) {
356     return;
357   }
358 
359   // Find the starting point of iteration over dictionary contents.
360   Token token;
361   for (auto range = std::equal_range(tokens_->begin(), tokens_->end(), key,
362                                      OrderByKeyPrefix());
363        range.first != range.second; ++range.first) {
364     const UserPOS::Token &user_pos_token = **range.first;
365     switch (callback->OnKey(user_pos_token.key)) {
366       case Callback::TRAVERSE_DONE:
367         return;
368       case Callback::TRAVERSE_NEXT_KEY:
369       case Callback::TRAVERSE_CULL:
370         continue;
371       default:
372         break;
373     }
374     FillTokenFromUserPOSToken(user_pos_token, &token);
375     // Override POS IDs for suggest only words.
376     if (pos_matcher_.IsSuggestOnlyWord(user_pos_token.id)) {
377       token.lid = token.rid = pos_matcher_.GetUnknownId();
378     }
379     if (callback->OnToken(user_pos_token.key, user_pos_token.key, token) ==
380         Callback::TRAVERSE_DONE) {
381       return;
382     }
383   }
384 }
385 
386 // UserDictionary doesn't support kana modifier insensitive lookup.
LookupPrefix(StringPiece key,const ConversionRequest & conversion_request,Callback * callback) const387 void UserDictionary::LookupPrefix(
388     StringPiece key,
389     const ConversionRequest &conversion_request,
390     Callback *callback) const {
391   scoped_reader_lock l(mutex_.get());
392 
393   if (key.empty()) {
394     LOG(WARNING) << "string of length zero is passed.";
395     return;
396   }
397   if (tokens_->empty()) {
398     return;
399   }
400   if (conversion_request.config().incognito_mode()) {
401     return;
402   }
403 
404   // Find the starting point for iteration over dictionary contents.
405   const StringPiece first_char = key.substr(0, Util::OneCharLen(key.data()));
406   Token token;
407   for (auto it = std::lower_bound(tokens_->begin(), tokens_->end(), first_char,
408                                   OrderByKey());
409        it != tokens_->end(); ++it) {
410     const UserPOS::Token &user_pos_token = **it;
411     if (user_pos_token.key > key) {
412       break;
413     }
414     if (pos_matcher_.IsSuggestOnlyWord(user_pos_token.id)) {
415       continue;
416     }
417     if (!Util::StartsWith(key, user_pos_token.key)) {
418       continue;
419     }
420     switch (callback->OnKey(user_pos_token.key)) {
421       case Callback::TRAVERSE_DONE:
422         return;
423       case Callback::TRAVERSE_NEXT_KEY:
424         continue;
425       case Callback::TRAVERSE_CULL:
426         LOG(FATAL) << "UserDictionary doesn't support culling.";
427         break;
428       default:
429         break;
430     }
431     FillTokenFromUserPOSToken(user_pos_token, &token);
432     switch (callback->OnToken(user_pos_token.key, user_pos_token.key, token)) {
433       case Callback::TRAVERSE_DONE:
434         return;
435       case Callback::TRAVERSE_CULL:
436         LOG(FATAL) << "UserDictionary doesn't support culling.";
437         break;
438       default:
439         break;
440     }
441   }
442 }
443 
LookupExact(StringPiece key,const ConversionRequest & conversion_request,Callback * callback) const444 void UserDictionary::LookupExact(
445     StringPiece key,
446     const ConversionRequest &conversion_request,
447     Callback *callback) const {
448   scoped_reader_lock l(mutex_.get());
449   if (key.empty() || tokens_->empty() ||
450       conversion_request.config().incognito_mode()) {
451     return;
452   }
453   auto range = std::equal_range(tokens_->begin(), tokens_->end(), key,
454                                 OrderByKey());
455   if (range.first == range.second) {
456     return;
457   }
458   if (callback->OnKey(key) != Callback::TRAVERSE_CONTINUE) {
459     return;
460   }
461 
462   Token token;
463   for (; range.first != range.second; ++range.first) {
464     const UserPOS::Token &user_pos_token = **range.first;
465     if (pos_matcher_.IsSuggestOnlyWord(user_pos_token.id)) {
466       continue;
467     }
468     FillTokenFromUserPOSToken(user_pos_token, &token);
469     if (callback->OnToken(key, key, token) != Callback::TRAVERSE_CONTINUE) {
470       return;
471     }
472   }
473 }
474 
LookupReverse(StringPiece key,const ConversionRequest & conversion_request,Callback * callback) const475 void UserDictionary::LookupReverse(
476     StringPiece key,
477     const ConversionRequest &conversion_request,
478     Callback *callback) const {
479 }
480 
LookupComment(StringPiece key,StringPiece value,const ConversionRequest & conversion_request,string * comment) const481 bool UserDictionary::LookupComment(StringPiece key, StringPiece value,
482                                    const ConversionRequest &conversion_request,
483                                    string *comment) const {
484   if (key.empty() || conversion_request.config().incognito_mode()) {
485     return false;
486   }
487 
488   scoped_reader_lock l(mutex_.get());
489   if (tokens_->empty()) {
490     return false;
491   }
492 
493   // Set the comment that was found first.
494   for (auto range = std::equal_range(tokens_->begin(), tokens_->end(), key,
495                                      OrderByKey());
496        range.first != range.second; ++range.first) {
497     const UserPOS::Token &token = **range.first;
498     if (token.value == value && !token.comment.empty()) {
499       comment->assign(token.comment);
500       return true;
501     }
502   }
503   return false;
504 }
505 
Reload()506 bool UserDictionary::Reload() {
507   if (reloader_->IsRunning()) {
508     return false;
509   }
510   suppression_dictionary_->Lock();
511   DCHECK(suppression_dictionary_->IsLocked());
512   // When the reloader is started, |suppression_dictionary_| is unlocked by the
513   // reloader.  When not started, need to unlock it here.
514   if (!reloader_->MaybeStartReload()) {
515     suppression_dictionary_->UnLock();
516   }
517   return true;
518 }
519 
520 namespace {
521 
522 class FindValueCallback : public DictionaryInterface::Callback {
523  public:
FindValueCallback(StringPiece value)524   explicit FindValueCallback(StringPiece value)
525       : value_(value), found_(false) {}
526 
OnToken(StringPiece,StringPiece,const Token & token)527   virtual ResultType OnToken(StringPiece,  // key
528                              StringPiece,  // actual_key
529                              const Token &token) {
530     if (token.value == value_) {
531       found_ = true;
532       return TRAVERSE_DONE;
533     }
534     return TRAVERSE_CONTINUE;
535   }
536 
found() const537   bool found() const { return found_; }
538 
539  private:
540   const StringPiece value_;
541   bool found_;
542 };
543 
544 }  // namespace
545 
AddToAutoRegisteredDictionary(const string & key,const string & value,const ConversionRequest & conversion_request,user_dictionary::UserDictionary::PosType pos)546 bool UserDictionary::AddToAutoRegisteredDictionary(
547     const string &key, const string &value,
548     const ConversionRequest &conversion_request,
549     user_dictionary::UserDictionary::PosType pos) {
550   if (reloader_->IsRunning()) {
551     return false;
552   }
553 
554   FindValueCallback callback(value);
555   LookupExact(key, conversion_request, &callback);
556   if (callback.found()) {
557     // Already registered.
558     return false;
559   }
560 
561   suppression_dictionary_->Lock();
562   DCHECK(suppression_dictionary_->IsLocked());
563   reloader_->StartAutoRegistration(key, value, pos);
564 
565   return true;
566 }
567 
WaitForReloader()568 void UserDictionary::WaitForReloader() {
569   reloader_->Join();
570 }
571 
Swap(TokensIndex * new_tokens)572 void UserDictionary::Swap(TokensIndex *new_tokens) {
573   DCHECK(new_tokens);
574   TokensIndex *old_tokens = tokens_;
575   {
576     scoped_writer_lock l(mutex_.get());
577     tokens_ = new_tokens;
578   }
579   delete old_tokens;
580 }
581 
Load(const user_dictionary::UserDictionaryStorage & storage)582 bool UserDictionary::Load(
583     const user_dictionary::UserDictionaryStorage &storage) {
584   size_t size = 0;
585   {
586     scoped_reader_lock l(mutex_.get());
587     size = tokens_->size();
588   }
589 
590   // If UserDictionary is pretty big, we first remove the
591   // current dictionary to save memory usage.
592 #ifdef OS_ANDROID
593   const size_t kVeryBigUserDictionarySize = 5000;
594 #else
595   const size_t kVeryBigUserDictionarySize = 100000;
596 #endif
597 
598   if (size >= kVeryBigUserDictionarySize) {
599     TokensIndex *dummy_empty_tokens = new TokensIndex(user_pos_.get(),
600                                                       suppression_dictionary_);
601     Swap(dummy_empty_tokens);
602   }
603 
604   suppression_dictionary_->Lock();
605   TokensIndex *tokens = new TokensIndex(user_pos_.get(),
606                                         suppression_dictionary_);
607   tokens->Load(storage);  // |suppression_dictionary_| is unlocked in Load().
608   DCHECK(!suppression_dictionary_->IsLocked());
609   Swap(tokens);
610   return true;
611 }
612 
SetUserDictionaryName(const string & filename)613 void UserDictionary::SetUserDictionaryName(const string &filename) {
614   Singleton<UserDictionaryFileManager>::get()->SetFileName(filename);
615 }
616 
617 }  // namespace dictionary
618 }  // namespace mozc
619