1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "dictionary/user_dictionary.h"
31
32 #include <algorithm>
33 #include <limits>
34 #include <memory>
35 #include <set>
36 #include <string>
37
38 #include "base/compiler_specific.h"
39 #include "base/file_util.h"
40 #include "base/hash.h"
41 #include "base/logging.h"
42 #include "base/mutex.h"
43 #include "base/singleton.h"
44 #include "base/stl_util.h"
45 #include "base/string_piece.h"
46 #include "base/thread.h"
47 #include "base/util.h"
48 #include "dictionary/dictionary_token.h"
49 #include "dictionary/pos_matcher.h"
50 #include "dictionary/suppression_dictionary.h"
51 #include "dictionary/user_dictionary_storage.h"
52 #include "dictionary/user_dictionary_util.h"
53 #include "dictionary/user_pos.h"
54 #include "protocol/config.pb.h"
55 #include "usage_stats/usage_stats.h"
56
57 namespace mozc {
58 namespace dictionary {
59 namespace {
60
61 struct OrderByKey {
operator ()mozc::dictionary::__anon691865b20111::OrderByKey62 bool operator()(const UserPOS::Token *token, StringPiece key) const {
63 return token->key < key;
64 }
65
operator ()mozc::dictionary::__anon691865b20111::OrderByKey66 bool operator()(StringPiece key, const UserPOS::Token *token) const {
67 return key < token->key;
68 }
69 };
70
71 struct OrderByKeyPrefix {
operator ()mozc::dictionary::__anon691865b20111::OrderByKeyPrefix72 bool operator()(const UserPOS::Token *token, StringPiece prefix) const {
73 return StringPiece(token->key).substr(0, prefix.size()) < prefix;
74 }
75
operator ()mozc::dictionary::__anon691865b20111::OrderByKeyPrefix76 bool operator()(StringPiece prefix, const UserPOS::Token *token) const {
77 return prefix < StringPiece(token->key).substr(0, prefix.size());
78 }
79 };
80
81 struct OrderByKeyThenById {
operator ()mozc::dictionary::__anon691865b20111::OrderByKeyThenById82 bool operator()(const UserPOS::Token *lhs, const UserPOS::Token *rhs) const {
83 const int comp = lhs->key.compare(rhs->key);
84 return comp == 0 ? (lhs->id < rhs->id) : (comp < 0);
85 }
86 };
87
88 class UserDictionaryFileManager {
89 public:
UserDictionaryFileManager()90 UserDictionaryFileManager() {}
91
GetFileName()92 const string GetFileName() {
93 scoped_lock l(&mutex_);
94 if (filename_.empty()) {
95 return UserDictionaryUtil::GetUserDictionaryFileName();
96 } else {
97 return filename_;
98 }
99 }
100
SetFileName(const string & filename)101 void SetFileName(const string &filename) {
102 scoped_lock l(&mutex_);
103 filename_ = filename;
104 }
105
106 private:
107 string filename_;
108 Mutex mutex_;
109 DISALLOW_COPY_AND_ASSIGN(UserDictionaryFileManager);
110 };
111
FillTokenFromUserPOSToken(const UserPOS::Token & user_pos_token,Token * token)112 void FillTokenFromUserPOSToken(const UserPOS::Token &user_pos_token,
113 Token *token) {
114 token->key = user_pos_token.key;
115 token->value = user_pos_token.value;
116 token->cost = user_pos_token.cost;
117 token->lid = user_pos_token.id;
118 token->rid = user_pos_token.id;
119 token->attributes = Token::USER_DICTIONARY;
120 }
121
122 } // namespace
123
124 class UserDictionary::TokensIndex : public std::vector<UserPOS::Token *> {
125 public:
TokensIndex(const UserPOSInterface * user_pos,SuppressionDictionary * suppression_dictionary)126 TokensIndex(const UserPOSInterface *user_pos,
127 SuppressionDictionary *suppression_dictionary)
128 : user_pos_(user_pos),
129 suppression_dictionary_(suppression_dictionary) {}
130
~TokensIndex()131 ~TokensIndex() {
132 Clear();
133 }
134
Clear()135 void Clear() {
136 STLDeleteElements(this);
137 clear();
138 }
139
Load(const user_dictionary::UserDictionaryStorage & storage)140 void Load(const user_dictionary::UserDictionaryStorage &storage) {
141 Clear();
142 std::set<uint64> seen;
143 std::vector<UserPOS::Token> tokens;
144
145 if (!suppression_dictionary_->IsLocked()) {
146 LOG(ERROR) << "SuppressionDictionary must be locked first";
147 }
148 suppression_dictionary_->Clear();
149
150 for (size_t i = 0; i < storage.dictionaries_size(); ++i) {
151 const UserDictionaryStorage::UserDictionary &dic =
152 storage.dictionaries(i);
153 if (!dic.enabled() || dic.entries_size() == 0) {
154 continue;
155 }
156
157 for (size_t j = 0; j < dic.entries_size(); ++j) {
158 const UserDictionaryStorage::UserDictionaryEntry &entry =
159 dic.entries(j);
160
161 if (!UserDictionaryUtil::IsValidEntry(*user_pos_, entry)) {
162 continue;
163 }
164
165 string tmp, reading;
166 UserDictionaryUtil::NormalizeReading(entry.key(), &tmp);
167
168 // We cannot call NormalizeVoiceSoundMark inside NormalizeReading,
169 // because the normalization is user-visible.
170 // http://b/2480844
171 Util::NormalizeVoicedSoundMark(tmp, &reading);
172
173 DCHECK_LE(0, entry.pos());
174 MOZC_CLANG_PUSH_WARNING();
175 #if MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
176 MOZC_CLANG_DISABLE_WARNING(tautological-constant-out-of-range-compare);
177 #endif // MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
178 DCHECK_LE(entry.pos(), 255);
179 MOZC_CLANG_POP_WARNING();
180 const uint64 fp = Hash::Fingerprint(reading +
181 "\t" +
182 entry.value() +
183 "\t" +
184 static_cast<char>(entry.pos()));
185 if (!seen.insert(fp).second) {
186 VLOG(1) << "Found dup item";
187 continue;
188 }
189
190 // "抑制単語"
191 if (entry.pos() == user_dictionary::UserDictionary::SUPPRESSION_WORD) {
192 suppression_dictionary_->AddEntry(reading, entry.value());
193 } else {
194 tokens.clear();
195 user_pos_->GetTokens(
196 reading, entry.value(),
197 UserDictionaryUtil::GetStringPosType(entry.pos()), &tokens);
198 for (size_t k = 0; k < tokens.size(); ++k) {
199 this->push_back(new UserPOS::Token(tokens[k]));
200 Util::StripWhiteSpaces(entry.comment(), &this->back()->comment);
201 }
202 }
203 }
204 }
205
206 // Sort first by key and then by POS ID.
207 std::sort(this->begin(), this->end(), OrderByKeyThenById());
208
209 suppression_dictionary_->UnLock();
210
211 VLOG(1) << this->size() << " user dic entries loaded";
212
213 usage_stats::UsageStats::SetInteger("UserRegisteredWord",
214 static_cast<int>(this->size()));
215 }
216
217 private:
218 const UserPOSInterface *user_pos_;
219 SuppressionDictionary *suppression_dictionary_;
220 };
221
222 class UserDictionary::UserDictionaryReloader : public Thread {
223 public:
UserDictionaryReloader(UserDictionary * dic)224 explicit UserDictionaryReloader(UserDictionary *dic)
225 : modified_at_(0), auto_register_mode_(false), dic_(dic) {
226 DCHECK(dic_);
227 }
228
~UserDictionaryReloader()229 ~UserDictionaryReloader() override {
230 Join();
231 }
232
StartAutoRegistration(const string & key,const string & value,user_dictionary::UserDictionary::PosType pos)233 void StartAutoRegistration(const string &key,
234 const string &value,
235 user_dictionary::UserDictionary::PosType pos) {
236 {
237 scoped_lock l(&mutex_);
238 auto_register_mode_ = true;
239 key_ = key;
240 value_ = value;
241 pos_ = pos;
242 }
243 Start("UserDictionaryReloader");
244 }
245
246 // When the user dictionary exists AND the modification time has been updated,
247 // reloads the dictionary. Returns true when reloader thread is started.
MaybeStartReload()248 bool MaybeStartReload() {
249 FileTimeStamp modification_time;
250 if (!FileUtil::GetModificationTime(
251 Singleton<UserDictionaryFileManager>::get()->GetFileName(),
252 &modification_time)) {
253 // If the file doesn't exist, return doing nothing.
254 // Therefore if the file is deleted after first reload,
255 // second reload does nothing so the content loaded by first reload
256 // is kept as is.
257 return false;
258 }
259 if (modified_at_ == modification_time) {
260 return false;
261 }
262 modified_at_ = modification_time;
263 Start("UserDictionaryReloader");
264 return true;
265 }
266
Run()267 void Run() override {
268 std::unique_ptr<UserDictionaryStorage> storage(new UserDictionaryStorage(
269 Singleton<UserDictionaryFileManager>::get()->GetFileName()));
270
271 // Load from file
272 if (!storage->Load()) {
273 return;
274 }
275
276 if (storage->ConvertSyncDictionariesToNormalDictionaries()) {
277 LOG(INFO) << "Syncable dictionaries are converted to normal dictionaries";
278 if (storage->Lock()) {
279 storage->Save();
280 storage->UnLock();
281 }
282 }
283
284 if (auto_register_mode_ &&
285 !storage->AddToAutoRegisteredDictionary(key_, value_, pos_)) {
286 LOG(ERROR) << "failed to execute AddToAutoRegisteredDictionary";
287 auto_register_mode_ = false;
288 return;
289 }
290
291 auto_register_mode_ = false;
292 dic_->Load(storage.get()->user_dictionary_storage_base);
293 }
294
295 private:
296 FileTimeStamp modified_at_;
297 Mutex mutex_;
298 bool auto_register_mode_;
299 UserDictionary *dic_;
300 string key_;
301 string value_;
302 user_dictionary::UserDictionary::PosType pos_;
303
304 DISALLOW_COPY_AND_ASSIGN(UserDictionaryReloader);
305 };
306
UserDictionary(const UserPOSInterface * user_pos,POSMatcher pos_matcher,SuppressionDictionary * suppression_dictionary)307 UserDictionary::UserDictionary(const UserPOSInterface *user_pos,
308 POSMatcher pos_matcher,
309 SuppressionDictionary *suppression_dictionary)
310 : ALLOW_THIS_IN_INITIALIZER_LIST(
311 reloader_(new UserDictionaryReloader(this))),
312 user_pos_(user_pos),
313 pos_matcher_(pos_matcher),
314 suppression_dictionary_(suppression_dictionary),
315 tokens_(new TokensIndex(user_pos_.get(), suppression_dictionary)),
316 mutex_(new ReaderWriterMutex) {
317 DCHECK(user_pos_.get());
318 DCHECK(suppression_dictionary_);
319 Reload();
320 }
321
~UserDictionary()322 UserDictionary::~UserDictionary() {
323 reloader_->Join();
324 delete tokens_;
325 }
326
HasKey(StringPiece key) const327 bool UserDictionary::HasKey(StringPiece key) const {
328 // TODO(noriyukit): Currently, we don't support HasKey() for user dictionary
329 // because we need to search tokens linearly, which might be slow in extreme
330 // cases where 100K entries exist.
331 return false;
332 }
333
HasValue(StringPiece value) const334 bool UserDictionary::HasValue(StringPiece value) const {
335 // TODO(noriyukit): Currently, we don't support HasValue() for user dictionary
336 // because we need to search tokens linearly, which might be slow in extreme
337 // cases where 100K entries exist. Note: HasValue() method is used only in
338 // UserHistoryPredictor for privacy sensitivity check.
339 return false;
340 }
341
LookupPredictive(StringPiece key,const ConversionRequest & conversion_request,Callback * callback) const342 void UserDictionary::LookupPredictive(
343 StringPiece key,
344 const ConversionRequest &conversion_request,
345 Callback *callback) const {
346 scoped_reader_lock l(mutex_.get());
347
348 if (key.empty()) {
349 VLOG(2) << "string of length zero is passed.";
350 return;
351 }
352 if (tokens_->empty()) {
353 return;
354 }
355 if (conversion_request.config().incognito_mode()) {
356 return;
357 }
358
359 // Find the starting point of iteration over dictionary contents.
360 Token token;
361 for (auto range = std::equal_range(tokens_->begin(), tokens_->end(), key,
362 OrderByKeyPrefix());
363 range.first != range.second; ++range.first) {
364 const UserPOS::Token &user_pos_token = **range.first;
365 switch (callback->OnKey(user_pos_token.key)) {
366 case Callback::TRAVERSE_DONE:
367 return;
368 case Callback::TRAVERSE_NEXT_KEY:
369 case Callback::TRAVERSE_CULL:
370 continue;
371 default:
372 break;
373 }
374 FillTokenFromUserPOSToken(user_pos_token, &token);
375 // Override POS IDs for suggest only words.
376 if (pos_matcher_.IsSuggestOnlyWord(user_pos_token.id)) {
377 token.lid = token.rid = pos_matcher_.GetUnknownId();
378 }
379 if (callback->OnToken(user_pos_token.key, user_pos_token.key, token) ==
380 Callback::TRAVERSE_DONE) {
381 return;
382 }
383 }
384 }
385
386 // UserDictionary doesn't support kana modifier insensitive lookup.
LookupPrefix(StringPiece key,const ConversionRequest & conversion_request,Callback * callback) const387 void UserDictionary::LookupPrefix(
388 StringPiece key,
389 const ConversionRequest &conversion_request,
390 Callback *callback) const {
391 scoped_reader_lock l(mutex_.get());
392
393 if (key.empty()) {
394 LOG(WARNING) << "string of length zero is passed.";
395 return;
396 }
397 if (tokens_->empty()) {
398 return;
399 }
400 if (conversion_request.config().incognito_mode()) {
401 return;
402 }
403
404 // Find the starting point for iteration over dictionary contents.
405 const StringPiece first_char = key.substr(0, Util::OneCharLen(key.data()));
406 Token token;
407 for (auto it = std::lower_bound(tokens_->begin(), tokens_->end(), first_char,
408 OrderByKey());
409 it != tokens_->end(); ++it) {
410 const UserPOS::Token &user_pos_token = **it;
411 if (user_pos_token.key > key) {
412 break;
413 }
414 if (pos_matcher_.IsSuggestOnlyWord(user_pos_token.id)) {
415 continue;
416 }
417 if (!Util::StartsWith(key, user_pos_token.key)) {
418 continue;
419 }
420 switch (callback->OnKey(user_pos_token.key)) {
421 case Callback::TRAVERSE_DONE:
422 return;
423 case Callback::TRAVERSE_NEXT_KEY:
424 continue;
425 case Callback::TRAVERSE_CULL:
426 LOG(FATAL) << "UserDictionary doesn't support culling.";
427 break;
428 default:
429 break;
430 }
431 FillTokenFromUserPOSToken(user_pos_token, &token);
432 switch (callback->OnToken(user_pos_token.key, user_pos_token.key, token)) {
433 case Callback::TRAVERSE_DONE:
434 return;
435 case Callback::TRAVERSE_CULL:
436 LOG(FATAL) << "UserDictionary doesn't support culling.";
437 break;
438 default:
439 break;
440 }
441 }
442 }
443
LookupExact(StringPiece key,const ConversionRequest & conversion_request,Callback * callback) const444 void UserDictionary::LookupExact(
445 StringPiece key,
446 const ConversionRequest &conversion_request,
447 Callback *callback) const {
448 scoped_reader_lock l(mutex_.get());
449 if (key.empty() || tokens_->empty() ||
450 conversion_request.config().incognito_mode()) {
451 return;
452 }
453 auto range = std::equal_range(tokens_->begin(), tokens_->end(), key,
454 OrderByKey());
455 if (range.first == range.second) {
456 return;
457 }
458 if (callback->OnKey(key) != Callback::TRAVERSE_CONTINUE) {
459 return;
460 }
461
462 Token token;
463 for (; range.first != range.second; ++range.first) {
464 const UserPOS::Token &user_pos_token = **range.first;
465 if (pos_matcher_.IsSuggestOnlyWord(user_pos_token.id)) {
466 continue;
467 }
468 FillTokenFromUserPOSToken(user_pos_token, &token);
469 if (callback->OnToken(key, key, token) != Callback::TRAVERSE_CONTINUE) {
470 return;
471 }
472 }
473 }
474
LookupReverse(StringPiece key,const ConversionRequest & conversion_request,Callback * callback) const475 void UserDictionary::LookupReverse(
476 StringPiece key,
477 const ConversionRequest &conversion_request,
478 Callback *callback) const {
479 }
480
LookupComment(StringPiece key,StringPiece value,const ConversionRequest & conversion_request,string * comment) const481 bool UserDictionary::LookupComment(StringPiece key, StringPiece value,
482 const ConversionRequest &conversion_request,
483 string *comment) const {
484 if (key.empty() || conversion_request.config().incognito_mode()) {
485 return false;
486 }
487
488 scoped_reader_lock l(mutex_.get());
489 if (tokens_->empty()) {
490 return false;
491 }
492
493 // Set the comment that was found first.
494 for (auto range = std::equal_range(tokens_->begin(), tokens_->end(), key,
495 OrderByKey());
496 range.first != range.second; ++range.first) {
497 const UserPOS::Token &token = **range.first;
498 if (token.value == value && !token.comment.empty()) {
499 comment->assign(token.comment);
500 return true;
501 }
502 }
503 return false;
504 }
505
Reload()506 bool UserDictionary::Reload() {
507 if (reloader_->IsRunning()) {
508 return false;
509 }
510 suppression_dictionary_->Lock();
511 DCHECK(suppression_dictionary_->IsLocked());
512 // When the reloader is started, |suppression_dictionary_| is unlocked by the
513 // reloader. When not started, need to unlock it here.
514 if (!reloader_->MaybeStartReload()) {
515 suppression_dictionary_->UnLock();
516 }
517 return true;
518 }
519
520 namespace {
521
522 class FindValueCallback : public DictionaryInterface::Callback {
523 public:
FindValueCallback(StringPiece value)524 explicit FindValueCallback(StringPiece value)
525 : value_(value), found_(false) {}
526
OnToken(StringPiece,StringPiece,const Token & token)527 virtual ResultType OnToken(StringPiece, // key
528 StringPiece, // actual_key
529 const Token &token) {
530 if (token.value == value_) {
531 found_ = true;
532 return TRAVERSE_DONE;
533 }
534 return TRAVERSE_CONTINUE;
535 }
536
found() const537 bool found() const { return found_; }
538
539 private:
540 const StringPiece value_;
541 bool found_;
542 };
543
544 } // namespace
545
AddToAutoRegisteredDictionary(const string & key,const string & value,const ConversionRequest & conversion_request,user_dictionary::UserDictionary::PosType pos)546 bool UserDictionary::AddToAutoRegisteredDictionary(
547 const string &key, const string &value,
548 const ConversionRequest &conversion_request,
549 user_dictionary::UserDictionary::PosType pos) {
550 if (reloader_->IsRunning()) {
551 return false;
552 }
553
554 FindValueCallback callback(value);
555 LookupExact(key, conversion_request, &callback);
556 if (callback.found()) {
557 // Already registered.
558 return false;
559 }
560
561 suppression_dictionary_->Lock();
562 DCHECK(suppression_dictionary_->IsLocked());
563 reloader_->StartAutoRegistration(key, value, pos);
564
565 return true;
566 }
567
WaitForReloader()568 void UserDictionary::WaitForReloader() {
569 reloader_->Join();
570 }
571
Swap(TokensIndex * new_tokens)572 void UserDictionary::Swap(TokensIndex *new_tokens) {
573 DCHECK(new_tokens);
574 TokensIndex *old_tokens = tokens_;
575 {
576 scoped_writer_lock l(mutex_.get());
577 tokens_ = new_tokens;
578 }
579 delete old_tokens;
580 }
581
Load(const user_dictionary::UserDictionaryStorage & storage)582 bool UserDictionary::Load(
583 const user_dictionary::UserDictionaryStorage &storage) {
584 size_t size = 0;
585 {
586 scoped_reader_lock l(mutex_.get());
587 size = tokens_->size();
588 }
589
590 // If UserDictionary is pretty big, we first remove the
591 // current dictionary to save memory usage.
592 #ifdef OS_ANDROID
593 const size_t kVeryBigUserDictionarySize = 5000;
594 #else
595 const size_t kVeryBigUserDictionarySize = 100000;
596 #endif
597
598 if (size >= kVeryBigUserDictionarySize) {
599 TokensIndex *dummy_empty_tokens = new TokensIndex(user_pos_.get(),
600 suppression_dictionary_);
601 Swap(dummy_empty_tokens);
602 }
603
604 suppression_dictionary_->Lock();
605 TokensIndex *tokens = new TokensIndex(user_pos_.get(),
606 suppression_dictionary_);
607 tokens->Load(storage); // |suppression_dictionary_| is unlocked in Load().
608 DCHECK(!suppression_dictionary_->IsLocked());
609 Swap(tokens);
610 return true;
611 }
612
SetUserDictionaryName(const string & filename)613 void UserDictionary::SetUserDictionaryName(const string &filename) {
614 Singleton<UserDictionaryFileManager>::get()->SetFileName(filename);
615 }
616
617 } // namespace dictionary
618 } // namespace mozc
619