// Copyright 2010-2018, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dictionary/user_dictionary_util.h"

#include <string.h>
#include <algorithm>

#include "base/config_file_stream.h"
#include "base/file_stream.h"
#include "base/logging.h"
#include "base/protobuf/message.h"
#include "base/util.h"
#include "dictionary/user_pos_interface.h"

namespace mozc {

using ::mozc::protobuf::RepeatedPtrField;
using ::mozc::user_dictionary::UserDictionaryCommandStatus;

namespace {
// Maximum string length in UserDictionaryEntry's field
const size_t kMaxKeySize = 300;
const size_t kMaxValueSize = 300;
const size_t kMaxCommentSize = 300;

// Characters that must not appear in a word, a comment or a dictionary name.
const char kInvalidChars[] = "\n\r\t";

// Logical name of the user dictionary file; it is resolved to a real file
// name through ConfigFileStream::GetFileName().
const char kUserDictionaryFile[] = "user://user_dictionary.db";

// Maximum string length for dictionary name.
const size_t kMaxDictionaryNameSize = 300;

// The limits of dictionary/entry size.
const size_t kMaxDictionarySize = 100;
const size_t kMaxEntrySize = 1000000;
}  // namespace

// Returns the maximum number of dictionaries a storage may hold.
size_t UserDictionaryUtil::max_dictionary_size() {
  return kMaxDictionarySize;
}

// Returns the maximum number of entries a single dictionary may hold.
size_t UserDictionaryUtil::max_entry_size() {
  return kMaxEntrySize;
}

// Returns true iff ValidateEntry() reports success for |entry|.
// |user_pos| is part of the interface but is not consulted here.
bool UserDictionaryUtil::IsValidEntry(
    const UserPOSInterface &user_pos,
    const user_dictionary::UserDictionary::Entry &entry) {
  return ValidateEntry(entry) ==
      UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
}

namespace {

// Returns true iff |c| lies in the closed range [lo, hi].
inline bool InRange(char32 c, char32 lo, char32 hi) {
  return c >= lo && c <= hi;
}

// Returns true iff |c| is one of the code points accepted in a normalized
// reading.
inline bool IsAllowedReadingChar(char32 c) {
  return InRange(c, 0x0021, 0x007E) ||  // Basic Latin (Ascii)
         InRange(c, 0x3041, 0x3096) ||  // Hiragana
         InRange(c, 0x309B, 0x309C) ||  // KATAKANA-HIRAGANA VOICED/SEMI-VOICED
                                        // SOUND MARK
         InRange(c, 0x30FB, 0x30FC) ||  // Nakaten, Prolonged sound mark
         InRange(c, 0x3001, 0x3002) ||  // Japanese punctuation marks
         InRange(c, 0x300C, 0x300F) ||  // Japanese brackets
         InRange(c, 0x301C, 0x301C);    // Japanese Wavedash
}

// Returns true iff every code point of the already-normalized |reading| is
// an allowed reading character. An empty reading is accepted here.
bool InternalValidateNormalizedReading(const string &reading) {
  for (ConstChar32Iterator iter(reading); !iter.Done(); iter.Next()) {
    if (!IsAllowedReadingChar(iter.Get())) {
      LOG(INFO) << "Invalid character in reading.";
      return false;
    }
  }
  return true;
}

}  // namespace

// Normalizes |reading| with NormalizeReading() and checks the result against
// the set of allowed reading characters.
bool UserDictionaryUtil::IsValidReading(const string &reading) {
  string normalized;
  NormalizeReading(reading, &normalized);
  return InternalValidateNormalizedReading(normalized);
}

// Writes the normalized form of |input| to |output| (which is cleared
// first). Three conversions are applied in order: full-width ASCII to
// half-width ASCII, half-width katakana to full-width katakana, and katakana
// to hiragana.
void UserDictionaryUtil::NormalizeReading(const string &input, string *output) {
  output->clear();
  string tmp1, tmp2;
  Util::FullWidthAsciiToHalfWidthAscii(input, &tmp1);
  Util::HalfWidthKatakanaToFullWidthKatakana(tmp1, &tmp2);
  Util::KatakanaToHiragana(tmp2, output);
}

// Checks the reading (key), word (value), comment and POS of |entry|, in
// that order, and returns the status of the first violation found, or
// USER_DICTIONARY_COMMAND_SUCCESS when all checks pass. Length limits are
// compared against string::size(), i.e. they are byte counts.
UserDictionaryCommandStatus::Status UserDictionaryUtil::ValidateEntry(
    const user_dictionary::UserDictionary::Entry &entry) {
  // Validate reading.
  const string &reading = entry.key();
  if (reading.empty()) {
    VLOG(1) << "key is empty";
    return UserDictionaryCommandStatus::READING_EMPTY;
  }
  if (reading.size() > kMaxKeySize) {
    VLOG(1) << "Too long key.";
    return UserDictionaryCommandStatus::READING_TOO_LONG;
  }
  if (!IsValidReading(reading)) {
    VLOG(1) << "Invalid reading";
    return UserDictionaryCommandStatus::READING_CONTAINS_INVALID_CHARACTER;
  }

  // Validate word.
  const string &word = entry.value();
  if (word.empty()) {
    return UserDictionaryCommandStatus::WORD_EMPTY;
  }
  if (word.size() > kMaxValueSize) {
    VLOG(1) << "Too long value.";
    return UserDictionaryCommandStatus::WORD_TOO_LONG;
  }
  if (word.find_first_of(kInvalidChars) != string::npos) {
    VLOG(1) << "Invalid character in value.";
    return UserDictionaryCommandStatus::WORD_CONTAINS_INVALID_CHARACTER;
  }

  // Validate comment. An empty comment is allowed.
  const string &comment = entry.comment();
  if (comment.size() > kMaxCommentSize) {
    VLOG(1) << "Too long comment.";
    return UserDictionaryCommandStatus::COMMENT_TOO_LONG;
  }
  if (comment.find_first_of(kInvalidChars) != string::npos) {
    VLOG(1) << "Invalid character in comment.";
    return UserDictionaryCommandStatus::COMMENT_CONTAINS_INVALID_CHARACTER;
  }

  // Validate pos.
  if (!entry.has_pos() ||
      !user_dictionary::UserDictionary::PosType_IsValid(entry.pos())) {
    VLOG(1) << "Invalid POS";
    return UserDictionaryCommandStatus::INVALID_POS_TYPE;
  }

  return UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
}

// Returns true iff |storage| already holds the maximum number of
// dictionaries.
bool UserDictionaryUtil::IsStorageFull(
    const user_dictionary::UserDictionaryStorage &storage) {
  return static_cast<size_t>(storage.dictionaries_size()) >=
      kMaxDictionarySize;
}

// Returns true iff |dictionary| already holds the maximum number of entries.
bool UserDictionaryUtil::IsDictionaryFull(
    const user_dictionary::UserDictionary &dictionary) {
  return static_cast<size_t>(dictionary.entries_size()) >= kMaxEntrySize;
}

// Returns the dictionary in |storage| whose id equals |dictionary_id|, or
// nullptr when there is none.
const user_dictionary::UserDictionary *
UserDictionaryUtil::GetUserDictionaryById(
    const user_dictionary::UserDictionaryStorage &storage,
    uint64 dictionary_id) {
  const int index = GetUserDictionaryIndexById(storage, dictionary_id);
  return index >= 0 ? &storage.dictionaries(index) : nullptr;
}

// Mutable counterpart of GetUserDictionaryById(). Returns nullptr when no
// dictionary has |dictionary_id|.
user_dictionary::UserDictionary *
UserDictionaryUtil::GetMutableUserDictionaryById(
    user_dictionary::UserDictionaryStorage *storage, uint64 dictionary_id) {
  const int index = GetUserDictionaryIndexById(*storage, dictionary_id);
  return index >= 0 ? storage->mutable_dictionaries(index) : nullptr;
}

// Returns the index of the first dictionary in |storage| whose id equals
// |dictionary_id|. Logs an error and returns -1 when there is none.
int UserDictionaryUtil::GetUserDictionaryIndexById(
    const user_dictionary::UserDictionaryStorage &storage,
    uint64 dictionary_id) {
  for (int i = 0; i < storage.dictionaries_size(); ++i) {
    if (storage.dictionaries(i).id() == dictionary_id) {
      return i;
    }
  }

  LOG(ERROR) << "Cannot find dictionary id: " << dictionary_id;
  return -1;
}

// Returns the file name that ConfigFileStream resolves for the user
// dictionary.
string UserDictionaryUtil::GetUserDictionaryFileName() {
  return ConfigFileStream::GetFileName(kUserDictionaryFile);
}

// static
// Sanitizes key, value and comment of |entry| with Sanitize() and replaces an
// invalid POS with NOUN. Returns true iff anything was modified.
bool UserDictionaryUtil::SanitizeEntry(
    user_dictionary::UserDictionary::Entry *entry) {
  bool modified = false;
  modified |= Sanitize(entry->mutable_key(), kMaxKeySize);
  modified |= Sanitize(entry->mutable_value(), kMaxValueSize);
  if (!user_dictionary::UserDictionary::PosType_IsValid(entry->pos())) {
    // Fallback to NOUN.
    entry->set_pos(user_dictionary::UserDictionary::NOUN);
    modified = true;
  }
  modified |= Sanitize(entry->mutable_comment(), kMaxCommentSize);
  return modified;
}

// static
// Removes every '\t', '\n' and '\r' from |*str| and, if the result is still
// longer than |max_size| bytes, truncates it at a character boundary (as
// reported by Util::OneCharLen()) so that at most |max_size| bytes remain.
// Returns true iff |*str| was modified.
bool UserDictionaryUtil::Sanitize(string *str, size_t max_size) {
  // First part: Remove invalid characters.
  {
    const size_t original_size = str->size();
    string::iterator begin = str->begin();
    string::iterator end = str->end();
    end = std::remove(begin, end, '\t');
    end = std::remove(begin, end, '\n');
    end = std::remove(begin, end, '\r');

    // std::remove() only compacts the kept characters to the front; the
    // string itself still has its original size at this point.
    const size_t filtered_size = static_cast<size_t>(end - begin);
    if (filtered_size <= max_size) {
      if (filtered_size == original_size) {
        return false;
      }
      str->erase(filtered_size);
      return true;
    }
  }

  // Second part: Truncate long strings.
  // The compacted prefix is longer than |max_size|, so the scan below finds
  // the cut position before it reaches the stale tail left by std::remove().
  {
    const char *begin = str->data();
    const char *p = begin;
    const char *end = begin + str->size();
    while (p < end) {
      const size_t len = Util::OneCharLen(p);
      if (static_cast<size_t>(p + len - begin) > max_size) {
        str->erase(p - begin);
        return true;
      }
      p += len;
    }
    LOG(FATAL) << "There should be a bug in implementation of the function.";
  }

  return true;
}

// Checks |dictionary_name| for emptiness, length (in bytes), invalid
// characters and duplication against the names already in |storage|, in that
// order. Returns the status of the first violation, or
// USER_DICTIONARY_COMMAND_SUCCESS.
UserDictionaryCommandStatus::Status UserDictionaryUtil::ValidateDictionaryName(
    const user_dictionary::UserDictionaryStorage &storage,
    const string &dictionary_name) {
  if (dictionary_name.empty()) {
    VLOG(1) << "Empty dictionary name.";
    return UserDictionaryCommandStatus::DICTIONARY_NAME_EMPTY;
  }
  if (dictionary_name.size() > kMaxDictionaryNameSize) {
    VLOG(1) << "Too long dictionary name";
    return UserDictionaryCommandStatus::DICTIONARY_NAME_TOO_LONG;
  }
  if (dictionary_name.find_first_of(kInvalidChars) != string::npos) {
    VLOG(1) << "Invalid character in dictionary name: " << dictionary_name;
    return UserDictionaryCommandStatus
        ::DICTIONARY_NAME_CONTAINS_INVALID_CHARACTER;
  }
  for (int i = 0; i < storage.dictionaries_size(); ++i) {
    if (storage.dictionaries(i).name() == dictionary_name) {
      LOG(ERROR) << "duplicated dictionary name";
      return UserDictionaryCommandStatus::DICTIONARY_NAME_DUPLICATED;
    }
  }

  return UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
}

namespace {
// The index of each element should be matched with the actual value of enum.
// See also user_dictionary_storage.proto for the definition of the enum.
// Note that the '0' is invalid in the definition, so the corresponding
// element is nullptr.
const char *kPosTypeStringTable[] = {
  nullptr,
  "名詞",
  "短縮よみ",
  "サジェストのみ",
  "固有名詞",
  "人名",
  "姓",
  "名",
  "組織",
  "地名",
  "名詞サ変",
  "名詞形動",
  "数",
  "アルファベット",
  "記号",
  "顔文字",
  "副詞",
  "連体詞",
  "接続詞",
  "感動詞",
  "接頭語",
  "助数詞",
  "接尾一般",
  "接尾人名",
  "接尾地名",
  "動詞ワ行五段",
  "動詞カ行五段",
  "動詞サ行五段",
  "動詞タ行五段",
  "動詞ナ行五段",
  "動詞マ行五段",
  "動詞ラ行五段",
  "動詞ガ行五段",
  "動詞バ行五段",
  "動詞ハ行四段",
  "動詞一段",
  "動詞カ変",
  "動詞サ変",
  "動詞ザ変",
  "動詞ラ変",
  "形容詞",
  "終助詞",
  "句読点",
  "独立語",
  "抑制単語",
};
}  // namespace

// Returns the display string for |pos_type|, or nullptr when |pos_type| is
// not a valid enum value or has no entry in the string table.
const char *UserDictionaryUtil::GetStringPosType(
    user_dictionary::UserDictionary::PosType pos_type) {
  // The table bound is checked in addition to PosType_IsValid() so that an
  // enum value added to the proto without a table entry cannot read past
  // the end of the array.
  if (user_dictionary::UserDictionary::PosType_IsValid(pos_type) &&
      static_cast<size_t>(pos_type) < arraysize(kPosTypeStringTable)) {
    return kPosTypeStringTable[pos_type];
  }
  return nullptr;
}

// Returns the PosType whose display string equals |string_pos_type|.
// Returns static_cast<PosType>(-1) when there is no match or when
// |string_pos_type| is null.
user_dictionary::UserDictionary::PosType UserDictionaryUtil::ToPosType(
    const char *string_pos_type) {
  typedef user_dictionary::UserDictionary::PosType PosType;

  // strcmp() must not be given a null pointer.
  if (string_pos_type == nullptr) {
    return static_cast<PosType>(-1);
  }

  // Skip the element at 0.
  for (size_t i = 1; i < arraysize(kPosTypeStringTable); ++i) {
    if (strcmp(kPosTypeStringTable[i], string_pos_type) == 0) {
      return static_cast<PosType>(i);
    }
  }

  // Not found. Return invalid value.
  return static_cast<PosType>(-1);
}

// Generates a random dictionary id that is non-zero and not used by any
// dictionary in |storage|, retrying until such an id is found.
uint64 UserDictionaryUtil::CreateNewDictionaryId(
    const user_dictionary::UserDictionaryStorage &storage) {
  static const uint64 kInvalidDictionaryId = 0;

  uint64 id = kInvalidDictionaryId;
  while (id == kInvalidDictionaryId) {
    Util::GetRandomSequence(reinterpret_cast<char *>(&id), sizeof(id));

#ifdef OS_NACL
    // Because JavaScript does not support uint64, we downsize the dictionary id
    // range from uint64 to uint32 in NaCl.
    id = static_cast<uint32>(id);
#endif  // OS_NACL

    // Duplication check.
    for (int i = 0; i < storage.dictionaries_size(); ++i) {
      if (storage.dictionaries(i).id() == id) {
        // Duplicated id is found. So invalidate it to retry the generating.
        id = kInvalidDictionaryId;
        break;
      }
    }
  }

  return id;
}

// Appends a new, empty dictionary named |dictionary_name| to |storage| and
// stores its freshly generated id in |*new_dictionary_id|.
// Returns the name-validation status if the name is rejected,
// DICTIONARY_SIZE_LIMIT_EXCEEDED if the storage is full, UNKNOWN_ERROR if
// |new_dictionary_id| is null or the dictionary cannot be added, and
// USER_DICTIONARY_COMMAND_SUCCESS otherwise. |storage| is left untouched on
// every failure path.
UserDictionaryCommandStatus::Status UserDictionaryUtil::CreateDictionary(
    user_dictionary::UserDictionaryStorage *storage,
    const string &dictionary_name,
    uint64 *new_dictionary_id) {
  const UserDictionaryCommandStatus::Status status =
      ValidateDictionaryName(*storage, dictionary_name);
  if (status != UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS) {
    LOG(ERROR) << "Invalid dictionary name is passed";
    return status;
  }

  if (IsStorageFull(*storage)) {
    LOG(ERROR) << "too many dictionaries";
    return UserDictionaryCommandStatus::DICTIONARY_SIZE_LIMIT_EXCEEDED;
  }

  if (new_dictionary_id == nullptr) {
    LOG(ERROR) << "new_dictionary_id is NULL";
    return UserDictionaryCommandStatus::UNKNOWN_ERROR;
  }

  *new_dictionary_id = CreateNewDictionaryId(*storage);
  user_dictionary::UserDictionary *dictionary = storage->add_dictionaries();
  if (dictionary == nullptr) {
    LOG(ERROR) << "add_dictionaries failed.";
    return UserDictionaryCommandStatus::UNKNOWN_ERROR;
  }

  dictionary->set_id(*new_dictionary_id);
  dictionary->set_name(dictionary_name);
  return UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
}

// Removes the dictionary with |dictionary_id| from |storage|.
// If |original_index| is non-null it receives the dictionary's index before
// removal (-1 when the id is unknown). If |deleted_dictionary| is non-null
// the removed dictionary is released from the repeated field and handed over
// through it instead of being destroyed by RemoveLast().
// Returns false when no dictionary has |dictionary_id|.
bool UserDictionaryUtil::DeleteDictionary(
    user_dictionary::UserDictionaryStorage *storage,
    uint64 dictionary_id,
    int *original_index,
    user_dictionary::UserDictionary **deleted_dictionary) {
  const int index = GetUserDictionaryIndexById(*storage, dictionary_id);
  if (original_index != nullptr) {
    *original_index = index;
  }

  if (index < 0) {
    LOG(ERROR) << "Invalid dictionary id: " << dictionary_id;
    return false;
  }

  RepeatedPtrField<user_dictionary::UserDictionary> *dictionaries =
      storage->mutable_dictionaries();
  // Move the target dictionary to the end.
  std::rotate(dictionaries->pointer_begin() + index,
              dictionaries->pointer_begin() + index + 1,
              dictionaries->pointer_end());

  if (deleted_dictionary == nullptr) {
    dictionaries->RemoveLast();
  } else {
    *deleted_dictionary = dictionaries->ReleaseLast();
  }

  return true;
}

}  // namespace mozc