1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "dictionary/user_dictionary_util.h"
31
32 #include <string.h>
33 #include <algorithm>
34
35 #include "base/config_file_stream.h"
36 #include "base/file_stream.h"
37 #include "base/logging.h"
38 #include "base/protobuf/message.h"
39 #include "base/util.h"
40 #include "dictionary/user_pos_interface.h"
41
42 namespace mozc {
43
44 using ::mozc::protobuf::RepeatedPtrField;
45 using ::mozc::user_dictionary::UserDictionaryCommandStatus;
46
namespace {

// Upper bounds, in bytes, for the string fields of a UserDictionary entry.
constexpr size_t kMaxKeySize = 300;
constexpr size_t kMaxValueSize = 300;
constexpr size_t kMaxCommentSize = 300;

// Upper bound, in bytes, for the name of a dictionary.
constexpr size_t kMaxDictionaryNameSize = 300;

// Upper bounds for the number of dictionaries in a storage and for the number
// of entries in a single dictionary.
constexpr size_t kMaxDictionarySize = 100;
constexpr size_t kMaxEntrySize = 1000000;

// Characters that are rejected in values, comments and dictionary names.
constexpr char kInvalidChars[] = "\n\r\t";

// Name handed to ConfigFileStream::GetFileName() to locate the user
// dictionary file.
constexpr char kUserDictionaryFile[] = "user://user_dictionary.db";

}  // namespace
62
max_dictionary_size()63 size_t UserDictionaryUtil::max_dictionary_size() {
64 return kMaxDictionarySize;
65 }
66
max_entry_size()67 size_t UserDictionaryUtil::max_entry_size() {
68 return kMaxEntrySize;
69 }
70
IsValidEntry(const UserPOSInterface & user_pos,const user_dictionary::UserDictionary::Entry & entry)71 bool UserDictionaryUtil::IsValidEntry(
72 const UserPOSInterface &user_pos,
73 const user_dictionary::UserDictionary::Entry &entry) {
74 return ValidateEntry(entry) ==
75 UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
76 }
77
78 namespace {
79
80 #define INRANGE(w, a, b) ((w) >= (a) && (w) <= (b))
81
InternalValidateNormalizedReading(const string & reading)82 bool InternalValidateNormalizedReading(const string &reading) {
83 for (ConstChar32Iterator iter(reading); !iter.Done(); iter.Next()) {
84 const char32 c = iter.Get();
85 if (!INRANGE(c, 0x0021, 0x007E) && // Basic Latin (Ascii)
86 !INRANGE(c, 0x3041, 0x3096) && // Hiragana
87 !INRANGE(c, 0x309B, 0x309C) && // KATAKANA-HIRAGANA VOICED/SEMI-VOICED
88 // SOUND MARK
89 !INRANGE(c, 0x30FB, 0x30FC) && // Nakaten, Prolonged sound mark
90 !INRANGE(c, 0x3001, 0x3002) && // Japanese punctuation marks
91 !INRANGE(c, 0x300C, 0x300F) && // Japanese brackets
92 !INRANGE(c, 0x301C, 0x301C)) { // Japanese Wavedash
93 LOG(INFO) << "Invalid character in reading.";
94 return false;
95 }
96 }
97 return true;
98 }
99
100 #undef INRANGE
101
102 } // namespace
103
IsValidReading(const string & reading)104 bool UserDictionaryUtil::IsValidReading(const string &reading) {
105 string normalized;
106 NormalizeReading(reading, &normalized);
107 return InternalValidateNormalizedReading(normalized);
108 }
109
NormalizeReading(const string & input,string * output)110 void UserDictionaryUtil::NormalizeReading(const string &input, string *output) {
111 output->clear();
112 string tmp1, tmp2;
113 Util::FullWidthAsciiToHalfWidthAscii(input, &tmp1);
114 Util::HalfWidthKatakanaToFullWidthKatakana(tmp1, &tmp2);
115 Util::KatakanaToHiragana(tmp2, output);
116 }
117
ValidateEntry(const user_dictionary::UserDictionary::Entry & entry)118 UserDictionaryCommandStatus::Status UserDictionaryUtil::ValidateEntry(
119 const user_dictionary::UserDictionary::Entry &entry) {
120 // Validate reading.
121 const string &reading = entry.key();
122 if (reading.empty()) {
123 VLOG(1) << "key is empty";
124 return UserDictionaryCommandStatus::READING_EMPTY;
125 }
126 if (reading.size() > kMaxKeySize) {
127 VLOG(1) << "Too long key.";
128 return UserDictionaryCommandStatus::READING_TOO_LONG;
129 }
130 if (!IsValidReading(reading)) {
131 VLOG(1) << "Invalid reading";
132 return UserDictionaryCommandStatus::READING_CONTAINS_INVALID_CHARACTER;
133 }
134
135 // Validate word.
136 const string &word = entry.value();
137 if (word.empty()) {
138 return UserDictionaryCommandStatus::WORD_EMPTY;
139 }
140 if (word.size() > kMaxValueSize) {
141 VLOG(1) << "Too long value.";
142 return UserDictionaryCommandStatus::WORD_TOO_LONG;
143 }
144 if (word.find_first_of(kInvalidChars) != string::npos) {
145 VLOG(1) << "Invalid character in value.";
146 return UserDictionaryCommandStatus::WORD_CONTAINS_INVALID_CHARACTER;
147 }
148
149 // Validate comment.
150 const string &comment = entry.comment();
151 if (comment.size() > kMaxCommentSize) {
152 VLOG(1) << "Too long comment.";
153 return UserDictionaryCommandStatus::COMMENT_TOO_LONG;
154 }
155 if (comment.find_first_of(kInvalidChars) != string::npos) {
156 VLOG(1) << "Invalid character in comment.";
157 return UserDictionaryCommandStatus::COMMENT_CONTAINS_INVALID_CHARACTER;
158 }
159
160 // Validate pos.
161 if (!entry.has_pos() ||
162 !user_dictionary::UserDictionary::PosType_IsValid(entry.pos())) {
163 VLOG(1) << "Invalid POS";
164 return UserDictionaryCommandStatus::INVALID_POS_TYPE;
165 }
166
167 return UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
168 }
169
IsStorageFull(const user_dictionary::UserDictionaryStorage & storage)170 bool UserDictionaryUtil::IsStorageFull(
171 const user_dictionary::UserDictionaryStorage &storage) {
172 return storage.dictionaries_size() >= kMaxDictionarySize;
173 }
174
IsDictionaryFull(const user_dictionary::UserDictionary & dictionary)175 bool UserDictionaryUtil::IsDictionaryFull(
176 const user_dictionary::UserDictionary &dictionary) {
177 return dictionary.entries_size() >= kMaxEntrySize;
178 }
179
180 const user_dictionary::UserDictionary *
GetUserDictionaryById(const user_dictionary::UserDictionaryStorage & storage,uint64 dictionary_id)181 UserDictionaryUtil::GetUserDictionaryById(
182 const user_dictionary::UserDictionaryStorage &storage,
183 uint64 dictionary_id) {
184 int index = GetUserDictionaryIndexById(storage, dictionary_id);
185 return index >= 0 ? &storage.dictionaries(index) : NULL;
186 }
187
188 user_dictionary::UserDictionary *
GetMutableUserDictionaryById(user_dictionary::UserDictionaryStorage * storage,uint64 dictionary_id)189 UserDictionaryUtil::GetMutableUserDictionaryById(
190 user_dictionary::UserDictionaryStorage *storage, uint64 dictionary_id) {
191 int index = GetUserDictionaryIndexById(*storage, dictionary_id);
192 return index >= 0 ? storage->mutable_dictionaries(index) : NULL;
193 }
194
GetUserDictionaryIndexById(const user_dictionary::UserDictionaryStorage & storage,uint64 dictionary_id)195 int UserDictionaryUtil::GetUserDictionaryIndexById(
196 const user_dictionary::UserDictionaryStorage &storage,
197 uint64 dictionary_id) {
198 for (int i = 0; i < storage.dictionaries_size(); ++i) {
199 const user_dictionary::UserDictionary &dictionary =
200 storage.dictionaries(i);
201 if (dictionary.id() == dictionary_id) {
202 return i;
203 }
204 }
205
206 LOG(ERROR) << "Cannot find dictionary id: " << dictionary_id;
207 return -1;
208 }
209
GetUserDictionaryFileName()210 string UserDictionaryUtil::GetUserDictionaryFileName() {
211 return ConfigFileStream::GetFileName(kUserDictionaryFile);
212 }
213
214 // static
SanitizeEntry(user_dictionary::UserDictionary::Entry * entry)215 bool UserDictionaryUtil::SanitizeEntry(
216 user_dictionary::UserDictionary::Entry *entry) {
217 bool modified = false;
218 modified |= Sanitize(entry->mutable_key(), kMaxKeySize);
219 modified |= Sanitize(entry->mutable_value(), kMaxValueSize);
220 if (!user_dictionary::UserDictionary::PosType_IsValid(entry->pos())) {
221 // Fallback to NOUN.
222 entry->set_pos(user_dictionary::UserDictionary::NOUN);
223 modified = true;
224 }
225 modified |= Sanitize(entry->mutable_comment(), kMaxCommentSize);
226 return modified;
227 }
228
229 // static
Sanitize(string * str,size_t max_size)230 bool UserDictionaryUtil::Sanitize(string *str, size_t max_size) {
231 // First part: Remove invalid characters.
232 {
233 const size_t original_size = str->size();
234 string::iterator begin = str->begin();
235 string::iterator end = str->end();
236 end = std::remove(begin, end, '\t');
237 end = std::remove(begin, end, '\n');
238 end = std::remove(begin, end, '\r');
239
240 if (end - begin <= max_size) {
241 if (end - begin == original_size) {
242 return false;
243 } else {
244 str->erase(end - begin);
245 return true;
246 }
247 }
248 }
249
250 // Second part: Truncate long strings.
251 {
252 const char *begin = str->data();
253 const char *p = begin;
254 const char *end = begin + str->size();
255 while (p < end) {
256 const size_t len = Util::OneCharLen(p);
257 if ((p + len - begin) > max_size) {
258 str->erase(p - begin);
259 return true;
260 }
261 p += len;
262 }
263 LOG(FATAL) <<
264 "There should be a bug in implementation of the function.";
265 }
266
267 return true;
268 }
269
ValidateDictionaryName(const user_dictionary::UserDictionaryStorage & storage,const string & dictionary_name)270 UserDictionaryCommandStatus::Status UserDictionaryUtil::ValidateDictionaryName(
271 const user_dictionary::UserDictionaryStorage &storage,
272 const string &dictionary_name) {
273 if (dictionary_name.empty()) {
274 VLOG(1) << "Empty dictionary name.";
275 return UserDictionaryCommandStatus::DICTIONARY_NAME_EMPTY;
276 }
277 if (dictionary_name.size() > kMaxDictionaryNameSize) {
278 VLOG(1) << "Too long dictionary name";
279 return UserDictionaryCommandStatus::DICTIONARY_NAME_TOO_LONG;
280 }
281 if (dictionary_name.find_first_of(kInvalidChars) != string::npos) {
282 VLOG(1) << "Invalid character in dictionary name: " << dictionary_name;
283 return UserDictionaryCommandStatus
284 ::DICTIONARY_NAME_CONTAINS_INVALID_CHARACTER;
285 }
286 for (int i = 0; i < storage.dictionaries_size(); ++i) {
287 if (storage.dictionaries(i).name() == dictionary_name) {
288 LOG(ERROR) << "duplicated dictionary name";
289 return UserDictionaryCommandStatus::DICTIONARY_NAME_DUPLICATED;
290 }
291 }
292
293 return UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
294 }
295
namespace {
// Display strings for the POS types, indexed by the numeric value of the
// PosType enum. The index of each element must match the actual value of the
// enum; see also user_dictionary_storage.proto for the definition of the enum.
// Note that '0' is invalid in the definition, so the corresponding element is
// nullptr.
//
// Both the strings and the array slots are const: this is a read-only lookup
// table and must never be modified at runtime.
const char *const kPosTypeStringTable[] = {
  nullptr,  // 0: invalid
  "名詞",
  "短縮よみ",
  "サジェストのみ",
  "固有名詞",
  "人名",
  "姓",
  "名",
  "組織",
  "地名",
  "名詞サ変",
  "名詞形動",
  "数",
  "アルファベット",
  "記号",
  "顔文字",

  "副詞",
  "連体詞",
  "接続詞",
  "感動詞",
  "接頭語",
  "助数詞",
  "接尾一般",
  "接尾人名",
  "接尾地名",
  "動詞ワ行五段",
  "動詞カ行五段",
  "動詞サ行五段",
  "動詞タ行五段",
  "動詞ナ行五段",
  "動詞マ行五段",
  "動詞ラ行五段",
  "動詞ガ行五段",
  "動詞バ行五段",
  "動詞ハ行四段",
  "動詞一段",
  "動詞カ変",
  "動詞サ変",
  "動詞ザ変",
  "動詞ラ変",
  "形容詞",
  "終助詞",
  "句読点",
  "独立語",
  "抑制単語",
};
}  // namespace
350
GetStringPosType(user_dictionary::UserDictionary::PosType pos_type)351 const char* UserDictionaryUtil::GetStringPosType(
352 user_dictionary::UserDictionary::PosType pos_type) {
353 if (user_dictionary::UserDictionary::PosType_IsValid(pos_type)) {
354 return kPosTypeStringTable[pos_type];
355 }
356 return NULL;
357 }
358
ToPosType(const char * string_pos_type)359 user_dictionary::UserDictionary::PosType UserDictionaryUtil::ToPosType(
360 const char *string_pos_type) {
361 // Skip the element at 0.
362 for (int i = 1; i < arraysize(kPosTypeStringTable); ++i) {
363 if (strcmp(kPosTypeStringTable[i], string_pos_type) == 0) {
364 return static_cast<user_dictionary::UserDictionary::PosType>(i);
365 }
366 }
367
368 // Not found. Return invalid value.
369 return static_cast<user_dictionary::UserDictionary::PosType>(-1);
370 }
371
CreateNewDictionaryId(const user_dictionary::UserDictionaryStorage & storage)372 uint64 UserDictionaryUtil::CreateNewDictionaryId(
373 const user_dictionary::UserDictionaryStorage &storage) {
374 static const uint64 kInvalidDictionaryId = 0;
375
376 uint64 id = kInvalidDictionaryId;
377 while (id == kInvalidDictionaryId) {
378 Util::GetRandomSequence(reinterpret_cast<char *>(&id), sizeof(id));
379
380 #ifdef OS_NACL
381 // Because JavaScript does not support uint64, we downsize the dictionary id
382 // range from uint64 to uint32 in NaCl.
383 id = static_cast<uint32>(id);
384 #endif // OS_NACL
385
386 // Duplication check.
387 for (int i = 0; i < storage.dictionaries_size(); ++i) {
388 if (storage.dictionaries(i).id() == id) {
389 // Duplicated id is found. So invalidate it to retry the generating.
390 id = kInvalidDictionaryId;
391 break;
392 }
393 }
394 }
395
396 return id;
397 }
398
CreateDictionary(user_dictionary::UserDictionaryStorage * storage,const string & dictionary_name,uint64 * new_dictionary_id)399 UserDictionaryCommandStatus::Status UserDictionaryUtil::CreateDictionary(
400 user_dictionary::UserDictionaryStorage *storage,
401 const string &dictionary_name,
402 uint64 *new_dictionary_id) {
403 UserDictionaryCommandStatus::Status status =
404 ValidateDictionaryName(*storage, dictionary_name);
405 if (status != UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS) {
406 LOG(ERROR) << "Invalid dictionary name is passed";
407 return status;
408 }
409
410 if (IsStorageFull(*storage)) {
411 LOG(ERROR) << "too many dictionaries";
412 return UserDictionaryCommandStatus::DICTIONARY_SIZE_LIMIT_EXCEEDED;
413 }
414
415 if (new_dictionary_id == NULL) {
416 LOG(ERROR) << "new_dictionary_id is NULL";
417 return UserDictionaryCommandStatus::UNKNOWN_ERROR;
418 }
419
420 *new_dictionary_id = CreateNewDictionaryId(*storage);
421 user_dictionary::UserDictionary* dictionary = storage->add_dictionaries();
422 if (dictionary == NULL) {
423 LOG(ERROR) << "add_dictionaries failed.";
424 return UserDictionaryCommandStatus::UNKNOWN_ERROR;
425 }
426
427 dictionary->set_id(*new_dictionary_id);
428 dictionary->set_name(dictionary_name);
429 return UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
430 }
431
DeleteDictionary(user_dictionary::UserDictionaryStorage * storage,uint64 dictionary_id,int * original_index,user_dictionary::UserDictionary ** deleted_dictionary)432 bool UserDictionaryUtil::DeleteDictionary(
433 user_dictionary::UserDictionaryStorage *storage,
434 uint64 dictionary_id,
435 int *original_index,
436 user_dictionary::UserDictionary **deleted_dictionary) {
437 const int index = GetUserDictionaryIndexById(*storage, dictionary_id);
438 if (original_index != NULL) {
439 *original_index = index;
440 }
441
442 if (index < 0) {
443 LOG(ERROR) << "Invalid dictionary id: " << dictionary_id;
444 return false;
445 }
446
447 RepeatedPtrField<user_dictionary::UserDictionary> *dictionaries =
448 storage->mutable_dictionaries();
449 // Move the target dictionary to the end.
450 std::rotate(dictionaries->pointer_begin() + index,
451 dictionaries->pointer_begin() + index + 1,
452 dictionaries->pointer_end());
453
454 if (deleted_dictionary == NULL) {
455 dictionaries->RemoveLast();
456 } else {
457 *deleted_dictionary = dictionaries->ReleaseLast();
458 }
459
460 return true;
461 }
462
463 } // namespace mozc
464