1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "dictionary/user_dictionary_util.h"
31 
32 #include <string.h>
33 #include <algorithm>
34 
35 #include "base/config_file_stream.h"
36 #include "base/file_stream.h"
37 #include "base/logging.h"
38 #include "base/protobuf/message.h"
39 #include "base/util.h"
40 #include "dictionary/user_pos_interface.h"
41 
42 namespace mozc {
43 
44 using ::mozc::protobuf::RepeatedPtrField;
45 using ::mozc::user_dictionary::UserDictionaryCommandStatus;
46 
namespace {
// Size limits for the string fields of a user dictionary entry.  They are
// compared against string::size(), so they are byte counts.
constexpr size_t kMaxKeySize = 300;
constexpr size_t kMaxValueSize = 300;
constexpr size_t kMaxCommentSize = 300;

// Size limit for a dictionary name, measured the same way.
constexpr size_t kMaxDictionaryNameSize = 300;

// Characters rejected in a word, a comment and a dictionary name, and
// stripped by Sanitize().
constexpr char kInvalidChars[] = "\n\r\t";

// Name handed to ConfigFileStream to locate the user dictionary file.
constexpr char kUserDictionaryFile[] = "user://user_dictionary.db";

// Maximum number of dictionaries in a storage, and of entries in a
// dictionary.
constexpr size_t kMaxDictionarySize = 100;
constexpr size_t kMaxEntrySize = 1000000;
}  // namespace
62 
max_dictionary_size()63 size_t UserDictionaryUtil::max_dictionary_size() {
64   return kMaxDictionarySize;
65 }
66 
max_entry_size()67 size_t UserDictionaryUtil::max_entry_size() {
68   return kMaxEntrySize;
69 }
70 
IsValidEntry(const UserPOSInterface & user_pos,const user_dictionary::UserDictionary::Entry & entry)71 bool UserDictionaryUtil::IsValidEntry(
72     const UserPOSInterface &user_pos,
73     const user_dictionary::UserDictionary::Entry &entry) {
74   return ValidateEntry(entry) ==
75       UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
76 }
77 
78 namespace {
79 
80 #define INRANGE(w, a, b) ((w) >= (a) && (w) <= (b))
81 
InternalValidateNormalizedReading(const string & reading)82 bool InternalValidateNormalizedReading(const string &reading) {
83   for (ConstChar32Iterator iter(reading); !iter.Done(); iter.Next()) {
84     const char32 c = iter.Get();
85     if (!INRANGE(c, 0x0021, 0x007E) &&  // Basic Latin (Ascii)
86         !INRANGE(c, 0x3041, 0x3096) &&  // Hiragana
87         !INRANGE(c, 0x309B, 0x309C) &&  // KATAKANA-HIRAGANA VOICED/SEMI-VOICED
88                                         // SOUND MARK
89         !INRANGE(c, 0x30FB, 0x30FC) &&  // Nakaten, Prolonged sound mark
90         !INRANGE(c, 0x3001, 0x3002) &&  // Japanese punctuation marks
91         !INRANGE(c, 0x300C, 0x300F) &&  // Japanese brackets
92         !INRANGE(c, 0x301C, 0x301C)) {  // Japanese Wavedash
93       LOG(INFO) << "Invalid character in reading.";
94       return false;
95     }
96   }
97   return true;
98 }
99 
100 #undef INRANGE
101 
102 }  // namespace
103 
IsValidReading(const string & reading)104 bool UserDictionaryUtil::IsValidReading(const string &reading) {
105   string normalized;
106   NormalizeReading(reading, &normalized);
107   return InternalValidateNormalizedReading(normalized);
108 }
109 
NormalizeReading(const string & input,string * output)110 void UserDictionaryUtil::NormalizeReading(const string &input, string *output) {
111   output->clear();
112   string tmp1, tmp2;
113   Util::FullWidthAsciiToHalfWidthAscii(input, &tmp1);
114   Util::HalfWidthKatakanaToFullWidthKatakana(tmp1, &tmp2);
115   Util::KatakanaToHiragana(tmp2, output);
116 }
117 
ValidateEntry(const user_dictionary::UserDictionary::Entry & entry)118 UserDictionaryCommandStatus::Status UserDictionaryUtil::ValidateEntry(
119     const user_dictionary::UserDictionary::Entry &entry) {
120   // Validate reading.
121   const string &reading = entry.key();
122   if (reading.empty()) {
123     VLOG(1) << "key is empty";
124     return UserDictionaryCommandStatus::READING_EMPTY;
125   }
126   if (reading.size() > kMaxKeySize) {
127     VLOG(1) << "Too long key.";
128     return UserDictionaryCommandStatus::READING_TOO_LONG;
129   }
130   if (!IsValidReading(reading)) {
131     VLOG(1) << "Invalid reading";
132     return UserDictionaryCommandStatus::READING_CONTAINS_INVALID_CHARACTER;
133   }
134 
135   // Validate word.
136   const string &word = entry.value();
137   if (word.empty()) {
138     return UserDictionaryCommandStatus::WORD_EMPTY;
139   }
140   if (word.size() > kMaxValueSize) {
141     VLOG(1) << "Too long value.";
142     return UserDictionaryCommandStatus::WORD_TOO_LONG;
143   }
144   if (word.find_first_of(kInvalidChars) != string::npos) {
145     VLOG(1) << "Invalid character in value.";
146     return UserDictionaryCommandStatus::WORD_CONTAINS_INVALID_CHARACTER;
147   }
148 
149   // Validate comment.
150   const string &comment = entry.comment();
151   if (comment.size() > kMaxCommentSize) {
152     VLOG(1) << "Too long comment.";
153     return UserDictionaryCommandStatus::COMMENT_TOO_LONG;
154   }
155   if (comment.find_first_of(kInvalidChars) != string::npos) {
156     VLOG(1) << "Invalid character in comment.";
157     return UserDictionaryCommandStatus::COMMENT_CONTAINS_INVALID_CHARACTER;
158   }
159 
160   // Validate pos.
161   if (!entry.has_pos() ||
162       !user_dictionary::UserDictionary::PosType_IsValid(entry.pos())) {
163     VLOG(1) << "Invalid POS";
164     return UserDictionaryCommandStatus::INVALID_POS_TYPE;
165   }
166 
167   return UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
168 }
169 
IsStorageFull(const user_dictionary::UserDictionaryStorage & storage)170 bool UserDictionaryUtil::IsStorageFull(
171     const user_dictionary::UserDictionaryStorage &storage) {
172   return storage.dictionaries_size() >= kMaxDictionarySize;
173 }
174 
IsDictionaryFull(const user_dictionary::UserDictionary & dictionary)175 bool UserDictionaryUtil::IsDictionaryFull(
176     const user_dictionary::UserDictionary &dictionary) {
177   return dictionary.entries_size() >= kMaxEntrySize;
178 }
179 
180 const user_dictionary::UserDictionary *
GetUserDictionaryById(const user_dictionary::UserDictionaryStorage & storage,uint64 dictionary_id)181 UserDictionaryUtil::GetUserDictionaryById(
182     const user_dictionary::UserDictionaryStorage &storage,
183     uint64 dictionary_id) {
184   int index = GetUserDictionaryIndexById(storage, dictionary_id);
185   return index >= 0 ? &storage.dictionaries(index) : NULL;
186 }
187 
188 user_dictionary::UserDictionary *
GetMutableUserDictionaryById(user_dictionary::UserDictionaryStorage * storage,uint64 dictionary_id)189 UserDictionaryUtil::GetMutableUserDictionaryById(
190     user_dictionary::UserDictionaryStorage *storage, uint64 dictionary_id) {
191   int index = GetUserDictionaryIndexById(*storage, dictionary_id);
192   return index >= 0 ? storage->mutable_dictionaries(index) : NULL;
193 }
194 
GetUserDictionaryIndexById(const user_dictionary::UserDictionaryStorage & storage,uint64 dictionary_id)195 int UserDictionaryUtil::GetUserDictionaryIndexById(
196     const user_dictionary::UserDictionaryStorage &storage,
197     uint64 dictionary_id) {
198   for (int i = 0; i < storage.dictionaries_size(); ++i) {
199     const user_dictionary::UserDictionary &dictionary =
200         storage.dictionaries(i);
201     if (dictionary.id() == dictionary_id) {
202       return i;
203     }
204   }
205 
206   LOG(ERROR) << "Cannot find dictionary id: " << dictionary_id;
207   return -1;
208 }
209 
GetUserDictionaryFileName()210 string UserDictionaryUtil::GetUserDictionaryFileName() {
211   return ConfigFileStream::GetFileName(kUserDictionaryFile);
212 }
213 
214 // static
SanitizeEntry(user_dictionary::UserDictionary::Entry * entry)215 bool UserDictionaryUtil::SanitizeEntry(
216     user_dictionary::UserDictionary::Entry *entry) {
217   bool modified = false;
218   modified |= Sanitize(entry->mutable_key(), kMaxKeySize);
219   modified |= Sanitize(entry->mutable_value(), kMaxValueSize);
220   if (!user_dictionary::UserDictionary::PosType_IsValid(entry->pos())) {
221     // Fallback to NOUN.
222     entry->set_pos(user_dictionary::UserDictionary::NOUN);
223     modified = true;
224   }
225   modified |= Sanitize(entry->mutable_comment(), kMaxCommentSize);
226   return modified;
227 }
228 
229 // static
Sanitize(string * str,size_t max_size)230 bool UserDictionaryUtil::Sanitize(string *str, size_t max_size) {
231   // First part: Remove invalid characters.
232   {
233     const size_t original_size = str->size();
234     string::iterator begin = str->begin();
235     string::iterator end = str->end();
236     end = std::remove(begin, end, '\t');
237     end = std::remove(begin, end, '\n');
238     end = std::remove(begin, end, '\r');
239 
240     if (end - begin <= max_size) {
241       if (end - begin == original_size) {
242         return false;
243       } else {
244         str->erase(end - begin);
245         return true;
246       }
247     }
248   }
249 
250   // Second part: Truncate long strings.
251   {
252     const char *begin = str->data();
253     const char *p = begin;
254     const char *end = begin + str->size();
255     while (p < end) {
256       const size_t len = Util::OneCharLen(p);
257       if ((p + len - begin) > max_size) {
258         str->erase(p - begin);
259         return true;
260       }
261       p += len;
262     }
263     LOG(FATAL) <<
264         "There should be a bug in implementation of the function.";
265   }
266 
267   return true;
268 }
269 
ValidateDictionaryName(const user_dictionary::UserDictionaryStorage & storage,const string & dictionary_name)270 UserDictionaryCommandStatus::Status UserDictionaryUtil::ValidateDictionaryName(
271     const user_dictionary::UserDictionaryStorage &storage,
272     const string &dictionary_name) {
273   if (dictionary_name.empty()) {
274     VLOG(1) << "Empty dictionary name.";
275     return UserDictionaryCommandStatus::DICTIONARY_NAME_EMPTY;
276   }
277   if (dictionary_name.size() > kMaxDictionaryNameSize) {
278     VLOG(1) << "Too long dictionary name";
279     return UserDictionaryCommandStatus::DICTIONARY_NAME_TOO_LONG;
280   }
281   if (dictionary_name.find_first_of(kInvalidChars) != string::npos) {
282     VLOG(1) << "Invalid character in dictionary name: " << dictionary_name;
283     return UserDictionaryCommandStatus
284         ::DICTIONARY_NAME_CONTAINS_INVALID_CHARACTER;
285   }
286   for (int i = 0; i < storage.dictionaries_size(); ++i) {
287     if (storage.dictionaries(i).name() == dictionary_name) {
288       LOG(ERROR) << "duplicated dictionary name";
289       return UserDictionaryCommandStatus::DICTIONARY_NAME_DUPLICATED;
290     }
291   }
292 
293   return UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
294 }
295 
namespace {
// Display string for each PosType value, indexed by the numeric value of the
// enum.  See also user_dictionary_storage.proto for the definition of the
// enum.  Note that the '0' is invalid in the definition, so the corresponding
// element is nullptr.
// Both the characters and the pointers are const so that the table cannot be
// modified at run time.
const char *const kPosTypeStringTable[] = {
  nullptr,
  "名詞",
  "短縮よみ",
  "サジェストのみ",
  "固有名詞",
  "人名",
  "姓",
  "名",
  "組織",
  "地名",
  "名詞サ変",
  "名詞形動",
  "数",
  "アルファベット",
  "記号",
  "顔文字",

  "副詞",
  "連体詞",
  "接続詞",
  "感動詞",
  "接頭語",
  "助数詞",
  "接尾一般",
  "接尾人名",
  "接尾地名",
  "動詞ワ行五段",
  "動詞カ行五段",
  "動詞サ行五段",
  "動詞タ行五段",
  "動詞ナ行五段",
  "動詞マ行五段",
  "動詞ラ行五段",
  "動詞ガ行五段",
  "動詞バ行五段",
  "動詞ハ行四段",
  "動詞一段",
  "動詞カ変",
  "動詞サ変",
  "動詞ザ変",
  "動詞ラ変",
  "形容詞",
  "終助詞",
  "句読点",
  "独立語",
  "抑制単語",
};
}  // namespace
350 
GetStringPosType(user_dictionary::UserDictionary::PosType pos_type)351 const char* UserDictionaryUtil::GetStringPosType(
352     user_dictionary::UserDictionary::PosType pos_type) {
353   if (user_dictionary::UserDictionary::PosType_IsValid(pos_type)) {
354     return kPosTypeStringTable[pos_type];
355   }
356   return NULL;
357 }
358 
ToPosType(const char * string_pos_type)359 user_dictionary::UserDictionary::PosType UserDictionaryUtil::ToPosType(
360     const char *string_pos_type) {
361   // Skip the element at 0.
362   for (int i = 1; i < arraysize(kPosTypeStringTable); ++i) {
363     if (strcmp(kPosTypeStringTable[i], string_pos_type) == 0) {
364       return static_cast<user_dictionary::UserDictionary::PosType>(i);
365     }
366   }
367 
368   // Not found. Return invalid value.
369   return static_cast<user_dictionary::UserDictionary::PosType>(-1);
370 }
371 
CreateNewDictionaryId(const user_dictionary::UserDictionaryStorage & storage)372 uint64 UserDictionaryUtil::CreateNewDictionaryId(
373     const user_dictionary::UserDictionaryStorage &storage) {
374   static const uint64 kInvalidDictionaryId = 0;
375 
376   uint64 id = kInvalidDictionaryId;
377   while (id == kInvalidDictionaryId) {
378     Util::GetRandomSequence(reinterpret_cast<char *>(&id), sizeof(id));
379 
380 #ifdef OS_NACL
381     // Because JavaScript does not support uint64, we downsize the dictionary id
382     // range from uint64 to uint32 in NaCl.
383     id = static_cast<uint32>(id);
384 #endif  // OS_NACL
385 
386     // Duplication check.
387     for (int i = 0; i < storage.dictionaries_size(); ++i) {
388       if (storage.dictionaries(i).id() == id) {
389         // Duplicated id is found. So invalidate it to retry the generating.
390         id = kInvalidDictionaryId;
391         break;
392       }
393     }
394   }
395 
396   return id;
397 }
398 
CreateDictionary(user_dictionary::UserDictionaryStorage * storage,const string & dictionary_name,uint64 * new_dictionary_id)399 UserDictionaryCommandStatus::Status UserDictionaryUtil::CreateDictionary(
400     user_dictionary::UserDictionaryStorage *storage,
401     const string &dictionary_name,
402     uint64 *new_dictionary_id) {
403   UserDictionaryCommandStatus::Status status =
404       ValidateDictionaryName(*storage, dictionary_name);
405   if (status != UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS) {
406     LOG(ERROR) << "Invalid dictionary name is passed";
407     return status;
408   }
409 
410   if (IsStorageFull(*storage)) {
411     LOG(ERROR) << "too many dictionaries";
412     return UserDictionaryCommandStatus::DICTIONARY_SIZE_LIMIT_EXCEEDED;
413   }
414 
415   if (new_dictionary_id == NULL) {
416     LOG(ERROR) << "new_dictionary_id is NULL";
417     return UserDictionaryCommandStatus::UNKNOWN_ERROR;
418   }
419 
420   *new_dictionary_id = CreateNewDictionaryId(*storage);
421   user_dictionary::UserDictionary* dictionary = storage->add_dictionaries();
422   if (dictionary == NULL) {
423     LOG(ERROR) << "add_dictionaries failed.";
424     return UserDictionaryCommandStatus::UNKNOWN_ERROR;
425   }
426 
427   dictionary->set_id(*new_dictionary_id);
428   dictionary->set_name(dictionary_name);
429   return UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
430 }
431 
DeleteDictionary(user_dictionary::UserDictionaryStorage * storage,uint64 dictionary_id,int * original_index,user_dictionary::UserDictionary ** deleted_dictionary)432 bool UserDictionaryUtil::DeleteDictionary(
433     user_dictionary::UserDictionaryStorage *storage,
434     uint64 dictionary_id,
435     int *original_index,
436     user_dictionary::UserDictionary **deleted_dictionary) {
437   const int index = GetUserDictionaryIndexById(*storage, dictionary_id);
438   if (original_index != NULL) {
439     *original_index = index;
440   }
441 
442   if (index < 0) {
443     LOG(ERROR) << "Invalid dictionary id: " << dictionary_id;
444     return false;
445   }
446 
447   RepeatedPtrField<user_dictionary::UserDictionary> *dictionaries =
448       storage->mutable_dictionaries();
449   // Move the target dictionary to the end.
450   std::rotate(dictionaries->pointer_begin() + index,
451               dictionaries->pointer_begin() + index + 1,
452               dictionaries->pointer_end());
453 
454   if (deleted_dictionary == NULL) {
455     dictionaries->RemoveLast();
456   } else {
457     *deleted_dictionary = dictionaries->ReleaseLast();
458   }
459 
460   return true;
461 }
462 
463 }  // namespace mozc
464