1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "dictionary/system/codec.h"
31
32 #include <memory>
33 #include <string>
34 #include <vector>
35
36 #include "base/logging.h"
37 #include "base/util.h"
38 #include "dictionary/dictionary_token.h"
39 #include "dictionary/system/codec_interface.h"
40 #include "dictionary/system/words_info.h"
41 #include "testing/base/public/googletest.h"
42 #include "testing/base/public/gunit.h"
43
44 namespace mozc {
45 namespace dictionary {
46 namespace {
47
48 using std::unique_ptr;
49
MakeAssertResult(bool success,char32 c,const char * message)50 ::testing::AssertionResult MakeAssertResult(
51 bool success, char32 c, const char *message) {
52 if (success) {
53 return ::testing::AssertionSuccess();
54 }
55 return ::testing::AssertionFailure()
56 << message << " c = " << Util::StringPrintf("U+%05X", c);
57 }
58
IsExpectedEncodedSize(char32 c,const string & encoded)59 ::testing::AssertionResult IsExpectedEncodedSize(
60 char32 c, const string &encoded) {
61 const string::size_type size = encoded.size();
62 if (c == 0x00) {
63 return ::testing::AssertionFailure() << "NUL is not supported.";
64 }
65 if (c <= 0xff) {
66 return MakeAssertResult(size == 2, c,
67 "U+00?? (ASCII) should be encoded into 2 bytes.");
68 }
69 if (0x10000 <= c && c <= 0x10ffff) {
70 if ((c & 0xffff) == 0) {
71 return MakeAssertResult(
72 size == 2, c, "U+?0000 and U+100000 are encoded into 2 bytes.");
73 }
74 if ((c & 0xff) == 0) {
75 return MakeAssertResult(
76 size == 3, c, "U+???00 and U+10??00 are encoded into 3 bytes.");
77 }
78 if (((c & 0xff00) >> 8) == 0) {
79 return MakeAssertResult(
80 size == 3, c, "U+?00?? and U+1000?? are encoded into 3 bytes.");
81 }
82 return MakeAssertResult(
83 size == 4, c,
84 "[U+10000, U+10FFFF] except for U+???00, U+?00??, U+10??00 and "
85 "U+1000?? should be encoded into 4 bytes.");
86 }
87 if (0x10ffff < c) {
88 return MakeAssertResult(
89 false, c, "U+110000 and greater are not supported.");
90 }
91 if (0xffff < c) {
92 return MakeAssertResult(false, c, "Should not reach here.");
93 }
94
95 // Hereafter, |c| should be represented as 0x????
96 const uint16 s = static_cast<uint16>(c);
97 if ((s & 0xff) == 0) {
98 return MakeAssertResult(size == 2, c, "U+??00 are encoded into 2 bytes.");
99 }
100 if (0x3041 <= s && s < 0x3095) {
101 return MakeAssertResult(
102 size == 1, c, "Hiragana(85 characters) are encoded into 1 byte.");
103 }
104 if (0x30a1 <= s && s < 0x30fd) {
105 return MakeAssertResult(
106 size == 1, c, "Katakana (92 characters) are encoded into 1 byte.");
107 }
108 if (0x4e00 <= s && s < 0x9800) {
109 return MakeAssertResult(size == 2, c,
110 "Frequent Kanji and others (74*256 characters) "
111 "are encoded into 2 bytes.");
112 }
113 return MakeAssertResult(size == 3, c,
114 "Other charaters should be encoded into 3bytes.");
115 }
116
117
118 } // namespace
119
120 class SystemDictionaryCodecTest : public ::testing::Test {
121 protected:
SetUp()122 virtual void SetUp() {
123 SystemDictionaryCodecFactory::SetCodec(NULL);
124 ResetAllTokens();
125 }
126
TearDown()127 virtual void TearDown() {
128 SystemDictionaryCodecFactory::SetCodec(NULL);
129 ResetAllTokens();
130 }
131
ResetAllTokens()132 void ResetAllTokens() {
133 ClearTokens(&source_tokens_);
134 ClearTokens(&decoded_tokens_);
135 }
136
ClearTokens(std::vector<TokenInfo> * tokens) const137 void ClearTokens(std::vector<TokenInfo> *tokens) const {
138 for (size_t i = 0; i < tokens->size(); ++i) {
139 delete tokens->at(i).token;
140 }
141 tokens->clear();
142 }
143
InitTokens(int size)144 void InitTokens(int size) {
145 for (size_t i = 0; i < size; ++i) {
146 Token *t = new Token();
147 TokenInfo token_info(t);
148 token_info.id_in_value_trie = 0;
149 source_tokens_.push_back(token_info);
150 }
151 }
152
SetDefaultPos(TokenInfo * token_info) const153 void SetDefaultPos(TokenInfo *token_info) const {
154 CHECK(token_info);
155 token_info->pos_type = TokenInfo::DEFAULT_POS;
156 // set id randomly
157 const int id = Util::Random(50);
158 token_info->token->lid = id;
159 token_info->token->rid = (Util::Random(2) == 0) ? id : id + 1;
160 }
161
SetFrequentPos(TokenInfo * token_info) const162 void SetFrequentPos(TokenInfo *token_info) const {
163 CHECK(token_info);
164 token_info->pos_type = TokenInfo::FREQUENT_POS;
165 // set id randomly
166 const int id = Util::Random(256);
167 token_info->id_in_frequent_pos_map = id;
168 }
169
SetSamePos(TokenInfo * token_info) const170 void SetSamePos(TokenInfo *token_info) const {
171 CHECK(token_info);
172 token_info->pos_type = TokenInfo::SAME_AS_PREV_POS;
173 }
174
SetRandPos()175 void SetRandPos() {
176 for (size_t i = 0; i < source_tokens_.size(); ++i) {
177 CHECK(source_tokens_[i].token);
178 int n = Util::Random(TokenInfo::POS_TYPE_SIZE);
179 CHECK_GE(n, 0);
180 CHECK_LT(n, TokenInfo::POS_TYPE_SIZE);
181 if (i == 0 && n == 2) {
182 // First token cannot be the same pos.
183 n = 0;
184 }
185
186 if (n == 0) {
187 SetDefaultPos(&source_tokens_[i]);
188 } else if (n == 1) {
189 SetFrequentPos(&source_tokens_[i]);
190 } else if (n == 2) {
191 SetSamePos(&source_tokens_[i]);
192 } else {
193 FAIL();
194 }
195 }
196 }
197
SetDefaultCost(TokenInfo * token_info) const198 void SetDefaultCost(TokenInfo *token_info) const {
199 CHECK(token_info);
200 token_info->cost_type = TokenInfo::DEFAULT_COST;
201 // set cost randomly
202 const int cost = Util::Random(8000);
203 token_info->token->cost = cost;
204 }
205
SetSmallCost(TokenInfo * token_info) const206 void SetSmallCost(TokenInfo *token_info) const {
207 CHECK(token_info);
208 token_info->cost_type = TokenInfo::CAN_USE_SMALL_ENCODING;
209 // set cost randomly
210 const int cost = Util::Random(8000);
211 token_info->token->cost = cost;
212 }
213
SetRandCost()214 void SetRandCost() {
215 for (size_t i = 0; i < source_tokens_.size(); ++i) {
216 CHECK(source_tokens_[i].token);
217 int n = Util::Random(TokenInfo::COST_TYPE_SIZE);
218 CHECK_GE(n, 0);
219 CHECK_LT(n, TokenInfo::POS_TYPE_SIZE);
220 if (n == 0) {
221 SetDefaultCost(&source_tokens_[i]);
222 } else if (n == 1) {
223 SetSmallCost(&source_tokens_[i]);
224 }
225 }
226 }
227
SetDefaultValue(TokenInfo * token_info) const228 void SetDefaultValue(TokenInfo *token_info) const {
229 CHECK(token_info);
230 token_info->value_type = TokenInfo::DEFAULT_VALUE;
231 // set id randomly
232 const int id = Util::Random(50000);
233 token_info->id_in_value_trie = id;
234 }
235
SetSameValue(TokenInfo * token_info) const236 void SetSameValue(TokenInfo *token_info) const {
237 CHECK(token_info);
238 token_info->value_type = TokenInfo::SAME_AS_PREV_VALUE;
239 }
240
SetRandValue()241 void SetRandValue() {
242 for (size_t i = 0; i < source_tokens_.size(); ++i) {
243 CHECK(source_tokens_[i].token);
244 int n = Util::Random(TokenInfo::VALUE_TYPE_SIZE);
245 CHECK_GE(n, 0);
246 CHECK_LT(n, TokenInfo::VALUE_TYPE_SIZE);
247 if (i == 0 && n == 1) {
248 // first token cannot be the same as before.
249 n = 0;
250 }
251 if (n == 0) {
252 SetDefaultValue(&source_tokens_[i]);
253 } else if (n == 1) {
254 SetSameValue(&source_tokens_[i]);
255 } else if (n == 2) {
256 source_tokens_[i].value_type = TokenInfo::AS_IS_HIRAGANA;
257 } else if (n == 3) {
258 source_tokens_[i].value_type = TokenInfo::AS_IS_KATAKANA;
259 }
260 }
261 }
262
SetRandLabel()263 void SetRandLabel() {
264 for (size_t i = 0; i < source_tokens_.size(); ++i) {
265 CHECK(source_tokens_[i].token);
266 int n = Util::Random(Token::LABEL_SIZE);
267 CHECK_GE(n, 0);
268 CHECK_LT(n, Token::LABEL_SIZE);
269 if (n == 0) {
270 source_tokens_[i].token->attributes = Token::NONE;
271 } else if (n == 1) {
272 source_tokens_[i].token->attributes = Token::SPELLING_CORRECTION;
273 }
274 }
275 }
276
CheckDecoded() const277 void CheckDecoded() const {
278 EXPECT_EQ(source_tokens_.size(), decoded_tokens_.size());
279 for (size_t i = 0; i < source_tokens_.size(); ++i) {
280 EXPECT_TRUE(source_tokens_[i].token != NULL);
281 EXPECT_TRUE(decoded_tokens_[i].token != NULL);
282
283 EXPECT_EQ(source_tokens_[i].token->attributes,
284 decoded_tokens_[i].token->attributes);
285
286 EXPECT_EQ(source_tokens_[i].pos_type, decoded_tokens_[i].pos_type);
287 if (source_tokens_[i].pos_type == TokenInfo::DEFAULT_POS) {
288 EXPECT_EQ(source_tokens_[i].token->lid, decoded_tokens_[i].token->lid);
289 EXPECT_EQ(source_tokens_[i].token->rid, decoded_tokens_[i].token->rid);
290 } else if (source_tokens_[i].pos_type == TokenInfo::FREQUENT_POS) {
291 EXPECT_EQ(source_tokens_[i].id_in_frequent_pos_map,
292 decoded_tokens_[i].id_in_frequent_pos_map);
293 }
294
295 if (source_tokens_[i].cost_type == TokenInfo::DEFAULT_COST) {
296 EXPECT_EQ(source_tokens_[i].token->cost,
297 decoded_tokens_[i].token->cost);
298 } else { // small cost
299 EXPECT_NEAR(source_tokens_[i].token->cost,
300 decoded_tokens_[i].token->cost,
301 256);
302 }
303
304 EXPECT_EQ(source_tokens_[i].value_type, decoded_tokens_[i].value_type);
305 if (source_tokens_[i].value_type == TokenInfo::DEFAULT_VALUE) {
306 EXPECT_EQ(source_tokens_[i].id_in_value_trie,
307 decoded_tokens_[i].id_in_value_trie);
308 }
309 }
310 }
311
312 std::vector<TokenInfo> source_tokens_;
313 std::vector<TokenInfo> decoded_tokens_;
314 };
315
316 class SystemDictionaryCodecMock : public SystemDictionaryCodecInterface {
317 public:
GetSectionNameForKey() const318 const string GetSectionNameForKey() const { return "Mock"; }
GetSectionNameForValue() const319 const string GetSectionNameForValue() const { return "Mock"; }
GetSectionNameForTokens() const320 const string GetSectionNameForTokens() const { return "Mock"; }
GetSectionNameForPos() const321 const string GetSectionNameForPos() const { return "Mock"; }
EncodeKey(const StringPiece src,string * dst) const322 virtual void EncodeKey(const StringPiece src, string *dst) const {}
DecodeKey(const StringPiece src,string * dst) const323 virtual void DecodeKey(const StringPiece src, string *dst) const {}
GetEncodedKeyLength(const StringPiece src) const324 virtual size_t GetEncodedKeyLength(const StringPiece src) const { return 0; }
GetDecodedKeyLength(const StringPiece src) const325 virtual size_t GetDecodedKeyLength(const StringPiece src) const { return 0; }
EncodeValue(const StringPiece src,string * dst) const326 virtual void EncodeValue(const StringPiece src, string *dst) const {}
DecodeValue(const StringPiece src,string * dst) const327 virtual void DecodeValue(const StringPiece src, string *dst) const {}
EncodeTokens(const std::vector<TokenInfo> & tokens,string * output) const328 virtual void EncodeTokens(
329 const std::vector<TokenInfo> &tokens, string *output) const {}
DecodeTokens(const uint8 * ptr,std::vector<TokenInfo> * tokens) const330 virtual void DecodeTokens(
331 const uint8 *ptr, std::vector<TokenInfo> *tokens) const {}
DecodeToken(const uint8 * ptr,TokenInfo * token_info,int * read_bytes) const332 virtual bool DecodeToken(
333 const uint8 *ptr, TokenInfo *token_info, int *read_bytes) const {
334 *read_bytes = 0;
335 return false;
336 }
ReadTokenForReverseLookup(const uint8 * ptr,int * value_id,int * read_bytes) const337 virtual bool ReadTokenForReverseLookup(
338 const uint8 *ptr, int *value_id, int *read_bytes) const { return false; }
GetTokensTerminationFlag() const339 virtual uint8 GetTokensTerminationFlag() const { return 0xff; }
340 };
341
// A codec installed through SetCodec() must be the one GetCodec() returns;
// the mock is recognised by its "Mock" section name.
TEST_F(SystemDictionaryCodecTest, FactoryTest) {
  unique_ptr<SystemDictionaryCodecMock> mock_codec(
      new SystemDictionaryCodecMock);
  SystemDictionaryCodecFactory::SetCodec(mock_codec.get());
  SystemDictionaryCodecInterface *installed =
      SystemDictionaryCodecFactory::GetCodec();
  EXPECT_EQ("Mock", installed->GetSectionNameForKey());
}
349
// Round-trips a two-character hiragana key and checks both length queries
// against the actual encoded and decoded sizes.
TEST_F(SystemDictionaryCodecTest, KeyCodecKanaTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  const string key = "よみ";

  string encoded_key;
  codec->EncodeKey(key, &encoded_key);
  // Two hiragana at one byte each.
  EXPECT_EQ(2, encoded_key.size());
  EXPECT_EQ(encoded_key.size(), codec->GetEncodedKeyLength(key));

  string decoded_key;
  codec->DecodeKey(encoded_key, &decoded_key);
  EXPECT_EQ(key, decoded_key);
  EXPECT_EQ(decoded_key.size(), codec->GetDecodedKeyLength(encoded_key));
}
364
365
// Round-trips a key made of the middle dot and the prolonged sound mark and
// checks both length queries against the actual sizes.
TEST_F(SystemDictionaryCodecTest, KeyCodecSymbolTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  const string key = "・ー";

  string encoded_key;
  codec->EncodeKey(key, &encoded_key);
  // Two symbols at one byte each.
  EXPECT_EQ(2, encoded_key.size());
  EXPECT_EQ(encoded_key.size(), codec->GetEncodedKeyLength(key));

  string decoded_key;
  codec->DecodeKey(encoded_key, &decoded_key);
  EXPECT_EQ(key, decoded_key);
  EXPECT_EQ(decoded_key.size(), codec->GetDecodedKeyLength(encoded_key));
}
380
// Exhaustive value round-trip: every code point from U+0001 through U+10FFFF
// is encoded on its own, its encoded size is checked against
// IsExpectedEncodedSize(), and decoding must give back the original UTF-8.
TEST_F(SystemDictionaryCodecTest, ValueCodecTest) {
  unique_ptr<SystemDictionaryCodec> codec(new SystemDictionaryCodec);
  // The upper bound is already the last Unicode code point.
  const char32 kLastCodePoint = 0x10ffff;
  for (char32 code_point = 0x01; code_point <= kLastCodePoint; ++code_point) {
    string utf8;
    Util::UCS4ToUTF8(code_point, &utf8);

    string encoded;
    codec->EncodeValue(utf8, &encoded);
    EXPECT_TRUE(IsExpectedEncodedSize(code_point, encoded));

    string round_trip;
    codec->DecodeValue(encoded, &round_trip);
    EXPECT_EQ(utf8, round_trip)
        << "failed at: " << static_cast<uint32>(code_point);
  }
}
397
// Round-trips a value made of one hiragana and one katakana character.
TEST_F(SystemDictionaryCodecTest, ValueCodecKanaTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  const string value = "もジ";

  string encoded_value;
  codec->EncodeValue(value, &encoded_value);
  // Two kana at one byte each.
  EXPECT_EQ(2, encoded_value.size());

  string decoded_value;
  codec->DecodeValue(encoded_value, &decoded_value);
  EXPECT_EQ(value, decoded_value);
}
410
// Round-trips a four-letter ASCII value.
TEST_F(SystemDictionaryCodecTest, ValueCodecAsciiTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  const string value = "word";

  string encoded_value;
  codec->EncodeValue(value, &encoded_value);
  // Four ASCII characters at two bytes each.
  EXPECT_EQ(8, encoded_value.size());

  string decoded_value;
  codec->DecodeValue(encoded_value, &decoded_value);
  EXPECT_EQ(value, decoded_value);
}
423
// Round-trips a single token that carries an explicit lid/rid pair.
TEST_F(SystemDictionaryCodecTest, TokenDefaultPosTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(1);
  SetDefaultPos(&source_tokens_[0]);
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
436
// Round-trips a single token whose POS is an index into the frequent-POS map.
TEST_F(SystemDictionaryCodecTest, TokenFrequentPosTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(1);
  SetFrequentPos(&source_tokens_[0]);
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
449
// Round-trips two-token lists in which the second token reuses the POS of the
// first, once after a default-POS token and once after a frequent-POS token.
TEST_F(SystemDictionaryCodecTest, TokenSamePosTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  {
    InitTokens(2);
    SetDefaultPos(&source_tokens_[0]);
    SetSamePos(&source_tokens_[1]);
    string encoded;
    codec->EncodeTokens(source_tokens_, &encoded);
    // Fatal on purpose: DecodeTokens() receives a bare pointer with no
    // length, so it must never be run on an empty buffer.
    ASSERT_FALSE(encoded.empty());
    codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                        &decoded_tokens_);
    CheckDecoded();
  }
  // Start the second scenario from empty token lists.
  ResetAllTokens();
  {
    InitTokens(2);
    SetFrequentPos(&source_tokens_[0]);
    SetSamePos(&source_tokens_[1]);
    string encoded;
    codec->EncodeTokens(source_tokens_, &encoded);
    ASSERT_FALSE(encoded.empty());
    codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                        &decoded_tokens_);
    CheckDecoded();
  }
}
477
// Round-trips 50 tokens whose POS kinds are chosen pseudo-randomly from a
// fixed seed.
TEST_F(SystemDictionaryCodecTest, TokenRandomPosTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandPos();
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
491
// Round-trips a single token with a default-encoded cost; CheckDecoded()
// requires such a cost to survive exactly.
TEST_F(SystemDictionaryCodecTest, TokenDefaultCostTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(1);
  SetDefaultCost(&source_tokens_[0]);
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
504
// Round-trips a single token flagged for the small cost encoding;
// CheckDecoded() accepts a decoded cost within 256 of the original.
TEST_F(SystemDictionaryCodecTest, TokenSmallCostTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(1);
  SetSmallCost(&source_tokens_[0]);
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
517
// Round-trips 50 tokens whose cost kinds are chosen pseudo-randomly from a
// fixed seed.
TEST_F(SystemDictionaryCodecTest, TokenRandomCostTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandCost();
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
531
// Round-trips a single token whose value is an id in the value trie.
TEST_F(SystemDictionaryCodecTest, TokenDefaultValueTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(1);
  SetDefaultValue(&source_tokens_[0]);
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
544
// Round-trips a long value consisting solely of 4-byte UTF-8 sequences
// (every sequence below starts with the lead byte 0xF0). The text is spelled
// with hex escapes and split into chunks purely for line length.
TEST_F(SystemDictionaryCodecTest, UCS4CharactersTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  const string four_byte_text =
      // Chunk 1.
      "\xf0\xa0\x80\x8b\xf0\xa1\x88\xbd\xf0\xa1\x8c\x9b\xf0\xa1\x91\xae\xf0"
      "\xa1\xa2\xbd\xf0\xa0\xae\x9f\xf0\xa1\x9a\xb4\xf0\xa1\xb8\xb4\xf0\xa3"
      "\x87\x84\xf0\xa3\x97\x84\xf0\xa3\x9c\xbf\xf0\xa3\x9d\xa3\xf0\xa3\xb3"
      "\xbe\xf0\xa4\x9f\xb1\xf0\xa5\x92\x8e\xf0\xa5\x94\x8e\xf0\xa5\x9d\xb1"
      "\xf0\xa5\xa7\x84\xf0\xa5\xb6\xa1\xf0\xa6\xab\xbf\xf0\xa6\xb9\x80\xf0"
      "\xa7\x83\xb4\xf0\xa7\x9a\x84\xf0\xa8\x89\xb7\xf0\xa8\x8f\x8d\xf0\xaa"
      "\x86\x90\xf0\xa0\x82\x89"
      // Chunk 2.
      "\xf0\xa0\x82\xa2\xf0\xa0\x82\xa4\xf0\xa0\x86\xa2\xf0\xa0\x88\x93\xf0"
      "\xa0\x8c\xab\xf0\xa0\x8e\x81\xf0\xa0\x8d\xb1\xf0\xa0\x8f\xb9\xf0\xa0"
      "\x91\x8a\xf0\xa0\x94\x89\xf0\xa0\x97\x96\xf0\xa0\x98\xa8\xf0\xa0\x9d"
      "\x8f\xf0\xa0\xa0\x87\xf0\xa0\xa0\xba\xf0\xa0\xa2\xb9\xf0\xa0\xa5\xbc"
      "\xf0\xa0\xa6\x9d\xf0\xa0\xab\x93\xf0\xa0\xac\x9d\xf0\xa0\xb5\x85\xf0"
      "\xa0\xb7\xa1\xf0\xa0\xba\x95\xf0\xa0\xb9\xad\xf0\xa0\xb9\xa4\xf0\xa0"
      "\xbd\x9f\xf0\xa1\x88\x81"
      // Chunk 3.
      "\xf0\xa1\x89\x95\xf0\xa1\x89\xbb\xf0\xa1\x89\xb4\xf0\xa1\x8b\xa4\xf0"
      "\xa1\x8b\x97\xf0\xa1\x8b\xbd\xf0\xa1\x8c\xb6\xf0\xa1\x8d\x84\xf0\xa1"
      "\x8f\x84\xf0\xa1\x91\xad\xf0\xa1\x97\x97\xf0\xa6\xb0\xa9\xf0\xa1\x99"
      "\x87\xf0\xa1\x9c\x86\xf0\xa1\x9d\x82\xf0\xa1\xa7\x83\xf0\xa1\xb1\x96"
      "\xf0\xa1\xb4\xad\xf0\xa1\xb5\x85\xf0\xa1\xb5\xb8\xf0\xa1\xb5\xa2\xf0"
      "\xa1\xb6\xa1\xf0\xa1\xb6\x9c\xf0\xa1\xb6\x92\xf0\xa1\xb6\xb7\xf0\xa1"
      "\xb7\xa0\xf0\xa1\xb8\xb3"
      // Chunk 4.
      "\xf0\xa1\xbc\x9e\xf0\xa1\xbd\xb6\xf0\xa1\xbf\xba\xf0\xa2\x85\xbb\xf0"
      "\xa2\x8c\x9e\xf0\xa2\x8e\xad\xf0\xa2\x9b\xb3\xf0\xa2\xa1\x9b\xf0\xa2"
      "\xa2\xab\xf0\xa2\xa6\x8f\xf0\xa2\xaa\xb8\xf0\xa2\xad\x8f\xf0\xa2\xad"
      "\x90\xf0\xa2\xad\x86\xf0\xa2\xb0\x9d\xf0\xa2\xae\xa6\xf0\xa2\xb0\xa4"
      "\xf0\xa2\xb7\xa1\xf0\xa3\x87\x83\xf0\xa3\x87\xb5\xf0\xa3\x86\xb6\xf0"
      "\xa3\x8d\xb2\xf0\xa3\x8f\x93\xf0\xa3\x8f\x92\xf0\xa3\x8f\x90\xf0\xa3\x8f"
      "\xa4\xf0\xa3\x8f\x95"
      // Chunk 5.
      "\xf0\xa3\x8f\x9a\xf0\xa3\x8f\x9f\xf0\xa3\x91\x8a\xf0\xa3\x91\x91\xf0"
      "\xa3\x91\x8b\xf0\xa3\x91\xa5\xf0\xa3\x93\xa4\xf0\xa3\x95\x9a\xf0\xa3"
      "\x96\x94\xf0\xa3\x98\xb9\xf0\xa3\x99\x87\xf0\xa3\x98\xb8\xf0\xa3\x98"
      "\xba\xf0\xa3\x9c\x9c\xf0\xa3\x9c\x8c\xf0\xa3\x9d\xa4\xf0\xa3\x9f\xbf"
      "\xf0\xa3\x9f\xa7\xf0\xa3\xa0\xa4\xf0\xa3\xa0\xbd\xf0\xa3\xaa\x98\xf0"
      "\xa3\xb1\xbf\xf0\xa3\xb4\x80\xf0\xa3\xb5\x80\xf0\xa3\xb7\xba\xf0\xa3"
      "\xb7\xb9\xf0\xa3\xb7\x93"
      // Chunk 6.
      "\xf0\xa3\xbd\xbe\xf0\xa4\x82\x96\xf0\xa4\x84\x83\xf0\xa4\x87\x86\xf0"
      "\xa4\x87\xbe\xf0\xa4\x8e\xbc\xf0\xa4\x98\xa9\xf0\xa4\x9a\xa5\xf0\xa4"
      "\xa2\x96\xf0\xa4\xa9\x8d\xf0\xa4\xad\x96\xf0\xa4\xad\xaf\xf0\xa4\xb0"
      "\x96\xf0\xa4\xb4\x94\xf0\xa4\xb8\x8e\xf0\xa4\xb8\xb7\xf0\xa4\xb9\xaa"
      "\xf0\xa4\xba\x8b\xf0\xa5\x81\x8a\xf0\xa5\x81\x95\xf0\xa5\x84\xa2\xf0"
      "\xa5\x86\xa9\xf0\xa5\x87\xa5\xf0\xa5\x87\x8d\xf0\xa5\x88\x9e\xf0\xa5"
      "\x89\x8c\xf0\xa5\x90\xae"
      // Chunk 7.
      "\xf0\xa5\x93\x99\xf0\xa5\x96\xa7\xf0\xa5\x9e\xa9\xf0\xa5\x9e\xb4\xf0"
      "\xa5\xa7\x94\xf0\xa5\xab\xa4\xf0\xa5\xab\xa3\xf0\xa5\xab\xb1\xf0\xa5"
      "\xae\xb2\xf0\xa5\xb1\x8b\xf0\xa5\xb1\xa4\xf0\xa5\xb8\xae\xf0\xa5\xb9"
      "\x96\xf0\xa5\xb9\xa5\xf0\xa5\xb9\xa2\xf0\xa5\xbb\x98\xf0\xa5\xbb\x82"
      "\xf0\xa5\xbb\xa8\xf0\xa5\xbc\xa3\xf0\xa5\xbd\x9c\xf0\xa5\xbf\xa0\xf0"
      "\xa5\xbf\x94\xf0\xa6\x80\x8c\xf0\xa5\xbf\xbb\xf0\xa6\x80\x97\xf0\xa6"
      "\x81\xa0\xf0\xa6\x83\xad"
      // Chunk 8.
      "\xf0\xa6\x89\xb0\xf0\xa6\x8a\x86\xf0\xa6\x8d\x8c\xf0\xa3\xb4\x8e\xf0"
      "\xa6\x90\x82\xf0\xa6\x99\xbe\xf0\xa6\x9a\xb0\xf0\xa6\x9c\x9d\xf0\xa6"
      "\xa3\x9d\xf0\xa6\xa3\xaa\xf0\xa6\xa5\x91\xf0\xa6\xa5\xaf\xf0\xa6\xa7"
      "\x9d\xf0\xa6\xa8\x9e\xf0\xa6\xa9\x98\xf0\xa6\xaa\x8c\xf0\xa6\xaa\xb7"
      "\xf0\xa6\xb1\xb3\xf0\xa6\xb3\x9d\xf0\xa6\xb9\xa5\xf0\xa6\xbe\x94\xf0"
      "\xa6\xbf\xb8\xf0\xa6\xbf\xb6\xf0\xa6\xbf\xb7\xf0\xa7\x84\x8d\xf0\xa7"
      "\x84\xb9\xf0\xa7\x8f\x9b"
      // Chunk 9.
      "\xf0\xa7\x8f\x9a\xf0\xa7\x8f\xbe\xf0\xa7\x90\x90\xf0\xa7\x91\x89\xf0"
      "\xa7\x98\x95\xf0\xa7\x98\x94\xf0\xa7\x98\xb1\xf0\xa7\x9a\x93\xf0\xa7"
      "\x9c\x8e\xf0\xa7\x9c\xa3\xf0\xa7\x9d\x92\xf0\xa7\xa6\x85\xf0\xa7\xaa"
      "\x84\xf0\xa7\xae\xb3\xf0\xa7\xae\xbe\xf0\xa7\xaf\x87\xf0\xa7\xb2\xb8"
      "\xf0\xa7\xb6\xa0\xf0\xa7\xb8\x90\xf0\xa7\xbe\xb7\xf0\xa8\x82\x8a\xf0"
      "\xa8\x82\xbb\xf0\xa8\x8a\x82\xf0\xa8\x8b\xb3\xf0\xa8\x90\x8c\xf0\xa8"
      "\x91\x95\xf0\xa8\x95\xab"
      // Chunk 10.
      "\xf0\xa8\x97\x88\xf0\xa8\x97\x89\xf0\xa8\x9b\x97\xf0\xa8\x9b\xba\xf0"
      "\xa8\xa5\x89\xf0\xa8\xa5\x86\xf0\xa8\xa5\xab\xf0\xa8\xa6\x87\xf0\xa8"
      "\xa6\x88\xf0\xa8\xa6\xba\xf0\xa8\xa6\xbb\xf0\xa8\xa8\x9e\xf0\xa8\xa8"
      "\xa9\xf0\xa8\xa9\xb1\xf0\xa8\xa9\x83\xf0\xa8\xaa\x99\xf0\xa8\xab\x8d"
      "\xf0\xa8\xab\xa4\xf0\xa8\xab\x9d\xf0\xa8\xaf\x81\xf0\xa8\xaf\xaf\xf0\xa8"
      "\xb4\x90\xf0\xa8\xb5\xb1\xf0\xa8\xb7\xbb\xf0\xa8\xb8\x9f\xf0\xa8\xb8"
      "\xb6\xf0\xa8\xba\x89"
      // Chunk 11.
      "\xf0\xa8\xbb\xab\xf0\xa8\xbc\xb2\xf0\xa8\xbf\xb8\xf0\xa9\x8a\xa0\xf0"
      "\xa9\x8a\xb1\xf0\xa9\x92\x90\xf0\xa9\x97\x8f\xf0\xa9\x99\xbf\xf0\xa9"
      "\x9b\xb0\xf0\xa9\x9c\x99\xf0\xa9\x9d\x90\xf0\xa9\xa3\x86\xf0\xa9\xa9"
      "\xb2\xf0\xa9\xb7\x9b\xf0\xa9\xb8\xbd\xf0\xa9\xb8\x95\xf0\xa9\xba\x8a"
      "\xf0\xa9\xb9\x89\xf0\xa9\xbb\x84\xf0\xa9\xbb\xa9\xf0\xa9\xbb\x9b\xf0"
      "\xa9\xbf\x8e\xf0\xaa\x80\xaf\xf0\xaa\x80\x9a\xf0\xaa\x83\xb9\xf0\xaa"
      "\x82\x82\xf0\xa2\x88\x98"
      // Chunk 12.
      "\xf0\xaa\x8e\x8c\xf0\xaa\x90\xb7\xf0\xaa\x97\xb1\xf0\xaa\x98\x82\xf0"
      "\xaa\x98\x9a\xf0\xaa\x9a\xb2";

  string encoded_value;
  codec->EncodeValue(four_byte_text, &encoded_value);
  EXPECT_GT(encoded_value.size(), 0);

  string decoded_value;
  codec->DecodeValue(encoded_value, &decoded_value);
  EXPECT_EQ(four_byte_text, decoded_value);
}
647
// Round-trips two tokens where the second reuses the value of the first.
TEST_F(SystemDictionaryCodecTest, TokenSameValueTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(2);
  SetDefaultValue(&source_tokens_[0]);
  SetSameValue(&source_tokens_[1]);
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
661
// Round-trips 50 tokens whose value kinds are chosen pseudo-randomly from a
// fixed seed.
TEST_F(SystemDictionaryCodecTest, TokenRandomValueTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandValue();
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
675
// Round-trips 50 tokens whose attributes are chosen pseudo-randomly from a
// fixed seed.
TEST_F(SystemDictionaryCodecTest, TokenRandomLabelTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandLabel();
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
689
// Round-trips 50 tokens with POS, cost, value and attributes all chosen
// pseudo-randomly from a fixed seed.
TEST_F(SystemDictionaryCodecTest, TokenRandomTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandPos();
  SetRandCost();
  SetRandValue();
  SetRandLabel();
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Fatal on purpose: DecodeTokens() receives a bare pointer with no length,
  // so it must never be run on an empty buffer.
  ASSERT_FALSE(encoded.empty());
  codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
                      &decoded_tokens_);
  CheckDecoded();
}
706
// Encodes 50 randomised tokens and walks the encoded bytes with
// ReadTokenForReverseLookup(), checking the value id reported for each token.
//
// For DEFAULT_VALUE tokens the reported id must equal id_in_value_trie; for
// every other value kind the output must keep the -1 it was preset to.
TEST_F(SystemDictionaryCodecTest, ReadTokenRandomTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandPos();
  SetRandCost();
  SetRandValue();
  SetRandLabel();
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  ASSERT_FALSE(encoded.empty());
  const unsigned char *encoded_bytes =
      reinterpret_cast<const unsigned char *>(encoded.data());
  size_t read_num = 0;
  size_t offset = 0;
  while (true) {
    // Fatal bounds: without them a codec that never reports the last token
    // would index past |source_tokens_| and read past the encoded buffer.
    ASSERT_LT(read_num, source_tokens_.size());
    ASSERT_LT(offset, encoded.size());
    int read_byte = 0;
    int value_id = -1;
    // A false return marks the token just read as the last one.
    const bool is_last_token = !(codec->ReadTokenForReverseLookup(
        encoded_bytes + offset, &value_id, &read_byte));
    if (source_tokens_[read_num].value_type == TokenInfo::DEFAULT_VALUE) {
      EXPECT_EQ(source_tokens_[read_num].id_in_value_trie, value_id);
    } else {
      EXPECT_EQ(-1, value_id);
    }
    offset += read_byte;
    ++read_num;
    if (is_last_token) {
      break;
    }
  }
  EXPECT_EQ(source_tokens_.size(), read_num);
}
741
// End-to-end test with an explicitly installed SystemDictionaryCodec:
// round-trips randomised tokens (including the reverse-lookup reader), a
// long random value string, and a long random key string.
TEST_F(SystemDictionaryCodecTest, CodecTest) {
  unique_ptr<SystemDictionaryCodec> impl(new SystemDictionaryCodec);
  SystemDictionaryCodecFactory::SetCodec(impl.get());
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  {  // Token
    InitTokens(50);
    Util::SetRandomSeed(0);
    SetRandPos();
    SetRandCost();
    SetRandValue();
    SetRandLabel();
    string encoded;
    codec->EncodeTokens(source_tokens_, &encoded);
    // Fatal on purpose: the decoders below receive a bare pointer with no
    // length, so they must never be run on an empty buffer.
    ASSERT_FALSE(encoded.empty());
    const unsigned char *encoded_bytes =
        reinterpret_cast<const unsigned char *>(encoded.data());
    codec->DecodeTokens(encoded_bytes, &decoded_tokens_);
    CheckDecoded();

    // ReadTokens
    size_t read_num = 0;
    size_t offset = 0;
    while (true) {
      // Fatal bounds: without them a codec that never reports the last token
      // would index past |source_tokens_| and read past the encoded buffer.
      ASSERT_LT(read_num, source_tokens_.size());
      ASSERT_LT(offset, encoded.size());
      int read_byte = 0;
      int value_id = -1;
      // A false return marks the token just read as the last one.
      const bool is_last_token = !(codec->ReadTokenForReverseLookup(
          encoded_bytes + offset, &value_id, &read_byte));
      if (source_tokens_[read_num].value_type == TokenInfo::DEFAULT_VALUE) {
        EXPECT_EQ(source_tokens_[read_num].id_in_value_trie, value_id);
      } else {
        EXPECT_EQ(-1, value_id);
      }
      offset += read_byte;
      ++read_num;
      if (is_last_token) {
        break;
      }
    }
    EXPECT_EQ(source_tokens_.size(), read_num);
  }
  {  // Value
    string original;
    {
      char32 a_ucs4 = '!';
      Util::SetRandomSeed(0);
      for (size_t i = 0; i < 10000; ++i) {
        // Each code point is '!' (U+0021) plus an offset drawn from
        // Util::Random(0x9f00), so the text is not limited to CJK Unified
        // Ideographs.
        const char32 c = a_ucs4 + static_cast<uint16>(Util::Random(0x9f00));
        Util::UCS4ToUTF8Append(c, &original);
      }
    }
    string encoded;
    codec->EncodeValue(original, &encoded);
    string decoded;
    codec->DecodeValue(encoded, &decoded);
    EXPECT_EQ(original, decoded);
  }
  {  // Key
    string original;
    {
      char32 a_ucs4 = 0x3041;  // "ぁ"
      Util::SetRandomSeed(0);
      for (size_t i = 0; i < 1000; ++i) {
        // Each code point is U+3041 plus an offset drawn from
        // Util::Random(1000).
        const char32 c = a_ucs4 + static_cast<uint16>(Util::Random(1000));
        Util::UCS4ToUTF8Append(c, &original);
      }
    }
    string encoded;
    codec->EncodeKey(original, &encoded);
    EXPECT_EQ(encoded.size(), codec->GetEncodedKeyLength(original));
    string decoded;
    codec->DecodeKey(encoded, &decoded);
    EXPECT_EQ(original, decoded);
    EXPECT_EQ(decoded.size(), codec->GetDecodedKeyLength(encoded));
  }
}
820
821
822 } // namespace dictionary
823 } // namespace mozc
824