1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "dictionary/system/codec.h"
31 
32 #include <memory>
33 #include <string>
34 #include <vector>
35 
36 #include "base/logging.h"
37 #include "base/util.h"
38 #include "dictionary/dictionary_token.h"
39 #include "dictionary/system/codec_interface.h"
40 #include "dictionary/system/words_info.h"
41 #include "testing/base/public/googletest.h"
42 #include "testing/base/public/gunit.h"
43 
44 namespace mozc {
45 namespace dictionary {
46 namespace {
47 
48 using std::unique_ptr;
49 
MakeAssertResult(bool success,char32 c,const char * message)50 ::testing::AssertionResult MakeAssertResult(
51     bool success, char32 c, const char *message) {
52   if (success) {
53     return ::testing::AssertionSuccess();
54   }
55   return ::testing::AssertionFailure()
56       << message << " c = " << Util::StringPrintf("U+%05X", c);
57 }
58 
IsExpectedEncodedSize(char32 c,const string & encoded)59 ::testing::AssertionResult IsExpectedEncodedSize(
60     char32 c, const string &encoded) {
61   const string::size_type size = encoded.size();
62   if (c == 0x00) {
63     return ::testing::AssertionFailure() << "NUL is not supported.";
64   }
65   if (c <= 0xff) {
66     return MakeAssertResult(size == 2, c,
67                             "U+00?? (ASCII) should be encoded into 2 bytes.");
68   }
69   if (0x10000 <= c && c <= 0x10ffff) {
70     if ((c & 0xffff) == 0) {
71       return MakeAssertResult(
72           size == 2, c, "U+?0000 and U+100000 are encoded into 2 bytes.");
73     }
74     if ((c & 0xff) == 0) {
75       return MakeAssertResult(
76           size == 3, c, "U+???00 and U+10??00 are encoded into 3 bytes.");
77     }
78     if (((c & 0xff00) >> 8) == 0) {
79       return MakeAssertResult(
80           size == 3, c, "U+?00?? and U+1000?? are encoded into 3 bytes.");
81     }
82     return MakeAssertResult(
83         size == 4, c,
84         "[U+10000, U+10FFFF] except for U+???00, U+?00??, U+10??00 and "
85         "U+1000?? should be encoded into 4 bytes.");
86   }
87   if (0x10ffff < c) {
88     return MakeAssertResult(
89         false, c, "U+110000 and greater are not supported.");
90   }
91   if (0xffff < c) {
92     return MakeAssertResult(false, c, "Should not reach here.");
93   }
94 
95   // Hereafter, |c| should be represented as 0x????
96   const uint16 s = static_cast<uint16>(c);
97   if ((s & 0xff) == 0) {
98     return MakeAssertResult(size == 2, c, "U+??00 are encoded into 2 bytes.");
99   }
100   if (0x3041 <= s && s < 0x3095) {
101     return MakeAssertResult(
102         size == 1, c, "Hiragana(85 characters) are encoded into 1 byte.");
103   }
104   if (0x30a1 <= s && s < 0x30fd) {
105     return MakeAssertResult(
106         size == 1, c, "Katakana (92 characters) are encoded into 1 byte.");
107   }
108   if (0x4e00 <= s && s < 0x9800) {
109     return MakeAssertResult(size == 2, c,
110                             "Frequent Kanji and others (74*256 characters) "
111                             "are encoded into 2 bytes.");
112   }
113   return MakeAssertResult(size == 3, c,
114                           "Other charaters should be encoded into 3bytes.");
115 }
116 
117 
118 }  // namespace
119 
120 class SystemDictionaryCodecTest : public ::testing::Test {
121  protected:
SetUp()122   virtual void SetUp() {
123     SystemDictionaryCodecFactory::SetCodec(NULL);
124     ResetAllTokens();
125   }
126 
TearDown()127   virtual void TearDown() {
128     SystemDictionaryCodecFactory::SetCodec(NULL);
129     ResetAllTokens();
130   }
131 
ResetAllTokens()132   void ResetAllTokens() {
133     ClearTokens(&source_tokens_);
134     ClearTokens(&decoded_tokens_);
135   }
136 
ClearTokens(std::vector<TokenInfo> * tokens) const137   void ClearTokens(std::vector<TokenInfo> *tokens) const {
138     for (size_t i = 0; i < tokens->size(); ++i) {
139       delete tokens->at(i).token;
140     }
141     tokens->clear();
142   }
143 
InitTokens(int size)144   void InitTokens(int size) {
145     for (size_t i = 0; i < size; ++i) {
146       Token *t = new Token();
147       TokenInfo token_info(t);
148       token_info.id_in_value_trie = 0;
149       source_tokens_.push_back(token_info);
150     }
151   }
152 
SetDefaultPos(TokenInfo * token_info) const153   void SetDefaultPos(TokenInfo *token_info) const {
154     CHECK(token_info);
155     token_info->pos_type = TokenInfo::DEFAULT_POS;
156     // set id randomly
157     const int id = Util::Random(50);
158     token_info->token->lid = id;
159     token_info->token->rid = (Util::Random(2) == 0) ? id : id + 1;
160   }
161 
SetFrequentPos(TokenInfo * token_info) const162   void SetFrequentPos(TokenInfo *token_info) const {
163     CHECK(token_info);
164     token_info->pos_type = TokenInfo::FREQUENT_POS;
165     // set id randomly
166     const int id = Util::Random(256);
167     token_info->id_in_frequent_pos_map = id;
168   }
169 
SetSamePos(TokenInfo * token_info) const170   void SetSamePos(TokenInfo *token_info) const {
171     CHECK(token_info);
172     token_info->pos_type = TokenInfo::SAME_AS_PREV_POS;
173   }
174 
SetRandPos()175   void SetRandPos() {
176     for (size_t i = 0; i < source_tokens_.size(); ++i) {
177       CHECK(source_tokens_[i].token);
178       int n = Util::Random(TokenInfo::POS_TYPE_SIZE);
179       CHECK_GE(n, 0);
180       CHECK_LT(n, TokenInfo::POS_TYPE_SIZE);
181       if (i == 0 && n == 2) {
182         // First token cannot be the same pos.
183         n = 0;
184       }
185 
186       if (n == 0) {
187         SetDefaultPos(&source_tokens_[i]);
188       } else if (n == 1) {
189         SetFrequentPos(&source_tokens_[i]);
190       } else if (n == 2) {
191         SetSamePos(&source_tokens_[i]);
192       } else {
193         FAIL();
194       }
195     }
196   }
197 
SetDefaultCost(TokenInfo * token_info) const198   void SetDefaultCost(TokenInfo *token_info) const {
199     CHECK(token_info);
200     token_info->cost_type = TokenInfo::DEFAULT_COST;
201     // set cost randomly
202     const int cost = Util::Random(8000);
203     token_info->token->cost = cost;
204   }
205 
SetSmallCost(TokenInfo * token_info) const206   void SetSmallCost(TokenInfo *token_info) const {
207     CHECK(token_info);
208     token_info->cost_type = TokenInfo::CAN_USE_SMALL_ENCODING;
209     // set cost randomly
210     const int cost = Util::Random(8000);
211     token_info->token->cost = cost;
212   }
213 
SetRandCost()214   void SetRandCost() {
215     for (size_t i = 0; i < source_tokens_.size(); ++i) {
216       CHECK(source_tokens_[i].token);
217       int n = Util::Random(TokenInfo::COST_TYPE_SIZE);
218       CHECK_GE(n, 0);
219       CHECK_LT(n, TokenInfo::POS_TYPE_SIZE);
220       if (n == 0) {
221         SetDefaultCost(&source_tokens_[i]);
222       } else if (n == 1) {
223         SetSmallCost(&source_tokens_[i]);
224       }
225     }
226   }
227 
SetDefaultValue(TokenInfo * token_info) const228   void SetDefaultValue(TokenInfo *token_info) const {
229     CHECK(token_info);
230     token_info->value_type = TokenInfo::DEFAULT_VALUE;
231     // set id randomly
232     const int id = Util::Random(50000);
233     token_info->id_in_value_trie = id;
234   }
235 
SetSameValue(TokenInfo * token_info) const236   void SetSameValue(TokenInfo *token_info) const {
237     CHECK(token_info);
238     token_info->value_type = TokenInfo::SAME_AS_PREV_VALUE;
239   }
240 
SetRandValue()241   void SetRandValue() {
242     for (size_t i = 0; i < source_tokens_.size(); ++i) {
243       CHECK(source_tokens_[i].token);
244       int n = Util::Random(TokenInfo::VALUE_TYPE_SIZE);
245       CHECK_GE(n, 0);
246       CHECK_LT(n, TokenInfo::VALUE_TYPE_SIZE);
247       if (i == 0 && n == 1) {
248         // first token cannot be the same as before.
249         n = 0;
250       }
251       if (n == 0) {
252         SetDefaultValue(&source_tokens_[i]);
253       } else if (n == 1) {
254         SetSameValue(&source_tokens_[i]);
255       } else if (n == 2) {
256         source_tokens_[i].value_type = TokenInfo::AS_IS_HIRAGANA;
257       } else if (n == 3) {
258         source_tokens_[i].value_type = TokenInfo::AS_IS_KATAKANA;
259       }
260     }
261   }
262 
SetRandLabel()263   void SetRandLabel() {
264     for (size_t i = 0; i < source_tokens_.size(); ++i) {
265       CHECK(source_tokens_[i].token);
266       int n = Util::Random(Token::LABEL_SIZE);
267       CHECK_GE(n, 0);
268       CHECK_LT(n, Token::LABEL_SIZE);
269       if (n == 0) {
270         source_tokens_[i].token->attributes = Token::NONE;
271       } else if (n == 1) {
272         source_tokens_[i].token->attributes = Token::SPELLING_CORRECTION;
273       }
274     }
275   }
276 
CheckDecoded() const277   void CheckDecoded() const {
278     EXPECT_EQ(source_tokens_.size(), decoded_tokens_.size());
279     for (size_t i = 0; i < source_tokens_.size(); ++i) {
280       EXPECT_TRUE(source_tokens_[i].token != NULL);
281       EXPECT_TRUE(decoded_tokens_[i].token != NULL);
282 
283       EXPECT_EQ(source_tokens_[i].token->attributes,
284                 decoded_tokens_[i].token->attributes);
285 
286       EXPECT_EQ(source_tokens_[i].pos_type, decoded_tokens_[i].pos_type);
287       if (source_tokens_[i].pos_type == TokenInfo::DEFAULT_POS) {
288         EXPECT_EQ(source_tokens_[i].token->lid, decoded_tokens_[i].token->lid);
289         EXPECT_EQ(source_tokens_[i].token->rid, decoded_tokens_[i].token->rid);
290       } else if (source_tokens_[i].pos_type == TokenInfo::FREQUENT_POS) {
291         EXPECT_EQ(source_tokens_[i].id_in_frequent_pos_map,
292                   decoded_tokens_[i].id_in_frequent_pos_map);
293       }
294 
295       if (source_tokens_[i].cost_type == TokenInfo::DEFAULT_COST) {
296         EXPECT_EQ(source_tokens_[i].token->cost,
297                   decoded_tokens_[i].token->cost);
298       } else {  // small cost
299         EXPECT_NEAR(source_tokens_[i].token->cost,
300                     decoded_tokens_[i].token->cost,
301                     256);
302       }
303 
304       EXPECT_EQ(source_tokens_[i].value_type, decoded_tokens_[i].value_type);
305       if (source_tokens_[i].value_type == TokenInfo::DEFAULT_VALUE) {
306         EXPECT_EQ(source_tokens_[i].id_in_value_trie,
307                   decoded_tokens_[i].id_in_value_trie);
308       }
309     }
310   }
311 
312   std::vector<TokenInfo> source_tokens_;
313   std::vector<TokenInfo> decoded_tokens_;
314 };
315 
316 class SystemDictionaryCodecMock : public SystemDictionaryCodecInterface {
317  public:
GetSectionNameForKey() const318   const string GetSectionNameForKey() const { return "Mock"; }
GetSectionNameForValue() const319   const string GetSectionNameForValue() const { return "Mock"; }
GetSectionNameForTokens() const320   const string GetSectionNameForTokens() const { return "Mock"; }
GetSectionNameForPos() const321   const string GetSectionNameForPos() const { return "Mock"; }
EncodeKey(const StringPiece src,string * dst) const322   virtual void EncodeKey(const StringPiece src, string *dst) const {}
DecodeKey(const StringPiece src,string * dst) const323   virtual void DecodeKey(const StringPiece src, string *dst) const {}
GetEncodedKeyLength(const StringPiece src) const324   virtual size_t GetEncodedKeyLength(const StringPiece src) const { return 0; }
GetDecodedKeyLength(const StringPiece src) const325   virtual size_t GetDecodedKeyLength(const StringPiece src) const { return 0; }
EncodeValue(const StringPiece src,string * dst) const326   virtual void EncodeValue(const StringPiece src, string *dst) const {}
DecodeValue(const StringPiece src,string * dst) const327   virtual void DecodeValue(const StringPiece src, string *dst) const {}
EncodeTokens(const std::vector<TokenInfo> & tokens,string * output) const328   virtual void EncodeTokens(
329       const std::vector<TokenInfo> &tokens, string *output) const {}
DecodeTokens(const uint8 * ptr,std::vector<TokenInfo> * tokens) const330   virtual void DecodeTokens(
331       const uint8 *ptr, std::vector<TokenInfo> *tokens) const {}
DecodeToken(const uint8 * ptr,TokenInfo * token_info,int * read_bytes) const332   virtual bool DecodeToken(
333       const uint8 *ptr, TokenInfo *token_info, int *read_bytes) const {
334     *read_bytes = 0;
335     return false;
336   }
ReadTokenForReverseLookup(const uint8 * ptr,int * value_id,int * read_bytes) const337   virtual bool ReadTokenForReverseLookup(
338       const uint8 *ptr, int *value_id, int *read_bytes) const { return false; }
GetTokensTerminationFlag() const339   virtual uint8 GetTokensTerminationFlag() const { return 0xff; }
340 };
341 
// A codec installed with SystemDictionaryCodecFactory::SetCodec() must be the
// one returned by GetCodec().
TEST_F(SystemDictionaryCodecTest, FactoryTest) {
  unique_ptr<SystemDictionaryCodecMock> injected(new SystemDictionaryCodecMock);
  SystemDictionaryCodecFactory::SetCodec(injected.get());

  // The mock answers "Mock" for the key section name.
  SystemDictionaryCodecInterface *const from_factory =
      SystemDictionaryCodecFactory::GetCodec();
  EXPECT_EQ("Mock", from_factory->GetSectionNameForKey());
}
349 
// Round-trips a two-character hiragana key through the key codec and checks
// the reported encoded/decoded lengths.
TEST_F(SystemDictionaryCodecTest, KeyCodecKanaTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  const string kana_key = "よみ";

  string key_bytes;
  codec->EncodeKey(kana_key, &key_bytes);
  // Two hiragana characters, one byte each.
  EXPECT_EQ(2, key_bytes.size());
  EXPECT_EQ(key_bytes.size(), codec->GetEncodedKeyLength(kana_key));

  string round_trip;
  codec->DecodeKey(key_bytes, &round_trip);
  EXPECT_EQ(kana_key, round_trip);
  EXPECT_EQ(round_trip.size(), codec->GetDecodedKeyLength(key_bytes));
}
364 
365 
// Round-trips a key made of the middle dot and the prolonged sound mark
// through the key codec and checks the reported lengths.
TEST_F(SystemDictionaryCodecTest, KeyCodecSymbolTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  const string symbol_key = "・ー";

  string key_bytes;
  codec->EncodeKey(symbol_key, &key_bytes);
  // Both symbols are expected to take one byte each.
  EXPECT_EQ(2, key_bytes.size());
  EXPECT_EQ(key_bytes.size(), codec->GetEncodedKeyLength(symbol_key));

  string round_trip;
  codec->DecodeKey(key_bytes, &round_trip);
  EXPECT_EQ(symbol_key, round_trip);
  EXPECT_EQ(round_trip.size(), codec->GetDecodedKeyLength(key_bytes));
}
380 
// Encodes every code point from U+0001 through U+10FFFF on its own with the
// value codec, checks the encoded length against IsExpectedEncodedSize(), and
// checks that decoding restores the original UTF-8 bytes.
TEST_F(SystemDictionaryCodecTest, ValueCodecTest) {
  unique_ptr<SystemDictionaryCodec> codec(new SystemDictionaryCodec);
  // Last code point exercised by the loop (inclusive).
  const char32 kMaxUniChar = 0x10ffff;
  for (char32 code_point = 0x01; code_point <= kMaxUniChar; ++code_point) {
    string utf8;
    Util::UCS4ToUTF8(code_point, &utf8);

    string value_bytes;
    codec->EncodeValue(utf8, &value_bytes);
    EXPECT_TRUE(IsExpectedEncodedSize(code_point, value_bytes));

    string round_trip;
    codec->DecodeValue(value_bytes, &round_trip);
    EXPECT_EQ(utf8, round_trip)
        << "failed at: " << static_cast<uint32>(code_point);
  }
}
397 
// Round-trips a value made of one hiragana and one katakana character.
TEST_F(SystemDictionaryCodecTest, ValueCodecKanaTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  const string kana_value = "もジ";

  string value_bytes;
  codec->EncodeValue(kana_value, &value_bytes);
  // Each kana character is expected to take a single byte.
  EXPECT_EQ(2, value_bytes.size());

  string round_trip;
  codec->DecodeValue(value_bytes, &round_trip);
  EXPECT_EQ(kana_value, round_trip);
}
410 
// Round-trips a four-letter ASCII value through the value codec.
TEST_F(SystemDictionaryCodecTest, ValueCodecAsciiTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  const string ascii_value = "word";

  string value_bytes;
  codec->EncodeValue(ascii_value, &value_bytes);
  // Four ASCII characters at two bytes each.
  EXPECT_EQ(8, value_bytes.size());

  string round_trip;
  codec->DecodeValue(value_bytes, &round_trip);
  EXPECT_EQ(ascii_value, round_trip);
}
423 
// Encodes and decodes a single token that uses DEFAULT_POS.
TEST_F(SystemDictionaryCodecTest, TokenDefaultPosTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(1);
  SetDefaultPos(&source_tokens_[0]);

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
436 
// Encodes and decodes a single token that uses FREQUENT_POS.
TEST_F(SystemDictionaryCodecTest, TokenFrequentPosTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(1);
  SetFrequentPos(&source_tokens_[0]);

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
449 
// Encodes and decodes a pair of tokens whose second element is
// SAME_AS_PREV_POS.  The pair is tried twice: first with a DEFAULT_POS head,
// then with a FREQUENT_POS head.
TEST_F(SystemDictionaryCodecTest, TokenSamePosTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  for (int round = 0; round < 2; ++round) {
    if (round > 0) {
      // Start the second round from empty token lists.
      ResetAllTokens();
    }
    InitTokens(2);
    if (round == 0) {
      SetDefaultPos(&source_tokens_[0]);
    } else {
      SetFrequentPos(&source_tokens_[0]);
    }
    SetSamePos(&source_tokens_[1]);

    string token_bytes;
    codec->EncodeTokens(source_tokens_, &token_bytes);
    EXPECT_GT(token_bytes.size(), 0);

    const unsigned char *const raw =
        reinterpret_cast<const unsigned char *>(token_bytes.data());
    codec->DecodeTokens(raw, &decoded_tokens_);
    CheckDecoded();
  }
}
477 
// Encodes and decodes 50 tokens whose POS types are chosen at random with a
// fixed seed.
TEST_F(SystemDictionaryCodecTest, TokenRandomPosTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandPos();

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
491 
// Encodes and decodes a single token that uses DEFAULT_COST.
TEST_F(SystemDictionaryCodecTest, TokenDefaultCostTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(1);
  SetDefaultCost(&source_tokens_[0]);

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
504 
// Encodes and decodes a single token that uses CAN_USE_SMALL_ENCODING.
TEST_F(SystemDictionaryCodecTest, TokenSmallCostTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(1);
  SetSmallCost(&source_tokens_[0]);

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
517 
// Encodes and decodes 50 tokens whose cost types are chosen at random with a
// fixed seed.
TEST_F(SystemDictionaryCodecTest, TokenRandomCostTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandCost();

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
531 
// Encodes and decodes a single token that uses DEFAULT_VALUE.
TEST_F(SystemDictionaryCodecTest, TokenDefaultValueTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(1);
  SetDefaultValue(&source_tokens_[0]);

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
544 
TEST_F(SystemDictionaryCodecTest,UCS4CharactersTest)545 TEST_F(SystemDictionaryCodecTest, UCS4CharactersTest) {
546   SystemDictionaryCodecInterface *codec =
547       SystemDictionaryCodecFactory::GetCodec();
548   const string ucs4_including =
549       // "������������������������������������������������������"
550       "\xf0\xa0\x80\x8b\xf0\xa1\x88\xbd\xf0\xa1\x8c\x9b\xf0\xa1\x91\xae\xf0"
551       "\xa1\xa2\xbd\xf0\xa0\xae\x9f\xf0\xa1\x9a\xb4\xf0\xa1\xb8\xb4\xf0\xa3"
552       "\x87\x84\xf0\xa3\x97\x84\xf0\xa3\x9c\xbf\xf0\xa3\x9d\xa3\xf0\xa3\xb3"
553       "\xbe\xf0\xa4\x9f\xb1\xf0\xa5\x92\x8e\xf0\xa5\x94\x8e\xf0\xa5\x9d\xb1"
554       "\xf0\xa5\xa7\x84\xf0\xa5\xb6\xa1\xf0\xa6\xab\xbf\xf0\xa6\xb9\x80\xf0"
555       "\xa7\x83\xb4\xf0\xa7\x9a\x84\xf0\xa8\x89\xb7\xf0\xa8\x8f\x8d\xf0\xaa"
556       "\x86\x90\xf0\xa0\x82\x89"
557       // "������������������������������������������������������"
558       "\xf0\xa0\x82\xa2\xf0\xa0\x82\xa4\xf0\xa0\x86\xa2\xf0\xa0\x88\x93\xf0"
559       "\xa0\x8c\xab\xf0\xa0\x8e\x81\xf0\xa0\x8d\xb1\xf0\xa0\x8f\xb9\xf0\xa0"
560       "\x91\x8a\xf0\xa0\x94\x89\xf0\xa0\x97\x96\xf0\xa0\x98\xa8\xf0\xa0\x9d"
561       "\x8f\xf0\xa0\xa0\x87\xf0\xa0\xa0\xba\xf0\xa0\xa2\xb9\xf0\xa0\xa5\xbc"
562       "\xf0\xa0\xa6\x9d\xf0\xa0\xab\x93\xf0\xa0\xac\x9d\xf0\xa0\xb5\x85\xf0"
563       "\xa0\xb7\xa1\xf0\xa0\xba\x95\xf0\xa0\xb9\xad\xf0\xa0\xb9\xa4\xf0\xa0"
564       "\xbd\x9f\xf0\xa1\x88\x81"
565       // "������������������������������������������������������"
566       "\xf0\xa1\x89\x95\xf0\xa1\x89\xbb\xf0\xa1\x89\xb4\xf0\xa1\x8b\xa4\xf0"
567       "\xa1\x8b\x97\xf0\xa1\x8b\xbd\xf0\xa1\x8c\xb6\xf0\xa1\x8d\x84\xf0\xa1"
568       "\x8f\x84\xf0\xa1\x91\xad\xf0\xa1\x97\x97\xf0\xa6\xb0\xa9\xf0\xa1\x99"
569       "\x87\xf0\xa1\x9c\x86\xf0\xa1\x9d\x82\xf0\xa1\xa7\x83\xf0\xa1\xb1\x96"
570       "\xf0\xa1\xb4\xad\xf0\xa1\xb5\x85\xf0\xa1\xb5\xb8\xf0\xa1\xb5\xa2\xf0"
571       "\xa1\xb6\xa1\xf0\xa1\xb6\x9c\xf0\xa1\xb6\x92\xf0\xa1\xb6\xb7\xf0\xa1"
572       "\xb7\xa0\xf0\xa1\xb8\xb3"
573       // "������������������������������������������������������"
574       "\xf0\xa1\xbc\x9e\xf0\xa1\xbd\xb6\xf0\xa1\xbf\xba\xf0\xa2\x85\xbb\xf0"
575       "\xa2\x8c\x9e\xf0\xa2\x8e\xad\xf0\xa2\x9b\xb3\xf0\xa2\xa1\x9b\xf0\xa2"
576       "\xa2\xab\xf0\xa2\xa6\x8f\xf0\xa2\xaa\xb8\xf0\xa2\xad\x8f\xf0\xa2\xad"
577       "\x90\xf0\xa2\xad\x86\xf0\xa2\xb0\x9d\xf0\xa2\xae\xa6\xf0\xa2\xb0\xa4"
578       "\xf0\xa2\xb7\xa1\xf0\xa3\x87\x83\xf0\xa3\x87\xb5\xf0\xa3\x86\xb6\xf0"
579       "\xa3\x8d\xb2\xf0\xa3\x8f\x93\xf0\xa3\x8f\x92\xf0\xa3\x8f\x90\xf0\xa3\x8f"
580       "\xa4\xf0\xa3\x8f\x95"
581       // "������������������������������������������������������"
582       "\xf0\xa3\x8f\x9a\xf0\xa3\x8f\x9f\xf0\xa3\x91\x8a\xf0\xa3\x91\x91\xf0"
583       "\xa3\x91\x8b\xf0\xa3\x91\xa5\xf0\xa3\x93\xa4\xf0\xa3\x95\x9a\xf0\xa3"
584       "\x96\x94\xf0\xa3\x98\xb9\xf0\xa3\x99\x87\xf0\xa3\x98\xb8\xf0\xa3\x98"
585       "\xba\xf0\xa3\x9c\x9c\xf0\xa3\x9c\x8c\xf0\xa3\x9d\xa4\xf0\xa3\x9f\xbf"
586       "\xf0\xa3\x9f\xa7\xf0\xa3\xa0\xa4\xf0\xa3\xa0\xbd\xf0\xa3\xaa\x98\xf0"
587       "\xa3\xb1\xbf\xf0\xa3\xb4\x80\xf0\xa3\xb5\x80\xf0\xa3\xb7\xba\xf0\xa3"
588       "\xb7\xb9\xf0\xa3\xb7\x93"
589       // "������������������������������������������������������"
590       "\xf0\xa3\xbd\xbe\xf0\xa4\x82\x96\xf0\xa4\x84\x83\xf0\xa4\x87\x86\xf0"
591       "\xa4\x87\xbe\xf0\xa4\x8e\xbc\xf0\xa4\x98\xa9\xf0\xa4\x9a\xa5\xf0\xa4"
592       "\xa2\x96\xf0\xa4\xa9\x8d\xf0\xa4\xad\x96\xf0\xa4\xad\xaf\xf0\xa4\xb0"
593       "\x96\xf0\xa4\xb4\x94\xf0\xa4\xb8\x8e\xf0\xa4\xb8\xb7\xf0\xa4\xb9\xaa"
594       "\xf0\xa4\xba\x8b\xf0\xa5\x81\x8a\xf0\xa5\x81\x95\xf0\xa5\x84\xa2\xf0"
595       "\xa5\x86\xa9\xf0\xa5\x87\xa5\xf0\xa5\x87\x8d\xf0\xa5\x88\x9e\xf0\xa5"
596       "\x89\x8c\xf0\xa5\x90\xae"
597       // "������������������������������������������������������"
598       "\xf0\xa5\x93\x99\xf0\xa5\x96\xa7\xf0\xa5\x9e\xa9\xf0\xa5\x9e\xb4\xf0"
599       "\xa5\xa7\x94\xf0\xa5\xab\xa4\xf0\xa5\xab\xa3\xf0\xa5\xab\xb1\xf0\xa5"
600       "\xae\xb2\xf0\xa5\xb1\x8b\xf0\xa5\xb1\xa4\xf0\xa5\xb8\xae\xf0\xa5\xb9"
601       "\x96\xf0\xa5\xb9\xa5\xf0\xa5\xb9\xa2\xf0\xa5\xbb\x98\xf0\xa5\xbb\x82"
602       "\xf0\xa5\xbb\xa8\xf0\xa5\xbc\xa3\xf0\xa5\xbd\x9c\xf0\xa5\xbf\xa0\xf0"
603       "\xa5\xbf\x94\xf0\xa6\x80\x8c\xf0\xa5\xbf\xbb\xf0\xa6\x80\x97\xf0\xa6"
604       "\x81\xa0\xf0\xa6\x83\xad"
605       // "������������������������������������������������������"
606       "\xf0\xa6\x89\xb0\xf0\xa6\x8a\x86\xf0\xa6\x8d\x8c\xf0\xa3\xb4\x8e\xf0"
607       "\xa6\x90\x82\xf0\xa6\x99\xbe\xf0\xa6\x9a\xb0\xf0\xa6\x9c\x9d\xf0\xa6"
608       "\xa3\x9d\xf0\xa6\xa3\xaa\xf0\xa6\xa5\x91\xf0\xa6\xa5\xaf\xf0\xa6\xa7"
609       "\x9d\xf0\xa6\xa8\x9e\xf0\xa6\xa9\x98\xf0\xa6\xaa\x8c\xf0\xa6\xaa\xb7"
610       "\xf0\xa6\xb1\xb3\xf0\xa6\xb3\x9d\xf0\xa6\xb9\xa5\xf0\xa6\xbe\x94\xf0"
611       "\xa6\xbf\xb8\xf0\xa6\xbf\xb6\xf0\xa6\xbf\xb7\xf0\xa7\x84\x8d\xf0\xa7"
612       "\x84\xb9\xf0\xa7\x8f\x9b"
613       // "������������������������������������������������������"
614       "\xf0\xa7\x8f\x9a\xf0\xa7\x8f\xbe\xf0\xa7\x90\x90\xf0\xa7\x91\x89\xf0"
615       "\xa7\x98\x95\xf0\xa7\x98\x94\xf0\xa7\x98\xb1\xf0\xa7\x9a\x93\xf0\xa7"
616       "\x9c\x8e\xf0\xa7\x9c\xa3\xf0\xa7\x9d\x92\xf0\xa7\xa6\x85\xf0\xa7\xaa"
617       "\x84\xf0\xa7\xae\xb3\xf0\xa7\xae\xbe\xf0\xa7\xaf\x87\xf0\xa7\xb2\xb8"
618       "\xf0\xa7\xb6\xa0\xf0\xa7\xb8\x90\xf0\xa7\xbe\xb7\xf0\xa8\x82\x8a\xf0"
619       "\xa8\x82\xbb\xf0\xa8\x8a\x82\xf0\xa8\x8b\xb3\xf0\xa8\x90\x8c\xf0\xa8"
620       "\x91\x95\xf0\xa8\x95\xab"
621       // "������������������������������������������������������"
622       "\xf0\xa8\x97\x88\xf0\xa8\x97\x89\xf0\xa8\x9b\x97\xf0\xa8\x9b\xba\xf0"
623       "\xa8\xa5\x89\xf0\xa8\xa5\x86\xf0\xa8\xa5\xab\xf0\xa8\xa6\x87\xf0\xa8"
624       "\xa6\x88\xf0\xa8\xa6\xba\xf0\xa8\xa6\xbb\xf0\xa8\xa8\x9e\xf0\xa8\xa8"
625       "\xa9\xf0\xa8\xa9\xb1\xf0\xa8\xa9\x83\xf0\xa8\xaa\x99\xf0\xa8\xab\x8d"
626       "\xf0\xa8\xab\xa4\xf0\xa8\xab\x9d\xf0\xa8\xaf\x81\xf0\xa8\xaf\xaf\xf0\xa8"
627       "\xb4\x90\xf0\xa8\xb5\xb1\xf0\xa8\xb7\xbb\xf0\xa8\xb8\x9f\xf0\xa8\xb8"
628       "\xb6\xf0\xa8\xba\x89"
629       // "������������������������������������������������������"
630       "\xf0\xa8\xbb\xab\xf0\xa8\xbc\xb2\xf0\xa8\xbf\xb8\xf0\xa9\x8a\xa0\xf0"
631       "\xa9\x8a\xb1\xf0\xa9\x92\x90\xf0\xa9\x97\x8f\xf0\xa9\x99\xbf\xf0\xa9"
632       "\x9b\xb0\xf0\xa9\x9c\x99\xf0\xa9\x9d\x90\xf0\xa9\xa3\x86\xf0\xa9\xa9"
633       "\xb2\xf0\xa9\xb7\x9b\xf0\xa9\xb8\xbd\xf0\xa9\xb8\x95\xf0\xa9\xba\x8a"
634       "\xf0\xa9\xb9\x89\xf0\xa9\xbb\x84\xf0\xa9\xbb\xa9\xf0\xa9\xbb\x9b\xf0"
635       "\xa9\xbf\x8e\xf0\xaa\x80\xaf\xf0\xaa\x80\x9a\xf0\xaa\x83\xb9\xf0\xaa"
636       "\x82\x82\xf0\xa2\x88\x98"
637       // "������������"
638       "\xf0\xaa\x8e\x8c\xf0\xaa\x90\xb7\xf0\xaa\x97\xb1\xf0\xaa\x98\x82\xf0"
639       "\xaa\x98\x9a\xf0\xaa\x9a\xb2";
640   string encoded;
641   codec->EncodeValue(ucs4_including, &encoded);
642   EXPECT_GT(encoded.size(), 0);
643   string decoded;
644   codec->DecodeValue(encoded, &decoded);
645   EXPECT_EQ(ucs4_including, decoded);
646 }
647 
// Encodes and decodes a pair of tokens: a DEFAULT_VALUE token followed by a
// SAME_AS_PREV_VALUE token.
TEST_F(SystemDictionaryCodecTest, TokenSameValueTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(2);
  SetDefaultValue(&source_tokens_[0]);
  SetSameValue(&source_tokens_[1]);

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
661 
// Encodes and decodes 50 tokens whose value types are chosen at random with a
// fixed seed.
TEST_F(SystemDictionaryCodecTest, TokenRandomValueTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandValue();

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
675 
// Encodes and decodes 50 tokens whose attributes are chosen at random with a
// fixed seed.
TEST_F(SystemDictionaryCodecTest, TokenRandomLabelTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandLabel();

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
689 
// Encodes and decodes 50 tokens with POS, cost, value and label all chosen at
// random with a fixed seed.
TEST_F(SystemDictionaryCodecTest, TokenRandomTest) {
  SystemDictionaryCodecInterface *const codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  // The order of these calls fixes the random sequence each setter sees.
  SetRandPos();
  SetRandCost();
  SetRandValue();
  SetRandLabel();

  string token_bytes;
  codec->EncodeTokens(source_tokens_, &token_bytes);
  EXPECT_GT(token_bytes.size(), 0);

  const unsigned char *const raw =
      reinterpret_cast<const unsigned char *>(token_bytes.data());
  codec->DecodeTokens(raw, &decoded_tokens_);
  CheckDecoded();
}
706 
// Verifies ReadTokenForReverseLookup() by walking the encoded byte stream
// one token at a time.
//
// For each step the codec is pointed at the current byte offset and reports
// a value id, the number of bytes it consumed, and -- through its return
// value -- whether another token follows (a false return is treated as "this
// was the last token").  Source tokens whose value_type is
// TokenInfo::DEFAULT_VALUE are expected to yield their id_in_value_trie; for
// every other token value_id is expected to be -1.  Finally the number of
// tokens read must equal the number of source tokens.
TEST_F(SystemDictionaryCodecTest, ReadTokenRandomTest) {
  SystemDictionaryCodecInterface *codec =
      SystemDictionaryCodecFactory::GetCodec();
  InitTokens(50);
  Util::SetRandomSeed(0);
  SetRandPos();
  SetRandCost();
  SetRandValue();
  SetRandLabel();
  string encoded;
  codec->EncodeTokens(source_tokens_, &encoded);
  // Nothing meaningful can be read back from an empty buffer, so stop the
  // test here instead of walking it.
  ASSERT_FALSE(encoded.empty());

  const unsigned char *const encoded_bytes =
      reinterpret_cast<const unsigned char *>(encoded.data());
  // size_t so that comparisons against source_tokens_.size() do not mix
  // signed and unsigned operands.
  size_t read_num = 0;
  int offset = 0;
  while (true) {
    // Fail fast rather than index past the end of source_tokens_ when the
    // codec never reports the last token.
    ASSERT_LT(read_num, source_tokens_.size());
    int read_byte = 0;
    int value_id = -1;
    const bool is_last_token = !(codec->ReadTokenForReverseLookup(
        encoded_bytes + offset, &value_id, &read_byte));
    if (source_tokens_[read_num].value_type == TokenInfo::DEFAULT_VALUE) {
      EXPECT_EQ(source_tokens_[read_num].id_in_value_trie, value_id);
    } else {
      EXPECT_EQ(-1, value_id);
    }
    // Advance to the start of the next encoded token.
    offset += read_byte;
    ++read_num;
    if (is_last_token) {
      break;
    }
  }
  EXPECT_EQ(source_tokens_.size(), read_num);
}
741 
TEST_F(SystemDictionaryCodecTest,CodecTest)742 TEST_F(SystemDictionaryCodecTest, CodecTest) {
743   unique_ptr<SystemDictionaryCodec> impl(new SystemDictionaryCodec);
744   SystemDictionaryCodecFactory::SetCodec(impl.get());
745   SystemDictionaryCodecInterface *codec =
746       SystemDictionaryCodecFactory::GetCodec();
747   {  // Token
748     InitTokens(50);
749     Util::SetRandomSeed(0);
750     SetRandPos();
751     SetRandCost();
752     SetRandValue();
753     SetRandLabel();
754     string encoded;
755     codec->EncodeTokens(source_tokens_, &encoded);
756     EXPECT_GT(encoded.size(), 0);
757     codec->DecodeTokens(reinterpret_cast<const unsigned char *>(encoded.data()),
758                         &decoded_tokens_);
759     CheckDecoded();
760 
761     // ReadTokens
762     int read_num = 0;
763     int offset = 0;
764     while (true) {
765       int read_byte = 0;
766       int value_id = -1;
767       const bool is_last_token = !(codec->ReadTokenForReverseLookup(
768           reinterpret_cast<const unsigned char *>(encoded.data()) + offset,
769           &value_id,
770           &read_byte));
771       if (source_tokens_[read_num].value_type == TokenInfo::DEFAULT_VALUE) {
772         EXPECT_EQ(source_tokens_[read_num].id_in_value_trie, value_id);
773       } else {
774         EXPECT_EQ(-1, value_id);
775       }
776       offset += read_byte;
777       ++read_num;
778       if (is_last_token) {
779         break;
780       }
781     }
782     EXPECT_EQ(source_tokens_.size(), read_num);
783   }
784   {  // Value
785     string original;
786     {
787       char32 a_ucs4 = '!';
788       Util::SetRandomSeed(0);
789       for (size_t i = 0; i < 10000; ++i) {
790         // U+4E00-9FFF CJK Unified Ideographs
791         const char32 c = a_ucs4 + static_cast<uint16>(Util::Random(0x9f00));
792         Util::UCS4ToUTF8Append(c, &original);
793       }
794     }
795     string encoded;
796     codec->EncodeValue(original, &encoded);
797     string decoded;
798     codec->DecodeValue(encoded, &decoded);
799     EXPECT_EQ(original, decoded);
800   }
801   {  // Key
802     string original;
803     {
804       char32 a_ucs4 = 0x3041;  // "ぁ"
805       Util::SetRandomSeed(0);
806       for (size_t i = 0; i < 1000; ++i) {
807         const char32 c = a_ucs4 + static_cast<uint16>(Util::Random(1000));
808         Util::UCS4ToUTF8Append(c, &original);
809       }
810     }
811     string encoded;
812     codec->EncodeKey(original, &encoded);
813     EXPECT_EQ(encoded.size(), codec->GetEncodedKeyLength(original));
814     string decoded;
815     codec->DecodeKey(encoded, &decoded);
816     EXPECT_EQ(original, decoded);
817     EXPECT_EQ(decoded.size(), codec->GetDecodedKeyLength(encoded));
818   }
819 }
820 
821 
822 }  // namespace dictionary
823 }  // namespace mozc
824