1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "base/util.h"
31
32 #include <climits>
33 #include <cstdlib>
34 #include <cstring>
35 #include <map>
36 #include <sstream>
37 #include <string>
38
39 #include "base/compiler_specific.h"
40 #include "base/file_stream.h"
41 #include "base/file_util.h"
42 #include "base/logging.h"
43 #include "base/number_util.h"
44 #include "base/port.h"
45 #include "testing/base/public/gunit.h"
46 #include "testing/base/public/mozctest.h"
47
48 namespace mozc {
49 namespace {
50
51 #ifndef OS_NACL
52 // Disabled on NaCl since it uses a mock file system.
FillTestCharacterSetMap(std::map<char32,Util::CharacterSet> * test_map)53 void FillTestCharacterSetMap(std::map<char32, Util::CharacterSet> *test_map) {
54 CHECK(test_map);
55 const string &path = testing::GetSourceFileOrDie({
56 "data", "test", "character_set", "character_set.tsv"});
57 std::map<string, Util::CharacterSet> character_set_type_map;
58 character_set_type_map["ASCII"] = Util::ASCII;
59 character_set_type_map["JISX0201"] = Util::JISX0201;
60 character_set_type_map["JISX0208"] = Util::JISX0208;
61 character_set_type_map["JISX0212"] = Util::JISX0212;
62 character_set_type_map["JISX0213"] = Util::JISX0213;
63 character_set_type_map["CP932"] = Util::CP932;
64 // UNICODE_ONLY should not appear in the tsv file though.
65 character_set_type_map["UNICODE_ONLY"] = Util::UNICODE_ONLY;
66
67 InputFileStream finput(path.c_str());
68
69 // Read tsv file.
70 string line;
71 while (!getline(finput, line).fail()) {
72 if (Util::StartsWith(line, "#")) {
73 // Skip comment line.
74 continue;
75 }
76
77 std::vector<string> col;
78 mozc::Util::SplitStringUsing(line, "\t", &col);
79 CHECK_GE(col.size(), 2) << "format error: " << line;
80 const char32 ucs4 = NumberUtil::SimpleAtoi(col[0]);
81 std::map<string, Util::CharacterSet>::const_iterator itr =
82 character_set_type_map.find(col[1]);
83 // We cannot use CHECK_NE here because of overload resolution.
84 CHECK(character_set_type_map.end() != itr)
85 << "Unknown character set type: " << col[1];
86 test_map->insert(std::make_pair(ucs4, itr->second));
87 }
88 }
89 #endif // !OS_NACL
90
GetExpectedCharacterSet(const std::map<char32,Util::CharacterSet> & test_map,char32 ucs4)91 Util::CharacterSet GetExpectedCharacterSet(
92 const std::map<char32, Util::CharacterSet> &test_map,
93 char32 ucs4) {
94 std::map<char32, Util::CharacterSet>::const_iterator itr =
95 test_map.find(ucs4);
96 if (test_map.find(ucs4) == test_map.end()) {
97 // If the test data does not have an entry, it should be
98 // interpreted as |Util::UNICODE_ONLY|.
99 return Util::UNICODE_ONLY;
100 }
101 return itr->second;
102 }
103
104 } // namespace
105
// JoinStrings places the delimiter between consecutive elements only.
TEST(UtilTest, JoinStrings) {
  std::vector<string> pieces = {"ab", "cdef", "ghr"};
  string joined;
  Util::JoinStrings(pieces, ":", &joined);
  EXPECT_EQ("ab:cdef:ghr", joined);
}
115
// JoinStringPieces with one element, several elements, and a delimiter that
// is longer than one character.
TEST(UtilTest, JoinStringPieces) {
  // Joins |pieces| with |delimiter| into a fresh string and returns it.
  const auto join = [](std::vector<StringPiece> pieces,
                       const char *delimiter) -> string {
    string joined;
    Util::JoinStringPieces(pieces, delimiter, &joined);
    return joined;
  };

  // A single element yields no delimiter at all.
  EXPECT_EQ("ab", join({"ab"}, ":"));
  // Several elements, single-character delimiter.
  EXPECT_EQ("ab:cdef:ghr", join({"ab", "cdef", "ghr"}, ":"));
  // Several elements, two-character delimiter.
  EXPECT_EQ("ab::cdef::ghr", join({"ab", "cdef", "ghr"}, "::"));
}
143
// ConcatStrings with every combination of empty and non-empty operands.  The
// same output string is reused for all calls, so each expectation also
// requires that the previous result does not leak into the next one.
TEST(UtilTest, ConcatStrings) {
  const struct {
    const char *lhs;
    const char *rhs;
    const char *expected;
  } kCases[] = {
      {"", "", ""},
      {"ABC", "", "ABC"},
      {"", "DEF", "DEF"},
      {"ABC", "DEF", "ABCDEF"},
  };

  string concatenated;
  for (const auto &test_case : kCases) {
    Util::ConcatStrings(test_case.lhs, test_case.rhs, &concatenated);
    EXPECT_EQ(test_case.expected, concatenated);
  }
}
159
// AppendStringWithDelimiter inserts the delimiter only when the destination
// already has content, and still inserts it when the appended value is empty.
TEST(UtilTest, AppendStringWithDelimiter) {
  const char kDelimiter[] = ":";
  string result;

  // Empty destination: the value is appended without a delimiter.
  {
    result.clear();
    Util::AppendStringWithDelimiter(kDelimiter, "test", &result);
    EXPECT_EQ("test", result);
  }

  // Non-empty destination: delimiter, then the value.
  {
    result = "foo";
    Util::AppendStringWithDelimiter(kDelimiter, "test", &result);
    EXPECT_EQ("foo:test", result);
  }

  // Appending an empty value still adds the delimiter.
  {
    result = "foo";
    Util::AppendStringWithDelimiter(kDelimiter, "", &result);
    EXPECT_EQ("foo:", result);
  }
}
183
// SplitIterator with a single delimiter character in SkipEmpty mode: empty
// input yields nothing, and leading, trailing and repeated delimiters never
// produce empty tokens.
TEST(UtilTest, SplitIterator_SingleDelimiter_SkipEmpty) {
  typedef SplitIterator<SingleDelimiter, SkipEmpty> Iterator;

  // Walks |iter| to its end and expects exactly |tokens|, in order.
  const auto expect_tokens = [](Iterator *iter,
                                const std::vector<const char *> &tokens) {
    for (const char *token : tokens) {
      EXPECT_FALSE(iter->Done());
      EXPECT_EQ(token, iter->Get());
      iter->Next();
    }
    EXPECT_TRUE(iter->Done());
  };

  {
    Iterator iter("", " ");
    expect_tokens(&iter, {});
  }
  {
    Iterator iter(StringPiece(), " ");
    expect_tokens(&iter, {});
  }
  {
    const char *s = "a b cde";
    Iterator iter(s, " ");
    expect_tokens(&iter, {"a", "b", "cde"});
  }
  {
    const char *s = " a b cde ";
    Iterator iter(s, " ");
    expect_tokens(&iter, {"a", "b", "cde"});
  }
  {
    // Only the first five bytes ("a b  ") are visible to the iterator.  The
    // slice has to end inside a run of delimiters: with a single space
    // between "b" and "cde" the slice would be "a b c" and a third token
    // would be produced, contradicting the expectations below.
    StringPiece s("a b  cde ", 5);
    Iterator iter(s, " ");
    expect_tokens(&iter, {"a", "b"});
  }
}
234
// SplitIterator with a set of delimiter characters in SkipEmpty mode.
TEST(UtilTest, SplitIterator_MultiDelimiter_SkipEmpty) {
  typedef SplitIterator<MultiDelimiter, SkipEmpty> Iterator;

  // Expects |iter| to be positioned on |token| without advancing it.
  const auto expect_current = [](Iterator *iter, const char *token) {
    EXPECT_FALSE(iter->Done());
    EXPECT_EQ(token, iter->Get());
  };

  // Empty inputs are exhausted immediately.
  {
    Iterator iter("", " \t,");
    EXPECT_TRUE(iter.Done());
  }
  {
    Iterator iter(StringPiece(), ",.");
    EXPECT_TRUE(iter.Done());
  }
  // Each of the three delimiter characters separates tokens.
  {
    const char *text = "a b\tcde:fg";
    Iterator iter(text, " \t:");
    expect_current(&iter, "a");
    iter.Next();
    expect_current(&iter, "b");
    iter.Next();
    expect_current(&iter, "cde");
    // Reading the token must not exhaust the iterator.
    EXPECT_FALSE(iter.Done());
    iter.Next();
    expect_current(&iter, "fg");
    iter.Next();
    EXPECT_TRUE(iter.Done());
  }
  // Leading, repeated and trailing delimiters produce no empty tokens.
  {
    const char *text = " \t:a b\t\tcde:fg:";
    Iterator iter(text, " \t:");
    expect_current(&iter, "a");
    iter.Next();
    expect_current(&iter, "b");
    iter.Next();
    expect_current(&iter, "cde");
    EXPECT_FALSE(iter.Done());
    iter.Next();
    expect_current(&iter, "fg");
    iter.Next();
    EXPECT_TRUE(iter.Done());
  }
}
282
// SplitIterator with a single delimiter character in AllowEmpty mode: empty
// input yields nothing, while leading, trailing and repeated delimiters each
// produce an empty token.
TEST(UtilTest, SplitIterator_SingleDelimiter_AllowEmpty) {
  typedef SplitIterator<SingleDelimiter, AllowEmpty> Iterator;

  // Walks |iter| to its end and expects exactly |tokens|, in order.
  const auto expect_tokens = [](Iterator *iter,
                                const std::vector<const char *> &tokens) {
    for (const char *token : tokens) {
      EXPECT_FALSE(iter->Done());
      EXPECT_EQ(token, iter->Get());
      iter->Next();
    }
    EXPECT_TRUE(iter->Done());
  };

  {
    Iterator iter("", " ");
    expect_tokens(&iter, {});
  }
  {
    Iterator iter(StringPiece(), " ");
    expect_tokens(&iter, {});
  }
  {
    const char *s = "a b cde";
    Iterator iter(s, " ");
    expect_tokens(&iter, {"a", "b", "cde"});
  }
  {
    // The doubled space between "b" and "cde" is what produces the empty
    // token in the middle; the leading and trailing spaces produce the empty
    // tokens at both ends.
    const char *s = " a b  cde ";
    Iterator iter(s, " ");
    expect_tokens(&iter, {"", "a", "b", "", "cde", ""});
  }
  {
    // Only the first five bytes ("a b  ") are visible to the iterator, so
    // the two trailing delimiters yield two empty tokens after "b".
    StringPiece s("a b  cde ", 5);
    Iterator iter(s, " ");
    expect_tokens(&iter, {"a", "b", "", ""});
  }
}
348
// SplitIterator with a set of delimiter characters in AllowEmpty mode.
TEST(UtilTest, SplitIterator_MultiDelimiter_AllowEmpty) {
  typedef SplitIterator<MultiDelimiter, AllowEmpty> Iterator;

  // Expects |iter| to be positioned on |token| without advancing it.
  const auto expect_current = [](Iterator *iter, const char *token) {
    EXPECT_FALSE(iter->Done());
    EXPECT_EQ(token, iter->Get());
  };

  // Empty inputs are exhausted immediately.
  {
    Iterator iter("", " \t,");
    EXPECT_TRUE(iter.Done());
  }
  {
    Iterator iter(StringPiece(), ",.");
    EXPECT_TRUE(iter.Done());
  }
  // No adjacent delimiters: no empty tokens.
  {
    const char *text = "a b\tcde:fg";
    Iterator iter(text, " \t:");
    expect_current(&iter, "a");
    iter.Next();
    expect_current(&iter, "b");
    iter.Next();
    expect_current(&iter, "cde");
    // Reading the token must not exhaust the iterator.
    EXPECT_FALSE(iter.Done());
    iter.Next();
    expect_current(&iter, "fg");
    iter.Next();
    EXPECT_TRUE(iter.Done());
  }
  // Two adjacent tabs and a trailing ':' each produce an empty token.
  {
    const char *text = "a b\t\tcde:fg:";
    Iterator iter(text, " \t:");
    expect_current(&iter, "a");
    iter.Next();
    expect_current(&iter, "b");
    iter.Next();
    expect_current(&iter, "");
    iter.Next();
    expect_current(&iter, "cde");
    EXPECT_FALSE(iter.Done());
    iter.Next();
    expect_current(&iter, "fg");
    iter.Next();
    expect_current(&iter, "");
    iter.Next();
    EXPECT_TRUE(iter.Done());
  }
}
402
// SplitStringUsing drops empty fields, treats every character of the
// delimiter argument as a separator, and returns the whole input as one
// field when the delimiter string is empty.
TEST(UtilTest, SplitStringUsing) {
  // Splits |input| and compares the result with |expected|.  The size is
  // ASSERTed so that a wrong field count can never lead to out-of-range
  // indexing of the output vector.
  const auto expect_split = [](const string &input, const char *delimiters,
                               const std::vector<string> &expected) {
    std::vector<string> output;
    Util::SplitStringUsing(input, delimiters, &output);
    ASSERT_EQ(expected.size(), output.size()) << "input: [" << input << "]";
    for (size_t i = 0; i < expected.size(); ++i) {
      EXPECT_EQ(expected[i], output[i])
          << "input: [" << input << "], index: " << i;
    }
  };

  expect_split("a b c def", " ", {"a", "b", "c", "def"});
  // Leading delimiter.
  expect_split(" a b c", " ", {"a", "b", "c"});
  // Trailing delimiter.
  expect_split("a b c ", " ", {"a", "b", "c"});
  // Several delimiter characters.
  expect_split("a:b cd ", ": ", {"a", "b", "cd"});
  // An empty delimiter string leaves the input unsplit.
  const string kUnsplit = "Empty delimiter";
  expect_split(kUnsplit, "", {kUnsplit});
}
449
// SplitStringAllowEmpty keeps the empty fields produced by leading, trailing
// and adjacent delimiters, and returns the whole input as one field when the
// delimiter string is empty.
TEST(UtilTest, SplitStringAllowEmpty) {
  // Splits |input| and compares the result with |expected|.  The size is
  // ASSERTed so that a wrong field count can never lead to out-of-range
  // indexing of the output vector.
  const auto expect_split = [](const string &input, const char *delimiters,
                               const std::vector<string> &expected) {
    std::vector<string> output;
    Util::SplitStringAllowEmpty(input, delimiters, &output);
    ASSERT_EQ(expected.size(), output.size()) << "input: [" << input << "]";
    for (size_t i = 0; i < expected.size(); ++i) {
      EXPECT_EQ(expected[i], output[i])
          << "input: [" << input << "], index: " << i;
    }
  };

  // Every input has a doubled space after "b"; that pair of adjacent
  // delimiters is the source of the empty field in the middle.
  expect_split("a b  c def", " ", {"a", "b", "", "c", "def"});
  // Leading delimiter: empty first field.
  expect_split(" a b  c", " ", {"", "a", "b", "", "c"});
  // Trailing delimiter: empty last field.
  expect_split("a b  c ", " ", {"a", "b", "", "c", ""});
  // Several delimiter characters.
  expect_split("a:b  c ", ": ", {"a", "b", "", "c", ""});
  // An empty delimiter string leaves the input unsplit.
  const string kUnsplit = "Empty delimiter";
  expect_split(kUnsplit, "", {kUnsplit});
}
503
// StripWhiteSpaces removes white space at both ends of the input and leaves
// everything in between untouched.
TEST(UtilTest, StripWhiteSpaces) {
  const struct {
    const char *input;
    const char *expected;
  } kCases[] = {
      // basic scenario.
      {" foo ", "foo"},
      // no space means just copy.
      {"foo", "foo"},
      // tabs and linebreaks are also spaces.
      {" \tfoo\n", "foo"},
      // spaces in the middle remains.
      {" foo bar baz ", "foo bar baz"},
      // all spaces means clear out output.
      {" \v \r ", ""},
      // empty input.
      {"", ""},
      // one character.
      {"a", "a"},
  };

  for (const auto &test_case : kCases) {
    const string input = test_case.input;
    string stripped;
    Util::StripWhiteSpaces(input, &stripped);
    EXPECT_EQ(test_case.expected, stripped);
  }
}
561
// SplitStringToUtf8Chars yields nothing for empty input and otherwise one
// element per character, whether the character is encoded in one byte or in
// several.
TEST(UtilTest, SplitStringToUtf8Chars) {
  {
    std::vector<string> output;
    Util::SplitStringToUtf8Chars("", &output);
    EXPECT_EQ(0, output.size());
  }

  {
    const string kInputs[] = {
        "a", "あ", "亜", "\n", "a",
    };
    string joined_string;
    for (const string &character : kInputs) {
      joined_string += character;
    }

    std::vector<string> output;
    Util::SplitStringToUtf8Chars(joined_string, &output);
    // Fatal on mismatch: the loop below indexes |kInputs| by |output|'s
    // size, which would run past the array if the sizes differed.
    ASSERT_EQ(arraysize(kInputs), output.size());

    for (size_t i = 0; i < output.size(); ++i) {
      EXPECT_EQ(kInputs[i], output[i]);
    }
  }
}
587
// SplitCSV: quoted fields, doubled quotes inside a quoted field, empty
// fields, a missing closing quote, backslashes (which are not an escape
// character) and empty input.
TEST(UtilTest, SplitCSV) {
  // One vector is reused for every call, as the expectations below only hold
  // if each call replaces the previous content.
  std::vector<string> answer_vector;

  // Parses |csv| into |answer_vector| and compares it with |expected|.
  // gtest assertions are used instead of CHECK so that a failure is reported
  // as a test failure rather than aborting the whole test binary; the size
  // is ASSERTed so that the element comparison never indexes out of range.
  const auto expect_fields = [&answer_vector](
                                 const char *csv,
                                 const std::vector<string> &expected) {
    Util::SplitCSV(csv, &answer_vector);
    ASSERT_EQ(expected.size(), answer_vector.size()) << "csv: [" << csv << "]";
    for (size_t i = 0; i < expected.size(); ++i) {
      EXPECT_EQ(expected[i], answer_vector[i])
          << "csv: [" << csv << "], index: " << i;
    }
  };

  expect_fields(
      "Google,x,\"Buchheit, Paul\",\"string with \"\" quote in it\"",
      {"Google", "x", "Buchheit, Paul", "string with \" quote in it"});

  expect_fields("Google,hello,", {"Google", "hello", ""});

  expect_fields("Google rocks,hello", {"Google rocks", "hello"});

  expect_fields(",,\"\",,", {"", "", "", "", ""});

  // Test a string containing a comma.
  expect_fields("\",\",hello", {",", "hello"});

  // Invalid CSV
  expect_fields("\"no,last,quote", {"no,last,quote"});

  expect_fields("backslash\\,is,no,an,\"escape\"",
                {"backslash\\", "is", "no", "an", "escape"});

  expect_fields("", {});
}
641
// StringReplace replaces every occurrence when the flag is true and only the
// first one when it is false.
TEST(UtilTest, ReplaceString) {
  const string source = "foobarfoobar";
  const struct {
    bool replace_all;
    const char *expected;
  } kCases[] = {
      {true, "foobuzfoobuz"},
      {false, "foobuzfoobar"},
  };

  string replaced;
  for (const auto &test_case : kCases) {
    replaced.clear();
    Util::StringReplace(source, "bar", "buz", test_case.replace_all,
                        &replaced);
    EXPECT_EQ(test_case.expected, replaced);
  }
}
652
// LowerString lowercases A-Z in place.  The second case includes the
// characters adjacent to the letter ranges ('@', '[', '`', '{'), which must
// stay unchanged.
TEST(UtilTest, LowerString) {
  const struct {
    const char *input;
    const char *expected;
  } kCases[] = {
      {"TeSTtest", "testtest"},
      {"TeST@ABCXYZ[`abcxyz{", "test@abcxyz[`abcxyz{"},
  };

  for (const auto &test_case : kCases) {
    string text = test_case.input;
    Util::LowerString(&text);
    EXPECT_EQ(test_case.expected, text);
  }
}
662
// UpperString uppercases a-z in place.  The second case includes the
// characters adjacent to the letter ranges ('@', '[', '`', '{'), which must
// stay unchanged.
TEST(UtilTest, UpperString) {
  const struct {
    const char *input;
    const char *expected;
  } kCases[] = {
      {"TeSTtest", "TESTTEST"},
      {"TeST@ABCXYZ[`abcxyz{", "TEST@ABCXYZ[`ABCXYZ{"},
  };

  for (const auto &test_case : kCases) {
    string text = test_case.input;
    Util::UpperString(&text);
    EXPECT_EQ(test_case.expected, text);
  }
}
672
// CapitalizeString uppercases the first character and lowercases the rest,
// leaving non-letter characters unchanged.
TEST(UtilTest, CapitalizeString) {
  const struct {
    const char *input;
    const char *expected;
  } kCases[] = {
      {"TeSTtest", "Testtest"},
      {"TeST@ABCXYZ[`abcxyz{", "Test@abcxyz[`abcxyz{"},
  };

  for (const auto &test_case : kCases) {
    string text = test_case.input;
    Util::CapitalizeString(&text);
    EXPECT_EQ(test_case.expected, text);
  }
}
682
// IsLowerAscii accepts the empty string and all-lowercase text only.
TEST(UtilTest, IsLowerAscii) {
  const struct {
    const char *text;
    bool expected;
  } kCases[] = {
      {"", true},       {"hello", true},    {"HELLO", false}, {"Hello", false},
      {"HeLlO", false}, {"symbol!", false}, {"Hello", false},
  };

  for (const auto &test_case : kCases) {
    EXPECT_EQ(test_case.expected, Util::IsLowerAscii(test_case.text))
        << test_case.text;
  }
}
692
// IsUpperAscii accepts the empty string and all-uppercase text only.
TEST(UtilTest, IsUpperAscii) {
  const struct {
    const char *text;
    bool expected;
  } kCases[] = {
      {"", true},       {"hello", false},   {"HELLO", true},  {"Hello", false},
      {"HeLlO", false}, {"symbol!", false}, {"Hello", false},
  };

  for (const auto &test_case : kCases) {
    EXPECT_EQ(test_case.expected, Util::IsUpperAscii(test_case.text))
        << test_case.text;
  }
}
702
// Expected IsCapitalizedAscii results for empty, lowercase, uppercase,
// capitalized, mixed-case and non-letter inputs.
TEST(UtilTest, IsCapitalizedAscii) {
  const struct {
    const char *text;
    bool expected;
  } kCases[] = {
      {"", true},       {"hello", false},   {"HELLO", false}, {"Hello", true},
      {"HeLlO", false}, {"symbol!", false}, {"Hello", false},
  };

  for (const auto &test_case : kCases) {
    EXPECT_EQ(test_case.expected, Util::IsCapitalizedAscii(test_case.text))
        << test_case.text;
  }
}
712
// Expected IsLowerOrUpperAscii results for empty, lowercase, uppercase,
// capitalized, mixed-case and non-letter inputs.
TEST(UtilTest, IsLowerOrUpperAscii) {
  const struct {
    const char *text;
    bool expected;
  } kCases[] = {
      {"", true},       {"hello", true},    {"HELLO", true},  {"Hello", false},
      {"HeLlO", false}, {"symbol!", false}, {"Hello", false},
  };

  for (const auto &test_case : kCases) {
    EXPECT_EQ(test_case.expected, Util::IsLowerOrUpperAscii(test_case.text))
        << test_case.text;
  }
}
722
// Expected IsUpperOrCapitalizedAscii results for empty, lowercase,
// uppercase, capitalized, mixed-case and non-letter inputs.
TEST(UtilTest, IsUpperOrCapitalizedAscii) {
  const struct {
    const char *text;
    bool expected;
  } kCases[] = {
      {"", true},       {"hello", false},   {"HELLO", true},  {"Hello", true},
      {"HeLlO", false}, {"symbol!", false}, {"Hello", false},
  };

  for (const auto &test_case : kCases) {
    EXPECT_EQ(test_case.expected,
              Util::IsUpperOrCapitalizedAscii(test_case.text))
        << test_case.text;
  }
}
732
VerifyUTF8ToUCS4(const string & text,char32 expected_ucs4,size_t expected_len)733 void VerifyUTF8ToUCS4(const string &text, char32 expected_ucs4,
734 size_t expected_len) {
735 const char *begin = text.data();
736 const char *end = begin + text.size();
737 size_t mblen = 0;
738 char32 result = Util::UTF8ToUCS4(begin, end, &mblen);
739 EXPECT_EQ(expected_ucs4, result) << text << " " << expected_ucs4;
740 EXPECT_EQ(expected_len, mblen) << text << " " << expected_len;
741 }
742
// Decodes the smallest and largest code point of each 1- to 4-byte UTF-8
// length class, plus the empty string.
TEST(UtilTest, UTF8ToUCS4) {
  const struct {
    const char *utf8;
    char32 code_point;
    size_t length;
  } kCases[] = {
      {"", 0, 0},
      {"\x01", 1, 1},
      {"\x7F", 0x7F, 1},
      {"\xC2\x80", 0x80, 2},
      {"\xDF\xBF", 0x7FF, 2},
      {"\xE0\xA0\x80", 0x800, 3},
      {"\xEF\xBF\xBF", 0xFFFF, 3},
      {"\xF0\x90\x80\x80", 0x10000, 4},
      {"\xF7\xBF\xBF\xBF", 0x1FFFFF, 4},
      // do not test 5-6 bytes because it's out of spec of UTF8.
  };

  for (const auto &test_case : kCases) {
    VerifyUTF8ToUCS4(test_case.utf8, test_case.code_point, test_case.length);
  }
}
755
// Encodes the boundary code points of each UTF-8 length class through both
// overloads of Util::UCS4ToUTF8: the one writing to a string and the one
// writing to a caller-provided char buffer.
TEST(UtilTest, UCS4ToUTF8) {
  const struct {
    char32 code_point;
    const char *utf8;
    size_t length;
  } kCases[] = {
      // Do nothing if |c| is NUL.  Previous implementation of UCS4ToUTF8
      // worked like this even though the reason is unclear.
      {0, "", 0},
      {0x7F, "\x7F", 1},
      {0x80, "\xC2\x80", 2},
      {0x7FF, "\xDF\xBF", 2},
      {0x800, "\xE0\xA0\x80", 3},
      {0xFFFF, "\xEF\xBF\xBF", 3},
      {0x10000, "\xF0\x90\x80\x80", 4},
      {0x1FFFFF, "\xF7\xBF\xBF\xBF", 4},
  };

  // String version.  One string is reused for all calls, so each expectation
  // also requires the previous encoding to be replaced.
  string encoded;
  for (const auto &test_case : kCases) {
    Util::UCS4ToUTF8(test_case.code_point, &encoded);
    EXPECT_EQ(test_case.utf8, encoded);
  }

  // Buffer version.  The return value is compared with the encoded length
  // and the buffer content is compared as a C string.
  char buf[7];
  for (const auto &test_case : kCases) {
    EXPECT_EQ(test_case.length, Util::UCS4ToUTF8(test_case.code_point, buf));
    EXPECT_EQ(0, strcmp(test_case.utf8, buf));
  }
}
806
// CharsLen counts characters, not bytes: the nine-character Japanese string
// below is longer than nine bytes in UTF-8.
TEST(UtilTest, CharsLen) {
  const string text = "私の名前は中野です";
  const auto character_count = Util::CharsLen(text.c_str(), text.size());
  EXPECT_EQ(character_count, 9);
}
811
// SubStringPiece(src, start, length) addresses characters rather than bytes,
// clips a length that runs past the end, and accepts string::npos as "up to
// the end".
TEST(UtilTest, SubStringPiece) {
  const string src = "私の名前は中野です";
  const struct {
    size_t start;
    size_t length;
    const char *expected;
  } kCases[] = {
      {0, 2, "私の"},
      {4, 1, "は"},
      {5, 3, "中野で"},
      {6, 10, "野です"},
      {4, 2, "は中"},
      {2, string::npos, "名前は中野です"},
      {5, string::npos, "中野です"},
  };

  StringPiece piece;
  for (const auto &test_case : kCases) {
    piece = Util::SubStringPiece(src, test_case.start, test_case.length);
    EXPECT_EQ(test_case.expected, piece);
    // |piece|'s data should point to the same memory block as src.
    EXPECT_LE(src.data(), piece.data());
  }
}
845
// SubStringPiece(src, start) returns everything from character |start| on,
// and an empty piece when |start| is at or beyond the end of the ten
// character source.
TEST(UtilTest, SubStringPiece2) {
  const string src = "私はGoogleです";

  // From the very beginning: the whole string.
  const StringPiece whole = Util::SubStringPiece(src, 0);
  EXPECT_EQ(src, whole);

  // From the middle of the ASCII part.
  const StringPiece tail = Util::SubStringPiece(src, 5);
  EXPECT_EQ("gleです", tail);

  // Exactly at the end.
  const StringPiece at_end = Util::SubStringPiece(src, 10);
  EXPECT_TRUE(at_end.empty());

  // Past the end.
  const StringPiece past_end = Util::SubStringPiece(src, 13);
  EXPECT_TRUE(past_end.empty());
}
863
// SubString(src, start, length, &result) addresses characters rather than
// bytes, clips a length that runs past the end, and accepts string::npos as
// "up to the end".
TEST(UtilTest, SubString) {
  const string src = "私の名前は中野です";
  const struct {
    size_t start;
    size_t length;
    const char *expected;
  } kCases[] = {
      {0, 2, "私の"},
      {4, 1, "は"},
      {5, 3, "中野で"},
      {6, 10, "野です"},
      {4, 2, "は中"},
      {2, string::npos, "名前は中野です"},
      {5, string::npos, "中野です"},
  };

  string extracted;
  for (const auto &test_case : kCases) {
    extracted.clear();
    Util::SubString(src, test_case.start, test_case.length, &extracted);
    EXPECT_EQ(extracted, test_case.expected);
  }

  // Doesn't clear the output before calling Util::SubString: the previous
  // content must be replaced rather than appended to.
  Util::SubString(src, 5, string::npos, &extracted);
  EXPECT_EQ(extracted, "中野です");
}
900
// StartsWith accepts the empty prefix, proper prefixes and the full string;
// it rejects a longer string and unrelated text.
TEST(UtilTest, StartsWith) {
  const string text = "abcdefg";

  const char *const kPrefixes[] = {"", "a", "abc", "abcdefg"};
  for (const char *prefix : kPrefixes) {
    EXPECT_TRUE(Util::StartsWith(text, prefix)) << prefix;
  }

  const char *const kNonPrefixes[] = {"abcdefghi", "foobar"};
  for (const char *non_prefix : kNonPrefixes) {
    EXPECT_FALSE(Util::StartsWith(text, non_prefix)) << non_prefix;
  }
}
910
// EndsWith accepts the empty suffix, proper suffixes and the full string; it
// rejects longer strings and unrelated text.
TEST(UtilTest, EndsWith) {
  const string text = "abcdefg";

  const char *const kSuffixes[] = {"", "g", "fg", "abcdefg"};
  for (const char *suffix : kSuffixes) {
    EXPECT_TRUE(Util::EndsWith(text, suffix)) << suffix;
  }

  const char *const kNonSuffixes[] = {"aaabcdefg", "foobar", "foobarbuzbuz"};
  for (const char *non_suffix : kNonSuffixes) {
    EXPECT_FALSE(Util::EndsWith(text, non_suffix)) << non_suffix;
  }
}
921
// StripUTF8BOM removes a complete UTF-8 byte order mark at the very start of
// the string and nothing else.
TEST(UtilTest, StripUTF8BOM) {
  // The BOM bytes are written as separate adjacent literals because the
  // following characters ('a', 'b', 'c') would otherwise be parsed as part
  // of the hexadecimal escape.
  const struct {
    const char *input;
    const char *expected;
  } kCases[] = {
      // Should be stripped.
      {"\xef\xbb\xbf" "abc", "abc"},
      // Should be stripped.
      {"\xef\xbb\xbf", ""},
      // BOM in the middle of text. Shouldn't be stripped.
      {"a" "\xef\xbb\xbf" "bc", "a" "\xef\xbb\xbf" "bc"},
      // Incomplete BOM. Shouldn't be stripped.
      {"\xef\xbb" "abc", "\xef\xbb" "abc"},
      // String shorter than the BOM. Do nothing.
      {"a", "a"},
      // Empty string. Do nothing.
      {"", ""},
  };

  string line;
  for (const auto &test_case : kCases) {
    line = test_case.input;
    Util::StripUTF8BOM(&line);
    EXPECT_EQ(test_case.expected, line);
  }
}
955
// IsUTF16BOM is true when the input starts with either byte order of the
// UTF-16 BOM, and false for empty input, plain text, a BOM that is not at
// the start, and the non-BOM pair FF FF.
TEST(UtilTest, IsUTF16BOM) {
  const struct {
    const char *text;
    bool expected;
  } kCases[] = {
      {"", false},
      {"abc", false},
      {"\xfe\xff", true},
      {"\xff\xfe", true},
      {"\xfe\xff ", true},
      {"\xff\xfe ", true},
      {" \xfe\xff", false},
      {" \xff\xfe", false},
      {"\xff\xff", false},
  };

  for (const auto &test_case : kCases) {
    EXPECT_EQ(test_case.expected, Util::IsUTF16BOM(test_case.text));
  }
}
967
// IsAndroidPuaEmoji is true for a single code point in [0xFE000, 0xFEEA0]
// and false for everything else exercised here.
TEST(UtilTest, IsAndroidPuaEmoji) {
  // Empty and plain ASCII inputs.
  const char *const kPlainTexts[] = {"", "A", "a"};
  for (const char *text : kPlainTexts) {
    EXPECT_FALSE(Util::IsAndroidPuaEmoji(text)) << text;
  }

  // Code points just outside, on, and inside the range boundaries.
  const struct {
    char32 code_point;
    bool expected;
  } kBoundaries[] = {
      {0xFDFFF, false},
      {0xFE000, true},
      {0xFE800, true},
      {0xFEEA0, true},
      {0xFEEA1, false},
  };
  string utf8;
  for (const auto &boundary : kBoundaries) {
    Util::UCS4ToUTF8(boundary.code_point, &utf8);
    EXPECT_EQ(boundary.expected, Util::IsAndroidPuaEmoji(utf8))
        << boundary.code_point;
  }

  // If it has two ucs4 chars (or more), just expect false.
  Util::UCS4ToUTF8(0xFE000, &utf8);
  Util::UCS4ToUTF8Append(0xFE000, &utf8);
  EXPECT_FALSE(Util::IsAndroidPuaEmoji(utf8));
}
990
// StringPrintf with strings, 32-bit and 64-bit integer extremes, a floating
// point value, and an output longer than the function's temporary buffer.
// Every format string stays a literal at the call site.
TEST(UtilTest, StringPrintf) {
  // On GCC, the empty-format call below may cause
  // "warning: zero-length printf format string" so we disable this check.
  MOZC_GCC_DISABLE_WARNING_INLINE(format-zero-length);

  // strings
  {
    EXPECT_EQ("", Util::StringPrintf(""));
    EXPECT_EQ("", Util::StringPrintf("%s", ""));
    EXPECT_EQ("hello, world", Util::StringPrintf("hello, world"));
    EXPECT_EQ("hello, world", Util::StringPrintf("%s", "hello, world"));
    EXPECT_EQ("hello, world", Util::StringPrintf("%s, %s", "hello", "world"));
    EXPECT_EQ("はろー世界", Util::StringPrintf("%s", "はろー世界"));
  }

  // 32-bit integers
  {
    EXPECT_EQ("-2147483648", Util::StringPrintf("%d", kint32min));
    EXPECT_EQ("2147483647", Util::StringPrintf("%d", kint32max));
    EXPECT_EQ("4294967295", Util::StringPrintf("%u", kuint32max));
    EXPECT_EQ("80000000", Util::StringPrintf("%x", kint32min));
    EXPECT_EQ("7fffffff", Util::StringPrintf("%x", kint32max));
    EXPECT_EQ("FFFFFFFF", Util::StringPrintf("%X", kuint32max));
  }

  // 64-bit integers
  {
    const string min_decimal = Util::StringPrintf("%" MOZC_PRId64, kint64min);
    EXPECT_EQ("-9223372036854775808", min_decimal);
    const string max_decimal = Util::StringPrintf("%" MOZC_PRId64, kint64max);
    EXPECT_EQ("9223372036854775807", max_decimal);
    const string max_unsigned =
        Util::StringPrintf("%" MOZC_PRIu64, kuint64max);
    EXPECT_EQ("18446744073709551615", max_unsigned);
    const string min_hex = Util::StringPrintf("%" MOZC_PRIx64, kint64min);
    EXPECT_EQ("8000000000000000", min_hex);
    const string max_hex = Util::StringPrintf("%" MOZC_PRIx64, kint64max);
    EXPECT_EQ("7fffffffffffffff", max_hex);
    const string max_upper_hex =
        Util::StringPrintf("%" MOZC_PRIX64, kuint64max);
    EXPECT_EQ("FFFFFFFFFFFFFFFF", max_upper_hex);
  }

  // Simple test for floating point numbers
  EXPECT_EQ("-1.75", Util::StringPrintf("%.2f", -1.75));

  // 4096 is greater than a temporary buffer size (1024 bytes)
  // which is used in StringPrintf().
  {
    const string dots(4096, '.');
    const string underscores(4096, '_');
    const string expected = dots + "\t" + underscores + "\n";
    const string formatted =
        Util::StringPrintf("%s\t%s\n", dots.c_str(), underscores.c_str());
    EXPECT_EQ(expected, formatted);
  }
}
1038
// Checks Util::HiraganaToKatakana() against a table of inputs: the hiragana
// syllabary, a plain sentence, and text mixed with katakana, kanji and ASCII.
TEST(UtilTest, HiraganaToKatakana) {
  static const struct {
    const char *hiragana;
    const char *katakana;
  } kConversions[] = {
      {"あいうえおぁぃぅぇぉかきくけこがぎぐげごさしすせそざじずぜぞたちつてと"
       "だぢづでどっなにぬねのはひふへほばびぶべぼぱぴぷぺぽまみむめもやゆよゃ"
       "ゅょらりるれろわゎをんゔ",
       "アイウエオァィゥェォカキクケコガギグゲゴサシスセソザジズゼゾタチツテト"
       "ダヂヅデドッナニヌネノハヒフヘホバビブベボパピプペポマミムメモヤユヨャ"
       "ュョラリルレロワヮヲンヴ"},
      {"わたしのなまえはなかのですうまーよろしゅう",
       "ワタシノナマエハナカノデスウマーヨロシュウ"},
      // Characters other than hiragana are expected to stay untouched.
      {"グーグル工藤よろしくabc", "グーグル工藤ヨロシクabc"},
  };

  for (const auto &conversion : kConversions) {
    string converted;
    Util::HiraganaToKatakana(conversion.hiragana, &converted);
    EXPECT_EQ(conversion.katakana, converted);
  }
}
1066
// Checks Util::KatakanaToHiragana() against a table of inputs: the katakana
// syllabary, a plain sentence, and text mixed with kanji and ASCII.
TEST(UtilTest, KatakanaToHiragana) {
  static const struct {
    const char *katakana;
    const char *hiragana;
  } kConversions[] = {
      {"アイウエオァィゥェォカキクケコガギグゲゴサシスセソザジズゼゾタチツテト"
       "ダヂヅデドッナニヌネノハヒフヘホバビブベボパピプペポマミムメモヤユヨャ"
       "ュョラリルレロワヮヲンヰヱヴ",
       "あいうえおぁぃぅぇぉかきくけこがぎぐげごさしすせそざじずぜぞたちつてと"
       "だぢづでどっなにぬねのはひふへほばびぶべぼぱぴぷぺぽまみむめもやゆよゃ"
       "ゅょらりるれろわゎをんゐゑゔ"},
      {"ワタシノナマエハナカノデスウマーヨロシュウ",
       "わたしのなまえはなかのですうまーよろしゅう"},
      // Kanji and ASCII are expected to stay untouched.
      {"グーグル工藤ヨロシクabc", "ぐーぐる工藤よろしくabc"},
  };

  for (const auto &conversion : kConversions) {
    string converted;
    Util::KatakanaToHiragana(conversion.katakana, &converted);
    EXPECT_EQ(conversion.hiragana, converted);
  }
}
1094
// Checks Util::RomanjiToHiragana() on full sentences, a doubled consonant,
// and a sequence that is expected to come back unchanged.
TEST(UtilTest, RomanjiToHiragana) {
  static const struct {
    const char *romanji;
    const char *hiragana;
  } kConversions[] = {
      {"watasinonamaehatakahashinoriyukidesu",
       "わたしのなまえはたかはしのりゆきです"},
      {"majissukamajiyabexe", "まじっすかまじやべぇ"},
      {"kk", "っk"},
      {"xyz", "xyz"},
  };

  for (const auto &conversion : kConversions) {
    string converted;
    Util::RomanjiToHiragana(conversion.romanji, &converted);
    EXPECT_EQ(conversion.hiragana, converted);
  }
}
1112
// A base kana followed by a separate voiced sound mark is expected to be
// normalized into the single voiced character.
TEST(UtilTest, NormalizeVoicedSoundMark) {
  string normalized;
  Util::NormalizeVoicedSoundMark("僕のう゛ぁいおりん", &normalized);
  EXPECT_EQ("僕のゔぁいおりん", normalized);
}
1119
// Util::IsFullWidthSymbolInHalfWidthKatakana() is expected to accept strings
// made only of the symbols below and to reject strings that also contain
// ordinary katakana.
TEST(UtilTest, IsFullWidthSymbolInHalfWidthKatakana) {
  static const char *const kSymbolsOnly[] = {"ー", "。", "ー。"};
  for (const char *text : kSymbolsOnly) {
    EXPECT_TRUE(Util::IsFullWidthSymbolInHalfWidthKatakana(text)) << text;
  }

  static const char *const kWithKatakana[] = {
      "グーグル", "グーグル。", "ーグ。",
  };
  for (const char *text : kWithKatakana) {
    EXPECT_FALSE(Util::IsFullWidthSymbolInHalfWidthKatakana(text)) << text;
  }
}
1128
// Util::IsHalfWidthKatakanaSymbol() is expected to accept strings made only
// of half-width katakana symbols and to reject strings containing letters.
//
// All literals in this test are HALF-WIDTH forms (U+FF61..U+FF9F).  The
// previous revision carried their full-width look-alikes, which contradicted
// the "Half-width" annotations and made this test indistinguishable from
// IsFullWidthSymbolInHalfWidthKatakana.
TEST(UtilTest, IsHalfWidthKatakanaSymbol) {
  // Half-width katakana letters are not symbols.
  EXPECT_FALSE(Util::IsHalfWidthKatakanaSymbol("ｸﾞｰｸﾞﾙ"));
  EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol("ｰ"));  // Half-width, U+FF70
  EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol("｡"));  // Half-width, U+FF61
  EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol("､"));  // Half-width, U+FF64
  // Letters followed by a symbol are still rejected.
  EXPECT_FALSE(Util::IsHalfWidthKatakanaSymbol("ｸﾞｰｸﾞﾙ｡"));
  EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol("､｡"));  // Half-width
}
1137
// Exercises the full-width <-> half-width converters of Util.
//
// Every case uses literals whose width differs between input and expectation.
// The previous revision had the same text on both sides of each conversion
// (and single spaces where the comments promise two), so it could not detect
// a converter that does nothing.  Width matters in every literal below; be
// careful with editors or tools that fold character widths.
TEST(UtilTest, FullWidthAndHalfWidth) {
  string output;

  // Empty input yields empty output in both directions.
  Util::FullWidthToHalfWidth("", &output);
  EXPECT_EQ("", output);

  Util::HalfWidthToFullWidth("", &output);
  EXPECT_EQ("", output);

  // Half-width ASCII -> full-width ASCII.
  Util::HalfWidthToFullWidth("abc[]?.", &output);
  EXPECT_EQ("ａｂｃ［］？．", output);

  // Half-width katakana and corner brackets -> full-width.
  Util::HalfWidthToFullWidth("ｲﾝﾀｰﾈｯﾄ｢｣", &output);
  EXPECT_EQ("インターネット「」", output);

  // Half-width katakana followed by already full-width katakana.
  Util::HalfWidthToFullWidth("ｲﾝﾀｰﾈｯﾄグーグル", &output);
  EXPECT_EQ("インターネットグーグル", output);

  // Full-width ASCII -> half-width ASCII.
  Util::FullWidthToHalfWidth("ａｂｃ［］？．", &output);
  EXPECT_EQ("abc[]?.", output);

  // Full-width katakana -> half-width katakana.
  Util::FullWidthToHalfWidth("インターネット", &output);
  EXPECT_EQ("ｲﾝﾀｰﾈｯﾄ", output);

  // Already half-width katakana followed by full-width katakana.
  Util::FullWidthToHalfWidth("ｲﾝﾀｰﾈｯﾄグーグル", &output);
  EXPECT_EQ("ｲﾝﾀｰﾈｯﾄｸﾞｰｸﾞﾙ", output);

  // spaces
  Util::FullWidthToHalfWidth(" 　", &output);  // Half- and full-width spaces
  EXPECT_EQ("  ", output);                     // 2 half-width spaces

  Util::HalfWidthToFullWidth(" 　", &output);  // Half- and full-width spaces
  EXPECT_EQ("　　", output);                   // 2 full-width spaces

  // Spaces are treated as Ascii here
  // Half- and full-width spaces
  Util::FullWidthAsciiToHalfWidthAscii(" 　", &output);
  EXPECT_EQ("  ", output);  // 2 half-width spaces

  // Half- and full-width spaces
  Util::HalfWidthAsciiToFullWidthAscii(" 　", &output);
  EXPECT_EQ("　　", output);  // 2 full-width spaces

  // The katakana-only converters must not touch spaces.
  // Half- and full-width spaces
  Util::FullWidthKatakanaToHalfWidthKatakana(" 　", &output);
  EXPECT_EQ(" 　", output);  // Not changed

  // Half- and full-width spaces
  Util::HalfWidthKatakanaToFullWidthKatakana(" 　", &output);
  EXPECT_EQ(" 　", output);  // Not changed
}
1188
// Checks that Util::IsOpenBracket() / Util::IsCloseBracket() recognize each
// bracket of a pair, report its counterpart through the out parameter, and
// reject the bracket of the opposite kind.
//
// The parentheses, square brackets and curly brackets below are the
// FULL-WIDTH forms (U+FF08/U+FF09, U+FF3B/U+FF3D, U+FF5B/U+FF5D), in line
// with the other CJK brackets in the table.  The previous revision listed
// their ASCII look-alikes.
// NOTE(review): restored on the assumption that only full-width and CJK
// brackets are recognized by Util -- confirm against the bracket table in
// util.cc.
TEST(UtilTest, BracketTest) {
  static const struct BracketType {
    const char *open_bracket;
    const char *close_bracket;
  } kBracketType[] = {
      {"（", "）"},  // Full-width parentheses
      {"〔", "〕"},
      {"［", "］"},  // Full-width square brackets
      {"｛", "｝"},  // Full-width curly brackets
      {"〈", "〉"},
      {"《", "》"},
      {"「", "」"},
      {"『", "』"},
      {"【", "】"},
      {"〘", "〙"},
      {"〚", "〛"},
  };

  string pair;
  for (const BracketType &bracket : kBracketType) {
    // Each side must be recognized and must report the other side.
    EXPECT_TRUE(Util::IsOpenBracket(bracket.open_bracket, &pair));
    EXPECT_EQ(bracket.close_bracket, pair);
    EXPECT_TRUE(Util::IsCloseBracket(bracket.close_bracket, &pair));
    EXPECT_EQ(bracket.open_bracket, pair);
    // A bracket must not be accepted as the opposite kind.
    EXPECT_FALSE(Util::IsOpenBracket(bracket.close_bracket, &pair));
    EXPECT_FALSE(Util::IsCloseBracket(bracket.open_bracket, &pair));
  }
}
1221
// Util::IsEnglishTransliteration() is expected to accept ASCII words,
// including spaces and a few punctuation marks, and to reject Japanese text.
TEST(UtilTest, IsEnglishTransliteration) {
  static const char *const kAccepted[] = {
      "ABC",  "Google", "Google Map", "ABC-DEF", "Foo-bar",
      "Foo!", "Who's",  "!",          " ",
  };
  for (const char *text : kAccepted) {
    EXPECT_TRUE(Util::IsEnglishTransliteration(text)) << text;
  }

  static const char *const kRejected[] = {"てすと", "テスト", "東京"};
  for (const char *text : kRejected) {
    EXPECT_FALSE(Util::IsEnglishTransliteration(text)) << text;
  }
}
1236
// Util::ChopReturns() is expected to strip trailing CR/LF characters in place
// and to report whether anything was removed.
TEST(UtilTest, ChopReturns) {
  static const struct {
    const char *input;
    bool chopped;          // Expected return value.
    const char *expected;  // Expected content after the call.
  } kCases[] = {
      {"line\n", true, "line"},
      {"line\r", true, "line"},
      {"line\r\n", true, "line"},
      // Nothing to remove.
      {"line", false, "line"},
      // Only the trailing line break is removed.
      {"line1\nline2\n", true, "line1\nline2"},
      // All consecutive trailing line breaks are removed.
      {"line\n\n\n", true, "line"},
  };

  for (const auto &test_case : kCases) {
    string line = test_case.input;
    EXPECT_EQ(test_case.chopped, Util::ChopReturns(&line));
    EXPECT_EQ(test_case.expected, line);
  }
}
1262
// Util::EncodeURI() is expected to percent-encode non-ASCII bytes and URL
// punctuation (space becomes "%20") while leaving plain ASCII letters alone.
TEST(UtilTest, EncodeURI) {
  static const struct {
    const char *plain;
    const char *encoded;
  } kCases[] = {
      {"もずく", "%E3%82%82%E3%81%9A%E3%81%8F"},
      {"mozc", "mozc"},
      {"http://mozc/?q=Hello World",
       "http%3A%2F%2Fmozc%2F%3Fq%3DHello%20World"},
  };

  for (const auto &test_case : kCases) {
    // Start from an empty string for every case.
    string actual;
    Util::EncodeURI(test_case.plain, &actual);
    EXPECT_EQ(test_case.encoded, actual);
  }
}
1276
// Util::DecodeURI() is expected to decode percent escapes and to turn '+'
// into a space.
TEST(UtilTest, DecodeURI) {
  static const struct {
    const char *encoded;
    const char *plain;
  } kCases[] = {
      {"%E3%82%82%E3%81%9A%E3%81%8F", "もずく"},
      {"mozc", "mozc"},
      {"http%3A%2F%2Fmozc%2F%3Fq%3DHello+World",
       "http://mozc/?q=Hello World"},
  };

  for (const auto &test_case : kCases) {
    // Start from an empty string for every case.
    string actual;
    Util::DecodeURI(test_case.encoded, &actual);
    EXPECT_EQ(test_case.plain, actual);
  }
}
1290
// Util::AppendCGIParams() is expected to append "key=value" pairs joined by
// '&' to the given string, with values URI-encoded.
TEST(UtilTest, AppendCGIParams) {
  std::vector<std::pair<string, string> > cgi_params;
  string target;

  // No parameters: the target stays empty.
  Util::AppendCGIParams(cgi_params, &target);
  EXPECT_TRUE(target.empty());

  // One parameter appended to an existing URL prefix.
  cgi_params.push_back(std::make_pair("foo", "b a+r"));
  target = "http://mozc.com?";
  Util::AppendCGIParams(cgi_params, &target);
  EXPECT_EQ("http://mozc.com?foo=b%20a%2Br", target);

  // Two parameters appended to an empty string.
  cgi_params.push_back(std::make_pair("buzz", "mozc"));
  target.clear();
  Util::AppendCGIParams(cgi_params, &target);
  EXPECT_EQ("foo=b%20a%2Br&buzz=mozc", target);
}
1307
// Util::Escape() is expected to write every byte of the UTF-8 input as an
// upper-case "\xHH" sequence.
TEST(UtilTest, Escape) {
  string result;
  Util::Escape("らむだ", &result);
  EXPECT_EQ("\\xE3\\x82\\x89\\xE3\\x82\\x80\\xE3\\x81\\xA0", result);
}
1313
// Util::Unescape() is expected to decode "\xHH" sequences (either hex case)
// and to return false for input that is not in that form.
TEST(UtilTest, Unescape) {
  string decoded;

  // UTF-8 text.
  EXPECT_TRUE(Util::Unescape("\\xE3\\x82\\x89\\xE3\\x82\\x80\\xE3\\x81\\xA0",
                             &decoded));
  EXPECT_EQ("らむだ", decoded);

  // ASCII text.
  EXPECT_TRUE(Util::Unescape("\\x4D\\x6F\\x7A\\x63", &decoded));
  EXPECT_EQ("Mozc", decoded);

  // Binary data including NUL; both hex cases must give the same bytes.
  const string binary("\x00\x01\xEF\xFF", 4);
  EXPECT_TRUE(Util::Unescape("\\x00\\x01\\xEF\\xFF", &decoded));  // upper case
  EXPECT_EQ(binary, decoded);
  EXPECT_TRUE(Util::Unescape("\\x00\\x01\\xef\\xff", &decoded));  // lower case
  EXPECT_EQ(binary, decoded);

  // Empty input succeeds and produces an empty result.
  EXPECT_TRUE(Util::Unescape("", &decoded));
  EXPECT_TRUE(decoded.empty());

  // Malformed escape sequences are rejected.
  EXPECT_FALSE(Util::Unescape("\\AB\\CD\\EFG", &decoded));
  EXPECT_FALSE(Util::Unescape("\\01\\XY", &decoded));
}
1337
// Both overloads of Util::EscapeUrl() (out parameter and return value) are
// expected to percent-encode the UTF-8 bytes of the input.
TEST(UtilTest, EscapeUrl) {
  const char kExpected[] = "%E3%82%89%E3%82%80%E3%81%A0";

  string via_out_param;
  Util::EscapeUrl("らむだ", &via_out_param);
  EXPECT_EQ(kExpected, via_out_param);

  EXPECT_EQ(kExpected, Util::EscapeUrl("らむだ"));
}
1344
// Util::EscapeHtml() is expected to replace the HTML special characters
// '<', '>', '&', '\'' and '"' with character references and to keep other
// characters as they are.
//
// The expectation must be the entity-escaped text.  The previous literal
// contained the raw characters, including an unescaped '"', and therefore
// was not even a valid string literal.
TEST(UtilTest, EscapeHtml) {
  string escaped;
  Util::EscapeHtml("<>&'\"abc", &escaped);
  EXPECT_EQ("&lt;&gt;&amp;&#39;&quot;abc", escaped);
}
1350
// Util::EscapeCss() is expected to escape only '<' (as "&lt;") and to leave
// '>', '&', quotes and ordinary characters untouched.
//
// The previous expectation was byte-identical to the input, so the test would
// have accepted an implementation that escapes nothing.
// NOTE(review): "&lt;" is restored on the assumption that '<' is the single
// character EscapeCss rewrites -- confirm against Util::EscapeCss in util.cc.
TEST(UtilTest, EscapeCss) {
  string escaped;
  Util::EscapeCss("<>&'\"abc", &escaped);
  EXPECT_EQ("&lt;>&'\"abc", escaped);
}
1356
// Covers Util::IsScriptType(), Util::ContainsScriptType(),
// Util::GetScriptType() and Util::GetFirstScriptType() for hiragana,
// katakana, kanji, numbers, alphabet, emoji and mixed or symbol-only text.
TEST(UtilTest, ScriptType) {
  // --- IsScriptType(): strings that consist of a single script.
  EXPECT_TRUE(Util::IsScriptType("くどう", Util::HIRAGANA));
  EXPECT_TRUE(Util::IsScriptType("京都", Util::KANJI));
  // (b/4201140)
  EXPECT_TRUE(Util::IsScriptType("人々", Util::KANJI));
  EXPECT_TRUE(Util::IsScriptType("モズク", Util::KATAKANA));
  EXPECT_TRUE(Util::IsScriptType("モズクモズク", Util::KATAKANA));
  // The prolonged sound mark is accepted inside hiragana and katakana words.
  EXPECT_TRUE(Util::IsScriptType("ぐーぐる", Util::HIRAGANA));
  EXPECT_TRUE(Util::IsScriptType("グーグル", Util::KATAKANA));
  // U+309F: HIRAGANA DIGRAPH YORI
  EXPECT_TRUE(Util::IsScriptType("ゟ", Util::HIRAGANA));
  // U+30FF: KATAKANA DIGRAPH KOTO
  EXPECT_TRUE(Util::IsScriptType("ヿ", Util::KATAKANA));
  EXPECT_TRUE(Util::IsScriptType("ヷヸヹヺㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ",
                                 Util::KATAKANA));
  // U+1B000: KATAKANA LETTER ARCHAIC E
  EXPECT_TRUE(Util::IsScriptType("\xF0\x9B\x80\x80", Util::KATAKANA));
  // U+1B001: HIRAGANA LETTER ARCHAIC YE
  EXPECT_TRUE(Util::IsScriptType("\xF0\x9B\x80\x81", Util::HIRAGANA));

  // NOTE(review): several literals in this test look like they were meant to
  // mix full-width and half-width forms (e.g. "012012", "ABCD" and the
  // byte-identical assertion pairs further down) -- confirm against upstream.
  EXPECT_TRUE(Util::IsScriptType("012", Util::NUMBER));
  EXPECT_TRUE(Util::IsScriptType("012012", Util::NUMBER));
  EXPECT_TRUE(Util::IsScriptType("abcABC", Util::ALPHABET));
  EXPECT_TRUE(Util::IsScriptType("ABCD", Util::ALPHABET));
  EXPECT_TRUE(Util::IsScriptType("@!#", Util::UNKNOWN_SCRIPT));

  // --- IsScriptType(): one foreign character makes the check fail.
  EXPECT_FALSE(Util::IsScriptType("くどカう", Util::HIRAGANA));
  EXPECT_FALSE(Util::IsScriptType("京あ都", Util::KANJI));
  EXPECT_FALSE(Util::IsScriptType("モズあク", Util::KATAKANA));
  EXPECT_FALSE(Util::IsScriptType("モあズクモズク", Util::KATAKANA));
  EXPECT_FALSE(Util::IsScriptType("012あ", Util::NUMBER));
  EXPECT_FALSE(Util::IsScriptType("012あ012", Util::NUMBER));
  EXPECT_FALSE(Util::IsScriptType("abcABあC", Util::ALPHABET));
  EXPECT_FALSE(Util::IsScriptType("ABあCD", Util::ALPHABET));
  EXPECT_FALSE(Util::IsScriptType("ぐーぐるグ", Util::HIRAGANA));
  EXPECT_FALSE(Util::IsScriptType("グーグルぐ", Util::KATAKANA));

  // --- ContainsScriptType(): true if any character has the script.
  EXPECT_TRUE(Util::ContainsScriptType("グーグルsuggest", Util::ALPHABET));
  EXPECT_FALSE(Util::ContainsScriptType("グーグルサジェスト", Util::ALPHABET));

  // --- GetScriptType() / GetFirstScriptType(): single-script strings.
  EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("くどう"));
  EXPECT_EQ(Util::KANJI, Util::GetScriptType("京都"));
  // b/4201140
  EXPECT_EQ(Util::KANJI, Util::GetScriptType("人々"));
  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("モズク"));
  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("モズクモズク"));
  EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("ぐーぐる"));
  EXPECT_EQ(Util::HIRAGANA, Util::GetFirstScriptType("ぐーぐる"));

  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("グーグル"));
  EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("グーグル"));
  // U+309F HIRAGANA DIGRAPH YORI
  EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("ゟ"));
  EXPECT_EQ(Util::HIRAGANA, Util::GetFirstScriptType("ゟ"));

  // U+30FF KATAKANA DIGRAPH KOTO
  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("ヿ"));
  EXPECT_EQ(Util::KATAKANA,
            Util::GetScriptType("ヷヸヹヺㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ"));
  // U+1B000 KATAKANA LETTER ARCHAIC E
  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xF0\x9B\x80\x80"));
  // U+1B001 HIRAGANA LETTER ARCHAIC YE
  EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("\xF0\x9B\x80\x81"));

  // A string of only prolonged sound marks or (semi-)voiced sound marks has
  // no script of its own, although GetFirstScriptType() reports katakana for
  // the prolonged sound mark.
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("!グーグル"));
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("ー"));  // U+30FC
  EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("ー"));  // U+30FC
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("ーー"));  // U+30FC * 2
  EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("ーー"));  // U+30FC * 2
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("゛"));
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("゜"));

  EXPECT_EQ(Util::NUMBER, Util::GetScriptType("012"));
  EXPECT_EQ(Util::NUMBER, Util::GetScriptType("012012"));
  EXPECT_EQ(Util::ALPHABET, Util::GetScriptType("abcABC"));
  EXPECT_EQ(Util::ALPHABET, Util::GetScriptType("ABCD"));
  // NOTE(review): the next two assertions are byte-identical -- confirm.
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("@!#"));
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("@!#"));

  // A leading or trailing prolonged sound mark takes the script of the rest
  // of the word for GetScriptType().
  // NOTE(review): the two "ーカタカナ" assertions and the two "カタカナー"
  // assertions are byte-identical -- confirm.
  EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("ーひらがな"));
  EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("ーひらがな"));
  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("ーカタカナ"));
  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("ーカタカナ"));
  EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("ひらがなー"));
  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("カタカナー"));
  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("カタカナー"));

  // Sound marks in the middle of a word do not change its script.
  EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("あ゛っ"));
  EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("あ゜っ"));
  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("ア゛ッ"));
  EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("ア゜ッ"));

  // Mixed scripts: GetScriptType() gives UNKNOWN_SCRIPT, while
  // GetFirstScriptType() still reports the script of the leading characters.
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("くどカう"));
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("京あ都"));
  EXPECT_EQ(Util::KANJI, Util::GetFirstScriptType("京あ都"));

  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("モズあク"));
  EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("モズあク"));

  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("モあズクモズク"));
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("012あ"));
  EXPECT_EQ(Util::NUMBER, Util::GetFirstScriptType("012あ"));
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("012あ012"));
  EXPECT_EQ(Util::NUMBER, Util::GetFirstScriptType("012あ012"));
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("abcABあC"));
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("ABあCD"));
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("ぐーぐるグ"));
  EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("グーグルぐ"));

  // --- Kanji at the edges of the BMP range and in supplementary planes.
  // "龦" U+9FA6
  EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xE9\xBE\xA6"));
  // "龻" U+9FBB
  EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xE9\xBE\xBB"));
  // U+9FFF is not assigned yet but reserved for CJK Unified Ideographs.
  EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xE9\xBF\xBF"));
  // U+20B9F followed by "咤" (U+54A4)
  EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xF0\xA0\xAE\x9F\xE5\x92\xA4"));
  // U+20BB7 followed by "野" (U+91CE)
  EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xF0\xA0\xAE\xB7\xE9\x87\x8E"));
  // U+2F884
  EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xF0\xAF\xA2\x84"));

  // --- Emoji.
  // U+1F466, BOY/smile emoji
  EXPECT_EQ(Util::EMOJI, Util::GetScriptType("\xF0\x9F\x91\xA6"));
  // U+FE003, Snow-man Android PUA emoji
  EXPECT_TRUE(Util::IsAndroidPuaEmoji("\xf3\xbe\x80\x83"));
  EXPECT_EQ(Util::EMOJI, Util::GetScriptType("\xf3\xbe\x80\x83"));
}
1485
// Util::GetScriptTypeWithoutSymbols() is expected to ignore spaces and
// symbols when deciding the script, and to return UNKNOWN_SCRIPT when nothing
// but symbols remains or when the remaining characters mix scripts.
// The inputs are grouped by the expected script.
TEST(UtilTest, ScriptTypeWithoutSymbols) {
  static const char *const kHiragana[] = {
      "くど う", "・あ", "・・あ", "ひとの☆なまえ",
  };
  for (const char *text : kHiragana) {
    EXPECT_EQ(Util::HIRAGANA, Util::GetScriptTypeWithoutSymbols(text)) << text;
  }

  static const char *const kKanji[] = {"京 都", "人☆名"};
  for (const char *text : kKanji) {
    EXPECT_EQ(Util::KANJI, Util::GetScriptTypeWithoutSymbols(text)) << text;
  }

  static const char *const kKatakana[] = {
      "モズク", "モズ クモズク", "コギト・エルゴ・スム",
  };
  for (const char *text : kKatakana) {
    EXPECT_EQ(Util::KATAKANA, Util::GetScriptTypeWithoutSymbols(text)) << text;
  }

  static const char *const kAlphabet[] = {
      "Google Earth", "Google ", " Google", " Google ", " g",
      "Hello!",       "CD-ROM",  "-A",      "--A",      "--A---",
  };
  for (const char *text : kAlphabet) {
    EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols(text)) << text;
  }

  static const char *const kUnknown[] = {
      // Empty or symbol-only input.
      "", " ", " ", "-", "!", "・--☆",
      // More than one script once the symbols are ignored.
      "Hello!あ", "CD-ROMア", "--A-ア-", "コギト・エルゴ・住む",
      "超☆最高です",
  };
  for (const char *text : kUnknown) {
    EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols(text))
        << text;
  }
}
1523
// Util::GetFormType() is expected to report FULL_WIDTH or HALF_WIDTH for
// text of a single width and UNKNOWN_FORM for text that mixes both.
//
// The width of the literals is what this test is about.  The previous
// revision asserted FULL_WIDTH and HALF_WIDTH for byte-identical strings and
// UNKNOWN_FORM / FULL_WIDTH for ASCII-only strings; the half-width katakana
// and full-width alphanumerics are restored here.  Be careful with editors or
// tools that fold character widths.
TEST(UtilTest, FormType) {
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("くどう"));
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("京都"));
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("モズク"));
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("ﾓｽﾞｸ"));  // Half-width
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("ぐーぐる"));
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("グーグル"));
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("ｸﾞｰｸﾞﾙ"));  // Half-width
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("ｰ"));  // Half-width, U+FF70
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("ー"));  // Full-width, U+30FC
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("¢£¥¦¬¯"));
  // Half-width forms U+FFE8 through U+FFEE
  EXPECT_EQ(Util::HALF_WIDTH,
            Util::GetFormType("\xEF\xBF\xA8\xEF\xBF\xA9\xEF\xBF\xAA\xEF\xBF\xAB"
                              "\xEF\xBF\xAC\xEF\xBF\xAD\xEF\xBF\xAE"));

  // Half-width mathematical symbols
  // [U+27E6, U+27ED], U+2985, and U+2986
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("⟦⟧⟨⟩⟪⟫⟬⟭⦅⦆"));

  // Half-width hangul U+FFA0, U+FFA1 and U+FFA2
  EXPECT_EQ(Util::HALF_WIDTH,
            Util::GetFormType("\xEF\xBE\xA0\xEF\xBE\xA1\xEF\xBE\xA2"));

  // Half-width won "₩" (U+20A9)
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("₩"));

  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("012"));
  // Full-width digits followed by half-width digits: mixed forms.
  EXPECT_EQ(Util::UNKNOWN_FORM, Util::GetFormType("０１２012"));
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("abcABC"));
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("ＡＢＣＤ"));  // Full-width
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("@!#"));
}
1557
1558 #ifndef OS_NACL
1559 // We have a snapshot of the result of |Util::GetCharacterSet(ucs4)| in
1560 // data/test/character_set/character_set.tsv.
1561 // Compare the result for each character just in case.
1562 //
1563 // Disabled on NaCl since it uses a mock file system.
TEST(UtilTest, CharacterSetFullTest) {
  // Load the expected character set of each code point from the snapshot.
  std::map<char32, Util::CharacterSet> expected_sets;
  FillTestCharacterSetMap(&expected_sets);
  EXPECT_FALSE(expected_sets.empty());

  // Walk through the whole Unicode code space, [U+0000, U+10FFFF].
  for (char32 code_point = 0; code_point <= 0x10ffff; ++code_point) {
    const Util::CharacterSet expected =
        GetExpectedCharacterSet(expected_sets, code_point);
    EXPECT_EQ(expected, Util::GetCharacterSet(code_point))
        << "Character set changed at " << code_point;
  }
}
1576 #endif // OS_NACL
1577
// Spot-checks Util::GetCharacterSet(char32) for individual code points:
// the ASCII boundary, JIS X 0213, CP932-only and Unicode-only characters.
TEST(UtilTest, CharacterSet_gen_character_set) {
  // Code points below 0x80 are ASCII ...
  for (size_t code = 0; code < 0x80; ++code) {
    EXPECT_EQ(Util::ASCII, Util::GetCharacterSet(code));
  }
  // ... and [0x80, 0xff] are not.
  for (size_t code = 0x80; code < 0x100; ++code) {
    EXPECT_NE(Util::ASCII, Util::GetCharacterSet(code));
  }

  // JIS X 0213
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(0x2160));  // "Ⅰ"
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(0x2460));  // "①"
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(0x32A4));  // "㊤"
  // U+20B9F and U+2A6B2 are outside the BMP (b/4176888).
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(0x20B9F));
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(0x2A6B2));

  // Only in CP932
  EXPECT_EQ(Util::CP932, Util::GetCharacterSet(0x51EC));  // "凬"

  // Only in Unicode
  // U+FFE6 (full-width won sign)
  EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet(0xFFE6));
  // U+20BB7 is outside the BMP (b/4176888).
  EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet(0x20BB7));
}
1610
// Checks the string overload of Util::GetCharacterSet(); the expectations
// mirror the per-code-point checks in CharacterSet_gen_character_set.
//
// Fixes over the previous revision:
//  * the half-width katakana cases used full-width katakana, so JISX0201 was
//    expected for text that the very next line expected to be JISX0208;
//  * the two supplementary-plane JIS X 0213 cases had empty string literals;
//  * the Unicode-only won sign is U+FFE6 (as in
//    CharacterSet_gen_character_set), not U+20A9.
// Characters that tools tend to drop or fold are written as UTF-8 escapes.
TEST(UtilTest, CharacterSet) {
  EXPECT_EQ(Util::JISX0208, Util::GetCharacterSet("あいうえお"));
  EXPECT_EQ(Util::ASCII, Util::GetCharacterSet("abc"));
  EXPECT_EQ(Util::JISX0208, Util::GetCharacterSet("abcあいう"));

  // half width katakana
  EXPECT_EQ(Util::JISX0201, Util::GetCharacterSet("ｶﾀｶﾅ"));
  // Half-width katakana followed by full-width katakana.
  EXPECT_EQ(Util::JISX0208, Util::GetCharacterSet("ｶﾀｶﾅカタカナ"));

  // 0213
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("Ⅰ"));
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("①"));
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("㊤"));
  // U+20B9F from UCS4 range (b/4176888)
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("\xF0\xA0\xAE\x9F"));
  // U+2A6B2 from UCS4 range (b/4176888)
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("\xF0\xAA\x9A\xB2"));

  // only in CP932
  EXPECT_EQ(Util::CP932, Util::GetCharacterSet("凬"));

  // only in Unicode
  // U+FFE6 (full-width won sign)
  EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet("\xEF\xBF\xA6"));
  // U+20BB7 from UCS4 range (b/4176888)
  EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet("\xF0\xA0\xAE\xB7"));
}
1637
1638 #ifdef OS_WIN
// Util::WideCharsLen() is expected to count wide characters, so a character
// outside the BMP counts as two.
TEST(UtilTest, WideCharsLen) {
  // "a", U+20B9F (outside the BMP), "b"
  const string utf8 = "a\360\240\256\237b";
  EXPECT_EQ(4, Util::WideCharsLen(utf8));

  // Prefixes of 0, 1, 2 and 3 characters.
  const string empty_prefix = Util::SubString(utf8, 0, 0);
  const string one_char = Util::SubString(utf8, 0, 1);
  const string two_chars = Util::SubString(utf8, 0, 2);
  const string three_chars = Util::SubString(utf8, 0, 3);
  EXPECT_EQ(0, Util::WideCharsLen(empty_prefix));
  EXPECT_EQ(1, Util::WideCharsLen(one_char));
  EXPECT_EQ(3, Util::WideCharsLen(two_chars));
  EXPECT_EQ(4, Util::WideCharsLen(three_chars));
}
1648
// Round trip: UTF-8 -> wide string -> UTF-8 must give back the input.
TEST(UtilTest, UTF8ToWide) {
  const string original = "abc";

  std::wstring wide;
  Util::UTF8ToWide(original, &wide);

  string round_tripped;
  Util::WideToUTF8(wide, &round_tripped);
  EXPECT_EQ("abc", round_tripped);
}
1658
// A surrogate pair must be converted to a single 4-byte UTF-8 sequence and
// back to the same pair.
TEST(UtilTest, WideToUTF8_SurrogatePairSupport) {
  // Visual C++ 2008 cannot embed a surrogate pair in a string literal such as
  // L"\uD842\uDF9F", hence the explicit wchar_t array.
  // The pair below encodes U+20B9F.
  const wchar_t surrogate_pair[] = {0xD842, 0xDF9F, 0};

  string utf8;
  Util::WideToUTF8(surrogate_pair, &utf8);
  std::wstring wide_again;
  Util::UTF8ToWide(utf8, &wide_again);

  EXPECT_EQ("\360\240\256\237", utf8);
  EXPECT_EQ(surrogate_pair, wide_again);
}
1673 #endif // OS_WIN
1674
// Util::IsKanaSymbolContained() is expected to be true when a kana symbol
// appears anywhere in the string and false otherwise.
TEST(UtilTest, IsKanaSymbolContained) {
  const string full_stop("。");
  const string space(" ");

  // The symbol alone, after a space, and before a space.
  EXPECT_TRUE(Util::IsKanaSymbolContained(full_stop));
  EXPECT_TRUE(Util::IsKanaSymbolContained(space + full_stop));
  EXPECT_TRUE(Util::IsKanaSymbolContained(full_stop + space));

  // No symbol at all.
  EXPECT_FALSE(Util::IsKanaSymbolContained(space));
  EXPECT_FALSE(Util::IsKanaSymbolContained(""));
}
1684
// Setting the same seed again must make Util::Random() replay its sequence.
TEST(UtilTest, RandomSeedTest) {
  Util::SetRandomSeed(0);
  const int first_value = Util::Random(INT_MAX);
  const int second_value = Util::Random(INT_MAX);
  // Two consecutive draws are expected to differ.
  EXPECT_NE(first_value, second_value);

  // After reseeding, the first draw must match the earlier first draw.
  Util::SetRandomSeed(0);
  const int replayed_value = Util::Random(INT_MAX);
  EXPECT_EQ(first_value, replayed_value);
}
1695
// Util::SplitFirstChar32(text, &c, &rest) is expected to decode the first
// character of |text| into |c| and to store the remainder in |rest|.  It must
// return false, leaving |c| untouched, for empty or malformed input.
//
// |rest| and |c| are shared by all assertions below and are reset by hand
// before most calls, so the order of the statements matters.
TEST(UtilTest, SplitFirstChar32) {
  StringPiece rest;
  char32 c = 0;

  // Empty input: nothing to split.
  rest = StringPiece();
  c = 0;
  EXPECT_FALSE(Util::SplitFirstChar32("", &c, &rest));
  EXPECT_EQ(0, c);
  EXPECT_TRUE(rest.empty());

  // Allow nullptr to ignore the matched value.
  rest = StringPiece();
  EXPECT_TRUE(Util::SplitFirstChar32("01", nullptr, &rest));
  EXPECT_EQ("1", rest);

  // Allow nullptr to ignore the matched value.
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("01", &c, nullptr));
  EXPECT_EQ('0', c);

  // The cases below take the smallest and largest value of each encoded
  // length, from 1 byte up to 6 bytes, each followed by a single space.

  // 1 byte: 0x01 and 0x7F.
  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\x01 ", &c, &rest));
  EXPECT_EQ(1, c);
  EXPECT_EQ(" ", rest);

  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\x7F ", &c, &rest));
  EXPECT_EQ(0x7F, c);
  EXPECT_EQ(" ", rest);

  // 2 bytes: 0x80 and 0x7FF.
  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\xC2\x80 ", &c, &rest));
  EXPECT_EQ(0x80, c);
  EXPECT_EQ(" ", rest);

  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\xDF\xBF ", &c, &rest));
  EXPECT_EQ(0x7FF, c);
  EXPECT_EQ(" ", rest);

  // 3 bytes: 0x800 and 0xFFFF.
  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\xE0\xA0\x80 ", &c, &rest));
  EXPECT_EQ(0x800, c);
  EXPECT_EQ(" ", rest);

  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\xEF\xBF\xBF ", &c, &rest));
  EXPECT_EQ(0xFFFF, c);
  EXPECT_EQ(" ", rest);

  // 4 bytes: 0x10000 and 0x1FFFFF.
  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\xF0\x90\x80\x80 ", &c, &rest));
  EXPECT_EQ(0x10000, c);
  EXPECT_EQ(" ", rest);

  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\xF7\xBF\xBF\xBF ", &c, &rest));
  EXPECT_EQ(0x1FFFFF, c);
  EXPECT_EQ(" ", rest);

  // 5 bytes: 0x200000 and 0x3FFFFFF.
  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\xF8\x88\x80\x80\x80 ", &c, &rest));
  EXPECT_EQ(0x200000, c);
  EXPECT_EQ(" ", rest);

  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\xFB\xBF\xBF\xBF\xBF ", &c, &rest));
  EXPECT_EQ(0x3FFFFFF, c);
  EXPECT_EQ(" ", rest);

  // 6 bytes: 0x4000000 and 0x7FFFFFFF.
  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\xFC\x84\x80\x80\x80\x80 ", &c, &rest));
  EXPECT_EQ(0x4000000, c);
  EXPECT_EQ(" ", rest);

  rest = StringPiece();
  c = 0;
  EXPECT_TRUE(Util::SplitFirstChar32("\xFD\xBF\xBF\xBF\xBF\xBF ", &c, &rest));
  EXPECT_EQ(0x7FFFFFFF, c);
  EXPECT_EQ(" ", rest);

  // If there is any invalid sequence, the entire text should be treated as
  // an empty string.
  // Only |c| is reset and checked from here on; |rest| is not inspected.
  {
    // Lead byte of a 2-byte sequence without a valid trail byte.
    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xC2 ", &c, &rest));
    EXPECT_EQ(0, c);

    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xC2\xC2 ", &c, &rest));
    EXPECT_EQ(0, c);

    // Lead byte of a 3-byte sequence without valid trail bytes.
    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xE0 ", &c, &rest));
    EXPECT_EQ(0, c);

    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xE0\xE0\xE0 ", &c, &rest));
    EXPECT_EQ(0, c);

    // Lead byte of a 4-byte sequence without valid trail bytes.
    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xF0 ", &c, &rest));
    EXPECT_EQ(0, c);

    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xF0\xF0\xF0\xF0 ", &c, &rest));
    EXPECT_EQ(0, c);
  }

  // BOM should be treated as invalid byte.
  {
    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xFF ", &c, &rest));
    EXPECT_EQ(0, c);

    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xFE ", &c, &rest));
    EXPECT_EQ(0, c);
  }

  // Invalid sequence for U+002F (redundant encoding)
  // The same code point encoded with 2, 3 and 4 bytes must be rejected.
  {
    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xC0\xAF", &c, &rest));
    EXPECT_EQ(0, c);

    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xE0\x80\xAF", &c, &rest));
    EXPECT_EQ(0, c);

    c = 0;
    EXPECT_FALSE(Util::SplitFirstChar32("\xF0\x80\x80\xAF", &c, &rest));
    EXPECT_EQ(0, c);
  }
}
1842
// Exercises Util::SplitLastChar32(), which splits the last UTF-8 encoded
// character off a string and reports the text that precedes it.
TEST(UtilTest, SplitLastChar32) {
  StringPiece rest;
  char32 c = 0;

  // Nothing can be split off an empty string: the call fails, the character
  // is expected to be 0 and |rest| is expected to be empty.
  rest = StringPiece();
  c = 0;
  EXPECT_FALSE(Util::SplitLastChar32("", &rest, &c));
  EXPECT_EQ(0, c);
  EXPECT_TRUE(rest.empty());

  // nullptr may be passed for |rest| when only the last character matters.
  c = 0;
  EXPECT_TRUE(Util::SplitLastChar32("01", nullptr, &c));
  EXPECT_EQ('1', c);

  // nullptr may be passed for the character when only |rest| matters.
  rest = StringPiece();
  EXPECT_TRUE(Util::SplitLastChar32("01", &rest, nullptr));
  EXPECT_EQ("0", rest);

  // Well-formed sequences of one to six bytes, using code points at or near
  // the limits of each encoded length.  Every input is prefixed by a single
  // space, which must come back as |rest|.
  const struct {
    const char* input;
    char32 expected_char;
  } kValidCases[] = {
      // 1 byte
      {" \x01", 1},
      {" \x7F", 0x7F},
      // 2 bytes
      {" \xC2\x80", 0x80},
      {" \xDF\xBF", 0x7FF},
      // 3 bytes
      {" \xE0\xA0\x80", 0x800},
      {" \xEF\xBF\xBF", 0xFFFF},
      // 4 bytes
      {" \xF0\x90\x80\x80", 0x10000},
      {" \xF7\xBF\xBF\xBF", 0x1FFFFF},
      // 5 bytes
      {" \xF8\x88\x80\x80\x80", 0x200000},
      {" \xFB\xBF\xBF\xBF\xBF", 0x3FFFFFF},
      // 6 bytes
      {" \xFC\x84\x80\x80\x80\x80", 0x4000000},
      {" \xFD\xBF\xBF\xBF\xBF\xBF", 0x7FFFFFFF},
  };
  for (const auto& valid : kValidCases) {
    rest = StringPiece();
    c = 0;
    EXPECT_TRUE(Util::SplitLastChar32(valid.input, &rest, &c));
    EXPECT_EQ(valid.expected_char, c);
    EXPECT_EQ(" ", rest);
  }

  // Inputs that must be rejected.  For each of them the call is expected to
  // fail and to leave the character at 0, the same outcome as for an empty
  // string.  |rest| is deliberately neither reset nor inspected here.
  const char* const kInvalidInputs[] = {
      // Lead bytes with missing or malformed continuation bytes.
      " \xC2",
      " \xC2\xC2",
      " \xE0",
      " \xE0\xE0\xE0",
      " \xF0",
      " \xF0\xF0\xF0\xF0",
      // 0xFF and 0xFE (the bytes of a UTF-16 BOM) are invalid bytes.
      " \xFF",
      " \xFE",
      // Redundant (overlong) encodings of U+002F.
      "\xC0\xAF",
      "\xE0\x80\xAF",
      "\xF0\x80\x80\xAF",
  };
  for (const char* invalid : kInvalidInputs) {
    c = 0;
    EXPECT_FALSE(Util::SplitLastChar32(invalid, &rest, &c));
    EXPECT_EQ(0, c);
  }
}
1989
// Round-trips 64-bit values through Util::SerializeUint64() and
// Util::DeserializeUint64(), and checks that DeserializeUint64() rejects
// malformed input.
TEST(UtilTest, SerializeAndDeserializeUint64) {
  // |str| is the expected 8-byte serialized form of |value|, with the most
  // significant byte first.
  struct {
    const char* str;
    uint64 value;
  } kCorrectPairs[] = {
      {"\x00\x00\x00\x00\x00\x00\x00\x00", 0},
      {"\x00\x00\x00\x00\x00\x00\x00\xFF", kuint8max},
      {"\x00\x00\x00\x00\x00\x00\xFF\xFF", kuint16max},
      {"\x00\x00\x00\x00\xFF\xFF\xFF\xFF", kuint32max},
      {"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", kuint64max},
      {"\x01\x23\x45\x67\x89\xAB\xCD\xEF", 0x0123456789ABCDEF},
      {"\xFE\xDC\xBA\x98\x76\x54\x32\x10", 0xFEDCBA9876543210},
  };

  for (const auto& pair : kCorrectPairs) {
    // Several literals contain NUL bytes, so the length is given explicitly
    // instead of relying on NUL termination.
    const string serialized(pair.str, 8);
    EXPECT_EQ(serialized, Util::SerializeUint64(pair.value));

    // EXPECT_TRUE is non-fatal, so the comparison below still runs when
    // deserialization fails.  Initialize |v| so that it never reads an
    // indeterminate value in that case.
    uint64 v = 0;
    EXPECT_TRUE(Util::DeserializeUint64(serialized, &v));
    EXPECT_EQ(pair.value, v);
  }

  // Invalid patterns for DeserializeUint64: none of them is 8 bytes long.
  const char* kFalseCases[] = {
      "",
      "abc",
      "helloworld",
  };
  for (const char* false_case : kFalseCases) {
    uint64 v = 0;
    EXPECT_FALSE(Util::DeserializeUint64(false_case, &v));
  }
}
2024
2025 } // namespace mozc
2026