1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "base/util.h"
31 
32 #include <climits>
33 #include <cstdlib>
34 #include <cstring>
35 #include <map>
36 #include <sstream>
37 #include <string>
38 
39 #include "base/compiler_specific.h"
40 #include "base/file_stream.h"
41 #include "base/file_util.h"
42 #include "base/logging.h"
43 #include "base/number_util.h"
44 #include "base/port.h"
45 #include "testing/base/public/gunit.h"
46 #include "testing/base/public/mozctest.h"
47 
48 namespace mozc {
49 namespace {
50 
51 #ifndef OS_NACL
52 // Disabled on NaCl since it uses a mock file system.
FillTestCharacterSetMap(std::map<char32,Util::CharacterSet> * test_map)53 void FillTestCharacterSetMap(std::map<char32, Util::CharacterSet> *test_map) {
54   CHECK(test_map);
55   const string &path = testing::GetSourceFileOrDie({
56       "data", "test", "character_set", "character_set.tsv"});
57   std::map<string, Util::CharacterSet> character_set_type_map;
58   character_set_type_map["ASCII"] = Util::ASCII;
59   character_set_type_map["JISX0201"] = Util::JISX0201;
60   character_set_type_map["JISX0208"] = Util::JISX0208;
61   character_set_type_map["JISX0212"] = Util::JISX0212;
62   character_set_type_map["JISX0213"] = Util::JISX0213;
63   character_set_type_map["CP932"] = Util::CP932;
64   // UNICODE_ONLY should not appear in the tsv file though.
65   character_set_type_map["UNICODE_ONLY"] = Util::UNICODE_ONLY;
66 
67   InputFileStream finput(path.c_str());
68 
69   // Read tsv file.
70   string line;
71   while (!getline(finput, line).fail()) {
72     if (Util::StartsWith(line, "#")) {
73       // Skip comment line.
74       continue;
75     }
76 
77     std::vector<string> col;
78     mozc::Util::SplitStringUsing(line, "\t", &col);
79     CHECK_GE(col.size(), 2) << "format error: " << line;
80     const char32 ucs4 = NumberUtil::SimpleAtoi(col[0]);
81     std::map<string, Util::CharacterSet>::const_iterator itr =
82         character_set_type_map.find(col[1]);
83     // We cannot use CHECK_NE here because of overload resolution.
84     CHECK(character_set_type_map.end() != itr)
85         << "Unknown character set type: " << col[1];
86     test_map->insert(std::make_pair(ucs4, itr->second));
87   }
88 }
89 #endif  // !OS_NACL
90 
GetExpectedCharacterSet(const std::map<char32,Util::CharacterSet> & test_map,char32 ucs4)91 Util::CharacterSet GetExpectedCharacterSet(
92     const std::map<char32, Util::CharacterSet> &test_map,
93     char32 ucs4) {
94   std::map<char32, Util::CharacterSet>::const_iterator itr =
95       test_map.find(ucs4);
96   if (test_map.find(ucs4) == test_map.end()) {
97     // If the test data does not have an entry, it should be
98     // interpreted as |Util::UNICODE_ONLY|.
99     return Util::UNICODE_ONLY;
100   }
101   return itr->second;
102 }
103 
104 }  // namespace
105 
// Three strings joined with ":" yield a single string with the delimiter
// placed between neighbouring elements only.
TEST(UtilTest, JoinStrings) {
  const std::vector<string> parts = {"ab", "cdef", "ghr"};
  string joined;
  Util::JoinStrings(parts, ":", &joined);
  EXPECT_EQ("ab:cdef:ghr", joined);
}
115 
// Exercises Util::JoinStringPieces with one element, with several elements,
// and with a delimiter that is longer than one character.
TEST(UtilTest, JoinStringPieces) {
  // A single element: no delimiter appears in the result.
  {
    const std::vector<StringPiece> pieces = {"ab"};
    string joined;
    Util::JoinStringPieces(pieces, ":", &joined);
    EXPECT_EQ("ab", joined);
  }
  // Several elements with a one-character delimiter.
  {
    const std::vector<StringPiece> pieces = {"ab", "cdef", "ghr"};
    string joined;
    Util::JoinStringPieces(pieces, ":", &joined);
    EXPECT_EQ("ab:cdef:ghr", joined);
  }
  // Several elements with a two-character delimiter.
  {
    const std::vector<StringPiece> pieces = {"ab", "cdef", "ghr"};
    string joined;
    Util::JoinStringPieces(pieces, "::", &joined);
    EXPECT_EQ("ab::cdef::ghr", joined);
  }
}
143 
// Util::ConcatStrings stores the concatenation of its two inputs in the
// output string.  The same output object is reused for every call, so each
// expectation also shows that the previous contents were replaced.
TEST(UtilTest, ConcatStrings) {
  string joined;

  // Both operands empty.
  Util::ConcatStrings("", "", &joined);
  EXPECT_TRUE(joined.empty());

  // Only the left operand is non-empty.
  Util::ConcatStrings("ABC", "", &joined);
  EXPECT_EQ("ABC", joined);

  // Only the right operand is non-empty.
  Util::ConcatStrings("", "DEF", &joined);
  EXPECT_EQ("DEF", joined);

  // Both operands non-empty.
  Util::ConcatStrings("ABC", "DEF", &joined);
  EXPECT_EQ("ABCDEF", joined);
}
159 
// Util::AppendStringWithDelimiter appends a value to an existing string and
// puts the delimiter in front of it only when the target is non-empty.
TEST(UtilTest, AppendStringWithDelimiter) {
  string result;
  const char kDelimiter[] = ":";

  // Empty target: the value is stored without a leading delimiter.
  {
    result.clear();
    Util::AppendStringWithDelimiter(kDelimiter, "test", &result);
    EXPECT_EQ("test", result);
  }

  // Non-empty target: delimiter and value are appended.
  {
    result = "foo";
    Util::AppendStringWithDelimiter(kDelimiter, "test", &result);
    EXPECT_EQ("foo:test", result);
  }

  // Appending an empty value to a non-empty target still adds the delimiter.
  {
    result = "foo";
    Util::AppendStringWithDelimiter(kDelimiter, "", &result);
    EXPECT_EQ("foo:", result);
  }
}
183 
// SplitIterator<SingleDelimiter, SkipEmpty>: splits on one delimiter
// character and never yields empty pieces.
TEST(UtilTest, SplitIterator_SingleDelimiter_SkipEmpty) {
  typedef SplitIterator<SingleDelimiter, SkipEmpty> Iter;

  // Empty input given as a C string: nothing to iterate.
  {
    Iter it("", " ");
    EXPECT_TRUE(it.Done());
  }
  // Empty input given as a default-constructed StringPiece.
  {
    Iter it(StringPiece(), " ");
    EXPECT_TRUE(it.Done());
  }
  // Plain input with single delimiters.
  {
    const char *const kExpected[] = {"a", "b", "cde"};
    Iter it("a b cde", " ");
    for (size_t i = 0; i < arraysize(kExpected); ++i) {
      EXPECT_FALSE(it.Done());
      EXPECT_EQ(kExpected[i], it.Get());
      it.Next();
    }
    EXPECT_TRUE(it.Done());
  }
  // Leading, trailing and doubled delimiters produce no empty pieces.
  {
    const char *const kExpected[] = {"a", "b", "cde"};
    Iter it(" a b  cde ", " ");
    for (size_t i = 0; i < arraysize(kExpected); ++i) {
      EXPECT_FALSE(it.Done());
      EXPECT_EQ(kExpected[i], it.Get());
      it.Next();
    }
    EXPECT_TRUE(it.Done());
  }
  // Only the first five bytes ("a b  ") are visible through the StringPiece.
  {
    const char *const kExpected[] = {"a", "b"};
    Iter it(StringPiece("a b  cde ", 5), " ");
    for (size_t i = 0; i < arraysize(kExpected); ++i) {
      EXPECT_FALSE(it.Done());
      EXPECT_EQ(kExpected[i], it.Get());
      it.Next();
    }
    EXPECT_TRUE(it.Done());
  }
}
234 
// SplitIterator<MultiDelimiter, SkipEmpty>: every character of the delimiter
// string separates pieces, and empty pieces are never yielded.
TEST(UtilTest, SplitIterator_MultiDelimiter_SkipEmpty) {
  typedef SplitIterator<MultiDelimiter, SkipEmpty> Iter;

  // Empty input given as a C string.
  {
    Iter it("", " \t,");
    EXPECT_TRUE(it.Done());
  }
  // Empty input given as a default-constructed StringPiece.
  {
    Iter it(StringPiece(), ",.");
    EXPECT_TRUE(it.Done());
  }
  // One delimiter of each kind between the pieces.
  {
    Iter it("a b\tcde:fg", " \t:");

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("a", it.Get());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("b", it.Get());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("cde", it.Get());
    // Done() is checked again after Get() and before advancing.
    EXPECT_FALSE(it.Done());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("fg", it.Get());
    it.Next();

    EXPECT_TRUE(it.Done());
  }
  // Runs of delimiters at the start, in the middle and at the end.
  {
    Iter it("  \t:a b\t\tcde:fg:", " \t:");

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("a", it.Get());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("b", it.Get());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("cde", it.Get());
    // Done() is checked again after Get() and before advancing.
    EXPECT_FALSE(it.Done());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("fg", it.Get());
    it.Next();

    EXPECT_TRUE(it.Done());
  }
}
282 
TEST(UtilTest,SplitIterator_SingleDelimiter_AllowEmpty)283 TEST(UtilTest, SplitIterator_SingleDelimiter_AllowEmpty) {
284   typedef SplitIterator<SingleDelimiter, AllowEmpty> SplitIterator;
285   {
286     SplitIterator iter("", " ");
287     EXPECT_TRUE(iter.Done());
288   }
289   {
290     SplitIterator iter(StringPiece(), " ");
291     EXPECT_TRUE(iter.Done());
292   }
293   {
294     const char *s = "a b cde";
295     SplitIterator iter(s, " ");
296     EXPECT_FALSE(iter.Done());
297     EXPECT_EQ("a", iter.Get());
298     iter.Next();
299     EXPECT_FALSE(iter.Done());
300     EXPECT_EQ("b", iter.Get());
301     iter.Next();
302     EXPECT_FALSE(iter.Done());
303     EXPECT_EQ("cde", iter.Get());
304     iter.Next();
305     EXPECT_TRUE(iter.Done());
306   }
307   {
308     const char *s = " a b  cde ";
309     SplitIterator iter(s, " ");
310     EXPECT_FALSE(iter.Done());
311     EXPECT_EQ("", iter.Get());
312     iter.Next();
313     EXPECT_FALSE(iter.Done());
314     EXPECT_EQ("a", iter.Get());
315     iter.Next();
316     EXPECT_FALSE(iter.Done());
317     EXPECT_EQ("b", iter.Get());
318     iter.Next();
319     EXPECT_FALSE(iter.Done());
320     EXPECT_EQ("", iter.Get());
321     iter.Next();
322     EXPECT_FALSE(iter.Done());
323     EXPECT_EQ("cde", iter.Get());
324     iter.Next();
325     EXPECT_FALSE(iter.Done());
326     EXPECT_EQ("", iter.Get());
327     iter.Next();
328     EXPECT_TRUE(iter.Done());
329   }
330   {
331     StringPiece s("a b  cde ", 5);
332     SplitIterator iter(s, " ");
333     EXPECT_FALSE(iter.Done());
334     EXPECT_EQ("a", iter.Get());
335     iter.Next();
336     EXPECT_FALSE(iter.Done());
337     EXPECT_EQ("b", iter.Get());
338     iter.Next();
339     EXPECT_FALSE(iter.Done());
340     EXPECT_EQ("", iter.Get());
341     iter.Next();
342     EXPECT_FALSE(iter.Done());
343     EXPECT_EQ("", iter.Get());
344     iter.Next();
345     EXPECT_TRUE(iter.Done());
346   }
347 }
348 
// SplitIterator<MultiDelimiter, AllowEmpty>: every character of the delimiter
// string separates pieces, and adjacent or trailing delimiters yield empty
// pieces.
TEST(UtilTest, SplitIterator_MultiDelimiter_AllowEmpty) {
  typedef SplitIterator<MultiDelimiter, AllowEmpty> Iter;

  // Empty input given as a C string.
  {
    Iter it("", " \t,");
    EXPECT_TRUE(it.Done());
  }
  // Empty input given as a default-constructed StringPiece.
  {
    Iter it(StringPiece(), ",.");
    EXPECT_TRUE(it.Done());
  }
  // One delimiter of each kind between the pieces: no empty piece appears.
  {
    Iter it("a b\tcde:fg", " \t:");

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("a", it.Get());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("b", it.Get());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("cde", it.Get());
    // Done() is checked again after Get() and before advancing.
    EXPECT_FALSE(it.Done());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("fg", it.Get());
    it.Next();

    EXPECT_TRUE(it.Done());
  }
  // A doubled tab and a trailing ':' each produce one empty piece.
  {
    Iter it("a b\t\tcde:fg:", " \t:");

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("a", it.Get());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("b", it.Get());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("", it.Get());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("cde", it.Get());
    // Done() is checked again after Get() and before advancing.
    EXPECT_FALSE(it.Done());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("fg", it.Get());
    it.Next();

    EXPECT_FALSE(it.Done());
    EXPECT_EQ("", it.Get());
    it.Next();

    EXPECT_TRUE(it.Done());
  }
}
402 
// Util::SplitStringUsing splits on any character of the delimiter string and
// drops empty pieces.
//
// The element count is verified with ASSERT_EQ so that a wrong count stops
// the test instead of indexing past the end of |output|.
TEST(UtilTest, SplitStringUsing) {
  // A doubled delimiter in the middle produces no empty piece.
  {
    const string input = "a b  c def";
    std::vector<string> output;
    Util::SplitStringUsing(input, " ", &output);
    ASSERT_EQ(4, output.size());
    EXPECT_EQ("a", output[0]);
    EXPECT_EQ("b", output[1]);
    EXPECT_EQ("c", output[2]);
    EXPECT_EQ("def", output[3]);
  }
  // A leading delimiter is ignored.
  {
    const string input = " a b  c";
    std::vector<string> output;
    Util::SplitStringUsing(input, " ", &output);
    ASSERT_EQ(3, output.size());
    EXPECT_EQ("a", output[0]);
    EXPECT_EQ("b", output[1]);
    EXPECT_EQ("c", output[2]);
  }
  // A trailing delimiter is ignored.
  {
    const string input = "a b  c ";
    std::vector<string> output;
    Util::SplitStringUsing(input, " ", &output);
    ASSERT_EQ(3, output.size());
    EXPECT_EQ("a", output[0]);
    EXPECT_EQ("b", output[1]);
    EXPECT_EQ("c", output[2]);
  }
  // Each character of ": " acts as a delimiter.
  {
    const string input = "a:b  cd ";
    std::vector<string> output;
    Util::SplitStringUsing(input, ": ", &output);
    ASSERT_EQ(3, output.size());
    EXPECT_EQ("a", output[0]);
    EXPECT_EQ("b", output[1]);
    EXPECT_EQ("cd", output[2]);
  }
  // An empty delimiter string leaves the input as a single piece.
  {
    const string input = "Empty delimiter";
    std::vector<string> output;
    Util::SplitStringUsing(input, "", &output);
    ASSERT_EQ(1, output.size());
    EXPECT_EQ(input, output[0]);
  }
}
449 
// Util::SplitStringAllowEmpty splits on any character of the delimiter string
// and keeps the empty pieces created by adjacent, leading or trailing
// delimiters.
//
// The element count is verified with ASSERT_EQ so that a wrong count stops
// the test instead of indexing past the end of |output|.
TEST(UtilTest, SplitStringAllowEmpty) {
  // A doubled delimiter in the middle yields one empty piece.
  {
    const string input = "a b  c def";
    std::vector<string> output;
    Util::SplitStringAllowEmpty(input, " ", &output);
    ASSERT_EQ(5, output.size());
    EXPECT_EQ("a", output[0]);
    EXPECT_EQ("b", output[1]);
    EXPECT_EQ("", output[2]);
    EXPECT_EQ("c", output[3]);
    EXPECT_EQ("def", output[4]);
  }
  // A leading delimiter yields an empty first piece.
  {
    const string input = " a b  c";
    std::vector<string> output;
    Util::SplitStringAllowEmpty(input, " ", &output);
    ASSERT_EQ(5, output.size());
    EXPECT_EQ("", output[0]);
    EXPECT_EQ("a", output[1]);
    EXPECT_EQ("b", output[2]);
    EXPECT_EQ("", output[3]);
    EXPECT_EQ("c", output[4]);
  }
  // A trailing delimiter yields an empty last piece.
  {
    const string input = "a b  c ";
    std::vector<string> output;
    Util::SplitStringAllowEmpty(input, " ", &output);
    ASSERT_EQ(5, output.size());
    EXPECT_EQ("a", output[0]);
    EXPECT_EQ("b", output[1]);
    EXPECT_EQ("", output[2]);
    EXPECT_EQ("c", output[3]);
    EXPECT_EQ("", output[4]);
  }
  // Each character of ": " acts as a delimiter.
  {
    const string input = "a:b  c ";
    std::vector<string> output;
    Util::SplitStringAllowEmpty(input, ": ", &output);
    ASSERT_EQ(5, output.size());
    EXPECT_EQ("a", output[0]);
    EXPECT_EQ("b", output[1]);
    EXPECT_EQ("", output[2]);
    EXPECT_EQ("c", output[3]);
    EXPECT_EQ("", output[4]);
  }
  // An empty delimiter string leaves the input as a single piece.
  {
    const string input = "Empty delimiter";
    std::vector<string> output;
    Util::SplitStringAllowEmpty(input, "", &output);
    ASSERT_EQ(1, output.size());
    EXPECT_EQ(input, output[0]);
  }
}
503 
// Table-driven check of Util::StripWhiteSpaces: whitespace is removed from
// both ends of the input while whitespace between words is kept.
TEST(UtilTest, StripWhiteSpaces) {
  struct TestCase {
    const char *input;
    const char *expected;
  };
  const TestCase kCases[] = {
      // Basic scenario.
      {"  foo   ", "foo"},
      // No surrounding space: the input is copied unchanged.
      {"foo", "foo"},
      // Tabs and line breaks also count as spaces.
      {" \tfoo\n", "foo"},
      // Spaces in the middle remain.
      {" foo bar baz ", "foo bar baz"},
      // Input made only of spaces results in an empty output.
      {" \v \r ", ""},
      // Empty input.
      {"", ""},
      // A single character.
      {"a", "a"},
  };

  for (const TestCase &test_case : kCases) {
    const string input = test_case.input;
    string output;
    Util::StripWhiteSpaces(input, &output);
    EXPECT_EQ(test_case.expected, output);
  }
}
561 
// Util::SplitStringToUtf8Chars breaks a UTF-8 string into one string per
// character, for one-byte and multi-byte characters alike.
TEST(UtilTest, SplitStringToUtf8Chars) {
  // Empty input produces no characters.
  {
    std::vector<string> output;
    Util::SplitStringToUtf8Chars("", &output);
    EXPECT_EQ(0, output.size());
  }

  // Concatenate the sample characters and expect to get them back one by one.
  {
    const string kInputs[] = {
        "a", "あ", "亜", "\n", "a",
    };
    string joined_string;
    for (const string &character : kInputs) {
      joined_string += character;
    }

    std::vector<string> output;
    Util::SplitStringToUtf8Chars(joined_string, &output);
    // Fatal on mismatch: the loop below indexes |kInputs| with |output|'s
    // indices, so the sizes must agree before it runs.
    ASSERT_EQ(arraysize(kInputs), output.size());

    for (size_t i = 0; i < output.size(); ++i) {
      EXPECT_EQ(kInputs[i], output[i]);
    }
  }
}
587 
// Util::SplitCSV splits one CSV record into fields, honouring double-quoted
// fields and doubled quotes inside them.
//
// gtest assertions are used instead of CHECK_EQ so that a failure is reported
// as a test failure rather than aborting the whole test binary.  Sizes are
// verified with ASSERT_EQ because the following lines index into the vector.
TEST(UtilTest, SplitCSV) {
  std::vector<string> answer_vector;

  // Quoted field containing a comma, and a doubled quote inside quotes.
  Util::SplitCSV(
      "Google,x,\"Buchheit, Paul\",\"string with \"\" quote in it\"",
      &answer_vector);
  ASSERT_EQ(4, answer_vector.size());
  EXPECT_EQ("Google", answer_vector[0]);
  EXPECT_EQ("x", answer_vector[1]);
  EXPECT_EQ("Buchheit, Paul", answer_vector[2]);
  EXPECT_EQ("string with \" quote in it", answer_vector[3]);

  // A trailing comma yields a trailing empty field.
  Util::SplitCSV("Google,hello,",  &answer_vector);
  ASSERT_EQ(3, answer_vector.size());
  EXPECT_EQ("Google", answer_vector[0]);
  EXPECT_EQ("hello", answer_vector[1]);
  EXPECT_EQ("", answer_vector[2]);

  // Spaces inside a field are preserved.
  Util::SplitCSV("Google rocks,hello", &answer_vector);
  ASSERT_EQ(2, answer_vector.size());
  EXPECT_EQ("Google rocks", answer_vector[0]);
  EXPECT_EQ("hello", answer_vector[1]);

  // Empty fields, including an empty quoted one.
  Util::SplitCSV(",,\"\",,", &answer_vector);
  ASSERT_EQ(5, answer_vector.size());
  EXPECT_EQ("", answer_vector[0]);
  EXPECT_EQ("", answer_vector[1]);
  EXPECT_EQ("", answer_vector[2]);
  EXPECT_EQ("", answer_vector[3]);
  EXPECT_EQ("", answer_vector[4]);

  // Test a string containing a comma.
  Util::SplitCSV("\",\",hello", &answer_vector);
  ASSERT_EQ(2, answer_vector.size());
  EXPECT_EQ(",", answer_vector[0]);
  EXPECT_EQ("hello", answer_vector[1]);

  // Invalid CSV: the closing quote is missing.
  Util::SplitCSV("\"no,last,quote", &answer_vector);
  ASSERT_EQ(1, answer_vector.size());
  EXPECT_EQ("no,last,quote", answer_vector[0]);

  // A backslash is an ordinary character, not an escape.
  Util::SplitCSV("backslash\\,is,no,an,\"escape\"", &answer_vector);
  ASSERT_EQ(5, answer_vector.size());
  EXPECT_EQ("backslash\\", answer_vector[0]);
  EXPECT_EQ("is", answer_vector[1]);
  EXPECT_EQ("no", answer_vector[2]);
  EXPECT_EQ("an", answer_vector[3]);
  EXPECT_EQ("escape", answer_vector[4]);

  // Empty input yields no fields.
  Util::SplitCSV("", &answer_vector);
  EXPECT_EQ(0, answer_vector.size());
}
641 
// Util::StringReplace with the boolean flag set replaces every occurrence of
// the pattern; with the flag cleared it replaces only the first one.
TEST(UtilTest, ReplaceString) {
  const string source = "foobarfoobar";
  string replaced;

  // Flag true: both "bar" occurrences become "buz".
  Util::StringReplace(source, "bar", "buz", true, &replaced);
  EXPECT_EQ("foobuzfoobuz", replaced);

  // Flag false: only the first "bar" becomes "buz".
  replaced.clear();
  Util::StringReplace(source, "bar", "buz", false, &replaced);
  EXPECT_EQ("foobuzfoobar", replaced);
}
652 
// Util::LowerString converts ASCII letters to lower case in place.
TEST(UtilTest, LowerString) {
  // Letters only.
  string letters = "TeSTtest";
  Util::LowerString(&letters);
  EXPECT_EQ("testtest", letters);

  // '@', '[', '`' and '{' are the ASCII characters directly around the two
  // letter ranges; they must come through unchanged.
  string with_boundaries = "TeST@ABCXYZ[`abcxyz{";
  Util::LowerString(&with_boundaries);
  EXPECT_EQ("test@abcxyz[`abcxyz{", with_boundaries);
}
662 
// Util::UpperString converts ASCII letters to upper case in place.
TEST(UtilTest, UpperString) {
  // Letters only.
  string letters = "TeSTtest";
  Util::UpperString(&letters);
  EXPECT_EQ("TESTTEST", letters);

  // '@', '[', '`' and '{' are the ASCII characters directly around the two
  // letter ranges; they must come through unchanged.
  string with_boundaries = "TeST@ABCXYZ[`abcxyz{";
  Util::UpperString(&with_boundaries);
  EXPECT_EQ("TEST@ABCXYZ[`ABCXYZ{", with_boundaries);
}
672 
// Util::CapitalizeString upper-cases the first character and lower-cases the
// remaining ASCII letters in place.
TEST(UtilTest, CapitalizeString) {
  // Letters only.
  string letters = "TeSTtest";
  Util::CapitalizeString(&letters);
  EXPECT_EQ("Testtest", letters);

  // '@', '[', '`' and '{' are the ASCII characters directly around the two
  // letter ranges; they must come through unchanged.
  string with_boundaries = "TeST@ABCXYZ[`abcxyz{";
  Util::CapitalizeString(&with_boundaries);
  EXPECT_EQ("Test@abcxyz[`abcxyz{", with_boundaries);
}
682 
// Util::IsLowerAscii is true for the empty string and for all-lowercase
// words; an uppercase letter or a symbol anywhere makes it false.
TEST(UtilTest, IsLowerAscii) {
  EXPECT_TRUE(Util::IsLowerAscii(""));
  EXPECT_TRUE(Util::IsLowerAscii("hello"));
  EXPECT_FALSE(Util::IsLowerAscii("HELLO"));
  EXPECT_FALSE(Util::IsLowerAscii("Hello"));
  EXPECT_FALSE(Util::IsLowerAscii("HeLlO"));
  EXPECT_FALSE(Util::IsLowerAscii("symbol!"));
}
692 
// Util::IsUpperAscii is true for the empty string and for all-uppercase
// words; a lowercase letter or a symbol anywhere makes it false.
TEST(UtilTest, IsUpperAscii) {
  EXPECT_TRUE(Util::IsUpperAscii(""));
  EXPECT_FALSE(Util::IsUpperAscii("hello"));
  EXPECT_TRUE(Util::IsUpperAscii("HELLO"));
  EXPECT_FALSE(Util::IsUpperAscii("Hello"));
  EXPECT_FALSE(Util::IsUpperAscii("HeLlO"));
  EXPECT_FALSE(Util::IsUpperAscii("symbol!"));
}
702 
// Util::IsCapitalizedAscii is true for the empty string and for a word whose
// first letter is uppercase and whose remaining letters are lowercase.
TEST(UtilTest, IsCapitalizedAscii) {
  EXPECT_TRUE(Util::IsCapitalizedAscii(""));
  EXPECT_FALSE(Util::IsCapitalizedAscii("hello"));
  EXPECT_FALSE(Util::IsCapitalizedAscii("HELLO"));
  EXPECT_TRUE(Util::IsCapitalizedAscii("Hello"));
  EXPECT_FALSE(Util::IsCapitalizedAscii("HeLlO"));
  EXPECT_FALSE(Util::IsCapitalizedAscii("symbol!"));
}
712 
// Util::IsLowerOrUpperAscii is true for the empty string and for words that
// are entirely lowercase or entirely uppercase; mixed case and symbols make
// it false.
TEST(UtilTest, IsLowerOrUpperAscii) {
  EXPECT_TRUE(Util::IsLowerOrUpperAscii(""));
  EXPECT_TRUE(Util::IsLowerOrUpperAscii("hello"));
  EXPECT_TRUE(Util::IsLowerOrUpperAscii("HELLO"));
  EXPECT_FALSE(Util::IsLowerOrUpperAscii("Hello"));
  EXPECT_FALSE(Util::IsLowerOrUpperAscii("HeLlO"));
  EXPECT_FALSE(Util::IsLowerOrUpperAscii("symbol!"));
}
722 
// Util::IsUpperOrCapitalizedAscii is true for the empty string, for
// all-uppercase words and for capitalized words; all-lowercase words, other
// case mixes and symbols make it false.
TEST(UtilTest, IsUpperOrCapitalizedAscii) {
  EXPECT_TRUE(Util::IsUpperOrCapitalizedAscii(""));
  EXPECT_FALSE(Util::IsUpperOrCapitalizedAscii("hello"));
  EXPECT_TRUE(Util::IsUpperOrCapitalizedAscii("HELLO"));
  EXPECT_TRUE(Util::IsUpperOrCapitalizedAscii("Hello"));
  EXPECT_FALSE(Util::IsUpperOrCapitalizedAscii("HeLlO"));
  EXPECT_FALSE(Util::IsUpperOrCapitalizedAscii("symbol!"));
}
732 
VerifyUTF8ToUCS4(const string & text,char32 expected_ucs4,size_t expected_len)733 void VerifyUTF8ToUCS4(const string &text, char32 expected_ucs4,
734                       size_t expected_len) {
735   const char *begin = text.data();
736   const char *end = begin + text.size();
737   size_t mblen = 0;
738   char32 result = Util::UTF8ToUCS4(begin, end, &mblen);
739   EXPECT_EQ(expected_ucs4, result) << text << " " << expected_ucs4;
740   EXPECT_EQ(expected_len, mblen) << text << " " << expected_len;
741 }
742 
// Checks Util::UTF8ToUCS4 at the smallest and largest value of each encoded
// length from one to four bytes, plus the empty string.
TEST(UtilTest, UTF8ToUCS4) {
  struct TestCase {
    const char *utf8;
    char32 ucs4;
    size_t length;
  };
  const TestCase kCases[] = {
      // Empty input.
      {"", 0, 0},
      // One byte.
      {"\x01", 1, 1},
      {"\x7F", 0x7F, 1},
      // Two bytes.
      {"\xC2\x80", 0x80, 2},
      {"\xDF\xBF", 0x7FF, 2},
      // Three bytes.
      {"\xE0\xA0\x80", 0x800, 3},
      {"\xEF\xBF\xBF", 0xFFFF, 3},
      // Four bytes.
      {"\xF0\x90\x80\x80", 0x10000, 4},
      {"\xF7\xBF\xBF\xBF", 0x1FFFFF, 4},
      // 5- and 6-byte sequences are not tested because they are outside the
      // UTF-8 specification.
  };

  for (const TestCase &test_case : kCases) {
    VerifyUTF8ToUCS4(test_case.utf8, test_case.ucs4, test_case.length);
  }
}
755 
// Checks both overloads of Util::UCS4ToUTF8 (std::string output and raw
// buffer output) at the boundaries of each encoded length.
TEST(UtilTest, UCS4ToUTF8) {
  struct TestCase {
    char32 ucs4;
    const char *utf8;
    size_t length;
  };
  const TestCase kCases[] = {
      {0x7F, "\x7F", 1},
      {0x80, "\xC2\x80", 2},
      {0x7FF, "\xDF\xBF", 2},
      {0x800, "\xE0\xA0\x80", 3},
      {0xFFFF, "\xEF\xBF\xBF", 3},
      {0x10000, "\xF0\x90\x80\x80", 4},
      {0x1FFFFF, "\xF7\xBF\xBF\xBF", 4},
  };

  // String version.
  string encoded;

  // NUL produces nothing.  The previous implementation of UCS4ToUTF8 behaved
  // this way, although the reason is unclear.
  Util::UCS4ToUTF8(0, &encoded);
  EXPECT_TRUE(encoded.empty());

  for (const TestCase &test_case : kCases) {
    Util::UCS4ToUTF8(test_case.ucs4, &encoded);
    EXPECT_EQ(test_case.utf8, encoded);
  }

  // Buffer version: the return value is compared with the encoded length and
  // the buffer is compared as a C string.
  char buf[7];

  EXPECT_EQ(0, Util::UCS4ToUTF8(0, buf));
  EXPECT_EQ(0, strcmp(buf, ""));

  for (const TestCase &test_case : kCases) {
    EXPECT_EQ(test_case.length, Util::UCS4ToUTF8(test_case.ucs4, buf));
    EXPECT_EQ(0, strcmp(test_case.utf8, buf));
  }
}
806 
// Util::CharsLen counts characters, not bytes: the nine-character Japanese
// sentence below must be reported as length 9.
TEST(UtilTest, CharsLen) {
  const string src = "私の名前は中野です";
  // Expected value first, matching the argument order used by the other
  // tests in this file.
  EXPECT_EQ(9, Util::CharsLen(src.c_str(), src.size()));
}
811 
// Util::SubStringPiece(src, start, length) addresses |src| in characters
// rather than bytes.  After every call the piece's data pointer is also
// required to be no lower than the start of |src|.
TEST(UtilTest, SubStringPiece) {
  const string src = "私の名前は中野です";
  const char *const src_begin = src.data();
  StringPiece piece;

  // First two characters.
  piece = Util::SubStringPiece(src, 0, 2);
  EXPECT_EQ("私の", piece);
  // |piece|'s data should point to the same memory block as src.
  EXPECT_LE(src_begin, piece.data());

  // A single character in the middle.
  piece = Util::SubStringPiece(src, 4, 1);
  EXPECT_EQ("は", piece);
  EXPECT_LE(src_begin, piece.data());

  // Three characters in the middle.
  piece = Util::SubStringPiece(src, 5, 3);
  EXPECT_EQ("中野で", piece);
  EXPECT_LE(src_begin, piece.data());

  // A length that runs past the end is cut at the end of |src|.
  piece = Util::SubStringPiece(src, 6, 10);
  EXPECT_EQ("野です", piece);
  EXPECT_LE(src_begin, piece.data());

  // Two characters in the middle.
  piece = Util::SubStringPiece(src, 4, 2);
  EXPECT_EQ("は中", piece);
  EXPECT_LE(src_begin, piece.data());

  // string::npos as length means "up to the end".
  piece = Util::SubStringPiece(src, 2, string::npos);
  EXPECT_EQ("名前は中野です", piece);
  EXPECT_LE(src_begin, piece.data());

  piece = Util::SubStringPiece(src, 5, string::npos);
  EXPECT_EQ("中野です", piece);
  EXPECT_LE(src_begin, piece.data());
}
845 
// Two-argument Util::SubStringPiece(src, start): returns everything from the
// given character position to the end of |src|.
TEST(UtilTest, SubStringPiece2) {
  const string src = "私はGoogleです";
  StringPiece tail;

  // Position 0 returns the whole string.
  tail = Util::SubStringPiece(src, 0);
  EXPECT_EQ(src, tail);

  // A position in the middle, counted in characters.
  tail = Util::SubStringPiece(src, 5);
  EXPECT_EQ("gleです", tail);

  // A position equal to the number of characters (10) returns nothing.
  tail = Util::SubStringPiece(src, 10);
  EXPECT_TRUE(tail.empty());

  // A position beyond the end also returns nothing.
  tail = Util::SubStringPiece(src, 13);
  EXPECT_TRUE(tail.empty());
}
863 
// Util::SubString(src, start, length, &result) addresses |src| in characters
// rather than bytes and stores the selected range in |result|.
//
// Expectations are written as EXPECT_EQ(expected, actual), matching the
// argument order used by the other tests in this file.
TEST(UtilTest, SubString) {
  const string src = "私の名前は中野です";
  string result;

  // First two characters.
  result.clear();
  Util::SubString(src, 0, 2, &result);
  EXPECT_EQ("私の", result);

  // A single character in the middle.
  result.clear();
  Util::SubString(src, 4, 1, &result);
  EXPECT_EQ("は", result);

  // Three characters in the middle.
  result.clear();
  Util::SubString(src, 5, 3, &result);
  EXPECT_EQ("中野で", result);

  // A length that runs past the end is cut at the end of |src|.
  result.clear();
  Util::SubString(src, 6, 10, &result);
  EXPECT_EQ("野です", result);

  // Two characters in the middle.
  result.clear();
  Util::SubString(src, 4, 2, &result);
  EXPECT_EQ("は中", result);

  // string::npos as length means "up to the end".
  result.clear();
  Util::SubString(src, 2, string::npos, &result);
  EXPECT_EQ("名前は中野です", result);

  result.clear();
  Util::SubString(src, 5, string::npos, &result);
  EXPECT_EQ("中野です", result);

  // |result| is deliberately not cleared here: Util::SubString must replace
  // the previous contents rather than append to them.
  Util::SubString(src, 5, string::npos, &result);
  EXPECT_EQ("中野です", result);
}
900 
// Util::StartsWith: the empty string and every leading part of the text
// (including the text itself) are prefixes; longer or unrelated strings are
// not.
TEST(UtilTest, StartsWith) {
  const string text = "abcdefg";

  // Prefixes.
  EXPECT_TRUE(Util::StartsWith(text, ""));
  EXPECT_TRUE(Util::StartsWith(text, "a"));
  EXPECT_TRUE(Util::StartsWith(text, "abc"));
  EXPECT_TRUE(Util::StartsWith(text, "abcdefg"));

  // Not prefixes: longer than the text, or different content.
  EXPECT_FALSE(Util::StartsWith(text, "abcdefghi"));
  EXPECT_FALSE(Util::StartsWith(text, "foobar"));
}
910 
// Util::EndsWith: the empty string and every trailing part of the text
// (including the text itself) are suffixes; longer or unrelated strings are
// not.
TEST(UtilTest, EndsWith) {
  const string text = "abcdefg";

  // Suffixes.
  EXPECT_TRUE(Util::EndsWith(text, ""));
  EXPECT_TRUE(Util::EndsWith(text, "g"));
  EXPECT_TRUE(Util::EndsWith(text, "fg"));
  EXPECT_TRUE(Util::EndsWith(text, "abcdefg"));

  // Not suffixes: longer than the text, or different content.
  EXPECT_FALSE(Util::EndsWith(text, "aaabcdefg"));
  EXPECT_FALSE(Util::EndsWith(text, "foobar"));
  EXPECT_FALSE(Util::EndsWith(text, "foobarbuzbuz"));
}
921 
// Table-driven check of Util::StripUTF8BOM: only a complete BOM
// ("\xef\xbb\xbf") at the very beginning of the string is removed.
TEST(UtilTest, StripUTF8BOM) {
  struct TestCase {
    const char *input;
    const char *expected;
  };
  // The BOM literals are kept separate from the following text so that the
  // hex escapes do not absorb the letters that follow them.
  const TestCase kCases[] = {
      // Should be stripped.
      {"\xef\xbb\xbf" "abc", "abc"},
      // Should be stripped, leaving an empty string.
      {"\xef\xbb\xbf", ""},
      // BOM in the middle of text. Shouldn't be stripped.
      {"a" "\xef\xbb\xbf" "bc", "a" "\xef\xbb\xbf" "bc"},
      // Incomplete BOM. Shouldn't be stripped.
      {"\xef\xbb" "abc", "\xef\xbb" "abc"},
      // String shorter than the BOM. Do nothing.
      {"a", "a"},
      // Empty string. Do nothing.
      {"", ""},
  };

  for (const TestCase &test_case : kCases) {
    string line = test_case.input;
    Util::StripUTF8BOM(&line);
    EXPECT_EQ(test_case.expected, line);
  }
}
955 
// Util::IsUTF16BOM accepts a string that begins with either byte order of the
// UTF-16 BOM ("\xfe\xff" or "\xff\xfe") and rejects everything else.
TEST(UtilTest, IsUTF16BOM) {
  // Either byte order, alone or followed by more text.
  EXPECT_TRUE(Util::IsUTF16BOM("\xfe\xff"));
  EXPECT_TRUE(Util::IsUTF16BOM("\xff\xfe"));
  EXPECT_TRUE(Util::IsUTF16BOM("\xfe\xff "));
  EXPECT_TRUE(Util::IsUTF16BOM("\xff\xfe "));

  // Empty or ordinary text.
  EXPECT_FALSE(Util::IsUTF16BOM(""));
  EXPECT_FALSE(Util::IsUTF16BOM("abc"));

  // BOM bytes that are not at the beginning.
  EXPECT_FALSE(Util::IsUTF16BOM(" \xfe\xff"));
  EXPECT_FALSE(Util::IsUTF16BOM(" \xff\xfe"));

  // Two bytes that merely resemble a BOM.
  EXPECT_FALSE(Util::IsUTF16BOM("\xff\xff"));
}
967 
// Checks Util::IsAndroidPuaEmoji() for non-emoji text, for single code points
// on both sides of the values the test treats as the first (U+FE000) and last
// (U+FEEA0) emoji, and for a string holding more than one code point.
TEST(UtilTest, IsAndroidPuaEmoji) {
  const char *const kPlainText[] = {"", "A", "a"};
  for (const char *text : kPlainText) {
    EXPECT_FALSE(Util::IsAndroidPuaEmoji(text));
  }

  const struct {
    char32 code_point;
    bool is_emoji;
  } kSingleCodePoints[] = {
      {0xFDFFF, false},
      {0xFE000, true},
      {0xFE800, true},
      {0xFEEA0, true},
      {0xFEEA1, false},
  };
  string utf8;
  for (const auto &entry : kSingleCodePoints) {
    Util::UCS4ToUTF8(entry.code_point, &utf8);
    EXPECT_EQ(entry.is_emoji, Util::IsAndroidPuaEmoji(utf8))
        << "code point: " << entry.code_point;
  }

  // Two (or more) code points are never reported as an emoji, even when each
  // of them would be accepted on its own.
  Util::UCS4ToUTF8(0xFE000, &utf8);
  Util::UCS4ToUTF8Append(0xFE000, &utf8);
  EXPECT_FALSE(Util::IsAndroidPuaEmoji(utf8));
}
990 
// Exercises Util::StringPrintf() with string, 32-bit, 64-bit and floating
// point conversions, and with a result much larger than 1024 bytes.
TEST(UtilTest, StringPrintf) {
  // GCC may report "warning: zero-length printf format string" for
  // Util::StringPrintf(""), so that diagnostic is switched off for this test.
  MOZC_GCC_DISABLE_WARNING_INLINE(format-zero-length);

  // --- Strings ---
  EXPECT_TRUE(Util::StringPrintf("").empty());
  EXPECT_TRUE(Util::StringPrintf("%s", "").empty());
  {
    const string kGreeting = "hello, world";
    EXPECT_EQ(kGreeting, Util::StringPrintf("hello, world"));
    EXPECT_EQ(kGreeting, Util::StringPrintf("%s", "hello, world"));
    EXPECT_EQ(kGreeting, Util::StringPrintf("%s, %s", "hello", "world"));
  }
  EXPECT_EQ("はろー世界", Util::StringPrintf("%s", "はろー世界"));

  // --- 32-bit integers: decimal, unsigned, lower- and upper-case hex ---
  EXPECT_EQ("-2147483648", Util::StringPrintf("%d", kint32min));
  EXPECT_EQ("2147483647", Util::StringPrintf("%d", kint32max));
  EXPECT_EQ("4294967295", Util::StringPrintf("%u", kuint32max));
  EXPECT_EQ("80000000", Util::StringPrintf("%x", kint32min));
  EXPECT_EQ("7fffffff", Util::StringPrintf("%x", kint32max));
  EXPECT_EQ("FFFFFFFF", Util::StringPrintf("%X", kuint32max));

  // --- 64-bit integers, using the portable MOZC_PRI*64 length macros ---
  EXPECT_EQ("-9223372036854775808",
            Util::StringPrintf("%" MOZC_PRId64, kint64min));
  EXPECT_EQ("9223372036854775807",
            Util::StringPrintf("%" MOZC_PRId64, kint64max));
  EXPECT_EQ("18446744073709551615",
            Util::StringPrintf("%" MOZC_PRIu64, kuint64max));
  EXPECT_EQ("8000000000000000",
            Util::StringPrintf("%" MOZC_PRIx64, kint64min));
  EXPECT_EQ("7fffffffffffffff",
            Util::StringPrintf("%" MOZC_PRIx64, kint64max));
  EXPECT_EQ("FFFFFFFFFFFFFFFF",
            Util::StringPrintf("%" MOZC_PRIX64, kuint64max));

  // --- Floating point (simple sanity check) ---
  EXPECT_EQ("-1.75", Util::StringPrintf("%.2f", -1.75));

  // --- Long output ---
  // Each argument is 4096 bytes, which is larger than the temporary buffer
  // (1024 bytes) used in StringPrintf().
  const string dots(4096, '.');
  const string underscores(4096, '_');
  const string expected = dots + "\t" + underscores + "\n";
  const string actual =
      Util::StringPrintf("%s\t%s\n", dots.c_str(), underscores.c_str());
  EXPECT_EQ(expected, actual);
}
1038 
// Checks Util::HiraganaToKatakana(): hiragana is converted to katakana, and
// the katakana, kanji and ASCII in the third case come out unchanged.
TEST(UtilTest, HiraganaToKatakana) {
  const struct {
    const char *hiragana;
    const char *katakana;
  } kCases[] = {
      // A run of hiragana letters and their katakana counterparts.
      {"あいうえおぁぃぅぇぉかきくけこがぎぐげごさしすせそざじずぜぞたちつてと"
       "だぢづでどっなにぬねのはひふへほばびぶべぼぱぴぷぺぽまみむめもやゆよゃ"
       "ゅょらりるれろわゎをんゔ",
       "アイウエオァィゥェォカキクケコガギグゲゴサシスセソザジズゼゾタチツテト"
       "ダヂヅデドッナニヌネノハヒフヘホバビブベボパピプペポマミムメモヤユヨャ"
       "ュョラリルレロワヮヲンヴ"},
      // A sentence containing the prolonged sound mark.
      {"わたしのなまえはなかのですうまーよろしゅう",
       "ワタシノナマエハナカノデスウマーヨロシュウ"},
      // Mixed scripts: only the hiragana part changes.
      {"グーグル工藤よろしくabc", "グーグル工藤ヨロシクabc"},
  };

  for (const auto &test_case : kCases) {
    string converted;
    Util::HiraganaToKatakana(test_case.hiragana, &converted);
    EXPECT_EQ(test_case.katakana, converted);
  }
}
1066 
// Checks Util::KatakanaToHiragana(): katakana is converted to hiragana, and
// the kanji and ASCII in the third case come out unchanged.
TEST(UtilTest, KatakanaToHiragana) {
  const struct {
    const char *katakana;
    const char *hiragana;
  } kCases[] = {
      // A run of katakana letters and their hiragana counterparts.
      {"アイウエオァィゥェォカキクケコガギグゲゴサシスセソザジズゼゾタチツテト"
       "ダヂヅデドッナニヌネノハヒフヘホバビブベボパピプペポマミムメモヤユヨャ"
       "ュョラリルレロワヮヲンヰヱヴ",
       "あいうえおぁぃぅぇぉかきくけこがぎぐげごさしすせそざじずぜぞたちつてと"
       "だぢづでどっなにぬねのはひふへほばびぶべぼぱぴぷぺぽまみむめもやゆよゃ"
       "ゅょらりるれろわゎをんゐゑゔ"},
      // A sentence containing the prolonged sound mark.
      {"ワタシノナマエハナカノデスウマーヨロシュウ",
       "わたしのなまえはなかのですうまーよろしゅう"},
      // Mixed scripts: every katakana run changes, the rest is kept.
      {"グーグル工藤ヨロシクabc", "ぐーぐる工藤よろしくabc"},
  };

  for (const auto &test_case : kCases) {
    string converted;
    Util::KatakanaToHiragana(test_case.katakana, &converted);
    EXPECT_EQ(test_case.hiragana, converted);
  }
}
1094 
// Checks Util::RomanjiToHiragana() on whole sentences, on a doubled consonant
// ("kk"), and on input that the table expects to come back unchanged ("xyz").
TEST(UtilTest, RomanjiToHiragana) {
  const struct {
    const char *romanji;
    const char *hiragana;
  } kConversions[] = {
      {"watasinonamaehatakahashinoriyukidesu",
       "わたしのなまえはたかはしのりゆきです"},
      {"majissukamajiyabexe", "まじっすかまじやべぇ"},
      {"kk", "っk"},
      {"xyz", "xyz"},
  };

  for (const auto &conversion : kConversions) {
    string converted;
    Util::RomanjiToHiragana(conversion.romanji, &converted);
    EXPECT_EQ(conversion.hiragana, converted)
        << "input: " << conversion.romanji;
  }
}
1112 
// Checks that Util::NormalizeVoicedSoundMark() folds a hiragana "u" followed
// by a separate voiced sound mark into the single letter "vu".
TEST(UtilTest, NormalizeVoicedSoundMark) {
  const string kWithSeparateMark = "僕のう゛ぁいおりん";
  const string kNormalized = "僕のゔぁいおりん";

  string actual;
  Util::NormalizeVoicedSoundMark(kWithSeparateMark, &actual);
  EXPECT_EQ(kNormalized, actual);
}
1119 
// Checks Util::IsFullWidthSymbolInHalfWidthKatakana(): the expectations below
// accept strings made only of symbols (sound mark, full stop) and reject any
// string that contains a katakana letter.
TEST(UtilTest, IsFullWidthSymbolInHalfWidthKatakana) {
  const char *const kSymbolsOnly[] = {"ー", "。", "ー。"};
  for (const char *text : kSymbolsOnly) {
    EXPECT_TRUE(Util::IsFullWidthSymbolInHalfWidthKatakana(text)) << text;
  }

  const char *const kWithLetters[] = {"グーグル", "グーグル。", "ーグ。"};
  for (const char *text : kWithLetters) {
    EXPECT_FALSE(Util::IsFullWidthSymbolInHalfWidthKatakana(text)) << text;
  }
}
1128 
// Checks Util::IsHalfWidthKatakanaSymbol().
//
// The half-width symbols are written as UTF-8 byte escapes.  They are easily
// confused with (or silently converted to) their full-width look-alikes, and
// a full-width literal would make the TRUE expectations below meaningless.
TEST(UtilTest, IsHalfWidthKatakanaSymbol) {
  // U+FF70 HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
  const char kHalfWidthSoundMark[] = "\xEF\xBD\xB0";
  // U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP
  const char kHalfWidthFullStop[] = "\xEF\xBD\xA1";
  // U+FF64 HALFWIDTH IDEOGRAPHIC COMMA
  const char kHalfWidthComma[] = "\xEF\xBD\xA4";
  // U+FF64 followed by U+FF61
  const char kHalfWidthCommaAndFullStop[] = "\xEF\xBD\xA4\xEF\xBD\xA1";

  // Strings that contain full-width characters are not half-width symbols.
  EXPECT_FALSE(Util::IsHalfWidthKatakanaSymbol("グーグル"));
  EXPECT_FALSE(Util::IsHalfWidthKatakanaSymbol("グーグル。"));

  // Strings made only of half-width symbols.
  EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol(kHalfWidthSoundMark));
  EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol(kHalfWidthFullStop));
  EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol(kHalfWidthComma));
  EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol(kHalfWidthCommaAndFullStop));
}
1137 
// Checks the full-width <-> half-width conversion helpers of Util for ASCII,
// katakana and spaces.
//
// Every literal whose width matters and is not plain ASCII or ordinary
// full-width kana is written as UTF-8 byte escapes.  Half-width katakana,
// full-width ASCII and the ideographic space are easily confused with (or
// silently converted to) their counterparts, which would turn each conversion
// check into an "input == output" assertion.
TEST(UtilTest, FullWidthAndHalfWidth) {
  // Full-width "abc[]?.":
  // U+FF41 U+FF42 U+FF43 U+FF3B U+FF3D U+FF1F U+FF0E.
  const string kFullWidthAscii =
      "\xEF\xBD\x81\xEF\xBD\x82\xEF\xBD\x83"
      "\xEF\xBC\xBB\xEF\xBC\xBD"
      "\xEF\xBC\x9F\xEF\xBC\x8E";
  // Half-width katakana for "internet":
  // U+FF72 U+FF9D U+FF80 U+FF70 U+FF88 U+FF6F U+FF84.
  const string kHalfWidthInternet =
      "\xEF\xBD\xB2\xEF\xBE\x9D\xEF\xBE\x80\xEF\xBD\xB0"
      "\xEF\xBE\x88\xEF\xBD\xAF\xEF\xBE\x84";
  // Half-width katakana for "google":
  // U+FF78 U+FF9E U+FF70 U+FF78 U+FF9E U+FF99.
  const string kHalfWidthGoogle =
      "\xEF\xBD\xB8\xEF\xBE\x9E\xEF\xBD\xB0"
      "\xEF\xBD\xB8\xEF\xBE\x9E\xEF\xBE\x99";
  // Half-width corner brackets: U+FF62 U+FF63.
  const string kHalfWidthCornerBrackets = "\xEF\xBD\xA2\xEF\xBD\xA3";
  // One half-width space (U+0020) followed by one full-width space (U+3000).
  const string kHalfAndFullSpaces = " \xE3\x80\x80";
  const string kTwoHalfSpaces = "  ";
  // Two full-width spaces (U+3000 U+3000).
  const string kTwoFullSpaces = "\xE3\x80\x80\xE3\x80\x80";

  string output;

  // Empty input yields empty output in both directions.
  Util::FullWidthToHalfWidth("", &output);
  EXPECT_EQ("", output);

  Util::HalfWidthToFullWidth("", &output);
  EXPECT_EQ("", output);

  // Half-width -> full-width.
  Util::HalfWidthToFullWidth("abc[]?.", &output);
  EXPECT_EQ(kFullWidthAscii, output);

  Util::HalfWidthToFullWidth(kHalfWidthInternet + kHalfWidthCornerBrackets,
                             &output);
  EXPECT_EQ("インターネット「」", output);

  // Input that is already partly full-width.
  Util::HalfWidthToFullWidth(kHalfWidthInternet + "グーグル", &output);
  EXPECT_EQ("インターネットグーグル", output);

  // Full-width -> half-width.
  Util::FullWidthToHalfWidth(kFullWidthAscii, &output);
  EXPECT_EQ("abc[]?.", output);

  Util::FullWidthToHalfWidth("インターネット", &output);
  EXPECT_EQ(kHalfWidthInternet, output);

  // Input that is already partly half-width.
  Util::FullWidthToHalfWidth(kHalfWidthInternet + "グーグル", &output);
  EXPECT_EQ(kHalfWidthInternet + kHalfWidthGoogle, output);

  // Spaces.
  Util::FullWidthToHalfWidth(kHalfAndFullSpaces, &output);
  EXPECT_EQ(kTwoHalfSpaces, output);

  Util::HalfWidthToFullWidth(kHalfAndFullSpaces, &output);
  EXPECT_EQ(kTwoFullSpaces, output);

  // Spaces are treated as ASCII by the ASCII-only converters.
  Util::FullWidthAsciiToHalfWidthAscii(kHalfAndFullSpaces, &output);
  EXPECT_EQ(kTwoHalfSpaces, output);

  Util::HalfWidthAsciiToFullWidthAscii(kTwoHalfSpaces, &output);
  EXPECT_EQ(kTwoFullSpaces, output);

  // The katakana-only converters leave both kinds of space untouched.
  Util::FullWidthKatakanaToHalfWidthKatakana(kHalfAndFullSpaces, &output);
  EXPECT_EQ(kHalfAndFullSpaces, output);

  Util::HalfWidthKatakanaToFullWidthKatakana(kHalfAndFullSpaces, &output);
  EXPECT_EQ(kHalfAndFullSpaces, output);
}
1188 
// Checks Util::IsOpenBracket() and Util::IsCloseBracket(): each recognizes
// its own kind of bracket, reports the matching partner through the output
// argument, and rejects the other kind.
TEST(UtilTest, BracketTest) {
  // NOTE(review): the "(", "[" and "{" entries are ASCII while every other
  // entry is a full-width CJK bracket -- confirm the intended width of these
  // three pairs.
  static const struct BracketPair {
    const char *open_bracket;
    const char *close_bracket;
  } kBracketPairs[] = {
      { "(", ")" },
      { "〔", "〕" },
      { "[", "]" },
      { "{", "}" },
      { "〈", "〉" },
      { "《", "》" },
      { "「", "」" },
      { "『", "』" },
      { "【", "】" },
      { "〘", "〙" },
      { "〚", "〛" },
  };

  string partner;
  for (const BracketPair &brackets : kBracketPairs) {
    // Open bracket -> matching close bracket.
    EXPECT_TRUE(Util::IsOpenBracket(brackets.open_bracket, &partner));
    EXPECT_EQ(brackets.close_bracket, partner);

    // Close bracket -> matching open bracket.
    EXPECT_TRUE(Util::IsCloseBracket(brackets.close_bracket, &partner));
    EXPECT_EQ(brackets.open_bracket, partner);

    // Neither predicate accepts the other kind of bracket.
    EXPECT_FALSE(Util::IsOpenBracket(brackets.close_bracket, &partner));
    EXPECT_FALSE(Util::IsCloseBracket(brackets.open_bracket, &partner));
  }
}
1221 
// Checks Util::IsEnglishTransliteration(): ASCII letters, spaces and the
// punctuation used below are accepted; Japanese text is rejected.
TEST(UtilTest, IsEnglishTransliteration) {
  const char *const kEnglish[] = {
      "ABC", "Google", "Google Map", "ABC-DEF", "Foo-bar",
      "Foo!", "Who's", "!",          "  ",
  };
  for (const char *text : kEnglish) {
    EXPECT_TRUE(Util::IsEnglishTransliteration(text)) << "[" << text << "]";
  }

  // Hiragana, katakana and kanji.
  const char *const kJapanese[] = {"てすと", "テスト", "東京"};
  for (const char *text : kJapanese) {
    EXPECT_FALSE(Util::IsEnglishTransliteration(text)) << "[" << text << "]";
  }
}
1236 
// Checks Util::ChopReturns(): trailing '\r' / '\n' characters are removed
// (all of them, not only the last one), line breaks inside the text are kept,
// and the return value tells whether anything was removed.
TEST(UtilTest, ChopReturns) {
  const struct {
    const char *input;
    bool expect_chopped;
    const char *expected_line;
  } kCases[] = {
      {"line\n", true, "line"},
      {"line\r", true, "line"},
      {"line\r\n", true, "line"},
      {"line", false, "line"},
      {"line1\nline2\n", true, "line1\nline2"},
      {"line\n\n\n", true, "line"},
  };

  for (const auto &test_case : kCases) {
    string line = test_case.input;
    EXPECT_EQ(test_case.expect_chopped, Util::ChopReturns(&line));
    EXPECT_EQ(test_case.expected_line, line);
  }
}
1262 
// Checks Util::EncodeURI(): non-ASCII bytes and URI punctuation are
// percent-encoded (a space becomes "%20"); plain letters pass through.
TEST(UtilTest, EncodeURI) {
  const struct {
    const char *raw;
    const char *encoded;
  } kCases[] = {
      {"もずく", "%E3%82%82%E3%81%9A%E3%81%8F"},
      {"mozc", "mozc"},
      {"http://mozc/?q=Hello World",
       "http%3A%2F%2Fmozc%2F%3Fq%3DHello%20World"},
  };

  string actual;
  for (const auto &test_case : kCases) {
    // Start every case from an empty output string.
    actual.clear();
    Util::EncodeURI(test_case.raw, &actual);
    EXPECT_EQ(test_case.encoded, actual);
  }
}
1276 
// Checks Util::DecodeURI(): percent-escapes are decoded, '+' is decoded to a
// space, and plain letters pass through.
TEST(UtilTest, DecodeURI) {
  const struct {
    const char *encoded;
    const char *raw;
  } kCases[] = {
      {"%E3%82%82%E3%81%9A%E3%81%8F", "もずく"},
      {"mozc", "mozc"},
      {"http%3A%2F%2Fmozc%2F%3Fq%3DHello+World",
       "http://mozc/?q=Hello World"},
  };

  string actual;
  for (const auto &test_case : kCases) {
    // Start every case from an empty output string.
    actual.clear();
    Util::DecodeURI(test_case.encoded, &actual);
    EXPECT_EQ(test_case.raw, actual);
  }
}
1290 
// Checks Util::AppendCGIParams(): nothing is appended for an empty parameter
// list, values are percent-encoded, pairs are joined with '&', and the result
// is appended to whatever the output string already holds.
TEST(UtilTest, AppendCGIParams) {
  std::vector<std::pair<string, string> > cgi_params;
  string result;

  // No parameters: the (empty) output stays empty.
  Util::AppendCGIParams(cgi_params, &result);
  EXPECT_TRUE(result.empty());

  // One parameter, appended after an existing URL prefix.
  cgi_params.push_back(std::pair<string, string>("foo", "b a+r"));
  result = "http://mozc.com?";
  Util::AppendCGIParams(cgi_params, &result);
  EXPECT_EQ("http://mozc.com?foo=b%20a%2Br", result);

  // Two parameters, appended to an empty string.
  cgi_params.push_back(std::pair<string, string>("buzz", "mozc"));
  result.clear();
  Util::AppendCGIParams(cgi_params, &result);
  EXPECT_EQ("foo=b%20a%2Br&buzz=mozc", result);
}
1307 
// Checks that Util::Escape() renders every byte of a UTF-8 string as a
// "\xHH" sequence with upper-case hex digits.
TEST(UtilTest, Escape) {
  const string kExpected = "\\xE3\\x82\\x89\\xE3\\x82\\x80\\xE3\\x81\\xA0";

  string actual;
  Util::Escape("らむだ", &actual);
  EXPECT_EQ(kExpected, actual);
}
1313 
// Checks Util::Unescape(): "\xHH" sequences (upper- or lower-case hex digits)
// are turned back into raw bytes, and malformed input is reported by a false
// return value.
TEST(UtilTest, Unescape) {
  const struct {
    const char *escaped;
    string raw;
  } kValidCases[] = {
      {"\\xE3\\x82\\x89\\xE3\\x82\\x80\\xE3\\x81\\xA0", "らむだ"},
      {"\\x4D\\x6F\\x7A\\x63", "Mozc"},
      // A binary sequence (upper case); the expected value contains a NUL.
      {"\\x00\\x01\\xEF\\xFF", string("\x00\x01\xEF\xFF", 4)},
      // A binary sequence (lower case).
      {"\\x00\\x01\\xef\\xff", string("\x00\x01\xEF\xFF", 4)},
      // Empty input gives empty output.
      {"", ""},
  };

  // The same output string is reused for every call, including the failing
  // ones at the end.
  string actual;
  for (const auto &test_case : kValidCases) {
    EXPECT_TRUE(Util::Unescape(test_case.escaped, &actual));
    EXPECT_EQ(test_case.raw, actual);
  }

  // Backslashes that are not followed by a valid "xHH" sequence.
  const char *const kMalformed[] = {"\\AB\\CD\\EFG", "\\01\\XY"};
  for (const char *escaped : kMalformed) {
    EXPECT_FALSE(Util::Unescape(escaped, &actual)) << escaped;
  }
}
1337 
// Checks both forms of Util::EscapeUrl() -- the one that writes to an output
// argument and the one that returns the string -- on the same input.
TEST(UtilTest, EscapeUrl) {
  const string kExpected = "%E3%82%89%E3%82%80%E3%81%A0";

  // Output-argument form.
  string via_out_param;
  Util::EscapeUrl("らむだ", &via_out_param);
  EXPECT_EQ(kExpected, via_out_param);

  // Value-returning form.
  EXPECT_EQ(kExpected, Util::EscapeUrl("らむだ"));
}
1344 
// Checks that Util::EscapeHtml() replaces <, >, &, ' and " with HTML
// entities and leaves ordinary letters alone.
TEST(UtilTest, EscapeHtml) {
  const string kRaw = "<>&'\"abc";
  const string kExpected = "&lt;&gt;&amp;&#39;&quot;abc";

  string actual;
  Util::EscapeHtml(kRaw, &actual);
  EXPECT_EQ(kExpected, actual);
}
1350 
// Checks that Util::EscapeCss() replaces only '<' (with "&lt;") and leaves
// >, &, ', " and ordinary letters untouched.
TEST(UtilTest, EscapeCss) {
  const string kRaw = "<>&'\"abc";
  const string kExpected = "&lt;>&'\"abc";

  string actual;
  Util::EscapeCss(kRaw, &actual);
  EXPECT_EQ(kExpected, actual);
}
1356 
TEST(UtilTest,ScriptType)1357 TEST(UtilTest, ScriptType) {
1358   EXPECT_TRUE(Util::IsScriptType("くどう", Util::HIRAGANA));
1359   EXPECT_TRUE(Util::IsScriptType("京都", Util::KANJI));
1360   // (b/4201140)
1361   EXPECT_TRUE(Util::IsScriptType("人々", Util::KANJI));
1362   EXPECT_TRUE(Util::IsScriptType("モズク", Util::KATAKANA));
1363   EXPECT_TRUE(Util::IsScriptType("モズクモズク", Util::KATAKANA));
1364   EXPECT_TRUE(Util::IsScriptType("ぐーぐる", Util::HIRAGANA));
1365   EXPECT_TRUE(Util::IsScriptType("グーグル", Util::KATAKANA));
1366   // U+309F: HIRAGANA DIGRAPH YORI
1367   EXPECT_TRUE(Util::IsScriptType("ゟ", Util::HIRAGANA));
1368   // U+30FF: KATAKANA DIGRAPH KOTO
1369   EXPECT_TRUE(Util::IsScriptType("ヿ", Util::KATAKANA));
1370   EXPECT_TRUE(Util::IsScriptType("ヷヸヹヺㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ",
1371                                  Util::KATAKANA));
1372   // "��›€€" U+1B000: KATAKANA LETTER ARCHAIC E
1373   EXPECT_TRUE(Util::IsScriptType("\xF0\x9B\x80\x80", Util::KATAKANA));
1374   // "��›€" U+1B001: HIRAGANA LETTER ARCHAIC YE
1375   EXPECT_TRUE(Util::IsScriptType("\xF0\x9B\x80\x81", Util::HIRAGANA));
1376 
1377   EXPECT_TRUE(Util::IsScriptType("012", Util::NUMBER));
1378   EXPECT_TRUE(Util::IsScriptType("012012", Util::NUMBER));
1379   EXPECT_TRUE(Util::IsScriptType("abcABC", Util::ALPHABET));
1380   EXPECT_TRUE(Util::IsScriptType("ABCD", Util::ALPHABET));
1381   EXPECT_TRUE(Util::IsScriptType("@!#", Util::UNKNOWN_SCRIPT));
1382 
1383   EXPECT_FALSE(Util::IsScriptType("くどカう", Util::HIRAGANA));
1384   EXPECT_FALSE(Util::IsScriptType("京あ都", Util::KANJI));
1385   EXPECT_FALSE(Util::IsScriptType("モズあク", Util::KATAKANA));
1386   EXPECT_FALSE(Util::IsScriptType("モあズクモズク", Util::KATAKANA));
1387   EXPECT_FALSE(Util::IsScriptType("012あ", Util::NUMBER));
1388   EXPECT_FALSE(Util::IsScriptType("012あ012", Util::NUMBER));
1389   EXPECT_FALSE(Util::IsScriptType("abcABあC", Util::ALPHABET));
1390   EXPECT_FALSE(Util::IsScriptType("ABあCD", Util::ALPHABET));
1391   EXPECT_FALSE(Util::IsScriptType("ぐーぐるグ", Util::HIRAGANA));
1392   EXPECT_FALSE(Util::IsScriptType("グーグルぐ", Util::KATAKANA));
1393 
1394   EXPECT_TRUE(Util::ContainsScriptType("グーグルsuggest", Util::ALPHABET));
1395   EXPECT_FALSE(Util::ContainsScriptType("グーグルサジェスト", Util::ALPHABET));
1396 
1397   EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("くどう"));
1398   EXPECT_EQ(Util::KANJI, Util::GetScriptType("京都"));
1399   // b/4201140
1400   EXPECT_EQ(Util::KANJI, Util::GetScriptType("人々"));
1401   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("モズク"));
1402   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("モズクモズク"));
1403   EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("ぐーぐる"));
1404   EXPECT_EQ(Util::HIRAGANA, Util::GetFirstScriptType("ぐーぐる"));
1405 
1406   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("グーグル"));
1407   EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("グーグル"));
1408   // U+309F HIRAGANA DIGRAPH YORI
1409   EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("ゟ"));
1410   EXPECT_EQ(Util::HIRAGANA, Util::GetFirstScriptType("ゟ"));
1411 
1412   // U+30FF KATAKANA DIGRAPH KOTO
1413   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("ヿ"));
1414   EXPECT_EQ(Util::KATAKANA,
1415             Util::GetScriptType("ヷヸヹヺㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ"));
1416   // "��" U+1B000 KATAKANA LETTER ARCHAIC E
1417   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xF0\x9B\x80\x80"));
1418   // "��" U+1B001 HIRAGANA LETTER ARCHAIC YE
1419   EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("\xF0\x9B\x80\x81"));
1420 
1421   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("!グーグル"));
1422   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("ー"));    // U+30FC
1423   EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("ー"));     // U+30FC
1424   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("ーー"));  // U+30FC * 2
1425   EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("ーー"));   // U+30FC * 2
1426   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("゛"));
1427   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("゜"));
1428 
1429   EXPECT_EQ(Util::NUMBER, Util::GetScriptType("012"));
1430   EXPECT_EQ(Util::NUMBER, Util::GetScriptType("012012"));
1431   EXPECT_EQ(Util::ALPHABET, Util::GetScriptType("abcABC"));
1432   EXPECT_EQ(Util::ALPHABET, Util::GetScriptType("ABCD"));
1433   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("@!#"));
1434   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("@!#"));
1435 
1436   EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("ーひらがな"));
1437   EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("ーひらがな"));
1438   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("ーカタカナ"));
1439   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("ーカタカナ"));
1440   EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("ひらがなー"));
1441   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("カタカナー"));
1442   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("カタカナー"));
1443 
1444   EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("あ゛っ"));
1445   EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("あ゜っ"));
1446   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("ア゛ッ"));
1447   EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("ア゜ッ"));
1448 
1449   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("くどカう"));
1450   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("京あ都"));
1451   EXPECT_EQ(Util::KANJI, Util::GetFirstScriptType("京あ都"));
1452 
1453   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("モズあク"));
1454   EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("モズあク"));
1455 
1456   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("モあズクモズク"));
1457   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("012あ"));
1458   EXPECT_EQ(Util::NUMBER, Util::GetFirstScriptType("012あ"));
1459   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("012あ012"));
1460   EXPECT_EQ(Util::NUMBER, Util::GetFirstScriptType("012あ012"));
1461   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("abcABあC"));
1462   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("ABあCD"));
1463   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("ぐーぐるグ"));
1464   EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("グーグルぐ"));
1465 
1466   // "龦" U+9FA6
1467   EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xE9\xBE\xA6"));
1468   // "龻" U+9FBB
1469   EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xE9\xBE\xBB"));
1470   // U+9FFF is not assigned yet but reserved for CJK Unified Ideographs.
1471   EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xE9\xBF\xBF"));
1472   // "��咤" U+20B9F U+54A4
1473   EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xF0\xA0\xAE\x9F\xE5\x92\xA4"));
1474   // "��野" U+20BB7 U+91CE
1475   EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xF0\xA0\xAE\xB7\xE9\x87\x8E"));
1476   // "��" U+2F884
1477   EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xF0\xAF\xA2\x84"));
1478 
1479   // U+1F466, BOY/smile emoji
1480   EXPECT_EQ(Util::EMOJI, Util::GetScriptType("\xF0\x9F\x91\xA6"));
1481   // U+FE003, Snow-man Android PUA emoji
1482   EXPECT_TRUE(Util::IsAndroidPuaEmoji("\xf3\xbe\x80\x83"));
1483   EXPECT_EQ(Util::EMOJI, Util::GetScriptType("\xf3\xbe\x80\x83"));
1484 }
1485 
// Checks Util::GetScriptTypeWithoutSymbols(): spaces and symbols are ignored
// when the script of a string is determined.  Inputs are grouped by the
// script type they are expected to yield.
TEST(UtilTest, ScriptTypeWithoutSymbols) {
  const char *const kHiragana[] = {
      "くど う", "・あ", "・・あ", "ひとの☆なまえ",
  };
  for (const char *text : kHiragana) {
    EXPECT_EQ(Util::HIRAGANA, Util::GetScriptTypeWithoutSymbols(text))
        << "[" << text << "]";
  }

  const char *const kKanji[] = {"京 都", "人☆名"};
  for (const char *text : kKanji) {
    EXPECT_EQ(Util::KANJI, Util::GetScriptTypeWithoutSymbols(text))
        << "[" << text << "]";
  }

  const char *const kKatakana[] = {
      "モズク", "モズ クモズク", "コギト・エルゴ・スム",
  };
  for (const char *text : kKatakana) {
    EXPECT_EQ(Util::KATAKANA, Util::GetScriptTypeWithoutSymbols(text))
        << "[" << text << "]";
  }

  const char *const kAlphabet[] = {
      "Google Earth", "Google ", " Google", " Google ", "     g",
      "Hello!",       "CD-ROM",  "-A",      "--A",      "--A---",
  };
  for (const char *text : kAlphabet) {
    EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols(text))
        << "[" << text << "]";
  }

  // Empty input, symbols only, or more than one script once the symbols are
  // ignored.
  const char *const kUnknown[] = {
      "",
      " ",
      "   ",
      "Hello!あ",
      "CD-ROMア",
      "-",
      "--A-ア-",
      "!",
      "コギト・エルゴ・住む",
      "超☆最高です",
      "・--☆",
  };
  for (const char *text : kUnknown) {
    EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols(text))
        << "[" << text << "]";
  }
}
1523 
// Checks Util::GetFormType() for full-width, half-width and mixed-width text.
//
// Half-width katakana and full-width ASCII are written as UTF-8 byte escapes.
// They are easily confused with (or silently converted to) their
// counterparts, which would leave this test expecting both FULL_WIDTH and
// HALF_WIDTH for one and the same literal.
TEST(UtilTest, FormType) {
  // Half-width katakana for "mozuku": U+FF93 U+FF7D U+FF9E U+FF78.
  const char kHalfWidthMozuku[] =
      "\xEF\xBE\x93\xEF\xBD\xBD\xEF\xBE\x9E\xEF\xBD\xB8";
  // Half-width katakana for "google":
  // U+FF78 U+FF9E U+FF70 U+FF78 U+FF9E U+FF99.
  const char kHalfWidthGoogle[] =
      "\xEF\xBD\xB8\xEF\xBE\x9E\xEF\xBD\xB0"
      "\xEF\xBD\xB8\xEF\xBE\x9E\xEF\xBE\x99";
  // U+FF70 HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK.
  const char kHalfWidthSoundMark[] = "\xEF\xBD\xB0";
  // Full-width digits U+FF10 U+FF11 U+FF12 followed by ASCII "012".  The
  // literal is split so that the ASCII digits are not read as part of the
  // last hex escape.
  const char kFullThenHalfWidthDigits[] =
      "\xEF\xBC\x90\xEF\xBC\x91\xEF\xBC\x92" "012";
  // Full-width "ABCD": U+FF21 U+FF22 U+FF23 U+FF24.
  const char kFullWidthAbcd[] =
      "\xEF\xBC\xA1\xEF\xBC\xA2\xEF\xBC\xA3\xEF\xBC\xA4";

  // Kana and kanji.
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("くどう"));
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("京都"));
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("モズク"));
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType(kHalfWidthMozuku));
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("ぐーぐる"));
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("グーグル"));
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType(kHalfWidthGoogle));
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType(kHalfWidthSoundMark));
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("ー"));  // U+30FC

  // Latin-1 symbols.
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("¢£¥¦¬¯"));
  // Half-width forms U+FFE8 .. U+FFEE (light vertical, four arrows, black
  // square, white circle).
  EXPECT_EQ(Util::HALF_WIDTH,
            Util::GetFormType("\xEF\xBF\xA8\xEF\xBF\xA9\xEF\xBF\xAA\xEF\xBF\xAB"
                              "\xEF\xBF\xAC\xEF\xBF\xAD\xEF\xBF\xAE"));

  // Half-width mathematical symbols
  // [U+27E6, U+27ED], U+2985, and U+2986
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("⟦⟧⟨⟩⟪⟫⟬⟭⦅⦆"));

  // Half-width hangul U+FFA0 U+FFA1 U+FFA2
  EXPECT_EQ(Util::HALF_WIDTH,
            Util::GetFormType("\xEF\xBE\xA0\xEF\xBE\xA1\xEF\xBE\xA2"));

  // Half-width won "₩"
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("₩"));

  // Digits, letters and ASCII symbols.  Mixing both widths in one string
  // yields UNKNOWN_FORM.
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("012"));
  EXPECT_EQ(Util::UNKNOWN_FORM, Util::GetFormType(kFullThenHalfWidthDigits));
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("abcABC"));
  EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType(kFullWidthAbcd));
  EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("@!#"));
}
1557 
1558 #ifndef OS_NACL
1559 // We have a snapshot of the result of |Util::GetCharacterSet(ucs4)| in
1560 // data/test/character_set/character_set.tsv.
1561 // Compare the result for each character just in case.
1562 //
1563 // Disabled on NaCl since it uses a mock file system.
TEST(UtilTest, CharacterSetFullTest) {
  // Expected values, filled in by FillTestCharacterSetMap().
  std::map<char32, Util::CharacterSet> expected_sets;
  FillTestCharacterSetMap(&expected_sets);
  EXPECT_FALSE(expected_sets.empty());

  // Walk every Unicode code point, i.e. [U+0000, U+10FFFF].
  const char32 kLastCodePoint = 0x10ffff;
  for (char32 code_point = 0; code_point <= kLastCodePoint; ++code_point) {
    const Util::CharacterSet actual = Util::GetCharacterSet(code_point);
    EXPECT_EQ(GetExpectedCharacterSet(expected_sets, code_point), actual)
        << "Character set changed at " << code_point;
  }
}
1576 #endif  // OS_NACL
1577 
// Spot-checks the code-point form of Util::GetCharacterSet(): the ASCII
// boundary, and a few code points for JIS X 0213, CP932 and Unicode-only.
TEST(UtilTest, CharacterSet_gen_character_set) {
  // [0x00, 0x7f] are ASCII; [0x80, 0xff] are not.
  for (char32 code_point = 0; code_point <= 0xff; ++code_point) {
    if (code_point <= 0x7f) {
      EXPECT_EQ(Util::ASCII, Util::GetCharacterSet(code_point));
    } else {
      EXPECT_NE(Util::ASCII, Util::GetCharacterSet(code_point));
    }
  }

  // JIS X 0213.
  const char32 kJisX0213[] = {
      0x2160,   // ROMAN NUMERAL ONE
      0x2460,   // CIRCLED DIGIT ONE
      0x32A4,   // CIRCLED IDEOGRAPH HIGH
      0x20B9F,  // from UCS4 range (b/4176888)
      0x2A6B2,  // from UCS4 range (b/4176888)
  };
  for (const char32 code_point : kJisX0213) {
    EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(code_point))
        << "code point: " << code_point;
  }

  // Only in CP932: U+51EC.
  EXPECT_EQ(Util::CP932, Util::GetCharacterSet(0x51EC));

  // Only in Unicode.
  const char32 kUnicodeOnly[] = {
      0xFFE6,   // FULLWIDTH WON SIGN
      0x20BB7,  // from UCS4 range (b/4176888)
  };
  for (const char32 code_point : kUnicodeOnly) {
    EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet(code_point))
        << "code point: " << code_point;
  }
}
1610 
// Checks the string form of Util::GetCharacterSet().
//
// Half-width katakana and characters outside the BMP are written as UTF-8
// byte escapes: the former are easily converted to full-width look-alikes
// (which belong to a different character set), and the latter are easily
// damaged into U+FFFD replacement characters.
TEST(UtilTest, CharacterSet) {
  // Half-width katakana "ka ta ka na": U+FF76 U+FF80 U+FF76 U+FF85.
  const string kHalfWidthKatakana =
      "\xEF\xBD\xB6\xEF\xBE\x80\xEF\xBD\xB6\xEF\xBE\x85";

  EXPECT_EQ(Util::JISX0208, Util::GetCharacterSet("あいうえお"));
  EXPECT_EQ(Util::ASCII, Util::GetCharacterSet("abc"));
  EXPECT_EQ(Util::JISX0208, Util::GetCharacterSet("abcあいう"));

  // Half-width katakana alone is JIS X 0201; together with full-width
  // katakana the string is reported as JIS X 0208.
  EXPECT_EQ(Util::JISX0201, Util::GetCharacterSet(kHalfWidthKatakana));
  EXPECT_EQ(Util::JISX0208,
            Util::GetCharacterSet("カタカナ" + kHalfWidthKatakana));

  // 0213
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("Ⅰ"));
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("①"));
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("㊤"));
  // U+20B9F from UCS4 range (b/4176888)
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("\xF0\xA0\xAE\x9F"));
  // U+2A6B2 from UCS4 range (b/4176888)
  EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("\xF0\xAA\x9A\xB2"));

  // only in CP932
  EXPECT_EQ(Util::CP932, Util::GetCharacterSet("凬"));

  // only in Unicode
  EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet("₩"));
  // U+20BB7 from UCS4 range (b/4176888)
  EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet("\xF0\xA0\xAE\xB7"));
}
1637 
1638 #ifdef OS_WIN
// Checks Util::WideCharsLen() on a string whose middle character lies outside
// the BMP; the expectations count that character as two wide characters.
TEST(UtilTest, WideCharsLen) {
  // 'a', U+20B9F (F0 A0 AE 9F, written in octal), 'b'.
  const string utf8 = "a\360\240\256\237b";
  EXPECT_EQ(4, Util::WideCharsLen(utf8));

  // kWideLengths[n] is the expected length of the first n characters.
  const int kWideLengths[] = {0, 1, 3, 4};
  for (size_t char_count = 0; char_count < arraysize(kWideLengths);
       ++char_count) {
    EXPECT_EQ(kWideLengths[char_count],
              Util::WideCharsLen(Util::SubString(utf8, 0, char_count)))
        << "characters: " << char_count;
  }
}
1648 
// Checks that an ASCII string survives a UTF-8 -> wide -> UTF-8 round trip.
TEST(UtilTest, UTF8ToWide) {
  const string kOriginal = "abc";

  std::wstring wide;
  Util::UTF8ToWide(kOriginal, &wide);

  string round_tripped;
  Util::WideToUTF8(wide, &round_tripped);
  EXPECT_EQ("abc", round_tripped);
}
1658 
// Checks that a UTF-16 surrogate pair is converted to a single 4-byte UTF-8
// sequence and back again.
TEST(UtilTest, WideToUTF8_SurrogatePairSupport) {
  // U+20B9F as a surrogate pair.  Visual C++ 2008 does not support embedding
  // a surrogate pair in a string literal such as L"\uD842\uDF9F", hence the
  // wchar_t array.
  const wchar_t kSurrogatePair[] = {0xD842, 0xDF9F, 0};

  // wide -> UTF-8
  string utf8;
  Util::WideToUTF8(kSurrogatePair, &utf8);
  EXPECT_EQ("\360\240\256\237", utf8);

  // UTF-8 -> wide
  std::wstring round_tripped;
  Util::UTF8ToWide(utf8, &round_tripped);
  EXPECT_EQ(kSurrogatePair, round_tripped);
}
1673 #endif  // OS_WIN
1674 
// Checks that Util::IsKanaSymbolContained() finds a kana full stop wherever
// it appears in the string and reports nothing for a space or empty input.
TEST(UtilTest, IsKanaSymbolContained) {
  const string full_stop = "。";
  const string space = " ";

  const string kWithSymbol[] = {
      full_stop, space + full_stop, full_stop + space,
  };
  for (const string &text : kWithSymbol) {
    EXPECT_TRUE(Util::IsKanaSymbolContained(text)) << "[" << text << "]";
  }

  EXPECT_FALSE(Util::IsKanaSymbolContained(space));
  EXPECT_FALSE(Util::IsKanaSymbolContained(""));
}
1684 
// Checks that Util::SetRandomSeed() makes Util::Random() reproducible:
// re-seeding with the same value restarts the same sequence.
TEST(UtilTest, RandomSeedTest) {
  const int kSeed = 0;

  Util::SetRandomSeed(kSeed);
  const int first_value = Util::Random(INT_MAX);
  const int next_value = Util::Random(INT_MAX);
  // Consecutive draws are expected to differ.
  EXPECT_NE(first_value, next_value);

  // After resetting the seed, the first draw repeats.
  Util::SetRandomSeed(kSeed);
  const int first_value_again = Util::Random(INT_MAX);
  EXPECT_EQ(first_value, first_value_again);
}
1695 
TEST(UtilTest,SplitFirstChar32)1696 TEST(UtilTest, SplitFirstChar32) {
1697   StringPiece rest;
1698   char32 c = 0;
1699 
1700   rest = StringPiece();
1701   c = 0;
1702   EXPECT_FALSE(Util::SplitFirstChar32("", &c, &rest));
1703   EXPECT_EQ(0, c);
1704   EXPECT_TRUE(rest.empty());
1705 
1706   // Allow nullptr to ignore the matched value.
1707   rest = StringPiece();
1708   EXPECT_TRUE(Util::SplitFirstChar32("01", nullptr, &rest));
1709   EXPECT_EQ("1", rest);
1710 
1711   // Allow nullptr to ignore the matched value.
1712   c = 0;
1713   EXPECT_TRUE(Util::SplitFirstChar32("01", &c, nullptr));
1714   EXPECT_EQ('0', c);
1715 
1716   rest = StringPiece();
1717   c = 0;
1718   EXPECT_TRUE(Util::SplitFirstChar32("\x01 ", &c, &rest));
1719   EXPECT_EQ(1, c);
1720   EXPECT_EQ(" ", rest);
1721 
1722   rest = StringPiece();
1723   c = 0;
1724   EXPECT_TRUE(Util::SplitFirstChar32("\x7F ", &c, &rest));
1725   EXPECT_EQ(0x7F, c);
1726   EXPECT_EQ(" ", rest);
1727 
1728   rest = StringPiece();
1729   c = 0;
1730   EXPECT_TRUE(Util::SplitFirstChar32("\xC2\x80 ", &c, &rest));
1731   EXPECT_EQ(0x80, c);
1732   EXPECT_EQ(" ", rest);
1733 
1734   rest = StringPiece();
1735   c = 0;
1736   EXPECT_TRUE(Util::SplitFirstChar32("\xDF\xBF ", &c, &rest));
1737   EXPECT_EQ(0x7FF, c);
1738   EXPECT_EQ(" ", rest);
1739 
1740   rest = StringPiece();
1741   c = 0;
1742   EXPECT_TRUE(Util::SplitFirstChar32("\xE0\xA0\x80 ", &c, &rest));
1743   EXPECT_EQ(0x800, c);
1744   EXPECT_EQ(" ", rest);
1745 
1746   rest = StringPiece();
1747   c = 0;
1748   EXPECT_TRUE(Util::SplitFirstChar32("\xEF\xBF\xBF ", &c, &rest));
1749   EXPECT_EQ(0xFFFF, c);
1750   EXPECT_EQ(" ", rest);
1751 
1752   rest = StringPiece();
1753   c = 0;
1754   EXPECT_TRUE(Util::SplitFirstChar32("\xF0\x90\x80\x80 ", &c, &rest));
1755   EXPECT_EQ(0x10000, c);
1756   EXPECT_EQ(" ", rest);
1757 
1758   rest = StringPiece();
1759   c = 0;
1760   EXPECT_TRUE(Util::SplitFirstChar32("\xF7\xBF\xBF\xBF ", &c, &rest));
1761   EXPECT_EQ(0x1FFFFF, c);
1762   EXPECT_EQ(" ", rest);
1763 
1764   rest = StringPiece();
1765   c = 0;
1766   EXPECT_TRUE(Util::SplitFirstChar32("\xF8\x88\x80\x80\x80 ", &c, &rest));
1767   EXPECT_EQ(0x200000, c);
1768   EXPECT_EQ(" ", rest);
1769 
1770   rest = StringPiece();
1771   c = 0;
1772   EXPECT_TRUE(Util::SplitFirstChar32("\xFB\xBF\xBF\xBF\xBF ", &c, &rest));
1773   EXPECT_EQ(0x3FFFFFF, c);
1774   EXPECT_EQ(" ", rest);
1775 
1776   rest = StringPiece();
1777   c = 0;
1778   EXPECT_TRUE(Util::SplitFirstChar32("\xFC\x84\x80\x80\x80\x80 ", &c, &rest));
1779   EXPECT_EQ(0x4000000, c);
1780   EXPECT_EQ(" ", rest);
1781 
1782   rest = StringPiece();
1783   c = 0;
1784   EXPECT_TRUE(Util::SplitFirstChar32("\xFD\xBF\xBF\xBF\xBF\xBF ", &c, &rest));
1785   EXPECT_EQ(0x7FFFFFFF, c);
1786   EXPECT_EQ(" ", rest);
1787 
1788   // If there is any invalid sequence, the entire text should be treated as
1789   // am empty string.
1790   {
1791     c = 0;
1792     EXPECT_FALSE(Util::SplitFirstChar32("\xC2 ", &c, &rest));
1793     EXPECT_EQ(0, c);
1794 
1795     c = 0;
1796     EXPECT_FALSE(Util::SplitFirstChar32("\xC2\xC2 ", &c, &rest));
1797     EXPECT_EQ(0, c);
1798 
1799     c = 0;
1800     EXPECT_FALSE(Util::SplitFirstChar32("\xE0 ", &c, &rest));
1801     EXPECT_EQ(0, c);
1802 
1803     c = 0;
1804     EXPECT_FALSE(Util::SplitFirstChar32("\xE0\xE0\xE0 ", &c, &rest));
1805     EXPECT_EQ(0, c);
1806 
1807     c = 0;
1808     EXPECT_FALSE(Util::SplitFirstChar32("\xF0 ", &c, &rest));
1809     EXPECT_EQ(0, c);
1810 
1811     c = 0;
1812     EXPECT_FALSE(Util::SplitFirstChar32("\xF0\xF0\xF0\xF0 ", &c, &rest));
1813     EXPECT_EQ(0, c);
1814   }
1815 
1816   // BOM should be treated as invalid byte.
1817   {
1818     c = 0;
1819     EXPECT_FALSE(Util::SplitFirstChar32("\xFF ", &c, &rest));
1820     EXPECT_EQ(0, c);
1821 
1822     c = 0;
1823     EXPECT_FALSE(Util::SplitFirstChar32("\xFE ", &c, &rest));
1824     EXPECT_EQ(0, c);
1825   }
1826 
1827   // Invalid sequence for U+002F (redundant encoding)
1828   {
1829     c = 0;
1830     EXPECT_FALSE(Util::SplitFirstChar32("\xC0\xAF", &c, &rest));
1831     EXPECT_EQ(0, c);
1832 
1833     c = 0;
1834     EXPECT_FALSE(Util::SplitFirstChar32("\xE0\x80\xAF", &c, &rest));
1835     EXPECT_EQ(0, c);
1836 
1837     c = 0;
1838     EXPECT_FALSE(Util::SplitFirstChar32("\xF0\x80\x80\xAF", &c, &rest));
1839     EXPECT_EQ(0, c);
1840   }
1841 }
1842 
TEST(UtilTest,SplitLastChar32)1843 TEST(UtilTest, SplitLastChar32) {
1844   StringPiece rest;
1845   char32 c = 0;
1846 
1847   rest = StringPiece();
1848   c = 0;
1849   EXPECT_FALSE(Util::SplitLastChar32("", &rest, &c));
1850   EXPECT_EQ(0, c);
1851   EXPECT_TRUE(rest.empty());
1852 
1853   // Allow nullptr to ignore the matched value.
1854   c = 0;
1855   EXPECT_TRUE(Util::SplitLastChar32("01", nullptr, &c));
1856   EXPECT_EQ('1', c);
1857 
1858   // Allow nullptr to ignore the matched value.
1859   rest = StringPiece();
1860   EXPECT_TRUE(Util::SplitLastChar32("01", &rest, nullptr));
1861   EXPECT_EQ("0", rest);
1862 
1863   rest = StringPiece();
1864   c = 0;
1865   EXPECT_TRUE(Util::SplitLastChar32(" \x01", &rest, &c));
1866   EXPECT_EQ(1, c);
1867   EXPECT_EQ(" ", rest);
1868 
1869   rest = StringPiece();
1870   c = 0;
1871   EXPECT_TRUE(Util::SplitLastChar32(" \x7F", &rest, &c));
1872   EXPECT_EQ(0x7F, c);
1873   EXPECT_EQ(" ", rest);
1874 
1875   rest = StringPiece();
1876   c = 0;
1877   EXPECT_TRUE(Util::SplitLastChar32(" \xC2\x80", &rest, &c));
1878   EXPECT_EQ(0x80, c);
1879   EXPECT_EQ(" ", rest);
1880 
1881   rest = StringPiece();
1882   c = 0;
1883   EXPECT_TRUE(Util::SplitLastChar32(" \xDF\xBF", &rest, &c));
1884   EXPECT_EQ(0x7FF, c);
1885   EXPECT_EQ(" ", rest);
1886 
1887   rest = StringPiece();
1888   c = 0;
1889   EXPECT_TRUE(Util::SplitLastChar32(" \xE0\xA0\x80", &rest, &c));
1890   EXPECT_EQ(0x800, c);
1891   EXPECT_EQ(" ", rest);
1892 
1893   rest = StringPiece();
1894   c = 0;
1895   EXPECT_TRUE(Util::SplitLastChar32(" \xEF\xBF\xBF", &rest, &c));
1896   EXPECT_EQ(0xFFFF, c);
1897   EXPECT_EQ(" ", rest);
1898 
1899   rest = StringPiece();
1900   c = 0;
1901   EXPECT_TRUE(Util::SplitLastChar32(" \xF0\x90\x80\x80", &rest, &c));
1902   EXPECT_EQ(0x10000, c);
1903   EXPECT_EQ(" ", rest);
1904 
1905   rest = StringPiece();
1906   c = 0;
1907   EXPECT_TRUE(Util::SplitLastChar32(" \xF7\xBF\xBF\xBF", &rest, &c));
1908   EXPECT_EQ(0x1FFFFF, c);
1909   EXPECT_EQ(" ", rest);
1910 
1911   rest = StringPiece();
1912   c = 0;
1913   EXPECT_TRUE(Util::SplitLastChar32(" \xF8\x88\x80\x80\x80", &rest, &c));
1914   EXPECT_EQ(0x200000, c);
1915   EXPECT_EQ(" ", rest);
1916 
1917   rest = StringPiece();
1918   c = 0;
1919   EXPECT_TRUE(Util::SplitLastChar32(" \xFB\xBF\xBF\xBF\xBF", &rest, &c));
1920   EXPECT_EQ(0x3FFFFFF, c);
1921   EXPECT_EQ(" ", rest);
1922 
1923   rest = StringPiece();
1924   c = 0;
1925   EXPECT_TRUE(Util::SplitLastChar32(" \xFC\x84\x80\x80\x80\x80", &rest, &c));
1926   EXPECT_EQ(0x4000000, c);
1927   EXPECT_EQ(" ", rest);
1928 
1929   rest = StringPiece();
1930   c = 0;
1931   EXPECT_TRUE(Util::SplitLastChar32(" \xFD\xBF\xBF\xBF\xBF\xBF", &rest, &c));
1932   EXPECT_EQ(0x7FFFFFFF, c);
1933   EXPECT_EQ(" ", rest);
1934 
1935   // If there is any invalid sequence, the entire text should be treated as
1936   // am empty string.
1937   {
1938     c = 0;
1939     EXPECT_FALSE(Util::SplitLastChar32(" \xC2", &rest, &c));
1940     EXPECT_EQ(0, c);
1941 
1942     c = 0;
1943     EXPECT_FALSE(Util::SplitLastChar32(" \xC2\xC2", &rest, &c));
1944     EXPECT_EQ(0, c);
1945 
1946     c = 0;
1947     EXPECT_FALSE(Util::SplitLastChar32(" \xE0", &rest, &c));
1948     EXPECT_EQ(0, c);
1949 
1950     c = 0;
1951     EXPECT_FALSE(Util::SplitLastChar32(" \xE0\xE0\xE0", &rest, &c));
1952     EXPECT_EQ(0, c);
1953 
1954     c = 0;
1955     EXPECT_FALSE(Util::SplitLastChar32(" \xF0", &rest, &c));
1956     EXPECT_EQ(0, c);
1957 
1958     c = 0;
1959     EXPECT_FALSE(Util::SplitLastChar32(" \xF0\xF0\xF0\xF0", &rest, &c));
1960     EXPECT_EQ(0, c);
1961   }
1962 
1963   // BOM should be treated as invalid byte.
1964   {
1965     c = 0;
1966     EXPECT_FALSE(Util::SplitLastChar32(" \xFF", &rest, &c));
1967     EXPECT_EQ(0, c);
1968 
1969     c = 0;
1970     EXPECT_FALSE(Util::SplitLastChar32(" \xFE", &rest, &c));
1971     EXPECT_EQ(0, c);
1972   }
1973 
1974   // Invalid sequence for U+002F (redundant encoding)
1975   {
1976     c = 0;
1977     EXPECT_FALSE(Util::SplitLastChar32("\xC0\xAF", &rest, &c));
1978     EXPECT_EQ(0, c);
1979 
1980     c = 0;
1981     EXPECT_FALSE(Util::SplitLastChar32("\xE0\x80\xAF", &rest, &c));
1982     EXPECT_EQ(0, c);
1983 
1984     c = 0;
1985     EXPECT_FALSE(Util::SplitLastChar32("\xF0\x80\x80\xAF", &rest, &c));
1986     EXPECT_EQ(0, c);
1987   }
1988 }
1989 
// Checks that Util::SerializeUint64() and Util::DeserializeUint64() are
// inverses on a set of known values, and that DeserializeUint64() rejects
// inputs that are not exactly eight bytes long.
TEST(UtilTest, SerializeAndDeserializeUint64) {
  // Each entry pairs a value with its expected 8-byte encoding, most
  // significant byte first.
  const struct {
    const char* bytes;
    uint64 value;
  } kRoundTripCases[] = {
    {"\x00\x00\x00\x00\x00\x00\x00\x00", 0},
    {"\x00\x00\x00\x00\x00\x00\x00\xFF", kuint8max},
    {"\x00\x00\x00\x00\x00\x00\xFF\xFF", kuint16max},
    {"\x00\x00\x00\x00\xFF\xFF\xFF\xFF", kuint32max},
    {"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", kuint64max},
    {"\x01\x23\x45\x67\x89\xAB\xCD\xEF", 0x0123456789ABCDEF},
    {"\xFE\xDC\xBA\x98\x76\x54\x32\x10", 0xFEDCBA9876543210},
  };

  for (const auto& test_case : kRoundTripCases) {
    // The explicit length is required because the encodings contain NUL
    // bytes.
    const string encoded(test_case.bytes, 8);
    EXPECT_EQ(encoded, Util::SerializeUint64(test_case.value));

    uint64 decoded;
    EXPECT_TRUE(Util::DeserializeUint64(encoded, &decoded));
    EXPECT_EQ(test_case.value, decoded);
  }

  // Invalid patterns for DeserializeUint64: empty, too short, and too long.
  const char* kRejectedInputs[] = {
    "",
    "abc",
    "helloworld",
  };
  for (const char* rejected : kRejectedInputs) {
    uint64 unused;
    EXPECT_FALSE(Util::DeserializeUint64(rejected, &unused));
  }
}
2024 
2025 }  // namespace mozc
2026