1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "llvm/Support/ConvertUTF.h"
11 #include "gtest/gtest.h"
12 #include <string>
13 #include <utility>
14 #include <vector>
15 
16 using namespace llvm;
17 
TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
  // UTF-16 input: the byte order mark FF FE, then U+0CA0, '_', U+0CA0 stored
  // low byte first (the "look of disapproval").
  static const char Utf16LE[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
  // sizeof - 1 excludes the literal's implicit terminating NUL while keeping
  // the zero byte embedded in the middle of the data.
  const ArrayRef<char> Input(Utf16LE, sizeof(Utf16LE) - 1);

  std::string Utf8;
  EXPECT_TRUE(convertUTF16ToUTF8String(Input, Utf8));

  // U+0CA0 is the three UTF-8 bytes E0 B2 A0.
  EXPECT_EQ(std::string("\xe0\xb2\xa0_\xe0\xb2\xa0"), Utf8);
}
28 
TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
  // UTF-16 input: the byte order mark FE FF, then U+0CA0, '_', U+0CA0 stored
  // high byte first (the "look of disapproval").
  static const char Utf16BE[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
  // sizeof - 1 excludes the literal's implicit terminating NUL while keeping
  // the zero byte embedded in the middle of the data.
  const ArrayRef<char> Input(Utf16BE, sizeof(Utf16BE) - 1);

  std::string Utf8;
  EXPECT_TRUE(convertUTF16ToUTF8String(Input, Utf8));

  // U+0CA0 is the three UTF-8 bytes E0 B2 A0.
  EXPECT_EQ(std::string("\xe0\xb2\xa0_\xe0\xb2\xa0"), Utf8);
}
39 
TEST(ConvertUTFTest, OddLengthInput) {
  // Five bytes cannot be split into whole 16-bit code units; the conversion
  // is expected to report failure.
  const ArrayRef<char> FiveBytes = makeArrayRef("xxxxx", 5);
  std::string Utf8;
  EXPECT_FALSE(convertUTF16ToUTF8String(FiveBytes, Utf8));
}
45 
TEST(ConvertUTFTest, Empty) {
  // Converting an empty input is expected to succeed and to leave the output
  // string empty.
  std::string Utf8;
  EXPECT_TRUE(convertUTF16ToUTF8String(None, Utf8));
  EXPECT_TRUE(Utf8.empty());
}
52 
TEST(ConvertUTFTest, HasUTF16BOM) {
  // Inputs that begin with either byte order mark (FF FE or FE FF).
  EXPECT_TRUE(hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2)));
  EXPECT_TRUE(hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2)));
  // Don't care about odd lengths.
  EXPECT_TRUE(hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3)));
  // NOTE: "\x00a" lexes as a single hex escape (0x0a), so the six bytes
  // passed here are FE FF 0A 's' 'd' 'f'.
  EXPECT_TRUE(hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6)));

  // Inputs shorter than two bytes are expected to report no mark.
  EXPECT_FALSE(hasUTF16ByteOrderMark(None));
  EXPECT_FALSE(hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1)));
}
68 
69 struct ConvertUTFResultContainer {
70   ConversionResult ErrorCode;
71   std::vector<unsigned> UnicodeScalars;
72 
ConvertUTFResultContainerConvertUTFResultContainer73   ConvertUTFResultContainer(ConversionResult ErrorCode)
74       : ErrorCode(ErrorCode) {}
75 
76   ConvertUTFResultContainer
withScalarsConvertUTFResultContainer77   withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
78               unsigned US2 = 0x110000, unsigned US3 = 0x110000,
79               unsigned US4 = 0x110000, unsigned US5 = 0x110000,
80               unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
81     ConvertUTFResultContainer Result(*this);
82     if (US0 != 0x110000)
83       Result.UnicodeScalars.push_back(US0);
84     if (US1 != 0x110000)
85       Result.UnicodeScalars.push_back(US1);
86     if (US2 != 0x110000)
87       Result.UnicodeScalars.push_back(US2);
88     if (US3 != 0x110000)
89       Result.UnicodeScalars.push_back(US3);
90     if (US4 != 0x110000)
91       Result.UnicodeScalars.push_back(US4);
92     if (US5 != 0x110000)
93       Result.UnicodeScalars.push_back(US5);
94     if (US6 != 0x110000)
95       Result.UnicodeScalars.push_back(US6);
96     if (US7 != 0x110000)
97       Result.UnicodeScalars.push_back(US7);
98     return Result;
99   }
100 };
101 
102 std::pair<ConversionResult, std::vector<unsigned>>
ConvertUTF8ToUnicodeScalarsLenient(StringRef S)103 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
104   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
105 
106   const UTF8 *SourceNext = SourceStart;
107   std::vector<UTF32> Decoded(S.size(), 0);
108   UTF32 *TargetStart = Decoded.data();
109 
110   auto ErrorCode =
111       ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
112                          Decoded.data() + Decoded.size(), lenientConversion);
113 
114   Decoded.resize(TargetStart - Decoded.data());
115 
116   return std::make_pair(ErrorCode, Decoded);
117 }
118 
119 std::pair<ConversionResult, std::vector<unsigned>>
ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S)120 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
121   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
122 
123   const UTF8 *SourceNext = SourceStart;
124   std::vector<UTF32> Decoded(S.size(), 0);
125   UTF32 *TargetStart = Decoded.data();
126 
127   auto ErrorCode = ConvertUTF8toUTF32Partial(
128       &SourceNext, SourceStart + S.size(), &TargetStart,
129       Decoded.data() + Decoded.size(), lenientConversion);
130 
131   Decoded.resize(TargetStart - Decoded.data());
132 
133   return std::make_pair(ErrorCode, Decoded);
134 }
135 
136 ::testing::AssertionResult
CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,StringRef S,bool Partial=false)137 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
138                                  StringRef S, bool Partial = false) {
139   ConversionResult ErrorCode;
140   std::vector<unsigned> Decoded;
141   if (!Partial)
142     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
143   else
144     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
145 
146   if (Expected.ErrorCode != ErrorCode)
147     return ::testing::AssertionFailure() << "Expected error code "
148                                          << Expected.ErrorCode << ", actual "
149                                          << ErrorCode;
150 
151   if (Expected.UnicodeScalars != Decoded)
152     return ::testing::AssertionFailure()
153            << "Expected lenient decoded result:\n"
154            << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
155            << "Actual result:\n" << ::testing::PrintToString(Decoded);
156 
157   return ::testing::AssertionSuccess();
158 }
159 
TEST(ConvertUTFTest,UTF8ToUTF32Lenient)160 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
161 
162   //
163   // 1-byte sequences
164   //
165 
166   // U+0041 LATIN CAPITAL LETTER A
167   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
168       ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
169 
170   //
171   // 2-byte sequences
172   //
173 
174   // U+0283 LATIN SMALL LETTER ESH
175   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
176       ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
177       "\xca\x83"));
178 
179   // U+03BA GREEK SMALL LETTER KAPPA
180   // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
181   // U+03C3 GREEK SMALL LETTER SIGMA
182   // U+03BC GREEK SMALL LETTER MU
183   // U+03B5 GREEK SMALL LETTER EPSILON
184   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
185       ConvertUTFResultContainer(conversionOK)
186           .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
187       "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
188 
189   //
190   // 3-byte sequences
191   //
192 
193   // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
194   // U+6587 CJK UNIFIED IDEOGRAPH-6587
195   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
196       ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
197       "\xe4\xbe\x8b\xe6\x96\x87"));
198 
199   // U+D55C HANGUL SYLLABLE HAN
200   // U+AE00 HANGUL SYLLABLE GEUL
201   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
202       ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
203       "\xed\x95\x9c\xea\xb8\x80"));
204 
205   // U+1112 HANGUL CHOSEONG HIEUH
206   // U+1161 HANGUL JUNGSEONG A
207   // U+11AB HANGUL JONGSEONG NIEUN
208   // U+1100 HANGUL CHOSEONG KIYEOK
209   // U+1173 HANGUL JUNGSEONG EU
210   // U+11AF HANGUL JONGSEONG RIEUL
211   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
212       ConvertUTFResultContainer(conversionOK)
213           .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
214       "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
215       "\xe1\x86\xaf"));
216 
217   //
218   // 4-byte sequences
219   //
220 
221   // U+E0100 VARIATION SELECTOR-17
222   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
223       ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
224       "\xf3\xa0\x84\x80"));
225 
226   //
227   // First possible sequence of a certain length
228   //
229 
230   // U+0000 NULL
231   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
232       ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
233       StringRef("\x00", 1)));
234 
235   // U+0080 PADDING CHARACTER
236   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
237       ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
238       "\xc2\x80"));
239 
240   // U+0800 SAMARITAN LETTER ALAF
241   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
242       ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
243       "\xe0\xa0\x80"));
244 
245   // U+10000 LINEAR B SYLLABLE B008 A
246   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
247       ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
248       "\xf0\x90\x80\x80"));
249 
250   // U+200000 (invalid)
251   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
252       ConvertUTFResultContainer(sourceIllegal)
253           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
254       "\xf8\x88\x80\x80\x80"));
255 
256   // U+4000000 (invalid)
257   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
258       ConvertUTFResultContainer(sourceIllegal)
259           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
260       "\xfc\x84\x80\x80\x80\x80"));
261 
262   //
263   // Last possible sequence of a certain length
264   //
265 
266   // U+007F DELETE
267   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
268       ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
269 
270   // U+07FF (unassigned)
271   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
272       ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
273       "\xdf\xbf"));
274 
275   // U+FFFF (noncharacter)
276   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
277       ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
278       "\xef\xbf\xbf"));
279 
280   // U+1FFFFF (invalid)
281   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
282       ConvertUTFResultContainer(sourceIllegal)
283           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
284       "\xf7\xbf\xbf\xbf"));
285 
286   // U+3FFFFFF (invalid)
287   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
288       ConvertUTFResultContainer(sourceIllegal)
289           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
290       "\xfb\xbf\xbf\xbf\xbf"));
291 
292   // U+7FFFFFFF (invalid)
293   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
294       ConvertUTFResultContainer(sourceIllegal)
295           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
296       "\xfd\xbf\xbf\xbf\xbf\xbf"));
297 
298   //
299   // Other boundary conditions
300   //
301 
302   // U+D7FF (unassigned)
303   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
304       ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
305       "\xed\x9f\xbf"));
306 
307   // U+E000 (private use)
308   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
309       ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
310       "\xee\x80\x80"));
311 
312   // U+FFFD REPLACEMENT CHARACTER
313   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
314       ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
315       "\xef\xbf\xbd"));
316 
317   // U+10FFFF (noncharacter)
318   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
319       ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
320       "\xf4\x8f\xbf\xbf"));
321 
322   // U+110000 (invalid)
323   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
324       ConvertUTFResultContainer(sourceIllegal)
325           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
326       "\xf4\x90\x80\x80"));
327 
328   //
329   // Unexpected continuation bytes
330   //
331 
332   // A sequence of unexpected continuation bytes that don't follow a first
333   // byte, every byte is a maximal subpart.
334 
335   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
336       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
337   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
338       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
339   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
340       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
341       "\x80\x80"));
342   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
343       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
344       "\x80\xbf"));
345   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
346       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
347       "\xbf\x80"));
348   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
349       ConvertUTFResultContainer(sourceIllegal)
350           .withScalars(0xfffd, 0xfffd, 0xfffd),
351       "\x80\xbf\x80"));
352   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
353       ConvertUTFResultContainer(sourceIllegal)
354           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
355       "\x80\xbf\x80\xbf"));
356   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
357       ConvertUTFResultContainer(sourceIllegal)
358           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
359       "\x80\xbf\x82\xbf\xaa"));
360   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
361       ConvertUTFResultContainer(sourceIllegal)
362           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
363       "\xaa\xb0\xbb\xbf\xaa\xa0"));
364   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
365       ConvertUTFResultContainer(sourceIllegal)
366           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
367       "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
368 
369   // All continuation bytes (0x80--0xbf).
370   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
371       ConvertUTFResultContainer(sourceIllegal)
372           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
373                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
374           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
375                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
376           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
377                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
378           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
379                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
380           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
381                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
382           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
383                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
384           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
385                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
386           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
387                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
388       "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
389       "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
390       "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
391       "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
392 
393   //
394   // Lonely start bytes
395   //
396 
397   // Start bytes of 2-byte sequences (0xc0--0xdf).
398   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
399       ConvertUTFResultContainer(sourceIllegal)
400           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
401                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
402           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
403                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
404           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
405                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
406           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
407                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
408       "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
409       "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
410 
411   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
412       ConvertUTFResultContainer(sourceIllegal)
413           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
414                        0xfffd, 0x0020, 0xfffd, 0x0020)
415           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
416                        0xfffd, 0x0020, 0xfffd, 0x0020)
417           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
418                        0xfffd, 0x0020, 0xfffd, 0x0020)
419           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
420                        0xfffd, 0x0020, 0xfffd, 0x0020)
421           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
422                        0xfffd, 0x0020, 0xfffd, 0x0020)
423           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
424                        0xfffd, 0x0020, 0xfffd, 0x0020)
425           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
426                        0xfffd, 0x0020, 0xfffd, 0x0020)
427           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
428                        0xfffd, 0x0020, 0xfffd, 0x0020),
429       "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
430       "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
431       "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
432       "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
433 
434   // Start bytes of 3-byte sequences (0xe0--0xef).
435   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
436       ConvertUTFResultContainer(sourceIllegal)
437           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
438                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
439           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
440                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
441       "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
442 
443   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
444       ConvertUTFResultContainer(sourceIllegal)
445           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
446                        0xfffd, 0x0020, 0xfffd, 0x0020)
447           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
448                        0xfffd, 0x0020, 0xfffd, 0x0020)
449           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
450                        0xfffd, 0x0020, 0xfffd, 0x0020)
451           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
452                        0xfffd, 0x0020, 0xfffd, 0x0020),
453       "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
454       "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
455 
456   // Start bytes of 4-byte sequences (0xf0--0xf7).
457   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
458       ConvertUTFResultContainer(sourceIllegal)
459           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
460                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
461       "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
462 
463   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
464       ConvertUTFResultContainer(sourceIllegal)
465           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
466                        0xfffd, 0x0020, 0xfffd, 0x0020)
467           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
468                        0xfffd, 0x0020, 0xfffd, 0x0020),
469       "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
470 
471   // Start bytes of 5-byte sequences (0xf8--0xfb).
472   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
473       ConvertUTFResultContainer(sourceIllegal)
474           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
475       "\xf8\xf9\xfa\xfb"));
476 
477   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
478       ConvertUTFResultContainer(sourceIllegal)
479           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
480                        0xfffd, 0x0020, 0xfffd, 0x0020),
481       "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
482 
483   // Start bytes of 6-byte sequences (0xfc--0xfd).
484   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
485       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
486       "\xfc\xfd"));
487 
488   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
489       ConvertUTFResultContainer(sourceIllegal)
490           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
491       "\xfc\x20\xfd\x20"));
492 
493   //
494   // Other bytes (0xc0--0xc1, 0xfe--0xff).
495   //
496 
497   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
498       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
499   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
500       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
501   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
502       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
503   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
504       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
505 
506   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
507       ConvertUTFResultContainer(sourceIllegal)
508           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
509       "\xc0\xc1\xfe\xff"));
510 
511   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
512       ConvertUTFResultContainer(sourceIllegal)
513           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
514       "\xfe\xfe\xff\xff"));
515 
516   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
517       ConvertUTFResultContainer(sourceIllegal)
518           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
519       "\xfe\x80\x80\x80\x80\x80"));
520 
521   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
522       ConvertUTFResultContainer(sourceIllegal)
523           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
524       "\xff\x80\x80\x80\x80\x80"));
525 
526   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
527       ConvertUTFResultContainer(sourceIllegal)
528           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
529                        0xfffd, 0x0020, 0xfffd, 0x0020),
530       "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
531 
532   //
533   // Sequences with one continuation byte missing
534   //
535 
536   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
537       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
538   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
539       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
540   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
541       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
542       "\xe0\xa0"));
543   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
544       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
545       "\xe0\xbf"));
546   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
547       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
548       "\xe1\x80"));
549   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
550       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
551       "\xec\xbf"));
552   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
553       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
554       "\xed\x80"));
555   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
556       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
557       "\xed\x9f"));
558   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
559       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
560       "\xee\x80"));
561   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
562       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
563       "\xef\xbf"));
564   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
565       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
566       "\xf0\x90\x80"));
567   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
568       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
569       "\xf0\xbf\xbf"));
570   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
571       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
572       "\xf1\x80\x80"));
573   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
574       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
575       "\xf3\xbf\xbf"));
576   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
577       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
578       "\xf4\x80\x80"));
579   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
580       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
581       "\xf4\x8f\xbf"));
582 
583   // Overlong sequences with one trailing byte missing.
584   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
585       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
586       "\xc0"));
587   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
589       "\xc1"));
590   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
591       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
592       "\xe0\x80"));
593   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
595       "\xe0\x9f"));
596   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
597       ConvertUTFResultContainer(sourceIllegal)
598           .withScalars(0xfffd, 0xfffd, 0xfffd),
599       "\xf0\x80\x80"));
600   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
601       ConvertUTFResultContainer(sourceIllegal)
602           .withScalars(0xfffd, 0xfffd, 0xfffd),
603       "\xf0\x8f\x80"));
604   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
605       ConvertUTFResultContainer(sourceIllegal)
606           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
607       "\xf8\x80\x80\x80"));
608   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
609       ConvertUTFResultContainer(sourceIllegal)
610           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
611       "\xfc\x80\x80\x80\x80"));
612 
613   // Sequences that represent surrogates with one trailing byte missing.
614   // High surrogates
615   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
616       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
617       "\xed\xa0"));
618   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
620       "\xed\xac"));
621   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
622       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
623       "\xed\xaf"));
624   // Low surrogates
625   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
626       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
627       "\xed\xb0"));
628   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
629       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
630       "\xed\xb4"));
631   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
632       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
633       "\xed\xbf"));
634 
635   // Ill-formed 4-byte sequences.
636   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
637   // U+1100xx (invalid)
638   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
639       ConvertUTFResultContainer(sourceIllegal)
640           .withScalars(0xfffd, 0xfffd, 0xfffd),
641       "\xf4\x90\x80"));
642   // U+13FBxx (invalid)
643   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
644       ConvertUTFResultContainer(sourceIllegal)
645           .withScalars(0xfffd, 0xfffd, 0xfffd),
646       "\xf4\xbf\xbf"));
647   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
648       ConvertUTFResultContainer(sourceIllegal)
649           .withScalars(0xfffd, 0xfffd, 0xfffd),
650       "\xf5\x80\x80"));
651   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
652       ConvertUTFResultContainer(sourceIllegal)
653           .withScalars(0xfffd, 0xfffd, 0xfffd),
654       "\xf6\x80\x80"));
655   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
656       ConvertUTFResultContainer(sourceIllegal)
657           .withScalars(0xfffd, 0xfffd, 0xfffd),
658       "\xf7\x80\x80"));
659   // U+1FFBxx (invalid)
660   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
661       ConvertUTFResultContainer(sourceIllegal)
662           .withScalars(0xfffd, 0xfffd, 0xfffd),
663       "\xf7\xbf\xbf"));
664 
665   // Ill-formed 5-byte sequences.
666   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
667   // U+2000xx (invalid)
668   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
669       ConvertUTFResultContainer(sourceIllegal)
670           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
671       "\xf8\x88\x80\x80"));
672   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
673       ConvertUTFResultContainer(sourceIllegal)
674           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
675       "\xf8\xbf\xbf\xbf"));
676   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
677       ConvertUTFResultContainer(sourceIllegal)
678           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
679       "\xf9\x80\x80\x80"));
680   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
681       ConvertUTFResultContainer(sourceIllegal)
682           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
683       "\xfa\x80\x80\x80"));
684   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
685       ConvertUTFResultContainer(sourceIllegal)
686           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
687       "\xfb\x80\x80\x80"));
688   // U+3FFFFxx (invalid)
689   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
690       ConvertUTFResultContainer(sourceIllegal)
691           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
692       "\xfb\xbf\xbf\xbf"));
693 
694   // Ill-formed 6-byte sequences.
695   // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx
696   // U+40000xx (invalid)
697   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
698       ConvertUTFResultContainer(sourceIllegal)
699           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
700       "\xfc\x84\x80\x80\x80"));
701   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
702       ConvertUTFResultContainer(sourceIllegal)
703           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
704       "\xfc\xbf\xbf\xbf\xbf"));
705   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
706       ConvertUTFResultContainer(sourceIllegal)
707           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
708       "\xfd\x80\x80\x80\x80"));
709   // U+7FFFFFxx (invalid)
710   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
711       ConvertUTFResultContainer(sourceIllegal)
712           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
713       "\xfd\xbf\xbf\xbf\xbf"));
714 
715   //
716   // Sequences with two continuation bytes missing
717   //
718 
719   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
721       "\xf0\x90"));
722   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
723       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
724       "\xf0\xbf"));
725   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
726       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
727       "\xf1\x80"));
728   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
729       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
730       "\xf3\xbf"));
731   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
732       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
733       "\xf4\x80"));
734   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
735       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
736       "\xf4\x8f"));
737 
  // Overlong sequences with two trailing bytes missing.
739   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
740       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
741   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
742       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
743       "\xf0\x80"));
744   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
745       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
746       "\xf0\x8f"));
747   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
748       ConvertUTFResultContainer(sourceIllegal)
749           .withScalars(0xfffd, 0xfffd, 0xfffd),
750       "\xf8\x80\x80"));
751   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
752       ConvertUTFResultContainer(sourceIllegal)
753           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
754       "\xfc\x80\x80\x80"));
755 
756   // Sequences that represent surrogates with two trailing bytes missing.
757   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
758       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
759 
760   // Ill-formed 4-byte sequences.
761   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
762   // U+110yxx (invalid)
763   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
764       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
765       "\xf4\x90"));
766   // U+13Fyxx (invalid)
767   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
768       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
769       "\xf4\xbf"));
770   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
771       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
772       "\xf5\x80"));
773   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
774       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
775       "\xf6\x80"));
776   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
777       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
778       "\xf7\x80"));
779   // U+1FFyxx (invalid)
780   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
781       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
782       "\xf7\xbf"));
783 
784   // Ill-formed 5-byte sequences.
785   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
786   // U+200yxx (invalid)
787   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
788       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
789       "\xf8\x88\x80"));
790   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
791       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
792       "\xf8\xbf\xbf"));
793   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
794       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
795       "\xf9\x80\x80"));
796   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
797       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
798       "\xfa\x80\x80"));
799   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
800       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
801       "\xfb\x80\x80"));
802   // U+3FFFyxx (invalid)
803   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
804       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
805       "\xfb\xbf\xbf"));
806 
807   // Ill-formed 6-byte sequences.
808   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
809   // U+4000yxx (invalid)
810   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
812       "\xfc\x84\x80\x80"));
813   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
815       "\xfc\xbf\xbf\xbf"));
816   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
817       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
818       "\xfd\x80\x80\x80"));
819   // U+7FFFFyxx (invalid)
820   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
821       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
822       "\xfd\xbf\xbf\xbf"));
823 
824   //
825   // Sequences with three continuation bytes missing
826   //
827 
828   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
829       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
830   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
831       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
832   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
833       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
834   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
836   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
837       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
838 
839   // Broken overlong sequences.
840   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
841       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
842   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
844       "\xf8\x80"));
845   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
846       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
847       "\xfc\x80\x80"));
848 
849   // Ill-formed 4-byte sequences.
850   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
851   // U+14yyxx (invalid)
852   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
853       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
854   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
855       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
856   // U+1Cyyxx (invalid)
857   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
858       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
859 
860   // Ill-formed 5-byte sequences.
861   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
862   // U+20yyxx (invalid)
863   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
864       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
865       "\xf8\x88"));
866   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
867       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
868       "\xf8\xbf"));
869   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
870       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
871       "\xf9\x80"));
872   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
873       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
874       "\xfa\x80"));
875   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
876       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
877       "\xfb\x80"));
878   // U+3FCyyxx (invalid)
879   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
880       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
881       "\xfb\xbf"));
882 
883   // Ill-formed 6-byte sequences.
884   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
885   // U+400yyxx (invalid)
886   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
887       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
888       "\xfc\x84\x80"));
889   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
890       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
891       "\xfc\xbf\xbf"));
892   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
893       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
894       "\xfd\x80\x80"));
895   // U+7FFCyyxx (invalid)
896   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
897       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
898       "\xfd\xbf\xbf"));
899 
900   //
901   // Sequences with four continuation bytes missing
902   //
903 
904   // Ill-formed 5-byte sequences.
905   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
906   // U+uzyyxx (invalid)
907   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
908       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
909   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
910       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
911   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
912       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
913   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
914       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
915   // U+3zyyxx (invalid)
916   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
917       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
918 
919   // Broken overlong sequences.
920   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
921       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
922   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
923       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
924       "\xfc\x80"));
925 
926   // Ill-formed 6-byte sequences.
927   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
928   // U+uzzyyxx (invalid)
929   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
930       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
931       "\xfc\x84"));
932   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
933       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
934       "\xfc\xbf"));
935   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
936       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
937       "\xfd\x80"));
938   // U+7Fzzyyxx (invalid)
939   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
940       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
941       "\xfd\xbf"));
942 
943   //
944   // Sequences with five continuation bytes missing
945   //
946 
947   // Ill-formed 6-byte sequences.
948   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
949   // U+uzzyyxx (invalid)
950   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
951       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
952   // U+uuzzyyxx (invalid)
953   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
954       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
955 
956   //
957   // Consecutive sequences with trailing bytes missing
958   //
959 
960   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
961       ConvertUTFResultContainer(sourceIllegal)
962           .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
963           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
964           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
965           .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
966           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
967           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
968       "\xc0" "\xe0\x80" "\xf0\x80\x80"
969       "\xf8\x80\x80\x80"
970       "\xfc\x80\x80\x80\x80"
971       "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
972       "\xfb\xbf\xbf\xbf"
973       "\xfd\xbf\xbf\xbf\xbf"));
974 
975   //
976   // Overlong UTF-8 sequences
977   //
978 
979   // U+002F SOLIDUS
980   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
981       ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
982 
983   // Overlong sequences of the above.
984   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
985       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
986       "\xc0\xaf"));
987   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
988       ConvertUTFResultContainer(sourceIllegal)
989           .withScalars(0xfffd, 0xfffd, 0xfffd),
990       "\xe0\x80\xaf"));
991   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
992       ConvertUTFResultContainer(sourceIllegal)
993           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
994       "\xf0\x80\x80\xaf"));
995   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
996       ConvertUTFResultContainer(sourceIllegal)
997           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
998       "\xf8\x80\x80\x80\xaf"));
999   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1000       ConvertUTFResultContainer(sourceIllegal)
1001           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1002       "\xfc\x80\x80\x80\x80\xaf"));
1003 
1004   // U+0000 NULL
1005   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1006       ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1007       StringRef("\x00", 1)));
1008 
1009   // Overlong sequences of the above.
1010   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1011       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1012       "\xc0\x80"));
1013   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1014       ConvertUTFResultContainer(sourceIllegal)
1015           .withScalars(0xfffd, 0xfffd, 0xfffd),
1016       "\xe0\x80\x80"));
1017   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1018       ConvertUTFResultContainer(sourceIllegal)
1019           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1020       "\xf0\x80\x80\x80"));
1021   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1022       ConvertUTFResultContainer(sourceIllegal)
1023           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1024       "\xf8\x80\x80\x80\x80"));
1025   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1026       ConvertUTFResultContainer(sourceIllegal)
1027           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1028       "\xfc\x80\x80\x80\x80\x80"));
1029 
1030   // Other overlong sequences.
1031   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1033       "\xc0\xbf"));
1034   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1035       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1036       "\xc1\x80"));
1037   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1038       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1039       "\xc1\xbf"));
1040   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1041       ConvertUTFResultContainer(sourceIllegal)
1042           .withScalars(0xfffd, 0xfffd, 0xfffd),
1043       "\xe0\x9f\xbf"));
1044   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1045       ConvertUTFResultContainer(sourceIllegal)
1046           .withScalars(0xfffd, 0xfffd, 0xfffd),
1047       "\xed\xa0\x80"));
1048   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1049       ConvertUTFResultContainer(sourceIllegal)
1050           .withScalars(0xfffd, 0xfffd, 0xfffd),
1051       "\xed\xbf\xbf"));
1052   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1053       ConvertUTFResultContainer(sourceIllegal)
1054           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1055       "\xf0\x8f\x80\x80"));
1056   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1057       ConvertUTFResultContainer(sourceIllegal)
1058           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1059       "\xf0\x8f\xbf\xbf"));
1060   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1061       ConvertUTFResultContainer(sourceIllegal)
1062           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1063       "\xf8\x87\xbf\xbf\xbf"));
1064   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1065       ConvertUTFResultContainer(sourceIllegal)
1066           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1067       "\xfc\x83\xbf\xbf\xbf\xbf"));
1068 
1069   //
1070   // Isolated surrogates
1071   //
1072 
1073   // Unicode 6.3.0:
1074   //
1075   //    D71.  High-surrogate code point: A Unicode code point in the range
1076   //    U+D800 to U+DBFF.
1077   //
1078   //    D73.  Low-surrogate code point: A Unicode code point in the range
1079   //    U+DC00 to U+DFFF.
1080 
1081   // Note: U+E0100 is <DB40 DD00> in UTF16.
1082 
1083   // High surrogates
1084 
1085   // U+D800
1086   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1087       ConvertUTFResultContainer(sourceIllegal)
1088           .withScalars(0xfffd, 0xfffd, 0xfffd),
1089       "\xed\xa0\x80"));
1090 
1091   // U+DB40
1092   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1093       ConvertUTFResultContainer(sourceIllegal)
1094           .withScalars(0xfffd, 0xfffd, 0xfffd),
1095       "\xed\xac\xa0"));
1096 
1097   // U+DBFF
1098   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1099       ConvertUTFResultContainer(sourceIllegal)
1100           .withScalars(0xfffd, 0xfffd, 0xfffd),
1101       "\xed\xaf\xbf"));
1102 
1103   // Low surrogates
1104 
1105   // U+DC00
1106   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107       ConvertUTFResultContainer(sourceIllegal)
1108           .withScalars(0xfffd, 0xfffd, 0xfffd),
1109       "\xed\xb0\x80"));
1110 
1111   // U+DD00
1112   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113       ConvertUTFResultContainer(sourceIllegal)
1114           .withScalars(0xfffd, 0xfffd, 0xfffd),
1115       "\xed\xb4\x80"));
1116 
1117   // U+DFFF
1118   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1119       ConvertUTFResultContainer(sourceIllegal)
1120           .withScalars(0xfffd, 0xfffd, 0xfffd),
1121       "\xed\xbf\xbf"));
1122 
1123   // Surrogate pairs
1124 
1125   // U+D800 U+DC00
1126   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1127       ConvertUTFResultContainer(sourceIllegal)
1128           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1129       "\xed\xa0\x80\xed\xb0\x80"));
1130 
1131   // U+D800 U+DD00
1132   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133       ConvertUTFResultContainer(sourceIllegal)
1134           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1135       "\xed\xa0\x80\xed\xb4\x80"));
1136 
1137   // U+D800 U+DFFF
1138   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1139       ConvertUTFResultContainer(sourceIllegal)
1140           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1141       "\xed\xa0\x80\xed\xbf\xbf"));
1142 
1143   // U+DB40 U+DC00
1144   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1145       ConvertUTFResultContainer(sourceIllegal)
1146           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1147       "\xed\xac\xa0\xed\xb0\x80"));
1148 
1149   // U+DB40 U+DD00
1150   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1151       ConvertUTFResultContainer(sourceIllegal)
1152           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1153       "\xed\xac\xa0\xed\xb4\x80"));
1154 
1155   // U+DB40 U+DFFF
1156   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1157       ConvertUTFResultContainer(sourceIllegal)
1158           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1159       "\xed\xac\xa0\xed\xbf\xbf"));
1160 
1161   // U+DBFF U+DC00
1162   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1163       ConvertUTFResultContainer(sourceIllegal)
1164           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1165       "\xed\xaf\xbf\xed\xb0\x80"));
1166 
1167   // U+DBFF U+DD00
1168   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1169       ConvertUTFResultContainer(sourceIllegal)
1170           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1171       "\xed\xaf\xbf\xed\xb4\x80"));
1172 
1173   // U+DBFF U+DFFF
1174   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1175       ConvertUTFResultContainer(sourceIllegal)
1176           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1177       "\xed\xaf\xbf\xed\xbf\xbf"));
1178 
1179   //
1180   // Noncharacters
1181   //
1182 
1183   // Unicode 6.3.0:
1184   //
1185   //    D14.  Noncharacter: A code point that is permanently reserved for
1186   //    internal use and that should never be interchanged. Noncharacters
  //    consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1188   //    and the values U+FDD0..U+FDEF.
1189 
1190   // U+FFFE
1191   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1192       ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1193       "\xef\xbf\xbe"));
1194 
1195   // U+FFFF
1196   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1197       ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1198       "\xef\xbf\xbf"));
1199 
1200   // U+1FFFE
1201   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1202       ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1203       "\xf0\x9f\xbf\xbe"));
1204 
1205   // U+1FFFF
1206   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1207       ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1208       "\xf0\x9f\xbf\xbf"));
1209 
1210   // U+2FFFE
1211   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1212       ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1213       "\xf0\xaf\xbf\xbe"));
1214 
1215   // U+2FFFF
1216   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1217       ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1218       "\xf0\xaf\xbf\xbf"));
1219 
1220   // U+3FFFE
1221   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1222       ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1223       "\xf0\xbf\xbf\xbe"));
1224 
1225   // U+3FFFF
1226   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1227       ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1228       "\xf0\xbf\xbf\xbf"));
1229 
1230   // U+4FFFE
1231   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1232       ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1233       "\xf1\x8f\xbf\xbe"));
1234 
1235   // U+4FFFF
1236   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1237       ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1238       "\xf1\x8f\xbf\xbf"));
1239 
1240   // U+5FFFE
1241   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1242       ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1243       "\xf1\x9f\xbf\xbe"));
1244 
1245   // U+5FFFF
1246   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1247       ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1248       "\xf1\x9f\xbf\xbf"));
1249 
1250   // U+6FFFE
1251   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1252       ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1253       "\xf1\xaf\xbf\xbe"));
1254 
1255   // U+6FFFF
1256   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1257       ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1258       "\xf1\xaf\xbf\xbf"));
1259 
1260   // U+7FFFE
1261   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1262       ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1263       "\xf1\xbf\xbf\xbe"));
1264 
1265   // U+7FFFF
1266   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1267       ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1268       "\xf1\xbf\xbf\xbf"));
1269 
1270   // U+8FFFE
1271   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1272       ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1273       "\xf2\x8f\xbf\xbe"));
1274 
1275   // U+8FFFF
1276   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1277       ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1278       "\xf2\x8f\xbf\xbf"));
1279 
1280   // U+9FFFE
1281   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1282       ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1283       "\xf2\x9f\xbf\xbe"));
1284 
1285   // U+9FFFF
1286   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1287       ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1288       "\xf2\x9f\xbf\xbf"));
1289 
1290   // U+AFFFE
1291   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1292       ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1293       "\xf2\xaf\xbf\xbe"));
1294 
1295   // U+AFFFF
1296   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1297       ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1298       "\xf2\xaf\xbf\xbf"));
1299 
1300   // U+BFFFE
1301   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1302       ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1303       "\xf2\xbf\xbf\xbe"));
1304 
1305   // U+BFFFF
1306   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1307       ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1308       "\xf2\xbf\xbf\xbf"));
1309 
1310   // U+CFFFE
1311   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1312       ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1313       "\xf3\x8f\xbf\xbe"));
1314 
1315   // U+CFFFF
1316   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1317       ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1318       "\xf3\x8f\xbf\xbf"));
1319 
1320   // U+DFFFE
1321   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1322       ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1323       "\xf3\x9f\xbf\xbe"));
1324 
1325   // U+DFFFF
1326   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1327       ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1328       "\xf3\x9f\xbf\xbf"));
1329 
1330   // U+EFFFE
1331   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1332       ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1333       "\xf3\xaf\xbf\xbe"));
1334 
1335   // U+EFFFF
1336   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1337       ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1338       "\xf3\xaf\xbf\xbf"));
1339 
1340   // U+FFFFE
1341   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1342       ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1343       "\xf3\xbf\xbf\xbe"));
1344 
1345   // U+FFFFF
1346   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1347       ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1348       "\xf3\xbf\xbf\xbf"));
1349 
1350   // U+10FFFE
1351   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1352       ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1353       "\xf4\x8f\xbf\xbe"));
1354 
1355   // U+10FFFF
1356   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1357       ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1358       "\xf4\x8f\xbf\xbf"));
1359 
1360   // U+FDD0
1361   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1362       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1363       "\xef\xb7\x90"));
1364 
1365   // U+FDD1
1366   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1367       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1368       "\xef\xb7\x91"));
1369 
1370   // U+FDD2
1371   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1372       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1373       "\xef\xb7\x92"));
1374 
1375   // U+FDD3
1376   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1377       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1378       "\xef\xb7\x93"));
1379 
1380   // U+FDD4
1381   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1382       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1383       "\xef\xb7\x94"));
1384 
1385   // U+FDD5
1386   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1387       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1388       "\xef\xb7\x95"));
1389 
1390   // U+FDD6
1391   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1392       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1393       "\xef\xb7\x96"));
1394 
1395   // U+FDD7
1396   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1397       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1398       "\xef\xb7\x97"));
1399 
1400   // U+FDD8
1401   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1402       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1403       "\xef\xb7\x98"));
1404 
1405   // U+FDD9
1406   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1407       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1408       "\xef\xb7\x99"));
1409 
1410   // U+FDDA
1411   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1412       ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1413       "\xef\xb7\x9a"));
1414 
1415   // U+FDDB
1416   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1417       ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1418       "\xef\xb7\x9b"));
1419 
1420   // U+FDDC
1421   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1422       ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1423       "\xef\xb7\x9c"));
1424 
1425   // U+FDDD
1426   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1427       ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1428       "\xef\xb7\x9d"));
1429 
1430   // U+FDDE
1431   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1432       ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1433       "\xef\xb7\x9e"));
1434 
1435   // U+FDDF
1436   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1437       ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1438       "\xef\xb7\x9f"));
1439 
1440   // U+FDE0
1441   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1442       ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1443       "\xef\xb7\xa0"));
1444 
1445   // U+FDE1
1446   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1447       ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1448       "\xef\xb7\xa1"));
1449 
1450   // U+FDE2
1451   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1452       ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1453       "\xef\xb7\xa2"));
1454 
1455   // U+FDE3
1456   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1457       ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1458       "\xef\xb7\xa3"));
1459 
1460   // U+FDE4
1461   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1462       ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1463       "\xef\xb7\xa4"));
1464 
1465   // U+FDE5
1466   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1467       ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1468       "\xef\xb7\xa5"));
1469 
1470   // U+FDE6
1471   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1472       ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1473       "\xef\xb7\xa6"));
1474 
1475   // U+FDE7
1476   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1477       ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1478       "\xef\xb7\xa7"));
1479 
1480   // U+FDE8
1481   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1482       ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1483       "\xef\xb7\xa8"));
1484 
1485   // U+FDE9
1486   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1487       ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1488       "\xef\xb7\xa9"));
1489 
1490   // U+FDEA
1491   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1492       ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1493       "\xef\xb7\xaa"));
1494 
1495   // U+FDEB
1496   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1497       ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1498       "\xef\xb7\xab"));
1499 
1500   // U+FDEC
1501   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1502       ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1503       "\xef\xb7\xac"));
1504 
1505   // U+FDED
1506   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1507       ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1508       "\xef\xb7\xad"));
1509 
1510   // U+FDEE
1511   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1512       ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1513       "\xef\xb7\xae"));
1514 
1515   // U+FDEF
1516   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1517       ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1518       "\xef\xb7\xaf"));
1519 
1520   // U+FDF0
1521   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1522       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1523       "\xef\xb7\xb0"));
1524 
1525   // U+FDF1
1526   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1527       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1528       "\xef\xb7\xb1"));
1529 
1530   // U+FDF2
1531   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1532       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1533       "\xef\xb7\xb2"));
1534 
1535   // U+FDF3
1536   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1537       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1538       "\xef\xb7\xb3"));
1539 
1540   // U+FDF4
1541   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1542       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1543       "\xef\xb7\xb4"));
1544 
1545   // U+FDF5
1546   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1547       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1548       "\xef\xb7\xb5"));
1549 
1550   // U+FDF6
1551   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1552       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1553       "\xef\xb7\xb6"));
1554 
1555   // U+FDF7
1556   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1557       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1558       "\xef\xb7\xb7"));
1559 
1560   // U+FDF8
1561   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1562       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1563       "\xef\xb7\xb8"));
1564 
1565   // U+FDF9
1566   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1567       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1568       "\xef\xb7\xb9"));
1569 
1570   // U+FDFA
1571   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1572       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1573       "\xef\xb7\xba"));
1574 
1575   // U+FDFB
1576   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1577       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1578       "\xef\xb7\xbb"));
1579 
1580   // U+FDFC
1581   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1582       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1583       "\xef\xb7\xbc"));
1584 
1585   // U+FDFD
1586   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1587       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1588       "\xef\xb7\xbd"));
1589 
1590   // U+FDFE
1591   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1592       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1593       "\xef\xb7\xbe"));
1594 
1595   // U+FDFF
1596   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1597       ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1598       "\xef\xb7\xbf"));
1599 }
1600 
TEST(ConvertUTFTest,UTF8ToUTF32PartialLenient)1601 TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1602   // U+0041 LATIN CAPITAL LETTER A
1603   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1604       ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1605       "\x41", true));
1606 
1607   //
1608   // Sequences with one continuation byte missing
1609   //
1610 
1611   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1612       ConvertUTFResultContainer(sourceExhausted),
1613       "\xc2", true));
1614   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1615       ConvertUTFResultContainer(sourceExhausted),
1616       "\xdf", true));
1617   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1618       ConvertUTFResultContainer(sourceExhausted),
1619       "\xe0\xa0", true));
1620   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1621       ConvertUTFResultContainer(sourceExhausted),
1622       "\xe0\xbf", true));
1623   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1624       ConvertUTFResultContainer(sourceExhausted),
1625       "\xe1\x80", true));
1626   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1627       ConvertUTFResultContainer(sourceExhausted),
1628       "\xec\xbf", true));
1629   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1630       ConvertUTFResultContainer(sourceExhausted),
1631       "\xed\x80", true));
1632   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1633       ConvertUTFResultContainer(sourceExhausted),
1634       "\xed\x9f", true));
1635   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1636       ConvertUTFResultContainer(sourceExhausted),
1637       "\xee\x80", true));
1638   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1639       ConvertUTFResultContainer(sourceExhausted),
1640       "\xef\xbf", true));
1641   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1642       ConvertUTFResultContainer(sourceExhausted),
1643       "\xf0\x90\x80", true));
1644   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1645       ConvertUTFResultContainer(sourceExhausted),
1646       "\xf0\xbf\xbf", true));
1647   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1648       ConvertUTFResultContainer(sourceExhausted),
1649       "\xf1\x80\x80", true));
1650   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1651       ConvertUTFResultContainer(sourceExhausted),
1652       "\xf3\xbf\xbf", true));
1653   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1654       ConvertUTFResultContainer(sourceExhausted),
1655       "\xf4\x80\x80", true));
1656   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1657       ConvertUTFResultContainer(sourceExhausted),
1658       "\xf4\x8f\xbf", true));
1659 
1660   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1661       ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1662       "\x41\xc2", true));
1663 }
1664 
1665