// Unit tests for Scintilla's UniConversion functions (UTF-8 classification and conversion to UTF-16/UTF-32)
2
3 #include <cstring>
4
5 #include <string>
6 #include <string_view>
7 #include <vector>
8 #include <algorithm>
9 #include <memory>
10
11 #include "Platform.h"
12
13 #include "UniConversion.h"
14
15 #include "catch.hpp"
16
17 using namespace Scintilla;
18
19 // Test UniConversion.
20 // Use examples from Wikipedia:
21 // https://en.wikipedia.org/wiki/UTF-8
22
23 TEST_CASE("UTF16Length") {
24
25 SECTION("UTF16Length ASCII") {
26 // Latin Small Letter A
27 const char *s = "a";
28 size_t len = UTF16Length(s);
29 REQUIRE(len == 1U);
30 }
31
32 SECTION("UTF16Length Example1") {
33 // Dollar Sign
34 const char *s = "\x24";
35 size_t len = UTF16Length(s);
36 REQUIRE(len == 1U);
37 }
38
39 SECTION("UTF16Length Example2") {
40 // Cent Sign
41 const char *s = "\xC2\xA2";
42 size_t len = UTF16Length(s);
43 REQUIRE(len == 1U);
44 }
45
46 SECTION("UTF16Length Example3") {
47 // Euro Sign
48 const char *s = "\xE2\x82\xAC";
49 size_t len = UTF16Length(s);
50 REQUIRE(len == 1U);
51 }
52
53 SECTION("UTF16Length Example4") {
54 // Gothic Letter Hwair
55 const char *s = "\xF0\x90\x8D\x88";
56 size_t len = UTF16Length(s);
57 REQUIRE(len == 2U);
58 }
59
60 SECTION("UTF16Length Invalid Trail byte in lead position") {
61 const char *s = "a\xB5yz";
62 size_t len = UTF16Length(s);
63 REQUIRE(len == 4U);
64 }
65
66 SECTION("UTF16Length Invalid Lead byte at end") {
67 const char *s = "a\xC2";
68 size_t len = UTF16Length(s);
69 REQUIRE(len == 2U);
70 }
71
72 SECTION("UTF16Length Invalid Lead byte implies 3 trails but only 2") {
73 const char *s = "a\xF1yz";
74 size_t len = UTF16Length(s);
75 REQUIRE(len == 2U);
76 }
77 }
78
79 TEST_CASE("UniConversion") {
80
81 // UnicodeFromUTF8
82
83 SECTION("UnicodeFromUTF8 ASCII") {
84 const unsigned char s[]={'a', 0};
85 REQUIRE(UnicodeFromUTF8(s) == 'a');
86 }
87
88 SECTION("UnicodeFromUTF8 Example1") {
89 const unsigned char s[]={0x24, 0};
90 REQUIRE(UnicodeFromUTF8(s) == 0x24);
91 }
92
93 SECTION("UnicodeFromUTF8 Example2") {
94 const unsigned char s[]={0xC2, 0xA2, 0};
95 REQUIRE(UnicodeFromUTF8(s) == 0xA2);
96 }
97
98 SECTION("UnicodeFromUTF8 Example3") {
99 const unsigned char s[]={0xE2, 0x82, 0xAC, 0};
100 REQUIRE(UnicodeFromUTF8(s) == 0x20AC);
101 }
102
103 SECTION("UnicodeFromUTF8 Example4") {
104 const unsigned char s[]={0xF0, 0x90, 0x8D, 0x88, 0};
105 REQUIRE(UnicodeFromUTF8(s) == 0x10348);
106 }
107
108 // UTF16FromUTF8
109
110 SECTION("UTF16FromUTF8 ASCII") {
111 const char s[] = {'a', 0};
112 wchar_t tbuf[1] = {0};
113 size_t tlen = UTF16FromUTF8(s, tbuf, 1);
114 REQUIRE(tlen == 1U);
115 REQUIRE(tbuf[0] == 'a');
116 }
117
118 SECTION("UTF16FromUTF8 Example1") {
119 const char s[] = {'\x24', 0};
120 wchar_t tbuf[1] = {0};
121 size_t tlen = UTF16FromUTF8(s, tbuf, 1);
122 REQUIRE(tlen == 1U);
123 REQUIRE(tbuf[0] == 0x24);
124 }
125
126 SECTION("UTF16FromUTF8 Example2") {
127 const char s[] = {'\xC2', '\xA2', 0};
128 wchar_t tbuf[1] = {0};
129 size_t tlen = UTF16FromUTF8(s, tbuf, 1);
130 REQUIRE(tlen == 1U);
131 REQUIRE(tbuf[0] == 0xA2);
132 }
133
134 SECTION("UTF16FromUTF8 Example3") {
135 const char s[] = {'\xE2', '\x82', '\xAC', 0};
136 wchar_t tbuf[1] = {0};
137 size_t tlen = UTF16FromUTF8(s, tbuf, 1);;
138 REQUIRE(tlen == 1U);
139 REQUIRE(tbuf[0] == 0x20AC);
140 }
141
142 SECTION("UTF16FromUTF8 Example4") {
143 const char s[] = {'\xF0', '\x90', '\x8D', '\x88', 0};
144 wchar_t tbuf[2] = {0, 0};
145 size_t tlen = UTF16FromUTF8(s, tbuf, 2);
146 REQUIRE(tlen == 2U);
147 REQUIRE(tbuf[0] == 0xD800);
148 REQUIRE(tbuf[1] == 0xDF48);
149 }
150
151 SECTION("UTF16FromUTF8 Invalid Trail byte in lead position") {
152 const char s[] = "a\xB5yz";
153 wchar_t tbuf[4] = {};
154 size_t tlen = UTF16FromUTF8(s, tbuf, 4);
155 REQUIRE(tlen == 4U);
156 REQUIRE(tbuf[0] == 'a');
157 REQUIRE(tbuf[1] == 0xB5);
158 REQUIRE(tbuf[2] == 'y');
159 REQUIRE(tbuf[3] == 'z');
160 }
161
162 SECTION("UTF16FromUTF8 Invalid Lead byte at end") {
163 const char s[] = "a\xC2";
164 wchar_t tbuf[2] = {};
165 size_t tlen = UTF16FromUTF8(s, tbuf, 2);
166 REQUIRE(tlen == 2U);
167 REQUIRE(tbuf[0] == 'a');
168 REQUIRE(tbuf[1] == 0xC2);
169 }
170
171 SECTION("UTF16FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
172 const char *s = "a\xF1yz";
173 wchar_t tbuf[4] = {};
174 size_t tlen = UTF16FromUTF8(s, tbuf, 4);
175 REQUIRE(tlen == 2U);
176 REQUIRE(tbuf[0] == 'a');
177 REQUIRE(tbuf[1] == 0xF1);
178 }
179
180 // UTF32FromUTF8
181
182 SECTION("UTF32FromUTF8 ASCII") {
183 const char s[] = {'a', 0};
184 unsigned int tbuf[1] = {0};
185 size_t tlen = UTF32FromUTF8(s, tbuf, 1);
186 REQUIRE(tlen == 1U);
187 REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
188 }
189
190 SECTION("UTF32FromUTF8 Example1") {
191 const char s[] = {'\x24', 0};
192 unsigned int tbuf[1] = {0};
193 size_t tlen = UTF32FromUTF8(s, tbuf, 1);
194 REQUIRE(tlen == 1U);
195 REQUIRE(tbuf[0] == 0x24);
196 }
197
198 SECTION("UTF32FromUTF8 Example2") {
199 const char s[] = {'\xC2', '\xA2', 0};
200 unsigned int tbuf[1] = {0};
201 size_t tlen = UTF32FromUTF8(s, tbuf, 1);
202 REQUIRE(tlen == 1U);
203 REQUIRE(tbuf[0] == 0xA2);
204 }
205
206 SECTION("UTF32FromUTF8 Example3") {
207 const char s[] = {'\xE2', '\x82', '\xAC', 0};
208 unsigned int tbuf[1] = {0};
209 size_t tlen = UTF32FromUTF8(s, tbuf, 1);
210 REQUIRE(tlen == 1U);
211 REQUIRE(tbuf[0] == 0x20AC);
212 }
213
214 SECTION("UTF32FromUTF8 Example4") {
215 const char s[] = {'\xF0', '\x90', '\x8D', '\x88', 0};
216 unsigned int tbuf[1] = {0};
217 size_t tlen = UTF32FromUTF8(s, tbuf, 1);
218 REQUIRE(tlen == 1U);
219 REQUIRE(tbuf[0] == 0x10348);
220 }
221
222 SECTION("UTF32FromUTF8 Invalid Trail byte in lead position") {
223 const char s[] = "a\xB5yz";
224 unsigned int tbuf[4] = {};
225 size_t tlen = UTF32FromUTF8(s, tbuf, 4);
226 REQUIRE(tlen == 4U);
227 REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
228 REQUIRE(tbuf[1] == 0xB5);
229 REQUIRE(tbuf[2] == static_cast<unsigned int>('y'));
230 REQUIRE(tbuf[3] == static_cast<unsigned int>('z'));
231 }
232
233 SECTION("UTF32FromUTF8 Invalid Lead byte at end") {
234 const char s[] = "a\xC2";
235 unsigned int tbuf[2] = {};
236 size_t tlen = UTF32FromUTF8(s, tbuf, 2);
237 REQUIRE(tlen == 2U);
238 REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
239 REQUIRE(tbuf[1] == 0xC2);
240 }
241
242 SECTION("UTF32FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
243 const char *s = "a\xF1yz";
244 unsigned int tbuf[4] = {};
245 size_t tlen = UTF32FromUTF8(s, tbuf, 4);
246 REQUIRE(tlen == 2U);
247 REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
248 REQUIRE(tbuf[1] == 0xF1);
249 }
250 }
251
252 namespace {
253
254 // Simple adapter to avoid casting
UTFClass(const char * s)255 int UTFClass(const char *s) {
256 return UTF8Classify(reinterpret_cast<const unsigned char *>(s), static_cast<int>(strlen(s)));
257 }
258
259 }
260
261 TEST_CASE("UTF8Classify") {
262
263 // These tests are supposed to hit every return statement in UTF8Classify in order
264 // with some hit multiple times.
265
266 // Single byte
267
268 SECTION("UTF8Classify Simple ASCII") {
269 REQUIRE(UTFClass("a") == 1);
270 }
271 SECTION("UTF8Classify Invalid Too large lead") {
272 REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid));
273 }
274 SECTION("UTF8Classify Overlong") {
275 REQUIRE(UTFClass("\xC0\x80") == (1 | UTF8MaskInvalid));
276 }
277 SECTION("UTF8Classify single trail byte") {
278 REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid));
279 }
280
281 // Invalid length tests
282
283 SECTION("UTF8Classify 2 byte lead, string less than 2 long") {
284 REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));
285 }
286 SECTION("UTF8Classify 3 byte lead, string less than 3 long") {
287 REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));
288 }
289 SECTION("UTF8Classify 4 byte lead, string less than 4 long") {
290 REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));
291 }
292
293 // Invalid first trail byte tests
294
295 SECTION("UTF8Classify 2 byte lead trail is invalid") {
296 REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));
297 }
298 SECTION("UTF8Classify 3 byte lead invalid trails") {
299 REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));
300 }
301 SECTION("UTF8Classify 4 byte bad trails") {
302 REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid));
303 }
304
305 // 2 byte lead
306
307 SECTION("UTF8Classify 2 byte valid character") {
308 REQUIRE(UTFClass("\xD0\x80") == 2);
309 }
310
311 // 3 byte lead
312
313 SECTION("UTF8Classify 3 byte lead, overlong") {
314 REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid));
315 }
316 SECTION("UTF8Classify 3 byte lead, surrogate") {
317 REQUIRE(UTFClass("\xED\xA0\x80") == (1 | UTF8MaskInvalid));
318 }
319 SECTION("UTF8Classify FFFE non-character") {
320 REQUIRE(UTFClass("\xEF\xBF\xBE") == (3 | UTF8MaskInvalid));
321 }
322 SECTION("UTF8Classify FFFF non-character") {
323 REQUIRE(UTFClass("\xEF\xBF\xBF") == (3 | UTF8MaskInvalid));
324 }
325 SECTION("UTF8Classify FDD0 non-character") {
326 REQUIRE(UTFClass("\xEF\xB7\x90") == (3 | UTF8MaskInvalid));
327 }
328 SECTION("UTF8Classify 3 byte valid character") {
329 REQUIRE(UTFClass("\xE2\x82\xAC") == 3);
330 }
331
332 // 4 byte lead
333
334 SECTION("UTF8Classify 1FFFF non-character") {
335 REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));
336 }
337 SECTION("UTF8Classify 1 Greater than max Unicode 110000") {
338 // Maximum Unicode value is 10FFFF so 110000 is out of range
339 REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));
340 }
341 SECTION("UTF8Classify 4 byte overlong") {
342 REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));
343 }
344 SECTION("UTF8Classify 4 byte valid character") {
345 REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);
346 }
347
348 // Invalid 2nd or 3rd continuation bytes
349 SECTION("UTF8Classify 3 byte lead invalid 2nd trail") {
350 REQUIRE(UTFClass("\xE2\x82q") == (1 | UTF8MaskInvalid));
351 }
352 SECTION("UTF8Classify 4 byte lead invalid 2nd trail") {
353 REQUIRE(UTFClass("\xF0\x9Fq\x9F") == (1 | UTF8MaskInvalid));
354 }
355 SECTION("UTF8Classify 4 byte lead invalid 3rd trail") {
356 REQUIRE(UTFClass("\xF0\x9F\x9Fq") == (1 | UTF8MaskInvalid));
357 }
358 }
359