1 // Unit Tests for Scintilla internal data structures
2 
3 #include <cstring>
4 
5 #include <string>
6 #include <string_view>
7 #include <vector>
8 #include <algorithm>
9 #include <memory>
10 
11 #include "Platform.h"
12 
13 #include "UniConversion.h"
14 
15 #include "catch.hpp"
16 
17 using namespace Scintilla;
18 
19 // Test UniConversion.
20 // Use examples from Wikipedia:
21 // https://en.wikipedia.org/wiki/UTF-8
22 
23 TEST_CASE("UTF16Length") {
24 
25 	SECTION("UTF16Length ASCII") {
26 		// Latin Small Letter A
27 		const char *s = "a";
28 		size_t len = UTF16Length(s);
29 		REQUIRE(len == 1U);
30 	}
31 
32 	SECTION("UTF16Length Example1") {
33 		// Dollar Sign
34 		const char *s = "\x24";
35 		size_t len = UTF16Length(s);
36 		REQUIRE(len == 1U);
37 	}
38 
39 	SECTION("UTF16Length Example2") {
40 		// Cent Sign
41 		const char *s = "\xC2\xA2";
42 		size_t len = UTF16Length(s);
43 		REQUIRE(len == 1U);
44 	}
45 
46 	SECTION("UTF16Length Example3") {
47 		// Euro Sign
48 		const char *s = "\xE2\x82\xAC";
49 		size_t len = UTF16Length(s);
50 		REQUIRE(len == 1U);
51 	}
52 
53 	SECTION("UTF16Length Example4") {
54 		// Gothic Letter Hwair
55 		const char *s = "\xF0\x90\x8D\x88";
56 		size_t len = UTF16Length(s);
57 		REQUIRE(len == 2U);
58 	}
59 
60 	SECTION("UTF16Length Invalid Trail byte in lead position") {
61 		const char *s = "a\xB5yz";
62 		size_t len = UTF16Length(s);
63 		REQUIRE(len == 4U);
64 	}
65 
66 	SECTION("UTF16Length Invalid Lead byte at end") {
67 		const char *s = "a\xC2";
68 		size_t len = UTF16Length(s);
69 		REQUIRE(len == 2U);
70 	}
71 
72 	SECTION("UTF16Length Invalid Lead byte implies 3 trails but only 2") {
73 		const char *s = "a\xF1yz";
74 		size_t len = UTF16Length(s);
75 		REQUIRE(len == 2U);
76 	}
77 }
78 
79 TEST_CASE("UniConversion") {
80 
81 	// UnicodeFromUTF8
82 
83 	SECTION("UnicodeFromUTF8 ASCII") {
84 		const unsigned char s[]={'a', 0};
85 		REQUIRE(UnicodeFromUTF8(s) == 'a');
86 	}
87 
88 	SECTION("UnicodeFromUTF8 Example1") {
89 		const unsigned char s[]={0x24, 0};
90 		REQUIRE(UnicodeFromUTF8(s) == 0x24);
91 	}
92 
93 	SECTION("UnicodeFromUTF8 Example2") {
94 		const unsigned char s[]={0xC2, 0xA2, 0};
95 		REQUIRE(UnicodeFromUTF8(s) == 0xA2);
96 	}
97 
98 	SECTION("UnicodeFromUTF8 Example3") {
99 		const unsigned char s[]={0xE2, 0x82, 0xAC, 0};
100 		REQUIRE(UnicodeFromUTF8(s) == 0x20AC);
101 	}
102 
103 	SECTION("UnicodeFromUTF8 Example4") {
104 		const unsigned char s[]={0xF0, 0x90, 0x8D, 0x88, 0};
105 		REQUIRE(UnicodeFromUTF8(s) == 0x10348);
106 	}
107 
108 	// UTF16FromUTF8
109 
110 	SECTION("UTF16FromUTF8 ASCII") {
111 		const char s[] = {'a', 0};
112 		wchar_t tbuf[1] = {0};
113 		size_t tlen = UTF16FromUTF8(s, tbuf, 1);
114 		REQUIRE(tlen == 1U);
115 		REQUIRE(tbuf[0] == 'a');
116 	}
117 
118 	SECTION("UTF16FromUTF8 Example1") {
119 		const char s[] = {'\x24', 0};
120 		wchar_t tbuf[1] = {0};
121 		size_t tlen = UTF16FromUTF8(s, tbuf, 1);
122 		REQUIRE(tlen == 1U);
123 		REQUIRE(tbuf[0] == 0x24);
124 	}
125 
126 	SECTION("UTF16FromUTF8 Example2") {
127 		const char s[] = {'\xC2', '\xA2', 0};
128 		wchar_t tbuf[1] = {0};
129 		size_t tlen = UTF16FromUTF8(s, tbuf, 1);
130 		REQUIRE(tlen == 1U);
131 		REQUIRE(tbuf[0] == 0xA2);
132 	}
133 
134 	SECTION("UTF16FromUTF8 Example3") {
135 		const char s[] = {'\xE2', '\x82', '\xAC', 0};
136 		wchar_t tbuf[1] = {0};
137 		size_t tlen = UTF16FromUTF8(s, tbuf, 1);;
138 		REQUIRE(tlen == 1U);
139 		REQUIRE(tbuf[0] == 0x20AC);
140 	}
141 
142 	SECTION("UTF16FromUTF8 Example4") {
143 		const char s[] = {'\xF0', '\x90', '\x8D', '\x88', 0};
144 		wchar_t tbuf[2] = {0, 0};
145 		size_t tlen = UTF16FromUTF8(s, tbuf, 2);
146 		REQUIRE(tlen == 2U);
147 		REQUIRE(tbuf[0] == 0xD800);
148 		REQUIRE(tbuf[1] == 0xDF48);
149 	}
150 
151 	SECTION("UTF16FromUTF8 Invalid Trail byte in lead position") {
152 		const char s[] = "a\xB5yz";
153 		wchar_t tbuf[4] = {};
154 		size_t tlen = UTF16FromUTF8(s, tbuf, 4);
155 		REQUIRE(tlen == 4U);
156 		REQUIRE(tbuf[0] == 'a');
157 		REQUIRE(tbuf[1] == 0xB5);
158 		REQUIRE(tbuf[2] == 'y');
159 		REQUIRE(tbuf[3] == 'z');
160 	}
161 
162 	SECTION("UTF16FromUTF8 Invalid Lead byte at end") {
163 		const char s[] = "a\xC2";
164 		wchar_t tbuf[2] = {};
165 		size_t tlen = UTF16FromUTF8(s, tbuf, 2);
166 		REQUIRE(tlen == 2U);
167 		REQUIRE(tbuf[0] == 'a');
168 		REQUIRE(tbuf[1] == 0xC2);
169 	}
170 
171 	SECTION("UTF16FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
172 		const char *s = "a\xF1yz";
173 		wchar_t tbuf[4] = {};
174 		size_t tlen = UTF16FromUTF8(s, tbuf, 4);
175 		REQUIRE(tlen == 2U);
176 		REQUIRE(tbuf[0] == 'a');
177 		REQUIRE(tbuf[1] == 0xF1);
178 	}
179 
180 	// UTF32FromUTF8
181 
182 	SECTION("UTF32FromUTF8 ASCII") {
183 		const char s[] = {'a', 0};
184 		unsigned int tbuf[1] = {0};
185 		size_t tlen = UTF32FromUTF8(s, tbuf, 1);
186 		REQUIRE(tlen == 1U);
187 		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
188 	}
189 
190 	SECTION("UTF32FromUTF8 Example1") {
191 		const char s[] = {'\x24', 0};
192 		unsigned int tbuf[1] = {0};
193 		size_t tlen = UTF32FromUTF8(s, tbuf, 1);
194 		REQUIRE(tlen == 1U);
195 		REQUIRE(tbuf[0] == 0x24);
196 	}
197 
198 	SECTION("UTF32FromUTF8 Example2") {
199 		const char s[] = {'\xC2', '\xA2', 0};
200 		unsigned int tbuf[1] = {0};
201 		size_t tlen = UTF32FromUTF8(s, tbuf, 1);
202 		REQUIRE(tlen == 1U);
203 		REQUIRE(tbuf[0] == 0xA2);
204 	}
205 
206 	SECTION("UTF32FromUTF8 Example3") {
207 		const char s[] = {'\xE2', '\x82', '\xAC', 0};
208 		unsigned int tbuf[1] = {0};
209 		size_t tlen = UTF32FromUTF8(s, tbuf, 1);
210 		REQUIRE(tlen == 1U);
211 		REQUIRE(tbuf[0] == 0x20AC);
212 	}
213 
214 	SECTION("UTF32FromUTF8 Example4") {
215 		const char s[] = {'\xF0', '\x90', '\x8D', '\x88', 0};
216 		unsigned int tbuf[1] = {0};
217 		size_t tlen = UTF32FromUTF8(s, tbuf, 1);
218 		REQUIRE(tlen == 1U);
219 		REQUIRE(tbuf[0] == 0x10348);
220 	}
221 
222 	SECTION("UTF32FromUTF8 Invalid Trail byte in lead position") {
223 		const char s[] = "a\xB5yz";
224 		unsigned int tbuf[4] = {};
225 		size_t tlen = UTF32FromUTF8(s, tbuf, 4);
226 		REQUIRE(tlen == 4U);
227 		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
228 		REQUIRE(tbuf[1] == 0xB5);
229 		REQUIRE(tbuf[2] == static_cast<unsigned int>('y'));
230 		REQUIRE(tbuf[3] == static_cast<unsigned int>('z'));
231 	}
232 
233 	SECTION("UTF32FromUTF8 Invalid Lead byte at end") {
234 		const char s[] = "a\xC2";
235 		unsigned int tbuf[2] = {};
236 		size_t tlen = UTF32FromUTF8(s, tbuf, 2);
237 		REQUIRE(tlen == 2U);
238 		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
239 		REQUIRE(tbuf[1] == 0xC2);
240 	}
241 
242 	SECTION("UTF32FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
243 		const char *s = "a\xF1yz";
244 		unsigned int tbuf[4] = {};
245 		size_t tlen = UTF32FromUTF8(s, tbuf, 4);
246 		REQUIRE(tlen == 2U);
247 		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
248 		REQUIRE(tbuf[1] == 0xF1);
249 	}
250 }
251 
252 namespace {
253 
254 // Simple adapter to avoid casting
UTFClass(const char * s)255 int UTFClass(const char *s) {
256 	return UTF8Classify(reinterpret_cast<const unsigned char *>(s), static_cast<int>(strlen(s)));
257 }
258 
259 }
260 
261 TEST_CASE("UTF8Classify") {
262 
263 	// These tests are supposed to hit every return statement in UTF8Classify in order
264 	// with some hit multiple times.
265 
266 	// Single byte
267 
268 	SECTION("UTF8Classify Simple ASCII") {
269 		REQUIRE(UTFClass("a") == 1);
270 	}
271 	SECTION("UTF8Classify Invalid Too large lead") {
272 		REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid));
273 	}
274 	SECTION("UTF8Classify Overlong") {
275 		REQUIRE(UTFClass("\xC0\x80") == (1 | UTF8MaskInvalid));
276 	}
277 	SECTION("UTF8Classify single trail byte") {
278 		REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid));
279 	}
280 
281 	// Invalid length tests
282 
283 	SECTION("UTF8Classify 2 byte lead, string less than 2 long") {
284 		REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));
285 	}
286 	SECTION("UTF8Classify 3 byte lead, string less than 3 long") {
287 		REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));
288 	}
289 	SECTION("UTF8Classify 4 byte lead, string less than 4 long") {
290 		REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));
291 	}
292 
293 	// Invalid first trail byte tests
294 
295 	SECTION("UTF8Classify 2 byte lead trail is invalid") {
296 		REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));
297 	}
298 	SECTION("UTF8Classify 3 byte lead invalid trails") {
299 		REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));
300 	}
301 	SECTION("UTF8Classify 4 byte bad trails") {
302 		REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid));
303 	}
304 
305 	// 2 byte lead
306 
307 	SECTION("UTF8Classify 2 byte valid character") {
308 		REQUIRE(UTFClass("\xD0\x80") == 2);
309 	}
310 
311 	// 3 byte lead
312 
313 	SECTION("UTF8Classify 3 byte lead, overlong") {
314 		REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid));
315 	}
316 	SECTION("UTF8Classify 3 byte lead, surrogate") {
317 		REQUIRE(UTFClass("\xED\xA0\x80") == (1 | UTF8MaskInvalid));
318 	}
319 	SECTION("UTF8Classify FFFE non-character") {
320 		REQUIRE(UTFClass("\xEF\xBF\xBE") == (3 | UTF8MaskInvalid));
321 	}
322 	SECTION("UTF8Classify FFFF non-character") {
323 		REQUIRE(UTFClass("\xEF\xBF\xBF") == (3 | UTF8MaskInvalid));
324 	}
325 	SECTION("UTF8Classify FDD0 non-character") {
326 		REQUIRE(UTFClass("\xEF\xB7\x90") == (3 | UTF8MaskInvalid));
327 	}
328 	SECTION("UTF8Classify 3 byte valid character") {
329 		REQUIRE(UTFClass("\xE2\x82\xAC") == 3);
330 	}
331 
332 	// 4 byte lead
333 
334 	SECTION("UTF8Classify 1FFFF non-character") {
335 		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));
336 	}
337 	SECTION("UTF8Classify 1 Greater than max Unicode 110000") {
338 		// Maximum Unicode value is 10FFFF so 110000 is out of range
339 		REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));
340 	}
341 	SECTION("UTF8Classify 4 byte overlong") {
342 		REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));
343 	}
344 	SECTION("UTF8Classify 4 byte valid character") {
345 		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);
346 	}
347 
348 	// Invalid 2nd or 3rd continuation bytes
349 	SECTION("UTF8Classify 3 byte lead invalid 2nd trail") {
350 		REQUIRE(UTFClass("\xE2\x82q") == (1 | UTF8MaskInvalid));
351 	}
352 	SECTION("UTF8Classify 4 byte lead invalid 2nd trail") {
353 		REQUIRE(UTFClass("\xF0\x9Fq\x9F") == (1 | UTF8MaskInvalid));
354 	}
355 	SECTION("UTF8Classify 4 byte lead invalid 3rd trail") {
356 		REQUIRE(UTFClass("\xF0\x9F\x9Fq") == (1 | UTF8MaskInvalid));
357 	}
358 }
359