xref: /openbsd/regress/lib/libcrypto/utf8/utf8test.c (revision 264ca280)
1 /*
2  * Copyright (c) 2014 Philip Guenther <guenther@openbsd.org>
3  *
4  * Permission to use, copy, modify, and distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*
18  * A mostly exhaustive test of UTF-8 decoder and encoder
19  */
20 
21 #include <stdio.h>
22 #include <string.h>
23 #include <err.h>
24 
25 #include <openssl/asn1.h>
26 #include "asn1_locl.h"		/* peek into the internals */
27 
/*
 * Sentinel loaded into the decoder's output variable before a call, so
 * the test can tell afterwards whether the callee stored anything.
 */
#define	UNCHANGED	0xfedcba98

/*
 * Check a test condition.  When it does not hold, errx() reports the
 * source line and the text of the failed expression and exits with
 * status 1; otherwise the macro does nothing.
 */
#define ASSERT(x)							\
	((x) ? (void)0 :						\
	    errx(1, "test failed at line %d: %s", __LINE__, #x))
36 
37 int
38 main(void)
39 {
40 	unsigned char testbuf[] = "012345";
41 	const unsigned char zerobuf[sizeof testbuf] = { 0 };
42 	unsigned long value;
43 	unsigned int i, j, k, l;
44 	int ret;
45 
46 	/*
47 	 * First, verify UTF8_getc()
48 	 */
49 	value = UNCHANGED;
50 	ret = UTF8_getc(testbuf, 0, &value);
51 	ASSERT(ret == 0);
52 	ASSERT(value == UNCHANGED);
53 
54 	/* check all valid single-byte chars */
55 	for (i = 0; i < 0x80; i++) {
56 		testbuf[0] = i;
57 		ret = UTF8_getc(testbuf, 1, &value);
58 		ASSERT(ret == 1);
59 		ASSERT(value == i);
60 
61 		ret = UTF8_getc(testbuf, 2, &value);
62 		ASSERT(ret == 1);
63 		ASSERT(value == i);
64 	}
65 
66 	/*
67 	 * Verify failure on all invalid initial bytes:
68 	 *	0x80 - 0xBF	following bytes only
69 	 *	0xC0 - 0xC1	used to be in non-shortest forms
70 	 *	0xF5 - 0xFD	used to be initial for 5 and 6 byte sequences
71 	 *	0xFE - 0xFF	have never been valid in utf-8
72 	 */
73 	for (i = 0x80; i < 0xC2; i++) {
74 		value = UNCHANGED;
75 		testbuf[0] = i;
76 		ret = UTF8_getc(testbuf, 1, &value);
77 		ASSERT(ret == -2);
78 		ASSERT(value == UNCHANGED);
79 	}
80 	for (i = 0xF5; i < 0x100; i++) {
81 		value = UNCHANGED;
82 		testbuf[0] = i;
83 		ret = UTF8_getc(testbuf, 1, &value);
84 		ASSERT(ret == -2);
85 		ASSERT(value == UNCHANGED);
86 	}
87 
88 	/*
89 	 * Verify handling of all two-byte sequences
90 	 */
91 	for (i = 0xC2; i < 0xE0; i++) {
92 		testbuf[0] = i;
93 
94 		for (j = 0; j < 0x100; j++) {
95 			testbuf[1] = j;
96 
97 			value = UNCHANGED;
98 			ret = UTF8_getc(testbuf, 1, &value);
99 			ASSERT(ret == -1);
100 			ASSERT(value == UNCHANGED);
101 
102 			ret = UTF8_getc(testbuf, 2, &value);
103 
104 			/* outside range of trailing bytes */
105 			if (j < 0x80 || j > 0xBF) {
106 				ASSERT(ret == -3);
107 				ASSERT(value == UNCHANGED);
108 				continue;
109 			}
110 
111 			/* valid */
112 			ASSERT(ret == 2);
113 			ASSERT((value & 0x3F) == (j & 0x3F));
114 			ASSERT(value >> 6 == (i & 0x1F));
115 		}
116 	}
117 
118 	/*
119 	 * Verify handling of all three-byte sequences
120 	 */
121 	for (i = 0xE0; i < 0xF0; i++) {
122 		testbuf[0] = i;
123 
124 		for (j = 0; j < 0x100; j++) {
125 			testbuf[1] = j;
126 
127 			for (k = 0; k < 0x100; k++) {
128 				testbuf[2] = k;
129 
130 				value = UNCHANGED;
131 				ret = UTF8_getc(testbuf, 2, &value);
132 				ASSERT(ret == -1);
133 				ASSERT(value == UNCHANGED);
134 
135 				ret = UTF8_getc(testbuf, 3, &value);
136 
137 				/* outside range of trailing bytes */
138 				if (j < 0x80 || j > 0xBF ||
139 				    k < 0x80 || k > 0xBF) {
140 					ASSERT(ret == -3);
141 					ASSERT(value == UNCHANGED);
142 					continue;
143 				}
144 
145 				/* non-shortest form */
146 				if (i == 0xE0 && j < 0xA0) {
147 					ASSERT(ret == -4);
148 					ASSERT(value == UNCHANGED);
149 					continue;
150 				}
151 
152 				/* surrogate pair code point */
153 				if (i == 0xED && j > 0x9F) {
154 					ASSERT(ret == -2);
155 					ASSERT(value == UNCHANGED);
156 					continue;
157 				}
158 
159 				ASSERT(ret == 3);
160 				ASSERT((value & 0x3F) == (k & 0x3F));
161 				ASSERT(((value >> 6) & 0x3F) == (j & 0x3F));
162 				ASSERT(value >> 12 == (i & 0x0F));
163 			}
164 		}
165 	}
166 
167 	/*
168 	 * Verify handling of all four-byte sequences
169 	 */
170 	for (i = 0xF0; i < 0xF5; i++) {
171 		testbuf[0] = i;
172 
173 		for (j = 0; j < 0x100; j++) {
174 			testbuf[1] = j;
175 
176 			for (k = 0; k < 0x100; k++) {
177 				testbuf[2] = k;
178 
179 				for (l = 0; l < 0x100; l++) {
180 					testbuf[3] = l;
181 
182 					value = UNCHANGED;
183 					ret = UTF8_getc(testbuf, 3, &value);
184 					ASSERT(ret == -1);
185 					ASSERT(value == UNCHANGED);
186 
187 					ret = UTF8_getc(testbuf, 4, &value);
188 
189 					/* outside range of trailing bytes */
190 					if (j < 0x80 || j > 0xBF ||
191 					    k < 0x80 || k > 0xBF ||
192 					    l < 0x80 || l > 0xBF) {
193 						ASSERT(ret == -3);
194 						ASSERT(value == UNCHANGED);
195 						continue;
196 					}
197 
198 					/* non-shortest form */
199 					if (i == 0xF0 && j < 0x90) {
200 						ASSERT(ret == -4);
201 						ASSERT(value == UNCHANGED);
202 						continue;
203 					}
204 
205 					/* beyond end of UCS range */
206 					if (i == 0xF4 && j > 0x8F) {
207 						ASSERT(ret == -2);
208 						ASSERT(value == UNCHANGED);
209 						continue;
210 					}
211 
212 					ASSERT(ret == 4);
213 					ASSERT((value & 0x3F) == (l & 0x3F));
214 					ASSERT(((value >> 6) & 0x3F) ==
215 							  (k & 0x3F));
216 					ASSERT(((value >> 12) & 0x3F) ==
217 							   (j & 0x3F));
218 					ASSERT(value >> 18 == (i & 0x07));
219 				}
220 			}
221 		}
222 	}
223 
224 
225 	/*
226 	 * Next, verify UTF8_putc()
227 	 */
228 	memset(testbuf, 0, sizeof testbuf);
229 
230 	/* single-byte sequences */
231 	for (i = 0; i < 0x80; i++) {
232 		ret = UTF8_putc(NULL, 0, i);
233 		ASSERT(ret == 1);
234 
235 		testbuf[0] = 0;
236 		ret = UTF8_putc(testbuf, 0, i);
237 		ASSERT(ret == -1);
238 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
239 
240 		ret = UTF8_putc(testbuf, 1, i);
241 		ASSERT(ret == 1);
242 		ASSERT(testbuf[0] == i);
243 		ASSERT(memcmp(testbuf+1, zerobuf, sizeof(testbuf)-1) == 0);
244 	}
245 
246 	/* two-byte sequences */
247 	for (i = 0x80; i < 0x800; i++) {
248 		ret = UTF8_putc(NULL, 0, i);
249 		ASSERT(ret == 2);
250 
251 		testbuf[0] = testbuf[1] = 0;
252 		ret = UTF8_putc(testbuf, 1, i);
253 		ASSERT(ret == -1);
254 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
255 
256 		ret = UTF8_putc(testbuf, 2, i);
257 		ASSERT(ret == 2);
258 		ASSERT(memcmp(testbuf+2, zerobuf, sizeof(testbuf)-2) == 0);
259 		ret = UTF8_getc(testbuf, 2, &value);
260 		ASSERT(ret == 2);
261 		ASSERT(value == i);
262 	}
263 
264 	/* three-byte sequences */
265 	for (i = 0x800; i < 0x10000; i++) {
266 		if (i >= 0xD800 && i < 0xE000) {
267 			/* surrogates aren't valid */
268 			ret = UTF8_putc(NULL, 0, i);
269 			ASSERT(ret == -2);
270 			continue;
271 		}
272 
273 		ret = UTF8_putc(NULL, 0, i);
274 		ASSERT(ret == 3);
275 
276 		testbuf[0] = testbuf[1] = testbuf[2] = 0;
277 		ret = UTF8_putc(testbuf, 2, i);
278 		ASSERT(ret == -1);
279 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
280 
281 		ret = UTF8_putc(testbuf, 3, i);
282 		ASSERT(ret == 3);
283 		ASSERT(memcmp(testbuf+3, zerobuf, sizeof(testbuf)-3) == 0);
284 		ret = UTF8_getc(testbuf, 3, &value);
285 		ASSERT(ret == 3);
286 		ASSERT(value == i);
287 	}
288 
289 	/* four-byte sequences */
290 	for (i = 0x10000; i < 0x110000; i++) {
291 		ret = UTF8_putc(NULL, 0, i);
292 		ASSERT(ret == 4);
293 
294 		testbuf[0] = testbuf[1] = testbuf[2] = testbuf[3] = 0;
295 		ret = UTF8_putc(testbuf, 3, i);
296 		ASSERT(ret == -1);
297 		ASSERT(memcmp(testbuf, zerobuf, sizeof testbuf) == 0);
298 
299 		ret = UTF8_putc(testbuf, 4, i);
300 		ASSERT(ret == 4);
301 		ASSERT(memcmp(testbuf+4, zerobuf, sizeof(testbuf)-4) == 0);
302 		ret = UTF8_getc(testbuf, 4, &value);
303 		ASSERT(ret == 4);
304 		ASSERT(value == i);
305 	}
306 
307 	/* spot check some larger values to confirm error return */
308 	for (i = 0x110000; i < 0x110100; i++) {
309 		ret = UTF8_putc(NULL, 0, i);
310 		ASSERT(ret == -2);
311 	}
312 	for (value = (unsigned long)-1; value > (unsigned long)-256; value--) {
313 		ret = UTF8_putc(NULL, 0, value);
314 		ASSERT(ret == -2);
315 	}
316 
317 	return 0;
318 }
319