1 /*
2  * utf8.c - routines to handle UTF-8.
3  */
4 
5 #ifndef ENUM_CHARSETS
6 
7 #include "charset.h"
8 #include "internal.h"
9 
10 /*
11  * The internal read_utf8 and write_utf8 functions in this module
12  * are not static, because they're also called internally from
13  * iso2022.c.
14  */
15 
16 /*
17  * UTF-8 has no associated data, so `charset' may be ignored.
18  */
19 
/*
 * Feed one input byte to the UTF-8 decoder.
 *
 * Decoded characters, and ERROR for each malformed piece of input,
 * are handed to `emit' together with `emitctx'.  The only state kept
 * between calls is state->s0, which is zero whenever no multibyte
 * character is in progress.
 *
 * While a character is in progress, s0 holds the payload bits
 * gathered so far with a single marker bit above them.  Every
 * continuation byte shifts the whole word left by six, and the lead
 * byte positions the marker so that it arrives at bit 31 exactly
 * when the last continuation byte has been absorbed.  Decoding
 * E9 8D 8B from an idle state goes:
 *
 *   E9 -> s0 = 0x00080009
 *   8D -> s0 = 0x0200024d
 *   8B -> conceptually 0x8000934b: marker at bit 31, so U+934B is
 *         emitted and s0 returns to zero.
 *
 * An overlong encoding is remembered by moving the marker one bit to
 * the right, so that it lands on bit 30 rather than bit 31 at the
 * end.  The stored state never needs more than 26 bits: at most 25
 * payload bits (the final six are never stored) plus the marker.
 */
void read_utf8(charset_spec const *charset, long int input_chr,
	       charset_state *state,
	       void (*emit)(void *ctx, long int output), void *emitctx)
{
    /*
     * Accumulator shapes that identify an overlong sequence once its
     * first continuation byte has been merged in: the marker is in
     * the expected place but too many payload bits above it are
     * zero.  XORing with `flip' clears the marker and sets the bit
     * just below it.  (Overlong two-byte forms are caught at their
     * lead byte instead.)
     */
    static const struct {
	unsigned long mask, pattern, flip;
    } overlong[] = {
	{0xffffffe0UL, 0x02000000UL, 0x03000000UL},   /* 3-byte form */
	{0xfffffff0UL, 0x00080000UL, 0x000c0000UL},   /* 4-byte form */
	{0xfffffff8UL, 0x00002000UL, 0x00003000UL},   /* 5-byte form */
	{0xfffffffcUL, 0x00000080UL, 0x000000c0UL}    /* 6-byte form */
    };

    /*
     * Lead-byte classes, in ascending order: the first entry whose
     * `limit' exceeds the byte supplies the initial marker and the
     * mask selecting the payload bits held in the lead byte.
     */
    static const struct {
	long limit, marker, payload;
    } leads[] = {
	{0xC2, 0x01000000L, 0x1F},     /* C0-C1: overlong 2-byte form */
	{0xE0, 0x02000000L, 0x1F},     /* C2-DF: 2-byte form */
	{0xF0, 0x00080000L, 0x0F},     /* E0-EF: 3-byte form */
	{0xF8, 0x00002000L, 0x07},     /* F0-F7: 4-byte form */
	{0xFC, 0x00000080L, 0x03},     /* F8-FB: 5-byte form */
	{0xFE, 0x00000002L, 0x01}      /* FC-FD: 6-byte form */
    };

    unsigned long acc;
    int k;

    UNUSEDARG(charset);

    /*
     * Bytes that can never belong to a multibyte sequence: plain
     * single-byte characters, and FE/FF which are always invalid.
     * Either one cuts short any character in progress (reported as
     * an error first); then the byte itself is dealt with.
     */
    if (input_chr < 0x80 || input_chr == 0xFE || input_chr == 0xFF) {
	if (state->s0 != 0) {
	    emit(emitctx, ERROR);
	    state->s0 = 0;
	}
	if (input_chr < 0x80)
	    emit(emitctx, input_chr);
	else
	    emit(emitctx, ERROR);
	return;
    }

    /*
     * Lead byte: report any abandoned character, then start a new
     * accumulator from the matching table entry.
     */
    if (input_chr >= 0xC0) {
	if (state->s0 != 0)
	    emit(emitctx, ERROR);

	for (k = 0; k < (int)(sizeof(leads) / sizeof(leads[0])); k++) {
	    if (input_chr < leads[k].limit) {
		state->s0 = leads[k].marker | (input_chr & leads[k].payload);
		break;
	    }
	}
	return;
    }

    /*
     * What remains is a continuation byte (80-BF).  With nothing in
     * progress it is simply unexpected.
     */
    if (state->s0 == 0) {
	emit(emitctx, ERROR);
	return;
    }

    /* Merge six more payload bits into the accumulator. */
    acc = state->s0;
    acc = (acc << 6) | (input_chr & 0x3F);

    /* Demote the marker if this turns out to be an overlong form. */
    for (k = 0; k < (int)(sizeof(overlong) / sizeof(overlong[0])); k++) {
	if ((acc & overlong[k].mask) == overlong[k].pattern) {
	    acc ^= overlong[k].flip;
	    break;
	}
    }

    /*
     * Neither bit 31 nor bit 30 set: more continuation bytes are
     * still to come, so just store the accumulator.
     */
    if (!(acc & 0xc0000000UL)) {
	state->s0 = acc;
	return;
    }

    /*
     * The character is complete.  Toggling bit 31 strips the marker
     * from a well-formed sequence, and sets bit 31 for an overlong
     * one (whose marker sat on bit 30), leaving it flagged.
     */
    acc ^= 0x80000000UL;
    state->s0 = 0;

    /*
     * Reject overlong forms, surrogates (D800-DFFF) and the
     * non-characters U+FFFE / U+FFFF; emit everything else.
     */
    if ((acc & 0x80000000UL) ||
	(acc >= 0xD800 && acc < 0xE000) ||
	acc == 0xFFFE || acc == 0xFFFF)
	emit(emitctx, ERROR);
    else
	emit(emitctx, acc);
}
194 
195 /*
196  * UTF-8 is a stateless multi-byte encoding (in the sense that just
197  * after any character has been completed, the state is always the
198  * same); hence when writing it, there is no need to use the
199  * charset_state.
200  */
201 
/*
 * Encode one character value as UTF-8, handing each output byte to
 * `emit' together with `emitctx'.
 *
 * `charset' and `state' are unused.  An input of -1 is accepted and
 * produces no output, since there is no state to clean up.
 *
 * Returns TRUE if the value was encoded (or was -1).  Returns FALSE,
 * emitting nothing, if the value is refused: surrogates
 * (D800-DFFF), U+FFFE, U+FFFF, and anything outside the range
 * 0..0x7FFFFFFF, which is the most the six-byte form can carry.
 */
int write_utf8(charset_spec const *charset, long int input_chr,
	       charset_state *state,
	       void (*emit)(void *ctx, long int output),
	       void *emitctx)
{
    UNUSEDARG(charset);
    UNUSEDARG(state);

    if (input_chr == -1)
	return TRUE;		       /* stateless; no cleanup required */

    /*
     * Values with no UTF-8 encoding at all.  Without this check a
     * negative value would be passed to `emit' unchanged as if it
     * were a one-byte character, and (where long is wider than 32
     * bits) a value above 0x7FFFFFFF would be silently truncated
     * into a six-byte sequence encoding a different character.
     */
    if (input_chr < 0 || input_chr > 0x7FFFFFFFL)
	return FALSE;

    /*
     * Refuse to output any illegal code points.
     */
    if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
	(input_chr >= 0xD800 && input_chr < 0xE000)) {
	return FALSE;
    } else if (input_chr < 0x80) {     /* one-byte character */
	emit(emitctx, input_chr);
	return TRUE;
    } else if (input_chr < 0x800) {    /* two-byte character */
	emit(emitctx, 0xC0 | (0x1F & (input_chr >>  6)));
	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
	return TRUE;
    } else if (input_chr < 0x10000) {  /* three-byte character */
	emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
	return TRUE;
    } else if (input_chr < 0x200000) { /* four-byte character */
	emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
	return TRUE;
    } else if (input_chr < 0x4000000) {/* five-byte character */
	emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
	return TRUE;
    } else {			       /* six-byte character */
	emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
	emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
	emit(emitctx, 0x80 | (0x3F & (input_chr      )));
	return TRUE;
    }
}
254 
255 #ifdef TESTMODE
256 
257 #include <stdio.h>
258 #include <stdarg.h>
259 
/*
 * Running count of mismatches found by utf8_read_test and
 * utf8_write_test; main() prints it and exits nonzero if it is not 0.
 */
int total_errs = 0;
261 
/*
 * Emit callback for the tests.  `ctx' points at a wchar_t write
 * cursor; `output' is stored at the cursor, which is then advanced
 * by one element.  No bounds checking is done here.
 */
void utf8_emit(void *ctx, long output)
{
    wchar_t **cursor = (wchar_t **)ctx;
    wchar_t *slot = *cursor;

    *slot = output;
    *cursor = slot + 1;
}
267 
utf8_read_test(int line,char * input,int inlen,...)268 void utf8_read_test(int line, char *input, int inlen, ...)
269 {
270     va_list ap;
271     wchar_t *p, str[512];
272     int i;
273     charset_state state;
274     unsigned long l;
275 
276     state.s0 = 0;
277     p = str;
278 
279     for (i = 0; i < inlen; i++)
280 	read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
281 
282     va_start(ap, inlen);
283     l = 0;
284     for (i = 0; i < p - str; i++) {
285 	l = va_arg(ap, long int);
286 	if (l == -1) {
287 	    printf("%d: correct string shorter than output\n", line);
288 	    total_errs++;
289 	    break;
290 	}
291 	if (l != str[i]) {
292 	    printf("%d: char %d came out as %08x, should be %08x\n",
293 		    line, i, str[i], l);
294 	    total_errs++;
295 	}
296     }
297     if (l != -1) {
298 	l = va_arg(ap, long int);
299 	if (l != -1) {
300 	    printf("%d: correct string longer than output\n", line);
301 	    total_errs++;
302 	}
303     }
304     va_end(ap);
305 }
306 
utf8_write_test(int line,const long * input,int inlen,...)307 void utf8_write_test(int line, const long *input, int inlen, ...)
308 {
309     va_list ap;
310     wchar_t *p, str[512];
311     int i;
312     charset_state state;
313     unsigned long l;
314 
315     state.s0 = 0;
316     p = str;
317 
318     for (i = 0; i < inlen; i++) {
319 	if (!write_utf8(NULL, input[i], &state, utf8_emit, &p))
320             utf8_emit(&p, ERROR);
321     }
322 
323     va_start(ap, inlen);
324     l = 0;
325     for (i = 0; i < p - str; i++) {
326 	l = va_arg(ap, long int);
327 	if (l == -1) {
328 	    printf("%d: correct string shorter than output\n", line);
329 	    total_errs++;
330 	    break;
331 	}
332 	if (l != str[i]) {
333 	    printf("%d: char %d came out as %08x, should be %08x\n",
334 		    line, i, str[i], l);
335 	    total_errs++;
336 	}
337     }
338     if (l != -1) {
339 	l = va_arg(ap, long int);
340 	if (l != -1) {
341 	    printf("%d: correct string longer than output\n", line);
342 	    total_errs++;
343 	}
344     }
345     va_end(ap);
346 }
347 
/*
 * Macro to concoct the first three parameters (line, input, inlen)
 * of utf8_read_test and utf8_write_test.
 */
349 #define TESTSTR(x) __LINE__, x, lenof(x)
350 
main(void)351 int main(void)
352 {
353     printf("read tests beginning\n");
354     utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
355 		   0x000003BA, /* GREEK SMALL LETTER KAPPA */
356 		   0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
357 		   0x000003C3, /* GREEK SMALL LETTER SIGMA */
358 		   0x000003BC, /* GREEK SMALL LETTER MU */
359 		   0x000003B5, /* GREEK SMALL LETTER EPSILON */
360 		   0, -1);
361     utf8_read_test(TESTSTR("\x00"),
362 		   0x00000000, /* <control> */
363 		   0, -1);
364     utf8_read_test(TESTSTR("\xC2\x80"),
365 		   0x00000080, /* <control> */
366 		   0, -1);
367     utf8_read_test(TESTSTR("\xE0\xA0\x80"),
368 		   0x00000800, /* <no name available> */
369 		   0, -1);
370     utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
371 		   0x00010000, /* <no name available> */
372 		   0, -1);
373     utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
374 		   0x00200000, /* <no name available> */
375 		   0, -1);
376     utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
377 		   0x04000000, /* <no name available> */
378 		   0, -1);
379     utf8_read_test(TESTSTR("\x7F"),
380 		   0x0000007F, /* <control> */
381 		   0, -1);
382     utf8_read_test(TESTSTR("\xDF\xBF"),
383 		   0x000007FF, /* <no name available> */
384 		   0, -1);
385     utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
386 		   0x0000FFFD, /* REPLACEMENT CHARACTER */
387 		   0, -1);
388     utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
389 		   ERROR,      /* <no name available> (invalid char) */
390 		   0, -1);
391     utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
392 		   0x001FFFFF, /* <no name available> */
393 		   0, -1);
394     utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
395 		   0x03FFFFFF, /* <no name available> */
396 		   0, -1);
397     utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
398 		   0x7FFFFFFF, /* <no name available> */
399 		   0, -1);
400     utf8_read_test(TESTSTR("\xED\x9F\xBF"),
401 		   0x0000D7FF, /* <no name available> */
402 		   0, -1);
403     utf8_read_test(TESTSTR("\xEE\x80\x80"),
404 		   0x0000E000, /* <Private Use, First> */
405 		   0, -1);
406     utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
407 		   0x0000FFFD, /* REPLACEMENT CHARACTER */
408 		   0, -1);
409     utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
410 		   0x0010FFFF, /* <no name available> */
411 		   0, -1);
412     utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
413 		   0x00110000, /* <no name available> */
414 		   0, -1);
415     utf8_read_test(TESTSTR("\x80"),
416 		   ERROR,      /* (unexpected continuation byte) */
417 		   0, -1);
418     utf8_read_test(TESTSTR("\xBF"),
419 		   ERROR,      /* (unexpected continuation byte) */
420 		   0, -1);
421     utf8_read_test(TESTSTR("\x80\xBF"),
422 		   ERROR,      /* (unexpected continuation byte) */
423 		   ERROR,      /* (unexpected continuation byte) */
424 		   0, -1);
425     utf8_read_test(TESTSTR("\x80\xBF\x80"),
426 		   ERROR,      /* (unexpected continuation byte) */
427 		   ERROR,      /* (unexpected continuation byte) */
428 		   ERROR,      /* (unexpected continuation byte) */
429 		   0, -1);
430     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
431 		   ERROR,      /* (unexpected continuation byte) */
432 		   ERROR,      /* (unexpected continuation byte) */
433 		   ERROR,      /* (unexpected continuation byte) */
434 		   ERROR,      /* (unexpected continuation byte) */
435 		   0, -1);
436     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
437 		   ERROR,      /* (unexpected continuation byte) */
438 		   ERROR,      /* (unexpected continuation byte) */
439 		   ERROR,      /* (unexpected continuation byte) */
440 		   ERROR,      /* (unexpected continuation byte) */
441 		   ERROR,      /* (unexpected continuation byte) */
442 		   0, -1);
443     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
444 		   ERROR,      /* (unexpected continuation byte) */
445 		   ERROR,      /* (unexpected continuation byte) */
446 		   ERROR,      /* (unexpected continuation byte) */
447 		   ERROR,      /* (unexpected continuation byte) */
448 		   ERROR,      /* (unexpected continuation byte) */
449 		   ERROR,      /* (unexpected continuation byte) */
450 		   0, -1);
451     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
452 		   ERROR,      /* (unexpected continuation byte) */
453 		   ERROR,      /* (unexpected continuation byte) */
454 		   ERROR,      /* (unexpected continuation byte) */
455 		   ERROR,      /* (unexpected continuation byte) */
456 		   ERROR,      /* (unexpected continuation byte) */
457 		   ERROR,      /* (unexpected continuation byte) */
458 		   ERROR,      /* (unexpected continuation byte) */
459 		   0, -1);
460     utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
461 		   ERROR,      /* (unexpected continuation byte) */
462 		   ERROR,      /* (unexpected continuation byte) */
463 		   ERROR,      /* (unexpected continuation byte) */
464 		   ERROR,      /* (unexpected continuation byte) */
465 		   ERROR,      /* (unexpected continuation byte) */
466 		   ERROR,      /* (unexpected continuation byte) */
467 		   ERROR,      /* (unexpected continuation byte) */
468 		   ERROR,      /* (unexpected continuation byte) */
469 		   ERROR,      /* (unexpected continuation byte) */
470 		   ERROR,      /* (unexpected continuation byte) */
471 		   ERROR,      /* (unexpected continuation byte) */
472 		   ERROR,      /* (unexpected continuation byte) */
473 		   ERROR,      /* (unexpected continuation byte) */
474 		   ERROR,      /* (unexpected continuation byte) */
475 		   ERROR,      /* (unexpected continuation byte) */
476 		   ERROR,      /* (unexpected continuation byte) */
477 		   ERROR,      /* (unexpected continuation byte) */
478 		   ERROR,      /* (unexpected continuation byte) */
479 		   ERROR,      /* (unexpected continuation byte) */
480 		   ERROR,      /* (unexpected continuation byte) */
481 		   ERROR,      /* (unexpected continuation byte) */
482 		   ERROR,      /* (unexpected continuation byte) */
483 		   ERROR,      /* (unexpected continuation byte) */
484 		   ERROR,      /* (unexpected continuation byte) */
485 		   ERROR,      /* (unexpected continuation byte) */
486 		   ERROR,      /* (unexpected continuation byte) */
487 		   ERROR,      /* (unexpected continuation byte) */
488 		   ERROR,      /* (unexpected continuation byte) */
489 		   ERROR,      /* (unexpected continuation byte) */
490 		   ERROR,      /* (unexpected continuation byte) */
491 		   ERROR,      /* (unexpected continuation byte) */
492 		   ERROR,      /* (unexpected continuation byte) */
493 		   ERROR,      /* (unexpected continuation byte) */
494 		   ERROR,      /* (unexpected continuation byte) */
495 		   ERROR,      /* (unexpected continuation byte) */
496 		   ERROR,      /* (unexpected continuation byte) */
497 		   ERROR,      /* (unexpected continuation byte) */
498 		   ERROR,      /* (unexpected continuation byte) */
499 		   ERROR,      /* (unexpected continuation byte) */
500 		   ERROR,      /* (unexpected continuation byte) */
501 		   ERROR,      /* (unexpected continuation byte) */
502 		   ERROR,      /* (unexpected continuation byte) */
503 		   ERROR,      /* (unexpected continuation byte) */
504 		   ERROR,      /* (unexpected continuation byte) */
505 		   ERROR,      /* (unexpected continuation byte) */
506 		   ERROR,      /* (unexpected continuation byte) */
507 		   ERROR,      /* (unexpected continuation byte) */
508 		   ERROR,      /* (unexpected continuation byte) */
509 		   ERROR,      /* (unexpected continuation byte) */
510 		   ERROR,      /* (unexpected continuation byte) */
511 		   ERROR,      /* (unexpected continuation byte) */
512 		   ERROR,      /* (unexpected continuation byte) */
513 		   ERROR,      /* (unexpected continuation byte) */
514 		   ERROR,      /* (unexpected continuation byte) */
515 		   ERROR,      /* (unexpected continuation byte) */
516 		   ERROR,      /* (unexpected continuation byte) */
517 		   ERROR,      /* (unexpected continuation byte) */
518 		   ERROR,      /* (unexpected continuation byte) */
519 		   ERROR,      /* (unexpected continuation byte) */
520 		   ERROR,      /* (unexpected continuation byte) */
521 		   ERROR,      /* (unexpected continuation byte) */
522 		   ERROR,      /* (unexpected continuation byte) */
523 		   ERROR,      /* (unexpected continuation byte) */
524 		   ERROR,      /* (unexpected continuation byte) */
525 		   0, -1);
526     utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
527 		   ERROR,      /* (incomplete sequence) */
528 		   0x00000020, /* SPACE */
529 		   ERROR,      /* (incomplete sequence) */
530 		   0x00000020, /* SPACE */
531 		   ERROR,      /* (incomplete sequence) */
532 		   0x00000020, /* SPACE */
533 		   ERROR,      /* (incomplete sequence) */
534 		   0x00000020, /* SPACE */
535 		   ERROR,      /* (incomplete sequence) */
536 		   0x00000020, /* SPACE */
537 		   ERROR,      /* (incomplete sequence) */
538 		   0x00000020, /* SPACE */
539 		   ERROR,      /* (incomplete sequence) */
540 		   0x00000020, /* SPACE */
541 		   ERROR,      /* (incomplete sequence) */
542 		   0x00000020, /* SPACE */
543 		   0, -1);
544     utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
545 		   ERROR,      /* (incomplete sequence) */
546 		   0x00000020, /* SPACE */
547 		   ERROR,      /* (incomplete sequence) */
548 		   0x00000020, /* SPACE */
549 		   ERROR,      /* (incomplete sequence) */
550 		   0x00000020, /* SPACE */
551 		   ERROR,      /* (incomplete sequence) */
552 		   0x00000020, /* SPACE */
553 		   ERROR,      /* (incomplete sequence) */
554 		   0x00000020, /* SPACE */
555 		   ERROR,      /* (incomplete sequence) */
556 		   0x00000020, /* SPACE */
557 		   ERROR,      /* (incomplete sequence) */
558 		   0x00000020, /* SPACE */
559 		   ERROR,      /* (incomplete sequence) */
560 		   0x00000020, /* SPACE */
561 		   ERROR,      /* (incomplete sequence) */
562 		   0x00000020, /* SPACE */
563 		   ERROR,      /* (incomplete sequence) */
564 		   0x00000020, /* SPACE */
565 		   ERROR,      /* (incomplete sequence) */
566 		   0x00000020, /* SPACE */
567 		   ERROR,      /* (incomplete sequence) */
568 		   0x00000020, /* SPACE */
569 		   ERROR,      /* (incomplete sequence) */
570 		   0x00000020, /* SPACE */
571 		   ERROR,      /* (incomplete sequence) */
572 		   0x00000020, /* SPACE */
573 		   ERROR,      /* (incomplete sequence) */
574 		   0x00000020, /* SPACE */
575 		   ERROR,      /* (incomplete sequence) */
576 		   0x00000020, /* SPACE */
577 		   0, -1);
578     utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
579 		   ERROR,      /* (incomplete sequence) */
580 		   0x00000020, /* SPACE */
581 		   ERROR,      /* (incomplete sequence) */
582 		   0x00000020, /* SPACE */
583 		   ERROR,      /* (incomplete sequence) */
584 		   0x00000020, /* SPACE */
585 		   ERROR,      /* (incomplete sequence) */
586 		   0x00000020, /* SPACE */
587 		   ERROR,      /* (incomplete sequence) */
588 		   0x00000020, /* SPACE */
589 		   ERROR,      /* (incomplete sequence) */
590 		   0x00000020, /* SPACE */
591 		   ERROR,      /* (incomplete sequence) */
592 		   0x00000020, /* SPACE */
593 		   ERROR,      /* (incomplete sequence) */
594 		   0x00000020, /* SPACE */
595 		   0, -1);
596     utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
597 		   ERROR,      /* (incomplete sequence) */
598 		   0x00000020, /* SPACE */
599 		   ERROR,      /* (incomplete sequence) */
600 		   0x00000020, /* SPACE */
601 		   ERROR,      /* (incomplete sequence) */
602 		   0x00000020, /* SPACE */
603 		   ERROR,      /* (incomplete sequence) */
604 		   0x00000020, /* SPACE */
605 		   0, -1);
606     utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
607 		   ERROR,      /* (incomplete sequence) */
608 		   0x00000020, /* SPACE */
609 		   ERROR,      /* (incomplete sequence) */
610 		   0x00000020, /* SPACE */
611 		   0, -1);
612     utf8_read_test(TESTSTR("\xC0"),
613 		   ERROR,      /* (incomplete sequence) */
614 		   0, -1);
615     utf8_read_test(TESTSTR("\xE0\x80"),
616 		   ERROR,      /* (incomplete sequence) */
617 		   0, -1);
618     utf8_read_test(TESTSTR("\xF0\x80\x80"),
619 		   ERROR,      /* (incomplete sequence) */
620 		   0, -1);
621     utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
622 		   ERROR,      /* (incomplete sequence) */
623 		   0, -1);
624     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
625 		   ERROR,      /* (incomplete sequence) */
626 		   0, -1);
627     utf8_read_test(TESTSTR("\xDF"),
628 		   ERROR,      /* (incomplete sequence) */
629 		   0, -1);
630     utf8_read_test(TESTSTR("\xEF\xBF"),
631 		   ERROR,      /* (incomplete sequence) */
632 		   0, -1);
633     utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
634 		   ERROR,      /* (incomplete sequence) */
635 		   0, -1);
636     utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
637 		   ERROR,      /* (incomplete sequence) */
638 		   0, -1);
639     utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
640 		   ERROR,      /* (incomplete sequence) */
641 		   0, -1);
642     utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
643 		   ERROR,      /* (incomplete sequence) */
644 		   ERROR,      /* (incomplete sequence) */
645 		   ERROR,      /* (incomplete sequence) */
646 		   ERROR,      /* (incomplete sequence) */
647 		   ERROR,      /* (incomplete sequence) */
648 		   ERROR,      /* (incomplete sequence) */
649 		   ERROR,      /* (incomplete sequence) */
650 		   ERROR,      /* (incomplete sequence) */
651 		   ERROR,      /* (incomplete sequence) */
652 		   ERROR,      /* (incomplete sequence) */
653 		   0, -1);
654     utf8_read_test(TESTSTR("\xFE"),
655 		   ERROR,      /* (invalid UTF-8 byte) */
656 		   0, -1);
657     utf8_read_test(TESTSTR("\xFF"),
658 		   ERROR,      /* (invalid UTF-8 byte) */
659 		   0, -1);
660     utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
661 		   ERROR,      /* (invalid UTF-8 byte) */
662 		   ERROR,      /* (invalid UTF-8 byte) */
663 		   ERROR,      /* (invalid UTF-8 byte) */
664 		   ERROR,      /* (invalid UTF-8 byte) */
665 		   0, -1);
666     utf8_read_test(TESTSTR("\xC0\xAF"),
667 		   ERROR,      /* SOLIDUS (overlong form of 2F) */
668 		   0, -1);
669     utf8_read_test(TESTSTR("\xE0\x80\xAF"),
670 		   ERROR,      /* SOLIDUS (overlong form of 2F) */
671 		   0, -1);
672     utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
673 		   ERROR,      /* SOLIDUS (overlong form of 2F) */
674 		   0, -1);
675     utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
676 		   ERROR,      /* SOLIDUS (overlong form of 2F) */
677 		   0, -1);
678     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
679 		   ERROR,      /* SOLIDUS (overlong form of 2F) */
680 		   0, -1);
681     utf8_read_test(TESTSTR("\xC1\xBF"),
682 		   ERROR,      /* <control> (overlong form of 7F) */
683 		   0, -1);
684     utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
685 		   ERROR,      /* <no name available> (overlong form of DF BF) */
686 		   0, -1);
687     utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
688 		   ERROR,      /* <no name available> (overlong form of EF BF BF) (invalid char) */
689 		   0, -1);
690     utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
691 		   ERROR,      /* <no name available> (overlong form of F7 BF BF BF) */
692 		   0, -1);
693     utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
694 		   ERROR,      /* <no name available> (overlong form of FB BF BF BF BF) */
695 		   0, -1);
696     utf8_read_test(TESTSTR("\xC0\x80"),
697 		   ERROR,      /* <control> (overlong form of 00) */
698 		   0, -1);
699     utf8_read_test(TESTSTR("\xE0\x80\x80"),
700 		   ERROR,      /* <control> (overlong form of 00) */
701 		   0, -1);
702     utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
703 		   ERROR,      /* <control> (overlong form of 00) */
704 		   0, -1);
705     utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
706 		   ERROR,      /* <control> (overlong form of 00) */
707 		   0, -1);
708     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
709 		   ERROR,      /* <control> (overlong form of 00) */
710 		   0, -1);
711     utf8_read_test(TESTSTR("\xED\xA0\x80"),
712 		   ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
713 		   0, -1);
714     utf8_read_test(TESTSTR("\xED\xAD\xBF"),
715 		   ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
716 		   0, -1);
717     utf8_read_test(TESTSTR("\xED\xAE\x80"),
718 		   ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
719 		   0, -1);
720     utf8_read_test(TESTSTR("\xED\xAF\xBF"),
721 		   ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
722 		   0, -1);
723     utf8_read_test(TESTSTR("\xED\xB0\x80"),
724 		   ERROR,      /* <Low Surrogate, First> (surrogate) */
725 		   0, -1);
726     utf8_read_test(TESTSTR("\xED\xBE\x80"),
727 		   ERROR,      /* <no name available> (surrogate) */
728 		   0, -1);
729     utf8_read_test(TESTSTR("\xED\xBF\xBF"),
730 		   ERROR,      /* <Low Surrogate, Last> (surrogate) */
731 		   0, -1);
732     utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
733 		   ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
734 		   ERROR,      /* <Low Surrogate, First> (surrogate) */
735 		   0, -1);
736     utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
737 		   ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
738 		   ERROR,      /* <Low Surrogate, Last> (surrogate) */
739 		   0, -1);
740     utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
741 		   ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
742 		   ERROR,      /* <Low Surrogate, First> (surrogate) */
743 		   0, -1);
744     utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
745 		   ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
746 		   ERROR,      /* <Low Surrogate, Last> (surrogate) */
747 		   0, -1);
748     utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
749 		   ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
750 		   ERROR,      /* <Low Surrogate, First> (surrogate) */
751 		   0, -1);
752     utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
753 		   ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
754 		   ERROR,      /* <Low Surrogate, Last> (surrogate) */
755 		   0, -1);
756     utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
757 		   ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
758 		   ERROR,      /* <Low Surrogate, First> (surrogate) */
759 		   0, -1);
760     utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
761 		   ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
762 		   ERROR,      /* <Low Surrogate, Last> (surrogate) */
763 		   0, -1);
764     utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
765 		   ERROR,      /* <no name available> (invalid char) */
766 		   0, -1);
767     utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
768 		   ERROR,      /* <no name available> (invalid char) */
769 		   0, -1);
770     printf("read tests completed\n");
771     printf("write tests beginning\n");
772     {
773 	const static long str[] =
774 	{0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
775 	utf8_write_test(TESTSTR(str),
776 			0xCE, 0xBA,
777 			0xE1, 0xBD, 0xB9,
778 			0xCF, 0x83,
779 			0xCE, 0xBC,
780 			0xCE, 0xB5,
781 			0, -1);
782     }
783     {
784 	const static long str[] = {0x0000L, 0};
785 	utf8_write_test(TESTSTR(str),
786 			0x00,
787 			0, -1);
788     }
789     {
790 	const static long str[] = {0x0080L, 0};
791 	utf8_write_test(TESTSTR(str),
792 			0xC2, 0x80,
793 			0, -1);
794     }
795     {
796 	const static long str[] = {0x0800L, 0};
797 	utf8_write_test(TESTSTR(str),
798 			0xE0, 0xA0, 0x80,
799 			0, -1);
800     }
801     {
802 	const static long str[] = {0x00010000L, 0};
803 	utf8_write_test(TESTSTR(str),
804 			0xF0, 0x90, 0x80, 0x80,
805 			0, -1);
806     }
807     {
808 	const static long str[] = {0x00200000L, 0};
809 	utf8_write_test(TESTSTR(str),
810 			0xF8, 0x88, 0x80, 0x80, 0x80,
811 			0, -1);
812     }
813     {
814 	const static long str[] = {0x04000000L, 0};
815 	utf8_write_test(TESTSTR(str),
816 			0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
817 			0, -1);
818     }
819     {
820 	const static long str[] = {0x007FL, 0};
821 	utf8_write_test(TESTSTR(str),
822 			0x7F,
823 			0, -1);
824     }
825     {
826 	const static long str[] = {0x07FFL, 0};
827 	utf8_write_test(TESTSTR(str),
828 			0xDF, 0xBF,
829 			0, -1);
830     }
831     {
832 	const static long str[] = {0xFFFDL, 0};
833 	utf8_write_test(TESTSTR(str),
834 			0xEF, 0xBF, 0xBD,
835 			0, -1);
836     }
837     {
838 	const static long str[] = {0xFFFFL, 0};
839 	utf8_write_test(TESTSTR(str),
840 			ERROR,
841 			0, -1);
842     }
843     {
844 	const static long str[] = {0x001FFFFFL, 0};
845 	utf8_write_test(TESTSTR(str),
846 			0xF7, 0xBF, 0xBF, 0xBF,
847 			0, -1);
848     }
849     {
850 	const static long str[] = {0x03FFFFFFL, 0};
851 	utf8_write_test(TESTSTR(str),
852 			0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
853 			0, -1);
854     }
855     {
856 	const static long str[] = {0x7FFFFFFFL, 0};
857 	utf8_write_test(TESTSTR(str),
858 			0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
859 			0, -1);
860     }
861     {
862 	const static long str[] = {0xD7FFL, 0};
863 	utf8_write_test(TESTSTR(str),
864 			0xED, 0x9F, 0xBF,
865 			0, -1);
866     }
867     {
868 	const static long str[] = {0xD800L, 0};
869 	utf8_write_test(TESTSTR(str),
870 			ERROR,
871 			0, -1);
872     }
873     {
874 	const static long str[] = {0xD800L, 0xDC00L, 0};
875 	utf8_write_test(TESTSTR(str),
876 			ERROR,
877 			ERROR,
878 			0, -1);
879     }
880     {
881 	const static long str[] = {0xDFFFL, 0};
882 	utf8_write_test(TESTSTR(str),
883 			ERROR,
884 			0, -1);
885     }
886     {
887 	const static long str[] = {0xE000L, 0};
888 	utf8_write_test(TESTSTR(str),
889 			0xEE, 0x80, 0x80,
890 			0, -1);
891     }
892     printf("write tests completed\n");
893 
894     printf("total: %d errors\n", total_errs);
895     return (total_errs != 0);
896 }
897 #endif /* TESTMODE */
898 
/*
 * Charset descriptor for UTF-8, built from the CS_UTF8 identifier
 * and the read_utf8 / write_utf8 functions defined in this file.
 * NOTE(review): the trailing NULL is presumably the charset's
 * associated-data pointer (UTF-8 has none) -- confirm against the
 * charset_spec declaration.
 */
const charset_spec charset_CS_UTF8 = {
    CS_UTF8, read_utf8, write_utf8, NULL
};
902 
903 #else /* ENUM_CHARSETS */
904 
905 ENUM_CHARSET(CS_UTF8)
906 
907 #endif /* ENUM_CHARSETS */
908