1 /*
2  * utf8.c - routines to handle UTF-8.
3  */
4 
5 #ifndef ENUM_CHARSETS
6 
7 #include "charset.h"
8 #include "internal.h"
9 
10 /*
11  * UTF-8 has no associated data, so `charset' may be ignored.
12  */
13 
/*
 * read_utf8: feed one unit of UTF-8 input to the incremental decoder.
 *
 *  - `charset' is not used.
 *  - `input_chr' is the next input byte.
 *  - `state' carries the partly decoded character from one call to
 *    the next; state->s0 == 0 means no character is in progress.
 *  - `emit' is called with `emitctx' once per decoded code point, and
 *    once with ERROR for each malformed or forbidden sequence.
 *
 * Nothing is returned; every result is delivered through `emit'.
 */
static void read_utf8(charset_spec const *charset, long int input_chr,
                      charset_state *state,
                      void (*emit)(void *ctx, long int output), void *emitctx)
{
    unsigned long value;               /* code point bits gathered so far */
    unsigned long counts;              /* the two byte counters, in place */
    unsigned long min_value;           /* smallest non-overlong value */
    long payload_mask = 0;             /* data bits carried by a lead byte */
    int seq_len = 0;                   /* sequence length in bytes */

    UNUSEDARG(charset);

    /*
     * Layout of state->s0 while a multibyte character is in progress:
     *
     *   bits 29-31  total length in bytes, as announced by the lead
     *               byte
     *   bits 26-28  number of bytes of the character consumed so far
     *   bits  0-25  the code point bits collected so far; every
     *               continuation byte shifts them up by 6
     *
     * Zero means "between characters". Worked example, decoding
     * E9 8D 8B from a zero state:
     *
     *   E9 -> 0x64000009
     *   8D -> 0x6800024d
     *   8B -> would be 0x6c00934b; the two counters now agree, so
     *         U+934B is emitted and the state goes back to zero.
     *
     * No more than 25 value bits are ever stored: the last 6 bits of
     * even a six-byte, 31-bit character are merged in only at the
     * moment it is emitted, so the value field cannot run into the
     * counters.
     */

    if (input_chr < 0x80) {
        /*
         * One-byte character. It cuts short any sequence that was
         * still pending, which is reported first.
         */
        if (state->s0 != 0) {
            emit(emitctx, ERROR);
            state->s0 = 0;
        }
        emit(emitctx, input_chr);
        return;
    }

    if (input_chr == 0xFE || input_chr == 0xFF) {
        /*
         * These two bytes are never valid in UTF-8: one error for the
         * abandoned sequence (if there was one), one for the byte.
         */
        if (state->s0 != 0) {
            emit(emitctx, ERROR);
            state->s0 = 0;
        }
        emit(emitctx, ERROR);
        return;
    }

    if (input_chr >= 0xC0) {
        /*
         * Lead byte. Report an unfinished predecessor, then start a
         * fresh sequence: the byte's top bits give the length, its low
         * bits are the first bits of the code point.
         */
        if (state->s0 != 0)
            emit(emitctx, ERROR);

        if (input_chr < 0xE0) {
            seq_len = 2;
            payload_mask = 0x1F;
        } else if (input_chr < 0xF0) {
            seq_len = 3;
            payload_mask = 0x0F;
        } else if (input_chr < 0xF8) {
            seq_len = 4;
            payload_mask = 0x07;
        } else if (input_chr < 0xFC) {
            seq_len = 5;
            payload_mask = 0x03;
        } else if (input_chr < 0xFE) {
            seq_len = 6;
            payload_mask = 0x01;
        }

        /*
         * Values of 0x100 and above match none of the patterns and
         * leave the state exactly as it was.
         */
        if (seq_len != 0) {
            state->s0 = ((unsigned long)seq_len << 29) |
                        0x04000000UL | /* one byte seen: the lead byte */
                        (unsigned long)(input_chr & payload_mask);
        }
        return;
    }

    /*
     * Only continuation bytes (0x80-0xBF) get this far. One arriving
     * with no sequence open is an error in its own right.
     */
    if (state->s0 == 0) {
        emit(emitctx, ERROR);
        return;
    }

    /* Fold the six payload bits into the value collected so far. */
    value = state->s0 & 0x03ffffffL;
    value = (value << 6) | (input_chr & 0x3F);

    /* Bump the "seen" counter and compare it with the expected length. */
    counts = state->s0 & 0xfc000000L;
    counts += 0x04000000L;
    if (((counts >> 26) & 7) != ((counts >> 29) & 7)) {
        /* More bytes still to come: save progress and wait. */
        state->s0 = counts | value;
        return;
    }

    /*
     * The character is complete. Whatever the verdict below, the
     * decoder returns to the idle state.
     */
    seq_len = (int)(counts >> 29);
    state->s0 = 0;

    /*
     * Smallest value that genuinely needs `seq_len' bytes; anything
     * below it is an overlong encoding.
     */
    min_value = 0x80;
    if (seq_len > 2)
        min_value = 0x800;
    if (seq_len > 3)
        min_value = 0x10000;
    if (seq_len > 4)
        min_value = 0x200000;
    if (seq_len > 5)
        min_value = 0x4000000;

    if ((value >= 0xD800 && value < 0xE000) ||  /* surrogate code point */
        value == 0xFFFE || value == 0xFFFF ||   /* invalid; FFFF is ERROR */
        value < min_value) {                    /* overlong encoding */
        emit(emitctx, ERROR);
    } else {
        emit(emitctx, value);
    }
}
176 
177 /*
178  * UTF-8 is a stateless multi-byte encoding (in the sense that just
179  * after any character has been completed, the state is always the
180  * same); hence when writing it, there is no need to use the
181  * charset_state.
182  */
183 
/*
 * write_utf8: encode one code point as UTF-8.
 *
 *  - `charset' and `state' are not used.
 *  - `input_chr' is the code point to encode.
 *  - `emit' is called with `emitctx' once per output byte, in order,
 *    or exactly once with ERROR if the code point is refused.
 *
 * Refused values are the surrogates D800-DFFF, FFFE, FFFF, and
 * anything above 0x7FFFFFFF, the largest value a six-byte sequence
 * can carry.
 *
 * NOTE(review): a negative input_chr is not refused; it takes the
 * one-byte path and is handed to `emit' unchanged. Confirm whether
 * any caller depends on that before tightening it.
 */
static void write_utf8(charset_spec const *charset, long int input_chr,
                       charset_state *state,
                       void (*emit)(void *ctx, long int output), void *emitctx)
{
    UNUSEDARG(charset);
    UNUSEDARG(state);

    /*
     * Refuse to output any illegal code points. The upper bound only
     * matters where `long' is wider than 32 bits: without it such a
     * value would reach the six-byte branch and have its high bits
     * silently masked away, producing the encoding of a different
     * character.
     */
    if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
        (input_chr >= 0xD800 && input_chr < 0xE000) ||
        input_chr > 0x7FFFFFFFL) {
        emit(emitctx, ERROR);
    } else if (input_chr < 0x80) {     /* one-byte character */
        emit(emitctx, input_chr);
    } else if (input_chr < 0x800) {    /* two-byte character */
        emit(emitctx, 0xC0 | (0x1F & (input_chr >>  6)));
        emit(emitctx, 0x80 | (0x3F & (input_chr      )));
    } else if (input_chr < 0x10000) {  /* three-byte character */
        emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
        emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
        emit(emitctx, 0x80 | (0x3F & (input_chr      )));
    } else if (input_chr < 0x200000) { /* four-byte character */
        emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
        emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
        emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
        emit(emitctx, 0x80 | (0x3F & (input_chr      )));
    } else if (input_chr < 0x4000000) {/* five-byte character */
        emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
        emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
        emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
        emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
        emit(emitctx, 0x80 | (0x3F & (input_chr      )));
    } else {                           /* six-byte character */
        emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
        emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
        emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
        emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
        emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
        emit(emitctx, 0x80 | (0x3F & (input_chr      )));
    }
}
226 
227 #ifdef TESTMODE
228 
229 #include <stdio.h>
230 #include <stdarg.h>
231 
/* Running count of mismatches reported by the utf8_*_test helpers. */
int total_errs = 0;
233 
/*
 * utf8_emit: `emit' callback used by the test helpers.
 *
 * `ctx' points at a `wchar_t *' write cursor. `output' is stored at
 * the cursor position and the cursor is advanced by one element. No
 * bounds checking is done.
 */
void utf8_emit(void *ctx, long output)
{
    wchar_t **cursor = ctx;
    wchar_t *slot = *cursor;

    *slot = (wchar_t)output;
    *cursor = slot + 1;
}
239 
utf8_read_test(int line,char * input,int inlen,...)240 void utf8_read_test(int line, char *input, int inlen, ...)
241 {
242     va_list ap;
243     wchar_t *p, str[512];
244     int i;
245     charset_state state;
246     unsigned long l;
247 
248     state.s0 = 0;
249     p = str;
250 
251     for (i = 0; i < inlen; i++)
252         read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
253 
254     va_start(ap, inlen);
255     l = 0;
256     for (i = 0; i < p - str; i++) {
257         l = va_arg(ap, long int);
258         if (l == -1) {
259             printf("%d: correct string shorter than output\n", line);
260             total_errs++;
261             break;
262         }
263         if (l != str[i]) {
264             printf("%d: char %d came out as %08x, should be %08x\n",
265                    line, i, str[i], (unsigned)l);
266             total_errs++;
267         }
268     }
269     if (l != -1) {
270         l = va_arg(ap, long int);
271         if (l != -1) {
272             printf("%d: correct string longer than output\n", line);
273             total_errs++;
274         }
275     }
276     va_end(ap);
277 }
278 
utf8_write_test(int line,const long * input,int inlen,...)279 void utf8_write_test(int line, const long *input, int inlen, ...)
280 {
281     va_list ap;
282     wchar_t *p, str[512];
283     int i;
284     charset_state state;
285     unsigned long l;
286 
287     state.s0 = 0;
288     p = str;
289 
290     for (i = 0; i < inlen; i++)
291         write_utf8(NULL, input[i], &state, utf8_emit, &p);
292 
293     va_start(ap, inlen);
294     l = 0;
295     for (i = 0; i < p - str; i++) {
296         l = va_arg(ap, long int);
297         if (l == -1) {
298             printf("%d: correct string shorter than output\n", line);
299             total_errs++;
300             break;
301         }
302         if (l != str[i]) {
303             printf("%d: char %d came out as %08x, should be %08x\n",
304                    line, i, str[i], (unsigned)l);
305             total_errs++;
306         }
307     }
308     if (l != -1) {
309         l = va_arg(ap, long int);
310         if (l != -1) {
311             printf("%d: correct string longer than output\n", line);
312             total_errs++;
313         }
314     }
315     va_end(ap);
316 }
317 
/*
 * Macro to concoct the first three parameters of utf8_read_test: the
 * current source line (for diagnostics), the string literal `x'
 * itself, and its length as computed by lenof(x).
 *
 * NOTE(review): lenof is defined outside this file. Every expected
 * list in main() ends with an extra 0 before the -1 terminator, which
 * suggests the length includes the literal's trailing NUL; confirm
 * against the definition of lenof.
 */
#define TESTSTR(x) __LINE__, x, lenof(x)
320 
main(void)321 int main(void)
322 {
323     printf("read tests beginning\n");
324     utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
325                    0x000003BA, /* GREEK SMALL LETTER KAPPA */
326                    0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
327                    0x000003C3, /* GREEK SMALL LETTER SIGMA */
328                    0x000003BC, /* GREEK SMALL LETTER MU */
329                    0x000003B5, /* GREEK SMALL LETTER EPSILON */
330                    0, -1);
331     utf8_read_test(TESTSTR("\x00"),
332                    0x00000000, /* <control> */
333                    0, -1);
334     utf8_read_test(TESTSTR("\xC2\x80"),
335                    0x00000080, /* <control> */
336                    0, -1);
337     utf8_read_test(TESTSTR("\xE0\xA0\x80"),
338                    0x00000800, /* <no name available> */
339                    0, -1);
340     utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
341                    0x00010000, /* <no name available> */
342                    0, -1);
343     utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
344                    0x00200000, /* <no name available> */
345                    0, -1);
346     utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
347                    0x04000000, /* <no name available> */
348                    0, -1);
349     utf8_read_test(TESTSTR("\x7F"),
350                    0x0000007F, /* <control> */
351                    0, -1);
352     utf8_read_test(TESTSTR("\xDF\xBF"),
353                    0x000007FF, /* <no name available> */
354                    0, -1);
355     utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
356                    0x0000FFFD, /* REPLACEMENT CHARACTER */
357                    0, -1);
358     utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
359                    ERROR,      /* <no name available> (invalid char) */
360                    0, -1);
361     utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
362                    0x001FFFFF, /* <no name available> */
363                    0, -1);
364     utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
365                    0x03FFFFFF, /* <no name available> */
366                    0, -1);
367     utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
368                    0x7FFFFFFF, /* <no name available> */
369                    0, -1);
370     utf8_read_test(TESTSTR("\xED\x9F\xBF"),
371                    0x0000D7FF, /* <no name available> */
372                    0, -1);
373     utf8_read_test(TESTSTR("\xEE\x80\x80"),
374                    0x0000E000, /* <Private Use, First> */
375                    0, -1);
376     utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
377                    0x0000FFFD, /* REPLACEMENT CHARACTER */
378                    0, -1);
379     utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
380                    0x0010FFFF, /* <no name available> */
381                    0, -1);
382     utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
383                    0x00110000, /* <no name available> */
384                    0, -1);
385     utf8_read_test(TESTSTR("\x80"),
386                    ERROR,      /* (unexpected continuation byte) */
387                    0, -1);
388     utf8_read_test(TESTSTR("\xBF"),
389                    ERROR,      /* (unexpected continuation byte) */
390                    0, -1);
391     utf8_read_test(TESTSTR("\x80\xBF"),
392                    ERROR,      /* (unexpected continuation byte) */
393                    ERROR,      /* (unexpected continuation byte) */
394                    0, -1);
395     utf8_read_test(TESTSTR("\x80\xBF\x80"),
396                    ERROR,      /* (unexpected continuation byte) */
397                    ERROR,      /* (unexpected continuation byte) */
398                    ERROR,      /* (unexpected continuation byte) */
399                    0, -1);
400     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
401                    ERROR,      /* (unexpected continuation byte) */
402                    ERROR,      /* (unexpected continuation byte) */
403                    ERROR,      /* (unexpected continuation byte) */
404                    ERROR,      /* (unexpected continuation byte) */
405                    0, -1);
406     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
407                    ERROR,      /* (unexpected continuation byte) */
408                    ERROR,      /* (unexpected continuation byte) */
409                    ERROR,      /* (unexpected continuation byte) */
410                    ERROR,      /* (unexpected continuation byte) */
411                    ERROR,      /* (unexpected continuation byte) */
412                    0, -1);
413     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
414                    ERROR,      /* (unexpected continuation byte) */
415                    ERROR,      /* (unexpected continuation byte) */
416                    ERROR,      /* (unexpected continuation byte) */
417                    ERROR,      /* (unexpected continuation byte) */
418                    ERROR,      /* (unexpected continuation byte) */
419                    ERROR,      /* (unexpected continuation byte) */
420                    0, -1);
421     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
422                    ERROR,      /* (unexpected continuation byte) */
423                    ERROR,      /* (unexpected continuation byte) */
424                    ERROR,      /* (unexpected continuation byte) */
425                    ERROR,      /* (unexpected continuation byte) */
426                    ERROR,      /* (unexpected continuation byte) */
427                    ERROR,      /* (unexpected continuation byte) */
428                    ERROR,      /* (unexpected continuation byte) */
429                    0, -1);
430     utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
431                    ERROR,      /* (unexpected continuation byte) */
432                    ERROR,      /* (unexpected continuation byte) */
433                    ERROR,      /* (unexpected continuation byte) */
434                    ERROR,      /* (unexpected continuation byte) */
435                    ERROR,      /* (unexpected continuation byte) */
436                    ERROR,      /* (unexpected continuation byte) */
437                    ERROR,      /* (unexpected continuation byte) */
438                    ERROR,      /* (unexpected continuation byte) */
439                    ERROR,      /* (unexpected continuation byte) */
440                    ERROR,      /* (unexpected continuation byte) */
441                    ERROR,      /* (unexpected continuation byte) */
442                    ERROR,      /* (unexpected continuation byte) */
443                    ERROR,      /* (unexpected continuation byte) */
444                    ERROR,      /* (unexpected continuation byte) */
445                    ERROR,      /* (unexpected continuation byte) */
446                    ERROR,      /* (unexpected continuation byte) */
447                    ERROR,      /* (unexpected continuation byte) */
448                    ERROR,      /* (unexpected continuation byte) */
449                    ERROR,      /* (unexpected continuation byte) */
450                    ERROR,      /* (unexpected continuation byte) */
451                    ERROR,      /* (unexpected continuation byte) */
452                    ERROR,      /* (unexpected continuation byte) */
453                    ERROR,      /* (unexpected continuation byte) */
454                    ERROR,      /* (unexpected continuation byte) */
455                    ERROR,      /* (unexpected continuation byte) */
456                    ERROR,      /* (unexpected continuation byte) */
457                    ERROR,      /* (unexpected continuation byte) */
458                    ERROR,      /* (unexpected continuation byte) */
459                    ERROR,      /* (unexpected continuation byte) */
460                    ERROR,      /* (unexpected continuation byte) */
461                    ERROR,      /* (unexpected continuation byte) */
462                    ERROR,      /* (unexpected continuation byte) */
463                    ERROR,      /* (unexpected continuation byte) */
464                    ERROR,      /* (unexpected continuation byte) */
465                    ERROR,      /* (unexpected continuation byte) */
466                    ERROR,      /* (unexpected continuation byte) */
467                    ERROR,      /* (unexpected continuation byte) */
468                    ERROR,      /* (unexpected continuation byte) */
469                    ERROR,      /* (unexpected continuation byte) */
470                    ERROR,      /* (unexpected continuation byte) */
471                    ERROR,      /* (unexpected continuation byte) */
472                    ERROR,      /* (unexpected continuation byte) */
473                    ERROR,      /* (unexpected continuation byte) */
474                    ERROR,      /* (unexpected continuation byte) */
475                    ERROR,      /* (unexpected continuation byte) */
476                    ERROR,      /* (unexpected continuation byte) */
477                    ERROR,      /* (unexpected continuation byte) */
478                    ERROR,      /* (unexpected continuation byte) */
479                    ERROR,      /* (unexpected continuation byte) */
480                    ERROR,      /* (unexpected continuation byte) */
481                    ERROR,      /* (unexpected continuation byte) */
482                    ERROR,      /* (unexpected continuation byte) */
483                    ERROR,      /* (unexpected continuation byte) */
484                    ERROR,      /* (unexpected continuation byte) */
485                    ERROR,      /* (unexpected continuation byte) */
486                    ERROR,      /* (unexpected continuation byte) */
487                    ERROR,      /* (unexpected continuation byte) */
488                    ERROR,      /* (unexpected continuation byte) */
489                    ERROR,      /* (unexpected continuation byte) */
490                    ERROR,      /* (unexpected continuation byte) */
491                    ERROR,      /* (unexpected continuation byte) */
492                    ERROR,      /* (unexpected continuation byte) */
493                    ERROR,      /* (unexpected continuation byte) */
494                    ERROR,      /* (unexpected continuation byte) */
495                    0, -1);
496     utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
497                    ERROR,      /* (incomplete sequence) */
498                    0x00000020, /* SPACE */
499                    ERROR,      /* (incomplete sequence) */
500                    0x00000020, /* SPACE */
501                    ERROR,      /* (incomplete sequence) */
502                    0x00000020, /* SPACE */
503                    ERROR,      /* (incomplete sequence) */
504                    0x00000020, /* SPACE */
505                    ERROR,      /* (incomplete sequence) */
506                    0x00000020, /* SPACE */
507                    ERROR,      /* (incomplete sequence) */
508                    0x00000020, /* SPACE */
509                    ERROR,      /* (incomplete sequence) */
510                    0x00000020, /* SPACE */
511                    ERROR,      /* (incomplete sequence) */
512                    0x00000020, /* SPACE */
513                    0, -1);
514     utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
515                    ERROR,      /* (incomplete sequence) */
516                    0x00000020, /* SPACE */
517                    ERROR,      /* (incomplete sequence) */
518                    0x00000020, /* SPACE */
519                    ERROR,      /* (incomplete sequence) */
520                    0x00000020, /* SPACE */
521                    ERROR,      /* (incomplete sequence) */
522                    0x00000020, /* SPACE */
523                    ERROR,      /* (incomplete sequence) */
524                    0x00000020, /* SPACE */
525                    ERROR,      /* (incomplete sequence) */
526                    0x00000020, /* SPACE */
527                    ERROR,      /* (incomplete sequence) */
528                    0x00000020, /* SPACE */
529                    ERROR,      /* (incomplete sequence) */
530                    0x00000020, /* SPACE */
531                    ERROR,      /* (incomplete sequence) */
532                    0x00000020, /* SPACE */
533                    ERROR,      /* (incomplete sequence) */
534                    0x00000020, /* SPACE */
535                    ERROR,      /* (incomplete sequence) */
536                    0x00000020, /* SPACE */
537                    ERROR,      /* (incomplete sequence) */
538                    0x00000020, /* SPACE */
539                    ERROR,      /* (incomplete sequence) */
540                    0x00000020, /* SPACE */
541                    ERROR,      /* (incomplete sequence) */
542                    0x00000020, /* SPACE */
543                    ERROR,      /* (incomplete sequence) */
544                    0x00000020, /* SPACE */
545                    ERROR,      /* (incomplete sequence) */
546                    0x00000020, /* SPACE */
547                    0, -1);
548     utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
549                    ERROR,      /* (incomplete sequence) */
550                    0x00000020, /* SPACE */
551                    ERROR,      /* (incomplete sequence) */
552                    0x00000020, /* SPACE */
553                    ERROR,      /* (incomplete sequence) */
554                    0x00000020, /* SPACE */
555                    ERROR,      /* (incomplete sequence) */
556                    0x00000020, /* SPACE */
557                    ERROR,      /* (incomplete sequence) */
558                    0x00000020, /* SPACE */
559                    ERROR,      /* (incomplete sequence) */
560                    0x00000020, /* SPACE */
561                    ERROR,      /* (incomplete sequence) */
562                    0x00000020, /* SPACE */
563                    ERROR,      /* (incomplete sequence) */
564                    0x00000020, /* SPACE */
565                    0, -1);
566     utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
567                    ERROR,      /* (incomplete sequence) */
568                    0x00000020, /* SPACE */
569                    ERROR,      /* (incomplete sequence) */
570                    0x00000020, /* SPACE */
571                    ERROR,      /* (incomplete sequence) */
572                    0x00000020, /* SPACE */
573                    ERROR,      /* (incomplete sequence) */
574                    0x00000020, /* SPACE */
575                    0, -1);
576     utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
577                    ERROR,      /* (incomplete sequence) */
578                    0x00000020, /* SPACE */
579                    ERROR,      /* (incomplete sequence) */
580                    0x00000020, /* SPACE */
581                    0, -1);
582     utf8_read_test(TESTSTR("\xC0"),
583                    ERROR,      /* (incomplete sequence) */
584                    0, -1);
585     utf8_read_test(TESTSTR("\xE0\x80"),
586                    ERROR,      /* (incomplete sequence) */
587                    0, -1);
588     utf8_read_test(TESTSTR("\xF0\x80\x80"),
589                    ERROR,      /* (incomplete sequence) */
590                    0, -1);
591     utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
592                    ERROR,      /* (incomplete sequence) */
593                    0, -1);
594     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
595                    ERROR,      /* (incomplete sequence) */
596                    0, -1);
597     utf8_read_test(TESTSTR("\xDF"),
598                    ERROR,      /* (incomplete sequence) */
599                    0, -1);
600     utf8_read_test(TESTSTR("\xEF\xBF"),
601                    ERROR,      /* (incomplete sequence) */
602                    0, -1);
603     utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
604                    ERROR,      /* (incomplete sequence) */
605                    0, -1);
606     utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
607                    ERROR,      /* (incomplete sequence) */
608                    0, -1);
609     utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
610                    ERROR,      /* (incomplete sequence) */
611                    0, -1);
612     utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
613                    ERROR,      /* (incomplete sequence) */
614                    ERROR,      /* (incomplete sequence) */
615                    ERROR,      /* (incomplete sequence) */
616                    ERROR,      /* (incomplete sequence) */
617                    ERROR,      /* (incomplete sequence) */
618                    ERROR,      /* (incomplete sequence) */
619                    ERROR,      /* (incomplete sequence) */
620                    ERROR,      /* (incomplete sequence) */
621                    ERROR,      /* (incomplete sequence) */
622                    ERROR,      /* (incomplete sequence) */
623                    0, -1);
624     utf8_read_test(TESTSTR("\xFE"),
625                    ERROR,      /* (invalid UTF-8 byte) */
626                    0, -1);
627     utf8_read_test(TESTSTR("\xFF"),
628                    ERROR,      /* (invalid UTF-8 byte) */
629                    0, -1);
630     utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
631                    ERROR,      /* (invalid UTF-8 byte) */
632                    ERROR,      /* (invalid UTF-8 byte) */
633                    ERROR,      /* (invalid UTF-8 byte) */
634                    ERROR,      /* (invalid UTF-8 byte) */
635                    0, -1);
636     utf8_read_test(TESTSTR("\xC0\xAF"),
637                    ERROR,      /* SOLIDUS (overlong form of 2F) */
638                    0, -1);
639     utf8_read_test(TESTSTR("\xE0\x80\xAF"),
640                    ERROR,      /* SOLIDUS (overlong form of 2F) */
641                    0, -1);
642     utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
643                    ERROR,      /* SOLIDUS (overlong form of 2F) */
644                    0, -1);
645     utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
646                    ERROR,      /* SOLIDUS (overlong form of 2F) */
647                    0, -1);
648     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
649                    ERROR,      /* SOLIDUS (overlong form of 2F) */
650                    0, -1);
651     utf8_read_test(TESTSTR("\xC1\xBF"),
652                    ERROR,      /* <control> (overlong form of 7F) */
653                    0, -1);
654     utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
655                    ERROR,      /* <no name available> (overlong form of DF BF) */
656                    0, -1);
657     utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
658                    ERROR,      /* <no name available> (overlong form of EF BF BF) (invalid char) */
659                    0, -1);
660     utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
661                    ERROR,      /* <no name available> (overlong form of F7 BF BF BF) */
662                    0, -1);
663     utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
664                    ERROR,      /* <no name available> (overlong form of FB BF BF BF BF) */
665                    0, -1);
666     utf8_read_test(TESTSTR("\xC0\x80"),
667                    ERROR,      /* <control> (overlong form of 00) */
668                    0, -1);
669     utf8_read_test(TESTSTR("\xE0\x80\x80"),
670                    ERROR,      /* <control> (overlong form of 00) */
671                    0, -1);
672     utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
673                    ERROR,      /* <control> (overlong form of 00) */
674                    0, -1);
675     utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
676                    ERROR,      /* <control> (overlong form of 00) */
677                    0, -1);
678     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
679                    ERROR,      /* <control> (overlong form of 00) */
680                    0, -1);
681     utf8_read_test(TESTSTR("\xED\xA0\x80"),
682                    ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
683                    0, -1);
684     utf8_read_test(TESTSTR("\xED\xAD\xBF"),
685                    ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
686                    0, -1);
687     utf8_read_test(TESTSTR("\xED\xAE\x80"),
688                    ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
689                    0, -1);
690     utf8_read_test(TESTSTR("\xED\xAF\xBF"),
691                    ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
692                    0, -1);
693     utf8_read_test(TESTSTR("\xED\xB0\x80"),
694                    ERROR,      /* <Low Surrogate, First> (surrogate) */
695                    0, -1);
696     utf8_read_test(TESTSTR("\xED\xBE\x80"),
697                    ERROR,      /* <no name available> (surrogate) */
698                    0, -1);
699     utf8_read_test(TESTSTR("\xED\xBF\xBF"),
700                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
701                    0, -1);
702     utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
703                    ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
704                    ERROR,      /* <Low Surrogate, First> (surrogate) */
705                    0, -1);
706     utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
707                    ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
708                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
709                    0, -1);
710     utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
711                    ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
712                    ERROR,      /* <Low Surrogate, First> (surrogate) */
713                    0, -1);
714     utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
715                    ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
716                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
717                    0, -1);
718     utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
719                    ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
720                    ERROR,      /* <Low Surrogate, First> (surrogate) */
721                    0, -1);
722     utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
723                    ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
724                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
725                    0, -1);
726     utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
727                    ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
728                    ERROR,      /* <Low Surrogate, First> (surrogate) */
729                    0, -1);
730     utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
731                    ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
732                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
733                    0, -1);
734     utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
735                    ERROR,      /* <no name available> (invalid char) */
736                    0, -1);
737     utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
738                    ERROR,      /* <no name available> (invalid char) */
739                    0, -1);
740     printf("read tests completed\n");
741     printf("write tests beginning\n");
742     {
743         const static long str[] =
744         {0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
745         utf8_write_test(TESTSTR(str),
746                         0xCE, 0xBA,
747                         0xE1, 0xBD, 0xB9,
748                         0xCF, 0x83,
749                         0xCE, 0xBC,
750                         0xCE, 0xB5,
751                         0, -1);
752     }
753     {
754         const static long str[] = {0x0000L, 0};
755         utf8_write_test(TESTSTR(str),
756                         0x00,
757                         0, -1);
758     }
759     {
760         const static long str[] = {0x0080L, 0};
761         utf8_write_test(TESTSTR(str),
762                         0xC2, 0x80,
763                         0, -1);
764     }
765     {
766         const static long str[] = {0x0800L, 0};
767         utf8_write_test(TESTSTR(str),
768                         0xE0, 0xA0, 0x80,
769                         0, -1);
770     }
771     {
772         const static long str[] = {0x00010000L, 0};
773         utf8_write_test(TESTSTR(str),
774                         0xF0, 0x90, 0x80, 0x80,
775                         0, -1);
776     }
777     {
778         const static long str[] = {0x00200000L, 0};
779         utf8_write_test(TESTSTR(str),
780                         0xF8, 0x88, 0x80, 0x80, 0x80,
781                         0, -1);
782     }
783     {
784         const static long str[] = {0x04000000L, 0};
785         utf8_write_test(TESTSTR(str),
786                         0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
787                         0, -1);
788     }
789     {
790         const static long str[] = {0x007FL, 0};
791         utf8_write_test(TESTSTR(str),
792                         0x7F,
793                         0, -1);
794     }
795     {
796         const static long str[] = {0x07FFL, 0};
797         utf8_write_test(TESTSTR(str),
798                         0xDF, 0xBF,
799                         0, -1);
800     }
801     {
802         const static long str[] = {0xFFFDL, 0};
803         utf8_write_test(TESTSTR(str),
804                         0xEF, 0xBF, 0xBD,
805                         0, -1);
806     }
807     {
808         const static long str[] = {0xFFFFL, 0};
809         utf8_write_test(TESTSTR(str),
810                         ERROR,
811                         0, -1);
812     }
813     {
814         const static long str[] = {0x001FFFFFL, 0};
815         utf8_write_test(TESTSTR(str),
816                         0xF7, 0xBF, 0xBF, 0xBF,
817                         0, -1);
818     }
819     {
820         const static long str[] = {0x03FFFFFFL, 0};
821         utf8_write_test(TESTSTR(str),
822                         0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
823                         0, -1);
824     }
825     {
826         const static long str[] = {0x7FFFFFFFL, 0};
827         utf8_write_test(TESTSTR(str),
828                         0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
829                         0, -1);
830     }
831     {
832         const static long str[] = {0xD7FFL, 0};
833         utf8_write_test(TESTSTR(str),
834                         0xED, 0x9F, 0xBF,
835                         0, -1);
836     }
837     {
838         const static long str[] = {0xD800L, 0};
839         utf8_write_test(TESTSTR(str),
840                         ERROR,
841                         0, -1);
842     }
843     {
844         const static long str[] = {0xD800L, 0xDC00L, 0};
845         utf8_write_test(TESTSTR(str),
846                         ERROR,
847                         ERROR,
848                         0, -1);
849     }
850     {
851         const static long str[] = {0xDFFFL, 0};
852         utf8_write_test(TESTSTR(str),
853                         ERROR,
854                         0, -1);
855     }
856     {
857         const static long str[] = {0xE000L, 0};
858         utf8_write_test(TESTSTR(str),
859                         0xEE, 0x80, 0x80,
860                         0, -1);
861     }
862     printf("write tests completed\n");
863 
864     printf("total: %d errors\n", total_errs);
865     return (total_errs != 0);
866 }
867 #endif /* TESTMODE */
868 
869 const charset_spec charset_CS_UTF8 = {
870     CS_UTF8, read_utf8, write_utf8, NULL
871 };
872 
873 #else /* ENUM_CHARSETS */
874 
/*
 * When ENUM_CHARSETS is defined, everything else in this file is
 * compiled out and only this entry remains. The expansion of
 * ENUM_CHARSET is supplied by whoever includes the file; presumably it
 * builds a list of the available charsets -- confirm at the include
 * site.
 */
ENUM_CHARSET(CS_UTF8)
876 
877 #endif /* ENUM_CHARSETS */
878