1 /* utf8.c -- convert characters to/from UTF-8
2
3 (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4 See tidyp.h for the copyright notice.
5
6 Uses public interfaces to abstract input source and output
7 sink, which may be user supplied or either FILE* or memory
8 based Tidy implementations. Encoding support is uniform
9 regardless of I/O mechanism.
10
11 Note, UTF-8 encoding, by itself, does not affect the actual
12 "codepoints" of the underlying character encoding. In the
13 cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
14 refer to ISO-10646 "codepoints". For anything else, they
15 refer to some other "codepoint" set.
16
17 Put another way, UTF-8 is a variable length method to
18 represent any non-negative integer value. The glyph
19 that an integer value represents is unchanged and defined
20 externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
21 Latin2-9, and so on).
22
23 Put still another way, UTF-8 is more of a _transfer_ encoding
24 than a _character_ encoding, per se.
25 */
26
27 #include "tidyp.h"
28 #include "forward.h"
29 #include "utf8.h"
30
31 /*
32 UTF-8 encoding/decoding functions
33 Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence
34
35 Also see below for UTF-16 encoding/decoding functions
36
37 References :
38
39 1) UCS Transformation Format 8 (UTF-8):
40 ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
41 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
42 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
43
44 Table 4 - Mapping from UCS-4 to UTF-8
45
46 2) Unicode standards:
47 <http://www.unicode.org/unicode/standard/standard.html>
48
49 3) Legal UTF-8 byte sequences:
50 <http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>
51
52 Code point 1st byte 2nd byte 3rd byte 4th byte
53 ---------- -------- -------- -------- --------
54 U+0000..U+007F 00..7F
55 U+0080..U+07FF C2..DF 80..BF
56 U+0800..U+0FFF E0 A0..BF 80..BF
57 U+1000..U+FFFF E1..EF 80..BF 80..BF
58 U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
59 U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
60 U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
61
62 The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
63 allows for the use of five- and six-byte sequences to encode
64 characters that are outside the range of the Unicode character
65 set; those five- and six-byte sequences are illegal for the use
66 of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
67 does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
68 (but it does allow other noncharacters).
69
70 4) RFC 2279: UTF-8, a transformation format of ISO 10646:
71 <http://www.ietf.org/rfc/rfc2279.txt>
72
73 5) UTF-8 and Unicode FAQ:
74 <http://www.cl.cam.ac.uk/~mgk25/unicode.html>
75
76 6) Markus Kuhn's UTF-8 decoder stress test file:
77 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
78
79 7) UTF-8 Demo:
80 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
81
82 8) UTF-8 Sampler:
83 <http://www.columbia.edu/kermit/utf8.html>
84
85 9) Transformation Format for 16 Planes of Group 00 (UTF-16):
86 ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
87 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
88 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
89
90 10) RFC 2781: UTF-16, an encoding of ISO 10646:
91 <http://www.ietf.org/rfc/rfc2781.txt>
92
93 11) UTF-16 invalid surrogate pairs:
94 <http://www.unicode.org/unicode/faq/utf_bom.html#16>
95
96 UTF-16 UTF-8 UCS-4
97 D83F DFF* F0 9F BF B* 0001FFF*
98 D87F DFF* F0 AF BF B* 0002FFF*
99 D8BF DFF* F0 BF BF B* 0003FFF*
100 D8FF DFF* F1 8F BF B* 0004FFF*
101 D93F DFF* F1 9F BF B* 0005FFF*
102 D97F DFF* F1 AF BF B* 0006FFF*
103 ...
104 DBBF DFF* F3 BF BF B* 000FFFF*
105 DBFF DFF* F4 8F BF B* 0010FFF*
106
107 * = E or F
108
109 1010 A
110 1011 B
111 1100 C
112 1101 D
113 1110 E
114 1111 F
115
116 */
117
/* Size of the validUTF8 table and the longest sequence length it covers. */
#define kNumUTF8Sequences        7
#define kMaxUTF8Bytes            4

/* Values the decoder and encoder refuse to treat as characters. */
#define kUTF8ByteSwapNotAChar    0xFFFE
#define kUTF8NotAChar            0xFFFF

/* Largest value accepted by the UTF-8 routines. */
#define kMaxUTF8FromUCS4         0x10FFFF

/* Values from kUTF16SurrogatesBegin up to kMaxUTF16FromUCS4 are the ones
** the UTF-16 helpers treat as needing a surrogate pair.
*/
#define kUTF16SurrogatesBegin    0x10000
#define kMaxUTF16FromUCS4        0x10FFFF

/* UTF-16 surrogate pair areas.
**
** Naming note: in this file "Low" is the 0xD800-0xDBFF range (the half
** that is scaled by 0x400 when a pair is combined) and "High" is the
** 0xDC00-0xDFFF range. The Unicode standard uses the opposite names for
** these two ranges, so take care when comparing against the spec.
*/
#define kUTF16LowSurrogateBegin  0xD800
#define kUTF16LowSurrogateEnd    0xDBFF
#define kUTF16HighSurrogateBegin 0xDC00
#define kUTF16HighSurrogateEnd   0xDFFF


/* Row offsets into the validUTF8 table.
**
** Entry [len - 1] is the first table row for sequences of len bytes.
** The extra final entry equals the row count, so [len] - 1 is always
** the last row for that length.
*/
static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
{
    0,                  /* rows for 1 byte start here  */
    1,                  /* rows for 2 bytes start here */
    2,                  /* rows for 3 bytes start here */
    4,                  /* rows for 4 bytes start here */
    kNumUTF8Sequences   /* sentinel; must be last      */
};
145
146 static const struct validUTF8Sequence
147 {
148 uint lowChar;
149 uint highChar;
150 int numBytes;
151 byte validBytes[8];
152 } validUTF8[kNumUTF8Sequences] =
153 {
154 /* low high #bytes byte 1 byte 2 byte 3 byte 4 */
155 {0x0000, 0x007F, 1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
156 {0x0080, 0x07FF, 2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
157 {0x0800, 0x0FFF, 3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
158 {0x1000, 0xFFFF, 3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
159 {0x10000, 0x3FFFF, 4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
160 {0x40000, 0xFFFFF, 4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
161 {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
162 };
163
TY_(DecodeUTF8BytesToChar)164 int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
165 TidyInputSource* inp, int* count )
166 {
167 byte tempbuf[10];
168 byte *buf = &tempbuf[0];
169 uint ch = 0, n = 0;
170 int i, bytes = 0;
171 Bool hasError = no;
172
173 if ( successorBytes )
174 buf = (byte*) successorBytes;
175
176 /* special check if we have been passed an EOF char */
177 if ( firstByte == EndOfStream )
178 {
179 /* at present */
180 *c = firstByte;
181 *count = 1;
182 return 0;
183 }
184
185 ch = firstByte; /* first byte is passed in separately */
186
187 if (ch <= 0x7F) /* 0XXX XXXX one byte */
188 {
189 n = ch;
190 bytes = 1;
191 }
192 else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */
193 {
194 n = ch & 31;
195 bytes = 2;
196 }
197 else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
198 {
199 n = ch & 15;
200 bytes = 3;
201 }
202 else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
203 {
204 n = ch & 7;
205 bytes = 4;
206 }
207 else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */
208 {
209 n = ch & 3;
210 bytes = 5;
211 hasError = yes;
212 }
213 else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */
214 {
215 n = ch & 1;
216 bytes = 6;
217 hasError = yes;
218 }
219 else
220 {
221 /* not a valid first byte of a UTF-8 sequence */
222 n = ch;
223 bytes = 1;
224 hasError = yes;
225 }
226
227 /* successor bytes should have the form 10XX XXXX */
228
229 /* If caller supplied buffer, use it. Else see if caller
230 ** supplied an input source, use that.
231 */
232 if ( successorBytes )
233 {
234 for ( i=0; i < bytes-1; ++i )
235 {
236 if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
237 {
238 hasError = yes;
239 bytes = i+1;
240 break;
241 }
242 n = (n << 6) | (buf[i] & 0x3F);
243 }
244 }
245 else if ( inp )
246 {
247 for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
248 {
249 int b = inp->getByte( inp->sourceData );
250 buf[i] = (tmbchar) b;
251
252 /* End of data or illegal successor byte value */
253 if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
254 {
255 hasError = yes;
256 bytes = i+1;
257 if ( b != EOF )
258 inp->ungetByte( inp->sourceData, buf[i] );
259 break;
260 }
261 n = (n << 6) | (buf[i] & 0x3F);
262 }
263 }
264 else if ( bytes > 1 )
265 {
266 hasError = yes;
267 bytes = 1;
268 }
269
270 if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
271 hasError = yes;
272
273 if (!hasError && (n > kMaxUTF8FromUCS4))
274 hasError = yes;
275
276 if (!hasError)
277 {
278 int lo, hi;
279
280 lo = offsetUTF8Sequences[bytes - 1];
281 hi = offsetUTF8Sequences[bytes] - 1;
282
283 /* check for overlong sequences */
284 if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
285 hasError = yes;
286 else
287 {
288 hasError = yes; /* assume error until proven otherwise */
289
290 for (i = lo; i <= hi; i++)
291 {
292 int tempCount;
293 byte theByte;
294
295 for (tempCount = 0; tempCount < bytes; tempCount++)
296 {
297 if (!tempCount)
298 theByte = (tmbchar) firstByte;
299 else
300 theByte = buf[tempCount - 1];
301
302 if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] &&
303 theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] )
304 hasError = no;
305 if (hasError)
306 break;
307 }
308 }
309 }
310 }
311
312 #if 1 && defined(_DEBUG)
313 if ( hasError )
314 {
315 /* debug */
316 fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
317 fprintf( stderr, "0x%02x ", firstByte );
318 for (i = 1; i < bytes; i++)
319 fprintf( stderr, "0x%02x ", buf[i - 1] );
320 fprintf( stderr, " = U+%04ulx\n", n );
321 }
322 #endif
323
324 *count = bytes;
325 *c = n;
326 if ( hasError )
327 return -1;
328 return 0;
329 }
330
TY_(EncodeCharToUTF8Bytes)331 int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
332 TidyOutputSink* outp, int* count )
333 {
334 byte tempbuf[10] = {0};
335 byte* buf = &tempbuf[0];
336 int bytes = 0;
337 Bool hasError = no;
338
339 if ( encodebuf )
340 buf = (byte*) encodebuf;
341
342 if (c <= 0x7F) /* 0XXX XXXX one byte */
343 {
344 buf[0] = (tmbchar) c;
345 bytes = 1;
346 }
347 else if (c <= 0x7FF) /* 110X XXXX two bytes */
348 {
349 buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
350 buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
351 bytes = 2;
352 }
353 else if (c <= 0xFFFF) /* 1110 XXXX three bytes */
354 {
355 buf[0] = (tmbchar) (0xE0 | (c >> 12));
356 buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
357 buf[2] = (tmbchar) (0x80 | (c & 0x3F));
358 bytes = 3;
359 if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
360 hasError = yes;
361 }
362 else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */
363 {
364 buf[0] = (tmbchar) (0xF0 | (c >> 18));
365 buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
366 buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
367 buf[3] = (tmbchar) (0x80 | (c & 0x3F));
368 bytes = 4;
369 if (c > kMaxUTF8FromUCS4)
370 hasError = yes;
371 }
372 else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */
373 {
374 buf[0] = (tmbchar) (0xF8 | (c >> 24));
375 buf[1] = (tmbchar) (0x80 | (c >> 18));
376 buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
377 buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
378 buf[4] = (tmbchar) (0x80 | (c & 0x3F));
379 bytes = 5;
380 hasError = yes;
381 }
382 else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */
383 {
384 buf[0] = (tmbchar) (0xFC | (c >> 30));
385 buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
386 buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
387 buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
388 buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
389 buf[5] = (tmbchar) (0x80 | (c & 0x3F));
390 bytes = 6;
391 hasError = yes;
392 }
393 else
394 hasError = yes;
395
396 /* don't output invalid UTF-8 byte sequence to a stream */
397 if ( !hasError && outp != NULL )
398 {
399 int ix;
400 for ( ix=0; ix < bytes; ++ix )
401 outp->putByte( outp->sinkData, buf[ix] );
402 }
403
404 #if 1 && defined(_DEBUG)
405 if ( hasError )
406 {
407 int i;
408 fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
409 for (i = 0; i < bytes; i++)
410 fprintf( stderr, "0x%02x ", buf[i] );
411 fprintf( stderr, "\n" );
412 }
413 #endif
414
415 *count = bytes;
416 if (hasError)
417 return -1;
418 return 0;
419 }
420
421
422 /* return one less than the number of bytes used by the UTF-8 byte sequence */
423 /* str points to the UTF-8 byte sequence */
424 /* the Unicode char is returned in *ch */
TY_(GetUTF8)425 uint TY_(GetUTF8)( ctmbstr str, uint *ch )
426 {
427 uint n;
428 int bytes;
429
430 int err;
431
432 bytes = 0;
433
434 /* first byte "str[0]" is passed in separately from the */
435 /* rest of the UTF-8 byte sequence starting at "str[1]" */
436 err = TY_(DecodeUTF8BytesToChar)( &n, str[0], str+1, NULL, &bytes );
437 if (err)
438 {
439 #if 1 && defined(_DEBUG)
440 fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
441 #endif
442 n = 0xFFFD; /* replacement char */
443 }
444
445 *ch = n;
446 return bytes - 1;
447 }
448
449 /* store char c as UTF-8 encoded byte stream */
TY_(PutUTF8)450 tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
451 {
452 int err, count = 0;
453
454 err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
455 if (err)
456 {
457 #if 1 && defined(_DEBUG)
458 fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
459 #endif
460 /* replacement char 0xFFFD encoded as UTF-8 */
461 buf[0] = (byte) 0xEF;
462 buf[1] = (byte) 0xBF;
463 buf[2] = (byte) 0xBD;
464 count = 3;
465 }
466
467 buf += count;
468 return buf;
469 }
470
TY_(IsValidUTF16FromUCS4)471 Bool TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
472 {
473 return ( ucs4 <= kMaxUTF16FromUCS4 );
474 }
475
TY_(IsHighSurrogate)476 Bool TY_(IsHighSurrogate)( tchar ch )
477 {
478 return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd );
479 }
TY_(IsLowSurrogate)480 Bool TY_(IsLowSurrogate)( tchar ch )
481 {
482 return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd );
483 }
484
TY_(CombineSurrogatePair)485 tchar TY_(CombineSurrogatePair)( tchar high, tchar low )
486 {
487 assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );
488 return ( ((low - kUTF16LowSurrogateBegin) * 0x400) +
489 high - kUTF16HighSurrogateBegin + 0x10000 );
490 }
491
TY_(SplitSurrogatePair)492 Bool TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
493 {
494 Bool status = ( TY_(IsValidCombinedChar)( utf16 ) && high && low );
495 if ( status )
496 {
497 *low = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;
498 *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;
499 }
500 return status;
501 }
502
TY_(IsValidCombinedChar)503 Bool TY_(IsValidCombinedChar)( tchar ch )
504 {
505 return ( ch >= kUTF16SurrogatesBegin &&
506 (ch & 0x0000FFFE) != 0x0000FFFE &&
507 (ch & 0x0000FFFF) != 0x0000FFFF );
508 }
509
TY_(IsCombinedChar)510 Bool TY_(IsCombinedChar)( tchar ch )
511 {
512 return ( ch >= kUTF16SurrogatesBegin );
513 }
514
515 /*
516 * local variables:
517 * mode: c
518 * indent-tabs-mode: nil
519 * c-basic-offset: 4
520 * eval: (c-set-offset 'substatement-open 0)
521 * end:
522 */
523