1 /* utf8.c -- convert characters to/from UTF-8
2
3 (c) 1998-2004 (W3C) MIT, ERCIM, Keio University
4 See tidy.h for the copyright notice.
5
6 CVS Info :
7
8 $Author: terry_teague $
9 $Date: 2004/08/02 02:32:36 $
10 $Revision: 1.7 $
11
12 Uses public interfaces to abstract input source and output
13 sink, which may be user supplied or either FILE* or memory
14 based Tidy implementations. Encoding support is uniform
15 regardless of I/O mechanism.
16
17 Note, UTF-8 encoding, by itself, does not affect the actual
18 "codepoints" of the underlying character encoding. In the
19 cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
20 refer to ISO-10646 "codepoints". For anything else, they
21 refer to some other "codepoint" set.
22
23 Put another way, UTF-8 is a variable length method to
24 represent any non-negative integer value. The glyph
  that an integer value represents is unchanged and defined
26 externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
27 Latin2-9, and so on).
28
29 Put still another way, UTF-8 is more of a _transfer_ encoding
30 than a _character_ encoding, per se.
31 */
32
33 #include "tidy.h"
34 #include "utf8.h"
35
36 /*
37 UTF-8 encoding/decoding functions
38 Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence
39
40 Also see below for UTF-16 encoding/decoding functions
41
42 References :
43
44 1) UCS Transformation Format 8 (UTF-8):
45 ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
46 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
47 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
48
49 Table 4 - Mapping from UCS-4 to UTF-8
50
51 2) Unicode standards:
52 <http://www.unicode.org/unicode/standard/standard.html>
53
54 3) Legal UTF-8 byte sequences:
55 <http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>
56
57 Code point 1st byte 2nd byte 3rd byte 4th byte
58 ---------- -------- -------- -------- --------
59 U+0000..U+007F 00..7F
60 U+0080..U+07FF C2..DF 80..BF
61 U+0800..U+0FFF E0 A0..BF 80..BF
62 U+1000..U+FFFF E1..EF 80..BF 80..BF
63 U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
64 U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
65 U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
66
67 The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
68 allows for the use of five- and six-byte sequences to encode
69 characters that are outside the range of the Unicode character
70 set; those five- and six-byte sequences are illegal for the use
71 of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
72 does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
73 (but it does allow other noncharacters).
74
75 4) RFC 2279: UTF-8, a transformation format of ISO 10646:
76 <http://www.ietf.org/rfc/rfc2279.txt>
77
78 5) UTF-8 and Unicode FAQ:
79 <http://www.cl.cam.ac.uk/~mgk25/unicode.html>
80
81 6) Markus Kuhn's UTF-8 decoder stress test file:
82 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
83
84 7) UTF-8 Demo:
85 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
86
87 8) UTF-8 Sampler:
88 <http://www.columbia.edu/kermit/utf8.html>
89
90 9) Transformation Format for 16 Planes of Group 00 (UTF-16):
91 ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
92 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
93 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
94
95 10) RFC 2781: UTF-16, an encoding of ISO 10646:
96 <http://www.ietf.org/rfc/rfc2781.txt>
97
98 11) UTF-16 invalid surrogate pairs:
99 <http://www.unicode.org/unicode/faq/utf_bom.html#16>
100
101 UTF-16 UTF-8 UCS-4
102 D83F DFF* F0 9F BF B* 0001FFF*
103 D87F DFF* F0 AF BF B* 0002FFF*
104 D8BF DFF* F0 BF BF B* 0003FFF*
105 D8FF DFF* F1 8F BF B* 0004FFF*
106 D93F DFF* F1 9F BF B* 0005FFF*
107 D97F DFF* F1 AF BF B* 0006FFF*
108 ...
109 DBBF DFF* F3 BF BF B* 000FFFF*
110 DBFF DFF* F4 8F BF B* 0010FFF*
111
112 * = E or F
113
114 1010 A
115 1011 B
116 1100 C
117 1101 D
118 1110 E
119 1111 F
120
121 */
122
/* Number of rows in the validUTF8 table below */
#define kNumUTF8Sequences 7
/* Longest UTF-8 sequence, in bytes, that the decoder accepts as valid */
#define kMaxUTF8Bytes 4

/* The two 16-bit values that are rejected by both the decoder and encoder */
#define kUTF8ByteSwapNotAChar 0xFFFE
#define kUTF8NotAChar 0xFFFF

/* Largest value that may be represented in UTF-8 by this module */
#define kMaxUTF8FromUCS4 0x10FFFF

/* First value that needs a surrogate pair in UTF-16, and the last one
** that can be expressed with one */
#define kUTF16SurrogatesBegin 0x10000
#define kMaxUTF16FromUCS4 0x10FFFF

/* UTF-16 surrogate pair areas */
/* NOTE: in this file "Low" names the D800..DBFF range (the unit whose
** offset is multiplied by 0x400 when a pair is combined) and "High"
** names DC00..DFFF.  The Unicode standard uses the opposite names
** (D800..DBFF are the "high"/lead surrogates), so take care when
** comparing against other code. */
#define kUTF16LowSurrogateBegin 0xD800
#define kUTF16LowSurrogateEnd 0xDBFF
#define kUTF16HighSurrogateBegin 0xDC00
#define kUTF16HighSurrogateEnd 0xDFFF
139
140
141 /* offsets into validUTF8 table below */
142 static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
143 {
144 0, /* 1 byte */
145 1, /* 2 bytes */
146 2, /* 3 bytes */
147 4, /* 4 bytes */
148 kNumUTF8Sequences /* must be last */
149 };
150
151 static const struct validUTF8Sequence
152 {
153 uint lowChar;
154 uint highChar;
155 int numBytes;
156 byte validBytes[8];
157 } validUTF8[kNumUTF8Sequences] =
158 {
159 /* low high #bytes byte 1 byte 2 byte 3 byte 4 */
160 {0x0000, 0x007F, 1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
161 {0x0080, 0x07FF, 2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
162 {0x0800, 0x0FFF, 3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
163 {0x1000, 0xFFFF, 3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
164 {0x10000, 0x3FFFF, 4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
165 {0x40000, 0xFFFFF, 4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
166 {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
167 };
168
/*
** Decode a single UTF-8 byte sequence into the integer it encodes.
**
**   c               receives the decoded value; on error it holds
**                   whatever had been accumulated up to that point
**   firstByte       lead byte of the sequence, passed separately from
**                   the rest; the EndOfStream sentinel is passed
**                   through unchanged with a count of 1
**   successorBytes  if non-NULL, the bytes that follow firstByte.  A NUL
**                   byte or a byte not of the form 10XX XXXX ends the
**                   sequence with an error.
**   inp             consulted only when successorBytes is NULL: the
**                   continuation bytes are pulled with inp->getByte(),
**                   and a byte that turns out not to be a continuation
**                   byte is handed back with inp->ungetByte()
**   count           receives the number of bytes taken by this call,
**                   lead byte included
**
** Returns 0 on success and -1 when the sequence is rejected: invalid
** lead byte, five- or six-byte form, missing or malformed continuation
** byte, overlong form, 0xFFFE, 0xFFFF, or a value above 0x10FFFF.
*/
int DecodeUTF8BytesToChar( uint* c, uint firstByte, ctmbstr successorBytes,
                           TidyInputSource* inp, int* count )
{
    byte tempbuf[10];
    byte *buf = &tempbuf[0];
    uint ch = 0, n = 0;
    int i, bytes = 0;
    Bool hasError = no;

    if ( successorBytes )
        buf = (byte*) successorBytes;

    /* special check if we have been passed an EOF char */
    if ( firstByte == EndOfStream )
    {
        /* at present */
        *c = firstByte;
        *count = 1;
        return 0;
    }

    ch = firstByte; /* first byte is passed in separately */

    if (ch <= 0x7F)  /* 0XXX XXXX one byte */
    {
        n = ch;
        bytes = 1;
    }
    else if ((ch & 0xE0) == 0xC0)  /* 110X XXXX  two bytes */
    {
        n = ch & 31;
        bytes = 2;
    }
    else if ((ch & 0xF0) == 0xE0)  /* 1110 XXXX  three bytes */
    {
        n = ch & 15;
        bytes = 3;
    }
    else if ((ch & 0xF8) == 0xF0)  /* 1111 0XXX  four bytes */
    {
        n = ch & 7;
        bytes = 4;
    }
    else if ((ch & 0xFC) == 0xF8)  /* 1111 10XX  five bytes */
    {
        n = ch & 3;
        bytes = 5;
        hasError = yes;
    }
    else if ((ch & 0xFE) == 0xFC)  /* 1111 110X  six bytes */
    {
        n = ch & 1;
        bytes = 6;
        hasError = yes;
    }
    else
    {
        /* not a valid first byte of a UTF-8 sequence */
        n = ch;
        bytes = 1;
        hasError = yes;
    }

    /* successor bytes should have the form 10XX XXXX */

    /* If caller supplied buffer, use it.  Else see if caller
    ** supplied an input source, use that.
    */
    if ( successorBytes )
    {
        for ( i=0; i < bytes-1; ++i )
        {
            if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
            {
                hasError = yes;
                /* lead byte plus the i good successors; the offending
                ** byte itself is not counted so the caller can resume
                ** decoding at it */
                bytes = i + 1;
                break;
            }
            n = (n << 6) | (buf[i] & 0x3F);
        }
    }
    else if ( inp )
    {
        for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
        {
            int b = inp->getByte( inp->sourceData );
            buf[i] = (byte) b;

            /* End of data or illegal successor byte value */
            if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
            {
                hasError = yes;
                /* lead byte plus the i good successors */
                bytes = i + 1;
                if ( b != EOF )
                    inp->ungetByte( inp->sourceData, buf[i] );
                break;
            }
            n = (n << 6) | (buf[i] & 0x3F);
        }

        /* The source reported eof before the sequence was complete:
        ** flag the truncation and report only what was really read. */
        if ( !hasError && i < bytes-1 )
        {
            hasError = yes;
            bytes = i + 1;
        }
    }
    else if ( bytes > 1 )
    {
        /* multi-byte lead byte but nowhere to get the rest from */
        hasError = yes;
        bytes = 1;
    }

    if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
        hasError = yes;

    if (!hasError && (n > kMaxUTF8FromUCS4))
        hasError = yes;

#if 0 /* Breaks Big5 D8 - DF */
    if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))
        /* unpaired surrogates not allowed */
        hasError = yes;
#endif

    if (!hasError)
    {
        /* bytes is 1..4 here: the longer forms were flagged above */
        int lo = offsetUTF8Sequences[bytes - 1];
        int hi = offsetUTF8Sequences[bytes] - 1;

        /* check for overlong sequences */
        if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
            hasError = yes;
        else
        {
            hasError = yes;  /* assume error until proven otherwise */

            /* The sequence is legal only if EVERY byte lies inside the
            ** ranges of one and the same table row. */
            for (i = lo; hasError && i <= hi; i++)
            {
                int pos;
                Bool rowMatches = yes;

                for (pos = 0; pos < bytes; pos++)
                {
                    byte theByte = pos ? buf[pos - 1] : (byte) firstByte;

                    if ( theByte < validUTF8[i].validBytes[pos * 2] ||
                         theByte > validUTF8[i].validBytes[(pos * 2) + 1] )
                    {
                        rowMatches = no;
                        break;
                    }
                }

                if ( rowMatches )
                    hasError = no;
            }
        }
    }

#if 1 && defined(_DEBUG)
    if ( hasError )
    {
        /* debug */
        fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
        fprintf( stderr, "0x%02x ", firstByte );
        for (i = 1; i < bytes; i++)
            fprintf( stderr, "0x%02x ", buf[i - 1] );
        fprintf( stderr, " = U+%04x\n", n );
    }
#endif

    *count = bytes;
    *c = n;
    if ( hasError )
        return -1;
    return 0;
}
341
/*
** Encode the value c as a UTF-8 byte sequence.
**
**   c          value to encode
**   encodebuf  if non-NULL, receives the encoded bytes.  Up to six
**              bytes are written, and they are written even when the
**              result is reported as an error.  When NULL a local
**              scratch buffer is used instead.
**   outp       if non-NULL and the encoding is valid, every byte is
**              also passed to outp->putByte(); nothing is sent for an
**              invalid encoding
**   count      receives the number of bytes produced (0 when c is
**              above 0x7FFFFFFF and cannot be encoded at all)
**
** Returns 0 on success and -1 when c is 0xFFFE, 0xFFFF, or greater
** than 0x10FFFF (which includes everything needing five or six bytes).
*/
int EncodeCharToUTF8Bytes( uint c, tmbstr encodebuf,
                           TidyOutputSink* outp, int* count )
{
    byte tempbuf[10] = {0};
    byte* buf = &tempbuf[0];
    int bytes = 0;
    Bool hasError = no;

    if ( encodebuf )
        buf = (byte*) encodebuf;

    if (c <= 0x7F)  /* 0XXX XXXX one byte */
    {
        buf[0] = (tmbchar) c;
        bytes = 1;
    }
    else if (c <= 0x7FF)  /* 110X XXXX  two bytes */
    {
        buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
        buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
        bytes = 2;
    }
    else if (c <= 0xFFFF)  /* 1110 XXXX  three bytes */
    {
        buf[0] = (tmbchar) (0xE0 | (c >> 12));
        buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[2] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 3;
        if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
            hasError = yes;
#if 0 /* Breaks Big5 D8 - DF */
        else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd )
            /* unpaired surrogates not allowed */
            hasError = yes;
#endif
    }
    else if (c <= 0x1FFFFF)  /* 1111 0XXX  four bytes */
    {
        buf[0] = (tmbchar) (0xF0 | (c >> 18));
        buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[3] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 4;
        if (c > kMaxUTF8FromUCS4)
            hasError = yes;
    }
    else if (c <= 0x3FFFFFF)  /* 1111 10XX  five bytes */
    {
        buf[0] = (tmbchar) (0xF8 | (c >> 24));
        /* mask to six payload bits so this stays a 10XX XXXX byte */
        buf[1] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[4] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 5;
        hasError = yes;
    }
    else if (c <= 0x7FFFFFFF)  /* 1111 110X  six bytes */
    {
        buf[0] = (tmbchar) (0xFC | (c >> 30));
        buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
        buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[5] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 6;
        hasError = yes;
    }
    else
        hasError = yes;  /* beyond what even the six-byte form can hold */

    /* don't output invalid UTF-8 byte sequence to a stream */
    if ( !hasError && outp != NULL )
    {
        int ix;
        for ( ix=0; ix < bytes; ++ix )
            outp->putByte( outp->sinkData, buf[ix] );
    }

#if 1 && defined(_DEBUG)
    if ( hasError )
    {
        int i;
        fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
        for (i = 0; i < bytes; i++)
            fprintf( stderr, "0x%02x ", buf[i] );
        fprintf( stderr, "\n" );
    }
#endif

    *count = bytes;
    if (hasError)
        return -1;
    return 0;
}
436
437
438 /* return one less than the number of bytes used by the UTF-8 byte sequence */
439 /* str points to the UTF-8 byte sequence */
440 /* the Unicode char is returned in *ch */
GetUTF8(ctmbstr str,uint * ch)441 uint GetUTF8( ctmbstr str, uint *ch )
442 {
443 uint n;
444 int bytes;
445
446 int err;
447
448 bytes = 0;
449
450 /* first byte "str[0]" is passed in separately from the */
451 /* rest of the UTF-8 byte sequence starting at "str[1]" */
452 err = DecodeUTF8BytesToChar( &n, str[0], str+1, NULL, &bytes );
453 if (err)
454 {
455 #if 1 && defined(_DEBUG)
456 fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
457 #endif
458 n = 0xFFFD; /* replacement char */
459 }
460
461 *ch = n;
462 return bytes - 1;
463 }
464
465 /* store char c as UTF-8 encoded byte stream */
PutUTF8(tmbstr buf,uint c)466 tmbstr PutUTF8( tmbstr buf, uint c )
467 {
468 int err, count = 0;
469
470 err = EncodeCharToUTF8Bytes( c, buf, NULL, &count );
471 if (err)
472 {
473 #if 1 && defined(_DEBUG)
474 fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
475 #endif
476 /* replacement char 0xFFFD encoded as UTF-8 */
477 buf[0] = (byte) 0xEF;
478 buf[1] = (byte) 0xBF;
479 buf[2] = (byte) 0xBD;
480 count = 3;
481 }
482
483 buf += count;
484 return buf;
485 }
486
IsValidUTF16FromUCS4(tchar ucs4)487 Bool IsValidUTF16FromUCS4( tchar ucs4 )
488 {
489 return ( ucs4 <= kMaxUTF16FromUCS4 );
490 }
491
IsHighSurrogate(tchar ch)492 Bool IsHighSurrogate( tchar ch )
493 {
494 return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd );
495 }
IsLowSurrogate(tchar ch)496 Bool IsLowSurrogate( tchar ch )
497 {
498 return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd );
499 }
500
/*
** Merge a surrogate pair into the single value it stands for.
** Using this file's naming, the offset of "low" within its range
** supplies the upper bits (scaled by 0x400) and the offset of "high"
** within its range supplies the lower bits; the sum is then moved
** above the 16-bit range.  Both arguments are checked by assert only.
*/
tchar CombineSurrogatePair( tchar high, tchar low )
{
    assert( IsHighSurrogate(high) && IsLowSurrogate(low) );

    return ( kUTF16SurrogatesBegin
             + (low - kUTF16LowSurrogateBegin) * 0x400
             + (high - kUTF16HighSurrogateBegin) );
}
507
SplitSurrogatePair(tchar utf16,tchar * low,tchar * high)508 Bool SplitSurrogatePair( tchar utf16, tchar* low, tchar* high )
509 {
510 Bool status = ( IsValidCombinedChar( utf16 ) && high && low );
511 if ( status )
512 {
513 *low = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;
514 *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;
515 }
516 return status;
517 }
518
IsValidCombinedChar(tchar ch)519 Bool IsValidCombinedChar( tchar ch )
520 {
521 return ( ch >= kUTF16SurrogatesBegin &&
522 (ch & 0x0000FFFE) != 0x0000FFFE &&
523 (ch & 0x0000FFFF) != 0x0000FFFF );
524 }
525
IsCombinedChar(tchar ch)526 Bool IsCombinedChar( tchar ch )
527 {
528 return ( ch >= kUTF16SurrogatesBegin );
529 }
530