1 /* utf8.c -- convert characters to/from UTF-8
2 
3   (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4   See tidyp.h for the copyright notice.
5 
6   Uses public interfaces to abstract input source and output
7   sink, which may be user supplied or either FILE* or memory
8   based Tidy implementations.  Encoding support is uniform
9   regardless of I/O mechanism.
10 
11   Note, UTF-8 encoding, by itself, does not affect the actual
12   "codepoints" of the underlying character encoding.  In the
13   cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
14   refer to ISO-10646 "codepoints".  For anything else, they
15   refer to some other "codepoint" set.
16 
17   Put another way, UTF-8 is a variable length method to
18   represent any non-negative integer value.  The glyph
  that an integer value represents is unchanged and defined
20   externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
21   Latin2-9, and so on).
22 
23   Put still another way, UTF-8 is more of a _transfer_ encoding
24   than a _character_ encoding, per se.
25 */
26 
27 #include "tidyp.h"
28 #include "forward.h"
29 #include "utf8.h"
30 
31 /*
32 UTF-8 encoding/decoding functions
Return 0 on success, -1 if the sequence is illegal; the number of
bytes in the UTF-8 sequence is stored in *count
34 
35 Also see below for UTF-16 encoding/decoding functions
36 
37 References :
38 
39 1) UCS Transformation Format 8 (UTF-8):
40 ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
41 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
42 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
43 
44 Table 4 - Mapping from UCS-4 to UTF-8
45 
46 2) Unicode standards:
47 <http://www.unicode.org/unicode/standard/standard.html>
48 
49 3) Legal UTF-8 byte sequences:
50 <http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>
51 
52 Code point          1st byte    2nd byte    3rd byte    4th byte
53 ----------          --------    --------    --------    --------
54 U+0000..U+007F      00..7F
55 U+0080..U+07FF      C2..DF      80..BF
56 U+0800..U+0FFF      E0          A0..BF      80..BF
57 U+1000..U+FFFF      E1..EF      80..BF      80..BF
58 U+10000..U+3FFFF    F0          90..BF      80..BF      80..BF
59 U+40000..U+FFFFF    F1..F3      80..BF      80..BF      80..BF
60 U+100000..U+10FFFF  F4          80..8F      80..BF      80..BF
61 
62 The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
63 allows for the use of five- and six-byte sequences to encode
64 characters that are outside the range of the Unicode character
65 set; those five- and six-byte sequences are illegal for the use
66 of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
67 does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
68 (but it does allow other noncharacters).
69 
70 4) RFC 2279: UTF-8, a transformation format of ISO 10646:
71 <http://www.ietf.org/rfc/rfc2279.txt>
72 
73 5) UTF-8 and Unicode FAQ:
74 <http://www.cl.cam.ac.uk/~mgk25/unicode.html>
75 
76 6) Markus Kuhn's UTF-8 decoder stress test file:
77 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
78 
79 7) UTF-8 Demo:
80 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
81 
82 8) UTF-8 Sampler:
83 <http://www.columbia.edu/kermit/utf8.html>
84 
85 9) Transformation Format for 16 Planes of Group 00 (UTF-16):
86 ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
87 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
88 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
89 
90 10) RFC 2781: UTF-16, an encoding of ISO 10646:
91 <http://www.ietf.org/rfc/rfc2781.txt>
92 
93 11) UTF-16 invalid surrogate pairs:
94 <http://www.unicode.org/unicode/faq/utf_bom.html#16>
95 
96 UTF-16       UTF-8          UCS-4
97 D83F DFF*    F0 9F BF B*    0001FFF*
98 D87F DFF*    F0 AF BF B*    0002FFF*
99 D8BF DFF*    F0 BF BF B*    0003FFF*
100 D8FF DFF*    F1 8F BF B*    0004FFF*
101 D93F DFF*    F1 9F BF B*    0005FFF*
102 D97F DFF*    F1 AF BF B*    0006FFF*
103                 ...
104 DBBF DFF*    F3 BF BF B*    000FFFF*
105 DBFF DFF*    F4 8F BF B*    0010FFF*
106 
107 * = E or F
108 
109 1010  A
110 1011  B
111 1100  C
112 1101  D
113 1110  E
114 1111  F
115 
116 */
117 
/* Geometry of the validation tables: the longest legal UTF-8 sequence
** and the number of rows in validUTF8[].
*/
#define kMaxUTF8Bytes            4
#define kNumUTF8Sequences        7

/* Largest value accepted when converting between UCS-4 and UTF-8 */
#define kMaxUTF8FromUCS4         0x10FFFF

/* Values that are rejected by both the decoder and the encoder */
#define kUTF8NotAChar            0xFFFF
#define kUTF8ByteSwapNotAChar    0xFFFE

/* UTF-16: values from kUTF16SurrogatesBegin up to kMaxUTF16FromUCS4
** are represented by a surrogate pair.
*/
#define kMaxUTF16FromUCS4        0x10FFFF
#define kUTF16SurrogatesBegin    0x10000

/* UTF-16 surrogate pair areas.
** NOTE: the names are the reverse of Unicode Standard terminology, where
** D800..DBFF are the "high" (leading) and DC00..DFFF the "low" (trailing)
** surrogates.  The functions in this file use the names as defined here.
*/
#define kUTF16HighSurrogateBegin 0xDC00
#define kUTF16HighSurrogateEnd   0xDFFF
#define kUTF16LowSurrogateBegin  0xD800
#define kUTF16LowSurrogateEnd    0xDBFF
134 
135 
136 /* offsets into validUTF8 table below */
137 static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
138 {
139     0, /* 1 byte */
140     1, /* 2 bytes */
141     2, /* 3 bytes */
142     4, /* 4 bytes */
143     kNumUTF8Sequences /* must be last */
144 };
145 
146 static const struct validUTF8Sequence
147 {
148      uint lowChar;
149      uint highChar;
150      int  numBytes;
151      byte validBytes[8];
152 } validUTF8[kNumUTF8Sequences] =
153 {
154 /*   low       high   #bytes  byte 1      byte 2      byte 3      byte 4 */
155     {0x0000,   0x007F,   1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
156     {0x0080,   0x07FF,   2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
157     {0x0800,   0x0FFF,   3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
158     {0x1000,   0xFFFF,   3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
159     {0x10000,  0x3FFFF,  4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
160     {0x40000,  0xFFFFF,  4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
161     {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
162 };
163 
TY_(DecodeUTF8BytesToChar)164 int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
165                                 TidyInputSource* inp, int* count )
166 {
167     byte tempbuf[10];
168     byte *buf = &tempbuf[0];
169     uint ch = 0, n = 0;
170     int i, bytes = 0;
171     Bool hasError = no;
172 
173     if ( successorBytes )
174         buf = (byte*) successorBytes;
175 
176     /* special check if we have been passed an EOF char */
177     if ( firstByte == EndOfStream )
178     {
179         /* at present */
180         *c = firstByte;
181         *count = 1;
182         return 0;
183     }
184 
185     ch = firstByte; /* first byte is passed in separately */
186 
187     if (ch <= 0x7F) /* 0XXX XXXX one byte */
188     {
189         n = ch;
190         bytes = 1;
191     }
192     else if ((ch & 0xE0) == 0xC0)  /* 110X XXXX  two bytes */
193     {
194         n = ch & 31;
195         bytes = 2;
196     }
197     else if ((ch & 0xF0) == 0xE0)  /* 1110 XXXX  three bytes */
198     {
199         n = ch & 15;
200         bytes = 3;
201     }
202     else if ((ch & 0xF8) == 0xF0)  /* 1111 0XXX  four bytes */
203     {
204         n = ch & 7;
205         bytes = 4;
206     }
207     else if ((ch & 0xFC) == 0xF8)  /* 1111 10XX  five bytes */
208     {
209         n = ch & 3;
210         bytes = 5;
211         hasError = yes;
212     }
213     else if ((ch & 0xFE) == 0xFC)  /* 1111 110X  six bytes */
214     {
215         n = ch & 1;
216         bytes = 6;
217         hasError = yes;
218     }
219     else
220     {
221         /* not a valid first byte of a UTF-8 sequence */
222         n = ch;
223         bytes = 1;
224         hasError = yes;
225     }
226 
227     /* successor bytes should have the form 10XX XXXX */
228 
229     /* If caller supplied buffer, use it.  Else see if caller
230     ** supplied an input source, use that.
231     */
232     if ( successorBytes )
233     {
234         for ( i=0; i < bytes-1; ++i )
235         {
236             if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
237             {
238                 hasError = yes;
239                 bytes = i+1;
240                 break;
241             }
242             n = (n << 6) | (buf[i] & 0x3F);
243         }
244     }
245     else if ( inp )
246     {
247         for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
248         {
249             int b = inp->getByte( inp->sourceData );
250             buf[i] = (tmbchar) b;
251 
252             /* End of data or illegal successor byte value */
253             if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
254             {
255                 hasError = yes;
256                 bytes = i+1;
257                 if ( b != EOF )
258                     inp->ungetByte( inp->sourceData, buf[i] );
259                 break;
260             }
261             n = (n << 6) | (buf[i] & 0x3F);
262         }
263     }
264     else if ( bytes > 1 )
265     {
266         hasError = yes;
267         bytes = 1;
268     }
269 
270     if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
271         hasError = yes;
272 
273     if (!hasError && (n > kMaxUTF8FromUCS4))
274         hasError = yes;
275 
276     if (!hasError)
277     {
278         int lo, hi;
279 
280         lo = offsetUTF8Sequences[bytes - 1];
281         hi = offsetUTF8Sequences[bytes] - 1;
282 
283         /* check for overlong sequences */
284         if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
285             hasError = yes;
286         else
287         {
288             hasError = yes; /* assume error until proven otherwise */
289 
290             for (i = lo; i <= hi; i++)
291             {
292                 int tempCount;
293                 byte theByte;
294 
295                 for (tempCount = 0; tempCount < bytes; tempCount++)
296                 {
297                     if (!tempCount)
298                         theByte = (tmbchar) firstByte;
299                     else
300                         theByte = buf[tempCount - 1];
301 
302                     if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] &&
303                          theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] )
304                         hasError = no;
305                     if (hasError)
306                         break;
307                 }
308             }
309         }
310     }
311 
312 #if 1 && defined(_DEBUG)
313     if ( hasError )
314     {
315        /* debug */
316        fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
317        fprintf( stderr, "0x%02x ", firstByte );
318        for (i = 1; i < bytes; i++)
319            fprintf( stderr, "0x%02x ", buf[i - 1] );
320        fprintf( stderr, " = U+%04ulx\n", n );
321     }
322 #endif
323 
324     *count = bytes;
325     *c = n;
326     if ( hasError )
327         return -1;
328     return 0;
329 }
330 
TY_(EncodeCharToUTF8Bytes)331 int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
332                                 TidyOutputSink* outp, int* count )
333 {
334     byte tempbuf[10] = {0};
335     byte* buf = &tempbuf[0];
336     int bytes = 0;
337     Bool hasError = no;
338 
339     if ( encodebuf )
340         buf = (byte*) encodebuf;
341 
342     if (c <= 0x7F)  /* 0XXX XXXX one byte */
343     {
344         buf[0] = (tmbchar) c;
345         bytes = 1;
346     }
347     else if (c <= 0x7FF)  /* 110X XXXX  two bytes */
348     {
349         buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
350         buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
351         bytes = 2;
352     }
353     else if (c <= 0xFFFF)  /* 1110 XXXX  three bytes */
354     {
355         buf[0] = (tmbchar) (0xE0 | (c >> 12));
356         buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
357         buf[2] = (tmbchar) (0x80 | (c & 0x3F));
358         bytes = 3;
359         if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
360             hasError = yes;
361     }
362     else if (c <= 0x1FFFFF)  /* 1111 0XXX  four bytes */
363     {
364         buf[0] = (tmbchar) (0xF0 | (c >> 18));
365         buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
366         buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
367         buf[3] = (tmbchar) (0x80 | (c & 0x3F));
368         bytes = 4;
369         if (c > kMaxUTF8FromUCS4)
370             hasError = yes;
371     }
372     else if (c <= 0x3FFFFFF)  /* 1111 10XX  five bytes */
373     {
374         buf[0] = (tmbchar) (0xF8 | (c >> 24));
375         buf[1] = (tmbchar) (0x80 | (c >> 18));
376         buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
377         buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
378         buf[4] = (tmbchar) (0x80 | (c & 0x3F));
379         bytes = 5;
380         hasError = yes;
381     }
382     else if (c <= 0x7FFFFFFF)  /* 1111 110X  six bytes */
383     {
384         buf[0] = (tmbchar) (0xFC | (c >> 30));
385         buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
386         buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
387         buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
388         buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
389         buf[5] = (tmbchar) (0x80 | (c & 0x3F));
390         bytes = 6;
391         hasError = yes;
392     }
393     else
394         hasError = yes;
395 
396     /* don't output invalid UTF-8 byte sequence to a stream */
397     if ( !hasError && outp != NULL )
398     {
399         int ix;
400         for ( ix=0; ix < bytes; ++ix )
401           outp->putByte( outp->sinkData, buf[ix] );
402     }
403 
404 #if 1 && defined(_DEBUG)
405     if ( hasError )
406     {
407         int i;
408         fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
409         for (i = 0; i < bytes; i++)
410             fprintf( stderr, "0x%02x ", buf[i] );
411         fprintf( stderr, "\n" );
412     }
413 #endif
414 
415     *count = bytes;
416     if (hasError)
417         return -1;
418     return 0;
419 }
420 
421 
422 /* return one less than the number of bytes used by the UTF-8 byte sequence */
423 /* str points to the UTF-8 byte sequence */
424 /* the Unicode char is returned in *ch */
TY_(GetUTF8)425 uint TY_(GetUTF8)( ctmbstr str, uint *ch )
426 {
427     uint n;
428     int bytes;
429 
430     int err;
431 
432     bytes = 0;
433 
434     /* first byte "str[0]" is passed in separately from the */
435     /* rest of the UTF-8 byte sequence starting at "str[1]" */
436     err = TY_(DecodeUTF8BytesToChar)( &n, str[0], str+1, NULL, &bytes );
437     if (err)
438     {
439 #if 1 && defined(_DEBUG)
440         fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
441 #endif
442         n = 0xFFFD; /* replacement char */
443     }
444 
445     *ch = n;
446     return bytes - 1;
447 }
448 
449 /* store char c as UTF-8 encoded byte stream */
TY_(PutUTF8)450 tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
451 {
452     int err, count = 0;
453 
454     err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
455     if (err)
456     {
457 #if 1 && defined(_DEBUG)
458         fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
459 #endif
460         /* replacement char 0xFFFD encoded as UTF-8 */
461         buf[0] = (byte) 0xEF;
462         buf[1] = (byte) 0xBF;
463         buf[2] = (byte) 0xBD;
464         count = 3;
465     }
466 
467     buf += count;
468     return buf;
469 }
470 
TY_(IsValidUTF16FromUCS4)471 Bool    TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
472 {
473   return ( ucs4 <= kMaxUTF16FromUCS4 );
474 }
475 
TY_(IsHighSurrogate)476 Bool    TY_(IsHighSurrogate)( tchar ch )
477 {
478     return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd );
479 }
TY_(IsLowSurrogate)480 Bool    TY_(IsLowSurrogate)( tchar ch )
481 {
482     return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd );
483 }
484 
TY_(CombineSurrogatePair)485 tchar   TY_(CombineSurrogatePair)( tchar high, tchar low )
486 {
487     assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );
488     return ( ((low - kUTF16LowSurrogateBegin) * 0x400) +
489              high - kUTF16HighSurrogateBegin + 0x10000 );
490 }
491 
TY_(SplitSurrogatePair)492 Bool   TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
493 {
494     Bool status = ( TY_(IsValidCombinedChar)( utf16 ) && high && low );
495     if ( status )
496     {
497         *low  = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;
498         *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;
499     }
500     return status;
501 }
502 
TY_(IsValidCombinedChar)503 Bool    TY_(IsValidCombinedChar)( tchar ch )
504 {
505     return ( ch >= kUTF16SurrogatesBegin &&
506              (ch & 0x0000FFFE) != 0x0000FFFE &&
507              (ch & 0x0000FFFF) != 0x0000FFFF );
508 }
509 
TY_(IsCombinedChar)510 Bool    TY_(IsCombinedChar)( tchar ch )
511 {
512     return ( ch >= kUTF16SurrogatesBegin );
513 }
514 
515 /*
516  * local variables:
517  * mode: c
518  * indent-tabs-mode: nil
519  * c-basic-offset: 4
520  * eval: (c-set-offset 'substatement-open 0)
521  * end:
522  */
523