1 /* utf8.c -- convert characters to/from UTF-8
2 
3   (c) 1998-2004 (W3C) MIT, ERCIM, Keio University
4   See tidy.h for the copyright notice.
5 
6   CVS Info :
7 
8     $Author: terry_teague $
9     $Date: 2004/08/02 02:32:36 $
10     $Revision: 1.7 $
11 
12   Uses public interfaces to abstract input source and output
13   sink, which may be user supplied or either FILE* or memory
14   based Tidy implementations.  Encoding support is uniform
15   regardless of I/O mechanism.
16 
17   Note, UTF-8 encoding, by itself, does not affect the actual
18   "codepoints" of the underlying character encoding.  In the
19   cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
20   refer to ISO-10646 "codepoints".  For anything else, they
21   refer to some other "codepoint" set.
22 
23   Put another way, UTF-8 is a variable length method to
24   represent any non-negative integer value.  The glyph
  that an integer value represents is unchanged and defined
26   externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
27   Latin2-9, and so on).
28 
29   Put still another way, UTF-8 is more of a _transfer_ encoding
30   than a _character_ encoding, per se.
31 */
32 
33 #include "tidy.h"
34 #include "utf8.h"
35 
36 /*
37 UTF-8 encoding/decoding functions
38 Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence
39 
40 Also see below for UTF-16 encoding/decoding functions
41 
42 References :
43 
44 1) UCS Transformation Format 8 (UTF-8):
45 ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
46 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
47 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
48 
49 Table 4 - Mapping from UCS-4 to UTF-8
50 
51 2) Unicode standards:
52 <http://www.unicode.org/unicode/standard/standard.html>
53 
54 3) Legal UTF-8 byte sequences:
55 <http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>
56 
57 Code point          1st byte    2nd byte    3rd byte    4th byte
58 ----------          --------    --------    --------    --------
59 U+0000..U+007F      00..7F
60 U+0080..U+07FF      C2..DF      80..BF
61 U+0800..U+0FFF      E0          A0..BF      80..BF
62 U+1000..U+FFFF      E1..EF      80..BF      80..BF
63 U+10000..U+3FFFF    F0          90..BF      80..BF      80..BF
64 U+40000..U+FFFFF    F1..F3      80..BF      80..BF      80..BF
65 U+100000..U+10FFFF  F4          80..8F      80..BF      80..BF
66 
67 The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
68 allows for the use of five- and six-byte sequences to encode
69 characters that are outside the range of the Unicode character
70 set; those five- and six-byte sequences are illegal for the use
71 of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
72 does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
73 (but it does allow other noncharacters).
74 
75 4) RFC 2279: UTF-8, a transformation format of ISO 10646:
76 <http://www.ietf.org/rfc/rfc2279.txt>
77 
78 5) UTF-8 and Unicode FAQ:
79 <http://www.cl.cam.ac.uk/~mgk25/unicode.html>
80 
81 6) Markus Kuhn's UTF-8 decoder stress test file:
82 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
83 
84 7) UTF-8 Demo:
85 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
86 
87 8) UTF-8 Sampler:
88 <http://www.columbia.edu/kermit/utf8.html>
89 
90 9) Transformation Format for 16 Planes of Group 00 (UTF-16):
91 ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
92 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
93 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
94 
95 10) RFC 2781: UTF-16, an encoding of ISO 10646:
96 <http://www.ietf.org/rfc/rfc2781.txt>
97 
98 11) UTF-16 invalid surrogate pairs:
99 <http://www.unicode.org/unicode/faq/utf_bom.html#16>
100 
101 UTF-16       UTF-8          UCS-4
102 D83F DFF*    F0 9F BF B*    0001FFF*
103 D87F DFF*    F0 AF BF B*    0002FFF*
104 D8BF DFF*    F0 BF BF B*    0003FFF*
105 D8FF DFF*    F1 8F BF B*    0004FFF*
106 D93F DFF*    F1 9F BF B*    0005FFF*
107 D97F DFF*    F1 AF BF B*    0006FFF*
108                 ...
109 DBBF DFF*    F3 BF BF B*    000FFFF*
110 DBFF DFF*    F4 8F BF B*    0010FFF*
111 
112 * = E or F
113 
114 1010  A
115 1011  B
116 1100  C
117 1101  D
118 1110  E
119 1111  F
120 
121 */
122 
/* Dimensions of the validUTF8 table below: the number of rows it holds
** and the longest byte sequence (in bytes) that it describes.
*/
#define kNumUTF8Sequences        7
#define kMaxUTF8Bytes            4

/* Values that both the decoder and the encoder below report as errors */
#define kUTF8ByteSwapNotAChar    0xFFFE
#define kUTF8NotAChar            0xFFFF

/* Largest value accepted by the UTF-8 decoder/encoder below */
#define kMaxUTF8FromUCS4         0x10FFFF

/* Values from kUTF16SurrogatesBegin upward are treated as "combined"
** characters (see IsCombinedChar); kMaxUTF16FromUCS4 is the largest
** value IsValidUTF16FromUCS4 accepts.
*/
#define kUTF16SurrogatesBegin    0x10000
#define kMaxUTF16FromUCS4        0x10FFFF

/* UTF-16 surrogate pair areas.
** NOTE: in this file "Low" names the range 0xD800..0xDBFF (the unit that
** supplies the upper ten bits in CombineSurrogatePair) and "High" names
** 0xDC00..0xDFFF.  The Unicode Standard uses the opposite terms
** (0xD800..0xDBFF are its "high"/leading surrogates), so take care when
** comparing this code with external references.
*/
#define kUTF16LowSurrogateBegin  0xD800
#define kUTF16LowSurrogateEnd    0xDBFF
#define kUTF16HighSurrogateBegin 0xDC00
#define kUTF16HighSurrogateEnd   0xDFFF
139 
140 
/* offsets into validUTF8 table below.
** For a sequence of k bytes, entry [k - 1] is the index of the first
** validUTF8 row describing k-byte sequences and entry [k] - 1 is the
** index of the last one (this is how DecodeUTF8BytesToChar uses it).
*/
static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
{
    0, /* 1 byte */
    1, /* 2 bytes */
    2, /* 3 bytes */
    4, /* 4 bytes */
    kNumUTF8Sequences /* must be last */
};
150 
/* Table of legal UTF-8 byte sequences, one row per code point range.
** Rows are grouped by sequence length; offsetUTF8Sequences gives the
** first row of each group.
*/
static const struct validUTF8Sequence
{
     uint lowChar;       /* smallest code point covered by this row */
     uint highChar;      /* largest code point covered by this row */
     int  numBytes;      /* length of the byte sequence for this row */
     byte validBytes[8]; /* (min, max) pair for each of up to 4 bytes */
} validUTF8[kNumUTF8Sequences] =
{
/*   low       high   #bytes  byte 1      byte 2      byte 3      byte 4 */
    {0x0000,   0x007F,   1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
    {0x0080,   0x07FF,   2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
    {0x0800,   0x0FFF,   3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
    {0x1000,   0xFFFF,   3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
    {0x10000,  0x3FFFF,  4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
    {0x40000,  0xFFFFF,  4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
    {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
};
168 
/* Decode a single UTF-8 byte sequence.
**
**   c               receives the decoded value (written even on error)
**   firstByte       first byte of the sequence, or EndOfStream
**   successorBytes  if non-NULL, the bytes that follow firstByte
**   inp             used to read the following bytes when
**                   successorBytes is NULL; may itself be NULL
**   count           receives the number of bytes consumed, including
**                   the first byte (always >= 1)
**
** Returns 0 on success and -1 for an illegal sequence.  EndOfStream is
** passed through unchanged with *count == 1 and a return value of 0.
*/
int DecodeUTF8BytesToChar( uint* c, uint firstByte, ctmbstr successorBytes,
                           TidyInputSource* inp, int* count )
{
    byte tempbuf[10] = {0};
    byte *buf = &tempbuf[0];
    uint ch = 0, n = 0;
    int i, bytes = 0;
    Bool hasError = no;

    if ( successorBytes )
        buf = (byte*) successorBytes;

    /* special check if we have been passed an EOF char */
    if ( firstByte == EndOfStream )
    {
        /* at present */
        *c = firstByte;
        *count = 1;
        return 0;
    }

    ch = firstByte; /* first byte is passed in separately */

    if (ch <= 0x7F) /* 0XXX XXXX one byte */
    {
        n = ch;
        bytes = 1;
    }
    else if ((ch & 0xE0) == 0xC0)  /* 110X XXXX  two bytes */
    {
        n = ch & 31;
        bytes = 2;
    }
    else if ((ch & 0xF0) == 0xE0)  /* 1110 XXXX  three bytes */
    {
        n = ch & 15;
        bytes = 3;
    }
    else if ((ch & 0xF8) == 0xF0)  /* 1111 0XXX  four bytes */
    {
        n = ch & 7;
        bytes = 4;
    }
    else if ((ch & 0xFC) == 0xF8)  /* 1111 10XX  five bytes */
    {
        n = ch & 3;
        bytes = 5;
        hasError = yes;
    }
    else if ((ch & 0xFE) == 0xFC)  /* 1111 110X  six bytes */
    {
        n = ch & 1;
        bytes = 6;
        hasError = yes;
    }
    else
    {
        /* not a valid first byte of a UTF-8 sequence */
        n = ch;
        bytes = 1;
        hasError = yes;
    }

    /* successor bytes should have the form 10XX XXXX */

    /* If caller supplied buffer, use it.  Else see if caller
    ** supplied an input source, use that.
    */
    if ( successorBytes )
    {
        for ( i=0; i < bytes-1; ++i )
        {
            if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
            {
                hasError = yes;
                /* first byte plus the i good successor bytes were
                ** consumed; never report fewer than one byte
                */
                bytes = i + 1;
                break;
            }
            n = (n << 6) | (buf[i] & 0x3F);
        }
    }
    else if ( inp )
    {
        for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
        {
            int b = inp->getByte( inp->sourceData );
            buf[i] = (tmbchar) b;

            /* End of data or illegal successor byte value */
            if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
            {
                hasError = yes;
                /* the offending byte is pushed back (or was EOF), so
                ** only the first byte and i successors were consumed
                */
                bytes = i + 1;
                if ( b != EOF )
                    inp->ungetByte( inp->sourceData, buf[i] );
                break;
            }
            n = (n << 6) | (buf[i] & 0x3F);
        }

        /* input ended before the sequence was complete */
        if ( i < bytes - 1 )
        {
            hasError = yes;
            bytes = i + 1;
        }
    }
    else if ( bytes > 1 )
    {
        hasError = yes;
        bytes = 1;
    }

    if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
        hasError = yes;

    if (!hasError && (n > kMaxUTF8FromUCS4))
        hasError = yes;

#if 0 /* Breaks Big5 D8 - DF */
    if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))
        /* unpaired surrogates not allowed */
        hasError = yes;
#endif

    if (!hasError)
    {
        int lo, hi;

        /* rows of validUTF8 that describe sequences of this length */
        lo = offsetUTF8Sequences[bytes - 1];
        hi = offsetUTF8Sequences[bytes] - 1;

        /* check for overlong sequences */
        if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
            hasError = yes;
        else
        {
            hasError = yes; /* assume error until a table row matches */

            /* the sequence is legal only if every byte falls inside
            ** the (min, max) ranges of one and the same table row
            */
            for (i = lo; i <= hi && hasError; i++)
            {
                int tempCount;
                Bool rowMatches = yes;

                for (tempCount = 0; tempCount < bytes; tempCount++)
                {
                    byte theByte;

                    if (!tempCount)
                        theByte = (tmbchar) firstByte;
                    else
                        theByte = buf[tempCount - 1];

                    if ( theByte < validUTF8[i].validBytes[(tempCount * 2)] ||
                         theByte > validUTF8[i].validBytes[(tempCount * 2) + 1] )
                    {
                        rowMatches = no;
                        break;
                    }
                }

                if (rowMatches)
                    hasError = no;
            }
        }
    }

#if 1 && defined(_DEBUG)
    if ( hasError )
    {
       /* debug */
       fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
       fprintf( stderr, "0x%02x ", firstByte );
       for (i = 1; i < bytes; i++)
           fprintf( stderr, "0x%02x ", buf[i - 1] );
       fprintf( stderr, " = U+%04x\n", n );
    }
#endif

    *count = bytes;
    *c = n;
    if ( hasError )
        return -1;
    return 0;
}
341 
/* Encode the value c as a UTF-8 byte sequence.
**
**   c          value to encode
**   encodebuf  if non-NULL, receives the encoded bytes (up to six are
**              written, even when an error is reported)
**   outp       if non-NULL, each byte is sent to this sink, but only
**              when the sequence is valid
**   count      receives the number of bytes produced
**
** Returns 0 on success and -1 when c may not be encoded: 0xFFFE,
** 0xFFFF, or anything above kMaxUTF8FromUCS4 (which includes every
** value that would need five or six bytes).
*/
int EncodeCharToUTF8Bytes( uint c, tmbstr encodebuf,
                           TidyOutputSink* outp, int* count )
{
    byte tempbuf[10] = {0};
    byte* buf = &tempbuf[0];
    int bytes = 0;
    Bool hasError = no;

    if ( encodebuf )
        buf = (byte*) encodebuf;

    if (c <= 0x7F)  /* 0XXX XXXX one byte */
    {
        buf[0] = (tmbchar) c;
        bytes = 1;
    }
    else if (c <= 0x7FF)  /* 110X XXXX  two bytes */
    {
        buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
        buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
        bytes = 2;
    }
    else if (c <= 0xFFFF)  /* 1110 XXXX  three bytes */
    {
        buf[0] = (tmbchar) (0xE0 | (c >> 12));
        buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[2] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 3;
        if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
            hasError = yes;
#if 0 /* Breaks Big5 D8 - DF */
        else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd )
            /* unpaired surrogates not allowed */
            hasError = yes;
#endif
    }
    else if (c <= 0x1FFFFF)  /* 1111 0XXX  four bytes */
    {
        buf[0] = (tmbchar) (0xF0 | (c >> 18));
        buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[3] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 4;
        if (c > kMaxUTF8FromUCS4)
            hasError = yes;
    }
    else if (c <= 0x3FFFFFF)  /* 1111 10XX  five bytes */
    {
        buf[0] = (tmbchar) (0xF8 | (c >> 24));
        /* mask to six bits so this stays a 10XX XXXX successor byte */
        buf[1] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[4] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 5;
        hasError = yes;
    }
    else if (c <= 0x7FFFFFFF)  /* 1111 110X  six bytes */
    {
        buf[0] = (tmbchar) (0xFC | (c >> 30));
        buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
        buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[5] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 6;
        hasError = yes;
    }
    else
        hasError = yes;

    /* don't output invalid UTF-8 byte sequence to a stream */
    if ( !hasError && outp != NULL )
    {
        int ix;
        for ( ix=0; ix < bytes; ++ix )
          outp->putByte( outp->sinkData, buf[ix] );
    }

#if 1 && defined(_DEBUG)
    if ( hasError )
    {
        int i;
        fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
        for (i = 0; i < bytes; i++)
            fprintf( stderr, "0x%02x ", buf[i] );
        fprintf( stderr, "\n" );
    }
#endif

    *count = bytes;
    if (hasError)
        return -1;
    return 0;
}
436 
437 
438 /* return one less than the number of bytes used by the UTF-8 byte sequence */
439 /* str points to the UTF-8 byte sequence */
440 /* the Unicode char is returned in *ch */
GetUTF8(ctmbstr str,uint * ch)441 uint GetUTF8( ctmbstr str, uint *ch )
442 {
443     uint n;
444     int bytes;
445 
446     int err;
447 
448     bytes = 0;
449 
450     /* first byte "str[0]" is passed in separately from the */
451     /* rest of the UTF-8 byte sequence starting at "str[1]" */
452     err = DecodeUTF8BytesToChar( &n, str[0], str+1, NULL, &bytes );
453     if (err)
454     {
455 #if 1 && defined(_DEBUG)
456         fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
457 #endif
458         n = 0xFFFD; /* replacement char */
459     }
460 
461     *ch = n;
462     return bytes - 1;
463 }
464 
465 /* store char c as UTF-8 encoded byte stream */
PutUTF8(tmbstr buf,uint c)466 tmbstr PutUTF8( tmbstr buf, uint c )
467 {
468     int err, count = 0;
469 
470     err = EncodeCharToUTF8Bytes( c, buf, NULL, &count );
471     if (err)
472     {
473 #if 1 && defined(_DEBUG)
474         fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
475 #endif
476         /* replacement char 0xFFFD encoded as UTF-8 */
477         buf[0] = (byte) 0xEF;
478         buf[1] = (byte) 0xBF;
479         buf[2] = (byte) 0xBD;
480         count = 3;
481     }
482 
483     buf += count;
484     return buf;
485 }
486 
IsValidUTF16FromUCS4(tchar ucs4)487 Bool    IsValidUTF16FromUCS4( tchar ucs4 )
488 {
489   return ( ucs4 <= kMaxUTF16FromUCS4 );
490 }
491 
IsHighSurrogate(tchar ch)492 Bool    IsHighSurrogate( tchar ch )
493 {
494     return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd );
495 }
IsLowSurrogate(tchar ch)496 Bool    IsLowSurrogate( tchar ch )
497 {
498     return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd );
499 }
500 
/* Merge a surrogate pair into a single value of 0x10000 or above.
** "low" (0xD800..0xDBFF in this file's naming) supplies the upper ten
** bits and "high" (0xDC00..0xDFFF) the lower ten bits.
*/
tchar   CombineSurrogatePair( tchar high, tchar low )
{
    tchar upperBits, lowerBits;

    assert( IsHighSurrogate(high) && IsLowSurrogate(low) );

    upperBits = (low - kUTF16LowSurrogateBegin) * 0x400;
    lowerBits = high - kUTF16HighSurrogateBegin;

    return ( upperBits + lowerBits + 0x10000 );
}
507 
SplitSurrogatePair(tchar utf16,tchar * low,tchar * high)508 Bool   SplitSurrogatePair( tchar utf16, tchar* low, tchar* high )
509 {
510     Bool status = ( IsValidCombinedChar( utf16 ) && high && low );
511     if ( status )
512     {
513         *low  = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;
514         *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;
515     }
516     return status;
517 }
518 
IsValidCombinedChar(tchar ch)519 Bool    IsValidCombinedChar( tchar ch )
520 {
521     return ( ch >= kUTF16SurrogatesBegin &&
522              (ch & 0x0000FFFE) != 0x0000FFFE &&
523              (ch & 0x0000FFFF) != 0x0000FFFF );
524 }
525 
IsCombinedChar(tchar ch)526 Bool    IsCombinedChar( tchar ch )
527 {
528     return ( ch >= kUTF16SurrogatesBegin );
529 }
530