1 /*
2 * GPAC - Multimedia Framework C SDK
3 *
4 * Authors: Jean Le Feuvre
5 * Copyright (c) Telecom ParisTech 2007-2012
6 * All rights reserved
7 *
8 * This file is part of GPAC / common tools sub-project
9 *
10 * GPAC is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2, or (at your option)
13 * any later version.
14 *
15 * GPAC is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; see the file COPYING. If not, write to
22 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 */
25
26 #ifndef GPAC_DISABLE_CORE_TOOLS
27
28 #include <gpac/utf.h>
29
30
31 #if 1
32
33
34 /*
35 * Copyright 2001-2004 Unicode, Inc.
36 *
37 * Disclaimer
38 *
39 * This source code is provided as is by Unicode, Inc. No claims are
40 * made as to fitness for any particular purpose. No warranties of any
41 * kind are expressed or implied. The recipient agrees to determine
42 * applicability of information provided. If this file has been
43 * purchased on magnetic or optical media from Unicode, Inc., the
44 * sole remedy for any claim will be exchange of defective media
45 * within 90 days of receipt.
46 *
47 * Limitations on Rights to Redistribute This Code
48 *
49 * Unicode, Inc. hereby grants the right to freely use the information
50 * supplied in this file in the creation of products supporting the
51 * Unicode Standard, and to make copies of this file in any form
52 * for internal or external distribution as long as this notice
53 * remains attached.
54 */
55
56 /* ---------------------------------------------------------------------
57
58 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
59 Author: Mark E. Davis, 1994.
60 Rev History: Rick McGowan, fixes & updates May 2001.
61 Sept 2001: fixed const & error conditions per
62 mods suggested by S. Parent & A. Lillich.
63 June 2002: Tim Dodd added detection and handling of incomplete
64 source sequences, enhanced error detection, added casts
65 to eliminate compiler warnings.
66 July 2003: slight mods to back out aggressive FFFE detection.
67 Jan 2004: updated switches in from-UTF8 conversions.
68 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
69
70 See the header file "ConvertUTF.h" for complete documentation.
71
72 ------------------------------------------------------------------------ */
73
/* Code-unit and flag types used by the converters below. */
typedef u32 UTF32; /* at least 32 bits; holds one decoded code point */
typedef u16 UTF16; /* at least 16 bits; one UTF-16 code unit */
typedef u8 UTF8; /* typically 8 bits; one UTF-8 byte */
typedef u8 Boolean; /* 0 or 1 */
78
/*
 * Some fundamental constants.
 * Every expansion is fully parenthesized so the cast stays attached to its
 * value wherever the macro is used (e.g. as the operand of sizeof).
 */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)	/* written in place of unconvertible input */
#define UNI_MAX_BMP ((UTF32)0x0000FFFF)			/* last value that fits a single UTF-16 unit */
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF)		/* last value reachable with a surrogate pair */
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF)
85
/* Outcome reported by the Convert* routines in this file. */
typedef enum {
	conversionOK, /* conversion successful */
	sourceExhausted, /* partial character in source, but hit end */
	targetExhausted, /* insuff. room in target for conversion */
	sourceIllegal /* source sequence is illegal/malformed */
} ConversionResult;
92
/*
 * How the Convert* routines treat surrogate code points and out-of-range values:
 * with strictConversion they stop and report sourceIllegal; with
 * lenientConversion they keep converting (ConvertUTF8toUTF16 then stores
 * UNI_REPLACEMENT_CHAR for the offending character).
 */
typedef enum {
	strictConversion = 0,
	lenientConversion
} ConversionFlags;
97
/* Each surrogate of a pair carries 10 payload bits. */
static const int halfShift = 10; /* used for shifting by 10 bits */

/* First code point encoded with a surrogate pair; removed before splitting, added back when joining. */
static const UTF32 halfBase = 0x0010000UL;
/* Selects the low 10 payload bits stored in the low surrogate. */
static const UTF32 halfMask = 0x3FFUL;
102
/*
 * Bounds of the UTF-16 surrogate ranges: a high surrogate is the first unit
 * of a pair, a low surrogate the second. Expansions are parenthesized so the
 * cast stays attached to its value in any expression.
 */
#define UNI_SUR_HIGH_START ((UTF32)0xD800)
#define UNI_SUR_HIGH_END ((UTF32)0xDBFF)
#define UNI_SUR_LOW_START ((UTF32)0xDC00)
#define UNI_SUR_LOW_END ((UTF32)0xDFFF)

/* Local spellings of the Boolean values; left alone if a header already provides them as macros. */
#ifndef false
#define false 0
#endif
#ifndef true
#define true 1
#endif
109
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes
 * (lead bytes 0xF8-0xFF). The table is left as-is for anyone who may want
 * to do such conversion, which was allowed in earlier algorithms.
 * Bytes 0x80-0xBF map to 0 here; it is isLegalUTF8 that rejects them as
 * lead bytes.
 */
static const char trailingBytesForUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
127
/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence.
 * The decoder accumulates the raw bytes (shift left by 6, add next byte), so
 * the lead-byte marker and the 0x80 marker of every trailing byte end up in
 * the sum; entry [n] is that combined marker value for n trailing bytes,
 * e.g. [1] = (0xC0 << 6) + 0x80 and [2] = (0xE0 << 12) + (0x80 << 6) + 0x80.
 */
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
                                          0x03C82080UL, 0xFA082080UL, 0x82082080UL
                                        };
136
/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow. There are
 * as many entries in this table as there are UTF-8 sequence types
 * (i.e., one-byte sequence, two-byte... etc.). The table is indexed by the
 * total number of bytes in the sequence; index 0 is unused. Remember that
 * sequences for *legal* UTF-8 will be 4 or fewer bytes total.
 */
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
145
146 /* --------------------------------------------------------------------- */
147
/* The interface converts a whole buffer to avoid function-call overhead.
 * Constants have been gathered. Loops & conditionals have been removed as
 * much as possible for efficiency, in favor of drop-through switches.
 * (The original Unicode, Inc. source refers to a "Note A" with the equivalent
 * loop-based code; that note does not appear in this file.)
 * If your compiler supports it, the "isLegalUTF8" call can be turned
 * into an inline function.
 */
155
156 /* --------------------------------------------------------------------- */
157
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)158 ConversionResult ConvertUTF16toUTF8(
159 const UTF16** sourceStart, const UTF16* sourceEnd,
160 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
161 ConversionResult result = conversionOK;
162 const UTF16* source = *sourceStart;
163 UTF8* target = *targetStart;
164 while (source < sourceEnd) {
165 UTF32 ch;
166 unsigned short bytesToWrite = 0;
167 const UTF32 byteMask = 0xBF;
168 const UTF32 byteMark = 0x80;
169 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
170 ch = *source++;
171 /* If we have a surrogate pair, convert to UTF32 first. */
172 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
173 /* If the 16 bits following the high surrogate are in the source buffer... */
174 if (source < sourceEnd) {
175 UTF32 ch2 = *source;
176 /* If it's a low surrogate, convert to UTF32. */
177 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
178 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
179 + (ch2 - UNI_SUR_LOW_START) + halfBase;
180 ++source;
181 }
182 else if (flags == strictConversion) { /* it's an unpaired high surrogate */
183 --source; /* return to the illegal value itself */
184 result = sourceIllegal;
185 break;
186 }
187 }
188 else { /* We don't have the 16 bits following the high surrogate. */
189 --source; /* return to the high surrogate */
190 result = sourceExhausted;
191 break;
192 }
193 }
194 else if (flags == strictConversion) {
195 /* UTF-16 surrogate values are illegal in UTF-32 */
196 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
197 --source; /* return to the illegal value itself */
198 result = sourceIllegal;
199 break;
200 }
201 }
202 /* Figure out how many bytes the result will require */
203 if (ch < (UTF32)0x80) {
204 bytesToWrite = 1;
205 }
206 else if (ch < (UTF32)0x800) {
207 bytesToWrite = 2;
208 }
209 else if (ch < (UTF32)0x10000) {
210 bytesToWrite = 3;
211 }
212 else if (ch < (UTF32)0x110000) {
213 bytesToWrite = 4;
214 }
215 else {
216 bytesToWrite = 3;
217 ch = UNI_REPLACEMENT_CHAR;
218 }
219
220 target += bytesToWrite;
221 if (target > targetEnd) {
222 source = oldSource; /* Back up source pointer! */
223 target -= bytesToWrite;
224 result = targetExhausted;
225 break;
226 }
227 switch (bytesToWrite) { /* note: everything falls through. */
228 case 4:
229 *--target = (UTF8)((ch | byteMark) & byteMask);
230 ch >>= 6;
231 case 3:
232 *--target = (UTF8)((ch | byteMark) & byteMask);
233 ch >>= 6;
234 case 2:
235 *--target = (UTF8)((ch | byteMark) & byteMask);
236 ch >>= 6;
237 case 1:
238 *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
239 }
240 target += bytesToWrite;
241 }
242 *sourceStart = source;
243 *targetStart = target;
244 return result;
245 }
246
247 /*
248 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
249 * This must be called with the length pre-determined by the first byte.
250 * If not calling this from ConvertUTF8to*, then the length can be set by:
251 * length = trailingBytesForUTF8[*source]+1;
252 * and the sequence is illegal right away if there aren't that many bytes
253 * available.
254 * If presented with a length > 4, this returns false. The Unicode
255 * definition of UTF-8 goes up to 4-byte sequences.
256 */
257
isLegalUTF8(const UTF8 * source,int length)258 static Boolean isLegalUTF8(const UTF8 *source, int length) {
259 UTF8 a;
260 const UTF8 *srcptr = source + length;
261 switch (length) {
262 default:
263 return false;
264 /* Everything else falls through when "true"... */
265 case 4:
266 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
267 case 3:
268 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
269 case 2:
270 if ((a = (*--srcptr)) > 0xBF) return false;
271
272 switch (*source) {
273 /* no fall-through in this inner switch */
274 case 0xE0:
275 if (a < 0xA0) return false;
276 break;
277 case 0xED:
278 if (a > 0x9F) return false;
279 break;
280 case 0xF0:
281 if (a < 0x90) return false;
282 break;
283 case 0xF4:
284 if (a > 0x8F) return false;
285 break;
286 default:
287 if (a < 0x80) return false;
288 }
289
290 case 1:
291 if (*source >= 0x80 && *source < 0xC2) return false;
292 }
293 if (*source > 0xF4) return false;
294 return true;
295 }
296
297 /* --------------------------------------------------------------------- */
298
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)299 ConversionResult ConvertUTF8toUTF16(
300 const UTF8** sourceStart, const UTF8* sourceEnd,
301 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
302 ConversionResult result = conversionOK;
303 const UTF8* source = *sourceStart;
304 UTF16* target = *targetStart;
305 while (source < sourceEnd) {
306 UTF32 ch = 0;
307 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
308 if (source + extraBytesToRead >= sourceEnd) {
309 result = sourceExhausted;
310 break;
311 }
312 /* Do this check whether lenient or strict */
313 if (!isLegalUTF8(source, extraBytesToRead + 1)) {
314 result = sourceIllegal;
315 break;
316 }
317 /*
318 * The cases all fall through. See "Note A" below.
319 */
320 switch (extraBytesToRead) {
321 case 5:
322 ch += *source++;
323 ch <<= 6; /* remember, illegal UTF-8 */
324 case 4:
325 ch += *source++;
326 ch <<= 6; /* remember, illegal UTF-8 */
327 case 3:
328 ch += *source++;
329 ch <<= 6;
330 case 2:
331 ch += *source++;
332 ch <<= 6;
333 case 1:
334 ch += *source++;
335 ch <<= 6;
336 case 0:
337 ch += *source++;
338 }
339 ch -= offsetsFromUTF8[extraBytesToRead];
340
341 if (target >= targetEnd) {
342 source -= (extraBytesToRead + 1); /* Back up source pointer! */
343 result = targetExhausted;
344 break;
345 }
346 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
347 /* UTF-16 surrogate values are illegal in UTF-32 */
348 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
349 if (flags == strictConversion) {
350 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
351 result = sourceIllegal;
352 break;
353 }
354 else {
355 *target++ = UNI_REPLACEMENT_CHAR;
356 }
357 }
358 else {
359 *target++ = (UTF16)ch; /* normal case */
360 }
361 }
362 else if (ch > UNI_MAX_UTF16) {
363 if (flags == strictConversion) {
364 result = sourceIllegal;
365 source -= (extraBytesToRead + 1); /* return to the start */
366 break; /* Bail out; shouldn't continue */
367 }
368 else {
369 *target++ = UNI_REPLACEMENT_CHAR;
370 }
371 }
372 else {
373 /* target is a character in range 0xFFFF - 0x10FFFF. */
374 if (target + 1 >= targetEnd) {
375 source -= (extraBytesToRead + 1); /* Back up source pointer! */
376 result = targetExhausted;
377 break;
378 }
379 ch -= halfBase;
380 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
381 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
382 }
383 }
384 *sourceStart = source;
385 *targetStart = target;
386 return result;
387 }
388
389
390
391 GF_EXPORT
/*
 * Returns the number of 16-bit units in s that precede the first zero unit.
 * A NULL string is reported as having length 0.
 */
size_t gf_utf8_wcslen(const unsigned short *s)
{
	const unsigned short *ptr;

	if (!s) return 0;
	for (ptr = s; *ptr != 0; ptr++) {
	}
	return (size_t)(ptr - s);
}
399
400 GF_EXPORT
gf_utf8_wcstombs(char * dest,size_t len,const unsigned short ** srcp)401 size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
402 {
403 const UTF16** sourceStart = srcp;
404 const UTF16* sourceEnd = *srcp + gf_utf8_wcslen(*srcp);
405 UTF8* targetStart = (UTF8*)dest;
406 UTF8* targetEnd = (UTF8*)dest + len;
407 ConversionFlags flags = strictConversion;
408
409 ConversionResult res = ConvertUTF16toUTF8(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
410 if (res != conversionOK) return (size_t)-1;
411 *targetStart = 0;
412 *srcp = NULL;
413 return strlen(dest);
414 }
415
416 GF_EXPORT
gf_utf8_mbstowcs(unsigned short * dest,size_t len,const char ** srcp)417 size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
418 {
419 const UTF8** sourceStart = (const UTF8**)srcp;
420 const UTF8* sourceEnd = (const UTF8*)(*srcp + strlen(*srcp));
421 UTF16* targetStart = (UTF16*)dest;
422 UTF16* targetEnd = (UTF16*)(dest + len);
423 ConversionFlags flags = strictConversion;
424 ConversionResult res = ConvertUTF8toUTF16(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
425 if (res != conversionOK) return (size_t)-1;
426 *targetStart = 0;
427 *srcp = NULL;
428 return gf_utf8_wcslen(dest);
429 }
430
431
432 #else
433
434 GF_EXPORT
/*
 * Returns the number of 16-bit units in s that precede the first zero unit.
 * A NULL string is reported as having length 0.
 */
size_t gf_utf8_wcslen(const unsigned short *s)
{
	const unsigned short *ptr;

	if (!s) return 0;
	for (ptr = s; *ptr != 0; ptr++) {
	}
	return (size_t)(ptr - s);
}
442
443 GF_EXPORT
/*
 * Encodes the zero-terminated 16-bit string *srcp as UTF-8 (adapted from the
 * GNU UTF-8 Library). Every 16-bit unit is encoded on its own as a 1, 2 or
 * 3-byte sequence.
 *
 * dest: output buffer, or NULL to only compute the size needed.
 * len:  capacity of dest in bytes (ignored when dest is NULL).
 * srcp: in/out; set to NULL once the terminator has been processed with room
 *       to store it (or in counting mode), otherwise to the unit at which the
 *       conversion stopped for lack of room.
 *
 * Returns the number of bytes stored (or needed, in counting mode), not
 * counting the terminator.
 */
size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
{
	const unsigned short *cursor = *srcp;
	char *out = dest;

	if (dest == NULL) {
		/* counting mode: dest and len are ignored */
		size_t needed = 0;
		while (*cursor != 0) {
			const unsigned short unit = *cursor++;
			needed += (unit < 0x80) ? 1 : ((unit < 0x800) ? 2 : 3);
		}
		*srcp = NULL;
		return needed;
	}

	for (;; cursor++) {
		const unsigned short unit = *cursor;
		unsigned char lead;
		size_t trailing;

		if (unit == 0) {
			if (len == 0) {
				/* no room left for the terminator: report where we stopped */
				*srcp = cursor;
			} else {
				*out = '\0';
				*srcp = NULL;
			}
			break;
		}

		/* pick the lead byte and the number of continuation bytes */
		if (unit < 0x80) {
			trailing = 0;
			lead = (unsigned char)unit;
		} else if (unit < 0x800) {
			trailing = 1;
			lead = (unsigned char)((unit >> 6) | 0xC0);
		} else {
			trailing = 2;
			lead = (unsigned char)((unit >> 12) | 0xE0);
		}

		/* the whole sequence (trailing + 1 bytes) must fit */
		if (len <= trailing) {
			*srcp = cursor;
			break;
		}
		len -= trailing + 1;

		*out++ = lead;
		while (trailing > 0) {
			trailing--;
			*out++ = (unsigned char)(((unit >> (6 * trailing)) & 0x3F) | 0x80);
		}
	}
	return out - dest;
}
515
516
/*
 * Conversion state consulted by gf_utf8_mbstowcs at the start of each
 * character: a non-zero count means a partially decoded character is pending.
 */
typedef struct
{
	u32 count : 16; /* number of bytes remaining to be processed */
	u32 value : 16; /* if count > 0: partial wide character */
	/*
	   If WCHAR_T_BITS == 16, need 2 bits for count,
	   12 bits for value (10 for mbstowcs direction, 12 for wcstombs direction).
	*/
} gf_utf8_mbstate_t;

/*
 * The single state instance used by gf_utf8_mbstowcs. The code visible in this
 * file only ever resets count to 0, so the state never becomes pending.
 * NOTE(review): being one file-scope object, it is shared by every caller -
 * confirm that is acceptable before enabling this branch.
 */
static gf_utf8_mbstate_t internal;
528
529 GF_EXPORT
gf_utf8_mbstowcs(unsigned short * dest,size_t len,const char ** srcp)530 size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
531 {
532 gf_utf8_mbstate_t* ps = &internal;
533 const char *src = *srcp;
534
535 unsigned short* destptr = dest;
536 for (; len > 0; destptr++, len--) {
537 const char* backup_src = src;
538 unsigned char c;
539 unsigned short wc;
540 size_t count;
541 if (ps->count == 0) {
542 c = (unsigned char)*src;
543 if (c < 0x80) {
544 *destptr = (wchar_t)c;
545 if (c == 0) {
546 src = NULL;
547 break;
548 }
549 src++;
550 continue;
551 }
552 else if (c < 0xC0) {
553 /* Spurious 10XXXXXX byte is invalid. */
554 goto bad_input;
555 }
556 if (c < 0xE0) {
557 wc = (wchar_t)(c & 0x1F) << 6;
558 count = 1;
559 if (c < 0xC2) goto bad_input;
560 }
561 else if (c < 0xF0) {
562 wc = (wchar_t)(c & 0x0F) << 12;
563 count = 2;
564 }
565 else goto bad_input;
566 src++;
567 }
568 else {
569 wc = ps->value << 6;
570 count = ps->count;
571 }
572 for (;;) {
573 c = (unsigned char)*src++ ^ 0x80;
574 if (!(c < 0x40)) goto bad_input_backup;
575 wc |= (unsigned short)c << (6 * --count);
576 if (count == 0)
577 break;
578 /* The following test is only necessary once for every character,
579 but it would be too complicated to perform it once only, on
580 the first pass through this loop. */
581 if ((unsigned short)wc < ((unsigned short)1 << (5 * count + 6)))
582 goto bad_input_backup;
583 }
584 *destptr = wc;
585 ps->count = 0;
586 continue;
587
588 bad_input_backup:
589 src = backup_src;
590 goto bad_input;
591 }
592 *srcp = src;
593 return destptr - dest;
594
595 bad_input:
596 *srcp = src;
597 return (size_t)(-1);
598 }
599
600 #endif
601 #endif /* GPAC_DISABLE_CORE_TOOLS */