1 /*
2 *			GPAC - Multimedia Framework C SDK
3 *
4 *			Authors: Jean Le Feuvre
5 *			Copyright (c) Telecom ParisTech 2007-2012
6 *					All rights reserved
7 *
8 *  This file is part of GPAC / common tools sub-project
9 *
10 *  GPAC is free software; you can redistribute it and/or modify
11 *  it under the terms of the GNU Lesser General Public License as published by
12 *  the Free Software Foundation; either version 2, or (at your option)
13 *  any later version.
14 *
15 *  GPAC is distributed in the hope that it will be useful,
16 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
17 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 *  GNU Lesser General Public License for more details.
19 *
20 *  You should have received a copy of the GNU Lesser General Public
21 *  License along with this library; see the file COPYING.  If not, write to
22 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 */
25 
26 #ifndef GPAC_DISABLE_CORE_TOOLS
27 
28 #include <gpac/utf.h>
29 
30 
31 #if 1
32 
33 
34 /*
35 * Copyright 2001-2004 Unicode, Inc.
36 *
37 * Disclaimer
38 *
39 * This source code is provided as is by Unicode, Inc. No claims are
40 * made as to fitness for any particular purpose. No warranties of any
41 * kind are expressed or implied. The recipient agrees to determine
42 * applicability of information provided. If this file has been
43 * purchased on magnetic or optical media from Unicode, Inc., the
44 * sole remedy for any claim will be exchange of defective media
45 * within 90 days of receipt.
46 *
47 * Limitations on Rights to Redistribute This Code
48 *
49 * Unicode, Inc. hereby grants the right to freely use the information
50 * supplied in this file in the creation of products supporting the
51 * Unicode Standard, and to make copies of this file in any form
52 * for internal or external distribution as long as this notice
53 * remains attached.
54 */
55 
56 /* ---------------------------------------------------------------------
57 
58 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
59 Author: Mark E. Davis, 1994.
60 Rev History: Rick McGowan, fixes & updates May 2001.
61 Sept 2001: fixed const & error conditions per
62 mods suggested by S. Parent & A. Lillich.
63 June 2002: Tim Dodd added detection and handling of incomplete
64 source sequences, enhanced error detection, added casts
65 to eliminate compiler warnings.
66 July 2003: slight mods to back out aggressive FFFE detection.
67 Jan 2004: updated switches in from-UTF8 conversions.
68 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
69 
70 See the header file "ConvertUTF.h" for complete documentation.
71 
72 ------------------------------------------------------------------------ */
73 
74 typedef u32 UTF32;	/* at least 32 bits */
75 typedef u16 UTF16;	/* at least 16 bits */
76 typedef u8 UTF8;	/* typically 8 bits */
77 typedef u8 Boolean; /* 0 or 1 */
78 
79 					/* Some fundamental constants */
80 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
81 #define UNI_MAX_BMP (UTF32)0x0000FFFF
82 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
83 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
84 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
85 
/* Status reported by the Convert* routines. */
typedef enum {
	conversionOK = 0,	/* conversion successful */
	sourceExhausted = 1,	/* partial character in source, but hit end */
	targetExhausted = 2,	/* insufficient room in target for conversion */
	sourceIllegal = 3	/* source sequence is illegal/malformed */
} ConversionResult;
92 
/*
 * Selects how the Convert* routines react to illegal input:
 * strictConversion makes them stop with sourceIllegal, lenientConversion
 * lets them carry on where the routine has a lenient path.
 */
typedef enum {
	strictConversion = 0,
	lenientConversion = 1
} ConversionFlags;
97 
98 static const int halfShift = 10; /* used for shifting by 10 bits */
99 
100 static const UTF32 halfBase = 0x0010000UL;
101 static const UTF32 halfMask = 0x3FFUL;
102 
103 #define UNI_SUR_HIGH_START  (UTF32)0xD800
104 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
105 #define UNI_SUR_LOW_START   (UTF32)0xDC00
106 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
107 #define false	   0
108 #define true	    1
109 
/*
* Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it.
* Note that *legal* UTF-8 never has 4 or 5 trailing bytes. Those entries
* are left as-is for anyone who may want to do such conversion, which was
* allowed in earlier algorithms.
* The table is laid out one row per high nibble of the first byte.
*/
static const char trailingBytesForUTF8[256] = {
	/* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	/* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
127 
128 /*
129 * Magic values subtracted from a buffer value during UTF8 conversion.
130 * This table contains as many values as there might be trailing bytes
131 * in a UTF-8 sequence.
132 */
133 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
134 0x03C82080UL, 0xFA082080UL, 0x82082080UL
135 };
136 
137 /*
138 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
139 * into the first byte, depending on how many bytes follow.  There are
140 * as many entries in this table as there are UTF-8 sequence types.
141 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
142 * for *legal* UTF-8 will be 4 or fewer bytes total.
143 */
144 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
145 
146 /* --------------------------------------------------------------------- */
147 
/* The interface converts a whole buffer to avoid function-call overhead.
* Constants have been gathered. Loops & conditionals have been removed as
* much as possible for efficiency, in favor of drop-through switches.
* (The original Unicode, Inc. sources refer to a "Note A" at the bottom of
* the file for equivalent loop-based code; that note does not appear in
* this file.)
* If your compiler supports it, the "isLegalUTF8" call can be turned
* into an inline function.
*/
155 
156 /* --------------------------------------------------------------------- */
157 
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)158 ConversionResult ConvertUTF16toUTF8(
159 	const UTF16** sourceStart, const UTF16* sourceEnd,
160 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
161 	ConversionResult result = conversionOK;
162 	const UTF16* source = *sourceStart;
163 	UTF8* target = *targetStart;
164 	while (source < sourceEnd) {
165 		UTF32 ch;
166 		unsigned short bytesToWrite = 0;
167 		const UTF32 byteMask = 0xBF;
168 		const UTF32 byteMark = 0x80;
169 		const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
170 		ch = *source++;
171 		/* If we have a surrogate pair, convert to UTF32 first. */
172 		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
173 			/* If the 16 bits following the high surrogate are in the source buffer... */
174 			if (source < sourceEnd) {
175 				UTF32 ch2 = *source;
176 				/* If it's a low surrogate, convert to UTF32. */
177 				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
178 					ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
179 						+ (ch2 - UNI_SUR_LOW_START) + halfBase;
180 					++source;
181 				}
182 				else if (flags == strictConversion) { /* it's an unpaired high surrogate */
183 					--source; /* return to the illegal value itself */
184 					result = sourceIllegal;
185 					break;
186 				}
187 			}
188 			else { /* We don't have the 16 bits following the high surrogate. */
189 				--source; /* return to the high surrogate */
190 				result = sourceExhausted;
191 				break;
192 			}
193 		}
194 		else if (flags == strictConversion) {
195 			/* UTF-16 surrogate values are illegal in UTF-32 */
196 			if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
197 				--source; /* return to the illegal value itself */
198 				result = sourceIllegal;
199 				break;
200 			}
201 		}
202 		/* Figure out how many bytes the result will require */
203 		if (ch < (UTF32)0x80) {
204 			bytesToWrite = 1;
205 		}
206 		else if (ch < (UTF32)0x800) {
207 			bytesToWrite = 2;
208 		}
209 		else if (ch < (UTF32)0x10000) {
210 			bytesToWrite = 3;
211 		}
212 		else if (ch < (UTF32)0x110000) {
213 			bytesToWrite = 4;
214 		}
215 		else {
216 			bytesToWrite = 3;
217 			ch = UNI_REPLACEMENT_CHAR;
218 		}
219 
220 		target += bytesToWrite;
221 		if (target > targetEnd) {
222 			source = oldSource; /* Back up source pointer! */
223 			target -= bytesToWrite;
224 			result = targetExhausted;
225 			break;
226 		}
227 		switch (bytesToWrite) { /* note: everything falls through. */
228 		case 4:
229 			*--target = (UTF8)((ch | byteMark) & byteMask);
230 			ch >>= 6;
231 		case 3:
232 			*--target = (UTF8)((ch | byteMark) & byteMask);
233 			ch >>= 6;
234 		case 2:
235 			*--target = (UTF8)((ch | byteMark) & byteMask);
236 			ch >>= 6;
237 		case 1:
238 			*--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
239 		}
240 		target += bytesToWrite;
241 	}
242 	*sourceStart = source;
243 	*targetStart = target;
244 	return result;
245 }
246 
247 /*
248 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
249 * This must be called with the length pre-determined by the first byte.
250 * If not calling this from ConvertUTF8to*, then the length can be set by:
251 *  length = trailingBytesForUTF8[*source]+1;
252 * and the sequence is illegal right away if there aren't that many bytes
253 * available.
254 * If presented with a length > 4, this returns false.  The Unicode
255 * definition of UTF-8 goes up to 4-byte sequences.
256 */
257 
isLegalUTF8(const UTF8 * source,int length)258 static Boolean isLegalUTF8(const UTF8 *source, int length) {
259 	UTF8 a;
260 	const UTF8 *srcptr = source + length;
261 	switch (length) {
262 	default:
263 		return false;
264 		/* Everything else falls through when "true"... */
265 	case 4:
266 		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
267 	case 3:
268 		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
269 	case 2:
270 		if ((a = (*--srcptr)) > 0xBF) return false;
271 
272 		switch (*source) {
273 			/* no fall-through in this inner switch */
274 		case 0xE0:
275 			if (a < 0xA0) return false;
276 			break;
277 		case 0xED:
278 			if (a > 0x9F) return false;
279 			break;
280 		case 0xF0:
281 			if (a < 0x90) return false;
282 			break;
283 		case 0xF4:
284 			if (a > 0x8F) return false;
285 			break;
286 		default:
287 			if (a < 0x80) return false;
288 		}
289 
290 	case 1:
291 		if (*source >= 0x80 && *source < 0xC2) return false;
292 	}
293 	if (*source > 0xF4) return false;
294 	return true;
295 }
296 
297 /* --------------------------------------------------------------------- */
298 
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)299 ConversionResult ConvertUTF8toUTF16(
300 	const UTF8** sourceStart, const UTF8* sourceEnd,
301 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
302 	ConversionResult result = conversionOK;
303 	const UTF8* source = *sourceStart;
304 	UTF16* target = *targetStart;
305 	while (source < sourceEnd) {
306 		UTF32 ch = 0;
307 		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
308 		if (source + extraBytesToRead >= sourceEnd) {
309 			result = sourceExhausted;
310 			break;
311 		}
312 		/* Do this check whether lenient or strict */
313 		if (!isLegalUTF8(source, extraBytesToRead + 1)) {
314 			result = sourceIllegal;
315 			break;
316 		}
317 		/*
318 		* The cases all fall through. See "Note A" below.
319 		*/
320 		switch (extraBytesToRead) {
321 		case 5:
322 			ch += *source++;
323 			ch <<= 6; /* remember, illegal UTF-8 */
324 		case 4:
325 			ch += *source++;
326 			ch <<= 6; /* remember, illegal UTF-8 */
327 		case 3:
328 			ch += *source++;
329 			ch <<= 6;
330 		case 2:
331 			ch += *source++;
332 			ch <<= 6;
333 		case 1:
334 			ch += *source++;
335 			ch <<= 6;
336 		case 0:
337 			ch += *source++;
338 		}
339 		ch -= offsetsFromUTF8[extraBytesToRead];
340 
341 		if (target >= targetEnd) {
342 			source -= (extraBytesToRead + 1); /* Back up source pointer! */
343 			result = targetExhausted;
344 			break;
345 		}
346 		if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
347 								 /* UTF-16 surrogate values are illegal in UTF-32 */
348 			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
349 				if (flags == strictConversion) {
350 					source -= (extraBytesToRead + 1); /* return to the illegal value itself */
351 					result = sourceIllegal;
352 					break;
353 				}
354 				else {
355 					*target++ = UNI_REPLACEMENT_CHAR;
356 				}
357 			}
358 			else {
359 				*target++ = (UTF16)ch; /* normal case */
360 			}
361 		}
362 		else if (ch > UNI_MAX_UTF16) {
363 			if (flags == strictConversion) {
364 				result = sourceIllegal;
365 				source -= (extraBytesToRead + 1); /* return to the start */
366 				break; /* Bail out; shouldn't continue */
367 			}
368 			else {
369 				*target++ = UNI_REPLACEMENT_CHAR;
370 			}
371 		}
372 		else {
373 			/* target is a character in range 0xFFFF - 0x10FFFF. */
374 			if (target + 1 >= targetEnd) {
375 				source -= (extraBytesToRead + 1); /* Back up source pointer! */
376 				result = targetExhausted;
377 				break;
378 			}
379 			ch -= halfBase;
380 			*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
381 			*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
382 		}
383 	}
384 	*sourceStart = source;
385 	*targetStart = target;
386 	return result;
387 }
388 
389 
390 
391 GF_EXPORT
gf_utf8_wcslen(const unsigned short * s)392 size_t gf_utf8_wcslen(const unsigned short *s)
393 {
394 	const unsigned short* ptr;
395 	for (ptr = s; *ptr != (unsigned short)'\0'; ptr++) {
396 	}
397 	return ptr - s;
398 }
399 
400 GF_EXPORT
gf_utf8_wcstombs(char * dest,size_t len,const unsigned short ** srcp)401 size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
402 {
403 	const UTF16** sourceStart = srcp;
404 	const UTF16* sourceEnd = *srcp + gf_utf8_wcslen(*srcp);
405 	UTF8* targetStart = (UTF8*)dest;
406 	UTF8* targetEnd = (UTF8*)dest + len;
407 	ConversionFlags flags = strictConversion;
408 
409 	ConversionResult res = ConvertUTF16toUTF8(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
410 	if (res != conversionOK) return (size_t)-1;
411 	*targetStart = 0;
412 	*srcp = NULL;
413 	return strlen(dest);
414 }
415 
416 GF_EXPORT
gf_utf8_mbstowcs(unsigned short * dest,size_t len,const char ** srcp)417 size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
418 {
419 	const UTF8** sourceStart = (const UTF8**)srcp;
420 	const UTF8* sourceEnd = (const UTF8*)(*srcp + strlen(*srcp));
421 	UTF16* targetStart = (UTF16*)dest;
422 	UTF16* targetEnd = (UTF16*)(dest + len);
423 	ConversionFlags flags = strictConversion;
424 	ConversionResult res = ConvertUTF8toUTF16(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
425 	if (res != conversionOK) return (size_t)-1;
426 	*targetStart = 0;
427 	*srcp = NULL;
428 	return gf_utf8_wcslen(dest);
429 }
430 
431 
432 #else
433 
434 GF_EXPORT
gf_utf8_wcslen(const unsigned short * s)435 size_t gf_utf8_wcslen(const unsigned short *s)
436 {
437 	const unsigned short* ptr;
438 	for (ptr = s; *ptr != (unsigned short)'\0'; ptr++) {
439 	}
440 	return ptr - s;
441 }
442 
443 GF_EXPORT
gf_utf8_wcstombs(char * dest,size_t len,const unsigned short ** srcp)444 size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
445 {
446 	/*
447 	* Original code from the GNU UTF-8 Library
448 	*/
449 	size_t count;
450 	const unsigned short * src = *srcp;
451 
452 	if (dest != NULL) {
453 		char* destptr = dest;
454 		for (;; src++) {
455 			unsigned char c;
456 			unsigned short wc = *src;
457 			if (wc < 0x80) {
458 				if (wc == (wchar_t)'\0') {
459 					if (len == 0) {
460 						*srcp = src;
461 						break;
462 					}
463 					*destptr = '\0';
464 					*srcp = NULL;
465 					break;
466 				}
467 				count = 0;
468 				c = (unsigned char)wc;
469 			}
470 			else if (wc < 0x800) {
471 				count = 1;
472 				c = (unsigned char)((wc >> 6) | 0xC0);
473 			}
474 			else {
475 				count = 2;
476 				c = (unsigned char)((wc >> 12) | 0xE0);
477 			}
478 			if (len <= count) {
479 				*srcp = src;
480 				break;
481 			}
482 			len -= count + 1;
483 			*destptr++ = c;
484 			if (count > 0)
485 				do {
486 					*destptr++ = (unsigned char)(((wc >> (6 * --count)) & 0x3F) | 0x80);
487 				} while (count > 0);
488 		}
489 		return destptr - dest;
490 	}
491 	else {
492 		/* Ignore dest and len. */
493 		size_t totalcount = 0;
494 		for (;; src++) {
495 			unsigned short wc = *src;
496 			size_t count;
497 			if (wc < 0x80) {
498 				if (wc == (wchar_t)'\0') {
499 					*srcp = NULL;
500 					break;
501 				}
502 				count = 1;
503 			}
504 			else if (wc < 0x800) {
505 				count = 2;
506 			}
507 			else {
508 				count = 3;
509 			}
510 			totalcount += count;
511 		}
512 		return totalcount;
513 	}
514 }
515 
516 
517 typedef struct
518 {
519 	u32 count : 16;   /* number of bytes remaining to be processed */
520 	u32 value : 16;   /* if count > 0: partial wide character */
521 					  /*
522 					  If WCHAR_T_BITS == 16, need 2 bits for count,
523 					  12 bits for value (10 for mbstowcs direction, 12 for wcstombs direction).
524 					  */
525 } gf_utf8_mbstate_t;
526 
527 static gf_utf8_mbstate_t internal;
528 
529 GF_EXPORT
gf_utf8_mbstowcs(unsigned short * dest,size_t len,const char ** srcp)530 size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
531 {
532 	gf_utf8_mbstate_t* ps = &internal;
533 	const char *src = *srcp;
534 
535 	unsigned short* destptr = dest;
536 	for (; len > 0; destptr++, len--) {
537 		const char* backup_src = src;
538 		unsigned char c;
539 		unsigned short wc;
540 		size_t count;
541 		if (ps->count == 0) {
542 			c = (unsigned char)*src;
543 			if (c < 0x80) {
544 				*destptr = (wchar_t)c;
545 				if (c == 0) {
546 					src = NULL;
547 					break;
548 				}
549 				src++;
550 				continue;
551 			}
552 			else if (c < 0xC0) {
553 				/* Spurious 10XXXXXX byte is invalid. */
554 				goto bad_input;
555 			}
556 			if (c < 0xE0) {
557 				wc = (wchar_t)(c & 0x1F) << 6;
558 				count = 1;
559 				if (c < 0xC2) goto bad_input;
560 			}
561 			else if (c < 0xF0) {
562 				wc = (wchar_t)(c & 0x0F) << 12;
563 				count = 2;
564 			}
565 			else goto bad_input;
566 			src++;
567 		}
568 		else {
569 			wc = ps->value << 6;
570 			count = ps->count;
571 		}
572 		for (;;) {
573 			c = (unsigned char)*src++ ^ 0x80;
574 			if (!(c < 0x40)) goto bad_input_backup;
575 			wc |= (unsigned short)c << (6 * --count);
576 			if (count == 0)
577 				break;
578 			/* The following test is only necessary once for every character,
579 			but it would be too complicated to perform it once only, on
580 			the first pass through this loop. */
581 			if ((unsigned short)wc < ((unsigned short)1 << (5 * count + 6)))
582 				goto bad_input_backup;
583 		}
584 		*destptr = wc;
585 		ps->count = 0;
586 		continue;
587 
588 	bad_input_backup:
589 		src = backup_src;
590 		goto bad_input;
591 	}
592 	*srcp = src;
593 	return destptr - dest;
594 
595 bad_input:
596 	*srcp = src;
597 	return (size_t)(-1);
598 }
599 
600 #endif
601 #endif /* GPAC_DISABLE_CORE_TOOLS */