1 /*
2  *			GPAC - Multimedia Framework C SDK
3  *
4  *			Authors: Jean Le Feuvre
5  *			Copyright (c) Telecom ParisTech 2007-2012
6  *					All rights reserved
7  *
8  *  This file is part of GPAC / common tools sub-project
9  *
10  *  GPAC is free software; you can redistribute it and/or modify
11  *  it under the terms of the GNU Lesser General Public License as published by
12  *  the Free Software Foundation; either version 2, or (at your option)
13  *  any later version.
14  *
15  *  GPAC is distributed in the hope that it will be useful,
16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *  GNU Lesser General Public License for more details.
19  *
20  *  You should have received a copy of the GNU Lesser General Public
21  *  License along with this library; see the file COPYING.  If not, write to
22  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23  *
24  */
25 
26 #ifndef GPAC_DISABLE_CORE_TOOLS
27 
28 #include <gpac/utf.h>
29 
30 
31 #if 1
32 
33 
34 /*
35  * Copyright 2001-2004 Unicode, Inc.
36  *
37  * Disclaimer
38  *
39  * This source code is provided as is by Unicode, Inc. No claims are
40  * made as to fitness for any particular purpose. No warranties of any
41  * kind are expressed or implied. The recipient agrees to determine
42  * applicability of information provided. If this file has been
43  * purchased on magnetic or optical media from Unicode, Inc., the
44  * sole remedy for any claim will be exchange of defective media
45  * within 90 days of receipt.
46  *
47  * Limitations on Rights to Redistribute This Code
48  *
49  * Unicode, Inc. hereby grants the right to freely use the information
50  * supplied in this file in the creation of products supporting the
51  * Unicode Standard, and to make copies of this file in any form
52  * for internal or external distribution as long as this notice
53  * remains attached.
54  */
55 
56 /* ---------------------------------------------------------------------
57 
58     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
59     Author: Mark E. Davis, 1994.
60     Rev History: Rick McGowan, fixes & updates May 2001.
61     Sept 2001: fixed const & error conditions per
62 	mods suggested by S. Parent & A. Lillich.
63     June 2002: Tim Dodd added detection and handling of incomplete
64 	source sequences, enhanced error detection, added casts
65 	to eliminate compiler warnings.
66     July 2003: slight mods to back out aggressive FFFE detection.
67     Jan 2004: updated switches in from-UTF8 conversions.
68     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
69 
70     See the header file "ConvertUTF.h" for complete documentation.
71 
72 ------------------------------------------------------------------------ */
73 
74 typedef u32 UTF32;	/* at least 32 bits */
75 typedef u16 UTF16;	/* at least 16 bits */
76 typedef u8 UTF8;	/* typically 8 bits */
77 typedef u8 Boolean; /* 0 or 1 */
78 
79 /* Some fundamental constants */
80 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
81 #define UNI_MAX_BMP (UTF32)0x0000FFFF
82 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
83 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
84 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
85 
/* Outcome reported by the ConvertUTF* routines. */
typedef enum {
	conversionOK = 0,	/* the whole source range was converted */
	sourceExhausted,	/* the source ends in the middle of a character */
	targetExhausted,	/* the target has no room left for the next character */
	sourceIllegal		/* the source holds an illegal/malformed sequence */
} ConversionResult;

/* Strictness requested by the caller of the ConvertUTF* routines. */
typedef enum {
	strictConversion = 0,
	lenientConversion
} ConversionFlags;
97 
98 static const int halfShift  = 10; /* used for shifting by 10 bits */
99 
100 static const UTF32 halfBase = 0x0010000UL;
101 static const UTF32 halfMask = 0x3FFUL;
102 
103 #define UNI_SUR_HIGH_START  (UTF32)0xD800
104 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
105 #define UNI_SUR_LOW_START   (UTF32)0xDC00
106 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
107 #define false	   0
108 #define true	    1
109 
/*
 * Indexed by the first byte of a UTF-8 sequence, gives the number of
 * trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 never uses 4 or 5 trailing bytes. Those entries
 * are left as-is for anyone who may want to do such a conversion, which was
 * allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00 - 0x7F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x80 - 0xBF */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xC0 - 0xDF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xE0 - 0xEF */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	/* 0xF0 - 0xF7, 0xF8 - 0xFB, 0xFC - 0xFF */
	3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
127 
128 /*
129  * Magic values subtracted from a buffer value during UTF8 conversion.
130  * This table contains as many values as there might be trailing bytes
131  * in a UTF-8 sequence.
132  */
133 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
134                                           0x03C82080UL, 0xFA082080UL, 0x82082080UL
135                                         };
136 
137 /*
138  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
139  * into the first byte, depending on how many bytes follow.  There are
140  * as many entries in this table as there are UTF-8 sequence types.
141  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
142  * for *legal* UTF-8 will be 4 or fewer bytes total.
143  */
144 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
145 
146 /* --------------------------------------------------------------------- */
147 
148 /* The interface converts a whole buffer to avoid function-call overhead.
149  * Constants have been gathered. Loops & conditionals have been removed as
150  * much as possible for efficiency, in favor of drop-through switches.
 * (See "Note A" in the original Unicode, Inc. conversion source for equivalent
 * code; that note is not reproduced in this file.)
152  * If your compiler supports it, the "isLegalUTF8" call can be turned
153  * into an inline function.
154  */
155 
156 /* --------------------------------------------------------------------- */
157 
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)158 ConversionResult ConvertUTF16toUTF8 (
159     const UTF16** sourceStart, const UTF16* sourceEnd,
160     UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
161 	ConversionResult result = conversionOK;
162 	const UTF16* source = *sourceStart;
163 	UTF8* target = *targetStart;
164 	while (source < sourceEnd) {
165 		UTF32 ch;
166 		unsigned short bytesToWrite = 0;
167 		const UTF32 byteMask = 0xBF;
168 		const UTF32 byteMark = 0x80;
169 		const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
170 		ch = *source++;
171 		/* If we have a surrogate pair, convert to UTF32 first. */
172 		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
173 			/* If the 16 bits following the high surrogate are in the source buffer... */
174 			if (source < sourceEnd) {
175 				UTF32 ch2 = *source;
176 				/* If it's a low surrogate, convert to UTF32. */
177 				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
178 					ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
179 					     + (ch2 - UNI_SUR_LOW_START) + halfBase;
180 					++source;
181 				} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
182 					--source; /* return to the illegal value itself */
183 					result = sourceIllegal;
184 					break;
185 				}
186 			} else { /* We don't have the 16 bits following the high surrogate. */
187 				--source; /* return to the high surrogate */
188 				result = sourceExhausted;
189 				break;
190 			}
191 		} else if (flags == strictConversion) {
192 			/* UTF-16 surrogate values are illegal in UTF-32 */
193 			if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
194 				--source; /* return to the illegal value itself */
195 				result = sourceIllegal;
196 				break;
197 			}
198 		}
199 		/* Figure out how many bytes the result will require */
200 		if (ch < (UTF32)0x80) {
201 			bytesToWrite = 1;
202 		} else if (ch < (UTF32)0x800) {
203 			bytesToWrite = 2;
204 		} else if (ch < (UTF32)0x10000) {
205 			bytesToWrite = 3;
206 		} else if (ch < (UTF32)0x110000) {
207 			bytesToWrite = 4;
208 		} else {
209 			bytesToWrite = 3;
210 			ch = UNI_REPLACEMENT_CHAR;
211 		}
212 
213 		target += bytesToWrite;
214 		if (target > targetEnd) {
215 			source = oldSource; /* Back up source pointer! */
216 			target -= bytesToWrite;
217 			result = targetExhausted;
218 			break;
219 		}
220 		switch (bytesToWrite) { /* note: everything falls through. */
221 		case 4:
222 			*--target = (UTF8)((ch | byteMark) & byteMask);
223 			ch >>= 6;
224 		case 3:
225 			*--target = (UTF8)((ch | byteMark) & byteMask);
226 			ch >>= 6;
227 		case 2:
228 			*--target = (UTF8)((ch | byteMark) & byteMask);
229 			ch >>= 6;
230 		case 1:
231 			*--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
232 		}
233 		target += bytesToWrite;
234 	}
235 	*sourceStart = source;
236 	*targetStart = target;
237 	return result;
238 }
239 
240 /*
241  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
242  * This must be called with the length pre-determined by the first byte.
243  * If not calling this from ConvertUTF8to*, then the length can be set by:
244  *  length = trailingBytesForUTF8[*source]+1;
245  * and the sequence is illegal right away if there aren't that many bytes
246  * available.
247  * If presented with a length > 4, this returns false.  The Unicode
248  * definition of UTF-8 goes up to 4-byte sequences.
249  */
250 
isLegalUTF8(const UTF8 * source,int length)251 static Boolean isLegalUTF8(const UTF8 *source, int length) {
252 	UTF8 a;
253 	const UTF8 *srcptr = source+length;
254 	switch (length) {
255 	default:
256 		return false;
257 	/* Everything else falls through when "true"... */
258 	case 4:
259 		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
260 	case 3:
261 		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
262 	case 2:
263 		if ((a = (*--srcptr)) > 0xBF) return false;
264 
265 		switch (*source) {
266 		/* no fall-through in this inner switch */
267 		case 0xE0:
268 			if (a < 0xA0) return false;
269 			break;
270 		case 0xED:
271 			if (a > 0x9F) return false;
272 			break;
273 		case 0xF0:
274 			if (a < 0x90) return false;
275 			break;
276 		case 0xF4:
277 			if (a > 0x8F) return false;
278 			break;
279 		default:
280 			if (a < 0x80) return false;
281 		}
282 
283 	case 1:
284 		if (*source >= 0x80 && *source < 0xC2) return false;
285 	}
286 	if (*source > 0xF4) return false;
287 	return true;
288 }
289 
290 /* --------------------------------------------------------------------- */
291 
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)292 ConversionResult ConvertUTF8toUTF16 (
293     const UTF8** sourceStart, const UTF8* sourceEnd,
294     UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
295 	ConversionResult result = conversionOK;
296 	const UTF8* source = *sourceStart;
297 	UTF16* target = *targetStart;
298 	while (source < sourceEnd) {
299 		UTF32 ch = 0;
300 		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
301 		if (source + extraBytesToRead >= sourceEnd) {
302 			result = sourceExhausted;
303 			break;
304 		}
305 		/* Do this check whether lenient or strict */
306 		if (! isLegalUTF8(source, extraBytesToRead+1)) {
307 			result = sourceIllegal;
308 			break;
309 		}
310 		/*
311 		 * The cases all fall through. See "Note A" below.
312 		 */
313 		switch (extraBytesToRead) {
314 		case 5:
315 			ch += *source++;
316 			ch <<= 6; /* remember, illegal UTF-8 */
317 		case 4:
318 			ch += *source++;
319 			ch <<= 6; /* remember, illegal UTF-8 */
320 		case 3:
321 			ch += *source++;
322 			ch <<= 6;
323 		case 2:
324 			ch += *source++;
325 			ch <<= 6;
326 		case 1:
327 			ch += *source++;
328 			ch <<= 6;
329 		case 0:
330 			ch += *source++;
331 		}
332 		ch -= offsetsFromUTF8[extraBytesToRead];
333 
334 		if (target >= targetEnd) {
335 			source -= (extraBytesToRead+1); /* Back up source pointer! */
336 			result = targetExhausted;
337 			break;
338 		}
339 		if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
340 			/* UTF-16 surrogate values are illegal in UTF-32 */
341 			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
342 				if (flags == strictConversion) {
343 					source -= (extraBytesToRead+1); /* return to the illegal value itself */
344 					result = sourceIllegal;
345 					break;
346 				} else {
347 					*target++ = UNI_REPLACEMENT_CHAR;
348 				}
349 			} else {
350 				*target++ = (UTF16)ch; /* normal case */
351 			}
352 		} else if (ch > UNI_MAX_UTF16) {
353 			if (flags == strictConversion) {
354 				result = sourceIllegal;
355 				source -= (extraBytesToRead+1); /* return to the start */
356 				break; /* Bail out; shouldn't continue */
357 			} else {
358 				*target++ = UNI_REPLACEMENT_CHAR;
359 			}
360 		} else {
361 			/* target is a character in range 0xFFFF - 0x10FFFF. */
362 			if (target + 1 >= targetEnd) {
363 				source -= (extraBytesToRead+1); /* Back up source pointer! */
364 				result = targetExhausted;
365 				break;
366 			}
367 			ch -= halfBase;
368 			*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
369 			*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
370 		}
371 	}
372 	*sourceStart = source;
373 	*targetStart = target;
374 	return result;
375 }
376 
377 
378 
379 GF_EXPORT
/*
 * Returns the number of 16-bit units that precede the first zero unit of s.
 * s must be zero-terminated.
 */
size_t gf_utf8_wcslen (const unsigned short *s)
{
	size_t nb_units = 0;

	while (s[nb_units] != 0)
		nb_units++;
	return nb_units;
}
387 
388 GF_EXPORT
gf_utf8_wcstombs(char * dest,size_t len,const unsigned short ** srcp)389 size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
390 {
391 	if (!srcp || !*srcp)
392 		return 0;
393 	else {
394 		const UTF16** sourceStart = srcp;
395 		const UTF16* sourceEnd = *srcp + gf_utf8_wcslen(*srcp);
396 		UTF8* targetStart = (UTF8*) dest;
397 		UTF8* targetEnd = (UTF8*) dest + len;
398 		ConversionFlags flags = strictConversion;
399 
400 		ConversionResult res = ConvertUTF16toUTF8(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
401 		if (res != conversionOK) return (size_t)-1;
402 		*targetStart = 0;
403 		*srcp=NULL;
404 		return strlen(dest);
405 	}
406 }
407 
408 GF_EXPORT
gf_utf8_mbstowcs(unsigned short * dest,size_t len,const char ** srcp)409 size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
410 {
411 	if (!srcp || !*srcp)
412 		return 0;
413 	else {
414 		const UTF8** sourceStart = (const UTF8**) srcp;
415 		const UTF8* sourceEnd = (const UTF8*) ( *srcp + strlen( *srcp) );
416 		UTF16* targetStart = (UTF16* ) dest;
417 		UTF16* targetEnd = (UTF16* ) (dest + len);
418 		ConversionFlags flags = strictConversion;
419 		ConversionResult res = ConvertUTF8toUTF16(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
420 		if (res != conversionOK) return (size_t)-1;
421 		*targetStart = 0;
422 		*srcp=NULL;
423 		return gf_utf8_wcslen(dest);
424 	}
425 }
426 
427 
428 #else
429 
430 GF_EXPORT
/*
 * Returns the number of 16-bit units that precede the first zero unit of s.
 * s must be zero-terminated.
 */
size_t gf_utf8_wcslen (const unsigned short *s)
{
	const unsigned short *end = s;

	while (*end != 0)
		end++;
	return end - s;
}
438 
439 GF_EXPORT
/*
 * Original code from the GNU UTF-8 Library.
 *
 * Converts the zero-terminated string of 16-bit units *srcp to UTF-8. Units
 * below 0x80 give one byte, units below 0x800 two bytes, all others three
 * bytes; surrogates get no special treatment.
 *
 * dest != NULL: at most len bytes are written. When the terminator is
 *   reached and there is room for it, a zero byte is stored and *srcp is set
 *   to NULL; otherwise *srcp is set to the first unit that was not
 *   converted. Returns the number of bytes written, terminator excluded.
 * dest == NULL: len is ignored, nothing is written, *srcp is set to NULL and
 *   the number of bytes the whole string requires is returned.
 *
 * Returns 0 when srcp or *srcp is NULL, like the primary implementation of
 * this function does.
 */
size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
{
	const unsigned short *src;
	char *destptr;
	size_t totalcount;

	if (!srcp || !*srcp)
		return 0;
	src = *srcp;

	if (dest == NULL) {
		/* Counting mode: ignore dest and len. */
		totalcount = 0;
		for (; *src != 0; src++) {
			unsigned short wc = *src;
			totalcount += (wc < 0x80) ? 1 : ((wc < 0x800) ? 2 : 3);
		}
		*srcp = NULL;
		return totalcount;
	}

	destptr = dest;
	for (;; src++) {
		unsigned short wc = *src;
		unsigned char c;	/* first byte of the sequence */
		size_t count;		/* number of bytes following the first one */

		if (wc == 0) {
			if (len == 0) {
				/* no room left for the terminator */
				*srcp = src;
			} else {
				*destptr = '\0';
				*srcp = NULL;
			}
			break;
		}
		if (wc < 0x80) {
			count = 0;
			c = (unsigned char) wc;
		} else if (wc < 0x800) {
			count = 1;
			c = (unsigned char) ((wc >> 6) | 0xC0);
		} else {
			count = 2;
			c = (unsigned char) ((wc >> 12) | 0xE0);
		}
		/* the sequence needs count+1 bytes: never write a partial one */
		if (len <= count) {
			*srcp = src;
			break;
		}
		len -= count+1;
		*destptr++ = c;
		/* remaining bytes carry 6 bits each, most significant group first */
		while (count > 0)
			*destptr++ = (unsigned char)(((wc >> (6 * --count)) & 0x3F) | 0x80);
	}
	return destptr - dest;
}
506 
507 
508 typedef struct
509 {
510 	u32 count : 16;   /* number of bytes remaining to be processed */
511 	u32 value : 16;   /* if count > 0: partial wide character */
512 	/*
513 	   If WCHAR_T_BITS == 16, need 2 bits for count,
514 	   12 bits for value (10 for mbstowcs direction, 12 for wcstombs direction).
515 	*/
516 } gf_utf8_mbstate_t;
517 
518 static gf_utf8_mbstate_t internal;
519 
520 GF_EXPORT
gf_utf8_mbstowcs(unsigned short * dest,size_t len,const char ** srcp)521 size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
522 {
523 	gf_utf8_mbstate_t* ps = &internal;
524 	const char *src = *srcp;
525 
526 	unsigned short* destptr = dest;
527 	for (; len > 0; destptr++, len--) {
528 		const char* backup_src = src;
529 		unsigned char c;
530 		unsigned short wc;
531 		size_t count;
532 		if (ps->count == 0) {
533 			c = (unsigned char) *src;
534 			if (c < 0x80) {
535 				*destptr = (wchar_t) c;
536 				if (c == 0) {
537 					src = NULL;
538 					break;
539 				}
540 				src++;
541 				continue;
542 			} else if (c < 0xC0) {
543 				/* Spurious 10XXXXXX byte is invalid. */
544 				goto bad_input;
545 			}
546 			if (c < 0xE0) {
547 				wc = (wchar_t)(c & 0x1F) << 6;
548 				count = 1;
549 				if (c < 0xC2) goto bad_input;
550 			} else if (c < 0xF0) {
551 				wc = (wchar_t)(c & 0x0F) << 12;
552 				count = 2;
553 			}
554 			else goto bad_input;
555 			src++;
556 		} else {
557 			wc = ps->value << 6;
558 			count = ps->count;
559 		}
560 		for (;;) {
561 			c = (unsigned char) *src++ ^ 0x80;
562 			if (!(c < 0x40)) goto bad_input_backup;
563 			wc |= (unsigned short) c << (6 * --count);
564 			if (count == 0)
565 				break;
566 			/* The following test is only necessary once for every character,
567 			but it would be too complicated to perform it once only, on
568 			the first pass through this loop. */
569 			if ((unsigned short) wc < ((unsigned short) 1 << (5 * count + 6)))
570 				goto bad_input_backup;
571 		}
572 		*destptr = wc;
573 		ps->count = 0;
574 		continue;
575 
576 bad_input_backup:
577 		src = backup_src;
578 		goto bad_input;
579 	}
580 	*srcp = src;
581 	return destptr-dest;
582 
583 bad_input:
584 	*srcp = src;
585 	return (size_t)(-1);
586 }
587 
588 
589 #endif
590 
591 
592 GF_EXPORT
gf_utf_get_utf8_string_from_bom(u8 * data,u32 size,char ** out_ptr)593 char *gf_utf_get_utf8_string_from_bom(u8 *data, u32 size, char **out_ptr)
594 {
595 	u32 unicode_type = 0;
596 	*out_ptr = NULL;
597 
598 	if (size>=5) {
599 		/*0: no unicode, 1: UTF-16BE, 2: UTF-16LE*/
600 		if ((data[0]==0xFF) && (data[1]==0xFE)) {
601 			if (!data[2] && !data[3]) {
602 				return NULL;
603 			} else {
604 				unicode_type = 2;
605 			}
606 		} else if ((data[0]==0xFE) && (data[1]==0xFF)) {
607 			if (!data[2] && !data[3]) {
608 				return NULL;
609 			} else {
610 				unicode_type = 1;
611 			}
612 		} else if ((data[0]==0xEF) && (data[1]==0xBB) && (data[2]==0xBF)) {
613 			return data+4;
614 		}
615 	}
616 
617 	if (!unicode_type) return data;
618 
619 	if (size%2) size--;
620 	u16 *str_wc = gf_malloc(size+2);
621 	u16 *srcwc;
622 	char *dst = gf_malloc(size+2);
623 	*out_ptr = dst;
624 	u32 i;
625 	for (i=0; i<size; i+=2) {
626 		u16 wchar=0;
627 		u8 c1 = data[i];
628 		u8 c2 = data[i+1];
629 
630 		/*Little-endian order*/
631 		if (unicode_type==2) {
632 			if (c2) {
633 				wchar = c2;
634 				wchar <<=8;
635 				wchar |= c1;
636 			}
637 			else wchar = c1;
638 		} else {
639 			wchar = c1;
640 			if (c2) {
641 				wchar <<= 8;
642 				wchar |= c2;
643 			}
644 		}
645 		str_wc[i/2] = wchar;
646 	}
647 	str_wc[i/2] = 0;
648 	srcwc = str_wc;
649 	gf_utf8_wcstombs(dst, size, (const unsigned short **) &srcwc);
650 	gf_free(str_wc);
651 
652 	return dst;
653 }
654 
655 
656 #if defined(WIN32)
657 
658 GF_EXPORT
gf_utf8_to_wcs(const char * str)659 wchar_t* gf_utf8_to_wcs(const char* str)
660 {
661 	size_t source_len;
662 	wchar_t* result;
663 	if (str == 0) return 0;
664 	source_len = strlen(str);
665 	result = gf_calloc(source_len + 1, sizeof(wchar_t));
666 	if (!result)
667 		return 0;
668 	if (gf_utf8_mbstowcs(result, source_len, &str) == (size_t)-1) {
669 		gf_free(result);
670 		return 0;
671 	}
672 	return result;
673 }
674 
675 GF_EXPORT
gf_wcs_to_utf8(const wchar_t * str)676 char* gf_wcs_to_utf8(const wchar_t* str)
677 {
678 	size_t source_len;
679 	char* result;
680 	if (str == 0) return 0;
681 	source_len = wcslen(str);
682 	result = gf_calloc(source_len + 1, UTF8_MAX_BYTES_PER_CHAR);
683 	if (!result)
684 		return 0;
685 	if (gf_utf8_wcstombs(result, source_len * UTF8_MAX_BYTES_PER_CHAR, &str) < 0) {
686 		gf_free(result);
687 		return 0;
688 	}
689 	return result;
690 }
691 #endif
692 
693 #endif /* GPAC_DISABLE_CORE_TOOLS */
694 
695 
696