src/gpacmp4/utf.c

/*
*			GPAC - Multimedia Framework C SDK
*
*			Authors: Jean Le Feuvre
*			Copyright (c) Telecom ParisTech 2007-2012
*					All rights reserved
*
*  This file is part of GPAC / common tools sub-project
*
*  GPAC is free software; you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation; either version 2, or (at your option)
*  any later version.
*
*  GPAC is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*
*  You should have received a copy of the GNU Lesser General Public
*  License along with this library; see the file COPYING.  If not, write to
*  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
*/

#ifndef GPAC_DISABLE_CORE_TOOLS

#include <gpac/utf.h>


#if 1


/*
* Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
* This source code is provided as is by Unicode, Inc. No claims are
* made as to fitness for any particular purpose. No warranties of any
* kind are expressed or implied. The recipient agrees to determine
* applicability of information provided. If this file has been
* purchased on magnetic or optical media from Unicode, Inc., the
* sole remedy for any claim will be exchange of defective media
* within 90 days of receipt.
*
* Limitations on Rights to Redistribute This Code
*
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form
* for internal or external distribution as long as this notice
* remains attached.
*/

/* ---------------------------------------------------------------------

Conversions between UTF32, UTF-16, and UTF-8. Source code file.
Author: Mark E. Davis, 1994.
Rev History: Rick McGowan, fixes & updates May 2001.
Sept 2001: fixed const & error conditions per
mods suggested by S. Parent & A. Lillich.
June 2002: Tim Dodd added detection and handling of incomplete
source sequences, enhanced error detection, added casts
to eliminate compiler warnings.
July 2003: slight mods to back out aggressive FFFE detection.
Jan 2004: updated switches in from-UTF8 conversions.
Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

See the header file "ConvertUTF.h" for complete documentation.

------------------------------------------------------------------------ */

typedef u32 UTF32;	/* at least 32 bits */
typedef u16 UTF16;	/* at least 16 bits */
typedef u8 UTF8;	/* typically 8 bits */
typedef u8 Boolean; /* 0 or 1 */

					/* Some fundamental constants */
#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
#define UNI_MAX_BMP (UTF32)0x0000FFFF
#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF

typedef enum {
	conversionOK, 		/* conversion successful */
	sourceExhausted,	/* partial character in source, but hit end */
	targetExhausted,	/* insuff. room in target for conversion */
	sourceIllegal		/* source sequence is illegal/malformed */
} ConversionResult;

typedef enum {
	strictConversion = 0,
	lenientConversion
} ConversionFlags;

static const int halfShift = 10; /* used for shifting by 10 bits */

static const UTF32 halfBase = 0x0010000UL;
static const UTF32 halfMask = 0x3FFUL;

#define UNI_SUR_HIGH_START  (UTF32)0xD800
#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
#define UNI_SUR_LOW_START   (UTF32)0xDC00
#define UNI_SUR_LOW_END     (UTF32)0xDFFF
#define false	   0
#define true	    1

/*
* Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it.
* Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
* left as-is for anyone who may want to do such conversion, which was
* allowed in earlier algorithms.
*/
static const char trailingBytesForUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
* Magic values subtracted from a buffer value during UTF8 conversion.
* This table contains as many values as there might be trailing bytes
* in a UTF-8 sequence.
*/
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/*
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
* into the first byte, depending on how many bytes follow.  There are
* as many entries in this table as there are UTF-8 sequence types.
* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
* for *legal* UTF-8 will be 4 or fewer bytes total.
*/
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

/* --------------------------------------------------------------------- */

/* The interface converts a whole buffer to avoid function-call overhead.
* Constants have been gathered. Loops & conditionals have been removed as
* much as possible for efficiency, in favor of drop-through switches.
* (See "Note A" at the bottom of the file for equivalent code.)
* If your compiler supports it, the "isLegalUTF8" call can be turned
* into an inline function.
*/

/* --------------------------------------------------------------------- */

ConversionResult ConvertUTF16toUTF8(
	const UTF16** sourceStart, const UTF16* sourceEnd,
	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
	ConversionResult result = conversionOK;
	const UTF16* source = *sourceStart;
	UTF8* target = *targetStart;
	while (source < sourceEnd) {
		UTF32 ch;
		unsigned short bytesToWrite = 0;
		const UTF32 byteMask = 0xBF;
		const UTF32 byteMark = 0x80;
		const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
		ch = *source++;
		/* If we have a surrogate pair, convert to UTF32 first. */
		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
			/* If the 16 bits following the high surrogate are in the source buffer... */
			if (source < sourceEnd) {
				UTF32 ch2 = *source;
				/* If it's a low surrogate, convert to UTF32. */
				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
					ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
						+ (ch2 - UNI_SUR_LOW_START) + halfBase;
					++source;
				}
				else if (flags == strictConversion) { /* it's an unpaired high surrogate */
					--source; /* return to the illegal value itself */
					result = sourceIllegal;
					break;
				}
			}
			else { /* We don't have the 16 bits following the high surrogate. */
				--source; /* return to the high surrogate */
				result = sourceExhausted;
				break;
			}
		}
		else if (flags == strictConversion) {
			/* UTF-16 surrogate values are illegal in UTF-32 */
			if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				--source; /* return to the illegal value itself */
				result = sourceIllegal;
				break;
			}
		}
		/* Figure out how many bytes the result will require */
		if (ch < (UTF32)0x80) {
			bytesToWrite = 1;
		}
		else if (ch < (UTF32)0x800) {
			bytesToWrite = 2;
		}
		else if (ch < (UTF32)0x10000) {
			bytesToWrite = 3;
		}
		else if (ch < (UTF32)0x110000) {
			bytesToWrite = 4;
		}
		else {
			bytesToWrite = 3;
			ch = UNI_REPLACEMENT_CHAR;
		}

		target += bytesToWrite;
		if (target > targetEnd) {
			source = oldSource; /* Back up source pointer! */
			target -= bytesToWrite;
			result = targetExhausted;
			break;
		}
		switch (bytesToWrite) { /* note: everything falls through. */
		case 4:
			*--target = (UTF8)((ch | byteMark) & byteMask);
			ch >>= 6;
		case 3:
			*--target = (UTF8)((ch | byteMark) & byteMask);
			ch >>= 6;
		case 2:
			*--target = (UTF8)((ch | byteMark) & byteMask);
			ch >>= 6;
		case 1:
			*--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
		}
		target += bytesToWrite;
	}
	*sourceStart = source;
	*targetStart = target;
	return result;
}

/*
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
* This must be called with the length pre-determined by the first byte.
* If not calling this from ConvertUTF8to*, then the length can be set by:
*  length = trailingBytesForUTF8[*source]+1;
* and the sequence is illegal right away if there aren't that many bytes
* available.
* If presented with a length > 4, this returns false.  The Unicode
* definition of UTF-8 goes up to 4-byte sequences.
*/

static Boolean isLegalUTF8(const UTF8 *source, int length) {
	UTF8 a;
	const UTF8 *srcptr = source + length;
	switch (length) {
	default:
		return false;
		/* Everything else falls through when "true"... */
	case 4:
		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
	case 3:
		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
	case 2:
		if ((a = (*--srcptr)) > 0xBF) return false;

		switch (*source) {
			/* no fall-through in this inner switch */
		case 0xE0:
			if (a < 0xA0) return false;
			break;
		case 0xED:
			if (a > 0x9F) return false;
			break;
		case 0xF0:
			if (a < 0x90) return false;
			break;
		case 0xF4:
			if (a > 0x8F) return false;
			break;
		default:
			if (a < 0x80) return false;
		}

	case 1:
		if (*source >= 0x80 && *source < 0xC2) return false;
	}
	if (*source > 0xF4) return false;
	return true;
}

/* --------------------------------------------------------------------- */

ConversionResult ConvertUTF8toUTF16(
	const UTF8** sourceStart, const UTF8* sourceEnd,
	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
	ConversionResult result = conversionOK;
	const UTF8* source = *sourceStart;
	UTF16* target = *targetStart;
	while (source < sourceEnd) {
		UTF32 ch = 0;
		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
		if (source + extraBytesToRead >= sourceEnd) {
			result = sourceExhausted;
			break;
		}
		/* Do this check whether lenient or strict */
		if (!isLegalUTF8(source, extraBytesToRead + 1)) {
			result = sourceIllegal;
			break;
		}
		/*
		* The cases all fall through. See "Note A" below.
		*/
		switch (extraBytesToRead) {
		case 5:
			ch += *source++;
			ch <<= 6; /* remember, illegal UTF-8 */
		case 4:
			ch += *source++;
			ch <<= 6; /* remember, illegal UTF-8 */
		case 3:
			ch += *source++;
			ch <<= 6;
		case 2:
			ch += *source++;
			ch <<= 6;
		case 1:
			ch += *source++;
			ch <<= 6;
		case 0:
			ch += *source++;
		}
		ch -= offsetsFromUTF8[extraBytesToRead];

		if (target >= targetEnd) {
			source -= (extraBytesToRead + 1); /* Back up source pointer! */
			result = targetExhausted;
			break;
		}
		if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
								 /* UTF-16 surrogate values are illegal in UTF-32 */
			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				if (flags == strictConversion) {
					source -= (extraBytesToRead + 1); /* return to the illegal value itself */
					result = sourceIllegal;
					break;
				}
				else {
					*target++ = UNI_REPLACEMENT_CHAR;
				}
			}
			else {
				*target++ = (UTF16)ch; /* normal case */
			}
		}
		else if (ch > UNI_MAX_UTF16) {
			if (flags == strictConversion) {
				result = sourceIllegal;
				source -= (extraBytesToRead + 1); /* return to the start */
				break; /* Bail out; shouldn't continue */
			}
			else {
				*target++ = UNI_REPLACEMENT_CHAR;
			}
		}
		else {
			/* target is a character in range 0xFFFF - 0x10FFFF. */
			if (target + 1 >= targetEnd) {
				source -= (extraBytesToRead + 1); /* Back up source pointer! */
				result = targetExhausted;
				break;
			}
			ch -= halfBase;
			*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
			*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
		}
	}
	*sourceStart = source;
	*targetStart = target;
	return result;
}


GF_EXPORT
size_t gf_utf8_wcslen(const unsigned short *s)
{
	const unsigned short* ptr;
	for (ptr = s; *ptr != (unsigned short)'\0'; ptr++) {
	}
	return ptr - s;
}

GF_EXPORT
size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
{
	const UTF16** sourceStart = srcp;
	const UTF16* sourceEnd = *srcp + gf_utf8_wcslen(*srcp);
	UTF8* targetStart = (UTF8*)dest;
	UTF8* targetEnd = (UTF8*)dest + len;
	ConversionFlags flags = strictConversion;

	ConversionResult res = ConvertUTF16toUTF8(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
	if (res != conversionOK) return (size_t)-1;
	*targetStart = 0;
	*srcp = NULL;
	return strlen(dest);
}

GF_EXPORT
size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
{
	const UTF8** sourceStart = (const UTF8**)srcp;
	const UTF8* sourceEnd = (const UTF8*)(*srcp + strlen(*srcp));
	UTF16* targetStart = (UTF16*)dest;
	UTF16* targetEnd = (UTF16*)(dest + len);
	ConversionFlags flags = strictConversion;
	ConversionResult res = ConvertUTF8toUTF16(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
	if (res != conversionOK) return (size_t)-1;
	*targetStart = 0;
	*srcp = NULL;
	return gf_utf8_wcslen(dest);
}


#else

GF_EXPORT
size_t gf_utf8_wcslen(const unsigned short *s)
{
	const unsigned short* ptr;
	for (ptr = s; *ptr != (unsigned short)'\0'; ptr++) {
	}
	return ptr - s;
}

GF_EXPORT
size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
{
	/*
	* Original code from the GNU UTF-8 Library
	*/
	size_t count;
	const unsigned short * src = *srcp;

	if (dest != NULL) {
		char* destptr = dest;
		for (;; src++) {
			unsigned char c;
			unsigned short wc = *src;
			if (wc < 0x80) {
				if (wc == (wchar_t)'\0') {
					if (len == 0) {
						*srcp = src;
						break;
					}
					*destptr = '\0';
					*srcp = NULL;
					break;
				}
				count = 0;
				c = (unsigned char)wc;
			}
			else if (wc < 0x800) {
				count = 1;
				c = (unsigned char)((wc >> 6) | 0xC0);
			}
			else {
				count = 2;
				c = (unsigned char)((wc >> 12) | 0xE0);
			}
			if (len <= count) {
				*srcp = src;
				break;
			}
			len -= count + 1;
			*destptr++ = c;
			if (count > 0)
				do {
					*destptr++ = (unsigned char)(((wc >> (6 * --count)) & 0x3F) | 0x80);
				} while (count > 0);
		}
		return destptr - dest;
	}
	else {
		/* Ignore dest and len. */
		size_t totalcount = 0;
		for (;; src++) {
			unsigned short wc = *src;
			size_t count;
			if (wc < 0x80) {
				if (wc == (wchar_t)'\0') {
					*srcp = NULL;
					break;
				}
				count = 1;
			}
			else if (wc < 0x800) {
				count = 2;
			}
			else {
				count = 3;
			}
			totalcount += count;
		}
		return totalcount;
	}
}


typedef struct
{
	u32 count : 16;   /* number of bytes remaining to be processed */
	u32 value : 16;   /* if count > 0: partial wide character */
					  /*
					  If WCHAR_T_BITS == 16, need 2 bits for count,
					  12 bits for value (10 for mbstowcs direction, 12 for wcstombs direction).
					  */
} gf_utf8_mbstate_t;

static gf_utf8_mbstate_t internal;

GF_EXPORT
size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
{
	gf_utf8_mbstate_t* ps = &internal;
	const char *src = *srcp;

	unsigned short* destptr = dest;
	for (; len > 0; destptr++, len--) {
		const char* backup_src = src;
		unsigned char c;
		unsigned short wc;
		size_t count;
		if (ps->count == 0) {
			c = (unsigned char)*src;
			if (c < 0x80) {
				*destptr = (wchar_t)c;
				if (c == 0) {
					src = NULL;
					break;
				}
				src++;
				continue;
			}
			else if (c < 0xC0) {
				/* Spurious 10XXXXXX byte is invalid. */
				goto bad_input;
			}
			if (c < 0xE0) {
				wc = (wchar_t)(c & 0x1F) << 6;
				count = 1;
				if (c < 0xC2) goto bad_input;
			}
			else if (c < 0xF0) {
				wc = (wchar_t)(c & 0x0F) << 12;
				count = 2;
			}
			else goto bad_input;
			src++;
		}
		else {
			wc = ps->value << 6;
			count = ps->count;
		}
		for (;;) {
			c = (unsigned char)*src++ ^ 0x80;
			if (!(c < 0x40)) goto bad_input_backup;
			wc |= (unsigned short)c << (6 * --count);
			if (count == 0)
				break;
			/* The following test is only necessary once for every character,
			but it would be too complicated to perform it once only, on
			the first pass through this loop. */
			if ((unsigned short)wc < ((unsigned short)1 << (5 * count + 6)))
				goto bad_input_backup;
		}
		*destptr = wc;
		ps->count = 0;
		continue;

	bad_input_backup:
		src = backup_src;
		goto bad_input;
	}
	*srcp = src;
	return destptr - dest;

bad_input:
	*srcp = src;
	return (size_t)(-1);
}

#endif
#endif /* GPAC_DISABLE_CORE_TOOLS */