libs/strings/unicode.c

/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "port.h"

#define UNICODE_INTERNAL
#include "libs/unicode.h"

#include <ctype.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include "libs/log.h"
#include "libs/misc.h"


// Resynchronise after a decoding error: advance *ptr past every
// continuation byte (bit pattern 10xxxxxx) it currently points at.
// *ptr is left on the first byte that is not a continuation byte; this
// may be the '\0' terminator, so the scan never runs off a terminated
// string.
static inline void
resyncUTF8(const unsigned char **ptr) {
	const unsigned char *cursor = *ptr;

	while ((*cursor & 0xc0) == 0x80)
		cursor++;

	*ptr = cursor;
}

// Get one character from a UTF-8 encoded string.
// *ptr will point to the start of the next character.
// Returns 0 if the encoding is bad. This can be distinguished from the
// '\0' character by checking whether **ptr == '\0' before calling this
// function.
// A bad encoding is one of:
// - a byte that cannot start a sequence (a stray continuation byte
//   10xxxxxx, or a byte in the range 0xf8..0xff),
// - a sequence that is cut short by a byte which is not a continuation
//   byte,
// - an overlong form (a value stored in more bytes than it needs).
// A warning is logged for each of them, and *ptr is moved past the
// offending bytes, so every call consumes at least one byte.
// NB: surrogates (0xd800..0xdfff) and 4-byte values above 0x0010ffff are
// not rejected here.
UniChar
getCharFromString(const unsigned char **ptr) {
	const unsigned char lead = **ptr;
	UniChar result;
	UniChar minValue;
			// Smallest value that needs this many bytes; anything
			// below it is an overlong encoding.
	int numFollow;
			// Number of continuation bytes that must follow.

	if (lead < 0x80) {
		// 0xxxxxxx, regular ASCII
		(*ptr)++;
		return lead;
	}

	if ((lead & 0xe0) == 0xc0) {
		// 110xxxxx; 10xxxxxx must follow
		// Value between 0x00000080 and 0x000007ff (inclusive)
		result = lead & 0x1f;
		numFollow = 1;
		minValue = 0x00000080;
	} else if ((lead & 0xf0) == 0xe0) {
		// 1110xxxx; 10xxxxxx 10xxxxxx must follow
		// Value between 0x00000800 and 0x0000ffff (inclusive)
		result = lead & 0x0f;
		numFollow = 2;
		minValue = 0x00000800;
	} else if ((lead & 0xf8) == 0xf0) {
		// 11110xxx; 10xxxxxx 10xxxxxx 10xxxxxx must follow
		result = lead & 0x07;
		numFollow = 3;
		minValue = 0x00010000;
	} else {
		// Not a valid first byte. It has to be consumed here:
		// resyncUTF8() only skips continuation bytes, so for a byte in
		// 0xf8..0xff *ptr would otherwise not move at all, and a caller
		// that keeps decoding would never get past it.
		(*ptr)++;
		goto err;
	}
	(*ptr)++;

	while (numFollow-- > 0) {
		if ((**ptr & 0xc0) != 0x80)
			goto err;
		result = (result << 6) | ((**ptr) & 0x3f);
		(*ptr)++;
	}

	if (result < minValue) {
		// invalid (overlong) encoding - must reject
		goto err;
	}
	return result;

err:
	log_add(log_Warning, "Warning: Invalid UTF8 sequence.");

	// Resynchronise (skip every byte of the form 10xxxxxx):
	resyncUTF8(ptr);

	return 0;
}

// Get one character from a UTF-8 encoded string whose end is given by
// 'end' (exclusive) rather than by a '\0' terminator.
// *ptr will point to the start of the next character, and never beyond
// 'end'.
// Returns 0 if the encoding is bad (see getCharFromString()), and also
// when no complete character is available: *ptr has reached 'end', the
// first byte cannot start a sequence, or the sequence it announces would
// extend beyond 'end'. In these last three cases *ptr is set to 'end'.
UniChar
getCharFromStringN(const unsigned char **ptr, const unsigned char *end) {
	size_t numBytes;
	UniChar result;

	if (*ptr >= end)
		goto err;

	if (**ptr < 0x80) {
		numBytes = 1;
	} else if ((**ptr & 0xe0) == 0xc0) {
		numBytes = 2;
	} else if ((**ptr & 0xf0) == 0xe0) {
		numBytes = 3;
	} else if ((**ptr & 0xf8) == 0xf0) {
		numBytes = 4;
	} else
		goto err;

	// Compare lengths, not pointers: '*ptr + numBytes' could lie more
	// than one past the end of the buffer, and forming such a pointer is
	// undefined.
	if ((size_t) (end - *ptr) < numBytes)
		goto err;

	result = getCharFromString(ptr);

	// After a bad sequence, getCharFromString() skips the continuation
	// bytes that follow it, which can carry *ptr beyond 'end'.
	if (*ptr > end)
		*ptr = end;

	return result;

err:
	*ptr = end;
	return 0;
}

// Get one line from a string.
// A line is terminated with either CRLF (DOS/Windows),
// LF (Unix, MacOS X), or CR (old MacOS). LFCR is accepted as a single
// terminator as well.
// On success 'start' is returned, *end points to the first byte of the
// line terminator (or to the '\0' if the string ended first), and
// *startNext points to the start of the next line.
// The end of the string is reached when **startNext == '\0'.
// NULL is returned if the string is not valid UTF8. In this case
// *end points to the first invalid character (or the character before if
// it was a LF), and *startNext to the start of the next (possibly invalid
// too) character.
unsigned char *
getLineFromString(const unsigned char *start, const unsigned char **end,
		const unsigned char **startNext) {
	const unsigned char *ptr = start;
	const unsigned char *lastPtr;
	const unsigned char *afterLF;
	UniChar ch;

	// Search for the first newline.
	for (;;) {
		if (*ptr == '\0') {
			*end = ptr;
			*startNext = ptr;
			return (unsigned char *) unconst(start);
		}
		lastPtr = ptr;
		ch = getCharFromString(&ptr);
		if (ch == '\0') {
			// Bad string
			*end = lastPtr;
			*startNext = ptr;
			return NULL;
		}
		if (ch == '\n') {
			*end = lastPtr;
			if (*ptr == '\0'){
				// LF at the end of the string.
				*startNext = ptr;
				return (unsigned char *) unconst(start);
			}
			// Decode one more character to recognise LFCR. Remember
			// where it starts: that is where the next line begins if
			// it turns out not to be a CR.
			afterLF = ptr;
			ch = getCharFromString(&ptr);
			if (ch == '\0') {
				// Bad string
				*startNext = ptr;
				return NULL;
			}
			if (ch == '\r') {
				// LFCR
				*startNext = ptr;
			} else {
				// LF
				*startNext = afterLF;
			}
			return (unsigned char *) unconst(start);
		} else if (ch == '\r') {
			*end = lastPtr;
			if (*ptr == '\n') {
				// CRLF; the LF is part of the same terminator.
				ptr++;
			}
			*startNext = ptr;
			return (unsigned char *) unconst(start);
		} // else: a normal character
	}
}

// Count the characters in a '\0'-terminated UTF-8 string.
// Counting stops as soon as getCharFromString() returns 0, which is at
// the terminator or at the first badly encoded sequence; that character
// is not included in the count.
size_t
utf8StringCount(const unsigned char *start) {
	size_t numChars = 0;

	while (getCharFromString(&start) != '\0')
		numChars++;

	return numChars;
}

// Count the characters in the UTF-8 data between 'start' and 'end'
// ('end' itself is not part of the data).
// Counting stops as soon as getCharFromStringN() returns 0; the character
// for which it did so is not included in the count.
size_t
utf8StringCountN(const unsigned char *start, const unsigned char *end) {
	size_t numChars = 0;

	while (getCharFromStringN(&start, end) != '\0')
		numChars++;

	return numChars;
}

// Locates a unicode character (ch) in a UTF-8 string (pStr)
// returns the position of its first occurrence when found, counted in
//  characters (not bytes) from the start of the string
//  -1 when not found
// Searching for '\0' yields the position of the string terminator.
// A badly encoded sequence never matches, but each one reported by
// getCharFromString() does take up one position.
int
utf8StringPos (const unsigned char *pStr, UniChar ch)
{
	int pos;

	for (pos = 0; *pStr != '\0'; ++pos)
	{
		// We are not at the terminator here, so a result of 0 can only
		// mean a bad encoding. It must not be taken for a match when
		// the caller is looking for '\0'.
		if (getCharFromString (&pStr) == ch && ch != '\0')
			return pos;
	}

	// The loop ended on the terminator.
	if (ch == '\0')
		return pos;

	return -1;
}

// Safe version of strcpy(), somewhat analogous to strncpy()
// except it guarantees a 0-term when size > 0
// when size == 0, returns NULL
// If src does not fit it is truncated on a character boundary: a
// multi-byte character of which only a part would fit is left out
// entirely (src is taken to be valid UTF-8).
// As with strncpy(), the unused remainder of dst is filled with '\0'.
unsigned char *
utf8StringCopy (unsigned char *dst, size_t size, const unsigned char *src)
{
	unsigned char *last;
	unsigned char *tail;
	unsigned char firstDropped;

	if (size == 0)
		return NULL;

	strncpy ((char *) dst, (const char *) src, size);

	last = dst + size - 1;
	// If src was too long, *last now holds the first byte that has to be
	// dropped. Otherwise strncpy() has already set it to '\0'.
	firstDropped = *last;
	*last = '\0';

	if ((firstDropped & 0xc0) == 0x80)
	{	// The dropped byte is a continuation byte (10xxxxxx), so the cut
		// fell inside a multi-byte character. Remove the part of that
		// character which did end up in the buffer: its continuation
		// bytes, and then its lead byte (11xxxxxx).
		tail = last;
		while (tail > dst && (tail[-1] & 0xc0) == 0x80)
			--tail;
		if (tail > dst && tail[-1] >= 0xc0)
			--tail;
		memset (tail, '\0', (size_t) (last - tail));
	}

	return dst;
}

// Compare two '\0'-terminated UTF-8 strings.
// Returns a value less than, equal to, or greater than 0, as strcmp()
// does.
// TODO: this is not implemented with respect to collating order
// The strings are compared byte by byte. For well-formed UTF-8 this
// orders them by code point value, which will do for now.
int
utf8StringCompare (const unsigned char *str1, const unsigned char *str2)
{
	const char *lhs = (const char *) str1;
	const char *rhs = (const char *) str2;

	return strcmp (lhs, rhs);
}

// Skip over (at most) 'num' characters of a UTF-8 string.
// Returns a pointer to the character that follows the skipped ones.
// If getCharFromString() returns 0 before 'num' characters have been
// skipped (at the '\0' terminator, or at a bad sequence), a pointer to
// the start of the character for which it did so is returned instead.
unsigned char *
skipUTF8Chars(const unsigned char *ptr, size_t num) {
	const unsigned char *charStart;
	size_t left;

	for (left = num; left > 0; left--) {
		charStart = ptr;
		if (getCharFromString(&ptr) == '\0')
			return (unsigned char *) unconst(charStart);
	}

	return (unsigned char *) unconst(ptr);
}

// Decodes the UTF-8 data between 'start' and 'end' into a unicode
// character string (wstr) with room for 'maxcount' characters
// returns number of chars decoded and stored, not counting 0-term
// decoding stops when the buffer is full (chars that do not fit are
// truncated), or when getCharFromStringN() returns 0
// wide string term 0 is always appended, unless the destination
// buffer is 0 chars long
size_t
getUniCharFromStringN(UniChar *wstr, size_t maxcount,
		const unsigned char *start, const unsigned char *end)
{
	size_t capacity;
	size_t stored = 0;
	UniChar ch;

	if (maxcount == 0)
		return 0;

	// one slot is reserved for the 0-term
	capacity = maxcount - 1;

	while (stored < capacity)
	{
		ch = getCharFromStringN(&start, end);
		if (ch == 0)
			break;
		wstr[stored] = ch;
		stored++;
	}

	wstr[stored] = 0; // term

	return stored;
}

// Decodes a '\0'-terminated UTF-8 string (start) into a unicode
// character string (wstr) with room for 'maxcount' characters
// This is the counterpart of getUniCharFromStringN() for sources whose
//  end is marked by a 0-term
// returns number of chars decoded and stored, not counting 0-term
// decoding stops when the buffer is full (chars that do not fit are
// truncated), or when getCharFromString() returns 0
// wide string term 0 is always appended, unless the destination
// buffer is 0 chars long
size_t
getUniCharFromString(UniChar *wstr, size_t maxcount,
		const unsigned char *start)
{
	size_t capacity;
	size_t stored = 0;
	UniChar ch;

	if (maxcount == 0)
		return 0;

	// one slot is reserved for the 0-term
	capacity = maxcount - 1;

	while (stored < capacity)
	{
		ch = getCharFromString(&start);
		if (ch == 0)
			break;
		wstr[stored] = ch;
		stored++;
	}

	wstr[stored] = 0; // term

	return stored;
}

// Encode one wide character into UTF-8
// returns number of bytes used in the buffer,
//  0  : invalid or unsupported char (a warning is logged)
//  <0 : negative of bytes needed if buffer too small
// string term '\0' is *not* appended or counted
// Values up to 0x7fffffff are accepted; those above 0x001fffff are
// written in the 5 and 6 byte forms.
int
getStringFromChar(unsigned char *ptr, size_t size, UniChar ch)
{
	// One entry per sequence length. The entry at index n describes the
	// sequences that have n continuation bytes.
	static const struct
	{
		UniChar lim;  // largest value that fits in this form
		int marker;   // fixed high bits of the first byte
		int mask;     // bits of the first byte that hold data
	}
	forms[] =
	{
		{0x0000007f, 0x00, 0x7f},
		{0x000007ff, 0xc0, 0x1f},
		{0x0000ffff, 0xe0, 0x0f},
		{0x001fffff, 0xf0, 0x07},
		{0x03ffffff, 0xf8, 0x03},
		{0x7fffffff, 0xfc, 0x01}
	};
	const int numForms = (int) (sizeof forms / sizeof forms[0]);
	int numFollow;
	int pos;

	// find the shortest form that can hold the value
	for (numFollow = 0; numFollow < numForms; ++numFollow)
	{
		if (ch <= forms[numFollow].lim)
			break;
	}
	if (numFollow == numForms)
	{	// invalid or unsupported char
		log_add(log_Warning, "Warning: Invalid or unsupported unicode "
				"char (%lu)", (unsigned long) ch);
		return 0;
	}

	if ((size_t) numFollow + 1 > size)
		return -(numFollow + 1);

	// The continuation bytes are filled in from the last one back to the
	// first, each taking the lowest 6 bits that remain.
	for (pos = numFollow; pos > 0; --pos)
	{
		ptr[pos] = (ch & 0x3f) | 0x80;
		ch >>= 6;
	}
	// What is left goes into the first byte.
	ptr[0] = (ch & forms[numFollow].mask) | forms[numFollow].marker;

	return numFollow + 1;
}

// Encode the first 'count' chars of a wide char string (wstr) into a
// UTF-8 string (ptr), in a buffer of 'size' bytes
// returns number of bytes used in the buffer (includes 0-term)
// any chars that do not fit are truncated
// a char that getStringFromChar() cannot encode is replaced by '?'
// string term '\0' is always appended, unless the destination
// buffer is 0 bytes long
size_t
getStringFromWideN(unsigned char *ptr, size_t size,
		const UniChar *wstr, size_t count)
{
	unsigned char *out = ptr;
	size_t room;
	size_t i;
	int len;

	if (size == 0)
		return 0;

	// one byte is reserved for the 0-term
	room = size - 1;

	for (i = 0; i < count && room > 0; ++i)
	{
		len = getStringFromChar(out, room, wstr[i]);
		if (len < 0)
			break; // not enough room
		if (len == 0)
		{	// bad char?
			*out = '?';
			len = 1;
		}
		out += len;
		room -= len;
	}

	*out = '\0'; // term

	return out - ptr + 1;
}

// Encode a 0-terminated wide char string (wstr) into a UTF-8 string
// (ptr), in a buffer of 'size' bytes
// The length of wstr is determined by searching for its 0-term; the
//  actual work is then left to getStringFromWideN(), whose result is
//  returned
size_t
getStringFromWide(unsigned char *ptr, size_t size, const UniChar *wstr)
{
	size_t length = 0;

	while (wstr[length] != 0)
		++length;

	return getStringFromWideN(ptr, size, wstr, length);
}

// Returns 1 if ch is in the 'graph' class, 0 if not.
int
UniChar_isGraph(UniChar ch)
{	// this is not technically sufficient, but close enough for us
	// we'll consider all non-control (C0 and C1) chars in 'graph' class
	// except for the "Private Use Area" (0xE000 - 0xF8FF)

	// TODO: The private use area is really only glommed by OS X,
	// and even there, not all of it.  (Delete and Backspace both
	// end up producing characters there -- see bug #942 for the
	// gory details.)

	// ASCII, without space (0x20) and DEL (0x7f)
	if (ch > 0x20 && ch < 0x7f)
		return 1;

	// Nothing else up to and including 0xa0 qualifies.
	if (ch <= 0xa0)
		return 0;

	// Above 0xa0: everything outside the Private Use Area
	return ch < 0xE000 || ch > 0xF8FF;
}

// Returns 1 if ch is in the 'print' class, 0 if not.
int
UniChar_isPrint(UniChar ch)
{	// this is not technically sufficient, but close enough for us
	// chars in 'print' class are 'graph' + 'space' classes
	// the only space we currently have defined is 0x20
	if (ch == 0x20)
		return 1;

	return UniChar_isGraph(ch) != 0;
}

// Convert a character to upper case.
// Characters below 0x100 are handed to toupper() from the C library, so
// what happens beyond plain ASCII depends on the locale in effect.
// Any other character is returned unchanged.
UniChar
UniChar_toUpper(UniChar ch)
{	// this is a very basic Latin-1 implementation
	// just to get things going
	if (ch >= 0x100)
		return ch;

	return (UniChar) toupper((int) ch);
}

// Convert a character to lower case.
// Characters below 0x100 are handed to tolower() from the C library, so
// what happens beyond plain ASCII depends on the locale in effect.
// Any other character is returned unchanged.
UniChar
UniChar_toLower(UniChar ch)
{	// this is a very basic Latin-1 implementation
	// just to get things going
	if (ch >= 0x100)
		return ch;

	return (UniChar) tolower((int) ch);
}