mono/eglib/gutf8.c

/*
 * gutf8.c: UTF-8 conversion
 *
 * Author:
 *   Atsushi Enomoto  <atsushi@ximian.com>
 *
 * (C) 2006 Novell, Inc.
 * Copyright 2012 Xamarin Inc
 */

#include <stdio.h>
#include <glib.h>

/*
 * Index into the table below with the first byte of a UTF-8 sequence to get
 * the number of bytes that are supposed to follow it to complete the sequence.
 *
 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is left
 * as-is for anyone who may want to do such conversion, which was allowed in
 * earlier algorithms.
*/
const guchar g_utf8_jump_table[256] = {
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

static gchar *
utf8_case_conv (const gchar *str, gssize len, gboolean upper)
{
	gunichar *ustr;
	glong i, ulen;
	gchar *utf8;

	ustr = g_utf8_to_ucs4_fast (str, (glong) len, &ulen);
	for (i = 0; i < ulen; i++)
		ustr[i] = upper ? g_unichar_toupper (ustr[i]) : g_unichar_tolower (ustr[i]);
	utf8 = g_ucs4_to_utf8 (ustr, ulen, NULL, NULL, NULL);
	g_free (ustr);

	return utf8;
}

gchar *
g_utf8_strup (const gchar *str, gssize len)
{
	return utf8_case_conv (str, len, TRUE);
}

gchar *
g_utf8_strdown (const gchar *str, gssize len)
{
	return utf8_case_conv (str, len, FALSE);
}

static gboolean
utf8_validate (const unsigned char *inptr, size_t len)
{
	const unsigned char *ptr = inptr + len;
	unsigned char c;

	/* Everything falls through when TRUE... */
	switch (len) {
	default:
		return FALSE;
	case 4:
		if ((c = (*--ptr)) < 0x80 || c > 0xBF)
			return FALSE;

		if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) {
			if (ptr[-2] == 0x8F || ptr[-2] == 0x9F ||
			    ptr[-2] == 0xAF || ptr[-2] == 0xBF)
				return FALSE;
		}
	case 3:
		if ((c = (*--ptr)) < 0x80 || c > 0xBF)
			return FALSE;
	case 2:
		if ((c = (*--ptr)) < 0x80 || c > 0xBF)
			return FALSE;

		/* no fall-through in this inner switch */
		switch (*inptr) {
		case 0xE0: if (c < 0xA0) return FALSE; break;
		case 0xED: if (c > 0x9F) return FALSE; break;
		case 0xEF: if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE;
			if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE;
			break;
		case 0xF0: if (c < 0x90) return FALSE; break;
		case 0xF4: if (c > 0x8F) return FALSE; break;
		default:   if (c < 0x80) return FALSE; break;
		}
	case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE;
	}

	if (*inptr > 0xF4)
		return FALSE;

	return TRUE;
}

/**
 * g_utf8_validate:
 * @str: a utf-8 encoded string
 * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string)
 * @end: output parameter to mark the end of the valid input
 *
 * Checks @utf for being valid UTF-8. @str is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: %TRUE if @str is valid or %FALSE otherwise.
 **/
gboolean
g_utf8_validate (const gchar *str, gssize max_len, const gchar **end)
{
	guchar *inptr = (guchar *) str;
	gboolean valid = TRUE;
	guint length, min;
	gssize n = 0;

	if (max_len == 0)
		return FALSE;

	if (max_len < 0) {
		while (*inptr != 0) {
			length = g_utf8_jump_table[*inptr];
			if (!utf8_validate (inptr, length)) {
				valid = FALSE;
				break;
			}

			inptr += length;
		}
	} else {
		while (n < max_len) {
			if (*inptr == 0) {
				/* Note: return FALSE if we encounter nul-byte
				 * before max_len is reached. */
				valid = FALSE;
				break;
			}

			length = g_utf8_jump_table[*inptr];
			min = MIN (length, max_len - n);

			if (!utf8_validate (inptr, min)) {
				valid = FALSE;
				break;
			}

			if (min < length) {
				valid = FALSE;
				break;
			}

			inptr += length;
			n += length;
		}
	}

	if (end != NULL)
		*end = (gchar *) inptr;

	return valid;
}

gunichar
g_utf8_get_char_validated (const gchar *str, gssize max_len)
{
	unsigned char *inptr = (unsigned char *) str;
	gunichar u = *inptr;
	int n, i;

	if (max_len == 0)
		return -2;

	if (u < 0x80) {
		/* simple ascii case */
		return u;
	} else if (u < 0xc2) {
		return -1;
	} else if (u < 0xe0) {
		u &= 0x1f;
		n = 2;
	} else if (u < 0xf0) {
		u &= 0x0f;
		n = 3;
	} else if (u < 0xf8) {
		u &= 0x07;
		n = 4;
	} else if (u < 0xfc) {
		u &= 0x03;
		n = 5;
	} else if (u < 0xfe) {
		u &= 0x01;
		n = 6;
	} else {
		return -1;
	}

	if (max_len > 0) {
		if (!utf8_validate (inptr, MIN (max_len, n)))
			return -1;

		if (max_len < n)
			return -2;
	} else {
		if (!utf8_validate (inptr, n))
			return -1;
	}

	for (i = 1; i < n; i++)
		u = (u << 6) | (*++inptr ^ 0x80);

	return u;
}

glong
g_utf8_strlen (const gchar *str, gssize max_len)
{
	const guchar *inptr = (const guchar *) str;
	glong clen = 0, len = 0, n;

	if (max_len == 0)
		return 0;

	if (max_len < 0) {
		while (*inptr) {
			inptr += g_utf8_jump_table[*inptr];
			len++;
		}
	} else {
		while (len < max_len && *inptr) {
			n = g_utf8_jump_table[*inptr];
			if ((clen + n) > max_len)
				break;

			inptr += n;
			clen += n;
			len++;
		}
	}

	return len;
}

gunichar
g_utf8_get_char (const gchar *src)
{
	unsigned char *inptr = (unsigned char *) src;
	gunichar u = *inptr;
	int n, i;

	if (u < 0x80) {
		/* simple ascii case */
		return u;
	} else if (u < 0xe0) {
		u &= 0x1f;
		n = 2;
	} else if (u < 0xf0) {
		u &= 0x0f;
		n = 3;
	} else if (u < 0xf8) {
		u &= 0x07;
		n = 4;
	} else if (u < 0xfc) {
		u &= 0x03;
		n = 5;
	} else {
		u &= 0x01;
		n = 6;
	}

	for (i = 1; i < n; i++)
		u = (u << 6) | (*++inptr ^ 0x80);

	return u;
}

gchar *
g_utf8_find_prev_char (const gchar *str, const gchar *p)
{
	while (p > str) {
		p--;
		if ((*p & 0xc0) != 0xb0)
			return (gchar *)p;
	}
	return NULL;
}

gchar *
g_utf8_prev_char (const gchar *str)
{
	const gchar *p = str;
	do {
		p--;
	} while ((*p & 0xc0) == 0xb0);

	return (gchar *)p;
}

gchar *
g_utf8_offset_to_pointer (const gchar *str, glong offset)
{
	const gchar *p = str;

	if (offset > 0) {
		do {
			p = g_utf8_next_char (p);
			offset --;
		} while (offset > 0);
	}
	else if (offset < 0) {
		const gchar *jump = str;
		do {
			// since the minimum size of a character is 1
			// we know we can step back at least offset bytes
			jump = jump + offset;

			// if we land in the middle of a character
			// walk to the beginning
			while ((*jump & 0xc0) == 0xb0)
				jump --;

			// count how many characters we've actually walked
			// by going forward
			p = jump;
			do {
				p = g_utf8_next_char (p);
				offset ++;
			} while (p < jump);

		} while (offset < 0);
	}

	return (gchar *)p;
}

glong
g_utf8_pointer_to_offset (const gchar *str, const gchar *pos)
{
	const gchar *inptr, *inend;
	glong offset = 0;
	glong sign = 1;

	if (pos == str)
		return 0;

	if (str < pos) {
		inptr = str;
		inend = pos;
	} else {
		inptr = pos;
		inend = str;
		sign = -1;
	}

	do {
		inptr = g_utf8_next_char (inptr);
		offset++;
	} while (inptr < inend);

	return offset * sign;
}