tthsum/tthsum/utf8.c

/* vim: set ts=8 sts=4 sw=4 tw=80 noet: */
/*======================================================================
Copyright (C) 2004,2005,2009,2013 Walter Doekes <walter+tthsum@wjd.nu>
This file is part of tthsum.

tthsum is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

tthsum is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with tthsum.  If not, see <http://www.gnu.org/licenses/>.
======================================================================*/
#include "utf8.h"

#include "types.h"
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

/* If you define USE_WINDOWS_UTF8, the Windows API conversion routines are used
 * on WIN32, which yields unspecified/different behaviour in certain cases
 * (invalid characters, short destination strings...). It is only included for
 * testing. The built-in functions should perform comparably and behave in a
 * well-defined way in those exceptional cases. */
#if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
#    define WINDOWS_LEAN_AND_MEAN
#    include <windows.h>
#endif /* _WIN32 && USE_WINDOWS_UTF8 */

#ifdef USE_TEXTS
#   include "texts.h"
#endif /* USE_TEXTS */


/* UTF-8 conversion table:
 *
 * 0x00000000 - 0x0000007F
 *   0xxxxxxx
 * 0x00000080 - 0x000007FF
 *   110xxxxx 10xxxxxx
 * 0x00000800 - 0x0000FFFF
 *   1110xxxx 10xxxxxx 10xxxxxx
 * 0x00010000 - 0x001FFFFF
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00200000 - 0x03FFFFFF
 *   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x04000000 - 0x7FFFFFFF
 *   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Note that Unicode defines only code points 0x0 - 0x10FFFF, so a UTF-8
 * encoded character is at most 4 bytes long. */


/* Convert the 0-terminated wide string `src' to a freshly allocated UTF-8
 * string.
 *
 * On success, `*dest' points to a malloc()ed buffer that the caller must
 * free(), and the value returned by wcstoutf8() is passed through.
 * On failure, (size_t)-1 is returned and `*dest' is set to NULL, so the
 * caller never sees a dangling pointer. */
size_t wcstoautf8(char** dest, const wchar_t* src) {
    size_t srclen = wcslen(src);
    size_t len = 0;
    size_t ret;

    *dest = NULL;
    /* At most 4 bytes per wide character in UTF-8, add the terminating 0
     * (null). Skip the allocation if that size does not fit in a size_t. */
    if (srclen <= ((size_t)-1 - 1) / 4) {
	len = srclen * 4 + 1;
	*dest = (char*)malloc(len);
    }
    if (!*dest) {
#ifdef USE_TEXTS
	set_error("malloc", ERROR_FROM_OS);
#endif /* USE_TEXTS */
	return (size_t)-1;
    }
    ret = wcstoutf8(*dest, src, len);
    if (ret == (size_t)-1) {
	free(*dest);
	*dest = NULL;
    }
    return ret;
}

/* Convert the 0-terminated wide string `src' to UTF-8.
 *
 * dest: output buffer, or NULL to only compute the number of bytes needed.
 * src:  0-terminated wide string.
 * n:    size of `dest' in bytes; the built-in implementation does not look
 *       at it when `dest' is NULL.
 *
 * Returns the number of bytes written (or needed), not counting the
 * terminating 0, or (size_t)-1 if `src' holds something that is not a valid
 * Unicode code point (lone/invalid surrogates, values above 0x10FFFF,
 * negative values).
 *
 * The built-in implementation never writes a partial character: if the next
 * character does not fit, `dest' is 0-terminated and the count so far is
 * returned. The terminating 0 is only written if there is room left for
 * it. */
size_t wcstoutf8(char* dest, const wchar_t* src, size_t n) {
#if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
    int ret;
    int len = wcslen(src);
    if (len == 0) {
	if (dest && n != 0)
	    *dest = '\0';
	return 0;
    }
    ++len;
    ret = WideCharToMultiByte(
	    CP_UTF8,	/* code page */
	    0,		/* performance and mapping flags */
	    src,	/* wide-character string */
	    len,	/* number of chars in string */
	    dest,	/* buffer for new string */
	    n,		/* size of buffer */
	    NULL,	/* default for unmappable chars */
	    NULL	/* set when default char used */
    );
    if (ret == 0) {
#ifdef USE_TEXTS
	set_error("WideCharToMultiByte", ERROR_FROM_OS);
#endif /* USE_TEXTS */
	return (size_t)-1;
    }
    /* `len' includes the terminating 0, so the result does too; do not
     * count it, just like utf8towcs() and the built-in implementation. */
    return (size_t)(ret - 1);
#else /* !_WIN32 || !USE_WINDOWS_UTF8 */
    size_t count = 0;
    while (*src != L'\0' && (!dest || count < n)) {
	/* Work in an unsigned type of at least 32 bits: it can hold a
	 * combined surrogate pair when wchar_t is 16 bits wide, and it turns
	 * negative values of a signed wchar_t into huge ones that are
	 * rejected below instead of being mistaken for ASCII. */
	unsigned long ch = (unsigned long)*src;
	int mask, len, i;
	if (ch < 0x80) {
	    len = 1;
	    mask = 0x0;  /* 0xxxxxxx */
	} else if (ch < 0x800) {
	    len = 2;
	    mask = 0xc0; /* 110xxxxx */
	} else if (ch < 0x10000) {
	    /* Surrogate code points */
	    if (ch >= 0xd800 && ch < 0xe000) {
#ifdef _WIN32
		/* UTF-16 pairs for 16bits wchar. If `ch' is the last
		 * character, this reads the terminating 0, which is not a
		 * low surrogate either. */
		unsigned long ch2 = (unsigned long)*++src;
		if (ch >= 0xdc00 || ch2 < 0xdc00 || ch2 >= 0xe000)
		    goto invalid;
		/* The pair encodes (code point - 0x10000); this must be an
		 * addition, because bit 16 of the 20 bits value can already
		 * be set (U+20000 and up). */
		ch = 0x10000 + (((ch & 0x3ff) << 10) | (ch2 & 0x3ff));
		len = 4;
		mask = 0xf0; /* 11110xxx */
#else /* !_WIN32 */
		/* Invalid for 32bits wchar */
		goto invalid;
#endif /* !_WIN32 */
	    } else {
		len = 3;
		mask = 0xe0; /* 1110xxxx */
	    }
#ifndef _WIN32
	} else if (ch <= 0x10ffff) {
	    len = 4;
	    mask = 0xf0; /* 11110xxx */
#endif /* _WIN32 */
	} else {
	    /* Beyond the last Unicode code point (or negative). */
	    goto invalid;
	}
	if (dest) {
	    if (count + (size_t)len > n) {
		/* Does not fit; there is room for the 0 because count < n. */
		*dest = '\0';
		return count;
	    }
	    /* Fill the continuation bytes back to front, 6 bits each. */
	    for (i = len - 1; i > 0; --i) {
		dest[i] = (char)((ch & 0x3f) | 0x80); /* 00111111, 10000000 */
		ch >>= 6;
	    }
	    dest[0] = (char)(ch | (unsigned long)mask);
	    dest += len;
	}
	++src;
	count += (size_t)len;
    }
    if (dest && count < n)
	*dest = '\0';
    return count;

invalid:
    /* Only reached from inside the loop, where count < n if dest is set. */
    if (dest)
	*dest = '\0';
#ifdef USE_TEXTS
    set_error("wcstoutf8", UTF8_INVALID_UNICODE);
#endif /* USE_TEXTS */
    return (size_t)-1;
#endif /* !_WIN32 || !USE_WINDOWS_UTF8 */
}

/* Convert the 0-terminated UTF-8 string `src' to a freshly allocated wide
 * string.
 *
 * On success, `*dest' points to a malloc()ed buffer that the caller must
 * free(), and the value returned by utf8towcs() is passed through.
 * On failure, (size_t)-1 is returned and `*dest' is set to NULL, so the
 * caller never sees a dangling pointer. */
size_t utf8toawcs(wchar_t** dest, const char* src) {
    /* At most 1 wide character per UTF-8 byte + terminating zero. */
    size_t len = strlen(src) + 1;
    size_t ret;

    *dest = NULL;
    /* Skip the allocation if the byte size does not fit in a size_t. */
    if (len <= (size_t)-1 / sizeof(wchar_t))
	*dest = (wchar_t*)malloc(len * sizeof(wchar_t));
    if (!*dest) {
#ifdef USE_TEXTS
	set_error("malloc", ERROR_FROM_OS);
#endif /* USE_TEXTS */
	return (size_t)-1;
    }
    ret = utf8towcs(*dest, src, len);
    if (ret == (size_t)-1) {
	free(*dest);
	*dest = NULL;
    }
    return ret;
}

/* Convert the 0-terminated UTF-8 string `src' to a wide string.
 *
 * dest: output buffer, or NULL to only validate and count.
 * src:  0-terminated UTF-8 string.
 * n:    size of `dest' in wide characters; the built-in implementation does
 *       not look at it when `dest' is NULL.
 *
 * Returns the number of wide characters written (or needed), not counting
 * the terminating 0, or (size_t)-1 on malformed input: bad lead or
 * continuation bytes, truncated sequences, overlong encodings, encoded
 * surrogates and code points above 0x10FFFF.
 *
 * With a 16 bits wchar_t (_WIN32) code points from 0x10000 up are stored as
 * an UTF-16 surrogate pair, counting as two wide characters. The built-in
 * implementation never writes half a pair: if the next character does not
 * fit, `dest' is 0-terminated and the count so far is returned. The
 * terminating 0 is only written if there is room left for it. */
size_t utf8towcs(wchar_t* dest, const char* src, size_t n) {
#if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
    int ret;
    int len = strlen(src);
    if (len == 0) {
	if (dest && n != 0)
	    *dest = L'\0';
	return 0;
    }
    ++len;
    ret = MultiByteToWideChar(
	    CP_UTF8,		    /* code page */
	    MB_ERR_INVALID_CHARS,   /* character-type options */
	    src,		    /* string to map */
	    len,		    /* number of bytes in string */
	    dest,		    /* wide-character buffer */
	    n			    /* size of buffer */
    );
#ifdef USE_TEXTS
    if (ret == 0)
	set_error("MultiByteToWideChar", ERROR_FROM_OS);
#endif /* USE_TEXTS */
    return (size_t)(ret == 0 ? -1 : ret - 1);
#else /* !_WIN32 || !USE_WINDOWS_UTF8 */
    /* Look at the input as unsigned bytes, whatever the signedness of char. */
    const unsigned char* in = (const unsigned char*)src;
    size_t count = 0;
    while (*in != '\0' && (!dest || count < n)) {
	unsigned long cp;	/* decoded code point */
	unsigned long lowest;	/* smallest code point that needs `len' bytes */
	size_t units = 1;	/* wide characters needed for `cp' */
	int len, i;
	if (in[0] < 0x80) {
	    /* 0xxxxxxx */
	    len = 1;
	    cp = in[0];
	    lowest = 0x0;
	} else if ((in[0] & 0xe0) == 0xc0) {
	    /* 11100000, 110xxxxx */
	    len = 2;
	    cp = in[0] & 0x1f; /* 00011111 */
	    lowest = 0x80;
	} else if ((in[0] & 0xf0) == 0xe0) {
	    /* 11110000, 1110xxxx */
	    len = 3;
	    cp = in[0] & 0x0f; /* 00001111 */
	    lowest = 0x800;
	} else if ((in[0] & 0xf8) == 0xf0) {
	    /* 11111000, 11110xxx */
	    len = 4;
	    cp = in[0] & 0x07; /* 00000111 */
	    lowest = 0x10000;
	} else {
	    goto invalid;
	}
	/* Check the continuation bytes even when only counting: a truncated
	 * sequence must not make us step over the terminating 0. The 0 byte
	 * itself fails this test, so we never read beyond it. */
	for (i = 1; i < len; ++i) {
	    if ((in[i] & 0xc0) != 0x80) /* 11000000, 10000000 */
		goto invalid;
	    cp = (cp << 6) | (in[i] & 0x3f); /* 00111111 */
	}
	/* Check against overlong encoding */
	if (cp < lowest)
	    goto overlong;
	/* Surrogates are not characters and Unicode ends at 0x10FFFF. */
	if ((cp >= 0xd800 && cp < 0xe000) || cp > 0x10ffff)
	    goto invalid;
#ifdef _WIN32
	/* UTF-16 pairs for 16bits wchar */
	if (cp >= 0x10000)
	    units = 2;
#endif /* _WIN32 */
	if (dest) {
	    if (count + units > n) {
		/* Does not fit; there is room for the 0 because count < n. */
		*dest = L'\0';
		return count;
	    }
	    if (units == 2) {
		/* A pair carries the 20 bits of (code point - 0x10000). */
		unsigned long v = cp - 0x10000;
		*dest++ = (wchar_t)(0xd800 | (v >> 10));
		*dest++ = (wchar_t)(0xdc00 | (v & 0x3ff));
	    } else {
		*dest++ = (wchar_t)cp;
	    }
	}
	in += len;
	count += units;
    }
    if (dest && count < n)
	*dest = L'\0';
    return count;

    /* Both labels are only reached from inside the loop, where count < n if
     * dest is set, so writing the terminating 0 is safe. */
overlong:
    if (dest)
	*dest = L'\0';
#ifdef USE_TEXTS
    set_error("utf8towcs", UTF8_OVERLONG_UTF8);
#endif /* USE_TEXTS */
    return (size_t)-1;

invalid:
    if (dest)
	*dest = L'\0';
#ifdef USE_TEXTS
    set_error("utf8towcs", UTF8_INVALID_UTF8);
#endif /* USE_TEXTS */
    return (size_t)-1;
#endif /* !_WIN32 || !USE_WINDOWS_UTF8 */
}