/* vim: set ts=8 sts=4 sw=4 tw=80 noet: */
/*======================================================================
Copyright (C) 2004,2005,2009,2013 Walter Doekes
This file is part of tthsum.

tthsum is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

tthsum is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with tthsum.  If not, see <http://www.gnu.org/licenses/>.
======================================================================*/
#include "utf8.h"

#include "types.h"
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

/* If you define USE_WINDOWS_UTF8, you'll get unspecified/different behaviour
 * on WIN32 in certain cases (on invalid characters, short destination
 * strings...). It is only included for testing. "My" functions should perform
 * reasonably equal and behave like defined in exceptional cases. */
#if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
# define WINDOWS_LEAN_AND_MEAN
# include <windows.h>
#endif /* _WIN32 && USE_WINDOWS_UTF8 */

#ifdef USE_TEXTS
# include "texts.h"
#endif /* USE_TEXTS */

/* UTF-8 conversion table:
 *
 * 0x00000000 - 0x0000007F
 *   0xxxxxxx
 * 0x00000080 - 0x000007FF
 *   110xxxxx 10xxxxxx
 * 0x00000800 - 0x0000FFFF
 *   1110xxxx 10xxxxxx 10xxxxxx
 * 0x00010000 - 0x001FFFFF
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00200000 - 0x03FFFFFF
 *   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x04000000 - 0x7FFFFFFF
 *   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Note that Unicode defines only characters 0x0 - 0x10ffff, so a UTF-8 encoded
 * character is at most 4 bytes long.
// */

/*
 * Record an error through the texts module when it is compiled in. Without
 * USE_TEXTS this expands to a no-op and its arguments are never evaluated, so
 * the error constants are only required when USE_TEXTS is defined.
 */
#ifdef USE_TEXTS
# define UTF8_SET_ERROR(func, code) set_error((func), (code))
#else /* !USE_TEXTS */
# define UTF8_SET_ERROR(func, code) ((void)0)
#endif /* !USE_TEXTS */

/*
 * Convert the zero-terminated wide string src to UTF-8.
 *
 * dest  output buffer, or NULL to only compute the number of bytes needed
 *       (n is not consulted by the portable implementation in that case)
 * src   zero-terminated wide string; UTF-16 code units on _WIN32, one code
 *       point per wchar_t elsewhere
 * n     size of dest in bytes
 *
 * Returns the number of bytes produced, not counting the terminating zero.
 * The output is zero-terminated only when there is room left for the
 * terminator. When the next character does not fit completely, the output is
 * terminated there and the (short) count is returned. On invalid input
 * (lone/misordered surrogates, values above 0x10ffff, negative wchar_t
 * values) the output is terminated at the failing position and (size_t)-1 is
 * returned.
 */
size_t wcstoutf8(char* dest, const wchar_t* src, size_t n)
{
#if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
    int ret;
    int len = (int)wcslen(src);
    if (len == 0) {
        if (dest && n != 0)
            *dest = '\0';
        return 0;
    }
    ++len; /* have the API convert the terminating zero as well */
    ret = WideCharToMultiByte(
        CP_UTF8,    /* code page */
        0,          /* performance and mapping flags */
        src,        /* wide-character string */
        len,        /* number of chars in string */
        dest,       /* buffer for new string */
        (int)n,     /* size of buffer */
        NULL,       /* default for unmappable chars */
        NULL        /* set when default char used */
    );
    if (ret == 0) {
        UTF8_SET_ERROR("WideCharToMultiByte", ERROR_FROM_OS);
        return (size_t)-1;
    }
    /* The API counted the terminator we passed in; the portable code and
     * utf8towcs() both report the length without it. */
    return (size_t)(ret - 1);
#else /* !_WIN32 || !USE_WINDOWS_UTF8 */
    size_t count = 0;

    while (*src != L'\0' && (!dest || count < n)) {
        /* Work on an unsigned value: where wchar_t is signed, a negative
         * element becomes a huge number and is rejected below instead of
         * being mistaken for ASCII. */
        unsigned long ch = (unsigned long)*src;
        unsigned char lead;
        int len, i;

        if (ch < 0x80) {
            len = 1;
            lead = 0x00; /* 0xxxxxxx */
        } else if (ch < 0x800) {
            len = 2;
            lead = 0xc0; /* 110xxxxx */
        } else if (ch >= 0xd800 && ch < 0xe000) {
#ifdef _WIN32
            /* UTF-16: a high surrogate must be followed by a low one.
             * src[1] is readable: *src is not the terminator. */
            unsigned long low = (unsigned long)src[1];
            if (ch >= 0xdc00 || low < 0xdc00 || low >= 0xe000)
                goto invalid;
            /* The pair encodes (code point - 0x10000); add, don't OR, or
             * every plane above the first one collapses into it. */
            ch = 0x10000 + (((ch & 0x3ff) << 10) | (low & 0x3ff));
            ++src;
            len = 4;
            lead = 0xf0; /* 11110xxx */
#else /* !_WIN32 */
            /* Surrogates are not characters in a 32 bits wchar_t. */
            goto invalid;
#endif /* !_WIN32 */
        } else if (ch < 0x10000) {
            len = 3;
            lead = 0xe0; /* 1110xxxx */
#ifndef _WIN32
        } else if (ch <= 0x10ffff) {
            len = 4;
            lead = 0xf0; /* 11110xxx */
#endif /* !_WIN32 */
        } else {
            goto invalid;
        }

        if (dest) {
            /* count < n holds here, so n - count cannot wrap. */
            if ((size_t)len > n - count) {
                *dest = '\0';
                return count;
            }
            /* Fill the continuation bytes back to front, 6 bits each. */
            for (i = len - 1; i > 0; --i) {
                dest[i] = (char)((ch & 0x3f) | 0x80); /* 10xxxxxx */
                ch >>= 6;
            }
            dest[0] = (char)(ch | lead);
            dest += len;
        }
        ++src;
        count += (size_t)len;
    }

    if (dest && count < n)
        *dest = '\0';
    return count;

invalid:
    if (dest)
        *dest = '\0';
    UTF8_SET_ERROR("wcstoutf8", UTF8_INVALID_UNICODE);
    return (size_t)-1;
#endif /* !_WIN32 || !USE_WINDOWS_UTF8 */
}

/*
 * Convert the zero-terminated UTF-8 string src to a wide string.
 *
 * dest  output buffer, or NULL to only validate src and compute the number
 *       of wide characters needed (n is not consulted by the portable
 *       implementation in that case)
 * src   zero-terminated UTF-8 string
 * n     size of dest in wide characters
 *
 * Returns the number of wide characters produced, not counting the
 * terminating zero; on _WIN32 a character outside the BMP counts as two (a
 * surrogate pair). The output is zero-terminated only when there is room left
 * for the terminator. When a surrogate pair does not fit completely, the
 * output is terminated there and the (short) count is returned. Malformed,
 * truncated or overlong sequences, encoded surrogates and values above
 * 0x10ffff yield (size_t)-1 with the output terminated at the failing
 * position. The input is validated whether or not dest is NULL.
 */
size_t utf8towcs(wchar_t* dest, const char* src, size_t n)
{
#if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
    int ret;
    int len = (int)strlen(src);
    if (len == 0) {
        if (dest && n != 0)
            *dest = L'\0';
        return 0;
    }
    ++len; /* have the API convert the terminating zero as well */
    ret = MultiByteToWideChar(
        CP_UTF8,                /* code page */
        MB_ERR_INVALID_CHARS,   /* character-type options */
        src,                    /* string to map */
        len,                    /* number of bytes in string */
        dest,                   /* wide-character buffer */
        (int)n                  /* size of buffer */
    );
    if (ret == 0) {
        UTF8_SET_ERROR("MultiByteToWideChar", ERROR_FROM_OS);
        return (size_t)-1;
    }
    return (size_t)(ret - 1);
#else /* !_WIN32 || !USE_WINDOWS_UTF8 */
    const unsigned char* in = (const unsigned char*)src;
    size_t count = 0;

    while (*in != 0 && (!dest || count < n)) {
        unsigned long cp;       /* code point being decoded */
        unsigned long lowest;   /* smallest value that needs len bytes */
        size_t units;           /* wide characters this code point takes */
        int len, i;

        if (*in < 0x80) {                   /* 0xxxxxxx */
            len = 1;
            cp = *in;
            lowest = 0x0;
        } else if ((*in & 0xe0) == 0xc0) {  /* 110xxxxx */
            len = 2;
            cp = *in & 0x1fu;
            lowest = 0x80;
        } else if ((*in & 0xf0) == 0xe0) {  /* 1110xxxx */
            len = 3;
            cp = *in & 0x0fu;
            lowest = 0x800;
        } else if ((*in & 0xf8) == 0xf0) {  /* 11110xxx */
            len = 4;
            cp = *in & 0x07u;
            lowest = 0x10000;
        } else {
            goto invalid;
        }

        /* Every continuation byte must look like 10xxxxxx. The terminating
         * zero fails this test, so a truncated sequence is rejected before
         * anything beyond the end of the string is read. */
        for (i = 1; i < len; ++i) {
            if ((in[i] & 0xc0) != 0x80)
                goto invalid;
            cp = (cp << 6) | (unsigned long)(in[i] & 0x3f);
        }
        if (cp < lowest)
            goto overlong;
        if (cp > 0x10ffff || (cp >= 0xd800 && cp < 0xe000))
            goto invalid;

#ifdef _WIN32
        units = (cp >= 0x10000) ? 2 : 1; /* UTF-16 surrogate pair */
#else /* !_WIN32 */
        units = 1;
#endif /* !_WIN32 */

        if (dest) {
            /* count < n holds here; only a surrogate pair can fail to fit. */
            if (units > n - count) {
                *dest = L'\0';
                return count;
            }
            if (units == 2) {
                cp -= 0x10000;
                *dest++ = (wchar_t)(0xd800 | (cp >> 10));
                *dest++ = (wchar_t)(0xdc00 | (cp & 0x3ff));
            } else {
                *dest++ = (wchar_t)cp;
            }
        }
        in += len;
        count += units;
    }

    if (dest && count < n)
        *dest = L'\0';
    return count;

invalid:
    if (dest)
        *dest = L'\0';
    UTF8_SET_ERROR("utf8towcs", UTF8_INVALID_UTF8);
    return (size_t)-1;

overlong:
    if (dest)
        *dest = L'\0';
    UTF8_SET_ERROR("utf8towcs", UTF8_OVERLONG_UTF8);
    return (size_t)-1;
#endif /* !_WIN32 || !USE_WINDOWS_UTF8 */
}

/*
 * Convert the zero-terminated wide string src to a newly allocated UTF-8
 * string.
 *
 * dest  receives the malloc'ed, zero-terminated result; the caller must
 *       free() it. On failure *dest is set to NULL.
 * src   zero-terminated wide string
 *
 * Returns the length of the result in bytes (without the terminating zero),
 * or (size_t)-1 on allocation or conversion failure.
 */
size_t wcstoautf8(char** dest, const wchar_t* src)
{
    size_t wlen = wcslen(src);
    size_t size;
    size_t ret;

    *dest = NULL;
    /* At most 4 bytes per wide character, plus the terminating zero. A size
     * that would not fit in size_t is treated as an allocation failure. */
    if (wlen <= ((size_t)-1 - 1) / 4) {
        size = wlen * 4 + 1;
        *dest = malloc(size);
    }
    if (!*dest) {
        UTF8_SET_ERROR("malloc", ERROR_FROM_OS);
        return (size_t)-1;
    }

    ret = wcstoutf8(*dest, src, size);
    if (ret == (size_t)-1) {
        free(*dest);
        *dest = NULL; /* don't hand a dangling pointer back */
    }
    return ret;
}

/*
 * Convert the zero-terminated UTF-8 string src to a newly allocated wide
 * string.
 *
 * dest  receives the malloc'ed, zero-terminated result; the caller must
 *       free() it. On failure *dest is set to NULL.
 * src   zero-terminated UTF-8 string
 *
 * Returns the length of the result in wide characters (without the
 * terminating zero), or (size_t)-1 on allocation or conversion failure.
 */
size_t utf8toawcs(wchar_t** dest, const char* src)
{
    /* At most 1 wide character per UTF-8 byte, plus the terminating zero. */
    size_t len = strlen(src) + 1;
    size_t ret;

    *dest = NULL;
    if (len <= (size_t)-1 / sizeof **dest)
        *dest = malloc(len * sizeof **dest);
    if (!*dest) {
        UTF8_SET_ERROR("malloc", ERROR_FROM_OS);
        return (size_t)-1;
    }

    ret = utf8towcs(*dest, src, len);
    if (ret == (size_t)-1) {
        free(*dest);
        *dest = NULL; /* don't hand a dangling pointer back */
    }
    return ret;
}