1 /* vim: set ts=8 sts=4 sw=4 tw=80 noet: */
2 /*======================================================================
3 Copyright (C) 2004,2005,2009,2013 Walter Doekes <walter+tthsum@wjd.nu>
4 This file is part of tthsum.
5 
6 tthsum is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10 
11 tthsum is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with tthsum.  If not, see <http://www.gnu.org/licenses/>.
18 ======================================================================*/
19 #include "utf8.h"
20 
21 #include "types.h"
22 #include <stdlib.h>
23 #include <string.h>
24 #include <wchar.h>
25 
26 /* If you define USE_WINDOWS_UTF8, you'll get unspecified/different behaviour
27  * on WIN32 in certain cases (on invalid characters, short destination
28  * strings...). It is only included for testing. "My" functions should perform
29  * reasonably equal and behave like defined in exceptional cases. */
30 #if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
31 #    define WINDOWS_LEAN_AND_MEAN
32 #    include <windows.h>
33 #endif /* _WIN32 && USE_WINDOWS_UTF8 */
34 
35 #ifdef USE_TEXTS
36 #   include "texts.h"
37 #endif /* USE_TEXTS */
38 
39 
40 /* UTF-8 conversion table:
41  *
42  * 0x00000000 - 0x0000007F
43  *   0xxxxxxx
44  * 0x00000080 - 0x000007FF
45  *   110xxxxx 10xxxxxx
46  * 0x00000800 - 0x0000FFFF
47  *   1110xxxx 10xxxxxx 10xxxxxx
48  * 0x00010000 - 0x001FFFFF
49  *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
50  * 0x00200000 - 0x03FFFFFF
51  *   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
52  * 0x04000000 - 0x7FFFFFFF
53  *   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
54  *
 * Note that Unicode defines only the code points 0x0 - 0x10ffff, so a
 * UTF-8 encoded character is at most 4 bytes long. */
57 
58 
/* Convert the null-terminated wide string `src` to UTF-8 in a newly
 * allocated buffer that is stored in `*dest`.
 *
 * Returns the value reported by wcstoutf8(): the number of bytes written,
 * excluding the terminating null. The caller owns `*dest` and must free() it.
 * Returns (size_t)-1 if the buffer cannot be allocated or if wcstoutf8()
 * rejects the input; `*dest` is then NULL and nothing needs to be freed. */
size_t wcstoautf8(char** dest, const wchar_t* src) {
    /* At most 4 bytes of UTF-8 per wide character, plus the terminating 0
     * (null). The sizes stay in size_t: an int overflows on long input. */
    const size_t wlen = wcslen(src);
    size_t ret;

    *dest = NULL;
    /* A length whose byte count would wrap around is handled like a failed
     * allocation. */
    if (wlen <= ((size_t)-1 - 1) / 4)
	*dest = (char*)malloc(wlen * 4 + 1);
    if (!*dest) {
#ifdef USE_TEXTS
	set_error("malloc", ERROR_FROM_OS);
#endif /* USE_TEXTS */
	return (size_t)-1;
    }
    ret = wcstoutf8(*dest, src, wlen * 4 + 1);
    if (ret == (size_t)-1) {
	/* Do not hand a dangling pointer back to the caller. */
	free(*dest);
	*dest = NULL;
    }
    return ret;
}
75 
/* Convert the null-terminated wide string `src` to UTF-8.
 *
 * Portable implementation (the default):
 * - If `dest` is non-NULL, at most `n` bytes are written. A character whose
 *   encoding does not fit in the remaining space is never written partially:
 *   the output is null-terminated at that point and conversion stops. After
 *   a complete conversion the terminating null is written only if there is
 *   still room for it.
 * - If `dest` is NULL, nothing is written, `n` is ignored and only the
 *   number of bytes needed is computed.
 * - 16-bit wchar_t (_WIN32): a high surrogate must be followed by a low
 *   surrogate; the pair is encoded as one 4-byte sequence.
 * - Otherwise: surrogate values are invalid; values below 0x200000 are
 *   encoded in up to 4 bytes.
 *
 * Returns the number of bytes produced, excluding the terminating null, or
 * (size_t)-1 if `src` contains a value that cannot be encoded. In the error
 * case `dest`, if given, is null-terminated after the last character that
 * was written. */
size_t wcstoutf8(char* dest, const wchar_t* src, size_t n) {
#if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
    int ret;
    int len = wcslen(src);
    if (len == 0) {
	if (dest && n != 0)
	    *dest = '\0';
	return 0;
    }
    /* Pass the terminating null along so that it is converted as well. */
    ++len;
    ret = WideCharToMultiByte(
	    CP_UTF8,	/* code page */
	    0,		/* performance and mapping flags */
	    src,	/* wide-character string */
	    len,	/* number of chars in string */
	    dest,	/* buffer for new string */
	    n,		/* size of buffer */
	    NULL,	/* default for unmappable chars */
	    NULL	/* set when default char used */
    );
    if (ret == 0) {
#ifdef USE_TEXTS
	set_error("WideCharToMultiByte", ERROR_FROM_OS);
#endif /* USE_TEXTS */
	return (size_t)-1;
    }
    /* ret includes the terminating null that was passed in; the portable
     * implementation below does not count it, so leave it out here too. */
    return (size_t)(ret - 1);
#else /* !_WIN32 || !USE_WINDOWS_UTF8 */
    size_t count = 0;
    while (*src != L'\0' && (!dest || count < n)) {
	/* Widen to an unsigned type: where wchar_t is signed, a negative
	 * value becomes a huge one and is rejected below instead of passing
	 * for ASCII. */
	unsigned long ch = (unsigned long)*src;
	unsigned long lead;	/* marker bits of the first byte */
	size_t len, i;

	/* Surrogate code points */
	if (ch >= 0xd800 && ch < 0xe000) {
#ifdef _WIN32
	    /* UTF-16 pairs for 16bits wchar: high surrogate first, low
	     * surrogate second. src[1] exists because src[0] is not the
	     * terminator. */
	    const unsigned long low = (unsigned long)src[1];
	    if (ch >= 0xdc00 || low < 0xdc00 || low >= 0xe000)
		goto invalid;
	    /* The pair carries (code point - 0x10000) in 2 x 10 bits, so the
	     * offset has to be added, not or'ed in. */
	    ch = 0x10000 + (((ch & 0x3ff) << 10) | (low & 0x3ff));
	    ++src;
#else /* !_WIN32 */
	    /* Invalid for 32bits wchar */
	    goto invalid;
#endif /* !_WIN32 */
	}

	if (ch < 0x80) {
	    len = 1;
	    lead = 0x0;  /* 0xxxxxxx */
	} else if (ch < 0x800) {
	    len = 2;
	    lead = 0xc0; /* 110xxxxx */
	} else if (ch < 0x10000) {
	    len = 3;
	    lead = 0xe0; /* 1110xxxx */
	} else if (ch < 0x200000) {
	    len = 4;
	    lead = 0xf0; /* 11110xxx */
	} else {
	    goto invalid;
	}

	if (dest) {
	    if (count + len > n) {
		/* Does not fit: terminate instead of writing half a
		 * character. count < n holds here, so there is room. */
		*dest = '\0';
		return count;
	    }
	    /* Fill the continuation bytes back to front, 6 bits each. */
	    for (i = len - 1; i > 0; --i) {
		dest[i] = (char)((ch & 0x3f) | 0x80); /* 00111111, 10000000 */
		ch >>= 6;
	    }
	    dest[0] = (char)(ch | lead);
	    dest += len;
	}
	++src;
	count += len;
    }
    if (dest && count < n)
	*dest = '\0';
    return count;

invalid:
    /* Only reached from inside the loop, where count < n if dest is set. */
    if (dest)
	*dest = '\0';
#ifdef USE_TEXTS
    set_error("wcstoutf8", UTF8_INVALID_UNICODE);
#endif /* USE_TEXTS */
    return (size_t)-1;
#endif /* !_WIN32 || !USE_WINDOWS_UTF8 */
}
184 
/* Convert the null-terminated UTF-8 string `src` to a wide string in a
 * newly allocated buffer that is stored in `*dest`.
 *
 * Returns the value reported by utf8towcs(): the number of wide characters
 * written, excluding the terminating null. The caller owns `*dest` and must
 * free() it.
 * Returns (size_t)-1 if the buffer cannot be allocated or if utf8towcs()
 * rejects the input; `*dest` is then NULL and nothing needs to be freed. */
size_t utf8toawcs(wchar_t** dest, const char* src) {
    /* At most 1 wide character per UTF-8 byte + terminating zero. The sizes
     * stay in size_t: an int overflows on long input. */
    const size_t slen = strlen(src);
    size_t ret;

    *dest = NULL;
    /* Only allocate if (slen + 1) * sizeof(wchar_t) does not wrap around; a
     * length that is too large is handled like a failed allocation. */
    if (slen < (size_t)-1 / sizeof(wchar_t))
	*dest = (wchar_t*)malloc((slen + 1) * sizeof(wchar_t));
    if (!*dest) {
#ifdef USE_TEXTS
	set_error("malloc", ERROR_FROM_OS);
#endif /* USE_TEXTS */
	return (size_t)-1;
    }
    ret = utf8towcs(*dest, src, slen + 1);
    if (ret == (size_t)-1) {
	/* Do not hand a dangling pointer back to the caller. */
	free(*dest);
	*dest = NULL;
    }
    return ret;
}
201 
/* Convert the null-terminated UTF-8 string `src` to a wide string.
 *
 * Portable implementation (the default):
 * - If `dest` is non-NULL, at most `n` wide characters are written. After a
 *   complete conversion the terminating null is written only if there is
 *   still room for it.
 * - If `dest` is NULL, nothing is written, `n` is ignored and only the
 *   number of wide characters needed is computed. The input is validated in
 *   exactly the same way as when converting.
 * - 16-bit wchar_t (_WIN32): code points of 0x10000 and above are stored as
 *   a UTF-16 surrogate pair and count as two wide characters. A pair is
 *   never split: if only one slot is left, the output is null-terminated
 *   there and conversion stops. Code points above 0x10ffff are rejected.
 * - Otherwise: sequences of up to 4 bytes are stored as one wide character.
 *
 * Returns the number of wide characters produced, excluding the terminating
 * null, or (size_t)-1 on an invalid lead byte, a missing continuation byte
 * or an overlong encoding. In the error case `dest`, if given, is
 * null-terminated after the last character that was written. */
size_t utf8towcs(wchar_t* dest, const char* src, size_t n) {
#if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
    int ret;
    int len = strlen(src);
    if (len == 0) {
	if (dest && n != 0)
	    *dest = L'\0';
	return 0;
    }
    /* Pass the terminating null along so that it is converted as well. */
    ++len;
    ret = MultiByteToWideChar(
	    CP_UTF8,		    /* code page */
	    MB_ERR_INVALID_CHARS,   /* character-type options */
	    src,		    /* string to map */
	    len,		    /* number of bytes in string */
	    dest,		    /* wide-character buffer */
	    n			    /* size of buffer */
    );
#ifdef USE_TEXTS
    if (ret == 0)
	set_error("MultiByteToWideChar", ERROR_FROM_OS);
#endif /* USE_TEXTS */
    /* ret includes the terminating null that was passed in. */
    return (size_t)(ret == 0 ? -1 : ret - 1);
#else /* !_WIN32 || !USE_WINDOWS_UTF8 */
    size_t count = 0;
    while (*src != '\0' && (!dest || count < n)) {
	const unsigned char first = (unsigned char)*src;
	unsigned long cp;	/* code point being assembled */
	unsigned long min;	/* smallest code point that needs `len` bytes */
	size_t units = 1;	/* wide characters this sequence produces */
	int len, i;

	if (first < 0x80) {			/* 0xxxxxxx */
	    len = 1;
	    cp = first;
	    min = 0x0;
	} else if ((first & 0xe0) == 0xc0) {	/* 110xxxxx */
	    len = 2;
	    cp = first & 0x1f;
	    min = 0x80;
	} else if ((first & 0xf0) == 0xe0) {	/* 1110xxxx */
	    len = 3;
	    cp = first & 0x0f;
	    min = 0x800;
	} else if ((first & 0xf8) == 0xf0) {	/* 11110xxx */
	    len = 4;
	    cp = first & 0x07;
	    min = 0x10000;
	} else {
	    goto invalid;
	}

	/* Decode into a local first, also when only counting. This check
	 * stops at the terminating null of a truncated sequence, so that
	 * `src += len` below can never step past the end of the string. */
	for (i = 1; i < len; ++i) {
	    const unsigned char cont = (unsigned char)src[i];
	    if ((cont & 0xc0) != 0x80) /* 11000000, 10000000 */
		goto invalid;
	    cp = (cp << 6) | (cont & 0x3f); /* 00111111 */
	}

	/* Check against overlong encoding */
	if (cp < min)
	    goto overlong;

#ifdef _WIN32
	/* UTF-16 pairs for 16bits wchar */
	if (cp >= 0x10000) {
	    if (cp > 0x10ffff)
		goto invalid; /* no surrogate pair can express this */
	    units = 2;
	}
#endif /* _WIN32 */

	if (dest) {
	    if (units == 2) {
		/* count < n holds here, so at least one slot is free. */
		if (n - count < 2) {
		    *dest = L'\0';
		    return count;
		}
		/* A pair carries (code point - 0x10000) in 2 x 10 bits. */
		cp -= 0x10000;
		*dest++ = (wchar_t)(0xd800 | (cp >> 10));
		*dest++ = (wchar_t)(0xdc00 | (cp & 0x3ff));
	    } else {
		*dest++ = (wchar_t)cp;
	    }
	}
	src += len;
	count += units;
    }
    if (dest && count < n)
	*dest = L'\0';
    return count;

    /* Both labels are only reached from inside the loop, where count < n
     * if dest is set, so writing the terminator is within bounds. */
invalid:
    if (dest)
	*dest = L'\0';
#ifdef USE_TEXTS
    set_error("utf8towcs", UTF8_INVALID_UTF8);
#endif /* USE_TEXTS */
    return (size_t)-1;

overlong:
    if (dest)
	*dest = L'\0';
#ifdef USE_TEXTS
    set_error("utf8towcs", UTF8_OVERLONG_UTF8);
#endif /* USE_TEXTS */
    return (size_t)-1;
#endif /* !_WIN32 || !USE_WINDOWS_UTF8 */
}
316