1 /* vim: set ts=8 sts=4 sw=4 tw=80 noet: */
2 /*======================================================================
3 Copyright (C) 2004,2005,2009,2013 Walter Doekes <walter+tthsum@wjd.nu>
4 This file is part of tthsum.
5
6 tthsum is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 tthsum is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with tthsum. If not, see <http://www.gnu.org/licenses/>.
18 ======================================================================*/
19 #include "utf8.h"
20
21 #include "types.h"
22 #include <stdlib.h>
23 #include <string.h>
24 #include <wchar.h>
25
26 /* If you define USE_WINDOWS_UTF8, you'll get unspecified/different behaviour
27 * on WIN32 in certain cases (on invalid characters, short destination
28 * strings...). It is only included for testing. "My" functions should perform
29 * reasonably equal and behave like defined in exceptional cases. */
30 #if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
31 # define WINDOWS_LEAN_AND_MEAN
32 # include <windows.h>
33 #endif /* _WIN32 && USE_WINDOWS_UTF8 */
34
35 #ifdef USE_TEXTS
36 # include "texts.h"
37 #endif /* USE_TEXTS */
38
39
40 /* UTF-8 conversion table:
41 *
42 * 0x00000000 - 0x0000007F
43 * 0xxxxxxx
44 * 0x00000080 - 0x000007FF
45 * 110xxxxx 10xxxxxx
46 * 0x00000800 - 0x0000FFFF
47 * 1110xxxx 10xxxxxx 10xxxxxx
48 * 0x00010000 - 0x001FFFFF
49 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
50 * 0x00200000 - 0x03FFFFFF
51 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
52 * 0x04000000 - 0x7FFFFFFF
53 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
54 *
55 * Note that Unicode defines only code points 0x0 - 0x10ffff, so a UTF-8
56 * encoded character is at most 4 bytes long. */
57
58
/*
 * Convert the NUL-terminated wide string src to UTF-8 in a newly allocated
 * buffer.
 *
 * dest  Receives the buffer. On success the caller owns it and must free()
 *       it. On any failure *dest is set to NULL.
 * src   Wide string to convert.
 *
 * Returns whatever wcstoutf8() returns for the conversion, or (size_t)-1 if
 * the allocation or the conversion failed.
 */
size_t wcstoautf8(char** dest, const wchar_t* src) {
    size_t srclen = wcslen(src);
    size_t ret;
    /* At most 4 bytes of UTF-8 per wide character, plus the terminating 0.
     * calloc checks the multiplication for overflow; it rounds the size up to
     * (srclen + 1) * 4, which covers the srclen * 4 + 1 bytes used below. */
    *dest = (char*)calloc(srclen + 1, 4);
    if (!*dest) {
#ifdef USE_TEXTS
        set_error("malloc", ERROR_FROM_OS);
#endif /* USE_TEXTS */
        return (size_t)-1;
    }
    ret = wcstoutf8(*dest, src, srclen * 4 + 1);
    if (ret == (size_t)-1) {
        free(*dest);
        /* Do not hand a dangling pointer back to the caller. */
        *dest = NULL;
    }
    return ret;
}
75
/*
 * Convert the NUL-terminated wide string src to UTF-8.
 *
 * dest  Output buffer of n bytes, or NULL to only compute the length (n is
 *       ignored in that case).
 * src   Wide string. Every element is one code point, except on _WIN32 where
 *       the input is read as UTF-16 and surrogate pairs are combined.
 * n     Size of dest in bytes.
 *
 * Returns the number of bytes produced, not counting the terminating NUL.
 * A character that does not fit completely is dropped together with
 * everything after it. The terminating NUL is written only when there is room
 * for it, i.e. when the return value is smaller than n.
 * Returns (size_t)-1 if src holds something that is not a Unicode scalar
 * value: an unpaired surrogate, a value above 0x10ffff or a negative wchar_t.
 *
 * With USE_WINDOWS_UTF8 the work is delegated to WideCharToMultiByte and the
 * behaviour in the exceptional cases above is whatever that function does.
 */
size_t wcstoutf8(char* dest, const wchar_t* src, size_t n) {
#if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
    int ret;
    size_t len = wcslen(src);
    if (len == 0) {
        if (dest && n != 0)
            *dest = '\0';
        return 0;
    }
    ret = WideCharToMultiByte(
        CP_UTF8,        /* code page */
        0,              /* performance and mapping flags */
        src,            /* wide-character string */
        (int)(len + 1), /* number of chars in string, including the NUL */
        dest,           /* buffer for new string */
        (int)n,         /* size of buffer */
        NULL,           /* default for unmappable chars */
        NULL            /* set when default char used */
    );
    if (ret == 0) {
#ifdef USE_TEXTS
        set_error("WideCharToMultiByte", ERROR_FROM_OS);
#endif /* USE_TEXTS */
        return (size_t)-1;
    }
    /* The NUL was part of the input and is therefore counted in ret. Leave it
     * out so the result matches the portable code below and utf8towcs. */
    return (size_t)(ret - 1);
#else /* !_WIN32 || !USE_WINDOWS_UTF8 */
    size_t count = 0;
    while (*src != L'\0' && (!dest || count < n)) {
        /* Work on an unsigned value: where wchar_t is signed, a negative
         * element must end up in the "invalid" branch instead of passing the
         * "< 0x80" test. */
        unsigned long ch = (unsigned long)*src;
        unsigned long lead; /* marker bits of the first byte */
        size_t len, i;
        if (ch < 0x80) {
            len = 1;
            lead = 0x00; /* 0xxxxxxx */
        } else if (ch < 0x800) {
            len = 2;
            lead = 0xc0; /* 110xxxxx */
        } else if (ch >= 0xd800 && ch < 0xe000) {
            /* Surrogate code points */
#ifdef _WIN32
            /* UTF-16 pairs for 16bits wchar: a high surrogate must be
             * followed by a low one. ch is not NUL, so src[1] exists. */
            unsigned long ch2 = (unsigned long)*++src;
            if (ch >= 0xdc00 || ch2 < 0xdc00 || ch2 >= 0xe000)
                goto invalid;
            /* The pair stores (code point - 0x10000) in 2 x 10 bits. The
             * offset has to be added, not or'ed: bit 16 of the 20 bit value
             * may already be set. */
            ch = 0x10000 + (((ch & 0x3ff) << 10) | (ch2 & 0x3ff));
            len = 4;
            lead = 0xf0; /* 11110xxx */
#else /* !_WIN32 */
            /* Invalid for 32bits wchar */
            goto invalid;
#endif /* !_WIN32 */
        } else if (ch < 0x10000) {
            len = 3;
            lead = 0xe0; /* 1110xxxx */
        } else if (ch <= 0x10ffff) {
            len = 4;
            lead = 0xf0; /* 11110xxx */
        } else {
            /* Beyond the last Unicode code point */
            goto invalid;
        }
        if (dest) {
            if (count + len > n) {
                /* Character does not fit; count < n so *dest is in bounds. */
                *dest = '\0';
                return count;
            }
            /* Fill the continuation bytes back to front, 6 bits at a time. */
            for (i = len - 1; i > 0; --i) {
                dest[i] = (char)((ch & 0x3f) | 0x80); /* 00111111, 10000000 */
                ch >>= 6;
            }
            dest[0] = (char)(ch | lead);
            dest += len;
        }
        ++src;
        count += len;
    }
    if (dest && count < n)
        *dest = '\0';
    return count;

invalid:
    /* Terminate what has been written so far and report the failure. */
    if (dest)
        *dest = '\0';
#ifdef USE_TEXTS
    set_error("wcstoutf8", UTF8_INVALID_UNICODE);
#endif /* USE_TEXTS */
    return (size_t)-1;
#endif /* !_WIN32 || !USE_WINDOWS_UTF8 */
}
184
/*
 * Convert the NUL-terminated UTF-8 string src to a wide string in a newly
 * allocated buffer.
 *
 * dest  Receives the buffer. On success the caller owns it and must free()
 *       it. On any failure *dest is set to NULL.
 * src   UTF-8 string to convert.
 *
 * Returns whatever utf8towcs() returns for the conversion, or (size_t)-1 if
 * the allocation or the conversion failed.
 */
size_t utf8toawcs(wchar_t** dest, const char* src) {
    /* At most 1 wide character per UTF-8 byte + terminating zero. */
    size_t len = strlen(src) + 1;
    size_t ret;
    /* calloc checks len * sizeof(wchar_t) for overflow. */
    *dest = (wchar_t*)calloc(len, sizeof(wchar_t));
    if (!*dest) {
#ifdef USE_TEXTS
        set_error("malloc", ERROR_FROM_OS);
#endif /* USE_TEXTS */
        return (size_t)-1;
    }
    ret = utf8towcs(*dest, src, len);
    if (ret == (size_t)-1) {
        free(*dest);
        /* Do not hand a dangling pointer back to the caller. */
        *dest = NULL;
    }
    return ret;
}
201
/*
 * Convert the NUL-terminated UTF-8 string src to a wide string.
 *
 * dest  Output buffer of n wide characters, or NULL to only validate src and
 *       compute the length (n is ignored in that case).
 * src   UTF-8 input.
 * n     Size of dest in wide characters.
 *
 * Returns the number of wide characters produced, not counting the
 * terminating L'\0'. On _WIN32 a code point above 0xffff is stored as a
 * UTF-16 surrogate pair and counts as two. Conversion stops when dest is
 * full; the terminating L'\0' is written only when there is room for it, i.e.
 * when the return value is smaller than n.
 * Returns (size_t)-1 if src is not valid UTF-8: a bad lead byte, a missing
 * or bad continuation byte, an overlong form, an encoded surrogate or a
 * value above 0x10ffff.
 *
 * With USE_WINDOWS_UTF8 the work is delegated to MultiByteToWideChar and the
 * behaviour in the exceptional cases above is whatever that function does.
 */
size_t utf8towcs(wchar_t* dest, const char* src, size_t n) {
#if defined(_WIN32) && defined(USE_WINDOWS_UTF8)
    int ret;
    size_t len = strlen(src);
    if (len == 0) {
        if (dest && n != 0)
            *dest = L'\0';
        return 0;
    }
    ret = MultiByteToWideChar(
        CP_UTF8,              /* code page */
        MB_ERR_INVALID_CHARS, /* character-type options */
        src,                  /* string to map */
        (int)(len + 1),       /* number of bytes in string, including NUL */
        dest,                 /* wide-character buffer */
        (int)n                /* size of buffer */
    );
#ifdef USE_TEXTS
    if (ret == 0)
        set_error("MultiByteToWideChar", ERROR_FROM_OS);
#endif /* USE_TEXTS */
    /* ret counts the converted NUL; leave it out of the result. */
    return ret == 0 ? (size_t)-1 : (size_t)(ret - 1);
#else /* !_WIN32 || !USE_WINDOWS_UTF8 */
    const unsigned char* in = (const unsigned char*)src;
    size_t count = 0;
    while (*in != '\0' && (!dest || count < n)) {
        unsigned long cp;     /* code point being assembled */
        unsigned long lowest; /* smallest code point that needs len bytes */
        int len, i;
        if (*in < 0x80) {                 /* 0xxxxxxx */
            len = 1;
            cp = *in;
            lowest = 0x0;
        } else if ((*in & 0xe0) == 0xc0) { /* 110xxxxx */
            len = 2;
            cp = *in & 0x1f;
            lowest = 0x80;
        } else if ((*in & 0xf0) == 0xe0) { /* 1110xxxx */
            len = 3;
            cp = *in & 0x0f;
            lowest = 0x800;
        } else if ((*in & 0xf8) == 0xf0) { /* 11110xxx */
            len = 4;
            cp = *in & 0x07;
            lowest = 0x10000;
        } else {
            goto invalid;
        }
        /* Validate the continuation bytes even when only counting. The test
         * fails on the terminating NUL, so a truncated sequence is reported
         * and never stepped over. */
        for (i = 1; i < len; ++i) {
            if ((in[i] & 0xc0) != 0x80) /* 11000000, 10000000 */
                goto invalid;
            cp = (cp << 6) | (in[i] & 0x3f); /* 00111111 */
        }
        /* Check against overlong encoding */
        if (cp < lowest)
            goto overlong;
        /* Surrogates and anything past the last code point are not Unicode
         * scalar values; wcstoutf8 refuses to produce them as well. */
        if ((cp >= 0xd800 && cp < 0xe000) || cp > 0x10ffff)
            goto invalid;
#ifdef _WIN32
        if (cp >= 0x10000) {
            /* UTF-16 pair for 16bits wchar */
            if (dest) {
                if (n - count < 2) {
                    /* No room for the pair: stop here, like wcstoutf8 does
                     * for a character that does not fit. */
                    *dest = L'\0';
                    return count;
                }
                cp -= 0x10000;
                *dest++ = (wchar_t)(0xd800 | (cp >> 10));
                *dest++ = (wchar_t)(0xdc00 | (cp & 0x3ff));
            }
            count += 2;
            in += len;
            continue;
        }
#endif /* _WIN32 */
        if (dest)
            *dest++ = (wchar_t)cp;
        ++count;
        in += len;
    }
    if (dest && count < n)
        *dest = L'\0';
    return count;

invalid:
    /* Terminate what has been written so far and report the failure. */
    if (dest)
        *dest = L'\0';
#ifdef USE_TEXTS
    set_error("utf8towcs", UTF8_INVALID_UTF8);
#endif /* USE_TEXTS */
    return (size_t)-1;

overlong:
    if (dest)
        *dest = L'\0';
#ifdef USE_TEXTS
    set_error("utf8towcs", UTF8_OVERLONG_UTF8);
#endif /* USE_TEXTS */
    return (size_t)-1;
#endif /* !_WIN32 || !USE_WINDOWS_UTF8 */
}
316